# Makes summary graphs of Snowflake statistics from 2023. Run this in a # checkout of https://gitlab.torproject.org/dcf/snowflake-graphs. # Usage: Rscript snowflake-2023.r library("cowplot") library("tidyverse") DATE_LIMITS <- lubridate::ymd(c( "2022-12-25", "2024-01-03" )) TOP_N <- 5 WANTED_FINGERPRINTS <- c( "7659DA0F96B156C322FBFF3ACCC9B9DC01C27C73" = "snowman", "5481936581E23D2D178105D44DB6915AB06BFB7F" = "snowflake-01", "91DA221A149007D0FD9E5515F5786C3DD07E4BB0" = "snowflake-02" ) LINE_COLOR_PALETTE <- c( "5481936581E23D2D178105D44DB6915AB06BFB7F" = "dodgerblue", "91DA221A149007D0FD9E5515F5786C3DD07E4BB0" = "navy" ) theme_set( theme_minimal() + theme(plot.background = element_rect(fill = "white", color = NA, linewidth = 0)) ) update_geom_defaults("line", aes(linewidth = 0.4)) GRAPH_COMMON <- list( scale_x_date( date_breaks = "1 month", minor_breaks = "1 week", date_labels = "%b\n%Y" ), coord_cartesian(xlim = DATE_LIMITS, expand = FALSE) ) EVENTS <- tribble( ~date, ~nudge_x, ~nudge_y, ~line, ~label, "2023-01-20 00:00:00", 5, -40000, T, "Domain fronting rendezvous\ntemporarily blocked in Iran", "2023-03-13 00:00:00", 9, 0, F, "Bridge performance fix", "2023-09-20 14:00:00", 9, 0, F, "Change in rendezvous\ndomain front", ) |> mutate(date = lubridate::ymd_hms(date) |> lubridate::as_date()) userstats_bridge_transport <- read_csv("userstats-bridge-transport-multi.csv", comment = "#", show_col_types = FALSE) |> filter(transport == "snowflake" & fingerprint %in% names(WANTED_FINGERPRINTS)) |> group_by(fingerprint, transport) |> complete(date = seq.Date(min(date), max(date), "days")) |> ungroup() |> # Compensate for days when not all descriptors were published. mutate(users = users / (coverage / pmax(num_instances, coverage))) |> filter(lubridate::`%within%`(date, do.call(lubridate::interval, as.list(DATE_LIMITS)))) userstats_bridge_combined <- read_csv("userstats-bridge-combined-multi.csv", comment = "#", show_col_types = FALSE) |> filter(transport == "snowflake" & fingerprint %in% names(WANTED_FINGERPRINTS)) |> group_by(fingerprint, transport, country) |> complete(date = seq.Date(min(date), max(date), "days")) |> ungroup() |> # Compensate for days when not all descriptors were published. mutate(across(c(low, high), ~ .x / (coverage / pmax(num_instances, coverage)))) |> filter(lubridate::`%within%`(date, do.call(lubridate::interval, as.list(DATE_LIMITS)))) bandwidth <- read_csv("bandwidth-multi.csv", comment = "#", show_col_types = FALSE) |> filter(fingerprint %in% names(WANTED_FINGERPRINTS)) |> group_by(fingerprint, type) |> complete(date = seq.Date(min(date), max(date), "days")) |> ungroup() |> # Compensate for days when not all descriptors were published. mutate(bytes = bytes / (coverage / pmax(num_instances, coverage))) |> pivot_wider(id_cols = c(date, fingerprint), names_from = c(type), values_from = c(bytes)) |> # Subtract out dirreq traffic, then average read and write to estimate goodput. mutate( good_read = read - `dirreq-read`, good_write = write - `dirreq-write`, good_avg = (good_read + good_write) / 2 ) |> filter(lubridate::`%within%`(date, do.call(lubridate::interval, as.list(DATE_LIMITS)))) USERSTATS_BRIDGE_TRANSPORT_PLOT_BASE <- list( scale_y_continuous( limits = c(0, NA), labels = scales::label_comma() ), scale_color_manual( name = "Bridge", values = LINE_COLOR_PALETTE, labels = WANTED_FINGERPRINTS ), labs( title = "Daily Snowflake users", x = NULL, y = "Average simultaneous users" ), GRAPH_COMMON ) userstats_bridge_transport_all <- userstats_bridge_transport |> group_by(date, transport) |> summarize(users = sum(users, na.rm = TRUE), .groups = "drop") userstats_bridge_transport_all_fn <- with(userstats_bridge_transport_all, approxfun(date, users)) userstats_bridge_transport_plot <- ggplot(userstats_bridge_transport_all) + geom_line(aes(x = date, y = users)) + geom_point( data = EVENTS, aes(x = date, y = userstats_bridge_transport_all_fn(date)), shape = 1, color = "red", size = 7.5 ) + geom_text( data = EVENTS, aes( x = date, y = userstats_bridge_transport_all_fn(date), label = label ), hjust = 0, nudge_x = EVENTS$nudge_x, nudge_y = EVENTS$nudge_y, lineheight = 0.8, color = "black", size = 3 ) + geom_segment( data = filter(EVENTS, line), aes( x = date + nudge_x, y = userstats_bridge_transport_all_fn(date) + nudge_y, xend = date + 0.18 * nudge_x, yend = userstats_bridge_transport_all_fn(date) + 0.18 * nudge_y ), alpha = 0.4 ) + USERSTATS_BRIDGE_TRANSPORT_PLOT_BASE userstats_bridge_transport_multi_plot <- ggplot(userstats_bridge_transport) + geom_line(aes(x = date, y = users, color = fingerprint)) + USERSTATS_BRIDGE_TRANSPORT_PLOT_BASE BANDWIDTH_PLOT_BASE <- list( scale_y_continuous( limits = c(0, NA), labels = scales::label_bytes() ), scale_color_manual( name = "Bridge", values = LINE_COLOR_PALETTE, labels = WANTED_FINGERPRINTS ), labs( title = "Daily Snowflake bandwidth", x = NULL, y = "Bytes" ), GRAPH_COMMON ) bandwidth_plot <- ggplot(bandwidth |> group_by(date) |> summarize(across(c(read, write, `dirreq-read`, `dirreq-write`, good_read, good_write, good_avg), sum, na.rm = TRUE), .groups = "drop") ) + geom_line(aes(x = date, y = good_avg)) + BANDWIDTH_PLOT_BASE bandwidth_plot_multi <- ggplot(bandwidth) + geom_line(aes(x = date, y = good_avg, color = fingerprint)) + BANDWIDTH_PLOT_BASE top_countries_plot <- ggplot(userstats_bridge_combined |> group_by(date, transport, country) |> summarize(low = sum(low, na.rm = TRUE), high = sum(high, na.rm = TRUE), .groups = "drop") |> group_by(date, transport) |> slice_max((low + high) / 2, n = TOP_N) |> ungroup() |> mutate(country = fct_drop(country)) |> # Order country factors by their most recent value. mutate(country = fct_reorder2(country, date, (low + high) / 2)) |> group_by(transport, country) |> complete(date = seq.Date(min(date), max(date), "days")) |> ungroup() ) + geom_ribbon( aes(x = date, ymin = low, ymax = high, fill = country, color = country), outline.type = "lower", alpha = 0.9 ) + scale_fill_brewer(palette = "Set2") + scale_color_brewer(palette = "Set2") + guides(fill = guide_legend(override.aes = list(alpha = 1.0, size = 1)), color = NULL) + scale_y_continuous( limits = c(0, NA), labels = scales::label_comma() ) + labs( title = sprintf("Top %d countries with the most Snowflake users by day", TOP_N), x = NULL, y = "Average simultaneous users" ) + GRAPH_COMMON plots <- align_plots(plotlist = list( userstats_bridge_transport_plot, userstats_bridge_transport_multi_plot, bandwidth_plot, bandwidth_plot_multi, top_countries_plot ), align = "v", axis = "lr") userstats_bridge_transport_plot <- plots[[1]] userstats_bridge_transport_multi_plot <- plots[[2]] bandwidth_plot <- plots[[3]] bandwidth_plot_multi <- plots[[4]] top_countries_plot <- plots[[5]] ggsave("snowflake-userstats-bridge-transport-2023.png", userstats_bridge_transport_plot, width = 8, height = 2.5) ggsave("snowflake-userstats-bridge-transport-multi-2023.png", userstats_bridge_transport_multi_plot, width = 8, height = 2.5) ggsave("snowflake-bandwidth-2023.png", bandwidth_plot, width = 8, height = 2.5) ggsave("snowflake-bandwidth-multi-2023.png", bandwidth_plot_multi, width = 8, height = 2.5) ggsave("snowflake-top-countries-2023.png", top_countries_plot, width = 8, height = 3.0)