From 3505594899160deeda9e16e1773d26b19e246362 Mon Sep 17 00:00:00 2001 From: Kieran Mace Date: Thu, 22 May 2025 22:30:35 -0500 Subject: [PATCH] Add bin_prop computed variable to stat_bin MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Brings feature parity with stat_count by adding `after_stat(bin_prop)` functionality to stat_bin. The bin_prop variable shows the proportion of each group within each bin, enabling proportion-based visualizations for binned continuous data. Key features: - bin_prop = count_in_group / total_count_in_bin - Works with multiple groups and respects weights - Backwards compatible (bin_prop = 1 for single groups) - Properly handles empty bins Usage: ggplot(data, aes(x = continuous_var, y = after_stat(bin_prop), fill = group)) + stat_bin(geom = "col", position = "dodge") 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude --- R/stat-bin.R | 51 ++++++++++++++++++++++- tests/testthat/test-stat-bin.R | 74 ++++++++++++++++++++++++++++++++++ 2 files changed, 124 insertions(+), 1 deletion(-) diff --git a/R/stat-bin.R b/R/stat-bin.R index 1b2361f250..893de12cd6 100644 --- a/R/stat-bin.R +++ b/R/stat-bin.R @@ -63,6 +63,54 @@ StatBin <- ggproto( flip_data(bins, flipped_aes) }, + compute_panel = function(self, data, scales, binwidth = NULL, bins = NULL, + center = NULL, boundary = NULL, + closed = c("right", "left"), pad = FALSE, + breaks = NULL, flipped_aes = FALSE, drop = "none") { + # First call parent's compute_panel to get binned data for all groups + data <- ggproto_parent(Stat, self)$compute_panel( + data, scales, binwidth = binwidth, bins = bins, + center = center, boundary = boundary, closed = closed, + pad = pad, breaks = breaks, flipped_aes = flipped_aes, drop = drop + ) + + # Only calculate bin_prop if we have the necessary columns and multiple groups + if (!is.null(data) && nrow(data) > 0 && + all(c("count", "xmin", "xmax") %in% names(data))) { + + # Calculate bin_prop: proportion of each group within each bin + # Create a unique bin identifier using rounded values to handle floating point precision + data$bin_id <- paste(round(data$xmin, 10), round(data$xmax, 10), sep = "_") + + # Calculate total count per bin across all groups + bin_totals <- stats::aggregate(data$count, by = list(bin_id = data$bin_id), FUN = sum) + names(bin_totals)[2] <- "bin_total" + + # Merge back to get bin totals for each row + data <- merge(data, bin_totals, by = "bin_id", sort = FALSE) + + # Calculate bin_prop: count within group / total count in bin + # When bin_total = 0 (empty bin), set bin_prop based on whether there are multiple groups + n_groups <- length(unique(data$group)) + if (n_groups == 1) { + # With only one group, bin_prop is always 1 (100% of the bin belongs to this group) + data$bin_prop <- 1 + } else { + # With multiple groups, bin_prop = count / total_count_in_bin, or 0 for empty bins + data$bin_prop <- ifelse(data$bin_total > 0, data$count / data$bin_total, 0) + } + + # Remove the temporary columns + data$bin_id <- NULL + data$bin_total <- NULL + } else { + # If we don't have the necessary data, just add a default bin_prop column + data$bin_prop <- if (nrow(data) > 0) rep(1, nrow(data)) else numeric(0) + } + + data + }, + default_aes = aes(x = after_stat(count), y = after_stat(count), weight = 1), required_aes = "x|y", @@ -108,7 +156,8 @@ StatBin <- ggproto( #' density = "density of points in bin, scaled to integrate to 1.", #' ncount = "count, scaled to a maximum of 1.", #' ndensity = "density, scaled to a maximum of 1.", -#' width = "widths of bins." +#' width = "widths of bins.", +#' bin_prop = "proportion of points in bin that belong to each group." #' ) #' #' @section Dropped variables: diff --git a/tests/testthat/test-stat-bin.R b/tests/testthat/test-stat-bin.R index 3568faa819..177321b637 100644 --- a/tests/testthat/test-stat-bin.R +++ b/tests/testthat/test-stat-bin.R @@ -257,3 +257,77 @@ test_that("stat_count preserves x order for continuous and discrete", { expect_identical(b$layout$panel_params[[1]]$x$get_labels(), c("4","1","2","3","6","8")) expect_identical(b$data[[1]]$y, c(10,7,10,3,1,1)) }) + +# Test bin_prop functionality --------------------------------------------- + +test_that("stat_bin calculates bin_prop correctly", { + # Create test data with two distinct groups + test_data <- data_frame( + x = c(rep(c(1, 2, 3, 4, 5), each = 10), rep(c(3, 4, 5, 6, 7), each = 10)), + group = rep(c("A", "B"), each = 50) + ) + + # Test with 5 bins to get predictable overlap + p <- ggplot(test_data, aes(x, fill = group)) + geom_histogram(bins = 5) + data <- get_layer_data(p) + + # bin_prop should be available + expect_true("bin_prop" %in% names(data)) + + # All bin_prop values should be between 0 and 1 + expect_true(all(data$bin_prop >= 0 & data$bin_prop <= 1)) + + # For bins that contain both groups, bin_prop should sum to 1 across groups + bins_with_both_groups <- aggregate(data$count > 0, by = list(paste(data$xmin, data$xmax)), sum) + overlapping_bins <- bins_with_both_groups[bins_with_both_groups$x == 2, ]$Group.1 + + for (bin in overlapping_bins) { + bin_data <- data[paste(data$xmin, data$xmax) == bin, ] + total_prop <- sum(bin_data$bin_prop) + expect_equal(total_prop, 1, tolerance = 1e-6) + } + + # Test after_stat(bin_prop) usage + p2 <- ggplot(test_data, aes(x, y = after_stat(bin_prop), fill = group)) + + stat_bin(geom = "col", bins = 5, position = "dodge") + data2 <- get_layer_data(p2) + + # y should contain the bin_prop values + expect_equal(data2$y, data2$bin_prop) +}) + +test_that("stat_bin bin_prop works with single group", { + # Test that bin_prop = 1 for all bins when there's only one group + test_data <- data_frame(x = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)) + + p <- ggplot(test_data, aes(x)) + geom_histogram(bins = 5) + data <- get_layer_data(p) + + # All bin_prop values should be 1 (even for empty bins) + expect_true(all(data$bin_prop == 1)) +}) + +test_that("stat_bin bin_prop works with weights", { + # Test that bin_prop calculation respects weights + test_data <- data_frame( + x = c(1, 1, 2, 2), + group = c("A", "B", "A", "B"), + w = c(1, 3, 2, 2) # Different weights for each observation + ) + + p <- ggplot(test_data, aes(x, fill = group, weight = w)) + geom_histogram(bins = 2) + data <- get_layer_data(p) + + # bin_prop should be available + expect_true("bin_prop" %in% names(data)) + + # Check that proportions are calculated correctly with weights + # Bin 1: A=1, B=3, total=4, so A should have bin_prop=0.25, B should have bin_prop=0.75 + # Bin 2: A=2, B=2, total=4, so A should have bin_prop=0.5, B should have bin_prop=0.5 + bin1_data <- data[data$x == min(data$x), ] + bin2_data <- data[data$x == max(data$x), ] + + # For each bin, proportions should sum to 1 + expect_equal(sum(bin1_data$bin_prop), 1, tolerance = 1e-6) + expect_equal(sum(bin2_data$bin_prop), 1, tolerance = 1e-6) +})