-
Notifications
You must be signed in to change notification settings - Fork 2.1k
Add bin_prop computed variable to stat_bin #6477
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -63,6 +63,54 @@ StatBin <- ggproto( | |
flip_data(bins, flipped_aes) | ||
}, | ||
|
||
compute_panel = function(self, data, scales, binwidth = NULL, bins = NULL, | ||
center = NULL, boundary = NULL, | ||
closed = c("right", "left"), pad = FALSE, | ||
breaks = NULL, flipped_aes = FALSE, drop = "none") { | ||
# First call parent's compute_panel to get binned data for all groups | ||
data <- ggproto_parent(Stat, self)$compute_panel( | ||
data, scales, binwidth = binwidth, bins = bins, | ||
center = center, boundary = boundary, closed = closed, | ||
pad = pad, breaks = breaks, flipped_aes = flipped_aes, drop = drop | ||
) | ||
|
||
# Only calculate bin_prop if we have the necessary columns and multiple groups | ||
if (!is.null(data) && nrow(data) > 0 && | ||
all(c("count", "xmin", "xmax") %in% names(data))) { | ||
|
||
# Calculate bin_prop: proportion of each group within each bin | ||
# Create a unique bin identifier using rounded values to handle floating point precision | ||
data$bin_id <- paste(round(data$xmin, 10), round(data$xmax, 10), sep = "_") | ||
|
||
# Calculate total count per bin across all groups | ||
bin_totals <- stats::aggregate(data$count, by = list(bin_id = data$bin_id), FUN = sum) | ||
names(bin_totals)[2] <- "bin_total" | ||
|
||
# Merge back to get bin totals for each row | ||
data <- merge(data, bin_totals, by = "bin_id", sort = FALSE) | ||
|
||
# Calculate bin_prop: count within group / total count in bin | ||
# When bin_total = 0 (empty bin), set bin_prop based on whether there are multiple groups | ||
n_groups <- length(unique(data$group)) | ||
if (n_groups == 1) { | ||
# With only one group, bin_prop is always 1 (100% of the bin belongs to this group) | ||
data$bin_prop <- 1 | ||
} else { | ||
# With multiple groups, bin_prop = count / total_count_in_bin, or 0 for empty bins | ||
data$bin_prop <- ifelse(data$bin_total > 0, data$count / data$bin_total, 0) | ||
} | ||
|
||
# Remove the temporary columns | ||
data$bin_id <- NULL | ||
data$bin_total <- NULL | ||
} else { | ||
# If we don't have the necessary data, just add a default bin_prop column | ||
data$bin_prop <- if (nrow(data) > 0) rep(1, nrow(data)) else numeric(0) | ||
} | ||
|
||
data | ||
}, | ||
|
||
default_aes = aes(x = after_stat(count), y = after_stat(count), weight = 1), | ||
|
||
required_aes = "x|y", | ||
|
@@ -108,7 +156,8 @@ StatBin <- ggproto( | |
#' density = "density of points in bin, scaled to integrate to 1.", | ||
#' ncount = "count, scaled to a maximum of 1.", | ||
#' ndensity = "density, scaled to a maximum of 1.", | ||
#' width = "widths of bins." | ||
#' width = "widths of bins.", | ||
#' bin_prop = "proportion of points in bin that belong to each group." | ||
Comment on lines
+159
to
+160
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you regenerate the .Rd files as well? |
||
#' ) | ||
#' | ||
#' @section Dropped variables: | ||
|
Original file line number | Diff line number | Diff line change | ||||||||
---|---|---|---|---|---|---|---|---|---|---|
|
@@ -257,3 +257,77 @@ test_that("stat_count preserves x order for continuous and discrete", { | |||||||||
expect_identical(b$layout$panel_params[[1]]$x$get_labels(), c("4","1","2","3","6","8")) | ||||||||||
expect_identical(b$data[[1]]$y, c(10,7,10,3,1,1)) | ||||||||||
}) | ||||||||||
|
||||||||||
# Test bin_prop functionality --------------------------------------------- | ||||||||||
|
||||||||||
test_that("stat_bin calculates bin_prop correctly", { | ||||||||||
# Create test data with two distinct groups | ||||||||||
test_data <- data_frame( | ||||||||||
x = c(rep(c(1, 2, 3, 4, 5), each = 10), rep(c(3, 4, 5, 6, 7), each = 10)), | ||||||||||
group = rep(c("A", "B"), each = 50) | ||||||||||
) | ||||||||||
|
||||||||||
# Test with 5 bins to get predictable overlap | ||||||||||
p <- ggplot(test_data, aes(x, fill = group)) + geom_histogram(bins = 5) | ||||||||||
Comment on lines
+270
to
+271
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Breaks can be set directly if predictability is an issue |
||||||||||
data <- get_layer_data(p) | ||||||||||
|
||||||||||
# bin_prop should be available | ||||||||||
expect_true("bin_prop" %in% names(data)) | ||||||||||
|
||||||||||
# All bin_prop values should be between 0 and 1 | ||||||||||
expect_true(all(data$bin_prop >= 0 & data$bin_prop <= 1)) | ||||||||||
|
||||||||||
# For bins that contain both groups, bin_prop should sum to 1 across groups | ||||||||||
bins_with_both_groups <- aggregate(data$count > 0, by = list(paste(data$xmin, data$xmax)), sum) | ||||||||||
overlapping_bins <- bins_with_both_groups[bins_with_both_groups$x == 2, ]$Group.1 | ||||||||||
|
||||||||||
for (bin in overlapping_bins) { | ||||||||||
bin_data <- data[paste(data$xmin, data$xmax) == bin, ] | ||||||||||
total_prop <- sum(bin_data$bin_prop) | ||||||||||
expect_equal(total_prop, 1, tolerance = 1e-6) | ||||||||||
} | ||||||||||
Comment on lines
+281
to
+288
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Isn't it more simple to test that the sum over bins is 1, regardless of how many groups? |
||||||||||
|
||||||||||
# Test after_stat(bin_prop) usage | ||||||||||
p2 <- ggplot(test_data, aes(x, y = after_stat(bin_prop), fill = group)) + | ||||||||||
stat_bin(geom = "col", bins = 5, position = "dodge") | ||||||||||
data2 <- get_layer_data(p2) | ||||||||||
|
||||||||||
# y should contain the bin_prop values | ||||||||||
expect_equal(data2$y, data2$bin_prop) | ||||||||||
}) | ||||||||||
|
||||||||||
test_that("stat_bin bin_prop works with single group", { | ||||||||||
# Test that bin_prop = 1 for all bins when there's only one group | ||||||||||
test_data <- data_frame(x = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)) | ||||||||||
|
||||||||||
p <- ggplot(test_data, aes(x)) + geom_histogram(bins = 5) | ||||||||||
data <- get_layer_data(p) | ||||||||||
|
||||||||||
# All bin_prop values should be 1 (even for empty bins) | ||||||||||
expect_true(all(data$bin_prop == 1)) | ||||||||||
}) | ||||||||||
|
||||||||||
test_that("stat_bin bin_prop works with weights", { | ||||||||||
# Test that bin_prop calculation respects weights | ||||||||||
test_data <- data_frame( | ||||||||||
x = c(1, 1, 2, 2), | ||||||||||
group = c("A", "B", "A", "B"), | ||||||||||
w = c(1, 3, 2, 2) # Different weights for each observation | ||||||||||
) | ||||||||||
|
||||||||||
p <- ggplot(test_data, aes(x, fill = group, weight = w)) + geom_histogram(bins = 2) | ||||||||||
data <- get_layer_data(p) | ||||||||||
|
||||||||||
# bin_prop should be available | ||||||||||
expect_true("bin_prop" %in% names(data)) | ||||||||||
|
||||||||||
# Check that proportions are calculated correctly with weights | ||||||||||
# Bin 1: A=1, B=3, total=4, so A should have bin_prop=0.25, B should have bin_prop=0.75 | ||||||||||
# Bin 2: A=2, B=2, total=4, so A should have bin_prop=0.5, B should have bin_prop=0.5 | ||||||||||
bin1_data <- data[data$x == min(data$x), ] | ||||||||||
bin2_data <- data[data$x == max(data$x), ] | ||||||||||
Comment on lines
+327
to
+328
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
We know from the test data what these values should be |
||||||||||
|
||||||||||
# For each bin, proportions should sum to 1 | ||||||||||
expect_equal(sum(bin1_data$bin_prop), 1, tolerance = 1e-6) | ||||||||||
expect_equal(sum(bin2_data$bin_prop), 1, tolerance = 1e-6) | ||||||||||
}) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This all seems more complicated than it needs to be. Can't this be computed more directly?