Skip to content

Commit 3505594

Browse files
kieran-maceClaude
and
Claude
committed
Add bin_prop computed variable to stat_bin
Brings feature parity with stat_count by adding `after_stat(bin_prop)` functionality to stat_bin. The bin_prop variable shows the proportion of each group within each bin, enabling proportion-based visualizations for binned continuous data. Key features: - bin_prop = count_in_group / total_count_in_bin - Works with multiple groups and respects weights - Backwards compatible (bin_prop = 1 for single groups) - Properly handles empty bins Usage: ggplot(data, aes(x = continuous_var, y = after_stat(bin_prop), fill = group)) + stat_bin(geom = "col", position = "dodge") 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
1 parent 4263d12 commit 3505594

File tree

2 files changed

+124
-1
lines changed

2 files changed

+124
-1
lines changed

R/stat-bin.R

Lines changed: 50 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,54 @@ StatBin <- ggproto(
6363
flip_data(bins, flipped_aes)
6464
},
6565

66+
compute_panel = function(self, data, scales, binwidth = NULL, bins = NULL,
67+
center = NULL, boundary = NULL,
68+
closed = c("right", "left"), pad = FALSE,
69+
breaks = NULL, flipped_aes = FALSE, drop = "none") {
70+
# First call parent's compute_panel to get binned data for all groups
71+
data <- ggproto_parent(Stat, self)$compute_panel(
72+
data, scales, binwidth = binwidth, bins = bins,
73+
center = center, boundary = boundary, closed = closed,
74+
pad = pad, breaks = breaks, flipped_aes = flipped_aes, drop = drop
75+
)
76+
77+
# Only calculate bin_prop if we have the necessary columns and multiple groups
78+
if (!is.null(data) && nrow(data) > 0 &&
79+
all(c("count", "xmin", "xmax") %in% names(data))) {
80+
81+
# Calculate bin_prop: proportion of each group within each bin
82+
# Create a unique bin identifier using rounded values to handle floating point precision
83+
data$bin_id <- paste(round(data$xmin, 10), round(data$xmax, 10), sep = "_")
84+
85+
# Calculate total count per bin across all groups
86+
bin_totals <- stats::aggregate(data$count, by = list(bin_id = data$bin_id), FUN = sum)
87+
names(bin_totals)[2] <- "bin_total"
88+
89+
# Merge back to get bin totals for each row
90+
data <- merge(data, bin_totals, by = "bin_id", sort = FALSE)
91+
92+
# Calculate bin_prop: count within group / total count in bin
93+
# When bin_total = 0 (empty bin), set bin_prop based on whether there are multiple groups
94+
n_groups <- length(unique(data$group))
95+
if (n_groups == 1) {
96+
# With only one group, bin_prop is always 1 (100% of the bin belongs to this group)
97+
data$bin_prop <- 1
98+
} else {
99+
# With multiple groups, bin_prop = count / total_count_in_bin, or 0 for empty bins
100+
data$bin_prop <- ifelse(data$bin_total > 0, data$count / data$bin_total, 0)
101+
}
102+
103+
# Remove the temporary columns
104+
data$bin_id <- NULL
105+
data$bin_total <- NULL
106+
} else {
107+
# If we don't have the necessary data, just add a default bin_prop column
108+
data$bin_prop <- if (nrow(data) > 0) rep(1, nrow(data)) else numeric(0)
109+
}
110+
111+
data
112+
},
113+
66114
default_aes = aes(x = after_stat(count), y = after_stat(count), weight = 1),
67115

68116
required_aes = "x|y",
@@ -108,7 +156,8 @@ StatBin <- ggproto(
108156
#' density = "density of points in bin, scaled to integrate to 1.",
109157
#' ncount = "count, scaled to a maximum of 1.",
110158
#' ndensity = "density, scaled to a maximum of 1.",
111-
#' width = "widths of bins."
159+
#' width = "widths of bins.",
160+
#' bin_prop = "proportion of points in bin that belong to each group."
112161
#' )
113162
#'
114163
#' @section Dropped variables:

tests/testthat/test-stat-bin.R

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -257,3 +257,77 @@ test_that("stat_count preserves x order for continuous and discrete", {
257257
expect_identical(b$layout$panel_params[[1]]$x$get_labels(), c("4","1","2","3","6","8"))
258258
expect_identical(b$data[[1]]$y, c(10,7,10,3,1,1))
259259
})
260+
261+
# Test bin_prop functionality ---------------------------------------------
262+
263+
test_that("stat_bin calculates bin_prop correctly", {
264+
# Create test data with two distinct groups
265+
test_data <- data_frame(
266+
x = c(rep(c(1, 2, 3, 4, 5), each = 10), rep(c(3, 4, 5, 6, 7), each = 10)),
267+
group = rep(c("A", "B"), each = 50)
268+
)
269+
270+
# Test with 5 bins to get predictable overlap
271+
p <- ggplot(test_data, aes(x, fill = group)) + geom_histogram(bins = 5)
272+
data <- get_layer_data(p)
273+
274+
# bin_prop should be available
275+
expect_true("bin_prop" %in% names(data))
276+
277+
# All bin_prop values should be between 0 and 1
278+
expect_true(all(data$bin_prop >= 0 & data$bin_prop <= 1))
279+
280+
# For bins that contain both groups, bin_prop should sum to 1 across groups
281+
bins_with_both_groups <- aggregate(data$count > 0, by = list(paste(data$xmin, data$xmax)), sum)
282+
overlapping_bins <- bins_with_both_groups[bins_with_both_groups$x == 2, ]$Group.1
283+
284+
for (bin in overlapping_bins) {
285+
bin_data <- data[paste(data$xmin, data$xmax) == bin, ]
286+
total_prop <- sum(bin_data$bin_prop)
287+
expect_equal(total_prop, 1, tolerance = 1e-6)
288+
}
289+
290+
# Test after_stat(bin_prop) usage
291+
p2 <- ggplot(test_data, aes(x, y = after_stat(bin_prop), fill = group)) +
292+
stat_bin(geom = "col", bins = 5, position = "dodge")
293+
data2 <- get_layer_data(p2)
294+
295+
# y should contain the bin_prop values
296+
expect_equal(data2$y, data2$bin_prop)
297+
})
298+
299+
test_that("stat_bin bin_prop works with single group", {
300+
# Test that bin_prop = 1 for all bins when there's only one group
301+
test_data <- data_frame(x = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
302+
303+
p <- ggplot(test_data, aes(x)) + geom_histogram(bins = 5)
304+
data <- get_layer_data(p)
305+
306+
# All bin_prop values should be 1 (even for empty bins)
307+
expect_true(all(data$bin_prop == 1))
308+
})
309+
310+
test_that("stat_bin bin_prop works with weights", {
311+
# Test that bin_prop calculation respects weights
312+
test_data <- data_frame(
313+
x = c(1, 1, 2, 2),
314+
group = c("A", "B", "A", "B"),
315+
w = c(1, 3, 2, 2) # Different weights for each observation
316+
)
317+
318+
p <- ggplot(test_data, aes(x, fill = group, weight = w)) + geom_histogram(bins = 2)
319+
data <- get_layer_data(p)
320+
321+
# bin_prop should be available
322+
expect_true("bin_prop" %in% names(data))
323+
324+
# Check that proportions are calculated correctly with weights
325+
# Bin 1: A=1, B=3, total=4, so A should have bin_prop=0.25, B should have bin_prop=0.75
326+
# Bin 2: A=2, B=2, total=4, so A should have bin_prop=0.5, B should have bin_prop=0.5
327+
bin1_data <- data[data$x == min(data$x), ]
328+
bin2_data <- data[data$x == max(data$x), ]
329+
330+
# For each bin, proportions should sum to 1
331+
expect_equal(sum(bin1_data$bin_prop), 1, tolerance = 1e-6)
332+
expect_equal(sum(bin2_data$bin_prop), 1, tolerance = 1e-6)
333+
})

0 commit comments

Comments
 (0)