Skip to content

Add bin_prop computed variable to stat_bin #6477

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 50 additions & 1 deletion R/stat-bin.R
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,54 @@ StatBin <- ggproto(
flip_data(bins, flipped_aes)
},

compute_panel = function(self, data, scales, binwidth = NULL, bins = NULL,
center = NULL, boundary = NULL,
closed = c("right", "left"), pad = FALSE,
breaks = NULL, flipped_aes = FALSE, drop = "none") {
# First call parent's compute_panel to get binned data for all groups
data <- ggproto_parent(Stat, self)$compute_panel(
data, scales, binwidth = binwidth, bins = bins,
center = center, boundary = boundary, closed = closed,
pad = pad, breaks = breaks, flipped_aes = flipped_aes, drop = drop
)

# Only calculate bin_prop if we have the necessary columns and multiple groups
if (!is.null(data) && nrow(data) > 0 &&
all(c("count", "xmin", "xmax") %in% names(data))) {

# Calculate bin_prop: proportion of each group within each bin
# Create a unique bin identifier using rounded values to handle floating point precision
data$bin_id <- paste(round(data$xmin, 10), round(data$xmax, 10), sep = "_")

# Calculate total count per bin across all groups
bin_totals <- stats::aggregate(data$count, by = list(bin_id = data$bin_id), FUN = sum)
names(bin_totals)[2] <- "bin_total"

# Merge back to get bin totals for each row
data <- merge(data, bin_totals, by = "bin_id", sort = FALSE)

# Calculate bin_prop: count within group / total count in bin
# When bin_total = 0 (empty bin), set bin_prop based on whether there are multiple groups
n_groups <- length(unique(data$group))
if (n_groups == 1) {
# With only one group, bin_prop is always 1 (100% of the bin belongs to this group)
data$bin_prop <- 1
} else {
# With multiple groups, bin_prop = count / total_count_in_bin, or 0 for empty bins
data$bin_prop <- ifelse(data$bin_total > 0, data$count / data$bin_total, 0)
}

# Remove the temporary columns
data$bin_id <- NULL
data$bin_total <- NULL
} else {
# If we don't have the necessary data, just add a default bin_prop column
data$bin_prop <- if (nrow(data) > 0) rep(1, nrow(data)) else numeric(0)
}
Comment on lines +78 to +109
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This all seems more complicated than it needs to be. Can't this be computed more directly?


data
},

default_aes = aes(x = after_stat(count), y = after_stat(count), weight = 1),

required_aes = "x|y",
Expand Down Expand Up @@ -108,7 +156,8 @@ StatBin <- ggproto(
#' density = "density of points in bin, scaled to integrate to 1.",
#' ncount = "count, scaled to a maximum of 1.",
#' ndensity = "density, scaled to a maximum of 1.",
#' width = "widths of bins."
#' width = "widths of bins.",
#' bin_prop = "proportion of points in bin that belong to each group."
Comment on lines +159 to +160
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you regenerate the .Rd files as well?

#' )
#'
#' @section Dropped variables:
Expand Down
74 changes: 74 additions & 0 deletions tests/testthat/test-stat-bin.R
Original file line number Diff line number Diff line change
Expand Up @@ -257,3 +257,77 @@ test_that("stat_count preserves x order for continuous and discrete", {
expect_identical(b$layout$panel_params[[1]]$x$get_labels(), c("4","1","2","3","6","8"))
expect_identical(b$data[[1]]$y, c(10,7,10,3,1,1))
})

# Test bin_prop functionality ---------------------------------------------

test_that("stat_bin calculates bin_prop correctly", {
# Create test data with two distinct groups
test_data <- data_frame(
x = c(rep(c(1, 2, 3, 4, 5), each = 10), rep(c(3, 4, 5, 6, 7), each = 10)),
group = rep(c("A", "B"), each = 50)
)

# Test with 5 bins to get predictable overlap
p <- ggplot(test_data, aes(x, fill = group)) + geom_histogram(bins = 5)
Comment on lines +270 to +271
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Breaks can be set directly if predictability is an issue

data <- get_layer_data(p)

# bin_prop should be available
expect_true("bin_prop" %in% names(data))

# All bin_prop values should be between 0 and 1
expect_true(all(data$bin_prop >= 0 & data$bin_prop <= 1))

# For bins that contain both groups, bin_prop should sum to 1 across groups
bins_with_both_groups <- aggregate(data$count > 0, by = list(paste(data$xmin, data$xmax)), sum)
overlapping_bins <- bins_with_both_groups[bins_with_both_groups$x == 2, ]$Group.1

for (bin in overlapping_bins) {
bin_data <- data[paste(data$xmin, data$xmax) == bin, ]
total_prop <- sum(bin_data$bin_prop)
expect_equal(total_prop, 1, tolerance = 1e-6)
}
Comment on lines +281 to +288
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't it more simple to test that the sum over bins is 1, regardless of how many groups?


# Test after_stat(bin_prop) usage
p2 <- ggplot(test_data, aes(x, y = after_stat(bin_prop), fill = group)) +
stat_bin(geom = "col", bins = 5, position = "dodge")
data2 <- get_layer_data(p2)

# y should contain the bin_prop values
expect_equal(data2$y, data2$bin_prop)
})

test_that("stat_bin bin_prop works with single group", {
# Test that bin_prop = 1 for all bins when there's only one group
test_data <- data_frame(x = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))

p <- ggplot(test_data, aes(x)) + geom_histogram(bins = 5)
data <- get_layer_data(p)

# All bin_prop values should be 1 (even for empty bins)
expect_true(all(data$bin_prop == 1))
})

test_that("stat_bin bin_prop works with weights", {
# Test that bin_prop calculation respects weights
test_data <- data_frame(
x = c(1, 1, 2, 2),
group = c("A", "B", "A", "B"),
w = c(1, 3, 2, 2) # Different weights for each observation
)

p <- ggplot(test_data, aes(x, fill = group, weight = w)) + geom_histogram(bins = 2)
data <- get_layer_data(p)

# bin_prop should be available
expect_true("bin_prop" %in% names(data))

# Check that proportions are calculated correctly with weights
# Bin 1: A=1, B=3, total=4, so A should have bin_prop=0.25, B should have bin_prop=0.75
# Bin 2: A=2, B=2, total=4, so A should have bin_prop=0.5, B should have bin_prop=0.5
bin1_data <- data[data$x == min(data$x), ]
bin2_data <- data[data$x == max(data$x), ]
Comment on lines +327 to +328
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
bin1_data <- data[data$x == min(data$x), ]
bin2_data <- data[data$x == max(data$x), ]
bin1_data <- data[data$x == 1, ]
bin2_data <- data[data$x == 2, ]

We know from the test data what these values should be


# For each bin, proportions should sum to 1
expect_equal(sum(bin1_data$bin_prop), 1, tolerance = 1e-6)
expect_equal(sum(bin2_data$bin_prop), 1, tolerance = 1e-6)
})