Skip to content

Commit 4d213f0

Browse files
committed
fix #63 + some efforts to fix #64
1 parent bc2cb3f commit 4d213f0

File tree

10 files changed

+154
-23
lines changed

10 files changed

+154
-23
lines changed

src/InMemoryDatasets.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ export
6363
repeat!,
6464
select,
6565
select!,
66+
delete,
6667
mapcols,
6768
insertcols!,
6869
mask,

src/dataset/combine.jl

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -618,7 +618,7 @@ function combine(ds::Dataset, @nospecialize(args...); dropgroupcols = false, thr
618618
# we will use new_lengths later for assigning the grouping info of the new ds
619619
if _first_vector_res == 0
620620
new_lengths = ones(Int, ngroups)
621-
cumsum!(new_lengths, new_lengths)
621+
our_cumsum!(new_lengths)
622622
total_lengths = ngroups
623623
else
624624
if ms[_first_vector_res].first isa Tuple
@@ -637,7 +637,7 @@ function combine(ds::Dataset, @nospecialize(args...); dropgroupcols = false, thr
637637
_compute_the_mutli_row_trans!(special_res, new_lengths, _columns(ds)[index(ds)[ms[_first_vector_res].first]], nrow(ds), ms[_first_vector_res].second.first, _first_vector_res, starts, ngroups, threads)
638638
end
639639
# special_res, new_lengths = _compute_the_mutli_row_trans(ds, ms, _first_vector_res, starts, ngroups)
640-
cumsum!(new_lengths, new_lengths)
640+
our_cumsum!(new_lengths)
641641
total_lengths = new_lengths[end]
642642
end
643643
all_names = _names(ds)
@@ -715,7 +715,7 @@ function combine_ds(ds::AbstractDataset, @nospecialize(args...); threads = true)
715715
# we will use new_lengths later for assigning the grouping info of the new ds
716716
if _first_vector_res == 0
717717
new_lengths = ones(Int, ngroups)
718-
cumsum!(new_lengths, new_lengths)
718+
our_cumsum!(new_lengths)
719719
total_lengths = ngroups
720720
else
721721
if ms[_first_vector_res].first isa Tuple
@@ -734,7 +734,7 @@ function combine_ds(ds::AbstractDataset, @nospecialize(args...); threads = true)
734734
_compute_the_mutli_row_trans!(special_res, new_lengths, _columns(ds)[index(ds)[ms[_first_vector_res].first]], nrow(ds), ms[_first_vector_res].second.first, _first_vector_res, starts, ngroups, threads)
735735
end
736736
# special_res, new_lengths = _compute_the_mutli_row_trans(ds, ms, _first_vector_res, starts, ngroups)
737-
cumsum!(new_lengths, new_lengths)
737+
our_cumsum!(new_lengths)
738738
total_lengths = new_lengths[end]
739739
end
740740
all_names = _names(ds)

src/dataset/other.jl

Lines changed: 102 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -974,7 +974,7 @@ A convenient shortcut for `ds[byrow(ds, type, cols; ...), :]`.
974974
975975
`type` can be any function supported by `byrow` which returns a Vector{Bool} or BitVector.
976976
977-
See [`byrow`](@ref), [`filter!`](@ref)
977+
See [`byrow`](@ref), [`filter!`](@ref), [`delete!`](@ref), [`delete`](@ref)
978978
979979
# Examples
980980
@@ -1056,10 +1056,110 @@ It is a convenient shortcut for `deleteat!(ds, .!byrow(ds, type, cols; ...))`.
10561056
10571057
Refer to [`filter`](@ref) for examples.
10581058
1059-
See [`byrow`](@ref), [`filter`](@ref)
1059+
See [`byrow`](@ref), [`filter`](@ref), [`delete!`](@ref), [`delete`](@ref)
10601060
"""
10611061
Base.filter!(ds::Dataset, cols::Union{ColumnIndex, MultiColumnIndex}; type = all, kwargs...) = deleteat!(ds, .!byrow(ds, type, cols; kwargs...))
10621062

1063+
# filter out `true`s
1064+
"""
1065+
delete(ds::AbstractDataset, cols; [type = all,...])
1066+
1067+
A convenient shortcut for `ds[.!byrow(ds, type, cols; ...), :]`.
1068+
1069+
`type` can be any function supported by `byrow` which returns a Vector{Bool} or BitVector.
1070+
1071+
Compare to [`deleteat!`](@ref)
1072+
1073+
See [`delete!`](@ref), [`byrow`](@ref), [`filter!`](@ref), [`filter`](@ref)
1074+
1075+
# Examples
1076+
1077+
```jldoctest
1078+
julia> ds = Dataset(x = [1,2,3,4,5], y = [1.5,2.3,-1,0,2.0], z = Bool[1,0,1,0,1])
1079+
5×3 Dataset
1080+
Row │ x y z
1081+
│ identity identity identity
1082+
│ Int64? Float64? Bool?
1083+
─────┼──────────────────────────────
1084+
1 │ 1 1.5 true
1085+
2 │ 2 2.3 false
1086+
3 │ 3 -1.0 true
1087+
4 │ 4 0.0 false
1088+
5 │ 5 2.0 true
1089+
1090+
julia> delete(ds, :z)
1091+
2×3 Dataset
1092+
Row │ x y z
1093+
│ identity identity identity
1094+
│ Int64? Float64? Bool?
1095+
─────┼──────────────────────────────
1096+
1 │ 2 2.3 false
1097+
2 │ 4 0.0 false
1098+
1099+
julia> delete(ds, 1:2, by = [iseven, >(2.0)])
1100+
4×3 Dataset
1101+
Row │ x y z
1102+
│ identity identity identity
1103+
│ Int64? Float64? Bool?
1104+
─────┼──────────────────────────────
1105+
1 │ 1 1.5 true
1106+
2 │ 3 -1.0 true
1107+
3 │ 4 0.0 false
1108+
4 │ 5 2.0 true
1109+
1110+
julia> delete(ds, 1:2, type = any, by = [iseven, >(2.0)])
1111+
3×3 Dataset
1112+
Row │ x y z
1113+
│ identity identity identity
1114+
│ Int64? Float64? Bool?
1115+
─────┼──────────────────────────────
1116+
1 │ 1 1.5 true
1117+
2 │ 3 -1.0 true
1118+
3 │ 5 2.0 true
1119+
1120+
julia> delete(ds, 1:3, type = issorted, rev = true)
1121+
3×3 Dataset
1122+
Row │ x y z
1123+
│ identity identity identity
1124+
│ Int64? Float64? Bool?
1125+
─────┼──────────────────────────────
1126+
1 │ 1 1.5 true
1127+
2 │ 2 2.3 false
1128+
3 │ 3 -1.0 true
1129+
1130+
julia> delete(ds, 2:3, type = isless, with = :x)
1131+
2×3 Dataset
1132+
Row │ x y z
1133+
│ identity identity identity
1134+
│ Int64? Float64? Bool?
1135+
─────┼──────────────────────────────
1136+
1 │ 1 1.5 true
1137+
2 │ 2 2.3 false
1138+
```
1139+
"""
function delete(ds::AbstractDataset, cols::Union{ColumnIndex, MultiColumnIndex}; view = false, type = all, kwargs...)
    # Rows for which `byrow` yields `true` are the ones to drop, so keep the complement.
    keep = .!byrow(ds, type, cols; kwargs...)
    # The `view` keyword shadows `Base.view` inside this method, hence the qualified call.
    return view ? Base.view(ds, keep, :) : ds[keep, :]
end
"""
    delete!(ds::Dataset, cols; [type = all, ...])

Variant of `delete` which removes the selected rows from the passed data set in place.

It is a convenient shortcut for `deleteat!(ds, byrow(ds, type, cols; ...))`.

`type` can be any function supported by `byrow` which returns a Vector{Bool} or BitVector.

Compare to [`deleteat!`](@ref)

Refer to [`delete`](@ref) for examples.

See [`delete`](@ref), [`byrow`](@ref), [`filter`](@ref), [`filter!`](@ref)
"""
function Base.delete!(ds::Dataset, cols::Union{ColumnIndex, MultiColumnIndex}; type = all, kwargs...)
    # Rows flagged `true` by `byrow` are exactly the rows handed to `deleteat!`.
    rows_to_drop = byrow(ds, type, cols; kwargs...)
    return deleteat!(ds, rows_to_drop)
end
10631163

10641164
"""
10651165
mapcols(ds::AbstractDataset, f, cols)

src/dataset/transpose.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -814,7 +814,7 @@ function flatten!(ds::Dataset,
814814
sort_permute_idxcols = [1]
815815
idxcols_sorted = idxcols
816816
end
817-
cumsum!(lengths, lengths)
817+
our_cumsum!(lengths)
818818
for col in 1:length(idxcols_sorted)
819819
col_to_flatten = all_idxcols[sort_permute_idxcols[col]]
820820

@@ -884,7 +884,7 @@ function flatten(ds::AbstractDataset,
884884
sort_permute_idxcols = [1]
885885
idxcols_sorted = idxcols
886886
end
887-
cumsum!(lengths, lengths)
887+
our_cumsum!(lengths)
888888
for col in 1:length(idxcols_sorted)
889889
col_to_flatten = all_idxcols[sort_permute_idxcols[col]]
890890

src/join/closejoin.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -336,7 +336,7 @@ function _join_closejoin(dsl, dsr::AbstractDataset, ::Val{T}; onleft, onright, m
336336
ranges, a, idx, minval, reps, sz, right_cols_2= _find_ranges_for_join_using_hash(dsl, dsr, onleft[1:end-1], onright[1:end-1], mapformats, true, Val(T), threads = threads)
337337
filter!(!=(0), reps)
338338
pushfirst!(reps, 1)
339-
cumsum!(reps, reps)
339+
our_cumsum!(reps)
340340
pop!(reps)
341341
grng = GIVENRANGE(idx, reps, Int[], length(reps))
342342
starts, idx, last_valid_range = _sort_for_join_after_hash(dsr, oncols_right[end], stable, alg, mapformats, nsfpaj, grng, threads = threads)

src/join/join.jl

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -420,7 +420,7 @@ function _mark_lt_part!(inbits, x_l, x_r, _fl::F1, _fr::F2, ranges, r_perms, en,
420420
ranges[i] = 1:0
421421
end
422422
end
423-
cumsum!(revised_ends, revised_ends)
423+
our_cumsum!(revised_ends)
424424
end
425425

426426
function _change_refpool_find_range_for_join!(ranges, dsl, dsr, r_perms, oncols_left, oncols_right, lmf, rmf, j; type = :both, nsfpaj = true, threads = true)
@@ -487,7 +487,7 @@ function _join_left(dsl, dsr, ::Val{T}; onleft, onright, makeunique = false, map
487487
end
488488
end
489489
new_ends = map(x -> max(1, length(x)), ranges)
490-
cumsum!(new_ends, new_ends)
490+
our_cumsum!(new_ends)
491491
total_length = new_ends[end]
492492

493493
if check
@@ -579,7 +579,7 @@ function _join_left!(dsl::Dataset, dsr::AbstractDataset, ::Val{T}; onleft, onrig
579579
end
580580

581581
new_ends = map(x -> max(1, length(x)), ranges)
582-
cumsum!(new_ends, new_ends)
582+
our_cumsum!(new_ends)
583583
total_length = new_ends[end]
584584

585585
if check
@@ -673,7 +673,7 @@ function _join_inner(dsl, dsr::AbstractDataset, ::Val{T}; onleft, onright, onrig
673673
ranges, a, idx, minval, reps, sz, right_cols_2 = _find_ranges_for_join_using_hash(dsl, dsr, onleft[1:end-1], oncols_right[1:end-1], mapformats, true, Val(T); threads = threads)
674674
filter!(!=(0), reps)
675675
pushfirst!(reps, 1)
676-
cumsum!(reps, reps)
676+
our_cumsum!(reps)
677677
pop!(reps)
678678
grng = GIVENRANGE(idx, reps, Int[], length(reps))
679679
starts, idx, last_valid_range = _sort_for_join_after_hash(dsr, right_range_cols[1], stable, alg, mapformats, nsfpaj, grng; threads = threads)
@@ -700,7 +700,7 @@ function _join_inner(dsl, dsr::AbstractDataset, ::Val{T}; onleft, onright, onrig
700700

701701

702702
new_ends = map(length, ranges)
703-
cumsum!(new_ends, new_ends)
703+
our_cumsum!(new_ends)
704704
total_length = new_ends[end]
705705

706706
inbits = nothing
@@ -896,7 +896,7 @@ function _join_outer(dsl, dsr::AbstractDataset, ::Val{T}; onleft, onright, makeu
896896
end
897897
new_ends = map(x -> max(1, length(x)), ranges)
898898
notinleft = _find_right_not_in_left(ranges, nrow(dsr), idx)
899-
cumsum!(new_ends, new_ends)
899+
our_cumsum!(new_ends)
900900
total_length = new_ends[end] + length(notinleft)
901901

902902
if check

src/join/join_dict.jl

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,7 @@ function _join_left_dict(dsl, dsr, ranges, onleft, onright, right_cols, ::Val{T}
209209
_fill_ranges_for_dict_join!(ranges, dict, maxprob, _fl, _fr, _columns(dsl)[onleft[1]], _columns(dsr)[onright[1]], sz, type, threads = threads)
210210

211211
new_ends = map(x -> max(1, length(x)), ranges)
212-
cumsum!(new_ends, new_ends)
212+
our_cumsum!(new_ends)
213213
total_length = new_ends[end]
214214

215215
if check
@@ -292,7 +292,7 @@ function _join_left!_dict(dsl, dsr, ranges, onleft, onright, right_cols, ::Val{T
292292
throw(ArgumentError("`leftjoin!` can only be used when each observation in left data set matches at most one observation from right data set"))
293293
end
294294
new_ends = map(x -> max(1, length(x)), ranges)
295-
cumsum!(new_ends, new_ends)
295+
our_cumsum!(new_ends)
296296
total_length = new_ends[end]
297297

298298
if check
@@ -354,7 +354,7 @@ function _join_inner_dict(dsl, dsr, ranges, onleft, onright, right_cols, ::Val{T
354354
_fill_ranges_for_dict_join!(ranges, dict, maxprob, _fl, _fr, _columns(dsl)[onleft[1]], _columns(dsr)[onright[1]], sz, type, threads = threads)
355355

356356
new_ends = map(length, ranges)
357-
cumsum!(new_ends, new_ends)
357+
our_cumsum!(new_ends)
358358
total_length = new_ends[end]
359359

360360
if check
@@ -430,7 +430,7 @@ function _join_outer_dict(dsl, dsr, ranges, onleft, onright, oncols_left, oncols
430430
_fill_ranges_for_dict_join!(ranges, dict, maxprob, _fl, _fr, _columns(dsl)[onleft[1]], _columns(dsr)[onright[1]], sz, type, threads = threads)
431431
new_ends = map(x -> max(1, length(x)), ranges)
432432
notinleft = _find_right_not_in_left(ranges, nrow(dsr), 1:nrow(dsr))
433-
cumsum!(new_ends, new_ends)
433+
our_cumsum!(new_ends)
434434
total_length = new_ends[end] + length(notinleft)
435435

436436
if check

src/other/utils.jl

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,14 @@ function allocatecol(T, len)
9393
_our_vect_alloc(Union{Missing, T}, len)
9494
end
9595

"""
    our_cumsum!(x)

Overwrite `x` with its running (cumulative) sum and return `x`.

Every element after the first is incremented by its already-accumulated
predecessor. Empty and single-element inputs are returned unchanged.
"""
function our_cumsum!(x)
    # Derive the bounds from the array itself instead of assuming 1-based
    # indexing, so that `@inbounds` is valid for whatever axes `x` has.
    @inbounds for i in (firstindex(x) + 1):lastindex(x)
        x[i] += x[i - 1]
    end
    return x
end
102+
103+
96104
function _generate_inverted_dict_pool(x)
97105
invp = DataAPI.invrefpool(x)
98106
if invp isa Dict
@@ -174,7 +182,7 @@ function _sortitout!(res, starts, x)
174182
starts[x[i] + 1] += 1
175183
end
176184
starts_normalised = map(>(0), starts)
177-
cumsum!(starts, starts)
185+
our_cumsum!(starts)
178186
for i in 1:length(x)
179187
label = x[i]
180188
res[starts[label]] = i
@@ -226,7 +234,7 @@ function _calculate_ends(groups, ngroups, ::Val{T}) where T
226234
@inbounds for i = 1:length(groups)
227235
where[groups[i]] += 1
228236
end
229-
START_END(false, length(groups), cumsum!(where, where))
237+
START_END(false, length(groups), our_cumsum!(where))
230238
end
231239

232240

src/sort/groupby.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -196,7 +196,7 @@ function combine(gds::Union{GroupBy, GatherBy}, @nospecialize(args...); dropgrou
196196
# we will use new_lengths later for assigning the grouping info of the new ds
197197
if _first_vector_res == 0
198198
new_lengths = ones(Int, ngroups)
199-
cumsum!(new_lengths, new_lengths)
199+
our_cumsum!(new_lengths)
200200
total_lengths = ngroups
201201
else
202202
if ms[_first_vector_res].first isa Tuple
@@ -215,7 +215,7 @@ function combine(gds::Union{GroupBy, GatherBy}, @nospecialize(args...); dropgrou
215215
_compute_the_mutli_row_trans!(special_res, new_lengths, _threaded_permute_for_groupby(_columns(gds.parent)[index(gds.parent)[ms[_first_vector_res].first]], a[1], threads = threads), nrow(gds.parent), ms[_first_vector_res].second.first, _first_vector_res, starts, ngroups, threads)
216216
end
217217
# special_res, new_lengths = _compute_the_mutli_row_trans(ds, ms, _first_vector_res, starts, ngroups)
218-
cumsum!(new_lengths, new_lengths)
218+
our_cumsum!(new_lengths)
219219
total_lengths = new_lengths[end]
220220
end
221221
all_names = _names(gds.parent)

test/data.jl

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -365,6 +365,28 @@ end
365365
@test byrow(ds, all, :, by = [>(5), ==(10)], threads = false) == [falses(500);trues(500)]
366366
@test byrow(mask(view(ds, nrow(ds):-1:1, ncol(ds):-1:1), [>(5), ==(10)], [2,1], threads = false), all, threads = false) == [trues(500);falses(500)]
367367
@test byrow(view(ds, nrow(ds):-1:1, ncol(ds):-1:1), all, [2,1], by = [>(5), ==(10)], threads = false) == [trues(500);falses(500)]
368+
369+
370+
ds = Dataset(x=[3, 1, 2, 1], y=["b", "c", "a", "b"])
371+
@test delete(ds, 1, by = >(1)) == Dataset(x=[1, 1], y=["c", "b"])
372+
@test delete!(ds, 1, by = >(1)) === ds == Dataset(x=[1, 1], y=["c", "b"])
373+
374+
ds = Dataset(x=[3, 1, 2, 1], y=["b", "c", "a", "b"])
375+
@test delete(ds, :x, by = >(1)) == Dataset(x=[1, 1], y=["c", "b"])
376+
@test delete!(ds, :x, by = >(1)) === ds == Dataset(x=[1, 1], y=["c", "b"])
377+
378+
ds = Dataset(x = [1,2,missing,1], y = ["a", "d", "c", "f"])
379+
@test delete(ds, :, type = all, by = [isequal(1), >("a")]) == ds[[true, true, true, false],:]
380+
setformat!(ds, 1=>isodd)
381+
@test delete(ds, :, type = all, by = [isequal(1), >("a")]) == ds[[true, true, true, false],:]
382+
@test delete(ds, :, type = all, by = [isequal(1), >("a")], mapformats = true) == ds[[true, true, true, false],:]
383+
@test delete(ds, :, by = [isequal(1), ==("a")], mapformats = true) == ds[[false, true, true, true],:]
384+
setformat!(ds, 1=>iseven)
385+
@test delete(ds, 1, by = isequal(1), mapformats = true) == ds[[true, false, true, true],:]
386+
387+
ds = Dataset(x = repeat(1:10, inner = 100), y = 10)
388+
@test delete(ds, :, by = [>(5), ==(10)]) == ds[[trues(500);falses(500)],:]
389+
@test delete(view(ds, nrow(ds):-1:1, ncol(ds):-1:1), [2,1], by = [>(5), ==(10)]) == view(ds, nrow(ds):-1:1, ncol(ds):-1:1)[[falses(500);trues(500)],:]
368390
end
369391

370392
@testset "ffill, ffill!, bfill, bfill!" begin

0 commit comments

Comments
 (0)