Skip to content

Commit 7342d9a

Browse files
committed
add longest_common_prefix_avx2_8, but is slower than 64bit version
1 parent 7af99c6 commit 7342d9a

File tree

5 files changed

+46
-0
lines changed

5 files changed

+46
-0
lines changed

bench_lcp.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,10 @@ BENCHMARK_F(LCP, sse, LCPFixture, 0, 10000)
9090

9191

9292
#ifdef __AVX2__
93+
// Benchmark for the 8-bit AVX2 variant: calls longest_common_prefix_avx2_8 on
// instance().m_stra / instance().m_strb over instance().m_length characters.
// The result is passed to celero::DoNotOptimizeAway (presumably so that the
// otherwise unused call is not optimized out — see the Celero documentation).
BENCHMARK_F(LCP, avx2s, LCPFixture, 0, 10000)
94+
{
95+
celero::DoNotOptimizeAway(longest_common_prefix_avx2_8(instance().m_stra, instance().m_strb, instance().m_length));
96+
}
9397
BENCHMARK_F(LCP, avx2, LCPFixture, 0, 10000)
9498
{
9599
celero::DoNotOptimizeAway(longest_common_prefix_avx2(instance().m_stra, instance().m_strb, instance().m_length));

lcp.cpp

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,37 @@ size_t longest_common_prefix_sse(const char*const a, const char*const b, const s
136136
}
137137
#endif //SSE2
138138

139+
139140
#ifdef __AVX2__
141+
142+
size_t longest_common_prefix_avx2_8(const char*const a, const char*const b, const size_t length) {
143+
constexpr size_t register_size = 256/8;
144+
size_t c_length = 0; // c_length in register_size units
145+
for(; c_length < length / register_size; ++c_length) {
146+
__m256i ma = _mm256_load_si256((__m256i*)(a+c_length*register_size));
147+
__m256i mb = _mm256_load_si256((__m256i*)(b+c_length*register_size));
148+
const unsigned int mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(ma, mb));
149+
if(~mask == 0) {
150+
continue;
151+
}
152+
if(mask == 0) {
153+
return c_length == 0 ? 0 : (c_length-1)*register_size;
154+
}
155+
const size_t pos = __builtin_ctz(~mask);
156+
157+
const size_t ret = c_length*register_size + pos;
158+
DCHECK_EQ(ret, longest_common_prefix_character(a, b, length));
159+
return ret;
160+
}
161+
const size_t ret = c_length*register_size +
162+
longest_common_prefix_packed(
163+
a + c_length*register_size,
164+
b + c_length*register_size,
165+
length - c_length*register_size);
166+
DCHECK_EQ(ret, longest_common_prefix_character(a, b, length));
167+
return ret;
168+
}
169+
140170
size_t longest_common_prefix_avx2(const char*const a, const char*const b, const size_t length) {
141171
constexpr size_t register_size = 256/8;
142172
size_t c_length = 0; // c_length in register_size units

lcp.hpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,18 @@ size_t longest_common_prefix(const uint64_t* a, const uint64_t* b, const size_t
1818
size_t longest_common_prefix_character(const uint64_t*const a, const uint64_t*const b, const size_t length);
1919

2020
#ifdef __SSE3__
21+
/**
22+
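* uses _mm256_cmpeq_epi64 to compare 64-bit packed characters, and falls back to longest_common_prefix_packed for the final packed character. NOTE(review): _mm256_cmpeq_epi64 is an AVX2 intrinsic, yet this declaration sits under #ifdef __SSE3__ — confirm whether this comment belongs to longest_common_prefix_avx2 instead.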
23+
*/
2124
size_t longest_common_prefix_sse(const uint64_t*const a, const uint64_t*const b, const size_t length);
2225
#endif
2326

2427
#ifdef __AVX2__
2528
size_t longest_common_prefix_avx2(const uint64_t*const a, const uint64_t*const b, const size_t length);
29+
/**
30+
* Uses _mm256_cmpeq_epi8 to compare 8-bit characters in blocks of 256 bits (32 characters per step); the remaining characters are handled by longest_common_prefix_packed.
31+
*/
32+
size_t longest_common_prefix_avx2_8(const char*const a, const char*const b, const size_t length);
2633
#endif
2734

2835

precalc_lcp_common.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,7 @@ const char*const lcp_name[] =
106106
, "sse"
107107
#endif
108108
#ifdef __AVX2__
109+
, "avx2_8"
109110
, "avx2"
110111
#endif
111112
#ifdef __AVX512__
@@ -119,6 +120,7 @@ const char*const lcp_name[] =
119120
, packed::longest_common_prefix_sse
120121
#endif
121122
#ifdef __AVX2__
123+
, packed::longest_common_prefix_avx2_8
122124
, packed::longest_common_prefix_avx2
123125
#endif
124126
#ifdef __AVX512__

test_lcp.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,9 @@ TEST(LCP64_off, sse) { test64_lcp_offset(longest_common_prefix_sse); }
9999
#endif
100100

101101
#ifdef __AVX2__
102+
// Run the test_lcp and test_eq drivers against longest_common_prefix_avx2_8.
TEST(LCP, avx2_8) { test_lcp(longest_common_prefix_avx2_8); }
103+
TEST(LCPeq, avx2_8) { test_eq(longest_common_prefix_avx2_8); }
104+
102105
TEST(LCP, avx2) { test_lcp(longest_common_prefix_avx2); }
103106
TEST(LCPeq, avx2) { test_eq(longest_common_prefix_avx2); }
104107

0 commit comments

Comments
 (0)