Skip to content

Commit

Permalink
Perf tuning for gcc + aarch64
Browse files Browse the repository at this point in the history
  • Loading branch information
RedBeard0531 committed Jan 18, 2024
1 parent 27f34a5 commit 5ec5d16
Show file tree
Hide file tree
Showing 2 changed files with 83 additions and 20 deletions.
24 changes: 18 additions & 6 deletions snappy-internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -233,9 +233,7 @@ static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
int shift = Bits::FindLSBSetNonZero64(xorval);
size_t matched_bytes = shift >> 3;
uint64_t a3 = UNALIGNED_LOAD64(s2 + 4);
#ifndef __x86_64__
a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2;
#else
#ifdef __x86_64__
// Ideally this would just be
//
// a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2;
Expand All @@ -250,6 +248,14 @@ static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
: "+r"(a2)
: "r"(a3), "r"(xorval)
: "cc");
#elif defined(__aarch64__)
asm("cmp %w[xorval], 0\n\t"
"csel %x[a2], %[a3], %[a2], eq\n\t"
: [a2] "+r" (a2)
: [a3] "r" (a3) , [xorval] "r" (xorval)
: "cc");
#else
a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2;
#endif
*data = a2 >> (shift & (3 * 8));
return std::pair<size_t, bool>(matched_bytes, true);
Expand All @@ -276,14 +282,20 @@ static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
int shift = Bits::FindLSBSetNonZero64(xorval);
size_t matched_bytes = shift >> 3;
uint64_t a3 = UNALIGNED_LOAD64(s2 + 4);
#ifndef __x86_64__
a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2;
#else
#ifdef __x86_64__
asm("testl %k2, %k2\n\t"
"cmovzq %1, %0\n\t"
: "+r"(a2)
: "r"(a3), "r"(xorval)
: "cc");
#elif defined(__aarch64__)
asm("cmp %w[xorval], 0\n\t"
"csel %x[a2], %[a3], %[a2], eq\n\t"
: [a2] "+r" (a2)
: [a3] "r" (a3) , [xorval] "r" (xorval)
: "cc");
#else
a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2;
#endif
*data = a2 >> (shift & (3 * 8));
matched += matched_bytes;
Expand Down
79 changes: 65 additions & 14 deletions snappy.cc
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,57 @@ using internal::V128_StoreU;
using internal::V128_DupChar;
#endif

// GCC dispatches to libc for memmoves > 16 bytes, so we need to
// do some work to get good code from that compiler. Clang handles
// powers-of-2 at least up to 64 well.
#if !defined(__GNUC__) || defined(__clang__)
// Moves exactly SIZE bytes from src to dest. The length is a template
// argument, so memmove is always called with a compile-time constant size.
// This definition is the one compiled when __GNUC__ is undefined or
// __clang__ is defined (see the enclosing #if).
template <size_t SIZE>
SNAPPY_ATTRIBUTE_ALWAYS_INLINE inline void FixedSizeMemMove(void* dest,
                                                            const void* src) {
  std::memmove(dest, src, SIZE);
}
#else

// Moves exactly SIZE bytes from src to dest; this is the definition on the
// #else side of the compiler check. SIZE is a template argument, so the size
// test below is a constant expression for every instantiation.
template <size_t SIZE>
SNAPPY_ATTRIBUTE_ALWAYS_INLINE inline void FixedSizeMemMove(void* dest,
                                                            const void* src) {
  if (SIZE > 16) {
    // Copy through a local staging buffer: the source is read in full before
    // the destination is written. This generates reasonable code on x86_64,
    // but on aarch64 it produces a dead store to the staging buffer and takes
    // up stack space.
    char staging[SIZE];
    memcpy(staging, src, SIZE);
    memcpy(dest, staging, SIZE);
    return;
  }
  // gcc has patterns for memmove up to 16 bytes.
  memmove(dest, src, SIZE);
}

#ifdef __aarch64__ // Implies neon support
// 32-byte specialization: two V128 loads followed by two V128 stores.
// Both loads are issued before either store, so the source is read in full
// before any byte of the destination is written.
template <>
SNAPPY_ATTRIBUTE_ALWAYS_INLINE inline void FixedSizeMemMove<32>(
    void* dest, const void* src) {
  const V128* in = reinterpret_cast<const V128*>(src);
  V128* out = reinterpret_cast<V128*>(dest);

  V128 lo = V128_LoadU(in);
  V128 hi = V128_LoadU(in + 1);

  V128_StoreU(out, lo);
  V128_StoreU(out + 1, hi);
}

// 64-byte specialization: four V128 loads followed by four V128 stores.
// All loads are issued before the first store, so the source is read in full
// before any byte of the destination is written.
template <>
SNAPPY_ATTRIBUTE_ALWAYS_INLINE inline void FixedSizeMemMove<64>(
    void* dest, const void* src) {
  const V128* in = reinterpret_cast<const V128*>(src);
  V128* out = reinterpret_cast<V128*>(dest);

  V128 v0 = V128_LoadU(in);
  V128 v1 = V128_LoadU(in + 1);
  V128 v2 = V128_LoadU(in + 2);
  V128 v3 = V128_LoadU(in + 3);

  V128_StoreU(out, v0);
  V128_StoreU(out + 1, v1);
  V128_StoreU(out + 2, v2);
  V128_StoreU(out + 3, v3);
}
#endif
#endif

// We translate the information encoded in a tag through a lookup table to a
// format that requires fewer instructions to decode. Effectively we store
// the length minus the tag part of the offset. The lowest significant byte
Expand Down Expand Up @@ -1060,13 +1111,18 @@ void MemCopy64(char* dst, const void* src, size_t size) {
data = _mm256_lddqu_si256(static_cast<const __m256i *>(src) + 1);
_mm256_storeu_si256(reinterpret_cast<__m256i *>(dst) + 1, data);
}
#elif defined(__aarch64__)
// Empirically it is faster to just copy all 64 rather than branching.
(void)kShortMemCopy;
(void)size;
FixedSizeMemMove<64>(dst, src);
#else
std::memmove(dst, src, kShortMemCopy);
FixedSizeMemMove<kShortMemCopy>(dst, src);
// Profiling shows that nearly all copies are short.
if (SNAPPY_PREDICT_FALSE(size > kShortMemCopy)) {
std::memmove(dst + kShortMemCopy,
static_cast<const uint8_t*>(src) + kShortMemCopy,
64 - kShortMemCopy);
FixedSizeMemMove<kShortMemCopy>(
dst + kShortMemCopy,
static_cast<const uint8_t*>(src) + kShortMemCopy);
}
#endif
}
Expand Down Expand Up @@ -1102,14 +1158,9 @@ inline size_t AdvanceToNextTagARMOptimized(const uint8_t** ip_p, size_t* tag) {
// instruction (csinc) and it removes several register moves.
const size_t tag_type = *tag & 3;
const bool is_literal = (tag_type == 0);
if (is_literal) {
size_t next_literal_tag = (*tag >> 2) + 1;
*tag = ip[next_literal_tag];
ip += next_literal_tag + 1;
} else {
*tag = ip[tag_type];
ip += tag_type + 1;
}
const size_t next_tag = is_literal ? (*tag >> 2) + 1 : tag_type;
*tag = ip[next_tag];
ip += (next_tag) + 1;
return tag_type;
}

Expand Down Expand Up @@ -2013,7 +2064,7 @@ class SnappyArrayWriter {
*op_p = IncrementalCopy(op - offset, op, op_end, op_limit_);
return true;
}
std::memmove(op, op - offset, kSlopBytes);
FixedSizeMemMove<kSlopBytes>(op, op - offset);
*op_p = op_end;
return true;
}
Expand Down Expand Up @@ -2265,7 +2316,7 @@ class SnappyScatteredWriter {
}
// Fast path
char* const op_end = op + len;
std::memmove(op, op - offset, kSlopBytes);
FixedSizeMemMove<kSlopBytes>(op, op - offset);
*op_p = op_end;
return true;
}
Expand Down

0 comments on commit 5ec5d16

Please sign in to comment.