Bug 1301742 - Improve UTF-16 to UTF-8 conversion speed. r=froydnj

This adds an SSE2 optimized path for UTF-16 strings that can be trivially
converted to UTF-8 (they contain no characters > 0X7F). It has a
speedup for the case where many of the leading characters are < 0X80 and
should have the same performance in the worst-case scenario where the first
character is > 0X7F.
This commit is contained in:
Eric Rahm
2016-09-15 15:46:29 -07:00
parent 3ffd3ed130
commit bed8f9f769
5 changed files with 314 additions and 8 deletions

View File

@@ -5,14 +5,79 @@
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "nsReadableUtils.h"
#include "nsReadableUtilsImpl.h"
#include <algorithm>
#include "mozilla/CheckedInt.h"
#include "nscore.h"
#include "nsMemory.h"
#include "nsString.h"
#include "nsTArray.h"
#include "nsUTF8Utils.h"
using mozilla::IsASCII;
/**
* Fallback implementation for finding the first non-ASCII character in a
* UTF-16 string.
*/
static inline int32_t
FirstNonASCIIUnvectorized(const char16_t* aBegin, const char16_t* aEnd)
{
typedef mozilla::NonASCIIParameters<sizeof(size_t)> p;
const size_t kMask = p::mask();
const uintptr_t kAlignMask = p::alignMask();
const size_t kNumUnicharsPerWord = p::numUnicharsPerWord();
const char16_t* idx = aBegin;
// Align ourselves to a word boundary.
for (; idx != aEnd && ((uintptr_t(idx) & kAlignMask) != 0); idx++) {
if (!IsASCII(*idx)) {
return idx - aBegin;
}
}
// Check one word at a time.
const char16_t* wordWalkEnd = mozilla::aligned(aEnd, kAlignMask);
for (; idx != wordWalkEnd; idx += kNumUnicharsPerWord) {
const size_t word = *reinterpret_cast<const size_t*>(idx);
if (word & kMask) {
return idx - aBegin;
}
}
// Take care of the remainder one character at a time.
for (; idx != aEnd; idx++) {
if (!IsASCII(*idx)) {
return idx - aBegin;
}
}
return -1;
}
/*
* This function returns -1 if all characters in str are ASCII characters.
* Otherwise, it returns a value less than or equal to the index of the first
* ASCII character in str. For example, if first non-ASCII character is at
* position 25, it may return 25, 24, or 16. But it guarantees
* there are only ASCII characters before returned value.
*/
static inline int32_t
FirstNonASCII(const char16_t* aBegin, const char16_t* aEnd)
{
#ifdef MOZILLA_MAY_SUPPORT_SSE2
if (mozilla::supports_sse2()) {
return mozilla::SSE2::FirstNonASCII(aBegin, aEnd);
}
#endif
return FirstNonASCIIUnvectorized(aBegin, aEnd);
}
void
LossyCopyUTF16toASCII(const nsAString& aSource, nsACString& aDest)
{
@@ -180,16 +245,46 @@ bool
AppendUTF16toUTF8(const nsAString& aSource, nsACString& aDest,
const mozilla::fallible_t& aFallible)
{
// At 16 characters analysis showed better performance of both the all ASCII
// and non-ASCII cases, so we limit calling |FirstNonASCII| to strings of
// that length.
const nsAString::size_type kFastPathMinLength = 16;
int32_t firstNonASCII = 0;
if (aSource.Length() >= kFastPathMinLength) {
firstNonASCII = FirstNonASCII(aSource.BeginReading(), aSource.EndReading());
}
if (firstNonASCII == -1) {
// This is all ASCII, we can use the more efficient lossy append.
mozilla::CheckedInt<nsACString::size_type> new_length(aSource.Length());
new_length += aDest.Length();
if (!new_length.isValid() ||
!aDest.SetCapacity(new_length.value(), aFallible)) {
return false;
}
LossyAppendUTF16toASCII(aSource, aDest);
return true;
}
nsAString::const_iterator source_start, source_end;
CalculateUTF8Size calculator;
copy_string(aSource.BeginReading(source_start),
aSource.EndReading(source_end), calculator);
aSource.BeginReading(source_start);
aSource.EndReading(source_end);
size_t count = calculator.Size();
// Skip the characters that we know are single byte.
source_start.advance(firstNonASCII);
copy_string(source_start,
source_end, calculator);
// Include the ASCII characters that were skipped in the count.
size_t count = calculator.Size() + firstNonASCII;
if (count) {
auto old_dest_length = aDest.Length();
// Grow the buffer if we need to.
mozilla::CheckedInt<nsACString::size_type> new_length(count);
new_length += old_dest_length;
@@ -201,11 +296,30 @@ AppendUTF16toUTF8(const nsAString& aSource, nsACString& aDest,
// All ready? Time to convert
ConvertUTF16toUTF8 converter(aDest.BeginWriting() + old_dest_length);
copy_string(aSource.BeginReading(source_start),
nsAString::const_iterator ascii_end;
aSource.BeginReading(ascii_end);
if (firstNonASCII >= static_cast<int32_t>(kFastPathMinLength)) {
// Use the more efficient lossy converter for the ASCII portion.
LossyConvertEncoding16to8 lossy_converter(
aDest.BeginWriting() + old_dest_length);
nsAString::const_iterator ascii_start;
aSource.BeginReading(ascii_start);
ascii_end.advance(firstNonASCII);
copy_string(ascii_start, ascii_end, lossy_converter);
} else {
// Not using the lossy shortcut, we need to include the leading ASCII
// chars.
firstNonASCII = 0;
}
ConvertUTF16toUTF8 converter(
aDest.BeginWriting() + old_dest_length + firstNonASCII);
copy_string(ascii_end,
aSource.EndReading(source_end), converter);
NS_ASSERTION(converter.Size() == count,
NS_ASSERTION(converter.Size() == count - firstNonASCII,
"Unexpected disparity between CalculateUTF8Size and "
"ConvertUTF16toUTF8");
}