Respect Scintilla codepage when doing MB-WC conversions

pnedev · pnedev · commit 73b47941e760 · 2022-07-28T18:36:24.000+03:00
diff --git a/src/Engine/Engine.cpp b/src/Engine/Engine.cpp
@@ -388,18 +388,19 @@ inline uint64_t lineRangeHash(uint64_t hashSeed, std::vector<wchar_t>& line, int
 }
 
 
-uint64_t regexIgnoreLineHash(uint64_t hashSeed, const std::vector<char>& line, const CompareOptions& options)
+uint64_t regexIgnoreLineHash(uint64_t hashSeed, int codepage, const std::vector<char>& line,
+	const CompareOptions& options)
 {
-	const intptr_t len = static_cast<intptr_t>(line.size());
+	const int len = static_cast<int>(line.size());
 
 	if (len == 0)
 		return hashSeed;
 
-	const int wLen = ::MultiByteToWideChar(CP_UTF8, 0, line.data(), static_cast<int>(len), NULL, 0);
+	const int wLen = ::MultiByteToWideChar(codepage, 0, line.data(), len, NULL, 0);
 
 	std::vector<wchar_t> wLine(wLen);
 
-	::MultiByteToWideChar(CP_UTF8, 0, line.data(), static_cast<int>(len), wLine.data(), wLen);
+	::MultiByteToWideChar(codepage, 0, line.data(), len, wLine.data(), wLen);
 
 #ifndef MULTITHREAD
 	LOGD(LOG_ALGO, "line len " + std::to_string(len) + " to wide char len " + std::to_string(wLen) + "\n");
@@ -459,6 +460,8 @@ void getLines(DocCmpInfo& doc, const CompareOptions& options)
 			return;
 		}
 
+		const int codepage			= getCodepage(doc.view);
+
 		const intptr_t docLine		= secLine + doc.section.off;
 		const intptr_t lineStart	= getLineStart(doc.view, docLine);
 		const intptr_t lineEnd		= getLineEnd(doc.view, docLine);
@@ -478,12 +481,12 @@ void getLines(DocCmpInfo& doc, const CompareOptions& options)
 						", view " + std::to_string(doc.view) + "\n");
 #endif
 
-				newLine.hash = regexIgnoreLineHash(newLine.hash, line, options);
+				newLine.hash = regexIgnoreLineHash(newLine.hash, codepage, line, options);
 			}
 			else
 			{
 				if (options.ignoreCase)
-					toLowerCase(line);
+					toLowerCase(line, codepage);
 
 				for (intptr_t i = 0; i < lineEnd - lineStart; ++i)
 				{
@@ -513,19 +516,19 @@ charType getCharTypeW(wchar_t letter)
 }
 
 
-inline void recalculateWordPos(std::vector<Word>& words, const std::vector<wchar_t>& line)
+inline void recalculateWordPos(int codepage, std::vector<Word>& words, const std::vector<wchar_t>& line)
 {
 	intptr_t bytePos = 0;
 	intptr_t currPos = 0;
 
 	for (auto& word : words)
 	{
 		if (currPos < word.pos)
-			bytePos += ::WideCharToMultiByte(CP_UTF8, 0, line.data() + currPos, static_cast<int>(word.pos - currPos),
+			bytePos += ::WideCharToMultiByte(codepage, 0, line.data() + currPos, static_cast<int>(word.pos - currPos),
 					NULL, 0, NULL, NULL);
 
 		currPos = word.pos + word.len;
-		word.len = ::WideCharToMultiByte(CP_UTF8, 0, line.data() + word.pos, static_cast<int>(word.len),
+		word.len = ::WideCharToMultiByte(codepage, 0, line.data() + word.pos, static_cast<int>(word.len),
 				NULL, 0, NULL, NULL);
 		word.pos = bytePos;
 		bytePos += word.len;
@@ -616,6 +619,8 @@ std::vector<Word> getLineWords(int view, intptr_t docLine, const CompareOptions&
 {
 	std::vector<Word> words;
 
+	const int codepage			= getCodepage(view);
+
 	const intptr_t lineStart	= getLineStart(view, docLine);
 	const intptr_t lineEnd		= getLineEnd(view, docLine);
 
@@ -625,11 +630,11 @@ std::vector<Word> getLineWords(int view, intptr_t docLine, const CompareOptions&
 
 		const int len = static_cast<int>(line.size());
 
-		const int wLen = ::MultiByteToWideChar(CP_UTF8, 0, line.data(), len, NULL, 0);
+		const int wLen = ::MultiByteToWideChar(codepage, 0, line.data(), len, NULL, 0);
 
 		std::vector<wchar_t> wLine(wLen);
 
-		::MultiByteToWideChar(CP_UTF8, 0, line.data(), len, wLine.data(), wLen);
+		::MultiByteToWideChar(codepage, 0, line.data(), len, wLine.data(), wLen);
 
 		if (options.ignoreRegex)
 			words = getRegexIgnoreLineWords(wLine, options);
@@ -638,26 +643,26 @@ std::vector<Word> getLineWords(int view, intptr_t docLine, const CompareOptions&
 
 		// In case of UTF-16 or UTF-32 find words byte positions and lengths because Scintilla uses those
 		if (wLen != len)
-			recalculateWordPos(words, wLine);
+			recalculateWordPos(codepage, words, wLine);
 	}
 
 	return words;
 }
 
 
-inline void recalculateCharPos(std::vector<Char>& chars, const std::vector<wchar_t>& sec)
+inline void recalculateCharPos(int codepage, std::vector<Char>& chars, const std::vector<wchar_t>& sec)
 {
 	intptr_t bytePos = 0;
 	intptr_t currPos = 0;
 
 	for (auto& ch : chars)
 	{
 		if (currPos < ch.pos)
-			bytePos += ::WideCharToMultiByte(CP_UTF8, 0, sec.data() + currPos, static_cast<int>(ch.pos - currPos),
+			bytePos += ::WideCharToMultiByte(codepage, 0, sec.data() + currPos, static_cast<int>(ch.pos - currPos),
 					NULL, 0, NULL, NULL);
 
 		currPos = ch.pos + 1;
-		const int charLen = ::WideCharToMultiByte(CP_UTF8, 0, sec.data() + ch.pos, 1, NULL, 0, NULL, NULL);
+		const int charLen = ::WideCharToMultiByte(codepage, 0, sec.data() + ch.pos, 1, NULL, 0, NULL, NULL);
 		ch.pos = bytePos;
 		bytePos += charLen;
 	}
@@ -695,23 +700,25 @@ std::vector<Char> getSectionChars(int view, intptr_t secStart, intptr_t secEnd,
 
 	if (secStart < secEnd)
 	{
+		const int codepage = getCodepage(view);
+
 		std::vector<char> sec = getText(view, secStart, secEnd);
 
 		const int len = static_cast<int>(sec.size());
 
-		const int wLen = ::MultiByteToWideChar(CP_UTF8, 0, sec.data(), len, NULL, 0);
+		const int wLen = ::MultiByteToWideChar(codepage, 0, sec.data(), len, NULL, 0);
 
 		std::vector<wchar_t> wSec(wLen);
 
-		::MultiByteToWideChar(CP_UTF8, 0, sec.data(), len, wSec.data(), wLen);
+		::MultiByteToWideChar(codepage, 0, sec.data(), len, wSec.data(), wLen);
 
 		chars.reserve(wLen - 1);
 
 		getSectionRangeChars(chars, wSec, 0, wLen - 1, options);
 
 		// In case of UTF-16 or UTF-32 find chars byte positions because Scintilla uses those
 		if (wLen != len)
-			recalculateCharPos(chars, wSec);
+			recalculateCharPos(codepage, chars, wSec);
 	}
 
 	return chars;
@@ -722,17 +729,19 @@ std::vector<Char> getRegexIgnoreChars(int view, intptr_t secStart, intptr_t secE
 {
 	std::vector<Char> chars;
 
+	const int codepage = getCodepage(view);
+
 	if (secStart < secEnd)
 	{
 		std::vector<char> sec = getText(view, secStart, secEnd);
 
 		const int len = static_cast<int>(sec.size());
 
-		const int wLen = ::MultiByteToWideChar(CP_UTF8, 0, sec.data(), len, NULL, 0);
+		const int wLen = ::MultiByteToWideChar(codepage, 0, sec.data(), len, NULL, 0);
 
 		std::vector<wchar_t> wSec(wLen);
 
-		::MultiByteToWideChar(CP_UTF8, 0, sec.data(), len, wSec.data(), wLen);
+		::MultiByteToWideChar(codepage, 0, sec.data(), len, wSec.data(), wLen);
 
 		chars.reserve(wLen - 1);
 
@@ -753,7 +762,7 @@ std::vector<Char> getRegexIgnoreChars(int view, intptr_t secStart, intptr_t secE
 
 		// In case of UTF-16 or UTF-32 find chars byte positions because Scintilla uses those
 		if (wLen != len)
-			recalculateCharPos(chars, wSec);
+			recalculateCharPos(codepage, chars, wSec);
 	}
 
 	return chars;
diff --git a/src/NppHelpers.cpp b/src/NppHelpers.cpp
@@ -535,24 +535,24 @@ std::vector<char> getText(int view, intptr_t startPos, intptr_t endPos)
 }
 
 
-void toLowerCase(std::vector<char>& text)
+void toLowerCase(std::vector<char>& text, int codepage)
 {
 	const int len = static_cast<int>(text.size());
 
 	if (len == 0)
 		return;
 
-	const int wLen = ::MultiByteToWideChar(CP_UTF8, 0, text.data(), len, NULL, 0);
+	const int wLen = ::MultiByteToWideChar(codepage, 0, text.data(), len, NULL, 0);
 
 	std::vector<wchar_t> wText(wLen);
 
-	::MultiByteToWideChar(CP_UTF8, 0, text.data(), len, wText.data(), wLen);
+	::MultiByteToWideChar(codepage, 0, text.data(), len, wText.data(), wLen);
 
 	wText.push_back(L'\0');
 	::CharLowerW((LPWSTR)wText.data());
 	wText.pop_back();
 
-	::WideCharToMultiByte(CP_UTF8, 0, wText.data(), wLen, text.data(), len, NULL, NULL);
+	::WideCharToMultiByte(codepage, 0, wText.data(), wLen, text.data(), len, NULL, NULL);
 }
 
 
diff --git a/src/NppHelpers.h b/src/NppHelpers.h
@@ -421,6 +421,18 @@ inline int getEncoding(LRESULT buffId)
 }
 
 
+inline int getCodepage(int view)
+{
+	return (int)CallScintilla(view, SCI_GETCODEPAGE, 0, 0);
+}
+
+
+inline int getCharacterSet(int view, int style = STYLE_DEFAULT)
+{
+	return (int)CallScintilla(view, SCI_STYLEGETCHARACTERSET, style, 0);
+}
+
+
 inline intptr_t getDocId(int view)
 {
 	return CallScintilla(view, SCI_GETDOCPOINTER, 0, 0);
@@ -653,7 +665,7 @@ inline void clearAnnotation(int view, intptr_t line)
 void clearAnnotations(int view, intptr_t startLine, intptr_t length);
 
 std::vector<char> getText(int view, intptr_t startPos, intptr_t endPos);
-void toLowerCase(std::vector<char>& text);
+void toLowerCase(std::vector<char>& text, int codepage = CP_UTF8);
 
 void addBlankSection(int view, intptr_t line, intptr_t length, intptr_t selectionMarkPosition = 0,
 		const char *text = nullptr);

Original file line number	Diff line number	Diff line change
`@@ -535,24 +535,24 @@ std::vector<char> getText(int view, intptr_t startPos, intptr_t endPos)`
`535`	`535`	`}`
`536`	`536`
`537`	`537`
`538`		`-void toLowerCase(std::vector<char>& text)`
	`538`	`+void toLowerCase(std::vector<char>& text, int codepage)`
`539`	`539`	`{`
`540`	`540`	`const int len = static_cast<int>(text.size());`
`541`	`541`
`542`	`542`	`if (len == 0)`
`543`	`543`	`return;`
`544`	`544`
`545`		`- const int wLen = ::MultiByteToWideChar(CP_UTF8, 0, text.data(), len, NULL, 0);`
	`545`	`+ const int wLen = ::MultiByteToWideChar(codepage, 0, text.data(), len, NULL, 0);`
`546`	`546`
`547`	`547`	`std::vector<wchar_t> wText(wLen);`
`548`	`548`
`549`		`- ::MultiByteToWideChar(CP_UTF8, 0, text.data(), len, wText.data(), wLen);`
	`549`	`+ ::MultiByteToWideChar(codepage, 0, text.data(), len, wText.data(), wLen);`
`550`	`550`
`551`	`551`	`wText.push_back(L'\0');`
`552`	`552`	`::CharLowerW((LPWSTR)wText.data());`
`553`	`553`	`wText.pop_back();`
`554`	`554`
`555`		`- ::WideCharToMultiByte(CP_UTF8, 0, wText.data(), wLen, text.data(), len, NULL, NULL);`
	`555`	`+ ::WideCharToMultiByte(codepage, 0, wText.data(), wLen, text.data(), len, NULL, NULL);`
`556`	`556`	`}`
`557`	`557`
`558`	`558`