From ae87bea35cc11de1cae34f27e5b2df6bed8ccebf Mon Sep 17 00:00:00 2001 From: "METANEOCORTEX\\Kotti" Date: Thu, 12 Mar 2026 01:49:14 +0100 Subject: [PATCH 1/2] fix: encoding detection bug --- src/EncodingDetection.cpp | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/src/EncodingDetection.cpp b/src/EncodingDetection.cpp index c59929b18..18e35e3d7 100644 --- a/src/EncodingDetection.cpp +++ b/src/EncodingDetection.cpp @@ -1233,7 +1233,7 @@ extern "C" ENC_DET_T Encoding_DetectEncoding(const HPATHL hpath, const char* lpD Encoding_AnalyzeText(lpData, cbNbytes4Analysis, &encDetRes, iAnalyzeHint); // --------------------------------------------------------------------------- } - encDetRes.bPureASCII7Bit = (encDetRes.analyzedEncoding == CPI_ASCII_7BIT) || IsPureAscii7Bit(lpData, cbData); + encDetRes.bPureASCII7Bit = IsPureAscii7Bit(lpData, cbData); if (encDetRes.analyzedEncoding == CPI_NONE) { encDetRes.analyzedEncoding = iAnalyzeHint; @@ -1297,22 +1297,17 @@ extern "C" ENC_DET_T Encoding_DetectEncoding(const HPATHL hpath, const char* lpD } else if (Encoding_IsValid(encDetRes.analyzedEncoding) && (encDetRes.bIsAnalysisReliable || !Settings.UseReliableCEDonly)) { - if (!encDetRes.bIsAnalysisReliable && !Encoding_IsUTF8(encDetRes.analyzedEncoding) && encDetRes.bValidUTF8) { - encDetRes.Encoding = CPI_UTF8; // unreliable non-UTF-8 guess, but data is valid UTF-8 - } else { - encDetRes.Encoding = encDetRes.analyzedEncoding; - } + encDetRes.Encoding = (encDetRes.analyzedEncoding == CPI_ASCII_7BIT) ? CPI_UTF8 : encDetRes.analyzedEncoding; } - else if (!encDetRes.bIsAnalysisReliable && (Encoding_IsValid(encDetRes.analyzedEncoding) || encDetRes.bPureASCII7Bit)) - { - // UCHARDET below confidence threshold (UseReliableCEDonly is true) - encDetRes.Encoding = encDetRes.bValidUTF8 ? CPI_UTF8 : CPI_ANSI_DEFAULT; - } - else if (Encoding_IsUNICODE(encDetRes.unicodeAnalysis) && (iConfidence > 50)) + else if (Encoding_IsUNICODE(encDetRes.unicodeAnalysis)) { // unicodeAnalysis (IsTextUnicode) confirms Unicode structure, // iConfidence is from UCHARDET analysis — use analyzedEncoding (intentional) - encDetRes.Encoding = encDetRes.analyzedEncoding; + encDetRes.Encoding = Encoding_IsValid(encDetRes.analyzedEncoding) ? encDetRes.analyzedEncoding : encDetRes.unicodeAnalysis; + } + else if (encDetRes.bPureASCII7Bit) { + // UCHARDET below confidence threshold (UseReliableCEDonly is true) + encDetRes.Encoding = encDetRes.bValidUTF8 ? CPI_UTF8 : CPI_ANSI_DEFAULT; } else if (Encoding_IsValid(Encoding_SrcWeak(CPI_GET))) { @@ -1323,6 +1318,7 @@ extern "C" ENC_DET_T Encoding_DetectEncoding(const HPATHL hpath, const char* lpD encDetRes.Encoding = iAnalyzeHint; } + // final check if (!Encoding_IsValid(encDetRes.Encoding)) { encDetRes.Encoding = CPI_PREFERRED_ENCODING; } From 8d5a289dadcc4f1053667a37739058708f83fa15 Mon Sep 17 00:00:00 2001 From: "METANEOCORTEX\\Kotti" Date: Thu, 12 Mar 2026 02:18:22 +0100 Subject: [PATCH 2/2] fix: revert ASCII as UTF-8 for unicode detection - false positives --- src/EncodingDetection.cpp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/EncodingDetection.cpp b/src/EncodingDetection.cpp index 18e35e3d7..460a50f04 100644 --- a/src/EncodingDetection.cpp +++ b/src/EncodingDetection.cpp @@ -1303,9 +1303,17 @@ extern "C" ENC_DET_T Encoding_DetectEncoding(const HPATHL hpath, const char* lpD { // unicodeAnalysis (IsTextUnicode) confirms Unicode structure, // iConfidence is from UCHARDET analysis — use analyzedEncoding (intentional) - encDetRes.Encoding = Encoding_IsValid(encDetRes.analyzedEncoding) ? encDetRes.analyzedEncoding : encDetRes.unicodeAnalysis; + if (Encoding_IsValid(encDetRes.analyzedEncoding)) { + encDetRes.Encoding = encDetRes.analyzedEncoding; + } + //~else if ((encDetRes.analyzedEncoding == CPI_ASCII_7BIT) && encDetRes.bValidUTF8) { + //~ encDetRes.Encoding = CPI_UTF8; + //~} + else { + encDetRes.Encoding = encDetRes.unicodeAnalysis; + } } - else if (encDetRes.bPureASCII7Bit) { + else if (encDetRes.bPureASCII7Bit || (encDetRes.analyzedEncoding == CPI_ASCII_7BIT)) { // UCHARDET below confidence threshold (UseReliableCEDonly is true) encDetRes.Encoding = encDetRes.bValidUTF8 ? CPI_UTF8 : CPI_ANSI_DEFAULT; }