├── .git-blame-ignore-revs
├── .gitattributes
├── .gitignore
├── .gitlab-ci.yml
├── .kde-ci.yml
├── CMakeLists.txt
├── KF6CodecsConfig.cmake.in
├── LICENSES
    ├── BSD-3-Clause.txt
    ├── CC0-1.0.txt
    ├── GPL-2.0-or-later.txt
    ├── LGPL-2.0-only.txt
    ├── LGPL-2.0-or-later.txt
    ├── LGPL-2.1-or-later.txt
    ├── MIT.txt
    └── MPL-1.1.txt
├── README.md
├── autotests
    ├── CMakeLists.txt
    ├── base45test.cpp
    ├── base64benchmark.cpp
    ├── codectest.cpp
    ├── codectest.h
    ├── data
    │   ├── binary_data
    │   ├── codec_b
    │   │   ├── basic-decode.b
    │   │   ├── basic-decode.b.expected
    │   │   ├── basic-encode
    │   │   ├── basic-encode.expected
    │   │   ├── null-decode.b
    │   │   ├── null-decode.b.expected
    │   │   ├── null-encode
    │   │   ├── null-encode.expected
    │   │   ├── padding0-encode
    │   │   ├── padding0-encode.expected
    │   │   ├── padding1-encode
    │   │   ├── padding1-encode.expected
    │   │   ├── padding2-encode
    │   │   └── padding2-encode.expected
    │   ├── codec_base64
    │   │   ├── basic-decode.base64
    │   │   ├── basic-decode.base64.expected
    │   │   ├── basic-encode
    │   │   ├── basic-encode.expected
    │   │   ├── corrupt.base64
    │   │   ├── corrupt.base64.expected
    │   │   ├── very_small-encode
    │   │   └── very_small-encode.expected
    │   ├── codec_q
    │   │   ├── all-encoded-decode.q
    │   │   ├── all-encoded-decode.q.expected
    │   │   ├── basic-encode
    │   │   ├── basic-encode.expected
    │   │   ├── null-decode.q
    │   │   ├── null-decode.q.expected
    │   │   ├── null-encode
    │   │   └── null-encode.expected
    │   ├── codec_quoted-printable
    │   │   ├── basic-decode.quoted-printable
    │   │   ├── basic-decode.quoted-printable.expected
    │   │   ├── basic-encode
    │   │   ├── basic-encode.expected
    │   │   ├── corrupt.quoted-printable
    │   │   ├── corrupt.quoted-printable.expected
    │   │   ├── corrupt2.quoted-printable
    │   │   ├── corrupt2.quoted-printable.expected
    │   │   ├── corrupt3.quoted-printable
    │   │   ├── corrupt3.quoted-printable.expected
    │   │   ├── corrupt4.quoted-printable
    │   │   ├── corrupt4.quoted-printable.expected
    │   │   ├── wrap-encode
    │   │   └── wrap-encode.expected
    │   ├── codec_x-kmime-rfc2231
    │   │   ├── all-encoded.x-kmime-rfc2231-decode
    │   │   ├── all-encoded.x-kmime-rfc2231-decode.expected
    │   │   ├── basic-encode
    │   │   ├── basic-encode.expected
    │   │   ├── null-decode.x-kmime-rfc2231
    │   │   ├── null-decode.x-kmime-rfc2231.expected
    │   │   ├── null-encode
    │   │   └── null-encode.expected
    │   └── codec_x-uuencode
    │   │   ├── basic-decode.x-uuencode
    │   │   └── basic-decode.x-uuencode.expected
    ├── kcharsetstest.cpp
    ├── kcharsetstest.h
    ├── kemailaddresstest.cpp
    ├── kemailaddresstest.h
    ├── kencodingprobertest.cpp
    ├── kencodingprobertest.h
    ├── rfc2047test.cpp
    └── rfc2047test.h
├── docs
    └── Doxyfile.local
├── metainfo.yaml
├── poqm
    ├── af
    │   └── kcodecs6_qt.po
    ├── ar
    │   └── kcodecs6_qt.po
    ├── as
    │   └── kcodecs6_qt.po
    ├── ast
    │   └── kcodecs6_qt.po
    ├── az
    │   └── kcodecs6_qt.po
    ├── be
    │   └── kcodecs6_qt.po
    ├── be@latin
    │   └── kcodecs6_qt.po
    ├── bg
    │   └── kcodecs6_qt.po
    ├── bn
    │   └── kcodecs6_qt.po
    ├── bn_IN
    │   └── kcodecs6_qt.po
    ├── br
    │   └── kcodecs6_qt.po
    ├── bs
    │   └── kcodecs6_qt.po
    ├── ca
    │   └── kcodecs6_qt.po
    ├── ca@valencia
    │   └── kcodecs6_qt.po
    ├── crh
    │   └── kcodecs6_qt.po
    ├── cs
    │   └── kcodecs6_qt.po
    ├── csb
    │   └── kcodecs6_qt.po
    ├── cy
    │   └── kcodecs6_qt.po
    ├── da
    │   └── kcodecs6_qt.po
    ├── de
    │   └── kcodecs6_qt.po
    ├── el
    │   └── kcodecs6_qt.po
    ├── en_GB
    │   └── kcodecs6_qt.po
    ├── eo
    │   └── kcodecs6_qt.po
    ├── es
    │   └── kcodecs6_qt.po
    ├── et
    │   └── kcodecs6_qt.po
    ├── eu
    │   └── kcodecs6_qt.po
    ├── fa
    │   └── kcodecs6_qt.po
    ├── fi
    │   └── kcodecs6_qt.po
    ├── fr
    │   └── kcodecs6_qt.po
    ├── fy
    │   └── kcodecs6_qt.po
    ├── ga
    │   └── kcodecs6_qt.po
    ├── gd
    │   └── kcodecs6_qt.po
    ├── gl
    │   └── kcodecs6_qt.po
    ├── gu
    │   └── kcodecs6_qt.po
    ├── ha
    │   └── kcodecs6_qt.po
    ├── he
    │   └── kcodecs6_qt.po
    ├── hi
    │   └── kcodecs6_qt.po
    ├── hne
    │   └── kcodecs6_qt.po
    ├── hr
    │   └── kcodecs6_qt.po
    ├── hsb
    │   └── kcodecs6_qt.po
    ├── hu
    │   └── kcodecs6_qt.po
    ├── hy
    │   └── kcodecs6_qt.po
    ├── ia
    │   └── kcodecs6_qt.po
    ├── id
    │   └── kcodecs6_qt.po
    ├── is
    │   └── kcodecs6_qt.po
    ├── it
    │   └── kcodecs6_qt.po
    ├── ja
    │   └── kcodecs6_qt.po
    ├── ka
    │   └── kcodecs6_qt.po
    ├── kk
    │   └── kcodecs6_qt.po
    ├── km
    │   └── kcodecs6_qt.po
    ├── kn
    │   └── kcodecs6_qt.po
    ├── ko
    │   └── kcodecs6_qt.po
    ├── ku
    │   └── kcodecs6_qt.po
    ├── lb
    │   └── kcodecs6_qt.po
    ├── lt
    │   └── kcodecs6_qt.po
    ├── lv
    │   └── kcodecs6_qt.po
    ├── mai
    │   └── kcodecs6_qt.po
    ├── mk
    │   └── kcodecs6_qt.po
    ├── ml
    │   └── kcodecs6_qt.po
    ├── mr
    │   └── kcodecs6_qt.po
    ├── ms
    │   └── kcodecs6_qt.po
    ├── nb
    │   └── kcodecs6_qt.po
    ├── nds
    │   └── kcodecs6_qt.po
    ├── ne
    │   └── kcodecs6_qt.po
    ├── nl
    │   └── kcodecs6_qt.po
    ├── nn
    │   └── kcodecs6_qt.po
    ├── oc
    │   └── kcodecs6_qt.po
    ├── or
    │   └── kcodecs6_qt.po
    ├── pa
    │   └── kcodecs6_qt.po
    ├── pl
    │   └── kcodecs6_qt.po
    ├── ps
    │   └── kcodecs6_qt.po
    ├── pt
    │   └── kcodecs6_qt.po
    ├── pt_BR
    │   └── kcodecs6_qt.po
    ├── ro
    │   └── kcodecs6_qt.po
    ├── ru
    │   └── kcodecs6_qt.po
    ├── sa
    │   └── kcodecs6_qt.po
    ├── se
    │   └── kcodecs6_qt.po
    ├── si
    │   └── kcodecs6_qt.po
    ├── sk
    │   └── kcodecs6_qt.po
    ├── sl
    │   └── kcodecs6_qt.po
    ├── sq
    │   └── kcodecs6_qt.po
    ├── sr
    │   └── kcodecs6_qt.po
    ├── sr@ijekavian
    │   └── kcodecs6_qt.po
    ├── sr@ijekavianlatin
    │   └── kcodecs6_qt.po
    ├── sr@latin
    │   └── kcodecs6_qt.po
    ├── sv
    │   └── kcodecs6_qt.po
    ├── ta
    │   └── kcodecs6_qt.po
    ├── te
    │   └── kcodecs6_qt.po
    ├── tg
    │   └── kcodecs6_qt.po
    ├── th
    │   └── kcodecs6_qt.po
    ├── tr
    │   └── kcodecs6_qt.po
    ├── tt
    │   └── kcodecs6_qt.po
    ├── ug
    │   └── kcodecs6_qt.po
    ├── uk
    │   └── kcodecs6_qt.po
    ├── uz
    │   └── kcodecs6_qt.po
    ├── uz@cyrillic
    │   └── kcodecs6_qt.po
    ├── vi
    │   └── kcodecs6_qt.po
    ├── wa
    │   └── kcodecs6_qt.po
    ├── xh
    │   └── kcodecs6_qt.po
    ├── zh_CN
    │   └── kcodecs6_qt.po
    ├── zh_HK
    │   └── kcodecs6_qt.po
    └── zh_TW
    │   └── kcodecs6_qt.po
└── src
    ├── CMakeLists.txt
    ├── Messages.sh
    ├── kcharsets.cpp
    ├── kcharsets.h
    ├── kcharsets_p.h
    ├── kcodecs-index.qdoc
    ├── kcodecs.cpp
    ├── kcodecs.h
    ├── kcodecs.qdoc
    ├── kcodecs.qdocconf
    ├── kcodecs_p.h
    ├── kcodecsbase45.cpp
    ├── kcodecsbase64.cpp
    ├── kcodecsbase64.h
    ├── kcodecsqp.cpp
    ├── kcodecsqp.h
    ├── kcodecsuuencode.cpp
    ├── kcodecsuuencode.h
    ├── kemailaddress.cpp
    ├── kemailaddress.h
    ├── kencodingprober.cpp
    ├── kencodingprober.h
    └── probers
        ├── CharDistribution.cpp
        ├── CharDistribution.h
        ├── ChineseGroupProber.cpp
        ├── ChineseGroupProber.h
        ├── JapaneseGroupProber.cpp
        ├── JapaneseGroupProber.h
        ├── JpCntx.cpp
        ├── JpCntx.h
        ├── LangBulgarianModel.cpp
        ├── LangCyrillicModel.cpp
        ├── LangGreekModel.cpp
        ├── LangHebrewModel.cpp
        ├── LangHungarianModel.cpp
        ├── LangThaiModel.cpp
        ├── UnicodeGroupProber.cpp
        ├── UnicodeGroupProber.h
        ├── nsBig5Prober.cpp
        ├── nsBig5Prober.h
        ├── nsCharSetProber.cpp
        ├── nsCharSetProber.h
        ├── nsCodingStateMachine.h
        ├── nsEUCJPProber.cpp
        ├── nsEUCJPProber.h
        ├── nsEUCKRProber.cpp
        ├── nsEUCKRProber.h
        ├── nsEscCharsetProber.cpp
        ├── nsEscCharsetProber.h
        ├── nsEscSM.cpp
        ├── nsGB2312Prober.cpp
        ├── nsGB2312Prober.h
        ├── nsHebrewProber.cpp
        ├── nsHebrewProber.h
        ├── nsLatin1Prober.cpp
        ├── nsLatin1Prober.h
        ├── nsMBCSGroupProber.cpp
        ├── nsMBCSGroupProber.h
        ├── nsMBCSSM.cpp
        ├── nsPkgInt.h
        ├── nsSBCSGroupProber.cpp
        ├── nsSBCSGroupProber.h
        ├── nsSBCharSetProber.cpp
        ├── nsSBCharSetProber.h
        ├── nsSJISProber.cpp
        ├── nsSJISProber.h
        ├── nsUniversalDetector.cpp
        ├── nsUniversalDetector.h
        └── tables
            ├── Big5Freq.tab
            ├── EUCKRFreq.tab
            ├── GB2312Freq.tab
            └── JISFreq.tab


/.git-blame-ignore-revs:
--------------------------------------------------------------------------------
1 | #clang-format/tidy
2 | 88d295069b046faa5846e320c2ed81805cdcd44c
3 | bf6a25dad4c25070bdb292a4adbf6058acd50ad1
4 | 439b22f0e561c7637b0df364b6dc4958e1aad617
5 | 


--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.expected -crlf
2 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Ignore the following files
 2 | *~
 3 | *.[oa]
 4 | *.diff
 5 | *.kate-swp
 6 | *.kdev4
 7 | .kdev_include_paths
 8 | *.kdevelop.pcs
 9 | *.moc
10 | *.moc.cpp
11 | *.orig
12 | *.user
13 | .*.swp
14 | .swp.*
15 | Doxyfile
16 | Makefile
17 | avail
18 | random_seed
19 | /build*/
20 | /.vscode/
21 | CMakeLists.txt.user*
22 | *.unc-backup*
23 | .cmake/
24 | /.clang-format
25 | /compile_commands.json
26 | .clangd
27 | .idea
28 | /cmake-build*
29 | .cache
30 | 


--------------------------------------------------------------------------------
/.gitlab-ci.yml:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: 2020 Volker Krause <vkrause@kde.org>
 2 | # SPDX-License-Identifier: CC0-1.0
 3 | 
 4 | include:
 5 |   - project: sysadmin/ci-utilities
 6 |     file:
 7 |       - /gitlab-templates/linux-qt6.yml
 8 |       - /gitlab-templates/linux-qt6-next.yml
 9 |       - /gitlab-templates/linux-qt6-static.yml
10 |       - /gitlab-templates/android-qt6.yml
11 |       - /gitlab-templates/freebsd-qt6.yml
12 |       - /gitlab-templates/windows-qt6.yml
13 |       - /gitlab-templates/alpine-qt6.yml
14 |       - /gitlab-templates/xml-lint.yml
15 |       - /gitlab-templates/yaml-lint.yml
16 | 


--------------------------------------------------------------------------------
/.kde-ci.yml:
--------------------------------------------------------------------------------
1 | Dependencies:
2 |  - 'on': ['@all']
3 |    'require':
4 |     'frameworks/extra-cmake-modules': '@same'
5 | 
6 | Options:
7 |  test-before-installing: True
8 |  require-passing-tests-on: ['Linux', 'FreeBSD']
9 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.16)
 2 | # Top-level build script for the KCodecs framework (installed as the KF6Codecs CMake package).
 3 | set(KF_VERSION "6.15.0") # handled by release scripts
 4 | project(KCodecs VERSION ${KF_VERSION})
 5 | 
 6 | include(FeatureSummary)
 7 | find_package(ECM 6.14.0  NO_MODULE)
 8 | set_package_properties(ECM PROPERTIES TYPE REQUIRED DESCRIPTION "Extra CMake Modules." URL "https://commits.kde.org/extra-cmake-modules")
 9 | feature_summary(WHAT REQUIRED_PACKAGES_NOT_FOUND FATAL_ON_MISSING_REQUIRED_PACKAGES) # stop here if ECM (marked REQUIRED above) was not found
10 | 
11 | # Let the include() calls below resolve the KDE*/ECM* modules shipped with ECM.
12 | set(CMAKE_MODULE_PATH ${ECM_MODULE_PATH})
13 | 
14 | include(KDEInstallDirs)
15 | include(KDEFrameworkCompilerSettings NO_POLICY_SCOPE)
16 | include(KDECMakeSettings)
17 | include(KDEGitCommitHooks)
18 | include(ECMQtDeclareLoggingCategory)
19 | include(ECMDeprecationSettings)
20 | 
21 | set(REQUIRED_QT_VERSION 6.7.0) # also substituted into KF6CodecsConfig.cmake.in
22 | find_package(Qt6Core ${REQUIRED_QT_VERSION} REQUIRED NO_MODULE)
23 | 
24 | include(ECMGenerateExportHeader)
25 | include(CMakePackageConfigHelpers)
26 | include(ECMSetupVersion)
27 | include(ECMGenerateHeaders)
28 | include(ECMGenerateQDoc)
29 | 
30 | include(ECMPoQmTools)
31 | 
32 | set(EXCLUDE_DEPRECATED_BEFORE_AND_AT 0 CACHE STRING "Control the range of deprecated API excluded from the build [default=0].")
33 | # Generate kcodecs_version.h and KF6CodecsConfigVersion.cmake from the project version.
34 | set(kcodecs_version_header "${CMAKE_CURRENT_BINARY_DIR}/src/kcodecs_version.h")
35 | ecm_setup_version(PROJECT VARIABLE_PREFIX KCODECS
36 |                         VERSION_HEADER "${kcodecs_version_header}"
37 |                         PACKAGE_VERSION_FILE "${CMAKE_CURRENT_BINARY_DIR}/KF6CodecsConfigVersion.cmake"
38 |                         SOVERSION 6)
39 | 
40 | ecm_install_po_files_as_qm(poqm)
41 | 
42 | ecm_set_disabled_deprecation_versions(
43 |     QT 6.9.0 # build without Qt API deprecated in 6.9.0 or earlier
44 | )
45 | 
46 | add_subdirectory(src)
47 | 
48 | if (BUILD_TESTING)
49 |     add_subdirectory(autotests)
50 | endif()
51 | 
52 | # create a Config.cmake and a ConfigVersion.cmake file and install them
53 | set(CMAKECONFIG_INSTALL_DIR "${KDE_INSTALL_CMAKEPACKAGEDIR}/KF6Codecs")
54 | 
55 | configure_package_config_file("${CMAKE_CURRENT_SOURCE_DIR}/KF6CodecsConfig.cmake.in"
56 |                               "${CMAKE_CURRENT_BINARY_DIR}/KF6CodecsConfig.cmake"
57 |                               INSTALL_DESTINATION  ${CMAKECONFIG_INSTALL_DIR}
58 |                               )
59 | 
60 | install(FILES  "${CMAKE_CURRENT_BINARY_DIR}/KF6CodecsConfig.cmake"
61 |                "${CMAKE_CURRENT_BINARY_DIR}/KF6CodecsConfigVersion.cmake"
62 |         DESTINATION "${CMAKECONFIG_INSTALL_DIR}"
63 |         COMPONENT Devel )
64 | 
65 | install(EXPORT KF6CodecsTargets DESTINATION "${CMAKECONFIG_INSTALL_DIR}" FILE KF6CodecsTargets.cmake NAMESPACE KF6:: )
66 | 
67 | install(FILES ${kcodecs_version_header}
68 |         DESTINATION ${KDE_INSTALL_INCLUDEDIR_KF}/KCodecs COMPONENT Devel)
69 | 
70 | include(ECMFeatureSummary)
71 | ecm_feature_summary(WHAT ALL FATAL_ON_MISSING_REQUIRED_PACKAGES)
72 | 
73 | kde_configure_git_pre_commit_hook(CHECKS CLANG_FORMAT) # pre-commit hook with the clang-format check (from KDEGitCommitHooks)
74 | 


--------------------------------------------------------------------------------
/KF6CodecsConfig.cmake.in:
--------------------------------------------------------------------------------
1 | @PACKAGE_INIT@
2 | # Projects using KF6Codecs need Qt6Core at the minimum version this build was configured with.
3 | include(CMakeFindDependencyMacro)
4 | find_dependency(Qt6Core @REQUIRED_QT_VERSION@)
5 | 
6 | include("${CMAKE_CURRENT_LIST_DIR}/KF6CodecsTargets.cmake")
7 | 


--------------------------------------------------------------------------------
/LICENSES/BSD-3-Clause.txt:
--------------------------------------------------------------------------------
 1 | Copyright (c) <year> <owner>. All rights reserved.
 2 | 
 3 | Redistribution and use in source and binary forms, with or without modification,
 4 | are permitted provided that the following conditions are met:
 5 | 
 6 | 1. Redistributions of source code must retain the above copyright notice,
 7 | this list of conditions and the following disclaimer.
 8 | 
 9 | 2. Redistributions in binary form must reproduce the above copyright notice,
10 | this list of conditions and the following disclaimer in the documentation
11 | and/or other materials provided with the distribution.
12 | 
13 | 3. Neither the name of the copyright holder nor the names of its contributors
14 | may be used to endorse or promote products derived from this software without
15 | specific prior written permission.
16 | 
17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
21 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
23 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
24 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
25 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
26 | USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 | 


--------------------------------------------------------------------------------
/LICENSES/CC0-1.0.txt:
--------------------------------------------------------------------------------
  1 | Creative Commons Legal Code
  2 | 
  3 | CC0 1.0 Universal
  4 | 
  5 |     CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
  6 |     LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
  7 |     ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
  8 |     INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
  9 |     REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
 10 |     PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
 11 |     THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
 12 |     HEREUNDER.
 13 | 
 14 | Statement of Purpose
 15 | 
 16 | The laws of most jurisdictions throughout the world automatically confer
 17 | exclusive Copyright and Related Rights (defined below) upon the creator
 18 | and subsequent owner(s) (each and all, an "owner") of an original work of
 19 | authorship and/or a database (each, a "Work").
 20 | 
 21 | Certain owners wish to permanently relinquish those rights to a Work for
 22 | the purpose of contributing to a commons of creative, cultural and
 23 | scientific works ("Commons") that the public can reliably and without fear
 24 | of later claims of infringement build upon, modify, incorporate in other
 25 | works, reuse and redistribute as freely as possible in any form whatsoever
 26 | and for any purposes, including without limitation commercial purposes.
 27 | These owners may contribute to the Commons to promote the ideal of a free
 28 | culture and the further production of creative, cultural and scientific
 29 | works, or to gain reputation or greater distribution for their Work in
 30 | part through the use and efforts of others.
 31 | 
 32 | For these and/or other purposes and motivations, and without any
 33 | expectation of additional consideration or compensation, the person
 34 | associating CC0 with a Work (the "Affirmer"), to the extent that he or she
 35 | is an owner of Copyright and Related Rights in the Work, voluntarily
 36 | elects to apply CC0 to the Work and publicly distribute the Work under its
 37 | terms, with knowledge of his or her Copyright and Related Rights in the
 38 | Work and the meaning and intended legal effect of CC0 on those rights.
 39 | 
 40 | 1. Copyright and Related Rights. A Work made available under CC0 may be
 41 | protected by copyright and related or neighboring rights ("Copyright and
 42 | Related Rights"). Copyright and Related Rights include, but are not
 43 | limited to, the following:
 44 | 
 45 |   i. the right to reproduce, adapt, distribute, perform, display,
 46 |      communicate, and translate a Work;
 47 |  ii. moral rights retained by the original author(s) and/or performer(s);
 48 | iii. publicity and privacy rights pertaining to a person's image or
 49 |      likeness depicted in a Work;
 50 |  iv. rights protecting against unfair competition in regards to a Work,
 51 |      subject to the limitations in paragraph 4(a), below;
 52 |   v. rights protecting the extraction, dissemination, use and reuse of data
 53 |      in a Work;
 54 |  vi. database rights (such as those arising under Directive 96/9/EC of the
 55 |      European Parliament and of the Council of 11 March 1996 on the legal
 56 |      protection of databases, and under any national implementation
 57 |      thereof, including any amended or successor version of such
 58 |      directive); and
 59 | vii. other similar, equivalent or corresponding rights throughout the
 60 |      world based on applicable law or treaty, and any national
 61 |      implementations thereof.
 62 | 
 63 | 2. Waiver. To the greatest extent permitted by, but not in contravention
 64 | of, applicable law, Affirmer hereby overtly, fully, permanently,
 65 | irrevocably and unconditionally waives, abandons, and surrenders all of
 66 | Affirmer's Copyright and Related Rights and associated claims and causes
 67 | of action, whether now known or unknown (including existing as well as
 68 | future claims and causes of action), in the Work (i) in all territories
 69 | worldwide, (ii) for the maximum duration provided by applicable law or
 70 | treaty (including future time extensions), (iii) in any current or future
 71 | medium and for any number of copies, and (iv) for any purpose whatsoever,
 72 | including without limitation commercial, advertising or promotional
 73 | purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
 74 | member of the public at large and to the detriment of Affirmer's heirs and
 75 | successors, fully intending that such Waiver shall not be subject to
 76 | revocation, rescission, cancellation, termination, or any other legal or
 77 | equitable action to disrupt the quiet enjoyment of the Work by the public
 78 | as contemplated by Affirmer's express Statement of Purpose.
 79 | 
 80 | 3. Public License Fallback. Should any part of the Waiver for any reason
 81 | be judged legally invalid or ineffective under applicable law, then the
 82 | Waiver shall be preserved to the maximum extent permitted taking into
 83 | account Affirmer's express Statement of Purpose. In addition, to the
 84 | extent the Waiver is so judged Affirmer hereby grants to each affected
 85 | person a royalty-free, non transferable, non sublicensable, non exclusive,
 86 | irrevocable and unconditional license to exercise Affirmer's Copyright and
 87 | Related Rights in the Work (i) in all territories worldwide, (ii) for the
 88 | maximum duration provided by applicable law or treaty (including future
 89 | time extensions), (iii) in any current or future medium and for any number
 90 | of copies, and (iv) for any purpose whatsoever, including without
 91 | limitation commercial, advertising or promotional purposes (the
 92 | "License"). The License shall be deemed effective as of the date CC0 was
 93 | applied by Affirmer to the Work. Should any part of the License for any
 94 | reason be judged legally invalid or ineffective under applicable law, such
 95 | partial invalidity or ineffectiveness shall not invalidate the remainder
 96 | of the License, and in such case Affirmer hereby affirms that he or she
 97 | will not (i) exercise any of his or her remaining Copyright and Related
 98 | Rights in the Work or (ii) assert any associated claims and causes of
 99 | action with respect to the Work, in either case contrary to Affirmer's
100 | express Statement of Purpose.
101 | 
102 | 4. Limitations and Disclaimers.
103 | 
104 |  a. No trademark or patent rights held by Affirmer are waived, abandoned,
105 |     surrendered, licensed or otherwise affected by this document.
106 |  b. Affirmer offers the Work as-is and makes no representations or
107 |     warranties of any kind concerning the Work, express, implied,
108 |     statutory or otherwise, including without limitation warranties of
109 |     title, merchantability, fitness for a particular purpose, non
110 |     infringement, or the absence of latent or other defects, accuracy, or
111 |     the present or absence of errors, whether or not discoverable, all to
112 |     the greatest extent permissible under applicable law.
113 |  c. Affirmer disclaims responsibility for clearing rights of other persons
114 |     that may apply to the Work or any use thereof, including without
115 |     limitation any person's Copyright and Related Rights in the Work.
116 |     Further, Affirmer disclaims responsibility for obtaining any necessary
117 |     consents, permissions or other rights required for any use of the
118 |     Work.
119 |  d. Affirmer understands and acknowledges that Creative Commons is not a
120 |     party to this document and has no duty or obligation with respect to
121 |     this CC0 or use of the Work.
122 | 


--------------------------------------------------------------------------------
/LICENSES/MIT.txt:
--------------------------------------------------------------------------------
 1 | MIT License Copyright (c) <year> <copyright holders>
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining a copy
 4 | of this software and associated documentation files (the "Software"), to deal
 5 | in the Software without restriction, including without limitation the rights
 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 7 | copies of the Software, and to permit persons to whom the Software is furnished
 8 | to do so, subject to the following conditions:
 9 | 
10 | The above copyright notice and this permission notice (including the next
11 | paragraph) shall be included in all copies or substantial portions of the
12 | Software.
13 | 
14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
16 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS
17 | OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
18 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
19 | OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
20 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # KCodecs
 2 | 
 3 | String encoding library
 4 | 
 5 | ## Introduction
 6 | 
 7 | KCodecs provides a collection of methods to manipulate strings using various
 8 | encodings.
 9 | 
10 | It can automatically determine the charset of a string, translate XML entities,
11 | validate email addresses, and find encodings by name in a more tolerant way than QTextCodec
12 | (useful e.g. for data coming from the Internet).
13 | 
14 | 
15 | 


--------------------------------------------------------------------------------
/autotests/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | include(ECMAddTests)
 2 | # Qt6Test is looked up quietly; when it is missing this directory is skipped instead of failing the build.
 3 | find_package(Qt6Test ${REQUIRED_QT_VERSION} CONFIG QUIET)
 4 | 
 5 | if(NOT TARGET Qt6::Test)
 6 |     message(STATUS "Qt6Test not found, autotests will not be built.")
 7 |     return()
 8 | endif()
 9 | # ecm_add_tests() builds one test executable per listed source file.
10 | ecm_add_tests(
11 |     kencodingprobertest.cpp
12 |     rfc2047test.cpp
13 |     base45test.cpp
14 |     codectest.cpp
15 |     kemailaddresstest.cpp
16 |     LINK_LIBRARIES KF6::Codecs Qt6::Test
17 | )
18 | 
19 | ecm_add_test(
20 |     kcharsetstest.cpp
21 |     LINK_LIBRARIES KF6::Codecs Qt6::Test
22 | )
23 | 
24 | # Benchmark, compiled, but not run automatically with ctest
25 | add_executable(base64benchmark base64benchmark.cpp)
26 | target_link_libraries(base64benchmark KF6::Codecs Qt6::Test)
27 | 


--------------------------------------------------------------------------------
/autotests/base45test.cpp:
--------------------------------------------------------------------------------
 1 | /*
 2 |     SPDX-FileCopyrightText: 2021 Volker Krause <vkrause@kde.org>
 3 | 
 4 |     SPDX-License-Identifier: LGPL-2.0-or-later
 5 | */
 6 | 
 7 | #include <QTest>
 8 | 
 9 | #include <KCodecs>
10 | // Data-driven tests for KCodecs::base45Decode(): known-good vectors plus malformed input.
11 | class Base45Test : public QObject
12 | {
13 |     Q_OBJECT
14 | private Q_SLOTS:
15 |     void testBase45Decode_data()
16 |     {
17 |         QTest::addColumn<QByteArray>("in");
18 |         QTest::addColumn<QByteArray>("out");
19 | 
20 |         QTest::newRow("empty") << QByteArray() << QByteArray();
21 | 
22 |         // examples from the Base45 specification (IETF draft, since published as RFC 9285) - https://datatracker.ietf.org/doc/draft-faltstrom-base45/
23 |         QTest::newRow("hello") << QByteArray("%69 VD92EX0") << QByteArray("Hello!!");
24 |         QTest::newRow("base-45") << QByteArray("UJCLQE7W581") << QByteArray("base-45");
25 |         QTest::newRow("ietf") << QByteArray("QED8WEX0") << QByteArray("ietf!");
26 | 
27 |         // from EU DGC (Digital Green Certificate) test data - https://github.com/eu-digital-green-certificates/dgc-testdata
28 |         QTest::newRow("eu-dcg")
29 |             << QByteArray(
30 |                    "6BF+70790T9WJWG.FKY*4GO0.O1CV2 O5 "
31 |                    "N2FBBRW1*70HS8WY04AC*WIFN0AHCD8KD97TK0F90KECTHGWJC0FDC:5AIA%G7X+AQB9746HS80:54IBQF60R6$A80X6S1BTYACG6M+9XG8KIAWNA91AY%67092L4WJCT3EHS8XJC$+"
32 |                    "DXJCCWENF6OF63W5NW6WF6%JC QE/IAYJC5LEW34U3ET7DXC9 QE-ED8%E.JCBECB1A-:8$96646AL60A60S6Q$D.UDRYA "
33 |                    "96NF6L/5QW6307KQEPD09WEQDD+Q6TW6FA7C466KCN9E%961A6DL6FA7D46JPCT3E5JDLA7$Q6E464W5TG6..DX%DZJC6/DTZ9 QE5$CB$DA/D "
34 |                    "JC1/D3Z8WED1ECW.CCWE.Y92OAGY8MY9L+9MPCG/D5 C5IA5N9$PC5$CUZCY$5Y$527B+A4KZNQG5TKOWWD9FL%I8U$F7O2IBM85CWOC%LEZU4R/BXHDAHN "
35 |                    "11$CA5MRI:AONFN7091K9FKIGIY%VWSSSU9%01FO2*FTPQ3C3F")
36 |             << QByteArray::fromHex(
37 |                    "789c0163019cfed28443a10126a104480c4b15512be9140159010da401624445061a60b29429041a61f39fa9390103a101a4617681aa626369782f55524e3a555643493a303"
38 |                    "144452f495a3132333435412f3543574c553132524e4f4239525853454f5036464738235762636f62444562646e026264746a323032312d30352d323962697374526f626572"
39 |                    "74204b6f63682d496e737469747574626d616d4f52472d313030303331313834626d706c45552f312f32302f3135303762736402627467693834303533393030366276706a3"
40 |                    "131313933343930303763646f626a313936342d30382d3132636e616da462666e6a4d75737465726d616e6e62676e654572696b6163666e746a4d55535445524d414e4e6367"
41 |                    "6e74654552494b416376657265312e302e305840218ebc2a2a77c1796c95a8c942987d461411b0075fd563447295250d5ead69f3b8f6083a515bd97656e87aca01529e6aa0e"
42 |                    "09144fc07e2884c93080f1419e82f1c66773a");
43 |     }
44 | 
45 |     void testBase45Decode()
46 |     {
47 |         QFETCH(QByteArray, in);
48 |         QFETCH(QByteArray, out);
49 | 
50 |         QCOMPARE(KCodecs::base45Decode(in), out);
51 |     }
52 | 
53 |     void testBase45DecodeInvalid_data()
54 |     {
55 |         QTest::addColumn<QByteArray>("in");
56 |         QTest::newRow("1 byte") << QByteArray("X"); // a lone character is not a complete Base45 group
57 |         QTest::newRow("invalid chars") << QByteArray("%69 vD92Ex0"); // lower-case letters are outside the Base45 alphabet
58 |     }
59 | 
60 |     void testBase45DecodeInvalid()
61 |     {
62 |         QFETCH(QByteArray, in);
63 | 
64 |         // the result for malformed input is unspecified; the call only has to finish without crashing or ASAN errors
65 |         KCodecs::base45Decode(in);
66 |     }
67 | };
68 | 
69 | QTEST_APPLESS_MAIN(Base45Test)
70 | 
71 | #include "base45test.moc"
72 | 


--------------------------------------------------------------------------------
/autotests/base64benchmark.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |     SPDX-FileCopyrightText: 2010 Volker Krause <vkrause@kde.org>
  3 | 
  4 |     SPDX-License-Identifier: LGPL-2.0-or-later
  5 | */
  6 | 
  7 | #include "../src/kcodecsbase64.h"
  8 | 
  9 | #include <KCodecs>
 10 | 
 11 | #include <QByteArray>
 12 | #include <QObject>
 13 | #include <QTest>
 14 | 
 15 | class Base64Benchmark : public QObject // benchmarks three Base64 decoding paths on the same payloads
 16 | {
 17 |     Q_OBJECT
 18 | private:
 19 |     static QByteArray fillByteArray(int size) // returns `size` bytes taken from a char counter starting at 0
 20 |     {
 21 |         QByteArray result;
 22 |         result.reserve(size);
 23 |         for (char c = 0; result.size() < size; ++c) {
 24 |             result.append(c);
 25 |         }
 26 |         return result;
 27 |     }
 28 | 
 29 |     void createTestSet() // shared data table: "output" = raw payload, "input" = that payload Base64-encoded
 30 |     {
 31 |         QTest::addColumn<QByteArray>("output");
 32 |         QTest::addColumn<QByteArray>("input");
 33 |         QTest::newRow("empty") << QByteArray() << QByteArray();
 34 |         const QByteArray raw128 = fillByteArray(128), raw1k = fillByteArray(1 << 10), raw1M = fillByteArray(1 << 20); // each built only once
 35 |         QTest::newRow("128") << raw128 << KCodecs::base64Encode(raw128);
 36 |         QTest::newRow("1k") << raw1k << KCodecs::base64Encode(raw1k);
 37 |         QTest::newRow("1M") << raw1M << KCodecs::base64Encode(raw1M);
 38 |     }
 39 | private Q_SLOTS:
 40 |     void benchmarkKCodecDecode_data()
 41 |     {
 42 |         createTestSet();
 43 |     }
 44 | 
 45 |     void benchmarkKCodecDecode() // times KCodecs::base64Decode() and checks the decoded bytes
 46 |     {
 47 |         QFETCH(QByteArray, input);
 48 |         QFETCH(QByteArray, output);
 49 |         QByteArray result;
 50 |         QBENCHMARK {
 51 |             result = KCodecs::base64Decode(input);
 52 |         }
 53 |         QCOMPARE(result, output);
 54 |     }
 55 | 
 56 |     void benchmarkQByteArrayDecode_data()
 57 |     {
 58 |         createTestSet();
 59 |     }
 60 | 
 61 |     void benchmarkQByteArrayDecode() // times QByteArray::fromBase64() on the same rows for comparison
 62 |     {
 63 |         QFETCH(QByteArray, input);
 64 |         QFETCH(QByteArray, output);
 65 |         QByteArray result;
 66 |         QBENCHMARK {
 67 |             result = QByteArray::fromBase64(input);
 68 |         }
 69 |         QCOMPARE(result, output);
 70 |     }
 71 | 
 72 |     void benchmarkKMimeBase64Decoder_data()
 73 |     {
 74 |         createTestSet();
 75 |     }
 76 | 
 77 |     void benchmarkKMimeBase64Decoder() // times the KCodecs::Codec / KCodecs::Decoder API for the "base64" codec
 78 |     {
 79 |         QFETCH(QByteArray, input);
 80 |         QFETCH(QByteArray, output);
 81 |         QByteArray result;
 82 |         QBENCHMARK {
 83 |             KCodecs::Codec *codec = KCodecs::Codec::codecForName("base64");
 84 |             QVERIFY(codec);
 85 |             result.resize(codec->maxDecodedSizeFor(input.size())); // output buffer sized by the codec's own bound
 86 |             KCodecs::Decoder *decoder = codec->makeDecoder();
 87 |             QByteArray::const_iterator src = input.constBegin();
 88 |             QByteArray::iterator dst = result.begin();
 89 |             decoder->decode(src, input.constEnd(), dst, result.constEnd()); // advances src and dst
 90 |             result.truncate(dst - result.begin()); // keep only the bytes up to the final write position
 91 |             delete decoder; // released manually at the end of every iteration
 92 |         }
 93 |         QCOMPARE(result, output);
 94 |     }
 95 | };
 96 | 
 97 | QTEST_MAIN(Base64Benchmark)
 98 | 
 99 | #include "base64benchmark.moc"
100 | 


--------------------------------------------------------------------------------
/autotests/codectest.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |     SPDX-FileCopyrightText: 2010 Thomas McGuire <mcguire@kde.org>
  3 | 
  4 |     SPDX-License-Identifier: LGPL-2.0-or-later
  5 | */
  6 | #include "codectest.h"
  7 | 
  8 | #include <QTest>
  9 | 
 10 | #include <QDir>
 11 | 
 12 | #include "../src/kcodecs.h"
 13 | 
 14 | using namespace KCodecs;
 15 | 
 16 | QTEST_MAIN(CodecTest)
 17 | 
 18 | enum Mode { // direction in which testCodecs() runs the codec for a data row
 19 |     Decode, // call Codec::decode() on the row's input
 20 |     Encode, // call Codec::encode() on the row's input
 21 | };
 22 | Q_DECLARE_METATYPE(Mode) // registers Mode with Qt's meta-type system; it is used as a QTest column type below
 23 | 
 24 | void CodecTest::testCodecs_data()
 25 | {
 26 |     QTest::addColumn<QByteArray>("input");
 27 |     QTest::addColumn<QByteArray>("expResult");
 28 |     QTest::addColumn<QByteArray>("codecName");
 29 |     QTest::addColumn<QString>("tag");
 30 |     QTest::addColumn<Mode>("mode");
 31 | 
 32 |     // Locate the data directory through a file known to live in it, then strip that file's name.
 33 |     QString dataRoot = QFINDTESTDATA("data/binary_data");
 34 |     QVERIFY(!dataRoot.isEmpty());
 35 |     dataRoot.chop(QByteArrayView("binary_data").size());
 36 |     const QDir baseDir(dataRoot);
 37 |     const QStringList subDirs = baseDir.entryList(QStringList(), QDir::Dirs | QDir::NoDotAndDotDot, QDir::NoSort);
 38 |     for (const QString &subDir : subDirs) {
 39 |         if (!subDir.toLower().startsWith(QLatin1String("codec_"))) {
 40 |             continue; // only "codec_<name>" directories are scanned
 41 |         }
 42 |         const QString codecName = subDir.mid(6); // text after the 6-character "codec_" prefix
 43 |         const QDir codecDir(baseDir.path() + QLatin1String("/") + subDir);
 44 |         const QStringList fileNames = codecDir.entryList(QStringList(), QDir::Files, QDir::NoSort);
 45 |         for (const QString &expectedName : fileNames) {
 46 |             if (!expectedName.toLower().endsWith(QLatin1String(".expected"))) {
 47 |                 continue; // one row per "<data file>.expected" file
 48 |             }
 49 |             const QString dataName = expectedName.chopped(9); // drop the 9-character ".expected" suffix
 50 |             const QString dirPrefix = codecDir.path() + QLatin1Char('/');
 51 |             QFile dataFile(dirPrefix + dataName);
 52 |             QFile expectedFile(dirPrefix + expectedName);
 53 |             QVERIFY(dataFile.open(QIODevice::ReadOnly));
 54 |             QVERIFY(expectedFile.open(QIODevice::ReadOnly));
 55 | 
 56 |             // Encode only when the name contains "-encode" but not "-decode"; every other file is decoded.
 57 |             const bool encodeRow = !expectedName.contains(QLatin1String("-decode")) && expectedName.contains(QLatin1String("-encode"));
 58 |             const Mode mode = encodeRow ? Encode : Decode;
 59 | 
 60 |             const QByteArray data = dataFile.readAll();
 61 |             const QByteArray expected = expectedFile.readAll();
 62 |             dataFile.close();
 63 |             expectedFile.close();
 64 | 
 65 |             const QString tag = codecName + QLatin1Char('/') + dataName;
 66 |             QTest::newRow(tag.toLatin1().constData()) << data << expected << codecName.toLatin1() << tag << mode;
 67 |         }
 68 |     }
 69 | }
 70 | 
 71 | void CodecTest::testCodecs()
 72 | {
 73 |     // Data-driven test: the columns fetched below are filled in by testCodecs_data().
 74 |     QFETCH(QByteArray, input);
 75 |     QFETCH(QByteArray, expResult);
 76 |     QFETCH(QByteArray, codecName);
 77 |     QFETCH(QString, tag);
 78 |     QFETCH(Mode, mode);
 79 | 
 80 |     Codec *codec = Codec::codecForName(codecName);
 81 |     QVERIFY(codec);
 82 | 
 83 |     // Rows whose tag is listed here are marked as expected failures; the list is currently empty.
 84 |     const QStringList blacklistedTags;
 85 |     if (blacklistedTags.contains(tag)) {
 86 |         QEXPECT_FAIL(tag.toLatin1().constData(), "Codec broken", Continue);
 87 |     }
 88 | 
 89 |     // Run the codec in the direction selected by the row, passing Codec::NewlineLF in both cases.
 90 |     const QByteArray result = (mode == Decode) ? codec->decode(input, Codec::NewlineLF) : codec->encode(input, Codec::NewlineLF);
 91 | 
 92 |     // More usable version of QCOMPARE(result, expResult), in case the difference is at the end:
 93 |     // when both sides have the same number of lines, compare them as line lists first.
 94 |     // The lists are split once and reused for the comparison instead of being split again.
 95 |     if (result != expResult) {
 96 |         const QList<QByteArray> lines = result.split('\n');
 97 |         const QList<QByteArray> expLines = expResult.split('\n');
 98 |         if (lines.count() == expLines.count()) {
 99 |             QCOMPARE(lines, expLines);
100 |         }
101 |     }
102 |     QCOMPARE(result, expResult);
103 | }
104 | 
105 | void CodecTest::testInvalidCodec() // looking up a codec name that is not registered must yield nullptr
106 | {
107 |     Codec *const missing = Codec::codecForName("thiscodectotallydoesntexist");
108 |     QCOMPARE(missing, nullptr);
109 | }
110 | 
111 | #include "moc_codectest.cpp"
112 | 


--------------------------------------------------------------------------------
/autotests/codectest.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |     SPDX-FileCopyrightText: 2010 Thomas McGuire <mcguire@kde.org>
 3 | 
 4 |     SPDX-License-Identifier: LGPL-2.0-or-later
 5 | */
 6 | #ifndef CODECTEST_H
 7 | #define CODECTEST_H
 8 | 
 9 | #include <QObject>
10 | 
11 | class CodecTest : public QObject // QtTest fixture; the slots are implemented in codectest.cpp
12 | {
13 |     Q_OBJECT
14 | private Q_SLOTS:
15 |     void testCodecs(); // runs one codec encode/decode per data row and compares with the expected bytes
16 |     void testCodecs_data(); // builds the rows from the codec_* directories next to data/binary_data
17 |     void testInvalidCodec(); // looking up an unknown codec name must return nullptr
18 | };
19 | 
20 | #endif
21 | 


--------------------------------------------------------------------------------
/autotests/data/binary_data:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/binary_data


--------------------------------------------------------------------------------
/autotests/data/codec_b/basic-decode.b:
--------------------------------------------------------------------------------
1 | AAECAwQFBgcICQoLDA0ODxAREhMUFRYXGBkaGxwdHh8gISIjJCUmJygpKissLS4vMDEyMzQ1Njc4OTo7PD0+P0BBQkNERUZHSElKS0xNTk9QUVJTVFVWV1hZWltcXV5fYGFiY2RlZmdoaWprbG1ub3BxcnN0dXZ3eHl6e3x9fn+AgYKDhIWGh4iJiouMjY6PkJGSk5SVlpeYmZqbnJ2en6ChoqOkpaanqKmqq6ytrq+wsbKztLW2t7i5uru8vb6/wMHCw8TFxsfIycrLzM3Oz9DR0tPU1dbX2Nna29zd3t/g4eLj5OXm5+jp6uvs7e7v8PHy8/T19vf4+fr7/P3+/w==


--------------------------------------------------------------------------------
/autotests/data/codec_b/basic-decode.b.expected:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/codec_b/basic-decode.b.expected


--------------------------------------------------------------------------------
/autotests/data/codec_b/basic-encode:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/codec_b/basic-encode


--------------------------------------------------------------------------------
/autotests/data/codec_b/basic-encode.expected:
--------------------------------------------------------------------------------
1 | AAECAwQFBgcICQoLDA0ODxAREhMUFRYXGBkaGxwdHh8gISIjJCUmJygpKissLS4vMDEyMzQ1Njc4OTo7PD0+P0BBQkNERUZHSElKS0xNTk9QUVJTVFVWV1hZWltcXV5fYGFiY2RlZmdoaWprbG1ub3BxcnN0dXZ3eHl6e3x9fn+AgYKDhIWGh4iJiouMjY6PkJGSk5SVlpeYmZqbnJ2en6ChoqOkpaanqKmqq6ytrq+wsbKztLW2t7i5uru8vb6/wMHCw8TFxsfIycrLzM3Oz9DR0tPU1dbX2Nna29zd3t/g4eLj5OXm5+jp6uvs7e7v8PHy8/T19vf4+fr7/P3+/w==


--------------------------------------------------------------------------------
/autotests/data/codec_b/null-decode.b:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/codec_b/null-decode.b


--------------------------------------------------------------------------------
/autotests/data/codec_b/null-decode.b.expected:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/codec_b/null-decode.b.expected


--------------------------------------------------------------------------------
/autotests/data/codec_b/null-encode:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/codec_b/null-encode


--------------------------------------------------------------------------------
/autotests/data/codec_b/null-encode.expected:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/codec_b/null-encode.expected


--------------------------------------------------------------------------------
/autotests/data/codec_b/padding0-encode:
--------------------------------------------------------------------------------
1 | abc


--------------------------------------------------------------------------------
/autotests/data/codec_b/padding0-encode.expected:
--------------------------------------------------------------------------------
1 | YWJj


--------------------------------------------------------------------------------
/autotests/data/codec_b/padding1-encode:
--------------------------------------------------------------------------------
1 | ab


--------------------------------------------------------------------------------
/autotests/data/codec_b/padding1-encode.expected:
--------------------------------------------------------------------------------
1 | YWI=


--------------------------------------------------------------------------------
/autotests/data/codec_b/padding2-encode:
--------------------------------------------------------------------------------
1 | a


--------------------------------------------------------------------------------
/autotests/data/codec_b/padding2-encode.expected:
--------------------------------------------------------------------------------
1 | YQ==


--------------------------------------------------------------------------------
/autotests/data/codec_base64/basic-decode.base64:
--------------------------------------------------------------------------------
1 | AAECAwQFBgcICQoLDA0ODxAREhMUFRYXGBkaGxwdHh8gISIjJCUmJygpKissLS4vMDEyMzQ1Njc4
2 | OTo7PD0+P0BBQkNERUZHSElKS0xNTk9QUVJTVFVWV1hZWltcXV5fYGFiY2RlZmdoaWprbG1ub3Bx
3 | cnN0dXZ3eHl6e3x9fn+AgYKDhIWGh4iJiouMjY6PkJGSk5SVlpeYmZqbnJ2en6ChoqOkpaanqKmq
4 | q6ytrq+wsbKztLW2t7i5uru8vb6/wMHCw8TFxsfIycrLzM3Oz9DR0tPU1dbX2Nna29zd3t/g4eLj
5 | 5OXm5+jp6uvs7e7v8PHy8/T19vf4+fr7/P3+/w==
6 | 


--------------------------------------------------------------------------------
/autotests/data/codec_base64/basic-decode.base64.expected:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/codec_base64/basic-decode.base64.expected


--------------------------------------------------------------------------------
/autotests/data/codec_base64/basic-encode:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/codec_base64/basic-encode


--------------------------------------------------------------------------------
/autotests/data/codec_base64/basic-encode.expected:
--------------------------------------------------------------------------------
1 | AAECAwQFBgcICQoLDA0ODxAREhMUFRYXGBkaGxwdHh8gISIjJCUmJygpKissLS4vMDEyMzQ1Njc4
2 | OTo7PD0+P0BBQkNERUZHSElKS0xNTk9QUVJTVFVWV1hZWltcXV5fYGFiY2RlZmdoaWprbG1ub3Bx
3 | cnN0dXZ3eHl6e3x9fn+AgYKDhIWGh4iJiouMjY6PkJGSk5SVlpeYmZqbnJ2en6ChoqOkpaanqKmq
4 | q6ytrq+wsbKztLW2t7i5uru8vb6/wMHCw8TFxsfIycrLzM3Oz9DR0tPU1dbX2Nna29zd3t/g4eLj
5 | 5OXm5+jp6uvs7e7v8PHy8/T19vf4+fr7/P3+/w==
6 | 


--------------------------------------------------------------------------------
/autotests/data/codec_base64/corrupt.base64:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/codec_base64/corrupt.base64


--------------------------------------------------------------------------------
/autotests/data/codec_base64/corrupt.base64.expected:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/codec_base64/corrupt.base64.expected


--------------------------------------------------------------------------------
/autotests/data/codec_base64/very_small-encode:
--------------------------------------------------------------------------------
1 | 12


--------------------------------------------------------------------------------
/autotests/data/codec_base64/very_small-encode.expected:
--------------------------------------------------------------------------------
1 | MTI=
2 | 


--------------------------------------------------------------------------------
/autotests/data/codec_q/all-encoded-decode.q:
--------------------------------------------------------------------------------
1 | =00=01=02=03=04=05=06=07=08=09=0A=0B=0C=0D=0E=0F=10=11=12=13=14=15=16=17=18=19=1A=1B=1C=1D=1E=1F=20=21=22=23=24=25=26=27=28=29=2A=2B=2C=2D=2E=2F=30=31=32=33=34=35=36=37=38=39=3A=3B=3C=3D=3E=3F=40=41=42=43=44=45=46=47=48=49=4A=4B=4C=4D=4E=4F=50=51=52=53=54=55=56=57=58=59=5A=5B=5C=5D=5E=5F=60=61=62=63=64=65=66=67=68=69=6A=6B=6C=6D=6E=6F=70=71=72=73=74=75=76=77=78=79=7A=7B=7C=7D=7E=7F=80=81=82=83=84=85=86=87=88=89=8A=8B=8C=8D=8E=8F=90=91=92=93=94=95=96=97=98=99=9A=9B=9C=9D=9E=9F=A0=A1=A2=A3=A4=A5=A6=A7=A8=A9=AA=AB=AC=AD=AE=AF=B0=B1=B2=B3=B4=B5=B6=B7=B8=B9=BA=BB=BC=BD=BE=BF=C0=C1=C2=C3=C4=C5=C6=C7=C8=C9=CA=CB=CC=CD=CE=CF=D0=D1=D2=D3=D4=D5=D6=D7=D8=D9=DA=DB=DC=DD=DE=DF=E0=E1=E2=E3=E4=E5=E6=E7=E8=E9=EA=EB=EC=ED=EE=EF=F0=F1=F2=F3=F4=F5=F6=F7=F8=F9=FA=FB=FC=FD=FE=FF


--------------------------------------------------------------------------------
/autotests/data/codec_q/all-encoded-decode.q.expected:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/codec_q/all-encoded-decode.q.expected


--------------------------------------------------------------------------------
/autotests/data/codec_q/basic-encode:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/codec_q/basic-encode


--------------------------------------------------------------------------------
/autotests/data/codec_q/basic-encode.expected:
--------------------------------------------------------------------------------
1 | =00=01=02=03=04=05=06=07=08=09=0A=0B=0C=0D=0E=0F=10=11=12=13=14=15=16=17=18=19=1A=1B=1C=1D=1E=1F_!=22=23=24=25=26=27=28=29*+=2C-=2E/0123456789=3A=3B=3C=3D=3E=3F=40ABCDEFGHIJKLMNOPQRSTUVWXYZ=5B=5C=5D=5E=5F=60abcdefghijklmnopqrstuvwxyz=7B=7C=7D=7E=7F=80=81=82=83=84=85=86=87=88=89=8A=8B=8C=8D=8E=8F=90=91=92=93=94=95=96=97=98=99=9A=9B=9C=9D=9E=9F=A0=A1=A2=A3=A4=A5=A6=A7=A8=A9=AA=AB=AC=AD=AE=AF=B0=B1=B2=B3=B4=B5=B6=B7=B8=B9=BA=BB=BC=BD=BE=BF=C0=C1=C2=C3=C4=C5=C6=C7=C8=C9=CA=CB=CC=CD=CE=CF=D0=D1=D2=D3=D4=D5=D6=D7=D8=D9=DA=DB=DC=DD=DE=DF=E0=E1=E2=E3=E4=E5=E6=E7=E8=E9=EA=EB=EC=ED=EE=EF=F0=F1=F2=F3=F4=F5=F6=F7=F8=F9=FA=FB=FC=FD=FE=FF


--------------------------------------------------------------------------------
/autotests/data/codec_q/null-decode.q:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/codec_q/null-decode.q


--------------------------------------------------------------------------------
/autotests/data/codec_q/null-decode.q.expected:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/codec_q/null-decode.q.expected


--------------------------------------------------------------------------------
/autotests/data/codec_q/null-encode:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/codec_q/null-encode


--------------------------------------------------------------------------------
/autotests/data/codec_q/null-encode.expected:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/codec_q/null-encode.expected


--------------------------------------------------------------------------------
/autotests/data/codec_quoted-printable/basic-decode.quoted-printable:
--------------------------------------------------------------------------------
1 | =00=01=02=03=04=05=06=07=08=09
2 | =0B=0C=0D=0E=0F=10=11=12=13=14=15=16=17=18=19=1A=1B=1C=1D=1E=1F !"#$%&'()*+=
3 | ,-./0123456789:;<=3D>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrst=
4 | uvwxyz{|}~=7F=80=81=82=83=84=85=86=87=88=89=8A=8B=8C=8D=8E=8F=90=91=92=93=
5 | =94=95=96=97=98=99=9A=9B=9C=9D=9E=9F=A0=A1=A2=A3=A4=A5=A6=A7=A8=A9=AA=AB=AC=
6 | =AD=AE=AF=B0=B1=B2=B3=B4=B5=B6=B7=B8=B9=BA=BB=BC=BD=BE=BF=C0=C1=C2=C3=C4=C5=
7 | =C6=C7=C8=C9=CA=CB=CC=CD=CE=CF=D0=D1=D2=D3=D4=D5=D6=D7=D8=D9=DA=DB=DC=DD=DE=
8 | =DF=E0=E1=E2=E3=E4=E5=E6=E7=E8=E9=EA=EB=EC=ED=EE=EF=F0=F1=F2=F3=F4=F5=F6=F7=
9 | =F8=F9=FA=FB=FC=FD=FE=FF


--------------------------------------------------------------------------------
/autotests/data/codec_quoted-printable/basic-decode.quoted-printable.expected:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/codec_quoted-printable/basic-decode.quoted-printable.expected


--------------------------------------------------------------------------------
/autotests/data/codec_quoted-printable/basic-encode:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/codec_quoted-printable/basic-encode


--------------------------------------------------------------------------------
/autotests/data/codec_quoted-printable/basic-encode.expected:
--------------------------------------------------------------------------------
1 | =00=01=02=03=04=05=06=07=08=09
2 | =0B=0C=0D=0E=0F=10=11=12=13=14=15=16=17=18=19=1A=1B=1C=1D=1E=1F !"#$%&'()*+=
3 | ,-./0123456789:;<=3D>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrst=
4 | uvwxyz{|}~=7F=80=81=82=83=84=85=86=87=88=89=8A=8B=8C=8D=8E=8F=90=91=92=93=
5 | =94=95=96=97=98=99=9A=9B=9C=9D=9E=9F=A0=A1=A2=A3=A4=A5=A6=A7=A8=A9=AA=AB=AC=
6 | =AD=AE=AF=B0=B1=B2=B3=B4=B5=B6=B7=B8=B9=BA=BB=BC=BD=BE=BF=C0=C1=C2=C3=C4=C5=
7 | =C6=C7=C8=C9=CA=CB=CC=CD=CE=CF=D0=D1=D2=D3=D4=D5=D6=D7=D8=D9=DA=DB=DC=DD=DE=
8 | =DF=E0=E1=E2=E3=E4=E5=E6=E7=E8=E9=EA=EB=EC=ED=EE=EF=F0=F1=F2=F3=F4=F5=F6=F7=
9 | =F8=F9=FA=FB=FC=FD=FE=FF


--------------------------------------------------------------------------------
/autotests/data/codec_quoted-printable/corrupt.quoted-printable:
--------------------------------------------------------------------------------
1 | A =3D wasn't properly encoded (should be kept): APE=MAN MAN=APE
2 | A =3D wasn't properly encoded (lowercase): ape=man man=ape
3 | Lowercase hexchars: =bb=a1=4b=44=45 =72=75=6c=65=7a=21=ab
4 | Mixed-case hexchars: =Bb=A1=4B=44=45 =72=75=6C=65=7A=21=aB
5 | A misplaced (unencoded =3D), followed by whitespace: =  not at end!
6 | Two consecutive =3D at the end of the line: ==
7 | A misplaced (unencoded =3D), as the ultimate character: =


--------------------------------------------------------------------------------
/autotests/data/codec_quoted-printable/corrupt.quoted-printable.expected:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/codec_quoted-printable/corrupt.quoted-printable.expected


--------------------------------------------------------------------------------
/autotests/data/codec_quoted-printable/corrupt2.quoted-printable:
--------------------------------------------------------------------------------
1 | An incomplete encoded character at the very end of the encoded data: =a


--------------------------------------------------------------------------------
/autotests/data/codec_quoted-printable/corrupt2.quoted-printable.expected:
--------------------------------------------------------------------------------
1 | An incomplete encoded character at the very end of the encoded data: =a


--------------------------------------------------------------------------------
/autotests/data/codec_quoted-printable/corrupt3.quoted-printable:
--------------------------------------------------------------------------------
1 | An invalid encoded character at the very end of the encoded data: =ax


--------------------------------------------------------------------------------
/autotests/data/codec_quoted-printable/corrupt3.quoted-printable.expected:
--------------------------------------------------------------------------------
1 | An invalid encoded character at the very end of the encoded data: =ax


--------------------------------------------------------------------------------
/autotests/data/codec_quoted-printable/corrupt4.quoted-printable:
--------------------------------------------------------------------------------
1 | Two =3D at the very end of the encoded data: ==


--------------------------------------------------------------------------------
/autotests/data/codec_quoted-printable/corrupt4.quoted-printable.expected:
--------------------------------------------------------------------------------
1 | Two = at the very end of the encoded data: ==


--------------------------------------------------------------------------------
/autotests/data/codec_quoted-printable/wrap-encode:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/codec_quoted-printable/wrap-encode


--------------------------------------------------------------------------------
/autotests/data/codec_quoted-printable/wrap-encode.expected:
--------------------------------------------------------------------------------
 1 | This is a line without a special char at the end.
 2 | This is a line with a space at the end.=20
 3 | This is a line with multiple spaces at the end.      =20
 4 | This is a line with a tab at the end.=09
 5 | This is a line with an umlaut at the end.=E4
 6 | This is a line with an umlaut and a space at the end.=E4=20
 7 | This is a line with an umlaut and a tab at the end.=E4=09
 8 | =46rom This is a line with From at the beginning.
 9 | =2EThis is a line with a dot at the beginning.
10 | =2DThis is a line with a dash at the beginning.
11 | 
12 | This is a very long line (=E4 ) which just happens to be wrapped so that a =
13 | =46rom appears at the beginning of the second line. Furthermore, this break=
14 | =2E makes a dot appear as the first character on the third line.
15 | 
16 | Just long enough: xxxxxxxx This is a line without a special char at the end.
17 | Just too long: xxxxxxxxxxxx This is a line without a special char at the en=
18 | d.
19 | xxxxxxxxxxxxxxxxxxxxxxxxxxxx This is a line without a special char at the e=
20 | nd.
21 | xxxxxxxxxxxxxxxxxxxxxxxxxxxxx This is a line without a special char at the =
22 | end.
23 | 
24 | Just long enough: xxxxxxxxxxxxxxx This is a line with a space at the end.=20
25 | Just too long: xxxxxxxxxxxxxxxxxxx This is a line with a space at the end.=
26 | =20
27 | xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx This is a line with a space at the end.=
28 | =20
29 | xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx This is a line with a space at the end=
30 | =2E=20
31 | 
32 | Just long enough: xxxxxxxxxxxxxxxxx This is a line with a tab at the end.=09
33 | Just too long: xxxxxxxxxxxxxxxxxxxxx This is a line with a tab at the end.=
34 | =09
35 | xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx This is a line with a tab at the end.=
36 | =09
37 | xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx This is a line with a tab at the end=
38 | =2E=09
39 | 
40 | Just long enough: xxxxxxxxxxxxx This is a line with an umlaut at the end.=E4
41 | Just too long: xxxxxxxxxxxxxxxxx This is a line with an umlaut at the end.=
42 | =E4
43 | xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx This is a line with an umlaut at the end.=
44 | =E4
45 | xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx This is a line with an umlaut at the end=
46 | =2E=E4
47 | 
48 | Just long enough This is a line with an umlaut and a space at the end.=E4=20
49 | Just too long: xx This is a line with an umlaut and a space at the end.=E4=
50 | =20
51 | xxxxxxxxxxxxxxxxxx This is a line with an umlaut and a space at the end.=E4=
52 | =20
53 | xxxxxxxxxxxxxxxxxxx This is a line with an umlaut and a space at the end.=
54 | =E4=20
55 | 
56 | Just long enough:  This is a line with an umlaut and a tab at the end.=E4=09
57 | Just too long: xxxx This is a line with an umlaut and a tab at the end.=E4=
58 | =09
59 | xxxxxxxxxxxxxxxxxxxx This is a line with an umlaut and a tab at the end.=E4=
60 | =09
61 | xxxxxxxxxxxxxxxxxxxxx This is a line with an umlaut and a tab at the end.=
62 | =E4=09
63 | 
64 | This line has a space at the end and ends the buffer=20


--------------------------------------------------------------------------------
/autotests/data/codec_x-kmime-rfc2231/all-encoded.x-kmime-rfc2231-decode:
--------------------------------------------------------------------------------
1 | %00%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D%0E%0F%10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F%20%21%22%23%24%25%26%27%28%29%2A%2B%2C%2D%2E%2F%30%31%32%33%34%35%36%37%38%39%3A%3B%3C%3D%3E%3F%40%41%42%43%44%45%46%47%48%49%4A%4B%4C%4D%4E%4F%50%51%52%53%54%55%56%57%58%59%5A%5B%5C%5D%5E%5F%60%61%62%63%64%65%66%67%68%69%6A%6B%6C%6D%6E%6F%70%71%72%73%74%75%76%77%78%79%7A%7B%7C%7D%7E%7F%80%81%82%83%84%85%86%87%88%89%8A%8B%8C%8D%8E%8F%90%91%92%93%94%95%96%97%98%99%9A%9B%9C%9D%9E%9F%A0%A1%A2%A3%A4%A5%A6%A7%A8%A9%AA%AB%AC%AD%AE%AF%B0%B1%B2%B3%B4%B5%B6%B7%B8%B9%BA%BB%BC%BD%BE%BF%C0%C1%C2%C3%C4%C5%C6%C7%C8%C9%CA%CB%CC%CD%CE%CF%D0%D1%D2%D3%D4%D5%D6%D7%D8%D9%DA%DB%DC%DD%DE%DF%E0%E1%E2%E3%E4%E5%E6%E7%E8%E9%EA%EB%EC%ED%EE%EF%F0%F1%F2%F3%F4%F5%F6%F7%F8%F9%FA%FB%FC%FD%FE%FF


--------------------------------------------------------------------------------
/autotests/data/codec_x-kmime-rfc2231/all-encoded.x-kmime-rfc2231-decode.expected:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/codec_x-kmime-rfc2231/all-encoded.x-kmime-rfc2231-decode.expected


--------------------------------------------------------------------------------
/autotests/data/codec_x-kmime-rfc2231/basic-encode:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/codec_x-kmime-rfc2231/basic-encode


--------------------------------------------------------------------------------
/autotests/data/codec_x-kmime-rfc2231/basic-encode.expected:
--------------------------------------------------------------------------------
1 | %00%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D%0E%0F%10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F%20!%22%23%24%25%26%27%28%29%2A+%2C-%2E%2F0123456789%3A%3B%3C%3D%3E%3F%40ABCDEFGHIJKLMNOPQRSTUVWXYZ%5B%5C%5D%5E%5F%60abcdefghijklmnopqrstuvwxyz%7B%7C%7D%7E%7F%80%81%82%83%84%85%86%87%88%89%8A%8B%8C%8D%8E%8F%90%91%92%93%94%95%96%97%98%99%9A%9B%9C%9D%9E%9F%A0%A1%A2%A3%A4%A5%A6%A7%A8%A9%AA%AB%AC%AD%AE%AF%B0%B1%B2%B3%B4%B5%B6%B7%B8%B9%BA%BB%BC%BD%BE%BF%C0%C1%C2%C3%C4%C5%C6%C7%C8%C9%CA%CB%CC%CD%CE%CF%D0%D1%D2%D3%D4%D5%D6%D7%D8%D9%DA%DB%DC%DD%DE%DF%E0%E1%E2%E3%E4%E5%E6%E7%E8%E9%EA%EB%EC%ED%EE%EF%F0%F1%F2%F3%F4%F5%F6%F7%F8%F9%FA%FB%FC%FD%FE%FF


--------------------------------------------------------------------------------
/autotests/data/codec_x-kmime-rfc2231/null-decode.x-kmime-rfc2231:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/codec_x-kmime-rfc2231/null-decode.x-kmime-rfc2231


--------------------------------------------------------------------------------
/autotests/data/codec_x-kmime-rfc2231/null-decode.x-kmime-rfc2231.expected:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/codec_x-kmime-rfc2231/null-decode.x-kmime-rfc2231.expected


--------------------------------------------------------------------------------
/autotests/data/codec_x-kmime-rfc2231/null-encode:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/codec_x-kmime-rfc2231/null-encode


--------------------------------------------------------------------------------
/autotests/data/codec_x-kmime-rfc2231/null-encode.expected:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/codec_x-kmime-rfc2231/null-encode.expected


--------------------------------------------------------------------------------
/autotests/data/codec_x-uuencode/basic-decode.x-uuencode:
--------------------------------------------------------------------------------
 1 | begin 664 foo
 2 | M``$"`P0%!@<("0H+#`T.#Q`1$A,4%187&!D:&QP='A\@(2(C)"4F)R@I*BLL
 3 | M+2XO,#$R,S0U-C<X.3H[/#T^/T!!0D-$149'2$E*2TQ-3D]045)35%565UA9
 4 | M6EM<75Y?8&%B8V1E9F=H:6IK;&UN;W!Q<G-T=79W>'EZ>WQ]?G^`@8*#A(6&
 5 | MAXB)BHN,C8Z/D)&2DY25EI>8F9J;G)V>GZ"AHJ.DI::GJ*FJJZRMKJ^PL;*S
 6 | MM+6VM[BYNKN\O;Z_P,'"P\3%QL?(R<K+S,W.S]#1TM/4U=;7V-G:V]S=WM_@
 7 | ?X>+CY.7FY^CIZNOL[>[O\/'R\_3U]O?X^?K[_/W^_P``
 8 | `
 9 | end
10 | 


--------------------------------------------------------------------------------
/autotests/data/codec_x-uuencode/basic-decode.x-uuencode.expected:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/codec_x-uuencode/basic-decode.x-uuencode.expected


--------------------------------------------------------------------------------
/autotests/kcharsetstest.cpp:
--------------------------------------------------------------------------------
 1 | /*
 2 |     SPDX-FileCopyrightText: 2011 Romain Perier <bambi@kubuntu.org>
 3 | 
 4 |     SPDX-License-Identifier: GPL-2.0-or-later
 5 | */
 6 | 
 7 | #include "kcharsetstest.h"
 8 | 
 9 | #include "kcharsets_p.h"
10 | #include <QDebug>
11 | #include <QString>
12 | #include <QTest>
13 | #include <kcharsets.h>
14 | 
15 | using namespace Qt::Literals;
16 | 
17 | static bool encodingNameHasADescription(const QString &encodingName, const QStringList &descriptions)
18 | {
19 |     // QStringList::filter() keeps only the descriptions that contain encodingName
20 |     // (case-sensitive substring match), so a non-empty result means one matched.
21 |     return !descriptions.filter(encodingName).isEmpty();
22 | }
23 | 
24 | void KCharsetsTest::testSingleton() // KCharsets::charsets() must be non-null and stable
25 | {
26 |     QVERIFY(KCharsets::charsets() != nullptr);
27 |     QCOMPARE(KCharsets::charsets(), KCharsets::charsets()); // two calls yield the same pointer
28 | }
29 | 
30 | void KCharsetsTest::testFromEntity() // checks fromEntity() on numeric references and named entities
31 | {
32 |     KCharsets *singleton = KCharsets::charsets();
33 | 
34 |     QCOMPARE(singleton->fromEntity(QString::fromLatin1("&#1234")), QChar(1234)); // decimal numeric reference
35 |     QCOMPARE(singleton->fromEntity(QString::fromLatin1("&#x1234")), QChar(0x1234)); // hexadecimal numeric reference
36 |     QCOMPARE(singleton->fromEntity(QString::fromLatin1("lt")), QChar::fromLatin1('<')); // named entities, passed without '&' and ';'
37 |     QCOMPARE(singleton->fromEntity(QString::fromLatin1("gt")), QChar::fromLatin1('>'));
38 |     QCOMPARE(singleton->fromEntity(QString::fromLatin1("quot")), QChar::fromLatin1('"'));
39 |     QCOMPARE(singleton->fromEntity(QString::fromLatin1("amp")), QChar::fromLatin1('&'));
40 |     QCOMPARE(singleton->fromEntity(QString::fromLatin1("apos")), QChar::fromLatin1('\''));
41 |     QCOMPARE(singleton->fromEntity(u"aposgarbagesuffix"_s), QChar()); // a known name followed by extra characters gives a null QChar
42 |     QCOMPARE(singleton->fromEntity(u"thetasym"_s), QChar(0x03d1));
43 |     QCOMPARE(singleton->fromEntity(u"thetasymgarbagesuffix"_s), QChar()); // same check with a longer entity name
44 | }
45 | 
46 | void KCharsetsTest::testToEntity() // placeholder: KCharsets::toEntity() is not tested yet
47 | {
48 |     QSKIP("KCharsets::toEntity test not implemented."); // reported as skipped instead of silently passing
49 | }
50 | 
51 | void KCharsetsTest::testResolveEntities() // resolveEntities() must replace every entity inside a longer string
52 | {
53 |     KCharsets *singleton = KCharsets::charsets();
54 | 
55 |     QCOMPARE(singleton->resolveEntities(QString::fromLatin1("&quot;&apos;&lt;Hello &amp;World&gt;&apos;&quot;")), // input mixes five named entities with plain text
56 |              QString::fromLatin1("\"\'<Hello &World>\'\"")); // the same text with each entity replaced by its character
57 | }
58 | 
59 | void KCharsetsTest::testEncodingNames() // every available encoding name must have a description that maps back to it
60 | {
61 |     KCharsets *singleton = KCharsets::charsets();
62 |     const QStringList names = singleton->availableEncodingNames(); // const local: the range-for below cannot detach it
63 |     const QStringList descriptions = singleton->descriptiveEncodingNames(); // built once instead of once per iteration
64 |     QCOMPARE(names.count(), descriptions.count());
65 |     for (const QString &encodingName : names) {
66 |         QVERIFY(encodingNameHasADescription(encodingName, descriptions)); // the name occurs in some descriptive entry
67 |         const QString description = singleton->descriptionForEncoding(encodingName);
68 |         QVERIFY(!description.isEmpty()); QCOMPARE(singleton->encodingForName(description), encodingName); // round trip
69 |     }
70 | }
71 | 
72 | QTEST_MAIN(KCharsetsTest)
73 | 
74 | #include "moc_kcharsetstest.cpp"
75 | 


--------------------------------------------------------------------------------
/autotests/kcharsetstest.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |     SPDX-FileCopyrightText: 2011 Romain Perier <bambi@kubuntu.org>
 3 | 
 4 |     SPDX-License-Identifier: GPL-2.0-or-later
 5 | */
 6 | 
 7 | #ifndef KCHARSETSTEST_H
 8 | #define KCHARSETSTEST_H
 9 | 
10 | #include <QObject>
11 | 
12 | class KCharsetsTest : public QObject // QtTest fixture for KCharsets; bodies are in kcharsetstest.cpp
13 | {
14 |     Q_OBJECT
15 | private Q_SLOTS: // each private slot is one test function
16 |     void testSingleton(); // charsets() accessor
17 |     void testFromEntity(); // entity string -> QChar
18 |     void testToEntity(); // currently skipped via QSKIP
19 |     void testResolveEntities(); // entities embedded in a longer string
20 |     void testEncodingNames(); // encoding names versus their descriptions
21 | };
22 | 
23 | #endif /* KCHARSETSTEST_H */
24 | 


--------------------------------------------------------------------------------
/autotests/kemailaddresstest.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |     This file is part of the KDE project
 3 | 
 4 |     SPDX-FileCopyrightText: 2004 David Faure <faure@kde.org>
 5 |     SPDX-FileCopyrightText: 2009 Thomas McGuire <mcguire@kde.org>
 6 | 
 7 |     SPDX-License-Identifier: LGPL-2.0-only
 8 | */
 9 | #ifndef TESTEMAIL_H
10 | #define TESTEMAIL_H
11 | 
12 | #include <QObject>
13 | 
14 | class KEmailAddressTest : public QObject // QtTest fixture; the function names suggest it covers KEmailAddress helpers
15 | {
16 |     Q_OBJECT
17 | private Q_SLOTS: // slots come in pairs: testX() plus testX_data(), the QtTest naming for a data-driven test
18 |     void testGetNameAndEmail();
19 |     void testGetNameAndEmail_data();
20 |     void testIsValidEmailAddress();
21 |     void testIsValidEmailAddress_data();
22 |     void testIsValidAddressList();
23 |     void testIsValidAddressList_data();
24 |     void testIsValidSimpleEmailAddress();
25 |     void testIsValidSimpleEmailAddress_data();
26 |     void testGetEmailAddress();
27 |     void testGetEmailAddress_data();
28 |     void testCheckSplitEmailAddrList();
29 |     void testCheckSplitEmailAddrList_data();
30 |     void testNormalizeAddressesAndEncodeIDNs();
31 |     void testNormalizeAddressesAndEncodeIDNs_data();
32 |     void testNormalizeAddressesAndDecodeIDNs();
33 |     void testNormalizeAddressesAndDecodeIDNs_data();
34 |     void testQuoteIfNecessary();
35 |     void testQuoteIfNecessary_data();
36 |     void testMailtoUrls();
37 |     void testMailtoUrls_data();
38 | };
39 | 
40 | #endif
41 | 


--------------------------------------------------------------------------------
/autotests/kencodingprobertest.cpp:
--------------------------------------------------------------------------------
 1 | /*
 2 |     SPDX-FileCopyrightText: 2012 Ni Hui <shuizhuyuanluo@126.com>
 3 | 
 4 |     SPDX-License-Identifier: GPL-2.0-or-later
 5 | */
 6 | 
 7 | #include "kencodingprobertest.h"
 8 | 
 9 | #include <QDir>
10 | #include <QTest>
11 | #include <kencodingprober.h>
12 | 
13 | static KEncodingProber *ep = nullptr;
14 | 
15 | void KEncodingProberTest::initTestCase() // creates the prober that all test functions in this file share
16 | {
17 |     ep = new KEncodingProber; // stored in the file-level static; deleted in cleanupTestCase()
18 | }
19 | 
20 | void KEncodingProberTest::cleanupTestCase() // releases the prober created in initTestCase()
21 | {
22 |     delete ep;
23 |     ep = nullptr; // do not leave a dangling pointer in the file-level static
24 | }
25 | 
26 | void KEncodingProberTest::cleanup() // QtTest invokes cleanup() after each test function
27 | {
28 |     ep->reset(); // the shared prober is reset so fed data does not carry over to the next test
29 | }
30 | 
31 | void KEncodingProberTest::testReset() // reset() must undo the effect of previously fed data
32 | {
33 |     ep->feed(QByteArray("some random data @*@#&jd"));
34 |     ep->reset();
35 |     QCOMPARE(ep->state(), KEncodingProber::Probing); // state is Probing again
36 |     QCOMPARE(ep->encoding().toLower(), QByteArray("utf-8")); // and the reported encoding is utf-8
37 | }
38 | 
39 | void KEncodingProberTest::testProbe() // feeds sample byte sequences to several prober types and checks the reported encoding
40 | {
41 |     // utf-8
42 |     ep->setProberType(KEncodingProber::Universal);
43 |     ep->feed(QByteArray::fromHex("e998bfe5b094e58d91e696afe5b1b1e88489")); // sample bytes given as hex
44 |     QCOMPARE(ep->encoding().toLower(), QByteArray("utf-8")); // compared lower-cased, so the prober's letter case does not matter
45 |     ep->reset(); // start the next sample from a clean prober
46 | 
47 |     // gb18030
48 |     ep->setProberType(KEncodingProber::ChineseSimplified);
49 |     ep->feed(QByteArray::fromHex("d7d4d3c9b5c4b0d9bfc6c8abcae9"));
50 |     QCOMPARE(ep->encoding().toLower(), QByteArray("gb18030"));
51 |     ep->reset();
52 | 
53 |     // shift_jis
54 |     ep->setProberType(KEncodingProber::Japanese);
55 |     ep->feed(QByteArray::fromHex("8374838a815b955389c88e969354"));
56 |     QCOMPARE(ep->encoding().toLower(), QByteArray("shift_jis"));
57 |     ep->reset();
58 | 
59 |     // big5
60 |     ep->setProberType(KEncodingProber::ChineseTraditional);
61 |     ep->feed(QByteArray::fromHex("aefcafc7a6caa474a141a6b3ae65a444a46a"));
62 |     QCOMPARE(ep->encoding().toLower(), QByteArray("big5"));
63 |     ep->reset();
64 | 
65 |     // binary data, just make sure we do not crash (cf. crash in bug #357341)
66 |     const QString binaryFile = QFINDTESTDATA("data/binary_data"); // located relative to the test's data directory
67 |     QVERIFY(!binaryFile.isEmpty()); // fail early when the data file cannot be found
68 |     QFile file(binaryFile);
69 |     QVERIFY(file.open(QIODevice::ReadOnly));
70 |     QByteArray binaryData(file.readAll());
71 |     ep->setProberType(KEncodingProber::Universal);
72 |     ep->feed(binaryData);
73 |     QCOMPARE(ep->encoding().toLower(), QByteArray("utf-8")); // the test expects utf-8 to be reported for this file
74 |     ep->reset();
75 | }
76 | 
77 | QTEST_MAIN(KEncodingProberTest)
78 | 
79 | #include "moc_kencodingprobertest.cpp"
80 | 


--------------------------------------------------------------------------------
/autotests/kencodingprobertest.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |     SPDX-FileCopyrightText: 2012 Ni Hui <shuizhuyuanluo@126.com>
 3 | 
 4 |     SPDX-License-Identifier: GPL-2.0-or-later
 5 | */
 6 | 
 7 | #ifndef KENCODINGPROBERTEST_H
 8 | #define KENCODINGPROBERTEST_H
 9 | 
10 | #include <QObject>
11 | 
12 | class KEncodingProberTest : public QObject // QtTest fixture for KEncodingProber; bodies are in kencodingprobertest.cpp
13 | {
14 |     Q_OBJECT
15 | private Q_SLOTS:
16 |     void initTestCase(); // allocates the shared prober
17 |     void cleanupTestCase(); // deletes the shared prober
18 |     void cleanup(); // resets the shared prober
19 |     void testReset(); // behaviour of reset() after feed()
20 |     void testProbe(); // detection results for sample byte sequences
21 | };
22 | 
23 | #endif // KENCODINGPROBERTEST_H
24 | 


--------------------------------------------------------------------------------
/autotests/rfc2047test.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |     SPDX-FileCopyrightText: 2006 Volker Krause <vkrause@kde.org>
  3 | 
  4 |     SPDX-License-Identifier: LGPL-2.0-only
  5 | */
  6 | 
  7 | #include <QTest>
  8 | 
  9 | #include "rfc2047test.h"
 10 | 
 11 | #include "../src/kcodecs.h"
 12 | 
 13 | using namespace KCodecs;
 14 | 
 15 | QTEST_MAIN(RFC2047Test)
 16 | 
 17 | void RFC2047Test::testRFC2047decode_data()
 18 | {
 19 |     QTest::addColumn<QByteArray>("input");
 20 |     QTest::addColumn<QByteArray>("expectedCharset");
 21 |     QTest::addColumn<QByteArray>("defaultCharset");
 22 |     QTest::addColumn<bool>("forceCharset");
 23 |     QTest::addColumn<QString>("expectedResult");
 24 |     // Every row carries a distinct tag so that a failing row can be identified (and selected) by name.
 25 |     /* clang-format off */
 26 |     QTest::newRow("empty") << QByteArray()
 27 |                            << QByteArray() << QByteArray("utf-8") << false
 28 |                            << QString();
 29 |     QTest::newRow("identity") << QByteArray("bla")
 30 |                               << QByteArray() << QByteArray("utf-8") << false
 31 |                               << QString::fromLatin1("bla");
 32 | 
 33 |     QTest::newRow("utf-8") << QByteArray("=?utf-8?q?Ingo=20Kl=C3=B6cker?= <kloecker@kde.org>")
 34 |                            << QByteArray("UTF-8") << QByteArray("utf-8") << false
 35 |                            << QString::fromUtf8("Ingo Klöcker <kloecker@kde.org>");
 36 |     QTest::newRow("utf-8 iso8859-1 default") << QByteArray("=?utf-8?q?Ingo=20Kl=C3=B6cker?= <kloecker@kde.org>")
 37 |                                              << QByteArray("UTF-8") << QByteArray("iso8859-1") << false
 38 |                                              << QString::fromUtf8("Ingo Klöcker <kloecker@kde.org>");
 39 |     QTest::newRow("utf-8 encoded word only") << QByteArray("=?utf-8?q?Ingo=20Kl=C3=B6cker?=")
 40 |                                              << QByteArray("UTF-8") << QByteArray("utf-8") << false
 41 |                                              << QString::fromUtf8("Ingo Klöcker");
 42 | 
 43 | 
 44 |     QTest::newRow("whitespaces") << QByteArray("=?utf-8?q?Ingo=20Kl=C3=B6cker?=       =?utf-8?q?Ingo=20Kl=C3=B6cker?=")
 45 |                                  << QByteArray("UTF-8") << QByteArray("utf-8") << false
 46 |                                  << QString::fromUtf8("Ingo KlöckerIngo Klöcker");
 47 |     QTest::newRow("whitespaces around text") << QByteArray("=?utf-8?q?Ingo=20Kl=C3=B6cker?=  foo  =?utf-8?q?Ingo=20Kl=C3=B6cker?=")
 48 |                                              << QByteArray("UTF-8") << QByteArray("utf-8") << false
 49 |                                              << QString::fromUtf8("Ingo Klöcker  foo  Ingo Klöcker");
 50 | 
 51 |     QTest::newRow("iso-8859-1") << QByteArray("=?ISO-8859-1?Q?Andr=E9s_Ot=F3n?=")
 52 |                                 << QByteArray("ISO-8859-1") << QByteArray("utf-8") << false
 53 |                                 << QString::fromUtf8("Andrés Otón");
 54 |     QTest::newRow("iso-8859-2") << QByteArray("=?iso-8859-2?q?Rafa=B3_Rzepecki?=")
 55 |                                 << QByteArray("ISO-8859-2") << QByteArray("utf-8") << false
 56 |                                 << QString::fromUtf8("Rafał Rzepecki");
 57 |     QTest::newRow("iso-8859-9") << QByteArray("=?iso-8859-9?Q?S=2E=C7a=F0lar?= Onur")
 58 |                                 << QByteArray("ISO-8859-9") << QByteArray("utf-8") << false
 59 |                                 << QString::fromUtf8("S.Çağlar Onur");
 60 |     QTest::newRow("iso-8859-15") << QByteArray("Rafael =?iso-8859-15?q?Rodr=EDguez?=")
 61 |                                  << QByteArray("ISO-8859-15") << QByteArray("utf-8") << false
 62 |                                  << QString::fromUtf8("Rafael Rodríguez");
 63 | 
 64 |     QTest::newRow("wrong charset") << QByteArray("=?iso-8859-1?q?Ingo=20Kl=C3=B6cker?=")
 65 |                                    << QByteArray("UTF-8") << QByteArray("utf-8") << true
 66 |                                    << QString::fromUtf8("Ingo Klöcker");
 67 | 
 68 |     // language parameter according to RFC 2231, section 5
 69 |     QTest::newRow("RFC-2231") << QByteArray("From: =?US-ASCII*EN?Q?Keith_Moore?= <moore@cs.utk.edu>")
 70 |                               << QByteArray("US-ASCII") << QByteArray("utf-8") << false
 71 |                               << QString::fromUtf8("From: Keith Moore <moore@cs.utk.edu>");
 72 | 
 73 |     QTest::newRow("broken QP") << QByteArray("Subject: =?iso-8859-1?Q?Belangrijk=3a=20Verhuizing=20FTP=20server?=")
 74 |                                << QByteArray("ISO-8859-1") << QByteArray("utf-8") << false
 75 |                                << QString::fromUtf8("Subject: Belangrijk: Verhuizing FTP server");
 76 | 
 77 |     // mixed charsets, based on bug 125542
 78 |     QTest::newRow("mixed charsets") << QByteArray("Subject: =?utf-8?q?Ingo=20Kl=C3=B6cker?= unencoded words =?iso-8859-9?Q?S=2E=C7a=F0lar?=")
 79 |                                     << QByteArray("UTF-8") << QByteArray("utf-8") << false
 80 |                                     << QString::fromUtf8("Subject: Ingo Klöcker unencoded words S.Çağlar");
 81 |     QTest::newRow("mixed charsets-125542") << QByteArray("Subject: =?koi8-r?b?5MXMz9fJINrB?= HP Pavillion =?iso-8859-5?b?KNzV3N7g2PjQIN/e4dXR3d4p?=")
 82 |                                     << QByteArray("UTF-8") << QByteArray("us-ascii") << false
 83 |                                     << QString::fromUtf8("Subject: Делови за HP Pavillion (меморија посебно)");
 84 | 
 85 |     // illegal characters which are already encoded in the given encoding but are not ASCII (bug 206417)
 86 |     QTest::newRow("illegal characters") << QByteArray("Subject: =?utf-8?Q?пиѿилл,=20=D0=B4=D0=BE=D0=B1=D1=80=D1=8B=D0=B9=20=D0=B4=D0=B5=D0=BD=D1=8C?=")
 87 |                                         << QByteArray("UTF-8") << QByteArray("utf-8") << false
 88 |                                         << QString::fromUtf8("Subject: пиѿилл, добрый день");
 89 |     const auto iso88591Encoded = QByteArray::fromHex("D6C4DCF6E4FC"); // "ÖÄÜöäü" in ISO-8859-1 encoding - this is not valid UTF-8 though and thus rejected by MSVC in string literals
 90 |     QTest::newRow("illegal characters latin1") << QByteArray("Subject: =?iso-8859-1?Q?") + iso88591Encoded + "?="
 91 |                                                << QByteArray("ISO-8859-1") << QByteArray("utf-8") << false
 92 |                                                << QString::fromLatin1("Subject: " + iso88591Encoded);
 93 | 
 94 | 
 95 |     QTest::newRow("small data") << QByteArray("=?iso-8859-1?Q?c?=")
 96 |                                 << QByteArray("ISO-8859-1") << QByteArray("utf-8") << false
 97 |                                 << QString::fromUtf8("c");
 98 |     /* clang-format on */
 99 | }
100 | 
101 | void RFC2047Test::testRFC2047decode() // decodes each row from testRFC2047decode_data() and checks text and charset
102 | {
103 |     QFETCH(QByteArray, input);
104 |     QFETCH(QByteArray, expectedCharset);
105 |     QFETCH(QByteArray, defaultCharset);
106 |     QFETCH(bool, forceCharset);
107 |     QFETCH(QString, expectedResult);
108 | 
109 |     QByteArray detectedCharset; // passed by pointer below and compared against expectedCharset afterwards
110 | 
111 |     const KCodecs::CharsetOption options = forceCharset ? KCodecs::ForceDefaultCharset : KCodecs::NoOption; // the bool column selects the option
112 |     const QString result = KCodecs::decodeRFC2047String(input, &detectedCharset, defaultCharset, options);
113 | 
114 |     QCOMPARE(result, expectedResult);
115 |     QCOMPARE(detectedCharset, expectedCharset);
116 | }
117 | 
118 | void RFC2047Test::testInvalidDecode_data()
119 | {
120 |     QTest::addColumn<QByteArray>("input");
121 |     QTest::addColumn<QString>("expectedResult");
122 |     // Malformed encoded words are expected to come back unchanged; each row has its own tag so failures are attributable.
123 |     QTest::newRow("lone equals sign") << QByteArray("=") << QString::fromUtf8("=");
124 |     QTest::newRow("start marker only") << QByteArray("=?") << QString::fromUtf8("=?");
125 |     QTest::newRow("missing encoded text") << QByteArray("=?a?b?=") << QString::fromUtf8("=?a?b?=");
126 |     QTest::newRow("missing closing equals") << QByteArray("=?a?b?c?") << QString::fromUtf8("=?a?b?c?");
127 |     QTest::newRow("empty encoding") << QByteArray("=?a??c?=") << QString::fromUtf8("=?a??c?=");
128 | }
129 | 
130 | void RFC2047Test::testInvalidDecode() // runs decodeRFC2047String() on each row from testInvalidDecode_data()
131 | {
132 |     QFETCH(QByteArray, input);
133 |     QFETCH(QString, expectedResult);
134 | 
135 |     QByteArray encCharset; // required out-parameter; its value is not checked here
136 | 
137 |     const QString result = KCodecs::decodeRFC2047String(input, &encCharset);
138 |     QCOMPARE(result, expectedResult);
139 | }
140 | 
141 | void RFC2047Test::testRFC2047encode_data() // rows for testRFC2047encode(): text, requested charset, expected bytes
142 | {
143 |     QTest::addColumn<QString>("input");
144 |     QTest::addColumn<QByteArray>("encoding");
145 |     QTest::addColumn<QByteArray>("expectedResult");
146 | 
147 |     /* clang-format off */
148 |     QTest::newRow("empty") << QString()
149 |                            << QByteArray("utf-8")
150 |                            << QByteArray();
151 |     QTest::newRow("identity") << QString::fromUtf8("bla")
152 |                               << QByteArray("utf-8")
153 |                               << QByteArray("bla");
154 |     QTest::newRow("QP") << QString::fromUtf8("Ingo Klöcker <kloecker@kde.org>") // this tag is referenced by QEXPECT_FAIL in testRFC2047encode()
155 |                         << QByteArray("utf-8")
156 |                         << QByteArray("=?UTF-8?q?Ingo=20Kl=C3=B6cker?= <kloecker@kde.org>");
157 | 
158 |     QTest::newRow("utf-8 fallback") << QString::fromUtf8("æſðđŋħł")
159 |                                     << QByteArray("latin1") // requested charset is latin1, yet the expected output is labelled UTF-8
160 |                                     << QByteArray("=?UTF-8?B?w6bFv8OwxJHFi8SnxYI=?=");
161 |     /* clang-format on */
162 | }
163 | 
164 | void RFC2047Test::testRFC2047encode() // encodes each row from testRFC2047encode_data() and compares the bytes
165 | {
166 |     QFETCH(QString, input);
167 |     QFETCH(QByteArray, encoding);
168 |     QFETCH(QByteArray, expectedResult);
169 | 
170 |     const QByteArray result = KCodecs::encodeRFC2047String(input, encoding);
171 | 
172 |     // the expected value of the "QP" row is probably wrong: per the message below, KCodecs chooses 'B' instead of 'Q' encoding
173 |     QEXPECT_FAIL("QP", "KCodecs will choose 'B' instead of 'Q' encoding", Continue); // applies to the "QP" row only; Continue keeps the test running
174 |     QCOMPARE(result, expectedResult);
175 | }
176 | 
177 | #include "moc_rfc2047test.cpp"
178 | 


--------------------------------------------------------------------------------
/autotests/rfc2047test.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |     SPDX-FileCopyrightText: 2006 Volker Krause <vkrause@kde.org>
 3 | 
 4 |     SPDX-License-Identifier: LGPL-2.0-only
 5 | */
 6 | 
 7 | #ifndef RFC2047TEST_H
 8 | #define RFC2047TEST_H
 9 | 
10 | #include <QObject>
11 | 
12 | class RFC2047Test : public QObject // QtTest fixture for the RFC 2047 helpers; bodies are in rfc2047test.cpp
13 | {
14 |     Q_OBJECT
15 | private Q_SLOTS: // each testX() is paired with a testX_data() that supplies its rows
16 |     void testRFC2047decode_data();
17 |     void testRFC2047decode(); // decodeRFC2047String() on valid input
18 | 
19 |     void testInvalidDecode_data();
20 |     void testInvalidDecode(); // decodeRFC2047String() on malformed input
21 | 
22 |     void testRFC2047encode_data();
23 |     void testRFC2047encode(); // encodeRFC2047String()
24 | };
25 | 
26 | #endif
27 | 


--------------------------------------------------------------------------------
/docs/Doxyfile.local:
--------------------------------------------------------------------------------
1 | ### KApiDox Project-specific Overrides File
2 | 
3 | # define so that deprecated API is not skipped
4 | PREDEFINED += \
5 |     "KCODECS_ENABLE_DEPRECATED_SINCE(x, y)=1" \
6 |     "KCODECS_BUILD_DEPRECATED_SINCE(x, y)=1" \
7 |     "KCODECS_DEPRECATED_VERSION(x, y, t)="
8 | 


--------------------------------------------------------------------------------
/metainfo.yaml:
--------------------------------------------------------------------------------
 1 | maintainer:
 2 | description: Text encoding
 3 | tier: 1
 4 | type: functional
 5 | platforms:
 6 |     - name: Linux
 7 |     - name: FreeBSD
 8 |     - name: Windows
 9 |     - name: macOS
10 |     - name: Android
11 | portingAid: false
12 | deprecated: false
13 | release: true
14 | libraries:
15 |     - cmake: "KF6::Codecs"
16 | cmakename: KF6Codecs
17 | 
18 | public_lib: true
19 | group: Frameworks
20 | subgroup: Tier 1
21 | 


--------------------------------------------------------------------------------
/src/CMakeLists.txt:
--------------------------------------------------------------------------------
  1 | add_library(KF6Codecs) # no STATIC/SHARED keyword here, so the library type follows the build's default
  2 | add_library(KF6::Codecs ALIAS KF6Codecs) # namespaced alias for use inside this build tree
  3 | 
  4 | set_target_properties(KF6Codecs PROPERTIES
  5 |     VERSION     ${KCODECS_VERSION}
  6 |     SOVERSION   ${KCODECS_SOVERSION}
  7 |     EXPORT_NAME Codecs # name used for the exported target instead of KF6Codecs
  8 | )
  9 | 
 10 | ecm_create_qm_loader(KF6Codecs kcodecs6_qt) # same catalog name as the .pot file written by Messages.sh
 11 | 
 12 | target_sources(KF6Codecs PRIVATE # codec, charset and e-mail sources first, then the encoding probers under probers/
 13 |     kcharsets.cpp
 14 |     kcharsets.h
 15 |     kcodecsbase45.cpp
 16 |     kcodecsbase64.cpp
 17 |     kcodecsbase64.h
 18 |     kcodecs.cpp
 19 |     kcodecs.h
 20 |     kcodecs_p.h
 21 |     kcodecsqp.cpp
 22 |     kcodecsqp.h
 23 |     kcodecsuuencode.cpp
 24 |     kcodecsuuencode.h
 25 |     kemailaddress.cpp
 26 |     kemailaddress.h
 27 |     kencodingprober.cpp
 28 |     kencodingprober.h
 29 |     probers/CharDistribution.cpp
 30 |     probers/CharDistribution.h
 31 |     probers/ChineseGroupProber.cpp
 32 |     probers/ChineseGroupProber.h
 33 |     probers/JapaneseGroupProber.cpp
 34 |     probers/JapaneseGroupProber.h
 35 |     probers/JpCntx.cpp
 36 |     probers/JpCntx.h
 37 |     probers/LangBulgarianModel.cpp
 38 |     probers/LangCyrillicModel.cpp
 39 |     probers/LangGreekModel.cpp
 40 |     probers/LangHebrewModel.cpp
 41 |     probers/LangHungarianModel.cpp
 42 |     probers/LangThaiModel.cpp
 43 |     probers/nsBig5Prober.cpp
 44 |     probers/nsBig5Prober.h
 45 |     probers/nsCharSetProber.cpp
 46 |     probers/nsCharSetProber.h
 47 |     probers/nsCodingStateMachine.h
 48 |     probers/nsEscCharsetProber.cpp
 49 |     probers/nsEscCharsetProber.h
 50 |     probers/nsEscSM.cpp
 51 |     probers/nsEUCJPProber.cpp
 52 |     probers/nsEUCJPProber.h
 53 |     probers/nsEUCKRProber.cpp
 54 |     probers/nsEUCKRProber.h
 55 |     probers/nsGB2312Prober.cpp
 56 |     probers/nsGB2312Prober.h
 57 |     probers/nsHebrewProber.cpp
 58 |     probers/nsHebrewProber.h
 59 |     probers/nsLatin1Prober.cpp
 60 |     probers/nsLatin1Prober.h
 61 |     probers/nsMBCSGroupProber.cpp
 62 |     probers/nsMBCSGroupProber.h
 63 |     probers/nsMBCSSM.cpp
 64 |     probers/nsPkgInt.h
 65 |     probers/nsSBCharSetProber.cpp
 66 |     probers/nsSBCharSetProber.h
 67 |     probers/nsSBCSGroupProber.cpp
 68 |     probers/nsSBCSGroupProber.h
 69 |     probers/nsSJISProber.cpp
 70 |     probers/nsSJISProber.h
 71 |     probers/nsUniversalDetector.cpp
 72 |     probers/nsUniversalDetector.h
 73 |     probers/UnicodeGroupProber.cpp
 74 |     probers/UnicodeGroupProber.h
 75 | )
 76 | 
 77 | ecm_qt_declare_logging_category(KF6Codecs # declares the KCODECS_LOG logging category in a generated header
 78 |     HEADER kcodecs_debug.h
 79 |     IDENTIFIER KCODECS_LOG
 80 |     CATEGORY_NAME kf.codecs
 81 |     OLD_CATEGORY_NAMES kf5.kcodecs
 82 |     DESCRIPTION "KCodecs"
 83 |     EXPORT KCODECS # same export name as passed to ecm_qt_install_logging_categories() below
 84 | )
 85 | 
 86 | ecm_generate_export_header(KF6Codecs # BASE_NAME KCodecs; the install rule below ships kcodecs_export.h
 87 |     BASE_NAME KCodecs
 88 |     GROUP_BASE_NAME KF
 89 |     VERSION ${KF_VERSION}
 90 |     USE_VERSION_HEADER
 91 |     DEPRECATED_BASE_VERSION 0
 92 |     DEPRECATION_VERSIONS # no versions are listed after this keyword
 93 |     EXCLUDE_DEPRECATED_BEFORE_AND_AT ${EXCLUDE_DEPRECATED_BEFORE_AND_AT}
 94 | )
 95 | 
 96 | target_include_directories(KF6Codecs INTERFACE "$<INSTALL_INTERFACE:${KDE_INSTALL_INCLUDEDIR_KF}/KCodecs>") # include path for consumers of the installed package
 97 | 
 98 | target_link_libraries(KF6Codecs PUBLIC Qt6::Core) # PUBLIC: consumers link Qt6::Core as well
 99 | 
100 | ecm_generate_headers(KCodecs_HEADERS # resulting header list is installed by the install(FILES ...) call below
101 |   HEADER_NAMES
102 |   KCharsets
103 |   KCodecs
104 |   KEncodingProber
105 |   KEmailAddress
106 |   REQUIRED_HEADERS KCodecs_HEADERS
107 | )
108 | 
109 | install(TARGETS KF6Codecs EXPORT KF6CodecsTargets ${KF_INSTALL_TARGETS_DEFAULT_ARGS}) # installs the library and records it in the KF6CodecsTargets export set
110 | 
111 | install(FILES
112 |   ${CMAKE_CURRENT_BINARY_DIR}/kcodecs_export.h
113 |   ${KCodecs_HEADERS}
114 |   DESTINATION ${KDE_INSTALL_INCLUDEDIR_KF}/KCodecs COMPONENT Devel # same directory as the INSTALL_INTERFACE include path above
115 | )
116 | 
117 | ecm_qt_install_logging_categories(
118 |     EXPORT KCODECS # the export name given to ecm_qt_declare_logging_category() above
119 |     FILE kcodecs.categories
120 |     DESTINATION ${KDE_INSTALL_LOGGINGCATEGORIESDIR}
121 | )
122 | 
123 | ecm_generate_qdoc(KF6Codecs kcodecs.qdocconf) # API documentation is driven by src/kcodecs.qdocconf
124 | 


--------------------------------------------------------------------------------
/src/Messages.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | 
3 | # Extract strings from all source files.
4 | # EXTRACT_TR_STRINGS extracts strings with lupdate and converts them to .pot with
5 | # lconvert.
6 | $EXTRACT_TR_STRINGS `find . -name \*.cpp -o -name \*.h -o -name \*.ui -o -name \*.qml` -o $podir/kcodecs6_qt.pot
7 | 


--------------------------------------------------------------------------------
/src/kcharsets.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |     This file is part of the KDE libraries
  3 |     SPDX-FileCopyrightText: 1999 Lars Knoll <knoll@kde.org>
  4 | 
  5 |     SPDX-License-Identifier: LGPL-2.0-or-later
  6 | */
  7 | #ifndef KCHARSETS_H
  8 | #define KCHARSETS_H
  9 | 
 10 | #include <kcodecs_export.h>
 11 | 
 12 | #include <QCoreApplication>
 13 | #include <QList>
 14 | #include <QStringList>
 15 | #include <memory>
 16 | 
 17 | #include "kcodecs.h"
 18 | 
 19 | class KCharsetsPrivate;
 20 | 
 21 | class QChar;
 22 | class QString;
 23 | 
 24 | /*!
 25 |  * \class KCharsets
 26 |  * \inmodule KCodecs
 27 |  *
 28 |  * \brief Charset font and encoder/decoder handling.
 29 |  *
 30 |  * This is needed, because Qt's encoding name matching in
 31 |  * QTextCodec::codecForName() matches only closely-related encoded names
 32 |  * but not alternate names, e.g. found in the reality of the Internet.
 33 |  */
 34 | class KCODECS_EXPORT KCharsets final
 35 | {
 36 |     Q_DECLARE_TR_FUNCTIONS(KCharsets)
 37 | 
 38 | protected:
 39 |     /*
 40 |      * Protected constructor. If you need the kcharsets object, use
 41 |      * KCharsets::charsets() instead.
 42 |      */
 43 |     KCharsets();
 44 | 
 45 | public:
 46 |     ~KCharsets();
 47 | 
 48 |     /*!
 49 |      * The global charset manager.
 50 |      */
 51 |     static KCharsets *charsets();
 52 | 
 53 |     /*!
 54 |      * Converts an entity to a character.
 55 |      *
 56 |      * The string must contain only the
 57 |      * entity without the trailing ';'.
 58 |      *
 59 |      * \a str the entity
 60 |      *
 61 |      * Returns QChar::Null if the entity could not be decoded.
 62 |      */
 63 |     static QChar fromEntity(QStringView str);
 64 | 
 65 |     /*!
 66 |      * Tries to find an entity in the
 67 |      * QString str.
 68 |      *
 69 |      * \a str the string containing the entity
 70 |      *
 71 |      * \a len is a return value, that gives the length of the decoded
 72 |      * entity.
 73 |      *
 74 |      * Returns a decoded entity if one could be found, QChar::Null
 75 |      * otherwise
 76 |      *
 77 |      * \overload fromEntity(QStringView)
 78 |      */
 79 |     static QChar fromEntity(QStringView str, int &len);
 80 | 
 81 |     /*!
 82 |      * Converts a QChar to an entity. The returned string does already
 83 |      * contain the leading '&' and the trailing ';'.
 84 |      *
 85 |      * \a ch the char to convert
 86 |      *
 87 |      * Returns the entity
 88 |      */
 89 |     static QString toEntity(const QChar &ch);
 90 | 
 91 |     /*!
 92 |      * Scans the given string for entities (like &amp;amp;) and resolves them
 93 |      * using fromEntity.
 94 |      *
 95 |      * \a text the string containing the entities
 96 |      *
 97 |      * Returns the clean string
 98 |      */
 99 |     static QString resolveEntities(const QString &text);
100 | 
101 |     /*!
102 |      * Lists all available encodings as names
103 |      */
104 |     QStringList availableEncodingNames() const;
105 | 
106 |     /*!
107 |      * Lists the available encoding names together with a more descriptive language
108 |      */
109 |     QStringList descriptiveEncodingNames() const;
110 | 
111 |     /*!
112 |      * Lists the available encoding names grouped by script (or language that uses them).
113 |      *
114 |      * Returns the list of lists consisting of description followed by encoding names (i.e. encodingsByScript().at(i).at(0) is a description for
115 |      * encodingsByScript().at(i).at(k), k>0)
116 |      */
117 |     QList<QStringList> encodingsByScript() const;
118 | 
119 |     /*!
120 |      * Returns a long description for an encoding name.
121 |      *
122 |      * \a encoding the encoding for the language
123 |      *
124 |      */
125 |     QString descriptionForEncoding(QStringView encoding) const;
126 | 
127 |     /*!
128 |      * Returns the encoding for a string obtained with descriptiveEncodingNames().
129 |      *
130 |      * \a descriptiveName the descriptive name for the encoding
131 |      */
132 |     QString encodingForName(const QString &descriptiveName) const;
133 | 
134 | private:
135 |     std::unique_ptr<KCharsetsPrivate> const d; // private data; KCharsetsPrivate is defined in kcharsets_p.h
136 |     friend struct KCharsetsSingletonPrivate; // NOTE(review): presumably the holder that calls the protected constructor - confirm in kcharsets.cpp
137 | };
138 | 
139 | #endif
140 | 


--------------------------------------------------------------------------------
/src/kcharsets_p.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |     This file is part of the KDE libraries
 3 | 
 4 |     SPDX-FileCopyrightText: 1999 Lars Knoll <knoll@kde.org>
 5 |     SPDX-FileCopyrightText: 2001, 2003, 2004, 2005, 2006 Nicolas GOUTTE <goutte@kde.org>
 6 |     SPDX-FileCopyrightText: 2007 Nick Shaforostoff <shafff@ukr.net>
 7 | 
 8 |     SPDX-License-Identifier: LGPL-2.0-or-later
 9 | */
10 | 
11 | #ifndef KCHARSETS_P_H
12 | #define KCHARSETS_P_H
13 | 
14 | #include <QStringList>
15 | 
16 | class KCharsetsPrivate // private data of KCharsets, which holds it through a std::unique_ptr member named d
17 | {
18 | public:
19 |     // Cache list so QStrings can be implicitly shared; presumably what KCharsets::encodingsByScript() fills and returns — TODO confirm
20 |     QList<QStringList> encodingsByScript;
21 | };
22 | 
23 | #endif
24 | 


--------------------------------------------------------------------------------
/src/kcodecs-index.qdoc:
--------------------------------------------------------------------------------
 1 | /*!
 2 |     \page kcodecs-index.html
 3 |     \title KCodecs
 4 | 
 5 |     KCodecs provides a collection of methods to manipulate strings using various
 6 |     encodings.
 7 | 
 8 |     It can automatically determine the charset of a string, translate XML entities,
 9 |     validate email addresses, and find encodings by name in a more tolerant way than QTextCodec
10 |     (useful e.g. for data coming from the Internet).
11 | 
12 |     \section1 Using the Module
13 | 
14 |     \include {module-use.qdocinc} {using the c++ api}
15 | 
16 |     \section2 Building with CMake
17 | 
18 |     \include {module-use.qdocinc} {building with cmake} {KF6} {Codecs} {KF6::Codecs}
19 | 
20 |     \section1 API Reference
21 | 
22 |     \list
23 |         \li \l{KCodecs C++ Classes}
24 |     \endlist
25 | */
26 | 


--------------------------------------------------------------------------------
/src/kcodecs.qdoc:
--------------------------------------------------------------------------------
 1 | /*!
 2 |     \module KCodecs
 3 |     \title KCodecs C++ Classes
 4 |     \ingroup modules
 5 |     \cmakepackage KF6
 6 |     \cmakecomponent Codecs
 7 | 
 8 |     \brief Text encoding.
 9 | */
10 | 


--------------------------------------------------------------------------------
/src/kcodecs.qdocconf:
--------------------------------------------------------------------------------
 1 | include($KDE_DOCS/global/qt-module-defaults.qdocconf)
 2 | 
 3 | project = KCodecs
 4 | description = Text encoding
 5 | 
 6 | documentationinheaders = true
 7 | 
 8 | headerdirs += .
 9 | sourcedirs += .
10 | 
11 | outputformats = HTML
12 | 
13 | depends += \
14 |     qtcore \
15 |     qtcore5compat
16 | 
17 | navigation.landingpage = "KCodecs"
18 | 
19 | qhp.projects            = KCodecs
20 | 
21 | qhp.KCodecs.file                = kcodecs.qhp
22 | qhp.KCodecs.namespace           = org.kde.kcodecs.$QT_VERSION_TAG
23 | qhp.KCodecs.virtualFolder       = kcodecs
24 | qhp.KCodecs.indexTitle          = KCodecs
25 | qhp.KCodecs.indexRoot           =
26 | 
27 | qhp.KCodecs.subprojects         = classes
28 | qhp.KCodecs.subprojects.classes.title = C++ Classes
29 | qhp.KCodecs.subprojects.classes.indexTitle = KCodecs C++ Classes
30 | qhp.KCodecs.subprojects.classes.selectors = class fake:headerfile
31 | qhp.KCodecs.subprojects.classes.sortPages = true
32 | 
33 | tagfile = kcodecs.tags
34 | 


--------------------------------------------------------------------------------
/src/kcodecs_p.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |     SPDX-FileCopyrightText: 2014 Daniel Vrátil <dvratil@redhat.com>
 3 | 
 4 |     SPDX-License-Identifier: LGPL-2.0-only
 5 | */
 6 | 
 7 | #ifndef KCODECS_P_H
 8 | #define KCODECS_P_H
 9 | 
10 | #include "kcodecs.h"
11 | 
12 | namespace KCodecs
13 | {
14 | class EncoderPrivate // private state for Encoder (presumably its d-pointer data — TODO confirm in kcodecs.h)
15 | {
16 | public:
17 |     explicit EncoderPrivate(Codec::NewlineType newline); // defined out of line; presumably initialises the const member newline — TODO confirm
18 | 
19 |     /**
20 |       An output buffer to simplify some codecs.
21 |       Used with write() and flushOutputBuffer().
22 |     */
23 |     char outputBuffer[Encoder::maxBufferedChars];
24 | 
25 |     uchar outputBufferCursor; // presumably the fill level of outputBuffer — TODO confirm against write()/flushOutputBuffer()
26 |     const Codec::NewlineType newline; // newline convention; const, so it cannot change after construction
27 | };
28 | 
29 | class DecoderPrivate // private state for Decoder (presumably its d-pointer data — TODO confirm in kcodecs.h)
30 | {
31 | public:
32 |     explicit DecoderPrivate(Codec::NewlineType newline); // defined out of line; presumably initialises the const member newline — TODO confirm
33 | 
34 |     const Codec::NewlineType newline; // newline convention; const, so it cannot change after construction
35 | };
36 | 
37 | }
38 | 
39 | #endif // KCODECS_P_H
40 | 


--------------------------------------------------------------------------------
/src/kcodecsbase45.cpp:
--------------------------------------------------------------------------------
 1 | 
 2 | /*
 3 |     SPDX-FileCopyrightText: 2021 Volker Krause <vkrause@kde.org>
 4 | 
 5 |     SPDX-License-Identifier: LGPL-2.0-or-later
 6 | */
 7 | 
 8 | #include "kcodecs.h"
 9 | #include "kcodecs_debug.h"
10 | 
11 | #include <QDebug>
12 | 
13 | static constexpr const char base45Table[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ $%*+-./:";
14 | static constexpr const char *base45TableEnd = std::end(base45Table) - 1; // end of the 45-character alphabet: excludes the literal's terminating NUL
15 | static uint8_t base45MapFromChar(char c) // value (0..44) of a base45 character; logs a warning and returns 0 for anything else
16 | {
17 |     const auto it = std::find(std::begin(base45Table), base45TableEnd, c); // searching up to std::end() would let '\0' match the terminator and map to 45
18 |     if (it == base45TableEnd) {
19 |         qCWarning(KCODECS_LOG) << "invalid base45 character:" << c;
20 |         return 0;
21 |     }
22 |     return std::distance(std::begin(base45Table), it);
23 | }
24 | 
25 | QByteArray KCodecs::base45Decode(QByteArrayView in)
26 | {
27 |     // Three input characters yield two octets; a trailing pair yields one
28 |     // (two if its value exceeds 255); a single trailing character is ignored.
29 |     const qsizetype length = in.size();
30 |     QByteArray out;
31 |     out.reserve(((length / 3) + 1) * 2);
32 |     for (qsizetype pos = 0; pos + 1 < length; pos += 3) {
33 |         const bool fullGroup = pos + 2 < length;
34 |         uint32_t value = base45MapFromChar(in[pos]) + base45MapFromChar(in[pos + 1]) * 45;
35 |         if (fullGroup) {
36 |             value += 45 * 45 * base45MapFromChar(in[pos + 2]);
37 |         }
38 |         // a full group always emits its high octet, a trailing pair only a non-zero one
39 |         if (fullGroup || (value >> 8)) {
40 |             out.push_back(value >> 8);
41 |         }
42 |         out.push_back(value % 256);
43 |     }
44 |     return out;
45 | }
45 | 


--------------------------------------------------------------------------------
/src/kcodecsbase64.h:
--------------------------------------------------------------------------------
  1 | /*  -*- c++ -*-
  2 |     SPDX-FileCopyrightText: 2001-2002 Marc Mutz <mutz@kde.org>
  3 | 
  4 |     SPDX-License-Identifier: LGPL-2.0-or-later
  5 | */
  6 | /*
  7 |   @glossary @anchor Base64 @anchor base64 @b base64:
  8 |   a binary to text encoding scheme based on @ref RFC1421.
  9 | 
 10 |   @glossary @anchor RFC1421 @anchor rfc1421 @b RFC @b 1421:
 11 |   RFC that defines the <a href="http://tools.ietf.org/html/rfc1421">
 12 |   Privacy Enhancement for Internet Electronic Mail:  Part I:
 13 |   Message Encryption and Authentication Procedures</a>.
 14 | 
 15 |   @glossary @anchor RFC2045 @anchor rfc2045 @b RFC @b 2045:
 16 |   RFC that defines the <a href="http://tools.ietf.org/html/rfc2045">
 17 |   MIME Part One: Format of Internet Message Bodies</a>.
 18 | 
 19 |   @glossary @anchor RFC2047 @anchor rfc2047 @b RFC @b 2047:
 20 |   RFC that defines the <a href="http://tools.ietf.org/html/rfc2047">
 21 |   MIME Part Three: Message Header Extensions for Non-ASCII Text</a>.
 22 | 
 23 |   @glossary @anchor RFC2047B @anchor rfc2047b @b RFC @b 2047B:
 24 |   Section 4.1 of @ref RFC2047.
 25 | */
 26 | 
 27 | #ifndef KCODECS_BASE64_H
 28 | #define KCODECS_BASE64_H
 29 | 
 30 | #include "kcodecs.h"
 31 | 
 32 | namespace KCodecs
 33 | {
 34 | class Base64Codec : public Codec // base64 codec; its size estimates account for line breaks
 35 | {
 36 | public:
 37 |     Base64Codec()
 38 |         : Codec()
 39 |     {
 40 |     }
 41 | 
 42 |     ~Base64Codec() override
 43 |     {
 44 |     }
 45 |     // Returns the identifier of this codec.
 46 |     const char *name() const override
 47 |     {
 48 |         return "base64";
 49 |     }
 50 |     // Upper bound for the encoded size of insize octets, line breaks included; newline defaults to LF like the sibling overrides.
 51 |     qsizetype maxEncodedSizeFor(qsizetype insize, NewlineType newline = Codec::NewlineLF) const override
 52 |     {
 53 |         // first, the total number of 4-char packets will be:
 54 |         qsizetype totalNumPackets = (insize + 2) / 3;
 55 |         // now, after every 76/4'th packet there needs to be a linebreak:
 56 |         qsizetype numLineBreaks = totalNumPackets / (76 / 4);
 57 |         // and at the very end, too:
 58 |         ++numLineBreaks;
 59 |         // putting it all together, we have:
 60 |         return 4 * totalNumPackets + (newline == Codec::NewlineCRLF ? 2 : 1) * numLineBreaks;
 61 |     }
 62 |     // Upper bound for the decoded size of insize encoded characters.
 63 |     qsizetype maxDecodedSizeFor(qsizetype insize, NewlineType newline = Codec::NewlineLF) const override
 64 |     {
 65 |         // assuming all characters are part of the base64 stream (which
 66 |         // almost never holds due to required linebreaking; but
 67 |         // additional non-base64 chars don't affect the output size), each
 68 |         // 4-tuple of them becomes a 3-tuple in the decoded octet
 69 |         // stream. So:
 70 |         qsizetype result = ((insize + 3) / 4) * 3;
 71 |         // but all of them may be \n, so
 72 |         if (newline == Codec::NewlineCRLF) {
 73 |             result *= 2; // :-o
 74 |         }
 75 | 
 76 |         return result;
 77 |     }
 78 |     // Factories for the matching encoder and decoder; both are defined out of line.
 79 |     Encoder *makeEncoder(NewlineType newline = Codec::NewlineLF) const override;
 80 | 
 81 |     Decoder *makeDecoder(NewlineType newline = Codec::NewlineLF) const override;
 82 | };
 83 | 
 84 | class Rfc2047BEncodingCodec : public Base64Codec
 85 | {
 86 | public:
 87 |     // The base class is constructed implicitly; there is no state of our own.
 88 |     Rfc2047BEncodingCodec()
 89 |     {
 90 |     }
 91 | 
 92 |     ~Rfc2047BEncodingCodec() override = default;
 93 | 
 94 |     // Identifier of this codec.
 95 |     const char *name() const override
 96 |     {
 97 |         return "b";
 98 |     }
 99 | 
100 |     // Every started 3-octet triple is written as one 4-char quartet.
101 |     qsizetype maxEncodedSizeFor(qsizetype insize, NewlineType newline = Codec::NewlineLF) const override
102 |     {
103 |         Q_UNUSED(newline);
104 |         const qsizetype quartets = (insize + 2) / 3;
105 |         return quartets * 4;
106 |     }
107 | 
108 |     // Every started 4-char quartet yields a 3-octet triple, the last one possibly less.
109 |     qsizetype maxDecodedSizeFor(qsizetype insize, NewlineType newline = Codec::NewlineLF) const override
110 |     {
111 |         Q_UNUSED(newline);
112 |         const qsizetype triples = (insize + 3) / 4;
113 |         return triples * 3;
114 |     }
115 | 
116 |     Encoder *makeEncoder(NewlineType newline = Codec::NewlineLF) const override;
117 | };
118 | 
119 | } // namespace KCodecs
120 | 
121 | #endif // KCODECS_BASE64_H
122 | 


--------------------------------------------------------------------------------
/src/kcodecsqp.h:
--------------------------------------------------------------------------------
  1 | /*  -*- c++ -*-
  2 |     SPDX-FileCopyrightText: 2001-2002 Marc Mutz <mutz@kde.org>
  3 | 
  4 |     SPDX-License-Identifier: LGPL-2.0-or-later
  5 | */
  6 | 
  7 | #ifndef KCODECS_QP_H
  8 | #define KCODECS_QP_H
  9 | 
 10 | #include "kcodecs.h"
 11 | 
 12 | namespace KCodecs
 13 | {
 14 | /*
 15 |   A class representing the codec for QuotedPrintable as specified in
 16 |    RFC2045 (section 6.7).
 17 | */
 18 | class QuotedPrintableCodec : public Codec
 19 | {
 20 | public:
 21 |     // The base class is constructed implicitly;
 22 |     // there is no state of our own to set up.
 23 |     QuotedPrintableCodec()
 24 |     {
 25 |     }
 26 | 
 27 |     ~QuotedPrintableCodec() override = default;
 28 | 
 29 |     // Identifier of this codec.
 30 |     const char *name() const override
 31 |     {
 32 |         return "quoted-printable";
 33 |     }
 34 | 
 35 |     // Worst case: every input char turns into a 3-char hex escape, and a
 36 |     // soft line break, =(\r)\n, is added for each run of 25 escapes.
 37 |     qsizetype maxEncodedSizeFor(qsizetype insize, NewlineType newline = Codec::NewlineLF) const override
 38 |     {
 39 |         const qsizetype softBreakSize = (newline == Codec::NewlineCRLF) ? 3 : 2;
 40 |         const qsizetype softBreakCount = insize / 25;
 41 |         return 3 * insize + softBreakSize * softBreakCount;
 42 |     }
 43 | 
 44 |     // The remaining members are defined out of line.
 45 |     qsizetype maxDecodedSizeFor(qsizetype insize, NewlineType newline = Codec::NewlineLF) const override;
 46 | 
 47 |     Encoder *makeEncoder(NewlineType newline = Codec::NewlineLF) const override;
 48 | 
 49 |     Decoder *makeDecoder(NewlineType newline = Codec::NewlineLF) const override;
 50 | };
 51 | 
 52 | /*
 53 |   A class representing the codec for the Q encoding as specified
 54 |   in RFC2047Q.
 55 | */
 56 | class Rfc2047QEncodingCodec : public Codec
 57 | {
 58 | public:
 59 |     // The base class is constructed implicitly; there is no state of our own.
 60 |     Rfc2047QEncodingCodec()
 61 |     {
 62 |     }
 63 | 
 64 |     ~Rfc2047QEncodingCodec() override = default;
 65 | 
 66 |     // Identifier of this codec.
 67 |     const char *name() const override
 68 |     {
 69 |         return "q";
 70 |     }
 71 | 
 72 |     // No line breaking is done, so the worst case is simply that every
 73 |     // single input char needs encoding.
 74 |     qsizetype maxEncodedSizeFor(qsizetype insize, Codec::NewlineType newline = Codec::NewlineLF) const override
 75 |     {
 76 |         Q_UNUSED(newline);
 77 |         const qsizetype worstCase = 3 * insize;
 78 |         return worstCase;
 79 |     }
 80 | 
 81 |     qsizetype maxDecodedSizeFor(qsizetype insize, Codec::NewlineType newline = Codec::NewlineLF) const override;
 82 | 
 83 |     Encoder *makeEncoder(Codec::NewlineType newline = Codec::NewlineLF) const override;
 84 | 
 85 |     Decoder *makeDecoder(Codec::NewlineType newline = Codec::NewlineLF) const override;
 86 | };
 87 | 
 88 | /*
 89 |   A class representing the codec for RFC2231.
 90 | */
 91 | class Rfc2231EncodingCodec : public Codec
 92 | {
 93 | public:
 94 |     // The base class is constructed implicitly; there is no state of our own.
 95 |     Rfc2231EncodingCodec()
 96 |     {
 97 |     }
 98 | 
 99 |     ~Rfc2231EncodingCodec() override = default;
100 | 
101 |     // Identifier of this codec.
102 |     const char *name() const override
103 |     {
104 |         return "x-kmime-rfc2231";
105 |     }
106 | 
107 |     // Same bound as for the "q" encoding: three chars per input char.
108 |     qsizetype maxEncodedSizeFor(qsizetype insize, Codec::NewlineType newline = Codec::NewlineLF) const override
109 |     {
110 |         Q_UNUSED(newline);
111 |         const qsizetype worstCase = 3 * insize;
112 |         return worstCase;
113 |     }
114 | 
115 |     qsizetype maxDecodedSizeFor(qsizetype insize, Codec::NewlineType newline = Codec::NewlineLF) const override;
116 | 
117 |     Encoder *makeEncoder(Codec::NewlineType newline = Codec::NewlineLF) const override;
118 | 
119 |     Decoder *makeDecoder(Codec::NewlineType newline = Codec::NewlineLF) const override;
120 | };
121 | 
122 | } // namespace KCodecs
123 | 
124 | #endif // KCODECS_QP_H
125 | 


--------------------------------------------------------------------------------
/src/kcodecsuuencode.cpp:
--------------------------------------------------------------------------------
  1 | /*  -*- c++ -*-
  2 |     SPDX-FileCopyrightText: 2002 Marc Mutz <mutz@kde.org>
  3 | 
  4 |     SPDX-License-Identifier: LGPL-2.0-or-later
  5 | */
  6 | 
  7 | #include "kcodecsuuencode.h"
  8 | 
  9 | #include <QDebug>
 10 | 
 11 | #include <cassert>
 12 | 
 13 | using namespace KCodecs;
 14 | 
 15 | namespace KCodecs
 16 | {
 17 | class UUDecoder : public Decoder // resumable decoder for uuencoded data; its constructor is protected and UUCodec is a friend
 18 | {
 19 |     uint mStepNo; // position (0..3) within the current group of four encoded chars
 20 |     uchar mAnnouncedOctetCount; // octet count decoded from the line's first char (on current line)
 21 |     uchar mCurrentOctetCount; // octets counted so far (on current line)
 22 |     uchar mOutbits; // bits left over from the previous encoded char, already shifted into place
 23 |     bool mLastWasCRLF : 1; // true when the next char is the first one of a line
 24 |     bool mSawBegin : 1; // whether we already saw ^begin...
 25 |     uint mIntoBeginLine : 3; // count #chars we compared against "begin" 0..5
 26 |     bool mSawEnd : 1; // whether we already saw ^end...
 27 |     uint mIntoEndLine : 2; // count #chars we compared against "end" 0..3
 28 | 
 29 |     void searchForBegin(const char *&scursor, const char *const send); // consumes input until the begin line has been passed
 30 | 
 31 | protected:
 32 |     friend class UUCodec;
 33 |     UUDecoder(Codec::NewlineType newline = Codec::NewlineLF)
 34 |         : Decoder(newline)
 35 |         , mStepNo(0)
 36 |         , mAnnouncedOctetCount(0)
 37 |         , mCurrentOctetCount(0)
 38 |         , mOutbits(0)
 39 |         , mLastWasCRLF(true)
 40 |         , mSawBegin(false)
 41 |         , mIntoBeginLine(0)
 42 |         , mSawEnd(false)
 43 |         , mIntoEndLine(0)
 44 |     {
 45 |     }
 46 | 
 47 | public:
 48 |     ~UUDecoder() override
 49 |     {
 50 |     }
 51 | 
 52 |     bool decode(const char *&scursor, const char *const send, char *&dcursor, const char *const dend) override;
 53 |     // ### really needs no finishing??? (as written: a no-op that reports success without touching dcursor)
 54 |     bool finish(char *&dcursor, const char *const dend) override
 55 |     {
 56 |         Q_UNUSED(dcursor);
 57 |         Q_UNUSED(dend);
 58 |         return true;
 59 |     }
 60 | };
 61 | 
 62 | Encoder *UUCodec::makeEncoder(NewlineType newline) const // always returns nullptr, so callers have to check the result
 63 | {
 64 |     Q_UNUSED(newline)
 65 |     return nullptr; // encoding not supported
 66 | }
 67 | 
 68 | Decoder *UUCodec::makeDecoder(NewlineType newline) const // returns a heap-allocated UUDecoder; presumably owned by the caller — TODO confirm
 69 | {
 70 |     return new UUDecoder(newline);
 71 | }
 72 | 
 73 | /********************************************************/
 74 | /********************************************************/
 75 | /********************************************************/
 76 | 
 77 | void UUDecoder::searchForBegin(const char *&scursor, const char *const send) // advances scursor past the "begin...\n" line, or up to send if that line is not complete yet
 78 | {
 79 |     static const char begin[] = "begin\n";
 80 |     static const uint beginLength = 5; // sic! begin[5] is the '\n' ending the line; it is only looked for once "begin" itself matched
 81 | 
 82 |     assert(!mSawBegin || mIntoBeginLine > 0);
 83 | 
 84 |     while (scursor != send) {
 85 |         uchar ch = *scursor++;
 86 |         if (ch == begin[mIntoBeginLine]) {
 87 |             if (mIntoBeginLine < beginLength) {
 88 |                 // found another char
 89 |                 ++mIntoBeginLine;
 90 |                 if (mIntoBeginLine == beginLength) {
 91 |                     mSawBegin = true; // "begin" complete, now search the next \n...
 92 |                 }
 93 |             } else { // mIntoBeginLine == beginLength
 94 |                 // found '\n': begin line complete
 95 |                 mLastWasCRLF = true;
 96 |                 mIntoBeginLine = 0;
 97 |                 return;
 98 |             }
 99 |         } else if (mSawBegin) {
100 |             // OK, skip stuff until the next \n
101 |         } else {
102 |             // qWarning() << "UUDecoder: garbage before \"begin\", resetting parser";
103 |             mIntoBeginLine = 0; // restart the match; NOTE(review): ch itself is not re-tested against 'b'
104 |         }
105 |     }
106 | }
107 | 
108 | // uuencoding adds 32 (SP/' ') to every 6-bit value, except that NUL
109 | // is written as 0x60; masking with 0x3F maps that case back to 0.
110 | static inline uchar uuDecode(uchar c)
111 | {
112 |     const int unshifted = c - ' '; // undo shift
113 |     return unshifted & 0x3F; // 0x40 (0x60-' ') becomes 0
114 | }
115 | 
116 | bool UUDecoder::decode(const char *&scursor, const char *const send, char *&dcursor, const char *const dend) // reads [scursor, send), writes [dcursor, dend), advancing both cursors; parser state survives between calls
117 | {
118 |     // First, check whether we still need to find the "begin" line:
119 |     if (!mSawBegin || mIntoBeginLine != 0) {
120 |         searchForBegin(scursor, send); // leaves scursor == send when the begin line is not complete yet
121 |     } else if (mSawEnd) {
122 |         // or if we are past the end line:
123 |         scursor = send; // do nothing anymore...
124 |         return true;
125 |     }
126 | 
127 |     while (dcursor != dend && scursor != send) { // each iteration writes at most one octet, so this check covers every write below
128 |         uchar ch = *scursor++;
129 |         uchar value;
130 | 
131 |         // Check whether we need to look for the "end" line:
132 |         if (mIntoEndLine > 0) {
133 |             static const char end[] = "end";
134 |             static const uint endLength = 3;
135 | 
136 |             if (ch == end[mIntoEndLine]) {
137 |                 ++mIntoEndLine;
138 |                 if (mIntoEndLine == endLength) {
139 |                     mSawEnd = true;
140 |                     scursor = send; // shortcut to the end
141 |                     return true;
142 |                 }
143 |                 continue; // matched so far: compare the next char
144 |             } else {
145 |                 // qWarning() << "UUDecoder: invalid line octet count looks like \"end\" (mIntoEndLine ="
146 |                 //           << mIntoEndLine << ")!";
147 |                 mIntoEndLine = 0;
148 |                 // fall through...
149 |             }
150 |         }
151 | 
152 |         // Normal parsing:
153 | 
154 |         // The first char of a line is an encoding of the length of the
155 |         // current line. It is recorded, but contributes no payload bits:
156 |         if (mLastWasCRLF) {
157 |             // reset char-per-line counter:
158 |             mLastWasCRLF = false;
159 |             mCurrentOctetCount = 0;
160 | 
161 |             // try to decode the chars-on-this-line announcement:
162 |             if (ch == 'e') { // maybe the beginning of the "end"? ;-)
163 |                 mIntoEndLine = 1;
164 |             } else if (ch > 0x60) {
165 |                 // ### invalid line length char: what shall we do??
166 |             } else if (ch > ' ') {
167 |                 mAnnouncedOctetCount = uuDecode(ch);
168 |             } else if (ch == '\n') {
169 |                 mLastWasCRLF = true; // oops, empty line
170 |             }
171 | 
172 |             continue;
173 |         }
174 | 
175 |         // try converting ch to a 6-bit value:
176 |         if (ch > 0x60) {
177 |             continue; // invalid char
178 |         } else if (ch > ' ') {
179 |             value = uuDecode(ch);
180 |         } else if (ch == '\n') { // line end
181 |             mLastWasCRLF = true;
182 |             continue;
183 |         } else {
184 |             continue; // space or other control char: skipped
185 |         }
186 | 
187 |         // add the new bits to the output stream and flush full octets:
188 |         switch (mStepNo) {
189 |         case 0:
190 |             mOutbits = value << 2; // upper 6 bits of the first octet
191 |             break;
192 |         case 1:
193 |             if (mCurrentOctetCount < mAnnouncedOctetCount) {
194 |                 *dcursor++ = (char)(mOutbits | value >> 4);
195 |             }
196 |             ++mCurrentOctetCount; // counted even when the octet lies beyond the announced count and is dropped
197 |             mOutbits = value << 4;
198 |             break;
199 |         case 2:
200 |             if (mCurrentOctetCount < mAnnouncedOctetCount) {
201 |                 *dcursor++ = (char)(mOutbits | value >> 2);
202 |             }
203 |             ++mCurrentOctetCount;
204 |             mOutbits = value << 6;
205 |             break;
206 |         case 3:
207 |             if (mCurrentOctetCount < mAnnouncedOctetCount) {
208 |                 *dcursor++ = (char)(mOutbits | value);
209 |             }
210 |             ++mCurrentOctetCount;
211 |             mOutbits = 0;
212 |             break;
213 |         default:
214 |             assert(0);
215 |         }
216 |         mStepNo = (mStepNo + 1) % 4;
217 | 
218 |         // check whether we ran over the announced octet count for this line:
219 |         if (mCurrentOctetCount == mAnnouncedOctetCount + 1) { // body intentionally empty: the diagnostic is disabled
220 |             // qWarning()
221 |             //         << "UUDecoder: mismatch between announced ("
222 |             //         << mAnnouncedOctetCount << ") and actual line octet count!";
223 |         }
224 |     }
225 | 
226 |     // return false when caller should call us again (the output buffer filled up before all input was consumed):
227 |     return scursor == send;
228 | } // UUDecoder::decode()
229 | 
230 | } // namespace KCodecs
231 | 


--------------------------------------------------------------------------------
/src/kcodecsuuencode.h:
--------------------------------------------------------------------------------
 1 | /*  -*- c++ -*-
 2 |     SPDX-FileCopyrightText: 2002 Marc Mutz <mutz@kde.org>
 3 | 
 4 |     SPDX-License-Identifier: LGPL-2.0-or-later
 5 | */
 6 | 
 7 | #ifndef KCODECS_UUENCODE_H
 8 | #define KCODECS_UUENCODE_H
 9 | 
10 | #include "kcodecs.h"
11 | 
12 | namespace KCodecs
13 | {
14 | /*
15 |   A class representing the UUEncode codec.
16 | */
17 | class UUCodec : public Codec
18 | {
19 | public:
20 |     // The base class is constructed implicitly; there is no state of our own.
21 |     UUCodec()
22 |     {
23 |     }
24 | 
25 |     ~UUCodec() override = default;
26 | 
27 |     // Identifier of this codec.
28 |     const char *name() const override
29 |     {
30 |         return "x-uuencode";
31 |     }
32 | 
33 |     // There is no encoder, so the input size is simply handed back.
34 |     qsizetype maxEncodedSizeFor(qsizetype insize, NewlineType newline = Codec::NewlineLF) const override
35 |     {
36 |         Q_UNUSED(newline);
37 |         return insize; // we have no encoder!
38 |     }
39 | 
40 |     // Treats every input char as part of the uuencode stream (hardly ever
41 |     // true because of the required line breaks, but chars outside the
42 |     // stream do not add to the output): each started group of 4 chars
43 |     // decodes to 3 octets.
44 |     // For CRLF the estimate is doubled, since all of the octets may be \n.
45 |     qsizetype maxDecodedSizeFor(qsizetype insize, NewlineType newline = Codec::NewlineLF) const override
46 |     {
47 |         const qsizetype triples = (insize + 3) / 4;
48 |         const qsizetype newlineFactor = (newline == Codec::NewlineCRLF) ? 2 : 1; // :-o
49 |         return triples * 3 * newlineFactor;
50 |     }
51 | 
52 |     // Factories, both defined out of line. Per the comment in
53 |     // maxEncodedSizeFor() above, there is no actual encoder.
54 |     Encoder *makeEncoder(NewlineType newline = Codec::NewlineLF) const override;
55 | 
56 |     Decoder *makeDecoder(NewlineType newline = Codec::NewlineLF) const override;
57 | };
59 | 
60 | } // namespace KCodecs
61 | 
62 | #endif // KCODECS_UUENCODE_H
63 | 


--------------------------------------------------------------------------------
/src/kencodingprober.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |     This file is part of the KDE libraries
  3 | 
  4 |     SPDX-FileCopyrightText: 2008 Wang Hoi <zealot.hoi@gmail.com>
  5 | 
  6 |     SPDX-License-Identifier: LGPL-2.0-or-later
  7 | */
  8 | #ifndef KENCODINGPROBER_H
  9 | #define KENCODINGPROBER_H
 10 | 
 11 | // enable debug of private probers
 12 | // #define DEBUG_PROBE
 13 | 
 14 | #include <kcodecs_export.h>
 15 | 
 16 | #ifdef DEBUG_PROBE
 17 | #include <QDebug>
 18 | #endif
 19 | 
 20 | #include <QCoreApplication>
 21 | #include <QString>
 22 | #include <memory>
 23 | 
 24 | class KEncodingProberPrivate;
 25 | 
 26 | /*!
 27 |  * \class KEncodingProber
 28 |  * \inmodule KCodecs
 29 |  *
 30 |  * \brief Provides encoding detection (probing) capabilities.
 31 |  *
 32 |  * Probes the encoding of raw data only.
 33 |  * If it cannot determine the encoding for sure, it returns the most likely encoding it has guessed.
 34 |  *
 35 |  * A Unicode probe is always performed, regardless of the ProberType.
 36 |  *
 37 |  * Feed data to it several times with feed() until ProberState changes to FoundIt/NotMe,
 38 |  * or confidence() returns a value you find acceptable.
 39 |  *
 40 |  * Intended lifetime of the object: one instance per ProberType.
 41 |  *
 42 |  * Typical use:
 43 |  * \code
 44 |  * QByteArray data, moredata;
 45 |  * ...
 46 |  * KEncodingProber prober(KEncodingProber::ChineseSimplified);
 47 |  * prober.feed(data);
 48 |  * prober.feed(moredata);
 49 |  * if (prober.confidence() > 0.6)
 50 |  *    encoding  = prober.encoding();
 51 |  * \endcode
 52 |  *
 53 |  * At least 256 characters are needed to change the ProberState from Probing to FoundIt.
 54 |  * If you don't have so many characters to probe,
 55 |  * decide whether to accept the encoding it guessed so far according to the Confidence by yourself.
 56 |  *
 57 |  */
 58 | class KCODECS_EXPORT KEncodingProber
 59 | {
 60 |     Q_DECLARE_TR_FUNCTIONS(KEncodingProber)
 61 | 
 62 | public:
 63 |     /*!
 64 |      * \value FoundIt The encoding has been determined with certainty
 65 |      * \value NotMe The data is certainly in none of the encodings supported by the current ProberType
 66 |      * \value Probing Need more data to make a decision
 67 |      */
 68 |     enum ProberState {
 69 |         FoundIt,
 70 |         NotMe,
 71 |         Probing,
 72 |     };
 73 | 
 74 |     /*!
 75 |      * \value None
 76 |      * \value Universal
 77 |      * \value Arabic
 78 |      * \value Baltic
 79 |      * \value CentralEuropean
 80 |      * \value ChineseSimplified
 81 |      * \value ChineseTraditional
 82 |      * \value Cyrillic
 83 |      * \value Greek
 84 |      * \value Hebrew
 85 |      * \value Japanese
 86 |      * \value Korean
 87 |      * \value NorthernSaami
 88 |      * \value Other
 89 |      * \value SouthEasternEurope
 90 |      * \value Thai
 91 |      * \value Turkish
 92 |      * \value Unicode
 93 |      * \value WesternEuropean
 94 |      */
 95 |     enum ProberType {
 96 |         None,
 97 |         Universal,
 98 |         Arabic,
 99 |         Baltic,
100 |         CentralEuropean,
101 |         ChineseSimplified,
102 |         ChineseTraditional,
103 |         Cyrillic,
104 |         Greek,
105 |         Hebrew,
106 |         Japanese,
107 |         Korean,
108 |         NorthernSaami,
109 |         Other,
110 |         SouthEasternEurope,
111 |         Thai,
112 |         Turkish,
113 |         Unicode,
114 |         WesternEuropean,
115 |     };
116 | 
117 |     /*!
118 |      * Creates a prober of type \a proberType. The default ProberType is Universal (detect all possible encodings)
119 |      */
120 |     KEncodingProber(ProberType proberType = Universal);
121 | 
122 |     ~KEncodingProber();
123 | 
124 |     KEncodingProber(const KEncodingProber &) = delete;
125 |     KEncodingProber &operator=(const KEncodingProber &) = delete;
126 | 
127 |     /*!
128 |      * Resets the prober's internal state and data.
129 |      */
130 |     void reset();
131 | 
132 |     /*!
133 |      * The main class method
134 |      *
135 |      * Feed \a data to the prober
136 |      *
137 |      * Returns the ProberState after probing the fed data.
138 |      */
139 |     ProberState feed(QByteArrayView data);
140 |     // for API compatibility: pointer-and-length overload that forwards to feed(QByteArrayView)
141 |     inline ProberState feed(const char *data, qsizetype len)
142 |     {
143 |         return feed({data, len});
144 |     }
145 | 
146 |     /*!
147 |      * Returns the prober's current ProberState
148 |      *
149 |      */
150 |     ProberState state() const;
151 | 
152 |     /*!
153 |      * Returns a QByteArray with the name of the best encoding it has guessed so far
154 |      * \since 4.2.2
155 |      */
156 |     QByteArray encoding() const;
157 | 
158 |     /*!
159 |      * Returns the confidence (sureness) of the encoding it guessed so far (0.0 ~ 0.99), not very reliable for single byte encodings
160 |      */
161 |     float confidence() const;
162 |     /*! Returns the prober's current ProberType */
163 |     ProberType proberType() const;
164 | 
165 |     /*!
166 |      * Changes the current prober's ProberType and resets the prober
167 |      *
168 |      * \a proberType the new type
169 |      */
170 |     void setProberType(ProberType proberType);
171 | 
172 |     /*!
173 |      * Returns the ProberType for \a lang (e.g. proberTypeForName("Chinese Simplified") will return KEncodingProber::ChineseSimplified)
174 |      */
175 |     static ProberType proberTypeForName(const QString &lang);
176 | 
177 |     /*!
178 |      * Maps a ProberType to its language string
179 |      *
180 |      * \a proberType the prober type
181 |      *
182 |      * Returns the language string
183 |      */
184 |     static QString nameForProberType(ProberType proberType);
185 | 
186 | private:
187 |     std::unique_ptr<KEncodingProberPrivate> const d;
188 | };
189 | 
190 | #endif
191 | 


--------------------------------------------------------------------------------
/src/probers/CharDistribution.cpp:
--------------------------------------------------------------------------------
 1 | /*  -*- C++ -*-
 2 |     SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
 3 | 
 4 |     SPDX-License-Identifier: MIT
 5 | */
 6 | 
 7 | #include "CharDistribution.h"
 8 | 
 9 | #include "tables/Big5Freq.tab"
10 | #include "tables/EUCKRFreq.tab"
11 | #include "tables/GB2312Freq.tab"
12 | #include "tables/JISFreq.tab"
13 | 
14 | #define SURE_YES 0.99f
15 | #define SURE_NO 0.01f
16 | 
17 | namespace kencodingprober
18 | {
19 | // Derives a confidence value from the character counts received so far.
20 | float CharDistributionAnalysis::GetConfidence()
21 | {
22 |     // no character from our consideration range was received: negative answer
23 |     if (mTotalChars == 0) {
24 |         return SURE_NO;
25 |     }
26 |     // both counters are equal: the ratio below would divide by zero
27 |     if (mTotalChars == mFreqChars) {
28 |         return SURE_YES;
29 |     }
30 | 
31 |     const float ratio = mFreqChars / ((mTotalChars - mFreqChars) * mTypicalDistributionRatio);
32 | 
33 |     // cap the confidence at SURE_YES (we don't want to be 100% sure)
34 |     const bool belowCap = ratio < SURE_YES;
35 |     return belowCap ? ratio : SURE_YES;
36 | }
37 | 
38 | EUCKRDistributionAnalysis::EUCKRDistributionAnalysis()
39 | { // configure the analysis with the EUC-KR ratio, table size and frequency-order table
40 |     mTypicalDistributionRatio = EUCKR_TYPICAL_DISTRIBUTION_RATIO;
41 |     mTableSize = EUCKR_TABLE_SIZE;
42 |     mCharToFreqOrder = EUCKRCharToFreqOrder;
43 | }
44 | 
45 | GB2312DistributionAnalysis::GB2312DistributionAnalysis()
46 | {
47 |     mCharToFreqOrder = GB2312CharToFreqOrder;
48 |     mTableSize = GB2312_TABLE_SIZE;
49 |     mTypicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO;
50 | }
51 | 
52 | Big5DistributionAnalysis::Big5DistributionAnalysis()
53 | {
54 |     mCharToFreqOrder = Big5CharToFreqOrder;
55 |     mTableSize = BIG5_TABLE_SIZE;
56 |     mTypicalDistributionRatio = BIG5_TYPICAL_DISTRIBUTION_RATIO;
57 | }
58 | 
59 | SJISDistributionAnalysis::SJISDistributionAnalysis()
60 | {
61 |     mCharToFreqOrder = JISCharToFreqOrder;
62 |     mTableSize = JIS_TABLE_SIZE;
63 |     mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO;
64 | }
65 | 
66 | EUCJPDistributionAnalysis::EUCJPDistributionAnalysis()
67 | {
68 |     mCharToFreqOrder = JISCharToFreqOrder;
69 |     mTableSize = JIS_TABLE_SIZE;
70 |     mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO;
71 | }
72 | }
73 | 


--------------------------------------------------------------------------------
/src/probers/CharDistribution.h:
--------------------------------------------------------------------------------
  1 | /*  -*- C++ -*-
  2 |     SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
  3 | 
  4 |     SPDX-License-Identifier: MIT
  5 | */
  6 | 
  7 | #ifndef CharDistribution_h__
  8 | #define CharDistribution_h__
  9 | 
 10 | #include "kcodecs_export.h"
 11 | 
 12 | #include <qcompilerdetection.h>
 13 | 
 14 | #define ENOUGH_DATA_THRESHOLD 256
 15 | 
 16 | namespace kencodingprober
 17 | {
 18 | class KCODECS_NO_EXPORT CharDistributionAnalysis
 19 | {
 20 | public:
 21 |     CharDistributionAnalysis()
 22 |     {
 23 |         Reset();
 24 |     }
 25 |     virtual ~CharDistributionAnalysis()
 26 |     {
 27 |     }
 28 | 
 29 |     // feed a block of data and do distribution analysis
 30 |     void HandleData(const char * /* aBuf */, unsigned int /* aLen */)
 31 |     {
 32 |     }
 33 | 
 34 |     // Feed a character with known length
 35 |     void HandleOneChar(const char *aStr, unsigned int aCharLen)
 36 |     {
 37 |         int order;
 38 | 
 39 |         // we only care about 2-bytes character in our distribution analysis
 40 |         order = (aCharLen == 2) ? GetOrder(aStr) : -1;
 41 | 
 42 |         if (order >= 0) {
 43 |             mTotalChars++;
 44 |             // order is valid
 45 |             if ((unsigned int)order < mTableSize) {
 46 |                 if (512 > mCharToFreqOrder[order]) {
 47 |                     mFreqChars++;
 48 |                 }
 49 |             }
 50 |         }
 51 |     }
 52 | 
 53 |     // return confidence base on existing data
 54 |     float GetConfidence();
 55 | 
 56 |     // Reset analyser, clear any state
 57 |     void Reset(void)
 58 |     {
 59 |         mDone = false;
 60 |         mTotalChars = 0;
 61 |         mFreqChars = 0;
 62 |     }
 63 | 
 64 |     // This function is for future extension. Caller can use this function to control
 65 |     // analyser's behavior
 66 |     void SetOpion()
 67 |     {
 68 |     }
 69 | 
 70 |     // It is not necessary to receive all data to draw conclusion. For charset detection,
 71 |     // certain amount of data is enough
 72 |     bool GotEnoughData()
 73 |     {
 74 |         return mTotalChars > ENOUGH_DATA_THRESHOLD;
 75 |     }
 76 | 
 77 | protected:
 78 |     // we do not handle character base on its original encoding string, but
 79 |     // convert this encoding string to a number, here called order.
 80 |     // This allows multiple encodings of a language to share one frequency table
 81 |     virtual int GetOrder(const char * /* str */)
 82 |     {
 83 |         return -1;
 84 |     }
 85 | 
 86 |     // If this flag is set to true, detection is done and conclusion has been made
 87 |     bool mDone;
 88 | 
 89 |     // The number of characters whose frequency order is less than 512
 90 |     unsigned int mFreqChars;
 91 | 
 92 |     // Total character encountered.
 93 |     unsigned int mTotalChars;
 94 | 
 95 |     // Mapping table to get frequency order from char order (get from GetOrder())
 96 |     const short *mCharToFreqOrder;
 97 | 
 98 |     // Size of above table
 99 |     unsigned int mTableSize;
100 | 
101 |     // This is a constant value varies from language to language, it is used in
102 |     // calculating confidence. See my paper for further detail.
103 |     float mTypicalDistributionRatio;
104 | };
105 | 
106 | class KCODECS_NO_EXPORT EUCKRDistributionAnalysis : public CharDistributionAnalysis
107 | {
108 | public:
109 |     EUCKRDistributionAnalysis();
110 | 
111 | protected:
112 |     // for euc-KR encoding, we are interested
113 |     //  first  byte range: 0xb0 -- 0xfe
114 |     //  second byte range: 0xa1 -- 0xfe
115 |     // no validation needed here. State machine has done that
116 |     int GetOrder(const char *str) override
117 |     {
118 |         if ((unsigned char)*str >= (unsigned char)0xb0) {
119 |             return 94 * ((unsigned char)str[0] - (unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1;
120 |         } else {
121 |             return -1;
122 |         }
123 |     }
124 | };
125 | 
126 | class KCODECS_NO_EXPORT GB2312DistributionAnalysis : public CharDistributionAnalysis
127 | {
128 | public:
129 |     GB2312DistributionAnalysis();
130 | 
131 | protected:
132 |     // for GB2312 encoding, we are interested
133 |     //  first  byte range: 0xb0 -- 0xfe
134 |     //  second byte range: 0xa1 -- 0xfe
135 |     // no validation needed here. State machine has done that
136 |     int GetOrder(const char *str) override
137 |     {
138 |         if ((unsigned char)*str >= (unsigned char)0xb0 && (unsigned char)str[1] >= (unsigned char)0xa1) {
139 |             return 94 * ((unsigned char)str[0] - (unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1;
140 |         } else {
141 |             return -1;
142 |         }
143 |     }
144 | };
145 | 
146 | class KCODECS_NO_EXPORT Big5DistributionAnalysis : public CharDistributionAnalysis
147 | {
148 | public:
149 |     Big5DistributionAnalysis();
150 | 
151 | protected:
152 |     // for big5 encoding, we are interested
153 |     //  first  byte range: 0xa4 -- 0xfe
154 |     //  second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
155 |     // no validation needed here. State machine has done that
156 |     int GetOrder(const char *str) override
157 |     {
158 |         if ((unsigned char)*str >= (unsigned char)0xa4)
159 |             if ((unsigned char)str[1] >= (unsigned char)0xa1) {
160 |                 return 157 * ((unsigned char)str[0] - (unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0xa1 + 63;
161 |             } else {
162 |                 return 157 * ((unsigned char)str[0] - (unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0x40;
163 |             }
164 |         else {
165 |             return -1;
166 |         }
167 |     }
168 | };
169 | 
170 | class KCODECS_NO_EXPORT SJISDistributionAnalysis : public CharDistributionAnalysis
171 | {
172 | public:
173 |     SJISDistributionAnalysis();
174 | 
175 | protected:
176 |     // for sjis encoding, we are interested
177 |     //  first  byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
178 |     //  second byte range: 0x40 -- 0x7e,  0x81 -- oxfe
179 |     // no validation needed here. State machine has done that
180 |     int GetOrder(const char *str) override
181 |     {
182 |         int order;
183 |         if ((unsigned char)*str >= (unsigned char)0x81 && (unsigned char)*str <= (unsigned char)0x9f) {
184 |             order = 188 * ((unsigned char)str[0] - (unsigned char)0x81);
185 |         } else if ((unsigned char)*str >= (unsigned char)0xe0 && (unsigned char)*str <= (unsigned char)0xef) {
186 |             order = 188 * ((unsigned char)str[0] - (unsigned char)0xe0 + 31);
187 |         } else {
188 |             return -1;
189 |         }
190 |         order += (unsigned char)*(str + 1) - 0x40;
191 |         if ((unsigned char)str[1] > (unsigned char)0x7f) {
192 |             order--;
193 |         }
194 |         return order;
195 |     }
196 | };
197 | 
198 | class KCODECS_NO_EXPORT EUCJPDistributionAnalysis : public CharDistributionAnalysis
199 | {
200 | public:
201 |     EUCJPDistributionAnalysis();
202 | 
203 | protected:
204 |     // for euc-JP encoding, we are interested
205 |     //  first  byte range: 0xa0 -- 0xfe
206 |     //  second byte range: 0xa1 -- 0xfe
207 |     // no validation needed here. State machine has done that
208 |     int GetOrder(const char *str) override
209 |     {
210 |         if ((unsigned char)*str >= (unsigned char)0xa0) {
211 |             return 94 * ((unsigned char)str[0] - (unsigned char)0xa1) + (unsigned char)str[1] - (unsigned char)0xa1;
212 |         } else {
213 |             return -1;
214 |         }
215 |     }
216 | };
217 | }
218 | #endif // CharDistribution_h__
219 | 


--------------------------------------------------------------------------------
/src/probers/ChineseGroupProber.cpp:
--------------------------------------------------------------------------------
  1 | /*  -*- C++ -*-
  2 |     SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
  3 | 
  4 |     SPDX-License-Identifier: MIT
  5 | */
  6 | 
  7 | #include "ChineseGroupProber.h"
  8 | 
  9 | #include "UnicodeGroupProber.h"
 10 | #include "nsBig5Prober.h"
 11 | #include "nsGB2312Prober.h"
 12 | 
 13 | #include <stdio.h>
 14 | #include <stdlib.h>
 15 | 
 16 | namespace kencodingprober
 17 | {
 18 | #ifdef DEBUG_PROBE
// Labels printed by ChineseGroupProber::DumpStatus(); entry i describes
// mProbers[i], so the order matches the assignments in the constructor.
static const char *const ProberName[] = {
    "Unicode",
    "GB18030",
    "Big5",
};
 24 | 
 25 | #endif
 26 | 
 27 | ChineseGroupProber::ChineseGroupProber()
 28 | {
 29 |     mProbers[0] = new UnicodeGroupProber();
 30 |     mProbers[1] = new nsGB18030Prober();
 31 |     mProbers[2] = new nsBig5Prober();
 32 |     Reset();
 33 | }
 34 | 
 35 | ChineseGroupProber::~ChineseGroupProber()
 36 | {
 37 |     for (unsigned int i = 0; i < CN_NUM_OF_PROBERS; i++) {
 38 |         delete mProbers[i];
 39 |     }
 40 | }
 41 | 
 42 | const char *ChineseGroupProber::GetCharSetName()
 43 | {
 44 |     if (mBestGuess == -1) {
 45 |         GetConfidence();
 46 |         if (mBestGuess == -1) {
 47 |             mBestGuess = 1; // assume it's GB18030
 48 |         }
 49 |     }
 50 |     return mProbers[mBestGuess]->GetCharSetName();
 51 | }
 52 | 
 53 | void ChineseGroupProber::Reset(void)
 54 | {
 55 |     mActiveNum = 0;
 56 |     for (unsigned int i = 0; i < CN_NUM_OF_PROBERS; i++) {
 57 |         if (mProbers[i]) {
 58 |             mProbers[i]->Reset();
 59 |             mIsActive[i] = true;
 60 |             ++mActiveNum;
 61 |         } else {
 62 |             mIsActive[i] = false;
 63 |         }
 64 |     }
 65 |     mBestGuess = -1;
 66 |     mState = eDetecting;
 67 | }
 68 | 
 69 | nsProbingState ChineseGroupProber::HandleData(const char *aBuf, unsigned int aLen)
 70 | {
 71 |     nsProbingState st;
 72 |     unsigned int i;
 73 | 
 74 |     // do filtering to reduce load to probers
 75 |     char *highbyteBuf;
 76 |     char *hptr;
 77 |     bool keepNext = true; // assume previous is not ascii, it will do no harm except add some noise
 78 |     hptr = highbyteBuf = (char *)malloc(aLen);
 79 |     if (!hptr) {
 80 |         return mState;
 81 |     }
 82 |     for (i = 0; i < aLen; ++i) {
 83 |         if (aBuf[i] & 0x80) {
 84 |             *hptr++ = aBuf[i];
 85 |             keepNext = true;
 86 |         } else {
 87 |             // if previous is highbyte, keep this even it is an ASCII
 88 |             if (keepNext) {
 89 |                 *hptr++ = aBuf[i];
 90 |                 keepNext = false;
 91 |             }
 92 |         }
 93 |     }
 94 | 
 95 |     for (i = 0; i < CN_NUM_OF_PROBERS; ++i) {
 96 |         if (!mIsActive[i]) {
 97 |             continue;
 98 |         }
 99 |         st = mProbers[i]->HandleData(highbyteBuf, hptr - highbyteBuf);
100 |         if (st == eFoundIt) {
101 |             mBestGuess = i;
102 |             mState = eFoundIt;
103 |             break;
104 |         } else if (st == eNotMe) {
105 |             mIsActive[i] = false;
106 |             --mActiveNum;
107 |             if (mActiveNum == 0) {
108 |                 mState = eNotMe;
109 |                 break;
110 |             }
111 |         }
112 |     }
113 | 
114 |     free(highbyteBuf);
115 | 
116 |     return mState;
117 | }
118 | 
119 | float ChineseGroupProber::GetConfidence(void)
120 | {
121 |     unsigned int i;
122 |     float bestConf = 0.0;
123 |     float cf;
124 | 
125 |     switch (mState) {
126 |     case eFoundIt:
127 |         return (float)0.99;
128 |     case eNotMe:
129 |         return (float)0.01;
130 |     default:
131 |         for (i = 0; i < CN_NUM_OF_PROBERS; ++i) {
132 |             if (!mIsActive[i]) {
133 |                 continue;
134 |             }
135 |             cf = mProbers[i]->GetConfidence();
136 |             if (bestConf < cf) {
137 |                 bestConf = cf;
138 |                 mBestGuess = i;
139 |             }
140 |         }
141 |     }
142 |     return bestConf;
143 | }
144 | 
145 | #ifdef DEBUG_PROBE
146 | void ChineseGroupProber::DumpStatus()
147 | {
148 |     unsigned int i;
149 |     float cf;
150 | 
151 |     GetConfidence();
152 |     for (i = 0; i < CN_NUM_OF_PROBERS; i++) {
153 |         if (!mIsActive[i]) {
154 |             printf("  Chinese group inactive: [%s] (confidence is too low).\r\n", ProberName[i]);
155 |         } else {
156 |             cf = mProbers[i]->GetConfidence();
157 |             printf("  Chinese group %1.3f: [%s]\r\n", cf, ProberName[i]);
158 |         }
159 |     }
160 | }
161 | #endif
162 | }
163 | 


--------------------------------------------------------------------------------
/src/probers/ChineseGroupProber.h:
--------------------------------------------------------------------------------
 1 | /*  -*- C++ -*-
 2 |     SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
 3 | 
 4 |     SPDX-License-Identifier: MIT
 5 | */
 6 | 
 7 | #ifndef CHINESEGROUPPROBER_H
 8 | #define CHINESEGROUPPROBER_H
 9 | 
10 | #include "nsCharSetProber.h"
11 | 
12 | #define CN_NUM_OF_PROBERS 3
13 | namespace kencodingprober
14 | {
15 | class KCODECS_NO_EXPORT ChineseGroupProber : public nsCharSetProber
16 | {
17 | public:
18 |     ChineseGroupProber();
19 |     ~ChineseGroupProber() override;
20 |     nsProbingState HandleData(const char *aBuf, unsigned int aLen) override;
21 |     const char *GetCharSetName() override;
22 |     nsProbingState GetState(void) override
23 |     {
24 |         return mState;
25 |     }
26 |     void Reset(void) override;
27 |     float GetConfidence(void) override;
28 |     void SetOpion() override
29 |     {
30 |     }
31 | 
32 | #ifdef DEBUG_PROBE
33 |     void DumpStatus() override;
34 | #endif
35 | 
36 | protected:
37 |     nsProbingState mState;
38 |     nsCharSetProber *mProbers[CN_NUM_OF_PROBERS];
39 |     bool mIsActive[CN_NUM_OF_PROBERS];
40 |     int mBestGuess;
41 |     unsigned int mActiveNum;
42 | };
43 | }
44 | #endif /* CHINESEGROUPPROBER_H */
45 | 


--------------------------------------------------------------------------------
/src/probers/JapaneseGroupProber.cpp:
--------------------------------------------------------------------------------
  1 | /*  -*- C++ -*-
  2 |     SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
  3 | 
  4 |     SPDX-License-Identifier: MIT
  5 | */
  6 | 
  7 | #include "JapaneseGroupProber.h"
  8 | 
  9 | #include <stdio.h>
 10 | #include <stdlib.h>
 11 | 
 12 | namespace kencodingprober
 13 | {
 14 | #ifdef DEBUG_PROBE
 15 | static const char *const ProberName[] = {
 16 |     "Unicode",
 17 |     "GB18030",
 18 |     "Big5",
 19 | };
 20 | 
 21 | #endif
 22 | 
 23 | JapaneseGroupProber::JapaneseGroupProber()
 24 | {
 25 |     mProbers[0] = new UnicodeGroupProber();
 26 |     mProbers[1] = new nsSJISProber();
 27 |     mProbers[2] = new nsEUCJPProber();
 28 |     Reset();
 29 | }
 30 | 
 31 | JapaneseGroupProber::~JapaneseGroupProber()
 32 | {
 33 |     for (unsigned int i = 0; i < JP_NUM_OF_PROBERS; i++) {
 34 |         delete mProbers[i];
 35 |     }
 36 | }
 37 | 
 38 | const char *JapaneseGroupProber::GetCharSetName()
 39 | {
 40 |     if (mBestGuess == -1) {
 41 |         GetConfidence();
 42 |         if (mBestGuess == -1) {
 43 |             mBestGuess = 1; // assume it's GB18030
 44 |         }
 45 |     }
 46 |     return mProbers[mBestGuess]->GetCharSetName();
 47 | }
 48 | 
 49 | void JapaneseGroupProber::Reset(void)
 50 | {
 51 |     mActiveNum = 0;
 52 |     for (unsigned int i = 0; i < JP_NUM_OF_PROBERS; i++) {
 53 |         if (mProbers[i]) {
 54 |             mProbers[i]->Reset();
 55 |             mIsActive[i] = true;
 56 |             ++mActiveNum;
 57 |         } else {
 58 |             mIsActive[i] = false;
 59 |         }
 60 |     }
 61 |     mBestGuess = -1;
 62 |     mState = eDetecting;
 63 | }
 64 | 
 65 | nsProbingState JapaneseGroupProber::HandleData(const char *aBuf, unsigned int aLen)
 66 | {
 67 |     nsProbingState st;
 68 |     unsigned int i;
 69 | 
 70 |     // do filtering to reduce load to probers
 71 |     char *highbyteBuf;
 72 |     char *hptr;
 73 |     bool keepNext = true; // assume previous is not ascii, it will do no harm except add some noise
 74 |     hptr = highbyteBuf = (char *)malloc(aLen);
 75 |     if (!hptr) {
 76 |         return mState;
 77 |     }
 78 |     for (i = 0; i < aLen; ++i) {
 79 |         if (aBuf[i] & 0x80) {
 80 |             *hptr++ = aBuf[i];
 81 |             keepNext = true;
 82 |         } else {
 83 |             // if previous is highbyte, keep this even it is a ASCII
 84 |             if (keepNext) {
 85 |                 *hptr++ = aBuf[i];
 86 |                 keepNext = false;
 87 |             }
 88 |         }
 89 |     }
 90 | 
 91 |     for (i = 0; i < JP_NUM_OF_PROBERS; ++i) {
 92 |         if (!mIsActive[i]) {
 93 |             continue;
 94 |         }
 95 |         st = mProbers[i]->HandleData(highbyteBuf, hptr - highbyteBuf);
 96 |         if (st == eFoundIt) {
 97 |             mBestGuess = i;
 98 |             mState = eFoundIt;
 99 |             break;
100 |         } else if (st == eNotMe) {
101 |             mIsActive[i] = false;
102 |             --mActiveNum;
103 |             if (mActiveNum == 0) {
104 |                 mState = eNotMe;
105 |                 break;
106 |             }
107 |         }
108 |     }
109 | 
110 |     free(highbyteBuf);
111 | 
112 |     return mState;
113 | }
114 | 
115 | float JapaneseGroupProber::GetConfidence(void)
116 | {
117 |     unsigned int i;
118 |     float bestConf = 0.0;
119 |     float cf;
120 | 
121 |     switch (mState) {
122 |     case eFoundIt:
123 |         return (float)0.99;
124 |     case eNotMe:
125 |         return (float)0.01;
126 |     default:
127 |         for (i = 0; i < JP_NUM_OF_PROBERS; ++i) {
128 |             if (!mIsActive[i]) {
129 |                 continue;
130 |             }
131 |             cf = mProbers[i]->GetConfidence();
132 |             if (bestConf < cf) {
133 |                 bestConf = cf;
134 |                 mBestGuess = i;
135 |             }
136 |         }
137 |     }
138 |     return bestConf;
139 | }
140 | 
141 | #ifdef DEBUG_PROBE
142 | void JapaneseGroupProber::DumpStatus()
143 | {
144 |     unsigned int i;
145 |     float cf;
146 | 
147 |     GetConfidence();
148 |     for (i = 0; i < JP_NUM_OF_PROBERS; i++) {
149 |         if (!mIsActive[i]) {
150 |             printf("  Chinese group inactive: [%s] (confidence is too low).\r\n", ProberName[i]);
151 |         } else {
152 |             cf = mProbers[i]->GetConfidence();
153 |             printf("  Chinese group %1.3f: [%s]\r\n", cf, ProberName[i]);
154 |         }
155 |     }
156 | }
157 | #endif
158 | }
159 | 


--------------------------------------------------------------------------------
/src/probers/JapaneseGroupProber.h:
--------------------------------------------------------------------------------
 1 | /*  -*- C++ -*-
 2 |     SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
 3 | 
 4 |     SPDX-License-Identifier: MIT
 5 | */
 6 | 
 7 | #ifndef JAPANESEGROUPPROBER_H
 8 | #define JAPANESEGROUPPROBER_H
 9 | 
10 | #include "UnicodeGroupProber.h"
11 | #include "nsCharSetProber.h"
12 | #include "nsEUCJPProber.h"
13 | #include "nsSJISProber.h"
14 | 
15 | #define JP_NUM_OF_PROBERS 3
16 | namespace kencodingprober
17 | {
18 | class KCODECS_NO_EXPORT JapaneseGroupProber : public nsCharSetProber
19 | {
20 | public:
21 |     JapaneseGroupProber();
22 |     ~JapaneseGroupProber() override;
23 |     nsProbingState HandleData(const char *aBuf, unsigned int aLen) override;
24 |     const char *GetCharSetName() override;
25 |     nsProbingState GetState(void) override
26 |     {
27 |         return mState;
28 |     }
29 |     void Reset(void) override;
30 |     float GetConfidence(void) override;
31 |     void SetOpion() override
32 |     {
33 |     }
34 | 
35 | #ifdef DEBUG_PROBE
36 |     void DumpStatus() override;
37 | #endif
38 | 
39 | protected:
40 |     nsProbingState mState;
41 |     nsCharSetProber *mProbers[JP_NUM_OF_PROBERS];
42 |     bool mIsActive[JP_NUM_OF_PROBERS];
43 |     int mBestGuess;
44 |     unsigned int mActiveNum;
45 | };
46 | }
47 | #endif /* JAPANESEGROUPPROBER_H */
48 | 


--------------------------------------------------------------------------------
/src/probers/JpCntx.h:
--------------------------------------------------------------------------------
  1 | /*  -*- C++ -*-
  2 |     SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
  3 | 
  4 |     SPDX-License-Identifier: MIT
  5 | */
  6 | 
  7 | #ifndef __JPCNTX_H__
  8 | #define __JPCNTX_H__
  9 | 
 10 | #include "kcodecs_export.h"
 11 | 
 12 | #include <qglobal.h>
 13 | 
 14 | #define NUM_OF_CATEGORY 6
 15 | 
 16 | #define ENOUGH_REL_THRESHOLD 100
 17 | #define MAX_REL_THRESHOLD 1000
 18 | namespace kencodingprober
 19 | {
 20 | // hiragana frequency category table
 21 | extern const char jp2CharContext[83][83];
 22 | 
 23 | class KCODECS_NO_EXPORT JapaneseContextAnalysis
 24 | {
 25 | public:
 26 |     JapaneseContextAnalysis()
 27 |     {
 28 |         Reset();
 29 |     }
 30 |     virtual ~JapaneseContextAnalysis()
 31 |     {
 32 |     }
 33 | 
 34 |     void HandleData(const char *aBuf, unsigned int aLen);
 35 | 
 36 |     void HandleOneChar(const char *aStr, unsigned int aCharLen)
 37 |     {
 38 |         int order;
 39 | 
 40 |         // if we received enough data, stop here
 41 |         if (mTotalRel > MAX_REL_THRESHOLD) {
 42 |             mDone = true;
 43 |         }
 44 |         if (mDone) {
 45 |             return;
 46 |         }
 47 | 
 48 |         // Only 2-bytes characters are of our interest
 49 |         order = (aCharLen == 2) ? GetOrder(aStr) : -1;
 50 |         if (order != -1 && mLastCharOrder != -1) {
 51 |             mTotalRel++;
 52 |             // count this sequence to its category counter
 53 |             mRelSample[(int)jp2CharContext[mLastCharOrder][order]]++;
 54 |         }
 55 |         mLastCharOrder = order;
 56 |     }
 57 | 
 58 |     float GetConfidence();
 59 |     void Reset(void);
 60 |     void SetOpion()
 61 |     {
 62 |     }
 63 |     bool GotEnoughData()
 64 |     {
 65 |         return mTotalRel > ENOUGH_REL_THRESHOLD;
 66 |     }
 67 | 
 68 | protected:
 69 |     virtual int GetOrder(const char *str, unsigned int *charLen) = 0;
 70 |     virtual int GetOrder(const char *str) = 0;
 71 | 
 72 |     // category counters, each integer counts sequence in its category
 73 |     unsigned int mRelSample[NUM_OF_CATEGORY];
 74 | 
 75 |     // total sequence received
 76 |     unsigned int mTotalRel;
 77 | 
 78 |     // The order of previous char
 79 |     int mLastCharOrder;
 80 | 
 81 |     // if last byte in current buffer is not the last byte of a character, we
 82 |     // need to know how many byte to skip in next buffer.
 83 |     unsigned int mNeedToSkipCharNum;
 84 | 
 85 |     // If this flag is set to true, detection is done and conclusion has been made
 86 |     bool mDone;
 87 | };
 88 | 
 89 | class KCODECS_NO_EXPORT SJISContextAnalysis : public JapaneseContextAnalysis
 90 | {
 91 |     // SJISContextAnalysis(){};
 92 | protected:
 93 |     int GetOrder(const char *str, unsigned int *charLen) override;
 94 | 
 95 |     int GetOrder(const char *str) override
 96 |     {
 97 |         // We only interested in Hiragana, so first byte is '\202'
 98 |         if (*str == '\202' && (unsigned char)*(str + 1) >= (unsigned char)0x9f && (unsigned char)*(str + 1) <= (unsigned char)0xf1) {
 99 |             return (unsigned char)*(str + 1) - (unsigned char)0x9f;
100 |         }
101 |         return -1;
102 |     }
103 | };
104 | 
105 | class KCODECS_NO_EXPORT EUCJPContextAnalysis : public JapaneseContextAnalysis
106 | {
107 | protected:
108 |     int GetOrder(const char *str, unsigned int *charLen) override;
109 |     int GetOrder(const char *str) override
110 |     // We only interested in Hiragana, so first byte is '\244'
111 |     {
112 |         if (*str == '\244' //
113 |             && (unsigned char)*(str + 1) >= (unsigned char)0xa1 //
114 |             && (unsigned char)*(str + 1) <= (unsigned char)0xf3) {
115 |             return (unsigned char)*(str + 1) - (unsigned char)0xa1;
116 |         }
117 |         return -1;
118 |     }
119 | };
120 | }
121 | #endif /* __JPCNTX_H__ */
122 | 


--------------------------------------------------------------------------------
/src/probers/UnicodeGroupProber.cpp:
--------------------------------------------------------------------------------
  1 | /*  -*- C++ -*-
  2 |     SPDX-FileCopyrightText: 2008 Wang Kai <wkai@gmail.com>
  3 | 
  4 |     SPDX-License-Identifier: MIT
  5 | */
  6 | 
  7 | #include "UnicodeGroupProber.h"
  8 | 
  9 | #include <QChar>
 10 | #include <math.h>
 11 | 
 12 | namespace kencodingprober
 13 | {
 14 | UnicodeGroupProber::UnicodeGroupProber(void)
 15 | {
 16 |     mCodingSM[0] = new nsCodingStateMachine(&UTF8SMModel);
 17 |     mCodingSM[1] = new nsCodingStateMachine(&UCS2LESMModel);
 18 |     mCodingSM[2] = new nsCodingStateMachine(&UCS2BESMModel);
 19 |     mActiveSM = NUM_OF_UNICODE_CHARSETS;
 20 |     mState = eDetecting;
 21 |     mDetectedCharset = "UTF-8";
 22 | }
 23 | 
 24 | UnicodeGroupProber::~UnicodeGroupProber(void)
 25 | {
 26 |     for (unsigned int i = 0; i < NUM_OF_UNICODE_CHARSETS; i++) {
 27 |         delete mCodingSM[i];
 28 |     }
 29 | }
 30 | 
 31 | void UnicodeGroupProber::Reset(void)
 32 | {
 33 |     mState = eDetecting;
 34 |     for (unsigned int i = 0; i < NUM_OF_UNICODE_CHARSETS; i++) {
 35 |         mCodingSM[i]->Reset();
 36 |     }
 37 |     mActiveSM = NUM_OF_UNICODE_CHARSETS;
 38 |     mDetectedCharset = "UTF-8";
 39 | }
 40 | 
 41 | nsProbingState UnicodeGroupProber::HandleData(const char *aBuf, unsigned int aLen)
 42 | {
 43 |     nsSMState codingState;
 44 |     static bool disableUTF16LE = false;
 45 |     static bool disableUTF16BE = false;
 46 | 
 47 |     if (mActiveSM == 0 || aLen < 2) {
 48 |         mState = eNotMe;
 49 |         return mState;
 50 |     }
 51 | 
 52 |     if (!(disableUTF16LE || disableUTF16BE)) {
 53 |         if (aLen % 2 != 0) {
 54 |             disableUTF16LE = true;
 55 |             disableUTF16BE = true;
 56 |         }
 57 |         const uint weight_BOM = sqrt((double)aLen) + aLen / 10.0;
 58 |         uint counts[5] = {0, 0, 0, 0, 0};
 59 |         for (uint i = 0; i < 5; i++) {
 60 |             counts[i] = std::count(aBuf, aBuf + aLen, char(i));
 61 |         }
 62 |         const double weight_zero = (2.0 * (counts[0] + counts[1] + counts[2] + counts[3] + counts[4]) + weight_BOM) / aLen;
 63 |         if (weight_zero < log(1.4142)) {
 64 |             disableUTF16LE = true;
 65 |             disableUTF16BE = true;
 66 |         }
 67 |         if (4 >= aBuf[1] && aBuf[1] >= 0 && QChar::isPrint(static_cast<uint>(aBuf[0]))) {
 68 |             disableUTF16BE = true;
 69 |         } else {
 70 |             disableUTF16LE = true;
 71 |         }
 72 |         if (disableUTF16BE) {
 73 |             mActiveSM--;
 74 |         }
 75 |         if (disableUTF16LE) {
 76 |             nsCodingStateMachine *t;
 77 |             t = mCodingSM[1];
 78 |             mCodingSM[1] = mCodingSM[2];
 79 |             mCodingSM[2] = t;
 80 |             mActiveSM--;
 81 |         }
 82 |     }
 83 | 
 84 |     for (uint i = 0; i < aLen; ++i) {
 85 |         for (int j = mActiveSM - 1; j >= 0; --j) {
 86 |             // byte is feed to all active state machine
 87 |             codingState = mCodingSM[j]->NextState(aBuf[i]);
 88 |             if (codingState == eError) {
 89 |                 // got negative answer for this state machine, make it inactive
 90 |                 mActiveSM--;
 91 |                 if (mActiveSM == 0) {
 92 |                     mState = eNotMe;
 93 |                     return mState;
 94 |                 } else if (j != (int)mActiveSM) {
 95 |                     nsCodingStateMachine *t;
 96 |                     t = mCodingSM[mActiveSM];
 97 |                     mCodingSM[mActiveSM] = mCodingSM[j];
 98 |                     mCodingSM[j] = t;
 99 |                 }
100 |             } else if (codingState == eItsMe) {
101 |                 mState = eFoundIt;
102 |                 mDetectedCharset = mCodingSM[j]->GetCodingStateMachine();
103 |                 return mState;
104 |             } else if (mState == eDetecting) {
105 |                 mDetectedCharset = mCodingSM[j]->GetCodingStateMachine();
106 |             };
107 |         }
108 |     }
109 |     return mState;
110 | }
111 | 
112 | float UnicodeGroupProber::GetConfidence()
113 | {
114 |     if (mState == eFoundIt) {
115 |         return 0.99f;
116 |     } else {
117 |         return 0.0f;
118 |     }
119 | }
120 | 
121 | #ifdef DEBUG_PROBE
122 | void UnicodeGroupProber::DumpStatus()
123 | {
124 |     GetConfidence();
125 |     for (uint i = 0; i < mActiveSM; i++) {
126 |         qDebug() << "Unicode group" << mCodingSM[i]->DumpCurrentState() << mCodingSM[i]->GetCodingStateMachine();
127 |     }
128 | }
129 | #endif
130 | 
131 | }
132 | 


--------------------------------------------------------------------------------
/src/probers/UnicodeGroupProber.h:
--------------------------------------------------------------------------------
 1 | /*  -*- C++ -*-
 2 |     SPDX-FileCopyrightText: 2008 Wang Kai <wkai@gmail.com>
 3 | 
 4 |     SPDX-License-Identifier: MIT
 5 | */
 6 | 
 7 | #ifndef UNICODEGROUPPROBER_H
 8 | #define UNICODEGROUPPROBER_H
 9 | 
10 | #include "nsCharSetProber.h"
11 | #include "nsCodingStateMachine.h"
12 | 
13 | #define NUM_OF_UNICODE_CHARSETS 3
14 | namespace kencodingprober
15 | {
16 | // Prober for the Unicode encodings. It holds NUM_OF_UNICODE_CHARSETS coding
17 | // state machine pointers and the charset name handed out by GetCharSetName().
18 | class KCODECS_NO_EXPORT UnicodeGroupProber : public nsCharSetProber
19 | {
20 | public:
21 |     UnicodeGroupProber();
22 |     ~UnicodeGroupProber() override;
23 | 
24 |     nsProbingState HandleData(const char *aBuf, unsigned int aLen) override;
25 |     const char *GetCharSetName() override { return mDetectedCharset; }
26 |     nsProbingState GetState() override { return mState; }
27 |     void Reset() override;
28 |     // 0.99 once mState is eFoundIt, 0.0 otherwise.
29 |     float GetConfidence() override;
30 |     void SetOpion() override {} // empty: does nothing
31 | #ifdef DEBUG_PROBE
32 |     void DumpStatus() override;
33 | #endif
34 | 
35 | protected:
36 |     void GetDistribution(unsigned int aCharLen, const char *aStr);
37 | 
38 |     // State machine pointers, one slot per Unicode charset.
39 |     nsCodingStateMachine *mCodingSM[NUM_OF_UNICODE_CHARSETS];
40 |     // Number of leading mCodingSM slots that DumpStatus() reports on.
41 |     unsigned int mActiveSM;
42 |     // Current verdict, returned by GetState().
43 |     nsProbingState mState;
44 |     // Charset name returned by GetCharSetName().
45 |     const char *mDetectedCharset;
46 | };
47 | }
48 | #endif /* UNICODEGROUPPROBER_H */
49 | 


--------------------------------------------------------------------------------
/src/probers/nsBig5Prober.cpp:
--------------------------------------------------------------------------------
 1 | /*  -*- C++ -*-
 2 |     SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
 3 | 
 4 |     SPDX-License-Identifier: MIT
 5 | */
 6 | 
 7 | #include "nsBig5Prober.h"
 8 | 
 9 | namespace kencodingprober
10 | {
11 | void nsBig5Prober::Reset()
12 | {
13 |     mState = eDetecting; // no verdict yet
14 |     mCodingSM->Reset();
15 |     mDistributionAnalyser.Reset();
16 | }
17 | 
18 | nsProbingState nsBig5Prober::HandleData(const char *aBuf, unsigned int aLen)
19 | {
20 |     if (!aLen) {
21 |         return mState; // empty chunk: nothing changes
22 |     }
23 | 
24 |     for (unsigned int pos = 0; pos < aLen; ++pos) {
25 |         const nsSMState smState = mCodingSM->NextState(aBuf[pos]);
26 |         if (smState == eError) {
27 |             mState = eNotMe;
28 |             break;
29 |         }
30 |         if (smState == eItsMe) {
31 |             mState = eFoundIt;
32 |             break;
33 |         }
34 |         if (smState != eStart) {
35 |             continue; // state machine is in an intermediate state
36 |         }
37 | 
38 |         // The machine is back at eStart: pass the two bytes ending at pos to the
39 |         // analyser. For pos 0 the first of them is the byte kept in mLastChar[0].
40 |         if (pos == 0) {
41 |             mLastChar[1] = aBuf[0];
42 |         }
43 |         const char *const pair = (pos == 0) ? mLastChar : aBuf + pos - 1;
44 |         mDistributionAnalyser.HandleOneChar(pair, mCodingSM->GetCurrentCharLen());
45 |     }
46 | 
47 |     mLastChar[0] = aBuf[aLen - 1]; // kept for the next call
48 | 
49 |     const bool undecided = (mState == eDetecting);
50 |     if (undecided && mDistributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD) {
51 |         mState = eFoundIt;
52 |     }
53 | 
54 |     return mState;
55 | }
56 | 
57 | // The prober's confidence is exactly what the distribution analyser reports.
58 | float nsBig5Prober::GetConfidence()
59 | {
60 |     const float confidence = mDistributionAnalyser.GetConfidence();
61 |     return confidence;
62 | }
63 | }
64 | 


--------------------------------------------------------------------------------
/src/probers/nsBig5Prober.h:
--------------------------------------------------------------------------------
 1 | /*  -*- C++ -*-
 2 |     SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
 3 | 
 4 |     SPDX-License-Identifier: MIT
 5 | */
 6 | 
 7 | #ifndef nsBig5Prober_h__
 8 | #define nsBig5Prober_h__
 9 | 
10 | #include "CharDistribution.h"
11 | #include "nsCharSetProber.h"
12 | #include "nsCodingStateMachine.h"
13 | namespace kencodingprober
14 | {
15 | // Prober for Big5: the Big5 coding state machine validates the bytes, while the
16 | // Big5 distribution analyser provides the confidence.
17 | class KCODECS_NO_EXPORT nsBig5Prober : public nsCharSetProber
18 | {
19 | public:
20 |     nsBig5Prober()
21 |         : mCodingSM(new nsCodingStateMachine(&Big5SMModel))
22 |     {
23 |         Reset();
24 |     }
25 |     ~nsBig5Prober() override
26 |     {
27 |         delete mCodingSM;
28 |     }
29 | 
30 |     nsProbingState HandleData(const char *aBuf, unsigned int aLen) override;
31 |     const char *GetCharSetName() override
32 |     {
33 |         return "Big5";
34 |     }
35 |     nsProbingState GetState() override { return mState; }
36 |     void Reset() override;
37 |     float GetConfidence() override;
38 |     void SetOpion() override {} // empty: does nothing
39 | 
40 | protected:
41 |     void GetDistribution(unsigned int aCharLen, const char *aStr);
42 | 
43 |     nsCodingStateMachine *mCodingSM; // created in the constructor, deleted in the destructor
44 |     nsProbingState mState; // current verdict, returned by GetState()
45 | 
46 |     // Big5ContextAnalysis mContextAnalyser;
47 |     Big5DistributionAnalysis mDistributionAnalyser;
48 |     // mLastChar[0] keeps the last byte of the previous HandleData() buffer and
49 |     // mLastChar[1] receives the first byte of the current one.
50 |     char mLastChar[2];
51 | };
52 | }
53 | 
54 | #endif /* nsBig5Prober_h__ */
55 | 


--------------------------------------------------------------------------------
/src/probers/nsCharSetProber.cpp:
--------------------------------------------------------------------------------
  1 | /*  -*- C++ -*-
  2 |     SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
  3 | 
  4 |     SPDX-License-Identifier: MIT
  5 | */
  6 | 
  7 | #include "nsCharSetProber.h"
  8 | 
  9 | #include <stdlib.h>
 10 | 
 11 | namespace kencodingprober
 12 | {
 13 | // Filter for scripts that do not use English letters. Copies into a malloc()ed *newBuf every
 14 | // delimiter-separated segment of aBuf that contains a high-bit byte, writing one space for the
 15 | // delimiter that ended it. Returns false (with *newBuf set to NULL) only if the allocation fails.
 16 | bool nsCharSetProber::FilterWithoutEnglishLetters(const char *aBuf, unsigned int aLen, char **newBuf, unsigned int &newLen)
 17 | {
 18 |     // malloc(0) may legally return NULL; always request at least one byte so that an
 19 |     // empty input is not reported as an allocation failure.
 20 |     char *newptr = *newBuf = (char *)malloc(aLen ? aLen : 1);
 21 |     if (!newptr) {
 22 |         return false;
 23 |     }
 24 | 
 25 |     bool meetMSB = false; // a high-bit byte was seen in the current segment
 26 |     const char *prevPtr = aBuf; // first byte of the current segment
 27 |     const char *curPtr = aBuf;
 28 |     for (; curPtr < aBuf + aLen; ++curPtr) {
 29 |         if (*curPtr & 0x80) {
 30 |             meetMSB = true;
 31 |         } else if (*curPtr < 'A' || (*curPtr > 'Z' && *curPtr < 'a') || *curPtr > 'z') {
 32 |             // Non-letter ASCII byte, most likely punctuation: treat it as a segment delimiter.
 33 |             if (meetMSB && curPtr > prevPtr) {
 34 |                 // The segment holds a high-bit byte: keep it, followed by one space.
 35 |                 while (prevPtr < curPtr) {
 36 |                     *newptr++ = *prevPtr++;
 37 |                 }
 38 |                 prevPtr++; // step over the delimiter itself
 39 |                 *newptr++ = ' ';
 40 |                 meetMSB = false;
 41 |             } else { // ignore the segment (just a symbol, or just an English word)
 42 |                 prevPtr = curPtr + 1;
 43 |             }
 44 |         }
 45 |     }
 46 |     if (meetMSB && curPtr > prevPtr) { // trailing segment with no delimiter after it
 47 |         while (prevPtr < curPtr) {
 48 |             *newptr++ = *prevPtr++;
 49 |         }
 50 |     }
 51 | 
 52 |     newLen = newptr - *newBuf;
 53 |     return true;
 54 | }
 55 | 
 56 | // Filter for scripts that mix English letters with high-bit bytes. Copies into a malloc()ed
 57 | // *newBuf the letter/high-bit segments of aBuf, one space per terminating delimiter, skipping
 58 | // segments that end while isInTag is set. Returns false only if the allocation fails.
 59 | bool nsCharSetProber::FilterWithEnglishLetters(const char *aBuf, unsigned int aLen, char **newBuf, unsigned int &newLen)
 60 | {
 61 |     // malloc(0) may legally return NULL; always request at least one byte so that an
 62 |     // empty input is not reported as an allocation failure.
 63 |     char *newptr = *newBuf = (char *)malloc(aLen ? aLen : 1);
 64 |     if (!newptr) {
 65 |         return false;
 66 |     }
 67 | 
 68 |     bool isInTag = false; // set by '<', cleared by '>'
 69 |     const char *prevPtr = aBuf; // first byte of the current segment
 70 |     const char *curPtr = aBuf;
 71 |     for (; curPtr < aBuf + aLen; ++curPtr) {
 72 |         if (*curPtr == '>') {
 73 |             isInTag = false;
 74 |         } else if (*curPtr == '<') {
 75 |             isInTag = true;
 76 |         }
 77 | 
 78 |         if (!(*curPtr & 0x80) //
 79 |             && (*curPtr < 'A' || (*curPtr > 'Z' && *curPtr < 'a') || *curPtr > 'z')) {
 80 |             // NOTE(review): isInTag is already cleared when '>' itself is the delimiter, so the
 81 |             // segment directly before '>' is kept; preserved as-is, confirm whether it is intended.
 82 |             if (curPtr > prevPtr && !isInTag) {
 83 |                 // The segment is more than a lone symbol and is not inside a tag: keep it.
 84 |                 while (prevPtr < curPtr) {
 85 |                     *newptr++ = *prevPtr++;
 86 |                 }
 87 |                 prevPtr++; // step over the delimiter itself
 88 |                 *newptr++ = ' ';
 89 |             } else {
 90 |                 prevPtr = curPtr + 1;
 91 |             }
 92 |         }
 93 |     }
 94 | 
 95 |     if (!isInTag) { // keep the trailing segment unless the input ends inside a tag
 96 |         while (prevPtr < curPtr) {
 97 |             *newptr++ = *prevPtr++;
 98 |         }
 99 |     }
100 | 
101 |     newLen = newptr - *newBuf;
102 | 
103 |     return true;
104 | }
105 | }
106 | 


--------------------------------------------------------------------------------
/src/probers/nsCharSetProber.h:
--------------------------------------------------------------------------------
 1 | /*  -*- C++ -*-
 2 |     SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
 3 | 
 4 |     SPDX-License-Identifier: MIT
 5 | */
 6 | 
 7 | #ifndef nsCharSetProber_h__
 8 | #define nsCharSetProber_h__
 9 | 
10 | #include "kencodingprober.h"
11 | 
12 | namespace kencodingprober
13 | {
14 | typedef enum {
15 |     eDetecting = 0, // Still detecting: no sure answer yet, but the caller can ask for confidence.
16 |     eFoundIt = 1, // Positive answer.
17 |     eNotMe = 2, // Negative answer.
18 | } nsProbingState;
19 | 
20 | #define SHORTCUT_THRESHOLD (float)0.95
21 | 
22 | // Abstract interface implemented by the charset probers.
23 | class KCODECS_NO_EXPORT nsCharSetProber
24 | {
25 | public:
26 |     virtual ~nsCharSetProber() = default;
27 | 
28 |     virtual const char *GetCharSetName() = 0;
29 |     virtual nsProbingState HandleData(const char *aBuf, unsigned int aLen) = 0;
30 |     virtual nsProbingState GetState(void) = 0;
31 |     virtual void Reset(void) = 0;
32 |     virtual float GetConfidence(void) = 0;
33 |     virtual void SetOpion() = 0;
34 | 
35 | #ifdef DEBUG_PROBE
36 |     // Declared virtual here (this class has no base, so it cannot be marked
37 |     // "override"); derived probers override it with their own debug output.
38 |     virtual void DumpStatus() {}
39 | #endif
40 | 
41 |     // Helper functions used in the Latin1 and Group probers.
42 |     // Both functions allocate a new buffer for newBuf with malloc(); the caller
43 |     // releases it with free().
44 |     // Both functions return false in case of memory allocation failure.
45 |     static bool FilterWithoutEnglishLetters(const char *aBuf, unsigned int aLen, char **newBuf, unsigned int &newLen);
46 |     static bool FilterWithEnglishLetters(const char *aBuf, unsigned int aLen, char **newBuf, unsigned int &newLen);
47 | };
48 | }
49 | #endif /* nsCharSetProber_h__ */
50 | 


--------------------------------------------------------------------------------
/src/probers/nsCodingStateMachine.h:
--------------------------------------------------------------------------------
  1 | /*  -*- C++ -*-
  2 |     SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
  3 | 
  4 |     SPDX-License-Identifier: MIT
  5 | */
  6 | 
  7 | #ifndef nsCodingStateMachine_h__
  8 | #define nsCodingStateMachine_h__
  9 | 
 10 | #include "kencodingprober.h"
 11 | 
 12 | #include "kcodecs_export.h"
 13 | 
 14 | #include "nsPkgInt.h"
 15 | namespace kencodingprober
 16 | {
 17 | enum {
 18 |     eStart = 0, // initial state; NextState() re-reads the character length whenever it starts from here
 19 |     eError = 1, // the probers in this directory answer eNotMe when they see this
 20 |     eItsMe = 2, // the probers in this directory answer eFoundIt when they see this
 21 | };
 22 | using nsSMState = int; // plain int; presumably the state tables also yield values beyond the three above
 23 | 
 24 | #define GETCLASS(c) GETFROMPCK(((unsigned char)(c)), mModel->classTable)
 25 | 
 26 | // state machine model
 27 | typedef struct {
 28 |     nsPkgInt classTable; // read through GETCLASS() to map a byte to its class
 29 |     unsigned int classFactor; // multiplier applied to the current state when indexing stateTable
 30 |     nsPkgInt stateTable; // transitions, indexed by state * classFactor + byte class
 31 |     const unsigned int *charLenTable; // character length, indexed by the class of the first byte
 32 |     const char *name; // returned by nsCodingStateMachine::GetCodingStateMachine()
 33 | } SMModel;
 34 | 
 35 | // Table-driven state machine that consumes one byte per NextState() call, driven by an SMModel.
 36 | class KCODECS_NO_EXPORT nsCodingStateMachine
 37 | {
 38 | public:
 39 |     nsCodingStateMachine(const SMModel *sm)
 40 |         : mCurrentState(eStart)
 41 |         , mModel(sm)
 42 |     {}
 43 |     nsSMState NextState(char c)
 44 |     {
 45 |         // for each byte we get its class; if it is the first byte, we also get the byte length
 46 |         unsigned int byteCls = GETCLASS(c);
 47 |         if (mCurrentState == eStart) {
 48 |             mCurrentBytePos = 0;
 49 |             mCurrentCharLen = mModel->charLenTable[byteCls];
 50 |         }
 51 |         // from the byte's class and stateTable, we get its next state
 52 |         mCurrentState = GETFROMPCK(mCurrentState * (mModel->classFactor) + byteCls, mModel->stateTable);
 53 |         mCurrentBytePos++;
 54 |         return mCurrentState;
 55 |     }
 56 |     unsigned int GetCurrentCharLen(void) // length read when the current character started; 0 before any NextState()
 57 |     {
 58 |         return mCurrentCharLen;
 59 |     }
 60 |     void Reset(void) // the length/position fields are refreshed by the next NextState() call
 61 |     {
 62 |         mCurrentState = eStart;
 63 |     }
 64 |     const char *GetCodingStateMachine() // name stored in the model
 65 |     {
 66 |         return mModel->name;
 67 |     }
 68 | #ifdef DEBUG_PROBE
 69 |     const char *DumpCurrentState()
 70 |     {
 71 |         switch (mCurrentState) {
 72 |         case eStart:
 73 |             return "eStart";
 74 |         case eError:
 75 |             return "eError";
 76 |         case eItsMe:
 77 |             return "eItsMe";
 78 |         default:
 79 |             return "OK";
 80 |         }
 81 |     }
 82 | #endif
 83 | 
 84 | protected:
 85 |     int mCurrentState;
 86 |     unsigned int mCurrentCharLen = 0; // initialised so GetCurrentCharLen() is defined before the first byte
 87 |     unsigned int mCurrentBytePos = 0; // bytes consumed since the machine last started from eStart
 88 | 
 89 |     const SMModel *mModel; // supplied by the caller; this class never deletes it
 90 | };
 91 | 
 92 | extern KCODECS_NO_EXPORT const SMModel UTF8SMModel;
 93 | extern KCODECS_NO_EXPORT const SMModel Big5SMModel;
 94 | extern KCODECS_NO_EXPORT const SMModel EUCJPSMModel;
 95 | extern KCODECS_NO_EXPORT const SMModel EUCKRSMModel;
 96 | extern KCODECS_NO_EXPORT const SMModel GB18030SMModel;
 97 | extern KCODECS_NO_EXPORT const SMModel SJISSMModel;
 98 | extern KCODECS_NO_EXPORT const SMModel UCS2LESMModel;
 99 | extern KCODECS_NO_EXPORT const SMModel UCS2BESMModel;
100 | 
101 | extern KCODECS_NO_EXPORT const SMModel HZSMModel;
102 | extern KCODECS_NO_EXPORT const SMModel ISO2022CNSMModel;
103 | extern KCODECS_NO_EXPORT const SMModel ISO2022JPSMModel;
104 | extern KCODECS_NO_EXPORT const SMModel ISO2022KRSMModel;
105 | }
106 | #endif /* nsCodingStateMachine_h__ */
107 | 


--------------------------------------------------------------------------------
/src/probers/nsEUCJPProber.cpp:
--------------------------------------------------------------------------------
 1 | /*  -*- C++ -*-
 2 |     SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
 3 | 
 4 |     SPDX-License-Identifier: MIT
 5 | */
 6 | 
 7 | // for japanese encoding, observe characteristic:
 8 | // 1, kana character (or hankaku?) often have high frequency of appearance
 9 | // 2, kana character often exist in group
10 | // 3, certain combination of kana is never used in japanese language
11 | 
12 | #include "nsEUCJPProber.h"
13 | 
14 | namespace kencodingprober
15 | {
16 | void nsEUCJPProber::Reset()
17 | {
18 |     mState = eDetecting; // no verdict yet
19 |     mCodingSM->Reset();
20 |     mContextAnalyser.Reset();
21 |     mDistributionAnalyser.Reset();
22 | }
23 | 
24 | nsProbingState nsEUCJPProber::HandleData(const char *aBuf, unsigned int aLen)
25 | {
26 |     if (!aLen) {
27 |         return mState; // empty chunk: nothing changes
28 |     }
29 | 
30 |     for (unsigned int pos = 0; pos < aLen; ++pos) {
31 |         const nsSMState smState = mCodingSM->NextState(aBuf[pos]);
32 |         if (smState == eError) {
33 |             mState = eNotMe;
34 |             break;
35 |         }
36 |         if (smState == eItsMe) {
37 |             mState = eFoundIt;
38 |             break;
39 |         }
40 |         if (smState != eStart) {
41 |             continue; // state machine is in an intermediate state
42 |         }
43 | 
44 |         // The machine is back at eStart: pass the two bytes ending at pos to both
45 |         // analysers. For pos 0 the first of them is the byte kept in mLastChar[0].
46 |         if (pos == 0) {
47 |             mLastChar[1] = aBuf[0];
48 |         }
49 |         const char *const pair = (pos == 0) ? mLastChar : aBuf + pos - 1;
50 |         const unsigned int charLen = mCodingSM->GetCurrentCharLen();
51 |         mContextAnalyser.HandleOneChar(pair, charLen);
52 |         mDistributionAnalyser.HandleOneChar(pair, charLen);
53 |     }
54 | 
55 |     mLastChar[0] = aBuf[aLen - 1]; // kept for the next call
56 | 
57 |     const bool undecided = (mState == eDetecting);
58 |     if (undecided && mContextAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD) {
59 |         mState = eFoundIt;
60 |     }
61 | 
62 |     return mState;
63 | }
64 | 
65 | // Reports whichever of the context and distribution confidences is larger.
66 | float nsEUCJPProber::GetConfidence()
67 | {
68 |     const float fromContext = mContextAnalyser.GetConfidence();
69 |     const float fromDistribution = mDistributionAnalyser.GetConfidence();
70 |     return (fromContext > fromDistribution) ? fromContext : fromDistribution;
71 | }
72 | }
73 | 


--------------------------------------------------------------------------------
/src/probers/nsEUCJPProber.h:
--------------------------------------------------------------------------------
 1 | /*  -*- C++ -*-
 2 |     SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
 3 | 
 4 |     SPDX-License-Identifier: MIT
 5 | */
 6 | 
 7 | // for EUC-JP encoding, observe characteristic:
 8 | // 1, kana character (or hankaku?) often have high frequency of appearance
 9 | // 2, kana character often exist in group
10 | // 3, certain combination of kana is never used in japanese language
11 | 
12 | #ifndef nsEUCJPProber_h__
13 | #define nsEUCJPProber_h__
14 | 
15 | #include "CharDistribution.h"
16 | #include "JpCntx.h"
17 | #include "nsCharSetProber.h"
18 | #include "nsCodingStateMachine.h"
19 | namespace kencodingprober
20 | {
21 | // Prober for EUC-JP: the EUC-JP coding state machine validates the bytes, while the
22 | // context and distribution analysers provide the confidence.
23 | class KCODECS_NO_EXPORT nsEUCJPProber : public nsCharSetProber
24 | {
25 | public:
26 |     nsEUCJPProber()
27 |         : mCodingSM(new nsCodingStateMachine(&EUCJPSMModel))
28 |     {
29 |         Reset();
30 |     }
31 |     ~nsEUCJPProber() override
32 |     {
33 |         delete mCodingSM;
34 |     }
35 |     nsProbingState HandleData(const char *aBuf, unsigned int aLen) override;
36 |     const char *GetCharSetName() override
37 |     {
38 |         return "EUC-JP";
39 |     }
40 |     nsProbingState GetState() override { return mState; }
41 |     void Reset() override;
42 |     // Larger of the context and distribution confidences.
43 |     float GetConfidence() override;
44 |     void SetOpion() override {} // empty: does nothing
45 | 
46 | protected:
47 |     nsCodingStateMachine *mCodingSM; // created in the constructor, deleted in the destructor
48 |     nsProbingState mState; // current verdict, returned by GetState()
49 | 
50 |     EUCJPContextAnalysis mContextAnalyser;
51 |     EUCJPDistributionAnalysis mDistributionAnalyser;
52 | 
53 |     // mLastChar[0] keeps the last byte of the previous HandleData() buffer and
54 |     // mLastChar[1] receives the first byte of the current one.
55 |     char mLastChar[2];
56 | };
57 | }
58 | 
59 | #endif /* nsEUCJPProber_h__ */
60 | 


--------------------------------------------------------------------------------
/src/probers/nsEUCKRProber.cpp:
--------------------------------------------------------------------------------
 1 | /*  -*- C++ -*-
 2 |     SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
 3 | 
 4 |     SPDX-License-Identifier: MIT
 5 | */
 6 | 
 7 | #include "nsEUCKRProber.h"
 8 | 
 9 | namespace kencodingprober
10 | {
11 | void nsEUCKRProber::Reset()
12 | {
13 |     mState = eDetecting; // no verdict yet
14 |     mCodingSM->Reset();
15 |     mDistributionAnalyser.Reset();
16 |     // mContextAnalyser.Reset();
17 | }
18 | 
19 | nsProbingState nsEUCKRProber::HandleData(const char *aBuf, unsigned int aLen)
20 | {
21 |     if (!aLen) {
22 |         return mState; // empty chunk: nothing changes
23 |     }
24 | 
25 |     for (unsigned int pos = 0; pos < aLen; ++pos) {
26 |         const nsSMState smState = mCodingSM->NextState(aBuf[pos]);
27 |         if (smState == eError) {
28 |             mState = eNotMe;
29 |             break;
30 |         }
31 |         if (smState == eItsMe) {
32 |             mState = eFoundIt;
33 |             break;
34 |         }
35 |         if (smState != eStart) {
36 |             continue; // state machine is in an intermediate state
37 |         }
38 | 
39 |         // The machine is back at eStart: pass the two bytes ending at pos to the
40 |         // analyser. For pos 0 the first of them is the byte kept in mLastChar[0].
41 |         if (pos == 0) {
42 |             mLastChar[1] = aBuf[0];
43 |         }
44 |         const char *const pair = (pos == 0) ? mLastChar : aBuf + pos - 1;
45 |         mDistributionAnalyser.HandleOneChar(pair, mCodingSM->GetCurrentCharLen());
46 |     }
47 | 
48 |     mLastChar[0] = aBuf[aLen - 1]; // kept for the next call
49 | 
50 |     // While still undecided, settle on eFoundIt as soon as the analyser has
51 |     // enough data and the confidence exceeds SHORTCUT_THRESHOLD.
52 |     const bool undecided = (mState == eDetecting);
53 |     if (undecided && mDistributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD) {
54 |         mState = eFoundIt;
55 |     }
56 | 
57 |     return mState;
58 | }
59 | 
60 | // The prober's confidence is exactly what the distribution analyser reports.
61 | float nsEUCKRProber::GetConfidence()
62 | {
63 |     const float confidence = mDistributionAnalyser.GetConfidence();
64 |     return confidence;
65 | }
66 | }
67 | 


--------------------------------------------------------------------------------
/src/probers/nsEUCKRProber.h:
--------------------------------------------------------------------------------
 1 | /*  -*- C++ -*-
 2 |     SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
 3 | 
 4 |     SPDX-License-Identifier: MIT
 5 | */
 6 | 
 7 | #ifndef nsEUCKRProber_h__
 8 | #define nsEUCKRProber_h__
 9 | 
10 | #include "CharDistribution.h"
11 | #include "nsCharSetProber.h"
12 | #include "nsCodingStateMachine.h"
13 | namespace kencodingprober
14 | {
15 | // Prober for EUC-KR: the EUC-KR coding state machine validates the bytes, while the
16 | // EUC-KR distribution analyser provides the confidence.
17 | class KCODECS_NO_EXPORT nsEUCKRProber : public nsCharSetProber
18 | {
19 | public:
20 |     nsEUCKRProber()
21 |         : mCodingSM(new nsCodingStateMachine(&EUCKRSMModel))
22 |     {
23 |         Reset();
24 |     }
25 |     ~nsEUCKRProber() override
26 |     {
27 |         delete mCodingSM;
28 |     }
29 | 
30 |     nsProbingState HandleData(const char *aBuf, unsigned int aLen) override;
31 |     const char *GetCharSetName() override
32 |     {
33 |         return "EUC-KR";
34 |     }
35 |     nsProbingState GetState() override { return mState; }
36 |     void Reset() override;
37 |     float GetConfidence() override;
38 |     void SetOpion() override {} // empty: does nothing
39 | 
40 | protected:
41 |     void GetDistribution(unsigned int aCharLen, const char *aStr);
42 | 
43 |     nsCodingStateMachine *mCodingSM; // created in the constructor, deleted in the destructor
44 |     nsProbingState mState; // current verdict, returned by GetState()
45 | 
46 |     // EUCKRContextAnalysis mContextAnalyser;
47 |     EUCKRDistributionAnalysis mDistributionAnalyser;
48 |     // mLastChar[0] keeps the last byte of the previous HandleData() buffer and
49 |     // mLastChar[1] receives the first byte of the current one.
50 |     char mLastChar[2];
51 | };
52 | }
53 | 
54 | #endif /* nsEUCKRProber_h__ */
55 | 


--------------------------------------------------------------------------------
/src/probers/nsEscCharsetProber.cpp:
--------------------------------------------------------------------------------
 1 | /*  -*- C++ -*-
 2 |     SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
 3 | 
 4 |     SPDX-License-Identifier: MIT
 5 | */
 6 | 
 7 | #include "nsEscCharsetProber.h"
 8 | 
 9 | namespace kencodingprober
10 | {
11 | nsEscCharSetProber::nsEscCharSetProber()
12 |     : mActiveSM(NUM_OF_ESC_CHARSETS)
13 |     , mState(eDetecting)
14 |     , mDetectedCharset(nullptr)
15 | {
16 |     mCodingSM[0] = new nsCodingStateMachine(&HZSMModel); // one machine per escape-based charset
17 |     mCodingSM[1] = new nsCodingStateMachine(&ISO2022CNSMModel);
18 |     mCodingSM[2] = new nsCodingStateMachine(&ISO2022JPSMModel);
19 |     mCodingSM[3] = new nsCodingStateMachine(&ISO2022KRSMModel);
20 | }
21 | 
22 | nsEscCharSetProber::~nsEscCharSetProber()
23 | {
24 |     for (nsCodingStateMachine *machine : mCodingSM) { // every slot was filled by the constructor
25 |         delete machine;
26 |     }
27 | }
28 | 
29 | void nsEscCharSetProber::Reset()
30 | {
31 |     for (nsCodingStateMachine *machine : mCodingSM) {
32 |         machine->Reset();
33 |     }
34 |     mActiveSM = NUM_OF_ESC_CHARSETS; // every machine takes part again
35 |     mState = eDetecting;
36 |     mDetectedCharset = nullptr;
37 | }
38 | 
39 | nsProbingState nsEscCharSetProber::HandleData(const char *aBuf, unsigned int aLen)
40 | {
41 |     for (unsigned int pos = 0; pos < aLen && mState == eDetecting; ++pos) {
42 |         // Feed the byte to every active state machine, highest slot first.
43 |         for (int slot = mActiveSM - 1; slot >= 0; --slot) {
44 |             const nsSMState smState = mCodingSM[slot]->NextState(aBuf[pos]);
45 |             if (smState == eItsMe) {
46 |                 mState = eFoundIt;
47 |                 mDetectedCharset = mCodingSM[slot]->GetCodingStateMachine();
48 |                 return mState;
49 |             }
50 |             if (smState != eError) {
51 |                 continue;
52 |             }
53 |             // This machine rejected the input: shrink the active range.
54 |             --mActiveSM;
55 |             if (mActiveSM == 0) {
56 |                 mState = eNotMe;
57 |                 return mState;
58 |             }
59 |             if (slot != (int)mActiveSM) {
60 |                 // Swap the rejected machine with the one in slot mActiveSM so that the
61 |                 // machines still in play occupy the slots below mActiveSM.
62 |                 nsCodingStateMachine *const rejected = mCodingSM[slot];
63 |                 mCodingSM[slot] = mCodingSM[mActiveSM];
64 |                 mCodingSM[mActiveSM] = rejected;
65 |             }
66 |         }
67 |     }
68 | 
69 |     return mState;
70 | }
71 | }
72 | 


--------------------------------------------------------------------------------
/src/probers/nsEscCharsetProber.h:
--------------------------------------------------------------------------------
 1 | /*  -*- C++ -*-
 2 |     SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
 3 | 
 4 |     SPDX-License-Identifier: MIT
 5 | */
 6 | 
 7 | #ifndef nsEscCharSetProber_h__
 8 | #define nsEscCharSetProber_h__
 9 | 
10 | #include "nsCharSetProber.h"
11 | #include "nsCodingStateMachine.h"
12 | 
13 | #define NUM_OF_ESC_CHARSETS 4
14 | namespace kencodingprober
15 | {
16 | // Prober for the escape-sequence based encodings (HZ and the ISO-2022 variants);
17 | // it keeps NUM_OF_ESC_CHARSETS coding state machines.
18 | class KCODECS_NO_EXPORT nsEscCharSetProber : public nsCharSetProber
19 | {
20 | public:
21 |     nsEscCharSetProber();
22 |     ~nsEscCharSetProber() override;
23 | 
24 |     // Feeds each byte to every active state machine until one of them reports
25 |     // eItsMe or all of them have reported eError.
26 |     nsProbingState HandleData(const char *aBuf, unsigned int aLen) override;
27 |     // Name of the machine that reported eItsMe; nullptr until then.
28 |     const char *GetCharSetName() override
29 |     {
30 |         return mDetectedCharset;
31 |     }
32 |     nsProbingState GetState() override { return mState; }
33 |     void Reset() override;
34 |     // Constant value, independent of the current state.
35 |     float GetConfidence() override { return (float)0.99; }
36 |     void SetOpion() override {} // empty: does nothing
37 | 
38 | protected:
39 |     void GetDistribution(unsigned int aCharLen, const char *aStr);
40 | 
41 |     // Slots below mActiveSM hold the machines that have not rejected the input yet.
42 |     nsCodingStateMachine *mCodingSM[NUM_OF_ESC_CHARSETS];
43 |     unsigned int mActiveSM;
44 |     nsProbingState mState; // current verdict, returned by GetState()
45 |     const char *mDetectedCharset; // see GetCharSetName()
46 | };
47 | }
48 | #endif /* nsEscCharSetProber_h__ */
49 | 


--------------------------------------------------------------------------------
/src/probers/nsGB2312Prober.cpp:
--------------------------------------------------------------------------------
 1 | /*  -*- C++ -*-
 2 |     SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
 3 | 
 4 |     SPDX-License-Identifier: MIT
 5 | */
 6 | 
 7 | // for GB18030 encoding (used here in place of GB2312, of which it is a superset),
 8 | // detection relies on:
 9 | // 1, the GB18030 coding state machine to validate the byte sequence
10 | // 2, the GB2312 character distribution analyser to compute the confidence
11 | 
12 | #include "nsGB2312Prober.h"
13 | 
14 | namespace kencodingprober
15 | {
16 | void nsGB18030Prober::Reset()
17 | {
18 |     mState = eDetecting; // no verdict yet
19 |     mCodingSM->Reset();
20 |     mDistributionAnalyser.Reset();
21 |     // mContextAnalyser.Reset();
22 | }
23 | 
24 | nsProbingState nsGB18030Prober::HandleData(const char *aBuf, unsigned int aLen)
25 | {
26 |     if (!aLen) {
27 |         return mState; // empty chunk: nothing changes
28 |     }
29 | 
30 |     for (unsigned int pos = 0; pos < aLen; ++pos) {
31 |         const nsSMState smState = mCodingSM->NextState(aBuf[pos]);
32 |         if (smState == eError) {
33 |             mState = eNotMe;
34 |             break;
35 |         }
36 |         if (smState == eItsMe) {
37 |             mState = eFoundIt;
38 |             break;
39 |         }
40 |         if (smState != eStart) {
41 |             continue; // state machine is in an intermediate state
42 |         }
43 | 
44 |         // The machine is back at eStart: pass the two bytes ending at pos to the
45 |         // analyser. For pos 0 the first of them is the byte kept in mLastChar[0].
46 |         if (pos == 0) {
47 |             mLastChar[1] = aBuf[0];
48 |         }
49 |         const char *const pair = (pos == 0) ? mLastChar : aBuf + pos - 1;
50 |         mDistributionAnalyser.HandleOneChar(pair, mCodingSM->GetCurrentCharLen());
51 |     }
52 | 
53 |     mLastChar[0] = aBuf[aLen - 1]; // kept for the next call
54 | 
55 |     // While still undecided, settle on eFoundIt as soon as the analyser has
56 |     // enough data and the confidence exceeds SHORTCUT_THRESHOLD.
57 |     const bool undecided = (mState == eDetecting);
58 |     if (undecided && mDistributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD) {
59 |         mState = eFoundIt;
60 |     }
61 | 
62 |     return mState;
63 | }
64 | 
65 | // The prober's confidence is exactly what the distribution analyser reports.
66 | float nsGB18030Prober::GetConfidence()
67 | {
68 |     const float confidence = mDistributionAnalyser.GetConfidence();
69 |     return confidence;
70 | }
71 | }
72 | 


--------------------------------------------------------------------------------
/src/probers/nsGB2312Prober.h:
--------------------------------------------------------------------------------
 1 | /*  -*- C++ -*-
 2 |     SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
 3 | 
 4 |     SPDX-License-Identifier: MIT
 5 | */
 6 | 
 7 | #ifndef nsGB2312Prober_h__
 8 | #define nsGB2312Prober_h__
 9 | 
10 | #include "CharDistribution.h"
11 | #include "nsCharSetProber.h"
12 | #include "nsCodingStateMachine.h"
13 | 
14 | // We use gb18030 to replace gb2312, because 18030 is a superset.
15 | namespace kencodingprober
16 | {
17 | // Prober reporting "gb18030": the GB18030 coding state machine validates the bytes,
18 | // while the GB2312 distribution analyser provides the confidence.
19 | class KCODECS_NO_EXPORT nsGB18030Prober : public nsCharSetProber
20 | {
21 | public:
22 |     nsGB18030Prober()
23 |         : mCodingSM(new nsCodingStateMachine(&GB18030SMModel))
24 |     {
25 |         Reset();
26 |     }
27 |     ~nsGB18030Prober() override
28 |     {
29 |         delete mCodingSM;
30 |     }
31 | 
32 |     nsProbingState HandleData(const char *aBuf, unsigned int aLen) override;
33 |     const char *GetCharSetName() override
34 |     {
35 |         return "gb18030";
36 |     }
37 |     nsProbingState GetState() override { return mState; }
38 |     void Reset() override;
39 |     float GetConfidence() override;
40 |     void SetOpion() override {} // empty: does nothing
41 | 
42 | protected:
43 |     void GetDistribution(unsigned int aCharLen, const char *aStr);
44 | 
45 |     nsCodingStateMachine *mCodingSM; // created in the constructor, deleted in the destructor
46 |     nsProbingState mState; // current verdict, returned by GetState()
47 | 
48 |     // GB2312ContextAnalysis mContextAnalyser;
49 |     GB2312DistributionAnalysis mDistributionAnalyser;
50 |     // mLastChar[0] keeps the last byte of the previous HandleData() buffer and
51 |     // mLastChar[1] receives the first byte of the current one.
52 |     char mLastChar[2];
53 | };
54 | }
55 | 
56 | #endif /* nsGB2312Prober_h__ */
57 | 


--------------------------------------------------------------------------------
/src/probers/nsHebrewProber.cpp:
--------------------------------------------------------------------------------
  1 | /*  -*- C++ -*-
  2 |     SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
  3 | 
  4 |     SPDX-License-Identifier: MIT
  5 | */
  6 | 
  7 | #include "nsHebrewProber.h"
  8 | #include <stdio.h>
  9 | 
 10 | // windows-1255 / ISO-8859-8 code points of interest
 11 | #define FINAL_KAF ('\xea')
 12 | #define NORMAL_KAF ('\xeb')
 13 | #define FINAL_MEM ('\xed')
 14 | #define NORMAL_MEM ('\xee')
 15 | #define FINAL_NUN ('\xef')
 16 | #define NORMAL_NUN ('\xf0')
 17 | #define FINAL_PE ('\xf3')
 18 | #define NORMAL_PE ('\xf4')
 19 | #define FINAL_TSADI ('\xf5')
 20 | #define NORMAL_TSADI ('\xf6')
 21 | 
 22 | // Minimum Visual vs Logical final letter score difference.
 23 | // If the difference is below this, don't rely solely on the final letter score distance.
 24 | #define MIN_FINAL_CHAR_DISTANCE (5)
 25 | 
 26 | // Minimum Visual vs Logical model score difference.
 27 | // If the difference is below this, don't rely at all on the model score distance.
 28 | #define MIN_MODEL_DISTANCE (0.01)
 29 | 
 30 | #define VISUAL_HEBREW_NAME ("ISO-8859-8")
 31 | #define LOGICAL_HEBREW_NAME ("windows-1255")
 32 | 
 33 | namespace kencodingprober
 34 | {
 35 | bool nsHebrewProber::isFinal(char c)
 36 | {
 37 |     return c == FINAL_TSADI || c == FINAL_PE || c == FINAL_NUN || c == FINAL_MEM || c == FINAL_KAF; // any final-form letter
 38 | }
 39 | 
 40 | bool nsHebrewProber::isNonFinal(char c)
 41 | {
 42 |     // The normal Tsadi is left out on purpose: in words like 'lechotet'
 43 |     // (to chat) it is followed by an apostrophe, which
 44 |     // FilterWithoutEnglishLetters turns into a space, so the Non-Final tsadi
 45 |     // would seem to end a word even though it does not in the original
 46 |     // text.
 47 |     // Pe and Kaf occasionally show a related weakness: words like 'Pop',
 48 |     // 'Winamp' and 'Mubarak' legally end with a Non-Final Pe or Kaf.
 49 |     // Such words are quite rare, however, so the benefit of treating these
 50 |     // letters as Non-Final letters outweighs the damage and they stay in
 51 |     // the set.
 52 |     return c == NORMAL_PE || c == NORMAL_NUN || c == NORMAL_MEM || c == NORMAL_KAF;
 53 | }
 54 | 
 55 | /** HandleData
 56 |  * Final letter analysis for logical-visual decision.
 57 |  * Look for evidence that the received buffer is either logical Hebrew or
 58 |  * visual Hebrew.
 59 |  * The following cases are checked:
 60 |  * 1) A word longer than 1 letter, ending with a final letter. This is an
 61 |  *    indication that the text is laid out "naturally" since the final letter
 62 |  *    really appears at the end. +1 for logical score.
 63 |  * 2) A word longer than 1 letter, ending with a Non-Final letter. In normal
 64 |  *    Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi, should not end with
 65 |  *    the Non-Final form of that letter. Exceptions to this rule are mentioned
 66 |  *    above in isNonFinal(). This is an indication that the text is laid out
 67 |  *    backwards. +1 for visual score
 68 |  * 3) A word longer than 1 letter, starting with a final letter. Final letters
 69 |  *    should not appear at the beginning of a word. This is an indication that
 70 |  *    the text is laid out backwards. +1 for visual score.
 71 |  *
 72 |  * The visual score and logical score are accumulated throughout the text and
 73 |  * are finally checked against each other in GetCharSetName().
 74 |  * No checking for final letters in the middle of words is done since that case
 75 |  * is not an indication for either Logical or Visual text.
 76 |  *
 77 |  * The input buffer should not contain any white spaces that are not (' ')
 78 |  * or any low-ascii punctuation marks.
 79 |  */
 80 | nsProbingState nsHebrewProber::HandleData(const char *aBuf, unsigned int aLen)
 81 | {
 82 |     // Once both model probers have answered eNotMe there is nothing left to decide.
 83 |     if (GetState() == eNotMe) {
 84 |         return eNotMe;
 85 |     }
 86 | 
 87 |     for (unsigned int pos = 0; pos < aLen; ++pos) {
 88 |         const char current = aBuf[pos];
 89 |         const bool wordEnded = (current == ' '); // standing on a space - a word just ended
 90 |         const bool afterSpace = (mBeforePrev == ' '); // the character two positions back is a space
 91 | 
 92 |         if (wordEnded && !afterSpace) { // mPrev ends a word and is not a 1 letter word
 93 |             if (isFinal(mPrev)) {
 94 |                 ++mFinalCharLogicalScore; // case (1) [-2:not space][-1:final letter][cur:space]
 95 |             } else if (isNonFinal(mPrev)) {
 96 |                 ++mFinalCharVisualScore; // case (2) [-2:not space][-1:Non-Final letter][cur:space]
 97 |             }
 98 |         } else if (!wordEnded && afterSpace && isFinal(mPrev)) {
 99 |             ++mFinalCharVisualScore; // case (3) [-2:space][-1:final letter][cur:not space]
100 |         }
101 | 
102 |         // Slide the two-character history; it is kept in members, so it
103 |         // carries over into the next call.
104 |         mBeforePrev = mPrev;
105 |         mPrev = current;
106 |     }
107 | 
108 |     // Keep detecting until the data ends or both model probers return
109 |     // eNotMe (handled at the top).
110 |     return eDetecting;
111 | }
112 | 
113 | // Decide between the Logical and the Visual charset name.
114 | const char *nsHebrewProber::GetCharSetName()
115 | {
116 |     // Step 1: a final-letter score lead of at least MIN_FINAL_CHAR_DISTANCE
117 |     // in either direction settles the question on its own.
118 |     const int finalLead = mFinalCharLogicalScore - mFinalCharVisualScore;
119 |     if (finalLead >= MIN_FINAL_CHAR_DISTANCE) {
120 |         return LOGICAL_HEBREW_NAME;
121 |     } else if (finalLead <= -(MIN_FINAL_CHAR_DISTANCE)) {
122 |         return VISUAL_HEBREW_NAME;
123 |     }
124 | 
125 |     // Step 2: the lead is not dominant enough, so compare the confidences
126 |     // of the two model probers instead.
127 |     const float modelLead = mLogicalProb->GetConfidence() - mVisualProb->GetConfidence();
128 |     if (modelLead > MIN_MODEL_DISTANCE) {
129 |         return LOGICAL_HEBREW_NAME;
130 |     } else if (modelLead < -(MIN_MODEL_DISTANCE)) {
131 |         return VISUAL_HEBREW_NAME;
132 |     }
133 | 
134 |     // Step 3: still undecided, so fall back to the sign of the final-letter
135 |     // lead. A negative lead means Visual; a positive lead, or no lead at
136 |     // all, defaults to Logical.
137 |     if (finalLead < 0) {
138 |         return VISUAL_HEBREW_NAME;
139 |     }
140 |     return LOGICAL_HEBREW_NAME;
141 | }
142 | 
143 | void nsHebrewProber::Reset(void)
144 | {
145 |     // Simulate a word delimiter in front of the data: both history
146 |     // characters start out as spaces.
147 |     mBeforePrev = ' ';
148 |     mPrev = ' ';
149 |     // Neither directionality has collected any evidence yet.
150 |     mFinalCharVisualScore = 0;
151 |     mFinalCharLogicalScore = 0;
152 | }
153 | 
154 | nsProbingState nsHebrewProber::GetState(void)
155 | {
156 |     // Report eNotMe only after both model probers have given up;
157 |     // while at least one of them is still active, keep detecting.
158 |     const bool bothGaveUp = (mLogicalProb->GetState() == eNotMe) && (mVisualProb->GetState() == eNotMe);
159 | 
160 |     return bothGaveUp ? eNotMe : eDetecting;
161 | }
162 | 
163 | #ifdef DEBUG_PROBE // debug-only diagnostics
164 | void nsHebrewProber::DumpStatus() // prints the logical score first, then the visual score
165 | {
166 |     printf("  HEB: %d - %d [Logical-Visual score]\r\n", mFinalCharLogicalScore, mFinalCharVisualScore);
167 | }
168 | #endif // DEBUG_PROBE
169 | }
170 | 


--------------------------------------------------------------------------------
/src/probers/nsHebrewProber.h:
--------------------------------------------------------------------------------
  1 | /*  -*- C++ -*-
  2 |     SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
  3 | 
  4 |     SPDX-License-Identifier: MIT
  5 | */
  6 | 
  7 | #ifndef nsHebrewProber_h__
  8 | #define nsHebrewProber_h__
  9 | 
 10 | #include "nsSBCharSetProber.h"
 11 | namespace kencodingprober
 12 | {
 13 | // This prober doesn't actually recognize a language or a charset.
 14 | // It is a helper prober for the use of the Hebrew model probers
 15 | class KCODECS_NO_EXPORT nsHebrewProber : public nsCharSetProber
 16 | {
 17 | public:
 18 |     nsHebrewProber()
 19 |         : mLogicalProb(nullptr)
 20 |         , mVisualProb(nullptr)
 21 |     {
 22 |         Reset(); // scores to zero, character history to spaces
 23 |     }
 24 |     ~nsHebrewProber() override {}
 25 | 
 26 |     // Gathers final-letter evidence from aBuf (defined in nsHebrewProber.cpp).
 27 |     nsProbingState HandleData(const char *aBuf, unsigned int aLen) override;
 28 |     // Picks the Logical or the Visual Hebrew charset name.
 29 |     const char *GetCharSetName() override;
 30 |     void Reset() override;
 31 |     nsProbingState GetState() override;
 32 | 
 33 |     // Always zero: this helper never identifies a charset by itself.
 34 |     float GetConfidence() override
 35 |     {
 36 |         return 0.0f;
 37 |     }
 38 |     void SetOpion() override {}
 39 | 
 40 |     // Hands over the two model probers whose state and confidence are consulted.
 41 |     void SetModelProbers(nsCharSetProber *logicalPrb, nsCharSetProber *visualPrb)
 42 |     {
 43 |         mLogicalProb = logicalPrb;
 44 |         mVisualProb = visualPrb;
 45 |     }
 46 | 
 47 | #ifdef DEBUG_PROBE
 48 |     void DumpStatus() override;
 49 | #endif
 50 | 
 51 | protected:
 52 |     static bool isFinal(char c);
 53 |     static bool isNonFinal(char c);
 54 | 
 55 |     // Running evidence counters for logical and for visual directionality.
 56 |     int mFinalCharLogicalScore, mFinalCharVisualScore;
 57 | 
 58 |     // The two most recent characters; they persist across HandleData() calls.
 59 |     char mPrev, mBeforePrev;
 60 | 
 61 |     // Not owned here: these probers are owned by the group prober.
 62 |     nsCharSetProber *mLogicalProb, *mVisualProb;
 63 | };
 64 | }
 65 | 
 66 | /**
 67 |  * ** General ideas of the Hebrew charset recognition **
 68 |  *
 69 |  * Four main charsets exist in Hebrew:
 70 |  * "ISO-8859-8" - Visual Hebrew
 71 |  * "windows-1255" - Logical Hebrew
 72 |  * "ISO-8859-8-I" - Logical Hebrew
 73 |  * "x-mac-hebrew" - ?? Logical Hebrew ??
 74 |  *
 75 |  * Both "ISO" charsets use a completely identical set of code points, whereas
 76 |  * "windows-1255" and "x-mac-hebrew" are two different proper supersets of
 77 |  * these code points. windows-1255 defines additional characters in the range
 78 |  * 0x80-0x9F as some misc punctuation marks as well as some Hebrew-specific
 79 |  * diacritics and additional 'Yiddish' ligature letters in the range 0xc0-0xd6.
 80 |  * x-mac-hebrew defines similar additional code points but with a different
 81 |  * mapping.
 82 |  *
 83 |  * As far as an average Hebrew text with no diacritics is concerned, all four
 84 |  * charsets are identical with respect to code points. Meaning that for the
 85 |  * main Hebrew alphabet, all four map the same values to all 27 Hebrew letters
 86 |  * (including final letters).
 87 |  *
 88 |  * The dominant difference between these charsets is their directionality.
 89 |  * "Visual" directionality means that the text is ordered as if the renderer is
 90 |  * not aware of a BIDI rendering algorithm. The renderer sees the text and
 91 |  * draws it from left to right. The text itself when ordered naturally is read
 92 |  * backwards. A buffer of Visual Hebrew generally looks like so:
 93 |  * "[last word of first line spelled backwards] [whole line ordered backwards
 94 |  * and spelled backwards] [first word of first line spelled backwards]
 95 |  * [end of line] [last word of second line] ... etc' "
 96 |  * adding punctuation marks, numbers and English text to visual text is
 97 |  * naturally also "visual" and from left to right.
 98 |  *
 99 |  * "Logical" directionality means the text is ordered "naturally" according to
100 |  * the order it is read. It is the responsibility of the renderer to display
101 |  * the text from right to left. A BIDI algorithm is used to place general
102 |  * punctuation marks, numbers and English text in the text.
103 |  *
104 |  * Texts in x-mac-hebrew are almost impossible to find on the Internet. From
105 |  * what little evidence I could find, it seems that its general directionality
106 |  * is Logical.
107 |  *
108 |  * To sum up all of the above, the Hebrew probing mechanism knows about two
109 |  * charsets:
110 |  * Visual Hebrew - "ISO-8859-8" - backwards text - Words and sentences are
111 |  *    backwards while line order is natural. For charset recognition purposes
112 |  *    the line order is unimportant (In fact, for this implementation, even
113 |  *    word order is unimportant).
114 |  * Logical Hebrew - "windows-1255" - normal, naturally ordered text.
115 |  *
116 |  * "ISO-8859-8-I" is a subset of windows-1255 and doesn't need to be
117 |  *    specifically identified.
118 |  * "x-mac-hebrew" is also identified as windows-1255. A text in x-mac-hebrew
119 |  *    that contains special punctuation marks or diacritics is displayed with
120 |  *    some unconverted characters showing as question marks. This problem might
121 |  *    be corrected using another model prober for x-mac-hebrew. Due to the fact
122 |  *    that x-mac-hebrew texts are so rare, writing another model prober isn't
123 |  *    worth the effort and performance hit.
124 |  *
125 |  * *** The Prober ***
126 |  *
127 |  * The prober is divided between two nsSBCharSetProbers and an nsHebrewProber,
128 |  * all of which are managed, created, fed data, inquired and deleted by the
129 |  * nsSBCSGroupProber. The two nsSBCharSetProbers identify that the text is in
130 |  * fact some kind of Hebrew, Logical or Visual. The final decision between the
131 |  * two is made by the nsHebrewProber by combining final-letter scores
132 |  * with the scores of the two nsSBCharSetProbers to produce a final answer.
133 |  *
134 |  * The nsSBCSGroupProber is responsible for stripping the original text of HTML
135 |  * tags, English characters, numbers, low-ASCII punctuation characters, spaces
136 |  * and new lines. It reduces any sequence of such characters to a single space.
137 |  * The buffer fed to each prober in the SBCS group prober is pure text in
138 |  * high-ASCII.
139 |  * The two nsSBCharSetProbers (model probers) share the same language model:
140 |  * Win1255Model.
141 |  * The first nsSBCharSetProber uses the model normally as any other
142 |  * nsSBCharSetProber does, to recognize windows-1255, upon which this model was
143 |  * built. The second nsSBCharSetProber is told to make the pair-of-letter
144 |  * lookup in the language model backwards. This in practice exactly simulates
145 |  * a visual Hebrew model using the windows-1255 logical Hebrew model.
146 |  *
147 |  * The nsHebrewProber is not using any language model. All it does is look for
148 |  * final-letter evidence suggesting the text is either logical Hebrew or visual
149 |  * Hebrew. Disjointed from the model probers, the results of the nsHebrewProber
150 |  * alone are meaningless. nsHebrewProber always returns 0.00 as confidence
151 |  * since it never identifies a charset by itself. Instead, the pointer to the
152 |  * nsHebrewProber is passed to the model probers as a helper "Name Prober".
153 |  * When the Group prober receives a positive identification from any prober,
154 |  * it asks for the name of the charset identified. If the prober queried is a
155 |  * Hebrew model prober, the model prober forwards the call to the
156 |  * nsHebrewProber to make the final decision. In the nsHebrewProber, the
157 |  * decision is made according to the final-letter scores maintained and both
158 |  * model probers' scores. The answer is returned in the form of the name of the
159 |  * charset identified, either "windows-1255" or "ISO-8859-8".
160 |  *
161 |  */
162 | #endif /* nsHebrewProber_h__ */
163 | 


--------------------------------------------------------------------------------
/src/probers/nsLatin1Prober.cpp:
--------------------------------------------------------------------------------
  1 | /*  -*- C++ -*-
  2 |     SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
  3 | 
  4 |     SPDX-License-Identifier: MIT
  5 | */
  6 | 
  7 | #include "nsLatin1Prober.h"
  8 | #include <stdio.h>
  9 | #include <stdlib.h>
 10 | 
 11 | #define UDF 0 // undefined (every pair involving it is marked illegal in Latin1ClassModel)
 12 | #define OTH 1 // other
 13 | #define ASC 2 // ascii capital letter
 14 | #define ASS 3 // ascii small letter
 15 | #define ACV 4 // accent capital vowel
 16 | #define ACO 5 // accent capital other
 17 | #define ASV 6 // accent small vowel
 18 | #define ASO 7 // accent small other
 19 | #define CLASS_NUM 8 // total classes; also the row width of Latin1ClassModel
 20 | 
 21 | namespace kencodingprober
 22 | {
 23 | static const unsigned char Latin1_CharToClass[] = { // character class of each byte value 0x00-0xFF
 24 |     OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 00 - 07
 25 |     OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 08 - 0F
 26 |     OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 10 - 17
 27 |     OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 18 - 1F
 28 |     OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 20 - 27
 29 |     OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 28 - 2F
 30 |     OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 30 - 37
 31 |     OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 38 - 3F
 32 |     OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 40 - 47
 33 |     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 48 - 4F
 34 |     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 50 - 57
 35 |     ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH, // 58 - 5F
 36 |     OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 60 - 67
 37 |     ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 68 - 6F
 38 |     ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 70 - 77
 39 |     ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH, // 78 - 7F
 40 |     OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH, // 80 - 87
 41 |     OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF, // 88 - 8F
 42 |     UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 90 - 97
 43 |     OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO, // 98 - 9F
 44 |     OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // A0 - A7
 45 |     OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // A8 - AF
 46 |     OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // B0 - B7
 47 |     OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // B8 - BF
 48 |     ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO, // C0 - C7
 49 |     ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV, // C8 - CF
 50 |     ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH, // D0 - D7
 51 |     ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO, // D8 - DF
 52 |     ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO, // E0 - E7
 53 |     ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV, // E8 - EF
 54 |     ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH, // F0 - F7
 55 |     ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO, // F8 - FF
 56 | };
 57 | 
 58 | /* Likelihood of a class pair, looked up as [previous class * CLASS_NUM + current class]. 0 : illegal
 59 |    1 : very unlikely
 60 |    2 : normal
 61 |    3 : very likely
 62 | */
 63 | static const unsigned char Latin1ClassModel[] = {
 64 |     /*      UDF OTH ASC ASS ACV ACO ASV ASO  */
 65 |     /*UDF*/ 0, 0, 0, 0, 0, 0, 0, 0,
 66 |     /*OTH*/ 0, 3, 3, 3, 3, 3, 3, 3,
 67 |     /*ASC*/ 0, 3, 3, 3, 3, 3, 3, 3,
 68 |     /*ASS*/ 0, 3, 3, 3, 1, 1, 3, 3,
 69 |     /*ACV*/ 0, 3, 3, 3, 1, 2, 1, 2,
 70 |     /*ACO*/ 0, 3, 3, 3, 3, 3, 3, 3,
 71 |     /*ASV*/ 0, 3, 1, 3, 1, 1, 1, 3,
 72 |     /*ASO*/ 0, 3, 1, 3, 1, 1, 3, 3,
 73 | };
 74 | 
 75 | void nsLatin1Prober::Reset(void)
 76 | {
 77 |     for (unsigned int &counter : mFreqCounter) {
 78 |         counter = 0; // forget every class pair counted so far
 79 |     }
 80 |     mLastCharClass = OTH; // behave as if the previous character was of class "other"
 81 |     mState = eDetecting;
 82 | }
 83 | 
 84 | nsProbingState nsLatin1Prober::HandleData(const char *aBuf, unsigned int aLen)
 85 | {
 86 |     char *filtered = nullptr;
 87 |     unsigned int filteredLen = 0;
 88 |     // When the filter reports failure, analyse the caller's buffer as it is.
 89 |     if (!FilterWithEnglishLetters(aBuf, aLen, &filtered, filteredLen)) {
 90 |         filtered = (char *)aBuf;
 91 |         filteredLen = aLen;
 92 |     }
 93 | 
 94 |     for (unsigned int pos = 0; pos < filteredLen; ++pos) {
 95 |         const unsigned char cls = Latin1_CharToClass[(unsigned char)filtered[pos]];
 96 |         const unsigned char likelihood = Latin1ClassModel[mLastCharClass * CLASS_NUM + cls];
 97 |         if (likelihood == 0) { // illegal class pair: rule this prober out
 98 |             mState = eNotMe;
 99 |             break;
100 |         }
101 |         ++mFreqCounter[likelihood];
102 |         mLastCharClass = cls;
103 |     }
104 | 
105 |     // Release the buffer only when it is not the caller's own.
106 |     if (filtered != aBuf) {
107 |         free(filtered);
108 |     }
109 |     return mState;
110 | }
111 | 
112 | float nsLatin1Prober::GetConfidence(void)
113 | {
114 |     // A prober that already ruled itself out reports a fixed, tiny value.
115 |     if (mState == eNotMe) {
116 |         return 0.01f;
117 |     }
118 | 
119 |     unsigned int total = 0;
120 |     for (unsigned int count : mFreqCounter) {
121 |         total += count;
122 |     }
123 | 
124 |     float confidence = 0.0f;
125 |     if (total != 0) {
126 |         // Share of "very likely" pairs, minus a 20x penalty for "very unlikely" ones.
127 |         confidence = mFreqCounter[3] * 1.0f / total;
128 |         confidence -= mFreqCounter[1] * 20.0f / total;
129 |     }
130 | 
131 |     if (confidence < 0.0f) {
132 |         confidence = 0.0f;
133 |     }
134 | 
135 |     // Halve the confidence of latin1 so that other, more accurate detectors
136 |     // can take priority.
137 |     confidence *= 0.50f;
138 | 
139 |     return confidence;
140 | }
141 | 
142 | #ifdef DEBUG_PROBE // debug-only diagnostics
143 | void nsLatin1Prober::DumpStatus() // prints the current confidence and the charset name
144 | {
145 |     printf(" Latin1Prober: %1.3f [%s]\r\n", GetConfidence(), GetCharSetName());
146 | }
147 | #endif // DEBUG_PROBE
148 | }
149 | 


--------------------------------------------------------------------------------
/src/probers/nsLatin1Prober.h:
--------------------------------------------------------------------------------
 1 | /*  -*- C++ -*-
 2 |     SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
 3 | 
 4 |     SPDX-License-Identifier: MIT
 5 | */
 6 | 
 7 | #ifndef nsLatin1Prober_h__
 8 | #define nsLatin1Prober_h__
 9 | 
10 | #include "nsCharSetProber.h"
11 | 
12 | #define FREQ_CAT_NUM 4
13 | namespace kencodingprober
14 | {
15 | class KCODECS_NO_EXPORT nsLatin1Prober : public nsCharSetProber
16 | {
17 | public:
18 |     nsLatin1Prober()
19 |     {
20 |         Reset();
21 |     }
22 |     ~nsLatin1Prober() override {}
23 | 
24 |     // Classifies the input bytes and tallies how likely successive class pairs are.
25 |     nsProbingState HandleData(const char *aBuf, unsigned int aLen) override;
26 |     // This prober always reports the same charset name.
27 |     const char *GetCharSetName() override
28 |     {
29 |         return "windows-1252";
30 |     }
31 |     nsProbingState GetState() override
32 |     {
33 |         return mState;
34 |     }
35 |     void Reset() override;
36 |     float GetConfidence() override;
37 |     void SetOpion() override {}
38 | 
39 | #ifdef DEBUG_PROBE
40 |     void DumpStatus() override;
41 | #endif
42 | 
43 | protected:
44 |     nsProbingState mState;
45 |     char mLastCharClass; // class of the previously analysed character
46 |     // Number of class pairs seen so far, per likelihood category.
47 |     unsigned int mFreqCounter[FREQ_CAT_NUM];
48 | };
49 | }
50 | 
51 | #endif /* nsLatin1Prober_h__ */
52 | 


--------------------------------------------------------------------------------
/src/probers/nsMBCSGroupProber.cpp:
--------------------------------------------------------------------------------
  1 | /*  -*- C++ -*-
  2 |     SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
  3 | 
  4 |     SPDX-License-Identifier: MIT
  5 | */
  6 | 
  7 | #include "nsMBCSGroupProber.h"
  8 | 
  9 | #include <stdio.h>
 10 | #include <stdlib.h>
 11 | 
 12 | namespace kencodingprober
 13 | {
 14 | #ifdef DEBUG_PROBE // labels used only by DumpStatus()
 15 | static const char *const ProberName[] = { // same order as the mProbers[] slots filled in the constructor
 16 |     "Unicode",
 17 |     "SJIS",
 18 |     "EUCJP",
 19 |     "GB18030",
 20 |     "EUCKR",
 21 |     "Big5",
 22 | };
 23 | 
 24 | #endif
 25 | 
 26 | nsMBCSGroupProber::nsMBCSGroupProber()
 27 | { // slot order matters: DumpStatus() labels slot i with ProberName[i]
 28 |     mProbers[0] = new UnicodeGroupProber();
 29 |     mProbers[1] = new nsSJISProber();
 30 |     mProbers[2] = new nsEUCJPProber();
 31 |     mProbers[3] = new nsGB18030Prober();
 32 |     mProbers[4] = new nsEUCKRProber();
 33 |     mProbers[5] = new nsBig5Prober();
 34 |     Reset(); // marks every non-null slot active and clears the best guess
 35 | }
 36 | 
 37 | nsMBCSGroupProber::~nsMBCSGroupProber()
 38 | {
 39 |     for (nsCharSetProber *prober : mProbers) {
 40 |         delete prober; // the group created every member prober in its constructor
 41 |     }
 42 | }
 43 | 
 44 | const char *nsMBCSGroupProber::GetCharSetName()
 45 | {
 46 |     if (mBestGuess == -1) {
 47 |         GetConfidence(); // may record a best guess as a side effect
 48 |     }
 49 |     if (mBestGuess == -1) {
 50 |         mBestGuess = 0; // still undecided: fall back to the first prober
 51 |     }
 52 |     return mProbers[mBestGuess]->GetCharSetName();
 53 | }
 54 | 
 55 | void nsMBCSGroupProber::Reset(void)
 56 | {
 57 |     mState = eDetecting;
 58 |     mBestGuess = -1; // no candidate yet
 59 |     mActiveNum = 0;
 60 |     for (unsigned int idx = 0; idx < NUM_OF_PROBERS; ++idx) {
 61 |         nsCharSetProber *const prober = mProbers[idx];
 62 |         mIsActive[idx] = (prober != nullptr);
 63 |         if (!mIsActive[idx]) {
 64 |             continue; // an empty slot stays inactive
 65 |         }
 66 |         prober->Reset();
 67 |         ++mActiveNum;
 68 |     }
 69 | }
 70 | 
 71 | nsProbingState nsMBCSGroupProber::HandleData(const char *aBuf, unsigned int aLen)
 72 | {
 73 |     // Reduce the load on the probers: keep only the bytes with the high bit
 74 |     // set, plus the single byte that directly follows such a byte.
 75 |     char *const filtered = (char *)malloc(aLen);
 76 |     if (!filtered) {
 77 |         return mState;
 78 |     }
 79 | 
 80 |     // Starts out true, i.e. the byte before the buffer is assumed not to be
 81 |     // ASCII; this does no harm except adding some noise.
 82 |     bool keepNext = true;
 83 |     char *out = filtered;
 84 |     for (unsigned int pos = 0; pos < aLen; ++pos) {
 85 |         const char ch = aBuf[pos];
 86 |         const bool highBit = (ch & 0x80) != 0;
 87 |         if (highBit || keepNext) {
 88 |             *out++ = ch;
 89 |         }
 90 |         keepNext = highBit;
 91 |     }
 92 |     const unsigned int filteredLen = out - filtered;
 93 | 
 94 |     for (unsigned int idx = 0; idx < NUM_OF_PROBERS; ++idx) {
 95 |         if (!mIsActive[idx]) {
 96 |             continue;
 97 |         }
 98 |         const nsProbingState verdict = mProbers[idx]->HandleData(filtered, filteredLen);
 99 |         if (verdict == eFoundIt) {
100 |             // A definite answer ends the search.
101 |             mBestGuess = idx;
102 |             mState = eFoundIt;
103 |             break;
104 |         }
105 |         if (verdict != eNotMe) {
106 |             continue;
107 |         }
108 |         // This prober ruled itself out; the whole group fails once none is left.
109 |         mIsActive[idx] = false;
110 |         --mActiveNum;
111 |         if (mActiveNum == 0) {
112 |             mState = eNotMe;
113 |             break;
114 |         }
115 |     }
116 | 
117 |     free(filtered);
118 |     return mState;
119 | }
120 | 
121 | float nsMBCSGroupProber::GetConfidence(void)
122 | {
123 |     // A settled state maps to a fixed confidence.
124 |     if (mState == eFoundIt) {
125 |         return static_cast<float>(0.99);
126 |     }
127 |     if (mState == eNotMe) {
128 |         return static_cast<float>(0.01);
129 |     }
130 | 
131 |     // Still detecting: report the highest confidence among the active
132 |     // probers and remember which prober produced it.
133 |     float highest = 0.0;
134 |     for (unsigned int idx = 0; idx < NUM_OF_PROBERS; ++idx) {
135 |         if (mIsActive[idx]) {
136 |             const float candidate = mProbers[idx]->GetConfidence();
137 |             if (highest < candidate) {
138 |                 highest = candidate;
139 |                 mBestGuess = idx;
140 |             }
141 |         }
142 |     }
143 | 
144 |     return highest;
145 | }
146 | 
147 | #ifdef DEBUG_PROBE
148 | void nsMBCSGroupProber::DumpStatus()
149 | {
150 |     // Called for its side effect: while detecting it updates the best guess.
151 |     GetConfidence();
152 | 
153 |     for (unsigned int idx = 0; idx < NUM_OF_PROBERS; ++idx) {
154 |         if (mIsActive[idx]) {
155 |             const float confidence = mProbers[idx]->GetConfidence();
156 |             printf("  MBCS %1.3f: [%s]\r\n", confidence, ProberName[idx]);
157 |             continue;
158 |         }
159 |         // Inactive probers are listed by name only.
160 |         printf("  MBCS inactive: [%s] (confidence is too low).\r\n", ProberName[idx]);
161 |     }
162 | }
163 | #endif
164 | }
165 | 


--------------------------------------------------------------------------------
/src/probers/nsMBCSGroupProber.h:
--------------------------------------------------------------------------------
 1 | /*  -*- C++ -*-
 2 |     SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
 3 | 
 4 |     SPDX-License-Identifier: MIT
 5 | */
 6 | 
 7 | #ifndef nsMBCSGroupProber_h__
 8 | #define nsMBCSGroupProber_h__
 9 | 
10 | #include "UnicodeGroupProber.h"
11 | #include "nsBig5Prober.h"
12 | #include "nsEUCJPProber.h"
13 | #include "nsEUCKRProber.h"
14 | #include "nsGB2312Prober.h"
15 | #include "nsSJISProber.h"
16 | 
17 | #define NUM_OF_PROBERS 6
18 | namespace kencodingprober
19 | {
20 | class KCODECS_NO_EXPORT nsMBCSGroupProber : public nsCharSetProber
21 | {
22 | public:
23 |     nsMBCSGroupProber(); // creates the member probers
24 |     ~nsMBCSGroupProber() override; // deletes them again
25 | 
26 |     // Feeds a filtered copy of the buffer to every active prober.
27 |     nsProbingState HandleData(const char *aBuf, unsigned int aLen) override;
28 |     const char *GetCharSetName() override;
29 |     nsProbingState GetState() override
30 |     {
31 |         return mState;
32 |     }
33 |     void Reset() override;
34 |     float GetConfidence() override;
35 |     void SetOpion() override {}
36 | 
37 | #ifdef DEBUG_PROBE
38 |     void DumpStatus() override;
39 | #endif
40 | 
41 | protected:
42 |     nsProbingState mState;
43 |     nsCharSetProber *mProbers[NUM_OF_PROBERS]; // owned; deleted in the destructor
44 |     bool mIsActive[NUM_OF_PROBERS]; // cleared once a prober answers eNotMe
45 |     int mBestGuess; // index into mProbers, or -1 while undecided
46 |     unsigned int mActiveNum; // number of probers still active
47 | };
48 | }
49 | 
50 | #endif /* nsMBCSGroupProber_h__ */
51 | 


--------------------------------------------------------------------------------
/src/probers/nsPkgInt.h:
--------------------------------------------------------------------------------
 1 | /*  -*- C++ -*-
 2 |     SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
 3 | 
 4 |     SPDX-License-Identifier: MIT
 5 | */
 6 | 
 7 | #ifndef nsPkgInt_h__
 8 | #define nsPkgInt_h__
 9 | 
10 | namespace kencodingprober
11 | {
12 | typedef enum { // right shift GETFROMPCK applies to a unit index to select the data word
13 |     eIdxSft4bits = 3,
14 |     eIdxSft8bits = 2,
15 |     eIdxSft16bits = 1,
16 | } nsIdxSft;
17 | 
18 | typedef enum { // mask GETFROMPCK applies to a unit index to get the position inside the word
19 |     eSftMsk4bits = 7,
20 |     eSftMsk8bits = 3,
21 |     eSftMsk16bits = 1,
22 | } nsSftMsk;
23 | 
24 | typedef enum { // left shift that turns the position inside the word into a bit offset
25 |     eBitSft4bits = 2,
26 |     eBitSft8bits = 3,
27 |     eBitSft16bits = 4,
28 | } nsBitSft;
29 | 
30 | typedef enum { // mask that isolates one unit after the word has been shifted down
31 |     eUnitMsk4bits = 0x0000000FL,
32 |     eUnitMsk8bits = 0x000000FFL,
33 |     eUnitMsk16bits = 0x0000FFFFL,
34 | } nsUnitMsk;
35 | 
36 | typedef struct nsPkgInt { // packed table descriptor read by GETFROMPCK
37 |     nsIdxSft idxsft;
38 |     nsSftMsk sftmsk;
39 |     nsBitSft bitsft;
40 |     nsUnitMsk unitmsk;
41 |     const unsigned int *data; // the packed words
42 | } nsPkgInt;
43 | }
44 | 
45 | #define PCK16BITS(a, b) ((unsigned int)(((b) << 16) | (a))) // b is shifted up by 16 bits, a is OR-ed in below it
46 | // PCK8BITS packs four values 8 bits apart, a in the lowest position.
47 | #define PCK8BITS(a, b, c, d) PCK16BITS(((unsigned int)(((b) << 8) | (a))), ((unsigned int)(((d) << 8) | (c))))
48 | // PCK4BITS packs eight values 4 bits apart, a in the lowest position.
49 | #define PCK4BITS(a, b, c, d, e, f, g, h)                                                                                                                       \
50 |     PCK8BITS(((unsigned int)(((b) << 4) | (a))), ((unsigned int)(((d) << 4) | (c))), ((unsigned int)(((f) << 4) | (e))), ((unsigned int)(((h) << 4) | (g))))
51 | // GETFROMPCK(i, c) reads unit number i from the packed words of c, where c is an nsPkgInt.
52 | #define GETFROMPCK(i, c) (((((c).data)[(i) >> (c).idxsft]) >> (((i) & (c).sftmsk) << (c).bitsft)) & (c).unitmsk)
53 | 
54 | #endif /* nsPkgInt_h__ */
55 | 


--------------------------------------------------------------------------------
/src/probers/nsSBCSGroupProber.cpp:
--------------------------------------------------------------------------------
  1 | /*  -*- C++ -*-
  2 |     SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
  3 | 
  4 |     SPDX-License-Identifier: MIT
  5 | */
  6 | 
  7 | #include "nsSBCSGroupProber.h"
  8 | 
  9 | #include "UnicodeGroupProber.h"
 10 | #include "nsHebrewProber.h"
 11 | #include "nsSBCharSetProber.h"
 12 | 
 13 | #include <stdio.h>
 14 | #include <stdlib.h>
 15 | 
 16 | namespace kencodingprober
 17 | {
 18 | nsSBCSGroupProber::nsSBCSGroupProber()
 19 | {
 20 |     mProbers[0] = new nsSingleByteCharSetProber(&Win1251Model);
 21 |     mProbers[1] = new nsSingleByteCharSetProber(&Koi8rModel);
 22 |     mProbers[2] = new nsSingleByteCharSetProber(&Latin5Model);
 23 |     mProbers[3] = new nsSingleByteCharSetProber(&MacCyrillicModel);
 24 |     mProbers[4] = new nsSingleByteCharSetProber(&Ibm866Model);
 25 |     mProbers[5] = new nsSingleByteCharSetProber(&Ibm855Model);
 26 |     mProbers[6] = new nsSingleByteCharSetProber(&Latin7Model);
 27 |     mProbers[7] = new nsSingleByteCharSetProber(&Win1253Model);
 28 |     mProbers[8] = new nsSingleByteCharSetProber(&Latin5BulgarianModel);
 29 |     mProbers[9] = new nsSingleByteCharSetProber(&Win1251BulgarianModel);
 30 | 
 31 |     // The Hebrew helper and its two model probers occupy slots 10, 11 and 12;
 32 |     // any change to these indexes must be reflected in the code below as well.
 33 |     nsHebrewProber *const hebrewHelper = new nsHebrewProber();
 34 |     mProbers[10] = hebrewHelper;
 35 |     mProbers[11] = new nsSingleByteCharSetProber(&Win1255Model, false, hebrewHelper); // Logical Hebrew
 36 |     mProbers[12] = new nsSingleByteCharSetProber(&Win1255Model, true, hebrewHelper); // Visual Hebrew
 37 |     mProbers[13] = new UnicodeGroupProber();
 38 | 
 39 |     const bool hebrewComplete = mProbers[10] && mProbers[11] && mProbers[12];
 40 |     if (hebrewComplete) { // tell the Hebrew prober about the logical and visual probers
 41 |         hebrewHelper->SetModelProbers(mProbers[11], mProbers[12]);
 42 |     } else { // one or more is null: avoid any Hebrew probing, null them all
 43 |         for (unsigned int slot = 10; slot <= 12; ++slot) {
 44 |             delete mProbers[slot];
 45 |             mProbers[slot] = nullptr;
 46 |         }
 47 |     }
 48 | 
 49 |     // latin2 stays disabled until latin1 is available, otherwise all latin1
 50 |     // would be detected as latin2 because of their similarity.
 51 |     // mProbers[10] = new nsSingleByteCharSetProber(&Latin2HungarianModel);
 52 |     // mProbers[11] = new nsSingleByteCharSetProber(&Win1250HungarianModel);
 53 | 
 54 |     Reset();
 55 | }
 56 | 
 57 | nsSBCSGroupProber::~nsSBCSGroupProber()
 58 | {
 59 |     for (unsigned int slot = 0; slot < NUM_OF_SBCS_PROBERS; ++slot) {
 60 |         delete mProbers[slot]; // deleting a slot that was nulled is a no-op
 61 |     }
 62 | }
 63 | 
 64 | const char *nsSBCSGroupProber::GetCharSetName()
 65 | {
 66 |     // Returns the charset name of the best-guess sub-prober.
 67 |     // When no guess is recorded yet, GetConfidence() is invoked purely for
 68 |     // its side effect of updating mBestGuess.
 69 |     if (mBestGuess == -1) {
 70 |         GetConfidence();
 71 |     }
 72 |     if (mBestGuess == -1) {
 73 |         // GetConfidence() picked nothing either: default to the first prober.
 74 |         mBestGuess = 0;
 75 |     }
 76 |     return mProbers[mBestGuess]->GetCharSetName();
 77 | }
 78 | 
 79 | void nsSBCSGroupProber::Reset(void)
 80 | {
 81 |     // Restart detection; only slots that hold a prober are reset and activated.
 82 |     mState = eDetecting;
 83 |     mBestGuess = -1;
 84 |     mActiveNum = 0;
 85 |     for (unsigned int slot = 0; slot < NUM_OF_SBCS_PROBERS; ++slot) {
 86 |         mIsActive[slot] = (mProbers[slot] != nullptr);
 87 |         if (!mIsActive[slot]) {
 88 |             continue; // empty slot: stays inactive and is not counted
 89 |         }
 90 |         mProbers[slot]->Reset();
 91 |         ++mActiveNum;
 92 |     }
 93 | }
 94 | 
 95 | nsProbingState nsSBCSGroupProber::HandleData(const char *aBuf, unsigned int aLen)
 96 | {
 97 |     // Feeds a filtered copy of the input to every active sub-prober.
 98 |     // The return value is mState, which only changes here when a sub-prober
 99 |     // reports eFoundIt or when the last active sub-prober reports eNotMe.
100 | 
101 |     char *filtered = nullptr;
102 |     unsigned int filteredLen = 0;
103 | 
104 |     // The buffer is filtered once for all sub-probers, without consulting each
105 |     // prober's KeepEnglishLetters: as of now, none of the probers here
106 |     // recognizes a language written with English characters.
107 |     const bool usable = FilterWithoutEnglishLetters(aBuf, aLen, &filtered, filteredLen) && filteredLen != 0;
108 | 
109 |     // If filtering failed or left nothing to look at, the loop never runs.
110 |     for (unsigned int idx = 0; usable && idx < NUM_OF_SBCS_PROBERS; ++idx) {
111 |         if (!mIsActive[idx]) {
112 |             continue;
113 |         }
114 |         const nsProbingState verdict = mProbers[idx]->HandleData(filtered, filteredLen);
115 |         if (verdict == eFoundIt) {
116 |             // One sub-prober is certain: remember it and stop feeding.
117 |             mBestGuess = idx;
118 |             mState = eFoundIt;
119 |             break;
120 |         }
121 |         if (verdict != eNotMe) {
122 |             continue;
123 |         }
124 |         // This sub-prober ruled itself out; take it out of rotation.
125 |         mIsActive[idx] = false;
126 |         mActiveNum--;
127 |         if (mActiveNum == 0) {
128 |             // No candidate is left, so the whole group answers "not me".
129 |             mState = eNotMe;
130 |             break;
131 |         }
132 |     }
133 | 
134 |     // filtered may still be null here; free() of a null pointer is a no-op.
135 |     free(filtered);
136 | 
137 | 
138 |     return mState;
139 | }
140 | 
141 | float nsSBCSGroupProber::GetConfidence(void)
142 | {
143 |     // A settled state maps to a fixed confidence value.
144 |     if (mState == eFoundIt) {
145 |         return (float)0.99; // sure yes
146 |     }
147 |     if (mState == eNotMe) {
148 |         return (float)0.01; // sure no
149 |     }
150 | 
151 |     // Otherwise report the highest confidence among the active sub-probers
152 |     // and remember in mBestGuess which one produced it.
153 |     // Ties keep the earlier prober because the comparison is strict.
154 |     float highest = 0.0;
155 |     for (unsigned int idx = 0; idx < NUM_OF_SBCS_PROBERS; ++idx) {
156 |         if (mIsActive[idx]) {
157 |             const float candidate = mProbers[idx]->GetConfidence();
158 |             if (candidate > highest) {
159 |                 highest = candidate;
160 |                 mBestGuess = idx;
161 |             }
162 |         }
163 |     }
164 |     return highest;
165 | }
166 | 
167 | #ifdef DEBUG_PROBE
168 | void nsSBCSGroupProber::DumpStatus()
169 | {
170 |     // Debug aid: prints every sub-prober's status followed by the best match.
171 |     // Empty slots and a missing best guess are skipped instead of dereferenced.
172 |     const float cf = GetConfidence(); // may also update mBestGuess
173 |     printf(" SBCS Group Prober --------begin status \r\n");
174 |     for (unsigned int i = 0; i < NUM_OF_SBCS_PROBERS; i++) {
175 |         if (mProbers[i] && mIsActive[i]) {
176 |             mProbers[i]->DumpStatus();
177 |         } else if (mProbers[i]) {
178 |             printf("  inactive: [%s] (i.e. confidence is too low).\r\n", mProbers[i]->GetCharSetName());
179 |         } // else: empty slot (Reset() allows null entries), nothing to print
180 |     }
181 |     const char *best = (mBestGuess >= 0 && mProbers[mBestGuess]) ? mProbers[mBestGuess]->GetCharSetName() : "(none)";
182 |     printf(" SBCS Group found best match [%s] confidence %f.\r\n", best, cf);
183 | }
184 | #endif
185 | }
186 | 


--------------------------------------------------------------------------------
/src/probers/nsSBCSGroupProber.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |     The Original Code is Mozilla Universal charset detector code.
 3 | 
 4 |     SPDX-FileCopyrightText: 2001 Netscape Communications Corporation
 5 |     SPDX-FileContributor: Shy Shalom <shooshX@gmail.com>
 6 | 
 7 |     SPDX-License-Identifier: MPL-1.1 OR GPL-2.0-or-later OR LGPL-2.1-or-later
 8 | */
 9 | 
10 | #ifndef nsSBCSGroupProber_h__
11 | #define nsSBCSGroupProber_h__
12 | 
13 | #include "nsCharSetProber.h"
14 | 
15 | #define NUM_OF_SBCS_PROBERS 14
16 | 
17 | namespace kencodingprober
18 | {
19 | class KCODECS_NO_EXPORT nsSBCSGroupProber : public nsCharSetProber // drives NUM_OF_SBCS_PROBERS sub-prober slots as a single prober
20 | {
21 | public:
22 |     nsSBCSGroupProber();
23 |     ~nsSBCSGroupProber() override;
24 |     nsProbingState HandleData(const char *aBuf, unsigned int aLen) override;
25 |     const char *GetCharSetName() override;
26 |     nsProbingState GetState(void) override // plain accessor for mState
27 |     {
28 |         return mState;
29 |     }
30 |     void Reset(void) override;
31 |     float GetConfidence(void) override;
32 |     void SetOpion() override // no-op for this prober
33 |     {
34 |     }
35 | 
36 | #ifdef DEBUG_PROBE
37 |     void DumpStatus() override;
38 | #endif
39 | 
40 | protected:
41 |     nsProbingState mState; // eDetecting after Reset(); HandleData() may move it to eFoundIt or eNotMe
42 |     nsCharSetProber *mProbers[NUM_OF_SBCS_PROBERS]; // sub-probers deleted by the destructor; a slot may be null
43 |     bool mIsActive[NUM_OF_SBCS_PROBERS]; // per slot: false for empty slots and for probers that answered eNotMe
44 |     int mBestGuess; // index into mProbers of the current best candidate, -1 while none is recorded
45 |     unsigned int mActiveNum; // number of slots whose mIsActive flag is set
46 | };
47 | }
48 | 
49 | #endif /* nsSBCSGroupProber_h__ */
50 | 


--------------------------------------------------------------------------------
/src/probers/nsSBCharSetProber.cpp:
--------------------------------------------------------------------------------
  1 | /*  -*- C++ -*-
  2 |     SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
  3 | 
  4 |     SPDX-License-Identifier: MIT
  5 | */
  6 | 
  7 | #include "nsSBCharSetProber.h"
  8 | 
  9 | #include <stdio.h>
 10 | 
 11 | namespace kencodingprober
 12 | {
 13 | nsProbingState nsSingleByteCharSetProber::HandleData(const char *aBuf, unsigned int aLen)
 14 | {
 15 |     // Accumulates character and character-pair statistics for every byte of
 16 |     // aBuf, then applies the shortcut thresholds once enough pairs were seen.
 17 |     for (unsigned int pos = 0; pos < aLen; ++pos) {
 18 |         const unsigned char order = mModel->charToOrderMap[(unsigned char)aBuf[pos]];
 19 |         const bool sampled = order < SAMPLE_SIZE;
 20 | 
 21 |         if (order < SYMBOL_CAT_ORDER) {
 22 |             mTotalChar++;
 23 |         }
 24 |         if (sampled) {
 25 |             mFreqChar++;
 26 |         }
 27 |         if (sampled && mLastOrder < SAMPLE_SIZE) {
 28 |             mTotalSeqs++;
 29 |             // A reversed prober looks the pair up with its two letters swapped.
 30 |             const int row = mReversed ? order : mLastOrder;
 31 |             const int col = mReversed ? mLastOrder : order;
 32 |             ++(mSeqCounters[(int)mModel->precedenceMatrix[row * SAMPLE_SIZE + col]]);
 33 |         }
 34 |         mLastOrder = order;
 35 |     }
 36 | 
 37 |     // Shortcut decisions are only taken while detecting and with enough pairs.
 38 |     if (mState == eDetecting && mTotalSeqs > SB_ENOUGH_REL_THRESHOLD) {
 39 |         const float cf = GetConfidence();
 40 |         if (cf > POSITIVE_SHORTCUT_THRESHOLD) {
 41 |             mState = eFoundIt;
 42 |         } else if (cf < NEGATIVE_SHORTCUT_THRESHOLD) {
 43 |             mState = eNotMe;
 44 |         }
 45 |     }
 46 | 
 47 |     return mState;
 48 | }
 49 | 
 50 | void nsSingleByteCharSetProber::Reset(void)
 51 | {
 52 |     mTotalChar = 0; // all statistics start from zero again
 53 |     mFreqChar = 0;
 54 |     mTotalSeqs = 0;
 55 |     for (unsigned int &counter : mSeqCounters) {
 56 |         counter = 0;
 57 |     }
 58 |     mLastOrder = 255; // not below SAMPLE_SIZE, so the first byte cannot complete a pair
 59 |     mState = eDetecting;
 60 | }
 61 | 
 62 | //#define NEGATIVE_APPROACH 1
 63 | 
 64 | float nsSingleByteCharSetProber::GetConfidence(void)
 65 | {
 66 | #ifdef NEGATIVE_APPROACH
 67 |     // Confidence shrinks with the number of pairs counted in the negative category.
 68 |     if (mTotalSeqs > 0 && mTotalSeqs > mSeqCounters[NEGATIVE_CAT] * 10) {
 69 |         return ((float)(mTotalSeqs - mSeqCounters[NEGATIVE_CAT] * 10)) / mTotalSeqs * mFreqChar / mTotalChar;
 70 |     }
 71 |     return (float)0.01;
 72 | #else // POSITIVE_APPROACH
 73 |     if (mTotalSeqs == 0) {
 74 |         return (float)0.01; // no character pair counted yet
 75 |     }
 76 |     // Share of pairs counted in the positive category, relative to the
 77 |     // model's typical positive ratio ...
 78 |     float ratio = ((float)1.0) * mSeqCounters[POSITIVE_CAT] / mTotalSeqs / mModel->mTypicalPositiveRatio;
 79 |     // ... scaled by the share of counted characters inside the sampling range.
 80 |     ratio = ratio * mFreqChar / mTotalChar;
 81 | 
 82 |     // Results of 1.0 or more are capped at 0.99.
 83 |     return (ratio >= (float)1.00) ? (float)0.99 : ratio;
 84 | #endif
 85 | }
 86 | 
 87 | const char *nsSingleByteCharSetProber::GetCharSetName()
 88 | {
 89 |     // An attached name prober, when present, decides which name is reported.
 90 |     // Without one, the charset name stored in the language model is
 91 |     // returned directly.
 92 |     return mNameProber ? mNameProber->GetCharSetName() : mModel->charsetName;
 93 | }
 94 | 
 95 | #ifdef DEBUG_PROBE
 96 | void nsSingleByteCharSetProber::DumpStatus() // debug aid, compiled only when DEBUG_PROBE is defined
 97 | {
 98 |     printf("  SBCS: %1.3f [%s]\r\n", GetConfidence(), GetCharSetName()); // one line: current confidence and reported charset name
 99 | }
100 | #endif
101 | }
102 | 


--------------------------------------------------------------------------------
/src/probers/nsSBCharSetProber.h:
--------------------------------------------------------------------------------
  1 | /*  -*- C++ -*-
  2 |     SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
  3 | 
  4 |     SPDX-License-Identifier: MIT
  5 | */
  6 | 
  7 | #ifndef NSSBCHARSETPROBER_H
  8 | #define NSSBCHARSETPROBER_H
  9 | 
 10 | #include "nsCharSetProber.h"
 11 | 
 12 | #define SAMPLE_SIZE 64
 13 | #define SB_ENOUGH_REL_THRESHOLD 1024
 14 | #define POSITIVE_SHORTCUT_THRESHOLD (float)0.95
 15 | #define NEGATIVE_SHORTCUT_THRESHOLD (float)0.05
 16 | #define SYMBOL_CAT_ORDER 250
 17 | #define NUMBER_OF_SEQ_CAT 4
 18 | #define POSITIVE_CAT (NUMBER_OF_SEQ_CAT - 1)
 19 | #define NEGATIVE_CAT 0
 20 | 
 21 | namespace kencodingprober
 22 | {
 23 | typedef struct {
 24 |     const unsigned char *charToOrderMap; // [256] table used to find a char's order; indexed by the byte value
 25 |     const char *precedenceMatrix; // [SAMPLE_SIZE][SAMPLE_SIZE], stored flat; gives the sequence category of a 2-char sequence
 26 |     float mTypicalPositiveRatio; // = freqSeqs / totalSeqs; the prober divides its positive-pair share by this value
 27 |     bool keepEnglishLetter; // says if this script contains English characters (not implemented)
 28 |     const char *charsetName; // name returned by the prober when no name prober is attached
 29 | } SequenceModel;
 30 | 
 31 | class KCODECS_NO_EXPORT nsSingleByteCharSetProber : public nsCharSetProber // scores input against one SequenceModel
 32 | {
 33 | public:
 34 |     explicit nsSingleByteCharSetProber(const SequenceModel *model) // non-reversed prober without a name prober
 35 |         : mModel(model)
 36 |         , mReversed(false)
 37 |         , mNameProber(nullptr)
 38 |     {
 39 |         Reset();
 40 |     }
 41 |     nsSingleByteCharSetProber(const SequenceModel *model, bool reversed, nsCharSetProber *nameProber) // nameProber may be null; it is stored, not owned
 42 |         : mModel(model)
 43 |         , mReversed(reversed)
 44 |         , mNameProber(nameProber)
 45 |     {
 46 |         Reset();
 47 |     }
 48 | 
 49 |     const char *GetCharSetName() override;
 50 |     nsProbingState HandleData(const char *aBuf, unsigned int aLen) override;
 51 |     nsProbingState GetState(void) override // plain accessor for mState
 52 |     {
 53 |         return mState;
 54 |     }
 55 |     void Reset(void) override;
 56 |     float GetConfidence(void) override;
 57 |     void SetOpion() override // no-op for this prober
 58 |     {
 59 |     }
 60 | 
 61 |     // This feature is not implemented yet. Every current language model
 62 |     // sets this parameter to false, and nobody reads the parameter or
 63 |     // calls this method.
 64 |     // Moreover, the nsSBCSGroupProber, which calls HandleData of this
 65 |     // prober, has a hard-coded call to FilterWithoutEnglishLetters that
 66 |     // strips the English letters before this prober sees the data.
 67 |     bool KeepEnglishLetters()
 68 |     {
 69 |         return mModel->keepEnglishLetter;
 70 |     } // (not implemented)
 71 | 
 72 | #ifdef DEBUG_PROBE
 73 |     void DumpStatus() override;
 74 | #endif
 75 | 
 76 | protected:
 77 |     nsProbingState mState; // eDetecting after Reset(); HandleData() may move it to eFoundIt or eNotMe
 78 |     const SequenceModel *mModel; // language model supplied at construction
 79 |     const bool mReversed; // true if we need to reverse every pair in the model lookup
 80 | 
 81 |     // char order of the last character handled; Reset() sets it to 255
 82 |     unsigned char mLastOrder;
 83 | 
 84 |     unsigned int mTotalSeqs; // number of character pairs counted so far
 85 |     unsigned int mSeqCounters[NUMBER_OF_SEQ_CAT]; // pair count per sequence category (see POSITIVE_CAT / NEGATIVE_CAT)
 86 | 
 87 |     unsigned int mTotalChar; // characters whose order is below SYMBOL_CAT_ORDER
 88 |     // characters that fall in our sampling range (order below SAMPLE_SIZE)
 89 |     unsigned int mFreqChar;
 90 | 
 91 |     // Optional auxiliary prober for name decision. Created and destroyed by the GroupProber
 92 |     nsCharSetProber *mNameProber;
 93 | };
 94 | 
 95 | extern const SequenceModel Koi8rModel;
 96 | extern const SequenceModel Win1251Model;
 97 | extern const SequenceModel Latin5Model;
 98 | extern const SequenceModel MacCyrillicModel;
 99 | extern const SequenceModel Ibm866Model;
100 | extern const SequenceModel Ibm855Model;
101 | extern const SequenceModel Latin7Model;
102 | extern const SequenceModel Win1253Model;
103 | extern const SequenceModel Latin5BulgarianModel;
104 | extern const SequenceModel Win1251BulgarianModel;
105 | extern const SequenceModel Latin2HungarianModel;
106 | extern const SequenceModel Win1250HungarianModel;
107 | extern const SequenceModel Win1255Model;
108 | }
109 | #endif /* NSSBCHARSETPROBER_H */
110 | 


--------------------------------------------------------------------------------
/src/probers/nsSJISProber.cpp:
--------------------------------------------------------------------------------
 1 | /*  -*- C++ -*-
 2 |     SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
 3 | 
 4 |     SPDX-License-Identifier: MIT
 5 | */
 6 | 
 7 | // The S-JIS prober relies on these characteristics of Japanese text:
 8 | // 1. kana characters (or hankaku?) often appear with high frequency
 9 | // 2. kana characters often occur in groups
10 | // 3. certain combinations of kana are never used in the Japanese language
11 | 
12 | #include "nsSJISProber.h"
13 | 
14 | namespace kencodingprober
15 | {
16 | void nsSJISProber::Reset(void)
17 | {
18 |     mState = eDetecting; // start over in the detecting state
19 |     mCodingSM->Reset();
20 |     mDistributionAnalyser.Reset();
21 |     mContextAnalyser.Reset();
22 | }
23 | 
24 | nsProbingState nsSJISProber::HandleData(const char *aBuf, unsigned int aLen)
25 | {
26 |     if (aLen == 0) {
27 |         return mState; // nothing to analyse
28 |     }
29 | 
30 |     for (unsigned int pos = 0; pos < aLen; ++pos) {
31 |         const nsSMState smState = mCodingSM->NextState(aBuf[pos]);
32 |         if (smState == eError || smState == eItsMe) {
33 |             // The state machine reached a verdict; stop scanning.
34 |             mState = (smState == eError) ? eNotMe : eFoundIt;
35 |             break;
36 |         }
37 |         if (smState != eStart) {
38 |             continue;
39 |         }
40 |         // Back in the start state: hand the character to both analysers.
41 |         const unsigned int charLen = mCodingSM->GetCurrentCharLen();
42 |         if (pos == 0) {
43 |             // Presumably the character may have begun in the previous buffer, so
44 |             // it is assembled in mLastChar (mLastChar[0] is saved at the end of each call).
45 |             mLastChar[1] = aBuf[0];
46 |             mContextAnalyser.HandleOneChar(mLastChar + 2 - charLen, charLen);
47 |             mDistributionAnalyser.HandleOneChar(mLastChar, charLen);
48 |             continue;
49 |         }
50 |         mContextAnalyser.HandleOneChar(aBuf + pos + 1 - charLen, charLen);
51 |         mDistributionAnalyser.HandleOneChar(aBuf + pos - 1, charLen);
52 |     }
53 | 
54 |     // Keep the final byte of this buffer for the next call.
55 |     mLastChar[0] = aBuf[aLen - 1];
56 | 
57 |     const bool stillDetecting = (mState == eDetecting);
58 |     if (stillDetecting && mContextAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD) {
59 |         mState = eFoundIt;
60 |     }
61 |     return mState;
62 | }
63 | 
64 | float nsSJISProber::GetConfidence(void)
65 | {
66 |     // Report whichever of the two analysers is more confident.
67 |     const float fromContext = mContextAnalyser.GetConfidence();
68 |     const float fromDistribution = mDistributionAnalyser.GetConfidence();
69 |     return (fromContext > fromDistribution) ? fromContext : fromDistribution;
70 | }
71 | }
72 | 


--------------------------------------------------------------------------------
/src/probers/nsSJISProber.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |     The Original Code is mozilla.org code.
 3 | 
 4 |     SPDX-FileCopyrightText: 1998 Netscape Communications Corporation
 5 | 
 6 |     SPDX-License-Identifier: MPL-1.1 OR GPL-2.0-or-later OR LGPL-2.1-or-later
 7 | */
 8 | 
 9 | // The S-JIS prober relies on these characteristics of Japanese text:
10 | // 1. kana characters (or hankaku?) often appear with high frequency
11 | // 2. kana characters often occur in groups
12 | // 3. certain combinations of kana are never used in the Japanese language
13 | 
14 | #ifndef nsSJISProber_h__
15 | #define nsSJISProber_h__
16 | 
17 | #include "CharDistribution.h"
18 | #include "JpCntx.h"
19 | #include "nsCharSetProber.h"
20 | #include "nsCodingStateMachine.h"
21 | 
22 | namespace kencodingprober
23 | {
24 | class KCODECS_NO_EXPORT nsSJISProber : public nsCharSetProber // prober that reports "Shift_JIS"
25 | {
26 | public:
27 |     nsSJISProber(void) // allocates the coding state machine for SJISSMModel, then resets all state
28 |     {
29 |         mCodingSM = new nsCodingStateMachine(&SJISSMModel);
30 |         Reset();
31 |     }
32 |     ~nsSJISProber(void) override // releases the state machine allocated by the constructor
33 |     {
34 |         delete mCodingSM;
35 |     }
36 |     nsProbingState HandleData(const char *aBuf, unsigned int aLen) override;
37 |     const char *GetCharSetName() override // fixed name, independent of the data seen
38 |     {
39 |         return "Shift_JIS";
40 |     }
41 |     nsProbingState GetState(void) override // plain accessor for mState
42 |     {
43 |         return mState;
44 |     }
45 |     void Reset(void) override;
46 |     float GetConfidence(void) override;
47 |     void SetOpion() override // no-op for this prober
48 |     {
49 |     }
50 | 
51 | protected:
52 |     nsCodingStateMachine *mCodingSM; // owned; NOTE(review): copy operations are not deleted, so a copy would delete it twice
53 |     nsProbingState mState; // eDetecting after Reset(); HandleData() may move it to eFoundIt or eNotMe
54 | 
55 |     SJISContextAnalysis mContextAnalyser; // fed one character at a time by HandleData()
56 |     SJISDistributionAnalysis mDistributionAnalyser; // fed one character at a time by HandleData()
57 | 
58 |     char mLastChar[2]; // [0]: last byte of the previous buffer, [1]: first byte of the current one; not set by Reset()
59 | };
60 | }
61 | 
62 | #endif /* nsSJISProber_h__ */
63 | 


--------------------------------------------------------------------------------
/src/probers/nsUniversalDetector.cpp:
--------------------------------------------------------------------------------
  1 | /*  -*- C++ -*-
  2 |     SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
  3 |     SPDX-FileCopyrightText: 2008 Wang Kai <wkai@gmail.com>
  4 | 
  5 |     SPDX-License-Identifier: MIT
  6 | */
  7 | 
  8 | #include "nsUniversalDetector.h"
  9 | 
 10 | #include "nsEscCharsetProber.h"
 11 | #include "nsLatin1Prober.h"
 12 | #include "nsMBCSGroupProber.h"
 13 | #include "nsSBCSGroupProber.h"
 14 | 
 15 | namespace kencodingprober
 16 | {
 17 | nsUniversalDetector::nsUniversalDetector()
 18 |     : mInputState(ePureAscii)
 19 |     , mDone(false)
 20 |     , mInTag(false)
 21 |     , mStart(true)
 22 |     , mGotData(false)
 23 |     , mLastChar('\0')
 24 |     , mDetectedCharset(nullptr)
 25 |     , mBestGuess(-1) // illegal value as signal
 26 |     , mEscCharSetProber(nullptr)
 27 | {
 28 |     // Members are initialized in their declaration order. The sub-prober
 29 |     // slots start out empty; HandleData() fills them the first time a high
 30 |     // byte is seen.
 31 |     for (nsCharSetProber *&slot : mCharSetProbers) {
 32 |         slot = nullptr;
 33 |     }
 34 | }
 35 | 
 36 | nsUniversalDetector::~nsUniversalDetector()
 37 | {
 38 |     for (nsCharSetProber *owned : mCharSetProbers) { // slots never filled are null
 39 |         delete owned; // deleting nullptr is a no-op
 40 |     }
 41 |     delete mEscCharSetProber; // may be null as well
 42 | }
 43 | 
 44 | void nsUniversalDetector::Reset()
 45 | {
 46 |     // Return every scalar member to the value the constructor gives it,
 47 |     // so detection can start again from a pure-ASCII assumption.
 48 |     mInputState = ePureAscii;
 49 |     mDone = false;
 50 |     mInTag = false;
 51 |     mStart = true;
 52 |     mGotData = false;
 53 |     mLastChar = '\0';
 54 |     mDetectedCharset = nullptr;
 55 |     mBestGuess = -1; // illegal value as signal
 56 | 
 57 |     // Probers that already exist are kept and reset rather than destroyed.
 58 |     if (mEscCharSetProber) {
 59 |         mEscCharSetProber->Reset();
 60 |     }
 61 |     for (nsCharSetProber *prober : mCharSetProbers) {
 62 |         if (prober) {
 63 |             prober->Reset();
 64 |         }
 65 |     }
 66 | }
 67 | 
 68 | //---------------------------------------------------------------------
 69 | #define SHORTCUT_THRESHOLD (float)0.95
 70 | #define MINIMUM_THRESHOLD (float)0.20
 71 | 
 72 | nsProbingState nsUniversalDetector::HandleData(const char *aBuf, unsigned int aLen)
 73 | {
 74 |     // Classifies the input (pure ASCII, escape sequences or high bytes), feeds
 75 |     // it to the matching probers and returns eFoundIt as soon as one of them is
 76 |     // certain; otherwise the state of the last prober consulted is returned.
 77 |     if (mDone) {
 78 |         return eFoundIt;
 79 |     }
 80 |     if (aLen > 0) {
 81 |         mGotData = true;
 82 |     }
 83 | 
 84 |     for (unsigned int i = 0; i < aLen; i++) {
 85 |         // other than 0xa0, if every other character is ascii, the page is ascii
 86 |         if (aBuf[i] & '\x80' && aBuf[i] != '\xA0') { // Since many Ascii only page contains NBSP
 87 |             if (mInputState != eHighbyte) {
 88 |                 mInputState = eHighbyte; // first non-ascii byte (high-byte) seen
 89 |                 // kill mEscCharSetProber if it is active
 90 |                 delete mEscCharSetProber;
 91 |                 mEscCharSetProber = nullptr;
 92 |                 // start multibyte and singlebyte charset prober
 93 |                 if (nullptr == mCharSetProbers[0]) {
 94 |                     mCharSetProbers[0] = new nsMBCSGroupProber;
 95 |                 }
 96 |                 if (nullptr == mCharSetProbers[1]) {
 97 |                     mCharSetProbers[1] = new nsSBCSGroupProber;
 98 |                 }
 99 |                 if (nullptr == mCharSetProbers[2]) {
100 |                     mCharSetProbers[2] = new nsLatin1Prober;
101 |                 }
102 |             }
103 |         } else {
104 |             // ok, just pure ascii so far
105 |             if (ePureAscii == mInputState && (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~'))) {
106 |                 // found escape character or HZ "~{"
107 |                 mInputState = eEscAscii;
108 |             }
109 |             mLastChar = aBuf[i];
110 |         }
111 |     }
112 | 
113 |     // A provisional "UTF-8" left by an earlier all-ASCII chunk must not outlive the
114 |     // ASCII state, or GetCharSetName()/GetConfidence() would ignore the probers.
115 |     if (mInputState != ePureAscii) {
116 |         mDetectedCharset = nullptr; // mDone is false here, so no prober result is lost
117 |     }
118 |     nsProbingState st = eDetecting;
119 |     switch (mInputState) {
120 |     case eEscAscii:
121 |         if (nullptr == mEscCharSetProber) {
122 |             mEscCharSetProber = new nsEscCharSetProber;
123 |         }
124 |         st = mEscCharSetProber->HandleData(aBuf, aLen);
125 |         if (st == eFoundIt) {
126 |             mDone = true;
127 |             mDetectedCharset = mEscCharSetProber->GetCharSetName();
128 |         }
129 |         break;
130 |     case eHighbyte:
131 |         for (unsigned int i = 0; i < NUM_OF_CHARSET_PROBERS; ++i) {
132 |             st = mCharSetProbers[i]->HandleData(aBuf, aLen);
133 |             if (st == eFoundIt) {
134 |                 mDone = true;
135 |                 mDetectedCharset = mCharSetProbers[i]->GetCharSetName();
136 |                 break; // first certain prober wins; later ones must not overwrite st or the charset
137 |             }
138 |         }
139 |         break;
140 |     default: // pure ascii
141 |         mDetectedCharset = "UTF-8";
142 |     }
143 |     return st;
144 | }
144 | 
145 | //---------------------------------------------------------------------
146 | const char *nsUniversalDetector::GetCharSetName()
147 | {
148 |     // Returns the detected charset name, the name of the most confident
149 |     // sub-prober, or "UTF-8" when neither is available.
150 |     // A charset recorded by HandleData() always wins.
151 |     if (mDetectedCharset) {
152 |         return mDetectedCharset;
153 |     }
154 | 
155 |     // Only the high-byte state has sub-probers to consult; every other state
156 |     // ends at the "UTF-8" default below.
157 |     if (mInputState == eHighbyte) {
158 |         int leader = 0;
159 |         float leaderConfidence = (float)0.0;
160 |         for (int idx = 0; idx < NUM_OF_CHARSET_PROBERS; ++idx) {
161 |             const float confidence = mCharSetProbers[idx]->GetConfidence();
162 |             if (confidence > leaderConfidence) {
163 |                 leaderConfidence = confidence;
164 |                 leader = idx;
165 |             }
166 |         }
167 |         // do not report anything because we are not confident of it, that's in fact a negative answer
168 |         if (leaderConfidence > MINIMUM_THRESHOLD) {
169 |             return mCharSetProbers[leader]->GetCharSetName();
170 |         }
171 |     }
172 | 
173 |     // Default for pure ASCII, escape input and low-confidence results.
174 |     return "UTF-8";
175 | }
176 | 
177 | //---------------------------------------------------------------------
178 | float nsUniversalDetector::GetConfidence()
179 | {
180 |     // Returns 0.99 for a recorded charset, the confidence of the leading
181 |     // sub-prober when it exceeds MINIMUM_THRESHOLD, and MINIMUM_THRESHOLD
182 |     // in every other case.
183 |     // we haven't got any data yet, return immediately
184 |     // caller program sometimes call DataEnd before anything has been sent to detector
185 |     if (!mGotData) {
186 |         return MINIMUM_THRESHOLD;
187 |     }
188 |     // A charset recorded by HandleData() counts as (almost) certain.
189 |     if (mDetectedCharset) {
190 |         return 0.99f;
191 |     }
192 | 
193 |     // Only the high-byte state has sub-probers to consult; every other state
194 |     // ends at the MINIMUM_THRESHOLD default below.
195 |     if (mInputState == eHighbyte) {
196 |         int leader = 0;
197 |         float leaderConfidence = (float)0.0;
198 |         for (int idx = 0; idx < NUM_OF_CHARSET_PROBERS; ++idx) {
199 |             const float confidence = mCharSetProbers[idx]->GetConfidence();
200 |             if (confidence > leaderConfidence) {
201 |                 leaderConfidence = confidence;
202 |                 leader = idx;
203 |             }
204 |         }
205 |         // do not report anything because we are not confident of it, that's in fact a negative answer
206 |         if (leaderConfidence > MINIMUM_THRESHOLD) {
207 |             // The leading prober is queried a second time for the returned value.
208 |             return mCharSetProbers[leader]->GetConfidence();
209 |         }
210 |     }
211 |     return MINIMUM_THRESHOLD;
212 | }
213 | 
214 | nsProbingState nsUniversalDetector::GetState()
215 | {
216 |     // The detector exposes only two states: eFoundIt once a prober has
217 |     // settled on a charset (mDone), and eDetecting in every other case.
218 |     const nsProbingState reported = mDone ? eFoundIt : eDetecting;
219 | 
220 |     return reported;
221 | }
222 | }
223 | 


--------------------------------------------------------------------------------
/src/probers/nsUniversalDetector.h:
--------------------------------------------------------------------------------
 1 | /*  -*- C++ -*-
 2 |     SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
 3 |     SPDX-FileCopyrightText: 2008 Wang Kai <zealot.kai@gmail.com>
 4 | 
 5 |     SPDX-License-Identifier: MIT
 6 | */
 7 | 
 8 | #ifndef nsUniversalDetector_h__
 9 | #define nsUniversalDetector_h__
10 | 
11 | #include "nsCharSetProber.h"
12 | 
13 | #define NUM_OF_CHARSET_PROBERS 3
14 | 
15 | namespace kencodingprober
16 | {
17 | typedef enum {
18 |     ePureAscii = 0, // initial state: no escape sequence and no high byte seen so far
19 |     eEscAscii = 1, // an ESC character or the HZ sequence "~{" was seen while still pure ASCII
20 |     eHighbyte = 2, // a byte with the high bit set (other than 0xA0) was seen; never left until Reset()
21 | } nsInputState;
22 | 
23 | class KCODECS_NO_EXPORT nsUniversalDetector : public nsCharSetProber // top-level detector that dispatches to group probers
24 | {
25 | public:
26 |     nsUniversalDetector();
27 |     ~nsUniversalDetector() override;
28 |     nsProbingState HandleData(const char *aBuf, unsigned int aLen) override;
29 |     const char *GetCharSetName() override;
30 |     void Reset(void) override;
31 |     float GetConfidence(void) override;
32 |     nsProbingState GetState() override;
33 |     void SetOpion() override // no-op for this prober
34 |     {
35 |     }
36 | 
37 | protected:
38 |     nsInputState mInputState; // kind of input seen so far; selects which probers HandleData() feeds
39 |     bool mDone; // set once a prober reported eFoundIt; later HandleData() calls return immediately
40 |     bool mInTag; // NOTE(review): only assigned in the constructor and Reset() of nsUniversalDetector.cpp
41 |     bool mStart; // NOTE(review): only assigned in the constructor and Reset() of nsUniversalDetector.cpp
42 |     bool mGotData; // true after HandleData() received at least one byte
43 |     char mLastChar; // last ASCII byte seen; used to recognise the HZ sequence "~{"
44 |     const char *mDetectedCharset; // charset name recorded by HandleData(), null while undecided
45 |     int mBestGuess; // NOTE(review): only assigned (-1) in the constructor and Reset() of nsUniversalDetector.cpp
46 | 
47 |     nsCharSetProber *mCharSetProbers[NUM_OF_CHARSET_PROBERS]; // owned; created on the first high byte, null before
48 |     nsCharSetProber *mEscCharSetProber; // owned; created on demand for escape-sequence input, deleted on the first high byte
49 | };
50 | }
51 | 
52 | #endif
53 | 


--------------------------------------------------------------------------------