├── .git-blame-ignore-revs ├── .gitattributes ├── .gitignore ├── .gitlab-ci.yml ├── .kde-ci.yml ├── CMakeLists.txt ├── KF6CodecsConfig.cmake.in ├── LICENSES ├── BSD-3-Clause.txt ├── CC0-1.0.txt ├── GPL-2.0-or-later.txt ├── LGPL-2.0-only.txt ├── LGPL-2.0-or-later.txt ├── LGPL-2.1-or-later.txt ├── MIT.txt └── MPL-1.1.txt ├── README.md ├── autotests ├── CMakeLists.txt ├── base45test.cpp ├── base64benchmark.cpp ├── codectest.cpp ├── codectest.h ├── data │ ├── binary_data │ ├── codec_b │ │ ├── basic-decode.b │ │ ├── basic-decode.b.expected │ │ ├── basic-encode │ │ ├── basic-encode.expected │ │ ├── null-decode.b │ │ ├── null-decode.b.expected │ │ ├── null-encode │ │ ├── null-encode.expected │ │ ├── padding0-encode │ │ ├── padding0-encode.expected │ │ ├── padding1-encode │ │ ├── padding1-encode.expected │ │ ├── padding2-encode │ │ └── padding2-encode.expected │ ├── codec_base64 │ │ ├── basic-decode.base64 │ │ ├── basic-decode.base64.expected │ │ ├── basic-encode │ │ ├── basic-encode.expected │ │ ├── corrupt.base64 │ │ ├── corrupt.base64.expected │ │ ├── very_small-encode │ │ └── very_small-encode.expected │ ├── codec_q │ │ ├── all-encoded-decode.q │ │ ├── all-encoded-decode.q.expected │ │ ├── basic-encode │ │ ├── basic-encode.expected │ │ ├── null-decode.q │ │ ├── null-decode.q.expected │ │ ├── null-encode │ │ └── null-encode.expected │ ├── codec_quoted-printable │ │ ├── basic-decode.quoted-printable │ │ ├── basic-decode.quoted-printable.expected │ │ ├── basic-encode │ │ ├── basic-encode.expected │ │ ├── corrupt.quoted-printable │ │ ├── corrupt.quoted-printable.expected │ │ ├── corrupt2.quoted-printable │ │ ├── corrupt2.quoted-printable.expected │ │ ├── corrupt3.quoted-printable │ │ ├── corrupt3.quoted-printable.expected │ │ ├── corrupt4.quoted-printable │ │ ├── corrupt4.quoted-printable.expected │ │ ├── wrap-encode │ │ └── wrap-encode.expected │ ├── codec_x-kmime-rfc2231 │ │ ├── all-encoded.x-kmime-rfc2231-decode │ │ ├── all-encoded.x-kmime-rfc2231-decode.expected │ │ 
├── basic-encode │ │ ├── basic-encode.expected │ │ ├── null-decode.x-kmime-rfc2231 │ │ ├── null-decode.x-kmime-rfc2231.expected │ │ ├── null-encode │ │ └── null-encode.expected │ └── codec_x-uuencode │ │ ├── basic-decode.x-uuencode │ │ └── basic-decode.x-uuencode.expected ├── kcharsetstest.cpp ├── kcharsetstest.h ├── kemailaddresstest.cpp ├── kemailaddresstest.h ├── kencodingprobertest.cpp ├── kencodingprobertest.h ├── rfc2047test.cpp └── rfc2047test.h ├── docs └── Doxyfile.local ├── metainfo.yaml ├── poqm ├── af │ └── kcodecs6_qt.po ├── ar │ └── kcodecs6_qt.po ├── as │ └── kcodecs6_qt.po ├── ast │ └── kcodecs6_qt.po ├── az │ └── kcodecs6_qt.po ├── be │ └── kcodecs6_qt.po ├── be@latin │ └── kcodecs6_qt.po ├── bg │ └── kcodecs6_qt.po ├── bn │ └── kcodecs6_qt.po ├── bn_IN │ └── kcodecs6_qt.po ├── br │ └── kcodecs6_qt.po ├── bs │ └── kcodecs6_qt.po ├── ca │ └── kcodecs6_qt.po ├── ca@valencia │ └── kcodecs6_qt.po ├── crh │ └── kcodecs6_qt.po ├── cs │ └── kcodecs6_qt.po ├── csb │ └── kcodecs6_qt.po ├── cy │ └── kcodecs6_qt.po ├── da │ └── kcodecs6_qt.po ├── de │ └── kcodecs6_qt.po ├── el │ └── kcodecs6_qt.po ├── en_GB │ └── kcodecs6_qt.po ├── eo │ └── kcodecs6_qt.po ├── es │ └── kcodecs6_qt.po ├── et │ └── kcodecs6_qt.po ├── eu │ └── kcodecs6_qt.po ├── fa │ └── kcodecs6_qt.po ├── fi │ └── kcodecs6_qt.po ├── fr │ └── kcodecs6_qt.po ├── fy │ └── kcodecs6_qt.po ├── ga │ └── kcodecs6_qt.po ├── gd │ └── kcodecs6_qt.po ├── gl │ └── kcodecs6_qt.po ├── gu │ └── kcodecs6_qt.po ├── ha │ └── kcodecs6_qt.po ├── he │ └── kcodecs6_qt.po ├── hi │ └── kcodecs6_qt.po ├── hne │ └── kcodecs6_qt.po ├── hr │ └── kcodecs6_qt.po ├── hsb │ └── kcodecs6_qt.po ├── hu │ └── kcodecs6_qt.po ├── hy │ └── kcodecs6_qt.po ├── ia │ └── kcodecs6_qt.po ├── id │ └── kcodecs6_qt.po ├── is │ └── kcodecs6_qt.po ├── it │ └── kcodecs6_qt.po ├── ja │ └── kcodecs6_qt.po ├── ka │ └── kcodecs6_qt.po ├── kk │ └── kcodecs6_qt.po ├── km │ └── kcodecs6_qt.po ├── kn │ └── kcodecs6_qt.po ├── ko │ └── kcodecs6_qt.po ├── 
ku │ └── kcodecs6_qt.po ├── lb │ └── kcodecs6_qt.po ├── lt │ └── kcodecs6_qt.po ├── lv │ └── kcodecs6_qt.po ├── mai │ └── kcodecs6_qt.po ├── mk │ └── kcodecs6_qt.po ├── ml │ └── kcodecs6_qt.po ├── mr │ └── kcodecs6_qt.po ├── ms │ └── kcodecs6_qt.po ├── nb │ └── kcodecs6_qt.po ├── nds │ └── kcodecs6_qt.po ├── ne │ └── kcodecs6_qt.po ├── nl │ └── kcodecs6_qt.po ├── nn │ └── kcodecs6_qt.po ├── oc │ └── kcodecs6_qt.po ├── or │ └── kcodecs6_qt.po ├── pa │ └── kcodecs6_qt.po ├── pl │ └── kcodecs6_qt.po ├── ps │ └── kcodecs6_qt.po ├── pt │ └── kcodecs6_qt.po ├── pt_BR │ └── kcodecs6_qt.po ├── ro │ └── kcodecs6_qt.po ├── ru │ └── kcodecs6_qt.po ├── sa │ └── kcodecs6_qt.po ├── se │ └── kcodecs6_qt.po ├── si │ └── kcodecs6_qt.po ├── sk │ └── kcodecs6_qt.po ├── sl │ └── kcodecs6_qt.po ├── sq │ └── kcodecs6_qt.po ├── sr │ └── kcodecs6_qt.po ├── sr@ijekavian │ └── kcodecs6_qt.po ├── sr@ijekavianlatin │ └── kcodecs6_qt.po ├── sr@latin │ └── kcodecs6_qt.po ├── sv │ └── kcodecs6_qt.po ├── ta │ └── kcodecs6_qt.po ├── te │ └── kcodecs6_qt.po ├── tg │ └── kcodecs6_qt.po ├── th │ └── kcodecs6_qt.po ├── tr │ └── kcodecs6_qt.po ├── tt │ └── kcodecs6_qt.po ├── ug │ └── kcodecs6_qt.po ├── uk │ └── kcodecs6_qt.po ├── uz │ └── kcodecs6_qt.po ├── uz@cyrillic │ └── kcodecs6_qt.po ├── vi │ └── kcodecs6_qt.po ├── wa │ └── kcodecs6_qt.po ├── xh │ └── kcodecs6_qt.po ├── zh_CN │ └── kcodecs6_qt.po ├── zh_HK │ └── kcodecs6_qt.po └── zh_TW │ └── kcodecs6_qt.po └── src ├── CMakeLists.txt ├── Messages.sh ├── kcharsets.cpp ├── kcharsets.h ├── kcharsets_p.h ├── kcodecs-index.qdoc ├── kcodecs.cpp ├── kcodecs.h ├── kcodecs.qdoc ├── kcodecs.qdocconf ├── kcodecs_p.h ├── kcodecsbase45.cpp ├── kcodecsbase64.cpp ├── kcodecsbase64.h ├── kcodecsqp.cpp ├── kcodecsqp.h ├── kcodecsuuencode.cpp ├── kcodecsuuencode.h ├── kemailaddress.cpp ├── kemailaddress.h ├── kencodingprober.cpp ├── kencodingprober.h └── probers ├── CharDistribution.cpp ├── CharDistribution.h ├── ChineseGroupProber.cpp ├── ChineseGroupProber.h ├── 
JapaneseGroupProber.cpp ├── JapaneseGroupProber.h ├── JpCntx.cpp ├── JpCntx.h ├── LangBulgarianModel.cpp ├── LangCyrillicModel.cpp ├── LangGreekModel.cpp ├── LangHebrewModel.cpp ├── LangHungarianModel.cpp ├── LangThaiModel.cpp ├── UnicodeGroupProber.cpp ├── UnicodeGroupProber.h ├── nsBig5Prober.cpp ├── nsBig5Prober.h ├── nsCharSetProber.cpp ├── nsCharSetProber.h ├── nsCodingStateMachine.h ├── nsEUCJPProber.cpp ├── nsEUCJPProber.h ├── nsEUCKRProber.cpp ├── nsEUCKRProber.h ├── nsEscCharsetProber.cpp ├── nsEscCharsetProber.h ├── nsEscSM.cpp ├── nsGB2312Prober.cpp ├── nsGB2312Prober.h ├── nsHebrewProber.cpp ├── nsHebrewProber.h ├── nsLatin1Prober.cpp ├── nsLatin1Prober.h ├── nsMBCSGroupProber.cpp ├── nsMBCSGroupProber.h ├── nsMBCSSM.cpp ├── nsPkgInt.h ├── nsSBCSGroupProber.cpp ├── nsSBCSGroupProber.h ├── nsSBCharSetProber.cpp ├── nsSBCharSetProber.h ├── nsSJISProber.cpp ├── nsSJISProber.h ├── nsUniversalDetector.cpp ├── nsUniversalDetector.h └── tables ├── Big5Freq.tab ├── EUCKRFreq.tab ├── GB2312Freq.tab └── JISFreq.tab /.git-blame-ignore-revs: -------------------------------------------------------------------------------- 1 | #clang-format/tidy 2 | 88d295069b046faa5846e320c2ed81805cdcd44c 3 | bf6a25dad4c25070bdb292a4adbf6058acd50ad1 4 | 439b22f0e561c7637b0df364b6dc4958e1aad617 5 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.expected -crlf 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore the following files 2 | *~ 3 | *.[oa] 4 | *.diff 5 | *.kate-swp 6 | *.kdev4 7 | .kdev_include_paths 8 | *.kdevelop.pcs 9 | *.moc 10 | *.moc.cpp 11 | *.orig 12 | *.user 13 | .*.swp 14 | .swp.* 15 | Doxyfile 16 | Makefile 17 | avail 18 | random_seed 19 | /build*/ 20 | /.vscode/ 21 | CMakeLists.txt.user* 22 
| *.unc-backup* 23 | .cmake/ 24 | /.clang-format 25 | /compile_commands.json 26 | .clangd 27 | .idea 28 | /cmake-build* 29 | .cache 30 | -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: 2020 Volker Krause 2 | # SPDX-License-Identifier: CC0-1.0 3 | 4 | include: 5 | - project: sysadmin/ci-utilities 6 | file: 7 | - /gitlab-templates/linux-qt6.yml 8 | - /gitlab-templates/linux-qt6-next.yml 9 | - /gitlab-templates/linux-qt6-static.yml 10 | - /gitlab-templates/android-qt6.yml 11 | - /gitlab-templates/freebsd-qt6.yml 12 | - /gitlab-templates/windows-qt6.yml 13 | - /gitlab-templates/alpine-qt6.yml 14 | - /gitlab-templates/xml-lint.yml 15 | - /gitlab-templates/yaml-lint.yml 16 | -------------------------------------------------------------------------------- /.kde-ci.yml: -------------------------------------------------------------------------------- 1 | Dependencies: 2 | - 'on': ['@all'] 3 | 'require': 4 | 'frameworks/extra-cmake-modules': '@same' 5 | 6 | Options: 7 | test-before-installing: True 8 | require-passing-tests-on: ['Linux', 'FreeBSD'] 9 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.16) 2 | 3 | set(KF_VERSION "6.15.0") # handled by release scripts 4 | project(KCodecs VERSION ${KF_VERSION}) 5 | 6 | include(FeatureSummary) 7 | find_package(ECM 6.14.0 NO_MODULE) 8 | set_package_properties(ECM PROPERTIES TYPE REQUIRED DESCRIPTION "Extra CMake Modules." 
URL "https://commits.kde.org/extra-cmake-modules") 9 | feature_summary(WHAT REQUIRED_PACKAGES_NOT_FOUND FATAL_ON_MISSING_REQUIRED_PACKAGES) 10 | 11 | 12 | set(CMAKE_MODULE_PATH ${ECM_MODULE_PATH}) 13 | 14 | include(KDEInstallDirs) 15 | include(KDEFrameworkCompilerSettings NO_POLICY_SCOPE) 16 | include(KDECMakeSettings) 17 | include(KDEGitCommitHooks) 18 | include(ECMQtDeclareLoggingCategory) 19 | include(ECMDeprecationSettings) 20 | 21 | set(REQUIRED_QT_VERSION 6.7.0) 22 | find_package(Qt6Core ${REQUIRED_QT_VERSION} REQUIRED NO_MODULE) 23 | 24 | include(ECMGenerateExportHeader) 25 | include(CMakePackageConfigHelpers) 26 | include(ECMSetupVersion) 27 | include(ECMGenerateHeaders) 28 | include(ECMGenerateQDoc) 29 | 30 | include(ECMPoQmTools) 31 | 32 | set(EXCLUDE_DEPRECATED_BEFORE_AND_AT 0 CACHE STRING "Control the range of deprecated API excluded from the build [default=0].") 33 | 34 | set(kcodecs_version_header "${CMAKE_CURRENT_BINARY_DIR}/src/kcodecs_version.h") 35 | ecm_setup_version(PROJECT VARIABLE_PREFIX KCODECS 36 | VERSION_HEADER "${kcodecs_version_header}" 37 | PACKAGE_VERSION_FILE "${CMAKE_CURRENT_BINARY_DIR}/KF6CodecsConfigVersion.cmake" 38 | SOVERSION 6) 39 | 40 | ecm_install_po_files_as_qm(poqm) 41 | 42 | ecm_set_disabled_deprecation_versions( 43 | QT 6.9.0 44 | ) 45 | 46 | add_subdirectory(src) 47 | 48 | if (BUILD_TESTING) 49 | add_subdirectory(autotests) 50 | endif() 51 | 52 | # create a Config.cmake and a ConfigVersion.cmake file and install them 53 | set(CMAKECONFIG_INSTALL_DIR "${KDE_INSTALL_CMAKEPACKAGEDIR}/KF6Codecs") 54 | 55 | configure_package_config_file("${CMAKE_CURRENT_SOURCE_DIR}/KF6CodecsConfig.cmake.in" 56 | "${CMAKE_CURRENT_BINARY_DIR}/KF6CodecsConfig.cmake" 57 | INSTALL_DESTINATION ${CMAKECONFIG_INSTALL_DIR} 58 | ) 59 | 60 | install(FILES "${CMAKE_CURRENT_BINARY_DIR}/KF6CodecsConfig.cmake" 61 | "${CMAKE_CURRENT_BINARY_DIR}/KF6CodecsConfigVersion.cmake" 62 | DESTINATION "${CMAKECONFIG_INSTALL_DIR}" 63 | COMPONENT Devel ) 64 | 65 | 
install(EXPORT KF6CodecsTargets DESTINATION "${CMAKECONFIG_INSTALL_DIR}" FILE KF6CodecsTargets.cmake NAMESPACE KF6:: ) 66 | 67 | install(FILES ${kcodecs_version_header} 68 | DESTINATION ${KDE_INSTALL_INCLUDEDIR_KF}/KCodecs COMPONENT Devel) 69 | 70 | include(ECMFeatureSummary) 71 | ecm_feature_summary(WHAT ALL FATAL_ON_MISSING_REQUIRED_PACKAGES) 72 | 73 | kde_configure_git_pre_commit_hook(CHECKS CLANG_FORMAT) 74 | -------------------------------------------------------------------------------- /KF6CodecsConfig.cmake.in: -------------------------------------------------------------------------------- 1 | @PACKAGE_INIT@ 2 | 3 | include(CMakeFindDependencyMacro) 4 | find_dependency(Qt6Core @REQUIRED_QT_VERSION@) 5 | 6 | include("${CMAKE_CURRENT_LIST_DIR}/KF6CodecsTargets.cmake") 7 | -------------------------------------------------------------------------------- /LICENSES/BSD-3-Clause.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) . All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without modification, 4 | are permitted provided that the following conditions are met: 5 | 6 | 1. Redistributions of source code must retain the above copyright notice, 7 | this list of conditions and the following disclaimer. 8 | 9 | 2. Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 12 | 13 | 3. Neither the name of the copyright holder nor the names of its contributors 14 | may be used to endorse or promote products derived from this software without 15 | specific prior written permission. 
16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 18 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 21 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 23 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 24 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 25 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE 26 | USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | -------------------------------------------------------------------------------- /LICENSES/CC0-1.0.txt: -------------------------------------------------------------------------------- 1 | Creative Commons Legal Code 2 | 3 | CC0 1.0 Universal 4 | 5 | CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE 6 | LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN 7 | ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS 8 | INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES 9 | REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS 10 | PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM 11 | THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED 12 | HEREUNDER. 13 | 14 | Statement of Purpose 15 | 16 | The laws of most jurisdictions throughout the world automatically confer 17 | exclusive Copyright and Related Rights (defined below) upon the creator 18 | and subsequent owner(s) (each and all, an "owner") of an original work of 19 | authorship and/or a database (each, a "Work"). 
20 | 21 | Certain owners wish to permanently relinquish those rights to a Work for 22 | the purpose of contributing to a commons of creative, cultural and 23 | scientific works ("Commons") that the public can reliably and without fear 24 | of later claims of infringement build upon, modify, incorporate in other 25 | works, reuse and redistribute as freely as possible in any form whatsoever 26 | and for any purposes, including without limitation commercial purposes. 27 | These owners may contribute to the Commons to promote the ideal of a free 28 | culture and the further production of creative, cultural and scientific 29 | works, or to gain reputation or greater distribution for their Work in 30 | part through the use and efforts of others. 31 | 32 | For these and/or other purposes and motivations, and without any 33 | expectation of additional consideration or compensation, the person 34 | associating CC0 with a Work (the "Affirmer"), to the extent that he or she 35 | is an owner of Copyright and Related Rights in the Work, voluntarily 36 | elects to apply CC0 to the Work and publicly distribute the Work under its 37 | terms, with knowledge of his or her Copyright and Related Rights in the 38 | Work and the meaning and intended legal effect of CC0 on those rights. 39 | 40 | 1. Copyright and Related Rights. A Work made available under CC0 may be 41 | protected by copyright and related or neighboring rights ("Copyright and 42 | Related Rights"). Copyright and Related Rights include, but are not 43 | limited to, the following: 44 | 45 | i. the right to reproduce, adapt, distribute, perform, display, 46 | communicate, and translate a Work; 47 | ii. moral rights retained by the original author(s) and/or performer(s); 48 | iii. publicity and privacy rights pertaining to a person's image or 49 | likeness depicted in a Work; 50 | iv. rights protecting against unfair competition in regards to a Work, 51 | subject to the limitations in paragraph 4(a), below; 52 | v. 
rights protecting the extraction, dissemination, use and reuse of data 53 | in a Work; 54 | vi. database rights (such as those arising under Directive 96/9/EC of the 55 | European Parliament and of the Council of 11 March 1996 on the legal 56 | protection of databases, and under any national implementation 57 | thereof, including any amended or successor version of such 58 | directive); and 59 | vii. other similar, equivalent or corresponding rights throughout the 60 | world based on applicable law or treaty, and any national 61 | implementations thereof. 62 | 63 | 2. Waiver. To the greatest extent permitted by, but not in contravention 64 | of, applicable law, Affirmer hereby overtly, fully, permanently, 65 | irrevocably and unconditionally waives, abandons, and surrenders all of 66 | Affirmer's Copyright and Related Rights and associated claims and causes 67 | of action, whether now known or unknown (including existing as well as 68 | future claims and causes of action), in the Work (i) in all territories 69 | worldwide, (ii) for the maximum duration provided by applicable law or 70 | treaty (including future time extensions), (iii) in any current or future 71 | medium and for any number of copies, and (iv) for any purpose whatsoever, 72 | including without limitation commercial, advertising or promotional 73 | purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each 74 | member of the public at large and to the detriment of Affirmer's heirs and 75 | successors, fully intending that such Waiver shall not be subject to 76 | revocation, rescission, cancellation, termination, or any other legal or 77 | equitable action to disrupt the quiet enjoyment of the Work by the public 78 | as contemplated by Affirmer's express Statement of Purpose. 79 | 80 | 3. Public License Fallback. 
Should any part of the Waiver for any reason 81 | be judged legally invalid or ineffective under applicable law, then the 82 | Waiver shall be preserved to the maximum extent permitted taking into 83 | account Affirmer's express Statement of Purpose. In addition, to the 84 | extent the Waiver is so judged Affirmer hereby grants to each affected 85 | person a royalty-free, non transferable, non sublicensable, non exclusive, 86 | irrevocable and unconditional license to exercise Affirmer's Copyright and 87 | Related Rights in the Work (i) in all territories worldwide, (ii) for the 88 | maximum duration provided by applicable law or treaty (including future 89 | time extensions), (iii) in any current or future medium and for any number 90 | of copies, and (iv) for any purpose whatsoever, including without 91 | limitation commercial, advertising or promotional purposes (the 92 | "License"). The License shall be deemed effective as of the date CC0 was 93 | applied by Affirmer to the Work. Should any part of the License for any 94 | reason be judged legally invalid or ineffective under applicable law, such 95 | partial invalidity or ineffectiveness shall not invalidate the remainder 96 | of the License, and in such case Affirmer hereby affirms that he or she 97 | will not (i) exercise any of his or her remaining Copyright and Related 98 | Rights in the Work or (ii) assert any associated claims and causes of 99 | action with respect to the Work, in either case contrary to Affirmer's 100 | express Statement of Purpose. 101 | 102 | 4. Limitations and Disclaimers. 103 | 104 | a. No trademark or patent rights held by Affirmer are waived, abandoned, 105 | surrendered, licensed or otherwise affected by this document. 106 | b. 
Affirmer offers the Work as-is and makes no representations or 107 | warranties of any kind concerning the Work, express, implied, 108 | statutory or otherwise, including without limitation warranties of 109 | title, merchantability, fitness for a particular purpose, non 110 | infringement, or the absence of latent or other defects, accuracy, or 111 | the present or absence of errors, whether or not discoverable, all to 112 | the greatest extent permissible under applicable law. 113 | c. Affirmer disclaims responsibility for clearing rights of other persons 114 | that may apply to the Work or any use thereof, including without 115 | limitation any person's Copyright and Related Rights in the Work. 116 | Further, Affirmer disclaims responsibility for obtaining any necessary 117 | consents, permissions or other rights required for any use of the 118 | Work. 119 | d. Affirmer understands and acknowledges that Creative Commons is not a 120 | party to this document and has no duty or obligation with respect to 121 | this CC0 or use of the Work. 122 | -------------------------------------------------------------------------------- /LICENSES/MIT.txt: -------------------------------------------------------------------------------- 1 | MIT License Copyright (c) 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is furnished 8 | to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice (including the next 11 | paragraph) shall be included in all copies or substantial portions of the 12 | Software. 
13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS 17 | OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 18 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF 19 | OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # KCodecs 2 | 3 | String encoding library 4 | 5 | ## Introduction 6 | 7 | KCodecs provides a collection of methods to manipulate strings using various 8 | encodings. 9 | 10 | It can automatically determine the charset of a string, translate XML entities, 11 | validate email addresses, and find encodings by name in a more tolerant way than QTextCodec 12 | (useful e.g. for data coming from the Internet). 
13 | 14 | 15 | -------------------------------------------------------------------------------- /autotests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(ECMAddTests) 2 | 3 | find_package(Qt6Test ${REQUIRED_QT_VERSION} CONFIG QUIET) 4 | 5 | if(NOT TARGET Qt6::Test) 6 | message(STATUS "Qt6Test not found, autotests will not be built.") 7 | return() 8 | endif() 9 | 10 | ecm_add_tests( 11 | kencodingprobertest.cpp 12 | rfc2047test.cpp 13 | base45test.cpp 14 | codectest.cpp 15 | kemailaddresstest.cpp 16 | LINK_LIBRARIES KF6::Codecs Qt6::Test 17 | ) 18 | 19 | ecm_add_test( 20 | kcharsetstest.cpp 21 | LINK_LIBRARIES KF6::Codecs Qt6::Test 22 | ) 23 | 24 | # Benchmark, compiled, but not run automatically with ctest 25 | add_executable(base64benchmark base64benchmark.cpp) 26 | target_link_libraries(base64benchmark KF6::Codecs Qt6::Test) 27 | -------------------------------------------------------------------------------- /autotests/base45test.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | SPDX-FileCopyrightText: 2021 Volker Krause 3 | 4 | SPDX-License-Identifier: LGPL-2.0-or-later 5 | */ 6 | 7 | #include 8 | 9 | #include 10 | 11 | class Base45Test : public QObject 12 | { 13 | Q_OBJECT 14 | private Q_SLOTS: 15 | void testBase45Decode_data() 16 | { 17 | QTest::addColumn("in"); 18 | QTest::addColumn("out"); 19 | 20 | QTest::newRow("empty") << QByteArray() << QByteArray(); 21 | 22 | // examples from the RFC - https://datatracker.ietf.org/doc/draft-faltstrom-base45/ 23 | QTest::newRow("hello") << QByteArray("%69 VD92EX0") << QByteArray("Hello!!"); 24 | QTest::newRow("base-45") << QByteArray("UJCLQE7W581") << QByteArray("base-45"); 25 | QTest::newRow("ietf") << QByteArray("QED8WEX0") << QByteArray("ietf!"); 26 | 27 | // from EU DCG test data - https://github.com/eu-digital-green-certificates/dgc-testdata 28 | QTest::newRow("eu-dcg") 29 | << QByteArray( 30 | 
"6BF+70790T9WJWG.FKY*4GO0.O1CV2 O5 " 31 | "N2FBBRW1*70HS8WY04AC*WIFN0AHCD8KD97TK0F90KECTHGWJC0FDC:5AIA%G7X+AQB9746HS80:54IBQF60R6$A80X6S1BTYACG6M+9XG8KIAWNA91AY%67092L4WJCT3EHS8XJC$+" 32 | "DXJCCWENF6OF63W5NW6WF6%JC QE/IAYJC5LEW34U3ET7DXC9 QE-ED8%E.JCBECB1A-:8$96646AL60A60S6Q$D.UDRYA " 33 | "96NF6L/5QW6307KQEPD09WEQDD+Q6TW6FA7C466KCN9E%961A6DL6FA7D46JPCT3E5JDLA7$Q6E464W5TG6..DX%DZJC6/DTZ9 QE5$CB$DA/D " 34 | "JC1/D3Z8WED1ECW.CCWE.Y92OAGY8MY9L+9MPCG/D5 C5IA5N9$PC5$CUZCY$5Y$527B+A4KZNQG5TKOWWD9FL%I8U$F7O2IBM85CWOC%LEZU4R/BXHDAHN " 35 | "11$CA5MRI:AONFN7091K9FKIGIY%VWSSSU9%01FO2*FTPQ3C3F") 36 | << QByteArray::fromHex( 37 | "789c0163019cfed28443a10126a104480c4b15512be9140159010da401624445061a60b29429041a61f39fa9390103a101a4617681aa626369782f55524e3a555643493a303" 38 | "144452f495a3132333435412f3543574c553132524e4f4239525853454f5036464738235762636f62444562646e026264746a323032312d30352d323962697374526f626572" 39 | "74204b6f63682d496e737469747574626d616d4f52472d313030303331313834626d706c45552f312f32302f3135303762736402627467693834303533393030366276706a3" 40 | "131313933343930303763646f626a313936342d30382d3132636e616da462666e6a4d75737465726d616e6e62676e654572696b6163666e746a4d55535445524d414e4e6367" 41 | "6e74654552494b416376657265312e302e305840218ebc2a2a77c1796c95a8c942987d461411b0075fd563447295250d5ead69f3b8f6083a515bd97656e87aca01529e6aa0e" 42 | "09144fc07e2884c93080f1419e82f1c66773a"); 43 | } 44 | 45 | void testBase45Decode() 46 | { 47 | QFETCH(QByteArray, in); 48 | QFETCH(QByteArray, out); 49 | 50 | QCOMPARE(KCodecs::base45Decode(in), out); 51 | } 52 | 53 | void testBase45DecodeInvalid_data() 54 | { 55 | QTest::addColumn("in"); 56 | QTest::newRow("1 byte") << QByteArray("X"); 57 | QTest::newRow("invalid chars") << QByteArray("%69 vD92Ex0"); 58 | } 59 | 60 | void testBase45DecodeInvalid() 61 | { 62 | QFETCH(QByteArray, in); 63 | 64 | // undefined return value, but must not crash or produce ASAN errors 65 | KCodecs::base45Decode(in); 66 | } 67 | }; 68 | 69 | 
QTEST_APPLESS_MAIN(Base45Test) 70 | 71 | #include "base45test.moc" 72 | -------------------------------------------------------------------------------- /autotests/base64benchmark.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | SPDX-FileCopyrightText: 2010 Volker Krause 3 | 4 | SPDX-License-Identifier: LGPL-2.0-or-later 5 | */ 6 | 7 | #include "../src/kcodecsbase64.h" 8 | 9 | #include 10 | 11 | #include 12 | #include 13 | #include 14 | 15 | class Base64Benchmark : public QObject 16 | { 17 | Q_OBJECT 18 | private: 19 | static QByteArray fillByteArray(int size) 20 | { 21 | char c = 0; 22 | QByteArray result; 23 | result.reserve(size); 24 | while (result.size() < size) { 25 | result.append(c++); 26 | } 27 | return result; 28 | } 29 | 30 | void createTestSet() 31 | { 32 | QTest::addColumn("output"); 33 | QTest::addColumn("input"); 34 | QTest::newRow("empty") << QByteArray() << QByteArray(); 35 | QTest::newRow("128") << fillByteArray(128) << KCodecs::base64Encode(fillByteArray(128)); 36 | QTest::newRow("1k") << fillByteArray(1 << 10) << KCodecs::base64Encode(fillByteArray(1 << 10)); 37 | QTest::newRow("1M") << fillByteArray(1 << 20) << KCodecs::base64Encode(fillByteArray(1 << 20)); 38 | } 39 | private Q_SLOTS: 40 | void benchmarkKCodecDecode_data() 41 | { 42 | createTestSet(); 43 | } 44 | 45 | void benchmarkKCodecDecode() 46 | { 47 | QFETCH(QByteArray, input); 48 | QFETCH(QByteArray, output); 49 | QByteArray result; 50 | QBENCHMARK { 51 | result = KCodecs::base64Decode(input); 52 | } 53 | QCOMPARE(result, output); 54 | } 55 | 56 | void benchmarkQByteArrayDecode_data() 57 | { 58 | createTestSet(); 59 | } 60 | 61 | void benchmarkQByteArrayDecode() 62 | { 63 | QFETCH(QByteArray, input); 64 | QFETCH(QByteArray, output); 65 | QByteArray result; 66 | QBENCHMARK { 67 | result = QByteArray::fromBase64(input); 68 | } 69 | QCOMPARE(result, output); 70 | } 71 | 72 | void benchmarkKMimeBase64Decoder_data() 73 | { 74 | 
createTestSet(); 75 | } 76 | 77 | /* Benchmark: inside QBENCHMARK, looks up the "base64" codec, decodes the fetched `input` through a Decoder created by makeDecoder() into a buffer sized by maxDecodedSizeFor(), and truncates the buffer to the bytes actually written; the final result must equal the fetched `output`. */ void benchmarkKMimeBase64Decoder() 78 | { 79 | QFETCH(QByteArray, input); 80 | QFETCH(QByteArray, output); 81 | QByteArray result; 82 | QBENCHMARK { 83 | KCodecs::Codec *codec = KCodecs::Codec::codecForName("base64"); 84 | QVERIFY(codec); 85 | result.resize(codec->maxDecodedSizeFor(input.size())); 86 | KCodecs::Decoder *decoder = codec->makeDecoder(); 87 | QByteArray::const_iterator inputIt = input.constBegin(); 88 | QByteArray::iterator resultIt = result.begin(); 89 | decoder->decode(inputIt, input.constEnd(), resultIt, result.constEnd()); 90 | result.truncate(resultIt - result.begin()); 91 | delete decoder; 92 | } 93 | QCOMPARE(result, output); 94 | } 95 | }; 96 | 97 | QTEST_MAIN(Base64Benchmark) 98 | 99 | #include "base64benchmark.moc" 100 | -------------------------------------------------------------------------------- /autotests/codectest.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | SPDX-FileCopyrightText: 2010 Thomas McGuire 3 | 4 | SPDX-License-Identifier: LGPL-2.0-or-later 5 | */ 6 | #include "codectest.h" 7 | 8 | #include 9 | 10 | #include 11 | 12 | #include "../src/kcodecs.h" 13 | 14 | using namespace KCodecs; 15 | 16 | QTEST_MAIN(CodecTest) 17 | 18 | /* Direction exercised by one data row of testCodecs(). */ enum Mode { 19 | Decode, 20 | Encode, 21 | }; 22 | Q_DECLARE_METATYPE(Mode) 23 | 24 | /* Data provider for testCodecs(): locates the directory that contains data/binary_data, walks its "codec_<name>" subdirectories, and adds one row per "<file>.expected" file found there, pairing it with the data file "<file>". */ void CodecTest::testCodecs_data() 25 | { 26 | QTest::addColumn("input"); 27 | QTest::addColumn("expResult"); 28 | QTest::addColumn("codecName"); 29 | QTest::addColumn("tag"); 30 | QTest::addColumn("mode"); 31 | 32 | QString dataDir = QFINDTESTDATA("data/binary_data"); 33 | QVERIFY(!dataDir.isEmpty()); 34 | dataDir.chop(QByteArrayView("binary_data").size()); 35 | QDir codecBaseDir(dataDir); 36 | const QStringList lst = codecBaseDir.entryList(QStringList(), QDir::Dirs | QDir::NoDotAndDotDot, QDir::NoSort); 37 | for (const QString &dir : lst) { 38 | if (dir.toLower().startsWith(QLatin1String("codec_"))) { 39 | const QString codecName = 
dir.right(dir.size() - 6); /* directory name minus the 6-character "codec_" prefix */ 40 | QDir codecDir(codecBaseDir.path() + QLatin1String("/") + dir); 41 | const QStringList lst2 = codecDir.entryList(QStringList(), QDir::Files, QDir::NoSort); 42 | for (const QString &file : lst2) { 43 | if (file.toLower().endsWith(QLatin1String(".expected"))) { 44 | const QString dataFileNameBase = file.left(file.size() - 9); /* file name minus the 9-character ".expected" suffix, i.e. the input file */ 45 | QFile dataFile(codecDir.path() + QLatin1Char('/') + dataFileNameBase); 46 | QFile expectedFile(codecDir.path() + QLatin1Char('/') + file); 47 | QVERIFY(dataFile.open(QIODevice::ReadOnly)); 48 | QVERIFY(expectedFile.open(QIODevice::ReadOnly)); 49 | 50 | /* Decode is the default; only a "-encode" marker in the file name switches the row to Encode */ Mode mode = Decode; 51 | if (file.contains(QLatin1String("-decode"))) { 52 | mode = Decode; 53 | } else if (file.contains(QLatin1String("-encode"))) { 54 | mode = Encode; 55 | } 56 | 57 | const QByteArray data = dataFile.readAll(); 58 | const QByteArray expected = expectedFile.readAll(); 59 | 60 | const QString tag = codecName + QLatin1Char('/') + dataFileNameBase; 61 | QTest::newRow(tag.toLatin1().constData()) << data << expected << codecName.toLatin1() << tag << mode; 62 | 63 | dataFile.close(); 64 | expectedFile.close(); 65 | } 66 | } 67 | } 68 | } 69 | } 70 | 71 | /* Runs one data row: resolves the codec by name, applies decode() or encode() with Codec::NewlineLF according to `mode`, and compares the output with `expResult`. */ void CodecTest::testCodecs() 72 | { 73 | QFETCH(QByteArray, input); 74 | QFETCH(QByteArray, expResult); 75 | QFETCH(QByteArray, codecName); 76 | QFETCH(QString, tag); 77 | QFETCH(Mode, mode); 78 | 79 | Codec *codec = Codec::codecForName(codecName); 80 | QVERIFY(codec); 81 | 82 | /* the list is never filled in this function, so the QEXPECT_FAIL branch below does not trigger */ QStringList blacklistedTags; 83 | if (blacklistedTags.contains(tag)) { 84 | QEXPECT_FAIL(tag.toLatin1().constData(), "Codec broken", Continue); 85 | } 86 | 87 | QByteArray result; 88 | if (mode == Decode) { 89 | result = codec->decode(input, Codec::NewlineLF); 90 | } else { 91 | result = codec->encode(input, Codec::NewlineLF); 92 | } 93 | 94 | // More usable version of QCOMPARE(result, expResult), in case the difference is at the end... 
95 | if (result != expResult) { 96 | const QList lines = result.split('\n'); 97 | const QList expLines = expResult.split('\n'); 98 | /* the line-by-line comparison is only made when both sides split into the same number of lines */ if (lines.count() == expLines.count()) { 99 | QCOMPARE(result.split('\n'), expResult.split('\n')); 100 | } 101 | } 102 | QCOMPARE(result, expResult); 103 | } 104 | 105 | /* Looking up a made-up codec name must return nullptr. */ void CodecTest::testInvalidCodec() 106 | { 107 | Codec *codec = Codec::codecForName("thiscodectotallydoesntexist"); 108 | QCOMPARE(codec, nullptr); 109 | } 110 | 111 | #include "moc_codectest.cpp" 112 | -------------------------------------------------------------------------------- /autotests/codectest.h: -------------------------------------------------------------------------------- 1 | /* 2 | SPDX-FileCopyrightText: 2010 Thomas McGuire 3 | 4 | SPDX-License-Identifier: LGPL-2.0-or-later 5 | */ 6 | #ifndef CODECTEST_H 7 | #define CODECTEST_H 8 | 9 | #include 10 | 11 | /* Test object whose private slots are defined in codectest.cpp: the data-driven testCodecs() with its testCodecs_data() provider, plus testInvalidCodec(). */ class CodecTest : public QObject 12 | { 13 | Q_OBJECT 14 | private Q_SLOTS: 15 | void testCodecs(); 16 | void testCodecs_data(); 17 | void testInvalidCodec(); 18 | }; 19 | 20 | #endif 21 | -------------------------------------------------------------------------------- /autotests/data/binary_data: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/binary_data -------------------------------------------------------------------------------- /autotests/data/codec_b/basic-decode.b: -------------------------------------------------------------------------------- 1 | AAECAwQFBgcICQoLDA0ODxAREhMUFRYXGBkaGxwdHh8gISIjJCUmJygpKissLS4vMDEyMzQ1Njc4OTo7PD0+P0BBQkNERUZHSElKS0xNTk9QUVJTVFVWV1hZWltcXV5fYGFiY2RlZmdoaWprbG1ub3BxcnN0dXZ3eHl6e3x9fn+AgYKDhIWGh4iJiouMjY6PkJGSk5SVlpeYmZqbnJ2en6ChoqOkpaanqKmqq6ytrq+wsbKztLW2t7i5uru8vb6/wMHCw8TFxsfIycrLzM3Oz9DR0tPU1dbX2Nna29zd3t/g4eLj5OXm5+jp6uvs7e7v8PHy8/T19vf4+fr7/P3+/w== 
-------------------------------------------------------------------------------- /autotests/data/codec_b/basic-decode.b.expected: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/codec_b/basic-decode.b.expected -------------------------------------------------------------------------------- /autotests/data/codec_b/basic-encode: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/codec_b/basic-encode -------------------------------------------------------------------------------- /autotests/data/codec_b/basic-encode.expected: -------------------------------------------------------------------------------- 1 | AAECAwQFBgcICQoLDA0ODxAREhMUFRYXGBkaGxwdHh8gISIjJCUmJygpKissLS4vMDEyMzQ1Njc4OTo7PD0+P0BBQkNERUZHSElKS0xNTk9QUVJTVFVWV1hZWltcXV5fYGFiY2RlZmdoaWprbG1ub3BxcnN0dXZ3eHl6e3x9fn+AgYKDhIWGh4iJiouMjY6PkJGSk5SVlpeYmZqbnJ2en6ChoqOkpaanqKmqq6ytrq+wsbKztLW2t7i5uru8vb6/wMHCw8TFxsfIycrLzM3Oz9DR0tPU1dbX2Nna29zd3t/g4eLj5OXm5+jp6uvs7e7v8PHy8/T19vf4+fr7/P3+/w== -------------------------------------------------------------------------------- /autotests/data/codec_b/null-decode.b: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/codec_b/null-decode.b -------------------------------------------------------------------------------- /autotests/data/codec_b/null-decode.b.expected: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/codec_b/null-decode.b.expected -------------------------------------------------------------------------------- 
/autotests/data/codec_b/null-encode: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/codec_b/null-encode -------------------------------------------------------------------------------- /autotests/data/codec_b/null-encode.expected: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/codec_b/null-encode.expected -------------------------------------------------------------------------------- /autotests/data/codec_b/padding0-encode: -------------------------------------------------------------------------------- 1 | abc -------------------------------------------------------------------------------- /autotests/data/codec_b/padding0-encode.expected: -------------------------------------------------------------------------------- 1 | YWJj -------------------------------------------------------------------------------- /autotests/data/codec_b/padding1-encode: -------------------------------------------------------------------------------- 1 | ab -------------------------------------------------------------------------------- /autotests/data/codec_b/padding1-encode.expected: -------------------------------------------------------------------------------- 1 | YWI= -------------------------------------------------------------------------------- /autotests/data/codec_b/padding2-encode: -------------------------------------------------------------------------------- 1 | a -------------------------------------------------------------------------------- /autotests/data/codec_b/padding2-encode.expected: -------------------------------------------------------------------------------- 1 | YQ== -------------------------------------------------------------------------------- 
/autotests/data/codec_base64/basic-decode.base64: -------------------------------------------------------------------------------- 1 | AAECAwQFBgcICQoLDA0ODxAREhMUFRYXGBkaGxwdHh8gISIjJCUmJygpKissLS4vMDEyMzQ1Njc4 2 | OTo7PD0+P0BBQkNERUZHSElKS0xNTk9QUVJTVFVWV1hZWltcXV5fYGFiY2RlZmdoaWprbG1ub3Bx 3 | cnN0dXZ3eHl6e3x9fn+AgYKDhIWGh4iJiouMjY6PkJGSk5SVlpeYmZqbnJ2en6ChoqOkpaanqKmq 4 | q6ytrq+wsbKztLW2t7i5uru8vb6/wMHCw8TFxsfIycrLzM3Oz9DR0tPU1dbX2Nna29zd3t/g4eLj 5 | 5OXm5+jp6uvs7e7v8PHy8/T19vf4+fr7/P3+/w== 6 | -------------------------------------------------------------------------------- /autotests/data/codec_base64/basic-decode.base64.expected: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/codec_base64/basic-decode.base64.expected -------------------------------------------------------------------------------- /autotests/data/codec_base64/basic-encode: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/codec_base64/basic-encode -------------------------------------------------------------------------------- /autotests/data/codec_base64/basic-encode.expected: -------------------------------------------------------------------------------- 1 | AAECAwQFBgcICQoLDA0ODxAREhMUFRYXGBkaGxwdHh8gISIjJCUmJygpKissLS4vMDEyMzQ1Njc4 2 | OTo7PD0+P0BBQkNERUZHSElKS0xNTk9QUVJTVFVWV1hZWltcXV5fYGFiY2RlZmdoaWprbG1ub3Bx 3 | cnN0dXZ3eHl6e3x9fn+AgYKDhIWGh4iJiouMjY6PkJGSk5SVlpeYmZqbnJ2en6ChoqOkpaanqKmq 4 | q6ytrq+wsbKztLW2t7i5uru8vb6/wMHCw8TFxsfIycrLzM3Oz9DR0tPU1dbX2Nna29zd3t/g4eLj 5 | 5OXm5+jp6uvs7e7v8PHy8/T19vf4+fr7/P3+/w== 6 | -------------------------------------------------------------------------------- /autotests/data/codec_base64/corrupt.base64: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/codec_base64/corrupt.base64 -------------------------------------------------------------------------------- /autotests/data/codec_base64/corrupt.base64.expected: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/codec_base64/corrupt.base64.expected -------------------------------------------------------------------------------- /autotests/data/codec_base64/very_small-encode: -------------------------------------------------------------------------------- 1 | 12 -------------------------------------------------------------------------------- /autotests/data/codec_base64/very_small-encode.expected: -------------------------------------------------------------------------------- 1 | MTI= 2 | -------------------------------------------------------------------------------- /autotests/data/codec_q/all-encoded-decode.q: -------------------------------------------------------------------------------- 1 | =00=01=02=03=04=05=06=07=08=09=0A=0B=0C=0D=0E=0F=10=11=12=13=14=15=16=17=18=19=1A=1B=1C=1D=1E=1F=20=21=22=23=24=25=26=27=28=29=2A=2B=2C=2D=2E=2F=30=31=32=33=34=35=36=37=38=39=3A=3B=3C=3D=3E=3F=40=41=42=43=44=45=46=47=48=49=4A=4B=4C=4D=4E=4F=50=51=52=53=54=55=56=57=58=59=5A=5B=5C=5D=5E=5F=60=61=62=63=64=65=66=67=68=69=6A=6B=6C=6D=6E=6F=70=71=72=73=74=75=76=77=78=79=7A=7B=7C=7D=7E=7F=80=81=82=83=84=85=86=87=88=89=8A=8B=8C=8D=8E=8F=90=91=92=93=94=95=96=97=98=99=9A=9B=9C=9D=9E=9F=A0=A1=A2=A3=A4=A5=A6=A7=A8=A9=AA=AB=AC=AD=AE=AF=B0=B1=B2=B3=B4=B5=B6=B7=B8=B9=BA=BB=BC=BD=BE=BF=C0=C1=C2=C3=C4=C5=C6=C7=C8=C9=CA=CB=CC=CD=CE=CF=D0=D1=D2=D3=D4=D5=D6=D7=D8=D9=DA=DB=DC=DD=DE=DF=E0=E1=E2=E3=E4=E5=E6=E7=E8=E9=EA=EB=EC=ED=EE=EF=F0=F1=F2=F3=F4=F5=F6=F7=F8=F9=FA=FB=FC=FD=FE=FF -------------------------------------------------------------------------------- 
/autotests/data/codec_q/all-encoded-decode.q.expected: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/codec_q/all-encoded-decode.q.expected -------------------------------------------------------------------------------- /autotests/data/codec_q/basic-encode: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/codec_q/basic-encode -------------------------------------------------------------------------------- /autotests/data/codec_q/basic-encode.expected: -------------------------------------------------------------------------------- 1 | =00=01=02=03=04=05=06=07=08=09=0A=0B=0C=0D=0E=0F=10=11=12=13=14=15=16=17=18=19=1A=1B=1C=1D=1E=1F_!=22=23=24=25=26=27=28=29*+=2C-=2E/0123456789=3A=3B=3C=3D=3E=3F=40ABCDEFGHIJKLMNOPQRSTUVWXYZ=5B=5C=5D=5E=5F=60abcdefghijklmnopqrstuvwxyz=7B=7C=7D=7E=7F=80=81=82=83=84=85=86=87=88=89=8A=8B=8C=8D=8E=8F=90=91=92=93=94=95=96=97=98=99=9A=9B=9C=9D=9E=9F=A0=A1=A2=A3=A4=A5=A6=A7=A8=A9=AA=AB=AC=AD=AE=AF=B0=B1=B2=B3=B4=B5=B6=B7=B8=B9=BA=BB=BC=BD=BE=BF=C0=C1=C2=C3=C4=C5=C6=C7=C8=C9=CA=CB=CC=CD=CE=CF=D0=D1=D2=D3=D4=D5=D6=D7=D8=D9=DA=DB=DC=DD=DE=DF=E0=E1=E2=E3=E4=E5=E6=E7=E8=E9=EA=EB=EC=ED=EE=EF=F0=F1=F2=F3=F4=F5=F6=F7=F8=F9=FA=FB=FC=FD=FE=FF -------------------------------------------------------------------------------- /autotests/data/codec_q/null-decode.q: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/codec_q/null-decode.q -------------------------------------------------------------------------------- /autotests/data/codec_q/null-decode.q.expected: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/codec_q/null-decode.q.expected -------------------------------------------------------------------------------- /autotests/data/codec_q/null-encode: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/codec_q/null-encode -------------------------------------------------------------------------------- /autotests/data/codec_q/null-encode.expected: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/codec_q/null-encode.expected -------------------------------------------------------------------------------- /autotests/data/codec_quoted-printable/basic-decode.quoted-printable: -------------------------------------------------------------------------------- 1 | =00=01=02=03=04=05=06=07=08=09 2 | =0B=0C=0D=0E=0F=10=11=12=13=14=15=16=17=18=19=1A=1B=1C=1D=1E=1F !"#$%&'()*+= 3 | ,-./0123456789:;<=3D>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrst= 4 | uvwxyz{|}~=7F=80=81=82=83=84=85=86=87=88=89=8A=8B=8C=8D=8E=8F=90=91=92=93= 5 | =94=95=96=97=98=99=9A=9B=9C=9D=9E=9F=A0=A1=A2=A3=A4=A5=A6=A7=A8=A9=AA=AB=AC= 6 | =AD=AE=AF=B0=B1=B2=B3=B4=B5=B6=B7=B8=B9=BA=BB=BC=BD=BE=BF=C0=C1=C2=C3=C4=C5= 7 | =C6=C7=C8=C9=CA=CB=CC=CD=CE=CF=D0=D1=D2=D3=D4=D5=D6=D7=D8=D9=DA=DB=DC=DD=DE= 8 | =DF=E0=E1=E2=E3=E4=E5=E6=E7=E8=E9=EA=EB=EC=ED=EE=EF=F0=F1=F2=F3=F4=F5=F6=F7= 9 | =F8=F9=FA=FB=FC=FD=FE=FF -------------------------------------------------------------------------------- /autotests/data/codec_quoted-printable/basic-decode.quoted-printable.expected: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/codec_quoted-printable/basic-decode.quoted-printable.expected -------------------------------------------------------------------------------- /autotests/data/codec_quoted-printable/basic-encode: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/codec_quoted-printable/basic-encode -------------------------------------------------------------------------------- /autotests/data/codec_quoted-printable/basic-encode.expected: -------------------------------------------------------------------------------- 1 | =00=01=02=03=04=05=06=07=08=09 2 | =0B=0C=0D=0E=0F=10=11=12=13=14=15=16=17=18=19=1A=1B=1C=1D=1E=1F !"#$%&'()*+= 3 | ,-./0123456789:;<=3D>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrst= 4 | uvwxyz{|}~=7F=80=81=82=83=84=85=86=87=88=89=8A=8B=8C=8D=8E=8F=90=91=92=93= 5 | =94=95=96=97=98=99=9A=9B=9C=9D=9E=9F=A0=A1=A2=A3=A4=A5=A6=A7=A8=A9=AA=AB=AC= 6 | =AD=AE=AF=B0=B1=B2=B3=B4=B5=B6=B7=B8=B9=BA=BB=BC=BD=BE=BF=C0=C1=C2=C3=C4=C5= 7 | =C6=C7=C8=C9=CA=CB=CC=CD=CE=CF=D0=D1=D2=D3=D4=D5=D6=D7=D8=D9=DA=DB=DC=DD=DE= 8 | =DF=E0=E1=E2=E3=E4=E5=E6=E7=E8=E9=EA=EB=EC=ED=EE=EF=F0=F1=F2=F3=F4=F5=F6=F7= 9 | =F8=F9=FA=FB=FC=FD=FE=FF -------------------------------------------------------------------------------- /autotests/data/codec_quoted-printable/corrupt.quoted-printable: -------------------------------------------------------------------------------- 1 | A =3D wasn't properly encoded (should be kept): APE=MAN MAN=APE 2 | A =3D wasn't properly encoded (lowercase): ape=man man=ape 3 | Lowercase hexchars: =bb=a1=4b=44=45 =72=75=6c=65=7a=21=ab 4 | Mixed-case hexchars: =Bb=A1=4B=44=45 =72=75=6C=65=7A=21=aB 5 | A misplaced (unencoded =3D), followed by whitespace: = not at end! 
6 | Two consecutive =3D at the end of the line: == 7 | A misplaced (unencoded =3D), as the ultimate character: = -------------------------------------------------------------------------------- /autotests/data/codec_quoted-printable/corrupt.quoted-printable.expected: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/codec_quoted-printable/corrupt.quoted-printable.expected -------------------------------------------------------------------------------- /autotests/data/codec_quoted-printable/corrupt2.quoted-printable: -------------------------------------------------------------------------------- 1 | An incomplete encoded character at the very end of the encoded data: =a -------------------------------------------------------------------------------- /autotests/data/codec_quoted-printable/corrupt2.quoted-printable.expected: -------------------------------------------------------------------------------- 1 | An incomplete encoded character at the very end of the encoded data: =a -------------------------------------------------------------------------------- /autotests/data/codec_quoted-printable/corrupt3.quoted-printable: -------------------------------------------------------------------------------- 1 | An invalid encoded character at the very end of the encoded data: =ax -------------------------------------------------------------------------------- /autotests/data/codec_quoted-printable/corrupt3.quoted-printable.expected: -------------------------------------------------------------------------------- 1 | An invalid encoded character at the very end of the encoded data: =ax -------------------------------------------------------------------------------- /autotests/data/codec_quoted-printable/corrupt4.quoted-printable: -------------------------------------------------------------------------------- 1 | Two =3D at the very end of 
the encoded data: == -------------------------------------------------------------------------------- /autotests/data/codec_quoted-printable/corrupt4.quoted-printable.expected: -------------------------------------------------------------------------------- 1 | Two = at the very end of the encoded data: == -------------------------------------------------------------------------------- /autotests/data/codec_quoted-printable/wrap-encode: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/codec_quoted-printable/wrap-encode -------------------------------------------------------------------------------- /autotests/data/codec_quoted-printable/wrap-encode.expected: -------------------------------------------------------------------------------- 1 | This is a line without a special char at the end. 2 | This is a line with a space at the end.=20 3 | This is a line with multiple spaces at the end. =20 4 | This is a line with a tab at the end.=09 5 | This is a line with an umlaut at the end.=E4 6 | This is a line with an umlaut and a space at the end.=E4=20 7 | This is a line with an umlaut and a tab at the end.=E4=09 8 | =46rom This is a line with From at the beginning. 9 | =2EThis is a line with a dot at the beginning. 10 | =2DThis is a line with a dash at the beginning. 11 | 12 | This is a very long line (=E4 ) which just happens to be wrapped so that a = 13 | =46rom appears at the beginning of the second line. Furthermore, this break= 14 | =2E makes a dot appear as the first character on the third line. 15 | 16 | Just long enough: xxxxxxxx This is a line without a special char at the end. 17 | Just too long: xxxxxxxxxxxx This is a line without a special char at the en= 18 | d. 19 | xxxxxxxxxxxxxxxxxxxxxxxxxxxx This is a line without a special char at the e= 20 | nd. 
21 | xxxxxxxxxxxxxxxxxxxxxxxxxxxxx This is a line without a special char at the = 22 | end. 23 | 24 | Just long enough: xxxxxxxxxxxxxxx This is a line with a space at the end.=20 25 | Just too long: xxxxxxxxxxxxxxxxxxx This is a line with a space at the end.= 26 | =20 27 | xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx This is a line with a space at the end.= 28 | =20 29 | xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx This is a line with a space at the end= 30 | =2E=20 31 | 32 | Just long enough: xxxxxxxxxxxxxxxxx This is a line with a tab at the end.=09 33 | Just too long: xxxxxxxxxxxxxxxxxxxxx This is a line with a tab at the end.= 34 | =09 35 | xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx This is a line with a tab at the end.= 36 | =09 37 | xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx This is a line with a tab at the end= 38 | =2E=09 39 | 40 | Just long enough: xxxxxxxxxxxxx This is a line with an umlaut at the end.=E4 41 | Just too long: xxxxxxxxxxxxxxxxx This is a line with an umlaut at the end.= 42 | =E4 43 | xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx This is a line with an umlaut at the end.= 44 | =E4 45 | xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx This is a line with an umlaut at the end= 46 | =2E=E4 47 | 48 | Just long enough This is a line with an umlaut and a space at the end.=E4=20 49 | Just too long: xx This is a line with an umlaut and a space at the end.=E4= 50 | =20 51 | xxxxxxxxxxxxxxxxxx This is a line with an umlaut and a space at the end.=E4= 52 | =20 53 | xxxxxxxxxxxxxxxxxxx This is a line with an umlaut and a space at the end.= 54 | =E4=20 55 | 56 | Just long enough: This is a line with an umlaut and a tab at the end.=E4=09 57 | Just too long: xxxx This is a line with an umlaut and a tab at the end.=E4= 58 | =09 59 | xxxxxxxxxxxxxxxxxxxx This is a line with an umlaut and a tab at the end.=E4= 60 | =09 61 | xxxxxxxxxxxxxxxxxxxxx This is a line with an umlaut and a tab at the end.= 62 | =E4=09 63 | 64 | This line has a space at the end and ends the buffer=20 
-------------------------------------------------------------------------------- /autotests/data/codec_x-kmime-rfc2231/all-encoded.x-kmime-rfc2231-decode: -------------------------------------------------------------------------------- 1 | %00%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D%0E%0F%10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F%20%21%22%23%24%25%26%27%28%29%2A%2B%2C%2D%2E%2F%30%31%32%33%34%35%36%37%38%39%3A%3B%3C%3D%3E%3F%40%41%42%43%44%45%46%47%48%49%4A%4B%4C%4D%4E%4F%50%51%52%53%54%55%56%57%58%59%5A%5B%5C%5D%5E%5F%60%61%62%63%64%65%66%67%68%69%6A%6B%6C%6D%6E%6F%70%71%72%73%74%75%76%77%78%79%7A%7B%7C%7D%7E%7F%80%81%82%83%84%85%86%87%88%89%8A%8B%8C%8D%8E%8F%90%91%92%93%94%95%96%97%98%99%9A%9B%9C%9D%9E%9F%A0%A1%A2%A3%A4%A5%A6%A7%A8%A9%AA%AB%AC%AD%AE%AF%B0%B1%B2%B3%B4%B5%B6%B7%B8%B9%BA%BB%BC%BD%BE%BF%C0%C1%C2%C3%C4%C5%C6%C7%C8%C9%CA%CB%CC%CD%CE%CF%D0%D1%D2%D3%D4%D5%D6%D7%D8%D9%DA%DB%DC%DD%DE%DF%E0%E1%E2%E3%E4%E5%E6%E7%E8%E9%EA%EB%EC%ED%EE%EF%F0%F1%F2%F3%F4%F5%F6%F7%F8%F9%FA%FB%FC%FD%FE%FF -------------------------------------------------------------------------------- /autotests/data/codec_x-kmime-rfc2231/all-encoded.x-kmime-rfc2231-decode.expected: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/codec_x-kmime-rfc2231/all-encoded.x-kmime-rfc2231-decode.expected -------------------------------------------------------------------------------- /autotests/data/codec_x-kmime-rfc2231/basic-encode: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/codec_x-kmime-rfc2231/basic-encode -------------------------------------------------------------------------------- /autotests/data/codec_x-kmime-rfc2231/basic-encode.expected: -------------------------------------------------------------------------------- 1 | 
%00%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D%0E%0F%10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F%20!%22%23%24%25%26%27%28%29%2A+%2C-%2E%2F0123456789%3A%3B%3C%3D%3E%3F%40ABCDEFGHIJKLMNOPQRSTUVWXYZ%5B%5C%5D%5E%5F%60abcdefghijklmnopqrstuvwxyz%7B%7C%7D%7E%7F%80%81%82%83%84%85%86%87%88%89%8A%8B%8C%8D%8E%8F%90%91%92%93%94%95%96%97%98%99%9A%9B%9C%9D%9E%9F%A0%A1%A2%A3%A4%A5%A6%A7%A8%A9%AA%AB%AC%AD%AE%AF%B0%B1%B2%B3%B4%B5%B6%B7%B8%B9%BA%BB%BC%BD%BE%BF%C0%C1%C2%C3%C4%C5%C6%C7%C8%C9%CA%CB%CC%CD%CE%CF%D0%D1%D2%D3%D4%D5%D6%D7%D8%D9%DA%DB%DC%DD%DE%DF%E0%E1%E2%E3%E4%E5%E6%E7%E8%E9%EA%EB%EC%ED%EE%EF%F0%F1%F2%F3%F4%F5%F6%F7%F8%F9%FA%FB%FC%FD%FE%FF -------------------------------------------------------------------------------- /autotests/data/codec_x-kmime-rfc2231/null-decode.x-kmime-rfc2231: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/codec_x-kmime-rfc2231/null-decode.x-kmime-rfc2231 -------------------------------------------------------------------------------- /autotests/data/codec_x-kmime-rfc2231/null-decode.x-kmime-rfc2231.expected: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/codec_x-kmime-rfc2231/null-decode.x-kmime-rfc2231.expected -------------------------------------------------------------------------------- /autotests/data/codec_x-kmime-rfc2231/null-encode: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/codec_x-kmime-rfc2231/null-encode -------------------------------------------------------------------------------- /autotests/data/codec_x-kmime-rfc2231/null-encode.expected: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/codec_x-kmime-rfc2231/null-encode.expected -------------------------------------------------------------------------------- /autotests/data/codec_x-uuencode/basic-decode.x-uuencode: -------------------------------------------------------------------------------- 1 | begin 664 foo 2 | M``$"`P0%!@<("0H+#`T.#Q`1$A,4%187&!D:&QP='A\@(2(C)"4F)R@I*BLL 3 | M+2XO,#$R,S0U-C'EZ>WQ]?G^`@8*#A(6& 5 | MAXB)BHN,C8Z/D)&2DY25EI>8F9J;G)V>GZ"AHJ.DI::GJ*FJJZRMKJ^PL;*S 6 | MM+6VM[BYNKN\O;Z_P,'"P\3%QL?(R+CY.7FY^CIZNOL[>[O\/'R\_3U]O?X^?K[_/W^_P`` 8 | ` 9 | end 10 | -------------------------------------------------------------------------------- /autotests/data/codec_x-uuencode/basic-decode.x-uuencode.expected: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KDE/kcodecs/71e4e5dfef6a6e67eb012efa3a24501d34baef35/autotests/data/codec_x-uuencode/basic-decode.x-uuencode.expected -------------------------------------------------------------------------------- /autotests/kcharsetstest.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | SPDX-FileCopyrightText: 2011 Romain Perier 3 | 4 | SPDX-License-Identifier: GPL-2.0-or-later 5 | */ 6 | 7 | #include "kcharsetstest.h" 8 | 9 | #include "kcharsets_p.h" 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | using namespace Qt::Literals; 16 | 17 | /* Returns true when at least one string in `descriptions` contains `encodingName`. */ static bool encodingNameHasADescription(const QString &encodingName, const QStringList &descriptions) 18 | { 19 | return std::any_of(descriptions.cbegin(), descriptions.cend(), [&encodingName](const QString &description) { 20 | return description.contains(encodingName); 21 | }); 22 | } 23 | 24 | /* KCharsets::charsets() must be non-null and return the same pointer on every call. */ void KCharsetsTest::testSingleton() 25 | { 26 | QVERIFY(KCharsets::charsets() != nullptr); 27 | QCOMPARE(KCharsets::charsets(), KCharsets::charsets()); 28 | } 29 | 30 | /* Checks fromEntity() against expected QChar values; the two names with a "garbagesuffix" tail must give a default-constructed QChar. NOTE(review): the first two inputs look like entity-decoded residue of numeric character references - confirm against upstream. */ void 
KCharsetsTest::testFromEntity() 31 | { 32 | KCharsets *singleton = KCharsets::charsets(); 33 | 34 | QCOMPARE(singleton->fromEntity(QString::fromLatin1("Ӓ")), QChar(1234)); 35 | QCOMPARE(singleton->fromEntity(QString::fromLatin1("ሴ")), QChar(0x1234)); 36 | QCOMPARE(singleton->fromEntity(QString::fromLatin1("lt")), QChar::fromLatin1('<')); 37 | QCOMPARE(singleton->fromEntity(QString::fromLatin1("gt")), QChar::fromLatin1('>')); 38 | QCOMPARE(singleton->fromEntity(QString::fromLatin1("quot")), QChar::fromLatin1('"')); 39 | QCOMPARE(singleton->fromEntity(QString::fromLatin1("amp")), QChar::fromLatin1('&')); 40 | QCOMPARE(singleton->fromEntity(QString::fromLatin1("apos")), QChar::fromLatin1('\'')); 41 | QCOMPARE(singleton->fromEntity(u"aposgarbagesuffix"_s), QChar()); 42 | QCOMPARE(singleton->fromEntity(u"thetasym"_s), QChar(0x03d1)); 43 | QCOMPARE(singleton->fromEntity(u"thetasymgarbagesuffix"_s), QChar()); 44 | } 45 | 46 | /* Placeholder that always skips: no toEntity() test has been written. */ void KCharsetsTest::testToEntity() 47 | { 48 | QSKIP("KCharsets::toEntity test not implemented."); 49 | } 50 | 51 | /* Checks resolveEntities() on one string. NOTE(review): both literals look damaged by entity/markup stripping (the input literal is not well-formed C++) - restore them from upstream. */ void KCharsetsTest::testResolveEntities() 52 | { 53 | KCharsets *singleton = KCharsets::charsets(); 54 | 55 | QCOMPARE(singleton->resolveEntities(QString::fromLatin1(""'<Hello &World>'"")), 56 | QString::fromLatin1("\"\'\'\"")); 57 | } 58 | 59 | /* availableEncodingNames() and descriptiveEncodingNames() must have equal length, and every available name must appear in some descriptive name, have a non-empty description, and map back to itself via encodingForName(descriptionForEncoding(name)). */ void KCharsetsTest::testEncodingNames() 60 | { 61 | KCharsets *singleton = KCharsets::charsets(); 62 | 63 | QCOMPARE(singleton->availableEncodingNames().count(), singleton->descriptiveEncodingNames().count()); 64 | 65 | for (const QString &encodingName : singleton->availableEncodingNames()) { 66 | QVERIFY(encodingNameHasADescription(encodingName, singleton->descriptiveEncodingNames())); 67 | QVERIFY(!singleton->descriptionForEncoding(encodingName).isEmpty()); 68 | QCOMPARE(singleton->encodingForName(singleton->descriptionForEncoding(encodingName)), encodingName); 69 | } 70 | } 71 | 72 | QTEST_MAIN(KCharsetsTest) 73 | 74 | #include "moc_kcharsetstest.cpp" 75 | 
-------------------------------------------------------------------------------- /autotests/kcharsetstest.h: -------------------------------------------------------------------------------- 1 | /* 2 | SPDX-FileCopyrightText: 2011 Romain Perier 3 | 4 | SPDX-License-Identifier: GPL-2.0-or-later 5 | */ 6 | 7 | #ifndef KCHARSETSTEST_H 8 | #define KCHARSETSTEST_H 9 | 10 | #include 11 | 12 | /* Test object for KCharsets; its slots are defined in kcharsetstest.cpp. */ class KCharsetsTest : public QObject 13 | { 14 | Q_OBJECT 15 | private Q_SLOTS: 16 | void testSingleton(); 17 | void testFromEntity(); 18 | void testToEntity(); 19 | void testResolveEntities(); 20 | void testEncodingNames(); 21 | }; 22 | 23 | #endif /* KCHARSETSTEST_H */ 24 | -------------------------------------------------------------------------------- /autotests/kemailaddresstest.h: -------------------------------------------------------------------------------- 1 | /* 2 | This file is part of the KDE project 3 | 4 | SPDX-FileCopyrightText: 2004 David Faure 5 | SPDX-FileCopyrightText: 2009 Thomas McGuire 6 | 7 | SPDX-License-Identifier: LGPL-2.0-only 8 | */ 9 | #ifndef TESTEMAIL_H 10 | #define TESTEMAIL_H 11 | 12 | #include 13 | 14 | /* Test object declaring ten test slots, each paired with a matching _data() slot. */ class KEmailAddressTest : public QObject 15 | { 16 | Q_OBJECT 17 | private Q_SLOTS: 18 | void testGetNameAndEmail(); 19 | void testGetNameAndEmail_data(); 20 | void testIsValidEmailAddress(); 21 | void testIsValidEmailAddress_data(); 22 | void testIsValidAddressList(); 23 | void testIsValidAddressList_data(); 24 | void testIsValidSimpleEmailAddress(); 25 | void testIsValidSimpleEmailAddress_data(); 26 | void testGetEmailAddress(); 27 | void testGetEmailAddress_data(); 28 | void testCheckSplitEmailAddrList(); 29 | void testCheckSplitEmailAddrList_data(); 30 | void testNormalizeAddressesAndEncodeIDNs(); 31 | void testNormalizeAddressesAndEncodeIDNs_data(); 32 | void testNormalizeAddressesAndDecodeIDNs(); 33 | void testNormalizeAddressesAndDecodeIDNs_data(); 34 | void testQuoteIfNecessary(); 35 | void testQuoteIfNecessary_data(); 36 | void 
testMailtoUrls(); 37 | void testMailtoUrls_data(); 38 | }; 39 | 40 | #endif 41 | -------------------------------------------------------------------------------- /autotests/kencodingprobertest.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | SPDX-FileCopyrightText: 2012 Ni Hui 3 | 4 | SPDX-License-Identifier: GPL-2.0-or-later 5 | */ 6 | 7 | #include "kencodingprobertest.h" 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | /* prober instance shared by all test functions: allocated in initTestCase(), deleted in cleanupTestCase() */ static KEncodingProber *ep = nullptr; 14 | 15 | void KEncodingProberTest::initTestCase() 16 | { 17 | ep = new KEncodingProber; 18 | } 19 | 20 | void KEncodingProberTest::cleanupTestCase() 21 | { 22 | delete ep; 23 | ep = nullptr; 24 | } 25 | 26 | /* Resets the shared prober; QtTest invokes cleanup() after each test function. */ void KEncodingProberTest::cleanup() 27 | { 28 | ep->reset(); 29 | } 30 | 31 | /* After feeding data and calling reset(), state() must be Probing and encoding() must read "utf-8" (compared lower-cased). */ void KEncodingProberTest::testReset() 32 | { 33 | ep->feed(QByteArray("some random data @*@#&jd")); 34 | ep->reset(); 35 | QCOMPARE(ep->state(), KEncodingProber::Probing); 36 | QCOMPARE(ep->encoding().toLower(), QByteArray("utf-8")); 37 | } 38 | 39 | /* For each prober type in turn, feeds a hex-encoded sample and checks the lower-cased encoding() name, resetting the prober between samples. */ void KEncodingProberTest::testProbe() 40 | { 41 | // utf-8 42 | ep->setProberType(KEncodingProber::Universal); 43 | ep->feed(QByteArray::fromHex("e998bfe5b094e58d91e696afe5b1b1e88489")); 44 | QCOMPARE(ep->encoding().toLower(), QByteArray("utf-8")); 45 | ep->reset(); 46 | 47 | // gb18030 48 | ep->setProberType(KEncodingProber::ChineseSimplified); 49 | ep->feed(QByteArray::fromHex("d7d4d3c9b5c4b0d9bfc6c8abcae9")); 50 | QCOMPARE(ep->encoding().toLower(), QByteArray("gb18030")); 51 | ep->reset(); 52 | 53 | // shift_jis 54 | ep->setProberType(KEncodingProber::Japanese); 55 | ep->feed(QByteArray::fromHex("8374838a815b955389c88e969354")); 56 | QCOMPARE(ep->encoding().toLower(), QByteArray("shift_jis")); 57 | ep->reset(); 58 | 59 | // big5 60 | ep->setProberType(KEncodingProber::ChineseTraditional); 61 | ep->feed(QByteArray::fromHex("aefcafc7a6caa474a141a6b3ae65a444a46a")); 62 | QCOMPARE(ep->encoding().toLower(), QByteArray("big5")); 63 | 
ep->reset(); 64 | 65 | // binary data, just make sure we do not crash (cf. crash in bug #357341) 66 | const QString binaryFile = QFINDTESTDATA("data/binary_data"); 67 | QVERIFY(!binaryFile.isEmpty()); 68 | QFile file(binaryFile); 69 | QVERIFY(file.open(QIODevice::ReadOnly)); 70 | QByteArray binaryData(file.readAll()); 71 | ep->setProberType(KEncodingProber::Universal); 72 | ep->feed(binaryData); 73 | QCOMPARE(ep->encoding().toLower(), QByteArray("utf-8")); 74 | ep->reset(); 75 | } 76 | 77 | QTEST_MAIN(KEncodingProberTest) 78 | 79 | #include "moc_kencodingprobertest.cpp" 80 | -------------------------------------------------------------------------------- /autotests/kencodingprobertest.h: -------------------------------------------------------------------------------- 1 | /* 2 | SPDX-FileCopyrightText: 2012 Ni Hui 3 | 4 | SPDX-License-Identifier: GPL-2.0-or-later 5 | */ 6 | 7 | #ifndef KENCODINGPROBERTEST_H 8 | #define KENCODINGPROBERTEST_H 9 | 10 | #include 11 | 12 | /* Test object for KEncodingProber; initTestCase(), cleanupTestCase() and cleanup() are the fixture slots, testReset() and testProbe() the tests. */ class KEncodingProberTest : public QObject 13 | { 14 | Q_OBJECT 15 | private Q_SLOTS: 16 | void initTestCase(); 17 | void cleanupTestCase(); 18 | void cleanup(); 19 | void testReset(); 20 | void testProbe(); 21 | }; 22 | 23 | #endif // KENCODINGPROBERTEST_H 24 | -------------------------------------------------------------------------------- /autotests/rfc2047test.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | SPDX-FileCopyrightText: 2006 Volker Krause 3 | 4 | SPDX-License-Identifier: LGPL-2.0-only 5 | */ 6 | 7 | #include 8 | 9 | #include "rfc2047test.h" 10 | 11 | #include "../src/kcodecs.h" 12 | 13 | using namespace KCodecs; 14 | 15 | QTEST_MAIN(RFC2047Test) 16 | 17 | void RFC2047Test::testRFC2047decode_data() 18 | { 19 | QTest::addColumn("input"); 20 | QTest::addColumn("expectedCharset"); 21 | QTest::addColumn("defaultCharset"); 22 | QTest::addColumn("forceCharset"); 23 | QTest::addColumn("expectedResult"); 24 | 25 | /* clang-format off */ 26 | 
QTest::newRow("empty") << QByteArray() 27 | << QByteArray() << QByteArray("utf-8") << false 28 | << QString(); 29 | QTest::newRow("identity") << QByteArray("bla") 30 | << QByteArray() << QByteArray("utf-8") << false 31 | << QString::fromLatin1("bla"); 32 | // Every row carries a unique tag so that a failing row can be identified by its tag in the test output. 33 | QTest::newRow("utf-8") << QByteArray("=?utf-8?q?Ingo=20Kl=C3=B6cker?= ") 34 | << QByteArray("UTF-8") << QByteArray("utf-8") << false 35 | << QString::fromUtf8("Ingo Klöcker "); 36 | QTest::newRow("utf-8 with iso8859-1 default") << QByteArray("=?utf-8?q?Ingo=20Kl=C3=B6cker?= ") 37 | << QByteArray("UTF-8") << QByteArray("iso8859-1") << false 38 | << QString::fromUtf8("Ingo Klöcker "); 39 | QTest::newRow("utf-8 bare encoded word") << QByteArray("=?utf-8?q?Ingo=20Kl=C3=B6cker?=") 40 | << QByteArray("UTF-8") << QByteArray("utf-8") << false 41 | << QString::fromUtf8("Ingo Klöcker"); 42 | 43 | 44 | QTest::newRow("whitespaces between encoded words") << QByteArray("=?utf-8?q?Ingo=20Kl=C3=B6cker?= =?utf-8?q?Ingo=20Kl=C3=B6cker?=") 45 | << QByteArray("UTF-8") << QByteArray("utf-8") << false 46 | << QString::fromUtf8("Ingo KlöckerIngo Klöcker"); 47 | QTest::newRow("whitespaces around unencoded word") << QByteArray("=?utf-8?q?Ingo=20Kl=C3=B6cker?= foo =?utf-8?q?Ingo=20Kl=C3=B6cker?=") 48 | << QByteArray("UTF-8") << QByteArray("utf-8") << false 49 | << QString::fromUtf8("Ingo Klöcker foo Ingo Klöcker"); 50 | 51 | QTest::newRow("iso-8859-1") << QByteArray("=?ISO-8859-1?Q?Andr=E9s_Ot=F3n?=") 52 | << QByteArray("ISO-8859-1") << QByteArray("utf-8") << false 53 | << QString::fromUtf8("Andrés Otón"); 54 | QTest::newRow("iso-8859-2") << QByteArray("=?iso-8859-2?q?Rafa=B3_Rzepecki?=") 55 | << QByteArray("ISO-8859-2") << QByteArray("utf-8") << false 56 | << QString::fromUtf8("Rafał Rzepecki"); 57 | QTest::newRow("iso-8859-9") << QByteArray("=?iso-8859-9?Q?S=2E=C7a=F0lar?= Onur") 58 | << QByteArray("ISO-8859-9") << QByteArray("utf-8") << false 59 | << QString::fromUtf8("S.Çağlar Onur"); 60 | QTest::newRow("iso-8859-15") << QByteArray("Rafael =?iso-8859-15?q?Rodr=EDguez?=") 61 | << QByteArray("ISO-8859-15") << 
QByteArray("utf-8") << false 62 | << QString::fromUtf8("Rafael Rodríguez"); 63 | 64 | QTest::newRow("wrong charset") << QByteArray("=?iso-8859-1?q?Ingo=20Kl=C3=B6cker?=") 65 | << QByteArray("UTF-8") << QByteArray("utf-8") << true 66 | << QString::fromUtf8("Ingo Klöcker"); 67 | 68 | // language parameter according to RFC 2231, section 5 69 | QTest::newRow("RFC-2231") << QByteArray("From: =?US-ASCII*EN?Q?Keith_Moore?= ") 70 | << QByteArray("US-ASCII") << QByteArray("utf-8") << false 71 | << QString::fromUtf8("From: Keith Moore "); 72 | 73 | QTest::newRow("broken QP") << QByteArray("Subject: =?iso-8859-1?Q?Belangrijk=3a=20Verhuizing=20FTP=20server?=") 74 | << QByteArray("ISO-8859-1") << QByteArray("utf-8") << false 75 | << QString::fromUtf8("Subject: Belangrijk: Verhuizing FTP server"); 76 | 77 | // mixed charsets, based on bug 125542 78 | QTest::newRow("mixed charsets") << QByteArray("Subject: =?utf-8?q?Ingo=20Kl=C3=B6cker?= unencoded words =?iso-8859-9?Q?S=2E=C7a=F0lar?=") 79 | << QByteArray("UTF-8") << QByteArray("utf-8") << false 80 | << QString::fromUtf8("Subject: Ingo Klöcker unencoded words S.Çağlar"); 81 | QTest::newRow("mixed charsets-125542") << QByteArray("Subject: =?koi8-r?b?5MXMz9fJINrB?= HP Pavillion =?iso-8859-5?b?KNzV3N7g2PjQIN/e4dXR3d4p?=") 82 | << QByteArray("UTF-8") << QByteArray("us-ascii") << false 83 | << QString::fromUtf8("Subject: Делови за HP Pavillion (меморија посебно)"); 84 | 85 | // illegal characters which are already encoded in the given encoding but are not ASCII (bug 206417) 86 | QTest::newRow("illegal characters") << QByteArray("Subject: =?utf-8?Q?пиѿилл,=20=D0=B4=D0=BE=D0=B1=D1=80=D1=8B=D0=B9=20=D0=B4=D0=B5=D0=BD=D1=8C?=") 87 | << QByteArray("UTF-8") << QByteArray("utf-8") << false 88 | << QString::fromUtf8("Subject: пиѿилл, добрый день"); 89 | const auto iso88591Encoded = QByteArray::fromHex("D6C4DCF6E4FC"); // "ÖÄÜöäü" in ISO-8859-1 encoding - this is not valid UTF-8 though and thus rejected by MSVC in string literals 90 | 
QTest::newRow("illegal characters iso-8859-1") << QByteArray("Subject: =?iso-8859-1?Q?") + iso88591Encoded + "?=" 91 | << QByteArray("ISO-8859-1") << QByteArray("utf-8") << false 92 | << QString::fromLatin1("Subject: " + iso88591Encoded); 93 | 94 | 95 | QTest::newRow("small data") << QByteArray("=?iso-8859-1?Q?c?=") 96 | << QByteArray("ISO-8859-1") << QByteArray("utf-8") << false 97 | << QString::fromUtf8("c"); 98 | /* clang-format on */ 99 | } 100 | 101 | void RFC2047Test::testRFC2047decode() 102 | { 103 | QFETCH(QByteArray, input); 104 | QFETCH(QByteArray, expectedCharset); 105 | QFETCH(QByteArray, defaultCharset); 106 | QFETCH(bool, forceCharset); 107 | QFETCH(QString, expectedResult); 108 | 109 | QByteArray detectedCharset; 110 | 111 | const KCodecs::CharsetOption options = forceCharset ? KCodecs::ForceDefaultCharset : KCodecs::NoOption; 112 | const QString result = KCodecs::decodeRFC2047String(input, &detectedCharset, defaultCharset, options); 113 | 114 | QCOMPARE(result, expectedResult); 115 | QCOMPARE(detectedCharset, expectedCharset); 116 | } 117 | 118 | void RFC2047Test::testInvalidDecode_data() 119 | { 120 | QTest::addColumn("input"); 121 | QTest::addColumn("expectedResult"); 122 | // Malformed encoded-words: each row expects the input back unchanged; the tag names the malformation. 123 | QTest::newRow("lone equals sign") << QByteArray("=") << QString::fromUtf8("="); 124 | QTest::newRow("start marker only") << QByteArray("=?") << QString::fromUtf8("=?"); 125 | QTest::newRow("truncated after encoding") << QByteArray("=?a?b?=") << QString::fromUtf8("=?a?b?="); 126 | QTest::newRow("missing closing equals sign") << QByteArray("=?a?b?c?") << QString::fromUtf8("=?a?b?c?"); 127 | QTest::newRow("empty encoding") << QByteArray("=?a??c?=") << QString::fromUtf8("=?a??c?="); 128 | } 129 | 130 | void RFC2047Test::testInvalidDecode() 131 | { 132 | QFETCH(QByteArray, input); 133 | QFETCH(QString, expectedResult); 134 | 135 | QByteArray encCharset; 136 | 137 | const QString result = KCodecs::decodeRFC2047String(input, &encCharset); 138 | QCOMPARE(result, expectedResult); 139 | } 140 | 141 | void RFC2047Test::testRFC2047encode_data() 142 | { 143 | QTest::addColumn("input"); 144 | 
QTest::addColumn("encoding"); 145 | QTest::addColumn("expectedResult"); 146 | 147 | /* clang-format off */ 148 | QTest::newRow("empty") << QString() 149 | << QByteArray("utf-8") 150 | << QByteArray(); 151 | QTest::newRow("identity") << QString::fromUtf8("bla") 152 | << QByteArray("utf-8") 153 | << QByteArray("bla"); 154 | QTest::newRow("QP") << QString::fromUtf8("Ingo Klöcker ") 155 | << QByteArray("utf-8") 156 | << QByteArray("=?UTF-8?q?Ingo=20Kl=C3=B6cker?= "); 157 | 158 | QTest::newRow("utf-8 fallback") << QString::fromUtf8("æſðđŋħł") 159 | << QByteArray("latin1") 160 | << QByteArray("=?UTF-8?B?w6bFv8OwxJHFi8SnxYI=?="); 161 | /* clang-format on */ 162 | } 163 | 164 | void RFC2047Test::testRFC2047encode() 165 | { 166 | QFETCH(QString, input); 167 | QFETCH(QByteArray, encoding); 168 | QFETCH(QByteArray, expectedResult); 169 | 170 | const QByteArray result = KCodecs::encodeRFC2047String(input, encoding); 171 | 172 | // expected value is probably wrong, libkmime will choose 'B' instead of 'Q' encoding 173 | QEXPECT_FAIL("QP", "KCodecs will choose 'B' instead of 'Q' encoding", Continue); 174 | QCOMPARE(result, expectedResult); 175 | } 176 | 177 | #include "moc_rfc2047test.cpp" 178 | -------------------------------------------------------------------------------- /autotests/rfc2047test.h: -------------------------------------------------------------------------------- 1 | /* 2 | SPDX-FileCopyrightText: 2006 Volker Krause 3 | 4 | SPDX-License-Identifier: LGPL-2.0-only 5 | */ 6 | 7 | #ifndef RFC2047TEST_H 8 | #define RFC2047TEST_H 9 | 10 | #include 11 | 12 | class RFC2047Test : public QObject 13 | { 14 | Q_OBJECT 15 | private Q_SLOTS: 16 | void testRFC2047decode_data(); 17 | void testRFC2047decode(); 18 | 19 | void testInvalidDecode_data(); 20 | void testInvalidDecode(); 21 | 22 | void testRFC2047encode_data(); 23 | void testRFC2047encode(); 24 | }; 25 | 26 | #endif 27 | -------------------------------------------------------------------------------- 
/docs/Doxyfile.local: -------------------------------------------------------------------------------- 1 | ### KApiDox Project-specific Overrides File 2 | 3 | # define so that deprecated API is not skipped 4 | PREDEFINED += \ 5 | "KCODECS_ENABLE_DEPRECATED_SINCE(x, y)=1" \ 6 | "KCODECS_BUILD_DEPRECATED_SINCE(x, y)=1" \ 7 | "KCODECS_DEPRECATED_VERSION(x, y, t)=" 8 | -------------------------------------------------------------------------------- /metainfo.yaml: -------------------------------------------------------------------------------- 1 | maintainer: 2 | description: Text encoding 3 | tier: 1 4 | type: functional 5 | platforms: 6 | - name: Linux 7 | - name: FreeBSD 8 | - name: Windows 9 | - name: macOS 10 | - name: Android 11 | portingAid: false 12 | deprecated: false 13 | release: true 14 | libraries: 15 | - cmake: "KF6::Codecs" 16 | cmakename: KF6Codecs 17 | 18 | public_lib: true 19 | group: Frameworks 20 | subgroup: Tier 1 21 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(KF6Codecs) 2 | add_library(KF6::Codecs ALIAS KF6Codecs) 3 | 4 | set_target_properties(KF6Codecs PROPERTIES 5 | VERSION ${KCODECS_VERSION} 6 | SOVERSION ${KCODECS_SOVERSION} 7 | EXPORT_NAME Codecs 8 | ) 9 | 10 | ecm_create_qm_loader(KF6Codecs kcodecs6_qt) 11 | 12 | target_sources(KF6Codecs PRIVATE 13 | kcharsets.cpp 14 | kcharsets.h 15 | kcodecsbase45.cpp 16 | kcodecsbase64.cpp 17 | kcodecsbase64.h 18 | kcodecs.cpp 19 | kcodecs.h 20 | kcodecs_p.h 21 | kcodecsqp.cpp 22 | kcodecsqp.h 23 | kcodecsuuencode.cpp 24 | kcodecsuuencode.h 25 | kemailaddress.cpp 26 | kemailaddress.h 27 | kencodingprober.cpp 28 | kencodingprober.h 29 | probers/CharDistribution.cpp 30 | probers/CharDistribution.h 31 | probers/ChineseGroupProber.cpp 32 | probers/ChineseGroupProber.h 33 | probers/JapaneseGroupProber.cpp 34 | probers/JapaneseGroupProber.h 35 | 
probers/JpCntx.cpp 36 | probers/JpCntx.h 37 | probers/LangBulgarianModel.cpp 38 | probers/LangCyrillicModel.cpp 39 | probers/LangGreekModel.cpp 40 | probers/LangHebrewModel.cpp 41 | probers/LangHungarianModel.cpp 42 | probers/LangThaiModel.cpp 43 | probers/nsBig5Prober.cpp 44 | probers/nsBig5Prober.h 45 | probers/nsCharSetProber.cpp 46 | probers/nsCharSetProber.h 47 | probers/nsCodingStateMachine.h 48 | probers/nsEscCharsetProber.cpp 49 | probers/nsEscCharsetProber.h 50 | probers/nsEscSM.cpp 51 | probers/nsEUCJPProber.cpp 52 | probers/nsEUCJPProber.h 53 | probers/nsEUCKRProber.cpp 54 | probers/nsEUCKRProber.h 55 | probers/nsGB2312Prober.cpp 56 | probers/nsGB2312Prober.h 57 | probers/nsHebrewProber.cpp 58 | probers/nsHebrewProber.h 59 | probers/nsLatin1Prober.cpp 60 | probers/nsLatin1Prober.h 61 | probers/nsMBCSGroupProber.cpp 62 | probers/nsMBCSGroupProber.h 63 | probers/nsMBCSSM.cpp 64 | probers/nsPkgInt.h 65 | probers/nsSBCharSetProber.cpp 66 | probers/nsSBCharSetProber.h 67 | probers/nsSBCSGroupProber.cpp 68 | probers/nsSBCSGroupProber.h 69 | probers/nsSJISProber.cpp 70 | probers/nsSJISProber.h 71 | probers/nsUniversalDetector.cpp 72 | probers/nsUniversalDetector.h 73 | probers/UnicodeGroupProber.cpp 74 | probers/UnicodeGroupProber.h 75 | ) 76 | 77 | ecm_qt_declare_logging_category(KF6Codecs 78 | HEADER kcodecs_debug.h 79 | IDENTIFIER KCODECS_LOG 80 | CATEGORY_NAME kf.codecs 81 | OLD_CATEGORY_NAMES kf5.kcodecs 82 | DESCRIPTION "KCodecs" 83 | EXPORT KCODECS 84 | ) 85 | 86 | ecm_generate_export_header(KF6Codecs 87 | BASE_NAME KCodecs 88 | GROUP_BASE_NAME KF 89 | VERSION ${KF_VERSION} 90 | USE_VERSION_HEADER 91 | DEPRECATED_BASE_VERSION 0 92 | DEPRECATION_VERSIONS 93 | EXCLUDE_DEPRECATED_BEFORE_AND_AT ${EXCLUDE_DEPRECATED_BEFORE_AND_AT} 94 | ) 95 | 96 | target_include_directories(KF6Codecs INTERFACE "$") 97 | 98 | target_link_libraries(KF6Codecs PUBLIC Qt6::Core) 99 | 100 | ecm_generate_headers(KCodecs_HEADERS 101 | HEADER_NAMES 102 | KCharsets 103 | KCodecs 104 | 
KEncodingProber 105 | KEmailAddress 106 | REQUIRED_HEADERS KCodecs_HEADERS 107 | ) 108 | 109 | install(TARGETS KF6Codecs EXPORT KF6CodecsTargets ${KF_INSTALL_TARGETS_DEFAULT_ARGS}) 110 | 111 | install(FILES 112 | ${CMAKE_CURRENT_BINARY_DIR}/kcodecs_export.h 113 | ${KCodecs_HEADERS} 114 | DESTINATION ${KDE_INSTALL_INCLUDEDIR_KF}/KCodecs COMPONENT Devel 115 | ) 116 | 117 | ecm_qt_install_logging_categories( 118 | EXPORT KCODECS 119 | FILE kcodecs.categories 120 | DESTINATION ${KDE_INSTALL_LOGGINGCATEGORIESDIR} 121 | ) 122 | 123 | ecm_generate_qdoc(KF6Codecs kcodecs.qdocconf) 124 | -------------------------------------------------------------------------------- /src/Messages.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Extract strings from all source files. 4 | # EXTRACT_TR_STRINGS extracts strings with lupdate and convert them to .pot with 5 | # lconvert. 6 | $EXTRACT_TR_STRINGS `find . -name \*.cpp -o -name \*.h -o -name \*.ui -o -name \*.qml` -o $podir/kcodecs6_qt.pot 7 | -------------------------------------------------------------------------------- /src/kcharsets.h: -------------------------------------------------------------------------------- 1 | /* 2 | This file is part of the KDE libraries 3 | SPDX-FileCopyrightText: 1999 Lars Knoll 4 | 5 | SPDX-License-Identifier: LGPL-2.0-or-later 6 | */ 7 | #ifndef KCHARSETS_H 8 | #define KCHARSETS_H 9 | 10 | #include 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #include "kcodecs.h" 18 | 19 | class KCharsetsPrivate; 20 | 21 | class QChar; 22 | class QString; 23 | 24 | /*! 25 | * \class KCharsets 26 | * \inmodule KCodecs 27 | * 28 | * \brief Charset font and encoder/decoder handling. 29 | * 30 | * This is needed, because Qt's encoding name matching in 31 | * QTextCodec::codecForName() matches only closely-related encoded names 32 | * but not alternate names, e.g. found in the reality of the Internet. 
33 | */ 34 | class KCODECS_EXPORT KCharsets final 35 | { 36 | Q_DECLARE_TR_FUNCTIONS(KCharsets) 37 | 38 | protected: 39 | /* 40 | * Protected constructor. If you need the kcharsets object, use 41 | * KCharsets::charsets() instead. 42 | */ 43 | KCharsets(); 44 | 45 | public: 46 | ~KCharsets(); 47 | 48 | /*! 49 | * The global charset manager. 50 | */ 51 | static KCharsets *charsets(); 52 | 53 | /*! 54 | * Converts an entity to a character. 55 | * 56 | * The string must contain only the 57 | * entity without the trailing ';'. 58 | * 59 | * \a str the entity 60 | * 61 | * Returns QChar::Null if the entity could not be decoded. 62 | */ 63 | static QChar fromEntity(QStringView str); 64 | 65 | /*! 66 | * Tries to find an entity in the 67 | * string \a str. 68 | * 69 | * \a str the string containing the entity 70 | * 71 | * \a len is an output parameter that receives the length of the decoded 72 | * entity. 73 | * 74 | * Returns a decoded entity if one could be found, QChar::Null 75 | * otherwise 76 | * 77 | * \overload fromEntity(QStringView) 78 | */ 79 | static QChar fromEntity(QStringView str, int &len); 80 | 81 | /*! 82 | * Converts a QChar to an entity. The returned string already 83 | * contains the leading '&' and the trailing ';'. 84 | * 85 | * \a ch the char to convert 86 | * 87 | * Returns the entity 88 | */ 89 | static QString toEntity(const QChar &ch); 90 | 91 | /*! 92 | * Scans the given string for entities (like &amp;) and resolves them 93 | * using fromEntity. 94 | * 95 | * \a text the string containing the entities 96 | * 97 | * Returns the clean string 98 | */ 99 | static QString resolveEntities(const QString &text); 100 | 101 | /*! 102 | * Lists all available encodings as names 103 | */ 104 | QStringList availableEncodingNames() const; 105 | 106 | /*! 107 | * Lists the available encoding names together with a more descriptive language 108 | */ 109 | QStringList descriptiveEncodingNames() const; 110 | 111 | /*! 
112 | * Lists the available encoding names grouped by script (or language that uses them). 113 | * 114 | * Returns the list of lists consisting of description followed by encoding names (i.e. encodingsByScript().at(i).at(0) is a description for 115 | * encodingsByScript().at(i).at(k), k>0) 116 | */ 117 | QList encodingsByScript() const; 118 | 119 | /*! 120 | * Returns a long description for an encoding name. 121 | * 122 | * \a encoding the encoding for the language 123 | * 124 | */ 125 | QString descriptionForEncoding(QStringView encoding) const; 126 | 127 | /*! 128 | * Returns the encoding for a string obtained with descriptiveEncodingNames(). 129 | * 130 | * \a descriptiveName the descriptive name for the encoding 131 | */ 132 | QString encodingForName(const QString &descriptiveName) const; 133 | 134 | private: 135 | std::unique_ptr const d; 136 | friend struct KCharsetsSingletonPrivate; 137 | }; 138 | 139 | #endif 140 | -------------------------------------------------------------------------------- /src/kcharsets_p.h: -------------------------------------------------------------------------------- 1 | /* 2 | This file is part of the KDE libraries 3 | 4 | SPDX-FileCopyrightText: 1999 Lars Knoll 5 | SPDX-FileCopyrightText: 2001, 2003, 2004, 2005, 2006 Nicolas GOUTTE 6 | SPDX-FileCopyrightText: 2007 Nick Shaforostoff 7 | 8 | SPDX-License-Identifier: LGPL-2.0-or-later 9 | */ 10 | 11 | #ifndef KCHARSETS_P_H 12 | #define KCHARSETS_P_H 13 | 14 | #include 15 | 16 | class KCharsetsPrivate 17 | { 18 | public: 19 | // Cache list so QStrings can be implicitly shared 20 | QList encodingsByScript; 21 | }; 22 | 23 | #endif 24 | -------------------------------------------------------------------------------- /src/kcodecs-index.qdoc: -------------------------------------------------------------------------------- 1 | /*! 2 | \page kcodecs-index.html 3 | \title KCodecs 4 | 5 | KCodecs provide a collection of methods to manipulate strings using various 6 | encodings. 
7 | 8 | It can automatically determine the charset of a string, translate XML entities, 9 | validate email addresses, and find encodings by name in a more tolerant way than QTextCodec 10 | (useful e.g. for data coming from the Internet). 11 | 12 | \section1 Using the Module 13 | 14 | \include {module-use.qdocinc} {using the c++ api} 15 | 16 | \section2 Building with CMake 17 | 18 | \include {module-use.qdocinc} {building with cmake} {KF6} {Codecs} {KF6::Codecs} 19 | 20 | \section1 API Reference 21 | 22 | \list 23 | \li \l{KCodecs C++ Classes} 24 | \endlist 25 | */ 26 | -------------------------------------------------------------------------------- /src/kcodecs.qdoc: -------------------------------------------------------------------------------- 1 | /*! 2 | \module KCodecs 3 | \title KCodecs C++ Classes 4 | \ingroup modules 5 | \cmakepackage KF6 6 | \cmakecomponent Codecs 7 | 8 | \brief Text encoding. 9 | */ 10 | -------------------------------------------------------------------------------- /src/kcodecs.qdocconf: -------------------------------------------------------------------------------- 1 | include($KDE_DOCS/global/qt-module-defaults.qdocconf) 2 | 3 | project = KCodecs 4 | description = Text encoding 5 | 6 | documentationinheaders = true 7 | 8 | headerdirs += . 9 | sourcedirs += . 
10 | 11 | outputformats = HTML 12 | 13 | depends += \ 14 | qtcore \ 15 | qtcore5compat 16 | 17 | navigation.landingpage = "KCodecs" 18 | 19 | qhp.projects = KCodecs 20 | 21 | qhp.KCodecs.file = kcodecs.qhp 22 | qhp.KCodecs.namespace = org.kde.kcodecs.$QT_VERSION_TAG 23 | qhp.KCodecs.virtualFolder = kcodecs 24 | qhp.KCodecs.indexTitle = KCodecs 25 | qhp.KCodecs.indexRoot = 26 | 27 | qhp.KCodecs.subprojects = classes 28 | qhp.KCodecs.subprojects.classes.title = C++ Classes 29 | qhp.KCodecs.subprojects.classes.indexTitle = KCodecs C++ Classes 30 | qhp.KCodecs.subprojects.classes.selectors = class fake:headerfile 31 | qhp.KCodecs.subprojects.classes.sortPages = true 32 | 33 | tagfile = kcodecs.tags 34 | -------------------------------------------------------------------------------- /src/kcodecs_p.h: -------------------------------------------------------------------------------- 1 | /* 2 | SPDX-FileCopyrightText: 2014 Daniel Vrátil 3 | 4 | SPDX-License-Identifier: LGPL-2.0-only 5 | */ 6 | 7 | #ifndef KCODECS_P_H 8 | #define KCODECS_P_H 9 | 10 | #include "kcodecs.h" 11 | 12 | namespace KCodecs 13 | { 14 | class EncoderPrivate 15 | { 16 | public: 17 | explicit EncoderPrivate(Codec::NewlineType newline); 18 | 19 | /** 20 | An output buffer to simplify some codecs. 21 | Used with write() and flushOutputBuffer(). 
22 | */ 23 | char outputBuffer[Encoder::maxBufferedChars]; 24 | 25 | uchar outputBufferCursor; 26 | const Codec::NewlineType newline; 27 | }; 28 | 29 | class DecoderPrivate 30 | { 31 | public: 32 | explicit DecoderPrivate(Codec::NewlineType newline); 33 | 34 | const Codec::NewlineType newline; 35 | }; 36 | 37 | } 38 | 39 | #endif // KCODECS_P_H 40 | -------------------------------------------------------------------------------- /src/kcodecsbase45.cpp: -------------------------------------------------------------------------------- 1 | 2 | /* 3 | SPDX-FileCopyrightText: 2021 Volker Krause 4 | 5 | SPDX-License-Identifier: LGPL-2.0-or-later 6 | */ 7 | 8 | #include "kcodecs.h" 9 | #include "kcodecs_debug.h" 10 | 11 | #include 12 | 13 | static constexpr const char base45Table[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ $%*+-./:"; 14 | // Maps a Base45 alphabet character to its index in base45Table (0..44); any other character is logged and mapped to 0. 15 | static uint8_t base45MapFromChar(char c) 16 | { 17 | const auto it = std::find(std::begin(base45Table), std::end(base45Table) - 1, c); // "- 1": the array also holds the literal's terminating NUL, which is not part of the alphabet 18 | if (it == std::end(base45Table) - 1) { 19 | qCWarning(KCODECS_LOG) << "invalid base45 character:" << c; 20 | return 0; 21 | } 22 | return std::distance(std::begin(base45Table), it); 23 | } 24 | // Decodes the input in groups of three characters (two for a short final group); a single trailing character is ignored. 25 | QByteArray KCodecs::base45Decode(QByteArrayView in) 26 | { 27 | QByteArray out; 28 | out.reserve(((in.size() / 3) + 1) * 2); 29 | 30 | for (qsizetype i = 0; i + 1 < in.size(); i += 3) { 31 | uint32_t n = base45MapFromChar(in[i]) + base45MapFromChar(in[i + 1]) * 45; 32 | if (i + 2 < in.size()) { 33 | n += 45 * 45 * base45MapFromChar(in[i + 2]); 34 | out.push_back(n >> 8); 35 | } else { 36 | if (n >> 8) { 37 | out.push_back(n >> 8); 38 | } 39 | } 40 | out.push_back(n % 256); 41 | } 42 | 43 | return out; 44 | } 45 | -------------------------------------------------------------------------------- /src/kcodecsbase64.h: -------------------------------------------------------------------------------- 1 | /* -*- c++ -*- 2 | SPDX-FileCopyrightText: 2001-2002 Marc Mutz 3 | 4 | SPDX-License-Identifier: LGPL-2.0-or-later 5 
| */ 6 | /* 7 | @glossary @anchor Base64 @anchor base64 @b base64: 8 | a binary to text encoding scheme based on @ref RFC1421. 9 | 10 | @glossary @anchor RFC1421 @anchor rfc1421 @b RFC @b 1421: 11 | RFC that defines the 12 | Privacy Enhancement for Internet Electronic Mail: Part I: 13 | Message Encryption and Authentication Procedures. 14 | 15 | @glossary @anchor RFC2045 @anchor rfc2045 @b RFC @b 2045: 16 | RFC that defines the 17 | MIME Part One: Format of Internet Message Bodies. 18 | 19 | @glossary @anchor RFC2047 @anchor rfc2047 @b RFC @b 2047: 20 | RFC that defines the 21 | MIME Part Three: Message Header Extensions for Non-ASCII Text. 22 | 23 | @glossary @anchor RFC2047B @anchor rfc2047b @b RFC @b 2047B: 24 | Section 4.1 of @ref RFC2047. 25 | */ 26 | 27 | #ifndef KCODECS_BASE64_H 28 | #define KCODECS_BASE64_H 29 | 30 | #include "kcodecs.h" 31 | 32 | namespace KCodecs 33 | { 34 | class Base64Codec : public Codec 35 | { 36 | public: 37 | Base64Codec() 38 | : Codec() 39 | { 40 | } 41 | 42 | ~Base64Codec() override 43 | { 44 | } 45 | 46 | const char *name() const override 47 | { 48 | return "base64"; 49 | } 50 | 51 | qsizetype maxEncodedSizeFor(qsizetype insize, NewlineType newline) const override 52 | { 53 | // first, the total number of 4-char packets will be: 54 | qsizetype totalNumPackets = (insize + 2) / 3; 55 | // now, after every 76/4'th packet there needs to be a linebreak: 56 | qsizetype numLineBreaks = totalNumPackets / (76 / 4); 57 | // and at the very end, too: 58 | ++numLineBreaks; 59 | // putting it all together, we have: 60 | return 4 * totalNumPackets + (newline == Codec::NewlineCRLF ? 
2 : 1) * numLineBreaks; 61 | } 62 | 63 | qsizetype maxDecodedSizeFor(qsizetype insize, NewlineType newline = Codec::NewlineLF) const override 64 | { 65 | // assuming all characters are part of the base64 stream (which 66 | // does almost never hold due to required linebreaking; but 67 | // additional non-base64 chars don't affect the output size), each 68 | // 4-tupel of them becomes a 3-tupel in the decoded octet 69 | // stream. So: 70 | qsizetype result = ((insize + 3) / 4) * 3; 71 | // but all of them may be \n, so 72 | if (newline == Codec::NewlineCRLF) { 73 | result *= 2; // :-o 74 | } 75 | 76 | return result; 77 | } 78 | 79 | Encoder *makeEncoder(NewlineType newline = Codec::NewlineLF) const override; 80 | 81 | Decoder *makeDecoder(NewlineType newline = Codec::NewlineLF) const override; 82 | }; 83 | 84 | class Rfc2047BEncodingCodec : public Base64Codec 85 | { 86 | public: 87 | Rfc2047BEncodingCodec() 88 | : Base64Codec() 89 | { 90 | } 91 | 92 | ~Rfc2047BEncodingCodec() override 93 | { 94 | } 95 | 96 | const char *name() const override 97 | { 98 | return "b"; 99 | } 100 | 101 | qsizetype maxEncodedSizeFor(qsizetype insize, NewlineType newline = Codec::NewlineLF) const override 102 | { 103 | Q_UNUSED(newline); 104 | // Each (begun) 3-octet triple becomes a 4 char quartet, so: 105 | return ((insize + 2) / 3) * 4; 106 | } 107 | 108 | qsizetype maxDecodedSizeFor(qsizetype insize, NewlineType newline = Codec::NewlineLF) const override 109 | { 110 | Q_UNUSED(newline); 111 | // Each 4-char quartet becomes a 3-octet triple, the last one 112 | // possibly even less. 
So: 113 | return ((insize + 3) / 4) * 3; 114 | } 115 | 116 | Encoder *makeEncoder(NewlineType newline = Codec::NewlineLF) const override; 117 | }; 118 | 119 | } // namespace KCodecs 120 | 121 | #endif // KCODECS_BASE64_H 122 | -------------------------------------------------------------------------------- /src/kcodecsqp.h: -------------------------------------------------------------------------------- 1 | /* -*- c++ -*- 2 | SPDX-FileCopyrightText: 2001-2002 Marc Mutz 3 | 4 | SPDX-License-Identifier: LGPL-2.0-or-later 5 | */ 6 | 7 | #ifndef KCODECS_QP_H 8 | #define KCODECS_QP_H 9 | 10 | #include "kcodecs.h" 11 | 12 | namespace KCodecs 13 | { 14 | /* 15 | A class representing the codec for QuotedPrintable as specified in 16 | RFC2045 (section 6.7). 17 | */ 18 | class QuotedPrintableCodec : public Codec 19 | { 20 | public: 21 | QuotedPrintableCodec() 22 | : Codec() 23 | { 24 | } 25 | 26 | ~QuotedPrintableCodec() override 27 | { 28 | } 29 | 30 | const char *name() const override 31 | { 32 | return "quoted-printable"; 33 | } 34 | 35 | qsizetype maxEncodedSizeFor(qsizetype insize, NewlineType newline = Codec::NewlineLF) const override 36 | { 37 | // all chars encoded: 38 | qsizetype result = 3 * insize; 39 | // then after 25 hexchars comes a soft linebreak: =(\r)\n 40 | result += (newline == Codec::NewlineCRLF ? 3 : 2) * (insize / 25); 41 | 42 | return result; 43 | } 44 | 45 | qsizetype maxDecodedSizeFor(qsizetype insize, NewlineType newline = Codec::NewlineLF) const override; 46 | 47 | Encoder *makeEncoder(NewlineType newline = Codec::NewlineLF) const override; 48 | 49 | Decoder *makeDecoder(NewlineType newline = Codec::NewlineLF) const override; 50 | }; 51 | 52 | /* 53 | A class representing the codec for the Q encoding as specified 54 | in RFC2047Q. 
55 | */ 56 | class Rfc2047QEncodingCodec : public Codec 57 | { 58 | public: 59 | Rfc2047QEncodingCodec() 60 | : Codec() 61 | { 62 | } 63 | 64 | ~Rfc2047QEncodingCodec() override 65 | { 66 | } 67 | 68 | const char *name() const override 69 | { 70 | return "q"; 71 | } 72 | 73 | qsizetype maxEncodedSizeFor(qsizetype insize, Codec::NewlineType newline = Codec::NewlineLF) const override 74 | { 75 | Q_UNUSED(newline); 76 | // this one is simple: We don't do linebreaking, so all that can 77 | // happen is that every char needs encoding, so: 78 | return 3 * insize; 79 | } 80 | 81 | qsizetype maxDecodedSizeFor(qsizetype insize, Codec::NewlineType newline = Codec::NewlineLF) const override; 82 | 83 | Encoder *makeEncoder(Codec::NewlineType newline = Codec::NewlineLF) const override; 84 | 85 | Decoder *makeDecoder(Codec::NewlineType newline = Codec::NewlineLF) const override; 86 | }; 87 | 88 | /* 89 | A class representing the codec for RFC2231. 90 | */ 91 | class Rfc2231EncodingCodec : public Codec 92 | { 93 | public: 94 | Rfc2231EncodingCodec() 95 | : Codec() 96 | { 97 | } 98 | 99 | ~Rfc2231EncodingCodec() override 100 | { 101 | } 102 | 103 | const char *name() const override 104 | { 105 | return "x-kmime-rfc2231"; 106 | } 107 | 108 | qsizetype maxEncodedSizeFor(qsizetype insize, Codec::NewlineType newline = Codec::NewlineLF) const override 109 | { 110 | Q_UNUSED(newline); 111 | // same as for "q" encoding: 112 | return 3 * insize; 113 | } 114 | 115 | qsizetype maxDecodedSizeFor(qsizetype insize, Codec::NewlineType newline = Codec::NewlineLF) const override; 116 | 117 | Encoder *makeEncoder(Codec::NewlineType newline = Codec::NewlineLF) const override; 118 | 119 | Decoder *makeDecoder(Codec::NewlineType newline = Codec::NewlineLF) const override; 120 | }; 121 | 122 | } // namespace KCodecs 123 | 124 | #endif // KCODECS_QP_H 125 | -------------------------------------------------------------------------------- /src/kcodecsuuencode.cpp: 
-------------------------------------------------------------------------------- 1 | /* -*- c++ -*- 2 | SPDX-FileCopyrightText: 2002 Marc Mutz 3 | 4 | SPDX-License-Identifier: LGPL-2.0-or-later 5 | */ 6 | 7 | #include "kcodecsuuencode.h" 8 | 9 | #include 10 | 11 | #include 12 | 13 | using namespace KCodecs; 14 | 15 | namespace KCodecs 16 | { 17 | class UUDecoder : public Decoder 18 | { 19 | uint mStepNo; 20 | uchar mAnnouncedOctetCount; // (on current line) 21 | uchar mCurrentOctetCount; // (on current line) 22 | uchar mOutbits; 23 | bool mLastWasCRLF : 1; 24 | bool mSawBegin : 1; // whether we already saw ^begin... 25 | uint mIntoBeginLine : 3; // count #chars we compared against "begin" 0..5 26 | bool mSawEnd : 1; // whether we already saw ^end... 27 | uint mIntoEndLine : 2; // count #chars we compared against "end" 0..3 28 | 29 | void searchForBegin(const char *&scursor, const char *const send); 30 | 31 | protected: 32 | friend class UUCodec; 33 | UUDecoder(Codec::NewlineType newline = Codec::NewlineLF) 34 | : Decoder(newline) 35 | , mStepNo(0) 36 | , mAnnouncedOctetCount(0) 37 | , mCurrentOctetCount(0) 38 | , mOutbits(0) 39 | , mLastWasCRLF(true) 40 | , mSawBegin(false) 41 | , mIntoBeginLine(0) 42 | , mSawEnd(false) 43 | , mIntoEndLine(0) 44 | { 45 | } 46 | 47 | public: 48 | ~UUDecoder() override 49 | { 50 | } 51 | 52 | bool decode(const char *&scursor, const char *const send, char *&dcursor, const char *const dend) override; 53 | // ### really needs no finishing??? 
54 | bool finish(char *&dcursor, const char *const dend) override 55 | { 56 | Q_UNUSED(dcursor); 57 | Q_UNUSED(dend); 58 | return true; 59 | } 60 | }; 61 | 62 | Encoder *UUCodec::makeEncoder(NewlineType newline) const 63 | { 64 | Q_UNUSED(newline) 65 | return nullptr; // encoding not supported 66 | } 67 | 68 | Decoder *UUCodec::makeDecoder(NewlineType newline) const 69 | { 70 | return new UUDecoder(newline); 71 | } 72 | 73 | /********************************************************/ 74 | /********************************************************/ 75 | /********************************************************/ 76 | 77 | void UUDecoder::searchForBegin(const char *&scursor, const char *const send) 78 | { 79 | static const char begin[] = "begin\n"; 80 | static const uint beginLength = 5; // sic! 81 | 82 | assert(!mSawBegin || mIntoBeginLine > 0); 83 | 84 | while (scursor != send) { 85 | uchar ch = *scursor++; 86 | if (ch == begin[mIntoBeginLine]) { 87 | if (mIntoBeginLine < beginLength) { 88 | // found another char 89 | ++mIntoBeginLine; 90 | if (mIntoBeginLine == beginLength) { 91 | mSawBegin = true; // "begin" complete, now search the next \n... 92 | } 93 | } else { // mIntoBeginLine == beginLength 94 | // found '\n': begin line complete 95 | mLastWasCRLF = true; 96 | mIntoBeginLine = 0; 97 | return; 98 | } 99 | } else if (mSawBegin) { 100 | // OK, skip stuff until the next \n 101 | } else { 102 | // qWarning() << "UUDecoder: garbage before \"begin\", resetting parser"; 103 | mIntoBeginLine = 0; 104 | } 105 | } 106 | } 107 | 108 | // uuencoding just shifts all 6-bit octets by 32 (SP/' '), except NUL, 109 | // which gets mapped to 0x60 110 | static inline uchar uuDecode(uchar c) 111 | { 112 | return (c - ' ') // undo shift and 113 | & 0x3F; // map 0x40 (0x60-' ') to 0... 
114 | } 115 | 116 | bool UUDecoder::decode(const char *&scursor, const char *const send, char *&dcursor, const char *const dend) 117 | { 118 | // First, check whether we still need to find the "begin" line: 119 | if (!mSawBegin || mIntoBeginLine != 0) { 120 | searchForBegin(scursor, send); 121 | } else if (mSawEnd) { 122 | // or if we are past the end line: 123 | scursor = send; // do nothing anymore... 124 | return true; 125 | } 126 | 127 | while (dcursor != dend && scursor != send) { 128 | uchar ch = *scursor++; 129 | uchar value; 130 | 131 | // Check whether we need to look for the "end" line: 132 | if (mIntoEndLine > 0) { 133 | static const char end[] = "end"; 134 | static const uint endLength = 3; 135 | 136 | if (ch == end[mIntoEndLine]) { 137 | ++mIntoEndLine; 138 | if (mIntoEndLine == endLength) { 139 | mSawEnd = true; 140 | scursor = send; // shortcut to the end 141 | return true; 142 | } 143 | continue; 144 | } else { 145 | // qWarning() << "UUDecoder: invalid line octet count looks like \"end\" (mIntoEndLine =" 146 | // << mIntoEndLine << ")!"; 147 | mIntoEndLine = 0; 148 | // fall through... 149 | } 150 | } 151 | 152 | // Normal parsing: 153 | 154 | // The first char of a line is an encoding of the length of the 155 | // current line. It is decoded into mAnnouncedOctetCount (not emitted as data): 156 | if (mLastWasCRLF) { 157 | // reset char-per-line counter: 158 | mLastWasCRLF = false; 159 | mCurrentOctetCount = 0; 160 | 161 | // try to decode the chars-on-this-line announcement: 162 | if (ch == 'e') { // maybe the beginning of the "end"? ;-) 163 | mIntoEndLine = 1; 164 | } else if (ch > 0x60) { 165 | // ### invalid line length char: what shall we do?? 
166 | } else if (ch > ' ') { 167 | mAnnouncedOctetCount = uuDecode(ch); 168 | } else if (ch == '\n') { 169 | mLastWasCRLF = true; // oops, empty line 170 | } 171 | 172 | continue; 173 | } 174 | 175 | // try converting ch to a 6-bit value: 176 | if (ch > 0x60) { 177 | continue; // invalid char 178 | } else if (ch > ' ') { 179 | value = uuDecode(ch); 180 | } else if (ch == '\n') { // line end 181 | mLastWasCRLF = true; 182 | continue; 183 | } else { 184 | continue; 185 | } 186 | 187 | // add the new bits to the output stream and flush full octets: 188 | switch (mStepNo) { 189 | case 0: 190 | mOutbits = value << 2; 191 | break; 192 | case 1: 193 | if (mCurrentOctetCount < mAnnouncedOctetCount) { 194 | *dcursor++ = (char)(mOutbits | value >> 4); 195 | } 196 | ++mCurrentOctetCount; 197 | mOutbits = value << 4; 198 | break; 199 | case 2: 200 | if (mCurrentOctetCount < mAnnouncedOctetCount) { 201 | *dcursor++ = (char)(mOutbits | value >> 2); 202 | } 203 | ++mCurrentOctetCount; 204 | mOutbits = value << 6; 205 | break; 206 | case 3: 207 | if (mCurrentOctetCount < mAnnouncedOctetCount) { 208 | *dcursor++ = (char)(mOutbits | value); 209 | } 210 | ++mCurrentOctetCount; 211 | mOutbits = 0; 212 | break; 213 | default: 214 | assert(0); 215 | } 216 | mStepNo = (mStepNo + 1) % 4; 217 | 218 | // check whether we ran over the announced octet count for this line: 219 | if (mCurrentOctetCount == mAnnouncedOctetCount + 1) { 220 | // qWarning() 221 | // << "UUDecoder: mismatch between announced (" 222 | // << mAnnouncedOctetCount << ") and actual line octet count!"; 223 | } 224 | } 225 | 226 | // return false when caller should call us again: 227 | return scursor == send; 228 | } // UUDecoder::decode() 229 | 230 | } // namespace KCodecs 231 | -------------------------------------------------------------------------------- /src/kcodecsuuencode.h: -------------------------------------------------------------------------------- 1 | /* -*- c++ -*- 2 | SPDX-FileCopyrightText: 2002 Marc Mutz 
3 | 4 | SPDX-License-Identifier: LGPL-2.0-or-later 5 | */ 6 | 7 | #ifndef KCODECS_UUENCODE_H 8 | #define KCODECS_UUENCODE_H 9 | 10 | #include "kcodecs.h" 11 | 12 | namespace KCodecs 13 | { 14 | /* 15 | A class representing the UUEncode codec. 16 | */ 17 | class UUCodec : public Codec 18 | { 19 | public: 20 | UUCodec() 21 | : Codec() 22 | { 23 | } 24 | 25 | ~UUCodec() override 26 | { 27 | } 28 | 29 | const char *name() const override 30 | { 31 | return "x-uuencode"; 32 | } 33 | 34 | qsizetype maxEncodedSizeFor(qsizetype insize, NewlineType newline = Codec::NewlineLF) const override 35 | { 36 | Q_UNUSED(newline); 37 | return insize; // we have no encoder! 38 | } 39 | 40 | qsizetype maxDecodedSizeFor(qsizetype insize, NewlineType newline = Codec::NewlineLF) const override 41 | { 42 | // assuming all characters are part of the uuencode stream (which 43 | // almost never holds due to required linebreaking; but 44 | // additional non-uu chars don't affect the output size), each 45 | // 4-tuple of them becomes a 3-tuple in the decoded octet 46 | // stream. 
So: 47 | qsizetype result = ((insize + 3) / 4) * 3; 48 | // but all of them may be \n, so 49 | if (newline == Codec::NewlineCRLF) { 50 | result *= 2; // :-o 51 | } 52 | return result; 53 | } 54 | 55 | Encoder *makeEncoder(NewlineType newline = Codec::NewlineLF) const override; 56 | 57 | Decoder *makeDecoder(NewlineType newline = Codec::NewlineLF) const override; 58 | }; 59 | 60 | } // namespace KCodecs 61 | 62 | #endif // KCODECS_UUENCODE_H 63 | -------------------------------------------------------------------------------- /src/kencodingprober.h: -------------------------------------------------------------------------------- 1 | /* 2 | This file is part of the KDE libraries 3 | 4 | SPDX-FileCopyrightText: 2008 Wang Hoi 5 | 6 | SPDX-License-Identifier: LGPL-2.0-or-later 7 | */ 8 | #ifndef KENCODINGPROBER_H 9 | #define KENCODINGPROBER_H 10 | 11 | // enable debug of private probers 12 | // #define DEBUG_PROBE 13 | 14 | #include 15 | 16 | #ifdef DEBUG_PROBE 17 | #include 18 | #endif 19 | 20 | #include 21 | #include 22 | #include 23 | 24 | class KEncodingProberPrivate; 25 | 26 | /*! 27 | * \class KEncodingProber 28 | * \inmodule KCodecs 29 | * 30 | * \brief Provides encoding detection(probe) capabilities. 31 | * 32 | * Probe the encoding of raw data only. 33 | * In the case it can't find it, return the most possible encoding it guessed. 34 | * 35 | * Always do Unicode probe regardless the ProberType 36 | * 37 | * Feed data to it several times with feed() until ProberState changes to FoundIt/NotMe, 38 | * or confidence() returns a value you find acceptable. 39 | * 40 | * Intended lifetime of the object: one instance per ProberType. 41 | * 42 | * Typical use: 43 | * \code 44 | * QByteArray data, moredata; 45 | * ... 
46 | * KEncodingProber prober(KEncodingProber::ChineseSimplified); 47 | * prober.feed(data); 48 | * prober.feed(moredata); 49 | * if (prober.confidence() > 0.6) 50 | * encoding = prober.encoding(); 51 | * \endcode 52 | * 53 | * At least 256 characters are needed to change the ProberState from Probing to FoundIt. 54 | * If you don't have that many characters to probe, 55 | * decide for yourself whether to accept the encoding guessed so far, based on confidence(). 56 | * 57 | */ 58 | class KCODECS_EXPORT KEncodingProber 59 | { 60 | Q_DECLARE_TR_FUNCTIONS(KEncodingProber) 61 | 62 | public: 63 | /*! 64 | * \value FoundIt The encoding was identified with certainty 65 | * \value NotMe The data is certainly not in any encoding supported by the current ProberType 66 | * \value Probing More data is needed to make a decision 67 | */ 68 | enum ProberState { 69 | FoundIt, 70 | NotMe, 71 | Probing, 72 | }; 73 | 74 | /*! 75 | * \value None 76 | * \value Universal 77 | * \value Arabic 78 | * \value Baltic 79 | * \value CentralEuropean 80 | * \value ChineseSimplified 81 | * \value ChineseTraditional 82 | * \value Cyrillic 83 | * \value Greek 84 | * \value Hebrew 85 | * \value Japanese 86 | * \value Korean 87 | * \value NorthernSaami 88 | * \value Other 89 | * \value SouthEasternEurope 90 | * \value Thai 91 | * \value Turkish 92 | * \value Unicode 93 | * \value WesternEuropean 94 | */ 95 | enum ProberType { 96 | None, 97 | Universal, 98 | Arabic, 99 | Baltic, 100 | CentralEuropean, 101 | ChineseSimplified, 102 | ChineseTraditional, 103 | Cyrillic, 104 | Greek, 105 | Hebrew, 106 | Japanese, 107 | Korean, 108 | NorthernSaami, 109 | Other, 110 | SouthEasternEurope, 111 | Thai, 112 | Turkish, 113 | Unicode, 114 | WesternEuropean, 115 | }; 116 | 117 | /*! 
118 | * Default ProberType is Universal(detect all possible encodings) 119 | */ 120 | KEncodingProber(ProberType proberType = Universal); 121 | 122 | ~KEncodingProber(); 123 | 124 | KEncodingProber(const KEncodingProber &) = delete; 125 | KEncodingProber &operator=(const KEncodingProber &) = delete; 126 | 127 | /*! 128 | * reset the prober's internal state and data. 129 | */ 130 | void reset(); 131 | 132 | /*! 133 | * The main class method 134 | * 135 | * Feed \a data to the prober 136 | * 137 | * Returns the ProberState after probing the fed data. 138 | */ 139 | ProberState feed(QByteArrayView data); 140 | // for API compatibility 141 | inline ProberState feed(const char *data, qsizetype len) 142 | { 143 | return feed({data, len}); 144 | } 145 | 146 | /*! 147 | * Returns the prober's current ProberState 148 | * 149 | */ 150 | ProberState state() const; 151 | 152 | /*! 153 | * Returns a QByteArray with the name of the best encoding it has guessed so far 154 | * \since 4.2.2 155 | */ 156 | QByteArray encoding() const; 157 | 158 | /*! 159 | * Returns the confidence(sureness) of encoding it guessed so far (0.0 ~ 0.99), not very reliable for single byte encodings 160 | */ 161 | float confidence() const; 162 | 163 | ProberType proberType() const; 164 | 165 | /*! 166 | * change current prober's ProberType and reset the prober 167 | * 168 | * \a proberType the new type 169 | */ 170 | void setProberType(ProberType proberType); 171 | 172 | /*! 173 | * Returns the ProberType for \a lang (e.g. proberTypeForName("Chinese Simplified") will return KEncodingProber::ChineseSimplified 174 | */ 175 | static ProberType proberTypeForName(const QString &lang); 176 | 177 | /*! 
178 | * map ProberType to language string 179 | * 180 | * \a proberType the proper type 181 | * 182 | * Returns the language string 183 | */ 184 | static QString nameForProberType(ProberType proberType); 185 | 186 | private: 187 | std::unique_ptr const d; 188 | }; 189 | 190 | #endif 191 | -------------------------------------------------------------------------------- /src/probers/CharDistribution.cpp: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | SPDX-FileCopyrightText: 1998 Netscape Communications Corporation 3 | 4 | SPDX-License-Identifier: MIT 5 | */ 6 | 7 | #include "CharDistribution.h" 8 | 9 | #include "tables/Big5Freq.tab" 10 | #include "tables/EUCKRFreq.tab" 11 | #include "tables/GB2312Freq.tab" 12 | #include "tables/JISFreq.tab" 13 | 14 | #define SURE_YES 0.99f 15 | #define SURE_NO 0.01f 16 | 17 | namespace kencodingprober 18 | { 19 | // return confidence base on received data 20 | float CharDistributionAnalysis::GetConfidence() 21 | { 22 | // if we didn't receive any character in our consideration range, return negative answer 23 | if (mTotalChars == 0) { 24 | return SURE_NO; 25 | } 26 | 27 | if (mTotalChars != mFreqChars) { 28 | float r = mFreqChars / ((mTotalChars - mFreqChars) * mTypicalDistributionRatio); 29 | 30 | if (r < SURE_YES) { 31 | return r; 32 | } 33 | } 34 | // normalize confidence, (we don't want to be 100% sure) 35 | return SURE_YES; 36 | } 37 | 38 | EUCKRDistributionAnalysis::EUCKRDistributionAnalysis() 39 | { 40 | mCharToFreqOrder = EUCKRCharToFreqOrder; 41 | mTableSize = EUCKR_TABLE_SIZE; 42 | mTypicalDistributionRatio = EUCKR_TYPICAL_DISTRIBUTION_RATIO; 43 | } 44 | 45 | GB2312DistributionAnalysis::GB2312DistributionAnalysis() 46 | { 47 | mCharToFreqOrder = GB2312CharToFreqOrder; 48 | mTableSize = GB2312_TABLE_SIZE; 49 | mTypicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO; 50 | } 51 | 52 | Big5DistributionAnalysis::Big5DistributionAnalysis() 53 | { 54 | mCharToFreqOrder 
= Big5CharToFreqOrder; 55 | mTableSize = BIG5_TABLE_SIZE; 56 | mTypicalDistributionRatio = BIG5_TYPICAL_DISTRIBUTION_RATIO; 57 | } 58 | 59 | SJISDistributionAnalysis::SJISDistributionAnalysis() 60 | { 61 | mCharToFreqOrder = JISCharToFreqOrder; 62 | mTableSize = JIS_TABLE_SIZE; 63 | mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO; 64 | } 65 | 66 | EUCJPDistributionAnalysis::EUCJPDistributionAnalysis() 67 | { 68 | mCharToFreqOrder = JISCharToFreqOrder; 69 | mTableSize = JIS_TABLE_SIZE; 70 | mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO; 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/probers/CharDistribution.h: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | SPDX-FileCopyrightText: 1998 Netscape Communications Corporation 3 | 4 | SPDX-License-Identifier: MIT 5 | */ 6 | 7 | #ifndef CharDistribution_h__ 8 | #define CharDistribution_h__ 9 | 10 | #include "kcodecs_export.h" 11 | 12 | #include 13 | 14 | #define ENOUGH_DATA_THRESHOLD 256 15 | 16 | namespace kencodingprober 17 | { 18 | class KCODECS_NO_EXPORT CharDistributionAnalysis 19 | { 20 | public: 21 | CharDistributionAnalysis() 22 | { 23 | Reset(); 24 | } 25 | virtual ~CharDistributionAnalysis() 26 | { 27 | } 28 | 29 | // feed a block of data and do distribution analysis 30 | void HandleData(const char * /* aBuf */, unsigned int /* aLen */) 31 | { 32 | } 33 | 34 | // Feed a character with known length 35 | void HandleOneChar(const char *aStr, unsigned int aCharLen) 36 | { 37 | int order; 38 | 39 | // we only care about 2-bytes character in our distribution analysis 40 | order = (aCharLen == 2) ? 
GetOrder(aStr) : -1; 41 | 42 | if (order >= 0) { 43 | mTotalChars++; 44 | // order is valid 45 | if ((unsigned int)order < mTableSize) { 46 | if (512 > mCharToFreqOrder[order]) { 47 | mFreqChars++; 48 | } 49 | } 50 | } 51 | } 52 | 53 | // return confidence base on existing data 54 | float GetConfidence(); 55 | 56 | // Reset analyser, clear any state 57 | void Reset(void) 58 | { 59 | mDone = false; 60 | mTotalChars = 0; 61 | mFreqChars = 0; 62 | } 63 | 64 | // This function is for future extension. Caller can use this function to control 65 | // analyser's behavior 66 | void SetOpion() 67 | { 68 | } 69 | 70 | // It is not necessary to receive all data to draw conclusion. For charset detection, 71 | // certain amount of data is enough 72 | bool GotEnoughData() 73 | { 74 | return mTotalChars > ENOUGH_DATA_THRESHOLD; 75 | } 76 | 77 | protected: 78 | // we do not handle character base on its original encoding string, but 79 | // convert this encoding string to a number, here called order. 80 | // This allows multiple encodings of a language to share one frequency table 81 | virtual int GetOrder(const char * /* str */) 82 | { 83 | return -1; 84 | } 85 | 86 | // If this flag is set to true, detection is done and conclusion has been made 87 | bool mDone; 88 | 89 | // The number of characters whose frequency order is less than 512 90 | unsigned int mFreqChars; 91 | 92 | // Total character encountered. 93 | unsigned int mTotalChars; 94 | 95 | // Mapping table to get frequency order from char order (get from GetOrder()) 96 | const short *mCharToFreqOrder; 97 | 98 | // Size of above table 99 | unsigned int mTableSize; 100 | 101 | // This is a constant value varies from language to language, it is used in 102 | // calculating confidence. See my paper for further detail. 
103 | float mTypicalDistributionRatio; 104 | }; 105 | 106 | class KCODECS_NO_EXPORT EUCKRDistributionAnalysis : public CharDistributionAnalysis 107 | { 108 | public: 109 | EUCKRDistributionAnalysis(); 110 | 111 | protected: 112 | // for euc-KR encoding, we are interested 113 | // first byte range: 0xb0 -- 0xfe 114 | // second byte range: 0xa1 -- 0xfe 115 | // no validation needed here. State machine has done that 116 | int GetOrder(const char *str) override 117 | { 118 | if ((unsigned char)*str >= (unsigned char)0xb0) { 119 | return 94 * ((unsigned char)str[0] - (unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1; 120 | } else { 121 | return -1; 122 | } 123 | } 124 | }; 125 | 126 | class KCODECS_NO_EXPORT GB2312DistributionAnalysis : public CharDistributionAnalysis 127 | { 128 | public: 129 | GB2312DistributionAnalysis(); 130 | 131 | protected: 132 | // for GB2312 encoding, we are interested 133 | // first byte range: 0xb0 -- 0xfe 134 | // second byte range: 0xa1 -- 0xfe 135 | // no validation needed here. State machine has done that 136 | int GetOrder(const char *str) override 137 | { 138 | if ((unsigned char)*str >= (unsigned char)0xb0 && (unsigned char)str[1] >= (unsigned char)0xa1) { 139 | return 94 * ((unsigned char)str[0] - (unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1; 140 | } else { 141 | return -1; 142 | } 143 | } 144 | }; 145 | 146 | class KCODECS_NO_EXPORT Big5DistributionAnalysis : public CharDistributionAnalysis 147 | { 148 | public: 149 | Big5DistributionAnalysis(); 150 | 151 | protected: 152 | // for big5 encoding, we are interested 153 | // first byte range: 0xa4 -- 0xfe 154 | // second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe 155 | // no validation needed here. 
State machine has done that 156 | int GetOrder(const char *str) override 157 | { 158 | if ((unsigned char)*str >= (unsigned char)0xa4) 159 | if ((unsigned char)str[1] >= (unsigned char)0xa1) { 160 | return 157 * ((unsigned char)str[0] - (unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0xa1 + 63; 161 | } else { 162 | return 157 * ((unsigned char)str[0] - (unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0x40; 163 | } 164 | else { 165 | return -1; 166 | } 167 | } 168 | }; 169 | 170 | class KCODECS_NO_EXPORT SJISDistributionAnalysis : public CharDistributionAnalysis 171 | { 172 | public: 173 | SJISDistributionAnalysis(); 174 | 175 | protected: 176 | // for sjis encoding, we are interested 177 | // first byte range: 0x81 -- 0x9f , 0xe0 -- 0xef 178 | // second byte range: 0x40 -- 0x7e, 0x81 -- 0xfe 179 | // no validation needed here. State machine has done that 180 | int GetOrder(const char *str) override 181 | { 182 | int order; 183 | if ((unsigned char)*str >= (unsigned char)0x81 && (unsigned char)*str <= (unsigned char)0x9f) { 184 | order = 188 * ((unsigned char)str[0] - (unsigned char)0x81); 185 | } else if ((unsigned char)*str >= (unsigned char)0xe0 && (unsigned char)*str <= (unsigned char)0xef) { 186 | order = 188 * ((unsigned char)str[0] - (unsigned char)0xe0 + 31); 187 | } else { 188 | return -1; 189 | } 190 | order += (unsigned char)*(str + 1) - 0x40; 191 | if ((unsigned char)str[1] > (unsigned char)0x7f) { 192 | order--; 193 | } 194 | return order; 195 | } 196 | }; 197 | 198 | class KCODECS_NO_EXPORT EUCJPDistributionAnalysis : public CharDistributionAnalysis 199 | { 200 | public: 201 | EUCJPDistributionAnalysis(); 202 | 203 | protected: 204 | // for euc-JP encoding, we are interested 205 | // first byte range: 0xa0 -- 0xfe 206 | // second byte range: 0xa1 -- 0xfe 207 | // no validation needed here. 
State machine has done that 208 | int GetOrder(const char *str) override 209 | { 210 | if ((unsigned char)*str >= (unsigned char)0xa0) { 211 | return 94 * ((unsigned char)str[0] - (unsigned char)0xa1) + (unsigned char)str[1] - (unsigned char)0xa1; 212 | } else { 213 | return -1; 214 | } 215 | } 216 | }; 217 | } 218 | #endif // CharDistribution_h__ 219 | -------------------------------------------------------------------------------- /src/probers/ChineseGroupProber.cpp: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | SPDX-FileCopyrightText: 1998 Netscape Communications Corporation 3 | 4 | SPDX-License-Identifier: MIT 5 | */ 6 | 7 | #include "ChineseGroupProber.h" 8 | 9 | #include "UnicodeGroupProber.h" 10 | #include "nsBig5Prober.h" 11 | #include "nsGB2312Prober.h" 12 | 13 | #include 14 | #include 15 | 16 | namespace kencodingprober 17 | { 18 | #ifdef DEBUG_PROBE 19 | static const char *const ProberName[] = { 20 | "Unicode", 21 | "GB18030", 22 | "Big5", 23 | }; 24 | 25 | #endif 26 | 27 | ChineseGroupProber::ChineseGroupProber() 28 | { 29 | mProbers[0] = new UnicodeGroupProber(); 30 | mProbers[1] = new nsGB18030Prober(); 31 | mProbers[2] = new nsBig5Prober(); 32 | Reset(); 33 | } 34 | 35 | ChineseGroupProber::~ChineseGroupProber() 36 | { 37 | for (unsigned int i = 0; i < CN_NUM_OF_PROBERS; i++) { 38 | delete mProbers[i]; 39 | } 40 | } 41 | 42 | const char *ChineseGroupProber::GetCharSetName() 43 | { 44 | if (mBestGuess == -1) { 45 | GetConfidence(); 46 | if (mBestGuess == -1) { 47 | mBestGuess = 1; // assume it's GB18030 48 | } 49 | } 50 | return mProbers[mBestGuess]->GetCharSetName(); 51 | } 52 | 53 | void ChineseGroupProber::Reset(void) 54 | { 55 | mActiveNum = 0; 56 | for (unsigned int i = 0; i < CN_NUM_OF_PROBERS; i++) { 57 | if (mProbers[i]) { 58 | mProbers[i]->Reset(); 59 | mIsActive[i] = true; 60 | ++mActiveNum; 61 | } else { 62 | mIsActive[i] = false; 63 | } 64 | } 65 | mBestGuess = -1; 66 | mState = 
eDetecting; 67 | } 68 | 69 | nsProbingState ChineseGroupProber::HandleData(const char *aBuf, unsigned int aLen) 70 | { 71 | nsProbingState st; 72 | unsigned int i; 73 | 74 | // do filtering to reduce load to probers 75 | char *highbyteBuf; 76 | char *hptr; 77 | bool keepNext = true; // assume previous is not ascii, it will do no harm except add some noise 78 | hptr = highbyteBuf = (char *)malloc(aLen); 79 | if (!hptr) { 80 | return mState; 81 | } 82 | for (i = 0; i < aLen; ++i) { 83 | if (aBuf[i] & 0x80) { 84 | *hptr++ = aBuf[i]; 85 | keepNext = true; 86 | } else { 87 | // if previous is highbyte, keep this even it is an ASCII 88 | if (keepNext) { 89 | *hptr++ = aBuf[i]; 90 | keepNext = false; 91 | } 92 | } 93 | } 94 | 95 | for (i = 0; i < CN_NUM_OF_PROBERS; ++i) { 96 | if (!mIsActive[i]) { 97 | continue; 98 | } 99 | st = mProbers[i]->HandleData(highbyteBuf, hptr - highbyteBuf); 100 | if (st == eFoundIt) { 101 | mBestGuess = i; 102 | mState = eFoundIt; 103 | break; 104 | } else if (st == eNotMe) { 105 | mIsActive[i] = false; 106 | --mActiveNum; 107 | if (mActiveNum == 0) { 108 | mState = eNotMe; 109 | break; 110 | } 111 | } 112 | } 113 | 114 | free(highbyteBuf); 115 | 116 | return mState; 117 | } 118 | 119 | float ChineseGroupProber::GetConfidence(void) 120 | { 121 | unsigned int i; 122 | float bestConf = 0.0; 123 | float cf; 124 | 125 | switch (mState) { 126 | case eFoundIt: 127 | return (float)0.99; 128 | case eNotMe: 129 | return (float)0.01; 130 | default: 131 | for (i = 0; i < CN_NUM_OF_PROBERS; ++i) { 132 | if (!mIsActive[i]) { 133 | continue; 134 | } 135 | cf = mProbers[i]->GetConfidence(); 136 | if (bestConf < cf) { 137 | bestConf = cf; 138 | mBestGuess = i; 139 | } 140 | } 141 | } 142 | return bestConf; 143 | } 144 | 145 | #ifdef DEBUG_PROBE 146 | void ChineseGroupProber::DumpStatus() 147 | { 148 | unsigned int i; 149 | float cf; 150 | 151 | GetConfidence(); 152 | for (i = 0; i < CN_NUM_OF_PROBERS; i++) { 153 | if (!mIsActive[i]) { 154 | printf(" Chinese 
group inactive: [%s] (confidence is too low).\r\n", ProberName[i]); 155 | } else { 156 | cf = mProbers[i]->GetConfidence(); 157 | printf(" Chinese group %1.3f: [%s]\r\n", cf, ProberName[i]); 158 | } 159 | } 160 | } 161 | #endif 162 | } 163 | -------------------------------------------------------------------------------- /src/probers/ChineseGroupProber.h: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | SPDX-FileCopyrightText: 1998 Netscape Communications Corporation 3 | 4 | SPDX-License-Identifier: MIT 5 | */ 6 | 7 | #ifndef CHINESEGROUPPROBER_H 8 | #define CHINESEGROUPPROBER_H 9 | 10 | #include "nsCharSetProber.h" 11 | 12 | #define CN_NUM_OF_PROBERS 3 13 | namespace kencodingprober 14 | { 15 | class KCODECS_NO_EXPORT ChineseGroupProber : public nsCharSetProber 16 | { 17 | public: 18 | ChineseGroupProber(); 19 | ~ChineseGroupProber() override; 20 | nsProbingState HandleData(const char *aBuf, unsigned int aLen) override; 21 | const char *GetCharSetName() override; 22 | nsProbingState GetState(void) override 23 | { 24 | return mState; 25 | } 26 | void Reset(void) override; 27 | float GetConfidence(void) override; 28 | void SetOpion() override 29 | { 30 | } 31 | 32 | #ifdef DEBUG_PROBE 33 | void DumpStatus() override; 34 | #endif 35 | 36 | protected: 37 | nsProbingState mState; 38 | nsCharSetProber *mProbers[CN_NUM_OF_PROBERS]; 39 | bool mIsActive[CN_NUM_OF_PROBERS]; 40 | int mBestGuess; 41 | unsigned int mActiveNum; 42 | }; 43 | } 44 | #endif /* CHINESEGROUPPROBER_H */ 45 | -------------------------------------------------------------------------------- /src/probers/JapaneseGroupProber.cpp: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | SPDX-FileCopyrightText: 1998 Netscape Communications Corporation 3 | 4 | SPDX-License-Identifier: MIT 5 | */ 6 | 7 | #include "JapaneseGroupProber.h" 8 | 9 | #include 10 | #include 11 | 12 | namespace kencodingprober 13 | { 
14 | #ifdef DEBUG_PROBE 15 | static const char *const ProberName[] = { 16 | "Unicode", 17 | "Shift_JIS", 18 | "EUC-JP", 19 | }; 20 | 21 | #endif 22 | 23 | JapaneseGroupProber::JapaneseGroupProber() 24 | { 25 | mProbers[0] = new UnicodeGroupProber(); 26 | mProbers[1] = new nsSJISProber(); 27 | mProbers[2] = new nsEUCJPProber(); 28 | Reset(); 29 | } 30 | 31 | JapaneseGroupProber::~JapaneseGroupProber() 32 | { 33 | for (unsigned int i = 0; i < JP_NUM_OF_PROBERS; i++) { 34 | delete mProbers[i]; 35 | } 36 | } 37 | 38 | const char *JapaneseGroupProber::GetCharSetName() 39 | { 40 | if (mBestGuess == -1) { 41 | GetConfidence(); 42 | if (mBestGuess == -1) { 43 | mBestGuess = 1; // no prober scored above 0: fall back to mProbers[1] (the nsSJISProber) 44 | } 45 | } 46 | return mProbers[mBestGuess]->GetCharSetName(); 47 | } 48 | 49 | void JapaneseGroupProber::Reset(void) 50 | { 51 | mActiveNum = 0; 52 | for (unsigned int i = 0; i < JP_NUM_OF_PROBERS; i++) { 53 | if (mProbers[i]) { 54 | mProbers[i]->Reset(); 55 | mIsActive[i] = true; 56 | ++mActiveNum; 57 | } else { 58 | mIsActive[i] = false; 59 | } 60 | } 61 | mBestGuess = -1; 62 | mState = eDetecting; 63 | } 64 | 65 | nsProbingState JapaneseGroupProber::HandleData(const char *aBuf, unsigned int aLen) 66 | { 67 | nsProbingState st; 68 | unsigned int i; 69 | 70 | // do filtering to reduce load to probers 71 | char *highbyteBuf; 72 | char *hptr; 73 | bool keepNext = true; // assume previous is not ascii, it will do no harm except add some noise 74 | hptr = highbyteBuf = (char *)malloc(aLen); 75 | if (!hptr) { 76 | return mState; 77 | } 78 | for (i = 0; i < aLen; ++i) { 79 | if (aBuf[i] & 0x80) { 80 | *hptr++ = aBuf[i]; 81 | keepNext = true; 82 | } else { 83 | // if the previous byte was a high byte, keep this one even if it is ASCII 84 | if (keepNext) { 85 | *hptr++ = aBuf[i]; 86 | keepNext = false; 87 | } 88 | } 89 | } 90 | 91 | for (i = 0; i < JP_NUM_OF_PROBERS; ++i) { 92 | if (!mIsActive[i]) { 93 | continue; 94 | } 95 | st = mProbers[i]->HandleData(highbyteBuf, hptr - highbyteBuf); 96 | if (st == 
eFoundIt) { 97 | mBestGuess = i; 98 | mState = eFoundIt; 99 | break; 100 | } else if (st == eNotMe) { 101 | mIsActive[i] = false; 102 | --mActiveNum; 103 | if (mActiveNum == 0) { 104 | mState = eNotMe; 105 | break; 106 | } 107 | } 108 | } 109 | 110 | free(highbyteBuf); 111 | 112 | return mState; 113 | } 114 | 115 | float JapaneseGroupProber::GetConfidence(void) 116 | { 117 | unsigned int i; 118 | float bestConf = 0.0; 119 | float cf; 120 | 121 | switch (mState) { 122 | case eFoundIt: 123 | return (float)0.99; 124 | case eNotMe: 125 | return (float)0.01; 126 | default: 127 | for (i = 0; i < JP_NUM_OF_PROBERS; ++i) { 128 | if (!mIsActive[i]) { 129 | continue; 130 | } 131 | cf = mProbers[i]->GetConfidence(); 132 | if (bestConf < cf) { 133 | bestConf = cf; 134 | mBestGuess = i; 135 | } 136 | } 137 | } 138 | return bestConf; 139 | } 140 | 141 | #ifdef DEBUG_PROBE 142 | void JapaneseGroupProber::DumpStatus() 143 | { 144 | unsigned int i; 145 | float cf; 146 | 147 | GetConfidence(); 148 | for (i = 0; i < JP_NUM_OF_PROBERS; i++) { 149 | if (!mIsActive[i]) { 150 | printf(" Japanese group inactive: [%s] (confidence is too low).\r\n", ProberName[i]); 151 | } else { 152 | cf = mProbers[i]->GetConfidence(); 153 | printf(" Japanese group %1.3f: [%s]\r\n", cf, ProberName[i]); 154 | } 155 | } 156 | } 157 | #endif 158 | } 159 | -------------------------------------------------------------------------------- /src/probers/JapaneseGroupProber.h: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | SPDX-FileCopyrightText: 1998 Netscape Communications Corporation 3 | 4 | SPDX-License-Identifier: MIT 5 | */ 6 | 7 | #ifndef JAPANESEGROUPPROBER_H 8 | #define JAPANESEGROUPPROBER_H 9 | 10 | #include "UnicodeGroupProber.h" 11 | #include "nsCharSetProber.h" 12 | #include "nsEUCJPProber.h" 13 | #include "nsSJISProber.h" 14 | 15 | #define JP_NUM_OF_PROBERS 3 16 | namespace kencodingprober 17 | { 18 | class KCODECS_NO_EXPORT JapaneseGroupProber : 
public nsCharSetProber 19 | { 20 | public: 21 | JapaneseGroupProber(); 22 | ~JapaneseGroupProber() override; 23 | nsProbingState HandleData(const char *aBuf, unsigned int aLen) override; 24 | const char *GetCharSetName() override; 25 | nsProbingState GetState(void) override 26 | { 27 | return mState; 28 | } 29 | void Reset(void) override; 30 | float GetConfidence(void) override; 31 | void SetOpion() override 32 | { 33 | } 34 | 35 | #ifdef DEBUG_PROBE 36 | void DumpStatus() override; 37 | #endif 38 | 39 | protected: 40 | nsProbingState mState; 41 | nsCharSetProber *mProbers[JP_NUM_OF_PROBERS]; 42 | bool mIsActive[JP_NUM_OF_PROBERS]; 43 | int mBestGuess; 44 | unsigned int mActiveNum; 45 | }; 46 | } 47 | #endif /* JAPANESEGROUPPROBER_H */ 48 | -------------------------------------------------------------------------------- /src/probers/JpCntx.h: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | SPDX-FileCopyrightText: 1998 Netscape Communications Corporation 3 | 4 | SPDX-License-Identifier: MIT 5 | */ 6 | 7 | #ifndef __JPCNTX_H__ 8 | #define __JPCNTX_H__ 9 | 10 | #include "kcodecs_export.h" 11 | 12 | #include 13 | 14 | #define NUM_OF_CATEGORY 6 15 | 16 | #define ENOUGH_REL_THRESHOLD 100 17 | #define MAX_REL_THRESHOLD 1000 18 | namespace kencodingprober 19 | { 20 | // hiragana frequency category table 21 | extern const char jp2CharContext[83][83]; 22 | 23 | class KCODECS_NO_EXPORT JapaneseContextAnalysis 24 | { 25 | public: 26 | JapaneseContextAnalysis() 27 | { 28 | Reset(); 29 | } 30 | virtual ~JapaneseContextAnalysis() 31 | { 32 | } 33 | 34 | void HandleData(const char *aBuf, unsigned int aLen); 35 | 36 | void HandleOneChar(const char *aStr, unsigned int aCharLen) 37 | { 38 | int order; 39 | 40 | // if we received enough data, stop here 41 | if (mTotalRel > MAX_REL_THRESHOLD) { 42 | mDone = true; 43 | } 44 | if (mDone) { 45 | return; 46 | } 47 | 48 | // Only 2-bytes characters are of our interest 49 | order = 
(aCharLen == 2) ? GetOrder(aStr) : -1; 50 | if (order != -1 && mLastCharOrder != -1) { 51 | mTotalRel++; 52 | // count this sequence to its category counter 53 | mRelSample[(int)jp2CharContext[mLastCharOrder][order]]++; 54 | } 55 | mLastCharOrder = order; 56 | } 57 | 58 | float GetConfidence(); 59 | void Reset(void); 60 | void SetOpion() 61 | { 62 | } 63 | bool GotEnoughData() 64 | { 65 | return mTotalRel > ENOUGH_REL_THRESHOLD; 66 | } 67 | 68 | protected: 69 | virtual int GetOrder(const char *str, unsigned int *charLen) = 0; 70 | virtual int GetOrder(const char *str) = 0; 71 | 72 | // category counters, each integer counts sequence in its category 73 | unsigned int mRelSample[NUM_OF_CATEGORY]; 74 | 75 | // total sequence received 76 | unsigned int mTotalRel; 77 | 78 | // The order of previous char 79 | int mLastCharOrder; 80 | 81 | // if last byte in current buffer is not the last byte of a character, we 82 | // need to know how many byte to skip in next buffer. 83 | unsigned int mNeedToSkipCharNum; 84 | 85 | // If this flag is set to true, detection is done and conclusion has been made 86 | bool mDone; 87 | }; 88 | 89 | class KCODECS_NO_EXPORT SJISContextAnalysis : public JapaneseContextAnalysis 90 | { 91 | // SJISContextAnalysis(){}; 92 | protected: 93 | int GetOrder(const char *str, unsigned int *charLen) override; 94 | 95 | int GetOrder(const char *str) override 96 | { 97 | // We only interested in Hiragana, so first byte is '\202' 98 | if (*str == '\202' && (unsigned char)*(str + 1) >= (unsigned char)0x9f && (unsigned char)*(str + 1) <= (unsigned char)0xf1) { 99 | return (unsigned char)*(str + 1) - (unsigned char)0x9f; 100 | } 101 | return -1; 102 | } 103 | }; 104 | 105 | class KCODECS_NO_EXPORT EUCJPContextAnalysis : public JapaneseContextAnalysis 106 | { 107 | protected: 108 | int GetOrder(const char *str, unsigned int *charLen) override; 109 | int GetOrder(const char *str) override 110 | // We only interested in Hiragana, so first byte is '\244' 111 | { 
112 | if (*str == '\244' // 113 | && (unsigned char)*(str + 1) >= (unsigned char)0xa1 // 114 | && (unsigned char)*(str + 1) <= (unsigned char)0xf3) { 115 | return (unsigned char)*(str + 1) - (unsigned char)0xa1; 116 | } 117 | return -1; 118 | } 119 | }; 120 | } 121 | #endif /* __JPCNTX_H__ */ 122 | -------------------------------------------------------------------------------- /src/probers/UnicodeGroupProber.cpp: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | SPDX-FileCopyrightText: 2008 Wang Kai 3 | 4 | SPDX-License-Identifier: MIT 5 | */ 6 | 7 | #include "UnicodeGroupProber.h" 8 | 9 | #include 10 | #include 11 | 12 | namespace kencodingprober 13 | { 14 | UnicodeGroupProber::UnicodeGroupProber(void) 15 | { 16 | mCodingSM[0] = new nsCodingStateMachine(&UTF8SMModel); 17 | mCodingSM[1] = new nsCodingStateMachine(&UCS2LESMModel); 18 | mCodingSM[2] = new nsCodingStateMachine(&UCS2BESMModel); 19 | mActiveSM = NUM_OF_UNICODE_CHARSETS; 20 | mState = eDetecting; 21 | mDetectedCharset = "UTF-8"; 22 | } 23 | 24 | UnicodeGroupProber::~UnicodeGroupProber(void) 25 | { 26 | for (unsigned int i = 0; i < NUM_OF_UNICODE_CHARSETS; i++) { 27 | delete mCodingSM[i]; 28 | } 29 | } 30 | 31 | void UnicodeGroupProber::Reset(void) 32 | { 33 | mState = eDetecting; 34 | for (unsigned int i = 0; i < NUM_OF_UNICODE_CHARSETS; i++) { 35 | mCodingSM[i]->Reset(); 36 | } 37 | mActiveSM = NUM_OF_UNICODE_CHARSETS; 38 | mDetectedCharset = "UTF-8"; 39 | } 40 | 41 | nsProbingState UnicodeGroupProber::HandleData(const char *aBuf, unsigned int aLen) 42 | { 43 | nsSMState codingState; 44 | static bool disableUTF16LE = false; 45 | static bool disableUTF16BE = false; 46 | // NOTE(review): these function-local statics are shared by every UnicodeGroupProber instance and Reset() does not clear them; once either becomes true the UTF-16 heuristic block guarded below is skipped on all later calls - confirm this is intended 47 | if (mActiveSM == 0 || aLen < 2) { 48 | mState = eNotMe; 49 | return mState; 50 | } 51 | 52 | if (!(disableUTF16LE || disableUTF16BE)) { 53 | if (aLen % 2 != 0) { 54 | disableUTF16LE = true; 55 | disableUTF16BE = true; 56 | } 57 | const uint weight_BOM = sqrt((double)aLen) + aLen
/ 10.0; 58 | uint counts[5] = {0, 0, 0, 0, 0}; 59 | for (uint i = 0; i < 5; i++) { 60 | counts[i] = std::count(aBuf, aBuf + aLen, char(i)); 61 | } 62 | const double weight_zero = (2.0 * (counts[0] + counts[1] + counts[2] + counts[3] + counts[4]) + weight_BOM) / aLen; 63 | if (weight_zero < log(1.4142)) { 64 | disableUTF16LE = true; 65 | disableUTF16BE = true; 66 | } 67 | if (4 >= aBuf[1] && aBuf[1] >= 0 && QChar::isPrint(static_cast(aBuf[0]))) { 68 | disableUTF16BE = true; 69 | } else { 70 | disableUTF16LE = true; 71 | } 72 | if (disableUTF16BE) { 73 | mActiveSM--; 74 | } 75 | if (disableUTF16LE) { 76 | nsCodingStateMachine *t; 77 | t = mCodingSM[1]; 78 | mCodingSM[1] = mCodingSM[2]; 79 | mCodingSM[2] = t; 80 | mActiveSM--; 81 | } 82 | } 83 | 84 | for (uint i = 0; i < aLen; ++i) { 85 | for (int j = mActiveSM - 1; j >= 0; --j) { 86 | // byte is feed to all active state machine 87 | codingState = mCodingSM[j]->NextState(aBuf[i]); 88 | if (codingState == eError) { 89 | // got negative answer for this state machine, make it inactive 90 | mActiveSM--; 91 | if (mActiveSM == 0) { 92 | mState = eNotMe; 93 | return mState; 94 | } else if (j != (int)mActiveSM) { 95 | nsCodingStateMachine *t; 96 | t = mCodingSM[mActiveSM]; 97 | mCodingSM[mActiveSM] = mCodingSM[j]; 98 | mCodingSM[j] = t; 99 | } 100 | } else if (codingState == eItsMe) { 101 | mState = eFoundIt; 102 | mDetectedCharset = mCodingSM[j]->GetCodingStateMachine(); 103 | return mState; 104 | } else if (mState == eDetecting) { 105 | mDetectedCharset = mCodingSM[j]->GetCodingStateMachine(); 106 | }; 107 | } 108 | } 109 | return mState; 110 | } 111 | 112 | float UnicodeGroupProber::GetConfidence() 113 | { 114 | if (mState == eFoundIt) { 115 | return 0.99f; 116 | } else { 117 | return 0.0f; 118 | } 119 | } 120 | 121 | #ifdef DEBUG_PROBE 122 | void UnicodeGroupProber::DumpStatus() 123 | { 124 | GetConfidence(); 125 | for (uint i = 0; i < mActiveSM; i++) { 126 | qDebug() << "Unicode group" << mCodingSM[i]->DumpCurrentState() 
<< mCodingSM[i]->GetCodingStateMachine(); 127 | } 128 | } 129 | #endif 130 | 131 | } 132 | -------------------------------------------------------------------------------- /src/probers/UnicodeGroupProber.h: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | SPDX-FileCopyrightText: 2008 Wang Kai 3 | 4 | SPDX-License-Identifier: MIT 5 | */ 6 | 7 | #ifndef UNICODEGROUPPROBER_H 8 | #define UNICODEGROUPPROBER_H 9 | 10 | #include "nsCharSetProber.h" 11 | #include "nsCodingStateMachine.h" 12 | 13 | #define NUM_OF_UNICODE_CHARSETS 3 14 | namespace kencodingprober 15 | { 16 | class KCODECS_NO_EXPORT UnicodeGroupProber : public nsCharSetProber 17 | { 18 | public: 19 | UnicodeGroupProber(void); 20 | ~UnicodeGroupProber(void) override; 21 | nsProbingState HandleData(const char *aBuf, unsigned int aLen) override; 22 | const char *GetCharSetName() override 23 | { 24 | return mDetectedCharset; 25 | } 26 | nsProbingState GetState(void) override 27 | { 28 | return mState; 29 | } 30 | void Reset(void) override; 31 | float GetConfidence() override; 32 | void SetOpion() override 33 | { 34 | } 35 | #ifdef DEBUG_PROBE 36 | void DumpStatus() override; 37 | #endif 38 | 39 | protected: 40 | void GetDistribution(unsigned int aCharLen, const char *aStr); 41 | 42 | nsCodingStateMachine *mCodingSM[NUM_OF_UNICODE_CHARSETS]; 43 | unsigned int mActiveSM; 44 | nsProbingState mState; 45 | const char *mDetectedCharset; 46 | }; 47 | } 48 | #endif /* UNICODEGROUPPROBER_H */ 49 | -------------------------------------------------------------------------------- /src/probers/nsBig5Prober.cpp: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | SPDX-FileCopyrightText: 1998 Netscape Communications Corporation 3 | 4 | SPDX-License-Identifier: MIT 5 | */ 6 | 7 | #include "nsBig5Prober.h" 8 | 9 | namespace kencodingprober 10 | { 11 | void nsBig5Prober::Reset(void) 12 | { 13 | mCodingSM->Reset(); 14 | mState 
= eDetecting; 15 | mDistributionAnalyser.Reset(); 16 | } 17 | 18 | nsProbingState nsBig5Prober::HandleData(const char *aBuf, unsigned int aLen) 19 | { 20 | if (aLen == 0) { 21 | return mState; 22 | } 23 | 24 | for (unsigned int i = 0; i < aLen; i++) { 25 | const nsSMState codingState = mCodingSM->NextState(aBuf[i]); 26 | if (codingState == eError) { 27 | mState = eNotMe; 28 | break; 29 | } 30 | if (codingState == eItsMe) { 31 | mState = eFoundIt; 32 | break; 33 | } 34 | if (codingState == eStart) { 35 | unsigned int charLen = mCodingSM->GetCurrentCharLen(); 36 | 37 | if (i == 0) { 38 | mLastChar[1] = aBuf[0]; 39 | mDistributionAnalyser.HandleOneChar(mLastChar, charLen); 40 | } else { 41 | mDistributionAnalyser.HandleOneChar(aBuf + i - 1, charLen); 42 | } 43 | } 44 | } 45 | 46 | mLastChar[0] = aBuf[aLen - 1]; 47 | 48 | if (mState == eDetecting) { 49 | if (mDistributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD) { 50 | mState = eFoundIt; 51 | } 52 | } 53 | 54 | return mState; 55 | } 56 | 57 | float nsBig5Prober::GetConfidence(void) 58 | { 59 | float distribCf = mDistributionAnalyser.GetConfidence(); 60 | 61 | return (float)distribCf; 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /src/probers/nsBig5Prober.h: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | SPDX-FileCopyrightText: 1998 Netscape Communications Corporation 3 | 4 | SPDX-License-Identifier: MIT 5 | */ 6 | 7 | #ifndef nsBig5Prober_h__ 8 | #define nsBig5Prober_h__ 9 | 10 | #include "CharDistribution.h" 11 | #include "nsCharSetProber.h" 12 | #include "nsCodingStateMachine.h" 13 | namespace kencodingprober 14 | { 15 | class KCODECS_NO_EXPORT nsBig5Prober : public nsCharSetProber 16 | { 17 | public: 18 | nsBig5Prober(void) 19 | { 20 | mCodingSM = new nsCodingStateMachine(&Big5SMModel); 21 | Reset(); 22 | } 23 | ~nsBig5Prober() override 24 | { 25 | delete mCodingSM; 26 | } 27 | 
nsProbingState HandleData(const char *aBuf, unsigned int aLen) override; 28 | const char *GetCharSetName() override 29 | { 30 | return "Big5"; 31 | } 32 | nsProbingState GetState(void) override 33 | { 34 | return mState; 35 | } 36 | void Reset(void) override; 37 | float GetConfidence(void) override; 38 | void SetOpion() override 39 | { 40 | } 41 | 42 | protected: 43 | void GetDistribution(unsigned int aCharLen, const char *aStr); 44 | 45 | nsCodingStateMachine *mCodingSM; 46 | nsProbingState mState; 47 | 48 | // Big5ContextAnalysis mContextAnalyser; 49 | Big5DistributionAnalysis mDistributionAnalyser; 50 | char mLastChar[2]; 51 | }; 52 | } 53 | 54 | #endif /* nsBig5Prober_h__ */ 55 | -------------------------------------------------------------------------------- /src/probers/nsCharSetProber.cpp: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | SPDX-FileCopyrightText: 1998 Netscape Communications Corporation 3 | 4 | SPDX-License-Identifier: MIT 5 | */ 6 | 7 | #include "nsCharSetProber.h" 8 | 9 | #include 10 | 11 | namespace kencodingprober 12 | { 13 | // This filter applies to all scripts which do not use English characters 14 | bool nsCharSetProber::FilterWithoutEnglishLetters(const char *aBuf, unsigned int aLen, char **newBuf, unsigned int &newLen) 15 | { 16 | char *newptr; 17 | char *prevPtr; 18 | char *curPtr; 19 | 20 | bool meetMSB = false; 21 | newptr = *newBuf = (char *)malloc(aLen); 22 | if (!newptr) { 23 | return false; 24 | } 25 | 26 | for (curPtr = prevPtr = (char *)aBuf; curPtr < aBuf + aLen; ++curPtr) { 27 | if (*curPtr & 0x80) { 28 | meetMSB = true; 29 | } else if (*curPtr < 'A' || (*curPtr > 'Z' && *curPtr < 'a') || *curPtr > 'z') { 30 | // current char is a symbol, most likely a punctuation. 
we treat it as segment delimiter 31 | if (meetMSB && curPtr > prevPtr) 32 | // this segment contains more than single symbol, and it has upper ASCII, we need to keep it 33 | { 34 | while (prevPtr < curPtr) { 35 | *newptr++ = *prevPtr++; 36 | } 37 | prevPtr++; 38 | *newptr++ = ' '; 39 | meetMSB = false; 40 | } else { // ignore current segment. (either because it is just a symbol or just an English word) 41 | prevPtr = curPtr + 1; 42 | } 43 | } 44 | } 45 | if (meetMSB && curPtr > prevPtr) { 46 | while (prevPtr < curPtr) { 47 | *newptr++ = *prevPtr++; 48 | } 49 | } 50 | 51 | newLen = newptr - *newBuf; 52 | 53 | return true; 54 | } 55 | 56 | // This filter applies to all scripts which contain both English characters and upper ASCII characters. 57 | bool nsCharSetProber::FilterWithEnglishLetters(const char *aBuf, unsigned int aLen, char **newBuf, unsigned int &newLen) 58 | { 59 | // do filtering to reduce load to probers 60 | char *newptr; 61 | char *prevPtr; 62 | char *curPtr; 63 | bool isInTag = false; 64 | 65 | newptr = *newBuf = (char *)malloc(aLen); 66 | if (!newptr) { 67 | return false; 68 | } 69 | 70 | for (curPtr = prevPtr = (char *)aBuf; curPtr < aBuf + aLen; ++curPtr) { 71 | if (*curPtr == '>') { 72 | isInTag = false; 73 | } else if (*curPtr == '<') { 74 | isInTag = true; 75 | } 76 | 77 | if (!(*curPtr & 0x80) // 78 | && (*curPtr < 'A' || (*curPtr > 'Z' && *curPtr < 'a') || *curPtr > 'z')) { 79 | if (curPtr > prevPtr && !isInTag) // Current segment contains more than just a symbol 80 | // and it is not inside a tag, keep it. 81 | { 82 | while (prevPtr < curPtr) { 83 | *newptr++ = *prevPtr++; 84 | } 85 | prevPtr++; 86 | *newptr++ = ' '; 87 | } else { 88 | prevPtr = curPtr + 1; 89 | } 90 | } 91 | } 92 | 93 | // If the current segment contains more than just a symbol 94 | // and it is not inside a tag then keep it. 
95 | if (!isInTag) { 96 | while (prevPtr < curPtr) { 97 | *newptr++ = *prevPtr++; 98 | } 99 | } 100 | 101 | newLen = newptr - *newBuf; 102 | 103 | return true; 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /src/probers/nsCharSetProber.h: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | SPDX-FileCopyrightText: 1998 Netscape Communications Corporation 3 | 4 | SPDX-License-Identifier: MIT 5 | */ 6 | 7 | #ifndef nsCharSetProber_h__ 8 | #define nsCharSetProber_h__ 9 | 10 | #include "kencodingprober.h" 11 | 12 | namespace kencodingprober 13 | { 14 | typedef enum { 15 | eDetecting = 0, // We are still detecting, no sure answer yet, but caller can ask for confidence. 16 | eFoundIt = 1, // That's a positive answer 17 | eNotMe = 2, // Negative answer 18 | } nsProbingState; 19 | 20 | #define SHORTCUT_THRESHOLD (float)0.95 21 | // Abstract interface implemented by every charset prober. 22 | class KCODECS_NO_EXPORT nsCharSetProber 23 | { 24 | public: 25 | virtual ~nsCharSetProber() 26 | { 27 | } 28 | virtual const char *GetCharSetName() = 0; 29 | virtual nsProbingState HandleData(const char *aBuf, unsigned int aLen) = 0; 30 | virtual nsProbingState GetState(void) = 0; 31 | virtual void Reset(void) = 0; 32 | virtual float GetConfidence(void) = 0; 33 | virtual void SetOpion() = 0; 34 | // Debug hook: declared virtual here so that derived probers can mark their DumpStatus() as override. 35 | #ifdef DEBUG_PROBE 36 | virtual void DumpStatus() 37 | { 38 | } 39 | #endif 40 | 41 | // Helper functions used in the Latin1 and Group probers. 42 | // Both functions allocate a new buffer for newBuf with malloc(). This buffer should be 43 | // freed by the caller using free(). 44 | // Both functions return false in case of memory allocation failure.
45 | static bool FilterWithoutEnglishLetters(const char *aBuf, unsigned int aLen, char **newBuf, unsigned int &newLen); 46 | static bool FilterWithEnglishLetters(const char *aBuf, unsigned int aLen, char **newBuf, unsigned int &newLen); 47 | }; 48 | } 49 | #endif /* nsCharSetProber_h__ */ 50 | -------------------------------------------------------------------------------- /src/probers/nsCodingStateMachine.h: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | SPDX-FileCopyrightText: 1998 Netscape Communications Corporation 3 | 4 | SPDX-License-Identifier: MIT 5 | */ 6 | 7 | #ifndef nsCodingStateMachine_h__ 8 | #define nsCodingStateMachine_h__ 9 | 10 | #include "kencodingprober.h" 11 | 12 | #include "kcodecs_export.h" 13 | 14 | #include "nsPkgInt.h" 15 | namespace kencodingprober 16 | { 17 | enum { 18 | eStart = 0, 19 | eError = 1, 20 | eItsMe = 2, 21 | }; 22 | using nsSMState = int; 23 | 24 | #define GETCLASS(c) GETFROMPCK(((unsigned char)(c)), mModel->classTable) 25 | 26 | // state machine model 27 | typedef struct { 28 | nsPkgInt classTable; 29 | unsigned int classFactor; 30 | nsPkgInt stateTable; 31 | const unsigned int *charLenTable; 32 | const char *name; 33 | } SMModel; 34 | 35 | class KCODECS_NO_EXPORT nsCodingStateMachine 36 | { 37 | public: 38 | nsCodingStateMachine(const SMModel *sm) 39 | { 40 | mCurrentState = eStart; 41 | mModel = sm; 42 | } 43 | nsSMState NextState(char c) 44 | { 45 | // for each byte we get its class; if it is the first byte of a character, we also look up the character's byte length 46 | unsigned int byteCls = GETCLASS(c); 47 | if (mCurrentState == eStart) { 48 | mCurrentBytePos = 0; 49 | mCurrentCharLen = mModel->charLenTable[byteCls]; 50 | } 51 | // from the byte's class and the stateTable, we get the next state 52 | mCurrentState = GETFROMPCK(mCurrentState * (mModel->classFactor) + byteCls, mModel->stateTable); 53 | mCurrentBytePos++; 54 | return mCurrentState; 55 | } 56 | unsigned int
GetCurrentCharLen(void) 57 | { 58 | return mCurrentCharLen; 59 | } 60 | void Reset(void) 61 | { 62 | mCurrentState = eStart; 63 | } 64 | const char *GetCodingStateMachine() 65 | { 66 | return mModel->name; 67 | } 68 | #ifdef DEBUG_PROBE 69 | const char *DumpCurrentState() 70 | { 71 | switch (mCurrentState) { 72 | case eStart: 73 | return "eStart"; 74 | case eError: 75 | return "eError"; 76 | case eItsMe: 77 | return "eItsMe"; 78 | default: 79 | return "OK"; 80 | } 81 | } 82 | #endif 83 | 84 | protected: 85 | int mCurrentState; 86 | unsigned int mCurrentCharLen; 87 | unsigned int mCurrentBytePos; 88 | 89 | const SMModel *mModel; 90 | }; 91 | 92 | extern KCODECS_NO_EXPORT const SMModel UTF8SMModel; 93 | extern KCODECS_NO_EXPORT const SMModel Big5SMModel; 94 | extern KCODECS_NO_EXPORT const SMModel EUCJPSMModel; 95 | extern KCODECS_NO_EXPORT const SMModel EUCKRSMModel; 96 | extern KCODECS_NO_EXPORT const SMModel GB18030SMModel; 97 | extern KCODECS_NO_EXPORT const SMModel SJISSMModel; 98 | extern KCODECS_NO_EXPORT const SMModel UCS2LESMModel; 99 | extern KCODECS_NO_EXPORT const SMModel UCS2BESMModel; 100 | 101 | extern KCODECS_NO_EXPORT const SMModel HZSMModel; 102 | extern KCODECS_NO_EXPORT const SMModel ISO2022CNSMModel; 103 | extern KCODECS_NO_EXPORT const SMModel ISO2022JPSMModel; 104 | extern KCODECS_NO_EXPORT const SMModel ISO2022KRSMModel; 105 | } 106 | #endif /* nsCodingStateMachine_h__ */ 107 | -------------------------------------------------------------------------------- /src/probers/nsEUCJPProber.cpp: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | SPDX-FileCopyrightText: 1998 Netscape Communications Corporation 3 | 4 | SPDX-License-Identifier: MIT 5 | */ 6 | 7 | // for japanese encoding, observe characteristic: 8 | // 1, kana character (or hankaku?) 
often have high frequency of appearance 9 | // 2, kana character often exist in group 10 | // 3, certain combination of kana is never used in japanese language 11 | 12 | #include "nsEUCJPProber.h" 13 | 14 | namespace kencodingprober 15 | { 16 | void nsEUCJPProber::Reset(void) 17 | { 18 | mCodingSM->Reset(); 19 | mState = eDetecting; 20 | mContextAnalyser.Reset(); 21 | mDistributionAnalyser.Reset(); 22 | } 23 | 24 | nsProbingState nsEUCJPProber::HandleData(const char *aBuf, unsigned int aLen) 25 | { 26 | if (aLen == 0) { 27 | return mState; 28 | } 29 | 30 | for (unsigned int i = 0; i < aLen; i++) { 31 | const nsSMState codingState = mCodingSM->NextState(aBuf[i]); 32 | if (codingState == eError) { 33 | mState = eNotMe; 34 | break; 35 | } 36 | if (codingState == eItsMe) { 37 | mState = eFoundIt; 38 | break; 39 | } 40 | if (codingState == eStart) { 41 | unsigned int charLen = mCodingSM->GetCurrentCharLen(); 42 | 43 | if (i == 0) { 44 | mLastChar[1] = aBuf[0]; 45 | mContextAnalyser.HandleOneChar(mLastChar, charLen); 46 | mDistributionAnalyser.HandleOneChar(mLastChar, charLen); 47 | } else { 48 | mContextAnalyser.HandleOneChar(aBuf + i - 1, charLen); 49 | mDistributionAnalyser.HandleOneChar(aBuf + i - 1, charLen); 50 | } 51 | } 52 | } 53 | 54 | mLastChar[0] = aBuf[aLen - 1]; 55 | 56 | if (mState == eDetecting) { 57 | if (mContextAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD) { 58 | mState = eFoundIt; 59 | } 60 | } 61 | 62 | return mState; 63 | } 64 | 65 | float nsEUCJPProber::GetConfidence(void) 66 | { 67 | float contxtCf = mContextAnalyser.GetConfidence(); 68 | float distribCf = mDistributionAnalyser.GetConfidence(); 69 | 70 | return (contxtCf > distribCf ? 
contxtCf : distribCf); 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/probers/nsEUCJPProber.h: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | SPDX-FileCopyrightText: 1998 Netscape Communications Corporation 3 | 4 | SPDX-License-Identifier: MIT 5 | */ 6 | 7 | // for japanese encoding (EUC-JP in this prober), observe characteristic: 8 | // 1, kana character (or hankaku?) often have high frequency of appearance 9 | // 2, kana character often exist in group 10 | // 3, certain combination of kana is never used in japanese language 11 | 12 | #ifndef nsEUCJPProber_h__ 13 | #define nsEUCJPProber_h__ 14 | 15 | #include "CharDistribution.h" 16 | #include "JpCntx.h" 17 | #include "nsCharSetProber.h" 18 | #include "nsCodingStateMachine.h" 19 | namespace kencodingprober 20 | { 21 | class KCODECS_NO_EXPORT nsEUCJPProber : public nsCharSetProber 22 | { 23 | public: 24 | nsEUCJPProber(void) 25 | { 26 | mCodingSM = new nsCodingStateMachine(&EUCJPSMModel); 27 | Reset(); 28 | } 29 | ~nsEUCJPProber(void) override 30 | { 31 | delete mCodingSM; 32 | } 33 | nsProbingState HandleData(const char *aBuf, unsigned int aLen) override; 34 | const char *GetCharSetName() override 35 | { 36 | return "EUC-JP"; 37 | } 38 | nsProbingState GetState(void) override 39 | { 40 | return mState; 41 | } 42 | void Reset(void) override; 43 | float GetConfidence(void) override; 44 | void SetOpion() override 45 | { 46 | } 47 | 48 | protected: 49 | nsCodingStateMachine *mCodingSM; 50 | nsProbingState mState; 51 | 52 | EUCJPContextAnalysis mContextAnalyser; 53 | EUCJPDistributionAnalysis mDistributionAnalyser; 54 | 55 | char mLastChar[2]; 56 | }; 57 | } 58 | 59 | #endif /* nsEUCJPProber_h__ */ 60 | -------------------------------------------------------------------------------- /src/probers/nsEUCKRProber.cpp: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 |
SPDX-FileCopyrightText: 1998 Netscape Communications Corporation 3 | 4 | SPDX-License-Identifier: MIT 5 | */ 6 | 7 | #include "nsEUCKRProber.h" 8 | 9 | namespace kencodingprober 10 | { 11 | void nsEUCKRProber::Reset(void) 12 | { 13 | mCodingSM->Reset(); 14 | mState = eDetecting; 15 | mDistributionAnalyser.Reset(); 16 | // mContextAnalyser.Reset(); 17 | } 18 | 19 | nsProbingState nsEUCKRProber::HandleData(const char *aBuf, unsigned int aLen) 20 | { 21 | if (aLen == 0) { 22 | return mState; 23 | } 24 | 25 | for (unsigned int i = 0; i < aLen; i++) { 26 | const nsSMState codingState = mCodingSM->NextState(aBuf[i]); 27 | if (codingState == eError) { 28 | mState = eNotMe; 29 | break; 30 | } 31 | if (codingState == eItsMe) { 32 | mState = eFoundIt; 33 | break; 34 | } 35 | if (codingState == eStart) { 36 | unsigned int charLen = mCodingSM->GetCurrentCharLen(); 37 | 38 | if (i == 0) { 39 | mLastChar[1] = aBuf[0]; 40 | mDistributionAnalyser.HandleOneChar(mLastChar, charLen); 41 | } else { 42 | mDistributionAnalyser.HandleOneChar(aBuf + i - 1, charLen); 43 | } 44 | } 45 | } 46 | 47 | mLastChar[0] = aBuf[aLen - 1]; 48 | 49 | if (mState == eDetecting) { 50 | if (mDistributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD) { 51 | mState = eFoundIt; 52 | } 53 | } 54 | // else 55 | // mDistributionAnalyser.HandleData(aBuf, aLen); 56 | 57 | return mState; 58 | } 59 | 60 | float nsEUCKRProber::GetConfidence(void) 61 | { 62 | float distribCf = mDistributionAnalyser.GetConfidence(); 63 | 64 | return (float)distribCf; 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /src/probers/nsEUCKRProber.h: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | SPDX-FileCopyrightText: 1998 Netscape Communications Corporation 3 | 4 | SPDX-License-Identifier: MIT 5 | */ 6 | 7 | #ifndef nsEUCKRProber_h__ 8 | #define nsEUCKRProber_h__ 9 | 10 | #include "CharDistribution.h" 11 | 
#include "nsCharSetProber.h" 12 | #include "nsCodingStateMachine.h" 13 | namespace kencodingprober 14 | { 15 | class KCODECS_NO_EXPORT nsEUCKRProber : public nsCharSetProber 16 | { 17 | public: 18 | nsEUCKRProber(void) 19 | { 20 | mCodingSM = new nsCodingStateMachine(&EUCKRSMModel); 21 | Reset(); 22 | } 23 | ~nsEUCKRProber(void) override 24 | { 25 | delete mCodingSM; 26 | } 27 | nsProbingState HandleData(const char *aBuf, unsigned int aLen) override; 28 | const char *GetCharSetName() override 29 | { 30 | return "EUC-KR"; 31 | } 32 | nsProbingState GetState(void) override 33 | { 34 | return mState; 35 | } 36 | void Reset(void) override; 37 | float GetConfidence(void) override; 38 | void SetOpion() override 39 | { 40 | } 41 | 42 | protected: 43 | void GetDistribution(unsigned int aCharLen, const char *aStr); 44 | 45 | nsCodingStateMachine *mCodingSM; 46 | nsProbingState mState; 47 | 48 | // EUCKRContextAnalysis mContextAnalyser; 49 | EUCKRDistributionAnalysis mDistributionAnalyser; 50 | char mLastChar[2]; 51 | }; 52 | } 53 | 54 | #endif /* nsEUCKRProber_h__ */ 55 | -------------------------------------------------------------------------------- /src/probers/nsEscCharsetProber.cpp: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | SPDX-FileCopyrightText: 1998 Netscape Communications Corporation 3 | 4 | SPDX-License-Identifier: MIT 5 | */ 6 | 7 | #include "nsEscCharsetProber.h" 8 | 9 | namespace kencodingprober 10 | { 11 | nsEscCharSetProber::nsEscCharSetProber(void) 12 | { 13 | mCodingSM[0] = new nsCodingStateMachine(&HZSMModel); 14 | mCodingSM[1] = new nsCodingStateMachine(&ISO2022CNSMModel); 15 | mCodingSM[2] = new nsCodingStateMachine(&ISO2022JPSMModel); 16 | mCodingSM[3] = new nsCodingStateMachine(&ISO2022KRSMModel); 17 | mActiveSM = NUM_OF_ESC_CHARSETS; 18 | mState = eDetecting; 19 | mDetectedCharset = nullptr; 20 | } 21 | 22 | nsEscCharSetProber::~nsEscCharSetProber(void) 23 | { 24 | for (unsigned int i = 0; 
i < NUM_OF_ESC_CHARSETS; i++) { 25 | delete mCodingSM[i]; 26 | } 27 | } 28 | 29 | void nsEscCharSetProber::Reset(void) 30 | { 31 | mState = eDetecting; 32 | for (unsigned int i = 0; i < NUM_OF_ESC_CHARSETS; i++) { 33 | mCodingSM[i]->Reset(); 34 | } 35 | mActiveSM = NUM_OF_ESC_CHARSETS; 36 | mDetectedCharset = nullptr; 37 | } 38 | 39 | nsProbingState nsEscCharSetProber::HandleData(const char *aBuf, unsigned int aLen) 40 | { 41 | nsSMState codingState; 42 | int j; 43 | unsigned int i; 44 | 45 | for (i = 0; i < aLen && mState == eDetecting; i++) { 46 | for (j = mActiveSM - 1; j >= 0; j--) { 47 | // byte is feed to all active state machine 48 | codingState = mCodingSM[j]->NextState(aBuf[i]); 49 | if (codingState == eError) { 50 | // got negative answer for this state machine, make it inactive 51 | mActiveSM--; 52 | if (mActiveSM == 0) { 53 | mState = eNotMe; 54 | return mState; 55 | } else if (j != (int)mActiveSM) { 56 | nsCodingStateMachine *t; 57 | t = mCodingSM[mActiveSM]; 58 | mCodingSM[mActiveSM] = mCodingSM[j]; 59 | mCodingSM[j] = t; 60 | } 61 | } else if (codingState == eItsMe) { 62 | mState = eFoundIt; 63 | mDetectedCharset = mCodingSM[j]->GetCodingStateMachine(); 64 | return mState; 65 | } 66 | } 67 | } 68 | 69 | return mState; 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /src/probers/nsEscCharsetProber.h: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | SPDX-FileCopyrightText: 1998 Netscape Communications Corporation 3 | 4 | SPDX-License-Identifier: MIT 5 | */ 6 | 7 | #ifndef nsEscCharSetProber_h__ 8 | #define nsEscCharSetProber_h__ 9 | 10 | #include "nsCharSetProber.h" 11 | #include "nsCodingStateMachine.h" 12 | 13 | #define NUM_OF_ESC_CHARSETS 4 14 | namespace kencodingprober 15 | { 16 | class KCODECS_NO_EXPORT nsEscCharSetProber : public nsCharSetProber 17 | { 18 | public: 19 | nsEscCharSetProber(void); 20 | ~nsEscCharSetProber(void) override; 21 | 
nsProbingState HandleData(const char *aBuf, unsigned int aLen) override; 22 | const char *GetCharSetName() override 23 | { 24 | return mDetectedCharset; 25 | } 26 | nsProbingState GetState(void) override 27 | { 28 | return mState; 29 | } 30 | void Reset(void) override; 31 | float GetConfidence(void) override 32 | { 33 | return (float)0.99; 34 | } 35 | void SetOpion() override 36 | { 37 | } 38 | 39 | protected: 40 | void GetDistribution(unsigned int aCharLen, const char *aStr); 41 | 42 | nsCodingStateMachine *mCodingSM[NUM_OF_ESC_CHARSETS]; 43 | unsigned int mActiveSM; 44 | nsProbingState mState; 45 | const char *mDetectedCharset; 46 | }; 47 | } 48 | #endif /* nsEscCharSetProber_h__ */ 49 | -------------------------------------------------------------------------------- /src/probers/nsGB2312Prober.cpp: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | SPDX-FileCopyrightText: 1998 Netscape Communications Corporation 3 | 4 | SPDX-License-Identifier: MIT 5 | */ 6 | 7 | // for S-JIS encoding, observe characteristic: 8 | // 1, kana character (or hankaku?) 
often have high frequency of appearance 9 | // 2, kana character often exist in group 10 | // 3, certain combination of kana is never used in japanese language 11 | 12 | #include "nsGB2312Prober.h" 13 | 14 | namespace kencodingprober 15 | { 16 | void nsGB18030Prober::Reset(void) 17 | { 18 | mCodingSM->Reset(); 19 | mState = eDetecting; 20 | mDistributionAnalyser.Reset(); 21 | // mContextAnalyser.Reset(); 22 | } 23 | 24 | nsProbingState nsGB18030Prober::HandleData(const char *aBuf, unsigned int aLen) 25 | { 26 | if (aLen == 0) { 27 | return mState; 28 | } 29 | 30 | for (unsigned int i = 0; i < aLen; i++) { 31 | const nsSMState codingState = mCodingSM->NextState(aBuf[i]); 32 | if (codingState == eError) { 33 | mState = eNotMe; 34 | break; 35 | } 36 | if (codingState == eItsMe) { 37 | mState = eFoundIt; 38 | break; 39 | } 40 | if (codingState == eStart) { 41 | unsigned int charLen = mCodingSM->GetCurrentCharLen(); 42 | 43 | if (i == 0) { 44 | mLastChar[1] = aBuf[0]; 45 | mDistributionAnalyser.HandleOneChar(mLastChar, charLen); 46 | } else { 47 | mDistributionAnalyser.HandleOneChar(aBuf + i - 1, charLen); 48 | } 49 | } 50 | } 51 | 52 | mLastChar[0] = aBuf[aLen - 1]; 53 | 54 | if (mState == eDetecting) { 55 | if (mDistributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD) { 56 | mState = eFoundIt; 57 | } 58 | } 59 | // else 60 | // mDistributionAnalyser.HandleData(aBuf, aLen); 61 | 62 | return mState; 63 | } 64 | 65 | float nsGB18030Prober::GetConfidence(void) 66 | { 67 | float distribCf = mDistributionAnalyser.GetConfidence(); 68 | 69 | return (float)distribCf; 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /src/probers/nsGB2312Prober.h: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | SPDX-FileCopyrightText: 1998 Netscape Communications Corporation 3 | 4 | SPDX-License-Identifier: MIT 5 | */ 6 | 7 | #ifndef nsGB2312Prober_h__ 8 | #define 
nsGB2312Prober_h__ 9 | 10 | #include "CharDistribution.h" 11 | #include "nsCharSetProber.h" 12 | #include "nsCodingStateMachine.h" 13 | 14 | // We use gb18030 to replace gb2312, because 18030 is a superset. 15 | namespace kencodingprober 16 | { 17 | class KCODECS_NO_EXPORT nsGB18030Prober : public nsCharSetProber 18 | { 19 | public: 20 | nsGB18030Prober(void) 21 | { 22 | mCodingSM = new nsCodingStateMachine(&GB18030SMModel); 23 | Reset(); 24 | } 25 | ~nsGB18030Prober(void) override 26 | { 27 | delete mCodingSM; 28 | } 29 | nsProbingState HandleData(const char *aBuf, unsigned int aLen) override; 30 | const char *GetCharSetName() override 31 | { 32 | return "gb18030"; 33 | } 34 | nsProbingState GetState(void) override 35 | { 36 | return mState; 37 | } 38 | void Reset(void) override; 39 | float GetConfidence(void) override; 40 | void SetOpion() override 41 | { 42 | } 43 | 44 | protected: 45 | void GetDistribution(unsigned int aCharLen, const char *aStr); 46 | 47 | nsCodingStateMachine *mCodingSM; 48 | nsProbingState mState; 49 | 50 | // GB2312ContextAnalysis mContextAnalyser; 51 | GB2312DistributionAnalysis mDistributionAnalyser; 52 | char mLastChar[2]; 53 | }; 54 | } 55 | 56 | #endif /* nsGB2312Prober_h__ */ 57 | -------------------------------------------------------------------------------- /src/probers/nsHebrewProber.cpp: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | SPDX-FileCopyrightText: 1998 Netscape Communications Corporation 3 | 4 | SPDX-License-Identifier: MIT 5 | */ 6 | 7 | #include "nsHebrewProber.h" 8 | #include 9 | 10 | // windows-1255 / ISO-8859-8 code points of interest 11 | #define FINAL_KAF ('\xea') 12 | #define NORMAL_KAF ('\xeb') 13 | #define FINAL_MEM ('\xed') 14 | #define NORMAL_MEM ('\xee') 15 | #define FINAL_NUN ('\xef') 16 | #define NORMAL_NUN ('\xf0') 17 | #define FINAL_PE ('\xf3') 18 | #define NORMAL_PE ('\xf4') 19 | #define FINAL_TSADI ('\xf5') 20 | #define NORMAL_TSADI ('\xf6') 21 
| 22 | // Minimum Visual vs Logical final letter score difference. 23 | // If the difference is below this, don't rely solely on the final letter score distance. 24 | #define MIN_FINAL_CHAR_DISTANCE (5) 25 | 26 | // Minimum Visual vs Logical model score difference. 27 | // If the difference is below this, don't rely at all on the model score distance. 28 | #define MIN_MODEL_DISTANCE (0.01) 29 | 30 | #define VISUAL_HEBREW_NAME ("ISO-8859-8") 31 | #define LOGICAL_HEBREW_NAME ("windows-1255") 32 | 33 | namespace kencodingprober 34 | { 35 | bool nsHebrewProber::isFinal(char c) 36 | { 37 | return ((c == FINAL_KAF) || (c == FINAL_MEM) || (c == FINAL_NUN) || (c == FINAL_PE) || (c == FINAL_TSADI)); 38 | } 39 | 40 | bool nsHebrewProber::isNonFinal(char c) 41 | { 42 | return ((c == NORMAL_KAF) || (c == NORMAL_MEM) || (c == NORMAL_NUN) || (c == NORMAL_PE)); 43 | // The normal Tsadi is not a good Non-Final letter due to words like 44 | // 'lechotet' (to chat) containing an apostrophe after the tsadi. This 45 | // apostrophe is converted to a space in FilterWithoutEnglishLetters causing 46 | // the Non-Final tsadi to appear at an end of a word even though this is not 47 | // the case in the original text. 48 | // The letters Pe and Kaf rarely display a related behavior of not being a 49 | // good Non-Final letter. Words like 'Pop', 'Winamp' and 'Mubarak' for 50 | // example legally end with a Non-Final Pe or Kaf. However, the benefit of 51 | // these letters as Non-Final letters outweighs the damage since these words 52 | // are quite rare. 53 | } 54 | 55 | /** HandleData 56 | * Final letter analysis for logical-visual decision. 57 | * Look for evidence that the received buffer is either logical Hebrew or 58 | * visual Hebrew. 59 | * The following cases are checked: 60 | * 1) A word longer than 1 letter, ending with a final letter. This is an 61 | * indication that the text is laid out "naturally" since the final letter 62 | * really appears at the end. +1 for logical score. 
63 | * 2) A word longer than 1 letter, ending with a Non-Final letter. In normal 64 | * Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi, should not end with 65 | * the Non-Final form of that letter. Exceptions to this rule are mentioned 66 | * above in isNonFinal(). This is an indication that the text is laid out 67 | * backwards. +1 for visual score 68 | * 3) A word longer than 1 letter, starting with a final letter. Final letters 69 | * should not appear at the beginning of a word. This is an indication that 70 | * the text is laid out backwards. +1 for visual score. 71 | * 72 | * The visual score and logical score are accumulated throughout the text and 73 | * are finally checked against each other in GetCharSetName(). 74 | * No checking for final letters in the middle of words is done since that case 75 | * is not an indication for either Logical or Visual text. 76 | * 77 | * The input buffer should not contain any white spaces that are not (' ') 78 | * or any low-ascii punctuation marks. 79 | */ 80 | nsProbingState nsHebrewProber::HandleData(const char *aBuf, unsigned int aLen) 81 | { 82 | // Both model probers say it's not them. No reason to continue. 
83 | if (GetState() == eNotMe) { 84 | return eNotMe; 85 | } 86 | 87 | const char *curPtr; 88 | const char *endPtr = aBuf + aLen; 89 | 90 | for (curPtr = (char *)aBuf; curPtr < endPtr; ++curPtr) { 91 | char cur = *curPtr; 92 | if (cur == ' ') { // We stand on a space - a word just ended 93 | if (mBeforePrev != ' ') { // *(curPtr-2) was not a space so prev is not a 1 letter word 94 | if (isFinal(mPrev)) { // case (1) [-2:not space][-1:final letter][cur:space] 95 | ++mFinalCharLogicalScore; 96 | } else if (isNonFinal(mPrev)) { // case (2) [-2:not space][-1:Non-Final letter][cur:space] 97 | ++mFinalCharVisualScore; 98 | } 99 | } 100 | } else { // Not standing on a space 101 | if ((mBeforePrev == ' ') && (isFinal(mPrev)) && (cur != ' ')) { // case (3) [-2:space][-1:final letter][cur:not space] 102 | ++mFinalCharVisualScore; 103 | } 104 | } 105 | mBeforePrev = mPrev; 106 | mPrev = cur; 107 | } 108 | 109 | // Forever detecting, till the end or until both model probers return eNotMe (handled above). 110 | return eDetecting; 111 | } 112 | 113 | // Make the decision: is it Logical or Visual? 114 | const char *nsHebrewProber::GetCharSetName() 115 | { 116 | // If the final letter score distance is dominant enough, rely on it. 117 | int finalsub = mFinalCharLogicalScore - mFinalCharVisualScore; 118 | if (finalsub >= MIN_FINAL_CHAR_DISTANCE) { 119 | return LOGICAL_HEBREW_NAME; 120 | } 121 | if (finalsub <= -(MIN_FINAL_CHAR_DISTANCE)) { 122 | return VISUAL_HEBREW_NAME; 123 | } 124 | 125 | // It's not dominant enough, try to rely on the model scores instead. 126 | float modelsub = mLogicalProb->GetConfidence() - mVisualProb->GetConfidence(); 127 | if (modelsub > MIN_MODEL_DISTANCE) { 128 | return LOGICAL_HEBREW_NAME; 129 | } 130 | if (modelsub < -(MIN_MODEL_DISTANCE)) { 131 | return VISUAL_HEBREW_NAME; 132 | } 133 | 134 | // Still no good, back to final letter distance, maybe it'll save the day. 
135 | if (finalsub < 0) { 136 | return VISUAL_HEBREW_NAME; 137 | } 138 | 139 | // (finalsub > 0 - Logical) or (don't know what to do) default to Logical. 140 | return LOGICAL_HEBREW_NAME; 141 | } 142 | 143 | void nsHebrewProber::Reset(void) 144 | { 145 | mFinalCharLogicalScore = 0; 146 | mFinalCharVisualScore = 0; 147 | 148 | // mPrev and mBeforePrev are initialized to space in order to simulate a word 149 | // delimiter at the beginning of the data 150 | mPrev = ' '; 151 | mBeforePrev = ' '; 152 | } 153 | 154 | nsProbingState nsHebrewProber::GetState(void) 155 | { 156 | // Remain active as long as any of the model probers are active. 157 | if ((mLogicalProb->GetState() == eNotMe) && (mVisualProb->GetState() == eNotMe)) { 158 | return eNotMe; 159 | } 160 | return eDetecting; 161 | } 162 | 163 | #ifdef DEBUG_PROBE 164 | void nsHebrewProber::DumpStatus() 165 | { 166 | printf(" HEB: %d - %d [Logical-Visual score]\r\n", mFinalCharLogicalScore, mFinalCharVisualScore); 167 | } 168 | #endif 169 | } 170 | -------------------------------------------------------------------------------- /src/probers/nsHebrewProber.h: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | SPDX-FileCopyrightText: 1998 Netscape Communications Corporation 3 | 4 | SPDX-License-Identifier: MIT 5 | */ 6 | 7 | #ifndef nsHebrewProber_h__ 8 | #define nsHebrewProber_h__ 9 | 10 | #include "nsSBCharSetProber.h" 11 | namespace kencodingprober 12 | { 13 | // This prober doesn't actually recognize a language or a charset. 
14 | // It is a helper prober for the use of the Hebrew model probers 15 | class KCODECS_NO_EXPORT nsHebrewProber : public nsCharSetProber 16 | { 17 | public: 18 | nsHebrewProber(void) 19 | : mLogicalProb(nullptr) 20 | , mVisualProb(nullptr) 21 | { 22 | Reset(); 23 | } 24 | 25 | ~nsHebrewProber(void) override 26 | { 27 | } 28 | nsProbingState HandleData(const char *aBuf, unsigned int aLen) override; 29 | const char *GetCharSetName() override; 30 | void Reset(void) override; 31 | 32 | nsProbingState GetState(void) override; 33 | 34 | float GetConfidence(void) override 35 | { 36 | return (float)0.0; 37 | } 38 | void SetOpion() override 39 | { 40 | } 41 | 42 | void SetModelProbers(nsCharSetProber *logicalPrb, nsCharSetProber *visualPrb) 43 | { 44 | mLogicalProb = logicalPrb; 45 | mVisualProb = visualPrb; 46 | } 47 | 48 | #ifdef DEBUG_PROBE 49 | void DumpStatus() override; 50 | #endif 51 | 52 | protected: 53 | static bool isFinal(char c); 54 | static bool isNonFinal(char c); 55 | 56 | int mFinalCharLogicalScore, mFinalCharVisualScore; 57 | 58 | // The two last characters seen in the previous buffer. 59 | char mPrev, mBeforePrev; 60 | 61 | // These probers are owned by the group prober. 62 | nsCharSetProber *mLogicalProb, *mVisualProb; 63 | }; 64 | } 65 | 66 | /** 67 | * ** General ideas of the Hebrew charset recognition ** 68 | * 69 | * Four main charsets exist in Hebrew: 70 | * "ISO-8859-8" - Visual Hebrew 71 | * "windows-1255" - Logical Hebrew 72 | * "ISO-8859-8-I" - Logical Hebrew 73 | * "x-mac-hebrew" - ?? Logical Hebrew ?? 74 | * 75 | * Both "ISO" charsets use a completely identical set of code points, whereas 76 | * "windows-1255" and "x-mac-hebrew" are two different proper supersets of 77 | * these code points. windows-1255 defines additional characters in the range 78 | * 0x80-0x9F as some misc punctuation marks as well as some Hebrew-specific 79 | * diacritics and additional 'Yiddish' ligature letters in the range 0xc0-0xd6. 
80 | * x-mac-hebrew defines similar additional code points but with a different 81 | * mapping. 82 | * 83 | * As far as an average Hebrew text with no diacritics is concerned, all four 84 | * charsets are identical with respect to code points. Meaning that for the 85 | * main Hebrew alphabet, all four map the same values to all 27 Hebrew letters 86 | * (including final letters). 87 | * 88 | * The dominant difference between these charsets is their directionality. 89 | * "Visual" directionality means that the text is ordered as if the renderer is 90 | * not aware of a BIDI rendering algorithm. The renderer sees the text and 91 | * draws it from left to right. The text itself when ordered naturally is read 92 | * backwards. A buffer of Visual Hebrew generally looks like so: 93 | * "[last word of first line spelled backwards] [whole line ordered backwards 94 | * and spelled backwards] [first word of first line spelled backwards] 95 | * [end of line] [last word of second line] ... etc' " 96 | * adding punctuation marks, numbers and English text to visual text is 97 | * naturally also "visual" and from left to right. 98 | * 99 | * "Logical" directionality means the text is ordered "naturally" according to 100 | * the order it is read. It is the responsibility of the renderer to display 101 | * the text from right to left. A BIDI algorithm is used to place general 102 | * punctuation marks, numbers and English text in the text. 103 | * 104 | * Texts in x-mac-hebrew are almost impossible to find on the Internet. From 105 | * what little evidence I could find, it seems that its general directionality 106 | * is Logical. 107 | * 108 | * To sum up all of the above, the Hebrew probing mechanism knows about two 109 | * charsets: 110 | * Visual Hebrew - "ISO-8859-8" - backwards text - Words and sentences are 111 | * backwards while line order is natural. 
For charset recognition purposes 112 | * the line order is unimportant (In fact, for this implementation, even 113 | * word order is unimportant). 114 | * Logical Hebrew - "windows-1255" - normal, naturally ordered text. 115 | * 116 | * "ISO-8859-8-I" is a subset of windows-1255 and doesn't need to be 117 | * specifically identified. 118 | * "x-mac-hebrew" is also identified as windows-1255. A text in x-mac-hebrew 119 | * that contains special punctuation marks or diacritics is displayed with 120 | * some unconverted characters showing as question marks. This problem might 121 | * be corrected using another model prober for x-mac-hebrew. Due to the fact 122 | * that x-mac-hebrew texts are so rare, writing another model prober isn't 123 | * worth the effort and performance hit. 124 | * 125 | * *** The Prober *** 126 | * 127 | * The prober is divided between two nsSBCharSetProbers and an nsHebrewProber, 128 | * all of which are managed, created, fed data, queried and deleted by the 129 | * nsSBCSGroupProber. The two nsSBCharSetProbers identify that the text is in 130 | * fact some kind of Hebrew, Logical or Visual. The final decision about which 131 | * one it is is made by the nsHebrewProber by combining final-letter scores 132 | * with the scores of the two nsSBCharSetProbers to produce a final answer. 133 | * 134 | * The nsSBCSGroupProber is responsible for stripping the original text of HTML 135 | * tags, English characters, numbers, low-ASCII punctuation characters, spaces 136 | * and new lines. It reduces any sequence of such characters to a single space. 137 | * The buffer fed to each prober in the SBCS group prober is pure text in 138 | * high-ASCII. 139 | * The two nsSBCharSetProbers (model probers) share the same language model: 140 | * Win1255Model. 141 | * The first nsSBCharSetProber uses the model normally as any other 142 | * nsSBCharSetProber does, to recognize windows-1255, upon which this model was 143 | * built. 
The second nsSBCharSetProber is told to make the pair-of-letter 144 | * lookup in the language model backwards. This in practice exactly simulates 145 | * a visual Hebrew model using the windows-1255 logical Hebrew model. 146 | * 147 | * The nsHebrewProber is not using any language model. All it does is look for 148 | * final-letter evidence suggesting the text is either logical Hebrew or visual 149 | * Hebrew. Detached from the model probers, the results of the nsHebrewProber 150 | * alone are meaningless. nsHebrewProber always returns 0.00 as confidence 151 | * since it never identifies a charset by itself. Instead, the pointer to the 152 | * nsHebrewProber is passed to the model probers as a helper "Name Prober". 153 | * When the Group prober receives a positive identification from any prober, 154 | * it asks for the name of the charset identified. If the prober queried is a 155 | * Hebrew model prober, the model prober forwards the call to the 156 | * nsHebrewProber to make the final decision. In the nsHebrewProber, the 157 | * decision is made according to the final-letter scores maintained and both 158 | * model probers' scores. The answer is returned in the form of the name of the 159 | * charset identified, either "windows-1255" or "ISO-8859-8". 
160 | * 161 | */ 162 | #endif /* nsHebrewProber_h__ */ 163 | -------------------------------------------------------------------------------- /src/probers/nsLatin1Prober.cpp: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | SPDX-FileCopyrightText: 1998 Netscape Communications Corporation 3 | 4 | SPDX-License-Identifier: MIT 5 | */ 6 | 7 | #include "nsLatin1Prober.h" 8 | #include 9 | #include 10 | 11 | #define UDF 0 // undefined 12 | #define OTH 1 // other 13 | #define ASC 2 // ascii capital letter 14 | #define ASS 3 // ascii small letter 15 | #define ACV 4 // accent capital vowel 16 | #define ACO 5 // accent capital other 17 | #define ASV 6 // accent small vowel 18 | #define ASO 7 // accent small other 19 | #define CLASS_NUM 8 // total classes 20 | 21 | namespace kencodingprober 22 | { 23 | static const unsigned char Latin1_CharToClass[] = { 24 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 00 - 07 25 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 08 - 0F 26 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 10 - 17 27 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 18 - 1F 28 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 20 - 27 29 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 28 - 2F 30 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 30 - 37 31 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 38 - 3F 32 | OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 40 - 47 33 | ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 48 - 4F 34 | ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 50 - 57 35 | ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH, // 58 - 5F 36 | OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 60 - 67 37 | ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 68 - 6F 38 | ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 70 - 77 39 | ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH, // 78 - 7F 40 | OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH, // 80 - 87 41 | OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF, // 88 - 8F 42 | UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 90 - 97 43 | OTH, OTH, ASO, OTH, 
ASO, UDF, ASO, ACO, // 98 - 9F 44 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // A0 - A7 45 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // A8 - AF 46 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // B0 - B7 47 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // B8 - BF 48 | ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO, // C0 - C7 49 | ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV, // C8 - CF 50 | ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH, // D0 - D7 51 | ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO, // D8 - DF 52 | ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO, // E0 - E7 53 | ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV, // E8 - EF 54 | ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH, // F0 - F7 55 | ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO, // F8 - FF 56 | }; 57 | 58 | /* 0 : illegal 59 | 1 : very unlikely 60 | 2 : normal 61 | 3 : very likely 62 | */ 63 | static const unsigned char Latin1ClassModel[] = { 64 | /* UDF OTH ASC ASS ACV ACO ASV ASO */ 65 | /*UDF*/ 0, 0, 0, 0, 0, 0, 0, 0, 66 | /*OTH*/ 0, 3, 3, 3, 3, 3, 3, 3, 67 | /*ASC*/ 0, 3, 3, 3, 3, 3, 3, 3, 68 | /*ASS*/ 0, 3, 3, 3, 1, 1, 3, 3, 69 | /*ACV*/ 0, 3, 3, 3, 1, 2, 1, 2, 70 | /*ACO*/ 0, 3, 3, 3, 3, 3, 3, 3, 71 | /*ASV*/ 0, 3, 1, 3, 1, 1, 1, 3, 72 | /*ASO*/ 0, 3, 1, 3, 1, 1, 3, 3, 73 | }; 74 | 75 | void nsLatin1Prober::Reset(void) 76 | { 77 | mState = eDetecting; 78 | mLastCharClass = OTH; 79 | for (int i = 0; i < FREQ_CAT_NUM; i++) { 80 | mFreqCounter[i] = 0; 81 | } 82 | } 83 | 84 | nsProbingState nsLatin1Prober::HandleData(const char *aBuf, unsigned int aLen) 85 | { 86 | char *newBuf1 = nullptr; 87 | unsigned int newLen1 = 0; 88 | 89 | if (!FilterWithEnglishLetters(aBuf, aLen, &newBuf1, newLen1)) { 90 | newBuf1 = (char *)aBuf; 91 | newLen1 = aLen; 92 | } 93 | 94 | for (unsigned int i = 0; i < newLen1; i++) { 95 | const unsigned char charClass = Latin1_CharToClass[(unsigned char)newBuf1[i]]; 96 | const unsigned char freq = Latin1ClassModel[mLastCharClass * CLASS_NUM + charClass]; 97 | if (freq == 0) { 98 | mState = eNotMe; 99 | break; 100 | } 101 | 
mFreqCounter[freq]++; 102 | mLastCharClass = charClass; 103 | } 104 | 105 | if (newBuf1 != aBuf) { 106 | free(newBuf1); 107 | } 108 | 109 | return mState; 110 | } 111 | 112 | float nsLatin1Prober::GetConfidence(void) 113 | { 114 | if (mState == eNotMe) { 115 | return 0.01f; 116 | } 117 | 118 | float confidence; 119 | unsigned int total = 0; 120 | for (int i = 0; i < FREQ_CAT_NUM; i++) { 121 | total += mFreqCounter[i]; 122 | } 123 | 124 | if (!total) { 125 | confidence = 0.0f; 126 | } else { 127 | confidence = mFreqCounter[3] * 1.0f / total; 128 | confidence -= mFreqCounter[1] * 20.0f / total; 129 | } 130 | 131 | if (confidence < 0.0f) { 132 | confidence = 0.0f; 133 | } 134 | 135 | // lower the confidence of latin1 so that other more accurate detector 136 | // can take priority. 137 | confidence *= 0.50f; 138 | 139 | return confidence; 140 | } 141 | 142 | #ifdef DEBUG_PROBE 143 | void nsLatin1Prober::DumpStatus() 144 | { 145 | printf(" Latin1Prober: %1.3f [%s]\r\n", GetConfidence(), GetCharSetName()); 146 | } 147 | #endif 148 | } 149 | -------------------------------------------------------------------------------- /src/probers/nsLatin1Prober.h: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | SPDX-FileCopyrightText: 1998 Netscape Communications Corporation 3 | 4 | SPDX-License-Identifier: MIT 5 | */ 6 | 7 | #ifndef nsLatin1Prober_h__ 8 | #define nsLatin1Prober_h__ 9 | 10 | #include "nsCharSetProber.h" 11 | 12 | #define FREQ_CAT_NUM 4 13 | namespace kencodingprober 14 | { 15 | class KCODECS_NO_EXPORT nsLatin1Prober : public nsCharSetProber 16 | { 17 | public: 18 | nsLatin1Prober(void) 19 | { 20 | Reset(); 21 | } 22 | ~nsLatin1Prober(void) override 23 | { 24 | } 25 | nsProbingState HandleData(const char *aBuf, unsigned int aLen) override; 26 | const char *GetCharSetName() override 27 | { 28 | return "windows-1252"; 29 | } 30 | nsProbingState GetState(void) override 31 | { 32 | return mState; 33 | } 34 | void 
Reset(void) override; 35 | float GetConfidence(void) override; 36 | void SetOpion() override 37 | { 38 | } 39 | 40 | #ifdef DEBUG_PROBE 41 | void DumpStatus() override; 42 | #endif 43 | 44 | protected: 45 | nsProbingState mState; 46 | char mLastCharClass; 47 | unsigned int mFreqCounter[FREQ_CAT_NUM]; 48 | }; 49 | } 50 | 51 | #endif /* nsLatin1Prober_h__ */ 52 | -------------------------------------------------------------------------------- /src/probers/nsMBCSGroupProber.cpp: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | SPDX-FileCopyrightText: 1998 Netscape Communications Corporation 3 | 4 | SPDX-License-Identifier: MIT 5 | */ 6 | 7 | #include "nsMBCSGroupProber.h" 8 | 9 | #include 10 | #include 11 | 12 | namespace kencodingprober 13 | { 14 | #ifdef DEBUG_PROBE 15 | static const char *const ProberName[] = { 16 | "Unicode", 17 | "SJIS", 18 | "EUCJP", 19 | "GB18030", 20 | "EUCKR", 21 | "Big5", 22 | }; 23 | 24 | #endif 25 | 26 | nsMBCSGroupProber::nsMBCSGroupProber() 27 | { 28 | mProbers[0] = new UnicodeGroupProber(); 29 | mProbers[1] = new nsSJISProber(); 30 | mProbers[2] = new nsEUCJPProber(); 31 | mProbers[3] = new nsGB18030Prober(); 32 | mProbers[4] = new nsEUCKRProber(); 33 | mProbers[5] = new nsBig5Prober(); 34 | Reset(); 35 | } 36 | 37 | nsMBCSGroupProber::~nsMBCSGroupProber() 38 | { 39 | for (unsigned int i = 0; i < NUM_OF_PROBERS; i++) { 40 | delete mProbers[i]; 41 | } 42 | } 43 | 44 | const char *nsMBCSGroupProber::GetCharSetName() 45 | { 46 | if (mBestGuess == -1) { 47 | GetConfidence(); 48 | if (mBestGuess == -1) { 49 | mBestGuess = 0; 50 | } 51 | } 52 | return mProbers[mBestGuess]->GetCharSetName(); 53 | } 54 | 55 | void nsMBCSGroupProber::Reset(void) 56 | { 57 | mActiveNum = 0; 58 | for (unsigned int i = 0; i < NUM_OF_PROBERS; i++) { 59 | if (mProbers[i]) { 60 | mProbers[i]->Reset(); 61 | mIsActive[i] = true; 62 | ++mActiveNum; 63 | } else { 64 | mIsActive[i] = false; 65 | } 66 | } 67 | 
mBestGuess = -1; 68 | mState = eDetecting; 69 | } 70 | 71 | nsProbingState nsMBCSGroupProber::HandleData(const char *aBuf, unsigned int aLen) 72 | { 73 | nsProbingState st; 74 | unsigned int i; 75 | 76 | // do filtering to reduce load to probers 77 | char *highbyteBuf; 78 | char *hptr; 79 | bool keepNext = true; // assume previous is not ascii, it will do no harm except add some noise 80 | hptr = highbyteBuf = (char *)malloc(aLen); 81 | if (!hptr) { 82 | return mState; 83 | } 84 | for (i = 0; i < aLen; ++i) { 85 | if (aBuf[i] & 0x80) { 86 | *hptr++ = aBuf[i]; 87 | keepNext = true; 88 | } else { 89 | // if previous is highbyte, keep this even it is a ASCII 90 | if (keepNext) { 91 | *hptr++ = aBuf[i]; 92 | keepNext = false; 93 | } 94 | } 95 | } 96 | 97 | for (i = 0; i < NUM_OF_PROBERS; ++i) { 98 | if (!mIsActive[i]) { 99 | continue; 100 | } 101 | st = mProbers[i]->HandleData(highbyteBuf, hptr - highbyteBuf); 102 | if (st == eFoundIt) { 103 | mBestGuess = i; 104 | mState = eFoundIt; 105 | break; 106 | } else if (st == eNotMe) { 107 | mIsActive[i] = false; 108 | mActiveNum--; 109 | if (mActiveNum == 0) { 110 | mState = eNotMe; 111 | break; 112 | } 113 | } 114 | } 115 | 116 | free(highbyteBuf); 117 | 118 | return mState; 119 | } 120 | 121 | float nsMBCSGroupProber::GetConfidence(void) 122 | { 123 | unsigned int i; 124 | float bestConf = 0.0; 125 | float cf; 126 | 127 | switch (mState) { 128 | case eFoundIt: 129 | return (float)0.99; 130 | case eNotMe: 131 | return (float)0.01; 132 | default: 133 | for (i = 0; i < NUM_OF_PROBERS; ++i) { 134 | if (!mIsActive[i]) { 135 | continue; 136 | } 137 | cf = mProbers[i]->GetConfidence(); 138 | if (bestConf < cf) { 139 | bestConf = cf; 140 | mBestGuess = i; 141 | } 142 | } 143 | } 144 | return bestConf; 145 | } 146 | 147 | #ifdef DEBUG_PROBE 148 | void nsMBCSGroupProber::DumpStatus() 149 | { 150 | unsigned int i; 151 | float cf; 152 | 153 | GetConfidence(); 154 | for (i = 0; i < NUM_OF_PROBERS; i++) { 155 | if (!mIsActive[i]) { 156 
| printf(" MBCS inactive: [%s] (confidence is too low).\r\n", ProberName[i]); 157 | } else { 158 | cf = mProbers[i]->GetConfidence(); 159 | printf(" MBCS %1.3f: [%s]\r\n", cf, ProberName[i]); 160 | } 161 | } 162 | } 163 | #endif 164 | } 165 | -------------------------------------------------------------------------------- /src/probers/nsMBCSGroupProber.h: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | SPDX-FileCopyrightText: 1998 Netscape Communications Corporation 3 | 4 | SPDX-License-Identifier: MIT 5 | */ 6 | 7 | #ifndef nsMBCSGroupProber_h__ 8 | #define nsMBCSGroupProber_h__ 9 | 10 | #include "UnicodeGroupProber.h" 11 | #include "nsBig5Prober.h" 12 | #include "nsEUCJPProber.h" 13 | #include "nsEUCKRProber.h" 14 | #include "nsGB2312Prober.h" 15 | #include "nsSJISProber.h" 16 | 17 | #define NUM_OF_PROBERS 6 18 | namespace kencodingprober 19 | { 20 | class KCODECS_NO_EXPORT nsMBCSGroupProber : public nsCharSetProber 21 | { 22 | public: 23 | nsMBCSGroupProber(); 24 | ~nsMBCSGroupProber() override; 25 | nsProbingState HandleData(const char *aBuf, unsigned int aLen) override; 26 | const char *GetCharSetName() override; 27 | nsProbingState GetState(void) override 28 | { 29 | return mState; 30 | } 31 | void Reset(void) override; 32 | float GetConfidence(void) override; 33 | void SetOpion() override 34 | { 35 | } 36 | 37 | #ifdef DEBUG_PROBE 38 | void DumpStatus() override; 39 | #endif 40 | 41 | protected: 42 | nsProbingState mState; 43 | nsCharSetProber *mProbers[NUM_OF_PROBERS]; 44 | bool mIsActive[NUM_OF_PROBERS]; 45 | int mBestGuess; 46 | unsigned int mActiveNum; 47 | }; 48 | } 49 | 50 | #endif /* nsMBCSGroupProber_h__ */ 51 | -------------------------------------------------------------------------------- /src/probers/nsPkgInt.h: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | SPDX-FileCopyrightText: 1998 Netscape Communications Corporation 3 | 4 | 
SPDX-License-Identifier: MIT 5 | */ 6 | 7 | #ifndef nsPkgInt_h__ 8 | #define nsPkgInt_h__ 9 | 10 | namespace kencodingprober 11 | { 12 | typedef enum { 13 | eIdxSft4bits = 3, 14 | eIdxSft8bits = 2, 15 | eIdxSft16bits = 1, 16 | } nsIdxSft; 17 | 18 | typedef enum { 19 | eSftMsk4bits = 7, 20 | eSftMsk8bits = 3, 21 | eSftMsk16bits = 1, 22 | } nsSftMsk; 23 | 24 | typedef enum { 25 | eBitSft4bits = 2, 26 | eBitSft8bits = 3, 27 | eBitSft16bits = 4, 28 | } nsBitSft; 29 | 30 | typedef enum { 31 | eUnitMsk4bits = 0x0000000FL, 32 | eUnitMsk8bits = 0x000000FFL, 33 | eUnitMsk16bits = 0x0000FFFFL, 34 | } nsUnitMsk; 35 | 36 | typedef struct nsPkgInt { 37 | nsIdxSft idxsft; 38 | nsSftMsk sftmsk; 39 | nsBitSft bitsft; 40 | nsUnitMsk unitmsk; 41 | const unsigned int *data; 42 | } nsPkgInt; 43 | } 44 | 45 | #define PCK16BITS(a, b) ((unsigned int)(((b) << 16) | (a))) 46 | 47 | #define PCK8BITS(a, b, c, d) PCK16BITS(((unsigned int)(((b) << 8) | (a))), ((unsigned int)(((d) << 8) | (c)))) 48 | 49 | #define PCK4BITS(a, b, c, d, e, f, g, h) \ 50 | PCK8BITS(((unsigned int)(((b) << 4) | (a))), ((unsigned int)(((d) << 4) | (c))), ((unsigned int)(((f) << 4) | (e))), ((unsigned int)(((h) << 4) | (g)))) 51 | 52 | #define GETFROMPCK(i, c) (((((c).data)[(i) >> (c).idxsft]) >> (((i) & (c).sftmsk) << (c).bitsft)) & (c).unitmsk) 53 | 54 | #endif /* nsPkgInt_h__ */ 55 | -------------------------------------------------------------------------------- /src/probers/nsSBCSGroupProber.cpp: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | SPDX-FileCopyrightText: 1998 Netscape Communications Corporation 3 | 4 | SPDX-License-Identifier: MIT 5 | */ 6 | 7 | #include "nsSBCSGroupProber.h" 8 | 9 | #include "UnicodeGroupProber.h" 10 | #include "nsHebrewProber.h" 11 | #include "nsSBCharSetProber.h" 12 | 13 | #include 14 | #include 15 | 16 | namespace kencodingprober 17 | { 18 | nsSBCSGroupProber::nsSBCSGroupProber() 19 | { 20 | mProbers[0] = new 
nsSingleByteCharSetProber(&Win1251Model); 21 | mProbers[1] = new nsSingleByteCharSetProber(&Koi8rModel); 22 | mProbers[2] = new nsSingleByteCharSetProber(&Latin5Model); 23 | mProbers[3] = new nsSingleByteCharSetProber(&MacCyrillicModel); 24 | mProbers[4] = new nsSingleByteCharSetProber(&Ibm866Model); 25 | mProbers[5] = new nsSingleByteCharSetProber(&Ibm855Model); 26 | mProbers[6] = new nsSingleByteCharSetProber(&Latin7Model); 27 | mProbers[7] = new nsSingleByteCharSetProber(&Win1253Model); 28 | mProbers[8] = new nsSingleByteCharSetProber(&Latin5BulgarianModel); 29 | mProbers[9] = new nsSingleByteCharSetProber(&Win1251BulgarianModel); 30 | 31 | nsHebrewProber *hebprober = new nsHebrewProber(); 32 | // Notice: Any change in these indexes - 10,11,12 must be reflected 33 | // in the code below as well. 34 | mProbers[10] = hebprober; 35 | mProbers[11] = new nsSingleByteCharSetProber(&Win1255Model, false, hebprober); // Logical Hebrew 36 | mProbers[12] = new nsSingleByteCharSetProber(&Win1255Model, true, hebprober); // Visual Hebrew 37 | mProbers[13] = new UnicodeGroupProber(); 38 | 39 | // Tell the Hebrew prober about the logical and visual probers 40 | if (mProbers[10] && mProbers[11] && mProbers[12]) { // all are not null 41 | hebprober->SetModelProbers(mProbers[11], mProbers[12]); 42 | } else { // One or more is null. avoid any Hebrew probing, null them all 43 | for (unsigned int i = 10; i <= 12; ++i) { 44 | delete mProbers[i]; 45 | mProbers[i] = nullptr; 46 | } 47 | } 48 | 49 | // disable latin2 before latin1 is available, otherwise all latin1 50 | // will be detected as latin2 because of their similarity. 
51 | // mProbers[10] = new nsSingleByteCharSetProber(&Latin2HungarianModel); 52 | // mProbers[11] = new nsSingleByteCharSetProber(&Win1250HungarianModel); 53 | 54 | Reset(); 55 | } 56 | 57 | nsSBCSGroupProber::~nsSBCSGroupProber() 58 | { 59 | for (unsigned int i = 0; i < NUM_OF_SBCS_PROBERS; i++) { 60 | delete mProbers[i]; 61 | } 62 | } 63 | 64 | const char *nsSBCSGroupProber::GetCharSetName() 65 | { 66 | // if we have no answer yet 67 | if (mBestGuess == -1) { 68 | GetConfidence(); 69 | // no charset seems positive 70 | if (mBestGuess == -1) 71 | // we will use default. 72 | { 73 | mBestGuess = 0; 74 | } 75 | } 76 | return mProbers[mBestGuess]->GetCharSetName(); 77 | } 78 | 79 | void nsSBCSGroupProber::Reset(void) 80 | { 81 | mActiveNum = 0; 82 | for (unsigned int i = 0; i < NUM_OF_SBCS_PROBERS; i++) { 83 | if (mProbers[i]) { // not null 84 | mProbers[i]->Reset(); 85 | mIsActive[i] = true; 86 | ++mActiveNum; 87 | } else { 88 | mIsActive[i] = false; 89 | } 90 | } 91 | mBestGuess = -1; 92 | mState = eDetecting; 93 | } 94 | 95 | nsProbingState nsSBCSGroupProber::HandleData(const char *aBuf, unsigned int aLen) 96 | { 97 | nsProbingState st; 98 | unsigned int i; 99 | char *newBuf1 = nullptr; 100 | unsigned int newLen1 = 0; 101 | 102 | // apply filter to original buffer, and we got new buffer back 103 | // depend on what script it is, we will feed them the new buffer 104 | // we got after applying proper filter 105 | // this is done without any consideration to KeepEnglishLetters 106 | // of each prober since as of now, there are no probers here which 107 | // recognize languages with English characters. 108 | if (!FilterWithoutEnglishLetters(aBuf, aLen, &newBuf1, newLen1)) { 109 | goto done; 110 | } 111 | 112 | if (newLen1 == 0) { 113 | goto done; // Nothing to see here, move on. 
114 | } 115 | 116 | for (i = 0; i < NUM_OF_SBCS_PROBERS; ++i) { 117 | if (!mIsActive[i]) { 118 | continue; 119 | } 120 | st = mProbers[i]->HandleData(newBuf1, newLen1); 121 | if (st == eFoundIt) { 122 | mBestGuess = i; 123 | mState = eFoundIt; 124 | break; 125 | } else if (st == eNotMe) { 126 | mIsActive[i] = false; 127 | mActiveNum--; 128 | if (mActiveNum == 0) { 129 | mState = eNotMe; 130 | break; 131 | } 132 | } 133 | } 134 | 135 | done: 136 | free(newBuf1); 137 | 138 | return mState; 139 | } 140 | 141 | float nsSBCSGroupProber::GetConfidence(void) 142 | { 143 | unsigned int i; 144 | float bestConf = 0.0; 145 | float cf; 146 | 147 | switch (mState) { 148 | case eFoundIt: 149 | return (float)0.99; // sure yes 150 | case eNotMe: 151 | return (float)0.01; // sure no 152 | default: 153 | for (i = 0; i < NUM_OF_SBCS_PROBERS; ++i) { 154 | if (!mIsActive[i]) { 155 | continue; 156 | } 157 | cf = mProbers[i]->GetConfidence(); 158 | if (bestConf < cf) { 159 | bestConf = cf; 160 | mBestGuess = i; 161 | } 162 | } 163 | } 164 | return bestConf; 165 | } 166 | 167 | #ifdef DEBUG_PROBE 168 | void nsSBCSGroupProber::DumpStatus() 169 | { 170 | unsigned int i; 171 | float cf; 172 | 173 | cf = GetConfidence(); 174 | printf(" SBCS Group Prober --------begin status \r\n"); 175 | for (i = 0; i < NUM_OF_SBCS_PROBERS; i++) { 176 | if (!mIsActive[i]) { 177 | printf(" inactive: [%s] (i.e. confidence is too low).\r\n", mProbers[i]->GetCharSetName()); 178 | } else { 179 | mProbers[i]->DumpStatus(); 180 | } 181 | } 182 | printf(" SBCS Group found best match [%s] confidence %f.\r\n", mProbers[mBestGuess]->GetCharSetName(), cf); 183 | } 184 | #endif 185 | } 186 | -------------------------------------------------------------------------------- /src/probers/nsSBCSGroupProber.h: -------------------------------------------------------------------------------- 1 | /* 2 | The Original Code is Mozilla Universal charset detector code. 
3 | 4 | SPDX-FileCopyrightText: 2001 Netscape Communications Corporation 5 | SPDX-FileContributor: Shy Shalom 6 | 7 | SPDX-License-Identifier: MPL-1.1 OR GPL-2.0-or-later OR LGPL-2.1-or-later 8 | */ 9 | 10 | #ifndef nsSBCSGroupProber_h__ 11 | #define nsSBCSGroupProber_h__ 12 | 13 | #include "nsCharSetProber.h" 14 | 15 | #define NUM_OF_SBCS_PROBERS 14 16 | 17 | namespace kencodingprober 18 | { 19 | class KCODECS_NO_EXPORT nsSBCSGroupProber : public nsCharSetProber 20 | { 21 | public: 22 | nsSBCSGroupProber(); 23 | ~nsSBCSGroupProber() override; 24 | nsProbingState HandleData(const char *aBuf, unsigned int aLen) override; 25 | const char *GetCharSetName() override; 26 | nsProbingState GetState(void) override 27 | { 28 | return mState; 29 | } 30 | void Reset(void) override; 31 | float GetConfidence(void) override; 32 | void SetOpion() override 33 | { 34 | } 35 | 36 | #ifdef DEBUG_PROBE 37 | void DumpStatus() override; 38 | #endif 39 | 40 | protected: 41 | nsProbingState mState; 42 | nsCharSetProber *mProbers[NUM_OF_SBCS_PROBERS]; 43 | bool mIsActive[NUM_OF_SBCS_PROBERS]; 44 | int mBestGuess; 45 | unsigned int mActiveNum; 46 | }; 47 | } 48 | 49 | #endif /* nsSBCSGroupProber_h__ */ 50 | -------------------------------------------------------------------------------- /src/probers/nsSBCharSetProber.cpp: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | SPDX-FileCopyrightText: 1998 Netscape Communications Corporation 3 | 4 | SPDX-License-Identifier: MIT 5 | */ 6 | 7 | #include "nsSBCharSetProber.h" 8 | 9 | #include 10 | 11 | namespace kencodingprober 12 | { 13 | nsProbingState nsSingleByteCharSetProber::HandleData(const char *aBuf, unsigned int aLen) 14 | { 15 | for (unsigned int i = 0; i < aLen; i++) { 16 | const unsigned char order = mModel->charToOrderMap[(unsigned char)aBuf[i]]; 17 | 18 | if (order < SYMBOL_CAT_ORDER) { 19 | mTotalChar++; 20 | } 21 | if (order < SAMPLE_SIZE) { 22 | mFreqChar++; 23 | 24 | if 
(mLastOrder < SAMPLE_SIZE) { 25 | mTotalSeqs++; 26 | if (!mReversed) { 27 | ++(mSeqCounters[(int)mModel->precedenceMatrix[mLastOrder * SAMPLE_SIZE + order]]); 28 | } else { // reverse the order of the letters in the lookup 29 | ++(mSeqCounters[(int)mModel->precedenceMatrix[order * SAMPLE_SIZE + mLastOrder]]); 30 | } 31 | } 32 | } 33 | mLastOrder = order; 34 | } 35 | 36 | if (mState == eDetecting) { 37 | if (mTotalSeqs > SB_ENOUGH_REL_THRESHOLD) { 38 | float cf = GetConfidence(); 39 | if (cf > POSITIVE_SHORTCUT_THRESHOLD) { 40 | mState = eFoundIt; 41 | } else if (cf < NEGATIVE_SHORTCUT_THRESHOLD) { 42 | mState = eNotMe; 43 | } 44 | } 45 | } 46 | 47 | return mState; 48 | } 49 | 50 | void nsSingleByteCharSetProber::Reset(void) 51 | { 52 | mState = eDetecting; 53 | mLastOrder = 255; 54 | for (unsigned int i = 0; i < NUMBER_OF_SEQ_CAT; i++) { 55 | mSeqCounters[i] = 0; 56 | } 57 | mTotalSeqs = 0; 58 | mTotalChar = 0; 59 | mFreqChar = 0; 60 | } 61 | 62 | //#define NEGATIVE_APPROACH 1 63 | 64 | float nsSingleByteCharSetProber::GetConfidence(void) 65 | { 66 | #ifdef NEGATIVE_APPROACH 67 | if (mTotalSeqs > 0) 68 | if (mTotalSeqs > mSeqCounters[NEGATIVE_CAT] * 10) { 69 | return ((float)(mTotalSeqs - mSeqCounters[NEGATIVE_CAT] * 10)) / mTotalSeqs * mFreqChar / mTotalChar; 70 | } 71 | return (float)0.01; 72 | #else // POSITIVE_APPROACH 73 | float r; 74 | 75 | if (mTotalSeqs > 0) { 76 | r = ((float)1.0) * mSeqCounters[POSITIVE_CAT] / mTotalSeqs / mModel->mTypicalPositiveRatio; 77 | r = r * mFreqChar / mTotalChar; 78 | if (r >= (float)1.00) { 79 | r = (float)0.99; 80 | } 81 | return r; 82 | } 83 | return (float)0.01; 84 | #endif 85 | } 86 | 87 | const char *nsSingleByteCharSetProber::GetCharSetName() 88 | { 89 | if (!mNameProber) { 90 | return mModel->charsetName; 91 | } 92 | return mNameProber->GetCharSetName(); 93 | } 94 | 95 | #ifdef DEBUG_PROBE 96 | void nsSingleByteCharSetProber::DumpStatus() 97 | { 98 | printf(" SBCS: %1.3f [%s]\r\n", GetConfidence(), GetCharSetName()); 99 
| } 100 | #endif 101 | } 102 | -------------------------------------------------------------------------------- /src/probers/nsSBCharSetProber.h: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | SPDX-FileCopyrightText: 1998 Netscape Communications Corporation 3 | 4 | SPDX-License-Identifier: MIT 5 | */ 6 | 7 | #ifndef NSSBCHARSETPROBER_H 8 | #define NSSBCHARSETPROBER_H 9 | 10 | #include "nsCharSetProber.h" 11 | 12 | #define SAMPLE_SIZE 64 13 | #define SB_ENOUGH_REL_THRESHOLD 1024 14 | #define POSITIVE_SHORTCUT_THRESHOLD (float)0.95 15 | #define NEGATIVE_SHORTCUT_THRESHOLD (float)0.05 16 | #define SYMBOL_CAT_ORDER 250 17 | #define NUMBER_OF_SEQ_CAT 4 18 | #define POSITIVE_CAT (NUMBER_OF_SEQ_CAT - 1) 19 | #define NEGATIVE_CAT 0 20 | 21 | namespace kencodingprober 22 | { 23 | typedef struct { 24 | const unsigned char *charToOrderMap; // [256] table use to find a char's order 25 | const char *precedenceMatrix; // [SAMPLE_SIZE][SAMPLE_SIZE]; table to find a 2-char sequence's frequency 26 | float mTypicalPositiveRatio; // = freqSeqs / totalSeqs 27 | bool keepEnglishLetter; // says if this script contains English characters (not implemented) 28 | const char *charsetName; 29 | } SequenceModel; 30 | 31 | class KCODECS_NO_EXPORT nsSingleByteCharSetProber : public nsCharSetProber 32 | { 33 | public: 34 | explicit nsSingleByteCharSetProber(const SequenceModel *model) 35 | : mModel(model) 36 | , mReversed(false) 37 | , mNameProber(nullptr) 38 | { 39 | Reset(); 40 | } 41 | nsSingleByteCharSetProber(const SequenceModel *model, bool reversed, nsCharSetProber *nameProber) 42 | : mModel(model) 43 | , mReversed(reversed) 44 | , mNameProber(nameProber) 45 | { 46 | Reset(); 47 | } 48 | 49 | const char *GetCharSetName() override; 50 | nsProbingState HandleData(const char *aBuf, unsigned int aLen) override; 51 | nsProbingState GetState(void) override 52 | { 53 | return mState; 54 | } 55 | void Reset(void) override; 56 | float 
GetConfidence(void) override; 57 | void SetOpion() override 58 | { 59 | } 60 | 61 | // This feature is not implemented yet. any current language model 62 | // contain this parameter as false. No one is looking at this 63 | // parameter or calling this method. 64 | // Moreover, the nsSBCSGroupProber which calls the HandleData of this 65 | // prober has a hard-coded call to FilterWithoutEnglishLetters which gets rid 66 | // of the English letters. 67 | bool KeepEnglishLetters() 68 | { 69 | return mModel->keepEnglishLetter; 70 | } // (not implemented) 71 | 72 | #ifdef DEBUG_PROBE 73 | void DumpStatus() override; 74 | #endif 75 | 76 | protected: 77 | nsProbingState mState; 78 | const SequenceModel *mModel; 79 | const bool mReversed; // true if we need to reverse every pair in the model lookup 80 | 81 | // char order of last character 82 | unsigned char mLastOrder; 83 | 84 | unsigned int mTotalSeqs; 85 | unsigned int mSeqCounters[NUMBER_OF_SEQ_CAT]; 86 | 87 | unsigned int mTotalChar; 88 | // characters that fall in our sampling range 89 | unsigned int mFreqChar; 90 | 91 | // Optional auxiliary prober for name decision. 
created and destroyed by the GroupProber 92 | nsCharSetProber *mNameProber; 93 | }; 94 | 95 | extern const SequenceModel Koi8rModel; 96 | extern const SequenceModel Win1251Model; 97 | extern const SequenceModel Latin5Model; 98 | extern const SequenceModel MacCyrillicModel; 99 | extern const SequenceModel Ibm866Model; 100 | extern const SequenceModel Ibm855Model; 101 | extern const SequenceModel Latin7Model; 102 | extern const SequenceModel Win1253Model; 103 | extern const SequenceModel Latin5BulgarianModel; 104 | extern const SequenceModel Win1251BulgarianModel; 105 | extern const SequenceModel Latin2HungarianModel; 106 | extern const SequenceModel Win1250HungarianModel; 107 | extern const SequenceModel Win1255Model; 108 | } 109 | #endif /* NSSBCHARSETPROBER_H */ 110 | -------------------------------------------------------------------------------- /src/probers/nsSJISProber.cpp: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | SPDX-FileCopyrightText: 1998 Netscape Communications Corporation 3 | 4 | SPDX-License-Identifier: MIT 5 | */ 6 | 7 | // for S-JIS encoding, observe characteristic: 8 | // 1, kana character (or hankaku?) 
often have high frequency of appearance 9 | // 2, kana character often exist in group 10 | // 3, certain combination of kana is never used in japanese language 11 | 12 | #include "nsSJISProber.h" 13 | 14 | namespace kencodingprober 15 | { 16 | void nsSJISProber::Reset(void) 17 | { 18 | mCodingSM->Reset(); 19 | mState = eDetecting; 20 | mContextAnalyser.Reset(); 21 | mDistributionAnalyser.Reset(); 22 | } 23 | 24 | nsProbingState nsSJISProber::HandleData(const char *aBuf, unsigned int aLen) 25 | { 26 | if (aLen == 0) { 27 | return mState; 28 | } 29 | 30 | for (unsigned int i = 0; i < aLen; i++) { 31 | const nsSMState codingState = mCodingSM->NextState(aBuf[i]); 32 | if (codingState == eError) { 33 | mState = eNotMe; 34 | break; 35 | } 36 | if (codingState == eItsMe) { 37 | mState = eFoundIt; 38 | break; 39 | } 40 | if (codingState == eStart) { 41 | unsigned int charLen = mCodingSM->GetCurrentCharLen(); 42 | if (i == 0) { 43 | mLastChar[1] = aBuf[0]; 44 | mContextAnalyser.HandleOneChar(mLastChar + 2 - charLen, charLen); 45 | mDistributionAnalyser.HandleOneChar(mLastChar, charLen); 46 | } else { 47 | mContextAnalyser.HandleOneChar(aBuf + i + 1 - charLen, charLen); 48 | mDistributionAnalyser.HandleOneChar(aBuf + i - 1, charLen); 49 | } 50 | } 51 | } 52 | 53 | mLastChar[0] = aBuf[aLen - 1]; 54 | 55 | if (mState == eDetecting) { 56 | if (mContextAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD) { 57 | mState = eFoundIt; 58 | } 59 | } 60 | 61 | return mState; 62 | } 63 | 64 | float nsSJISProber::GetConfidence(void) 65 | { 66 | float contxtCf = mContextAnalyser.GetConfidence(); 67 | float distribCf = mDistributionAnalyser.GetConfidence(); 68 | 69 | return (contxtCf > distribCf ? contxtCf : distribCf); 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /src/probers/nsSJISProber.h: -------------------------------------------------------------------------------- 1 | /* 2 | The Original Code is mozilla.org code. 
3 | 4 | SPDX-FileCopyrightText: 1998 Netscape Communications Corporation 5 | 6 | SPDX-License-Identifier: MPL-1.1 OR GPL-2.0-or-later OR LGPL-2.1-or-later 7 | */ 8 | 9 | // for S-JIS encoding, observe characteristic: 10 | // 1, kana character (or hankaku?) often have high frequency of appearance 11 | // 2, kana character often exist in group 12 | // 3, certain combination of kana is never used in japanese language 13 | 14 | #ifndef nsSJISProber_h__ 15 | #define nsSJISProber_h__ 16 | 17 | #include "CharDistribution.h" 18 | #include "JpCntx.h" 19 | #include "nsCharSetProber.h" 20 | #include "nsCodingStateMachine.h" 21 | 22 | namespace kencodingprober 23 | { 24 | class KCODECS_NO_EXPORT nsSJISProber : public nsCharSetProber 25 | { 26 | public: 27 | nsSJISProber(void) 28 | { 29 | mCodingSM = new nsCodingStateMachine(&SJISSMModel); 30 | Reset(); 31 | } 32 | ~nsSJISProber(void) override 33 | { 34 | delete mCodingSM; 35 | } 36 | nsProbingState HandleData(const char *aBuf, unsigned int aLen) override; 37 | const char *GetCharSetName() override 38 | { 39 | return "Shift_JIS"; 40 | } 41 | nsProbingState GetState(void) override 42 | { 43 | return mState; 44 | } 45 | void Reset(void) override; 46 | float GetConfidence(void) override; 47 | void SetOpion() override 48 | { 49 | } 50 | 51 | protected: 52 | nsCodingStateMachine *mCodingSM; 53 | nsProbingState mState; 54 | 55 | SJISContextAnalysis mContextAnalyser; 56 | SJISDistributionAnalysis mDistributionAnalyser; 57 | 58 | char mLastChar[2]; 59 | }; 60 | } 61 | 62 | #endif /* nsSJISProber_h__ */ 63 | -------------------------------------------------------------------------------- /src/probers/nsUniversalDetector.cpp: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | SPDX-FileCopyrightText: 1998 Netscape Communications Corporation 3 | SPDX-FileCopyrightText: 2008 Wang Kai 4 | 5 | SPDX-License-Identifier: MIT 6 | */ 7 | 8 | #include "nsUniversalDetector.h" 9 | 10 | #include 
"nsEscCharsetProber.h" 11 | #include "nsLatin1Prober.h" 12 | #include "nsMBCSGroupProber.h" 13 | #include "nsSBCSGroupProber.h" 14 | 15 | namespace kencodingprober 16 | { 17 | nsUniversalDetector::nsUniversalDetector() 18 | { 19 | mDone = false; 20 | mBestGuess = -1; // illegal value as signal 21 | mInTag = false; 22 | mEscCharSetProber = nullptr; 23 | 24 | mStart = true; 25 | mDetectedCharset = nullptr; 26 | mGotData = false; 27 | mInputState = ePureAscii; 28 | mLastChar = '\0'; 29 | 30 | unsigned int i; 31 | for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) { 32 | mCharSetProbers[i] = nullptr; 33 | } 34 | } 35 | 36 | nsUniversalDetector::~nsUniversalDetector() 37 | { 38 | for (int i = 0; i < NUM_OF_CHARSET_PROBERS; i++) { 39 | delete mCharSetProbers[i]; 40 | } 41 | delete mEscCharSetProber; 42 | } 43 | 44 | void nsUniversalDetector::Reset() 45 | { 46 | mDone = false; 47 | mBestGuess = -1; // illegal value as signal 48 | mInTag = false; 49 | 50 | mStart = true; 51 | mDetectedCharset = nullptr; 52 | mGotData = false; 53 | mInputState = ePureAscii; 54 | mLastChar = '\0'; 55 | 56 | if (mEscCharSetProber) { 57 | mEscCharSetProber->Reset(); 58 | } 59 | 60 | unsigned int i; 61 | for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) { 62 | if (mCharSetProbers[i]) { 63 | mCharSetProbers[i]->Reset(); 64 | } 65 | } 66 | } 67 | 68 | //--------------------------------------------------------------------- 69 | #define SHORTCUT_THRESHOLD (float)0.95 70 | #define MINIMUM_THRESHOLD (float)0.20 71 | 72 | nsProbingState nsUniversalDetector::HandleData(const char *aBuf, unsigned int aLen) 73 | { 74 | if (mDone) { 75 | return eFoundIt; 76 | } 77 | 78 | if (aLen > 0) { 79 | mGotData = true; 80 | } 81 | 82 | unsigned int i; 83 | for (i = 0; i < aLen; i++) { 84 | // other than 0xa0, if every other character is ascii, the page is ascii 85 | if (aBuf[i] & '\x80' && aBuf[i] != '\xA0') { // Since many Ascii only page contains NBSP 86 | // we got a non-ascii byte (high-byte) 87 | if (mInputState != 
eHighbyte) { 88 | // adjust state 89 | mInputState = eHighbyte; 90 | 91 | // kill mEscCharSetProber if it is active 92 | delete mEscCharSetProber; 93 | mEscCharSetProber = nullptr; 94 | 95 | // start multibyte and singlebyte charset prober 96 | if (nullptr == mCharSetProbers[0]) { 97 | mCharSetProbers[0] = new nsMBCSGroupProber; 98 | } 99 | if (nullptr == mCharSetProbers[1]) { 100 | mCharSetProbers[1] = new nsSBCSGroupProber; 101 | } 102 | if (nullptr == mCharSetProbers[2]) { 103 | mCharSetProbers[2] = new nsLatin1Prober; 104 | } 105 | } 106 | } else { 107 | // ok, just pure ascii so far 108 | if (ePureAscii == mInputState && (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~'))) { 109 | // found escape character or HZ "~{" 110 | mInputState = eEscAscii; 111 | } 112 | 113 | mLastChar = aBuf[i]; 114 | } 115 | } 116 | 117 | nsProbingState st = eDetecting; 118 | switch (mInputState) { 119 | case eEscAscii: 120 | if (nullptr == mEscCharSetProber) { 121 | mEscCharSetProber = new nsEscCharSetProber; 122 | } 123 | st = mEscCharSetProber->HandleData(aBuf, aLen); 124 | if (st == eFoundIt) { 125 | mDone = true; 126 | mDetectedCharset = mEscCharSetProber->GetCharSetName(); 127 | } 128 | break; 129 | case eHighbyte: 130 | for (i = 0; i < NUM_OF_CHARSET_PROBERS; ++i) { 131 | st = mCharSetProbers[i]->HandleData(aBuf, aLen); 132 | if (st == eFoundIt) { 133 | mDone = true; 134 | mDetectedCharset = mCharSetProbers[i]->GetCharSetName(); 135 | } 136 | } 137 | break; 138 | 139 | default: // pure ascii 140 | mDetectedCharset = "UTF-8"; 141 | } 142 | return st; 143 | } 144 | 145 | //--------------------------------------------------------------------- 146 | const char *nsUniversalDetector::GetCharSetName() 147 | { 148 | if (mDetectedCharset) { 149 | return mDetectedCharset; 150 | } 151 | switch (mInputState) { 152 | case eHighbyte: { 153 | float proberConfidence; 154 | float maxProberConfidence = (float)0.0; 155 | int maxProber = 0; 156 | 157 | for (int i = 0; i < 
NUM_OF_CHARSET_PROBERS; i++) { 158 | proberConfidence = mCharSetProbers[i]->GetConfidence(); 159 | if (proberConfidence > maxProberConfidence) { 160 | maxProberConfidence = proberConfidence; 161 | maxProber = i; 162 | } 163 | } 164 | // do not report anything because we are not confident of it, that's in fact a negative answer 165 | if (maxProberConfidence > MINIMUM_THRESHOLD) { 166 | return mCharSetProbers[maxProber]->GetCharSetName(); 167 | } 168 | } 169 | case eEscAscii: 170 | break; 171 | default: // pure ascii 172 | ; 173 | } 174 | return "UTF-8"; 175 | } 176 | 177 | //--------------------------------------------------------------------- 178 | float nsUniversalDetector::GetConfidence() 179 | { 180 | if (!mGotData) { 181 | // we haven't got any data yet, return immediately 182 | // caller program sometimes call DataEnd before anything has been sent to detector 183 | return MINIMUM_THRESHOLD; 184 | } 185 | if (mDetectedCharset) { 186 | return 0.99f; 187 | } 188 | switch (mInputState) { 189 | case eHighbyte: { 190 | float proberConfidence; 191 | float maxProberConfidence = (float)0.0; 192 | int maxProber = 0; 193 | 194 | for (int i = 0; i < NUM_OF_CHARSET_PROBERS; i++) { 195 | proberConfidence = mCharSetProbers[i]->GetConfidence(); 196 | if (proberConfidence > maxProberConfidence) { 197 | maxProberConfidence = proberConfidence; 198 | maxProber = i; 199 | } 200 | } 201 | // do not report anything because we are not confident of it, that's in fact a negative answer 202 | if (maxProberConfidence > MINIMUM_THRESHOLD) { 203 | return mCharSetProbers[maxProber]->GetConfidence(); 204 | } 205 | } 206 | case eEscAscii: 207 | break; 208 | default: // pure ascii 209 | ; 210 | } 211 | return MINIMUM_THRESHOLD; 212 | } 213 | 214 | nsProbingState nsUniversalDetector::GetState() 215 | { 216 | if (mDone) { 217 | return eFoundIt; 218 | } else { 219 | return eDetecting; 220 | } 221 | } 222 | } 223 | -------------------------------------------------------------------------------- 
/src/probers/nsUniversalDetector.h: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | SPDX-FileCopyrightText: 1998 Netscape Communications Corporation 3 | SPDX-FileCopyrightText: 2008 Wang Kai 4 | 5 | SPDX-License-Identifier: MIT 6 | */ 7 | 8 | #ifndef nsUniversalDetector_h__ 9 | #define nsUniversalDetector_h__ 10 | 11 | #include "nsCharSetProber.h" 12 | 13 | #define NUM_OF_CHARSET_PROBERS 3 14 | 15 | namespace kencodingprober 16 | { 17 | typedef enum { 18 | ePureAscii = 0, 19 | eEscAscii = 1, 20 | eHighbyte = 2, 21 | } nsInputState; 22 | 23 | class KCODECS_NO_EXPORT nsUniversalDetector : public nsCharSetProber 24 | { 25 | public: 26 | nsUniversalDetector(); 27 | ~nsUniversalDetector() override; 28 | nsProbingState HandleData(const char *aBuf, unsigned int aLen) override; 29 | const char *GetCharSetName() override; 30 | void Reset(void) override; 31 | float GetConfidence(void) override; 32 | nsProbingState GetState() override; 33 | void SetOpion() override 34 | { 35 | } 36 | 37 | protected: 38 | nsInputState mInputState; 39 | bool mDone; 40 | bool mInTag; 41 | bool mStart; 42 | bool mGotData; 43 | char mLastChar; 44 | const char *mDetectedCharset; 45 | int mBestGuess; 46 | 47 | nsCharSetProber *mCharSetProbers[NUM_OF_CHARSET_PROBERS]; 48 | nsCharSetProber *mEscCharSetProber; 49 | }; 50 | } 51 | 52 | #endif 53 | --------------------------------------------------------------------------------