├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── LICENSE ├── README.md ├── src ├── 3rd_party │ └── utf8 │ │ ├── doc │ │ ├── ReleaseNotes │ │ └── utf8cpp.html │ │ └── source │ │ ├── utf8.h │ │ └── utf8 │ │ ├── checked.h │ │ ├── core.h │ │ └── unchecked.h ├── mono │ ├── buffered_map.cpp │ ├── buffered_map.h │ ├── filters │ │ ├── header.h │ │ ├── langcollectorfilter.cpp │ │ ├── langcollectorfilter.h │ │ ├── langsplitfilter.cpp │ │ ├── langsplitfilter.h │ │ ├── string_util.h │ │ ├── warcfilter.cpp │ │ └── warcfilter.h │ ├── language_sink.cpp │ ├── language_sink.h │ ├── mono.cpp │ ├── worker.cpp │ └── worker.h └── utils │ ├── common.cpp │ ├── common.h │ ├── compression.cpp │ ├── compression.h │ ├── curldownloader.cpp │ ├── curldownloader.h │ ├── logging.cpp │ └── logging.h └── tests ├── CMakeLists.txt ├── data_integration ├── test1.in.gz ├── test1_en.out ├── test1_ja.out ├── test2.in.gz ├── test2_de.out ├── test2_ru.out ├── test3.in └── test3_en.out ├── data_langsplit ├── test1.in ├── test1.out └── test1_langstats.out ├── data_readerwarc ├── test1.in ├── test1.out ├── test2.in ├── test2.out ├── test3.in ├── test3.out ├── test4.in ├── test4.out ├── test5.in └── test5.out ├── test_integration.cpp ├── test_langsplit.cpp ├── test_readerwarc.cpp └── test_utils.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Compiled Object files 5 | *.slo 6 | *.lo 7 | *.o 8 | *.obj 9 | 10 | # Precompiled Headers 11 | *.gch 12 | *.pch 13 | 14 | # Compiled Dynamic libraries 15 | *.so 16 | *.dylib 17 | *.dll 18 | 19 | # Fortran module files 20 | *.mod 21 | *.smod 22 | 23 | # Compiled Static libraries 24 | *.lai 25 | *.la 26 | *.a 27 | *.lib 28 | 29 | # Executables 30 | *.exe 31 | *.app 32 | 33 | # Other 34 | .idea/ 35 | cmake-build-debug/ 36 | cmake-build-release/ 37 | data/ 38 | *.log 39 | -------------------------------------------------------------------------------- /.gitmodules: 
# Top-level build script for Extractor.
#
# Builds:
#   cld2_lib - shared library compiled from the CLD2 git submodule
#   mono     - the monolingual extraction executable
# and, when BUILD_TEST is ON, descends into tests/.
#
# Include directories, WITH_LZMA and the per-configuration compiler flags are
# deliberately left at directory scope: tests/CMakeLists.txt is added as a
# subdirectory and inherits them.
cmake_minimum_required(VERSION 3.5)
project(extractor C CXX)

# Fail at configure time on a compiler without C++11 instead of silently
# falling back to an older standard.
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
# Not referenced in this file; presumably consumed by tests/ - verify before removing.
set(NAME extractor)

# options
option(BUILD_TEST "Build tests" ON)
message(STATUS "BUILD_TEST: ${BUILD_TEST}")
option(WITH_LZMA "Build with lzma" ON)
message(STATUS "LZMA: ${WITH_LZMA}")

if(WITH_LZMA)
  add_definitions(-DWITH_LZMA)
endif()

# flags
# CMAKE_BUILD_TYPE is empty unless the user passes -DCMAKE_BUILD_TYPE=...;
# in that case neither of the two flag sets below is applied.
message(STATUS "CMAKE_BUILD_TYPE is ${CMAKE_BUILD_TYPE}")
set(CMAKE_CXX_FLAGS_RELEASE "-Wall -Wextra -pthread -O3 -Ofast")
set(CMAKE_CXX_FLAGS_DEBUG "-Wall -Wextra -pthread -g")

# Threads
# -pthread above only reaches Release/Debug builds. Linking the imported
# Threads::Threads target gives every configuration (including an empty
# CMAKE_BUILD_TYPE) the thread flags.
set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads REQUIRED)

# Boost
# Minimum versions are the ones documented in README.md: 1.58.0 in general,
# 1.65.1 when LZMA support is compiled in. Enforcing them here turns a late
# compile error into a clear configure-time message.
if(WITH_LZMA)
  set(extractor_boost_min_version 1.65.1)
else()
  set(extractor_boost_min_version 1.58.0)
endif()
find_package(Boost ${extractor_boost_min_version} REQUIRED
  COMPONENTS log filesystem system iostreams regex program_options)
# REQUIRED already aborts when Boost is missing, so no *_FOUND guard is needed.
include_directories(${Boost_INCLUDE_DIRS})

# update submodules
# Best effort: a source tree without git metadata (e.g. a tarball that already
# contains cld2) must still configure, so a failure here is only a warning.
# Missing sources are reported with a fatal error further below.
find_package(Git QUIET)
if(GIT_FOUND)
  execute_process(
    COMMAND "${GIT_EXECUTABLE}" submodule update --init -- cld2
    WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/src/3rd_party"
    RESULT_VARIABLE extractor_submodule_result)
  if(NOT "${extractor_submodule_result}" STREQUAL "0")
    message(WARNING
      "'git submodule update --init -- cld2' failed (${extractor_submodule_result}); "
      "continuing with whatever is present in src/3rd_party/cld2")
  endif()
else()
  message(WARNING "git not found; the cld2 submodule will not be updated automatically")
endif()

# cld2
set(CLD2_DIR "src/3rd_party/cld2")
set(CLD2_DIR_INT "src/3rd_party/cld2/internal")
include_directories("${CLD2_DIR}/internal")
include_directories("${CLD2_DIR}/public")
set(libcld2_full_files
  ${CLD2_DIR_INT}/cldutil.cc
  ${CLD2_DIR_INT}/cldutil_shared.cc
  ${CLD2_DIR_INT}/compact_lang_det.cc
  ${CLD2_DIR_INT}/compact_lang_det_hint_code.cc
  ${CLD2_DIR_INT}/compact_lang_det_impl.cc
  ${CLD2_DIR_INT}/debug.cc
  ${CLD2_DIR_INT}/fixunicodevalue.cc
  ${CLD2_DIR_INT}/generated_entities.cc
  ${CLD2_DIR_INT}/generated_language.cc
  ${CLD2_DIR_INT}/generated_ulscript.cc
  ${CLD2_DIR_INT}/getonescriptspan.cc
  ${CLD2_DIR_INT}/lang_script.cc
  ${CLD2_DIR_INT}/offsetmap.cc
  ${CLD2_DIR_INT}/scoreonescriptspan.cc
  ${CLD2_DIR_INT}/tote.cc
  ${CLD2_DIR_INT}/utf8statetable.cc
  ${CLD2_DIR_INT}/cld_generated_cjk_uni_prop_80.cc
  ${CLD2_DIR_INT}/cld2_generated_cjk_compatible.cc
  ${CLD2_DIR_INT}/cld_generated_cjk_delta_bi_32.cc
  ${CLD2_DIR_INT}/generated_distinct_bi_0.cc
  ${CLD2_DIR_INT}/cld2_generated_quad0122.cc
  ${CLD2_DIR_INT}/cld2_generated_deltaocta0122.cc
  ${CLD2_DIR_INT}/cld2_generated_distinctocta0122.cc
  ${CLD2_DIR_INT}/cld_generated_score_quad_octa_0122.cc)

# The list used to be passed through file(GLOB), which silently dropped every
# file that did not exist and produced a partial (or empty) library. Check the
# files explicitly so an uninitialised submodule is reported where it happens.
foreach(cld2_source IN LISTS libcld2_full_files)
  if(NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/${cld2_source}")
    message(FATAL_ERROR
      "Missing CLD2 source '${cld2_source}'. "
      "Run 'git submodule update --init' to fetch src/3rd_party/cld2.")
  endif()
endforeach()

add_library(cld2_lib SHARED ${libcld2_full_files})
# Third-party code: suppress its warnings rather than patching it.
target_compile_options(cld2_lib PRIVATE -Wno-c++11-narrowing -w)

# CURL
find_package(CURL REQUIRED)
include_directories(${CURL_INCLUDE_DIRS})

# mono
add_executable(mono
  src/mono/mono.cpp
  src/mono/filters/langcollectorfilter.cpp
  src/mono/filters/langsplitfilter.cpp
  src/mono/filters/warcfilter.cpp
  src/mono/language_sink.cpp
  src/mono/buffered_map.cpp
  src/mono/worker.cpp
  src/utils/curldownloader.cpp
  src/utils/common.cpp
  src/utils/compression.cpp
  src/utils/logging.cpp)
target_link_libraries(mono
  PRIVATE
    ${Boost_LIBRARIES}
    ${CURL_LIBRARIES}
    cld2_lib
    Threads::Threads)

# build tests with gtest
if(BUILD_TEST)
  # REQUIRED aborts when GTest is missing, so the include path can be added
  # unconditionally instead of depending on the casing of a *_FOUND variable.
  find_package(GTest REQUIRED)
  include_directories(${GTEST_INCLUDE_DIRS})

  add_subdirectory(tests)
endif()
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Extractor 2 | 3 | Extractor is a tool for fast monolingual data collection from Common Crawl data. It processes Common Crawl WET files, removes invalid UTF-8, identifies languages and outputs file-separated monolingual data. Text is classified by language using Google's CLD2 language detector (https://github.com/CLD2Owners/cld2). 4 | 5 | 6 | The following is an example of a French output file: 7 | ``` 8 | ... 9 | f6fa1abb58549287111ba8d776733e9 uri:http://blogalolotte.canalblog.com/tag/pomme%20de%20terre language:fr bytes:59 10 | pomme de terre : Tous les messages sur pomme de terre - Le 11 | df6fa1abb58549287111ba8d776733e9 uri:http://blogalolotte.canalblog.com/tag/pomme%20de%20terre language:fr bytes:81 12 | ... 13 | ``` 14 | 15 | Each header line starts with the same hash code followed by metadata. Then, the extracted language-specific text (here French) starts on a new line and is ended by the next header line. 
16 | 17 | Apart from this, Extractor can also produce statistics on language distribution per domain. 18 | 19 | 20 | ## Requirements 21 | 22 | - GCC, C++11 compiler 23 | - Boost 1.58.0 or later (Boost 1.65.1 or later for LZMA compression) 24 | - CMake 3.5 or later 25 | - curl, libcurl4-openssl-dev 26 | 27 | 28 | ## Installation 29 | 30 | ```bash 31 | mkdir -p build && cd build 32 | cmake .. -DWITH_LZMA=off -DBUILD_TEST=on -DCMAKE_BUILD_TYPE=Release 33 | make -j 4 34 | ``` 35 | 36 | `-DWITH_LZMA=on` enables LZMA compression but requires Boost version 1.65.1 or higher. 37 | 38 | 39 | 40 | ## Usage 41 | 42 | Extractor can be used to process monolingual data with the following command: 43 | 44 | ```bash 45 | cat | ./mono --icompression --ocompression --workers --output 46 | ``` 47 | 48 | Extractor reads from standard input and expects a line-separated list of path names. This list is then processed in parallel with the number of workers specified as an argument. 49 | 50 | The `icompression` and `ocompression` arguments set the input and output compression formats respectively. 51 | The following compression formats are supported: `gzip|bzip2|zlib|lzma|none`. 52 | 53 | One folder per worker is created in the output folder and each such folder contains file-separated monolingual data. Processed files are logged in a `done.log` file located in each folder. 54 | 55 | 56 | ### Optional arguments: 57 | 58 | - --curl: uses cURL to download remote files 59 | - --print_stats: prints language statistics 60 | 61 | If the `--curl` option is used, the input is expected to be a list of URL addresses instead of a list of path names. Extractor also supports multiple URL sources; if one URL source fails, it attempts to download the content from another one. To define additional sources, just add more URLs on the same line separated by a single space. Any cURL errors are logged in an `errors.log` file in each worker folder. 
62 | 63 | If the `--print_stats` option is enabled, Extractor creates an additional `stats` folder in each worker folder, and outputs language statistics on the processed files. Each file contains statistics on language distribution per domain with the following format: 64 | ``` 65 | 66 | ``` 67 | 68 | Each file contains no duplicates and is alphabetically sorted. 69 | 70 | 71 | 72 | 73 | ## Known Issues 74 | 75 | CLD2 detects almost 300 different languages when processing Common Crawl. Since each worker opens a new file for every detected language, the total number of opened files might exceed the user's hard or soft resource limits on open file descriptors. You can type ```ulimit -a``` to get a list of all current resource limits. 76 | 77 | For Common Crawl purposes, executing the following command should increase the limit of open file descriptors sufficiently: 78 | ```bash 79 | ulimit -n 32768 80 | ``` -------------------------------------------------------------------------------- /src/3rd_party/utf8/doc/ReleaseNotes: -------------------------------------------------------------------------------- 1 | utf8 cpp library 2 | Release 2.3.4 3 | 4 | A minor bug fix release. Thanks to all who reported bugs. 5 | 6 | Note: Version 2.3.3 contained a regression, and therefore was removed. 
7 | 8 | Changes from version 2.3.2 9 | - Bug fix [39]: checked.h Line 273 and unchecked.h Line 182 have an extra ';' 10 | - Bug fix [36]: replace_invalid() only works with back_inserter 11 | 12 | Files included in the release: utf8.h, core.h, checked.h, unchecked.h, utf8cpp.html, ReleaseNotes 13 | -------------------------------------------------------------------------------- /src/3rd_party/utf8/source/utf8.h: -------------------------------------------------------------------------------- 1 | // Copyright 2006 Nemanja Trifunovic 2 | 3 | /* 4 | Permission is hereby granted, free of charge, to any person or organization 5 | obtaining a copy of the software and accompanying documentation covered by 6 | this license (the "Software") to use, reproduce, display, distribute, 7 | execute, and transmit the Software, and to prepare derivative works of the 8 | Software, and to permit third-parties to whom the Software is furnished to 9 | do so, all subject to the following: 10 | 11 | The copyright notices in the Software and this entire statement, including 12 | the above license grant, this restriction and the following disclaimer, 13 | must be included in all copies of the Software, in whole or in part, and 14 | all derivative works of the Software, unless such copies or derivative 15 | works are solely in the form of machine-executable object code generated by 16 | a source language processor. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT 21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 24 | DEALINGS IN THE SOFTWARE. 
25 | */ 26 | 27 | 28 | #ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 29 | #define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 30 | 31 | #include "utf8/checked.h" 32 | #include "utf8/unchecked.h" 33 | 34 | #endif // header guard 35 | -------------------------------------------------------------------------------- /src/3rd_party/utf8/source/utf8/checked.h: -------------------------------------------------------------------------------- 1 | // Copyright 2006 Nemanja Trifunovic 2 | 3 | /* 4 | Permission is hereby granted, free of charge, to any person or organization 5 | obtaining a copy of the software and accompanying documentation covered by 6 | this license (the "Software") to use, reproduce, display, distribute, 7 | execute, and transmit the Software, and to prepare derivative works of the 8 | Software, and to permit third-parties to whom the Software is furnished to 9 | do so, all subject to the following: 10 | 11 | The copyright notices in the Software and this entire statement, including 12 | the above license grant, this restriction and the following disclaimer, 13 | must be included in all copies of the Software, in whole or in part, and 14 | all derivative works of the Software, unless such copies or derivative 15 | works are solely in the form of machine-executable object code generated by 16 | a source language processor. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT 21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 24 | DEALINGS IN THE SOFTWARE. 
25 | */ 26 | 27 | 28 | #ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 29 | #define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 30 | 31 | #include "core.h" 32 | #include 33 | 34 | namespace utf8 35 | { 36 | // Base for the exceptions that may be thrown from the library 37 | class exception : public ::std::exception { 38 | }; 39 | 40 | // Exceptions that may be thrown from the library functions. 41 | class invalid_code_point : public exception { 42 | uint32_t cp; 43 | public: 44 | invalid_code_point(uint32_t cp) : cp(cp) {} 45 | virtual const char* what() const throw() { return "Invalid code point"; } 46 | uint32_t code_point() const {return cp;} 47 | }; 48 | 49 | class invalid_utf8 : public exception { 50 | uint8_t u8; 51 | public: 52 | invalid_utf8 (uint8_t u) : u8(u) {} 53 | virtual const char* what() const throw() { return "Invalid UTF-8"; } 54 | uint8_t utf8_octet() const {return u8;} 55 | }; 56 | 57 | class invalid_utf16 : public exception { 58 | uint16_t u16; 59 | public: 60 | invalid_utf16 (uint16_t u) : u16(u) {} 61 | virtual const char* what() const throw() { return "Invalid UTF-16"; } 62 | uint16_t utf16_word() const {return u16;} 63 | }; 64 | 65 | class not_enough_room : public exception { 66 | public: 67 | virtual const char* what() const throw() { return "Not enough space"; } 68 | }; 69 | 70 | /// The library API - functions intended to be called by the users 71 | 72 | template 73 | octet_iterator append(uint32_t cp, octet_iterator result) 74 | { 75 | if (!utf8::internal::is_code_point_valid(cp)) 76 | throw invalid_code_point(cp); 77 | 78 | if (cp < 0x80) // one octet 79 | *(result++) = static_cast(cp); 80 | else if (cp < 0x800) { // two octets 81 | *(result++) = static_cast((cp >> 6) | 0xc0); 82 | *(result++) = static_cast((cp & 0x3f) | 0x80); 83 | } 84 | else if (cp < 0x10000) { // three octets 85 | *(result++) = static_cast((cp >> 12) | 0xe0); 86 | *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); 87 | 
*(result++) = static_cast((cp & 0x3f) | 0x80); 88 | } 89 | else { // four octets 90 | *(result++) = static_cast((cp >> 18) | 0xf0); 91 | *(result++) = static_cast(((cp >> 12) & 0x3f) | 0x80); 92 | *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); 93 | *(result++) = static_cast((cp & 0x3f) | 0x80); 94 | } 95 | return result; 96 | } 97 | 98 | template 99 | output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement) 100 | { 101 | while (start != end) { 102 | octet_iterator sequence_start = start; 103 | internal::utf_error err_code = utf8::internal::validate_next(start, end); 104 | switch (err_code) { 105 | case internal::UTF8_OK : 106 | for (octet_iterator it = sequence_start; it != start; ++it) 107 | *out++ = *it; 108 | break; 109 | case internal::NOT_ENOUGH_ROOM: 110 | throw not_enough_room(); 111 | case internal::INVALID_LEAD: 112 | out = utf8::append (replacement, out); 113 | ++start; 114 | break; 115 | case internal::INCOMPLETE_SEQUENCE: 116 | case internal::OVERLONG_SEQUENCE: 117 | case internal::INVALID_CODE_POINT: 118 | out = utf8::append (replacement, out); 119 | ++start; 120 | // just one replacement mark for the sequence 121 | while (start != end && utf8::internal::is_trail(*start)) 122 | ++start; 123 | break; 124 | } 125 | } 126 | return out; 127 | } 128 | 129 | template 130 | inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out) 131 | { 132 | static const uint32_t replacement_marker = utf8::internal::mask16(0xfffd); 133 | return utf8::replace_invalid(start, end, out, replacement_marker); 134 | } 135 | 136 | template 137 | uint32_t next(octet_iterator& it, octet_iterator end) 138 | { 139 | uint32_t cp = 0; 140 | internal::utf_error err_code = utf8::internal::validate_next(it, end, cp); 141 | switch (err_code) { 142 | case internal::UTF8_OK : 143 | break; 144 | case internal::NOT_ENOUGH_ROOM : 145 | throw not_enough_room(); 146 | case 
internal::INVALID_LEAD : 147 | case internal::INCOMPLETE_SEQUENCE : 148 | case internal::OVERLONG_SEQUENCE : 149 | throw invalid_utf8(*it); 150 | case internal::INVALID_CODE_POINT : 151 | throw invalid_code_point(cp); 152 | } 153 | return cp; 154 | } 155 | 156 | template 157 | uint32_t peek_next(octet_iterator it, octet_iterator end) 158 | { 159 | return utf8::next(it, end); 160 | } 161 | 162 | template 163 | uint32_t prior(octet_iterator& it, octet_iterator start) 164 | { 165 | // can't do much if it == start 166 | if (it == start) 167 | throw not_enough_room(); 168 | 169 | octet_iterator end = it; 170 | // Go back until we hit either a lead octet or start 171 | while (utf8::internal::is_trail(*(--it))) 172 | if (it == start) 173 | throw invalid_utf8(*it); // error - no lead byte in the sequence 174 | return utf8::peek_next(it, end); 175 | } 176 | 177 | /// Deprecated in versions that include "prior" 178 | template 179 | uint32_t previous(octet_iterator& it, octet_iterator pass_start) 180 | { 181 | octet_iterator end = it; 182 | while (utf8::internal::is_trail(*(--it))) 183 | if (it == pass_start) 184 | throw invalid_utf8(*it); // error - no lead byte in the sequence 185 | octet_iterator temp = it; 186 | return utf8::next(temp, end); 187 | } 188 | 189 | template 190 | void advance (octet_iterator& it, distance_type n, octet_iterator end) 191 | { 192 | for (distance_type i = 0; i < n; ++i) 193 | utf8::next(it, end); 194 | } 195 | 196 | template 197 | typename std::iterator_traits::difference_type 198 | distance (octet_iterator first, octet_iterator last) 199 | { 200 | typename std::iterator_traits::difference_type dist; 201 | for (dist = 0; first < last; ++dist) 202 | utf8::next(first, last); 203 | return dist; 204 | } 205 | 206 | template 207 | octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) 208 | { 209 | while (start != end) { 210 | uint32_t cp = utf8::internal::mask16(*start++); 211 | // Take care of surrogate pairs 
first 212 | if (utf8::internal::is_lead_surrogate(cp)) { 213 | if (start != end) { 214 | uint32_t trail_surrogate = utf8::internal::mask16(*start++); 215 | if (utf8::internal::is_trail_surrogate(trail_surrogate)) 216 | cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; 217 | else 218 | throw invalid_utf16(static_cast(trail_surrogate)); 219 | } 220 | else 221 | throw invalid_utf16(static_cast(cp)); 222 | 223 | } 224 | // Lone trail surrogate 225 | else if (utf8::internal::is_trail_surrogate(cp)) 226 | throw invalid_utf16(static_cast(cp)); 227 | 228 | result = utf8::append(cp, result); 229 | } 230 | return result; 231 | } 232 | 233 | template 234 | u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) 235 | { 236 | while (start != end) { 237 | uint32_t cp = utf8::next(start, end); 238 | if (cp > 0xffff) { //make a surrogate pair 239 | *result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET); 240 | *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); 241 | } 242 | else 243 | *result++ = static_cast(cp); 244 | } 245 | return result; 246 | } 247 | 248 | template 249 | octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) 250 | { 251 | while (start != end) 252 | result = utf8::append(*(start++), result); 253 | 254 | return result; 255 | } 256 | 257 | template 258 | u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) 259 | { 260 | while (start != end) 261 | (*result++) = utf8::next(start, end); 262 | 263 | return result; 264 | } 265 | 266 | // The iterator class 267 | template 268 | class iterator : public std::iterator { 269 | octet_iterator it; 270 | octet_iterator range_start; 271 | octet_iterator range_end; 272 | public: 273 | iterator () {} 274 | explicit iterator (const octet_iterator& octet_it, 275 | const octet_iterator& range_start, 276 | const octet_iterator& range_end) : 277 | it(octet_it), 
range_start(range_start), range_end(range_end) 278 | { 279 | if (it < range_start || it > range_end) 280 | throw std::out_of_range("Invalid utf-8 iterator position"); 281 | } 282 | // the default "big three" are OK 283 | octet_iterator base () const { return it; } 284 | uint32_t operator * () const 285 | { 286 | octet_iterator temp = it; 287 | return utf8::next(temp, range_end); 288 | } 289 | bool operator == (const iterator& rhs) const 290 | { 291 | if (range_start != rhs.range_start || range_end != rhs.range_end) 292 | throw std::logic_error("Comparing utf-8 iterators defined with different ranges"); 293 | return (it == rhs.it); 294 | } 295 | bool operator != (const iterator& rhs) const 296 | { 297 | return !(operator == (rhs)); 298 | } 299 | iterator& operator ++ () 300 | { 301 | utf8::next(it, range_end); 302 | return *this; 303 | } 304 | iterator operator ++ (int) 305 | { 306 | iterator temp = *this; 307 | utf8::next(it, range_end); 308 | return temp; 309 | } 310 | iterator& operator -- () 311 | { 312 | utf8::prior(it, range_start); 313 | return *this; 314 | } 315 | iterator operator -- (int) 316 | { 317 | iterator temp = *this; 318 | utf8::prior(it, range_start); 319 | return temp; 320 | } 321 | }; // class iterator 322 | 323 | } // namespace utf8 324 | 325 | #endif //header guard 326 | 327 | 328 | -------------------------------------------------------------------------------- /src/3rd_party/utf8/source/utf8/core.h: -------------------------------------------------------------------------------- 1 | // Copyright 2006 Nemanja Trifunovic 2 | 3 | /* 4 | Permission is hereby granted, free of charge, to any person or organization 5 | obtaining a copy of the software and accompanying documentation covered by 6 | this license (the "Software") to use, reproduce, display, distribute, 7 | execute, and transmit the Software, and to prepare derivative works of the 8 | Software, and to permit third-parties to whom the Software is furnished to 9 | do so, all subject to 
the following: 10 | 11 | The copyright notices in the Software and this entire statement, including 12 | the above license grant, this restriction and the following disclaimer, 13 | must be included in all copies of the Software, in whole or in part, and 14 | all derivative works of the Software, unless such copies or derivative 15 | works are solely in the form of machine-executable object code generated by 16 | a source language processor. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT 21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 24 | DEALINGS IN THE SOFTWARE. 25 | */ 26 | 27 | 28 | #ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 29 | #define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 30 | 31 | #include 32 | 33 | namespace utf8 34 | { 35 | // The typedefs for 8-bit, 16-bit and 32-bit unsigned integers 36 | // You may need to change them to match your system. 37 | // These typedefs have the same names as ones from cstdint, or boost/cstdint 38 | typedef unsigned char uint8_t; 39 | typedef unsigned short uint16_t; 40 | typedef unsigned int uint32_t; 41 | 42 | // Helper code - not intended to be directly called by the library users. 
May be changed at any time 43 | namespace internal 44 | { 45 | // Unicode constants 46 | // Leading (high) surrogates: 0xd800 - 0xdbff 47 | // Trailing (low) surrogates: 0xdc00 - 0xdfff 48 | const uint16_t LEAD_SURROGATE_MIN = 0xd800u; 49 | const uint16_t LEAD_SURROGATE_MAX = 0xdbffu; 50 | const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u; 51 | const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu; 52 | const uint16_t LEAD_OFFSET = LEAD_SURROGATE_MIN - (0x10000 >> 10); 53 | const uint32_t SURROGATE_OFFSET = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN; 54 | 55 | // Maximum valid value for a Unicode code point 56 | const uint32_t CODE_POINT_MAX = 0x0010ffffu; 57 | 58 | template 59 | inline uint8_t mask8(octet_type oc) 60 | { 61 | return static_cast(0xff & oc); 62 | } 63 | template 64 | inline uint16_t mask16(u16_type oc) 65 | { 66 | return static_cast(0xffff & oc); 67 | } 68 | template 69 | inline bool is_trail(octet_type oc) 70 | { 71 | return ((utf8::internal::mask8(oc) >> 6) == 0x2); 72 | } 73 | 74 | template 75 | inline bool is_lead_surrogate(u16 cp) 76 | { 77 | return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX); 78 | } 79 | 80 | template 81 | inline bool is_trail_surrogate(u16 cp) 82 | { 83 | return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX); 84 | } 85 | 86 | template 87 | inline bool is_surrogate(u16 cp) 88 | { 89 | return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX); 90 | } 91 | 92 | template 93 | inline bool is_code_point_valid(u32 cp) 94 | { 95 | return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp)); 96 | } 97 | 98 | template 99 | inline typename std::iterator_traits::difference_type 100 | sequence_length(octet_iterator lead_it) 101 | { 102 | uint8_t lead = utf8::internal::mask8(*lead_it); 103 | if (lead < 0x80) 104 | return 1; 105 | else if ((lead >> 5) == 0x6) 106 | return 2; 107 | else if ((lead >> 4) == 0xe) 108 | return 3; 109 | else if ((lead >> 3) == 0x1e) 110 | return 4; 111 | else 112 | return 0; 
113 | } 114 | 115 | template 116 | inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length) 117 | { 118 | if (cp < 0x80) { 119 | if (length != 1) 120 | return true; 121 | } 122 | else if (cp < 0x800) { 123 | if (length != 2) 124 | return true; 125 | } 126 | else if (cp < 0x10000) { 127 | if (length != 3) 128 | return true; 129 | } 130 | 131 | return false; 132 | } 133 | 134 | enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT}; 135 | 136 | /// Helper for get_sequence_x 137 | template 138 | utf_error increase_safely(octet_iterator& it, octet_iterator end) 139 | { 140 | if (++it == end) 141 | return NOT_ENOUGH_ROOM; 142 | 143 | if (!utf8::internal::is_trail(*it)) 144 | return INCOMPLETE_SEQUENCE; 145 | 146 | return UTF8_OK; 147 | } 148 | 149 | #define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;} 150 | 151 | /// get_sequence_x functions decode utf-8 sequences of the length x 152 | template 153 | utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t& code_point) 154 | { 155 | if (it == end) 156 | return NOT_ENOUGH_ROOM; 157 | 158 | code_point = utf8::internal::mask8(*it); 159 | 160 | return UTF8_OK; 161 | } 162 | 163 | template 164 | utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t& code_point) 165 | { 166 | if (it == end) 167 | return NOT_ENOUGH_ROOM; 168 | 169 | code_point = utf8::internal::mask8(*it); 170 | 171 | UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) 172 | 173 | code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f); 174 | 175 | return UTF8_OK; 176 | } 177 | 178 | template 179 | utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t& code_point) 180 | { 181 | if (it == end) 182 | return NOT_ENOUGH_ROOM; 183 | 184 | code_point = utf8::internal::mask8(*it); 185 | 186 | UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) 187 | 188 | 
code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff); 189 | 190 | UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) 191 | 192 | code_point += (*it) & 0x3f; 193 | 194 | return UTF8_OK; 195 | } 196 | 197 | template 198 | utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t& code_point) 199 | { 200 | if (it == end) 201 | return NOT_ENOUGH_ROOM; 202 | 203 | code_point = utf8::internal::mask8(*it); 204 | 205 | UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) 206 | 207 | code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff); 208 | 209 | UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) 210 | 211 | code_point += (utf8::internal::mask8(*it) << 6) & 0xfff; 212 | 213 | UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) 214 | 215 | code_point += (*it) & 0x3f; 216 | 217 | return UTF8_OK; 218 | } 219 | 220 | #undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR 221 | 222 | template 223 | utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t& code_point) 224 | { 225 | // Save the original value of it so we can go back in case of failure 226 | // Of course, it does not make much sense with i.e. 
stream iterators 227 | octet_iterator original_it = it; 228 | 229 | uint32_t cp = 0; 230 | // Determine the sequence length based on the lead octet 231 | typedef typename std::iterator_traits::difference_type octet_difference_type; 232 | const octet_difference_type length = utf8::internal::sequence_length(it); 233 | 234 | // Get trail octets and calculate the code point 235 | utf_error err = UTF8_OK; 236 | switch (length) { 237 | case 0: 238 | return INVALID_LEAD; 239 | case 1: 240 | err = utf8::internal::get_sequence_1(it, end, cp); 241 | break; 242 | case 2: 243 | err = utf8::internal::get_sequence_2(it, end, cp); 244 | break; 245 | case 3: 246 | err = utf8::internal::get_sequence_3(it, end, cp); 247 | break; 248 | case 4: 249 | err = utf8::internal::get_sequence_4(it, end, cp); 250 | break; 251 | } 252 | 253 | if (err == UTF8_OK) { 254 | // Decoding succeeded. Now, security checks... 255 | if (utf8::internal::is_code_point_valid(cp)) { 256 | if (!utf8::internal::is_overlong_sequence(cp, length)){ 257 | // Passed! Return here. 
258 | code_point = cp; 259 | ++it; 260 | return UTF8_OK; 261 | } 262 | else 263 | err = OVERLONG_SEQUENCE; 264 | } 265 | else 266 | err = INVALID_CODE_POINT; 267 | } 268 | 269 | // Failure branch - restore the original value of the iterator 270 | it = original_it; 271 | return err; 272 | } 273 | 274 | template 275 | inline utf_error validate_next(octet_iterator& it, octet_iterator end) { 276 | uint32_t ignored; 277 | return utf8::internal::validate_next(it, end, ignored); 278 | } 279 | 280 | } // namespace internal 281 | 282 | /// The library API - functions intended to be called by the users 283 | 284 | // Byte order mark 285 | const uint8_t bom[] = {0xef, 0xbb, 0xbf}; 286 | 287 | template 288 | octet_iterator find_invalid(octet_iterator start, octet_iterator end) 289 | { 290 | octet_iterator result = start; 291 | while (result != end) { 292 | utf8::internal::utf_error err_code = utf8::internal::validate_next(result, end); 293 | if (err_code != internal::UTF8_OK) 294 | return result; 295 | } 296 | return result; 297 | } 298 | 299 | template 300 | inline bool is_valid(octet_iterator start, octet_iterator end) 301 | { 302 | return (utf8::find_invalid(start, end) == end); 303 | } 304 | 305 | template 306 | inline bool starts_with_bom (octet_iterator it, octet_iterator end) 307 | { 308 | return ( 309 | ((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) && 310 | ((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) && 311 | ((it != end) && (utf8::internal::mask8(*it)) == bom[2]) 312 | ); 313 | } 314 | 315 | //Deprecated in release 2.3 316 | template 317 | inline bool is_bom (octet_iterator it) 318 | { 319 | return ( 320 | (utf8::internal::mask8(*it++)) == bom[0] && 321 | (utf8::internal::mask8(*it++)) == bom[1] && 322 | (utf8::internal::mask8(*it)) == bom[2] 323 | ); 324 | } 325 | } // namespace utf8 326 | 327 | #endif // header guard 328 | 329 | 330 | -------------------------------------------------------------------------------- 
/src/3rd_party/utf8/source/utf8/unchecked.h: -------------------------------------------------------------------------------- 1 | // Copyright 2006 Nemanja Trifunovic 2 | 3 | /* 4 | Permission is hereby granted, free of charge, to any person or organization 5 | obtaining a copy of the software and accompanying documentation covered by 6 | this license (the "Software") to use, reproduce, display, distribute, 7 | execute, and transmit the Software, and to prepare derivative works of the 8 | Software, and to permit third-parties to whom the Software is furnished to 9 | do so, all subject to the following: 10 | 11 | The copyright notices in the Software and this entire statement, including 12 | the above license grant, this restriction and the following disclaimer, 13 | must be included in all copies of the Software, in whole or in part, and 14 | all derivative works of the Software, unless such copies or derivative 15 | works are solely in the form of machine-executable object code generated by 16 | a source language processor. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT 21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 24 | DEALINGS IN THE SOFTWARE. 
25 | */ 26 | 27 | 28 | #ifndef UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 29 | #define UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 30 | 31 | #include "core.h" 32 | 33 | namespace utf8 34 | { 35 | namespace unchecked 36 | { 37 | template 38 | octet_iterator append(uint32_t cp, octet_iterator result) 39 | { 40 | if (cp < 0x80) // one octet 41 | *(result++) = static_cast(cp); 42 | else if (cp < 0x800) { // two octets 43 | *(result++) = static_cast((cp >> 6) | 0xc0); 44 | *(result++) = static_cast((cp & 0x3f) | 0x80); 45 | } 46 | else if (cp < 0x10000) { // three octets 47 | *(result++) = static_cast((cp >> 12) | 0xe0); 48 | *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); 49 | *(result++) = static_cast((cp & 0x3f) | 0x80); 50 | } 51 | else { // four octets 52 | *(result++) = static_cast((cp >> 18) | 0xf0); 53 | *(result++) = static_cast(((cp >> 12) & 0x3f)| 0x80); 54 | *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); 55 | *(result++) = static_cast((cp & 0x3f) | 0x80); 56 | } 57 | return result; 58 | } 59 | 60 | template 61 | uint32_t next(octet_iterator& it) 62 | { 63 | uint32_t cp = utf8::internal::mask8(*it); 64 | typename std::iterator_traits::difference_type length = utf8::internal::sequence_length(it); 65 | switch (length) { 66 | case 1: 67 | break; 68 | case 2: 69 | it++; 70 | cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f); 71 | break; 72 | case 3: 73 | ++it; 74 | cp = ((cp << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff); 75 | ++it; 76 | cp += (*it) & 0x3f; 77 | break; 78 | case 4: 79 | ++it; 80 | cp = ((cp << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff); 81 | ++it; 82 | cp += (utf8::internal::mask8(*it) << 6) & 0xfff; 83 | ++it; 84 | cp += (*it) & 0x3f; 85 | break; 86 | } 87 | ++it; 88 | return cp; 89 | } 90 | 91 | template 92 | uint32_t peek_next(octet_iterator it) 93 | { 94 | return utf8::unchecked::next(it); 95 | } 96 | 97 | template 98 | uint32_t prior(octet_iterator& it) 99 | { 
100 | while (utf8::internal::is_trail(*(--it))) ; 101 | octet_iterator temp = it; 102 | return utf8::unchecked::next(temp); 103 | } 104 | 105 | // Deprecated in versions that include prior, but only for the sake of consistency (see utf8::previous) 106 | template 107 | inline uint32_t previous(octet_iterator& it) 108 | { 109 | return utf8::unchecked::prior(it); 110 | } 111 | 112 | template 113 | void advance (octet_iterator& it, distance_type n) 114 | { 115 | for (distance_type i = 0; i < n; ++i) 116 | utf8::unchecked::next(it); 117 | } 118 | 119 | template 120 | typename std::iterator_traits::difference_type 121 | distance (octet_iterator first, octet_iterator last) 122 | { 123 | typename std::iterator_traits::difference_type dist; 124 | for (dist = 0; first < last; ++dist) 125 | utf8::unchecked::next(first); 126 | return dist; 127 | } 128 | 129 | template 130 | octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) 131 | { 132 | while (start != end) { 133 | uint32_t cp = utf8::internal::mask16(*start++); 134 | // Take care of surrogate pairs first 135 | if (utf8::internal::is_lead_surrogate(cp)) { 136 | uint32_t trail_surrogate = utf8::internal::mask16(*start++); 137 | cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; 138 | } 139 | result = utf8::unchecked::append(cp, result); 140 | } 141 | return result; 142 | } 143 | 144 | template 145 | u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) 146 | { 147 | while (start < end) { 148 | uint32_t cp = utf8::unchecked::next(start); 149 | if (cp > 0xffff) { //make a surrogate pair 150 | *result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET); 151 | *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); 152 | } 153 | else 154 | *result++ = static_cast(cp); 155 | } 156 | return result; 157 | } 158 | 159 | template 160 | octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) 161 
| { 162 | while (start != end) 163 | result = utf8::unchecked::append(*(start++), result); 164 | 165 | return result; 166 | } 167 | 168 | template 169 | u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) 170 | { 171 | while (start < end) 172 | (*result++) = utf8::unchecked::next(start); 173 | 174 | return result; 175 | } 176 | 177 | // The iterator class 178 | template 179 | class iterator : public std::iterator { 180 | octet_iterator it; 181 | public: 182 | iterator () {} 183 | explicit iterator (const octet_iterator& octet_it): it(octet_it) {} 184 | // the default "big three" are OK 185 | octet_iterator base () const { return it; } 186 | uint32_t operator * () const 187 | { 188 | octet_iterator temp = it; 189 | return utf8::unchecked::next(temp); 190 | } 191 | bool operator == (const iterator& rhs) const 192 | { 193 | return (it == rhs.it); 194 | } 195 | bool operator != (const iterator& rhs) const 196 | { 197 | return !(operator == (rhs)); 198 | } 199 | iterator& operator ++ () 200 | { 201 | ::std::advance(it, utf8::internal::sequence_length(it)); 202 | return *this; 203 | } 204 | iterator operator ++ (int) 205 | { 206 | iterator temp = *this; 207 | ::std::advance(it, utf8::internal::sequence_length(it)); 208 | return temp; 209 | } 210 | iterator& operator -- () 211 | { 212 | utf8::unchecked::prior(it); 213 | return *this; 214 | } 215 | iterator operator -- (int) 216 | { 217 | iterator temp = *this; 218 | utf8::unchecked::prior(it); 219 | return temp; 220 | } 221 | }; // class iterator 222 | 223 | } // namespace utf8::unchecked 224 | } // namespace utf8 225 | 226 | 227 | #endif // header guard 228 | 229 | -------------------------------------------------------------------------------- /src/mono/buffered_map.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "buffered_map.h" 3 | #include "../utils/compression.h" 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | 
10 | 11 | #include 12 | #include 13 | 14 | 15 | namespace mono { 16 | 17 | buffered_map::buffered_map(std::string output_folder_, unsigned long buffer_size_) : output_folder(output_folder_), 18 | buffer_size(buffer_size_), 19 | file_no(0) {}; 20 | 21 | void buffered_map::add(std::pair key, long value) { 22 | if (domain_map.size() + 1 > buffer_size) { 23 | flush(); 24 | domain_map.clear(); 25 | } 26 | 27 | std::map, long>::iterator it = domain_map.find(key); 28 | if (it != domain_map.end()) { 29 | it->second += value; 30 | } else { 31 | domain_map.insert(std::pair, long>(key, value)); 32 | } 33 | 34 | } 35 | 36 | void buffered_map::flush() { 37 | // find an unused filename 38 | std::string candidate_ouput_path = output_folder + "/stats/langstats." + std::to_string(file_no) + ".gz"; 39 | while (boost::filesystem::exists(candidate_ouput_path)) { 40 | ++file_no; 41 | candidate_ouput_path = output_folder + "/stats/langstats." + std::to_string(file_no) + ".gz"; 42 | } 43 | 44 | boost::iostreams::filtering_streambuf qout; 45 | qout.push(boost::iostreams::gzip_compressor()); 46 | qout.push(boost::iostreams::file_sink(candidate_ouput_path, std::ofstream::out | std::ofstream::binary)); 47 | 48 | // output domain_map into a file 49 | std::ostream outf(&qout); 50 | std::map, long>::iterator it = domain_map.begin(); 51 | std::string sep = "\t"; 52 | std::string nl = "\n"; 53 | for (; it != domain_map.end(); ++it) { 54 | outf.write(it->first.first.c_str(), it->first.first.size()); 55 | outf.write(sep.c_str(), sep.size()); 56 | outf.write(it->first.second.c_str(), it->first.second.size()); 57 | outf.write(sep.c_str(), sep.size()); 58 | outf.write(std::to_string(it->second).c_str(), std::to_string(it->second).size()); 59 | outf.write(nl.c_str(), nl.size()); 60 | } 61 | 62 | // close and increment file number 63 | ++file_no; 64 | } 65 | 66 | size_t buffered_map::size() { 67 | return domain_map.size(); 68 | } 69 | 70 | 71 | } 
-------------------------------------------------------------------------------- /src/mono/buffered_map.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef EXTRACTOR_MONO_BUFFERED_MAP_H 3 | #define EXTRACTOR_MONO_BUFFERED_MAP_H 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | namespace mono { 10 | 11 | class buffered_map { 12 | public: 13 | 14 | std::string output_folder; 15 | 16 | unsigned long buffer_size; 17 | int file_no; 18 | 19 | std::map, long> domain_map; 20 | 21 | buffered_map(std::string output_folder_, unsigned long buffer_size_); 22 | 23 | void add(std::pair key, long value); 24 | 25 | void flush(); 26 | 27 | size_t size(); 28 | 29 | 30 | private: 31 | 32 | }; 33 | 34 | } 35 | 36 | #endif //EXTRACTOR_MONO_BUFFERED_MAP_H 37 | -------------------------------------------------------------------------------- /src/mono/filters/header.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef EXTRACTOR_MONO_FILTERS_HEADER_H 3 | #define EXTRACTOR_MONO_FILTERS_HEADER_H 4 | 5 | #include "string_util.h" 6 | 7 | #include 8 | 9 | 10 | namespace mono { 11 | 12 | namespace filters { 13 | 14 | class Header { 15 | public: 16 | explicit Header(const std::string &header) { 17 | for (const auto &value : StringUtil::Split(header, ' ')) { 18 | if (value.find("tld:") == 0) { 19 | tld_ = value.substr(4); 20 | } else if (value.find("uri:") == 0) { 21 | uri_ = value.substr(4); 22 | } else if (value.find("encoding:") == 0) { 23 | encoding_ = value.substr(9); 24 | } 25 | } 26 | } 27 | 28 | const std::string get_tld() const { return tld_; } 29 | 30 | const std::string get_uri() const { return uri_; } 31 | 32 | const std::string get_encoding() const { return encoding_; } 33 | 34 | private: 35 | std::string uri_; 36 | std::string tld_; 37 | std::string encoding_; 38 | }; 39 | 40 | } 41 | 42 | } 43 | 44 | 45 | #endif //EXTRACTOR_MONO_FILTERS_HEADER_H 46 | 
-------------------------------------------------------------------------------- /src/mono/filters/langcollectorfilter.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "langcollectorfilter.h" 3 | #include "../../utils/common.h" 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | #include 12 | #include 13 | 14 | 15 | namespace mono { 16 | 17 | namespace filters { 18 | 19 | LangCollectorFilter::LangCollectorFilter(std::string output_folder_, utils::compression_option compr_) 20 | : boost::iostreams::line_filter(true), 21 | header(""), text_buffer(""), 22 | ls(output_folder_, compr_) {}; 23 | 24 | std::string LangCollectorFilter::do_filter(const std::string &str_) { 25 | std::string line = boost::trim_copy(str_); 26 | 27 | if (line.empty()) { 28 | return ""; 29 | } 30 | 31 | if (boost::starts_with(line, magic_number)) { 32 | if (output_to_langsink()) { 33 | return ""; 34 | } 35 | 36 | // update header 37 | header = line; 38 | 39 | return ""; 40 | 41 | } else { 42 | text_buffer += line + "\n"; 43 | } 44 | 45 | return ""; 46 | } 47 | 48 | std::string LangCollectorFilter::parse_language(std::string const &str) { 49 | std::string found = find_language(str); 50 | 51 | if (found.length() > 0) { 52 | return boost::trim_copy(found.substr(language_guard.length())); 53 | } 54 | 55 | return ""; 56 | 57 | }; 58 | 59 | std::string LangCollectorFilter::find_language(std::string const &str) { 60 | std::string str_uri = str.substr(magic_number.length()); 61 | std::ostringstream ss; 62 | boost::regex expr{language_guard + "\\s*[a-zA-Z0-9]+\\s*"}; 63 | boost::smatch match; 64 | 65 | if (boost::regex_search(str_uri, match, expr) && match.size() >= 1) { 66 | return match[0]; 67 | } 68 | 69 | return ""; 70 | }; 71 | 72 | bool LangCollectorFilter::output_to_langsink() { 73 | if (header.size() > 0) { 74 | std::string res = header + '\n' + text_buffer; 75 | std::string lang = parse_language(header); 76 | 77 | 
text_buffer.erase(); 78 | 79 | if (lang.length() == 0) { 80 | header = ""; 81 | return true; 82 | } 83 | 84 | ls.output(lang, res); 85 | 86 | } 87 | 88 | return false; 89 | } 90 | 91 | } 92 | 93 | } -------------------------------------------------------------------------------- /src/mono/filters/langcollectorfilter.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef EXTRACTOR_MONO_FILTERS_LANGCOLLECTOR_H 3 | #define EXTRACTOR_MONO_FILTERS_LANGCOLLECTOR_H 4 | 5 | #include "../language_sink.h" 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | #include 19 | #include 20 | #include 21 | 22 | 23 | namespace mono { 24 | 25 | namespace filters { 26 | 27 | class LangCollectorFilter : public boost::iostreams::line_filter { 28 | 29 | public: 30 | 31 | std::string header; 32 | std::string text_buffer; 33 | mono::language_sink ls; 34 | 35 | LangCollectorFilter(std::string output_folder_, utils::compression_option compr_); 36 | 37 | template 38 | void close(Sink &snk, BOOST_IOS::openmode which) { 39 | output_to_langsink(); // flush text buffer 40 | boost::iostreams::line_filter::close(snk, which); 41 | } 42 | 43 | 44 | private: 45 | 46 | const std::string magic_number = "df6fa1abb58549287111ba8d776733e9"; 47 | const std::string language_guard = "language:"; 48 | 49 | std::string do_filter(const std::string &str); 50 | 51 | std::string parse_language(std::string const &str); 52 | 53 | std::string find_language(std::string const &str); 54 | 55 | bool output_to_langsink(); 56 | 57 | }; 58 | 59 | } 60 | 61 | } 62 | 63 | 64 | #endif //EXTRACTOR_MONO_FILTERS_LANGCOLLECTOR_H 65 | -------------------------------------------------------------------------------- /src/mono/filters/langsplitfilter.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "langsplitfilter.h" 3 | #include 
"../buffered_map.h" 4 | #include "../../utils/common.h" 5 | #include "../../utils/logging.h" 6 | #include "compact_lang_det.h" 7 | #include 8 | #include 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | 18 | namespace mono { 19 | 20 | namespace filters { 21 | 22 | LangsplitFilter::LangsplitFilter(std::string output_folder_, bool print_stats_) : 23 | boost::iostreams::line_filter(true), 24 | print_stats(print_stats_), 25 | bmap(output_folder_, 1000000), 26 | output_folder(output_folder_), 27 | header(""), 28 | text_buffer(""), 29 | num_reliable(0), 30 | num_unreliable(0) { 31 | flags = get_flag(modes); 32 | } 33 | 34 | LangsplitFilter::~LangsplitFilter() { 35 | if (num_reliable == 0 && num_unreliable == 0) { 36 | return; 37 | } 38 | 39 | logging::log_reliable(output_folder, num_reliable, num_unreliable); 40 | } 41 | 42 | std::string LangsplitFilter::do_filter(const std::string &str_) { 43 | std::string line = boost::trim_copy(str_); 44 | 45 | if (line.empty()) { 46 | return ""; 47 | } 48 | 49 | if (boost::starts_with(line, magic_number)) { 50 | std::string res = PrintLanguageStats(flags, header, text_buffer); 51 | 52 | text_buffer.erase(); 53 | header = line; 54 | 55 | return res; 56 | } else { 57 | text_buffer += line + "\n"; 58 | } 59 | 60 | return ""; 61 | } 62 | 63 | int LangsplitFilter::get_flag(std::vector modes) { 64 | int flags = 0; 65 | for (int i = 0; i < (int) modes.size(); ++i) { 66 | if (modes.at(i) == "--scoreasquads") { 67 | flags |= CLD2::kCLDFlagScoreAsQuads; 68 | } else if (modes.at(i) == "--html") { 69 | flags |= CLD2::kCLDFlagHtml; 70 | } else if (modes.at(i) == "--cr") { 71 | flags |= CLD2::kCLDFlagCr; 72 | } else if (modes.at(i) == "--verbose") { 73 | flags |= CLD2::kCLDFlagVerbose; 74 | } else if (modes.at(i) == "--echo") { 75 | flags |= CLD2::kCLDFlagEcho; 76 | } else if (modes.at(i) == "--besteffort") { 77 | flags |= CLD2::kCLDFlagBestEffort; 78 | } 79 | } 80 | 81 | return flags; 82 | } 83 | 84 
| std::string LangsplitFilter::PrintLanguageStats(const int flags, const std::string &header, 85 | const std::string &buffer) { 86 | std::stringstream ss; 87 | if (header.empty() || buffer.empty()) { 88 | return ""; 89 | } 90 | 91 | const Header header_values(header); 92 | const std::string uri = header_values.get_uri(); 93 | utils::parse_uri parsed_uri(uri); 94 | 95 | CLD2::CLDHints cld_hints = {NULL, NULL, UNKNOWN_ENCODING, 96 | CLD2::UNKNOWN_LANGUAGE}; 97 | if (!parsed_uri.get_tld().empty()) { 98 | cld_hints.tld_hint = parsed_uri.get_tld().c_str(); 99 | } 100 | 101 | CLD2::Language language3[3]; 102 | int percent3[3]; 103 | double normalized_score3[3]; 104 | int valid_prefix_bytes; 105 | 106 | CLD2::ResultChunkVector resultchunkvector; 107 | int text_bytes; 108 | bool is_reliable; 109 | 110 | CLD2::ExtDetectLanguageSummaryCheckUTF8( 111 | buffer.c_str(), buffer.size(), true, &cld_hints, flags, 112 | language3, percent3, normalized_score3, &resultchunkvector, &text_bytes, 113 | &is_reliable, &valid_prefix_bytes); 114 | 115 | if (is_reliable) { 116 | ++num_reliable; 117 | for (int i = 0; i < static_cast(resultchunkvector.size()); ++i) { 118 | const CLD2::ResultChunk &rc = resultchunkvector[i]; 119 | CLD2::Language rc_lang = static_cast(rc.lang1); 120 | 121 | if (rc_lang == CLD2::UNKNOWN_LANGUAGE) { 122 | continue; 123 | } 124 | 125 | const std::string lang_code = LanguageCode(rc_lang); 126 | ss << output_chunk(buffer, rc, header, lang_code); 127 | 128 | if (print_stats) { 129 | bmap.add(std::make_pair(parsed_uri.get_domain(), lang_code), static_cast(rc.bytes)); 130 | } 131 | } 132 | 133 | } else { 134 | ++num_unreliable; 135 | } 136 | 137 | return ss.str(); 138 | } 139 | 140 | std::string 141 | LangsplitFilter::output_chunk(const std::string buffer, const CLD2::ResultChunk &rc, const std::string header, 142 | const std::string lang_code) { 143 | std::stringstream ss; 144 | const std::string chunk = std::string(buffer, rc.offset, rc.bytes); 145 | 146 | ss << header 
<< " language:" << lang_code << " bytes:" << rc.bytes << "\n" << chunk << "\n"; 147 | return ss.str(); 148 | } 149 | 150 | } 151 | 152 | } -------------------------------------------------------------------------------- /src/mono/filters/langsplitfilter.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef EXTRACTOR_MONO_FILTERS_LANGSPLITFILTER_H 3 | #define EXTRACTOR_MONO_FILTERS_LANGSPLITFILTER_H 4 | 5 | #include "header.h" 6 | #include "compact_lang_det.h" 7 | #include "../buffered_map.h" 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | 16 | typedef CLD2::int32 Encoding; 17 | static const Encoding UNKNOWN_ENCODING = 0; 18 | 19 | namespace mono { 20 | 21 | namespace filters { 22 | 23 | class LangsplitFilter : public boost::iostreams::line_filter { 24 | 25 | public: 26 | 27 | int flags; 28 | bool print_stats; 29 | buffered_map bmap; 30 | 31 | std::string output_folder; 32 | std::string header; 33 | std::string text_buffer; 34 | 35 | long num_reliable; 36 | long num_unreliable; 37 | 38 | std::vector modes = std::vector(); 39 | 40 | LangsplitFilter(std::string output_folder_, bool print_stats_); 41 | 42 | virtual ~LangsplitFilter(); 43 | 44 | template 45 | void close(Sink &snk, BOOST_IOS::openmode which) { 46 | boost::iostreams::line_filter::close(snk, which); 47 | 48 | string_type line = PrintLanguageStats(flags, header, text_buffer); 49 | std::streamsize amt = static_cast(line.size()); 50 | boost::iostreams::write_if(snk, line.data(), amt); 51 | 52 | bmap.flush(); 53 | } 54 | 55 | 56 | private: 57 | 58 | const std::string magic_number = "df6fa1abb58549287111ba8d776733e9"; 59 | 60 | std::string do_filter(const std::string &str); 61 | 62 | int get_flag(std::vector modes); 63 | 64 | std::string PrintLanguageStats(const int flags, const std::string &header, 65 | const std::string &buffer); 66 | 67 | std::string output_chunk(const std::string buffer, const CLD2::ResultChunk &rc, const 
std::string header, 68 | const std::string lang_code); 69 | 70 | }; 71 | 72 | } 73 | 74 | } 75 | 76 | #endif //EXTRACTOR_MONO_FILTERS_LANGSPLITFILTER_H 77 | -------------------------------------------------------------------------------- /src/mono/filters/string_util.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef EXTRACTOR_MONO_FILTERS_STRING_UTIL_H 3 | #define EXTRACTOR_MONO_FILTERS_STRING_UTIL_H 4 | 5 | #include "boost/algorithm/string/trim.hpp" 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | 13 | namespace mono { 14 | 15 | namespace filters { 16 | 17 | class StringUtil { 18 | public: 19 | static bool EndsWith(std::string const &s, const std::string &end) { 20 | return (s.length() >= end.length() && s.compare(s.length() - end.length(), end.length(), end) == 0); 21 | } 22 | 23 | static std::string ToLower(const std::string &s) { 24 | std::string lower(s); 25 | std::transform(s.begin(), s.end(), lower.begin(), ::tolower); 26 | return lower; 27 | } 28 | 29 | static std::vector Split(const std::string &s, const char delim = ' ') { 30 | std::vector tokens; 31 | std::stringstream ss(s); 32 | std::string token; 33 | while (std::getline(ss, token, delim)) { 34 | if (!token.empty()) { 35 | tokens.push_back(token); 36 | } 37 | } 38 | return tokens; 39 | } 40 | 41 | static void TrimRepeatedSpace(std::string *s) { 42 | s->erase(std::unique(s->begin(), s->end(), 43 | [](char a, char b) { return std::isspace(a) && std::isspace(b); }), 44 | s->end()); 45 | } 46 | 47 | static std::string TrimRepeatedWhitespace(const std::string &s) { 48 | std::stringstream ss(s); 49 | std::ostringstream oss; 50 | std::string line; 51 | while (std::getline(ss, line)) { 52 | line = boost::trim_copy(line); 53 | if (!line.empty()) { 54 | TrimRepeatedSpace(&line); 55 | oss << line << "\n"; 56 | } 57 | } 58 | return oss.str(); 59 | } 60 | }; 61 | 62 | } 63 | 64 | } 65 | 66 | 67 | #endif //EXTRACTOR_MONO_FILTERS_STRING_UTIL_H 68 | 
-------------------------------------------------------------------------------- /src/mono/filters/warcfilter.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "warcfilter.h" 3 | #include "../../utils/common.h" 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | 12 | namespace mono { 13 | 14 | namespace filters { 15 | 16 | std::string WARCFilter::do_filter(const std::string &str_) { 17 | std::string str = str_; 18 | 19 | if (str.length() > 0 && str.back() == '\r') 20 | str.pop_back(); 21 | 22 | switch (state) { 23 | 24 | case 0: 25 | // PARSE HEADER IDENTIFIER 26 | if (boost::starts_with(str, warc_guard)) 27 | state = 1; 28 | break; 29 | 30 | case 1: 31 | // PARSE and OUTPUT URI HEADER 32 | if (boost::starts_with(str, uri_guard)) { 33 | std::string found_header = parse_uri_header(str); 34 | if (found_header.length() > 0) { 35 | state = 2; 36 | return found_header; 37 | } else { 38 | state = 0; 39 | } 40 | } else if (str.empty()) { 41 | state = 0; 42 | } 43 | break; 44 | 45 | case 2: 46 | // PARSE END-OF-HEADER NEW LINE 47 | if (str.empty()) 48 | state = 3; 49 | break; 50 | 51 | case 3: 52 | // PARSE AND OUTPUT CONTENT UNTIL HEADER IDENTIFIER FOUND] 53 | if (boost::starts_with(str, "WARC/1.0")) { 54 | state = 1; 55 | } else { 56 | if (!(str.empty())) { 57 | utils::fix_utf8_string(str); 58 | return str + '\n'; 59 | } 60 | } 61 | break; 62 | } 63 | 64 | return ""; 65 | }; 66 | 67 | std::string WARCFilter::make_header(std::string const &uri) { 68 | std::ostringstream ss; 69 | ss << magic_number << " uri:" << boost::trim_left_copy(uri) << '\n'; 70 | return ss.str(); 71 | }; 72 | 73 | std::string WARCFilter::parse_uri_header(std::string const &str) { 74 | std::string str_uri = str.substr(uri_guard.length()); 75 | std::ostringstream ss; 76 | boost::regex expr{"^\\s*([^\\s]+)\\s*$"}; 77 | boost::smatch match; 78 | 79 | if (boost::regex_match(str_uri, match, expr) && match.size() >= 1) { 80 | 
return make_header(match[0]); 81 | } 82 | 83 | return ""; 84 | }; 85 | 86 | } 87 | 88 | } 89 | -------------------------------------------------------------------------------- /src/mono/filters/warcfilter.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef EXTRACTOR_MONO_FILTERS_WARCFILTER_H 3 | #define EXTRACTOR_MONO_FILTERS_WARCFILTER_H 4 | 5 | #include 6 | #include 7 | 8 | 9 | namespace mono { 10 | 11 | namespace filters { 12 | 13 | class WARCFilter : public boost::iostreams::line_filter { 14 | 15 | public: 16 | 17 | int state; 18 | 19 | WARCFilter() : boost::iostreams::line_filter(true), state(0) {}; 20 | 21 | private: 22 | 23 | const std::string warc_guard = "WARC/1.0"; 24 | const std::string uri_guard = "WARC-Target-URI:"; 25 | const std::string magic_number = "df6fa1abb58549287111ba8d776733e9"; 26 | 27 | std::string do_filter(const std::string &str); 28 | 29 | std::string make_header(std::string const &uri); 30 | 31 | std::string parse_uri_header(std::string const &str); 32 | }; 33 | 34 | } 35 | 36 | } 37 | 38 | #endif //EXTRACTOR_MONO_FILTERS_WARCFILTER_H 39 | -------------------------------------------------------------------------------- /src/mono/language_sink.cpp: -------------------------------------------------------------------------------- 1 | 2 | 3 | #include "language_sink.h" 4 | #include "../utils/compression.h" 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | #include 12 | #include 13 | 14 | 15 | namespace mono { 16 | 17 | language_sink::language_sink(std::string output_folder_, utils::compression_option compr_) : output_folder( 18 | output_folder_), compr(compr_) {}; 19 | 20 | void language_sink::output(std::string const &lang, std::string const &text) { 21 | std::unordered_map >::iterator it; 22 | 23 | it = sinkmap.find(lang); 24 | if (it == sinkmap.end()) { 25 | add_language_sink(lang); 26 | } 27 | 28 | std::ostream outf(sinkmap.at(lang).get()); 29 | 
outf.write(text.c_str(), text.size()); 30 | } 31 | 32 | void language_sink::add_language_sink(std::string lang) { 33 | // add new stream to language map 34 | auto out = std::make_shared(); 35 | sinkmap.insert(std::make_pair(lang, out)); 36 | 37 | // flags 38 | std::ios_base::openmode flags = std::ofstream::app; 39 | if (compr == utils::gzip || compr == utils::gzip || compr == utils::gzip || compr == utils::gzip) { 40 | flags |= std::ofstream::binary; 41 | } 42 | 43 | // add file sink with compression 44 | utils::add_compression(sinkmap.at(lang), compr); 45 | std::string ofilesink_path = get_langfile_path(output_folder, lang).string(); 46 | sinkmap.at(lang)->push(boost::iostreams::file_sink(ofilesink_path, flags)); 47 | } 48 | 49 | boost::filesystem::path language_sink::get_langfile_path(std::string folder, std::string lang) { 50 | boost::format path = boost::format("%s/text.%s.%s") % folder % lang % get_compression_extension(compr); 51 | return boost::filesystem::path(path.str()); 52 | } 53 | 54 | } -------------------------------------------------------------------------------- /src/mono/language_sink.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef EXTRACTOR_MONO_LANGUAGE_SINK_H 3 | #define EXTRACTOR_MONO_LANGUAGE_SINK_H 4 | 5 | #include "../utils/compression.h" 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | 14 | namespace mono { 15 | 16 | typedef boost::iostreams::filtering_streambuf ostreambuf; 17 | 18 | class language_sink { 19 | public: 20 | 21 | std::string output_folder; 22 | utils::compression_option compr; 23 | 24 | language_sink(std::string output_folder_, utils::compression_option compr_); 25 | 26 | std::unordered_map > sinkmap; 27 | 28 | void output(std::string const &lang, std::string const &text); 29 | 30 | 31 | private: 32 | 33 | void add_language_sink(std::string lang); 34 | 35 | boost::filesystem::path get_langfile_path(std::string folder, std::string 
lang); 36 | 37 | }; 38 | 39 | } 40 | 41 | 42 | #endif //EXTRACTOR_MONO_LANGUAGE_SINK_H 43 | -------------------------------------------------------------------------------- /src/mono/mono.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "worker.h" 3 | #include "../utils/logging.h" 4 | #include "../utils/common.h" 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | 15 | namespace po = boost::program_options; 16 | 17 | typedef utils::shared_vector shared_vector_string; 18 | 19 | namespace mono { 20 | 21 | long get_ulimit() { 22 | struct rlimit limit; 23 | 24 | if (getrlimit(RLIMIT_NOFILE, &limit) != 0) { 25 | return -1; 26 | } 27 | 28 | return (long) limit.rlim_cur; 29 | } 30 | 31 | void load_data_from_cin(shared_vector_string &data) { 32 | for (std::string path; std::getline(std::cin, path);) { 33 | path = boost::trim_copy(path); 34 | if (path.length() > 0) 35 | data.push(path); 36 | } 37 | data.reverse(); // so that pop_back returns in the original order 38 | } 39 | 40 | void 41 | start(int workers, bool curl, bool print_stats, std::string output_folder, utils::compression_option input_compr, 42 | utils::compression_option output_compr) { 43 | 44 | shared_vector_string files_to_process; 45 | load_data_from_cin(files_to_process); 46 | LOG_INFO << files_to_process.size() << " files found to process."; 47 | 48 | utils::progress prog(files_to_process.size()); 49 | boost::thread_group threads; 50 | for (int id = 0; id < workers; ++id) { 51 | std::string output_folder_thread = output_folder + "/" + std::to_string(id + 1); 52 | boost::filesystem::create_directory(output_folder_thread); 53 | if (print_stats) { 54 | boost::filesystem::create_directory(output_folder_thread + "/stats"); 55 | } 56 | threads.create_thread( 57 | boost::bind(run_worker, &files_to_process, &prog, curl, print_stats, output_folder_thread, input_compr, 58 | output_compr)); 59 | } 60 | 
threads.join_all(); 61 | prog.finish(); 62 | } 63 | 64 | } 65 | 66 | int main(int argc, char *argv[]) { 67 | logging::init(); 68 | 69 | po::options_description desc("Allowed options"); 70 | desc.add_options() 71 | ("help", "produce help message") 72 | ("curl", "uses curl to download remote files") 73 | ("print_stats", "prints language statistics") 74 | ("icompression", po::value(), "set expected input compression") 75 | ("ocompression", po::value(), "set output compression") 76 | ("workers", po::value(), "set the number of workers") 77 | ("output", po::value(), "set the output folder"); 78 | 79 | po::variables_map vm; 80 | po::store(po::parse_command_line(argc, reinterpret_cast(argv), desc), vm); 81 | po::notify(vm); 82 | 83 | if (vm.count("help")) { 84 | std::cout << desc << "\n"; 85 | return 1; 86 | } 87 | 88 | if (vm.count("workers")) { 89 | if (vm["workers"].as() <= 0) { 90 | LOG_ERROR << "The number of workers has to be >= 1"; 91 | throw 11; 92 | } 93 | 94 | LOG_INFO << "The number of workers set to " 95 | << vm["workers"].as() << "."; 96 | } 97 | 98 | // check system resources 99 | long curr_ulimit = mono::get_ulimit(); 100 | long sufficient_limit = 400 * vm["workers"].as(); 101 | LOG_INFO << "Current limit of open file descriptors: " << curr_ulimit; 102 | if (curr_ulimit == -1) { 103 | LOG_ERROR << "Failed to read available system resources!"; 104 | } else if (curr_ulimit < sufficient_limit) { 105 | LOG_ERROR << "The limit of open file descriptors is too low - should be at least: " << sufficient_limit; 106 | throw 19; 107 | } 108 | 109 | utils::compression_option input_compr; 110 | utils::compression_option output_compr; 111 | 112 | if (vm.count("icompression")) { 113 | input_compr = utils::string_to_compression_option(vm["icompression"].as()); 114 | if (input_compr == utils::null || input_compr == utils::lzma) { 115 | LOG_ERROR << "Unsupported input compression option: " << utils::compression_option_to_string(input_compr); 116 | throw 11; 117 | } 118 | 
LOG_INFO << "Expecting input compression: " << utils::compression_option_to_string(input_compr); 119 | } else { 120 | input_compr = utils::none; 121 | LOG_INFO << "Expecting uncompressed input files. "; 122 | } 123 | 124 | if (vm.count("ocompression")) { 125 | output_compr = utils::string_to_compression_option(vm["ocompression"].as()); 126 | if (output_compr == utils::null) { 127 | LOG_ERROR << "Unsupported output compression option: " << vm["ocompression"].as(); 128 | throw 11; 129 | } 130 | LOG_INFO << "Setting output compression: " << utils::compression_option_to_string(output_compr); 131 | } else { 132 | output_compr = utils::none; 133 | LOG_INFO << "Using no compression for output files. "; 134 | } 135 | 136 | if (vm.count("output")) { 137 | boost::filesystem::path output_dir(vm["output"].as()); 138 | if (boost::filesystem::create_directory(output_dir)) { 139 | LOG_INFO << "Outputting to: " << output_dir.string(); 140 | } else { 141 | LOG_ERROR << "The output folder already exists!\n"; 142 | throw 17; 143 | } 144 | } else { 145 | LOG_ERROR << "The output folder was not set!\n"; 146 | throw 10; 147 | } 148 | 149 | if (vm.count("curl")) { 150 | LOG_INFO << "Using curl to download remote files. "; 151 | } else { 152 | LOG_INFO << "Local files will be processed. 
"; 153 | } 154 | 155 | if (vm.count("print_stats")) { 156 | LOG_INFO << "Printing language statistics: On"; 157 | } else { 158 | LOG_INFO << "Printing language statistics: Off"; 159 | } 160 | 161 | 162 | mono::start(vm["workers"].as(), vm.count("curl"), vm.count("print_stats"), vm["output"].as(), 163 | input_compr, output_compr); 164 | 165 | return 0; 166 | 167 | } -------------------------------------------------------------------------------- /src/mono/worker.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "worker.h" 3 | #include "filters/warcfilter.h" 4 | #include "filters/langcollectorfilter.h" 5 | #include "filters/langsplitfilter.h" 6 | #include "../utils/curldownloader.h" 7 | #include "../utils/common.h" 8 | #include "../utils/compression.h" 9 | #include "../utils/logging.h" 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | #include 22 | 23 | 24 | namespace mono { 25 | 26 | void run_worker(shared_vector_string *files_to_process, 27 | utils::progress *prog, bool curl, bool print_stats, std::string output_folder, 28 | utils::compression_option input_compr, 29 | utils::compression_option output_compr) { 30 | 31 | while (files_to_process->size() > 0) { 32 | std::string path = files_to_process->pop(); 33 | if (curl) { 34 | worker_curl(path, print_stats, output_folder, input_compr, output_compr); 35 | } else { 36 | worker_file(path, print_stats, output_folder, input_compr, output_compr); 37 | } 38 | 39 | prog->increment(); 40 | 41 | } 42 | } 43 | 44 | void 45 | worker_file(std::string path, bool print_stats, std::string output_folder, utils::compression_option input_compr, 46 | utils::compression_option output_compr) { 47 | 48 | std::ios_base::openmode flags = std::ofstream::in; 49 | if (input_compr == utils::gzip) { 50 | flags |= std::ofstream::binary; 51 | } 52 | 53 | std::ifstream input_file(path, flags); 54 | if 
(!boost::filesystem::exists(path)) { 55 | std::cerr << "File not found!" << std::endl; 56 | return; 57 | } 58 | 59 | boost::iostreams::filtering_streambuf qin(input_file); 60 | boost::iostreams::filtering_streambuf qout; 61 | 62 | add_decompression(&qout, input_compr); 63 | 64 | qout.push(filters::WARCFilter()); 65 | qout.push(filters::LangsplitFilter(output_folder, print_stats)); 66 | qout.push(filters::LangCollectorFilter(output_folder, output_compr)); 67 | qout.push(boost::iostreams::null_sink()); 68 | 69 | boost::iostreams::copy(qin, qout); 70 | logging::log_done(output_folder, path); 71 | 72 | } 73 | 74 | void 75 | worker_curl(std::string url, bool print_stats, std::string output_folder, utils::compression_option input_compr, 76 | utils::compression_option output_compr) { 77 | 78 | boost::iostreams::filtering_streambuf qout; 79 | 80 | add_decompression(&qout, input_compr); 81 | 82 | qout.push(filters::WARCFilter()); 83 | qout.push(filters::LangsplitFilter(output_folder, print_stats)); 84 | qout.push(filters::LangCollectorFilter(output_folder, output_compr)); 85 | qout.push(boost::iostreams::null_sink()); 86 | 87 | // split input on a single space 88 | std::vector parsed_urls; 89 | boost::algorithm::split(parsed_urls, url, boost::algorithm::is_any_of(" ")); 90 | 91 | std::size_t i = 0; 92 | std::ostream oqout(&qout); 93 | HTTPDownloader downloader; 94 | CURLcode res = downloader.download(parsed_urls.at(i), &oqout); 95 | 96 | // download from other sources if failed 97 | while (res != CURLE_OK) { 98 | std::string error_text = 99 | "Failed to download from: " + parsed_urls.at(i) + " with error: " + curl_easy_strerror(res); 100 | logging::log_error(output_folder, error_text); 101 | 102 | if (++i >= parsed_urls.size()) break; 103 | 104 | res = downloader.download(parsed_urls.at(i), &oqout); 105 | } 106 | 107 | // Not found in any source 108 | if (res != CURLE_OK) { 109 | std::string error_text = "CURL ERROR: " + parsed_urls.at(0) + " failed to process!"; 110 | 
logging::log_error(output_folder, error_text); 111 | } else { 112 | logging::log_done(output_folder, parsed_urls.at(i)); 113 | } 114 | 115 | } 116 | 117 | } 118 | -------------------------------------------------------------------------------- /src/mono/worker.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef EXTRACTOR_MONO_PRODUCER_H 3 | #define EXTRACTOR_MONO_PRODUCER_H 4 | 5 | #include "../utils/common.h" 6 | #include 7 | 8 | 9 | typedef utils::shared_vector shared_vector_string; 10 | 11 | namespace mono { 12 | 13 | void run_worker(shared_vector_string *files_to_process, 14 | utils::progress *prog, bool curl, bool print_stats, std::string output_folder, 15 | utils::compression_option input_compr, 16 | utils::compression_option output_compr); 17 | 18 | void 19 | worker_file(std::string path, bool print_stats, std::string output_folder, utils::compression_option input_compr, 20 | utils::compression_option output_compr); 21 | 22 | void 23 | worker_curl(std::string url, bool print_stats, std::string output_folder, utils::compression_option input_compr, 24 | utils::compression_option output_compr); 25 | 26 | }; 27 | 28 | #endif //EXTRACTOR_MONO_PRODUCER_H 29 | -------------------------------------------------------------------------------- /src/utils/common.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "common.h" 3 | #include "logging.h" 4 | #include "../3rd_party/utf8/source/utf8.h" 5 | 6 | #include 7 | #include 8 | 9 | 10 | namespace utils { 11 | 12 | progress::progress(int total_) : current_progress(0), total(total_) { 13 | std::cout << std::endl; 14 | show_bar(); 15 | } 16 | 17 | void progress::increment() { 18 | boost::lock_guard lock(mutex); 19 | ++current_progress; 20 | show_bar(); 21 | } 22 | 23 | void progress::show_bar() { 24 | int bar_width = 70; 25 | 26 | std::cout << " " << current_progress << "/" << total << " ["; 27 | int pos = bar_width * 
current_progress / float(total); 28 | for (int i = 0; i < bar_width; ++i) { 29 | if (i < pos) 30 | std::cout << "="; 31 | else if (i == pos) 32 | std::cout << ">"; 33 | else 34 | std::cout << " "; 35 | } 36 | std::cout << "] " << int(current_progress / float(total) * 100.0) << " %\r"; 37 | std::cout.flush(); 38 | }; 39 | 40 | void progress::finish() { 41 | std::cout << std::endl; 42 | LOG_INFO << "Done."; 43 | } 44 | 45 | void fix_utf8_string(std::string &str) { 46 | std::string temp; 47 | utf8::replace_invalid(str.begin(), str.end(), back_inserter(temp)); 48 | str = temp; 49 | } 50 | 51 | parse_uri::parse_uri(const std::string &url) : uri_scheme(""), uri_domain(""), uri_tld(""), uri_path("") { 52 | try { 53 | int scheme_end = parse_uri::parse_scheme(0, url); 54 | int domain_end = parse_domains(scheme_end, url); 55 | parse_path(domain_end, url); 56 | } catch (std::exception &e) { 57 | std::cerr << "Error in parse_uri: " << e.what() << std::endl; 58 | uri_domain = url; 59 | } 60 | 61 | } 62 | 63 | int parse_uri::parse_scheme(int start, const std::string &uri) { 64 | std::string uri_stripped = uri.substr(start, uri.length() - start); 65 | int found = uri.find("://"); 66 | if (found != static_cast(std::string::npos)) { 67 | uri_scheme = uri.substr(0, found); 68 | return found + 3; 69 | } 70 | 71 | return 0; 72 | } 73 | 74 | int parse_uri::parse_domains(int start, const std::string &uri) { 75 | // parse authority 76 | int found_authority = uri.find("/", start); 77 | if (found_authority == static_cast(std::string::npos)) 78 | found_authority = uri.length(); 79 | std::string domain = uri.substr(start, found_authority - start); 80 | 81 | // remove user information 82 | int found_user = domain.find("@"); 83 | if (found_user != static_cast(std::string::npos)) { 84 | domain = domain.substr(found_user + 1, domain.length() - found_user - 1); 85 | } 86 | 87 | // remove port 88 | int found_port = domain.find_last_of(":"); 89 | if (found_port != static_cast(std::string::npos)) { 
90 | domain = domain.substr(0, found_port); 91 | } 92 | 93 | // parse tld 94 | int found_tld_start = domain.find_last_of("."); 95 | if (found_tld_start == static_cast(std::string::npos)) 96 | found_tld_start = 0; 97 | else 98 | ++found_tld_start; 99 | std::string tld = domain.substr(found_tld_start, domain.length() - found_tld_start); 100 | 101 | // save 102 | uri_domain = domain; 103 | uri_tld = tld; 104 | 105 | return found_authority + 1; 106 | } 107 | 108 | int parse_uri::parse_path(int start, const std::string &uri) { 109 | if (start >= static_cast(uri.length())) 110 | return uri.length(); 111 | 112 | // parse path 113 | int found_path = uri.find("?", start); 114 | if (found_path == static_cast(std::string::npos)) 115 | found_path = uri.length(); 116 | uri_path = uri.substr(start, found_path - start); 117 | 118 | return found_path; 119 | } 120 | 121 | const std::string &parse_uri::get_scheme() const { 122 | return uri_scheme; 123 | } 124 | 125 | const std::string &parse_uri::get_domain() const { 126 | return uri_domain; 127 | } 128 | 129 | const std::string &parse_uri::get_tld() const { 130 | return uri_tld; 131 | } 132 | 133 | const std::string &parse_uri::get_path() const { 134 | return uri_path; 135 | } 136 | 137 | } -------------------------------------------------------------------------------- /src/utils/common.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef EXTRACTOR_UTILS_COMMON_H 3 | #define EXTRACTOR_UTILS_COMMON_H 4 | 5 | #include "compression.h" 6 | #include 7 | #include 8 | #include 9 | 10 | 11 | namespace utils { 12 | 13 | template 14 | class shared_vector { 15 | public: 16 | 17 | T pop() { 18 | boost::lock_guard lock(mutex); 19 | if (storage.empty()) { 20 | return T(); 21 | } 22 | 23 | std::string val = storage.back(); 24 | storage.pop_back(); 25 | return val; 26 | } 27 | 28 | void push(T val) { 29 | boost::lock_guard lock(mutex); 30 | storage.push_back(val); 31 | } 32 | 33 | void reverse() { 34 
| boost::lock_guard lock(mutex); 35 | std::reverse(storage.begin(), storage.end()); 36 | } 37 | 38 | int size() { 39 | boost::lock_guard lock(mutex); 40 | return storage.size(); 41 | } 42 | 43 | private: 44 | boost::mutex mutex; 45 | std::vector storage; 46 | }; 47 | 48 | 49 | class progress { 50 | public: 51 | 52 | int current_progress; 53 | int total; 54 | 55 | progress(int total_); 56 | 57 | public: 58 | 59 | void increment(); 60 | 61 | void finish(); 62 | 63 | private: 64 | 65 | void show_bar(); 66 | 67 | boost::mutex mutex; 68 | 69 | }; 70 | 71 | class parse_uri { 72 | public: 73 | 74 | std::string uri_scheme; 75 | std::string uri_domain; 76 | std::string uri_tld; 77 | std::string uri_path; 78 | 79 | const std::string &get_scheme() const; 80 | 81 | const std::string &get_domain() const; 82 | 83 | const std::string &get_path() const; 84 | 85 | const std::string &get_tld() const; 86 | 87 | parse_uri(const std::string &uri); 88 | 89 | 90 | private: 91 | 92 | int parse_scheme(int start, const std::string &uri); 93 | 94 | int parse_domains(int start, const std::string &uri); 95 | 96 | int parse_path(int start, const std::string &uri); 97 | 98 | }; 99 | 100 | void fix_utf8_string(std::string &str); 101 | 102 | } 103 | 104 | #endif //EXTRACTOR_UTILS_COMMON_H 105 | -------------------------------------------------------------------------------- /src/utils/compression.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "compression.h" 3 | 4 | #ifdef WITH_LZMA 5 | #include 6 | #endif 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | 14 | namespace utils { 15 | 16 | compression_option string_to_compression_option(std::string str) { 17 | if (str == "gzip") return gzip; 18 | if (str == "bzip2") return bzip2; 19 | if (str == "zlib") return zlib; 20 | if (str == "lzma") return lzma; 21 | if (str == "none") return none; 22 | 23 | return null; 24 | } 25 | 26 | std::string 
compression_option_to_string(compression_option compr) { 27 | if (compr == gzip) return "gzip"; 28 | if (compr == bzip2) return "bzip2"; 29 | if (compr == zlib) return "zlib"; 30 | if (compr == lzma) return "lzma"; 31 | if (compr == none) return "none"; 32 | 33 | return ""; 34 | } 35 | 36 | std::string get_compression_extension(compression_option compr) { 37 | if (compr == gzip) return "gz"; 38 | if (compr == bzip2) return "bz2"; 39 | if (compr == zlib) return "zz"; 40 | if (compr == lzma) return "xz"; 41 | if (compr == none) return "out"; 42 | 43 | return ""; 44 | } 45 | 46 | void add_compression(std::shared_ptr> osb, 47 | utils::compression_option compr) { 48 | 49 | if (compr == utils::gzip) { 50 | osb->push(boost::iostreams::gzip_compressor()); 51 | } 52 | 53 | if (compr == utils::bzip2) { 54 | osb->push(boost::iostreams::bzip2_compressor()); 55 | } 56 | 57 | if (compr == utils::zlib) { 58 | osb->push(boost::iostreams::zlib_compressor()); 59 | } 60 | 61 | #ifdef WITH_LZMA 62 | if (compr == utils::lzma) { 63 | osb->push(boost::iostreams::lzma_compressor()); 64 | } 65 | #endif 66 | } 67 | 68 | void add_decompression(boost::iostreams::filtering_streambuf *osb, 69 | utils::compression_option compr) { 70 | 71 | if (compr == utils::gzip) { 72 | osb->push(boost::iostreams::gzip_decompressor()); 73 | } 74 | 75 | if (compr == utils::bzip2) { 76 | osb->push(boost::iostreams::bzip2_decompressor()); 77 | } 78 | 79 | if (compr == utils::zlib) { 80 | osb->push(boost::iostreams::zlib_decompressor()); 81 | } 82 | 83 | #ifdef WITH_LZMA 84 | if (compr == utils::lzma) { 85 | osb->push(boost::iostreams::lzma_decompressor()); 86 | } 87 | #endif 88 | } 89 | 90 | 91 | } -------------------------------------------------------------------------------- /src/utils/compression.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef EXTRACTOR_UTILS_COMPRESSION_H 3 | #define EXTRACTOR_UTILS_COMPRESSION_H 4 | 5 | #include 6 | #include 7 | 8 | 9 | 
// libcurl write callback: appends one received chunk to the std::ostream that
// was handed over through the user-data pointer.
//
//   ptr     start of the received data (not NUL-terminated)
//   size    size of a single element in bytes
//   nmemb   number of elements in this chunk
//   poqout  pointer to the destination std::ostream
//
// Returns the number of bytes consumed. The chunk is size * nmemb bytes long,
// so that many bytes are written (the previous version wrote only nmemb bytes
// while still reporting size * nmemb). If the stream reports a failure, 0 is
// returned so the caller sees a short write instead of silently losing data.
size_t write_data(void *ptr, size_t size, size_t nmemb, void *poqout) {
    const size_t total_bytes = size * nmemb;
    std::ostream &out = *static_cast<std::ostream *>(poqout);

    out.write(static_cast<const char *>(ptr), static_cast<std::streamsize>(total_bytes));
    if (!out) {
        return 0;
    }

    return total_bytes;
}
EXTRACTOR_UTILS_CURLDOWNLOADER_H 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | 10 | class HTTPDownloader { 11 | 12 | public: 13 | 14 | HTTPDownloader(); 15 | 16 | ~HTTPDownloader(); 17 | 18 | CURLcode download(const std::string &url, std::ostream *poqout); 19 | 20 | 21 | private: 22 | 23 | void *curl; 24 | 25 | }; 26 | 27 | 28 | #endif //EXTRACTOR_UTILS_CURLDOWNLOADER_H 29 | -------------------------------------------------------------------------------- /src/utils/logging.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "logging.h" 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include 13 | 14 | 15 | namespace logging { 16 | 17 | void init() { 18 | boost::log::add_common_attributes(); 19 | boost::log::add_console_log(std::cout, boost::log::keywords::format = 20 | ( 21 | boost::log::expressions::stream 22 | << "(" << boost::log::trivial::severity << ") " 23 | << "[" << boost::log::expressions::format_date_time("TimeStamp", 24 | "%Y-%m-%d %H:%M:%S") 25 | << "]: " 26 | << boost::log::expressions::smessage 27 | )); 28 | 29 | } 30 | 31 | void log_reliable(std::string output_folder, int num_reliable, int num_unreliable) { 32 | std::ofstream logfile; 33 | logfile.open(output_folder + "/" + "langsplit.log", std::ios::out | std::ios::app); 34 | boost::format text = boost::format("reliable:%d unreliable:%d\n") % num_reliable % num_unreliable; 35 | logfile << text.str(); 36 | logfile.close(); 37 | } 38 | 39 | void log_done(std::string output_folder, std::string processed) { 40 | std::ofstream logfile; 41 | logfile.open(output_folder + "/" + "done.log", std::ios::out | std::ios::app); 42 | logfile << std::string(processed + "\n"); 43 | logfile.close(); 44 | } 45 | 46 | void log_error(std::string output_folder, std::string text) { 47 | std::ofstream logfile; 48 | logfile.open(output_folder + "/" + "error.log", std::ios::out | std::ios::app); 49 | 
# Test executables for the extractor.
#
# NOTE(review): ${Boost_LIBRARIES}, ${CURL_LIBRARIES} and the cld2_lib target
# are not defined in this file; presumably the top-level CMakeLists.txt
# provides them before adding this directory - confirm.

# gtest
enable_testing()
# REQUIRED stops configuration when GoogleTest is missing, so no *_FOUND guard
# is needed afterwards (the old guard tested GTest_FOUND, while the find module
# documents GTEST_FOUND).
find_package(GTest REQUIRED)

# test_readerwarc
add_executable(test_readerwarc
        test_readerwarc.cpp
        ../src/mono/filters/warcfilter.cpp
        ../src/utils/common.cpp)
target_include_directories(test_readerwarc PRIVATE ${GTEST_INCLUDE_DIRS})
target_link_libraries(test_readerwarc PRIVATE ${GTEST_BOTH_LIBRARIES} ${Boost_LIBRARIES})
install(TARGETS test_readerwarc DESTINATION tests)

# test_langsplit
add_executable(test_langsplit
        test_langsplit.cpp
        ../src/mono/filters/langsplitfilter.cpp
        ../src/mono/buffered_map.cpp
        ../src/utils/common.cpp
        ../src/utils/logging.cpp)
target_include_directories(test_langsplit PRIVATE ${GTEST_INCLUDE_DIRS})
target_link_libraries(test_langsplit PRIVATE ${GTEST_BOTH_LIBRARIES} ${Boost_LIBRARIES} cld2_lib)
install(TARGETS test_langsplit DESTINATION tests)

# test_integration
add_executable(test_integration
        test_integration.cpp
        ../src/mono/filters/langcollectorfilter.cpp
        ../src/mono/filters/langsplitfilter.cpp
        ../src/mono/filters/warcfilter.cpp
        ../src/mono/language_sink.cpp
        ../src/mono/buffered_map.cpp
        ../src/mono/worker.cpp
        ../src/utils/common.cpp
        ../src/utils/compression.cpp
        ../src/utils/curldownloader.cpp
        ../src/utils/logging.cpp)
target_include_directories(test_integration PRIVATE ${GTEST_INCLUDE_DIRS})
target_link_libraries(test_integration PRIVATE ${GTEST_BOTH_LIBRARIES} ${Boost_LIBRARIES} ${CURL_LIBRARIES} cld2_lib)
install(TARGETS test_integration DESTINATION tests)

# test utils
add_executable(test_utils
        test_utils.cpp
        ../src/utils/common.cpp)
target_include_directories(test_utils PRIVATE ${GTEST_INCLUDE_DIRS})
target_link_libraries(test_utils PRIVATE ${GTEST_BOTH_LIBRARIES} ${Boost_LIBRARIES} ${CURL_LIBRARIES})
install(TARGETS test_utils DESTINATION tests)
9 | Account • 10 | Ordering • 11 | About • 12 | Contact 13 | SHOP BY STYLE 14 | Guys Tees 15 | Girls Tees 16 | Kids Tees 17 | Guys Tanks 18 | Girls Tanks 19 | Sweatshirts 20 | Hoodies 21 | Prints 22 | SHOP BY CATEGORY 23 | New 24 | Funny 25 | Music 26 | Partying 27 | Pop Culture 28 | Politics 29 | Shop All Tees SHOP BY PRICE Tees for $6 30 | Tees for $9 31 | Tees for $12 32 | Clearance Items 33 | SHOP BY STYLE 34 | Guys Tees 35 | Girls Tees 36 | Kids Tees 37 | Guys Tanks 38 | Girls Tanks 39 | Sweatshirts 40 | Hoodies 41 | Prints 42 | SHOP BY CATEGORY 43 | New 44 | Funny 45 | Music 46 | Partying 47 | Pop Culture 48 | Politics 49 | Shop All Tees SHOP BY PRICE Tees for $6 50 | Tees for $9 51 | Tees for $12 52 | Clearance Items 53 | NEW Hell Yeah 54 | New Who 55 | Squid Goals 56 | I Choose Violence 57 | Crystal Ball Buffering 58 | Weed 59 | Lava 60 | Hand Of The Queen 61 | SHOP Shop By Style 62 | Guys Tees 63 | Girls Tees 64 | Kids Tees 65 | Guys Tanks 66 | Girls Tanks 67 | Sweatshirts 68 | Hoodies 69 | Prints 70 | Shop By Category 71 | New Designs 72 | TV & Movies Pets & Animals 73 | Science & Math 74 | Geek & Gaming 75 | Graphic & Vintage 76 | Funny 77 | Food & Coffee 78 | Music 79 | Partying 80 | Pop Culture 81 | Politics 82 | Sports & Wellness 83 | Holidays & Costumes 84 | Shop All Tees Shop By Price 85 | Tees for $6 86 | Tees for $9 87 | Tees for $12 88 | 10 for $50 Tees 89 | Gift Certificates 90 | Clearance 91 | COLLECTIONS Cats 92 | Horror 93 | Gaming Drinking Science Dinosaurs 94 | Superheroes Math 95 | Zombies 96 | Dogs Sharks 97 | Skulls 98 | Literary Historical Music Account 99 | Ordering Info 100 | About Us 101 | Contact Us 102 | TV & Movie Tees 103 | Violent Delights #teeviolentdelightstee 104 | Violent Delights Looking for a fun vacation spot? Be careful what you wish for. 105 | Professionally printed silkscreen 106 | Ships within 2 business days 107 | Designed and printed in the USA See more in: Violent Delights Looking for a fun vacation spot? 
Be careful what you wish for. 108 | • Professionally printed silkscreen 109 | • Ships within 2 business days 110 | • Designed and printed in the USA 111 | Choose Style: Select State 112 | Guys Tee Girls Tee (+$0.50) 113 | Guys Tank (+$5.95) 114 | Girls Tank (+$5.95) 115 | Kids Tee (+$3.50) 116 | Hoodie (+$12.95) 117 | Choose Color: Select State 118 | Black Charcoal Black Black Midnight Lucky Green Gray Heather Navy Heather Deep Ash Brown Heather Chocolate Midnight Royal Heather Black Navy Blue Kelly Green Heather True Navy Red Heather Deep Red Red Sky Blue Kelly Green Choose Size: Size Chart Select State 119 | Small Medium Large X-Large 2X-Large (+$1.00) 120 | 2X-Large (+$2.00) 121 | 2X-Large (+$3.00) 122 | 3X-Large (+$3.00) 123 | 3X-Large (+$2.00) 124 | 2 3 4 5/6 $6,00 125 | Qty 126 | Add to Cart 127 | Violent Delights Looking for a fun vacation spot? Be careful what you wish for. • Professionally printed silkscreen 128 | • Ships within 2 business days 129 | • Designed and printed in the USA 130 | #teeviolentdelightstee 131 | Violent Delights Looking for a fun vacation spot? Be careful what you wish for. 132 | Professionally printed silkscreen 133 | Ships within 2 business days 134 | Designed and printed in the USA $6. Dystopia 135 | $9. Shakespeare On Love 136 | $6. Evolution To Termination 137 | $6. Red Pill, Blue Pill Customer Reviews (0) 138 | DESIGN DETAILS FAN PHOTOS MORE INFO 139 | Product Specs 140 | Size & Fit 141 | Shirt width on our size chart measures from armpit to armpit, across the front only. Shirt length measures from the neck seam to the bottom hem. 142 | While our 100% cotton shirts are pre-shrunk, slight shrinkage may occur, particularly when using heat in washing and drying. Heather colors are a poly-cotton blend that may shrink slightly or not at all, depending on care. 143 | Guys shirts are a loose fit style. Girls shirts are fitted with a tapered waist and cap sleeves. More Questions? 144 | See our Ordering page, or contact us here. 
145 | Add A Review 146 | Submit 147 | Size Chart 148 | GUYS TEE GIRLS TEE GUYS TANK GIRLS TANK KIDS TEE HOODIE SizeWidth Length SMALL 18 28 MEDIUM 20 29 LARGE 22 30 X-LARGE 24 31 2X-LARGE 26 32 3X-LARGE 28 33 Pre-shrunk fabric. • Short set-in sleeves, for relaxed fit. • Solid colors are 100% cotton; Heather colors are 50/50 poly-cotton blend (Grey Heather is a 70% cotton, 30% polyester blend). Size Width Length SMALL 16.5 25 MEDIUM 17.5 25.5 LARGE 18.5 27 X-LARGE 20 28 100% ring-spun cotton.Pre-shrunk fabric.Tapered silhouette.Gray Granite is 90% cotton 10% poly-blend. Size Width Length SMALL 18 27 MEDIUM 20 28.5 LARGE 21.5 28.75 X-LARGE 23 29.5 2X-LARGE 25.5 30.5 100% combed ring-spun cotton.Pre-shrunk fabric. 149 | df6fa1abb58549287111ba8d776733e9 uri:http://6dollarshirts.com/tv-and-movie-tees/violent-delights language:en bytes:1015 150 | Length SMALL 16.25 24.75 MEDIUM 18.5 24.5 LARGE 21.5 25 X-LARGE 22 27 100% combed ring-spun cotton.Pre-shrunk fabric. Size Width Length 2 11 12 3 12 13 4 13 14 5/6 14.5 16 SMALL 17 19 MEDIUM 18 20.5 LARGE 19 21 100% jersey knit cotton.Pre-shrunk fabric.​ Size Width Length SMALL 20 23 MEDIUM 21 25.5 LARGE 23 26 X-LARGE 25 27 2X-LARGE 28.5 27 3X-LARGE 31 27.5 80% ring-spun cotton/ 20% polyester.Pre-shrunk fabric.Jersey-lined hood. 
×Close 151 | - Categories - 152 | New Designs 153 | TV & Movies Pets & Animals 154 | Science & Math 155 | Geek & Gaming 156 | Funny 157 | Graphic & Vintage 158 | Food & Coffee 159 | Music 160 | Partying 161 | Pop Culture 162 | Politics 163 | Sports & Wellness 164 | Holidays & Costumes 165 | Shop All Tees - Styles - 166 | Guys Tees 167 | Girls Tees 168 | Kids Tees 169 | Guys Tanks 170 | Girls Tanks 171 | Sweatshirts 172 | Hoodies 173 | Prints 174 | - Price - 175 | Tees For $6 176 | Tees For $9 Tees For $12 177 | 10 for $50 Tees 178 | My Account 179 | Ordering Info 180 | About Us 181 | Contact Us 182 | Subscribe To Our Newsletter 183 | ©2016 6DOLLARSHIRTS by Thread Pit•Terms•Privacy 184 | /g,'>');l[i].href='mailto:'+t.value}}catch(e){}}}catch(e){}})(document);/* ]]> */ 185 | -------------------------------------------------------------------------------- /tests/data_integration/test1_ja.out: -------------------------------------------------------------------------------- 1 | df6fa1abb58549287111ba8d776733e9 uri:http://673091.av999-tw.net/?PUT=a_show&AID=77418&FID=673091&R2=&CHANNEL= language:ja bytes:29 2 | smg下載-向前走向愛走- 3 | df6fa1abb58549287111ba8d776733e9 uri:http://673091.av999-tw.net/?PUT=a_show&AID=77418&FID=673091&R2=&CHANNEL= language:ja bytes:149 4 | 守護甜心遊戲-冒險王之神兵傳奇無敵版-真愛找麻煩第63集-言葉より大切なもの-切水果手機遊戲-說一句我不走了- 5 | df6fa1abb58549287111ba8d776733e9 uri:http://673091.av999-tw.net/?PUT=a_show&AID=77418&FID=673091&R2=&CHANNEL= language:ja bytes:118 6 | 天空影音分享-嵐きっと大丈夫-羅百吉-雨果的冒險-情歌-四川麻將連連看-驢子下載中文版- 7 | -------------------------------------------------------------------------------- /tests/data_integration/test2.in.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paracrawl/extractor/14d66691b0da4b41ad1e7fe2f27cc9e1ab9cbd58/tests/data_integration/test2.in.gz -------------------------------------------------------------------------------- /tests/data_integration/test2_de.out: 
-------------------------------------------------------------------------------- 1 | df6fa1abb58549287111ba8d776733e9 uri:http://1039500.webexpress.point-s.de/autoservice/wartung_und_service language:de bytes:205 2 | Reifen Meißner GmbH 3 | HomeÜber unsReifenFelgenAutoserviceGeschäftskundenAktuellesTuningFilialenKontaktpoint S CardStellenangeboteKundenzufriedenheitPKWMotorradNutzfahrzeugeUnsere starken MarkenMarkenReife 4 | df6fa1abb58549287111ba8d776733e9 uri:http://1039500.webexpress.point-s.de/autoservice/wartung_und_service language:de bytes:114 5 | AngeboteWartung und ServiceZubehörMobilitiätsgarantieFlottenFlottenbroschürePKW-ServiceNutzfahrzeug-Service24h 6 | df6fa1abb58549287111ba8d776733e9 uri:http://1039500.webexpress.point-s.de/autoservice/wartung_und_service language:de bytes:162 7 | nsporterreifenGrüne ReifenReifen-EinlagerungAuswuchtenWinterreifenLKW & ReisebusseLand- & ForstwirtschaftErdbewegungsmaschinenIndustriereifenLKW-Reifenversicheru 8 | df6fa1abb58549287111ba8d776733e9 uri:http://1039500.webexpress.point-s.de/autoservice/wartung_und_service language:de bytes:297 9 | laketteSchneeketteNutzfahrzeug und Reisebus ServiceErdbewegungsmaschinen ServiceLand- und Forstwirtschaft ServiceIndustriereifen Service Immer für Sie da: 10 | Geschäftsführer Frau Irmtraud Meißner / Herr Lutz Meißner Sie befinden sich hier: Autoservice » Wartung und Service 11 | Wartung und Service 12 | df6fa1abb58549287111ba8d776733e9 uri:http://1039500.webexpress.point-s.de/autoservice/wartung_und_service language:de bytes:460 13 | Hauptuntersuchung*: Die Ausführung sämtlicher Arbeiten am Fahrzeug erfolgt durch permanent geschulte Mitarbeiter und Kfz-Meister. Denn nur sie verfügen über die nötige Erfahrung und Qualifikation.*Die Prüfung erfolgt gem. §29 StVZO, in Ihrem point S Servicecenter zusammen mit staatlich anerkannten Prüforganisationen (welche Prüforganisation in Ihrem Servicecenter tätig ist, erfahren Sie vor Ort). 
14 | NutzungsbedingungenDatenschutzerklärungImpressum 15 | df6fa1abb58549287111ba8d776733e9 uri:http://2016.palaissommer.de/programm/tango-im-park-8/ language:de bytes:135 16 | Mitteilungen 17 | Anfragen 18 | Newsletter abonnieren 19 | Programm 20 | Alle Veranstaltungen 21 | Filmnacht 22 | Hörspielnacht 23 | Klaviernacht 24 | Palais.Poesie 25 | Pleinair 26 | df6fa1abb58549287111ba8d776733e9 uri:http://2016.palaissommer.de/programm/tango-im-park-8/ language:de bytes:1253 27 | Sommerwirtschaft 28 | Schirmherrschaft 29 | Team 30 | News 31 | Partner 32 | Pleinair & Sammlung 33 | Galerie 34 | Canalettopreis 35 | Presse 36 | Aktuelle Mitteilungen 37 | Anfragen 38 | Newsletter abonnieren 39 | Tango im Park 40 | Sonntag, 14.08.2016 | 16:00 41 | Rahmenprogramm 42 | Tango im Park 43 | Der Argentinische Tango ist die ursprünglichere und weniger reglementierte Form des Tanzes der sich vor über hundert Jahren am Rio de la Plata entwickelt hat. Da der Tango seit 2009 zum Weltkulturerbe gehört, bringen wir dieses an die Elbwiesen zurück. 44 | Es ist eine offene Tanzveranstaltung zu traditionellen und auch modernen Tangos. Wir laden jeden zum schwofen, gaffen, genießen, ausprobieren und mitmachen ein. 45 | Danke an Norbert Gust 46 | Das eintrittsfreie Festival Palais Sommer wird neben Sponsoring vor allem auch durch die Spenden der Besucher möglich. Empfehlung: 5-10 € / Spendenbox am Kulturcounter Eingang Elbradweg. 47 | Vielen Dank für eure Unterstützung! 48 | Jetzt spenden! 49 | Newsletter Fördern Sie diese Veranstaltung! 
Mit Ihrer Hilfe kann der Palais Sommer weiterhin eintrittsfrei und ohne staatliche Zuschüsse stattfinden Mehr Informationen Kalender 50 | << 51 | Jul 2017 52 | >> 53 | MDMDFSS 54 | 26 27 28 29 30 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 1 2 3 4 5 6 Nächste VeranstaltungenKeine 55 | df6fa1abb58549287111ba8d776733e9 uri:http://24check-versicherung.com/news-archiv-002/fleckig-verfarbt-verfilzt.html language:de bytes:3169 56 | Fleckig, verfärbt, verfilzt: Textilreiniger haften nur bei eigenen Fehlern | 24CHECK-VERSICHERUNG | HOLZ 57 | 24CHECK-VERSICHERUNG 58 | Site Navigation[Skip] 59 | 24CHECK 60 | NEWS 61 | HANDY 62 | VORSORGE 63 | ENGLISCHE LEBENSVERSICHERUNG 64 | BERUFSUNFÄHIGKEIT 65 | UNFALLVERSICHERUNG 66 | LEBENSVERSICHERUNG 67 | RENTENVERSICHERUNG 68 | RIESTER-RENTE 69 | RÜRUP-RENTE 70 | RISIKO-LEBENSVERSICHERUNG 71 | PKV 72 | PRIVATE KRANKEN-VERSICHERUNG 73 | PKV BEAMTE 74 | PKV STUDENTEN 75 | PKV ÜBER 55 76 | PRIVATE KRANKENZUSATZ-VERSICHERUNG 77 | SACHVERSICHERUNG 78 | WOHNGEBÄUDE 79 | FIRMENVERSICHERUNG 80 | KFZ VERSICHERUNG VERGLEICH 81 | HAUS- UND GRUNDBESITZ 82 | HAUSRAT 83 | PRIVAT-HAFTPFLICHT 84 | RECHTSSCHUTZ-VERSICHERUNG 85 | TIERHALTER 86 | MOTORRAD-VERSICHERUNG 87 | FINANZEN 88 | BAUFINANZIERUNG 89 | AUTOFINANZIERUNG 90 | KREDIT VERGLEICH | RATENKREDIT | KREDITFINANZIERUNG 91 | Allgemeine 24CHECK - Informationen zu Krediten und Vergleichen 92 | FONDS 93 | KREDITKARTEN VERGLEICH 94 | SUCHEN 95 | TIPPS 96 | SITEMAP 97 | Fleckig, verfärbt, verfilzt: Textilreiniger haften nur bei eigenen Fehlern 98 | R+V-Infocenter: Verbraucher haben Beweispflicht 99 | Wiesbaden (ots) - Waschmaschine oder Reinigung? Bei teuren 100 | und empfindlichen Kleidungsstücken fällt die Wahl meist auf eine 101 | Textilreinigung. Doch was tun, wenn der Anzug danach noch mehr Flecken hat oder 102 | das Lieblingskleid verfilzt ist? 
"Der Kunde muss beweisen, dass die 103 | Reinigung den Schaden verursacht und auch verschuldet hat", sagt Sonja 104 | Biorac, Haftpflichtexpertin beim Infocenter der R+V Versicherung. Doch nur in 105 | jedem dritten Fall haben tatsächlich die Textilreiniger den Schaden zu 106 | vertreten. Fast ebenso häufig geht er auf Fehler der Hersteller zurück, etwa 107 | wenn das Kleidungsstück falsch etikettiert ist. 108 | Gut für die Kunden ist, wenn die Reinigung bereit ist, den 109 | Schaden auf Kulanzbasis zu regeln. Ist dies nicht der Fall, muss der 110 | Geschädigte dem Betrieb nachweisen, dass dieser einen Fehler gemacht hat. 111 | "Oft geht das nur mit einem Gutachten, zum Beispiel von einer Schiedstelle 112 | für Textilpflege", so R+V-Expertin Biorac. Zwischen 20 und 60 Euro kostet 113 | die Beurteilung durch die Experten. Stellt sich dabei heraus, dass die 114 | Pflegehinweise falsch sind, können die Kunden beim Verkäufer reklamieren. 115 | Dieser haftet zwei Jahre für die Kleidung. Voraussetzung ist aber, dass das 116 | Etikett nicht herausgeschnitten wurde. 117 | Bei rund jeder dritten Reklamation entscheiden die 118 | Schiedsstellen zu Ungunsten der Verbraucher: beispielsweise wenn die Kleidung 119 | mit Säure in Kontakt gekommen ist oder der Kunde an den Flecken gerieben und 120 | den Stoff beschädigt hat. 121 | Wenn die Reinigung für den Schaden verantwortlich ist, 122 | erstattet sie normalerweise nicht den kompletten Wert des Kleidungsstücks. 123 | "Meistens ist die Haftung auf den 15fachen Reinigungspreis begrenzt", 124 | so Sonja Biorac. Selbst für ein 800 Euro teures Markenkostüm steht dem Kunden 125 | also womöglich deutlich weniger als der halbe Kaufpreis zu. Viele Reinigungen 126 | bieten zusätzlich eine Versicherung an, die im Schadenfall den Zeitwert ersetzt 127 | - für neue wertvolle Kleidungsstücke kann sich das lohnen. 
128 | http://www.infocenter.ruv.de 129 | Pressekontakt: 130 | R+V-Infocenter 131 | 06172/9022-131 132 | a.kassubek@arts-others.de Zurück zur Übersicht 133 | Dienstag, 20. März 2012 21:23 134 | © HOLZ MMC 2006 - 2014 - DEUTSCHLANDS GROSSES VERGLEICHSPORTAL - ALLE VERGLEICHE 135 | -------------------------------------------------------------------------------- /tests/data_integration/test3.in: -------------------------------------------------------------------------------- 1 | WARC/1.0 2 | WARC-Type: warcinfo 3 | WARC-Date: 2017-10-13T10:20:04Z 4 | WARC-Filename: text.warc.wet.gz 5 | WARC-Record-ID: 6 | Content-Type: application/warc-fields 7 | Content-Length: 110 8 | 9 | Software-Info: ia-web-commons.1.1.9-SNAPSHOT-20170811025357 10 | Extracted-Date: Fri, 13 Oct 2017 10:20:04 GMT 11 | 12 | 13 | 14 | WARC/1.0 15 | WARC-Type: conversion 16 | WARC-Target-URI: http://01publishing.com/tag/bondage/ 17 | WARC-Date: 2017-07-28T18:53:51Z 18 | WARC-Record-ID: 19 | WARC-Refers-To: 20 | WARC-Block-Digest: sha1:MXOMDGYOE3HNU2AFOXOLP7WJZDWYRI53 21 | Content-Type: text/plain 22 | Content-Length: 26 23 | 24 | Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. 25 | 26 | -------------------------------------------------------------------------------- /tests/data_integration/test3_en.out: -------------------------------------------------------------------------------- 1 | df6fa1abb58549287111ba8d776733e9 uri:http://01publishing.com/tag/bondage/ language:en bytes:246 2 | Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. 
3 | -------------------------------------------------------------------------------- /tests/data_langsplit/test1.in: -------------------------------------------------------------------------------- 1 | df6fa1abb58549287111ba8d776733e9 uri:http://917ssmys.measychina.cn/za/11hbg/ 2 | 海报psd素材_四驱车 田宫 绝版_植物百科通 3 | 新闻资讯 4 | 植物科学 5 | 家庭养花 6 | 园林养护 7 | 植物图片 8 | df6fa1abb58549287111ba8d776733e9 uri:http://02repair.blog.fc2.com/blog-entry-192.html 9 | コラージュ・イン・ア・ボトル スタカニ用ATC作成中☆ 10 | コラージュ・イン・ア・ボトル 11 | collages in a bottle 12 | スポンサーサイト 13 | 上記の広告は1ヶ月以上更新のないブログに表示されています。新しい記事を書く事で広告が消せます。 -------- : スポンサー広告 : このページのトップへ 14 | スタカニ用ATC作成中☆ -------------------------------------------------------------------------------- /tests/data_langsplit/test1.out: -------------------------------------------------------------------------------- 1 | df6fa1abb58549287111ba8d776733e9 uri:http://917ssmys.measychina.cn/za/11hbg/ language:zh bytes:112 2 | 素材_四驱车 田宫 绝版_植物百科通 3 | 新闻资讯 4 | 植物科学 5 | 家庭养花 6 | 园林养护 7 | 植物图片 8 | 9 | df6fa1abb58549287111ba8d776733e9 uri:http://02repair.blog.fc2.com/blog-entry-192.html language:ja bytes:58 10 | コラージュ・イン・ア・ボトル スタカニ用 11 | df6fa1abb58549287111ba8d776733e9 uri:http://02repair.blog.fc2.com/blog-entry-192.html language:ja bytes:56 12 | 作成中☆ 13 | コラージュ・イン・ア・ボトル 14 | 15 | df6fa1abb58549287111ba8d776733e9 uri:http://02repair.blog.fc2.com/blog-entry-192.html language:en bytes:21 16 | collages in a bottle 17 | 18 | df6fa1abb58549287111ba8d776733e9 uri:http://02repair.blog.fc2.com/blog-entry-192.html language:ja bytes:248 19 | スポンサーサイト 20 | 上記の広告は1ヶ月以上更新のないブログに表示されています。新しい記事を書く事で広告が消せます。 -------- : スポンサー広告 : このページのトップへ 21 | スタカニ用 22 | -------------------------------------------------------------------------------- /tests/data_langsplit/test1_langstats.out: -------------------------------------------------------------------------------- 1 | 02repair.blog.fc2.com en 21 2 | 02repair.blog.fc2.com ja 362 3 | 917ssmys.measychina.cn zh 112 4 | 
-------------------------------------------------------------------------------- /tests/data_readerwarc/test1.in: -------------------------------------------------------------------------------- 1 | WARC/1.0 2 | WARC-Type: conversion 3 | WARC-Target-URI: http://statmt.org/ 4 | WARC-Date: 2017-07-20T12:39:49Z 5 | WARC-Record-ID: 6 | WARC-Refers-To: 7 | WARC-Block-Digest: sha1:UZJYKZZJH3QI3RMVNKBAHMU3VKEFZGS2 8 | Content-Type: text/plain 9 | Content-Length: 6598 10 | 11 | This website is dedicated to research in statistical machine translation, i.e. the translation of text from one human language to another by a computer that learned how to translate from vast amounts of translated text. 12 | -------------------------------------------------------------------------------- /tests/data_readerwarc/test1.out: -------------------------------------------------------------------------------- 1 | df6fa1abb58549287111ba8d776733e9 uri:http://statmt.org/ 2 | This website is dedicated to research in statistical machine translation, i.e. the translation of text from one human language to another by a computer that learned how to translate from vast amounts of translated text. 3 | -------------------------------------------------------------------------------- /tests/data_readerwarc/test2.in: -------------------------------------------------------------------------------- 1 | WARC/1.0 2 | WARC-Type: conversion 3 | WARC-Target-URI: http://statmt.org/ 4 | WARC-Date: 2017-07-20T12:39:49Z 5 | WARC-Record-ID: 6 | WARC-Refers-To: 7 | WARC-Block-Digest: sha1:UZJYKZZJH3QI3RMVNKBAHMU3VKEFZGS2 8 | Content-Type: text/plain 9 | Content-Length: 6598 10 | 11 | This website is dedicated to research in statistical machine translation, i.e. the translation of text from one human language to another by a computer that learned how to translate from vast amounts of translated text. 
12 | 13 | WARC/1.0 14 | WARC-Type: conversion 15 | WARC-Date: 2017-07-20T12:39:49Z 16 | WARC-Record-ID: 17 | WARC-Refers-To: 18 | Content-Type: text/plain 19 | Content-Length: 6598 20 | 21 | Moses is a statistical machine translation system that allows you to automatically train translation models for any language pair. 22 | All you need is a collection of translated texts (parallel corpus). 23 | Once you have a trained model, an efficient search algorithm quickly finds the highest probability translation among the exponential number of choices. 24 | 25 | WARC/1.0 26 | WARC-Type: conversion 27 | WARC-Target-URI:http://www.fjoch.com/GIZA++.html 28 | WARC-Date: 2017-07-20T12:39:49Z 29 | WARC-Block-Digest: sha1:UZJYKZZJH3QI3RMVNKBAHMU3VKEFZGS2 30 | Content-Type: text/plain 31 | Content-Length: 6598 32 | 33 | GIZA++ is an extension of the program GIZA (part of the SMT toolkit EGYPT) which was developed by the Statistical Machine Translation team during the summer workshop in 1999 at the Center for Language and Speech Processing at Johns-Hopkins University (CLSP/JHU). 34 | GIZA++ includes a lot of additional features. 35 | The extensions of GIZA++ were designed and written by Franz Josef Och. 36 | -------------------------------------------------------------------------------- /tests/data_readerwarc/test2.out: -------------------------------------------------------------------------------- 1 | df6fa1abb58549287111ba8d776733e9 uri:http://statmt.org/ 2 | This website is dedicated to research in statistical machine translation, i.e. the translation of text from one human language to another by a computer that learned how to translate from vast amounts of translated text. 
3 | df6fa1abb58549287111ba8d776733e9 uri:http://www.fjoch.com/GIZA++.html 4 | GIZA++ is an extension of the program GIZA (part of the SMT toolkit EGYPT) which was developed by the Statistical Machine Translation team during the summer workshop in 1999 at the Center for Language and Speech Processing at Johns-Hopkins University (CLSP/JHU). 5 | GIZA++ includes a lot of additional features. 6 | The extensions of GIZA++ were designed and written by Franz Josef Och. 7 | -------------------------------------------------------------------------------- /tests/data_readerwarc/test3.in: -------------------------------------------------------------------------------- 1 | WARC/1.0 2 | WARC-Type: conversion 3 | WARC-Target-URI: 4 | WARC-Date: 2017-07-20T12:39:49Z 5 | WARC-Record-ID: 6 | WARC-Refers-To: 7 | WARC-Block-Digest: sha1:UZJYKZZJH3QI3RMVNKBAHMU3VKEFZGS2 8 | Content-Type: text/plain 9 | Content-Length: 6598 10 | 11 | This website is dedicated to research in statistical machine translation, i.e. the translation of text from one human language to another by a computer that learned how to translate from vast amounts of translated text. 12 | 13 | WARC/1.0 14 | WARC-Type: conversion 15 | WARC-Target-URI: uri uri 16 | WARC-Date: 2017-07-20T12:39:49Z 17 | WARC-Block-Digest: sha1:UZJYKZZJH3QI3RMVNKBAHMU3VKEFZGS2 18 | Content-Type: text/plain 19 | Content-Length: 6598 20 | 21 | GIZA++ is an extension of the program GIZA (part of the SMT toolkit EGYPT) which was developed by the Statistical Machine Translation team during the summer workshop in 1999 at the Center for Language and Speech Processing at Johns-Hopkins University (CLSP/JHU). 22 | GIZA++ includes a lot of additional features. 23 | The extensions of GIZA++ were designed and written by Franz Josef Och. 
24 | 25 | WARC/1.0 26 | WARC-Type: conversion 27 | WARC-Date: 2017-07-20T12:39:49Z 28 | WARC-Block-Digest: sha1:UZJYKZZJH3QI3RMVNKBAHMU3VKEFZGS2 29 | Content-Type: text/plain 30 | Content-Length: 6598 31 | 32 | The first shared task which will examine translation between the following language pairs: 33 | 34 | English-German and German-English 35 | English-French and French-English 36 | English-Hindi and Hindi-English NEW 37 | English-Czech and Czech-English 38 | English-Russian and Russian-English 39 | 40 | Participants may submit translations for any or all of the language directions. In addition to the common test sets the workshop organizers will provide optional training resources, including a newly expanded release of the Europarl corpora and out-of-domain corpora. 41 | -------------------------------------------------------------------------------- /tests/data_readerwarc/test3.out: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paracrawl/extractor/14d66691b0da4b41ad1e7fe2f27cc9e1ab9cbd58/tests/data_readerwarc/test3.out -------------------------------------------------------------------------------- /tests/data_readerwarc/test4.in: -------------------------------------------------------------------------------- 1 | WARC/1.0 2 | WARC-Type: conversion 3 | WARC-Target-URI: http://statmt.org/europarl/ 4 | WARC-Date: 2017-07-20T12:39:49Z 5 | WARC-Record-ID: 6 | WARC-Refers-To: 7 | WARC-Block-Digest: sha1:UZJYKZZJH3QI3RMVNKBAHMU3VKEFZGS2 8 | Content-Type: text/plain 9 | 10 | For a detailed description of this corpus, please read: 11 | 12 | Europarl: A Parallel Corpus for Statistical Machine Translation, Philipp Koehn, MT Summit 2005, pdf. 13 | 14 | Please cite the paper, if you use this corpus in your work. See also the extended (but earlier) version of the report (ps, pdf). 15 | 16 | The Europarl parallel corpus is extracted from the proceedings of the European Parliament. 
17 | It includes versions in 21 European languages: Romanic (French, Italian, Spanish, Portuguese, Romanian), Germanic (English, Dutch, German, Danish, Swedish), Slavik (Bulgarian, Czech, Polish, Slovak, Slovene), Finni-Ugric (Finnish, Hungarian, Estonian), Baltic (Latvian, Lithuanian), and Greek. 18 | http://statmt.org/europarl/ 19 | 20 | The goal of the extraction and processing was to generate sentence aligned text for statistical machine translation systems. For this purpose we extracted matching items and labeled them with corresponding document IDs. Using a preprocessor we identified sentence boundaries. 21 | We sentence aligned the data using a tool based on the Church and Gale algorithm. 22 | 23 | WARC/1.0 24 | WARC-Type: conversion 25 | WARC-Target-URI: http://statmt.org/ 26 | WARC-Date: 2017-07-20T12:39:49Z 27 | WARC-Record-ID: 28 | WARC-Refers-To: 29 | WARC-Block-Digest: sha1:UZJYKZZJH3QI3RMVNKBAHMU3VKEFZGS2 30 | Content-Type: text/plain 31 | Content-Length: 6598 32 | 33 | This website is dedicated to research in statistical machine translation, i.e. the translation of text from one human language to another by a computer that learned how to translate from vast amounts of translated text. 34 | -------------------------------------------------------------------------------- /tests/data_readerwarc/test4.out: -------------------------------------------------------------------------------- 1 | df6fa1abb58549287111ba8d776733e9 uri:http://statmt.org/europarl/ 2 | For a detailed description of this corpus, please read: 3 | Europarl: A Parallel Corpus for Statistical Machine Translation, Philipp Koehn, MT Summit 2005, pdf. 4 | Please cite the paper, if you use this corpus in your work. See also the extended (but earlier) version of the report (ps, pdf). 5 | The Europarl parallel corpus is extracted from the proceedings of the European Parliament. 
6 | It includes versions in 21 European languages: Romanic (French, Italian, Spanish, Portuguese, Romanian), Germanic (English, Dutch, German, Danish, Swedish), Slavik (Bulgarian, Czech, Polish, Slovak, Slovene), Finni-Ugric (Finnish, Hungarian, Estonian), Baltic (Latvian, Lithuanian), and Greek. 7 | http://statmt.org/europarl/ 8 | The goal of the extraction and processing was to generate sentence aligned text for statistical machine translation systems. For this purpose we extracted matching items and labeled them with corresponding document IDs. Using a preprocessor we identified sentence boundaries. 9 | We sentence aligned the data using a tool based on the Church and Gale algorithm. 10 | df6fa1abb58549287111ba8d776733e9 uri:http://statmt.org/ 11 | This website is dedicated to research in statistical machine translation, i.e. the translation of text from one human language to another by a computer that learned how to translate from vast amounts of translated text. 12 | -------------------------------------------------------------------------------- /tests/data_readerwarc/test5.in: -------------------------------------------------------------------------------- 1 | WARC/1.1 2 | WARC-Type: conversion 3 | WARC-Target-URI: http://statmt.org/ 4 | WARC-Date: 2017-07-20T12:39:49Z 5 | WARC-Record-ID: 6 | WARC-Refers-To: 7 | Content-Type: text/plain 8 | Content-Length: 6598 9 | 10 | This website is dedicated to research in statistical machine translation, i.e. the translation of text from one human language to another by a computer that learned how to translate from vast amounts of translated text. 11 | 12 | WARC-Type: conversion 13 | WARC/1.0 14 | WARC-Target-URI: http://statmt.org/europarl/ 15 | WARC-Date: 2017-07-20T12:39:49Z 16 | WARC-Record-ID: 17 | WARC-Refers-To: 18 | WARC-Block-Digest: sha1:UZJYKZZJH3QI3RMVNKBAHMU3VKEFZGS2 19 | Content-Type: text/plain 20 | 21 | The Europarl parallel corpus is extracted from the proceedings of the European Parliament. 
22 | It includes versions in 21 European languages: Romanic (French, Italian, Spanish, Portuguese, Romanian), Germanic (English, Dutch, German, Danish, Swedish), Slavik (Bulgarian, Czech, Polish, Slovak, Slovene), Finni-Ugric (Finnish, Hungarian, Estonian), Baltic (Latvian, Lithuanian), and Greek. 23 | 24 | WARC/1.1 25 | 26 | The goal of the extraction and processing was to generate sentence aligned text for statistical machine translation systems. 27 | For this purpose we extracted matching items and labeled them with corresponding document IDs. Using a preprocessor we identified sentence boundaries. 28 | We sentence aligned the data using a tool based on the Church and Gale algorithm. 29 | WARC/1.0 30 | WARC-Type: conversion 31 | WARC-Target-URI: http://statmt.org/wmt14/ 32 | WARC-Date: 2017-07-20T12:39:49Z 33 | WARC-Record-ID: 34 | WARC-Refers-To: 35 | Content-Type: text/plain 36 | Content-Length: 6598 37 | 38 | This workshop builds on eight previous workshops on statistical machine translation, which is one of the most prestigious venues for research in computational linguistics: 39 | -------------------------------------------------------------------------------- /tests/data_readerwarc/test5.out: -------------------------------------------------------------------------------- 1 | df6fa1abb58549287111ba8d776733e9 uri:http://statmt.org/europarl/ 2 | The Europarl parallel corpus is extracted from the proceedings of the European Parliament. 3 | It includes versions in 21 European languages: Romanic (French, Italian, Spanish, Portuguese, Romanian), Germanic (English, Dutch, German, Danish, Swedish), Slavik (Bulgarian, Czech, Polish, Slovak, Slovene), Finni-Ugric (Finnish, Hungarian, Estonian), Baltic (Latvian, Lithuanian), and Greek. 4 | WARC/1.1 5 | The goal of the extraction and processing was to generate sentence aligned text for statistical machine translation systems. 
6 | For this purpose we extracted matching items and labeled them with corresponding document IDs. Using a preprocessor we identified sentence boundaries. 7 | We sentence aligned the data using a tool based on the Church and Gale algorithm. 8 | df6fa1abb58549287111ba8d776733e9 uri:http://statmt.org/wmt14/ 9 | This workshop builds on eight previous workshops on statistical machine translation, which is one of the most prestigious venues for research in computational linguistics: 10 | -------------------------------------------------------------------------------- /tests/test_integration.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "gtest/gtest.h" 3 | #include "../src/mono/worker.h" 4 | #include "../src/utils/common.h" 5 | #include "boost/filesystem/fstream.hpp" 6 | #include "boost/filesystem/operations.hpp" 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | 15 | typedef std::vector> pair_files_vec; 16 | 17 | 18 | void compare_lengths(std::string output_file, std::string expected_file) { 19 | std::cout << "Comparing: " << output_file << " and " << expected_file << std::endl; 20 | 21 | std::ifstream ja_output(output_file); 22 | std::ifstream ja_expected_output("../../tests/data_integration/" + expected_file); 23 | 24 | if (!ja_output.is_open()) { 25 | std::cerr << "Output file not found!" << std::endl; 26 | FAIL(); 27 | } 28 | 29 | if (!ja_expected_output.is_open()) { 30 | std::cerr << "Expected file not found!" 
<< std::endl; 31 | FAIL(); 32 | } 33 | 34 | std::stringstream ss1; 35 | ss1 << ja_output.rdbuf(); 36 | ja_output.close(); 37 | 38 | std::stringstream ss2; 39 | ss2 << ja_expected_output.rdbuf(); 40 | ja_expected_output.close(); 41 | 42 | ASSERT_EQ(ss1.str().length(), ss2.str().length()); 43 | } 44 | 45 | void compare(std::string test_name, std::string input_file, std::string output_folder, pair_files_vec test_files, 46 | utils::compression_option in_compr, utils::compression_option out_compr) { 47 | 48 | std::string test1_input_path = std::string("../../tests/data_integration/") + input_file; 49 | 50 | if (!boost::filesystem::exists(test1_input_path)) { 51 | std::cerr << "Input file not found!" << std::endl; 52 | FAIL(); 53 | } 54 | 55 | boost::filesystem::path output_dir(output_folder); 56 | 57 | if (boost::filesystem::create_directory(boost::filesystem::path(output_dir.string() + "/" + test_name))) { 58 | 59 | mono::worker_file(test1_input_path, false, output_folder + "/" + test_name, in_compr, out_compr); 60 | for (auto a: test_files) { 61 | compare_lengths(output_folder + "/" + test_name + "/" + a.first, a.second); 62 | } 63 | 64 | } else { 65 | std::cerr << "Output folder already exists! Please remove: " << output_folder << std::endl; 66 | FAIL(); 67 | } 68 | 69 | } 70 | 71 | 72 | TEST(integration, test_simple_gzip_to_langsplit) { 73 | std::string output_dir = "test_dir_integration"; 74 | if (!boost::filesystem::create_directory(output_dir)) { 75 | std::cerr << "Output folder already exists! 
Please remove: " << output_dir << std::endl; 76 | FAIL(); 77 | } 78 | 79 | pair_files_vec test1_files; 80 | test1_files.push_back(std::make_pair("text.ja.out", "test1_ja.out")); 81 | test1_files.push_back(std::make_pair("text.en.out", "test1_en.out")); 82 | 83 | pair_files_vec test2_files; 84 | test2_files.push_back(std::make_pair("text.de.out", "test2_de.out")); 85 | test2_files.push_back(std::make_pair("text.ru.out", "test2_ru.out")); 86 | 87 | pair_files_vec test3_files; 88 | test3_files.push_back(std::make_pair("text.en.out", "test3_en.out")); 89 | 90 | compare("test1", "test1.in.gz", output_dir, test1_files, utils::gzip, utils::none); 91 | compare("test2", "test2.in.gz", output_dir, test2_files, utils::gzip, utils::none); 92 | compare("test3", "test3.in", output_dir, test3_files, utils::none, utils::none); 93 | } -------------------------------------------------------------------------------- /tests/test_langsplit.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "gtest/gtest.h" 3 | #include "../src/mono/filters/langsplitfilter.h" 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | #include 12 | #include 13 | #include "boost/filesystem/operations.hpp" 14 | #include 15 | 16 | 17 | void compare(std::string input_file, std::string expected_result_file) { 18 | // load files 19 | std::ifstream test1_input(std::string("../../tests/data_langsplit/") + input_file); 20 | std::ifstream test1_expected_output(std::string("../../tests/data_langsplit/") + expected_result_file); 21 | 22 | if (!test1_input.is_open()) { 23 | std::cerr << "Input file not found!" << std::endl; 24 | FAIL(); 25 | } 26 | 27 | if (!test1_expected_output.is_open()) { 28 | std::cerr << "Output file not found!" 
<< std::endl; 29 | FAIL(); 30 | } 31 | 32 | // apply filter 33 | std::stringstream output; 34 | mono::filters::LangsplitFilter langsplitFilter = mono::filters::LangsplitFilter("../../tests/data_langsplit/", false); 35 | boost::iostreams::filtering_streambuf in(test1_input); 36 | boost::iostreams::filtering_streambuf out; 37 | out.push(langsplitFilter); 38 | out.push(output); 39 | 40 | boost::iostreams::copy(in, out); 41 | 42 | // compare outputs 43 | std::stringstream ss2; 44 | ss2 << test1_expected_output.rdbuf(); 45 | ASSERT_EQ(output.str(), ss2.str()); 46 | 47 | test1_input.close(); 48 | test1_expected_output.close(); 49 | } 50 | 51 | void compare_stats(std::string input_file, std::string output_dir, std::string input_stat_file, 52 | std::string expected_result_file) { 53 | // load files 54 | std::ifstream test1_input(std::string("../../tests/data_langsplit/") + input_file); 55 | std::ifstream test1_expected_output(std::string("../../tests/data_langsplit/") + expected_result_file); 56 | 57 | if (!test1_input.is_open()) { 58 | std::cerr << "Input file not found!" << std::endl; 59 | FAIL(); 60 | } 61 | 62 | if (!test1_expected_output.is_open()) { 63 | std::cerr << "Output file not found!" << std::endl; 64 | FAIL(); 65 | } 66 | 67 | // apply filter 68 | boost::filesystem::create_directory(output_dir + "/stats"); 69 | mono::filters::LangsplitFilter langsplitFilter = mono::filters::LangsplitFilter(output_dir, true); 70 | boost::iostreams::filtering_streambuf in(test1_input); 71 | boost::iostreams::filtering_streambuf out; 72 | out.push(langsplitFilter); 73 | out.push(boost::iostreams::null_sink()); 74 | 75 | boost::iostreams::copy(in, out); 76 | 77 | // read langstats 78 | std::ifstream langstats_input(output_dir + "/" + input_stat_file, std::ios_base::in | std::ios_base::binary); 79 | if (!langstats_input.is_open()) { 80 | std::cerr << "langstat file not found!" 
<< std::endl; 81 | FAIL(); 82 | } 83 | std::stringstream langstats_output; 84 | boost::iostreams::filtering_streambuf qin(langstats_input); 85 | boost::iostreams::filtering_streambuf qout; 86 | qout.push(boost::iostreams::gzip_decompressor()); 87 | qout.push(langstats_output); 88 | 89 | boost::iostreams::copy(qin, qout); 90 | 91 | // compare outputs 92 | std::stringstream ss2; 93 | ss2 << test1_expected_output.rdbuf(); 94 | ASSERT_EQ(langstats_output.str(), ss2.str()); 95 | 96 | test1_input.close(); 97 | test1_expected_output.close(); 98 | } 99 | 100 | 101 | TEST(langsplit, test_simple) { 102 | std::string output_dir = "test_dir_langsplit"; 103 | if (!boost::filesystem::create_directory(output_dir)) { 104 | std::cerr << "Output folder already exists! Please remove: " << output_dir << std::endl; 105 | FAIL(); 106 | } 107 | 108 | compare("test1.in", "test1.out"); 109 | compare_stats("test1.in", output_dir, "stats/langstats.0.gz", "test1_langstats.out"); 110 | compare_stats("test1.in", output_dir, "stats/langstats.1.gz", "test1_langstats.out"); 111 | 112 | } 113 | -------------------------------------------------------------------------------- /tests/test_readerwarc.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "gtest/gtest.h" 3 | #include "../src/mono/filters/warcfilter.h" 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | #include 12 | #include 13 | 14 | 15 | void compare(std::string input_file, std::string expected_result_file) { 16 | // load files 17 | std::ifstream test1_input(std::string("../../tests/data_readerwarc/") + input_file); 18 | std::ifstream test1_expected_output(std::string("../../tests/data_readerwarc/") + expected_result_file); 19 | 20 | if (!test1_input.is_open()) { 21 | std::cerr << "Input file not found!" << std::endl; 22 | FAIL(); 23 | } 24 | 25 | if (!test1_expected_output.is_open()) { 26 | std::cerr << "Output file not found!" 
<< std::endl; 27 | FAIL(); 28 | } 29 | 30 | // apply filter 31 | std::stringstream output; 32 | boost::iostreams::filtering_streambuf in(test1_input); 33 | boost::iostreams::filtering_streambuf out; 34 | out.push(mono::filters::WARCFilter()); 35 | out.push(output); 36 | boost::iostreams::copy(in, out); 37 | 38 | // compare outputs 39 | std::stringstream ss2; 40 | ss2 << test1_expected_output.rdbuf(); 41 | ASSERT_EQ(output.str(), ss2.str()); 42 | 43 | test1_input.close(); 44 | test1_expected_output.close(); 45 | } 46 | 47 | 48 | TEST(readerwarc, test_simple) { 49 | compare("test1.in", "test1.out"); 50 | } 51 | 52 | TEST(readerwarc, test_good_uri) { 53 | compare("test2.in", "test2.out"); 54 | } 55 | 56 | TEST(readerwarc, test_bad_uri) { 57 | compare("test3.in", "test3.out"); 58 | }; 59 | 60 | TEST(readerwarc, test_newlines) { 61 | compare("test4.in", "test4.out"); 62 | }; 63 | 64 | TEST(readerwarc, test_header) { 65 | compare("test5.in", "test5.out"); 66 | }; 67 | -------------------------------------------------------------------------------- /tests/test_utils.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "gtest/gtest.h" 3 | #include "../src/utils/common.h" 4 | 5 | TEST(utils, test_common_parse_uri) { 6 | utils::parse_uri url1("http://blogos.com/article/118568/"); 7 | utils::parse_uri url2("http://blogos.com"); 8 | utils::parse_uri url3("http://000.blogos.com"); 9 | utils::parse_uri url4("https://blogos.com/article/118568/"); 10 | utils::parse_uri url5("https://blogos.com"); 11 | utils::parse_uri url6("https://000.blogos.com"); 12 | utils::parse_uri url7("111.blogos.com"); 13 | utils::parse_uri url8("eu.blogos.com:1111/article"); 14 | utils::parse_uri url9("username@us.blogos.com:1111/article"); 15 | utils::parse_uri url10("http://username@us.blogos.com:1111/article?username=noname"); 16 | utils::parse_uri url11("ftp://blogos.com"); 17 | utils::parse_uri 
url12("ftp://username@us.blogos.com:1111/article?username=noname#h1"); 18 | utils::parse_uri url13("foofoofoo"); 19 | utils::parse_uri url14("foofoofoo/bar"); 20 | 21 | ASSERT_EQ(url1.get_domain(), "blogos.com"); 22 | ASSERT_EQ(url2.get_domain(), "blogos.com"); 23 | ASSERT_EQ(url3.get_domain(), "000.blogos.com"); 24 | ASSERT_EQ(url4.get_domain(), "blogos.com"); 25 | ASSERT_EQ(url5.get_domain(), "blogos.com"); 26 | ASSERT_EQ(url6.get_domain(), "000.blogos.com"); 27 | ASSERT_EQ(url7.get_domain(), "111.blogos.com"); 28 | ASSERT_EQ(url8.get_domain(), "eu.blogos.com"); 29 | ASSERT_EQ(url9.get_domain(), "us.blogos.com"); 30 | ASSERT_EQ(url10.get_domain(), "us.blogos.com"); 31 | ASSERT_EQ(url11.get_domain(), "blogos.com"); 32 | ASSERT_EQ(url12.get_domain(), "us.blogos.com"); 33 | ASSERT_EQ(url13.get_domain(), "foofoofoo"); 34 | ASSERT_EQ(url14.get_domain(), "foofoofoo"); 35 | 36 | ASSERT_EQ(url1.get_tld(), "com"); 37 | ASSERT_EQ(url6.get_tld(), "com"); 38 | ASSERT_EQ(url13.get_tld(), "foofoofoo"); 39 | 40 | ASSERT_EQ(url1.get_path(), "article/118568/"); 41 | ASSERT_EQ(url8.get_path(), "article"); 42 | ASSERT_EQ(url10.get_path(), "article"); 43 | ASSERT_EQ(url12.get_path(), "article"); 44 | ASSERT_EQ(url13.get_path(), ""); 45 | ASSERT_EQ(url14.get_path(), "bar"); 46 | } --------------------------------------------------------------------------------