├── .gitignore
├── .gitmodules
├── CMakeLists.txt
├── LICENSE
├── README.md
├── src
    ├── 3rd_party
    │   └── utf8
    │   │   ├── doc
    │   │       ├── ReleaseNotes
    │   │       └── utf8cpp.html
    │   │   └── source
    │   │       ├── utf8.h
    │   │       └── utf8
    │   │           ├── checked.h
    │   │           ├── core.h
    │   │           └── unchecked.h
    ├── mono
    │   ├── buffered_map.cpp
    │   ├── buffered_map.h
    │   ├── filters
    │   │   ├── header.h
    │   │   ├── langcollectorfilter.cpp
    │   │   ├── langcollectorfilter.h
    │   │   ├── langsplitfilter.cpp
    │   │   ├── langsplitfilter.h
    │   │   ├── string_util.h
    │   │   ├── warcfilter.cpp
    │   │   └── warcfilter.h
    │   ├── language_sink.cpp
    │   ├── language_sink.h
    │   ├── mono.cpp
    │   ├── worker.cpp
    │   └── worker.h
    └── utils
    │   ├── common.cpp
    │   ├── common.h
    │   ├── compression.cpp
    │   ├── compression.h
    │   ├── curldownloader.cpp
    │   ├── curldownloader.h
    │   ├── logging.cpp
    │   └── logging.h
└── tests
    ├── CMakeLists.txt
    ├── data_integration
        ├── test1.in.gz
        ├── test1_en.out
        ├── test1_ja.out
        ├── test2.in.gz
        ├── test2_de.out
        ├── test2_ru.out
        ├── test3.in
        └── test3_en.out
    ├── data_langsplit
        ├── test1.in
        ├── test1.out
        └── test1_langstats.out
    ├── data_readerwarc
        ├── test1.in
        ├── test1.out
        ├── test2.in
        ├── test2.out
        ├── test3.in
        ├── test3.out
        ├── test4.in
        ├── test4.out
        ├── test5.in
        └── test5.out
    ├── test_integration.cpp
    ├── test_langsplit.cpp
    ├── test_readerwarc.cpp
    └── test_utils.cpp


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Prerequisites
 2 | *.d
 3 | 
 4 | # Compiled Object files
 5 | *.slo
 6 | *.lo
 7 | *.o
 8 | *.obj
 9 | 
10 | # Precompiled Headers
11 | *.gch
12 | *.pch
13 | 
14 | # Compiled Dynamic libraries
15 | *.so
16 | *.dylib
17 | *.dll
18 | 
19 | # Fortran module files
20 | *.mod
21 | *.smod
22 | 
23 | # Compiled Static libraries
24 | *.lai
25 | *.la
26 | *.a
27 | *.lib
28 | 
29 | # Executables
30 | *.exe
31 | *.app
32 | 
33 | # Other
34 | .idea/
35 | cmake-build-debug/
36 | cmake-build-release/
37 | data/
38 | *.log
39 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "src/3rd_party/cld2"]
2 | 	path = src/3rd_party/cld2
3 | 	url = https://github.com/CLD2Owners/cld2.git
4 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.5)
 2 | project(extractor C CXX)
 3 | 
 4 | # Build everything as C++11 and fail at configure time if the compiler cannot.
 5 | set(CMAKE_CXX_STANDARD 11)
 6 | set(CMAKE_CXX_STANDARD_REQUIRED ON)
 7 | set(NAME extractor)
 8 | 
 9 | # options
10 | option(BUILD_TEST "Build tests" ON)
11 | message(STATUS "BUILD_TEST: ${BUILD_TEST}")
12 | option(WITH_LZMA "Build with lzma" ON)
13 | message(STATUS "LZMA: ${WITH_LZMA}")
14 | 
15 | # Directory-scoped, so the define also reaches targets added under tests/.
16 | if (WITH_LZMA)
17 |     add_definitions(-DWITH_LZMA)
18 | endif ()
19 | 
20 | # flags
21 | message(STATUS "CMAKE_BUILD_TYPE is ${CMAKE_BUILD_TYPE}")
22 | # -Ofast already enables every -O3 optimization, so -O3 is not repeated.
23 | set(CMAKE_CXX_FLAGS_RELEASE "-Wall -Wextra -pthread -Ofast")
24 | set(CMAKE_CXX_FLAGS_DEBUG "-Wall -Wextra -pthread -g")
25 | 
26 | # Threads
27 | # The flags above add -pthread for Release and Debug only; the imported target
28 | # keeps mono linked against the thread library for any other CMAKE_BUILD_TYPE.
29 | set(THREADS_PREFER_PTHREAD_FLAG ON)
30 | find_package(Threads REQUIRED)
31 | 
32 | # Boost (REQUIRED stops configuration when it is missing, so no Boost_FOUND check)
33 | find_package(Boost REQUIRED COMPONENTS log filesystem system iostreams regex program_options)
34 | include_directories(${Boost_INCLUDE_DIRS})
35 | 
36 | # update submodules
37 | # Best effort: a failure is only a warning because the cld2 sources may already be in place.
38 | execute_process(COMMAND git submodule update --init -- cld2
39 |         WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/src/3rd_party
40 |         RESULT_VARIABLE cld2_submodule_result)
41 | if (NOT "${cld2_submodule_result}" STREQUAL "0")
42 |     message(WARNING "Updating the cld2 submodule failed (${cld2_submodule_result}); using src/3rd_party/cld2 as it is")
43 | endif ()
44 | 
45 | # cld2
46 | set(CLD2_DIR "src/3rd_party/cld2")
47 | set(CLD2_DIR_INT "src/3rd_party/cld2/internal")
48 | include_directories("${CLD2_DIR}/internal")
49 | include_directories("${CLD2_DIR}/public")
50 | set(libcld2_full_files
51 |         ${CLD2_DIR_INT}/cldutil.cc ${CLD2_DIR_INT}/cldutil_shared.cc ${CLD2_DIR_INT}/compact_lang_det.cc
52 |         ${CLD2_DIR_INT}/compact_lang_det_hint_code.cc ${CLD2_DIR_INT}/compact_lang_det_impl.cc ${CLD2_DIR_INT}/debug.cc
53 |         ${CLD2_DIR_INT}/fixunicodevalue.cc ${CLD2_DIR_INT}/generated_entities.cc ${CLD2_DIR_INT}/generated_language.cc
54 |         ${CLD2_DIR_INT}/generated_ulscript.cc ${CLD2_DIR_INT}/getonescriptspan.cc ${CLD2_DIR_INT}/lang_script.cc
55 |         ${CLD2_DIR_INT}/offsetmap.cc ${CLD2_DIR_INT}/scoreonescriptspan.cc ${CLD2_DIR_INT}/tote.cc
56 |         ${CLD2_DIR_INT}/utf8statetable.cc ${CLD2_DIR_INT}/cld_generated_cjk_uni_prop_80.cc ${CLD2_DIR_INT}/cld2_generated_cjk_compatible.cc
57 |         ${CLD2_DIR_INT}/cld_generated_cjk_delta_bi_32.cc ${CLD2_DIR_INT}/generated_distinct_bi_0.cc ${CLD2_DIR_INT}/cld2_generated_quad0122.cc
58 |         ${CLD2_DIR_INT}/cld2_generated_deltaocta0122.cc ${CLD2_DIR_INT}/cld2_generated_distinctocta0122.cc ${CLD2_DIR_INT}/cld_generated_score_quad_octa_0122.cc)
59 | # file(GLOB) silently drops paths that do not exist, so an absent checkout is reported here.
60 | file(GLOB libcld2_full_glob ${libcld2_full_files})
61 | if (NOT libcld2_full_glob)
62 |     message(FATAL_ERROR "No cld2 sources found in ${CMAKE_CURRENT_SOURCE_DIR}/${CLD2_DIR_INT}; run 'git submodule update --init' first")
63 | endif ()
64 | add_library(cld2_lib SHARED ${libcld2_full_glob})
65 | # -w silences all warnings coming from the third-party sources.
66 | target_compile_options(cld2_lib PRIVATE -Wno-c++11-narrowing -w)
67 | 
68 | # CURL
69 | find_package(CURL REQUIRED)
70 | include_directories(${CURL_INCLUDE_DIRS})
71 | 
72 | # mono
73 | add_executable(mono
74 |         src/mono/mono.cpp
75 |         src/mono/filters/langcollectorfilter.cpp
76 |         src/mono/filters/langsplitfilter.cpp
77 |         src/mono/filters/warcfilter.cpp
78 |         src/mono/language_sink.cpp
79 |         src/mono/buffered_map.cpp
80 |         src/mono/worker.cpp
81 |         src/utils/curldownloader.cpp
82 |         src/utils/common.cpp
83 |         src/utils/compression.cpp
84 |         src/utils/logging.cpp)
85 | # The keyword-less signature is kept so existing calls on this target stay compatible.
86 | target_link_libraries(mono ${Boost_LIBRARIES} ${CURL_LIBRARIES} cld2_lib Threads::Threads)
87 | 
88 | # build tests with gtest
89 | if (BUILD_TEST)
90 |     find_package(GTest REQUIRED)
91 |     include_directories(${GTEST_INCLUDE_DIRS})
92 |     add_subdirectory(tests)
93 | endif ()
94 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "{}"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright {yyyy} {name of copyright owner}
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Extractor
 2 | 
 3 | Extractor is a tool for fast monolingual data collection from Common Crawl data. It processes Common Crawl WET files, removes invalid UTF-8, identifies languages and outputs file-separated monolingual data. Text is classified by language using Google's CLD2 language detector (https://github.com/CLD2Owners/cld2).
 4 | 
 5 | 
 6 | The following is an example of a French output file:
 7 | ```
 8 | ...
 9 | df6fa1abb58549287111ba8d776733e9 uri:http://blogalolotte.canalblog.com/tag/pomme%20de%20terre language:fr bytes:59
10 | pomme de terre : Tous les messages sur pomme de terre - Le
11 | df6fa1abb58549287111ba8d776733e9 uri:http://blogalolotte.canalblog.com/tag/pomme%20de%20terre language:fr bytes:81
12 | ...
13 | ```
14 | 
15 | Each header line starts with the same hash code followed by metadata. The extracted language-specific text (here French) then starts on a new line and ends at the next header line.
16 | 
17 | Apart from this, Extractor can also produce statistics on language distribution per domain.
18 | 
19 | 
20 | ## Requirements
21 | 
22 | - GCC, C++11 compiler
23 | - Boost 1.58.0 or later (Boost 1.65.1 or later for LZMA compression)
24 | - CMake 3.5 or later
25 | - curl, libcurl4-openssl-dev
26 | 
27 | 
28 | ## Installation
29 | 
30 | ```bash
31 | mkdir -p build && cd build
32 | cmake .. -DWITH_LZMA=off -DBUILD_TEST=on -DCMAKE_BUILD_TYPE=Release
33 | make -j 4
34 | ```
35 | 
36 | `-DWITH_LZMA=on` enables LZMA compression but requires Boost version 1.65.1 or higher.
37 | 
38 | 
39 | 
40 | ## Usage
41 | 
42 | Extractor can be used to process monolingual data with the following command:
43 | 
44 | ```bash
45 | cat <input_file> | ./mono --icompression <compression> --ocompression <compression> --workers <number_of_workers> --output <output_folder>
46 | ```
47 | 
48 | Extractor reads from standard input and expects a line-separated list of path names. This list is then processed in parallel by the number of workers specified as an argument.
49 | 
50 | The `icompression` and `ocompression` arguments set the input and output compression formats respectively.
51 | The following compression formats are supported: `gzip|bzip2|zlib|lzma|none`.
52 | 
53 | One folder per worker is created in the output folder and each such folder contains file-separated monolingual data. Processed files are logged in `done.log` file located in each folder.
54 | 
55 | 
56 | ### Optional arguments:
57 | 
58 | - --curl: uses cURL to download remote files
59 | - --print_stats: prints language statistics
60 | 
61 | If the `--curl` option is used, the input is expected to be a list of URLs instead of a list of path names. Extractor also supports multiple URL sources: if one URL source fails, it attempts to download the content from another one. To define additional sources, add more URLs on the same line, separated by a single space. Any cURL errors are logged in the `errors.log` file in each worker folder.
62 | 
63 | If the `--print_stats` option is enabled, Extractor creates an additional `stats` folder in each worker folder and outputs language statistics on the processed files. Each file contains statistics on language distribution per domain in the following format:
64 | ```
65 | <domain>	<language code>	<byte length>
66 | ```
67 | 
68 | Each file contains no duplicates and is alphabetically sorted.
69 | 
70 | 
71 | 
72 | 
73 | ## Known Issues
74 | 
75 | CLD2 detects almost 300 different languages when processing Common Crawl. Since each worker opens a new file for every detected language, the total number of opened files might exceed the user's hard or soft resource limits on open file descriptors. You can type ```ulimit -a``` to get a list of all current resource limits.
76 | 
77 | For the Common Crawl purposes, executing the following command should increase the limit of open file descriptors sufficiently:
78 | ```bash
79 | ulimit -n 32768
80 | ```


--------------------------------------------------------------------------------
/src/3rd_party/utf8/doc/ReleaseNotes:
--------------------------------------------------------------------------------
 1 | utf8 cpp library
 2 | Release 2.3.4
 3 | 
 4 | A minor bug fix release. Thanks to all who reported bugs. 
 5 | 
 6 | Note: Version 2.3.3 contained a regression, and therefore was removed.
 7 | 
 8 | Changes from version 2.3.2
 9 | - Bug fix [39]: checked.h Line 273 and unchecked.h Line 182 have an extra ';'
10 | - Bug fix [36]: replace_invalid() only works with back_inserter
11 | 
12 | Files included in the release: utf8.h, core.h, checked.h, unchecked.h, utf8cpp.html, ReleaseNotes
13 | 


--------------------------------------------------------------------------------
/src/3rd_party/utf8/source/utf8.h:
--------------------------------------------------------------------------------
 1 | // Copyright 2006 Nemanja Trifunovic
 2 | 
 3 | /*
 4 | Permission is hereby granted, free of charge, to any person or organization
 5 | obtaining a copy of the software and accompanying documentation covered by
 6 | this license (the "Software") to use, reproduce, display, distribute,
 7 | execute, and transmit the Software, and to prepare derivative works of the
 8 | Software, and to permit third-parties to whom the Software is furnished to
 9 | do so, all subject to the following:
10 | 
11 | The copyright notices in the Software and this entire statement, including
12 | the above license grant, this restriction and the following disclaimer,
13 | must be included in all copies of the Software, in whole or in part, and
14 | all derivative works of the Software, unless such copies or derivative
15 | works are solely in the form of machine-executable object code generated by
16 | a source language processor.
17 | 
18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 | DEALINGS IN THE SOFTWARE.
25 | */
26 | 
27 | 
28 | #ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
29 | #define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
30 | 
31 | #include "utf8/checked.h"
32 | #include "utf8/unchecked.h"
33 | 
34 | #endif // header guard
35 | 


--------------------------------------------------------------------------------
/src/3rd_party/utf8/source/utf8/checked.h:
--------------------------------------------------------------------------------
  1 | // Copyright 2006 Nemanja Trifunovic
  2 | 
  3 | /*
  4 | Permission is hereby granted, free of charge, to any person or organization
  5 | obtaining a copy of the software and accompanying documentation covered by
  6 | this license (the "Software") to use, reproduce, display, distribute,
  7 | execute, and transmit the Software, and to prepare derivative works of the
  8 | Software, and to permit third-parties to whom the Software is furnished to
  9 | do so, all subject to the following:
 10 | 
 11 | The copyright notices in the Software and this entire statement, including
 12 | the above license grant, this restriction and the following disclaimer,
 13 | must be included in all copies of the Software, in whole or in part, and
 14 | all derivative works of the Software, unless such copies or derivative
 15 | works are solely in the form of machine-executable object code generated by
 16 | a source language processor.
 17 | 
 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
 21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
 22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
 23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 24 | DEALINGS IN THE SOFTWARE.
 25 | */
 26 | 
 27 | 
 28 | #ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
 29 | #define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
 30 | 
 31 | #include "core.h"
 32 | #include <stdexcept>
 33 | 
 34 | namespace utf8
 35 | {
 36 |     // Common root of the exception types declared below, so a caller can handle
 37 |     // all of them by catching utf8::exception (or std::exception).
 38 |     class exception : public ::std::exception {};
 39 | 
 40 |     // Raised by append() and next() for a code point that fails validation.
 41 |     class invalid_code_point : public exception {
 42 |         uint32_t rejected;  // the offending value, handed back by code_point()
 43 |     public:
 44 |         invalid_code_point(uint32_t cp) : rejected(cp) {}
 45 |         virtual const char* what() const throw() { return "Invalid code point"; }
 46 |         uint32_t code_point() const { return rejected; }
 47 |     };
 48 | 
 49 |     class invalid_utf8 : public exception {  // malformed octet sequence; raised by next() and prior()
 50 |         uint8_t octet;  // the byte given by the thrower, exposed through utf8_octet()
 51 |     public:
 52 |         invalid_utf8 (uint8_t u) : octet(u) {}
 53 |         virtual const char* what() const throw() { return "Invalid UTF-8"; }
 54 |         uint8_t utf8_octet() const { return octet; }
 55 |     };
 56 | 
 57 |     class invalid_utf16 : public exception {  // reports a rejected 16-bit UTF-16 unit
 58 |         uint16_t unit;  // the value given by the thrower, exposed through utf16_word()
 59 |     public:
 60 |         invalid_utf16 (uint16_t u) : unit(u) {}
 61 |         virtual const char* what() const throw() { return "Invalid UTF-16"; }
 62 |         uint16_t utf16_word() const { return unit; }
 63 |     };
 64 | 
 65 |     class not_enough_room : public exception {  // raised e.g. by next() on NOT_ENOUGH_ROOM and by prior() when it == start
 66 |     public:
 67 |         virtual const char* what() const throw() { return "Not enough space"; }
 68 |     };
 69 | 
 70 |     /// The library API - functions intended to be called by the users
 71 | 
 72 |     // Writes the UTF-8 encoding of cp (one to four octets) through result and returns
 73 |     // the advanced iterator; throws invalid_code_point when cp fails validation.
 74 |     template <typename octet_iterator>
 75 |     octet_iterator append(uint32_t cp, octet_iterator result)
 76 |     {
 77 |         if (!utf8::internal::is_code_point_valid(cp))
 78 |             throw invalid_code_point(cp);
 79 |         if (cp < 0x80) {                                 // one octet, stored as is
 80 |             *(result++) = static_cast<uint8_t>(cp);
 81 |             return result;
 82 |         }
 83 |         if (cp < 0x800)                                  // lead octet of a two-octet sequence
 84 |             *(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0);
 85 |         else if (cp < 0x10000)                           // lead octet of a three-octet sequence
 86 |             *(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0);
 87 |         else                                             // lead octet of a four-octet sequence
 88 |             *(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0);
 89 |         // Continuation octets carry six payload bits each, most significant group first.
 90 |         if (cp >= 0x10000)
 91 |             *(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f) | 0x80);
 92 |         if (cp >= 0x800)
 93 |             *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
 94 |         *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
 95 |         return result;
 96 |     }
 97 | 
 98 |     // Copies [start, end) to out; each invalid or truncated sequence becomes one `replacement`.
 99 |     template <typename octet_iterator, typename output_iterator>
100 |     output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
101 |     {
102 |         while (start != end) {
103 |             octet_iterator sequence_start = start;
104 |             internal::utf_error err_code = utf8::internal::validate_next(start, end);
105 |             switch (err_code) {
106 |                 case internal::UTF8_OK :            // valid sequence: copy its octets verbatim
107 |                     for (octet_iterator it = sequence_start; it != start; ++it)
108 |                         *out++ = *it;
109 |                     break;
110 |                 case internal::NOT_ENOUGH_ROOM:     // input ends inside a sequence: mark it, don't throw
111 |                     out = utf8::append (replacement, out);
112 |                     start = end;                    // nothing left to scan, so the loop ends
113 |                     break;
114 |                 case internal::INVALID_LEAD:
115 |                 case internal::INCOMPLETE_SEQUENCE:
116 |                 case internal::OVERLONG_SEQUENCE:
117 |                 case internal::INVALID_CODE_POINT:
118 |                     out = utf8::append (replacement, out);   // throws invalid_code_point for a bad replacement
119 |                     ++start;
120 |                     // one mark per sequence: skip its trail octets, except after a bad lead octet
121 |                     while (err_code != internal::INVALID_LEAD && start != end && utf8::internal::is_trail(*start))
122 |                         ++start;
123 |                     break;
124 |             }
125 |         }
126 |         return out;
127 |     }
128 | 
129 |     template <typename octet_iterator, typename output_iterator>
130 |     inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
131 |     {
132 |         // U+FFFD is the default marker; the four-argument overload does the actual work.
133 |         return utf8::replace_invalid(start, end, out, static_cast<uint32_t>(utf8::internal::mask16(0xfffd)));
134 |     }
135 | 
136 |     /// Decodes the code point starting at it and advances it past the sequence.
137 |     /// Throws not_enough_room, invalid_utf8 or invalid_code_point on failure;
138 |     /// in that case validate_next has left it at its position on entry.
139 |     template <typename octet_iterator>
140 |     uint32_t next(octet_iterator& it, octet_iterator end)
141 |     {
142 |         uint32_t decoded = 0;
143 |         const internal::utf_error status = utf8::internal::validate_next(it, end, decoded);
144 | 
145 |         if (status == internal::NOT_ENOUGH_ROOM)
146 |             throw not_enough_room();
147 |         if (status == internal::INVALID_CODE_POINT)
148 |             throw invalid_code_point(decoded);
149 |         if (status != internal::UTF8_OK)
150 |             throw invalid_utf8(*it); // INVALID_LEAD, INCOMPLETE_SEQUENCE or OVERLONG_SEQUENCE
151 | 
152 |         return decoded;
153 |     }
155 | 
156 |     /// Returns the code point at it without advancing the caller's iterator.
157 |     template <typename octet_iterator>
158 |     uint32_t peek_next(octet_iterator it, octet_iterator end)
159 |     {
160 |         // it is a by-value copy, so next() only moves the local
161 |         const uint32_t cp = utf8::next(it, end);
162 |         return cp;
163 |     }
161 | 
162 |     /// Moves it back to the first non-trail octet before it and returns the
163 |     /// code point decoded from there up to the old position. Throws
164 |     /// not_enough_room if it == start on entry and invalid_utf8 if start is
165 |     /// reached while still on a trail octet.
166 |     template <typename octet_iterator>
167 |     uint32_t prior(octet_iterator& it, octet_iterator start)
168 |     {
169 |         // can't do much if it == start
170 |         if (it == start)
171 |             throw not_enough_room();
172 | 
173 |         const octet_iterator seq_end = it;
174 |         // Go back until we hit either a lead octet or start
175 |         for (--it; utf8::internal::is_trail(*it); --it) {
176 |             if (it == start)
177 |                 throw invalid_utf8(*it); // error - no lead byte in the sequence
178 |         }
179 |         return utf8::peek_next(it, seq_end);
180 |     }
176 | 
177 |     /// Deprecated in versions that include "prior"
178 |     /// Walks it backwards over trail octets, throwing invalid_utf8 if pass_start
179 |     /// is reached on a trail octet, then decodes from there to the old position.
180 |     template <typename octet_iterator>
181 |     uint32_t previous(octet_iterator& it, octet_iterator pass_start)
182 |     {
183 |         const octet_iterator seq_end = it;
184 |         for (--it; utf8::internal::is_trail(*it); --it) {
185 |             if (it == pass_start)
186 |                 throw invalid_utf8(*it); // error - no lead byte in the sequence
187 |         }
188 |         octet_iterator cursor = it;
189 |         return utf8::next(cursor, seq_end);
190 |     }
188 | 
189 |     /// Advances it by n code points, decoding each one with utf8::next.
190 |     template <typename octet_iterator, typename distance_type>
191 |     void advance (octet_iterator& it, distance_type n, octet_iterator end)
192 |     {
193 |         distance_type stepped = 0;
194 |         while (stepped < n) {
195 |             utf8::next(it, end);
196 |             ++stepped;
197 |         }
198 |     }
195 | 
196 |     /// Counts the code points in [first, last) by decoding them with utf8::next.
197 |     template <typename octet_iterator>
198 |     typename std::iterator_traits<octet_iterator>::difference_type
199 |     distance (octet_iterator first, octet_iterator last)
200 |     {
201 |         typedef typename std::iterator_traits<octet_iterator>::difference_type count_type;
202 |         count_type total = 0;
203 |         while (first < last) {
204 |             utf8::next(first, last);
205 |             ++total;
206 |         }
207 |         return total;
208 |     }
205 | 
206 |     /// Converts the UTF-16 units in [start, end) to UTF-8 written through result.
207 |     /// Throws invalid_utf16 for a lone trail surrogate, for a lead surrogate at
208 |     /// the end of input, and for a lead surrogate not followed by a trail one.
209 |     template <typename u16bit_iterator, typename octet_iterator>
210 |     octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
211 |     {
212 |         while (start != end) {
213 |             uint32_t cp = utf8::internal::mask16(*start++);
214 | 
215 |             // Lone trail surrogate
216 |             if (utf8::internal::is_trail_surrogate(cp))
217 |                 throw invalid_utf16(static_cast<uint16_t>(cp));
218 | 
219 |             // Surrogate pair: the lead must be followed by a trail
220 |             if (utf8::internal::is_lead_surrogate(cp)) {
221 |                 if (start == end)
222 |                     throw invalid_utf16(static_cast<uint16_t>(cp));
223 | 
224 |                 const uint32_t low_half = utf8::internal::mask16(*start++);
225 |                 if (!utf8::internal::is_trail_surrogate(low_half))
226 |                     throw invalid_utf16(static_cast<uint16_t>(low_half));
227 | 
228 |                 cp = (cp << 10) + low_half + internal::SURROGATE_OFFSET;
229 |             }
230 | 
231 |             result = utf8::append(cp, result);
232 |         }
233 |         return result;
234 |     }
232 | 
233 |     /// Converts the UTF-8 octets in [start, end) to UTF-16 written through result;
234 |     /// code points above 0xffff are emitted as a surrogate pair.
235 |     template <typename u16bit_iterator, typename octet_iterator>
236 |     u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
237 |     {
238 |         while (start != end) {
239 |             const uint32_t cp = utf8::next(start, end);
240 |             if (cp <= 0xffff) {
241 |                 *result++ = static_cast<uint16_t>(cp);
242 |                 continue;
243 |             }
244 |             //make a surrogate pair
245 |             *result++ = static_cast<uint16_t>((cp >> 10)   + internal::LEAD_OFFSET);
246 |             *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
247 |         }
248 |         return result;
249 |     }
247 | 
248 |     /// Passes each code point in [start, end) to utf8::append, writing through result.
249 |     template <typename octet_iterator, typename u32bit_iterator>
250 |     octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
251 |     {
252 |         for (; start != end; ++start)
253 |             result = utf8::append(*start, result);
254 | 
255 |         return result;
256 |     }
256 | 
257 |     /// Decodes the UTF-8 octets in [start, end) into code points written through result.
258 |     template <typename octet_iterator, typename u32bit_iterator>
259 |     u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
260 |     {
261 |         while (start != end) {
262 |             const uint32_t cp = utf8::next(start, end);
263 |             *result++ = cp;
264 |         }
265 | 
266 |         return result;
267 |     }
265 | 
266 |     // The iterator class
267 |     // Bidirectional iterator over the code points of an octet range; every
268 |     // dereference and move goes through the checked next/prior functions.
269 |     template <typename octet_iterator>
270 |     class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> {
271 |       octet_iterator it;          // current position
272 |       octet_iterator range_start; // lower bound handed to prior
273 |       octet_iterator range_end;   // upper bound handed to next
274 |       public:
275 |       iterator () {}
276 |       // Throws std::out_of_range when octet_it lies outside [range_start, range_end]
277 |       explicit iterator (const octet_iterator& octet_it,
278 |                          const octet_iterator& range_start,
279 |                          const octet_iterator& range_end) :
280 |                it(octet_it), range_start(range_start), range_end(range_end)
281 |       {
282 |           const bool before_range = it < range_start;
283 |           if (before_range || it > range_end)
284 |               throw std::out_of_range("Invalid utf-8 iterator position");
285 |       }
286 |       // the default "big three" are OK
287 |       octet_iterator base () const { return it; }
288 |       // Decodes the code point at the current position without moving
289 |       uint32_t operator * () const
290 |       {
291 |           return utf8::peek_next(it, range_end);
292 |       }
293 |       // Throws std::logic_error when the two iterators were built over different ranges
294 |       bool operator == (const iterator& rhs) const
295 |       {
296 |           if (range_start != rhs.range_start)
297 |               throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
298 |           if (range_end != rhs.range_end)
299 |               throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
300 |           return (it == rhs.it);
301 |       }
302 |       bool operator != (const iterator& rhs) const
303 |       {
304 |           return !(*this == rhs);
305 |       }
306 |       iterator& operator ++ ()
307 |       {
308 |           utf8::next(it, range_end);
309 |           return *this;
310 |       }
311 |       iterator operator ++ (int)
312 |       {
313 |           iterator before = *this;
314 |           ++(*this);
315 |           return before;
316 |       }
317 |       iterator& operator -- ()
318 |       {
319 |           utf8::prior(it, range_start);
320 |           return *this;
321 |       }
322 |       iterator operator -- (int)
323 |       {
324 |           iterator before = *this;
325 |           --(*this);
326 |           return before;
327 |       }
328 |     }; // class iterator
322 | 
323 | } // namespace utf8
324 | 
325 | #endif //header guard
326 | 
327 | 
328 | 


--------------------------------------------------------------------------------
/src/3rd_party/utf8/source/utf8/core.h:
--------------------------------------------------------------------------------
  1 | // Copyright 2006 Nemanja Trifunovic
  2 | 
  3 | /*
  4 | Permission is hereby granted, free of charge, to any person or organization
  5 | obtaining a copy of the software and accompanying documentation covered by
  6 | this license (the "Software") to use, reproduce, display, distribute,
  7 | execute, and transmit the Software, and to prepare derivative works of the
  8 | Software, and to permit third-parties to whom the Software is furnished to
  9 | do so, all subject to the following:
 10 | 
 11 | The copyright notices in the Software and this entire statement, including
 12 | the above license grant, this restriction and the following disclaimer,
 13 | must be included in all copies of the Software, in whole or in part, and
 14 | all derivative works of the Software, unless such copies or derivative
 15 | works are solely in the form of machine-executable object code generated by
 16 | a source language processor.
 17 | 
 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
 21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
 22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
 23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 24 | DEALINGS IN THE SOFTWARE.
 25 | */
 26 | 
 27 | 
 28 | #ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
 29 | #define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
 30 | 
 31 | #include <iterator>
 32 | 
 33 | namespace utf8
 34 | {
 35 |     // The typedefs for 8-bit, 16-bit and 32-bit unsigned integers
 36 |     // You may need to change them to match your system.
 37 |     // These typedefs have the same names as ones from cstdint, or boost/cstdint
 38 |     typedef unsigned char   uint8_t;
 39 |     typedef unsigned short  uint16_t;
 40 |     typedef unsigned int    uint32_t;
 41 | 
 42 | // Helper code - not intended to be directly called by the library users. May be changed at any time
 43 | namespace internal
 44 | {
 45 |     // Unicode constants
 46 |     // Leading (high) surrogates: 0xd800 - 0xdbff
 47 |     // Trailing (low) surrogates: 0xdc00 - 0xdfff
 48 |     const uint16_t LEAD_SURROGATE_MIN  = 0xd800u;
 49 |     const uint16_t LEAD_SURROGATE_MAX  = 0xdbffu;
 50 |     const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u;
 51 |     const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu;
 52 |     const uint16_t LEAD_OFFSET         = LEAD_SURROGATE_MIN - (0x10000 >> 10); // utf8to16 adds this to (cp >> 10) to form the lead surrogate
 53 |     const uint32_t SURROGATE_OFFSET    = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN; // utf16to8 adds this to (lead << 10) + trail to get the code point
 54 | 
 55 |     // Maximum valid value for a Unicode code point
 56 |     const uint32_t CODE_POINT_MAX      = 0x0010ffffu;
 57 | 
 58 |     /// Keeps only the low 8 bits of oc and returns them as a uint8_t.
 59 |     template<typename octet_type>
 60 |     inline uint8_t mask8(octet_type oc)
 61 |     {
 62 |         const uint8_t low_octet = static_cast<uint8_t>(0xff & oc);
 63 |         return low_octet;
 64 |     }
 63 |     /// Keeps only the low 16 bits of oc and returns them as a uint16_t.
 64 |     template<typename u16_type>
 65 |     inline uint16_t mask16(u16_type oc)
 66 |     {
 67 |         const uint16_t low_unit = static_cast<uint16_t>(0xffff & oc);
 68 |         return low_unit;
 69 |     }
 68 |     /// True when oc has the form 10xxxxxx, i.e. its top two bits are binary 10.
 69 |     template<typename octet_type>
 70 |     inline bool is_trail(octet_type oc)
 71 |     {
 72 |         const uint8_t octet = utf8::internal::mask8(oc);
 73 |         return (octet >> 6) == 0x2;
 74 |     }
 73 | 
 74 |     /// True when cp lies in [LEAD_SURROGATE_MIN, LEAD_SURROGATE_MAX].
 75 |     template <typename u16>
 76 |     inline bool is_lead_surrogate(u16 cp)
 77 |     {
 78 |         if (cp < LEAD_SURROGATE_MIN)
 79 |             return false;
 80 |         return cp <= LEAD_SURROGATE_MAX;
 81 |     }
 79 | 
 80 |     /// True when cp lies in [TRAIL_SURROGATE_MIN, TRAIL_SURROGATE_MAX].
 81 |     template <typename u16>
 82 |     inline bool is_trail_surrogate(u16 cp)
 83 |     {
 84 |         if (cp < TRAIL_SURROGATE_MIN)
 85 |             return false;
 86 |         return cp <= TRAIL_SURROGATE_MAX;
 87 |     }
 85 | 
 86 |     /// True when cp lies anywhere in the surrogate block,
 87 |     /// [LEAD_SURROGATE_MIN, TRAIL_SURROGATE_MAX].
 88 |     template <typename u16>
 89 |     inline bool is_surrogate(u16 cp)
 90 |     {
 91 |         if (cp < LEAD_SURROGATE_MIN)
 92 |             return false;
 93 |         return cp <= TRAIL_SURROGATE_MAX;
 94 |     }
 91 | 
 92 |     /// True when cp does not exceed CODE_POINT_MAX and is not a surrogate.
 93 |     template <typename u32>
 94 |     inline bool is_code_point_valid(u32 cp)
 95 |     {
 96 |         if (cp > CODE_POINT_MAX)
 97 |             return false;
 98 |         return !utf8::internal::is_surrogate(cp);
 99 |     }
 97 | 
 98 |     /// Returns the total sequence length (1-4) announced by the lead octet at
 99 |     /// lead_it, or 0 when that octet matches none of the lead patterns.
100 |     template <typename octet_iterator>
101 |     inline typename std::iterator_traits<octet_iterator>::difference_type
102 |     sequence_length(octet_iterator lead_it)
103 |     {
104 |         const uint8_t first = utf8::internal::mask8(*lead_it);
105 | 
106 |         if (first < 0x80)            // 0xxxxxxx
107 |             return 1;
108 |         if ((first >> 5) == 0x6)     // 110xxxxx
109 |             return 2;
110 |         if ((first >> 4) == 0xe)     // 1110xxxx
111 |             return 3;
112 |         if ((first >> 3) == 0x1e)    // 11110xxx
113 |             return 4;
114 | 
115 |         return 0;
116 |     }
114 | 
115 |     /// True when length differs from the octet count expected for cp:
116 |     /// 1 below 0x80, 2 below 0x800, 3 below 0x10000. Larger values of cp are
117 |     /// never reported as overlong.
118 |     template <typename octet_difference_type>
119 |     inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length)
120 |     {
121 |         if (cp < 0x80)
122 |             return length != 1;
123 |         if (cp < 0x800)
124 |             return length != 2;
125 |         if (cp < 0x10000)
126 |             return length != 3;
127 | 
128 |         return false;
129 |     }
133 | 
134 |     /// Status codes returned by the validation and decoding helpers.
135 |     enum utf_error {
136 |         UTF8_OK,              // sequence decoded and passed every check
137 |         NOT_ENOUGH_ROOM,      // the range ended before the sequence did
138 |         INVALID_LEAD,         // sequence_length returned 0 for the first octet
139 |         INCOMPLETE_SEQUENCE,  // a non-trail octet appeared where a trail was expected
140 |         OVERLONG_SEQUENCE,    // is_overlong_sequence rejected the encoding length
141 |         INVALID_CODE_POINT    // is_code_point_valid rejected the decoded value
142 |     };
135 | 
136 |     /// Helper for get_sequence_x
137 |     /// Steps it forward one octet. Returns NOT_ENOUGH_ROOM if that reaches end,
138 |     /// INCOMPLETE_SEQUENCE if the new octet is not a trail octet, else UTF8_OK.
139 |     template <typename octet_iterator>
140 |     utf_error increase_safely(octet_iterator& it, octet_iterator end)
141 |     {
142 |         ++it;
143 |         if (it == end)
144 |             return NOT_ENOUGH_ROOM;
145 | 
146 |         return utf8::internal::is_trail(*it) ? UTF8_OK : INCOMPLETE_SEQUENCE;
147 |     }
148 | 
149 |     #define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;}    
150 | 
151 |     /// get_sequence_x functions decode utf-8 sequences of the length x
152 |     /// This one reads a single octet into code_point and does not move it.
153 |     template <typename octet_iterator>
154 |     utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t& code_point)
155 |     {
156 |         if (it != end) {
157 |             code_point = utf8::internal::mask8(*it);
158 |             return UTF8_OK;
159 |         }
160 |         return NOT_ENOUGH_ROOM;
161 |     }
162 | 
163 |     /// Decodes a two-octet sequence; on success it rests on the trail octet.
164 |     /// Returns the error from increase_safely if the trail octet is unavailable.
165 |     template <typename octet_iterator>
166 |     utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t& code_point)
167 |     {
168 |         if (it == end)
169 |             return NOT_ENOUGH_ROOM;
170 | 
171 |         code_point = utf8::internal::mask8(*it);
172 | 
173 |         const utf_error step = increase_safely(it, end);
174 |         if (step != UTF8_OK)
175 |             return step;
176 | 
177 |         // five payload bits from the lead octet, six from the trail octet
178 |         code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f);
179 | 
180 |         return UTF8_OK;
181 |     }
177 | 
178 |     /// Decodes a three-octet sequence; on success it rests on the last trail octet.
179 |     /// Returns the error from increase_safely as soon as a trail octet is unavailable.
180 |     template <typename octet_iterator>
181 |     utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t& code_point)
182 |     {
183 |         if (it == end)
184 |             return NOT_ENOUGH_ROOM;
185 | 
186 |         code_point = utf8::internal::mask8(*it);
187 | 
188 |         utf_error step = increase_safely(it, end);
189 |         if (step != UTF8_OK)
190 |             return step;
191 |         code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff);
192 | 
193 |         step = increase_safely(it, end);
194 |         if (step != UTF8_OK)
195 |             return step;
196 |         code_point += (*it) & 0x3f;
197 | 
198 |         return UTF8_OK;
199 |     }
196 | 
197 |     /// Decodes a four-octet sequence; on success it rests on the last trail octet.
198 |     /// Returns the error from increase_safely as soon as a trail octet is unavailable.
199 |     template <typename octet_iterator>
200 |     utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t& code_point)
201 |     {
202 |         if (it == end)
203 |             return NOT_ENOUGH_ROOM;
204 | 
205 |         code_point = utf8::internal::mask8(*it);
206 | 
207 |         utf_error step = increase_safely(it, end);
208 |         if (step != UTF8_OK)
209 |             return step;
210 |         code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff);
211 | 
212 |         step = increase_safely(it, end);
213 |         if (step != UTF8_OK)
214 |             return step;
215 |         code_point += (utf8::internal::mask8(*it) << 6) & 0xfff;
216 | 
217 |         step = increase_safely(it, end);
218 |         if (step != UTF8_OK)
219 |             return step;
220 |         code_point += (*it) & 0x3f;
221 | 
222 |         return UTF8_OK;
223 |     }
219 | 
220 |     #undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR
221 | 
222 |     /// Validates and decodes the UTF-8 sequence that starts at it.
223 |     ///
224 |     /// On success (UTF8_OK) the decoded value is stored in code_point and it is
225 |     /// advanced past the sequence. On any failure code_point is left untouched
226 |     /// and it is left at (or restored to) its position on entry.
227 |     ///
228 |     /// Returns NOT_ENOUGH_ROOM when it == end on entry or when the range ends
229 |     /// before the sequence does, INVALID_LEAD when the first octet cannot start
230 |     /// a sequence, INCOMPLETE_SEQUENCE when an expected trail octet is missing,
231 |     /// OVERLONG_SEQUENCE when the encoding length does not match the value, and
232 |     /// INVALID_CODE_POINT for a surrogate or a value above CODE_POINT_MAX.
233 |     template <typename octet_iterator>
234 |     utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t& code_point)
235 |     {
236 |         // An empty range has no lead octet to inspect; bail out before
237 |         // sequence_length() dereferences the past-the-end iterator.
238 |         if (it == end)
239 |             return NOT_ENOUGH_ROOM;
240 | 
241 |         // Save the original value of it so we can go back in case of failure
242 |         // Of course, it does not make much sense with i.e. stream iterators
243 |         octet_iterator original_it = it;
244 | 
245 |         uint32_t cp = 0;
246 |         // Determine the sequence length based on the lead octet
247 |         typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;
248 |         const octet_difference_type length = utf8::internal::sequence_length(it);
249 | 
250 |         // Get trail octets and calculate the code point
251 |         utf_error err = UTF8_OK;
252 |         switch (length) {
253 |             case 0:
254 |                 return INVALID_LEAD;
255 |             case 1:
256 |                 err = utf8::internal::get_sequence_1(it, end, cp);
257 |                 break;
258 |             case 2:
259 |                 err = utf8::internal::get_sequence_2(it, end, cp);
260 |                 break;
261 |             case 3:
262 |                 err = utf8::internal::get_sequence_3(it, end, cp);
263 |                 break;
264 |             case 4:
265 |                 err = utf8::internal::get_sequence_4(it, end, cp);
266 |                 break;
267 |         }
268 | 
269 |         if (err == UTF8_OK) {
270 |             // Decoding succeeded. Now, security checks...
271 |             if (utf8::internal::is_code_point_valid(cp)) {
272 |                 if (!utf8::internal::is_overlong_sequence(cp, length)){
273 |                     // Passed! Return here.
274 |                     code_point = cp;
275 |                     // get_sequence_x leaves it on the last octet; step past it
276 |                     ++it;
277 |                     return UTF8_OK;
278 |                 }
279 |                 else
280 |                     err = OVERLONG_SEQUENCE;
281 |             }
282 |             else
283 |                 err = INVALID_CODE_POINT;
284 |         }
285 | 
286 |         // Failure branch - restore the original value of the iterator
287 |         it = original_it;
288 |         return err;
289 |     }
273 | 
274 |     /// Overload for callers that only need the status, not the decoded value.
275 |     template <typename octet_iterator>
276 |     inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
277 |         uint32_t discarded = 0;
278 |         return utf8::internal::validate_next(it, end, discarded);
279 |     }
279 | 
280 | } // namespace internal
281 | 
282 |     /// The library API - functions intended to be called by the users
283 | 
284 |     // Byte order mark
285 |     const uint8_t bom[] = {0xef, 0xbb, 0xbf};
286 | 
287 |     /// Returns an iterator to the start of the first sequence in [start, end)
288 |     /// that fails validation, or end when every sequence validates.
289 |     template <typename octet_iterator>
290 |     octet_iterator find_invalid(octet_iterator start, octet_iterator end)
291 |     {
292 |         octet_iterator cursor = start;
293 |         while (cursor != end) {
294 |             // validate_next advances cursor on success and leaves it in place on failure
295 |             if (utf8::internal::validate_next(cursor, end) != internal::UTF8_OK)
296 |                 break;
297 |         }
298 |         return cursor;
299 |     }
298 | 
299 |     /// True when find_invalid reports no invalid sequence in [start, end).
300 |     template <typename octet_iterator>
301 |     inline bool is_valid(octet_iterator start, octet_iterator end)
302 |     {
303 |         const octet_iterator first_bad = utf8::find_invalid(start, end);
304 |         return first_bad == end;
305 |     }
304 | 
305 |     /// True when [it, end) begins with the three octets stored in bom.
306 |     /// A range shorter than three octets yields false.
307 |     template <typename octet_iterator>
308 |     inline bool starts_with_bom (octet_iterator it, octet_iterator end)
309 |     {
310 |         if (it == end || utf8::internal::mask8(*it) != bom[0])
311 |             return false;
312 |         ++it;
313 |         if (it == end || utf8::internal::mask8(*it) != bom[1])
314 |             return false;
315 |         ++it;
316 |         return (it != end) && (utf8::internal::mask8(*it) == bom[2]);
317 |     }
314 | 	
315 |     //Deprecated in release 2.3
316 |     // Compares the octets at it with bom one by one, stopping at the first
317 |     // mismatch; no range end is checked.
318 |     template <typename octet_iterator>
319 |     inline bool is_bom (octet_iterator it)
320 |     {
321 |         if (utf8::internal::mask8(*it) != bom[0])
322 |             return false;
323 |         ++it;
324 |         if (utf8::internal::mask8(*it) != bom[1])
325 |             return false;
326 |         ++it;
327 |         return utf8::internal::mask8(*it) == bom[2];
328 |     }
325 | } // namespace utf8
326 | 
327 | #endif // header guard
328 | 
329 | 
330 | 


--------------------------------------------------------------------------------
/src/3rd_party/utf8/source/utf8/unchecked.h:
--------------------------------------------------------------------------------
  1 | // Copyright 2006 Nemanja Trifunovic
  2 | 
  3 | /*
  4 | Permission is hereby granted, free of charge, to any person or organization
  5 | obtaining a copy of the software and accompanying documentation covered by
  6 | this license (the "Software") to use, reproduce, display, distribute,
  7 | execute, and transmit the Software, and to prepare derivative works of the
  8 | Software, and to permit third-parties to whom the Software is furnished to
  9 | do so, all subject to the following:
 10 | 
 11 | The copyright notices in the Software and this entire statement, including
 12 | the above license grant, this restriction and the following disclaimer,
 13 | must be included in all copies of the Software, in whole or in part, and
 14 | all derivative works of the Software, unless such copies or derivative
 15 | works are solely in the form of machine-executable object code generated by
 16 | a source language processor.
 17 | 
 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
 21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
 22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
 23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 24 | DEALINGS IN THE SOFTWARE.
 25 | */
 26 | 
 27 | 
 28 | #ifndef UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
 29 | #define UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
 30 | 
 31 | #include "core.h"
 32 | 
 33 | namespace utf8
 34 | {
 35 |     namespace unchecked 
 36 |     {
 37 |         /// Encodes cp as UTF-8 through result and returns the advanced iterator.
 38 |         /// cp is not validated in any way.
 39 |         template <typename octet_iterator>
 40 |         octet_iterator append(uint32_t cp, octet_iterator result)
 41 |         {
 42 |             if (cp < 0x80) {                      // one octet
 43 |                 *(result++) = static_cast<uint8_t>(cp);
 44 |                 return result;
 45 |             }
 46 | 
 47 |             // Emit the lead octet and note the shift of the first trail octet
 48 |             int shift;
 49 |             if (cp < 0x800) {                     // two octets
 50 |                 *(result++) = static_cast<uint8_t>((cp >> 6)  | 0xc0);
 51 |                 shift = 0;
 52 |             }
 53 |             else if (cp < 0x10000) {              // three octets
 54 |                 *(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0);
 55 |                 shift = 6;
 56 |             }
 57 |             else {                                // four octets
 58 |                 *(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0);
 59 |                 shift = 12;
 60 |             }
 61 | 
 62 |             // Each trail octet carries six payload bits, most significant first
 63 |             for (; shift >= 0; shift -= 6)
 64 |                 *(result++) = static_cast<uint8_t>(((cp >> shift) & 0x3f) | 0x80);
 65 | 
 66 |             return result;
 67 |         }
 59 | 
 60 |         /// Decodes the sequence at it without validation and advances it past it.
 61 |         /// When sequence_length reports 1 or 0, the masked octet itself is
 62 |         /// returned and it moves forward by one.
 63 |         template <typename octet_iterator>
 64 |         uint32_t next(octet_iterator& it)
 65 |         {
 66 |             uint32_t cp = utf8::internal::mask8(*it);
 67 |             const typename std::iterator_traits<octet_iterator>::difference_type length =
 68 |                 utf8::internal::sequence_length(it);
 69 | 
 70 |             if (length == 2) {
 71 |                 ++it;
 72 |                 cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f);
 73 |             }
 74 |             else if (length == 3) {
 75 |                 ++it;
 76 |                 cp = ((cp << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff);
 77 |                 ++it;
 78 |                 cp += (*it) & 0x3f;
 79 |             }
 80 |             else if (length == 4) {
 81 |                 ++it;
 82 |                 cp = ((cp << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff);
 83 |                 ++it;
 84 |                 cp += (utf8::internal::mask8(*it) << 6) & 0xfff;
 85 |                 ++it;
 86 |                 cp += (*it) & 0x3f;
 87 |             }
 88 | 
 89 |             ++it; // step past the last octet of the sequence
 90 |             return cp;
 91 |         }
 90 | 
 91 |         /// Returns the code point at it without advancing the caller's iterator.
 92 |         template <typename octet_iterator>
 93 |         uint32_t peek_next(octet_iterator it)
 94 |         {
 95 |             // it is a by-value copy, so next() only moves the local
 96 |             const uint32_t cp = utf8::unchecked::next(it);
 97 |             return cp;
 98 |         }
 96 | 
 97 |         /// Steps it back to the nearest preceding non-trail octet and returns the
 98 |         /// code point decoded there. No bounds are checked.
 99 |         template <typename octet_iterator>
100 |         uint32_t prior(octet_iterator& it)
101 |         {
102 |             do {
103 |                 --it;
104 |             } while (utf8::internal::is_trail(*it));
105 | 
106 |             return utf8::unchecked::peek_next(it);
107 |         }
104 | 
105 |         // Deprecated in versions that include prior, but only for the sake of consistency (see utf8::previous)
106 |         // Forwards to utf8::unchecked::prior.
107 |         template <typename octet_iterator>
108 |         inline uint32_t previous(octet_iterator& it)
109 |         {
110 |             const uint32_t cp = utf8::unchecked::prior(it);
111 |             return cp;
112 |         }
111 | 
112 |         /// Advances it by n code points using the unchecked next.
113 |         template <typename octet_iterator, typename distance_type>
114 |         void advance (octet_iterator& it, distance_type n)
115 |         {
116 |             distance_type stepped = 0;
117 |             while (stepped < n) {
118 |                 utf8::unchecked::next(it);
119 |                 ++stepped;
120 |             }
121 |         }
118 | 
119 |         /// Counts the code points in [first, last) using the unchecked next.
120 |         template <typename octet_iterator>
121 |         typename std::iterator_traits<octet_iterator>::difference_type
122 |         distance (octet_iterator first, octet_iterator last)
123 |         {
124 |             typedef typename std::iterator_traits<octet_iterator>::difference_type count_type;
125 |             count_type total = 0;
126 |             while (first < last) {
127 |                 utf8::unchecked::next(first);
128 |                 ++total;
129 |             }
130 |             return total;
131 |         }
128 | 
129 |         /// Converts the UTF-16 units in [start, end) to UTF-8 written through result.
130 |         /// A lead surrogate is combined with the following unit without checking
131 |         /// that unit's value or that the input has one.
132 |         template <typename u16bit_iterator, typename octet_iterator>
133 |         octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
134 |         {
135 |             while (start != end) {
136 |                 const uint32_t unit = utf8::internal::mask16(*start++);
137 |                 if (!utf8::internal::is_lead_surrogate(unit)) {
138 |                     result = utf8::unchecked::append(unit, result);
139 |                     continue;
140 |                 }
141 |                 // Take care of surrogate pairs
142 |                 const uint32_t low_half = utf8::internal::mask16(*start++);
143 |                 result = utf8::unchecked::append((unit << 10) + low_half + internal::SURROGATE_OFFSET, result);
144 |             }
145 |             return result;
146 |         }
143 | 
144 |         /// Converts the UTF-8 octets in [start, end) to UTF-16 written through result;
145 |         /// code points above 0xffff are emitted as a surrogate pair.
146 |         template <typename u16bit_iterator, typename octet_iterator>
147 |         u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
148 |         {
149 |             while (start < end) {
150 |                 const uint32_t cp = utf8::unchecked::next(start);
151 |                 if (cp <= 0xffff) {
152 |                     *result++ = static_cast<uint16_t>(cp);
153 |                     continue;
154 |                 }
155 |                 //make a surrogate pair
156 |                 *result++ = static_cast<uint16_t>((cp >> 10)   + internal::LEAD_OFFSET);
157 |                 *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
158 |             }
159 |             return result;
160 |         }
158 | 
159 |         /// Encodes each code point in [start, end) as UTF-8 through result.
160 |         template <typename octet_iterator, typename u32bit_iterator>
161 |         octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
162 |         {
163 |             for (; start != end; ++start)
164 |                 result = utf8::unchecked::append(*start, result);
165 | 
166 |             return result;
167 |         }
167 | 
168 |         /// Decodes the UTF-8 octets in [start, end) into code points written through result.
169 |         template <typename octet_iterator, typename u32bit_iterator>
170 |         u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
171 |         {
172 |             while (start < end) {
173 |                 const uint32_t cp = utf8::unchecked::next(start);
174 |                 *result++ = cp;
175 |             }
176 | 
177 |             return result;
178 |         }
176 | 
177 |         // The iterator class
178 |         // Bidirectional code point iterator that keeps no range bounds and
179 |         // performs no validation.
180 |         template <typename octet_iterator>
181 |           class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> {
182 |             octet_iterator it; // current position
183 |             public:
184 |             iterator () {}
185 |             explicit iterator (const octet_iterator& octet_it): it(octet_it) {}
186 |             // the default "big three" are OK
187 |             octet_iterator base () const { return it; }
188 |             // Decodes the code point at the current position without moving
189 |             uint32_t operator * () const
190 |             {
191 |                 return utf8::unchecked::peek_next(it);
192 |             }
193 |             bool operator == (const iterator& rhs) const
194 |             {
195 |                 return (it == rhs.it);
196 |             }
197 |             bool operator != (const iterator& rhs) const
198 |             {
199 |                 return !(*this == rhs);
200 |             }
201 |             iterator& operator ++ ()
202 |             {
203 |                 // move by the length the lead octet announces, without decoding
204 |                 ::std::advance(it, utf8::internal::sequence_length(it));
205 |                 return *this;
206 |             }
207 |             iterator operator ++ (int)
208 |             {
209 |                 iterator before = *this;
210 |                 ++(*this);
211 |                 return before;
212 |             }
213 |             iterator& operator -- ()
214 |             {
215 |                 utf8::unchecked::prior(it);
216 |                 return *this;
217 |             }
218 |             iterator operator -- (int)
219 |             {
220 |                 iterator before = *this;
221 |                 --(*this);
222 |                 return before;
223 |             }
224 |           }; // class iterator
222 | 
223 |     } // namespace utf8::unchecked
224 | } // namespace utf8 
225 | 
226 | 
227 | #endif // header guard
228 | 
229 | 


--------------------------------------------------------------------------------
/src/mono/buffered_map.cpp:
--------------------------------------------------------------------------------
 1 | 
 2 | #include "buffered_map.h"
 3 | #include "../utils/compression.h"
 4 | 
 5 | #include <boost/iostreams/filtering_stream.hpp>
 6 | #include <boost/iostreams/device/file.hpp>
 7 | #include <boost/iostreams/filter/gzip.hpp>
 8 | #include <boost/filesystem.hpp>
 9 | 
10 | 
11 | #include <iostream>
12 | #include <string>
13 | 
14 | 
15 | namespace mono {
16 | 
17 |     // Stores the output folder and the entry limit; file numbering starts at 0.
18 |     buffered_map::buffered_map(std::string output_folder_, unsigned long buffer_size_)
19 |         : output_folder(output_folder_),
20 |           buffer_size(buffer_size_),
21 |           file_no(0)
22 |     {
23 |     }
20 | 
21 |     // Adds `value` to the counter stored under `key`.
22 |     // Before touching the map, the capacity is checked: if size() + 1 would exceed
23 |     // buffer_size, the current contents are flushed to disk and the map is emptied.
24 |     void buffered_map::add(std::pair<std::string, std::string> key, long value) {
25 |       const bool over_capacity = domain_map.size() + 1 > buffer_size;
26 |       if (over_capacity) {
27 |         flush();
28 |         domain_map.clear();
29 |       }
30 | 
31 |       // operator[] value-initialises a missing counter to 0, so a single statement
32 |       // covers both the "new key" and the "existing key" case.
33 |       domain_map[key] += value;
34 |     }
35 | 
36 |     // Writes every entry of domain_map to a new gzip file
37 |     // "<output_folder>/stats/langstats.<n>.gz", one "first<TAB>second<TAB>count" line each.
38 |     // The map itself is left untouched; callers decide whether to clear it.
39 |     void buffered_map::flush() {
40 |       namespace io = boost::iostreams;
41 | 
42 |       // advance file_no until the corresponding path is not taken yet
43 |       const std::string path_prefix = output_folder + "/stats/langstats.";
44 |       std::string target_path = path_prefix + std::to_string(file_no) + ".gz";
45 |       while (boost::filesystem::exists(target_path)) {
46 |         ++file_no;
47 |         target_path = path_prefix + std::to_string(file_no) + ".gz";
48 |       }
49 | 
50 |       // gzip filter in front of a binary file sink
51 |       io::filtering_streambuf<io::output> gz_chain;
52 |       gz_chain.push(io::gzip_compressor());
53 |       gz_chain.push(io::file_sink(target_path, std::ofstream::out | std::ofstream::binary));
54 | 
55 |       std::ostream out(&gz_chain);
56 |       for (const auto &entry : domain_map) {
57 |         // std::to_string keeps the number formatting independent of the stream's locale
58 |         const std::string count = std::to_string(entry.second);
59 |         out << entry.first.first << "\t" << entry.first.second << "\t" << count << "\n";
60 |       }
61 | 
62 |       // the next flush starts probing after the file just written
63 |       ++file_no;
64 |     }
65 | 
66 |     size_t buffered_map::size() { // Number of entries currently held in domain_map.
67 |       return domain_map.size();
68 |     }
69 | 
70 | 
71 | }


--------------------------------------------------------------------------------
/src/mono/buffered_map.h:
--------------------------------------------------------------------------------
 1 | 
 2 | #ifndef EXTRACTOR_MONO_BUFFERED_MAP_H
 3 | #define EXTRACTOR_MONO_BUFFERED_MAP_H
 4 | 
 5 | #include <iostream>
 6 | #include <fstream>
 7 | #include <map>
 8 | 
 9 | namespace mono {
10 | 
11 |     class buffered_map { // Accumulates long counters keyed by a pair of strings and can write them out via flush().
12 |     public:
13 | 
14 |         std::string output_folder; // base folder; initialised from the constructor argument
15 | 
16 |         unsigned long buffer_size; // entry limit; initialised from the constructor argument
17 |         int file_no; // running number used when naming output files
18 | 
19 |         std::map<std::pair<std::string, std::string>, long> domain_map; // the counters themselves
20 | 
21 |         buffered_map(std::string output_folder_, unsigned long buffer_size_); // stores both arguments
22 | 
23 |         void add(std::pair<std::string, std::string> key, long value); // adds `value` to the counter for `key`
24 | 
25 |         void flush(); // writes the current contents of domain_map to a file
26 | 
27 |         size_t size(); // number of entries in domain_map
28 | 
29 | 
30 |     private:
31 | 
32 |     };
33 | 
34 | }
35 | 
36 | #endif //EXTRACTOR_MONO_BUFFERED_MAP_H
37 | 


--------------------------------------------------------------------------------
/src/mono/filters/header.h:
--------------------------------------------------------------------------------
 1 | 
 2 | #ifndef EXTRACTOR_MONO_FILTERS_HEADER_H
 3 | #define EXTRACTOR_MONO_FILTERS_HEADER_H
 4 | 
 5 | #include "string_util.h"
 6 | 
 7 | #include <string>
 8 | 
 9 | 
10 | namespace mono {
11 | 
12 |     namespace filters {
13 | 
14 |         // Value object built from one space-separated header line. It keeps the text that
15 |         // follows the "tld:", "uri:" and "encoding:" prefixes; other tokens are ignored.
16 |         // If a prefix occurs more than once, the last occurrence wins.
17 |         class Header {
18 |         public:
19 |             explicit Header(const std::string &header) {
20 |               const std::string tld_key = "tld:";
21 |               const std::string uri_key = "uri:";
22 |               const std::string encoding_key = "encoding:";
23 | 
24 |               for (const auto &field : StringUtil::Split(header, ' ')) {
25 |                 // compare(0, n, key) == 0 is true exactly when `field` starts with `key`
26 |                 if (field.compare(0, tld_key.size(), tld_key) == 0) {
27 |                   tld_ = field.substr(tld_key.size());
28 |                   continue;
29 |                 }
30 |                 if (field.compare(0, uri_key.size(), uri_key) == 0) {
31 |                   uri_ = field.substr(uri_key.size());
32 |                   continue;
33 |                 }
34 |                 if (field.compare(0, encoding_key.size(), encoding_key) == 0) {
35 |                   encoding_ = field.substr(encoding_key.size());
36 |                 }
37 |               }
38 |             }
39 | 
40 |             // Text after "tld:", or empty if the header had no such token.
41 |             const std::string get_tld() const { return tld_; }
42 | 
43 |             // Text after "uri:", or empty if the header had no such token.
44 |             const std::string get_uri() const { return uri_; }
45 | 
46 |             // Text after "encoding:", or empty if the header had no such token.
47 |             const std::string get_encoding() const { return encoding_; }
48 | 
49 |         private:
50 |             std::string uri_;
51 |             std::string tld_;
52 |             std::string encoding_;
53 |         };
39 | 
40 |     }
41 | 
42 | }
43 | 
44 | 
45 | #endif //EXTRACTOR_MONO_FILTERS_HEADER_H
46 | 


--------------------------------------------------------------------------------
/src/mono/filters/langcollectorfilter.cpp:
--------------------------------------------------------------------------------
 1 | 
 2 | #include "langcollectorfilter.h"
 3 | #include "../../utils/common.h"
 4 | 
 5 | #include <boost/algorithm/string/trim.hpp>
 6 | #include <boost/algorithm/string/predicate.hpp>
 7 | #include <boost/algorithm/string.hpp>
 8 | #include <boost/regex.hpp>
 9 | 
10 | #include <iostream>
11 | #include <map>
12 | #include <fstream>
13 | 
14 | 
15 | namespace mono {
16 | 
17 |     namespace filters {
18 | 
19 |         // Starts with no pending header or text and hands the output folder and
20 |         // compression option on to the language sink.
21 |         LangCollectorFilter::LangCollectorFilter(std::string output_folder_, utils::compression_option compr_)
22 |                 : boost::iostreams::line_filter(true),
23 |                   header(),
24 |                   text_buffer(),
25 |                   ls(output_folder_, compr_) {
26 |         }
23 | 
24 |         // Consumes one input line and never forwards anything downstream (always returns "").
25 |         //  - blank lines (after trimming) are ignored;
26 |         //  - a line starting with magic_number is a record header: the record collected so
27 |         //    far is handed to the language sink and the new header becomes the current one;
28 |         //  - any other line is appended (trimmed, newline-terminated) to the current text.
29 |         std::string LangCollectorFilter::do_filter(const std::string &str_) {
30 |           const std::string line = boost::trim_copy(str_);
31 | 
32 |           if (line.empty()) {
33 |             return "";
34 |           }
35 | 
36 |           if (!boost::starts_with(line, magic_number)) {
37 |             text_buffer += line + "\n";
38 |             return "";
39 |           }
40 | 
41 |           // Flush the previous record. Its return value only says whether that record was
42 |           // dropped for lacking a language; the new header must be stored either way.
43 |           // Previously a dropped record made this function discard the *next* header too,
44 |           // so that record's text piled up without a header and leaked into the one after it.
45 |           output_to_langsink();
46 |           header = line;
47 | 
48 |           return "";
49 |         }
47 | 
48 |         // Returns the language code found in header line `str`, i.e. the text after
49 |         // language_guard with surrounding whitespace removed, or "" when none is found.
50 |         std::string LangCollectorFilter::parse_language(std::string const &str) {
51 |           const std::string tagged = find_language(str);
52 |           if (tagged.empty()) {
53 |             return "";
54 |           }
55 | 
56 |           // drop the leading guard, then trim what is left
57 |           return boost::trim_copy(tagged.substr(language_guard.length()));
58 |         }
58 | 
59 |         // Searches header line `str` (after skipping magic_number.length() characters) for
60 |         // language_guard followed by an alphanumeric token. Returns the whole matched text,
61 |         // guard and surrounding whitespace included, or "" when there is no match.
62 |         std::string LangCollectorFilter::find_language(std::string const &str) {
63 |           // substr() would throw std::out_of_range on input shorter than the magic number
64 |           if (str.length() < magic_number.length()) {
65 |             return "";
66 |           }
67 | 
68 |           const std::string after_magic = str.substr(magic_number.length());
69 |           const boost::regex expr{language_guard + "\\s*[a-zA-Z0-9]+\\s*"};
70 |           boost::smatch match;
71 | 
72 |           // a successful search always provides sub-match 0, the whole match
73 |           if (boost::regex_search(after_magic, match, expr)) {
74 |             return match.str(0);
75 |           }
76 | 
77 |           return "";
78 |         }
71 | 
72 |         // Hands the pending record (header line + collected text) to the language sink.
73 |         // Returns true only when a header was pending but carried no language; in that case
74 |         // the record is dropped and the header is reset. Returns false otherwise, including
75 |         // when nothing was pending. The text buffer is emptied whenever a header was pending.
76 |         bool LangCollectorFilter::output_to_langsink() {
77 |           if (header.empty()) {
78 |             return false;
79 |           }
80 | 
81 |           const std::string record = header + '\n' + text_buffer;
82 |           const std::string lang = parse_language(header);
83 |           text_buffer.erase();
84 | 
85 |           if (lang.empty()) {
86 |             header = "";
87 |             return true;
88 |           }
89 | 
90 |           ls.output(lang, record);
91 |           return false;
92 |         }
90 | 
91 |     }
92 | 
93 | }


--------------------------------------------------------------------------------
/src/mono/filters/langcollectorfilter.h:
--------------------------------------------------------------------------------
 1 | 
 2 | #ifndef EXTRACTOR_MONO_FILTERS_LANGCOLLECTOR_H
 3 | #define EXTRACTOR_MONO_FILTERS_LANGCOLLECTOR_H
 4 | 
 5 | #include "../language_sink.h"
 6 | 
 7 | #include <boost/iostreams/filter/line.hpp>
 8 | #include <boost/iostreams/filter/aggregate.hpp>
 9 | #include <boost/iostreams/filter/gzip.hpp>
10 | #include <boost/iostreams/device/file_descriptor.hpp>
11 | #include <boost/iostreams/device/file.hpp>
12 | #include <boost/iostreams/device/back_inserter.hpp>
13 | #include <boost/iostreams/filtering_stream.hpp>
14 | #include <boost/filesystem/fstream.hpp>
15 | #include <boost/filesystem/operations.hpp>
16 | #include <boost/iostreams/copy.hpp>
17 | 
18 | #include <iostream>
19 | #include <string>
20 | #include <map>
21 | 
22 | 
23 | namespace mono {
24 | 
25 |     namespace filters {
26 | 
27 |         class LangCollectorFilter : public boost::iostreams::line_filter { // Line filter that gathers header/text records and passes them to a language_sink.
28 | 
29 |         public:
30 | 
31 |             std::string header; // header line of the record currently being collected
32 |             std::string text_buffer; // text lines collected for the current record
33 |             mono::language_sink ls; // receives finished records together with their language
34 | 
35 |             LangCollectorFilter(std::string output_folder_, utils::compression_option compr_); // both arguments are forwarded to `ls`
36 | 
37 |             template<typename Sink>
38 |             void close(Sink &snk, BOOST_IOS::openmode which) { // called when the stream is closed
39 |               output_to_langsink(); // flush text buffer
40 |               boost::iostreams::line_filter::close(snk, which); // then let the base class finish
41 |             }
42 | 
43 | 
44 |         private:
45 | 
46 |             const std::string magic_number = "df6fa1abb58549287111ba8d776733e9"; // prefix that marks a header line
47 |             const std::string language_guard = "language:"; // prefix of the language field inside a header
48 | 
49 |             std::string do_filter(const std::string &str); // per-line hook of line_filter
50 | 
51 |             std::string parse_language(std::string const &str); // language code from a header line, or ""
52 | 
53 |             std::string find_language(std::string const &str); // raw "language:..." match from a header line, or ""
54 | 
55 |             bool output_to_langsink(); // passes the pending record to `ls`; true if it was dropped for lacking a language
56 | 
57 |         };
58 | 
59 |     }
60 | 
61 | }
62 | 
63 | 
64 | #endif //EXTRACTOR_MONO_FILTERS_LANGCOLLECTOR_H
65 | 


--------------------------------------------------------------------------------
/src/mono/filters/langsplitfilter.cpp:
--------------------------------------------------------------------------------
  1 | 
  2 | #include "langsplitfilter.h"
  3 | #include "../buffered_map.h"
  4 | #include "../../utils/common.h"
  5 | #include "../../utils/logging.h"
  6 | #include "compact_lang_det.h"
  7 | #include <boost/algorithm/string/trim.hpp>
  8 | #include <boost/algorithm/string.hpp>
  9 | 
 10 | #include <iostream>
 11 | #include <sstream>
 12 | #include <string>
 13 | #include <vector>
 14 | #include <set>
 15 | #include <compact_lang_det.h>
 16 | 
 17 | 
 18 | namespace mono {
 19 | 
 20 |     namespace filters {
 21 | 
 22 |         // Sets up an empty filter: no pending record, zeroed reliability counters and a
 23 |         // statistics map limited to 1000000 entries. The CLD2 flag word is derived from `modes`.
 24 |         LangsplitFilter::LangsplitFilter(std::string output_folder_, bool print_stats_)
 25 |                 : boost::iostreams::line_filter(true),
 26 |                   print_stats(print_stats_),
 27 |                   bmap(output_folder_, 1000000),
 28 |                   output_folder(output_folder_),
 29 |                   header(),
 30 |                   text_buffer(),
 31 |                   num_reliable(0),
 32 |                   num_unreliable(0) {
 33 |           flags = get_flag(modes);
 34 |         }
 33 | 
 34 |         // Logs the reliable/unreliable counters on destruction, unless both are still zero.
 35 |         LangsplitFilter::~LangsplitFilter() {
 36 |           const bool counted_anything = num_reliable != 0 || num_unreliable != 0;
 37 |           if (counted_anything) {
 38 |             logging::log_reliable(output_folder, num_reliable, num_unreliable);
 39 |           }
 40 |         }
 41 | 
 42 |         // Handles one input line. Blank lines are skipped and ordinary lines are buffered
 43 |         // (both return ""). A line starting with magic_number closes the buffered record:
 44 |         // the result of PrintLanguageStats for it is returned and the line becomes the new header.
 45 |         std::string LangsplitFilter::do_filter(const std::string &str_) {
 46 |           const std::string line = boost::trim_copy(str_);
 47 | 
 48 |           if (line.empty()) {
 49 |             return "";
 50 |           }
 51 | 
 52 |           if (!boost::starts_with(line, magic_number)) {
 53 |             text_buffer += line + "\n";
 54 |             return "";
 55 |           }
 56 | 
 57 |           const std::string finished = PrintLanguageStats(flags, header, text_buffer);
 58 |           text_buffer.erase();
 59 |           header = line;
 60 |           return finished;
 61 |         }
 62 | 
 63 |         // Translates command-line style mode strings into the OR-ed CLD2 flag word.
 64 |         // Unknown strings are ignored; an empty list yields 0.
 65 |         int LangsplitFilter::get_flag(std::vector<std::string> modes) {
 66 |           int bits = 0;
 67 | 
 68 |           for (const std::string &mode : modes) {
 69 |             if (mode == "--scoreasquads") {
 70 |               bits |= CLD2::kCLDFlagScoreAsQuads;
 71 |             } else if (mode == "--html") {
 72 |               bits |= CLD2::kCLDFlagHtml;
 73 |             } else if (mode == "--cr") {
 74 |               bits |= CLD2::kCLDFlagCr;
 75 |             } else if (mode == "--verbose") {
 76 |               bits |= CLD2::kCLDFlagVerbose;
 77 |             } else if (mode == "--echo") {
 78 |               bits |= CLD2::kCLDFlagEcho;
 79 |             } else if (mode == "--besteffort") {
 80 |               bits |= CLD2::kCLDFlagBestEffort;
 81 |             }
 82 |           }
 83 | 
 84 |           return bits;
 85 |         }
 83 | 
 84 |         // Runs CLD2 language detection over `buffer` (the text of one record) and returns the
 85 |         // record split into per-language chunks, each formatted by output_chunk().
 86 |         //   flags  - CLD2 flag word passed straight to the detector
 87 |         //   header - header line of the record; its "uri:" field supplies the TLD hint
 88 |         //   buffer - text belonging to that header
 89 |         // Returns "" when header or buffer is empty, or when detection is not reliable.
 90 |         // Side effects: bumps num_reliable / num_unreliable and, if print_stats is set,
 91 |         // adds each chunk's byte count to bmap under (domain, language code).
 92 |         std::string LangsplitFilter::PrintLanguageStats(const int flags, const std::string &header,
 93 |                                                         const std::string &buffer) {
 94 |           if (header.empty() || buffer.empty()) {
 95 |             return "";
 96 |           }
 97 | 
 98 |           const Header header_values(header);
 99 |           const std::string uri = header_values.get_uri();
100 |           utils::parse_uri parsed_uri(uri);
101 | 
102 |           // Keep the TLD in a local string: CLDHints only stores a raw pointer, and taking
103 |           // c_str() of a temporary returned by get_tld() would leave it dangling before
104 |           // the detector reads it.
105 |           const std::string tld = parsed_uri.get_tld();
106 | 
107 |           CLD2::CLDHints cld_hints = {NULL, NULL, UNKNOWN_ENCODING,
108 |                                       CLD2::UNKNOWN_LANGUAGE};
109 |           if (!tld.empty()) {
110 |             cld_hints.tld_hint = tld.c_str();
111 |           }
112 | 
113 |           // detector outputs; initialised so they never hold indeterminate values
114 |           CLD2::Language language3[3];
115 |           int percent3[3];
116 |           double normalized_score3[3];
117 |           int valid_prefix_bytes = 0;
118 | 
119 |           CLD2::ResultChunkVector resultchunkvector;
120 |           int text_bytes = 0;
121 |           bool is_reliable = false;
122 | 
123 |           CLD2::ExtDetectLanguageSummaryCheckUTF8(
124 |                   buffer.c_str(), buffer.size(), true, &cld_hints, flags,
125 |                   language3, percent3, normalized_score3, &resultchunkvector, &text_bytes,
126 |                   &is_reliable, &valid_prefix_bytes);
127 | 
128 |           if (!is_reliable) {
129 |             ++num_unreliable;
130 |             return "";
131 |           }
132 | 
133 |           ++num_reliable;
134 | 
135 |           std::stringstream ss;
136 |           for (int i = 0; i < static_cast<int>(resultchunkvector.size()); ++i) {
137 |             const CLD2::ResultChunk &rc = resultchunkvector[i];
138 |             const CLD2::Language rc_lang = static_cast<CLD2::Language>(rc.lang1);
139 | 
140 |             // chunks without a detected language are not emitted
141 |             if (rc_lang == CLD2::UNKNOWN_LANGUAGE) {
142 |               continue;
143 |             }
144 | 
145 |             const std::string lang_code = LanguageCode(rc_lang);
146 |             ss << output_chunk(buffer, rc, header, lang_code);
147 | 
148 |             if (print_stats) {
149 |               bmap.add(std::make_pair(parsed_uri.get_domain(), lang_code), static_cast<long>(rc.bytes));
150 |             }
151 |           }
152 | 
153 |           return ss.str();
154 |         }
139 | 
140 |         // Formats one detected chunk as
141 |         //   "<header> language:<lang_code> bytes:<rc.bytes>\n<chunk text>\n"
142 |         // where the chunk text is the rc.bytes bytes of `buffer` starting at rc.offset.
143 |         std::string
144 |         LangsplitFilter::output_chunk(const std::string buffer, const CLD2::ResultChunk &rc, const std::string header,
145 |                                       const std::string lang_code) {
146 |           const std::string chunk_text = buffer.substr(rc.offset, rc.bytes);
147 | 
148 |           std::stringstream record;
149 |           record << header << " language:" << lang_code << " bytes:" << rc.bytes << "\n";
150 |           record << chunk_text << "\n";
151 |           return record.str();
152 |         }
149 | 
150 |     }
151 | 
152 | }


--------------------------------------------------------------------------------
/src/mono/filters/langsplitfilter.h:
--------------------------------------------------------------------------------
 1 | 
 2 | #ifndef EXTRACTOR_MONO_FILTERS_LANGSPLITFILTER_H
 3 | #define EXTRACTOR_MONO_FILTERS_LANGSPLITFILTER_H
 4 | 
 5 | #include "header.h"
 6 | #include "compact_lang_det.h"
 7 | #include "../buffered_map.h"
 8 | 
 9 | #include <boost/iostreams/filter/line.hpp>
10 | #include <iostream>
11 | #include <sstream>
12 | #include <string>
13 | #include <vector>
14 | 
15 | 
16 | typedef CLD2::int32 Encoding; // integer type used for the encoding value below
17 | static const Encoding UNKNOWN_ENCODING = 0; // value LangsplitFilter places in the encoding slot of its CLD2::CLDHints initialiser
18 | 
19 | namespace mono {
20 | 
21 |     namespace filters {
22 | 
23 |         class LangsplitFilter : public boost::iostreams::line_filter { // Line filter that splits each header/text record into per-language chunks using CLD2.
24 | 
25 |         public:
26 | 
27 |             int flags; // CLD2 flag word, computed from `modes` in the constructor
28 |             bool print_stats; // when true, per-chunk byte counts are added to `bmap`
29 |             buffered_map bmap; // (domain, language) -> bytes statistics
30 | 
31 |             std::string output_folder; // passed to the reliability log in the destructor
32 |             std::string header; // header line of the record currently being collected
33 |             std::string text_buffer; // text lines collected for the current record
34 | 
35 |             long num_reliable; // records for which detection was reported reliable
36 |             long num_unreliable; // records for which detection was reported unreliable
37 | 
38 |             std::vector<std::string> modes = std::vector<std::string>(); // mode strings translated by get_flag(); empty by default
39 | 
40 |             LangsplitFilter(std::string output_folder_, bool print_stats_);
41 | 
42 |             virtual ~LangsplitFilter(); // logs the two counters unless both are zero
43 | 
44 |             template<typename Sink>
45 |             void close(Sink &snk, BOOST_IOS::openmode which) { // called when the stream is closed
46 |               boost::iostreams::line_filter::close(snk, which); // base class finishes first
47 | 
48 |               string_type line = PrintLanguageStats(flags, header, text_buffer); // process the last, still-buffered record
49 |               std::streamsize amt = static_cast<std::streamsize>(line.size());
50 |               boost::iostreams::write_if(snk, line.data(), amt); // and write its output directly to the sink
51 | 
52 |               bmap.flush(); // write out the remaining statistics
53 |             }
54 | 
55 | 
56 |         private:
57 | 
58 |             const std::string magic_number = "df6fa1abb58549287111ba8d776733e9"; // prefix that marks a header line
59 | 
60 |             std::string do_filter(const std::string &str); // per-line hook of line_filter
61 | 
62 |             int get_flag(std::vector<std::string> modes); // OR-s together the CLD2 flags named in `modes`
63 | 
64 |             std::string PrintLanguageStats(const int flags, const std::string &header,
65 |                                            const std::string &buffer); // detects languages in `buffer` and returns the formatted chunks
66 | 
67 |             std::string output_chunk(const std::string buffer, const CLD2::ResultChunk &rc, const std::string header,
68 |                                      const std::string lang_code); // formats one chunk as header line plus chunk text
69 | 
70 |         };
71 | 
72 |     }
73 | 
74 | }
75 | 
76 | #endif //EXTRACTOR_MONO_FILTERS_LANGSPLITFILTER_H
77 | 


--------------------------------------------------------------------------------
/src/mono/filters/string_util.h:
--------------------------------------------------------------------------------
 1 | 
 2 | #ifndef EXTRACTOR_MONO_FILTERS_STRING_UTIL_H
 3 | #define EXTRACTOR_MONO_FILTERS_STRING_UTIL_H
 4 | 
 5 | #include "boost/algorithm/string/trim.hpp"
 6 | 
 7 | #include <algorithm>
 8 | #include <cctype>
 9 | #include <sstream>
10 | #include <string>
11 | #include <vector>
11 | 
12 | 
13 | namespace mono {
14 | 
15 |     namespace filters {
16 | 
17 |         // Stateless string helpers. All character classification goes through
18 |         // unsigned char: passing a negative char (any byte >= 0x80 on platforms where char
19 |         // is signed, e.g. UTF-8 text) to the <cctype> functions is undefined behaviour.
20 |         class StringUtil {
21 |         public:
22 |             // True when `s` ends with `end` (an empty `end` always matches).
23 |             static bool EndsWith(std::string const &s, const std::string &end) {
24 |               return (s.length() >= end.length() && s.compare(s.length() - end.length(), end.length(), end) == 0);
25 |             }
26 | 
27 |             // Returns a copy of `s` with every byte mapped through std::tolower.
28 |             static std::string ToLower(const std::string &s) {
29 |               std::string lower(s);
30 |               std::transform(lower.begin(), lower.end(), lower.begin(),
31 |                              [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
32 |               return lower;
33 |             }
34 | 
35 |             // Splits `s` at `delim` and returns the non-empty pieces in order.
36 |             static std::vector<std::string> Split(const std::string &s, const char delim = ' ') {
37 |               std::vector<std::string> tokens;
38 |               std::stringstream ss(s);
39 |               std::string token;
40 |               while (std::getline(ss, token, delim)) {
41 |                 if (!token.empty()) {
42 |                   tokens.push_back(token);
43 |                 }
44 |               }
45 |               return tokens;
46 |             }
47 | 
48 |             // Collapses every run of whitespace characters in `*s` to its first character, in place.
49 |             static void TrimRepeatedSpace(std::string *s) {
50 |               s->erase(std::unique(s->begin(), s->end(),
51 |                                    [](unsigned char a, unsigned char b) {
52 |                                      return std::isspace(a) && std::isspace(b);
53 |                                    }),
54 |                        s->end());
55 |             }
56 | 
57 |             // Processes `s` line by line: trims each line, drops lines that end up empty,
58 |             // collapses whitespace runs inside the rest and joins them with '\n'
59 |             // (every kept line, including the last, is newline-terminated).
60 |             static std::string TrimRepeatedWhitespace(const std::string &s) {
61 |               std::stringstream ss(s);
62 |               std::ostringstream oss;
63 |               std::string line;
64 |               while (std::getline(ss, line)) {
65 |                 line = boost::trim_copy(line);
66 |                 if (!line.empty()) {
67 |                   TrimRepeatedSpace(&line);
68 |                   oss << line << "\n";
69 |                 }
70 |               }
71 |               return oss.str();
72 |             }
73 |         };
61 | 
62 |     }
63 | 
64 | }
65 | 
66 | 
67 | #endif //EXTRACTOR_MONO_FILTERS_STRING_UTIL_H
68 | 


--------------------------------------------------------------------------------
/src/mono/filters/warcfilter.cpp:
--------------------------------------------------------------------------------
 1 | 
 2 | #include "warcfilter.h"
 3 | #include "../../utils/common.h"
 4 | 
 5 | #include <iostream>
 6 | #include <sstream>
 7 | #include <boost/algorithm/string/predicate.hpp>
 8 | #include <boost/algorithm/string.hpp>
 9 | #include <boost/regex.hpp>
10 | 
11 | 
12 | namespace mono {
13 | 
14 |     namespace filters {
15 | 
16 |         // Line-by-line WARC reader driven by `state`:
17 |         //   0 - waiting for a record start (a line beginning with warc_guard)
18 |         //   1 - inside the WARC header, looking for the target-URI line
19 |         //   2 - URI emitted, skipping the rest of the header up to the blank line
20 |         //   3 - record content: non-empty lines are passed through until the next record start
21 |         // Returns the text to forward downstream for this line ("" for nothing).
22 |         std::string WARCFilter::do_filter(const std::string &str_) {
23 |           std::string str = str_;
24 | 
25 |           // input may use CRLF line endings; only the '\n' was removed by the line filter
26 |           if (!str.empty() && str.back() == '\r') {
27 |             str.pop_back();
28 |           }
29 | 
30 |           switch (state) {
31 | 
32 |             case 0:
33 |               // PARSE HEADER IDENTIFIER
34 |               if (boost::starts_with(str, warc_guard)) {
35 |                 state = 1;
36 |               }
37 |               break;
38 | 
39 |             case 1:
40 |               // PARSE and OUTPUT URI HEADER
41 |               if (boost::starts_with(str, uri_guard)) {
42 |                 const std::string found_header = parse_uri_header(str);
43 |                 if (found_header.empty()) {
44 |                   // unusable URI line: give up on this record
45 |                   state = 0;
46 |                 } else {
47 |                   state = 2;
48 |                   return found_header;
49 |                 }
50 |               } else if (str.empty()) {
51 |                 // header ended without a target URI: give up on this record
52 |                 state = 0;
53 |               }
54 |               break;
55 | 
56 |             case 2:
57 |               // PARSE END-OF-HEADER NEW LINE
58 |               if (str.empty()) {
59 |                 state = 3;
60 |               }
61 |               break;
62 | 
63 |             case 3:
64 |               // PARSE AND OUTPUT CONTENT UNTIL HEADER IDENTIFIER FOUND
65 |               // use the same guard as state 0 so both record-start checks stay in sync
66 |               if (boost::starts_with(str, warc_guard)) {
67 |                 state = 1;
68 |               } else if (!str.empty()) {
69 |                 utils::fix_utf8_string(str);
70 |                 return str + '\n';
71 |               }
72 |               break;
73 |           }
74 | 
75 |           return "";
76 |         }
66 | 
67 |         // Builds the output header line "<magic_number> uri:<uri>\n";
68 |         // leading whitespace of `uri` is removed first.
69 |         std::string WARCFilter::make_header(std::string const &uri) {
70 |           const std::string trimmed_uri = boost::trim_left_copy(uri);
71 |           return magic_number + " uri:" + trimmed_uri + '\n';
72 |         }
72 | 
73 |         // Extracts the URI from a "WARC-Target-URI:" line and returns the header built by
74 |         // make_header() for it. `str` must start with uri_guard (the caller checks this).
75 |         // Returns "" when the remainder of the line is not a single whitespace-free token.
76 |         std::string WARCFilter::parse_uri_header(std::string const &str) {
77 |           const std::string after_guard = str.substr(uri_guard.length());
78 |           const boost::regex expr{"^\\s*([^\\s]+)\\s*$"};
79 |           boost::smatch match;
80 | 
81 |           if (boost::regex_match(after_guard, match, expr)) {
82 |             // Use capture group 1, the bare URI. Sub-match 0 is the whole line, so any
83 |             // trailing whitespace would end up inside the emitted header.
84 |             return make_header(match.str(1));
85 |           }
86 | 
87 |           return "";
88 |         }
85 | 
86 |     }
87 | 
88 | }
89 | 


--------------------------------------------------------------------------------
/src/mono/filters/warcfilter.h:
--------------------------------------------------------------------------------
 1 | 
 2 | #ifndef EXTRACTOR_MONO_FILTERS_WARCFILTER_H
 3 | #define EXTRACTOR_MONO_FILTERS_WARCFILTER_H
 4 | 
 5 | #include <boost/iostreams/filter/line.hpp>
 6 | #include <string>
 7 | 
 8 | 
 9 | namespace mono {
10 | 
11 |     namespace filters {
12 | 
13 |         class WARCFilter : public boost::iostreams::line_filter { // Line filter that turns WARC records into "<magic> uri:<uri>" header lines followed by the record content.
14 | 
15 |         public:
16 | 
17 |             int state; // current position in do_filter()'s state machine (0..3)
18 | 
19 |             WARCFilter() : boost::iostreams::line_filter(true), state(0) {}; // starts in state 0
20 | 
21 |         private:
22 | 
23 |             const std::string warc_guard = "WARC/1.0"; // prefix of the line that starts a WARC record
24 |             const std::string uri_guard = "WARC-Target-URI:"; // prefix of the header line carrying the URI
25 |             const std::string magic_number = "df6fa1abb58549287111ba8d776733e9"; // marker placed at the start of each emitted header line
26 | 
27 |             std::string do_filter(const std::string &str); // per-line hook of line_filter
28 | 
29 |             std::string make_header(std::string const &uri); // "<magic_number> uri:<uri>\n"
30 | 
31 |             std::string parse_uri_header(std::string const &str); // header for a target-URI line, or "" if it cannot be parsed
32 |         };
33 | 
34 |     }
35 | 
36 | }
37 | 
38 | #endif //EXTRACTOR_MONO_FILTERS_WARCFILTER_H
39 | 


--------------------------------------------------------------------------------
/src/mono/language_sink.cpp:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | #include "language_sink.h"
 4 | #include "../utils/compression.h"
 5 | #include <boost/iostreams/device/file.hpp>
 6 | #include <boost/filesystem.hpp>
 7 | #include <boost/format.hpp>
 8 | #include <boost/thread.hpp>
 9 | 
10 | #include <iostream>
11 | #include <unordered_map>
12 | #include <memory>
13 | 
14 | 
15 | namespace mono {
16 | 
17 |     // Stores the output folder and the compression option; no files are opened yet.
18 |     language_sink::language_sink(std::string output_folder_, utils::compression_option compr_)
19 |             : output_folder(output_folder_),
20 |               compr(compr_) {
21 |     }
19 | 
20 |     // Appends `text` to the output stream of language `lang`,
21 |     // creating that stream on first use.
22 |     void language_sink::output(std::string const &lang, std::string const &text) {
23 |       const bool known_language = sinkmap.find(lang) != sinkmap.end();
24 |       if (!known_language) {
25 |         add_language_sink(lang);
26 |       }
27 | 
28 |       // temporary ostream on top of the language's stream buffer
29 |       std::ostream out(sinkmap.at(lang).get());
30 |       out.write(text.c_str(), text.size());
31 |     }
31 | 
32 |     // Registers a new output chain for `lang`: the compression filter selected by `compr`
33 |     // followed by a file sink that appends to the language's output file.
34 |     void language_sink::add_language_sink(std::string lang) {
35 |       // add new stream to language map
36 |       auto out = std::make_shared<ostreambuf>();
37 |       sinkmap.insert(std::make_pair(lang, out));
38 | 
39 |       // Open in append mode; add the binary flag for every compressed format.
40 |       // The previous condition compared against utils::gzip four times (copy-paste),
41 |       // so other compressed outputs were opened in text mode.
42 |       // NOTE(review): assumes utils::none is the only option that writes plain text - confirm
43 |       // against utils::compression_option.
44 |       std::ios_base::openmode flags = std::ofstream::app;
45 |       if (compr != utils::none) {
46 |         flags |= std::ofstream::binary;
47 |       }
48 | 
49 |       // add file sink with compression
50 |       utils::add_compression(sinkmap.at(lang), compr);
51 |       const std::string ofilesink_path = get_langfile_path(output_folder, lang).string();
52 |       sinkmap.at(lang)->push(boost::iostreams::file_sink(ofilesink_path, flags));
53 |     }
48 | 
49 |     // Path of the output file for `lang` inside `folder`:
50 |     // "<folder>/text.<lang>.<extension for the configured compression>".
51 |     boost::filesystem::path language_sink::get_langfile_path(std::string folder, std::string lang) {
52 |       boost::format pattern("%s/text.%s.%s");
53 |       pattern % folder % lang % get_compression_extension(compr);
54 | 
55 |       const std::string file_path = pattern.str();
56 |       return boost::filesystem::path(file_path);
57 |     }
53 | 
54 | }


--------------------------------------------------------------------------------
/src/mono/language_sink.h:
--------------------------------------------------------------------------------
 1 | 
 2 | #ifndef EXTRACTOR_MONO_LANGUAGE_SINK_H
 3 | #define EXTRACTOR_MONO_LANGUAGE_SINK_H
 4 | 
 5 | #include "../utils/compression.h"
 6 | #include <boost/iostreams/filtering_stream.hpp>
 7 | #include <boost/iostreams/device/file_descriptor.hpp>
 8 | #include <boost/iostreams/device/file.hpp>
 9 | #include <boost/filesystem.hpp>
10 | #include <string>
11 | #include <unordered_map>
12 | 
13 | 
14 | namespace mono {
15 | 
16 |     typedef boost::iostreams::filtering_streambuf<boost::iostreams::output> ostreambuf; // output filter chain type; language_sink keeps one per language
17 | 
18 |     class language_sink { // Writes text to one output file per language, opening each file on first use.
19 |     public:
20 | 
21 |         std::string output_folder; // folder in which the per-language files are created
22 |         utils::compression_option compr; // compression applied to every output file
23 | 
24 |         language_sink(std::string output_folder_, utils::compression_option compr_); // stores both arguments
25 | 
26 |         std::unordered_map<std::string, std::shared_ptr<ostreambuf> > sinkmap; // language code -> output chain for that language
27 | 
28 |         void output(std::string const &lang, std::string const &text); // writes `text` to the chain of `lang`, creating it if needed
29 | 
30 | 
31 |     private:
32 | 
33 |         void add_language_sink(std::string lang); // creates and registers the output chain for `lang`
34 | 
35 |         boost::filesystem::path get_langfile_path(std::string folder, std::string lang); // "<folder>/text.<lang>.<ext>"
36 | 
37 |     };
38 | 
39 | }
40 | 
41 | 
42 | #endif //EXTRACTOR_MONO_LANGUAGE_SINK_H
43 | 


--------------------------------------------------------------------------------
/src/mono/mono.cpp:
--------------------------------------------------------------------------------
  1 | 
  2 | #include "worker.h"
  3 | #include "../utils/logging.h"
  4 | #include "../utils/common.h"
  5 | 
  6 | #include <boost/program_options.hpp>
  7 | #include <boost/algorithm/string/trim.hpp>
  8 | #include <boost/filesystem.hpp>
  9 | #include <iostream>
 10 | #include <thread>
 11 | #include <sstream>
 12 | #include <sys/resource.h>
 13 | 
 14 | 
 15 | namespace po = boost::program_options;
 16 | 
 17 | typedef utils::shared_vector<std::string> shared_vector_string;
 18 | 
 19 | namespace mono {
 20 | 
 21 |     // Returns the current (soft) limit on open file descriptors of this process,
 22 |     // or -1 when getrlimit() fails.
 23 |     // NOTE(review): rlim_cur is converted to long as-is; an "unlimited" value
 24 |     // (RLIM_INFINITY) may not be representable - confirm how callers should treat it.
 25 |     long get_ulimit() {
 26 |       struct rlimit nofile;
 27 |       const int status = getrlimit(RLIMIT_NOFILE, &nofile);
 28 | 
 29 |       return status == 0 ? (long) nofile.rlim_cur : -1;
 30 |     }
 30 | 
 31 |     // Reads one path per line from standard input into `data`. Lines are trimmed and
 32 |     // blank ones skipped. The container is reversed at the end so that taking
 33 |     // elements from the back yields the paths in their original order.
 34 |     void load_data_from_cin(shared_vector_string &data) {
 35 |       std::string raw_line;
 36 |       while (std::getline(std::cin, raw_line)) {
 37 |         std::string path = boost::trim_copy(raw_line);
 38 |         if (!path.empty()) {
 39 |           data.push(path);
 40 |         }
 41 |       }
 42 | 
 43 |       data.reverse();  // so that pop_back returns in the original order
 44 |     }
 39 | 
 40 |     // Reads the list of input files from stdin and processes it with `workers` threads.
 41 |     // Worker i (1-based) gets its own folder "<output_folder>/<i>", plus a "stats"
 42 |     // subfolder when print_stats is set. Blocks until all workers have finished.
 43 |     void
 44 |     start(int workers, bool curl, bool print_stats, std::string output_folder, utils::compression_option input_compr,
 45 |           utils::compression_option output_compr) {
 46 | 
 47 |       shared_vector_string pending_files;
 48 |       load_data_from_cin(pending_files);
 49 |       LOG_INFO << pending_files.size() << " files found to process.";
 50 | 
 51 |       utils::progress prog(pending_files.size());
 52 |       boost::thread_group pool;
 53 | 
 54 |       for (int worker_no = 1; worker_no <= workers; ++worker_no) {
 55 |         std::string worker_folder = output_folder + "/" + std::to_string(worker_no);
 56 | 
 57 |         boost::filesystem::create_directory(worker_folder);
 58 |         if (print_stats) {
 59 |           boost::filesystem::create_directory(worker_folder + "/stats");
 60 |         }
 61 | 
 62 |         // every worker shares the file list and the progress tracker
 63 |         pool.create_thread(
 64 |                 boost::bind(run_worker, &pending_files, &prog, curl, print_stats, worker_folder, input_compr,
 65 |                             output_compr));
 66 |       }
 67 | 
 68 |       pool.join_all();
 69 |       prog.finish();
 70 |     }
 63 | 
 64 | }
 65 | 
 66 | int main(int argc, char *argv[]) {
 67 |   logging::init();
 68 | 
 69 |   po::options_description desc("Allowed options");
 70 |   desc.add_options()
 71 |           ("help", "produce help message")
 72 |           ("curl", "uses curl to download remote files")
 73 |           ("print_stats", "prints language statistics")
 74 |           ("icompression", po::value<std::string>(), "set expected input compression")
 75 |           ("ocompression", po::value<std::string>(), "set output compression")
 76 |           ("workers", po::value<int>(), "set the number of workers")
 77 |           ("output", po::value<std::string>(), "set the output folder");
 78 | 
 79 |   po::variables_map vm;
 80 |   po::store(po::parse_command_line(argc, reinterpret_cast<const char *const *>(argv), desc), vm);
 81 |   po::notify(vm);
 82 | 
 83 |   if (vm.count("help")) {
 84 |     std::cout << desc << "\n";
 85 |     return 1;
 86 |   }
 87 | 
 88 |   if (vm.count("workers")) {
 89 |     if (vm["workers"].as<int>() <= 0) {
 90 |       LOG_ERROR << "The number of workers has to be >= 1";
 91 |       throw 11;
 92 |     }
 93 | 
 94 |     LOG_INFO << "The number of workers set to "
 95 |              << vm["workers"].as<int>() << ".";
 96 |   }
 97 | 
 98 |   // check system resources
 99 |   long curr_ulimit = mono::get_ulimit();
100 |   long sufficient_limit = 400 * vm["workers"].as<int>();
101 |   LOG_INFO << "Current limit of open file descriptors: " << curr_ulimit;
102 |   if (curr_ulimit == -1) {
103 |     LOG_ERROR << "Failed to read available system resources!";
104 |   } else if (curr_ulimit < sufficient_limit) {
105 |     LOG_ERROR << "The limit of open file descriptors is too low - should be at least: " << sufficient_limit;
106 |     throw 19;
107 |   }
108 | 
109 |   utils::compression_option input_compr;
110 |   utils::compression_option output_compr;
111 | 
112 |   if (vm.count("icompression")) {
113 |     input_compr = utils::string_to_compression_option(vm["icompression"].as<std::string>());
114 |     if (input_compr == utils::null || input_compr == utils::lzma) {
115 |       LOG_ERROR << "Unsupported input compression option: " << utils::compression_option_to_string(input_compr);
116 |       throw 11;
117 |     }
118 |     LOG_INFO << "Expecting input compression: " << utils::compression_option_to_string(input_compr);
119 |   } else {
120 |     input_compr = utils::none;
121 |     LOG_INFO << "Expecting uncompressed input files. ";
122 |   }
123 | 
124 |   if (vm.count("ocompression")) {
125 |     output_compr = utils::string_to_compression_option(vm["ocompression"].as<std::string>());
126 |     if (output_compr == utils::null) {
127 |       LOG_ERROR << "Unsupported output compression option: " << vm["ocompression"].as<std::string>();
128 |       throw 11;
129 |     }
130 |     LOG_INFO << "Setting output compression: " << utils::compression_option_to_string(output_compr);
131 |   } else {
132 |     output_compr = utils::none;
133 |     LOG_INFO << "Using no compression for output files. ";
134 |   }
135 | 
136 |   if (vm.count("output")) {
137 |     boost::filesystem::path output_dir(vm["output"].as<std::string>());
138 |     if (boost::filesystem::create_directory(output_dir)) {
139 |       LOG_INFO << "Outputting to: " << output_dir.string();
140 |     } else {
141 |       LOG_ERROR << "The output folder already exists!\n";
142 |       throw 17;
143 |     }
144 |   } else {
145 |     LOG_ERROR << "The output folder was not set!\n";
146 |     throw 10;
147 |   }
148 | 
149 |   if (vm.count("curl")) {
150 |     LOG_INFO << "Using curl to download remote files. ";
151 |   } else {
152 |     LOG_INFO << "Local files will be processed. ";
153 |   }
154 | 
155 |   if (vm.count("print_stats")) {
156 |     LOG_INFO << "Printing language statistics: On";
157 |   } else {
158 |     LOG_INFO << "Printing language statistics: Off";
159 |   }
160 | 
161 | 
162 |   mono::start(vm["workers"].as<int>(), vm.count("curl"), vm.count("print_stats"), vm["output"].as<std::string>(),
163 |               input_compr, output_compr);
164 | 
165 |   return 0;
166 | 
167 | }


--------------------------------------------------------------------------------
/src/mono/worker.cpp:
--------------------------------------------------------------------------------
  1 | 
  2 | #include "worker.h"
  3 | #include "filters/warcfilter.h"
  4 | #include "filters/langcollectorfilter.h"
  5 | #include "filters/langsplitfilter.h"
  6 | #include "../utils/curldownloader.h"
  7 | #include "../utils/common.h"
  8 | #include "../utils/compression.h"
  9 | #include "../utils/logging.h"
 10 | 
 11 | #include <boost/filesystem/fstream.hpp>
 12 | #include <boost/filesystem/operations.hpp>
 13 | #include <boost/iostreams/filter/aggregate.hpp>
 14 | #include <boost/iostreams/filtering_stream.hpp>
 15 | #include <boost/iostreams/copy.hpp>
 16 | #include <boost/iostreams/device/null.hpp>
 17 | #include <boost/algorithm/string/split.hpp>
 18 | #include <boost/algorithm/string/classification.hpp>
 19 | #include <curl/curl.h>
 20 | 
 21 | #include <string>
 22 | 
 23 | 
 24 | namespace mono {
 25 | 
 26 |     void run_worker(shared_vector_string *files_to_process,
 27 |                     utils::progress *prog, bool curl, bool print_stats, std::string output_folder,
 28 |                     utils::compression_option input_compr,
 29 |                     utils::compression_option output_compr) {
 30 | 
 31 |       while (files_to_process->size() > 0) {
 32 |         std::string path = files_to_process->pop();
 33 |         if (curl) {
 34 |           worker_curl(path, print_stats, output_folder, input_compr, output_compr);
 35 |         } else {
 36 |           worker_file(path, print_stats, output_folder, input_compr, output_compr);
 37 |         }
 38 | 
 39 |         prog->increment();
 40 | 
 41 |       }
 42 |     }
 43 | 
 44 |     void
 45 |     worker_file(std::string path, bool print_stats, std::string output_folder, utils::compression_option input_compr,
 46 |                 utils::compression_option output_compr) {
 47 | 
 48 |       std::ios_base::openmode flags = std::ofstream::in;
 49 |       if (input_compr == utils::gzip) {
 50 |         flags |= std::ofstream::binary;
 51 |       }
 52 | 
 53 |       std::ifstream input_file(path, flags);
 54 |       if (!boost::filesystem::exists(path)) {
 55 |         std::cerr << "File not found!" << std::endl;
 56 |         return;
 57 |       }
 58 | 
 59 |       boost::iostreams::filtering_streambuf<boost::iostreams::input> qin(input_file);
 60 |       boost::iostreams::filtering_streambuf<boost::iostreams::output> qout;
 61 | 
 62 |       add_decompression(&qout, input_compr);
 63 | 
 64 |       qout.push(filters::WARCFilter());
 65 |       qout.push(filters::LangsplitFilter(output_folder, print_stats));
 66 |       qout.push(filters::LangCollectorFilter(output_folder, output_compr));
 67 |       qout.push(boost::iostreams::null_sink());
 68 | 
 69 |       boost::iostreams::copy(qin, qout);
 70 |       logging::log_done(output_folder, path);
 71 | 
 72 |     }
 73 | 
 74 |     void
 75 |     worker_curl(std::string url, bool print_stats, std::string output_folder, utils::compression_option input_compr,
 76 |                 utils::compression_option output_compr) {
 77 | 
 78 |       boost::iostreams::filtering_streambuf<boost::iostreams::output> qout;
 79 | 
 80 |       add_decompression(&qout, input_compr);
 81 | 
 82 |       qout.push(filters::WARCFilter());
 83 |       qout.push(filters::LangsplitFilter(output_folder, print_stats));
 84 |       qout.push(filters::LangCollectorFilter(output_folder, output_compr));
 85 |       qout.push(boost::iostreams::null_sink());
 86 | 
 87 |       // split input on a single space
 88 |       std::vector<std::string> parsed_urls;
 89 |       boost::algorithm::split(parsed_urls, url, boost::algorithm::is_any_of(" "));
 90 | 
 91 |       std::size_t i = 0;
 92 |       std::ostream oqout(&qout);
 93 |       HTTPDownloader downloader;
 94 |       CURLcode res = downloader.download(parsed_urls.at(i), &oqout);
 95 | 
 96 |       // download from other sources if failed
 97 |       while (res != CURLE_OK) {
 98 |         std::string error_text =
 99 |                 "Failed to download from: " + parsed_urls.at(i) + " with error: " + curl_easy_strerror(res);
100 |         logging::log_error(output_folder, error_text);
101 | 
102 |         if (++i >= parsed_urls.size()) break;
103 | 
104 |         res = downloader.download(parsed_urls.at(i), &oqout);
105 |       }
106 | 
107 |       // Not found in any source
108 |       if (res != CURLE_OK) {
109 |         std::string error_text = "CURL ERROR: " + parsed_urls.at(0) + " failed to process!";
110 |         logging::log_error(output_folder, error_text);
111 |       } else {
112 |         logging::log_done(output_folder, parsed_urls.at(i));
113 |       }
114 | 
115 |     }
116 | 
117 | }
118 | 


--------------------------------------------------------------------------------
/src/mono/worker.h:
--------------------------------------------------------------------------------
 1 | 
 2 | #ifndef EXTRACTOR_MONO_PRODUCER_H
 3 | #define EXTRACTOR_MONO_PRODUCER_H
 4 | 
 5 | #include "../utils/common.h"
 6 | #include <string>
 7 | 
 8 | 
 9 | typedef utils::shared_vector<std::string> shared_vector_string;
10 | 
11 | namespace mono {
12 | 
13 |     void run_worker(shared_vector_string *files_to_process,
14 |                     utils::progress *prog, bool curl, bool print_stats, std::string output_folder,
15 |                     utils::compression_option input_compr,
16 |                     utils::compression_option output_compr);
17 | 
18 |     void
19 |     worker_file(std::string path, bool print_stats, std::string output_folder, utils::compression_option input_compr,
20 |                 utils::compression_option output_compr);
21 | 
22 |     void
23 |     worker_curl(std::string url, bool print_stats, std::string output_folder, utils::compression_option input_compr,
24 |                 utils::compression_option output_compr);
25 | 
26 | };
27 | 
28 | #endif //EXTRACTOR_MONO_PRODUCER_H
29 | 


--------------------------------------------------------------------------------
/src/utils/common.cpp:
--------------------------------------------------------------------------------
  1 | 
  2 | #include "common.h"
  3 | #include "logging.h"
  4 | #include "../3rd_party/utf8/source/utf8.h"
  5 | 
  6 | #include <iostream>
  7 | #include <exception>
  8 | 
  9 | 
 10 | namespace utils {
 11 | 
 12 |     progress::progress(int total_) : current_progress(0), total(total_) {
 13 |       std::cout << std::endl;
 14 |       show_bar();
 15 |     }
 16 | 
 17 |     void progress::increment() {
 18 |       boost::lock_guard<boost::mutex> lock(mutex);
 19 |       ++current_progress;
 20 |       show_bar();
 21 |     }
 22 | 
 23 |     void progress::show_bar() {
 24 |       int bar_width = 70;
 25 | 
 26 |       std::cout << " " << current_progress << "/" << total << " [";
 27 |       int pos = bar_width * current_progress / float(total);
 28 |       for (int i = 0; i < bar_width; ++i) {
 29 |         if (i < pos)
 30 |           std::cout << "=";
 31 |         else if (i == pos)
 32 |           std::cout << ">";
 33 |         else
 34 |           std::cout << " ";
 35 |       }
 36 |       std::cout << "] " << int(current_progress / float(total) * 100.0) << " %\r";
 37 |       std::cout.flush();
 38 |     };
 39 | 
 40 |     void progress::finish() {
 41 |       std::cout << std::endl;
 42 |       LOG_INFO << "Done.";
 43 |     }
 44 | 
 45 |     void fix_utf8_string(std::string &str) {
 46 |       std::string temp;
 47 |       utf8::replace_invalid(str.begin(), str.end(), back_inserter(temp));
 48 |       str = temp;
 49 |     }
 50 | 
 51 |     parse_uri::parse_uri(const std::string &url) : uri_scheme(""), uri_domain(""), uri_tld(""), uri_path("") {
 52 |       try {
 53 |         int scheme_end = parse_uri::parse_scheme(0, url);
 54 |         int domain_end = parse_domains(scheme_end, url);
 55 |         parse_path(domain_end, url);
 56 |       } catch (std::exception &e) {
 57 |         std::cerr << "Error in parse_uri: " << e.what() << std::endl;
 58 |         uri_domain = url;
 59 |       }
 60 | 
 61 |     }
 62 | 
 63 |     int parse_uri::parse_scheme(int start, const std::string &uri) {
 64 |       std::string uri_stripped = uri.substr(start, uri.length() - start);
 65 |       int found = uri.find("://");
 66 |       if (found != static_cast<int>(std::string::npos)) {
 67 |         uri_scheme = uri.substr(0, found);
 68 |         return found + 3;
 69 |       }
 70 | 
 71 |       return 0;
 72 |     }
 73 | 
 74 |     int parse_uri::parse_domains(int start, const std::string &uri) {
 75 |       // parse authority
 76 |       int found_authority = uri.find("/", start);
 77 |       if (found_authority == static_cast<int>(std::string::npos))
 78 |         found_authority = uri.length();
 79 |       std::string domain = uri.substr(start, found_authority - start);
 80 | 
 81 |       // remove user information
 82 |       int found_user = domain.find("@");
 83 |       if (found_user != static_cast<int>(std::string::npos)) {
 84 |         domain = domain.substr(found_user + 1, domain.length() - found_user - 1);
 85 |       }
 86 | 
 87 |       // remove port
 88 |       int found_port = domain.find_last_of(":");
 89 |       if (found_port != static_cast<int>(std::string::npos)) {
 90 |         domain = domain.substr(0, found_port);
 91 |       }
 92 | 
 93 |       // parse tld
 94 |       int found_tld_start = domain.find_last_of(".");
 95 |       if (found_tld_start == static_cast<int>(std::string::npos))
 96 |         found_tld_start = 0;
 97 |       else
 98 |         ++found_tld_start;
 99 |       std::string tld = domain.substr(found_tld_start, domain.length() - found_tld_start);
100 | 
101 |       // save
102 |       uri_domain = domain;
103 |       uri_tld = tld;
104 | 
105 |       return found_authority + 1;
106 |     }
107 | 
108 |     int parse_uri::parse_path(int start, const std::string &uri) {
109 |       if (start >= static_cast<int>(uri.length()))
110 |         return uri.length();
111 | 
112 |       // parse path
113 |       int found_path = uri.find("?", start);
114 |       if (found_path == static_cast<int>(std::string::npos))
115 |         found_path = uri.length();
116 |       uri_path = uri.substr(start, found_path - start);
117 | 
118 |       return found_path;
119 |     }
120 | 
121 |     const std::string &parse_uri::get_scheme() const {
122 |       return uri_scheme;
123 |     }
124 | 
125 |     const std::string &parse_uri::get_domain() const {
126 |       return uri_domain;
127 |     }
128 | 
129 |     const std::string &parse_uri::get_tld() const {
130 |       return uri_tld;
131 |     }
132 | 
133 |     const std::string &parse_uri::get_path() const {
134 |       return uri_path;
135 |     }
136 | 
137 | }


--------------------------------------------------------------------------------
/src/utils/common.h:
--------------------------------------------------------------------------------
  1 | 
  2 | #ifndef EXTRACTOR_UTILS_COMMON_H
  3 | #define EXTRACTOR_UTILS_COMMON_H
  4 | 
  5 | #include "compression.h"
  6 | #include <boost/thread.hpp>
  7 | #include <string>
  8 | #include <vector>
  9 | 
 10 | 
 11 | namespace utils {
 12 | 
 13 |     template<class T>
 14 |     class shared_vector {
 15 |     public:
 16 | 
 17 |         T pop() {
 18 |           boost::lock_guard<boost::mutex> lock(mutex);
 19 |           if (storage.empty()) {
 20 |             return T();
 21 |           }
 22 | 
 23 |           std::string val = storage.back();
 24 |           storage.pop_back();
 25 |           return val;
 26 |         }
 27 | 
 28 |         void push(T val) {
 29 |           boost::lock_guard<boost::mutex> lock(mutex);
 30 |           storage.push_back(val);
 31 |         }
 32 | 
 33 |         void reverse() {
 34 |           boost::lock_guard<boost::mutex> lock(mutex);
 35 |           std::reverse(storage.begin(), storage.end());
 36 |         }
 37 | 
 38 |         int size() {
 39 |           boost::lock_guard<boost::mutex> lock(mutex);
 40 |           return storage.size();
 41 |         }
 42 | 
 43 |     private:
 44 |         boost::mutex mutex;
 45 |         std::vector<T> storage;
 46 |     };
 47 | 
 48 | 
 49 |     class progress {
 50 |     public:
 51 | 
 52 |         int current_progress;
 53 |         int total;
 54 | 
 55 |         progress(int total_);
 56 | 
 57 |     public:
 58 | 
 59 |         void increment();
 60 | 
 61 |         void finish();
 62 | 
 63 |     private:
 64 | 
 65 |         void show_bar();
 66 | 
 67 |         boost::mutex mutex;
 68 | 
 69 |     };
 70 | 
 71 |     class parse_uri {
 72 |     public:
 73 | 
 74 |         std::string uri_scheme;
 75 |         std::string uri_domain;
 76 |         std::string uri_tld;
 77 |         std::string uri_path;
 78 | 
 79 |         const std::string &get_scheme() const;
 80 | 
 81 |         const std::string &get_domain() const;
 82 | 
 83 |         const std::string &get_path() const;
 84 | 
 85 |         const std::string &get_tld() const;
 86 | 
 87 |         parse_uri(const std::string &uri);
 88 | 
 89 | 
 90 |     private:
 91 | 
 92 |         int parse_scheme(int start, const std::string &uri);
 93 | 
 94 |         int parse_domains(int start, const std::string &uri);
 95 | 
 96 |         int parse_path(int start, const std::string &uri);
 97 | 
 98 |     };
 99 | 
100 |     void fix_utf8_string(std::string &str);
101 | 
102 | }
103 | 
104 | #endif //EXTRACTOR_UTILS_COMMON_H
105 | 


--------------------------------------------------------------------------------
/src/utils/compression.cpp:
--------------------------------------------------------------------------------
 1 | 
 2 | #include "compression.h"
 3 | 
 4 | #ifdef WITH_LZMA
 5 | #include <boost/iostreams/filter/lzma.hpp>
 6 | #endif
 7 | 
 8 | #include <boost/iostreams/filter/gzip.hpp>
 9 | #include <boost/iostreams/filter/bzip2.hpp>
10 | #include <boost/iostreams/filter/zlib.hpp>
11 | #include <boost/iostreams/filtering_stream.hpp>
12 | 
13 | 
14 | namespace utils {
15 | 
16 |     compression_option string_to_compression_option(std::string str) {
17 |       if (str == "gzip") return gzip;
18 |       if (str == "bzip2") return bzip2;
19 |       if (str == "zlib") return zlib;
20 |       if (str == "lzma") return lzma;
21 |       if (str == "none") return none;
22 | 
23 |       return null;
24 |     }
25 | 
26 |     std::string compression_option_to_string(compression_option compr) {
27 |       if (compr == gzip) return "gzip";
28 |       if (compr == bzip2) return "bzip2";
29 |       if (compr == zlib) return "zlib";
30 |       if (compr == lzma) return "lzma";
31 |       if (compr == none) return "none";
32 | 
33 |       return "";
34 |     }
35 | 
36 |     std::string get_compression_extension(compression_option compr) {
37 |       if (compr == gzip) return "gz";
38 |       if (compr == bzip2) return "bz2";
39 |       if (compr == zlib) return "zz";
40 |       if (compr == lzma) return "xz";
41 |       if (compr == none) return "out";
42 | 
43 |       return "";
44 |     }
45 | 
46 |     void add_compression(std::shared_ptr<boost::iostreams::filtering_streambuf<boost::iostreams::output>> osb,
47 |                          utils::compression_option compr) {
48 | 
49 |       if (compr == utils::gzip) {
50 |         osb->push(boost::iostreams::gzip_compressor());
51 |       }
52 | 
53 |       if (compr == utils::bzip2) {
54 |         osb->push(boost::iostreams::bzip2_compressor());
55 |       }
56 | 
57 |       if (compr == utils::zlib) {
58 |         osb->push(boost::iostreams::zlib_compressor());
59 |       }
60 | 
61 |       #ifdef WITH_LZMA
62 |       if (compr == utils::lzma) {
63 |         osb->push(boost::iostreams::lzma_compressor());
64 |       }
65 |       #endif
66 |     }
67 | 
68 |     void add_decompression(boost::iostreams::filtering_streambuf<boost::iostreams::output> *osb,
69 |                            utils::compression_option compr) {
70 | 
71 |       if (compr == utils::gzip) {
72 |         osb->push(boost::iostreams::gzip_decompressor());
73 |       }
74 | 
75 |       if (compr == utils::bzip2) {
76 |         osb->push(boost::iostreams::bzip2_decompressor());
77 |       }
78 | 
79 |       if (compr == utils::zlib) {
80 |         osb->push(boost::iostreams::zlib_decompressor());
81 |       }
82 | 
83 |       #ifdef WITH_LZMA
84 |       if (compr == utils::lzma) {
85 |         osb->push(boost::iostreams::lzma_decompressor());
86 |       }
87 |       #endif
88 |     }
89 | 
90 | 
91 | }


--------------------------------------------------------------------------------
/src/utils/compression.h:
--------------------------------------------------------------------------------
 1 | 
 2 | #ifndef EXTRACTOR_UTILS_COMPRESSION_H
 3 | #define EXTRACTOR_UTILS_COMPRESSION_H
 4 | 
 5 | #include <boost/iostreams/filtering_stream.hpp>
 6 | #include <string>
 7 | 
 8 | 
namespace utils {

    // Supported compression formats. `none` means uncompressed data; `null`
    // is the "not recognised" value returned by string_to_compression_option().
    enum compression_option {
        none, gzip, bzip2, zlib, lzma, null
    };

    // Parses "gzip", "bzip2", "zlib", "lzma" or "none"; anything else gives `null`.
    compression_option string_to_compression_option(std::string str);

    // Name of the option as accepted by string_to_compression_option(); empty for `null`.
    std::string compression_option_to_string(compression_option compr);

    // File extension (without dot) for the option: "gz", "bz2", "zz", "xz", "out"; empty for `null`.
    std::string get_compression_extension(compression_option compr);

    // Pushes the matching compressor onto `osb` (nothing for `none`/`null`).
    void add_compression(std::shared_ptr<boost::iostreams::filtering_streambuf<boost::iostreams::output>> osb,
                         utils::compression_option compr);

    // Pushes the matching decompressor onto `osb` (nothing for `none`/`null`).
    void add_decompression(boost::iostreams::filtering_streambuf<boost::iostreams::output> *osb,
                           utils::compression_option compr);

}
28 | 
29 | #endif //EXTRACTOR_UTILS_COMPRESSION_H
30 | 


--------------------------------------------------------------------------------
/src/utils/curldownloader.cpp:
--------------------------------------------------------------------------------
 1 | 
 2 | #include "curldownloader.h"
 3 | #include "logging.h"
 4 | 
 5 | #include <curl/curl.h>
 6 | 
 7 | #include <iostream>
 8 | #include <sstream>
 9 | #include <string>
10 | 
11 | 
// libcurl write callback (CURLOPT_WRITEFUNCTION): forwards the received chunk
// to the std::ostream passed through CURLOPT_WRITEDATA.
//
// The chunk is `size * nmemb` bytes long. Returning anything other than that
// byte count makes libcurl abort the transfer, which is how a failed stream
// is reported back.
size_t write_data(void *ptr, size_t size, size_t nmemb, void *poqout) {
  std::ostream *out = static_cast<std::ostream *>(poqout);
  const size_t byte_count = size * nmemb;

  out->write(static_cast<const char *>(ptr), static_cast<std::streamsize>(byte_count));

  return out->good() ? byte_count : 0;
}
17 | 
18 | HTTPDownloader::HTTPDownloader() {
19 |   curl = curl_easy_init();
20 | }
21 | 
22 | HTTPDownloader::~HTTPDownloader() {
23 |   curl_easy_cleanup(curl);
24 | }
25 | 
26 | CURLcode HTTPDownloader::download(const std::string &url, std::ostream *poqout) {
27 |   curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
28 |   curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_data);
29 |   curl_easy_setopt(curl, CURLOPT_WRITEDATA, poqout);
30 |   curl_easy_setopt(curl, CURLOPT_VERBOSE, 0L);
31 |   curl_easy_setopt(curl, CURLOPT_FAILONERROR, true);
32 | 
33 |   CURLcode res = curl_easy_perform(curl);
34 |   return res;
35 | }
36 | 


--------------------------------------------------------------------------------
/src/utils/curldownloader.h:
--------------------------------------------------------------------------------
 1 | 
 2 | #ifndef EXTRACTOR_UTILS_CURLDOWNLOADER_H
 3 | #define EXTRACTOR_UTILS_CURLDOWNLOADER_H
 4 | 
 5 | #include <iostream>
 6 | #include <string>
 7 | #include <curl/curl.h>
 8 | 
 9 | 
10 | class HTTPDownloader {
11 | 
12 | public:
13 | 
14 |     HTTPDownloader();
15 | 
16 |     ~HTTPDownloader();
17 | 
18 |     CURLcode download(const std::string &url, std::ostream *poqout);
19 | 
20 | 
21 | private:
22 | 
23 |     void *curl;
24 | 
25 | };
26 | 
27 | 
28 | #endif //EXTRACTOR_UTILS_CURLDOWNLOADER_H
29 | 


--------------------------------------------------------------------------------
/src/utils/logging.cpp:
--------------------------------------------------------------------------------
 1 | 
 2 | #include "logging.h"
 3 | #include <boost/log/trivial.hpp>
 4 | #include <boost/log/expressions.hpp>
 5 | #include <boost/log/sources/severity_logger.hpp>
 6 | #include <boost/log/sources/record_ostream.hpp>
 7 | #include <boost/log/utility/setup/console.hpp>
 8 | #include <boost/log/utility/setup/common_attributes.hpp>
 9 | #include <boost/log/support/date_time.hpp>
10 | #include <boost/format.hpp>
11 | 
12 | #include <fstream>
13 | 
14 | 
15 | namespace logging {
16 | 
17 |     void init() {
18 |       boost::log::add_common_attributes();
19 |       boost::log::add_console_log(std::cout, boost::log::keywords::format =
20 |               (
21 |                       boost::log::expressions::stream
22 |                               << "(" << boost::log::trivial::severity << ") "
23 |                               << "[" << boost::log::expressions::format_date_time<boost::posix_time::ptime>("TimeStamp",
24 |                                                                                                             "%Y-%m-%d %H:%M:%S")
25 |                               << "]: "
26 |                               << boost::log::expressions::smessage
27 |               ));
28 | 
29 |     }
30 | 
31 |     void log_reliable(std::string output_folder, int num_reliable, int num_unreliable) {
32 |       std::ofstream logfile;
33 |       logfile.open(output_folder + "/" + "langsplit.log", std::ios::out | std::ios::app);
34 |       boost::format text = boost::format("reliable:%d unreliable:%d\n") % num_reliable % num_unreliable;
35 |       logfile << text.str();
36 |       logfile.close();
37 |     }
38 | 
39 |     void log_done(std::string output_folder, std::string processed) {
40 |       std::ofstream logfile;
41 |       logfile.open(output_folder + "/" + "done.log", std::ios::out | std::ios::app);
42 |       logfile << std::string(processed + "\n");
43 |       logfile.close();
44 |     }
45 | 
46 |     void log_error(std::string output_folder, std::string text) {
47 |       std::ofstream logfile;
48 |       logfile.open(output_folder + "/" + "error.log", std::ios::out | std::ios::app);
49 |       logfile << std::string(text + "\n");
50 |       logfile.close();
51 |     }
52 | 
53 | }


--------------------------------------------------------------------------------
/src/utils/logging.h:
--------------------------------------------------------------------------------
 1 | 
 2 | #ifndef EXTRACTOR_UTILS_LOGGING_H
 3 | #define EXTRACTOR_UTILS_LOGGING_H
 4 | 
 5 | #define BOOST_LOG_DYN_LINK 1
 6 | #define LOG_INFO BOOST_LOG_TRIVIAL(info)
 7 | #define LOG_ERROR BOOST_LOG_TRIVIAL(error)
 8 | 
 9 | #include <boost/log/trivial.hpp>
10 | 
11 | #include <string>
12 | 
13 | 
namespace logging {

    // Configures Boost.Log console output; call once before using LOG_INFO / LOG_ERROR.
    void init();

    // Appends a "reliable:<n> unreliable:<m>" line to <output_folder>/langsplit.log.
    void log_reliable(std::string output_folder, int num_reliable, int num_unreliable);

    // Appends `processed` as one line to <output_folder>/done.log.
    void log_done(std::string output_folder, std::string processed);

    // Appends `text` as one line to <output_folder>/error.log.
    void log_error(std::string output_folder, std::string text);

}
25 | 
26 | #endif //EXTRACTOR_UTILS_LOGGING_H
27 | 


--------------------------------------------------------------------------------
/tests/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | 
 2 | # gtest
 3 | enable_testing()
 4 | find_package(GTest REQUIRED)
 5 | if (GTest_FOUND)
 6 |     include_directories(${GTEST_INCLUDE_DIRS})
 7 | endif ()
 8 | 
 9 | # test_readerwarc
10 | add_executable(test_readerwarc test_readerwarc.cpp ../src/mono/filters/warcfilter.cpp ../src/utils/common.cpp)
11 | target_link_libraries(test_readerwarc ${GTEST_BOTH_LIBRARIES} ${Boost_LIBRARIES})
12 | install(TARGETS test_readerwarc DESTINATION tests)
13 | 
14 | # test_langsplit
15 | add_executable(test_langsplit test_langsplit.cpp ../src/mono/filters/langsplitfilter.cpp ../src/mono/buffered_map.cpp ../src/utils/common.cpp ../src/utils/logging.cpp)
16 | target_link_libraries(test_langsplit ${GTEST_BOTH_LIBRARIES} ${Boost_LIBRARIES} cld2_lib)
17 | install(TARGETS test_langsplit DESTINATION tests)
18 | 
19 | # test_integration
20 | add_executable(test_integration test_integration.cpp ../src/mono/filters/langcollectorfilter.cpp ../src/mono/filters/langsplitfilter.cpp ../src/mono/filters/warcfilter.cpp ../src/mono/language_sink.cpp ../src/mono/buffered_map.cpp ../src/mono/worker.cpp ../src/utils/common.cpp ../src/utils/compression.cpp ../src/utils/curldownloader.cpp ../src/utils/logging.cpp)
21 | target_link_libraries(test_integration ${GTEST_BOTH_LIBRARIES} ${Boost_LIBRARIES} ${CURL_LIBRARIES} cld2_lib)
22 | install(TARGETS test_integration DESTINATION tests)
23 | 
24 | # test utils
25 | add_executable(test_utils test_utils.cpp ../src/utils/common.cpp)
26 | target_link_libraries(test_utils ${GTEST_BOTH_LIBRARIES} ${Boost_LIBRARIES} ${CURL_LIBRARIES})
27 | install(TARGETS test_utils DESTINATION tests)
28 | 


--------------------------------------------------------------------------------
/tests/data_integration/test1.in.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/paracrawl/extractor/14d66691b0da4b41ad1e7fe2f27cc9e1ab9cbd58/tests/data_integration/test1.in.gz


--------------------------------------------------------------------------------
/tests/data_integration/test1_en.out:
--------------------------------------------------------------------------------
  1 | df6fa1abb58549287111ba8d776733e9 uri:http://673091.av999-tw.net/?PUT=a_show&AID=77418&FID=673091&R2=&CHANNEL= language:en bytes:64
  2 | 記錄COPYRIGHT(C)2006 673091.av999-tw.net ALL RIGHTS RESERVED.
  3 | df6fa1abb58549287111ba8d776733e9 uri:http://673091.av999-tw.net/?PUT=a_show&AID=77418&FID=673091&R2=&CHANNEL= language:en bytes:25
  4 | iPhone, iPad, iPod touch
  5 | df6fa1abb58549287111ba8d776733e9 uri:http://6dollarshirts.com/tv-and-movie-tees/violent-delights language:en bytes:4130
  6 | Violent Delights T-Shirt | 6 Dollar Shirts
  7 | HUNDREDS OF TEES JUST $6 EACH • Get 10 FOR ONLY $50!
  8 | HUNDREDS OF TEES JUST $6 EACH • Get 10 FOR ONLY $50!
  9 | Account •
 10 | Ordering •
 11 | About •
 12 | Contact
 13 | SHOP BY STYLE
 14 | Guys Tees
 15 | Girls Tees
 16 | Kids Tees
 17 | Guys Tanks
 18 | Girls Tanks
 19 | Sweatshirts
 20 | Hoodies
 21 | Prints
 22 | SHOP BY CATEGORY
 23 | New
 24 | Funny
 25 | Music
 26 | Partying
 27 | Pop Culture
 28 | Politics
 29 | Shop All Tees	SHOP BY PRICE Tees for $6
 30 | Tees for $9
 31 | Tees for $12
 32 | Clearance Items
 33 | SHOP BY STYLE
 34 | Guys Tees
 35 | Girls Tees
 36 | Kids Tees
 37 | Guys Tanks
 38 | Girls Tanks
 39 | Sweatshirts
 40 | Hoodies
 41 | Prints
 42 | SHOP BY CATEGORY
 43 | New
 44 | Funny
 45 | Music
 46 | Partying
 47 | Pop Culture
 48 | Politics
 49 | Shop All Tees	SHOP BY PRICE Tees for $6
 50 | Tees for $9
 51 | Tees for $12
 52 | Clearance Items
 53 | NEW Hell Yeah
 54 | New Who
 55 | Squid Goals
 56 | I Choose Violence
 57 | Crystal Ball Buffering
 58 | Weed
 59 | Lava
 60 | Hand Of The Queen
 61 | SHOP Shop By Style
 62 | Guys Tees
 63 | Girls Tees
 64 | Kids Tees
 65 | Guys Tanks
 66 | Girls Tanks
 67 | Sweatshirts
 68 | Hoodies
 69 | Prints
 70 | Shop By Category
 71 | New Designs
 72 | TV & Movies	Pets & Animals
 73 | Science & Math
 74 | Geek & Gaming
 75 | Graphic & Vintage
 76 | Funny
 77 | Food & Coffee
 78 | Music
 79 | Partying
 80 | Pop Culture
 81 | Politics
 82 | Sports & Wellness
 83 | Holidays & Costumes
 84 | Shop All Tees	Shop By Price
 85 | Tees for $6
 86 | Tees for $9
 87 | Tees for $12
 88 | 10 for $50 Tees
 89 | Gift Certificates
 90 | Clearance
 91 | COLLECTIONS Cats
 92 | Horror
 93 | Gaming Drinking Science Dinosaurs
 94 | Superheroes Math
 95 | Zombies
 96 | Dogs Sharks
 97 | Skulls
 98 | Literary Historical Music Account
 99 | Ordering Info
100 | About Us
101 | Contact Us
102 | TV & Movie Tees
103 | Violent Delights #teeviolentdelightstee
104 | Violent Delights Looking for a fun vacation spot? Be careful what you wish for.
105 | Professionally printed silkscreen
106 | Ships within 2 business days
107 | Designed and printed in the USA	See more in: Violent Delights Looking for a fun vacation spot? Be careful what you wish for.
108 | • Professionally printed silkscreen
109 | • Ships within 2 business days
110 | • Designed and printed in the USA
111 | Choose Style: Select State
112 | Guys Tee	Girls Tee	(+$0.50)
113 | Guys Tank	(+$5.95)
114 | Girls Tank	(+$5.95)
115 | Kids Tee	(+$3.50)
116 | Hoodie	(+$12.95)
117 | Choose Color: Select State
118 | Black	Charcoal	Black	Black	Midnight	Lucky Green	Gray Heather	Navy Heather	Deep Ash	Brown Heather	Chocolate	Midnight	Royal Heather	Black	Navy Blue	Kelly Green Heather	True Navy	Red Heather	Deep Red	Red	Sky Blue	Kelly Green	Choose Size: Size Chart	Select State
119 | Small	Medium	Large	X-Large	2X-Large	(+$1.00)
120 | 2X-Large	(+$2.00)
121 | 2X-Large	(+$3.00)
122 | 3X-Large	(+$3.00)
123 | 3X-Large	(+$2.00)
124 | 2	3	4	5/6	$6,00
125 | Qty
126 | Add to Cart
127 | Violent Delights Looking for a fun vacation spot? Be careful what you wish for.	• Professionally printed silkscreen
128 | • Ships within 2 business days
129 | • Designed and printed in the USA
130 | #teeviolentdelightstee
131 | Violent Delights Looking for a fun vacation spot? Be careful what you wish for.
132 | Professionally printed silkscreen
133 | Ships within 2 business days
134 | Designed and printed in the USA	$6. Dystopia
135 | $9. Shakespeare On Love
136 | $6. Evolution To Termination
137 | $6. Red Pill, Blue Pill Customer Reviews (0)
138 | DESIGN DETAILS FAN PHOTOS MORE INFO
139 | Product Specs
140 | Size & Fit
141 | Shirt width on our size chart measures from armpit to armpit, across the front only. Shirt length measures from the neck seam to the bottom hem.
142 | While our 100% cotton shirts are pre-shrunk, slight shrinkage may occur, particularly when using heat in washing and drying. Heather colors are a poly-cotton blend that may shrink slightly or not at all, depending on care.
143 | Guys shirts are a loose fit style. Girls shirts are fitted with a tapered waist and cap sleeves.	More Questions?
144 | See our Ordering page, or contact us here.
145 | Add A Review
146 | Submit
147 | Size Chart
148 | GUYS TEE GIRLS TEE GUYS TANK GIRLS TANK KIDS TEE HOODIE SizeWidth Length SMALL 18 28 MEDIUM 20 29 LARGE 22 30 X-LARGE 24 31 2X-LARGE 26 32 3X-LARGE 28 33 Pre-shrunk fabric. • Short set-in sleeves, for relaxed fit. • Solid colors are 100% cotton; Heather colors are 50/50 poly-cotton blend (Grey Heather is a 70% cotton, 30% polyester blend). Size Width Length SMALL 16.5 25 MEDIUM 17.5 25.5 LARGE 18.5 27 X-LARGE 20 28 100% ring-spun cotton.Pre-shrunk fabric.Tapered silhouette.Gray Granite is 90% cotton 10% poly-blend. Size Width Length SMALL 18 27 MEDIUM 20 28.5 LARGE 21.5 28.75 X-LARGE 23 29.5 2X-LARGE 25.5 30.5 100% combed ring-spun cotton.Pre-shrunk fabric.
149 | df6fa1abb58549287111ba8d776733e9 uri:http://6dollarshirts.com/tv-and-movie-tees/violent-delights language:en bytes:1015
150 | Length SMALL 16.25 24.75 MEDIUM 18.5 24.5 LARGE 21.5 25 X-LARGE 22 27 100% combed ring-spun cotton.Pre-shrunk fabric. Size Width Length 2 11 12 3 12 13 4 13 14 5/6 14.5 16 SMALL 17 19 MEDIUM 18 20.5 LARGE 19 21 100% jersey knit cotton.Pre-shrunk fabric.​ Size Width Length SMALL 20 23 MEDIUM 21 25.5 LARGE 23 26 X-LARGE 25 27 2X-LARGE 28.5 27 3X-LARGE 31 27.5 80% ring-spun cotton/ 20% polyester.Pre-shrunk fabric.Jersey-lined hood. ×Close
151 | - Categories -
152 | New Designs
153 | TV & Movies	Pets & Animals
154 | Science & Math
155 | Geek & Gaming
156 | Funny
157 | Graphic & Vintage
158 | Food & Coffee
159 | Music
160 | Partying
161 | Pop Culture
162 | Politics
163 | Sports & Wellness
164 | Holidays & Costumes
165 | Shop All Tees	- Styles -
166 | Guys Tees
167 | Girls Tees
168 | Kids Tees
169 | Guys Tanks
170 | Girls Tanks
171 | Sweatshirts
172 | Hoodies
173 | Prints
174 | - Price -
175 | Tees For $6
176 | Tees For $9 Tees For $12
177 | 10 for $50 Tees
178 | My Account
179 | Ordering Info
180 | About Us
181 | Contact Us
182 | Subscribe To Our Newsletter
183 | ©2016 6DOLLARSHIRTS by Thread Pit•Terms•Privacy
184 | /g,'>');l[i].href='mailto:'+t.value}}catch(e){}}}catch(e){}})(document);/* ]]> */
185 | 


--------------------------------------------------------------------------------
/tests/data_integration/test1_ja.out:
--------------------------------------------------------------------------------
1 | df6fa1abb58549287111ba8d776733e9 uri:http://673091.av999-tw.net/?PUT=a_show&AID=77418&FID=673091&R2=&CHANNEL= language:ja bytes:29
2 | smg下載-向前走向愛走-
3 | df6fa1abb58549287111ba8d776733e9 uri:http://673091.av999-tw.net/?PUT=a_show&AID=77418&FID=673091&R2=&CHANNEL= language:ja bytes:149
4 | 守護甜心遊戲-冒險王之神兵傳奇無敵版-真愛找麻煩第63集-言葉より大切なもの-切水果手機遊戲-說一句我不走了-
5 | df6fa1abb58549287111ba8d776733e9 uri:http://673091.av999-tw.net/?PUT=a_show&AID=77418&FID=673091&R2=&CHANNEL= language:ja bytes:118
6 | 天空影音分享-嵐きっと大丈夫-羅百吉-雨果的冒險-情歌-四川麻將連連看-驢子下載中文版-
7 | 


--------------------------------------------------------------------------------
/tests/data_integration/test2.in.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/paracrawl/extractor/14d66691b0da4b41ad1e7fe2f27cc9e1ab9cbd58/tests/data_integration/test2.in.gz


--------------------------------------------------------------------------------
/tests/data_integration/test2_de.out:
--------------------------------------------------------------------------------
  1 | df6fa1abb58549287111ba8d776733e9 uri:http://1039500.webexpress.point-s.de/autoservice/wartung_und_service language:de bytes:205
  2 | Reifen Meißner GmbH
  3 | HomeÜber unsReifenFelgenAutoserviceGeschäftskundenAktuellesTuningFilialenKontaktpoint S CardStellenangeboteKundenzufriedenheitPKWMotorradNutzfahrzeugeUnsere starken MarkenMarkenReife
  4 | df6fa1abb58549287111ba8d776733e9 uri:http://1039500.webexpress.point-s.de/autoservice/wartung_und_service language:de bytes:114
  5 | AngeboteWartung und ServiceZubehörMobilitiätsgarantieFlottenFlottenbroschürePKW-ServiceNutzfahrzeug-Service24h
  6 | df6fa1abb58549287111ba8d776733e9 uri:http://1039500.webexpress.point-s.de/autoservice/wartung_und_service language:de bytes:162
  7 | nsporterreifenGrüne ReifenReifen-EinlagerungAuswuchtenWinterreifenLKW & ReisebusseLand- & ForstwirtschaftErdbewegungsmaschinenIndustriereifenLKW-Reifenversicheru
  8 | df6fa1abb58549287111ba8d776733e9 uri:http://1039500.webexpress.point-s.de/autoservice/wartung_und_service language:de bytes:297
  9 | laketteSchneeketteNutzfahrzeug und Reisebus ServiceErdbewegungsmaschinen ServiceLand- und Forstwirtschaft ServiceIndustriereifen Service	Immer für Sie da:
 10 | Geschäftsführer	Frau Irmtraud Meißner / Herr Lutz Meißner	Sie befinden sich hier: Autoservice » Wartung und Service
 11 | Wartung und Service
 12 | df6fa1abb58549287111ba8d776733e9 uri:http://1039500.webexpress.point-s.de/autoservice/wartung_und_service language:de bytes:460
 13 | Hauptuntersuchung*: Die Ausführung sämtlicher Arbeiten am Fahrzeug erfolgt durch permanent geschulte Mitarbeiter und Kfz-Meister. Denn nur sie verfügen über die nötige Erfahrung und Qualifikation.*Die Prüfung erfolgt gem. §29 StVZO, in Ihrem point S Servicecenter zusammen mit staatlich anerkannten Prüforganisationen (welche Prüforganisation in Ihrem Servicecenter tätig ist, erfahren Sie vor Ort).
 14 | NutzungsbedingungenDatenschutzerklärungImpressum
 15 | df6fa1abb58549287111ba8d776733e9 uri:http://2016.palaissommer.de/programm/tango-im-park-8/ language:de bytes:135
 16 | Mitteilungen
 17 | Anfragen
 18 | Newsletter abonnieren
 19 | Programm
 20 | Alle Veranstaltungen
 21 | Filmnacht
 22 | Hörspielnacht
 23 | Klaviernacht
 24 | Palais.Poesie
 25 | Pleinair
 26 | df6fa1abb58549287111ba8d776733e9 uri:http://2016.palaissommer.de/programm/tango-im-park-8/ language:de bytes:1253
 27 | Sommerwirtschaft
 28 | Schirmherrschaft
 29 | Team
 30 | News
 31 | Partner
 32 | Pleinair & Sammlung
 33 | Galerie
 34 | Canalettopreis
 35 | Presse
 36 | Aktuelle Mitteilungen
 37 | Anfragen
 38 | Newsletter abonnieren
 39 | Tango im Park
 40 | Sonntag, 14.08.2016 | 16:00
 41 | Rahmenprogramm
 42 | Tango im Park
 43 | Der Argentinische Tango ist die ursprünglichere und weniger reglementierte Form des Tanzes der sich vor über hundert Jahren am Rio de la Plata entwickelt hat. Da der Tango seit 2009 zum Weltkulturerbe gehört, bringen wir dieses an die Elbwiesen zurück.
 44 | Es ist eine offene Tanzveranstaltung zu traditionellen und auch modernen Tangos. Wir laden jeden zum schwofen, gaffen, genießen, ausprobieren und mitmachen ein.
 45 | Danke an Norbert Gust
 46 | Das eintrittsfreie Festival Palais Sommer wird neben Sponsoring vor allem auch durch die Spenden der Besucher möglich. Empfehlung: 5-10 € / Spendenbox am Kulturcounter Eingang Elbradweg.
 47 | Vielen Dank für eure Unterstützung!
 48 | Jetzt spenden!
 49 | Newsletter	Fördern Sie diese Veranstaltung!	Mit Ihrer Hilfe kann der Palais Sommer weiterhin eintrittsfrei und ohne staatliche Zuschüsse stattfinden Mehr Informationen Kalender
 50 | <<
 51 | Jul 2017
 52 | >>
 53 | MDMDFSS
 54 | 26	27	28	29	30	1	2	3	4	5	6	7	8	9	10	11	12	13	14	15	16	17	18	19	20	21	22	23	24	25	26	27	28	29	30	31	1	2	3	4	5	6	Nächste VeranstaltungenKeine
 55 | df6fa1abb58549287111ba8d776733e9 uri:http://24check-versicherung.com/news-archiv-002/fleckig-verfarbt-verfilzt.html language:de bytes:3169
 56 | Fleckig, verfärbt, verfilzt: Textilreiniger haften nur bei eigenen Fehlern | 24CHECK-VERSICHERUNG | HOLZ
 57 | 24CHECK-VERSICHERUNG
 58 | Site Navigation[Skip]
 59 | 24CHECK
 60 | NEWS
 61 | HANDY
 62 | VORSORGE
 63 | ENGLISCHE LEBENSVERSICHERUNG
 64 | BERUFSUNFÄHIGKEIT
 65 | UNFALLVERSICHERUNG
 66 | LEBENSVERSICHERUNG
 67 | RENTENVERSICHERUNG
 68 | RIESTER-RENTE
 69 | RÜRUP-RENTE
 70 | RISIKO-LEBENSVERSICHERUNG
 71 | PKV
 72 | PRIVATE KRANKEN-VERSICHERUNG
 73 | PKV BEAMTE
 74 | PKV STUDENTEN
 75 | PKV ÜBER 55
 76 | PRIVATE KRANKENZUSATZ-VERSICHERUNG
 77 | SACHVERSICHERUNG
 78 | WOHNGEBÄUDE
 79 | FIRMENVERSICHERUNG
 80 | KFZ VERSICHERUNG VERGLEICH
 81 | HAUS- UND GRUNDBESITZ
 82 | HAUSRAT
 83 | PRIVAT-HAFTPFLICHT
 84 | RECHTSSCHUTZ-VERSICHERUNG
 85 | TIERHALTER
 86 | MOTORRAD-VERSICHERUNG
 87 | FINANZEN
 88 | BAUFINANZIERUNG
 89 | AUTOFINANZIERUNG
 90 | KREDIT VERGLEICH | RATENKREDIT | KREDITFINANZIERUNG
 91 | Allgemeine 24CHECK - Informationen zu Krediten und Vergleichen
 92 | FONDS
 93 | KREDITKARTEN VERGLEICH
 94 | SUCHEN
 95 | TIPPS
 96 | SITEMAP
 97 | Fleckig, verfärbt, verfilzt: Textilreiniger haften nur bei eigenen Fehlern
 98 | R+V-Infocenter: Verbraucher haben Beweispflicht
 99 | Wiesbaden (ots) - Waschmaschine oder Reinigung? Bei teuren
100 | und empfindlichen Kleidungsstücken fällt die Wahl meist auf eine
101 | Textilreinigung. Doch was tun, wenn der Anzug danach noch mehr Flecken hat oder
102 | das Lieblingskleid verfilzt ist? "Der Kunde muss beweisen, dass die
103 | Reinigung den Schaden verursacht und auch verschuldet hat", sagt Sonja
104 | Biorac, Haftpflichtexpertin beim Infocenter der R+V Versicherung. Doch nur in
105 | jedem dritten Fall haben tatsächlich die Textilreiniger den Schaden zu
106 | vertreten. Fast ebenso häufig geht er auf Fehler der Hersteller zurück, etwa
107 | wenn das Kleidungsstück falsch etikettiert ist.
108 | Gut für die Kunden ist, wenn die Reinigung bereit ist, den
109 | Schaden auf Kulanzbasis zu regeln. Ist dies nicht der Fall, muss der
110 | Geschädigte dem Betrieb nachweisen, dass dieser einen Fehler gemacht hat.
111 | "Oft geht das nur mit einem Gutachten, zum Beispiel von einer Schiedstelle
112 | für Textilpflege", so R+V-Expertin Biorac. Zwischen 20 und 60 Euro kostet
113 | die Beurteilung durch die Experten. Stellt sich dabei heraus, dass die
114 | Pflegehinweise falsch sind, können die Kunden beim Verkäufer reklamieren.
115 | Dieser haftet zwei Jahre für die Kleidung. Voraussetzung ist aber, dass das
116 | Etikett nicht herausgeschnitten wurde.
117 | Bei rund jeder dritten Reklamation entscheiden die
118 | Schiedsstellen zu Ungunsten der Verbraucher: beispielsweise wenn die Kleidung
119 | mit Säure in Kontakt gekommen ist oder der Kunde an den Flecken gerieben und
120 | den Stoff beschädigt hat.
121 | Wenn die Reinigung für den Schaden verantwortlich ist,
122 | erstattet sie normalerweise nicht den kompletten Wert des Kleidungsstücks.
123 | "Meistens ist die Haftung auf den 15fachen Reinigungspreis begrenzt",
124 | so Sonja Biorac. Selbst für ein 800 Euro teures Markenkostüm steht dem Kunden
125 | also womöglich deutlich weniger als der halbe Kaufpreis zu. Viele Reinigungen
126 | bieten zusätzlich eine Versicherung an, die im Schadenfall den Zeitwert ersetzt
127 | - für neue wertvolle Kleidungsstücke kann sich das lohnen.
128 | http://www.infocenter.ruv.de
129 | Pressekontakt:
130 | R+V-Infocenter
131 | 06172/9022-131
132 | a.kassubek@arts-others.de Zurück zur Übersicht
133 | Dienstag, 20. März 2012 21:23
134 | © HOLZ MMC 2006 - 2014 - DEUTSCHLANDS GROSSES VERGLEICHSPORTAL - ALLE VERGLEICHE
135 | 


--------------------------------------------------------------------------------
/tests/data_integration/test3.in:
--------------------------------------------------------------------------------
 1 | WARC/1.0
 2 | WARC-Type: warcinfo
 3 | WARC-Date: 2017-10-13T10:20:04Z
 4 | WARC-Filename: text.warc.wet.gz
 5 | WARC-Record-ID: <urn:uuid:743872f4-ad60-4318-a132-d413a4d2ce64>
 6 | Content-Type: application/warc-fields
 7 | Content-Length: 110
 8 | 
 9 | Software-Info: ia-web-commons.1.1.9-SNAPSHOT-20170811025357
10 | Extracted-Date: Fri, 13 Oct 2017 10:20:04 GMT
11 | 
12 | 
13 | 
14 | WARC/1.0
15 | WARC-Type: conversion
16 | WARC-Target-URI: http://01publishing.com/tag/bondage/
17 | WARC-Date: 2017-07-28T18:53:51Z
18 | WARC-Record-ID: <urn:uuid:0c1b0316-29c6-4afa-aaa2-c1fa0dbb5954>
19 | WARC-Refers-To: <urn:uuid:c50842ba-fea2-460c-9731-458805117831>
20 | WARC-Block-Digest: sha1:MXOMDGYOE3HNU2AFOXOLP7WJZDWYRI53
21 | Content-Type: text/plain
22 | Content-Length: 26
23 | 
24 | Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book.
25 | 
26 | 


--------------------------------------------------------------------------------
/tests/data_integration/test3_en.out:
--------------------------------------------------------------------------------
1 | df6fa1abb58549287111ba8d776733e9 uri:http://01publishing.com/tag/bondage/ language:en bytes:246
2 | Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book.
3 | 


--------------------------------------------------------------------------------
/tests/data_langsplit/test1.in:
--------------------------------------------------------------------------------
 1 | df6fa1abb58549287111ba8d776733e9 uri:http://917ssmys.measychina.cn/za/11hbg/
 2 | 海报psd素材_四驱车 田宫 绝版_植物百科通
 3 | 新闻资讯
 4 | 植物科学
 5 | 家庭养花
 6 | 园林养护
 7 | 植物图片
 8 | df6fa1abb58549287111ba8d776733e9 uri:http://02repair.blog.fc2.com/blog-entry-192.html
 9 | コラージュ・イン・ア・ボトル スタカニ用ATC作成中☆
10 | コラージュ・イン・ア・ボトル
11 | collages in a bottle
12 | スポンサーサイト
13 | 上記の広告は１ヶ月以上更新のないブログに表示されています。新しい記事を書く事で広告が消せます。	-------- : スポンサー広告 : このページのトップへ
14 | スタカニ用ATC作成中☆


--------------------------------------------------------------------------------
/tests/data_langsplit/test1.out:
--------------------------------------------------------------------------------
 1 | df6fa1abb58549287111ba8d776733e9 uri:http://917ssmys.measychina.cn/za/11hbg/ language:zh bytes:112
 2 | 素材_四驱车 田宫 绝版_植物百科通
 3 | 新闻资讯
 4 | 植物科学
 5 | 家庭养花
 6 | 园林养护
 7 | 植物图片
 8 | 
 9 | df6fa1abb58549287111ba8d776733e9 uri:http://02repair.blog.fc2.com/blog-entry-192.html language:ja bytes:58
10 | コラージュ・イン・ア・ボトル スタカニ用
11 | df6fa1abb58549287111ba8d776733e9 uri:http://02repair.blog.fc2.com/blog-entry-192.html language:ja bytes:56
12 | 作成中☆
13 | コラージュ・イン・ア・ボトル
14 | 
15 | df6fa1abb58549287111ba8d776733e9 uri:http://02repair.blog.fc2.com/blog-entry-192.html language:en bytes:21
16 | collages in a bottle
17 | 
18 | df6fa1abb58549287111ba8d776733e9 uri:http://02repair.blog.fc2.com/blog-entry-192.html language:ja bytes:248
19 | スポンサーサイト
20 | 上記の広告は１ヶ月以上更新のないブログに表示されています。新しい記事を書く事で広告が消せます。	-------- : スポンサー広告 : このページのトップへ
21 | スタカニ用
22 | 


--------------------------------------------------------------------------------
/tests/data_langsplit/test1_langstats.out:
--------------------------------------------------------------------------------
1 | 02repair.blog.fc2.com	en	21
2 | 02repair.blog.fc2.com	ja	362
3 | 917ssmys.measychina.cn	zh	112
4 | 


--------------------------------------------------------------------------------
/tests/data_readerwarc/test1.in:
--------------------------------------------------------------------------------
 1 | WARC/1.0
 2 | WARC-Type: conversion
 3 | WARC-Target-URI: http://statmt.org/
 4 | WARC-Date: 2017-07-20T12:39:49Z
 5 | WARC-Record-ID: <urn:uuid:4e2c9e9d-16ad-487d-a7a3-a7844df236c2>
 6 | WARC-Refers-To: <urn:uuid:acf277dd-0b09-44bb-b7dd-1429e7b6cdca>
 7 | WARC-Block-Digest: sha1:UZJYKZZJH3QI3RMVNKBAHMU3VKEFZGS2
 8 | Content-Type: text/plain
 9 | Content-Length: 6598
10 | 
11 | This website is dedicated to research in statistical machine translation, i.e. the translation of text from one human language to another by a computer that learned how to translate from vast amounts of translated text.
12 | 


--------------------------------------------------------------------------------
/tests/data_readerwarc/test1.out:
--------------------------------------------------------------------------------
1 | df6fa1abb58549287111ba8d776733e9 uri:http://statmt.org/
2 | This website is dedicated to research in statistical machine translation, i.e. the translation of text from one human language to another by a computer that learned how to translate from vast amounts of translated text.
3 | 


--------------------------------------------------------------------------------
/tests/data_readerwarc/test2.in:
--------------------------------------------------------------------------------
 1 | WARC/1.0
 2 | WARC-Type: conversion
 3 | WARC-Target-URI: http://statmt.org/
 4 | WARC-Date: 2017-07-20T12:39:49Z
 5 | WARC-Record-ID: <urn:uuid:4e2c9e9d-16ad-487d-a7a3-a7844df236c2>
 6 | WARC-Refers-To: <urn:uuid:acf277dd-0b09-44bb-b7dd-1429e7b6cdca>
 7 | WARC-Block-Digest: sha1:UZJYKZZJH3QI3RMVNKBAHMU3VKEFZGS2
 8 | Content-Type: text/plain
 9 | Content-Length: 6598
10 | 
11 | This website is dedicated to research in statistical machine translation, i.e. the translation of text from one human language to another by a computer that learned how to translate from vast amounts of translated text.
12 | 
13 | WARC/1.0
14 | WARC-Type: conversion
15 | WARC-Date: 2017-07-20T12:39:49Z
16 | WARC-Record-ID: <urn:uuid:4e2c9e9d-16ad-487d-a7a3-a7844df236c2>
17 | WARC-Refers-To: <urn:uuid:acf277dd-0b09-44bb-b7dd-1429e7b6cdca>
18 | Content-Type: text/plain
19 | Content-Length: 6598
20 | 
21 | Moses is a statistical machine translation system that allows you to automatically train translation models for any language pair.
22 | All you need is a collection of translated texts (parallel corpus).
23 | Once you have a trained model, an efficient search algorithm quickly finds the highest probability translation among the exponential number of choices.
24 | 
25 | WARC/1.0
26 | WARC-Type: conversion
27 | WARC-Target-URI:http://www.fjoch.com/GIZA++.html
28 | WARC-Date: 2017-07-20T12:39:49Z
29 | WARC-Block-Digest: sha1:UZJYKZZJH3QI3RMVNKBAHMU3VKEFZGS2
30 | Content-Type: text/plain
31 | Content-Length: 6598
32 | 
33 | GIZA++ is an extension of the program GIZA (part of the SMT toolkit EGYPT) which was developed by the Statistical Machine Translation team during the summer workshop in 1999 at the Center for Language and Speech Processing at Johns-Hopkins University (CLSP/JHU).
34 | GIZA++ includes a lot of additional features.
35 | The extensions of GIZA++ were designed and written by Franz Josef Och.
36 | 


--------------------------------------------------------------------------------
/tests/data_readerwarc/test2.out:
--------------------------------------------------------------------------------
1 | df6fa1abb58549287111ba8d776733e9 uri:http://statmt.org/
2 | This website is dedicated to research in statistical machine translation, i.e. the translation of text from one human language to another by a computer that learned how to translate from vast amounts of translated text.
3 | df6fa1abb58549287111ba8d776733e9 uri:http://www.fjoch.com/GIZA++.html
4 | GIZA++ is an extension of the program GIZA (part of the SMT toolkit EGYPT) which was developed by the Statistical Machine Translation team during the summer workshop in 1999 at the Center for Language and Speech Processing at Johns-Hopkins University (CLSP/JHU).
5 | GIZA++ includes a lot of additional features.
6 | The extensions of GIZA++ were designed and written by Franz Josef Och.
7 | 


--------------------------------------------------------------------------------
/tests/data_readerwarc/test3.in:
--------------------------------------------------------------------------------
 1 | WARC/1.0
 2 | WARC-Type: conversion
 3 | WARC-Target-URI:
 4 | WARC-Date: 2017-07-20T12:39:49Z
 5 | WARC-Record-ID: <urn:uuid:4e2c9e9d-16ad-487d-a7a3-a7844df236c2>
 6 | WARC-Refers-To: <urn:uuid:acf277dd-0b09-44bb-b7dd-1429e7b6cdca>
 7 | WARC-Block-Digest: sha1:UZJYKZZJH3QI3RMVNKBAHMU3VKEFZGS2
 8 | Content-Type: text/plain
 9 | Content-Length: 6598
10 | 
11 | This website is dedicated to research in statistical machine translation, i.e. the translation of text from one human language to another by a computer that learned how to translate from vast amounts of translated text.
12 | 
13 | WARC/1.0
14 | WARC-Type: conversion
15 | WARC-Target-URI: uri uri
16 | WARC-Date: 2017-07-20T12:39:49Z
17 | WARC-Block-Digest: sha1:UZJYKZZJH3QI3RMVNKBAHMU3VKEFZGS2
18 | Content-Type: text/plain
19 | Content-Length: 6598
20 | 
21 | GIZA++ is an extension of the program GIZA (part of the SMT toolkit EGYPT) which was developed by the Statistical Machine Translation team during the summer workshop in 1999 at the Center for Language and Speech Processing at Johns-Hopkins University (CLSP/JHU).
22 | GIZA++ includes a lot of additional features.
23 | The extensions of GIZA++ were designed and written by Franz Josef Och.
24 | 
25 | WARC/1.0
26 | WARC-Type: conversion
27 | WARC-Date: 2017-07-20T12:39:49Z
28 | WARC-Block-Digest: sha1:UZJYKZZJH3QI3RMVNKBAHMU3VKEFZGS2
29 | Content-Type: text/plain
30 | Content-Length: 6598
31 | 
32 | The first shared task which will examine translation between the following language pairs:
33 | 
34 |     English-German and German-English
35 |     English-French and French-English
36 |     English-Hindi and Hindi-English NEW
37 |     English-Czech and Czech-English
38 |     English-Russian and Russian-English
39 | 
40 | Participants may submit translations for any or all of the language directions. In addition to the common test sets the workshop organizers will provide optional training resources, including a newly expanded release of the Europarl corpora and out-of-domain corpora.
41 | 


--------------------------------------------------------------------------------
/tests/data_readerwarc/test3.out:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/paracrawl/extractor/14d66691b0da4b41ad1e7fe2f27cc9e1ab9cbd58/tests/data_readerwarc/test3.out


--------------------------------------------------------------------------------
/tests/data_readerwarc/test4.in:
--------------------------------------------------------------------------------
 1 | WARC/1.0
 2 | WARC-Type: conversion
 3 | WARC-Target-URI: http://statmt.org/europarl/
 4 | WARC-Date: 2017-07-20T12:39:49Z
 5 | WARC-Record-ID: <urn:uuid:4e2c9e9d-16ad-487d-a7a3-a7844df236c2>
 6 | WARC-Refers-To: <urn:uuid:acf277dd-0b09-44bb-b7dd-1429e7b6cdca>
 7 | WARC-Block-Digest: sha1:UZJYKZZJH3QI3RMVNKBAHMU3VKEFZGS2
 8 | Content-Type: text/plain
 9 | 
10 | For a detailed description of this corpus, please read:
11 | 
12 |     Europarl: A Parallel Corpus for Statistical Machine Translation, Philipp Koehn, MT Summit 2005, pdf.
13 | 
14 |     Please cite the paper, if you use this corpus in your work. See also the extended (but earlier) version of the report (ps, pdf).
15 | 
16 | The Europarl parallel corpus is extracted from the proceedings of the European Parliament.
17 | It includes versions in 21 European languages: Romanic (French, Italian, Spanish, Portuguese, Romanian), Germanic (English, Dutch, German, Danish, Swedish), Slavik (Bulgarian, Czech, Polish, Slovak, Slovene), Finni-Ugric (Finnish, Hungarian, Estonian), Baltic (Latvian, Lithuanian), and Greek.
18 | http://statmt.org/europarl/
19 | 
20 | The goal of the extraction and processing was to generate sentence aligned text for statistical machine translation systems. For this purpose we extracted matching items and labeled them with corresponding document IDs. Using a preprocessor we identified sentence boundaries.
21 | We sentence aligned the data using a tool based on the Church and Gale algorithm.
22 | 
23 | WARC/1.0
24 | WARC-Type: conversion
25 | WARC-Target-URI: http://statmt.org/
26 | WARC-Date: 2017-07-20T12:39:49Z
27 | WARC-Record-ID: <urn:uuid:4e2c9e9d-16ad-487d-a7a3-a7844df236c2>
28 | WARC-Refers-To: <urn:uuid:acf277dd-0b09-44bb-b7dd-1429e7b6cdca>
29 | WARC-Block-Digest: sha1:UZJYKZZJH3QI3RMVNKBAHMU3VKEFZGS2
30 | Content-Type: text/plain
31 | Content-Length: 6598
32 | 
33 | This website is dedicated to research in statistical machine translation, i.e. the translation of text from one human language to another by a computer that learned how to translate from vast amounts of translated text.
34 | 


--------------------------------------------------------------------------------
/tests/data_readerwarc/test4.out:
--------------------------------------------------------------------------------
 1 | df6fa1abb58549287111ba8d776733e9 uri:http://statmt.org/europarl/
 2 | For a detailed description of this corpus, please read:
 3 |     Europarl: A Parallel Corpus for Statistical Machine Translation, Philipp Koehn, MT Summit 2005, pdf.
 4 |     Please cite the paper, if you use this corpus in your work. See also the extended (but earlier) version of the report (ps, pdf).
 5 | The Europarl parallel corpus is extracted from the proceedings of the European Parliament.
 6 | It includes versions in 21 European languages: Romanic (French, Italian, Spanish, Portuguese, Romanian), Germanic (English, Dutch, German, Danish, Swedish), Slavik (Bulgarian, Czech, Polish, Slovak, Slovene), Finni-Ugric (Finnish, Hungarian, Estonian), Baltic (Latvian, Lithuanian), and Greek.
 7 | http://statmt.org/europarl/
 8 | The goal of the extraction and processing was to generate sentence aligned text for statistical machine translation systems. For this purpose we extracted matching items and labeled them with corresponding document IDs. Using a preprocessor we identified sentence boundaries.
 9 | We sentence aligned the data using a tool based on the Church and Gale algorithm.
10 | df6fa1abb58549287111ba8d776733e9 uri:http://statmt.org/
11 | This website is dedicated to research in statistical machine translation, i.e. the translation of text from one human language to another by a computer that learned how to translate from vast amounts of translated text.
12 | 


--------------------------------------------------------------------------------
/tests/data_readerwarc/test5.in:
--------------------------------------------------------------------------------
 1 | WARC/1.1
 2 | WARC-Type: conversion
 3 | WARC-Target-URI: http://statmt.org/
 4 | WARC-Date: 2017-07-20T12:39:49Z
 5 | WARC-Record-ID: <urn:uuid:4e2c9e9d-16ad-487d-a7a3-a7844df236c2>
 6 | WARC-Refers-To: <urn:uuid:acf277dd-0b09-44bb-b7dd-1429e7b6cdca>
 7 | Content-Type: text/plain
 8 | Content-Length: 6598
 9 | 
10 | This website is dedicated to research in statistical machine translation, i.e. the translation of text from one human language to another by a computer that learned how to translate from vast amounts of translated text.
11 | 
12 | WARC-Type: conversion
13 | WARC/1.0
14 | WARC-Target-URI: http://statmt.org/europarl/
15 | WARC-Date: 2017-07-20T12:39:49Z
16 | WARC-Record-ID: <urn:uuid:4e2c9e9d-16ad-487d-a7a3-a7844df236c2>
17 | WARC-Refers-To: <urn:uuid:acf277dd-0b09-44bb-b7dd-1429e7b6cdca>
18 | WARC-Block-Digest: sha1:UZJYKZZJH3QI3RMVNKBAHMU3VKEFZGS2
19 | Content-Type: text/plain
20 | 
21 | The Europarl parallel corpus is extracted from the proceedings of the European Parliament.
22 | It includes versions in 21 European languages: Romanic (French, Italian, Spanish, Portuguese, Romanian), Germanic (English, Dutch, German, Danish, Swedish), Slavik (Bulgarian, Czech, Polish, Slovak, Slovene), Finni-Ugric (Finnish, Hungarian, Estonian), Baltic (Latvian, Lithuanian), and Greek.
23 | 
24 | WARC/1.1
25 | 
26 | The goal of the extraction and processing was to generate sentence aligned text for statistical machine translation systems.
27 | For this purpose we extracted matching items and labeled them with corresponding document IDs. Using a preprocessor we identified sentence boundaries.
28 | We sentence aligned the data using a tool based on the Church and Gale algorithm.
29 | WARC/1.0
30 | WARC-Type: conversion
31 | WARC-Target-URI: http://statmt.org/wmt14/
32 | WARC-Date: 2017-07-20T12:39:49Z
33 | WARC-Record-ID: <urn:uuid:4e2c9e9d-16ad-487d-a7a3-a7844df236c2>
34 | WARC-Refers-To: <urn:uuid:acf277dd-0b09-44bb-b7dd-1429e7b6cdca>
35 | Content-Type: text/plain
36 | Content-Length: 6598
37 | 
38 | This workshop builds on eight previous workshops on statistical machine translation, which is one of the most prestigious venues for research in computational linguistics:
39 | 


--------------------------------------------------------------------------------
/tests/data_readerwarc/test5.out:
--------------------------------------------------------------------------------
 1 | df6fa1abb58549287111ba8d776733e9 uri:http://statmt.org/europarl/
 2 | The Europarl parallel corpus is extracted from the proceedings of the European Parliament.
 3 | It includes versions in 21 European languages: Romanic (French, Italian, Spanish, Portuguese, Romanian), Germanic (English, Dutch, German, Danish, Swedish), Slavik (Bulgarian, Czech, Polish, Slovak, Slovene), Finni-Ugric (Finnish, Hungarian, Estonian), Baltic (Latvian, Lithuanian), and Greek.
 4 | WARC/1.1
 5 | The goal of the extraction and processing was to generate sentence aligned text for statistical machine translation systems.
 6 | For this purpose we extracted matching items and labeled them with corresponding document IDs. Using a preprocessor we identified sentence boundaries.
 7 | We sentence aligned the data using a tool based on the Church and Gale algorithm.
 8 | df6fa1abb58549287111ba8d776733e9 uri:http://statmt.org/wmt14/
 9 | This workshop builds on eight previous workshops on statistical machine translation, which is one of the most prestigious venues for research in computational linguistics:
10 | 


--------------------------------------------------------------------------------
/tests/test_integration.cpp:
--------------------------------------------------------------------------------
 1 | 
 2 | #include "gtest/gtest.h"
 3 | #include "../src/mono/worker.h"
 4 | #include "../src/utils/common.h"
 5 | #include "boost/filesystem/fstream.hpp"
 6 | #include "boost/filesystem/operations.hpp"
 7 | 
 8 | #include <iostream>
 9 | #include <fstream>
10 | #include <sstream>
11 | #include <string>
12 | #include <vector>
13 | 
14 | 
15 | typedef std::vector<std::pair<std::string, std::string>> pair_files_vec;
16 | 
17 | 
18 | void compare_lengths(std::string output_file, std::string expected_file) {
19 |   // Checks that the produced file and its reference under tests/data_integration hold
20 |   // text of the same length; the contents themselves are not compared.
21 |   std::cout << "Comparing: " << output_file << " and " << expected_file << std::endl;
22 |   const std::string reference_path = "../../tests/data_integration/" + expected_file;
23 |   std::ifstream produced(output_file);
24 |   std::ifstream reference(reference_path);
25 | 
26 |   if (!produced.is_open()) {
27 |     std::cerr << "Output file not found!" << std::endl;
28 |     FAIL();
29 |   }
30 |   if (!reference.is_open()) {
31 |     std::cerr << "Expected file not found!" << std::endl;
32 |     FAIL();
33 |   }
34 | 
35 |   // Slurp both streams into memory, closing each file as soon as it has been read.
36 |   std::stringstream produced_text, reference_text;
37 |   produced_text << produced.rdbuf();
38 |   produced.close();
39 |   reference_text << reference.rdbuf();
40 |   reference.close();
41 | 
42 |   ASSERT_EQ(produced_text.str().length(), reference_text.str().length());
43 | }
44 | 
45 | void compare(std::string test_name, std::string input_file, std::string output_folder, pair_files_vec test_files,
46 |              utils::compression_option in_compr, utils::compression_option out_compr) {
47 |   // Runs mono::worker_file on tests/data_integration/<input_file>, writing into a newly
48 |   // created <output_folder>/<test_name> directory, then passes every pair in test_files
49 |   // (produced file name, reference file name) to compare_lengths().
50 |   std::string input_path = std::string("../../tests/data_integration/") + input_file;
51 |   if (!boost::filesystem::exists(input_path)) {
52 |     std::cerr << "Input file not found!" << std::endl;
53 |     FAIL();
54 |   }
55 | 
56 |   // Built once so the path that is created, written to, read back and reported is the same.
57 |   const std::string test_output_dir = output_folder + "/" + test_name;
58 |   if (!boost::filesystem::create_directory(test_output_dir)) {
59 |     // Report the directory that actually blocks the run rather than only its parent.
60 |     std::cerr << "Output folder already exists! Please remove: " << test_output_dir << std::endl;
61 |     FAIL();
62 |   }
63 | 
64 |   mono::worker_file(input_path, false, test_output_dir, in_compr, out_compr);
65 |   // Bind by const reference so the two strings of each pair are not copied per iteration.
66 |   for (const auto &files : test_files) {
67 |     compare_lengths(test_output_dir + "/" + files.first, files.second);
68 |   }
69 | }
70 | 
71 | 
72 | TEST(integration, test_simple_gzip_to_langsplit) {
73 |   // End-to-end run over the three fixtures in tests/data_integration; every output lands
74 |   // in its own sub-directory of a freshly created working directory.
75 |   const std::string root_dir = "test_dir_integration";
76 |   if (!boost::filesystem::create_directory(root_dir)) {
77 |     std::cerr << "Output folder already exists! Please remove: " << root_dir << std::endl;
78 |     FAIL();
79 |   }
80 | 
81 |   // Each entry pairs a file produced by the run with the reference file it is checked against.
82 |   const pair_files_vec gzip_ja_en = {{"text.ja.out", "test1_ja.out"},
83 |                                      {"text.en.out", "test1_en.out"}};
84 |   const pair_files_vec gzip_de_ru = {{"text.de.out", "test2_de.out"},
85 |                                      {"text.ru.out", "test2_ru.out"}};
86 |   const pair_files_vec plain_en = {{"text.en.out", "test3_en.out"}};
87 | 
88 |   // The first two inputs are read with utils::gzip, the third with utils::none; the output
89 |   // option is utils::none throughout.
90 |   compare("test1", "test1.in.gz", root_dir, gzip_ja_en, utils::gzip, utils::none);
91 |   compare("test2", "test2.in.gz", root_dir, gzip_de_ru, utils::gzip, utils::none);
92 |   compare("test3", "test3.in", root_dir, plain_en, utils::none, utils::none);
93 | }


--------------------------------------------------------------------------------
/tests/test_langsplit.cpp:
--------------------------------------------------------------------------------
  1 | 
  2 | #include "gtest/gtest.h"
  3 | #include "../src/mono/filters/langsplitfilter.h"
  4 | 
  5 | #include <iostream>
  6 | #include <fstream>
  7 | #include <sstream>
  8 | #include <string>
  9 | 
 10 | #include <boost/iostreams/filter/aggregate.hpp>
 11 | #include <boost/iostreams/filtering_stream.hpp>
 12 | #include <boost/iostreams/filter/gzip.hpp>
 13 | #include "boost/filesystem/operations.hpp"
 14 | #include <boost/iostreams/copy.hpp>
 15 | 
 16 | 
 17 | void compare(std::string input_file, std::string expected_result_file) {
 18 |   // Pushes a fixture through LangsplitFilter (constructed with false as second argument)
 19 |   // and requires the filtered text to equal the expected fixture exactly.
 20 |   const std::string data_dir = "../../tests/data_langsplit/";
 21 |   std::ifstream source(data_dir + input_file);
 22 |   std::ifstream reference(data_dir + expected_result_file);
 23 | 
 24 |   if (!source.is_open()) {
 25 |     std::cerr << "Input file not found!" << std::endl;
 26 |     FAIL();
 27 |   }
 28 |   if (!reference.is_open()) {
 29 |     std::cerr << "Output file not found!" << std::endl;
 30 |     FAIL();
 31 |   }
 32 | 
 33 |   // Output chain: the filter first, an in-memory buffer as the final sink.
 34 |   std::stringstream filtered;
 35 |   mono::filters::LangsplitFilter splitter(data_dir, false);
 36 |   boost::iostreams::filtering_streambuf<boost::iostreams::output> sink_chain;
 37 |   sink_chain.push(splitter);
 38 |   sink_chain.push(filtered);
 39 |   boost::iostreams::filtering_streambuf<boost::iostreams::input> source_chain(source);
 40 |   boost::iostreams::copy(source_chain, sink_chain);
 41 | 
 42 |   // Read the expected fixture in full and compare it with what the filter produced.
 43 |   std::stringstream expected;
 44 |   expected << reference.rdbuf();
 45 |   ASSERT_EQ(filtered.str(), expected.str());
 46 | 
 47 |   source.close();
 48 |   reference.close();
 49 | }
 50 | 
 51 | void compare_stats(std::string input_file, std::string output_dir, std::string input_stat_file,
 52 |                    std::string expected_result_file) {
 53 |   // Runs LangsplitFilter (constructed with true as second argument) over a fixture, then
 54 |   // gunzips <output_dir>/<input_stat_file> and compares it with the expected fixture.
 55 |   const std::string data_dir = "../../tests/data_langsplit/";
 56 |   std::ifstream source(data_dir + input_file);
 57 |   std::ifstream reference(data_dir + expected_result_file);
 58 | 
 59 |   if (!source.is_open()) {
 60 |     std::cerr << "Input file not found!" << std::endl;
 61 |     FAIL();
 62 |   }
 63 |   if (!reference.is_open()) {
 64 |     std::cerr << "Output file not found!" << std::endl;
 65 |     FAIL();
 66 |   }
 67 | 
 68 |   // Filter the fixture into a null sink; the filtered text itself is discarded.
 69 |   boost::filesystem::create_directory(output_dir + "/stats");
 70 |   mono::filters::LangsplitFilter splitter(output_dir, true);
 71 |   boost::iostreams::filtering_streambuf<boost::iostreams::output> discard_chain;
 72 |   discard_chain.push(splitter);
 73 |   discard_chain.push(boost::iostreams::null_sink());
 74 |   boost::iostreams::filtering_streambuf<boost::iostreams::input> source_chain(source);
 75 |   boost::iostreams::copy(source_chain, discard_chain);
 76 | 
 77 |   // The statistics file is gzip data, so it is opened in binary mode.
 78 |   const std::string stats_path = output_dir + "/" + input_stat_file;
 79 |   std::ifstream stats_file(stats_path, std::ios_base::in | std::ios_base::binary);
 80 |   if (!stats_file.is_open()) {
 81 |     std::cerr << "langstat file not found!" << std::endl;
 82 |     FAIL();
 83 |   }
 84 | 
 85 |   // Decompress on the output side of the copy and collect the plain text in memory.
 86 |   std::stringstream stats_text;
 87 |   boost::iostreams::filtering_streambuf<boost::iostreams::output> gunzip_chain;
 88 |   gunzip_chain.push(boost::iostreams::gzip_decompressor());
 89 |   gunzip_chain.push(stats_text);
 90 |   boost::iostreams::filtering_streambuf<boost::iostreams::input> stats_chain(stats_file);
 91 |   boost::iostreams::copy(stats_chain, gunzip_chain);
 92 | 
 93 |   std::stringstream expected;
 94 |   expected << reference.rdbuf();
 95 |   ASSERT_EQ(stats_text.str(), expected.str());
 96 |   source.close();
 97 |   reference.close();
 98 | }
 99 | 
100 | 
101 | TEST(langsplit, test_simple) {
102 |   // Checks the filtered text once, then the statistics files langstats.0 and langstats.1.
103 |   const std::string work_dir = "test_dir_langsplit";
104 |   if (!boost::filesystem::create_directory(work_dir)) {
105 |     std::cerr << "Output folder already exists! Please remove: " << work_dir << std::endl;
106 |     FAIL();
107 |   }
108 |   compare("test1.in", "test1.out");
109 |   for (const char *stats_file : {"stats/langstats.0.gz", "stats/langstats.1.gz"}) {
110 |     compare_stats("test1.in", work_dir, stats_file, "test1_langstats.out");
111 |   }
112 | }
113 | 


--------------------------------------------------------------------------------
/tests/test_readerwarc.cpp:
--------------------------------------------------------------------------------
 1 | 
 2 | #include "gtest/gtest.h"
 3 | #include "../src/mono/filters/warcfilter.h"
 4 | 
 5 | #include <iostream>
 6 | #include <fstream>
 7 | #include <sstream>
 8 | #include <string>
 9 | 
10 | #include <boost/iostreams/filter/aggregate.hpp>
11 | #include <boost/iostreams/filtering_stream.hpp>
12 | #include <boost/iostreams/copy.hpp>
13 | 
14 | 
15 | void compare(std::string input_file, std::string expected_result_file) {
16 |   // Streams a fixture through WARCFilter and requires the result to equal the expected fixture.
17 |   const std::string data_dir = "../../tests/data_readerwarc/";
18 |   std::ifstream source(data_dir + input_file);
19 |   std::ifstream reference(data_dir + expected_result_file);
20 | 
21 |   if (!source.is_open()) {
22 |     std::cerr << "Input file not found!" << std::endl;
23 |     FAIL();
24 |   }
25 |   if (!reference.is_open()) {
26 |     std::cerr << "Output file not found!" << std::endl;
27 |     FAIL();
28 |   }
29 | 
30 |   // Output chain: WARCFilter writing into an in-memory buffer.
31 |   std::stringstream filtered;
32 |   boost::iostreams::filtering_streambuf<boost::iostreams::output> sink_chain;
33 |   sink_chain.push(mono::filters::WARCFilter());
34 |   sink_chain.push(filtered);
35 |   boost::iostreams::filtering_streambuf<boost::iostreams::input> source_chain(source);
36 |   boost::iostreams::copy(source_chain, sink_chain);
37 | 
38 |   // Load the expected text in full and compare.
39 |   std::stringstream expected;
40 |   expected << reference.rdbuf();
41 |   ASSERT_EQ(filtered.str(), expected.str());
42 | 
43 |   source.close();
44 |   reference.close();
45 | }
46 | 
47 | // Each case runs one fixture (<name>.in) through compare() and expects <name>.out.
48 | TEST(readerwarc, test_simple) {
49 |   compare("test1.in", "test1.out");
50 | }
51 | 
52 | TEST(readerwarc, test_good_uri) {
53 |   compare("test2.in", "test2.out");
54 | }
55 | 
56 | TEST(readerwarc, test_bad_uri) {
57 |   compare("test3.in", "test3.out");
58 | }
59 | 
60 | TEST(readerwarc, test_newlines) {
61 |   compare("test4.in", "test4.out");
62 | }
63 | 
64 | TEST(readerwarc, test_header) {
65 |   compare("test5.in", "test5.out");
66 | }
67 | 


--------------------------------------------------------------------------------
/tests/test_utils.cpp:
--------------------------------------------------------------------------------
 1 | 
 2 | #include "gtest/gtest.h"
 3 | #include "../src/utils/common.h"
 4 | 
 5 | TEST(utils, test_common_parse_uri) {
 6 |   utils::parse_uri http_with_path("http://blogos.com/article/118568/");
 7 |   utils::parse_uri http_bare("http://blogos.com");
 8 |   utils::parse_uri http_subdomain("http://000.blogos.com");
 9 |   utils::parse_uri https_with_path("https://blogos.com/article/118568/");
10 |   utils::parse_uri https_bare("https://blogos.com");
11 |   utils::parse_uri https_subdomain("https://000.blogos.com");
12 |   utils::parse_uri no_scheme("111.blogos.com");
13 |   utils::parse_uri port_and_path("eu.blogos.com:1111/article");
14 |   utils::parse_uri user_port_path("username@us.blogos.com:1111/article");
15 |   utils::parse_uri http_user_query("http://username@us.blogos.com:1111/article?username=noname");
16 |   utils::parse_uri ftp_bare("ftp://blogos.com");
17 |   utils::parse_uri ftp_user_fragment("ftp://username@us.blogos.com:1111/article?username=noname#h1");
18 |   utils::parse_uri single_label("foofoofoo");
19 |   utils::parse_uri single_label_path("foofoofoo/bar");
20 |   // Domain: scheme, user info, port, path, query and fragment are all expected to be dropped.
21 |   ASSERT_EQ(http_with_path.get_domain(), "blogos.com");
22 |   ASSERT_EQ(http_bare.get_domain(), "blogos.com");
23 |   ASSERT_EQ(http_subdomain.get_domain(), "000.blogos.com");
24 |   ASSERT_EQ(https_with_path.get_domain(), "blogos.com");
25 |   ASSERT_EQ(https_bare.get_domain(), "blogos.com");
26 |   ASSERT_EQ(https_subdomain.get_domain(), "000.blogos.com");
27 |   ASSERT_EQ(no_scheme.get_domain(), "111.blogos.com");
28 |   ASSERT_EQ(port_and_path.get_domain(), "eu.blogos.com");
29 |   ASSERT_EQ(user_port_path.get_domain(), "us.blogos.com");
30 |   ASSERT_EQ(http_user_query.get_domain(), "us.blogos.com");
31 |   ASSERT_EQ(ftp_bare.get_domain(), "blogos.com");
32 |   ASSERT_EQ(ftp_user_fragment.get_domain(), "us.blogos.com");
33 |   ASSERT_EQ(single_label.get_domain(), "foofoofoo");
34 |   ASSERT_EQ(single_label_path.get_domain(), "foofoofoo");
35 |   // TLD: the last dot-separated label, or the whole host when it contains no dot.
36 |   ASSERT_EQ(http_with_path.get_tld(), "com");
37 |   ASSERT_EQ(https_subdomain.get_tld(), "com");
38 |   ASSERT_EQ(single_label.get_tld(), "foofoofoo");
39 |   // Path: expected without the leading slash, query or fragment; empty when absent.
40 |   ASSERT_EQ(http_with_path.get_path(), "article/118568/");
41 |   ASSERT_EQ(port_and_path.get_path(), "article");
42 |   ASSERT_EQ(http_user_query.get_path(), "article");
43 |   ASSERT_EQ(ftp_user_fragment.get_path(), "article");
44 |   ASSERT_EQ(single_label.get_path(), "");
45 |   ASSERT_EQ(single_label_path.get_path(), "bar");
46 | }


--------------------------------------------------------------------------------