├── .gitattributes ├── .github └── workflows │ └── ci.yml ├── .gitignore ├── .jvmopts ├── .scalafmt.conf ├── LICENSE ├── README.md ├── bench └── src │ └── main │ └── java │ └── test │ ├── Log10Bench.java │ └── SqrtBench.java ├── build.sbt ├── core └── src │ └── main │ ├── resources │ ├── chitra.conf │ ├── ng_words.txt │ ├── reference.conf │ ├── rmTemplate.conf │ ├── sudachiDictCorpus.conf │ ├── template_sentences.txt │ └── warc.conf │ └── scala │ └── com │ └── worksap │ └── nlp │ └── uzushio │ ├── CorpusCleaner.scala │ ├── DocumentIO.scala │ ├── MinHash.scala │ ├── MinHashDeduplicator.scala │ ├── Sudachi.scala │ ├── SudachiTokenizer.scala │ ├── TokenHasher.scala │ ├── cleaning │ ├── ConcatShortSentence.scala │ ├── DeduplicateElement.scala │ ├── DeduplicateRepeatingSentence.scala │ ├── FieldSettable.scala │ ├── Filter.scala │ ├── FilterBySentenceLength.scala │ ├── FilterJapaneseBasedOnCharacter.scala │ ├── NormalizeCharacter.scala │ ├── NormalizeWhitespace.scala │ ├── Normalizer.scala │ ├── Pipeline.scala │ ├── RemoveEmail.scala │ ├── RemoveNGWordDocument.scala │ ├── RemoveScriptDocument.scala │ ├── RemoveShortDocument.scala │ ├── RemoveSubstring.scala │ ├── RemoveURL.scala │ ├── RemoveWikipediaCitation.scala │ ├── SplitElement.scala │ └── Transformer.scala │ └── main │ ├── DeduplicateParagraphs.scala │ └── ExtractTextFromWarc.scala ├── docs └── tutorial.md ├── legacy ├── README.md ├── list_common_substr.py ├── src │ └── main │ │ └── scala │ │ └── com │ │ └── worksap │ │ └── nlp │ │ └── uzushio │ │ └── warc │ │ ├── HttpResponseParser.scala │ │ ├── HttpResponseSerializable.scala │ │ ├── JusTextHandler.scala │ │ ├── LongWritableSerializable.scala │ │ ├── NWCToolkitHandler.scala │ │ ├── ParagraphHandler.scala │ │ ├── README.md │ │ ├── WarcFileReader.scala │ │ ├── WarcInputFormat.scala │ │ ├── WarcLoader.scala │ │ ├── WarcRecord.scala │ │ ├── WarcToDocument.scala │ │ └── WarcWritable.scala └── suffixarray.py ├── lib └── src │ ├── main │ ├── resources │ │ ├── com │ │ │ └── worksap │ │ │ │ └── nlp │ │ │ │ └── uzushio │ │ │ │ └── lib │ │ │ │ └── filters │ │ │ │ ├── hojichar │ │ │ │ ├── README.md │ │ │ │ ├── adult_keywords_ja.txt │ │ │ │ └── discriminations_keywords_ja.txt │ │ │ │ └── ng_words.txt │ │ └── pipeline │ │ │ └── all_duplicate_paragraphs.conf │ └── scala │ │ └── com │ │ └── worksap │ │ └── nlp │ │ └── uzushio │ │ └── lib │ │ ├── cleaning │ │ ├── PathSegment.scala │ │ └── Pipeline.scala │ │ ├── filters │ │ ├── AdjacentDuplicateParagraphs.scala │ │ ├── CompressionRate.scala │ │ ├── DeduplicateDocuments.scala │ │ ├── DeduplicateDocumentsPercentile.scala │ │ ├── DocLength.scala │ │ ├── DuplicateDocumentsLengthWeighted.scala │ │ ├── DuplicateParagraphs.scala │ │ ├── HiraganaRatio.scala │ │ ├── KenLMDocAvgPerplexity.scala │ │ ├── KenLMParagraphPerplexity.scala │ │ ├── LargeFreqParagraphs.scala │ │ ├── LinkCharRatio.scala │ │ ├── MarkdownizeHeading.scala │ │ ├── MergeListTag.scala │ │ ├── NoContentDOM.scala │ │ ├── WordInstances.scala │ │ ├── WordTypes.scala │ │ └── base │ │ │ ├── FilterBase.scala │ │ │ └── HighLowDocFilter.scala │ │ ├── html │ │ ├── AllTagMapper.scala │ │ ├── ParagraphExtractor.scala │ │ └── ParseAbortException.scala │ │ ├── lang │ │ ├── LangEstimation.scala │ │ └── LangTagSniffer.scala │ │ ├── resources │ │ └── CachedLocalResource.scala │ │ ├── runners │ │ ├── DedupFilterStatistics.scala │ │ ├── DeduplicateParagraphs.scala │ │ ├── ExtractParagraphsFromWARC.scala │ │ ├── FilterStatistics.scala │ │ ├── KenLMRunner.scala │ │ ├── MergeDedupStats.scala │ │ └── Repackage.scala │ │ 
├── stats │ │ ├── CountMinSketch.scala │ │ ├── NgramBitSignatures.java │ │ └── SimHashProcessor.scala │ │ ├── utils │ │ ├── BuilderSyntax.scala │ │ ├── Levenshtein.java │ │ ├── MathUtil.java │ │ ├── Paragraphs.scala │ │ ├── Resources.scala │ │ ├── RowBuffer.java │ │ ├── SentenceIterator.scala │ │ ├── SessionBufferIn.scala │ │ ├── TrieNode.scala │ │ ├── WarcFileReader.scala │ │ └── Ziggurat.java │ │ └── warc │ │ ├── WarcEntryParser.scala │ │ ├── WarcInputFormat.scala │ │ ├── WarcLoader.scala │ │ ├── WarcRecord.scala │ │ └── WarcWritable.scala │ └── test │ ├── resources │ ├── docs │ │ ├── links.html │ │ ├── paragraph_detect.html │ │ ├── perldoc_ja.html │ │ └── perldoc_ja_small.html │ ├── lang │ │ └── shift_jis.txt │ ├── pipeline │ │ └── doc_len.conf │ └── text │ │ └── dedup_docomo.txt │ └── scala │ └── com │ └── worksap │ └── nlp │ └── uzushio │ └── lib │ ├── cleaning │ ├── DocumentSpec.scala │ ├── ParagraphSpec.scala │ ├── PathSegmentSpec.scala │ └── PipelineSpec.scala │ ├── dupes │ └── CandidateRowProcessorSpec.scala │ ├── filters │ ├── AdjacentDuplicateParagraphsSpec.scala │ ├── CompressionRateSpec.scala │ ├── DeduplicateDocumentsSpec.scala │ ├── LargeFreqParagraphsSpec.scala │ ├── LinkCharRatioSpec.scala │ ├── MarkdownizeHeadingSpec.scala │ ├── MergeListTagSpec.scala │ ├── NoContentDOMSpec.scala │ ├── WordInstancesSpec.scala │ └── package.scala │ ├── html │ └── HtmlParserSpec.scala │ ├── lang │ └── LangEstimationSpec.scala │ ├── runners │ └── MergeStatsSpec.scala │ ├── utils │ ├── ClasspathAccess.scala │ ├── MathUtilTest.scala │ ├── ParagraphsSpec.scala │ ├── RowBufferSpec.scala │ ├── SentenceIteratorSpec.scala │ └── TrieSpec.scala │ └── warc │ └── WarcEntryParserSpec.scala ├── project ├── Build.scala ├── build.properties └── plugins.sbt ├── scripts ├── cal_overlap_ratio │ ├── README.md │ ├── cal_overlap.py │ └── visualize.py ├── count_filter_statistics.py ├── count_tokens.py ├── pipeline_01.conf ├── pipeline_02.conf ├── pipeline_03a.conf ├── pipeline_test_perplexity.conf ├── submit_all_compute_stats.sh ├── submit_all_compute_stats_old.sh ├── submit_all_filter.sh ├── submit_all_merges_stage1.sh ├── submit_all_merges_stage2.sh ├── submit_calc_overlap.sh ├── submit_dedup_stage1.sh ├── submit_dedup_stats.sh ├── submit_filter_debug.sh ├── submit_filter_debug_2.sh ├── submit_kenlm.sh ├── submit_merge_stats.sh ├── submit_merge_stats_final.sh └── vis │ └── vis_filter.py └── spark-config └── abci-f ├── spark-defaults.conf └── spark-env.sh /.gitattributes: -------------------------------------------------------------------------------- 1 | *.txt text eol=lf 2 | *.html text eol=lf -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: 3 | push: 4 | paths: 5 | - main 6 | pull_request: 7 | jobs: 8 | build: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v4 12 | - uses: actions/setup-java@v4 13 | with: 14 | distribution: temurin 15 | java-version: 17 16 | cache: sbt 17 | - name: Setup sbt launcher 18 | uses: sbt/setup-sbt@v1 19 | - run: sbt "scalafmtCheck;test" 20 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # sbt 2 | target/ 3 | .bloop/ 4 | .bsp/ 5 | .idea/ 6 | .venv/ 7 | 8 | # metal 9 | .metals/ 10 | metals.sbt 11 | 12 | # ignore abci logs for shell scripts 13 | *.sh.o* 
-------------------------------------------------------------------------------- /.jvmopts: -------------------------------------------------------------------------------- 1 | -Xmx2G 2 | --add-exports=java.base/sun.nio.ch=ALL-UNNAMED -------------------------------------------------------------------------------- /.scalafmt.conf: -------------------------------------------------------------------------------- 1 | version = "3.7.12" 2 | runner.dialect = scala213 3 | project.excludePaths = [ 4 | "glob:**/legacy/**/**.scala" 5 | ] 6 | 7 | maxColumn = 100 8 | align.preset = none 9 | 10 | newlines.avoidForSimpleOverflow = [punct] 11 | newlines.beforeMultiline = fold 12 | newlines.selectChains = fold 13 | 14 | rewrite.rules = [RedundantBraces, RedundantParens, SortModifiers] 15 | rewrite.redundantBraces.stringInterpolation = true 16 | rewrite.redundantBraces.generalExpressions = false 17 | rewrite.redundantBraces.defnBodies = noParams 18 | runner.optimizer.forceConfigStyleMinArgCount = 4 19 | 20 | rewrite.trailingCommas.style = "keep" -------------------------------------------------------------------------------- /bench/src/main/java/test/Log10Bench.java: -------------------------------------------------------------------------------- 1 | package test; 2 | 3 | import org.apache.commons.math3.util.FastMath; 4 | import org.openjdk.jmh.annotations.Benchmark; 5 | import org.openjdk.jmh.annotations.Scope; 6 | import org.openjdk.jmh.annotations.Setup; 7 | import org.openjdk.jmh.annotations.State; 8 | 9 | import java.util.Random; 10 | 11 | @State(Scope.Benchmark) 12 | public class Log10Bench { 13 | double[] arg; 14 | 15 | @Setup 16 | public void setup() { 17 | double[] arr = new double[10000]; 18 | Random rng = new Random(42L); 19 | for (int i = 0; i < 10000; ++i) { 20 | arr[i] = rng.nextInt(10000) + 1; 21 | } 22 | arg = arr; 23 | } 24 | 25 | 26 | @Benchmark 27 | public double bulitin() { 28 | double result = 0; 29 | double[] arr = arg; 30 | for (int i = 0; i < 10000; i++) { 31 | double v = arr[i]; 32 | result += Math.log10(v); 33 | } 34 | return result; 35 | } 36 | 37 | @Benchmark 38 | public double fastMath() { 39 | double result = 0; 40 | double[] arr = arg; 41 | for (int i = 0; i < 10000; i++) { 42 | double v = arr[i]; 43 | result += FastMath.log10(v); 44 | } 45 | return result; 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /bench/src/main/java/test/SqrtBench.java: -------------------------------------------------------------------------------- 1 | package test; 2 | 3 | import org.apache.commons.math3.util.FastMath; 4 | import org.openjdk.jmh.annotations.Benchmark; 5 | import org.openjdk.jmh.annotations.Scope; 6 | import org.openjdk.jmh.annotations.Setup; 7 | import org.openjdk.jmh.annotations.State; 8 | 9 | import java.util.Random; 10 | 11 | @State(Scope.Benchmark) 12 | public class SqrtBench { 13 | double[] arg; 14 | 15 | @Setup 16 | public void setup() { 17 | double[] arr = new double[10000]; 18 | Random rng = new Random(42L); 19 | for (int i = 0; i < 10000; ++i) { 20 | arr[i] = rng.nextInt(10000) + 1; 21 | } 22 | arg = arr; 23 | } 24 | 25 | 26 | @Benchmark 27 | public double bulitin() { 28 | double result = 0; 29 | double[] arr = arg; 30 | for (int i = 0; i < 10000; i++) { 31 | double v = arr[i]; 32 | result += Math.sqrt(v); 33 | } 34 | return result; 35 | } 36 | 37 | @Benchmark 38 | public double fastMath() { 39 | double result = 0; 40 | double[] arr = arg; 41 | for (int i = 0; i < 10000; i++) { 42 | double v = arr[i]; 43 | result += 
FastMath.sqrt(v); 44 | } 45 | return result; 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | import Build._ 2 | 3 | inThisBuild( 4 | Seq( 5 | scalaVersion := V.scala212, 6 | organization := "com.worksap", 7 | organizationName := "Works Applications", 8 | startYear := Some(2023), 9 | homepage := Some(url("https://github.com/WorksApplications/uzushio")), 10 | versionScheme := Some("early-semver"), 11 | developers := List( 12 | Developer( 13 | "eiennohito", 14 | "Arseny Tolmachev", 15 | "arseny@kotonoha.ws", 16 | url("https://github.com/eiennohito") 17 | ) 18 | ) 19 | ) 20 | ) 21 | lazy val commonSettings = Seq( 22 | crossScalaVersions := Seq(V.scala212), 23 | scalacOptions ++= Seq( 24 | "-feature", 25 | "-deprecation", 26 | "-unchecked", 27 | "-encoding", 28 | "utf-8" 29 | ), 30 | javacOptions ++= Seq( 31 | "-encoding", 32 | "utf8", 33 | "-Xlint:all", 34 | "-source", 35 | "1.8", 36 | "-target", 37 | "1.8" 38 | ) 39 | ) 40 | 41 | disablePlugins(sbtassembly.AssemblyPlugin) 42 | 43 | lazy val root = (project in file(".")) 44 | .aggregate( 45 | lib, 46 | core, 47 | legacy 48 | ) 49 | .settings( 50 | name := "uzushio-root" 51 | ) 52 | .settings(noPublishSettings) 53 | .settings(commonSettings) 54 | 55 | lazy val legacy = (project in file("legacy")) 56 | .disablePlugins(sbtassembly.AssemblyPlugin) 57 | .dependsOn(lib) 58 | .settings( 59 | libraryDependencies ++= sparkDependencies.map(_ % Provided) 60 | ) 61 | 62 | lazy val core = (project in file("core")) 63 | .enablePlugins(sbtassembly.AssemblyPlugin) 64 | .settings( 65 | name := "uzushio", 66 | libraryDependencies ++= sparkDependencies.map( 67 | _ % Provided 68 | ) 69 | ) 70 | .settings(commonSettings) 71 | .settings(lintSettings) 72 | .settings(assemblySettings) 73 | .dependsOn(lib) 74 | 75 | lazy val lib = (project in file("lib")) 76 | .disablePlugins(sbtassembly.AssemblyPlugin) 77 | .settings( 78 | name := "uzushio-lib", 79 | libraryDependencies ++= sparkDependencies.map(_ % Optional), 80 | libraryDependencies ++= libdependencies, 81 | scalacOptions ++= ( 82 | if (scalaVersion.value.startsWith("2.")) { 83 | Seq("-opt:l:inline", "-opt-inline-from:classpath") 84 | } else { 85 | Seq.empty 86 | } 87 | ), 88 | ) 89 | .settings(commonSettings) 90 | .settings(lintSettings) 91 | .settings(scalaCompatSettings) 92 | 93 | lazy val bench = (project in file("bench")) 94 | .disablePlugins(sbtassembly.AssemblyPlugin) 95 | .enablePlugins(JmhPlugin) 96 | .settings(commonSettings) 97 | .settings(noPublishSettings) 98 | .dependsOn(lib) 99 | -------------------------------------------------------------------------------- /core/src/main/resources/chitra.conf: -------------------------------------------------------------------------------- 1 | { 2 | "stages": [ 3 | {"class": "SplitIntoSentence"}, 4 | {"class": "RemoveWikipediaCitation"}, 5 | {"class": "NormalizeCharacter", "keepWS": false}, 6 | {"class": "NormalizeWhitespace"}, 7 | {"class": "ConcatShortSentence", "concatThr": 2}, 8 | {"class": "RemoveEmail"}, 9 | {"class": "RemoveURL"}, 10 | {"class": "FilterBySentenceLength", "min":10, "max": 200}, 11 | {"class": "RemoveShortDocument", "min": 5}, 12 | {"class": "RemoveScriptDocument"}, 13 | {"class": "RemoveNGWordDocument", "path": "ng_words.txt"}, 14 | ], 15 | } 16 | -------------------------------------------------------------------------------- /core/src/main/resources/ng_words.txt: 
-------------------------------------------------------------------------------- 1 | fuck 2 | g スポット 3 | sm女王 4 | tenga 5 | あばずれ 6 | あぱずれ 7 | あほ 8 | うざ 9 | うんこ 10 | え〇 11 | えっち 12 | おしっこ 13 | おしりのあな 14 | おっぱい 15 | おもらし 16 | かたわ 17 | きちがい 18 | きめぇ 19 | きめえ 20 | くそ 21 | せんずり 22 | ち〇 23 | ちんぐり 24 | ちんこ 25 | つるぺた 26 | つんぼ 27 | ふたなり 28 | ぶさいく 29 | ぶす 30 | ま〇 31 | まんぐり 32 | まんこ 33 | めくら 34 | やりまん 35 | アスペ 36 | アスホール 37 | アナリングス 38 | アナル 39 | アヌス 40 | アバズレ 41 | アパズレ 42 | アホ 43 | イマラチオ 44 | イメクラ 45 | イラマチオ 46 | ウザ 47 | ウンコ 48 | エ〇 49 | エッチ 50 | エロ 51 | オカマ 52 | オッパイ 53 | オナ 54 | オナニー 55 | オフパコ 56 | オマンコ 57 | オルガズム 58 | オーガズム 59 | カス 60 | ガイジ 61 | キチガイ 62 | キモ 63 | クズ 64 | クソ 65 | クリトリス 66 | クンニ 67 | クンニリングス 68 | グループ・セックス 69 | グロ 70 | ゲイボーイ 71 | ゲイ・セックス 72 | ゲロ 73 | コカイン 74 | コキ 75 | コンドーム 76 | ザーメン 77 | シコ 78 | ショタ 79 | スカトロ 80 | スケベ 81 | ストリップ劇場 82 | スマタ 83 | セクロス 84 | セックス 85 | セフレ 86 | センズリ 87 | ダッチワイフ 88 | チ〇 89 | テレフォンセックス 90 | ディルド 91 | ディープ・スロート 92 | デブ 93 | デリヘル 94 | デートレイプ 95 | ドキュン 96 | ナマポ 97 | ニガー 98 | ヌい 99 | ヌく 100 | ヌけ 101 | ネオ・ナチ 102 | ハメ撮り 103 | パイズリ 104 | パイパン 105 | パンチラ 106 | パンティー 107 | ビッチ 108 | ピロートーク 109 | ファック 110 | フェラ 111 | フェラチオ 112 | ブサイク 113 | ブス 114 | プリンス アルバート ピアス 115 | ペッティング 116 | ペニス 117 | ペニスバンド 118 | ホモ 119 | ボンテージ 120 | ボールギャグ 121 | ポルノグラフィー 122 | マ〇 123 | マザー・ファッカー 124 | マスターベーション 125 | マラ 126 | マンコ 127 | ヤラせ 128 | ラブホ 129 | リスカ 130 | リストカット 131 | リョナ 132 | リンチ 133 | レイプ 134 | レズ 135 | 不細工 136 | 中出し 137 | 乱交 138 | 二穴 139 | 人妻 140 | 側位 141 | 児童性虐待 142 | 前戯 143 | 勃起する 144 | 合いの子 145 | 四十八手 146 | 売り専 147 | 売国 148 | 売女 149 | 売春婦 150 | 外人 151 | 夢精 152 | 大人のおもちゃ 153 | 大人のオモチャ 154 | 大人の玩具 155 | 大陰唇 156 | 射精 157 | 尻軽 158 | 尿道プレイ 159 | 巨乳 160 | 巨根 161 | 強姦犯 162 | 後戯 163 | 後背位 164 | 手コキ 165 | 手マン 166 | 援交 167 | 援助交際 168 | 支那 169 | 新しいポルノ 170 | 正常位 171 | 殺し方 172 | 殺人方法 173 | 氏ね 174 | 氏んだ 175 | 氏んで 176 | 気違い 177 | 池沼 178 | 淫乱 179 | 潮吹き女 180 | 潮吹き男性 181 | 熟女 182 | 獣姦 183 | 玉なめ 184 | 玉舐め 185 | 男根 186 | 痴呆 187 | 穴兄弟 188 | 竿姉妹 189 | 筆おろし 190 | 精液 191 | 糞便 192 | 糞尿愛好症 193 | 素股 194 | 緊縛 195 | 老害 196 | 肉便器 197 | 自慰 198 | 裸の女性 199 | 貞操帯 200 | 賢者タイム 201 | 足フェチ 202 | 輪姦 203 | 近親相姦 204 | 阿呆 205 | 陰毛 206 | 電マ 207 | 顔射 208 | 顔面騎乗 209 | 騎上位 210 | 騎乗位 -------------------------------------------------------------------------------- /core/src/main/resources/reference.conf: -------------------------------------------------------------------------------- 1 | // this conf contains the default values. 2 | { 3 | // list of pipeline stages. 4 | "stages": [ 5 | {"class": "Identity"}, 6 | ], 7 | "input": { 8 | // text or parquet. 9 | "format": "text", 10 | // Delimiter of documents (text). 11 | "delimiter": "\n\n", 12 | // Name of the document column (parquet). 13 | "column": "document", 14 | }, 15 | "output": { 16 | // text or parquet. 17 | "format": "text", 18 | // Delimiter of documents (text). 19 | "delimiter": "\n\n", 20 | // Name of the document column (parquet). 21 | "column": "document", 22 | // Delimiter of elements e.g. paragraph, sentence. 
23 | "elementDelimiter": "\n", 24 | }, 25 | } 26 | -------------------------------------------------------------------------------- /core/src/main/resources/rmTemplate.conf: -------------------------------------------------------------------------------- 1 | { 2 | "stages": [ 3 | {"class": "DeduplicateElement"}, // deduplicate per document 4 | {"class": "SplitIntoSentence"}, 5 | {"class": "DeduplicateRepeatingSentence", "minRepeat": 2}, 6 | {"class": "RemoveSubstring", 7 | "path": "template_sentences.txt", 8 | "delim": "\n\n", // template_sentences contains multi-sentence pattern. 9 | "matchSentence": true}, // match full sentence only. 10 | {"class": "RemoveShortDocument", "min": 5}, 11 | ], 12 | } 13 | -------------------------------------------------------------------------------- /core/src/main/resources/sudachiDictCorpus.conf: -------------------------------------------------------------------------------- 1 | { 2 | "stages": [ 3 | {"class": "SplitIntoSentence"}, 4 | {"class": "NormalizeCharacter", "keepWS": true}, 5 | {"class": "NormalizeWhitespace"}, 6 | {"class": "DeduplicateElement"}, // deduplicate per sentence 7 | ], 8 | "output": { 9 | "delimiter": "\n", // concat documents 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /core/src/main/resources/template_sentences.txt: -------------------------------------------------------------------------------- 1 | 管理人のみ閲覧できます 2 | 3 | このコメントは管理人のみ閲覧できます 4 | 5 | 管理者にだけ表示を許可する 6 | 7 | このコメントは管理者の承認待ちです 8 | 9 | 管理人の承認後に表示されます 10 | 11 | 気になるリストに保存 この求人の詳細をみる 12 | 13 | 最新コメントのRSS 14 | 15 | 最新トラックバックのRSS 16 | 17 | この広告は60日以上更新がないブログに表示がされております 18 | 19 | この広告は60日以上更新がないブログに表示がされております 20 | 以下のいずれかの方法で非表示にすることが可能です 21 | ・記事の投稿、編集をおこなう 22 | ・マイブログの【設定】 >【広告設定】 より、「60日間更新が無い場合」 の 「広告を表示しない」にチェックを入れて保存する 23 | 24 | 上記の広告は1ヶ月以上更新のないブログに表示されています 25 | 新しい記事を書く事で広告が消せます 26 | 27 | 上記広告は1ヶ月以上更新のないブログに表示されています 28 | 新しい記事を書くことで広告を消せます 29 | 30 | [PR]この広告は3ヶ月以上更新がないため表示されています 31 | ホームページを更新後24時間以内に表示されなくなります 32 | 33 | ブロとも申請フォーム 34 | 35 | ■ブロとも申請フォーム 36 | 37 | ■ ブロとも申請フォーム 38 | 39 | この人とブロともになる 40 | 41 | この記事にトラックバックする(FC2ブログユーザー) 42 | 43 | この記事に対してトラックバックを送信する(FC2ブログユーザー) 44 | 45 | FC2ブログユーザー専用トラックバックURLはこちら 46 | 47 | トラックバックURLはこちら 48 | 49 | この記事のトラックバックURL 50 | 51 | この記事へのトラックバック 52 | 53 | この記事へのトラックバックURL 54 | 55 | 最近のトラックバック 56 | 57 | ■最近のトラックバック 58 | 59 | ■ 最近のトラックバック 60 | 61 | ※ブログオーナーが承認したトラックバックのみ表示されます 62 | 63 | ※言及リンクのないトラックバックは受信されません 64 | 65 | コメントをする・見る 66 | 67 | トラックバックする・見る 68 | 69 | スマートフォン専用ページを表示 70 | 71 | このブログをリンクに追加する 72 | 73 | このページのトップへ 74 | 75 | ページのトップへ戻る 76 | 77 | FC2ブログへようこそ 78 | 79 | 自分のブログにトラックバック記事作成(会員用) 80 | 81 | この記事へのコメント 82 | 83 | この記事に対するコメント 84 | 85 | この記事に対するコメントの投稿 86 | 87 | この記事に対するトラックバック 88 | 89 | ※画像の中の文字を半角で入力してください 90 | 91 | お名前: [必須入力] 92 | 93 | クリックして気持ちを伝えよう 94 | ログインしてクリックすれば、自分のブログへのリンクが付きます 95 | 96 | 最近の記事+コメント 97 | 98 | アクセスランキングを見る>> 99 | 100 | さらに詳しい情報はコチラ 101 | 102 | このブログの読者になる 103 | 104 | 更新情報をチェックする 105 | 106 | 同じテーマのブログ記事 107 | 108 | 開始・終了時間は直接の確認をおすすめします 109 | 110 | 閲覧するには管理人が設定した 111 | パスワードの入力が必要です 112 | 113 | 管理人からのメッセージ 114 | 115 | ブログ画像一覧を見る 116 | 117 | このブログの読者になる(チェック) 118 | 119 | アメーバブログトップへ 120 | 121 | ※著作権についてのご注意 122 | 123 | ブログのトップページへ 124 | 125 | 最新の記事一覧ページへ 126 | 127 | このブログの更新情報が届きます 128 | 129 | 自分のランキングを詳しく見る>> 130 | 131 | 人気ブログランキングトップへ 132 | 133 | このブログはランキングに参加していません 134 | 135 | このブログにコメントするにはログインが必要です 136 | 137 | この記事には許可ユーザしかコメントができません 138 | 139 | 読者になると、このブログの更新情報が届きます 140 | 141 | ブログの更新情報が受け取れて、アクセスが簡単になります 142 | 143 | このページの先頭へ▲ 
144 | 145 | [ コメント記入欄を表示 ] 146 | 147 | 人気ブログランキングへ 148 | 149 | このBlogのトップへ│前の記事│次の記事 150 | 151 | このブログはジャンルランキングに参加していません 152 | 153 | アメーバID登録して、ブログをつくろう 154 | 155 | 本ブログパーツの提供を終了しました 156 | 157 | この記事は削除されているか、 158 | または未来記事設定(現日時以降の公開)された記事のため表示できません 159 | 160 | 前の記事│このブログのトップへ│次の記事 161 | 162 | FLO:Qで世界にひとつだけのブログパーツを作ろう 163 | 164 | ブログの説明を入力します 165 | 166 | あなたもピュアブログでブログをつくりませんか 167 | 168 | あなたもエコ・ブログでブログをつくりませんか 169 | 170 | 「気になる」をクリックで 171 | 回答がついた時に通知でお知らせします 172 | 173 | ブログやるならFC2ブログ 174 | 175 | 無料ブログはココログ 176 | 177 | この広告は1年以上新しい記事の投稿がないブログに表示されております 178 | 179 | 掲載情報の著作権は提供元企業等に帰属します 180 | 181 | こんにちはゲストさん 182 | 183 | 会員登録(無料)して質問・回答してみよう 184 | 185 | 管理者にだけ表示を許可 186 | 187 | FC2ブックマークに追加する 188 | 189 | この記事にトラックバックする(FC2ブログユーザー限定) 190 | 191 | この記事を 編集・削除 192 | 193 | ※コメント書き込みは制限されています 194 | 195 | ブックマークに登録する 196 | 197 | ※ブログオーナーが承認したコメントのみ表示されます 198 | 199 | この記事にトラックバック 200 | 201 | この記事にトラックバック(FC2ブログユーザー) 202 | 203 | このブログをマイリストに追加 204 | 205 | この記事へのコメント一覧 206 | 207 | コメントは新しいものから表示されます 208 | 209 | 編集・削除するのに必要 210 | 管理者だけにコメントを表示 211 | 212 | -------------------------------------------------------------------------------- /core/src/main/resources/warc.conf: -------------------------------------------------------------------------------- 1 | { 2 | "stages": [ 3 | // warc postprocess 4 | {"class": "SplitIntoParagraph"}, 5 | {"class": "FilterJapaneseBasedOnCharacter", "kanaRate": 0.05, "jpRate": 0.7}, 6 | {"class": "DeduplicateElement"}, // deduplicate per paragraph 7 | {"class": "RemoveShortDocument", "min": 5}, 8 | // chitra preprocess 9 | {"class": "SplitIntoSentence"}, 10 | {"class": "RemoveWikipediaCitation"}, 11 | {"class": "NormalizeCharacter", "keepWS": false}, 12 | {"class": "NormalizeWhitespace"}, 13 | {"class": "ConcatShortSentence", "concatThr": 2}, 14 | {"class": "RemoveEmail"}, 15 | {"class": "RemoveURL"}, 16 | {"class": "FilterBySentenceLength", "min":10, "max": 200}, 17 | {"class": "RemoveShortDocument", "min": 5}, 18 | {"class": "RemoveScriptDocument"}, 19 | {"class": "RemoveNGWordDocument", "path": "ng_words.txt"}, 20 | // remove template 21 | {"class": "DeduplicateRepeatingSentence", "minRepeat": 2}, 22 | {"class": "RemoveSubstring", 23 | "path": "template_sentences.txt", 24 | "delim": "\n\n", // template_sentences contains multi-sentence pattern. 25 | "matchSentence": true}, // match full sentence only. 
26 | {"class": "RemoveShortDocument", "min": 5}, 27 | ], 28 | "input": { 29 | "format": "parquet", 30 | "column": "document", 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /core/src/main/scala/com/worksap/nlp/uzushio/DocumentIO.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio 2 | 3 | import java.nio.file.{Path, Paths} 4 | import org.rogach.scallop.ScallopConf 5 | 6 | import org.apache.spark.sql.{SparkSession, DataFrame} 7 | import org.apache.spark.sql.functions.{expr, monotonically_increasing_id} 8 | 9 | object DocumentIO { 10 | val idxCol = "documentId" 11 | val docCol = "document" 12 | 13 | private class Conf(args: Seq[String]) extends ScallopConf(args) { 14 | // args `--input ./hoge.md ./*.txt` will be parsed like 15 | // List(./hoge.md, ./fuga.txt, ./piyo.txt) 16 | val input = opt[List[Path]](required = true) 17 | val output = opt[Path](default = Some(Paths.get("./out"))) 18 | verify() 19 | } 20 | 21 | def run(spark: SparkSession, conf: Conf): Unit = { 22 | val docs = loadRawDocuments(spark, conf.input()) 23 | val docWithIdx = addIndex(docs) 24 | saveIndexedDocuments(docWithIdx, conf.output()) 25 | } 26 | 27 | def main(args: Array[String]): Unit = { 28 | val conf = new Conf(args) 29 | val spark = SparkSession.builder().appName("DocumentIO").getOrCreate() 30 | 31 | try { run(spark, conf) } 32 | finally { spark.stop() } 33 | } 34 | 35 | def addIndex( 36 | dataframe: DataFrame, 37 | idxColName: String = idxCol 38 | ): DataFrame = { 39 | // add index column 40 | dataframe.withColumn(idxColName, monotonically_increasing_id) 41 | } 42 | 43 | def formatPathList(paths: Seq[Path]): Seq[Path] = { 44 | // align list to fix the order of file load (todo: check if necessary) 45 | paths.distinct.sorted 46 | } 47 | 48 | def saveRawDocuments( 49 | documents: DataFrame, 50 | output: Path, 51 | docCol: String = docCol, 52 | sep: String = "\n\n" 53 | ): Unit = { 54 | documents.select(docCol).write.option("lineSep", sep).text(output.toString) 55 | } 56 | 57 | def loadRawDocuments( 58 | spark: SparkSession, 59 | input: Seq[Path], 60 | sep: String = "\n\n" 61 | ): DataFrame = { 62 | // load document data. 63 | // 64 | // Assumes each input file contains multiple documents, 65 | // and they are separated by `sep` (by default two empty lines). 
66 | val paths = formatPathList(input).map(_.toString) 67 | spark.read.option("lineSep", sep).text(paths: _*).filter(r => r.getAs[String](0).trim != "") 68 | .select(expr(s"value as $docCol")) 69 | } 70 | 71 | def saveIndexedDocuments( 72 | dataframe: DataFrame, 73 | output: Path, 74 | idxColName: String = idxCol, 75 | docColName: String = docCol, 76 | format: String = "parquet" 77 | ): Unit = { 78 | val data = dataframe.select( 79 | expr(s"$idxColName as $idxCol"), 80 | expr(s"$docColName as $docCol") 81 | ) 82 | 83 | data.write.format(format).save(output.toString) 84 | } 85 | 86 | def loadIndexedDocuments( 87 | spark: SparkSession, 88 | input: Seq[Path], 89 | format: String = "parquet" 90 | ): DataFrame = { 91 | // Assumes the schema of the files is the same as the `saveIndexedDocuments` output 92 | val paths = formatPathList(input).map(_.toString) 93 | spark.read.format(format).load(paths: _*) 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /core/src/main/scala/com/worksap/nlp/uzushio/Sudachi.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio 2 | 3 | import java.nio.file.Paths 4 | import com.worksap.nlp.sudachi.{DictionaryFactory, Tokenizer, Config} 5 | 6 | object Sudachi { 7 | def parseSplitMode(mode: String): Tokenizer.SplitMode = { 8 | // Parse sudachi SplitMode from a string. 9 | mode.capitalize match { 10 | case "A" => Tokenizer.SplitMode.A 11 | case "B" => Tokenizer.SplitMode.B 12 | case _ => Tokenizer.SplitMode.C 13 | } 14 | } 15 | 16 | def setupSudachiTokenizer(): Tokenizer = { 17 | // create a sudachi Tokenizer instance. 18 | // system_core.dic must be in cwd. 19 | // TODO: load config file 20 | val dictPath = Paths.get("system_core.dic") 21 | val conf = Config.defaultConfig().systemDictionary(dictPath) 22 | new DictionaryFactory().create(conf).create() 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /core/src/main/scala/com/worksap/nlp/uzushio/SudachiTokenizer.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio 2 | 3 | import collection.JavaConverters._ 4 | 5 | import org.apache.spark.sql.{SparkSession, DataFrame, Dataset, Row} 6 | import org.apache.spark.sql.types._ 7 | import org.apache.spark.ml.Transformer 8 | import org.apache.spark.ml.param.{Param, ParamMap} 9 | import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} 10 | import org.apache.spark.ml.util.Identifiable 11 | 12 | /** Tokenizer based on Sudachi. 13 | * 14 | * The input col should contain a document (a String consisting of "\n"-delimited sentences). 15 | * SudachiTokenizer runs sudachi on each sentence and returns the concatenated array of surfaces. 16 | */ 17 | class SudachiTokenizer(override val uid: String) 18 | extends Transformer 19 | with HasInputCol 20 | with HasOutputCol { 21 | def this() = this(Identifiable.randomUID("sudachiTokenizer")) 22 | 23 | override def copy(extra: ParamMap) = defaultCopy(extra) 24 | 25 | def outputDataType = new ArrayType(StringType, true) 26 | 27 | def setInputCol(value: String) = set(inputCol, value) 28 | def setOutputCol(value: String) = set(outputCol, value) 29 | 30 | // sudachi split mode. 
31 | val splitMode: Param[String] = new Param( 32 | this, 33 | "splitMode", 34 | "sudachi split mode (A/B/C)", 35 | (c: String) => { 36 | c.length == 1 && "aAbBcC".contains(c) 37 | } 38 | ) 39 | def setSplitMode(value: String): this.type = set(splitMode, value) 40 | def getSplitMode: String = $(splitMode) 41 | 42 | setDefault(splitMode -> "C") 43 | 44 | override def transformSchema(schema: StructType): StructType = { 45 | val inputType = schema($(inputCol)).dataType 46 | require( 47 | inputType == StringType, 48 | s"Input type must be ${StringType.catalogString} type but got ${inputType.catalogString}." 49 | ) 50 | 51 | if (schema.fieldNames.contains($(outputCol))) { 52 | throw new IllegalArgumentException( 53 | s"Output column ${$(outputCol)} already exists." 54 | ) 55 | } 56 | val outputFields = schema.fields :+ 57 | StructField($(outputCol), outputDataType, nullable = false) 58 | StructType(outputFields) 59 | } 60 | 61 | override def transform(dataset: Dataset[_]): DataFrame = { 62 | val outputSchema = transformSchema(dataset.schema) 63 | 64 | val mode = Sudachi.parseSplitMode($(splitMode)) 65 | val tokenized = dataset.toDF.rdd.mapPartitions(iter => { 66 | val tok = Sudachi.setupSudachiTokenizer() 67 | 68 | iter.map(row => { 69 | val tokens = row.getAs[String]($(inputCol)).split("\n") 70 | .flatMap(sent => tok.tokenize(mode, sent).asScala.map(_.surface())) 71 | 72 | Row(row.toSeq :+ tokens: _*) 73 | }) 74 | }) 75 | 76 | dataset.sparkSession.createDataFrame(tokenized, outputSchema) 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /core/src/main/scala/com/worksap/nlp/uzushio/TokenHasher.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio 2 | 3 | import collection.JavaConverters._ 4 | 5 | import org.apache.spark.sql.{SparkSession, DataFrame, Dataset, Row} 6 | import org.apache.spark.sql.types._ 7 | import org.apache.spark.ml.UnaryTransformer 8 | import org.apache.spark.ml.util.Identifiable 9 | import org.apache.spark.ml.linalg.{Vectors, VectorUDT} 10 | 11 | /**/ 12 | class TokenHasher(override val uid: String) 13 | extends UnaryTransformer[Seq[String], Seq[Long], TokenHasher] { 14 | def this() = this(Identifiable.randomUID("TokenHasher")) 15 | 16 | override protected def outputDataType: DataType = new ArrayType(LongType, false) 17 | 18 | override protected def createTransformFunc: Seq[String] => Seq[Long] = 19 | _.iterator.map(hashString).toSet.toSeq 20 | 21 | override protected def validateInputType(inputType: DataType): Unit = { 22 | require( 23 | inputType == ArrayType(StringType, true) || 24 | inputType == ArrayType(StringType, false), 25 | s"Input type must be ${ArrayType(StringType).catalogString} but got " + 26 | inputType.catalogString 27 | ) 28 | } 29 | 30 | def hashString(s: String): Long = { 31 | /* long version of scala String.hashCode */ 32 | s.foldLeft(0L) { case (code, c) => 31 * code + c } 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /core/src/main/scala/com/worksap/nlp/uzushio/cleaning/ConcatShortSentence.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.cleaning 2 | 3 | import com.typesafe.config.ConfigObject 4 | 5 | /** Concat too short sentences to the previous sentence. 
*/ 6 | class ConcatShortSentence(concatThr: Int = 2) 7 | extends DocumentNormalizer 8 | with FieldSettable[ConcatShortSentence] { 9 | override def normalizeDocument(doc: Seq[String]): Seq[String] = { 10 | if (doc.length <= 1) { 11 | doc 12 | } else { 13 | val shortSentIdx = doc.zipWithIndex.map(z => { 14 | if (z._1.length <= concatThr) z._2 else -1 15 | }).filter(_ > 0) // keep first sentence regardless of its length 16 | 17 | val appended = shortSentIdx.reverse.foldLeft(doc)((d, i) => d.updated(i - 1, d(i - 1) + d(i))) 18 | 19 | for (i <- 0 until appended.length if !shortSentIdx.contains(i)) yield appended(i) 20 | } 21 | } 22 | 23 | override def toString(): String = s"${this.getClass.getSimpleName}($concatThr)" 24 | } 25 | 26 | object ConcatShortSentence extends FromConfig { 27 | override def fromConfig(conf: ConfigObject): ConcatShortSentence = { 28 | val args = Map[String, Option[Any]]( 29 | "concatThr" -> conf.getAs[Int]("concatThr") 30 | ).collect { case (k, Some(v)) => k -> v } 31 | 32 | new ConcatShortSentence().setFields(args) 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /core/src/main/scala/com/worksap/nlp/uzushio/cleaning/DeduplicateElement.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.cleaning 2 | 3 | import com.typesafe.config.ConfigObject 4 | import org.apache.spark.sql.Dataset 5 | import org.apache.spark.sql.functions.monotonically_increasing_id 6 | 7 | /** Deduplicate elements of sequences, keeping seq order. */ 8 | class DeduplicateElement extends Transformer { 9 | override def transform(ds: Dataset[Seq[String]]): Dataset[Seq[String]] = { 10 | import ds.sparkSession.implicits._ 11 | 12 | // add indices: (doc_id, elem_id, txt) 13 | val indexed = ds.withColumn("did", monotonically_increasing_id) 14 | .flatMap(r => r.getSeq[String](0).zipWithIndex.map(z => (r.getLong(1), z._2, z._1))) 15 | // drop duplicate paragraphs 16 | val dedup = indexed.dropDuplicates("_3") 17 | // reconstruct documents 18 | dedup.groupByKey(_._1).mapGroups((k, itr) => itr.toSeq.sortBy(_._2).map(_._3)) 19 | } 20 | } 21 | 22 | object DeduplicateElement extends FromConfig { 23 | override def fromConfig(conf: ConfigObject): DeduplicateElement = new DeduplicateElement 24 | } 25 | -------------------------------------------------------------------------------- /core/src/main/scala/com/worksap/nlp/uzushio/cleaning/DeduplicateRepeatingSentence.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.cleaning 2 | 3 | import com.typesafe.config.ConfigObject 4 | 5 | /** Deduplicate same sentences repeating many times. 
6 | */ 7 | class DeduplicateRepeatingSentence(minRep: Int = 2) 8 | extends DocumentNormalizer 9 | with FieldSettable[DeduplicateRepeatingSentence] { 10 | override def normalizeDocument(doc: Seq[String]): Seq[String] = { 11 | var (i, j) = (0, 0) 12 | var indices: Seq[Int] = Vector() 13 | while (i < doc.length) { 14 | j = i + 1 15 | while ((j < doc.length) && (doc(i) == doc(j))) { j += 1 } 16 | 17 | if (i + minRep <= j) { indices :+= i } 18 | else { indices ++= i until j } 19 | i = j 20 | } 21 | for (i <- indices) yield doc(i) 22 | } 23 | 24 | override def toString(): String = s"${this.getClass.getSimpleName}($minRep)" 25 | } 26 | 27 | object DeduplicateRepeatingSentence extends FromConfig { 28 | override def fromConfig(conf: ConfigObject): DeduplicateRepeatingSentence = { 29 | val args = Map[String, Option[Any]]( 30 | "minRep" -> conf.getAs[Int]("minRepeat") 31 | ).collect { case (k, Some(v)) => k -> v } 32 | 33 | new DeduplicateRepeatingSentence().setFields(args) 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /core/src/main/scala/com/worksap/nlp/uzushio/cleaning/FieldSettable.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.cleaning 2 | 3 | /** Set class field via setField method. 4 | * 5 | * T must be the type of the class this is implemented with, e.g. `class MyClass extends 6 | * FieldSettable[MyClass]`. 7 | */ 8 | trait FieldSettable[T] { 9 | def setFields(map: Map[String, Any]): T = { 10 | for ((k, v) <- map) setField(k, v) 11 | this.asInstanceOf[T] 12 | } 13 | 14 | def setField(key: String, value: Any): T = { 15 | this.getClass.getDeclaredFields.find(_.getName == key) match { 16 | case Some(field) => { 17 | field.setAccessible(true) 18 | field.set(this, value) 19 | } 20 | case None => throw new IllegalArgumentException(s"No field named $key") 21 | } 22 | this.asInstanceOf[T] 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /core/src/main/scala/com/worksap/nlp/uzushio/cleaning/Filter.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.cleaning 2 | 3 | import org.apache.spark.sql.Dataset 4 | 5 | /** Filters documents with specific condition. */ 6 | abstract class DocumentFilter extends Transformer { 7 | 8 | /** Determines if the document should be kept or not. */ 9 | def isFiltered(doc: Seq[String]): Boolean 10 | 11 | override def transform(ds: Dataset[Seq[String]]): Dataset[Seq[String]] = { 12 | ds.filter(isFiltered(_)) 13 | } 14 | } 15 | 16 | /** Filters sentences with specific condition. */ 17 | abstract class SentenceFilter extends Transformer { 18 | 19 | /** Determines if the sentence should be kept or not. */ 20 | def isFiltered(sent: String): Boolean 21 | 22 | override def transform(ds: Dataset[Seq[String]]): Dataset[Seq[String]] = { 23 | import ds.sparkSession.implicits._ 24 | ds.map(_.filter(isFiltered)).filter(_.length > 0) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /core/src/main/scala/com/worksap/nlp/uzushio/cleaning/FilterBySentenceLength.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.cleaning 2 | 3 | import com.typesafe.config.ConfigObject 4 | 5 | /** Filters sentences that are too short or too long. 6 | * 7 | * @constructor 8 | * create a new filter. 
9 | * @param min 10 | * the minimum number of characters a sentence should contain 11 | * @param max 12 | * the maximum number of characters a sentence should contain 13 | */ 14 | class FilterBySentenceLength(min: Int = 10, max: Int = 200) 15 | extends SentenceFilter 16 | with FieldSettable[FilterBySentenceLength] { 17 | override def isFiltered(sent: String): Boolean = { 18 | min <= sent.length && sent.length <= max 19 | } 20 | 21 | override def toString(): String = s"${this.getClass.getSimpleName}($min, $max)" 22 | } 23 | 24 | object FilterBySentenceLength extends FromConfig { 25 | override def fromConfig(conf: ConfigObject): FilterBySentenceLength = { 26 | val args = Map[String, Option[Any]]( 27 | "min" -> conf.getAs[Int]("min"), 28 | "max" -> conf.getAs[Int]("max") 29 | ).collect { case (k, Some(v)) => k -> v } 30 | 31 | new FilterBySentenceLength().setFields(args) 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /core/src/main/scala/com/worksap/nlp/uzushio/cleaning/FilterJapaneseBasedOnCharacter.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.cleaning 2 | 3 | import com.typesafe.config.ConfigObject 4 | 5 | /** Filters out non-Japanese documents based on the types of characters they contain. 6 | * 7 | * Default thresholds follow nwc-toolkit:text-filter. 8 | * 9 | * @param kanaRate 10 | * texts whose hiragana/katakana ratio is below this value are filtered. 11 | * @param jpRate 12 | * texts whose kana/kanji ratio is below this value are filtered. 13 | */ 14 | class FilterJapaneseBasedOnCharacter( 15 | kanaRate: Double = 0.05, 16 | jpRate: Double = 0.7 17 | ) extends SentenceFilter 18 | with FieldSettable[FilterJapaneseBasedOnCharacter] { 19 | val kanaPattern = """\p{InHiragana}|\p{InKatakana}""".r 20 | val jpCharPattern = """\p{InHiragana}|\p{InKatakana}|\p{InCJKUnifiedIdeographs}""".r 21 | 22 | override def isFiltered(sent: String): Boolean = { 23 | val kanaCount = kanaPattern.findAllIn(sent).length.toDouble 24 | val jpCount = jpCharPattern.findAllIn(sent).length.toDouble 25 | val charCount = sent.length.toDouble 26 | 27 | (kanaCount / charCount) > kanaRate && (jpCount / charCount) > jpRate 28 | } 29 | 30 | override def toString(): String = s"${this.getClass.getSimpleName}($kanaRate, $jpRate)" 31 | } 32 | 33 | object FilterJapaneseBasedOnCharacter extends FromConfig { 34 | override def fromConfig( 35 | conf: ConfigObject 36 | ): FilterJapaneseBasedOnCharacter = { 37 | val args = Map[String, Option[Any]]( 38 | "kanaRate" -> conf.getAs[Double]("kanaRate"), 39 | "jpRate" -> conf.getAs[Double]("jpRate") 40 | ).collect { case (k, Some(v)) => k -> v } 41 | 42 | new FilterJapaneseBasedOnCharacter().setFields(args) 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /core/src/main/scala/com/worksap/nlp/uzushio/cleaning/NormalizeCharacter.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.cleaning 2 | 3 | import com.typesafe.config.ConfigObject 4 | 5 | /** Removes non-printable characters. 6 | * 7 | * Following Python's str.isprintable, removes characters in the Unicode general categories "Other" 8 | * and "Separator", except for space. We also keep surrogate code points (which Python does not). 9 | * 10 | * @param keepWS 11 | * If true, keep whitespace characters other than space (" "), including \u3000. This is not 12 | * Python-compatible behaviour. 
13 | */ 14 | class NormalizeCharacter(keepWS: Boolean = NormalizeCharacter.defaultKeepWS) 15 | extends SentenceNormalizer { 16 | val nonPrintablePattern = 17 | if (keepWS) """[\p{gc=C}\p{gc=Z}&&[^\s \p{gc=Cs}]]""".r 18 | else """[\p{gc=C}\p{gc=Z}&&[^ \p{gc=Cs}]]""".r 19 | 20 | override def normalizeSentence(sent: String): String = { 21 | nonPrintablePattern.replaceAllIn(sent, "") 22 | } 23 | 24 | override def toString(): String = s"${this.getClass.getSimpleName}(keepWS=$keepWS)" 25 | } 26 | 27 | object NormalizeCharacter extends FromConfig { 28 | val defaultKeepWS = false 29 | 30 | override def fromConfig(conf: ConfigObject): NormalizeCharacter = { 31 | val keepWS = conf.getOrElseAs[Boolean]("keepWS", defaultKeepWS) 32 | new NormalizeCharacter(keepWS) 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /core/src/main/scala/com/worksap/nlp/uzushio/cleaning/NormalizeWhitespace.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.cleaning 2 | 3 | import com.typesafe.config.ConfigObject 4 | 5 | /** Removes excess whitespace. */ 6 | class NormalizeWhitespace extends SentenceNormalizer { 7 | val continuousWhitespacePattern = """[\s ]+""".r 8 | 9 | override def normalizeSentence(sent: String): String = { 10 | continuousWhitespacePattern.replaceAllIn(sent, " ") 11 | } 12 | } 13 | 14 | object NormalizeWhitespace extends FromConfig { 15 | override def fromConfig(conf: ConfigObject): NormalizeWhitespace = new NormalizeWhitespace 16 | } 17 | -------------------------------------------------------------------------------- /core/src/main/scala/com/worksap/nlp/uzushio/cleaning/Normalizer.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.cleaning 2 | 3 | import org.apache.spark.sql.Dataset 4 | 5 | /** Normalizes a document in a document-wise way. */ 6 | abstract class DocumentNormalizer extends Transformer { 7 | def normalizeDocument(doc: Seq[String]): Seq[String] 8 | 9 | override def transform(ds: Dataset[Seq[String]]): Dataset[Seq[String]] = { 10 | import ds.sparkSession.implicits._ 11 | ds.map(doc => normalizeDocument(doc)) 12 | } 13 | } 14 | 15 | /** Normalizes a document in a sentence-wise way. */ 16 | abstract class SentenceNormalizer extends DocumentNormalizer { 17 | def normalizeSentence(sent: String): String 18 | 19 | override def normalizeDocument(doc: Seq[String]): Seq[String] = { 20 | doc.map(sent => normalizeSentence(sent)) 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /core/src/main/scala/com/worksap/nlp/uzushio/cleaning/Pipeline.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.cleaning 2 | 3 | import collection.JavaConverters._ 4 | import java.nio.file.{Path, Paths, Files} 5 | import com.typesafe.config.{Config, ConfigFactory, ConfigObject} 6 | import org.apache.spark.sql.Dataset 7 | import java.nio.channels.Pipe 8 | 9 | /** Sequentially apply multiple transformers. 
10 | * 11 | * @param stages 12 | * list of transformers to apply 13 | */ 14 | class Pipeline(private var stages: Seq[Transformer] = Seq()) extends Transformer { 15 | 16 | def setStages(value: Seq[Transformer]): Pipeline = { 17 | stages = value 18 | this 19 | } 20 | 21 | override def transform(ds: Dataset[Seq[String]]): Dataset[Seq[String]] = { 22 | stages.foldLeft(ds)((ds, tr) => tr.transform(ds)) 23 | } 24 | 25 | override def toString(): String = { 26 | s"Pipeline($stages)" 27 | } 28 | } 29 | 30 | object Pipeline { 31 | def fromConfig(conf: Config): Pipeline = { 32 | val stageConfs = conf.getObjectList("stages").asScala.map(_.asInstanceOf[ConfigObject]) 33 | val stages = getStagesFromCompanion(stageConfs) 34 | new Pipeline(stages) 35 | } 36 | 37 | /** Instantiate stages based on the config. Use constructor. */ 38 | private def getStagesFromConstructor(confObjs: Seq[ConfigObject]) = { 39 | confObjs.map(co => { 40 | val name = co.get("class").unwrapped.asInstanceOf[String] 41 | getConstructorOf(name).newInstance(co) 42 | }) 43 | } 44 | 45 | /** Instantiate stages based on the config. Use companion object. */ 46 | private def getStagesFromCompanion(confObjs: Seq[ConfigObject]) = { 47 | confObjs.map(co => { 48 | val name = co.get("class").unwrapped.asInstanceOf[String] 49 | getCompanionOf(name).asInstanceOf[FromConfig].fromConfig(co) 50 | }) 51 | } 52 | 53 | /** Get a constructor of a class from the given name. */ 54 | private def getConstructorOf(name: String) = { 55 | val clz = Class.forName(withClassPrefix(name)) 56 | clz.getConstructor(Class.forName("com.typesafe.config.ConfigObject")) 57 | } 58 | 59 | /** Get a companion object of a class from the given name. */ 60 | private def getCompanionOf(name: String) = { 61 | val clz = Class.forName(withClassPrefix(name)) 62 | clz.getClassLoader.loadClass(clz.getName + "$").getField("MODULE$").get(null) 63 | } 64 | 65 | private val classname = this.getClass.getName() 66 | private val classPrefix = classname.take(classname.lastIndexOf(".")) 67 | 68 | /** Append class name prefix if not exists. 69 | * 70 | * Note: This assume each transformer classes belong to the same package to this class. 71 | */ 72 | private def withClassPrefix(name: String): String = { 73 | if (name.startsWith(classPrefix)) { name } 74 | else { 75 | s"$classPrefix.$name" 76 | } 77 | } 78 | 79 | } 80 | -------------------------------------------------------------------------------- /core/src/main/scala/com/worksap/nlp/uzushio/cleaning/RemoveEmail.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.cleaning 2 | 3 | import com.typesafe.config.ConfigObject 4 | 5 | /** Filters sentences that contain email address. 
*/ 6 | class RemoveEmail extends SentenceFilter { 7 | val emailPattern = """[\w\d_-]+@[\w\d_-]+\.[\w\d._-]+""".r 8 | 9 | override def isFiltered(sent: String): Boolean = { 10 | emailPattern.findFirstIn(sent).isEmpty 11 | } 12 | } 13 | 14 | object RemoveEmail extends FromConfig { 15 | override def fromConfig(conf: ConfigObject): RemoveEmail = new RemoveEmail 16 | } 17 | -------------------------------------------------------------------------------- /core/src/main/scala/com/worksap/nlp/uzushio/cleaning/RemoveNGWordDocument.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.cleaning 2 | 3 | import com.typesafe.config.ConfigObject 4 | import com.worksap.nlp.sudachi.Tokenizer 5 | import com.worksap.nlp.uzushio.Sudachi 6 | 7 | import collection.JavaConverters._ 8 | import java.nio.charset.StandardCharsets 9 | import java.nio.file.{Files, Path, Paths} 10 | import org.apache.spark.sql.Dataset 11 | 12 | import scala.io.Source 13 | 14 | /** Filters documents that contain one of the specified words. 15 | * 16 | * @constructor 17 | * create a new filter with ng-word list. 18 | * @param ngwords 19 | * the set of words which should not appear in the filtered documents 20 | */ 21 | class RemoveNGWordDocument(ngwords: Set[String]) extends Transformer { 22 | val ngwordPattern = s"""(${ngwords.mkString("|")})""".r 23 | val mode = Tokenizer.SplitMode.C 24 | 25 | def containsNgword(tok: Tokenizer, doc: Seq[String]): Boolean = { 26 | for (sent <- doc) { 27 | val matchIter = ngwordPattern.findAllMatchIn(sent) 28 | val (matches, forSize) = matchIter.duplicate 29 | 30 | if (forSize.size != 0) { 31 | try { 32 | val morphmes = tok.tokenize(sent).asScala 33 | val morphBegins = morphmes.map(_.begin()).toSet 34 | val morphEnds = morphmes.map(_.end()).toSet 35 | 36 | for (m <- matches) { 37 | if (morphBegins.contains(m.start) && morphEnds.contains(m.end)) { 38 | return true 39 | } 40 | } 41 | } catch { 42 | case err: Exception => println(s"$sent") 43 | } 44 | } 45 | } 46 | false 47 | } 48 | 49 | override def transform(ds: Dataset[Seq[String]]): Dataset[Seq[String]] = { 50 | import ds.sparkSession.implicits._ 51 | 52 | if (ngwords.size == 0) { ds } 53 | else { 54 | ds.mapPartitions(iter => { 55 | // setup sudachi tokenizer per partition 56 | val tok = Sudachi.setupSudachiTokenizer() 57 | iter.filter(doc => !containsNgword(tok, doc)) 58 | }) 59 | } 60 | } 61 | 62 | override def toString(): String = s"${this.getClass.getSimpleName}(#word=${ngwords.size})" 63 | } 64 | 65 | object RemoveNGWordDocument extends FromConfig { 66 | val defaultPath = "ng_words.txt" 67 | 68 | def fromFile(ngwordsFile: Path): RemoveNGWordDocument = { 69 | val fullstr = new String(Files.readAllBytes(ngwordsFile), StandardCharsets.UTF_8) 70 | new RemoveNGWordDocument( 71 | fullstr.split("\n").map(_.trim).filter(_.nonEmpty).toSet 72 | ) 73 | } 74 | 75 | override def fromConfig(conf: ConfigObject): RemoveNGWordDocument = { 76 | val pathStr = conf.getOrElseAs[String]("path", defaultPath) 77 | 78 | val filepath = Paths.get(pathStr) 79 | if (filepath.toFile.exists) { 80 | fromFile(filepath) 81 | } else { 82 | val fullstr = Source.fromResource(pathStr).mkString 83 | new RemoveNGWordDocument( 84 | fullstr.split("\n").map(_.trim).filter(_.nonEmpty).toSet 85 | ) 86 | } 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /core/src/main/scala/com/worksap/nlp/uzushio/cleaning/RemoveScriptDocument.scala: 
-------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.cleaning 2 | 3 | import com.typesafe.config.ConfigObject 4 | 5 | /** Filters documents that contain script. */ 6 | class RemoveScriptDocument extends DocumentFilter { 7 | val curlyBracketsPattern = """[\{|\}]""".r 8 | 9 | def isFilteredSent(sent: String): Boolean = { 10 | curlyBracketsPattern.findFirstIn(sent).isEmpty 11 | } 12 | 13 | override def isFiltered(doc: Seq[String]): Boolean = { 14 | doc.forall(sent => isFilteredSent(sent)) 15 | } 16 | } 17 | 18 | object RemoveScriptDocument extends FromConfig { 19 | override def fromConfig(conf: ConfigObject): RemoveScriptDocument = new RemoveScriptDocument 20 | } 21 | -------------------------------------------------------------------------------- /core/src/main/scala/com/worksap/nlp/uzushio/cleaning/RemoveShortDocument.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.cleaning 2 | 3 | import com.typesafe.config.ConfigObject 4 | 5 | /** Filters documents that are too short. 6 | * 7 | * @constructor 8 | * create a new filter. 9 | * @param min 10 | * the minimum number of sentences a document should contain 11 | */ 12 | class RemoveShortDocument(min: Int = 5) 13 | extends DocumentFilter 14 | with FieldSettable[RemoveShortDocument] { 15 | override def isFiltered(doc: Seq[String]): Boolean = { 16 | min <= doc.map(_.split("\n").length).reduce(_ + _) 17 | } 18 | 19 | override def toString(): String = s"${this.getClass.getSimpleName}($min)" 20 | } 21 | 22 | object RemoveShortDocument extends FromConfig { 23 | override def fromConfig(conf: ConfigObject): RemoveShortDocument = { 24 | val args = Map[String, Option[Any]]("min" -> conf.getAs[Int]("min")) 25 | .collect { case (k, Some(v)) => k -> v } 26 | 27 | new RemoveShortDocument().setFields(args) 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /core/src/main/scala/com/worksap/nlp/uzushio/cleaning/RemoveSubstring.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.cleaning 2 | 3 | import com.typesafe.config.ConfigObject 4 | import java.nio.charset.StandardCharsets 5 | import java.nio.file.{Path, Paths, Files} 6 | import scala.io.Source 7 | 8 | /** Removes given substrings from documents. 9 | * 10 | * @param matchSentence 11 | * If true, match string with only full sentence, i.e. substr have to start/end at newline. 12 | */ 13 | class RemoveSubstring( 14 | substrs: Set[String], 15 | matchSentence: Boolean = RemoveSubstring.defaultMatchSentence 16 | ) extends DocumentNormalizer { 17 | val substrPattern = matchSentence match { 18 | case false => { s"""(${substrs.mkString("|")})""".r } 19 | case true => { 20 | s"""(?m)(^${substrs.mkString("$|^")}$$)""".r 21 | } 22 | } 23 | 24 | override def normalizeDocument(doc: Seq[String]): Seq[String] = { 25 | val fullDoc = doc.mkString("\n") 26 | val removed = substrPattern.replaceAllIn(fullDoc, "") 27 | removed.split("\n").filter(_.length > 0).toSeq 28 | } 29 | 30 | override def toString(): String = s"${this.getClass.getSimpleName}(#substr=${substrs.size})" 31 | } 32 | 33 | object RemoveSubstring extends FromConfig { 34 | val defaultPath = "template_sentences.txt" 35 | val defaultDelim = "\n\n" // Delimiter of substrings in the file. 36 | val defaultMatchSentence = false // Whether if match only full sentence. 
37 | 38 | def fromFile( 39 | filePath: Path, 40 | delim: String = defaultDelim, 41 | matchSentence: Boolean = defaultMatchSentence 42 | ): RemoveSubstring = { 43 | val fullstr = new String(Files.readAllBytes(filePath), StandardCharsets.UTF_8) 44 | new RemoveSubstring( 45 | fullstr.split(delim).map(_.trim).filter(_.nonEmpty).toSet, 46 | matchSentence 47 | ) 48 | } 49 | 50 | override def fromConfig(conf: ConfigObject): RemoveSubstring = { 51 | val pathStr = conf.getOrElseAs[String]("path", defaultPath) 52 | val delim = conf.getOrElseAs[String]("delim", defaultDelim) 53 | val matchSentence = conf.getOrElseAs[Boolean]("matchSentence", defaultMatchSentence) 54 | 55 | val filepath = Paths.get(pathStr) 56 | if (filepath.toFile.exists) { 57 | fromFile(filepath, delim, matchSentence) 58 | } else { 59 | val fullstr = Source.fromResource(pathStr).mkString 60 | new RemoveSubstring( 61 | fullstr.split(delim).map(_.trim).filter(_.nonEmpty).toSet, 62 | matchSentence 63 | ) 64 | } 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /core/src/main/scala/com/worksap/nlp/uzushio/cleaning/RemoveURL.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.cleaning 2 | 3 | import com.typesafe.config.ConfigObject 4 | 5 | /** Filters sentences that contain URL. */ 6 | class RemoveURL extends SentenceFilter { 7 | val urlPattern = """(https?|sftp?)://[\w/:%#\$&\?\(\)~\.=\+\-]+""".r 8 | 9 | override def isFiltered(sent: String): Boolean = { 10 | urlPattern.findFirstIn(sent).isEmpty 11 | } 12 | } 13 | 14 | object RemoveURL extends FromConfig { 15 | override def fromConfig(conf: ConfigObject): RemoveURL = new RemoveURL 16 | } 17 | -------------------------------------------------------------------------------- /core/src/main/scala/com/worksap/nlp/uzushio/cleaning/RemoveWikipediaCitation.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.cleaning 2 | 3 | import com.typesafe.config.ConfigObject 4 | 5 | /** Removes citation markers (from Wikipedia). */ 6 | class RemoveWikipediaCitation extends SentenceNormalizer { 7 | val citationPattern = """\[\d+?\]|\[要.+?\]|\{\{+[^{}]+?\}\}+|\[(要出典|リンク切れ|.+?\?)\]""".r 8 | 9 | override def normalizeSentence(sent: String): String = { 10 | citationPattern.replaceAllIn(sent, "") 11 | } 12 | } 13 | 14 | object RemoveWikipediaCitation extends FromConfig { 15 | override def fromConfig(conf: ConfigObject): RemoveWikipediaCitation = new RemoveWikipediaCitation 16 | } 17 | -------------------------------------------------------------------------------- /core/src/main/scala/com/worksap/nlp/uzushio/cleaning/SplitElement.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.cleaning 2 | 3 | import com.typesafe.config.ConfigObject 4 | import org.apache.spark.sql.Dataset 5 | 6 | /** Split each elements of the document by given delimiter and flatten. 7 | * 8 | * Use this to split document/paragraph into paragraph/sentence. 9 | * 10 | * @param delim 11 | * the delimiter to split each elements. 
12 | */ 13 | class SplitElement(delim: String = "\n") extends Transformer with FieldSettable[SplitElement] { 14 | override def transform(ds: Dataset[Seq[String]]): Dataset[Seq[String]] = { 15 | import ds.sparkSession.implicits._ 16 | ds.map(_.flatMap(_.split(delim))) 17 | } 18 | } 19 | 20 | object SplitElement extends FromConfig { 21 | override def fromConfig(conf: ConfigObject): SplitElement = { 22 | val args = Map[String, Option[Any]]("delim" -> conf.getAs[String]("delim")) 23 | .collect { case (k, Some(v)) => k -> v } 24 | new SplitElement().setFields(args) 25 | } 26 | } 27 | 28 | /** Split element into sentences. */ 29 | class SplitIntoSentence extends SplitElement(delim = "\n") 30 | object SplitIntoSentence extends FromConfig { 31 | override def fromConfig(conf: ConfigObject): SplitIntoSentence = new SplitIntoSentence() 32 | } 33 | 34 | /** Split element into paragraphs. */ 35 | class SplitIntoParagraph extends SplitElement(delim = "\n\n") 36 | object SplitIntoParagraph extends FromConfig { 37 | override def fromConfig(conf: ConfigObject): SplitIntoParagraph = new SplitIntoParagraph() 38 | } 39 | -------------------------------------------------------------------------------- /core/src/main/scala/com/worksap/nlp/uzushio/cleaning/Transformer.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.cleaning 2 | 3 | import com.typesafe.config.ConfigObject 4 | import org.apache.spark.sql.Dataset 5 | 6 | /** Transforms given spark dataset. */ 7 | trait Transformer extends scala.Serializable { 8 | def transform(ds: Dataset[Seq[String]]): Dataset[Seq[String]] 9 | 10 | override def toString(): String = s"${this.getClass.getSimpleName}" 11 | } 12 | 13 | /** Trait to instanciate transformer based on config file. 14 | * 15 | * Every Transformers should have a companion object with this trait. 16 | */ 17 | trait FromConfig { 18 | def fromConfig(conf: ConfigObject): Transformer 19 | 20 | /** Wrapper class for easy config value access. */ 21 | implicit class ConfigObjectWrapper(val conf: ConfigObject) { 22 | def getAs[T](key: String): Option[T] = Option(conf.get(key)).map(_.unwrapped.asInstanceOf[T]) 23 | 24 | def getOrElseAs[T](key: String, default: T): T = conf.getAs[T](key).getOrElse(default) 25 | } 26 | } 27 | 28 | /** Transformer that does nothing. 
*/ 29 | class Identity extends Transformer { 30 | override def transform(ds: Dataset[Seq[String]]): Dataset[Seq[String]] = ds 31 | } 32 | 33 | object Identity extends FromConfig { 34 | override def fromConfig(conf: ConfigObject): Identity = new Identity 35 | } 36 | -------------------------------------------------------------------------------- /core/src/main/scala/com/worksap/nlp/uzushio/main/DeduplicateParagraphs.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.main 2 | 3 | import com.worksap.nlp.uzushio.lib.runners.{DeduplicateParagraphs => DedupTask} 4 | import com.worksap.nlp.uzushio.lib.utils.Resources.AutoClosableResource 5 | import org.apache.spark.sql.SparkSession 6 | 7 | object DeduplicateParagraphs { 8 | def main(args: Array[String]): Unit = { 9 | val argObj = new DedupTask.ArgParser(args).toArgs 10 | SparkSession.builder().getOrCreate().use { spark => 11 | new DedupTask(argObj, spark).process() 12 | } 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /core/src/main/scala/com/worksap/nlp/uzushio/main/ExtractTextFromWarc.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.main 2 | 3 | import com.worksap.nlp.uzushio.lib.runners.{ExtractParagraphsFromWARC, WarcTextExtractionRaw} 4 | import com.worksap.nlp.uzushio.lib.utils.Resources.AutoClosableResource 5 | import org.apache.spark.sql.SparkSession 6 | 7 | /** Extracts text from WARC files. 8 | * 9 | * @see 10 | * [[WarcTextExtractionRaw.ConfigParser]] 11 | */ 12 | object ExtractTextFromWarc { 13 | def main(args: Array[String]): Unit = { 14 | val cfg = new WarcTextExtractionRaw.ConfigParser(args).asArgs() 15 | SparkSession.builder().appName(getClass.getSimpleName).getOrCreate().use { spark => 16 | ExtractParagraphsFromWARC.run(cfg)(spark) 17 | } 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /legacy/list_common_substr.py: -------------------------------------------------------------------------------- 1 | import argparse as ap 2 | from pathlib import Path 3 | from collections import Counter 4 | import itertools as it 5 | 6 | import numpy as np 7 | import suffixarray 8 | 9 | doc_delim = "\n\n" 10 | 11 | 12 | def main(): 13 | args = parse_args() 14 | validate_args(args) 15 | 16 | data = "" 17 | for p in args.input: 18 | with p.open() as fin: 19 | data += fin.read() 20 | 21 | n_doc = len(data.split(doc_delim)) 22 | min_len = args.min_len 23 | min_freq = n_doc * args.min_freq if args.min_freq < 1 else args.min_freq 24 | 25 | # use results for the reversed text to make result set closed 26 | # (do not want to handle every prefix/suffix of substrings) 27 | sa = suffixarray.SuffixArray(data) 28 | sa_rev = suffixarray.SuffixArray(data[::-1]) 29 | 30 | def key_func(s, i, l, c): 31 | # take if target substr has enough length and freq count 32 | # also check new-line to handle per sentence 33 | return l >= min_len and c >= min_freq and s[i] == '\n' and s[i+l-1] == '\n' 34 | 35 | ss_cnt = {sa.str[x:x+l]: c 36 | for x, l, c in sa.iter_repeated_substrings(key=key_func)} 37 | rev_cnt = {sa_rev.str[x+l-1:x-1:-1]: c 38 | for x, l, c in sa_rev.iter_repeated_substrings(key=key_func)} 39 | common_ss = set(ss_cnt.keys()) & set(rev_cnt.keys()) 40 | 41 | with args.output.open("w") as fout: 42 | for ss in common_ss: 43 | fout.write(f"{ss_cnt[ss]}\n{ss}{doc_delim}") 44 | 45 | return 46 | 47 | 48 | def 
parse_args(): 49 | parser = ap.ArgumentParser() 50 | parser.add_argument(dest="input", type=str, nargs="+", 51 | help="Input text file.") 52 | 53 | parser.add_argument("--min-len", type=int, 54 | default=10, help="minimum length") 55 | parser.add_argument("--min-freq", default=10, help="minimum frequency") 56 | 57 | parser.add_argument("-o", "--output", dest="output", type=str, default="./ss.txt", 58 | help="File to output summary.") 59 | parser.add_argument("--overwrite", action="store_true", 60 | help="Overwrite output files when they already exist.") 61 | 62 | args = parser.parse_args() 63 | args.input = [Path(s) for s in args.input] 64 | args.output = Path(args.output) 65 | args.min_freq = float(args.min_freq) 66 | return args 67 | 68 | 69 | def validate_args(args): 70 | if not args.overwrite: 71 | if args.output.exists(): 72 | raise ValueError( 73 | f"File {args.output} already exists. Set --overwrite to continue anyway.") 74 | return 75 | 76 | 77 | if __name__ == "__main__": 78 | main() 79 | -------------------------------------------------------------------------------- /legacy/src/main/scala/com/worksap/nlp/uzushio/warc/HttpResponseParser.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.warc 2 | 3 | import java.io.ByteArrayInputStream 4 | import java.io.InputStream 5 | import java.io.SequenceInputStream 6 | import java.io.Serializable 7 | import org.apache.commons.io.IOUtils 8 | import org.apache.hc.core5.http.impl.io.{ 9 | DefaultHttpResponseParser, 10 | SessionInputBufferImpl 11 | } 12 | import org.apache.hc.core5.http.io.SessionInputBuffer 13 | import org.apache.log4j.LogManager 14 | 15 | /** Http response parser for warc record. */ 16 | class HttpResponseParser(bufSize: Int = 128 * 1024) extends Serializable { 17 | @transient lazy val logger = LogManager.getLogger(this.getClass.getSimpleName) 18 | 19 | private val responseParser = new DefaultHttpResponseParser() 20 | private val siBuffer = new SessionInputBufferImpl(bufSize) 21 | private val byteBuffer = Array.ofDim[Byte](bufSize) 22 | 23 | /** Parses WarcRecord body as http response. 24 | * 25 | * Make sure that provided warc record has proper type. 26 | */ 27 | def parseWarcRecord(warc: WarcRecord) = { 28 | val is = new ByteArrayInputStream(warc.content) 29 | 30 | try { 31 | val resp = responseParser.parse(siBuffer, is) 32 | val body = readBody(siBuffer, is); 33 | new HttpResponseSerializable(resp, body) 34 | } catch { 35 | // TODO: data handling in the error cases 36 | case e: org.apache.hc.core5.http.HttpException => { 37 | logger.warn(s"error parsing http response: ${e}") 38 | new HttpResponseSerializable() 39 | } 40 | } finally { 41 | is.close() 42 | } 43 | } 44 | 45 | /** Read body bytes from buffers after headers are read. 
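  *
  * The session buffer may already hold body bytes that were read ahead while parsing the
  * headers; those buffered bytes are drained first (an empty stream is passed so only the
  * buffer is consumed) and then concatenated with the rest of the record content.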
*/ 46 | private def readBody(isBuf: SessionInputBuffer, rest: InputStream) = { 47 | val emptyIs = new java.io.ByteArrayInputStream(Array.emptyByteArray) 48 | val restBytes = isBuf.read(byteBuffer, emptyIs) 49 | 50 | IOUtils.toByteArray( 51 | new SequenceInputStream( 52 | new ByteArrayInputStream(byteBuffer.slice(0, restBytes)), 53 | rest 54 | ) 55 | ) 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /legacy/src/main/scala/com/worksap/nlp/uzushio/warc/HttpResponseSerializable.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.warc 2 | 3 | import java.io.Serializable 4 | import org.apache.hc.core5.http.ClassicHttpResponse 5 | import org.apache.hc.core5.http.message.BasicClassicHttpResponse 6 | 7 | /** Seritalizable wrapper of ClassicHttpResponse. */ 8 | class HttpResponseSerializable( 9 | resp: ClassicHttpResponse = new BasicClassicHttpResponse(600), 10 | val body: Array[Byte] = Array.empty[Byte] 11 | ) extends Serializable { 12 | 13 | /** Returns the value of the first header with the given name. 14 | * 15 | * @throws ProtocolException 16 | * in case multiple headers with the given name are found. 17 | */ 18 | def getHeader(name: String): Option[String] = { 19 | Option(resp.getHeader(name)).map(_.getValue) 20 | } 21 | 22 | def getFirstHeader(name: String): Option[String] = { 23 | Option(resp.getFirstHeader(name)).map(_.getValue) 24 | } 25 | 26 | def getLastHeader(name: String): Option[String] = { 27 | Option(resp.getLastHeader(name)).map(_.getValue) 28 | } 29 | 30 | def getHeaders(): Seq[(String, String)] = { 31 | resp.getHeaders().map(header => (header.getName(), header.getValue())) 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /legacy/src/main/scala/com/worksap/nlp/uzushio/warc/LongWritableSerializable.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.warc 2 | 3 | import org.apache.hadoop.io.LongWritable; 4 | import java.io.Serializable 5 | 6 | /* Serializable wrapper of Hadoop LongWritable class. 7 | * 8 | * ref: https://issues.apache.org/jira/browse/SPARK-2421 9 | */ 10 | class LongWritableSerializable extends LongWritable with Serializable 11 | -------------------------------------------------------------------------------- /legacy/src/main/scala/com/worksap/nlp/uzushio/warc/WarcFileReader.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.warc 2 | 3 | import collection.JavaConverters._ 4 | import java.io.{InputStream, FilterInputStream} 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.log4j.LogManager 8 | import org.archive.io.warc.WARCReaderFactory 9 | 10 | /** Reads {@link WarcRecord}s from a WARC file using Hadoop filesystem APIs. */ 11 | class WarcFileReader(conf: Configuration, filePath: Path) { 12 | @transient lazy val logger = LogManager.getLogger(this.getClass.getSimpleName) 13 | 14 | /** Opens a warc file and setup an iterator of records. */ 15 | private val fs = filePath.getFileSystem(conf) 16 | private val fileSize = fs.getFileStatus(filePath).getLen 17 | private val fsin = new CountingInputStream(fs.open(filePath)) 18 | private val reader = WARCReaderFactory.get(filePath.getName(), fsin, true) 19 | private val recordIter = reader.iterator.asScala 20 | 21 | /** Init counters to report progress. 
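  *
  * recordsRead is incremented in read(); bytesRead is updated by the CountingInputStream
  * wrapper defined at the bottom of this class.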
*/ 22 | private var recordsRead: Long = 0 23 | private var bytesRead: Long = 0 24 | 25 | /** Closes the file and reader. */ 26 | def close(): Unit = { 27 | reader.close() 28 | fsin.close() 29 | } 30 | 31 | /** Reads the next record from the iterator. 32 | * 33 | * @throws java.util.NoSuchElementException 34 | */ 35 | def read(): WarcRecord = { 36 | if (!recordIter.hasNext) { 37 | throw new java.util.NoSuchElementException() 38 | } 39 | 40 | try { 41 | val record = new WarcRecord(recordIter.next()) 42 | recordsRead += 1 43 | return record 44 | } catch { 45 | case e: java.io.EOFException => { 46 | logger.warn(s"error while iterating warc, try to skip: ${e}") 47 | return read() 48 | } 49 | } 50 | } 51 | 52 | /** Returns the number of records that have been read. */ 53 | def getRecordsRead: Long = { 54 | return recordsRead 55 | } 56 | 57 | /** Returns the number of bytes that have been read. */ 58 | def getBytesRead: Long = { 59 | return bytesRead 60 | } 61 | 62 | /** Returns the proportion of the file thet has been read. */ 63 | def getProgress: Float = { 64 | if (fileSize <= 0) return 1.0f 65 | return bytesRead.toFloat / fileSize.toFloat 66 | } 67 | 68 | /** InputStream that records the number of bytes read. */ 69 | private class CountingInputStream(in: InputStream) 70 | extends FilterInputStream(in) { 71 | override def read(): Int = { 72 | val result = in.read() 73 | if (result != -1) bytesRead += 1 74 | return result 75 | } 76 | 77 | override def read(b: Array[Byte], off: Int, len: Int): Int = { 78 | val result = in.read(b, off, len) 79 | if (result != -1) bytesRead += result 80 | return result 81 | } 82 | 83 | override def skip(n: Long): Long = { 84 | val result = in.skip(n) 85 | bytesRead += result 86 | return result 87 | } 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /legacy/src/main/scala/com/worksap/nlp/uzushio/warc/WarcInputFormat.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.warc 2 | 3 | import org.apache.hadoop.fs.Path; 4 | import org.apache.hadoop.mapreduce.InputSplit; 5 | import org.apache.hadoop.mapreduce.JobContext; 6 | import org.apache.hadoop.mapreduce.RecordReader; 7 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 8 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 9 | import org.apache.hadoop.mapreduce.lib.input.FileSplit; 10 | 11 | /* Hadoop InputFormat for WARC files. 12 | * 13 | * Key is 1-index LongWritable. Use get() method to take Long value. 14 | */ 15 | class WarcInputFormat 16 | extends FileInputFormat[LongWritableSerializable, WarcWritable] { 17 | 18 | /* Opens a WARC file (possibly compressed), and returns a RecordReader for accessing it. */ 19 | override def createRecordReader( 20 | split: InputSplit, 21 | context: TaskAttemptContext 22 | ) = { 23 | new WarcRecordReader() 24 | } 25 | 26 | override def isSplitable(context: JobContext, filename: Path): Boolean = { 27 | // we cannot (sanely) split warc files, due to its variable-length records. 28 | return false 29 | } 30 | } 31 | 32 | /* Wrapper class of {@link WarcFileReader} to implement RecordReader. 
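 *
 * The emitted key is the 1-based index of the record within the file; the value wraps the
 * parsed WarcRecord.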
*/ 33 | class WarcRecordReader 34 | extends RecordReader[LongWritableSerializable, WarcWritable] { 35 | private val key = new LongWritableSerializable(); 36 | private val value = new WarcWritable(); 37 | 38 | private var reader: WarcFileReader = null 39 | 40 | override def initialize( 41 | split: InputSplit, 42 | context: TaskAttemptContext 43 | ): Unit = { 44 | reader = new WarcFileReader( 45 | context.getConfiguration(), 46 | split.asInstanceOf[FileSplit].getPath 47 | ); 48 | } 49 | 50 | override def nextKeyValue(): Boolean = { 51 | try { 52 | val record = reader.read(); 53 | key.set(reader.getRecordsRead); 54 | value.setRecord(record); 55 | } catch { 56 | case e: java.util.NoSuchElementException => { return false } 57 | } 58 | 59 | return true; 60 | } 61 | 62 | override def getCurrentKey(): LongWritableSerializable = { 63 | return key; 64 | } 65 | 66 | override def getCurrentValue(): WarcWritable = { 67 | return value; 68 | } 69 | 70 | override def getProgress(): Float = { 71 | return reader.getProgress 72 | } 73 | 74 | override def close(): Unit = { 75 | reader.close(); 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /legacy/src/main/scala/com/worksap/nlp/uzushio/warc/WarcLoader.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.warc 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.rdd.RDD 5 | 6 | object WarcLoader { 7 | /* Load WARC file as RDD. */ 8 | def readFrom( 9 | spark: SparkSession, 10 | name: String 11 | ): RDD[WarcRecord] = { 12 | spark.sparkContext 13 | .newAPIHadoopFile( 14 | name, 15 | classOf[WarcInputFormat], 16 | classOf[LongWritableSerializable], 17 | classOf[WarcWritable] 18 | ) 19 | .map { case (k, v) => v.getRecord() } 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /legacy/src/main/scala/com/worksap/nlp/uzushio/warc/WarcRecord.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.warc 2 | 3 | import collection.JavaConverters._ 4 | import java.io.Serializable 5 | import org.apache.commons.io.IOUtils 6 | import org.archive.io.ArchiveRecord 7 | 8 | /* Serializable wrapper of ArchiveRecord, with its contents loaded. */ 9 | class WarcRecord(record: ArchiveRecord) extends Serializable { 10 | val headers: Map[String, String] = record 11 | .getHeader() 12 | .getHeaderFields() 13 | .asScala 14 | .map { case (k, v) => (k, v.toString) } 15 | .toMap 16 | 17 | /* read contents to safely step iterator forward. 
*/ 18 | val content = IOUtils.toByteArray(record, record.available()); 19 | 20 | def isResponse(): Boolean = { 21 | // ref https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.0 22 | val warcType = headers.getOrElse("WARC-Type", "") 23 | return warcType == "response" 24 | } 25 | 26 | def isTruncated(): Boolean = { 27 | val truncated = headers.get("WARC-Truncated") 28 | return truncated.nonEmpty 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /legacy/src/main/scala/com/worksap/nlp/uzushio/warc/WarcWritable.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.warc 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.Serializable 6 | import org.apache.hadoop.io.Writable; 7 | 8 | /* A mutable wrapper around a {@link WarcRecord} implementing the Hadoop 9 | * Writable and Serializable (for Spark) interfaces. 10 | */ 11 | class WarcWritable(var record: WarcRecord = null) 12 | extends Writable 13 | with Serializable { 14 | 15 | /* Returns the record currently wrapped by this writable. */ 16 | def getRecord(): WarcRecord = { 17 | return record; 18 | } 19 | 20 | /* Updates the record held within this writable wrapper. */ 21 | def setRecord(newRecord: WarcRecord): Unit = { 22 | record = newRecord; 23 | } 24 | 25 | /* Appends the current record to a {@link DataOutput} stream. */ 26 | override def write(out: DataOutput): Unit = { 27 | // TODO: impl (not neccessary for current use case) 28 | // if (record != null) record.write(out); 29 | } 30 | 31 | /* Parses a {@link WarcRecord} out of a {@link DataInput} stream, and make it 32 | * the current record. 33 | */ 34 | override def readFields(in: DataInput): Unit = { 35 | // TODO: impl (not neccessary for current use case) 36 | // record = new WarcRecord(in); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /lib/src/main/resources/com/worksap/nlp/uzushio/lib/filters/hojichar/README.md: -------------------------------------------------------------------------------- 1 | These lists are imported from [HojiChar](https://github.com/HojiChar/HojiChar/tree/main/hojichar/dict). 2 | Lists contain offensive words and used for filtering. 
3 | 4 | -------------------------------------------------------------------------------- /lib/src/main/resources/com/worksap/nlp/uzushio/lib/filters/hojichar/discriminations_keywords_ja.txt: -------------------------------------------------------------------------------- 1 | アイヌ系 2 | アカ 3 | アメ公 4 | アル中 5 | イカサマ 6 | イタ公 7 | イモ 8 | インチキ 9 | インディアン嘘つかない 10 | エスキモー 11 | エチゼンクラゲ 12 | オカマ 13 | オールドミス 14 | カッペ 15 | ガキ 16 | ガサ 17 | キ●ガイ 18 | キチ 19 | キチガ● 20 | キチガイ 21 | キ印 22 | ゲンナマ 23 | コロシ 24 | ゴミ屋 25 | サツ 26 | サラ金 27 | ザギン 28 | シマ 29 | ジプシー 30 | ジャップ 31 | ジャリ 32 | スケ 33 | スチュワーデス 34 | スラム 35 | ズージャー 36 | タケノコ医者 37 | ダッチマン 38 | チビ 39 | チャリンコ 40 | チャンコロ 41 | チョン 42 | デカ 43 | トルコ嬢 44 | トルコ風呂 45 | ドヤ街 46 | ナオン 47 | ニガー 48 | ニグロ 49 | ニコヨン 50 | ノビ 51 | バタ屋 52 | パクる 53 | パン助 54 | パーマ屋 55 | ヒモ 56 | ブス 57 | ブタ箱 58 | ブツ 59 | ブラインドタッチ 60 | ポコペン 61 | ポリ公 62 | マンコ 63 | ヤンキー 64 | ヤー様 65 | ヨツ 66 | ルンペン 67 | レントゲン技師 68 | ロンパリ 69 | 丁稚 70 | 三つ口 71 | 三助 72 | 三国人 73 | 三韓征伐 74 | 上方の贅六 75 | 下女 76 | 下男 77 | 不具 78 | 不可触民 79 | 不治の病 80 | 中共 81 | 乞食 82 | 二号 83 | 人夫 84 | 人足 85 | 人非人 86 | 他力本願 87 | 代書屋 88 | 令嬢 89 | 伊勢乞食 90 | 低脳 91 | 低脳児 92 | 低開発国 93 | 保母 94 | 保線工夫 95 | 借り腹 96 | 健全なる精神は健全なる身体に宿る 97 | 傴僂 98 | 八百屋 99 | 共稼ぎ 100 | 処女作 101 | 処女峰 102 | 出戻り 103 | 出稼ぎ 104 | 助産婦 105 | 労務者 106 | 北鮮 107 | 千摺り 108 | ナオン 109 | ヒモ 110 | オールドミス 111 | 女子供 112 | 狂女 113 | 下女 114 | 下男 115 | 女給 116 | 女傑 117 | 女工 118 | 処女作 119 | 処女峰 120 | 女中 121 | #スケ 122 | 端女 123 | 醜男 124 | 阿婆擦れ 125 | 男のくせに 126 | 女のくせに 127 | 男らしく 128 | 女らしく 129 | 女々しい 130 | 女だてらに 131 | 男勝り 132 | 紅一点 133 | 女の腐ったような 134 | 女の腐ったの 135 | 売れ残り 136 | 出戻り 137 | めかけ 138 | 職場の花 139 | 二号さん 140 | フェミナチ 141 | あげまん 142 | さげまん 143 | あげちん 144 | さげちん 145 | まんこ 146 | ちんこ 147 | ビッチ 148 | 毒女 149 | 鬼女 150 | ババア 151 | -------------------------------------------------------------------------------- /lib/src/main/resources/com/worksap/nlp/uzushio/lib/filters/ng_words.txt: -------------------------------------------------------------------------------- 1 | fuck 2 | g スポット 3 | sm女王 4 | tenga 5 | あばずれ 6 | あぱずれ 7 | あほ 8 | うざ 9 | うんこ 10 | え〇 11 | えっち 12 | おしっこ 13 | おしりのあな 14 | おっぱい 15 | おもらし 16 | かたわ 17 | きちがい 18 | きめぇ 19 | きめえ 20 | くそ 21 | せんずり 22 | ち〇 23 | ちんぐり 24 | ちんこ 25 | つるぺた 26 | つんぼ 27 | ふたなり 28 | ぶさいく 29 | ぶす 30 | ま〇 31 | まんぐり 32 | まんこ 33 | めくら 34 | やりまん 35 | アスペ 36 | アスホール 37 | アナリングス 38 | アナル 39 | アヌス 40 | アバズレ 41 | アパズレ 42 | アホ 43 | イマラチオ 44 | イメクラ 45 | イラマチオ 46 | ウザ 47 | ウンコ 48 | エ〇 49 | エッチ 50 | エロ 51 | オカマ 52 | オッパイ 53 | オナ 54 | オナニー 55 | オフパコ 56 | オマンコ 57 | オルガズム 58 | オーガズム 59 | カス 60 | ガイジ 61 | キチガイ 62 | キモ 63 | クズ 64 | クソ 65 | クリトリス 66 | クンニ 67 | クンニリングス 68 | グループ・セックス 69 | グロ 70 | ゲイボーイ 71 | ゲイ・セックス 72 | ゲロ 73 | コカイン 74 | コキ 75 | コンドーム 76 | ザーメン 77 | シコ 78 | ショタ 79 | スカトロ 80 | スケベ 81 | ストリップ劇場 82 | スマタ 83 | セクロス 84 | セックス 85 | セフレ 86 | センズリ 87 | ダッチワイフ 88 | チ〇 89 | テレフォンセックス 90 | ディルド 91 | ディープ・スロート 92 | デブ 93 | デリヘル 94 | デートレイプ 95 | ドキュン 96 | ナマポ 97 | ニガー 98 | ヌい 99 | ヌく 100 | ヌけ 101 | ネオ・ナチ 102 | ハメ撮り 103 | パイズリ 104 | パイパン 105 | パンチラ 106 | パンティー 107 | ビッチ 108 | ピロートーク 109 | ファック 110 | フェラ 111 | フェラチオ 112 | ブサイク 113 | ブス 114 | プリンス アルバート ピアス 115 | ペッティング 116 | ペニス 117 | ペニスバンド 118 | ホモ 119 | ボンテージ 120 | ボールギャグ 121 | ポルノグラフィー 122 | マ〇 123 | マザー・ファッカー 124 | マスターベーション 125 | マラ 126 | マンコ 127 | ヤラせ 128 | ラブホ 129 | リスカ 130 | リストカット 131 | リョナ 132 | リンチ 133 | レイプ 134 | レズ 135 | 不細工 136 | 中出し 137 | 乱交 138 | 二穴 139 | 人妻 140 | 側位 141 | 児童性虐待 142 | 前戯 143 | 勃起する 144 | 合いの子 145 | 四十八手 146 | 売り専 147 | 売国 148 | 売女 149 | 売春婦 150 | 外人 151 | 夢精 152 | 大人のおもちゃ 153 | 大人のオモチャ 154 | 大人の玩具 155 | 大陰唇 156 | 射精 
157 | 尻軽 158 | 尿道プレイ 159 | 巨乳 160 | 巨根 161 | 強姦犯 162 | 後戯 163 | 後背位 164 | 手コキ 165 | 手マン 166 | 援交 167 | 援助交際 168 | 支那 169 | 新しいポルノ 170 | 正常位 171 | 殺し方 172 | 殺人方法 173 | 氏ね 174 | 氏んだ 175 | 氏んで 176 | 気違い 177 | 池沼 178 | 淫乱 179 | 潮吹き女 180 | 潮吹き男性 181 | 熟女 182 | 獣姦 183 | 玉なめ 184 | 玉舐め 185 | 男根 186 | 痴呆 187 | 穴兄弟 188 | 竿姉妹 189 | 筆おろし 190 | 精液 191 | 糞便 192 | 糞尿愛好症 193 | 素股 194 | 緊縛 195 | 老害 196 | 肉便器 197 | 自慰 198 | 裸の女性 199 | 貞操帯 200 | 賢者タイム 201 | 足フェチ 202 | 輪姦 203 | 近親相姦 204 | 阿呆 205 | 陰毛 206 | 電マ 207 | 顔射 208 | 顔面騎乗 209 | 騎上位 210 | 騎乗位 -------------------------------------------------------------------------------- /lib/src/main/resources/pipeline/all_duplicate_paragraphs.conf: -------------------------------------------------------------------------------- 1 | filters: [ 2 | {"class": "DuplicateParagraphs", "limit": 2} 3 | ] -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/cleaning/PathSegment.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.cleaning 2 | 3 | import scala.collection.mutable.ArrayBuffer 4 | 5 | case class PathSegment(tag: String, id: String, classes: Seq[String]) { 6 | override def toString: String = classes 7 | .mkString(tag + (if (classes.isEmpty) "" else "."), ".", if (id == null) "" else s"#$id") 8 | 9 | lazy val lowerClasses: Set[String] = classes.map(_.toLowerCase).toSet 10 | lazy val lowerId: String = if (id == null) null else id.toLowerCase 11 | } 12 | 13 | object PathSegment { 14 | 15 | final private val EMPTY_PATH: Seq[PathSegment] = ArrayBuffer.empty 16 | 17 | def parsePath(path: String): Seq[PathSegment] = { 18 | var start = 0 19 | val end = path.length 20 | val result = new ArrayBuffer[PathSegment]() 21 | while (start < end) { 22 | val separator = path.indexOf('>', start) 23 | if (separator == -1) { 24 | result += parse(path, start, end) 25 | start = end 26 | } else { 27 | result += parse(path, start, separator) 28 | start = separator + 1 29 | } 30 | } 31 | if (result.isEmpty) EMPTY_PATH else result 32 | } 33 | 34 | final private val EMPTY_CLASSES: Seq[String] = new ArrayBuffer[String]() 35 | def parse(raw: String, start: Int, end: Int): PathSegment = { 36 | var dotIdx = raw.indexOf('.', start) 37 | var hashIdx = raw.indexOf('#', start) 38 | if (dotIdx > end) { 39 | dotIdx = -1 40 | } 41 | if (hashIdx > end) { 42 | hashIdx = -1 43 | } 44 | 45 | if (dotIdx == -1 && hashIdx == -1) { 46 | return PathSegment(raw.substring(start, end), null, EMPTY_CLASSES) 47 | } 48 | 49 | var tagEndIdx = end 50 | 51 | val id = 52 | if (hashIdx == -1) null 53 | else { 54 | tagEndIdx = hashIdx 55 | raw.substring(hashIdx + 1, end) 56 | } 57 | 58 | val classes = 59 | if (dotIdx == -1) { 60 | EMPTY_CLASSES 61 | } else { 62 | val classesEndIdx = tagEndIdx 63 | tagEndIdx = dotIdx 64 | var classesIdx = dotIdx 65 | val classes = new ArrayBuffer[String]() 66 | while (classesIdx < classesEndIdx) { 67 | var nextClassIdx = raw.indexOf('.', classesIdx + 1) 68 | if (nextClassIdx > end) { 69 | nextClassIdx = -1 70 | } 71 | if (nextClassIdx > 0) { 72 | classes += raw.substring(classesIdx + 1, nextClassIdx) 73 | classesIdx = nextClassIdx 74 | } else { 75 | if (classesIdx != classesEndIdx) { 76 | classes += raw.substring(classesIdx + 1, classesEndIdx) 77 | } 78 | classesIdx = classesEndIdx 79 | } 80 | } 81 | classes 82 | } 83 | 84 | PathSegment( 85 | tag = raw.substring(start, tagEndIdx), 86 | id = id, 87 | classes = classes 88 | ) 89 | } 90 | 91 | 
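  // Worked example (hypothetical CSS-like path string):
  //   parse("div.content#main") yields PathSegment("div", "main", Seq("content"))
  //   parsePath("body>div.content#main") yields
  //     Seq(PathSegment("body", null, Seq()), PathSegment("div", "main", Seq("content")))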
def parse(raw: String): PathSegment = parse(raw, 0, raw.length) 92 | } 93 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/filters/AdjacentDuplicateParagraphs.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.filters 2 | 3 | import com.worksap.nlp.uzushio.lib.cleaning.{Document, Paragraph} 4 | import com.worksap.nlp.uzushio.lib.filters.base.DocFilter 5 | 6 | import scala.collection.mutable.ArrayBuffer 7 | 8 | /** This class is a hack put in place before the final bugfix 9 | */ 10 | class AdjacentDuplicateParagraphs extends DocFilter { 11 | 12 | private def compressParagraphs(paragraphs: Seq[Paragraph]): Seq[Paragraph] = { 13 | val result = new ArrayBuffer[Paragraph]() 14 | val iter = paragraphs.iterator 15 | if (!iter.hasNext) { 16 | return paragraphs 17 | } 18 | 19 | var prev = iter.next() 20 | while (iter.hasNext) { 21 | val next = iter.next() 22 | if (next.text != prev.text) { 23 | result += prev 24 | prev = next 25 | } 26 | } 27 | 28 | result += prev 29 | result 30 | } 31 | 32 | override def checkDocument(doc: Document): Document = { 33 | val newPars = compressParagraphs(doc.paragraphs) 34 | if (newPars.length == doc.paragraphs.length) { 35 | doc 36 | } else { 37 | doc.copy(paragraphs = newPars) 38 | } 39 | } 40 | 41 | override val toString = "AdjacentDuplicateParagraphs" 42 | } 43 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/filters/CompressionRate.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.filters 2 | 3 | import com.worksap.nlp.uzushio.lib.cleaning.Document 4 | import com.worksap.nlp.uzushio.lib.filters.CompressionRate.{INPUT_SIZE, OUTPUT_SIZE} 5 | import com.worksap.nlp.uzushio.lib.filters.base.HighLowDocFilter 6 | import net.jpountz.lz4.{LZ4Exception, LZ4Factory} 7 | 8 | import java.nio.charset.StandardCharsets 9 | import java.nio.{ByteBuffer, CharBuffer} 10 | 11 | /** Filter out documents which have too low or too high compression rate (using LZ4 algorithm) 12 | * 13 | * @param low 14 | * low compression rate threshold 15 | * @param high 16 | * high compression rate threshold 17 | */ 18 | class CompressionRate(override val low: Float, override val high: Float) extends HighLowDocFilter { 19 | @transient private lazy val lz4 = LZ4Factory.fastestInstance() 20 | @transient private lazy val utf8Buffer = ByteBuffer.allocateDirect(INPUT_SIZE) 21 | @transient private lazy val compressBuffer = ByteBuffer.allocateDirect(OUTPUT_SIZE) 22 | 23 | def encodeDocContent(doc: Document): ByteBuffer = { 24 | val enc = StandardCharsets.UTF_8.newEncoder() 25 | val buf = utf8Buffer 26 | buf.clear() 27 | val iter = doc.aliveParagraphs 28 | while (iter.hasNext) { 29 | val p = iter.next() 30 | val cbuf = CharBuffer.wrap(p.text) 31 | val res = enc.encode(cbuf, buf, true) 32 | if (res.isOverflow) { 33 | // Scala does not has nice break/continue :/ 34 | buf.flip() 35 | return buf 36 | } 37 | } 38 | buf.flip() 39 | buf 40 | } 41 | 42 | override def checkDocument(doc: Document): Document = { 43 | val ratio: Float = compressionRatio(doc) 44 | maybeFilter(doc, ratio) 45 | } 46 | 47 | def compressionRatio(doc: Document): Float = { 48 | val compressor = lz4.fastCompressor() 49 | val buf = encodeDocContent(doc) 50 | val uncompressedSize = buf.limit() 51 | val outBuf = compressBuffer 52 | 
outBuf.clear() 53 | val compressedSize = 54 | try { 55 | compressor.compress(buf, outBuf) 56 | outBuf.position() 57 | } catch { 58 | case _: LZ4Exception => OUTPUT_SIZE 59 | } 60 | val ratio = compressedSize.toFloat / uncompressedSize.toFloat 61 | ratio 62 | } 63 | } 64 | 65 | object CompressionRate { 66 | final val INPUT_SIZE = 1024 * 1024 67 | final val OUTPUT_SIZE = 1200 * 1024 68 | } 69 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/filters/DeduplicateDocuments.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.filters 2 | 3 | import com.worksap.nlp.uzushio.lib.stats.NgramHashExtractor 4 | import com.worksap.nlp.uzushio.lib.cleaning.Document 5 | import com.worksap.nlp.uzushio.lib.filters.base.DocFilter 6 | import com.worksap.nlp.uzushio.lib.utils.MathUtil 7 | import scala.math._ 8 | import scala.util.Random 9 | 10 | trait RandomGeneratorFromStringBase { 11 | def generateRandom(docId: String): Double 12 | } 13 | 14 | // An object in arguments of DocFilter on Spark needs to mixin Serializable. 15 | object RandomGeneratorFromString extends RandomGeneratorFromStringBase with Serializable { 16 | def generateRandom(docId: String): Double = { 17 | val seed = NgramHashExtractor.hashString(docId) 18 | MathUtil.asRandomDouble(seed) 19 | } 20 | } 21 | 22 | class GaussianRandomGeneratorFromString( 23 | val mu: Double = 0.3, 24 | val sd: Double = 0.1 25 | ) extends RandomGeneratorFromStringBase 26 | with Serializable { 27 | def generateRandom(docId: String): Double = { 28 | val seed = NgramHashExtractor.hashString(docId) 29 | val rng = new Random(seed) 30 | rng.nextGaussian() * mu + sd 31 | } 32 | } 33 | 34 | class DeduplicateDocuments( 35 | val baseNumFreq: Int = 10, 36 | val randomGenerator: RandomGeneratorFromStringBase = new GaussianRandomGeneratorFromString 37 | ) extends DocFilter { 38 | 39 | def computeNearDuplicateTextRatio(doc: Document): Float = { 40 | val iter = doc.aliveParagraphs 41 | 42 | var totalLengthWeightedNearFreq = 0.0 43 | var totalLength = 0.0 44 | 45 | while (iter.hasNext) { 46 | val paragraph = iter.next() 47 | val text = paragraph.text 48 | val textLength = text.length() 49 | val nearFreq = if (paragraph.nearFreq < baseNumFreq) paragraph.nearFreq else baseNumFreq 50 | val weight = log(nearFreq) / log(baseNumFreq) 51 | 52 | totalLength += textLength 53 | totalLengthWeightedNearFreq += (textLength * weight) 54 | } 55 | 56 | MathUtil.ratio(totalLengthWeightedNearFreq.toFloat, totalLength.toFloat) 57 | } 58 | 59 | def shouldRemoveDocument(doc: Document) = { 60 | val nearDuplicateTextRatio = computeNearDuplicateTextRatio(doc) 61 | val thresholdProb = randomGenerator.generateRandom(doc.docId) 62 | 63 | nearDuplicateTextRatio >= thresholdProb 64 | } 65 | 66 | override def checkDocument(doc: Document): Document = { 67 | doc.removeWhen(shouldRemoveDocument(doc), this) 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/filters/DeduplicateDocumentsPercentile.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.filters 2 | 3 | import com.worksap.nlp.uzushio.lib.cleaning.Document 4 | import com.worksap.nlp.uzushio.lib.filters.base.DocFilter 5 | import spire.math.QuickSelect 6 | 7 | class DeduplicateDocumentsPercentile(percentile: Float = 0.05f, expected: 
Double = 1.0) 8 | extends DocFilter { 9 | override def checkDocument(doc: Document): Document = { 10 | val freq = DeduplicateDocumentsPercentile.freqAtPercentile(doc, percentile) 11 | val probability = expected / freq 12 | doc.removeWhen(doc.randomDouble > probability, this) 13 | } 14 | 15 | override val toString = s"DedupDocsPercentile($percentile,$expected)" 16 | } 17 | 18 | object DeduplicateDocumentsPercentile { 19 | import spire.std.any.IntAlgebra 20 | 21 | def freqAtPercentile(doc: Document, percentile: Float): Int = { 22 | val counts = doc.aliveParagraphs.map(_.nearFreq).toArray 23 | if (counts.isEmpty) { 24 | return 0 25 | } 26 | val position = (counts.length * percentile).toInt 27 | QuickSelect.select(counts, position) 28 | counts(position) 29 | } 30 | 31 | } 32 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/filters/DocLength.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.filters 2 | 3 | import com.worksap.nlp.uzushio.lib.cleaning.Document 4 | import com.worksap.nlp.uzushio.lib.filters.base.HighLowDocIntFilter 5 | 6 | class DocLength( 7 | override val low: Int = 0, 8 | override val high: Int = Int.MaxValue 9 | ) extends HighLowDocIntFilter { 10 | override def checkDocument(doc: Document): Document = { 11 | val length = doc.aliveParagraphs.map(_.text.length).sum 12 | maybeFilter(doc, length) 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/filters/DuplicateDocumentsLengthWeighted.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.filters 2 | 3 | import com.worksap.nlp.uzushio.lib.cleaning.Document 4 | import com.worksap.nlp.uzushio.lib.filters.base.DocFilter 5 | import com.worksap.nlp.uzushio.lib.utils.MathUtil 6 | 7 | class DuplicateDocumentsLengthWeighted(expected: Double = 1.0) extends DocFilter { 8 | override def checkDocument(doc: Document): Document = { 9 | val weight = DuplicateDocumentsLengthWeighted.nearFreqWeight(doc) 10 | val prob = expected / weight 11 | doc.removeWhen(doc.randomDouble > prob, this) 12 | } 13 | 14 | override val toString = s"DuplicateDocumentsLengthWeighted($expected)" 15 | } 16 | 17 | object DuplicateDocumentsLengthWeighted { 18 | def nearFreqWeight(doc: Document): Double = { 19 | var nchars = 0L 20 | var weight = 0.0 21 | 22 | val iter = doc.aliveParagraphs 23 | while (iter.hasNext) { 24 | val par = iter.next() 25 | val len = par.text.length.toLong 26 | nchars += len 27 | weight += len * (Math.log10(par.nearFreq) + 1) 28 | } 29 | MathUtil.doubleRatio(weight, nchars) 30 | } 31 | 32 | } 33 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/filters/DuplicateParagraphs.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.filters 2 | 3 | import com.worksap.nlp.uzushio.lib.cleaning.Paragraph 4 | import com.worksap.nlp.uzushio.lib.filters.base.ParagraphFilter 5 | 6 | class DuplicateParagraphs(limit: Int = 2) extends ParagraphFilter { 7 | override def checkParagraph(p: Paragraph): Paragraph = { 8 | if (p.nearFreq >= limit) { 9 | p.copy(remove = this) 10 | } else p 11 | } 12 | 13 | override val toString = s"DuplicateParagraphs($limit)" 14 | } 15 | 
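A minimal usage sketch for the paragraph filter above; the Paragraph values are made up for illustration, and the bundled pipeline/all_duplicate_paragraphs.conf wires the same filter with limit = 2:

    import com.worksap.nlp.uzushio.lib.cleaning.Paragraph
    import com.worksap.nlp.uzushio.lib.filters.DuplicateParagraphs

    val filter = new DuplicateParagraphs(limit = 2)
    val rare = Paragraph(path = "body>p", text = "unique text", index = 0, exactFreq = 1, nearFreq = 1)
    val boiler = Paragraph(path = "body>p", text = "repeated text", index = 1, exactFreq = 9, nearFreq = 9)
    filter.checkParagraph(rare)   // returned unchanged: nearFreq < limit
    filter.checkParagraph(boiler) // returned as a copy marked with remove = filter, dropped downstream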
-------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/filters/HiraganaRatio.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.filters 2 | 3 | import com.worksap.nlp.uzushio.lib.cleaning.Document 4 | import com.worksap.nlp.uzushio.lib.filters.HiraganaRatio.isHiragana 5 | import com.worksap.nlp.uzushio.lib.filters.base.HighLowDocFilter 6 | import com.worksap.nlp.uzushio.lib.utils.MathUtil 7 | 8 | final class HiraganaRatio( 9 | override val low: Float = 0.0f, 10 | override val high: Float = 1.0f 11 | ) extends HighLowDocFilter { 12 | override def checkDocument(doc: Document): Document = { 13 | val ratio = computeHiraganaRatio(doc) 14 | maybeFilter(doc, ratio) 15 | } 16 | 17 | def computeHiraganaRatio(document: Document): Float = { 18 | var nchars = 0 19 | var nhiragana = 0 20 | val iter = document.aliveParagraphs 21 | while (iter.hasNext) { 22 | val par = iter.next() 23 | val text = par.text 24 | nchars += text.length 25 | nhiragana += countHiraganaChars(text) 26 | } 27 | MathUtil.ratio(nhiragana, nchars) 28 | } 29 | 30 | def countHiraganaChars(str: String): Int = { 31 | val len = str.length 32 | var idx = 0 33 | var count = 0 34 | while (idx < len) { 35 | val ch = str.charAt(idx) 36 | if (isHiragana(ch)) { 37 | count += 1 38 | } 39 | idx += 1 40 | } 41 | count 42 | } 43 | } 44 | 45 | object HiraganaRatio { 46 | def isHiragana(c: Char): Boolean = { 47 | c >= 0x3040 && c <= 0x309f 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/filters/KenLMParagraphPerplexity.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.filters 2 | 3 | import com.worksap.nlp.uzushio.lib.cleaning.{Document, Paragraph} 4 | import com.worksap.nlp.uzushio.lib.filters.base.DocFilter 5 | 6 | import scala.collection.mutable 7 | 8 | final case class ParagraphWithPerplexity(p: Paragraph, ppx: Float) { 9 | def isAlive: Boolean = p.isAlive 10 | 11 | def remove(x: AnyRef): ParagraphWithPerplexity = copy(p = p.copy(remove = x)) 12 | } 13 | 14 | class KenLMParagraphPerplexity( 15 | sudachi: String, 16 | kenlm: String, 17 | outliers: Float = 0.02f, 18 | count: Int = 3, 19 | threshold: Float = 1e6f 20 | ) extends DocFilter { 21 | private val lmScore = -Math.log10(threshold).toFloat 22 | 23 | @transient 24 | private lazy val processor = KenLMEvaluator.make(sudachi, kenlm, outliers) 25 | 26 | override def checkDocument(doc: Document): Document = { 27 | val proc = processor 28 | val paragraphs = doc.paragraphs 29 | .map(p => ParagraphWithPerplexity(p, proc.scoreParagraph(p).toFloat)).toBuffer 30 | 31 | val nchanged = markParagraphs(paragraphs) 32 | 33 | if (nchanged > 0) { 34 | doc.copy(paragraphs = paragraphs.map(_.p)) 35 | } else { 36 | doc 37 | } 38 | } 39 | 40 | def markParagraphs(paragraphs: mutable.Buffer[ParagraphWithPerplexity]): Int = { 41 | var nchanged = 0 42 | var idx = 0 43 | val len = paragraphs.length 44 | while (idx < len) { 45 | val p = paragraphs(idx) 46 | if (p.isAlive && (shouldRemoveBack(paragraphs, idx) || shouldRemoveFwd(paragraphs, idx, len))) { 47 | paragraphs(idx) = p.remove(this) 48 | nchanged += removePrev(paragraphs, idx) 49 | nchanged += 1 50 | } 51 | idx += 1 52 | } 53 | nchanged 54 | } 55 | 56 | def removePrev(paragraphs: mutable.Buffer[ParagraphWithPerplexity], offset: Int): Int = { 
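    // Walk back over at most `count` immediately preceding paragraphs and also mark the alive
    // ones whose score already falls at or below the lmScore cutoff; returns how many were marked.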
57 | var result = 0 58 | val end = math.max(offset - count, 0) 59 | var idx = offset - 1 60 | while (idx >= end) { 61 | val p = paragraphs(idx) 62 | if (p.isAlive && p.ppx <= lmScore) { 63 | paragraphs(idx) = p.remove(this) 64 | result += 1 65 | } 66 | 67 | idx -= 1 68 | } 69 | result 70 | } 71 | 72 | def shouldRemoveBack( 73 | paragraphs: mutable.Buffer[ParagraphWithPerplexity], 74 | offset: Int 75 | ): Boolean = { 76 | var idx = offset 77 | val end = math.max(offset - count + 1, 0) 78 | while (idx >= end) { 79 | val p = paragraphs(idx) 80 | if (p.ppx > lmScore) { 81 | return false 82 | } 83 | idx -= 1 84 | } 85 | true 86 | } 87 | 88 | def shouldRemoveFwd( 89 | paragraphs: mutable.Buffer[ParagraphWithPerplexity], 90 | offset: Int, 91 | length: Int 92 | ): Boolean = { 93 | var idx = offset 94 | val end = math.min(offset + count, length) 95 | while (idx < end) { 96 | val p = paragraphs(idx) 97 | if (p.ppx > lmScore) { 98 | return false 99 | } 100 | idx += 1 101 | } 102 | true 103 | } 104 | 105 | override val toString = s"KenLMPar($outliers,$count,$threshold)" 106 | } 107 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/filters/LargeFreqParagraphs.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.filters 2 | 3 | import com.worksap.nlp.uzushio.lib.cleaning.{Document, Paragraph} 4 | import com.worksap.nlp.uzushio.lib.filters.base.DocFilter 5 | 6 | import scala.collection.mutable 7 | 8 | class LargeFreqParagraphs(count: Int = 3, freq: Int = 100) extends DocFilter { 9 | override def checkDocument(doc: Document): Document = { 10 | doc.paragraphs match { 11 | case p: mutable.Buffer[Paragraph] => 12 | markParagraphs(p) 13 | val nmarked = markParagraphs(p) 14 | if (nmarked > 0) { 15 | doc.copy(paragraphs = p) 16 | } else { 17 | doc 18 | } 19 | case _ => 20 | val buf = doc.paragraphs.toBuffer 21 | val nmarked = markParagraphs(buf) 22 | if (nmarked > 0) { 23 | doc.copy(paragraphs = buf) 24 | } else { 25 | doc 26 | } 27 | } 28 | } 29 | 30 | def markParagraphs(paragraphs: mutable.Buffer[Paragraph]): Int = { 31 | var nchanged = 0 32 | var idx = 0 33 | val len = paragraphs.length 34 | while (idx < len) { 35 | val p = paragraphs(idx) 36 | if (p.isAlive && (shouldRemoveBack(paragraphs, idx) || shouldRemoveFwd(paragraphs, idx, len))) { 37 | paragraphs(idx) = p.copy(remove = this) 38 | nchanged += removePrev(paragraphs, idx) 39 | nchanged += 1 40 | } 41 | idx += 1 42 | } 43 | nchanged 44 | } 45 | 46 | def removePrev(paragraphs: mutable.Buffer[Paragraph], offset: Int): Int = { 47 | var result = 0 48 | val end = math.max(offset - count, 0) 49 | var idx = offset - 1 50 | while (idx >= end) { 51 | val p = paragraphs(idx) 52 | if (p.isAlive && p.nearFreq >= freq) { 53 | paragraphs(idx) = p.copy(remove = this) 54 | result += 1 55 | } 56 | 57 | idx -= 1 58 | } 59 | result 60 | } 61 | 62 | def shouldRemoveBack(paragraphs: mutable.Buffer[Paragraph], offset: Int): Boolean = { 63 | var idx = offset 64 | val end = math.max(offset - count + 1, 0) 65 | while (idx >= end) { 66 | val p = paragraphs(idx) 67 | if (p.nearFreq < freq) { 68 | return false 69 | } 70 | idx -= 1 71 | } 72 | true 73 | } 74 | 75 | def shouldRemoveFwd(paragraphs: mutable.Buffer[Paragraph], offset: Int, length: Int): Boolean = { 76 | var idx = offset 77 | val end = math.min(offset + count, length) 78 | while (idx < end) { 79 | val p = paragraphs(idx) 80 | if (p.nearFreq < freq) { 81 | return 
false 82 | } 83 | idx += 1 84 | } 85 | true 86 | } 87 | 88 | override val toString = s"LargeFreqParagraphs($count,$freq)" 89 | } 90 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/filters/LinkCharRatio.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.filters 2 | 3 | import com.worksap.nlp.uzushio.lib.cleaning.Document 4 | import com.worksap.nlp.uzushio.lib.filters.base.HighLowDocFilter 5 | import com.worksap.nlp.uzushio.lib.utils.{MathUtil, Paragraphs} 6 | 7 | class LinkCharRatio( 8 | override val low: Float = 0.0f, 9 | override val high: Float = 1.0f 10 | ) extends HighLowDocFilter { 11 | 12 | def calcLinkCharRatio(doc: Document): Float = { 13 | val iter = doc.aliveParagraphs 14 | var total = 0 15 | var inLink = 0 16 | while (iter.hasNext) { 17 | val par = iter.next() 18 | var i = 0 19 | val txt = par.text 20 | val len = txt.length 21 | var inside = 0 22 | while (i < len) { 23 | val ch = txt.charAt(i) 24 | if (ch == Paragraphs.HTML_LINK_START) { 25 | inside = 1 26 | } else if (ch == Paragraphs.HTML_LINK_END) { 27 | inside = 0 28 | } else { 29 | total += 1 30 | inLink += inside 31 | } 32 | i += 1 33 | } 34 | } 35 | MathUtil.ratio(inLink, total) 36 | } 37 | 38 | override def checkDocument(doc: Document): Document = { 39 | val ratio = calcLinkCharRatio(doc) 40 | maybeFilter(doc, ratio) 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/filters/MarkdownizeHeading.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.filters 2 | 3 | import com.worksap.nlp.uzushio.lib.cleaning.{Paragraph, PathSegment} 4 | import com.worksap.nlp.uzushio.lib.filters.base.ParagraphFilter 5 | 6 | class MarkdownizeHeading extends ParagraphFilter { 7 | final val acceptedTags = Seq("h1", "h2", "h3", "h4", "h5", "h6") 8 | final val mdHeadningSymbol = "#" 9 | 10 | def tagToMarkdownSymbol(tag: PathSegment): String = { 11 | val numHeading = acceptedTags.indexOf(tag.tag) + 1 12 | 13 | if (numHeading == 0) { 14 | throw new IllegalArgumentException(s"tag $tag is not heading") 15 | } 16 | 17 | mdHeadningSymbol * numHeading + " " 18 | } 19 | 20 | override def checkParagraph(p: Paragraph): Paragraph = { 21 | val tagWithCSS = p.firstMatchingTag(acceptedTags) 22 | tagWithCSS match { 23 | case Some(v) => p.copy(text = tagToMarkdownSymbol(v) + p.text) 24 | case None => p 25 | } 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/filters/MergeListTag.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.filters 2 | 3 | import com.worksap.nlp.uzushio.lib.filters.base.DocFilter 4 | import com.worksap.nlp.uzushio.lib.cleaning.{Document, Paragraph} 5 | 6 | import scala.collection.mutable.ArrayBuffer 7 | 8 | class MergeListTag extends DocFilter { 9 | final private val acceptedTags: Seq[String] = Array("li", "option") 10 | 11 | override def checkDocument(doc: Document): Document = { 12 | val iter = doc.paragraphs.iterator 13 | 14 | if (!iter.hasNext) { 15 | return doc 16 | } 17 | 18 | var paragraph = iter.next() 19 | var merged = false 20 | val result = new ArrayBuffer[Paragraph]() 21 | val textBuffer = new ArrayBuffer[String]() 22 | var 
exactFreq = paragraph.exactFreq 23 | var nearFreq = paragraph.nearFreq 24 | 25 | while (iter.hasNext) { 26 | val nextParagraph = iter.next() 27 | val isList = nextParagraph.containsTags(acceptedTags) 28 | if ( 29 | paragraph.isAlive && nextParagraph.isAlive && isList && paragraph.path == nextParagraph.path 30 | ) { 31 | merged = true 32 | textBuffer += paragraph.text 33 | exactFreq = math.min(exactFreq, nextParagraph.exactFreq) 34 | nearFreq = math.min(nearFreq, nextParagraph.nearFreq) 35 | } else { 36 | if (textBuffer.nonEmpty) { 37 | textBuffer += paragraph.text 38 | val mergedText = textBuffer.mkString("- ", "\n- ", "") 39 | result += Paragraph( 40 | path = paragraph.path, 41 | text = mergedText, 42 | index = result.size, 43 | exactFreq = exactFreq, 44 | nearFreq = nearFreq 45 | ) 46 | textBuffer.clear() 47 | } else { 48 | result += paragraph.copy(index = result.size) 49 | } 50 | 51 | exactFreq = nextParagraph.exactFreq 52 | nearFreq = nextParagraph.nearFreq 53 | } 54 | 55 | paragraph = nextParagraph 56 | } 57 | 58 | if (merged) { 59 | if (textBuffer.nonEmpty) { 60 | textBuffer += paragraph.text 61 | val mergedText = textBuffer.mkString("- ", "\n- ", "") 62 | result += Paragraph( 63 | path = paragraph.path, 64 | text = mergedText, 65 | index = result.size, 66 | exactFreq = exactFreq, 67 | nearFreq = nearFreq 68 | ) 69 | } else { 70 | result += paragraph.copy(index = result.size) 71 | } 72 | 73 | doc.copy(paragraphs = result) 74 | } else { 75 | doc 76 | } 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/filters/NoContentDOM.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.filters 2 | 3 | import com.worksap.nlp.uzushio.lib.cleaning.{Paragraph, PathSegment} 4 | import com.worksap.nlp.uzushio.lib.filters.base.ParagraphFilter 5 | 6 | class NoContentDOM extends ParagraphFilter { 7 | final private val filteringDomNames: Seq[String] = 8 | Array("header", "footer", "aside", "nav", "noscript", "form") 9 | 10 | final private val DOMCandidatesForFiliteringClassOrId = Array("div", "p", "ul", "h1") 11 | 12 | final private val filteringFullMatchClassOrIdCandidates: Seq[String] = Array( 13 | "left-box", 14 | "blog-title-inner", 15 | "blogtitle", 16 | "blog-name", 17 | "head-block1", 18 | "head-blog-name", 19 | "head-introduction", 20 | ) 21 | 22 | final private val filteringPartialMatchClassOrIdNames: Seq[String] = Array( 23 | "header", 24 | "footer", 25 | "side", 26 | "menu", 27 | "nav", 28 | "banner", 29 | "logo", 30 | "pankuzu", 31 | "breadcrumb", 32 | "widget", 33 | "button", 34 | ) 35 | 36 | final private val filteringFullMatchClassOrIdNames = 37 | filteringPartialMatchClassOrIdNames ++ filteringFullMatchClassOrIdCandidates ++ filteringFullMatchClassOrIdCandidates 38 | .map(toCamelCase) 39 | 40 | def toCamelCase(s: String): String = { 41 | val words = s.split("[_-]") 42 | words.head + words.tail.map(_.capitalize).mkString 43 | } 44 | 45 | def partialMatchIds(css: PathSegment): Boolean = { 46 | if (css.id == null) { 47 | return false 48 | } 49 | 50 | filteringPartialMatchClassOrIdNames.exists(name => css.lowerId.contains(name)) 51 | } 52 | 53 | def partialMatchClasses(css: PathSegment): Boolean = { 54 | filteringPartialMatchClassOrIdNames.exists(name => css.lowerClasses.exists(_.contains(name))) 55 | } 56 | 57 | def containsTagWithIdAndClasses( 58 | p: Paragraph, 59 | tagNames: Seq[String], 60 | fullMatchCandidates: 
Seq[String], 61 | partialMatchCandidates: Seq[String] 62 | ): Boolean = { 63 | val iter = p.cssPath.reverseIterator 64 | 65 | while (iter.hasNext) { 66 | val css = iter.next() 67 | 68 | if ( 69 | tagNames.contains(css.tag) 70 | && fullMatchCandidates.exists(name => css.id == name || css.classes.contains(name)) 71 | ) { 72 | return true 73 | } 74 | 75 | if ( 76 | tagNames.contains(css.tag) 77 | && partialMatchCandidates.exists(name => partialMatchIds(css) || partialMatchClasses(css)) 78 | ) { 79 | return true 80 | } 81 | } 82 | false 83 | } 84 | 85 | override def checkParagraph(p: Paragraph): Paragraph = { 86 | if ( 87 | p.containsTags(filteringDomNames) || containsTagWithIdAndClasses( 88 | p, 89 | DOMCandidatesForFiliteringClassOrId, 90 | filteringFullMatchClassOrIdNames, 91 | filteringPartialMatchClassOrIdNames 92 | ) 93 | ) { 94 | p.copy(remove = this) 95 | } else { 96 | p 97 | } 98 | } 99 | 100 | override def toString: String = "Nav" 101 | } 102 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/filters/WordInstances.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.filters 2 | 3 | import com.worksap.nlp.uzushio.lib.cleaning.{Document, Paragraph} 4 | import com.worksap.nlp.uzushio.lib.filters.base.DocFilter 5 | import com.worksap.nlp.uzushio.lib.utils.TrieNode 6 | 7 | import java.io.{BufferedReader, InputStreamReader} 8 | import java.net.URL 9 | import java.nio.charset.StandardCharsets 10 | import java.nio.file.{Files, Paths} 11 | 12 | /** Score documents using a word list and filter them if the score is more than the [[threshold]]. 13 | * 14 | * Word lists are read from 15 | * - Filesystem 16 | * - com.worksap.nlp.uzushio.lib.filters package in classpath 17 | * - root package in classpath 18 | * 19 | * @param list 20 | * word list will be read from this resource 21 | * @param threshold 22 | * documents with score larger than this value will be filtered out 23 | * @param full 24 | * score for a full match 25 | * @param partial 26 | * score for a partial match 27 | */ 28 | class WordInstances(list: String, threshold: Float = 3, full: Float = 1.0f, partial: Float = 0.1f) 29 | extends DocFilter { 30 | private val trie = WordInstances.readToTrie(list) 31 | override def checkDocument(doc: Document): Document = { 32 | val score = scoreDocument(doc) + 1e-3f 33 | doc.removeWhen(score >= threshold, this) 34 | } 35 | 36 | def scoreDocument(document: Document): Float = { 37 | var score = 0.0f 38 | val iter = document.aliveParagraphs 39 | while (iter.hasNext) { 40 | score += scoreParagraph(iter.next()) 41 | } 42 | score 43 | } 44 | 45 | def scoreParagraph(paragraph: Paragraph): Float = { 46 | var score = 0.0f 47 | val text = paragraph.text 48 | var start = 0 49 | val len = text.length 50 | while (start < len) { 51 | val res = trie.findLongest(text, start) 52 | if (res.found) { 53 | start = res.end 54 | score += full 55 | } else { 56 | start += 1 57 | } 58 | } 59 | score 60 | } 61 | 62 | override val toString = s"WordInstances($list,$threshold,$full,$partial)" 63 | } 64 | 65 | object WordInstances { 66 | import scala.collection.JavaConverters._ 67 | def readToTrie(name: String): TrieNode[Boolean] = { 68 | val p = Paths.get(name) 69 | if (Files.exists(p)) { 70 | return readToTrie(Files.lines(p)) 71 | } 72 | 73 | val classRes = getClass.getResource(name) 74 | if (classRes != null) { 75 | return readToTrie(classRes) 76 | } 77 | 78 | val loaderRes 
= getClass.getClassLoader.getResource(name) 79 | if (loaderRes != null) { 80 | return readToTrie(classRes) 81 | } 82 | 83 | throw new IllegalArgumentException(s"could not find word list $name") 84 | } 85 | 86 | private def readToTrie(classRes: URL): TrieNode[Boolean] = { 87 | val reader = new InputStreamReader(classRes.openStream(), StandardCharsets.UTF_8) 88 | readToTrie(new BufferedReader(reader).lines()) 89 | } 90 | 91 | private def readToTrie( 92 | s: java.util.stream.Stream[String] 93 | ): TrieNode[Boolean] = { 94 | try { 95 | TrieNode.make(s.iterator().asScala.filterNot(_.startsWith("#"))) 96 | } finally { 97 | s.close() 98 | } 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/filters/WordTypes.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.filters 2 | 3 | import com.worksap.nlp.uzushio.lib.cleaning.{Document, Paragraph} 4 | import com.worksap.nlp.uzushio.lib.filters.base.DocFilter 5 | import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap 6 | import org.apache.commons.math3.util.FastMath 7 | 8 | class WordTypes(list: String, threshold: Float = 3, kind: String = "uniq") extends DocFilter { 9 | private val trie = WordInstances.readToTrie(list) 10 | private val scorer = kind match { 11 | case "uniq" => WordTypes.SizeScorer 12 | case "log10" => WordTypes.Log10Scorer 13 | case "sqrt" => WordTypes.SqrtScorer 14 | case _ => throw new IllegalArgumentException("unknown kind, can be one of: uniq, log10, sqrt") 15 | } 16 | override def checkDocument(doc: Document): Document = { 17 | val score = scoreDocument(doc) 18 | doc.removeWhen(score >= threshold, this) 19 | } 20 | 21 | def scoreDocument(doc: Document): Float = { 22 | val counts = new Int2IntOpenHashMap() 23 | val iter = doc.aliveParagraphs 24 | while (iter.hasNext) { 25 | consumeParagraph(counts, iter.next()) 26 | } 27 | scoreCounts(counts) 28 | } 29 | 30 | private def consumeParagraph(counts: Int2IntOpenHashMap, paragraph: Paragraph): Unit = { 31 | val text = paragraph.text 32 | var start = 0 33 | val len = text.length 34 | while (start < len) { 35 | val res = trie.findLongest(text, start) 36 | if (res.found) { 37 | start = res.end 38 | counts.addTo(res.index, 1) 39 | } else { 40 | start += 1 41 | } 42 | } 43 | } 44 | 45 | private def scoreCounts(map: Int2IntOpenHashMap): Float = { 46 | if (map.isEmpty) return 0 47 | scorer(map) 48 | } 49 | 50 | override val toString = s"WordInstances($list,$threshold,$kind)" 51 | } 52 | 53 | object WordTypes { 54 | private trait Scorer extends (Int2IntOpenHashMap => Float) with Serializable 55 | 56 | private object SizeScorer extends Scorer { 57 | override def apply(v1: Int2IntOpenHashMap): Float = v1.size() 58 | } 59 | 60 | private object SqrtScorer extends Scorer { 61 | override def apply(v1: Int2IntOpenHashMap): Float = { 62 | var score = 0.0 63 | val iter = v1.values().iterator() 64 | while (iter.hasNext) { 65 | score += Math.sqrt(iter.nextInt()) 66 | } 67 | score.toFloat 68 | } 69 | } 70 | 71 | private object Log10Scorer extends Scorer { 72 | override def apply(v1: Int2IntOpenHashMap): Float = { 73 | var score = v1.size().toDouble // log_10 (1) == 0, so add size to the score 74 | val iter = v1.values().iterator() 75 | while (iter.hasNext) { 76 | score += Math.log10(iter.nextInt()) 77 | } 78 | score.toFloat 79 | } 80 | } 81 | } 82 | -------------------------------------------------------------------------------- 
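A hedged usage sketch for the two word-list filters above; `doc` is assumed to be a Document built elsewhere in the pipeline, the list name mirrors the bundled ng_words.txt resource, and the thresholds are illustrative only:

    import com.worksap.nlp.uzushio.lib.filters.{WordInstances, WordTypes}

    // The list is resolved from the filesystem, the lib.filters package, or the root classpath.
    val instances = new WordInstances("ng_words.txt", threshold = 3)
    val types = new WordTypes("ng_words.txt", threshold = 3, kind = "log10")
    val checked = types.checkDocument(instances.checkDocument(doc)) // doc: Document (assumed)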
/lib/src/main/scala/com/worksap/nlp/uzushio/lib/filters/base/FilterBase.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.filters.base 2 | 3 | import com.worksap.nlp.uzushio.lib.cleaning.{Document, Paragraph} 4 | 5 | /** All filter classes extend from this trait. They must have a single public constructor. The 6 | * framework handles passing arguments from the config files automatically and applies default 7 | * arguments correctly (the value from the config is used first; the default parameter value is 8 | * used when the config does not have a parameter with the same name). 9 | * 10 | * **On filter functions**. Filtering functions should not remove paragraphs from documents. 11 | * Instead, they should mark a paragraph or a document "to delete" with a marker object which 12 | * should contain the reason for deletion. The marker object can be a string or any JVM object 13 | * with the toString method overridden. The `toString` output should not contain any spaces or 14 | * other characters which could cause problems in filesystem paths. 15 | */ 16 | trait FilterBase extends Serializable 17 | 18 | /** Paragraph-level filter which considers all paragraphs independently. Marks the 19 | * [[Paragraph.remove]] field with the marker object. 20 | * 21 | * @see 22 | * [[FilterBase]] about marker objects 23 | */ 24 | trait ParagraphFilter extends FilterBase { 25 | def checkParagraph(p: Paragraph): Paragraph 26 | } 27 | 28 | /** Document-level filter. Should not remove any paragraphs. Instead, mark [[Document.remove]] or 29 | * [[Paragraph.remove]] with a marker object. 30 | * 31 | * @see 32 | * [[FilterBase]] about marker objects 33 | */ 34 | trait DocFilter extends FilterBase { 35 | def checkDocument(doc: Document): Document 36 | } 37 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/filters/base/HighLowDocFilter.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.filters.base 2 | 3 | import com.worksap.nlp.uzushio.lib.cleaning.Document 4 | 5 | trait HighLowDocFilter extends DocFilter { self => 6 | def high: Float 7 | 8 | def low: Float 9 | 10 | def maybeFilter(doc: Document, metric: Float): Document = { 11 | if (metric < low) { 12 | doc.copy(remove = Low) 13 | } else if (metric > high) { 14 | doc.copy(remove = High) 15 | } else doc 16 | } 17 | 18 | def describeFilter: String = self.getClass.getSimpleName 19 | 20 | @transient object Low { 21 | override val toString = s"$describeFilter.Low($low)" 22 | } 23 | 24 | @transient object High { 25 | override val toString = s"$describeFilter.High($high)" 26 | } 27 | 28 | override def toString = s"$describeFilter($low,$high)" 29 | } 30 | 31 | trait HighLowDocIntFilter extends DocFilter { self => 32 | def high: Int 33 | 34 | def low: Int 35 | 36 | def maybeFilter(doc: Document, metric: Int): Document = { 37 | if (metric < low) { 38 | doc.copy(remove = Low) 39 | } else if (metric > high) { 40 | doc.copy(remove = High) 41 | } else doc 42 | } 43 | 44 | @transient object Low { 45 | override val toString = s"${self.getClass.getSimpleName}.Low($low)" 46 | } 47 | 48 | @transient object High { 49 | override val toString = s"${self.getClass.getSimpleName}.High($high)" 50 | } 51 | 52 | override def toString = s"${self.getClass.getSimpleName}($low,$high)" 53 | } 54 | --------------------------------------------------------------------------------
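A minimal sketch of a custom filter following the conventions documented in FilterBase and HighLowDocFilter above; the class name, the metric, and the default bounds are invented for illustration and do not exist in the repository.

import com.worksap.nlp.uzushio.lib.cleaning.Document
import com.worksap.nlp.uzushio.lib.filters.base.HighLowDocFilter

// Marks documents whose average paragraph length falls outside [low, high].
// Single public constructor, so the config framework can fill low/high by name;
// maybeFilter() only attaches the Low/High marker objects and never drops paragraphs.
class AvgParagraphLength(override val low: Float = 10f, override val high: Float = 1000f)
    extends HighLowDocFilter {
  override def checkDocument(doc: Document): Document = {
    val pars = doc.aliveParagraphs.toSeq
    val metric = if (pars.isEmpty) 0f else pars.map(_.text.length).sum.toFloat / pars.size
    maybeFilter(doc, metric)
  }
}

A pipeline entry such as {class: "com.example.AvgParagraphLength", low: 20} would then be enough; the framework takes low from the config and keeps the default for high (the package prefix here is hypothetical).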
/lib/src/main/scala/com/worksap/nlp/uzushio/lib/html/AllTagMapper.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.html 2 | 3 | import org.apache.tika.parser.html.HtmlMapper 4 | 5 | import java.util.Locale 6 | 7 | /** Mapper class that provides all tags to the handler. 8 | * 9 | * With this class set in the parse context, the handler can recognize HTML-specific tags such as div, br, etc. 10 | * ref: https://stackoverflow.com/questions/19368018/parsing-html-elements-in-apache-tika 11 | */ 12 | class AllTagMapper extends HtmlMapper { 13 | override def mapSafeElement(name: String): String = name.toLowerCase(Locale.ROOT) 14 | 15 | override def isDiscardElement(name: String): Boolean = false 16 | 17 | override def mapSafeAttribute( 18 | elementName: String, 19 | attributeName: String 20 | ): String = attributeName.toLowerCase(Locale.ROOT) 21 | } 22 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/html/ParseAbortException.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.html 2 | 3 | class ParseAbortException extends Exception 4 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangTagSniffer.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.lang 2 | 3 | import com.worksap.nlp.uzushio.lib.lang.LangTagSniffer.{extractCharset, metaRegex} 4 | 5 | import java.nio.charset.{CodingErrorAction, StandardCharsets} 6 | import java.nio.{ByteBuffer, CharBuffer} 7 | import java.util.regex.Pattern 8 | 9 | case class LangTagSniff(charset: String, language: String) 10 | 11 | /** Try to sniff language and encoding by decoding the first 10k bytes as ASCII and using regexes to 12 | * find `<meta>` tags.
13 | */ 14 | class LangTagSniffer() { 15 | private val decoder = { 16 | val dec = StandardCharsets.US_ASCII.newDecoder() 17 | dec.onMalformedInput(CodingErrorAction.REPLACE) 18 | dec 19 | } 20 | 21 | private val charBuf = CharBuffer.allocate(10 * 1024) 22 | 23 | private def doSniff(buffer: CharBuffer): LangTagSniff = { 24 | var charset = "" 25 | var language = "" 26 | val iter = metaRegex.findAllIn(buffer) 27 | while (iter.hasNext) { 28 | val metaTag = iter.next() 29 | val cs = extractCharset(metaTag) 30 | if (cs.nonEmpty) { 31 | charset = cs 32 | } 33 | 34 | } 35 | LangTagSniff(charset, language) 36 | } 37 | 38 | def sniffTags(data: ByteBuffer): LangTagSniff = { 39 | val pos = data.position() 40 | val lim = data.limit() 41 | 42 | charBuf.clear() 43 | val res = decoder.decode(data, charBuf, false) 44 | charBuf.flip() 45 | 46 | data.position(pos) 47 | data.limit(lim) 48 | doSniff(charBuf) 49 | } 50 | 51 | def sniffTags(data: Array[Byte], offset: Int, position: Int): LangTagSniff = { 52 | val buffer = ByteBuffer.wrap(data, offset, position) 53 | sniffTags(buffer) 54 | } 55 | } 56 | 57 | object LangTagSniffer { 58 | private val metaRegex = "<meta[^>]*>".r 59 | private val charsetRegex = Pattern.compile("charset=([^\"' ;,/>]+)", Pattern.CASE_INSENSITIVE) 60 | 61 | def extractCharset(tag: String): String = { 62 | val matcher = charsetRegex.matcher(tag) 63 | if (matcher.find()) { 64 | matcher.group(1) 65 | } else { 66 | "" 67 | } 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/resources/CachedLocalResource.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.resources 2 | 3 | import com.github.jbaiter.kenlm.Model 4 | import com.worksap.nlp.sudachi.{Config, Dictionary, DictionaryFactory} 5 | import org.apache.spark.SparkFiles 6 | 7 | import java.nio.file.{Files, Path, Paths} 8 | import java.util.concurrent.ConcurrentHashMap 9 | 10 | trait CachedLocalResource[T] { 11 | final private val cache = new ConcurrentHashMap[Path, T]() 12 | 13 | def create(p: Path): T 14 | 15 | def get(dict: String): T = { 16 | val p = resolveLocalPath(dict).orElse(resolveSparkPath(dict)).getOrElse( 17 | throw new IllegalArgumentException(s"could not find file: $dict") 18 | ) 19 | 20 | cache.computeIfAbsent( 21 | p, 22 | p1 => create(p1) 23 | ) 24 | } 25 | 26 | def resolveLocalPath(str: String): Option[Path] = { 27 | val p = Paths.get(str) 28 | if (Files.exists(p) && Files.isRegularFile(p)) { 29 | Some(p) 30 | } else None 31 | } 32 | 33 | def resolveSparkPath(str: String): Option[Path] = { 34 | resolveLocalPath(SparkFiles.get(str)) 35 | } 36 | } 37 | 38 | object Sudachi extends CachedLocalResource[Dictionary] { 39 | override def create(p: Path): Dictionary = { 40 | val cfg = Config.defaultConfig().systemDictionary(p) 41 | new DictionaryFactory().create(cfg) 42 | } 43 | } 44 | 45 | object KenLM extends CachedLocalResource[Model] { 46 | override def create(p: Path): Model = new Model(p) 47 | } 48 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/runners/KenLMRunner.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.runners 2 | 3 | import com.worksap.nlp.uzushio.lib.cleaning.Paragraph 4 | import com.worksap.nlp.uzushio.lib.filters.KenLMEvaluator 5 | import com.worksap.nlp.uzushio.lib.resources.{KenLM,
Sudachi} 6 | import com.worksap.nlp.uzushio.lib.utils.Paragraphs 7 | import org.apache.spark.sql.{SaveMode, SparkSession} 8 | import org.apache.spark.sql.expressions.UserDefinedFunction 9 | import org.apache.spark.sql.functions.{explode, udf} 10 | import org.rogach.scallop.ScallopConf 11 | 12 | object KenLMRunner { 13 | 14 | class Args(args: Seq[String]) extends ScallopConf(args) { 15 | val input = opt[List[String]](required = true) 16 | val output = opt[String](required = true) 17 | val sudachiDict = opt[String]() 18 | val kenlmModel = opt[String]() 19 | val master = opt[String]() 20 | this.verify() 21 | } 22 | 23 | class LMPerplexity(sudachi: String, kenlm: String) extends Serializable { 24 | 25 | @transient 26 | private lazy val evaluator = KenLMEvaluator.make(sudachi, kenlm, 0.1f) 27 | 28 | def process(par: String): Double = { 29 | val prob = evaluator.scoreParagraph(Paragraph("body", par)) 30 | Math.pow(10, -prob) 31 | } 32 | 33 | def asUdf: UserDefinedFunction = udf((x: String) => process(x)) 34 | } 35 | 36 | def main(args: Array[String]): Unit = { 37 | val opts = new Args(args) 38 | 39 | val scb = SparkSession.builder() 40 | opts.master.toOption.foreach(scb.master) 41 | 42 | val sc = scb.getOrCreate() 43 | 44 | val inputs = sc.read.parquet(opts.input(): _*) 45 | 46 | import sc.implicits._ 47 | 48 | val splitPars = udf((x: String) => Paragraphs.extractCleanParagraphs(x)) 49 | 50 | val pars = inputs.select(explode(splitPars($"text")).as("text")).distinct() 51 | 52 | val ppx = new LMPerplexity(opts.sudachiDict(), opts.kenlmModel()) 53 | 54 | val probs = pars.withColumn("perplexity", ppx.asUdf($"text")) 55 | .repartitionByRange(20, $"perplexity".desc).sortWithinPartitions($"perplexity".desc) 56 | 57 | probs.write.mode(SaveMode.Overwrite).json(opts.output()) 58 | } 59 | 60 | } 61 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/runners/Repackage.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.runners 2 | 3 | import com.worksap.nlp.uzushio.lib.utils.Paragraphs 4 | import com.worksap.nlp.uzushio.lib.utils.Resources.AutoClosableResource 5 | import org.apache.spark.sql.functions.udf 6 | import org.apache.spark.sql.{SaveMode, SparkSession} 7 | import org.rogach.scallop.ScallopConf 8 | 9 | object Repackage { 10 | 11 | def run(args: Args, spark: SparkSession): Unit = { 12 | val data = spark.read.parquet(args.input) 13 | 14 | val reparitioned = data.coalesce(args.maxParitions) 15 | 16 | val cleaned = 17 | if (args.clear && reparitioned.columns.contains("text")) { 18 | val cleanUdf = udf { s: String => Paragraphs.extractCleanParagraphs(s).mkString("\n\n") } 19 | reparitioned.withColumn("text", cleanUdf(reparitioned.col("text"))) 20 | } else reparitioned 21 | 22 | cleaned.write.format(args.format).option("compression", args.compression) 23 | .mode(SaveMode.Overwrite).save(args.output) 24 | } 25 | 26 | class ArgParser(args: Seq[String]) extends ScallopConf(args) { 27 | val input = opt[String]() 28 | val output = opt[String]() 29 | val format = opt[String](default = Some("parquet")) 30 | val compression = opt[String](default = Some("zstd")) 31 | val maxPartitions = opt[Int](default = Some(10000)) 32 | val clear = toggle("clear", default = Some(false)) 33 | verify() 34 | 35 | def toArgs: Args = Args( 36 | input = input(), 37 | output = output(), 38 | format = format(), 39 | compression = compression(), 40 | maxParitions = maxPartitions(), 41 
| clear = clear() 42 | ) 43 | } 44 | 45 | case class Args( 46 | input: String, 47 | output: String, 48 | format: String, 49 | compression: String, 50 | maxParitions: Int, 51 | clear: Boolean 52 | ) 53 | 54 | def main(args: Array[String]): Unit = { 55 | val argObj = new ArgParser(args).toArgs 56 | SparkSession.builder().master("local").getOrCreate().use { spark => 57 | run(argObj, spark) 58 | } 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/stats/CountMinSketch.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.stats 2 | 3 | import com.worksap.nlp.uzushio.lib.utils.MathUtil 4 | import org.apache.spark.sql.expressions.Aggregator 5 | import org.apache.spark.sql.{Encoder, Encoders} 6 | 7 | import java.util.Random 8 | 9 | case class CountMinSketchState( 10 | rows: Int, 11 | cols: Int, 12 | counts: Array[Long] 13 | ) { 14 | def update(hasher: Hasher, value: Long): Unit = {} 15 | } 16 | 17 | case class Hasher( 18 | coeffs: Array[Long] 19 | ) { 20 | def hash(c1: Long, c2: Long, value: Long): Long = { 21 | val x = (value * c1) + c2 // mod 2^64 22 | java.lang.Long.rotateRight(x, 23) 23 | } 24 | } 25 | 26 | object Hasher { 27 | def make(num: Int): Hasher = { 28 | val rng = new Random(0xdeadbeef) 29 | Hasher(Array.fill(num * 2)(rng.nextLong())) 30 | } 31 | } 32 | 33 | class CountMinSketch( 34 | private val rows: Int, 35 | private val cols: Int, 36 | private val ngrams: NgramHashExtractor, 37 | private val hasher: Hasher 38 | ) extends Aggregator[String, CountMinSketchState, CountMinSketchState] { 39 | override def zero: CountMinSketchState = 40 | CountMinSketchState(rows, cols, new Array[Long](rows * cols)) 41 | 42 | override def reduce( 43 | b: CountMinSketchState, 44 | a: String 45 | ): CountMinSketchState = { 46 | ngrams.compute(a) { hash => 47 | b.update(hasher, hash) 48 | } 49 | b 50 | } 51 | 52 | override def merge( 53 | b1: CountMinSketchState, 54 | b2: CountMinSketchState 55 | ): CountMinSketchState = { 56 | val result = b1.copy() 57 | MathUtil.addArray(result.counts, b2.counts) 58 | result 59 | } 60 | 61 | override def finish(reduction: CountMinSketchState): CountMinSketchState = reduction 62 | 63 | override def bufferEncoder: Encoder[CountMinSketchState] = Encoders.product 64 | 65 | override def outputEncoder: Encoder[CountMinSketchState] = Encoders.product 66 | } 67 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/stats/SimHashProcessor.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.stats 2 | 3 | import com.worksap.nlp.uzushio.lib.stats.SimHashProcessor.addVector 4 | import com.worksap.nlp.uzushio.lib.utils.Ziggurat 5 | import it.unimi.dsi.util.XorShiftStarRandomGenerator 6 | 7 | class NgramHashExtractor(private val minOrder: Int, private val maxOrder: Int) 8 | extends Serializable { 9 | require(minOrder > 0) 10 | require(maxOrder > 0) 11 | require(minOrder < maxOrder) 12 | 13 | @inline 14 | final def compute(data: CharSequence)(@inline fn: Long => Unit): Unit = { 15 | var i = 0 16 | val minOrder = this.minOrder - 1 17 | val maxOrder = this.maxOrder 18 | val end = data.length() 19 | while (i < end) { 20 | var order = 0 21 | var hashState = NgramHashExtractor.HASH_SEED 22 | while (order < maxOrder && i + order < end) { 23 | val c = data.charAt(i + order) 24 
| if (c == '\n') { 25 | order = maxOrder 26 | } else { 27 | hashState = NgramHashExtractor.mix(hashState, c & 0xffffL) 28 | if (order >= minOrder) { 29 | val hash = NgramHashExtractor.mix(hashState, order) 30 | fn(hash): @inline 31 | } 32 | } 33 | 34 | order += 1 35 | } 36 | i += 1 37 | } 38 | } 39 | 40 | } 41 | 42 | object NgramHashExtractor { 43 | final val HASH_SEED = 15213125612L 44 | final val HASH_MULT = 6364136223846793005L 45 | final val HASH_ADD = 1442695040888963407L 46 | 47 | def mix(seed: Long, v: Long): Long = { 48 | val x = (v + HASH_ADD) ^ seed 49 | ror(x * HASH_MULT) 50 | } 51 | 52 | def ror(x: Long): Long = java.lang.Long.rotateRight(x, 23) 53 | 54 | def hashString(x: String): Long = { 55 | var state = 0xdeadbeeffeed133L 56 | val nchars = x.length 57 | var i = 0 58 | while (i < nchars) { 59 | state = mix(state, x.charAt(i) & 0xffffL) 60 | i += 1 61 | } 62 | mix(state, nchars) 63 | } 64 | } 65 | 66 | class SimHashProcessor(private val size: Int) extends Serializable { 67 | def init: Array[Float] = new Array[Float](size) 68 | 69 | def update( 70 | state: Array[Float], 71 | data: CharSequence, 72 | ngrams: NgramHashExtractor 73 | ): Unit = { 74 | ngrams.compute(data) { hash => 75 | addVector(state, hash) 76 | } 77 | } 78 | 79 | def result(state: Array[Float]): Array[Byte] = { 80 | val len1 = state.length 81 | val resultLen = (len1 / 8) + (if ((len1 & 7) != 0) 1 else 0) 82 | val result = new Array[Byte](resultLen) 83 | var step = 0 84 | while (step < resultLen) { 85 | val offset = step * 8 86 | var i = 0 87 | var l = 0 88 | while (i < 8 && offset + i < len1) { 89 | val x = state(i + offset) 90 | if (x > 0) { 91 | l |= (1 << i) 92 | } 93 | i += 1 94 | } 95 | result(step) = l.toByte 96 | step += 1 97 | } 98 | result 99 | } 100 | } 101 | 102 | object SimHashProcessor { 103 | def addVector(state: Array[Float], hash: Long): Unit = { 104 | val rng = new XorShiftStarRandomGenerator(hash) 105 | 106 | var i = 0 107 | val len = state.length 108 | while (i < len) { 109 | state(i) = state(i) + Ziggurat.computeNextGaussian(rng).toFloat 110 | i += 1 111 | } 112 | 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/utils/BuilderSyntax.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.utils 2 | 3 | import org.apache.spark.sql.Dataset 4 | 5 | object BuilderSyntax { 6 | implicit class BuilderOps[T](val o: T) extends AnyVal { 7 | @inline def ifEnabled(cond: Boolean)(fn: T => T): T = { 8 | if (cond) fn(o) else o 9 | } 10 | } 11 | 12 | } 13 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/utils/Levenshtein.java: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.utils; 2 | 3 | public class Levenshtein { 4 | 5 | private Levenshtein() { 6 | // instances forbidden 7 | } 8 | 9 | public static int[] floatRange(int len) { 10 | int[] result = new int[len]; 11 | for (int i = 0; i < len; ++i) { 12 | result[i] = i * 100; 13 | } 14 | return result; 15 | } 16 | 17 | public static int levenshteinDistance(CharSequence a, CharSequence b, int limit, int step) { 18 | int[] row0 = floatRange(b.length() + 1); 19 | int[] row1 = new int[b.length() + 1]; 20 | 21 | int al = a.length(); 22 | int bl = b.length(); 23 | for (int i = 0; i < al; ++i) { 24 | char c = a.charAt(i); 25 | for (int j = 1; j < 
bl; ++j) { 26 | char x = b.charAt(j - 1); 27 | 28 | } 29 | } 30 | return -1; 31 | } 32 | 33 | private static final int UMASK = 0x7fff_ffff; 34 | private static final int FMASK = 0x8000_0000; 35 | 36 | public static int levStep(int compressedScore, int scoreA, int scoreB) { 37 | int uscore = compressedScore & UMASK; 38 | int flag = compressedScore & FMASK; 39 | 40 | int score = scoreA; 41 | if (flag != 0) { 42 | score = scoreB; 43 | } 44 | 45 | return (uscore + score) & flag; 46 | } 47 | 48 | public static int levStepB(int compressedScore, int scoreA, int scoreB) { 49 | int uscore = compressedScore & UMASK; 50 | int flag = compressedScore & FMASK; 51 | 52 | int score = scoreA; 53 | if (flag != 0) { 54 | score = scoreB; 55 | } 56 | 57 | return (uscore + score) & flag; 58 | } 59 | 60 | public static final int MASK1 = 0b11111111; 61 | public static final int MASK2 = 0b00000000; 62 | } 63 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/utils/Resources.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.utils 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | object Resources { 6 | implicit class AutoClosableResource[T <: AutoCloseable](val x: T) extends AnyVal { 7 | @inline 8 | def use[X](fn: T => X): X = 9 | try { 10 | fn(x) 11 | } finally { 12 | x.close() 13 | } 14 | } 15 | 16 | implicit class SparkContextResource(val x: SparkContext) extends AnyVal { 17 | @inline 18 | def use[X](fn: SparkContext => X): X = 19 | try { 20 | fn(x) 21 | } finally { 22 | x.stop() 23 | } 24 | } 25 | 26 | } 27 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/utils/RowBuffer.java: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.utils; 2 | 3 | import it.unimi.dsi.fastutil.objects.ObjectArrayList; 4 | 5 | import java.util.Iterator; 6 | 7 | /** 8 | * Buffer for row-like objects. Indices of entries are not preserved. Has O(1) {@link #removeElementAt(int)} method 9 | * which removes an element at index and puts the last element to the removed position. 10 | * 11 | * @param 12 | * row-like object 13 | */ 14 | public final class RowBuffer extends ObjectArrayList { 15 | 16 | /** 17 | * An iterator class which supports removing the just returned element. 18 | * 19 | * @param 20 | */ 21 | public final static class DeletingIterator implements Iterator { 22 | private final T[] data; 23 | private final RowBuffer parent; 24 | private int position; 25 | 26 | public DeletingIterator(RowBuffer parent) { 27 | this.data = parent.a; 28 | this.parent = parent; 29 | this.position = 0; 30 | } 31 | 32 | @Override 33 | public boolean hasNext() { 34 | return position < parent.size; 35 | } 36 | 37 | @Override 38 | public T next() { 39 | T element = data[position]; 40 | position += 1; 41 | return element; 42 | } 43 | 44 | /** 45 | * Remove the element which was returned by the previous {@link #next()} call. 
46 | * 47 | * @return removed element 48 | */ 49 | public T removeElement() { 50 | int toRemoveIdx = position - 1; 51 | T element = parent.removeElementAt(toRemoveIdx); 52 | position = toRemoveIdx; 53 | return element; 54 | } 55 | } 56 | 57 | public DeletingIterator deletingIterator() { 58 | return new DeletingIterator<>(this); 59 | } 60 | 61 | public static RowBuffer single(T x) { 62 | RowBuffer buffer = new RowBuffer<>(); 63 | buffer.add(x); 64 | return buffer; 65 | } 66 | 67 | /** 68 | * Removes the current element from the collection. Last element is placed instead of the current element. 69 | * 70 | * @param index 71 | * where to remove 72 | * @return element which replaces current element 73 | */ 74 | public T removeElementAt(int index) { 75 | if (index < 0) { 76 | throw new IllegalArgumentException("index < 0"); 77 | } 78 | if (index >= size) { 79 | throw new IllegalArgumentException("index >= size"); 80 | } 81 | T[] arr = a; 82 | int lastIdx = size - 1; 83 | T last = arr[lastIdx]; 84 | arr[lastIdx] = null; 85 | if (index != lastIdx) { 86 | arr[index] = last; 87 | } 88 | size = lastIdx; 89 | return last; 90 | } 91 | 92 | public int addToBuffer(T element) { 93 | int sz = size; 94 | add(element); 95 | return sz; 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/utils/SentenceIterator.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.utils 2 | 3 | class SentenceIterator(input: String, maxLength: Int) extends Iterator[String] { 4 | 5 | private var start = 0 6 | 7 | override def hasNext: Boolean = start < input.length 8 | 9 | override def next(): String = { 10 | val curStart = start 11 | var curEnd = SentenceIterator.indexOfSeparator(input, curStart, input.length) match { 12 | case -1 => input.length 13 | case x => x + 1 14 | } 15 | 16 | val curLen = curEnd - curStart 17 | if (curLen > maxLength) { 18 | curEnd = curStart + maxLength 19 | } 20 | 21 | start = curEnd 22 | 23 | input.substring(curStart, curEnd) 24 | } 25 | } 26 | 27 | object SentenceIterator { 28 | private val SEPARATORS = "\n。、!?!?".toCharArray 29 | 30 | def indexOfSeparator(input: CharSequence, start: Int, end: Int): Int = { 31 | val seps = SEPARATORS 32 | val nseps = seps.length 33 | 34 | if (start < 0 || start > input.length()) { 35 | throw new IndexOutOfBoundsException() 36 | } 37 | 38 | if (end < 0 || end > input.length()) { 39 | throw new IndexOutOfBoundsException() 40 | } 41 | 42 | var i = start 43 | while (i < end) { 44 | val ch = input.charAt(i) 45 | var j = 0 46 | while (j < nseps) { 47 | val ch0 = seps(j) 48 | if (ch == ch0) { 49 | return i 50 | } 51 | j += 1 52 | } 53 | i += 1 54 | } 55 | -1 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/utils/SessionBufferIn.scala: -------------------------------------------------------------------------------- 1 | package org.apache.hc.core5.http.impl.nio 2 | 3 | import org.apache.hc.core5.http.nio.SessionInputBuffer 4 | 5 | trait ResettableBuffer extends SessionInputBuffer { 6 | def clear(): Unit 7 | def putBytes(bytes: Array[Byte]): Unit 8 | 9 | def position(): Int 10 | } 11 | 12 | object SessionBufferAccess { 13 | def instance(size: Int, lineSize: Int): ResettableBuffer = 14 | new SessionInputBufferImpl(size, lineSize) with ResettableBuffer { 15 | override def putBytes(bytes: Array[Byte]): Unit = { 16 
| val b = buffer() 17 | val totalSize = size.min(bytes.length) 18 | b.clear() 19 | b.put(bytes, 0, totalSize) 20 | } 21 | 22 | override def clear(): Unit = super.clear() 23 | 24 | override def position(): Int = buffer().position() 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/utils/TrieNode.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.utils 2 | 3 | import it.unimi.dsi.fastutil.chars.Char2ObjectOpenHashMap 4 | 5 | final class TrieNode[T] extends Char2ObjectOpenHashMap[TrieNode[T]](4) { 6 | private var position: Int = -1 7 | 8 | def findLongest(str: CharSequence, offset: Int): SearchResult = { 9 | var idx = offset 10 | val len = str.length() 11 | var end = -1 12 | var value = -1 13 | var node = this 14 | while (idx < len && node != null) { 15 | val ch = str.charAt(idx) 16 | val next = node.get(ch) 17 | if (next != null && next.position != -1) { 18 | end = idx + 1 19 | value = next.position 20 | } 21 | node = next 22 | idx += 1 23 | } 24 | SearchResult(end, value) 25 | } 26 | } 27 | 28 | object TrieNode { 29 | def make(data: Iterable[CharSequence]): TrieNode[Boolean] = { 30 | make(data.iterator) 31 | } 32 | 33 | def make(data: Iterator[CharSequence]): TrieNode[Boolean] = { 34 | val root = new TrieNode[Boolean]() 35 | var index = 0 36 | while (data.hasNext) { 37 | val str = data.next() 38 | var node = root 39 | var i = 0 40 | val len = str.length() 41 | while (i < len) { 42 | val ch = str.charAt(i) 43 | var subnode = node.get(ch) 44 | if (subnode == null) { 45 | subnode = new TrieNode[Boolean]() 46 | node.put(ch, subnode) 47 | } 48 | node = subnode 49 | i += 1 50 | } 51 | node.position = index 52 | index += 1 53 | } 54 | root 55 | } 56 | } 57 | 58 | final class SearchResult(val carrier: Long) extends AnyVal { 59 | def end: Int = (carrier & 0xffffffff).toInt 60 | 61 | def index: Int = (carrier >>> 32).toInt 62 | 63 | def ==(o: SearchResult): Boolean = { 64 | o.carrier == carrier 65 | } 66 | 67 | def !=(o: SearchResult): Boolean = !(this == o) 68 | 69 | def found: Boolean = end > 0 70 | 71 | def failure: Boolean = !found 72 | 73 | override def toString: String = s"SearchResult($end, $index)" 74 | } 75 | 76 | object SearchResult { 77 | def apply(end: Int, index: Int): SearchResult = { 78 | val repr = ((index & 0xffffffffL) << 32) | (end & 0xffffffffL) 79 | new SearchResult(repr) 80 | } 81 | 82 | def empty(): SearchResult = apply(-1, -1) 83 | } 84 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/utils/WarcFileReader.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.utils 2 | 3 | import com.google.common.io.CountingInputStream 4 | import com.worksap.nlp.uzushio.lib.utils.WarcFileReader.MAX_RECORD_SIZE 5 | import com.worksap.nlp.uzushio.lib.warc.WarcRecord 6 | import org.apache.hadoop.conf.Configuration 7 | import org.apache.hadoop.fs.Path 8 | import org.apache.log4j.LogManager 9 | import org.archive.io.warc.WARCReaderFactory 10 | 11 | import java.io.BufferedInputStream 12 | 13 | /** Reads [[WarcRecord]]s from a WARC file using Hadoop filesystem APIs. 
*/ 14 | class WarcFileReader(conf: Configuration, filePath: Path) { 15 | @transient private lazy val logger = LogManager.getLogger(this.getClass.getSimpleName) 16 | 17 | /** Opens a warc file and setup an iterator of records. */ 18 | private def fs = filePath.getFileSystem(conf) 19 | private val fileSize = fs.getFileStatus(filePath).getLen 20 | private val fsin = { 21 | val rawStream = fs.open(filePath) 22 | val wrapped = 23 | if (rawStream.markSupported()) { 24 | rawStream 25 | } else new BufferedInputStream(rawStream) 26 | // noinspection UnstableApiUsage 27 | new CountingInputStream(wrapped) 28 | } 29 | private val reader = WARCReaderFactory.get(filePath.getName, fsin, true) 30 | private val recordIter = reader.iterator 31 | 32 | /** Init counters to report progress. */ 33 | private var recordsRead: Long = 0 34 | 35 | /** Closes the file and reader. */ 36 | def close(): Unit = { 37 | reader.close() 38 | fsin.close() 39 | } 40 | 41 | /** Reads the next record from the iterator. 42 | */ 43 | def read(): WarcRecord = { 44 | if (!recordIter.hasNext) { 45 | throw new java.util.NoSuchElementException() 46 | } 47 | 48 | try { 49 | val rec = recordIter.next() 50 | val length = rec.available() 51 | if (length > MAX_RECORD_SIZE) { 52 | rec.skip(length) 53 | logger.info(s"from $filePath skipped ${rec.getHeader}") 54 | recordsRead += 1 55 | read() 56 | } else { 57 | val record = new WarcRecord(rec, filePath) 58 | recordsRead += 1 59 | record 60 | } 61 | } catch { 62 | case e: java.io.EOFException => 63 | logger.warn(s"error while iterating warc, try to skip: $filePath", e) 64 | read() 65 | } 66 | } 67 | 68 | /** Returns the number of records that have been read. */ 69 | def getRecordsRead: Long = recordsRead 70 | 71 | /** Returns the number of bytes that have been read. */ 72 | def bytesRead: Long = fsin.getCount 73 | 74 | /** Returns the proportion of the file that has been read. */ 75 | def getProgress: Float = { 76 | if (fileSize <= 0) return 1.0f 77 | bytesRead.toFloat / fileSize.toFloat 78 | } 79 | } 80 | 81 | object WarcFileReader { 82 | final val MAX_RECORD_SIZE = 16 * 1024 * 1024 // 16MB 83 | } 84 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/warc/WarcInputFormat.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.warc 2 | 3 | import com.worksap.nlp.uzushio.lib.utils.WarcFileReader 4 | import org.apache.hadoop.fs.Path 5 | import org.apache.hadoop.io.LongWritable 6 | import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat, FileSplit} 7 | import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext} 8 | 9 | /** Hadoop InputFormat for WARC files. 10 | * 11 | * Key is 1-index LongWritable. Use get() method to take Long value. 12 | */ 13 | class WarcInputFormat extends FileInputFormat[LongWritable, WarcWritable] { 14 | 15 | /** Opens a WARC file (possibly compressed), and returns a RecordReader for accessing it. 16 | */ 17 | override def createRecordReader( 18 | split: InputSplit, 19 | context: TaskAttemptContext 20 | ) = { 21 | new WarcRecordReader() 22 | } 23 | 24 | override def isSplitable(context: JobContext, filename: Path): Boolean = { 25 | // we cannot (sanely) split warc files, due to its variable-length records. 26 | false 27 | } 28 | } 29 | 30 | /** Wrapper class of [[WarcFileReader]] to implement RecordReader. 
*/ 31 | class WarcRecordReader extends RecordReader[LongWritable, WarcWritable] { 32 | private val key = new LongWritable() 33 | private val value = new WarcWritable() 34 | 35 | private var reader: WarcFileReader = null 36 | 37 | override def initialize( 38 | split: InputSplit, 39 | context: TaskAttemptContext 40 | ): Unit = { 41 | reader = new WarcFileReader( 42 | context.getConfiguration, 43 | split.asInstanceOf[FileSplit].getPath 44 | ) 45 | } 46 | 47 | override def nextKeyValue(): Boolean = { 48 | try { 49 | val record = reader.read() 50 | key.set(reader.getRecordsRead) 51 | value.setRecord(record) 52 | true 53 | } catch { 54 | case _: java.util.NoSuchElementException => false 55 | } 56 | } 57 | 58 | override def getCurrentKey: LongWritable = key 59 | 60 | override def getCurrentValue: WarcWritable = value 61 | 62 | override def getProgress: Float = reader.getProgress 63 | 64 | override def close(): Unit = { 65 | reader.close() 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/warc/WarcLoader.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.warc 2 | 3 | import org.apache.hadoop.io.LongWritable 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.rdd.RDD 6 | 7 | object WarcLoader { 8 | /* Load WARC file as RDD. */ 9 | def readWarcFiles( 10 | spark: SparkContext, 11 | name: String 12 | ): RDD[WarcRecord] = { 13 | spark.newAPIHadoopFile[LongWritable, WarcWritable, WarcInputFormat]( 14 | name 15 | ).map { case (_, v) => v.getRecord } 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/warc/WarcRecord.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.warc 2 | 3 | import com.worksap.nlp.uzushio.lib.warc.WarcRecord.{ 4 | RECORD_ACCESS_DATE, 5 | RECORD_ID, 6 | RECORD_TRUNCATED, 7 | RECORD_TYPE, 8 | RECORD_URL 9 | } 10 | import org.apache.commons.io.IOUtils 11 | import org.apache.hadoop.fs.Path 12 | import org.archive.format.warc.WARCConstants 13 | import org.archive.io.ArchiveRecord 14 | 15 | import java.io.Serializable 16 | 17 | /** Serializable wrapper of ArchiveRecord, with body read in memory. 
*/ 18 | class WarcRecord(record: ArchiveRecord, val path: Path) extends Serializable { 19 | // capture headers 20 | private val headers = record.getHeader.getHeaderFields 21 | // read body of request 22 | val content: Array[Byte] = IOUtils.toByteArray(record, record.available()); 23 | 24 | def isResponse: Boolean = { 25 | // ref https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.0 26 | val warcType = headers.getOrDefault(RECORD_TYPE, "") 27 | "response" == warcType 28 | } 29 | 30 | def isTruncated: Boolean = headers.get(RECORD_TRUNCATED) match { 31 | case null => false 32 | case s: CharSequence => s.length() > 0 33 | case _ => true 34 | } 35 | 36 | def url: String = headers.getOrDefault(RECORD_URL, "").toString 37 | 38 | def accessDate: String = headers.get(RECORD_ACCESS_DATE).toString 39 | 40 | def docId: String = headers.get(RECORD_ID).toString 41 | } 42 | 43 | object WarcRecord { 44 | final val RECORD_TYPE = WARCConstants.HEADER_KEY_TYPE 45 | final val RECORD_TRUNCATED = WARCConstants.HEADER_KEY_TRUNCATED 46 | final val RECORD_URL = WARCConstants.HEADER_KEY_URI 47 | final val RECORD_ACCESS_DATE = WARCConstants.HEADER_KEY_DATE 48 | final val RECORD_ID = WARCConstants.HEADER_KEY_ID 49 | } 50 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/warc/WarcWritable.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.warc 2 | 3 | import org.apache.hadoop.io.Writable 4 | 5 | import java.io.{DataInput, DataOutput, Serializable}; 6 | 7 | /** A mutable wrapper around a [[WarcRecord]] implementing the Hadoop Writable and Serializable (for 8 | * Spark) interfaces. 9 | */ 10 | class WarcWritable(private var record: WarcRecord = null) extends Writable with Serializable { 11 | 12 | /** Returns the record currently wrapped by this writable. */ 13 | def getRecord: WarcRecord = record 14 | 15 | /** Updates the record held within this writable wrapper. */ 16 | def setRecord(newRecord: WarcRecord): Unit = { 17 | record = newRecord; 18 | } 19 | 20 | /** Appends the current record to a [[DataOutput]] stream. */ 21 | override def write(out: DataOutput): Unit = { 22 | // TODO: impl (not neccessary for current use case) 23 | // if (record != null) record.write(out); 24 | } 25 | 26 | /** Parses a [[WarcRecord]] out of a [[DataInput]] stream, and make it the current record. 27 | */ 28 | override def readFields(in: DataInput): Unit = { 29 | // TODO: impl (not neccessary for current use case) 30 | // record = new WarcRecord(in); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /lib/src/test/resources/docs/links.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Title 6 | 7 | 8 |
9 | 10 |
11 |
12 | 画像リンク 13 |
14 |
15 |

 

16 |
17 | 18 | -------------------------------------------------------------------------------- /lib/src/test/resources/docs/paragraph_detect.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | こんにちは 6 |
7 | 早稲田大学で 8 |
9 | 自然言語処理 10 |
11 | を 12 |
13 | 勉強する。 14 |
15 | -------------------------------------------------------------------------------- /lib/src/test/resources/lang/shift_jis.txt: -------------------------------------------------------------------------------- 1 | HTTP/1.0 200 OK 2 | Date: Sun, 26 May 2013 08:11:12 GMT 3 | Content-Length: 4186 4 | Last-Modified: Sat, 29 Dec 2012 16:50:56 GMT 5 | Accept-Ranges: bytes 6 | Content-Type: text/html 7 | Connection: close 8 | Server: Apache 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /lib/src/test/resources/pipeline/doc_len.conf: -------------------------------------------------------------------------------- 1 | filters: [ 2 | {"class": "DocLength", "low": 5} 3 | ] -------------------------------------------------------------------------------- /lib/src/test/scala/com/worksap/nlp/uzushio/lib/cleaning/DocumentSpec.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.cleaning 2 | 3 | import org.scalatest.freespec.AnyFreeSpec 4 | 5 | class DocumentSpec extends AnyFreeSpec { 6 | "Document" - { 7 | "computes next double correctly" in { 8 | val docs = (1 to 1000).map(i => Document(Vector.empty, docId = ('a' + i).toChar.toString)) 9 | val doubles = docs.map(_.randomDouble) 10 | for (d <- doubles) { 11 | assert(d < 1.0) 12 | } 13 | assert(doubles.distinct.size == 1000) 14 | val sum = doubles.sum 15 | assert((sum - 500).abs < 2) 16 | } 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /lib/src/test/scala/com/worksap/nlp/uzushio/lib/cleaning/ParagraphSpec.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.cleaning 2 | 3 | import org.scalatest.freespec.AnyFreeSpec 4 | 5 | class ParagraphSpec extends AnyFreeSpec { 6 | "Paragraph" - { 7 | "can return css selector strings" in { 8 | val par = Paragraph("body>p.text", "hello") 9 | assert(par.cssPath == Seq(PathSegment("body", null, Nil), PathSegment("p", null, Seq("text")))) 10 | } 11 | 12 | "can return designated tags in path without css selector" in { 13 | val par = Paragraph("body>p.text", "hello") 14 | assert(par.firstMatchingTag(Seq("p", "span")) == Some(PathSegment("p", null, Seq("text")))) 15 | } 16 | 17 | "do not return designated tags in path" in { 18 | val par = Paragraph("body>p.text", "hello") 19 | assert(par.firstMatchingTag(Seq("span")) == None) 20 | } 21 | 22 | "can return true if the paragraph contains designated tags" in { 23 | val par = Paragraph("body>p.text", "hello") 24 | assert(par.containsTags(Seq("p", "span"))) 25 | } 26 | 27 | "do not return true if the paragraph does not contain designated tags" in { 28 | val par = Paragraph("body>p.text", "hello") 29 | assert(!par.containsTags(Seq("span"))) 30 | } 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /lib/src/test/scala/com/worksap/nlp/uzushio/lib/cleaning/PathSegmentSpec.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.cleaning 2 | 3 | import org.scalatest.freespec.AnyFreeSpec 4 | 5 | class PathSegmentSpec extends AnyFreeSpec{ 6 | "PathSelector" - { 7 | "parses selector without classes or id" in { 8 | val sel = PathSegment.parse("test") 9 | assert(sel.tag == "test") 10 | assert(sel.id == null) 11 | assert(sel.classes.isEmpty) 12 | assert(sel.toString == "test") 13 | } 14 | 15 | "parses 
selector without classes and with id" in { 16 | val sel = PathSegment.parse("test#id") 17 | assert(sel.tag == "test") 18 | assert(sel.id == "id") 19 | assert(sel.classes.isEmpty) 20 | assert(sel.toString == "test#id") 21 | } 22 | 23 | "parses selector with one class and without id" in { 24 | val sel = PathSegment.parse("test.clz1") 25 | assert(sel.tag == "test") 26 | assert(sel.id == null) 27 | assert(sel.classes == Seq("clz1")) 28 | assert(sel.toString == "test.clz1") 29 | } 30 | 31 | "parses selector with two classes and without id" in { 32 | val sel = PathSegment.parse("test.clz1.clz2") 33 | assert(sel.tag == "test") 34 | assert(sel.id == null) 35 | assert(sel.classes == Seq("clz1", "clz2")) 36 | assert(sel.toString == "test.clz1.clz2") 37 | } 38 | 39 | "parses selector with two classes and with id" in { 40 | val sel = PathSegment.parse("test.clz1.clz2#id") 41 | assert(sel.tag == "test") 42 | assert(sel.id == "id") 43 | assert(sel.classes == Seq("clz1", "clz2")) 44 | assert(sel.toString == "test.clz1.clz2#id") 45 | } 46 | 47 | "parses selector with two classes and with id inside other string" in { 48 | val sel = PathSegment.parse("foo test.clz1.clz2#id test.clz#id2", 4, 21) 49 | assert(sel.tag == "test") 50 | assert(sel.id == "id") 51 | assert(sel.classes == Seq("clz1", "clz2")) 52 | assert(sel.toString == "test.clz1.clz2#id") 53 | } 54 | 55 | "parses path of two elements" in { 56 | val path = PathSegment.parsePath("body>li.test") 57 | assert(path.size == 2) 58 | assert(path(0).tag == "body") 59 | assert(path(1).tag == "li") 60 | assert(path(1).classes == Seq("test")) 61 | } 62 | } 63 | 64 | } 65 | -------------------------------------------------------------------------------- /lib/src/test/scala/com/worksap/nlp/uzushio/lib/cleaning/PipelineSpec.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.cleaning 2 | 3 | import com.typesafe.config.ConfigFactory 4 | import com.worksap.nlp.uzushio.lib.filters.WordInstances 5 | import com.worksap.nlp.uzushio.lib.filters.base.DocFilter 6 | import org.scalatest.freespec.AnyFreeSpec 7 | 8 | case class TestFilter(test: String) extends DocFilter { 9 | override def checkDocument(doc: Document): Document = Document(Paragraph("", test)) 10 | } 11 | 12 | class PipelineSpec extends AnyFreeSpec { 13 | "Pipeline" - { 14 | "can instantiate class fully specified" in { 15 | val cfg = ConfigFactory.parseString( 16 | """{class: WordInstances, list: "ng_words.txt", minimum: 3}""" 17 | ) 18 | val filter = Pipeline.instantiateFilter(cfg) 19 | assert(filter != null) 20 | assert(filter.isInstanceOf[WordInstances]) 21 | } 22 | 23 | "can instantiate class with default value" in { 24 | val cfg = ConfigFactory.parseString( 25 | """{class: WordInstances, list: "ng_words.txt"}""" 26 | ) 27 | val filter = Pipeline.instantiateFilter(cfg) 28 | assert(filter != null) 29 | assert(filter.isInstanceOf[WordInstances]) 30 | } 31 | 32 | "can instantiate pipeline from classpath" - { 33 | val pipeline = Pipeline.make("doc_len.conf", ConfigFactory.empty()) 34 | assert(pipeline != null) 35 | } 36 | 37 | "can instantiate filter with props" - { 38 | val cfg = ConfigFactory.parseString( 39 | """filters: [ 40 | {class: "com.worksap.nlp.uzushio.lib.cleaning.TestFilter", test: ${a} } 41 | ]""" 42 | ) 43 | val props = ConfigFactory.parseString("""a: value""") 44 | val pipeline = Pipeline.make(cfg, props) 45 | val result = pipeline.applyFilters(Document()) 46 | assert(result.paragraphs.length == 1) 47 | 
assert(result.paragraphs.head.text == "value") 48 | } 49 | } 50 | 51 | } 52 | -------------------------------------------------------------------------------- /lib/src/test/scala/com/worksap/nlp/uzushio/lib/dupes/CandidateRowProcessorSpec.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.dupes 2 | 3 | import com.worksap.nlp.uzushio.lib.runners.{ 4 | CandidateRowProcessor, 5 | DuplicateCandidateRow 6 | } 7 | import com.worksap.nlp.uzushio.lib.stats.{NgramHashExtractor, SimHashProcessor} 8 | import org.apache.spark.sql.catalyst.expressions.XXH64 9 | import org.apache.spark.unsafe.types.UTF8String 10 | import org.scalatest.freespec.AnyFreeSpec 11 | 12 | object RowCandidate { 13 | private val ngram = new NgramHashExtractor(2, 4) 14 | private val simhasher = new SimHashProcessor(128) 15 | 16 | def apply(x: String): DuplicateCandidateRow = { 17 | val utf8Str = UTF8String.fromString(x) 18 | val hash = XXH64.hashUTF8String(utf8Str, 42L) 19 | val simhashState = simhasher.init 20 | simhasher.update(simhashState, x, ngram) 21 | DuplicateCandidateRow( 22 | x, 23 | simhasher.result(simhashState), 24 | 1, 25 | hash, 26 | hash 27 | ) 28 | } 29 | } 30 | 31 | class CandidateRowProcessorSpec extends AnyFreeSpec { 32 | "stuff (1) is processed correctly" in { 33 | val pars = Seq( 34 | RowCandidate("docomo STYLE series N-01C"), 35 | RowCandidate("docomo STYLE series SH-03E"), 36 | RowCandidate("4位docomo STYLE series N-01E"), 37 | RowCandidate("5位docomo STYLE series N-03D") 38 | ) 39 | val proc = new CandidateRowProcessor(1024 * 1024, 70, pars.iterator) 40 | val result = proc.toArray 41 | assert(result.length == 4) 42 | assert(result.map(_.reprHash).toSet.size == 1) 43 | } 44 | 45 | "stuff (2) is processed correctly" in { 46 | val pars = Seq( 47 | RowCandidate("らくらくホン ベーシック3 [ゴールド]"), 48 | RowCandidate("らくらくホン ベーシック3 [ネイビー]"), 49 | RowCandidate("らくらくホン ベーシック3 [ピンク]"), 50 | RowCandidate("> らくらくホン ベーシック3 [ホワイト]"), 51 | RowCandidate("らくらくホン ベーシック3 [ホワイト]"), 52 | RowCandidate("らくらくホン ベーシック3 [ホワイト] のクチコミ掲示板") 53 | ) 54 | val proc = new CandidateRowProcessor(1024 * 1024, 70, pars.iterator) 55 | val result = proc.toArray 56 | assert(result.length == 6) 57 | assert(result.map(_.reprHash).toSet.size == 2) 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /lib/src/test/scala/com/worksap/nlp/uzushio/lib/filters/AdjacentDuplicateParagraphsSpec.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.filters 2 | 3 | import com.worksap.nlp.uzushio.lib.cleaning.{Document, Paragraph} 4 | import org.scalatest.freespec.AnyFreeSpec 5 | 6 | class AdjacentDuplicateParagraphsSpec extends AnyFreeSpec { 7 | "AdjacentDuplicateParagraphs" - { 8 | val filter = new AdjacentDuplicateParagraphs() 9 | "works with empty document" in { 10 | val filtered = filter.checkDocument(Document()) 11 | assert(filtered.paragraphs.isEmpty) 12 | } 13 | 14 | 15 | "filters out docs correctly" in { 16 | val doc = Document( 17 | Paragraph("", "test1"), 18 | Paragraph("", "test1"), 19 | Paragraph("", "test2"), 20 | ) 21 | val filtered = filter.checkDocument(doc) 22 | assert(filtered.paragraphs == Seq( 23 | Paragraph("", "test1"), 24 | Paragraph("", "test2"), 25 | )) 26 | } 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /lib/src/test/scala/com/worksap/nlp/uzushio/lib/filters/CompressionRateSpec.scala: 
-------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.filters 2 | 3 | import com.worksap.nlp.uzushio.lib.cleaning.{Document, Paragraph} 4 | import org.scalatest.freespec.AnyFreeSpec 5 | 6 | class CompressionRateSpec extends AnyFreeSpec { 7 | "CompressionRate" - { 8 | "correctly survives serialization" in { 9 | val doc = Document(Array(Paragraph("", "test1 test2"))) 10 | val f1 = new CompressionRate(0.1f, 1.2f) 11 | val b1 = f1.encodeDocContent(doc) 12 | val f2 = cloneViaSerialization(f1) 13 | val b2 = f2.encodeDocContent(doc) 14 | assert(!(b1 eq b2)) 15 | assert(!(f1.High eq f2.High)) 16 | assert(!(f1 eq f2)) 17 | assert(f1.toString == f2.toString) 18 | } 19 | 20 | "computes ratio" in { 21 | val f1 = new CompressionRate(0.1f, 1.2f) 22 | val doc = testDoc("test1test1", "test2test2", "test5test5") 23 | val ratio = f1.compressionRatio(doc) 24 | assert(ratio < 1) 25 | } 26 | } 27 | 28 | } 29 | -------------------------------------------------------------------------------- /lib/src/test/scala/com/worksap/nlp/uzushio/lib/filters/DeduplicateDocumentsSpec.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.filters 2 | 3 | import com.worksap.nlp.uzushio.lib.cleaning.Document 4 | import javax.swing.tree.FixedHeightLayoutCache 5 | import org.scalatest.freespec.AnyFreeSpec 6 | 7 | 8 | class FixedProbRandomGenerator( 9 | val returnProb: Double = 0.5 10 | ) extends RandomGeneratorFromStringBase { 11 | def generateRandom(docId: String): Double = returnProb 12 | } 13 | 14 | 15 | class DeduplicateDocumentsSpec extends AnyFreeSpec { 16 | def generateFilter(returnProb: Double): DeduplicateDocuments = { 17 | val randomGenerator = new FixedProbRandomGenerator(returnProb) 18 | new DeduplicateDocuments(100, randomGenerator) 19 | } 20 | 21 | "DeduplicateDocumentsSpec" - { 22 | val filter = generateFilter(0.5) 23 | 24 | "computes correct ratio for non-duplicated documents" in { 25 | val paragraphs = testParagraphs( 26 | Seq("test", "test", "test", "test"), 27 | Seq(1, 1, 1, 1) 28 | ) 29 | val doc = Document(paragraphs, "test") 30 | assert(0.0f == filter.computeNearDuplicateTextRatio(doc)) 31 | assert(false == filter.shouldRemoveDocument(doc)) 32 | } 33 | 34 | "computes correct ratio for non-duplicated documents (boundary)" in { 35 | val paragraphs = testParagraphs( 36 | Seq("test", "test", "test", "test"), 37 | Seq(1, 1, 99, 100) 38 | ) 39 | val doc = Document(paragraphs, "test") 40 | assert(0.5f > filter.computeNearDuplicateTextRatio(doc)) 41 | assert(false == filter.shouldRemoveDocument(doc)) 42 | } 43 | 44 | "computes correct ratio for duplicated documents" in { 45 | val paragraphs = testParagraphs( 46 | Seq("test", "test", "test", "test"), 47 | Seq(100, 100, 100, 100) 48 | ) 49 | val doc = Document(paragraphs, "test") 50 | assert(1.0f == filter.computeNearDuplicateTextRatio(doc)) 51 | assert(true == filter.shouldRemoveDocument(doc)) 52 | } 53 | 54 | "computes correct ratio for duplicated documents (boundary)" in { 55 | val paragraphs = testParagraphs( 56 | Seq("test", "test", "test", "test"), 57 | Seq(1, 1, 100, 100) 58 | ) 59 | val doc = Document(paragraphs, "test") 60 | assert(0.5f == filter.computeNearDuplicateTextRatio(doc)) 61 | assert(true == filter.shouldRemoveDocument(doc)) 62 | } 63 | } 64 | } -------------------------------------------------------------------------------- /lib/src/test/scala/com/worksap/nlp/uzushio/lib/filters/LinkCharRatioSpec.scala: 
-------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.filters 2 | 3 | import com.worksap.nlp.uzushio.lib.utils.Paragraphs 4 | import org.scalatest.freespec.AnyFreeSpec 5 | 6 | class LinkCharRatioSpec extends AnyFreeSpec { 7 | def a(x: String): String = 8 | s"${Paragraphs.HTML_LINK_START}$x${Paragraphs.HTML_LINK_END}" 9 | 10 | "LinkCharRatio" - { 11 | val filter = new LinkCharRatio() 12 | "computes correct ratio for empty document" in { 13 | val doc = testDoc("") 14 | assert(0.0f == filter.calcLinkCharRatio(doc)) 15 | } 16 | 17 | "computes correct ratio for non-empty document without links" in { 18 | val doc = testDoc("test") 19 | assert(0.0f == filter.calcLinkCharRatio(doc)) 20 | } 21 | 22 | "computes correct ratio for non-empty document with links" in { 23 | val doc = testDoc(s"test${a("baka")}") 24 | assert(0.5f == filter.calcLinkCharRatio(doc)) 25 | } 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /lib/src/test/scala/com/worksap/nlp/uzushio/lib/filters/MarkdownizeHeadingSpec.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.filters 2 | 3 | import com.worksap.nlp.uzushio.lib.cleaning.Paragraph 4 | import org.scalatest.freespec.AnyFreeSpec 5 | 6 | class MarkdownizeHeadingSpec extends AnyFreeSpec { 7 | "MarkdownizeHeading" - { 8 | val filter = new MarkdownizeHeading() 9 | 10 | "do no operation for empty paragraph" in { 11 | val p = Paragraph("body>p.text", "") 12 | assert("" == filter.checkParagraph(p).text) 13 | } 14 | 15 | "do no operation for no heading paragraph" in { 16 | val p = Paragraph("body>p.text", "test") 17 | assert("test" == filter.checkParagraph(p).text) 18 | } 19 | 20 | "add markdown heading symbol for h1 paragraph" in { 21 | val p = Paragraph("body>h1.text", "test") 22 | assert("# test" == filter.checkParagraph(p).text) 23 | } 24 | 25 | "add markdown heading symbol for h2 paragraph" in { 26 | val p = Paragraph("body>h2.text", "test") 27 | assert("## test" == filter.checkParagraph(p).text) 28 | } 29 | 30 | "add markdown heading symbol for h3 paragraph" in { 31 | val p = Paragraph("body>h3.text", "test") 32 | assert("### test" == filter.checkParagraph(p).text) 33 | } 34 | 35 | "add markdown heading symbol for h4 paragraph" in { 36 | val p = Paragraph("body>h4.text", "test") 37 | assert("#### test" == filter.checkParagraph(p).text) 38 | } 39 | 40 | "add markdown heading symbol for h5 paragraph" in { 41 | val p = Paragraph("body>h5.text", "test") 42 | assert("##### test" == filter.checkParagraph(p).text) 43 | } 44 | 45 | "add markdown heading symbol for h6 paragraph" in { 46 | val p = Paragraph("body>h6.text", "test") 47 | assert("###### test" == filter.checkParagraph(p).text) 48 | } 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /lib/src/test/scala/com/worksap/nlp/uzushio/lib/filters/NoContentDOMSpec.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.filters 2 | 3 | import com.worksap.nlp.uzushio.lib.cleaning.Paragraph 4 | import org.scalatest.freespec.AnyFreeSpec 5 | 6 | class NoContentDOMSpec extends AnyFreeSpec { 7 | "NoContentDOM" - { 8 | val filter = new NoContentDOM() 9 | 10 | "do no operation for paragraph in tag that be able to have content" in { 11 | val p = Paragraph("body>article>p", "text") 12 | assert(filter.checkParagraph(p).remove == null) 
13 | } 14 | 15 | "sign remove for header tag paragraph" in { 16 | val p = Paragraph("body>header>p", "test") 17 | assert(filter.checkParagraph(p).remove != null) 18 | } 19 | 20 | "sign remove for footer tag paragraph" in { 21 | val p = Paragraph("body>footer>p", "test") 22 | assert(filter.checkParagraph(p).remove != null) 23 | } 24 | 25 | "sign remove for aside tag paragraph" in { 26 | val p = Paragraph("body>aside>p", "test") 27 | assert(filter.checkParagraph(p).remove != null) 28 | } 29 | 30 | "sign remove for nav tag paragraph" in { 31 | val p = Paragraph("body>nav>p", "test") 32 | assert(filter.checkParagraph(p).remove != null) 33 | } 34 | 35 | "sign remove for noscript tag paragraph" in { 36 | val p = Paragraph("body>noscript", "test") 37 | assert(filter.checkParagraph(p).remove != null) 38 | } 39 | 40 | "sign remove for form tag paragraph" in { 41 | val p = Paragraph("body>form", "test") 42 | assert(filter.checkParagraph(p).remove != null) 43 | } 44 | 45 | "sign remove for div tag with header class paragraph" in { 46 | val p = Paragraph("body>div.header>p", "test") 47 | assert(filter.checkParagraph(p).remove != null) 48 | } 49 | 50 | "sign remove for div tag with header id paragraph" in { 51 | val p = Paragraph("body>div#header>p", "test") 52 | assert(filter.checkParagraph(p).remove != null) 53 | } 54 | 55 | "sign remove for div tag with header-test id paragraph" in { 56 | val p = Paragraph("body>div#header-test>p", "test") 57 | assert(filter.checkParagraph(p).remove != null) 58 | } 59 | 60 | "sign remove for div tag with breadcrumbs-test id paragraph" in { 61 | val p = Paragraph("body>div.breadcrumbs-test>p", "test") 62 | assert(filter.checkParagraph(p).remove != null) 63 | } 64 | 65 | "sign remove for div tag with widget_wrapper id paragraph" in { 66 | val p = Paragraph("body>div#widget_wrapper>p", "test") 67 | assert(filter.checkParagraph(p).remove != null) 68 | } 69 | 70 | "sign remove for div tag with testLogo id paragraph" in { 71 | val p = Paragraph("body>div#testLogo>p", "test") 72 | assert(filter.checkParagraph(p).remove != null) 73 | } 74 | 75 | "sign remove for div tag with headerTop id paragraph" in { 76 | val p = Paragraph("body>div#headerTop>p", "test") 77 | assert(filter.checkParagraph(p).remove != null) 78 | } 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /lib/src/test/scala/com/worksap/nlp/uzushio/lib/filters/WordInstancesSpec.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.filters 2 | 3 | import org.scalatest.freespec.AnyFreeSpec 4 | 5 | class WordInstancesSpec extends AnyFreeSpec { 6 | "WordInstances" - { 7 | "hojichar - adult" - { 8 | val filter = new WordInstances("hojichar/adult_keywords_ja.txt") 9 | "can score single paragraph document" in { 10 | val doc = testDoc("18禁 20禁 21禁") 11 | val score = filter.scoreDocument(doc) 12 | assert(score == 3.0f) 13 | } 14 | } 15 | 16 | 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /lib/src/test/scala/com/worksap/nlp/uzushio/lib/filters/package.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib 2 | 3 | import com.worksap.nlp.uzushio.lib.cleaning.{Document, Paragraph} 4 | import com.worksap.nlp.uzushio.lib.filters.base.FilterBase 5 | 6 | import java.io.{ 7 | ByteArrayInputStream, 8 | ByteArrayOutputStream, 9 | ObjectInputStream, 10 | ObjectOutputStream 11 | } 12 | 
import scala.annotation.varargs 13 | 14 | package object filters { 15 | def cloneViaSerialization[T <: FilterBase](f: T): T = { 16 | val bytes = new ByteArrayOutputStream() 17 | val str = new ObjectOutputStream(bytes) 18 | str.writeObject(f) 19 | str.flush() 20 | val data = bytes.toByteArray 21 | val binput = new ByteArrayInputStream(data) 22 | val istr = new ObjectInputStream(binput) 23 | val obj = istr.readObject() 24 | f.getClass.cast(obj) 25 | } 26 | 27 | def testDoc(data: String*): Document = { 28 | Document( 29 | data.map { text => 30 | Paragraph("", text) 31 | }.toIndexedSeq 32 | ) 33 | } 34 | 35 | def testParagraphs(texts: Seq[String], nearFreqs: Seq[Int] = Seq(), exactFreqs: Seq[Int] = Seq(), paths: Seq[String] = Seq()): IndexedSeq[Paragraph] = { 36 | require(texts.length == nearFreqs.length || nearFreqs.isEmpty) 37 | require(texts.length == exactFreqs.length || exactFreqs.isEmpty) 38 | require(texts.length == paths.length || paths.isEmpty) 39 | 40 | val nearFreqs_ = if (nearFreqs.nonEmpty) nearFreqs else Seq.fill(texts.length)(1) 41 | val exactFreqs_ = if (exactFreqs.nonEmpty) exactFreqs else Seq.fill(texts.length)(1) 42 | val paths_ = if (paths.nonEmpty) paths else 0.to(texts.length).map(_ => "body>p.text") 43 | 44 | texts 45 | .zip(nearFreqs_) 46 | .zip(exactFreqs_) 47 | .zip(paths_) 48 | .map { case (((text, nearFreq), exactFreq), path) => Paragraph(path, text, 0, exactFreq, nearFreq) } 49 | .toIndexedSeq 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /lib/src/test/scala/com/worksap/nlp/uzushio/lib/html/HtmlParserSpec.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.html 2 | 3 | import com.worksap.nlp.uzushio.lib.html.HtmlParserSpec.RichByteArray 4 | import com.worksap.nlp.uzushio.lib.utils.ClasspathAccess 5 | import com.worksap.nlp.uzushio.lib.warc.{WarcEntryParser, WarcRecord} 6 | import org.apache.hadoop.fs.Path 7 | import org.archive.io.{ArchiveRecord, ArchiveRecordHeader} 8 | import org.scalatest.freespec.AnyFreeSpec 9 | 10 | import java.io.ByteArrayInputStream 11 | import java.nio.charset.StandardCharsets 12 | import java.util 13 | import scala.collection.mutable.ArrayBuffer 14 | 15 | class HtmlParserSpec extends AnyFreeSpec with ClasspathAccess { 16 | "html parsing" - { 17 | "works with small document" in { 18 | val processor = new WarcEntryParser 19 | val data = classpathBytes("docs/perldoc_ja_small.html") 20 | val paragraphs = processor.parseHtml(data.warc, 0, StandardCharsets.UTF_8) 21 | assert(paragraphs.length == 26) 22 | } 23 | 24 | "correct paragraph detection" in { 25 | val processor = new WarcEntryParser 26 | val data = classpathBytes("docs/paragraph_detect.html") 27 | val paragraphs = processor.parseHtml(data.warc, 0, StandardCharsets.UTF_8) 28 | assert( 29 | paragraphs == Seq( 30 | "body>div.containerこんにちは", 31 | "body>div.container>div#12345早稲田大学で", 32 | "body>div.container>div#12345>div自然言語処理", 33 | "body>div.container>div#12345を", 34 | "body>div.container勉強する。" 35 | ) 36 | ) 37 | } 38 | 39 | "empty paragraphs are ignored" in { 40 | val processor = new WarcEntryParser 41 | val data = classpathBytes("docs/links.html") 42 | val paragraphs = processor.parseHtml(data.warc, 0, StandardCharsets.UTF_8) 43 | assert( 44 | paragraphs == Seq( 45 | "body>div画像リンク" 46 | ) 47 | ) 48 | } 49 | } 50 | } 51 | 52 | object HtmlParserSpec { 53 | implicit class RichByteArray(val x: Array[Byte]) extends AnyVal { 54 | def warc: WarcRecord = new 
WarcRecord( 55 | new ArchiveRecord( 56 | new ByteArrayInputStream(x), 57 | new ArchiveRecordHeader { 58 | override def getDate: String = ??? 59 | 60 | override def getLength: Long = x.length 61 | override def getContentLength: Long = x.length 62 | 63 | override def getUrl: String = ??? 64 | 65 | override def getMimetype: String = ??? 66 | 67 | override def getVersion: String = ??? 68 | override def getOffset: Long = 0 69 | override def getHeaderValue(key: String): AnyRef = "none" 70 | override def getHeaderFieldKeys: util.Set[String] = ??? 71 | override def getHeaderFields: util.Map[String, AnyRef] = { 72 | val res = new util.HashMap[String, AnyRef]() 73 | res 74 | } 75 | override def getReaderIdentifier: String = ??? 76 | override def getRecordIdentifier: String = ??? 77 | override def getDigest: String = ??? 78 | override def getContentBegin: Int = ??? 79 | }, 80 | 0, 81 | false, 82 | false 83 | ) {}, 84 | new Path("file:///dev/mem") 85 | ) 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /lib/src/test/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimationSpec.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.lang 2 | 3 | import com.worksap.nlp.uzushio.lib.utils.ClasspathAccess 4 | import org.scalatest.freespec.AnyFreeSpec 5 | 6 | class LangEstimationSpec extends AnyFreeSpec with ClasspathAccess { 7 | "LangEstimation" - { 8 | val sniffer = new LangTagSniffer() 9 | "sniffs charset shift_jis fragment" in { 10 | val data = classpathBytes("lang/shift_jis.txt") 11 | val tags = sniffer.sniffTags(data, 0, data.length) 12 | assert("Shift-JIS" == tags.charset) 13 | } 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /lib/src/test/scala/com/worksap/nlp/uzushio/lib/utils/ClasspathAccess.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.utils 2 | 3 | import org.apache.commons.io.IOUtils 4 | 5 | trait ClasspathAccess { 6 | def classpathBytes(name: String): Array[Byte] = { 7 | val resource = getClass.getClassLoader.getResource(name) 8 | IOUtils.toByteArray(resource) 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /lib/src/test/scala/com/worksap/nlp/uzushio/lib/utils/ParagraphsSpec.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.utils 2 | 3 | import org.scalatest.freespec.AnyFreeSpec 4 | 5 | class ParagraphsSpec extends AnyFreeSpec { 6 | "correctly splits paragraphs" in { 7 | val doc = "test1\n\ntest2\ntest3" 8 | val pars = Paragraphs.extractCleanParagraphs(doc) 9 | assert(pars.length == 2) 10 | assert(pars == Seq("test1", "test2\ntest3")) 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /lib/src/test/scala/com/worksap/nlp/uzushio/lib/utils/RowBufferSpec.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.utils 2 | 3 | import org.scalatest.freespec.AnyFreeSpec 4 | 5 | class RowBufferSpec extends AnyFreeSpec { 6 | "RowBuffer" - { 7 | "single item can be deleted" in { 8 | val buf = new RowBuffer[Int]() 9 | buf.addToBuffer(5) 10 | assert(buf.size() == 1) 11 | val item = buf.removeElementAt(0) 12 | assert(item == 5) 13 | assert(buf.size() == 0) 14 | 
assertThrows[IllegalArgumentException](buf.removeElementAt(0)) 15 | } 16 | 17 | "removing item with invalid index throws an exception" in { 18 | val buf = new RowBuffer[Int]() 19 | buf.addToBuffer(5) 20 | assert(buf.size() == 1) 21 | assertThrows[IllegalArgumentException](buf.removeElementAt(1)) 22 | } 23 | 24 | "works when removing last item of two" in { 25 | val buf = new RowBuffer[Int]() 26 | buf.addToBuffer(2) 27 | buf.addToBuffer(3) 28 | assert(buf.size() == 2) 29 | assert(buf.removeElementAt(1) == 3) 30 | assert(buf.size() == 1) 31 | assert(buf.get(0) == 2) 32 | } 33 | 34 | "works when removing first item of two" in { 35 | val buf = new RowBuffer[Int]() 36 | buf.addToBuffer(2) 37 | buf.addToBuffer(3) 38 | assert(buf.size() == 2) 39 | assert(buf.removeElementAt(0) == 3) 40 | assert(buf.size() == 1) 41 | assert(buf.get(0) == 3) 42 | } 43 | 44 | "works when removing first item of three" in { 45 | val buf = new RowBuffer[Int]() 46 | buf.addToBuffer(2) 47 | buf.addToBuffer(3) 48 | buf.addToBuffer(4) 49 | assert(buf.size() == 3) 50 | assert(buf.removeElementAt(0) == 4) 51 | assert(buf.size() == 2) 52 | assert(buf.get(0) == 4) 53 | assert(buf.get(1) == 3) 54 | } 55 | 56 | "works when removing second item of three" in { 57 | val buf = new RowBuffer[Int]() 58 | buf.addToBuffer(2) 59 | buf.addToBuffer(3) 60 | buf.addToBuffer(4) 61 | assert(buf.size() == 3) 62 | assert(buf.removeElementAt(1) == 4) 63 | assert(buf.size() == 2) 64 | assert(buf.get(0) == 2) 65 | assert(buf.get(1) == 4) 66 | } 67 | 68 | "works when removing third item of three" in { 69 | val buf = new RowBuffer[Int]() 70 | buf.addToBuffer(2) 71 | buf.addToBuffer(3) 72 | buf.addToBuffer(4) 73 | assert(buf.size() == 3) 74 | assert(buf.removeElementAt(2) == 4) 75 | assert(buf.size() == 2) 76 | assert(buf.get(0) == 2) 77 | assert(buf.get(1) == 3) 78 | } 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /lib/src/test/scala/com/worksap/nlp/uzushio/lib/utils/SentenceIteratorSpec.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.utils 2 | 3 | import org.scalatest.freespec.AnyFreeSpec 4 | 5 | class SentenceIteratorSpec extends AnyFreeSpec { 6 | "SentenceIterator" - { 7 | "indexOf" - { 8 | "returns correct value for a simple case" in { 9 | val seq = "this。 is a test" 10 | assert(4 == SentenceIterator.indexOfSeparator(seq, 0, seq.length)) 11 | } 12 | 13 | "works with empty string" in { 14 | val seq = "" 15 | assert(-1 == SentenceIterator.indexOfSeparator(seq, 0, seq.length)) 16 | } 17 | 18 | "works with last index of a string" in { 19 | val seq = "test" 20 | assert(-1 == SentenceIterator.indexOfSeparator(seq, 4, seq.length)) 21 | } 22 | 23 | "works with not last index of a string not containing required characters" in { 24 | val seq = "test" 25 | assert(-1 == SentenceIterator.indexOfSeparator(seq, 2, seq.length)) 26 | } 27 | } 28 | 29 | "produces correct sequence of sentences" in { 30 | val iter = new SentenceIterator("this。 is a test", 1024) 31 | assert(Seq("this。", " is a test") == iter.toSeq) 32 | } 33 | 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /lib/src/test/scala/com/worksap/nlp/uzushio/lib/utils/TrieSpec.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.utils 2 | 3 | import org.scalatest.freespec.AnyFreeSpec 4 | 5 | class TrieSpec extends AnyFreeSpec { 6 | "TrieNode" - { 7 | 
"can be created" in { 8 | val trie = TrieNode.make(Seq("test", "tfst", "fist")) 9 | assert(trie != null) 10 | } 11 | 12 | "can find strings" in { 13 | val trie = TrieNode.make(Seq("test", "tfst", "fist")) 14 | assert(SearchResult(4, 0) == trie.findLongest("testing", 0)) 15 | assert(SearchResult(4, 0) == trie.findLongest("testtfst", 0)) 16 | assert(SearchResult(8, 1) == trie.findLongest("testtfst", 4)) 17 | assert(SearchResult(4, 1) == trie.findLongest("tfsttest", 0)) 18 | assert(SearchResult.empty() == trie.findLongest("tfest", 0)) 19 | } 20 | 21 | "finds a longest substring" in { 22 | val trie = TrieNode.make(Seq("ab", "abc", "abcd")) 23 | assert(SearchResult(2, 0) == trie.findLongest("abed", 0)) 24 | assert(SearchResult(2, 0) == trie.findLongest("abecd", 0)) 25 | assert(SearchResult(4, 2) == trie.findLongest("abcdf", 0)) 26 | assert(SearchResult(3, 1) == trie.findLongest("abcfd", 0)) 27 | } 28 | } 29 | 30 | "SearchResult" - { 31 | "has correct fields for (0, 0)" in { 32 | val sr = SearchResult(0, 0) 33 | assert(0 == sr.end) 34 | assert(0 == sr.index) 35 | } 36 | 37 | "has correct fields for (1, 1)" in { 38 | val sr = SearchResult(1, 1) 39 | assert(1 == sr.end) 40 | assert(1 == sr.index) 41 | } 42 | 43 | "has correct fields for (100, 5000)" in { 44 | val sr = SearchResult(100, 5000) 45 | assert(100 == sr.end) 46 | assert(5000 == sr.index) 47 | } 48 | 49 | "has correct fields for (-1, -1)" in { 50 | val sr = SearchResult(-1, -1) 51 | assert(-1 == sr.end) 52 | assert(-1 == sr.index) 53 | } 54 | 55 | "has correct fields for (-100, -100)" in { 56 | val sr = SearchResult(-5, -100) 57 | assert(-5 == sr.end) 58 | assert(-100 == sr.index) 59 | } 60 | 61 | "has correct toString" in { 62 | assert(SearchResult(0, 0).toString == "SearchResult(0, 0)") 63 | assert(SearchResult(1, 1).toString == "SearchResult(1, 1)") 64 | } 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /lib/src/test/scala/com/worksap/nlp/uzushio/lib/warc/WarcEntryParserSpec.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.warc 2 | 3 | import com.worksap.nlp.uzushio.lib.utils.ClasspathAccess 4 | import org.scalatest.freespec.AnyFreeSpec 5 | 6 | import java.util.UUID 7 | 8 | class WarcEntryParserSpec extends AnyFreeSpec with ClasspathAccess { 9 | "WarcEntryParser" - { 10 | val parser = new WarcEntryParser() 11 | "parses http header" in { 12 | val data = classpathBytes("lang/shift_jis.txt") 13 | val parsed = parser.parseHttpHeader(data) 14 | assert(parsed.isDefined) 15 | val Some((message, offset)) = parsed 16 | assert(offset == 197) 17 | assertResult("text/html")(message.getHeader("Content-Type").getValue) 18 | val date = WarcEntryParser.resolveEarliestDate("", message) 19 | assert("2012-12-29T16:50:56" == date) 20 | } 21 | 22 | "parses UUID" - { 23 | "" in { 24 | val uuid = WarcEntryParser.parseWarcUuid( 25 | "" 26 | ) 27 | assert(uuid == "f1a9564a-ae00-40ef-838e-a4486a83fd1d") 28 | } 29 | } 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.9.4 2 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "2.1.1") 2 | addSbtPlugin("org.scalameta" % 
"sbt-scalafmt" % "2.5.2") 3 | addSbtPlugin("pl.project13.scala" % "sbt-jmh" % "0.4.6") -------------------------------------------------------------------------------- /scripts/cal_overlap_ratio/README.md: -------------------------------------------------------------------------------- 1 | # Overlap ratio calculation 2 | 3 | ## Goal of the script 4 | To calculate the overlap ratio between multiple Common Crawl dumps. 5 | We define 3 types of overlap ratio: 6 | - `len(dump_1 & dump_2) / len(dump_1)` 7 | - `len(dump_1 & dump_2) / len(dump_2)` 8 | - `len(dump_1 & dump_2) / len(dump_1 | dump_2)` 9 | 10 | ## How to run the script 11 | If you add some new dumps and run again, the program will automatically skip the dump pairs that were already calculated. 12 | ``` 13 | python3 cal_overlap.py --dump_direc_path $dump_direc_path --output_path $output_path 14 | ``` 15 | 16 | ## How to visualize the results 17 | Use the script to process the outputed csv into a heat map figure. 18 | ``` 19 | python3 visualize.py --input_path $path_to/overlap.csv --output_path $output_path 20 | ``` -------------------------------------------------------------------------------- /scripts/count_filter_statistics.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from dataclasses import dataclass 3 | from pathlib import Path 4 | from multiprocessing.pool import ThreadPool 5 | import csv 6 | 7 | # Count filter statistics from the directory structure created by Uzushio 8 | # If tqdm is installed, show show a progress bar while processing. 9 | 10 | 11 | @dataclass 12 | class Args(object): 13 | input: list[Path] 14 | output: Path 15 | workers: int 16 | 17 | @staticmethod 18 | def parse() -> "Args": 19 | p = argparse.ArgumentParser() 20 | p.add_argument("--output", type=Path, required=True) 21 | p.add_argument("--workers", default=4, type=int) 22 | p.add_argument("input", type=Path, nargs="+") 23 | return Args(**vars(p.parse_args())) 24 | 25 | 26 | def directory_size(p: Path) -> int: 27 | # print(f"calcluating size of {p}") 28 | result = 0 29 | for p in p.iterdir(): 30 | result += p.stat().st_size 31 | return result 32 | 33 | 34 | def print_progress(data): 35 | try: 36 | from tqdm import tqdm 37 | except ImportError: 38 | return 39 | for v in tqdm(data.values()): 40 | v.wait() 41 | 42 | 43 | class Processor(object): 44 | def __init__(self, args: Args) -> None: 45 | self.args = args 46 | self.executor = ThreadPool(args.workers) 47 | 48 | def run(self): 49 | matrix = {} 50 | 51 | for input_dir in self.args.input: 52 | for child in input_dir.iterdir(): 53 | chname = child.name 54 | if not chname.startswith("segment="): 55 | continue 56 | segment = chname[8:] 57 | res = self.process_segment(segment, child) 58 | matrix.update(res) 59 | 60 | self.executor.close() 61 | 62 | filters = set() 63 | segments = set() 64 | 65 | for segment, filter in matrix.keys(): 66 | filters.add(filter) 67 | segments.add(segment) 68 | 69 | filters = sorted(filters) 70 | segments = sorted(segments) 71 | 72 | print_progress(matrix) 73 | 74 | self.executor.join() 75 | 76 | with self.args.output.open("wt", newline="\n") as of: 77 | wr = csv.writer(of) 78 | 79 | wr.writerow([""] + filters) 80 | 81 | for segment in segments: 82 | row = [segment] 83 | for filter in filters: 84 | v = matrix.get((segment, filter), None) 85 | if v is None: 86 | r = "" 87 | else: 88 | r = str(v.get()) 89 | row.append(r) 90 | wr.writerow(row) 91 | 92 | def process_segment(self, segment: str, segment_dir: Path): 93 | result = 
{} 94 | for child in segment_dir.iterdir(): 95 | chname = child.name 96 | if not chname.startswith("filter="): 97 | continue 98 | 99 | filter = chname[7:] 100 | result[(segment, filter)] = self.executor.apply_async( 101 | directory_size, [child] 102 | ) 103 | 104 | return result 105 | 106 | 107 | if __name__ == "__main__": 108 | args = Args.parse() 109 | p = Processor(args) 110 | p.run() 111 | -------------------------------------------------------------------------------- /scripts/count_tokens.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | import gzip 4 | from tqdm import tqdm 5 | from transformers import AutoTokenizer 6 | import os 7 | 8 | tokenizer = AutoTokenizer.from_pretrained("llm-jp/llm-jp-13b-v1.0") 9 | 10 | 11 | def count_tokens(input_file): 12 | num_tokens = 0 13 | compressed_size = os.path.getsize(input_file) 14 | 15 | with gzip.open(input_file, "rb") as f: 16 | for line in tqdm(f): 17 | example = json.loads(line) 18 | text = example["text"] 19 | tokens = tokenizer.encode(text) 20 | num_tokens += len(tokens) 21 | 22 | tokens_per_byte = num_tokens / compressed_size 23 | return num_tokens, tokens_per_byte 24 | 25 | 26 | if __name__ == "__main__": 27 | input_file = sys.argv[1] 28 | num_tokens, tokens_per_byte = count_tokens(input_file) 29 | print(f"Total number of tokens: {num_tokens}") 30 | print(f"Tokens per byte: {tokens_per_byte:.3f}") 31 | -------------------------------------------------------------------------------- /scripts/pipeline_01.conf: -------------------------------------------------------------------------------- 1 | filters: [ 2 | {"class": "DocLength", "low": 50}, 3 | {"class": "CompressionRate", "low": 0.15, "high": 1.0}, 4 | {"class": "CompressionRate", "low": 0.25, "high": 0.9}, 5 | {"class": "CompressionRate", "low": 0.35, "high": 0.8}, 6 | {"class": "CompressionRate", "low": 0.45, "high": 0.75}, 7 | {"class": "CompressionRate", "low": 0.55, "high": 0.7}, 8 | {"class": "HiraganaRatio", "low": 0.03, "high": 2.0}, 9 | {"class": "HiraganaRatio", "low": 0.05, "high": 2.0}, 10 | {"class": "HiraganaRatio", "low": 0.07, "high": 2.0}, 11 | {"class": "HiraganaRatio", "low": 0.1, "high": 2.0}, 12 | {"class": "HiraganaRatio", "low": 0.13, "high": 2.0}, 13 | {"class": "HiraganaRatio", "low": 0.15, "high": 2.0}, 14 | {"class": "LinkCharRatio", "low": 0, "high": 0.8}, 15 | {"class": "LinkCharRatio", "low": 0, "high": 0.7}, 16 | {"class": "LinkCharRatio", "low": 0, "high": 0.6}, 17 | {"class": "LinkCharRatio", "low": 0, "high": 0.5}, 18 | {"class": "LinkCharRatio", "low": 0, "high": 0.4}, 19 | {"class": "DeduplicateDocumentsPercentile", "expected": 2.5, "percentile": 0.1}, 20 | {"class": "DeduplicateDocumentsPercentile", "expected": 1.5, "percentile": 0.1}, 21 | {"class": "MergeListTag"}, 22 | {"class": "MarkdownizeHeading"}, 23 | {"class": "NoContentDOM"}, 24 | {"class": "LargeFreqParagraphs", "count": 3, "freq": 1000}, 25 | {"class": "LargeFreqParagraphs", "count": 3, "freq": 100}, 26 | {"class": "WordTypes", "threshold": 9, "kind": "uniq", "list": "hojichar/adult_keywords_ja.txt"}, 27 | {"class": "WordTypes", "threshold": 6, "kind": "uniq", "list": "hojichar/adult_keywords_ja.txt"}, 28 | {"class": "WordTypes", "threshold": 5, "kind": "uniq", "list": "hojichar/adult_keywords_ja.txt"}, 29 | {"class": "WordTypes", "threshold": 4, "kind": "uniq", "list": "hojichar/adult_keywords_ja.txt"}, 30 | {"class": "WordTypes", "threshold": 9, "kind": "uniq", "list": "hojichar/discriminations_keywords_ja.txt"}, 31 
| {"class": "WordTypes", "threshold": 6, "kind": "uniq", "list": "hojichar/discriminations_keywords_ja.txt"}, 32 | {"class": "WordTypes", "threshold": 4, "kind": "uniq", "list": "hojichar/discriminations_keywords_ja.txt"}, 33 | {"class": "DocLength", "low": 100}, 34 | {"class": "DocLength", "low": 150}, 35 | {"class": "DocLength", "low": 200}, 36 | ] -------------------------------------------------------------------------------- /scripts/pipeline_02.conf: -------------------------------------------------------------------------------- 1 | filters: [ 2 | {"class": "AdjacentDuplicateParagraphs"}, 3 | {"class": "DocLength", "low": 50}, 4 | {"class": "DeduplicateDocumentsPercentile", "expected": 2.5, "percentile": 0.1}, 5 | {"class": "DeduplicateDocumentsPercentile", "expected": 1.5, "percentile": 0.1}, 6 | {"class": "HiraganaRatio", "low": 0.03, "high": 2.0}, 7 | {"class": "HiraganaRatio", "low": 0.05, "high": 2.0}, 8 | {"class": "HiraganaRatio", "low": 0.07, "high": 2.0}, 9 | {"class": "HiraganaRatio", "low": 0.1, "high": 2.0}, 10 | {"class": "HiraganaRatio", "low": 0.13, "high": 2.0}, 11 | {"class": "HiraganaRatio", "low": 0.15, "high": 2.0}, 12 | {"class": "LinkCharRatio", "low": 0, "high": 0.8}, 13 | {"class": "LinkCharRatio", "low": 0, "high": 0.7}, 14 | {"class": "LinkCharRatio", "low": 0, "high": 0.6}, 15 | {"class": "LinkCharRatio", "low": 0, "high": 0.5}, 16 | {"class": "LinkCharRatio", "low": 0, "high": 0.4}, 17 | {"class": "MergeListTag"}, 18 | {"class": "MarkdownizeHeading"}, 19 | {"class": "NoContentDOM"}, 20 | {"class": "LargeFreqParagraphs", "count": 3, "freq": 1000}, 21 | {"class": "LargeFreqParagraphs", "count": 3, "freq": 100}, 22 | {"class": "KenLMParagraphPerplexity", "sudachi": ${sudachi}, "kenlm": ${kenlm}, outliers: 0.1, "count": 3, "threshold": 1e6}, 23 | {"class": "KenLMParagraphPerplexity", "sudachi": ${sudachi}, "kenlm": ${kenlm}, outliers: 0.1, "count": 2, "threshold": 5e6}, 24 | {"class": "CompressionRate", "low": 0.15, "high": 1.0}, 25 | {"class": "CompressionRate", "low": 0.25, "high": 0.9}, 26 | {"class": "CompressionRate", "low": 0.35, "high": 0.8}, 27 | {"class": "CompressionRate", "low": 0.45, "high": 0.75}, 28 | {"class": "CompressionRate", "low": 0.50, "high": 0.75}, 29 | {"class": "KenLMDocAvgPerplexity", "sudachi": ${sudachi}, "kenlm": ${kenlm}, outliers: 0.1, high: 1e6, low: 2}, 30 | {"class": "KenLMDocAvgPerplexity", "sudachi": ${sudachi}, "kenlm": ${kenlm}, outliers: 0.1, high: 5e5, low: 5}, 31 | {"class": "WordTypes", "threshold": 9, "kind": "uniq", "list": "hojichar/adult_keywords_ja.txt"}, 32 | {"class": "WordTypes", "threshold": 6, "kind": "uniq", "list": "hojichar/adult_keywords_ja.txt"}, 33 | # {"class": "WordTypes", "threshold": 5, "kind": "uniq", "list": "hojichar/adult_keywords_ja.txt"}, 34 | # {"class": "WordTypes", "threshold": 4, "kind": "uniq", "list": "hojichar/adult_keywords_ja.txt"}, 35 | {"class": "WordTypes", "threshold": 9, "kind": "uniq", "list": "hojichar/discriminations_keywords_ja.txt"}, 36 | # {"class": "WordTypes", "threshold": 6, "kind": "uniq", "list": "hojichar/discriminations_keywords_ja.txt"}, 37 | # {"class": "WordTypes", "threshold": 4, "kind": "uniq", "list": "hojichar/discriminations_keywords_ja.txt"}, 38 | {"class": "DocLength", "low": 100}, 39 | {"class": "DocLength", "low": 150}, 40 | {"class": "DocLength", "low": 200}, 41 | ] -------------------------------------------------------------------------------- /scripts/pipeline_03a.conf: -------------------------------------------------------------------------------- 
1 | filters: [ 2 | # {"class": "AdjacentDuplicateParagraphs"}, 3 | {"class": "DocLength", "low": 50}, 4 | {"class": "DeduplicateDocumentsPercentile", "expected": 5, "percentile": 0.05}, 5 | {"class": "HiraganaRatio", "low": 0.1, "high": 2.0}, 6 | {"class": "HiraganaRatio", "low": 0.15, "high": 2.0}, 7 | {"class": "LinkCharRatio", "low": 0, "high": 0.8}, 8 | {"class": "LinkCharRatio", "low": 0, "high": 0.4}, 9 | {"class": "MergeListTag"}, 10 | {"class": "MarkdownizeHeading"}, 11 | {"class": "NoContentDOM"}, 12 | {"class": "LargeFreqParagraphs", "count": 3, "freq": 1000}, 13 | {"class": "LargeFreqParagraphs", "count": 3, "freq": 100}, 14 | {"class": "KenLMParagraphPerplexity", "sudachi": ${sudachi}, "kenlm": ${kenlm}, outliers: 0.1, "count": 3, "threshold": 1e6}, 15 | {"class": "KenLMParagraphPerplexity", "sudachi": ${sudachi}, "kenlm": ${kenlm}, outliers: 0.1, "count": 2, "threshold": 5e6}, 16 | {"class": "CompressionRate", "low": 0.25, "high": 5.0}, 17 | {"class": "CompressionRate", "low": 0.40, "high": 0.75}, 18 | {"class": "CompressionRate", "low": 0.50, "high": 0.75}, 19 | {"class": "KenLMDocAvgPerplexity", "sudachi": ${sudachi}, "kenlm": ${kenlm}, outliers: 0.1, high: 1e6, low: 5}, 20 | {"class": "KenLMDocAvgPerplexity", "sudachi": ${sudachi}, "kenlm": ${kenlm}, outliers: 0.1, high: 5e5, low: 7}, 21 | {"class": "WordTypes", "threshold": 9, "kind": "uniq", "list": "hojichar/adult_keywords_ja.txt"}, 22 | {"class": "WordTypes", "threshold": 6, "kind": "uniq", "list": "hojichar/adult_keywords_ja.txt"}, 23 | {"class": "WordTypes", "threshold": 9, "kind": "uniq", "list": "hojichar/discriminations_keywords_ja.txt"}, 24 | {"class": "DocLength", "low": 200}, 25 | {"class": "DeduplicateDocumentsPercentile", "expected": 2.5, "percentile": 0.05}, 26 | {"class": "DeduplicateDocumentsPercentile", "expected": 1.5, "percentile": 0.1}, 27 | ] -------------------------------------------------------------------------------- /scripts/pipeline_test_perplexity.conf: -------------------------------------------------------------------------------- 1 | filters: [ 2 | {"class": "DocLength", "low": 50}, 3 | {"class": "KenLMDocAvgPerplexity", sudachi: ${sudachi}, kenlm: ${kenlm}, outliers: 0.02, low: 0, high: 1e7 }, 4 | {"class": "KenLMDocAvgPerplexity", sudachi: ${sudachi}, kenlm: ${kenlm}, outliers: 0.02, low: 0, high: 1e6 }, 5 | {"class": "KenLMDocAvgPerplexity", sudachi: ${sudachi}, kenlm: ${kenlm}, outliers: 0.02, low: 0, high: 1e5 }, 6 | {"class": "KenLMParagraphPerplexity", sudachi: ${sudachi}, kenlm: ${kenlm}, outliers: 0.02, count: 3, threshold: 1e6 }, 7 | ] -------------------------------------------------------------------------------- /scripts/submit_all_compute_stats.sh: -------------------------------------------------------------------------------- 1 | submit_post2017() { 2 | qsub -g gcf51199 -l rt_F=10 -l h_rt=4:00:00 submit_dedup_stage1.sh \ 3 | "/groups/gcf51199/cc/extracted/segment\=$1" \ 4 | /groups/gcf51199/cc/stats_raw_v2/segment=$1 \ 5 | 500 4000 6 | } 7 | 8 | # submit_post2017 CC-MAIN-2017-04 9 | # submit_post2017 CC-MAIN-2017-09 10 | # submit_post2017 CC-MAIN-2017-13 11 | # submit_post2017 CC-MAIN-2017-17 12 | # submit_post2017 CC-MAIN-2017-22 13 | # submit_post2017 CC-MAIN-2017-26 14 | # submit_post2017 CC-MAIN-2017-30 15 | # submit_post2017 CC-MAIN-2017-34 16 | # submit_post2017 CC-MAIN-2017-39 17 | # submit_post2017 CC-MAIN-2017-43 18 | # submit_post2017 CC-MAIN-2017-47 19 | # submit_post2017 CC-MAIN-2017-51 20 | # submit_post2017 CC-MAIN-2018-05 21 | # submit_post2017 CC-MAIN-2018-09 
22 | # submit_post2017 CC-MAIN-2018-13 23 | # submit_post2017 CC-MAIN-2018-17 24 | # submit_post2017 CC-MAIN-2018-22 25 | # submit_post2017 CC-MAIN-2018-26 26 | # submit_post2017 CC-MAIN-2018-30 27 | # submit_post2017 CC-MAIN-2018-34 28 | # submit_post2017 CC-MAIN-2018-39 29 | # submit_post2017 CC-MAIN-2018-43 30 | # submit_post2017 CC-MAIN-2018-47 31 | # submit_post2017 CC-MAIN-2018-51 32 | # submit_post2017 CC-MAIN-2019-04 33 | # submit_post2017 CC-MAIN-2019-09 34 | # submit_post2017 CC-MAIN-2019-13 35 | # submit_post2017 CC-MAIN-2019-18 36 | # submit_post2017 CC-MAIN-2019-22 37 | # submit_post2017 CC-MAIN-2019-26 38 | # submit_post2017 CC-MAIN-2019-30 39 | # submit_post2017 CC-MAIN-2019-35 40 | # submit_post2017 CC-MAIN-2019-39 41 | # submit_post2017 CC-MAIN-2019-43 42 | # submit_post2017 CC-MAIN-2019-47 43 | # submit_post2017 CC-MAIN-2019-51 44 | # submit_post2017 CC-MAIN-2020-05 45 | # submit_post2017 CC-MAIN-2020-10 46 | # submit_post2017 CC-MAIN-2020-16 47 | # submit_post2017 CC-MAIN-2020-24 48 | # submit_post2017 CC-MAIN-2020-29 49 | # submit_post2017 CC-MAIN-2020-34 50 | # submit_post2017 CC-MAIN-2020-40 51 | # submit_post2017 CC-MAIN-2020-45 52 | # submit_post2017 CC-MAIN-2020-50 53 | # submit_post2017 CC-MAIN-2021-04 54 | # submit_post2017 CC-MAIN-2021-10 55 | # submit_post2017 CC-MAIN-2021-17 56 | # submit_post2017 CC-MAIN-2021-21 57 | # submit_post2017 CC-MAIN-2021-25 58 | # submit_post2017 CC-MAIN-2021-31 59 | # submit_post2017 CC-MAIN-2021-39 60 | # submit_post2017 CC-MAIN-2021-43 61 | # submit_post2017 CC-MAIN-2021-49 62 | # submit_post2017 CC-MAIN-2022-05 63 | # submit_post2017 CC-MAIN-2022-21 64 | # submit_post2017 CC-MAIN-2022-27 65 | # submit_post2017 CC-MAIN-2022-33 66 | # submit_post2017 CC-MAIN-2022-40 67 | # submit_post2017 CC-MAIN-2022-49 68 | # submit_post2017 CC-MAIN-2023-06 69 | # submit_post2017 CC-MAIN-2023-14 70 | # submit_post2017 CC-MAIN-2023-23 71 | # submit_post2017 CC-MAIN-2023-40 72 | submit_post2017 CC-MAIN-2023-50 -------------------------------------------------------------------------------- /scripts/submit_all_compute_stats_old.sh: -------------------------------------------------------------------------------- 1 | submit_pre2016() { 2 | qsub -g gcf51199 -l rt_F=10 -l h_rt=4:00:00 submit_dedup_stage1.sh \ 3 | "/groups/gcf51199/cc2/extracted/$1" \ 4 | "/groups/gcf51199/cc/stats_raw_v2/segment=$1" \ 5 | 500 4000 6 | } 7 | 8 | submit_pre2016 merged-2013 9 | submit_pre2016 merged-2014 10 | submit_pre2016 merged-2015 11 | submit_pre2016 merged-2016 -------------------------------------------------------------------------------- /scripts/submit_all_filter.sh: -------------------------------------------------------------------------------- 1 | #!/bin/env bash 2 | 3 | submit() { 4 | qsub -g gcf51199 -l rt_F=10 -l h_rt=1:00:00 submit_filter_debug_2.sh \ 5 | "/groups/gcf51199/cc/extracted/segment\=$1" \ 6 | /groups/gcf51199/cc/stats_merged_v2/for_filter/all \ 7 | "/groups/gcf51199/cc/filtered_v3/segment=$1" 8 | } 9 | 10 | submit_pre2016() { 11 | qsub -g gcf51199 -l rt_F=10 -l h_rt=1:00:00 submit_filter_debug_2.sh \ 12 | "/groups/gcf51199/cc2/extracted/$1" \ 13 | /groups/gcf51199/cc/stats_merged_v2/for_filter/all \ 14 | "/groups/gcf51199/cc/filtered_v3/segment=$1" 15 | } 16 | 17 | # submit_pre2016 merged-2013 18 | # submit_pre2016 merged-2014 19 | # submit_pre2016 merged-2015 20 | # submit_pre2016 merged-2016 21 | 22 | # submit CC-MAIN-2017-04 23 | # submit CC-MAIN-2017-09 24 | # submit CC-MAIN-2017-13 25 | # submit CC-MAIN-2017-17 26 | # submit 
CC-MAIN-2017-22 27 | # submit CC-MAIN-2017-26 28 | # submit CC-MAIN-2017-30 29 | # submit CC-MAIN-2017-34 30 | # submit CC-MAIN-2017-39 31 | submit CC-MAIN-2017-43 32 | # submit CC-MAIN-2017-47 33 | # submit CC-MAIN-2017-51 34 | # submit CC-MAIN-2018-05 35 | # submit CC-MAIN-2018-09 36 | # submit CC-MAIN-2018-13 37 | # submit CC-MAIN-2018-17 38 | # submit CC-MAIN-2018-22 39 | # submit CC-MAIN-2018-26 40 | # submit CC-MAIN-2018-30 41 | # submit CC-MAIN-2018-34 42 | # submit CC-MAIN-2018-39 43 | # submit CC-MAIN-2018-43 44 | # submit CC-MAIN-2018-47 45 | # submit CC-MAIN-2018-51 46 | # submit CC-MAIN-2019-04 47 | # submit CC-MAIN-2019-09 48 | # submit CC-MAIN-2019-13 49 | # submit CC-MAIN-2019-18 50 | # submit CC-MAIN-2019-22 51 | # submit CC-MAIN-2019-26 52 | # submit CC-MAIN-2019-30 53 | # submit CC-MAIN-2019-35 54 | # submit CC-MAIN-2019-39 55 | # submit CC-MAIN-2019-43 56 | # submit CC-MAIN-2019-47 57 | # submit CC-MAIN-2019-51 58 | # submit CC-MAIN-2020-05 59 | # submit CC-MAIN-2020-10 60 | # submit CC-MAIN-2020-16 61 | # submit CC-MAIN-2020-24 62 | # submit CC-MAIN-2020-29 63 | # submit CC-MAIN-2020-34 64 | # submit CC-MAIN-2020-40 65 | # submit CC-MAIN-2020-45 66 | # submit CC-MAIN-2020-50 67 | # submit CC-MAIN-2021-04 68 | # submit CC-MAIN-2021-10 69 | # submit CC-MAIN-2021-17 70 | # submit CC-MAIN-2021-21 71 | # submit CC-MAIN-2021-25 72 | # submit CC-MAIN-2021-31 73 | # submit CC-MAIN-2021-39 74 | # submit CC-MAIN-2021-43 75 | # submit CC-MAIN-2021-49 76 | # submit CC-MAIN-2022-05 77 | # submit CC-MAIN-2022-21 78 | # submit CC-MAIN-2022-27 79 | # submit CC-MAIN-2022-33 80 | # submit CC-MAIN-2022-40 81 | # submit CC-MAIN-2022-49 82 | # submit CC-MAIN-2023-06 83 | # submit CC-MAIN-2023-14 84 | # submit CC-MAIN-2023-23 85 | # submit CC-MAIN-2023-40 -------------------------------------------------------------------------------- /scripts/submit_all_merges_stage1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | COMMAND_START="qsub -g gcf51199 -l rt_F=10 -l h_rt=2:00:00 submit_merge_stats.sh" 4 | MERGE_ROOT=/groups/gcf51199/cc/stats_merged_v2/per_year 5 | MERGE_BASIC_ROOT=/groups/gcf51199/cc/stats_raw_v2 6 | 7 | eval $COMMAND_START $MERGE_ROOT/2016 $MERGE_BASIC_ROOT/segment=merged-* 8 | eval $COMMAND_START $MERGE_ROOT/2017 $MERGE_BASIC_ROOT/segment=CC-MAIN-2017-* 9 | eval $COMMAND_START $MERGE_ROOT/2018 $MERGE_BASIC_ROOT/segment=CC-MAIN-2018-* 10 | eval $COMMAND_START $MERGE_ROOT/2019 $MERGE_BASIC_ROOT/segment=CC-MAIN-2019-* 11 | eval $COMMAND_START $MERGE_ROOT/2020 $MERGE_BASIC_ROOT/segment=CC-MAIN-2020-* 12 | eval $COMMAND_START $MERGE_ROOT/2021 $MERGE_BASIC_ROOT/segment=CC-MAIN-2021-* 13 | eval $COMMAND_START $MERGE_ROOT/2022 $MERGE_BASIC_ROOT/segment=CC-MAIN-2022-* 14 | eval $COMMAND_START $MERGE_ROOT/2023 $MERGE_BASIC_ROOT/segment=CC-MAIN-2023-* 15 | -------------------------------------------------------------------------------- /scripts/submit_all_merges_stage2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | COMMAND_START="qsub -g gcf51199 -l rt_F=10 -l h_rt=2:00:00 submit_merge_stats_final.sh" 4 | MERGE_ROOT=/groups/gcf51199/cc/stats_merged_v2/per_year 5 | MERGE_FINAL_ROOT=/groups/gcf51199/cc/stats_merged_v2/for_filter 6 | 7 | # eval $COMMAND_START $MERGE_FINAL_ROOT/2016 $MERGE_ROOT/2016 $MERGE_ROOT/2017 8 | # eval $COMMAND_START $MERGE_FINAL_ROOT/2017 $MERGE_ROOT/2016 $MERGE_ROOT/2017 $MERGE_ROOT/2018 9 | # eval $COMMAND_START $MERGE_FINAL_ROOT/2018 
$MERGE_ROOT/2017 $MERGE_ROOT/2018 $MERGE_ROOT/2019 10 | # eval $COMMAND_START $MERGE_FINAL_ROOT/2019 $MERGE_ROOT/2018 $MERGE_ROOT/2019 $MERGE_ROOT/2020 11 | # eval $COMMAND_START $MERGE_FINAL_ROOT/2020 $MERGE_ROOT/2019 $MERGE_ROOT/2020 $MERGE_ROOT/2021 12 | # eval $COMMAND_START $MERGE_FINAL_ROOT/2021 $MERGE_ROOT/2020 $MERGE_ROOT/2021 $MERGE_ROOT/2022 13 | # eval $COMMAND_START $MERGE_FINAL_ROOT/2022 $MERGE_ROOT/2021 $MERGE_ROOT/2022 $MERGE_ROOT/2023 14 | eval $COMMAND_START $MERGE_FINAL_ROOT/all $MERGE_ROOT/* 15 | -------------------------------------------------------------------------------- /scripts/submit_calc_overlap.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #$ -j y 4 | #$ -cwd 5 | 6 | source $HOME/work/uzushio/.venv/bin/activate 7 | 8 | python3 $HOME/work/uzushio/scripts/cal_overlap_ratio/cal_overlap.py \ 9 | --dump_direc_path=/groups/gcf51199/cc/extracted \ 10 | --output_path=$HOME/work/overlap-extracted.csv 11 | -------------------------------------------------------------------------------- /scripts/submit_dedup_stage1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #$ -j y 4 | #$ -cwd 5 | #$ -l USE_SSH=1 6 | #$ -l USE_EXTRA_NETWORK=1 7 | 8 | du -hs "$1" > /dev/null & 9 | 10 | # SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 11 | UZUSHIO_ROOT=$HOME/work/uzushio 12 | SCRIPT_DIR=$UZUSHIO_ROOT/scripts 13 | export SPARK_HOME=${SPARK:-$HOME/soft/spark-3.4.1-bin-hadoop3} 14 | UZUSHIO_JAR=$(readlink -f "$SCRIPT_DIR/../core/target/scala-2.12/uzushio-assembly-0.1.0-SNAPSHOT.jar") 15 | 16 | export SPARK_CONF_DIR=$UZUSHIO_ROOT/spark-config/abci-f 17 | "$SPARK_HOME/sbin/start-master.sh" 18 | export SPARK_WORKERS=$SGE_JOB_HOSTLIST 19 | export SPARK_SSH_OPTS="-p 2222" 20 | export SPARK_LOCAL_DIRS=$SGE_LOCALDIR 21 | SPARK_MASTER="spark://$(hostname):7077" 22 | "$SPARK_HOME/sbin/workers.sh" "SPARK_CONF_DIR=$UZUSHIO_ROOT/spark-config/abci-f" "$SPARK_HOME/sbin/start-worker.sh" $SPARK_MASTER 23 | 24 | # it is possible to monitor task progress with Spark UI accessible by ssh port forwarding 25 | echo "$(date -Iseconds) $JOB_ID ssh abci -L8080:$(hostname):8080" >> /scratch/$USER/spark-ui-monitoring 26 | 27 | mkdir -p /scratch/$USER/spark-exlog 28 | 29 | INPUT=$1 30 | OUTPUT=$2 31 | CACHE=/dev/null 32 | NUM_PARTITIONS=${3:-50} 33 | NUM_PARTITIONS_PROPAGATION=${4:-$(($NUM_PARTITIONS * 4))} 34 | 35 | "$SPARK_HOME/bin/spark-submit" \ 36 | --class com.worksap.nlp.uzushio.main.DeduplicateParagraphs \ 37 | --master $SPARK_MASTER \ 38 | --conf spark.driver.log.dfsDir=/scratch/$USER/spark-exlog \ 39 | --conf spark.eventLog.dir=/scratch/$USER/spark-exlog \ 40 | --conf spark.local.dir=$SPARK_LOCAL_DIRS \ 41 | --conf spark.sql.shuffle.partitions=${NUM_PARTITIONS_PROPAGATION} \ 42 | --conf spark.sql.parquet.columnarReaderBatchSize=512 \ 43 | local://$UZUSHIO_JAR \ 44 | --input="$INPUT" \ 45 | --output="$OUTPUT" \ 46 | --execution=reprHashes,stats,saveStats \ 47 | --propagate-partitions=$NUM_PARTITIONS_PROPAGATION \ 48 | --partitions=$NUM_PARTITIONS --intermediate 49 | 50 | wait -------------------------------------------------------------------------------- /scripts/submit_dedup_stats.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #$ -j y 4 | #$ -cwd 5 | #$ -l USE_SSH=1 6 | 7 | INPUT=$1 8 | OUTPUT=$2 9 | CACHE=$3 10 | NUM_PARTITIONS=${4:-50} 11 | 
NUM_PARTITIONS_PROPAGATION=${5:-$(($NUM_PARTITIONS * 4))} 12 | 13 | du -hs "$INPUT" > /dev/null & 14 | du -hs "$CACHE" > /dev/null & 15 | 16 | # SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 17 | UZUSHIO_ROOT=$HOME/work/uzushio 18 | SCRIPT_DIR=$UZUSHIO_ROOT/scripts 19 | export SPARK_HOME=${SPARK:-$HOME/soft/spark-3.4.1-bin-hadoop3} 20 | UZUSHIO_JAR=$(readlink -f "$SCRIPT_DIR/../core/target/scala-2.12/uzushio-assembly-0.1.0-SNAPSHOT.jar") 21 | 22 | export SPARK_CONF_DIR=$UZUSHIO_ROOT/spark-config/abci-f 23 | "$SPARK_HOME/sbin/start-master.sh" 24 | export SPARK_WORKERS=$SGE_JOB_HOSTLIST 25 | export SPARK_SSH_OPTS="-p 2222" 26 | export SPARK_LOCAL_DIRS=$SGE_LOCALDIR 27 | SPARK_MASTER="spark://$(hostname):7077" 28 | "$SPARK_HOME/sbin/workers.sh" "SPARK_CONF_DIR=$UZUSHIO_ROOT/spark-config/abci-f" "$SPARK_HOME/sbin/start-worker.sh" $SPARK_MASTER 29 | 30 | # it is possible to monitor task progress with Spark UI accessible by ssh port forwarding 31 | echo "$(date -Iseconds) $JOB_ID DedupFilterStatistics ssh abci -L8080:$(hostname):8080" >> /scratch/$USER/spark-ui-monitoring 32 | 33 | mkdir -p /scratch/$USER/spark-exlog 34 | 35 | "$SPARK_HOME/bin/spark-submit" \ 36 | --class com.worksap.nlp.uzushio.lib.runners.DedupFilterStatistics \ 37 | --master $SPARK_MASTER \ 38 | --conf spark.driver.log.dfsDir=/scratch/$USER/spark-exlog \ 39 | --conf spark.eventLog.dir=/scratch/$USER/spark-exlog \ 40 | --conf spark.local.dir=$SPARK_LOCAL_DIRS \ 41 | --conf spark.sql.shuffle.partitions=${NUM_PARTITIONS_PROPAGATION} \ 42 | local://$UZUSHIO_JAR \ 43 | --input="$INPUT" \ 44 | --stats="$CACHE" \ 45 | --output="$OUTPUT" --partitions=$NUM_PARTITIONS \ 46 | --filter=large-freq-paragraphs 47 | 48 | wait -------------------------------------------------------------------------------- /scripts/submit_filter_debug.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #$ -j y 4 | #$ -cwd 5 | #$ -l USE_SSH=1 6 | #$ -l USE_EXTRA_NETWORK=1 7 | 8 | INPUT=$1 9 | STATS=$2 10 | OUTPUT=$3 11 | 12 | du -hs "$INPUT" > /dev/null & 13 | du -hs "$STATS" > /dev/null & 14 | 15 | # SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 16 | UZUSHIO_ROOT=$HOME/work/uzushio 17 | SCRIPT_DIR=$UZUSHIO_ROOT/scripts 18 | export SPARK_HOME=${SPARK:-$HOME/soft/spark-3.4.1-bin-hadoop3} 19 | UZUSHIO_JAR=$(readlink -f "$SCRIPT_DIR/../core/target/scala-2.12/uzushio-assembly-0.1.0-SNAPSHOT.jar") 20 | 21 | export SPARK_CONF_DIR=$UZUSHIO_ROOT/spark-config/abci-f 22 | "$SPARK_HOME/sbin/start-master.sh" 23 | export SPARK_WORKERS=$SGE_JOB_HOSTLIST 24 | export SPARK_SSH_OPTS="-p 2222" 25 | export SPARK_LOCAL_DIRS=$SGE_LOCALDIR 26 | SPARK_MASTER="spark://$(hostname):7077" 27 | "$SPARK_HOME/sbin/workers.sh" "SPARK_CONF_DIR=$UZUSHIO_ROOT/spark-config/abci-f" "$SPARK_HOME/sbin/start-worker.sh" $SPARK_MASTER 28 | 29 | # it is possible to monitor task progress with Spark UI accessible by ssh port forwarding 30 | echo "$(date -Iseconds) $JOB_ID ssh abci -L8080:$(hostname):8080" >> /scratch/$USER/spark-ui-monitoring 31 | 32 | mkdir -p /scratch/$USER/spark-exlog 33 | 34 | NUM_PARTITIONS=1000 35 | NUM_PARTITIONS_PROPAGATION=4000 36 | 37 | "$SPARK_HOME/bin/spark-submit" \ 38 | --class com.worksap.nlp.uzushio.main.DeduplicateParagraphs \ 39 | --master $SPARK_MASTER \ 40 | --conf spark.driver.log.dfsDir=/scratch/$USER/spark-exlog \ 41 | --conf spark.eventLog.dir=/scratch/$USER/spark-exlog \ 42 | --conf spark.local.dir=$SPARK_LOCAL_DIRS \ 43 | --conf 
spark.sql.shuffle.partitions=$NUM_PARTITIONS_PROPAGATION \ 44 | --conf spark.sql.parquet.columnarReaderBatchSize=256 \ 45 | local://$UZUSHIO_JAR \ 46 | --input=$INPUT \ 47 | --cache=$STATS \ 48 | --output=$OUTPUT \ 49 | --propagate-partitions=$NUM_PARTITIONS_PROPAGATION \ 50 | --filters=$SCRIPT_DIR/pipeline_01.conf \ 51 | --partitions=$NUM_PARTITIONS \ 52 | --execution=filter-debug \ 53 | --format=json --compression=gzip --text-only 54 | 55 | 56 | wait -------------------------------------------------------------------------------- /scripts/submit_filter_debug_2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #$ -j y 4 | #$ -cwd 5 | #$ -l USE_SSH=1 6 | #$ -l USE_EXTRA_NETWORK=1 7 | 8 | INPUT=$1 9 | STATS=$2 10 | OUTPUT=$3 11 | 12 | du -hs "$INPUT" > /dev/null & 13 | du -hs "$STATS" > /dev/null & 14 | 15 | # SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 16 | UZUSHIO_ROOT=$HOME/work/uzushio 17 | SCRIPT_DIR=$UZUSHIO_ROOT/scripts 18 | export SPARK_HOME=${SPARK:-$HOME/soft/spark-3.4.1-bin-hadoop3} 19 | UZUSHIO_JAR=$(readlink -f "$SCRIPT_DIR/../core/target/scala-2.12/uzushio-assembly-0.1.0-SNAPSHOT.jar") 20 | 21 | export SPARK_CONF_DIR=$UZUSHIO_ROOT/spark-config/abci-f 22 | "$SPARK_HOME/sbin/start-master.sh" 23 | export SPARK_WORKERS=$SGE_JOB_HOSTLIST 24 | export SPARK_SSH_OPTS="-p 2222" 25 | export SPARK_LOCAL_DIRS=$SGE_LOCALDIR 26 | SPARK_MASTER="spark://$(hostname):7077" 27 | "$SPARK_HOME/sbin/workers.sh" "SPARK_CONF_DIR=$UZUSHIO_ROOT/spark-config/abci-f" "$SPARK_HOME/sbin/start-worker.sh" $SPARK_MASTER 28 | 29 | # it is possible to monitor task progress with Spark UI accessible by ssh port forwarding 30 | echo "$(date -Iseconds) $JOB_ID ssh abci -L8080:$(hostname):8080" >> /scratch/$USER/spark-ui-monitoring 31 | 32 | mkdir -p /scratch/$USER/spark-exlog 33 | 34 | NUM_PARTITIONS=1000 35 | NUM_PARTITIONS_PROPAGATION=4000 36 | 37 | "$SPARK_HOME/bin/spark-submit" \ 38 | --class com.worksap.nlp.uzushio.main.DeduplicateParagraphs \ 39 | --master $SPARK_MASTER \ 40 | --conf spark.driver.log.dfsDir=/scratch/$USER/spark-exlog \ 41 | --conf spark.eventLog.dir=/scratch/$USER/spark-exlog \ 42 | --conf spark.local.dir=$SPARK_LOCAL_DIRS \ 43 | --conf spark.sql.shuffle.partitions=$NUM_PARTITIONS_PROPAGATION \ 44 | --conf spark.sql.parquet.columnarReaderBatchSize=256 \ 45 | local://$UZUSHIO_JAR \ 46 | --input=$INPUT \ 47 | --cache=$STATS \ 48 | --output=$OUTPUT \ 49 | --propagate-partitions=$NUM_PARTITIONS_PROPAGATION \ 50 | --filters=$SCRIPT_DIR/pipeline_03a.conf \ 51 | --partitions=$NUM_PARTITIONS \ 52 | --execution=filter-debug \ 53 | -Pkenlm=/groups/gcf51199/filter/n-gram_model/kenlm_merge-code_0.05_model.bin \ 54 | -Psudachi=/groups/gcf51199/resources/sudachi-dictionary-20230927/system_core.dic \ 55 | --format=json --compression=gzip --text-only 56 | 57 | 58 | wait -------------------------------------------------------------------------------- /scripts/submit_kenlm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #$ -j y 4 | #$ -cwd 5 | #$ -l USE_SSH=1 6 | #$ -l USE_EXTRA_NETWORK=1 7 | 8 | INPUT=$1 9 | OUTPUT=$2 10 | KENLM=$3 11 | SUDACHI=$4 12 | 13 | du -hs "$INPUT" > /dev/null & 14 | 15 | 16 | # SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 17 | UZUSHIO_ROOT=$HOME/work/uzushio 18 | SCRIPT_DIR=$UZUSHIO_ROOT/scripts 19 | export SPARK_HOME=${SPARK:-$HOME/soft/spark-3.4.1-bin-hadoop3} 20 | UZUSHIO_JAR=$(readlink -f 
"$SCRIPT_DIR/../core/target/scala-2.12/uzushio-assembly-0.1.0-SNAPSHOT.jar") 21 | 22 | export SPARK_CONF_DIR=$UZUSHIO_ROOT/spark-config/abci-f 23 | "$SPARK_HOME/sbin/start-master.sh" 24 | export SPARK_WORKERS=$SGE_JOB_HOSTLIST 25 | export SPARK_SSH_OPTS="-p 2222" 26 | export SPARK_LOCAL_DIRS=$SGE_LOCALDIR 27 | SPARK_MASTER="spark://$(hostname):7077" 28 | "$SPARK_HOME/sbin/workers.sh" "SPARK_CONF_DIR=$UZUSHIO_ROOT/spark-config/abci-f" "$SPARK_HOME/sbin/start-worker.sh" $SPARK_MASTER 29 | 30 | # it is possible to monitor task progress with Spark UI accessible by ssh port forwarding 31 | echo "$(date -Iseconds) $JOB_ID ssh abci -L8080:$(hostname):8080" >> /scratch/$USER/spark-ui-monitoring 32 | 33 | mkdir -p /scratch/$USER/spark-exlog 34 | 35 | 36 | "$SPARK_HOME/bin/spark-submit" \ 37 | --class com.worksap.nlp.uzushio.lib.runners.KenLMRunner \ 38 | --master $SPARK_MASTER \ 39 | --conf spark.driver.log.dfsDir=/scratch/$USER/spark-exlog \ 40 | --conf spark.eventLog.dir=/scratch/$USER/spark-exlog \ 41 | --conf spark.local.dir=$SPARK_LOCAL_DIRS \ 42 | --conf spark.sql.parquet.columnarReaderBatchSize=512 \ 43 | local://$UZUSHIO_JAR \ 44 | --input=$INPUT \ 45 | --output=$OUTPUT \ 46 | --sudachi-dict=$SUDACHI \ 47 | --kenlm-model=$KENLM 48 | 49 | wait -------------------------------------------------------------------------------- /scripts/submit_merge_stats.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -x 2 | 3 | #$ -j y 4 | #$ -cwd 5 | #$ -l USE_SSH=1 6 | #$ -l USE_EXTRA_NETWORK=1 7 | 8 | OUTPUT=$1 9 | shift 10 | INPUT=() 11 | for arg in "$@"; do 12 | INPUT+=("--input=$arg") 13 | du -hs "$arg" > /dev/null & 14 | done 15 | 16 | 17 | # SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 18 | UZUSHIO_ROOT=$HOME/work/uzushio 19 | SCRIPT_DIR=$UZUSHIO_ROOT/scripts 20 | export SPARK_HOME=${SPARK:-$HOME/soft/spark-3.4.1-bin-hadoop3} 21 | UZUSHIO_JAR=$(readlink -f "$SCRIPT_DIR/../core/target/scala-2.12/uzushio-assembly-0.1.0-SNAPSHOT.jar") 22 | 23 | export SPARK_CONF_DIR=$UZUSHIO_ROOT/spark-config/abci-f 24 | "$SPARK_HOME/sbin/start-master.sh" 25 | export SPARK_WORKERS=$SGE_JOB_HOSTLIST 26 | export SPARK_SSH_OPTS="-p 2222" 27 | export SPARK_LOCAL_DIRS=$SGE_LOCALDIR 28 | SPARK_MASTER="spark://$(hostname):7077" 29 | "$SPARK_HOME/sbin/workers.sh" "SPARK_CONF_DIR=$UZUSHIO_ROOT/spark-config/abci-f" "$SPARK_HOME/sbin/start-worker.sh" $SPARK_MASTER 30 | 31 | # it is possible to monitor task progress with Spark UI accessible by ssh port forwarding 32 | echo "$(date -Iseconds) $JOB_ID ssh abci -L8080:$(hostname):8080" >> /scratch/$USER/spark-ui-monitoring 33 | 34 | mkdir -p /scratch/$USER/spark-exlog 35 | 36 | "$SPARK_HOME/bin/spark-submit" \ 37 | --class com.worksap.nlp.uzushio.lib.runners.MergeDedupStats \ 38 | --master $SPARK_MASTER \ 39 | --conf spark.driver.log.dfsDir=/scratch/$USER/spark-exlog \ 40 | --conf spark.eventLog.dir=/scratch/$USER/spark-exlog \ 41 | --conf spark.local.dir=$SPARK_LOCAL_DIRS \ 42 | --conf spark.sql.shuffle.partitions=1000 \ 43 | local://$UZUSHIO_JAR \ 44 | ${INPUT[*]} \ 45 | --output="$OUTPUT" 46 | 47 | wait -------------------------------------------------------------------------------- /scripts/submit_merge_stats_final.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -x 2 | 3 | #$ -j y 4 | #$ -cwd 5 | #$ -l USE_SSH=1 6 | #$ -l USE_EXTRA_NETWORK=1 7 | 8 | OUTPUT=$1 9 | shift 10 | INPUT=() 11 | for arg in "$@"; do 12 | INPUT+=("--input=$arg") 13 | du 
-hs "$arg" > /dev/null & 14 | done 15 | 16 | 17 | # SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 18 | UZUSHIO_ROOT=$HOME/work/uzushio 19 | SCRIPT_DIR=$UZUSHIO_ROOT/scripts 20 | export SPARK_HOME=${SPARK:-$HOME/soft/spark-3.4.1-bin-hadoop3} 21 | UZUSHIO_JAR=$(readlink -f "$SCRIPT_DIR/../core/target/scala-2.12/uzushio-assembly-0.1.0-SNAPSHOT.jar") 22 | 23 | export SPARK_CONF_DIR=$UZUSHIO_ROOT/spark-config/abci-f 24 | "$SPARK_HOME/sbin/start-master.sh" 25 | export SPARK_WORKERS=$SGE_JOB_HOSTLIST 26 | export SPARK_SSH_OPTS="-p 2222" 27 | export SPARK_LOCAL_DIRS=$SGE_LOCALDIR 28 | SPARK_MASTER="spark://$(hostname):7077" 29 | "$SPARK_HOME/sbin/workers.sh" "SPARK_CONF_DIR=$UZUSHIO_ROOT/spark-config/abci-f" "$SPARK_HOME/sbin/start-worker.sh" $SPARK_MASTER 30 | 31 | # it is possible to monitor task progress with Spark UI accessible by ssh port forwarding 32 | echo "$(date -Iseconds) $JOB_ID ssh abci -L8080:$(hostname):8080" >> /scratch/$USER/spark-ui-monitoring 33 | 34 | mkdir -p /scratch/$USER/spark-exlog 35 | 36 | "$SPARK_HOME/bin/spark-submit" \ 37 | --class com.worksap.nlp.uzushio.lib.runners.MergeDedupStats \ 38 | --master $SPARK_MASTER \ 39 | --conf spark.driver.log.dfsDir=/scratch/$USER/spark-exlog \ 40 | --conf spark.eventLog.dir=/scratch/$USER/spark-exlog \ 41 | --conf spark.local.dir=$SPARK_LOCAL_DIRS \ 42 | --conf spark.sql.shuffle.partitions=4000 \ 43 | local://$UZUSHIO_JAR \ 44 | ${INPUT[*]} \ 45 | --output="$OUTPUT" --no-ones --partitions=1000 46 | 47 | wait -------------------------------------------------------------------------------- /scripts/vis/vis_filter.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import dataclasses 3 | import matplotlib.pyplot as plt 4 | from pathlib import Path 5 | import pandas as pd 6 | import numpy as np 7 | import pyarrow.csv as pcsv 8 | 9 | 10 | @dataclasses.dataclass 11 | class Args(object): 12 | output: str 13 | input: list[str] 14 | title: str = None 15 | dpi: float = None 16 | log: bool = False 17 | 18 | @staticmethod 19 | def parse(): 20 | p = argparse.ArgumentParser() 21 | p.add_argument("--output", type=Path) 22 | p.add_argument("--title") 23 | p.add_argument("--dpi", type=float) 24 | p.add_argument("--log", action="store_true") 25 | p.add_argument("input", type=Path, nargs="+") 26 | return Args(**vars(p.parse_args())) 27 | 28 | 29 | def plot_histogram(args: Args, folder_paths: list[Path]): 30 | histogram_data = [] 31 | titles = [] 32 | 33 | # Iterate through subfolders and CSV files 34 | for folder in folder_paths: 35 | if folder.is_dir(): 36 | total_df = [] 37 | csv_files = folder.glob("*.csv") 38 | for csv_file in csv_files: 39 | data = pcsv.read_csv( 40 | csv_file, 41 | read_options=pcsv.ReadOptions(column_names=["val", "text"]), 42 | convert_options=pcsv.ConvertOptions(include_columns=["val"]), 43 | ) 44 | 45 | total_df.append(data.column(0).to_numpy()) 46 | 47 | total_df = np.concatenate(total_df, axis=0) 48 | histogram_data.append(total_df) 49 | titles.append(folder.name) 50 | 51 | plt.hist( 52 | histogram_data, 53 | bins=200, 54 | density=True, 55 | label=titles, 56 | histtype="stepfilled", 57 | alpha=0.5, 58 | log=args.log, 59 | ) 60 | plt.legend(titles) 61 | plt.ylabel("Data %") 62 | plt.xlabel("Value") 63 | plt.title(args.title) 64 | 65 | 66 | def main(args: Args): 67 | plot_histogram(args, args.input) 68 | plt.savefig(args.output) 69 | 70 | 71 | if __name__ == "__main__": 72 | main(Args.parse()) 73 | 
-------------------------------------------------------------------------------- /spark-config/abci-f/spark-defaults.conf: -------------------------------------------------------------------------------- 1 | spark.driver.memory 30G 2 | spark.executor.memory 63G 3 | spark.executor.extraJavaOptions -XX:ObjectAlignmentInBytes=16 -XX:+UseParallelGC -XX:+PrintGCDetails -XX:+PrintGCTimeStamps 4 | spark.driver.log.persistToDfs.enabled true 5 | spark.eventLog.enabled true 6 | spark.eventLog.compress true 7 | spark.checkpoint.compress true 8 | spark.memory.offHeap.enabled true 9 | spark.memory.offHeap.size 200G 10 | spark.ui.reverseProxy true 11 | spark.executor.extraLibraryPath /groups/gcf51199/native-libs 12 | spark.driver.extraLibraryPath /groups/gcf51199/native-libs -------------------------------------------------------------------------------- /spark-config/abci-f/spark-env.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Spark environment variables for ABCI rt_F nodes 4 | # Correctly set temp directory and local storage to local scratch directory 5 | 6 | export SPARK_LOCAL_DIRS="$SGE_LOCALDIR" 7 | export SPARK_DAEMON_JAVA_OPTS="-Djava.io.tmpdir=$SGE_LOCALDIR" 8 | 9 | export SPARK_WORKER_DIR="$SGE_LOCALDIR" 10 | export SPARK_WORKER_OPTS="-Djava.io.tmpdir=$SGE_LOCALDIR" 11 | export SPARK_EXECUTOR_OPTS="-Djava.io.tmpdir=$SGE_LOCALDIR" 12 | 13 | export SPARK_LOG_DIR="/scratch/${USER:-nouser}/spark-log/${JOB_ID:-nojob}" 14 | mkdir -p "$SPARK_LOG_DIR" 15 | 16 | export MKL_NUM_THREADS=1 17 | export OPENBLAS_NUM_THREADS=1 18 | 19 | export JAVA_HOME="$HOME/soft/jdk-17-2023-09-27" --------------------------------------------------------------------------------
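For reference, the three overlap ratios defined in scripts/cal_overlap_ratio/README.md can be computed from two dumps as in the following minimal sketch. It assumes each dump is reduced to a set of document identifiers (e.g. URLs or hashes); the function name and that representation are illustrative, since cal_overlap.py itself does not appear in the listing above.
```
from collections.abc import Iterable


def overlap_ratios(dump_1: Iterable[str], dump_2: Iterable[str]) -> tuple[float, float, float]:
    """Return the three overlap ratios between two dumps.

    The ratios follow the README definitions:
    len(dump_1 & dump_2) / len(dump_1),
    len(dump_1 & dump_2) / len(dump_2),
    len(dump_1 & dump_2) / len(dump_1 | dump_2).
    """
    d1, d2 = set(dump_1), set(dump_2)
    inter = len(d1 & d2)
    union = len(d1 | d2)
    return (
        inter / len(d1) if d1 else 0.0,
        inter / len(d2) if d2 else 0.0,
        inter / union if union else 0.0,
    )


# Example: two toy "dumps" sharing two documents out of four distinct ones.
print(overlap_ratios({"a", "b", "c"}, {"b", "c", "d"}))  # (0.666..., 0.666..., 0.5)
```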