├── .gitattributes ├── .github └── workflows │ └── ci.yml ├── .gitignore ├── .jvmopts ├── .scalafmt.conf ├── LICENSE ├── README.md ├── bench └── src │ └── main │ └── java │ └── test │ ├── Log10Bench.java │ └── SqrtBench.java ├── build.sbt ├── core └── src │ └── main │ ├── resources │ ├── chitra.conf │ ├── ng_words.txt │ ├── reference.conf │ ├── rmTemplate.conf │ ├── sudachiDictCorpus.conf │ ├── template_sentences.txt │ └── warc.conf │ └── scala │ └── com │ └── worksap │ └── nlp │ └── uzushio │ ├── CorpusCleaner.scala │ ├── DocumentIO.scala │ ├── MinHash.scala │ ├── MinHashDeduplicator.scala │ ├── Sudachi.scala │ ├── SudachiTokenizer.scala │ ├── TokenHasher.scala │ ├── cleaning │ ├── ConcatShortSentence.scala │ ├── DeduplicateElement.scala │ ├── DeduplicateRepeatingSentence.scala │ ├── FieldSettable.scala │ ├── Filter.scala │ ├── FilterBySentenceLength.scala │ ├── FilterJapaneseBasedOnCharacter.scala │ ├── NormalizeCharacter.scala │ ├── NormalizeWhitespace.scala │ ├── Normalizer.scala │ ├── Pipeline.scala │ ├── RemoveEmail.scala │ ├── RemoveNGWordDocument.scala │ ├── RemoveScriptDocument.scala │ ├── RemoveShortDocument.scala │ ├── RemoveSubstring.scala │ ├── RemoveURL.scala │ ├── RemoveWikipediaCitation.scala │ ├── SplitElement.scala │ └── Transformer.scala │ └── main │ ├── DeduplicateParagraphs.scala │ └── ExtractTextFromWarc.scala ├── docs └── tutorial.md ├── legacy ├── README.md ├── list_common_substr.py ├── src │ └── main │ │ └── scala │ │ └── com │ │ └── worksap │ │ └── nlp │ │ └── uzushio │ │ └── warc │ │ ├── HttpResponseParser.scala │ │ ├── HttpResponseSerializable.scala │ │ ├── JusTextHandler.scala │ │ ├── LongWritableSerializable.scala │ │ ├── NWCToolkitHandler.scala │ │ ├── ParagraphHandler.scala │ │ ├── README.md │ │ ├── WarcFileReader.scala │ │ ├── WarcInputFormat.scala │ │ ├── WarcLoader.scala │ │ ├── WarcRecord.scala │ │ ├── WarcToDocument.scala │ │ └── WarcWritable.scala └── suffixarray.py ├── lib └── src │ ├── main │ ├── resources │ │ ├── com │ │ │ └── worksap │ │ │ │ └── nlp │ │ │ │ └── uzushio │ │ │ │ └── lib │ │ │ │ └── filters │ │ │ │ ├── hojichar │ │ │ │ ├── README.md │ │ │ │ ├── adult_keywords_ja.txt │ │ │ │ └── discriminations_keywords_ja.txt │ │ │ │ └── ng_words.txt │ │ └── pipeline │ │ │ └── all_duplicate_paragraphs.conf │ └── scala │ │ └── com │ │ └── worksap │ │ └── nlp │ │ └── uzushio │ │ └── lib │ │ ├── cleaning │ │ ├── PathSegment.scala │ │ └── Pipeline.scala │ │ ├── filters │ │ ├── AdjacentDuplicateParagraphs.scala │ │ ├── CompressionRate.scala │ │ ├── DeduplicateDocuments.scala │ │ ├── DeduplicateDocumentsPercentile.scala │ │ ├── DocLength.scala │ │ ├── DuplicateDocumentsLengthWeighted.scala │ │ ├── DuplicateParagraphs.scala │ │ ├── HiraganaRatio.scala │ │ ├── KenLMDocAvgPerplexity.scala │ │ ├── KenLMParagraphPerplexity.scala │ │ ├── LargeFreqParagraphs.scala │ │ ├── LinkCharRatio.scala │ │ ├── MarkdownizeHeading.scala │ │ ├── MergeListTag.scala │ │ ├── NoContentDOM.scala │ │ ├── WordInstances.scala │ │ ├── WordTypes.scala │ │ └── base │ │ │ ├── FilterBase.scala │ │ │ └── HighLowDocFilter.scala │ │ ├── html │ │ ├── AllTagMapper.scala │ │ ├── ParagraphExtractor.scala │ │ └── ParseAbortException.scala │ │ ├── lang │ │ ├── LangEstimation.scala │ │ └── LangTagSniffer.scala │ │ ├── resources │ │ └── CachedLocalResource.scala │ │ ├── runners │ │ ├── DedupFilterStatistics.scala │ │ ├── DeduplicateParagraphs.scala │ │ ├── ExtractParagraphsFromWARC.scala │ │ ├── FilterStatistics.scala │ │ ├── KenLMRunner.scala │ │ ├── MergeDedupStats.scala │ │ └── Repackage.scala │ │ 
├── stats │ │ ├── CountMinSketch.scala │ │ ├── NgramBitSignatures.java │ │ └── SimHashProcessor.scala │ │ ├── utils │ │ ├── BuilderSyntax.scala │ │ ├── Levenshtein.java │ │ ├── MathUtil.java │ │ ├── Paragraphs.scala │ │ ├── Resources.scala │ │ ├── RowBuffer.java │ │ ├── SentenceIterator.scala │ │ ├── SessionBufferIn.scala │ │ ├── TrieNode.scala │ │ ├── WarcFileReader.scala │ │ └── Ziggurat.java │ │ └── warc │ │ ├── WarcEntryParser.scala │ │ ├── WarcInputFormat.scala │ │ ├── WarcLoader.scala │ │ ├── WarcRecord.scala │ │ └── WarcWritable.scala │ └── test │ ├── resources │ ├── docs │ │ ├── links.html │ │ ├── paragraph_detect.html │ │ ├── perldoc_ja.html │ │ └── perldoc_ja_small.html │ ├── lang │ │ └── shift_jis.txt │ ├── pipeline │ │ └── doc_len.conf │ └── text │ │ └── dedup_docomo.txt │ └── scala │ └── com │ └── worksap │ └── nlp │ └── uzushio │ └── lib │ ├── cleaning │ ├── DocumentSpec.scala │ ├── ParagraphSpec.scala │ ├── PathSegmentSpec.scala │ └── PipelineSpec.scala │ ├── dupes │ └── CandidateRowProcessorSpec.scala │ ├── filters │ ├── AdjacentDuplicateParagraphsSpec.scala │ ├── CompressionRateSpec.scala │ ├── DeduplicateDocumentsSpec.scala │ ├── LargeFreqParagraphsSpec.scala │ ├── LinkCharRatioSpec.scala │ ├── MarkdownizeHeadingSpec.scala │ ├── MergeListTagSpec.scala │ ├── NoContentDOMSpec.scala │ ├── WordInstancesSpec.scala │ └── package.scala │ ├── html │ └── HtmlParserSpec.scala │ ├── lang │ └── LangEstimationSpec.scala │ ├── runners │ └── MergeStatsSpec.scala │ ├── utils │ ├── ClasspathAccess.scala │ ├── MathUtilTest.scala │ ├── ParagraphsSpec.scala │ ├── RowBufferSpec.scala │ ├── SentenceIteratorSpec.scala │ └── TrieSpec.scala │ └── warc │ └── WarcEntryParserSpec.scala ├── project ├── Build.scala ├── build.properties └── plugins.sbt ├── scripts ├── cal_overlap_ratio │ ├── README.md │ ├── cal_overlap.py │ └── visualize.py ├── count_filter_statistics.py ├── count_tokens.py ├── pipeline_01.conf ├── pipeline_02.conf ├── pipeline_03a.conf ├── pipeline_test_perplexity.conf ├── submit_all_compute_stats.sh ├── submit_all_compute_stats_old.sh ├── submit_all_filter.sh ├── submit_all_merges_stage1.sh ├── submit_all_merges_stage2.sh ├── submit_calc_overlap.sh ├── submit_dedup_stage1.sh ├── submit_dedup_stats.sh ├── submit_filter_debug.sh ├── submit_filter_debug_2.sh ├── submit_kenlm.sh ├── submit_merge_stats.sh ├── submit_merge_stats_final.sh └── vis │ └── vis_filter.py └── spark-config └── abci-f ├── spark-defaults.conf └── spark-env.sh /.gitattributes: -------------------------------------------------------------------------------- 1 | *.txt text eol=lf 2 | *.html text eol=lf -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: 3 | push: 4 | paths: 5 | - main 6 | pull_request: 7 | jobs: 8 | build: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v4 12 | - uses: actions/setup-java@v4 13 | with: 14 | distribution: temurin 15 | java-version: 17 16 | cache: sbt 17 | - name: Setup sbt launcher 18 | uses: sbt/setup-sbt@v1 19 | - run: sbt "scalafmtCheck;test" 20 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # sbt 2 | target/ 3 | .bloop/ 4 | .bsp/ 5 | .idea/ 6 | .venv/ 7 | 8 | # metal 9 | .metals/ 10 | metals.sbt 11 | 12 | # ignore abci logs for shell scripts 13 | *.sh.o* 
-------------------------------------------------------------------------------- /.jvmopts: -------------------------------------------------------------------------------- 1 | -Xmx2G 2 | --add-exports=java.base/sun.nio.ch=ALL-UNNAMED -------------------------------------------------------------------------------- /.scalafmt.conf: -------------------------------------------------------------------------------- 1 | version = "3.7.12" 2 | runner.dialect = scala213 3 | project.excludePaths = [ 4 | "glob:**/legacy/**/**.scala" 5 | ] 6 | 7 | maxColumn = 100 8 | align.preset = none 9 | 10 | newlines.avoidForSimpleOverflow = [punct] 11 | newlines.beforeMultiline = fold 12 | newlines.selectChains = fold 13 | 14 | rewrite.rules = [RedundantBraces, RedundantParens, SortModifiers] 15 | rewrite.redundantBraces.stringInterpolation = true 16 | rewrite.redundantBraces.generalExpressions = false 17 | rewrite.redundantBraces.defnBodies = noParams 18 | runner.optimizer.forceConfigStyleMinArgCount = 4 19 | 20 | rewrite.trailingCommas.style = "keep" -------------------------------------------------------------------------------- /bench/src/main/java/test/Log10Bench.java: -------------------------------------------------------------------------------- 1 | package test; 2 | 3 | import org.apache.commons.math3.util.FastMath; 4 | import org.openjdk.jmh.annotations.Benchmark; 5 | import org.openjdk.jmh.annotations.Scope; 6 | import org.openjdk.jmh.annotations.Setup; 7 | import org.openjdk.jmh.annotations.State; 8 | 9 | import java.util.Random; 10 | 11 | @State(Scope.Benchmark) 12 | public class Log10Bench { 13 | double[] arg; 14 | 15 | @Setup 16 | public void setup() { 17 | double[] arr = new double[10000]; 18 | Random rng = new Random(42L); 19 | for (int i = 0; i < 10000; ++i) { 20 | arr[i] = rng.nextInt(10000) + 1; 21 | } 22 | arg = arr; 23 | } 24 | 25 | 26 | @Benchmark 27 | public double bulitin() { 28 | double result = 0; 29 | double[] arr = arg; 30 | for (int i = 0; i < 10000; i++) { 31 | double v = arr[i]; 32 | result += Math.log10(v); 33 | } 34 | return result; 35 | } 36 | 37 | @Benchmark 38 | public double fastMath() { 39 | double result = 0; 40 | double[] arr = arg; 41 | for (int i = 0; i < 10000; i++) { 42 | double v = arr[i]; 43 | result += FastMath.log10(v); 44 | } 45 | return result; 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /bench/src/main/java/test/SqrtBench.java: -------------------------------------------------------------------------------- 1 | package test; 2 | 3 | import org.apache.commons.math3.util.FastMath; 4 | import org.openjdk.jmh.annotations.Benchmark; 5 | import org.openjdk.jmh.annotations.Scope; 6 | import org.openjdk.jmh.annotations.Setup; 7 | import org.openjdk.jmh.annotations.State; 8 | 9 | import java.util.Random; 10 | 11 | @State(Scope.Benchmark) 12 | public class SqrtBench { 13 | double[] arg; 14 | 15 | @Setup 16 | public void setup() { 17 | double[] arr = new double[10000]; 18 | Random rng = new Random(42L); 19 | for (int i = 0; i < 10000; ++i) { 20 | arr[i] = rng.nextInt(10000) + 1; 21 | } 22 | arg = arr; 23 | } 24 | 25 | 26 | @Benchmark 27 | public double bulitin() { 28 | double result = 0; 29 | double[] arr = arg; 30 | for (int i = 0; i < 10000; i++) { 31 | double v = arr[i]; 32 | result += Math.sqrt(v); 33 | } 34 | return result; 35 | } 36 | 37 | @Benchmark 38 | public double fastMath() { 39 | double result = 0; 40 | double[] arr = arg; 41 | for (int i = 0; i < 10000; i++) { 42 | double v = arr[i]; 43 | result += 
FastMath.sqrt(v); 44 | } 45 | return result; 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | import Build._ 2 | 3 | inThisBuild( 4 | Seq( 5 | scalaVersion := V.scala212, 6 | organization := "com.worksap", 7 | organizationName := "Works Applications", 8 | startYear := Some(2023), 9 | homepage := Some(url("https://github.com/WorksApplications/uzushio")), 10 | versionScheme := Some("early-semver"), 11 | developers := List( 12 | Developer( 13 | "eiennohito", 14 | "Arseny Tolmachev", 15 | "arseny@kotonoha.ws", 16 | url("https://github.com/eiennohito") 17 | ) 18 | ) 19 | ) 20 | ) 21 | lazy val commonSettings = Seq( 22 | crossScalaVersions := Seq(V.scala212), 23 | scalacOptions ++= Seq( 24 | "-feature", 25 | "-deprecation", 26 | "-unchecked", 27 | "-encoding", 28 | "utf-8" 29 | ), 30 | javacOptions ++= Seq( 31 | "-encoding", 32 | "utf8", 33 | "-Xlint:all", 34 | "-source", 35 | "1.8", 36 | "-target", 37 | "1.8" 38 | ) 39 | ) 40 | 41 | disablePlugins(sbtassembly.AssemblyPlugin) 42 | 43 | lazy val root = (project in file(".")) 44 | .aggregate( 45 | lib, 46 | core, 47 | legacy 48 | ) 49 | .settings( 50 | name := "uzushio-root" 51 | ) 52 | .settings(noPublishSettings) 53 | .settings(commonSettings) 54 | 55 | lazy val legacy = (project in file("legacy")) 56 | .disablePlugins(sbtassembly.AssemblyPlugin) 57 | .dependsOn(lib) 58 | .settings( 59 | libraryDependencies ++= sparkDependencies.map(_ % Provided) 60 | ) 61 | 62 | lazy val core = (project in file("core")) 63 | .enablePlugins(sbtassembly.AssemblyPlugin) 64 | .settings( 65 | name := "uzushio", 66 | libraryDependencies ++= sparkDependencies.map( 67 | _ % Provided 68 | ) 69 | ) 70 | .settings(commonSettings) 71 | .settings(lintSettings) 72 | .settings(assemblySettings) 73 | .dependsOn(lib) 74 | 75 | lazy val lib = (project in file("lib")) 76 | .disablePlugins(sbtassembly.AssemblyPlugin) 77 | .settings( 78 | name := "uzushio-lib", 79 | libraryDependencies ++= sparkDependencies.map(_ % Optional), 80 | libraryDependencies ++= libdependencies, 81 | scalacOptions ++= ( 82 | if (scalaVersion.value.startsWith("2.")) { 83 | Seq("-opt:l:inline", "-opt-inline-from:classpath") 84 | } else { 85 | Seq.empty 86 | } 87 | ), 88 | ) 89 | .settings(commonSettings) 90 | .settings(lintSettings) 91 | .settings(scalaCompatSettings) 92 | 93 | lazy val bench = (project in file("bench")) 94 | .disablePlugins(sbtassembly.AssemblyPlugin) 95 | .enablePlugins(JmhPlugin) 96 | .settings(commonSettings) 97 | .settings(noPublishSettings) 98 | .dependsOn(lib) 99 | -------------------------------------------------------------------------------- /core/src/main/resources/chitra.conf: -------------------------------------------------------------------------------- 1 | { 2 | "stages": [ 3 | {"class": "SplitIntoSentence"}, 4 | {"class": "RemoveWikipediaCitation"}, 5 | {"class": "NormalizeCharacter", "keepWS": false}, 6 | {"class": "NormalizeWhitespace"}, 7 | {"class": "ConcatShortSentence", "concatThr": 2}, 8 | {"class": "RemoveEmail"}, 9 | {"class": "RemoveURL"}, 10 | {"class": "FilterBySentenceLength", "min":10, "max": 200}, 11 | {"class": "RemoveShortDocument", "min": 5}, 12 | {"class": "RemoveScriptDocument"}, 13 | {"class": "RemoveNGWordDocument", "path": "ng_words.txt"}, 14 | ], 15 | } 16 | -------------------------------------------------------------------------------- /core/src/main/resources/ng_words.txt: 
-------------------------------------------------------------------------------- 1 | fuck 2 | g スポット 3 | sm女王 4 | tenga 5 | あばずれ 6 | あぱずれ 7 | あほ 8 | うざ 9 | うんこ 10 | え〇 11 | えっち 12 | おしっこ 13 | おしりのあな 14 | おっぱい 15 | おもらし 16 | かたわ 17 | きちがい 18 | きめぇ 19 | きめえ 20 | くそ 21 | せんずり 22 | ち〇 23 | ちんぐり 24 | ちんこ 25 | つるぺた 26 | つんぼ 27 | ふたなり 28 | ぶさいく 29 | ぶす 30 | ま〇 31 | まんぐり 32 | まんこ 33 | めくら 34 | やりまん 35 | アスペ 36 | アスホール 37 | アナリングス 38 | アナル 39 | アヌス 40 | アバズレ 41 | アパズレ 42 | アホ 43 | イマラチオ 44 | イメクラ 45 | イラマチオ 46 | ウザ 47 | ウンコ 48 | エ〇 49 | エッチ 50 | エロ 51 | オカマ 52 | オッパイ 53 | オナ 54 | オナニー 55 | オフパコ 56 | オマンコ 57 | オルガズム 58 | オーガズム 59 | カス 60 | ガイジ 61 | キチガイ 62 | キモ 63 | クズ 64 | クソ 65 | クリトリス 66 | クンニ 67 | クンニリングス 68 | グループ・セックス 69 | グロ 70 | ゲイボーイ 71 | ゲイ・セックス 72 | ゲロ 73 | コカイン 74 | コキ 75 | コンドーム 76 | ザーメン 77 | シコ 78 | ショタ 79 | スカトロ 80 | スケベ 81 | ストリップ劇場 82 | スマタ 83 | セクロス 84 | セックス 85 | セフレ 86 | センズリ 87 | ダッチワイフ 88 | チ〇 89 | テレフォンセックス 90 | ディルド 91 | ディープ・スロート 92 | デブ 93 | デリヘル 94 | デートレイプ 95 | ドキュン 96 | ナマポ 97 | ニガー 98 | ヌい 99 | ヌく 100 | ヌけ 101 | ネオ・ナチ 102 | ハメ撮り 103 | パイズリ 104 | パイパン 105 | パンチラ 106 | パンティー 107 | ビッチ 108 | ピロートーク 109 | ファック 110 | フェラ 111 | フェラチオ 112 | ブサイク 113 | ブス 114 | プリンス アルバート ピアス 115 | ペッティング 116 | ペニス 117 | ペニスバンド 118 | ホモ 119 | ボンテージ 120 | ボールギャグ 121 | ポルノグラフィー 122 | マ〇 123 | マザー・ファッカー 124 | マスターベーション 125 | マラ 126 | マンコ 127 | ヤラせ 128 | ラブホ 129 | リスカ 130 | リストカット 131 | リョナ 132 | リンチ 133 | レイプ 134 | レズ 135 | 不細工 136 | 中出し 137 | 乱交 138 | 二穴 139 | 人妻 140 | 側位 141 | 児童性虐待 142 | 前戯 143 | 勃起する 144 | 合いの子 145 | 四十八手 146 | 売り専 147 | 売国 148 | 売女 149 | 売春婦 150 | 外人 151 | 夢精 152 | 大人のおもちゃ 153 | 大人のオモチャ 154 | 大人の玩具 155 | 大陰唇 156 | 射精 157 | 尻軽 158 | 尿道プレイ 159 | 巨乳 160 | 巨根 161 | 強姦犯 162 | 後戯 163 | 後背位 164 | 手コキ 165 | 手マン 166 | 援交 167 | 援助交際 168 | 支那 169 | 新しいポルノ 170 | 正常位 171 | 殺し方 172 | 殺人方法 173 | 氏ね 174 | 氏んだ 175 | 氏んで 176 | 気違い 177 | 池沼 178 | 淫乱 179 | 潮吹き女 180 | 潮吹き男性 181 | 熟女 182 | 獣姦 183 | 玉なめ 184 | 玉舐め 185 | 男根 186 | 痴呆 187 | 穴兄弟 188 | 竿姉妹 189 | 筆おろし 190 | 精液 191 | 糞便 192 | 糞尿愛好症 193 | 素股 194 | 緊縛 195 | 老害 196 | 肉便器 197 | 自慰 198 | 裸の女性 199 | 貞操帯 200 | 賢者タイム 201 | 足フェチ 202 | 輪姦 203 | 近親相姦 204 | 阿呆 205 | 陰毛 206 | 電マ 207 | 顔射 208 | 顔面騎乗 209 | 騎上位 210 | 騎乗位 -------------------------------------------------------------------------------- /core/src/main/resources/reference.conf: -------------------------------------------------------------------------------- 1 | // this conf contains the default values. 2 | { 3 | // list of pipeline stages. 4 | "stages": [ 5 | {"class": "Identity"}, 6 | ], 7 | "input": { 8 | // text or parquet. 9 | "format": "text", 10 | // Delimiter of documents (text). 11 | "delimiter": "\n\n", 12 | // Name of the document column (parquet). 13 | "column": "document", 14 | }, 15 | "output": { 16 | // text or parquet. 17 | "format": "text", 18 | // Delimiter of documents (text). 19 | "delimiter": "\n\n", 20 | // Name of the document column (parquet). 21 | "column": "document", 22 | // Delimiter of elements e.g. paragraph, sentence. 
23 | "elementDelimiter": "\n", 24 | }, 25 | } 26 | -------------------------------------------------------------------------------- /core/src/main/resources/rmTemplate.conf: -------------------------------------------------------------------------------- 1 | { 2 | "stages": [ 3 | {"class": "DeduplicateElement"}, // deduplicate per document 4 | {"class": "SplitIntoSentence"}, 5 | {"class": "DeduplicateRepeatingSentence", "minRepeat": 2}, 6 | {"class": "RemoveSubstring", 7 | "path": "template_sentences.txt", 8 | "delim": "\n\n", // template_sentences contains multi-sentence pattern. 9 | "matchSentence": true}, // match full sentence only. 10 | {"class": "RemoveShortDocument", "min": 5}, 11 | ], 12 | } 13 | -------------------------------------------------------------------------------- /core/src/main/resources/sudachiDictCorpus.conf: -------------------------------------------------------------------------------- 1 | { 2 | "stages": [ 3 | {"class": "SplitIntoSentence"}, 4 | {"class": "NormalizeCharacter", "keepWS": true}, 5 | {"class": "NormalizeWhitespace"}, 6 | {"class": "DeduplicateElement"}, // deduplicate per sentence 7 | ], 8 | "output": { 9 | "delimiter": "\n", // concat documents 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /core/src/main/resources/template_sentences.txt: -------------------------------------------------------------------------------- 1 | 管理人のみ閲覧できます 2 | 3 | このコメントは管理人のみ閲覧できます 4 | 5 | 管理者にだけ表示を許可する 6 | 7 | このコメントは管理者の承認待ちです 8 | 9 | 管理人の承認後に表示されます 10 | 11 | 気になるリストに保存 この求人の詳細をみる 12 | 13 | 最新コメントのRSS 14 | 15 | 最新トラックバックのRSS 16 | 17 | この広告は60日以上更新がないブログに表示がされております 18 | 19 | この広告は60日以上更新がないブログに表示がされております 20 | 以下のいずれかの方法で非表示にすることが可能です 21 | ・記事の投稿、編集をおこなう 22 | ・マイブログの【設定】 >【広告設定】 より、「60日間更新が無い場合」 の 「広告を表示しない」にチェックを入れて保存する 23 | 24 | 上記の広告は1ヶ月以上更新のないブログに表示されています 25 | 新しい記事を書く事で広告が消せます 26 | 27 | 上記広告は1ヶ月以上更新のないブログに表示されています 28 | 新しい記事を書くことで広告を消せます 29 | 30 | [PR]この広告は3ヶ月以上更新がないため表示されています 31 | ホームページを更新後24時間以内に表示されなくなります 32 | 33 | ブロとも申請フォーム 34 | 35 | ■ブロとも申請フォーム 36 | 37 | ■ ブロとも申請フォーム 38 | 39 | この人とブロともになる 40 | 41 | この記事にトラックバックする(FC2ブログユーザー) 42 | 43 | この記事に対してトラックバックを送信する(FC2ブログユーザー) 44 | 45 | FC2ブログユーザー専用トラックバックURLはこちら 46 | 47 | トラックバックURLはこちら 48 | 49 | この記事のトラックバックURL 50 | 51 | この記事へのトラックバック 52 | 53 | この記事へのトラックバックURL 54 | 55 | 最近のトラックバック 56 | 57 | ■最近のトラックバック 58 | 59 | ■ 最近のトラックバック 60 | 61 | ※ブログオーナーが承認したトラックバックのみ表示されます 62 | 63 | ※言及リンクのないトラックバックは受信されません 64 | 65 | コメントをする・見る 66 | 67 | トラックバックする・見る 68 | 69 | スマートフォン専用ページを表示 70 | 71 | このブログをリンクに追加する 72 | 73 | このページのトップへ 74 | 75 | ページのトップへ戻る 76 | 77 | FC2ブログへようこそ 78 | 79 | 自分のブログにトラックバック記事作成(会員用) 80 | 81 | この記事へのコメント 82 | 83 | この記事に対するコメント 84 | 85 | この記事に対するコメントの投稿 86 | 87 | この記事に対するトラックバック 88 | 89 | ※画像の中の文字を半角で入力してください 90 | 91 | お名前: [必須入力] 92 | 93 | クリックして気持ちを伝えよう 94 | ログインしてクリックすれば、自分のブログへのリンクが付きます 95 | 96 | 最近の記事+コメント 97 | 98 | アクセスランキングを見る>> 99 | 100 | さらに詳しい情報はコチラ 101 | 102 | このブログの読者になる 103 | 104 | 更新情報をチェックする 105 | 106 | 同じテーマのブログ記事 107 | 108 | 開始・終了時間は直接の確認をおすすめします 109 | 110 | 閲覧するには管理人が設定した 111 | パスワードの入力が必要です 112 | 113 | 管理人からのメッセージ 114 | 115 | ブログ画像一覧を見る 116 | 117 | このブログの読者になる(チェック) 118 | 119 | アメーバブログトップへ 120 | 121 | ※著作権についてのご注意 122 | 123 | ブログのトップページへ 124 | 125 | 最新の記事一覧ページへ 126 | 127 | このブログの更新情報が届きます 128 | 129 | 自分のランキングを詳しく見る>> 130 | 131 | 人気ブログランキングトップへ 132 | 133 | このブログはランキングに参加していません 134 | 135 | このブログにコメントするにはログインが必要です 136 | 137 | この記事には許可ユーザしかコメントができません 138 | 139 | 読者になると、このブログの更新情報が届きます 140 | 141 | ブログの更新情報が受け取れて、アクセスが簡単になります 142 | 143 | このページの先頭へ▲ 
144 | 145 | [ コメント記入欄を表示 ] 146 | 147 | 人気ブログランキングへ 148 | 149 | このBlogのトップへ│前の記事│次の記事 150 | 151 | このブログはジャンルランキングに参加していません 152 | 153 | アメーバID登録して、ブログをつくろう 154 | 155 | 本ブログパーツの提供を終了しました 156 | 157 | この記事は削除されているか、 158 | または未来記事設定(現日時以降の公開)された記事のため表示できません 159 | 160 | 前の記事│このブログのトップへ│次の記事 161 | 162 | FLO:Qで世界にひとつだけのブログパーツを作ろう 163 | 164 | ブログの説明を入力します 165 | 166 | あなたもピュアブログでブログをつくりませんか 167 | 168 | あなたもエコ・ブログでブログをつくりませんか 169 | 170 | 「気になる」をクリックで 171 | 回答がついた時に通知でお知らせします 172 | 173 | ブログやるならFC2ブログ 174 | 175 | 無料ブログはココログ 176 | 177 | この広告は1年以上新しい記事の投稿がないブログに表示されております 178 | 179 | 掲載情報の著作権は提供元企業等に帰属します 180 | 181 | こんにちはゲストさん 182 | 183 | 会員登録(無料)して質問・回答してみよう 184 | 185 | 管理者にだけ表示を許可 186 | 187 | FC2ブックマークに追加する 188 | 189 | この記事にトラックバックする(FC2ブログユーザー限定) 190 | 191 | この記事を 編集・削除 192 | 193 | ※コメント書き込みは制限されています 194 | 195 | ブックマークに登録する 196 | 197 | ※ブログオーナーが承認したコメントのみ表示されます 198 | 199 | この記事にトラックバック 200 | 201 | この記事にトラックバック(FC2ブログユーザー) 202 | 203 | このブログをマイリストに追加 204 | 205 | この記事へのコメント一覧 206 | 207 | コメントは新しいものから表示されます 208 | 209 | 編集・削除するのに必要 210 | 管理者だけにコメントを表示 211 | 212 | -------------------------------------------------------------------------------- /core/src/main/resources/warc.conf: -------------------------------------------------------------------------------- 1 | { 2 | "stages": [ 3 | // warc postprocess 4 | {"class": "SplitIntoParagraph"}, 5 | {"class": "FilterJapaneseBasedOnCharacter", "kanaRate": 0.05, "jpRate": 0.7}, 6 | {"class": "DeduplicateElement"}, // deduplicate per paragraph 7 | {"class": "RemoveShortDocument", "min": 5}, 8 | // chitra preprocess 9 | {"class": "SplitIntoSentence"}, 10 | {"class": "RemoveWikipediaCitation"}, 11 | {"class": "NormalizeCharacter", "keepWS": false}, 12 | {"class": "NormalizeWhitespace"}, 13 | {"class": "ConcatShortSentence", "concatThr": 2}, 14 | {"class": "RemoveEmail"}, 15 | {"class": "RemoveURL"}, 16 | {"class": "FilterBySentenceLength", "min":10, "max": 200}, 17 | {"class": "RemoveShortDocument", "min": 5}, 18 | {"class": "RemoveScriptDocument"}, 19 | {"class": "RemoveNGWordDocument", "path": "ng_words.txt"}, 20 | // remove template 21 | {"class": "DeduplicateRepeatingSentence", "minRepeat": 2}, 22 | {"class": "RemoveSubstring", 23 | "path": "template_sentences.txt", 24 | "delim": "\n\n", // template_sentences contains multi-sentence pattern. 25 | "matchSentence": true}, // match full sentence only. 
26 | {"class": "RemoveShortDocument", "min": 5}, 27 | ], 28 | "input": { 29 | "format": "parquet", 30 | "column": "document", 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /core/src/main/scala/com/worksap/nlp/uzushio/DocumentIO.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio 2 | 3 | import java.nio.file.{Path, Paths} 4 | import org.rogach.scallop.ScallopConf 5 | 6 | import org.apache.spark.sql.{SparkSession, DataFrame} 7 | import org.apache.spark.sql.functions.{expr, monotonically_increasing_id} 8 | 9 | object DocumentIO { 10 | val idxCol = "documentId" 11 | val docCol = "document" 12 | 13 | private class Conf(args: Seq[String]) extends ScallopConf(args) { 14 | // args `--input ./hoge.md ./*.txt` will be parsed like 15 | // List(./hoge.md, ./fuga.txt, ./piyo.txt) 16 | val input = opt[List[Path]](required = true) 17 | val output = opt[Path](default = Some(Paths.get("./out"))) 18 | verify() 19 | } 20 | 21 | def run(spark: SparkSession, conf: Conf): Unit = { 22 | val docs = loadRawDocuments(spark, conf.input()) 23 | val docWithIdx = addIndex(docs) 24 | saveIndexedDocuments(docWithIdx, conf.output()) 25 | } 26 | 27 | def main(args: Array[String]): Unit = { 28 | val conf = new Conf(args) 29 | val spark = SparkSession.builder().appName("DocumentIO").getOrCreate() 30 | 31 | try { run(spark, conf) } 32 | finally { spark.stop() } 33 | } 34 | 35 | def addIndex( 36 | dataframe: DataFrame, 37 | idxColName: String = idxCol 38 | ): DataFrame = { 39 | // add index column 40 | dataframe.withColumn(idxColName, monotonically_increasing_id) 41 | } 42 | 43 | def formatPathList(paths: Seq[Path]): Seq[Path] = { 44 | // align list to fix the order of file load (todo: check if necessary) 45 | paths.distinct.sorted 46 | } 47 | 48 | def saveRawDocuments( 49 | documents: DataFrame, 50 | output: Path, 51 | docCol: String = docCol, 52 | sep: String = "\n\n" 53 | ): Unit = { 54 | documents.select(docCol).write.option("lineSep", sep).text(output.toString) 55 | } 56 | 57 | def loadRawDocuments( 58 | spark: SparkSession, 59 | input: Seq[Path], 60 | sep: String = "\n\n" 61 | ): DataFrame = { 62 | // load document data. 63 | // 64 | // Assumes each input file contains multiple documents, 65 | // and they are separated by `sep` (by default two empty lines). 
66 | val paths = formatPathList(input).map(_.toString) 67 | spark.read.option("lineSep", sep).text(paths: _*).filter(r => r.getAs[String](0).trim != "") 68 | .select(expr(s"value as $docCol")) 69 | } 70 | 71 | def saveIndexedDocuments( 72 | dataframe: DataFrame, 73 | output: Path, 74 | idxColName: String = idxCol, 75 | docColName: String = docCol, 76 | format: String = "parquet" 77 | ): Unit = { 78 | val data = dataframe.select( 79 | expr(s"$idxColName as $idxCol"), 80 | expr(s"$docColName as $docCol") 81 | ) 82 | 83 | data.write.format(format).save(output.toString) 84 | } 85 | 86 | def loadIndexedDocuments( 87 | spark: SparkSession, 88 | input: Seq[Path], 89 | format: String = "parquet" 90 | ): DataFrame = { 91 | // Assumes the schema of the files is the same as the `saveIndexedDocuments` output 92 | val paths = formatPathList(input).map(_.toString) 93 | spark.read.format(format).load(paths: _*) 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /core/src/main/scala/com/worksap/nlp/uzushio/Sudachi.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio 2 | 3 | import java.nio.file.Paths 4 | import com.worksap.nlp.sudachi.{DictionaryFactory, Tokenizer, Config} 5 | 6 | object Sudachi { 7 | def parseSplitMode(mode: String): Tokenizer.SplitMode = { 8 | // Parse sudachi SplitMode from a string. 9 | mode.capitalize match { 10 | case "A" => Tokenizer.SplitMode.A 11 | case "B" => Tokenizer.SplitMode.B 12 | case _ => Tokenizer.SplitMode.C 13 | } 14 | } 15 | 16 | def setupSudachiTokenizer(): Tokenizer = { 17 | // create a sudachi Tokenizer instance. 18 | // system_core.dic must be in cwd. 19 | // TODO: load config file 20 | val dictPath = Paths.get("system_core.dic") 21 | val conf = Config.defaultConfig().systemDictionary(dictPath) 22 | new DictionaryFactory().create(conf).create() 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /core/src/main/scala/com/worksap/nlp/uzushio/SudachiTokenizer.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio 2 | 3 | import collection.JavaConverters._ 4 | 5 | import org.apache.spark.sql.{SparkSession, DataFrame, Dataset, Row} 6 | import org.apache.spark.sql.types._ 7 | import org.apache.spark.ml.Transformer 8 | import org.apache.spark.ml.param.{Param, ParamMap} 9 | import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} 10 | import org.apache.spark.ml.util.Identifiable 11 | 12 | /** Tokenizer based on Sudachi. 13 | * 14 | * The input col should contain a document (a String consisting of "\n"-delimited sentences). 15 | * SudachiTokenizer runs sudachi on each sentence and returns the concatenated array of surfaces. 16 | */ 17 | class SudachiTokenizer(override val uid: String) 18 | extends Transformer 19 | with HasInputCol 20 | with HasOutputCol { 21 | def this() = this(Identifiable.randomUID("sudachiTokenizer")) 22 | 23 | override def copy(extra: ParamMap) = defaultCopy(extra) 24 | 25 | def outputDataType = new ArrayType(StringType, true) 26 | 27 | def setInputCol(value: String) = set(inputCol, value) 28 | def setOutputCol(value: String) = set(outputCol, value) 29 | 30 | // sudachi split mode. 
31 | val splitMode: Param[String] = new Param( 32 | this, 33 | "splitMode", 34 | "sudachi split mode (A/B/C)", 35 | (c: String) => { 36 | c.length == 1 && "aAbBcC".contains(c) 37 | } 38 | ) 39 | def setSplitMode(value: String): this.type = set(splitMode, value) 40 | def getSplitMode: String = $(splitMode) 41 | 42 | setDefault(splitMode -> "C") 43 | 44 | override def transformSchema(schema: StructType): StructType = { 45 | val inputType = schema($(inputCol)).dataType 46 | require( 47 | inputType == StringType, 48 | s"Input type must be ${StringType.catalogString} type but got ${inputType.catalogString}." 49 | ) 50 | 51 | if (schema.fieldNames.contains($(outputCol))) { 52 | throw new IllegalArgumentException( 53 | s"Output column ${$(outputCol)} already exists." 54 | ) 55 | } 56 | val outputFields = schema.fields :+ 57 | StructField($(outputCol), outputDataType, nullable = false) 58 | StructType(outputFields) 59 | } 60 | 61 | override def transform(dataset: Dataset[_]): DataFrame = { 62 | val outputSchema = transformSchema(dataset.schema) 63 | 64 | val mode = Sudachi.parseSplitMode($(splitMode)) 65 | val tokenized = dataset.toDF.rdd.mapPartitions(iter => { 66 | val tok = Sudachi.setupSudachiTokenizer() 67 | 68 | iter.map(row => { 69 | val tokens = row.getAs[String]($(inputCol)).split("\n") 70 | .flatMap(sent => tok.tokenize(mode, sent).asScala.map(_.surface())) 71 | 72 | Row(row.toSeq :+ tokens: _*) 73 | }) 74 | }) 75 | 76 | dataset.sparkSession.createDataFrame(tokenized, outputSchema) 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /core/src/main/scala/com/worksap/nlp/uzushio/TokenHasher.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio 2 | 3 | import collection.JavaConverters._ 4 | 5 | import org.apache.spark.sql.{SparkSession, DataFrame, Dataset, Row} 6 | import org.apache.spark.sql.types._ 7 | import org.apache.spark.ml.UnaryTransformer 8 | import org.apache.spark.ml.util.Identifiable 9 | import org.apache.spark.ml.linalg.{Vectors, VectorUDT} 10 | 11 | /**/ 12 | class TokenHasher(override val uid: String) 13 | extends UnaryTransformer[Seq[String], Seq[Long], TokenHasher] { 14 | def this() = this(Identifiable.randomUID("TokenHasher")) 15 | 16 | override protected def outputDataType: DataType = new ArrayType(LongType, false) 17 | 18 | override protected def createTransformFunc: Seq[String] => Seq[Long] = 19 | _.iterator.map(hashString).toSet.toSeq 20 | 21 | override protected def validateInputType(inputType: DataType): Unit = { 22 | require( 23 | inputType == ArrayType(StringType, true) || 24 | inputType == ArrayType(StringType, false), 25 | s"Input type must be ${ArrayType(StringType).catalogString} but got " + 26 | inputType.catalogString 27 | ) 28 | } 29 | 30 | def hashString(s: String): Long = { 31 | /* long version of scala String.hashCode */ 32 | s.foldLeft(0L) { case (code, c) => 31 * code + c } 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /core/src/main/scala/com/worksap/nlp/uzushio/cleaning/ConcatShortSentence.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.cleaning 2 | 3 | import com.typesafe.config.ConfigObject 4 | 5 | /** Concat too short sentences to the previous sentence. 
*/ 6 | class ConcatShortSentence(concatThr: Int = 2) 7 | extends DocumentNormalizer 8 | with FieldSettable[ConcatShortSentence] { 9 | override def normalizeDocument(doc: Seq[String]): Seq[String] = { 10 | if (doc.length <= 1) { 11 | doc 12 | } else { 13 | val shortSentIdx = doc.zipWithIndex.map(z => { 14 | if (z._1.length <= concatThr) z._2 else -1 15 | }).filter(_ > 0) // keep first sentence regardless of its length 16 | 17 | val appended = shortSentIdx.reverse.foldLeft(doc)((d, i) => d.updated(i - 1, d(i - 1) + d(i))) 18 | 19 | for (i <- 0 until appended.length if !shortSentIdx.contains(i)) yield appended(i) 20 | } 21 | } 22 | 23 | override def toString(): String = s"${this.getClass.getSimpleName}($concatThr)" 24 | } 25 | 26 | object ConcatShortSentence extends FromConfig { 27 | override def fromConfig(conf: ConfigObject): ConcatShortSentence = { 28 | val args = Map[String, Option[Any]]( 29 | "concatThr" -> conf.getAs[Int]("concatThr") 30 | ).collect { case (k, Some(v)) => k -> v } 31 | 32 | new ConcatShortSentence().setFields(args) 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /core/src/main/scala/com/worksap/nlp/uzushio/cleaning/DeduplicateElement.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.cleaning 2 | 3 | import com.typesafe.config.ConfigObject 4 | import org.apache.spark.sql.Dataset 5 | import org.apache.spark.sql.functions.monotonically_increasing_id 6 | 7 | /** Deduplicate elements of sequences, keeping seq order. */ 8 | class DeduplicateElement extends Transformer { 9 | override def transform(ds: Dataset[Seq[String]]): Dataset[Seq[String]] = { 10 | import ds.sparkSession.implicits._ 11 | 12 | // add indices: (doc_id, elem_id, txt) 13 | val indexed = ds.withColumn("did", monotonically_increasing_id) 14 | .flatMap(r => r.getSeq[String](0).zipWithIndex.map(z => (r.getLong(1), z._2, z._1))) 15 | // drop duplicate paragraphs 16 | val dedup = indexed.dropDuplicates("_3") 17 | // reconstruct documents 18 | dedup.groupByKey(_._1).mapGroups((k, itr) => itr.toSeq.sortBy(_._2).map(_._3)) 19 | } 20 | } 21 | 22 | object DeduplicateElement extends FromConfig { 23 | override def fromConfig(conf: ConfigObject): DeduplicateElement = new DeduplicateElement 24 | } 25 | -------------------------------------------------------------------------------- /core/src/main/scala/com/worksap/nlp/uzushio/cleaning/DeduplicateRepeatingSentence.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.cleaning 2 | 3 | import com.typesafe.config.ConfigObject 4 | 5 | /** Deduplicate same sentences repeating many times. 
6 | */ 7 | class DeduplicateRepeatingSentence(minRep: Int = 2) 8 | extends DocumentNormalizer 9 | with FieldSettable[DeduplicateRepeatingSentence] { 10 | override def normalizeDocument(doc: Seq[String]): Seq[String] = { 11 | var (i, j) = (0, 0) 12 | var indices: Seq[Int] = Vector() 13 | while (i < doc.length) { 14 | j = i + 1 15 | while ((j < doc.length) && (doc(i) == doc(j))) { j += 1 } 16 | 17 | if (i + minRep <= j) { indices :+= i } 18 | else { indices ++= i until j } 19 | i = j 20 | } 21 | for (i <- indices) yield doc(i) 22 | } 23 | 24 | override def toString(): String = s"${this.getClass.getSimpleName}($minRep)" 25 | } 26 | 27 | object DeduplicateRepeatingSentence extends FromConfig { 28 | override def fromConfig(conf: ConfigObject): DeduplicateRepeatingSentence = { 29 | val args = Map[String, Option[Any]]( 30 | "minRep" -> conf.getAs[Int]("minRepeat") 31 | ).collect { case (k, Some(v)) => k -> v } 32 | 33 | new DeduplicateRepeatingSentence().setFields(args) 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /core/src/main/scala/com/worksap/nlp/uzushio/cleaning/FieldSettable.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.cleaning 2 | 3 | /** Set class field via setField method. 4 | * 5 | * T must be the type of the class this is implemented with, e.g. `class MyClass extends 6 | * FieldSettable[MyClass]`. 7 | */ 8 | trait FieldSettable[T] { 9 | def setFields(map: Map[String, Any]): T = { 10 | for ((k, v) <- map) setField(k, v) 11 | this.asInstanceOf[T] 12 | } 13 | 14 | def setField(key: String, value: Any): T = { 15 | this.getClass.getDeclaredFields.find(_.getName == key) match { 16 | case Some(field) => { 17 | field.setAccessible(true) 18 | field.set(this, value) 19 | } 20 | case None => throw new IllegalArgumentException(s"No field named $key") 21 | } 22 | this.asInstanceOf[T] 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /core/src/main/scala/com/worksap/nlp/uzushio/cleaning/Filter.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.cleaning 2 | 3 | import org.apache.spark.sql.Dataset 4 | 5 | /** Filters documents with specific condition. */ 6 | abstract class DocumentFilter extends Transformer { 7 | 8 | /** Determines if the document should be kept or not. */ 9 | def isFiltered(doc: Seq[String]): Boolean 10 | 11 | override def transform(ds: Dataset[Seq[String]]): Dataset[Seq[String]] = { 12 | ds.filter(isFiltered(_)) 13 | } 14 | } 15 | 16 | /** Filters sentences with specific condition. */ 17 | abstract class SentenceFilter extends Transformer { 18 | 19 | /** Determines if the sentence should be kept or not. */ 20 | def isFiltered(sent: String): Boolean 21 | 22 | override def transform(ds: Dataset[Seq[String]]): Dataset[Seq[String]] = { 23 | import ds.sparkSession.implicits._ 24 | ds.map(_.filter(isFiltered)).filter(_.length > 0) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /core/src/main/scala/com/worksap/nlp/uzushio/cleaning/FilterBySentenceLength.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.cleaning 2 | 3 | import com.typesafe.config.ConfigObject 4 | 5 | /** Filters sentences that are too short or too long. 6 | * 7 | * @constructor 8 | * create a new filter. 
9 | * @param min 10 | * the minimum number of characters a sentence should contain 11 | * @param max 12 | * the maximum number of characters a sentence should contain 13 | */ 14 | class FilterBySentenceLength(min: Int = 10, max: Int = 200) 15 | extends SentenceFilter 16 | with FieldSettable[FilterBySentenceLength] { 17 | override def isFiltered(sent: String): Boolean = { 18 | min <= sent.length && sent.length <= max 19 | } 20 | 21 | override def toString(): String = s"${this.getClass.getSimpleName}($min, $max)" 22 | } 23 | 24 | object FilterBySentenceLength extends FromConfig { 25 | override def fromConfig(conf: ConfigObject): FilterBySentenceLength = { 26 | val args = Map[String, Option[Any]]( 27 | "min" -> conf.getAs[Int]("min"), 28 | "max" -> conf.getAs[Int]("max") 29 | ).collect { case (k, Some(v)) => k -> v } 30 | 31 | new FilterBySentenceLength().setFields(args) 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /core/src/main/scala/com/worksap/nlp/uzushio/cleaning/FilterJapaneseBasedOnCharacter.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.cleaning 2 | 3 | import com.typesafe.config.ConfigObject 4 | 5 | /** Filters out non-Japanese documents based on the types of characters they contain. 6 | * 7 | * Default thresholds follow nwc-toolkit:text-filter. 8 | * 9 | * @param kanaRate 10 | * texts whose hiragana/katakana ratio is below this value are filtered. 11 | * @param jpRate 12 | * texts whose kana/kanji ratio is below this value are filtered. 13 | */ 14 | class FilterJapaneseBasedOnCharacter( 15 | kanaRate: Double = 0.05, 16 | jpRate: Double = 0.7 17 | ) extends SentenceFilter 18 | with FieldSettable[FilterJapaneseBasedOnCharacter] { 19 | val kanaPattern = """\p{InHiragana}|\p{InKatakana}""".r 20 | val jpCharPattern = """\p{InHiragana}|\p{InKatakana}|\p{InCJKUnifiedIdeographs}""".r 21 | 22 | override def isFiltered(sent: String): Boolean = { 23 | val kanaCount = kanaPattern.findAllIn(sent).length.toDouble 24 | val jpCount = jpCharPattern.findAllIn(sent).length.toDouble 25 | val charCount = sent.length.toDouble 26 | 27 | (kanaCount / charCount) > kanaRate && (jpCount / charCount) > jpRate 28 | } 29 | 30 | override def toString(): String = s"${this.getClass.getSimpleName}($kanaRate, $jpRate)" 31 | } 32 | 33 | object FilterJapaneseBasedOnCharacter extends FromConfig { 34 | override def fromConfig( 35 | conf: ConfigObject 36 | ): FilterJapaneseBasedOnCharacter = { 37 | val args = Map[String, Option[Any]]( 38 | "kanaRate" -> conf.getAs[Double]("kanaRate"), 39 | "jpRate" -> conf.getAs[Double]("jpRate") 40 | ).collect { case (k, Some(v)) => k -> v } 41 | 42 | new FilterJapaneseBasedOnCharacter().setFields(args) 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /core/src/main/scala/com/worksap/nlp/uzushio/cleaning/NormalizeCharacter.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.cleaning 2 | 3 | import com.typesafe.config.ConfigObject 4 | 5 | /** Removes non-printable characters. 6 | * 7 | * Following Python's str.isprintable, removes characters in the Unicode general categories "Other" 8 | * and "Separator", except for space. We also keep surrogate code points (which Python does not). 9 | * 10 | * @param keepWS 11 | * If true, keep whitespace characters other than space (" "), including \u3000. This is not 12 | * Python-compatible behaviour. 
13 | */ 14 | class NormalizeCharacter(keepWS: Boolean = NormalizeCharacter.defaultKeepWS) 15 | extends SentenceNormalizer { 16 | val nonPrintablePattern = 17 | if (keepWS) """[\p{gc=C}\p{gc=Z}&&[^\s \p{gc=Cs}]]""".r 18 | else """[\p{gc=C}\p{gc=Z}&&[^ \p{gc=Cs}]]""".r 19 | 20 | override def normalizeSentence(sent: String): String = { 21 | nonPrintablePattern.replaceAllIn(sent, "") 22 | } 23 | 24 | override def toString(): String = s"${this.getClass.getSimpleName}(keepWS=$keepWS)" 25 | } 26 | 27 | object NormalizeCharacter extends FromConfig { 28 | val defaultKeepWS = false 29 | 30 | override def fromConfig(conf: ConfigObject): NormalizeCharacter = { 31 | val keepWS = conf.getOrElseAs[Boolean]("keepWS", defaultKeepWS) 32 | new NormalizeCharacter(keepWS) 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /core/src/main/scala/com/worksap/nlp/uzushio/cleaning/NormalizeWhitespace.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.cleaning 2 | 3 | import com.typesafe.config.ConfigObject 4 | 5 | /** Removes excess whitespace. */ 6 | class NormalizeWhitespace extends SentenceNormalizer { 7 | val continuousWhitespacePattern = """[\s ]+""".r 8 | 9 | override def normalizeSentence(sent: String): String = { 10 | continuousWhitespacePattern.replaceAllIn(sent, " ") 11 | } 12 | } 13 | 14 | object NormalizeWhitespace extends FromConfig { 15 | override def fromConfig(conf: ConfigObject): NormalizeWhitespace = new NormalizeWhitespace 16 | } 17 | -------------------------------------------------------------------------------- /core/src/main/scala/com/worksap/nlp/uzushio/cleaning/Normalizer.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.cleaning 2 | 3 | import org.apache.spark.sql.Dataset 4 | 5 | /** Normalizes a document in a document-wise way. */ 6 | abstract class DocumentNormalizer extends Transformer { 7 | def normalizeDocument(doc: Seq[String]): Seq[String] 8 | 9 | override def transform(ds: Dataset[Seq[String]]): Dataset[Seq[String]] = { 10 | import ds.sparkSession.implicits._ 11 | ds.map(doc => normalizeDocument(doc)) 12 | } 13 | } 14 | 15 | /** Normalizes a document in a sentence-wise way. */ 16 | abstract class SentenceNormalizer extends DocumentNormalizer { 17 | def normalizeSentence(sent: String): String 18 | 19 | override def normalizeDocument(doc: Seq[String]): Seq[String] = { 20 | doc.map(sent => normalizeSentence(sent)) 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /core/src/main/scala/com/worksap/nlp/uzushio/cleaning/Pipeline.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.cleaning 2 | 3 | import collection.JavaConverters._ 4 | import java.nio.file.{Path, Paths, Files} 5 | import com.typesafe.config.{Config, ConfigFactory, ConfigObject} 6 | import org.apache.spark.sql.Dataset 7 | import java.nio.channels.Pipe 8 | 9 | /** Sequentially apply multiple transformers. 
10 | * 11 | * @param stages 12 | * list of transformers to apply 13 | */ 14 | class Pipeline(private var stages: Seq[Transformer] = Seq()) extends Transformer { 15 | 16 | def setStages(value: Seq[Transformer]): Pipeline = { 17 | stages = value 18 | this 19 | } 20 | 21 | override def transform(ds: Dataset[Seq[String]]): Dataset[Seq[String]] = { 22 | stages.foldLeft(ds)((ds, tr) => tr.transform(ds)) 23 | } 24 | 25 | override def toString(): String = { 26 | s"Pipeline($stages)" 27 | } 28 | } 29 | 30 | object Pipeline { 31 | def fromConfig(conf: Config): Pipeline = { 32 | val stageConfs = conf.getObjectList("stages").asScala.map(_.asInstanceOf[ConfigObject]) 33 | val stages = getStagesFromCompanion(stageConfs) 34 | new Pipeline(stages) 35 | } 36 | 37 | /** Instantiate stages based on the config. Use constructor. */ 38 | private def getStagesFromConstructor(confObjs: Seq[ConfigObject]) = { 39 | confObjs.map(co => { 40 | val name = co.get("class").unwrapped.asInstanceOf[String] 41 | getConstructorOf(name).newInstance(co) 42 | }) 43 | } 44 | 45 | /** Instantiate stages based on the config. Use companion object. */ 46 | private def getStagesFromCompanion(confObjs: Seq[ConfigObject]) = { 47 | confObjs.map(co => { 48 | val name = co.get("class").unwrapped.asInstanceOf[String] 49 | getCompanionOf(name).asInstanceOf[FromConfig].fromConfig(co) 50 | }) 51 | } 52 | 53 | /** Get a constructor of a class from the given name. */ 54 | private def getConstructorOf(name: String) = { 55 | val clz = Class.forName(withClassPrefix(name)) 56 | clz.getConstructor(Class.forName("com.typesafe.config.ConfigObject")) 57 | } 58 | 59 | /** Get a companion object of a class from the given name. */ 60 | private def getCompanionOf(name: String) = { 61 | val clz = Class.forName(withClassPrefix(name)) 62 | clz.getClassLoader.loadClass(clz.getName + "$").getField("MODULE$").get(null) 63 | } 64 | 65 | private val classname = this.getClass.getName() 66 | private val classPrefix = classname.take(classname.lastIndexOf(".")) 67 | 68 | /** Append class name prefix if not exists. 69 | * 70 | * Note: This assume each transformer classes belong to the same package to this class. 71 | */ 72 | private def withClassPrefix(name: String): String = { 73 | if (name.startsWith(classPrefix)) { name } 74 | else { 75 | s"$classPrefix.$name" 76 | } 77 | } 78 | 79 | } 80 | -------------------------------------------------------------------------------- /core/src/main/scala/com/worksap/nlp/uzushio/cleaning/RemoveEmail.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.cleaning 2 | 3 | import com.typesafe.config.ConfigObject 4 | 5 | /** Filters sentences that contain email address. 
*/ 6 | class RemoveEmail extends SentenceFilter { 7 | val emailPattern = """[\w\d_-]+@[\w\d_-]+\.[\w\d._-]+""".r 8 | 9 | override def isFiltered(sent: String): Boolean = { 10 | emailPattern.findFirstIn(sent).isEmpty 11 | } 12 | } 13 | 14 | object RemoveEmail extends FromConfig { 15 | override def fromConfig(conf: ConfigObject): RemoveEmail = new RemoveEmail 16 | } 17 | -------------------------------------------------------------------------------- /core/src/main/scala/com/worksap/nlp/uzushio/cleaning/RemoveNGWordDocument.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.cleaning 2 | 3 | import com.typesafe.config.ConfigObject 4 | import com.worksap.nlp.sudachi.Tokenizer 5 | import com.worksap.nlp.uzushio.Sudachi 6 | 7 | import collection.JavaConverters._ 8 | import java.nio.charset.StandardCharsets 9 | import java.nio.file.{Files, Path, Paths} 10 | import org.apache.spark.sql.Dataset 11 | 12 | import scala.io.Source 13 | 14 | /** Filters documents that contain one of the specified words. 15 | * 16 | * @constructor 17 | * create a new filter with ng-word list. 18 | * @param ngwords 19 | * the set of words which should not appear in the filtered documents 20 | */ 21 | class RemoveNGWordDocument(ngwords: Set[String]) extends Transformer { 22 | val ngwordPattern = s"""(${ngwords.mkString("|")})""".r 23 | val mode = Tokenizer.SplitMode.C 24 | 25 | def containsNgword(tok: Tokenizer, doc: Seq[String]): Boolean = { 26 | for (sent <- doc) { 27 | val matchIter = ngwordPattern.findAllMatchIn(sent) 28 | val (matches, forSize) = matchIter.duplicate 29 | 30 | if (forSize.size != 0) { 31 | try { 32 | val morphmes = tok.tokenize(sent).asScala 33 | val morphBegins = morphmes.map(_.begin()).toSet 34 | val morphEnds = morphmes.map(_.end()).toSet 35 | 36 | for (m <- matches) { 37 | if (morphBegins.contains(m.start) && morphEnds.contains(m.end)) { 38 | return true 39 | } 40 | } 41 | } catch { 42 | case err: Exception => println(s"$sent") 43 | } 44 | } 45 | } 46 | false 47 | } 48 | 49 | override def transform(ds: Dataset[Seq[String]]): Dataset[Seq[String]] = { 50 | import ds.sparkSession.implicits._ 51 | 52 | if (ngwords.size == 0) { ds } 53 | else { 54 | ds.mapPartitions(iter => { 55 | // setup sudachi tokenizer per partition 56 | val tok = Sudachi.setupSudachiTokenizer() 57 | iter.filter(doc => !containsNgword(tok, doc)) 58 | }) 59 | } 60 | } 61 | 62 | override def toString(): String = s"${this.getClass.getSimpleName}(#word=${ngwords.size})" 63 | } 64 | 65 | object RemoveNGWordDocument extends FromConfig { 66 | val defaultPath = "ng_words.txt" 67 | 68 | def fromFile(ngwordsFile: Path): RemoveNGWordDocument = { 69 | val fullstr = new String(Files.readAllBytes(ngwordsFile), StandardCharsets.UTF_8) 70 | new RemoveNGWordDocument( 71 | fullstr.split("\n").map(_.trim).filter(_.nonEmpty).toSet 72 | ) 73 | } 74 | 75 | override def fromConfig(conf: ConfigObject): RemoveNGWordDocument = { 76 | val pathStr = conf.getOrElseAs[String]("path", defaultPath) 77 | 78 | val filepath = Paths.get(pathStr) 79 | if (filepath.toFile.exists) { 80 | fromFile(filepath) 81 | } else { 82 | val fullstr = Source.fromResource(pathStr).mkString 83 | new RemoveNGWordDocument( 84 | fullstr.split("\n").map(_.trim).filter(_.nonEmpty).toSet 85 | ) 86 | } 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /core/src/main/scala/com/worksap/nlp/uzushio/cleaning/RemoveScriptDocument.scala: 
-------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.cleaning 2 | 3 | import com.typesafe.config.ConfigObject 4 | 5 | /** Filters documents that contain script. */ 6 | class RemoveScriptDocument extends DocumentFilter { 7 | val curlyBracketsPattern = """[\{|\}]""".r 8 | 9 | def isFilteredSent(sent: String): Boolean = { 10 | curlyBracketsPattern.findFirstIn(sent).isEmpty 11 | } 12 | 13 | override def isFiltered(doc: Seq[String]): Boolean = { 14 | doc.forall(sent => isFilteredSent(sent)) 15 | } 16 | } 17 | 18 | object RemoveScriptDocument extends FromConfig { 19 | override def fromConfig(conf: ConfigObject): RemoveScriptDocument = new RemoveScriptDocument 20 | } 21 | -------------------------------------------------------------------------------- /core/src/main/scala/com/worksap/nlp/uzushio/cleaning/RemoveShortDocument.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.cleaning 2 | 3 | import com.typesafe.config.ConfigObject 4 | 5 | /** Filters documents that are too short. 6 | * 7 | * @constructor 8 | * create a new filter. 9 | * @param min 10 | * the minimum number of sentences a document should contain 11 | */ 12 | class RemoveShortDocument(min: Int = 5) 13 | extends DocumentFilter 14 | with FieldSettable[RemoveShortDocument] { 15 | override def isFiltered(doc: Seq[String]): Boolean = { 16 | min <= doc.map(_.split("\n").length).reduce(_ + _) 17 | } 18 | 19 | override def toString(): String = s"${this.getClass.getSimpleName}($min)" 20 | } 21 | 22 | object RemoveShortDocument extends FromConfig { 23 | override def fromConfig(conf: ConfigObject): RemoveShortDocument = { 24 | val args = Map[String, Option[Any]]("min" -> conf.getAs[Int]("min")) 25 | .collect { case (k, Some(v)) => k -> v } 26 | 27 | new RemoveShortDocument().setFields(args) 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /core/src/main/scala/com/worksap/nlp/uzushio/cleaning/RemoveSubstring.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.cleaning 2 | 3 | import com.typesafe.config.ConfigObject 4 | import java.nio.charset.StandardCharsets 5 | import java.nio.file.{Path, Paths, Files} 6 | import scala.io.Source 7 | 8 | /** Removes given substrings from documents. 9 | * 10 | * @param matchSentence 11 | * If true, match string with only full sentence, i.e. substr have to start/end at newline. 12 | */ 13 | class RemoveSubstring( 14 | substrs: Set[String], 15 | matchSentence: Boolean = RemoveSubstring.defaultMatchSentence 16 | ) extends DocumentNormalizer { 17 | val substrPattern = matchSentence match { 18 | case false => { s"""(${substrs.mkString("|")})""".r } 19 | case true => { 20 | s"""(?m)(^${substrs.mkString("$|^")}$$)""".r 21 | } 22 | } 23 | 24 | override def normalizeDocument(doc: Seq[String]): Seq[String] = { 25 | val fullDoc = doc.mkString("\n") 26 | val removed = substrPattern.replaceAllIn(fullDoc, "") 27 | removed.split("\n").filter(_.length > 0).toSeq 28 | } 29 | 30 | override def toString(): String = s"${this.getClass.getSimpleName}(#substr=${substrs.size})" 31 | } 32 | 33 | object RemoveSubstring extends FromConfig { 34 | val defaultPath = "template_sentences.txt" 35 | val defaultDelim = "\n\n" // Delimiter of substrings in the file. 36 | val defaultMatchSentence = false // Whether if match only full sentence. 
37 | 38 | def fromFile( 39 | filePath: Path, 40 | delim: String = defaultDelim, 41 | matchSentence: Boolean = defaultMatchSentence 42 | ): RemoveSubstring = { 43 | val fullstr = new String(Files.readAllBytes(filePath), StandardCharsets.UTF_8) 44 | new RemoveSubstring( 45 | fullstr.split(delim).map(_.trim).filter(_.nonEmpty).toSet, 46 | matchSentence 47 | ) 48 | } 49 | 50 | override def fromConfig(conf: ConfigObject): RemoveSubstring = { 51 | val pathStr = conf.getOrElseAs[String]("path", defaultPath) 52 | val delim = conf.getOrElseAs[String]("delim", defaultDelim) 53 | val matchSentence = conf.getOrElseAs[Boolean]("matchSentence", defaultMatchSentence) 54 | 55 | val filepath = Paths.get(pathStr) 56 | if (filepath.toFile.exists) { 57 | fromFile(filepath, delim, matchSentence) 58 | } else { 59 | val fullstr = Source.fromResource(pathStr).mkString 60 | new RemoveSubstring( 61 | fullstr.split(delim).map(_.trim).filter(_.nonEmpty).toSet, 62 | matchSentence 63 | ) 64 | } 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /core/src/main/scala/com/worksap/nlp/uzushio/cleaning/RemoveURL.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.cleaning 2 | 3 | import com.typesafe.config.ConfigObject 4 | 5 | /** Filters sentences that contain URL. */ 6 | class RemoveURL extends SentenceFilter { 7 | val urlPattern = """(https?|sftp?)://[\w/:%#\$&\?\(\)~\.=\+\-]+""".r 8 | 9 | override def isFiltered(sent: String): Boolean = { 10 | urlPattern.findFirstIn(sent).isEmpty 11 | } 12 | } 13 | 14 | object RemoveURL extends FromConfig { 15 | override def fromConfig(conf: ConfigObject): RemoveURL = new RemoveURL 16 | } 17 | -------------------------------------------------------------------------------- /core/src/main/scala/com/worksap/nlp/uzushio/cleaning/RemoveWikipediaCitation.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.cleaning 2 | 3 | import com.typesafe.config.ConfigObject 4 | 5 | /** Removes citation markers (from Wikipedia). */ 6 | class RemoveWikipediaCitation extends SentenceNormalizer { 7 | val citationPattern = """\[\d+?\]|\[要.+?\]|\{\{+[^{}]+?\}\}+|\[(要出典|リンク切れ|.+?\?)\]""".r 8 | 9 | override def normalizeSentence(sent: String): String = { 10 | citationPattern.replaceAllIn(sent, "") 11 | } 12 | } 13 | 14 | object RemoveWikipediaCitation extends FromConfig { 15 | override def fromConfig(conf: ConfigObject): RemoveWikipediaCitation = new RemoveWikipediaCitation 16 | } 17 | -------------------------------------------------------------------------------- /core/src/main/scala/com/worksap/nlp/uzushio/cleaning/SplitElement.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.cleaning 2 | 3 | import com.typesafe.config.ConfigObject 4 | import org.apache.spark.sql.Dataset 5 | 6 | /** Split each elements of the document by given delimiter and flatten. 7 | * 8 | * Use this to split document/paragraph into paragraph/sentence. 9 | * 10 | * @param delim 11 | * the delimiter to split each elements. 
12 | */ 13 | class SplitElement(delim: String = "\n") extends Transformer with FieldSettable[SplitElement] { 14 | override def transform(ds: Dataset[Seq[String]]): Dataset[Seq[String]] = { 15 | import ds.sparkSession.implicits._ 16 | ds.map(_.flatMap(_.split(delim))) 17 | } 18 | } 19 | 20 | object SplitElement extends FromConfig { 21 | override def fromConfig(conf: ConfigObject): SplitElement = { 22 | val args = Map[String, Option[Any]]("delim" -> conf.getAs[String]("delim")) 23 | .collect { case (k, Some(v)) => k -> v } 24 | new SplitElement().setFields(args) 25 | } 26 | } 27 | 28 | /** Split element into sentences. */ 29 | class SplitIntoSentence extends SplitElement(delim = "\n") 30 | object SplitIntoSentence extends FromConfig { 31 | override def fromConfig(conf: ConfigObject): SplitIntoSentence = new SplitIntoSentence() 32 | } 33 | 34 | /** Split element into paragraphs. */ 35 | class SplitIntoParagraph extends SplitElement(delim = "\n\n") 36 | object SplitIntoParagraph extends FromConfig { 37 | override def fromConfig(conf: ConfigObject): SplitIntoParagraph = new SplitIntoParagraph() 38 | } 39 | -------------------------------------------------------------------------------- /core/src/main/scala/com/worksap/nlp/uzushio/cleaning/Transformer.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.cleaning 2 | 3 | import com.typesafe.config.ConfigObject 4 | import org.apache.spark.sql.Dataset 5 | 6 | /** Transforms given spark dataset. */ 7 | trait Transformer extends scala.Serializable { 8 | def transform(ds: Dataset[Seq[String]]): Dataset[Seq[String]] 9 | 10 | override def toString(): String = s"${this.getClass.getSimpleName}" 11 | } 12 | 13 | /** Trait to instanciate transformer based on config file. 14 | * 15 | * Every Transformers should have a companion object with this trait. 16 | */ 17 | trait FromConfig { 18 | def fromConfig(conf: ConfigObject): Transformer 19 | 20 | /** Wrapper class for easy config value access. */ 21 | implicit class ConfigObjectWrapper(val conf: ConfigObject) { 22 | def getAs[T](key: String): Option[T] = Option(conf.get(key)).map(_.unwrapped.asInstanceOf[T]) 23 | 24 | def getOrElseAs[T](key: String, default: T): T = conf.getAs[T](key).getOrElse(default) 25 | } 26 | } 27 | 28 | /** Transformer that does nothing. 
*/ 29 | class Identity extends Transformer { 30 | override def transform(ds: Dataset[Seq[String]]): Dataset[Seq[String]] = ds 31 | } 32 | 33 | object Identity extends FromConfig { 34 | override def fromConfig(conf: ConfigObject): Identity = new Identity 35 | } 36 | -------------------------------------------------------------------------------- /core/src/main/scala/com/worksap/nlp/uzushio/main/DeduplicateParagraphs.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.main 2 | 3 | import com.worksap.nlp.uzushio.lib.runners.{DeduplicateParagraphs => DedupTask} 4 | import com.worksap.nlp.uzushio.lib.utils.Resources.AutoClosableResource 5 | import org.apache.spark.sql.SparkSession 6 | 7 | object DeduplicateParagraphs { 8 | def main(args: Array[String]): Unit = { 9 | val argObj = new DedupTask.ArgParser(args).toArgs 10 | SparkSession.builder().getOrCreate().use { spark => 11 | new DedupTask(argObj, spark).process() 12 | } 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /core/src/main/scala/com/worksap/nlp/uzushio/main/ExtractTextFromWarc.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.main 2 | 3 | import com.worksap.nlp.uzushio.lib.runners.{ExtractParagraphsFromWARC, WarcTextExtractionRaw} 4 | import com.worksap.nlp.uzushio.lib.utils.Resources.AutoClosableResource 5 | import org.apache.spark.sql.SparkSession 6 | 7 | /** Extracts text from WARC files. 8 | * 9 | * @see 10 | * [[WarcTextExtractionRaw.ConfigParser]] 11 | */ 12 | object ExtractTextFromWarc { 13 | def main(args: Array[String]): Unit = { 14 | val cfg = new WarcTextExtractionRaw.ConfigParser(args).asArgs() 15 | SparkSession.builder().appName(getClass.getSimpleName).getOrCreate().use { spark => 16 | ExtractParagraphsFromWARC.run(cfg)(spark) 17 | } 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /legacy/list_common_substr.py: -------------------------------------------------------------------------------- 1 | import argparse as ap 2 | from pathlib import Path 3 | from collections import Counter 4 | import itertools as it 5 | 6 | import numpy as np 7 | import suffixarray 8 | 9 | doc_delim = "\n\n" 10 | 11 | 12 | def main(): 13 | args = parse_args() 14 | validate_args(args) 15 | 16 | data = "" 17 | for p in args.input: 18 | with p.open() as fin: 19 | data += fin.read() 20 | 21 | n_doc = len(data.split(doc_delim)) 22 | min_len = args.min_len 23 | min_freq = n_doc * args.min_freq if args.min_freq < 1 else args.min_freq 24 | 25 | # use results for the reversed text to make result set closed 26 | # (do not want to handle every prefix/suffix of substrings) 27 | sa = suffixarray.SuffixArray(data) 28 | sa_rev = suffixarray.SuffixArray(data[::-1]) 29 | 30 | def key_func(s, i, l, c): 31 | # take if target substr has enough length and freq count 32 | # also check new-line to handle per sentence 33 | return l >= min_len and c >= min_freq and s[i] == '\n' and s[i+l-1] == '\n' 34 | 35 | ss_cnt = {sa.str[x:x+l]: c 36 | for x, l, c in sa.iter_repeated_substrings(key=key_func)} 37 | rev_cnt = {sa_rev.str[x+l-1:x-1:-1]: c 38 | for x, l, c in sa_rev.iter_repeated_substrings(key=key_func)} 39 | common_ss = set(ss_cnt.keys()) & set(rev_cnt.keys()) 40 | 41 | with args.output.open("w") as fout: 42 | for ss in common_ss: 43 | fout.write(f"{ss_cnt[ss]}\n{ss}{doc_delim}") 44 | 45 | return 46 | 47 | 48 | def 
parse_args(): 49 | parser = ap.ArgumentParser() 50 | parser.add_argument(dest="input", type=str, nargs="+", 51 | help="Input text file.") 52 | 53 | parser.add_argument("--min-len", type=int, 54 | default=10, help="minimum length") 55 | parser.add_argument("--min-freq", default=10, help="minimum frequency") 56 | 57 | parser.add_argument("-o", "--output", dest="output", type=str, default="./ss.txt", 58 | help="File to output summary.") 59 | parser.add_argument("--overwrite", action="store_true", 60 | help="Overwrite output files when they already exist.") 61 | 62 | args = parser.parse_args() 63 | args.input = [Path(s) for s in args.input] 64 | args.output = Path(args.output) 65 | args.min_freq = float(args.min_freq) 66 | return args 67 | 68 | 69 | def validate_args(args): 70 | if not args.overwrite: 71 | if args.output.exists(): 72 | raise ValueError( 73 | f"File {args.output} already exists. Set --overwrite to continue anyway.") 74 | return 75 | 76 | 77 | if __name__ == "__main__": 78 | main() 79 | -------------------------------------------------------------------------------- /legacy/src/main/scala/com/worksap/nlp/uzushio/warc/HttpResponseParser.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.warc 2 | 3 | import java.io.ByteArrayInputStream 4 | import java.io.InputStream 5 | import java.io.SequenceInputStream 6 | import java.io.Serializable 7 | import org.apache.commons.io.IOUtils 8 | import org.apache.hc.core5.http.impl.io.{ 9 | DefaultHttpResponseParser, 10 | SessionInputBufferImpl 11 | } 12 | import org.apache.hc.core5.http.io.SessionInputBuffer 13 | import org.apache.log4j.LogManager 14 | 15 | /** Http response parser for warc record. */ 16 | class HttpResponseParser(bufSize: Int = 128 * 1024) extends Serializable { 17 | @transient lazy val logger = LogManager.getLogger(this.getClass.getSimpleName) 18 | 19 | private val responseParser = new DefaultHttpResponseParser() 20 | private val siBuffer = new SessionInputBufferImpl(bufSize) 21 | private val byteBuffer = Array.ofDim[Byte](bufSize) 22 | 23 | /** Parses WarcRecord body as http response. 24 | * 25 | * Make sure that provided warc record has proper type. 26 | */ 27 | def parseWarcRecord(warc: WarcRecord) = { 28 | val is = new ByteArrayInputStream(warc.content) 29 | 30 | try { 31 | val resp = responseParser.parse(siBuffer, is) 32 | val body = readBody(siBuffer, is); 33 | new HttpResponseSerializable(resp, body) 34 | } catch { 35 | // TODO: data handling in the error cases 36 | case e: org.apache.hc.core5.http.HttpException => { 37 | logger.warn(s"error parsing http response: ${e}") 38 | new HttpResponseSerializable() 39 | } 40 | } finally { 41 | is.close() 42 | } 43 | } 44 | 45 | /** Read body bytes from buffers after headers are read. 
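  *
  * The session buffer may already hold body bytes that were read ahead while parsing the
  * headers; those buffered bytes are drained first (an empty stream is passed so only the
  * buffer is consumed) and then concatenated with the rest of the record content.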
*/ 46 | private def readBody(isBuf: SessionInputBuffer, rest: InputStream) = { 47 | val emptyIs = new java.io.ByteArrayInputStream(Array.emptyByteArray) 48 | val restBytes = isBuf.read(byteBuffer, emptyIs) 49 | 50 | IOUtils.toByteArray( 51 | new SequenceInputStream( 52 | new ByteArrayInputStream(byteBuffer.slice(0, restBytes)), 53 | rest 54 | ) 55 | ) 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /legacy/src/main/scala/com/worksap/nlp/uzushio/warc/HttpResponseSerializable.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.warc 2 | 3 | import java.io.Serializable 4 | import org.apache.hc.core5.http.ClassicHttpResponse 5 | import org.apache.hc.core5.http.message.BasicClassicHttpResponse 6 | 7 | /** Seritalizable wrapper of ClassicHttpResponse. */ 8 | class HttpResponseSerializable( 9 | resp: ClassicHttpResponse = new BasicClassicHttpResponse(600), 10 | val body: Array[Byte] = Array.empty[Byte] 11 | ) extends Serializable { 12 | 13 | /** Returns the value of the first header with the given name. 14 | * 15 | * @throws ProtocolException 16 | * in case multiple headers with the given name are found. 17 | */ 18 | def getHeader(name: String): Option[String] = { 19 | Option(resp.getHeader(name)).map(_.getValue) 20 | } 21 | 22 | def getFirstHeader(name: String): Option[String] = { 23 | Option(resp.getFirstHeader(name)).map(_.getValue) 24 | } 25 | 26 | def getLastHeader(name: String): Option[String] = { 27 | Option(resp.getLastHeader(name)).map(_.getValue) 28 | } 29 | 30 | def getHeaders(): Seq[(String, String)] = { 31 | resp.getHeaders().map(header => (header.getName(), header.getValue())) 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /legacy/src/main/scala/com/worksap/nlp/uzushio/warc/LongWritableSerializable.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.warc 2 | 3 | import org.apache.hadoop.io.LongWritable; 4 | import java.io.Serializable 5 | 6 | /* Serializable wrapper of Hadoop LongWritable class. 7 | * 8 | * ref: https://issues.apache.org/jira/browse/SPARK-2421 9 | */ 10 | class LongWritableSerializable extends LongWritable with Serializable 11 | -------------------------------------------------------------------------------- /legacy/src/main/scala/com/worksap/nlp/uzushio/warc/WarcFileReader.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.warc 2 | 3 | import collection.JavaConverters._ 4 | import java.io.{InputStream, FilterInputStream} 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.log4j.LogManager 8 | import org.archive.io.warc.WARCReaderFactory 9 | 10 | /** Reads {@link WarcRecord}s from a WARC file using Hadoop filesystem APIs. */ 11 | class WarcFileReader(conf: Configuration, filePath: Path) { 12 | @transient lazy val logger = LogManager.getLogger(this.getClass.getSimpleName) 13 | 14 | /** Opens a warc file and setup an iterator of records. */ 15 | private val fs = filePath.getFileSystem(conf) 16 | private val fileSize = fs.getFileStatus(filePath).getLen 17 | private val fsin = new CountingInputStream(fs.open(filePath)) 18 | private val reader = WARCReaderFactory.get(filePath.getName(), fsin, true) 19 | private val recordIter = reader.iterator.asScala 20 | 21 | /** Init counters to report progress. 
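  *
  * recordsRead is incremented in read(); bytesRead is updated by the CountingInputStream
  * wrapper defined at the bottom of this class.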
*/ 22 | private var recordsRead: Long = 0 23 | private var bytesRead: Long = 0 24 | 25 | /** Closes the file and reader. */ 26 | def close(): Unit = { 27 | reader.close() 28 | fsin.close() 29 | } 30 | 31 | /** Reads the next record from the iterator. 32 | * 33 | * @throws java.util.NoSuchElementException 34 | */ 35 | def read(): WarcRecord = { 36 | if (!recordIter.hasNext) { 37 | throw new java.util.NoSuchElementException() 38 | } 39 | 40 | try { 41 | val record = new WarcRecord(recordIter.next()) 42 | recordsRead += 1 43 | return record 44 | } catch { 45 | case e: java.io.EOFException => { 46 | logger.warn(s"error while iterating warc, try to skip: ${e}") 47 | return read() 48 | } 49 | } 50 | } 51 | 52 | /** Returns the number of records that have been read. */ 53 | def getRecordsRead: Long = { 54 | return recordsRead 55 | } 56 | 57 | /** Returns the number of bytes that have been read. */ 58 | def getBytesRead: Long = { 59 | return bytesRead 60 | } 61 | 62 | /** Returns the proportion of the file thet has been read. */ 63 | def getProgress: Float = { 64 | if (fileSize <= 0) return 1.0f 65 | return bytesRead.toFloat / fileSize.toFloat 66 | } 67 | 68 | /** InputStream that records the number of bytes read. */ 69 | private class CountingInputStream(in: InputStream) 70 | extends FilterInputStream(in) { 71 | override def read(): Int = { 72 | val result = in.read() 73 | if (result != -1) bytesRead += 1 74 | return result 75 | } 76 | 77 | override def read(b: Array[Byte], off: Int, len: Int): Int = { 78 | val result = in.read(b, off, len) 79 | if (result != -1) bytesRead += result 80 | return result 81 | } 82 | 83 | override def skip(n: Long): Long = { 84 | val result = in.skip(n) 85 | bytesRead += result 86 | return result 87 | } 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /legacy/src/main/scala/com/worksap/nlp/uzushio/warc/WarcInputFormat.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.warc 2 | 3 | import org.apache.hadoop.fs.Path; 4 | import org.apache.hadoop.mapreduce.InputSplit; 5 | import org.apache.hadoop.mapreduce.JobContext; 6 | import org.apache.hadoop.mapreduce.RecordReader; 7 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 8 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 9 | import org.apache.hadoop.mapreduce.lib.input.FileSplit; 10 | 11 | /* Hadoop InputFormat for WARC files. 12 | * 13 | * Key is 1-index LongWritable. Use get() method to take Long value. 14 | */ 15 | class WarcInputFormat 16 | extends FileInputFormat[LongWritableSerializable, WarcWritable] { 17 | 18 | /* Opens a WARC file (possibly compressed), and returns a RecordReader for accessing it. */ 19 | override def createRecordReader( 20 | split: InputSplit, 21 | context: TaskAttemptContext 22 | ) = { 23 | new WarcRecordReader() 24 | } 25 | 26 | override def isSplitable(context: JobContext, filename: Path): Boolean = { 27 | // we cannot (sanely) split warc files, due to its variable-length records. 28 | return false 29 | } 30 | } 31 | 32 | /* Wrapper class of {@link WarcFileReader} to implement RecordReader. 
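 *
 * The emitted key is the 1-based index of the record within the file; the value wraps the
 * parsed WarcRecord.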
*/ 33 | class WarcRecordReader 34 | extends RecordReader[LongWritableSerializable, WarcWritable] { 35 | private val key = new LongWritableSerializable(); 36 | private val value = new WarcWritable(); 37 | 38 | private var reader: WarcFileReader = null 39 | 40 | override def initialize( 41 | split: InputSplit, 42 | context: TaskAttemptContext 43 | ): Unit = { 44 | reader = new WarcFileReader( 45 | context.getConfiguration(), 46 | split.asInstanceOf[FileSplit].getPath 47 | ); 48 | } 49 | 50 | override def nextKeyValue(): Boolean = { 51 | try { 52 | val record = reader.read(); 53 | key.set(reader.getRecordsRead); 54 | value.setRecord(record); 55 | } catch { 56 | case e: java.util.NoSuchElementException => { return false } 57 | } 58 | 59 | return true; 60 | } 61 | 62 | override def getCurrentKey(): LongWritableSerializable = { 63 | return key; 64 | } 65 | 66 | override def getCurrentValue(): WarcWritable = { 67 | return value; 68 | } 69 | 70 | override def getProgress(): Float = { 71 | return reader.getProgress 72 | } 73 | 74 | override def close(): Unit = { 75 | reader.close(); 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /legacy/src/main/scala/com/worksap/nlp/uzushio/warc/WarcLoader.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.warc 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.rdd.RDD 5 | 6 | object WarcLoader { 7 | /* Load WARC file as RDD. */ 8 | def readFrom( 9 | spark: SparkSession, 10 | name: String 11 | ): RDD[WarcRecord] = { 12 | spark.sparkContext 13 | .newAPIHadoopFile( 14 | name, 15 | classOf[WarcInputFormat], 16 | classOf[LongWritableSerializable], 17 | classOf[WarcWritable] 18 | ) 19 | .map { case (k, v) => v.getRecord() } 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /legacy/src/main/scala/com/worksap/nlp/uzushio/warc/WarcRecord.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.warc 2 | 3 | import collection.JavaConverters._ 4 | import java.io.Serializable 5 | import org.apache.commons.io.IOUtils 6 | import org.archive.io.ArchiveRecord 7 | 8 | /* Serializable wrapper of ArchiveRecord, with its contents loaded. */ 9 | class WarcRecord(record: ArchiveRecord) extends Serializable { 10 | val headers: Map[String, String] = record 11 | .getHeader() 12 | .getHeaderFields() 13 | .asScala 14 | .map { case (k, v) => (k, v.toString) } 15 | .toMap 16 | 17 | /* read contents to safely step iterator forward. 
*/ 18 | val content = IOUtils.toByteArray(record, record.available()); 19 | 20 | def isResponse(): Boolean = { 21 | // ref https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.0 22 | val warcType = headers.getOrElse("WARC-Type", "") 23 | return warcType == "response" 24 | } 25 | 26 | def isTruncated(): Boolean = { 27 | val truncated = headers.get("WARC-Truncated") 28 | return truncated.nonEmpty 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /legacy/src/main/scala/com/worksap/nlp/uzushio/warc/WarcWritable.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.warc 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.Serializable 6 | import org.apache.hadoop.io.Writable; 7 | 8 | /* A mutable wrapper around a {@link WarcRecord} implementing the Hadoop 9 | * Writable and Serializable (for Spark) interfaces. 10 | */ 11 | class WarcWritable(var record: WarcRecord = null) 12 | extends Writable 13 | with Serializable { 14 | 15 | /* Returns the record currently wrapped by this writable. */ 16 | def getRecord(): WarcRecord = { 17 | return record; 18 | } 19 | 20 | /* Updates the record held within this writable wrapper. */ 21 | def setRecord(newRecord: WarcRecord): Unit = { 22 | record = newRecord; 23 | } 24 | 25 | /* Appends the current record to a {@link DataOutput} stream. */ 26 | override def write(out: DataOutput): Unit = { 27 | // TODO: impl (not neccessary for current use case) 28 | // if (record != null) record.write(out); 29 | } 30 | 31 | /* Parses a {@link WarcRecord} out of a {@link DataInput} stream, and make it 32 | * the current record. 33 | */ 34 | override def readFields(in: DataInput): Unit = { 35 | // TODO: impl (not neccessary for current use case) 36 | // record = new WarcRecord(in); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /lib/src/main/resources/com/worksap/nlp/uzushio/lib/filters/hojichar/README.md: -------------------------------------------------------------------------------- 1 | These lists are imported from [HojiChar](https://github.com/HojiChar/HojiChar/tree/main/hojichar/dict). 2 | Lists contain offensive words and used for filtering. 
3 | 4 | -------------------------------------------------------------------------------- /lib/src/main/resources/com/worksap/nlp/uzushio/lib/filters/hojichar/discriminations_keywords_ja.txt: -------------------------------------------------------------------------------- 1 | アイヌ系 2 | アカ 3 | アメ公 4 | アル中 5 | イカサマ 6 | イタ公 7 | イモ 8 | インチキ 9 | インディアン嘘つかない 10 | エスキモー 11 | エチゼンクラゲ 12 | オカマ 13 | オールドミス 14 | カッペ 15 | ガキ 16 | ガサ 17 | キ●ガイ 18 | キチ 19 | キチガ● 20 | キチガイ 21 | キ印 22 | ゲンナマ 23 | コロシ 24 | ゴミ屋 25 | サツ 26 | サラ金 27 | ザギン 28 | シマ 29 | ジプシー 30 | ジャップ 31 | ジャリ 32 | スケ 33 | スチュワーデス 34 | スラム 35 | ズージャー 36 | タケノコ医者 37 | ダッチマン 38 | チビ 39 | チャリンコ 40 | チャンコロ 41 | チョン 42 | デカ 43 | トルコ嬢 44 | トルコ風呂 45 | ドヤ街 46 | ナオン 47 | ニガー 48 | ニグロ 49 | ニコヨン 50 | ノビ 51 | バタ屋 52 | パクる 53 | パン助 54 | パーマ屋 55 | ヒモ 56 | ブス 57 | ブタ箱 58 | ブツ 59 | ブラインドタッチ 60 | ポコペン 61 | ポリ公 62 | マンコ 63 | ヤンキー 64 | ヤー様 65 | ヨツ 66 | ルンペン 67 | レントゲン技師 68 | ロンパリ 69 | 丁稚 70 | 三つ口 71 | 三助 72 | 三国人 73 | 三韓征伐 74 | 上方の贅六 75 | 下女 76 | 下男 77 | 不具 78 | 不可触民 79 | 不治の病 80 | 中共 81 | 乞食 82 | 二号 83 | 人夫 84 | 人足 85 | 人非人 86 | 他力本願 87 | 代書屋 88 | 令嬢 89 | 伊勢乞食 90 | 低脳 91 | 低脳児 92 | 低開発国 93 | 保母 94 | 保線工夫 95 | 借り腹 96 | 健全なる精神は健全なる身体に宿る 97 | 傴僂 98 | 八百屋 99 | 共稼ぎ 100 | 処女作 101 | 処女峰 102 | 出戻り 103 | 出稼ぎ 104 | 助産婦 105 | 労務者 106 | 北鮮 107 | 千摺り 108 | ナオン 109 | ヒモ 110 | オールドミス 111 | 女子供 112 | 狂女 113 | 下女 114 | 下男 115 | 女給 116 | 女傑 117 | 女工 118 | 処女作 119 | 処女峰 120 | 女中 121 | #スケ 122 | 端女 123 | 醜男 124 | 阿婆擦れ 125 | 男のくせに 126 | 女のくせに 127 | 男らしく 128 | 女らしく 129 | 女々しい 130 | 女だてらに 131 | 男勝り 132 | 紅一点 133 | 女の腐ったような 134 | 女の腐ったの 135 | 売れ残り 136 | 出戻り 137 | めかけ 138 | 職場の花 139 | 二号さん 140 | フェミナチ 141 | あげまん 142 | さげまん 143 | あげちん 144 | さげちん 145 | まんこ 146 | ちんこ 147 | ビッチ 148 | 毒女 149 | 鬼女 150 | ババア 151 | -------------------------------------------------------------------------------- /lib/src/main/resources/com/worksap/nlp/uzushio/lib/filters/ng_words.txt: -------------------------------------------------------------------------------- 1 | fuck 2 | g スポット 3 | sm女王 4 | tenga 5 | あばずれ 6 | あぱずれ 7 | あほ 8 | うざ 9 | うんこ 10 | え〇 11 | えっち 12 | おしっこ 13 | おしりのあな 14 | おっぱい 15 | おもらし 16 | かたわ 17 | きちがい 18 | きめぇ 19 | きめえ 20 | くそ 21 | せんずり 22 | ち〇 23 | ちんぐり 24 | ちんこ 25 | つるぺた 26 | つんぼ 27 | ふたなり 28 | ぶさいく 29 | ぶす 30 | ま〇 31 | まんぐり 32 | まんこ 33 | めくら 34 | やりまん 35 | アスペ 36 | アスホール 37 | アナリングス 38 | アナル 39 | アヌス 40 | アバズレ 41 | アパズレ 42 | アホ 43 | イマラチオ 44 | イメクラ 45 | イラマチオ 46 | ウザ 47 | ウンコ 48 | エ〇 49 | エッチ 50 | エロ 51 | オカマ 52 | オッパイ 53 | オナ 54 | オナニー 55 | オフパコ 56 | オマンコ 57 | オルガズム 58 | オーガズム 59 | カス 60 | ガイジ 61 | キチガイ 62 | キモ 63 | クズ 64 | クソ 65 | クリトリス 66 | クンニ 67 | クンニリングス 68 | グループ・セックス 69 | グロ 70 | ゲイボーイ 71 | ゲイ・セックス 72 | ゲロ 73 | コカイン 74 | コキ 75 | コンドーム 76 | ザーメン 77 | シコ 78 | ショタ 79 | スカトロ 80 | スケベ 81 | ストリップ劇場 82 | スマタ 83 | セクロス 84 | セックス 85 | セフレ 86 | センズリ 87 | ダッチワイフ 88 | チ〇 89 | テレフォンセックス 90 | ディルド 91 | ディープ・スロート 92 | デブ 93 | デリヘル 94 | デートレイプ 95 | ドキュン 96 | ナマポ 97 | ニガー 98 | ヌい 99 | ヌく 100 | ヌけ 101 | ネオ・ナチ 102 | ハメ撮り 103 | パイズリ 104 | パイパン 105 | パンチラ 106 | パンティー 107 | ビッチ 108 | ピロートーク 109 | ファック 110 | フェラ 111 | フェラチオ 112 | ブサイク 113 | ブス 114 | プリンス アルバート ピアス 115 | ペッティング 116 | ペニス 117 | ペニスバンド 118 | ホモ 119 | ボンテージ 120 | ボールギャグ 121 | ポルノグラフィー 122 | マ〇 123 | マザー・ファッカー 124 | マスターベーション 125 | マラ 126 | マンコ 127 | ヤラせ 128 | ラブホ 129 | リスカ 130 | リストカット 131 | リョナ 132 | リンチ 133 | レイプ 134 | レズ 135 | 不細工 136 | 中出し 137 | 乱交 138 | 二穴 139 | 人妻 140 | 側位 141 | 児童性虐待 142 | 前戯 143 | 勃起する 144 | 合いの子 145 | 四十八手 146 | 売り専 147 | 売国 148 | 売女 149 | 売春婦 150 | 外人 151 | 夢精 152 | 大人のおもちゃ 153 | 大人のオモチャ 154 | 大人の玩具 155 | 大陰唇 156 | 射精 
157 | 尻軽 158 | 尿道プレイ 159 | 巨乳 160 | 巨根 161 | 強姦犯 162 | 後戯 163 | 後背位 164 | 手コキ 165 | 手マン 166 | 援交 167 | 援助交際 168 | 支那 169 | 新しいポルノ 170 | 正常位 171 | 殺し方 172 | 殺人方法 173 | 氏ね 174 | 氏んだ 175 | 氏んで 176 | 気違い 177 | 池沼 178 | 淫乱 179 | 潮吹き女 180 | 潮吹き男性 181 | 熟女 182 | 獣姦 183 | 玉なめ 184 | 玉舐め 185 | 男根 186 | 痴呆 187 | 穴兄弟 188 | 竿姉妹 189 | 筆おろし 190 | 精液 191 | 糞便 192 | 糞尿愛好症 193 | 素股 194 | 緊縛 195 | 老害 196 | 肉便器 197 | 自慰 198 | 裸の女性 199 | 貞操帯 200 | 賢者タイム 201 | 足フェチ 202 | 輪姦 203 | 近親相姦 204 | 阿呆 205 | 陰毛 206 | 電マ 207 | 顔射 208 | 顔面騎乗 209 | 騎上位 210 | 騎乗位 -------------------------------------------------------------------------------- /lib/src/main/resources/pipeline/all_duplicate_paragraphs.conf: -------------------------------------------------------------------------------- 1 | filters: [ 2 | {"class": "DuplicateParagraphs", "limit": 2} 3 | ] -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/cleaning/PathSegment.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.cleaning 2 | 3 | import scala.collection.mutable.ArrayBuffer 4 | 5 | case class PathSegment(tag: String, id: String, classes: Seq[String]) { 6 | override def toString: String = classes 7 | .mkString(tag + (if (classes.isEmpty) "" else "."), ".", if (id == null) "" else s"#$id") 8 | 9 | lazy val lowerClasses: Set[String] = classes.map(_.toLowerCase).toSet 10 | lazy val lowerId: String = if (id == null) null else id.toLowerCase 11 | } 12 | 13 | object PathSegment { 14 | 15 | final private val EMPTY_PATH: Seq[PathSegment] = ArrayBuffer.empty 16 | 17 | def parsePath(path: String): Seq[PathSegment] = { 18 | var start = 0 19 | val end = path.length 20 | val result = new ArrayBuffer[PathSegment]() 21 | while (start < end) { 22 | val separator = path.indexOf('>', start) 23 | if (separator == -1) { 24 | result += parse(path, start, end) 25 | start = end 26 | } else { 27 | result += parse(path, start, separator) 28 | start = separator + 1 29 | } 30 | } 31 | if (result.isEmpty) EMPTY_PATH else result 32 | } 33 | 34 | final private val EMPTY_CLASSES: Seq[String] = new ArrayBuffer[String]() 35 | def parse(raw: String, start: Int, end: Int): PathSegment = { 36 | var dotIdx = raw.indexOf('.', start) 37 | var hashIdx = raw.indexOf('#', start) 38 | if (dotIdx > end) { 39 | dotIdx = -1 40 | } 41 | if (hashIdx > end) { 42 | hashIdx = -1 43 | } 44 | 45 | if (dotIdx == -1 && hashIdx == -1) { 46 | return PathSegment(raw.substring(start, end), null, EMPTY_CLASSES) 47 | } 48 | 49 | var tagEndIdx = end 50 | 51 | val id = 52 | if (hashIdx == -1) null 53 | else { 54 | tagEndIdx = hashIdx 55 | raw.substring(hashIdx + 1, end) 56 | } 57 | 58 | val classes = 59 | if (dotIdx == -1) { 60 | EMPTY_CLASSES 61 | } else { 62 | val classesEndIdx = tagEndIdx 63 | tagEndIdx = dotIdx 64 | var classesIdx = dotIdx 65 | val classes = new ArrayBuffer[String]() 66 | while (classesIdx < classesEndIdx) { 67 | var nextClassIdx = raw.indexOf('.', classesIdx + 1) 68 | if (nextClassIdx > end) { 69 | nextClassIdx = -1 70 | } 71 | if (nextClassIdx > 0) { 72 | classes += raw.substring(classesIdx + 1, nextClassIdx) 73 | classesIdx = nextClassIdx 74 | } else { 75 | if (classesIdx != classesEndIdx) { 76 | classes += raw.substring(classesIdx + 1, classesEndIdx) 77 | } 78 | classesIdx = classesEndIdx 79 | } 80 | } 81 | classes 82 | } 83 | 84 | PathSegment( 85 | tag = raw.substring(start, tagEndIdx), 86 | id = id, 87 | classes = classes 88 | ) 89 | } 90 | 91 | 
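  // Worked example (hypothetical CSS-like path string):
  //   parse("div.content#main") yields PathSegment("div", "main", Seq("content"))
  //   parsePath("body>div.content#main") yields
  //     Seq(PathSegment("body", null, Seq()), PathSegment("div", "main", Seq("content")))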
def parse(raw: String): PathSegment = parse(raw, 0, raw.length) 92 | } 93 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/filters/AdjacentDuplicateParagraphs.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.filters 2 | 3 | import com.worksap.nlp.uzushio.lib.cleaning.{Document, Paragraph} 4 | import com.worksap.nlp.uzushio.lib.filters.base.DocFilter 5 | 6 | import scala.collection.mutable.ArrayBuffer 7 | 8 | /** This class is a hack put in place before the final bugfix 9 | */ 10 | class AdjacentDuplicateParagraphs extends DocFilter { 11 | 12 | private def compressParagraphs(paragraphs: Seq[Paragraph]): Seq[Paragraph] = { 13 | val result = new ArrayBuffer[Paragraph]() 14 | val iter = paragraphs.iterator 15 | if (!iter.hasNext) { 16 | return paragraphs 17 | } 18 | 19 | var prev = iter.next() 20 | while (iter.hasNext) { 21 | val next = iter.next() 22 | if (next.text != prev.text) { 23 | result += prev 24 | prev = next 25 | } 26 | } 27 | 28 | result += prev 29 | result 30 | } 31 | 32 | override def checkDocument(doc: Document): Document = { 33 | val newPars = compressParagraphs(doc.paragraphs) 34 | if (newPars.length == doc.paragraphs.length) { 35 | doc 36 | } else { 37 | doc.copy(paragraphs = newPars) 38 | } 39 | } 40 | 41 | override val toString = "AdjacentDuplicateParagraphs" 42 | } 43 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/filters/CompressionRate.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.filters 2 | 3 | import com.worksap.nlp.uzushio.lib.cleaning.Document 4 | import com.worksap.nlp.uzushio.lib.filters.CompressionRate.{INPUT_SIZE, OUTPUT_SIZE} 5 | import com.worksap.nlp.uzushio.lib.filters.base.HighLowDocFilter 6 | import net.jpountz.lz4.{LZ4Exception, LZ4Factory} 7 | 8 | import java.nio.charset.StandardCharsets 9 | import java.nio.{ByteBuffer, CharBuffer} 10 | 11 | /** Filter out documents which have too low or too high compression rate (using LZ4 algorithm) 12 | * 13 | * @param low 14 | * low compression rate threshold 15 | * @param high 16 | * high compression rate threshold 17 | */ 18 | class CompressionRate(override val low: Float, override val high: Float) extends HighLowDocFilter { 19 | @transient private lazy val lz4 = LZ4Factory.fastestInstance() 20 | @transient private lazy val utf8Buffer = ByteBuffer.allocateDirect(INPUT_SIZE) 21 | @transient private lazy val compressBuffer = ByteBuffer.allocateDirect(OUTPUT_SIZE) 22 | 23 | def encodeDocContent(doc: Document): ByteBuffer = { 24 | val enc = StandardCharsets.UTF_8.newEncoder() 25 | val buf = utf8Buffer 26 | buf.clear() 27 | val iter = doc.aliveParagraphs 28 | while (iter.hasNext) { 29 | val p = iter.next() 30 | val cbuf = CharBuffer.wrap(p.text) 31 | val res = enc.encode(cbuf, buf, true) 32 | if (res.isOverflow) { 33 | // Scala does not has nice break/continue :/ 34 | buf.flip() 35 | return buf 36 | } 37 | } 38 | buf.flip() 39 | buf 40 | } 41 | 42 | override def checkDocument(doc: Document): Document = { 43 | val ratio: Float = compressionRatio(doc) 44 | maybeFilter(doc, ratio) 45 | } 46 | 47 | def compressionRatio(doc: Document): Float = { 48 | val compressor = lz4.fastCompressor() 49 | val buf = encodeDocContent(doc) 50 | val uncompressedSize = buf.limit() 51 | val outBuf = compressBuffer 52 | 
outBuf.clear() 53 | val compressedSize = 54 | try { 55 | compressor.compress(buf, outBuf) 56 | outBuf.position() 57 | } catch { 58 | case _: LZ4Exception => OUTPUT_SIZE 59 | } 60 | val ratio = compressedSize.toFloat / uncompressedSize.toFloat 61 | ratio 62 | } 63 | } 64 | 65 | object CompressionRate { 66 | final val INPUT_SIZE = 1024 * 1024 67 | final val OUTPUT_SIZE = 1200 * 1024 68 | } 69 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/filters/DeduplicateDocuments.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.filters 2 | 3 | import com.worksap.nlp.uzushio.lib.stats.NgramHashExtractor 4 | import com.worksap.nlp.uzushio.lib.cleaning.Document 5 | import com.worksap.nlp.uzushio.lib.filters.base.DocFilter 6 | import com.worksap.nlp.uzushio.lib.utils.MathUtil 7 | import scala.math._ 8 | import scala.util.Random 9 | 10 | trait RandomGeneratorFromStringBase { 11 | def generateRandom(docId: String): Double 12 | } 13 | 14 | // An object in arguments of DocFilter on Spark needs to mixin Serializable. 15 | object RandomGeneratorFromString extends RandomGeneratorFromStringBase with Serializable { 16 | def generateRandom(docId: String): Double = { 17 | val seed = NgramHashExtractor.hashString(docId) 18 | MathUtil.asRandomDouble(seed) 19 | } 20 | } 21 | 22 | class GaussianRandomGeneratorFromString( 23 | val mu: Double = 0.3, 24 | val sd: Double = 0.1 25 | ) extends RandomGeneratorFromStringBase 26 | with Serializable { 27 | def generateRandom(docId: String): Double = { 28 | val seed = NgramHashExtractor.hashString(docId) 29 | val rng = new Random(seed) 30 | rng.nextGaussian() * mu + sd 31 | } 32 | } 33 | 34 | class DeduplicateDocuments( 35 | val baseNumFreq: Int = 10, 36 | val randomGenerator: RandomGeneratorFromStringBase = new GaussianRandomGeneratorFromString 37 | ) extends DocFilter { 38 | 39 | def computeNearDuplicateTextRatio(doc: Document): Float = { 40 | val iter = doc.aliveParagraphs 41 | 42 | var totalLengthWeightedNearFreq = 0.0 43 | var totalLength = 0.0 44 | 45 | while (iter.hasNext) { 46 | val paragraph = iter.next() 47 | val text = paragraph.text 48 | val textLength = text.length() 49 | val nearFreq = if (paragraph.nearFreq < baseNumFreq) paragraph.nearFreq else baseNumFreq 50 | val weight = log(nearFreq) / log(baseNumFreq) 51 | 52 | totalLength += textLength 53 | totalLengthWeightedNearFreq += (textLength * weight) 54 | } 55 | 56 | MathUtil.ratio(totalLengthWeightedNearFreq.toFloat, totalLength.toFloat) 57 | } 58 | 59 | def shouldRemoveDocument(doc: Document) = { 60 | val nearDuplicateTextRatio = computeNearDuplicateTextRatio(doc) 61 | val thresholdProb = randomGenerator.generateRandom(doc.docId) 62 | 63 | nearDuplicateTextRatio >= thresholdProb 64 | } 65 | 66 | override def checkDocument(doc: Document): Document = { 67 | doc.removeWhen(shouldRemoveDocument(doc), this) 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/filters/DeduplicateDocumentsPercentile.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.filters 2 | 3 | import com.worksap.nlp.uzushio.lib.cleaning.Document 4 | import com.worksap.nlp.uzushio.lib.filters.base.DocFilter 5 | import spire.math.QuickSelect 6 | 7 | class DeduplicateDocumentsPercentile(percentile: Float = 0.05f, expected: 
Double = 1.0) 8 | extends DocFilter { 9 | override def checkDocument(doc: Document): Document = { 10 | val freq = DeduplicateDocumentsPercentile.freqAtPercentile(doc, percentile) 11 | val probability = expected / freq 12 | doc.removeWhen(doc.randomDouble > probability, this) 13 | } 14 | 15 | override val toString = s"DedupDocsPercentile($percentile,$expected)" 16 | } 17 | 18 | object DeduplicateDocumentsPercentile { 19 | import spire.std.any.IntAlgebra 20 | 21 | def freqAtPercentile(doc: Document, percentile: Float): Int = { 22 | val counts = doc.aliveParagraphs.map(_.nearFreq).toArray 23 | if (counts.isEmpty) { 24 | return 0 25 | } 26 | val position = (counts.length * percentile).toInt 27 | QuickSelect.select(counts, position) 28 | counts(position) 29 | } 30 | 31 | } 32 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/filters/DocLength.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.filters 2 | 3 | import com.worksap.nlp.uzushio.lib.cleaning.Document 4 | import com.worksap.nlp.uzushio.lib.filters.base.HighLowDocIntFilter 5 | 6 | class DocLength( 7 | override val low: Int = 0, 8 | override val high: Int = Int.MaxValue 9 | ) extends HighLowDocIntFilter { 10 | override def checkDocument(doc: Document): Document = { 11 | val length = doc.aliveParagraphs.map(_.text.length).sum 12 | maybeFilter(doc, length) 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/filters/DuplicateDocumentsLengthWeighted.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.filters 2 | 3 | import com.worksap.nlp.uzushio.lib.cleaning.Document 4 | import com.worksap.nlp.uzushio.lib.filters.base.DocFilter 5 | import com.worksap.nlp.uzushio.lib.utils.MathUtil 6 | 7 | class DuplicateDocumentsLengthWeighted(expected: Double = 1.0) extends DocFilter { 8 | override def checkDocument(doc: Document): Document = { 9 | val weight = DuplicateDocumentsLengthWeighted.nearFreqWeight(doc) 10 | val prob = expected / weight 11 | doc.removeWhen(doc.randomDouble > prob, this) 12 | } 13 | 14 | override val toString = s"DuplicateDocumentsLengthWeighted($expected)" 15 | } 16 | 17 | object DuplicateDocumentsLengthWeighted { 18 | def nearFreqWeight(doc: Document): Double = { 19 | var nchars = 0L 20 | var weight = 0.0 21 | 22 | val iter = doc.aliveParagraphs 23 | while (iter.hasNext) { 24 | val par = iter.next() 25 | val len = par.text.length.toLong 26 | nchars += len 27 | weight += len * (Math.log10(par.nearFreq) + 1) 28 | } 29 | MathUtil.doubleRatio(weight, nchars) 30 | } 31 | 32 | } 33 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/filters/DuplicateParagraphs.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.filters 2 | 3 | import com.worksap.nlp.uzushio.lib.cleaning.Paragraph 4 | import com.worksap.nlp.uzushio.lib.filters.base.ParagraphFilter 5 | 6 | class DuplicateParagraphs(limit: Int = 2) extends ParagraphFilter { 7 | override def checkParagraph(p: Paragraph): Paragraph = { 8 | if (p.nearFreq >= limit) { 9 | p.copy(remove = this) 10 | } else p 11 | } 12 | 13 | override val toString = s"DuplicateParagraphs($limit)" 14 | } 15 | 
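A minimal usage sketch for the paragraph filter above; the Paragraph values are made up for illustration, and the bundled pipeline/all_duplicate_paragraphs.conf wires the same filter with limit = 2:

    import com.worksap.nlp.uzushio.lib.cleaning.Paragraph
    import com.worksap.nlp.uzushio.lib.filters.DuplicateParagraphs

    val filter = new DuplicateParagraphs(limit = 2)
    val rare = Paragraph(path = "body>p", text = "unique text", index = 0, exactFreq = 1, nearFreq = 1)
    val boiler = Paragraph(path = "body>p", text = "repeated text", index = 1, exactFreq = 9, nearFreq = 9)
    filter.checkParagraph(rare)   // returned unchanged: nearFreq < limit
    filter.checkParagraph(boiler) // returned as a copy marked with remove = filter, dropped downstream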
-------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/filters/HiraganaRatio.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.filters 2 | 3 | import com.worksap.nlp.uzushio.lib.cleaning.Document 4 | import com.worksap.nlp.uzushio.lib.filters.HiraganaRatio.isHiragana 5 | import com.worksap.nlp.uzushio.lib.filters.base.HighLowDocFilter 6 | import com.worksap.nlp.uzushio.lib.utils.MathUtil 7 | 8 | final class HiraganaRatio( 9 | override val low: Float = 0.0f, 10 | override val high: Float = 1.0f 11 | ) extends HighLowDocFilter { 12 | override def checkDocument(doc: Document): Document = { 13 | val ratio = computeHiraganaRatio(doc) 14 | maybeFilter(doc, ratio) 15 | } 16 | 17 | def computeHiraganaRatio(document: Document): Float = { 18 | var nchars = 0 19 | var nhiragana = 0 20 | val iter = document.aliveParagraphs 21 | while (iter.hasNext) { 22 | val par = iter.next() 23 | val text = par.text 24 | nchars += text.length 25 | nhiragana += countHiraganaChars(text) 26 | } 27 | MathUtil.ratio(nhiragana, nchars) 28 | } 29 | 30 | def countHiraganaChars(str: String): Int = { 31 | val len = str.length 32 | var idx = 0 33 | var count = 0 34 | while (idx < len) { 35 | val ch = str.charAt(idx) 36 | if (isHiragana(ch)) { 37 | count += 1 38 | } 39 | idx += 1 40 | } 41 | count 42 | } 43 | } 44 | 45 | object HiraganaRatio { 46 | def isHiragana(c: Char): Boolean = { 47 | c >= 0x3040 && c <= 0x309f 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/filters/KenLMParagraphPerplexity.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.filters 2 | 3 | import com.worksap.nlp.uzushio.lib.cleaning.{Document, Paragraph} 4 | import com.worksap.nlp.uzushio.lib.filters.base.DocFilter 5 | 6 | import scala.collection.mutable 7 | 8 | final case class ParagraphWithPerplexity(p: Paragraph, ppx: Float) { 9 | def isAlive: Boolean = p.isAlive 10 | 11 | def remove(x: AnyRef): ParagraphWithPerplexity = copy(p = p.copy(remove = x)) 12 | } 13 | 14 | class KenLMParagraphPerplexity( 15 | sudachi: String, 16 | kenlm: String, 17 | outliers: Float = 0.02f, 18 | count: Int = 3, 19 | threshold: Float = 1e6f 20 | ) extends DocFilter { 21 | private val lmScore = -Math.log10(threshold).toFloat 22 | 23 | @transient 24 | private lazy val processor = KenLMEvaluator.make(sudachi, kenlm, outliers) 25 | 26 | override def checkDocument(doc: Document): Document = { 27 | val proc = processor 28 | val paragraphs = doc.paragraphs 29 | .map(p => ParagraphWithPerplexity(p, proc.scoreParagraph(p).toFloat)).toBuffer 30 | 31 | val nchanged = markParagraphs(paragraphs) 32 | 33 | if (nchanged > 0) { 34 | doc.copy(paragraphs = paragraphs.map(_.p)) 35 | } else { 36 | doc 37 | } 38 | } 39 | 40 | def markParagraphs(paragraphs: mutable.Buffer[ParagraphWithPerplexity]): Int = { 41 | var nchanged = 0 42 | var idx = 0 43 | val len = paragraphs.length 44 | while (idx < len) { 45 | val p = paragraphs(idx) 46 | if (p.isAlive && (shouldRemoveBack(paragraphs, idx) || shouldRemoveFwd(paragraphs, idx, len))) { 47 | paragraphs(idx) = p.remove(this) 48 | nchanged += removePrev(paragraphs, idx) 49 | nchanged += 1 50 | } 51 | idx += 1 52 | } 53 | nchanged 54 | } 55 | 56 | def removePrev(paragraphs: mutable.Buffer[ParagraphWithPerplexity], offset: Int): Int = { 
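    // Walk back over at most `count` immediately preceding paragraphs and also mark the alive
    // ones whose score already falls at or below the lmScore cutoff; returns how many were marked.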
57 | var result = 0 58 | val end = math.max(offset - count, 0) 59 | var idx = offset - 1 60 | while (idx >= end) { 61 | val p = paragraphs(idx) 62 | if (p.isAlive && p.ppx <= lmScore) { 63 | paragraphs(idx) = p.remove(this) 64 | result += 1 65 | } 66 | 67 | idx -= 1 68 | } 69 | result 70 | } 71 | 72 | def shouldRemoveBack( 73 | paragraphs: mutable.Buffer[ParagraphWithPerplexity], 74 | offset: Int 75 | ): Boolean = { 76 | var idx = offset 77 | val end = math.max(offset - count + 1, 0) 78 | while (idx >= end) { 79 | val p = paragraphs(idx) 80 | if (p.ppx > lmScore) { 81 | return false 82 | } 83 | idx -= 1 84 | } 85 | true 86 | } 87 | 88 | def shouldRemoveFwd( 89 | paragraphs: mutable.Buffer[ParagraphWithPerplexity], 90 | offset: Int, 91 | length: Int 92 | ): Boolean = { 93 | var idx = offset 94 | val end = math.min(offset + count, length) 95 | while (idx < end) { 96 | val p = paragraphs(idx) 97 | if (p.ppx > lmScore) { 98 | return false 99 | } 100 | idx += 1 101 | } 102 | true 103 | } 104 | 105 | override val toString = s"KenLMPar($outliers,$count,$threshold)" 106 | } 107 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/filters/LargeFreqParagraphs.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.filters 2 | 3 | import com.worksap.nlp.uzushio.lib.cleaning.{Document, Paragraph} 4 | import com.worksap.nlp.uzushio.lib.filters.base.DocFilter 5 | 6 | import scala.collection.mutable 7 | 8 | class LargeFreqParagraphs(count: Int = 3, freq: Int = 100) extends DocFilter { 9 | override def checkDocument(doc: Document): Document = { 10 | doc.paragraphs match { 11 | case p: mutable.Buffer[Paragraph] => 12 | markParagraphs(p) 13 | val nmarked = markParagraphs(p) 14 | if (nmarked > 0) { 15 | doc.copy(paragraphs = p) 16 | } else { 17 | doc 18 | } 19 | case _ => 20 | val buf = doc.paragraphs.toBuffer 21 | val nmarked = markParagraphs(buf) 22 | if (nmarked > 0) { 23 | doc.copy(paragraphs = buf) 24 | } else { 25 | doc 26 | } 27 | } 28 | } 29 | 30 | def markParagraphs(paragraphs: mutable.Buffer[Paragraph]): Int = { 31 | var nchanged = 0 32 | var idx = 0 33 | val len = paragraphs.length 34 | while (idx < len) { 35 | val p = paragraphs(idx) 36 | if (p.isAlive && (shouldRemoveBack(paragraphs, idx) || shouldRemoveFwd(paragraphs, idx, len))) { 37 | paragraphs(idx) = p.copy(remove = this) 38 | nchanged += removePrev(paragraphs, idx) 39 | nchanged += 1 40 | } 41 | idx += 1 42 | } 43 | nchanged 44 | } 45 | 46 | def removePrev(paragraphs: mutable.Buffer[Paragraph], offset: Int): Int = { 47 | var result = 0 48 | val end = math.max(offset - count, 0) 49 | var idx = offset - 1 50 | while (idx >= end) { 51 | val p = paragraphs(idx) 52 | if (p.isAlive && p.nearFreq >= freq) { 53 | paragraphs(idx) = p.copy(remove = this) 54 | result += 1 55 | } 56 | 57 | idx -= 1 58 | } 59 | result 60 | } 61 | 62 | def shouldRemoveBack(paragraphs: mutable.Buffer[Paragraph], offset: Int): Boolean = { 63 | var idx = offset 64 | val end = math.max(offset - count + 1, 0) 65 | while (idx >= end) { 66 | val p = paragraphs(idx) 67 | if (p.nearFreq < freq) { 68 | return false 69 | } 70 | idx -= 1 71 | } 72 | true 73 | } 74 | 75 | def shouldRemoveFwd(paragraphs: mutable.Buffer[Paragraph], offset: Int, length: Int): Boolean = { 76 | var idx = offset 77 | val end = math.min(offset + count, length) 78 | while (idx < end) { 79 | val p = paragraphs(idx) 80 | if (p.nearFreq < freq) { 81 | return 
false 82 | } 83 | idx += 1 84 | } 85 | true 86 | } 87 | 88 | override val toString = s"LargeFreqParagraphs($count,$freq)" 89 | } 90 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/filters/LinkCharRatio.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.filters 2 | 3 | import com.worksap.nlp.uzushio.lib.cleaning.Document 4 | import com.worksap.nlp.uzushio.lib.filters.base.HighLowDocFilter 5 | import com.worksap.nlp.uzushio.lib.utils.{MathUtil, Paragraphs} 6 | 7 | class LinkCharRatio( 8 | override val low: Float = 0.0f, 9 | override val high: Float = 1.0f 10 | ) extends HighLowDocFilter { 11 | 12 | def calcLinkCharRatio(doc: Document): Float = { 13 | val iter = doc.aliveParagraphs 14 | var total = 0 15 | var inLink = 0 16 | while (iter.hasNext) { 17 | val par = iter.next() 18 | var i = 0 19 | val txt = par.text 20 | val len = txt.length 21 | var inside = 0 22 | while (i < len) { 23 | val ch = txt.charAt(i) 24 | if (ch == Paragraphs.HTML_LINK_START) { 25 | inside = 1 26 | } else if (ch == Paragraphs.HTML_LINK_END) { 27 | inside = 0 28 | } else { 29 | total += 1 30 | inLink += inside 31 | } 32 | i += 1 33 | } 34 | } 35 | MathUtil.ratio(inLink, total) 36 | } 37 | 38 | override def checkDocument(doc: Document): Document = { 39 | val ratio = calcLinkCharRatio(doc) 40 | maybeFilter(doc, ratio) 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/filters/MarkdownizeHeading.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.filters 2 | 3 | import com.worksap.nlp.uzushio.lib.cleaning.{Paragraph, PathSegment} 4 | import com.worksap.nlp.uzushio.lib.filters.base.ParagraphFilter 5 | 6 | class MarkdownizeHeading extends ParagraphFilter { 7 | final val acceptedTags = Seq("h1", "h2", "h3", "h4", "h5", "h6") 8 | final val mdHeadningSymbol = "#" 9 | 10 | def tagToMarkdownSymbol(tag: PathSegment): String = { 11 | val numHeading = acceptedTags.indexOf(tag.tag) + 1 12 | 13 | if (numHeading == 0) { 14 | throw new IllegalArgumentException(s"tag $tag is not heading") 15 | } 16 | 17 | mdHeadningSymbol * numHeading + " " 18 | } 19 | 20 | override def checkParagraph(p: Paragraph): Paragraph = { 21 | val tagWithCSS = p.firstMatchingTag(acceptedTags) 22 | tagWithCSS match { 23 | case Some(v) => p.copy(text = tagToMarkdownSymbol(v) + p.text) 24 | case None => p 25 | } 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/filters/MergeListTag.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.filters 2 | 3 | import com.worksap.nlp.uzushio.lib.filters.base.DocFilter 4 | import com.worksap.nlp.uzushio.lib.cleaning.{Document, Paragraph} 5 | 6 | import scala.collection.mutable.ArrayBuffer 7 | 8 | class MergeListTag extends DocFilter { 9 | final private val acceptedTags: Seq[String] = Array("li", "option") 10 | 11 | override def checkDocument(doc: Document): Document = { 12 | val iter = doc.paragraphs.iterator 13 | 14 | if (!iter.hasNext) { 15 | return doc 16 | } 17 | 18 | var paragraph = iter.next() 19 | var merged = false 20 | val result = new ArrayBuffer[Paragraph]() 21 | val textBuffer = new ArrayBuffer[String]() 22 | var 
exactFreq = paragraph.exactFreq 23 | var nearFreq = paragraph.nearFreq 24 | 25 | while (iter.hasNext) { 26 | val nextParagraph = iter.next() 27 | val isList = nextParagraph.containsTags(acceptedTags) 28 | if ( 29 | paragraph.isAlive && nextParagraph.isAlive && isList && paragraph.path == nextParagraph.path 30 | ) { 31 | merged = true 32 | textBuffer += paragraph.text 33 | exactFreq = math.min(exactFreq, nextParagraph.exactFreq) 34 | nearFreq = math.min(nearFreq, nextParagraph.nearFreq) 35 | } else { 36 | if (textBuffer.nonEmpty) { 37 | textBuffer += paragraph.text 38 | val mergedText = textBuffer.mkString("- ", "\n- ", "") 39 | result += Paragraph( 40 | path = paragraph.path, 41 | text = mergedText, 42 | index = result.size, 43 | exactFreq = exactFreq, 44 | nearFreq = nearFreq 45 | ) 46 | textBuffer.clear() 47 | } else { 48 | result += paragraph.copy(index = result.size) 49 | } 50 | 51 | exactFreq = nextParagraph.exactFreq 52 | nearFreq = nextParagraph.nearFreq 53 | } 54 | 55 | paragraph = nextParagraph 56 | } 57 | 58 | if (merged) { 59 | if (textBuffer.nonEmpty) { 60 | textBuffer += paragraph.text 61 | val mergedText = textBuffer.mkString("- ", "\n- ", "") 62 | result += Paragraph( 63 | path = paragraph.path, 64 | text = mergedText, 65 | index = result.size, 66 | exactFreq = exactFreq, 67 | nearFreq = nearFreq 68 | ) 69 | } else { 70 | result += paragraph.copy(index = result.size) 71 | } 72 | 73 | doc.copy(paragraphs = result) 74 | } else { 75 | doc 76 | } 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/filters/NoContentDOM.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.filters 2 | 3 | import com.worksap.nlp.uzushio.lib.cleaning.{Paragraph, PathSegment} 4 | import com.worksap.nlp.uzushio.lib.filters.base.ParagraphFilter 5 | 6 | class NoContentDOM extends ParagraphFilter { 7 | final private val filteringDomNames: Seq[String] = 8 | Array("header", "footer", "aside", "nav", "noscript", "form") 9 | 10 | final private val DOMCandidatesForFiliteringClassOrId = Array("div", "p", "ul", "h1") 11 | 12 | final private val filteringFullMatchClassOrIdCandidates: Seq[String] = Array( 13 | "left-box", 14 | "blog-title-inner", 15 | "blogtitle", 16 | "blog-name", 17 | "head-block1", 18 | "head-blog-name", 19 | "head-introduction", 20 | ) 21 | 22 | final private val filteringPartialMatchClassOrIdNames: Seq[String] = Array( 23 | "header", 24 | "footer", 25 | "side", 26 | "menu", 27 | "nav", 28 | "banner", 29 | "logo", 30 | "pankuzu", 31 | "breadcrumb", 32 | "widget", 33 | "button", 34 | ) 35 | 36 | final private val filteringFullMatchClassOrIdNames = 37 | filteringPartialMatchClassOrIdNames ++ filteringFullMatchClassOrIdCandidates ++ filteringFullMatchClassOrIdCandidates 38 | .map(toCamelCase) 39 | 40 | def toCamelCase(s: String): String = { 41 | val words = s.split("[_-]") 42 | words.head + words.tail.map(_.capitalize).mkString 43 | } 44 | 45 | def partialMatchIds(css: PathSegment): Boolean = { 46 | if (css.id == null) { 47 | return false 48 | } 49 | 50 | filteringPartialMatchClassOrIdNames.exists(name => css.lowerId.contains(name)) 51 | } 52 | 53 | def partialMatchClasses(css: PathSegment): Boolean = { 54 | filteringPartialMatchClassOrIdNames.exists(name => css.lowerClasses.exists(_.contains(name))) 55 | } 56 | 57 | def containsTagWithIdAndClasses( 58 | p: Paragraph, 59 | tagNames: Seq[String], 60 | fullMatchCandidates: 
Seq[String], 61 | partialMatchCandidates: Seq[String] 62 | ): Boolean = { 63 | val iter = p.cssPath.reverseIterator 64 | 65 | while (iter.hasNext) { 66 | val css = iter.next() 67 | 68 | if ( 69 | tagNames.contains(css.tag) 70 | && fullMatchCandidates.exists(name => css.id == name || css.classes.contains(name)) 71 | ) { 72 | return true 73 | } 74 | 75 | if ( 76 | tagNames.contains(css.tag) 77 | && partialMatchCandidates.exists(name => partialMatchIds(css) || partialMatchClasses(css)) 78 | ) { 79 | return true 80 | } 81 | } 82 | false 83 | } 84 | 85 | override def checkParagraph(p: Paragraph): Paragraph = { 86 | if ( 87 | p.containsTags(filteringDomNames) || containsTagWithIdAndClasses( 88 | p, 89 | DOMCandidatesForFiliteringClassOrId, 90 | filteringFullMatchClassOrIdNames, 91 | filteringPartialMatchClassOrIdNames 92 | ) 93 | ) { 94 | p.copy(remove = this) 95 | } else { 96 | p 97 | } 98 | } 99 | 100 | override def toString: String = "Nav" 101 | } 102 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/filters/WordInstances.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.filters 2 | 3 | import com.worksap.nlp.uzushio.lib.cleaning.{Document, Paragraph} 4 | import com.worksap.nlp.uzushio.lib.filters.base.DocFilter 5 | import com.worksap.nlp.uzushio.lib.utils.TrieNode 6 | 7 | import java.io.{BufferedReader, InputStreamReader} 8 | import java.net.URL 9 | import java.nio.charset.StandardCharsets 10 | import java.nio.file.{Files, Paths} 11 | 12 | /** Score documents using a word list and filter them if the score is more than the [[threshold]]. 13 | * 14 | * Word lists are read from 15 | * - Filesystem 16 | * - com.worksap.nlp.uzushio.lib.filters package in classpath 17 | * - root package in classpath 18 | * 19 | * @param list 20 | * word list will be read from this resource 21 | * @param threshold 22 | * documents with score larger than this value will be filtered out 23 | * @param full 24 | * score for a full match 25 | * @param partial 26 | * score for a partial match 27 | */ 28 | class WordInstances(list: String, threshold: Float = 3, full: Float = 1.0f, partial: Float = 0.1f) 29 | extends DocFilter { 30 | private val trie = WordInstances.readToTrie(list) 31 | override def checkDocument(doc: Document): Document = { 32 | val score = scoreDocument(doc) + 1e-3f 33 | doc.removeWhen(score >= threshold, this) 34 | } 35 | 36 | def scoreDocument(document: Document): Float = { 37 | var score = 0.0f 38 | val iter = document.aliveParagraphs 39 | while (iter.hasNext) { 40 | score += scoreParagraph(iter.next()) 41 | } 42 | score 43 | } 44 | 45 | def scoreParagraph(paragraph: Paragraph): Float = { 46 | var score = 0.0f 47 | val text = paragraph.text 48 | var start = 0 49 | val len = text.length 50 | while (start < len) { 51 | val res = trie.findLongest(text, start) 52 | if (res.found) { 53 | start = res.end 54 | score += full 55 | } else { 56 | start += 1 57 | } 58 | } 59 | score 60 | } 61 | 62 | override val toString = s"WordInstances($list,$threshold,$full,$partial)" 63 | } 64 | 65 | object WordInstances { 66 | import scala.collection.JavaConverters._ 67 | def readToTrie(name: String): TrieNode[Boolean] = { 68 | val p = Paths.get(name) 69 | if (Files.exists(p)) { 70 | return readToTrie(Files.lines(p)) 71 | } 72 | 73 | val classRes = getClass.getResource(name) 74 | if (classRes != null) { 75 | return readToTrie(classRes) 76 | } 77 | 78 | val loaderRes 
= getClass.getClassLoader.getResource(name) 79 | if (loaderRes != null) { 80 | return readToTrie(classRes) 81 | } 82 | 83 | throw new IllegalArgumentException(s"could not find word list $name") 84 | } 85 | 86 | private def readToTrie(classRes: URL): TrieNode[Boolean] = { 87 | val reader = new InputStreamReader(classRes.openStream(), StandardCharsets.UTF_8) 88 | readToTrie(new BufferedReader(reader).lines()) 89 | } 90 | 91 | private def readToTrie( 92 | s: java.util.stream.Stream[String] 93 | ): TrieNode[Boolean] = { 94 | try { 95 | TrieNode.make(s.iterator().asScala.filterNot(_.startsWith("#"))) 96 | } finally { 97 | s.close() 98 | } 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/filters/WordTypes.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.filters 2 | 3 | import com.worksap.nlp.uzushio.lib.cleaning.{Document, Paragraph} 4 | import com.worksap.nlp.uzushio.lib.filters.base.DocFilter 5 | import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap 6 | import org.apache.commons.math3.util.FastMath 7 | 8 | class WordTypes(list: String, threshold: Float = 3, kind: String = "uniq") extends DocFilter { 9 | private val trie = WordInstances.readToTrie(list) 10 | private val scorer = kind match { 11 | case "uniq" => WordTypes.SizeScorer 12 | case "log10" => WordTypes.Log10Scorer 13 | case "sqrt" => WordTypes.SqrtScorer 14 | case _ => throw new IllegalArgumentException("unknown kind, can be one of: uniq, log10, sqrt") 15 | } 16 | override def checkDocument(doc: Document): Document = { 17 | val score = scoreDocument(doc) 18 | doc.removeWhen(score >= threshold, this) 19 | } 20 | 21 | def scoreDocument(doc: Document): Float = { 22 | val counts = new Int2IntOpenHashMap() 23 | val iter = doc.aliveParagraphs 24 | while (iter.hasNext) { 25 | consumeParagraph(counts, iter.next()) 26 | } 27 | scoreCounts(counts) 28 | } 29 | 30 | private def consumeParagraph(counts: Int2IntOpenHashMap, paragraph: Paragraph): Unit = { 31 | val text = paragraph.text 32 | var start = 0 33 | val len = text.length 34 | while (start < len) { 35 | val res = trie.findLongest(text, start) 36 | if (res.found) { 37 | start = res.end 38 | counts.addTo(res.index, 1) 39 | } else { 40 | start += 1 41 | } 42 | } 43 | } 44 | 45 | private def scoreCounts(map: Int2IntOpenHashMap): Float = { 46 | if (map.isEmpty) return 0 47 | scorer(map) 48 | } 49 | 50 | override val toString = s"WordInstances($list,$threshold,$kind)" 51 | } 52 | 53 | object WordTypes { 54 | private trait Scorer extends (Int2IntOpenHashMap => Float) with Serializable 55 | 56 | private object SizeScorer extends Scorer { 57 | override def apply(v1: Int2IntOpenHashMap): Float = v1.size() 58 | } 59 | 60 | private object SqrtScorer extends Scorer { 61 | override def apply(v1: Int2IntOpenHashMap): Float = { 62 | var score = 0.0 63 | val iter = v1.values().iterator() 64 | while (iter.hasNext) { 65 | score += Math.sqrt(iter.nextInt()) 66 | } 67 | score.toFloat 68 | } 69 | } 70 | 71 | private object Log10Scorer extends Scorer { 72 | override def apply(v1: Int2IntOpenHashMap): Float = { 73 | var score = v1.size().toDouble // log_10 (1) == 0, so add size to the score 74 | val iter = v1.values().iterator() 75 | while (iter.hasNext) { 76 | score += Math.log10(iter.nextInt()) 77 | } 78 | score.toFloat 79 | } 80 | } 81 | } 82 | -------------------------------------------------------------------------------- 
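A hedged usage sketch for the two word-list filters above; `doc` is assumed to be a Document built elsewhere in the pipeline, the list name mirrors the bundled ng_words.txt resource, and the thresholds are illustrative only:

    import com.worksap.nlp.uzushio.lib.filters.{WordInstances, WordTypes}

    // The list is resolved from the filesystem, the lib.filters package, or the root classpath.
    val instances = new WordInstances("ng_words.txt", threshold = 3)
    val types = new WordTypes("ng_words.txt", threshold = 3, kind = "log10")
    val checked = types.checkDocument(instances.checkDocument(doc)) // doc: Document (assumed)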
/lib/src/main/scala/com/worksap/nlp/uzushio/lib/filters/base/FilterBase.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.filters.base 2 | 3 | import com.worksap.nlp.uzushio.lib.cleaning.{Document, Paragraph} 4 | 5 | /** All filter classes extend from this trait. They must have a single public constructor. The 6 | * framework handles passing arguments from the config files automatically and applies default 7 | * arguments correctly (the value from the config is used first; the default parameter value is 8 | * used when the config does not have a parameter with the same name). 9 | * 10 | * **On filter functions**. Filtering functions should not remove paragraphs from documents. 11 | * Instead, they should mark a paragraph or a document "to delete" with a marker object which 12 | * should contain the reason for deletion. The marker object can be a string or any JVM object 13 | * with the toString method overridden. The `toString` output should not contain any spaces or 14 | * other characters which could cause problems in filesystem paths. 15 | */ 16 | trait FilterBase extends Serializable 17 | 18 | /** Paragraph-level filter which considers all paragraphs independently. Marks the 19 | * [[Paragraph.remove]] field with the marker object. 20 | * 21 | * @see 22 | * [[FilterBase]] about marker objects 23 | */ 24 | trait ParagraphFilter extends FilterBase { 25 | def checkParagraph(p: Paragraph): Paragraph 26 | } 27 | 28 | /** Document-level filter. Should not remove any paragraphs. Instead, mark [[Document.remove]] or 29 | * [[Paragraph.remove]] with a marker object. 30 | * 31 | * @see 32 | * [[FilterBase]] about marker objects 33 | */ 34 | trait DocFilter extends FilterBase { 35 | def checkDocument(doc: Document): Document 36 | } 37 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/filters/base/HighLowDocFilter.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.filters.base 2 | 3 | import com.worksap.nlp.uzushio.lib.cleaning.Document 4 | 5 | trait HighLowDocFilter extends DocFilter { self => 6 | def high: Float 7 | 8 | def low: Float 9 | 10 | def maybeFilter(doc: Document, metric: Float): Document = { 11 | if (metric < low) { 12 | doc.copy(remove = Low) 13 | } else if (metric > high) { 14 | doc.copy(remove = High) 15 | } else doc 16 | } 17 | 18 | def describeFilter: String = self.getClass.getSimpleName 19 | 20 | @transient object Low { 21 | override val toString = s"$describeFilter.Low($low)" 22 | } 23 | 24 | @transient object High { 25 | override val toString = s"$describeFilter.High($high)" 26 | } 27 | 28 | override def toString = s"$describeFilter($low,$high)" 29 | } 30 | 31 | trait HighLowDocIntFilter extends DocFilter { self => 32 | def high: Int 33 | 34 | def low: Int 35 | 36 | def maybeFilter(doc: Document, metric: Int): Document = { 37 | if (metric < low) { 38 | doc.copy(remove = Low) 39 | } else if (metric > high) { 40 | doc.copy(remove = High) 41 | } else doc 42 | } 43 | 44 | @transient object Low { 45 | override val toString = s"${self.getClass.getSimpleName}.Low($low)" 46 | } 47 | 48 | @transient object High { 49 | override val toString = s"${self.getClass.getSimpleName}.High($high)" 50 | } 51 | 52 | override def toString = s"${self.getClass.getSimpleName}($low,$high)" 53 | } 54 | --------------------------------------------------------------------------------
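A minimal sketch of a custom filter following the conventions documented in FilterBase and HighLowDocFilter above; the class name, the metric, and the default bounds are invented for illustration and do not exist in the repository.

import com.worksap.nlp.uzushio.lib.cleaning.Document
import com.worksap.nlp.uzushio.lib.filters.base.HighLowDocFilter

// Marks documents whose average paragraph length falls outside [low, high].
// Single public constructor, so the config framework can fill low/high by name;
// maybeFilter() only attaches the Low/High marker objects and never drops paragraphs.
class AvgParagraphLength(override val low: Float = 10f, override val high: Float = 1000f)
    extends HighLowDocFilter {
  override def checkDocument(doc: Document): Document = {
    val pars = doc.aliveParagraphs.toSeq
    val metric = if (pars.isEmpty) 0f else pars.map(_.text.length).sum.toFloat / pars.size
    maybeFilter(doc, metric)
  }
}

A pipeline entry such as {class: "com.example.AvgParagraphLength", low: 20} would then be enough; the framework takes low from the config and keeps the default for high (the package prefix here is hypothetical).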
/lib/src/main/scala/com/worksap/nlp/uzushio/lib/html/AllTagMapper.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.html 2 | 3 | import org.apache.tika.parser.html.HtmlMapper 4 | 5 | import java.util.Locale 6 | 7 | /** Mapper class that provides all tags to the handler. 8 | * 9 | * With this class set in the parse context, the handler can recognize HTML-specific tags such as div, br, etc. 10 | * ref: https://stackoverflow.com/questions/19368018/parsing-html-elements-in-apache-tika 11 | */ 12 | class AllTagMapper extends HtmlMapper { 13 | override def mapSafeElement(name: String): String = name.toLowerCase(Locale.ROOT) 14 | 15 | override def isDiscardElement(name: String): Boolean = false 16 | 17 | override def mapSafeAttribute( 18 | elementName: String, 19 | attributeName: String 20 | ): String = attributeName.toLowerCase(Locale.ROOT) 21 | } 22 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/html/ParseAbortException.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.html 2 | 3 | class ParseAbortException extends Exception 4 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangTagSniffer.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.lang 2 | 3 | import com.worksap.nlp.uzushio.lib.lang.LangTagSniffer.{extractCharset, metaRegex} 4 | 5 | import java.nio.charset.{CodingErrorAction, StandardCharsets} 6 | import java.nio.{ByteBuffer, CharBuffer} 7 | import java.util.regex.Pattern 8 | 9 | case class LangTagSniff(charset: String, language: String) 10 | 11 | /** Try to sniff language and encoding by decoding the first 10k bytes as ASCII and using regexes to 12 | * find `<meta>` tags.
13 | */ 14 | class LangTagSniffer() { 15 | private val decoder = { 16 | val dec = StandardCharsets.US_ASCII.newDecoder() 17 | dec.onMalformedInput(CodingErrorAction.REPLACE) 18 | dec 19 | } 20 | 21 | private val charBuf = CharBuffer.allocate(10 * 1024) 22 | 23 | private def doSniff(buffer: CharBuffer): LangTagSniff = { 24 | var charset = "" 25 | var language = "" 26 | val iter = metaRegex.findAllIn(buffer) 27 | while (iter.hasNext) { 28 | val metaTag = iter.next() 29 | val cs = extractCharset(metaTag) 30 | if (cs.nonEmpty) { 31 | charset = cs 32 | } 33 | 34 | } 35 | LangTagSniff(charset, language) 36 | } 37 | 38 | def sniffTags(data: ByteBuffer): LangTagSniff = { 39 | val pos = data.position() 40 | val lim = data.limit() 41 | 42 | charBuf.clear() 43 | val res = decoder.decode(data, charBuf, false) 44 | charBuf.flip() 45 | 46 | data.position(pos) 47 | data.limit(lim) 48 | doSniff(charBuf) 49 | } 50 | 51 | def sniffTags(data: Array[Byte], offset: Int, position: Int): LangTagSniff = { 52 | val buffer = ByteBuffer.wrap(data, offset, position) 53 | sniffTags(buffer) 54 | } 55 | } 56 | 57 | object LangTagSniffer { 58 | private val metaRegex = "<meta[^>]*>".r 59 | private val charsetRegex = Pattern.compile("charset=([^\"' ;,/>]+)", Pattern.CASE_INSENSITIVE) 60 | 61 | def extractCharset(tag: String): String = { 62 | val matcher = charsetRegex.matcher(tag) 63 | if (matcher.find()) { 64 | matcher.group(1) 65 | } else { 66 | "" 67 | } 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/resources/CachedLocalResource.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.resources 2 | 3 | import com.github.jbaiter.kenlm.Model 4 | import com.worksap.nlp.sudachi.{Config, Dictionary, DictionaryFactory} 5 | import org.apache.spark.SparkFiles 6 | 7 | import java.nio.file.{Files, Path, Paths} 8 | import java.util.concurrent.ConcurrentHashMap 9 | 10 | trait CachedLocalResource[T] { 11 | final private val cache = new ConcurrentHashMap[Path, T]() 12 | 13 | def create(p: Path): T 14 | 15 | def get(dict: String): T = { 16 | val p = resolveLocalPath(dict).orElse(resolveSparkPath(dict)).getOrElse( 17 | throw new IllegalArgumentException(s"could not find file: $dict") 18 | ) 19 | 20 | cache.computeIfAbsent( 21 | p, 22 | p1 => create(p1) 23 | ) 24 | } 25 | 26 | def resolveLocalPath(str: String): Option[Path] = { 27 | val p = Paths.get(str) 28 | if (Files.exists(p) && Files.isRegularFile(p)) { 29 | Some(p) 30 | } else None 31 | } 32 | 33 | def resolveSparkPath(str: String): Option[Path] = { 34 | resolveLocalPath(SparkFiles.get(str)) 35 | } 36 | } 37 | 38 | object Sudachi extends CachedLocalResource[Dictionary] { 39 | override def create(p: Path): Dictionary = { 40 | val cfg = Config.defaultConfig().systemDictionary(p) 41 | new DictionaryFactory().create(cfg) 42 | } 43 | } 44 | 45 | object KenLM extends CachedLocalResource[Model] { 46 | override def create(p: Path): Model = new Model(p) 47 | } 48 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/runners/KenLMRunner.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.runners 2 | 3 | import com.worksap.nlp.uzushio.lib.cleaning.Paragraph 4 | import com.worksap.nlp.uzushio.lib.filters.KenLMEvaluator 5 | import com.worksap.nlp.uzushio.lib.resources.{KenLM,
Sudachi} 6 | import com.worksap.nlp.uzushio.lib.utils.Paragraphs 7 | import org.apache.spark.sql.{SaveMode, SparkSession} 8 | import org.apache.spark.sql.expressions.UserDefinedFunction 9 | import org.apache.spark.sql.functions.{explode, udf} 10 | import org.rogach.scallop.ScallopConf 11 | 12 | object KenLMRunner { 13 | 14 | class Args(args: Seq[String]) extends ScallopConf(args) { 15 | val input = opt[List[String]](required = true) 16 | val output = opt[String](required = true) 17 | val sudachiDict = opt[String]() 18 | val kenlmModel = opt[String]() 19 | val master = opt[String]() 20 | this.verify() 21 | } 22 | 23 | class LMPerplexity(sudachi: String, kenlm: String) extends Serializable { 24 | 25 | @transient 26 | private lazy val evaluator = KenLMEvaluator.make(sudachi, kenlm, 0.1f) 27 | 28 | def process(par: String): Double = { 29 | val prob = evaluator.scoreParagraph(Paragraph("body", par)) 30 | Math.pow(10, -prob) 31 | } 32 | 33 | def asUdf: UserDefinedFunction = udf((x: String) => process(x)) 34 | } 35 | 36 | def main(args: Array[String]): Unit = { 37 | val opts = new Args(args) 38 | 39 | val scb = SparkSession.builder() 40 | opts.master.toOption.foreach(scb.master) 41 | 42 | val sc = scb.getOrCreate() 43 | 44 | val inputs = sc.read.parquet(opts.input(): _*) 45 | 46 | import sc.implicits._ 47 | 48 | val splitPars = udf((x: String) => Paragraphs.extractCleanParagraphs(x)) 49 | 50 | val pars = inputs.select(explode(splitPars($"text")).as("text")).distinct() 51 | 52 | val ppx = new LMPerplexity(opts.sudachiDict(), opts.kenlmModel()) 53 | 54 | val probs = pars.withColumn("perplexity", ppx.asUdf($"text")) 55 | .repartitionByRange(20, $"perplexity".desc).sortWithinPartitions($"perplexity".desc) 56 | 57 | probs.write.mode(SaveMode.Overwrite).json(opts.output()) 58 | } 59 | 60 | } 61 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/runners/Repackage.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.runners 2 | 3 | import com.worksap.nlp.uzushio.lib.utils.Paragraphs 4 | import com.worksap.nlp.uzushio.lib.utils.Resources.AutoClosableResource 5 | import org.apache.spark.sql.functions.udf 6 | import org.apache.spark.sql.{SaveMode, SparkSession} 7 | import org.rogach.scallop.ScallopConf 8 | 9 | object Repackage { 10 | 11 | def run(args: Args, spark: SparkSession): Unit = { 12 | val data = spark.read.parquet(args.input) 13 | 14 | val reparitioned = data.coalesce(args.maxParitions) 15 | 16 | val cleaned = 17 | if (args.clear && reparitioned.columns.contains("text")) { 18 | val cleanUdf = udf { s: String => Paragraphs.extractCleanParagraphs(s).mkString("\n\n") } 19 | reparitioned.withColumn("text", cleanUdf(reparitioned.col("text"))) 20 | } else reparitioned 21 | 22 | cleaned.write.format(args.format).option("compression", args.compression) 23 | .mode(SaveMode.Overwrite).save(args.output) 24 | } 25 | 26 | class ArgParser(args: Seq[String]) extends ScallopConf(args) { 27 | val input = opt[String]() 28 | val output = opt[String]() 29 | val format = opt[String](default = Some("parquet")) 30 | val compression = opt[String](default = Some("zstd")) 31 | val maxPartitions = opt[Int](default = Some(10000)) 32 | val clear = toggle("clear", default = Some(false)) 33 | verify() 34 | 35 | def toArgs: Args = Args( 36 | input = input(), 37 | output = output(), 38 | format = format(), 39 | compression = compression(), 40 | maxParitions = maxPartitions(), 41 
| clear = clear() 42 | ) 43 | } 44 | 45 | case class Args( 46 | input: String, 47 | output: String, 48 | format: String, 49 | compression: String, 50 | maxParitions: Int, 51 | clear: Boolean 52 | ) 53 | 54 | def main(args: Array[String]): Unit = { 55 | val argObj = new ArgParser(args).toArgs 56 | SparkSession.builder().master("local").getOrCreate().use { spark => 57 | run(argObj, spark) 58 | } 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/stats/CountMinSketch.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.stats 2 | 3 | import com.worksap.nlp.uzushio.lib.utils.MathUtil 4 | import org.apache.spark.sql.expressions.Aggregator 5 | import org.apache.spark.sql.{Encoder, Encoders} 6 | 7 | import java.util.Random 8 | 9 | case class CountMinSketchState( 10 | rows: Int, 11 | cols: Int, 12 | counts: Array[Long] 13 | ) { 14 | def update(hasher: Hasher, value: Long): Unit = {} 15 | } 16 | 17 | case class Hasher( 18 | coeffs: Array[Long] 19 | ) { 20 | def hash(c1: Long, c2: Long, value: Long): Long = { 21 | val x = (value * c1) + c2 // mod 2^64 22 | java.lang.Long.rotateRight(x, 23) 23 | } 24 | } 25 | 26 | object Hasher { 27 | def make(num: Int): Hasher = { 28 | val rng = new Random(0xdeadbeef) 29 | Hasher(Array.fill(num * 2)(rng.nextLong())) 30 | } 31 | } 32 | 33 | class CountMinSketch( 34 | private val rows: Int, 35 | private val cols: Int, 36 | private val ngrams: NgramHashExtractor, 37 | private val hasher: Hasher 38 | ) extends Aggregator[String, CountMinSketchState, CountMinSketchState] { 39 | override def zero: CountMinSketchState = 40 | CountMinSketchState(rows, cols, new Array[Long](rows * cols)) 41 | 42 | override def reduce( 43 | b: CountMinSketchState, 44 | a: String 45 | ): CountMinSketchState = { 46 | ngrams.compute(a) { hash => 47 | b.update(hasher, hash) 48 | } 49 | b 50 | } 51 | 52 | override def merge( 53 | b1: CountMinSketchState, 54 | b2: CountMinSketchState 55 | ): CountMinSketchState = { 56 | val result = b1.copy() 57 | MathUtil.addArray(result.counts, b2.counts) 58 | result 59 | } 60 | 61 | override def finish(reduction: CountMinSketchState): CountMinSketchState = reduction 62 | 63 | override def bufferEncoder: Encoder[CountMinSketchState] = Encoders.product 64 | 65 | override def outputEncoder: Encoder[CountMinSketchState] = Encoders.product 66 | } 67 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/stats/SimHashProcessor.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.stats 2 | 3 | import com.worksap.nlp.uzushio.lib.stats.SimHashProcessor.addVector 4 | import com.worksap.nlp.uzushio.lib.utils.Ziggurat 5 | import it.unimi.dsi.util.XorShiftStarRandomGenerator 6 | 7 | class NgramHashExtractor(private val minOrder: Int, private val maxOrder: Int) 8 | extends Serializable { 9 | require(minOrder > 0) 10 | require(maxOrder > 0) 11 | require(minOrder < maxOrder) 12 | 13 | @inline 14 | final def compute(data: CharSequence)(@inline fn: Long => Unit): Unit = { 15 | var i = 0 16 | val minOrder = this.minOrder - 1 17 | val maxOrder = this.maxOrder 18 | val end = data.length() 19 | while (i < end) { 20 | var order = 0 21 | var hashState = NgramHashExtractor.HASH_SEED 22 | while (order < maxOrder && i + order < end) { 23 | val c = data.charAt(i + order) 24 
| if (c == '\n') { 25 | order = maxOrder 26 | } else { 27 | hashState = NgramHashExtractor.mix(hashState, c & 0xffffL) 28 | if (order >= minOrder) { 29 | val hash = NgramHashExtractor.mix(hashState, order) 30 | fn(hash): @inline 31 | } 32 | } 33 | 34 | order += 1 35 | } 36 | i += 1 37 | } 38 | } 39 | 40 | } 41 | 42 | object NgramHashExtractor { 43 | final val HASH_SEED = 15213125612L 44 | final val HASH_MULT = 6364136223846793005L 45 | final val HASH_ADD = 1442695040888963407L 46 | 47 | def mix(seed: Long, v: Long): Long = { 48 | val x = (v + HASH_ADD) ^ seed 49 | ror(x * HASH_MULT) 50 | } 51 | 52 | def ror(x: Long): Long = java.lang.Long.rotateRight(x, 23) 53 | 54 | def hashString(x: String): Long = { 55 | var state = 0xdeadbeeffeed133L 56 | val nchars = x.length 57 | var i = 0 58 | while (i < nchars) { 59 | state = mix(state, x.charAt(i) & 0xffffL) 60 | i += 1 61 | } 62 | mix(state, nchars) 63 | } 64 | } 65 | 66 | class SimHashProcessor(private val size: Int) extends Serializable { 67 | def init: Array[Float] = new Array[Float](size) 68 | 69 | def update( 70 | state: Array[Float], 71 | data: CharSequence, 72 | ngrams: NgramHashExtractor 73 | ): Unit = { 74 | ngrams.compute(data) { hash => 75 | addVector(state, hash) 76 | } 77 | } 78 | 79 | def result(state: Array[Float]): Array[Byte] = { 80 | val len1 = state.length 81 | val resultLen = (len1 / 8) + (if ((len1 & 7) != 0) 1 else 0) 82 | val result = new Array[Byte](resultLen) 83 | var step = 0 84 | while (step < resultLen) { 85 | val offset = step * 8 86 | var i = 0 87 | var l = 0 88 | while (i < 8 && offset + i < len1) { 89 | val x = state(i + offset) 90 | if (x > 0) { 91 | l |= (1 << i) 92 | } 93 | i += 1 94 | } 95 | result(step) = l.toByte 96 | step += 1 97 | } 98 | result 99 | } 100 | } 101 | 102 | object SimHashProcessor { 103 | def addVector(state: Array[Float], hash: Long): Unit = { 104 | val rng = new XorShiftStarRandomGenerator(hash) 105 | 106 | var i = 0 107 | val len = state.length 108 | while (i < len) { 109 | state(i) = state(i) + Ziggurat.computeNextGaussian(rng).toFloat 110 | i += 1 111 | } 112 | 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/utils/BuilderSyntax.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.utils 2 | 3 | import org.apache.spark.sql.Dataset 4 | 5 | object BuilderSyntax { 6 | implicit class BuilderOps[T](val o: T) extends AnyVal { 7 | @inline def ifEnabled(cond: Boolean)(fn: T => T): T = { 8 | if (cond) fn(o) else o 9 | } 10 | } 11 | 12 | } 13 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/utils/Levenshtein.java: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.utils; 2 | 3 | public class Levenshtein { 4 | 5 | private Levenshtein() { 6 | // instances forbidden 7 | } 8 | 9 | public static int[] floatRange(int len) { 10 | int[] result = new int[len]; 11 | for (int i = 0; i < len; ++i) { 12 | result[i] = i * 100; 13 | } 14 | return result; 15 | } 16 | 17 | public static int levenshteinDistance(CharSequence a, CharSequence b, int limit, int step) { 18 | int[] row0 = floatRange(b.length() + 1); 19 | int[] row1 = new int[b.length() + 1]; 20 | 21 | int al = a.length(); 22 | int bl = b.length(); 23 | for (int i = 0; i < al; ++i) { 24 | char c = a.charAt(i); 25 | for (int j = 1; j < 
bl; ++j) { 26 | char x = b.charAt(j - 1); 27 | 28 | } 29 | } 30 | return -1; 31 | } 32 | 33 | private static final int UMASK = 0x7fff_ffff; 34 | private static final int FMASK = 0x8000_0000; 35 | 36 | public static int levStep(int compressedScore, int scoreA, int scoreB) { 37 | int uscore = compressedScore & UMASK; 38 | int flag = compressedScore & FMASK; 39 | 40 | int score = scoreA; 41 | if (flag != 0) { 42 | score = scoreB; 43 | } 44 | 45 | return (uscore + score) & flag; 46 | } 47 | 48 | public static int levStepB(int compressedScore, int scoreA, int scoreB) { 49 | int uscore = compressedScore & UMASK; 50 | int flag = compressedScore & FMASK; 51 | 52 | int score = scoreA; 53 | if (flag != 0) { 54 | score = scoreB; 55 | } 56 | 57 | return (uscore + score) & flag; 58 | } 59 | 60 | public static final int MASK1 = 0b11111111; 61 | public static final int MASK2 = 0b00000000; 62 | } 63 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/utils/Resources.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.utils 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | object Resources { 6 | implicit class AutoClosableResource[T <: AutoCloseable](val x: T) extends AnyVal { 7 | @inline 8 | def use[X](fn: T => X): X = 9 | try { 10 | fn(x) 11 | } finally { 12 | x.close() 13 | } 14 | } 15 | 16 | implicit class SparkContextResource(val x: SparkContext) extends AnyVal { 17 | @inline 18 | def use[X](fn: SparkContext => X): X = 19 | try { 20 | fn(x) 21 | } finally { 22 | x.stop() 23 | } 24 | } 25 | 26 | } 27 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/utils/RowBuffer.java: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.utils; 2 | 3 | import it.unimi.dsi.fastutil.objects.ObjectArrayList; 4 | 5 | import java.util.Iterator; 6 | 7 | /** 8 | * Buffer for row-like objects. Indices of entries are not preserved. Has O(1) {@link #removeElementAt(int)} method 9 | * which removes an element at index and puts the last element to the removed position. 10 | * 11 | * @param 12 | * row-like object 13 | */ 14 | public final class RowBuffer extends ObjectArrayList { 15 | 16 | /** 17 | * An iterator class which supports removing the just returned element. 18 | * 19 | * @param 20 | */ 21 | public final static class DeletingIterator implements Iterator { 22 | private final T[] data; 23 | private final RowBuffer parent; 24 | private int position; 25 | 26 | public DeletingIterator(RowBuffer parent) { 27 | this.data = parent.a; 28 | this.parent = parent; 29 | this.position = 0; 30 | } 31 | 32 | @Override 33 | public boolean hasNext() { 34 | return position < parent.size; 35 | } 36 | 37 | @Override 38 | public T next() { 39 | T element = data[position]; 40 | position += 1; 41 | return element; 42 | } 43 | 44 | /** 45 | * Remove the element which was returned by the previous {@link #next()} call. 
46 | * 47 | * @return removed element 48 | */ 49 | public T removeElement() { 50 | int toRemoveIdx = position - 1; 51 | T element = parent.removeElementAt(toRemoveIdx); 52 | position = toRemoveIdx; 53 | return element; 54 | } 55 | } 56 | 57 | public DeletingIterator deletingIterator() { 58 | return new DeletingIterator<>(this); 59 | } 60 | 61 | public static RowBuffer single(T x) { 62 | RowBuffer buffer = new RowBuffer<>(); 63 | buffer.add(x); 64 | return buffer; 65 | } 66 | 67 | /** 68 | * Removes the current element from the collection. Last element is placed instead of the current element. 69 | * 70 | * @param index 71 | * where to remove 72 | * @return element which replaces current element 73 | */ 74 | public T removeElementAt(int index) { 75 | if (index < 0) { 76 | throw new IllegalArgumentException("index < 0"); 77 | } 78 | if (index >= size) { 79 | throw new IllegalArgumentException("index >= size"); 80 | } 81 | T[] arr = a; 82 | int lastIdx = size - 1; 83 | T last = arr[lastIdx]; 84 | arr[lastIdx] = null; 85 | if (index != lastIdx) { 86 | arr[index] = last; 87 | } 88 | size = lastIdx; 89 | return last; 90 | } 91 | 92 | public int addToBuffer(T element) { 93 | int sz = size; 94 | add(element); 95 | return sz; 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/utils/SentenceIterator.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.utils 2 | 3 | class SentenceIterator(input: String, maxLength: Int) extends Iterator[String] { 4 | 5 | private var start = 0 6 | 7 | override def hasNext: Boolean = start < input.length 8 | 9 | override def next(): String = { 10 | val curStart = start 11 | var curEnd = SentenceIterator.indexOfSeparator(input, curStart, input.length) match { 12 | case -1 => input.length 13 | case x => x + 1 14 | } 15 | 16 | val curLen = curEnd - curStart 17 | if (curLen > maxLength) { 18 | curEnd = curStart + maxLength 19 | } 20 | 21 | start = curEnd 22 | 23 | input.substring(curStart, curEnd) 24 | } 25 | } 26 | 27 | object SentenceIterator { 28 | private val SEPARATORS = "\n。、!?!?".toCharArray 29 | 30 | def indexOfSeparator(input: CharSequence, start: Int, end: Int): Int = { 31 | val seps = SEPARATORS 32 | val nseps = seps.length 33 | 34 | if (start < 0 || start > input.length()) { 35 | throw new IndexOutOfBoundsException() 36 | } 37 | 38 | if (end < 0 || end > input.length()) { 39 | throw new IndexOutOfBoundsException() 40 | } 41 | 42 | var i = start 43 | while (i < end) { 44 | val ch = input.charAt(i) 45 | var j = 0 46 | while (j < nseps) { 47 | val ch0 = seps(j) 48 | if (ch == ch0) { 49 | return i 50 | } 51 | j += 1 52 | } 53 | i += 1 54 | } 55 | -1 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/utils/SessionBufferIn.scala: -------------------------------------------------------------------------------- 1 | package org.apache.hc.core5.http.impl.nio 2 | 3 | import org.apache.hc.core5.http.nio.SessionInputBuffer 4 | 5 | trait ResettableBuffer extends SessionInputBuffer { 6 | def clear(): Unit 7 | def putBytes(bytes: Array[Byte]): Unit 8 | 9 | def position(): Int 10 | } 11 | 12 | object SessionBufferAccess { 13 | def instance(size: Int, lineSize: Int): ResettableBuffer = 14 | new SessionInputBufferImpl(size, lineSize) with ResettableBuffer { 15 | override def putBytes(bytes: Array[Byte]): Unit = { 16 
| val b = buffer() 17 | val totalSize = size.min(bytes.length) 18 | b.clear() 19 | b.put(bytes, 0, totalSize) 20 | } 21 | 22 | override def clear(): Unit = super.clear() 23 | 24 | override def position(): Int = buffer().position() 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/utils/TrieNode.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.utils 2 | 3 | import it.unimi.dsi.fastutil.chars.Char2ObjectOpenHashMap 4 | 5 | final class TrieNode[T] extends Char2ObjectOpenHashMap[TrieNode[T]](4) { 6 | private var position: Int = -1 7 | 8 | def findLongest(str: CharSequence, offset: Int): SearchResult = { 9 | var idx = offset 10 | val len = str.length() 11 | var end = -1 12 | var value = -1 13 | var node = this 14 | while (idx < len && node != null) { 15 | val ch = str.charAt(idx) 16 | val next = node.get(ch) 17 | if (next != null && next.position != -1) { 18 | end = idx + 1 19 | value = next.position 20 | } 21 | node = next 22 | idx += 1 23 | } 24 | SearchResult(end, value) 25 | } 26 | } 27 | 28 | object TrieNode { 29 | def make(data: Iterable[CharSequence]): TrieNode[Boolean] = { 30 | make(data.iterator) 31 | } 32 | 33 | def make(data: Iterator[CharSequence]): TrieNode[Boolean] = { 34 | val root = new TrieNode[Boolean]() 35 | var index = 0 36 | while (data.hasNext) { 37 | val str = data.next() 38 | var node = root 39 | var i = 0 40 | val len = str.length() 41 | while (i < len) { 42 | val ch = str.charAt(i) 43 | var subnode = node.get(ch) 44 | if (subnode == null) { 45 | subnode = new TrieNode[Boolean]() 46 | node.put(ch, subnode) 47 | } 48 | node = subnode 49 | i += 1 50 | } 51 | node.position = index 52 | index += 1 53 | } 54 | root 55 | } 56 | } 57 | 58 | final class SearchResult(val carrier: Long) extends AnyVal { 59 | def end: Int = (carrier & 0xffffffff).toInt 60 | 61 | def index: Int = (carrier >>> 32).toInt 62 | 63 | def ==(o: SearchResult): Boolean = { 64 | o.carrier == carrier 65 | } 66 | 67 | def !=(o: SearchResult): Boolean = !(this == o) 68 | 69 | def found: Boolean = end > 0 70 | 71 | def failure: Boolean = !found 72 | 73 | override def toString: String = s"SearchResult($end, $index)" 74 | } 75 | 76 | object SearchResult { 77 | def apply(end: Int, index: Int): SearchResult = { 78 | val repr = ((index & 0xffffffffL) << 32) | (end & 0xffffffffL) 79 | new SearchResult(repr) 80 | } 81 | 82 | def empty(): SearchResult = apply(-1, -1) 83 | } 84 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/utils/WarcFileReader.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.utils 2 | 3 | import com.google.common.io.CountingInputStream 4 | import com.worksap.nlp.uzushio.lib.utils.WarcFileReader.MAX_RECORD_SIZE 5 | import com.worksap.nlp.uzushio.lib.warc.WarcRecord 6 | import org.apache.hadoop.conf.Configuration 7 | import org.apache.hadoop.fs.Path 8 | import org.apache.log4j.LogManager 9 | import org.archive.io.warc.WARCReaderFactory 10 | 11 | import java.io.BufferedInputStream 12 | 13 | /** Reads [[WarcRecord]]s from a WARC file using Hadoop filesystem APIs. 
*/ 14 | class WarcFileReader(conf: Configuration, filePath: Path) { 15 | @transient private lazy val logger = LogManager.getLogger(this.getClass.getSimpleName) 16 | 17 | /** Opens a warc file and setup an iterator of records. */ 18 | private def fs = filePath.getFileSystem(conf) 19 | private val fileSize = fs.getFileStatus(filePath).getLen 20 | private val fsin = { 21 | val rawStream = fs.open(filePath) 22 | val wrapped = 23 | if (rawStream.markSupported()) { 24 | rawStream 25 | } else new BufferedInputStream(rawStream) 26 | // noinspection UnstableApiUsage 27 | new CountingInputStream(wrapped) 28 | } 29 | private val reader = WARCReaderFactory.get(filePath.getName, fsin, true) 30 | private val recordIter = reader.iterator 31 | 32 | /** Init counters to report progress. */ 33 | private var recordsRead: Long = 0 34 | 35 | /** Closes the file and reader. */ 36 | def close(): Unit = { 37 | reader.close() 38 | fsin.close() 39 | } 40 | 41 | /** Reads the next record from the iterator. 42 | */ 43 | def read(): WarcRecord = { 44 | if (!recordIter.hasNext) { 45 | throw new java.util.NoSuchElementException() 46 | } 47 | 48 | try { 49 | val rec = recordIter.next() 50 | val length = rec.available() 51 | if (length > MAX_RECORD_SIZE) { 52 | rec.skip(length) 53 | logger.info(s"from $filePath skipped ${rec.getHeader}") 54 | recordsRead += 1 55 | read() 56 | } else { 57 | val record = new WarcRecord(rec, filePath) 58 | recordsRead += 1 59 | record 60 | } 61 | } catch { 62 | case e: java.io.EOFException => 63 | logger.warn(s"error while iterating warc, try to skip: $filePath", e) 64 | read() 65 | } 66 | } 67 | 68 | /** Returns the number of records that have been read. */ 69 | def getRecordsRead: Long = recordsRead 70 | 71 | /** Returns the number of bytes that have been read. */ 72 | def bytesRead: Long = fsin.getCount 73 | 74 | /** Returns the proportion of the file that has been read. */ 75 | def getProgress: Float = { 76 | if (fileSize <= 0) return 1.0f 77 | bytesRead.toFloat / fileSize.toFloat 78 | } 79 | } 80 | 81 | object WarcFileReader { 82 | final val MAX_RECORD_SIZE = 16 * 1024 * 1024 // 16MB 83 | } 84 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/warc/WarcInputFormat.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.warc 2 | 3 | import com.worksap.nlp.uzushio.lib.utils.WarcFileReader 4 | import org.apache.hadoop.fs.Path 5 | import org.apache.hadoop.io.LongWritable 6 | import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat, FileSplit} 7 | import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext} 8 | 9 | /** Hadoop InputFormat for WARC files. 10 | * 11 | * Key is 1-index LongWritable. Use get() method to take Long value. 12 | */ 13 | class WarcInputFormat extends FileInputFormat[LongWritable, WarcWritable] { 14 | 15 | /** Opens a WARC file (possibly compressed), and returns a RecordReader for accessing it. 16 | */ 17 | override def createRecordReader( 18 | split: InputSplit, 19 | context: TaskAttemptContext 20 | ) = { 21 | new WarcRecordReader() 22 | } 23 | 24 | override def isSplitable(context: JobContext, filename: Path): Boolean = { 25 | // we cannot (sanely) split warc files, due to its variable-length records. 26 | false 27 | } 28 | } 29 | 30 | /** Wrapper class of [[WarcFileReader]] to implement RecordReader. 
*/ 31 | class WarcRecordReader extends RecordReader[LongWritable, WarcWritable] { 32 | private val key = new LongWritable() 33 | private val value = new WarcWritable() 34 | 35 | private var reader: WarcFileReader = null 36 | 37 | override def initialize( 38 | split: InputSplit, 39 | context: TaskAttemptContext 40 | ): Unit = { 41 | reader = new WarcFileReader( 42 | context.getConfiguration, 43 | split.asInstanceOf[FileSplit].getPath 44 | ) 45 | } 46 | 47 | override def nextKeyValue(): Boolean = { 48 | try { 49 | val record = reader.read() 50 | key.set(reader.getRecordsRead) 51 | value.setRecord(record) 52 | true 53 | } catch { 54 | case _: java.util.NoSuchElementException => false 55 | } 56 | } 57 | 58 | override def getCurrentKey: LongWritable = key 59 | 60 | override def getCurrentValue: WarcWritable = value 61 | 62 | override def getProgress: Float = reader.getProgress 63 | 64 | override def close(): Unit = { 65 | reader.close() 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/warc/WarcLoader.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.warc 2 | 3 | import org.apache.hadoop.io.LongWritable 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.rdd.RDD 6 | 7 | object WarcLoader { 8 | /* Load WARC file as RDD. */ 9 | def readWarcFiles( 10 | spark: SparkContext, 11 | name: String 12 | ): RDD[WarcRecord] = { 13 | spark.newAPIHadoopFile[LongWritable, WarcWritable, WarcInputFormat]( 14 | name 15 | ).map { case (_, v) => v.getRecord } 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/warc/WarcRecord.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.warc 2 | 3 | import com.worksap.nlp.uzushio.lib.warc.WarcRecord.{ 4 | RECORD_ACCESS_DATE, 5 | RECORD_ID, 6 | RECORD_TRUNCATED, 7 | RECORD_TYPE, 8 | RECORD_URL 9 | } 10 | import org.apache.commons.io.IOUtils 11 | import org.apache.hadoop.fs.Path 12 | import org.archive.format.warc.WARCConstants 13 | import org.archive.io.ArchiveRecord 14 | 15 | import java.io.Serializable 16 | 17 | /** Serializable wrapper of ArchiveRecord, with body read in memory. 
*/ 18 | class WarcRecord(record: ArchiveRecord, val path: Path) extends Serializable { 19 | // capture headers 20 | private val headers = record.getHeader.getHeaderFields 21 | // read body of request 22 | val content: Array[Byte] = IOUtils.toByteArray(record, record.available()); 23 | 24 | def isResponse: Boolean = { 25 | // ref https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.0 26 | val warcType = headers.getOrDefault(RECORD_TYPE, "") 27 | "response" == warcType 28 | } 29 | 30 | def isTruncated: Boolean = headers.get(RECORD_TRUNCATED) match { 31 | case null => false 32 | case s: CharSequence => s.length() > 0 33 | case _ => true 34 | } 35 | 36 | def url: String = headers.getOrDefault(RECORD_URL, "").toString 37 | 38 | def accessDate: String = headers.get(RECORD_ACCESS_DATE).toString 39 | 40 | def docId: String = headers.get(RECORD_ID).toString 41 | } 42 | 43 | object WarcRecord { 44 | final val RECORD_TYPE = WARCConstants.HEADER_KEY_TYPE 45 | final val RECORD_TRUNCATED = WARCConstants.HEADER_KEY_TRUNCATED 46 | final val RECORD_URL = WARCConstants.HEADER_KEY_URI 47 | final val RECORD_ACCESS_DATE = WARCConstants.HEADER_KEY_DATE 48 | final val RECORD_ID = WARCConstants.HEADER_KEY_ID 49 | } 50 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/worksap/nlp/uzushio/lib/warc/WarcWritable.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.warc 2 | 3 | import org.apache.hadoop.io.Writable 4 | 5 | import java.io.{DataInput, DataOutput, Serializable}; 6 | 7 | /** A mutable wrapper around a [[WarcRecord]] implementing the Hadoop Writable and Serializable (for 8 | * Spark) interfaces. 9 | */ 10 | class WarcWritable(private var record: WarcRecord = null) extends Writable with Serializable { 11 | 12 | /** Returns the record currently wrapped by this writable. */ 13 | def getRecord: WarcRecord = record 14 | 15 | /** Updates the record held within this writable wrapper. */ 16 | def setRecord(newRecord: WarcRecord): Unit = { 17 | record = newRecord; 18 | } 19 | 20 | /** Appends the current record to a [[DataOutput]] stream. */ 21 | override def write(out: DataOutput): Unit = { 22 | // TODO: impl (not neccessary for current use case) 23 | // if (record != null) record.write(out); 24 | } 25 | 26 | /** Parses a [[WarcRecord]] out of a [[DataInput]] stream, and make it the current record. 27 | */ 28 | override def readFields(in: DataInput): Unit = { 29 | // TODO: impl (not neccessary for current use case) 30 | // record = new WarcRecord(in); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /lib/src/test/resources/docs/links.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Title 6 | 7 | 8 |
9 | 10 |
11 |
12 | 画像リンク 13 |
14 |
15 |

 

16 |
17 | 18 | -------------------------------------------------------------------------------- /lib/src/test/resources/docs/paragraph_detect.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | こんにちは 6 |
7 | 早稲田大学で 8 |
9 | 自然言語処理 10 |
11 | を 12 |
13 | 勉強する。 14 |
15 | -------------------------------------------------------------------------------- /lib/src/test/resources/lang/shift_jis.txt: -------------------------------------------------------------------------------- 1 | HTTP/1.0 200 OK 2 | Date: Sun, 26 May 2013 08:11:12 GMT 3 | Content-Length: 4186 4 | Last-Modified: Sat, 29 Dec 2012 16:50:56 GMT 5 | Accept-Ranges: bytes 6 | Content-Type: text/html 7 | Connection: close 8 | Server: Apache 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /lib/src/test/resources/pipeline/doc_len.conf: -------------------------------------------------------------------------------- 1 | filters: [ 2 | {"class": "DocLength", "low": 5} 3 | ] -------------------------------------------------------------------------------- /lib/src/test/scala/com/worksap/nlp/uzushio/lib/cleaning/DocumentSpec.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.cleaning 2 | 3 | import org.scalatest.freespec.AnyFreeSpec 4 | 5 | class DocumentSpec extends AnyFreeSpec { 6 | "Document" - { 7 | "computes next double correctly" in { 8 | val docs = (1 to 1000).map(i => Document(Vector.empty, docId = ('a' + i).toChar.toString)) 9 | val doubles = docs.map(_.randomDouble) 10 | for (d <- doubles) { 11 | assert(d < 1.0) 12 | } 13 | assert(doubles.distinct.size == 1000) 14 | val sum = doubles.sum 15 | assert((sum - 500).abs < 2) 16 | } 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /lib/src/test/scala/com/worksap/nlp/uzushio/lib/cleaning/ParagraphSpec.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.cleaning 2 | 3 | import org.scalatest.freespec.AnyFreeSpec 4 | 5 | class ParagraphSpec extends AnyFreeSpec { 6 | "Paragraph" - { 7 | "can return css selector strings" in { 8 | val par = Paragraph("body>p.text", "hello") 9 | assert(par.cssPath == Seq(PathSegment("body", null, Nil), PathSegment("p", null, Seq("text")))) 10 | } 11 | 12 | "can return designated tags in path without css selector" in { 13 | val par = Paragraph("body>p.text", "hello") 14 | assert(par.firstMatchingTag(Seq("p", "span")) == Some(PathSegment("p", null, Seq("text")))) 15 | } 16 | 17 | "do not return designated tags in path" in { 18 | val par = Paragraph("body>p.text", "hello") 19 | assert(par.firstMatchingTag(Seq("span")) == None) 20 | } 21 | 22 | "can return true if the paragraph contains designated tags" in { 23 | val par = Paragraph("body>p.text", "hello") 24 | assert(par.containsTags(Seq("p", "span"))) 25 | } 26 | 27 | "do not return true if the paragraph does not contain designated tags" in { 28 | val par = Paragraph("body>p.text", "hello") 29 | assert(!par.containsTags(Seq("span"))) 30 | } 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /lib/src/test/scala/com/worksap/nlp/uzushio/lib/cleaning/PathSegmentSpec.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.cleaning 2 | 3 | import org.scalatest.freespec.AnyFreeSpec 4 | 5 | class PathSegmentSpec extends AnyFreeSpec{ 6 | "PathSelector" - { 7 | "parses selector without classes or id" in { 8 | val sel = PathSegment.parse("test") 9 | assert(sel.tag == "test") 10 | assert(sel.id == null) 11 | assert(sel.classes.isEmpty) 12 | assert(sel.toString == "test") 13 | } 14 | 15 | "parses 
selector without classes and with id" in { 16 | val sel = PathSegment.parse("test#id") 17 | assert(sel.tag == "test") 18 | assert(sel.id == "id") 19 | assert(sel.classes.isEmpty) 20 | assert(sel.toString == "test#id") 21 | } 22 | 23 | "parses selector with one class and without id" in { 24 | val sel = PathSegment.parse("test.clz1") 25 | assert(sel.tag == "test") 26 | assert(sel.id == null) 27 | assert(sel.classes == Seq("clz1")) 28 | assert(sel.toString == "test.clz1") 29 | } 30 | 31 | "parses selector with two classes and without id" in { 32 | val sel = PathSegment.parse("test.clz1.clz2") 33 | assert(sel.tag == "test") 34 | assert(sel.id == null) 35 | assert(sel.classes == Seq("clz1", "clz2")) 36 | assert(sel.toString == "test.clz1.clz2") 37 | } 38 | 39 | "parses selector with two classes and with id" in { 40 | val sel = PathSegment.parse("test.clz1.clz2#id") 41 | assert(sel.tag == "test") 42 | assert(sel.id == "id") 43 | assert(sel.classes == Seq("clz1", "clz2")) 44 | assert(sel.toString == "test.clz1.clz2#id") 45 | } 46 | 47 | "parses selector with two classes and with id inside other string" in { 48 | val sel = PathSegment.parse("foo test.clz1.clz2#id test.clz#id2", 4, 21) 49 | assert(sel.tag == "test") 50 | assert(sel.id == "id") 51 | assert(sel.classes == Seq("clz1", "clz2")) 52 | assert(sel.toString == "test.clz1.clz2#id") 53 | } 54 | 55 | "parses path of two elements" in { 56 | val path = PathSegment.parsePath("body>li.test") 57 | assert(path.size == 2) 58 | assert(path(0).tag == "body") 59 | assert(path(1).tag == "li") 60 | assert(path(1).classes == Seq("test")) 61 | } 62 | } 63 | 64 | } 65 | -------------------------------------------------------------------------------- /lib/src/test/scala/com/worksap/nlp/uzushio/lib/cleaning/PipelineSpec.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.cleaning 2 | 3 | import com.typesafe.config.ConfigFactory 4 | import com.worksap.nlp.uzushio.lib.filters.WordInstances 5 | import com.worksap.nlp.uzushio.lib.filters.base.DocFilter 6 | import org.scalatest.freespec.AnyFreeSpec 7 | 8 | case class TestFilter(test: String) extends DocFilter { 9 | override def checkDocument(doc: Document): Document = Document(Paragraph("", test)) 10 | } 11 | 12 | class PipelineSpec extends AnyFreeSpec { 13 | "Pipeline" - { 14 | "can instantiate class fully specified" in { 15 | val cfg = ConfigFactory.parseString( 16 | """{class: WordInstances, list: "ng_words.txt", minimum: 3}""" 17 | ) 18 | val filter = Pipeline.instantiateFilter(cfg) 19 | assert(filter != null) 20 | assert(filter.isInstanceOf[WordInstances]) 21 | } 22 | 23 | "can instantiate class with default value" in { 24 | val cfg = ConfigFactory.parseString( 25 | """{class: WordInstances, list: "ng_words.txt"}""" 26 | ) 27 | val filter = Pipeline.instantiateFilter(cfg) 28 | assert(filter != null) 29 | assert(filter.isInstanceOf[WordInstances]) 30 | } 31 | 32 | "can instantiate pipeline from classpath" - { 33 | val pipeline = Pipeline.make("doc_len.conf", ConfigFactory.empty()) 34 | assert(pipeline != null) 35 | } 36 | 37 | "can instantiate filter with props" - { 38 | val cfg = ConfigFactory.parseString( 39 | """filters: [ 40 | {class: "com.worksap.nlp.uzushio.lib.cleaning.TestFilter", test: ${a} } 41 | ]""" 42 | ) 43 | val props = ConfigFactory.parseString("""a: value""") 44 | val pipeline = Pipeline.make(cfg, props) 45 | val result = pipeline.applyFilters(Document()) 46 | assert(result.paragraphs.length == 1) 47 | 
assert(result.paragraphs.head.text == "value") 48 | } 49 | } 50 | 51 | } 52 | -------------------------------------------------------------------------------- /lib/src/test/scala/com/worksap/nlp/uzushio/lib/dupes/CandidateRowProcessorSpec.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.dupes 2 | 3 | import com.worksap.nlp.uzushio.lib.runners.{ 4 | CandidateRowProcessor, 5 | DuplicateCandidateRow 6 | } 7 | import com.worksap.nlp.uzushio.lib.stats.{NgramHashExtractor, SimHashProcessor} 8 | import org.apache.spark.sql.catalyst.expressions.XXH64 9 | import org.apache.spark.unsafe.types.UTF8String 10 | import org.scalatest.freespec.AnyFreeSpec 11 | 12 | object RowCandidate { 13 | private val ngram = new NgramHashExtractor(2, 4) 14 | private val simhasher = new SimHashProcessor(128) 15 | 16 | def apply(x: String): DuplicateCandidateRow = { 17 | val utf8Str = UTF8String.fromString(x) 18 | val hash = XXH64.hashUTF8String(utf8Str, 42L) 19 | val simhashState = simhasher.init 20 | simhasher.update(simhashState, x, ngram) 21 | DuplicateCandidateRow( 22 | x, 23 | simhasher.result(simhashState), 24 | 1, 25 | hash, 26 | hash 27 | ) 28 | } 29 | } 30 | 31 | class CandidateRowProcessorSpec extends AnyFreeSpec { 32 | "stuff (1) is processed correctly" in { 33 | val pars = Seq( 34 | RowCandidate("docomo STYLE series N-01C"), 35 | RowCandidate("docomo STYLE series SH-03E"), 36 | RowCandidate("4位docomo STYLE series N-01E"), 37 | RowCandidate("5位docomo STYLE series N-03D") 38 | ) 39 | val proc = new CandidateRowProcessor(1024 * 1024, 70, pars.iterator) 40 | val result = proc.toArray 41 | assert(result.length == 4) 42 | assert(result.map(_.reprHash).toSet.size == 1) 43 | } 44 | 45 | "stuff (2) is processed correctly" in { 46 | val pars = Seq( 47 | RowCandidate("らくらくホン ベーシック3 [ゴールド]"), 48 | RowCandidate("らくらくホン ベーシック3 [ネイビー]"), 49 | RowCandidate("らくらくホン ベーシック3 [ピンク]"), 50 | RowCandidate("> らくらくホン ベーシック3 [ホワイト]"), 51 | RowCandidate("らくらくホン ベーシック3 [ホワイト]"), 52 | RowCandidate("らくらくホン ベーシック3 [ホワイト] のクチコミ掲示板") 53 | ) 54 | val proc = new CandidateRowProcessor(1024 * 1024, 70, pars.iterator) 55 | val result = proc.toArray 56 | assert(result.length == 6) 57 | assert(result.map(_.reprHash).toSet.size == 2) 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /lib/src/test/scala/com/worksap/nlp/uzushio/lib/filters/AdjacentDuplicateParagraphsSpec.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.filters 2 | 3 | import com.worksap.nlp.uzushio.lib.cleaning.{Document, Paragraph} 4 | import org.scalatest.freespec.AnyFreeSpec 5 | 6 | class AdjacentDuplicateParagraphsSpec extends AnyFreeSpec { 7 | "AdjacentDuplicateParagraphs" - { 8 | val filter = new AdjacentDuplicateParagraphs() 9 | "works with empty document" in { 10 | val filtered = filter.checkDocument(Document()) 11 | assert(filtered.paragraphs.isEmpty) 12 | } 13 | 14 | 15 | "filters out docs correctly" in { 16 | val doc = Document( 17 | Paragraph("", "test1"), 18 | Paragraph("", "test1"), 19 | Paragraph("", "test2"), 20 | ) 21 | val filtered = filter.checkDocument(doc) 22 | assert(filtered.paragraphs == Seq( 23 | Paragraph("", "test1"), 24 | Paragraph("", "test2"), 25 | )) 26 | } 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /lib/src/test/scala/com/worksap/nlp/uzushio/lib/filters/CompressionRateSpec.scala: 
-------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.filters 2 | 3 | import com.worksap.nlp.uzushio.lib.cleaning.{Document, Paragraph} 4 | import org.scalatest.freespec.AnyFreeSpec 5 | 6 | class CompressionRateSpec extends AnyFreeSpec { 7 | "CompressionRate" - { 8 | "correctly survives serialization" in { 9 | val doc = Document(Array(Paragraph("", "test1 test2"))) 10 | val f1 = new CompressionRate(0.1f, 1.2f) 11 | val b1 = f1.encodeDocContent(doc) 12 | val f2 = cloneViaSerialization(f1) 13 | val b2 = f2.encodeDocContent(doc) 14 | assert(!(b1 eq b2)) 15 | assert(!(f1.High eq f2.High)) 16 | assert(!(f1 eq f2)) 17 | assert(f1.toString == f2.toString) 18 | } 19 | 20 | "computes ratio" in { 21 | val f1 = new CompressionRate(0.1f, 1.2f) 22 | val doc = testDoc("test1test1", "test2test2", "test5test5") 23 | val ratio = f1.compressionRatio(doc) 24 | assert(ratio < 1) 25 | } 26 | } 27 | 28 | } 29 | -------------------------------------------------------------------------------- /lib/src/test/scala/com/worksap/nlp/uzushio/lib/filters/DeduplicateDocumentsSpec.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.filters 2 | 3 | import com.worksap.nlp.uzushio.lib.cleaning.Document 4 | import javax.swing.tree.FixedHeightLayoutCache 5 | import org.scalatest.freespec.AnyFreeSpec 6 | 7 | 8 | class FixedProbRandomGenerator( 9 | val returnProb: Double = 0.5 10 | ) extends RandomGeneratorFromStringBase { 11 | def generateRandom(docId: String): Double = returnProb 12 | } 13 | 14 | 15 | class DeduplicateDocumentsSpec extends AnyFreeSpec { 16 | def generateFilter(returnProb: Double): DeduplicateDocuments = { 17 | val randomGenerator = new FixedProbRandomGenerator(returnProb) 18 | new DeduplicateDocuments(100, randomGenerator) 19 | } 20 | 21 | "DeduplicateDocumentsSpec" - { 22 | val filter = generateFilter(0.5) 23 | 24 | "computes correct ratio for non-duplicated documents" in { 25 | val paragraphs = testParagraphs( 26 | Seq("test", "test", "test", "test"), 27 | Seq(1, 1, 1, 1) 28 | ) 29 | val doc = Document(paragraphs, "test") 30 | assert(0.0f == filter.computeNearDuplicateTextRatio(doc)) 31 | assert(false == filter.shouldRemoveDocument(doc)) 32 | } 33 | 34 | "computes correct ratio for non-duplicated documents (boundary)" in { 35 | val paragraphs = testParagraphs( 36 | Seq("test", "test", "test", "test"), 37 | Seq(1, 1, 99, 100) 38 | ) 39 | val doc = Document(paragraphs, "test") 40 | assert(0.5f > filter.computeNearDuplicateTextRatio(doc)) 41 | assert(false == filter.shouldRemoveDocument(doc)) 42 | } 43 | 44 | "computes correct ratio for duplicated documents" in { 45 | val paragraphs = testParagraphs( 46 | Seq("test", "test", "test", "test"), 47 | Seq(100, 100, 100, 100) 48 | ) 49 | val doc = Document(paragraphs, "test") 50 | assert(1.0f == filter.computeNearDuplicateTextRatio(doc)) 51 | assert(true == filter.shouldRemoveDocument(doc)) 52 | } 53 | 54 | "computes correct ratio for duplicated documents (boundary)" in { 55 | val paragraphs = testParagraphs( 56 | Seq("test", "test", "test", "test"), 57 | Seq(1, 1, 100, 100) 58 | ) 59 | val doc = Document(paragraphs, "test") 60 | assert(0.5f == filter.computeNearDuplicateTextRatio(doc)) 61 | assert(true == filter.shouldRemoveDocument(doc)) 62 | } 63 | } 64 | } -------------------------------------------------------------------------------- /lib/src/test/scala/com/worksap/nlp/uzushio/lib/filters/LinkCharRatioSpec.scala: 
-------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.filters 2 | 3 | import com.worksap.nlp.uzushio.lib.utils.Paragraphs 4 | import org.scalatest.freespec.AnyFreeSpec 5 | 6 | class LinkCharRatioSpec extends AnyFreeSpec { 7 | def a(x: String): String = 8 | s"${Paragraphs.HTML_LINK_START}$x${Paragraphs.HTML_LINK_END}" 9 | 10 | "LinkCharRatio" - { 11 | val filter = new LinkCharRatio() 12 | "computes correct ratio for empty document" in { 13 | val doc = testDoc("") 14 | assert(0.0f == filter.calcLinkCharRatio(doc)) 15 | } 16 | 17 | "computes correct ratio for non-empty document without links" in { 18 | val doc = testDoc("test") 19 | assert(0.0f == filter.calcLinkCharRatio(doc)) 20 | } 21 | 22 | "computes correct ratio for non-empty document with links" in { 23 | val doc = testDoc(s"test${a("baka")}") 24 | assert(0.5f == filter.calcLinkCharRatio(doc)) 25 | } 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /lib/src/test/scala/com/worksap/nlp/uzushio/lib/filters/MarkdownizeHeadingSpec.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.filters 2 | 3 | import com.worksap.nlp.uzushio.lib.cleaning.Paragraph 4 | import org.scalatest.freespec.AnyFreeSpec 5 | 6 | class MarkdownizeHeadingSpec extends AnyFreeSpec { 7 | "MarkdownizeHeading" - { 8 | val filter = new MarkdownizeHeading() 9 | 10 | "do no operation for empty paragraph" in { 11 | val p = Paragraph("body>p.text", "") 12 | assert("" == filter.checkParagraph(p).text) 13 | } 14 | 15 | "do no operation for no heading paragraph" in { 16 | val p = Paragraph("body>p.text", "test") 17 | assert("test" == filter.checkParagraph(p).text) 18 | } 19 | 20 | "add markdown heading symbol for h1 paragraph" in { 21 | val p = Paragraph("body>h1.text", "test") 22 | assert("# test" == filter.checkParagraph(p).text) 23 | } 24 | 25 | "add markdown heading symbol for h2 paragraph" in { 26 | val p = Paragraph("body>h2.text", "test") 27 | assert("## test" == filter.checkParagraph(p).text) 28 | } 29 | 30 | "add markdown heading symbol for h3 paragraph" in { 31 | val p = Paragraph("body>h3.text", "test") 32 | assert("### test" == filter.checkParagraph(p).text) 33 | } 34 | 35 | "add markdown heading symbol for h4 paragraph" in { 36 | val p = Paragraph("body>h4.text", "test") 37 | assert("#### test" == filter.checkParagraph(p).text) 38 | } 39 | 40 | "add markdown heading symbol for h5 paragraph" in { 41 | val p = Paragraph("body>h5.text", "test") 42 | assert("##### test" == filter.checkParagraph(p).text) 43 | } 44 | 45 | "add markdown heading symbol for h6 paragraph" in { 46 | val p = Paragraph("body>h6.text", "test") 47 | assert("###### test" == filter.checkParagraph(p).text) 48 | } 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /lib/src/test/scala/com/worksap/nlp/uzushio/lib/filters/NoContentDOMSpec.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.filters 2 | 3 | import com.worksap.nlp.uzushio.lib.cleaning.Paragraph 4 | import org.scalatest.freespec.AnyFreeSpec 5 | 6 | class NoContentDOMSpec extends AnyFreeSpec { 7 | "NoContentDOM" - { 8 | val filter = new NoContentDOM() 9 | 10 | "do no operation for paragraph in tag that be able to have content" in { 11 | val p = Paragraph("body>article>p", "text") 12 | assert(filter.checkParagraph(p).remove == null) 
13 | } 14 | 15 | "sign remove for header tag paragraph" in { 16 | val p = Paragraph("body>header>p", "test") 17 | assert(filter.checkParagraph(p).remove != null) 18 | } 19 | 20 | "sign remove for footer tag paragraph" in { 21 | val p = Paragraph("body>footer>p", "test") 22 | assert(filter.checkParagraph(p).remove != null) 23 | } 24 | 25 | "sign remove for aside tag paragraph" in { 26 | val p = Paragraph("body>aside>p", "test") 27 | assert(filter.checkParagraph(p).remove != null) 28 | } 29 | 30 | "sign remove for nav tag paragraph" in { 31 | val p = Paragraph("body>nav>p", "test") 32 | assert(filter.checkParagraph(p).remove != null) 33 | } 34 | 35 | "sign remove for noscript tag paragraph" in { 36 | val p = Paragraph("body>noscript", "test") 37 | assert(filter.checkParagraph(p).remove != null) 38 | } 39 | 40 | "sign remove for form tag paragraph" in { 41 | val p = Paragraph("body>form", "test") 42 | assert(filter.checkParagraph(p).remove != null) 43 | } 44 | 45 | "sign remove for div tag with header class paragraph" in { 46 | val p = Paragraph("body>div.header>p", "test") 47 | assert(filter.checkParagraph(p).remove != null) 48 | } 49 | 50 | "sign remove for div tag with header id paragraph" in { 51 | val p = Paragraph("body>div#header>p", "test") 52 | assert(filter.checkParagraph(p).remove != null) 53 | } 54 | 55 | "sign remove for div tag with header-test id paragraph" in { 56 | val p = Paragraph("body>div#header-test>p", "test") 57 | assert(filter.checkParagraph(p).remove != null) 58 | } 59 | 60 | "sign remove for div tag with breadcrumbs-test id paragraph" in { 61 | val p = Paragraph("body>div.breadcrumbs-test>p", "test") 62 | assert(filter.checkParagraph(p).remove != null) 63 | } 64 | 65 | "sign remove for div tag with widget_wrapper id paragraph" in { 66 | val p = Paragraph("body>div#widget_wrapper>p", "test") 67 | assert(filter.checkParagraph(p).remove != null) 68 | } 69 | 70 | "sign remove for div tag with testLogo id paragraph" in { 71 | val p = Paragraph("body>div#testLogo>p", "test") 72 | assert(filter.checkParagraph(p).remove != null) 73 | } 74 | 75 | "sign remove for div tag with headerTop id paragraph" in { 76 | val p = Paragraph("body>div#headerTop>p", "test") 77 | assert(filter.checkParagraph(p).remove != null) 78 | } 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /lib/src/test/scala/com/worksap/nlp/uzushio/lib/filters/WordInstancesSpec.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.filters 2 | 3 | import org.scalatest.freespec.AnyFreeSpec 4 | 5 | class WordInstancesSpec extends AnyFreeSpec { 6 | "WordInstances" - { 7 | "hojichar - adult" - { 8 | val filter = new WordInstances("hojichar/adult_keywords_ja.txt") 9 | "can score single paragraph document" in { 10 | val doc = testDoc("18禁 20禁 21禁") 11 | val score = filter.scoreDocument(doc) 12 | assert(score == 3.0f) 13 | } 14 | } 15 | 16 | 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /lib/src/test/scala/com/worksap/nlp/uzushio/lib/filters/package.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib 2 | 3 | import com.worksap.nlp.uzushio.lib.cleaning.{Document, Paragraph} 4 | import com.worksap.nlp.uzushio.lib.filters.base.FilterBase 5 | 6 | import java.io.{ 7 | ByteArrayInputStream, 8 | ByteArrayOutputStream, 9 | ObjectInputStream, 10 | ObjectOutputStream 11 | } 12 | 
import scala.annotation.varargs 13 | 14 | package object filters { 15 | def cloneViaSerialization[T <: FilterBase](f: T): T = { 16 | val bytes = new ByteArrayOutputStream() 17 | val str = new ObjectOutputStream(bytes) 18 | str.writeObject(f) 19 | str.flush() 20 | val data = bytes.toByteArray 21 | val binput = new ByteArrayInputStream(data) 22 | val istr = new ObjectInputStream(binput) 23 | val obj = istr.readObject() 24 | f.getClass.cast(obj) 25 | } 26 | 27 | def testDoc(data: String*): Document = { 28 | Document( 29 | data.map { text => 30 | Paragraph("", text) 31 | }.toIndexedSeq 32 | ) 33 | } 34 | 35 | def testParagraphs(texts: Seq[String], nearFreqs: Seq[Int] = Seq(), exactFreqs: Seq[Int] = Seq(), paths: Seq[String] = Seq()): IndexedSeq[Paragraph] = { 36 | require(texts.length == nearFreqs.length || nearFreqs.isEmpty) 37 | require(texts.length == exactFreqs.length || exactFreqs.isEmpty) 38 | require(texts.length == paths.length || paths.isEmpty) 39 | 40 | val nearFreqs_ = if (nearFreqs.nonEmpty) nearFreqs else Seq.fill(texts.length)(1) 41 | val exactFreqs_ = if (exactFreqs.nonEmpty) exactFreqs else Seq.fill(texts.length)(1) 42 | val paths_ = if (paths.nonEmpty) paths else 0.to(texts.length).map(_ => "body>p.text") 43 | 44 | texts 45 | .zip(nearFreqs_) 46 | .zip(exactFreqs_) 47 | .zip(paths_) 48 | .map { case (((text, nearFreq), exactFreq), path) => Paragraph(path, text, 0, exactFreq, nearFreq) } 49 | .toIndexedSeq 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /lib/src/test/scala/com/worksap/nlp/uzushio/lib/html/HtmlParserSpec.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.html 2 | 3 | import com.worksap.nlp.uzushio.lib.html.HtmlParserSpec.RichByteArray 4 | import com.worksap.nlp.uzushio.lib.utils.ClasspathAccess 5 | import com.worksap.nlp.uzushio.lib.warc.{WarcEntryParser, WarcRecord} 6 | import org.apache.hadoop.fs.Path 7 | import org.archive.io.{ArchiveRecord, ArchiveRecordHeader} 8 | import org.scalatest.freespec.AnyFreeSpec 9 | 10 | import java.io.ByteArrayInputStream 11 | import java.nio.charset.StandardCharsets 12 | import java.util 13 | import scala.collection.mutable.ArrayBuffer 14 | 15 | class HtmlParserSpec extends AnyFreeSpec with ClasspathAccess { 16 | "html parsing" - { 17 | "works with small document" in { 18 | val processor = new WarcEntryParser 19 | val data = classpathBytes("docs/perldoc_ja_small.html") 20 | val paragraphs = processor.parseHtml(data.warc, 0, StandardCharsets.UTF_8) 21 | assert(paragraphs.length == 26) 22 | } 23 | 24 | "correct paragraph detection" in { 25 | val processor = new WarcEntryParser 26 | val data = classpathBytes("docs/paragraph_detect.html") 27 | val paragraphs = processor.parseHtml(data.warc, 0, StandardCharsets.UTF_8) 28 | assert( 29 | paragraphs == Seq( 30 | "body>div.containerこんにちは", 31 | "body>div.container>div#12345早稲田大学で", 32 | "body>div.container>div#12345>div自然言語処理", 33 | "body>div.container>div#12345を", 34 | "body>div.container勉強する。" 35 | ) 36 | ) 37 | } 38 | 39 | "empty paragraphs are ignored" in { 40 | val processor = new WarcEntryParser 41 | val data = classpathBytes("docs/links.html") 42 | val paragraphs = processor.parseHtml(data.warc, 0, StandardCharsets.UTF_8) 43 | assert( 44 | paragraphs == Seq( 45 | "body>div画像リンク" 46 | ) 47 | ) 48 | } 49 | } 50 | } 51 | 52 | object HtmlParserSpec { 53 | implicit class RichByteArray(val x: Array[Byte]) extends AnyVal { 54 | def warc: WarcRecord = new 
WarcRecord( 55 | new ArchiveRecord( 56 | new ByteArrayInputStream(x), 57 | new ArchiveRecordHeader { 58 | override def getDate: String = ??? 59 | 60 | override def getLength: Long = x.length 61 | override def getContentLength: Long = x.length 62 | 63 | override def getUrl: String = ??? 64 | 65 | override def getMimetype: String = ??? 66 | 67 | override def getVersion: String = ??? 68 | override def getOffset: Long = 0 69 | override def getHeaderValue(key: String): AnyRef = "none" 70 | override def getHeaderFieldKeys: util.Set[String] = ??? 71 | override def getHeaderFields: util.Map[String, AnyRef] = { 72 | val res = new util.HashMap[String, AnyRef]() 73 | res 74 | } 75 | override def getReaderIdentifier: String = ??? 76 | override def getRecordIdentifier: String = ??? 77 | override def getDigest: String = ??? 78 | override def getContentBegin: Int = ??? 79 | }, 80 | 0, 81 | false, 82 | false 83 | ) {}, 84 | new Path("file:///dev/mem") 85 | ) 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /lib/src/test/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimationSpec.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.lang 2 | 3 | import com.worksap.nlp.uzushio.lib.utils.ClasspathAccess 4 | import org.scalatest.freespec.AnyFreeSpec 5 | 6 | class LangEstimationSpec extends AnyFreeSpec with ClasspathAccess { 7 | "LangEstimation" - { 8 | val sniffer = new LangTagSniffer() 9 | "sniffs charset shift_jis fragment" in { 10 | val data = classpathBytes("lang/shift_jis.txt") 11 | val tags = sniffer.sniffTags(data, 0, data.length) 12 | assert("Shift-JIS" == tags.charset) 13 | } 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /lib/src/test/scala/com/worksap/nlp/uzushio/lib/utils/ClasspathAccess.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.utils 2 | 3 | import org.apache.commons.io.IOUtils 4 | 5 | trait ClasspathAccess { 6 | def classpathBytes(name: String): Array[Byte] = { 7 | val resource = getClass.getClassLoader.getResource(name) 8 | IOUtils.toByteArray(resource) 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /lib/src/test/scala/com/worksap/nlp/uzushio/lib/utils/ParagraphsSpec.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.utils 2 | 3 | import org.scalatest.freespec.AnyFreeSpec 4 | 5 | class ParagraphsSpec extends AnyFreeSpec { 6 | "correctly splits paragraphs" in { 7 | val doc = "test1\n\ntest2\ntest3" 8 | val pars = Paragraphs.extractCleanParagraphs(doc) 9 | assert(pars.length == 2) 10 | assert(pars == Seq("test1", "test2\ntest3")) 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /lib/src/test/scala/com/worksap/nlp/uzushio/lib/utils/RowBufferSpec.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.utils 2 | 3 | import org.scalatest.freespec.AnyFreeSpec 4 | 5 | class RowBufferSpec extends AnyFreeSpec { 6 | "RowBuffer" - { 7 | "single item can be deleted" in { 8 | val buf = new RowBuffer[Int]() 9 | buf.addToBuffer(5) 10 | assert(buf.size() == 1) 11 | val item = buf.removeElementAt(0) 12 | assert(item == 5) 13 | assert(buf.size() == 0) 14 | 
assertThrows[IllegalArgumentException](buf.removeElementAt(0)) 15 | } 16 | 17 | "removing item with invalid index throws an exception" in { 18 | val buf = new RowBuffer[Int]() 19 | buf.addToBuffer(5) 20 | assert(buf.size() == 1) 21 | assertThrows[IllegalArgumentException](buf.removeElementAt(1)) 22 | } 23 | 24 | "works when removing last item of two" in { 25 | val buf = new RowBuffer[Int]() 26 | buf.addToBuffer(2) 27 | buf.addToBuffer(3) 28 | assert(buf.size() == 2) 29 | assert(buf.removeElementAt(1) == 3) 30 | assert(buf.size() == 1) 31 | assert(buf.get(0) == 2) 32 | } 33 | 34 | "works when removing first item of two" in { 35 | val buf = new RowBuffer[Int]() 36 | buf.addToBuffer(2) 37 | buf.addToBuffer(3) 38 | assert(buf.size() == 2) 39 | assert(buf.removeElementAt(0) == 3) 40 | assert(buf.size() == 1) 41 | assert(buf.get(0) == 3) 42 | } 43 | 44 | "works when removing first item of three" in { 45 | val buf = new RowBuffer[Int]() 46 | buf.addToBuffer(2) 47 | buf.addToBuffer(3) 48 | buf.addToBuffer(4) 49 | assert(buf.size() == 3) 50 | assert(buf.removeElementAt(0) == 4) 51 | assert(buf.size() == 2) 52 | assert(buf.get(0) == 4) 53 | assert(buf.get(1) == 3) 54 | } 55 | 56 | "works when removing second item of three" in { 57 | val buf = new RowBuffer[Int]() 58 | buf.addToBuffer(2) 59 | buf.addToBuffer(3) 60 | buf.addToBuffer(4) 61 | assert(buf.size() == 3) 62 | assert(buf.removeElementAt(1) == 4) 63 | assert(buf.size() == 2) 64 | assert(buf.get(0) == 2) 65 | assert(buf.get(1) == 4) 66 | } 67 | 68 | "works when removing third item of three" in { 69 | val buf = new RowBuffer[Int]() 70 | buf.addToBuffer(2) 71 | buf.addToBuffer(3) 72 | buf.addToBuffer(4) 73 | assert(buf.size() == 3) 74 | assert(buf.removeElementAt(2) == 4) 75 | assert(buf.size() == 2) 76 | assert(buf.get(0) == 2) 77 | assert(buf.get(1) == 3) 78 | } 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /lib/src/test/scala/com/worksap/nlp/uzushio/lib/utils/SentenceIteratorSpec.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.utils 2 | 3 | import org.scalatest.freespec.AnyFreeSpec 4 | 5 | class SentenceIteratorSpec extends AnyFreeSpec { 6 | "SentenceIterator" - { 7 | "indexOf" - { 8 | "returns correct value for a simple case" in { 9 | val seq = "this。 is a test" 10 | assert(4 == SentenceIterator.indexOfSeparator(seq, 0, seq.length)) 11 | } 12 | 13 | "works with empty string" in { 14 | val seq = "" 15 | assert(-1 == SentenceIterator.indexOfSeparator(seq, 0, seq.length)) 16 | } 17 | 18 | "works with last index of a string" in { 19 | val seq = "test" 20 | assert(-1 == SentenceIterator.indexOfSeparator(seq, 4, seq.length)) 21 | } 22 | 23 | "works with not last index of a string not containing required characters" in { 24 | val seq = "test" 25 | assert(-1 == SentenceIterator.indexOfSeparator(seq, 2, seq.length)) 26 | } 27 | } 28 | 29 | "produces correct sequence of sentences" in { 30 | val iter = new SentenceIterator("this。 is a test", 1024) 31 | assert(Seq("this。", " is a test") == iter.toSeq) 32 | } 33 | 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /lib/src/test/scala/com/worksap/nlp/uzushio/lib/utils/TrieSpec.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.utils 2 | 3 | import org.scalatest.freespec.AnyFreeSpec 4 | 5 | class TrieSpec extends AnyFreeSpec { 6 | "TrieNode" - { 7 | 
"can be created" in { 8 | val trie = TrieNode.make(Seq("test", "tfst", "fist")) 9 | assert(trie != null) 10 | } 11 | 12 | "can find strings" in { 13 | val trie = TrieNode.make(Seq("test", "tfst", "fist")) 14 | assert(SearchResult(4, 0) == trie.findLongest("testing", 0)) 15 | assert(SearchResult(4, 0) == trie.findLongest("testtfst", 0)) 16 | assert(SearchResult(8, 1) == trie.findLongest("testtfst", 4)) 17 | assert(SearchResult(4, 1) == trie.findLongest("tfsttest", 0)) 18 | assert(SearchResult.empty() == trie.findLongest("tfest", 0)) 19 | } 20 | 21 | "finds a longest substring" in { 22 | val trie = TrieNode.make(Seq("ab", "abc", "abcd")) 23 | assert(SearchResult(2, 0) == trie.findLongest("abed", 0)) 24 | assert(SearchResult(2, 0) == trie.findLongest("abecd", 0)) 25 | assert(SearchResult(4, 2) == trie.findLongest("abcdf", 0)) 26 | assert(SearchResult(3, 1) == trie.findLongest("abcfd", 0)) 27 | } 28 | } 29 | 30 | "SearchResult" - { 31 | "has correct fields for (0, 0)" in { 32 | val sr = SearchResult(0, 0) 33 | assert(0 == sr.end) 34 | assert(0 == sr.index) 35 | } 36 | 37 | "has correct fields for (1, 1)" in { 38 | val sr = SearchResult(1, 1) 39 | assert(1 == sr.end) 40 | assert(1 == sr.index) 41 | } 42 | 43 | "has correct fields for (100, 5000)" in { 44 | val sr = SearchResult(100, 5000) 45 | assert(100 == sr.end) 46 | assert(5000 == sr.index) 47 | } 48 | 49 | "has correct fields for (-1, -1)" in { 50 | val sr = SearchResult(-1, -1) 51 | assert(-1 == sr.end) 52 | assert(-1 == sr.index) 53 | } 54 | 55 | "has correct fields for (-100, -100)" in { 56 | val sr = SearchResult(-5, -100) 57 | assert(-5 == sr.end) 58 | assert(-100 == sr.index) 59 | } 60 | 61 | "has correct toString" in { 62 | assert(SearchResult(0, 0).toString == "SearchResult(0, 0)") 63 | assert(SearchResult(1, 1).toString == "SearchResult(1, 1)") 64 | } 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /lib/src/test/scala/com/worksap/nlp/uzushio/lib/warc/WarcEntryParserSpec.scala: -------------------------------------------------------------------------------- 1 | package com.worksap.nlp.uzushio.lib.warc 2 | 3 | import com.worksap.nlp.uzushio.lib.utils.ClasspathAccess 4 | import org.scalatest.freespec.AnyFreeSpec 5 | 6 | import java.util.UUID 7 | 8 | class WarcEntryParserSpec extends AnyFreeSpec with ClasspathAccess { 9 | "WarcEntryParser" - { 10 | val parser = new WarcEntryParser() 11 | "parses http header" in { 12 | val data = classpathBytes("lang/shift_jis.txt") 13 | val parsed = parser.parseHttpHeader(data) 14 | assert(parsed.isDefined) 15 | val Some((message, offset)) = parsed 16 | assert(offset == 197) 17 | assertResult("text/html")(message.getHeader("Content-Type").getValue) 18 | val date = WarcEntryParser.resolveEarliestDate("", message) 19 | assert("2012-12-29T16:50:56" == date) 20 | } 21 | 22 | "parses UUID" - { 23 | "" in { 24 | val uuid = WarcEntryParser.parseWarcUuid( 25 | "" 26 | ) 27 | assert(uuid == "f1a9564a-ae00-40ef-838e-a4486a83fd1d") 28 | } 29 | } 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.9.4 2 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "2.1.1") 2 | addSbtPlugin("org.scalameta" % 
"sbt-scalafmt" % "2.5.2") 3 | addSbtPlugin("pl.project13.scala" % "sbt-jmh" % "0.4.6") -------------------------------------------------------------------------------- /scripts/cal_overlap_ratio/README.md: -------------------------------------------------------------------------------- 1 | # Overlap ratio calculation 2 | 3 | ## Goal of the script 4 | To calculate the overlap ratio between multiple Common Crawl dumps. 5 | We define 3 types of overlap ratio: 6 | - `len(dump_1 & dump_2) / len(dump_1)` 7 | - `len(dump_1 & dump_2) / len(dump_2)` 8 | - `len(dump_1 & dump_2) / len(dump_1 | dump_2)` 9 | 10 | ## How to run the script 11 | If you add some new dumps and run again, the program will automatically skip the dump pairs that were already calculated. 12 | ``` 13 | python3 cal_overlap.py --dump_direc_path $dump_direc_path --output_path $output_path 14 | ``` 15 | 16 | ## How to visualize the results 17 | Use the script to process the outputed csv into a heat map figure. 18 | ``` 19 | python3 visualize.py --input_path $path_to/overlap.csv --output_path $output_path 20 | ``` -------------------------------------------------------------------------------- /scripts/count_filter_statistics.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from dataclasses import dataclass 3 | from pathlib import Path 4 | from multiprocessing.pool import ThreadPool 5 | import csv 6 | 7 | # Count filter statistics from the directory structure created by Uzushio 8 | # If tqdm is installed, show show a progress bar while processing. 9 | 10 | 11 | @dataclass 12 | class Args(object): 13 | input: list[Path] 14 | output: Path 15 | workers: int 16 | 17 | @staticmethod 18 | def parse() -> "Args": 19 | p = argparse.ArgumentParser() 20 | p.add_argument("--output", type=Path, required=True) 21 | p.add_argument("--workers", default=4, type=int) 22 | p.add_argument("input", type=Path, nargs="+") 23 | return Args(**vars(p.parse_args())) 24 | 25 | 26 | def directory_size(p: Path) -> int: 27 | # print(f"calcluating size of {p}") 28 | result = 0 29 | for p in p.iterdir(): 30 | result += p.stat().st_size 31 | return result 32 | 33 | 34 | def print_progress(data): 35 | try: 36 | from tqdm import tqdm 37 | except ImportError: 38 | return 39 | for v in tqdm(data.values()): 40 | v.wait() 41 | 42 | 43 | class Processor(object): 44 | def __init__(self, args: Args) -> None: 45 | self.args = args 46 | self.executor = ThreadPool(args.workers) 47 | 48 | def run(self): 49 | matrix = {} 50 | 51 | for input_dir in self.args.input: 52 | for child in input_dir.iterdir(): 53 | chname = child.name 54 | if not chname.startswith("segment="): 55 | continue 56 | segment = chname[8:] 57 | res = self.process_segment(segment, child) 58 | matrix.update(res) 59 | 60 | self.executor.close() 61 | 62 | filters = set() 63 | segments = set() 64 | 65 | for segment, filter in matrix.keys(): 66 | filters.add(filter) 67 | segments.add(segment) 68 | 69 | filters = sorted(filters) 70 | segments = sorted(segments) 71 | 72 | print_progress(matrix) 73 | 74 | self.executor.join() 75 | 76 | with self.args.output.open("wt", newline="\n") as of: 77 | wr = csv.writer(of) 78 | 79 | wr.writerow([""] + filters) 80 | 81 | for segment in segments: 82 | row = [segment] 83 | for filter in filters: 84 | v = matrix.get((segment, filter), None) 85 | if v is None: 86 | r = "" 87 | else: 88 | r = str(v.get()) 89 | row.append(r) 90 | wr.writerow(row) 91 | 92 | def process_segment(self, segment: str, segment_dir: Path): 93 | result = 
{} 94 | for child in segment_dir.iterdir(): 95 | chname = child.name 96 | if not chname.startswith("filter="): 97 | continue 98 | 99 | filter = chname[7:] 100 | result[(segment, filter)] = self.executor.apply_async( 101 | directory_size, [child] 102 | ) 103 | 104 | return result 105 | 106 | 107 | if __name__ == "__main__": 108 | args = Args.parse() 109 | p = Processor(args) 110 | p.run() 111 | -------------------------------------------------------------------------------- /scripts/count_tokens.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | import gzip 4 | from tqdm import tqdm 5 | from transformers import AutoTokenizer 6 | import os 7 | 8 | tokenizer = AutoTokenizer.from_pretrained("llm-jp/llm-jp-13b-v1.0") 9 | 10 | 11 | def count_tokens(input_file): 12 | num_tokens = 0 13 | compressed_size = os.path.getsize(input_file) 14 | 15 | with gzip.open(input_file, "rb") as f: 16 | for line in tqdm(f): 17 | example = json.loads(line) 18 | text = example["text"] 19 | tokens = tokenizer.encode(text) 20 | num_tokens += len(tokens) 21 | 22 | tokens_per_byte = num_tokens / compressed_size 23 | return num_tokens, tokens_per_byte 24 | 25 | 26 | if __name__ == "__main__": 27 | input_file = sys.argv[1] 28 | num_tokens, tokens_per_byte = count_tokens(input_file) 29 | print(f"Total number of tokens: {num_tokens}") 30 | print(f"Tokens per byte: {tokens_per_byte:.3f}") 31 | -------------------------------------------------------------------------------- /scripts/pipeline_01.conf: -------------------------------------------------------------------------------- 1 | filters: [ 2 | {"class": "DocLength", "low": 50}, 3 | {"class": "CompressionRate", "low": 0.15, "high": 1.0}, 4 | {"class": "CompressionRate", "low": 0.25, "high": 0.9}, 5 | {"class": "CompressionRate", "low": 0.35, "high": 0.8}, 6 | {"class": "CompressionRate", "low": 0.45, "high": 0.75}, 7 | {"class": "CompressionRate", "low": 0.55, "high": 0.7}, 8 | {"class": "HiraganaRatio", "low": 0.03, "high": 2.0}, 9 | {"class": "HiraganaRatio", "low": 0.05, "high": 2.0}, 10 | {"class": "HiraganaRatio", "low": 0.07, "high": 2.0}, 11 | {"class": "HiraganaRatio", "low": 0.1, "high": 2.0}, 12 | {"class": "HiraganaRatio", "low": 0.13, "high": 2.0}, 13 | {"class": "HiraganaRatio", "low": 0.15, "high": 2.0}, 14 | {"class": "LinkCharRatio", "low": 0, "high": 0.8}, 15 | {"class": "LinkCharRatio", "low": 0, "high": 0.7}, 16 | {"class": "LinkCharRatio", "low": 0, "high": 0.6}, 17 | {"class": "LinkCharRatio", "low": 0, "high": 0.5}, 18 | {"class": "LinkCharRatio", "low": 0, "high": 0.4}, 19 | {"class": "DeduplicateDocumentsPercentile", "expected": 2.5, "percentile": 0.1}, 20 | {"class": "DeduplicateDocumentsPercentile", "expected": 1.5, "percentile": 0.1}, 21 | {"class": "MergeListTag"}, 22 | {"class": "MarkdownizeHeading"}, 23 | {"class": "NoContentDOM"}, 24 | {"class": "LargeFreqParagraphs", "count": 3, "freq": 1000}, 25 | {"class": "LargeFreqParagraphs", "count": 3, "freq": 100}, 26 | {"class": "WordTypes", "threshold": 9, "kind": "uniq", "list": "hojichar/adult_keywords_ja.txt"}, 27 | {"class": "WordTypes", "threshold": 6, "kind": "uniq", "list": "hojichar/adult_keywords_ja.txt"}, 28 | {"class": "WordTypes", "threshold": 5, "kind": "uniq", "list": "hojichar/adult_keywords_ja.txt"}, 29 | {"class": "WordTypes", "threshold": 4, "kind": "uniq", "list": "hojichar/adult_keywords_ja.txt"}, 30 | {"class": "WordTypes", "threshold": 9, "kind": "uniq", "list": "hojichar/discriminations_keywords_ja.txt"}, 31 
| {"class": "WordTypes", "threshold": 6, "kind": "uniq", "list": "hojichar/discriminations_keywords_ja.txt"}, 32 | {"class": "WordTypes", "threshold": 4, "kind": "uniq", "list": "hojichar/discriminations_keywords_ja.txt"}, 33 | {"class": "DocLength", "low": 100}, 34 | {"class": "DocLength", "low": 150}, 35 | {"class": "DocLength", "low": 200}, 36 | ] -------------------------------------------------------------------------------- /scripts/pipeline_02.conf: -------------------------------------------------------------------------------- 1 | filters: [ 2 | {"class": "AdjacentDuplicateParagraphs"}, 3 | {"class": "DocLength", "low": 50}, 4 | {"class": "DeduplicateDocumentsPercentile", "expected": 2.5, "percentile": 0.1}, 5 | {"class": "DeduplicateDocumentsPercentile", "expected": 1.5, "percentile": 0.1}, 6 | {"class": "HiraganaRatio", "low": 0.03, "high": 2.0}, 7 | {"class": "HiraganaRatio", "low": 0.05, "high": 2.0}, 8 | {"class": "HiraganaRatio", "low": 0.07, "high": 2.0}, 9 | {"class": "HiraganaRatio", "low": 0.1, "high": 2.0}, 10 | {"class": "HiraganaRatio", "low": 0.13, "high": 2.0}, 11 | {"class": "HiraganaRatio", "low": 0.15, "high": 2.0}, 12 | {"class": "LinkCharRatio", "low": 0, "high": 0.8}, 13 | {"class": "LinkCharRatio", "low": 0, "high": 0.7}, 14 | {"class": "LinkCharRatio", "low": 0, "high": 0.6}, 15 | {"class": "LinkCharRatio", "low": 0, "high": 0.5}, 16 | {"class": "LinkCharRatio", "low": 0, "high": 0.4}, 17 | {"class": "MergeListTag"}, 18 | {"class": "MarkdownizeHeading"}, 19 | {"class": "NoContentDOM"}, 20 | {"class": "LargeFreqParagraphs", "count": 3, "freq": 1000}, 21 | {"class": "LargeFreqParagraphs", "count": 3, "freq": 100}, 22 | {"class": "KenLMParagraphPerplexity", "sudachi": ${sudachi}, "kenlm": ${kenlm}, outliers: 0.1, "count": 3, "threshold": 1e6}, 23 | {"class": "KenLMParagraphPerplexity", "sudachi": ${sudachi}, "kenlm": ${kenlm}, outliers: 0.1, "count": 2, "threshold": 5e6}, 24 | {"class": "CompressionRate", "low": 0.15, "high": 1.0}, 25 | {"class": "CompressionRate", "low": 0.25, "high": 0.9}, 26 | {"class": "CompressionRate", "low": 0.35, "high": 0.8}, 27 | {"class": "CompressionRate", "low": 0.45, "high": 0.75}, 28 | {"class": "CompressionRate", "low": 0.50, "high": 0.75}, 29 | {"class": "KenLMDocAvgPerplexity", "sudachi": ${sudachi}, "kenlm": ${kenlm}, outliers: 0.1, high: 1e6, low: 2}, 30 | {"class": "KenLMDocAvgPerplexity", "sudachi": ${sudachi}, "kenlm": ${kenlm}, outliers: 0.1, high: 5e5, low: 5}, 31 | {"class": "WordTypes", "threshold": 9, "kind": "uniq", "list": "hojichar/adult_keywords_ja.txt"}, 32 | {"class": "WordTypes", "threshold": 6, "kind": "uniq", "list": "hojichar/adult_keywords_ja.txt"}, 33 | # {"class": "WordTypes", "threshold": 5, "kind": "uniq", "list": "hojichar/adult_keywords_ja.txt"}, 34 | # {"class": "WordTypes", "threshold": 4, "kind": "uniq", "list": "hojichar/adult_keywords_ja.txt"}, 35 | {"class": "WordTypes", "threshold": 9, "kind": "uniq", "list": "hojichar/discriminations_keywords_ja.txt"}, 36 | # {"class": "WordTypes", "threshold": 6, "kind": "uniq", "list": "hojichar/discriminations_keywords_ja.txt"}, 37 | # {"class": "WordTypes", "threshold": 4, "kind": "uniq", "list": "hojichar/discriminations_keywords_ja.txt"}, 38 | {"class": "DocLength", "low": 100}, 39 | {"class": "DocLength", "low": 150}, 40 | {"class": "DocLength", "low": 200}, 41 | ] -------------------------------------------------------------------------------- /scripts/pipeline_03a.conf: -------------------------------------------------------------------------------- 
1 | filters: [ 2 | # {"class": "AdjacentDuplicateParagraphs"}, 3 | {"class": "DocLength", "low": 50}, 4 | {"class": "DeduplicateDocumentsPercentile", "expected": 5, "percentile": 0.05}, 5 | {"class": "HiraganaRatio", "low": 0.1, "high": 2.0}, 6 | {"class": "HiraganaRatio", "low": 0.15, "high": 2.0}, 7 | {"class": "LinkCharRatio", "low": 0, "high": 0.8}, 8 | {"class": "LinkCharRatio", "low": 0, "high": 0.4}, 9 | {"class": "MergeListTag"}, 10 | {"class": "MarkdownizeHeading"}, 11 | {"class": "NoContentDOM"}, 12 | {"class": "LargeFreqParagraphs", "count": 3, "freq": 1000}, 13 | {"class": "LargeFreqParagraphs", "count": 3, "freq": 100}, 14 | {"class": "KenLMParagraphPerplexity", "sudachi": ${sudachi}, "kenlm": ${kenlm}, outliers: 0.1, "count": 3, "threshold": 1e6}, 15 | {"class": "KenLMParagraphPerplexity", "sudachi": ${sudachi}, "kenlm": ${kenlm}, outliers: 0.1, "count": 2, "threshold": 5e6}, 16 | {"class": "CompressionRate", "low": 0.25, "high": 5.0}, 17 | {"class": "CompressionRate", "low": 0.40, "high": 0.75}, 18 | {"class": "CompressionRate", "low": 0.50, "high": 0.75}, 19 | {"class": "KenLMDocAvgPerplexity", "sudachi": ${sudachi}, "kenlm": ${kenlm}, outliers: 0.1, high: 1e6, low: 5}, 20 | {"class": "KenLMDocAvgPerplexity", "sudachi": ${sudachi}, "kenlm": ${kenlm}, outliers: 0.1, high: 5e5, low: 7}, 21 | {"class": "WordTypes", "threshold": 9, "kind": "uniq", "list": "hojichar/adult_keywords_ja.txt"}, 22 | {"class": "WordTypes", "threshold": 6, "kind": "uniq", "list": "hojichar/adult_keywords_ja.txt"}, 23 | {"class": "WordTypes", "threshold": 9, "kind": "uniq", "list": "hojichar/discriminations_keywords_ja.txt"}, 24 | {"class": "DocLength", "low": 200}, 25 | {"class": "DeduplicateDocumentsPercentile", "expected": 2.5, "percentile": 0.05}, 26 | {"class": "DeduplicateDocumentsPercentile", "expected": 1.5, "percentile": 0.1}, 27 | ] -------------------------------------------------------------------------------- /scripts/pipeline_test_perplexity.conf: -------------------------------------------------------------------------------- 1 | filters: [ 2 | {"class": "DocLength", "low": 50}, 3 | {"class": "KenLMDocAvgPerplexity", sudachi: ${sudachi}, kenlm: ${kenlm}, outliers: 0.02, low: 0, high: 1e7 }, 4 | {"class": "KenLMDocAvgPerplexity", sudachi: ${sudachi}, kenlm: ${kenlm}, outliers: 0.02, low: 0, high: 1e6 }, 5 | {"class": "KenLMDocAvgPerplexity", sudachi: ${sudachi}, kenlm: ${kenlm}, outliers: 0.02, low: 0, high: 1e5 }, 6 | {"class": "KenLMParagraphPerplexity", sudachi: ${sudachi}, kenlm: ${kenlm}, outliers: 0.02, count: 3, threshold: 1e6 }, 7 | ] -------------------------------------------------------------------------------- /scripts/submit_all_compute_stats.sh: -------------------------------------------------------------------------------- 1 | submit_post2017() { 2 | qsub -g gcf51199 -l rt_F=10 -l h_rt=4:00:00 submit_dedup_stage1.sh \ 3 | "/groups/gcf51199/cc/extracted/segment\=$1" \ 4 | /groups/gcf51199/cc/stats_raw_v2/segment=$1 \ 5 | 500 4000 6 | } 7 | 8 | # submit_post2017 CC-MAIN-2017-04 9 | # submit_post2017 CC-MAIN-2017-09 10 | # submit_post2017 CC-MAIN-2017-13 11 | # submit_post2017 CC-MAIN-2017-17 12 | # submit_post2017 CC-MAIN-2017-22 13 | # submit_post2017 CC-MAIN-2017-26 14 | # submit_post2017 CC-MAIN-2017-30 15 | # submit_post2017 CC-MAIN-2017-34 16 | # submit_post2017 CC-MAIN-2017-39 17 | # submit_post2017 CC-MAIN-2017-43 18 | # submit_post2017 CC-MAIN-2017-47 19 | # submit_post2017 CC-MAIN-2017-51 20 | # submit_post2017 CC-MAIN-2018-05 21 | # submit_post2017 CC-MAIN-2018-09 
22 | # submit_post2017 CC-MAIN-2018-13 23 | # submit_post2017 CC-MAIN-2018-17 24 | # submit_post2017 CC-MAIN-2018-22 25 | # submit_post2017 CC-MAIN-2018-26 26 | # submit_post2017 CC-MAIN-2018-30 27 | # submit_post2017 CC-MAIN-2018-34 28 | # submit_post2017 CC-MAIN-2018-39 29 | # submit_post2017 CC-MAIN-2018-43 30 | # submit_post2017 CC-MAIN-2018-47 31 | # submit_post2017 CC-MAIN-2018-51 32 | # submit_post2017 CC-MAIN-2019-04 33 | # submit_post2017 CC-MAIN-2019-09 34 | # submit_post2017 CC-MAIN-2019-13 35 | # submit_post2017 CC-MAIN-2019-18 36 | # submit_post2017 CC-MAIN-2019-22 37 | # submit_post2017 CC-MAIN-2019-26 38 | # submit_post2017 CC-MAIN-2019-30 39 | # submit_post2017 CC-MAIN-2019-35 40 | # submit_post2017 CC-MAIN-2019-39 41 | # submit_post2017 CC-MAIN-2019-43 42 | # submit_post2017 CC-MAIN-2019-47 43 | # submit_post2017 CC-MAIN-2019-51 44 | # submit_post2017 CC-MAIN-2020-05 45 | # submit_post2017 CC-MAIN-2020-10 46 | # submit_post2017 CC-MAIN-2020-16 47 | # submit_post2017 CC-MAIN-2020-24 48 | # submit_post2017 CC-MAIN-2020-29 49 | # submit_post2017 CC-MAIN-2020-34 50 | # submit_post2017 CC-MAIN-2020-40 51 | # submit_post2017 CC-MAIN-2020-45 52 | # submit_post2017 CC-MAIN-2020-50 53 | # submit_post2017 CC-MAIN-2021-04 54 | # submit_post2017 CC-MAIN-2021-10 55 | # submit_post2017 CC-MAIN-2021-17 56 | # submit_post2017 CC-MAIN-2021-21 57 | # submit_post2017 CC-MAIN-2021-25 58 | # submit_post2017 CC-MAIN-2021-31 59 | # submit_post2017 CC-MAIN-2021-39 60 | # submit_post2017 CC-MAIN-2021-43 61 | # submit_post2017 CC-MAIN-2021-49 62 | # submit_post2017 CC-MAIN-2022-05 63 | # submit_post2017 CC-MAIN-2022-21 64 | # submit_post2017 CC-MAIN-2022-27 65 | # submit_post2017 CC-MAIN-2022-33 66 | # submit_post2017 CC-MAIN-2022-40 67 | # submit_post2017 CC-MAIN-2022-49 68 | # submit_post2017 CC-MAIN-2023-06 69 | # submit_post2017 CC-MAIN-2023-14 70 | # submit_post2017 CC-MAIN-2023-23 71 | # submit_post2017 CC-MAIN-2023-40 72 | submit_post2017 CC-MAIN-2023-50 -------------------------------------------------------------------------------- /scripts/submit_all_compute_stats_old.sh: -------------------------------------------------------------------------------- 1 | submit_pre2016() { 2 | qsub -g gcf51199 -l rt_F=10 -l h_rt=4:00:00 submit_dedup_stage1.sh \ 3 | "/groups/gcf51199/cc2/extracted/$1" \ 4 | "/groups/gcf51199/cc/stats_raw_v2/segment=$1" \ 5 | 500 4000 6 | } 7 | 8 | submit_pre2016 merged-2013 9 | submit_pre2016 merged-2014 10 | submit_pre2016 merged-2015 11 | submit_pre2016 merged-2016 -------------------------------------------------------------------------------- /scripts/submit_all_filter.sh: -------------------------------------------------------------------------------- 1 | #!/bin/env bash 2 | 3 | submit() { 4 | qsub -g gcf51199 -l rt_F=10 -l h_rt=1:00:00 submit_filter_debug_2.sh \ 5 | "/groups/gcf51199/cc/extracted/segment\=$1" \ 6 | /groups/gcf51199/cc/stats_merged_v2/for_filter/all \ 7 | "/groups/gcf51199/cc/filtered_v3/segment=$1" 8 | } 9 | 10 | submit_pre2016() { 11 | qsub -g gcf51199 -l rt_F=10 -l h_rt=1:00:00 submit_filter_debug_2.sh \ 12 | "/groups/gcf51199/cc2/extracted/$1" \ 13 | /groups/gcf51199/cc/stats_merged_v2/for_filter/all \ 14 | "/groups/gcf51199/cc/filtered_v3/segment=$1" 15 | } 16 | 17 | # submit_pre2016 merged-2013 18 | # submit_pre2016 merged-2014 19 | # submit_pre2016 merged-2015 20 | # submit_pre2016 merged-2016 21 | 22 | # submit CC-MAIN-2017-04 23 | # submit CC-MAIN-2017-09 24 | # submit CC-MAIN-2017-13 25 | # submit CC-MAIN-2017-17 26 | # submit 
CC-MAIN-2017-22 27 | # submit CC-MAIN-2017-26 28 | # submit CC-MAIN-2017-30 29 | # submit CC-MAIN-2017-34 30 | # submit CC-MAIN-2017-39 31 | submit CC-MAIN-2017-43 32 | # submit CC-MAIN-2017-47 33 | # submit CC-MAIN-2017-51 34 | # submit CC-MAIN-2018-05 35 | # submit CC-MAIN-2018-09 36 | # submit CC-MAIN-2018-13 37 | # submit CC-MAIN-2018-17 38 | # submit CC-MAIN-2018-22 39 | # submit CC-MAIN-2018-26 40 | # submit CC-MAIN-2018-30 41 | # submit CC-MAIN-2018-34 42 | # submit CC-MAIN-2018-39 43 | # submit CC-MAIN-2018-43 44 | # submit CC-MAIN-2018-47 45 | # submit CC-MAIN-2018-51 46 | # submit CC-MAIN-2019-04 47 | # submit CC-MAIN-2019-09 48 | # submit CC-MAIN-2019-13 49 | # submit CC-MAIN-2019-18 50 | # submit CC-MAIN-2019-22 51 | # submit CC-MAIN-2019-26 52 | # submit CC-MAIN-2019-30 53 | # submit CC-MAIN-2019-35 54 | # submit CC-MAIN-2019-39 55 | # submit CC-MAIN-2019-43 56 | # submit CC-MAIN-2019-47 57 | # submit CC-MAIN-2019-51 58 | # submit CC-MAIN-2020-05 59 | # submit CC-MAIN-2020-10 60 | # submit CC-MAIN-2020-16 61 | # submit CC-MAIN-2020-24 62 | # submit CC-MAIN-2020-29 63 | # submit CC-MAIN-2020-34 64 | # submit CC-MAIN-2020-40 65 | # submit CC-MAIN-2020-45 66 | # submit CC-MAIN-2020-50 67 | # submit CC-MAIN-2021-04 68 | # submit CC-MAIN-2021-10 69 | # submit CC-MAIN-2021-17 70 | # submit CC-MAIN-2021-21 71 | # submit CC-MAIN-2021-25 72 | # submit CC-MAIN-2021-31 73 | # submit CC-MAIN-2021-39 74 | # submit CC-MAIN-2021-43 75 | # submit CC-MAIN-2021-49 76 | # submit CC-MAIN-2022-05 77 | # submit CC-MAIN-2022-21 78 | # submit CC-MAIN-2022-27 79 | # submit CC-MAIN-2022-33 80 | # submit CC-MAIN-2022-40 81 | # submit CC-MAIN-2022-49 82 | # submit CC-MAIN-2023-06 83 | # submit CC-MAIN-2023-14 84 | # submit CC-MAIN-2023-23 85 | # submit CC-MAIN-2023-40 -------------------------------------------------------------------------------- /scripts/submit_all_merges_stage1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | COMMAND_START="qsub -g gcf51199 -l rt_F=10 -l h_rt=2:00:00 submit_merge_stats.sh" 4 | MERGE_ROOT=/groups/gcf51199/cc/stats_merged_v2/per_year 5 | MERGE_BASIC_ROOT=/groups/gcf51199/cc/stats_raw_v2 6 | 7 | eval $COMMAND_START $MERGE_ROOT/2016 $MERGE_BASIC_ROOT/segment=merged-* 8 | eval $COMMAND_START $MERGE_ROOT/2017 $MERGE_BASIC_ROOT/segment=CC-MAIN-2017-* 9 | eval $COMMAND_START $MERGE_ROOT/2018 $MERGE_BASIC_ROOT/segment=CC-MAIN-2018-* 10 | eval $COMMAND_START $MERGE_ROOT/2019 $MERGE_BASIC_ROOT/segment=CC-MAIN-2019-* 11 | eval $COMMAND_START $MERGE_ROOT/2020 $MERGE_BASIC_ROOT/segment=CC-MAIN-2020-* 12 | eval $COMMAND_START $MERGE_ROOT/2021 $MERGE_BASIC_ROOT/segment=CC-MAIN-2021-* 13 | eval $COMMAND_START $MERGE_ROOT/2022 $MERGE_BASIC_ROOT/segment=CC-MAIN-2022-* 14 | eval $COMMAND_START $MERGE_ROOT/2023 $MERGE_BASIC_ROOT/segment=CC-MAIN-2023-* 15 | -------------------------------------------------------------------------------- /scripts/submit_all_merges_stage2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | COMMAND_START="qsub -g gcf51199 -l rt_F=10 -l h_rt=2:00:00 submit_merge_stats_final.sh" 4 | MERGE_ROOT=/groups/gcf51199/cc/stats_merged_v2/per_year 5 | MERGE_FINAL_ROOT=/groups/gcf51199/cc/stats_merged_v2/for_filter 6 | 7 | # eval $COMMAND_START $MERGE_FINAL_ROOT/2016 $MERGE_ROOT/2016 $MERGE_ROOT/2017 8 | # eval $COMMAND_START $MERGE_FINAL_ROOT/2017 $MERGE_ROOT/2016 $MERGE_ROOT/2017 $MERGE_ROOT/2018 9 | # eval $COMMAND_START $MERGE_FINAL_ROOT/2018 
$MERGE_ROOT/2017 $MERGE_ROOT/2018 $MERGE_ROOT/2019 10 | # eval $COMMAND_START $MERGE_FINAL_ROOT/2019 $MERGE_ROOT/2018 $MERGE_ROOT/2019 $MERGE_ROOT/2020 11 | # eval $COMMAND_START $MERGE_FINAL_ROOT/2020 $MERGE_ROOT/2019 $MERGE_ROOT/2020 $MERGE_ROOT/2021 12 | # eval $COMMAND_START $MERGE_FINAL_ROOT/2021 $MERGE_ROOT/2020 $MERGE_ROOT/2021 $MERGE_ROOT/2022 13 | # eval $COMMAND_START $MERGE_FINAL_ROOT/2022 $MERGE_ROOT/2021 $MERGE_ROOT/2022 $MERGE_ROOT/2023 14 | eval $COMMAND_START $MERGE_FINAL_ROOT/all $MERGE_ROOT/* 15 | -------------------------------------------------------------------------------- /scripts/submit_calc_overlap.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #$ -j y 4 | #$ -cwd 5 | 6 | source $HOME/work/uzushio/.venv/bin/activate 7 | 8 | python3 $HOME/work/uzushio/scripts/cal_overlap_ratio/cal_overlap.py \ 9 | --dump_direc_path=/groups/gcf51199/cc/extracted \ 10 | --output_path=$HOME/work/overlap-extracted.csv 11 | -------------------------------------------------------------------------------- /scripts/submit_dedup_stage1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #$ -j y 4 | #$ -cwd 5 | #$ -l USE_SSH=1 6 | #$ -l USE_EXTRA_NETWORK=1 7 | 8 | du -hs "$1" > /dev/null & 9 | 10 | # SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 11 | UZUSHIO_ROOT=$HOME/work/uzushio 12 | SCRIPT_DIR=$UZUSHIO_ROOT/scripts 13 | export SPARK_HOME=${SPARK:-$HOME/soft/spark-3.4.1-bin-hadoop3} 14 | UZUSHIO_JAR=$(readlink -f "$SCRIPT_DIR/../core/target/scala-2.12/uzushio-assembly-0.1.0-SNAPSHOT.jar") 15 | 16 | export SPARK_CONF_DIR=$UZUSHIO_ROOT/spark-config/abci-f 17 | "$SPARK_HOME/sbin/start-master.sh" 18 | export SPARK_WORKERS=$SGE_JOB_HOSTLIST 19 | export SPARK_SSH_OPTS="-p 2222" 20 | export SPARK_LOCAL_DIRS=$SGE_LOCALDIR 21 | SPARK_MASTER="spark://$(hostname):7077" 22 | "$SPARK_HOME/sbin/workers.sh" "SPARK_CONF_DIR=$UZUSHIO_ROOT/spark-config/abci-f" "$SPARK_HOME/sbin/start-worker.sh" $SPARK_MASTER 23 | 24 | # it is possible to monitor task progress with Spark UI accessible by ssh port forwarding 25 | echo "$(date -Iseconds) $JOB_ID ssh abci -L8080:$(hostname):8080" >> /scratch/$USER/spark-ui-monitoring 26 | 27 | mkdir -p /scratch/$USER/spark-exlog 28 | 29 | INPUT=$1 30 | OUTPUT=$2 31 | CACHE=/dev/null 32 | NUM_PARTITIONS=${3:-50} 33 | NUM_PARTITIONS_PROPAGATION=${4:-$(($NUM_PARTITIONS * 4))} 34 | 35 | "$SPARK_HOME/bin/spark-submit" \ 36 | --class com.worksap.nlp.uzushio.main.DeduplicateParagraphs \ 37 | --master $SPARK_MASTER \ 38 | --conf spark.driver.log.dfsDir=/scratch/$USER/spark-exlog \ 39 | --conf spark.eventLog.dir=/scratch/$USER/spark-exlog \ 40 | --conf spark.local.dir=$SPARK_LOCAL_DIRS \ 41 | --conf spark.sql.shuffle.partitions=${NUM_PARTITIONS_PROPAGATION} \ 42 | --conf spark.sql.parquet.columnarReaderBatchSize=512 \ 43 | local://$UZUSHIO_JAR \ 44 | --input="$INPUT" \ 45 | --output="$OUTPUT" \ 46 | --execution=reprHashes,stats,saveStats \ 47 | --propagate-partitions=$NUM_PARTITIONS_PROPAGATION \ 48 | --partitions=$NUM_PARTITIONS --intermediate 49 | 50 | wait -------------------------------------------------------------------------------- /scripts/submit_dedup_stats.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #$ -j y 4 | #$ -cwd 5 | #$ -l USE_SSH=1 6 | 7 | INPUT=$1 8 | OUTPUT=$2 9 | CACHE=$3 10 | NUM_PARTITIONS=${4:-50} 11 | 
NUM_PARTITIONS_PROPAGATION=${5:-$(($NUM_PARTITIONS * 4))} 12 | 13 | du -hs "$INPUT" > /dev/null & 14 | du -hs "$CACHE" > /dev/null & 15 | 16 | # SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 17 | UZUSHIO_ROOT=$HOME/work/uzushio 18 | SCRIPT_DIR=$UZUSHIO_ROOT/scripts 19 | export SPARK_HOME=${SPARK:-$HOME/soft/spark-3.4.1-bin-hadoop3} 20 | UZUSHIO_JAR=$(readlink -f "$SCRIPT_DIR/../core/target/scala-2.12/uzushio-assembly-0.1.0-SNAPSHOT.jar") 21 | 22 | export SPARK_CONF_DIR=$UZUSHIO_ROOT/spark-config/abci-f 23 | "$SPARK_HOME/sbin/start-master.sh" 24 | export SPARK_WORKERS=$SGE_JOB_HOSTLIST 25 | export SPARK_SSH_OPTS="-p 2222" 26 | export SPARK_LOCAL_DIRS=$SGE_LOCALDIR 27 | SPARK_MASTER="spark://$(hostname):7077" 28 | "$SPARK_HOME/sbin/workers.sh" "SPARK_CONF_DIR=$UZUSHIO_ROOT/spark-config/abci-f" "$SPARK_HOME/sbin/start-worker.sh" $SPARK_MASTER 29 | 30 | # it is possible to monitor task progress with Spark UI accessible by ssh port forwarding 31 | echo "$(date -Iseconds) $JOB_ID DedupFilterStatistics ssh abci -L8080:$(hostname):8080" >> /scratch/$USER/spark-ui-monitoring 32 | 33 | mkdir -p /scratch/$USER/spark-exlog 34 | 35 | "$SPARK_HOME/bin/spark-submit" \ 36 | --class com.worksap.nlp.uzushio.lib.runners.DedupFilterStatistics \ 37 | --master $SPARK_MASTER \ 38 | --conf spark.driver.log.dfsDir=/scratch/$USER/spark-exlog \ 39 | --conf spark.eventLog.dir=/scratch/$USER/spark-exlog \ 40 | --conf spark.local.dir=$SPARK_LOCAL_DIRS \ 41 | --conf spark.sql.shuffle.partitions=${NUM_PARTITIONS_PROPAGATION} \ 42 | local://$UZUSHIO_JAR \ 43 | --input="$INPUT" \ 44 | --stats="$CACHE" \ 45 | --output="$OUTPUT" --partitions=$NUM_PARTITIONS \ 46 | --filter=large-freq-paragraphs 47 | 48 | wait -------------------------------------------------------------------------------- /scripts/submit_filter_debug.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #$ -j y 4 | #$ -cwd 5 | #$ -l USE_SSH=1 6 | #$ -l USE_EXTRA_NETWORK=1 7 | 8 | INPUT=$1 9 | STATS=$2 10 | OUTPUT=$3 11 | 12 | du -hs "$INPUT" > /dev/null & 13 | du -hs "$STATS" > /dev/null & 14 | 15 | # SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 16 | UZUSHIO_ROOT=$HOME/work/uzushio 17 | SCRIPT_DIR=$UZUSHIO_ROOT/scripts 18 | export SPARK_HOME=${SPARK:-$HOME/soft/spark-3.4.1-bin-hadoop3} 19 | UZUSHIO_JAR=$(readlink -f "$SCRIPT_DIR/../core/target/scala-2.12/uzushio-assembly-0.1.0-SNAPSHOT.jar") 20 | 21 | export SPARK_CONF_DIR=$UZUSHIO_ROOT/spark-config/abci-f 22 | "$SPARK_HOME/sbin/start-master.sh" 23 | export SPARK_WORKERS=$SGE_JOB_HOSTLIST 24 | export SPARK_SSH_OPTS="-p 2222" 25 | export SPARK_LOCAL_DIRS=$SGE_LOCALDIR 26 | SPARK_MASTER="spark://$(hostname):7077" 27 | "$SPARK_HOME/sbin/workers.sh" "SPARK_CONF_DIR=$UZUSHIO_ROOT/spark-config/abci-f" "$SPARK_HOME/sbin/start-worker.sh" $SPARK_MASTER 28 | 29 | # it is possible to monitor task progress with Spark UI accessible by ssh port forwarding 30 | echo "$(date -Iseconds) $JOB_ID ssh abci -L8080:$(hostname):8080" >> /scratch/$USER/spark-ui-monitoring 31 | 32 | mkdir -p /scratch/$USER/spark-exlog 33 | 34 | NUM_PARTITIONS=1000 35 | NUM_PARTITIONS_PROPAGATION=4000 36 | 37 | "$SPARK_HOME/bin/spark-submit" \ 38 | --class com.worksap.nlp.uzushio.main.DeduplicateParagraphs \ 39 | --master $SPARK_MASTER \ 40 | --conf spark.driver.log.dfsDir=/scratch/$USER/spark-exlog \ 41 | --conf spark.eventLog.dir=/scratch/$USER/spark-exlog \ 42 | --conf spark.local.dir=$SPARK_LOCAL_DIRS \ 43 | --conf 
spark.sql.shuffle.partitions=$NUM_PARTITIONS_PROPAGATION \ 44 | --conf spark.sql.parquet.columnarReaderBatchSize=256 \ 45 | local://$UZUSHIO_JAR \ 46 | --input=$INPUT \ 47 | --cache=$STATS \ 48 | --output=$OUTPUT \ 49 | --propagate-partitions=$NUM_PARTITIONS_PROPAGATION \ 50 | --filters=$SCRIPT_DIR/pipeline_01.conf \ 51 | --partitions=$NUM_PARTITIONS \ 52 | --execution=filter-debug \ 53 | --format=json --compression=gzip --text-only 54 | 55 | 56 | wait -------------------------------------------------------------------------------- /scripts/submit_filter_debug_2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #$ -j y 4 | #$ -cwd 5 | #$ -l USE_SSH=1 6 | #$ -l USE_EXTRA_NETWORK=1 7 | 8 | INPUT=$1 9 | STATS=$2 10 | OUTPUT=$3 11 | 12 | du -hs "$INPUT" > /dev/null & 13 | du -hs "$STATS" > /dev/null & 14 | 15 | # SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 16 | UZUSHIO_ROOT=$HOME/work/uzushio 17 | SCRIPT_DIR=$UZUSHIO_ROOT/scripts 18 | export SPARK_HOME=${SPARK:-$HOME/soft/spark-3.4.1-bin-hadoop3} 19 | UZUSHIO_JAR=$(readlink -f "$SCRIPT_DIR/../core/target/scala-2.12/uzushio-assembly-0.1.0-SNAPSHOT.jar") 20 | 21 | export SPARK_CONF_DIR=$UZUSHIO_ROOT/spark-config/abci-f 22 | "$SPARK_HOME/sbin/start-master.sh" 23 | export SPARK_WORKERS=$SGE_JOB_HOSTLIST 24 | export SPARK_SSH_OPTS="-p 2222" 25 | export SPARK_LOCAL_DIRS=$SGE_LOCALDIR 26 | SPARK_MASTER="spark://$(hostname):7077" 27 | "$SPARK_HOME/sbin/workers.sh" "SPARK_CONF_DIR=$UZUSHIO_ROOT/spark-config/abci-f" "$SPARK_HOME/sbin/start-worker.sh" $SPARK_MASTER 28 | 29 | # it is possible to monitor task progress with Spark UI accessible by ssh port forwarding 30 | echo "$(date -Iseconds) $JOB_ID ssh abci -L8080:$(hostname):8080" >> /scratch/$USER/spark-ui-monitoring 31 | 32 | mkdir -p /scratch/$USER/spark-exlog 33 | 34 | NUM_PARTITIONS=1000 35 | NUM_PARTITIONS_PROPAGATION=4000 36 | 37 | "$SPARK_HOME/bin/spark-submit" \ 38 | --class com.worksap.nlp.uzushio.main.DeduplicateParagraphs \ 39 | --master $SPARK_MASTER \ 40 | --conf spark.driver.log.dfsDir=/scratch/$USER/spark-exlog \ 41 | --conf spark.eventLog.dir=/scratch/$USER/spark-exlog \ 42 | --conf spark.local.dir=$SPARK_LOCAL_DIRS \ 43 | --conf spark.sql.shuffle.partitions=$NUM_PARTITIONS_PROPAGATION \ 44 | --conf spark.sql.parquet.columnarReaderBatchSize=256 \ 45 | local://$UZUSHIO_JAR \ 46 | --input=$INPUT \ 47 | --cache=$STATS \ 48 | --output=$OUTPUT \ 49 | --propagate-partitions=$NUM_PARTITIONS_PROPAGATION \ 50 | --filters=$SCRIPT_DIR/pipeline_03a.conf \ 51 | --partitions=$NUM_PARTITIONS \ 52 | --execution=filter-debug \ 53 | -Pkenlm=/groups/gcf51199/filter/n-gram_model/kenlm_merge-code_0.05_model.bin \ 54 | -Psudachi=/groups/gcf51199/resources/sudachi-dictionary-20230927/system_core.dic \ 55 | --format=json --compression=gzip --text-only 56 | 57 | 58 | wait -------------------------------------------------------------------------------- /scripts/submit_kenlm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #$ -j y 4 | #$ -cwd 5 | #$ -l USE_SSH=1 6 | #$ -l USE_EXTRA_NETWORK=1 7 | 8 | INPUT=$1 9 | OUTPUT=$2 10 | KENLM=$3 11 | SUDACHI=$4 12 | 13 | du -hs "$INPUT" > /dev/null & 14 | 15 | 16 | # SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 17 | UZUSHIO_ROOT=$HOME/work/uzushio 18 | SCRIPT_DIR=$UZUSHIO_ROOT/scripts 19 | export SPARK_HOME=${SPARK:-$HOME/soft/spark-3.4.1-bin-hadoop3} 20 | UZUSHIO_JAR=$(readlink -f 
"$SCRIPT_DIR/../core/target/scala-2.12/uzushio-assembly-0.1.0-SNAPSHOT.jar") 21 | 22 | export SPARK_CONF_DIR=$UZUSHIO_ROOT/spark-config/abci-f 23 | "$SPARK_HOME/sbin/start-master.sh" 24 | export SPARK_WORKERS=$SGE_JOB_HOSTLIST 25 | export SPARK_SSH_OPTS="-p 2222" 26 | export SPARK_LOCAL_DIRS=$SGE_LOCALDIR 27 | SPARK_MASTER="spark://$(hostname):7077" 28 | "$SPARK_HOME/sbin/workers.sh" "SPARK_CONF_DIR=$UZUSHIO_ROOT/spark-config/abci-f" "$SPARK_HOME/sbin/start-worker.sh" $SPARK_MASTER 29 | 30 | # it is possible to monitor task progress with Spark UI accessible by ssh port forwarding 31 | echo "$(date -Iseconds) $JOB_ID ssh abci -L8080:$(hostname):8080" >> /scratch/$USER/spark-ui-monitoring 32 | 33 | mkdir -p /scratch/$USER/spark-exlog 34 | 35 | 36 | "$SPARK_HOME/bin/spark-submit" \ 37 | --class com.worksap.nlp.uzushio.lib.runners.KenLMRunner \ 38 | --master $SPARK_MASTER \ 39 | --conf spark.driver.log.dfsDir=/scratch/$USER/spark-exlog \ 40 | --conf spark.eventLog.dir=/scratch/$USER/spark-exlog \ 41 | --conf spark.local.dir=$SPARK_LOCAL_DIRS \ 42 | --conf spark.sql.parquet.columnarReaderBatchSize=512 \ 43 | local://$UZUSHIO_JAR \ 44 | --input=$INPUT \ 45 | --output=$OUTPUT \ 46 | --sudachi-dict=$SUDACHI \ 47 | --kenlm-model=$KENLM 48 | 49 | wait -------------------------------------------------------------------------------- /scripts/submit_merge_stats.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -x 2 | 3 | #$ -j y 4 | #$ -cwd 5 | #$ -l USE_SSH=1 6 | #$ -l USE_EXTRA_NETWORK=1 7 | 8 | OUTPUT=$1 9 | shift 10 | INPUT=() 11 | for arg in "$@"; do 12 | INPUT+=("--input=$arg") 13 | du -hs "$arg" > /dev/null & 14 | done 15 | 16 | 17 | # SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 18 | UZUSHIO_ROOT=$HOME/work/uzushio 19 | SCRIPT_DIR=$UZUSHIO_ROOT/scripts 20 | export SPARK_HOME=${SPARK:-$HOME/soft/spark-3.4.1-bin-hadoop3} 21 | UZUSHIO_JAR=$(readlink -f "$SCRIPT_DIR/../core/target/scala-2.12/uzushio-assembly-0.1.0-SNAPSHOT.jar") 22 | 23 | export SPARK_CONF_DIR=$UZUSHIO_ROOT/spark-config/abci-f 24 | "$SPARK_HOME/sbin/start-master.sh" 25 | export SPARK_WORKERS=$SGE_JOB_HOSTLIST 26 | export SPARK_SSH_OPTS="-p 2222" 27 | export SPARK_LOCAL_DIRS=$SGE_LOCALDIR 28 | SPARK_MASTER="spark://$(hostname):7077" 29 | "$SPARK_HOME/sbin/workers.sh" "SPARK_CONF_DIR=$UZUSHIO_ROOT/spark-config/abci-f" "$SPARK_HOME/sbin/start-worker.sh" $SPARK_MASTER 30 | 31 | # it is possible to monitor task progress with Spark UI accessible by ssh port forwarding 32 | echo "$(date -Iseconds) $JOB_ID ssh abci -L8080:$(hostname):8080" >> /scratch/$USER/spark-ui-monitoring 33 | 34 | mkdir -p /scratch/$USER/spark-exlog 35 | 36 | "$SPARK_HOME/bin/spark-submit" \ 37 | --class com.worksap.nlp.uzushio.lib.runners.MergeDedupStats \ 38 | --master $SPARK_MASTER \ 39 | --conf spark.driver.log.dfsDir=/scratch/$USER/spark-exlog \ 40 | --conf spark.eventLog.dir=/scratch/$USER/spark-exlog \ 41 | --conf spark.local.dir=$SPARK_LOCAL_DIRS \ 42 | --conf spark.sql.shuffle.partitions=1000 \ 43 | local://$UZUSHIO_JAR \ 44 | ${INPUT[*]} \ 45 | --output="$OUTPUT" 46 | 47 | wait -------------------------------------------------------------------------------- /scripts/submit_merge_stats_final.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -x 2 | 3 | #$ -j y 4 | #$ -cwd 5 | #$ -l USE_SSH=1 6 | #$ -l USE_EXTRA_NETWORK=1 7 | 8 | OUTPUT=$1 9 | shift 10 | INPUT=() 11 | for arg in "$@"; do 12 | INPUT+=("--input=$arg") 13 | du 
-hs "$arg" > /dev/null & 14 | done 15 | 16 | 17 | # SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 18 | UZUSHIO_ROOT=$HOME/work/uzushio 19 | SCRIPT_DIR=$UZUSHIO_ROOT/scripts 20 | export SPARK_HOME=${SPARK:-$HOME/soft/spark-3.4.1-bin-hadoop3} 21 | UZUSHIO_JAR=$(readlink -f "$SCRIPT_DIR/../core/target/scala-2.12/uzushio-assembly-0.1.0-SNAPSHOT.jar") 22 | 23 | export SPARK_CONF_DIR=$UZUSHIO_ROOT/spark-config/abci-f 24 | "$SPARK_HOME/sbin/start-master.sh" 25 | export SPARK_WORKERS=$SGE_JOB_HOSTLIST 26 | export SPARK_SSH_OPTS="-p 2222" 27 | export SPARK_LOCAL_DIRS=$SGE_LOCALDIR 28 | SPARK_MASTER="spark://$(hostname):7077" 29 | "$SPARK_HOME/sbin/workers.sh" "SPARK_CONF_DIR=$UZUSHIO_ROOT/spark-config/abci-f" "$SPARK_HOME/sbin/start-worker.sh" $SPARK_MASTER 30 | 31 | # it is possible to monitor task progress with Spark UI accessible by ssh port forwarding 32 | echo "$(date -Iseconds) $JOB_ID ssh abci -L8080:$(hostname):8080" >> /scratch/$USER/spark-ui-monitoring 33 | 34 | mkdir -p /scratch/$USER/spark-exlog 35 | 36 | "$SPARK_HOME/bin/spark-submit" \ 37 | --class com.worksap.nlp.uzushio.lib.runners.MergeDedupStats \ 38 | --master $SPARK_MASTER \ 39 | --conf spark.driver.log.dfsDir=/scratch/$USER/spark-exlog \ 40 | --conf spark.eventLog.dir=/scratch/$USER/spark-exlog \ 41 | --conf spark.local.dir=$SPARK_LOCAL_DIRS \ 42 | --conf spark.sql.shuffle.partitions=4000 \ 43 | local://$UZUSHIO_JAR \ 44 | ${INPUT[*]} \ 45 | --output="$OUTPUT" --no-ones --partitions=1000 46 | 47 | wait -------------------------------------------------------------------------------- /scripts/vis/vis_filter.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import dataclasses 3 | import matplotlib.pyplot as plt 4 | from pathlib import Path 5 | import pandas as pd 6 | import numpy as np 7 | import pyarrow.csv as pcsv 8 | 9 | 10 | @dataclasses.dataclass 11 | class Args(object): 12 | output: str 13 | input: list[str] 14 | title: str = None 15 | dpi: float = None 16 | log: bool = False 17 | 18 | @staticmethod 19 | def parse(): 20 | p = argparse.ArgumentParser() 21 | p.add_argument("--output", type=Path) 22 | p.add_argument("--title") 23 | p.add_argument("--dpi", type=float) 24 | p.add_argument("--log", action="store_true") 25 | p.add_argument("input", type=Path, nargs="+") 26 | return Args(**vars(p.parse_args())) 27 | 28 | 29 | def plot_histogram(args: Args, folder_paths: list[Path]): 30 | histogram_data = [] 31 | titles = [] 32 | 33 | # Iterate through subfolders and CSV files 34 | for folder in folder_paths: 35 | if folder.is_dir(): 36 | total_df = [] 37 | csv_files = folder.glob("*.csv") 38 | for csv_file in csv_files: 39 | data = pcsv.read_csv( 40 | csv_file, 41 | read_options=pcsv.ReadOptions(column_names=["val", "text"]), 42 | convert_options=pcsv.ConvertOptions(include_columns=["val"]), 43 | ) 44 | 45 | total_df.append(data.column(0).to_numpy()) 46 | 47 | total_df = np.concatenate(total_df, axis=0) 48 | histogram_data.append(total_df) 49 | titles.append(folder.name) 50 | 51 | plt.hist( 52 | histogram_data, 53 | bins=200, 54 | density=True, 55 | label=titles, 56 | histtype="stepfilled", 57 | alpha=0.5, 58 | log=args.log, 59 | ) 60 | plt.legend(titles) 61 | plt.ylabel("Data %") 62 | plt.xlabel("Value") 63 | plt.title(args.title) 64 | 65 | 66 | def main(args: Args): 67 | plot_histogram(args, args.input) 68 | plt.savefig(args.output) 69 | 70 | 71 | if __name__ == "__main__": 72 | main(Args.parse()) 73 | 
-------------------------------------------------------------------------------- /spark-config/abci-f/spark-defaults.conf: -------------------------------------------------------------------------------- 1 | spark.driver.memory 30G 2 | spark.executor.memory 63G 3 | spark.executor.extraJavaOptions -XX:ObjectAlignmentInBytes=16 -XX:+UseParallelGC -XX:+PrintGCDetails -XX:+PrintGCTimeStamps 4 | spark.driver.log.persistToDfs.enabled true 5 | spark.eventLog.enabled true 6 | spark.eventLog.compress true 7 | spark.checkpoint.compress true 8 | spark.memory.offHeap.enabled true 9 | spark.memory.offHeap.size 200G 10 | spark.ui.reverseProxy true 11 | spark.executor.extraLibraryPath /groups/gcf51199/native-libs 12 | spark.driver.extraLibraryPath /groups/gcf51199/native-libs -------------------------------------------------------------------------------- /spark-config/abci-f/spark-env.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Spark environment variables for ABCI rt_F nodes 4 | # Correctly set temp directory and local storage to local scratch directory 5 | 6 | export SPARK_LOCAL_DIRS="$SGE_LOCALDIR" 7 | export SPARK_DAEMON_JAVA_OPTS="-Djava.io.tmpdir=$SGE_LOCALDIR" 8 | 9 | export SPARK_WORKER_DIR="$SGE_LOCALDIR" 10 | export SPARK_WORKER_OPTS="-Djava.io.tmpdir=$SGE_LOCALDIR" 11 | export SPARK_EXECUTOR_OPTS="-Djava.io.tmpdir=$SGE_LOCALDIR" 12 | 13 | export SPARK_LOG_DIR="/scratch/${USER:-nouser}/spark-log/${JOB_ID:-nojob}" 14 | mkdir -p "$SPARK_LOG_DIR" 15 | 16 | export MKL_NUM_THREADS=1 17 | export OPENBLAS_NUM_THREADS=1 18 | 19 | export JAVA_HOME="$HOME/soft/jdk-17-2023-09-27" --------------------------------------------------------------------------------
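For reference, the three overlap ratios defined in scripts/cal_overlap_ratio/README.md can be computed from two dumps as in the following minimal sketch. It assumes each dump is reduced to a set of document identifiers (e.g. URLs or hashes); the function name and that representation are illustrative, since cal_overlap.py itself does not appear in the listing above.
```
from collections.abc import Iterable


def overlap_ratios(dump_1: Iterable[str], dump_2: Iterable[str]) -> tuple[float, float, float]:
    """Return the three overlap ratios between two dumps.

    The ratios follow the README definitions:
    len(dump_1 & dump_2) / len(dump_1),
    len(dump_1 & dump_2) / len(dump_2),
    len(dump_1 & dump_2) / len(dump_1 | dump_2).
    """
    d1, d2 = set(dump_1), set(dump_2)
    inter = len(d1 & d2)
    union = len(d1 | d2)
    return (
        inter / len(d1) if d1 else 0.0,
        inter / len(d2) if d2 else 0.0,
        inter / union if union else 0.0,
    )


# Example: two toy "dumps" sharing two documents out of four distinct ones.
print(overlap_ratios({"a", "b", "c"}, {"b", "c", "d"}))  # (0.666..., 0.666..., 0.5)
```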