├── LICENSE.txt ├── README.md ├── extract_jawp_names_new.pl └── extract_jawp_names.pl /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | Copyright (c) 2016 Hiroshi Manabe 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 5 | 6 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 7 | 8 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 9 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Japanese Wikipedia personal name extractor 2 | =================== 3 | 4 | What's this? 5 | ------------ 6 | This is a Perl script that extracts personal names (both their Kanji and Kana forms) from Japanese Wikipedia and outputs them in a form in which the family and given names are separated by a space. 7 | 8 | How to use 9 | ---------- 10 | 11 | 1. Get the Japanese Wikipedia dump file. 12 | 13 | ``` 14 | curl -LO https://dumps.wikimedia.org/jawiki/latest/jawiki-latest-pages-articles.xml.bz2 15 | ``` 16 | 17 | 2. 
Run this script to extract personal names. 18 | 19 | ``` 20 | bzcat ./jawiki-latest-pages-articles.xml.bz2 | perl extract_jawp_names.pl 21 | ``` 22 | 23 | Or do it all at once. 24 | 25 | ``` 26 | curl -s https://dumps.wikimedia.org/jawiki/latest/jawiki-latest-pages-articles.xml.bz2 \ 27 | | bzcat | perl extract_jawp_names.pl 28 | ``` 29 | 30 | 説明 31 | ---- 32 | 日本語Wikipediaの人名(漢字・読み)を抽出し、姓と名がスペースで区切られた形で出力するPerlスクリプトです。 33 | 34 | 使い方 35 | ------ 36 | 1. 日本語Wikipediaのファイルを取ってきます。 37 | 38 | ``` 39 | curl -LO https://dumps.wikimedia.org/jawiki/latest/jawiki-latest-pages-articles.xml.bz2 40 | ``` 41 | 42 | 2. スクリプトを動かして人名を抽出します。 43 | 44 | ``` 45 | bzcat ./jawiki-latest-pages-articles.xml.bz2 | perl extract_jawp_names.pl 46 | ``` 47 | 48 | あるいはまとめて 49 | 50 | ``` 51 | curl -s https://dumps.wikimedia.org/jawiki/latest/jawiki-latest-pages-articles.xml.bz2 \ 52 | | bzcat | perl extract_jawp_names.pl 53 | ``` 54 | 55 | 56 | Expected Output/期待される出力 57 | ------------------------------ 58 | 士郎 正宗 しろう まさむね 59 | 高橋 留美子 たかはし るみこ 60 | 村上 もとか むらかみ もとか 61 | 青木 光恵 あおき みつえ 62 | 赤塚 不二夫 あかつか ふじお 63 | 一条 ゆかり いちじょう ゆかり 64 | うすた 京介 うすた きょうすけ 65 | 浦沢 直樹 うらさわ なおき 66 | 車田 正美 くるまだ まさみ 67 | 高橋 しん たかはし しん 68 | ... 69 | 70 | License 71 | ------- 72 | This software is released under the MIT License, see LICENSE.txt. 
73 | -------------------------------------------------------------------------------- /extract_jawp_names_new.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use utf8; 5 | use open IO => ':utf8'; 6 | use open ':std'; 7 | 8 | use FindBin; 9 | use File::Basename; 10 | use Getopt::Long qw(:config posix_default no_ignore_case gnu_compat); 11 | 12 | my %exists_dict = (); 13 | my %surname_dict = (); 14 | my %name_dict = (); 15 | my %surname_kana_dict = (); 16 | my %name_kana_dict = (); 17 | my %surname_kanji_dict = (); 18 | my %name_kanji_dict = (); 19 | 20 | my %opts; 21 | GetOptions(\%opts, 22 | "wikipedia-data=s", 23 | "base-name-file=s", 24 | "blacklist=s" 25 | ); 26 | 27 | $| = 1; 28 | 29 | if (not exists $opts{"wikipedia-data"} or not exists $opts{"base-name-file"}) { 30 | my $script_name = basename($FindBin::Script); 31 | die "$script_name --wikipedia-data --base-name-file [--blacklist ]\n"; 32 | } 33 | 34 | open IN, "<", $opts{"base-name-file"} or die "$opts{'base-name-file'}: $!"; # report the real path when the open fails 35 | while (<IN>) { # one "kanji<TAB>kana" entry per line 36 | print; # echo the base-name line to the output unchanged 37 | chomp; 38 | $exists_dict{$_} = (); 39 | my @F = split/\t/; 40 | my @han = split/ /, $F[0]; 41 | my @kana = split/ /, $F[1]; 42 | $surname_dict{$han[0]}->{$kana[0]} = (); 43 | $surname_kana_dict{$kana[0]} = (); 44 | $surname_kanji_dict{$han[0]} = (); 45 | $name_dict{$han[1]}->{$kana[1]} = (); 46 | $name_kanji_dict{$han[1]} = (); 47 | $name_kana_dict{$kana[1]} = (); 48 | } 49 | close IN; 50 | 51 | if (exists $opts{"blacklist"}) { 52 | my $blacklist = $opts{"blacklist"}; 53 | 54 | open IN, "<", $blacklist or die "$blacklist: $!"; 55 | while (<IN>) { # blacklisted entries are marked as already seen so they are never printed 56 | chomp; 57 | $exists_dict{$_} = (); 58 | } 59 | close IN; 60 | } 61 | 62 | open IN, "<", $opts{"wikipedia-data"} or die "$opts{'wikipedia-data'}: $!"; # report the real path when the open fails 63 | while (<IN>) { 64 | chomp; 65 | while (m{(\b[\p{sc=Han}\p{sc=Hiragana}\p{sc=Katakana}]{1,5}|(? 
4; 72 | $name_kanji =~ s{の(?:祖?父|祖?母|兄|姉|弟|妹)$}{}; 73 | next if length($name_kanji) > 4; 74 | next if $surname_kanji eq "" or $name_kanji eq ""; 75 | my $fullname = "$surname_kanji $name_kanji\t$surname_kana $name_kana"; 76 | next if exists $exists_dict{$fullname}; 77 | my $kanji = "$surname_kanji $name_kanji"; 78 | my $kana = "$surname_kana $name_kana"; 79 | if ($kanji =~ m{^((?![炭郷])\p{sc=Han}|司馬|欧陽|諸葛|司徒) (\p{sc=Han}{1,2})$}) { 80 | my $surname_len = length($1); 81 | my $name_len = length($2); 82 | my $regex_yomi = q{\p{sc=Hiragana}(?:[ゃゅょ][うくつんっ]?|[いうきくちつんっ]?)}; 83 | my $regex = qr{^(?:$regex_yomi){$surname_len} (?:$regex_yomi){$name_len}$}; 84 | next if $kana =~ m{$regex}; 85 | } 86 | my $name_kanji_kana = $name_kanji; 87 | $name_kanji_kana =~ tr/ァ-ン/ぁ-ん/; 88 | if (((exists $surname_dict{$surname_kanji}->{$surname_kana} or exists $name_dict{$name_kanji}->{$name_kana} or $name_kanji_kana eq $name_kana) or (exists $surname_kanji_dict{$surname_kanji} and exists $name_kanji_dict{$name_kanji}) or (exists $surname_kana_dict{$surname_kana} and exists $name_kana_dict{$name_kana})) and length($surname_kana) >= length($surname_kanji) and length($name_kana) >= length($name_kanji)) { 89 | print "$fullname\n"; 90 | $surname_dict{$surname_kanji}->{$surname_kana} = (); 91 | $surname_kana_dict{$surname_kana} = (); 92 | $surname_kanji_dict{$surname_kanji} = (); 93 | $name_dict{$name_kanji}->{$name_kana} = (); 94 | $name_kanji_dict{$name_kanji} = (); 95 | $name_kana_dict{$name_kana} = (); 96 | $exists_dict{$fullname} = (); 97 | } 98 | else { 99 | 1; 100 | } 101 | } 102 | } 103 | close IN; 104 | -------------------------------------------------------------------------------- /extract_jawp_names.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use utf8; 5 | use open IO => ':utf8'; 6 | use open ':std'; 7 | 8 | my $page = ""; 9 | while (<>) { # read the XML dump line by line from STDIN (or files named in @ARGV) 10 | if (m{^\s*<page>}) { # a new article starts: reset the buffer 11 | $page = ""; 12 | } 13 | $page .= 
$_; 14 | if (m{^\s*</page>}) { # article complete: analyse the buffered text 15 | next if $page !~ m{\[\[Category:.+(人物|年生|年没)\b}i; 16 | next if $page =~ m{\[\[Category:.*コンビ\b}i; 17 | 18 | $page =~ m{<title>(.+?)</title>}; # the article title is the candidate name 19 | my $name = $1; 20 | 21 | $page =~ s{\{\{[^\|\}]+?フォント\|(.+?)\}\}}{$1}g; 22 | $page =~ s{\{\{lang\|[^\|]*\|([^\}]*)\}\}}{$1}g; 23 | $page =~ s{&#(\d+);}{chr($1)}eg; 24 | $page =~ s{&#x([A-Fa-f0-9]+);}{chr(hex($1))}eg; 25 | 26 | next if $page !~ m{\{\{(?:DEFAULTSORT|デフォルトソート):(.+?)\}\}}i; 27 | 28 | my $defaultsort = $1; 29 | $defaultsort =~ tr/ァ-ヴ/ぁ-ゔ/; 30 | $defaultsort =~ tr/ぁぃぅぇぉっゃゅょゎ/あいうえおつやゆよわ/; 31 | $defaultsort =~ tr/ゔがぎぐげござじずぜぞだぢづでどばびぶべぼぱぴぷぺぽ/うかきくけこさしすせそたちつてとはひふへほはひふへほ/; 32 | $defaultsort =~ tr/ぁ-ゔ//cd; 33 | 34 | $name =~ s{ \(.+\)}{}; 35 | my @names = ($name); 36 | if ($page =~ m{\{\{記事名の制約\|title=([^|\}]+)}) { 37 | push @names, $1; 38 | } 39 | my @name_regexps = map { 40 | join(q{[\W\x{fe00}-\x{fe0f}\x{e0100}-\x{e01ef}]*}, map { quotemeta($_) } split(//, $_)); 41 | } @names; 42 | my $name_regexp = ""; 43 | if (scalar(@names) == 2 and length($names[0]) == length($names[1])) { 44 | for (my $i = 0; $i < length($names[0]); ++$i) { 45 | my $ch1 = quotemeta(substr($names[0], $i, 1)); 46 | my $ch2 = quotemeta(substr($names[1], $i, 1)); 47 | $name_regexp .= ($ch1 eq $ch2) ? 
$ch1 : "(?:$ch1|$ch2)"; 48 | $name_regexp .= q{[\W\x{fe00}-\x{fe0f}\x{e0100}-\x{e01ef}]*} if $i != length($names[0]) - 1; 49 | } 50 | } 51 | else { 52 | $name_regexp = "(?:".join("|", @name_regexps).")"; 53 | } 54 | 55 | my @chars = split//, $defaultsort; 56 | my @ambiguous_chars = (); 57 | 58 | for (my $i = 0; $i < scalar(@chars); ++$i) { 59 | my $ch = $chars[$i]; 60 | my @chs = ($ch); 61 | @chs = map { 62 | if (m{[あいうえおつやゆよわ]}) { 63 | my $c = $_; 64 | $c =~ tr/あいうえおつやゆよわ/ぁぃぅぇぉっゃゅょゎ/; 65 | ($_, $c); 66 | } 67 | else { 68 | $_; 69 | } 70 | } @chs; 71 | @chs = map { 72 | if (m{[うかきくけこさしすせそたちつてとはひふへほ]}) { 73 | my $c = $_; 74 | $c =~ tr/うかきくけこさしすせそたちつてとはひふへほ/ゔがぎぐげござじずぜぞだぢづでどばびぶべぼ/; 75 | ($_, $c); 76 | } 77 | else { 78 | $_; 79 | } 80 | } @chs; 81 | @chs = map { 82 | if (m{[はひふへほ]}) { 83 | my $c = $_; 84 | $c =~ tr/はひふへほ/ぱぴぷぺぽ/; 85 | ($_, $c); 86 | } 87 | else { 88 | $_; 89 | } 90 | } @chs; 91 | @chs = map { 92 | my $c = $_; 93 | $c =~ tr/ぁ-ゔ/ァ-ヴ/; 94 | ($_, $c); 95 | } @chs; 96 | if ($i != 0) { # a vowel that repeats the previous kana's vowel may be absent from the article text (the separator below also accepts ー), so add an empty alternative 97 | my $prev_ch = $chars[$i - 1]; 98 | if ($prev_ch =~ m{[あかさたなはまやらわ]} and $ch eq 'あ' or 99 | $prev_ch =~ m{[いきしちにひみり]} and $ch eq 'い' or 100 | $prev_ch =~ m{[うくすつぬふむゆる]} and $ch eq 'う' or 101 | $prev_ch =~ m{[えけせてねへめれ]} and $ch eq 'え' or 102 | $prev_ch =~ m{[おこそとのほもよろを]} and $ch =~ m{^[おう]$}) { 103 | push @chs, ""; 104 | } 105 | } 106 | push @ambiguous_chars, "(?:".join("|", map { quotemeta $_ } @chs).")"; 107 | } 108 | my $defaultsort_regexp = join('[\Wー]*', @ambiguous_chars).'ー?'; 109 | 110 | $page =~ s{('''[^']+ ([\p{sc=Hiragana}\p{sc=Katakana}ー]+)'''[\W\p{sc=Hiragana}\p{sc=Katakana}ー]+ )-}{$1$2}; 111 | $page =~ s{('''([\p{sc=Hiragana}\p{sc=Katakana}ー]+) [^']+'''\W+)-}{$1$2}; 112 | 113 | while ($page =~ m{($name_regexp).*?[^\p{sc=Hiragana}\p{sc=Katakana}]($defaultsort_regexp)[^\p{sc=Hiragana}\p{sc=Katakana}ー]}g) { 114 | my $matched_name = $1; 115 | my $matched_kana = $2; 116 | if ($matched_name =~ s{親王$}{}) { 117 | next if $matched_kana !~ s{しんのう$}{}; 118 | } 119 | 
$matched_name =~ tr/\x{fe00}-\x{fe0f}\x{e0100}-\x{e01ef}//d; 120 | 121 | my @split_name = split/\W+/, $matched_name; 122 | my @split_kana = map { my $t = $_; $t =~ tr/ァ-ン/ぁ-ん/; $t; } split(/\W+/, $matched_kana); 123 | my @split_name_hiragana = map { my $t = $_; $t =~ tr/ァ-ン/ぁ-ん/; $t; } @split_name; 124 | 125 | if (scalar(@split_name) == 2 and scalar(@split_kana) == 2) { 126 | my $ok_flag = 1; 127 | if ("$split_name[0] $split_name[1]" =~ m{^((?![炭郷団関])\p{sc=Han}|司馬|欧陽|諸葛|司徒)\W*(\p{sc=Han}{1,2})$}) { 128 | my $surname_len = length($1); 129 | my $name_len = length($2); 130 | my $regex_yomi = q{\p{sc=Hiragana}(?:[ゃゅょ][うくつんっ]?|[いうきくちつんっ]?)}; 131 | my $regex = qr{^(?:$regex_yomi){$surname_len}\W+(?:$regex_yomi){$name_len}$}; 132 | if ("$split_kana[0] $split_kana[1]" =~ m{$regex}) { 133 | $ok_flag = 0; 134 | } 135 | } 136 | if (length($split_name[0]) == 1 and $matched_kana !~ m{\p{sc=Hiragana}}) { 137 | $ok_flag = 0; 138 | } 139 | for my $i(0..1) { 140 | if ($split_name_hiragana[$i] =~ m{(\p{sc=Hiragana}+)}) { 141 | my $t = $1; 142 | $ok_flag = 0 if $split_kana[$i] !~ m{$t}; 143 | } 144 | } 145 | if ($matched_name !~ m{[\p{sc=Han}\p{sc=Hiragana}]} or 146 | $matched_name =~ m{\d} or 147 | $matched_name =~ m{.一覧} or 148 | $matched_name =~ m{王后|王妃}) { 149 | $ok_flag = 0; 150 | } 151 | if ($ok_flag == 0) { 152 | next; 153 | } 154 | 155 | print join(" ", @split_name)."\t".join(" ", @split_kana)."\n"; 156 | last; 157 | } 158 | } 159 | } 160 | } 161 | --------------------------------------------------------------------------------