├── .gitignore
├── README.md
├── reader
│   ├── __init__.py
│   ├── divide_xml_revisions.py
│   ├── divide_xml_revisions_new.py
│   ├── extract_revisions.py
│   ├── extract_revisions_new.py
│   ├── extract_spelling_errors.py
│   ├── extract_spelling_errors_new.py
│   ├── fix_extracted.py
│   ├── utils.py
│   └── wikiextractor
│       ├── WikiExtractor.py
│       ├── __init__.py
│       ├── cirrus-extract.py
│       ├── clean.py
│       ├── extract.py
│       └── extractPage.py
└── run
    └── pipeline.py

/.gitignore:
--------------------------------------------------------------------------------
1 | .vscode/
2 | # Byte-compiled / optimized / DLL files
3 | __pycache__/
4 | *.py[cod]
5 | *$py.class
6 |
7 | # C extensions
8 | *.so
9 |
10 | # Distribution / packaging
11 | .Python
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | pip-wheel-metadata/
25 | share/python-wheels/
26 | *.egg-info/
27 | .installed.cfg
28 | *.egg
29 | MANIFEST
30 |
31 | # PyInstaller
32 | # Usually these files are written by a python script from a template
33 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
34 | *.manifest
35 | *.spec
36 |
37 | # Installer logs
38 | pip-log.txt
39 | pip-delete-this-directory.txt
40 |
41 | # Unit test / coverage reports
42 | htmlcov/
43 | .tox/
44 | .nox/
45 | .coverage
46 | .coverage.*
47 | .cache
48 | nosetests.xml
49 | coverage.xml
50 | *.cover
51 | *.py,cover
52 | .hypothesis/
53 | .pytest_cache/
54 |
55 | # Translations
56 | *.mo
57 | *.pot
58 |
59 | # Django stuff:
60 | *.log
61 | local_settings.py
62 | db.sqlite3
63 | db.sqlite3-journal
64 |
65 | # Flask stuff:
66 | instance/
67 | .webassets-cache
68 |
69 | # Scrapy stuff:
70 | .scrapy
71 |
72 | # Sphinx documentation
73 | docs/_build/
74 |
75 | # PyBuilder
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | .python-version
87 |
88 | # pipenv
89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
92 | # install all needed dependencies.
93 | #Pipfile.lock
94 |
95 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
96 | __pypackages__/
97 |
98 | # Celery stuff
99 | celerybeat-schedule
100 | celerybeat.pid
101 |
102 | # SageMath parsed files
103 | *.sage.py
104 |
105 | # Environments
106 | .env
107 | .venv
108 | env/
109 | venv/
110 | ENV/
111 | env.bak/
112 | venv.bak/
113 |
114 | # Spyder project settings
115 | .spyderproject
116 | .spyproject
117 |
118 | # Rope project settings
119 | .ropeproject
120 |
121 | # mkdocs documentation
122 | /site
123 |
124 | # mypy
125 | .mypy_cache/
126 | .dmypy.json
127 | dmypy.json
128 |
129 | # Pyre type checker
130 | .pyre/
131 |
132 | tcdata/
133 | user_data/*
134 | !user_data/extra_data
135 | !user_data/track3
136 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # WiKi-Error-Extract
2 |
3 | **\*\*\*\*\* Update 2022-03-04 \*\*\*\*\***
4 |
5 | Since this pipeline is still quite slow to run, you can use the results I have already extracted:
6 |
7 | [Download link + password: c4qk](https://pan.baidu.com/s/1PovlwB9H1Zu-Jv_WN9xZnQ)
8 |
9 | > It is recommended to filter the extracted errors with a confusion set.
10 |
11 | The extraction logic has also been revised to avoid reading and writing a large number of small files. Run:
12 |
13 | ```python
14 | python code_dir/reader/divide_xml_revisions_new.py unzipped_file_path output_file_path 0.1
15 | ```
16 |
17 | **\*\*\*\*\* Update End \*\*\*\*\***
18 |
19 | Extracts parallel corpora for error correction from Wikipedia edit-history data. Only sentence pairs whose characters are aligned one-to-one are extracted; character insertions and deletions are not considered. Modify the code yourself if you need those cases.
20 |
21 | Steps:
22 |
23 | 1. Download the zhwiki data with complete edit history from the Wikipedia dumps. I downloaded the "All pages with complete edit history (.7z)" data from [20211201](https://dumps.wikimedia.org/zhwiki/20211201/).
24 |
25 | 2. Put all the .7z files in the same directory A and create a new extracted directory under it. Update the corresponding directory paths in run/pipeline.py.
26 |
27 | 3. Run the pipeline script from directory A: `python wiki-error-extract dir/run/pipeline.py`
28 |
29 | 4. The final results will be in the stage5 folder under directory A.
30 |
31 |
32 | ## Examples
33 |
34 | ```
35 | [{'src': '比如我把勾股定理叫做勾理定理,并不意味着我认为为它是中国人最先证明的,而是"勾"股"弦"本来就是指直角三角形的三边。',
36 |   'tgt': '比如我把勾股定理叫做勾股定理,并不意味着我认为为它是中国人最先证明的,而是"勾"股"弦"本来就是指直角三角形的三边。'},
37 |  {'src': '最后在反对杀局的民建联议员黄容根及临区局主席刘皇发在投票前「巧合」地失踪,而支持杀局的李国宝在投票前二十秒赶到等「戏剧化」情节下而令草案得到通过。',
38 |   'tgt': '最后在反对杀局的民建联议员黄容根及临区局主席刘皇发在投票前「巧合」地失踪,而支持杀局的李国宝在投票前20秒赶到等「戏剧化」情节下而令草案得到通过。'},
39 |  {'src': '禁止公众摄影图书馆 康文署拒放宽.', 'tgt': '禁止公众拍摄图书馆 康文署拒放宽.'},
40 |  {'src': '上白礁谒祖祭典由台湾台南县学甲镇慈济宫所举行,于每年农历三月十一日举行,即当地所称上白礁活动,于当地将军溪畔「头前寮」举行祭典,隔海遥祭保生大帝祖庙,即中国大陆福建白礁慈济宫。',
41 |   'tgt': '上白礁谒祖祭典由台湾台南市学甲区慈济宫所举行,于每年农历三月十一日举行,即当地所称上白礁活动,于当地将军溪畔「头前寮」举行祭典,隔海遥祭保生大帝祖庙,即中国大陆福建白礁慈济宫。'},
42 |  {'src': '1980年后,改善分离的选择性成为色谱工作者的主要问题,人们越来越认识到改变流动相的组成事提高选择性的关键。',
43 |   'tgt': '1980年后,改善分离的选择性成为色谱工作者的主要问题,人们越来越认识到改变流动相的组成是提高选择性的关键。'},
44 |  {'src': '德国战车最大的缺点是生产速度慢,直到战终德国主力四号战车也仅生产九千多、五号豹式六千多部。',
45 |   'tgt': '德国战车最大的缺点是生产速度慢,直到战终德国主力四号坦克也仅生产九千多、五号豹式六千多部。'},
46 |  {'src': '由于过多贵格会信仰者居住于费城,因此费城人又称"贵格会信仰者"。',
47 |   'tgt': '由于许多贵格会信仰者居住于费城,因此费城人又称"贵格会信仰者"。'},
48 |  {'src': '进入校门,映入眼帘的即是建中的指标建筑红楼,其建于日治时期(1909年),目前已列入台北市市定古迹。',
49 |   'tgt': '进入校门,映入眼帘的即是建中的指标建筑红楼,其建于日据时期(1909年),目前已列入台北市市定古迹。'},
50 |  {'src': '2015年1月20日,该校一名姓陈17岁中五女生因担心迟到而追小巴,却被撞至阴道重创昏迷。',
51 |   'tgt': '2015年1月20日,该校一名姓陈17岁中五女生因担心迟到而追小巴,却被撞至头部重创昏迷。'},
52 |  {'src': '在1909年埃希纳·赫茨普龙是第一位提出天狼星是大熊座移动星团之一的人,他在观测天狼星系统在天空中的移动路径之后得出这个结论。',
53 |   'tgt': '在1909年埃希纳·赫茨普龙是第一位提出天狼星是大熊座移动星群之一的人,他在观测天狼星系统在天空中的移动路径之后得出这个结论。'}]
54 | ```
--------------------------------------------------------------------------------
/reader/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xueyouluo/wiki-error-extract/7871514c89bb3e6e2ceb314ac4f5eba94d75bb7a/reader/__init__.py
--------------------------------------------------------------------------------
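Editor's note on the README's confusion-set suggestion: the sketch below shows one way to post-filter the JSONL output written by `divide_xml_revisions_new.py` (one `{"src": ..., "tgt": ...}` object per line, as in the examples above). It is not part of this repository; the file name `confusion.txt` and its whitespace-separated "correct-char confusable-chars" format are assumptions made purely for illustration.

```python
# Minimal post-filtering sketch (assumptions: JSONL pairs from
# divide_xml_revisions_new.py and a hypothetical confusion.txt whose lines
# look like "因 音 荫 阴", i.e. a correct character followed by confusables).
import json


def load_confusion_set(path):
    """Map each correct character to the set of characters it is confused with."""
    confusion = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            chars = line.split()
            if chars:
                confusion[chars[0]] = set(chars[1:])
    return confusion


def covered_by_confusion(src, tgt, confusion):
    # The extracted pairs are 1-1 character aligned, so keep a pair only if
    # every changed source character is a known confusion of the target one.
    if len(src) != len(tgt):
        return False
    return all(s == t or s in confusion.get(t, ()) for s, t in zip(src, tgt))


def filter_pairs(jsonl_path, confusion_path, out_path):
    """Write the pairs whose edits are covered by the confusion set; return the count."""
    confusion = load_confusion_set(confusion_path)
    kept = 0
    with open(jsonl_path, encoding='utf-8') as fin, \
            open(out_path, 'w', encoding='utf-8') as fout:
        for line in fin:
            pair = json.loads(line)
            if covered_by_confusion(pair['src'], pair['tgt'], confusion):
                fout.write(json.dumps(pair, ensure_ascii=False) + '\n')
                kept += 1
    return kept


if __name__ == '__main__':
    print(filter_pairs('output_file_path', 'confusion.txt', 'filtered.jsonl'))
```

This drops semantic rewrites (e.g. the 台南县→台南市 example above) and keeps only substitutions that look like genuine spelling errors, which is usually what a spelling-correction training set needs.
--------------------------------------------------------------------------------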
/reader/divide_xml_revisions.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """Divide the large XML revision dump file into per page revisions. 4 | 5 | """ 6 | import codecs 7 | import os 8 | import xml.sax 9 | import xml.sax.saxutils 10 | 11 | 12 | html_escape_table = { 13 | u'‘': "'", 14 | u'’': "'", 15 | u'“': '"', 16 | u'”': '"' 17 | } 18 | 19 | html_unescape_table = {v:k for k, v in html_escape_table.items()} 20 | 21 | def html_escape(text): 22 | return xml.sax.saxutils.escape(text, html_escape_table) 23 | 24 | def html_unescape(text): 25 | return xml.sax.saxutils.unescape(text, html_unescape_table) 26 | 27 | 28 | class WikiRevisionDumpHandler(xml.sax.ContentHandler): 29 | wiki_dump_tags = set(['page', 'title', 'ns', 'id', 'revision', 'parentid', 30 | 'timestamp', 'contributor', 'ip', 'username', 31 | 'comment', 'model', 'format', 'text', 'sha1']) 32 | file_counter = 0 33 | file_handle = '' 34 | 35 | def __init__(self, input_file, output_dir): 36 | # Input/output locations 37 | self.input_file = input_file 38 | self.output_dir = output_dir 39 | 40 | # Recent tag visited by SAX parser 41 | self.curr_tag = '' 42 | self.content = '' 43 | 44 | def startElement(self, tag, attributes): 45 | self.curr_tag = tag 46 | if self.curr_tag == 'page': 47 | # close the unclosed handles first if any 48 | if self.file_handle: 49 | self.file_handle.close() 50 | fname = repr(self.file_counter).zfill(10) + '.xml' 51 | abspath = self.output_dir + '/' + fname 52 | print('Writing to file: ', abspath ) 53 | self.file_handle = codecs.open(abspath, 'w', 'utf-8') 54 | self.file_handle.write(self.tag_start('page')+'\n') 55 | elif self.curr_tag in self.wiki_dump_tags: 56 | self.file_handle.write(self.tag_start(self.curr_tag)) 57 | 58 | def endElement(self, tag): 59 | self.curr_tag = tag 60 | if self.curr_tag == 'page': 61 | self.file_handle.write(self.tag_end('page')) 62 | self.file_handle.close() 63 | self.file_counter += 1 64 | elif self.curr_tag in self.wiki_dump_tags: 65 | self.file_handle.write(self.tag_end(self.curr_tag)) 66 | 67 | def characters(self, contents): 68 | self.content = contents 69 | if self.curr_tag != 'page' and self.curr_tag in self.wiki_dump_tags: 70 | self.file_handle.write(html_escape(self.content)) 71 | 72 | @staticmethod 73 | def surround_wih_tag(tag, cont): return '<'+tag+'>'+cont+'' 74 | 75 | @staticmethod 76 | def tag_start(tag): return '<'+tag+'>' 77 | 78 | @staticmethod 79 | def tag_end(tag): return '' 80 | 81 | 82 | if __name__ == '__main__': 83 | import argparse 84 | arg_parser = argparse.ArgumentParser(description='Script for dividing the large XML revision dump into individual page revisions.') 85 | arg_parser.add_argument('input_file', help='XML revision dump file name') 86 | arg_parser.add_argument('output_dir', help='Output directory') 87 | args = arg_parser.parse_args() 88 | if not os.path.exists(args.output_dir): 89 | os.makedirs(args.output_dir) 90 | 91 | # SAX XML reader 92 | xml_parser = xml.sax.make_parser() 93 | 94 | revision_dump_handler = WikiRevisionDumpHandler(args.input_file, args.output_dir) 95 | xml_parser.setContentHandler(revision_dump_handler) 96 | xml_parser.parse(args.input_file) 97 | -------------------------------------------------------------------------------- /reader/divide_xml_revisions_new.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """Divide the large XML revision dump file into per page revisions. 
4 | 5 | """ 6 | import codecs 7 | import os 8 | import xml.sax 9 | import xml.sax.saxutils 10 | import io 11 | import json 12 | from extract_revisions_new import extract_revisions 13 | from extract_spelling_errors_new import converter,check_error 14 | 15 | html_escape_table = { 16 | u'‘': "'", 17 | u'’': "'", 18 | u'“': '"', 19 | u'”': '"' 20 | } 21 | 22 | html_unescape_table = {v:k for k, v in html_escape_table.items()} 23 | 24 | def html_escape(text): 25 | return xml.sax.saxutils.escape(text, html_escape_table) 26 | 27 | def html_unescape(text): 28 | return xml.sax.saxutils.unescape(text, html_unescape_table) 29 | 30 | 31 | def extract_errors(content,number_of_edits,outfile): 32 | buffer = '' 33 | # extract revisions 34 | for timestamp,text in extract_revisions(io.StringIO(content)): 35 | if len(text) > 10: 36 | buffer += '\n\n[Revision timestamp: ' + timestamp + ']\n\n' 37 | buffer += text 38 | 39 | revisions = [] 40 | line = [] 41 | 42 | srcs,tgts = [],[] 43 | pre_revision = '' 44 | current_revision = '' 45 | cnt = 0 46 | for line in buffer.splitlines(): 47 | line = converter.convert(line) 48 | if "Revision timestamp" in line: 49 | if current_revision: 50 | if pre_revision: 51 | cnt += 1 52 | # if cnt % 100 == 0: 53 | # print('processed',cnt) 54 | check_error(pre_revision,current_revision.strip(),srcs,tgts,number_of_edits) 55 | pre_revision = current_revision.strip() 56 | current_revision = '' 57 | else: 58 | current_revision += line 59 | if current_revision and pre_revision: 60 | check_error(pre_revision,current_revision.strip(),srcs,tgts,number_of_edits) 61 | 62 | 63 | keeps = [] 64 | if srcs: 65 | errors = set() 66 | for src,tgt in zip(srcs[::-1],tgts[::-1]): 67 | if src in errors or tgt in errors: 68 | continue 69 | errors.add(src) 70 | errors.add(tgt) 71 | keeps.append({"src":src,'tgt':tgt}) 72 | 73 | if keeps: 74 | for x in keeps: 75 | outfile.write(json.dumps(x,ensure_ascii=False) + '\n') 76 | 77 | class WikiRevisionDumpHandler(xml.sax.ContentHandler): 78 | wiki_dump_tags = set(['page', 'title', 'ns', 'id', 'revision', 'parentid', 79 | 'timestamp', 'contributor', 'ip', 'username', 80 | 'comment', 'model', 'format', 'text', 'sha1']) 81 | file_counter = 0 82 | file_handle = '' 83 | 84 | def __init__(self, input_file, output_file, number_of_edits): 85 | # Input/output locations 86 | self.input_file = input_file 87 | self.output_file = open(output_file,'a') 88 | self.number_of_edits = number_of_edits 89 | 90 | # Recent tag visited by SAX parser 91 | self.curr_tag = '' 92 | self.content = '' 93 | 94 | def startElement(self, tag, attributes): 95 | self.curr_tag = tag 96 | if self.curr_tag == 'page': 97 | # close the unclosed handles first if any 98 | if self.file_handle: 99 | self.file_handle = '' 100 | self.file_handle += self.tag_start('page')+'\n' 101 | elif self.curr_tag in self.wiki_dump_tags: 102 | self.file_handle += self.tag_start(self.curr_tag) 103 | 104 | def endElement(self, tag): 105 | self.curr_tag = tag 106 | if self.curr_tag == 'page': 107 | self.file_handle += self.tag_end('page') 108 | self.file_counter += 1 109 | extract_errors(self.file_handle,self.number_of_edits,self.output_file) 110 | self.file_handle = '' 111 | # if self.file_counter % 100 == 0: 112 | print(f'{self.input_file} processed {self.file_counter} pages') 113 | elif self.curr_tag in self.wiki_dump_tags: 114 | self.file_handle += self.tag_end(self.curr_tag) 115 | 116 | def characters(self, contents): 117 | self.content = contents 118 | if self.curr_tag != 'page' and self.curr_tag in self.wiki_dump_tags: 
119 | self.file_handle += html_escape(self.content) 120 | 121 | @staticmethod 122 | def surround_wih_tag(tag, cont): return '<'+tag+'>'+cont+'' 123 | 124 | @staticmethod 125 | def tag_start(tag): return '<'+tag+'>' 126 | 127 | @staticmethod 128 | def tag_end(tag): return '' 129 | 130 | 131 | if __name__ == '__main__': 132 | import argparse 133 | arg_parser = argparse.ArgumentParser(description='Script for dividing the large XML revision dump into individual page revisions.') 134 | arg_parser.add_argument('input_file', help='XML revision dump file name') 135 | arg_parser.add_argument('output_file', help='Output file') 136 | arg_parser.add_argument('number_of_edits', help='number_of_edits') 137 | args = arg_parser.parse_args() 138 | number_of_edits = float(args.number_of_edits) 139 | if not os.path.exists(os.path.dirname(args.output_file)): 140 | os.makedirs(os.path.dirname(args.output_file)) 141 | 142 | # SAX XML reader 143 | xml_parser = xml.sax.make_parser() 144 | 145 | revision_dump_handler = WikiRevisionDumpHandler(args.input_file, args.output_file,number_of_edits) 146 | xml_parser.setContentHandler(revision_dump_handler) 147 | xml_parser.parse(args.input_file) 148 | -------------------------------------------------------------------------------- /reader/extract_revisions.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | import codecs 4 | import os 5 | import xml.sax 6 | import xml.sax.saxutils 7 | 8 | from fix_extracted import fix_extraction 9 | 10 | 11 | html_escape_table = { 12 | u'‘': "'", 13 | u'’': "'", 14 | u'“': '"', 15 | u'”': '"', 16 | u'&': '&' 17 | } 18 | 19 | html_unescape_table = {v:k for k, v in html_escape_table.items()} 20 | 21 | def html_escape(text): 22 | return xml.sax.saxutils.escape(text, html_escape_table) 23 | 24 | def html_unescape(text): 25 | return xml.sax.saxutils.unescape(text, html_unescape_table) 26 | 27 | 28 | class WikiRevisionHandler(xml.sax.ContentHandler): 29 | input_file = 'wiki.xml' 30 | output_dir = '.' 
31 | wiki_dump_tags = set(['page', 'title', 'ns', 'id', 'revision', 'parentid', 32 | 'timestamp', 'contributor', 'ip', 'username', 33 | 'comment', 'model', 'format', 'text', 'sha1']) 34 | file_handle = '' 35 | 36 | def __init__(self, input_file, output_file): 37 | # Input/output locations 38 | self.input_file = input_file 39 | self.output_file = output_file 40 | 41 | # Recent tag visited by SAX parser 42 | self.curr_tag = '' 43 | self.content = '' 44 | 45 | # Revisions 46 | self.revisions = [] 47 | self.curr_rev = [] 48 | self.rev_start = False 49 | self.ts_start = False 50 | self.timestamps = [] 51 | 52 | 53 | def startElement(self, tag, attributes): 54 | self.curr_tag = tag 55 | if self.curr_tag == 'timestamp': 56 | self.ts_start = True 57 | if self.curr_tag == 'revision': 58 | self.rev_start = True 59 | if self.curr_tag == 'page': 60 | # close the unclosed handles first if any 61 | if self.file_handle: 62 | self.file_handle.close() 63 | print('Writing to file: ', self.output_file) 64 | self.file_handle = codecs.open(self.output_file, 'w', 'utf-8') 65 | # self.file_handle.write(self.tag_start('page')+'\n') 66 | #elif self.curr_tag in self.wiki_dump_tags: 67 | # self.file_handle.write(self.tag_start(self.curr_tag)) 68 | 69 | def endElement(self, tag): 70 | self.curr_tag = tag 71 | if self.curr_tag == 'timestamp': 72 | self.ts_start = False 73 | if self.curr_tag == 'revision': 74 | self.rev_start = False 75 | if len(self.curr_rev) > 0: 76 | self.revisions.append(self.curr_rev) 77 | self.curr_rev = [] 78 | if self.curr_tag == 'page': 79 | # self.file_handle.write(self.tag_end('page')) 80 | print('revisions',len(self.revisions)) 81 | ts_revs = list(zip(self.timestamps, self.revisions)) 82 | for t_r in ts_revs[::-1]: 83 | self.file_handle.write('\n[Revision timestamp: ' + t_r[0] + ']\n') 84 | html_escaped = html_escape(''.join(t_r[1])) 85 | self.file_handle.write(html_escaped) 86 | self.file_handle.close() 87 | #elif self.curr_tag in self.wiki_dump_tags: 88 | # self.file_handle.write(self.tag_end(self.curr_tag)) 89 | 90 | def characters(self, contents): 91 | self.content = contents 92 | if self.curr_tag == 'text' and self.rev_start: 93 | self.curr_rev.append(self.content) 94 | if self.curr_tag == 'timestamp' and self.ts_start: 95 | self.timestamps.append(self.content) 96 | #self.file_handle.write('[Revision timestamp: ' + self.content + ']\n') 97 | #if self.curr_tag != 'page' and self.curr_tag in self.wiki_dump_tags: 98 | # self.file_handle.write(html_escape(self.content)) 99 | 100 | 101 | class WikiRevErrorHandler(xml.sax.handler.ErrorHandler): 102 | 103 | def error(self, exception): 104 | pass 105 | 106 | def fatalError(self, exception): 107 | pass 108 | 109 | def warning(self, exception): 110 | pass 111 | 112 | 113 | 114 | if __name__ == '__main__': 115 | import argparse 116 | arg_parser = argparse.ArgumentParser(description='Script for dividing the large XML revision dump into individual page revisions.') 117 | arg_parser.add_argument('input_dir', help='Input dir') 118 | arg_parser.add_argument('input_file', help='Input file') 119 | arg_parser.add_argument('output_dir', help='Output dir') 120 | args = arg_parser.parse_args() 121 | 122 | # fix extraction 123 | fix_extraction(args.input_dir,args.input_file,args.input_dir) 124 | 125 | input_file = args.input_dir + '/' + args.input_file 126 | output_file = args.output_dir + '/' + args.input_file 127 | # SAX XML reader 128 | xml_parser = xml.sax.make_parser() 129 | 130 | revision_handler = WikiRevisionHandler(input_file, output_file) 131 | 
wiki_err_handler = WikiRevErrorHandler() 132 | xml_parser.setContentHandler(revision_handler) 133 | xml_parser.setErrorHandler(wiki_err_handler) 134 | xml_parser.parse(input_file) 135 | 136 | -------------------------------------------------------------------------------- /reader/extract_revisions_new.py: -------------------------------------------------------------------------------- 1 | import re 2 | import logging 3 | from wikiextractor.extract import Extractor 4 | 5 | extractor = Extractor('##NoName##') 6 | 7 | 8 | def clean_revison(revision): 9 | # fix text 10 | revision = '\n'.join(revision) 11 | 12 | m = re.search(r'(.*?)', revision, flags=re.DOTALL) 13 | if m: 14 | text = m.group(1) 15 | else: 16 | logging.warning('Missing text element') 17 | return None 18 | 19 | text = extractor.extract(text) 20 | m = re.search(r'(.*?)', revision) 21 | timestamp = 'none' 22 | if m: 23 | timestamp = m.group(1) 24 | return (timestamp,text) 25 | 26 | def extract_revisions(fname): 27 | revision_cnt = 0 28 | revison_content = [] 29 | revison_area = False 30 | if isinstance(fname,str): 31 | fname = open(fname) 32 | for line in fname: 33 | if'' in line: 34 | # 如果revision有内容,那么肯定哪里出错了,直接丢弃数据 35 | if revison_content: 36 | revison_content = [] 37 | revison_area = True 38 | 39 | if '' in line: 40 | revision_cnt += 1 41 | if revision_cnt % 100 == 0: 42 | print(fname, 'revision cnt', revision_cnt) 43 | revison_content.append(line) 44 | fixed = clean_revison(revison_content) 45 | if fixed is not None: 46 | yield fixed 47 | revison_content = [] 48 | revison_area = False 49 | continue 50 | 51 | if revison_area: 52 | revison_content.append(line) 53 | 54 | 55 | 56 | if __name__ == '__main__': 57 | import argparse 58 | arg_parser = argparse.ArgumentParser(description='Script for dividing the large XML revision dump into individual page revisions.') 59 | arg_parser.add_argument('input_dir', help='Input dir') 60 | arg_parser.add_argument('input_file', help='Input file') 61 | arg_parser.add_argument('output_dir', help='Output dir') 62 | args = arg_parser.parse_args() 63 | 64 | 65 | input_file = args.input_dir + '/' + args.input_file 66 | output_file = args.output_dir + '/' + args.input_file 67 | with open(output_file,'w') as f: 68 | for timestamp,text in extract_revisions(input_file): 69 | if len(text) > 10: 70 | f.write('\n\n[Revision timestamp: ' + timestamp + ']\n\n') 71 | f.write(text) 72 | 73 | 74 | -------------------------------------------------------------------------------- /reader/extract_spelling_errors.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """Extracts spelling errors from revision history. 4 | 5 | """ 6 | 7 | import codecs 8 | import re 9 | import utils 10 | 11 | class RevisionSentence(object): 12 | """Class for representing an error sentence together with original sentence. 13 | 14 | """ 15 | def __init__(self, orig_tokens): 16 | self.orig_tokens = orig_tokens 17 | self.err_sen = [] 18 | 19 | def add_err_sentence(self, err_tokens): 20 | self.err_sen.append(err_tokens) 21 | 22 | def contains_spelling_errors(self): 23 | """Whether the earlier revisions of the same sentences have spelling errors. 24 | 25 | Returns: 26 | bool: True or False 27 | 28 | """ 29 | if len(self.err_sen) > 0: 30 | return True 31 | else: 32 | return False 33 | 34 | class ErrorCorpus(object): 35 | """Class for representing the original text data with spelling errors. 
36 | 37 | """ 38 | lang = 'english' 39 | max_dist = 3 40 | min_sen_len = 3 41 | 42 | def __init__(self, lang='english', max_edit_distance=3, min_sen_len=3): 43 | self.corpus = None 44 | self.num_rev = 0 45 | self.lang = lang 46 | self.max_edit = max_edit_distance 47 | self.min_sen_len = min_sen_len 48 | 49 | def create_corpus_from_wiki(self, corpus_root, filename, output_dir): 50 | create_error_corpus = False 51 | valid_word_pat = r'(?u)^\w+$' 52 | sentences = utils.get_sentences_for_text(corpus_root, filename) 53 | if sentences == None: 54 | return 55 | top_rev = [] 56 | top_rev_with_err = [] 57 | try: 58 | for s_list in sentences: 59 | s = ''.join(s_list) 60 | if s.startswith('[Revision timestamp:'): 61 | self.num_rev += 1 62 | else: 63 | if self.num_rev == 1: 64 | if len(s_list) >= self.min_sen_len: 65 | rev_sen = RevisionSentence(s_list) 66 | top_rev.append(rev_sen) 67 | elif self.num_rev > 1: 68 | for r in top_rev: 69 | if len(s_list) == len(r.orig_tokens): 70 | valid_errors = True 71 | errors = False 72 | old_curr_rev_sen = zip(r.orig_tokens, s_list) 73 | for t in old_curr_rev_sen: 74 | dist = utils.levenshtein_distance(t[0], t[1]) 75 | if dist > 0 and dist <= self.max_dist: 76 | # token must be a word 77 | orig_uni = utils.to_unicode_or_bust(t[0]) 78 | match = re.search(valid_word_pat, orig_uni) 79 | if match: 80 | errors = True 81 | elif dist > self.max_dist: 82 | valid_errors = False 83 | break 84 | if errors == True and valid_errors == True: 85 | print('errr') 86 | r.add_err_sentence(s_list) 87 | create_error_corpus = True 88 | break 89 | except AssertionError: 90 | print('Empty file') 91 | 92 | if create_error_corpus == True: 93 | with codecs.open(output_dir + '/' + filename, 'w', 'utf-8', errors='ignore') as f: 94 | for r in top_rev: 95 | if r.contains_spelling_errors() == True: 96 | orig_sen = ' '.join(r.orig_tokens) 97 | err_as_sen = map(lambda x: ' '.join(x), r.err_sen) 98 | orig_err_sen = [orig_sen] + list(err_as_sen) 99 | to_write_uni = '####'.join(orig_err_sen) 100 | f.write(to_write_uni + u'\n') 101 | 102 | if __name__ == '__main__': 103 | import argparse 104 | arg_parser = argparse.ArgumentParser(description='Script for extracting spelling errors from a revision history') 105 | arg_parser.add_argument('corpus_root', help='The directory in which the revision file exists') 106 | arg_parser.add_argument('input_file', help='Revision file') 107 | arg_parser.add_argument('output_dir', help='Output directory') 108 | arg_parser.add_argument('lang', help='Language of the text data') 109 | arg_parser.add_argument('max_edit', help='Maximum edit distance between the correct word and the misspelled work') 110 | 111 | args = arg_parser.parse_args() 112 | err_corpus = ErrorCorpus(args.lang.lower(), args.max_edit) 113 | err_corpus.create_corpus_from_wiki(args.corpus_root, args.input_file, args.output_dir) 114 | 115 | #import os 116 | #corpus_root = '/net/cluster/TMP/loganathan/wiki_dump/cs/processing/stage3' 117 | #for root, dirnames, filenames in os.walk(corpus_root): 118 | # for f in filenames: 119 | # err_corpus = ErrorCorpus() 120 | # print 'Extracting errors from: ', f 121 | # err_corpus.create_corpus_from_wiki(corpus_root, f, '') 122 | 123 | #corpus_root = '/net/cluster/TMP/loganathan/wiki_dump/cs/processing/tmp_out' 124 | #file_name = 'hello.txt' 125 | #err_corpus = ErrorCorpus() 126 | #err_corpus.create_corpus_from_wiki(corpus_root, file_name, '') 127 | -------------------------------------------------------------------------------- /reader/extract_spelling_errors_new.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import codecs 3 | import utils 4 | import re 5 | import sys 6 | from nltk.metrics import edit_distance 7 | import opencc 8 | import string 9 | import math 10 | converter = opencc.OpenCC('t2s.json') 11 | 12 | 13 | def hasNumbers(inputString): 14 | return bool(re.search(r'\d',inputString)) 15 | 16 | def hasBrackets(inputString): 17 | return bool(re.search(r'\[|\]|\)|\(',inputString)) 18 | 19 | def hasAlphabets(inputString): 20 | return bool(re.search(r'[a-zA-Z]',inputString)) 21 | 22 | def hasSpecialCharacters(inputString): 23 | return bool(re.search(r'[\|\s]',inputString)) 24 | 25 | def create_files(fname): 26 | # files = codecs.open(output+ '/' + fname + "_spelling_error.txt","w", encoding='utf-8') 27 | cf = codecs.open(output+ '/' + fname + "_orig_sen.txt","w", encoding='utf-8') 28 | ef = codecs.open(output+ '/' + fname + "_error_sen.txt","w", encoding='utf-8') 29 | return ef,cf 30 | 31 | def check_error(earlier,current, srcs,tgts,number_of_edits): 32 | earlier = utils.split_sentence(earlier) 33 | current = utils.split_sentence(current) 34 | if len(earlier)==len(current): 35 | for j in range(0, len(earlier)): 36 | f=0 37 | earlier_words = earlier[j] 38 | current_words = current[j] 39 | if earlier_words == current_words: 40 | continue 41 | if len(earlier_words) < 5: 42 | continue 43 | if sum([1 if utils.is_chinese_char(x) else 0 for x in current_words]) / len(current_words) <= 0.7: 44 | continue 45 | 46 | if(len(earlier_words) == len(current_words)): 47 | for k in range(0,len(earlier_words)): 48 | if earlier_words[k]==current_words[k]: 49 | continue 50 | elif utils.is_chinese_char(earlier_words[k]): 51 | f += 1 52 | 53 | thr = min(max(math.ceil(number_of_edits * len(current_words)),1),10) 54 | if(1<=f<=thr): 55 | srcs.append(earlier[j]) 56 | tgts.append(current[j]) 57 | 58 | if __name__ == '__main__': 59 | 60 | source = sys.argv[1]+"/" 61 | source += sys.argv[2] 62 | language = sys.argv[4] 63 | number_of_edits = float(sys.argv[5]) 64 | output = sys.argv[3] 65 | 66 | files,files_2,files_3 = None,None,None 67 | revisions = [] 68 | line = [] 69 | f=0 70 | 71 | srcs,tgts = [],[] 72 | pre_revision = '' 73 | current_revision = '' 74 | cnt = 0 75 | for line in open(source): 76 | line = converter.convert(line) 77 | if "Revision timestamp" in line: 78 | if current_revision: 79 | if pre_revision: 80 | cnt += 1 81 | if cnt % 100 == 0: 82 | print(source,'processed',cnt) 83 | check_error(pre_revision,current_revision.strip(),srcs,tgts,number_of_edits) 84 | pre_revision = current_revision.strip() 85 | current_revision = '' 86 | else: 87 | current_revision += line 88 | if current_revision and pre_revision: 89 | check_error(pre_revision,current_revision.strip(),srcs,tgts,number_of_edits) 90 | 91 | 92 | if srcs: 93 | ef,cf = create_files(sys.argv[2]) 94 | errors = set() 95 | for src,tgt in zip(srcs[::-1],tgts[::-1]): 96 | if src in errors or tgt in errors: 97 | continue 98 | errors.add(src) 99 | errors.add(tgt) 100 | ef.write(src + '\n') 101 | cf.write(tgt + '\n') 102 | 103 | ef.close() 104 | cf.close() 105 | -------------------------------------------------------------------------------- /reader/fix_extracted.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """Fixes output of WikiExtractor.py 4 | 5 | """ 6 | 7 | import argparse 8 | import codecs 9 | import re 10 | 11 | import xml.sax 12 | from xml.sax.handler import 
ContentHandler 13 | handler = ContentHandler() 14 | 15 | def fix_revison(revision): 16 | # fix text 17 | text_area = False 18 | new_revison = [] 19 | for x in revision: 20 | if '' in x: 21 | text_area = True 22 | 23 | # No 24 | if text_area and '' in x and '' not in x: 25 | text_area = False 26 | new_revison.append('') 27 | 28 | if text_area: 29 | # 将text里面的<符号替换掉 30 | x = x.replace('','##LOSPR##').replace('','##ROSPR##').replace('<','#') 31 | x = x.replace('##LOSPR##','').replace('##ROSPR##','') 32 | 33 | if '' in x: 34 | text_area = False 35 | 36 | new_revison.append(x) 37 | 38 | try: 39 | xml.sax.parseString('\n'.join(new_revison),handler) 40 | except: 41 | return None 42 | return new_revison 43 | 44 | def fix_extraction(input_dir, input_file, output_dir): 45 | with codecs.open(input_dir + '/' + input_file, 'r', encoding='utf-8') as f: 46 | contents = f.read() 47 | contents = contents.replace("&","&").replace('<','<').replace('>','>').replace('"','"').replace(''','\'') 48 | contents = re.sub(r'<\/text>\s*', '##LOSPR##', contents) 49 | contents = re.sub(r'', '\n\t', contents) 50 | contents = re.sub(r'##LOSPR##', '\n\t', contents) 51 | 52 | # HTML entities 53 | contents = re.sub(r'&', '&', contents) 54 | 55 | # Remove HTML tags if not removed already 56 | tag_pat1 = (r'<\/?(textarea|select|strong|center|option|' 57 | r'input|param|small|style|table|tbody|thead|tfoot|' 58 | r'body|head|html|span|font|form|' 59 | r'div|img|var|pre|sub|sup|var|ref|wiki|' 60 | r'br|dl|dt|dd|em|h[1-6]|hr|li|ol|td|tr|th|ul|a|b|p|q|u)>' 61 | ) 62 | contents = re.sub(tag_pat1, '', contents) 63 | 64 | # remove bad revisions 65 | new_content = [] 66 | revison_content = [] 67 | revison_area = False 68 | 69 | for line in contents.splitlines(): 70 | if'' in line: 71 | # 如果revision有内容,那么肯定哪里出错了,直接丢弃数据 72 | if revison_content: 73 | revison_content = [] 74 | revison_area = True 75 | 76 | if '' in line: 77 | revison_content.append(line) 78 | fixed = fix_revison(revison_content) 79 | if fixed is not None: 80 | new_content.extend(fixed) 81 | revison_content = [] 82 | revison_area = False 83 | continue 84 | 85 | if revison_area: 86 | revison_content.append(line) 87 | else: 88 | new_content.append(line) 89 | with codecs.open(output_dir + '/' + input_file, 'w', encoding='utf-8') as fw: 90 | fw.write('\n'.join(new_content)) 91 | 92 | 93 | if __name__ == '__main__': 94 | arg_parser = argparse.ArgumentParser(description='Script for fixing WikiExtractor.py outputs') 95 | arg_parser.add_argument('input_dir', help='Input dir') 96 | arg_parser.add_argument('input_file', help='Input file') 97 | arg_parser.add_argument('output_dir', help='Output directory') 98 | args = arg_parser.parse_args() 99 | fix_extraction(args.input_dir, args.input_file, args.output_dir) 100 | -------------------------------------------------------------------------------- /reader/utils.py: -------------------------------------------------------------------------------- 1 | # *-* coding: utf-8 *-* 2 | 3 | """Utility functions. 
4 | 5 | """ 6 | # import nltk.data 7 | # from nltk.tokenize.regexp import WhitespaceTokenizer 8 | # from nltk.corpus import PlaintextCorpusReader 9 | import jieba 10 | import numpy as np 11 | import sys 12 | 13 | import re 14 | from typing import List 15 | 16 | def is_chinese_char(cp): 17 | """Checks whether CP is the codepoint of a CJK character.""" 18 | # This defines a "chinese character" as anything in the CJK Unicode block: 19 | # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) 20 | # 21 | # Note that the CJK Unicode block is NOT all Japanese and Korean characters, 22 | # despite its name. The modern Korean Hangul alphabet is a different block, 23 | # as is Japanese Hiragana and Katakana. Those alphabets are used to write 24 | # space-separated words, so they are not treated specially and handled 25 | # like the all of the other languages. 26 | cp = ord(cp) 27 | if ((cp >= 0x4E00 and cp <= 0x9FFF) or # 28 | (cp >= 0x3400 and cp <= 0x4DBF) or # 29 | (cp >= 0x20000 and cp <= 0x2A6DF) or # 30 | (cp >= 0x2A700 and cp <= 0x2B73F) or # 31 | (cp >= 0x2B740 and cp <= 0x2B81F) or # 32 | (cp >= 0x2B820 and cp <= 0x2CEAF) or 33 | (cp >= 0xF900 and cp <= 0xFAFF) or # 34 | (cp >= 0x2F800 and cp <= 0x2FA1F)): # 35 | return True 36 | 37 | return False 38 | 39 | def split_sentence(document: str, flag: str = "all", limit: int = 510) -> List[str]: 40 | """ 41 | Args: 42 | document: 43 | flag: Type:str, "all" 中英文标点分句,"zh" 中文标点分句,"en" 英文标点分句 44 | limit: 默认单句最大长度为510个字符 45 | Returns: Type:list 46 | """ 47 | sent_list = [] 48 | try: 49 | if flag == "zh": 50 | document = re.sub('(?P([。?!…](?![”’"\'])))', r'\g\n', document) # 单字符断句符 51 | document = re.sub('(?P([。?!]|…{1,2})[”’"\'])', r'\g\n', document) # 特殊引号 52 | elif flag == "en": 53 | document = re.sub('(?P([.?!](?![”’"\'])))', r'\g\n', document) # 英文单字符断句符 54 | document = re.sub('(?P([?!.]["\']))', r'\g\n', document) # 特殊引号 55 | else: 56 | document = re.sub('(?P([。?!….?!](?![”’"\'])))', r'\g\n', document) # 单字符断句符 57 | document = re.sub('(?P(([。?!.!?]|…{1,2})[”’"\']))', r'\g\n', 58 | document) # 特殊引号 59 | 60 | sent_list_ori = document.splitlines() 61 | for sent in sent_list_ori: 62 | sent = sent.strip() 63 | if not sent: 64 | continue 65 | else: 66 | while len(sent) > limit: 67 | temp = sent[0:limit] 68 | sent_list.append(temp) 69 | sent = sent[limit:] 70 | sent_list.append(sent) 71 | except: 72 | sent_list.clear() 73 | sent_list.append(document) 74 | return sent_list 75 | 76 | 77 | def to_unicode_or_bust(s, encoding='utf-8'): 78 | """Converts the bytestring in utf-8 to Unicode. 79 | 80 | Credit: Method from 'Unicode in Python, Completely Demystified'. 81 | 82 | Args: 83 | s: Bytestring 84 | encoding: Encoding 85 | 86 | Returns: 87 | Return the Unicode version of the given bytestring 88 | 89 | """ 90 | # if isinstance(s, str): 91 | # if not isinstance(s, unicode): 92 | # s = unicode(s, encoding) 93 | return s 94 | 95 | 96 | def get_sentences_for_text(corpus_root, filename, lang='english'): 97 | """Segments the given text into sentences. 98 | 99 | Args: 100 | corpus_root: Directory in which the text file is residing. 101 | filename: Name of the text file. 102 | lang: Tokenizer language. For possible values, look at: 103 | ${NLTK_DATA}/tokenizers/punkt 104 | 105 | Returns: 106 | Sentences in the given text. 
107 | 108 | """ 109 | sents = [] 110 | for s in split_sentence(open(corpus_root + '/' + filename).read()): 111 | sents.append(jieba.lcut(s)) 112 | return sents 113 | # tokenizer_path = 'tokenizers/punkt/' + lang + '.pickle' 114 | # text = PlaintextCorpusReader(corpus_root, [filename], word_tokenizer=WhitespaceTokenizer(), 115 | # sent_tokenizer=nltk.data.LazyLoader(tokenizer_path)) 116 | # return text.sents() 117 | 118 | def levenshtein_distance(s, t): 119 | """Minimum edit distance between two strings. 120 | 121 | Args: 122 | s: Source string 123 | t: Target string 124 | 125 | Returns: 126 | int: Minimum edit distance between the two input strings. 127 | 128 | """ 129 | m = len(s) 130 | n = len(t) 131 | if m == 0: 132 | return n 133 | if n == 0: 134 | return m 135 | d = np.zeros((m+1, n+1)) 136 | d[:, 0] = np.arange(m+1) 137 | d[0, :] = np.arange(n+1) 138 | for j in range(1, n+1): 139 | for i in range(1, m+1): 140 | if s[i-1] == t[j-1]: 141 | d[i][j] = d[i-1][j-1] 142 | else: 143 | d[i][j] = min(d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1]+1) 144 | return int(d[m][n]) 145 | 146 | 147 | if __name__ == '__main__': 148 | corpus_root = '/net/cluster/TMP/loganathan/wiki_dump/cs/processing/stage3' 149 | file_name = '0000000007.xml' 150 | sentences = get_sentences_for_text(corpus_root, file_name) 151 | # try: 152 | # for s in sentences: 153 | # print s 154 | # print '\n----END----' 155 | # except AssertionError: 156 | # print 'Empty file' 157 | 158 | -------------------------------------------------------------------------------- /reader/wikiextractor/WikiExtractor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # ============================================================================= 5 | # Version: 3.0 (July 22, 2020) 6 | # Author: Giuseppe Attardi (attardi@di.unipi.it), University of Pisa 7 | # 8 | # Contributors: 9 | # Antonio Fuschetto (fuschett@aol.com) 10 | # Leonardo Souza (lsouza@amtera.com.br) 11 | # Juan Manuel Caicedo (juan@cavorite.com) 12 | # Humberto Pereira (begini@gmail.com) 13 | # Siegfried-A. Gevatter (siegfried@gevatter.com) 14 | # Pedro Assis (pedroh2306@gmail.com) 15 | # Wim Muskee (wimmuskee@gmail.com) 16 | # Radics Geza (radicsge@gmail.com) 17 | # Nick Ulven (nulven@github) 18 | # 19 | # ============================================================================= 20 | # Copyright (c) 2009-2020. Giuseppe Attardi (attardi@di.unipi.it). 21 | # ============================================================================= 22 | # This file is part of Tanl. 23 | # 24 | # Tanl is free software; you can redistribute it and/or modify it 25 | # under the terms of the GNU Affero General Public License, version 3, 26 | # as published by the Free Software Foundation. 27 | # 28 | # Tanl is distributed in the hope that it will be useful, 29 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 30 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 31 | # GNU Affero General Public License for more details. 32 | # 33 | # You should have received a copy of the GNU Affero General Public License 34 | # along with this program. If not, see . 35 | # ============================================================================= 36 | 37 | """Wikipedia Extractor: 38 | Extracts and cleans text from a Wikipedia database dump and stores output in a 39 | number of files of similar size in a given directory. 
40 | Each file will contain several documents in the format: 41 | 42 | 43 | ... 44 | 45 | 46 | If the program is invoked with the --json flag, then each file will 47 | contain several documents formatted as json ojects, one per line, with 48 | the following structure 49 | 50 | {"id": "", "revid": "", "url": "", "title": "", "text": "..."} 51 | 52 | The program performs template expansion by preprocesssng the whole dump and 53 | collecting template definitions. 54 | """ 55 | 56 | import argparse 57 | import bz2 58 | import logging 59 | import os.path 60 | import re # TODO use regex when it will be standard 61 | import sys 62 | from io import StringIO 63 | from multiprocessing import Queue, get_context, cpu_count 64 | from timeit import default_timer 65 | 66 | from .extract import Extractor, ignoreTag, define_template, acceptedNamespaces 67 | 68 | # =========================================================================== 69 | 70 | # Program version 71 | __version__ = '3.0.6' 72 | 73 | ## 74 | # Defined in 75 | # We include as default Template, when loading external template file. 76 | knownNamespaces = set(['Template']) 77 | 78 | ## 79 | # The namespace used for template definitions 80 | # It is the name associated with namespace key=10 in the siteinfo header. 81 | templateNamespace = '' 82 | templatePrefix = '' 83 | 84 | ## 85 | # The namespace used for module definitions 86 | # It is the name associated with namespace key=828 in the siteinfo header. 87 | moduleNamespace = '' 88 | 89 | # ---------------------------------------------------------------------- 90 | # Modules 91 | 92 | # Only minimal support 93 | # FIXME: import Lua modules. 94 | 95 | modules = { 96 | 'convert': { 97 | 'convert': lambda x, u, *rest: x + ' ' + u, # no conversion 98 | } 99 | } 100 | # ---------------------------------------------------------------------- 101 | # Expand using WikiMedia API 102 | # import json 103 | 104 | # def expandTemplates(text): 105 | # """Expand templates invoking MediaWiki API""" 106 | # text = urlib.urlencodew(text) 107 | # base = urlbase[:urlbase.rfind('/')] 108 | # url = base + "/w/api.php?action=expandtemplates&format=json&text=" + text 109 | # exp = json.loads(urllib.urlopen(url)) 110 | # return exp['expandtemplates']['*'] 111 | 112 | # ------------------------------------------------------------------------------ 113 | # Output 114 | 115 | 116 | class NextFile(): 117 | 118 | """ 119 | Synchronous generation of next available file name. 120 | """ 121 | 122 | filesPerDir = 100 123 | 124 | def __init__(self, path_name): 125 | self.path_name = path_name 126 | self.dir_index = -1 127 | self.file_index = -1 128 | 129 | def next(self): 130 | self.file_index = (self.file_index + 1) % NextFile.filesPerDir 131 | if self.file_index == 0: 132 | self.dir_index += 1 133 | dirname = self._dirname() 134 | if not os.path.isdir(dirname): 135 | os.makedirs(dirname) 136 | return self._filepath() 137 | 138 | def _dirname(self): 139 | char1 = self.dir_index % 26 140 | char2 = int(self.dir_index / 26) % 26 141 | return os.path.join(self.path_name, '%c%c' % (ord('A') + char2, ord('A') + char1)) 142 | 143 | def _filepath(self): 144 | return '%s/wiki_%02d' % (self._dirname(), self.file_index) 145 | 146 | 147 | class OutputSplitter(): 148 | 149 | """ 150 | File-like object, that splits output to multiple files of a given max size. 151 | """ 152 | 153 | def __init__(self, nextFile, max_file_size=0, compress=True): 154 | """ 155 | :param nextFile: a NextFile object from which to obtain filenames 156 | to use. 
157 | :param max_file_size: the maximum size of each file. 158 | :para compress: whether to write data with bzip compression. 159 | """ 160 | self.nextFile = nextFile 161 | self.compress = compress 162 | self.max_file_size = max_file_size 163 | self.file = self.open(self.nextFile.next()) 164 | 165 | def reserve(self, size): 166 | if self.file.tell() + size > self.max_file_size: 167 | self.close() 168 | self.file = self.open(self.nextFile.next()) 169 | 170 | def write(self, data): 171 | self.reserve(len(data)) 172 | if self.compress: 173 | self.file.write(data) 174 | else: 175 | self.file.write(data) 176 | 177 | def close(self): 178 | self.file.close() 179 | 180 | def open(self, filename): 181 | if self.compress: 182 | return bz2.BZ2File(filename + '.bz2', 'w') 183 | else: 184 | return open(filename, 'w') 185 | 186 | 187 | # ---------------------------------------------------------------------- 188 | # READER 189 | 190 | tagRE = re.compile(r'(.*?)<(/?\w+)[^>]*>(?:([^<]*)(<.*?>)?)?') 191 | # 1 2 3 4 192 | 193 | 194 | def load_templates(file, output_file=None): 195 | """ 196 | Load templates from :param file:. 197 | :param output_file: file where to save templates and modules. 198 | """ 199 | global templateNamespace, templatePrefix 200 | templatePrefix = templateNamespace + ':' 201 | global moduleNamespace, modulePrefix 202 | modulePrefix = moduleNamespace + ':' 203 | articles = 0 204 | templates = 0 205 | page = [] 206 | inText = False 207 | if output_file: 208 | output = open(output_file, 'w') 209 | for line in file: 210 | #line = line.decode('utf-8') 211 | if '<' not in line: # faster than doing re.search() 212 | if inText: 213 | page.append(line) 214 | continue 215 | m = tagRE.search(line) 216 | if not m: 217 | continue 218 | tag = m.group(2) 219 | if tag == 'page': 220 | page = [] 221 | elif tag == 'title': 222 | title = m.group(3) 223 | elif tag == 'text': 224 | inText = True 225 | line = line[m.start(3):m.end(3)] 226 | page.append(line) 227 | if m.lastindex == 4: # open-close 228 | inText = False 229 | elif tag == '/text': 230 | if m.group(1): 231 | page.append(m.group(1)) 232 | inText = False 233 | elif inText: 234 | page.append(line) 235 | elif tag == '/page': 236 | if not output_file and not templateNamespace: # do not know it yet 237 | # we reconstruct it from the first title 238 | colon = title.find(':') 239 | if colon > 1: 240 | templateNamespace = title[:colon] 241 | templatePrefix = title[:colon + 1] 242 | # FIXME: should reconstruct also moduleNamespace 243 | if title.startswith(templatePrefix): 244 | define_template(title, page) 245 | templates += 1 246 | # save templates and modules to file 247 | if output_file and (title.startswith(templatePrefix) or 248 | title.startswith(modulePrefix)): 249 | output.write('\n') 250 | output.write(' %s\n' % title) 251 | output.write(' 10\n') 252 | output.write(' ') 253 | for line in page: 254 | output.write(line) 255 | output.write(' \n') 256 | output.write('\n') 257 | page = [] 258 | articles += 1 259 | if articles % 100000 == 0: 260 | logging.info("Preprocessed %d pages", articles) 261 | if output_file: 262 | output.close() 263 | logging.info("Saved %d templates to '%s'", templates, output_file) 264 | return templates 265 | 266 | 267 | def decode_open(filename, mode='rt', encoding='utf-8'): 268 | """ 269 | Open a file, decode and decompress, depending on extension `gz`, or 'bz2`. 270 | :param filename: the file to open. 
271 | """ 272 | ext = os.path.splitext(filename)[1] 273 | if ext == '.gz': 274 | import gzip 275 | return gzip.open(filename, mode, encoding=encoding) 276 | elif ext == '.bz2': 277 | return bz2.open(filename, mode=mode, encoding=encoding) 278 | else: 279 | return open(filename, mode, encoding=encoding) 280 | 281 | 282 | def process_dump(input_file, template_file, out_file, file_size, file_compress, 283 | process_count, html_safe): 284 | """ 285 | :param input_file: name of the wikipedia dump file; '-' to read from stdin 286 | :param template_file: optional file with template definitions. 287 | :param out_file: directory where to store extracted data, or '-' for stdout 288 | :param file_size: max size of each extracted file, or None for no max (one file) 289 | :param file_compress: whether to compress files with bzip. 290 | :param process_count: number of extraction processes to spawn. 291 | """ 292 | global knownNamespaces 293 | global templateNamespace, templatePrefix 294 | global moduleNamespace, modulePrefix 295 | 296 | urlbase = '' # This is obtained from 297 | 298 | input = decode_open(input_file) 299 | 300 | # collect siteinfo 301 | for line in input: 302 | line = line #.decode('utf-8') 303 | m = tagRE.search(line) 304 | if not m: 305 | continue 306 | tag = m.group(2) 307 | if tag == 'base': 308 | # discover urlbase from the xml dump file 309 | # /mediawiki/siteinfo/base 310 | base = m.group(3) 311 | urlbase = base[:base.rfind("/")] 312 | elif tag == 'namespace': 313 | knownNamespaces.add(m.group(3)) 314 | if re.search('key="10"', line): 315 | templateNamespace = m.group(3) 316 | templatePrefix = templateNamespace + ':' 317 | elif re.search('key="828"', line): 318 | moduleNamespace = m.group(3) 319 | modulePrefix = moduleNamespace + ':' 320 | elif tag == '/siteinfo': 321 | break 322 | 323 | if expand_templates: 324 | # preprocess 325 | template_load_start = default_timer() 326 | if template_file and os.path.exists(template_file): 327 | logging.info("Preprocessing '%s' to collect template definitions: this may take some time.", template_file) 328 | file = decode_open(template_file) 329 | templates = load_templates(file) 330 | file.close() 331 | else: 332 | if input_file == '-': 333 | # can't scan then reset stdin; must error w/ suggestion to specify template_file 334 | raise ValueError("to use templates with stdin dump, must supply explicit template-file") 335 | logging.info("Preprocessing '%s' to collect template definitions: this may take some time.", input_file) 336 | templates = load_templates(input, template_file) 337 | input.close() 338 | input = decode_open(input_file) 339 | template_load_elapsed = default_timer() - template_load_start 340 | logging.info("Loaded %d templates in %.1fs", templates, template_load_elapsed) 341 | 342 | if out_file == '-': 343 | output = sys.stdout 344 | if file_compress: 345 | logging.warn("writing to stdout, so no output compression (use an external tool)") 346 | else: 347 | nextFile = NextFile(out_file) 348 | output = OutputSplitter(nextFile, file_size, file_compress) 349 | 350 | # process pages 351 | logging.info("Starting page extraction from %s.", input_file) 352 | extract_start = default_timer() 353 | 354 | # Parallel Map/Reduce: 355 | # - pages to be processed are dispatched to workers 356 | # - a reduce process collects the results, sort them and print them. 
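    # - each job is a tuple (id, revid, urlbase, title, page, ordinal); workers push
    #   (ordinal, extracted_text) onto output_queue, and reduce_process buffers and
    #   writes results back in ordinal order, so the final output preserves the page
    #   order of the dump even though extraction itself runs in parallel.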
357 | 358 | # fixes MacOS error: TypeError: cannot pickle '_io.TextIOWrapper' object 359 | Process = get_context("fork").Process 360 | 361 | maxsize = 10 * process_count 362 | # output queue 363 | output_queue = Queue(maxsize=maxsize) 364 | 365 | # Reduce job that sorts and prints output 366 | reduce = Process(target=reduce_process, args=(output_queue, output)) 367 | reduce.start() 368 | 369 | # initialize jobs queue 370 | jobs_queue = Queue(maxsize=maxsize) 371 | 372 | # start worker processes 373 | logging.info("Using %d extract processes.", process_count) 374 | workers = [] 375 | for _ in range(max(1, process_count)): 376 | extractor = Process(target=extract_process, 377 | args=(jobs_queue, output_queue, html_safe)) 378 | extractor.daemon = True # only live while parent process lives 379 | extractor.start() 380 | workers.append(extractor) 381 | 382 | # Mapper process 383 | 384 | # we collect individual lines, since str.join() is significantly faster 385 | # than concatenation 386 | page = [] 387 | id = '' 388 | revid = '' 389 | last_id = '' 390 | ordinal = 0 # page count 391 | inText = False 392 | redirect = False 393 | for line in input: 394 | if '<' not in line: # faster than doing re.search() 395 | if inText: 396 | page.append(line) 397 | continue 398 | m = tagRE.search(line) 399 | if not m: 400 | continue 401 | tag = m.group(2) 402 | if tag == 'page': 403 | page = [] 404 | redirect = False 405 | elif tag == 'id' and not id: 406 | id = m.group(3) 407 | elif tag == 'id' and id: # 408 | revid = m.group(3) 409 | elif tag == 'title': 410 | title = m.group(3) 411 | elif tag == 'redirect': 412 | redirect = True 413 | elif tag == 'text': 414 | inText = True 415 | line = line[m.start(3):m.end(3)] 416 | page.append(line) 417 | if m.lastindex == 4: # open-close 418 | inText = False 419 | elif tag == '/text': 420 | if m.group(1): 421 | page.append(m.group(1)) 422 | inText = False 423 | elif inText: 424 | page.append(line) 425 | elif tag == '/page': 426 | colon = title.find(':') 427 | if (colon < 0 or (title[:colon] in acceptedNamespaces) and id != last_id and 428 | not redirect and not title.startswith(templateNamespace)): 429 | job = (id, revid, urlbase, title, page, ordinal) 430 | jobs_queue.put(job) # goes to any available extract_process 431 | last_id = id 432 | ordinal += 1 433 | id = '' 434 | revid = '' 435 | page = [] 436 | 437 | input.close() 438 | 439 | # signal termination 440 | for _ in workers: 441 | jobs_queue.put(None) 442 | # wait for workers to terminate 443 | for w in workers: 444 | w.join() 445 | 446 | # signal end of work to reduce process 447 | output_queue.put(None) 448 | # wait for it to finish 449 | reduce.join() 450 | 451 | if output != sys.stdout: 452 | output.close() 453 | extract_duration = default_timer() - extract_start 454 | extract_rate = ordinal / extract_duration 455 | logging.info("Finished %d-process extraction of %d articles in %.1fs (%.1f art/s)", 456 | process_count, ordinal, extract_duration, extract_rate) 457 | 458 | 459 | # ---------------------------------------------------------------------- 460 | # Multiprocess support 461 | 462 | 463 | def extract_process(jobs_queue, output_queue, html_safe): 464 | """Pull tuples of raw page content, do CPU/regex-heavy fixup, push finished text 465 | :param jobs_queue: where to get jobs. 466 | :param output_queue: where to queue extracted text for output. 467 | :html_safe: whether to convert entities in text to HTML. 
468 | """ 469 | while True: 470 | job = jobs_queue.get() # job is (id, revid, urlbase, title, page, ordinal) 471 | if job: 472 | out = StringIO() # memory buffer 473 | Extractor(*job[:-1]).extract(out, html_safe) # (id, urlbase, title, page) 474 | text = out.getvalue() 475 | output_queue.put((job[-1], text)) # (ordinal, extracted_text) 476 | out.close() 477 | else: 478 | break 479 | 480 | 481 | def reduce_process(output_queue, output): 482 | """Pull finished article text, write series of files (or stdout) 483 | :param output_queue: text to be output. 484 | :param output: file object where to print. 485 | """ 486 | 487 | interval_start = default_timer() 488 | period = 100000 489 | # FIXME: use a heap 490 | ordering_buffer = {} # collected pages 491 | next_ordinal = 0 # sequence number of pages 492 | while True: 493 | if next_ordinal in ordering_buffer: 494 | output.write(ordering_buffer.pop(next_ordinal)) 495 | next_ordinal += 1 496 | # progress report 497 | if next_ordinal % period == 0: 498 | interval_rate = period / (default_timer() - interval_start) 499 | logging.info("Extracted %d articles (%.1f art/s)", 500 | next_ordinal, interval_rate) 501 | interval_start = default_timer() 502 | else: 503 | # mapper puts None to signal finish 504 | pair = output_queue.get() 505 | if not pair: 506 | break 507 | ordinal, text = pair 508 | ordering_buffer[ordinal] = text 509 | 510 | 511 | # ---------------------------------------------------------------------- 512 | 513 | # Minimum size of output files 514 | minFileSize = 200 * 1024 515 | 516 | 517 | def main(): 518 | global urlbase, acceptedNamespaces 519 | global expand_templates, templateCache 520 | 521 | parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]), 522 | formatter_class=argparse.RawDescriptionHelpFormatter, 523 | description=__doc__) 524 | parser.add_argument("input", 525 | help="XML wiki dump file") 526 | groupO = parser.add_argument_group('Output') 527 | groupO.add_argument("-o", "--output", default="text", 528 | help="directory for extracted files (or '-' for dumping to stdout)") 529 | groupO.add_argument("-b", "--bytes", default="1M", 530 | help="maximum bytes per output file (default %(default)s); 0 means to put a single article per file", 531 | metavar="n[KMG]") 532 | groupO.add_argument("-c", "--compress", action="store_true", 533 | help="compress output files using bzip") 534 | groupO.add_argument("--json", action="store_true", 535 | help="write output in json format instead of the default format") 536 | 537 | groupP = parser.add_argument_group('Processing') 538 | groupP.add_argument("--html", action="store_true", 539 | help="produce HTML output, subsumes --links") 540 | groupP.add_argument("-l", "--links", action="store_true", 541 | help="preserve links") 542 | groupP.add_argument("-ns", "--namespaces", default="", metavar="ns1,ns2", 543 | help="accepted namespaces") 544 | groupP.add_argument("--templates", 545 | help="use or create file containing templates") 546 | groupP.add_argument("--no-templates", action="store_false", 547 | help="Do not expand templates") 548 | groupP.add_argument("--html-safe", default=True, 549 | help="use to produce HTML safe output within ...") 550 | default_process_count = cpu_count() - 1 551 | parser.add_argument("--processes", type=int, default=default_process_count, 552 | help="Number of processes to use (default %(default)s)") 553 | 554 | groupS = parser.add_argument_group('Special') 555 | groupS.add_argument("-q", "--quiet", action="store_true", 556 | help="suppress reporting 
progress info") 557 | groupS.add_argument("--debug", action="store_true", 558 | help="print debug info") 559 | groupS.add_argument("-a", "--article", action="store_true", 560 | help="analyze a file containing a single article (debug option)") 561 | groupS.add_argument("-v", "--version", action="version", 562 | version='%(prog)s ' + __version__, 563 | help="print program version") 564 | 565 | args = parser.parse_args() 566 | 567 | Extractor.keepLinks = args.links 568 | Extractor.HtmlFormatting = args.html 569 | if args.html: 570 | Extractor.keepLinks = True 571 | Extractor.to_json = args.json 572 | 573 | expand_templates = args.no_templates 574 | 575 | try: 576 | power = 'kmg'.find(args.bytes[-1].lower()) + 1 577 | # 0 bytes means put a single article per file. 578 | file_size = 0 if args.bytes == '0' else int(args.bytes[:-1]) * 1024 ** power 579 | if file_size and file_size < minFileSize: 580 | raise ValueError() 581 | except ValueError: 582 | logging.error('Insufficient or invalid size: %s', args.bytes) 583 | return 584 | 585 | if args.namespaces: 586 | acceptedNamespaces = set(args.namespaces.split(',')) 587 | 588 | FORMAT = '%(levelname)s: %(message)s' 589 | logging.basicConfig(format=FORMAT) 590 | 591 | logger = logging.getLogger() 592 | if not args.quiet: 593 | logger.setLevel(logging.INFO) 594 | if args.debug: 595 | logger.setLevel(logging.DEBUG) 596 | 597 | input_file = args.input 598 | 599 | if not Extractor.keepLinks: 600 | ignoreTag('a') 601 | 602 | # sharing cache of parser templates is too slow: 603 | # manager = Manager() 604 | # templateCache = manager.dict() 605 | 606 | if args.article: 607 | if args.templates: 608 | if os.path.exists(args.templates): 609 | with open(args.templates) as file: 610 | load_templates(file) 611 | 612 | with open(input_file) as file: 613 | page = file.read() 614 | ids = re.findall(r'(\d*?)', page) 615 | id = ids[0] if ids else '' 616 | revid = ids[1] if len(ids) > 1 else '' 617 | m = re.search(r'(.*?)', page) 618 | if m: 619 | title = m.group(1) 620 | else: 621 | logging.error('Missing title element') 622 | return 623 | m = re.search(r'(.*?)', page) 624 | if m: 625 | base = m.group(1) 626 | urlbase = base[:base.rfind("/")] 627 | else: 628 | urlbase = '' 629 | Extractor(id, revid, urlbase, title, [page]).extract(sys.stdout) 630 | return 631 | 632 | output_path = args.output 633 | if output_path != '-' and not os.path.isdir(output_path): 634 | try: 635 | os.makedirs(output_path) 636 | except: 637 | logging.error('Could not create: %s', output_path) 638 | return 639 | 640 | process_dump(input_file, args.templates, output_path, file_size, 641 | args.compress, args.processes, args.html_safe) 642 | 643 | 644 | if __name__ == '__main__': 645 | main() 646 | -------------------------------------------------------------------------------- /reader/wikiextractor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xueyouluo/wiki-error-extract/7871514c89bb3e6e2ceb314ac4f5eba94d75bb7a/reader/wikiextractor/__init__.py -------------------------------------------------------------------------------- /reader/wikiextractor/cirrus-extract.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # ============================================================================= 5 | # Version: 1.00 (December 15, 2015) 6 | # Author: Giuseppe Attardi (attardi@di.unipi.it), University of Pisa 7 | # 8 | # 
============================================================================= 9 | # Copyright (c) 2015. Giuseppe Attardi (attardi@di.unipi.it). 10 | # ============================================================================= 11 | # This file is part of Tanl. 12 | # 13 | # Tanl is free software; you can redistribute it and/or modify it 14 | # under the terms of the GNU Affero General Public License, version 3, 15 | # as published by the Free Software Foundation. 16 | # 17 | # Tanl is distributed in the hope that it will be useful, 18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 20 | # GNU Affero General Public License for more details. 21 | # 22 | # You should have received a copy of the GNU Affero General Public License 23 | # along with this program. If not, see . 24 | # ============================================================================= 25 | 26 | """Wikipedia Cirrus Extractor: 27 | Extracts and cleans text from a Wikipedia Cirrus dump and stores output in a 28 | number of files of similar size in a given directory. 29 | Each file will contain several documents in the format: 30 | 31 | 32 | ... 33 | 34 | 35 | """ 36 | 37 | import sys, os.path, time 38 | import re 39 | import json 40 | import argparse 41 | import bz2 42 | import gzip 43 | import logging 44 | 45 | # Program version 46 | version = '3.0' 47 | 48 | urlbase = 'http://it.wikipedia.org/' 49 | 50 | # ---------------------------------------------------------------------- 51 | 52 | class NextFile(object): 53 | """ 54 | Synchronous generation of next available file name. 55 | """ 56 | 57 | filesPerDir = 100 58 | 59 | def __init__(self, path_name): 60 | self.path_name = path_name 61 | self.dir_index = -1 62 | self.file_index = -1 63 | 64 | def next(self): 65 | self.file_index = (self.file_index + 1) % NextFile.filesPerDir 66 | if self.file_index == 0: 67 | self.dir_index += 1 68 | dirname = self._dirname() 69 | if not os.path.isdir(dirname): 70 | os.makedirs(dirname) 71 | return self._filepath() 72 | 73 | def _dirname(self): 74 | char1 = self.dir_index % 26 75 | char2 = int(self.dir_index / 26) % 26 76 | return os.path.join(self.path_name, '%c%c' % (ord('A') + char2, ord('A') + char1)) 77 | 78 | def _filepath(self): 79 | return '%s/wiki_%02d' % (self._dirname(), self.file_index) 80 | 81 | class OutputSplitter(object): 82 | """ 83 | File-like object, that splits output to multiple files of a given max size. 84 | """ 85 | 86 | def __init__(self, nextFile, max_file_size=0, compress=True): 87 | """ 88 | :param nextfile: a NextFile object from which to obtain filenames 89 | to use. 90 | :param max_file_size: the maximum size of each file. 91 | :para compress: whether to write data with bzip compression. 
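        Illustrative usage (editor's sketch, not part of the original source;
        it mirrors how process_dump() below drives this class — 'page' stands
        for an assumed, already-built document string):

            splitter = OutputSplitter(NextFile('extracted'), max_file_size=1024 * 1024,
                                      compress=False)
            splitter.write(page.encode('utf-8'))
            splitter.close()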
92 | """ 93 | self.nextFile = nextFile 94 | self.compress = compress 95 | self.max_file_size = max_file_size 96 | self.file = self.open(self.nextFile.next()) 97 | 98 | def reserve(self, size): 99 | if self.file.tell() + size > self.max_file_size: 100 | self.close() 101 | self.file = self.open(self.nextFile.next()) 102 | 103 | def write(self, data): 104 | self.reserve(len(data)) 105 | self.file.write(data) 106 | 107 | def close(self): 108 | self.file.close() 109 | 110 | def open(self, filename): 111 | if self.compress: 112 | return bz2.BZ2File(filename + '.bz2', 'w') 113 | else: 114 | return open(filename, 'w') 115 | 116 | # ---------------------------------------------------------------------- 117 | 118 | class Extractor(object): 119 | 120 | def extract(self, out): 121 | """ 122 | :param out: output file. 123 | """ 124 | logging.debug("%s\t%s", self.id, self.title) 125 | text = ''.join(self.page) 126 | url = get_url(self.id) 127 | header = '\n' % (self.id, url, self.title, self.language, self.revision) 128 | # Separate header from text with a newline. 129 | header += self.title + '\n\n' 130 | header = header.encode('utf-8') 131 | footer = "\n\n" 132 | out.write(header) 133 | text = clean(self, text) 134 | for line in compact(text): 135 | out.write(line.encode('utf-8')) 136 | out.write('\n') 137 | out.write(footer) 138 | 139 | def process_dump(input_file, out_file, file_size, file_compress): 140 | """ 141 | :param input_file: name of the wikipedia dump file; '-' to read from stdin 142 | :param out_file: directory where to store extracted data, or '-' for stdout 143 | :param file_size: max size of each extracted file, or None for no max (one file) 144 | :param file_compress: whether to compress files with bzip. 145 | """ 146 | 147 | if input_file == '-': 148 | input = sys.stdin 149 | else: 150 | input = gzip.open(input_file) 151 | 152 | if out_file == '-': 153 | output = sys.stdout 154 | if file_compress: 155 | logging.warn("writing to stdout, so no output compression (use external tool)") 156 | else: 157 | nextFile = NextFile(out_file) 158 | output = OutputSplitter(nextFile, file_size, file_compress) 159 | 160 | # process dump 161 | # format 162 | # {"index":{"_type":"page","_id":"3825914"}} 163 | # {"namespace":0,"title":TITLE,"timestamp":"2014-06-29T15:51:09Z","text":TEXT,...} 164 | while True: 165 | line = input.readline() 166 | if not line: 167 | break 168 | index = json.loads(line) 169 | content = json.loads(input.readline()) 170 | type = index['index']['_type'] 171 | id = index['index']['_id'] 172 | language = content['language'] 173 | revision = content['version'] 174 | if type == 'page' and content['namespace'] == 0: 175 | title = content['title'] 176 | text = content['text'] 177 | # drop references: 178 | # ^ The Penguin Dictionary 179 | text = re.sub(r' \^ .*', '', text) 180 | url = urlbase + 'wiki?curid=' + id 181 | header = '\n' % (id, url, title, language, revision) 182 | page = header + title + '\n\n' + text + '\n\n' 183 | output.write(page.encode('utf-8')) 184 | 185 | # ---------------------------------------------------------------------- 186 | 187 | # Minimum size of output files 188 | minFileSize = 200 * 1024 189 | 190 | def main(): 191 | parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]), 192 | formatter_class=argparse.RawDescriptionHelpFormatter, 193 | description=__doc__) 194 | parser.add_argument("input", 195 | help="Cirrus Json wiki dump file") 196 | groupO = parser.add_argument_group('Output') 197 | groupO.add_argument("-o", "--output", 
default="text", 198 | help="directory for extracted files (or '-' for dumping to stdin)") 199 | groupO.add_argument("-b", "--bytes", default="1M", 200 | help="maximum bytes per output file (default %(default)s)", 201 | metavar="n[KMG]") 202 | groupO.add_argument("-c", "--compress", action="store_true", 203 | help="compress output files using bzip") 204 | 205 | groupP = parser.add_argument_group('Processing') 206 | groupP.add_argument("-ns", "--namespaces", default="", metavar="ns1,ns2", 207 | help="accepted namespaces") 208 | 209 | groupS = parser.add_argument_group('Special') 210 | groupS.add_argument("-q", "--quiet", action="store_true", 211 | help="suppress reporting progress info") 212 | groupS.add_argument("-v", "--version", action="version", 213 | version='%(prog)s ' + version, 214 | help="print program version") 215 | 216 | args = parser.parse_args() 217 | 218 | try: 219 | power = 'kmg'.find(args.bytes[-1].lower()) + 1 220 | file_size = int(args.bytes[:-1]) * 1024 ** power 221 | if file_size < minFileSize: 222 | raise ValueError() 223 | except ValueError: 224 | logging.error('Insufficient or invalid size: %s', args.bytes) 225 | return 226 | 227 | FORMAT = '%(levelname)s: %(message)s' 228 | logging.basicConfig(format=FORMAT) 229 | 230 | logger = logging.getLogger() 231 | if not args.quiet: 232 | logger.setLevel(logging.INFO) 233 | 234 | input_file = args.input 235 | 236 | output_path = args.output 237 | if output_path != '-' and not os.path.isdir(output_path): 238 | try: 239 | os.makedirs(output_path) 240 | except: 241 | logging.error('Could not create: %s', output_path) 242 | return 243 | 244 | process_dump(input_file, output_path, file_size, args.compress) 245 | 246 | 247 | if __name__ == '__main__': 248 | main() 249 | -------------------------------------------------------------------------------- /reader/wikiextractor/clean.py: -------------------------------------------------------------------------------- 1 | # ============================================================================= 2 | # Copyright (c) 2020. Giuseppe Attardi (attardi@di.unipi.it). 3 | # ============================================================================= 4 | # This file is part of Tanl. 5 | # 6 | # Tanl is free software; you can redistribute it and/or modify it 7 | # under the terms of the GNU Affero General Public License, version 3, 8 | # as published by the Free Software Foundation. 9 | # 10 | # Tanl is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU Affero General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Affero General Public License 16 | # along with this program. If not, see . 17 | # ============================================================================= 18 | 19 | from wikiextractor.extract import Extractor, ignoreTag, resetIgnoredTags 20 | 21 | 22 | def clean_markup(markup, keep_links=False, ignore_headers=True): 23 | """ 24 | Clean Wikimarkup to produce plaintext. 25 | 26 | :param keep_links: Set to True to keep internal and external links 27 | :param ignore_headers: if set to True, the output list will not contain 28 | headers, only 29 | 30 | Returns a list of paragraphs (unicode strings). 
31 | """ 32 | 33 | if not keep_links: 34 | ignoreTag('a') 35 | 36 | extractor = Extractor(0, '', []) 37 | 38 | # returns a list of strings 39 | paragraphs = extractor.clean_text(markup, 40 | mark_headers=True, 41 | expand_templates=False, 42 | escape_doc=True) 43 | resetIgnoredTags() 44 | 45 | if ignore_headers: 46 | paragraphs = filter(lambda s: not s.startswith('## '), paragraphs) 47 | 48 | return paragraphs 49 | -------------------------------------------------------------------------------- /reader/wikiextractor/extract.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # ============================================================================= 4 | # Copyright (c) 2020. Giuseppe Attardi (attardi@di.unipi.it). 5 | # ============================================================================= 6 | # This file is part of Tanl. 7 | # 8 | # Tanl is free software; you can redistribute it and/or modify it 9 | # under the terms of the GNU Affero General Public License, version 3, 10 | # as published by the Free Software Foundation. 11 | # 12 | # Tanl is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | # GNU Affero General Public License for more details. 16 | # 17 | # You should have received a copy of the GNU Affero General Public License 18 | # along with this program. If not, see . 19 | # ============================================================================= 20 | 21 | import re 22 | import html 23 | import json 24 | from itertools import zip_longest 25 | from urllib.parse import quote as urlencode 26 | from html.entities import name2codepoint 27 | import logging 28 | import time 29 | 30 | # ---------------------------------------------------------------------- 31 | 32 | # match tail after wikilink 33 | tailRE = re.compile('\w+') 34 | syntaxhighlight = re.compile('<syntaxhighlight .*?>(.*?)</syntaxhighlight>', re.DOTALL) 35 | 36 | ## PARAMS #################################################################### 37 | 38 | ## 39 | # Defined in 40 | # We include as default Template, when loading external template file. 41 | knownNamespaces = set(['Template']) 42 | 43 | ## 44 | # Drop these elements from article text 45 | # 46 | discardElements = [ 47 | 'gallery', 'timeline', 'noinclude', 'pre', 48 | 'table', 'tr', 'td', 'th', 'caption', 'div', 49 | 'form', 'input', 'select', 'option', 'textarea', 50 | 'ul', 'li', 'ol', 'dl', 'dt', 'dd', 'menu', 'dir', 51 | 'ref', 'references', 'img', 'imagemap', 'source', 'small' 52 | ] 53 | 54 | ## 55 | # Recognize only these namespaces 56 | # w: Internal links to the Wikipedia 57 | # wiktionary: Wiki dictionary 58 | # wikt: shortcut for Wiktionary 59 | # 60 | acceptedNamespaces = ['w', 'wiktionary', 'wikt'] 61 | 62 | 63 | def get_url(urlbase, uid): 64 | return "%s?curid=%s" % (urlbase, uid) 65 | 66 | 67 | # ====================================================================== 68 | 69 | 70 | def clean(extractor, text, expand_templates=False, html_safe=True): 71 | """ 72 | Transforms wiki markup. If the command line flag --escapedoc is set then the text is also escaped 73 | @see https://www.mediawiki.org/wiki/Help:Formatting 74 | :param extractor: the Extractor t use. 75 | :param text: the text to clean. 76 | :param expand_templates: whether to perform template expansion. 
77 | :param html_safe: whether to convert reserved HTML characters to entities. 78 | @return: the cleaned text. 79 | """ 80 | 81 | if expand_templates: 82 | # expand templates 83 | # See: http://www.mediawiki.org/wiki/Help:Templates 84 | text = extractor.expandTemplates(text) 85 | else: 86 | # Drop transclusions (template, parser functions) 87 | text = dropNested(text, r'{{', r'}}') 88 | 89 | # Drop tables 90 | text = dropNested(text, r'{\|', r'\|}') 91 | 92 | # replace external links 93 | text = replaceExternalLinks(text) 94 | 95 | # replace internal links 96 | text = replaceInternalLinks(text) 97 | 98 | # drop MagicWords behavioral switches 99 | text = magicWordsRE.sub('', text) 100 | 101 | # ############### Process HTML ############### 102 | 103 | # turn into HTML, except for the content of 104 | res = '' 105 | cur = 0 106 | for m in syntaxhighlight.finditer(text): 107 | end = m.end() 108 | res += unescape(text[cur:m.start()]) + m.group(1) 109 | cur = end 110 | text = res + unescape(text[cur:]) 111 | 112 | # Handle bold/italic/quote 113 | if extractor.HtmlFormatting: 114 | text = bold_italic.sub(r'\1', text) 115 | text = bold.sub(r'\1', text) 116 | text = italic.sub(r'\1', text) 117 | else: 118 | text = bold_italic.sub(r'\1', text) 119 | text = bold.sub(r'\1', text) 120 | text = italic_quote.sub(r'"\1"', text) 121 | text = italic.sub(r'"\1"', text) 122 | text = quote_quote.sub(r'"\1"', text) 123 | # residuals of unbalanced quotes 124 | text = text.replace("'''", '').replace("''", '"') 125 | 126 | # Collect spans 127 | 128 | spans = [] 129 | # Drop HTML comments 130 | for m in comment.finditer(text): 131 | spans.append((m.start(), m.end())) 132 | 133 | # Drop self-closing tags 134 | for pattern in selfClosing_tag_patterns: 135 | for m in pattern.finditer(text): 136 | spans.append((m.start(), m.end())) 137 | 138 | # Drop ignored tags 139 | for left, right in ignored_tag_patterns: 140 | for m in left.finditer(text): 141 | spans.append((m.start(), m.end())) 142 | for m in right.finditer(text): 143 | spans.append((m.start(), m.end())) 144 | 145 | # Bulk remove all spans 146 | text = dropSpans(spans, text) 147 | 148 | # Drop discarded elements 149 | for tag in discardElements: 150 | text = dropNested(text, r'<\s*%s\b[^>/]*>' % tag, r'<\s*/\s*%s>' % tag) 151 | 152 | if not extractor.HtmlFormatting: 153 | # Turn into text what is left (&nbsp;) and 154 | text = unescape(text) 155 | 156 | # Expand placeholders 157 | for pattern, placeholder in placeholder_tag_patterns: 158 | index = 1 159 | for match in pattern.finditer(text): 160 | text = text.replace(match.group(), '%s_%d' % (placeholder, index)) 161 | index += 1 162 | 163 | text = text.replace('<<', u'«').replace('>>', u'»') 164 | 165 | ############################################# 166 | 167 | # Cleanup text 168 | text = text.replace('\t', ' ') 169 | text = spaces.sub(' ', text) 170 | text = dots.sub('...', text) 171 | text = re.sub(u' (,:\.\)\]»)', r'\1', text) 172 | text = re.sub(u'(\[\(«) ', r'\1', text) 173 | text = re.sub(r'\n\W+?\n', '\n', text, flags=re.U) # lines with only punctuations 174 | text = text.replace(',,', ',').replace(',.', '.') 175 | if html_safe: 176 | text = html.escape(text, quote=False) 177 | return text 178 | 179 | 180 | # skip level 1, it is page name level 181 | section = re.compile(r'(==+)\s*(.*?)\s*\1') 182 | 183 | listOpen = {'*': '
<ul>', '#': '<ol>', ';': '<dl>', ':': '<dl>'} 184 | listClose = {'*': '</ul>', '#': '</ol>', ';': '</dl>', ':': '</dl>'} 185 | listItem = {'*': '<li>%s</li>', '#': '<li>%s</li>', ';': '<dt>%s</dt>', 186 | ':': '<dd>%s</dd>
    '} 187 | 188 | 189 | def compact(text, mark_headers=False): 190 | """Deal with headers, lists, empty sections, residuals of tables. 191 | :param text: convert to HTML 192 | """ 193 | 194 | page = [] # list of paragraph 195 | headers = {} # Headers for unfilled sections 196 | emptySection = False # empty sections are discarded 197 | listLevel = '' # nesting of lists 198 | 199 | for line in text.split('\n'): 200 | 201 | if not line: 202 | continue 203 | # Handle section titles 204 | m = section.match(line) 205 | if m: 206 | title = m.group(2) 207 | lev = len(m.group(1)) 208 | if Extractor.HtmlFormatting: 209 | page.append("%s" % (lev, title, lev)) 210 | if title and title[-1] not in '!?': 211 | title += '.' 212 | 213 | if mark_headers: 214 | title = "## " + title 215 | 216 | headers[lev] = title 217 | # drop previous headers 218 | headers = { k:v for k,v in headers.items() if k <= lev } 219 | emptySection = True 220 | continue 221 | # Handle page title 222 | if line.startswith('++'): 223 | title = line[2:-2] 224 | if title: 225 | if title[-1] not in '!?': 226 | title += '.' 227 | page.append(title) 228 | # handle indents 229 | elif line[0] == ':': 230 | # page.append(line.lstrip(':*#;')) 231 | continue 232 | # handle lists 233 | elif line[0] in '*#;:': 234 | if Extractor.HtmlFormatting: 235 | i = 0 236 | for c, n in zip_longest(listLevel, line, fillvalue=''): 237 | if not n or n not in '*#;:': 238 | if c: 239 | page.append(listClose[c]) 240 | listLevel = listLevel[:-1] 241 | continue 242 | else: 243 | break 244 | # n != '' 245 | if c != n and (not c or (c not in ';:' and n not in ';:')): 246 | if c: 247 | # close level 248 | page.append(listClose[c]) 249 | listLevel = listLevel[:-1] 250 | listLevel += n 251 | page.append(listOpen[n]) 252 | i += 1 253 | n = line[i - 1] # last list char 254 | line = line[i:].strip() 255 | if line: # FIXME: n is '"' 256 | page.append(listItem[n] % line) 257 | else: 258 | continue 259 | elif len(listLevel): 260 | for c in reversed(listLevel): 261 | page.append(listClose[c]) 262 | listLevel = [] 263 | 264 | # Drop residuals of lists 265 | elif line[0] in '{|' or line[-1] == '}': 266 | continue 267 | # Drop irrelevant lines 268 | elif (line[0] == '(' and line[-1] == ')') or line.strip('.-') == '': 269 | continue 270 | elif len(headers): 271 | if Extractor.keepSections: 272 | items = sorted(headers.items()) 273 | for (i, v) in items: 274 | page.append(v) 275 | headers.clear() 276 | page.append(line) # first line 277 | emptySection = False 278 | elif not emptySection: 279 | page.append(line) 280 | # dangerous 281 | # # Drop preformatted 282 | # elif line[0] == ' ': 283 | # continue 284 | 285 | return page 286 | 287 | 288 | # ---------------------------------------------------------------------- 289 | 290 | def dropNested(text, openDelim, closeDelim): 291 | """ 292 | A matching function for nested expressions, e.g. namespaces and tables. 
293 | """ 294 | openRE = re.compile(openDelim, re.IGNORECASE) 295 | closeRE = re.compile(closeDelim, re.IGNORECASE) 296 | # partition text in separate blocks { } { } 297 | spans = [] # pairs (s, e) for each partition 298 | nest = 0 # nesting level 299 | start = openRE.search(text, 0) 300 | if not start: 301 | return text 302 | end = closeRE.search(text, start.end()) 303 | next = start 304 | while end: 305 | next = openRE.search(text, next.end()) 306 | if not next: # termination 307 | while nest: # close all pending 308 | nest -= 1 309 | end0 = closeRE.search(text, end.end()) 310 | if end0: 311 | end = end0 312 | else: 313 | break 314 | spans.append((start.start(), end.end())) 315 | break 316 | while end.end() < next.start(): 317 | # { } { 318 | if nest: 319 | nest -= 1 320 | # try closing more 321 | last = end.end() 322 | end = closeRE.search(text, end.end()) 323 | if not end: # unbalanced 324 | if spans: 325 | span = (spans[0][0], last) 326 | else: 327 | span = (start.start(), last) 328 | spans = [span] 329 | break 330 | else: 331 | spans.append((start.start(), end.end())) 332 | # advance start, find next close 333 | start = next 334 | end = closeRE.search(text, next.end()) 335 | break # { } 336 | if next != start: 337 | # { { } 338 | nest += 1 339 | # collect text outside partitions 340 | return dropSpans(spans, text) 341 | 342 | 343 | def dropSpans(spans, text): 344 | """ 345 | Drop from text the blocks identified in :param spans:, possibly nested. 346 | """ 347 | spans.sort() 348 | res = '' 349 | offset = 0 350 | for s, e in spans: 351 | if offset <= s: # handle nesting 352 | if offset < s: 353 | res += text[offset:s] 354 | offset = e 355 | res += text[offset:] 356 | return res 357 | 358 | 359 | # ---------------------------------------------------------------------- 360 | # External links 361 | 362 | # from: https://doc.wikimedia.org/mediawiki-core/master/php/DefaultSettings_8php_source.html 363 | 364 | wgUrlProtocols = [ 365 | 'bitcoin:', 'ftp://', 'ftps://', 'geo:', 'git://', 'gopher://', 'http://', 366 | 'https://', 'irc://', 'ircs://', 'magnet:', 'mailto:', 'mms://', 'news:', 367 | 'nntp://', 'redis://', 'sftp://', 'sip:', 'sips:', 'sms:', 'ssh://', 368 | 'svn://', 'tel:', 'telnet://', 'urn:', 'worldwind://', 'xmpp:', '//' 369 | ] 370 | 371 | # from: https://doc.wikimedia.org/mediawiki-core/master/php/Parser_8php_source.html 372 | 373 | # Constants needed for external link processing 374 | # Everything except bracket, space, or control characters 375 | # \p{Zs} is unicode 'separator, space' category. It covers the space 0x20 376 | # as well as U+3000 is IDEOGRAPHIC SPACE for bug 19052 377 | EXT_LINK_URL_CLASS = r'[^][<>"\x00-\x20\x7F\s]' 378 | ExtLinkBracketedRegex = re.compile( 379 | '\[(((?i)' + '|'.join(wgUrlProtocols) + ')' + EXT_LINK_URL_CLASS + r'+)\s*([^\]\x00-\x08\x0a-\x1F]*?)\]', 380 | re.S | re.U) 381 | EXT_IMAGE_REGEX = re.compile( 382 | r"""^(http://|https://)([^][<>"\x00-\x20\x7F\s]+) 383 | /([A-Za-z0-9_.,~%\-+&;#*?!=()@\x80-\xFF]+)\.((?i)gif|png|jpg|jpeg)$""", 384 | re.X | re.S | re.U) 385 | 386 | 387 | def replaceExternalLinks(text): 388 | s = '' 389 | cur = 0 390 | for m in ExtLinkBracketedRegex.finditer(text): 391 | s += text[cur:m.start()] 392 | cur = m.end() 393 | 394 | url = m.group(1) 395 | label = m.group(3) 396 | 397 | # # The characters '<' and '>' (which were escaped by 398 | # # removeHTMLtags()) should not be included in 399 | # # URLs, per RFC 2396. 
400 | # m2 = re.search('&(lt|gt);', url) 401 | # if m2: 402 | # link = url[m2.end():] + ' ' + link 403 | # url = url[0:m2.end()] 404 | 405 | # If the link text is an image URL, replace it with an tag 406 | # This happened by accident in the original parser, but some people used it extensively 407 | m = EXT_IMAGE_REGEX.match(label) 408 | if m: 409 | label = makeExternalImage(label) 410 | 411 | # Use the encoded URL 412 | # This means that users can paste URLs directly into the text 413 | # Funny characters like ö aren't valid in URLs anyway 414 | # This was changed in August 2004 415 | s += makeExternalLink(url, label) # + trail 416 | 417 | return s + text[cur:] 418 | 419 | 420 | def makeExternalLink(url, anchor): 421 | """Function applied to wikiLinks""" 422 | if Extractor.keepLinks: 423 | return '%s' % (urlencode(url), anchor) 424 | else: 425 | return anchor 426 | 427 | 428 | def makeExternalImage(url, alt=''): 429 | if Extractor.keepLinks: 430 | return '%s' % (url, alt) 431 | else: 432 | return alt 433 | 434 | 435 | # ---------------------------------------------------------------------- 436 | # WikiLinks 437 | # See https://www.mediawiki.org/wiki/Help:Links#Internal_links 438 | 439 | # Can be nested [[File:..|..[[..]]..|..]], [[Category:...]], etc. 440 | # Also: [[Help:IPA for Catalan|[andora]]] 441 | 442 | 443 | def replaceInternalLinks(text): 444 | """ 445 | Replaces external links of the form: 446 | [[title |...|label]]trail 447 | 448 | with title concatenated with trail, when present, e.g. 's' for plural. 449 | """ 450 | # call this after removal of external links, so we need not worry about 451 | # triple closing ]]]. 452 | cur = 0 453 | res = '' 454 | for s, e in findBalanced(text, ['[['], [']]']): 455 | m = tailRE.match(text, e) 456 | if m: 457 | trail = m.group(0) 458 | end = m.end() 459 | else: 460 | trail = '' 461 | end = e 462 | inner = text[s + 2:e - 2] 463 | # find first | 464 | pipe = inner.find('|') 465 | if pipe < 0: 466 | title = inner 467 | label = title 468 | else: 469 | title = inner[:pipe].rstrip() 470 | # find last | 471 | curp = pipe + 1 472 | for s1, e1 in findBalanced(inner, ['[['], [']]']): 473 | last = inner.rfind('|', curp, s1) 474 | if last >= 0: 475 | pipe = last # advance 476 | curp = e1 477 | label = inner[pipe + 1:].strip() 478 | res += text[cur:s] + makeInternalLink(title, label) + trail 479 | cur = end 480 | return res + text[cur:] 481 | 482 | 483 | def makeInternalLink(title, label): 484 | colon = title.find(':') 485 | if colon > 0 and title[:colon] not in acceptedNamespaces: 486 | return '' 487 | if colon == 0: 488 | # drop also :File: 489 | colon2 = title.find(':', colon + 1) 490 | if colon2 > 1 and title[colon + 1:colon2] not in acceptedNamespaces: 491 | return '' 492 | if Extractor.keepLinks: 493 | return '%s' % (urlencode(title), label) 494 | else: 495 | return label 496 | 497 | 498 | # ---------------------------------------------------------------------- 499 | # variables 500 | 501 | 502 | class MagicWords(): 503 | 504 | """ 505 | One copy in each Extractor. 
506 | 507 | @see https://doc.wikimedia.org/mediawiki-core/master/php/MagicWord_8php_source.html 508 | """ 509 | names = [ 510 | '!', 511 | 'currentmonth', 512 | 'currentmonth1', 513 | 'currentmonthname', 514 | 'currentmonthnamegen', 515 | 'currentmonthabbrev', 516 | 'currentday', 517 | 'currentday2', 518 | 'currentdayname', 519 | 'currentyear', 520 | 'currenttime', 521 | 'currenthour', 522 | 'localmonth', 523 | 'localmonth1', 524 | 'localmonthname', 525 | 'localmonthnamegen', 526 | 'localmonthabbrev', 527 | 'localday', 528 | 'localday2', 529 | 'localdayname', 530 | 'localyear', 531 | 'localtime', 532 | 'localhour', 533 | 'numberofarticles', 534 | 'numberoffiles', 535 | 'numberofedits', 536 | 'articlepath', 537 | 'pageid', 538 | 'sitename', 539 | 'server', 540 | 'servername', 541 | 'scriptpath', 542 | 'stylepath', 543 | 'pagename', 544 | 'pagenamee', 545 | 'fullpagename', 546 | 'fullpagenamee', 547 | 'namespace', 548 | 'namespacee', 549 | 'namespacenumber', 550 | 'currentweek', 551 | 'currentdow', 552 | 'localweek', 553 | 'localdow', 554 | 'revisionid', 555 | 'revisionday', 556 | 'revisionday2', 557 | 'revisionmonth', 558 | 'revisionmonth1', 559 | 'revisionyear', 560 | 'revisiontimestamp', 561 | 'revisionuser', 562 | 'revisionsize', 563 | 'subpagename', 564 | 'subpagenamee', 565 | 'talkspace', 566 | 'talkspacee', 567 | 'subjectspace', 568 | 'subjectspacee', 569 | 'talkpagename', 570 | 'talkpagenamee', 571 | 'subjectpagename', 572 | 'subjectpagenamee', 573 | 'numberofusers', 574 | 'numberofactiveusers', 575 | 'numberofpages', 576 | 'currentversion', 577 | 'rootpagename', 578 | 'rootpagenamee', 579 | 'basepagename', 580 | 'basepagenamee', 581 | 'currenttimestamp', 582 | 'localtimestamp', 583 | 'directionmark', 584 | 'contentlanguage', 585 | 'numberofadmins', 586 | 'cascadingsources', 587 | ] 588 | 589 | def __init__(self): 590 | self.values = {'!': '|'} 591 | 592 | def __getitem__(self, name): 593 | return self.values.get(name) 594 | 595 | def __setitem__(self, name, value): 596 | self.values[name] = value 597 | 598 | switches = ( 599 | '__NOTOC__', 600 | '__FORCETOC__', 601 | '__TOC__', 602 | '__TOC__', 603 | '__NEWSECTIONLINK__', 604 | '__NONEWSECTIONLINK__', 605 | '__NOGALLERY__', 606 | '__HIDDENCAT__', 607 | '__NOCONTENTCONVERT__', 608 | '__NOCC__', 609 | '__NOTITLECONVERT__', 610 | '__NOTC__', 611 | '__START__', 612 | '__END__', 613 | '__INDEX__', 614 | '__NOINDEX__', 615 | '__STATICREDIRECT__', 616 | '__DISAMBIG__' 617 | ) 618 | 619 | 620 | magicWordsRE = re.compile('|'.join(MagicWords.switches)) 621 | 622 | 623 | # ========================================================================= 624 | # 625 | # MediaWiki Markup Grammar 626 | # https://www.mediawiki.org/wiki/Preprocessor_ABNF 627 | 628 | # xml-char = %x9 / %xA / %xD / %x20-D7FF / %xE000-FFFD / %x10000-10FFFF 629 | # sptab = SP / HTAB 630 | 631 | # ; everything except ">" (%x3E) 632 | # attr-char = %x9 / %xA / %xD / %x20-3D / %x3F-D7FF / %xE000-FFFD / %x10000-10FFFF 633 | 634 | # literal = *xml-char 635 | # title = wikitext-L3 636 | # part-name = wikitext-L3 637 | # part-value = wikitext-L3 638 | # part = ( part-name "=" part-value ) / ( part-value ) 639 | # parts = [ title *( "|" part ) ] 640 | # tplarg = "{{{" parts "}}}" 641 | # template = "{{" parts "}}" 642 | # link = "[[" wikitext-L3 "]]" 643 | 644 | # comment = "" 645 | # unclosed-comment = "', re.DOTALL) 739 | 740 | # Match ignored tags 741 | ignored_tag_patterns = [] 742 | 743 | 744 | def ignoreTag(tag): 745 | left = re.compile(r'<%s\b.*?>' % tag, re.IGNORECASE | 
re.DOTALL) # both and 746 | right = re.compile(r'' % tag, re.IGNORECASE) 747 | ignored_tag_patterns.append((left, right)) 748 | 749 | 750 | def resetIgnoredTags(): 751 | global ignored_tag_patterns 752 | ignored_tag_patterns = [] 753 | 754 | 755 | for tag in ignoredTags: 756 | ignoreTag(tag) 757 | 758 | # Match selfClosing HTML tags 759 | selfClosing_tag_patterns = [ 760 | re.compile(r'<\s*%s\b[^>]*/\s*>' % tag, re.DOTALL | re.IGNORECASE) for tag in selfClosingTags 761 | ] 762 | 763 | # Match HTML placeholder tags 764 | placeholder_tag_patterns = [ 765 | (re.compile(r'<\s*%s(\s*| [^>]+?)>.*?<\s*/\s*%s\s*>' % (tag, tag), re.DOTALL | re.IGNORECASE), 766 | repl) for tag, repl in placeholder_tags.items() 767 | ] 768 | 769 | # Match preformatted lines 770 | preformatted = re.compile(r'^ .*?$') 771 | 772 | # Match external links (space separates second optional parameter) 773 | externalLink = re.compile(r'\[\w+[^ ]*? (.*?)]') 774 | externalLinkNoAnchor = re.compile(r'\[\w+[&\]]*\]') 775 | 776 | # Matches bold/italic 777 | bold_italic = re.compile(r"'''''(.*?)'''''") 778 | bold = re.compile(r"'''(.*?)'''") 779 | italic_quote = re.compile(r"''\"([^\"]*?)\"''") 780 | italic = re.compile(r"''(.*?)''") 781 | quote_quote = re.compile(r'""([^"]*?)""') 782 | 783 | # Matches space 784 | spaces = re.compile(r' {2,}') 785 | 786 | # Matches dots 787 | dots = re.compile(r'\.{4,}') 788 | 789 | # ====================================================================== 790 | 791 | substWords = 'subst:|safesubst:' 792 | 793 | 794 | class Extractor(): 795 | """ 796 | An extraction task on a article. 797 | """ 798 | ## 799 | # Whether to preserve links in output 800 | keepLinks = False 801 | 802 | ## 803 | # Whether to preserve section titles 804 | keepSections = True 805 | 806 | ## 807 | # Whether to output text with HTML formatting elements in files. 808 | HtmlFormatting = False 809 | 810 | ## 811 | # Whether to produce json instead of the default output format. 812 | toJson = False 813 | 814 | def __init__(self,title): 815 | """ 816 | :param page: a list of lines. 817 | """ 818 | # self.id = id 819 | # self.revid = revid 820 | # self.url = get_url(urlbase, id) 821 | self.title = title 822 | # self.page = page 823 | self.magicWords = MagicWords() 824 | self.frame = [] 825 | self.recursion_exceeded_1_errs = 0 # template recursion within expandTemplates() 826 | self.recursion_exceeded_2_errs = 0 # template recursion within expandTemplate() 827 | self.recursion_exceeded_3_errs = 0 # parameter recursion 828 | self.template_title_errs = 0 829 | 830 | def clean_text(self, text, mark_headers=False, expand_templates=False, 831 | html_safe=True): 832 | """ 833 | :param mark_headers: True to distinguish headers from paragraphs 834 | e.g. "## Section 1" 835 | """ 836 | self.magicWords['pagename'] = self.title 837 | self.magicWords['fullpagename'] = self.title 838 | self.magicWords['currentyear'] = time.strftime('%Y') 839 | self.magicWords['currentmonth'] = time.strftime('%m') 840 | self.magicWords['currentday'] = time.strftime('%d') 841 | self.magicWords['currenthour'] = time.strftime('%H') 842 | self.magicWords['currenttime'] = time.strftime('%H:%M:%S') 843 | 844 | text = clean(self, text, expand_templates=expand_templates, 845 | html_safe=html_safe) 846 | 847 | text = compact(text, mark_headers=mark_headers) 848 | return text 849 | 850 | def extract(self, page, html_safe=False): 851 | """ 852 | :param out: a memory file. 853 | :param html_safe: whether to escape HTML entities. 
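        Illustrative usage (editor's sketch, not part of the original source;
        note this modified extract() takes the raw page text and returns a
        string instead of writing to an output file — 'raw_wikitext' is an
        assumed variable holding the article markup):

            extractor = Extractor('Some article title')
            plain_text = extractor.extract(raw_wikitext, html_safe=False)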
854 | """ 855 | # logging.debug("%s\t%s", self.id, self.title) 856 | text = self.clean_text(page, html_safe=html_safe) 857 | return '\n'.join(text) 858 | 859 | 860 | # header = '\n' % (self.id, self.url, self.title) 861 | # # Separate header from text with a newline. 862 | # header += self.title + '\n\n' 863 | # footer = "\n\n" 864 | # out.write(header) 865 | # out.write('\n'.join(text)) 866 | # out.write('\n') 867 | # out.write(footer) 868 | 869 | # errs = (self.template_title_errs, 870 | # self.recursion_exceeded_1_errs, 871 | # self.recursion_exceeded_2_errs, 872 | # self.recursion_exceeded_3_errs) 873 | # if any(errs): 874 | # logging.warn("Template errors in article '%s' (%s): title(%d) recursion(%d, %d, %d)", 875 | # self.title, self.id, *errs) 876 | 877 | # ---------------------------------------------------------------------- 878 | # Expand templates 879 | 880 | maxTemplateRecursionLevels = 30 881 | maxParameterRecursionLevels = 10 882 | 883 | # check for template beginning 884 | reOpen = re.compile('(?= self.maxTemplateRecursionLevels: 910 | self.recursion_exceeded_1_errs += 1 911 | return res 912 | 913 | # logging.debug(' %d %s', len(self.frame), res) 923 | return res 924 | 925 | def templateParams(self, parameters): 926 | """ 927 | Build a dictionary with positional or name key to expanded parameters. 928 | :param parameters: the parts[1:] of a template, i.e. all except the title. 929 | """ 930 | templateParams = {} 931 | 932 | if not parameters: 933 | return templateParams 934 | logging.debug(' 963 | # Parameters may span several lines, like: 964 | # {{Reflist|colwidth=30em|refs= 965 | # <ref name="Goode">Title</ref> 966 | 967 | # The '=' might occurr within an HTML attribute: 968 | # "<ref name=value" 969 | # but we stop at first. 970 | m = re.match(' *([^=]*?) *=(.*)', param, re.DOTALL) 971 | if m: 972 | # This is a named parameter. This case also handles parameter 973 | # assignments like "2=xxx", where the number of an unnamed 974 | # parameter ("2") is specified explicitly - this is handled 975 | # transparently. 976 | 977 | parameterName = m.group(1).strip() 978 | parameterValue = m.group(2) 979 | 980 | if ']]' not in parameterValue: # if the value does not contain a link, trim whitespace 981 | parameterValue = parameterValue.strip() 982 | templateParams[parameterName] = parameterValue 983 | else: 984 | # this is an unnamed parameter 985 | unnamedParameterCounter += 1 986 | 987 | if ']]' not in param: # if the value does not contain a link, trim whitespace 988 | param = param.strip() 989 | templateParams[str(unnamedParameterCounter)] = param 990 | logging.debug(' templateParams> %s', '|'.join(templateParams.values())) 991 | return templateParams 992 | 993 | def expandTemplate(self, body): 994 | """Expands template invocation. 995 | :param body: the parts of a template. 996 | 997 | :see http://meta.wikimedia.org/wiki/Help:Expansion for an explanation 998 | of the process. 999 | 1000 | See in particular: Expansion of names and values 1001 | http://meta.wikimedia.org/wiki/Help:Expansion#Expansion_of_names_and_values 1002 | 1003 | For most parser functions all names and values are expanded, 1004 | regardless of what is relevant for the result. The branching functions 1005 | (#if, #ifeq, #iferror, #ifexist, #ifexpr, #switch) are exceptions. 
1006 | 1007 | All names in a template call are expanded, and the titles of the 1008 | tplargs in the template body, after which it is determined which 1009 | values must be expanded, and for which tplargs in the template body 1010 | the first part (default). 1011 | 1012 | In the case of a tplarg, any parts beyond the first are never 1013 | expanded. The possible name and the value of the first part is 1014 | expanded if the title does not match a name in the template call. 1015 | 1016 | :see code for braceSubstitution at 1017 | https://doc.wikimedia.org/mediawiki-core/master/php/html/Parser_8php_source.html#3397: 1018 | 1019 | """ 1020 | 1021 | # template = "{{" parts "}}" 1022 | 1023 | # Templates and tplargs are decomposed in the same way, with pipes as 1024 | # separator, even though eventually any parts in a tplarg after the first 1025 | # (the parameter default) are ignored, and an equals sign in the first 1026 | # part is treated as plain text. 1027 | # Pipes inside inner templates and tplargs, or inside double rectangular 1028 | # brackets within the template or tplargs are not taken into account in 1029 | # this decomposition. 1030 | # The first part is called title, the other parts are simply called parts. 1031 | 1032 | # If a part has one or more equals signs in it, the first equals sign 1033 | # determines the division into name = value. Equals signs inside inner 1034 | # templates and tplargs, or inside double rectangular brackets within the 1035 | # part are not taken into account in this decomposition. Parts without 1036 | # equals sign are indexed 1, 2, .., given as attribute in the tag. 1037 | 1038 | if len(self.frame) >= self.maxTemplateRecursionLevels: 1039 | self.recursion_exceeded_2_errs += 1 1040 | # logging.debug(' INVOCATION> %d %s', len(self.frame), body) 1041 | return '' 1042 | 1043 | logging.debug('INVOCATION %d %s', len(self.frame), body) 1044 | 1045 | parts = splitParts(body) 1046 | # title is the portion before the first | 1047 | logging.debug('TITLE %s', parts[0].strip()) 1048 | title = self.expandTemplates(parts[0].strip()) 1049 | 1050 | # SUBST 1051 | # Apply the template tag to parameters without 1052 | # substituting into them, e.g. 1053 | # {{subst:t|a{{{p|q}}}b}} gives the wikitext start-a{{{p|q}}}b-end 1054 | # @see https://www.mediawiki.org/wiki/Manual:Substitution#Partial_substitution 1055 | subst = False 1056 | if re.match(substWords, title, re.IGNORECASE): 1057 | title = re.sub(substWords, '', title, 1, re.IGNORECASE) 1058 | subst = True 1059 | 1060 | if title.lower() in self.magicWords.values: 1061 | return self.magicWords[title.lower()] 1062 | 1063 | # Parser functions 1064 | # The first argument is everything after the first colon. 1065 | # It has been evaluated above. 
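# Editor's illustration (not in the original source): for a body such as
# '#ifeq: yes | yes | equal | different', splitParts() and title expansion
# leave title == '#ifeq: yes', so the code below computes
#   colon == 5, funct == '#ifeq', parts[0] == 'yes'
# and callParserFunction('#ifeq', parts, self.frame) returns 'equal'.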
1066 | colon = title.find(':') 1067 | if colon > 1: 1068 | funct = title[:colon] 1069 | parts[0] = title[colon + 1:].strip() # side-effect (parts[0] not used later) 1070 | # arguments after first are not evaluated 1071 | ret = callParserFunction(funct, parts, self.frame) 1072 | return self.expandTemplates(ret) 1073 | 1074 | title = fullyQualifiedTemplateTitle(title) 1075 | if not title: 1076 | self.template_title_errs += 1 1077 | return '' 1078 | 1079 | redirected = redirects.get(title) 1080 | if redirected: 1081 | title = redirected 1082 | 1083 | # get the template 1084 | if title in templateCache: 1085 | template = templateCache[title] 1086 | elif title in templates: 1087 | template = Template.parse(templates[title]) 1088 | # add it to cache 1089 | templateCache[title] = template 1090 | del templates[title] 1091 | else: 1092 | # The page being included could not be identified 1093 | return '' 1094 | 1095 | # logging.debug('TEMPLATE %s: %s', title, template) 1096 | 1097 | # tplarg = "{{{" parts "}}}" 1098 | # parts = [ title *( "|" part ) ] 1099 | # part = ( part-name "=" part-value ) / ( part-value ) 1100 | # part-name = wikitext-L3 1101 | # part-value = wikitext-L3 1102 | # wikitext-L3 = literal / template / tplarg / link / comment / 1103 | # line-eating-comment / unclosed-comment / 1104 | # xmlish-element / *wikitext-L3 1105 | 1106 | # A tplarg may contain other parameters as well as templates, e.g.: 1107 | # {{{text|{{{quote|{{{1|{{error|Error: No text given}}}}}}}}}}} 1108 | # hence no simple RE like this would work: 1109 | # '{{{((?:(?!{{{).)*?)}}}' 1110 | # We must use full CF parsing. 1111 | 1112 | # the parameter name itself might be computed, e.g.: 1113 | # {{{appointe{{#if:{{{appointer14|}}}|r|d}}14|}}} 1114 | 1115 | # Because of the multiple uses of double-brace and triple-brace 1116 | # syntax, expressions can sometimes be ambiguous. 1117 | # Precedence rules specifed here: 1118 | # http://www.mediawiki.org/wiki/Preprocessor_ABNF#Ideal_precedence 1119 | # resolve ambiguities like this: 1120 | # {{{{ }}}} -> { {{{ }}} } 1121 | # {{{{{ }}}}} -> {{ {{{ }}} }} 1122 | # 1123 | # :see: https://en.wikipedia.org/wiki/Help:Template#Handling_parameters 1124 | 1125 | params = parts[1:] 1126 | 1127 | if not subst: 1128 | # Evaluate parameters, since they may contain templates, including 1129 | # the symbol "=". 1130 | # {{#ifexpr: {{{1}}} = 1 }} 1131 | params = [self.expandTemplates(p) for p in params] 1132 | 1133 | # build a dict of name-values for the parameter values 1134 | params = self.templateParams(params) 1135 | 1136 | # Perform parameter substitution 1137 | # extend frame before subst, since there may be recursion in default 1138 | # parameter value, e.g. {{OTRS|celebrative|date=April 2015}} in article 1139 | # 21637542 in enwiki. 1140 | self.frame.append((title, params)) 1141 | instantiated = template.subst(params, self) 1142 | # logging.debug('instantiated %d %s', len(self.frame), instantiated) 1143 | value = self.expandTemplates(instantiated) 1144 | self.frame.pop() 1145 | # logging.debug(' INVOCATION> %s %d %s', title, len(self.frame), value) 1146 | return value 1147 | 1148 | 1149 | # ---------------------------------------------------------------------- 1150 | # parameter handling 1151 | 1152 | 1153 | def splitParts(paramsList): 1154 | """ 1155 | :param paramsList: the parts of a template or tplarg. 1156 | 1157 | Split template parameters at the separator "|". 1158 | separator "=". 
1159 | 1160 | Template parameters often contain URLs, internal links, text or even 1161 | template expressions, since we evaluate templates outside in. 1162 | This is required for cases like: 1163 | {{#if: {{{1}}} | {{lc:{{{1}}} | "parameter missing"}} 1164 | Parameters are separated by "|" symbols. However, we 1165 | cannot simply split the string on "|" symbols, since these 1166 | also appear inside templates and internal links, e.g. 1167 | 1168 | {{if:| 1169 | |{{#if:the president| 1170 | |{{#if:| 1171 | [[Category:Hatnote templates|A{{PAGENAME}}]] 1172 | }} 1173 | }} 1174 | }} 1175 | 1176 | We split parts at the "|" symbols that are not inside any pair 1177 | {{{...}}}, {{...}}, [[...]], {|...|}. 1178 | """ 1179 | 1180 | # Must consider '[' as normal in expansion of Template:EMedicine2: 1181 | # #ifeq: ped|article|[http://emedicine.medscape.com/article/180-overview|[http://www.emedicine.com/ped/topic180.htm#{{#if: |section~}} 1182 | # as part of: 1183 | # {{#ifeq: ped|article|[http://emedicine.medscape.com/article/180-overview|[http://www.emedicine.com/ped/topic180.htm#{{#if: |section~}}}} ped/180{{#if: |~}}] 1184 | 1185 | # should handle both tpl arg like: 1186 | # 4|{{{{{subst|}}}CURRENTYEAR}} 1187 | # and tpl parameters like: 1188 | # ||[[Category:People|{{#if:A|A|{{PAGENAME}}}}]] 1189 | 1190 | sep = '|' 1191 | parameters = [] 1192 | cur = 0 1193 | for s, e in findMatchingBraces(paramsList): 1194 | par = paramsList[cur:s].split(sep) 1195 | if par: 1196 | if parameters: 1197 | # portion before | belongs to previous parameter 1198 | parameters[-1] += par[0] 1199 | if len(par) > 1: 1200 | # rest are new parameters 1201 | parameters.extend(par[1:]) 1202 | else: 1203 | parameters = par 1204 | elif not parameters: 1205 | parameters = [''] # create first param 1206 | # add span to last previous parameter 1207 | parameters[-1] += paramsList[s:e] 1208 | cur = e 1209 | # leftover 1210 | par = paramsList[cur:].split(sep) 1211 | if par: 1212 | if parameters: 1213 | # portion before | belongs to previous parameter 1214 | parameters[-1] += par[0] 1215 | if len(par) > 1: 1216 | # rest are new parameters 1217 | parameters.extend(par[1:]) 1218 | else: 1219 | parameters = par 1220 | 1221 | # logging.debug('splitParts %s %s\nparams: %s', sep, paramsList, str(parameters)) 1222 | return parameters 1223 | 1224 | 1225 | def findMatchingBraces(text, ldelim=0): 1226 | """ 1227 | :param ldelim: number of braces to match. 0 means match [[]], {{}} and {{{}}}. 1228 | """ 1229 | # Parsing is done with respect to pairs of double braces {{..}} delimiting 1230 | # a template, and pairs of triple braces {{{..}}} delimiting a tplarg. 1231 | # If double opening braces are followed by triple closing braces or 1232 | # conversely, this is taken as delimiting a template, with one left-over 1233 | # brace outside it, taken as plain text. For any pattern of braces this 1234 | # defines a set of templates and tplargs such that any two are either 1235 | # separate or nested (not overlapping). 1236 | 1237 | # Unmatched double rectangular closing brackets can be in a template or 1238 | # tplarg, but unmatched double rectangular opening brackets cannot. 1239 | # Unmatched double or triple closing braces inside a pair of 1240 | # double rectangular brackets are treated as plain text. 
1241 | # Other formulation: in ambiguity between template or tplarg on one hand, 1242 | # and a link on the other hand, the structure with the rightmost opening 1243 | # takes precedence, even if this is the opening of a link without any 1244 | # closing, so not producing an actual link. 1245 | 1246 | # In the case of more than three opening braces the last three are assumed 1247 | # to belong to a tplarg, unless there is no matching triple of closing 1248 | # braces, in which case the last two opening braces are are assumed to 1249 | # belong to a template. 1250 | 1251 | # We must skip individual { like in: 1252 | # {{#ifeq: {{padleft:|1|}} | { | |  }} 1253 | # We must resolve ambiguities like this: 1254 | # {{{{ }}}} -> { {{{ }}} } 1255 | # {{{{{ }}}}} -> {{ {{{ }}} }} 1256 | # {{#if:{{{{{#if:{{{nominee|}}}|nominee|candidate}}|}}}|...}} 1257 | 1258 | # Handle: 1259 | # {{{{{|safesubst:}}}#Invoke:String|replace|{{{1|{{{{{|safesubst:}}}PAGENAME}}}}}|%s+%([^%(]-%)$||plain=false}} 1260 | # as well as expressions with stray }: 1261 | # {{{link|{{ucfirst:{{{1}}}}}} interchange}}} 1262 | 1263 | if ldelim: # 2-3 1264 | reOpen = re.compile('[{]{%d,}' % ldelim) # at least ldelim 1265 | reNext = re.compile('[{]{2,}|}{2,}') # at least 2 1266 | else: 1267 | reOpen = re.compile('{{2,}|\[{2,}') 1268 | reNext = re.compile('{{2,}|}{2,}|\[{2,}|]{2,}') # at least 2 1269 | 1270 | cur = 0 1271 | while True: 1272 | m1 = reOpen.search(text, cur) 1273 | if not m1: 1274 | return 1275 | lmatch = m1.end() - m1.start() 1276 | if m1.group()[0] == '{': 1277 | stack = [lmatch] # stack of opening braces lengths 1278 | else: 1279 | stack = [-lmatch] # negative means [ 1280 | end = m1.end() 1281 | while True: 1282 | m2 = reNext.search(text, end) 1283 | if not m2: 1284 | return # unbalanced 1285 | end = m2.end() 1286 | brac = m2.group()[0] 1287 | lmatch = m2.end() - m2.start() 1288 | 1289 | if brac == '{': 1290 | stack.append(lmatch) 1291 | elif brac == '}': 1292 | while stack: 1293 | openCount = stack.pop() # opening span 1294 | if openCount == 0: # illegal unmatched [[ 1295 | continue 1296 | if lmatch >= openCount: 1297 | lmatch -= openCount 1298 | if lmatch <= 1: # either close or stray } 1299 | break 1300 | else: 1301 | # put back unmatched 1302 | stack.append(openCount - lmatch) 1303 | break 1304 | if not stack: 1305 | yield m1.start(), end - lmatch 1306 | cur = end 1307 | break 1308 | elif len(stack) == 1 and 0 < stack[0] < ldelim: 1309 | # ambiguous {{{{{ }}} }} 1310 | yield m1.start() + stack[0], end 1311 | cur = end 1312 | break 1313 | elif brac == '[': # [[ 1314 | stack.append(-lmatch) 1315 | else: # ]] 1316 | while stack and stack[-1] < 0: # matching [[ 1317 | openCount = -stack.pop() 1318 | if lmatch >= openCount: 1319 | lmatch -= openCount 1320 | if lmatch <= 1: # either close or stray ] 1321 | break 1322 | else: 1323 | # put back unmatched (negative) 1324 | stack.append(lmatch - openCount) 1325 | break 1326 | if not stack: 1327 | yield m1.start(), end - lmatch 1328 | cur = end 1329 | break 1330 | # unmatched ]] are discarded 1331 | cur = end 1332 | 1333 | 1334 | def findBalanced(text, openDelim, closeDelim): 1335 | """ 1336 | Assuming that text contains a properly balanced expression using 1337 | :param openDelim: as opening delimiters and 1338 | :param closeDelim: as closing delimiters. 1339 | :return: an iterator producing pairs (start, end) of start and end 1340 | positions in text containing a balanced expression. 
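    Example (editor's illustration, not in the original source):

        >>> list(findBalanced('[[a|[[b]]c]] tail', ['[['], [']]']))
        [(0, 12)]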
1341 | """ 1342 | openPat = '|'.join([re.escape(x) for x in openDelim]) 1343 | # patter for delimiters expected after each opening delimiter 1344 | afterPat = {o: re.compile(openPat + '|' + c, re.DOTALL) for o, c in zip(openDelim, closeDelim)} 1345 | stack = [] 1346 | start = 0 1347 | cur = 0 1348 | # end = len(text) 1349 | startSet = False 1350 | startPat = re.compile(openPat) 1351 | nextPat = startPat 1352 | while True: 1353 | next = nextPat.search(text, cur) 1354 | if not next: 1355 | return 1356 | if not startSet: 1357 | start = next.start() 1358 | startSet = True 1359 | delim = next.group(0) 1360 | if delim in openDelim: 1361 | stack.append(delim) 1362 | nextPat = afterPat[delim] 1363 | else: 1364 | opening = stack.pop() 1365 | # assert opening == openDelim[closeDelim.index(next.group(0))] 1366 | if stack: 1367 | nextPat = afterPat[stack[-1]] 1368 | else: 1369 | yield start, next.end() 1370 | nextPat = startPat 1371 | start = next.end() 1372 | startSet = False 1373 | cur = next.end() 1374 | 1375 | # ---------------------------------------------------------------------- 1376 | # parser functions utilities 1377 | 1378 | 1379 | def ucfirst(string): 1380 | """:return: a string with just its first character uppercase 1381 | We can't use title() since it coverts all words. 1382 | """ 1383 | if string: 1384 | if len(string) > 1: 1385 | return string[0].upper() + string[1:] 1386 | else: 1387 | return string.upper() 1388 | else: 1389 | return '' 1390 | 1391 | 1392 | def lcfirst(string): 1393 | """:return: a string with its first character lowercase""" 1394 | if string: 1395 | if len(string) > 1: 1396 | return string[0].lower() + string[1:] 1397 | else: 1398 | return string.lower() 1399 | else: 1400 | return '' 1401 | 1402 | 1403 | def fullyQualifiedTemplateTitle(templateTitle): 1404 | """ 1405 | Determine the namespace of the page being included through the template 1406 | mechanism 1407 | """ 1408 | if templateTitle.startswith(':'): 1409 | # Leading colon by itself implies main namespace, so strip this colon 1410 | return ucfirst(templateTitle[1:]) 1411 | else: 1412 | m = re.match('([^:]*)(:.*)', templateTitle) 1413 | if m: 1414 | # colon found but not in the first position - check if it 1415 | # designates a known namespace 1416 | prefix = normalizeNamespace(m.group(1)) 1417 | if prefix in knownNamespaces: 1418 | return prefix + ucfirst(m.group(2)) 1419 | # The title of the page being included is NOT in the main namespace and 1420 | # lacks any other explicit designation of the namespace - therefore, it 1421 | # is resolved to the Template namespace (that's the default for the 1422 | # template inclusion mechanism). 1423 | 1424 | # This is a defense against pages whose title only contains UTF-8 chars 1425 | # that are reduced to an empty string. Right now I can think of one such 1426 | # case - which represents the non-breaking space. 1427 | # In this particular case, this page is a redirect to [[Non-nreaking 1428 | # space]], but having in the system a redirect page with an empty title 1429 | # causes numerous problems, so we'll live happier without it. 
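# Editor's illustration (not in the original source), assuming templatePrefix
# is 'Template:' (it is derived elsewhere from the dump's namespace names):
#   fullyQualifiedTemplateTitle(':Main Page')       -> 'Main Page'
#   fullyQualifiedTemplateTitle('Template:Infobox') -> 'Template:Infobox'
#   fullyQualifiedTemplateTitle('citation needed')  -> 'Template:Citation needed'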
1430 | if templateTitle: 1431 | return templatePrefix + ucfirst(templateTitle) 1432 | else: 1433 | return '' # caller may log as error 1434 | 1435 | 1436 | def normalizeNamespace(ns): 1437 | return ucfirst(ns) 1438 | 1439 | 1440 | # ---------------------------------------------------------------------- 1441 | # Parser functions 1442 | # see http://www.mediawiki.org/wiki/Help:Extension:ParserFunctions 1443 | # https://github.com/Wikia/app/blob/dev/extensions/ParserFunctions/ParserFunctions_body.php 1444 | 1445 | 1446 | class Infix(): 1447 | 1448 | """Infix operators. 1449 | The calling sequence for the infix is: 1450 | x |op| y 1451 | """ 1452 | 1453 | def __init__(self, function): 1454 | self.function = function 1455 | 1456 | def __ror__(self, other): 1457 | return Infix(lambda x, self=self, other=other: self.function(other, x)) 1458 | 1459 | def __or__(self, other): 1460 | return self.function(other) 1461 | 1462 | def __rlshift__(self, other): 1463 | return Infix(lambda x, self=self, other=other: self.function(other, x)) 1464 | 1465 | def __rshift__(self, other): 1466 | return self.function(other) 1467 | 1468 | def __call__(self, value1, value2): 1469 | return self.function(value1, value2) 1470 | 1471 | 1472 | ROUND = Infix(lambda x, y: round(x, y)) 1473 | 1474 | 1475 | def sharp_expr(expr): 1476 | try: 1477 | expr = re.sub('=', '==', expr) 1478 | expr = re.sub('mod', '%', expr) 1479 | expr = re.sub('\bdiv\b', '/', expr) 1480 | expr = re.sub('\bround\b', '|ROUND|', expr) 1481 | return unicode(eval(expr)) 1482 | except: 1483 | return '' 1484 | 1485 | 1486 | def sharp_if(testValue, valueIfTrue, valueIfFalse=None, *args): 1487 | # In theory, we should evaluate the first argument here, 1488 | # but it was evaluated while evaluating part[0] in expandTemplate(). 1489 | if testValue.strip(): 1490 | # The {{#if:}} function is an if-then-else construct. 1491 | # The applied condition is: "The condition string is non-empty". 1492 | valueIfTrue = valueIfTrue.strip() 1493 | if valueIfTrue: 1494 | return valueIfTrue 1495 | elif valueIfFalse: 1496 | return valueIfFalse.strip() 1497 | return "" 1498 | 1499 | 1500 | def sharp_ifeq(lvalue, rvalue, valueIfTrue, valueIfFalse=None, *args): 1501 | rvalue = rvalue.strip() 1502 | if rvalue: 1503 | # lvalue is always defined 1504 | if lvalue.strip() == rvalue: 1505 | # The {{#ifeq:}} function is an if-then-else construct. The 1506 | # applied condition is "is rvalue equal to lvalue". Note that this 1507 | # does only string comparison while MediaWiki implementation also 1508 | # supports numerical comparissons. 
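# Editor's illustration (not in the original source) of the string-only
# comparison described above:
#   sharp_ifeq('yes', 'yes', 'equal', 'different')  -> 'equal'
#   sharp_ifeq('01', '1', 'equal', 'different')     -> 'different'
#   (MediaWiki itself would also treat 01 and 1 as numerically equal)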
1509 | 1510 | if valueIfTrue: 1511 | return valueIfTrue.strip() 1512 | else: 1513 | if valueIfFalse: 1514 | return valueIfFalse.strip() 1515 | return "" 1516 | 1517 | 1518 | def sharp_iferror(test, then='', Else=None, *args): 1519 | if re.match('<(?:strong|span|p|div)\s(?:[^\s>]*\s+)*?class="(?:[^"\s>]*\s+)*?error(?:\s[^">]*)?"', test): 1520 | return then 1521 | elif Else is None: 1522 | return test.strip() 1523 | else: 1524 | return Else.strip() 1525 | 1526 | 1527 | def sharp_switch(primary, *params): 1528 | # FIXME: we don't support numeric expressions in primary 1529 | 1530 | # {{#switch: comparison string 1531 | # | case1 = result1 1532 | # | case2 1533 | # | case4 = result2 1534 | # | 1 | case5 = result3 1535 | # | #default = result4 1536 | # }} 1537 | 1538 | primary = primary.strip() 1539 | found = False # for fall through cases 1540 | default = None 1541 | rvalue = None 1542 | lvalue = '' 1543 | for param in params: 1544 | # handle cases like: 1545 | # #default = [http://www.perseus.tufts.edu/hopper/text?doc=Perseus...] 1546 | pair = param.split('=', 1) 1547 | lvalue = pair[0].strip() 1548 | rvalue = None 1549 | if len(pair) > 1: 1550 | # got "=" 1551 | rvalue = pair[1].strip() 1552 | # check for any of multiple values pipe separated 1553 | if found or primary in [v.strip() for v in lvalue.split('|')]: 1554 | # Found a match, return now 1555 | return rvalue 1556 | elif lvalue == '#default': 1557 | default = rvalue 1558 | rvalue = None # avoid defaulting to last case 1559 | elif lvalue == primary: 1560 | # If the value matches, set a flag and continue 1561 | found = True 1562 | # Default case 1563 | # Check if the last item had no = sign, thus specifying the default case 1564 | if rvalue is not None: 1565 | return lvalue 1566 | elif default is not None: 1567 | return default 1568 | return '' 1569 | 1570 | 1571 | # Extension Scribuntu 1572 | def sharp_invoke(module, function, frame): 1573 | functions = modules.get(module) 1574 | if functions: 1575 | funct = functions.get(function) 1576 | if funct: 1577 | # find parameters in frame whose title is the one of the original 1578 | # template invocation 1579 | templateTitle = fullyQualifiedTemplateTitle(function) 1580 | if not templateTitle: 1581 | logging.warn("Template with empty title") 1582 | pair = next((x for x in frame if x[0] == templateTitle), None) 1583 | if pair: 1584 | params = pair[1] 1585 | # extract positional args 1586 | params = [params.get(str(i + 1)) for i in range(len(params))] 1587 | return funct(*params) 1588 | else: 1589 | return funct() 1590 | return '' 1591 | 1592 | 1593 | parserFunctions = { 1594 | 1595 | '#expr': sharp_expr, 1596 | 1597 | '#if': sharp_if, 1598 | 1599 | '#ifeq': sharp_ifeq, 1600 | 1601 | '#iferror': sharp_iferror, 1602 | 1603 | '#ifexpr': lambda *args: '', # not supported 1604 | 1605 | '#ifexist': lambda *args: '', # not supported 1606 | 1607 | '#rel2abs': lambda *args: '', # not supported 1608 | 1609 | '#switch': sharp_switch, 1610 | 1611 | '# language': lambda *args: '', # not supported 1612 | 1613 | '#time': lambda *args: '', # not supported 1614 | 1615 | '#timel': lambda *args: '', # not supported 1616 | 1617 | '#titleparts': lambda *args: '', # not supported 1618 | 1619 | # This function is used in some pages to construct links 1620 | # http://meta.wikimedia.org/wiki/Help:URL 1621 | 'urlencode': lambda string, *rest: urlencode(string), 1622 | 1623 | 'lc': lambda string, *rest: string.lower() if string else '', 1624 | 1625 | 'lcfirst': lambda string, *rest: lcfirst(string), 1626 | 1627 | 'uc': 
lambda string, *rest: string.upper() if string else '', 1628 | 1629 | 'ucfirst': lambda string, *rest: ucfirst(string), 1630 | 1631 | 'int': lambda string, *rest: str(int(string)), 1632 | 1633 | } 1634 | 1635 | 1636 | def callParserFunction(functionName, args, frame): 1637 | """ 1638 | Parser functions have similar syntax as templates, except that 1639 | the first argument is everything after the first colon. 1640 | :return: the result of the invocation, None in case of failure. 1641 | 1642 | http://meta.wikimedia.org/wiki/Help:ParserFunctions 1643 | """ 1644 | 1645 | try: 1646 | if functionName == '#invoke': 1647 | # special handling of frame 1648 | ret = sharp_invoke(args[0].strip(), args[1].strip(), frame) 1649 | # logging.debug('parserFunction> %s %s', functionName, ret) 1650 | return ret 1651 | if functionName in parserFunctions: 1652 | ret = parserFunctions[functionName](*args) 1653 | # logging.debug('parserFunction> %s %s', functionName, ret) 1654 | return ret 1655 | except: 1656 | return "" # FIXME: fix errors 1657 | 1658 | return "" 1659 | 1660 | 1661 | # ---------------------------------------------------------------------- 1662 | # Extract Template definition 1663 | 1664 | reNoinclude = re.compile(r'<noinclude>(?:.*?)</noinclude>', re.DOTALL) 1665 | reIncludeonly = re.compile(r'<includeonly>|</includeonly>', re.DOTALL) 1666 | 1667 | # These are built before spawning processes, hence they are shared. 1668 | templates = {} 1669 | redirects = {} 1670 | # cache of parser templates 1671 | # FIXME: sharing this with a Manager slows down. 1672 | templateCache = {} 1673 | 1674 | 1675 | def define_template(title, page): 1676 | """ 1677 | Adds a template defined in the :param page:. 1678 | @see https://en.wikipedia.org/wiki/Help:Template#Noinclude.2C_includeonly.2C_and_onlyinclude 1679 | """ 1680 | global templates 1681 | global redirects 1682 | 1683 | # title = normalizeTitle(title) 1684 | 1685 | # check for redirects 1686 | m = re.match('#REDIRECT.*?\[\[([^\]]*)]]', page[0], re.IGNORECASE) 1687 | if m: 1688 | redirects[title] = m.group(1) # normalizeTitle(m.group(1)) 1689 | return 1690 | 1691 | text = unescape(''.join(page)) 1692 | 1693 | # We're storing template text for future inclusion, therefore, 1694 | # remove all <noinclude> text and keep all <includeonly> text 1695 | # (but eliminate <includeonly> tags per se). 1696 | # However, if <onlyinclude> ... </onlyinclude> parts are present, 1697 | # then only keep them and discard the rest of the template body. 1698 | # This is because using <onlyinclude> on a text fragment is 1699 | # equivalent to enclosing it in <includeonly> tags **AND** 1700 | # enclosing all the rest of the template body in <noinclude> tags.
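    # Illustrative example (added comment, not in the upstream source): for a template body of 'A<noinclude>B</noinclude><includeonly>C</includeonly>', the text stored for transclusion is 'AC': B is dropped together with its <noinclude> wrapper, while C is kept and only its <includeonly> tags are removed. If the body contains '<onlyinclude>D</onlyinclude>', only 'D' is stored and the rest of the body is discarded.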
1701 | 1702 | # remove comments 1703 | text = comment.sub('', text) 1704 | 1705 | # eliminate <noinclude> fragments 1706 | text = reNoinclude.sub('', text) 1707 | # eliminate unterminated <noinclude> elements 1708 | text = re.sub(r'<noinclude\s*>.*$', '', text, flags=re.DOTALL) 1709 | text = re.sub(r'<noinclude/>', '', text) 1710 | 1711 | onlyincludeAccumulator = '' 1712 | for m in re.finditer('<onlyinclude>(.*?)</onlyinclude>', text, re.DOTALL): 1713 | onlyincludeAccumulator += m.group(1) 1714 | if onlyincludeAccumulator: 1715 | text = onlyincludeAccumulator 1716 | else: 1717 | text = reIncludeonly.sub('', text) 1718 | 1719 | if text: 1720 | if title in templates: 1721 | logging.warn('Redefining: %s', title) 1722 | templates[title] = text 1723 | -------------------------------------------------------------------------------- /reader/wikiextractor/extractPage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # ============================================================================= 5 | # Version: 3.0 (July 22, 2020) 6 | # Author: Giuseppe Attardi (attardi@di.unipi.it), University of Pisa 7 | 8 | # ============================================================================= 9 | # Copyright (c) 2009. Giuseppe Attardi (attardi@di.unipi.it). 10 | # ============================================================================= 11 | # This file is part of Tanl. 12 | # 13 | # Tanl is free software; you can redistribute it and/or modify it 14 | # under the terms of the GNU Affero General Public License, version 3, 15 | # as published by the Free Software Foundation. 16 | # 17 | # Tanl is distributed in the hope that it will be useful, 18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 20 | # GNU Affero General Public License for more details. 21 | # 22 | # You should have received a copy of the GNU Affero General Public License 23 | # along with this program. If not, see <http://www.gnu.org/licenses/>. 24 | # ============================================================================= 25 | 26 | """Wikipedia Page Extractor: 27 | Extracts a single page from a Wikipedia dump file. 28 | """ 29 | 30 | import sys, os.path 31 | import re 32 | import argparse 33 | import bz2 34 | 35 | 36 | # Program version 37 | __version__ = '3.0.5' 38 | 39 | # ---------------------------------------------------------------------- 40 | # READER 41 | 42 | tagRE = re.compile(r'(.*?)<(/?\w+)[^>]*>(?:([^<]*)(<.*?>)?)?') 43 | #tagRE = re.compile(r'(.*?)<(/?\w+)[^>]*>([^<]*)') 44 | # 1 2 3 45 | 46 | def process_data(input_file, id, templates=False): 47 | """ 48 | :param input_file: name of the wikipedia dump file.
49 | :param id: article id 50 | """ 51 | 52 | if input_file.lower().endswith(".bz2"): 53 | input = bz2.open(input_file, mode='rt', encoding='utf-8') 54 | else: 55 | input = open(input_file) 56 | 57 | page = [] 58 | for line in input: 59 | line = line 60 | if '<' not in line: # faster than doing re.search() 61 | if page: 62 | page.append(line) 63 | continue 64 | m = tagRE.search(line) 65 | if not m: 66 | continue 67 | tag = m.group(2) 68 | if tag == 'page': 69 | page = [] 70 | page.append(line) 71 | inArticle = False 72 | elif tag == 'id': 73 | curid = m.group(3) 74 | if id == curid: 75 | page.append(line) 76 | inArticle = True 77 | elif not inArticle and not templates: 78 | page = [] 79 | elif tag == 'title': 80 | if templates: 81 | if m.group(3).startswith('Template:'): 82 | page.append(line) 83 | else: 84 | page = [] 85 | else: 86 | page.append(line) 87 | elif tag == '/page': 88 | if page: 89 | page.append(line) 90 | print(''.join(page)) 91 | if not templates: 92 | break 93 | page = [] 94 | elif page: 95 | page.append(line) 96 | 97 | input.close() 98 | 99 | def main(): 100 | parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]), 101 | formatter_class=argparse.RawDescriptionHelpFormatter, 102 | description=__doc__) 103 | parser.add_argument("input", 104 | help="XML wiki dump file") 105 | parser.add_argument("--id", default="1", 106 | help="article id") 107 | parser.add_argument("--template", action="store_true", 108 | help="extract Template pages as well") 109 | parser.add_argument("-v", "--version", action="version", 110 | version='%(prog)s ' + __version__, 111 | help="print program version") 112 | 113 | args = parser.parse_args() 114 | 115 | process_data(args.input, args.id, args.template) 116 | 117 | if __name__ == '__main__': 118 | main() 119 | -------------------------------------------------------------------------------- /run/pipeline.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import time 3 | import os 4 | import logging 5 | import copy 6 | from glob import glob 7 | 8 | import threading 9 | from multiprocessing import Process 10 | import multiprocessing as mp 11 | 12 | def run_bash(args,cmd): 13 | cmd = cmd.format(**args) 14 | print('cmd:',cmd) 15 | os.system(cmd) 16 | 17 | def check_output_dir(output_dir): 18 | if not os.path.exists(output_dir): 19 | os.mkdir(output_dir) 20 | 21 | POOL_SIZE = 9 22 | MAX_EDIT = 0.1 23 | MIN_FILESIZE = 5 # KB 24 | 25 | global_args = { 26 | "code_root" : '/nfs/users/xueyou/github/wiki-error-corpus', 27 | # "xml_dump": '/nfs/users/xueyou/data/speller/wiki/zhwiki-20211201-pages-meta-history1.xml-p2981p11534', 28 | "input_dir" : '/data/xueyou/data/speller/wiki/', 29 | "output_dir" : '/data/xueyou/data/speller/wiki/', 30 | 'max_edit': MAX_EDIT 31 | } 32 | 33 | for xml_dump_file in list(glob('/data/xueyou/data/speller/wiki/*.7z')): 34 | global_args['xml_dump'] = xml_dump_file.replace('.7z','') 35 | 36 | # Stage 1 37 | # Extract 7z file 38 | print(f'extract {xml_dump_file}') 39 | cmd = f'7z e {xml_dump_file}' 40 | run_bash({},cmd) 41 | 42 | # Stage 2 43 | # Divide the large XML revision dump file into per page revisions.
44 | print('divide XML file') 45 | cmd = 'python {code_root}/reader/divide_xml_revisions.py {xml_dump} {output_dir}' 46 | args = copy.deepcopy(global_args) 47 | args['output_dir'] = args['output_dir'] + 'stage1' 48 | check_output_dir(args['output_dir']) 49 | run_bash(args,cmd) 50 | 51 | 52 | # Stage 3 53 | # Extract revisions from page history 54 | cmd = 'python {code_root}/reader/extract_revisions_new.py {input_dir} {input_file} {output_dir}' 55 | input_dir = global_args['input_dir'] + 'stage1' 56 | output_dir = global_args['output_dir'] + 'stage3' 57 | check_output_dir(output_dir) 58 | 59 | pool = mp.Pool(processes = POOL_SIZE) 60 | for fname in glob(input_dir + '/*.xml'): 61 | fsize = os.path.getsize(fname) / 1024 # KB 62 | if fsize < MIN_FILESIZE: 63 | # print(f'small size, skip {fname}') 64 | continue 65 | args = copy.deepcopy(global_args) 66 | args['input_dir'] = input_dir 67 | args['output_dir'] = output_dir 68 | args['input_file'] = os.path.basename(fname) 69 | pool.apply_async(run_bash,(args, cmd)) 70 | pool.close() 71 | pool.join() 72 | 73 | # Stage 4 74 | # Extract errors with edit distance 75 | cmd = 'python {code_root}/reader/extract_spelling_errors_new.py {input_dir} {input_file} {output_dir} zh {max_edit}' 76 | input_dir = global_args['input_dir'] + 'stage3' 77 | output_dir = global_args['output_dir'] + 'stage4' 78 | check_output_dir(output_dir) 79 | 80 | pool = mp.Pool(processes = POOL_SIZE) 81 | for fname in glob(input_dir + '/*.xml'): 82 | basename = os.path.basename(fname) 83 | args = copy.deepcopy(global_args) 84 | args['input_dir'] = input_dir 85 | args['output_dir'] = output_dir 86 | args['input_file'] = basename 87 | pool.apply_async(run_bash,(args, cmd)) 88 | pool.close() 89 | pool.join() 90 | 91 | # Stage 5 92 | # Collect all the errors 93 | input_dir = global_args['input_dir'] + 'stage4' 94 | output_dir = global_args['input_dir'] + 'stage5' 95 | check_output_dir(output_dir) 96 | with open(output_dir + '/error_sent.txt','a') as ef,open(output_dir + '/ori_sent.txt','a') as of: 97 | for fname in glob(input_dir + '/*.xml_error_sen.txt'): 98 | basename = os.path.basename(fname).split('.')[0] 99 | # err_f.write(open(input_dir + '/' + basename + '.xml_spelling_error.txt').read()) 100 | ef.write(open(input_dir + '/' + basename + '.xml_error_sen.txt').read()) 101 | of.write(open(input_dir + '/' + basename + '.xml_orig_sen.txt').read()) 102 | 103 | # Stage 6 104 | # Clean up temporary files 105 | print('clean up tmp files') 106 | check_output_dir('./extracted') 107 | cmd = f'''rm {global_args['xml_dump']} 108 | rm -rf stage1 stage3 stage4 109 | mv {xml_dump_file} extracted 110 | ''' 111 | run_bash({},cmd) 112 | print('all done') 113 | 114 | 115 | 116 | 117 | --------------------------------------------------------------------------------
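
Since the final stage only concatenates the per-page outputs, stage5 ends up with two plain-text files, error_sent.txt and ori_sent.txt, written in the same order by the collection loop above. The sketch below shows one way to turn them into parallel records; it assumes the two files are line-aligned, and the function name `pair_stage5`, the output filename, and the `src`/`tgt` key names are illustrative rather than part of this repo (which of the two files holds the erroneous side depends on how extract_spelling_errors_new.py writes them).

```python
# Minimal sketch (assumption: error_sent.txt and ori_sent.txt are line-aligned).
import json

def pair_stage5(error_path='stage5/error_sent.txt',
                orig_path='stage5/ori_sent.txt',
                out_path='stage5/parallel.jsonl'):
    # Zip the two files line by line and keep only non-empty, differing pairs.
    with open(error_path, encoding='utf-8') as ef, \
         open(orig_path, encoding='utf-8') as of, \
         open(out_path, 'w', encoding='utf-8') as out:
        for err, ori in zip(ef, of):
            err, ori = err.strip(), ori.strip()
            if err and ori and err != ori:
                out.write(json.dumps({'src': err, 'tgt': ori}, ensure_ascii=False) + '\n')

if __name__ == '__main__':
    pair_stage5()
```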