├── .gitignore
├── README.md
├── reader
│   ├── __init__.py
│   ├── divide_xml_revisions.py
│   ├── divide_xml_revisions_new.py
│   ├── extract_revisions.py
│   ├── extract_revisions_new.py
│   ├── extract_spelling_errors.py
│   ├── extract_spelling_errors_new.py
│   ├── fix_extracted.py
│   ├── utils.py
│   └── wikiextractor
│       ├── WikiExtractor.py
│       ├── __init__.py
│       ├── cirrus-extract.py
│       ├── clean.py
│       ├── extract.py
│       └── extractPage.py
└── run
    └── pipeline.py
/.gitignore:
--------------------------------------------------------------------------------
1 | .vscode/
2 | # Byte-compiled / optimized / DLL files
3 | __pycache__/
4 | *.py[cod]
5 | *$py.class
6 |
7 | # C extensions
8 | *.so
9 |
10 | # Distribution / packaging
11 | .Python
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | pip-wheel-metadata/
25 | share/python-wheels/
26 | *.egg-info/
27 | .installed.cfg
28 | *.egg
29 | MANIFEST
30 |
31 | # PyInstaller
32 | # Usually these files are written by a python script from a template
33 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
34 | *.manifest
35 | *.spec
36 |
37 | # Installer logs
38 | pip-log.txt
39 | pip-delete-this-directory.txt
40 |
41 | # Unit test / coverage reports
42 | htmlcov/
43 | .tox/
44 | .nox/
45 | .coverage
46 | .coverage.*
47 | .cache
48 | nosetests.xml
49 | coverage.xml
50 | *.cover
51 | *.py,cover
52 | .hypothesis/
53 | .pytest_cache/
54 |
55 | # Translations
56 | *.mo
57 | *.pot
58 |
59 | # Django stuff:
60 | *.log
61 | local_settings.py
62 | db.sqlite3
63 | db.sqlite3-journal
64 |
65 | # Flask stuff:
66 | instance/
67 | .webassets-cache
68 |
69 | # Scrapy stuff:
70 | .scrapy
71 |
72 | # Sphinx documentation
73 | docs/_build/
74 |
75 | # PyBuilder
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | .python-version
87 |
88 | # pipenv
89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
92 | # install all needed dependencies.
93 | #Pipfile.lock
94 |
95 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
96 | __pypackages__/
97 |
98 | # Celery stuff
99 | celerybeat-schedule
100 | celerybeat.pid
101 |
102 | # SageMath parsed files
103 | *.sage.py
104 |
105 | # Environments
106 | .env
107 | .venv
108 | env/
109 | venv/
110 | ENV/
111 | env.bak/
112 | venv.bak/
113 |
114 | # Spyder project settings
115 | .spyderproject
116 | .spyproject
117 |
118 | # Rope project settings
119 | .ropeproject
120 |
121 | # mkdocs documentation
122 | /site
123 |
124 | # mypy
125 | .mypy_cache/
126 | .dmypy.json
127 | dmypy.json
128 |
129 | # Pyre type checker
130 | .pyre/
131 |
132 | tcdata/
133 | user_data/*
134 | !user_data/extra_data
135 | !user_data/track3
136 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # WiKi-Error-Extract
2 |
3 | **\*\*\*\*\* Update 2022-03-04 \*\*\*\*\***
4 |
5 | Since this pipeline is still quite slow to run, you can simply use the results I have already extracted:
6 | 
7 | [Download link (password: c4qk)](https://pan.baidu.com/s/1PovlwB9H1Zu-Jv_WN9xZnQ)
8 | 
9 | > It is recommended to filter the extracted errors with a confusion set (a sketch of such a filter follows the command below).
10 | 
11 | The extraction logic has also been reworked so that it no longer reads and writes a large number of small files. Run:
12 |
13 | ```python
14 | python code_dir/reader/divide_xml_revisions_new.py unzipped_file_path output_file_path 0.1
15 | ```
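
The output file contains one JSON object per line with `src`/`tgt` keys. As suggested above, it is worth filtering the pairs with a confusion set. A minimal sketch of such a filter (the `confusion.txt` format and the file names here are assumptions for illustration, not part of this repository):

```python
import json

# Hypothetical confusion-set file: one line per character, "字 候选字们"
confusions = {}
with open('confusion.txt', encoding='utf-8') as f:
    for line in f:
        parts = line.split()
        if len(parts) == 2:
            confusions[parts[0]] = set(parts[1])

def plausible(src, tgt):
    """Keep a pair only if every changed character is a known confusion of the original."""
    if len(src) != len(tgt):
        return False
    changed = [(a, b) for a, b in zip(src, tgt) if a != b]
    return bool(changed) and all(b in confusions.get(a, set()) for a, b in changed)

with open('output_file_path', encoding='utf-8') as fin, \
     open('filtered.jsonl', 'w', encoding='utf-8') as fout:
    for line in fin:
        pair = json.loads(line)
        if plausible(pair['src'], pair['tgt']):
            fout.write(json.dumps(pair, ensure_ascii=False) + '\n')
```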
16 |
17 | **\*\*\*\*\* Update End \*\*\*\*\***
18 |
19 | This project extracts parallel error-correction corpora from the Wikipedia edit history. Only sentence pairs whose characters are aligned one-to-one are extracted (see the sketch below); character insertions and deletions are not considered. Modify the code yourself if you need those cases.
20 |
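Concretely, two versions of the same sentence are kept as a pair only when they have the same length and differ in a small number of characters; the last command-line argument (`0.1` above) caps the fraction of characters that may differ. A condensed view of the check performed by `check_error` in reader/extract_spelling_errors_new.py (the real check additionally requires the changed characters to be Chinese and the sentence to be mostly Chinese):

```python
import math

def is_one_to_one_edit(earlier: str, current: str, number_of_edits: float = 0.1) -> bool:
    """Condensed version of check_error(): accept only 1-1 character-level edits."""
    if earlier == current or len(earlier) != len(current) or len(earlier) < 5:
        return False
    # count positions whose character changed between the two revisions
    diffs = sum(1 for a, b in zip(earlier, current) if a != b)
    # allow roughly number_of_edits of the characters to change (bounded to 1..10)
    threshold = min(max(math.ceil(number_of_edits * len(current)), 1), 10)
    return 1 <= diffs <= threshold
```
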
21 | Steps:
22 | 
23 | 1. Download the zhwiki data with the complete edit history from the Wikipedia dumps. I used the "All pages with complete edit history (.7z)" files from the [20211201](https://dumps.wikimedia.org/zhwiki/20211201/) dump.
24 | 
25 | 2. Put all the .7z files into a single directory A and create an `extracted` subdirectory inside it. Update the corresponding directory paths in run/pipeline.py.
26 | 
27 | 3. Run the pipeline script from inside directory A: `python <wiki-error-extract dir>/run/pipeline.py` (a rough sketch of such a driver is given after this list).
28 | 
29 | 4. The final results will be written to the stage5 folder under directory A.
30 |
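run/pipeline.py itself is not reproduced in this listing. Purely as an illustration of how the steps fit together (an assumption based on the reader scripts, not the actual driver), a minimal driver built around the new single-pass script might look as follows; the directory paths and the use of the system `7z` binary are assumptions:

```python
import glob
import os
import subprocess

DATA_DIR = '/path/to/A'                    # directory holding the *.7z history dumps (assumption)
CODE_DIR = '/path/to/wiki-error-extract'   # this repository (assumption)
OUT_FILE = os.path.join(DATA_DIR, 'extracted', 'pairs.jsonl')

os.makedirs(os.path.dirname(OUT_FILE), exist_ok=True)
for archive in sorted(glob.glob(os.path.join(DATA_DIR, '*.7z'))):
    # unpack one history archive at a time with the system 7z binary
    subprocess.run(['7z', 'x', '-y', archive, f'-o{DATA_DIR}'], check=True)
    for xml_file in glob.glob(os.path.join(DATA_DIR, '*pages-meta-history*.xml*')):
        if xml_file.endswith('.7z'):
            continue
        # split pages, clean revisions and append src/tgt pairs in one pass
        subprocess.run(['python',
                        os.path.join(CODE_DIR, 'reader', 'divide_xml_revisions_new.py'),
                        xml_file, OUT_FILE, '0.1'], check=True)
        os.remove(xml_file)  # keep disk usage bounded
```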
31 |
32 | ## Example
33 |
34 | ```
35 | [{'src': '比如我把勾股定理叫做勾理定理,并不意味着我认为为它是中国人最先证明的,而是"勾"股"弦"本来就是指直角三角形的三边。',
36 | 'tgt': '比如我把勾股定理叫做勾股定理,并不意味着我认为为它是中国人最先证明的,而是"勾"股"弦"本来就是指直角三角形的三边。'},
37 | {'src': '最后在反对杀局的民建联议员黄容根及临区局主席刘皇发在投票前「巧合」地失踪,而支持杀局的李国宝在投票前二十秒赶到等「戏剧化」情节下而令草案得到通过。',
38 | 'tgt': '最后在反对杀局的民建联议员黄容根及临区局主席刘皇发在投票前「巧合」地失踪,而支持杀局的李国宝在投票前20秒赶到等「戏剧化」情节下而令草案得到通过。'},
39 | {'src': '禁止公众摄影图书馆 康文署拒放宽.', 'tgt': '禁止公众拍摄图书馆 康文署拒放宽.'},
40 | {'src': '上白礁谒祖祭典由台湾台南县学甲镇慈济宫所举行,于每年农历三月十一日举行,即当地所称上白礁活动,于当地将军溪畔「头前寮」举行祭典,隔海遥祭保生大帝祖庙,即中国大陆福建白礁慈济宫。',
41 | 'tgt': '上白礁谒祖祭典由台湾台南市学甲区慈济宫所举行,于每年农历三月十一日举行,即当地所称上白礁活动,于当地将军溪畔「头前寮」举行祭典,隔海遥祭保生大帝祖庙,即中国大陆福建白礁慈济宫。'},
42 | {'src': '1980年后,改善分离的选择性成为色谱工作者的主要问题,人们越来越认识到改变流动相的组成事提高选择性的关键。',
43 | 'tgt': '1980年后,改善分离的选择性成为色谱工作者的主要问题,人们越来越认识到改变流动相的组成是提高选择性的关键。'},
44 | {'src': '德国战车最大的缺点是生产速度慢,直到战终德国主力四号战车也仅生产九千多、五号豹式六千多部。',
45 | 'tgt': '德国战车最大的缺点是生产速度慢,直到战终德国主力四号坦克也仅生产九千多、五号豹式六千多部。'},
46 | {'src': '由于过多贵格会信仰者居住于费城,因此费城人又称"贵格会信仰者"。',
47 | 'tgt': '由于许多贵格会信仰者居住于费城,因此费城人又称"贵格会信仰者"。'},
48 | {'src': '进入校门,映入眼帘的即是建中的指标建筑红楼,其建于日治时期(1909年),目前已列入台北市市定古迹。',
49 | 'tgt': '进入校门,映入眼帘的即是建中的指标建筑红楼,其建于日据时期(1909年),目前已列入台北市市定古迹。'},
50 | {'src': '2015年1月20日,该校一名姓陈17岁中五女生因担心迟到而追小巴,却被撞至阴道重创昏迷。',
51 | 'tgt': '2015年1月20日,该校一名姓陈17岁中五女生因担心迟到而追小巴,却被撞至头部重创昏迷。'},
52 | {'src': '在1909年埃希纳·赫茨普龙是第一位提出天狼星是大熊座移动星团之一的人,他在观测天狼星系统在天空中的移动路径之后得出这个结论。',
53 | 'tgt': '在1909年埃希纳·赫茨普龙是第一位提出天狼星是大熊座移动星群之一的人,他在观测天狼星系统在天空中的移动路径之后得出这个结论。'}]
54 | ```
--------------------------------------------------------------------------------
/reader/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xueyouluo/wiki-error-extract/7871514c89bb3e6e2ceb314ac4f5eba94d75bb7a/reader/__init__.py
--------------------------------------------------------------------------------
/reader/divide_xml_revisions.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """Divide the large XML revision dump file into per page revisions.
4 |
5 | """
6 | import codecs
7 | import os
8 | import xml.sax
9 | import xml.sax.saxutils
10 |
11 |
12 | html_escape_table = {
13 | u'‘': "'",
14 | u'’': "'",
15 | u'“': '"',
16 | u'”': '"'
17 | }
18 |
19 | html_unescape_table = {v:k for k, v in html_escape_table.items()}
20 |
21 | def html_escape(text):
22 | return xml.sax.saxutils.escape(text, html_escape_table)
23 |
24 | def html_unescape(text):
25 | return xml.sax.saxutils.unescape(text, html_unescape_table)
26 |
27 |
28 | class WikiRevisionDumpHandler(xml.sax.ContentHandler):
29 | wiki_dump_tags = set(['page', 'title', 'ns', 'id', 'revision', 'parentid',
30 | 'timestamp', 'contributor', 'ip', 'username',
31 | 'comment', 'model', 'format', 'text', 'sha1'])
32 | file_counter = 0
33 | file_handle = ''
34 |
35 | def __init__(self, input_file, output_dir):
36 | # Input/output locations
37 | self.input_file = input_file
38 | self.output_dir = output_dir
39 |
40 | # Recent tag visited by SAX parser
41 | self.curr_tag = ''
42 | self.content = ''
43 |
44 | def startElement(self, tag, attributes):
45 | self.curr_tag = tag
46 | if self.curr_tag == 'page':
47 | # close the unclosed handles first if any
48 | if self.file_handle:
49 | self.file_handle.close()
50 | fname = repr(self.file_counter).zfill(10) + '.xml'
51 | abspath = self.output_dir + '/' + fname
52 | print('Writing to file: ', abspath )
53 | self.file_handle = codecs.open(abspath, 'w', 'utf-8')
54 | self.file_handle.write(self.tag_start('page')+'\n')
55 | elif self.curr_tag in self.wiki_dump_tags:
56 | self.file_handle.write(self.tag_start(self.curr_tag))
57 |
58 | def endElement(self, tag):
59 | self.curr_tag = tag
60 | if self.curr_tag == 'page':
61 | self.file_handle.write(self.tag_end('page'))
62 | self.file_handle.close()
63 | self.file_counter += 1
64 | elif self.curr_tag in self.wiki_dump_tags:
65 | self.file_handle.write(self.tag_end(self.curr_tag))
66 |
67 | def characters(self, contents):
68 | self.content = contents
69 | if self.curr_tag != 'page' and self.curr_tag in self.wiki_dump_tags:
70 | self.file_handle.write(html_escape(self.content))
71 |
72 | @staticmethod
73 |     def surround_wih_tag(tag, cont): return '<'+tag+'>'+cont+'</'+tag+'>'
74 |
75 | @staticmethod
76 | def tag_start(tag): return '<'+tag+'>'
77 |
78 | @staticmethod
79 |     def tag_end(tag): return '</'+tag+'>'
80 |
81 |
82 | if __name__ == '__main__':
83 | import argparse
84 | arg_parser = argparse.ArgumentParser(description='Script for dividing the large XML revision dump into individual page revisions.')
85 | arg_parser.add_argument('input_file', help='XML revision dump file name')
86 | arg_parser.add_argument('output_dir', help='Output directory')
87 | args = arg_parser.parse_args()
88 | if not os.path.exists(args.output_dir):
89 | os.makedirs(args.output_dir)
90 |
91 | # SAX XML reader
92 | xml_parser = xml.sax.make_parser()
93 |
94 | revision_dump_handler = WikiRevisionDumpHandler(args.input_file, args.output_dir)
95 | xml_parser.setContentHandler(revision_dump_handler)
96 | xml_parser.parse(args.input_file)
97 |
--------------------------------------------------------------------------------
/reader/divide_xml_revisions_new.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """Divide the large XML revision dump file into per page revisions.
4 |
5 | """
6 | import codecs
7 | import os
8 | import xml.sax
9 | import xml.sax.saxutils
10 | import io
11 | import json
12 | from extract_revisions_new import extract_revisions
13 | from extract_spelling_errors_new import converter,check_error
14 |
15 | html_escape_table = {
16 | u'‘': "'",
17 | u'’': "'",
18 | u'“': '"',
19 | u'”': '"'
20 | }
21 |
22 | html_unescape_table = {v:k for k, v in html_escape_table.items()}
23 |
24 | def html_escape(text):
25 | return xml.sax.saxutils.escape(text, html_escape_table)
26 |
27 | def html_unescape(text):
28 | return xml.sax.saxutils.unescape(text, html_unescape_table)
29 |
30 |
31 | def extract_errors(content,number_of_edits,outfile):
32 | buffer = ''
33 | # extract revisions
34 | for timestamp,text in extract_revisions(io.StringIO(content)):
35 | if len(text) > 10:
36 | buffer += '\n\n[Revision timestamp: ' + timestamp + ']\n\n'
37 | buffer += text
38 |
39 | revisions = []
40 | line = []
41 |
42 | srcs,tgts = [],[]
43 | pre_revision = ''
44 | current_revision = ''
45 | cnt = 0
46 | for line in buffer.splitlines():
47 | line = converter.convert(line)
48 | if "Revision timestamp" in line:
49 | if current_revision:
50 | if pre_revision:
51 | cnt += 1
52 | # if cnt % 100 == 0:
53 | # print('processed',cnt)
54 | check_error(pre_revision,current_revision.strip(),srcs,tgts,number_of_edits)
55 | pre_revision = current_revision.strip()
56 | current_revision = ''
57 | else:
58 | current_revision += line
59 | if current_revision and pre_revision:
60 | check_error(pre_revision,current_revision.strip(),srcs,tgts,number_of_edits)
61 |
62 |
63 | keeps = []
64 | if srcs:
65 | errors = set()
66 | for src,tgt in zip(srcs[::-1],tgts[::-1]):
67 | if src in errors or tgt in errors:
68 | continue
69 | errors.add(src)
70 | errors.add(tgt)
71 | keeps.append({"src":src,'tgt':tgt})
72 |
73 | if keeps:
74 | for x in keeps:
75 | outfile.write(json.dumps(x,ensure_ascii=False) + '\n')
76 |
77 | class WikiRevisionDumpHandler(xml.sax.ContentHandler):
78 | wiki_dump_tags = set(['page', 'title', 'ns', 'id', 'revision', 'parentid',
79 | 'timestamp', 'contributor', 'ip', 'username',
80 | 'comment', 'model', 'format', 'text', 'sha1'])
81 | file_counter = 0
82 | file_handle = ''
83 |
84 | def __init__(self, input_file, output_file, number_of_edits):
85 | # Input/output locations
86 | self.input_file = input_file
87 | self.output_file = open(output_file,'a')
88 | self.number_of_edits = number_of_edits
89 |
90 | # Recent tag visited by SAX parser
91 | self.curr_tag = ''
92 | self.content = ''
93 |
94 | def startElement(self, tag, attributes):
95 | self.curr_tag = tag
96 | if self.curr_tag == 'page':
97 | # close the unclosed handles first if any
98 | if self.file_handle:
99 | self.file_handle = ''
100 | self.file_handle += self.tag_start('page')+'\n'
101 | elif self.curr_tag in self.wiki_dump_tags:
102 | self.file_handle += self.tag_start(self.curr_tag)
103 |
104 | def endElement(self, tag):
105 | self.curr_tag = tag
106 | if self.curr_tag == 'page':
107 | self.file_handle += self.tag_end('page')
108 | self.file_counter += 1
109 | extract_errors(self.file_handle,self.number_of_edits,self.output_file)
110 | self.file_handle = ''
111 | # if self.file_counter % 100 == 0:
112 | print(f'{self.input_file} processed {self.file_counter} pages')
113 | elif self.curr_tag in self.wiki_dump_tags:
114 | self.file_handle += self.tag_end(self.curr_tag)
115 |
116 | def characters(self, contents):
117 | self.content = contents
118 | if self.curr_tag != 'page' and self.curr_tag in self.wiki_dump_tags:
119 | self.file_handle += html_escape(self.content)
120 |
121 | @staticmethod
122 |     def surround_wih_tag(tag, cont): return '<'+tag+'>'+cont+'</'+tag+'>'
123 |
124 | @staticmethod
125 | def tag_start(tag): return '<'+tag+'>'
126 |
127 | @staticmethod
128 |     def tag_end(tag): return '</'+tag+'>'
129 |
130 |
131 | if __name__ == '__main__':
132 | import argparse
133 | arg_parser = argparse.ArgumentParser(description='Script for dividing the large XML revision dump into individual page revisions.')
134 | arg_parser.add_argument('input_file', help='XML revision dump file name')
135 | arg_parser.add_argument('output_file', help='Output file')
136 | arg_parser.add_argument('number_of_edits', help='number_of_edits')
137 | args = arg_parser.parse_args()
138 | number_of_edits = float(args.number_of_edits)
139 | if not os.path.exists(os.path.dirname(args.output_file)):
140 | os.makedirs(os.path.dirname(args.output_file))
141 |
142 | # SAX XML reader
143 | xml_parser = xml.sax.make_parser()
144 |
145 | revision_dump_handler = WikiRevisionDumpHandler(args.input_file, args.output_file,number_of_edits)
146 | xml_parser.setContentHandler(revision_dump_handler)
147 | xml_parser.parse(args.input_file)
148 |
--------------------------------------------------------------------------------
/reader/extract_revisions.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | import codecs
4 | import os
5 | import xml.sax
6 | import xml.sax.saxutils
7 |
8 | from fix_extracted import fix_extraction
9 |
10 |
11 | html_escape_table = {
12 | u'‘': "'",
13 | u'’': "'",
14 | u'“': '"',
15 | u'”': '"',
16 |     u'&': '&amp;'
17 | }
18 |
19 | html_unescape_table = {v:k for k, v in html_escape_table.items()}
20 |
21 | def html_escape(text):
22 | return xml.sax.saxutils.escape(text, html_escape_table)
23 |
24 | def html_unescape(text):
25 | return xml.sax.saxutils.unescape(text, html_unescape_table)
26 |
27 |
28 | class WikiRevisionHandler(xml.sax.ContentHandler):
29 | input_file = 'wiki.xml'
30 | output_dir = '.'
31 | wiki_dump_tags = set(['page', 'title', 'ns', 'id', 'revision', 'parentid',
32 | 'timestamp', 'contributor', 'ip', 'username',
33 | 'comment', 'model', 'format', 'text', 'sha1'])
34 | file_handle = ''
35 |
36 | def __init__(self, input_file, output_file):
37 | # Input/output locations
38 | self.input_file = input_file
39 | self.output_file = output_file
40 |
41 | # Recent tag visited by SAX parser
42 | self.curr_tag = ''
43 | self.content = ''
44 |
45 | # Revisions
46 | self.revisions = []
47 | self.curr_rev = []
48 | self.rev_start = False
49 | self.ts_start = False
50 | self.timestamps = []
51 |
52 |
53 | def startElement(self, tag, attributes):
54 | self.curr_tag = tag
55 | if self.curr_tag == 'timestamp':
56 | self.ts_start = True
57 | if self.curr_tag == 'revision':
58 | self.rev_start = True
59 | if self.curr_tag == 'page':
60 | # close the unclosed handles first if any
61 | if self.file_handle:
62 | self.file_handle.close()
63 | print('Writing to file: ', self.output_file)
64 | self.file_handle = codecs.open(self.output_file, 'w', 'utf-8')
65 | # self.file_handle.write(self.tag_start('page')+'\n')
66 | #elif self.curr_tag in self.wiki_dump_tags:
67 | # self.file_handle.write(self.tag_start(self.curr_tag))
68 |
69 | def endElement(self, tag):
70 | self.curr_tag = tag
71 | if self.curr_tag == 'timestamp':
72 | self.ts_start = False
73 | if self.curr_tag == 'revision':
74 | self.rev_start = False
75 | if len(self.curr_rev) > 0:
76 | self.revisions.append(self.curr_rev)
77 | self.curr_rev = []
78 | if self.curr_tag == 'page':
79 | # self.file_handle.write(self.tag_end('page'))
80 | print('revisions',len(self.revisions))
81 | ts_revs = list(zip(self.timestamps, self.revisions))
82 | for t_r in ts_revs[::-1]:
83 | self.file_handle.write('\n[Revision timestamp: ' + t_r[0] + ']\n')
84 | html_escaped = html_escape(''.join(t_r[1]))
85 | self.file_handle.write(html_escaped)
86 | self.file_handle.close()
87 | #elif self.curr_tag in self.wiki_dump_tags:
88 | # self.file_handle.write(self.tag_end(self.curr_tag))
89 |
90 | def characters(self, contents):
91 | self.content = contents
92 | if self.curr_tag == 'text' and self.rev_start:
93 | self.curr_rev.append(self.content)
94 | if self.curr_tag == 'timestamp' and self.ts_start:
95 | self.timestamps.append(self.content)
96 | #self.file_handle.write('[Revision timestamp: ' + self.content + ']\n')
97 | #if self.curr_tag != 'page' and self.curr_tag in self.wiki_dump_tags:
98 | # self.file_handle.write(html_escape(self.content))
99 |
100 |
101 | class WikiRevErrorHandler(xml.sax.handler.ErrorHandler):
102 |
103 | def error(self, exception):
104 | pass
105 |
106 | def fatalError(self, exception):
107 | pass
108 |
109 | def warning(self, exception):
110 | pass
111 |
112 |
113 |
114 | if __name__ == '__main__':
115 | import argparse
116 | arg_parser = argparse.ArgumentParser(description='Script for dividing the large XML revision dump into individual page revisions.')
117 | arg_parser.add_argument('input_dir', help='Input dir')
118 | arg_parser.add_argument('input_file', help='Input file')
119 | arg_parser.add_argument('output_dir', help='Output dir')
120 | args = arg_parser.parse_args()
121 |
122 | # fix extraction
123 | fix_extraction(args.input_dir,args.input_file,args.input_dir)
124 |
125 | input_file = args.input_dir + '/' + args.input_file
126 | output_file = args.output_dir + '/' + args.input_file
127 | # SAX XML reader
128 | xml_parser = xml.sax.make_parser()
129 |
130 | revision_handler = WikiRevisionHandler(input_file, output_file)
131 | wiki_err_handler = WikiRevErrorHandler()
132 | xml_parser.setContentHandler(revision_handler)
133 | xml_parser.setErrorHandler(wiki_err_handler)
134 | xml_parser.parse(input_file)
135 |
136 |
--------------------------------------------------------------------------------
/reader/extract_revisions_new.py:
--------------------------------------------------------------------------------
1 | import re
2 | import logging
3 | from wikiextractor.extract import Extractor
4 |
5 | extractor = Extractor('##NoName##')
6 |
7 |
8 | def clean_revison(revision):
9 | # fix text
10 | revision = '\n'.join(revision)
11 |
12 |     m = re.search(r'<text.*?>(.*?)</text>', revision, flags=re.DOTALL)
13 | if m:
14 | text = m.group(1)
15 | else:
16 | logging.warning('Missing text element')
17 | return None
18 |
19 | text = extractor.extract(text)
20 |     m = re.search(r'<timestamp>(.*?)</timestamp>', revision)
21 | timestamp = 'none'
22 | if m:
23 | timestamp = m.group(1)
24 | return (timestamp,text)
25 |
26 | def extract_revisions(fname):
27 | revision_cnt = 0
28 | revison_content = []
29 | revison_area = False
30 | if isinstance(fname,str):
31 | fname = open(fname)
32 | for line in fname:
33 |         if '<revision>' in line:
34 |             # If the revision buffer already has content, something went wrong; discard it.
35 |             if revison_content:
36 |                 revison_content = []
37 |             revison_area = True
38 | 
39 |         if '</revision>' in line:
40 | revision_cnt += 1
41 | if revision_cnt % 100 == 0:
42 | print(fname, 'revision cnt', revision_cnt)
43 | revison_content.append(line)
44 | fixed = clean_revison(revison_content)
45 | if fixed is not None:
46 | yield fixed
47 | revison_content = []
48 | revison_area = False
49 | continue
50 |
51 | if revison_area:
52 | revison_content.append(line)
53 |
54 |
55 |
56 | if __name__ == '__main__':
57 | import argparse
58 | arg_parser = argparse.ArgumentParser(description='Script for dividing the large XML revision dump into individual page revisions.')
59 | arg_parser.add_argument('input_dir', help='Input dir')
60 | arg_parser.add_argument('input_file', help='Input file')
61 | arg_parser.add_argument('output_dir', help='Output dir')
62 | args = arg_parser.parse_args()
63 |
64 |
65 | input_file = args.input_dir + '/' + args.input_file
66 | output_file = args.output_dir + '/' + args.input_file
67 | with open(output_file,'w') as f:
68 | for timestamp,text in extract_revisions(input_file):
69 | if len(text) > 10:
70 | f.write('\n\n[Revision timestamp: ' + timestamp + ']\n\n')
71 | f.write(text)
72 |
73 |
74 |
--------------------------------------------------------------------------------
/reader/extract_spelling_errors.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """Extracts spelling errors from revision history.
4 |
5 | """
6 |
7 | import codecs
8 | import re
9 | import utils
10 |
11 | class RevisionSentence(object):
12 | """Class for representing an error sentence together with original sentence.
13 |
14 | """
15 | def __init__(self, orig_tokens):
16 | self.orig_tokens = orig_tokens
17 | self.err_sen = []
18 |
19 | def add_err_sentence(self, err_tokens):
20 | self.err_sen.append(err_tokens)
21 |
22 | def contains_spelling_errors(self):
23 | """Whether the earlier revisions of the same sentences have spelling errors.
24 |
25 | Returns:
26 | bool: True or False
27 |
28 | """
29 | if len(self.err_sen) > 0:
30 | return True
31 | else:
32 | return False
33 |
34 | class ErrorCorpus(object):
35 | """Class for representing the original text data with spelling errors.
36 |
37 | """
38 | lang = 'english'
39 | max_dist = 3
40 | min_sen_len = 3
41 |
42 | def __init__(self, lang='english', max_edit_distance=3, min_sen_len=3):
43 | self.corpus = None
44 | self.num_rev = 0
45 | self.lang = lang
46 |         self.max_dist = int(max_edit_distance)
47 | self.min_sen_len = min_sen_len
48 |
49 | def create_corpus_from_wiki(self, corpus_root, filename, output_dir):
50 | create_error_corpus = False
51 | valid_word_pat = r'(?u)^\w+$'
52 | sentences = utils.get_sentences_for_text(corpus_root, filename)
53 | if sentences == None:
54 | return
55 | top_rev = []
56 | top_rev_with_err = []
57 | try:
58 | for s_list in sentences:
59 | s = ''.join(s_list)
60 | if s.startswith('[Revision timestamp:'):
61 | self.num_rev += 1
62 | else:
63 | if self.num_rev == 1:
64 | if len(s_list) >= self.min_sen_len:
65 | rev_sen = RevisionSentence(s_list)
66 | top_rev.append(rev_sen)
67 | elif self.num_rev > 1:
68 | for r in top_rev:
69 | if len(s_list) == len(r.orig_tokens):
70 | valid_errors = True
71 | errors = False
72 | old_curr_rev_sen = zip(r.orig_tokens, s_list)
73 | for t in old_curr_rev_sen:
74 | dist = utils.levenshtein_distance(t[0], t[1])
75 | if dist > 0 and dist <= self.max_dist:
76 | # token must be a word
77 | orig_uni = utils.to_unicode_or_bust(t[0])
78 | match = re.search(valid_word_pat, orig_uni)
79 | if match:
80 | errors = True
81 | elif dist > self.max_dist:
82 | valid_errors = False
83 | break
84 | if errors == True and valid_errors == True:
85 | print('errr')
86 | r.add_err_sentence(s_list)
87 | create_error_corpus = True
88 | break
89 | except AssertionError:
90 | print('Empty file')
91 |
92 | if create_error_corpus == True:
93 | with codecs.open(output_dir + '/' + filename, 'w', 'utf-8', errors='ignore') as f:
94 | for r in top_rev:
95 | if r.contains_spelling_errors() == True:
96 | orig_sen = ' '.join(r.orig_tokens)
97 | err_as_sen = map(lambda x: ' '.join(x), r.err_sen)
98 | orig_err_sen = [orig_sen] + list(err_as_sen)
99 | to_write_uni = '####'.join(orig_err_sen)
100 | f.write(to_write_uni + u'\n')
101 |
102 | if __name__ == '__main__':
103 | import argparse
104 | arg_parser = argparse.ArgumentParser(description='Script for extracting spelling errors from a revision history')
105 | arg_parser.add_argument('corpus_root', help='The directory in which the revision file exists')
106 | arg_parser.add_argument('input_file', help='Revision file')
107 | arg_parser.add_argument('output_dir', help='Output directory')
108 | arg_parser.add_argument('lang', help='Language of the text data')
109 |     arg_parser.add_argument('max_edit', help='Maximum edit distance between the correct word and the misspelled word')
110 |
111 | args = arg_parser.parse_args()
112 | err_corpus = ErrorCorpus(args.lang.lower(), args.max_edit)
113 | err_corpus.create_corpus_from_wiki(args.corpus_root, args.input_file, args.output_dir)
114 |
115 | #import os
116 | #corpus_root = '/net/cluster/TMP/loganathan/wiki_dump/cs/processing/stage3'
117 | #for root, dirnames, filenames in os.walk(corpus_root):
118 | # for f in filenames:
119 | # err_corpus = ErrorCorpus()
120 | # print 'Extracting errors from: ', f
121 | # err_corpus.create_corpus_from_wiki(corpus_root, f, '')
122 |
123 | #corpus_root = '/net/cluster/TMP/loganathan/wiki_dump/cs/processing/tmp_out'
124 | #file_name = 'hello.txt'
125 | #err_corpus = ErrorCorpus()
126 | #err_corpus.create_corpus_from_wiki(corpus_root, file_name, '')
127 |
--------------------------------------------------------------------------------
/reader/extract_spelling_errors_new.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import codecs
3 | import utils
4 | import re
5 | import sys
6 | from nltk.metrics import edit_distance
7 | import opencc
8 | import string
9 | import math
10 | converter = opencc.OpenCC('t2s.json')
11 |
12 |
13 | def hasNumbers(inputString):
14 | return bool(re.search(r'\d',inputString))
15 |
16 | def hasBrackets(inputString):
17 | return bool(re.search(r'\[|\]|\)|\(',inputString))
18 |
19 | def hasAlphabets(inputString):
20 | return bool(re.search(r'[a-zA-Z]',inputString))
21 |
22 | def hasSpecialCharacters(inputString):
23 | return bool(re.search(r'[\|\s]',inputString))
24 |
25 | def create_files(fname):
26 | # files = codecs.open(output+ '/' + fname + "_spelling_error.txt","w", encoding='utf-8')
27 | cf = codecs.open(output+ '/' + fname + "_orig_sen.txt","w", encoding='utf-8')
28 | ef = codecs.open(output+ '/' + fname + "_error_sen.txt","w", encoding='utf-8')
29 | return ef,cf
30 |
31 | def check_error(earlier,current, srcs,tgts,number_of_edits):
32 | earlier = utils.split_sentence(earlier)
33 | current = utils.split_sentence(current)
34 | if len(earlier)==len(current):
35 | for j in range(0, len(earlier)):
36 | f=0
37 | earlier_words = earlier[j]
38 | current_words = current[j]
39 | if earlier_words == current_words:
40 | continue
41 | if len(earlier_words) < 5:
42 | continue
43 | if sum([1 if utils.is_chinese_char(x) else 0 for x in current_words]) / len(current_words) <= 0.7:
44 | continue
45 |
46 | if(len(earlier_words) == len(current_words)):
47 | for k in range(0,len(earlier_words)):
48 | if earlier_words[k]==current_words[k]:
49 | continue
50 | elif utils.is_chinese_char(earlier_words[k]):
51 | f += 1
52 |
53 | thr = min(max(math.ceil(number_of_edits * len(current_words)),1),10)
54 | if(1<=f<=thr):
55 | srcs.append(earlier[j])
56 | tgts.append(current[j])
57 |
58 | if __name__ == '__main__':
59 |
60 | source = sys.argv[1]+"/"
61 | source += sys.argv[2]
62 | language = sys.argv[4]
63 | number_of_edits = float(sys.argv[5])
64 | output = sys.argv[3]
65 |
66 | files,files_2,files_3 = None,None,None
67 | revisions = []
68 | line = []
69 | f=0
70 |
71 | srcs,tgts = [],[]
72 | pre_revision = ''
73 | current_revision = ''
74 | cnt = 0
75 | for line in open(source):
76 | line = converter.convert(line)
77 | if "Revision timestamp" in line:
78 | if current_revision:
79 | if pre_revision:
80 | cnt += 1
81 | if cnt % 100 == 0:
82 | print(source,'processed',cnt)
83 | check_error(pre_revision,current_revision.strip(),srcs,tgts,number_of_edits)
84 | pre_revision = current_revision.strip()
85 | current_revision = ''
86 | else:
87 | current_revision += line
88 | if current_revision and pre_revision:
89 | check_error(pre_revision,current_revision.strip(),srcs,tgts,number_of_edits)
90 |
91 |
92 | if srcs:
93 | ef,cf = create_files(sys.argv[2])
94 | errors = set()
95 | for src,tgt in zip(srcs[::-1],tgts[::-1]):
96 | if src in errors or tgt in errors:
97 | continue
98 | errors.add(src)
99 | errors.add(tgt)
100 | ef.write(src + '\n')
101 | cf.write(tgt + '\n')
102 |
103 | ef.close()
104 | cf.close()
105 |
--------------------------------------------------------------------------------
/reader/fix_extracted.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """Fixes output of WikiExtractor.py
4 |
5 | """
6 |
7 | import argparse
8 | import codecs
9 | import re
10 |
11 | import xml.sax
12 | from xml.sax.handler import ContentHandler
13 | handler = ContentHandler()
14 |
15 | def fix_revison(revision):
16 | # fix text
17 | text_area = False
18 | new_revison = []
19 | for x in revision:
20 |         if '<text' in x:
21 |             text_area = True
22 | 
23 |         # No closing </text> before the revision ends
24 |         if text_area and '</revision>' in x and '</text>' not in x:
25 |             text_area = False
26 |             new_revison.append('</text>')
27 | 
28 |         if text_area:
29 |             # Replace stray '<' characters inside the text element
30 |             x = x.replace('<text','##LOSPR##').replace('</text>','##ROSPR##').replace('<','#')
31 |             x = x.replace('##LOSPR##','<text').replace('##ROSPR##','</text>')
32 | 
33 |         if '</text>' in x:
34 | text_area = False
35 |
36 | new_revison.append(x)
37 |
38 | try:
39 | xml.sax.parseString('\n'.join(new_revison),handler)
40 | except:
41 | return None
42 | return new_revison
43 |
44 | def fix_extraction(input_dir, input_file, output_dir):
45 | with codecs.open(input_dir + '/' + input_file, 'r', encoding='utf-8') as f:
46 | contents = f.read()
47 |         contents = contents.replace("&amp;","&").replace('&lt;','<').replace('&gt;','>').replace('&quot;','"').replace('&apos;','\'')
48 |         contents = re.sub(r'<\/text>\s*', '##LOSPR##', contents)
49 |         contents = re.sub(r'<text.*?>', '\n\t<text>', contents)
50 |         contents = re.sub(r'##LOSPR##', '</text>\n\t', contents)
51 |
52 | # HTML entities
53 |         contents = re.sub(r'&amp;', '&', contents)
54 |
55 | # Remove HTML tags if not removed already
56 | tag_pat1 = (r'<\/?(textarea|select|strong|center|option|'
57 | r'input|param|small|style|table|tbody|thead|tfoot|'
58 | r'body|head|html|span|font|form|'
59 | r'div|img|var|pre|sub|sup|var|ref|wiki|'
60 | r'br|dl|dt|dd|em|h[1-6]|hr|li|ol|td|tr|th|ul|a|b|p|q|u)>'
61 | )
62 | contents = re.sub(tag_pat1, '', contents)
63 |
64 | # remove bad revisions
65 | new_content = []
66 | revison_content = []
67 | revison_area = False
68 |
69 | for line in contents.splitlines():
70 |         if '<revision>' in line:
71 |             # If the revision buffer already has content, something went wrong; discard it.
72 |             if revison_content:
73 |                 revison_content = []
74 |             revison_area = True
75 | 
76 |         if '</revision>' in line:
77 | revison_content.append(line)
78 | fixed = fix_revison(revison_content)
79 | if fixed is not None:
80 | new_content.extend(fixed)
81 | revison_content = []
82 | revison_area = False
83 | continue
84 |
85 | if revison_area:
86 | revison_content.append(line)
87 | else:
88 | new_content.append(line)
89 | with codecs.open(output_dir + '/' + input_file, 'w', encoding='utf-8') as fw:
90 | fw.write('\n'.join(new_content))
91 |
92 |
93 | if __name__ == '__main__':
94 | arg_parser = argparse.ArgumentParser(description='Script for fixing WikiExtractor.py outputs')
95 | arg_parser.add_argument('input_dir', help='Input dir')
96 | arg_parser.add_argument('input_file', help='Input file')
97 | arg_parser.add_argument('output_dir', help='Output directory')
98 | args = arg_parser.parse_args()
99 | fix_extraction(args.input_dir, args.input_file, args.output_dir)
100 |
--------------------------------------------------------------------------------
/reader/utils.py:
--------------------------------------------------------------------------------
1 | # *-* coding: utf-8 *-*
2 |
3 | """Utility functions.
4 |
5 | """
6 | # import nltk.data
7 | # from nltk.tokenize.regexp import WhitespaceTokenizer
8 | # from nltk.corpus import PlaintextCorpusReader
9 | import jieba
10 | import numpy as np
11 | import sys
12 |
13 | import re
14 | from typing import List
15 |
16 | def is_chinese_char(cp):
17 | """Checks whether CP is the codepoint of a CJK character."""
18 | # This defines a "chinese character" as anything in the CJK Unicode block:
19 | # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
20 | #
21 | # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
22 | # despite its name. The modern Korean Hangul alphabet is a different block,
23 | # as is Japanese Hiragana and Katakana. Those alphabets are used to write
24 | # space-separated words, so they are not treated specially and handled
25 | # like the all of the other languages.
26 | cp = ord(cp)
27 | if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
28 | (cp >= 0x3400 and cp <= 0x4DBF) or #
29 | (cp >= 0x20000 and cp <= 0x2A6DF) or #
30 | (cp >= 0x2A700 and cp <= 0x2B73F) or #
31 | (cp >= 0x2B740 and cp <= 0x2B81F) or #
32 | (cp >= 0x2B820 and cp <= 0x2CEAF) or
33 | (cp >= 0xF900 and cp <= 0xFAFF) or #
34 | (cp >= 0x2F800 and cp <= 0x2FA1F)): #
35 | return True
36 |
37 | return False
38 |
39 | def split_sentence(document: str, flag: str = "all", limit: int = 510) -> List[str]:
40 | """
41 | Args:
42 | document:
43 |         flag: Type:str, "all" split on both Chinese and English punctuation, "zh" split on Chinese punctuation only, "en" split on English punctuation only
44 |         limit: maximum length of a single sentence, 510 characters by default
45 | Returns: Type:list
46 | """
47 | sent_list = []
48 | try:
49 | if flag == "zh":
50 |             document = re.sub('(?P<quotation_mark>([。?!…](?![”’"\'])))', r'\g<quotation_mark>\n', document)  # single-character sentence terminators
51 |             document = re.sub('(?P<quotation_mark>([。?!]|…{1,2})[”’"\'])', r'\g<quotation_mark>\n', document)  # terminators followed by a closing quote
52 |         elif flag == "en":
53 |             document = re.sub('(?P<quotation_mark>([.?!](?![”’"\'])))', r'\g<quotation_mark>\n', document)  # English single-character sentence terminators
54 |             document = re.sub('(?P<quotation_mark>([?!.]["\']))', r'\g<quotation_mark>\n', document)  # terminators followed by a closing quote
55 |         else:
56 |             document = re.sub('(?P<quotation_mark>([。?!….?!](?![”’"\'])))', r'\g<quotation_mark>\n', document)  # single-character sentence terminators
57 |             document = re.sub('(?P<quotation_mark>(([。?!.!?]|…{1,2})[”’"\']))', r'\g<quotation_mark>\n',
58 |                               document)  # terminators followed by a closing quote
59 |
60 | sent_list_ori = document.splitlines()
61 | for sent in sent_list_ori:
62 | sent = sent.strip()
63 | if not sent:
64 | continue
65 | else:
66 | while len(sent) > limit:
67 | temp = sent[0:limit]
68 | sent_list.append(temp)
69 | sent = sent[limit:]
70 | sent_list.append(sent)
71 | except:
72 | sent_list.clear()
73 | sent_list.append(document)
74 | return sent_list
75 |
76 |
77 | def to_unicode_or_bust(s, encoding='utf-8'):
78 | """Converts the bytestring in utf-8 to Unicode.
79 |
80 | Credit: Method from 'Unicode in Python, Completely Demystified'.
81 |
82 | Args:
83 | s: Bytestring
84 | encoding: Encoding
85 |
86 | Returns:
87 | Return the Unicode version of the given bytestring
88 |
89 | """
90 | # if isinstance(s, str):
91 | # if not isinstance(s, unicode):
92 | # s = unicode(s, encoding)
93 | return s
94 |
95 |
96 | def get_sentences_for_text(corpus_root, filename, lang='english'):
97 | """Segments the given text into sentences.
98 |
99 | Args:
100 | corpus_root: Directory in which the text file is residing.
101 | filename: Name of the text file.
102 | lang: Tokenizer language. For possible values, look at:
103 | ${NLTK_DATA}/tokenizers/punkt
104 |
105 | Returns:
106 | Sentences in the given text.
107 |
108 | """
109 | sents = []
110 | for s in split_sentence(open(corpus_root + '/' + filename).read()):
111 | sents.append(jieba.lcut(s))
112 | return sents
113 | # tokenizer_path = 'tokenizers/punkt/' + lang + '.pickle'
114 | # text = PlaintextCorpusReader(corpus_root, [filename], word_tokenizer=WhitespaceTokenizer(),
115 | # sent_tokenizer=nltk.data.LazyLoader(tokenizer_path))
116 | # return text.sents()
117 |
118 | def levenshtein_distance(s, t):
119 | """Minimum edit distance between two strings.
120 |
121 | Args:
122 | s: Source string
123 | t: Target string
124 |
125 | Returns:
126 | int: Minimum edit distance between the two input strings.
127 |
128 | """
129 | m = len(s)
130 | n = len(t)
131 | if m == 0:
132 | return n
133 | if n == 0:
134 | return m
135 | d = np.zeros((m+1, n+1))
136 | d[:, 0] = np.arange(m+1)
137 | d[0, :] = np.arange(n+1)
138 | for j in range(1, n+1):
139 | for i in range(1, m+1):
140 | if s[i-1] == t[j-1]:
141 | d[i][j] = d[i-1][j-1]
142 | else:
143 | d[i][j] = min(d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1]+1)
144 | return int(d[m][n])
145 |
146 |
147 | if __name__ == '__main__':
148 | corpus_root = '/net/cluster/TMP/loganathan/wiki_dump/cs/processing/stage3'
149 | file_name = '0000000007.xml'
150 | sentences = get_sentences_for_text(corpus_root, file_name)
151 | # try:
152 | # for s in sentences:
153 | # print s
154 | # print '\n----END----'
155 | # except AssertionError:
156 | # print 'Empty file'
157 |
158 |
--------------------------------------------------------------------------------
/reader/wikiextractor/WikiExtractor.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | # =============================================================================
5 | # Version: 3.0 (July 22, 2020)
6 | # Author: Giuseppe Attardi (attardi@di.unipi.it), University of Pisa
7 | #
8 | # Contributors:
9 | # Antonio Fuschetto (fuschett@aol.com)
10 | # Leonardo Souza (lsouza@amtera.com.br)
11 | # Juan Manuel Caicedo (juan@cavorite.com)
12 | # Humberto Pereira (begini@gmail.com)
13 | # Siegfried-A. Gevatter (siegfried@gevatter.com)
14 | # Pedro Assis (pedroh2306@gmail.com)
15 | # Wim Muskee (wimmuskee@gmail.com)
16 | # Radics Geza (radicsge@gmail.com)
17 | # Nick Ulven (nulven@github)
18 | #
19 | # =============================================================================
20 | # Copyright (c) 2009-2020. Giuseppe Attardi (attardi@di.unipi.it).
21 | # =============================================================================
22 | # This file is part of Tanl.
23 | #
24 | # Tanl is free software; you can redistribute it and/or modify it
25 | # under the terms of the GNU Affero General Public License, version 3,
26 | # as published by the Free Software Foundation.
27 | #
28 | # Tanl is distributed in the hope that it will be useful,
29 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
30 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
31 | # GNU Affero General Public License for more details.
32 | #
33 | # You should have received a copy of the GNU Affero General Public License
34 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
35 | # =============================================================================
36 |
37 | """Wikipedia Extractor:
38 | Extracts and cleans text from a Wikipedia database dump and stores output in a
39 | number of files of similar size in a given directory.
40 | Each file will contain several documents in the format:
41 |
42 |     <doc id="" url="" title="">
43 |         ...
44 |     </doc>
45 |
46 | If the program is invoked with the --json flag, then each file will
47 | contain several documents formatted as json objects, one per line, with
48 | the following structure
49 |
50 | {"id": "", "revid": "", "url": "", "title": "", "text": "..."}
51 |
52 | The program performs template expansion by preprocessing the whole dump and
53 | collecting template definitions.
54 | """
55 |
56 | import argparse
57 | import bz2
58 | import logging
59 | import os.path
60 | import re # TODO use regex when it will be standard
61 | import sys
62 | from io import StringIO
63 | from multiprocessing import Queue, get_context, cpu_count
64 | from timeit import default_timer
65 |
66 | from .extract import Extractor, ignoreTag, define_template, acceptedNamespaces
67 |
68 | # ===========================================================================
69 |
70 | # Program version
71 | __version__ = '3.0.6'
72 |
73 | ##
74 | # Defined in <siteinfo>
75 | # We include as default Template, when loading external template file.
76 | knownNamespaces = set(['Template'])
77 |
78 | ##
79 | # The namespace used for template definitions
80 | # It is the name associated with namespace key=10 in the siteinfo header.
81 | templateNamespace = ''
82 | templatePrefix = ''
83 |
84 | ##
85 | # The namespace used for module definitions
86 | # It is the name associated with namespace key=828 in the siteinfo header.
87 | moduleNamespace = ''
88 |
89 | # ----------------------------------------------------------------------
90 | # Modules
91 |
92 | # Only minimal support
93 | # FIXME: import Lua modules.
94 |
95 | modules = {
96 | 'convert': {
97 | 'convert': lambda x, u, *rest: x + ' ' + u, # no conversion
98 | }
99 | }
100 | # ----------------------------------------------------------------------
101 | # Expand using WikiMedia API
102 | # import json
103 |
104 | # def expandTemplates(text):
105 | # """Expand templates invoking MediaWiki API"""
106 | # text = urlib.urlencodew(text)
107 | # base = urlbase[:urlbase.rfind('/')]
108 | # url = base + "/w/api.php?action=expandtemplates&format=json&text=" + text
109 | # exp = json.loads(urllib.urlopen(url))
110 | # return exp['expandtemplates']['*']
111 |
112 | # ------------------------------------------------------------------------------
113 | # Output
114 |
115 |
116 | class NextFile():
117 |
118 | """
119 | Synchronous generation of next available file name.
120 | """
121 |
122 | filesPerDir = 100
123 |
124 | def __init__(self, path_name):
125 | self.path_name = path_name
126 | self.dir_index = -1
127 | self.file_index = -1
128 |
129 | def next(self):
130 | self.file_index = (self.file_index + 1) % NextFile.filesPerDir
131 | if self.file_index == 0:
132 | self.dir_index += 1
133 | dirname = self._dirname()
134 | if not os.path.isdir(dirname):
135 | os.makedirs(dirname)
136 | return self._filepath()
137 |
138 | def _dirname(self):
139 | char1 = self.dir_index % 26
140 | char2 = int(self.dir_index / 26) % 26
141 | return os.path.join(self.path_name, '%c%c' % (ord('A') + char2, ord('A') + char1))
142 |
143 | def _filepath(self):
144 | return '%s/wiki_%02d' % (self._dirname(), self.file_index)
145 |
146 |
147 | class OutputSplitter():
148 |
149 | """
150 | File-like object, that splits output to multiple files of a given max size.
151 | """
152 |
153 | def __init__(self, nextFile, max_file_size=0, compress=True):
154 | """
155 | :param nextFile: a NextFile object from which to obtain filenames
156 | to use.
157 | :param max_file_size: the maximum size of each file.
158 | :para compress: whether to write data with bzip compression.
159 | """
160 | self.nextFile = nextFile
161 | self.compress = compress
162 | self.max_file_size = max_file_size
163 | self.file = self.open(self.nextFile.next())
164 |
165 | def reserve(self, size):
166 | if self.file.tell() + size > self.max_file_size:
167 | self.close()
168 | self.file = self.open(self.nextFile.next())
169 |
170 | def write(self, data):
171 | self.reserve(len(data))
172 | if self.compress:
173 |             self.file.write(data.encode('utf-8'))
174 | else:
175 | self.file.write(data)
176 |
177 | def close(self):
178 | self.file.close()
179 |
180 | def open(self, filename):
181 | if self.compress:
182 | return bz2.BZ2File(filename + '.bz2', 'w')
183 | else:
184 | return open(filename, 'w')
185 |
186 |
187 | # ----------------------------------------------------------------------
188 | # READER
189 |
190 | tagRE = re.compile(r'(.*?)<(/?\w+)[^>]*>(?:([^<]*)(<.*?>)?)?')
191 | # 1 2 3 4
192 |
193 |
194 | def load_templates(file, output_file=None):
195 | """
196 | Load templates from :param file:.
197 | :param output_file: file where to save templates and modules.
198 | """
199 | global templateNamespace, templatePrefix
200 | templatePrefix = templateNamespace + ':'
201 | global moduleNamespace, modulePrefix
202 | modulePrefix = moduleNamespace + ':'
203 | articles = 0
204 | templates = 0
205 | page = []
206 | inText = False
207 | if output_file:
208 | output = open(output_file, 'w')
209 | for line in file:
210 | #line = line.decode('utf-8')
211 | if '<' not in line: # faster than doing re.search()
212 | if inText:
213 | page.append(line)
214 | continue
215 | m = tagRE.search(line)
216 | if not m:
217 | continue
218 | tag = m.group(2)
219 | if tag == 'page':
220 | page = []
221 | elif tag == 'title':
222 | title = m.group(3)
223 | elif tag == 'text':
224 | inText = True
225 | line = line[m.start(3):m.end(3)]
226 | page.append(line)
227 | if m.lastindex == 4: # open-close
228 | inText = False
229 | elif tag == '/text':
230 | if m.group(1):
231 | page.append(m.group(1))
232 | inText = False
233 | elif inText:
234 | page.append(line)
235 | elif tag == '/page':
236 | if not output_file and not templateNamespace: # do not know it yet
237 | # we reconstruct it from the first title
238 | colon = title.find(':')
239 | if colon > 1:
240 | templateNamespace = title[:colon]
241 | templatePrefix = title[:colon + 1]
242 | # FIXME: should reconstruct also moduleNamespace
243 | if title.startswith(templatePrefix):
244 | define_template(title, page)
245 | templates += 1
246 | # save templates and modules to file
247 | if output_file and (title.startswith(templatePrefix) or
248 | title.startswith(modulePrefix)):
249 |                 output.write('<page>\n')
250 |                 output.write('   <title>%s</title>\n' % title)
251 |                 output.write('   <ns>10</ns>\n')
252 |                 output.write('   <text>')
253 |                 for line in page:
254 |                     output.write(line)
255 |                 output.write('   </text>\n')
256 |                 output.write('</page>\n')
257 | page = []
258 | articles += 1
259 | if articles % 100000 == 0:
260 | logging.info("Preprocessed %d pages", articles)
261 | if output_file:
262 | output.close()
263 | logging.info("Saved %d templates to '%s'", templates, output_file)
264 | return templates
265 |
266 |
267 | def decode_open(filename, mode='rt', encoding='utf-8'):
268 | """
269 | Open a file, decode and decompress, depending on extension `gz`, or 'bz2`.
270 | :param filename: the file to open.
271 | """
272 | ext = os.path.splitext(filename)[1]
273 | if ext == '.gz':
274 | import gzip
275 | return gzip.open(filename, mode, encoding=encoding)
276 | elif ext == '.bz2':
277 | return bz2.open(filename, mode=mode, encoding=encoding)
278 | else:
279 | return open(filename, mode, encoding=encoding)
280 |
281 |
282 | def process_dump(input_file, template_file, out_file, file_size, file_compress,
283 | process_count, html_safe):
284 | """
285 | :param input_file: name of the wikipedia dump file; '-' to read from stdin
286 | :param template_file: optional file with template definitions.
287 | :param out_file: directory where to store extracted data, or '-' for stdout
288 | :param file_size: max size of each extracted file, or None for no max (one file)
289 | :param file_compress: whether to compress files with bzip.
290 | :param process_count: number of extraction processes to spawn.
291 | """
292 | global knownNamespaces
293 | global templateNamespace, templatePrefix
294 | global moduleNamespace, modulePrefix
295 |
296 |     urlbase = ''  # This is obtained from <siteinfo>
297 |
298 | input = decode_open(input_file)
299 |
300 | # collect siteinfo
301 | for line in input:
302 | line = line #.decode('utf-8')
303 | m = tagRE.search(line)
304 | if not m:
305 | continue
306 | tag = m.group(2)
307 | if tag == 'base':
308 | # discover urlbase from the xml dump file
309 | # /mediawiki/siteinfo/base
310 | base = m.group(3)
311 | urlbase = base[:base.rfind("/")]
312 | elif tag == 'namespace':
313 | knownNamespaces.add(m.group(3))
314 | if re.search('key="10"', line):
315 | templateNamespace = m.group(3)
316 | templatePrefix = templateNamespace + ':'
317 | elif re.search('key="828"', line):
318 | moduleNamespace = m.group(3)
319 | modulePrefix = moduleNamespace + ':'
320 | elif tag == '/siteinfo':
321 | break
322 |
323 | if expand_templates:
324 | # preprocess
325 | template_load_start = default_timer()
326 | if template_file and os.path.exists(template_file):
327 | logging.info("Preprocessing '%s' to collect template definitions: this may take some time.", template_file)
328 | file = decode_open(template_file)
329 | templates = load_templates(file)
330 | file.close()
331 | else:
332 | if input_file == '-':
333 | # can't scan then reset stdin; must error w/ suggestion to specify template_file
334 | raise ValueError("to use templates with stdin dump, must supply explicit template-file")
335 | logging.info("Preprocessing '%s' to collect template definitions: this may take some time.", input_file)
336 | templates = load_templates(input, template_file)
337 | input.close()
338 | input = decode_open(input_file)
339 | template_load_elapsed = default_timer() - template_load_start
340 | logging.info("Loaded %d templates in %.1fs", templates, template_load_elapsed)
341 |
342 | if out_file == '-':
343 | output = sys.stdout
344 | if file_compress:
345 | logging.warn("writing to stdout, so no output compression (use an external tool)")
346 | else:
347 | nextFile = NextFile(out_file)
348 | output = OutputSplitter(nextFile, file_size, file_compress)
349 |
350 | # process pages
351 | logging.info("Starting page extraction from %s.", input_file)
352 | extract_start = default_timer()
353 |
354 | # Parallel Map/Reduce:
355 | # - pages to be processed are dispatched to workers
356 | # - a reduce process collects the results, sort them and print them.
357 |
358 | # fixes MacOS error: TypeError: cannot pickle '_io.TextIOWrapper' object
359 | Process = get_context("fork").Process
360 |
361 | maxsize = 10 * process_count
362 | # output queue
363 | output_queue = Queue(maxsize=maxsize)
364 |
365 | # Reduce job that sorts and prints output
366 | reduce = Process(target=reduce_process, args=(output_queue, output))
367 | reduce.start()
368 |
369 | # initialize jobs queue
370 | jobs_queue = Queue(maxsize=maxsize)
371 |
372 | # start worker processes
373 | logging.info("Using %d extract processes.", process_count)
374 | workers = []
375 | for _ in range(max(1, process_count)):
376 | extractor = Process(target=extract_process,
377 | args=(jobs_queue, output_queue, html_safe))
378 | extractor.daemon = True # only live while parent process lives
379 | extractor.start()
380 | workers.append(extractor)
381 |
382 | # Mapper process
383 |
384 | # we collect individual lines, since str.join() is significantly faster
385 | # than concatenation
386 | page = []
387 | id = ''
388 | revid = ''
389 | last_id = ''
390 | ordinal = 0 # page count
391 | inText = False
392 | redirect = False
393 | for line in input:
394 | if '<' not in line: # faster than doing re.search()
395 | if inText:
396 | page.append(line)
397 | continue
398 | m = tagRE.search(line)
399 | if not m:
400 | continue
401 | tag = m.group(2)
402 | if tag == 'page':
403 | page = []
404 | redirect = False
405 | elif tag == 'id' and not id:
406 | id = m.group(3)
407 |         elif tag == 'id' and id: # <revision> <id></id> </revision>
408 | revid = m.group(3)
409 | elif tag == 'title':
410 | title = m.group(3)
411 | elif tag == 'redirect':
412 | redirect = True
413 | elif tag == 'text':
414 | inText = True
415 | line = line[m.start(3):m.end(3)]
416 | page.append(line)
417 | if m.lastindex == 4: # open-close
418 | inText = False
419 | elif tag == '/text':
420 | if m.group(1):
421 | page.append(m.group(1))
422 | inText = False
423 | elif inText:
424 | page.append(line)
425 | elif tag == '/page':
426 | colon = title.find(':')
427 | if (colon < 0 or (title[:colon] in acceptedNamespaces) and id != last_id and
428 | not redirect and not title.startswith(templateNamespace)):
429 | job = (id, revid, urlbase, title, page, ordinal)
430 | jobs_queue.put(job) # goes to any available extract_process
431 | last_id = id
432 | ordinal += 1
433 | id = ''
434 | revid = ''
435 | page = []
436 |
437 | input.close()
438 |
439 | # signal termination
440 | for _ in workers:
441 | jobs_queue.put(None)
442 | # wait for workers to terminate
443 | for w in workers:
444 | w.join()
445 |
446 | # signal end of work to reduce process
447 | output_queue.put(None)
448 | # wait for it to finish
449 | reduce.join()
450 |
451 | if output != sys.stdout:
452 | output.close()
453 | extract_duration = default_timer() - extract_start
454 | extract_rate = ordinal / extract_duration
455 | logging.info("Finished %d-process extraction of %d articles in %.1fs (%.1f art/s)",
456 | process_count, ordinal, extract_duration, extract_rate)
457 |
458 |
459 | # ----------------------------------------------------------------------
460 | # Multiprocess support
461 |
462 |
463 | def extract_process(jobs_queue, output_queue, html_safe):
464 | """Pull tuples of raw page content, do CPU/regex-heavy fixup, push finished text
465 | :param jobs_queue: where to get jobs.
466 | :param output_queue: where to queue extracted text for output.
467 | :html_safe: whether to convert entities in text to HTML.
468 | """
469 | while True:
470 | job = jobs_queue.get() # job is (id, revid, urlbase, title, page, ordinal)
471 | if job:
472 | out = StringIO() # memory buffer
473 | Extractor(*job[:-1]).extract(out, html_safe) # (id, urlbase, title, page)
474 | text = out.getvalue()
475 | output_queue.put((job[-1], text)) # (ordinal, extracted_text)
476 | out.close()
477 | else:
478 | break
479 |
480 |
481 | def reduce_process(output_queue, output):
482 | """Pull finished article text, write series of files (or stdout)
483 | :param output_queue: text to be output.
484 | :param output: file object where to print.
485 | """
486 |
487 | interval_start = default_timer()
488 | period = 100000
489 | # FIXME: use a heap
490 | ordering_buffer = {} # collected pages
491 | next_ordinal = 0 # sequence number of pages
492 | while True:
493 | if next_ordinal in ordering_buffer:
494 | output.write(ordering_buffer.pop(next_ordinal))
495 | next_ordinal += 1
496 | # progress report
497 | if next_ordinal % period == 0:
498 | interval_rate = period / (default_timer() - interval_start)
499 | logging.info("Extracted %d articles (%.1f art/s)",
500 | next_ordinal, interval_rate)
501 | interval_start = default_timer()
502 | else:
503 | # mapper puts None to signal finish
504 | pair = output_queue.get()
505 | if not pair:
506 | break
507 | ordinal, text = pair
508 | ordering_buffer[ordinal] = text
509 |
510 |
511 | # ----------------------------------------------------------------------
512 |
513 | # Minimum size of output files
514 | minFileSize = 200 * 1024
515 |
516 |
517 | def main():
518 | global urlbase, acceptedNamespaces
519 | global expand_templates, templateCache
520 |
521 | parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
522 | formatter_class=argparse.RawDescriptionHelpFormatter,
523 | description=__doc__)
524 | parser.add_argument("input",
525 | help="XML wiki dump file")
526 | groupO = parser.add_argument_group('Output')
527 | groupO.add_argument("-o", "--output", default="text",
528 | help="directory for extracted files (or '-' for dumping to stdout)")
529 | groupO.add_argument("-b", "--bytes", default="1M",
530 | help="maximum bytes per output file (default %(default)s); 0 means to put a single article per file",
531 | metavar="n[KMG]")
532 | groupO.add_argument("-c", "--compress", action="store_true",
533 | help="compress output files using bzip")
534 | groupO.add_argument("--json", action="store_true",
535 | help="write output in json format instead of the default format")
536 |
537 | groupP = parser.add_argument_group('Processing')
538 | groupP.add_argument("--html", action="store_true",
539 | help="produce HTML output, subsumes --links")
540 | groupP.add_argument("-l", "--links", action="store_true",
541 | help="preserve links")
542 | groupP.add_argument("-ns", "--namespaces", default="", metavar="ns1,ns2",
543 | help="accepted namespaces")
544 | groupP.add_argument("--templates",
545 | help="use or create file containing templates")
546 | groupP.add_argument("--no-templates", action="store_false",
547 | help="Do not expand templates")
548 | groupP.add_argument("--html-safe", default=True,
549 |                     help="use to produce HTML safe output within <doc>...</doc>")
550 | default_process_count = cpu_count() - 1
551 | parser.add_argument("--processes", type=int, default=default_process_count,
552 | help="Number of processes to use (default %(default)s)")
553 |
554 | groupS = parser.add_argument_group('Special')
555 | groupS.add_argument("-q", "--quiet", action="store_true",
556 | help="suppress reporting progress info")
557 | groupS.add_argument("--debug", action="store_true",
558 | help="print debug info")
559 | groupS.add_argument("-a", "--article", action="store_true",
560 | help="analyze a file containing a single article (debug option)")
561 | groupS.add_argument("-v", "--version", action="version",
562 | version='%(prog)s ' + __version__,
563 | help="print program version")
564 |
565 | args = parser.parse_args()
566 |
567 | Extractor.keepLinks = args.links
568 | Extractor.HtmlFormatting = args.html
569 | if args.html:
570 | Extractor.keepLinks = True
571 | Extractor.to_json = args.json
572 |
573 | expand_templates = args.no_templates
574 |
575 | try:
576 | power = 'kmg'.find(args.bytes[-1].lower()) + 1
577 | # 0 bytes means put a single article per file.
578 | file_size = 0 if args.bytes == '0' else int(args.bytes[:-1]) * 1024 ** power
579 | if file_size and file_size < minFileSize:
580 | raise ValueError()
581 | except ValueError:
582 | logging.error('Insufficient or invalid size: %s', args.bytes)
583 | return
584 |
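# Illustrative note (not part of the original file): the size suffix maps to a
# power of 1024, e.g. for the default "1M": 'kmg'.find('m') + 1 == 2, so
# file_size == 1 * 1024 ** 2 == 1048576 bytes, well above minFileSize.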
585 | if args.namespaces:
586 | acceptedNamespaces = set(args.namespaces.split(','))
587 |
588 | FORMAT = '%(levelname)s: %(message)s'
589 | logging.basicConfig(format=FORMAT)
590 |
591 | logger = logging.getLogger()
592 | if not args.quiet:
593 | logger.setLevel(logging.INFO)
594 | if args.debug:
595 | logger.setLevel(logging.DEBUG)
596 |
597 | input_file = args.input
598 |
599 | if not Extractor.keepLinks:
600 | ignoreTag('a')
601 |
602 | # sharing cache of parser templates is too slow:
603 | # manager = Manager()
604 | # templateCache = manager.dict()
605 |
606 | if args.article:
607 | if args.templates:
608 | if os.path.exists(args.templates):
609 | with open(args.templates) as file:
610 | load_templates(file)
611 |
612 | with open(input_file) as file:
613 | page = file.read()
614 | ids = re.findall(r'<id>(\d*?)</id>', page)
615 | id = ids[0] if ids else ''
616 | revid = ids[1] if len(ids) > 1 else ''
617 | m = re.search(r'<title>(.*?)</title>', page)
618 | if m:
619 | title = m.group(1)
620 | else:
621 | logging.error('Missing title element')
622 | return
623 | m = re.search(r'<base>(.*?)</base>', page)
624 | if m:
625 | base = m.group(1)
626 | urlbase = base[:base.rfind("/")]
627 | else:
628 | urlbase = ''
629 | Extractor(id, revid, urlbase, title, [page]).extract(sys.stdout)
630 | return
631 |
632 | output_path = args.output
633 | if output_path != '-' and not os.path.isdir(output_path):
634 | try:
635 | os.makedirs(output_path)
636 | except:
637 | logging.error('Could not create: %s', output_path)
638 | return
639 |
640 | process_dump(input_file, args.templates, output_path, file_size,
641 | args.compress, args.processes, args.html_safe)
642 |
643 |
644 | if __name__ == '__main__':
645 | main()
646 |
--------------------------------------------------------------------------------
/reader/wikiextractor/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xueyouluo/wiki-error-extract/7871514c89bb3e6e2ceb314ac4f5eba94d75bb7a/reader/wikiextractor/__init__.py
--------------------------------------------------------------------------------
/reader/wikiextractor/cirrus-extract.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | #
4 | # =============================================================================
5 | # Version: 1.00 (December 15, 2015)
6 | # Author: Giuseppe Attardi (attardi@di.unipi.it), University of Pisa
7 | #
8 | # =============================================================================
9 | # Copyright (c) 2015. Giuseppe Attardi (attardi@di.unipi.it).
10 | # =============================================================================
11 | # This file is part of Tanl.
12 | #
13 | # Tanl is free software; you can redistribute it and/or modify it
14 | # under the terms of the GNU Affero General Public License, version 3,
15 | # as published by the Free Software Foundation.
16 | #
17 | # Tanl is distributed in the hope that it will be useful,
18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 | # GNU Affero General Public License for more details.
21 | #
22 | # You should have received a copy of the GNU Affero General Public License
23 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
24 | # =============================================================================
25 |
26 | """Wikipedia Cirrus Extractor:
27 | Extracts and cleans text from a Wikipedia Cirrus dump and stores output in a
28 | number of files of similar size in a given directory.
29 | Each file will contain several documents in the format:
30 |
31 | <doc id="" url="" title="" language="" revision="">
32 | ...
33 | </doc>
34 |
35 | """
36 |
37 | import sys, os.path, time
38 | import re
39 | import json
40 | import argparse
41 | import bz2
42 | import gzip
43 | import logging
44 |
45 | # Program version
46 | version = '3.0'
47 |
48 | urlbase = 'http://it.wikipedia.org/'
49 |
50 | # ----------------------------------------------------------------------
51 |
52 | class NextFile(object):
53 | """
54 | Synchronous generation of next available file name.
55 | """
56 |
57 | filesPerDir = 100
58 |
59 | def __init__(self, path_name):
60 | self.path_name = path_name
61 | self.dir_index = -1
62 | self.file_index = -1
63 |
64 | def next(self):
65 | self.file_index = (self.file_index + 1) % NextFile.filesPerDir
66 | if self.file_index == 0:
67 | self.dir_index += 1
68 | dirname = self._dirname()
69 | if not os.path.isdir(dirname):
70 | os.makedirs(dirname)
71 | return self._filepath()
72 |
73 | def _dirname(self):
74 | char1 = self.dir_index % 26
75 | char2 = int(self.dir_index / 26) % 26
76 | return os.path.join(self.path_name, '%c%c' % (ord('A') + char2, ord('A') + char1))
77 |
78 | def _filepath(self):
79 | return '%s/wiki_%02d' % (self._dirname(), self.file_index)
80 |
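# Illustrative note (not part of the original file): NextFile hands out 100
# files per directory and advances directory names AA, AB, ..., AZ, BA, ...;
# e.g. the first two calls to next() return 'AA/wiki_00' and 'AA/wiki_01'
# (under path_name), and the 101st call moves on to 'AB/wiki_00'.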
81 | class OutputSplitter(object):
82 | """
83 | File-like object that splits output into multiple files of a given max size.
84 | """
85 |
86 | def __init__(self, nextFile, max_file_size=0, compress=True):
87 | """
88 | :param nextfile: a NextFile object from which to obtain filenames
89 | to use.
90 | :param max_file_size: the maximum size of each file.
91 | :param compress: whether to write data with bzip compression.
92 | """
93 | self.nextFile = nextFile
94 | self.compress = compress
95 | self.max_file_size = max_file_size
96 | self.file = self.open(self.nextFile.next())
97 |
98 | def reserve(self, size):
99 | if self.file.tell() + size > self.max_file_size:
100 | self.close()
101 | self.file = self.open(self.nextFile.next())
102 |
103 | def write(self, data):
104 | self.reserve(len(data))
105 | self.file.write(data)
106 |
107 | def close(self):
108 | self.file.close()
109 |
110 | def open(self, filename):
111 | if self.compress:
112 | return bz2.BZ2File(filename + '.bz2', 'w')
113 | else:
114 | return open(filename, 'w')
115 |
116 | # ----------------------------------------------------------------------
117 |
118 | class Extractor(object):
119 |
120 | def extract(self, out):
121 | """
122 | :param out: output file.
123 | """
124 | logging.debug("%s\t%s", self.id, self.title)
125 | text = ''.join(self.page)
126 | url = get_url(self.id)
127 | header = '<doc id="%s" url="%s" title="%s" language="%s" revision="%s">\n' % (self.id, url, self.title, self.language, self.revision)
128 | # Separate header from text with a newline.
129 | header += self.title + '\n\n'
130 | header = header.encode('utf-8')
131 | footer = "\n</doc>\n"
132 | out.write(header)
133 | text = clean(self, text)
134 | for line in compact(text):
135 | out.write(line.encode('utf-8'))
136 | out.write('\n')
137 | out.write(footer)
138 |
139 | def process_dump(input_file, out_file, file_size, file_compress):
140 | """
141 | :param input_file: name of the wikipedia dump file; '-' to read from stdin
142 | :param out_file: directory where to store extracted data, or '-' for stdout
143 | :param file_size: max size of each extracted file, or None for no max (one file)
144 | :param file_compress: whether to compress files with bzip.
145 | """
146 |
147 | if input_file == '-':
148 | input = sys.stdin
149 | else:
150 | input = gzip.open(input_file)
151 |
152 | if out_file == '-':
153 | output = sys.stdout
154 | if file_compress:
155 | logging.warn("writing to stdout, so no output compression (use external tool)")
156 | else:
157 | nextFile = NextFile(out_file)
158 | output = OutputSplitter(nextFile, file_size, file_compress)
159 |
160 | # process dump
161 | # format
162 | # {"index":{"_type":"page","_id":"3825914"}}
163 | # {"namespace":0,"title":TITLE,"timestamp":"2014-06-29T15:51:09Z","text":TEXT,...}
164 | while True:
165 | line = input.readline()
166 | if not line:
167 | break
168 | index = json.loads(line)
169 | content = json.loads(input.readline())
170 | type = index['index']['_type']
171 | id = index['index']['_id']
172 | language = content['language']
173 | revision = content['version']
174 | if type == 'page' and content['namespace'] == 0:
175 | title = content['title']
176 | text = content['text']
177 | # drop references:
178 | # ^ The Penguin Dictionary
179 | text = re.sub(r' \^ .*', '', text)
180 | url = urlbase + 'wiki?curid=' + id
181 | header = '<doc id="%s" url="%s" title="%s" language="%s" revision="%s">\n' % (id, url, title, language, revision)
182 | page = header + title + '\n\n' + text + '\n</doc>\n'
183 | output.write(page.encode('utf-8'))
184 |
185 | # ----------------------------------------------------------------------
186 |
187 | # Minimum size of output files
188 | minFileSize = 200 * 1024
189 |
190 | def main():
191 | parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
192 | formatter_class=argparse.RawDescriptionHelpFormatter,
193 | description=__doc__)
194 | parser.add_argument("input",
195 | help="Cirrus Json wiki dump file")
196 | groupO = parser.add_argument_group('Output')
197 | groupO.add_argument("-o", "--output", default="text",
198 | help="directory for extracted files (or '-' for dumping to stdin)")
199 | groupO.add_argument("-b", "--bytes", default="1M",
200 | help="maximum bytes per output file (default %(default)s)",
201 | metavar="n[KMG]")
202 | groupO.add_argument("-c", "--compress", action="store_true",
203 | help="compress output files using bzip")
204 |
205 | groupP = parser.add_argument_group('Processing')
206 | groupP.add_argument("-ns", "--namespaces", default="", metavar="ns1,ns2",
207 | help="accepted namespaces")
208 |
209 | groupS = parser.add_argument_group('Special')
210 | groupS.add_argument("-q", "--quiet", action="store_true",
211 | help="suppress reporting progress info")
212 | groupS.add_argument("-v", "--version", action="version",
213 | version='%(prog)s ' + version,
214 | help="print program version")
215 |
216 | args = parser.parse_args()
217 |
218 | try:
219 | power = 'kmg'.find(args.bytes[-1].lower()) + 1
220 | file_size = int(args.bytes[:-1]) * 1024 ** power
221 | if file_size < minFileSize:
222 | raise ValueError()
223 | except ValueError:
224 | logging.error('Insufficient or invalid size: %s', args.bytes)
225 | return
226 |
227 | FORMAT = '%(levelname)s: %(message)s'
228 | logging.basicConfig(format=FORMAT)
229 |
230 | logger = logging.getLogger()
231 | if not args.quiet:
232 | logger.setLevel(logging.INFO)
233 |
234 | input_file = args.input
235 |
236 | output_path = args.output
237 | if output_path != '-' and not os.path.isdir(output_path):
238 | try:
239 | os.makedirs(output_path)
240 | except:
241 | logging.error('Could not create: %s', output_path)
242 | return
243 |
244 | process_dump(input_file, output_path, file_size, args.compress)
245 |
246 |
247 | if __name__ == '__main__':
248 | main()
249 |
--------------------------------------------------------------------------------
/reader/wikiextractor/clean.py:
--------------------------------------------------------------------------------
1 | # =============================================================================
2 | # Copyright (c) 2020. Giuseppe Attardi (attardi@di.unipi.it).
3 | # =============================================================================
4 | # This file is part of Tanl.
5 | #
6 | # Tanl is free software; you can redistribute it and/or modify it
7 | # under the terms of the GNU Affero General Public License, version 3,
8 | # as published by the Free Software Foundation.
9 | #
10 | # Tanl is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU Affero General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Affero General Public License
16 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
17 | # =============================================================================
18 |
19 | from wikiextractor.extract import Extractor, ignoreTag, resetIgnoredTags
20 |
21 |
22 | def clean_markup(markup, keep_links=False, ignore_headers=True):
23 | """
24 | Clean Wikimarkup to produce plaintext.
25 |
26 | :param keep_links: Set to True to keep internal and external links
27 | :param ignore_headers: if set to True, the output list will not contain
28 | headers, only paragraphs.
29 |
30 | Returns a list of paragraphs (unicode strings).
31 | """
32 |
33 | if not keep_links:
34 | ignoreTag('a')
35 |
36 | extractor = Extractor('')
37 |
38 | # returns a list of strings
39 | paragraphs = extractor.clean_text(markup,
40 | mark_headers=True,
41 | expand_templates=False,
42 | html_safe=True)
43 | resetIgnoredTags()
44 |
45 | if ignore_headers:
46 | paragraphs = filter(lambda s: not s.startswith('## '), paragraphs)
47 |
48 | return paragraphs
49 |
--------------------------------------------------------------------------------
/reader/wikiextractor/extract.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # =============================================================================
4 | # Copyright (c) 2020. Giuseppe Attardi (attardi@di.unipi.it).
5 | # =============================================================================
6 | # This file is part of Tanl.
7 | #
8 | # Tanl is free software; you can redistribute it and/or modify it
9 | # under the terms of the GNU Affero General Public License, version 3,
10 | # as published by the Free Software Foundation.
11 | #
12 | # Tanl is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | # GNU Affero General Public License for more details.
16 | #
17 | # You should have received a copy of the GNU Affero General Public License
18 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
19 | # =============================================================================
20 |
21 | import re
22 | import html
23 | import json
24 | from itertools import zip_longest
25 | from urllib.parse import quote as urlencode
26 | from html.entities import name2codepoint
27 | import logging
28 | import time
29 |
30 | # ----------------------------------------------------------------------
31 |
32 | # match tail after wikilink
33 | tailRE = re.compile('\w+')
34 | syntaxhighlight = re.compile('<syntaxhighlight .*?>(.*?)</syntaxhighlight>', re.DOTALL)
35 |
36 | ## PARAMS ####################################################################
37 |
38 | ##
39 | # Defined in <siteinfo>
40 | # We include as default Template, when loading external template file.
41 | knownNamespaces = set(['Template'])
42 |
43 | ##
44 | # Drop these elements from article text
45 | #
46 | discardElements = [
47 | 'gallery', 'timeline', 'noinclude', 'pre',
48 | 'table', 'tr', 'td', 'th', 'caption', 'div',
49 | 'form', 'input', 'select', 'option', 'textarea',
50 | 'ul', 'li', 'ol', 'dl', 'dt', 'dd', 'menu', 'dir',
51 | 'ref', 'references', 'img', 'imagemap', 'source', 'small'
52 | ]
53 |
54 | ##
55 | # Recognize only these namespaces
56 | # w: Internal links to the Wikipedia
57 | # wiktionary: Wiki dictionary
58 | # wikt: shortcut for Wiktionary
59 | #
60 | acceptedNamespaces = ['w', 'wiktionary', 'wikt']
61 |
62 |
63 | def get_url(urlbase, uid):
64 | return "%s?curid=%s" % (urlbase, uid)
65 |
66 |
67 | # ======================================================================
68 |
69 |
70 | def clean(extractor, text, expand_templates=False, html_safe=True):
71 | """
72 | Transforms wiki markup. If the command line flag --escapedoc is set then the text is also escaped
73 | @see https://www.mediawiki.org/wiki/Help:Formatting
74 | :param extractor: the Extractor to use.
75 | :param text: the text to clean.
76 | :param expand_templates: whether to perform template expansion.
77 | :param html_safe: whether to convert reserved HTML characters to entities.
78 | @return: the cleaned text.
79 | """
80 |
81 | if expand_templates:
82 | # expand templates
83 | # See: http://www.mediawiki.org/wiki/Help:Templates
84 | text = extractor.expandTemplates(text)
85 | else:
86 | # Drop transclusions (template, parser functions)
87 | text = dropNested(text, r'{{', r'}}')
88 |
89 | # Drop tables
90 | text = dropNested(text, r'{\|', r'\|}')
91 |
92 | # replace external links
93 | text = replaceExternalLinks(text)
94 |
95 | # replace internal links
96 | text = replaceInternalLinks(text)
97 |
98 | # drop MagicWords behavioral switches
99 | text = magicWordsRE.sub('', text)
100 |
101 | # ############### Process HTML ###############
102 |
103 | # turn into HTML, except for the content of <syntaxhighlight>
104 | res = ''
105 | cur = 0
106 | for m in syntaxhighlight.finditer(text):
107 | end = m.end()
108 | res += unescape(text[cur:m.start()]) + m.group(1)
109 | cur = end
110 | text = res + unescape(text[cur:])
111 |
112 | # Handle bold/italic/quote
113 | if extractor.HtmlFormatting:
114 | text = bold_italic.sub(r'\1', text)
115 | text = bold.sub(r'\1', text)
116 | text = italic.sub(r'\1', text)
117 | else:
118 | text = bold_italic.sub(r'\1', text)
119 | text = bold.sub(r'\1', text)
120 | text = italic_quote.sub(r'"\1"', text)
121 | text = italic.sub(r'"\1"', text)
122 | text = quote_quote.sub(r'"\1"', text)
123 | # residuals of unbalanced quotes
124 | text = text.replace("'''", '').replace("''", '"')
125 |
126 | # Collect spans
127 |
128 | spans = []
129 | # Drop HTML comments
130 | for m in comment.finditer(text):
131 | spans.append((m.start(), m.end()))
132 |
133 | # Drop self-closing tags
134 | for pattern in selfClosing_tag_patterns:
135 | for m in pattern.finditer(text):
136 | spans.append((m.start(), m.end()))
137 |
138 | # Drop ignored tags
139 | for left, right in ignored_tag_patterns:
140 | for m in left.finditer(text):
141 | spans.append((m.start(), m.end()))
142 | for m in right.finditer(text):
143 | spans.append((m.start(), m.end()))
144 |
145 | # Bulk remove all spans
146 | text = dropSpans(spans, text)
147 |
148 | # Drop discarded elements
149 | for tag in discardElements:
150 | text = dropNested(text, r'<\s*%s\b[^>/]*>' % tag, r'<\s*/\s*%s>' % tag)
151 |
152 | if not extractor.HtmlFormatting:
153 | # Turn into text what is left (&amp;nbsp;) and <syntaxhighlight>
154 | text = unescape(text)
155 |
156 | # Expand placeholders
157 | for pattern, placeholder in placeholder_tag_patterns:
158 | index = 1
159 | for match in pattern.finditer(text):
160 | text = text.replace(match.group(), '%s_%d' % (placeholder, index))
161 | index += 1
162 |
163 | text = text.replace('<<', u'«').replace('>>', u'»')
164 |
165 | #############################################
166 |
167 | # Cleanup text
168 | text = text.replace('\t', ' ')
169 | text = spaces.sub(' ', text)
170 | text = dots.sub('...', text)
171 | text = re.sub(u' (,:\.\)\]»)', r'\1', text)
172 | text = re.sub(u'(\[\(«) ', r'\1', text)
173 | text = re.sub(r'\n\W+?\n', '\n', text, flags=re.U) # lines with only punctuations
174 | text = text.replace(',,', ',').replace(',.', '.')
175 | if html_safe:
176 | text = html.escape(text, quote=False)
177 | return text
178 |
179 |
180 | # skip level 1, it is page name level
181 | section = re.compile(r'(==+)\s*(.*?)\s*\1')
182 |
183 | listOpen = {'*': '<ul>', '#': '<ol>', ';': '<dl>', ':': '<dl>'}
184 | listClose = {'*': '</ul>', '#': '</ol>', ';': '</dl>', ':': '</dl>'}
185 | listItem = {'*': '<li>%s</li>', '#': '<li>%s</li>', ';': '<dt>%s</dt>',
186 | ':': '<dd>%s</dd>'}
187 |
188 |
189 | def compact(text, mark_headers=False):
190 | """Deal with headers, lists, empty sections, residuals of tables.
191 | :param text: convert to HTML
192 | """
193 |
194 | page = [] # list of paragraph
195 | headers = {} # Headers for unfilled sections
196 | emptySection = False # empty sections are discarded
197 | listLevel = '' # nesting of lists
198 |
199 | for line in text.split('\n'):
200 |
201 | if not line:
202 | continue
203 | # Handle section titles
204 | m = section.match(line)
205 | if m:
206 | title = m.group(2)
207 | lev = len(m.group(1))
208 | if Extractor.HtmlFormatting:
209 | page.append("%s" % (lev, title, lev))
210 | if title and title[-1] not in '!?':
211 | title += '.'
212 |
213 | if mark_headers:
214 | title = "## " + title
215 |
216 | headers[lev] = title
217 | # drop previous headers
218 | headers = { k:v for k,v in headers.items() if k <= lev }
219 | emptySection = True
220 | continue
221 | # Handle page title
222 | if line.startswith('++'):
223 | title = line[2:-2]
224 | if title:
225 | if title[-1] not in '!?':
226 | title += '.'
227 | page.append(title)
228 | # handle indents
229 | elif line[0] == ':':
230 | # page.append(line.lstrip(':*#;'))
231 | continue
232 | # handle lists
233 | elif line[0] in '*#;:':
234 | if Extractor.HtmlFormatting:
235 | i = 0
236 | for c, n in zip_longest(listLevel, line, fillvalue=''):
237 | if not n or n not in '*#;:':
238 | if c:
239 | page.append(listClose[c])
240 | listLevel = listLevel[:-1]
241 | continue
242 | else:
243 | break
244 | # n != ''
245 | if c != n and (not c or (c not in ';:' and n not in ';:')):
246 | if c:
247 | # close level
248 | page.append(listClose[c])
249 | listLevel = listLevel[:-1]
250 | listLevel += n
251 | page.append(listOpen[n])
252 | i += 1
253 | n = line[i - 1] # last list char
254 | line = line[i:].strip()
255 | if line: # FIXME: n is '"'
256 | page.append(listItem[n] % line)
257 | else:
258 | continue
259 | elif len(listLevel):
260 | for c in reversed(listLevel):
261 | page.append(listClose[c])
262 | listLevel = []
263 |
264 | # Drop residuals of lists
265 | elif line[0] in '{|' or line[-1] == '}':
266 | continue
267 | # Drop irrelevant lines
268 | elif (line[0] == '(' and line[-1] == ')') or line.strip('.-') == '':
269 | continue
270 | elif len(headers):
271 | if Extractor.keepSections:
272 | items = sorted(headers.items())
273 | for (i, v) in items:
274 | page.append(v)
275 | headers.clear()
276 | page.append(line) # first line
277 | emptySection = False
278 | elif not emptySection:
279 | page.append(line)
280 | # dangerous
281 | # # Drop preformatted
282 | # elif line[0] == ' ':
283 | # continue
284 |
285 | return page
286 |
287 |
288 | # ----------------------------------------------------------------------
289 |
290 | def dropNested(text, openDelim, closeDelim):
291 | """
292 | A matching function for nested expressions, e.g. namespaces and tables.
293 | """
294 | openRE = re.compile(openDelim, re.IGNORECASE)
295 | closeRE = re.compile(closeDelim, re.IGNORECASE)
296 | # partition text in separate blocks { } { }
297 | spans = [] # pairs (s, e) for each partition
298 | nest = 0 # nesting level
299 | start = openRE.search(text, 0)
300 | if not start:
301 | return text
302 | end = closeRE.search(text, start.end())
303 | next = start
304 | while end:
305 | next = openRE.search(text, next.end())
306 | if not next: # termination
307 | while nest: # close all pending
308 | nest -= 1
309 | end0 = closeRE.search(text, end.end())
310 | if end0:
311 | end = end0
312 | else:
313 | break
314 | spans.append((start.start(), end.end()))
315 | break
316 | while end.end() < next.start():
317 | # { } {
318 | if nest:
319 | nest -= 1
320 | # try closing more
321 | last = end.end()
322 | end = closeRE.search(text, end.end())
323 | if not end: # unbalanced
324 | if spans:
325 | span = (spans[0][0], last)
326 | else:
327 | span = (start.start(), last)
328 | spans = [span]
329 | break
330 | else:
331 | spans.append((start.start(), end.end()))
332 | # advance start, find next close
333 | start = next
334 | end = closeRE.search(text, next.end())
335 | break # { }
336 | if next != start:
337 | # { { }
338 | nest += 1
339 | # collect text outside partitions
340 | return dropSpans(spans, text)
341 |
342 |
343 | def dropSpans(spans, text):
344 | """
345 | Drop from text the blocks identified in :param spans:, possibly nested.
346 | """
347 | spans.sort()
348 | res = ''
349 | offset = 0
350 | for s, e in spans:
351 | if offset <= s: # handle nesting
352 | if offset < s:
353 | res += text[offset:s]
354 | offset = e
355 | res += text[offset:]
356 | return res
357 |
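# Illustrative note (not part of the original file): dropNested() collects the
# outermost balanced spans and dropSpans() splices them out, e.g.
#   dropNested('a {{b {{c}} d}} e', r'{{', r'}}')  ->  'a  e'
#   dropSpans([(2, 15)], 'a {{b {{c}} d}} e')      ->  'a  e'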
358 |
359 | # ----------------------------------------------------------------------
360 | # External links
361 |
362 | # from: https://doc.wikimedia.org/mediawiki-core/master/php/DefaultSettings_8php_source.html
363 |
364 | wgUrlProtocols = [
365 | 'bitcoin:', 'ftp://', 'ftps://', 'geo:', 'git://', 'gopher://', 'http://',
366 | 'https://', 'irc://', 'ircs://', 'magnet:', 'mailto:', 'mms://', 'news:',
367 | 'nntp://', 'redis://', 'sftp://', 'sip:', 'sips:', 'sms:', 'ssh://',
368 | 'svn://', 'tel:', 'telnet://', 'urn:', 'worldwind://', 'xmpp:', '//'
369 | ]
370 |
371 | # from: https://doc.wikimedia.org/mediawiki-core/master/php/Parser_8php_source.html
372 |
373 | # Constants needed for external link processing
374 | # Everything except bracket, space, or control characters
375 | # \p{Zs} is unicode 'separator, space' category. It covers the space 0x20
376 | # as well as U+3000 is IDEOGRAPHIC SPACE for bug 19052
377 | EXT_LINK_URL_CLASS = r'[^][<>"\x00-\x20\x7F\s]'
378 | ExtLinkBracketedRegex = re.compile(
379 | '\[(((?i)' + '|'.join(wgUrlProtocols) + ')' + EXT_LINK_URL_CLASS + r'+)\s*([^\]\x00-\x08\x0a-\x1F]*?)\]',
380 | re.S | re.U)
381 | EXT_IMAGE_REGEX = re.compile(
382 | r"""^(http://|https://)([^][<>"\x00-\x20\x7F\s]+)
383 | /([A-Za-z0-9_.,~%\-+&;#*?!=()@\x80-\xFF]+)\.((?i)gif|png|jpg|jpeg)$""",
384 | re.X | re.S | re.U)
385 |
386 |
387 | def replaceExternalLinks(text):
388 | s = ''
389 | cur = 0
390 | for m in ExtLinkBracketedRegex.finditer(text):
391 | s += text[cur:m.start()]
392 | cur = m.end()
393 |
394 | url = m.group(1)
395 | label = m.group(3)
396 |
397 | # # The characters '<' and '>' (which were escaped by
398 | # # removeHTMLtags()) should not be included in
399 | # # URLs, per RFC 2396.
400 | # m2 = re.search('&(lt|gt);', url)
401 | # if m2:
402 | # link = url[m2.end():] + ' ' + link
403 | # url = url[0:m2.end()]
404 |
405 | # If the link text is an image URL, replace it with an <img> tag
406 | # This happened by accident in the original parser, but some people used it extensively
407 | m = EXT_IMAGE_REGEX.match(label)
408 | if m:
409 | label = makeExternalImage(label)
410 |
411 | # Use the encoded URL
412 | # This means that users can paste URLs directly into the text
413 | # Funny characters like ö aren't valid in URLs anyway
414 | # This was changed in August 2004
415 | s += makeExternalLink(url, label) # + trail
416 |
417 | return s + text[cur:]
418 |
419 |
420 | def makeExternalLink(url, anchor):
421 | """Function applied to wikiLinks"""
422 | if Extractor.keepLinks:
423 | return '<a href="%s">%s</a>' % (urlencode(url), anchor)
424 | else:
425 | return anchor
426 |
427 |
428 | def makeExternalImage(url, alt=''):
429 | if Extractor.keepLinks:
430 | return '<img src="%s" alt="%s">' % (url, alt)
431 | else:
432 | return alt
433 |
434 |
435 | # ----------------------------------------------------------------------
436 | # WikiLinks
437 | # See https://www.mediawiki.org/wiki/Help:Links#Internal_links
438 |
439 | # Can be nested [[File:..|..[[..]]..|..]], [[Category:...]], etc.
440 | # Also: [[Help:IPA for Catalan|[andora]]]
441 |
442 |
443 | def replaceInternalLinks(text):
444 | """
445 | Replaces internal links of the form:
446 | [[title |...|label]]trail
447 |
448 | with title concatenated with trail, when present, e.g. 's' for plural.
449 | """
450 | # call this after removal of external links, so we need not worry about
451 | # triple closing ]]].
452 | cur = 0
453 | res = ''
454 | for s, e in findBalanced(text, ['[['], [']]']):
455 | m = tailRE.match(text, e)
456 | if m:
457 | trail = m.group(0)
458 | end = m.end()
459 | else:
460 | trail = ''
461 | end = e
462 | inner = text[s + 2:e - 2]
463 | # find first |
464 | pipe = inner.find('|')
465 | if pipe < 0:
466 | title = inner
467 | label = title
468 | else:
469 | title = inner[:pipe].rstrip()
470 | # find last |
471 | curp = pipe + 1
472 | for s1, e1 in findBalanced(inner, ['[['], [']]']):
473 | last = inner.rfind('|', curp, s1)
474 | if last >= 0:
475 | pipe = last # advance
476 | curp = e1
477 | label = inner[pipe + 1:].strip()
478 | res += text[cur:s] + makeInternalLink(title, label) + trail
479 | cur = end
480 | return res + text[cur:]
481 |
482 |
483 | def makeInternalLink(title, label):
484 | colon = title.find(':')
485 | if colon > 0 and title[:colon] not in acceptedNamespaces:
486 | return ''
487 | if colon == 0:
488 | # drop also :File:
489 | colon2 = title.find(':', colon + 1)
490 | if colon2 > 1 and title[colon + 1:colon2] not in acceptedNamespaces:
491 | return ''
492 | if Extractor.keepLinks:
493 | return '<a href="%s">%s</a>' % (urlencode(title), label)
494 | else:
495 | return label
496 |
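# Illustrative note (not part of the original file): with Extractor.keepLinks
# left False, a wikilink plus its tail collapses to plain text, e.g.
#   replaceInternalLinks('[[plural]]s of nouns')  ->  'plurals of nouns'
# while a link whose namespace is not in acceptedNamespaces (e.g. [[File:...]])
# is replaced by the empty string.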
497 |
498 | # ----------------------------------------------------------------------
499 | # variables
500 |
501 |
502 | class MagicWords():
503 |
504 | """
505 | One copy in each Extractor.
506 |
507 | @see https://doc.wikimedia.org/mediawiki-core/master/php/MagicWord_8php_source.html
508 | """
509 | names = [
510 | '!',
511 | 'currentmonth',
512 | 'currentmonth1',
513 | 'currentmonthname',
514 | 'currentmonthnamegen',
515 | 'currentmonthabbrev',
516 | 'currentday',
517 | 'currentday2',
518 | 'currentdayname',
519 | 'currentyear',
520 | 'currenttime',
521 | 'currenthour',
522 | 'localmonth',
523 | 'localmonth1',
524 | 'localmonthname',
525 | 'localmonthnamegen',
526 | 'localmonthabbrev',
527 | 'localday',
528 | 'localday2',
529 | 'localdayname',
530 | 'localyear',
531 | 'localtime',
532 | 'localhour',
533 | 'numberofarticles',
534 | 'numberoffiles',
535 | 'numberofedits',
536 | 'articlepath',
537 | 'pageid',
538 | 'sitename',
539 | 'server',
540 | 'servername',
541 | 'scriptpath',
542 | 'stylepath',
543 | 'pagename',
544 | 'pagenamee',
545 | 'fullpagename',
546 | 'fullpagenamee',
547 | 'namespace',
548 | 'namespacee',
549 | 'namespacenumber',
550 | 'currentweek',
551 | 'currentdow',
552 | 'localweek',
553 | 'localdow',
554 | 'revisionid',
555 | 'revisionday',
556 | 'revisionday2',
557 | 'revisionmonth',
558 | 'revisionmonth1',
559 | 'revisionyear',
560 | 'revisiontimestamp',
561 | 'revisionuser',
562 | 'revisionsize',
563 | 'subpagename',
564 | 'subpagenamee',
565 | 'talkspace',
566 | 'talkspacee',
567 | 'subjectspace',
568 | 'subjectspacee',
569 | 'talkpagename',
570 | 'talkpagenamee',
571 | 'subjectpagename',
572 | 'subjectpagenamee',
573 | 'numberofusers',
574 | 'numberofactiveusers',
575 | 'numberofpages',
576 | 'currentversion',
577 | 'rootpagename',
578 | 'rootpagenamee',
579 | 'basepagename',
580 | 'basepagenamee',
581 | 'currenttimestamp',
582 | 'localtimestamp',
583 | 'directionmark',
584 | 'contentlanguage',
585 | 'numberofadmins',
586 | 'cascadingsources',
587 | ]
588 |
589 | def __init__(self):
590 | self.values = {'!': '|'}
591 |
592 | def __getitem__(self, name):
593 | return self.values.get(name)
594 |
595 | def __setitem__(self, name, value):
596 | self.values[name] = value
597 |
598 | switches = (
599 | '__NOTOC__',
600 | '__FORCETOC__',
601 | '__TOC__',
602 | '__TOC__',
603 | '__NEWSECTIONLINK__',
604 | '__NONEWSECTIONLINK__',
605 | '__NOGALLERY__',
606 | '__HIDDENCAT__',
607 | '__NOCONTENTCONVERT__',
608 | '__NOCC__',
609 | '__NOTITLECONVERT__',
610 | '__NOTC__',
611 | '__START__',
612 | '__END__',
613 | '__INDEX__',
614 | '__NOINDEX__',
615 | '__STATICREDIRECT__',
616 | '__DISAMBIG__'
617 | )
618 |
619 |
620 | magicWordsRE = re.compile('|'.join(MagicWords.switches))
621 |
622 |
623 | # =========================================================================
624 | #
625 | # MediaWiki Markup Grammar
626 | # https://www.mediawiki.org/wiki/Preprocessor_ABNF
627 |
628 | # xml-char = %x9 / %xA / %xD / %x20-D7FF / %xE000-FFFD / %x10000-10FFFF
629 | # sptab = SP / HTAB
630 |
631 | # ; everything except ">" (%x3E)
632 | # attr-char = %x9 / %xA / %xD / %x20-3D / %x3F-D7FF / %xE000-FFFD / %x10000-10FFFF
633 |
634 | # literal = *xml-char
635 | # title = wikitext-L3
636 | # part-name = wikitext-L3
637 | # part-value = wikitext-L3
638 | # part = ( part-name "=" part-value ) / ( part-value )
639 | # parts = [ title *( "|" part ) ]
640 | # tplarg = "{{{" parts "}}}"
641 | # template = "{{" parts "}}"
642 | # link = "[[" wikitext-L3 "]]"
643 |
644 | # comment = ""
645 | # unclosed-comment = "', re.DOTALL)
739 |
740 | # Match ignored tags
741 | ignored_tag_patterns = []
742 |
743 |
744 | def ignoreTag(tag):
745 | left = re.compile(r'<%s\b.*?>' % tag, re.IGNORECASE | re.DOTALL) # both <ref> and <references>
746 | right = re.compile(r'</\s*%s>' % tag, re.IGNORECASE)
747 | ignored_tag_patterns.append((left, right))
748 |
749 |
750 | def resetIgnoredTags():
751 | global ignored_tag_patterns
752 | ignored_tag_patterns = []
753 |
754 |
755 | for tag in ignoredTags:
756 | ignoreTag(tag)
757 |
758 | # Match selfClosing HTML tags
759 | selfClosing_tag_patterns = [
760 | re.compile(r'<\s*%s\b[^>]*/\s*>' % tag, re.DOTALL | re.IGNORECASE) for tag in selfClosingTags
761 | ]
762 |
763 | # Match HTML placeholder tags
764 | placeholder_tag_patterns = [
765 | (re.compile(r'<\s*%s(\s*| [^>]+?)>.*?<\s*/\s*%s\s*>' % (tag, tag), re.DOTALL | re.IGNORECASE),
766 | repl) for tag, repl in placeholder_tags.items()
767 | ]
768 |
769 | # Match preformatted lines
770 | preformatted = re.compile(r'^ .*?$')
771 |
772 | # Match external links (space separates second optional parameter)
773 | externalLink = re.compile(r'\[\w+[^ ]*? (.*?)]')
774 | externalLinkNoAnchor = re.compile(r'\[\w+[&\]]*\]')
775 |
776 | # Matches bold/italic
777 | bold_italic = re.compile(r"'''''(.*?)'''''")
778 | bold = re.compile(r"'''(.*?)'''")
779 | italic_quote = re.compile(r"''\"([^\"]*?)\"''")
780 | italic = re.compile(r"''(.*?)''")
781 | quote_quote = re.compile(r'""([^"]*?)""')
782 |
783 | # Matches space
784 | spaces = re.compile(r' {2,}')
785 |
786 | # Matches dots
787 | dots = re.compile(r'\.{4,}')
788 |
789 | # ======================================================================
790 |
791 | substWords = 'subst:|safesubst:'
792 |
793 |
794 | class Extractor():
795 | """
796 | An extraction task on an article.
797 | """
798 | ##
799 | # Whether to preserve links in output
800 | keepLinks = False
801 |
802 | ##
803 | # Whether to preserve section titles
804 | keepSections = True
805 |
806 | ##
807 | # Whether to output text with HTML formatting elements in files.
808 | HtmlFormatting = False
809 |
810 | ##
811 | # Whether to produce json instead of the default output format.
812 | toJson = False
813 |
814 | def __init__(self,title):
815 | """
816 | :param title: the title of the page.
817 | """
818 | # self.id = id
819 | # self.revid = revid
820 | # self.url = get_url(urlbase, id)
821 | self.title = title
822 | # self.page = page
823 | self.magicWords = MagicWords()
824 | self.frame = []
825 | self.recursion_exceeded_1_errs = 0 # template recursion within expandTemplates()
826 | self.recursion_exceeded_2_errs = 0 # template recursion within expandTemplate()
827 | self.recursion_exceeded_3_errs = 0 # parameter recursion
828 | self.template_title_errs = 0
829 |
830 | def clean_text(self, text, mark_headers=False, expand_templates=False,
831 | html_safe=True):
832 | """
833 | :param mark_headers: True to distinguish headers from paragraphs
834 | e.g. "## Section 1"
835 | """
836 | self.magicWords['pagename'] = self.title
837 | self.magicWords['fullpagename'] = self.title
838 | self.magicWords['currentyear'] = time.strftime('%Y')
839 | self.magicWords['currentmonth'] = time.strftime('%m')
840 | self.magicWords['currentday'] = time.strftime('%d')
841 | self.magicWords['currenthour'] = time.strftime('%H')
842 | self.magicWords['currenttime'] = time.strftime('%H:%M:%S')
843 |
844 | text = clean(self, text, expand_templates=expand_templates,
845 | html_safe=html_safe)
846 |
847 | text = compact(text, mark_headers=mark_headers)
848 | return text
849 |
850 | def extract(self, page, html_safe=False):
851 | """
852 | :param page: the raw page text to clean.
853 | :param html_safe: whether to escape HTML entities.
854 | """
855 | # logging.debug("%s\t%s", self.id, self.title)
856 | text = self.clean_text(page, html_safe=html_safe)
857 | return '\n'.join(text)
858 |
859 |
860 | # header = '<doc id="%s" url="%s" title="%s">\n' % (self.id, self.url, self.title)
861 | # # Separate header from text with a newline.
862 | # header += self.title + '\n\n'
863 | # footer = "\n</doc>\n"
864 | # out.write(header)
865 | # out.write('\n'.join(text))
866 | # out.write('\n')
867 | # out.write(footer)
868 |
869 | # errs = (self.template_title_errs,
870 | # self.recursion_exceeded_1_errs,
871 | # self.recursion_exceeded_2_errs,
872 | # self.recursion_exceeded_3_errs)
873 | # if any(errs):
874 | # logging.warn("Template errors in article '%s' (%s): title(%d) recursion(%d, %d, %d)",
875 | # self.title, self.id, *errs)
876 |
877 | # ----------------------------------------------------------------------
878 | # Expand templates
879 |
880 | maxTemplateRecursionLevels = 30
881 | maxParameterRecursionLevels = 10
882 |
883 | # check for template beginning
884 | reOpen = re.compile('(?<!{){{(?!{)', re.DOTALL)
909 | if len(self.frame) >= self.maxTemplateRecursionLevels:
910 | self.recursion_exceeded_1_errs += 1
911 | return res
912 |
913 | # logging.debug(' %d %s', len(self.frame), res)
923 | return res
924 |
925 | def templateParams(self, parameters):
926 | """
927 | Build a dictionary with positional or name key to expanded parameters.
928 | :param parameters: the parts[1:] of a template, i.e. all except the title.
929 | """
930 | templateParams = {}
931 |
932 | if not parameters:
933 | return templateParams
934 | logging.debug('<templateParams: %s', '|'.join(parameters))
963 | # Parameters may span several lines, like:
964 | # {{Reflist|colwidth=30em|refs=
965 | # <ref name="Goode">Title</ref>
966 |
967 | # The '=' might occur within an HTML attribute:
968 | # "<ref name=value"
969 | # but we stop at first.
970 | m = re.match(' *([^=]*?) *=(.*)', param, re.DOTALL)
971 | if m:
972 | # This is a named parameter. This case also handles parameter
973 | # assignments like "2=xxx", where the number of an unnamed
974 | # parameter ("2") is specified explicitly - this is handled
975 | # transparently.
976 |
977 | parameterName = m.group(1).strip()
978 | parameterValue = m.group(2)
979 |
980 | if ']]' not in parameterValue: # if the value does not contain a link, trim whitespace
981 | parameterValue = parameterValue.strip()
982 | templateParams[parameterName] = parameterValue
983 | else:
984 | # this is an unnamed parameter
985 | unnamedParameterCounter += 1
986 |
987 | if ']]' not in param: # if the value does not contain a link, trim whitespace
988 | param = param.strip()
989 | templateParams[str(unnamedParameterCounter)] = param
990 | logging.debug(' templateParams> %s', '|'.join(templateParams.values()))
991 | return templateParams
992 |
993 | def expandTemplate(self, body):
994 | """Expands template invocation.
995 | :param body: the parts of a template.
996 |
997 | :see http://meta.wikimedia.org/wiki/Help:Expansion for an explanation
998 | of the process.
999 |
1000 | See in particular: Expansion of names and values
1001 | http://meta.wikimedia.org/wiki/Help:Expansion#Expansion_of_names_and_values
1002 |
1003 | For most parser functions all names and values are expanded,
1004 | regardless of what is relevant for the result. The branching functions
1005 | (#if, #ifeq, #iferror, #ifexist, #ifexpr, #switch) are exceptions.
1006 |
1007 | All names in a template call are expanded, and the titles of the
1008 | tplargs in the template body, after which it is determined which
1009 | values must be expanded, and for which tplargs in the template body
1010 | the first part (default).
1011 |
1012 | In the case of a tplarg, any parts beyond the first are never
1013 | expanded. The possible name and the value of the first part is
1014 | expanded if the title does not match a name in the template call.
1015 |
1016 | :see code for braceSubstitution at
1017 | https://doc.wikimedia.org/mediawiki-core/master/php/html/Parser_8php_source.html#3397:
1018 |
1019 | """
1020 |
1021 | # template = "{{" parts "}}"
1022 |
1023 | # Templates and tplargs are decomposed in the same way, with pipes as
1024 | # separator, even though eventually any parts in a tplarg after the first
1025 | # (the parameter default) are ignored, and an equals sign in the first
1026 | # part is treated as plain text.
1027 | # Pipes inside inner templates and tplargs, or inside double rectangular
1028 | # brackets within the template or tplargs are not taken into account in
1029 | # this decomposition.
1030 | # The first part is called title, the other parts are simply called parts.
1031 |
1032 | # If a part has one or more equals signs in it, the first equals sign
1033 | # determines the division into name = value. Equals signs inside inner
1034 | # templates and tplargs, or inside double rectangular brackets within the
1035 | # part are not taken into account in this decomposition. Parts without
1036 | # equals sign are indexed 1, 2, .., given as attribute in the tag.
1037 |
1038 | if len(self.frame) >= self.maxTemplateRecursionLevels:
1039 | self.recursion_exceeded_2_errs += 1
1040 | # logging.debug(' INVOCATION> %d %s', len(self.frame), body)
1041 | return ''
1042 |
1043 | logging.debug('INVOCATION %d %s', len(self.frame), body)
1044 |
1045 | parts = splitParts(body)
1046 | # title is the portion before the first |
1047 | logging.debug('TITLE %s', parts[0].strip())
1048 | title = self.expandTemplates(parts[0].strip())
1049 |
1050 | # SUBST
1051 | # Apply the template tag to parameters without
1052 | # substituting into them, e.g.
1053 | # {{subst:t|a{{{p|q}}}b}} gives the wikitext start-a{{{p|q}}}b-end
1054 | # @see https://www.mediawiki.org/wiki/Manual:Substitution#Partial_substitution
1055 | subst = False
1056 | if re.match(substWords, title, re.IGNORECASE):
1057 | title = re.sub(substWords, '', title, 1, re.IGNORECASE)
1058 | subst = True
1059 |
1060 | if title.lower() in self.magicWords.values:
1061 | return self.magicWords[title.lower()]
1062 |
1063 | # Parser functions
1064 | # The first argument is everything after the first colon.
1065 | # It has been evaluated above.
1066 | colon = title.find(':')
1067 | if colon > 1:
1068 | funct = title[:colon]
1069 | parts[0] = title[colon + 1:].strip() # side-effect (parts[0] not used later)
1070 | # arguments after first are not evaluated
1071 | ret = callParserFunction(funct, parts, self.frame)
1072 | return self.expandTemplates(ret)
1073 |
1074 | title = fullyQualifiedTemplateTitle(title)
1075 | if not title:
1076 | self.template_title_errs += 1
1077 | return ''
1078 |
1079 | redirected = redirects.get(title)
1080 | if redirected:
1081 | title = redirected
1082 |
1083 | # get the template
1084 | if title in templateCache:
1085 | template = templateCache[title]
1086 | elif title in templates:
1087 | template = Template.parse(templates[title])
1088 | # add it to cache
1089 | templateCache[title] = template
1090 | del templates[title]
1091 | else:
1092 | # The page being included could not be identified
1093 | return ''
1094 |
1095 | # logging.debug('TEMPLATE %s: %s', title, template)
1096 |
1097 | # tplarg = "{{{" parts "}}}"
1098 | # parts = [ title *( "|" part ) ]
1099 | # part = ( part-name "=" part-value ) / ( part-value )
1100 | # part-name = wikitext-L3
1101 | # part-value = wikitext-L3
1102 | # wikitext-L3 = literal / template / tplarg / link / comment /
1103 | # line-eating-comment / unclosed-comment /
1104 | # xmlish-element / *wikitext-L3
1105 |
1106 | # A tplarg may contain other parameters as well as templates, e.g.:
1107 | # {{{text|{{{quote|{{{1|{{error|Error: No text given}}}}}}}}}}}
1108 | # hence no simple RE like this would work:
1109 | # '{{{((?:(?!{{{).)*?)}}}'
1110 | # We must use full CF parsing.
1111 |
1112 | # the parameter name itself might be computed, e.g.:
1113 | # {{{appointe{{#if:{{{appointer14|}}}|r|d}}14|}}}
1114 |
1115 | # Because of the multiple uses of double-brace and triple-brace
1116 | # syntax, expressions can sometimes be ambiguous.
1117 | # Precedence rules specified here:
1118 | # http://www.mediawiki.org/wiki/Preprocessor_ABNF#Ideal_precedence
1119 | # resolve ambiguities like this:
1120 | # {{{{ }}}} -> { {{{ }}} }
1121 | # {{{{{ }}}}} -> {{ {{{ }}} }}
1122 | #
1123 | # :see: https://en.wikipedia.org/wiki/Help:Template#Handling_parameters
1124 |
1125 | params = parts[1:]
1126 |
1127 | if not subst:
1128 | # Evaluate parameters, since they may contain templates, including
1129 | # the symbol "=".
1130 | # {{#ifexpr: {{{1}}} = 1 }}
1131 | params = [self.expandTemplates(p) for p in params]
1132 |
1133 | # build a dict of name-values for the parameter values
1134 | params = self.templateParams(params)
1135 |
1136 | # Perform parameter substitution
1137 | # extend frame before subst, since there may be recursion in default
1138 | # parameter value, e.g. {{OTRS|celebrative|date=April 2015}} in article
1139 | # 21637542 in enwiki.
1140 | self.frame.append((title, params))
1141 | instantiated = template.subst(params, self)
1142 | # logging.debug('instantiated %d %s', len(self.frame), instantiated)
1143 | value = self.expandTemplates(instantiated)
1144 | self.frame.pop()
1145 | # logging.debug(' INVOCATION> %s %d %s', title, len(self.frame), value)
1146 | return value
1147 |
1148 |
1149 | # ----------------------------------------------------------------------
1150 | # parameter handling
1151 |
1152 |
1153 | def splitParts(paramsList):
1154 | """
1155 | :param paramsList: the parts of a template or tplarg.
1156 |
1157 | Split template parameters at the separator "|".
1158 | separator "=".
1159 |
1160 | Template parameters often contain URLs, internal links, text or even
1161 | template expressions, since we evaluate templates outside in.
1162 | This is required for cases like:
1163 | {{#if: {{{1}}} | {{lc:{{{1}}} | "parameter missing"}}
1164 | Parameters are separated by "|" symbols. However, we
1165 | cannot simply split the string on "|" symbols, since these
1166 | also appear inside templates and internal links, e.g.
1167 |
1168 | {{if:|
1169 | |{{#if:the president|
1170 | |{{#if:|
1171 | [[Category:Hatnote templates|A{{PAGENAME}}]]
1172 | }}
1173 | }}
1174 | }}
1175 |
1176 | We split parts at the "|" symbols that are not inside any pair
1177 | {{{...}}}, {{...}}, [[...]], {|...|}.
1178 | """
1179 |
1180 | # Must consider '[' as normal in expansion of Template:EMedicine2:
1181 | # #ifeq: ped|article|[http://emedicine.medscape.com/article/180-overview|[http://www.emedicine.com/ped/topic180.htm#{{#if: |section~}}
1182 | # as part of:
1183 | # {{#ifeq: ped|article|[http://emedicine.medscape.com/article/180-overview|[http://www.emedicine.com/ped/topic180.htm#{{#if: |section~}}}} ped/180{{#if: |~}}]
1184 |
1185 | # should handle both tpl arg like:
1186 | # 4|{{{{{subst|}}}CURRENTYEAR}}
1187 | # and tpl parameters like:
1188 | # ||[[Category:People|{{#if:A|A|{{PAGENAME}}}}]]
1189 |
1190 | sep = '|'
1191 | parameters = []
1192 | cur = 0
1193 | for s, e in findMatchingBraces(paramsList):
1194 | par = paramsList[cur:s].split(sep)
1195 | if par:
1196 | if parameters:
1197 | # portion before | belongs to previous parameter
1198 | parameters[-1] += par[0]
1199 | if len(par) > 1:
1200 | # rest are new parameters
1201 | parameters.extend(par[1:])
1202 | else:
1203 | parameters = par
1204 | elif not parameters:
1205 | parameters = [''] # create first param
1206 | # add span to last previous parameter
1207 | parameters[-1] += paramsList[s:e]
1208 | cur = e
1209 | # leftover
1210 | par = paramsList[cur:].split(sep)
1211 | if par:
1212 | if parameters:
1213 | # portion before | belongs to previous parameter
1214 | parameters[-1] += par[0]
1215 | if len(par) > 1:
1216 | # rest are new parameters
1217 | parameters.extend(par[1:])
1218 | else:
1219 | parameters = par
1220 |
1221 | # logging.debug('splitParts %s %s\nparams: %s', sep, paramsList, str(parameters))
1222 | return parameters
1223 |
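# Illustrative note (not part of the original file): only '|' characters
# outside {{...}}, {{{...}}} and [[...]] act as separators, e.g.
#   splitParts('#if: {{{1}}} | yes | no')  ->  ['#if: {{{1}}} ', ' yes ', ' no']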
1224 |
1225 | def findMatchingBraces(text, ldelim=0):
1226 | """
1227 | :param ldelim: number of braces to match. 0 means match [[]], {{}} and {{{}}}.
1228 | """
1229 | # Parsing is done with respect to pairs of double braces {{..}} delimiting
1230 | # a template, and pairs of triple braces {{{..}}} delimiting a tplarg.
1231 | # If double opening braces are followed by triple closing braces or
1232 | # conversely, this is taken as delimiting a template, with one left-over
1233 | # brace outside it, taken as plain text. For any pattern of braces this
1234 | # defines a set of templates and tplargs such that any two are either
1235 | # separate or nested (not overlapping).
1236 |
1237 | # Unmatched double rectangular closing brackets can be in a template or
1238 | # tplarg, but unmatched double rectangular opening brackets cannot.
1239 | # Unmatched double or triple closing braces inside a pair of
1240 | # double rectangular brackets are treated as plain text.
1241 | # Other formulation: in ambiguity between template or tplarg on one hand,
1242 | # and a link on the other hand, the structure with the rightmost opening
1243 | # takes precedence, even if this is the opening of a link without any
1244 | # closing, so not producing an actual link.
1245 |
1246 | # In the case of more than three opening braces the last three are assumed
1247 | # to belong to a tplarg, unless there is no matching triple of closing
1248 | # braces, in which case the last two opening braces are assumed to
1249 | # belong to a template.
1250 |
1251 | # We must skip individual { like in:
1252 | # {{#ifeq: {{padleft:|1|}} | { | | }}
1253 | # We must resolve ambiguities like this:
1254 | # {{{{ }}}} -> { {{{ }}} }
1255 | # {{{{{ }}}}} -> {{ {{{ }}} }}
1256 | # {{#if:{{{{{#if:{{{nominee|}}}|nominee|candidate}}|}}}|...}}
1257 |
1258 | # Handle:
1259 | # {{{{{|safesubst:}}}#Invoke:String|replace|{{{1|{{{{{|safesubst:}}}PAGENAME}}}}}|%s+%([^%(]-%)$||plain=false}}
1260 | # as well as expressions with stray }:
1261 | # {{{link|{{ucfirst:{{{1}}}}}} interchange}}}
1262 |
1263 | if ldelim: # 2-3
1264 | reOpen = re.compile('[{]{%d,}' % ldelim) # at least ldelim
1265 | reNext = re.compile('[{]{2,}|}{2,}') # at least 2
1266 | else:
1267 | reOpen = re.compile('{{2,}|\[{2,}')
1268 | reNext = re.compile('{{2,}|}{2,}|\[{2,}|]{2,}') # at least 2
1269 |
1270 | cur = 0
1271 | while True:
1272 | m1 = reOpen.search(text, cur)
1273 | if not m1:
1274 | return
1275 | lmatch = m1.end() - m1.start()
1276 | if m1.group()[0] == '{':
1277 | stack = [lmatch] # stack of opening braces lengths
1278 | else:
1279 | stack = [-lmatch] # negative means [
1280 | end = m1.end()
1281 | while True:
1282 | m2 = reNext.search(text, end)
1283 | if not m2:
1284 | return # unbalanced
1285 | end = m2.end()
1286 | brac = m2.group()[0]
1287 | lmatch = m2.end() - m2.start()
1288 |
1289 | if brac == '{':
1290 | stack.append(lmatch)
1291 | elif brac == '}':
1292 | while stack:
1293 | openCount = stack.pop() # opening span
1294 | if openCount == 0: # illegal unmatched [[
1295 | continue
1296 | if lmatch >= openCount:
1297 | lmatch -= openCount
1298 | if lmatch <= 1: # either close or stray }
1299 | break
1300 | else:
1301 | # put back unmatched
1302 | stack.append(openCount - lmatch)
1303 | break
1304 | if not stack:
1305 | yield m1.start(), end - lmatch
1306 | cur = end
1307 | break
1308 | elif len(stack) == 1 and 0 < stack[0] < ldelim:
1309 | # ambiguous {{{{{ }}} }}
1310 | yield m1.start() + stack[0], end
1311 | cur = end
1312 | break
1313 | elif brac == '[': # [[
1314 | stack.append(-lmatch)
1315 | else: # ]]
1316 | while stack and stack[-1] < 0: # matching [[
1317 | openCount = -stack.pop()
1318 | if lmatch >= openCount:
1319 | lmatch -= openCount
1320 | if lmatch <= 1: # either close or stray ]
1321 | break
1322 | else:
1323 | # put back unmatched (negative)
1324 | stack.append(lmatch - openCount)
1325 | break
1326 | if not stack:
1327 | yield m1.start(), end - lmatch
1328 | cur = end
1329 | break
1330 | # unmatched ]] are discarded
1331 | cur = end
1332 |
1333 |
1334 | def findBalanced(text, openDelim, closeDelim):
1335 | """
1336 | Assuming that text contains a properly balanced expression using
1337 | :param openDelim: as opening delimiters and
1338 | :param closeDelim: as closing delimiters.
1339 | :return: an iterator producing pairs (start, end) of start and end
1340 | positions in text containing a balanced expression.
1341 | """
1342 | openPat = '|'.join([re.escape(x) for x in openDelim])
1343 | # pattern for delimiters expected after each opening delimiter
1344 | afterPat = {o: re.compile(openPat + '|' + c, re.DOTALL) for o, c in zip(openDelim, closeDelim)}
1345 | stack = []
1346 | start = 0
1347 | cur = 0
1348 | # end = len(text)
1349 | startSet = False
1350 | startPat = re.compile(openPat)
1351 | nextPat = startPat
1352 | while True:
1353 | next = nextPat.search(text, cur)
1354 | if not next:
1355 | return
1356 | if not startSet:
1357 | start = next.start()
1358 | startSet = True
1359 | delim = next.group(0)
1360 | if delim in openDelim:
1361 | stack.append(delim)
1362 | nextPat = afterPat[delim]
1363 | else:
1364 | opening = stack.pop()
1365 | # assert opening == openDelim[closeDelim.index(next.group(0))]
1366 | if stack:
1367 | nextPat = afterPat[stack[-1]]
1368 | else:
1369 | yield start, next.end()
1370 | nextPat = startPat
1371 | start = next.end()
1372 | startSet = False
1373 | cur = next.end()
1374 |
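# Illustrative note (not part of the original file): findBalanced() yields only
# the outermost balanced spans, e.g.
#   list(findBalanced('[[a|[[b]]c]] tail', ['[['], [']]']))  ->  [(0, 12)]
# where text[0:12] is the whole nested link '[[a|[[b]]c]]'.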
1375 | # ----------------------------------------------------------------------
1376 | # parser functions utilities
1377 |
1378 |
1379 | def ucfirst(string):
1380 | """:return: a string with just its first character uppercase
1381 | We can't use title() since it converts all words.
1382 | """
1383 | if string:
1384 | if len(string) > 1:
1385 | return string[0].upper() + string[1:]
1386 | else:
1387 | return string.upper()
1388 | else:
1389 | return ''
1390 |
1391 |
1392 | def lcfirst(string):
1393 | """:return: a string with its first character lowercase"""
1394 | if string:
1395 | if len(string) > 1:
1396 | return string[0].lower() + string[1:]
1397 | else:
1398 | return string.lower()
1399 | else:
1400 | return ''
1401 |
1402 |
1403 | def fullyQualifiedTemplateTitle(templateTitle):
1404 | """
1405 | Determine the namespace of the page being included through the template
1406 | mechanism
1407 | """
1408 | if templateTitle.startswith(':'):
1409 | # Leading colon by itself implies main namespace, so strip this colon
1410 | return ucfirst(templateTitle[1:])
1411 | else:
1412 | m = re.match('([^:]*)(:.*)', templateTitle)
1413 | if m:
1414 | # colon found but not in the first position - check if it
1415 | # designates a known namespace
1416 | prefix = normalizeNamespace(m.group(1))
1417 | if prefix in knownNamespaces:
1418 | return prefix + ucfirst(m.group(2))
1419 | # The title of the page being included is NOT in the main namespace and
1420 | # lacks any other explicit designation of the namespace - therefore, it
1421 | # is resolved to the Template namespace (that's the default for the
1422 | # template inclusion mechanism).
1423 |
1424 | # This is a defense against pages whose title only contains UTF-8 chars
1425 | # that are reduced to an empty string. Right now I can think of one such
1426 | # case - which represents the non-breaking space.
1427 | # In this particular case, this page is a redirect to [[Non-breaking
1428 | # space]], but having in the system a redirect page with an empty title
1429 | # causes numerous problems, so we'll live happier without it.
1430 | if templateTitle:
1431 | return templatePrefix + ucfirst(templateTitle)
1432 | else:
1433 | return '' # caller may log as error
1434 |
1435 |
1436 | def normalizeNamespace(ns):
1437 | return ucfirst(ns)
1438 |
1439 |
1440 | # ----------------------------------------------------------------------
1441 | # Parser functions
1442 | # see http://www.mediawiki.org/wiki/Help:Extension:ParserFunctions
1443 | # https://github.com/Wikia/app/blob/dev/extensions/ParserFunctions/ParserFunctions_body.php
1444 |
1445 |
1446 | class Infix():
1447 |
1448 | """Infix operators.
1449 | The calling sequence for the infix is:
1450 | x |op| y
1451 | """
1452 |
1453 | def __init__(self, function):
1454 | self.function = function
1455 |
1456 | def __ror__(self, other):
1457 | return Infix(lambda x, self=self, other=other: self.function(other, x))
1458 |
1459 | def __or__(self, other):
1460 | return self.function(other)
1461 |
1462 | def __rlshift__(self, other):
1463 | return Infix(lambda x, self=self, other=other: self.function(other, x))
1464 |
1465 | def __rshift__(self, other):
1466 | return self.function(other)
1467 |
1468 | def __call__(self, value1, value2):
1469 | return self.function(value1, value2)
1470 |
1471 |
1472 | ROUND = Infix(lambda x, y: round(x, y))
1473 |
1474 |
1475 | def sharp_expr(expr):
1476 |     try:
1477 |         expr = re.sub('=', '==', expr)
1478 |         expr = re.sub('mod', '%', expr)
1479 |         expr = re.sub(r'\bdiv\b', '/', expr)
1480 |         expr = re.sub(r'\bround\b', '|ROUND|', expr)
1481 |         return str(eval(expr))
1482 |     except:
1483 |         return ''
1484 |
1485 |
1486 | def sharp_if(testValue, valueIfTrue, valueIfFalse=None, *args):
1487 | # In theory, we should evaluate the first argument here,
1488 | # but it was evaluated while evaluating part[0] in expandTemplate().
1489 | if testValue.strip():
1490 | # The {{#if:}} function is an if-then-else construct.
1491 | # The applied condition is: "The condition string is non-empty".
1492 | valueIfTrue = valueIfTrue.strip()
1493 | if valueIfTrue:
1494 | return valueIfTrue
1495 | elif valueIfFalse:
1496 | return valueIfFalse.strip()
1497 | return ""
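
# Illustrative behaviour of the {{#if:}} emulation above (a sketch):
#   sharp_if(' x ', 'yes', 'no')  ->  'yes'  (non-empty condition string)
#   sharp_if('   ', 'yes', 'no')  ->  'no'   (whitespace-only counts as empty)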
1498 |
1499 |
1500 | def sharp_ifeq(lvalue, rvalue, valueIfTrue, valueIfFalse=None, *args):
1501 | rvalue = rvalue.strip()
1502 | if rvalue:
1503 | # lvalue is always defined
1504 | if lvalue.strip() == rvalue:
1505 | # The {{#ifeq:}} function is an if-then-else construct. The
1506 | # applied condition is "is rvalue equal to lvalue". Note that this
1507 |             # only does string comparison, while the MediaWiki implementation
1508 |             # also supports numerical comparisons.
1509 |
1510 | if valueIfTrue:
1511 | return valueIfTrue.strip()
1512 | else:
1513 | if valueIfFalse:
1514 | return valueIfFalse.strip()
1515 | return ""
1516 |
1517 |
1518 | def sharp_iferror(test, then='', Else=None, *args):
1519 |     if re.match(r'<(?:strong|span|p|div)\s(?:[^\s>]*\s+)*?class="(?:[^"\s>]*\s+)*?error(?:\s[^">]*)?"', test):
1520 | return then
1521 | elif Else is None:
1522 | return test.strip()
1523 | else:
1524 | return Else.strip()
1525 |
1526 |
1527 | def sharp_switch(primary, *params):
1528 | # FIXME: we don't support numeric expressions in primary
1529 |
1530 | # {{#switch: comparison string
1531 | # | case1 = result1
1532 | # | case2
1533 | # | case4 = result2
1534 | # | 1 | case5 = result3
1535 | # | #default = result4
1536 | # }}
1537 |
1538 | primary = primary.strip()
1539 | found = False # for fall through cases
1540 | default = None
1541 | rvalue = None
1542 | lvalue = ''
1543 | for param in params:
1544 | # handle cases like:
1545 | # #default = [http://www.perseus.tufts.edu/hopper/text?doc=Perseus...]
1546 | pair = param.split('=', 1)
1547 | lvalue = pair[0].strip()
1548 | rvalue = None
1549 | if len(pair) > 1:
1550 | # got "="
1551 | rvalue = pair[1].strip()
1552 | # check for any of multiple values pipe separated
1553 | if found or primary in [v.strip() for v in lvalue.split('|')]:
1554 | # Found a match, return now
1555 | return rvalue
1556 | elif lvalue == '#default':
1557 | default = rvalue
1558 | rvalue = None # avoid defaulting to last case
1559 | elif lvalue == primary:
1560 | # If the value matches, set a flag and continue
1561 | found = True
1562 | # Default case
1563 | # Check if the last item had no = sign, thus specifying the default case
1564 | if rvalue is not None:
1565 | return lvalue
1566 | elif default is not None:
1567 | return default
1568 | return ''
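
# Illustrative fall-through behaviour of the {{#switch:}} emulation above (a sketch):
#   sharp_switch('case2', 'case1=result1', 'case2', 'case4=result2', '#default=result4')
#       -> 'result2'   ('case2' has no "=", so the next rvalue is returned)
#   sharp_switch('other', 'case1=result1', '#default=result4')
#       -> 'result4'   (falls back to the #default case)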
1569 |
1570 |
1571 | # Extension Scribuntu
1572 | def sharp_invoke(module, function, frame):
1573 | functions = modules.get(module)
1574 | if functions:
1575 | funct = functions.get(function)
1576 | if funct:
1577 | # find parameters in frame whose title is the one of the original
1578 | # template invocation
1579 | templateTitle = fullyQualifiedTemplateTitle(function)
1580 | if not templateTitle:
1581 | logging.warn("Template with empty title")
1582 | pair = next((x for x in frame if x[0] == templateTitle), None)
1583 | if pair:
1584 | params = pair[1]
1585 | # extract positional args
1586 | params = [params.get(str(i + 1)) for i in range(len(params))]
1587 | return funct(*params)
1588 | else:
1589 | return funct()
1590 | return ''
1591 |
1592 |
1593 | parserFunctions = {
1594 |
1595 | '#expr': sharp_expr,
1596 |
1597 | '#if': sharp_if,
1598 |
1599 | '#ifeq': sharp_ifeq,
1600 |
1601 | '#iferror': sharp_iferror,
1602 |
1603 | '#ifexpr': lambda *args: '', # not supported
1604 |
1605 | '#ifexist': lambda *args: '', # not supported
1606 |
1607 | '#rel2abs': lambda *args: '', # not supported
1608 |
1609 | '#switch': sharp_switch,
1610 |
1611 |     '#language': lambda *args: '', # not supported
1612 |
1613 | '#time': lambda *args: '', # not supported
1614 |
1615 | '#timel': lambda *args: '', # not supported
1616 |
1617 | '#titleparts': lambda *args: '', # not supported
1618 |
1619 | # This function is used in some pages to construct links
1620 | # http://meta.wikimedia.org/wiki/Help:URL
1621 | 'urlencode': lambda string, *rest: urlencode(string),
1622 |
1623 | 'lc': lambda string, *rest: string.lower() if string else '',
1624 |
1625 | 'lcfirst': lambda string, *rest: lcfirst(string),
1626 |
1627 | 'uc': lambda string, *rest: string.upper() if string else '',
1628 |
1629 | 'ucfirst': lambda string, *rest: ucfirst(string),
1630 |
1631 | 'int': lambda string, *rest: str(int(string)),
1632 |
1633 | }
1634 |
1635 |
1636 | def callParserFunction(functionName, args, frame):
1637 | """
1638 | Parser functions have similar syntax as templates, except that
1639 | the first argument is everything after the first colon.
1640 | :return: the result of the invocation, None in case of failure.
1641 |
1642 | http://meta.wikimedia.org/wiki/Help:ParserFunctions
1643 | """
1644 |
1645 | try:
1646 | if functionName == '#invoke':
1647 | # special handling of frame
1648 | ret = sharp_invoke(args[0].strip(), args[1].strip(), frame)
1649 | # logging.debug('parserFunction> %s %s', functionName, ret)
1650 | return ret
1651 | if functionName in parserFunctions:
1652 | ret = parserFunctions[functionName](*args)
1653 | # logging.debug('parserFunction> %s %s', functionName, ret)
1654 | return ret
1655 | except:
1656 | return "" # FIXME: fix errors
1657 |
1658 | return ""
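
# Illustrative calls (a sketch; the frame argument is only consulted for '#invoke'):
#   callParserFunction('#ifeq', ['a', 'a', 'equal', 'different'], [])  ->  'equal'
#   callParserFunction('#time', ['d F Y'], [])                         ->  ''  (not supported)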
1659 |
1660 |
1661 | # ----------------------------------------------------------------------
1662 | # Extract Template definition
1663 |
1664 | reNoinclude = re.compile(r'<noinclude>(?:.*?)</noinclude>', re.DOTALL)
1665 | reIncludeonly = re.compile(r'<includeonly>|</includeonly>', re.DOTALL)
1666 |
1667 | # These are built before spawning processes, hence they are shared.
1668 | templates = {}
1669 | redirects = {}
1670 | # cache of parser templates
1671 | # FIXME: sharing this with a Manager slows down.
1672 | templateCache = {}
1673 |
1674 |
1675 | def define_template(title, page):
1676 | """
1677 | Adds a template defined in the :param page:.
1678 | @see https://en.wikipedia.org/wiki/Help:Template#Noinclude.2C_includeonly.2C_and_onlyinclude
1679 | """
1680 | global templates
1681 | global redirects
1682 |
1683 | # title = normalizeTitle(title)
1684 |
1685 | # check for redirects
1686 |     m = re.match(r'#REDIRECT.*?\[\[([^\]]*)]]', page[0], re.IGNORECASE)
1687 | if m:
1688 | redirects[title] = m.group(1) # normalizeTitle(m.group(1))
1689 | return
1690 |
1691 | text = unescape(''.join(page))
1692 |
1693 | # We're storing template text for future inclusion, therefore,
1694 |     # remove all <noinclude> text and keep all <includeonly> text
1695 |     # (but eliminate <includeonly> tags per se).
1696 |     # However, if <onlyinclude> ... </onlyinclude> parts are present,
1697 |     # then only keep them and discard the rest of the template body.
1698 |     # This is because using <onlyinclude> on a text fragment is
1699 |     # equivalent to enclosing it in <includeonly> tags **AND**
1700 |     # enclosing all the rest of the template body in <noinclude> tags.
1701 |
1702 | # remove comments
1703 | text = comment.sub('', text)
1704 |
1705 |     # eliminate <noinclude> fragments
1706 |     text = reNoinclude.sub('', text)
1707 |     # eliminate unterminated <noinclude> elements
1708 |     text = re.sub(r'<noinclude\s*>.*$', '', text, flags=re.DOTALL)
1709 |     text = re.sub(r'<noinclude/>', '', text)
1710 |
1711 | onlyincludeAccumulator = ''
1712 |     for m in re.finditer('<onlyinclude>(.*?)</onlyinclude>', text, re.DOTALL):
1713 | onlyincludeAccumulator += m.group(1)
1714 | if onlyincludeAccumulator:
1715 | text = onlyincludeAccumulator
1716 | else:
1717 | text = reIncludeonly.sub('', text)
1718 |
1719 | if text:
1720 | if title in templates:
1721 | logging.warn('Redefining: %s', title)
1722 | templates[title] = text
1723 |
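# Illustrative effect of define_template (a sketch; unescape, comment and the
# globals above are defined earlier in this module):
#   define_template('Template:Hello', ['Hi!<noinclude> documentation </noinclude>'])
#   templates['Template:Hello']  ->  'Hi!'
#   define_template('Template:Old', ['#REDIRECT [[Template:New]]'])
#   redirects['Template:Old']    ->  'Template:New'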
--------------------------------------------------------------------------------
/reader/wikiextractor/extractPage.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | #
4 | # =============================================================================
5 | # Version: 3.0 (July 22, 2020)
6 | # Author: Giuseppe Attardi (attardi@di.unipi.it), University of Pisa
7 |
8 | # =============================================================================
9 | # Copyright (c) 2009. Giuseppe Attardi (attardi@di.unipi.it).
10 | # =============================================================================
11 | # This file is part of Tanl.
12 | #
13 | # Tanl is free software; you can redistribute it and/or modify it
14 | # under the terms of the GNU Affero General Public License, version 3,
15 | # as published by the Free Software Foundation.
16 | #
17 | # Tanl is distributed in the hope that it will be useful,
18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 | # GNU Affero General Public License for more details.
21 | #
22 | # You should have received a copy of the GNU Affero General Public License
23 | # along with this program.  If not, see <http://www.gnu.org/licenses/>.
24 | # =============================================================================
25 |
26 | """Wikipedia Page Extractor:
27 | Extracts a single page from a Wikipedia dump file.
28 | """
29 |
30 | import sys, os.path
31 | import re
32 | import argparse
33 | import bz2
34 |
35 |
36 | # Program version
37 | __version__ = '3.0.5'
38 |
39 | # ----------------------------------------------------------------------
40 | # READER
41 |
42 | tagRE = re.compile(r'(.*?)<(/?\w+)[^>]*>(?:([^<]*)(<.*?>)?)?')
43 | #tagRE = re.compile(r'(.*?)<(/?\w+)[^>]*>([^<]*)')
44 | # 1 2 3
45 |
46 | def process_data(input_file, id, templates=False):
47 | """
48 | :param input_file: name of the wikipedia dump file.
49 | :param id: article id
50 | """
51 |
52 | if input_file.lower().endswith(".bz2"):
53 | input = bz2.open(input_file, mode='rt', encoding='utf-8')
54 | else:
55 | input = open(input_file)
56 |
57 | page = []
58 | for line in input:
59 | line = line
60 | if '<' not in line: # faster than doing re.search()
61 | if page:
62 | page.append(line)
63 | continue
64 | m = tagRE.search(line)
65 | if not m:
66 | continue
67 | tag = m.group(2)
68 | if tag == 'page':
69 | page = []
70 | page.append(line)
71 | inArticle = False
72 | elif tag == 'id':
73 | curid = m.group(3)
74 | if id == curid:
75 | page.append(line)
76 | inArticle = True
77 | elif not inArticle and not templates:
78 | page = []
79 | elif tag == 'title':
80 | if templates:
81 | if m.group(3).startswith('Template:'):
82 | page.append(line)
83 | else:
84 | page = []
85 | else:
86 | page.append(line)
87 | elif tag == '/page':
88 | if page:
89 | page.append(line)
90 | print(''.join(page))
91 | if not templates:
92 | break
93 | page = []
94 | elif page:
95 | page.append(line)
96 |
97 | input.close()
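
# Illustrative invocation (a sketch; the dump file name is hypothetical):
#   process_data('zhwiki-pages-meta-history.xml.bz2', '13')
# prints the <page> block whose <id> matches '13'; with templates=True it also
# prints every page whose <title> starts with 'Template:'.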
98 |
99 | def main():
100 | parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
101 | formatter_class=argparse.RawDescriptionHelpFormatter,
102 | description=__doc__)
103 | parser.add_argument("input",
104 | help="XML wiki dump file")
105 | parser.add_argument("--id", default="1",
106 | help="article number")
107 | parser.add_argument("--template", action="store_true",
108 |                         help="extract Template pages as well")
109 | parser.add_argument("-v", "--version", action="version",
110 |                         version='%(prog)s ' + __version__,
111 | help="print program version")
112 |
113 | args = parser.parse_args()
114 |
115 | process_data(args.input, args.id, args.template)
116 |
117 | if __name__ == '__main__':
118 | main()
119 |
--------------------------------------------------------------------------------
/run/pipeline.py:
--------------------------------------------------------------------------------
1 | import subprocess
2 | import time
3 | import os
4 | import logging
5 | import copy
6 | from glob import glob
7 |
8 | import threading
9 | from multiprocessing import Process
10 | import multiprocessing as mp
11 |
12 | def run_bash(args, cmd):
13 |     cmd = cmd.format(**args)
14 |     print('cmd:', cmd)
15 | os.system(cmd)
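
# Illustrative call (a sketch): run_bash({'name': 'dump.xml'}, 'wc -l {name}')
# formats the command template with the dict and runs "wc -l dump.xml" via os.system.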
16 |
17 | def check_output_dir(output_dir):
18 |     # create the directory (and any missing parents) if it does not already exist
19 |     os.makedirs(output_dir, exist_ok=True)
20 |
21 | POOL_SIZE = 9
22 | MAX_EDIT = 0.1
23 | MIN_FILESIZE = 5 # KB
24 |
25 | global_args = {
26 | "code_root" : '/nfs/users/xueyou/github/wiki-error-corpus',
27 | # "xml_dump": '/nfs/users/xueyou/data/speller/wiki/zhwiki-20211201-pages-meta-history1.xml-p2981p11534',
28 | "input_dir" : '/data/xueyou/data/speller/wiki/',
29 | "output_dir" : '/data/xueyou/data/speller/wiki/',
30 | 'max_edit': MAX_EDIT
31 | }
32 |
33 | for xml_dump_file in list(glob('/data/xueyou/data/speller/wiki/*.7z')):
34 | global_args['xml_dump'] = xml_dump_file.replace('.7z','')
35 |
36 | # Stage 1
37 | # Extract 7z file
38 | print(f'extract {xml_dump_file}')
39 | cmd = f'7z e {xml_dump_file}'
40 | run_bash({},cmd)
41 |
42 | # Stage 2
43 | # Divide the large XML revision dump file into per page revisions.
44 |     print('divide XML file')
45 | cmd = 'python {code_root}/reader/divide_xml_revisions.py {xml_dump} {output_dir}'
46 | args = copy.deepcopy(global_args)
47 | args['output_dir'] = args['output_dir'] + 'stage1'
48 | check_output_dir(args['output_dir'])
49 | run_bash(args,cmd)
50 |
51 |
52 | # Stage 3
53 | # Extract Revisions from page history
54 | cmd = 'python {code_root}/reader/extract_revisions_new.py {input_dir} {input_file} {output_dir}'
55 | input_dir = global_args['input_dir'] + 'stage1'
56 | output_dir = global_args['output_dir'] + 'stage3'
57 | check_output_dir(output_dir)
58 |
59 | pool = mp.Pool(processes = POOL_SIZE)
60 | for fname in glob(input_dir + '/*.xml'):
61 | fsize = os.path.getsize(fname) / 1024 # KB
62 | if fsize < MIN_FILESIZE:
63 | # print(f'small size, skip {fname}')
64 | continue
65 | args = copy.deepcopy(global_args)
66 | args['input_dir'] = input_dir
67 | args['output_dir'] = output_dir
68 | args['input_file'] = os.path.basename(fname)
69 | pool.apply_async(run_bash,(args, cmd))
70 | pool.close()
71 | pool.join()
72 |
73 | # Stage 4
74 | # Extract errors with edit distance
75 | cmd = 'python {code_root}/reader/extract_spelling_errors_new.py {input_dir} {input_file} {output_dir} zh {max_edit}'
76 | input_dir = global_args['input_dir'] + 'stage3'
77 | output_dir = global_args['output_dir'] + 'stage4'
78 | check_output_dir(output_dir)
79 |
80 | pool = mp.Pool(processes = POOL_SIZE)
81 | for fname in glob(input_dir + '/*.xml'):
82 | basename = os.path.basename(fname)
83 | args = copy.deepcopy(global_args)
84 | args['input_dir'] = input_dir
85 | args['output_dir'] = output_dir
86 | args['input_file'] = basename
87 | pool.apply_async(run_bash,(args, cmd))
88 | pool.close()
89 | pool.join()
90 |
91 |     # Stage 5
92 |     # Collect all the errors
93 | input_dir = global_args['input_dir'] + 'stage4'
94 | output_dir = global_args['input_dir'] + 'stage5'
95 | check_output_dir(output_dir)
96 | with open(output_dir + '/error_sent.txt','a') as ef,open(output_dir + '/ori_sent.txt','a') as of:
97 | for fname in glob(input_dir + '/*.xml_error_sen.txt'):
98 | basename = os.path.basename(fname).split('.')[0]
99 | # err_f.write(open(input_dir + '/' + basename + '.xml_spelling_error.txt').read())
100 | ef.write(open(input_dir + '/' + basename + '.xml_error_sen.txt').read())
101 | of.write(open(input_dir + '/' + basename + '.xml_orig_sen.txt').read())
102 |
103 | # Stage 6
104 |     # Clean up temporary files and archive the processed dump
105 | print('clear tmp files')
106 | check_output_dir('./extracted')
107 | cmd = f'''rm {global_args['xml_dump']}
108 | rm -rf stage1 stage3 stage4
109 | mv {xml_dump_file} extracted
110 | '''
111 | run_bash({},cmd)
112 | print('all done')
113 |
114 |
115 |
116 |
117 |
--------------------------------------------------------------------------------
]