├── .gitignore
├── README.md
├── reader
│   ├── __init__.py
│   ├── divide_xml_revisions.py
│   ├── divide_xml_revisions_new.py
│   ├── extract_revisions.py
│   ├── extract_revisions_new.py
│   ├── extract_spelling_errors.py
│   ├── extract_spelling_errors_new.py
│   ├── fix_extracted.py
│   ├── utils.py
│   └── wikiextractor
│       ├── WikiExtractor.py
│       ├── __init__.py
│       ├── cirrus-extract.py
│       ├── clean.py
│       ├── extract.py
│       └── extractPage.py
└── run
    └── pipeline.py

/.gitignore:
--------------------------------------------------------------------------------
1 | .vscode/
2 | # Byte-compiled / optimized / DLL files
3 | __pycache__/
4 | *.py[cod]
5 | *$py.class
6 |
7 | # C extensions
8 | *.so
9 |
10 | # Distribution / packaging
11 | .Python
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | pip-wheel-metadata/
25 | share/python-wheels/
26 | *.egg-info/
27 | .installed.cfg
28 | *.egg
29 | MANIFEST
30 |
31 | # PyInstaller
32 | # Usually these files are written by a python script from a template
33 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
34 | *.manifest
35 | *.spec
36 |
37 | # Installer logs
38 | pip-log.txt
39 | pip-delete-this-directory.txt
40 |
41 | # Unit test / coverage reports
42 | htmlcov/
43 | .tox/
44 | .nox/
45 | .coverage
46 | .coverage.*
47 | .cache
48 | nosetests.xml
49 | coverage.xml
50 | *.cover
51 | *.py,cover
52 | .hypothesis/
53 | .pytest_cache/
54 |
55 | # Translations
56 | *.mo
57 | *.pot
58 |
59 | # Django stuff:
60 | *.log
61 | local_settings.py
62 | db.sqlite3
63 | db.sqlite3-journal
64 |
65 | # Flask stuff:
66 | instance/
67 | .webassets-cache
68 |
69 | # Scrapy stuff:
70 | .scrapy
71 |
72 | # Sphinx documentation
73 | docs/_build/
74 |
75 | # PyBuilder
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | .python-version
87 |
88 | # pipenv
89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
92 | # install all needed dependencies.
93 | #Pipfile.lock
94 |
95 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
96 | __pypackages__/
97 |
98 | # Celery stuff
99 | celerybeat-schedule
100 | celerybeat.pid
101 |
102 | # SageMath parsed files
103 | *.sage.py
104 |
105 | # Environments
106 | .env
107 | .venv
108 | env/
109 | venv/
110 | ENV/
111 | env.bak/
112 | venv.bak/
113 |
114 | # Spyder project settings
115 | .spyderproject
116 | .spyproject
117 |
118 | # Rope project settings
119 | .ropeproject
120 |
121 | # mkdocs documentation
122 | /site
123 |
124 | # mypy
125 | .mypy_cache/
126 | .dmypy.json
127 | dmypy.json
128 |
129 | # Pyre type checker
130 | .pyre/
131 |
132 | tcdata/
133 | user_data/*
134 | !user_data/extra_data
135 | !user_data/track3
136 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # WiKi-Error-Extract
2 |
3 | **\*\*\*\*\* Update 2022-03-04 \*\*\*\*\***
4 |
5 | Since this pipeline is still quite slow to run, you can use the results I have already extracted:
6 |
7 | [Download link + password: c4qk](https://pan.baidu.com/s/1PovlwB9H1Zu-Jv_WN9xZnQ)
8 |
9 | > It is recommended to filter the extracted errors with a confusion set.
10 |
11 | The extraction logic has also been revised to avoid reading and writing a large number of small files. Run:
12 |
13 | ```python
14 | python code_dir/reader/divide_xml_revisions_new.py unzipped_file_path output_file_path 0.1
15 | ```
16 |
17 | **\*\*\*\*\* Update End \*\*\*\*\***
18 |
19 | Extracts parallel corpora for error correction from Wikipedia edit-history data. Only sentence pairs whose characters are aligned one-to-one are extracted; character insertions and deletions are not considered. Modify the code yourself if you need those cases.
20 |
21 | Steps:
22 |
23 | 1. Download the zhwiki data with complete edit history from the Wikipedia dumps. I downloaded the "All pages with complete edit history (.7z)" data from [20211201](https://dumps.wikimedia.org/zhwiki/20211201/).
24 |
25 | 2. Put all the .7z files in the same directory A and create a new extracted directory under it. Update the corresponding directory paths in run/pipeline.py.
26 |
27 | 3. Run the pipeline script from directory A: `python wiki-error-extract dir/run/pipeline.py`
28 |
29 | 4. The final results will be in the stage5 folder under directory A.
30 |
31 |
32 | ## Examples
33 |
34 | ```
35 | [{'src': '比如我把勾股定理叫做勾理定理,并不意味着我认为为它是中国人最先证明的,而是"勾"股"弦"本来就是指直角三角形的三边。',
36 |   'tgt': '比如我把勾股定理叫做勾股定理,并不意味着我认为为它是中国人最先证明的,而是"勾"股"弦"本来就是指直角三角形的三边。'},
37 |  {'src': '最后在反对杀局的民建联议员黄容根及临区局主席刘皇发在投票前「巧合」地失踪,而支持杀局的李国宝在投票前二十秒赶到等「戏剧化」情节下而令草案得到通过。',
38 |   'tgt': '最后在反对杀局的民建联议员黄容根及临区局主席刘皇发在投票前「巧合」地失踪,而支持杀局的李国宝在投票前20秒赶到等「戏剧化」情节下而令草案得到通过。'},
39 |  {'src': '禁止公众摄影图书馆 康文署拒放宽.', 'tgt': '禁止公众拍摄图书馆 康文署拒放宽.'},
40 |  {'src': '上白礁谒祖祭典由台湾台南县学甲镇慈济宫所举行,于每年农历三月十一日举行,即当地所称上白礁活动,于当地将军溪畔「头前寮」举行祭典,隔海遥祭保生大帝祖庙,即中国大陆福建白礁慈济宫。',
41 |   'tgt': '上白礁谒祖祭典由台湾台南市学甲区慈济宫所举行,于每年农历三月十一日举行,即当地所称上白礁活动,于当地将军溪畔「头前寮」举行祭典,隔海遥祭保生大帝祖庙,即中国大陆福建白礁慈济宫。'},
42 |  {'src': '1980年后,改善分离的选择性成为色谱工作者的主要问题,人们越来越认识到改变流动相的组成事提高选择性的关键。',
43 |   'tgt': '1980年后,改善分离的选择性成为色谱工作者的主要问题,人们越来越认识到改变流动相的组成是提高选择性的关键。'},
44 |  {'src': '德国战车最大的缺点是生产速度慢,直到战终德国主力四号战车也仅生产九千多、五号豹式六千多部。',
45 |   'tgt': '德国战车最大的缺点是生产速度慢,直到战终德国主力四号坦克也仅生产九千多、五号豹式六千多部。'},
46 |  {'src': '由于过多贵格会信仰者居住于费城,因此费城人又称"贵格会信仰者"。',
47 |   'tgt': '由于许多贵格会信仰者居住于费城,因此费城人又称"贵格会信仰者"。'},
48 |  {'src': '进入校门,映入眼帘的即是建中的指标建筑红楼,其建于日治时期(1909年),目前已列入台北市市定古迹。',
49 |   'tgt': '进入校门,映入眼帘的即是建中的指标建筑红楼,其建于日据时期(1909年),目前已列入台北市市定古迹。'},
50 |  {'src': '2015年1月20日,该校一名姓陈17岁中五女生因担心迟到而追小巴,却被撞至阴道重创昏迷。',
51 |   'tgt': '2015年1月20日,该校一名姓陈17岁中五女生因担心迟到而追小巴,却被撞至头部重创昏迷。'},
52 |  {'src': '在1909年埃希纳·赫茨普龙是第一位提出天狼星是大熊座移动星团之一的人,他在观测天狼星系统在天空中的移动路径之后得出这个结论。',
53 |   'tgt': '在1909年埃希纳·赫茨普龙是第一位提出天狼星是大熊座移动星群之一的人,他在观测天狼星系统在天空中的移动路径之后得出这个结论。'}]
54 | ```
--------------------------------------------------------------------------------
/reader/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xueyouluo/wiki-error-extract/7871514c89bb3e6e2ceb314ac4f5eba94d75bb7a/reader/__init__.py
--------------------------------------------------------------------------------
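Editor's note on the README's confusion-set suggestion: the sketch below shows one way to post-filter the JSONL output written by `divide_xml_revisions_new.py` (one `{"src": ..., "tgt": ...}` object per line, as in the examples above). It is not part of this repository; the file name `confusion.txt` and its whitespace-separated "correct-char confusable-chars" format are assumptions made purely for illustration.

```python
# Minimal post-filtering sketch (assumptions: JSONL pairs from
# divide_xml_revisions_new.py and a hypothetical confusion.txt whose lines
# look like "因 音 荫 阴", i.e. a correct character followed by confusables).
import json


def load_confusion_set(path):
    """Map each correct character to the set of characters it is confused with."""
    confusion = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            chars = line.split()
            if chars:
                confusion[chars[0]] = set(chars[1:])
    return confusion


def covered_by_confusion(src, tgt, confusion):
    # The extracted pairs are 1-1 character aligned, so keep a pair only if
    # every changed source character is a known confusion of the target one.
    if len(src) != len(tgt):
        return False
    return all(s == t or s in confusion.get(t, ()) for s, t in zip(src, tgt))


def filter_pairs(jsonl_path, confusion_path, out_path):
    """Write the pairs whose edits are covered by the confusion set; return the count."""
    confusion = load_confusion_set(confusion_path)
    kept = 0
    with open(jsonl_path, encoding='utf-8') as fin, \
            open(out_path, 'w', encoding='utf-8') as fout:
        for line in fin:
            pair = json.loads(line)
            if covered_by_confusion(pair['src'], pair['tgt'], confusion):
                fout.write(json.dumps(pair, ensure_ascii=False) + '\n')
                kept += 1
    return kept


if __name__ == '__main__':
    print(filter_pairs('output_file_path', 'confusion.txt', 'filtered.jsonl'))
```

This drops semantic rewrites (e.g. the 台南县→台南市 example above) and keeps only substitutions that look like genuine spelling errors, which is usually what a spelling-correction training set needs.
--------------------------------------------------------------------------------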
/reader/divide_xml_revisions.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """Divide the large XML revision dump file into per page revisions. 4 | 5 | """ 6 | import codecs 7 | import os 8 | import xml.sax 9 | import xml.sax.saxutils 10 | 11 | 12 | html_escape_table = { 13 | u'‘': "'", 14 | u'’': "'", 15 | u'“': '"', 16 | u'”': '"' 17 | } 18 | 19 | html_unescape_table = {v:k for k, v in html_escape_table.items()} 20 | 21 | def html_escape(text): 22 | return xml.sax.saxutils.escape(text, html_escape_table) 23 | 24 | def html_unescape(text): 25 | return xml.sax.saxutils.unescape(text, html_unescape_table) 26 | 27 | 28 | class WikiRevisionDumpHandler(xml.sax.ContentHandler): 29 | wiki_dump_tags = set(['page', 'title', 'ns', 'id', 'revision', 'parentid', 30 | 'timestamp', 'contributor', 'ip', 'username', 31 | 'comment', 'model', 'format', 'text', 'sha1']) 32 | file_counter = 0 33 | file_handle = '' 34 | 35 | def __init__(self, input_file, output_dir): 36 | # Input/output locations 37 | self.input_file = input_file 38 | self.output_dir = output_dir 39 | 40 | # Recent tag visited by SAX parser 41 | self.curr_tag = '' 42 | self.content = '' 43 | 44 | def startElement(self, tag, attributes): 45 | self.curr_tag = tag 46 | if self.curr_tag == 'page': 47 | # close the unclosed handles first if any 48 | if self.file_handle: 49 | self.file_handle.close() 50 | fname = repr(self.file_counter).zfill(10) + '.xml' 51 | abspath = self.output_dir + '/' + fname 52 | print('Writing to file: ', abspath ) 53 | self.file_handle = codecs.open(abspath, 'w', 'utf-8') 54 | self.file_handle.write(self.tag_start('page')+'\n') 55 | elif self.curr_tag in self.wiki_dump_tags: 56 | self.file_handle.write(self.tag_start(self.curr_tag)) 57 | 58 | def endElement(self, tag): 59 | self.curr_tag = tag 60 | if self.curr_tag == 'page': 61 | self.file_handle.write(self.tag_end('page')) 62 | self.file_handle.close() 63 | self.file_counter += 1 64 | elif self.curr_tag in self.wiki_dump_tags: 65 | self.file_handle.write(self.tag_end(self.curr_tag)) 66 | 67 | def characters(self, contents): 68 | self.content = contents 69 | if self.curr_tag != 'page' and self.curr_tag in self.wiki_dump_tags: 70 | self.file_handle.write(html_escape(self.content)) 71 | 72 | @staticmethod 73 | def surround_wih_tag(tag, cont): return '<'+tag+'>'+cont+'' 74 | 75 | @staticmethod 76 | def tag_start(tag): return '<'+tag+'>' 77 | 78 | @staticmethod 79 | def tag_end(tag): return '' 80 | 81 | 82 | if __name__ == '__main__': 83 | import argparse 84 | arg_parser = argparse.ArgumentParser(description='Script for dividing the large XML revision dump into individual page revisions.') 85 | arg_parser.add_argument('input_file', help='XML revision dump file name') 86 | arg_parser.add_argument('output_dir', help='Output directory') 87 | args = arg_parser.parse_args() 88 | if not os.path.exists(args.output_dir): 89 | os.makedirs(args.output_dir) 90 | 91 | # SAX XML reader 92 | xml_parser = xml.sax.make_parser() 93 | 94 | revision_dump_handler = WikiRevisionDumpHandler(args.input_file, args.output_dir) 95 | xml_parser.setContentHandler(revision_dump_handler) 96 | xml_parser.parse(args.input_file) 97 | -------------------------------------------------------------------------------- /reader/divide_xml_revisions_new.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """Divide the large XML revision dump file into per page revisions. 
4 | 5 | """ 6 | import codecs 7 | import os 8 | import xml.sax 9 | import xml.sax.saxutils 10 | import io 11 | import json 12 | from extract_revisions_new import extract_revisions 13 | from extract_spelling_errors_new import converter,check_error 14 | 15 | html_escape_table = { 16 | u'‘': "'", 17 | u'’': "'", 18 | u'“': '"', 19 | u'”': '"' 20 | } 21 | 22 | html_unescape_table = {v:k for k, v in html_escape_table.items()} 23 | 24 | def html_escape(text): 25 | return xml.sax.saxutils.escape(text, html_escape_table) 26 | 27 | def html_unescape(text): 28 | return xml.sax.saxutils.unescape(text, html_unescape_table) 29 | 30 | 31 | def extract_errors(content,number_of_edits,outfile): 32 | buffer = '' 33 | # extract revisions 34 | for timestamp,text in extract_revisions(io.StringIO(content)): 35 | if len(text) > 10: 36 | buffer += '\n\n[Revision timestamp: ' + timestamp + ']\n\n' 37 | buffer += text 38 | 39 | revisions = [] 40 | line = [] 41 | 42 | srcs,tgts = [],[] 43 | pre_revision = '' 44 | current_revision = '' 45 | cnt = 0 46 | for line in buffer.splitlines(): 47 | line = converter.convert(line) 48 | if "Revision timestamp" in line: 49 | if current_revision: 50 | if pre_revision: 51 | cnt += 1 52 | # if cnt % 100 == 0: 53 | # print('processed',cnt) 54 | check_error(pre_revision,current_revision.strip(),srcs,tgts,number_of_edits) 55 | pre_revision = current_revision.strip() 56 | current_revision = '' 57 | else: 58 | current_revision += line 59 | if current_revision and pre_revision: 60 | check_error(pre_revision,current_revision.strip(),srcs,tgts,number_of_edits) 61 | 62 | 63 | keeps = [] 64 | if srcs: 65 | errors = set() 66 | for src,tgt in zip(srcs[::-1],tgts[::-1]): 67 | if src in errors or tgt in errors: 68 | continue 69 | errors.add(src) 70 | errors.add(tgt) 71 | keeps.append({"src":src,'tgt':tgt}) 72 | 73 | if keeps: 74 | for x in keeps: 75 | outfile.write(json.dumps(x,ensure_ascii=False) + '\n') 76 | 77 | class WikiRevisionDumpHandler(xml.sax.ContentHandler): 78 | wiki_dump_tags = set(['page', 'title', 'ns', 'id', 'revision', 'parentid', 79 | 'timestamp', 'contributor', 'ip', 'username', 80 | 'comment', 'model', 'format', 'text', 'sha1']) 81 | file_counter = 0 82 | file_handle = '' 83 | 84 | def __init__(self, input_file, output_file, number_of_edits): 85 | # Input/output locations 86 | self.input_file = input_file 87 | self.output_file = open(output_file,'a') 88 | self.number_of_edits = number_of_edits 89 | 90 | # Recent tag visited by SAX parser 91 | self.curr_tag = '' 92 | self.content = '' 93 | 94 | def startElement(self, tag, attributes): 95 | self.curr_tag = tag 96 | if self.curr_tag == 'page': 97 | # close the unclosed handles first if any 98 | if self.file_handle: 99 | self.file_handle = '' 100 | self.file_handle += self.tag_start('page')+'\n' 101 | elif self.curr_tag in self.wiki_dump_tags: 102 | self.file_handle += self.tag_start(self.curr_tag) 103 | 104 | def endElement(self, tag): 105 | self.curr_tag = tag 106 | if self.curr_tag == 'page': 107 | self.file_handle += self.tag_end('page') 108 | self.file_counter += 1 109 | extract_errors(self.file_handle,self.number_of_edits,self.output_file) 110 | self.file_handle = '' 111 | # if self.file_counter % 100 == 0: 112 | print(f'{self.input_file} processed {self.file_counter} pages') 113 | elif self.curr_tag in self.wiki_dump_tags: 114 | self.file_handle += self.tag_end(self.curr_tag) 115 | 116 | def characters(self, contents): 117 | self.content = contents 118 | if self.curr_tag != 'page' and self.curr_tag in self.wiki_dump_tags: 
119 | self.file_handle += html_escape(self.content) 120 | 121 | @staticmethod 122 | def surround_wih_tag(tag, cont): return '<'+tag+'>'+cont+'' 123 | 124 | @staticmethod 125 | def tag_start(tag): return '<'+tag+'>' 126 | 127 | @staticmethod 128 | def tag_end(tag): return '' 129 | 130 | 131 | if __name__ == '__main__': 132 | import argparse 133 | arg_parser = argparse.ArgumentParser(description='Script for dividing the large XML revision dump into individual page revisions.') 134 | arg_parser.add_argument('input_file', help='XML revision dump file name') 135 | arg_parser.add_argument('output_file', help='Output file') 136 | arg_parser.add_argument('number_of_edits', help='number_of_edits') 137 | args = arg_parser.parse_args() 138 | number_of_edits = float(args.number_of_edits) 139 | if not os.path.exists(os.path.dirname(args.output_file)): 140 | os.makedirs(os.path.dirname(args.output_file)) 141 | 142 | # SAX XML reader 143 | xml_parser = xml.sax.make_parser() 144 | 145 | revision_dump_handler = WikiRevisionDumpHandler(args.input_file, args.output_file,number_of_edits) 146 | xml_parser.setContentHandler(revision_dump_handler) 147 | xml_parser.parse(args.input_file) 148 | -------------------------------------------------------------------------------- /reader/extract_revisions.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | import codecs 4 | import os 5 | import xml.sax 6 | import xml.sax.saxutils 7 | 8 | from fix_extracted import fix_extraction 9 | 10 | 11 | html_escape_table = { 12 | u'‘': "'", 13 | u'’': "'", 14 | u'“': '"', 15 | u'”': '"', 16 | u'&': '&' 17 | } 18 | 19 | html_unescape_table = {v:k for k, v in html_escape_table.items()} 20 | 21 | def html_escape(text): 22 | return xml.sax.saxutils.escape(text, html_escape_table) 23 | 24 | def html_unescape(text): 25 | return xml.sax.saxutils.unescape(text, html_unescape_table) 26 | 27 | 28 | class WikiRevisionHandler(xml.sax.ContentHandler): 29 | input_file = 'wiki.xml' 30 | output_dir = '.' 
31 | wiki_dump_tags = set(['page', 'title', 'ns', 'id', 'revision', 'parentid', 32 | 'timestamp', 'contributor', 'ip', 'username', 33 | 'comment', 'model', 'format', 'text', 'sha1']) 34 | file_handle = '' 35 | 36 | def __init__(self, input_file, output_file): 37 | # Input/output locations 38 | self.input_file = input_file 39 | self.output_file = output_file 40 | 41 | # Recent tag visited by SAX parser 42 | self.curr_tag = '' 43 | self.content = '' 44 | 45 | # Revisions 46 | self.revisions = [] 47 | self.curr_rev = [] 48 | self.rev_start = False 49 | self.ts_start = False 50 | self.timestamps = [] 51 | 52 | 53 | def startElement(self, tag, attributes): 54 | self.curr_tag = tag 55 | if self.curr_tag == 'timestamp': 56 | self.ts_start = True 57 | if self.curr_tag == 'revision': 58 | self.rev_start = True 59 | if self.curr_tag == 'page': 60 | # close the unclosed handles first if any 61 | if self.file_handle: 62 | self.file_handle.close() 63 | print('Writing to file: ', self.output_file) 64 | self.file_handle = codecs.open(self.output_file, 'w', 'utf-8') 65 | # self.file_handle.write(self.tag_start('page')+'\n') 66 | #elif self.curr_tag in self.wiki_dump_tags: 67 | # self.file_handle.write(self.tag_start(self.curr_tag)) 68 | 69 | def endElement(self, tag): 70 | self.curr_tag = tag 71 | if self.curr_tag == 'timestamp': 72 | self.ts_start = False 73 | if self.curr_tag == 'revision': 74 | self.rev_start = False 75 | if len(self.curr_rev) > 0: 76 | self.revisions.append(self.curr_rev) 77 | self.curr_rev = [] 78 | if self.curr_tag == 'page': 79 | # self.file_handle.write(self.tag_end('page')) 80 | print('revisions',len(self.revisions)) 81 | ts_revs = list(zip(self.timestamps, self.revisions)) 82 | for t_r in ts_revs[::-1]: 83 | self.file_handle.write('\n[Revision timestamp: ' + t_r[0] + ']\n') 84 | html_escaped = html_escape(''.join(t_r[1])) 85 | self.file_handle.write(html_escaped) 86 | self.file_handle.close() 87 | #elif self.curr_tag in self.wiki_dump_tags: 88 | # self.file_handle.write(self.tag_end(self.curr_tag)) 89 | 90 | def characters(self, contents): 91 | self.content = contents 92 | if self.curr_tag == 'text' and self.rev_start: 93 | self.curr_rev.append(self.content) 94 | if self.curr_tag == 'timestamp' and self.ts_start: 95 | self.timestamps.append(self.content) 96 | #self.file_handle.write('[Revision timestamp: ' + self.content + ']\n') 97 | #if self.curr_tag != 'page' and self.curr_tag in self.wiki_dump_tags: 98 | # self.file_handle.write(html_escape(self.content)) 99 | 100 | 101 | class WikiRevErrorHandler(xml.sax.handler.ErrorHandler): 102 | 103 | def error(self, exception): 104 | pass 105 | 106 | def fatalError(self, exception): 107 | pass 108 | 109 | def warning(self, exception): 110 | pass 111 | 112 | 113 | 114 | if __name__ == '__main__': 115 | import argparse 116 | arg_parser = argparse.ArgumentParser(description='Script for dividing the large XML revision dump into individual page revisions.') 117 | arg_parser.add_argument('input_dir', help='Input dir') 118 | arg_parser.add_argument('input_file', help='Input file') 119 | arg_parser.add_argument('output_dir', help='Output dir') 120 | args = arg_parser.parse_args() 121 | 122 | # fix extraction 123 | fix_extraction(args.input_dir,args.input_file,args.input_dir) 124 | 125 | input_file = args.input_dir + '/' + args.input_file 126 | output_file = args.output_dir + '/' + args.input_file 127 | # SAX XML reader 128 | xml_parser = xml.sax.make_parser() 129 | 130 | revision_handler = WikiRevisionHandler(input_file, output_file) 131 | 
wiki_err_handler = WikiRevErrorHandler() 132 | xml_parser.setContentHandler(revision_handler) 133 | xml_parser.setErrorHandler(wiki_err_handler) 134 | xml_parser.parse(input_file) 135 | 136 | -------------------------------------------------------------------------------- /reader/extract_revisions_new.py: -------------------------------------------------------------------------------- 1 | import re 2 | import logging 3 | from wikiextractor.extract import Extractor 4 | 5 | extractor = Extractor('##NoName##') 6 | 7 | 8 | def clean_revison(revision): 9 | # fix text 10 | revision = '\n'.join(revision) 11 | 12 | m = re.search(r'(.*?)', revision, flags=re.DOTALL) 13 | if m: 14 | text = m.group(1) 15 | else: 16 | logging.warning('Missing text element') 17 | return None 18 | 19 | text = extractor.extract(text) 20 | m = re.search(r'(.*?)', revision) 21 | timestamp = 'none' 22 | if m: 23 | timestamp = m.group(1) 24 | return (timestamp,text) 25 | 26 | def extract_revisions(fname): 27 | revision_cnt = 0 28 | revison_content = [] 29 | revison_area = False 30 | if isinstance(fname,str): 31 | fname = open(fname) 32 | for line in fname: 33 | if'' in line: 34 | # 如果revision有内容,那么肯定哪里出错了,直接丢弃数据 35 | if revison_content: 36 | revison_content = [] 37 | revison_area = True 38 | 39 | if '' in line: 40 | revision_cnt += 1 41 | if revision_cnt % 100 == 0: 42 | print(fname, 'revision cnt', revision_cnt) 43 | revison_content.append(line) 44 | fixed = clean_revison(revison_content) 45 | if fixed is not None: 46 | yield fixed 47 | revison_content = [] 48 | revison_area = False 49 | continue 50 | 51 | if revison_area: 52 | revison_content.append(line) 53 | 54 | 55 | 56 | if __name__ == '__main__': 57 | import argparse 58 | arg_parser = argparse.ArgumentParser(description='Script for dividing the large XML revision dump into individual page revisions.') 59 | arg_parser.add_argument('input_dir', help='Input dir') 60 | arg_parser.add_argument('input_file', help='Input file') 61 | arg_parser.add_argument('output_dir', help='Output dir') 62 | args = arg_parser.parse_args() 63 | 64 | 65 | input_file = args.input_dir + '/' + args.input_file 66 | output_file = args.output_dir + '/' + args.input_file 67 | with open(output_file,'w') as f: 68 | for timestamp,text in extract_revisions(input_file): 69 | if len(text) > 10: 70 | f.write('\n\n[Revision timestamp: ' + timestamp + ']\n\n') 71 | f.write(text) 72 | 73 | 74 | -------------------------------------------------------------------------------- /reader/extract_spelling_errors.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """Extracts spelling errors from revision history. 4 | 5 | """ 6 | 7 | import codecs 8 | import re 9 | import utils 10 | 11 | class RevisionSentence(object): 12 | """Class for representing an error sentence together with original sentence. 13 | 14 | """ 15 | def __init__(self, orig_tokens): 16 | self.orig_tokens = orig_tokens 17 | self.err_sen = [] 18 | 19 | def add_err_sentence(self, err_tokens): 20 | self.err_sen.append(err_tokens) 21 | 22 | def contains_spelling_errors(self): 23 | """Whether the earlier revisions of the same sentences have spelling errors. 24 | 25 | Returns: 26 | bool: True or False 27 | 28 | """ 29 | if len(self.err_sen) > 0: 30 | return True 31 | else: 32 | return False 33 | 34 | class ErrorCorpus(object): 35 | """Class for representing the original text data with spelling errors. 
36 | 37 | """ 38 | lang = 'english' 39 | max_dist = 3 40 | min_sen_len = 3 41 | 42 | def __init__(self, lang='english', max_edit_distance=3, min_sen_len=3): 43 | self.corpus = None 44 | self.num_rev = 0 45 | self.lang = lang 46 | self.max_edit = max_edit_distance 47 | self.min_sen_len = min_sen_len 48 | 49 | def create_corpus_from_wiki(self, corpus_root, filename, output_dir): 50 | create_error_corpus = False 51 | valid_word_pat = r'(?u)^\w+$' 52 | sentences = utils.get_sentences_for_text(corpus_root, filename) 53 | if sentences == None: 54 | return 55 | top_rev = [] 56 | top_rev_with_err = [] 57 | try: 58 | for s_list in sentences: 59 | s = ''.join(s_list) 60 | if s.startswith('[Revision timestamp:'): 61 | self.num_rev += 1 62 | else: 63 | if self.num_rev == 1: 64 | if len(s_list) >= self.min_sen_len: 65 | rev_sen = RevisionSentence(s_list) 66 | top_rev.append(rev_sen) 67 | elif self.num_rev > 1: 68 | for r in top_rev: 69 | if len(s_list) == len(r.orig_tokens): 70 | valid_errors = True 71 | errors = False 72 | old_curr_rev_sen = zip(r.orig_tokens, s_list) 73 | for t in old_curr_rev_sen: 74 | dist = utils.levenshtein_distance(t[0], t[1]) 75 | if dist > 0 and dist <= self.max_dist: 76 | # token must be a word 77 | orig_uni = utils.to_unicode_or_bust(t[0]) 78 | match = re.search(valid_word_pat, orig_uni) 79 | if match: 80 | errors = True 81 | elif dist > self.max_dist: 82 | valid_errors = False 83 | break 84 | if errors == True and valid_errors == True: 85 | print('errr') 86 | r.add_err_sentence(s_list) 87 | create_error_corpus = True 88 | break 89 | except AssertionError: 90 | print('Empty file') 91 | 92 | if create_error_corpus == True: 93 | with codecs.open(output_dir + '/' + filename, 'w', 'utf-8', errors='ignore') as f: 94 | for r in top_rev: 95 | if r.contains_spelling_errors() == True: 96 | orig_sen = ' '.join(r.orig_tokens) 97 | err_as_sen = map(lambda x: ' '.join(x), r.err_sen) 98 | orig_err_sen = [orig_sen] + list(err_as_sen) 99 | to_write_uni = '####'.join(orig_err_sen) 100 | f.write(to_write_uni + u'\n') 101 | 102 | if __name__ == '__main__': 103 | import argparse 104 | arg_parser = argparse.ArgumentParser(description='Script for extracting spelling errors from a revision history') 105 | arg_parser.add_argument('corpus_root', help='The directory in which the revision file exists') 106 | arg_parser.add_argument('input_file', help='Revision file') 107 | arg_parser.add_argument('output_dir', help='Output directory') 108 | arg_parser.add_argument('lang', help='Language of the text data') 109 | arg_parser.add_argument('max_edit', help='Maximum edit distance between the correct word and the misspelled work') 110 | 111 | args = arg_parser.parse_args() 112 | err_corpus = ErrorCorpus(args.lang.lower(), args.max_edit) 113 | err_corpus.create_corpus_from_wiki(args.corpus_root, args.input_file, args.output_dir) 114 | 115 | #import os 116 | #corpus_root = '/net/cluster/TMP/loganathan/wiki_dump/cs/processing/stage3' 117 | #for root, dirnames, filenames in os.walk(corpus_root): 118 | # for f in filenames: 119 | # err_corpus = ErrorCorpus() 120 | # print 'Extracting errors from: ', f 121 | # err_corpus.create_corpus_from_wiki(corpus_root, f, '') 122 | 123 | #corpus_root = '/net/cluster/TMP/loganathan/wiki_dump/cs/processing/tmp_out' 124 | #file_name = 'hello.txt' 125 | #err_corpus = ErrorCorpus() 126 | #err_corpus.create_corpus_from_wiki(corpus_root, file_name, '') 127 | -------------------------------------------------------------------------------- /reader/extract_spelling_errors_new.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import codecs 3 | import utils 4 | import re 5 | import sys 6 | from nltk.metrics import edit_distance 7 | import opencc 8 | import string 9 | import math 10 | converter = opencc.OpenCC('t2s.json') 11 | 12 | 13 | def hasNumbers(inputString): 14 | return bool(re.search(r'\d',inputString)) 15 | 16 | def hasBrackets(inputString): 17 | return bool(re.search(r'\[|\]|\)|\(',inputString)) 18 | 19 | def hasAlphabets(inputString): 20 | return bool(re.search(r'[a-zA-Z]',inputString)) 21 | 22 | def hasSpecialCharacters(inputString): 23 | return bool(re.search(r'[\|\s]',inputString)) 24 | 25 | def create_files(fname): 26 | # files = codecs.open(output+ '/' + fname + "_spelling_error.txt","w", encoding='utf-8') 27 | cf = codecs.open(output+ '/' + fname + "_orig_sen.txt","w", encoding='utf-8') 28 | ef = codecs.open(output+ '/' + fname + "_error_sen.txt","w", encoding='utf-8') 29 | return ef,cf 30 | 31 | def check_error(earlier,current, srcs,tgts,number_of_edits): 32 | earlier = utils.split_sentence(earlier) 33 | current = utils.split_sentence(current) 34 | if len(earlier)==len(current): 35 | for j in range(0, len(earlier)): 36 | f=0 37 | earlier_words = earlier[j] 38 | current_words = current[j] 39 | if earlier_words == current_words: 40 | continue 41 | if len(earlier_words) < 5: 42 | continue 43 | if sum([1 if utils.is_chinese_char(x) else 0 for x in current_words]) / len(current_words) <= 0.7: 44 | continue 45 | 46 | if(len(earlier_words) == len(current_words)): 47 | for k in range(0,len(earlier_words)): 48 | if earlier_words[k]==current_words[k]: 49 | continue 50 | elif utils.is_chinese_char(earlier_words[k]): 51 | f += 1 52 | 53 | thr = min(max(math.ceil(number_of_edits * len(current_words)),1),10) 54 | if(1<=f<=thr): 55 | srcs.append(earlier[j]) 56 | tgts.append(current[j]) 57 | 58 | if __name__ == '__main__': 59 | 60 | source = sys.argv[1]+"/" 61 | source += sys.argv[2] 62 | language = sys.argv[4] 63 | number_of_edits = float(sys.argv[5]) 64 | output = sys.argv[3] 65 | 66 | files,files_2,files_3 = None,None,None 67 | revisions = [] 68 | line = [] 69 | f=0 70 | 71 | srcs,tgts = [],[] 72 | pre_revision = '' 73 | current_revision = '' 74 | cnt = 0 75 | for line in open(source): 76 | line = converter.convert(line) 77 | if "Revision timestamp" in line: 78 | if current_revision: 79 | if pre_revision: 80 | cnt += 1 81 | if cnt % 100 == 0: 82 | print(source,'processed',cnt) 83 | check_error(pre_revision,current_revision.strip(),srcs,tgts,number_of_edits) 84 | pre_revision = current_revision.strip() 85 | current_revision = '' 86 | else: 87 | current_revision += line 88 | if current_revision and pre_revision: 89 | check_error(pre_revision,current_revision.strip(),srcs,tgts,number_of_edits) 90 | 91 | 92 | if srcs: 93 | ef,cf = create_files(sys.argv[2]) 94 | errors = set() 95 | for src,tgt in zip(srcs[::-1],tgts[::-1]): 96 | if src in errors or tgt in errors: 97 | continue 98 | errors.add(src) 99 | errors.add(tgt) 100 | ef.write(src + '\n') 101 | cf.write(tgt + '\n') 102 | 103 | ef.close() 104 | cf.close() 105 | -------------------------------------------------------------------------------- /reader/fix_extracted.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """Fixes output of WikiExtractor.py 4 | 5 | """ 6 | 7 | import argparse 8 | import codecs 9 | import re 10 | 11 | import xml.sax 12 | from xml.sax.handler import 
ContentHandler 13 | handler = ContentHandler() 14 | 15 | def fix_revison(revision): 16 | # fix text 17 | text_area = False 18 | new_revison = [] 19 | for x in revision: 20 | if '' in x: 21 | text_area = True 22 | 23 | # No 24 | if text_area and '' in x and '' not in x: 25 | text_area = False 26 | new_revison.append('') 27 | 28 | if text_area: 29 | # 将text里面的<符号替换掉 30 | x = x.replace('','##LOSPR##').replace('','##ROSPR##').replace('<','#') 31 | x = x.replace('##LOSPR##','').replace('##ROSPR##','') 32 | 33 | if '' in x: 34 | text_area = False 35 | 36 | new_revison.append(x) 37 | 38 | try: 39 | xml.sax.parseString('\n'.join(new_revison),handler) 40 | except: 41 | return None 42 | return new_revison 43 | 44 | def fix_extraction(input_dir, input_file, output_dir): 45 | with codecs.open(input_dir + '/' + input_file, 'r', encoding='utf-8') as f: 46 | contents = f.read() 47 | contents = contents.replace("&","&").replace('<','<').replace('>','>').replace('"','"').replace(''','\'') 48 | contents = re.sub(r'<\/text>\s*', '##LOSPR##', contents) 49 | contents = re.sub(r'', '\n\t', contents) 50 | contents = re.sub(r'##LOSPR##', '\n\t', contents) 51 | 52 | # HTML entities 53 | contents = re.sub(r'&', '&', contents) 54 | 55 | # Remove HTML tags if not removed already 56 | tag_pat1 = (r'<\/?(textarea|select|strong|center|option|' 57 | r'input|param|small|style|table|tbody|thead|tfoot|' 58 | r'body|head|html|span|font|form|' 59 | r'div|img|var|pre|sub|sup|var|ref|wiki|' 60 | r'br|dl|dt|dd|em|h[1-6]|hr|li|ol|td|tr|th|ul|a|b|p|q|u)>' 61 | ) 62 | contents = re.sub(tag_pat1, '', contents) 63 | 64 | # remove bad revisions 65 | new_content = [] 66 | revison_content = [] 67 | revison_area = False 68 | 69 | for line in contents.splitlines(): 70 | if'' in line: 71 | # 如果revision有内容,那么肯定哪里出错了,直接丢弃数据 72 | if revison_content: 73 | revison_content = [] 74 | revison_area = True 75 | 76 | if '' in line: 77 | revison_content.append(line) 78 | fixed = fix_revison(revison_content) 79 | if fixed is not None: 80 | new_content.extend(fixed) 81 | revison_content = [] 82 | revison_area = False 83 | continue 84 | 85 | if revison_area: 86 | revison_content.append(line) 87 | else: 88 | new_content.append(line) 89 | with codecs.open(output_dir + '/' + input_file, 'w', encoding='utf-8') as fw: 90 | fw.write('\n'.join(new_content)) 91 | 92 | 93 | if __name__ == '__main__': 94 | arg_parser = argparse.ArgumentParser(description='Script for fixing WikiExtractor.py outputs') 95 | arg_parser.add_argument('input_dir', help='Input dir') 96 | arg_parser.add_argument('input_file', help='Input file') 97 | arg_parser.add_argument('output_dir', help='Output directory') 98 | args = arg_parser.parse_args() 99 | fix_extraction(args.input_dir, args.input_file, args.output_dir) 100 | -------------------------------------------------------------------------------- /reader/utils.py: -------------------------------------------------------------------------------- 1 | # *-* coding: utf-8 *-* 2 | 3 | """Utility functions. 
4 | 5 | """ 6 | # import nltk.data 7 | # from nltk.tokenize.regexp import WhitespaceTokenizer 8 | # from nltk.corpus import PlaintextCorpusReader 9 | import jieba 10 | import numpy as np 11 | import sys 12 | 13 | import re 14 | from typing import List 15 | 16 | def is_chinese_char(cp): 17 | """Checks whether CP is the codepoint of a CJK character.""" 18 | # This defines a "chinese character" as anything in the CJK Unicode block: 19 | # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) 20 | # 21 | # Note that the CJK Unicode block is NOT all Japanese and Korean characters, 22 | # despite its name. The modern Korean Hangul alphabet is a different block, 23 | # as is Japanese Hiragana and Katakana. Those alphabets are used to write 24 | # space-separated words, so they are not treated specially and handled 25 | # like the all of the other languages. 26 | cp = ord(cp) 27 | if ((cp >= 0x4E00 and cp <= 0x9FFF) or # 28 | (cp >= 0x3400 and cp <= 0x4DBF) or # 29 | (cp >= 0x20000 and cp <= 0x2A6DF) or # 30 | (cp >= 0x2A700 and cp <= 0x2B73F) or # 31 | (cp >= 0x2B740 and cp <= 0x2B81F) or # 32 | (cp >= 0x2B820 and cp <= 0x2CEAF) or 33 | (cp >= 0xF900 and cp <= 0xFAFF) or # 34 | (cp >= 0x2F800 and cp <= 0x2FA1F)): # 35 | return True 36 | 37 | return False 38 | 39 | def split_sentence(document: str, flag: str = "all", limit: int = 510) -> List[str]: 40 | """ 41 | Args: 42 | document: 43 | flag: Type:str, "all" 中英文标点分句,"zh" 中文标点分句,"en" 英文标点分句 44 | limit: 默认单句最大长度为510个字符 45 | Returns: Type:list 46 | """ 47 | sent_list = [] 48 | try: 49 | if flag == "zh": 50 | document = re.sub('(?P([。?!…](?![”’"\'])))', r'\g\n', document) # 单字符断句符 51 | document = re.sub('(?P([。?!]|…{1,2})[”’"\'])', r'\g\n', document) # 特殊引号 52 | elif flag == "en": 53 | document = re.sub('(?P([.?!](?![”’"\'])))', r'\g\n', document) # 英文单字符断句符 54 | document = re.sub('(?P([?!.]["\']))', r'\g\n', document) # 特殊引号 55 | else: 56 | document = re.sub('(?P([。?!….?!](?![”’"\'])))', r'\g\n', document) # 单字符断句符 57 | document = re.sub('(?P(([。?!.!?]|…{1,2})[”’"\']))', r'\g\n', 58 | document) # 特殊引号 59 | 60 | sent_list_ori = document.splitlines() 61 | for sent in sent_list_ori: 62 | sent = sent.strip() 63 | if not sent: 64 | continue 65 | else: 66 | while len(sent) > limit: 67 | temp = sent[0:limit] 68 | sent_list.append(temp) 69 | sent = sent[limit:] 70 | sent_list.append(sent) 71 | except: 72 | sent_list.clear() 73 | sent_list.append(document) 74 | return sent_list 75 | 76 | 77 | def to_unicode_or_bust(s, encoding='utf-8'): 78 | """Converts the bytestring in utf-8 to Unicode. 79 | 80 | Credit: Method from 'Unicode in Python, Completely Demystified'. 81 | 82 | Args: 83 | s: Bytestring 84 | encoding: Encoding 85 | 86 | Returns: 87 | Return the Unicode version of the given bytestring 88 | 89 | """ 90 | # if isinstance(s, str): 91 | # if not isinstance(s, unicode): 92 | # s = unicode(s, encoding) 93 | return s 94 | 95 | 96 | def get_sentences_for_text(corpus_root, filename, lang='english'): 97 | """Segments the given text into sentences. 98 | 99 | Args: 100 | corpus_root: Directory in which the text file is residing. 101 | filename: Name of the text file. 102 | lang: Tokenizer language. For possible values, look at: 103 | ${NLTK_DATA}/tokenizers/punkt 104 | 105 | Returns: 106 | Sentences in the given text. 
107 | 108 | """ 109 | sents = [] 110 | for s in split_sentence(open(corpus_root + '/' + filename).read()): 111 | sents.append(jieba.lcut(s)) 112 | return sents 113 | # tokenizer_path = 'tokenizers/punkt/' + lang + '.pickle' 114 | # text = PlaintextCorpusReader(corpus_root, [filename], word_tokenizer=WhitespaceTokenizer(), 115 | # sent_tokenizer=nltk.data.LazyLoader(tokenizer_path)) 116 | # return text.sents() 117 | 118 | def levenshtein_distance(s, t): 119 | """Minimum edit distance between two strings. 120 | 121 | Args: 122 | s: Source string 123 | t: Target string 124 | 125 | Returns: 126 | int: Minimum edit distance between the two input strings. 127 | 128 | """ 129 | m = len(s) 130 | n = len(t) 131 | if m == 0: 132 | return n 133 | if n == 0: 134 | return m 135 | d = np.zeros((m+1, n+1)) 136 | d[:, 0] = np.arange(m+1) 137 | d[0, :] = np.arange(n+1) 138 | for j in range(1, n+1): 139 | for i in range(1, m+1): 140 | if s[i-1] == t[j-1]: 141 | d[i][j] = d[i-1][j-1] 142 | else: 143 | d[i][j] = min(d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1]+1) 144 | return int(d[m][n]) 145 | 146 | 147 | if __name__ == '__main__': 148 | corpus_root = '/net/cluster/TMP/loganathan/wiki_dump/cs/processing/stage3' 149 | file_name = '0000000007.xml' 150 | sentences = get_sentences_for_text(corpus_root, file_name) 151 | # try: 152 | # for s in sentences: 153 | # print s 154 | # print '\n----END----' 155 | # except AssertionError: 156 | # print 'Empty file' 157 | 158 | -------------------------------------------------------------------------------- /reader/wikiextractor/WikiExtractor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # ============================================================================= 5 | # Version: 3.0 (July 22, 2020) 6 | # Author: Giuseppe Attardi (attardi@di.unipi.it), University of Pisa 7 | # 8 | # Contributors: 9 | # Antonio Fuschetto (fuschett@aol.com) 10 | # Leonardo Souza (lsouza@amtera.com.br) 11 | # Juan Manuel Caicedo (juan@cavorite.com) 12 | # Humberto Pereira (begini@gmail.com) 13 | # Siegfried-A. Gevatter (siegfried@gevatter.com) 14 | # Pedro Assis (pedroh2306@gmail.com) 15 | # Wim Muskee (wimmuskee@gmail.com) 16 | # Radics Geza (radicsge@gmail.com) 17 | # Nick Ulven (nulven@github) 18 | # 19 | # ============================================================================= 20 | # Copyright (c) 2009-2020. Giuseppe Attardi (attardi@di.unipi.it). 21 | # ============================================================================= 22 | # This file is part of Tanl. 23 | # 24 | # Tanl is free software; you can redistribute it and/or modify it 25 | # under the terms of the GNU Affero General Public License, version 3, 26 | # as published by the Free Software Foundation. 27 | # 28 | # Tanl is distributed in the hope that it will be useful, 29 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 30 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 31 | # GNU Affero General Public License for more details. 32 | # 33 | # You should have received a copy of the GNU Affero General Public License 34 | # along with this program. If not, see . 35 | # ============================================================================= 36 | 37 | """Wikipedia Extractor: 38 | Extracts and cleans text from a Wikipedia database dump and stores output in a 39 | number of files of similar size in a given directory. 
40 | Each file will contain several documents in the format: 41 | 42 | 43 | ... 44 | 45 | 46 | If the program is invoked with the --json flag, then each file will 47 | contain several documents formatted as json ojects, one per line, with 48 | the following structure 49 | 50 | {"id": "", "revid": "", "url": "", "title": "", "text": "..."} 51 | 52 | The program performs template expansion by preprocesssng the whole dump and 53 | collecting template definitions. 54 | """ 55 | 56 | import argparse 57 | import bz2 58 | import logging 59 | import os.path 60 | import re # TODO use regex when it will be standard 61 | import sys 62 | from io import StringIO 63 | from multiprocessing import Queue, get_context, cpu_count 64 | from timeit import default_timer 65 | 66 | from .extract import Extractor, ignoreTag, define_template, acceptedNamespaces 67 | 68 | # =========================================================================== 69 | 70 | # Program version 71 | __version__ = '3.0.6' 72 | 73 | ## 74 | # Defined in 75 | # We include as default Template, when loading external template file. 76 | knownNamespaces = set(['Template']) 77 | 78 | ## 79 | # The namespace used for template definitions 80 | # It is the name associated with namespace key=10 in the siteinfo header. 81 | templateNamespace = '' 82 | templatePrefix = '' 83 | 84 | ## 85 | # The namespace used for module definitions 86 | # It is the name associated with namespace key=828 in the siteinfo header. 87 | moduleNamespace = '' 88 | 89 | # ---------------------------------------------------------------------- 90 | # Modules 91 | 92 | # Only minimal support 93 | # FIXME: import Lua modules. 94 | 95 | modules = { 96 | 'convert': { 97 | 'convert': lambda x, u, *rest: x + ' ' + u, # no conversion 98 | } 99 | } 100 | # ---------------------------------------------------------------------- 101 | # Expand using WikiMedia API 102 | # import json 103 | 104 | # def expandTemplates(text): 105 | # """Expand templates invoking MediaWiki API""" 106 | # text = urlib.urlencodew(text) 107 | # base = urlbase[:urlbase.rfind('/')] 108 | # url = base + "/w/api.php?action=expandtemplates&format=json&text=" + text 109 | # exp = json.loads(urllib.urlopen(url)) 110 | # return exp['expandtemplates']['*'] 111 | 112 | # ------------------------------------------------------------------------------ 113 | # Output 114 | 115 | 116 | class NextFile(): 117 | 118 | """ 119 | Synchronous generation of next available file name. 120 | """ 121 | 122 | filesPerDir = 100 123 | 124 | def __init__(self, path_name): 125 | self.path_name = path_name 126 | self.dir_index = -1 127 | self.file_index = -1 128 | 129 | def next(self): 130 | self.file_index = (self.file_index + 1) % NextFile.filesPerDir 131 | if self.file_index == 0: 132 | self.dir_index += 1 133 | dirname = self._dirname() 134 | if not os.path.isdir(dirname): 135 | os.makedirs(dirname) 136 | return self._filepath() 137 | 138 | def _dirname(self): 139 | char1 = self.dir_index % 26 140 | char2 = int(self.dir_index / 26) % 26 141 | return os.path.join(self.path_name, '%c%c' % (ord('A') + char2, ord('A') + char1)) 142 | 143 | def _filepath(self): 144 | return '%s/wiki_%02d' % (self._dirname(), self.file_index) 145 | 146 | 147 | class OutputSplitter(): 148 | 149 | """ 150 | File-like object, that splits output to multiple files of a given max size. 151 | """ 152 | 153 | def __init__(self, nextFile, max_file_size=0, compress=True): 154 | """ 155 | :param nextFile: a NextFile object from which to obtain filenames 156 | to use. 
157 | :param max_file_size: the maximum size of each file. 158 | :para compress: whether to write data with bzip compression. 159 | """ 160 | self.nextFile = nextFile 161 | self.compress = compress 162 | self.max_file_size = max_file_size 163 | self.file = self.open(self.nextFile.next()) 164 | 165 | def reserve(self, size): 166 | if self.file.tell() + size > self.max_file_size: 167 | self.close() 168 | self.file = self.open(self.nextFile.next()) 169 | 170 | def write(self, data): 171 | self.reserve(len(data)) 172 | if self.compress: 173 | self.file.write(data) 174 | else: 175 | self.file.write(data) 176 | 177 | def close(self): 178 | self.file.close() 179 | 180 | def open(self, filename): 181 | if self.compress: 182 | return bz2.BZ2File(filename + '.bz2', 'w') 183 | else: 184 | return open(filename, 'w') 185 | 186 | 187 | # ---------------------------------------------------------------------- 188 | # READER 189 | 190 | tagRE = re.compile(r'(.*?)<(/?\w+)[^>]*>(?:([^<]*)(<.*?>)?)?') 191 | # 1 2 3 4 192 | 193 | 194 | def load_templates(file, output_file=None): 195 | """ 196 | Load templates from :param file:. 197 | :param output_file: file where to save templates and modules. 198 | """ 199 | global templateNamespace, templatePrefix 200 | templatePrefix = templateNamespace + ':' 201 | global moduleNamespace, modulePrefix 202 | modulePrefix = moduleNamespace + ':' 203 | articles = 0 204 | templates = 0 205 | page = [] 206 | inText = False 207 | if output_file: 208 | output = open(output_file, 'w') 209 | for line in file: 210 | #line = line.decode('utf-8') 211 | if '<' not in line: # faster than doing re.search() 212 | if inText: 213 | page.append(line) 214 | continue 215 | m = tagRE.search(line) 216 | if not m: 217 | continue 218 | tag = m.group(2) 219 | if tag == 'page': 220 | page = [] 221 | elif tag == 'title': 222 | title = m.group(3) 223 | elif tag == 'text': 224 | inText = True 225 | line = line[m.start(3):m.end(3)] 226 | page.append(line) 227 | if m.lastindex == 4: # open-close 228 | inText = False 229 | elif tag == '/text': 230 | if m.group(1): 231 | page.append(m.group(1)) 232 | inText = False 233 | elif inText: 234 | page.append(line) 235 | elif tag == '/page': 236 | if not output_file and not templateNamespace: # do not know it yet 237 | # we reconstruct it from the first title 238 | colon = title.find(':') 239 | if colon > 1: 240 | templateNamespace = title[:colon] 241 | templatePrefix = title[:colon + 1] 242 | # FIXME: should reconstruct also moduleNamespace 243 | if title.startswith(templatePrefix): 244 | define_template(title, page) 245 | templates += 1 246 | # save templates and modules to file 247 | if output_file and (title.startswith(templatePrefix) or 248 | title.startswith(modulePrefix)): 249 | output.write('\n') 250 | output.write(' %s\n' % title) 251 | output.write(' 10\n') 252 | output.write(' ') 253 | for line in page: 254 | output.write(line) 255 | output.write(' \n') 256 | output.write('\n') 257 | page = [] 258 | articles += 1 259 | if articles % 100000 == 0: 260 | logging.info("Preprocessed %d pages", articles) 261 | if output_file: 262 | output.close() 263 | logging.info("Saved %d templates to '%s'", templates, output_file) 264 | return templates 265 | 266 | 267 | def decode_open(filename, mode='rt', encoding='utf-8'): 268 | """ 269 | Open a file, decode and decompress, depending on extension `gz`, or 'bz2`. 270 | :param filename: the file to open. 
271 | """ 272 | ext = os.path.splitext(filename)[1] 273 | if ext == '.gz': 274 | import gzip 275 | return gzip.open(filename, mode, encoding=encoding) 276 | elif ext == '.bz2': 277 | return bz2.open(filename, mode=mode, encoding=encoding) 278 | else: 279 | return open(filename, mode, encoding=encoding) 280 | 281 | 282 | def process_dump(input_file, template_file, out_file, file_size, file_compress, 283 | process_count, html_safe): 284 | """ 285 | :param input_file: name of the wikipedia dump file; '-' to read from stdin 286 | :param template_file: optional file with template definitions. 287 | :param out_file: directory where to store extracted data, or '-' for stdout 288 | :param file_size: max size of each extracted file, or None for no max (one file) 289 | :param file_compress: whether to compress files with bzip. 290 | :param process_count: number of extraction processes to spawn. 291 | """ 292 | global knownNamespaces 293 | global templateNamespace, templatePrefix 294 | global moduleNamespace, modulePrefix 295 | 296 | urlbase = '' # This is obtained from 297 | 298 | input = decode_open(input_file) 299 | 300 | # collect siteinfo 301 | for line in input: 302 | line = line #.decode('utf-8') 303 | m = tagRE.search(line) 304 | if not m: 305 | continue 306 | tag = m.group(2) 307 | if tag == 'base': 308 | # discover urlbase from the xml dump file 309 | # /mediawiki/siteinfo/base 310 | base = m.group(3) 311 | urlbase = base[:base.rfind("/")] 312 | elif tag == 'namespace': 313 | knownNamespaces.add(m.group(3)) 314 | if re.search('key="10"', line): 315 | templateNamespace = m.group(3) 316 | templatePrefix = templateNamespace + ':' 317 | elif re.search('key="828"', line): 318 | moduleNamespace = m.group(3) 319 | modulePrefix = moduleNamespace + ':' 320 | elif tag == '/siteinfo': 321 | break 322 | 323 | if expand_templates: 324 | # preprocess 325 | template_load_start = default_timer() 326 | if template_file and os.path.exists(template_file): 327 | logging.info("Preprocessing '%s' to collect template definitions: this may take some time.", template_file) 328 | file = decode_open(template_file) 329 | templates = load_templates(file) 330 | file.close() 331 | else: 332 | if input_file == '-': 333 | # can't scan then reset stdin; must error w/ suggestion to specify template_file 334 | raise ValueError("to use templates with stdin dump, must supply explicit template-file") 335 | logging.info("Preprocessing '%s' to collect template definitions: this may take some time.", input_file) 336 | templates = load_templates(input, template_file) 337 | input.close() 338 | input = decode_open(input_file) 339 | template_load_elapsed = default_timer() - template_load_start 340 | logging.info("Loaded %d templates in %.1fs", templates, template_load_elapsed) 341 | 342 | if out_file == '-': 343 | output = sys.stdout 344 | if file_compress: 345 | logging.warn("writing to stdout, so no output compression (use an external tool)") 346 | else: 347 | nextFile = NextFile(out_file) 348 | output = OutputSplitter(nextFile, file_size, file_compress) 349 | 350 | # process pages 351 | logging.info("Starting page extraction from %s.", input_file) 352 | extract_start = default_timer() 353 | 354 | # Parallel Map/Reduce: 355 | # - pages to be processed are dispatched to workers 356 | # - a reduce process collects the results, sort them and print them. 
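    # - each job is a tuple (id, revid, urlbase, title, page, ordinal); workers push
    #   (ordinal, extracted_text) onto output_queue, and reduce_process buffers and
    #   writes results back in ordinal order, so the final output preserves the page
    #   order of the dump even though extraction itself runs in parallel.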
357 | 358 | # fixes MacOS error: TypeError: cannot pickle '_io.TextIOWrapper' object 359 | Process = get_context("fork").Process 360 | 361 | maxsize = 10 * process_count 362 | # output queue 363 | output_queue = Queue(maxsize=maxsize) 364 | 365 | # Reduce job that sorts and prints output 366 | reduce = Process(target=reduce_process, args=(output_queue, output)) 367 | reduce.start() 368 | 369 | # initialize jobs queue 370 | jobs_queue = Queue(maxsize=maxsize) 371 | 372 | # start worker processes 373 | logging.info("Using %d extract processes.", process_count) 374 | workers = [] 375 | for _ in range(max(1, process_count)): 376 | extractor = Process(target=extract_process, 377 | args=(jobs_queue, output_queue, html_safe)) 378 | extractor.daemon = True # only live while parent process lives 379 | extractor.start() 380 | workers.append(extractor) 381 | 382 | # Mapper process 383 | 384 | # we collect individual lines, since str.join() is significantly faster 385 | # than concatenation 386 | page = [] 387 | id = '' 388 | revid = '' 389 | last_id = '' 390 | ordinal = 0 # page count 391 | inText = False 392 | redirect = False 393 | for line in input: 394 | if '<' not in line: # faster than doing re.search() 395 | if inText: 396 | page.append(line) 397 | continue 398 | m = tagRE.search(line) 399 | if not m: 400 | continue 401 | tag = m.group(2) 402 | if tag == 'page': 403 | page = [] 404 | redirect = False 405 | elif tag == 'id' and not id: 406 | id = m.group(3) 407 | elif tag == 'id' and id: # 408 | revid = m.group(3) 409 | elif tag == 'title': 410 | title = m.group(3) 411 | elif tag == 'redirect': 412 | redirect = True 413 | elif tag == 'text': 414 | inText = True 415 | line = line[m.start(3):m.end(3)] 416 | page.append(line) 417 | if m.lastindex == 4: # open-close 418 | inText = False 419 | elif tag == '/text': 420 | if m.group(1): 421 | page.append(m.group(1)) 422 | inText = False 423 | elif inText: 424 | page.append(line) 425 | elif tag == '/page': 426 | colon = title.find(':') 427 | if (colon < 0 or (title[:colon] in acceptedNamespaces) and id != last_id and 428 | not redirect and not title.startswith(templateNamespace)): 429 | job = (id, revid, urlbase, title, page, ordinal) 430 | jobs_queue.put(job) # goes to any available extract_process 431 | last_id = id 432 | ordinal += 1 433 | id = '' 434 | revid = '' 435 | page = [] 436 | 437 | input.close() 438 | 439 | # signal termination 440 | for _ in workers: 441 | jobs_queue.put(None) 442 | # wait for workers to terminate 443 | for w in workers: 444 | w.join() 445 | 446 | # signal end of work to reduce process 447 | output_queue.put(None) 448 | # wait for it to finish 449 | reduce.join() 450 | 451 | if output != sys.stdout: 452 | output.close() 453 | extract_duration = default_timer() - extract_start 454 | extract_rate = ordinal / extract_duration 455 | logging.info("Finished %d-process extraction of %d articles in %.1fs (%.1f art/s)", 456 | process_count, ordinal, extract_duration, extract_rate) 457 | 458 | 459 | # ---------------------------------------------------------------------- 460 | # Multiprocess support 461 | 462 | 463 | def extract_process(jobs_queue, output_queue, html_safe): 464 | """Pull tuples of raw page content, do CPU/regex-heavy fixup, push finished text 465 | :param jobs_queue: where to get jobs. 466 | :param output_queue: where to queue extracted text for output. 467 | :html_safe: whether to convert entities in text to HTML. 
468 | """ 469 | while True: 470 | job = jobs_queue.get() # job is (id, revid, urlbase, title, page, ordinal) 471 | if job: 472 | out = StringIO() # memory buffer 473 | Extractor(*job[:-1]).extract(out, html_safe) # (id, urlbase, title, page) 474 | text = out.getvalue() 475 | output_queue.put((job[-1], text)) # (ordinal, extracted_text) 476 | out.close() 477 | else: 478 | break 479 | 480 | 481 | def reduce_process(output_queue, output): 482 | """Pull finished article text, write series of files (or stdout) 483 | :param output_queue: text to be output. 484 | :param output: file object where to print. 485 | """ 486 | 487 | interval_start = default_timer() 488 | period = 100000 489 | # FIXME: use a heap 490 | ordering_buffer = {} # collected pages 491 | next_ordinal = 0 # sequence number of pages 492 | while True: 493 | if next_ordinal in ordering_buffer: 494 | output.write(ordering_buffer.pop(next_ordinal)) 495 | next_ordinal += 1 496 | # progress report 497 | if next_ordinal % period == 0: 498 | interval_rate = period / (default_timer() - interval_start) 499 | logging.info("Extracted %d articles (%.1f art/s)", 500 | next_ordinal, interval_rate) 501 | interval_start = default_timer() 502 | else: 503 | # mapper puts None to signal finish 504 | pair = output_queue.get() 505 | if not pair: 506 | break 507 | ordinal, text = pair 508 | ordering_buffer[ordinal] = text 509 | 510 | 511 | # ---------------------------------------------------------------------- 512 | 513 | # Minimum size of output files 514 | minFileSize = 200 * 1024 515 | 516 | 517 | def main(): 518 | global urlbase, acceptedNamespaces 519 | global expand_templates, templateCache 520 | 521 | parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]), 522 | formatter_class=argparse.RawDescriptionHelpFormatter, 523 | description=__doc__) 524 | parser.add_argument("input", 525 | help="XML wiki dump file") 526 | groupO = parser.add_argument_group('Output') 527 | groupO.add_argument("-o", "--output", default="text", 528 | help="directory for extracted files (or '-' for dumping to stdout)") 529 | groupO.add_argument("-b", "--bytes", default="1M", 530 | help="maximum bytes per output file (default %(default)s); 0 means to put a single article per file", 531 | metavar="n[KMG]") 532 | groupO.add_argument("-c", "--compress", action="store_true", 533 | help="compress output files using bzip") 534 | groupO.add_argument("--json", action="store_true", 535 | help="write output in json format instead of the default format") 536 | 537 | groupP = parser.add_argument_group('Processing') 538 | groupP.add_argument("--html", action="store_true", 539 | help="produce HTML output, subsumes --links") 540 | groupP.add_argument("-l", "--links", action="store_true", 541 | help="preserve links") 542 | groupP.add_argument("-ns", "--namespaces", default="", metavar="ns1,ns2", 543 | help="accepted namespaces") 544 | groupP.add_argument("--templates", 545 | help="use or create file containing templates") 546 | groupP.add_argument("--no-templates", action="store_false", 547 | help="Do not expand templates") 548 | groupP.add_argument("--html-safe", default=True, 549 | help="use to produce HTML safe output within ...") 550 | default_process_count = cpu_count() - 1 551 | parser.add_argument("--processes", type=int, default=default_process_count, 552 | help="Number of processes to use (default %(default)s)") 553 | 554 | groupS = parser.add_argument_group('Special') 555 | groupS.add_argument("-q", "--quiet", action="store_true", 556 | help="suppress reporting 
progress info") 557 | groupS.add_argument("--debug", action="store_true", 558 | help="print debug info") 559 | groupS.add_argument("-a", "--article", action="store_true", 560 | help="analyze a file containing a single article (debug option)") 561 | groupS.add_argument("-v", "--version", action="version", 562 | version='%(prog)s ' + __version__, 563 | help="print program version") 564 | 565 | args = parser.parse_args() 566 | 567 | Extractor.keepLinks = args.links 568 | Extractor.HtmlFormatting = args.html 569 | if args.html: 570 | Extractor.keepLinks = True 571 | Extractor.to_json = args.json 572 | 573 | expand_templates = args.no_templates 574 | 575 | try: 576 | power = 'kmg'.find(args.bytes[-1].lower()) + 1 577 | # 0 bytes means put a single article per file. 578 | file_size = 0 if args.bytes == '0' else int(args.bytes[:-1]) * 1024 ** power 579 | if file_size and file_size < minFileSize: 580 | raise ValueError() 581 | except ValueError: 582 | logging.error('Insufficient or invalid size: %s', args.bytes) 583 | return 584 | 585 | if args.namespaces: 586 | acceptedNamespaces = set(args.namespaces.split(',')) 587 | 588 | FORMAT = '%(levelname)s: %(message)s' 589 | logging.basicConfig(format=FORMAT) 590 | 591 | logger = logging.getLogger() 592 | if not args.quiet: 593 | logger.setLevel(logging.INFO) 594 | if args.debug: 595 | logger.setLevel(logging.DEBUG) 596 | 597 | input_file = args.input 598 | 599 | if not Extractor.keepLinks: 600 | ignoreTag('a') 601 | 602 | # sharing cache of parser templates is too slow: 603 | # manager = Manager() 604 | # templateCache = manager.dict() 605 | 606 | if args.article: 607 | if args.templates: 608 | if os.path.exists(args.templates): 609 | with open(args.templates) as file: 610 | load_templates(file) 611 | 612 | with open(input_file) as file: 613 | page = file.read() 614 | ids = re.findall(r'(\d*?)', page) 615 | id = ids[0] if ids else '' 616 | revid = ids[1] if len(ids) > 1 else '' 617 | m = re.search(r'(.*?)', page) 618 | if m: 619 | title = m.group(1) 620 | else: 621 | logging.error('Missing title element') 622 | return 623 | m = re.search(r'(.*?)', page) 624 | if m: 625 | base = m.group(1) 626 | urlbase = base[:base.rfind("/")] 627 | else: 628 | urlbase = '' 629 | Extractor(id, revid, urlbase, title, [page]).extract(sys.stdout) 630 | return 631 | 632 | output_path = args.output 633 | if output_path != '-' and not os.path.isdir(output_path): 634 | try: 635 | os.makedirs(output_path) 636 | except: 637 | logging.error('Could not create: %s', output_path) 638 | return 639 | 640 | process_dump(input_file, args.templates, output_path, file_size, 641 | args.compress, args.processes, args.html_safe) 642 | 643 | 644 | if __name__ == '__main__': 645 | main() 646 | -------------------------------------------------------------------------------- /reader/wikiextractor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xueyouluo/wiki-error-extract/7871514c89bb3e6e2ceb314ac4f5eba94d75bb7a/reader/wikiextractor/__init__.py -------------------------------------------------------------------------------- /reader/wikiextractor/cirrus-extract.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # ============================================================================= 5 | # Version: 1.00 (December 15, 2015) 6 | # Author: Giuseppe Attardi (attardi@di.unipi.it), University of Pisa 7 | # 8 | # 
============================================================================= 9 | # Copyright (c) 2015. Giuseppe Attardi (attardi@di.unipi.it). 10 | # ============================================================================= 11 | # This file is part of Tanl. 12 | # 13 | # Tanl is free software; you can redistribute it and/or modify it 14 | # under the terms of the GNU Affero General Public License, version 3, 15 | # as published by the Free Software Foundation. 16 | # 17 | # Tanl is distributed in the hope that it will be useful, 18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 20 | # GNU Affero General Public License for more details. 21 | # 22 | # You should have received a copy of the GNU Affero General Public License 23 | # along with this program. If not, see . 24 | # ============================================================================= 25 | 26 | """Wikipedia Cirrus Extractor: 27 | Extracts and cleans text from a Wikipedia Cirrus dump and stores output in a 28 | number of files of similar size in a given directory. 29 | Each file will contain several documents in the format: 30 | 31 | 32 | ... 33 | 34 | 35 | """ 36 | 37 | import sys, os.path, time 38 | import re 39 | import json 40 | import argparse 41 | import bz2 42 | import gzip 43 | import logging 44 | 45 | # Program version 46 | version = '3.0' 47 | 48 | urlbase = 'http://it.wikipedia.org/' 49 | 50 | # ---------------------------------------------------------------------- 51 | 52 | class NextFile(object): 53 | """ 54 | Synchronous generation of next available file name. 55 | """ 56 | 57 | filesPerDir = 100 58 | 59 | def __init__(self, path_name): 60 | self.path_name = path_name 61 | self.dir_index = -1 62 | self.file_index = -1 63 | 64 | def next(self): 65 | self.file_index = (self.file_index + 1) % NextFile.filesPerDir 66 | if self.file_index == 0: 67 | self.dir_index += 1 68 | dirname = self._dirname() 69 | if not os.path.isdir(dirname): 70 | os.makedirs(dirname) 71 | return self._filepath() 72 | 73 | def _dirname(self): 74 | char1 = self.dir_index % 26 75 | char2 = int(self.dir_index / 26) % 26 76 | return os.path.join(self.path_name, '%c%c' % (ord('A') + char2, ord('A') + char1)) 77 | 78 | def _filepath(self): 79 | return '%s/wiki_%02d' % (self._dirname(), self.file_index) 80 | 81 | class OutputSplitter(object): 82 | """ 83 | File-like object, that splits output to multiple files of a given max size. 84 | """ 85 | 86 | def __init__(self, nextFile, max_file_size=0, compress=True): 87 | """ 88 | :param nextfile: a NextFile object from which to obtain filenames 89 | to use. 90 | :param max_file_size: the maximum size of each file. 91 | :para compress: whether to write data with bzip compression. 
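        Illustrative usage (editor's sketch, not part of the original source;
        it mirrors how process_dump() below drives this class — 'page' stands
        for an assumed, already-built document string):

            splitter = OutputSplitter(NextFile('extracted'), max_file_size=1024 * 1024,
                                      compress=False)
            splitter.write(page.encode('utf-8'))
            splitter.close()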
92 | """ 93 | self.nextFile = nextFile 94 | self.compress = compress 95 | self.max_file_size = max_file_size 96 | self.file = self.open(self.nextFile.next()) 97 | 98 | def reserve(self, size): 99 | if self.file.tell() + size > self.max_file_size: 100 | self.close() 101 | self.file = self.open(self.nextFile.next()) 102 | 103 | def write(self, data): 104 | self.reserve(len(data)) 105 | self.file.write(data) 106 | 107 | def close(self): 108 | self.file.close() 109 | 110 | def open(self, filename): 111 | if self.compress: 112 | return bz2.BZ2File(filename + '.bz2', 'w') 113 | else: 114 | return open(filename, 'w') 115 | 116 | # ---------------------------------------------------------------------- 117 | 118 | class Extractor(object): 119 | 120 | def extract(self, out): 121 | """ 122 | :param out: output file. 123 | """ 124 | logging.debug("%s\t%s", self.id, self.title) 125 | text = ''.join(self.page) 126 | url = get_url(self.id) 127 | header = '\n' % (self.id, url, self.title, self.language, self.revision) 128 | # Separate header from text with a newline. 129 | header += self.title + '\n\n' 130 | header = header.encode('utf-8') 131 | footer = "\n\n" 132 | out.write(header) 133 | text = clean(self, text) 134 | for line in compact(text): 135 | out.write(line.encode('utf-8')) 136 | out.write('\n') 137 | out.write(footer) 138 | 139 | def process_dump(input_file, out_file, file_size, file_compress): 140 | """ 141 | :param input_file: name of the wikipedia dump file; '-' to read from stdin 142 | :param out_file: directory where to store extracted data, or '-' for stdout 143 | :param file_size: max size of each extracted file, or None for no max (one file) 144 | :param file_compress: whether to compress files with bzip. 145 | """ 146 | 147 | if input_file == '-': 148 | input = sys.stdin 149 | else: 150 | input = gzip.open(input_file) 151 | 152 | if out_file == '-': 153 | output = sys.stdout 154 | if file_compress: 155 | logging.warn("writing to stdout, so no output compression (use external tool)") 156 | else: 157 | nextFile = NextFile(out_file) 158 | output = OutputSplitter(nextFile, file_size, file_compress) 159 | 160 | # process dump 161 | # format 162 | # {"index":{"_type":"page","_id":"3825914"}} 163 | # {"namespace":0,"title":TITLE,"timestamp":"2014-06-29T15:51:09Z","text":TEXT,...} 164 | while True: 165 | line = input.readline() 166 | if not line: 167 | break 168 | index = json.loads(line) 169 | content = json.loads(input.readline()) 170 | type = index['index']['_type'] 171 | id = index['index']['_id'] 172 | language = content['language'] 173 | revision = content['version'] 174 | if type == 'page' and content['namespace'] == 0: 175 | title = content['title'] 176 | text = content['text'] 177 | # drop references: 178 | # ^ The Penguin Dictionary 179 | text = re.sub(r' \^ .*', '', text) 180 | url = urlbase + 'wiki?curid=' + id 181 | header = '\n' % (id, url, title, language, revision) 182 | page = header + title + '\n\n' + text + '\n\n' 183 | output.write(page.encode('utf-8')) 184 | 185 | # ---------------------------------------------------------------------- 186 | 187 | # Minimum size of output files 188 | minFileSize = 200 * 1024 189 | 190 | def main(): 191 | parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]), 192 | formatter_class=argparse.RawDescriptionHelpFormatter, 193 | description=__doc__) 194 | parser.add_argument("input", 195 | help="Cirrus Json wiki dump file") 196 | groupO = parser.add_argument_group('Output') 197 | groupO.add_argument("-o", "--output", 
default="text", 198 | help="directory for extracted files (or '-' for dumping to stdin)") 199 | groupO.add_argument("-b", "--bytes", default="1M", 200 | help="maximum bytes per output file (default %(default)s)", 201 | metavar="n[KMG]") 202 | groupO.add_argument("-c", "--compress", action="store_true", 203 | help="compress output files using bzip") 204 | 205 | groupP = parser.add_argument_group('Processing') 206 | groupP.add_argument("-ns", "--namespaces", default="", metavar="ns1,ns2", 207 | help="accepted namespaces") 208 | 209 | groupS = parser.add_argument_group('Special') 210 | groupS.add_argument("-q", "--quiet", action="store_true", 211 | help="suppress reporting progress info") 212 | groupS.add_argument("-v", "--version", action="version", 213 | version='%(prog)s ' + version, 214 | help="print program version") 215 | 216 | args = parser.parse_args() 217 | 218 | try: 219 | power = 'kmg'.find(args.bytes[-1].lower()) + 1 220 | file_size = int(args.bytes[:-1]) * 1024 ** power 221 | if file_size < minFileSize: 222 | raise ValueError() 223 | except ValueError: 224 | logging.error('Insufficient or invalid size: %s', args.bytes) 225 | return 226 | 227 | FORMAT = '%(levelname)s: %(message)s' 228 | logging.basicConfig(format=FORMAT) 229 | 230 | logger = logging.getLogger() 231 | if not args.quiet: 232 | logger.setLevel(logging.INFO) 233 | 234 | input_file = args.input 235 | 236 | output_path = args.output 237 | if output_path != '-' and not os.path.isdir(output_path): 238 | try: 239 | os.makedirs(output_path) 240 | except: 241 | logging.error('Could not create: %s', output_path) 242 | return 243 | 244 | process_dump(input_file, output_path, file_size, args.compress) 245 | 246 | 247 | if __name__ == '__main__': 248 | main() 249 | -------------------------------------------------------------------------------- /reader/wikiextractor/clean.py: -------------------------------------------------------------------------------- 1 | # ============================================================================= 2 | # Copyright (c) 2020. Giuseppe Attardi (attardi@di.unipi.it). 3 | # ============================================================================= 4 | # This file is part of Tanl. 5 | # 6 | # Tanl is free software; you can redistribute it and/or modify it 7 | # under the terms of the GNU Affero General Public License, version 3, 8 | # as published by the Free Software Foundation. 9 | # 10 | # Tanl is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU Affero General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Affero General Public License 16 | # along with this program. If not, see . 17 | # ============================================================================= 18 | 19 | from wikiextractor.extract import Extractor, ignoreTag, resetIgnoredTags 20 | 21 | 22 | def clean_markup(markup, keep_links=False, ignore_headers=True): 23 | """ 24 | Clean Wikimarkup to produce plaintext. 25 | 26 | :param keep_links: Set to True to keep internal and external links 27 | :param ignore_headers: if set to True, the output list will not contain 28 | headers, only 29 | 30 | Returns a list of paragraphs (unicode strings). 
31 | """ 32 | 33 | if not keep_links: 34 | ignoreTag('a') 35 | 36 | extractor = Extractor(0, '', []) 37 | 38 | # returns a list of strings 39 | paragraphs = extractor.clean_text(markup, 40 | mark_headers=True, 41 | expand_templates=False, 42 | escape_doc=True) 43 | resetIgnoredTags() 44 | 45 | if ignore_headers: 46 | paragraphs = filter(lambda s: not s.startswith('## '), paragraphs) 47 | 48 | return paragraphs 49 | -------------------------------------------------------------------------------- /reader/wikiextractor/extract.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # ============================================================================= 4 | # Copyright (c) 2020. Giuseppe Attardi (attardi@di.unipi.it). 5 | # ============================================================================= 6 | # This file is part of Tanl. 7 | # 8 | # Tanl is free software; you can redistribute it and/or modify it 9 | # under the terms of the GNU Affero General Public License, version 3, 10 | # as published by the Free Software Foundation. 11 | # 12 | # Tanl is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | # GNU Affero General Public License for more details. 16 | # 17 | # You should have received a copy of the GNU Affero General Public License 18 | # along with this program. If not, see . 19 | # ============================================================================= 20 | 21 | import re 22 | import html 23 | import json 24 | from itertools import zip_longest 25 | from urllib.parse import quote as urlencode 26 | from html.entities import name2codepoint 27 | import logging 28 | import time 29 | 30 | # ---------------------------------------------------------------------- 31 | 32 | # match tail after wikilink 33 | tailRE = re.compile('\w+') 34 | syntaxhighlight = re.compile('<syntaxhighlight .*?>(.*?)</syntaxhighlight>', re.DOTALL) 35 | 36 | ## PARAMS #################################################################### 37 | 38 | ## 39 | # Defined in 40 | # We include as default Template, when loading external template file. 41 | knownNamespaces = set(['Template']) 42 | 43 | ## 44 | # Drop these elements from article text 45 | # 46 | discardElements = [ 47 | 'gallery', 'timeline', 'noinclude', 'pre', 48 | 'table', 'tr', 'td', 'th', 'caption', 'div', 49 | 'form', 'input', 'select', 'option', 'textarea', 50 | 'ul', 'li', 'ol', 'dl', 'dt', 'dd', 'menu', 'dir', 51 | 'ref', 'references', 'img', 'imagemap', 'source', 'small' 52 | ] 53 | 54 | ## 55 | # Recognize only these namespaces 56 | # w: Internal links to the Wikipedia 57 | # wiktionary: Wiki dictionary 58 | # wikt: shortcut for Wiktionary 59 | # 60 | acceptedNamespaces = ['w', 'wiktionary', 'wikt'] 61 | 62 | 63 | def get_url(urlbase, uid): 64 | return "%s?curid=%s" % (urlbase, uid) 65 | 66 | 67 | # ====================================================================== 68 | 69 | 70 | def clean(extractor, text, expand_templates=False, html_safe=True): 71 | """ 72 | Transforms wiki markup. If the command line flag --escapedoc is set then the text is also escaped 73 | @see https://www.mediawiki.org/wiki/Help:Formatting 74 | :param extractor: the Extractor t use. 75 | :param text: the text to clean. 76 | :param expand_templates: whether to perform template expansion. 
77 | :param html_safe: whether to convert reserved HTML characters to entities. 78 | @return: the cleaned text. 79 | """ 80 | 81 | if expand_templates: 82 | # expand templates 83 | # See: http://www.mediawiki.org/wiki/Help:Templates 84 | text = extractor.expandTemplates(text) 85 | else: 86 | # Drop transclusions (template, parser functions) 87 | text = dropNested(text, r'{{', r'}}') 88 | 89 | # Drop tables 90 | text = dropNested(text, r'{\|', r'\|}') 91 | 92 | # replace external links 93 | text = replaceExternalLinks(text) 94 | 95 | # replace internal links 96 | text = replaceInternalLinks(text) 97 | 98 | # drop MagicWords behavioral switches 99 | text = magicWordsRE.sub('', text) 100 | 101 | # ############### Process HTML ############### 102 | 103 | # turn into HTML, except for the content of 104 | res = '' 105 | cur = 0 106 | for m in syntaxhighlight.finditer(text): 107 | end = m.end() 108 | res += unescape(text[cur:m.start()]) + m.group(1) 109 | cur = end 110 | text = res + unescape(text[cur:]) 111 | 112 | # Handle bold/italic/quote 113 | if extractor.HtmlFormatting: 114 | text = bold_italic.sub(r'\1', text) 115 | text = bold.sub(r'\1', text) 116 | text = italic.sub(r'\1', text) 117 | else: 118 | text = bold_italic.sub(r'\1', text) 119 | text = bold.sub(r'\1', text) 120 | text = italic_quote.sub(r'"\1"', text) 121 | text = italic.sub(r'"\1"', text) 122 | text = quote_quote.sub(r'"\1"', text) 123 | # residuals of unbalanced quotes 124 | text = text.replace("'''", '').replace("''", '"') 125 | 126 | # Collect spans 127 | 128 | spans = [] 129 | # Drop HTML comments 130 | for m in comment.finditer(text): 131 | spans.append((m.start(), m.end())) 132 | 133 | # Drop self-closing tags 134 | for pattern in selfClosing_tag_patterns: 135 | for m in pattern.finditer(text): 136 | spans.append((m.start(), m.end())) 137 | 138 | # Drop ignored tags 139 | for left, right in ignored_tag_patterns: 140 | for m in left.finditer(text): 141 | spans.append((m.start(), m.end())) 142 | for m in right.finditer(text): 143 | spans.append((m.start(), m.end())) 144 | 145 | # Bulk remove all spans 146 | text = dropSpans(spans, text) 147 | 148 | # Drop discarded elements 149 | for tag in discardElements: 150 | text = dropNested(text, r'<\s*%s\b[^>/]*>' % tag, r'<\s*/\s*%s>' % tag) 151 | 152 | if not extractor.HtmlFormatting: 153 | # Turn into text what is left (&nbsp;) and 154 | text = unescape(text) 155 | 156 | # Expand placeholders 157 | for pattern, placeholder in placeholder_tag_patterns: 158 | index = 1 159 | for match in pattern.finditer(text): 160 | text = text.replace(match.group(), '%s_%d' % (placeholder, index)) 161 | index += 1 162 | 163 | text = text.replace('<<', u'«').replace('>>', u'»') 164 | 165 | ############################################# 166 | 167 | # Cleanup text 168 | text = text.replace('\t', ' ') 169 | text = spaces.sub(' ', text) 170 | text = dots.sub('...', text) 171 | text = re.sub(u' (,:\.\)\]»)', r'\1', text) 172 | text = re.sub(u'(\[\(«) ', r'\1', text) 173 | text = re.sub(r'\n\W+?\n', '\n', text, flags=re.U) # lines with only punctuations 174 | text = text.replace(',,', ',').replace(',.', '.') 175 | if html_safe: 176 | text = html.escape(text, quote=False) 177 | return text 178 | 179 | 180 | # skip level 1, it is page name level 181 | section = re.compile(r'(==+)\s*(.*?)\s*\1') 182 | 183 | listOpen = {'*': '
<ul>', '#': '<ol>', ';': '<dl>', ':': '<dl>'} 184 | listClose = {'*': '</ul>', '#': '</ol>', ';': '</dl>', ':': '</dl>'} 185 | listItem = {'*': '<li>%s</li>', '#': '<li>%s</li>', ';': '<dt>%s</dt>', 186 | ':': '<dd>%s</dd>
    '} 187 | 188 | 189 | def compact(text, mark_headers=False): 190 | """Deal with headers, lists, empty sections, residuals of tables. 191 | :param text: convert to HTML 192 | """ 193 | 194 | page = [] # list of paragraph 195 | headers = {} # Headers for unfilled sections 196 | emptySection = False # empty sections are discarded 197 | listLevel = '' # nesting of lists 198 | 199 | for line in text.split('\n'): 200 | 201 | if not line: 202 | continue 203 | # Handle section titles 204 | m = section.match(line) 205 | if m: 206 | title = m.group(2) 207 | lev = len(m.group(1)) 208 | if Extractor.HtmlFormatting: 209 | page.append("%s" % (lev, title, lev)) 210 | if title and title[-1] not in '!?': 211 | title += '.' 212 | 213 | if mark_headers: 214 | title = "## " + title 215 | 216 | headers[lev] = title 217 | # drop previous headers 218 | headers = { k:v for k,v in headers.items() if k <= lev } 219 | emptySection = True 220 | continue 221 | # Handle page title 222 | if line.startswith('++'): 223 | title = line[2:-2] 224 | if title: 225 | if title[-1] not in '!?': 226 | title += '.' 227 | page.append(title) 228 | # handle indents 229 | elif line[0] == ':': 230 | # page.append(line.lstrip(':*#;')) 231 | continue 232 | # handle lists 233 | elif line[0] in '*#;:': 234 | if Extractor.HtmlFormatting: 235 | i = 0 236 | for c, n in zip_longest(listLevel, line, fillvalue=''): 237 | if not n or n not in '*#;:': 238 | if c: 239 | page.append(listClose[c]) 240 | listLevel = listLevel[:-1] 241 | continue 242 | else: 243 | break 244 | # n != '' 245 | if c != n and (not c or (c not in ';:' and n not in ';:')): 246 | if c: 247 | # close level 248 | page.append(listClose[c]) 249 | listLevel = listLevel[:-1] 250 | listLevel += n 251 | page.append(listOpen[n]) 252 | i += 1 253 | n = line[i - 1] # last list char 254 | line = line[i:].strip() 255 | if line: # FIXME: n is '"' 256 | page.append(listItem[n] % line) 257 | else: 258 | continue 259 | elif len(listLevel): 260 | for c in reversed(listLevel): 261 | page.append(listClose[c]) 262 | listLevel = [] 263 | 264 | # Drop residuals of lists 265 | elif line[0] in '{|' or line[-1] == '}': 266 | continue 267 | # Drop irrelevant lines 268 | elif (line[0] == '(' and line[-1] == ')') or line.strip('.-') == '': 269 | continue 270 | elif len(headers): 271 | if Extractor.keepSections: 272 | items = sorted(headers.items()) 273 | for (i, v) in items: 274 | page.append(v) 275 | headers.clear() 276 | page.append(line) # first line 277 | emptySection = False 278 | elif not emptySection: 279 | page.append(line) 280 | # dangerous 281 | # # Drop preformatted 282 | # elif line[0] == ' ': 283 | # continue 284 | 285 | return page 286 | 287 | 288 | # ---------------------------------------------------------------------- 289 | 290 | def dropNested(text, openDelim, closeDelim): 291 | """ 292 | A matching function for nested expressions, e.g. namespaces and tables. 
293 | """ 294 | openRE = re.compile(openDelim, re.IGNORECASE) 295 | closeRE = re.compile(closeDelim, re.IGNORECASE) 296 | # partition text in separate blocks { } { } 297 | spans = [] # pairs (s, e) for each partition 298 | nest = 0 # nesting level 299 | start = openRE.search(text, 0) 300 | if not start: 301 | return text 302 | end = closeRE.search(text, start.end()) 303 | next = start 304 | while end: 305 | next = openRE.search(text, next.end()) 306 | if not next: # termination 307 | while nest: # close all pending 308 | nest -= 1 309 | end0 = closeRE.search(text, end.end()) 310 | if end0: 311 | end = end0 312 | else: 313 | break 314 | spans.append((start.start(), end.end())) 315 | break 316 | while end.end() < next.start(): 317 | # { } { 318 | if nest: 319 | nest -= 1 320 | # try closing more 321 | last = end.end() 322 | end = closeRE.search(text, end.end()) 323 | if not end: # unbalanced 324 | if spans: 325 | span = (spans[0][0], last) 326 | else: 327 | span = (start.start(), last) 328 | spans = [span] 329 | break 330 | else: 331 | spans.append((start.start(), end.end())) 332 | # advance start, find next close 333 | start = next 334 | end = closeRE.search(text, next.end()) 335 | break # { } 336 | if next != start: 337 | # { { } 338 | nest += 1 339 | # collect text outside partitions 340 | return dropSpans(spans, text) 341 | 342 | 343 | def dropSpans(spans, text): 344 | """ 345 | Drop from text the blocks identified in :param spans:, possibly nested. 346 | """ 347 | spans.sort() 348 | res = '' 349 | offset = 0 350 | for s, e in spans: 351 | if offset <= s: # handle nesting 352 | if offset < s: 353 | res += text[offset:s] 354 | offset = e 355 | res += text[offset:] 356 | return res 357 | 358 | 359 | # ---------------------------------------------------------------------- 360 | # External links 361 | 362 | # from: https://doc.wikimedia.org/mediawiki-core/master/php/DefaultSettings_8php_source.html 363 | 364 | wgUrlProtocols = [ 365 | 'bitcoin:', 'ftp://', 'ftps://', 'geo:', 'git://', 'gopher://', 'http://', 366 | 'https://', 'irc://', 'ircs://', 'magnet:', 'mailto:', 'mms://', 'news:', 367 | 'nntp://', 'redis://', 'sftp://', 'sip:', 'sips:', 'sms:', 'ssh://', 368 | 'svn://', 'tel:', 'telnet://', 'urn:', 'worldwind://', 'xmpp:', '//' 369 | ] 370 | 371 | # from: https://doc.wikimedia.org/mediawiki-core/master/php/Parser_8php_source.html 372 | 373 | # Constants needed for external link processing 374 | # Everything except bracket, space, or control characters 375 | # \p{Zs} is unicode 'separator, space' category. It covers the space 0x20 376 | # as well as U+3000 is IDEOGRAPHIC SPACE for bug 19052 377 | EXT_LINK_URL_CLASS = r'[^][<>"\x00-\x20\x7F\s]' 378 | ExtLinkBracketedRegex = re.compile( 379 | '\[(((?i)' + '|'.join(wgUrlProtocols) + ')' + EXT_LINK_URL_CLASS + r'+)\s*([^\]\x00-\x08\x0a-\x1F]*?)\]', 380 | re.S | re.U) 381 | EXT_IMAGE_REGEX = re.compile( 382 | r"""^(http://|https://)([^][<>"\x00-\x20\x7F\s]+) 383 | /([A-Za-z0-9_.,~%\-+&;#*?!=()@\x80-\xFF]+)\.((?i)gif|png|jpg|jpeg)$""", 384 | re.X | re.S | re.U) 385 | 386 | 387 | def replaceExternalLinks(text): 388 | s = '' 389 | cur = 0 390 | for m in ExtLinkBracketedRegex.finditer(text): 391 | s += text[cur:m.start()] 392 | cur = m.end() 393 | 394 | url = m.group(1) 395 | label = m.group(3) 396 | 397 | # # The characters '<' and '>' (which were escaped by 398 | # # removeHTMLtags()) should not be included in 399 | # # URLs, per RFC 2396. 
400 | # m2 = re.search('&(lt|gt);', url) 401 | # if m2: 402 | # link = url[m2.end():] + ' ' + link 403 | # url = url[0:m2.end()] 404 | 405 | # If the link text is an image URL, replace it with an tag 406 | # This happened by accident in the original parser, but some people used it extensively 407 | m = EXT_IMAGE_REGEX.match(label) 408 | if m: 409 | label = makeExternalImage(label) 410 | 411 | # Use the encoded URL 412 | # This means that users can paste URLs directly into the text 413 | # Funny characters like ö aren't valid in URLs anyway 414 | # This was changed in August 2004 415 | s += makeExternalLink(url, label) # + trail 416 | 417 | return s + text[cur:] 418 | 419 | 420 | def makeExternalLink(url, anchor): 421 | """Function applied to wikiLinks""" 422 | if Extractor.keepLinks: 423 | return '%s' % (urlencode(url), anchor) 424 | else: 425 | return anchor 426 | 427 | 428 | def makeExternalImage(url, alt=''): 429 | if Extractor.keepLinks: 430 | return '%s' % (url, alt) 431 | else: 432 | return alt 433 | 434 | 435 | # ---------------------------------------------------------------------- 436 | # WikiLinks 437 | # See https://www.mediawiki.org/wiki/Help:Links#Internal_links 438 | 439 | # Can be nested [[File:..|..[[..]]..|..]], [[Category:...]], etc. 440 | # Also: [[Help:IPA for Catalan|[andora]]] 441 | 442 | 443 | def replaceInternalLinks(text): 444 | """ 445 | Replaces external links of the form: 446 | [[title |...|label]]trail 447 | 448 | with title concatenated with trail, when present, e.g. 's' for plural. 449 | """ 450 | # call this after removal of external links, so we need not worry about 451 | # triple closing ]]]. 452 | cur = 0 453 | res = '' 454 | for s, e in findBalanced(text, ['[['], [']]']): 455 | m = tailRE.match(text, e) 456 | if m: 457 | trail = m.group(0) 458 | end = m.end() 459 | else: 460 | trail = '' 461 | end = e 462 | inner = text[s + 2:e - 2] 463 | # find first | 464 | pipe = inner.find('|') 465 | if pipe < 0: 466 | title = inner 467 | label = title 468 | else: 469 | title = inner[:pipe].rstrip() 470 | # find last | 471 | curp = pipe + 1 472 | for s1, e1 in findBalanced(inner, ['[['], [']]']): 473 | last = inner.rfind('|', curp, s1) 474 | if last >= 0: 475 | pipe = last # advance 476 | curp = e1 477 | label = inner[pipe + 1:].strip() 478 | res += text[cur:s] + makeInternalLink(title, label) + trail 479 | cur = end 480 | return res + text[cur:] 481 | 482 | 483 | def makeInternalLink(title, label): 484 | colon = title.find(':') 485 | if colon > 0 and title[:colon] not in acceptedNamespaces: 486 | return '' 487 | if colon == 0: 488 | # drop also :File: 489 | colon2 = title.find(':', colon + 1) 490 | if colon2 > 1 and title[colon + 1:colon2] not in acceptedNamespaces: 491 | return '' 492 | if Extractor.keepLinks: 493 | return '%s' % (urlencode(title), label) 494 | else: 495 | return label 496 | 497 | 498 | # ---------------------------------------------------------------------- 499 | # variables 500 | 501 | 502 | class MagicWords(): 503 | 504 | """ 505 | One copy in each Extractor. 
506 | 507 | @see https://doc.wikimedia.org/mediawiki-core/master/php/MagicWord_8php_source.html 508 | """ 509 | names = [ 510 | '!', 511 | 'currentmonth', 512 | 'currentmonth1', 513 | 'currentmonthname', 514 | 'currentmonthnamegen', 515 | 'currentmonthabbrev', 516 | 'currentday', 517 | 'currentday2', 518 | 'currentdayname', 519 | 'currentyear', 520 | 'currenttime', 521 | 'currenthour', 522 | 'localmonth', 523 | 'localmonth1', 524 | 'localmonthname', 525 | 'localmonthnamegen', 526 | 'localmonthabbrev', 527 | 'localday', 528 | 'localday2', 529 | 'localdayname', 530 | 'localyear', 531 | 'localtime', 532 | 'localhour', 533 | 'numberofarticles', 534 | 'numberoffiles', 535 | 'numberofedits', 536 | 'articlepath', 537 | 'pageid', 538 | 'sitename', 539 | 'server', 540 | 'servername', 541 | 'scriptpath', 542 | 'stylepath', 543 | 'pagename', 544 | 'pagenamee', 545 | 'fullpagename', 546 | 'fullpagenamee', 547 | 'namespace', 548 | 'namespacee', 549 | 'namespacenumber', 550 | 'currentweek', 551 | 'currentdow', 552 | 'localweek', 553 | 'localdow', 554 | 'revisionid', 555 | 'revisionday', 556 | 'revisionday2', 557 | 'revisionmonth', 558 | 'revisionmonth1', 559 | 'revisionyear', 560 | 'revisiontimestamp', 561 | 'revisionuser', 562 | 'revisionsize', 563 | 'subpagename', 564 | 'subpagenamee', 565 | 'talkspace', 566 | 'talkspacee', 567 | 'subjectspace', 568 | 'subjectspacee', 569 | 'talkpagename', 570 | 'talkpagenamee', 571 | 'subjectpagename', 572 | 'subjectpagenamee', 573 | 'numberofusers', 574 | 'numberofactiveusers', 575 | 'numberofpages', 576 | 'currentversion', 577 | 'rootpagename', 578 | 'rootpagenamee', 579 | 'basepagename', 580 | 'basepagenamee', 581 | 'currenttimestamp', 582 | 'localtimestamp', 583 | 'directionmark', 584 | 'contentlanguage', 585 | 'numberofadmins', 586 | 'cascadingsources', 587 | ] 588 | 589 | def __init__(self): 590 | self.values = {'!': '|'} 591 | 592 | def __getitem__(self, name): 593 | return self.values.get(name) 594 | 595 | def __setitem__(self, name, value): 596 | self.values[name] = value 597 | 598 | switches = ( 599 | '__NOTOC__', 600 | '__FORCETOC__', 601 | '__TOC__', 602 | '__TOC__', 603 | '__NEWSECTIONLINK__', 604 | '__NONEWSECTIONLINK__', 605 | '__NOGALLERY__', 606 | '__HIDDENCAT__', 607 | '__NOCONTENTCONVERT__', 608 | '__NOCC__', 609 | '__NOTITLECONVERT__', 610 | '__NOTC__', 611 | '__START__', 612 | '__END__', 613 | '__INDEX__', 614 | '__NOINDEX__', 615 | '__STATICREDIRECT__', 616 | '__DISAMBIG__' 617 | ) 618 | 619 | 620 | magicWordsRE = re.compile('|'.join(MagicWords.switches)) 621 | 622 | 623 | # ========================================================================= 624 | # 625 | # MediaWiki Markup Grammar 626 | # https://www.mediawiki.org/wiki/Preprocessor_ABNF 627 | 628 | # xml-char = %x9 / %xA / %xD / %x20-D7FF / %xE000-FFFD / %x10000-10FFFF 629 | # sptab = SP / HTAB 630 | 631 | # ; everything except ">" (%x3E) 632 | # attr-char = %x9 / %xA / %xD / %x20-3D / %x3F-D7FF / %xE000-FFFD / %x10000-10FFFF 633 | 634 | # literal = *xml-char 635 | # title = wikitext-L3 636 | # part-name = wikitext-L3 637 | # part-value = wikitext-L3 638 | # part = ( part-name "=" part-value ) / ( part-value ) 639 | # parts = [ title *( "|" part ) ] 640 | # tplarg = "{{{" parts "}}}" 641 | # template = "{{" parts "}}" 642 | # link = "[[" wikitext-L3 "]]" 643 | 644 | # comment = "" 645 | # unclosed-comment = "', re.DOTALL) 739 | 740 | # Match ignored tags 741 | ignored_tag_patterns = [] 742 | 743 | 744 | def ignoreTag(tag): 745 | left = re.compile(r'<%s\b.*?>' % tag, re.IGNORECASE | 
re.DOTALL) # both and 746 | right = re.compile(r'' % tag, re.IGNORECASE) 747 | ignored_tag_patterns.append((left, right)) 748 | 749 | 750 | def resetIgnoredTags(): 751 | global ignored_tag_patterns 752 | ignored_tag_patterns = [] 753 | 754 | 755 | for tag in ignoredTags: 756 | ignoreTag(tag) 757 | 758 | # Match selfClosing HTML tags 759 | selfClosing_tag_patterns = [ 760 | re.compile(r'<\s*%s\b[^>]*/\s*>' % tag, re.DOTALL | re.IGNORECASE) for tag in selfClosingTags 761 | ] 762 | 763 | # Match HTML placeholder tags 764 | placeholder_tag_patterns = [ 765 | (re.compile(r'<\s*%s(\s*| [^>]+?)>.*?<\s*/\s*%s\s*>' % (tag, tag), re.DOTALL | re.IGNORECASE), 766 | repl) for tag, repl in placeholder_tags.items() 767 | ] 768 | 769 | # Match preformatted lines 770 | preformatted = re.compile(r'^ .*?$') 771 | 772 | # Match external links (space separates second optional parameter) 773 | externalLink = re.compile(r'\[\w+[^ ]*? (.*?)]') 774 | externalLinkNoAnchor = re.compile(r'\[\w+[&\]]*\]') 775 | 776 | # Matches bold/italic 777 | bold_italic = re.compile(r"'''''(.*?)'''''") 778 | bold = re.compile(r"'''(.*?)'''") 779 | italic_quote = re.compile(r"''\"([^\"]*?)\"''") 780 | italic = re.compile(r"''(.*?)''") 781 | quote_quote = re.compile(r'""([^"]*?)""') 782 | 783 | # Matches space 784 | spaces = re.compile(r' {2,}') 785 | 786 | # Matches dots 787 | dots = re.compile(r'\.{4,}') 788 | 789 | # ====================================================================== 790 | 791 | substWords = 'subst:|safesubst:' 792 | 793 | 794 | class Extractor(): 795 | """ 796 | An extraction task on a article. 797 | """ 798 | ## 799 | # Whether to preserve links in output 800 | keepLinks = False 801 | 802 | ## 803 | # Whether to preserve section titles 804 | keepSections = True 805 | 806 | ## 807 | # Whether to output text with HTML formatting elements in files. 808 | HtmlFormatting = False 809 | 810 | ## 811 | # Whether to produce json instead of the default output format. 812 | toJson = False 813 | 814 | def __init__(self,title): 815 | """ 816 | :param page: a list of lines. 817 | """ 818 | # self.id = id 819 | # self.revid = revid 820 | # self.url = get_url(urlbase, id) 821 | self.title = title 822 | # self.page = page 823 | self.magicWords = MagicWords() 824 | self.frame = [] 825 | self.recursion_exceeded_1_errs = 0 # template recursion within expandTemplates() 826 | self.recursion_exceeded_2_errs = 0 # template recursion within expandTemplate() 827 | self.recursion_exceeded_3_errs = 0 # parameter recursion 828 | self.template_title_errs = 0 829 | 830 | def clean_text(self, text, mark_headers=False, expand_templates=False, 831 | html_safe=True): 832 | """ 833 | :param mark_headers: True to distinguish headers from paragraphs 834 | e.g. "## Section 1" 835 | """ 836 | self.magicWords['pagename'] = self.title 837 | self.magicWords['fullpagename'] = self.title 838 | self.magicWords['currentyear'] = time.strftime('%Y') 839 | self.magicWords['currentmonth'] = time.strftime('%m') 840 | self.magicWords['currentday'] = time.strftime('%d') 841 | self.magicWords['currenthour'] = time.strftime('%H') 842 | self.magicWords['currenttime'] = time.strftime('%H:%M:%S') 843 | 844 | text = clean(self, text, expand_templates=expand_templates, 845 | html_safe=html_safe) 846 | 847 | text = compact(text, mark_headers=mark_headers) 848 | return text 849 | 850 | def extract(self, page, html_safe=False): 851 | """ 852 | :param out: a memory file. 853 | :param html_safe: whether to escape HTML entities. 
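        Illustrative usage (editor's sketch, not part of the original source;
        note this modified extract() takes the raw page text and returns a
        string instead of writing to an output file — 'raw_wikitext' is an
        assumed variable holding the article markup):

            extractor = Extractor('Some article title')
            plain_text = extractor.extract(raw_wikitext, html_safe=False)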
854 | """ 855 | # logging.debug("%s\t%s", self.id, self.title) 856 | text = self.clean_text(page, html_safe=html_safe) 857 | return '\n'.join(text) 858 | 859 | 860 | # header = '\n' % (self.id, self.url, self.title) 861 | # # Separate header from text with a newline. 862 | # header += self.title + '\n\n' 863 | # footer = "\n\n" 864 | # out.write(header) 865 | # out.write('\n'.join(text)) 866 | # out.write('\n') 867 | # out.write(footer) 868 | 869 | # errs = (self.template_title_errs, 870 | # self.recursion_exceeded_1_errs, 871 | # self.recursion_exceeded_2_errs, 872 | # self.recursion_exceeded_3_errs) 873 | # if any(errs): 874 | # logging.warn("Template errors in article '%s' (%s): title(%d) recursion(%d, %d, %d)", 875 | # self.title, self.id, *errs) 876 | 877 | # ---------------------------------------------------------------------- 878 | # Expand templates 879 | 880 | maxTemplateRecursionLevels = 30 881 | maxParameterRecursionLevels = 10 882 | 883 | # check for template beginning 884 | reOpen = re.compile('(?= self.maxTemplateRecursionLevels: 910 | self.recursion_exceeded_1_errs += 1 911 | return res 912 | 913 | # logging.debug(' %d %s', len(self.frame), res) 923 | return res 924 | 925 | def templateParams(self, parameters): 926 | """ 927 | Build a dictionary with positional or name key to expanded parameters. 928 | :param parameters: the parts[1:] of a template, i.e. all except the title. 929 | """ 930 | templateParams = {} 931 | 932 | if not parameters: 933 | return templateParams 934 | logging.debug(' 963 | # Parameters may span several lines, like: 964 | # {{Reflist|colwidth=30em|refs= 965 | # <ref name="Goode">Title</ref> 966 | 967 | # The '=' might occurr within an HTML attribute: 968 | # "<ref name=value" 969 | # but we stop at first. 970 | m = re.match(' *([^=]*?) *=(.*)', param, re.DOTALL) 971 | if m: 972 | # This is a named parameter. This case also handles parameter 973 | # assignments like "2=xxx", where the number of an unnamed 974 | # parameter ("2") is specified explicitly - this is handled 975 | # transparently. 976 | 977 | parameterName = m.group(1).strip() 978 | parameterValue = m.group(2) 979 | 980 | if ']]' not in parameterValue: # if the value does not contain a link, trim whitespace 981 | parameterValue = parameterValue.strip() 982 | templateParams[parameterName] = parameterValue 983 | else: 984 | # this is an unnamed parameter 985 | unnamedParameterCounter += 1 986 | 987 | if ']]' not in param: # if the value does not contain a link, trim whitespace 988 | param = param.strip() 989 | templateParams[str(unnamedParameterCounter)] = param 990 | logging.debug(' templateParams> %s', '|'.join(templateParams.values())) 991 | return templateParams 992 | 993 | def expandTemplate(self, body): 994 | """Expands template invocation. 995 | :param body: the parts of a template. 996 | 997 | :see http://meta.wikimedia.org/wiki/Help:Expansion for an explanation 998 | of the process. 999 | 1000 | See in particular: Expansion of names and values 1001 | http://meta.wikimedia.org/wiki/Help:Expansion#Expansion_of_names_and_values 1002 | 1003 | For most parser functions all names and values are expanded, 1004 | regardless of what is relevant for the result. The branching functions 1005 | (#if, #ifeq, #iferror, #ifexist, #ifexpr, #switch) are exceptions. 
1006 | 1007 | All names in a template call are expanded, and the titles of the 1008 | tplargs in the template body, after which it is determined which 1009 | values must be expanded, and for which tplargs in the template body 1010 | the first part (default). 1011 | 1012 | In the case of a tplarg, any parts beyond the first are never 1013 | expanded. The possible name and the value of the first part is 1014 | expanded if the title does not match a name in the template call. 1015 | 1016 | :see code for braceSubstitution at 1017 | https://doc.wikimedia.org/mediawiki-core/master/php/html/Parser_8php_source.html#3397: 1018 | 1019 | """ 1020 | 1021 | # template = "{{" parts "}}" 1022 | 1023 | # Templates and tplargs are decomposed in the same way, with pipes as 1024 | # separator, even though eventually any parts in a tplarg after the first 1025 | # (the parameter default) are ignored, and an equals sign in the first 1026 | # part is treated as plain text. 1027 | # Pipes inside inner templates and tplargs, or inside double rectangular 1028 | # brackets within the template or tplargs are not taken into account in 1029 | # this decomposition. 1030 | # The first part is called title, the other parts are simply called parts. 1031 | 1032 | # If a part has one or more equals signs in it, the first equals sign 1033 | # determines the division into name = value. Equals signs inside inner 1034 | # templates and tplargs, or inside double rectangular brackets within the 1035 | # part are not taken into account in this decomposition. Parts without 1036 | # equals sign are indexed 1, 2, .., given as attribute in the tag. 1037 | 1038 | if len(self.frame) >= self.maxTemplateRecursionLevels: 1039 | self.recursion_exceeded_2_errs += 1 1040 | # logging.debug(' INVOCATION> %d %s', len(self.frame), body) 1041 | return '' 1042 | 1043 | logging.debug('INVOCATION %d %s', len(self.frame), body) 1044 | 1045 | parts = splitParts(body) 1046 | # title is the portion before the first | 1047 | logging.debug('TITLE %s', parts[0].strip()) 1048 | title = self.expandTemplates(parts[0].strip()) 1049 | 1050 | # SUBST 1051 | # Apply the template tag to parameters without 1052 | # substituting into them, e.g. 1053 | # {{subst:t|a{{{p|q}}}b}} gives the wikitext start-a{{{p|q}}}b-end 1054 | # @see https://www.mediawiki.org/wiki/Manual:Substitution#Partial_substitution 1055 | subst = False 1056 | if re.match(substWords, title, re.IGNORECASE): 1057 | title = re.sub(substWords, '', title, 1, re.IGNORECASE) 1058 | subst = True 1059 | 1060 | if title.lower() in self.magicWords.values: 1061 | return self.magicWords[title.lower()] 1062 | 1063 | # Parser functions 1064 | # The first argument is everything after the first colon. 1065 | # It has been evaluated above. 
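# Editor's illustration (not in the original source): for a body such as
# '#ifeq: yes | yes | equal | different', splitParts() and title expansion
# leave title == '#ifeq: yes', so the code below computes
#   colon == 5, funct == '#ifeq', parts[0] == 'yes'
# and callParserFunction('#ifeq', parts, self.frame) returns 'equal'.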
1066 | colon = title.find(':') 1067 | if colon > 1: 1068 | funct = title[:colon] 1069 | parts[0] = title[colon + 1:].strip() # side-effect (parts[0] not used later) 1070 | # arguments after first are not evaluated 1071 | ret = callParserFunction(funct, parts, self.frame) 1072 | return self.expandTemplates(ret) 1073 | 1074 | title = fullyQualifiedTemplateTitle(title) 1075 | if not title: 1076 | self.template_title_errs += 1 1077 | return '' 1078 | 1079 | redirected = redirects.get(title) 1080 | if redirected: 1081 | title = redirected 1082 | 1083 | # get the template 1084 | if title in templateCache: 1085 | template = templateCache[title] 1086 | elif title in templates: 1087 | template = Template.parse(templates[title]) 1088 | # add it to cache 1089 | templateCache[title] = template 1090 | del templates[title] 1091 | else: 1092 | # The page being included could not be identified 1093 | return '' 1094 | 1095 | # logging.debug('TEMPLATE %s: %s', title, template) 1096 | 1097 | # tplarg = "{{{" parts "}}}" 1098 | # parts = [ title *( "|" part ) ] 1099 | # part = ( part-name "=" part-value ) / ( part-value ) 1100 | # part-name = wikitext-L3 1101 | # part-value = wikitext-L3 1102 | # wikitext-L3 = literal / template / tplarg / link / comment / 1103 | # line-eating-comment / unclosed-comment / 1104 | # xmlish-element / *wikitext-L3 1105 | 1106 | # A tplarg may contain other parameters as well as templates, e.g.: 1107 | # {{{text|{{{quote|{{{1|{{error|Error: No text given}}}}}}}}}}} 1108 | # hence no simple RE like this would work: 1109 | # '{{{((?:(?!{{{).)*?)}}}' 1110 | # We must use full CF parsing. 1111 | 1112 | # the parameter name itself might be computed, e.g.: 1113 | # {{{appointe{{#if:{{{appointer14|}}}|r|d}}14|}}} 1114 | 1115 | # Because of the multiple uses of double-brace and triple-brace 1116 | # syntax, expressions can sometimes be ambiguous. 1117 | # Precedence rules specifed here: 1118 | # http://www.mediawiki.org/wiki/Preprocessor_ABNF#Ideal_precedence 1119 | # resolve ambiguities like this: 1120 | # {{{{ }}}} -> { {{{ }}} } 1121 | # {{{{{ }}}}} -> {{ {{{ }}} }} 1122 | # 1123 | # :see: https://en.wikipedia.org/wiki/Help:Template#Handling_parameters 1124 | 1125 | params = parts[1:] 1126 | 1127 | if not subst: 1128 | # Evaluate parameters, since they may contain templates, including 1129 | # the symbol "=". 1130 | # {{#ifexpr: {{{1}}} = 1 }} 1131 | params = [self.expandTemplates(p) for p in params] 1132 | 1133 | # build a dict of name-values for the parameter values 1134 | params = self.templateParams(params) 1135 | 1136 | # Perform parameter substitution 1137 | # extend frame before subst, since there may be recursion in default 1138 | # parameter value, e.g. {{OTRS|celebrative|date=April 2015}} in article 1139 | # 21637542 in enwiki. 1140 | self.frame.append((title, params)) 1141 | instantiated = template.subst(params, self) 1142 | # logging.debug('instantiated %d %s', len(self.frame), instantiated) 1143 | value = self.expandTemplates(instantiated) 1144 | self.frame.pop() 1145 | # logging.debug(' INVOCATION> %s %d %s', title, len(self.frame), value) 1146 | return value 1147 | 1148 | 1149 | # ---------------------------------------------------------------------- 1150 | # parameter handling 1151 | 1152 | 1153 | def splitParts(paramsList): 1154 | """ 1155 | :param paramsList: the parts of a template or tplarg. 1156 | 1157 | Split template parameters at the separator "|". 1158 | separator "=". 
1159 | 1160 | Template parameters often contain URLs, internal links, text or even 1161 | template expressions, since we evaluate templates outside in. 1162 | This is required for cases like: 1163 | {{#if: {{{1}}} | {{lc:{{{1}}} | "parameter missing"}} 1164 | Parameters are separated by "|" symbols. However, we 1165 | cannot simply split the string on "|" symbols, since these 1166 | also appear inside templates and internal links, e.g. 1167 | 1168 | {{if:| 1169 | |{{#if:the president| 1170 | |{{#if:| 1171 | [[Category:Hatnote templates|A{{PAGENAME}}]] 1172 | }} 1173 | }} 1174 | }} 1175 | 1176 | We split parts at the "|" symbols that are not inside any pair 1177 | {{{...}}}, {{...}}, [[...]], {|...|}. 1178 | """ 1179 | 1180 | # Must consider '[' as normal in expansion of Template:EMedicine2: 1181 | # #ifeq: ped|article|[http://emedicine.medscape.com/article/180-overview|[http://www.emedicine.com/ped/topic180.htm#{{#if: |section~}} 1182 | # as part of: 1183 | # {{#ifeq: ped|article|[http://emedicine.medscape.com/article/180-overview|[http://www.emedicine.com/ped/topic180.htm#{{#if: |section~}}}} ped/180{{#if: |~}}] 1184 | 1185 | # should handle both tpl arg like: 1186 | # 4|{{{{{subst|}}}CURRENTYEAR}} 1187 | # and tpl parameters like: 1188 | # ||[[Category:People|{{#if:A|A|{{PAGENAME}}}}]] 1189 | 1190 | sep = '|' 1191 | parameters = [] 1192 | cur = 0 1193 | for s, e in findMatchingBraces(paramsList): 1194 | par = paramsList[cur:s].split(sep) 1195 | if par: 1196 | if parameters: 1197 | # portion before | belongs to previous parameter 1198 | parameters[-1] += par[0] 1199 | if len(par) > 1: 1200 | # rest are new parameters 1201 | parameters.extend(par[1:]) 1202 | else: 1203 | parameters = par 1204 | elif not parameters: 1205 | parameters = [''] # create first param 1206 | # add span to last previous parameter 1207 | parameters[-1] += paramsList[s:e] 1208 | cur = e 1209 | # leftover 1210 | par = paramsList[cur:].split(sep) 1211 | if par: 1212 | if parameters: 1213 | # portion before | belongs to previous parameter 1214 | parameters[-1] += par[0] 1215 | if len(par) > 1: 1216 | # rest are new parameters 1217 | parameters.extend(par[1:]) 1218 | else: 1219 | parameters = par 1220 | 1221 | # logging.debug('splitParts %s %s\nparams: %s', sep, paramsList, str(parameters)) 1222 | return parameters 1223 | 1224 | 1225 | def findMatchingBraces(text, ldelim=0): 1226 | """ 1227 | :param ldelim: number of braces to match. 0 means match [[]], {{}} and {{{}}}. 1228 | """ 1229 | # Parsing is done with respect to pairs of double braces {{..}} delimiting 1230 | # a template, and pairs of triple braces {{{..}}} delimiting a tplarg. 1231 | # If double opening braces are followed by triple closing braces or 1232 | # conversely, this is taken as delimiting a template, with one left-over 1233 | # brace outside it, taken as plain text. For any pattern of braces this 1234 | # defines a set of templates and tplargs such that any two are either 1235 | # separate or nested (not overlapping). 1236 | 1237 | # Unmatched double rectangular closing brackets can be in a template or 1238 | # tplarg, but unmatched double rectangular opening brackets cannot. 1239 | # Unmatched double or triple closing braces inside a pair of 1240 | # double rectangular brackets are treated as plain text. 
1241 | # Other formulation: in ambiguity between template or tplarg on one hand, 1242 | # and a link on the other hand, the structure with the rightmost opening 1243 | # takes precedence, even if this is the opening of a link without any 1244 | # closing, so not producing an actual link. 1245 | 1246 | # In the case of more than three opening braces the last three are assumed 1247 | # to belong to a tplarg, unless there is no matching triple of closing 1248 | # braces, in which case the last two opening braces are are assumed to 1249 | # belong to a template. 1250 | 1251 | # We must skip individual { like in: 1252 | # {{#ifeq: {{padleft:|1|}} | { | |  }} 1253 | # We must resolve ambiguities like this: 1254 | # {{{{ }}}} -> { {{{ }}} } 1255 | # {{{{{ }}}}} -> {{ {{{ }}} }} 1256 | # {{#if:{{{{{#if:{{{nominee|}}}|nominee|candidate}}|}}}|...}} 1257 | 1258 | # Handle: 1259 | # {{{{{|safesubst:}}}#Invoke:String|replace|{{{1|{{{{{|safesubst:}}}PAGENAME}}}}}|%s+%([^%(]-%)$||plain=false}} 1260 | # as well as expressions with stray }: 1261 | # {{{link|{{ucfirst:{{{1}}}}}} interchange}}} 1262 | 1263 | if ldelim: # 2-3 1264 | reOpen = re.compile('[{]{%d,}' % ldelim) # at least ldelim 1265 | reNext = re.compile('[{]{2,}|}{2,}') # at least 2 1266 | else: 1267 | reOpen = re.compile('{{2,}|\[{2,}') 1268 | reNext = re.compile('{{2,}|}{2,}|\[{2,}|]{2,}') # at least 2 1269 | 1270 | cur = 0 1271 | while True: 1272 | m1 = reOpen.search(text, cur) 1273 | if not m1: 1274 | return 1275 | lmatch = m1.end() - m1.start() 1276 | if m1.group()[0] == '{': 1277 | stack = [lmatch] # stack of opening braces lengths 1278 | else: 1279 | stack = [-lmatch] # negative means [ 1280 | end = m1.end() 1281 | while True: 1282 | m2 = reNext.search(text, end) 1283 | if not m2: 1284 | return # unbalanced 1285 | end = m2.end() 1286 | brac = m2.group()[0] 1287 | lmatch = m2.end() - m2.start() 1288 | 1289 | if brac == '{': 1290 | stack.append(lmatch) 1291 | elif brac == '}': 1292 | while stack: 1293 | openCount = stack.pop() # opening span 1294 | if openCount == 0: # illegal unmatched [[ 1295 | continue 1296 | if lmatch >= openCount: 1297 | lmatch -= openCount 1298 | if lmatch <= 1: # either close or stray } 1299 | break 1300 | else: 1301 | # put back unmatched 1302 | stack.append(openCount - lmatch) 1303 | break 1304 | if not stack: 1305 | yield m1.start(), end - lmatch 1306 | cur = end 1307 | break 1308 | elif len(stack) == 1 and 0 < stack[0] < ldelim: 1309 | # ambiguous {{{{{ }}} }} 1310 | yield m1.start() + stack[0], end 1311 | cur = end 1312 | break 1313 | elif brac == '[': # [[ 1314 | stack.append(-lmatch) 1315 | else: # ]] 1316 | while stack and stack[-1] < 0: # matching [[ 1317 | openCount = -stack.pop() 1318 | if lmatch >= openCount: 1319 | lmatch -= openCount 1320 | if lmatch <= 1: # either close or stray ] 1321 | break 1322 | else: 1323 | # put back unmatched (negative) 1324 | stack.append(lmatch - openCount) 1325 | break 1326 | if not stack: 1327 | yield m1.start(), end - lmatch 1328 | cur = end 1329 | break 1330 | # unmatched ]] are discarded 1331 | cur = end 1332 | 1333 | 1334 | def findBalanced(text, openDelim, closeDelim): 1335 | """ 1336 | Assuming that text contains a properly balanced expression using 1337 | :param openDelim: as opening delimiters and 1338 | :param closeDelim: as closing delimiters. 1339 | :return: an iterator producing pairs (start, end) of start and end 1340 | positions in text containing a balanced expression. 
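    Example (editor's illustration, not in the original source):

        >>> list(findBalanced('[[a|[[b]]c]] tail', ['[['], [']]']))
        [(0, 12)]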
1341 | """ 1342 | openPat = '|'.join([re.escape(x) for x in openDelim]) 1343 | # patter for delimiters expected after each opening delimiter 1344 | afterPat = {o: re.compile(openPat + '|' + c, re.DOTALL) for o, c in zip(openDelim, closeDelim)} 1345 | stack = [] 1346 | start = 0 1347 | cur = 0 1348 | # end = len(text) 1349 | startSet = False 1350 | startPat = re.compile(openPat) 1351 | nextPat = startPat 1352 | while True: 1353 | next = nextPat.search(text, cur) 1354 | if not next: 1355 | return 1356 | if not startSet: 1357 | start = next.start() 1358 | startSet = True 1359 | delim = next.group(0) 1360 | if delim in openDelim: 1361 | stack.append(delim) 1362 | nextPat = afterPat[delim] 1363 | else: 1364 | opening = stack.pop() 1365 | # assert opening == openDelim[closeDelim.index(next.group(0))] 1366 | if stack: 1367 | nextPat = afterPat[stack[-1]] 1368 | else: 1369 | yield start, next.end() 1370 | nextPat = startPat 1371 | start = next.end() 1372 | startSet = False 1373 | cur = next.end() 1374 | 1375 | # ---------------------------------------------------------------------- 1376 | # parser functions utilities 1377 | 1378 | 1379 | def ucfirst(string): 1380 | """:return: a string with just its first character uppercase 1381 | We can't use title() since it coverts all words. 1382 | """ 1383 | if string: 1384 | if len(string) > 1: 1385 | return string[0].upper() + string[1:] 1386 | else: 1387 | return string.upper() 1388 | else: 1389 | return '' 1390 | 1391 | 1392 | def lcfirst(string): 1393 | """:return: a string with its first character lowercase""" 1394 | if string: 1395 | if len(string) > 1: 1396 | return string[0].lower() + string[1:] 1397 | else: 1398 | return string.lower() 1399 | else: 1400 | return '' 1401 | 1402 | 1403 | def fullyQualifiedTemplateTitle(templateTitle): 1404 | """ 1405 | Determine the namespace of the page being included through the template 1406 | mechanism 1407 | """ 1408 | if templateTitle.startswith(':'): 1409 | # Leading colon by itself implies main namespace, so strip this colon 1410 | return ucfirst(templateTitle[1:]) 1411 | else: 1412 | m = re.match('([^:]*)(:.*)', templateTitle) 1413 | if m: 1414 | # colon found but not in the first position - check if it 1415 | # designates a known namespace 1416 | prefix = normalizeNamespace(m.group(1)) 1417 | if prefix in knownNamespaces: 1418 | return prefix + ucfirst(m.group(2)) 1419 | # The title of the page being included is NOT in the main namespace and 1420 | # lacks any other explicit designation of the namespace - therefore, it 1421 | # is resolved to the Template namespace (that's the default for the 1422 | # template inclusion mechanism). 1423 | 1424 | # This is a defense against pages whose title only contains UTF-8 chars 1425 | # that are reduced to an empty string. Right now I can think of one such 1426 | # case - which represents the non-breaking space. 1427 | # In this particular case, this page is a redirect to [[Non-nreaking 1428 | # space]], but having in the system a redirect page with an empty title 1429 | # causes numerous problems, so we'll live happier without it. 
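# Editor's illustration (not in the original source), assuming templatePrefix
# is 'Template:' (it is derived elsewhere from the dump's namespace names):
#   fullyQualifiedTemplateTitle(':Main Page')       -> 'Main Page'
#   fullyQualifiedTemplateTitle('Template:Infobox') -> 'Template:Infobox'
#   fullyQualifiedTemplateTitle('citation needed')  -> 'Template:Citation needed'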
1430 | if templateTitle: 1431 | return templatePrefix + ucfirst(templateTitle) 1432 | else: 1433 | return '' # caller may log as error 1434 | 1435 | 1436 | def normalizeNamespace(ns): 1437 | return ucfirst(ns) 1438 | 1439 | 1440 | # ---------------------------------------------------------------------- 1441 | # Parser functions 1442 | # see http://www.mediawiki.org/wiki/Help:Extension:ParserFunctions 1443 | # https://github.com/Wikia/app/blob/dev/extensions/ParserFunctions/ParserFunctions_body.php 1444 | 1445 | 1446 | class Infix(): 1447 | 1448 | """Infix operators. 1449 | The calling sequence for the infix is: 1450 | x |op| y 1451 | """ 1452 | 1453 | def __init__(self, function): 1454 | self.function = function 1455 | 1456 | def __ror__(self, other): 1457 | return Infix(lambda x, self=self, other=other: self.function(other, x)) 1458 | 1459 | def __or__(self, other): 1460 | return self.function(other) 1461 | 1462 | def __rlshift__(self, other): 1463 | return Infix(lambda x, self=self, other=other: self.function(other, x)) 1464 | 1465 | def __rshift__(self, other): 1466 | return self.function(other) 1467 | 1468 | def __call__(self, value1, value2): 1469 | return self.function(value1, value2) 1470 | 1471 | 1472 | ROUND = Infix(lambda x, y: round(x, y)) 1473 | 1474 | 1475 | def sharp_expr(expr): 1476 | try: 1477 | expr = re.sub('=', '==', expr) 1478 | expr = re.sub('mod', '%', expr) 1479 | expr = re.sub('\bdiv\b', '/', expr) 1480 | expr = re.sub('\bround\b', '|ROUND|', expr) 1481 | return unicode(eval(expr)) 1482 | except: 1483 | return '' 1484 | 1485 | 1486 | def sharp_if(testValue, valueIfTrue, valueIfFalse=None, *args): 1487 | # In theory, we should evaluate the first argument here, 1488 | # but it was evaluated while evaluating part[0] in expandTemplate(). 1489 | if testValue.strip(): 1490 | # The {{#if:}} function is an if-then-else construct. 1491 | # The applied condition is: "The condition string is non-empty". 1492 | valueIfTrue = valueIfTrue.strip() 1493 | if valueIfTrue: 1494 | return valueIfTrue 1495 | elif valueIfFalse: 1496 | return valueIfFalse.strip() 1497 | return "" 1498 | 1499 | 1500 | def sharp_ifeq(lvalue, rvalue, valueIfTrue, valueIfFalse=None, *args): 1501 | rvalue = rvalue.strip() 1502 | if rvalue: 1503 | # lvalue is always defined 1504 | if lvalue.strip() == rvalue: 1505 | # The {{#ifeq:}} function is an if-then-else construct. The 1506 | # applied condition is "is rvalue equal to lvalue". Note that this 1507 | # does only string comparison while MediaWiki implementation also 1508 | # supports numerical comparissons. 
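# Editor's illustration (not in the original source) of the string-only
# comparison described above:
#   sharp_ifeq('yes', 'yes', 'equal', 'different')  -> 'equal'
#   sharp_ifeq('01', '1', 'equal', 'different')     -> 'different'
#   (MediaWiki itself would also treat 01 and 1 as numerically equal)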
1509 | 1510 | if valueIfTrue: 1511 | return valueIfTrue.strip() 1512 | else: 1513 | if valueIfFalse: 1514 | return valueIfFalse.strip() 1515 | return "" 1516 | 1517 | 1518 | def sharp_iferror(test, then='', Else=None, *args): 1519 | if re.match('<(?:strong|span|p|div)\s(?:[^\s>]*\s+)*?class="(?:[^"\s>]*\s+)*?error(?:\s[^">]*)?"', test): 1520 | return then 1521 | elif Else is None: 1522 | return test.strip() 1523 | else: 1524 | return Else.strip() 1525 | 1526 | 1527 | def sharp_switch(primary, *params): 1528 | # FIXME: we don't support numeric expressions in primary 1529 | 1530 | # {{#switch: comparison string 1531 | # | case1 = result1 1532 | # | case2 1533 | # | case4 = result2 1534 | # | 1 | case5 = result3 1535 | # | #default = result4 1536 | # }} 1537 | 1538 | primary = primary.strip() 1539 | found = False # for fall through cases 1540 | default = None 1541 | rvalue = None 1542 | lvalue = '' 1543 | for param in params: 1544 | # handle cases like: 1545 | # #default = [http://www.perseus.tufts.edu/hopper/text?doc=Perseus...] 1546 | pair = param.split('=', 1) 1547 | lvalue = pair[0].strip() 1548 | rvalue = None 1549 | if len(pair) > 1: 1550 | # got "=" 1551 | rvalue = pair[1].strip() 1552 | # check for any of multiple values pipe separated 1553 | if found or primary in [v.strip() for v in lvalue.split('|')]: 1554 | # Found a match, return now 1555 | return rvalue 1556 | elif lvalue == '#default': 1557 | default = rvalue 1558 | rvalue = None # avoid defaulting to last case 1559 | elif lvalue == primary: 1560 | # If the value matches, set a flag and continue 1561 | found = True 1562 | # Default case 1563 | # Check if the last item had no = sign, thus specifying the default case 1564 | if rvalue is not None: 1565 | return lvalue 1566 | elif default is not None: 1567 | return default 1568 | return '' 1569 | 1570 | 1571 | # Extension Scribuntu 1572 | def sharp_invoke(module, function, frame): 1573 | functions = modules.get(module) 1574 | if functions: 1575 | funct = functions.get(function) 1576 | if funct: 1577 | # find parameters in frame whose title is the one of the original 1578 | # template invocation 1579 | templateTitle = fullyQualifiedTemplateTitle(function) 1580 | if not templateTitle: 1581 | logging.warn("Template with empty title") 1582 | pair = next((x for x in frame if x[0] == templateTitle), None) 1583 | if pair: 1584 | params = pair[1] 1585 | # extract positional args 1586 | params = [params.get(str(i + 1)) for i in range(len(params))] 1587 | return funct(*params) 1588 | else: 1589 | return funct() 1590 | return '' 1591 | 1592 | 1593 | parserFunctions = { 1594 | 1595 | '#expr': sharp_expr, 1596 | 1597 | '#if': sharp_if, 1598 | 1599 | '#ifeq': sharp_ifeq, 1600 | 1601 | '#iferror': sharp_iferror, 1602 | 1603 | '#ifexpr': lambda *args: '', # not supported 1604 | 1605 | '#ifexist': lambda *args: '', # not supported 1606 | 1607 | '#rel2abs': lambda *args: '', # not supported 1608 | 1609 | '#switch': sharp_switch, 1610 | 1611 | '# language': lambda *args: '', # not supported 1612 | 1613 | '#time': lambda *args: '', # not supported 1614 | 1615 | '#timel': lambda *args: '', # not supported 1616 | 1617 | '#titleparts': lambda *args: '', # not supported 1618 | 1619 | # This function is used in some pages to construct links 1620 | # http://meta.wikimedia.org/wiki/Help:URL 1621 | 'urlencode': lambda string, *rest: urlencode(string), 1622 | 1623 | 'lc': lambda string, *rest: string.lower() if string else '', 1624 | 1625 | 'lcfirst': lambda string, *rest: lcfirst(string), 1626 | 1627 | 'uc': 
lambda string, *rest: string.upper() if string else '', 1628 | 1629 | 'ucfirst': lambda string, *rest: ucfirst(string), 1630 | 1631 | 'int': lambda string, *rest: str(int(string)), 1632 | 1633 | } 1634 | 1635 | 1636 | def callParserFunction(functionName, args, frame): 1637 | """ 1638 | Parser functions have similar syntax as templates, except that 1639 | the first argument is everything after the first colon. 1640 | :return: the result of the invocation, None in case of failure. 1641 | 1642 | http://meta.wikimedia.org/wiki/Help:ParserFunctions 1643 | """ 1644 | 1645 | try: 1646 | if functionName == '#invoke': 1647 | # special handling of frame 1648 | ret = sharp_invoke(args[0].strip(), args[1].strip(), frame) 1649 | # logging.debug('parserFunction> %s %s', functionName, ret) 1650 | return ret 1651 | if functionName in parserFunctions: 1652 | ret = parserFunctions[functionName](*args) 1653 | # logging.debug('parserFunction> %s %s', functionName, ret) 1654 | return ret 1655 | except: 1656 | return "" # FIXME: fix errors 1657 | 1658 | return "" 1659 | 1660 | 1661 | # ---------------------------------------------------------------------- 1662 | # Extract Template definition 1663 | 1664 | reNoinclude = re.compile(r'<noinclude>(?:.*?)</noinclude>', re.DOTALL) 1665 | reIncludeonly = re.compile(r'<includeonly>|</includeonly>', re.DOTALL) 1666 | 1667 | # These are built before spawning processes, hence they are shared. 1668 | templates = {} 1669 | redirects = {} 1670 | # cache of parser templates 1671 | # FIXME: sharing this with a Manager slows down. 1672 | templateCache = {} 1673 | 1674 | 1675 | def define_template(title, page): 1676 | """ 1677 | Adds a template defined in the :param page:. 1678 | @see https://en.wikipedia.org/wiki/Help:Template#Noinclude.2C_includeonly.2C_and_onlyinclude 1679 | """ 1680 | global templates 1681 | global redirects 1682 | 1683 | # title = normalizeTitle(title) 1684 | 1685 | # check for redirects 1686 | m = re.match('#REDIRECT.*?\[\[([^\]]*)]]', page[0], re.IGNORECASE) 1687 | if m: 1688 | redirects[title] = m.group(1) # normalizeTitle(m.group(1)) 1689 | return 1690 | 1691 | text = unescape(''.join(page)) 1692 | 1693 | # We're storing template text for future inclusion, therefore, 1694 | # remove all <noinclude> text and keep all <includeonly> text 1695 | # (but eliminate <includeonly> tags per se). 1696 | # However, if <onlyinclude> ... </onlyinclude> parts are present, 1697 | # then only keep them and discard the rest of the template body. 1698 | # This is because using <onlyinclude> on a text fragment is 1699 | # equivalent to enclosing it in <includeonly> tags **AND** 1700 | # enclosing all the rest of the template body in <noinclude> tags.
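    # Illustrative example (added comment, not in the upstream source): for a template body of 'A<noinclude>B</noinclude><includeonly>C</includeonly>', the text stored for transclusion is 'AC': B is dropped together with its <noinclude> wrapper, while C is kept and only its <includeonly> tags are removed. If the body contains '<onlyinclude>D</onlyinclude>', only 'D' is stored and the rest of the body is discarded.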
1701 | 1702 | # remove comments 1703 | text = comment.sub('', text) 1704 | 1705 | # eliminate <noinclude> fragments 1706 | text = reNoinclude.sub('', text) 1707 | # eliminate unterminated <noinclude> elements 1708 | text = re.sub(r'<noinclude\s*>.*$', '', text, flags=re.DOTALL) 1709 | text = re.sub(r'<noinclude/>', '', text) 1710 | 1711 | onlyincludeAccumulator = '' 1712 | for m in re.finditer('<onlyinclude>(.*?)</onlyinclude>', text, re.DOTALL): 1713 | onlyincludeAccumulator += m.group(1) 1714 | if onlyincludeAccumulator: 1715 | text = onlyincludeAccumulator 1716 | else: 1717 | text = reIncludeonly.sub('', text) 1718 | 1719 | if text: 1720 | if title in templates: 1721 | logging.warn('Redefining: %s', title) 1722 | templates[title] = text 1723 | -------------------------------------------------------------------------------- /reader/wikiextractor/extractPage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # ============================================================================= 5 | # Version: 3.0 (July 22, 2020) 6 | # Author: Giuseppe Attardi (attardi@di.unipi.it), University of Pisa 7 | 8 | # ============================================================================= 9 | # Copyright (c) 2009. Giuseppe Attardi (attardi@di.unipi.it). 10 | # ============================================================================= 11 | # This file is part of Tanl. 12 | # 13 | # Tanl is free software; you can redistribute it and/or modify it 14 | # under the terms of the GNU Affero General Public License, version 3, 15 | # as published by the Free Software Foundation. 16 | # 17 | # Tanl is distributed in the hope that it will be useful, 18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 20 | # GNU Affero General Public License for more details. 21 | # 22 | # You should have received a copy of the GNU Affero General Public License 23 | # along with this program. If not, see <http://www.gnu.org/licenses/>. 24 | # ============================================================================= 25 | 26 | """Wikipedia Page Extractor: 27 | Extracts a single page from a Wikipedia dump file. 28 | """ 29 | 30 | import sys, os.path 31 | import re 32 | import argparse 33 | import bz2 34 | 35 | 36 | # Program version 37 | __version__ = '3.0.5' 38 | 39 | # ---------------------------------------------------------------------- 40 | # READER 41 | 42 | tagRE = re.compile(r'(.*?)<(/?\w+)[^>]*>(?:([^<]*)(<.*?>)?)?') 43 | #tagRE = re.compile(r'(.*?)<(/?\w+)[^>]*>([^<]*)') 44 | # 1 2 3 45 | 46 | def process_data(input_file, id, templates=False): 47 | """ 48 | :param input_file: name of the wikipedia dump file.
49 | :param id: article id 50 | """ 51 | 52 | if input_file.lower().endswith(".bz2"): 53 | input = bz2.open(input_file, mode='rt', encoding='utf-8') 54 | else: 55 | input = open(input_file) 56 | 57 | page = [] 58 | for line in input: 59 | line = line 60 | if '<' not in line: # faster than doing re.search() 61 | if page: 62 | page.append(line) 63 | continue 64 | m = tagRE.search(line) 65 | if not m: 66 | continue 67 | tag = m.group(2) 68 | if tag == 'page': 69 | page = [] 70 | page.append(line) 71 | inArticle = False 72 | elif tag == 'id': 73 | curid = m.group(3) 74 | if id == curid: 75 | page.append(line) 76 | inArticle = True 77 | elif not inArticle and not templates: 78 | page = [] 79 | elif tag == 'title': 80 | if templates: 81 | if m.group(3).startswith('Template:'): 82 | page.append(line) 83 | else: 84 | page = [] 85 | else: 86 | page.append(line) 87 | elif tag == '/page': 88 | if page: 89 | page.append(line) 90 | print(''.join(page)) 91 | if not templates: 92 | break 93 | page = [] 94 | elif page: 95 | page.append(line) 96 | 97 | input.close() 98 | 99 | def main(): 100 | parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]), 101 | formatter_class=argparse.RawDescriptionHelpFormatter, 102 | description=__doc__) 103 | parser.add_argument("input", 104 | help="XML wiki dump file") 105 | parser.add_argument("--id", default="1", 106 | help="article id") 107 | parser.add_argument("--template", action="store_true", 108 | help="extract Template pages as well") 109 | parser.add_argument("-v", "--version", action="version", 110 | version='%(prog)s ' + __version__, 111 | help="print program version") 112 | 113 | args = parser.parse_args() 114 | 115 | process_data(args.input, args.id, args.template) 116 | 117 | if __name__ == '__main__': 118 | main() 119 | -------------------------------------------------------------------------------- /run/pipeline.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import time 3 | import os 4 | import logging 5 | import copy 6 | from glob import glob 7 | 8 | import threading 9 | from multiprocessing import Process 10 | import multiprocessing as mp 11 | 12 | def run_bash(args,cmd): 13 | cmd = cmd.format(**args) 14 | print('cmd:',cmd) 15 | os.system(cmd) 16 | 17 | def check_output_dir(output_dir): 18 | if not os.path.exists(output_dir): 19 | os.mkdir(output_dir) 20 | 21 | POOL_SIZE = 9 22 | MAX_EDIT = 0.1 23 | MIN_FILESIZE = 5 # KB 24 | 25 | global_args = { 26 | "code_root" : '/nfs/users/xueyou/github/wiki-error-corpus', 27 | # "xml_dump": '/nfs/users/xueyou/data/speller/wiki/zhwiki-20211201-pages-meta-history1.xml-p2981p11534', 28 | "input_dir" : '/data/xueyou/data/speller/wiki/', 29 | "output_dir" : '/data/xueyou/data/speller/wiki/', 30 | 'max_edit': MAX_EDIT 31 | } 32 | 33 | for xml_dump_file in list(glob('/data/xueyou/data/speller/wiki/*.7z')): 34 | global_args['xml_dump'] = xml_dump_file.replace('.7z','') 35 | 36 | # Stage 1 37 | # Extract 7z file 38 | print(f'extract {xml_dump_file}') 39 | cmd = f'7z e {xml_dump_file}' 40 | run_bash({},cmd) 41 | 42 | # Stage 2 43 | # Divide the large XML revision dump file into per page revisions.
44 | print('divide XML file') 45 | cmd = 'python {code_root}/reader/divide_xml_revisions.py {xml_dump} {output_dir}' 46 | args = copy.deepcopy(global_args) 47 | args['output_dir'] = args['output_dir'] + 'stage1' 48 | check_output_dir(args['output_dir']) 49 | run_bash(args,cmd) 50 | 51 | 52 | # Stage 3 53 | # Extract revisions from page history 54 | cmd = 'python {code_root}/reader/extract_revisions_new.py {input_dir} {input_file} {output_dir}' 55 | input_dir = global_args['input_dir'] + 'stage1' 56 | output_dir = global_args['output_dir'] + 'stage3' 57 | check_output_dir(output_dir) 58 | 59 | pool = mp.Pool(processes = POOL_SIZE) 60 | for fname in glob(input_dir + '/*.xml'): 61 | fsize = os.path.getsize(fname) / 1024 # KB 62 | if fsize < MIN_FILESIZE: 63 | # print(f'small size, skip {fname}') 64 | continue 65 | args = copy.deepcopy(global_args) 66 | args['input_dir'] = input_dir 67 | args['output_dir'] = output_dir 68 | args['input_file'] = os.path.basename(fname) 69 | pool.apply_async(run_bash,(args, cmd)) 70 | pool.close() 71 | pool.join() 72 | 73 | # Stage 4 74 | # Extract errors with edit distance 75 | cmd = 'python {code_root}/reader/extract_spelling_errors_new.py {input_dir} {input_file} {output_dir} zh {max_edit}' 76 | input_dir = global_args['input_dir'] + 'stage3' 77 | output_dir = global_args['output_dir'] + 'stage4' 78 | check_output_dir(output_dir) 79 | 80 | pool = mp.Pool(processes = POOL_SIZE) 81 | for fname in glob(input_dir + '/*.xml'): 82 | basename = os.path.basename(fname) 83 | args = copy.deepcopy(global_args) 84 | args['input_dir'] = input_dir 85 | args['output_dir'] = output_dir 86 | args['input_file'] = basename 87 | pool.apply_async(run_bash,(args, cmd)) 88 | pool.close() 89 | pool.join() 90 | 91 | # Stage 5 92 | # Collect all the errors 93 | input_dir = global_args['input_dir'] + 'stage4' 94 | output_dir = global_args['input_dir'] + 'stage5' 95 | check_output_dir(output_dir) 96 | with open(output_dir + '/error_sent.txt','a') as ef,open(output_dir + '/ori_sent.txt','a') as of: 97 | for fname in glob(input_dir + '/*.xml_error_sen.txt'): 98 | basename = os.path.basename(fname).split('.')[0] 99 | # err_f.write(open(input_dir + '/' + basename + '.xml_spelling_error.txt').read()) 100 | ef.write(open(input_dir + '/' + basename + '.xml_error_sen.txt').read()) 101 | of.write(open(input_dir + '/' + basename + '.xml_orig_sen.txt').read()) 102 | 103 | # Stage 6 104 | # Clean up temporary files 105 | print('clean up tmp files') 106 | check_output_dir('./extracted') 107 | cmd = f'''rm {global_args['xml_dump']} 108 | rm -rf stage1 stage3 stage4 109 | mv {xml_dump_file} extracted 110 | ''' 111 | run_bash({},cmd) 112 | print('all done') 113 | 114 | 115 | 116 | 117 | --------------------------------------------------------------------------------
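
Since the final stage only concatenates the per-page outputs, stage5 ends up with two plain-text files, error_sent.txt and ori_sent.txt, written in the same order by the collection loop above. The sketch below shows one way to turn them into parallel records; it assumes the two files are line-aligned, and the function name `pair_stage5`, the output filename, and the `src`/`tgt` key names are illustrative rather than part of this repo (which of the two files holds the erroneous side depends on how extract_spelling_errors_new.py writes them).

```python
# Minimal sketch (assumption: error_sent.txt and ori_sent.txt are line-aligned).
import json

def pair_stage5(error_path='stage5/error_sent.txt',
                orig_path='stage5/ori_sent.txt',
                out_path='stage5/parallel.jsonl'):
    # Zip the two files line by line and keep only non-empty, differing pairs.
    with open(error_path, encoding='utf-8') as ef, \
         open(orig_path, encoding='utf-8') as of, \
         open(out_path, 'w', encoding='utf-8') as out:
        for err, ori in zip(ef, of):
            err, ori = err.strip(), ori.strip()
            if err and ori and err != ori:
                out.write(json.dumps({'src': err, 'tgt': ori}, ensure_ascii=False) + '\n')

if __name__ == '__main__':
    pair_stage5()
```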