├── README.md
├── natudump.py
└── tabulate.py
/README.md:
--------------------------------------------------------------------------------
1 | This is an example of scraping naturalisation decrees from the public Légifrance registry, for research purposes only (`naturalisation par mariage` is not included in these decrees). The code is released under the MIT license.
2 |
3 | ```shell
4 | pip install selenium charset_normalizer
5 |
6 | mkdir -p jo
7 | # python3 natudump.py -o jo --years $(seq 2000 2021) --output-directory-prefix "$(wslpath -a -w "$PWD")\\" # for WSL systems, must be on a NTFS drive
8 | python3 natudump.py -o jo --years $(seq 2000 2021) --output-directory-prefix "$PWD/"
9 | ls jo | wc -l
10 |
11 | mkdir -p txtjo
12 | # https://github.com/pdfminer/pdfminer.six/issues/809
13 | git clone --branch 20220524 --depth 1 https://github.com/pdfminer/pdfminer.six
14 | PYTHONPATH="pdfminer.six:pdfminer.six/tools:$PYTHONPATH" find jo -name '*.pdf' -exec python3 -m pdf2txt {} -o txt{}.txt \;
15 | ls txtjo | wc -l
16 |
17 | python3 tabulate.py -i txtjo -o natufrance_2000_2021.tsv
18 | grep 'Russie\|URSS\|U.R.S.S' natufrance_2000_2021.tsv | wc -l
19 |
20 | mkdir -p catjo
21 | git clone --branch v0.4 --depth 1 https://github.com/pmaupin/pdfrw
22 | rm $(PYTHONPATH="$PWD/pdfrw:$PYTHONPATH" find jo/ -type f -not -exec python3 -c 'import sys, pdfrw; pdfrw.PdfReader(sys.argv[1])' {} \; -print)
23 | for years in $(seq 2000 2021); do PYTHONPATH="$PWD/pdfrw:$PYTHONPATH" python3 pdfrw/examples/cat.py jo/JORF_${years}*; mv cat.JORF_${years}*.pdf catjo; done
24 | ls catjo | wc -l
25 |
26 | mkdir -p tarjo
27 | for years in $(seq 2000 2021); do tar -cf tarjo/jo${years}.tar jo/*_${years}*; done
28 | ls tarjo | wc -l
29 | ```
30 |
--------------------------------------------------------------------------------
/natudump.py:
--------------------------------------------------------------------------------
1 | # https://chromedriver.chromium.org/downloads
2 | # https://www.smashingmagazine.com/2021/12/headers-https-requests-ui-automation-testing/
3 | # https://pypi.org/project/selenium-wire/
4 |
5 | import os
6 | import time
7 | import argparse
8 | import urllib.request
9 | import html.parser
10 |
11 | import selenium.webdriver
12 | import selenium.webdriver.support.ui
13 | import selenium.webdriver.support.expected_conditions
14 |
class JoCaptcha(html.parser.HTMLParser):
    """Collect and solve the arithmetic captcha embedded in an HTML fragment.

    Feed the fragment with ``feed()``. The parser records, in ``self.captcha``,
    a simplified transcript of the ``<div class="lf-captcha-sum">`` element:
    the opening ``<div>``, its text, any ``<input>`` (the blank to fill in)
    and the closing ``</div>``. ``solve()`` then turns that transcript into
    ``a+b=c`` with exactly one ``?`` and returns the missing number.
    """

    # Marks the end of the captured region inside ``self.captcha``.
    _END = '</div>'

    # Ordered substitutions applied by ``solve()``. Whitespace and the
    # recorded tags go first; compound numerals ('dix-sept') must precede
    # their components ('dix', 'sept') so they are not split apart.
    # NOTE(review): the whitespace and tag entries were garbled in the copy
    # this was reconstructed from -- confirm against the upstream file.
    REPLACEMENTS = {
        ' ': '', '\n': '', '\r': '', '\t': '',
        '<div>': '', '</div>': '', '<input>': '?',
        'onze': 11, 'douze': 12, 'treize': 13, 'quatorze': 14, 'quinze': 15,
        'seize': 16, 'dix-sept': 17, 'dix-huit': 18, 'dix-neuf': 19, 'vingt': 20,
        'un': 1, 'deux': 2, 'trois': 3, 'quatre': 4, 'cinq': 5,
        'six': 6, 'sept': 7, 'huit': 8, 'neuf': 9, 'dix': 10,
    }

    def __init__(self):
        super().__init__()
        self.captcha = ''

    def _capturing(self):
        """Return True between the opening sum ``<div>`` and its ``</div>``."""
        return '<' in self.captcha and self._END not in self.captcha

    def handle_starttag(self, tag, attrs):
        is_sum_div = tag == 'div' and any(k == 'class' and v == 'lf-captcha-sum' for k, v in attrs)
        if is_sum_div or (tag == 'input' and self._capturing()):
            self.captcha += f'<{tag}>'

    def handle_data(self, data):
        if self._capturing():
            self.captcha += data

    def handle_endtag(self, tag):
        # Only a </div> seen while capturing closes the region; a stray
        # </div> before the sum starts must not block the capture.
        if tag == 'div' and self._capturing():
            self.captcha += f'</{tag}>'

    def solve(self, replace = None):
        """Return the integer that replaces the blank in the captured sum.

        Args:
            replace: optional ordered mapping of substring -> replacement
                applied to the transcript; defaults to ``REPLACEMENTS``.

        Returns:
            The missing operand or result of ``a + b = c``.

        Raises:
            ValueError: if the normalised transcript is not made solely of
                digits, ``+``, ``=`` and ``?``, or has no blank to solve.
        """
        table = self.REPLACEMENTS if replace is None else replace

        expr = self.captcha
        for needle, value in table.items():
            expr = expr.replace(needle, str(value))

        if '=' not in expr or not all(ch in '?+=0123456789' for ch in expr):
            raise ValueError(f'Unrecognised captcha: {expr!r}')

        lhs, total = expr.split('=')
        first, second = lhs.split('+')
        if first == '?':
            return int(total) - int(second)
        if second == '?':
            return int(total) - int(first)
        if total != '?':
            raise ValueError(f'Captcha has no blank to solve: {expr!r}')
        return int(first) + int(second)
47 |
if __name__ == '__main__':
    # Command-line driver: for every requested year, walk the paginated
    # Journal Officiel listing, solve the arithmetic captcha guarding each
    # matching link and let Chrome download the PDF into the output directory.

    # Imported explicitly because Service is referenced by its dotted path below.
    import selenium.webdriver.chrome.service

    parser = argparse.ArgumentParser()
    parser.add_argument('--jo-search', default = 'https://www.legifrance.gouv.fr/jorf/jo/{year}?page={page}&pageSize=100')
    parser.add_argument('--jo-download', default= 'https://www.legifrance.gouv.fr/download/secure/file/{token}')
    parser.add_argument('--years', default = [2016, 2017, 2018, 2019, 2020, 2021], type = int, nargs = '+')
    parser.add_argument('--output-directory', '-o', default = 'jo')
    parser.add_argument('--output-directory-prefix', default = '')
    parser.add_argument('--chromedriver', default = '/usr/bin/chromedriver', help="for WSLv1, this path should symlink to chromedriver.exe extracted from a zipball chromedriver_win32.zip downloaded from https://chromedriver.chromium.org/downloads, don' t forget to update it when you upgrade the host Chrome")
    parser.add_argument('--timeout', type = float, default = 10.0)
    parser.add_argument('--timeout-big', type = float, default = 30.0)
    parser.add_argument('--headmore', action = 'store_true')
    parser.add_argument('--debug', default = 'debug.html')

    args = parser.parse_args()
    print(args)

    os.makedirs(args.output_directory, exist_ok = True)

    # Make Chrome save PDFs straight to disk instead of opening or prompting.
    chrome_prefs = {
        'download.default_directory': args.output_directory_prefix + args.output_directory,
        'download.prompt_for_download': False,
        'download.directory_upgrade': True,
        'plugins.always_open_pdf_externally': True,
    }
    chrome_options = selenium.webdriver.ChromeOptions()
    chrome_options.add_experimental_option('prefs', chrome_prefs)
    if not args.headmore:
        chrome_options.add_argument('--headless')
    chrome_service = selenium.webdriver.chrome.service.Service(executable_path = args.chromedriver)

    #driver.request_interceptor = driver.response_interceptor = (lambda request, response: print(request.url, request.headers, response.headers))

    expected_conditions = selenium.webdriver.support.expected_conditions

    def find_artefacts(joid, temp):
        """List files in the output directory that belong to *joid*.

        A file matches when its name contains either 'joe_' followed by the
        id's characters rearranged as [-4:] + [2:4] + [:2], or the id with
        trailing 'p'/'d'/'f' characters stripped. ``temp`` selects
        in-progress downloads ('.crdownload' in the name) versus others.
        """
        # NOTE: rstrip('pdf') strips a character set, not the '.pdf' suffix.
        prefixes = ['joe_' + joid[-4:] + joid[2:4] + joid[:2], joid.rstrip('pdf')]
        return [fname for fname in os.listdir(args.output_directory) for prefix in prefixes if prefix in fname and ('.crdownload' in fname) == temp]

    for year in args.years:
        page = 1
        while True:
            driver = None
            failed = False
            jolinks = []
            page_source = ''
            try:
                # A fresh browser per listing page; it is always closed in `finally`.
                driver = selenium.webdriver.Chrome(options = chrome_options, service = chrome_service)
                wait = selenium.webdriver.support.ui.WebDriverWait(driver, args.timeout)

                url = args.jo_search.format(year = year, page = page)

                driver.get(url)
                page_source = driver.page_source
                wait.until(expected_conditions.url_to_be(url))

                jolinks = driver.find_elements('partial link text' , 'nominatives') + driver.find_elements('partial link text', 'version papier numérisée')

                print('Page', page, 'found', len(jolinks), 'links', url)

                for i, jolink in enumerate(jolinks):
                    joid = jolink.get_attribute('data-textid')
                    if not joid:
                        # Without an id the file cannot be matched on disk; retrying
                        # the page would fail the same way forever, so skip the link.
                        print('Page', page, 'Skipping link without data-textid', i, '/', len(jolinks))
                        continue

                    res_files = find_artefacts(joid, temp = False)
                    if res_files:
                        print('Page', page, 'Skipping', joid)
                        continue

                    # Leftovers of an interrupted download would confuse the wait loop below.
                    for fname in find_artefacts(joid, temp = True):
                        print('Temp file exists, deleting', joid, fname)
                        os.remove(os.path.join(args.output_directory, fname))

                    print('Page', page, 'Processing', joid, i, '/', len(jolinks))

                    jolink.click()
                    wait.until(expected_conditions.presence_of_element_located(('css selector', '.lf-captcha-line')))

                    captcha = driver.find_element('css selector', '.lf-captcha-line')

                    jo_captcha_parser = JoCaptcha()
                    jo_captcha_parser.feed(captcha.get_attribute('innerHTML'))
                    captcha_solution = jo_captcha_parser.solve()

                    captcha.find_element('css selector', '.lf-captcha-input').send_keys(str(captcha_solution))
                    captcha.find_element('css selector', '.captcha-submit').click()

                    wait.until(expected_conditions.presence_of_element_located(('css selector', '.secure-content a')))
                    token = driver.find_element('css selector', '.secure-content a').get_attribute('href').split('token=')[-1]

                    url = args.jo_download.format(token = token)
                    driver.get(url)

                    # Poll until Chrome has renamed the '.crdownload' temp file.
                    time.sleep(args.timeout)
                    while find_artefacts(joid, temp = True):
                        print('Download in progress, temp file exists, sleeping')
                        time.sleep(args.timeout)
                    if not find_artefacts(joid, temp = False):
                        # Raised (not asserted) so the check survives `python -O`;
                        # the handler below retries the page.
                        raise RuntimeError('Must have final downloaded file')
                    print('Page', page, 'OK', joid, 'PDF:', url)

            except Exception as e:
                # Best effort: report, back off, then retry the same page.
                print(e)
                print('Page', page, 'big timeout')
                failed = True

            finally:
                # Single cleanup point, also reached on KeyboardInterrupt.
                if driver is not None:
                    driver.quit()

            if failed:
                time.sleep(args.timeout_big)
                continue

            page += 1
            print('Page', page, 'increased')
            if not jolinks:
                # An empty listing page ends the year; keep its HTML for inspection.
                if args.debug:
                    print('Debug', args.debug)
                    try:
                        with open(args.debug, 'w', encoding = 'utf-8') as f:
                            f.write(page_source)
                    except OSError as e:
                        # The dump is a diagnostic aid only; never let it stop the crawl.
                        print(e)
                break
156 |
--------------------------------------------------------------------------------
/tabulate.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import argparse
4 |
def digits(s):
    """Return the characters of *s* for which ``str.isdigit()`` is true, in order."""
    return ''.join(c for c in s if c.isdigit())


def extract_record(record, year, section, date, url, basename):
    """Split one decree entry into a 14-field row.

    The entry is searched (case-insensitively) for a birth marker such as
    'né le' / 'née le'. The text before the marker yields the last name and
    the parenthesised first name; the text after it yields birth date, place
    and country. When no marker is found, the whole entry is kept in the
    ``comment`` field and the text before the first '(' goes to ``firstname``.

    Args:
        record: text of a single entry.
        year, section, date, url: copied unchanged into the row.
        basename: source file name; a trailing '.txt' is removed.

    Returns:
        Tuple ``(year, section, date, url, decree, action, lastname,
        firstname, gender, birthdate, birthplace, birthcountry, dep, comment)``.
    """
    comment = ''
    lowered = record.lower()

    # Order matters: ', né' is a substring of ', née', so feminine forms come first.
    markers = {'née le' : 'f', 'né le' : 'm', ', née' : 'f','néele' : 'f', 'néle' : 'm', ', né' : 'm'}
    genderstr, gender = None, None
    for k, v in markers.items():
        if k in lowered:
            gender, genderstr = v, k
            break

    if genderstr is None:
        firstname = record.split('(')[0]
        lastname = action = gender = birthdate = birthplace = birthcountry = dep = ''
        comment = record

    else:
        # The marker was found in the lower-cased text, so split the original
        # text case-insensitively; a plain str.split would leave entries such
        # as 'Née le' unsplit while still keeping the original casing intact.
        gendersplitted = re.split(re.escape(genderstr), record, flags = re.IGNORECASE)
        name = gendersplitted[0]
        lastname, firstname = name.split('(')[0], name.split('(')[-1]
        lastname, firstname = lastname.strip().strip('(),'), firstname.strip().strip('(),')

        # Birth details end where the upper-case action code starts.
        birth = re.split('NAT|EFF|LIB|REI', gendersplitted[-1].strip())[0]

        record_nospace = record.replace(' ', '').lower()
        action = 'NAT' if ',nat,' in record_nospace else 'EFF' if ',eff,' in record_nospace else 'LIB' if ',lib,' in record_nospace else 'REI' if ',rei,' in record_nospace else ''

        re_birthdate = r'[ \-/0-9]+'
        # First run of digits/separators, reduced to its digits, then
        # re-assembled as DD/MM/YYYY (an empty match collapses to '').
        birthdate = (re.findall(re_birthdate, birth) + [''])[0].replace('-', '').replace('/', '').replace(' ', '')
        birthdate = (birthdate[:2] + '/' + birthdate[-6:-4] + '/' + birthdate[-4:]).strip('/')
        # NOTE: strip() removes the *characters* à, a, u, comma, dot and space
        # from both ends, not the words 'à' / 'au'.
        birthplace = ' '.join(re.split(re_birthdate, birth)).strip('àau,. ')

        if birthplace.count('(') == birthplace.count(')') == 1:
            birthcountry = birthplace.split('(')[1].split(')')[0].strip(' ,')
            birthplace = birthplace.split('(')[0].strip(' ,')
        else:
            birthcountry = ''
        dep = digits(record_nospace.split('dép')[-1].split(',')[0].strip()) if 'dép' in lowered else ''

    # Remove the '.txt' suffix itself; rstrip('.txt') would eat any trailing
    # '.', 't' or 'x' characters (e.g. 'report.txt' -> 'repor').
    decree = basename[:-len('.txt')] if basename.endswith('.txt') else basename

    return (year, section, date, url, decree, action, lastname.capitalize(), firstname, gender, birthdate, birthplace, birthcountry, dep, comment.replace('\t', ''))
43 |
44 |
if __name__ == '__main__':
    # Command-line driver: read every text file of the input directory, pick
    # out the entries of the requested section and write them as a TSV table,
    # plus a '<output>.joe.txt' side file listing, per input file, whether
    # the requested section was never entered (1) or was (0).
    parser = argparse.ArgumentParser()
    parser.add_argument('--input-directory', '-i', default = 'txtjo')
    parser.add_argument('--output-path', '-o', default = 'natufrance.tsv')
    parser.add_argument('--legifrance', default = 'https://www.legifrance.gouv.fr/jorf/jo/{year}/{month}/{day}/{num}')
    parser.add_argument('--section', default = 'naturalisation', help = 'nat : naturalisé français ; rei : réintégré dans la nationalité française ; eff : enfant saisi par l’effet collectif attaché à l’acquisition de la nationalité française par ses parents ; lib : libéré de l’allégeance française')
    args = parser.parse_args()

    records = []
    joe = {}

    # Characters treated as leader dots / separators in the extracted text.
    dots = '-_;·~·•'

    for basename in os.listdir(args.input_directory): #['JORF_20150116_13.pdf.txt']:#
        # Expected shape: <prefix>_<YYYYMMDD>_<num>...; skip anything else
        # rather than crash on a stray file.
        parts = basename.split('_')
        if len(parts) < 3:
            print('Skipping unexpected file name', basename)
            continue
        yearmonthday, num = parts[1:3]
        # Drop any extension glued to the number ('13.pdf.txt' -> '13') so it
        # does not leak into the generated URL.
        num = num.split('.')[0]
        year, month, day = yearmonthday[:4], yearmonthday[4:6], yearmonthday[6:]
        date = day + '/' + month + '/' + year
        url = args.legifrance.format(year = year, month = month, day = day, num = num)

        with open(os.path.join(args.input_directory, basename), encoding = 'utf-8') as f:
            lines = [stripped for stripped in map(str.strip, f) if stripped]

        section = None
        record = None
        ignore = True

        for text in lines:
            # NOTE(review): the first literal rendered as a plain space in the
            # copy this was rewritten from, which would make the call a no-op;
            # presumably it was a non-breaking space -- confirm upstream.
            text = text.replace('\xa0', ' ').replace(':', '.').strip()
            # A trailing run of a separator character counts as a final dot...
            for c in dots:
                if text and text[-1] == c:
                    text = text.rstrip(c) + '.'
            # ...and separators anywhere else become spaces.
            for c in dots:
                text = text.replace(c, ' ')
            text = text.strip()

            # Leading run of ASCII letters (empty if the line starts otherwise).
            first_word = re.split('[^a-zA-Z]', text)[0]
            lower = text.lower()

            if section == args.section:
                ignore = False

            if 'sont naturalisés français' in lower:# or 'l’acquisition de la nationalité française' in lower:
                section = 'naturalisation'
                record = ''

            elif 'demandes de changement de nom' in lower:
                section = 'changementdenom'
                record = ''

            elif 'Art.' in text:
                section = None
                record = ''

            elif 'libéré de l’allégeance française' in text:
                continue

            else:
                if section == 'naturalisation':
                    closes_entry = text.endswith('.') or (lower.count('dép') >= 1 and text.count('Dt') >= 1)
                    if closes_entry and (bool(record) or first_word.isupper()):
                        record += text + ' '
                        # 'dép.' / 'Dt.' are abbreviations, not the end of the entry.
                        if text.endswith(('dép.', 'Dt.')):# or record.count('(') != record.count(')'):
                            continue

                        if 'né' not in record.lower():
                            record = ''
                            continue

                        if section == args.section:
                            records.append(extract_record(record, year, section, date, url, basename))

                        record = ''

                    elif first_word.isupper() and not record and 'né' in lower:
                        record = text

                    elif record:
                        record += text

                if section == 'changementdenom':
                    if text.endswith('.'):
                        record += text
                        if section == args.section:
                            firstname = lastname = gender = birthdate = birthplace = birthcountry = dep = action = ''
                            records.append((year, section, date, url, basename.rstrip('.txt'), action, lastname, firstname, gender, birthdate, birthplace, birthcountry, dep, record.replace('\t', '')))
                        record = ''

                    elif not text.startswith('No. ') and not text.startswith('No '):
                        record += text

        joe[basename] = ignore

    header = ('year', 'section', 'date', 'url', 'decree', 'action', 'lastname', 'firstname', 'gender', 'birthdate', 'birthplace', 'birthcountry', 'dep', 'comment')
    # Rows are ordered by last name, then first name.
    rows = [header] + sorted(records, key = lambda r: (r[6], r[7]))
    with open(args.output_path, 'w', encoding = 'utf-8') as f:
        f.write('\n'.join(map('\t'.join, rows)))
    with open(args.output_path + '.joe.txt', 'w', encoding = 'utf-8') as f:
        f.write('\n'.join(k + '\t' + str(int(v)) for k, v in joe.items()))
139 |
--------------------------------------------------------------------------------