├── README.md ├── natudump.py └── tabulate.py /README.md: -------------------------------------------------------------------------------- 1 | This is example of scraping public LegiFrance registry's naturalisation decrees for research purposes only (`naturalisation par mariage` is not included in these decrees). Code license is MIT. 2 | 3 | ```shell 4 | pip install selenium charset_normalizer 5 | 6 | mkdir -p jo 7 | # python3 natudump.py -o jo --years $(seq 2000 2021) --output-directory-prefix "$(wslpath -a -w "$PWD")\\" # for WSL systems, must be on a NTFS drive 8 | python3 natudump.py -o jo --years $(seq 2000 2021) --output-directory-prefix "$PWD/" 9 | ls jo | wc -l 10 | 11 | mkdir -p txtjo 12 | # https://github.com/pdfminer/pdfminer.six/issues/809 13 | git clone --branch 20220524 --depth 1 https://github.com/pdfminer/pdfminer.six 14 | PYTHONPATH="pdfminer.six:pdfminer.six/tools:$PYTHONPATH" find jo -name '*.pdf' -exec python3 -m pdf2txt {} -o txt{}.txt \; 15 | ls txtjo | wc -l 16 | 17 | python3 tabulate.py -i txtjo -o natufrance_2000_2021.tsv 18 | grep 'Russie\|URSS\|U.R.S.S' natufrance_2000_2021.tsv | wc -l 19 | 20 | mkdir -p catjo 21 | git clone --branch v0.4 --depth 1 https://github.com/pmaupin/pdfrw 22 | rm $(PYTHONPATH="$PWD/pdfrw:$PYTHONPATH" find jo/ -type f -not -exec python3 -c 'import sys, pdfrw; pdfrw.PdfReader(sys.argv[1])' {} \; -print) 23 | for years in $(seq 2000 2021); do PYTHONPATH="$PWD/pdfrw:$PYTHONPATH" python3 pdfrw/examples/cat.py jo/JORF_${years}*; mv cat.JORF_${years}*.pdf catjo; done 24 | ls catjo | wc -l 25 | 26 | mkdir -p tarjo 27 | for years in $(seq 2000 2021); do tar -cf tarjo/jo${years}.tar jo/*_${years}*; done 28 | ls tarjo | wc -l 29 | ``` 30 | -------------------------------------------------------------------------------- /natudump.py: -------------------------------------------------------------------------------- 1 | # https://chromedriver.chromium.org/downloads 2 | # 
# https://www.smashingmagazine.com/2021/12/headers-https-requests-ui-automation-testing/
# https://pypi.org/project/selenium-wire/

import os
import time
import argparse
import urllib.request
import html.parser

class JoCaptcha(html.parser.HTMLParser):
    '''Extracts and solves LegiFrance's arithmetic captcha.

    The page shows a sum spelled out in French ("deux + trois = ?") inside
    <div class="lf-captcha-sum">, with the unknown term being an <input> box.
    Feed the captcha's innerHTML to this parser; the question text accumulates
    in self.captcha between a '<div>' sentinel (capture start) and an
    '<input>' sentinel (the answer slot), which solve() maps to '?'.

    NOTE(review): handle_starttag/handle_data were reconstructed from a dump
    in which the markup tokens had been eaten by HTML rendering -- confirm the
    sentinel scheme against the live LegiFrance page before relying on it.
    '''

    def __init__(self):
        super().__init__()
        # Raw captured captcha text, e.g. '<div>deux + trois = <input>'.
        self.captcha = ''

    def handle_starttag(self, tag, attrs):
        # Start capture at the captcha <div>; while capturing, record the
        # answer <input> box as a literal '<input>' token (also ends capture).
        if tag == 'div' and any(k == 'class' and v == 'lf-captcha-sum' for k, v in attrs):
            self.captcha += '<div>'
        elif tag == 'input' and '<div>' in self.captcha and '<input>' not in self.captcha:
            self.captcha += '<input>'

    def handle_data(self, data):
        # Capture question text only between the opening <div> and the <input>.
        if '<div>' in self.captcha and '<input>' not in self.captcha:
            self.captcha += data

    def solve(self, replace = {' ' : '', '\n' : '', '<div>' : '', '<input>' : '?', 'onze' : 11, 'douze' : 12, 'treize' : 13, 'quatorze' : 14, 'quinze' : 15, 'seize' : 16, 'dix-sept' : 17, 'dix-huit' : 18, 'dix-neuf' : 19, 'vingt' : 20, 'un' : 1, 'deux' : 2, 'trois' : 3, 'quatre' : 4, 'cinq' : 5, 'six' : 6, 'sept' : 7, 'huit' : 8, 'neuf' : 9, 'dix' : 10}):
        '''Return the integer answer of the captured "a + b = c" captcha where
        exactly one of a, b, c is the unknown.

        The default replace table is only read, never mutated, so the shared
        default is safe. Insertion order matters: compound numerals
        (onze..vingt, dix-sept..dix-neuf) must be substituted before their
        substrings (un, sept, dix, ...), which dict ordering guarantees.
        '''
        captcha = self.captcha
        for k, v in replace.items():
            captcha = captcha.replace(k, str(v))

        # After substitution only digits and the '? + =' skeleton may remain.
        assert all(c in '?+=0123456789' for c in captcha) and '=' in captcha, captcha

        ab, c = captcha.split('=')
        a, b = ab.split('+')
        if a == '?':
            return int(c) - int(b)
        if b == '?':
            return int(c) - int(a)
        assert c == '?'
        return int(a) + int(b)

if __name__ == '__main__':
    # Selenium is imported lazily here so that JoCaptcha remains importable
    # (and unit-testable) without selenium installed; behavior when run as a
    # script is unchanged.
    import selenium.webdriver
    import selenium.webdriver.support.ui
    import selenium.webdriver.support.expected_conditions

    parser = argparse.ArgumentParser()
    parser.add_argument('--jo-search', default = 'https://www.legifrance.gouv.fr/jorf/jo/{year}?page={page}&pageSize=100')
    parser.add_argument('--jo-download', default = 'https://www.legifrance.gouv.fr/download/secure/file/{token}')
    parser.add_argument('--years', default = [2016, 2017, 2018, 2019, 2020, 2021], type = int, nargs = '+')
    parser.add_argument('--output-directory', '-o', default = 'jo')
    parser.add_argument('--output-directory-prefix', default = '')
    parser.add_argument('--chromedriver', default = '/usr/bin/chromedriver', help = "for WSLv1, this path should symlink to chromedriver.exe extracted from a zipball chromedriver_win32.zip downloaded from https://chromedriver.chromium.org/downloads, don' t forget to update it when you upgrade the host Chrome")
    parser.add_argument('--timeout', type = float, default = 10.0)
    parser.add_argument('--timeout-big', type = float, default = 30.0)
    parser.add_argument('--headmore', action = 'store_true')
    parser.add_argument('--debug', default = 'debug.html')

    args = parser.parse_args()
    print(args)

    os.makedirs(args.output_directory, exist_ok = True)

    # Make Chrome download PDFs directly (no built-in viewer, no prompt) into
    # the output directory; the prefix exists for WSL, where Chrome needs a
    # Windows-style path while this script sees a POSIX one.
    chrome_prefs = {
        'download.default_directory': args.output_directory_prefix + args.output_directory,
        'download.prompt_for_download': False,
        'download.directory_upgrade': True,
        'plugins.always_open_pdf_externally': True,
    }
    chrome_options = selenium.webdriver.ChromeOptions()
    chrome_options.add_experimental_option('prefs', chrome_prefs)
    if not args.headmore:
        chrome_options.add_argument('--headless')
    chrome_service = selenium.webdriver.chrome.service.Service(executable_path = args.chromedriver)

    # For header debugging via selenium-wire (see links at top of file):
    #driver.request_interceptor = driver.response_interceptor = (lambda request, response: print(request.url, request.headers, response.headers))

    def find_artefacts(joid, temp):
        # Files in the output directory matching this JO id. temp selects
        # Chrome's in-progress '.crdownload' files vs finished downloads.
        # The first prefix looks like 'joe_' + YYYYMMDD reassembled from joid
        # (presumably DDMM...YYYY -- TODO confirm against actual filenames);
        # NOTE(review): joid.rstrip('pdf') strips a *character set*, kept as-is
        # from the original.
        return [fname for fname in os.listdir(args.output_directory)
                for prefix in ['joe_' + joid[-4:] + joid[2:4] + joid[:2], joid.rstrip('pdf')]
                if prefix in fname and ('.crdownload' in fname) == temp]

    for year in args.years:
        page = 1
        while True:
            driver = None
            try:
                # Fresh browser per page: keeps state (cookies, captcha
                # counters) from accumulating across retries.
                driver = selenium.webdriver.Chrome(options = chrome_options, service = chrome_service)
                wait = selenium.webdriver.support.ui.WebDriverWait(driver, args.timeout)

                url = args.jo_search.format(year = year, page = page)

                driver.get(url)
                page_source = driver.page_source  # kept for the --debug dump when no links are found
                wait.until(selenium.webdriver.support.expected_conditions.url_to_be(url))

                # Decrees of interest: 'mesures nominatives' entries plus scanned paper editions.
                jolinks = driver.find_elements('partial link text', 'nominatives') + driver.find_elements('partial link text', 'version papier numérisée')

                print('Page', page, 'found', len(jolinks), 'links', url)

                for i, jolink in enumerate(jolinks):
                    joid = jolink.get_attribute('data-textid')

                    res_files = find_artefacts(joid, temp = False)
                    if res_files:
                        # Already downloaded on a previous run: resume support.
                        print('Page', page, 'Skipping', joid)
                        continue

                    for fname in find_artefacts(joid, temp = True):
                        print('Temp file exists, deleting', joid, fname)
                        os.remove(os.path.join(args.output_directory, fname))

                    print('Page', page, 'Processing', joid, i, '/', len(jolinks))

                    jolink.click()
                    wait.until(selenium.webdriver.support.expected_conditions.presence_of_element_located(('css selector', '.lf-captcha-line')))

                    captcha = driver.find_element('css selector', '.lf-captcha-line')

                    # Solve the arithmetic captcha offline and submit the answer.
                    jo_captcha_parser = JoCaptcha()
                    jo_captcha_parser.feed(captcha.get_attribute('innerHTML'))
                    captcha_solution = jo_captcha_parser.solve()

                    captcha.find_element('css selector', '.lf-captcha-input').send_keys(str(captcha_solution))
                    captcha.find_element('css selector', '.captcha-submit').click()

                    # The captcha unlocks a tokenized secure download link.
                    wait.until(selenium.webdriver.support.expected_conditions.presence_of_element_located(('css selector', '.secure-content a')))
                    token = driver.find_element('css selector', '.secure-content a').get_attribute('href').split('token=')[-1]

                    url = args.jo_download.format(token = token)
                    driver.get(url)

                    # Poll the filesystem until Chrome's '.crdownload' disappears.
                    time.sleep(args.timeout)
                    while find_artefacts(joid, temp = True):
                        print('Download in progress, temp file exists, sleeping')
                        time.sleep(args.timeout)
                    assert find_artefacts(joid, temp = False), 'Must have final downloaded file'
                    print('Page', page, 'OK', joid, 'PDF:', url)

                driver.quit()
                page += 1
                print('Page', page, 'increased')
                if len(jolinks) == 0:
                    # Empty page means we paged past the last result for this year.
                    if args.debug:
                        print('Debug', args.debug)
                        with open(args.debug, 'w') as f:
                            f.write(page_source)
                    break

            except Exception as e:
                # Deliberate best-effort retry: any hiccup (timeout, captcha
                # failure, stale element) restarts this page with a fresh
                # browser after a long pause.
                print(e)
                print('Page', page, 'big timeout')
                if driver is not None:
                    driver.quit()
                time.sleep(args.timeout_big)
# -------------------------------------------------------------------------------- /tabulate.py: --------------------------------------------------------------------------------
import os
import re
import argparse
# Keep only the decimal digits of a string, e.g. 'dép. 75,' -> '75'.
digits = lambda s: ''.join(c for c in s if c.isdigit())

def extract_record(record, year, section, date, url, basename):
    '''Parse one naturalisation-decree record line, e.g.
    "DUPONT (Jean), né le 01/02/1990 à Paris (France), NAT, dép. 75."
    into a flat tuple:
    (year, section, date, url, decree, action, lastname, firstname, gender,
     birthdate, birthplace, birthcountry, dep, comment).

    Gender is inferred from the né/née marker; when no marker is found the
    raw record is preserved in the comment column with the other fields empty.
    '''
    comment = ''
    genderstr, gender = None, None
    # Ordered most-specific first so 'née le' wins over ', née' etc.
    for k, v in {'née le' : 'f', 'né le' : 'm', ', née' : 'f', 'néele' : 'f', 'néle' : 'm', ', né' : 'm'}.items():
        if k in record.lower():
            gender, genderstr = v, k
            break

    if genderstr is None:
        # Unparseable record: keep the raw text for manual inspection.
        firstname = record.split('(')[0]
        lastname = action = gender = birthdate = birthplace = birthcountry = dep = ''
        comment = record

    else:
        # Bug fix: the original used `record.split(genderstr) or
        # record.lower().split(genderstr)` -- str.split never returns a falsy
        # value, so the case-insensitive fallback was dead code. Split the
        # lowercased record when the marker only matches case-insensitively.
        gendersplitted = record.split(genderstr) if genderstr in record else record.lower().split(genderstr)
        name = gendersplitted[0]
        # Name part is 'LASTNAME (Firstname),'.
        lastname, firstname = name.split('(')[0], name.split('(')[-1]
        lastname, firstname = lastname.strip().strip('(),'), firstname.strip().strip('(),')

        # Birth info runs from the gender marker to the action keyword.
        birth = re.split('NAT|EFF|LIB|REI', gendersplitted[-1].strip())[0]

        record_nospace = record.replace(' ', '').lower()
        # NAT: naturalisé ; EFF: effet collectif ; LIB: libéré ; REI: réintégré.
        action = 'NAT' if ',nat,' in record_nospace else 'EFF' if ',eff,' in record_nospace else 'LIB' if ',lib,' in record_nospace else 'REI' if ',rei,' in record_nospace else ''
        re_birthdate = r'[ \-/0-9]+'
        # First run of digits/separators is the birth date; normalize to DD/MM/YYYY.
        birthdate = (re.findall(re_birthdate, birth) + [''])[0].replace('-', '').replace('/', '').replace(' ', '')
        birthdate = (birthdate[:2] + '/' + birthdate[-6:-4] + '/' + birthdate[-4:]).strip('/')
        # Whatever is not date-like is the birth place; strip French glue words
        # ('à', 'au') and punctuation. NOTE(review): trailing strip-set char was
        # garbled in the dump, reconstructed as '\n' -- confirm.
        birthplace = ' '.join(re.split(re_birthdate, birth)).strip('àau,. \n')

        # 'Place (Country)' -> split into city and country.
        if birthplace.count('(') == birthplace.count(')') == 1:
            birthcountry = birthplace.split('(')[1].split(')')[0].strip(' ,')
            birthplace = birthplace.split('(')[0].strip(' ,')
        else:
            birthcountry = ''
        dep = digits(record_nospace.split('dép')[-1].split(',')[0].strip()) if 'dép' in record.lower() else ''

    # Bug fix: basename.rstrip('.txt') strips a *character set* (would also eat
    # a trailing 't'/'x' of the stem); strip the literal '.txt' suffix instead.
    decree = basename[:-len('.txt')] if basename.endswith('.txt') else basename
    return (year, section, date, url, decree, action, lastname.capitalize(), firstname, gender, birthdate, birthplace, birthcountry, dep, comment.replace('\t', ''))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--input-directory', '-i', default = 'txtjo')
    parser.add_argument('--output-path', '-o', default = 'natufrance.tsv')
    parser.add_argument('--legifrance', default = 'https://www.legifrance.gouv.fr/jorf/jo/{year}/{month}/{day}/{num}')
    parser.add_argument('--section', default = 'naturalisation', help = 'nat : naturalisé français ; rei : réintégré dans la nationalité française ; eff : enfant saisi par l’effet collectif attaché à l’acquisition de la nationalité française par ses parents ; lib : libéré de l’allégeance française')
    args = parser.parse_args()

    records = []
    # Per-file flag: True if the file contained no lines of the wanted section.
    joe = {}

    for basename in os.listdir(args.input_directory):
        # Filenames look like 'JORF_YYYYMMDD_num.pdf.txt'.
        yearmonthday, num = basename.split('_')[1:3]
        year, month, day = yearmonthday[:4], yearmonthday[4:6], yearmonthday[6:]
        date = day + '/' + month + '/' + year
        url = args.legifrance.format(year = year, month = month, day = day, num = num)

        # Fix: close the input file deterministically (original leaked the handle).
        with open(os.path.join(args.input_directory, basename)) as f:
            lines = f.readlines()
        section = None
        record = None

        lines = list(filter(bool, map(str.strip, lines)))
        ignore = True
        for i in range(len(lines)):
            L = lines[i]
            # Normalize pdf2txt artifacts: NBSP -> space, ':' -> '.'.
            # NOTE(review): the NBSP ('\xa0') was garbled in the dump (showed
            # as a plain space, which would be a no-op) -- confirm.
            L = L.replace('\xa0', ' ').replace(':', '.').strip()
            # Leader/bullet characters used by the layout between name and data.
            dots = '-_;·~·•'
            for c in dots:
                if L and L[-1] == c:
                    # A trailing leader run marks end-of-record: turn it into '.'.
                    L = L.rstrip(c) + '.'
            for c in dots:
                L = L.replace(c, ' ')
            L = L.strip()

            # F: first purely-alphabetic token (uppercase => likely a LASTNAME).
            F = (re.split('[^a-zA-Z]', L) + [''])[0]
            l = L.lower()

            if section == args.section:
                ignore = False

            if 'sont naturalisés français' in l:# or 'l’acquisition de la nationalité française' in l:
                section = 'naturalisation'
                record = ''

            elif 'demandes de changement de nom' in l:
                section = 'changementdenom'
                record = ''

            elif 'Art.' in L:
                # A new article ends the current section.
                section = None
                record = ''

            elif 'libéré de l’allégeance française' in L:
                continue

            else:
                if section == 'naturalisation':
                    # A line ending in '.' (or a dép/Dt line) closes a record,
                    # provided we were accumulating one or it starts a new name.
                    if (L.endswith('.') or (l.count('dép') >= 1 and L.count('Dt') >= 1)) and (bool(record) or F.isupper()):
                        record += L + ' '
                        if L.endswith('dép.') or L.endswith('Dt.'):# or record.count('(') != record.count(')'):
                            continue

                        if 'né' not in record.lower():
                            # No birth marker: not a person record, discard.
                            record = ''
                            continue

                        if section == args.section:
                            records.append(extract_record(record, year, section, date, url, basename))
                            if not records[-1]:
                                records.pop()

                        record = ''

                    elif F.isupper() and not record and 'né' in l:
                        record = L

                    elif record:
                        record += L

                if section == 'changementdenom':
                    if L.endswith('.'):
                        record += L
                        if section == args.section:
                            # Name-change records are kept raw in the comment column.
                            firstname = lastname = gender = birthdate = birthplace = birthcountry = dep = action = ''
                            decree = basename[:-len('.txt')] if basename.endswith('.txt') else basename
                            records.append((year, section, date, url, decree, action, lastname, firstname, gender, birthdate, birthplace, birthcountry, dep, record.replace('\t', '')))
                            record = ''

                    # Skip page-number artifacts. Bug fix: the original pattern
                    # contained an embedded newline ('No. \n') and could never
                    # match a stripped line; NOTE(review): literal reconstructed
                    # from a garbled dump -- confirm.
                    elif not L.startswith('No. ') and not L.startswith('No '):
                        record += L
        joe[basename] = ignore

    # Fix: write through context managers so the handles are flushed and closed.
    with open(args.output_path, 'w') as f:
        f.write('\n'.join(map('\t'.join, [('year', 'section', 'date', 'url', 'decree', 'action', 'lastname', 'firstname', 'gender', 'birthdate', 'birthplace', 'birthcountry', 'dep', 'comment')] + sorted(records, key = lambda r: (r[6], r[7])))))
    with open(args.output_path + '.joe.txt', 'w') as f:
        f.write('\n'.join(k + '\t' + str(int(v)) for k, v in joe.items()))