├── README.md ├── natudump.py └── tabulate.py /README.md: -------------------------------------------------------------------------------- 1 | This is example of scraping public LegiFrance registry's naturalisation decrees for research purposes only (`naturalisation par mariage` is not included in these decrees). Code license is MIT. 2 | 3 | ```shell 4 | pip install selenium charset_normalizer 5 | 6 | mkdir -p jo 7 | # python3 natudump.py -o jo --years $(seq 2000 2021) --output-directory-prefix "$(wslpath -a -w "$PWD")\\" # for WSL systems, must be on a NTFS drive 8 | python3 natudump.py -o jo --years $(seq 2000 2021) --output-directory-prefix "$PWD/" 9 | ls jo | wc -l 10 | 11 | mkdir -p txtjo 12 | # https://github.com/pdfminer/pdfminer.six/issues/809 13 | git clone --branch 20220524 --depth 1 https://github.com/pdfminer/pdfminer.six 14 | PYTHONPATH="pdfminer.six:pdfminer.six/tools:$PYTHONPATH" find jo -name '*.pdf' -exec python3 -m pdf2txt {} -o txt{}.txt \; 15 | ls txtjo | wc -l 16 | 17 | python3 tabulate.py -i txtjo -o natufrance_2000_2021.tsv 18 | grep 'Russie\|URSS\|U.R.S.S' natufrance_2000_2021.tsv | wc -l 19 | 20 | mkdir -p catjo 21 | git clone --branch v0.4 --depth 1 https://github.com/pmaupin/pdfrw 22 | rm $(PYTHONPATH="$PWD/pdfrw:$PYTHONPATH" find jo/ -type f -not -exec python3 -c 'import sys, pdfrw; pdfrw.PdfReader(sys.argv[1])' {} \; -print) 23 | for years in $(seq 2000 2021); do PYTHONPATH="$PWD/pdfrw:$PYTHONPATH" python3 pdfrw/examples/cat.py jo/JORF_${years}*; mv cat.JORF_${years}*.pdf catjo; done 24 | ls catjo | wc -l 25 | 26 | mkdir -p tarjo 27 | for years in $(seq 2000 2021); do tar -cf tarjo/jo${years}.tar jo/*_${years}*; done 28 | ls tarjo | wc -l 29 | ``` 30 | -------------------------------------------------------------------------------- /natudump.py: -------------------------------------------------------------------------------- 1 | # https://chromedriver.chromium.org/downloads 2 | # 
# https://www.smashingmagazine.com/2021/12/headers-https-requests-ui-automation-testing/
# https://pypi.org/project/selenium-wire/

import os
import time
import argparse
import urllib.request
import html.parser

class JoCaptcha(html.parser.HTMLParser):
    '''Extracts and solves LegiFrance's arithmetic captcha.

    The page shows a sum spelled out in French ("deux + trois = ?") inside
    <div class="lf-captcha-sum">, with the unknown term being an <input> box.
    Feed the captcha's innerHTML to this parser; the question text accumulates
    in self.captcha between a '<div>' sentinel (capture start) and an
    '<input>' sentinel (the answer slot), which solve() maps to '?'.

    NOTE(review): handle_starttag/handle_data were reconstructed from a dump
    in which the markup tokens had been eaten by HTML rendering -- confirm the
    sentinel scheme against the live LegiFrance page before relying on it.
    '''

    def __init__(self):
        super().__init__()
        # Raw captured captcha text, e.g. '<div>deux + trois = <input>'.
        self.captcha = ''

    def handle_starttag(self, tag, attrs):
        # Start capture at the captcha <div>; while capturing, record the
        # answer <input> box as a literal '<input>' token (also ends capture).
        if tag == 'div' and any(k == 'class' and v == 'lf-captcha-sum' for k, v in attrs):
            self.captcha += '<div>'
        elif tag == 'input' and '<div>' in self.captcha and '<input>' not in self.captcha:
            self.captcha += '<input>'

    def handle_data(self, data):
        # Capture question text only between the opening <div> and the <input>.
        if '<div>' in self.captcha and '<input>' not in self.captcha:
            self.captcha += data

    def solve(self, replace = {' ' : '', '\n' : '', '<div>' : '', '<input>' : '?', 'onze' : 11, 'douze' : 12, 'treize' : 13, 'quatorze' : 14, 'quinze' : 15, 'seize' : 16, 'dix-sept' : 17, 'dix-huit' : 18, 'dix-neuf' : 19, 'vingt' : 20, 'un' : 1, 'deux' : 2, 'trois' : 3, 'quatre' : 4, 'cinq' : 5, 'six' : 6, 'sept' : 7, 'huit' : 8, 'neuf' : 9, 'dix' : 10}):
        '''Return the integer answer of the captured "a + b = c" captcha where
        exactly one of a, b, c is the unknown.

        The default replace table is only read, never mutated, so the shared
        default is safe. Insertion order matters: compound numerals
        (onze..vingt, dix-sept..dix-neuf) must be substituted before their
        substrings (un, sept, dix, ...), which dict ordering guarantees.
        '''
        captcha = self.captcha
        for k, v in replace.items():
            captcha = captcha.replace(k, str(v))

        # After substitution only digits and the '? + =' skeleton may remain.
        assert all(c in '?+=0123456789' for c in captcha) and '=' in captcha, captcha

        ab, c = captcha.split('=')
        a, b = ab.split('+')
        if a == '?':
            return int(c) - int(b)
        if b == '?':
            return int(c) - int(a)
        assert c == '?'
        return int(a) + int(b)

if __name__ == '__main__':
    # Selenium is imported lazily here so that JoCaptcha remains importable
    # (and unit-testable) without selenium installed; behavior when run as a
    # script is unchanged.
    import selenium.webdriver
    import selenium.webdriver.support.ui
    import selenium.webdriver.support.expected_conditions

    parser = argparse.ArgumentParser()
    parser.add_argument('--jo-search', default = 'https://www.legifrance.gouv.fr/jorf/jo/{year}?page={page}&pageSize=100')
    parser.add_argument('--jo-download', default = 'https://www.legifrance.gouv.fr/download/secure/file/{token}')
    parser.add_argument('--years', default = [2016, 2017, 2018, 2019, 2020, 2021], type = int, nargs = '+')
    parser.add_argument('--output-directory', '-o', default = 'jo')
    parser.add_argument('--output-directory-prefix', default = '')
    parser.add_argument('--chromedriver', default = '/usr/bin/chromedriver', help = "for WSLv1, this path should symlink to chromedriver.exe extracted from a zipball chromedriver_win32.zip downloaded from https://chromedriver.chromium.org/downloads, don' t forget to update it when you upgrade the host Chrome")
    parser.add_argument('--timeout', type = float, default = 10.0)
    parser.add_argument('--timeout-big', type = float, default = 30.0)
    parser.add_argument('--headmore', action = 'store_true')
    parser.add_argument('--debug', default = 'debug.html')

    args = parser.parse_args()
    print(args)

    os.makedirs(args.output_directory, exist_ok = True)

    # Make Chrome download PDFs directly (no built-in viewer, no prompt) into
    # the output directory; the prefix exists for WSL, where Chrome needs a
    # Windows-style path while this script sees a POSIX one.
    chrome_prefs = {
        'download.default_directory': args.output_directory_prefix + args.output_directory,
        'download.prompt_for_download': False,
        'download.directory_upgrade': True,
        'plugins.always_open_pdf_externally': True,
    }
    chrome_options = selenium.webdriver.ChromeOptions()
    chrome_options.add_experimental_option('prefs', chrome_prefs)
    if not args.headmore:
        chrome_options.add_argument('--headless')
    chrome_service = selenium.webdriver.chrome.service.Service(executable_path = args.chromedriver)

    # For header debugging via selenium-wire (see links at top of file):
    #driver.request_interceptor = driver.response_interceptor = (lambda request, response: print(request.url, request.headers, response.headers))

    def find_artefacts(joid, temp):
        # Files in the output directory matching this JO id. temp selects
        # Chrome's in-progress '.crdownload' files vs finished downloads.
        # The first prefix looks like 'joe_' + YYYYMMDD reassembled from joid
        # (presumably DDMM...YYYY -- TODO confirm against actual filenames);
        # NOTE(review): joid.rstrip('pdf') strips a *character set*, kept as-is
        # from the original.
        return [fname for fname in os.listdir(args.output_directory)
                for prefix in ['joe_' + joid[-4:] + joid[2:4] + joid[:2], joid.rstrip('pdf')]
                if prefix in fname and ('.crdownload' in fname) == temp]

    for year in args.years:
        page = 1
        while True:
            driver = None
            try:
                # Fresh browser per page: keeps state (cookies, captcha
                # counters) from accumulating across retries.
                driver = selenium.webdriver.Chrome(options = chrome_options, service = chrome_service)
                wait = selenium.webdriver.support.ui.WebDriverWait(driver, args.timeout)

                url = args.jo_search.format(year = year, page = page)

                driver.get(url)
                page_source = driver.page_source  # kept for the --debug dump when no links are found
                wait.until(selenium.webdriver.support.expected_conditions.url_to_be(url))

                # Decrees of interest: 'mesures nominatives' entries plus scanned paper editions.
                jolinks = driver.find_elements('partial link text', 'nominatives') + driver.find_elements('partial link text', 'version papier numérisée')

                print('Page', page, 'found', len(jolinks), 'links', url)

                for i, jolink in enumerate(jolinks):
                    joid = jolink.get_attribute('data-textid')

                    res_files = find_artefacts(joid, temp = False)
                    if res_files:
                        # Already downloaded on a previous run: resume support.
                        print('Page', page, 'Skipping', joid)
                        continue

                    for fname in find_artefacts(joid, temp = True):
                        print('Temp file exists, deleting', joid, fname)
                        os.remove(os.path.join(args.output_directory, fname))

                    print('Page', page, 'Processing', joid, i, '/', len(jolinks))

                    jolink.click()
                    wait.until(selenium.webdriver.support.expected_conditions.presence_of_element_located(('css selector', '.lf-captcha-line')))

                    captcha = driver.find_element('css selector', '.lf-captcha-line')

                    # Solve the arithmetic captcha offline and submit the answer.
                    jo_captcha_parser = JoCaptcha()
                    jo_captcha_parser.feed(captcha.get_attribute('innerHTML'))
                    captcha_solution = jo_captcha_parser.solve()

                    captcha.find_element('css selector', '.lf-captcha-input').send_keys(str(captcha_solution))
                    captcha.find_element('css selector', '.captcha-submit').click()

                    # The captcha unlocks a tokenized secure download link.
                    wait.until(selenium.webdriver.support.expected_conditions.presence_of_element_located(('css selector', '.secure-content a')))
                    token = driver.find_element('css selector', '.secure-content a').get_attribute('href').split('token=')[-1]

                    url = args.jo_download.format(token = token)
                    driver.get(url)

                    # Poll the filesystem until Chrome's '.crdownload' disappears.
                    time.sleep(args.timeout)
                    while find_artefacts(joid, temp = True):
                        print('Download in progress, temp file exists, sleeping')
                        time.sleep(args.timeout)
                    assert find_artefacts(joid, temp = False), 'Must have final downloaded file'
                    print('Page', page, 'OK', joid, 'PDF:', url)

                driver.quit()
                page += 1
                print('Page', page, 'increased')
                if len(jolinks) == 0:
                    # Empty page means we paged past the last result for this year.
                    if args.debug:
                        print('Debug', args.debug)
                        with open(args.debug, 'w') as f:
                            f.write(page_source)
                    break

            except Exception as e:
                # Deliberate best-effort retry: any hiccup (timeout, captcha
                # failure, stale element) restarts this page with a fresh
                # browser after a long pause.
                print(e)
                print('Page', page, 'big timeout')
                if driver is not None:
                    driver.quit()
                time.sleep(args.timeout_big)
# -------------------------------------------------------------------------------- /tabulate.py: --------------------------------------------------------------------------------
import os
import re
import argparse
# Keep only the decimal digits of a string, e.g. 'dép. 75,' -> '75'.
digits = lambda s: ''.join(c for c in s if c.isdigit())

def extract_record(record, year, section, date, url, basename):
    '''Parse one naturalisation-decree record line, e.g.
    "DUPONT (Jean), né le 01/02/1990 à Paris (France), NAT, dép. 75."
    into a flat tuple:
    (year, section, date, url, decree, action, lastname, firstname, gender,
     birthdate, birthplace, birthcountry, dep, comment).

    Gender is inferred from the né/née marker; when no marker is found the
    raw record is preserved in the comment column with the other fields empty.
    '''
    comment = ''
    genderstr, gender = None, None
    # Ordered most-specific first so 'née le' wins over ', née' etc.
    for k, v in {'née le' : 'f', 'né le' : 'm', ', née' : 'f', 'néele' : 'f', 'néle' : 'm', ', né' : 'm'}.items():
        if k in record.lower():
            gender, genderstr = v, k
            break

    if genderstr is None:
        # Unparseable record: keep the raw text for manual inspection.
        firstname = record.split('(')[0]
        lastname = action = gender = birthdate = birthplace = birthcountry = dep = ''
        comment = record

    else:
        # Bug fix: the original used `record.split(genderstr) or
        # record.lower().split(genderstr)` -- str.split never returns a falsy
        # value, so the case-insensitive fallback was dead code. Split the
        # lowercased record when the marker only matches case-insensitively.
        gendersplitted = record.split(genderstr) if genderstr in record else record.lower().split(genderstr)
        name = gendersplitted[0]
        # Name part is 'LASTNAME (Firstname),'.
        lastname, firstname = name.split('(')[0], name.split('(')[-1]
        lastname, firstname = lastname.strip().strip('(),'), firstname.strip().strip('(),')

        # Birth info runs from the gender marker to the action keyword.
        birth = re.split('NAT|EFF|LIB|REI', gendersplitted[-1].strip())[0]

        record_nospace = record.replace(' ', '').lower()
        # NAT: naturalisé ; EFF: effet collectif ; LIB: libéré ; REI: réintégré.
        action = 'NAT' if ',nat,' in record_nospace else 'EFF' if ',eff,' in record_nospace else 'LIB' if ',lib,' in record_nospace else 'REI' if ',rei,' in record_nospace else ''
        re_birthdate = r'[ \-/0-9]+'
        # First run of digits/separators is the birth date; normalize to DD/MM/YYYY.
        birthdate = (re.findall(re_birthdate, birth) + [''])[0].replace('-', '').replace('/', '').replace(' ', '')
        birthdate = (birthdate[:2] + '/' + birthdate[-6:-4] + '/' + birthdate[-4:]).strip('/')
        # Whatever is not date-like is the birth place; strip French glue words
        # ('à', 'au') and punctuation. NOTE(review): trailing strip-set char was
        # garbled in the dump, reconstructed as '\n' -- confirm.
        birthplace = ' '.join(re.split(re_birthdate, birth)).strip('àau,. \n')

        # 'Place (Country)' -> split into city and country.
        if birthplace.count('(') == birthplace.count(')') == 1:
            birthcountry = birthplace.split('(')[1].split(')')[0].strip(' ,')
            birthplace = birthplace.split('(')[0].strip(' ,')
        else:
            birthcountry = ''
        dep = digits(record_nospace.split('dép')[-1].split(',')[0].strip()) if 'dép' in record.lower() else ''

    # Bug fix: basename.rstrip('.txt') strips a *character set* (would also eat
    # a trailing 't'/'x' of the stem); strip the literal '.txt' suffix instead.
    decree = basename[:-len('.txt')] if basename.endswith('.txt') else basename
    return (year, section, date, url, decree, action, lastname.capitalize(), firstname, gender, birthdate, birthplace, birthcountry, dep, comment.replace('\t', ''))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--input-directory', '-i', default = 'txtjo')
    parser.add_argument('--output-path', '-o', default = 'natufrance.tsv')
    parser.add_argument('--legifrance', default = 'https://www.legifrance.gouv.fr/jorf/jo/{year}/{month}/{day}/{num}')
    parser.add_argument('--section', default = 'naturalisation', help = 'nat : naturalisé français ; rei : réintégré dans la nationalité française ; eff : enfant saisi par l’effet collectif attaché à l’acquisition de la nationalité française par ses parents ; lib : libéré de l’allégeance française')
    args = parser.parse_args()

    records = []
    # Per-file flag: True if the file contained no lines of the wanted section.
    joe = {}

    for basename in os.listdir(args.input_directory):
        # Filenames look like 'JORF_YYYYMMDD_num.pdf.txt'.
        yearmonthday, num = basename.split('_')[1:3]
        year, month, day = yearmonthday[:4], yearmonthday[4:6], yearmonthday[6:]
        date = day + '/' + month + '/' + year
        url = args.legifrance.format(year = year, month = month, day = day, num = num)

        # Fix: close the input file deterministically (original leaked the handle).
        with open(os.path.join(args.input_directory, basename)) as f:
            lines = f.readlines()
        section = None
        record = None

        lines = list(filter(bool, map(str.strip, lines)))
        ignore = True
        for i in range(len(lines)):
            L = lines[i]
            # Normalize pdf2txt artifacts: NBSP -> space, ':' -> '.'.
            # NOTE(review): the NBSP ('\xa0') was garbled in the dump (showed
            # as a plain space, which would be a no-op) -- confirm.
            L = L.replace('\xa0', ' ').replace(':', '.').strip()
            # Leader/bullet characters used by the layout between name and data.
            dots = '-_;·~·•'
            for c in dots:
                if L and L[-1] == c:
                    # A trailing leader run marks end-of-record: turn it into '.'.
                    L = L.rstrip(c) + '.'
            for c in dots:
                L = L.replace(c, ' ')
            L = L.strip()

            # F: first purely-alphabetic token (uppercase => likely a LASTNAME).
            F = (re.split('[^a-zA-Z]', L) + [''])[0]
            l = L.lower()

            if section == args.section:
                ignore = False

            if 'sont naturalisés français' in l:# or 'l’acquisition de la nationalité française' in l:
                section = 'naturalisation'
                record = ''

            elif 'demandes de changement de nom' in l:
                section = 'changementdenom'
                record = ''

            elif 'Art.' in L:
                # A new article ends the current section.
                section = None
                record = ''

            elif 'libéré de l’allégeance française' in L:
                continue

            else:
                if section == 'naturalisation':
                    # A line ending in '.' (or a dép/Dt line) closes a record,
                    # provided we were accumulating one or it starts a new name.
                    if (L.endswith('.') or (l.count('dép') >= 1 and L.count('Dt') >= 1)) and (bool(record) or F.isupper()):
                        record += L + ' '
                        if L.endswith('dép.') or L.endswith('Dt.'):# or record.count('(') != record.count(')'):
                            continue

                        if 'né' not in record.lower():
                            # No birth marker: not a person record, discard.
                            record = ''
                            continue

                        if section == args.section:
                            records.append(extract_record(record, year, section, date, url, basename))
                            if not records[-1]:
                                records.pop()

                        record = ''

                    elif F.isupper() and not record and 'né' in l:
                        record = L

                    elif record:
                        record += L

                if section == 'changementdenom':
                    if L.endswith('.'):
                        record += L
                        if section == args.section:
                            # Name-change records are kept raw in the comment column.
                            firstname = lastname = gender = birthdate = birthplace = birthcountry = dep = action = ''
                            decree = basename[:-len('.txt')] if basename.endswith('.txt') else basename
                            records.append((year, section, date, url, decree, action, lastname, firstname, gender, birthdate, birthplace, birthcountry, dep, record.replace('\t', '')))
                            record = ''

                    # Skip page-number artifacts. Bug fix: the original pattern
                    # contained an embedded newline ('No. \n') and could never
                    # match a stripped line; NOTE(review): literal reconstructed
                    # from a garbled dump -- confirm.
                    elif not L.startswith('No. ') and not L.startswith('No '):
                        record += L
        joe[basename] = ignore

    # Fix: write through context managers so the handles are flushed and closed.
    with open(args.output_path, 'w') as f:
        f.write('\n'.join(map('\t'.join, [('year', 'section', 'date', 'url', 'decree', 'action', 'lastname', 'firstname', 'gender', 'birthdate', 'birthplace', 'birthcountry', 'dep', 'comment')] + sorted(records, key = lambda r: (r[6], r[7])))))
    with open(args.output_path + '.joe.txt', 'w') as f:
        f.write('\n'.join(k + '\t' + str(int(v)) for k, v in joe.items()))