├── .gitignore ├── Gemfile ├── LICENSE ├── README.md ├── __init__.py ├── analysis ├── __init__.py ├── export.py ├── export_dockets.py ├── schema.sql └── tests.py ├── auto ├── fabfile.py ├── requirements.txt └── ssh_util.py ├── duplicates ├── __init__.py ├── cftc.py ├── clustering.py ├── db.py ├── interactive.py ├── ngrams.py └── tests.py ├── ec2 ├── README ├── install-deps.sh ├── run-x.sh └── setup-env.sh ├── one_offs ├── copy_agency │ └── cp.py ├── dodd_frank │ ├── agencies.py │ ├── dockets.py │ ├── dump.py │ ├── parse.py │ ├── regscrape │ └── settings.py ├── lightsquared │ ├── download_files.py │ ├── extract_text.py │ └── get_metadata.py └── pdf_repair │ ├── detect_pdfs.py │ └── fix_pdfs.py ├── regscrape ├── __init__.py ├── pipeline.py ├── regs_common │ ├── __init__.py │ ├── aggregates.py │ ├── commands │ │ ├── __init__.py │ │ ├── add_to_search.py │ │ ├── administer_search.py │ │ ├── annotate_fr_agencies.py │ │ ├── annotate_fr_docs.py │ │ ├── create_dockets.py │ │ ├── create_entities.py │ │ ├── export_text.py │ │ ├── extract.py │ │ ├── mark_searchable_entities.py │ │ ├── match_text.py │ │ ├── reset_downloads.py │ │ ├── reset_extraction.py │ │ ├── run_aggregates.py │ │ └── runner.py │ ├── data │ │ ├── es_mapping.json │ │ └── names.dat │ ├── data_import.py │ ├── entities.py │ ├── exceptions.py │ ├── extraction.py │ ├── gevent_mongo.py │ ├── mp_types.py │ ├── processing.py │ ├── scripts │ │ ├── extract_docx.py │ │ └── process_fr_docs.rb │ ├── tmp_redis.py │ ├── transfer.py │ └── util.py ├── regsdotgov │ ├── __init__.py │ ├── commands │ │ ├── __init__.py │ │ ├── rdg_create_agencies.py │ │ ├── rdg_download.py │ │ ├── rdg_dump_api.py │ │ ├── rdg_parse_api.py │ │ ├── rdg_scrape.py │ │ ├── rdg_scrape_dockets.py │ │ └── rdg_simple_update.py │ ├── document.py │ └── search.py ├── run.py ├── sec_cftc │ ├── __init__.py │ └── commands │ │ ├── __init__.py │ │ ├── cftc_extract_current.py │ │ ├── cftc_scrape_dockets.py │ │ ├── cftc_scrape_documents.py │ │ ├── sec_cftc_import.py │ │ ├── sec_cftc_name_dockets.py │ │ ├── sec_scrape_dockets.py │ │ ├── sec_scrape_documents.py │ │ └── suppress_duplicates.py └── settings.py └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | .project 3 | .pydevproject 4 | *.pickle 5 | nohup.out 6 | local_settings.py 7 | _test* 8 | oxtail 9 | pytrie 10 | Gemfile.lock -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'http://rubygems.org/' 2 | gem 'nokogiri' 3 | gem 'us-documents' -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015, Sunlight Foundation 2 | 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without modification, 6 | are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright notice, 9 | this list of conditions and the following disclaimer. 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 
13 | * Neither the name of Sunlight Foundation nor the names of its contributors 14 | may be used to endorse or promote products derived from this software 15 | without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 21 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 22 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 23 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 24 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 25 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 26 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # regulations-scraper 2 | 3 | This repo contains scraper code for maintaining a complete copy of all data on [Regulations.gov](http://regulations.gov) (consisting mainly of Federal Register documents and public comments), extracting text from said documents, and doing named entity recognition (using [Oxtail](https://github.com/sunlightlabs/oxtail)) and plagiarism detection/clustering (using [cluster-explorer](https://github.com/sunlightlabs/cluster-explorer)). Additionally, the project includes scrapers for a couple non-participating agencies, the SEC and CFTC, and shoehorns their content into the Regulations.gov data model. -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunlightlabs/regulations-scraper/5f2644a3cf54f915d7d90957645073737ab91022/__init__.py -------------------------------------------------------------------------------- /analysis/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunlightlabs/regulations-scraper/5f2644a3cf54f915d7d90957645073737ab91022/analysis/__init__.py -------------------------------------------------------------------------------- /analysis/export.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import os 5 | import csv 6 | import time 7 | import multiprocessing 8 | from Queue import Empty 9 | from datetime import datetime 10 | from collections import namedtuple 11 | from pymongo import Connection 12 | import StringIO 13 | 14 | pid = os.getpid() 15 | 16 | import_start = time.time() 17 | print '[%s] Loading trie...' % pid 18 | from oxtail.matching import match 19 | print '[%s] Loaded trie in %s seconds.' % (pid, time.time() - import_start) 20 | 21 | F = namedtuple('F', ['csv_column', 'transform']) 22 | 23 | def deep_get(key, dict, default=None): 24 | if '.' 
in key: 25 | first, rest = key.split('.', 1) 26 | return deep_get(rest, dict.get(first, {}), default) 27 | else: 28 | out = dict.get(key, default) 29 | return out if out else default 30 | 31 | def getter(key, default=''): 32 | return lambda d: deep_get(key, d, default) 33 | 34 | 35 | DOCS_QUERY = {'deleted': False} 36 | 37 | DOCS_FIELDS = [ 38 | F('document_id', getter('document_id')), 39 | F('docket_id', getter('docket_id')), 40 | F('agency', getter('agency')), 41 | F('date_posted', getter('details.receive_date', None)), 42 | F('date_due', getter('details.comment_end_date', None)), 43 | F('title', getter('title')), 44 | F('type', getter('type')), 45 | F('org_name', getter('details.organization')), 46 | F('submitter_name', lambda d: ' '.join(filter(bool, [deep_get('details.first_name', d, None), deep_get('details.mid_initial', d, None), deep_get('details.last_name', d, None)]))), 47 | F('on_type', getter('comment_on.type')), 48 | F('on_id', getter('comment_on.id')), 49 | F('on_title', getter('comment_on.title')), 50 | ] 51 | 52 | 53 | def filter_for_postgres(v): 54 | if v is None: 55 | return '\N' 56 | 57 | if isinstance(v, datetime): 58 | return str(v) 59 | 60 | return v.encode('utf8').replace("\.", ".") 61 | 62 | def process_doc(doc, fields=DOCS_FIELDS): 63 | # field extraction 64 | output = { 65 | 'metadata': [filter_for_postgres(f.transform(doc)) for f in fields], 66 | 'matches': [], 67 | 'submitter_matches': [] 68 | } 69 | 70 | # entity extraction 71 | if 'views' in doc and doc['views']: 72 | for view in doc['views']: 73 | if 'extracted' in view and view['extracted'] == True: 74 | for entity_id in match(view['text']).keys(): 75 | # hack to deal with documents whose scrapes failed but still got extracted 76 | object_id = doc['object_id'] if 'object_id' in doc else view['file'].split('/')[-1].split('.')[0] 77 | output['matches'].append([doc['document_id'], object_id, view['type'], 'view', entity_id]) 78 | if 'attachments' in doc and doc['attachments']: 79 | for attachment in doc['attachments']: 80 | if 'views' in attachment and attachment['views']: 81 | for view in attachment['views']: 82 | if 'extracted' in view and view['extracted'] == True: 83 | for entity_id in match(view['text']).keys(): 84 | output['matches'].append([doc['document_id'], attachment['object_id'], view['type'], 'attachment', entity_id]) 85 | 86 | # submitter matches 87 | for entity_id in match('\n'.join([output['metadata'][7], output['metadata'][8]])).keys(): 88 | output['submitter_matches'].append([doc['document_id'], entity_id]) 89 | 90 | return output 91 | 92 | # single-core version 93 | def dump_cursor(c, fields, filename): 94 | metadata_writer = csv.writer(open(sys.argv[3] + '_meta.csv', 'w')) 95 | metadata_writer.writerow([f.csv_column for f in fields]) 96 | 97 | match_writer = csv.writer(open(sys.argv[3] + '_text_matches.csv', 'w')) 98 | match_writer.writerow(['document_id', 'object_id', 'file_type', 'view_type', 'entity_id']) 99 | 100 | submitter_writer = csv.writer(open(sys.argv[3] + '_submitter_matches.csv', 'w')) 101 | submitter_writer.writerow(['document_id', 'entity_id']) 102 | 103 | for doc in c: 104 | doc_data = process_doc(doc) 105 | metadata_writer.writerow(doc_data['metadata']) 106 | match_writer.writerows(doc_data['matches']) 107 | submitter_writer.writerows(doc_data['submitter_matches']) 108 | 109 | # multi-core version and helpers 110 | def write_worker(done_queue, filename, fields=DOCS_FIELDS): 111 | print '[%s] Writer started.' 
% os.getpid() 112 | 113 | metadata_writer = csv.writer(open(sys.argv[3] + '_meta.csv', 'w')) 114 | metadata_writer.writerow([f.csv_column for f in fields]) 115 | 116 | match_writer = csv.writer(open(sys.argv[3] + '_text_matches.csv', 'w')) 117 | match_writer.writerow(['document_id', 'object_id', 'file_type', 'view_type', 'entity_id']) 118 | 119 | submitter_writer = csv.writer(open(sys.argv[3] + '_submitter_matches.csv', 'w')) 120 | submitter_writer.writerow(['document_id', 'entity_id']) 121 | 122 | while True: 123 | try: 124 | doc_data = done_queue.get(timeout=20) 125 | except Empty: 126 | print '[%s] CSV writes complete.' % os.getpid() 127 | return 128 | 129 | metadata_writer.writerow(doc_data['metadata']) 130 | match_writer.writerows(doc_data['matches']) 131 | submitter_writer.writerows(doc_data['submitter_matches']) 132 | 133 | done_queue.task_done() 134 | 135 | def process_worker(todo_queue, done_queue): 136 | print '[%s] Worker started.' % os.getpid() 137 | while True: 138 | try: 139 | doc = todo_queue.get(timeout=20) 140 | except Empty: 141 | print '[%s] Processing complete.' % os.getpid() 142 | return 143 | 144 | doc_data = process_doc(doc) 145 | done_queue.put(doc_data) 146 | 147 | todo_queue.task_done() 148 | 149 | def dump_cursor_multi(c, fields, filename, num_workers): 150 | todo_queue = multiprocessing.JoinableQueue(num_workers * 3) 151 | done_queue = multiprocessing.JoinableQueue(num_workers * 3) 152 | 153 | for i in range(num_workers): 154 | proc = multiprocessing.Process(target=process_worker, args=(todo_queue, done_queue)) 155 | proc.start() 156 | proc = multiprocessing.Process(target=write_worker, args=(done_queue, filename)) 157 | proc.start() 158 | 159 | for doc in c: 160 | todo_queue.put(doc) 161 | 162 | todo_queue.join() 163 | done_queue.join() 164 | 165 | if __name__ == '__main__': 166 | # set up options 167 | from optparse import OptionParser 168 | parser = OptionParser(usage="usage: %prog [options] host dbname file_prefix") 169 | parser.add_option("-l", "--limit", dest="limit", action="store", type="int", default=None, help="Limit number of records for testing.") 170 | parser.add_option("-m", "--multi", dest="multi", action="store", type="int", default=None, help="Set number of worker processes. Single-process model used if not specified.") 171 | 172 | (options, args) = parser.parse_args() 173 | 174 | # fetch options, args 175 | host = args[0] 176 | dbname = args[1] 177 | prefix = args[2] 178 | 179 | # do request and analysis 180 | if options.limit: 181 | cursor = Connection(host=host)[dbname].docs.find(DOCS_QUERY, limit=options.limit) 182 | else: 183 | cursor = Connection(host=host)[dbname].docs.find(DOCS_QUERY) 184 | 185 | run_start = time.time() 186 | print '[%s] Starting analysis...' % pid 187 | 188 | if options.multi: 189 | dump_cursor_multi(cursor, DOCS_FIELDS, prefix, options.multi) 190 | else: 191 | dump_cursor(cursor, DOCS_FIELDS, prefix) 192 | 193 | print '[%s] Completed analysis in %s seconds.' 
% (pid, time.time() - run_start) 194 | -------------------------------------------------------------------------------- /analysis/export_dockets.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import os 5 | import csv 6 | import time 7 | from datetime import datetime 8 | from collections import namedtuple 9 | from pymongo import Connection 10 | 11 | pid = os.getpid() 12 | 13 | DOCKETS_QUERY = {'scraped': True} 14 | 15 | DOCKET_MONGO_FIELDS = ['_id', 'title', 'agency', 'year'] 16 | DOCKET_CSV_FIELDS = ['docket_id', 'title', 'agency', 'year'] 17 | 18 | def filter_for_postgres(v): 19 | if v is None: 20 | return '\N' 21 | 22 | if isinstance(v, datetime): 23 | return str(v) 24 | 25 | return unicode(v).encode('utf8').replace("\.", ".") 26 | 27 | if __name__ == '__main__': 28 | # set up options 29 | from optparse import OptionParser 30 | parser = OptionParser(usage="usage: %prog [options] host dbname file_prefix") 31 | 32 | (options, args) = parser.parse_args() 33 | 34 | # fetch options, args 35 | host = args[0] 36 | dbname = args[1] 37 | prefix = args[2] 38 | 39 | writer = csv.writer(open(sys.argv[3] + '_dockets.csv', 'w')) 40 | writer.writerow(DOCKET_CSV_FIELDS) 41 | 42 | cursor = Connection(host=host)[dbname].dockets.find(DOCKETS_QUERY) 43 | 44 | run_start = time.time() 45 | print '[%s] Starting export...' % pid 46 | 47 | for row in cursor: 48 | writer.writerow([filter_for_postgres(row[field]) for field in DOCKET_MONGO_FIELDS]) 49 | 50 | print '[%s] Completed export in %s seconds.' % (pid, time.time() - run_start) 51 | -------------------------------------------------------------------------------- /analysis/schema.sql: -------------------------------------------------------------------------------- 1 | 2 | CREATE TABLE regulations_comments ( 3 | document_id varchar(32) PRIMARY KEY NOT NULL, 4 | docket_id varchar(32) NOT NULL, 5 | agency varchar(8) NOT NULL, 6 | date date, 7 | text text NOT NULL 8 | ); 9 | 10 | DROP TABLE IF EXISTS regulations_comments_full; 11 | CREATE TABLE regulations_comments_full ( 12 | document_id varchar(64) PRIMARY KEY NOT NULL, 13 | docket_id varchar(64) NOT NULL, 14 | agency varchar(8) NOT NULL, 15 | date_posted date, 16 | date_due date, 17 | title varchar(512) NOT NULL, 18 | type varchar(32), 19 | org_name varchar(255) NOT NULL, 20 | submitter_name varchar(255) NOT NULL, 21 | on_type varchar(32), 22 | on_id varchar(64) NOT NULL, 23 | on_title varchar(512) NOT NULL 24 | ); 25 | CREATE INDEX regulations_comments_full_docket_id ON regulations_comments_full ( docket_id ); 26 | 27 | -- this should replace some fields on the comment 28 | CREATE TABLE regulations_dockets ( 29 | docket_id varchar(64) PRIMARY KEY NOT NULL, 30 | title varchar(512) NOT NULL, 31 | agency varchar(8) NOT NULL, 32 | year smallint 33 | ); 34 | 35 | CREATE TABLE regulations_text_matches ( 36 | document_id varchar(64), 37 | object_id varchar(32), 38 | file_type varchar(16), 39 | view_type varchar(16), 40 | entity_id uuid 41 | ); 42 | 43 | CREATE TABLE regulations_submitter_matches ( 44 | document_id varchar(64), 45 | entity_id uuid 46 | ); 47 | -------------------------------------------------------------------------------- /analysis/tests.py: -------------------------------------------------------------------------------- 1 | 2 | import unittest 3 | 4 | from analysis.export import deep_get 5 | 6 | 7 | class TestDeepGet(unittest.TestCase): 8 | 9 | def test_deep_get(self): 10 | self.assertEqual(None, deep_get('foo', 
{})) 11 | 12 | self.assertEqual(7, deep_get('foo', {'foo': 7})) 13 | self.assertEqual(None, deep_get('bar', {'foo': 7})) 14 | 15 | self.assertEqual(7, deep_get('foo.bar.spaz', {'foo': {'bar': {'spaz': 7}}})) 16 | self.assertEqual({'spaz': 7}, deep_get('foo.bar', {'foo': {'bar': {'spaz': 7}}})) 17 | self.assertEqual(None, deep_get('foo.more.less', {'foo': {'bar': {'spaz': 7}}})) 18 | self.assertEqual(None, deep_get('spaz.more.less', {'foo': {'bar': {'spaz': 7}}})) 19 | 20 | 21 | if __name__ == '__main__': 22 | unittest.main() -------------------------------------------------------------------------------- /auto/fabfile.py: -------------------------------------------------------------------------------- 1 | from fabric.api import * 2 | from ssh_util import * 3 | from collections import OrderedDict 4 | import os, sys, json, datetime 5 | 6 | VERBOSE = False 7 | 8 | TASKS_ALWAYS = [ 9 | ('local', ['rdg_scrape']), 10 | ('local', ['rdg_download']), 11 | ('local', ['extract']), 12 | ('local', ['create_dockets']), 13 | ('local', ['rdg_scrape_dockets']), 14 | ('local', ['match_text']), 15 | ('local', ['add_to_search']), 16 | ] 17 | 18 | TASK_SETS = { 19 | 'major': [ 20 | ('local', ['rdg_dump_api']), 21 | ('local', ['rdg_parse_api']), 22 | ] + TASKS_ALWAYS + [ 23 | ('local', ['run_aggregates', '-A']), 24 | ('remote', ['analyze_regs', '-F']), 25 | ], 26 | 27 | 'minor': [ 28 | ('local', ['rdg_simple_update']), 29 | ] + TASKS_ALWAYS + [ 30 | ('local', ['run_aggregates']), 31 | ('remote', ['analyze_regs', '-F']), 32 | ] 33 | } 34 | 35 | ADMINS = [] 36 | EMAIL_SENDER = '' 37 | EMAIL_API_KEY = '' 38 | LOCK_DIR = '/tmp' 39 | LOG_DIR = '/var/log/scrape' 40 | 41 | try: 42 | from local_settings import * 43 | except: 44 | pass 45 | 46 | def send_email(recipients, subject, message): 47 | from postmark import PMMail 48 | message = PMMail( 49 | to = ','.join(recipients), 50 | subject = '[regs] %s' % subject, 51 | text_body = message, 52 | api_key = EMAIL_API_KEY, 53 | sender = EMAIL_SENDER 54 | ) 55 | message.send(test=False) 56 | 57 | def run_local(command): 58 | os.chdir(os.path.expanduser('~/regulations-scraper/regscrape')) 59 | out = local(' '.join([sys.executable, command]), capture=True) 60 | return out 61 | 62 | def run_remote(command): 63 | with cd('~/sparerib'): 64 | with prefix('source ~/.virtualenvs/sparerib_pypy/bin/activate'): 65 | return run(command) 66 | 67 | def handle_completion(message, results): 68 | output = '%s\nComplete results:\n%s' % (message, json.dumps(results, indent=4)) 69 | print output 70 | 71 | if ADMINS: 72 | send_email(ADMINS, message, output) 73 | 74 | def acquire_lock(): 75 | lock_path = os.path.join(LOCK_DIR, 'regs.lock') 76 | if os.path.exists(lock_path): 77 | raise RuntimeError("Can't acquire lock.") 78 | else: 79 | lock = open(lock_path, 'w') 80 | lock.write(str(os.getpid())) 81 | lock.close() 82 | 83 | def release_lock(): 84 | lock_path = os.path.join(LOCK_DIR, 'regs.lock') 85 | os.unlink(lock_path) 86 | 87 | @hosts(ssh_config('regs-fe')) 88 | def run_regs(start_with=None, end_with=None, task_set=None): 89 | try: 90 | # use a lock file to keep multiple instances from trying to run simultaneously, which, among other things, consumes all of the memory on the high-CPU instance 91 | acquire_lock() 92 | except: 93 | print 'Unable to acquire lock.' 
94 | if ADMINS: 95 | send_email(ADMINS, "Aborting: can't acquire lock", "Can't start processing due to inability to acquire lock.") 96 | 97 | sys.exit(1) 98 | 99 | # get some logging stuff ready 100 | now = datetime.datetime.now() 101 | today = now.date().isoformat() 102 | month = today.rsplit('-', 1)[0] 103 | month_log_path = os.path.join(LOG_DIR, month) 104 | if not os.path.exists(month_log_path): 105 | os.mkdir(month_log_path) 106 | 107 | if not (task_set and task_set in TASK_SETS): 108 | # is it Sunday? 109 | is_sunday = now.weekday() == 6 110 | 111 | # have we run already today? 112 | run_already = len([log_file for log_file in os.listdir(month_log_path) if log_file.startswith(today)]) > 0 113 | 114 | if is_sunday and not run_already: 115 | task_set = 'major' 116 | else: 117 | task_set = 'minor' 118 | all_tasks = TASK_SETS[task_set] 119 | 120 | print 'Starting task set "%s"...' % task_set 121 | 122 | start_with = start_with if start_with is not None else all_tasks[0][1][0] 123 | end_with = end_with if end_with is not None else all_tasks[-1][1][0] 124 | 125 | first_task_idx = [i for i in range(len(all_tasks)) if all_tasks[i][1][0] == start_with][0] 126 | last_task_idx = [i for i in range(len(all_tasks)) if all_tasks[i][1][0] == end_with][0] 127 | tasks = all_tasks[first_task_idx:(last_task_idx+1)] 128 | runners = { 129 | 'remote': run_remote, 130 | 'local': run_local 131 | } 132 | results = OrderedDict() 133 | for func, command in tasks: 134 | try: 135 | output = runners[func](' '.join(['./run.py' if func == 'local' else './manage.py'] + command + ['--parsable'])) 136 | try: 137 | results[command[0]] = json.loads(output) 138 | except ValueError: 139 | results[command[0]] = {'raw_results': output} 140 | if VERBOSE and ADMINS: 141 | send_email(ADMINS, 'Results of %s' % command[0], 'Results of %s:\n%s' % (command[0], json.dumps(results[command[0]], indent=4))) 142 | except SystemExit: 143 | results[command[0]] = 'failed' 144 | handle_completion('Aborting at step: %s' % command[0], results) 145 | if command[0] == "rdg_simple_update": 146 | release_lock() 147 | sys.exit(1) 148 | handle_completion('All steps completed.', results) 149 | 150 | logfile = open(os.path.join(month_log_path, now.isoformat() + ".json"), "w") 151 | logfile.write(json.dumps(results, indent=4)) 152 | logfile.close() 153 | 154 | release_lock() 155 | -------------------------------------------------------------------------------- /auto/requirements.txt: -------------------------------------------------------------------------------- 1 | fabric 2 | python-postmark 3 | -------------------------------------------------------------------------------- /auto/ssh_util.py: -------------------------------------------------------------------------------- 1 | from fabric.api import * 2 | 3 | def ssh_config(host): 4 | from os.path import expanduser 5 | from paramiko.config import SSHConfig 6 | 7 | def hostinfo(host, config): 8 | hive = config.lookup(host) 9 | if 'hostname' in hive: 10 | host = hive['hostname'] 11 | if 'user' in hive: 12 | host = '%s@%s' % (hive['user'], host) 13 | if 'port' in hive: 14 | host = '%s:%s' % (host, hive['port']) 15 | return host 16 | 17 | try: 18 | config_file = file(expanduser('~/.ssh/config')) 19 | except IOError: 20 | pass 21 | else: 22 | config = SSHConfig() 23 | config.parse(config_file) 24 | key = config.lookup(host).get('identityfile', None) 25 | key_filename = expanduser(key) 26 | 27 | env.key_filename = [key_filename] if key_filename else [] 28 | return hostinfo(host, config) 29 | 
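A minimal usage sketch for the ssh_config() helper above, mirroring how auto/fabfile.py consumes it (the "regs-fe" alias is the one fabfile.py targets; the uptime task itself is an illustrative assumption, not code from this repo):

    from fabric.api import hosts, run
    from ssh_util import ssh_config

    # ssh_config('regs-fe') looks the alias up in ~/.ssh/config, points Fabric's
    # env.key_filename at the entry's IdentityFile (if present), and returns a
    # "user@hostname:port" host string built from whatever fields the entry defines.
    @hosts(ssh_config('regs-fe'))
    def uptime():
        run('uptime')

With Fabric 1.x this would be invoked as "fab uptime"; the repo's own pipeline task is started the same way, e.g. "fab run_regs:task_set=minor".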
-------------------------------------------------------------------------------- /duplicates/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunlightlabs/regulations-scraper/5f2644a3cf54f915d7d90957645073737ab91022/duplicates/__init__.py -------------------------------------------------------------------------------- /duplicates/cftc.py: -------------------------------------------------------------------------------- 1 | 2 | import csv 3 | import re 4 | import os 5 | import sys 6 | import cPickle 7 | 8 | from clustering import Clustering 9 | from ngrams import NGramSpace 10 | 11 | def extract_row(row, pdf_path, ngrams): 12 | text = _get_text(row, pdf_path) 13 | date = row['DateTime Submitted'] 14 | if row['Middle Initial']: 15 | name = " ".join([row['First Name'], row['Middle Initial'], row['Last Name']]) 16 | else: 17 | name = " ".join([row['First Name'], row['Last Name']]) 18 | org = row['Organization'] 19 | 20 | return CFTCDocument(name, org, date, text, ngrams) 21 | 22 | 23 | def _get_text(row, pdf_path): 24 | source_file = row['File Name'] 25 | if source_file in ('', 'NULL'): 26 | return row['Comment Text'] 27 | else: 28 | if source_file.lower().endswith('.pdf'): 29 | stripped = source_file[:-4] 30 | elif source_file.lower().endswith('pdf'): 31 | stripped = source_file[:-3] 32 | else: 33 | stripped = source_file 34 | 35 | extraction = " ".join(open(os.path.join(pdf_path, stripped + '.txt'), 'r')) 36 | # as a sanity check, assure that there are at least 5 words 37 | if len(re.split('\W+', extraction)) > 5: 38 | return extraction 39 | else: 40 | return '' 41 | 42 | 43 | class CFTCDocument(object): 44 | 45 | def __init__(self, name, org, date, text, ngrams): 46 | self.name = name 47 | self.org = org 48 | self.date = date 49 | self.text = text 50 | self.parsed = ngrams.parse(self.text) 51 | 52 | def __str__(self): 53 | return "%s (%s)\n%s" % (self.name, self.org, self.text) 54 | 55 | @classmethod 56 | def get_output_headers(self): 57 | return ['name', 'org', 'date', 'text'] 58 | 59 | def get_output_values(self): 60 | return [self.name, self.org, self.date, self.text] 61 | 62 | 63 | def setup(source, pdf_path): 64 | ngrams = NGramSpace(4) 65 | print "parsing documents at %s..." % source 66 | docs = [extract_row(row, pdf_path, ngrams) for row in csv.DictReader(open(source, 'r'))] 67 | print "clustering %d documents..." % len(docs) 68 | clustering = Clustering([doc.parsed for doc in docs]) 69 | return (clustering, docs) 70 | 71 | 72 | if __name__ == '__main__': 73 | (clustering, docs) = setup(sys.argv[1], sys.argv[2]) 74 | print "\nWriting clustering to %s..." 
% sys.argv[3] 75 | cPickle.dump((clustering, docs), open(sys.argv[3], 'wb'), cPickle.HIGHEST_PROTOCOL) 76 | 77 | -------------------------------------------------------------------------------- /duplicates/clustering.py: -------------------------------------------------------------------------------- 1 | from ngrams import jaccard, NGramSpace 2 | import numpy 3 | 4 | 5 | class SymmetricMatrix(object): 6 | 7 | def __init__(self, n): 8 | self.values = numpy.zeros((n, n)) 9 | self.mask = None 10 | 11 | def submatrix(self, ids): 12 | sub = SymmetricMatrix(0) 13 | sub.values = self 14 | sub.mask = ids 15 | return sub 16 | 17 | def translate(self, index): 18 | (i, j) = (max(index), min(index)) 19 | if self.mask: 20 | return (self.mask[i], self.mask[j]) 21 | return (i, j) 22 | 23 | def __getitem__(self, index): 24 | return self.values[self.translate(index)] 25 | 26 | def __setitem__(self, index, value): 27 | self.values[self.translate(index)] = value 28 | 29 | def __len__(self): 30 | if self.mask: 31 | return len(self.mask) 32 | return len(self.values) 33 | 34 | 35 | class PriorityQueue(object): 36 | 37 | def __init__(self, size): 38 | self.size = size 39 | self.data = list() 40 | 41 | def insert(self, value, priority): 42 | i = len(self.data) 43 | while i > 0 and self.data[i - 1][0] > priority: 44 | i -= 1 45 | 46 | if i < self.size: 47 | self.data.insert(i, (priority, value)) 48 | self.data = self.data[:self.size] 49 | 50 | def values(self): 51 | return [value for (priority, value) in self.data] 52 | 53 | 54 | class Clustering(object): 55 | 56 | def __init__(self, docs): 57 | self.num_docs = len(docs) 58 | self.assignments = range(0, self.num_docs) 59 | 60 | self.distance = SymmetricMatrix(self.num_docs) 61 | count = 0 62 | for i in range(0, self.num_docs): 63 | for j in range(0, i + 1): 64 | self.distance[i, j] = 1.0 - jaccard(docs[i], docs[j]) 65 | 66 | count += 1 67 | if count % 1000000 == 0: 68 | print "Computed %d distances out of %d..." 
% (count, self.num_docs * self.num_docs / 2) 69 | 70 | 71 | for i in range(0, self.num_docs): 72 | for j in range(0, i): 73 | if self.distance[i, j] == 0 and self.assignments[i] != self.assignments[j]: 74 | self.merge(i, j) 75 | 76 | 77 | def min_link(self): 78 | min_i = None 79 | min_j = None 80 | min_d = 1.0 81 | 82 | for i in range(0, self.num_docs): 83 | for j in range(0, i): 84 | if self.distance[i, j] <= min_d and self.assignments[i] != self.assignments[j]: 85 | min_i = i 86 | min_j = j 87 | min_d = self.distance[i, j] 88 | 89 | return (min_i, min_j) 90 | 91 | 92 | def closest_neighbors(self, seeds, n=1): 93 | unseeded = [i for i in range(0, self.num_docs) if i not in seeds] 94 | unseeded_distance = [1.0] * len(unseeded) 95 | 96 | for seeded_index in range(0, len(seeds)): 97 | for unseeded_index in range(0, len(unseeded)): 98 | d = self.distance[seeds[seeded_index], unseeded[unseeded_index]] 99 | if d < unseeded_distance[unseeded_index]: 100 | unseeded_distance[unseeded_index] = d 101 | 102 | neighbors = PriorityQueue(n) 103 | for i in range(0, len(unseeded)): 104 | neighbors.insert(unseeded[i], unseeded_distance[i]) 105 | 106 | return neighbors.values() 107 | 108 | 109 | def merge(self, i, j): 110 | cluster_i = self.assignments[i] 111 | cluster_j = self.assignments[j] 112 | 113 | for x in range(0, self.num_docs): 114 | if self.assignments[x] == cluster_j: 115 | self.assignments[x] = cluster_i 116 | 117 | 118 | def get_clusters(self): 119 | mapping = dict([(rep, list()) for rep in set(self.assignments)]) 120 | for i in range(0, self.num_docs): 121 | mapping[self.assignments[i]].append(i) 122 | return mapping 123 | 124 | def get_cluster(self, i): 125 | rep = self.assignments[i] 126 | return [i for i in range(0, self.num_docs) if self.assignments[i] == rep] 127 | 128 | def _view(self, ids): 129 | if ids: 130 | return self.distance.submatrix(ids) 131 | return self.distance 132 | 133 | def pp_distance(self, ids): 134 | """ Pretty-print the distances between given docs. """ 135 | 136 | view = self._view(ids) 137 | 138 | print '\t' + '\t'.join([str(id) for id in ids]) 139 | for i in range(0, len(view)): 140 | distances = [view[i, j] for j in range(0, i)] 141 | print "%d:\t%s" % (ids[i], '\t'.join(['{0:.3}'.format(d) for d in distances])) 142 | 143 | (min, avg, max) = ['{0:.3}'.format(s) for s in self.stats(ids)] 144 | print "min/avg/max = %s / %s / %s" % (min, avg, max) 145 | 146 | def closest_pair(self, ids=None, farthest=False): 147 | view = self._view(ids) 148 | 149 | # set mins to first entry to be scanned... 
150 | # that way if it turns out to be the actual min, we won't be left w/ Nones 151 | min_i = ids[1] 152 | min_j = ids[0] 153 | min_d = view[1, 0] 154 | 155 | for i in range(0, len(view)): 156 | for j in range(0, i): 157 | if (view[i, j] >= min_d) if farthest else (view[i, j] <= min_d): 158 | min_i = ids[i] 159 | min_j = ids[j] 160 | min_d = view[i, j] 161 | 162 | return (min_i, min_j) 163 | 164 | def farthest_pair(self, ids=None): 165 | return self.closest_pair(ids, farthest=True) 166 | 167 | def stats(self, ids): 168 | if len(ids) < 2: 169 | return (0.0, 0.0, 0.0) 170 | 171 | view = self._view(ids) 172 | distances = list() 173 | 174 | for i in range(0, len(view)): 175 | for j in range(0, i): 176 | distances.append(view[i, j]) 177 | 178 | return (min(distances), sum(distances) / float(len(distances)), max(distances)) 179 | 180 | 181 | 182 | -------------------------------------------------------------------------------- /duplicates/db.py: -------------------------------------------------------------------------------- 1 | import re 2 | import csv 3 | from pymongo import Connection 4 | from BeautifulSoup import BeautifulSoup, Tag, NavigableString 5 | from clustering import NGramSpace, Clustering 6 | 7 | 8 | DOCUMENT_URL = 'http://www.regulations.gov/#!documentDetail;D=' 9 | 10 | class RegsDocument(object): 11 | 12 | def __init__(self, mongo_doc, ngrams): 13 | self.mongo_doc = mongo_doc 14 | self.comment = get_comment(mongo_doc) 15 | self.parsed = ngrams.parse(self.comment) 16 | self.url = DOCUMENT_URL + mongo_doc['Document ID'] 17 | self.title = mongo_doc['Details'].get('Title', '') if 'Details' in mongo_doc else '' 18 | 19 | def __str__(self): 20 | return "%s\n%s\n%s" % (self.title, self.url, self.comment) 21 | 22 | def get_id(self): 23 | return self.url 24 | 25 | 26 | def extract_html_comment(comment): 27 | soup = BeautifulSoup(comment) 28 | 29 | comment_header = soup.find('h2', text='General Comment').parent 30 | 31 | comment = '' 32 | 33 | for node in comment_header.findNextSiblings(): 34 | if node.name == 'h2': 35 | break 36 | 37 | comment += ''.join(strip_tags(node)) 38 | 39 | return comment 40 | 41 | def extract_comment(comment): 42 | pattern = 'Comments? 
?(?:\*+|:)(.*?)(?:===+.*)?$' 43 | match = re.search(pattern, comment, re.DOTALL) 44 | if match: 45 | return match.group(1).strip() 46 | 47 | return comment.strip() 48 | 49 | 50 | def strip_tags(node): 51 | if type(node) is NavigableString: 52 | return str(node) 53 | else: 54 | return ''.join(map(strip_tags, node.contents)) 55 | 56 | # todo: update with other types, fallback for unknown types 57 | VIEW_PREFERENCE = ['crtext', 'msw8', 'pdf'] 58 | 59 | def get_comment(doc): 60 | for label in VIEW_PREFERENCE: 61 | views = [v.get('text', '') for v in doc.get('views', []) if v.get('type', '') == label and v.get('decoded')] 62 | if views: 63 | return extract_comment(views[0]) 64 | 65 | return '' 66 | 67 | 68 | def docs_2_csv(docs, filename): 69 | writer = csv.writer(open(filename, 'w')) 70 | 71 | writer.writerow(['id', 'title', 'url', 'text']) 72 | 73 | for i in range(0, len(docs)): 74 | writer.writerow([i, docs[i].title.encode('utf8', 'ignore'), docs[i].url, docs[i].comment.encode('utf8', 'ignore')]) 75 | 76 | 77 | def get_texts(ngrams): 78 | c = Connection() 79 | docs = c.regulations.docs.find() 80 | return [RegsDocument(d, ngrams) for d in docs] 81 | 82 | 83 | def setup(): 84 | ngrams = NGramSpace(4) 85 | docs = get_texts(ngrams) 86 | clustering = Clustering([doc.parsed for doc in docs]) 87 | return (clustering, docs) 88 | 89 | -------------------------------------------------------------------------------- /duplicates/interactive.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import cPickle 4 | import csv 5 | 6 | from clustering import * 7 | from cftc import CFTCDocument 8 | #from db import RegsDocument 9 | 10 | 11 | def format_stats(size, stats): 12 | (min, avg, max) = ['{0:.3}'.format(s) for s in stats] 13 | return "%s documents: min/avg/max = %s / %s / %s" % (size, min, avg, max) 14 | 15 | 16 | def cluster_loop(clustering, docs): 17 | previous_seed = None 18 | 19 | while True: 20 | (seed, _) = clustering.min_link() 21 | 22 | if seed is None: 23 | print "All elements in single cluster." 24 | break 25 | if seed == previous_seed: 26 | print "Done clustering." 27 | break 28 | previous_seed = seed 29 | 30 | print "\n%s\n" % ('=' * 80) 31 | print "Initial document:\n" 32 | print docs[seed] 33 | 34 | exponential_loop(clustering, seed, docs) 35 | 36 | 37 | def exponential_loop(clustering, seed, docs): 38 | step_size = 1 39 | current_cluster = clustering.get_cluster(seed) 40 | current_stats = clustering.stats(current_cluster) 41 | 42 | while True: 43 | potential_reps = clustering.closest_neighbors(current_cluster, step_size) 44 | if not potential_reps: 45 | print "Nothing left to add to cluster." 46 | break 47 | 48 | potential_cluster = list(set(reduce(lambda x, y: x + y, map(clustering.get_cluster, potential_reps)))) 49 | potential_cluster.sort() 50 | combined_stats = clustering.stats(current_cluster + potential_cluster) 51 | 52 | avg_sim_before = 1 - current_stats[1] 53 | avg_sim_after = 1 - combined_stats[1] 54 | # removed per Nancy's request 55 | # if avg_sim_after < .5 * avg_sim_before: 56 | # print "*** Average distance increased too much. Stopping clustering automatically. 
***" 57 | # break 58 | 59 | print "\n%s\n" % ('-' * 80) 60 | print "Sample doc to cluster:" 61 | print docs[potential_reps[-1]] 62 | print "" 63 | print "Existing cluster\t%s" % format_stats(len(current_cluster), current_stats) 64 | print "Combined cluster\t%s" % format_stats(len(current_cluster) + len(potential_cluster), combined_stats) 65 | 66 | while True: 67 | choice = raw_input("Cluster? [Y/n] ").lower() 68 | if choice in ('', 'y', 'n'): 69 | break 70 | 71 | if choice in ('', 'y'): 72 | for rep in potential_reps: 73 | clustering.merge(seed, rep) 74 | 75 | step_size *= 2 76 | current_cluster = clustering.get_cluster(seed) 77 | current_stats = clustering.stats(current_cluster) 78 | else: 79 | if step_size == 1: 80 | break 81 | else: 82 | step_size = 1 83 | 84 | 85 | def dump_to_csv(clustering, docs, filename): 86 | writer = csv.writer(open(filename, 'w')) 87 | writer.writerow(['cluster number', 'document number'] + docs[0].get_output_headers()) 88 | 89 | clusters = [c for c in clustering.get_clusters().values() if len(c) > 1] 90 | clusters.sort(key=len, reverse=True) 91 | 92 | for i in range(0, len(clusters)): 93 | for d in clusters[i]: 94 | writer.writerow([i, d] + docs[d].get_output_values()) 95 | 96 | return writer 97 | 98 | 99 | def main(filename): 100 | print "Reading existing clustering from %s..." % filename 101 | (clustering, docs) = cPickle.load(open(filename, 'rb')) 102 | 103 | try: 104 | cluster_loop(clustering, docs) 105 | except KeyboardInterrupt: 106 | pass 107 | 108 | print "\nWriting clustering to %s..." % filename 109 | cPickle.dump((clustering, docs), open(filename, 'wb'), cPickle.HIGHEST_PROTOCOL) 110 | 111 | dump_to_csv(clustering, docs, filename + '.csv') 112 | 113 | 114 | if __name__ == '__main__': 115 | main(sys.argv[1]) 116 | 117 | -------------------------------------------------------------------------------- /duplicates/ngrams.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | class Sequencer(object): 5 | 6 | def __init__(self): 7 | self.next_id = 1 8 | self.data = dict() 9 | 10 | def id(self, x): 11 | existing_id = self.data.get(x, None) 12 | 13 | if existing_id: 14 | return existing_id 15 | 16 | self.data[x] = self.next_id 17 | self.next_id += 1 18 | 19 | return self.next_id - 1 20 | 21 | 22 | 23 | class NGramSpace(object): 24 | 25 | def __init__(self, n): 26 | self.n = n 27 | self.ngrams = Sequencer() 28 | 29 | def parse(self, text): 30 | normalized_text = re.sub('\W', ' ', text.lower()) 31 | split_text = normalized_text.split() 32 | 33 | ids = set() 34 | 35 | for i in range(0, len(split_text) + 1 - self.n): 36 | ngram = " ".join(split_text[i:i+self.n]) 37 | ids.add(self.ngrams.id(ngram)) 38 | 39 | sorted_ids = list(ids) 40 | sorted_ids.sort() 41 | 42 | return sorted_ids 43 | 44 | 45 | def overlap(x, y): 46 | i = 0 47 | j = 0 48 | 49 | c = 0 50 | 51 | len_x = len(x) 52 | len_y = len(y) 53 | 54 | while i < len_x and j < len_y: 55 | if x[i] > y[j]: 56 | j += 1 57 | elif x[i] < y[j]: 58 | i += 1 59 | else: # x[i] == y[j] 60 | c += 1 61 | i += 1 62 | j += 1 63 | 64 | return c 65 | 66 | 67 | def jaccard(x, y): 68 | intersection_size = overlap(x, y) 69 | union_size = len(x) + len(y) - intersection_size 70 | 71 | return float(intersection_size) / union_size if union_size != 0 else 0 72 | 73 | -------------------------------------------------------------------------------- /ec2/README: -------------------------------------------------------------------------------- 1 | These are scripts to rapidly get a 
new scraping instance up and running on Amazon EC2. 2 | -------------------------------------------------------------------------------- /ec2/install-deps.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | apt-get update 3 | apt-get install -y build-essential python2.7-dev git mercurial python-pip python-virtualenv virtualenvwrapper puf openbox libxslt1-dev libxml2-dev zlib1g-dev html2text poppler-utils ghostscript antiword catdoc libjpeg8-dev libwpd-tools unrtf 4 | 5 | mkdir /tmp/tesseract 6 | cd /tmp/tesseract 7 | wget http://ppa.launchpad.net/alex-p/notesalexp/ubuntu/pool/main/t/tesseract/tesseract-ocr_3.0.0+svn590-2ppa1~maverick1_amd64.deb 8 | wget http://ppa.launchpad.net/alex-p/notesalexp/ubuntu/pool/main/t/tesseract/tesseract-ocr-eng_3.0.0+svn590-2ppa1~maverick1_all.deb 9 | dpkg -i *.deb 10 | cd 11 | rm -rf /tmp/tesseract 12 | apt-get install -f 13 | if [ ! -f /usr/lib/liblept.so.0 ]; then 14 | ln -s /usr/lib/liblept.so.1 /usr/lib/liblept.so.0 15 | fi 16 | -------------------------------------------------------------------------------- /ec2/run-x.sh: -------------------------------------------------------------------------------- 1 | nohup Xvfb & 2 | export DISPLAY=:0 3 | 4 | nohup openbox & 5 | -------------------------------------------------------------------------------- /ec2/setup-env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /etc/bash_completion 4 | if [ ! -d $HOME/.virtualenvs ]; then 5 | mkdir $HOME/.virtualenvs 6 | fi 7 | mkvirtualenv scraper 8 | workon scraper 9 | 10 | pip install -r ../requirements.txt 11 | -------------------------------------------------------------------------------- /one_offs/copy_agency/cp.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import pymongo 3 | import gridfs 4 | 5 | source_db = pymongo.Connection(port=27019).regulations 6 | dest_db = pymongo.Connection().regulations_demo 7 | 8 | source_fs = gridfs.GridFS(source_db, 'files') 9 | dest_fs = gridfs.GridFS(dest_db, 'files') 10 | 11 | agency = sys.argv[1] 12 | 13 | for doc in source_db.docs.find({'agency': agency}): 14 | print "Copying document %s..." % doc['_id'] 15 | 16 | for attachment in [doc] + doc.get('attachments', []): 17 | for view in attachment.get('views', []): 18 | content_id = view.get('content', None) 19 | if content_id and source_fs.exists(content_id) and not dest_fs.exists(content_id): 20 | print "Copying file %s..." 
% content_id 21 | dest_fs.put(source_fs.get(content_id), _id=content_id) 22 | dest_db.docs.save(doc) -------------------------------------------------------------------------------- /one_offs/dodd_frank/agencies.py: -------------------------------------------------------------------------------- 1 | import pymongo 2 | import csv 3 | 4 | db = pymongo.Connection().regulations_models 5 | 6 | out = csv.DictWriter(open("agencies.csv", "w"), fieldnames=['agency', 'submitters', 'mentioned']) 7 | out.writeheader() 8 | 9 | for agency in db.agencies.find(): 10 | row = {} 11 | row['agency'] = agency['_id'] 12 | 13 | #row['fr_docs'] = ";".join(["%s (%s)" % (doc['id'], doc['title']) for doc in agency['stats']['fr_docs']]) 14 | 15 | row['submitters'] = "; ".join([ 16 | '%s (%s): %s' % (db.entities.find({'_id': item[0]})[0]['aliases'][0], item[0], item[1]) 17 | for item in sorted(agency['stats']['submitter_entities'].items(), key=lambda i: i[1], reverse=True) 18 | ]) 19 | 20 | row['mentioned'] = "; ".join([ 21 | '%s (%s): %s' % (db.entities.find({'_id': item[0]})[0]['aliases'][0], item[0], item[1]) 22 | for item in sorted(agency['stats']['text_entities'].items(), key=lambda i: i[1], reverse=True) 23 | ]) 24 | 25 | out.writerow(row) 26 | -------------------------------------------------------------------------------- /one_offs/dodd_frank/dockets.py: -------------------------------------------------------------------------------- 1 | import pymongo 2 | import csv 3 | import datetime 4 | 5 | from regscrape.models import * 6 | db = Doc._get_db() 7 | 8 | out = csv.DictWriter(open("dockets.csv", "w"), fieldnames=['title', 'docket_id', 'agency', 'fr_docs', 'submitters', 'mentioned', 'num_comments', 'num_public_interest_comments', 'num_bank_comments', 'num_law_firm_comments', 'word_count', 'last_seven']) 9 | out.writeheader() 10 | 11 | public_groups = ["ab09bc57f97b483391c483cbbdc479c8", "9c422ba85ac649269ce42804a6827059", "f8a9c531807f4585b1d5c73040b3c0fb", "f90cb1c4490344feba2ca83c2d3dd931", "be74818419c84a87b2b99c173aaea26d", "c58e0c68a7754ee2bd909fa68cecee7a", "2f0920a5271d41a7a85c4a7946775390", "10c585fd7f9d4cd1a82265e151b12f9e", "e31bfef434e9470b9e473d6182f2d021", "174a89892823486aad4538033fe0e8c7", "fb702029157e4c7c887172eba71c66c5", "6e5348b28f7242aab5437e0a34758350", "f89c8e3ab2b44f72971f91b764868231", "219154488de945e781330db65a54e1f4", "c5fe2c9b5a6c46fc8faeb34b8df6524f", "4536032e5d1d47248a5eddb86ce1a7f1", "23a8fb4b1188414ea125e06f34dc7df7", "3b14c79d157c4a8ab7e1bd7fdc589544"] 12 | banks = ["5202316fe79343a09a31e8c6c31ebe3d", "bc1d056e59334c07bb0761b97efa64e4", "793070ae7f5e42c2a76a58663a588f3d", "4e6bc9a6b7dc40d7b9b00d58c0e359db", "8d93cebae445485f9af02676a2d71b3f", "91f9a88888d744da8d433018cf912460", "c28bf9dd2a0b4ac19408b645ecc74a7a", "71c49bc56b3a4d369e727fd22744876a", "597eccfe48784677a437569ff6293097", "9bea23144b304a31a790a6c3a9e5f9e6", "878b4d98431344de88d8fb9757043a95", "8c007e162ca1450cbe7f976732a9a770", "c86403b874ea4d9390574088a2973705", "46ff48813fc34247b8d31e22a13663c5", "1fecb472df7444d3822e784f1a0845e6", "c24ef68246554310aa03888ea10cd9bf", "8376751efebe4687b70c84b7c33e3c74", "31e6e04b59084d5c9b09c102680bcc32", "b6a33d8be4784be58c69e1e487a3ed8b", "162122d165e24747b2d7ebb064d7142f"] 13 | law_firms = ["28f4d347bbae4d738aa3199346cf6850", "555e92b13c6640288ef76ee7c2bae09f", "783f8ace8f5d4a3c8a29c7d02b9a336f"] 14 | one_week = datetime.timedelta(days=7) 15 | 16 | for agency in db.dockets.find(): 17 | row = {} 18 | row['title'] = agency['title'].encode('ascii', errors='ignore') 19 | 
row['docket_id'] = agency['_id'] 20 | row['agency'] = agency['agency'] 21 | 22 | row['fr_docs'] = "; ".join(["%s (%s)" % (doc['id'], doc['title'].encode('ascii', errors='ignore')) for doc in agency['stats']['fr_docs']]) 23 | 24 | row['submitters'] = "; ".join([ 25 | '%s (%s): %s' % (db.entities.find({'_id': item[0]})[0]['aliases'][0].encode('ascii', errors='ignore'), item[0], item[1]) 26 | for item in sorted(agency['stats']['submitter_entities'].items(), key=lambda i: i[1], reverse=True) 27 | ]) 28 | 29 | row['mentioned'] = "; ".join([ 30 | '%s (%s): %s' % (db.entities.find({'_id': item[0]})[0]['aliases'][0].encode('ascii', errors='ignore'), item[0], item[1]) 31 | for item in sorted(agency['stats']['text_entities'].items(), key=lambda i: i[1], reverse=True) 32 | ]) 33 | 34 | row['num_comments'] = agency['stats']['type_breakdown'].get('public_submission', 0) 35 | 36 | row['num_public_interest_comments'] = sum([agency['stats']['submitter_entities'].get(entity, 0) for entity in public_groups]) 37 | row['num_bank_comments'] = sum([agency['stats']['submitter_entities'].get(entity, 0) for entity in banks]) 38 | row['num_law_firm_comments'] = sum([agency['stats']['submitter_entities'].get(entity, 0) for entity in law_firms]) 39 | 40 | last_seven = 0 41 | word_count = 0 42 | for doc in db.docs.find({'docket_id': agency['_id'], 'type': 'public_submission'}): 43 | if doc['views']: 44 | word_count += max([len(View._from_son(view).content.read()) for view in doc['views'] if view['extracted'] == 'yes'] or [0]) 45 | 46 | if doc.get('attachments', []): 47 | for attachment in doc['attachments']: 48 | if attachment['views']: 49 | word_count += max([len(View._from_son(view).content.read()) for view in attachment['views'] if view['extracted'] == 'yes'] or [0]) 50 | 51 | date = doc.get('details', {}).get('Date_Posted', None) 52 | if date and agency['stats']['date_range'][1]: 53 | if date > agency['stats']['date_range'][1] - one_week: 54 | last_seven += 1 55 | 56 | 57 | row['word_count'] = word_count 58 | row['last_seven'] = last_seven 59 | 60 | out.writerow(row) 61 | -------------------------------------------------------------------------------- /one_offs/dodd_frank/dump.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import sys 3 | 4 | dockets = [docket.strip() for docket in open(sys.argv[1])] 5 | 6 | for docket in dockets: 7 | p = subprocess.Popen(["./run.py", 'rdg_dump_api', '-d', docket]) 8 | p.communicate() -------------------------------------------------------------------------------- /one_offs/dodd_frank/parse.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import sys 3 | 4 | dockets = [docket.strip() for docket in open(sys.argv[1])] 5 | 6 | for docket in dockets: 7 | try: 8 | p = subprocess.Popen(["./run.py", 'rdg_parse_api', '-d', docket, '-A']) 9 | p.communicate() 10 | except: 11 | print "failed %s" % docket -------------------------------------------------------------------------------- /one_offs/dodd_frank/regscrape: -------------------------------------------------------------------------------- 1 | ../../regscrape -------------------------------------------------------------------------------- /one_offs/dodd_frank/settings.py: -------------------------------------------------------------------------------- 1 | ../../regscrape/settings.py -------------------------------------------------------------------------------- /one_offs/lightsquared/download_files.py: 
-------------------------------------------------------------------------------- 1 | from optparse import OptionParser 2 | arg_parser = OptionParser() 3 | 4 | def run(options, args): 5 | import json, os, sys 6 | from regscrape_lib.transfer import bulk_download 7 | 8 | if len(args) > 1: 9 | metadata_path = args[0] 10 | out_path = args[1] 11 | else: 12 | print "Specify files" 13 | sys.exit(0) 14 | 15 | input = json.load(open(metadata_path, 'r')) 16 | 17 | download_path = os.path.join(os.path.dirname(metadata_path), 'downloads') 18 | 19 | def download_generator(): 20 | for record in input: 21 | for document in record['documents']: 22 | num = document['url'].split('=').pop() + '.pdf' 23 | yield (document['url'], os.path.join(download_path, num), document) 24 | 25 | def status_func(status, url, filename, record): 26 | if status[0]: 27 | record['filename'] = 'downloads/' + filename.split('downloads/').pop() 28 | else: 29 | record['filename'] = False 30 | record['download_error'] = status[1] 31 | 32 | bulk_download(download_generator(), status_func, retries=2, verbose=True) 33 | 34 | date_handler = lambda obj: obj.isoformat() if hasattr(obj, 'isoformat') else None 35 | open(out_path, 'w').write(json.dumps(input, default=date_handler, indent=4)) -------------------------------------------------------------------------------- /one_offs/lightsquared/extract_text.py: -------------------------------------------------------------------------------- 1 | from optparse import OptionParser 2 | arg_parser = OptionParser() 3 | 4 | def run(options, args): 5 | import json, os, sys 6 | from regscrape_lib.extraction import serial_bulk_extract 7 | 8 | if len(args) > 1: 9 | metadata_path = args[0] 10 | out_path = args[1] 11 | else: 12 | print "Specify files" 13 | sys.exit(0) 14 | 15 | input = json.load(open(metadata_path, 'r')) 16 | 17 | file_path = os.path.dirname(metadata_path) 18 | 19 | def extract_generator(): 20 | for record in input: 21 | for document in record['documents']: 22 | yield (os.path.join(file_path, document['filename']), 'pdf', document) 23 | 24 | def status_func(status, text, filename, filetype, used_ocr, record): 25 | if status[0]: 26 | record['text'] = text 27 | else: 28 | record['text'] = None 29 | record['extraction_error'] = status[1] 30 | 31 | serial_bulk_extract(extract_generator(), status_func, verbose=True) 32 | 33 | date_handler = lambda obj: obj.isoformat() if hasattr(obj, 'isoformat') else None 34 | open(out_path, 'w').write(json.dumps(input, default=date_handler, indent=4)) -------------------------------------------------------------------------------- /one_offs/lightsquared/get_metadata.py: -------------------------------------------------------------------------------- 1 | from optparse import OptionParser 2 | arg_parser = OptionParser() 3 | 4 | def run(options, args): 5 | import zipfile, sys, datetime, re, json 6 | from lxml import etree 7 | 8 | if len(args) > 1: 9 | wbk_path = args[0] 10 | out_path = args[1] 11 | else: 12 | print "Specify files" 13 | sys.exit(0) 14 | 15 | wbk = zipfile.ZipFile(wbk_path, 'r') 16 | sheet = wbk.open("content.xml", 'r') 17 | 18 | document = etree.parse(sheet) 19 | 20 | ns_map = {'table': "urn:oasis:names:tc:opendocument:xmlns:table:1.0"} 21 | bools = {'Y': True, 'N': False} 22 | date_re = re.compile(r'\d{2}/\d{2}/\d{4}') 23 | date_handler = lambda obj: obj.isoformat() if hasattr(obj, 'isoformat') else None 24 | link_re = re.compile(r'of:=HYPERLINK\("(?P<url>[\w:/?=\.]*)";"(?P<title>[\w\s\(\)]*)"\)') 25 | 26 | rows = document.findall("//table:table-row", ns_map) 27 | 28 
| # handle the first row 29 | fields = [] 30 | for cell in rows[0].findall("table:table-cell", ns_map): 31 | text_nodes = cell.getchildren() 32 | if text_nodes and text_nodes[0].text: 33 | fields.append(text_nodes[0].text.lower().replace(' ', '_')) 34 | 35 | out = [] 36 | for row in rows[1:]: 37 | row_data = {'documents': []} 38 | cells = row.findall("table:table-cell", ns_map) 39 | for i in xrange(len(cells)): 40 | cell = cells[i] 41 | 42 | text_nodes = cell.getchildren() 43 | if "{urn:oasis:names:tc:opendocument:xmlns:table:1.0}formula" in cell.keys(): 44 | # looks like a link 45 | hyperlink = cell.attrib["{urn:oasis:names:tc:opendocument:xmlns:table:1.0}formula"] 46 | match = link_re.match(hyperlink) 47 | if not match: 48 | print hyperlink 49 | print 'failed to parse link %s' % hyperlink 50 | sys.exit() 51 | row_data['documents'].append(match.groupdict()) 52 | elif text_nodes and text_nodes[0].text: 53 | # looks like plain text 54 | text = text_nodes[0].text 55 | 56 | # fix dates 57 | if date_re.match(text): 58 | text = datetime.datetime.strptime(text, '%m/%d/%Y').date() 59 | 60 | # fix booleans: 61 | if text in bools: 62 | text = bools[text] 63 | 64 | row_data[fields[i]] = text 65 | 66 | if len(row_data.keys()) > 1: 67 | out.append(row_data) 68 | 69 | open(out_path, 'w').write(json.dumps(out, default=date_handler, indent=4)) -------------------------------------------------------------------------------- /one_offs/pdf_repair/detect_pdfs.py: -------------------------------------------------------------------------------- 1 | GEVENT = False 2 | 3 | def run(): 4 | from regs_models import Doc 5 | import json 6 | from regs_common.processing import * 7 | 8 | problems = set() 9 | for finder in [find_views, find_attachment_views]: 10 | for view_d in finder(mode="html", type="pdf", extracted="yes"): 11 | content = view_d['view'].content.read() 12 | if not content: 13 | print "Weird:", view_d['doc'] 14 | elif html_is_empty(content): 15 | print "Problem:", view_d['doc'] 16 | problems.add(view_d['doc']) 17 | else: 18 | print "OK:", view_d['doc'] 19 | 20 | print "%s problems" % len(problems) 21 | outfile = open("/tmp/problems.json", "w") 22 | json.dump(sorted(list(problems)), outfile) -------------------------------------------------------------------------------- /one_offs/pdf_repair/fix_pdfs.py: -------------------------------------------------------------------------------- 1 | GEVENT = False 2 | 3 | from regs_models import Doc 4 | import json 5 | import itertools 6 | 7 | def split_seq(iterable, size): 8 | it = iter(iterable) 9 | item = list(itertools.islice(it, size)) 10 | while item: 11 | yield item 12 | item = list(itertools.islice(it, size)) 13 | 14 | all_ids = json.load(open("/tmp/problems.json")) 15 | for ids in split_seq(all_ids, 1000): 16 | for doc in Doc.objects(id__in=ids): 17 | for view in doc.views: 18 | if view.type == "pdf" and view.mode == "html" and view.extracted == "yes": 19 | view.extracted = "no" 20 | view.content.delete() 21 | for attachment in doc.attachments: 22 | for view in attachment.views: 23 | if view.type == "pdf" and view.mode == "html" and view.extracted == "yes": 24 | view.extracted = "no" 25 | view.content.delete() 26 | doc.in_search_index = False 27 | doc.in_cluster_db = False 28 | doc.entities_last_extracted = None 29 | 30 | print "Repaired %s" % doc.id 31 | doc.save() -------------------------------------------------------------------------------- /regscrape/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sunlightlabs/regulations-scraper/5f2644a3cf54f915d7d90957645073737ab91022/regscrape/__init__.py -------------------------------------------------------------------------------- /regscrape/pipeline.py: -------------------------------------------------------------------------------- 1 | import settings 2 | import pymongo 3 | import json 4 | import sys 5 | import os 6 | import subprocess 7 | import time 8 | import signal 9 | import datetime 10 | 11 | DEFAULT_SEQUENCE = [ 12 | 'rdg_dump_api', 13 | 'rdg_parse_api', 14 | 'rdg_scrape', 15 | 'rdg_download', 16 | 'extract', 17 | 'create_dockets', 18 | 'rdg_scrape_dockets', 19 | 'add_to_search', 20 | 'run_aggregates' 21 | ] 22 | OVERRIDE_SEQUENCES = {} 23 | FLAGS = { 24 | 'scrape': ['-m', '8'], 25 | 'scrape_dockets': ['-m', '8'] 26 | } 27 | 28 | running = {} 29 | 30 | db = pymongo.Connection(**settings.DB_SETTINGS)[settings.DB_NAME] 31 | pid = os.getpid() 32 | 33 | enabled = True 34 | def sigint_handler(signum, frame): 35 | global enabled 36 | enabled = False 37 | print "Caught SIGINT; will exit after current tasks are complete." 38 | signal.signal(signal.SIGINT, sigint_handler) 39 | signal.signal(signal.SIGHUP, sigint_handler) 40 | 41 | def preexec_function(): 42 | # Ignore the SIGINT signal by setting the handler to the standard 43 | # signal handler SIG_IGN. 44 | signal.signal(signal.SIGINT, signal.SIG_IGN) 45 | 46 | # start by resetting failures 47 | for agency_record in db.pipeline.find(): 48 | sequence = OVERRIDE_SEQUENCES.get(agency_record['_id'], DEFAULT_SEQUENCE) 49 | completed = agency_record['completed'] 50 | failed_idxs = [i for i in xrange(len(sequence)) if sequence[i] in completed and type(completed[sequence[i]]) != dict] 51 | if failed_idxs: 52 | print "Resetting everything for agency %s after command %s" % (agency_record['_id'], sequence[failed_idxs[0]]) 53 | for command in sequence[failed_idxs[0]:]: 54 | if command in completed: 55 | print "Resetting %s" % command 56 | del completed[command] 57 | db.pipeline.update({'_id': agency_record['_id']}, {'$set': {'completed': completed}}, safe=True) 58 | 59 | while True: 60 | now = str(datetime.datetime.now()) 61 | print "[%s] TICK %s" % (now, pid) 62 | 63 | # book-keep already started processes 64 | for command, info in running.items(): 65 | agency, proc = info 66 | if proc.poll() is not None: 67 | print "[%s] %s has finished command %s" % (now, agency, command) 68 | results = proc.stdout.read() 69 | try: 70 | parsed = json.loads(results) 71 | except ValueError: 72 | parsed = "parse_failure" 73 | 74 | db.pipeline.update({'_id': agency}, {'$set': {('completed.' 
+ command): parsed}}, safe=True) 75 | del running[command] 76 | 77 | # start up new ones as necessary, assuming we're still going 78 | if enabled: 79 | for agency_record in db.pipeline.find().sort('count'): 80 | agency = agency_record['_id'] 81 | 82 | sequence = OVERRIDE_SEQUENCES.get(agency, DEFAULT_SEQUENCE) 83 | completed = agency_record['completed'].keys() 84 | to_do = [command for command in sequence if command not in completed] 85 | 86 | if not to_do: 87 | continue 88 | 89 | next = to_do[0] 90 | if next not in running: 91 | full_command = [sys.executable, './run.py', next] + FLAGS.get(next, []) + ['-a', agency, '--parsable'] 92 | proc = subprocess.Popen(full_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, preexec_fn=preexec_function) 93 | running[next] = (agency, proc) 94 | print '[%s] %s has started command %s' % (now, agency, next) 95 | 96 | if not running.keys(): 97 | print 'Nothing left to do; exiting.' 98 | break 99 | 100 | time.sleep(1) -------------------------------------------------------------------------------- /regscrape/regs_common/__init__.py: -------------------------------------------------------------------------------- 1 | # lifted some logging code from the examples in the Python docs 2 | 3 | import logging 4 | 5 | # create logger 6 | logger = logging.getLogger("regscrape") 7 | logger.setLevel(logging.DEBUG) 8 | 9 | # create console handler and set level to debug 10 | ch = logging.StreamHandler() 11 | ch.setLevel(logging.DEBUG) 12 | 13 | # create formatter 14 | formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") 15 | 16 | # add formatter to ch 17 | ch.setFormatter(formatter) 18 | 19 | # add ch to logger 20 | logger.addHandler(ch) 21 | 22 | # also log pykka stuff if DEBUG is true 23 | import settings 24 | 25 | if settings.DEBUG: 26 | for ext_logger_name in ['pykka', 'remote_connection']: 27 | ext_logger = logging.getLogger(ext_logger_name) 28 | ext_logger.setLevel(logging.DEBUG) 29 | ext_logger.addHandler(ch) 30 | 31 | # add self to path 32 | import sys 33 | import os 34 | CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) 35 | if CURRENT_DIR not in sys.path: 36 | sys.path.append(CURRENT_DIR) 37 | -------------------------------------------------------------------------------- /regscrape/regs_common/commands/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunlightlabs/regulations-scraper/5f2644a3cf54f915d7d90957645073737ab91022/regscrape/regs_common/commands/__init__.py -------------------------------------------------------------------------------- /regscrape/regs_common/commands/add_to_search.py: -------------------------------------------------------------------------------- 1 | GEVENT = False 2 | 3 | from optparse import OptionParser 4 | arg_parser = OptionParser() 5 | arg_parser.add_option("-a", "--agency", dest="agency", action="store", type="string", default=None, help="Specify an agency to which to limit the dump.") 6 | arg_parser.add_option("-d", "--docket", dest="docket", action="store", type="string", default=None, help="Specify a docket to which to limit the dump.") 7 | arg_parser.add_option("-A", "--all", dest="process_all", action="store_true", default=False, help="Replace existing search data with new data.") 8 | 9 | from regs_models import * 10 | import urllib2, json, traceback, datetime, zlib, pymongo, pytz, itertools 11 | import rawes, requests, thrift 12 | 13 | def run(options, args): 14 | while 
True: 15 | try: 16 | return add_to_search(options, args) 17 | except (pymongo.errors.OperationFailure, requests.exceptions.ConnectionError, thrift.transport.TTransport.TTransportException): 18 | print "Resetting..." 19 | continue 20 | 21 | def add_to_search(options, args): 22 | import settings 23 | 24 | es = rawes.Elastic(getattr(settings, "ES_HOST", 'thrift://localhost:9500'), timeout=60.0) 25 | index = getattr(es, settings.ES_INDEX) 26 | 27 | now = datetime.datetime.now() 28 | 29 | querysets = {} 30 | builders = {} 31 | metadata = {} 32 | 33 | PER_REQUEST = 200 34 | 35 | ### Dockets ### 36 | 37 | query = {'scraped': 'yes'} 38 | if options.agency: 39 | query['agency'] = options.agency 40 | if options.docket: 41 | query['_id'] = options.docket 42 | if not options.process_all: 43 | query['in_search_index'] = False 44 | 45 | querysets['docket'] = Docket.objects(__raw__=query) 46 | 47 | def build_docket(docket): 48 | print 'preparing docket', docket.id 49 | 50 | # build initial ES document 51 | es_doc = { 52 | 'title': docket.title, 53 | 'agency': docket.agency, 54 | 'identifiers': [docket.id] 55 | } 56 | 57 | # add identifiers 58 | if docket.rin and docket.rin != "Not Assigned": 59 | es_doc['identifiers'].append(docket.rin) 60 | 61 | return es_doc 62 | 63 | def get_docket_metadata(docket): 64 | return {'_index': settings.ES_INDEX, '_type': 'docket', '_id': docket.id} 65 | 66 | builders['docket'] = build_docket 67 | metadata['docket'] = get_docket_metadata 68 | 69 | ### Documents ### 70 | 71 | query = {'deleted': False, 'scraped': 'yes', '$nor': [{'views.extracted': 'no'},{'attachments.views.extracted':'no'}]} 72 | if options.agency: 73 | query['agency'] = options.agency 74 | if options.docket: 75 | query['docket_id'] = options.docket 76 | if not options.process_all: 77 | query['in_search_index'] = False 78 | 79 | querysets['document'] = Doc.objects(__raw__=query) 80 | 81 | def build_document(doc): 82 | print 'preparing document', doc.id 83 | if doc.renamed: 84 | print 'preparing', doc.id 85 | doc.in_search_index = True 86 | doc.save() 87 | return None 88 | 89 | # build initial ES document 90 | es_doc = { 91 | 'docket_id': doc.docket_id if doc.docket_id else doc.id.rsplit('-', 1)[0], 92 | 'comment_on': doc.comment_on.get('document_id', None) if doc.comment_on else None, 93 | 'title': doc.title, 94 | 'agency': doc.agency, 95 | 'posted_date': doc.details['Date_Posted'].replace(tzinfo=pytz.UTC) if 'Date_Posted' in doc.details else None, 96 | 'document_type': doc.type, 97 | 'submitter_organization': doc.details.get('Organization_Name', None), 98 | 'submitter_name': ' '.join(filter(bool, [doc.details.get('First_Name', None), doc.details.get('Middle_Initial', None), doc.details.get('Last_Name', None)])), 99 | 'submitter_entities': doc.submitter_entities, 100 | 'files': [], 101 | 'analyses': [], 102 | 'identifiers': [doc.id] 103 | } 104 | 105 | # add views (max of 5 to avoid pathological cases) 106 | for view in doc.views[:5]: 107 | if not view.content: 108 | continue 109 | es_doc['files'].append({ 110 | "title": None, 111 | "abstract": None, 112 | "object_id": doc.object_id, 113 | "file_type": view.type, 114 | "view_type": "document_view", 115 | "text": view.as_text()[:100000], 116 | "entities": view.entities 117 | }) 118 | 119 | # add attachments (max of 10 to avoid pathological cases) 120 | for attachment in doc.attachments[:10]: 121 | for view in attachment.views[:5]: 122 | if not view.content: 123 | continue 124 | es_doc['files'].append({ 125 | "title": attachment.title, 126 | "abstract": 
attachment.abstract, 127 | "object_id": attachment.object_id, 128 | "file_type": view.type, 129 | "view_type": "attachment_view", 130 | "text": view.as_text()[:100000], 131 | "entities": view.entities 132 | }) 133 | 134 | # add identifiers 135 | if doc.rin and doc.rin != "Not Assigned": 136 | es_doc['identifiers'].append(doc.rin) 137 | 138 | if doc.details.get('Federal_Register_Number', None): 139 | es_doc['identifiers'].append(doc.details['Federal_Register_Number']) 140 | 141 | if doc.details.get('FR_Citation', None): 142 | es_doc['identifiers'].append(doc.details['FR_Citation'].replace(' ', '')) 143 | 144 | return es_doc 145 | 146 | def get_document_metadata(doc): 147 | return {'_index': settings.ES_INDEX, '_type': 'document', '_id': doc.id, '_parent': doc.docket_id if doc.docket_id else doc.id.rsplit('-', 1)[0]} 148 | 149 | builders['document'] = build_document 150 | metadata['document'] = get_document_metadata 151 | 152 | ### Actually do everything ### 153 | def flush(queue, ids, collection): 154 | # no need to do anything if there aren't any docs to add 155 | if not ids: 156 | return 157 | 158 | # save current queue to ES 159 | try: 160 | es_status = es._bulk.post(data="\n".join(queue)) 161 | print 'saved %s to ES' % ", ".join(ids) 162 | except rawes.elastic_exception.ElasticException: 163 | # sometimes the bulk save fails for some reason; fall back to traditional iterative safe if so 164 | print 'falling back to iterative save...' 165 | # iterate over the queue pair-wise 166 | for command, record in itertools.izip(*[iter(queue)]*2): 167 | meta = json.loads(command)['index'] 168 | params = {'parent': meta['_parent']} if '_parent' in meta else {} 169 | 170 | es_index = getattr(es, meta['_index']) 171 | es_type = getattr(es_index, meta['_type']) 172 | 173 | es_status = es_type[meta['_id']].put(data=record, params=params) 174 | print 'saved %s to ES as %s' % (meta['_id'], es_status['_id']) 175 | 176 | # update mongo docs 177 | collection.update({'_id': {'$in': ids}}, {'$set': {'in_search_index': True}}, multi=True, safe=True) 178 | 179 | print "saved %s back to mongo" % ", ".join(ids) 180 | 181 | counts = {'docket': 0, 'document': 0} 182 | for datatype in ('docket', 'document'): 183 | queue = [] 184 | ids = [] 185 | max_length = PER_REQUEST * 2 186 | for item in querysets[datatype]: 187 | record = builders[datatype](item) 188 | meta = metadata[datatype](item) 189 | 190 | if record: 191 | if not item.suppression.get('replaced_by', None): 192 | queue.append(json.dumps({'index':meta})) 193 | queue.append(json.dumps(record, default=es.json_encoder)) 194 | ids.append(item.id) 195 | 196 | if len(queue) >= max_length: 197 | flush(queue, ids, querysets[datatype]._collection) 198 | counts[datatype] += len(ids) 199 | queue = [] 200 | ids = [] 201 | flush(queue, ids, querysets[datatype]._collection) 202 | counts[datatype] += len(ids) 203 | 204 | print "Done adding things to search: %s docket entries and %s document entries." 
% (counts['docket'], counts['document']) 205 | return counts -------------------------------------------------------------------------------- /regscrape/regs_common/commands/administer_search.py: -------------------------------------------------------------------------------- 1 | GEVENT = False 2 | 3 | from optparse import OptionParser 4 | arg_parser = OptionParser() 5 | arg_parser.add_option("-d", "--delete", dest="delete", action="store_true", default=False, help="Delete the search index.") 6 | arg_parser.add_option("-c", "--create", dest="create", action="store_true", default=False, help="Create the search index.") 7 | 8 | from regs_models import * 9 | import urllib2, json, os 10 | import rawes 11 | 12 | def run(options, args): 13 | import settings, regs_common 14 | 15 | es = rawes.Elastic(getattr(settings, "ES_HOST", 'thrift://localhost:9500'), timeout=30.0) 16 | index = getattr(es, settings.ES_INDEX) 17 | 18 | if options.delete: 19 | index.delete() 20 | print "Index deleted." 21 | 22 | if options.create: 23 | mapping_file = os.path.join(os.path.abspath(os.path.dirname(regs_common.__file__)), "data", "es_mapping.json") 24 | mapping_data = json.load(open(mapping_file)) 25 | index.put(data={'mappings': mapping_data}) 26 | print "Index created." 27 | 28 | stats = es._stats.get() 29 | print json.dumps(stats, indent=4) 30 | 31 | return stats -------------------------------------------------------------------------------- /regscrape/regs_common/commands/annotate_fr_agencies.py: -------------------------------------------------------------------------------- 1 | GEVENT = False 2 | 3 | from regs_models import * 4 | import json, urllib2 5 | 6 | def run(): 7 | fr_data = json.load(urllib2.urlopen("https://www.federalregister.gov/api/v1/agencies.json")) 8 | 9 | fr_dict = {r['short_name']: r for r in fr_data if r['short_name']} 10 | 11 | for agency in Agency.objects(): 12 | if agency.id in fr_dict: 13 | agency.fr_id = fr_dict[agency.id]['id'] 14 | agency.save() 15 | print "Saved %s with ID %s" % (agency.name, agency.fr_id) -------------------------------------------------------------------------------- /regscrape/regs_common/commands/annotate_fr_docs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | GEVENT = False 4 | 5 | from regs_common.exceptions import * 6 | from regs_models import * 7 | from optparse import OptionParser 8 | 9 | import json, urllib, urllib2, os, re, datetime 10 | 11 | from pyquery import PyQuery as pq 12 | import dateutil.parser 13 | import jellyfish 14 | 15 | # arguments 16 | arg_parser = OptionParser() 17 | arg_parser.add_option("-a", "--agency", dest="agency", action="store", type="string", default=None, help="Specify an agency to which to limit the dump.") 18 | arg_parser.add_option("-d", "--docket", dest="docket_id", action="store", type="string", default=None, help="Specify a docket to which to limit the dump.") 19 | arg_parser.add_option("-s", "--source", dest="source", action="store", type="string", default=None, help="Specify a scraping source to which to limit the dump.") 20 | arg_parser.add_option("-A", "--all", dest="process_all", action="store_true", default=False, help="Replace existing FR data with new data.") 21 | 22 | HEADER_MATCHER = re.compile(r"^[\*\s]*Federal Register[\*\s]*/ Vol. 
(\d+).*") 23 | NUMBER = re.compile("^(\d+)$") 24 | THREE_MONTHS = datetime.timedelta(days=90) 25 | 26 | def fr_citation_from_view(view): 27 | view_text = view.as_text() 28 | lines = view_text.split("\n") 29 | 30 | # look for a page header 31 | header_match = [HEADER_MATCHER.match(l) for l in lines] 32 | header_lines = [(i, m.groups()[0]) for i, m in enumerate(header_match) if m] 33 | 34 | # now, for each, the page number will come either on the preceding or following line depending whether it's a left or right page 35 | number_match = {i: filter(bool, [NUMBER.match(lines[n].replace('*', '').strip()) for n in (i - 1, i + 1)]) for i, l in header_lines} 36 | 37 | header_lines_n = [(i, l, number_match[i][0].groups()[0]) for i, l in header_lines if number_match[i]] 38 | 39 | if header_lines_n: 40 | return "%s FR %s" % (header_lines_n[0][1], header_lines_n[0][2]) 41 | 42 | return None 43 | 44 | _fr_ids = {} 45 | def fr_id_for_agency(agency): 46 | if agency in _fr_ids: 47 | return _fr_ids[agency] 48 | 49 | agency_obj = Agency.objects.get(id=agency) 50 | _fr_ids[agency] = agency_obj.fr_id if agency_obj.fr_id else None 51 | return _fr_ids[agency] 52 | 53 | def levenshtein_ratio(s1, s2): 54 | s = len(s1) + len(s2) 55 | return (s - jellyfish.levenshtein_distance(s1.encode('utf8'), s2.encode('utf8'))) / float(s) 56 | 57 | def guess_fr_num(doc): 58 | # if it's title-less or has a very short title, don't bother 59 | if not doc.title or len(doc.title) < 10: 60 | return None 61 | 62 | query = {'conditions[term]': doc.title.encode('utf8')} 63 | 64 | aid = fr_id_for_agency(doc.agency) 65 | if aid: 66 | query['conditions[agency_ids][]'] = str(aid) 67 | 68 | has_date = 'Date_Posted' in doc.details 69 | if has_date: 70 | # bracket the FR date by three months in either direction because sometimes they don't match 71 | query['conditions[publication_date][gte]'] = (doc.details['Date_Posted'] - THREE_MONTHS).strftime("%m/%d/%Y") 72 | query['conditions[publication_date][lte]'] = (doc.details['Date_Posted'] + THREE_MONTHS).strftime("%m/%d/%Y") 73 | 74 | # do search 75 | results = json.load(urllib2.urlopen("https://www.federalregister.gov/api/v1/articles?" + urllib.urlencode(query))) 76 | if results['count']: 77 | # first annotate each with its title's distance to the real title, how far in time it is from the real time 78 | for result in results['results']: 79 | result['similarity'] = levenshtein_ratio(result['title'], doc.title) 80 | 81 | if has_date: 82 | real_date = dateutil.parser.parse(result['publication_date']) 83 | result['time_apart'] = abs(doc.details['Date_Posted'] - real_date) 84 | 85 | # then strip out all the ones that aren't at least 75% similar 86 | candidates = [result for result in results['results'] if result['similarity'] > 0.75] 87 | if not candidates: 88 | return None 89 | 90 | # then sort by how far away in time they are, if there are dates, or the distance otherwise 91 | if has_date: 92 | sorted_candidates = sorted(candidates, key=lambda r: r['time_apart']) 93 | else: 94 | sorted_candidates = sorted(candidates, key=lambda r: r['similarity'], reverse=True) 95 | 96 | return sorted_candidates[0]['document_number'] 97 | 98 | def fr_num_from_cite(fr_cite, title): 99 | # construct a query 100 | query = {'conditions[term]': fr_cite.encode('utf8')} 101 | 102 | # do search -- has to be by HTML because there doesn't seem to be a way to do citation searches via the API 103 | page = pq(url="https://www.federalregister.gov/articles/search?"
+ urllib.urlencode(query)) 104 | links = page('.matching_citation_document h4 a') 105 | 106 | if not links: 107 | return None 108 | 109 | items = [(link.attr('href'), link.text()) for link in links.items()] 110 | 111 | # we order only by name because all results are on the same page and will therefore be from the same date 112 | sorted_items = sorted(items, key=lambda l: levenshtein_ratio(l[1], title), reverse=True) 113 | 114 | # the document number is the thing before the last slash 115 | return sorted_items[0][0].split("/")[-2] 116 | 117 | def run(options, args): 118 | query = {'type__in': ['notice', 'proposed_rule', 'rule']} 119 | 120 | for filter_type in ('agency', 'docket_id', 'source'): 121 | filter_attr = getattr(options, filter_type) 122 | if filter_attr: 123 | query[filter_type] = filter_attr 124 | 125 | frn, frc, g, nd = 0, 0, 0, 0 126 | for doc in Doc.objects(**query): 127 | if 'fr_data' in doc.annotations and not options.process_all: 128 | continue 129 | 130 | fr_num = None 131 | fr_cite = None 132 | 133 | if 'Federal_Register_Number' in doc.details: 134 | print doc.id, 'FR num', doc.details['Federal_Register_Number'].encode('utf8') 135 | frn += 1 136 | 137 | # try fetching now; maybe we're done, but we can always try one of the other tactics if this doesn't work 138 | fr_num = doc.details['Federal_Register_Number'].encode('utf8') 139 | try: 140 | doc.annotations['fr_data'] = json.load(urllib2.urlopen("https://www.federalregister.gov/api/v1/articles/" + fr_num)) 141 | doc.save() 142 | print 'Succeeded with %s via FR number' % doc.id 143 | continue 144 | except: 145 | fr_num = None 146 | 147 | if 'Federal_Register_Citation' in doc.details: 148 | print doc.id, 'FR cite', doc.details['Federal_Register_Citation'].encode('utf8') 149 | frc += 1 150 | fr_cite = doc.details['Federal_Register_Citation'].encode('utf8') 151 | fr_num = fr_num_from_cite(fr_cite, doc.title) 152 | if fr_num: 153 | # try again 154 | try: 155 | doc.annotations['fr_data'] = json.load(urllib2.urlopen("https://www.federalregister.gov/api/v1/articles/" + fr_num)) 156 | doc.save() 157 | print 'Succeeded with %s via FR citation' % doc.id 158 | continue 159 | except: 160 | fr_cite = None 161 | fr_num = None 162 | else: 163 | fr_cite = None 164 | 165 | if not fr_num and not fr_cite: 166 | # does it have a PDF copy of the Federal Register version of the thing? 
167 | views = None 168 | att = [a for a in doc.attachments if 'Federal Register' in a.title] 169 | if att: 170 | views = [v for v in att[0].views if v.type == 'pdf'] 171 | 172 | if not views: 173 | views = [v for v in doc.views if v.type == 'xpdf'] 174 | 175 | if views: 176 | fr_cite = fr_citation_from_view(views[0]) 177 | 178 | if fr_cite: 179 | print doc.id, 'FR cite (by PDF)', fr_cite 180 | frc += 1 181 | 182 | fr_num = fr_num_from_cite(fr_cite, doc.title) 183 | if fr_num: 184 | # try again 185 | try: 186 | doc.annotations['fr_data'] = json.load(urllib2.urlopen("https://www.federalregister.gov/api/v1/articles/" + fr_num)) 187 | doc.save() 188 | print 'Succeeded with %s via FR citation (PDF)' % doc.id 189 | continue 190 | except: 191 | fr_cite = None 192 | fr_num = None 193 | else: 194 | fr_cite = None 195 | 196 | else: 197 | # last chance -- we guess from the title alone 198 | fr_num = guess_fr_num(doc) 199 | if fr_num: 200 | # try again 201 | try: 202 | doc.annotations['fr_data'] = json.load(urllib2.urlopen("https://www.federalregister.gov/api/v1/articles/" + fr_num)) 203 | doc.save() 204 | g += 1 205 | print 'Succeeded with %s via title guessing' % doc.id 206 | continue 207 | except: 208 | fr_cite = None 209 | fr_num = None 210 | 211 | if not fr_num and not fr_cite: 212 | # we failed :/ 213 | doc.annotations['fr_data'] = None 214 | doc.save() 215 | print doc.id, 'No dice' 216 | nd += 1 217 | print frn, frc, g, nd -------------------------------------------------------------------------------- /regscrape/regs_common/commands/create_dockets.py: -------------------------------------------------------------------------------- 1 | from optparse import OptionParser 2 | arg_parser = OptionParser() 3 | arg_parser.add_option("-a", "--agency", dest="agency", action="store", type="string", default=None, help="Specify an agency to which to limit the dump.") 4 | arg_parser.add_option("-d", "--docket", dest="docket", action="store", type="string", default=None, help="Specify a docket to which to limit the dump.") 5 | 6 | def run(options, args): 7 | import regs_models as models 8 | from collections import defaultdict 9 | 10 | db = models.Docket._get_db() 11 | new = 0 12 | 13 | print 'Starting docket query...' 14 | 15 | conditions = {} 16 | if options.agency: 17 | conditions['agency'] = options.agency 18 | if options.docket: 19 | conditions['id'] = options.docket 20 | 21 | # there's no way to do this aggregation without a map-reduce in Mongo 2.0, so do it on the Python side for now 22 | # once 2.2 is final, this can trivially be replaced with a $group + $addToSet pipeline using the new aggregation framework 23 | dockets = defaultdict(set) 24 | for doc in db.docs.find(conditions, fields=['docket_id', 'agency']): 25 | if 'docket_id' not in doc: 26 | continue 27 | dockets[doc['docket_id']].add(doc['agency']) 28 | 29 | for docket_id, agencies in dockets.iteritems(): 30 | if docket_id: 31 | agency = list(agencies)[0] if len(agencies) == 1 else sorted(agencies, key=lambda a: docket_id.startswith(a), reverse=True)[0] 32 | try: 33 | docket = models.Docket(id=docket_id, agency=agency) 34 | docket.save(force_insert=True) 35 | new += 1 36 | except: 37 | # we already have this one 38 | pass 39 | 40 | total = len(dockets.keys()) 41 | print 'Iterated over %s dockets, of which %s were new.' 
% (total, new) 42 | 43 | return {'total': total, 'new': new} 44 | -------------------------------------------------------------------------------- /regscrape/regs_common/commands/create_entities.py: -------------------------------------------------------------------------------- 1 | GEVENT = False 2 | 3 | from regs_models import * 4 | 5 | from optparse import OptionParser 6 | arg_parser = OptionParser() 7 | arg_parser.add_option("-U", "--update", dest="update", action="store_true", default=False, help="Check if entities already existing before creating (slower)") 8 | 9 | def run(options, args): 10 | import settings, regs_common 11 | import os 12 | 13 | # if we're updating 14 | if options.update: 15 | print "Preparing to update existing entities; retrieving current entity list..." 16 | current = set((e.id for e in Entity.objects())) 17 | print "Entities retrieved." 18 | else: 19 | print "Constructing new entity list." 20 | 21 | # grab a dictionary 22 | word_file = getattr(settings, 'WORD_FILE', '/usr/share/dict/words') 23 | name_file = os.path.join(os.path.abspath(os.path.dirname(regs_common.__file__)), "data", "names.dat") 24 | 25 | # filtered_words is a set of English words, plus common first and last names, and single letters 26 | filtered_words = set((word.strip() for word in open(word_file, 'r') if word and word[0] == word[0].lower())) 27 | filtered_words.update((name.strip().lower() for name in open(name_file, 'r') if name.strip() and not name.startswith('#'))) 28 | filtered_words.update(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']) 29 | 30 | 31 | from influenceexplorer import InfluenceExplorer 32 | api = InfluenceExplorer(settings.API_KEY, getattr(settings, 'AGGREGATES_API_BASE_URL', "http://transparencydata.com/api/1.0/")) 33 | 34 | entities = [] 35 | for type in ['individual', 'organization', 'politician']: 36 | count = api.entities.count(type) 37 | for i in range(0, count, 10000): 38 | entities.extend(api.entities.list(i, i + 10000, type)) 39 | 40 | from oxtail.matching.normalize import normalize_list 41 | for entity in entities: 42 | record = { 43 | 'id': entity['id'], 44 | 'td_type': entity['type'], 45 | 'td_name': entity['name'], 46 | 'aliases': [name.strip() for name in normalize_list([entity['name']] + entity['aliases'], entity['type'])] 47 | } 48 | record['filtered_aliases'] = [alias for alias in record['aliases'] if alias.lower() not in filtered_words] 49 | 50 | if options.update and record['id'] in current: 51 | Entity.objects(id=record['id']) \ 52 | .update(safe_update=True, set__td_type=record['td_type'], set__td_name=record['td_name'], set__aliases=record['aliases'], set__filtered_aliases=record['filtered_aliases']) 53 | print "Updated %s as existing record %s" % (record['aliases'][0], record['id']) 54 | current.remove(record['id']) 55 | else: 56 | db_entity = Entity(**record) 57 | db_entity.save() 58 | print "Saved %s as new record %s" % (record['aliases'][0], record['id']) 59 | 60 | if options.update: 61 | print "Deleting %s no-longer-existing records..." 
% len(current) 62 | db = Entity._get_db() 63 | db.entities.remove({'_id': {'$in': list(current)}}) 64 | -------------------------------------------------------------------------------- /regscrape/regs_common/commands/export_text.py: -------------------------------------------------------------------------------- 1 | from optparse import OptionParser 2 | arg_parser = OptionParser() 3 | arg_parser.add_option("-a", "--agency", dest="agency", action="store", type="string", default=None, help="Filter to only one agency. Default to all agencies if not specified.") 4 | arg_parser.add_option("-d", "--docket", dest="docket", action="store", type="string", default=None, help="Filter to only one docket. Default to all dockets if not specified.") 5 | 6 | def ensure_directory(directory): 7 | if not os.path.exists(directory): 8 | os.mkdir(directory) 9 | 10 | def extract(record, keys): 11 | out = {} 12 | for key in keys: 13 | if key in record and record[key]: 14 | out[key] = record[key] 15 | return out 16 | 17 | dthandler = lambda obj: obj.isoformat() if isinstance(obj, datetime.datetime) else None 18 | 19 | def run(options, args): 20 | global os 21 | import settings 22 | import datetime 23 | import os 24 | import pymongo 25 | import itertools 26 | import json 27 | from regs_common.util import get_db 28 | import zipfile 29 | import sys 30 | 31 | print 'Starting dump...' 32 | 33 | query = {'scraped': True, 'deleted': False} 34 | 35 | if options.docket: 36 | query['docket_id'] = options.docket 37 | if options.agency: 38 | query['agency'] = options.agency 39 | print query 40 | 41 | db = get_db() 42 | 43 | export_dir = os.path.join(settings.DATA_DIR, 'bulk', 'regulations-%s' % str(datetime.datetime.now().date())) 44 | ensure_directory(export_dir) 45 | 46 | for agency, agency_docs in itertools.groupby(db.docs.find(query, sort=[('document_id', pymongo.ASCENDING)]), key=lambda d: d['agency']): 47 | print 'Starting agency %s...' % agency 48 | agency_dir = os.path.join(export_dir, agency) 49 | ensure_directory(agency_dir) 50 | 51 | for docket, docket_docs in itertools.groupby(agency_docs, key=lambda d: d['docket_id']): 52 | print 'Starting docket %s...' 
% docket 53 | zip_path = os.path.join(agency_dir, '%s.zip' % docket) 54 | 55 | with zipfile.ZipFile(zip_path, 'a', zipfile.ZIP_DEFLATED, True) as docket_zip: 56 | docket_record = list(db.dockets.find({'_id': docket})) 57 | 58 | if docket_record: 59 | docket_zip.writestr( 60 | 'metadata.json', 61 | json.dumps( 62 | extract( 63 | docket_record[0], 64 | ['docket_id', 'title', 'agency', 'rin', 'details', 'year'] 65 | ), 66 | default=dthandler 67 | ) 68 | ) 69 | 70 | for doc in docket_docs: 71 | files = [] 72 | 73 | views = [('view', view) for view in doc['views']] 74 | if 'attachments' in doc: 75 | for attachment in doc['attachments']: 76 | views.extend([('attachment', view) for view in attachment['views']]) 77 | 78 | for type, view in views: 79 | file = {'url': view['url']} 80 | if view['extracted'] == True: 81 | filename = '%s_%s.txt' % (type, view['file'].split('/')[-1].replace('.', '_')) 82 | file['filename'] = filename 83 | 84 | docket_zip.writestr(os.path.join(doc['document_id'], filename), view['text'].encode('utf8')) 85 | 86 | files.append(file) 87 | 88 | metadata = extract( 89 | doc, 90 | ['document_id', 'title', 'agency', 'docket_id', 'type', 'topics', 'details', 'comment_on', 'rin'] 91 | ) 92 | metadata['files'] = files 93 | 94 | docket_zip.writestr(os.path.join(doc['document_id'], 'metadata.json'), json.dumps(metadata, default=dthandler)) 95 | -------------------------------------------------------------------------------- /regscrape/regs_common/commands/extract.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | GEVENT = False 4 | 5 | from regs_common.exceptions import * 6 | from optparse import OptionParser 7 | 8 | # arguments 9 | arg_parser = OptionParser() 10 | arg_parser.add_option("-t", "--type", action="store", dest="type", default=None) 11 | arg_parser.add_option("-a", "--agency", dest="agency", action="store", type="string", default=None, help="Specify an agency to which to limit the dump.") 12 | arg_parser.add_option("-d", "--docket", dest="docket", action="store", type="string", default=None, help="Specify a docket to which to limit the dump.") 13 | 14 | # runner 15 | def run(options, args): 16 | global Pool, sys, settings, subprocess, os, urlparse, json, regs_common, pymongo, mp_bulk_extract 17 | from regs_common.processing import find_views, update_view, find_attachment_views, update_attachment_view 18 | from regs_common.extraction import mp_bulk_extract 19 | from gevent.pool import Pool 20 | import sys 21 | import settings 22 | import subprocess, os, urlparse, json 23 | import pymongo 24 | 25 | return { 26 | 'document_views': run_for_view_type('document views', find_views, update_view, options), 27 | 'attachment_views': run_for_view_type('attachment views', find_attachment_views, update_attachment_view, options) 28 | } 29 | 30 | def run_for_view_type(view_label, find_func, update_func, options): 31 | print 'Preparing text extraction of %s.' 
% view_label 32 | 33 | query = {'deleted': False} 34 | if options.agency: 35 | query['agency'] = options.agency 36 | if options.docket: 37 | query['docket_id'] = options.docket 38 | 39 | find_conditions = { 40 | 'downloaded': "yes", 41 | 'extracted': "no", 42 | 'query': query 43 | } 44 | if options.type: 45 | find_conditions['type'] = options.type 46 | 47 | # track stats -- no locks because yay for cooperative multitasking 48 | stats = {'extracted': 0, 'failed': 0} 49 | 50 | views = find_func(**find_conditions) 51 | 52 | # same yucky hack as in downloads 53 | v_array = [views] 54 | def extract_generator(): 55 | while True: 56 | try: 57 | result = v_array[0].next() 58 | yield (result['view'].file_path, None, result) 59 | except pymongo.errors.OperationFailure: 60 | # occasionally pymongo seems to lose track of the cursor for some reason, so reset the query 61 | v_array[0] = find_func(**find_conditions) 62 | continue 63 | except StopIteration: 64 | break 65 | 66 | def status_func(status, text, filename, filetype, output_type, used_ocr, result): 67 | if status[0]: 68 | result['view'].extracted = "yes" 69 | 70 | result['view'].content.new_file() 71 | result['view'].content.content_type = 'text/plain' 72 | result['view'].content.write(text.encode('utf-8')) 73 | result['view'].content.close() 74 | 75 | result['view'].mode = output_type 76 | result['view'].ocr = used_ocr 77 | try: 78 | update_func(**result) 79 | print 'Extracted and saved text from %s' % filename 80 | stats['extracted'] += 1 81 | except (pymongo.errors.OperationFailure, pymongo.errors.InvalidDocument): 82 | print 'Extracted text from %s but failed to save due to oversized document.' % filename 83 | stats['failed'] += 1 84 | 85 | if not 'oversized' in stats: 86 | stats['oversized'] = [] 87 | stats['oversized'].append(result['view'].url()) 88 | else: 89 | result['view'].extracted = 'failed_no_extractor' if 'no extractor' in status[1] else 'failed_extraction' 90 | update_func(**result) 91 | print 'Saved failure to decode %s' % result['view'].file_path 92 | stats['failed'] += 1 93 | update_func(**result) 94 | 95 | mp_bulk_extract(extract_generator(), status_func, verbose=True) 96 | 97 | print 'Done with %s.' % view_label 98 | 99 | return stats 100 | 101 | if __name__ == "__main__": 102 | run() 103 | -------------------------------------------------------------------------------- /regscrape/regs_common/commands/mark_searchable_entities.py: -------------------------------------------------------------------------------- 1 | GEVENT = False 2 | 3 | def run(): 4 | from regs_models import Entity 5 | 6 | print "Updating entity search index..." 7 | 8 | # mark the ones that should be searchable but aren't as searchable 9 | Entity.objects(__raw__={ 10 | 'td_type': 'organization', 11 | 'stats.count': {'$gt': 0}, 12 | 'searchable': False 13 | }).update(set__searchable=True, multi=True) 14 | 15 | # mark the ones that are searchable but shouldn't be unsearchable 16 | Entity.objects(__raw__={ 17 | '$or': [ 18 | {'td_type': {'$ne': 'organization'}}, 19 | {'stats.count': {'$not': {'$gt': 0}}} 20 | ], 21 | 'searchable': True 22 | }).update(set__searchable=False, multi=True) 23 | 24 | print "Update complete." 
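# A minimal sketch of the invariant the two updates above enforce: for every entity,
#
#     searchable == (td_type == 'organization' and stats.count > 0)
#
# so only organizations with at least one recorded match in their stats (presumably
# populated by the run_aggregates step) are exposed to entity search. A rough
# raw-pymongo equivalent of the first update, assuming the collection is named
# 'entities' as in create_entities.py:
#
#     Entity._get_db().entities.update(
#         {'td_type': 'organization', 'stats.count': {'$gt': 0}, 'searchable': False},
#         {'$set': {'searchable': True}},
#         multi=True)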
-------------------------------------------------------------------------------- /regscrape/regs_common/commands/match_text.py: -------------------------------------------------------------------------------- 1 | GEVENT = False 2 | 3 | import zlib 4 | import datetime 5 | import settings 6 | 7 | import pymongo 8 | import traceback 9 | import os 10 | import re 11 | import multiprocessing 12 | from Queue import Empty 13 | from regs_models import * 14 | 15 | from oxtail.matching import match 16 | 17 | # arguments 18 | from optparse import OptionParser 19 | arg_parser = OptionParser() 20 | arg_parser.add_option("-a", "--agency", dest="agency", action="store", type="string", default=None, help="Specify an agency to which to limit the dump.") 21 | arg_parser.add_option("-d", "--docket", dest="docket", action="store", type="string", default=None, help="Specify a docket to which to limit the dump.") 22 | arg_parser.add_option("-A", "--all", dest="process_all", action="store_true", default=False, help="Force a re-extraction of all documents in the system.") 23 | arg_parser.add_option("-m", "--multi", dest="multi", action="store", type="int", default=multiprocessing.cpu_count(), help="Set number of worker processes. Defaults to number of cores if not specified.") 24 | 25 | # regex to find titles that are likely to have submitter names 26 | NAME_FINDER = re.compile(r"^(public )?(comment|submission)s? (by|from) (?P<name>.*)$", re.I) 27 | 28 | def get_text(view): 29 | if not view.content: 30 | return '' 31 | 32 | return view.content.read() 33 | 34 | def process_doc(doc): 35 | # entity extraction 36 | for view in doc.views: 37 | if view.extracted == 'yes': 38 | view_matches = match(get_text(view), multiple=True) 39 | view.entities = list(view_matches.keys()) if view_matches else [] 40 | 41 | for attachment in doc.attachments: 42 | for view in attachment.views: 43 | if view.extracted == 'yes': 44 | view_matches = match(get_text(view), multiple=True) 45 | view.entities = list(view_matches.keys()) if view_matches else [] 46 | 47 | # submitter matches 48 | # check if there's submitter stuff in the title 49 | title_match = NAME_FINDER.match(doc.title) 50 | 51 | # next check details, which is where most title stuff lives 52 | details = doc.details 53 | # stick "XXXX" between tokens because it doesn't occur in entity names 54 | submitter_matches = match(' XXXX '.join([ 55 | # organization 56 | details.get('Organization_Name', ''), 57 | 58 | # submitter name 59 | ' '.join( 60 | filter(bool, [details.get('First_Name', ''), details.get('Last_Name', '')]) 61 | ), 62 | 63 | # submitter representative 64 | details.get('Submitter_s_Representative', ''), 65 | 66 | # title_match if we found one 67 | title_match.groupdict()['name'] if title_match else '', 68 | 69 | # just examine the whole title if it's from SEC or CFTC; the title is basically always submitter info 70 | doc.title if doc.source == 'sec_cftc' and doc.type in ('public_submission', 'other') else '' 71 | ])) 72 | doc.submitter_entities = list(submitter_matches.keys()) if submitter_matches else [] 73 | 74 | doc.entities_last_extracted = datetime.datetime.now() 75 | 76 | doc.save() 77 | 78 | return True 79 | 80 | def process_worker(todo_queue): 81 | pid = os.getpid() 82 | print '[%s] Worker started.' % pid 83 | while True: 84 | try: 85 | doc = Doc._from_son(todo_queue.get()) 86 | except Empty: 87 | print '[%s] Processing complete.' % pid 88 | return 89 | 90 | try: 91 | doc_success = process_doc(doc) 92 | print '[%s] Processing of doc %s succeeded.' 
% (pid, doc.id) 93 | except: 94 | print '[%s] Processing of doc %s failed.' % (pid, doc.id) 95 | traceback.print_exc() 96 | 97 | todo_queue.task_done() 98 | 99 | def run(options, args): 100 | from regs_common.entities import load_trie_from_mongo 101 | import time 102 | 103 | pid = os.getpid() 104 | 105 | # load trie from the mongo database 106 | import_start = time.time() 107 | print '[%s] Loading trie...' % pid 108 | load_trie_from_mongo() 109 | print '[%s] Loaded trie in %s seconds.' % (pid, time.time() - import_start) 110 | 111 | query = {'deleted': False, 'scraped': 'yes', '$nor': [{'views.extracted': 'no'},{'attachments.views.extracted':'no'}]} 112 | if options.agency: 113 | query['agency'] = options.agency 114 | if options.docket: 115 | query['docket_id'] = options.docket 116 | if not options.process_all: 117 | query['entities_last_extracted'] = {'$exists': False} 118 | 119 | cursor = Doc.objects(__raw__=query) 120 | 121 | run_start = time.time() 122 | print '[%s] Starting analysis...' % pid 123 | 124 | num_workers = options.multi 125 | 126 | todo_queue = multiprocessing.JoinableQueue(num_workers * 3) 127 | 128 | processes = [] 129 | for i in range(num_workers): 130 | proc = multiprocessing.Process(target=process_worker, args=(todo_queue,)) 131 | proc.start() 132 | processes.append(proc) 133 | 134 | for doc in cursor: 135 | todo_queue.put(doc.to_mongo()) 136 | 137 | todo_queue.join() 138 | 139 | for proc in processes: 140 | print 'Terminating worker %s...' % proc.pid 141 | proc.terminate() 142 | 143 | print '[%s] Completed analysis in %s seconds.' % (pid, time.time() - run_start) 144 | 145 | -------------------------------------------------------------------------------- /regscrape/regs_common/commands/reset_downloads.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | def run(): 4 | global os, settings 5 | from regs_common.processing import find_views, update_view, find_attachment_views, update_attachment_view 6 | import os 7 | import settings 8 | 9 | run_for_view_type('document views', find_views, update_view) 10 | run_for_view_type('attachment views', find_attachment_views, update_attachment_view) 11 | 12 | def run_for_view_type(view_label, find_func, update_func): 13 | print 'Resetting %s.' % view_label 14 | views = find_func(downloaded='failed', query={'deleted': False}) 15 | 16 | for result in views: 17 | result['view'].downloaded = 'no' 18 | update_func(**result) 19 | 20 | print 'Done with %s.' % view_label 21 | 22 | if __name__ == "__main__": 23 | run() -------------------------------------------------------------------------------- /regscrape/regs_common/commands/reset_extraction.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | def run(): 4 | global os, settings 5 | from regs_common.processing import find_views, update_view, find_attachment_views, update_attachment_view 6 | import os 7 | import settings 8 | 9 | run_for_view_type('document views', find_views, update_view) 10 | run_for_view_type('attachment views', find_attachment_views, update_attachment_view) 11 | 12 | def run_for_view_type(view_label, find_func, update_func): 13 | print 'Resetting %s.' % view_label 14 | views = find_func(extracted='failed', query={'deleted': False}) 15 | 16 | for result in views: 17 | result['view'].extracted = 'no' 18 | update_func(**result) 19 | 20 | print 'Done with %s.' 
% view_label 21 | 22 | if __name__ == "__main__": 23 | run() -------------------------------------------------------------------------------- /regscrape/regs_common/commands/run_aggregates.py: -------------------------------------------------------------------------------- 1 | GEVENT = False 2 | 3 | import multiprocessing 4 | 5 | from optparse import OptionParser 6 | arg_parser = OptionParser() 7 | arg_parser.add_option("-a", "--agency", dest="agency", action="store", type="string", default=None, help="Specify an agency to which to limit the dump.") 8 | arg_parser.add_option("-d", "--docket", dest="docket", action="store", type="string", default=None, help="Specify a docket to which to limit the dump.") 9 | arg_parser.add_option("-A", "--all", dest="process_all", action="store_true", default=False, help="Replace existing MR data with new data.") 10 | arg_parser.add_option("-p", "--pretend", dest="pretend", action="store_true", default=False, help="Don't actually write anything to the database.") 11 | arg_parser.add_option("-n", "--no-children", dest="no_children", action="store_true", default=False, help="Don't spawn child processes.") 12 | arg_parser.add_option("-r", "--resume", dest="resume_db", action="store", type="string", default=None, help="Resume a previous aggregation task (HERE BE DRAGONS)") 13 | arg_parser.add_option("-m", "--multi", dest="multi", action="store", type="int", default=multiprocessing.cpu_count(), help="Set number of worker processes. Defaults to number of cores if not specified.") 14 | 15 | def run_client(): 16 | from mincemeat import Client, DEFAULT_PORT 17 | import time 18 | import socket 19 | import os 20 | 21 | print "[%s] Starting worker" % os.getpid() 22 | while True: 23 | time.sleep(2) 24 | try: 25 | client = Client() 26 | client.password = "" 27 | client.conn('localhost', DEFAULT_PORT) 28 | return 29 | except socket.error as v: 30 | if v.errno == 54: 31 | print "[%s] Caught a socket error 54; resetting worker" % os.getpid() 32 | else: 33 | print "[%s] Caught a socket error %s; giving up" % (os.getpid(), v.errno) 34 | return 35 | 36 | def run(options, args): 37 | print 'Running aggregates...' 38 | 39 | num_workers = options.multi 40 | 41 | pool = multiprocessing.Pool(num_workers) 42 | 43 | if not options.no_children: 44 | for i in range(num_workers): 45 | pool.apply_async(run_client) 46 | 47 | from aggregates import run_aggregates 48 | run_aggregates(options) 49 | 50 | pool.terminate() 51 | 52 | print "Aggregates complete." 
53 | 54 | return {'success': True} 55 | -------------------------------------------------------------------------------- /regscrape/regs_common/commands/runner.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys, optparse, json, settings 4 | 5 | def run_command(): 6 | if len(sys.argv) < 2: 7 | print 'Usage: ./run.py <command>' 8 | sys.exit() 9 | command = sys.argv[1] 10 | 11 | if command.endswith('.py'): 12 | mod_name = command.split('/').pop().rsplit('.', 1)[0] 13 | import imp 14 | try: 15 | mod = imp.load_source(mod_name, command) 16 | except ImportError: 17 | print 'Could not load custom command: %s' % command 18 | sys.exit() 19 | else: 20 | imported = False 21 | for lib in ['regs_common'] + settings.SITES: 22 | try: 23 | parent_mod = __import__('%s.commands' % lib, fromlist=[command]) 24 | mod = getattr(parent_mod, command) 25 | imported = True 26 | break 27 | except ImportError: 28 | pass 29 | except AttributeError: 30 | pass 31 | if not imported: 32 | print 'No such command: %s' % command 33 | sys.exit() 34 | 35 | if getattr(mod, 'GEVENT', True): 36 | from gevent.monkey import patch_all 37 | patch_all() 38 | 39 | run = getattr(mod, 'run', False) 40 | if not run or not callable(run): 41 | print 'Command %s is not runnable' % command 42 | sys.exit() 43 | 44 | parser = getattr(mod, 'arg_parser', None) 45 | parser_defined = parser is not None 46 | 47 | if not parser: 48 | parser = optparse.OptionParser() 49 | parser.add_option('--parsable', dest='parsable', action='store_true', default=False, help='Output JSON instead of human-readable messages.') 50 | parse_results = parser.parse_args(sys.argv[2:]) 51 | 52 | dev_null = open('/dev/null', 'w') 53 | if parse_results[0].parsable: 54 | # disable standard output by monkey-patching sys.stdout 55 | real_stdout = sys.stdout 56 | sys.stdout = dev_null 57 | 58 | from regs_common.util import bootstrap_settings 59 | bootstrap_settings() 60 | 61 | out = run(*(parse_results if parser_defined else [])) 62 | 63 | if parse_results[0].parsable: 64 | # turn stdout back on so we can print output 65 | sys.stdout = real_stdout 66 | 67 | if out: 68 | print json.dumps(out) 69 | 70 | # no matter what, nuke stderr on exit so we can avoid that stupid gevent thing 71 | sys.stderr = dev_null 72 | -------------------------------------------------------------------------------- /regscrape/regs_common/data/es_mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "docket": { 3 | "properties": { 4 | "_id": { 5 | "type": "string", 6 | "index": "not_analyzed" 7 | }, 8 | "title": { 9 | "type": "string" 10 | }, 11 | "agency": { 12 | "type": "string", 13 | "index": "not_analyzed" 14 | }, 15 | "identifiers": { 16 | "type": "string", 17 | "index": "not_analyzed" 18 | } 19 | } 20 | }, 21 | "document": { 22 | "_parent": { 23 | "type": "docket" 24 | }, 25 | "properties": { 26 | "_id": { 27 | "type": "string", 28 | "index": "not_analyzed" 29 | }, 30 | "title": { 31 | "type": "string" 32 | }, 33 | "docket_id": { 34 | "type": "string", 35 | "index": "not_analyzed" 36 | }, 37 | "agency": { 38 | "type": "string", 39 | "index": "not_analyzed" 40 | }, 41 | "comment_on": { 42 | "type": "string", 43 | "index": "not_analyzed" 44 | }, 45 | "posted_date": { 46 | "type": "date" 47 | }, 48 | "document_type": { 49 | "type": "string", 50 | "index": "not_analyzed" 51 | }, 52 | "submitter_organization": { 53 | "type": "string" 54 | }, 55 | "submitter_name": { 56 | "type": 
"string" 57 | }, 58 | "submitter_entities": { 59 | "type": "string", 60 | "index": "not_analyzed" 61 | }, 62 | "analyses": { 63 | "type": "string", 64 | "index": "not_analyzed" 65 | }, 66 | "identifiers": { 67 | "type": "string", 68 | "index": "not_analyzed" 69 | }, 70 | "files": { 71 | "properties": { 72 | "title": { 73 | "type": "string" 74 | }, 75 | "abstract": { 76 | "type": "string" 77 | }, 78 | "text": { 79 | "type": "string", 80 | "term_vector": "with_positions_offsets" 81 | }, 82 | "object_id": { 83 | "type": "string", 84 | "index": "not_analyzed" 85 | }, 86 | "file_type": { 87 | "type": "string", 88 | "index": "not_analyzed" 89 | }, 90 | "view_type": { 91 | "type": "string", 92 | "index": "not_analyzed" 93 | }, 94 | "entities": { 95 | "type": "string", 96 | "index": "not_analyzed" 97 | } 98 | } 99 | } 100 | } 101 | } 102 | } -------------------------------------------------------------------------------- /regscrape/regs_common/data_import.py: -------------------------------------------------------------------------------- 1 | import pymongo 2 | import gridfs 3 | import settings 4 | 5 | def copy_data(source_db_name, dest_db_name, query): 6 | source = pymongo.Connection(**settings.DB_SETTINGS)[source_db_name] 7 | dest = pymongo.Connection(**settings.DB_SETTINGS)[dest_db_name] 8 | 9 | source_gridfs = gridfs.GridFS(source, collection='files') 10 | dest_gridfs = gridfs.GridFS(dest, collection='files') 11 | 12 | for doc in source.docs.find(query): 13 | print 'Copying doc %s...' % doc['_id'] 14 | 15 | # flip some flags 16 | #doc['stats'] = {} 17 | doc['in_aggregates'] = False 18 | doc['in_cluster_db'] = False 19 | doc['in_search_index'] = False 20 | 21 | dest.docs.save(doc) 22 | 23 | file_ids = [] 24 | for view in doc.get('views', []): 25 | if view.get('content', None): 26 | file_ids.append(view['content']) 27 | 28 | for attachment in doc.get('attachments', []): 29 | for view in attachment.get('views', []): 30 | if view.get('content', None): 31 | file_ids.append(view['content']) 32 | 33 | for fid in file_ids: 34 | print "Copying file %s" % fid 35 | 36 | # delete out of the dest in case it's already there 37 | dest_gridfs.delete(fid) 38 | 39 | # then read out from the old one 40 | fdata = source_gridfs.get(fid).read() 41 | 42 | # ... and write to the new one 43 | dest_gridfs.put(fdata, _id=fid) 44 | 45 | print "Done." 46 | 47 | dkt_query = dict(query) 48 | if "docket_id" in dkt_query: 49 | dkt_query['_id'] = dkt_query['docket_id'] 50 | del dkt_query['docket_id'] 51 | 52 | for dkt in source.dockets.find(dkt_query): 53 | print 'Copying docket %s...' % dkt['_id'] 54 | 55 | # flip some flags 56 | #dkt['stats'] = {} 57 | dkt['in_search_index'] = False 58 | 59 | if 'source' not in dkt: 60 | dkt['source'] = 'regulations.gov' 61 | 62 | dest.dockets.save(dkt) 63 | 64 | print "Done." 
-------------------------------------------------------------------------------- /regscrape/regs_common/entities.py: -------------------------------------------------------------------------------- 1 | def all_aliases(): 2 | import itertools 3 | from regs_common.util import get_db 4 | db = get_db() 5 | 6 | return itertools.chain.from_iterable( 7 | itertools.imap( 8 | lambda entity: [(alias, entity['_id']) for alias in entity.get('filtered_aliases', [])], 9 | db.entities.find() 10 | ) 11 | ) 12 | 13 | def load_trie_from_mongo(): 14 | from oxtail import matching 15 | 16 | matching._entity_trie = matching.build_token_trie( 17 | all_aliases(), 18 | matching._blacklist 19 | ) -------------------------------------------------------------------------------- /regscrape/regs_common/exceptions.py: -------------------------------------------------------------------------------- 1 | class ExtractionFailed(Exception): 2 | pass 3 | 4 | class DoesNotExist(Exception): 5 | pass 6 | 7 | class ChildTimeout(Exception): 8 | pass 9 | 10 | class RateLimitException(Exception): 11 | pass -------------------------------------------------------------------------------- /regscrape/regs_common/extraction.py: -------------------------------------------------------------------------------- 1 | from regs_common.processing import * 2 | import subprocess 3 | import settings 4 | 5 | EXTRACTORS = { 6 | 'xml': [ 7 | binary_extractor('cat', error='The document does not have a content file of type', output_type="html"), 8 | binary_extractor('html2text', error='The document does not have a content file of type') 9 | ], 10 | 11 | 'pdf': [ 12 | binary_extractor(['pdftohtml', '-noframes', '-i', '-stdout'], error='PDF file is damaged', output_type="html"), 13 | binary_extractor('pdftotext', append=['-'], error='PDF file is damaged'), 14 | ], 15 | 16 | 'msw8': [ 17 | binary_extractor('antiword', error='is not a Word Document'), 18 | binary_extractor('catdoc', error='The document does not have a content file of type') # not really an error, but catdoc happily regurgitates whatever you throw at it 19 | ], 20 | 21 | 'rtf': [ 22 | binary_extractor('unrtf', error='Warning: No stack to get attribute from', output_type="html"), 23 | binary_extractor('catdoc', error='The document does not have a content file of type') # not really an error, as above 24 | ], 25 | 26 | 'txt': [ 27 | binary_extractor('cat', error='The document does not have a content file of type') # not really an error, as above 28 | ], 29 | 30 | 'msw12': [ 31 | script_extractor('extract_docx.py', error='Failed to decode file') 32 | ], 33 | 34 | 'wp8': [ 35 | binary_extractor('wpd2text', error='ERROR') 36 | ], 37 | } 38 | 39 | EXTRACTORS['crtext'] = EXTRACTORS['xml'] 40 | EXTRACTORS['html'] = EXTRACTORS['xml'] 41 | EXTRACTORS['msw6'] = EXTRACTORS['msw8'] 42 | EXTRACTORS['msw'] = EXTRACTORS['msw8'] 43 | EXTRACTORS['xpdf'] = EXTRACTORS['pdf'] + [pdf_ocr] 44 | 45 | # extractor factory 46 | def _get_extractor(status_func, verbose, filename, filetype=None, record=None): 47 | def extract(): 48 | local_filetype = filetype if filetype else filename.split('.')[-1] 49 | if local_filetype in EXTRACTORS: 50 | success = False 51 | error_message = None 52 | used_ocr = False 53 | output_type = "text" 54 | for extractor in EXTRACTORS[local_filetype]: 55 | try: 56 | output = extractor(filename) 57 | except ExtractionFailed as failure: 58 | reason = str(failure) 59 | error_message = 'Failed to extract from %s using %s%s' % ( 60 | filename, 61 | extractor.__str__(), 62 | ' %s' % reason if reason 
else '' 63 | ) 64 | if verbose: print error_message 65 | continue 66 | except ChildTimeout as failure: 67 | error_message = 'Failed extracting from %s using %s due to timeout' % ( 68 | filename, 69 | extractor.__str__() 70 | ) 71 | if verbose: print error_message 72 | continue 73 | 74 | success = True 75 | text = unicode(remove_control_chars(output), 'utf-8', 'ignore') 76 | used_ocr = getattr(extractor, 'ocr', False) 77 | output_type = getattr(extractor, 'output_type', 'text') 78 | if verbose: print 'Extracted text from %s using %s' % ( 79 | filename, 80 | extractor.__str__() 81 | ) 82 | 83 | break 84 | 85 | status_func( 86 | (success, error_message), 87 | text if success else None, 88 | filename, 89 | local_filetype, 90 | output_type, 91 | used_ocr, 92 | record 93 | ) 94 | else: 95 | status_func( 96 | (False, "no extractor for type %s" % local_filetype), 97 | None, 98 | filename, 99 | local_filetype, 100 | "text", 101 | False, 102 | record 103 | ) 104 | return extract 105 | 106 | def bulk_extract(extract_iterable, status_func=None, verbose=False): 107 | from gevent.pool import Pool 108 | workers = Pool(getattr(settings, 'EXTRACTORS', 2)) 109 | 110 | # keep the extractors busy with tasks as long as there are more results 111 | for extract_record in extract_iterable: 112 | workers.spawn(_get_extractor(status_func, verbose, *extract_record)) 113 | 114 | workers.join() 115 | 116 | return 117 | 118 | def mp_bulk_extract(extract_iterable, status_func=None, verbose=False): 119 | import multiprocessing 120 | from Queue import Empty 121 | 122 | num_workers = getattr(settings, 'EXTRACTORS', multiprocessing.cpu_count()) 123 | todo_queue = multiprocessing.JoinableQueue(num_workers * 3) 124 | 125 | def worker(todo_queue): 126 | while True: 127 | try: 128 | extract_record = todo_queue.get() 129 | except Empty: 130 | return 131 | 132 | _get_extractor(status_func, verbose, *extract_record)() 133 | todo_queue.task_done() 134 | 135 | processes = [] 136 | for i in range(num_workers): 137 | proc = multiprocessing.Process(target=worker, args=(todo_queue,)) 138 | proc.start() 139 | processes.append(proc) 140 | 141 | for extract_record in extract_iterable: 142 | todo_queue.put(extract_record) 143 | 144 | todo_queue.join() 145 | 146 | for proc in processes: 147 | proc.terminate() 148 | 149 | return 150 | 151 | def serial_bulk_extract(extract_iterable, status_func=None, verbose=False): 152 | import subprocess 153 | 154 | for extract_record in extract_iterable: 155 | _get_extractor(status_func, verbose, *extract_record)() 156 | 157 | return -------------------------------------------------------------------------------- /regscrape/regs_common/gevent_mongo.py: -------------------------------------------------------------------------------- 1 | __author__ = "Andrey Nikishaev" 2 | __email__ = "creotiv@gmail.com" 3 | 4 | import pymongo, sys, os  # os.getpid() is used by the pool methods below 5 | from gevent.queue import Queue 6 | 7 | class GeventMongoPool(object): 8 | """ 9 | Rewritten connection pool for working with global connections.
10 | """ 11 | 12 | # Non thread-locals 13 | __slots__ = ["sockets", "socket_factory"] 14 | sock = None 15 | 16 | def __init__(self, socket_factory): 17 | self.socket_factory = socket_factory 18 | if not hasattr(self, "sockets"): 19 | self.sockets = [] 20 | 21 | def socket(self): 22 | # we store the pid here to avoid issues with fork / 23 | # multiprocessing - see 24 | # test.test_connection:TestConnection.test_fork for an example 25 | # of what could go wrong otherwise 26 | pid = os.getpid() 27 | 28 | if self.sock is not None and self.sock[0] == pid: 29 | return self.sock[1] 30 | 31 | try: 32 | self.sock = (pid, self.sockets.pop()) 33 | except IndexError: 34 | self.sock = (pid, self.socket_factory()) 35 | 36 | return self.sock[1] 37 | 38 | def return_socket(self): 39 | if self.sock is not None and self.sock[0] == os.getpid(): 40 | self.sockets.append(self.sock[1]) 41 | self.sock = None 42 | 43 | pymongo.connection.Pool = GeventMongoPool 44 | 45 | class MongoConnection(object): 46 | """Memcache pool auto-destruct connection""" 47 | def __init__(self,pool,conn): 48 | self.pool = pool 49 | self.conn = conn 50 | 51 | def getDB(self): 52 | return self.conn 53 | 54 | def __getattr__(self, name): 55 | return getattr(self.conn, name) 56 | 57 | def __getitem__(self, name): 58 | return self.conn[name] 59 | 60 | def __del__(self): 61 | self.pool.queue.put(self.conn) 62 | del self.pool 63 | del self.conn 64 | 65 | class Mongo(object): 66 | """MongoDB Pool""" 67 | def __new__(cls,db_name,size=5,*args,**kwargs): 68 | if not hasattr(cls,'_instance'): 69 | # use your own config library 70 | cls._instance = object.__new__(cls) 71 | cls._instance.queue = Queue(size) 72 | for x in xrange(size): 73 | try: 74 | # use your own config library 75 | cls._instance.queue.put( 76 | pymongo.Connection(*args,**kwargs)[db_name] 77 | ) 78 | except: 79 | sys.exc_clear() 80 | error('Can\'t connect to mongo servers') 81 | 82 | return cls._instance 83 | 84 | def get_conn(self,block=True,timeout=None): 85 | """Get Mongo connection wrapped in MongoConnection object""" 86 | obj = MongoConnection 87 | return obj(self,self.queue.get(block,timeout)) 88 | -------------------------------------------------------------------------------- /regscrape/regs_common/mp_types.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import RLock 2 | import multiprocessing.sharedctypes 3 | import ctypes 4 | 5 | class SynchronizedCounter(multiprocessing.sharedctypes.Synchronized): 6 | def increment(self, amount=1): 7 | self.acquire() 8 | try: 9 | self._obj.value += amount 10 | finally: 11 | self.release() 12 | 13 | def Counter(): 14 | value = multiprocessing.sharedctypes.RawValue(ctypes.c_uint) 15 | return SynchronizedCounter(value, RLock()) -------------------------------------------------------------------------------- /regscrape/regs_common/processing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from bson.code import Code 4 | from pymongo.errors import OperationFailure, InvalidDocument 5 | import subprocess, os, urlparse, json 6 | from gevent import Timeout 7 | from regs_models import * 8 | from exceptions import ExtractionFailed, ChildTimeout 9 | import os 10 | import re 11 | import cStringIO 12 | import time 13 | import itertools 14 | import sys 15 | import regs_common 16 | import operator 17 | import zlib 18 | import settings 19 | 20 | def find_views(**params): 21 | db = Doc._get_db() 22 | 23 | # allow for using a 
pre-filter to speed up execution 24 | kwargs = {} 25 | query = {} 26 | if 'query' in params: 27 | query = params['query'] 28 | del params['query'] 29 | 30 | # create the actual map function 31 | conditions = dict([('views.%s' % item[0], item[1]) for item in params.items()]) 32 | conditions.update(query) 33 | 34 | results = itertools.chain.from_iterable( 35 | itertools.imap( 36 | lambda doc: [{'view': View._from_son(view), 'doc': doc['_id']} for view in doc['views'] if all(item[0] in view and view[item[0]] == item[1] for item in params.items())], 37 | db.docs.find(conditions) 38 | ) 39 | ) 40 | 41 | return results 42 | 43 | def find_attachment_views(**params): 44 | db = Doc._get_db() 45 | 46 | # allow for using a pre-filter to speed up execution 47 | kwargs = {} 48 | query = {} 49 | if 'query' in params: 50 | query = params['query'] 51 | del params['query'] 52 | 53 | # create the actual map function 54 | conditions = dict([('attachments.views.%s' % item[0], item[1]) for item in params.items()]) 55 | conditions.update(query) 56 | 57 | results = itertools.chain.from_iterable( 58 | itertools.imap( 59 | lambda doc: reduce(operator.add, [ 60 | [ 61 | {'view': View._from_son(view), 'doc': doc['_id'], 'attachment': attachment['object_id']} 62 | for view in attachment['views'] if all(item[0] in view and view[item[0]] == item[1] for item in params.items()) 63 | ] for attachment in doc['attachments'] 64 | ] if 'attachments' in doc else [], []), 65 | db.docs.find(conditions) 66 | ) 67 | ) 68 | 69 | return results 70 | 71 | def update_view(doc, view): 72 | # use db object from thread pool 73 | db = Doc._get_db() 74 | 75 | # can't figure out a way to do this atomically because of bug SERVER-1050 76 | # remove the old version of the view 77 | db.docs.update({ 78 | '_id': doc 79 | }, 80 | { 81 | '$pull': {"views": {"url": view.url}} 82 | }, safe=True) 83 | 84 | # add the new one back 85 | db.docs.update({ 86 | '_id': doc 87 | }, 88 | { 89 | '$push': {"views": view.to_mongo()} 90 | }, safe=True) 91 | 92 | # return it to the pool 93 | del db 94 | 95 | def update_attachment_view(doc, attachment, view): 96 | db = Doc._get_db() 97 | 98 | # two-stage push/pull as above 99 | db.docs.update({ 100 | '_id': doc, 101 | 'attachments.object_id': attachment 102 | }, 103 | { 104 | '$pull': {'attachments.$.views': {'url': view.url}} 105 | }, safe=True) 106 | 107 | db.docs.update({ 108 | '_id': doc, 109 | 'attachments.object_id': attachment 110 | }, 111 | { 112 | '$push': {'attachments.$.views': view.to_mongo()} 113 | }, safe=True) 114 | 115 | 116 | del db 117 | 118 | 119 | # the following is from http://stackoverflow.com/questions/377017/test-if-executable-exists-in-python 120 | def which(program): 121 | import os 122 | def is_exe(fpath): 123 | return os.path.exists(fpath) and os.access(fpath, os.X_OK) 124 | 125 | fpath, fname = os.path.split(program) 126 | if fpath: 127 | if is_exe(program): 128 | return program 129 | else: 130 | for path in os.environ["PATH"].split(os.pathsep): 131 | exe_file = os.path.join(path, program) 132 | if is_exe(exe_file): 133 | return exe_file 134 | 135 | return None 136 | 137 | # the following is from http://stackoverflow.com/questions/92438/stripping-non-printable-characters-from-a-string-in-python 138 | import unicodedata, re 139 | 140 | control_chars = ''.join(map(unichr, range(0,10) + range(11,13) + range(14,32) + range(127,160))) 141 | 142 | control_char_re = re.compile('[%s]' % re.escape(control_chars)) 143 | 144 | def remove_control_chars(s): 145 | return control_char_re.sub('', s) 
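# --- illustrative sketch (hypothetical helper, not used elsewhere in this module) ---
# The query helpers above (find_views / find_attachment_views) are normally paired with
# update_view / update_attachment_view: iterate over the views that match some flag,
# mutate the View object, and write it back with the two-stage $pull/$push update.
# A minimal sketch of that pattern, assuming the Doc/View models from regs_models:
def _example_requeue_failed_views():
    # find every view whose download previously failed
    for result in find_views(downloaded='failed'):
        view = result['view']
        # flip the flag back so the downloader will pick it up again
        view.downloaded = 'no'
        # persist the change via the pull-then-push update defined above
        update_view(result['doc'], view)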
146 | 147 | # extractor 148 | POPEN = subprocess.Popen 149 | _nbsp = re.compile('( ?| ?| ?)') 150 | def binary_extractor(binary, error=None, append=[], output_type="text"): 151 | if not type(binary) == list: 152 | binary = [binary] 153 | def extractor(filename): 154 | interpreter = POPEN(binary + [filename] + append, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 155 | 156 | timeout = Timeout(getattr(settings, 'EXTRACTION_TIMEOUT', 120), ChildTimeout) 157 | timeout.start() 158 | try: 159 | output, run_error = interpreter.communicate('') 160 | timeout.cancel() 161 | except ChildTimeout: 162 | print 'killing %s' % filename 163 | interpreter.kill() 164 | raise 165 | 166 | if (output_type == 'text' and not output.strip()) or (output_type == 'html' and html_is_empty(output)) or (error and (error in output or error in run_error)): 167 | raise ExtractionFailed() 168 | elif output_type == 'html': 169 | # strip non-breaking spaces 170 | return _nbsp.sub(' ', output) 171 | else: 172 | return output 173 | 174 | extractor.__str__ = lambda: binary[0] 175 | extractor.output_type = output_type 176 | 177 | return extractor 178 | 179 | def script_extractor(script, error=None, output_type="text"): 180 | script_path = os.path.join(os.path.dirname(os.path.abspath(regs_common.__file__)), 'scripts', script) 181 | 182 | extractor = binary_extractor([sys.executable, script_path], error=error, output_type=output_type) 183 | extractor.__str__ = lambda: script 184 | 185 | return extractor 186 | 187 | _tag_stripper = re.compile(r'<[^>]*?>') 188 | def strip_tags(text): 189 | return _tag_stripper.sub('', text) 190 | 191 | _body_finder = re.compile(r"<body[^>]*>(.*)</body>", re.I | re.DOTALL) 192 | _outline_finder = re.compile(r'<a name="outline"></a>\s*<h1>Document Outline</h1>\s*<ul>.*</ul>', re.I | re.DOTALL) 193 | def html_is_empty(text): 194 | # grab the body 195 | body = _body_finder.findall(text) 196 | if not body: 197 | return True 198 | 199 | # explicitly strip out pdftohtml's document outlines 200 | without_outline = _outline_finder.sub("", body[0]) 201 | 202 | body_text = strip_tags(without_outline).strip() 203 | if not body_text: 204 | return True 205 | 206 | return False 207 | 208 | def ocr_scrub(text): 209 | lines = re.split(r'\n', text) 210 | garbage = re.compile(r'[^a-zA-Z\s]') 211 | 212 | def is_real_line(word): 213 | letter_length = len(garbage.sub('', word)) 214 | return letter_length and len(word) and letter_length/float(len(word)) >= 0.5 215 | 216 | filtered_lines = [line.strip() for line in lines if line and is_real_line(line)] 217 | filtered_text = '\n'.join(filtered_lines) 218 | 219 | if len(filtered_text) / float(len(text)) < 0.5: 220 | raise ExtractionFailed('This is does not appear to be text.') 221 | 222 | return filtered_text 223 | 224 | def pdf_ocr(filename): 225 | basename = os.path.basename(filename).split('.')[0] 226 | working = '/tmp/%s' % basename 227 | if not os.path.exists(working): 228 | os.mkdir(working) 229 | os.chdir(working) 230 | 231 | def cleanup(): 232 | if working and working != '/tmp/': 233 | os.chdir('..') 234 | subprocess.Popen(['rm', '-rf', working], stdout=subprocess.PIPE).communicate() 235 | 236 | extractor = subprocess.Popen(['pdfimages', filename, basename], stdout=subprocess.PIPE, stderr=subprocess.PIPE) 237 | extractor_output, extractor_error = extractor.communicate() 238 | if extractor_error: 239 | cleanup() 240 | raise ExtractionFailed("Failed to extract image data from PDF.") 241 | 242 | pnm_match = re.compile(r"[a-zA-Z0-9]+-[0-9]+\.p.m") 
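# Note: pdfimages names its output "<basename>-NNN.pbm", ".pgm", or ".ppm" depending on
# whether the embedded image is monochrome, grayscale, or color; the ".p.m" wildcard in
# the pattern above lets a single regex match all three extensions.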
243 | pnms = [file for file in os.listdir(working) if pnm_match.match(file)] 244 | if not pnms: 245 | cleanup() 246 | raise ExtractionFailed("No images found in PDF.") 247 | 248 | converter = subprocess.Popen(['gm', 'mogrify', '-format', 'tiff', '-type', 'Grayscale'] + pnms, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 249 | converter_output, converter_error = converter.communicate() 250 | if converter_error: 251 | cleanup() 252 | raise ExtractionFailed("Failed to convert images to tiff.") 253 | 254 | tiff_match = re.compile(r"[a-zA-Z0-9]+-[0-9]+\.tiff") 255 | tiffs = [file for file in os.listdir(working) if tiff_match.match(file)] 256 | if not tiffs: 257 | cleanup() 258 | raise ExtractionFailed("Converted tiffs not found.") 259 | 260 | out = cStringIO.StringIO() 261 | for tiff in tiffs: 262 | tiff_base = tiff.split('.')[0] 263 | ocr = subprocess.Popen(['tesseract', tiff, tiff_base], stdout=subprocess.PIPE, stderr=subprocess.PIPE) 264 | ocr_output, ocr_error = ocr.communicate() 265 | 266 | txt_match = re.compile(r"[a-zA-Z0-9]+-[0-9]+\.txt") 267 | txts = [file for file in os.listdir(working) if txt_match.match(file)] 268 | if not txts: 269 | cleanup() 270 | raise ExtractionFailed("OCR failed to find any text.") 271 | 272 | for txt in txts: 273 | ocr_file = open(txt, 'r') 274 | out.write(ocr_file.read()) 275 | out.write('\n') 276 | 277 | try: 278 | return_data = ocr_scrub(out.getvalue()) 279 | except ExtractionFailed: 280 | cleanup() 281 | raise 282 | 283 | cleanup() 284 | return return_data 285 | pdf_ocr.__str__ = lambda: 'tesseract' 286 | pdf_ocr.ocr = True 287 | -------------------------------------------------------------------------------- /regscrape/regs_common/scripts/extract_docx.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # extracts text from docx files using the docx module by Mike MacCana 4 | 5 | from docx import * 6 | import sys 7 | 8 | if __name__ == '__main__': 9 | try: 10 | document = opendocx(sys.argv[1]) 11 | except: 12 | sys.stderr.write('Failed to decode file\n') 13 | exit() 14 | 15 | ## Fetch all the text out of the document we just created 16 | paratextlist = getdocumenttext(document) 17 | 18 | # Make explicit unicode version 19 | newparatextlist = [] 20 | for paratext in paratextlist: 21 | newparatextlist.append(paratext.encode("utf-8")) 22 | 23 | ## Print our document's text with two newlines under each paragraph 24 | sys.stdout.write('\n\n'.join(newparatextlist)) -------------------------------------------------------------------------------- /regscrape/regs_common/scripts/process_fr_docs.rb: -------------------------------------------------------------------------------- 1 | require 'us-documents' 2 | puts UnitedStates::Documents::FederalRegister.process STDIN.read -------------------------------------------------------------------------------- /regscrape/regs_common/tmp_redis.py: -------------------------------------------------------------------------------- 1 | try: 2 | import settings 3 | except ImportError: 4 | settings = object() 5 | 6 | import uuid, os, subprocess, time, shutil 7 | 8 | class TmpRedis(object): 9 | REDIS_CONFIG = {'daemonize': 'no', 'pidfile': '{path}/redis.pid', 'port': '0', 'bind': '127.0.0.1', 'unixsocket': '{path}/redis.sock', 'timeout': '300', 'loglevel': 'warning', 'logfile': 'stdout', 'databases': '1', '' : 'save 900 1\nsave 300 10\nsave 60 10000', 'rdbcompression': 'yes', 'dbfilename': 'dump.rdb', 'dir': '{path}/data', 'slave-serve-stale-data': 'yes',
'appendonly': 'no', 'appendfsync': 'everysec', 'no-appendfsync-on-rewrite': 'no', 'vm-enabled': 'no', 'vm-swap-file': '{path}/redis.swap', 'vm-max-memory': '0', 'vm-page-size': '32', 'vm-pages': '134217728', 'vm-max-threads': '4', 'hash-max-zipmap-entries': '512', 'hash-max-zipmap-value': '64', 'list-max-ziplist-entries': '512', 'list-max-ziplist-value': '64', 'set-max-intset-entries': '512', 'activerehashing': 'yes'} 10 | 11 | def get_config(self, **kwargs): 12 | return '\n'.join([' '.join(option).strip() for option in self.REDIS_CONFIG.items()]).format(**kwargs) 13 | 14 | def __init__(self, db_uuid=None): 15 | self.uuid = db_uuid if db_uuid else uuid.uuid4().__str__() 16 | 17 | redis_base = getattr(settings, 'REDIS_BASE', '/mnt/redis') 18 | redis_dir = os.path.join(redis_base, self.uuid) 19 | 20 | try: 21 | os.mkdir(redis_dir) 22 | os.mkdir(os.path.join(redis_dir, 'data')) 23 | except OSError: 24 | pass 25 | 26 | self.config = os.path.join(redis_dir, 'redis.conf') 27 | config_file = open(self.config, 'w') 28 | config_file.write(self.get_config(path=redis_dir)) 29 | config_file.close() 30 | 31 | self.directory = redis_dir 32 | self.socket = os.path.join(redis_dir, 'redis.sock') 33 | self.process = subprocess.Popen(['redis-server', self.config], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) 34 | 35 | time.sleep(1) 36 | 37 | def get_connection(self): 38 | from redis import Redis 39 | return Redis(unix_socket_path=self.socket) 40 | 41 | def get_pickle_connection(self): 42 | # define an inner class so that we don't have to import redis until we try to get a connection 43 | import cPickle 44 | from redis import Redis 45 | 46 | class PickleRedis(Redis): 47 | def get(self, key): 48 | data = super(PickleRedis, self).get(key) 49 | return cPickle.loads(data) if data else data 50 | 51 | def set(self, key, value): 52 | return super(PickleRedis, self).set(key, cPickle.dumps(value, -1)) 53 | 54 | return PickleRedis(unix_socket_path=self.socket) 55 | 56 | def terminate(self, delete=True): 57 | self.process.terminate() 58 | time.sleep(1) 59 | 60 | if self.process.poll() is None: 61 | self.process.kill() 62 | 63 | if delete: 64 | shutil.rmtree(self.directory) 65 | -------------------------------------------------------------------------------- /regscrape/regs_common/transfer.py: -------------------------------------------------------------------------------- 1 | import urllib2, urllib3 2 | import subprocess, os, re 3 | from gevent.pool import Pool 4 | from gevent import Timeout 5 | import greenlet 6 | import settings 7 | import datetime 8 | import sys 9 | import traceback 10 | import time 11 | 12 | def pump(input, output, chunk_size): 13 | size = 0 14 | while True: 15 | chunk = input.read(chunk_size) 16 | if not chunk: break 17 | output.write(chunk) 18 | size += len(chunk) 19 | return size 20 | 21 | def download(url, output_file, post_data=None, headers=None): 22 | transfer = urllib2.urlopen(urllib2.Request(url, post_data, headers if headers else {}), timeout=10) if type(url) in (unicode, str) else url 23 | 24 | out = open(output_file, 'wb') 25 | size = pump(transfer, out, 16 * 1024) 26 | out.close() 27 | 28 | return size 29 | 30 | def download_wget(url, output_file): 31 | proc = subprocess.Popen(['wget', '-nv', url, '-O', output_file], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, stdin=subprocess.PIPE) 32 | out = proc.communicate('') 33 | if 'URL:' in out[0] and os.path.exists(output_file): 34 | return os.stat(output_file).st_size 35 | elif 'ERROR' in out[0]: 36 | error_match =
re.match('.*ERROR (\d{3}): (.*)', out[0].strip().replace('\n', ' ')) 37 | if error_match: 38 | error_groups = error_match.groups() 39 | raise urllib2.HTTPError(url, error_groups[0], error_groups[1], {}, None) 40 | raise Exception("Something went wrong with the download.") 41 | 42 | # pooled and timed-out versions of the transfer code 43 | def tpump(input, output, chunk_size): 44 | size = 0 45 | while True: 46 | try: 47 | timeout = Timeout.start_new(5) 48 | chunk = input.read(chunk_size) 49 | timeout.cancel() 50 | except Timeout: 51 | input.release_conn() 52 | raise 53 | 54 | if not chunk: break 55 | output.write(chunk) 56 | size += len(chunk) 57 | return size 58 | 59 | def download_pooled(url, output_file): 60 | transfer = CPOOL.urlopen("GET", url, timeout=10, preload_content=False) 61 | if transfer.status != 200: 62 | raise urllib2.HTTPError(url, transfer.status, transfer.reason, transfer.headers, None) 63 | 64 | out = open(output_file, 'wb') 65 | size = tpump(transfer, out, 16 * 1024) 66 | out.close() 67 | 68 | return size 69 | 70 | def _get_downloader(status_func, download_func, retries, verbose, min_size, url, filename, record=None): 71 | def download_file(): 72 | for try_num in xrange(retries): 73 | if verbose: print 'Downloading %s (try #%d, downloader %s)...' % (url, try_num, hash(greenlet.getcurrent())) 74 | 75 | download_succeeded = False 76 | download_message = None 77 | size = 0 78 | try: 79 | start = datetime.datetime.now() 80 | size = download_func(url, filename) 81 | download_succeeded = True 82 | elapsed = datetime.datetime.now() - start 83 | except urllib2.HTTPError as e: 84 | if verbose: print 'Download of %s failed due to error %s.' % (url, e.code) 85 | download_message = e.code 86 | 87 | if int(e.code) == 429: 88 | if verbose: print 'Error occurred due to rate limiting; waiting 10 minutes.' 89 | time.sleep(600) 90 | except Timeout as e: 91 | if verbose: print 'Download of %s timed out.' % url 92 | except: 93 | exc = sys.exc_info() 94 | if verbose: print traceback.print_tb(exc[2]) 95 | 96 | if download_succeeded: 97 | if size >= min_size: 98 | # print status 99 | ksize = int(round(size/1024.0)) 100 | if verbose: print 'Downloaded %s to %s: %sk in %s seconds (%sk/sec)' % (url, filename, ksize, elapsed.seconds, round(float(ksize)/elapsed.seconds * 10)/10 if elapsed.seconds > 0 else '--') 101 | break 102 | else: 103 | download_succeeded = False 104 | download_message = "Resulting file was smaller than the minimum file size." 
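# Note: only an attempt that both succeeds and meets min_size reaches the break above;
# every other outcome (HTTP error, timeout, undersized file) falls through to the next
# retry, and status_func() below is called once with whatever the final attempt produced.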
105 | if verbose: print download_message 106 | 107 | status_func( 108 | (download_succeeded, download_message), 109 | url, 110 | filename, 111 | record 112 | ) 113 | return download_file 114 | 115 | 116 | def bulk_download(download_iterable, status_func=None, retries=3, verbose=False, min_size=0): 117 | workers = Pool(getattr(settings, 'DOWNLOADERS', 5)) 118 | 119 | # keep the downloaders busy with tasks as long as there are more results 120 | for download_record in download_iterable: 121 | workers.spawn(_get_downloader(status_func, download, retries, verbose, min_size, *download_record)) 122 | 123 | workers.join() 124 | 125 | return 126 | 127 | CPOOL = None 128 | def pooled_bulk_download(download_iterable, status_func=None, retries=5, verbose=False, min_size=0): 129 | num_downloaders = getattr(settings, 'DOWNLOADERS', 5) 130 | global CPOOL 131 | if not CPOOL: 132 | CPOOL = urllib3.PoolManager(num_pools=2, maxsize=num_downloaders * 2) 133 | 134 | workers = Pool(num_downloaders) 135 | 136 | # keep the downloaders busy with tasks as long as there are more results 137 | for download_record in download_iterable: 138 | workers.spawn(_get_downloader(status_func, download_pooled, retries, verbose, min_size, *download_record)) 139 | 140 | workers.join() 141 | 142 | return -------------------------------------------------------------------------------- /regscrape/regs_common/util.py: -------------------------------------------------------------------------------- 1 | import time 2 | import settings 3 | from pymongo import Connection 4 | import os 5 | from gevent_mongo import Mongo 6 | import urllib2 7 | import subprocess 8 | import re 9 | import hashlib, crockford 10 | 11 | def get_db(): 12 | db_settings = getattr(settings, 'DB_SETTINGS', {}) 13 | return Mongo(getattr(settings, 'DB_NAME', 'regulations'), settings.INSTANCES + 2, **db_settings).get_conn() 14 | 15 | def bootstrap_settings(): 16 | if not getattr(settings, 'DOWNLOAD_DIR', False): 17 | settings.DOWNLOAD_DIR = os.path.join(settings.DATA_DIR, 'downloads') 18 | 19 | if not getattr(settings, 'DUMP_DIR', False): 20 | settings.DUMP_DIR = os.path.join(settings.DATA_DIR, 'dumps') 21 | 22 | def listify(item): 23 | if not item: 24 | return [] 25 | if type(item) in (str, unicode, dict): 26 | return [item] 27 | return list(item) 28 | 29 | def crockford_hash(s): 30 | h = hashlib.md5(s) 31 | return crockford.b32encode(h.digest()) -------------------------------------------------------------------------------- /regscrape/regsdotgov/__init__.py: -------------------------------------------------------------------------------- 1 | # add self to path 2 | import sys 3 | import os 4 | CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) 5 | if CURRENT_DIR not in sys.path: 6 | sys.path.append(CURRENT_DIR) -------------------------------------------------------------------------------- /regscrape/regsdotgov/commands/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunlightlabs/regulations-scraper/5f2644a3cf54f915d7d90957645073737ab91022/regscrape/regsdotgov/commands/__init__.py -------------------------------------------------------------------------------- /regscrape/regsdotgov/commands/rdg_create_agencies.py: -------------------------------------------------------------------------------- 1 | GEVENT = False 2 | 3 | def run(): 4 | from regs_models import Agency 5 | import subprocess, re, urllib2 6 | 7 | BAD_SPACES = re.compile("(\xc2\xa0|\r)") 8 | AGENCY_LINE = 
re.compile(r"^[A-Z\s\.\,\&\-\'\(\)\/]*[A-Z]+[A-Z\s\(\)]*$") 9 | REGULAR_LINE = re.compile(r"^[A-Z]{2,}\s{3,}[A-Z]+.*$") 10 | AGENCY_ONLY_LINE = re.compile(r"^[A-Z]{2,}\s*$") 11 | DESCRIPTION_ONLY_LINE = re.compile(r"^\s{3,}[A-Z]+.*$") 12 | THREE_SPACES = re.compile("\s{3,}") 13 | SPACES = re.compile(r"\s+") 14 | AMPERSAND = re.compile(r"(?<=[A-Z])\&") 15 | 16 | new = 0 17 | 18 | print 'Fetching agencies...' 19 | agencies = {} 20 | 21 | ml_descs = [] 22 | ml_agency = None 23 | 24 | participating = {} 25 | 26 | for file in ["Participating_Agencies.pdf", "Non_Participating_Agencies.pdf"]: 27 | data = urllib2.urlopen("http://www.regulations.gov/docs/%s" % file) 28 | pdftotext = subprocess.Popen(["pdftotext", "-layout", "-", "-"], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 29 | text = pdftotext.communicate(data.read()) 30 | 31 | contents = BAD_SPACES.sub(" ", text[0]) 32 | 33 | agency_lines = [line for line in contents.split("\n") if AGENCY_LINE.match(line)] 34 | 35 | for line in agency_lines: 36 | if REGULAR_LINE.match(line): 37 | split = THREE_SPACES.split(line, maxsplit=1) 38 | a_name = split[0].strip() 39 | a_desc = split[1].strip() 40 | 41 | agencies[a_name] = a_desc 42 | participating[a_name] = "Non" not in file 43 | elif AGENCY_ONLY_LINE.match(line): 44 | ml_agency = line.strip() 45 | elif DESCRIPTION_ONLY_LINE.match(line): 46 | ml_descs.append(line.strip()) 47 | if ml_agency: 48 | agencies[ml_agency] = " ".join(ml_descs) 49 | participating[ml_agency] = "Non" not in file 50 | ml_agency = None 51 | ml_descs = [] 52 | else: 53 | print "Broken line:", line 54 | 55 | # hard-coded SIGAR, because it's messed up in the PDF 56 | agencies["SIGAR"] = "SPECIAL INSPECTOR GENERAL FOR AFGHANISTAN RECONSTRUCTION" 57 | participating["SIGAR"] = False 58 | 59 | print 'Saving agencies...' 60 | 61 | stop_words = ['the', 'and', 'of', 'on', 'in', 'for'] 62 | for agency, name in agencies.items(): 63 | # fix ampersand weirdness 64 | name = AMPERSAND.sub(" & ", name) 65 | 66 | # fix spacing and capitalization 67 | name_parts = SPACES.split(name) 68 | capitalized_parts = [name_parts[0].title()] + [word.title() if word.lower() not in stop_words else word.lower() for word in name_parts[1:]] 69 | name = ' '.join(capitalized_parts) 70 | 71 | new += Agency.objects(id=agency).update( 72 | set__name=name, 73 | set__rdg_participating=participating[agency], 74 | 75 | upsert=True, 76 | safe_update=True 77 | ) 78 | 79 | print 'Iterated over %s agencies.' 
% (len(agencies)) 80 | 81 | return {'total': len(agencies), 'new': new} 82 | -------------------------------------------------------------------------------- /regscrape/regsdotgov/commands/rdg_download.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import settings 4 | MIN_SIZE = getattr(settings, 'MIN_DOWNLOAD_SIZE', 512) 5 | 6 | from optparse import OptionParser 7 | arg_parser = OptionParser() 8 | arg_parser.add_option("-a", "--agency", dest="agency", action="store", type="string", default=None, help="Specify an agency to which to limit the dump.") 9 | arg_parser.add_option("-d", "--docket", dest="docket", action="store", type="string", default=None, help="Specify a docket to which to limit the dump.") 10 | 11 | def run(options, args): 12 | # global imports hack so we don't mess up gevent loading 13 | global pooled_bulk_download, settings, subprocess, os, urlparse, sys, traceback, datetime, pymongo, hashlib 14 | from regs_common.processing import find_views, update_view, find_attachment_views, update_attachment_view 15 | from regs_common.transfer import pooled_bulk_download 16 | import subprocess, os, urlparse, sys, traceback, datetime, hashlib 17 | import pymongo 18 | 19 | # ensure that our hash directories are all there 20 | for hex_dir in [hex(i).split('x').pop().zfill(2) for i in range(256)]: 21 | dir_path = os.path.join(settings.DOWNLOAD_DIR, hex_dir) 22 | if not os.path.exists(dir_path): 23 | os.mkdir(dir_path) 24 | 25 | return { 26 | 'document_views': run_for_view_type('document views', find_views, update_view, options), 27 | 'attachment_views': run_for_view_type('attachment views', find_attachment_views, update_attachment_view, options) 28 | } 29 | 30 | def run_for_view_type(view_label, find_func, update_func, options): 31 | print 'Preparing download of %s.' 
% view_label 32 | 33 | query = {'deleted': False} 34 | if options.agency: 35 | query['agency'] = options.agency 36 | if options.docket: 37 | query['docket_id'] = options.docket 38 | 39 | views = find_func(downloaded="no", query=query) 40 | 41 | # track stats -- no locks because yay for cooperative multitasking 42 | stats = {'downloaded': 0, 'failed': 0} 43 | 44 | # hack around stupid Python closure behavior 45 | v_array = [views] 46 | def download_generator(): 47 | while True: 48 | try: 49 | result = v_array[0].next() 50 | 51 | save_hash = hashlib.md5(result['view'].url).hexdigest() 52 | save_name = '%s.%s' % (result['view'].object_id if result['view'].object_id else save_hash, result['view'].type) 53 | save_path = os.path.join(settings.DOWNLOAD_DIR, save_hash[:2], save_name) 54 | 55 | fetch_url = result['view'].url 56 | if "api.data.gov/regulations/v3/download" in fetch_url and "api_key" not in fetch_url: 57 | # this requires an API key but one wasn't included in the upstream-provided URL, so add one 58 | fetch_url = fetch_url + "&api_key=" + settings.DDG_API_KEY 59 | 60 | yield (fetch_url, save_path, result) 61 | except pymongo.errors.OperationFailure: 62 | # occasionally pymongo seems to lose track of the cursor for some reason, so reset the query 63 | v_array[0] = find_func(downloaded="no", query=query) 64 | continue 65 | except StopIteration: 66 | break 67 | 68 | def status_func(status, url, filename, result): 69 | if status[0]: 70 | result['view'].downloaded = "yes" 71 | result['view'].file_path = filename 72 | stats['downloaded'] += 1 73 | else: 74 | result['view'].downloaded = "failed" 75 | stats['failed'] += 1 76 | update_func(**result) 77 | 78 | pooled_bulk_download(download_generator(), status_func, verbose=not options.parsable, min_size=MIN_SIZE) 79 | 80 | print 'Done with %s.' 
% view_label 81 | 82 | return stats 83 | 84 | if __name__ == "__main__": 85 | run() 86 | -------------------------------------------------------------------------------- /regscrape/regsdotgov/commands/rdg_dump_api.py: -------------------------------------------------------------------------------- 1 | from optparse import OptionParser 2 | arg_parser = OptionParser() 3 | arg_parser.add_option("-a", "--agency", dest="agency", action="store", type="string", default=None, help="Specify an agency to which to limit the dump.") 4 | arg_parser.add_option("-d", "--docket", dest="docket", action="store", type="string", default=None, help="Specify a docket to which to limit the dump.") 5 | 6 | def run(options, args): 7 | import urllib2, httplib 8 | import settings 9 | import os, time, sys 10 | from regsdotgov.search import search, parsed_search 11 | from regs_common.transfer import download 12 | 13 | search_args = { 14 | # order ascending by posted date to reduce pagination errors 15 | "sb": "postedDate", 16 | "so": "ASC" 17 | } 18 | id_string = 'all' 19 | if options.agency and options.docket: 20 | raise Exception("Specify either an agency or a docket") 21 | elif options.agency: 22 | search_args['agency'] = options.agency 23 | id_string = 'agency_' + options.agency 24 | elif options.docket: 25 | search_args['docket'] = options.docket 26 | id_string = 'docket_' + options.docket.replace('-', '_') 27 | 28 | # delete old dumps 29 | [os.unlink(os.path.join(settings.DUMP_DIR, file)) for file in os.listdir(settings.DUMP_DIR) if file.startswith('dump_%s' % id_string) and file.endswith('.json')] 30 | 31 | # keep stats 32 | stats = {'downloaded': 0, 'failed': 0} 33 | 34 | # start new dumps 35 | position = 0 36 | increment = 1000 37 | total = parsed_search(1, 0, **search_args)['totalNumRecords'] 38 | num_digits = len(str(settings.DUMP_END)) 39 | while position <= total: 40 | for i in range(3): 41 | try: 42 | current_str = (position / increment) + 1 43 | total_str = '?' if total == 1 else (total / increment) + 1 44 | print "Downloading page %s of %s..." % (current_str, total_str) 45 | download( 46 | search(increment, position, **search_args), 47 | os.path.join(settings.DUMP_DIR, 'dump_%s_%s.json' % (id_string, str(position).zfill(num_digits))), 48 | ) 49 | stats['downloaded'] += 1 50 | break 51 | except (urllib2.HTTPError, httplib.HTTPException) as e: 52 | if i < 2: 53 | if hasattr(e, 'code') and e.code in (503, 429) and 'rate' in e.read().lower(): 54 | print 'Download failed because of rate limiting; will retry in an hour...' 55 | time.sleep(3600) 56 | else: 57 | print 'Download failed; will retry in 10 seconds...' 58 | time.sleep(10) 59 | else: 60 | print 'System troubles; giving up.' 
61 | raise 62 | 63 | position += increment 64 | 65 | return stats 66 | -------------------------------------------------------------------------------- /regscrape/regsdotgov/commands/rdg_parse_api.py: -------------------------------------------------------------------------------- 1 | GEVENT = False 2 | 3 | import os 4 | import settings 5 | import sys 6 | from search import parse, iter_parse, result_to_model 7 | import pytz 8 | import datetime 9 | import operator 10 | import time 11 | import json 12 | import re 13 | from regs_common.tmp_redis import TmpRedis 14 | from regs_common.mp_types import Counter 15 | from regs_common.util import listify 16 | from regsdotgov.document import make_view 17 | from regs_models import * 18 | 19 | 20 | import multiprocessing 21 | from Queue import Empty 22 | 23 | from optparse import OptionParser 24 | arg_parser = OptionParser() 25 | arg_parser.add_option("-m", "--multi", dest="multi", action="store", type="int", default=multiprocessing.cpu_count(), help="Set number of worker processes. Defaults to number of cores if not specified.") 26 | arg_parser.add_option("-k", "--keep-cache", dest="keep_cache", action="store_true", default=False, help="Prevents the cache from being deleted at the end of processing to make testing faster.") 27 | arg_parser.add_option("-u", "--use-cache", dest="use_cache", action="store", default=None, help="Use pre-existing cache to make testing faster.") 28 | arg_parser.add_option("-A", "--add-only", dest="add_only", action="store_true", default=False, help="Skip reconciliation, assume that all records are new, and go straight to the add step.") 29 | arg_parser.add_option("-a", "--agency", dest="agency", action="store", type="string", default=None, help="Specify an agency to which to limit the dump.") 30 | arg_parser.add_option("-d", "--docket", dest="docket", action="store", type="string", default=None, help="Specify a docket to which to limit the dump.") 31 | 32 | def repair_views(old_views, new_views): 33 | for new_view in new_views: 34 | already_exists = [view for view in old_views if view.type == new_view.type] 35 | if not already_exists: 36 | old_views.append(new_view) 37 | elif already_exists and already_exists[0].downloaded == 'failed': 38 | already_exists[0].downloaded = "no" 39 | 40 | def reconcile_process(record, cache, db, now, repaired_counter, updated_counter, deleted_counter): 41 | # check and see if this doc has been updated 42 | new_record = cache.get(record['_id']) 43 | if new_record: 44 | # do we need to fix anything? 
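# Note: "statuses" below is a list of lists of per-view "downloaded" flags: the first
# inner list covers the document's own views, followed by one inner list per attachment.
# The reduce(operator.add, ...) in the condition flattens it, so a single 'failed'
# anywhere (doc view or attachment view) is enough to trigger a repair.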
45 | statuses = [[view['downloaded'] for view in record.get('views', [])]] + [[view['downloaded'] for view in attachment.get('views', [])] for attachment in record.get('attachments', [])] 46 | 47 | #main_views = [make_view(format) for format in listify(new_record.get('fileFormats', []))] 48 | 49 | if record['scraped'] == 'failed' or 'failed' in reduce(operator.add, statuses, []) or (record['scraped'] == 'yes' and len(record.get('attachments', [])) != new_record.get('attachmentCount', 0)): 50 | # needs a repair; grab the full document 51 | current_docs = Doc.objects(id=record['_id']) 52 | 53 | db_doc = current_docs[0] 54 | 55 | db_doc.scraped = "no" 56 | 57 | # rebuild views 58 | #repair_views(db_doc.views, main_views) 59 | 60 | # update the last-seen date 61 | db_doc.last_seen = now 62 | 63 | # reset a couple of flags to trigger reprocessing 64 | db_doc.in_search_index = False 65 | db_doc.in_cluster_db = False 66 | db_doc.entities_last_extracted = None 67 | 68 | # do save 69 | try: 70 | db_doc.save() 71 | repaired_counter.increment() 72 | except: 73 | print "Failed to repair %s" % db_doc.id 74 | else: 75 | # we don't need a full repair, so just do an update on the date 76 | Doc.objects(id=record['_id']).update_one(set__last_seen=now) 77 | updated_counter.increment() 78 | 79 | # either way, delete the document from the cache so we can tell what's new at the end 80 | cache.delete(record['_id']) 81 | else: 82 | # this document isn't in the new data anymore, so mark it deleted 83 | Doc.objects(id=record['_id']).update_one(set__deleted=True) 84 | deleted_counter.increment() 85 | 86 | def reconcile_worker(todo_queue, cache_wrapper, now, repaired_counter, updated_counter, deleted_counter): 87 | pid = os.getpid() 88 | 89 | print '[%s] Reconciliation worker started.' % pid 90 | 91 | cache = cache_wrapper.get_pickle_connection() 92 | 93 | import pymongo 94 | db = pymongo.Connection(**settings.DB_SETTINGS)[settings.DB_NAME] 95 | 96 | while True: 97 | record = todo_queue.get() 98 | 99 | reconcile_process(record, cache, db, now, repaired_counter, updated_counter, deleted_counter) 100 | 101 | todo_queue.task_done() 102 | 103 | def add_new_docs(cache_wrapper, now): 104 | print 'Adding new documents to the database...' 105 | 106 | cache = cache_wrapper.get_pickle_connection() 107 | 108 | new = 0 109 | for id in cache.keys(): 110 | doc = cache.get(id) 111 | 112 | if doc.get('documentStatus', None) == "Withdrawn": 113 | continue 114 | 115 | db_doc = result_to_model(doc, now=now) 116 | 117 | try: 118 | db_doc.save() 119 | new += 1 120 | except: 121 | print "Failed to save document %s" % db_doc.id 122 | 123 | written = new 124 | print 'Wrote %s new documents.' 
% (written) 125 | 126 | return written 127 | 128 | def reconcile_dumps(options, cache_wrapper, now): 129 | sys.stdout.write('Reconciling dumps with current data...\n') 130 | sys.stdout.flush() 131 | 132 | # get workers going 133 | num_workers = options.multi 134 | 135 | todo_queue = multiprocessing.JoinableQueue(num_workers * 3) 136 | repaired_counter = Counter() 137 | updated_counter = Counter() 138 | deleted_counter = Counter() 139 | 140 | processes = [] 141 | for i in range(num_workers): 142 | proc = multiprocessing.Process(target=reconcile_worker, args=(todo_queue, cache_wrapper, now, repaired_counter, updated_counter, deleted_counter)) 143 | proc.start() 144 | processes.append(proc) 145 | 146 | import pymongo 147 | db = pymongo.Connection(**settings.DB_SETTINGS)[settings.DB_NAME] 148 | 149 | conditions = {'last_seen': {'$lt': now}, 'deleted': False, 'source': 'regulations.gov'} 150 | if options.agency: 151 | conditions['agency'] = options.agency 152 | if options.docket: 153 | conditions['docket_id'] = options.docket 154 | 155 | fields = {'_id': 1, 'scraped': 1, 'views.downloaded': 1, 'views.type': 1, 'attachments.views.downloaded': 1, 'attachments.views.type': 1, 'attachments.object_id': 1} 156 | to_check = db.docs.find(conditions, fields) 157 | 158 | while True: 159 | try: 160 | record = to_check.next() 161 | except pymongo.errors.OperationFailure: 162 | print 'OH NOES!' 163 | to_scrape = db.docs.find(conditions, fields) 164 | continue 165 | except StopIteration: 166 | break 167 | 168 | todo_queue.put(record) 169 | 170 | todo_queue.join() 171 | 172 | for proc in processes: 173 | print 'Terminating reconciliation worker %s...' % proc.pid 174 | proc.terminate() 175 | 176 | # compile and print some stats 177 | num_updated = updated_counter.value 178 | num_repaired = repaired_counter.value 179 | num_deleted = deleted_counter.value 180 | num_docs = num_updated + num_repaired + num_deleted 181 | print 'Reconciliation complete: examined %s documents, of which %s were updated, %s were repaired, and %s were flagged as deleted.' % (num_docs, num_updated, num_repaired, num_deleted) 182 | 183 | return {'updated': num_updated, 'repaired': num_repaired, 'deleted': num_deleted} 184 | 185 | def parser_process(file, cache): 186 | docs = iter_parse(os.path.join(settings.DUMP_DIR, file)) 187 | print '[%s] Done with JSON decode.' % os.getpid() 188 | 189 | count = 0 190 | for doc in docs: 191 | cache.set(doc['documentId'], doc) 192 | count += 1 193 | 194 | return {'docs': count} 195 | 196 | def parser_worker(todo_queue, done_queue, cache_wrapper): 197 | pid = os.getpid() 198 | 199 | print '[%s] Parser worker started.' 
% pid 200 | 201 | cache = cache_wrapper.get_pickle_connection() 202 | 203 | while True: 204 | file = todo_queue.get() 205 | 206 | sys.stdout.write('[%s] Parsing file %s...\n' % (pid, file)) 207 | sys.stdout.flush() 208 | start = datetime.datetime.now() 209 | 210 | stats = parser_process(file, cache) 211 | 212 | elapsed = datetime.datetime.now() - start 213 | sys.stdout.write('[%s] Done with %s in %s minutes\n' % (pid, file, round(elapsed.total_seconds() / 60.0))) 214 | sys.stdout.flush() 215 | 216 | done_queue.put(stats) 217 | 218 | todo_queue.task_done() 219 | 220 | def parse_dumps(options, cache_wrapper): 221 | # figure out which files are ours 222 | id_string = 'all' 223 | if options.agency and options.docket: 224 | raise Exception("Specify either an agency or a docket") 225 | elif options.agency: 226 | id_string = 'agency_' + options.agency 227 | elif options.docket: 228 | id_string = 'docket_' + options.docket.replace('-', '_') 229 | 230 | num_workers = options.multi 231 | files = [file for file in os.listdir(settings.DUMP_DIR) if file.startswith('dump_%s' % id_string) and file.endswith('.json')] 232 | 233 | if len(files) < 1: 234 | # something is wrong, as there should be more than ten files 235 | raise Exception('Too few .json files; something went wrong.') 236 | 237 | # it's a small number of files, so just make a queue big enough to hold them all, to keep from having to block 238 | todo_queue = multiprocessing.JoinableQueue(len(files)) 239 | done_queue = multiprocessing.Queue(len(files)) 240 | 241 | sys.stdout.write('Starting parser workers...\n') 242 | processes = [] 243 | for i in range(num_workers): 244 | proc = multiprocessing.Process(target=parser_worker, args=(todo_queue, done_queue, cache_wrapper)) 245 | proc.start() 246 | processes.append(proc) 247 | 248 | for file in files: 249 | todo_queue.put(file) 250 | 251 | todo_queue.join() 252 | 253 | for proc in processes: 254 | print 'Terminating parser worker %s...' % proc.pid 255 | proc.terminate() 256 | 257 | # print totals 258 | print 'Done parsing files.' 259 | 260 | def run(options, args): 261 | sys.stdout.write('Starting decoding...\n') 262 | sys.stdout.flush() 263 | 264 | # get workers going 265 | now = datetime.datetime.now(tz=pytz.utc) 266 | 267 | num_workers = options.multi 268 | 269 | # set up caching 270 | sys.stdout.write('Spinning up Redis instance...\n') 271 | 272 | if options.use_cache: 273 | cache_wrapper = TmpRedis(db_uuid=options.use_cache) 274 | # give it time to rebuild its cache from disk if we're using an already-built cache 275 | sys.stdout.write('Loading cache from disk...') 276 | time.sleep(15) 277 | sys.stdout.write(' done.\n') 278 | else: 279 | cache_wrapper = TmpRedis() 280 | parse_dumps(options, cache_wrapper) 281 | 282 | stats = {} 283 | if not options.add_only: 284 | stats = reconcile_dumps(options, cache_wrapper, now) 285 | else: 286 | print 'Skipping reconciliation step.' 287 | 288 | # still-existing and deleted stuff is now done, but we still have to do the new stuff 289 | stats['new'] = add_new_docs(cache_wrapper, now) 290 | 291 | sys.stdout.write('Terminating Redis cache...\n') 292 | 293 | if options.keep_cache: 294 | cache_wrapper.terminate(delete=False) 295 | print 'Cache preserved with UUID %s.' 
% cache_wrapper.uuid 296 | else: 297 | cache_wrapper.terminate() 298 | 299 | return stats -------------------------------------------------------------------------------- /regscrape/regsdotgov/commands/rdg_scrape.py: -------------------------------------------------------------------------------- 1 | GEVENT = False 2 | 3 | import settings 4 | from regs_models import * 5 | from regsdotgov.document import scrape_document 6 | import urllib2, urllib3 7 | import sys 8 | import os 9 | import traceback 10 | import pymongo 11 | import time 12 | 13 | import multiprocessing 14 | from Queue import Empty 15 | from regs_common.mp_types import Counter 16 | from regs_common.exceptions import DoesNotExist, RateLimitException 17 | 18 | from optparse import OptionParser 19 | arg_parser = OptionParser() 20 | arg_parser.add_option("-m", "--multi", dest="multi", action="store", type="int", default=multiprocessing.cpu_count(), help="Set number of worker processes. Defaults to number of cores if not specified.") 21 | arg_parser.add_option("-a", "--agency", dest="agency", action="store", type="string", default=None, help="Specify an agency to which to limit the dump.") 22 | arg_parser.add_option("-d", "--docket", dest="docket", action="store", type="string", default=None, help="Specify a docket to which to limit the dump.") 23 | 24 | def process_record(record, num_succeeded, num_failed, cpool): 25 | if record is None: 26 | return 27 | 28 | new_doc = None 29 | 30 | for i in range(2): 31 | error = None 32 | removed = False 33 | try: 34 | new_doc = scrape_document(record.id, cpool) 35 | new_doc.last_seen = record.last_seen 36 | new_doc.created = record.created 37 | print '[%s] Scraped doc %s...' % (os.getpid(), new_doc.id) 38 | 39 | num_succeeded.increment() 40 | break 41 | except DoesNotExist: 42 | print "Document %s appears to have been deleted; skipping." % record.id 43 | removed = True 44 | break 45 | except KeyboardInterrupt: 46 | raise 47 | except RateLimitException: 48 | print '### Warning: scrape failed on try %s because of RATE LIMIT' % i 49 | time.sleep(3600) 50 | except: 51 | print 'Warning: scrape failed on try %s' % i 52 | error = sys.exc_info() 53 | traceback.print_tb(error[2], file=sys.stdout) 54 | 55 | # catch renames of documents 56 | if new_doc and (not error) and (not removed) and new_doc.id != record.id: 57 | renamed_to = new_doc.id 58 | new_doc = Doc.objects(id=record.id)[0] 59 | new_doc.scraped = 'yes' 60 | new_doc.attachments = [] 61 | new_doc.views = [] 62 | new_doc.details['renamed_to'] = renamed_to 63 | new_doc.renamed = True 64 | 65 | # catch errors and removes 66 | if removed: 67 | num_failed.increment() 68 | return None 69 | elif error or not new_doc: 70 | new_doc = Doc.objects(id=record.id)[0] 71 | new_doc.scraped = 'failed' 72 | if error: 73 | print 'Scrape of %s failed because of %s' % (new_doc.id, str(error)) 74 | num_failed.increment() 75 | 76 | try: 77 | new_doc.save() 78 | except: 79 | print "Warning: database save failed on document %s (scraped based on original doc ID %s)." % (new_doc.id, record.id) 80 | traceback.print_exc() 81 | 82 | def worker(todo_queue, num_succeeded, num_failed): 83 | pid = os.getpid() 84 | cpool = urllib3.PoolManager(maxsize=2) 85 | 86 | print '[%s] Worker started.' 
% pid 87 | 88 | while True: 89 | record = Doc._from_son(todo_queue.get()) 90 | 91 | process_record(record, num_succeeded, num_failed, cpool) 92 | 93 | todo_queue.task_done() 94 | 95 | def run(options, args): 96 | sys.stdout.write('Starting scrape...\n') 97 | sys.stdout.flush() 98 | 99 | # get workers going 100 | num_workers = options.multi 101 | 102 | todo_queue = multiprocessing.JoinableQueue(num_workers * 3) 103 | 104 | # set up some counters to track progress 105 | num_succeeded = Counter() 106 | num_failed = Counter() 107 | 108 | processes = [] 109 | for i in range(num_workers): 110 | proc = multiprocessing.Process(target=worker, args=(todo_queue, num_succeeded, num_failed)) 111 | proc.start() 112 | processes.append(proc) 113 | 114 | conditions = {'scraped': 'no', 'deleted': False, 'source': 'regulations.gov'} 115 | if options.agency: 116 | conditions['agency'] = options.agency 117 | if options.docket: 118 | conditions['docket_id'] = options.docket 119 | to_scrape = Doc.objects(**conditions).only('id', 'last_seen', 'created', 'views', 'attachments') 120 | 121 | while True: 122 | try: 123 | record = to_scrape.next() 124 | except pymongo.errors.OperationFailure: 125 | to_scrape = Doc.objects(**conditions).only('id', 'last_seen', 'created', 'views', 'attachments') 126 | continue 127 | except StopIteration: 128 | break 129 | 130 | todo_queue.put(record.to_mongo()) 131 | 132 | todo_queue.join() 133 | 134 | for proc in processes: 135 | print 'Terminating worker %s...' % proc.pid 136 | proc.terminate() 137 | 138 | print 'Scrape complete with %s successes and %s failures.' % (num_succeeded.value, num_failed.value) 139 | return {'scraped': num_succeeded.value, 'failed': num_failed.value} 140 | -------------------------------------------------------------------------------- /regscrape/regsdotgov/commands/rdg_scrape_dockets.py: -------------------------------------------------------------------------------- 1 | GEVENT = False 2 | 3 | import settings 4 | from regsdotgov.document import scrape_docket 5 | import urllib2, urllib3 6 | import sys 7 | import os 8 | import traceback 9 | from regs_models import * 10 | import pymongo 11 | import time 12 | 13 | import multiprocessing 14 | from Queue import Empty 15 | from regs_common.mp_types import Counter 16 | from regs_common.exceptions import DoesNotExist, RateLimitException 17 | 18 | from optparse import OptionParser 19 | arg_parser = OptionParser() 20 | arg_parser.add_option("-m", "--multi", dest="multi", action="store", type="int", default=multiprocessing.cpu_count(), help="Set number of worker processes. Defaults to number of cores if not specified.") 21 | arg_parser.add_option("-a", "--agency", dest="agency", action="store", type="string", default=None, help="Specify an agency to which to limit the dump.") 22 | arg_parser.add_option("-d", "--docket", dest="docket", action="store", type="string", default=None, help="Specify a docket to which to limit the dump.") 23 | 24 | def process_record(record, num_succeeded, num_failed, cpool): 25 | if record is None: 26 | return 27 | 28 | docket = None 29 | 30 | for i in range(2): 31 | error = None 32 | try: 33 | docket = scrape_docket(record.id, cpool) 34 | docket._created = record._created 35 | docket.stats = record.stats 36 | print '[%s] Scraped docket %s...' 
% (os.getpid(), docket.id) 37 | num_succeeded.increment() 38 | break 39 | except DoesNotExist: 40 | error = sys.exc_info() 41 | print 'Warning: scrape failed on try %s with server exception: %s' % (i, error[1]) 42 | # no need to try three times 43 | break 44 | except KeyboardInterrupt: 45 | raise 46 | except RateLimitException: 47 | print '### Warning: scrape failed on try %s because of RATE LIMIT' % i 48 | time.sleep(3600) 49 | except: 50 | error = sys.exc_info() 51 | print 'Warning: scrape failed on try %s' % i 52 | 53 | # catch errors 54 | if error or not docket: 55 | docket = record 56 | docket.scraped = 'failed' 57 | if error: 58 | print 'Scrape of %s failed because of %s' % (docket.id, str(error)) 59 | num_failed.increment() 60 | 61 | try: 62 | docket.save() 63 | except: 64 | print "Warning: database save failed on document %s (scraped based on original doc ID %s)." % (docket.id, record.id) 65 | 66 | def worker(todo_queue, num_succeeded, num_failed): 67 | pid = os.getpid() 68 | cpool = urllib3.PoolManager(maxsize=2) 69 | 70 | print '[%s] Worker started.' % pid 71 | 72 | while True: 73 | record = todo_queue.get() 74 | 75 | process_record(record, num_succeeded, num_failed, cpool) 76 | 77 | todo_queue.task_done() 78 | 79 | def run(options, args): 80 | sys.stdout.write('Starting scrape...\n') 81 | sys.stdout.flush() 82 | 83 | # get workers going 84 | num_workers = options.multi 85 | 86 | todo_queue = multiprocessing.JoinableQueue(num_workers * 3) 87 | 88 | # set up some counters to track progress 89 | num_succeeded = Counter() 90 | num_failed = Counter() 91 | 92 | processes = [] 93 | for i in range(num_workers): 94 | proc = multiprocessing.Process(target=worker, args=(todo_queue, num_succeeded, num_failed)) 95 | proc.start() 96 | processes.append(proc) 97 | 98 | conditions = {'scraped': 'no'} 99 | if options.agency: 100 | conditions['agency'] = options.agency 101 | if options.docket: 102 | conditions['id'] = options.docket 103 | to_scrape = Docket.objects(**conditions) 104 | 105 | while True: 106 | try: 107 | record = to_scrape.next() 108 | except pymongo.errors.OperationFailure: 109 | to_scrape = Docket.objects(**conditions) 110 | continue 111 | except StopIteration: 112 | break 113 | 114 | todo_queue.put(record) 115 | 116 | todo_queue.join() 117 | 118 | for proc in processes: 119 | print 'Terminating worker %s...' % proc.pid 120 | proc.terminate() 121 | 122 | print 'Scrape complete with %s successes and %s failures.' % (num_succeeded.value, num_failed.value) 123 | return {'scraped': num_succeeded.value, 'failed': num_failed.value} 124 | -------------------------------------------------------------------------------- /regscrape/regsdotgov/commands/rdg_simple_update.py: -------------------------------------------------------------------------------- 1 | GEVENT = False 2 | 3 | import os 4 | import subprocess 5 | import settings 6 | import sys 7 | from search import parsed_search, result_to_model 8 | import pytz 9 | import datetime 10 | import operator 11 | import time 12 | import json 13 | import re 14 | import itertools 15 | import urllib2, httplib 16 | from regs_models import * 17 | 18 | from optparse import OptionParser 19 | arg_parser = OptionParser() 20 | arg_parser.add_option("-s", "--since", dest="since", action="store", type="string", default=None, help="Manually specify search start date.") 21 | 22 | def run(options, args): 23 | print 'Retrieving current document IDs...' 
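# Note: the subprocess call below shells out to the mongo client for speed; a
# pure-pymongo equivalent (a sketch only, assuming the same Doc model) would be
#
#     ids = set(doc['_id'] for doc in Doc._get_collection().find(
#         {'source': 'regulations.gov', 'deleted': False}, {'_id': 1}))
#
# but, per the HACK comment that follows, pulling every id through a Python cursor
# proved much slower than letting the mongo shell build the array itself.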
24 | 25 | # HACK - pull ids via shell because doing it in Python is slow 26 | count_proc = subprocess.Popen( 27 | ["mongo", settings.DB_NAME] +\ 28 | list(itertools.chain.from_iterable([("--%s" % key, str(value)) for key, value in settings.DB_SETTINGS.items()])) +\ 29 | ["--quiet", "--eval", "printjson(db.docs.find({source:'regulations.gov',deleted:false},{_id:1}).map(function(i){return i._id;}))"], 30 | stdout=subprocess.PIPE 31 | ) 32 | ids = set(json.load(count_proc.stdout)) 33 | 34 | now = datetime.datetime.now() 35 | 36 | if options.since: 37 | most_recent = datetime.datetime.strptime(options.since, "%Y-%m-%d") 38 | print "Done; start date manually set to %s and total documents indexed is %s." % (most_recent.isoformat(), len(ids)) 39 | else: 40 | print "Retrieving date of most recent document..." 41 | recent_agg = Doc._get_collection().aggregate([ 42 | { 43 | "$match": { 44 | "source": "regulations.gov", 45 | "deleted": False 46 | } 47 | }, 48 | { 49 | "$group": { 50 | "_id": 0, 51 | "max": { 52 | "$max": "$details.Date_Posted" 53 | } 54 | } 55 | } 56 | ]); 57 | most_recent = recent_agg['result'][0]['max'] 58 | 59 | print "Done; last document is from %s and total documents indexed is %s." % (most_recent.isoformat(), len(ids)) 60 | 61 | if most_recent > now: 62 | most_recent = now 63 | print "Overriding most recent to now." 64 | 65 | search_args = { 66 | # date range from one day before the most recent until one day after now 67 | "pd": "-".join([d.strftime("%m/%d/%y") for d in (most_recent - datetime.timedelta(days=1), now + datetime.timedelta(days=1))]), 68 | 69 | # order ascending by posted date to reduce pagination errors 70 | "sb": "postedDate", 71 | "so": "ASC" 72 | } 73 | 74 | # start new dumps 75 | position = 0 76 | increment = 1000 77 | stats = {'pages_downloaded': 0, 'new_records': 0, 'existing_records': 0, 'failed_saves': 0} 78 | total = parsed_search(1, 0, **search_args)['totalNumRecords'] 79 | while position <= total: 80 | page = None 81 | for i in range(3): 82 | try: 83 | current_str = (position / increment) + 1 84 | total_str = '?' if total == 1 else (total / increment) + 1 85 | print "Downloading page %s of %s..." % (current_str, total_str) 86 | 87 | page = parsed_search(increment, position, **search_args) 88 | 89 | stats['pages_downloaded'] += 1 90 | break 91 | except (urllib2.HTTPError, httplib.HTTPException) as e: 92 | if i < 2: 93 | if hasattr(e, 'code') and e.code in (503, 429) and 'rate' in e.read().lower(): 94 | print 'Download failed because of rate limiting; will retry in an hour...' 95 | time.sleep(3600) 96 | else: 97 | print 'Download failed; will retry in 10 seconds...' 98 | time.sleep(10) 99 | else: 100 | print 'System troubles; giving up.' 101 | raise 102 | 103 | for result in page.get('documents', []): 104 | if result['documentId'] in ids: 105 | stats['existing_records'] += 1 106 | else: 107 | if result.get('documentStatus', None) == "Withdrawn": 108 | continue 109 | 110 | db_doc = result_to_model(result, now=now) 111 | 112 | try: 113 | db_doc.save() 114 | stats['new_records'] += 1 115 | except: 116 | print "Failed to save document %s" % db_doc.id 117 | stats['failed_saves'] += 1 118 | 119 | position += increment 120 | 121 | print "Wrote %s new records, encountered %s existing records, and had %s failed saves." 
% (stats['new_records'], stats['existing_records'], stats['failed_saves']) 122 | 123 | return stats -------------------------------------------------------------------------------- /regscrape/regsdotgov/search.py: -------------------------------------------------------------------------------- 1 | import urllib2 2 | import json 3 | import datetime 4 | from regs_common.util import listify 5 | from regs_models import * 6 | 7 | from settings import RDG_API_KEY, DDG_API_KEY 8 | ARG_NAMES = { 9 | 'agency': 'a', 10 | 'docket': 'dktid' 11 | } 12 | 13 | FR_DOC_TYPES = set(['notice', 'rule', 'proposed_rule']) 14 | 15 | def _v1_search(per_page, position, **args): 16 | url_args = { 17 | 'api_key': RDG_API_KEY, 18 | 'rpp': per_page, 19 | 'po': position 20 | } 21 | 22 | for key, value in args.items(): 23 | url_args[ARG_NAMES[key]] = value 24 | 25 | return urllib2.urlopen( 26 | "http://regulations.gov/api/documentsearch/v1.json?" + '&'.join(['%s=%s' % arg for arg in url_args.items()]) 27 | ) 28 | 29 | def _v3_search(per_page, position, **args): 30 | url_args = { 31 | 'api_key': DDG_API_KEY, 32 | 'rpp': per_page, 33 | 'po': position 34 | } 35 | 36 | for key, value in args.items(): 37 | url_args[ARG_NAMES.get(key, key)] = value 38 | 39 | url = "http://api.data.gov/regulations/v3/documents.json?" + '&'.join(['%s=%s' % arg for arg in url_args.items()]) 40 | req = urllib2.Request(url, headers={'Accept': 'application/json,*/*'}) 41 | return urllib2.urlopen(req) 42 | 43 | search = _v3_search 44 | 45 | def parse(file): 46 | data = open(file) if type(file) in (unicode, str) else file 47 | return json.load(data) 48 | 49 | def _v1_iter_parse(file): 50 | data = parse(file) 51 | return iter(listify(data['searchresult']['documents']['document'])) 52 | 53 | def _v3_iter_parse(file): 54 | data = parse(file) 55 | return iter(data['documents']) 56 | 57 | iter_parse = _v3_iter_parse 58 | 59 | def result_to_model(doc, now=None): 60 | now = now if now is not None else datetime.datetime.now() 61 | 62 | return Doc(**{ 63 | 'id': doc['documentId'], 64 | 'title': unicode(doc.get('title', '')), 65 | 'docket_id': doc['docketId'], 66 | 'agency': doc['agencyAcronym'], 67 | 'type': DOC_TYPES[doc['documentType']], 68 | 'fr_doc': DOC_TYPES[doc['documentType']] in FR_DOC_TYPES, 69 | 'last_seen': now, 70 | 'created': now 71 | }) 72 | 73 | # convenience function that strings them together 74 | def parsed_search(per_page, position, client=None, **args): 75 | return parse(search(per_page, position, **args)) 76 | 77 | # use the search with an overridden client to get the agencies instead of the documents 78 | def get_agencies(): 79 | raise Exception("Haven't written this one yet") -------------------------------------------------------------------------------- /regscrape/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from regs_common.commands.runner import run_command 4 | 5 | if __name__ == '__main__': 6 | run_command() 7 | -------------------------------------------------------------------------------- /regscrape/sec_cftc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunlightlabs/regulations-scraper/5f2644a3cf54f915d7d90957645073737ab91022/regscrape/sec_cftc/__init__.py -------------------------------------------------------------------------------- /regscrape/sec_cftc/commands/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sunlightlabs/regulations-scraper/5f2644a3cf54f915d7d90957645073737ab91022/regscrape/sec_cftc/commands/__init__.py -------------------------------------------------------------------------------- /regscrape/sec_cftc/commands/cftc_extract_current.py: -------------------------------------------------------------------------------- 1 | GEVENT = False 2 | 3 | import urllib2, re, json, os, urlparse 4 | from pyquery import PyQuery as pq 5 | from lxml import etree 6 | from collections import OrderedDict, defaultdict 7 | from optparse import OptionParser 8 | import settings 9 | 10 | from regs_common.util import crockford_hash 11 | from regs_models import * 12 | 13 | # FIXME: split this out 14 | from sec_cftc.commands.sec_cftc_import import view_from_url 15 | 16 | # arguments 17 | arg_parser = OptionParser() 18 | arg_parser.add_option("-v", "--verbose", action="store_true", dest="verbose", default=False) 19 | arg_parser.add_option("-d", "--docket", dest="docket", action="store", type="string", default=None, help="Specify a docket to which to limit the dump.") 20 | arg_parser.add_option("-D", "--document", dest="document", action="store", type="string", default=None, help="Specify a document to which to limit the dump.") 21 | 22 | def run(options, args): 23 | query = {'scraped': 'no', 'source': 'sec_cftc', 'agency': 'CFTC', 'views__downloaded': 'yes'} 24 | 25 | if options.docket: 26 | query['docket_id'] = options.docket 27 | 28 | if options.document: 29 | query['id'] = options.document 30 | 31 | parser = etree.HTMLParser() 32 | for doc in Doc.objects(**query): 33 | print "Processing %s..." % doc.id 34 | page_data = open(doc.views[0].file_path).read() 35 | page = pq(etree.fromstring(page_data, parser)) 36 | 37 | text_block = page('.dyn_wrap div.ClearBoth') 38 | text = text_block.html().strip() if len(text_block) else "" 39 | full_text = "<html><body>%s</body></html>" % text 40 | 41 | if doc.views[0].content: 42 | doc.views[0].content.delete() 43 | 44 | doc.views[0].content.new_file() 45 | doc.views[0].content.write(full_text.encode('utf8')) 46 | doc.views[0].content.close() 47 | 48 | doc.views[0].extracted = 'yes' 49 | print "Found and wrote text." 50 | 51 | print "attachment" 52 | attachment_link = page('.dyn_wrap a[id*=StaticLink]') 53 | if attachment_link: 54 | att_url = urlparse.urljoin(doc.views[0].url, attachment_link.attr('href').strip()) 55 | 56 | att = Attachment() 57 | att.title = page('.dyn_wrap a[id*=AssetAttachment]').text().strip() 58 | 59 | att_view = view_from_url(att_url) 60 | if 'pdf' in att_url.lower(): 61 | att_view.type = 'xpdf' 62 | att.views.append(att_view) 63 | att.object_id = att_view.object_id 64 | 65 | doc.attachments = [att] 66 | 67 | print "Found and saved attachment %s." % att_url 68 | else: 69 | print "No attachment found." 
70 | 71 | doc.scraped = 'yes' 72 | doc.save() -------------------------------------------------------------------------------- /regscrape/sec_cftc/commands/cftc_scrape_documents.py: -------------------------------------------------------------------------------- 1 | GEVENT = False 2 | 3 | import urllib2, re, json, os, sys, operator, string, urlparse, urllib, cookielib 4 | from pyquery import PyQuery as pq 5 | from lxml import etree 6 | from collections import OrderedDict, defaultdict 7 | import settings 8 | from optparse import OptionParser 9 | 10 | from regs_common.util import crockford_hash 11 | from regs_common.exceptions import ExtractionFailed 12 | 13 | # arguments 14 | arg_parser = OptionParser() 15 | arg_parser.add_option("-v", "--verbose", action="store_true", dest="verbose", default=False) 16 | arg_parser.add_option("-d", "--docket", dest="docket", action="store", type="string", default=None, help="Specify a docket to which to limit the dump.") 17 | arg_parser.add_option("-s", "--strategy", dest="strategy", action="store", type="string", default=None, help="Restrict scraping to a single strategy.") 18 | 19 | parser = etree.HTMLParser() 20 | 21 | def fix_spaces(text): 22 | return re.sub(u"[\s\xa0]+", " ", text) 23 | 24 | def parse_current_docket(docket_record): 25 | # grab the file with the URL mangled slightly to grab 100k records 26 | docket_file = urllib2.urlopen(docket_record['url'] + "&ctl00_ctl00_cphContentMain_MainContent_gvCommentListChangePage=1_100000").read() 27 | page = pq(etree.fromstring(docket_file, parser)) 28 | 29 | docket = dict(docket_record) 30 | 31 | docket['title'] = page('.dyn_wrap h1').text().strip() 32 | assert docket['title'], 'no title found' 33 | 34 | headers = [item.text().strip() for item in page('.rgMasterTable thead th').items()] 35 | 36 | docket['comments'] = [] 37 | 38 | # check if there's a no-records message 39 | if len(page('.rgMasterTable .rgNoRecords')): 40 | return docket 41 | 42 | for row in page('.rgMasterTable tbody tr').items(): 43 | tds = row.find('td') 44 | cell_text = [item.text().strip() for item in tds.items()] 45 | cdata = dict(zip(headers, cell_text)) 46 | 47 | link = pq(tds[-1]).find('a') 48 | 49 | doc = { 50 | 'url': urlparse.urljoin(docket['url'], link.attr('href')), 51 | 'details': {}, 52 | 'release': [fix_spaces(cdata['Release'])], 53 | 'date': cdata['Date Received'], 54 | 'doctype': 'public_submission', 55 | } 56 | 57 | vc_matches = re.findall(r"ViewComment\.aspx\?id=(\d+)", doc['url']) 58 | if vc_matches: 59 | doc['id'] = vc_matches[0] 60 | doc['subtype'] = 'comment' 61 | detail_columns = ['Organization', 'First Name', 'Last Name'] 62 | else: 63 | ep_matches = re.findall(r"ViewExParte\.aspx\?id=(\d+)", doc['url']) 64 | if ep_matches: 65 | doc['id'] = "EP-%s" % ep_matches[0] 66 | doc['subtype'] = 'exparte' 67 | detail_columns = ['Organization'] 68 | else: 69 | assert False, "expected either comment or exparte link: %s" % doc['url'] 70 | 71 | for rdg_label, cftc_label in (('Organization Name', 'Organization'), ('First Name', 'First Name'), ('Last Name', 'Last Name')): 72 | if cftc_label in detail_columns and cdata[cftc_label]: 73 | doc['details'][rdg_label] = cdata[cftc_label] 74 | 75 | docket['comments'].append(doc) 76 | 77 | assert len(docket['comments']) < 100000, "we probably exceeded one page" 78 | 79 | # then strip out all the ones that aren't about this document 80 | release = fix_spaces(page('a[id*=rptReleases_hlReleaseLink]').text().strip()) 81 | docket['comments'] = [comment for comment in docket['comments'] if 
comment['release'][0] == release] 82 | 83 | return docket 84 | 85 | def parse_old_docket(docket_record): 86 | docket_file = urllib2.urlopen(docket_record['url']).read() 87 | page = pq(etree.fromstring(docket_file, parser)) 88 | 89 | docket = dict(docket_record) 90 | 91 | release = page('ul.text p a').text().strip() 92 | if not re.match("\d+ FR \d+", release): 93 | release = None 94 | 95 | # hackery to get the title 96 | para_lines = [chunk.strip() for chunk in page('ul.text p a').parent().html().split("</a>")[-1].replace(" ", " ").split("<br />") if chunk.strip()] 97 | docket['title'] = para_lines[0] 98 | 99 | docket['comments'] = [] 100 | 101 | for row in page('.list-release .row').items(): 102 | date = row('.column-date').text().strip() 103 | if not date: 104 | # this is an FR document 105 | item = row('.column-item') 106 | label = item.text().strip() 107 | assert re.match('\d+ FR \d+', label), "Expected FR citation, got: %s" % label 108 | 109 | link = item.find('a') 110 | frnum = re.findall("[A-Z0-9-]+", link.attr('href').rsplit("/", 1)[-1]) 111 | assert frnum, "expected FR num" 112 | doc = { 113 | 'id': frnum[0], 114 | 'title': label, 115 | 'details': { 116 | 'Federal Register Citation': label, 117 | 'Federal Register Number': frnum[0] 118 | }, 119 | 'url': urlparse.urljoin(docket_record['url'], link.attr('href')), 120 | 'doctype': 'Federal Register Release' 121 | } 122 | else: 123 | # this is a comment 124 | desc = row('.column-comment, .column-item') 125 | link = desc('a') 126 | link_label = link.text().strip() 127 | 128 | ll_is_id = re.match("^[A-Z]{2}\d+$", link_label) 129 | 130 | doc = { 131 | 'date': date, 132 | 'url': urlparse.urljoin(docket_record['url'], link.attr('href')), 133 | 'title': re.split(r"<br ?/?>", desc.html().strip())[1].strip() if ll_is_id else link_label, 134 | 'details': {}, 135 | 'doctype': 'public_submission' 136 | } 137 | if ll_is_id: 138 | doc['id'] = link_label 139 | if release: 140 | doc['release'] = [release] 141 | pages = row('.column-pages') 142 | if len(pages): 143 | doc['details']['Pages'] = pages.text().strip() 144 | 145 | docket['comments'].append(doc) 146 | 147 | return docket 148 | 149 | def is_ancient_label(text): 150 | return re.match("[A-Z ]+:", text) 151 | 152 | def parse_ancient_docket(docket_record): 153 | page_url = docket_record['url'] 154 | 155 | docket = dict(docket_record) 156 | docket['comments'] = [] 157 | 158 | while True: 159 | page_data = urllib2.urlopen(page_url).read() 160 | page = pq(etree.fromstring(page_data, parser)) 161 | 162 | groups = [] 163 | group = [] 164 | first_divider = False 165 | for table in page('table').items(): 166 | divider = table.find('font[color*="#808000"]') 167 | if len(divider) and re.match(r".*-{10,}.*", divider.text()): 168 | if not first_divider: 169 | first_divider = True 170 | continue 171 | if group: 172 | groups.append(group) 173 | group = [] 174 | elif first_divider: 175 | group.append(table) 176 | 177 | for group in groups: 178 | cells = pq([g[0] for g in group]).find('td') 179 | 180 | doc = { 181 | 'title': fix_spaces(" ".join([item.text() for item in pq([g[0] for g in group[1:]]).find('td b font').items()])), 182 | 'details': {}, 183 | 'url': None, 184 | } 185 | 186 | for i in range(len(cells)): 187 | text = fix_spaces(cells.eq(i).text().strip()) 188 | if is_ancient_label(text): 189 | next_text = fix_spaces(cells.eq(i + 1).text().strip()) 190 | next_text = next_text if not is_ancient_label(next_text) else None 191 | 192 | if next_text: 193 | if text == "DOCUMENT:": 194 | # we need yet another 
cell 195 | doc['id'] = next_text + fix_spaces(cells.eq(i + 2).text().strip()) 196 | 197 | if 'CL' in doc['id']: 198 | doc['doctype'] = 'public_submission' 199 | elif 'NC' in doc['id']: 200 | doc['doctype'] = 'other' 201 | elif 'FR' in doc['id']: 202 | ltitle = doc['title'].lower() 203 | if 'proposed' in ltitle: 204 | doc['doctype'] = 'proposed_rule' 205 | elif 'final' in ltitle: 206 | doc['doctype'] = 'rule' 207 | else: 208 | doc['doctype'] = 'notice' 209 | elif text == "DATE:": 210 | doc['date'] = next_text 211 | elif text == "FR PAGE:" and "N/A" not in next_text.upper(): 212 | doc['details']['Federal Register Page'] = next_text 213 | elif text == "PAGES:": 214 | doc['details']['Pages'] = next_text 215 | elif text == "PDF SIZE:": 216 | doc['details']['PDF Size'] = next_text 217 | elif text == "PDF LINK:": 218 | link = cells.eq(i + 1).find('a') 219 | if len(link): 220 | doc['url'] = urlparse.urljoin(page_url, link.attr('href')) 221 | docket['comments'].append(doc) 222 | 223 | # grab the 'next' link 224 | next_link = [a for a in page('a[href*=foi]').items() if 'Next' in a.text()] 225 | if next_link: 226 | next_url = urlparse.urljoin(page_url, next_link[0].attr('href')) 227 | if next_url != page_url: 228 | page_url = next_url 229 | else: 230 | # apparently sometimes "next" points to the current page -- bail if so 231 | break 232 | else: 233 | break 234 | return docket 235 | 236 | def parse_sirt_docket(docket_record): 237 | # okay, this one requires loading a paginated version, then checking a box that says "show all" to get everything... 238 | # which is arduous and stupid because it's a yucky ASP app. 239 | 240 | cj = cookielib.CookieJar() 241 | opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj)) 242 | initial = pq(opener.open(docket_record['url']).read()) 243 | 244 | error_header = initial("h4") 245 | if len(error_header) and "sorry" in error_header.text().lower(): 246 | raise ExtractionFailed("This URL doesn't work.") 247 | 248 | formdata = urllib.urlencode(( 249 | ('__EVENTTARGET', 'ctl00$cphContentMain$GenericWebUserControl$ShowAllCheckBox'), 250 | ('__EVENTARGUMENT', ''), 251 | ('__LASTFOCUS', ''), 252 | ('__VIEWSTATE', initial('#__VIEWSTATE').val()), 253 | ('__EVENTVALIDATION', initial('#__EVENTVALIDATION').val()), 254 | ('ctl00$masterScriptManager', ''), 255 | ('ctl00$cphContentMain$GenericWebUserControl$ShowAllCheckBox', 'on') 256 | )) 257 | 258 | page = pq(opener.open(docket_record['url'], data=formdata).read()) 259 | 260 | docket = dict(docket_record) 261 | 262 | details = dict([re.split(r"\s*:\s*", row.strip()) for row in re.split(r"<br ?/?>", page('h5.QueryTitle').html()) if row.strip()]) 263 | 264 | if 'details' not in docket: 265 | docket['details'] = {} 266 | 267 | if 'Filing Description' in details: 268 | docket['title'] = details['Filing Description'] 269 | 270 | if 'Organization' in details: 271 | docket['details']['Organization Name'] = details['Organization'] 272 | 273 | if 'Status' in details: 274 | docket['details']['Status'] = details['Status'] 275 | 276 | docket['comments'] = [] 277 | 278 | for link in page('.gradient-style tr td a').items(): 279 | doc = { 280 | 'url': urlparse.urljoin(docket_record['url'], link.attr('href')), 281 | 'title': fix_spaces(link.text().strip()), 282 | 'details': {}, 283 | } 284 | doc['doctype'] = 'public_submission' if 'comment' in doc['title'].lower() else 'other' 285 | doc['id'] = crockford_hash(doc['url']) 286 | 287 | docket['comments'].append(doc) 288 | 289 | return docket 290 | 291 | 292 | def run(options, args): 293 | 
dockets = json.load(open(os.path.join(settings.DUMP_DIR, "cftc_dockets.json"))) 294 | 295 | stats = {'fetched': 0, 'skipped': 0, 'failed': 0} 296 | 297 | docket_dir = os.path.join(settings.DUMP_DIR, "cftc_dockets") 298 | if not os.path.exists(docket_dir): 299 | os.mkdir(docket_dir) 300 | 301 | for i, docket in enumerate(dockets.itervalues()): 302 | if options.docket and docket['id'] != options.docket: 303 | continue 304 | 305 | if options.strategy and docket['strategy'] != options.strategy: 306 | continue 307 | 308 | if 'url' in docket: 309 | print 'Fetching %s...' % docket['id'] 310 | print i, json.dumps(docket) 311 | try: 312 | fetched = globals()['parse_%s_docket' % docket['strategy']](docket) 313 | except ExtractionFailed: 314 | print "FAILED to scrape docket data for %s" % docket['id'] 315 | stats['failed'] += 1 316 | continue 317 | 318 | if options.verbose: 319 | print json.dumps(fetched, indent=4) 320 | 321 | outfile = open(os.path.join(docket_dir, "%s.json" % docket['id']), "wb") 322 | json.dump(fetched, outfile, indent=4) 323 | outfile.close() 324 | 325 | stats['fetched'] += 1 326 | else: 327 | print 'Skipping %s.' % docket['id'] 328 | stats['skipped'] += 1 329 | 330 | print "Fetched %s dockets; skipped %s dockets; failed on %s dockets." % (stats['fetched'], stats['skipped'], stats['failed']) 331 | return stats -------------------------------------------------------------------------------- /regscrape/sec_cftc/commands/sec_cftc_import.py: -------------------------------------------------------------------------------- 1 | import json, re, urlparse 2 | from dateutil.parser import parse as parse_date 3 | import datetime 4 | 5 | import settings 6 | from regs_models import * 7 | from regs_common.util import * 8 | from mongoengine.errors import NotUniqueError 9 | 10 | from optparse import OptionParser 11 | 12 | # arguments 13 | arg_parser = OptionParser() 14 | arg_parser.add_option("-v", "--verbose", action="store_true", dest="verbose", default=False) 15 | arg_parser.add_option("-u", "--update", dest="update", action="store_true", default=False, help="Update existing records.") 16 | arg_parser.add_option("-a", "--agency", dest="agency", action="store", default=None, help="Restrict import to a single agency.") 17 | 18 | GEVENT = False 19 | 20 | type_mappings = { 21 | "notice": "notice", 22 | "other": "other", 23 | "proposed_rule": "proposed_rule", 24 | "public_submission": "public_submission", 25 | "rule": "rule", 26 | "General CFTC": "other", 27 | "Industry Filing": "other", 28 | "Orders and Other Announcements": "other", 29 | "Press Release": "other", 30 | "Privacy Act Systems": "other", 31 | "Proposed Rule": "proposed_rule", 32 | "Public Information Collection": "notice", 33 | "Sunshine Act": "other", 34 | "concept": "notice", 35 | "final": "rule", 36 | "interim-final-temp": "rule", 37 | "interp": "notice", 38 | "other": "other", 39 | "petitions": "other", 40 | "policy": "other", 41 | "proposed": "proposed_rule", 42 | # FIXME: this is terrible; should actually figure out what it is 43 | "Federal Register Release": "notice", 44 | } 45 | file_mapping = { 46 | 'pdf': 'xpdf', 47 | 'html': 'html', 48 | 'htm': 'html' 49 | } 50 | 51 | nineteen_hundred = parse_date("1900-01-01") 52 | 53 | def docket_record_to_model(record, agency): 54 | dkt = Docket() 55 | 56 | dkt.id = "%s-X-%s" % (agency, record['id']) 57 | dkt.agency = agency 58 | dkt.details['Source_ID'] = record['id'] 59 | 60 | if 'title' in record and record['title']: 61 | dkt.title = record['title'] 62 | 63 | if record.get('url', 
None): 64 | dkt['details']['Source_URL'] = record['url'] 65 | 66 | if record.get('type', None): 67 | dkt['details']['Type'] = record['type'] 68 | 69 | if record.get('subtype', None): 70 | dkt['details']['Subtype'] = record['subtype'] 71 | 72 | dkt.source = 'sec_cftc' 73 | dkt.scraped = 'no' 74 | 75 | return dkt 76 | 77 | def docket_record_from_id(docket_id, agency): 78 | dkt = Docket() 79 | 80 | dkt.id = docket_id 81 | dkt.agency = agency 82 | dkt.source = 'sec_cftc' 83 | dkt.scraped = 'no' 84 | 85 | return dkt 86 | 87 | def view_from_url(url): 88 | view = View() 89 | # strip fragments 90 | view.url = re.sub(r"#.*", "", url).strip() 91 | 92 | ext_matches = re.findall(r"\.([A-Za-z]+)$", view.url) 93 | if ext_matches: 94 | view.type = file_mapping.get(ext_matches[0], ext_matches[0]) 95 | else: 96 | view.type = 'html' 97 | 98 | view.object_id = crockford_hash(view.url) 99 | 100 | return view 101 | 102 | def fr_doc_record_to_model(record, agency): 103 | doc = Doc() 104 | 105 | if record['file_info']: 106 | file_info = record['file_info'][0] 107 | file_id = file_info['parent' if 'parent' in file_info else 'id'] 108 | else: 109 | file_id = crockford_hash(record['id']) 110 | 111 | doc.docket_id = "%s-X-%s" % (agency, file_id) 112 | doc.id = "%s-%s" % (doc.docket_id, record['id'].encode('ascii', 'ignore')) 113 | 114 | doc.type = type_mappings[record['doctype']] 115 | 116 | if 'title' in record: 117 | doc.title = record['title'] 118 | else: 119 | doc.title = record['id'] 120 | 121 | doc.agency = agency 122 | doc.source = 'sec_cftc' 123 | doc.scraped = 'yes' 124 | 125 | doc.details = {k.replace(" ", "_").replace(".", ""): v for k, v in record.get("details", {}).iteritems()} 126 | if record.get('date', None) and record['date'].strip(): 127 | parsed_date = parse_date(record['date'].strip()) 128 | if parsed_date > nineteen_hundred: 129 | doc.details['Date_Posted'] = parsed_date 130 | 131 | if record.get('description', None): 132 | doc.abstract = record['description'] 133 | 134 | doc.fr_doc = doc.type in ('rule', 'proposed_rule', 'notice') 135 | 136 | doc.created = datetime.datetime.now() 137 | 138 | if record.get('url', None): 139 | doc.views.append(view_from_url(record['url'])) 140 | 141 | for att in record.get('attachments', []): 142 | attachment = Attachment() 143 | attachment.title = att['title'] 144 | for v in att['views']: 145 | view = view_from_url(v['url']) 146 | if 'type' in v: 147 | view.type = v['type'] 148 | attachment.views.append(view) 149 | if attachment.views: 150 | attachment.object_id = attachment.views[0].object_id 151 | doc.attachments.append(attachment) 152 | 153 | return doc 154 | 155 | 156 | def comment_record_to_model(record, agency, docket_id): 157 | doc = Doc() 158 | 159 | doc.docket_id = docket_id 160 | doc.id = "%s-%s" % (doc.docket_id, record['id']) 161 | 162 | doc.type = type_mappings[record['doctype']] 163 | 164 | if 'title' in record: 165 | doc.title = record['title'] 166 | else: 167 | parts = [] 168 | if 'First Name' in record['details']: 169 | parts.append(record['details']['First Name'] + (" " + record['details']['Last Name']) if 'Last Name' in record['details'] else "") 170 | if 'Organization Name' in record['details']: 171 | parts.append(record['details']['Organization Name']) 172 | doc.title = "Comment from %s" % ", ".join(parts) 173 | 174 | if not doc.title: 175 | doc.title = record['id'] 176 | 177 | doc.agency = agency 178 | doc.source = 'sec_cftc' 179 | 180 | if agency == 'CFTC' and 'comments.cftc.gov' in (record.get('url', '') or ''): 181 | doc.scraped = 'no' 
182 | else: 183 | doc.scraped = 'yes' 184 | 185 | doc.details = {k.replace(" ", "_"): v for k, v in record.get("details", {}).iteritems()} 186 | if record.get('date', None) and record['date'].strip(): 187 | try: 188 | parsed_date = parse_date(record['date'].strip()) 189 | if parsed_date > nineteen_hundred: 190 | doc.details['Date_Posted'] = parsed_date 191 | except: 192 | pass 193 | 194 | if 'num_received' in record: 195 | doc.details['Number_of_Duplicate_Submissions'] = record['num_received'] 196 | 197 | if record.get('description', None): 198 | doc.abstract = record['description'] 199 | 200 | doc.fr_doc = doc.type in ('rule', 'proposed_rule', 'notice') 201 | 202 | doc.created = datetime.datetime.now() 203 | 204 | if record.get('url', None): 205 | doc.views.append(view_from_url(record['url'])) 206 | 207 | for att in record.get('attachments', []): 208 | attachment = Attachment() 209 | attachment.title = att['title'] 210 | attachment.views.append(view_from_url(att['url'])) 211 | attachment.object_id = attachment.views[0].object_id 212 | doc.attachments.append(attachment) 213 | 214 | return doc 215 | 216 | def run(options, args): 217 | for agency in (options.agency,) if options.agency else ('CFTC', 'SEC'): 218 | lagency = agency.lower() 219 | 220 | all_dockets = {} 221 | dockets_for_saving = {} 222 | 223 | # first load the docket file 224 | dockets = json.load(open(os.path.join(settings.DUMP_DIR, "%s_dockets.json" % lagency))) 225 | docket_dir = os.path.join(settings.DUMP_DIR, "%s_dockets" % lagency) 226 | 227 | # next deal with the FR documents 228 | doc_by_identifier = {} 229 | cftc_ancient_mapping = {} 230 | all_fr_docs = [] 231 | dockets_seen = set() 232 | 233 | fr_docs = json.load(open(os.path.join(settings.DUMP_DIR, "%s_fr_docs.json" % lagency))) 234 | for doc in fr_docs: 235 | if 'id' not in doc and 'url' in doc: 236 | doc['id'] = crockford_hash(doc['url']) 237 | 238 | if 'doctype' not in doc: 239 | doc['doctype'] = 'Federal Register Release' 240 | 241 | print "Processing FR doc %s in %s..." % (doc['id'].encode('utf8'), agency) 242 | dc = fr_doc_record_to_model(doc, agency) 243 | for identifier in (doc['id'], dc.details.get('Federal_Register_Number', None), dc.details.get('Federal_Register_Citation', None)): 244 | if identifier: 245 | doc_by_identifier[identifier] = dc 246 | 247 | # treat ancient CFTC FR docs specially because they'll show up again in the listing, so don't double count 248 | if agency == 'CFTC' and doc['strategy'] == 'ancient': 249 | if 'Federal_Register_Citation' in dc.details: 250 | cftc_ancient_mapping[dc.details['Federal_Register_Citation'].split(" FR ")[-1]] = dc 251 | else: 252 | all_fr_docs.append(dc) 253 | dockets_seen.add(dc.docket_id) 254 | 255 | # now load docket files one by one and deal with docket records and comments 256 | all_comments = [] 257 | for record in dockets.itervalues(): 258 | json_file = os.path.join(docket_dir, "%s.json" % record['id']) 259 | 260 | file_exists = os.path.exists(json_file) 261 | 262 | if file_exists: 263 | full_record = json.load(open(json_file)) 264 | 265 | print "Processing docket %s in %s..." 
% (record['id'], agency) 266 | 267 | dkt = docket_record_to_model(full_record if file_exists else record, agency) 268 | all_dockets[dkt.id] = dkt 269 | 270 | if 'parent' in record: 271 | dkt.details['Parent'] = '%s-X-%s' % (agency, record['parent']) 272 | else: 273 | dockets_for_saving[dkt.id] = dkt 274 | 275 | if not file_exists: 276 | continue 277 | 278 | for comment_record in full_record['comments']: 279 | if 'doctype' not in comment_record: 280 | comment_record['doctype'] = 'public_submission' 281 | 282 | if 'id' not in comment_record and 'url' in comment_record: 283 | comment_record['id'] = crockford_hash(comment_record['url']) 284 | 285 | print "Processing comment %s in %s..." % (comment_record['id'], dkt.id) 286 | cmt = comment_record_to_model(comment_record, agency, dkt.details['Parent'] if 'Parent' in dkt.details else dkt.id) 287 | 288 | if comment_record.get('release', None): 289 | release = comment_record['release'][0] 290 | if release in doc_by_identifier: 291 | cmt.comment_on = {'document_id': doc_by_identifier[release].id} 292 | 293 | if 'Federal Register Page' in comment_record: 294 | cmt.title = cftc_ancient_mapping[comment_record['details']['Federal Register Page']].title 295 | 296 | all_comments.append(cmt) 297 | 298 | print len(all_dockets), len(all_fr_docs), len(all_comments) 299 | 300 | # make sure we have docket records for all dockets that have documents in them 301 | for docket_id in dockets_seen: 302 | if docket_id not in dockets_for_saving: 303 | simple_dkt = docket_record_from_id(docket_id, agency) 304 | dockets_for_saving[docket_id] = simple_dkt 305 | 306 | for dkt in dockets_for_saving.itervalues(): 307 | try: 308 | print "Attempting to save docket %s..." % dkt.id 309 | dkt.save(force_insert=True) 310 | print "Docket %s saved." % dkt.id 311 | except NotUniqueError: 312 | print "Docket %s already exists." % dkt.id 313 | if options.update: 314 | print "Fetching docket %s for update..." % dkt.id 315 | 316 | # fetch the current one 317 | current = Docket.objects.get(id=dkt.id) 318 | if dkt.title: 319 | current.title = dkt.title 320 | 321 | current.details = dkt.details 322 | current.source = dkt.source 323 | current.agency = dkt.agency 324 | 325 | if current.scraped != 'yes': 326 | current.scraped = dkt.scraped 327 | 328 | current.save() 329 | 330 | for doc_set in (all_fr_docs, all_comments): 331 | for doc_obj in doc_set: 332 | try: 333 | print "Attempting to save document %s..." % doc_obj.id 334 | doc_obj.save(force_insert=True) 335 | print "Document %s saved." % doc_obj.id 336 | except NotUniqueError: 337 | print "Document %s already exists." % doc_obj.id 338 | if options.update: 339 | print "Fetching document %s for update..." 
% doc_obj.id 340 | 341 | # fetch the current one 342 | current = Doc.objects.get(id=doc_obj.id) 343 | current.title = doc_obj.title 344 | current.details = doc_obj.details 345 | 346 | if len(current.views) != len(doc_obj.views): 347 | current.views = doc_obj.views 348 | 349 | if len(current.attachments) != len(doc_obj.attachments): 350 | current.attachments = doc_obj.attachments 351 | 352 | current.save() 353 | -------------------------------------------------------------------------------- /regscrape/sec_cftc/commands/sec_cftc_name_dockets.py: -------------------------------------------------------------------------------- 1 | GEVENT = False 2 | 3 | from regs_models import * 4 | import datetime 5 | 6 | def run(): 7 | for docket in Docket.objects(source="sec_cftc", scraped="no"): 8 | now = datetime.datetime.now() 9 | if not docket.title: 10 | candidates = list(Doc.objects(docket_id=docket.id, type__in=("rule", "proposed_rule", "notice"))) 11 | candidates = sorted(candidates, key=lambda c: c.details.get('Date_Posted', now)) 12 | 13 | # also consider type "other", but they're worse 14 | worse_candidates = list(Doc.objects(docket_id=docket.id, type="other")) 15 | worse_candidates = sorted(worse_candidates, key=lambda c: c.details.get('Date_Posted', now)) 16 | 17 | candidates = candidates + worse_candidates 18 | 19 | if candidates: 20 | ctitle = candidates[0].title 21 | else: 22 | ctitle = docket.id 23 | 24 | print "For docket %s, proposing title: %s" % (docket.id, ctitle) 25 | 26 | docket.title = ctitle 27 | 28 | docket.scraped = 'yes' 29 | 30 | docket.save() -------------------------------------------------------------------------------- /regscrape/sec_cftc/commands/suppress_duplicates.py: -------------------------------------------------------------------------------- 1 | GEVENT = False 2 | 3 | from regs_models import * 4 | 5 | import settings 6 | import rawes 7 | 8 | def run(): 9 | es = rawes.Elastic(getattr(settings, "ES_HOST", 'thrift://localhost:9500'), timeout=60.0) 10 | index = getattr(es, settings.ES_INDEX) 11 | 12 | records = { 13 | 'sec_cftc': {}, 14 | 'regulations.gov': {} 15 | } 16 | 17 | for doc in Doc.objects(type__in=['notice', 'proposed_rule', 'rule'], agency__in=['SEC', 'CFTC']): 18 | # first check the annotation 19 | if 'fr_data' in doc.annotations and doc.annotations['fr_data']: 20 | #print 'annotation', doc.source, doc.id, doc.annotations['fr_data']['document_number'] 21 | records[doc.source][doc.annotations['fr_data']['document_number']] = doc 22 | elif 'Federal_Register_Number' in doc.details: 23 | #print 'detail', doc.source, doc.id, doc.details['Federal_Register_Number'] 24 | frn = doc.details['Federal_Register_Number'] 25 | # trim leading zeros from the second part 26 | if "-" in frn: 27 | frnp = frn.split("-") 28 | frn = "-".join(frnp[:-1] + [frnp[-1].lstrip('0')]) 29 | records[doc.source][frn] = doc 30 | 31 | overlap = records['sec_cftc'].viewkeys() & records['regulations.gov'].viewkeys() 32 | for frid in overlap: 33 | winner = records['sec_cftc'][frid] 34 | loser = records['regulations.gov'][frid] 35 | 36 | winner_dkt = Docket.objects.get(id=winner.docket_id) 37 | loser_dkt = Docket.objects.get(id=loser.docket_id) 38 | 39 | for w, l in ((winner, loser), (winner_dkt, loser_dkt)): 40 | replaces = set(w.suppression.get('replaces', [])) 41 | replaces.add(l.id) 42 | w.suppression['replaces'] = list(replaces) 43 | 44 | replaced_by = set(l.suppression.get('replaced_by', [])) 45 | replaced_by.add(w.id) 46 | l.suppression['replaced_by'] = list(replaced_by) 47 | 48 | 
l.save() 49 | w.save() 50 | 51 | try: 52 | index.docket.delete(loser_dkt.id) 53 | index.document.delete(loser.id) 54 | except: # the superseded regulations.gov copy may never have been indexed, so ignore failed deletes 55 | pass 56 | 57 | print '%s suppresses %s' % (winner.id, loser.id) -------------------------------------------------------------------------------- /regscrape/settings.py: -------------------------------------------------------------------------------- 1 | TARGET_SERVER = 'www.regulations.gov' 2 | DEBUG = True 3 | DB_NAME = 'regulations' 4 | ES_HOST = 'thrift://localhost:9500' 5 | ES_INDEX = 'regulations' 6 | DATA_DIR = '/data' 7 | EXTRACTORS = 2 8 | 9 | DUMP_START = 0 10 | DUMP_END = 3850000 11 | DUMP_INCREMENT = 10000 12 | MAX_WAIT = 600 13 | CHUNK_SIZE = 10 14 | FILTER = {} 15 | 16 | INSTANCES = 2 17 | THREADS_PER_INSTANCE = 2 18 | 19 | SITES = ['regsdotgov', 'sec_cftc'] 20 | 21 | try: 22 | from local_settings import * 23 | except ImportError: # local_settings.py is optional 24 | pass 25 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | gevent 2 | pytz 3 | lxml 4 | Pillow 5 | -e git+https://github.com/jodal/pykka.git@v0.12#egg=pykka 6 | pymongo 7 | -e git+https://github.com/mikemaccana/python-docx.git#egg=docx 8 | -e git+https://github.com/sunlightlabs/oxtail.git#egg=oxtail 9 | -e git+https://github.com/sunlightlabs/python-transparencydata#egg=transparencydata 10 | -e git+https://github.com/sunlightlabs/name-cleaver.git#egg=name_cleaver 11 | -e git+https://github.com/apendleton/mincemeatpy.git#egg=mincemeat 12 | -e git+https://github.com/sunlightlabs/regs-models.git#egg=regs_models 13 | isoweek 14 | 15 | # redis-based sync requirements 16 | hiredis 17 | redis 18 | 19 | mongoengine 20 | html2text 21 | urllib3 22 | rawes 23 | 24 | pyquery 25 | crockford 26 | python-dateutil 27 | PyYAML 28 | jellyfish --------------------------------------------------------------------------------