├── .gitignore ├── Gemfile ├── LICENSE ├── README.md ├── __init__.py ├── analysis ├── __init__.py ├── export.py ├── export_dockets.py ├── schema.sql └── tests.py ├── auto ├── fabfile.py ├── requirements.txt └── ssh_util.py ├── duplicates ├── __init__.py ├── cftc.py ├── clustering.py ├── db.py ├── interactive.py ├── ngrams.py └── tests.py ├── ec2 ├── README ├── install-deps.sh ├── run-x.sh └── setup-env.sh ├── one_offs ├── copy_agency │ └── cp.py ├── dodd_frank │ ├── agencies.py │ ├── dockets.py │ ├── dump.py │ ├── parse.py │ ├── regscrape │ └── settings.py ├── lightsquared │ ├── download_files.py │ ├── extract_text.py │ └── get_metadata.py └── pdf_repair │ ├── detect_pdfs.py │ └── fix_pdfs.py ├── regscrape ├── __init__.py ├── pipeline.py ├── regs_common │ ├── __init__.py │ ├── aggregates.py │ ├── commands │ │ ├── __init__.py │ │ ├── add_to_search.py │ │ ├── administer_search.py │ │ ├── annotate_fr_agencies.py │ │ ├── annotate_fr_docs.py │ │ ├── create_dockets.py │ │ ├── create_entities.py │ │ ├── export_text.py │ │ ├── extract.py │ │ ├── mark_searchable_entities.py │ │ ├── match_text.py │ │ ├── reset_downloads.py │ │ ├── reset_extraction.py │ │ ├── run_aggregates.py │ │ └── runner.py │ ├── data │ │ ├── es_mapping.json │ │ └── names.dat │ ├── data_import.py │ ├── entities.py │ ├── exceptions.py │ ├── extraction.py │ ├── gevent_mongo.py │ ├── mp_types.py │ ├── processing.py │ ├── scripts │ │ ├── extract_docx.py │ │ └── process_fr_docs.rb │ ├── tmp_redis.py │ ├── transfer.py │ └── util.py ├── regsdotgov │ ├── __init__.py │ ├── commands │ │ ├── __init__.py │ │ ├── rdg_create_agencies.py │ │ ├── rdg_download.py │ │ ├── rdg_dump_api.py │ │ ├── rdg_parse_api.py │ │ ├── rdg_scrape.py │ │ ├── rdg_scrape_dockets.py │ │ └── rdg_simple_update.py │ ├── document.py │ └── search.py ├── run.py ├── sec_cftc │ ├── __init__.py │ └── commands │ │ ├── __init__.py │ │ ├── cftc_extract_current.py │ │ ├── cftc_scrape_dockets.py │ │ ├── cftc_scrape_documents.py │ │ ├── sec_cftc_import.py │ │ ├── sec_cftc_name_dockets.py │ │ ├── sec_scrape_dockets.py │ │ ├── sec_scrape_documents.py │ │ └── suppress_duplicates.py └── settings.py └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | .project 3 | .pydevproject 4 | *.pickle 5 | nohup.out 6 | local_settings.py 7 | _test* 8 | oxtail 9 | pytrie 10 | Gemfile.lock -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'http://rubygems.org/' 2 | gem 'nokogiri' 3 | gem 'us-documents' -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015, Sunlight Foundation 2 | 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without modification, 6 | are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright notice, 9 | this list of conditions and the following disclaimer. 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 
13 | * Neither the name of Sunlight Foundation nor the names of its contributors 14 | may be used to endorse or promote products derived from this software 15 | without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 21 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 22 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 23 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 24 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 25 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 26 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # regulations-scraper 2 | 3 | This repo contains scraper code for maintaining a complete copy of all data on [Regulations.gov](http://regulations.gov) (consisting mainly of Federal Register documents and public comments), extracting text from said documents, and doing named entity recognition (using [Oxtail](https://github.com/sunlightlabs/oxtail)) and plagiarism detection/clustering (using [cluster-explorer](https://github.com/sunlightlabs/cluster-explorer)). Additionally, the project includes scrapers for a couple non-participating agencies, the SEC and CFTC, and shoehorns their content into the Regulations.gov data model. -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunlightlabs/regulations-scraper/5f2644a3cf54f915d7d90957645073737ab91022/__init__.py -------------------------------------------------------------------------------- /analysis/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunlightlabs/regulations-scraper/5f2644a3cf54f915d7d90957645073737ab91022/analysis/__init__.py -------------------------------------------------------------------------------- /analysis/export.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import os 5 | import csv 6 | import time 7 | import multiprocessing 8 | from Queue import Empty 9 | from datetime import datetime 10 | from collections import namedtuple 11 | from pymongo import Connection 12 | import StringIO 13 | 14 | pid = os.getpid() 15 | 16 | import_start = time.time() 17 | print '[%s] Loading trie...' % pid 18 | from oxtail.matching import match 19 | print '[%s] Loaded trie in %s seconds.' % (pid, time.time() - import_start) 20 | 21 | F = namedtuple('F', ['csv_column', 'transform']) 22 | 23 | def deep_get(key, dict, default=None): 24 | if '.' 
in key: 25 | first, rest = key.split('.', 1) 26 | return deep_get(rest, dict.get(first, {}), default) 27 | else: 28 | out = dict.get(key, default) 29 | return out if out else default 30 | 31 | def getter(key, default=''): 32 | return lambda d: deep_get(key, d, default) 33 | 34 | 35 | DOCS_QUERY = {'deleted': False} 36 | 37 | DOCS_FIELDS = [ 38 | F('document_id', getter('document_id')), 39 | F('docket_id', getter('docket_id')), 40 | F('agency', getter('agency')), 41 | F('date_posted', getter('details.receive_date', None)), 42 | F('date_due', getter('details.comment_end_date', None)), 43 | F('title', getter('title')), 44 | F('type', getter('type')), 45 | F('org_name', getter('details.organization')), 46 | F('submitter_name', lambda d: ' '.join(filter(bool, [deep_get('details.first_name', d, None), deep_get('details.mid_initial', d, None), deep_get('details.last_name', d, None)]))), 47 | F('on_type', getter('comment_on.type')), 48 | F('on_id', getter('comment_on.id')), 49 | F('on_title', getter('comment_on.title')), 50 | ] 51 | 52 | 53 | def filter_for_postgres(v): 54 | if v is None: 55 | return '\N' 56 | 57 | if isinstance(v, datetime): 58 | return str(v) 59 | 60 | return v.encode('utf8').replace("\.", ".") 61 | 62 | def process_doc(doc, fields=DOCS_FIELDS): 63 | # field extraction 64 | output = { 65 | 'metadata': [filter_for_postgres(f.transform(doc)) for f in fields], 66 | 'matches': [], 67 | 'submitter_matches': [] 68 | } 69 | 70 | # entity extraction 71 | if 'views' in doc and doc['views']: 72 | for view in doc['views']: 73 | if 'extracted' in view and view['extracted'] == True: 74 | for entity_id in match(view['text']).keys(): 75 | # hack to deal with documents whose scrapes failed but still got extracted 76 | object_id = doc['object_id'] if 'object_id' in doc else view['file'].split('/')[-1].split('.')[0] 77 | output['matches'].append([doc['document_id'], object_id, view['type'], 'view', entity_id]) 78 | if 'attachments' in doc and doc['attachments']: 79 | for attachment in doc['attachments']: 80 | if 'views' in attachment and attachment['views']: 81 | for view in attachment['views']: 82 | if 'extracted' in view and view['extracted'] == True: 83 | for entity_id in match(view['text']).keys(): 84 | output['matches'].append([doc['document_id'], attachment['object_id'], view['type'], 'attachment', entity_id]) 85 | 86 | # submitter matches 87 | for entity_id in match('\n'.join([output['metadata'][7], output['metadata'][8]])).keys(): 88 | output['submitter_matches'].append([doc['document_id'], entity_id]) 89 | 90 | return output 91 | 92 | # single-core version 93 | def dump_cursor(c, fields, filename): 94 | metadata_writer = csv.writer(open(sys.argv[3] + '_meta.csv', 'w')) 95 | metadata_writer.writerow([f.csv_column for f in fields]) 96 | 97 | match_writer = csv.writer(open(sys.argv[3] + '_text_matches.csv', 'w')) 98 | match_writer.writerow(['document_id', 'object_id', 'file_type', 'view_type', 'entity_id']) 99 | 100 | submitter_writer = csv.writer(open(sys.argv[3] + '_submitter_matches.csv', 'w')) 101 | submitter_writer.writerow(['document_id', 'entity_id']) 102 | 103 | for doc in c: 104 | doc_data = process_doc(doc) 105 | metadata_writer.writerow(doc_data['metadata']) 106 | match_writer.writerows(doc_data['matches']) 107 | submitter_writer.writerows(doc_data['submitter_matches']) 108 | 109 | # multi-core version and helpers 110 | def write_worker(done_queue, filename, fields=DOCS_FIELDS): 111 | print '[%s] Writer started.' 
% os.getpid() 112 | 113 | metadata_writer = csv.writer(open(sys.argv[3] + '_meta.csv', 'w')) 114 | metadata_writer.writerow([f.csv_column for f in fields]) 115 | 116 | match_writer = csv.writer(open(sys.argv[3] + '_text_matches.csv', 'w')) 117 | match_writer.writerow(['document_id', 'object_id', 'file_type', 'view_type', 'entity_id']) 118 | 119 | submitter_writer = csv.writer(open(sys.argv[3] + '_submitter_matches.csv', 'w')) 120 | submitter_writer.writerow(['document_id', 'entity_id']) 121 | 122 | while True: 123 | try: 124 | doc_data = done_queue.get(timeout=20) 125 | except Empty: 126 | print '[%s] CSV writes complete.' % os.getpid() 127 | return 128 | 129 | metadata_writer.writerow(doc_data['metadata']) 130 | match_writer.writerows(doc_data['matches']) 131 | submitter_writer.writerows(doc_data['submitter_matches']) 132 | 133 | done_queue.task_done() 134 | 135 | def process_worker(todo_queue, done_queue): 136 | print '[%s] Worker started.' % os.getpid() 137 | while True: 138 | try: 139 | doc = todo_queue.get(timeout=20) 140 | except Empty: 141 | print '[%s] Processing complete.' % os.getpid() 142 | return 143 | 144 | doc_data = process_doc(doc) 145 | done_queue.put(doc_data) 146 | 147 | todo_queue.task_done() 148 | 149 | def dump_cursor_multi(c, fields, filename, num_workers): 150 | todo_queue = multiprocessing.JoinableQueue(num_workers * 3) 151 | done_queue = multiprocessing.JoinableQueue(num_workers * 3) 152 | 153 | for i in range(num_workers): 154 | proc = multiprocessing.Process(target=process_worker, args=(todo_queue, done_queue)) 155 | proc.start() 156 | proc = multiprocessing.Process(target=write_worker, args=(done_queue, filename)) 157 | proc.start() 158 | 159 | for doc in c: 160 | todo_queue.put(doc) 161 | 162 | todo_queue.join() 163 | done_queue.join() 164 | 165 | if __name__ == '__main__': 166 | # set up options 167 | from optparse import OptionParser 168 | parser = OptionParser(usage="usage: %prog [options] host dbname file_prefix") 169 | parser.add_option("-l", "--limit", dest="limit", action="store", type="int", default=None, help="Limit number of records for testing.") 170 | parser.add_option("-m", "--multi", dest="multi", action="store", type="int", default=None, help="Set number of worker processes. Single-process model used if not specified.") 171 | 172 | (options, args) = parser.parse_args() 173 | 174 | # fetch options, args 175 | host = args[0] 176 | dbname = args[1] 177 | prefix = args[2] 178 | 179 | # do request and analysis 180 | if options.limit: 181 | cursor = Connection(host=host)[dbname].docs.find(DOCS_QUERY, limit=options.limit) 182 | else: 183 | cursor = Connection(host=host)[dbname].docs.find(DOCS_QUERY) 184 | 185 | run_start = time.time() 186 | print '[%s] Starting analysis...' % pid 187 | 188 | if options.multi: 189 | dump_cursor_multi(cursor, DOCS_FIELDS, prefix, options.multi) 190 | else: 191 | dump_cursor(cursor, DOCS_FIELDS, prefix) 192 | 193 | print '[%s] Completed analysis in %s seconds.' 
% (pid, time.time() - run_start) 194 | -------------------------------------------------------------------------------- /analysis/export_dockets.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import os 5 | import csv 6 | import time 7 | from datetime import datetime 8 | from collections import namedtuple 9 | from pymongo import Connection 10 | 11 | pid = os.getpid() 12 | 13 | DOCKETS_QUERY = {'scraped': True} 14 | 15 | DOCKET_MONGO_FIELDS = ['_id', 'title', 'agency', 'year'] 16 | DOCKET_CSV_FIELDS = ['docket_id', 'title', 'agency', 'year'] 17 | 18 | def filter_for_postgres(v): 19 | if v is None: 20 | return '\N' 21 | 22 | if isinstance(v, datetime): 23 | return str(v) 24 | 25 | return unicode(v).encode('utf8').replace("\.", ".") 26 | 27 | if __name__ == '__main__': 28 | # set up options 29 | from optparse import OptionParser 30 | parser = OptionParser(usage="usage: %prog [options] host dbname file_prefix") 31 | 32 | (options, args) = parser.parse_args() 33 | 34 | # fetch options, args 35 | host = args[0] 36 | dbname = args[1] 37 | prefix = args[2] 38 | 39 | writer = csv.writer(open(sys.argv[3] + '_dockets.csv', 'w')) 40 | writer.writerow(DOCKET_CSV_FIELDS) 41 | 42 | cursor = Connection(host=host)[dbname].dockets.find(DOCKETS_QUERY) 43 | 44 | run_start = time.time() 45 | print '[%s] Starting export...' % pid 46 | 47 | for row in cursor: 48 | writer.writerow([filter_for_postgres(row[field]) for field in DOCKET_MONGO_FIELDS]) 49 | 50 | print '[%s] Completed export in %s seconds.' % (pid, time.time() - run_start) 51 | -------------------------------------------------------------------------------- /analysis/schema.sql: -------------------------------------------------------------------------------- 1 | 2 | CREATE TABLE regulations_comments ( 3 | document_id varchar(32) PRIMARY KEY NOT NULL, 4 | docket_id varchar(32) NOT NULL, 5 | agency varchar(8) NOT NULL, 6 | date date, 7 | text text NOT NULL 8 | ); 9 | 10 | DROP TABLE IF EXISTS regulations_comments_full; 11 | CREATE TABLE regulations_comments_full ( 12 | document_id varchar(64) PRIMARY KEY NOT NULL, 13 | docket_id varchar(64) NOT NULL, 14 | agency varchar(8) NOT NULL, 15 | date_posted date, 16 | date_due date, 17 | title varchar(512) NOT NULL, 18 | type varchar(32), 19 | org_name varchar(255) NOT NULL, 20 | submitter_name varchar(255) NOT NULL, 21 | on_type varchar(32), 22 | on_id varchar(64) NOT NULL, 23 | on_title varchar(512) NOT NULL 24 | ); 25 | CREATE INDEX regulations_comments_full_docket_id ON regulations_comments_full ( docket_id ); 26 | 27 | -- this should replace some fields on the comment 28 | CREATE TABLE regulations_dockets ( 29 | docket_id varchar(64) PRIMARY KEY NOT NULL, 30 | title varchar(512) NOT NULL, 31 | agency varchar(8) NOT NULL, 32 | year smallint 33 | ); 34 | 35 | CREATE TABLE regulations_text_matches ( 36 | document_id varchar(64), 37 | object_id varchar(32), 38 | file_type varchar(16), 39 | view_type varchar(16), 40 | entity_id uuid 41 | ); 42 | 43 | CREATE TABLE regulations_submitter_matches ( 44 | document_id varchar(64), 45 | entity_id uuid 46 | ); 47 | -------------------------------------------------------------------------------- /analysis/tests.py: -------------------------------------------------------------------------------- 1 | 2 | import unittest 3 | 4 | from analysis.export import deep_get 5 | 6 | 7 | class TestDeepGet(unittest.TestCase): 8 | 9 | def test_deep_get(self): 10 | self.assertEqual(None, deep_get('foo', 
{})) 11 | 12 | self.assertEqual(7, deep_get('foo', {'foo': 7})) 13 | self.assertEqual(None, deep_get('bar', {'foo': 7})) 14 | 15 | self.assertEqual(7, deep_get('foo.bar.spaz', {'foo': {'bar': {'spaz': 7}}})) 16 | self.assertEqual({'spaz': 7}, deep_get('foo.bar', {'foo': {'bar': {'spaz': 7}}})) 17 | self.assertEqual(None, deep_get('foo.more.less', {'foo': {'bar': {'spaz': 7}}})) 18 | self.assertEqual(None, deep_get('spaz.more.less', {'foo': {'bar': {'spaz': 7}}})) 19 | 20 | 21 | if __name__ == '__main__': 22 | unittest.main() -------------------------------------------------------------------------------- /auto/fabfile.py: -------------------------------------------------------------------------------- 1 | from fabric.api import * 2 | from ssh_util import * 3 | from collections import OrderedDict 4 | import os, sys, json, datetime 5 | 6 | VERBOSE = False 7 | 8 | TASKS_ALWAYS = [ 9 | ('local', ['rdg_scrape']), 10 | ('local', ['rdg_download']), 11 | ('local', ['extract']), 12 | ('local', ['create_dockets']), 13 | ('local', ['rdg_scrape_dockets']), 14 | ('local', ['match_text']), 15 | ('local', ['add_to_search']), 16 | ] 17 | 18 | TASK_SETS = { 19 | 'major': [ 20 | ('local', ['rdg_dump_api']), 21 | ('local', ['rdg_parse_api']), 22 | ] + TASKS_ALWAYS + [ 23 | ('local', ['run_aggregates', '-A']), 24 | ('remote', ['analyze_regs', '-F']), 25 | ], 26 | 27 | 'minor': [ 28 | ('local', ['rdg_simple_update']), 29 | ] + TASKS_ALWAYS + [ 30 | ('local', ['run_aggregates']), 31 | ('remote', ['analyze_regs', '-F']), 32 | ] 33 | } 34 | 35 | ADMINS = [] 36 | EMAIL_SENDER = '' 37 | EMAIL_API_KEY = '' 38 | LOCK_DIR = '/tmp' 39 | LOG_DIR = '/var/log/scrape' 40 | 41 | try: 42 | from local_settings import * 43 | except: 44 | pass 45 | 46 | def send_email(recipients, subject, message): 47 | from postmark import PMMail 48 | message = PMMail( 49 | to = ','.join(recipients), 50 | subject = '[regs] %s' % subject, 51 | text_body = message, 52 | api_key = EMAIL_API_KEY, 53 | sender = EMAIL_SENDER 54 | ) 55 | message.send(test=False) 56 | 57 | def run_local(command): 58 | os.chdir(os.path.expanduser('~/regulations-scraper/regscrape')) 59 | out = local(' '.join([sys.executable, command]), capture=True) 60 | return out 61 | 62 | def run_remote(command): 63 | with cd('~/sparerib'): 64 | with prefix('source ~/.virtualenvs/sparerib_pypy/bin/activate'): 65 | return run(command) 66 | 67 | def handle_completion(message, results): 68 | output = '%s\nComplete results:\n%s' % (message, json.dumps(results, indent=4)) 69 | print output 70 | 71 | if ADMINS: 72 | send_email(ADMINS, message, output) 73 | 74 | def acquire_lock(): 75 | lock_path = os.path.join(LOCK_DIR, 'regs.lock') 76 | if os.path.exists(lock_path): 77 | raise RuntimeError("Can't acquire lock.") 78 | else: 79 | lock = open(lock_path, 'w') 80 | lock.write(str(os.getpid())) 81 | lock.close() 82 | 83 | def release_lock(): 84 | lock_path = os.path.join(LOCK_DIR, 'regs.lock') 85 | os.unlink(lock_path) 86 | 87 | @hosts(ssh_config('regs-fe')) 88 | def run_regs(start_with=None, end_with=None, task_set=None): 89 | try: 90 | # use a lock file to keep multiple instances from trying to run simultaneously, which, among other things, consumes all of the memory on the high-CPU instance 91 | acquire_lock() 92 | except: 93 | print 'Unable to acquire lock.' 
94 | if ADMINS: 95 | send_email(ADMINS, "Aborting: can't acquire lock", "Can't start processing due to inability to acquire lock.") 96 | 97 | sys.exit(1) 98 | 99 | # get some logging stuff ready 100 | now = datetime.datetime.now() 101 | today = now.date().isoformat() 102 | month = today.rsplit('-', 1)[0] 103 | month_log_path = os.path.join(LOG_DIR, month) 104 | if not os.path.exists(month_log_path): 105 | os.mkdir(month_log_path) 106 | 107 | if not (task_set and task_set in TASK_SETS): 108 | # is it Sunday? 109 | is_sunday = now.weekday() == 6 110 | 111 | # have we run already today? 112 | run_already = len([log_file for log_file in os.listdir(month_log_path) if log_file.startswith(today)]) > 0 113 | 114 | if is_sunday and not run_already: 115 | task_set = 'major' 116 | else: 117 | task_set = 'minor' 118 | all_tasks = TASK_SETS[task_set] 119 | 120 | print 'Starting task set "%s"...' % task_set 121 | 122 | start_with = start_with if start_with is not None else all_tasks[0][1][0] 123 | end_with = end_with if end_with is not None else all_tasks[-1][1][0] 124 | 125 | first_task_idx = [i for i in range(len(all_tasks)) if all_tasks[i][1][0] == start_with][0] 126 | last_task_idx = [i for i in range(len(all_tasks)) if all_tasks[i][1][0] == end_with][0] 127 | tasks = all_tasks[first_task_idx:(last_task_idx+1)] 128 | runners = { 129 | 'remote': run_remote, 130 | 'local': run_local 131 | } 132 | results = OrderedDict() 133 | for func, command in tasks: 134 | try: 135 | output = runners[func](' '.join(['./run.py' if func == 'local' else './manage.py'] + command + ['--parsable'])) 136 | try: 137 | results[command[0]] = json.loads(output) 138 | except ValueError: 139 | results[command[0]] = {'raw_results': output} 140 | if VERBOSE and ADMINS: 141 | send_email(ADMINS, 'Results of %s' % command[0], 'Results of %s:\n%s' % (command[0], json.dumps(results[command[0]], indent=4))) 142 | except SystemExit: 143 | results[command[0]] = 'failed' 144 | handle_completion('Aborting at step: %s' % command[0], results) 145 | if command[0] == "rdg_simple_update": 146 | release_lock() 147 | sys.exit(1) 148 | handle_completion('All steps completed.', results) 149 | 150 | logfile = open(os.path.join(month_log_path, now.isoformat() + ".json"), "w") 151 | logfile.write(json.dumps(results, indent=4)) 152 | logfile.close() 153 | 154 | release_lock() 155 | -------------------------------------------------------------------------------- /auto/requirements.txt: -------------------------------------------------------------------------------- 1 | fabric 2 | python-postmark 3 | -------------------------------------------------------------------------------- /auto/ssh_util.py: -------------------------------------------------------------------------------- 1 | from fabric.api import * 2 | 3 | def ssh_config(host): 4 | from os.path import expanduser 5 | from paramiko.config import SSHConfig 6 | 7 | def hostinfo(host, config): 8 | hive = config.lookup(host) 9 | if 'hostname' in hive: 10 | host = hive['hostname'] 11 | if 'user' in hive: 12 | host = '%s@%s' % (hive['user'], host) 13 | if 'port' in hive: 14 | host = '%s:%s' % (host, hive['port']) 15 | return host 16 | 17 | try: 18 | config_file = file(expanduser('~/.ssh/config')) 19 | except IOError: 20 | pass 21 | else: 22 | config = SSHConfig() 23 | config.parse(config_file) 24 | key = config.lookup(host).get('identityfile', None) 25 | key_filename = expanduser(key) 26 | 27 | env.key_filename = [key_filename] if key_filename else [] 28 | return hostinfo(host, config) 29 | 
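A minimal usage sketch for the ssh_config() helper above, mirroring how auto/fabfile.py consumes it (the "regs-fe" alias is the one fabfile.py targets; the uptime task itself is an illustrative assumption, not code from this repo):

    from fabric.api import hosts, run
    from ssh_util import ssh_config

    # ssh_config('regs-fe') looks the alias up in ~/.ssh/config, points Fabric's
    # env.key_filename at the entry's IdentityFile (if present), and returns a
    # "user@hostname:port" host string built from whatever fields the entry defines.
    @hosts(ssh_config('regs-fe'))
    def uptime():
        run('uptime')

With Fabric 1.x this would be invoked as "fab uptime"; the repo's own pipeline task is started the same way, e.g. "fab run_regs:task_set=minor".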
-------------------------------------------------------------------------------- /duplicates/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunlightlabs/regulations-scraper/5f2644a3cf54f915d7d90957645073737ab91022/duplicates/__init__.py -------------------------------------------------------------------------------- /duplicates/cftc.py: -------------------------------------------------------------------------------- 1 | 2 | import csv 3 | import re 4 | import os 5 | import sys 6 | import cPickle 7 | 8 | from clustering import Clustering 9 | from ngrams import NGramSpace 10 | 11 | def extract_row(row, pdf_path, ngrams): 12 | text = _get_text(row, pdf_path) 13 | date = row['DateTime Submitted'] 14 | if row['Middle Initial']: 15 | name = " ".join([row['First Name'], row['Middle Initial'], row['Last Name']]) 16 | else: 17 | name = " ".join([row['First Name'], row['Last Name']]) 18 | org = row['Organization'] 19 | 20 | return CFTCDocument(name, org, date, text, ngrams) 21 | 22 | 23 | def _get_text(row, pdf_path): 24 | source_file = row['File Name'] 25 | if source_file in ('', 'NULL'): 26 | return row['Comment Text'] 27 | else: 28 | if source_file.lower().endswith('.pdf'): 29 | stripped = source_file[:-4] 30 | elif source_file.lower().endswith('pdf'): 31 | stripped = source_file[:-3] 32 | else: 33 | stripped = source_file 34 | 35 | extraction = " ".join(open(os.path.join(pdf_path, stripped + '.txt'), 'r')) 36 | # as a sanity check, assure that there are at least 5 words 37 | if len(re.split('\W+', extraction)) > 5: 38 | return extraction 39 | else: 40 | return '' 41 | 42 | 43 | class CFTCDocument(object): 44 | 45 | def __init__(self, name, org, date, text, ngrams): 46 | self.name = name 47 | self.org = org 48 | self.date = date 49 | self.text = text 50 | self.parsed = ngrams.parse(self.text) 51 | 52 | def __str__(self): 53 | return "%s (%s)\n%s" % (self.name, self.org, self.text) 54 | 55 | @classmethod 56 | def get_output_headers(self): 57 | return ['name', 'org', 'date', 'text'] 58 | 59 | def get_output_values(self): 60 | return [self.name, self.org, self.date, self.text] 61 | 62 | 63 | def setup(source, pdf_path): 64 | ngrams = NGramSpace(4) 65 | print "parsing documents at %s..." % source 66 | docs = [extract_row(row, pdf_path, ngrams) for row in csv.DictReader(open(source, 'r'))] 67 | print "clustering %d documents..." % len(docs) 68 | clustering = Clustering([doc.parsed for doc in docs]) 69 | return (clustering, docs) 70 | 71 | 72 | if __name__ == '__main__': 73 | (clustering, docs) = setup(sys.argv[1], sys.argv[2]) 74 | print "\nWriting clustering to %s..." 
% sys.argv[3] 75 | cPickle.dump((clustering, docs), open(sys.argv[3], 'wb'), cPickle.HIGHEST_PROTOCOL) 76 | 77 | -------------------------------------------------------------------------------- /duplicates/clustering.py: -------------------------------------------------------------------------------- 1 | from ngrams import jaccard, NGramSpace 2 | import numpy 3 | 4 | 5 | class SymmetricMatrix(object): 6 | 7 | def __init__(self, n): 8 | self.values = numpy.zeros((n, n)) 9 | self.mask = None 10 | 11 | def submatrix(self, ids): 12 | sub = SymmetricMatrix(0) 13 | sub.values = self 14 | sub.mask = ids 15 | return sub 16 | 17 | def translate(self, index): 18 | (i, j) = (max(index), min(index)) 19 | if self.mask: 20 | return (self.mask[i], self.mask[j]) 21 | return (i, j) 22 | 23 | def __getitem__(self, index): 24 | return self.values[self.translate(index)] 25 | 26 | def __setitem__(self, index, value): 27 | self.values[self.translate(index)] = value 28 | 29 | def __len__(self): 30 | if self.mask: 31 | return len(self.mask) 32 | return len(self.values) 33 | 34 | 35 | class PriorityQueue(object): 36 | 37 | def __init__(self, size): 38 | self.size = size 39 | self.data = list() 40 | 41 | def insert(self, value, priority): 42 | i = len(self.data) 43 | while i > 0 and self.data[i - 1][0] > priority: 44 | i -= 1 45 | 46 | if i < self.size: 47 | self.data.insert(i, (priority, value)) 48 | self.data = self.data[:self.size] 49 | 50 | def values(self): 51 | return [value for (priority, value) in self.data] 52 | 53 | 54 | class Clustering(object): 55 | 56 | def __init__(self, docs): 57 | self.num_docs = len(docs) 58 | self.assignments = range(0, self.num_docs) 59 | 60 | self.distance = SymmetricMatrix(self.num_docs) 61 | count = 0 62 | for i in range(0, self.num_docs): 63 | for j in range(0, i + 1): 64 | self.distance[i, j] = 1.0 - jaccard(docs[i], docs[j]) 65 | 66 | count += 1 67 | if count % 1000000 == 0: 68 | print "Computed %d distances out of %d..." 
% (count, self.num_docs * self.num_docs / 2) 69 | 70 | 71 | for i in range(0, self.num_docs): 72 | for j in range(0, i): 73 | if self.distance[i, j] == 0 and self.assignments[i] != self.assignments[j]: 74 | self.merge(i, j) 75 | 76 | 77 | def min_link(self): 78 | min_i = None 79 | min_j = None 80 | min_d = 1.0 81 | 82 | for i in range(0, self.num_docs): 83 | for j in range(0, i): 84 | if self.distance[i, j] <= min_d and self.assignments[i] != self.assignments[j]: 85 | min_i = i 86 | min_j = j 87 | min_d = self.distance[i, j] 88 | 89 | return (min_i, min_j) 90 | 91 | 92 | def closest_neighbors(self, seeds, n=1): 93 | unseeded = [i for i in range(0, self.num_docs) if i not in seeds] 94 | unseeded_distance = [1.0] * len(unseeded) 95 | 96 | for seeded_index in range(0, len(seeds)): 97 | for unseeded_index in range(0, len(unseeded)): 98 | d = self.distance[seeds[seeded_index], unseeded[unseeded_index]] 99 | if d < unseeded_distance[unseeded_index]: 100 | unseeded_distance[unseeded_index] = d 101 | 102 | neighbors = PriorityQueue(n) 103 | for i in range(0, len(unseeded)): 104 | neighbors.insert(unseeded[i], unseeded_distance[i]) 105 | 106 | return neighbors.values() 107 | 108 | 109 | def merge(self, i, j): 110 | cluster_i = self.assignments[i] 111 | cluster_j = self.assignments[j] 112 | 113 | for x in range(0, self.num_docs): 114 | if self.assignments[x] == cluster_j: 115 | self.assignments[x] = cluster_i 116 | 117 | 118 | def get_clusters(self): 119 | mapping = dict([(rep, list()) for rep in set(self.assignments)]) 120 | for i in range(0, self.num_docs): 121 | mapping[self.assignments[i]].append(i) 122 | return mapping 123 | 124 | def get_cluster(self, i): 125 | rep = self.assignments[i] 126 | return [i for i in range(0, self.num_docs) if self.assignments[i] == rep] 127 | 128 | def _view(self, ids): 129 | if ids: 130 | return self.distance.submatrix(ids) 131 | return self.distance 132 | 133 | def pp_distance(self, ids): 134 | """ Pretty-print the distances between given docs. """ 135 | 136 | view = self._view(ids) 137 | 138 | print '\t' + '\t'.join([str(id) for id in ids]) 139 | for i in range(0, len(view)): 140 | distances = [view[i, j] for j in range(0, i)] 141 | print "%d:\t%s" % (ids[i], '\t'.join(['{0:.3}'.format(d) for d in distances])) 142 | 143 | (min, avg, max) = ['{0:.3}'.format(s) for s in self.stats(ids)] 144 | print "min/avg/max = %s / %s / %s" % (min, avg, max) 145 | 146 | def closest_pair(self, ids=None, farthest=False): 147 | view = self._view(ids) 148 | 149 | # set mins to first entry to be scanned... 
150 | # that way if it turns out to be the actual min, we won't be left w/ Nones 151 | min_i = ids[1] 152 | min_j = ids[0] 153 | min_d = view[1, 0] 154 | 155 | for i in range(0, len(view)): 156 | for j in range(0, i): 157 | if (view[i, j] >= min_d) if farthest else (view[i, j] <= min_d): 158 | min_i = ids[i] 159 | min_j = ids[j] 160 | min_d = view[i, j] 161 | 162 | return (min_i, min_j) 163 | 164 | def farthest_pair(self, ids=None): 165 | return self.closest_pair(ids, farthest=True) 166 | 167 | def stats(self, ids): 168 | if len(ids) < 2: 169 | return (0.0, 0.0, 0.0) 170 | 171 | view = self._view(ids) 172 | distances = list() 173 | 174 | for i in range(0, len(view)): 175 | for j in range(0, i): 176 | distances.append(view[i, j]) 177 | 178 | return (min(distances), sum(distances) / float(len(distances)), max(distances)) 179 | 180 | 181 | 182 | -------------------------------------------------------------------------------- /duplicates/db.py: -------------------------------------------------------------------------------- 1 | import re 2 | import csv 3 | from pymongo import Connection 4 | from BeautifulSoup import BeautifulSoup, Tag, NavigableString 5 | from clustering import NGramSpace, Clustering 6 | 7 | 8 | DOCUMENT_URL = 'http://www.regulations.gov/#!documentDetail;D=' 9 | 10 | class RegsDocument(object): 11 | 12 | def __init__(self, mongo_doc, ngrams): 13 | self.mongo_doc = mongo_doc 14 | self.comment = get_comment(mongo_doc) 15 | self.parsed = ngrams.parse(self.comment) 16 | self.url = DOCUMENT_URL + mongo_doc['Document ID'] 17 | self.title = mongo_doc['Details'].get('Title', '') if 'Details' in mongo_doc else '' 18 | 19 | def __str__(self): 20 | return "%s\n%s\n%s" % (self.title, self.url, self.comment) 21 | 22 | def get_id(self): 23 | return self.url 24 | 25 | 26 | def extract_html_comment(comment): 27 | soup = BeautifulSoup(comment) 28 | 29 | comment_header = soup.find('h2', text='General Comment').parent 30 | 31 | comment = '' 32 | 33 | for node in comment_header.findNextSiblings(): 34 | if node.name == 'h2': 35 | break 36 | 37 | comment += ''.join(strip_tags(node)) 38 | 39 | return comment 40 | 41 | def extract_comment(comment): 42 | pattern = 'Comments? 
?(?:\*+|:)(.*?)(?:===+.*)?$' 43 | match = re.search(pattern, comment, re.DOTALL) 44 | if match: 45 | return match.group(1).strip() 46 | 47 | return comment.strip() 48 | 49 | 50 | def strip_tags(node): 51 | if type(node) is NavigableString: 52 | return str(node) 53 | else: 54 | return ''.join(map(strip_tags, node.contents)) 55 | 56 | # todo: update with other types, fallback for unknown types 57 | VIEW_PREFERENCE = ['crtext', 'msw8', 'pdf'] 58 | 59 | def get_comment(doc): 60 | for label in VIEW_PREFERENCE: 61 | views = [v.get('text', '') for v in doc.get('views', []) if v.get('type', '') == label and v.get('decoded')] 62 | if views: 63 | return extract_comment(views[0]) 64 | 65 | return '' 66 | 67 | 68 | def docs_2_csv(docs, filename): 69 | writer = csv.writer(open(filename, 'w')) 70 | 71 | writer.writerow(['id', 'title', 'url', 'text']) 72 | 73 | for i in range(0, len(docs)): 74 | writer.writerow([i, docs[i].title.encode('utf8', 'ignore'), docs[i].url, docs[i].comment.encode('utf8', 'ignore')]) 75 | 76 | 77 | def get_texts(ngrams): 78 | c = Connection() 79 | docs = c.regulations.docs.find() 80 | return [RegsDocument(d, ngrams) for d in docs] 81 | 82 | 83 | def setup(): 84 | ngrams = NGramSpace(4) 85 | docs = get_texts(ngrams) 86 | clustering = Clustering([doc.parsed for doc in docs]) 87 | return (clustering, docs) 88 | 89 | -------------------------------------------------------------------------------- /duplicates/interactive.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import cPickle 4 | import csv 5 | 6 | from clustering import * 7 | from cftc import CFTCDocument 8 | #from db import RegsDocument 9 | 10 | 11 | def format_stats(size, stats): 12 | (min, avg, max) = ['{0:.3}'.format(s) for s in stats] 13 | return "%s documents: min/avg/max = %s / %s / %s" % (size, min, avg, max) 14 | 15 | 16 | def cluster_loop(clustering, docs): 17 | previous_seed = None 18 | 19 | while True: 20 | (seed, _) = clustering.min_link() 21 | 22 | if seed is None: 23 | print "All elements in single cluster." 24 | break 25 | if seed == previous_seed: 26 | print "Done clustering." 27 | break 28 | previous_seed = seed 29 | 30 | print "\n%s\n" % ('=' * 80) 31 | print "Initial document:\n" 32 | print docs[seed] 33 | 34 | exponential_loop(clustering, seed, docs) 35 | 36 | 37 | def exponential_loop(clustering, seed, docs): 38 | step_size = 1 39 | current_cluster = clustering.get_cluster(seed) 40 | current_stats = clustering.stats(current_cluster) 41 | 42 | while True: 43 | potential_reps = clustering.closest_neighbors(current_cluster, step_size) 44 | if not potential_reps: 45 | print "Nothing left to add to cluster." 46 | break 47 | 48 | potential_cluster = list(set(reduce(lambda x, y: x + y, map(clustering.get_cluster, potential_reps)))) 49 | potential_cluster.sort() 50 | combined_stats = clustering.stats(current_cluster + potential_cluster) 51 | 52 | avg_sim_before = 1 - current_stats[1] 53 | avg_sim_after = 1 - combined_stats[1] 54 | # removed per Nancy's request 55 | # if avg_sim_after < .5 * avg_sim_before: 56 | # print "*** Average distance increased too much. Stopping clustering automatically. 
***" 57 | # break 58 | 59 | print "\n%s\n" % ('-' * 80) 60 | print "Sample doc to cluster:" 61 | print docs[potential_reps[-1]] 62 | print "" 63 | print "Existing cluster\t%s" % format_stats(len(current_cluster), current_stats) 64 | print "Combined cluster\t%s" % format_stats(len(current_cluster) + len(potential_cluster), combined_stats) 65 | 66 | while True: 67 | choice = raw_input("Cluster? [Y/n] ").lower() 68 | if choice in ('', 'y', 'n'): 69 | break 70 | 71 | if choice in ('', 'y'): 72 | for rep in potential_reps: 73 | clustering.merge(seed, rep) 74 | 75 | step_size *= 2 76 | current_cluster = clustering.get_cluster(seed) 77 | current_stats = clustering.stats(current_cluster) 78 | else: 79 | if step_size == 1: 80 | break 81 | else: 82 | step_size = 1 83 | 84 | 85 | def dump_to_csv(clustering, docs, filename): 86 | writer = csv.writer(open(filename, 'w')) 87 | writer.writerow(['cluster number', 'document number'] + docs[0].get_output_headers()) 88 | 89 | clusters = [c for c in clustering.get_clusters().values() if len(c) > 1] 90 | clusters.sort(key=len, reverse=True) 91 | 92 | for i in range(0, len(clusters)): 93 | for d in clusters[i]: 94 | writer.writerow([i, d] + docs[d].get_output_values()) 95 | 96 | return writer 97 | 98 | 99 | def main(filename): 100 | print "Reading existing clustering from %s..." % filename 101 | (clustering, docs) = cPickle.load(open(filename, 'rb')) 102 | 103 | try: 104 | cluster_loop(clustering, docs) 105 | except KeyboardInterrupt: 106 | pass 107 | 108 | print "\nWriting clustering to %s..." % filename 109 | cPickle.dump((clustering, docs), open(filename, 'wb'), cPickle.HIGHEST_PROTOCOL) 110 | 111 | dump_to_csv(clustering, docs, filename + '.csv') 112 | 113 | 114 | if __name__ == '__main__': 115 | main(sys.argv[1]) 116 | 117 | -------------------------------------------------------------------------------- /duplicates/ngrams.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | class Sequencer(object): 5 | 6 | def __init__(self): 7 | self.next_id = 1 8 | self.data = dict() 9 | 10 | def id(self, x): 11 | existing_id = self.data.get(x, None) 12 | 13 | if existing_id: 14 | return existing_id 15 | 16 | self.data[x] = self.next_id 17 | self.next_id += 1 18 | 19 | return self.next_id - 1 20 | 21 | 22 | 23 | class NGramSpace(object): 24 | 25 | def __init__(self, n): 26 | self.n = n 27 | self.ngrams = Sequencer() 28 | 29 | def parse(self, text): 30 | normalized_text = re.sub('\W', ' ', text.lower()) 31 | split_text = normalized_text.split() 32 | 33 | ids = set() 34 | 35 | for i in range(0, len(split_text) + 1 - self.n): 36 | ngram = " ".join(split_text[i:i+self.n]) 37 | ids.add(self.ngrams.id(ngram)) 38 | 39 | sorted_ids = list(ids) 40 | sorted_ids.sort() 41 | 42 | return sorted_ids 43 | 44 | 45 | def overlap(x, y): 46 | i = 0 47 | j = 0 48 | 49 | c = 0 50 | 51 | len_x = len(x) 52 | len_y = len(y) 53 | 54 | while i < len_x and j < len_y: 55 | if x[i] > y[j]: 56 | j += 1 57 | elif x[i] < y[j]: 58 | i += 1 59 | else: # x[i] == y[j] 60 | c += 1 61 | i += 1 62 | j += 1 63 | 64 | return c 65 | 66 | 67 | def jaccard(x, y): 68 | intersection_size = overlap(x, y) 69 | union_size = len(x) + len(y) - intersection_size 70 | 71 | return float(intersection_size) / union_size if union_size != 0 else 0 72 | 73 | -------------------------------------------------------------------------------- /ec2/README: -------------------------------------------------------------------------------- 1 | These are scripts to rapidly get a 
new scraping instance up and running on Amazon EC2. 2 | -------------------------------------------------------------------------------- /ec2/install-deps.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | apt-get update 3 | apt-get install -y build-essential python2.7-dev git mercurial python-pip python-virtualenv virtualenvwrapper puf openbox libxslt1-dev libxml2-dev zlib1g-dev html2text poppler-utils ghostscript antiword catdoc libjpeg8-dev libwpd-tools unrtf 4 | 5 | mkdir /tmp/tesseract 6 | cd /tmp/tesseract 7 | wget http://ppa.launchpad.net/alex-p/notesalexp/ubuntu/pool/main/t/tesseract/tesseract-ocr_3.0.0+svn590-2ppa1~maverick1_amd64.deb 8 | wget http://ppa.launchpad.net/alex-p/notesalexp/ubuntu/pool/main/t/tesseract/tesseract-ocr-eng_3.0.0+svn590-2ppa1~maverick1_all.deb 9 | dpkg -i *.deb 10 | cd 11 | rm -rf /tmp/tesseract 12 | apt-get install -f 13 | if [ ! -f /usr/lib/liblept.so.0 ]; then 14 | ln -s /usr/lib/liblept.so.1 /usr/lib/liblept.so.0 15 | fi 16 | -------------------------------------------------------------------------------- /ec2/run-x.sh: -------------------------------------------------------------------------------- 1 | nohup Xvfb & 2 | export DISPLAY=:0 3 | 4 | nohup openbox & 5 | -------------------------------------------------------------------------------- /ec2/setup-env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /etc/bash_completion 4 | if [ ! -d $HOME/.virtualenvs ]; then 5 | mkdir $HOME/.virtualenvs 6 | fi 7 | mkvirtualenv scraper 8 | workon scraper 9 | 10 | pip install -r ../requirements.txt 11 | -------------------------------------------------------------------------------- /one_offs/copy_agency/cp.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import pymongo 3 | import gridfs 4 | 5 | source_db = pymongo.Connection(port=27019).regulations 6 | dest_db = pymongo.Connection().regulations_demo 7 | 8 | source_fs = gridfs.GridFS(source_db, 'files') 9 | dest_fs = gridfs.GridFS(dest_db, 'files') 10 | 11 | agency = sys.argv[1] 12 | 13 | for doc in source_db.docs.find({'agency': agency}): 14 | print "Copying document %s..." % doc['_id'] 15 | 16 | for attachment in [doc] + doc.get('attachments', []): 17 | for view in attachment.get('views', []): 18 | content_id = view.get('content', None) 19 | if content_id and source_fs.exists(content_id) and not dest_fs.exists(content_id): 20 | print "Copying file %s..." 
% content_id 21 | dest_fs.put(source_fs.get(content_id), _id=content_id) 22 | dest_db.docs.save(doc) -------------------------------------------------------------------------------- /one_offs/dodd_frank/agencies.py: -------------------------------------------------------------------------------- 1 | import pymongo 2 | import csv 3 | 4 | db = pymongo.Connection().regulations_models 5 | 6 | out = csv.DictWriter(open("agencies.csv", "w"), fieldnames=['agency', 'submitters', 'mentioned']) 7 | out.writeheader() 8 | 9 | for agency in db.agencies.find(): 10 | row = {} 11 | row['agency'] = agency['_id'] 12 | 13 | #row['fr_docs'] = ";".join(["%s (%s)" % (doc['id'], doc['title']) for doc in agency['stats']['fr_docs']]) 14 | 15 | row['submitters'] = "; ".join([ 16 | '%s (%s): %s' % (db.entities.find({'_id': item[0]})[0]['aliases'][0], item[0], item[1]) 17 | for item in sorted(agency['stats']['submitter_entities'].items(), key=lambda i: i[1], reverse=True) 18 | ]) 19 | 20 | row['mentioned'] = "; ".join([ 21 | '%s (%s): %s' % (db.entities.find({'_id': item[0]})[0]['aliases'][0], item[0], item[1]) 22 | for item in sorted(agency['stats']['text_entities'].items(), key=lambda i: i[1], reverse=True) 23 | ]) 24 | 25 | out.writerow(row) 26 | -------------------------------------------------------------------------------- /one_offs/dodd_frank/dockets.py: -------------------------------------------------------------------------------- 1 | import pymongo 2 | import csv 3 | import datetime 4 | 5 | from regscrape.models import * 6 | db = Doc._get_db() 7 | 8 | out = csv.DictWriter(open("dockets.csv", "w"), fieldnames=['title', 'docket_id', 'agency', 'fr_docs', 'submitters', 'mentioned', 'num_comments', 'num_public_interest_comments', 'num_bank_comments', 'num_law_firm_comments', 'word_count', 'last_seven']) 9 | out.writeheader() 10 | 11 | public_groups = ["ab09bc57f97b483391c483cbbdc479c8", "9c422ba85ac649269ce42804a6827059", "f8a9c531807f4585b1d5c73040b3c0fb", "f90cb1c4490344feba2ca83c2d3dd931", "be74818419c84a87b2b99c173aaea26d", "c58e0c68a7754ee2bd909fa68cecee7a", "2f0920a5271d41a7a85c4a7946775390", "10c585fd7f9d4cd1a82265e151b12f9e", "e31bfef434e9470b9e473d6182f2d021", "174a89892823486aad4538033fe0e8c7", "fb702029157e4c7c887172eba71c66c5", "6e5348b28f7242aab5437e0a34758350", "f89c8e3ab2b44f72971f91b764868231", "219154488de945e781330db65a54e1f4", "c5fe2c9b5a6c46fc8faeb34b8df6524f", "4536032e5d1d47248a5eddb86ce1a7f1", "23a8fb4b1188414ea125e06f34dc7df7", "3b14c79d157c4a8ab7e1bd7fdc589544"] 12 | banks = ["5202316fe79343a09a31e8c6c31ebe3d", "bc1d056e59334c07bb0761b97efa64e4", "793070ae7f5e42c2a76a58663a588f3d", "4e6bc9a6b7dc40d7b9b00d58c0e359db", "8d93cebae445485f9af02676a2d71b3f", "91f9a88888d744da8d433018cf912460", "c28bf9dd2a0b4ac19408b645ecc74a7a", "71c49bc56b3a4d369e727fd22744876a", "597eccfe48784677a437569ff6293097", "9bea23144b304a31a790a6c3a9e5f9e6", "878b4d98431344de88d8fb9757043a95", "8c007e162ca1450cbe7f976732a9a770", "c86403b874ea4d9390574088a2973705", "46ff48813fc34247b8d31e22a13663c5", "1fecb472df7444d3822e784f1a0845e6", "c24ef68246554310aa03888ea10cd9bf", "8376751efebe4687b70c84b7c33e3c74", "31e6e04b59084d5c9b09c102680bcc32", "b6a33d8be4784be58c69e1e487a3ed8b", "162122d165e24747b2d7ebb064d7142f"] 13 | law_firms = ["28f4d347bbae4d738aa3199346cf6850", "555e92b13c6640288ef76ee7c2bae09f", "783f8ace8f5d4a3c8a29c7d02b9a336f"] 14 | one_week = datetime.timedelta(days=7) 15 | 16 | for agency in db.dockets.find(): 17 | row = {} 18 | row['title'] = agency['title'].encode('ascii', errors='ignore') 19 | 
row['docket_id'] = agency['_id'] 20 | row['agency'] = agency['agency'] 21 | 22 | row['fr_docs'] = "; ".join(["%s (%s)" % (doc['id'], doc['title'].encode('ascii', errors='ignore')) for doc in agency['stats']['fr_docs']]) 23 | 24 | row['submitters'] = "; ".join([ 25 | '%s (%s): %s' % (db.entities.find({'_id': item[0]})[0]['aliases'][0].encode('ascii', errors='ignore'), item[0], item[1]) 26 | for item in sorted(agency['stats']['submitter_entities'].items(), key=lambda i: i[1], reverse=True) 27 | ]) 28 | 29 | row['mentioned'] = "; ".join([ 30 | '%s (%s): %s' % (db.entities.find({'_id': item[0]})[0]['aliases'][0].encode('ascii', errors='ignore'), item[0], item[1]) 31 | for item in sorted(agency['stats']['text_entities'].items(), key=lambda i: i[1], reverse=True) 32 | ]) 33 | 34 | row['num_comments'] = agency['stats']['type_breakdown'].get('public_submission', 0) 35 | 36 | row['num_public_interest_comments'] = sum([agency['stats']['submitter_entities'].get(entity, 0) for entity in public_groups]) 37 | row['num_bank_comments'] = sum([agency['stats']['submitter_entities'].get(entity, 0) for entity in banks]) 38 | row['num_law_firm_comments'] = sum([agency['stats']['submitter_entities'].get(entity, 0) for entity in law_firms]) 39 | 40 | last_seven = 0 41 | word_count = 0 42 | for doc in db.docs.find({'docket_id': agency['_id'], 'type': 'public_submission'}): 43 | if doc['views']: 44 | word_count += max([len(View._from_son(view).content.read()) for view in doc['views'] if view['extracted'] == 'yes'] or [0]) 45 | 46 | if doc.get('attachments', []): 47 | for attachment in doc['attachments']: 48 | if attachment['views']: 49 | word_count += max([len(View._from_son(view).content.read()) for view in attachment['views'] if view['extracted'] == 'yes'] or [0]) 50 | 51 | date = doc.get('details', {}).get('Date_Posted', None) 52 | if date and agency['stats']['date_range'][1]: 53 | if date > agency['stats']['date_range'][1] - one_week: 54 | last_seven += 1 55 | 56 | 57 | row['word_count'] = word_count 58 | row['last_seven'] = last_seven 59 | 60 | out.writerow(row) 61 | -------------------------------------------------------------------------------- /one_offs/dodd_frank/dump.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import sys 3 | 4 | dockets = [docket.strip() for docket in open(sys.argv[1])] 5 | 6 | for docket in dockets: 7 | p = subprocess.Popen(["./run.py", 'rdg_dump_api', '-d', docket]) 8 | p.communicate() -------------------------------------------------------------------------------- /one_offs/dodd_frank/parse.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import sys 3 | 4 | dockets = [docket.strip() for docket in open(sys.argv[1])] 5 | 6 | for docket in dockets: 7 | try: 8 | p = subprocess.Popen(["./run.py", 'rdg_parse_api', '-d', docket, '-A']) 9 | p.communicate() 10 | except: 11 | print "failed %s" % docket -------------------------------------------------------------------------------- /one_offs/dodd_frank/regscrape: -------------------------------------------------------------------------------- 1 | ../../regscrape -------------------------------------------------------------------------------- /one_offs/dodd_frank/settings.py: -------------------------------------------------------------------------------- 1 | ../../regscrape/settings.py -------------------------------------------------------------------------------- /one_offs/lightsquared/download_files.py: 
-------------------------------------------------------------------------------- 1 | from optparse import OptionParser 2 | arg_parser = OptionParser() 3 | 4 | def run(options, args): 5 | import json, os, sys 6 | from regscrape_lib.transfer import bulk_download 7 | 8 | if len(args) > 1: 9 | metadata_path = args[0] 10 | out_path = args[1] 11 | else: 12 | print "Specify files" 13 | sys.exit(0) 14 | 15 | input = json.load(open(metadata_path, 'r')) 16 | 17 | download_path = os.path.join(os.path.dirname(metadata_path), 'downloads') 18 | 19 | def download_generator(): 20 | for record in input: 21 | for document in record['documents']: 22 | num = document['url'].split('=').pop() + '.pdf' 23 | yield (document['url'], os.path.join(download_path, num), document) 24 | 25 | def status_func(status, url, filename, record): 26 | if status[0]: 27 | record['filename'] = 'downloads/' + filename.split('downloads/').pop() 28 | else: 29 | record['filename'] = False 30 | record['download_error'] = status[1] 31 | 32 | bulk_download(download_generator(), status_func, retries=2, verbose=True) 33 | 34 | date_handler = lambda obj: obj.isoformat() if hasattr(obj, 'isoformat') else None 35 | open(out_path, 'w').write(json.dumps(input, default=date_handler, indent=4)) -------------------------------------------------------------------------------- /one_offs/lightsquared/extract_text.py: -------------------------------------------------------------------------------- 1 | from optparse import OptionParser 2 | arg_parser = OptionParser() 3 | 4 | def run(options, args): 5 | import json, os, sys 6 | from regscrape_lib.extraction import serial_bulk_extract 7 | 8 | if len(args) > 1: 9 | metadata_path = args[0] 10 | out_path = args[1] 11 | else: 12 | print "Specify files" 13 | sys.exit(0) 14 | 15 | input = json.load(open(metadata_path, 'r')) 16 | 17 | file_path = os.path.dirname(metadata_path) 18 | 19 | def extract_generator(): 20 | for record in input: 21 | for document in record['documents']: 22 | yield (os.path.join(file_path, document['filename']), 'pdf', document) 23 | 24 | def status_func(status, text, filename, filetype, used_ocr, record): 25 | if status[0]: 26 | record['text'] = text 27 | else: 28 | record['text'] = None 29 | record['extraction_error'] = status[1] 30 | 31 | serial_bulk_extract(extract_generator(), status_func, verbose=True) 32 | 33 | date_handler = lambda obj: obj.isoformat() if hasattr(obj, 'isoformat') else None 34 | open(out_path, 'w').write(json.dumps(input, default=date_handler, indent=4)) -------------------------------------------------------------------------------- /one_offs/lightsquared/get_metadata.py: -------------------------------------------------------------------------------- 1 | from optparse import OptionParser 2 | arg_parser = OptionParser() 3 | 4 | def run(options, args): 5 | import zipfile, sys, datetime, re, json 6 | from lxml import etree 7 | 8 | if len(args) > 1: 9 | wbk_path = args[0] 10 | out_path = args[1] 11 | else: 12 | print "Specify files" 13 | sys.exit(0) 14 | 15 | wbk = zipfile.ZipFile(wbk_path, 'r') 16 | sheet = wbk.open("content.xml", 'r') 17 | 18 | document = etree.parse(sheet) 19 | 20 | ns_map = {'table': "urn:oasis:names:tc:opendocument:xmlns:table:1.0"} 21 | bools = {'Y': True, 'N': False} 22 | date_re = re.compile(r'\d{2}/\d{2}/\d{4}') 23 | date_handler = lambda obj: obj.isoformat() if hasattr(obj, 'isoformat') else None 24 | link_re = re.compile(r'of:=HYPERLINK\("(?P<url>[\w:/?=\.]*)";"(?P<title>[\w\s\(\)]*)"\)') 25 | 26 | rows = document.findall("//table:table-row", ns_map) 27 | 28 
| # handle the first row 29 | fields = [] 30 | for cell in rows[0].findall("table:table-cell", ns_map): 31 | text_nodes = cell.getchildren() 32 | if text_nodes and text_nodes[0].text: 33 | fields.append(text_nodes[0].text.lower().replace(' ', '_')) 34 | 35 | out = [] 36 | for row in rows[1:]: 37 | row_data = {'documents': []} 38 | cells = row.findall("table:table-cell", ns_map) 39 | for i in xrange(len(cells)): 40 | cell = cells[i] 41 | 42 | text_nodes = cell.getchildren() 43 | if "{urn:oasis:names:tc:opendocument:xmlns:table:1.0}formula" in cell.keys(): 44 | # looks like a link 45 | hyperlink = cell.attrib["{urn:oasis:names:tc:opendocument:xmlns:table:1.0}formula"] 46 | match = link_re.match(hyperlink) 47 | if not match: 48 | print hyperlink 49 | print 'failed to parse link %s' % hyperlink 50 | sys.exit() 51 | row_data['documents'].append(match.groupdict()) 52 | elif text_nodes and text_nodes[0].text: 53 | # looks like plain text 54 | text = text_nodes[0].text 55 | 56 | # fix dates 57 | if date_re.match(text): 58 | text = datetime.datetime.strptime(text, '%m/%d/%Y').date() 59 | 60 | # fix booleans: 61 | if text in bools: 62 | text = bools[text] 63 | 64 | row_data[fields[i]] = text 65 | 66 | if len(row_data.keys()) > 1: 67 | out.append(row_data) 68 | 69 | open(out_path, 'w').write(json.dumps(out, default=date_handler, indent=4)) -------------------------------------------------------------------------------- /one_offs/pdf_repair/detect_pdfs.py: -------------------------------------------------------------------------------- 1 | GEVENT = False 2 | 3 | def run(): 4 | from regs_models import Doc 5 | import json 6 | from regs_common.processing import * 7 | 8 | problems = set() 9 | for finder in [find_views, find_attachment_views]: 10 | for view_d in finder(mode="html", type="pdf", extracted="yes"): 11 | content = view_d['view'].content.read() 12 | if not content: 13 | print "Weird:", view_d['doc'] 14 | elif html_is_empty(content): 15 | print "Problem:", view_d['doc'] 16 | problems.add(view_d['doc']) 17 | else: 18 | print "OK:", view_d['doc'] 19 | 20 | print "%s problems" % len(problems) 21 | outfile = open("/tmp/problems.json", "w") 22 | json.dump(sorted(list(problems)), outfile) -------------------------------------------------------------------------------- /one_offs/pdf_repair/fix_pdfs.py: -------------------------------------------------------------------------------- 1 | GEVENT = False 2 | 3 | from regs_models import Doc 4 | import json 5 | import itertools 6 | 7 | def split_seq(iterable, size): 8 | it = iter(iterable) 9 | item = list(itertools.islice(it, size)) 10 | while item: 11 | yield item 12 | item = list(itertools.islice(it, size)) 13 | 14 | all_ids = json.load(open("/tmp/problems.json")) 15 | for ids in split_seq(all_ids, 1000): 16 | for doc in Doc.objects(id__in=ids): 17 | for view in doc.views: 18 | if view.type == "pdf" and view.mode == "html" and view.extracted == "yes": 19 | view.extracted = "no" 20 | view.content.delete() 21 | for attachment in doc.attachments: 22 | for view in attachment.views: 23 | if view.type == "pdf" and view.mode == "html" and view.extracted == "yes": 24 | view.extracted = "no" 25 | view.content.delete() 26 | doc.in_search_index = False 27 | doc.in_cluster_db = False 28 | doc.entities_last_extracted = None 29 | 30 | print "Repaired %s" % doc.id 31 | doc.save() -------------------------------------------------------------------------------- /regscrape/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sunlightlabs/regulations-scraper/5f2644a3cf54f915d7d90957645073737ab91022/regscrape/__init__.py -------------------------------------------------------------------------------- /regscrape/pipeline.py: -------------------------------------------------------------------------------- 1 | import settings 2 | import pymongo 3 | import json 4 | import sys 5 | import os 6 | import subprocess 7 | import time 8 | import signal 9 | import datetime 10 | 11 | DEFAULT_SEQUENCE = [ 12 | 'rdg_dump_api', 13 | 'rdg_parse_api', 14 | 'rdg_scrape', 15 | 'rdg_download', 16 | 'extract', 17 | 'create_dockets', 18 | 'rdg_scrape_dockets', 19 | 'add_to_search', 20 | 'run_aggregates' 21 | ] 22 | OVERRIDE_SEQUENCES = {} 23 | FLAGS = { 24 | 'scrape': ['-m', '8'], 25 | 'scrape_dockets': ['-m', '8'] 26 | } 27 | 28 | running = {} 29 | 30 | db = pymongo.Connection(**settings.DB_SETTINGS)[settings.DB_NAME] 31 | pid = os.getpid() 32 | 33 | enabled = True 34 | def sigint_handler(signum, frame): 35 | global enabled 36 | enabled = False 37 | print "Caught SIGINT; will exit after current tasks are complete." 38 | signal.signal(signal.SIGINT, sigint_handler) 39 | signal.signal(signal.SIGHUP, sigint_handler) 40 | 41 | def preexec_function(): 42 | # Ignore the SIGINT signal by setting the handler to the standard 43 | # signal handler SIG_IGN. 44 | signal.signal(signal.SIGINT, signal.SIG_IGN) 45 | 46 | # start by resetting failures 47 | for agency_record in db.pipeline.find(): 48 | sequence = OVERRIDE_SEQUENCES.get(agency_record['_id'], DEFAULT_SEQUENCE) 49 | completed = agency_record['completed'] 50 | failed_idxs = [i for i in xrange(len(sequence)) if sequence[i] in completed and type(completed[sequence[i]]) != dict] 51 | if failed_idxs: 52 | print "Resetting everything for agency %s after command %s" % (agency_record['_id'], sequence[failed_idxs[0]]) 53 | for command in sequence[failed_idxs[0]:]: 54 | if command in completed: 55 | print "Resetting %s" % command 56 | del completed[command] 57 | db.pipeline.update({'_id': agency_record['_id']}, {'$set': {'completed': completed}}, safe=True) 58 | 59 | while True: 60 | now = str(datetime.datetime.now()) 61 | print "[%s] TICK %s" % (now, pid) 62 | 63 | # book-keep already started processes 64 | for command, info in running.items(): 65 | agency, proc = info 66 | if proc.poll() is not None: 67 | print "[%s] %s has finished command %s" % (now, agency, command) 68 | results = proc.stdout.read() 69 | try: 70 | parsed = json.loads(results) 71 | except ValueError: 72 | parsed = "parse_failure" 73 | 74 | db.pipeline.update({'_id': agency}, {'$set': {('completed.' 
+ command): parsed}}, safe=True) 75 | del running[command] 76 | 77 | # start up new ones as necessary, assuming we're still going 78 | if enabled: 79 | for agency_record in db.pipeline.find().sort('count'): 80 | agency = agency_record['_id'] 81 | 82 | sequence = OVERRIDE_SEQUENCES.get(agency, DEFAULT_SEQUENCE) 83 | completed = agency_record['completed'].keys() 84 | to_do = [command for command in sequence if command not in completed] 85 | 86 | if not to_do: 87 | continue 88 | 89 | next = to_do[0] 90 | if next not in running: 91 | full_command = [sys.executable, './run.py', next] + FLAGS.get(next, []) + ['-a', agency, '--parsable'] 92 | proc = subprocess.Popen(full_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, preexec_fn=preexec_function) 93 | running[next] = (agency, proc) 94 | print '[%s] %s has started command %s' % (now, agency, next) 95 | 96 | if not running.keys(): 97 | print 'Nothing left to do; exiting.' 98 | break 99 | 100 | time.sleep(1) -------------------------------------------------------------------------------- /regscrape/regs_common/__init__.py: -------------------------------------------------------------------------------- 1 | # lifted some logging code from the examples in the Python docs 2 | 3 | import logging 4 | 5 | # create logger 6 | logger = logging.getLogger("regscrape") 7 | logger.setLevel(logging.DEBUG) 8 | 9 | # create console handler and set level to debug 10 | ch = logging.StreamHandler() 11 | ch.setLevel(logging.DEBUG) 12 | 13 | # create formatter 14 | formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") 15 | 16 | # add formatter to ch 17 | ch.setFormatter(formatter) 18 | 19 | # add ch to logger 20 | logger.addHandler(ch) 21 | 22 | # also log pykka stuff if DEBUG is true 23 | import settings 24 | 25 | if settings.DEBUG: 26 | for ext_logger_name in ['pykka', 'remote_connection']: 27 | ext_logger = logging.getLogger(ext_logger_name) 28 | ext_logger.setLevel(logging.DEBUG) 29 | ext_logger.addHandler(ch) 30 | 31 | # add self to path 32 | import sys 33 | import os 34 | CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) 35 | if CURRENT_DIR not in sys.path: 36 | sys.path.append(CURRENT_DIR) 37 | -------------------------------------------------------------------------------- /regscrape/regs_common/commands/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunlightlabs/regulations-scraper/5f2644a3cf54f915d7d90957645073737ab91022/regscrape/regs_common/commands/__init__.py -------------------------------------------------------------------------------- /regscrape/regs_common/commands/add_to_search.py: -------------------------------------------------------------------------------- 1 | GEVENT = False 2 | 3 | from optparse import OptionParser 4 | arg_parser = OptionParser() 5 | arg_parser.add_option("-a", "--agency", dest="agency", action="store", type="string", default=None, help="Specify an agency to which to limit the dump.") 6 | arg_parser.add_option("-d", "--docket", dest="docket", action="store", type="string", default=None, help="Specify a docket to which to limit the dump.") 7 | arg_parser.add_option("-A", "--all", dest="process_all", action="store_true", default=False, help="Replace existing search data with new data.") 8 | 9 | from regs_models import * 10 | import urllib2, json, traceback, datetime, zlib, pymongo, pytz, itertools 11 | import rawes, requests, thrift 12 | 13 | def run(options, args): 14 | while 
True: 15 | try: 16 | return add_to_search(options, args) 17 | except (pymongo.errors.OperationFailure, requests.exceptions.ConnectionError, thrift.transport.TTransport.TTransportException): 18 | print "Resetting..." 19 | continue 20 | 21 | def add_to_search(options, args): 22 | import settings 23 | 24 | es = rawes.Elastic(getattr(settings, "ES_HOST", 'thrift://localhost:9500'), timeout=60.0) 25 | index = getattr(es, settings.ES_INDEX) 26 | 27 | now = datetime.datetime.now() 28 | 29 | querysets = {} 30 | builders = {} 31 | metadata = {} 32 | 33 | PER_REQUEST = 200 34 | 35 | ### Dockets ### 36 | 37 | query = {'scraped': 'yes'} 38 | if options.agency: 39 | query['agency'] = options.agency 40 | if options.docket: 41 | query['_id'] = options.docket 42 | if not options.process_all: 43 | query['in_search_index'] = False 44 | 45 | querysets['docket'] = Docket.objects(__raw__=query) 46 | 47 | def build_docket(docket): 48 | print 'preparing docket', docket.id 49 | 50 | # build initial ES document 51 | es_doc = { 52 | 'title': docket.title, 53 | 'agency': docket.agency, 54 | 'identifiers': [docket.id] 55 | } 56 | 57 | # add identifiers 58 | if docket.rin and docket.rin != "Not Assigned": 59 | es_doc['identifiers'].append(docket.rin) 60 | 61 | return es_doc 62 | 63 | def get_docket_metadata(docket): 64 | return {'_index': settings.ES_INDEX, '_type': 'docket', '_id': docket.id} 65 | 66 | builders['docket'] = build_docket 67 | metadata['docket'] = get_docket_metadata 68 | 69 | ### Documents ### 70 | 71 | query = {'deleted': False, 'scraped': 'yes', '$nor': [{'views.extracted': 'no'},{'attachments.views.extracted':'no'}]} 72 | if options.agency: 73 | query['agency'] = options.agency 74 | if options.docket: 75 | query['docket_id'] = options.docket 76 | if not options.process_all: 77 | query['in_search_index'] = False 78 | 79 | querysets['document'] = Doc.objects(__raw__=query) 80 | 81 | def build_document(doc): 82 | print 'preparing document', doc.id 83 | if doc.renamed: 84 | print 'preparing', doc.id 85 | doc.in_search_index = True 86 | doc.save() 87 | return None 88 | 89 | # build initial ES document 90 | es_doc = { 91 | 'docket_id': doc.docket_id if doc.docket_id else doc.id.rsplit('-', 1)[0], 92 | 'comment_on': doc.comment_on.get('document_id', None) if doc.comment_on else None, 93 | 'title': doc.title, 94 | 'agency': doc.agency, 95 | 'posted_date': doc.details['Date_Posted'].replace(tzinfo=pytz.UTC) if 'Date_Posted' in doc.details else None, 96 | 'document_type': doc.type, 97 | 'submitter_organization': doc.details.get('Organization_Name', None), 98 | 'submitter_name': ' '.join(filter(bool, [doc.details.get('First_Name', None), doc.details.get('Middle_Initial', None), doc.details.get('Last_Name', None)])), 99 | 'submitter_entities': doc.submitter_entities, 100 | 'files': [], 101 | 'analyses': [], 102 | 'identifiers': [doc.id] 103 | } 104 | 105 | # add views (max of 5 to avoid pathological cases) 106 | for view in doc.views[:5]: 107 | if not view.content: 108 | continue 109 | es_doc['files'].append({ 110 | "title": None, 111 | "abstract": None, 112 | "object_id": doc.object_id, 113 | "file_type": view.type, 114 | "view_type": "document_view", 115 | "text": view.as_text()[:100000], 116 | "entities": view.entities 117 | }) 118 | 119 | # add attachments (max of 10 to avoid pathological cases) 120 | for attachment in doc.attachments[:10]: 121 | for view in attachment.views[:5]: 122 | if not view.content: 123 | continue 124 | es_doc['files'].append({ 125 | "title": attachment.title, 126 | "abstract": 
attachment.abstract, 127 | "object_id": attachment.object_id, 128 | "file_type": view.type, 129 | "view_type": "attachment_view", 130 | "text": view.as_text()[:100000], 131 | "entities": view.entities 132 | }) 133 | 134 | # add identifiers 135 | if doc.rin and doc.rin != "Not Assigned": 136 | es_doc['identifiers'].append(doc.rin) 137 | 138 | if doc.details.get('Federal_Register_Number', None): 139 | es_doc['identifiers'].append(doc.details['Federal_Register_Number']) 140 | 141 | if doc.details.get('FR_Citation', None): 142 | es_doc['identifiers'].append(doc.details['FR_Citation'].replace(' ', '')) 143 | 144 | return es_doc 145 | 146 | def get_document_metadata(doc): 147 | return {'_index': settings.ES_INDEX, '_type': 'document', '_id': doc.id, '_parent': doc.docket_id if doc.docket_id else doc.id.rsplit('-', 1)[0]} 148 | 149 | builders['document'] = build_document 150 | metadata['document'] = get_document_metadata 151 | 152 | ### Actually do everything ### 153 | def flush(queue, ids, collection): 154 | # no need to do anything if there aren't any docs to add 155 | if not ids: 156 | return 157 | 158 | # save current queue to ES 159 | try: 160 | es_status = es._bulk.post(data="\n".join(queue)) 161 | print 'saved %s to ES' % ", ".join(ids) 162 | except rawes.elastic_exception.ElasticException: 163 | # sometimes the bulk save fails for some reason; fall back to traditional iterative safe if so 164 | print 'falling back to iterative save...' 165 | # iterate over the queue pair-wise 166 | for command, record in itertools.izip(*[iter(queue)]*2): 167 | meta = json.loads(command)['index'] 168 | params = {'parent': meta['_parent']} if '_parent' in meta else {} 169 | 170 | es_index = getattr(es, meta['_index']) 171 | es_type = getattr(es_index, meta['_type']) 172 | 173 | es_status = es_type[meta['_id']].put(data=record, params=params) 174 | print 'saved %s to ES as %s' % (meta['_id'], es_status['_id']) 175 | 176 | # update mongo docs 177 | collection.update({'_id': {'$in': ids}}, {'$set': {'in_search_index': True}}, multi=True, safe=True) 178 | 179 | print "saved %s back to mongo" % ", ".join(ids) 180 | 181 | counts = {'docket': 0, 'document': 0} 182 | for datatype in ('docket', 'document'): 183 | queue = [] 184 | ids = [] 185 | max_length = PER_REQUEST * 2 186 | for item in querysets[datatype]: 187 | record = builders[datatype](item) 188 | meta = metadata[datatype](item) 189 | 190 | if record: 191 | if not item.suppression.get('replaced_by', None): 192 | queue.append(json.dumps({'index':meta})) 193 | queue.append(json.dumps(record, default=es.json_encoder)) 194 | ids.append(item.id) 195 | 196 | if len(queue) >= max_length: 197 | flush(queue, ids, querysets[datatype]._collection) 198 | counts[datatype] += len(ids) 199 | queue = [] 200 | ids = [] 201 | flush(queue, ids, querysets[datatype]._collection) 202 | counts[datatype] += len(ids) 203 | 204 | print "Done adding things to search: %s docket entries and %s document entries." 
% (counts['docket'], counts['document']) 205 | return counts -------------------------------------------------------------------------------- /regscrape/regs_common/commands/administer_search.py: -------------------------------------------------------------------------------- 1 | GEVENT = False 2 | 3 | from optparse import OptionParser 4 | arg_parser = OptionParser() 5 | arg_parser.add_option("-d", "--delete", dest="delete", action="store_true", default=False, help="Delete the search index.") 6 | arg_parser.add_option("-c", "--create", dest="create", action="store_true", default=False, help="Create the search index.") 7 | 8 | from regs_models import * 9 | import urllib2, json, os 10 | import rawes 11 | 12 | def run(options, args): 13 | import settings, regs_common 14 | 15 | es = rawes.Elastic(getattr(settings, "ES_HOST", 'thrift://localhost:9500'), timeout=30.0) 16 | index = getattr(es, settings.ES_INDEX) 17 | 18 | if options.delete: 19 | index.delete() 20 | print "Index deleted." 21 | 22 | if options.create: 23 | mapping_file = os.path.join(os.path.abspath(os.path.dirname(regs_common.__file__)), "data", "es_mapping.json") 24 | mapping_data = json.load(open(mapping_file)) 25 | index.put(data={'mappings': mapping_data}) 26 | print "Index created." 27 | 28 | stats = es._stats.get() 29 | print json.dumps(stats, indent=4) 30 | 31 | return stats -------------------------------------------------------------------------------- /regscrape/regs_common/commands/annotate_fr_agencies.py: -------------------------------------------------------------------------------- 1 | GEVENT = False 2 | 3 | from regs_models import * 4 | import json, urllib2 5 | 6 | def run(): 7 | fr_data = json.load(urllib2.urlopen("https://www.federalregister.gov/api/v1/agencies.json")) 8 | 9 | fr_dict = {r['short_name']: r for r in fr_data if r['short_name']} 10 | 11 | for agency in Agency.objects(): 12 | if agency.id in fr_dict: 13 | agency.fr_id = fr_dict[agency.id]['id'] 14 | agency.save() 15 | print "Saved %s with ID %s" % (agency.name, agency.fr_id) -------------------------------------------------------------------------------- /regscrape/regs_common/commands/annotate_fr_docs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | GEVENT = False 4 | 5 | from regs_common.exceptions import * 6 | from regs_models import * 7 | from optparse import OptionParser 8 | 9 | import json, urllib, urllib2, os, re, datetime 10 | 11 | from pyquery import PyQuery as pq 12 | import dateutil.parser 13 | import jellyfish 14 | 15 | # arguments 16 | arg_parser = OptionParser() 17 | arg_parser.add_option("-a", "--agency", dest="agency", action="store", type="string", default=None, help="Specify an agency to which to limit the dump.") 18 | arg_parser.add_option("-d", "--docket", dest="docket_id", action="store", type="string", default=None, help="Specify a docket to which to limit the dump.") 19 | arg_parser.add_option("-s", "--source", dest="source", action="store", type="string", default=None, help="Specify a scraping source to which to limit the dump.") 20 | arg_parser.add_option("-A", "--all", dest="process_all", action="store_true", default=False, help="Replace existing FR data with new data.") 21 | 22 | HEADER_MATCHER = re.compile(r"^[\*\s]*Federal Register[\*\s]*/ Vol. 
(\d+).*") 23 | NUMBER = re.compile("^(\d+)$") 24 | THREE_MONTHS = datetime.timedelta(days=90) 25 | 26 | def fr_citation_from_view(view): 27 | view_text = view.as_text() 28 | lines = view_text.split("\n") 29 | 30 | # look for a page header 31 | header_match = [HEADER_MATCHER.match(l) for l in lines] 32 | header_lines = [(i, m.groups()[0]) for i, m in enumerate(header_match) if m] 33 | 34 | # now, for each, the page number will come either on the preceding or following line depending whether it's a left or right page 35 | number_match = {i: filter(bool, [NUMBER.match(lines[n].replace('*', '').strip()) for n in (i - 1, i + 1)]) for i, l in header_lines} 36 | 37 | header_lines_n = [(i, l, number_match[i][0].groups()[0]) for i, l in header_lines if number_match[i]] 38 | 39 | if header_lines_n: 40 | return "%s FR %s" % (header_lines_n[0][1], header_lines_n[0][2]) 41 | 42 | return None 43 | 44 | _fr_ids = {} 45 | def fr_id_for_agency(agency): 46 | if agency in _fr_ids: 47 | return _fr_ids[agency] 48 | 49 | agency_obj = Agency.objects.get(id=agency) 50 | _fr_ids[agency] = agency_obj.fr_id if agency_obj.fr_id else None 51 | return _fr_ids[agency] 52 | 53 | def levenshtein_ratio(s1, s2): 54 | s = len(s1) + len(s2) 55 | return (s - jellyfish.levenshtein_distance(s1.encode('utf8'), s2.encode('utf8'))) / float(s) 56 | 57 | def guess_fr_num(doc): 58 | # if it's title-less or has a very short title, don't bother 59 | if not doc.title or len(doc.title) < 10: 60 | return None 61 | 62 | query = {'conditions[term]': doc.title.encode('utf8')} 63 | 64 | aid = fr_id_for_agency(doc.agency) 65 | if aid: 66 | query['conditions[agency_ids][]'] = str(aid) 67 | 68 | has_date = 'Date_Posted' in doc.details 69 | if has_date: 70 | # bracket the FR date by three months in either direction because sometimes they don't match 71 | query['conditions[publication_date][gte]'] = (doc.details['Date_Posted'] - THREE_MONTHS).strftime("%m/%d/%Y") 72 | query['conditions[publication_date][lte]'] = (doc.details['Date_Posted'] + THREE_MONTHS).strftime("%m/%d/%Y") 73 | 74 | # do search 75 | results = json.load(urllib2.urlopen("https://www.federalregister.gov/api/v1/articles?" + urllib.urlencode(query))) 76 | if results['count']: 77 | # first annotate each with its title's distance to the real title, how far in time it is from the real time 78 | for result in results['results']: 79 | result['similarity'] = levenshtein_ratio(result['title'], doc.title) 80 | 81 | if has_date: 82 | real_date = dateutil.parser.parse(result['publication_date']) 83 | result['time_apart'] = abs(doc.details['Date_Posted'] - real_date) 84 | 85 | # then strip out all the ones that aren't at least 75% similar 86 | candidates = [result for result in results['results'] if result['similarity'] > 0.75] 87 | if not candidates: 88 | return None 89 | 90 | # then sort by how far away in time they are, if there are dates, or the distance otherwise 91 | if has_date: 92 | sorted_candidates = sorted(candidates, key=lambda r: r['time_apart']) 93 | else: 94 | sorted_candidates = sorted(candidates, key=lambda r: r['similarity'], reverse=True) 95 | 96 | return sorted_candidates[0]['document_number'] 97 | 98 | def fr_num_from_cite(fr_cite, title): 99 | # construct a query 100 | query = {'conditions[term]': fr_cite.encode('utf8')} 101 | 102 | # do search -- has to be by HTML because there doesn't seem to be a way to do citation searches via the API 103 | page = pq(url="https://www.federalregister.gov/articles/search?"
+ urllib.urlencode(query)) 104 | links = page('.matching_citation_document h4 a') 105 | 106 | if not links: 107 | return None 108 | 109 | items = [(link.attr('href'), link.text()) for link in links.items()] 110 | 111 | # we order only by name because all results are on the same page and will therefore be from the same date 112 | sorted_items = sorted(items, key=lambda l: levenshtein_ratio(l[1], title), reverse=True) 113 | 114 | # the document number is the thing before the last slash 115 | return sorted_items[0][0].split("/")[-2] 116 | 117 | def run(options, args): 118 | query = {'type__in': ['notice', 'proposed_rule', 'rule']} 119 | 120 | for filter_type in ('agency', 'docket_id', 'source'): 121 | filter_attr = getattr(options, filter_type) 122 | if filter_attr: 123 | query[filter_type] = filter_attr 124 | 125 | frn, frc, g, nd = 0, 0, 0, 0 126 | for doc in Doc.objects(**query): 127 | if 'fr_data' in doc.annotations and not options.process_all: 128 | continue 129 | 130 | fr_num = None 131 | fr_cite = None 132 | 133 | if 'Federal_Register_Number' in doc.details: 134 | print doc.id, 'FR num', doc.details['Federal_Register_Number'].encode('utf8') 135 | frn += 1 136 | 137 | # try fetching now; maybe we're done, but we can always try one of the other tactics if this doesn't work 138 | fr_num = doc.details['Federal_Register_Number'].encode('utf8') 139 | try: 140 | doc.annotations['fr_data'] = json.load(urllib2.urlopen("https://www.federalregister.gov/api/v1/articles/" + fr_num)) 141 | doc.save() 142 | print 'Succeeded with %s via FR number' % doc.id 143 | continue 144 | except: 145 | fr_num = None 146 | 147 | if 'Federal_Register_Citation' in doc.details: 148 | print doc.id, 'FR cite', doc.details['Federal_Register_Citation'].encode('utf8') 149 | frc += 1 150 | fr_cite = doc.details['Federal_Register_Citation'].encode('utf8') 151 | fr_num = fr_num_from_cite(fr_cite, doc.title) 152 | if fr_num: 153 | # try again 154 | try: 155 | doc.annotations['fr_data'] = json.load(urllib2.urlopen("https://www.federalregister.gov/api/v1/articles/" + fr_num)) 156 | doc.save() 157 | print 'Succeeded with %s via FR citation' % doc.id 158 | continue 159 | except: 160 | fr_cite = None 161 | fr_num = None 162 | else: 163 | fr_cite = None 164 | 165 | if not fr_num and not fr_cite: 166 | # does it have a PDF copy of the Federal Register version of the thing? 
167 | views = None 168 | att = [a for a in doc.attachments if 'Federal Register' in a.title] 169 | if att: 170 | views = [v for v in att[0].views if v.type == 'pdf'] 171 | 172 | if not views: 173 | views = [v for v in doc.views if v.type == 'xpdf'] 174 | 175 | if views: 176 | fr_cite = fr_citation_from_view(views[0]) 177 | 178 | if fr_cite: 179 | print doc.id, 'FR cite (by PDF)', fr_cite 180 | frc += 1 181 | 182 | fr_num = fr_num_from_cite(fr_cite, doc.title) 183 | if fr_num: 184 | # try again 185 | try: 186 | doc.annotations['fr_data'] = json.load(urllib2.urlopen("https://www.federalregister.gov/api/v1/articles/" + fr_num)) 187 | doc.save() 188 | print 'Succeeded with %s via FR citation (PDF)' % doc.id 189 | continue 190 | except: 191 | fr_cite = None 192 | fr_num = None 193 | else: 194 | fr_cite = None 195 | 196 | else: 197 | # last chance -- we guess from the title alone 198 | fr_num = guess_fr_num(doc) 199 | if fr_num: 200 | # try again 201 | try: 202 | doc.annotations['fr_data'] = json.load(urllib2.urlopen("https://www.federalregister.gov/api/v1/articles/" + fr_num)) 203 | doc.save() 204 | g += 1 205 | print 'Succeeded with %s via title guessing' % doc.id 206 | continue 207 | except: 208 | fr_cite = None 209 | fr_num = None 210 | 211 | if not fr_num and not fr_cite: 212 | # we failed :/ 213 | doc.annotations['fr_data'] = None 214 | doc.save() 215 | print doc.id, 'No dice' 216 | nd += 1 217 | print frn, frc, g, nd -------------------------------------------------------------------------------- /regscrape/regs_common/commands/create_dockets.py: -------------------------------------------------------------------------------- 1 | from optparse import OptionParser 2 | arg_parser = OptionParser() 3 | arg_parser.add_option("-a", "--agency", dest="agency", action="store", type="string", default=None, help="Specify an agency to which to limit the dump.") 4 | arg_parser.add_option("-d", "--docket", dest="docket", action="store", type="string", default=None, help="Specify a docket to which to limit the dump.") 5 | 6 | def run(options, args): 7 | import regs_models as models 8 | from collections import defaultdict 9 | 10 | db = models.Docket._get_db() 11 | new = 0 12 | 13 | print 'Starting docket query...' 14 | 15 | conditions = {} 16 | if options.agency: 17 | conditions['agency'] = options.agency 18 | if options.docket: 19 | conditions['id'] = options.docket 20 | 21 | # there's no way to do this aggregation without a map-reduce in Mongo 2.0, so do it on the Python side for now 22 | # once 2.2 is final, this can trivially be replaced with a $group + $addToSet pipeline using the new aggregation framework 23 | dockets = defaultdict(set) 24 | for doc in db.docs.find(conditions, fields=['docket_id', 'agency']): 25 | if 'docket_id' not in doc: 26 | continue 27 | dockets[doc['docket_id']].add(doc['agency']) 28 | 29 | for docket_id, agencies in dockets.iteritems(): 30 | if docket_id: 31 | agency = list(agencies)[0] if len(agencies) == 1 else sorted(agencies, key=lambda a: docket_id.startswith(a), reverse=True)[0] 32 | try: 33 | docket = models.Docket(id=docket_id, agency=agency) 34 | docket.save(force_insert=True) 35 | new += 1 36 | except: 37 | # we already have this one 38 | pass 39 | 40 | total = len(dockets.keys()) 41 | print 'Iterated over %s dockets, of which %s were new.' 
% (total, new) 42 | 43 | return {'total': total, 'new': new} 44 | -------------------------------------------------------------------------------- /regscrape/regs_common/commands/create_entities.py: -------------------------------------------------------------------------------- 1 | GEVENT = False 2 | 3 | from regs_models import * 4 | 5 | from optparse import OptionParser 6 | arg_parser = OptionParser() 7 | arg_parser.add_option("-U", "--update", dest="update", action="store_true", default=False, help="Check if entities already existing before creating (slower)") 8 | 9 | def run(options, args): 10 | import settings, regs_common 11 | import os 12 | 13 | # if we're updating 14 | if options.update: 15 | print "Preparing to update existing entities; retrieving current entity list..." 16 | current = set((e.id for e in Entity.objects())) 17 | print "Entities retrieved." 18 | else: 19 | print "Constructing new entity list." 20 | 21 | # grab a dictionary 22 | word_file = getattr(settings, 'WORD_FILE', '/usr/share/dict/words') 23 | name_file = os.path.join(os.path.abspath(os.path.dirname(regs_common.__file__)), "data", "names.dat") 24 | 25 | # filtered_words is a set of English words, plus common first and last names, and single letters 26 | filtered_words = set((word.strip() for word in open(word_file, 'r') if word and word[0] == word[0].lower())) 27 | filtered_words.update((name.strip().lower() for name in open(name_file, 'r') if name.strip() and not name.startswith('#'))) 28 | filtered_words.update(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']) 29 | 30 | 31 | from influenceexplorer import InfluenceExplorer 32 | api = InfluenceExplorer(settings.API_KEY, getattr(settings, 'AGGREGATES_API_BASE_URL', "http://transparencydata.com/api/1.0/")) 33 | 34 | entities = [] 35 | for type in ['individual', 'organization', 'politician']: 36 | count = api.entities.count(type) 37 | for i in range(0, count, 10000): 38 | entities.extend(api.entities.list(i, i + 10000, type)) 39 | 40 | from oxtail.matching.normalize import normalize_list 41 | for entity in entities: 42 | record = { 43 | 'id': entity['id'], 44 | 'td_type': entity['type'], 45 | 'td_name': entity['name'], 46 | 'aliases': [name.strip() for name in normalize_list([entity['name']] + entity['aliases'], entity['type'])] 47 | } 48 | record['filtered_aliases'] = [alias for alias in record['aliases'] if alias.lower() not in filtered_words] 49 | 50 | if options.update and record['id'] in current: 51 | Entity.objects(id=record['id']) \ 52 | .update(safe_update=True, set__td_type=record['td_type'], set__td_name=record['td_name'], set__aliases=record['aliases'], set__filtered_aliases=record['filtered_aliases']) 53 | print "Updated %s as existing record %s" % (record['aliases'][0], record['id']) 54 | current.remove(record['id']) 55 | else: 56 | db_entity = Entity(**record) 57 | db_entity.save() 58 | print "Saved %s as new record %s" % (record['aliases'][0], record['id']) 59 | 60 | if options.update: 61 | print "Deleting %s no-longer-existing records..." 
% len(current) 62 | db = Entity._get_db() 63 | db.entities.remove({'_id': {'$in': list(current)}}) 64 | -------------------------------------------------------------------------------- /regscrape/regs_common/commands/export_text.py: -------------------------------------------------------------------------------- 1 | from optparse import OptionParser 2 | arg_parser = OptionParser() 3 | arg_parser.add_option("-a", "--agency", dest="agency", action="store", type="string", default=None, help="Filter to only one agency. Default to all agencies if not specified.") 4 | arg_parser.add_option("-d", "--docket", dest="docket", action="store", type="string", default=None, help="Filter to only one docket. Default to all dockets if not specified.") 5 | 6 | def ensure_directory(directory): 7 | if not os.path.exists(directory): 8 | os.mkdir(directory) 9 | 10 | def extract(record, keys): 11 | out = {} 12 | for key in keys: 13 | if key in record and record[key]: 14 | out[key] = record[key] 15 | return out 16 | 17 | dthandler = lambda obj: obj.isoformat() if isinstance(obj, datetime.datetime) else None 18 | 19 | def run(options, args): 20 | global os 21 | import settings 22 | import datetime 23 | import os 24 | import pymongo 25 | import itertools 26 | import json 27 | from regs_common.util import get_db 28 | import zipfile 29 | import sys 30 | 31 | print 'Starting dump...' 32 | 33 | query = {'scraped': True, 'deleted': False} 34 | 35 | if options.docket: 36 | query['docket_id'] = options.docket 37 | if options.agency: 38 | query['agency'] = options.agency 39 | print query 40 | 41 | db = get_db() 42 | 43 | export_dir = os.path.join(settings.DATA_DIR, 'bulk', 'regulations-%s' % str(datetime.datetime.now().date())) 44 | ensure_directory(export_dir) 45 | 46 | for agency, agency_docs in itertools.groupby(db.docs.find(query, sort=[('document_id', pymongo.ASCENDING)]), key=lambda d: d['agency']): 47 | print 'Starting agency %s...' % agency 48 | agency_dir = os.path.join(export_dir, agency) 49 | ensure_directory(agency_dir) 50 | 51 | for docket, docket_docs in itertools.groupby(agency_docs, key=lambda d: d['docket_id']): 52 | print 'Starting docket %s...' 
% docket 53 | zip_path = os.path.join(agency_dir, '%s.zip' % docket) 54 | 55 | with zipfile.ZipFile(zip_path, 'a', zipfile.ZIP_DEFLATED, True) as docket_zip: 56 | docket_record = list(db.dockets.find({'_id': docket})) 57 | 58 | if docket_record: 59 | docket_zip.writestr( 60 | 'metadata.json', 61 | json.dumps( 62 | extract( 63 | docket_record[0], 64 | ['docket_id', 'title', 'agency', 'rin', 'details', 'year'] 65 | ), 66 | default=dthandler 67 | ) 68 | ) 69 | 70 | for doc in docket_docs: 71 | files = [] 72 | 73 | views = [('view', view) for view in doc['views']] 74 | if 'attachments' in doc: 75 | for attachment in doc['attachments']: 76 | views.extend([('attachment', view) for view in attachment['views']]) 77 | 78 | for type, view in views: 79 | file = {'url': view['url']} 80 | if view['extracted'] == True: 81 | filename = '%s_%s.txt' % (type, view['file'].split('/')[-1].replace('.', '_')) 82 | file['filename'] = filename 83 | 84 | docket_zip.writestr(os.path.join(doc['document_id'], filename), view['text'].encode('utf8')) 85 | 86 | files.append(file) 87 | 88 | metadata = extract( 89 | doc, 90 | ['document_id', 'title', 'agency', 'docket_id', 'type', 'topics', 'details', 'comment_on', 'rin'] 91 | ) 92 | metadata['files'] = files 93 | 94 | docket_zip.writestr(os.path.join(doc['document_id'], 'metadata.json'), json.dumps(metadata, default=dthandler)) 95 | -------------------------------------------------------------------------------- /regscrape/regs_common/commands/extract.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | GEVENT = False 4 | 5 | from regs_common.exceptions import * 6 | from optparse import OptionParser 7 | 8 | # arguments 9 | arg_parser = OptionParser() 10 | arg_parser.add_option("-t", "--type", action="store", dest="type", default=None) 11 | arg_parser.add_option("-a", "--agency", dest="agency", action="store", type="string", default=None, help="Specify an agency to which to limit the dump.") 12 | arg_parser.add_option("-d", "--docket", dest="docket", action="store", type="string", default=None, help="Specify a docket to which to limit the dump.") 13 | 14 | # runner 15 | def run(options, args): 16 | global Pool, sys, settings, subprocess, os, urlparse, json, regs_common, pymongo, mp_bulk_extract 17 | from regs_common.processing import find_views, update_view, find_attachment_views, update_attachment_view 18 | from regs_common.extraction import mp_bulk_extract 19 | from gevent.pool import Pool 20 | import sys 21 | import settings 22 | import subprocess, os, urlparse, json 23 | import pymongo 24 | 25 | return { 26 | 'document_views': run_for_view_type('document views', find_views, update_view, options), 27 | 'attachment_views': run_for_view_type('attachment views', find_attachment_views, update_attachment_view, options) 28 | } 29 | 30 | def run_for_view_type(view_label, find_func, update_func, options): 31 | print 'Preparing text extraction of %s.' 
% view_label 32 | 33 | query = {'deleted': False} 34 | if options.agency: 35 | query['agency'] = options.agency 36 | if options.docket: 37 | query['docket_id'] = options.docket 38 | 39 | find_conditions = { 40 | 'downloaded': "yes", 41 | 'extracted': "no", 42 | 'query': query 43 | } 44 | if options.type: 45 | find_conditions['type'] = options.type 46 | 47 | # track stats -- no locks because yay for cooperative multitasking 48 | stats = {'extracted': 0, 'failed': 0} 49 | 50 | views = find_func(**find_conditions) 51 | 52 | # same yucky hack as in downloads 53 | v_array = [views] 54 | def extract_generator(): 55 | while True: 56 | try: 57 | result = v_array[0].next() 58 | yield (result['view'].file_path, None, result) 59 | except pymongo.errors.OperationFailure: 60 | # occasionally pymongo seems to lose track of the cursor for some reason, so reset the query 61 | v_array[0] = find_func(**find_conditions) 62 | continue 63 | except StopIteration: 64 | break 65 | 66 | def status_func(status, text, filename, filetype, output_type, used_ocr, result): 67 | if status[0]: 68 | result['view'].extracted = "yes" 69 | 70 | result['view'].content.new_file() 71 | result['view'].content.content_type = 'text/plain' 72 | result['view'].content.write(text.encode('utf-8')) 73 | result['view'].content.close() 74 | 75 | result['view'].mode = output_type 76 | result['view'].ocr = used_ocr 77 | try: 78 | update_func(**result) 79 | print 'Extracted and saved text from %s' % filename 80 | stats['extracted'] += 1 81 | except (pymongo.errors.OperationFailure, pymongo.errors.InvalidDocument): 82 | print 'Extracted text from %s but failed to save due to oversized document.' % filename 83 | stats['failed'] += 1 84 | 85 | if not 'oversized' in stats: 86 | stats['oversized'] = [] 87 | stats['oversized'].append(result['view'].url()) 88 | else: 89 | result['view'].extracted = 'failed_no_extractor' if 'no extractor' in status[1] else 'failed_extraction' 90 | update_func(**result) 91 | print 'Saved failure to decode %s' % result['view'].file_path 92 | stats['failed'] += 1 93 | update_func(**result) 94 | 95 | mp_bulk_extract(extract_generator(), status_func, verbose=True) 96 | 97 | print 'Done with %s.' % view_label 98 | 99 | return stats 100 | 101 | if __name__ == "__main__": 102 | run() 103 | -------------------------------------------------------------------------------- /regscrape/regs_common/commands/mark_searchable_entities.py: -------------------------------------------------------------------------------- 1 | GEVENT = False 2 | 3 | def run(): 4 | from regs_models import Entity 5 | 6 | print "Updating entity search index..." 7 | 8 | # mark the ones that should be searchable but aren't as searchable 9 | Entity.objects(__raw__={ 10 | 'td_type': 'organization', 11 | 'stats.count': {'$gt': 0}, 12 | 'searchable': False 13 | }).update(set__searchable=True, multi=True) 14 | 15 | # mark the ones that are searchable but shouldn't be unsearchable 16 | Entity.objects(__raw__={ 17 | '$or': [ 18 | {'td_type': {'$ne': 'organization'}}, 19 | {'stats.count': {'$not': {'$gt': 0}}} 20 | ], 21 | 'searchable': True 22 | }).update(set__searchable=False, multi=True) 23 | 24 | print "Update complete." 
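# A minimal sketch of the invariant the two updates above enforce: for every entity,
#
#     searchable == (td_type == 'organization' and stats.count > 0)
#
# so only organizations with at least one recorded match in their stats (presumably
# populated by the run_aggregates step) are exposed to entity search. A rough
# raw-pymongo equivalent of the first update, assuming the collection is named
# 'entities' as in create_entities.py:
#
#     Entity._get_db().entities.update(
#         {'td_type': 'organization', 'stats.count': {'$gt': 0}, 'searchable': False},
#         {'$set': {'searchable': True}},
#         multi=True)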
-------------------------------------------------------------------------------- /regscrape/regs_common/commands/match_text.py: -------------------------------------------------------------------------------- 1 | GEVENT = False 2 | 3 | import zlib 4 | import datetime 5 | import settings 6 | 7 | import pymongo 8 | import traceback 9 | import os 10 | import re 11 | import multiprocessing 12 | from Queue import Empty 13 | from regs_models import * 14 | 15 | from oxtail.matching import match 16 | 17 | # arguments 18 | from optparse import OptionParser 19 | arg_parser = OptionParser() 20 | arg_parser.add_option("-a", "--agency", dest="agency", action="store", type="string", default=None, help="Specify an agency to which to limit the dump.") 21 | arg_parser.add_option("-d", "--docket", dest="docket", action="store", type="string", default=None, help="Specify a docket to which to limit the dump.") 22 | arg_parser.add_option("-A", "--all", dest="process_all", action="store_true", default=False, help="Force a re-extraction of all documents in the system.") 23 | arg_parser.add_option("-m", "--multi", dest="multi", action="store", type="int", default=multiprocessing.cpu_count(), help="Set number of worker processes. Defaults to number of cores if not specified.") 24 | 25 | # regex to find titles that are likely to have submitter names 26 | NAME_FINDER = re.compile(r"^(public )?(comment|submission)s? (by|from) (?P<name>.*)$", re.I) 27 | 28 | def get_text(view): 29 | if not view.content: 30 | return '' 31 | 32 | return view.content.read() 33 | 34 | def process_doc(doc): 35 | # entity extraction 36 | for view in doc.views: 37 | if view.extracted == 'yes': 38 | view_matches = match(get_text(view), multiple=True) 39 | view.entities = list(view_matches.keys()) if view_matches else [] 40 | 41 | for attachment in doc.attachments: 42 | for view in attachment.views: 43 | if view.extracted == 'yes': 44 | view_matches = match(get_text(view), multiple=True) 45 | view.entities = list(view_matches.keys()) if view_matches else [] 46 | 47 | # submitter matches 48 | # check if there's submitter stuff in the title 49 | title_match = NAME_FINDER.match(doc.title) 50 | 51 | # next check details, which is where most title stuff lives 52 | details = doc.details 53 | # stick "XXXX" between tokens because it doesn't occur in entity names 54 | submitter_matches = match(' XXXX '.join([ 55 | # organization 56 | details.get('Organization_Name', ''), 57 | 58 | # submitter name 59 | ' '.join( 60 | filter(bool, [details.get('First_Name', ''), details.get('Last_Name', '')]) 61 | ), 62 | 63 | # submitter representative 64 | details.get('Submitter_s_Representative', ''), 65 | 66 | # title_match if we found one 67 | title_match.groupdict()['name'] if title_match else '', 68 | 69 | # just examine the whole title if it's from SEC or CFTC; the title is basically always submitter info 70 | doc.title if doc.source == 'sec_cftc' and doc.type in ('public_submission', 'other') else '' 71 | ])) 72 | doc.submitter_entities = list(submitter_matches.keys()) if submitter_matches else [] 73 | 74 | doc.entities_last_extracted = datetime.datetime.now() 75 | 76 | doc.save() 77 | 78 | return True 79 | 80 | def process_worker(todo_queue): 81 | pid = os.getpid() 82 | print '[%s] Worker started.' % pid 83 | while True: 84 | try: 85 | doc = Doc._from_son(todo_queue.get()) 86 | except Empty: 87 | print '[%s] Processing complete.' % pid 88 | return 89 | 90 | try: 91 | doc_success = process_doc(doc) 92 | print '[%s] Processing of doc %s succeeded.' 
% (pid, doc.id) 93 | except: 94 | print '[%s] Processing of doc %s failed.' % (pid, doc.id) 95 | traceback.print_exc() 96 | 97 | todo_queue.task_done() 98 | 99 | def run(options, args): 100 | from regs_common.entities import load_trie_from_mongo 101 | import time 102 | 103 | pid = os.getpid() 104 | 105 | # load trie from the mongo database 106 | import_start = time.time() 107 | print '[%s] Loading trie...' % pid 108 | load_trie_from_mongo() 109 | print '[%s] Loaded trie in %s seconds.' % (pid, time.time() - import_start) 110 | 111 | query = {'deleted': False, 'scraped': 'yes', '$nor': [{'views.extracted': 'no'},{'attachments.views.extracted':'no'}]} 112 | if options.agency: 113 | query['agency'] = options.agency 114 | if options.docket: 115 | query['docket_id'] = options.docket 116 | if not options.process_all: 117 | query['entities_last_extracted'] = {'$exists': False} 118 | 119 | cursor = Doc.objects(__raw__=query) 120 | 121 | run_start = time.time() 122 | print '[%s] Starting analysis...' % pid 123 | 124 | num_workers = options.multi 125 | 126 | todo_queue = multiprocessing.JoinableQueue(num_workers * 3) 127 | 128 | processes = [] 129 | for i in range(num_workers): 130 | proc = multiprocessing.Process(target=process_worker, args=(todo_queue,)) 131 | proc.start() 132 | processes.append(proc) 133 | 134 | for doc in cursor: 135 | todo_queue.put(doc.to_mongo()) 136 | 137 | todo_queue.join() 138 | 139 | for proc in processes: 140 | print 'Terminating worker %s...' % proc.pid 141 | proc.terminate() 142 | 143 | print '[%s] Completed analysis in %s seconds.' % (pid, time.time() - run_start) 144 | 145 | -------------------------------------------------------------------------------- /regscrape/regs_common/commands/reset_downloads.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | def run(): 4 | global os, settings 5 | from regs_common.processing import find_views, update_view, find_attachment_views, update_attachment_view 6 | import os 7 | import settings 8 | 9 | run_for_view_type('document views', find_views, update_view) 10 | run_for_view_type('attachment views', find_attachment_views, update_attachment_view) 11 | 12 | def run_for_view_type(view_label, find_func, update_func): 13 | print 'Resetting %s.' % view_label 14 | views = find_func(downloaded='failed', query={'deleted': False}) 15 | 16 | for result in views: 17 | result['view'].downloaded = 'no' 18 | update_func(**result) 19 | 20 | print 'Done with %s.' % view_label 21 | 22 | if __name__ == "__main__": 23 | run() -------------------------------------------------------------------------------- /regscrape/regs_common/commands/reset_extraction.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | def run(): 4 | global os, settings 5 | from regs_common.processing import find_views, update_view, find_attachment_views, update_attachment_view 6 | import os 7 | import settings 8 | 9 | run_for_view_type('document views', find_views, update_view) 10 | run_for_view_type('attachment views', find_attachment_views, update_attachment_view) 11 | 12 | def run_for_view_type(view_label, find_func, update_func): 13 | print 'Resetting %s.' % view_label 14 | views = find_func(extracted='failed', query={'deleted': False}) 15 | 16 | for result in views: 17 | result['view'].extracted = 'no' 18 | update_func(**result) 19 | 20 | print 'Done with %s.' 
% view_label 21 | 22 | if __name__ == "__main__": 23 | run() -------------------------------------------------------------------------------- /regscrape/regs_common/commands/run_aggregates.py: -------------------------------------------------------------------------------- 1 | GEVENT = False 2 | 3 | import multiprocessing 4 | 5 | from optparse import OptionParser 6 | arg_parser = OptionParser() 7 | arg_parser.add_option("-a", "--agency", dest="agency", action="store", type="string", default=None, help="Specify an agency to which to limit the dump.") 8 | arg_parser.add_option("-d", "--docket", dest="docket", action="store", type="string", default=None, help="Specify a docket to which to limit the dump.") 9 | arg_parser.add_option("-A", "--all", dest="process_all", action="store_true", default=False, help="Replace existing MR data with new data.") 10 | arg_parser.add_option("-p", "--pretend", dest="pretend", action="store_true", default=False, help="Don't actually write anything to the database.") 11 | arg_parser.add_option("-n", "--no-children", dest="no_children", action="store_true", default=False, help="Don't spawn child processes.") 12 | arg_parser.add_option("-r", "--resume", dest="resume_db", action="store", type="string", default=None, help="Resume a previous aggregation task (HERE BE DRAGONS)") 13 | arg_parser.add_option("-m", "--multi", dest="multi", action="store", type="int", default=multiprocessing.cpu_count(), help="Set number of worker processes. Defaults to number of cores if not specified.") 14 | 15 | def run_client(): 16 | from mincemeat import Client, DEFAULT_PORT 17 | import time 18 | import socket 19 | import os 20 | 21 | print "[%s] Starting worker" % os.getpid() 22 | while True: 23 | time.sleep(2) 24 | try: 25 | client = Client() 26 | client.password = "" 27 | client.conn('localhost', DEFAULT_PORT) 28 | return 29 | except socket.error as v: 30 | if v.errno == 54: 31 | print "[%s] Caught a socket error 54; resetting worker" % os.getpid() 32 | else: 33 | print "[%s] Caught a socket error %s; giving up" % (os.getpid(), v.errno) 34 | return 35 | 36 | def run(options, args): 37 | print 'Running aggregates...' 38 | 39 | num_workers = options.multi 40 | 41 | pool = multiprocessing.Pool(num_workers) 42 | 43 | if not options.no_children: 44 | for i in range(num_workers): 45 | pool.apply_async(run_client) 46 | 47 | from aggregates import run_aggregates 48 | run_aggregates(options) 49 | 50 | pool.terminate() 51 | 52 | print "Aggregates complete." 
53 | 54 | return {'success': True} 55 | -------------------------------------------------------------------------------- /regscrape/regs_common/commands/runner.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys, optparse, json, settings 4 | 5 | def run_command(): 6 | if len(sys.argv) < 2: 7 | print 'Usage: ./run.py <command>' 8 | sys.exit() 9 | command = sys.argv[1] 10 | 11 | if command.endswith('.py'): 12 | mod_name = command.split('/').pop().rsplit('.', 1)[0] 13 | import imp 14 | try: 15 | mod = imp.load_source(mod_name, command) 16 | except ImportError: 17 | print 'Could not load custom command: %s' % command 18 | sys.exit() 19 | else: 20 | imported = False 21 | for lib in ['regs_common'] + settings.SITES: 22 | try: 23 | parent_mod = __import__('%s.commands' % lib, fromlist=[command]) 24 | mod = getattr(parent_mod, command) 25 | imported = True 26 | break 27 | except ImportError: 28 | pass 29 | except AttributeError: 30 | pass 31 | if not imported: 32 | print 'No such command: %s' % command 33 | sys.exit() 34 | 35 | if getattr(mod, 'GEVENT', True): 36 | from gevent.monkey import patch_all 37 | patch_all() 38 | 39 | run = getattr(mod, 'run', False) 40 | if not run or not callable(run): 41 | print 'Command %s is not runnable' % command 42 | sys.exit() 43 | 44 | parser = getattr(mod, 'arg_parser', None) 45 | parser_defined = parser is not None 46 | 47 | if not parser: 48 | parser = optparse.OptionParser() 49 | parser.add_option('--parsable', dest='parsable', action='store_true', default=False, help='Output JSON instead of human-readable messages.') 50 | parse_results = parser.parse_args(sys.argv[2:]) 51 | 52 | dev_null = open('/dev/null', 'w') 53 | if parse_results[0].parsable: 54 | # disable standard output by monkey-patching sys.stdout 55 | real_stdout = sys.stdout 56 | sys.stdout = dev_null 57 | 58 | from regs_common.util import bootstrap_settings 59 | bootstrap_settings() 60 | 61 | out = run(*(parse_results if parser_defined else [])) 62 | 63 | if parse_results[0].parsable: 64 | # turn stdout back on so we can print output 65 | sys.stdout = real_stdout 66 | 67 | if out: 68 | print json.dumps(out) 69 | 70 | # no matter what, nuke stderr on exit so we can avoid that stupid gevent thing 71 | sys.stderr = dev_null 72 | -------------------------------------------------------------------------------- /regscrape/regs_common/data/es_mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "docket": { 3 | "properties": { 4 | "_id": { 5 | "type": "string", 6 | "index": "not_analyzed" 7 | }, 8 | "title": { 9 | "type": "string" 10 | }, 11 | "agency": { 12 | "type": "string", 13 | "index": "not_analyzed" 14 | }, 15 | "identifiers": { 16 | "type": "string", 17 | "index": "not_analyzed" 18 | } 19 | } 20 | }, 21 | "document": { 22 | "_parent": { 23 | "type": "docket" 24 | }, 25 | "properties": { 26 | "_id": { 27 | "type": "string", 28 | "index": "not_analyzed" 29 | }, 30 | "title": { 31 | "type": "string" 32 | }, 33 | "docket_id": { 34 | "type": "string", 35 | "index": "not_analyzed" 36 | }, 37 | "agency": { 38 | "type": "string", 39 | "index": "not_analyzed" 40 | }, 41 | "comment_on": { 42 | "type": "string", 43 | "index": "not_analyzed" 44 | }, 45 | "posted_date": { 46 | "type": "date" 47 | }, 48 | "document_type": { 49 | "type": "string", 50 | "index": "not_analyzed" 51 | }, 52 | "submitter_organization": { 53 | "type": "string" 54 | }, 55 | "submitter_name": { 56 | "type": 
"string" 57 | }, 58 | "submitter_entities": { 59 | "type": "string", 60 | "index": "not_analyzed" 61 | }, 62 | "analyses": { 63 | "type": "string", 64 | "index": "not_analyzed" 65 | }, 66 | "identifiers": { 67 | "type": "string", 68 | "index": "not_analyzed" 69 | }, 70 | "files": { 71 | "properties": { 72 | "title": { 73 | "type": "string" 74 | }, 75 | "abstract": { 76 | "type": "string" 77 | }, 78 | "text": { 79 | "type": "string", 80 | "term_vector": "with_positions_offsets" 81 | }, 82 | "object_id": { 83 | "type": "string", 84 | "index": "not_analyzed" 85 | }, 86 | "file_type": { 87 | "type": "string", 88 | "index": "not_analyzed" 89 | }, 90 | "view_type": { 91 | "type": "string", 92 | "index": "not_analyzed" 93 | }, 94 | "entities": { 95 | "type": "string", 96 | "index": "not_analyzed" 97 | } 98 | } 99 | } 100 | } 101 | } 102 | } -------------------------------------------------------------------------------- /regscrape/regs_common/data_import.py: -------------------------------------------------------------------------------- 1 | import pymongo 2 | import gridfs 3 | import settings 4 | 5 | def copy_data(source_db_name, dest_db_name, query): 6 | source = pymongo.Connection(**settings.DB_SETTINGS)[source_db_name] 7 | dest = pymongo.Connection(**settings.DB_SETTINGS)[dest_db_name] 8 | 9 | source_gridfs = gridfs.GridFS(source, collection='files') 10 | dest_gridfs = gridfs.GridFS(dest, collection='files') 11 | 12 | for doc in source.docs.find(query): 13 | print 'Copying doc %s...' % doc['_id'] 14 | 15 | # flip some flags 16 | #doc['stats'] = {} 17 | doc['in_aggregates'] = False 18 | doc['in_cluster_db'] = False 19 | doc['in_search_index'] = False 20 | 21 | dest.docs.save(doc) 22 | 23 | file_ids = [] 24 | for view in doc.get('views', []): 25 | if view.get('content', None): 26 | file_ids.append(view['content']) 27 | 28 | for attachment in doc.get('attachments', []): 29 | for view in attachment.get('views', []): 30 | if view.get('content', None): 31 | file_ids.append(view['content']) 32 | 33 | for fid in file_ids: 34 | print "Copying file %s" % fid 35 | 36 | # delete out of the dest in case it's already there 37 | dest_gridfs.delete(fid) 38 | 39 | # then read out from the old one 40 | fdata = source_gridfs.get(fid).read() 41 | 42 | # ... and write to the new one 43 | dest_gridfs.put(fdata, _id=fid) 44 | 45 | print "Done." 46 | 47 | dkt_query = dict(query) 48 | if "docket_id" in dkt_query: 49 | dkt_query['_id'] = dkt_query['docket_id'] 50 | del dkt_query['docket_id'] 51 | 52 | for dkt in source.dockets.find(dkt_query): 53 | print 'Copying docket %s...' % dkt['_id'] 54 | 55 | # flip some flags 56 | #dkt['stats'] = {} 57 | dkt['in_search_index'] = False 58 | 59 | if 'source' not in dkt: 60 | dkt['source'] = 'regulations.gov' 61 | 62 | dest.dockets.save(dkt) 63 | 64 | print "Done." 
-------------------------------------------------------------------------------- /regscrape/regs_common/entities.py: -------------------------------------------------------------------------------- 1 | def all_aliases(): 2 | import itertools 3 | from regs_common.util import get_db 4 | db = get_db() 5 | 6 | return itertools.chain.from_iterable( 7 | itertools.imap( 8 | lambda entity: [(alias, entity['_id']) for alias in entity.get('filtered_aliases', [])], 9 | db.entities.find() 10 | ) 11 | ) 12 | 13 | def load_trie_from_mongo(): 14 | from oxtail import matching 15 | 16 | matching._entity_trie = matching.build_token_trie( 17 | all_aliases(), 18 | matching._blacklist 19 | ) -------------------------------------------------------------------------------- /regscrape/regs_common/exceptions.py: -------------------------------------------------------------------------------- 1 | class ExtractionFailed(Exception): 2 | pass 3 | 4 | class DoesNotExist(Exception): 5 | pass 6 | 7 | class ChildTimeout(Exception): 8 | pass 9 | 10 | class RateLimitException(Exception): 11 | pass -------------------------------------------------------------------------------- /regscrape/regs_common/extraction.py: -------------------------------------------------------------------------------- 1 | from regs_common.processing import * 2 | import subprocess 3 | import settings 4 | 5 | EXTRACTORS = { 6 | 'xml': [ 7 | binary_extractor('cat', error='The document does not have a content file of type', output_type="html"), 8 | binary_extractor('html2text', error='The document does not have a content file of type') 9 | ], 10 | 11 | 'pdf': [ 12 | binary_extractor(['pdftohtml', '-noframes', '-i', '-stdout'], error='PDF file is damaged', output_type="html"), 13 | binary_extractor('pdftotext', append=['-'], error='PDF file is damaged'), 14 | ], 15 | 16 | 'msw8': [ 17 | binary_extractor('antiword', error='is not a Word Document'), 18 | binary_extractor('catdoc', error='The document does not have a content file of type') # not really an error, but catdoc happily regurgitates whatever you throw at it 19 | ], 20 | 21 | 'rtf': [ 22 | binary_extractor('unrtf', error='Warning: No stack to get attribute from', output_type="html"), 23 | binary_extractor('catdoc', error='The document does not have a content file of type') # not really an error, as above 24 | ], 25 | 26 | 'txt': [ 27 | binary_extractor('cat', error='The document does not have a content file of type') # not really an error, as above 28 | ], 29 | 30 | 'msw12': [ 31 | script_extractor('extract_docx.py', error='Failed to decode file') 32 | ], 33 | 34 | 'wp8': [ 35 | binary_extractor('wpd2text', error='ERROR') 36 | ], 37 | } 38 | 39 | EXTRACTORS['crtext'] = EXTRACTORS['xml'] 40 | EXTRACTORS['html'] = EXTRACTORS['xml'] 41 | EXTRACTORS['msw6'] = EXTRACTORS['msw8'] 42 | EXTRACTORS['msw'] = EXTRACTORS['msw8'] 43 | EXTRACTORS['xpdf'] = EXTRACTORS['pdf'] + [pdf_ocr] 44 | 45 | # extractor factory 46 | def _get_extractor(status_func, verbose, filename, filetype=None, record=None): 47 | def extract(): 48 | local_filetype = filetype if filetype else filename.split('.')[-1] 49 | if local_filetype in EXTRACTORS: 50 | success = False 51 | error_message = None 52 | used_ocr = False 53 | output_type = "text" 54 | for extractor in EXTRACTORS[local_filetype]: 55 | try: 56 | output = extractor(filename) 57 | except ExtractionFailed as failure: 58 | reason = str(failure) 59 | error_message = 'Failed to extract from %s using %s%s' % ( 60 | filename, 61 | extractor.__str__(), 62 | ' %s' % reason if reason 
else '' 63 | ) 64 | if verbose: print error_message 65 | continue 66 | except ChildTimeout as failure: 67 | error_message = 'Failed extracting from %s using %s due to timeout' % ( 68 | filename, 69 | extractor.__str__() 70 | ) 71 | if verbose: print error_message 72 | continue 73 | 74 | success = True 75 | text = unicode(remove_control_chars(output), 'utf-8', 'ignore') 76 | used_ocr = getattr(extractor, 'ocr', False) 77 | output_type = getattr(extractor, 'output_type', 'text') 78 | if verbose: print 'Extracted text from %s using %s' % ( 79 | filename, 80 | extractor.__str__() 81 | ) 82 | 83 | break 84 | 85 | status_func( 86 | (success, error_message), 87 | text if success else None, 88 | filename, 89 | local_filetype, 90 | output_type, 91 | used_ocr, 92 | record 93 | ) 94 | else: 95 | status_func( 96 | (False, "no extractor for type %s" % local_filetype), 97 | None, 98 | filename, 99 | local_filetype, 100 | "text", 101 | False, 102 | record 103 | ) 104 | return extract 105 | 106 | def bulk_extract(extract_iterable, status_func=None, verbose=False): 107 | from gevent.pool import Pool 108 | workers = Pool(getattr(settings, 'EXTRACTORS', 2)) 109 | 110 | # keep the extractors busy with tasks as long as there are more results 111 | for extract_record in extract_iterable: 112 | workers.spawn(_get_extractor(status_func, verbose, *extract_record)) 113 | 114 | workers.join() 115 | 116 | return 117 | 118 | def mp_bulk_extract(extract_iterable, status_func=None, verbose=False): 119 | import multiprocessing 120 | from Queue import Empty 121 | 122 | num_workers = getattr(settings, 'EXTRACTORS', multiprocessing.cpu_count()) 123 | todo_queue = multiprocessing.JoinableQueue(num_workers * 3) 124 | 125 | def worker(todo_queue): 126 | while True: 127 | try: 128 | extract_record = todo_queue.get() 129 | except Empty: 130 | return 131 | 132 | _get_extractor(status_func, verbose, *extract_record)() 133 | todo_queue.task_done() 134 | 135 | processes = [] 136 | for i in range(num_workers): 137 | proc = multiprocessing.Process(target=worker, args=(todo_queue,)) 138 | proc.start() 139 | processes.append(proc) 140 | 141 | for extract_record in extract_iterable: 142 | todo_queue.put(extract_record) 143 | 144 | todo_queue.join() 145 | 146 | for proc in processes: 147 | proc.terminate() 148 | 149 | return 150 | 151 | def serial_bulk_extract(extract_iterable, status_func=None, verbose=False): 152 | import subprocess 153 | 154 | for extract_record in extract_iterable: 155 | _get_extractor(status_func, verbose, *extract_record)() 156 | 157 | return -------------------------------------------------------------------------------- /regscrape/regs_common/gevent_mongo.py: -------------------------------------------------------------------------------- 1 | __author__ = "Andrey Nikishaev" 2 | __email__ = "creotiv@gmail.com" 3 | 4 | import pymongo, sys, os  # os.getpid() is used by the pool methods below 5 | from gevent.queue import Queue 6 | 7 | class GeventMongoPool(object): 8 | """ 9 | Rewritten connection pool for working with global connections.
10 | """ 11 | 12 | # Non thread-locals 13 | __slots__ = ["sockets", "socket_factory"] 14 | sock = None 15 | 16 | def __init__(self, socket_factory): 17 | self.socket_factory = socket_factory 18 | if not hasattr(self, "sockets"): 19 | self.sockets = [] 20 | 21 | def socket(self): 22 | # we store the pid here to avoid issues with fork / 23 | # multiprocessing - see 24 | # test.test_connection:TestConnection.test_fork for an example 25 | # of what could go wrong otherwise 26 | pid = os.getpid() 27 | 28 | if self.sock is not None and self.sock[0] == pid: 29 | return self.sock[1] 30 | 31 | try: 32 | self.sock = (pid, self.sockets.pop()) 33 | except IndexError: 34 | self.sock = (pid, self.socket_factory()) 35 | 36 | return self.sock[1] 37 | 38 | def return_socket(self): 39 | if self.sock is not None and self.sock[0] == os.getpid(): 40 | self.sockets.append(self.sock[1]) 41 | self.sock = None 42 | 43 | pymongo.connection.Pool = GeventMongoPool 44 | 45 | class MongoConnection(object): 46 | """Memcache pool auto-destruct connection""" 47 | def __init__(self,pool,conn): 48 | self.pool = pool 49 | self.conn = conn 50 | 51 | def getDB(self): 52 | return self.conn 53 | 54 | def __getattr__(self, name): 55 | return getattr(self.conn, name) 56 | 57 | def __getitem__(self, name): 58 | return self.conn[name] 59 | 60 | def __del__(self): 61 | self.pool.queue.put(self.conn) 62 | del self.pool 63 | del self.conn 64 | 65 | class Mongo(object): 66 | """MongoDB Pool""" 67 | def __new__(cls,db_name,size=5,*args,**kwargs): 68 | if not hasattr(cls,'_instance'): 69 | # use your own config library 70 | cls._instance = object.__new__(cls) 71 | cls._instance.queue = Queue(size) 72 | for x in xrange(size): 73 | try: 74 | # use your own config library 75 | cls._instance.queue.put( 76 | pymongo.Connection(*args,**kwargs)[db_name] 77 | ) 78 | except: 79 | sys.exc_clear() 80 | error('Can\'t connect to mongo servers') 81 | 82 | return cls._instance 83 | 84 | def get_conn(self,block=True,timeout=None): 85 | """Get Mongo connection wrapped in MongoConnection object""" 86 | obj = MongoConnection 87 | return obj(self,self.queue.get(block,timeout)) 88 | -------------------------------------------------------------------------------- /regscrape/regs_common/mp_types.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import RLock 2 | import multiprocessing.sharedctypes 3 | import ctypes 4 | 5 | class SynchronizedCounter(multiprocessing.sharedctypes.Synchronized): 6 | def increment(self, amount=1): 7 | self.acquire() 8 | try: 9 | self._obj.value += amount 10 | finally: 11 | self.release() 12 | 13 | def Counter(): 14 | value = multiprocessing.sharedctypes.RawValue(ctypes.c_uint) 15 | return SynchronizedCounter(value, RLock()) -------------------------------------------------------------------------------- /regscrape/regs_common/processing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from bson.code import Code 4 | from pymongo.errors import OperationFailure, InvalidDocument 5 | import subprocess, os, urlparse, json 6 | from gevent import Timeout 7 | from regs_models import * 8 | from exceptions import ExtractionFailed, ChildTimeout 9 | import os 10 | import re 11 | import cStringIO 12 | import time 13 | import itertools 14 | import sys 15 | import regs_common 16 | import operator 17 | import zlib 18 | import settings 19 | 20 | def find_views(**params): 21 | db = Doc._get_db() 22 | 23 | # allow for using a 
pre-filter to speed up execution 24 | kwargs = {} 25 | query = {} 26 | if 'query' in params: 27 | query = params['query'] 28 | del params['query'] 29 | 30 | # create the actual map function 31 | conditions = dict([('views.%s' % item[0], item[1]) for item in params.items()]) 32 | conditions.update(query) 33 | 34 | results = itertools.chain.from_iterable( 35 | itertools.imap( 36 | lambda doc: [{'view': View._from_son(view), 'doc': doc['_id']} for view in doc['views'] if all(item[0] in view and view[item[0]] == item[1] for item in params.items())], 37 | db.docs.find(conditions) 38 | ) 39 | ) 40 | 41 | return results 42 | 43 | def find_attachment_views(**params): 44 | db = Doc._get_db() 45 | 46 | # allow for using a pre-filter to speed up execution 47 | kwargs = {} 48 | query = {} 49 | if 'query' in params: 50 | query = params['query'] 51 | del params['query'] 52 | 53 | # create the actual map function 54 | conditions = dict([('attachments.views.%s' % item[0], item[1]) for item in params.items()]) 55 | conditions.update(query) 56 | 57 | results = itertools.chain.from_iterable( 58 | itertools.imap( 59 | lambda doc: reduce(operator.add, [ 60 | [ 61 | {'view': View._from_son(view), 'doc': doc['_id'], 'attachment': attachment['object_id']} 62 | for view in attachment['views'] if all(item[0] in view and view[item[0]] == item[1] for item in params.items()) 63 | ] for attachment in doc['attachments'] 64 | ] if 'attachments' in doc else [], []), 65 | db.docs.find(conditions) 66 | ) 67 | ) 68 | 69 | return results 70 | 71 | def update_view(doc, view): 72 | # use db object from thread pool 73 | db = Doc._get_db() 74 | 75 | # can't figure out a way to do this atomically because of bug SERVER-1050 76 | # remove the old version of the view 77 | db.docs.update({ 78 | '_id': doc 79 | }, 80 | { 81 | '$pull': {"views": {"url": view.url}} 82 | }, safe=True) 83 | 84 | # add the new one back 85 | db.docs.update({ 86 | '_id': doc 87 | }, 88 | { 89 | '$push': {"views": view.to_mongo()} 90 | }, safe=True) 91 | 92 | # return it to the pool 93 | del db 94 | 95 | def update_attachment_view(doc, attachment, view): 96 | db = Doc._get_db() 97 | 98 | # two-stage push/pull as above 99 | db.docs.update({ 100 | '_id': doc, 101 | 'attachments.object_id': attachment 102 | }, 103 | { 104 | '$pull': {'attachments.$.views': {'url': view.url}} 105 | }, safe=True) 106 | 107 | db.docs.update({ 108 | '_id': doc, 109 | 'attachments.object_id': attachment 110 | }, 111 | { 112 | '$push': {'attachments.$.views': view.to_mongo()} 113 | }, safe=True) 114 | 115 | 116 | del db 117 | 118 | 119 | # the following is from http://stackoverflow.com/questions/377017/test-if-executable-exists-in-python 120 | def which(program): 121 | import os 122 | def is_exe(fpath): 123 | return os.path.exists(fpath) and os.access(fpath, os.X_OK) 124 | 125 | fpath, fname = os.path.split(program) 126 | if fpath: 127 | if is_exe(program): 128 | return program 129 | else: 130 | for path in os.environ["PATH"].split(os.pathsep): 131 | exe_file = os.path.join(path, program) 132 | if is_exe(exe_file): 133 | return exe_file 134 | 135 | return None 136 | 137 | # the following is from http://stackoverflow.com/questions/92438/stripping-non-printable-characters-from-a-string-in-python 138 | import unicodedata, re 139 | 140 | control_chars = ''.join(map(unichr, range(0,10) + range(11,13) + range(14,32) + range(127,160))) 141 | 142 | control_char_re = re.compile('[%s]' % re.escape(control_chars)) 143 | 144 | def remove_control_chars(s): 145 | return control_char_re.sub('', s) 
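# --- illustrative sketch (hypothetical helper, not used elsewhere in this module) ---
# The query helpers above (find_views / find_attachment_views) are normally paired with
# update_view / update_attachment_view: iterate over the views that match some flag,
# mutate the View object, and write it back with the two-stage $pull/$push update.
# A minimal sketch of that pattern, assuming the Doc/View models from regs_models:
def _example_requeue_failed_views():
    # find every view whose download previously failed
    for result in find_views(downloaded='failed'):
        view = result['view']
        # flip the flag back so the downloader will pick it up again
        view.downloaded = 'no'
        # persist the change via the pull-then-push update defined above
        update_view(result['doc'], view)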
146 | 147 | # extractor 148 | POPEN = subprocess.Popen 149 | _nbsp = re.compile('( ?| ?| ?)') 150 | def binary_extractor(binary, error=None, append=[], output_type="text"): 151 | if not type(binary) == list: 152 | binary = [binary] 153 | def extractor(filename): 154 | interpreter = POPEN(binary + [filename] + append, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 155 | 156 | timeout = Timeout(getattr(settings, 'EXTRACTION_TIMEOUT', 120), ChildTimeout) 157 | timeout.start() 158 | try: 159 | output, run_error = interpreter.communicate('') 160 | timeout.cancel() 161 | except ChildTimeout: 162 | print 'killing %s' % filename 163 | interpreter.kill() 164 | raise 165 | 166 | if (output_type == 'text' and not output.strip()) or (output_type == 'html' and html_is_empty(output)) or (error and (error in output or error in run_error)): 167 | raise ExtractionFailed() 168 | elif output_type == 'html': 169 | # strip non-breaking spaces 170 | return _nbsp.sub(' ', output) 171 | else: 172 | return output 173 | 174 | extractor.__str__ = lambda: binary[0] 175 | extractor.output_type = output_type 176 | 177 | return extractor 178 | 179 | def script_extractor(script, error=None, output_type="text"): 180 | script_path = os.path.join(os.path.dirname(os.path.abspath(regs_common.__file__)), 'scripts', script) 181 | 182 | extractor = binary_extractor([sys.executable, script_path], error=error, output_type=output_type) 183 | extractor.__str__ = lambda: script 184 | 185 | return extractor 186 | 187 | _tag_stripper = re.compile(r'<[^>]*?>') 188 | def strip_tags(text): 189 | return _tag_stripper.sub('', text) 190 | 191 | _body_finder = re.compile(r"<body[^>]*>(.*)</body>", re.I | re.DOTALL) 192 | _outline_finder = re.compile(r'<a name="outline"></a>\s*<h1>Document Outline</h1>\s*<ul>.*</ul>', re.I | re.DOTALL) 193 | def html_is_empty(text): 194 | # grab the body 195 | body = _body_finder.findall(text) 196 | if not body: 197 | return True 198 | 199 | # explicitly strip out pdftohtml's document outlines 200 | without_outline = _outline_finder.sub("", body[0]) 201 | 202 | body_text = strip_tags(without_outline).strip() 203 | if not body_text: 204 | return True 205 | 206 | return False 207 | 208 | def ocr_scrub(text): 209 | lines = re.split(r'\n', text) 210 | garbage = re.compile(r'[^a-zA-Z\s]') 211 | 212 | def is_real_line(word): 213 | letter_length = len(garbage.sub('', word)) 214 | return letter_length and len(word) and letter_length/float(len(word)) >= 0.5 215 | 216 | filtered_lines = [line.strip() for line in lines if line and is_real_line(line)] 217 | filtered_text = '\n'.join(filtered_lines) 218 | 219 | if len(filtered_text) / float(len(text)) < 0.5: 220 | raise ExtractionFailed('This is does not appear to be text.') 221 | 222 | return filtered_text 223 | 224 | def pdf_ocr(filename): 225 | basename = os.path.basename(filename).split('.')[0] 226 | working = '/tmp/%s' % basename 227 | if not os.path.exists(working): 228 | os.mkdir(working) 229 | os.chdir(working) 230 | 231 | def cleanup(): 232 | if working and working != '/tmp/': 233 | os.chdir('..') 234 | subprocess.Popen(['rm', '-rf', working], stdout=subprocess.PIPE).communicate() 235 | 236 | extractor = subprocess.Popen(['pdfimages', filename, basename], stdout=subprocess.PIPE, stderr=subprocess.PIPE) 237 | extractor_output, extractor_error = extractor.communicate() 238 | if extractor_error: 239 | cleanup() 240 | raise ExtractionFailed("Failed to extract image data from PDF.") 241 | 242 | pnm_match = re.compile(r"[a-zA-Z0-9]+-[0-9]+\.p.m") 
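# Note: pdfimages names its output "<basename>-NNN.pbm", ".pgm", or ".ppm" depending on
# whether the embedded image is monochrome, grayscale, or color; the ".p.m" wildcard in
# the pattern above lets a single regex match all three extensions.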
243 | pnms = [file for file in os.listdir(working) if pnm_match.match(file)] 244 | if not pnms: 245 | cleanup() 246 | raise ExtractionFailed("No images found in PDF.") 247 | 248 | converter = subprocess.Popen(['gm', 'mogrify', '-format', 'tiff', '-type', 'Grayscale'] + pnms, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 249 | converter_output, converter_error = converter.communicate() 250 | if converter_error: 251 | cleanup() 252 | raise ExtractionFailed("Failed to convert images to tiff.") 253 | 254 | tiff_match = re.compile(r"[a-zA-Z0-9]+-[0-9]+\.tiff") 255 | tiffs = [file for file in os.listdir(working) if tiff_match.match(file)] 256 | if not tiffs: 257 | cleanup() 258 | raise ExtractionFailed("Converted tiffs not found.") 259 | 260 | out = cStringIO.StringIO() 261 | for tiff in tiffs: 262 | tiff_base = tiff.split('.')[0] 263 | ocr = subprocess.Popen(['tesseract', tiff, tiff_base], stdout=subprocess.PIPE, stderr=subprocess.PIPE) 264 | ocr_output, ocr_error = ocr.communicate() 265 | 266 | txt_match = re.compile(r"[a-zA-Z0-9]+-[0-9]+\.txt") 267 | txts = [file for file in os.listdir(working) if txt_match.match(file)] 268 | if not txts: 269 | cleanup() 270 | raise ExtractionFailed("OCR failed to find any text.") 271 | 272 | for txt in txts: 273 | ocr_file = open(txt, 'r') 274 | out.write(ocr_file.read()) 275 | out.write('\n') 276 | 277 | try: 278 | return_data = ocr_scrub(out.getvalue()) 279 | except ExtractionFailed: 280 | cleanup() 281 | raise 282 | 283 | cleanup() 284 | return return_data 285 | pdf_ocr.__str__ = lambda: 'tesseract' 286 | pdf_ocr.ocr = True 287 | -------------------------------------------------------------------------------- /regscrape/regs_common/scripts/extract_docx.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # extracts text from docx files using the docx module by Mike MacCana 4 | 5 | from docx import * 6 | import sys 7 | 8 | if __name__ == '__main__': 9 | try: 10 | document = opendocx(sys.argv[1]) 11 | except: 12 | sys.stderr.write('Failed to decode file\n') 13 | exit() 14 | 15 | ## Fetch all the text out of the document we just created 16 | paratextlist = getdocumenttext(document) 17 | 18 | # Make explicit unicode version 19 | newparatextlist = [] 20 | for paratext in paratextlist: 21 | newparatextlist.append(paratext.encode("utf-8")) 22 | 23 | ## Print our document's text with two newlines under each paragraph 24 | sys.stdout.write('\n\n'.join(newparatextlist)) -------------------------------------------------------------------------------- /regscrape/regs_common/scripts/process_fr_docs.rb: -------------------------------------------------------------------------------- 1 | require 'us-documents' 2 | puts UnitedStates::Documents::FederalRegister.process STDIN.read -------------------------------------------------------------------------------- /regscrape/regs_common/tmp_redis.py: -------------------------------------------------------------------------------- 1 | try: 2 | import settings 3 | except ImportError: 4 | settings = object() 5 | 6 | import uuid, os, subprocess, time, shutil 7 | 8 | class TmpRedis(object): 9 | REDIS_CONFIG = {'daemonize': 'no', 'pidfile': '{path}/redis.pid', 'port': '0', 'bind': '127.0.0.1', 'unixsocket': '{path}/redis.sock', 'timeout': '300', 'loglevel': 'warning', 'logfile': 'stdout', 'databases': '1', '' : 'save 900 1\nsave 300 10\nsave 60 10000', 'rdbcompression': 'yes', 'dbfilename': 'dump.rdb', 'dir': '{path}/data', 'slave-serve-stale-data': 'yes',
'appendonly': 'no', 'appendfsync': 'everysec', 'no-appendfsync-on-rewrite': 'no', 'vm-enabled': 'no', 'vm-swap-file': '{path}/redis.swap', 'vm-max-memory': '0', 'vm-page-size': '32', 'vm-pages': '134217728', 'vm-max-threads': '4', 'hash-max-zipmap-entries': '512', 'hash-max-zipmap-value': '64', 'list-max-ziplist-entries': '512', 'list-max-ziplist-value': '64', 'set-max-intset-entries': '512', 'activerehashing': 'yes'} 10 | 11 | def get_config(self, **kwargs): 12 | return '\n'.join([' '.join(option).strip() for option in self.REDIS_CONFIG.items()]).format(**kwargs) 13 | 14 | def __init__(self, db_uuid=None): 15 | self.uuid = db_uuid if db_uuid else uuid.uuid4().__str__() 16 | 17 | redis_base = getattr(settings, 'REDIS_BASE', '/mnt/redis') 18 | redis_dir = os.path.join(redis_base, self.uuid) 19 | 20 | try: 21 | os.mkdir(redis_dir) 22 | os.mkdir(os.path.join(redis_dir, 'data')) 23 | except OSError: 24 | pass 25 | 26 | self.config = os.path.join(redis_dir, 'redis.conf') 27 | config_file = open(self.config, 'w') 28 | config_file.write(self.get_config(path=redis_dir)) 29 | config_file.close() 30 | 31 | self.directory = redis_dir 32 | self.socket = os.path.join(redis_dir, 'redis.sock') 33 | self.process = subprocess.Popen(['redis-server', self.config], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) 34 | 35 | time.sleep(1) 36 | 37 | def get_connection(self): 38 | from redis import Redis 39 | return Redis(unix_socket_path=self.socket) 40 | 41 | def get_pickle_connection(self): 42 | # define an inner class so that we don't have to import redis until we try to get a connection 43 | import cPickle 44 | from redis import Redis 45 | 46 | class PickleRedis(Redis): 47 | def get(self, key): 48 | data = super(PickleRedis, self).get(key) 49 | return cPickle.loads(data) if data else data 50 | 51 | def set(self, key, value): 52 | return super(PickleRedis, self).set(key, cPickle.dumps(value, -1)) 53 | 54 | return PickleRedis(unix_socket_path=self.socket) 55 | 56 | def terminate(self, delete=True): 57 | self.process.terminate() 58 | time.sleep(1) 59 | 60 | if self.process.poll() is None: 61 | self.process.kill() 62 | 63 | if delete: 64 | shutil.rmtree(self.directory) 65 | -------------------------------------------------------------------------------- /regscrape/regs_common/transfer.py: -------------------------------------------------------------------------------- 1 | import urllib2, urllib3 2 | import subprocess, os, re 3 | from gevent.pool import Pool 4 | from gevent import Timeout 5 | import greenlet 6 | import settings 7 | import datetime 8 | import sys 9 | import traceback 10 | import time 11 | 12 | def pump(input, output, chunk_size): 13 | size = 0 14 | while True: 15 | chunk = input.read(chunk_size) 16 | if not chunk: break 17 | output.write(chunk) 18 | size += len(chunk) 19 | return size 20 | 21 | def download(url, output_file, post_data=None, headers=None): 22 | transfer = urllib2.urlopen(urllib2.Request(url, post_data, headers if headers else {}), timeout=10) if type(url) in (unicode, str) else url 23 | 24 | out = open(output_file, 'wb') 25 | size = pump(transfer, out, 16 * 1024) 26 | out.close() 27 | 28 | return size 29 | 30 | def download_wget(url, output_file): 31 | proc = subprocess.Popen(['wget', '-nv', url, '-O', output_file], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, stdin=subprocess.PIPE) 32 | out = proc.communicate('') 33 | if 'URL:' in out[0] and os.path.exists(output_file): 34 | return os.stat(output_file).st_size 35 | elif 'ERROR' in out[0]: 36 | error_match =
re.match('.*ERROR (\d{3}): (.*)', out[0].strip().replace('\n', ' ')) 37 | if error_match: 38 | error_groups = error_match.groups() 39 | raise urllib2.HTTPError(url, error_groups[0], error_groups[1], {}, None) 40 | raise Exception("Something went wrong with the download.") 41 | 42 | # pooled and timed-out versions of the transfer code 43 | def tpump(input, output, chunk_size): 44 | size = 0 45 | while True: 46 | try: 47 | timeout = Timeout.start_new(5) 48 | chunk = input.read(chunk_size) 49 | timeout.cancel() 50 | except Timeout: 51 | input.release_conn() 52 | raise 53 | 54 | if not chunk: break 55 | output.write(chunk) 56 | size += len(chunk) 57 | return size 58 | 59 | def download_pooled(url, output_file): 60 | transfer = CPOOL.urlopen("GET", url, timeout=10, preload_content=False) 61 | if transfer.status != 200: 62 | raise urllib2.HTTPError(url, transfer.status, transfer.reason, transfer.headers, None) 63 | 64 | out = open(output_file, 'wb') 65 | size = tpump(transfer, out, 16 * 1024) 66 | out.close() 67 | 68 | return size 69 | 70 | def _get_downloader(status_func, download_func, retries, verbose, min_size, url, filename, record=None): 71 | def download_file(): 72 | for try_num in xrange(retries): 73 | if verbose: print 'Downloading %s (try #%d, downloader %s)...' % (url, try_num, hash(greenlet.getcurrent())) 74 | 75 | download_succeeded = False 76 | download_message = None 77 | size = 0 78 | try: 79 | start = datetime.datetime.now() 80 | size = download_func(url, filename) 81 | download_succeeded = True 82 | elapsed = datetime.datetime.now() - start 83 | except urllib2.HTTPError as e: 84 | if verbose: print 'Download of %s failed due to error %s.' % (url, e.code) 85 | download_message = e.code 86 | 87 | if int(e.code) == 429: 88 | if verbose: print 'Error occurred due to rate limiting; waiting 10 minutes.' 89 | time.sleep(600) 90 | except Timeout as e: 91 | if verbose: print 'Download of %s timed out.' % url 92 | except: 93 | exc = sys.exc_info() 94 | if verbose: print traceback.print_tb(exc[2]) 95 | 96 | if download_succeeded: 97 | if size >= min_size: 98 | # print status 99 | ksize = int(round(size/1024.0)) 100 | if verbose: print 'Downloaded %s to %s: %sk in %s seconds (%sk/sec)' % (url, filename, ksize, elapsed.seconds, round(float(ksize)/elapsed.seconds * 10)/10 if elapsed.seconds > 0 else '--') 101 | break 102 | else: 103 | download_succeeded = False 104 | download_message = "Resulting file was smaller than the minimum file size." 
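# Note: only an attempt that both succeeds and meets min_size reaches the break above;
# every other outcome (HTTP error, timeout, undersized file) falls through to the next
# retry, and status_func() below is called once with whatever the final attempt produced.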
105 | if verbose: print download_message 106 | 107 | status_func( 108 | (download_succeeded, download_message), 109 | url, 110 | filename, 111 | record 112 | ) 113 | return download_file 114 | 115 | 116 | def bulk_download(download_iterable, status_func=None, retries=3, verbose=False, min_size=0): 117 | workers = Pool(getattr(settings, 'DOWNLOADERS', 5)) 118 | 119 | # keep the downloaders busy with tasks as long as there are more results 120 | for download_record in download_iterable: 121 | workers.spawn(_get_downloader(status_func, download, retries, verbose, min_size, *download_record)) 122 | 123 | workers.join() 124 | 125 | return 126 | 127 | CPOOL = None 128 | def pooled_bulk_download(download_iterable, status_func=None, retries=5, verbose=False, min_size=0): 129 | num_downloaders = getattr(settings, 'DOWNLOADERS', 5) 130 | global CPOOL 131 | if not CPOOL: 132 | CPOOL = urllib3.PoolManager(num_pools=2, maxsize=num_downloaders * 2) 133 | 134 | workers = Pool(num_downloaders) 135 | 136 | # keep the downloaders busy with tasks as long as there are more results 137 | for download_record in download_iterable: 138 | workers.spawn(_get_downloader(status_func, download_pooled, retries, verbose, min_size, *download_record)) 139 | 140 | workers.join() 141 | 142 | return -------------------------------------------------------------------------------- /regscrape/regs_common/util.py: -------------------------------------------------------------------------------- 1 | import time 2 | import settings 3 | from pymongo import Connection 4 | import os 5 | from gevent_mongo import Mongo 6 | import urllib2 7 | import subprocess 8 | import re 9 | import hashlib, crockford 10 | 11 | def get_db(): 12 | db_settings = getattr(settings, 'DB_SETTINGS', {}) 13 | return Mongo(getattr(settings, 'DB_NAME', 'regulations'), settings.INSTANCES + 2, **db_settings).get_conn() 14 | 15 | def bootstrap_settings(): 16 | if not getattr(settings, 'DOWNLOAD_DIR', False): 17 | settings.DOWNLOAD_DIR = os.path.join(settings.DATA_DIR, 'downloads') 18 | 19 | if not getattr(settings, 'DUMP_DIR', False): 20 | settings.DUMP_DIR = os.path.join(settings.DATA_DIR, 'dumps') 21 | 22 | def listify(item): 23 | if not item: 24 | return [] 25 | if type(item) in (str, unicode, dict): 26 | return [item] 27 | return list(item) 28 | 29 | def crockford_hash(s): 30 | h = hashlib.md5(s) 31 | return crockford.b32encode(h.digest()) -------------------------------------------------------------------------------- /regscrape/regsdotgov/__init__.py: -------------------------------------------------------------------------------- 1 | # add self to path 2 | import sys 3 | import os 4 | CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) 5 | if CURRENT_DIR not in sys.path: 6 | sys.path.append(CURRENT_DIR) -------------------------------------------------------------------------------- /regscrape/regsdotgov/commands/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunlightlabs/regulations-scraper/5f2644a3cf54f915d7d90957645073737ab91022/regscrape/regsdotgov/commands/__init__.py -------------------------------------------------------------------------------- /regscrape/regsdotgov/commands/rdg_create_agencies.py: -------------------------------------------------------------------------------- 1 | GEVENT = False 2 | 3 | def run(): 4 | from regs_models import Agency 5 | import subprocess, re, urllib2 6 | 7 | BAD_SPACES = re.compile("(\xc2\xa0|\r)") 8 | AGENCY_LINE = 
re.compile(r"^[A-Z\s\.\,\&\-\'\(\)\/]*[A-Z]+[A-Z\s\(\)]*$") 9 | REGULAR_LINE = re.compile(r"^[A-Z]{2,}\s{3,}[A-Z]+.*$") 10 | AGENCY_ONLY_LINE = re.compile(r"^[A-Z]{2,}\s*$") 11 | DESCRIPTION_ONLY_LINE = re.compile(r"^\s{3,}[A-Z]+.*$") 12 | THREE_SPACES = re.compile("\s{3,}") 13 | SPACES = re.compile(r"\s+") 14 | AMPERSAND = re.compile(r"(?<=[A-Z])\&") 15 | 16 | new = 0 17 | 18 | print 'Fetching agencies...' 19 | agencies = {} 20 | 21 | ml_descs = [] 22 | ml_agency = None 23 | 24 | participating = {} 25 | 26 | for file in ["Participating_Agencies.pdf", "Non_Participating_Agencies.pdf"]: 27 | data = urllib2.urlopen("http://www.regulations.gov/docs/%s" % file) 28 | pdftotext = subprocess.Popen(["pdftotext", "-layout", "-", "-"], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 29 | text = pdftotext.communicate(data.read()) 30 | 31 | contents = BAD_SPACES.sub(" ", text[0]) 32 | 33 | agency_lines = [line for line in contents.split("\n") if AGENCY_LINE.match(line)] 34 | 35 | for line in agency_lines: 36 | if REGULAR_LINE.match(line): 37 | split = THREE_SPACES.split(line, maxsplit=1) 38 | a_name = split[0].strip() 39 | a_desc = split[1].strip() 40 | 41 | agencies[a_name] = a_desc 42 | participating[a_name] = "Non" not in file 43 | elif AGENCY_ONLY_LINE.match(line): 44 | ml_agency = line.strip() 45 | elif DESCRIPTION_ONLY_LINE.match(line): 46 | ml_descs.append(line.strip()) 47 | if ml_agency: 48 | agencies[ml_agency] = " ".join(ml_descs) 49 | participating[ml_agency] = "Non" not in file 50 | ml_agency = None 51 | ml_descs = [] 52 | else: 53 | print "Broken line:", line 54 | 55 | # hard-coded SIGAR, because it's messed up in the PDF 56 | agencies["SIGAR"] = "SPECIAL INSPECTOR GENERAL FOR AFGHANISTAN RECONSTRUCTION" 57 | participating["SIGAR"] = False 58 | 59 | print 'Saving agencies...' 60 | 61 | stop_words = ['the', 'and', 'of', 'on', 'in', 'for'] 62 | for agency, name in agencies.items(): 63 | # fix ampersand weirdness 64 | name = AMPERSAND.sub(" & ", name) 65 | 66 | # fix spacing and capitalization 67 | name_parts = SPACES.split(name) 68 | capitalized_parts = [name_parts[0].title()] + [word.title() if word.lower() not in stop_words else word.lower() for word in name_parts[1:]] 69 | name = ' '.join(capitalized_parts) 70 | 71 | new += Agency.objects(id=agency).update( 72 | set__name=name, 73 | set__rdg_participating=participating[agency], 74 | 75 | upsert=True, 76 | safe_update=True 77 | ) 78 | 79 | print 'Iterated over %s agencies.' 
% (len(agencies)) 80 | 81 | return {'total': len(agencies), 'new': new} 82 | -------------------------------------------------------------------------------- /regscrape/regsdotgov/commands/rdg_download.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import settings 4 | MIN_SIZE = getattr(settings, 'MIN_DOWNLOAD_SIZE', 512) 5 | 6 | from optparse import OptionParser 7 | arg_parser = OptionParser() 8 | arg_parser.add_option("-a", "--agency", dest="agency", action="store", type="string", default=None, help="Specify an agency to which to limit the dump.") 9 | arg_parser.add_option("-d", "--docket", dest="docket", action="store", type="string", default=None, help="Specify a docket to which to limit the dump.") 10 | 11 | def run(options, args): 12 | # global imports hack so we don't mess up gevent loading 13 | global pooled_bulk_download, settings, subprocess, os, urlparse, sys, traceback, datetime, pymongo, hashlib 14 | from regs_common.processing import find_views, update_view, find_attachment_views, update_attachment_view 15 | from regs_common.transfer import pooled_bulk_download 16 | import subprocess, os, urlparse, sys, traceback, datetime, hashlib 17 | import pymongo 18 | 19 | # ensure that our hash directories are all there 20 | for hex_dir in [hex(i).split('x').pop().zfill(2) for i in range(256)]: 21 | dir_path = os.path.join(settings.DOWNLOAD_DIR, hex_dir) 22 | if not os.path.exists(dir_path): 23 | os.mkdir(dir_path) 24 | 25 | return { 26 | 'document_views': run_for_view_type('document views', find_views, update_view, options), 27 | 'attachment_views': run_for_view_type('attachment views', find_attachment_views, update_attachment_view, options) 28 | } 29 | 30 | def run_for_view_type(view_label, find_func, update_func, options): 31 | print 'Preparing download of %s.' 
% view_label 32 | 33 | query = {'deleted': False} 34 | if options.agency: 35 | query['agency'] = options.agency 36 | if options.docket: 37 | query['docket_id'] = options.docket 38 | 39 | views = find_func(downloaded="no", query=query) 40 | 41 | # track stats -- no locks because yay for cooperative multitasking 42 | stats = {'downloaded': 0, 'failed': 0} 43 | 44 | # hack around stupid Python closure behavior 45 | v_array = [views] 46 | def download_generator(): 47 | while True: 48 | try: 49 | result = v_array[0].next() 50 | 51 | save_hash = hashlib.md5(result['view'].url).hexdigest() 52 | save_name = '%s.%s' % (result['view'].object_id if result['view'].object_id else save_hash, result['view'].type) 53 | save_path = os.path.join(settings.DOWNLOAD_DIR, save_hash[:2], save_name) 54 | 55 | fetch_url = result['view'].url 56 | if "api.data.gov/regulations/v3/download" in fetch_url and "api_key" not in fetch_url: 57 | # this requires an API key but one wasn't included in the upstream-provided URL, so add one 58 | fetch_url = fetch_url + "&api_key=" + settings.DDG_API_KEY 59 | 60 | yield (fetch_url, save_path, result) 61 | except pymongo.errors.OperationFailure: 62 | # occasionally pymongo seems to lose track of the cursor for some reason, so reset the query 63 | v_array[0] = find_func(downloaded="no", query=query) 64 | continue 65 | except StopIteration: 66 | break 67 | 68 | def status_func(status, url, filename, result): 69 | if status[0]: 70 | result['view'].downloaded = "yes" 71 | result['view'].file_path = filename 72 | stats['downloaded'] += 1 73 | else: 74 | result['view'].downloaded = "failed" 75 | stats['failed'] += 1 76 | update_func(**result) 77 | 78 | pooled_bulk_download(download_generator(), status_func, verbose=not options.parsable, min_size=MIN_SIZE) 79 | 80 | print 'Done with %s.' 
% view_label 81 | 82 | return stats 83 | 84 | if __name__ == "__main__": 85 | run() 86 | -------------------------------------------------------------------------------- /regscrape/regsdotgov/commands/rdg_dump_api.py: -------------------------------------------------------------------------------- 1 | from optparse import OptionParser 2 | arg_parser = OptionParser() 3 | arg_parser.add_option("-a", "--agency", dest="agency", action="store", type="string", default=None, help="Specify an agency to which to limit the dump.") 4 | arg_parser.add_option("-d", "--docket", dest="docket", action="store", type="string", default=None, help="Specify a docket to which to limit the dump.") 5 | 6 | def run(options, args): 7 | import urllib2, httplib 8 | import settings 9 | import os, time, sys 10 | from regsdotgov.search import search, parsed_search 11 | from regs_common.transfer import download 12 | 13 | search_args = { 14 | # order ascending by posted date to reduce pagination errors 15 | "sb": "postedDate", 16 | "so": "ASC" 17 | } 18 | id_string = 'all' 19 | if options.agency and options.docket: 20 | raise Exception("Specify either an agency or a docket") 21 | elif options.agency: 22 | search_args['agency'] = options.agency 23 | id_string = 'agency_' + options.agency 24 | elif options.docket: 25 | search_args['docket'] = options.docket 26 | id_string = 'docket_' + options.docket.replace('-', '_') 27 | 28 | # delete old dumps 29 | [os.unlink(os.path.join(settings.DUMP_DIR, file)) for file in os.listdir(settings.DUMP_DIR) if file.startswith('dump_%s' % id_string) and file.endswith('.json')] 30 | 31 | # keep stats 32 | stats = {'downloaded': 0, 'failed': 0} 33 | 34 | # start new dumps 35 | position = 0 36 | increment = 1000 37 | total = parsed_search(1, 0, **search_args)['totalNumRecords'] 38 | num_digits = len(str(settings.DUMP_END)) 39 | while position <= total: 40 | for i in range(3): 41 | try: 42 | current_str = (position / increment) + 1 43 | total_str = '?' if total == 1 else (total / increment) + 1 44 | print "Downloading page %s of %s..." % (current_str, total_str) 45 | download( 46 | search(increment, position, **search_args), 47 | os.path.join(settings.DUMP_DIR, 'dump_%s_%s.json' % (id_string, str(position).zfill(num_digits))), 48 | ) 49 | stats['downloaded'] += 1 50 | break 51 | except (urllib2.HTTPError, httplib.HTTPException) as e: 52 | if i < 2: 53 | if hasattr(e, 'code') and e.code in (503, 429) and 'rate' in e.read().lower(): 54 | print 'Download failed because of rate limiting; will retry in an hour...' 55 | time.sleep(3600) 56 | else: 57 | print 'Download failed; will retry in 10 seconds...' 58 | time.sleep(10) 59 | else: 60 | print 'System troubles; giving up.' 
61 | raise 62 | 63 | position += increment 64 | 65 | return stats 66 | -------------------------------------------------------------------------------- /regscrape/regsdotgov/commands/rdg_parse_api.py: -------------------------------------------------------------------------------- 1 | GEVENT = False 2 | 3 | import os 4 | import settings 5 | import sys 6 | from search import parse, iter_parse, result_to_model 7 | import pytz 8 | import datetime 9 | import operator 10 | import time 11 | import json 12 | import re 13 | from regs_common.tmp_redis import TmpRedis 14 | from regs_common.mp_types import Counter 15 | from regs_common.util import listify 16 | from regsdotgov.document import make_view 17 | from regs_models import * 18 | 19 | 20 | import multiprocessing 21 | from Queue import Empty 22 | 23 | from optparse import OptionParser 24 | arg_parser = OptionParser() 25 | arg_parser.add_option("-m", "--multi", dest="multi", action="store", type="int", default=multiprocessing.cpu_count(), help="Set number of worker processes. Defaults to number of cores if not specified.") 26 | arg_parser.add_option("-k", "--keep-cache", dest="keep_cache", action="store_true", default=False, help="Prevents the cache from being deleted at the end of processing to make testing faster.") 27 | arg_parser.add_option("-u", "--use-cache", dest="use_cache", action="store", default=None, help="Use pre-existing cache to make testing faster.") 28 | arg_parser.add_option("-A", "--add-only", dest="add_only", action="store_true", default=False, help="Skip reconciliation, assume that all records are new, and go straight to the add step.") 29 | arg_parser.add_option("-a", "--agency", dest="agency", action="store", type="string", default=None, help="Specify an agency to which to limit the dump.") 30 | arg_parser.add_option("-d", "--docket", dest="docket", action="store", type="string", default=None, help="Specify a docket to which to limit the dump.") 31 | 32 | def repair_views(old_views, new_views): 33 | for new_view in new_views: 34 | already_exists = [view for view in old_views if view.type == new_view.type] 35 | if not already_exists: 36 | old_views.append(new_view) 37 | elif already_exists and already_exists[0].downloaded == 'failed': 38 | already_exists[0].downloaded = "no" 39 | 40 | def reconcile_process(record, cache, db, now, repaired_counter, updated_counter, deleted_counter): 41 | # check and see if this doc has been updated 42 | new_record = cache.get(record['_id']) 43 | if new_record: 44 | # do we need to fix anything? 
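# Note: "statuses" below is a list of lists of per-view "downloaded" flags: the first
# inner list covers the document's own views, followed by one inner list per attachment.
# The reduce(operator.add, ...) in the condition flattens it, so a single 'failed'
# anywhere (doc view or attachment view) is enough to trigger a repair.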
45 | statuses = [[view['downloaded'] for view in record.get('views', [])]] + [[view['downloaded'] for view in attachment.get('views', [])] for attachment in record.get('attachments', [])] 46 | 47 | #main_views = [make_view(format) for format in listify(new_record.get('fileFormats', []))] 48 | 49 | if record['scraped'] == 'failed' or 'failed' in reduce(operator.add, statuses, []) or (record['scraped'] == 'yes' and len(record.get('attachments', [])) != new_record.get('attachmentCount', 0)): 50 | # needs a repair; grab the full document 51 | current_docs = Doc.objects(id=record['_id']) 52 | 53 | db_doc = current_docs[0] 54 | 55 | db_doc.scraped = "no" 56 | 57 | # rebuild views 58 | #repair_views(db_doc.views, main_views) 59 | 60 | # update the last-seen date 61 | db_doc.last_seen = now 62 | 63 | # reset a couple of flags to trigger reprocessing 64 | db_doc.in_search_index = False 65 | db_doc.in_cluster_db = False 66 | db_doc.entities_last_extracted = None 67 | 68 | # do save 69 | try: 70 | db_doc.save() 71 | repaired_counter.increment() 72 | except: 73 | print "Failed to repair %s" % db_doc.id 74 | else: 75 | # we don't need a full repair, so just do an update on the date 76 | Doc.objects(id=record['_id']).update_one(set__last_seen=now) 77 | updated_counter.increment() 78 | 79 | # either way, delete the document from the cache so we can tell what's new at the end 80 | cache.delete(record['_id']) 81 | else: 82 | # this document isn't in the new data anymore, so mark it deleted 83 | Doc.objects(id=record['_id']).update_one(set__deleted=True) 84 | deleted_counter.increment() 85 | 86 | def reconcile_worker(todo_queue, cache_wrapper, now, repaired_counter, updated_counter, deleted_counter): 87 | pid = os.getpid() 88 | 89 | print '[%s] Reconciliation worker started.' % pid 90 | 91 | cache = cache_wrapper.get_pickle_connection() 92 | 93 | import pymongo 94 | db = pymongo.Connection(**settings.DB_SETTINGS)[settings.DB_NAME] 95 | 96 | while True: 97 | record = todo_queue.get() 98 | 99 | reconcile_process(record, cache, db, now, repaired_counter, updated_counter, deleted_counter) 100 | 101 | todo_queue.task_done() 102 | 103 | def add_new_docs(cache_wrapper, now): 104 | print 'Adding new documents to the database...' 105 | 106 | cache = cache_wrapper.get_pickle_connection() 107 | 108 | new = 0 109 | for id in cache.keys(): 110 | doc = cache.get(id) 111 | 112 | if doc.get('documentStatus', None) == "Withdrawn": 113 | continue 114 | 115 | db_doc = result_to_model(doc, now=now) 116 | 117 | try: 118 | db_doc.save() 119 | new += 1 120 | except: 121 | print "Failed to save document %s" % db_doc.id 122 | 123 | written = new 124 | print 'Wrote %s new documents.' 
% (written) 125 | 126 | return written 127 | 128 | def reconcile_dumps(options, cache_wrapper, now): 129 | sys.stdout.write('Reconciling dumps with current data...\n') 130 | sys.stdout.flush() 131 | 132 | # get workers going 133 | num_workers = options.multi 134 | 135 | todo_queue = multiprocessing.JoinableQueue(num_workers * 3) 136 | repaired_counter = Counter() 137 | updated_counter = Counter() 138 | deleted_counter = Counter() 139 | 140 | processes = [] 141 | for i in range(num_workers): 142 | proc = multiprocessing.Process(target=reconcile_worker, args=(todo_queue, cache_wrapper, now, repaired_counter, updated_counter, deleted_counter)) 143 | proc.start() 144 | processes.append(proc) 145 | 146 | import pymongo 147 | db = pymongo.Connection(**settings.DB_SETTINGS)[settings.DB_NAME] 148 | 149 | conditions = {'last_seen': {'$lt': now}, 'deleted': False, 'source': 'regulations.gov'} 150 | if options.agency: 151 | conditions['agency'] = options.agency 152 | if options.docket: 153 | conditions['docket_id'] = options.docket 154 | 155 | fields = {'_id': 1, 'scraped': 1, 'views.downloaded': 1, 'views.type': 1, 'attachments.views.downloaded': 1, 'attachments.views.type': 1, 'attachments.object_id': 1} 156 | to_check = db.docs.find(conditions, fields) 157 | 158 | while True: 159 | try: 160 | record = to_check.next() 161 | except pymongo.errors.OperationFailure: 162 | print 'OH NOES!' 163 | to_scrape = db.docs.find(conditions, fields) 164 | continue 165 | except StopIteration: 166 | break 167 | 168 | todo_queue.put(record) 169 | 170 | todo_queue.join() 171 | 172 | for proc in processes: 173 | print 'Terminating reconciliation worker %s...' % proc.pid 174 | proc.terminate() 175 | 176 | # compile and print some stats 177 | num_updated = updated_counter.value 178 | num_repaired = repaired_counter.value 179 | num_deleted = deleted_counter.value 180 | num_docs = num_updated + num_repaired + num_deleted 181 | print 'Reconciliation complete: examined %s documents, of which %s were updated, %s were repaired, and %s were flagged as deleted.' % (num_docs, num_updated, num_repaired, num_deleted) 182 | 183 | return {'updated': num_updated, 'repaired': num_repaired, 'deleted': num_deleted} 184 | 185 | def parser_process(file, cache): 186 | docs = iter_parse(os.path.join(settings.DUMP_DIR, file)) 187 | print '[%s] Done with JSON decode.' % os.getpid() 188 | 189 | count = 0 190 | for doc in docs: 191 | cache.set(doc['documentId'], doc) 192 | count += 1 193 | 194 | return {'docs': count} 195 | 196 | def parser_worker(todo_queue, done_queue, cache_wrapper): 197 | pid = os.getpid() 198 | 199 | print '[%s] Parser worker started.' 
% pid 200 | 201 | cache = cache_wrapper.get_pickle_connection() 202 | 203 | while True: 204 | file = todo_queue.get() 205 | 206 | sys.stdout.write('[%s] Parsing file %s...\n' % (pid, file)) 207 | sys.stdout.flush() 208 | start = datetime.datetime.now() 209 | 210 | stats = parser_process(file, cache) 211 | 212 | elapsed = datetime.datetime.now() - start 213 | sys.stdout.write('[%s] Done with %s in %s minutes\n' % (pid, file, round(elapsed.total_seconds() / 60.0))) 214 | sys.stdout.flush() 215 | 216 | done_queue.put(stats) 217 | 218 | todo_queue.task_done() 219 | 220 | def parse_dumps(options, cache_wrapper): 221 | # figure out which files are ours 222 | id_string = 'all' 223 | if options.agency and options.docket: 224 | raise Exception("Specify either an agency or a docket") 225 | elif options.agency: 226 | id_string = 'agency_' + options.agency 227 | elif options.docket: 228 | id_string = 'docket_' + options.docket.replace('-', '_') 229 | 230 | num_workers = options.multi 231 | files = [file for file in os.listdir(settings.DUMP_DIR) if file.startswith('dump_%s' % id_string) and file.endswith('.json')] 232 | 233 | if len(files) < 1: 234 | # something is wrong, as there should be more than ten files 235 | raise Exception('Too few .json files; something went wrong.') 236 | 237 | # it's a small number of files, so just make a queue big enough to hold them all, to keep from having to block 238 | todo_queue = multiprocessing.JoinableQueue(len(files)) 239 | done_queue = multiprocessing.Queue(len(files)) 240 | 241 | sys.stdout.write('Starting parser workers...\n') 242 | processes = [] 243 | for i in range(num_workers): 244 | proc = multiprocessing.Process(target=parser_worker, args=(todo_queue, done_queue, cache_wrapper)) 245 | proc.start() 246 | processes.append(proc) 247 | 248 | for file in files: 249 | todo_queue.put(file) 250 | 251 | todo_queue.join() 252 | 253 | for proc in processes: 254 | print 'Terminating parser worker %s...' % proc.pid 255 | proc.terminate() 256 | 257 | # print totals 258 | print 'Done parsing files.' 259 | 260 | def run(options, args): 261 | sys.stdout.write('Starting decoding...\n') 262 | sys.stdout.flush() 263 | 264 | # get workers going 265 | now = datetime.datetime.now(tz=pytz.utc) 266 | 267 | num_workers = options.multi 268 | 269 | # set up caching 270 | sys.stdout.write('Spinning up Redis instance...\n') 271 | 272 | if options.use_cache: 273 | cache_wrapper = TmpRedis(db_uuid=options.use_cache) 274 | # give it time to rebuild its cache from disk if we're using an already-built cache 275 | sys.stdout.write('Loading cache from disk...') 276 | time.sleep(15) 277 | sys.stdout.write(' done.\n') 278 | else: 279 | cache_wrapper = TmpRedis() 280 | parse_dumps(options, cache_wrapper) 281 | 282 | stats = {} 283 | if not options.add_only: 284 | stats = reconcile_dumps(options, cache_wrapper, now) 285 | else: 286 | print 'Skipping reconciliation step.' 287 | 288 | # still-existing and deleted stuff is now done, but we still have to do the new stuff 289 | stats['new'] = add_new_docs(cache_wrapper, now) 290 | 291 | sys.stdout.write('Terminating Redis cache...\n') 292 | 293 | if options.keep_cache: 294 | cache_wrapper.terminate(delete=False) 295 | print 'Cache preserved with UUID %s.' 
% cache_wrapper.uuid 296 | else: 297 | cache_wrapper.terminate() 298 | 299 | return stats -------------------------------------------------------------------------------- /regscrape/regsdotgov/commands/rdg_scrape.py: -------------------------------------------------------------------------------- 1 | GEVENT = False 2 | 3 | import settings 4 | from regs_models import * 5 | from regsdotgov.document import scrape_document 6 | import urllib2, urllib3 7 | import sys 8 | import os 9 | import traceback 10 | import pymongo 11 | import time 12 | 13 | import multiprocessing 14 | from Queue import Empty 15 | from regs_common.mp_types import Counter 16 | from regs_common.exceptions import DoesNotExist, RateLimitException 17 | 18 | from optparse import OptionParser 19 | arg_parser = OptionParser() 20 | arg_parser.add_option("-m", "--multi", dest="multi", action="store", type="int", default=multiprocessing.cpu_count(), help="Set number of worker processes. Defaults to number of cores if not specified.") 21 | arg_parser.add_option("-a", "--agency", dest="agency", action="store", type="string", default=None, help="Specify an agency to which to limit the dump.") 22 | arg_parser.add_option("-d", "--docket", dest="docket", action="store", type="string", default=None, help="Specify a docket to which to limit the dump.") 23 | 24 | def process_record(record, num_succeeded, num_failed, cpool): 25 | if record is None: 26 | return 27 | 28 | new_doc = None 29 | 30 | for i in range(2): 31 | error = None 32 | removed = False 33 | try: 34 | new_doc = scrape_document(record.id, cpool) 35 | new_doc.last_seen = record.last_seen 36 | new_doc.created = record.created 37 | print '[%s] Scraped doc %s...' % (os.getpid(), new_doc.id) 38 | 39 | num_succeeded.increment() 40 | break 41 | except DoesNotExist: 42 | print "Document %s appears to have been deleted; skipping." % record.id 43 | removed = True 44 | break 45 | except KeyboardInterrupt: 46 | raise 47 | except RateLimitException: 48 | print '### Warning: scrape failed on try %s because of RATE LIMIT' % i 49 | time.sleep(3600) 50 | except: 51 | print 'Warning: scrape failed on try %s' % i 52 | error = sys.exc_info() 53 | traceback.print_tb(error[2], file=sys.stdout) 54 | 55 | # catch renames of documents 56 | if new_doc and (not error) and (not removed) and new_doc.id != record.id: 57 | renamed_to = new_doc.id 58 | new_doc = Doc.objects(id=record.id)[0] 59 | new_doc.scraped = 'yes' 60 | new_doc.attachments = [] 61 | new_doc.views = [] 62 | new_doc.details['renamed_to'] = renamed_to 63 | new_doc.renamed = True 64 | 65 | # catch errors and removes 66 | if removed: 67 | num_failed.increment() 68 | return None 69 | elif error or not new_doc: 70 | new_doc = Doc.objects(id=record.id)[0] 71 | new_doc.scraped = 'failed' 72 | if error: 73 | print 'Scrape of %s failed because of %s' % (new_doc.id, str(error)) 74 | num_failed.increment() 75 | 76 | try: 77 | new_doc.save() 78 | except: 79 | print "Warning: database save failed on document %s (scraped based on original doc ID %s)." % (new_doc.id, record.id) 80 | traceback.print_exc() 81 | 82 | def worker(todo_queue, num_succeeded, num_failed): 83 | pid = os.getpid() 84 | cpool = urllib3.PoolManager(maxsize=2) 85 | 86 | print '[%s] Worker started.' 
% pid 87 | 88 | while True: 89 | record = Doc._from_son(todo_queue.get()) 90 | 91 | process_record(record, num_succeeded, num_failed, cpool) 92 | 93 | todo_queue.task_done() 94 | 95 | def run(options, args): 96 | sys.stdout.write('Starting scrape...\n') 97 | sys.stdout.flush() 98 | 99 | # get workers going 100 | num_workers = options.multi 101 | 102 | todo_queue = multiprocessing.JoinableQueue(num_workers * 3) 103 | 104 | # set up some counters to track progress 105 | num_succeeded = Counter() 106 | num_failed = Counter() 107 | 108 | processes = [] 109 | for i in range(num_workers): 110 | proc = multiprocessing.Process(target=worker, args=(todo_queue, num_succeeded, num_failed)) 111 | proc.start() 112 | processes.append(proc) 113 | 114 | conditions = {'scraped': 'no', 'deleted': False, 'source': 'regulations.gov'} 115 | if options.agency: 116 | conditions['agency'] = options.agency 117 | if options.docket: 118 | conditions['docket_id'] = options.docket 119 | to_scrape = Doc.objects(**conditions).only('id', 'last_seen', 'created', 'views', 'attachments') 120 | 121 | while True: 122 | try: 123 | record = to_scrape.next() 124 | except pymongo.errors.OperationFailure: 125 | to_scrape = Doc.objects(**conditions).only('id', 'last_seen', 'created', 'views', 'attachments') 126 | continue 127 | except StopIteration: 128 | break 129 | 130 | todo_queue.put(record.to_mongo()) 131 | 132 | todo_queue.join() 133 | 134 | for proc in processes: 135 | print 'Terminating worker %s...' % proc.pid 136 | proc.terminate() 137 | 138 | print 'Scrape complete with %s successes and %s failures.' % (num_succeeded.value, num_failed.value) 139 | return {'scraped': num_succeeded.value, 'failed': num_failed.value} 140 | -------------------------------------------------------------------------------- /regscrape/regsdotgov/commands/rdg_scrape_dockets.py: -------------------------------------------------------------------------------- 1 | GEVENT = False 2 | 3 | import settings 4 | from regsdotgov.document import scrape_docket 5 | import urllib2, urllib3 6 | import sys 7 | import os 8 | import traceback 9 | from regs_models import * 10 | import pymongo 11 | import time 12 | 13 | import multiprocessing 14 | from Queue import Empty 15 | from regs_common.mp_types import Counter 16 | from regs_common.exceptions import DoesNotExist, RateLimitException 17 | 18 | from optparse import OptionParser 19 | arg_parser = OptionParser() 20 | arg_parser.add_option("-m", "--multi", dest="multi", action="store", type="int", default=multiprocessing.cpu_count(), help="Set number of worker processes. Defaults to number of cores if not specified.") 21 | arg_parser.add_option("-a", "--agency", dest="agency", action="store", type="string", default=None, help="Specify an agency to which to limit the dump.") 22 | arg_parser.add_option("-d", "--docket", dest="docket", action="store", type="string", default=None, help="Specify a docket to which to limit the dump.") 23 | 24 | def process_record(record, num_succeeded, num_failed, cpool): 25 | if record is None: 26 | return 27 | 28 | docket = None 29 | 30 | for i in range(2): 31 | error = None 32 | try: 33 | docket = scrape_docket(record.id, cpool) 34 | docket._created = record._created 35 | docket.stats = record.stats 36 | print '[%s] Scraped docket %s...' 
% (os.getpid(), docket.id) 37 | num_succeeded.increment() 38 | break 39 | except DoesNotExist: 40 | error = sys.exc_info() 41 | print 'Warning: scrape failed on try %s with server exception: %s' % (i, error[1]) 42 | # no need to try three times 43 | break 44 | except KeyboardInterrupt: 45 | raise 46 | except RateLimitException: 47 | print '### Warning: scrape failed on try %s because of RATE LIMIT' % i 48 | time.sleep(3600) 49 | except: 50 | error = sys.exc_info() 51 | print 'Warning: scrape failed on try %s' % i 52 | 53 | # catch errors 54 | if error or not docket: 55 | docket = record 56 | docket.scraped = 'failed' 57 | if error: 58 | print 'Scrape of %s failed because of %s' % (docket.id, str(error)) 59 | num_failed.increment() 60 | 61 | try: 62 | docket.save() 63 | except: 64 | print "Warning: database save failed on document %s (scraped based on original doc ID %s)." % (docket.id, record.id) 65 | 66 | def worker(todo_queue, num_succeeded, num_failed): 67 | pid = os.getpid() 68 | cpool = urllib3.PoolManager(maxsize=2) 69 | 70 | print '[%s] Worker started.' % pid 71 | 72 | while True: 73 | record = todo_queue.get() 74 | 75 | process_record(record, num_succeeded, num_failed, cpool) 76 | 77 | todo_queue.task_done() 78 | 79 | def run(options, args): 80 | sys.stdout.write('Starting scrape...\n') 81 | sys.stdout.flush() 82 | 83 | # get workers going 84 | num_workers = options.multi 85 | 86 | todo_queue = multiprocessing.JoinableQueue(num_workers * 3) 87 | 88 | # set up some counters to track progress 89 | num_succeeded = Counter() 90 | num_failed = Counter() 91 | 92 | processes = [] 93 | for i in range(num_workers): 94 | proc = multiprocessing.Process(target=worker, args=(todo_queue, num_succeeded, num_failed)) 95 | proc.start() 96 | processes.append(proc) 97 | 98 | conditions = {'scraped': 'no'} 99 | if options.agency: 100 | conditions['agency'] = options.agency 101 | if options.docket: 102 | conditions['id'] = options.docket 103 | to_scrape = Docket.objects(**conditions) 104 | 105 | while True: 106 | try: 107 | record = to_scrape.next() 108 | except pymongo.errors.OperationFailure: 109 | to_scrape = Docket.objects(**conditions) 110 | continue 111 | except StopIteration: 112 | break 113 | 114 | todo_queue.put(record) 115 | 116 | todo_queue.join() 117 | 118 | for proc in processes: 119 | print 'Terminating worker %s...' % proc.pid 120 | proc.terminate() 121 | 122 | print 'Scrape complete with %s successes and %s failures.' % (num_succeeded.value, num_failed.value) 123 | return {'scraped': num_succeeded.value, 'failed': num_failed.value} 124 | -------------------------------------------------------------------------------- /regscrape/regsdotgov/commands/rdg_simple_update.py: -------------------------------------------------------------------------------- 1 | GEVENT = False 2 | 3 | import os 4 | import subprocess 5 | import settings 6 | import sys 7 | from search import parsed_search, result_to_model 8 | import pytz 9 | import datetime 10 | import operator 11 | import time 12 | import json 13 | import re 14 | import itertools 15 | import urllib2, httplib 16 | from regs_models import * 17 | 18 | from optparse import OptionParser 19 | arg_parser = OptionParser() 20 | arg_parser.add_option("-s", "--since", dest="since", action="store", type="string", default=None, help="Manually specify search start date.") 21 | 22 | def run(options, args): 23 | print 'Retrieving current document IDs...' 
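# Note: the subprocess call below shells out to the mongo client for speed; a
# pure-pymongo equivalent (a sketch only, assuming the same Doc model) would be
#
#     ids = set(doc['_id'] for doc in Doc._get_collection().find(
#         {'source': 'regulations.gov', 'deleted': False}, {'_id': 1}))
#
# but, per the HACK comment that follows, pulling every id through a Python cursor
# proved much slower than letting the mongo shell build the array itself.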
24 | 25 | # HACK - pull ids via shell because doing it in Python is slow 26 | count_proc = subprocess.Popen( 27 | ["mongo", settings.DB_NAME] +\ 28 | list(itertools.chain.from_iterable([("--%s" % key, str(value)) for key, value in settings.DB_SETTINGS.items()])) +\ 29 | ["--quiet", "--eval", "printjson(db.docs.find({source:'regulations.gov',deleted:false},{_id:1}).map(function(i){return i._id;}))"], 30 | stdout=subprocess.PIPE 31 | ) 32 | ids = set(json.load(count_proc.stdout)) 33 | 34 | now = datetime.datetime.now() 35 | 36 | if options.since: 37 | most_recent = datetime.datetime.strptime(options.since, "%Y-%m-%d") 38 | print "Done; start date manually set to %s and total documents indexed is %s." % (most_recent.isoformat(), len(ids)) 39 | else: 40 | print "Retrieving date of most recent document..." 41 | recent_agg = Doc._get_collection().aggregate([ 42 | { 43 | "$match": { 44 | "source": "regulations.gov", 45 | "deleted": False 46 | } 47 | }, 48 | { 49 | "$group": { 50 | "_id": 0, 51 | "max": { 52 | "$max": "$details.Date_Posted" 53 | } 54 | } 55 | } 56 | ]); 57 | most_recent = recent_agg['result'][0]['max'] 58 | 59 | print "Done; last document is from %s and total documents indexed is %s." % (most_recent.isoformat(), len(ids)) 60 | 61 | if most_recent > now: 62 | most_recent = now 63 | print "Overriding most recent to now." 64 | 65 | search_args = { 66 | # date range from one day before the most recent until one day after now 67 | "pd": "-".join([d.strftime("%m/%d/%y") for d in (most_recent - datetime.timedelta(days=1), now + datetime.timedelta(days=1))]), 68 | 69 | # order ascending by posted date to reduce pagination errors 70 | "sb": "postedDate", 71 | "so": "ASC" 72 | } 73 | 74 | # start new dumps 75 | position = 0 76 | increment = 1000 77 | stats = {'pages_downloaded': 0, 'new_records': 0, 'existing_records': 0, 'failed_saves': 0} 78 | total = parsed_search(1, 0, **search_args)['totalNumRecords'] 79 | while position <= total: 80 | page = None 81 | for i in range(3): 82 | try: 83 | current_str = (position / increment) + 1 84 | total_str = '?' if total == 1 else (total / increment) + 1 85 | print "Downloading page %s of %s..." % (current_str, total_str) 86 | 87 | page = parsed_search(increment, position, **search_args) 88 | 89 | stats['pages_downloaded'] += 1 90 | break 91 | except (urllib2.HTTPError, httplib.HTTPException) as e: 92 | if i < 2: 93 | if hasattr(e, 'code') and e.code in (503, 429) and 'rate' in e.read().lower(): 94 | print 'Download failed because of rate limiting; will retry in an hour...' 95 | time.sleep(3600) 96 | else: 97 | print 'Download failed; will retry in 10 seconds...' 98 | time.sleep(10) 99 | else: 100 | print 'System troubles; giving up.' 101 | raise 102 | 103 | for result in page.get('documents', []): 104 | if result['documentId'] in ids: 105 | stats['existing_records'] += 1 106 | else: 107 | if result.get('documentStatus', None) == "Withdrawn": 108 | continue 109 | 110 | db_doc = result_to_model(result, now=now) 111 | 112 | try: 113 | db_doc.save() 114 | stats['new_records'] += 1 115 | except: 116 | print "Failed to save document %s" % db_doc.id 117 | stats['failed_saves'] += 1 118 | 119 | position += increment 120 | 121 | print "Wrote %s new records, encountered %s existing records, and had %s failed saves." 
% (stats['new_records'], stats['existing_records'], stats['failed_saves']) 122 | 123 | return stats -------------------------------------------------------------------------------- /regscrape/regsdotgov/search.py: -------------------------------------------------------------------------------- 1 | import urllib2 2 | import json 3 | import datetime 4 | from regs_common.util import listify 5 | from regs_models import * 6 | 7 | from settings import RDG_API_KEY, DDG_API_KEY 8 | ARG_NAMES = { 9 | 'agency': 'a', 10 | 'docket': 'dktid' 11 | } 12 | 13 | FR_DOC_TYPES = set(['notice', 'rule', 'proposed_rule']) 14 | 15 | def _v1_search(per_page, position, **args): 16 | url_args = { 17 | 'api_key': RDG_API_KEY, 18 | 'rpp': per_page, 19 | 'po': position 20 | } 21 | 22 | for key, value in args.items(): 23 | url_args[ARG_NAMES[key]] = value 24 | 25 | return urllib2.urlopen( 26 | "http://regulations.gov/api/documentsearch/v1.json?" + '&'.join(['%s=%s' % arg for arg in url_args.items()]) 27 | ) 28 | 29 | def _v3_search(per_page, position, **args): 30 | url_args = { 31 | 'api_key': DDG_API_KEY, 32 | 'rpp': per_page, 33 | 'po': position 34 | } 35 | 36 | for key, value in args.items(): 37 | url_args[ARG_NAMES.get(key, key)] = value 38 | 39 | url = "http://api.data.gov/regulations/v3/documents.json?" + '&'.join(['%s=%s' % arg for arg in url_args.items()]) 40 | req = urllib2.Request(url, headers={'Accept': 'application/json,*/*'}) 41 | return urllib2.urlopen(req) 42 | 43 | search = _v3_search 44 | 45 | def parse(file): 46 | data = open(file) if type(file) in (unicode, str) else file 47 | return json.load(data) 48 | 49 | def _v1_iter_parse(file): 50 | data = parse(file) 51 | return iter(listify(data['searchresult']['documents']['document'])) 52 | 53 | def _v3_iter_parse(file): 54 | data = parse(file) 55 | return iter(data['documents']) 56 | 57 | iter_parse = _v3_iter_parse 58 | 59 | def result_to_model(doc, now=None): 60 | now = now if now is not None else datetime.datetime.now() 61 | 62 | return Doc(**{ 63 | 'id': doc['documentId'], 64 | 'title': unicode(doc.get('title', '')), 65 | 'docket_id': doc['docketId'], 66 | 'agency': doc['agencyAcronym'], 67 | 'type': DOC_TYPES[doc['documentType']], 68 | 'fr_doc': DOC_TYPES[doc['documentType']] in FR_DOC_TYPES, 69 | 'last_seen': now, 70 | 'created': now 71 | }) 72 | 73 | # convenience function that strings them together 74 | def parsed_search(per_page, position, client=None, **args): 75 | return parse(search(per_page, position, **args)) 76 | 77 | # use the search with an overridden client to get the agencies instead of the documents 78 | def get_agencies(): 79 | raise Exception("Haven't written this one yet") -------------------------------------------------------------------------------- /regscrape/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from regs_common.commands.runner import run_command 4 | 5 | if __name__ == '__main__': 6 | run_command() 7 | -------------------------------------------------------------------------------- /regscrape/sec_cftc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunlightlabs/regulations-scraper/5f2644a3cf54f915d7d90957645073737ab91022/regscrape/sec_cftc/__init__.py -------------------------------------------------------------------------------- /regscrape/sec_cftc/commands/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sunlightlabs/regulations-scraper/5f2644a3cf54f915d7d90957645073737ab91022/regscrape/sec_cftc/commands/__init__.py -------------------------------------------------------------------------------- /regscrape/sec_cftc/commands/cftc_extract_current.py: -------------------------------------------------------------------------------- 1 | GEVENT = False 2 | 3 | import urllib2, re, json, os, urlparse 4 | from pyquery import PyQuery as pq 5 | from lxml import etree 6 | from collections import OrderedDict, defaultdict 7 | from optparse import OptionParser 8 | import settings 9 | 10 | from regs_common.util import crockford_hash 11 | from regs_models import * 12 | 13 | # FIXME: split this out 14 | from sec_cftc.commands.sec_cftc_import import view_from_url 15 | 16 | # arguments 17 | arg_parser = OptionParser() 18 | arg_parser.add_option("-v", "--verbose", action="store_true", dest="verbose", default=False) 19 | arg_parser.add_option("-d", "--docket", dest="docket", action="store", type="string", default=None, help="Specify a docket to which to limit the dump.") 20 | arg_parser.add_option("-D", "--document", dest="document", action="store", type="string", default=None, help="Specify a document to which to limit the dump.") 21 | 22 | def run(options, args): 23 | query = {'scraped': 'no', 'source': 'sec_cftc', 'agency': 'CFTC', 'views__downloaded': 'yes'} 24 | 25 | if options.docket: 26 | query['docket_id'] = options.docket 27 | 28 | if options.document: 29 | query['id'] = options.document 30 | 31 | parser = etree.HTMLParser() 32 | for doc in Doc.objects(**query): 33 | print "Processing %s..." % doc.id 34 | page_data = open(doc.views[0].file_path).read() 35 | page = pq(etree.fromstring(page_data, parser)) 36 | 37 | text_block = page('.dyn_wrap div.ClearBoth') 38 | text = text_block.html().strip() if len(text_block) else "" 39 | full_text = "<html><body>%s</body></html>" % text 40 | 41 | if doc.views[0].content: 42 | doc.views[0].content.delete() 43 | 44 | doc.views[0].content.new_file() 45 | doc.views[0].content.write(full_text.encode('utf8')) 46 | doc.views[0].content.close() 47 | 48 | doc.views[0].extracted = 'yes' 49 | print "Found and wrote text." 50 | 51 | print "attachment" 52 | attachment_link = page('.dyn_wrap a[id*=StaticLink]') 53 | if attachment_link: 54 | att_url = urlparse.urljoin(doc.views[0].url, attachment_link.attr('href').strip()) 55 | 56 | att = Attachment() 57 | att.title = page('.dyn_wrap a[id*=AssetAttachment]').text().strip() 58 | 59 | att_view = view_from_url(att_url) 60 | if 'pdf' in att_url.lower(): 61 | att_view.type = 'xpdf' 62 | att.views.append(att_view) 63 | att.object_id = att_view.object_id 64 | 65 | doc.attachments = [att] 66 | 67 | print "Found and saved attachment %s." % att_url 68 | else: 69 | print "No attachment found." 
70 | 71 | doc.scraped = 'yes' 72 | doc.save() -------------------------------------------------------------------------------- /regscrape/sec_cftc/commands/cftc_scrape_documents.py: -------------------------------------------------------------------------------- 1 | GEVENT = False 2 | 3 | import urllib2, re, json, os, sys, operator, string, urlparse, urllib, cookielib 4 | from pyquery import PyQuery as pq 5 | from lxml import etree 6 | from collections import OrderedDict, defaultdict 7 | import settings 8 | from optparse import OptionParser 9 | 10 | from regs_common.util import crockford_hash 11 | from regs_common.exceptions import ExtractionFailed 12 | 13 | # arguments 14 | arg_parser = OptionParser() 15 | arg_parser.add_option("-v", "--verbose", action="store_true", dest="verbose", default=False) 16 | arg_parser.add_option("-d", "--docket", dest="docket", action="store", type="string", default=None, help="Specify a docket to which to limit the dump.") 17 | arg_parser.add_option("-s", "--strategy", dest="strategy", action="store", type="string", default=None, help="Restrict scraping to a single strategy.") 18 | 19 | parser = etree.HTMLParser() 20 | 21 | def fix_spaces(text): 22 | return re.sub(u"[\s\xa0]+", " ", text) 23 | 24 | def parse_current_docket(docket_record): 25 | # grab the file with the URL mangled slightly to grab 100k records 26 | docket_file = urllib2.urlopen(docket_record['url'] + "&ctl00_ctl00_cphContentMain_MainContent_gvCommentListChangePage=1_100000").read() 27 | page = pq(etree.fromstring(docket_file, parser)) 28 | 29 | docket = dict(docket_record) 30 | 31 | docket['title'] = page('.dyn_wrap h1').text().strip() 32 | assert docket['title'], 'no title found' 33 | 34 | headers = [item.text().strip() for item in page('.rgMasterTable thead th').items()] 35 | 36 | docket['comments'] = [] 37 | 38 | # check if there's a no-records message 39 | if len(page('.rgMasterTable .rgNoRecords')): 40 | return docket 41 | 42 | for row in page('.rgMasterTable tbody tr').items(): 43 | tds = row.find('td') 44 | cell_text = [item.text().strip() for item in tds.items()] 45 | cdata = dict(zip(headers, cell_text)) 46 | 47 | link = pq(tds[-1]).find('a') 48 | 49 | doc = { 50 | 'url': urlparse.urljoin(docket['url'], link.attr('href')), 51 | 'details': {}, 52 | 'release': [fix_spaces(cdata['Release'])], 53 | 'date': cdata['Date Received'], 54 | 'doctype': 'public_submission', 55 | } 56 | 57 | vc_matches = re.findall(r"ViewComment\.aspx\?id=(\d+)", doc['url']) 58 | if vc_matches: 59 | doc['id'] = vc_matches[0] 60 | doc['subtype'] = 'comment' 61 | detail_columns = ['Organization', 'First Name', 'Last Name'] 62 | else: 63 | ep_matches = re.findall(r"ViewExParte\.aspx\?id=(\d+)", doc['url']) 64 | if ep_matches: 65 | doc['id'] = "EP-%s" % ep_matches[0] 66 | doc['subtype'] = 'exparte' 67 | detail_columns = ['Organization'] 68 | else: 69 | assert False, "expected either comment or exparte link: %s" % doc['url'] 70 | 71 | for rdg_label, cftc_label in (('Organization Name', 'Organization'), ('First Name', 'First Name'), ('Last Name', 'Last Name')): 72 | if cftc_label in detail_columns and cdata[cftc_label]: 73 | doc['details'][rdg_label] = cdata[cftc_label] 74 | 75 | docket['comments'].append(doc) 76 | 77 | assert len(docket['comments']) < 100000, "we probably exceeded one page" 78 | 79 | # then strip out all the ones that aren't about this document 80 | release = fix_spaces(page('a[id*=rptReleases_hlReleaseLink]').text().strip()) 81 | docket['comments'] = [comment for comment in docket['comments'] if 
comment['release'][0] == release] 82 | 83 | return docket 84 | 85 | def parse_old_docket(docket_record): 86 | docket_file = urllib2.urlopen(docket_record['url']).read() 87 | page = pq(etree.fromstring(docket_file, parser)) 88 | 89 | docket = dict(docket_record) 90 | 91 | release = page('ul.text p a').text().strip() 92 | if not re.match("\d+ FR \d+", release): 93 | release = None 94 | 95 | # hackery to get the title 96 | para_lines = [chunk.strip() for chunk in page('ul.text p a').parent().html().split("</a>")[-1].replace(" ", " ").split("<br />") if chunk.strip()] 97 | docket['title'] = para_lines[0] 98 | 99 | docket['comments'] = [] 100 | 101 | for row in page('.list-release .row').items(): 102 | date = row('.column-date').text().strip() 103 | if not date: 104 | # this is an FR document 105 | item = row('.column-item') 106 | label = item.text().strip() 107 | assert re.match('\d+ FR \d+', label), "Expected FR citation, got: %s" % label 108 | 109 | link = item.find('a') 110 | frnum = re.findall("[A-Z0-9-]+", link.attr('href').rsplit("/", 1)[-1]) 111 | assert frnum, "expected FR num" 112 | doc = { 113 | 'id': frnum[0], 114 | 'title': label, 115 | 'details': { 116 | 'Federal Register Citation': label, 117 | 'Federal Register Number': frnum[0] 118 | }, 119 | 'url': urlparse.urljoin(docket_record['url'], link.attr('href')), 120 | 'doctype': 'Federal Register Release' 121 | } 122 | else: 123 | # this is a comment 124 | desc = row('.column-comment, .column-item') 125 | link = desc('a') 126 | link_label = link.text().strip() 127 | 128 | ll_is_id = re.match("^[A-Z]{2}\d+$", link_label) 129 | 130 | doc = { 131 | 'date': date, 132 | 'url': urlparse.urljoin(docket_record['url'], link.attr('href')), 133 | 'title': re.split(r"<br ?/?>", desc.html().strip())[1].strip() if ll_is_id else link_label, 134 | 'details': {}, 135 | 'doctype': 'public_submission' 136 | } 137 | if ll_is_id: 138 | doc['id'] = link_label 139 | if release: 140 | doc['release'] = [release] 141 | pages = row('.column-pages') 142 | if len(pages): 143 | doc['details']['Pages'] = pages.text().strip() 144 | 145 | docket['comments'].append(doc) 146 | 147 | return docket 148 | 149 | def is_ancient_label(text): 150 | return re.match("[A-Z ]+:", text) 151 | 152 | def parse_ancient_docket(docket_record): 153 | page_url = docket_record['url'] 154 | 155 | docket = dict(docket_record) 156 | docket['comments'] = [] 157 | 158 | while True: 159 | page_data = urllib2.urlopen(page_url).read() 160 | page = pq(etree.fromstring(page_data, parser)) 161 | 162 | groups = [] 163 | group = [] 164 | first_divider = False 165 | for table in page('table').items(): 166 | divider = table.find('font[color*="#808000"]') 167 | if len(divider) and re.match(r".*-{10,}.*", divider.text()): 168 | if not first_divider: 169 | first_divider = True 170 | continue 171 | if group: 172 | groups.append(group) 173 | group = [] 174 | elif first_divider: 175 | group.append(table) 176 | 177 | for group in groups: 178 | cells = pq([g[0] for g in group]).find('td') 179 | 180 | doc = { 181 | 'title': fix_spaces(" ".join([item.text() for item in pq([g[0] for g in group[1:]]).find('td b font').items()])), 182 | 'details': {}, 183 | 'url': None, 184 | } 185 | 186 | for i in range(len(cells)): 187 | text = fix_spaces(cells.eq(i).text().strip()) 188 | if is_ancient_label(text): 189 | next_text = fix_spaces(cells.eq(i + 1).text().strip()) 190 | next_text = next_text if not is_ancient_label(next_text) else None 191 | 192 | if next_text: 193 | if text == "DOCUMENT:": 194 | # we need yet another 
cell 195 | doc['id'] = next_text + fix_spaces(cells.eq(i + 2).text().strip()) 196 | 197 | if 'CL' in doc['id']: 198 | doc['doctype'] = 'public_submission' 199 | elif 'NC' in doc['id']: 200 | doc['doctype'] = 'other' 201 | elif 'FR' in doc['id']: 202 | ltitle = doc['title'].lower() 203 | if 'proposed' in ltitle: 204 | doc['doctype'] = 'proposed_rule' 205 | elif 'final' in ltitle: 206 | doc['doctype'] = 'rule' 207 | else: 208 | doc['doctype'] = 'notice' 209 | elif text == "DATE:": 210 | doc['date'] = next_text 211 | elif text == "FR PAGE:" and "N/A" not in next_text.upper(): 212 | doc['details']['Federal Register Page'] = next_text 213 | elif text == "PAGES:": 214 | doc['details']['Pages'] = next_text 215 | elif text == "PDF SIZE:": 216 | doc['details']['PDF Size'] = next_text 217 | elif text == "PDF LINK:": 218 | link = cells.eq(i + 1).find('a') 219 | if len(link): 220 | doc['url'] = urlparse.urljoin(page_url, link.attr('href')) 221 | docket['comments'].append(doc) 222 | 223 | # grab the 'next' link 224 | next_link = [a for a in page('a[href*=foi]').items() if 'Next' in a.text()] 225 | if next_link: 226 | next_url = urlparse.urljoin(page_url, next_link[0].attr('href')) 227 | if next_url != page_url: 228 | page_url = next_url 229 | else: 230 | # apparently sometimes "next" points to the current page -- bail if so 231 | break 232 | else: 233 | break 234 | return docket 235 | 236 | def parse_sirt_docket(docket_record): 237 | # okay, this one requires loading a paginated version, then checking a box that says "show all" to get everything... 238 | # which is arduous and stupid because it's a yucky ASP app. 239 | 240 | cj = cookielib.CookieJar() 241 | opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj)) 242 | initial = pq(opener.open(docket_record['url']).read()) 243 | 244 | error_header = initial("h4") 245 | if len(error_header) and "sorry" in error_header.text().lower(): 246 | raise ExtractionFailed("This URL doesn't work.") 247 | 248 | formdata = urllib.urlencode(( 249 | ('__EVENTTARGET', 'ctl00$cphContentMain$GenericWebUserControl$ShowAllCheckBox'), 250 | ('__EVENTARGUMENT', ''), 251 | ('__LASTFOCUS', ''), 252 | ('__VIEWSTATE', initial('#__VIEWSTATE').val()), 253 | ('__EVENTVALIDATION', initial('#__EVENTVALIDATION').val()), 254 | ('ctl00$masterScriptManager', ''), 255 | ('ctl00$cphContentMain$GenericWebUserControl$ShowAllCheckBox', 'on') 256 | )) 257 | 258 | page = pq(opener.open(docket_record['url'], data=formdata).read()) 259 | 260 | docket = dict(docket_record) 261 | 262 | details = dict([re.split(r"\s*:\s*", row.strip()) for row in re.split(r"<br ?/?>", page('h5.QueryTitle').html()) if row.strip()]) 263 | 264 | if 'details' not in docket: 265 | docket['details'] = {} 266 | 267 | if 'Filing Description' in details: 268 | docket['title'] = details['Filing Description'] 269 | 270 | if 'Organization' in details: 271 | docket['details']['Organization Name'] = details['Organization'] 272 | 273 | if 'Status' in details: 274 | docket['details']['Status'] = details['Status'] 275 | 276 | docket['comments'] = [] 277 | 278 | for link in page('.gradient-style tr td a').items(): 279 | doc = { 280 | 'url': urlparse.urljoin(docket_record['url'], link.attr('href')), 281 | 'title': fix_spaces(link.text().strip()), 282 | 'details': {}, 283 | } 284 | doc['doctype'] = 'public_submission' if 'comment' in doc['title'].lower() else 'other' 285 | doc['id'] = crockford_hash(doc['url']) 286 | 287 | docket['comments'].append(doc) 288 | 289 | return docket 290 | 291 | 292 | def run(options, args): 293 | 
dockets = json.load(open(os.path.join(settings.DUMP_DIR, "cftc_dockets.json"))) 294 | 295 | stats = {'fetched': 0, 'skipped': 0, 'failed': 0} 296 | 297 | docket_dir = os.path.join(settings.DUMP_DIR, "cftc_dockets") 298 | if not os.path.exists(docket_dir): 299 | os.mkdir(docket_dir) 300 | 301 | for i, docket in enumerate(dockets.itervalues()): 302 | if options.docket and docket['id'] != options.docket: 303 | continue 304 | 305 | if options.strategy and docket['strategy'] != options.strategy: 306 | continue 307 | 308 | if 'url' in docket: 309 | print 'Fetching %s...' % docket['id'] 310 | print i, json.dumps(docket) 311 | try: 312 | fetched = globals()['parse_%s_docket' % docket['strategy']](docket) 313 | except ExtractionFailed: 314 | print "FAILED to scrape docket data for %s" % docket['id'] 315 | stats['failed'] += 1 316 | continue 317 | 318 | if options.verbose: 319 | print json.dumps(fetched, indent=4) 320 | 321 | outfile = open(os.path.join(docket_dir, "%s.json" % docket['id']), "wb") 322 | json.dump(fetched, outfile, indent=4) 323 | outfile.close() 324 | 325 | stats['fetched'] += 1 326 | else: 327 | print 'Skipping %s.' % docket['id'] 328 | stats['skipped'] += 1 329 | 330 | print "Fetched %s dockets; skipped %s dockets; failed on %s dockets." % (stats['fetched'], stats['skipped'], stats['failed']) 331 | return stats -------------------------------------------------------------------------------- /regscrape/sec_cftc/commands/sec_cftc_import.py: -------------------------------------------------------------------------------- 1 | import json, re, urlparse 2 | from dateutil.parser import parse as parse_date 3 | import datetime 4 | 5 | import settings 6 | from regs_models import * 7 | from regs_common.util import * 8 | from mongoengine.errors import NotUniqueError 9 | 10 | from optparse import OptionParser 11 | 12 | # arguments 13 | arg_parser = OptionParser() 14 | arg_parser.add_option("-v", "--verbose", action="store_true", dest="verbose", default=False) 15 | arg_parser.add_option("-u", "--update", dest="update", action="store_true", default=False, help="Update existing records.") 16 | arg_parser.add_option("-a", "--agency", dest="agency", action="store", default=None, help="Restrict import to a single agency.") 17 | 18 | GEVENT = False 19 | 20 | type_mappings = { 21 | "notice": "notice", 22 | "other": "other", 23 | "proposed_rule": "proposed_rule", 24 | "public_submission": "public_submission", 25 | "rule": "rule", 26 | "General CFTC": "other", 27 | "Industry Filing": "other", 28 | "Orders and Other Announcements": "other", 29 | "Press Release": "other", 30 | "Privacy Act Systems": "other", 31 | "Proposed Rule": "proposed_rule", 32 | "Public Information Collection": "notice", 33 | "Sunshine Act": "other", 34 | "concept": "notice", 35 | "final": "rule", 36 | "interim-final-temp": "rule", 37 | "interp": "notice", 38 | "other": "other", 39 | "petitions": "other", 40 | "policy": "other", 41 | "proposed": "proposed_rule", 42 | # FIXME: this is terrible; should actually figure out what it is 43 | "Federal Register Release": "notice", 44 | } 45 | file_mapping = { 46 | 'pdf': 'xpdf', 47 | 'html': 'html', 48 | 'htm': 'html' 49 | } 50 | 51 | nineteen_hundred = parse_date("1900-01-01") 52 | 53 | def docket_record_to_model(record, agency): 54 | dkt = Docket() 55 | 56 | dkt.id = "%s-X-%s" % (agency, record['id']) 57 | dkt.agency = agency 58 | dkt.details['Source_ID'] = record['id'] 59 | 60 | if 'title' in record and record['title']: 61 | dkt.title = record['title'] 62 | 63 | if record.get('url', 
None): 64 | dkt['details']['Source_URL'] = record['url'] 65 | 66 | if record.get('type', None): 67 | dkt['details']['Type'] = record['type'] 68 | 69 | if record.get('subtype', None): 70 | dkt['details']['Subtype'] = record['subtype'] 71 | 72 | dkt.source = 'sec_cftc' 73 | dkt.scraped = 'no' 74 | 75 | return dkt 76 | 77 | def docket_record_from_id(docket_id, agency): 78 | dkt = Docket() 79 | 80 | dkt.id = docket_id 81 | dkt.agency = agency 82 | dkt.source = 'sec_cftc' 83 | dkt.scraped = 'no' 84 | 85 | return dkt 86 | 87 | def view_from_url(url): 88 | view = View() 89 | # strip fragments 90 | view.url = re.sub(r"#.*", "", url).strip() 91 | 92 | ext_matches = re.findall(r"\.([A-Za-z]+)$", view.url) 93 | if ext_matches: 94 | view.type = file_mapping.get(ext_matches[0], ext_matches[0]) 95 | else: 96 | view.type = 'html' 97 | 98 | view.object_id = crockford_hash(view.url) 99 | 100 | return view 101 | 102 | def fr_doc_record_to_model(record, agency): 103 | doc = Doc() 104 | 105 | if record['file_info']: 106 | file_info = record['file_info'][0] 107 | file_id = file_info['parent' if 'parent' in file_info else 'id'] 108 | else: 109 | file_id = crockford_hash(record['id']) 110 | 111 | doc.docket_id = "%s-X-%s" % (agency, file_id) 112 | doc.id = "%s-%s" % (doc.docket_id, record['id'].encode('ascii', 'ignore')) 113 | 114 | doc.type = type_mappings[record['doctype']] 115 | 116 | if 'title' in record: 117 | doc.title = record['title'] 118 | else: 119 | doc.title = record['id'] 120 | 121 | doc.agency = agency 122 | doc.source = 'sec_cftc' 123 | doc.scraped = 'yes' 124 | 125 | doc.details = {k.replace(" ", "_").replace(".", ""): v for k, v in record.get("details", {}).iteritems()} 126 | if record.get('date', None) and record['date'].strip(): 127 | parsed_date = parse_date(record['date'].strip()) 128 | if parsed_date > nineteen_hundred: 129 | doc.details['Date_Posted'] = parsed_date 130 | 131 | if record.get('description', None): 132 | doc.abstract = record['description'] 133 | 134 | doc.fr_doc = doc.type in ('rule', 'proposed_rule', 'notice') 135 | 136 | doc.created = datetime.datetime.now() 137 | 138 | if record.get('url', None): 139 | doc.views.append(view_from_url(record['url'])) 140 | 141 | for att in record.get('attachments', []): 142 | attachment = Attachment() 143 | attachment.title = att['title'] 144 | for v in att['views']: 145 | view = view_from_url(v['url']) 146 | if 'type' in v: 147 | view.type = v['type'] 148 | attachment.views.append(view) 149 | if attachment.views: 150 | attachment.object_id = attachment.views[0].object_id 151 | doc.attachments.append(attachment) 152 | 153 | return doc 154 | 155 | 156 | def comment_record_to_model(record, agency, docket_id): 157 | doc = Doc() 158 | 159 | doc.docket_id = docket_id 160 | doc.id = "%s-%s" % (doc.docket_id, record['id']) 161 | 162 | doc.type = type_mappings[record['doctype']] 163 | 164 | if 'title' in record: 165 | doc.title = record['title'] 166 | else: 167 | parts = [] 168 | if 'First Name' in record['details']: 169 | parts.append(record['details']['First Name'] + (" " + record['details']['Last Name']) if 'Last Name' in record['details'] else "") 170 | if 'Organization Name' in record['details']: 171 | parts.append(record['details']['Organization Name']) 172 | doc.title = "Comment from %s" % ", ".join(parts) 173 | 174 | if not doc.title: 175 | doc.title = record['id'] 176 | 177 | doc.agency = agency 178 | doc.source = 'sec_cftc' 179 | 180 | if agency == 'CFTC' and 'comments.cftc.gov' in (record.get('url', '') or ''): 181 | doc.scraped = 'no' 
182 | else: 183 | doc.scraped = 'yes' 184 | 185 | doc.details = {k.replace(" ", "_"): v for k, v in record.get("details", {}).iteritems()} 186 | if record.get('date', None) and record['date'].strip(): 187 | try: 188 | parsed_date = parse_date(record['date'].strip()) 189 | if parsed_date > nineteen_hundred: 190 | doc.details['Date_Posted'] = parsed_date 191 | except: 192 | pass 193 | 194 | if 'num_received' in record: 195 | doc.details['Number_of_Duplicate_Submissions'] = record['num_received'] 196 | 197 | if record.get('description', None): 198 | doc.abstract = record['description'] 199 | 200 | doc.fr_doc = doc.type in ('rule', 'proposed_rule', 'notice') 201 | 202 | doc.created = datetime.datetime.now() 203 | 204 | if record.get('url', None): 205 | doc.views.append(view_from_url(record['url'])) 206 | 207 | for att in record.get('attachments', []): 208 | attachment = Attachment() 209 | attachment.title = att['title'] 210 | attachment.views.append(view_from_url(att['url'])) 211 | attachment.object_id = attachment.views[0].object_id 212 | doc.attachments.append(attachment) 213 | 214 | return doc 215 | 216 | def run(options, args): 217 | for agency in (options.agency,) if options.agency else ('CFTC', 'SEC'): 218 | lagency = agency.lower() 219 | 220 | all_dockets = {} 221 | dockets_for_saving = {} 222 | 223 | # first load the docket file 224 | dockets = json.load(open(os.path.join(settings.DUMP_DIR, "%s_dockets.json" % lagency))) 225 | docket_dir = os.path.join(settings.DUMP_DIR, "%s_dockets" % lagency) 226 | 227 | # next deal with the FR documents 228 | doc_by_identifier = {} 229 | cftc_ancient_mapping = {} 230 | all_fr_docs = [] 231 | dockets_seen = set() 232 | 233 | fr_docs = json.load(open(os.path.join(settings.DUMP_DIR, "%s_fr_docs.json" % lagency))) 234 | for doc in fr_docs: 235 | if 'id' not in doc and 'url' in doc: 236 | doc['id'] = crockford_hash(doc['url']) 237 | 238 | if 'doctype' not in doc: 239 | doc['doctype'] = 'Federal Register Release' 240 | 241 | print "Processing FR doc %s in %s..." % (doc['id'].encode('utf8'), agency) 242 | dc = fr_doc_record_to_model(doc, agency) 243 | for identifier in (doc['id'], dc.details.get('Federal_Register_Number', None), dc.details.get('Federal_Register_Citation', None)): 244 | if identifier: 245 | doc_by_identifier[identifier] = dc 246 | 247 | # treat ancient CFTC FR docs specially because they'll show up again in the listing, so don't double count 248 | if agency == 'CFTC' and doc['strategy'] == 'ancient': 249 | if 'Federal_Register_Citation' in dc.details: 250 | cftc_ancient_mapping[dc.details['Federal_Register_Citation'].split(" FR ")[-1]] = dc 251 | else: 252 | all_fr_docs.append(dc) 253 | dockets_seen.add(dc.docket_id) 254 | 255 | # now load docket files one by one and deal with docket records and comments 256 | all_comments = [] 257 | for record in dockets.itervalues(): 258 | json_file = os.path.join(docket_dir, "%s.json" % record['id']) 259 | 260 | file_exists = os.path.exists(json_file) 261 | 262 | if file_exists: 263 | full_record = json.load(open(json_file)) 264 | 265 | print "Processing docket %s in %s..." 
% (record['id'], agency) 266 | 267 | dkt = docket_record_to_model(full_record if file_exists else record, agency) 268 | all_dockets[dkt.id] = dkt 269 | 270 | if 'parent' in record: 271 | dkt.details['Parent'] = '%s-X-%s' % (agency, record['parent']) 272 | else: 273 | dockets_for_saving[dkt.id] = dkt 274 | 275 | if not file_exists: 276 | continue 277 | 278 | for comment_record in full_record['comments']: 279 | if 'doctype' not in comment_record: 280 | comment_record['doctype'] = 'public_submission' 281 | 282 | if 'id' not in comment_record and 'url' in comment_record: 283 | comment_record['id'] = crockford_hash(comment_record['url']) 284 | 285 | print "Processing comment %s in %s..." % (comment_record['id'], dkt.id) 286 | cmt = comment_record_to_model(comment_record, agency, dkt.details['Parent'] if 'Parent' in dkt.details else dkt.id) 287 | 288 | if comment_record.get('release', None): 289 | release = comment_record['release'][0] 290 | if release in doc_by_identifier: 291 | cmt.comment_on = {'document_id': doc_by_identifier[release].id} 292 | 293 | if 'Federal Register Page' in comment_record: 294 | cmt.title = cftc_ancient_mapping[comment_record['details']['Federal Register Page']].title 295 | 296 | all_comments.append(cmt) 297 | 298 | print len(all_dockets), len(all_fr_docs), len(all_comments) 299 | 300 | # make sure we have docket records for all dockets that have documents in them 301 | for docket_id in dockets_seen: 302 | if docket_id not in dockets_for_saving: 303 | simple_dkt = docket_record_from_id(docket_id, agency) 304 | dockets_for_saving[docket_id] = simple_dkt 305 | 306 | for dkt in dockets_for_saving.itervalues(): 307 | try: 308 | print "Attempting to save docket %s..." % dkt.id 309 | dkt.save(force_insert=True) 310 | print "Docket %s saved." % dkt.id 311 | except NotUniqueError: 312 | print "Docket %s already exists." % dkt.id 313 | if options.update: 314 | print "Fetching docket %s for update..." % dkt.id 315 | 316 | # fetch the current one 317 | current = Docket.objects.get(id=dkt.id) 318 | if dkt.title: 319 | current.title = dkt.title 320 | 321 | current.details = dkt.details 322 | current.source = dkt.source 323 | current.agency = dkt.agency 324 | 325 | if current.scraped != 'yes': 326 | current.scraped = dkt.scraped 327 | 328 | current.save() 329 | 330 | for doc_set in (all_fr_docs, all_comments): 331 | for doc_obj in doc_set: 332 | try: 333 | print "Attempting to save document %s..." % doc_obj.id 334 | doc_obj.save(force_insert=True) 335 | print "Document %s saved." % doc_obj.id 336 | except NotUniqueError: 337 | print "Document %s already exists." % doc_obj.id 338 | if options.update: 339 | print "Fetching document %s for update..." 
% doc_obj.id 340 | 341 | # fetch the current one 342 | current = Doc.objects.get(id=doc_obj.id) 343 | current.title = doc_obj.title 344 | current.details = doc_obj.details 345 | 346 | if len(current.views) != len(doc_obj.views): 347 | current.views = doc_obj.views 348 | 349 | if len(current.attachments) != len(doc_obj.attachments): 350 | current.attachments = doc_obj.attachments 351 | 352 | current.save() 353 | -------------------------------------------------------------------------------- /regscrape/sec_cftc/commands/sec_cftc_name_dockets.py: -------------------------------------------------------------------------------- 1 | GEVENT = False 2 | 3 | from regs_models import * 4 | import datetime 5 | 6 | def run(): 7 | for docket in Docket.objects(source="sec_cftc", scraped="no"): 8 | now = datetime.datetime.now() 9 | if not docket.title: 10 | candidates = list(Doc.objects(docket_id=docket.id, type__in=("rule", "proposed_rule", "notice"))) 11 | candidates = sorted(candidates, key=lambda c: c.details.get('Date_Posted', now)) 12 | 13 | # also consider type "other", but they're worse 14 | worse_candidates = list(Doc.objects(docket_id=docket.id, type="other")) 15 | worse_candidates = sorted(worse_candidates, key=lambda c: c.details.get('Date_Posted', now)) 16 | 17 | candidates = candidates + worse_candidates 18 | 19 | if candidates: 20 | ctitle = candidates[0].title 21 | else: 22 | ctitle = docket.id 23 | 24 | print "For docket %s, proposing title: %s" % (docket.id, ctitle) 25 | 26 | docket.title = ctitle 27 | 28 | docket.scraped = 'yes' 29 | 30 | docket.save() -------------------------------------------------------------------------------- /regscrape/sec_cftc/commands/suppress_duplicates.py: -------------------------------------------------------------------------------- 1 | GEVENT = False 2 | 3 | from regs_models import * 4 | 5 | import settings 6 | import rawes 7 | 8 | def run(): 9 | es = rawes.Elastic(getattr(settings, "ES_HOST", 'thrift://localhost:9500'), timeout=60.0) 10 | index = getattr(es, settings.ES_INDEX) 11 | 12 | records = { 13 | 'sec_cftc': {}, 14 | 'regulations.gov': {} 15 | } 16 | 17 | for doc in Doc.objects(type__in=['notice', 'proposed_rule', 'rule'], agency__in=['SEC', 'CFTC']): 18 | # first check the annotation 19 | if 'fr_data' in doc.annotations and doc.annotations['fr_data']: 20 | #print 'annotation', doc.source, doc.id, doc.annotations['fr_data']['document_number'] 21 | records[doc.source][doc.annotations['fr_data']['document_number']] = doc 22 | elif 'Federal_Register_Number' in doc.details: 23 | #print 'detail', doc.source, doc.id, doc.details['Federal_Register_Number'] 24 | frn = doc.details['Federal_Register_Number'] 25 | # trim leading zeros from the second part 26 | if "-" in frn: 27 | frnp = frn.split("-") 28 | frn = "-".join(frnp[:-1] + [frnp[-1].lstrip('0')]) 29 | records[doc.source][frn] = doc 30 | 31 | overlap = records['sec_cftc'].viewkeys() & records['regulations.gov'].viewkeys() 32 | for frid in overlap: 33 | winner = records['sec_cftc'][frid] 34 | loser = records['regulations.gov'][frid] 35 | 36 | winner_dkt = Docket.objects.get(id=winner.docket_id) 37 | loser_dkt = Docket.objects.get(id=loser.docket_id) 38 | 39 | for w, l in ((winner, loser), (winner_dkt, loser_dkt)): 40 | replaces = set(w.suppression.get('replaces', [])) 41 | replaces.add(l.id) 42 | w.suppression['replaces'] = list(replaces) 43 | 44 | replaced_by = set(l.suppression.get('replaced_by', [])) 45 | replaced_by.add(w.id) 46 | l.suppression['replaced_by'] = list(replaced_by) 47 | 48 | 
l.save() 49 | w.save() 50 | 51 | try: 52 | index.docket.delete(loser_dkt.id) 53 | index.document.delete(loser.id) 54 | except: # the superseded regulations.gov copy may never have been indexed, so ignore failed deletes 55 | pass 56 | 57 | print '%s suppresses %s' % (winner.id, loser.id) -------------------------------------------------------------------------------- /regscrape/settings.py: -------------------------------------------------------------------------------- 1 | TARGET_SERVER = 'www.regulations.gov' 2 | DEBUG = True 3 | DB_NAME = 'regulations' 4 | ES_HOST = 'thrift://localhost:9500' 5 | ES_INDEX = 'regulations' 6 | DATA_DIR = '/data' 7 | EXTRACTORS = 2 8 | 9 | DUMP_START = 0 10 | DUMP_END = 3850000 11 | DUMP_INCREMENT = 10000 12 | MAX_WAIT = 600 13 | CHUNK_SIZE = 10 14 | FILTER = {} 15 | 16 | INSTANCES = 2 17 | THREADS_PER_INSTANCE = 2 18 | 19 | SITES = ['regsdotgov', 'sec_cftc'] 20 | 21 | try: 22 | from local_settings import * 23 | except ImportError: # local_settings.py is optional 24 | pass 25 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | gevent 2 | pytz 3 | lxml 4 | Pillow 5 | -e git+https://github.com/jodal/pykka.git@v0.12#egg=pykka 6 | pymongo 7 | -e git+https://github.com/mikemaccana/python-docx.git#egg=docx 8 | -e git+https://github.com/sunlightlabs/oxtail.git#egg=oxtail 9 | -e git+https://github.com/sunlightlabs/python-transparencydata#egg=transparencydata 10 | -e git+https://github.com/sunlightlabs/name-cleaver.git#egg=name_cleaver 11 | -e git+https://github.com/apendleton/mincemeatpy.git#egg=mincemeat 12 | -e git+https://github.com/sunlightlabs/regs-models.git#egg=regs_models 13 | isoweek 14 | 15 | # redis-based sync requirements 16 | hiredis 17 | redis 18 | 19 | mongoengine 20 | html2text 21 | urllib3 22 | rawes 23 | 24 | pyquery 25 | crockford 26 | python-dateutil 27 | PyYAML 28 | jellyfish --------------------------------------------------------------------------------