├── readme.txt
├── lid
│   ├── __init__.py
│   ├── etl
│   │   ├── __init__.py
│   │   ├── load_constitutions_into_elasticsearch.py
│   │   ├── extractors.py
│   │   └── scrapers.py
│   ├── utils
│   │   ├── __init__.py
│   │   ├── sunlight_utils.py
│   │   ├── text_cleaning.py
│   │   └── general_utils.py
│   ├── evaluation
│   │   ├── __init__.py
│   │   └── bills_for_evaluation_set.py
│   ├── .DS_Store
│   ├── config.py
│   ├── frontend.py
│   └── alignment_classifier.py
├── html
│   ├── bootstrap3
│   │   ├── css
│   │   │   ├── .Rhistory
│   │   │   └── custom.css
│   │   ├── fonts
│   │   │   ├── glyphicons-halflings-regular.eot
│   │   │   ├── glyphicons-halflings-regular.ttf
│   │   │   ├── glyphicons-halflings-regular.woff
│   │   │   └── glyphicons-halflings-regular.woff2
│   │   └── js
│   │       └── npm.js
│   ├── index.html
│   └── templates
│       └── searchdemo.html.jinja
├── requirements.txt
├── archive
│   ├── input
│   │   ├── .download_bulk_sunlight_files.sh.swp
│   │   ├── unzip_bulk_files.sh
│   │   ├── state_metadata.sql
│   │   ├── committee_metadata.sql
│   │   ├── opensecrets
│   │   │   └── opensecrets_candidates.sql
│   │   ├── bill_metadata.sql
│   │   ├── lobbyists
│   │   │   └── compile_lobbyist_lists.sh
│   │   ├── campaign_contributions.sh
│   │   ├── download_bulk_sunlight_files.sh
│   │   ├── legislators.sql
│   │   ├── Drakefile
│   │   ├── state_metadata.py
│   │   ├── committee_metadata.py
│   │   ├── bill_metadata.py
│   │   └── legislators.py
│   ├── prototype_text_alignment_algorithms.py
│   ├── score_alignments.py
│   ├── exploratory.R
│   ├── tfidf_ranking.py
│   └── classifier.py
├── scripts
│   ├── bill_to_bill_parallel.sh
│   ├── model_legislation_parallel.sh
│   ├── model_legislation_network.py
│   ├── bill_to_bill_analysis.py
│   ├── compare_constitutions.py
│   ├── generate_model_legislation_matches.py
│   ├── generate_bill_to_bill_matches.py
│   └── model_legislation_to_bill_analysis.py
├── db
│   ├── state_bill_index.json
│   ├── evaluation_mapping.json
│   ├── state_bill_mapping.json
│   └── elasticsearch.yml
├── LICENSE
├── bashrc_lid
├── .gitignore
├── README.md
├── data
│   ├── evaluation_set
│   │   └── bills_for_evaluation_set.csv
│   └── model_legislation_urls
│       └── clean_urls.txt
└── tests
    └── text_alignment_tests.py
/readme.txt:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/lid/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/lid/etl/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/lid/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/lid/evaluation/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/html/bootstrap3/css/.Rhistory:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/lid/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/policy_diffusion/HEAD/lid/.DS_Store
--------------------------------------------------------------------------------
/lid/config.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | #Global path variables
4 | DATA_PATH = "/mnt/elasticsearch/dssg"
5 |
6 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | jinja2==2.7.3
2 | elasticsearch==1.5
3 | enum34==1.0.4
4 | cherrypy==3.8.0
5 | numba==0.17.0
6 | ujson==1.33
7 |
--------------------------------------------------------------------------------
/archive/input/.download_bulk_sunlight_files.sh.swp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/policy_diffusion/HEAD/archive/input/.download_bulk_sunlight_files.sh.swp
--------------------------------------------------------------------------------
/html/bootstrap3/fonts/glyphicons-halflings-regular.eot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/policy_diffusion/HEAD/html/bootstrap3/fonts/glyphicons-halflings-regular.eot
--------------------------------------------------------------------------------
/html/bootstrap3/fonts/glyphicons-halflings-regular.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/policy_diffusion/HEAD/html/bootstrap3/fonts/glyphicons-halflings-regular.ttf
--------------------------------------------------------------------------------
/html/bootstrap3/fonts/glyphicons-halflings-regular.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/policy_diffusion/HEAD/html/bootstrap3/fonts/glyphicons-halflings-regular.woff
--------------------------------------------------------------------------------
/html/bootstrap3/fonts/glyphicons-halflings-regular.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/policy_diffusion/HEAD/html/bootstrap3/fonts/glyphicons-halflings-regular.woff2
--------------------------------------------------------------------------------
/archive/input/unzip_bulk_files.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | filenames=$(ls /mnt/data/sunlight/openstates_zipped_files/)
4 |
5 | for i in $filenames; do
6 |     dir_name=$(echo "${i}" | sed -E 's/201[0-9]-0[0-9]-0[0-9]-//g' | sed -E 's/-json.zip//g')
7 | unzip /mnt/data/sunlight/openstates_zipped_files/${i} -d /mnt/data/sunlight/openstates_unzipped/${dir_name}
8 | done
9 |
--------------------------------------------------------------------------------
/scripts/bill_to_bill_parallel.sh:
--------------------------------------------------------------------------------
1 | cat /home/mburgess/policy_diffusion/data/bill_ids_random.txt | parallel --delay 0.1 \
2 | --joblog /home/mburgess/bill_to_bill_alignments.log \
3 | --tmpdir /mnt/data/sunlight/dssg/alignment_results/bill_to_bill_alignments \
4 | --files \
5 | /home/mburgess/policy_diffusion/scripts/generate_bill_to_bill_matches.py
6 |
--------------------------------------------------------------------------------
/archive/input/state_metadata.sql:
--------------------------------------------------------------------------------
1 |
2 | DROP TABLE IF EXISTS state_metadata;
3 |
4 | CREATE TABLE state_metadata (
5 | name VARCHAR(20),
6 | abbreviation VARCHAR(2),
7 | lower_chamber_name VARCHAR(10),
8 | lower_chamber_title VARCHAR(15),
9 | upper_chamber_name VARCHAR(10),
10 | upper_chamber_title VARCHAR(15),
11 | feature_flags VARCHAR(50)
12 | );
13 |
--------------------------------------------------------------------------------
/scripts/model_legislation_parallel.sh:
--------------------------------------------------------------------------------
1 | cat /mnt/data/sunlight/dssg/model_legislation/extracted_model_legislation.json | parallel --pipe --delay 1.0 \
2 |     --joblog /home/mburgess/model_legislation_alignments.log \
3 | --tmpdir /mnt/data/sunlight/dssg/alignment_results/model_legislation_alignments \
4 | --files \
5 | /home/mburgess/policy_diffusion/scripts/generate_model_legislation_matches.py
6 |
--------------------------------------------------------------------------------
/html/bootstrap3/css/custom.css:
--------------------------------------------------------------------------------
1 | mark {
2 | background-color: yellow;
3 | color: black;
4 | }
5 |
6 | .span3 {
7 | height: 250px !important;
8 | overflow: scroll;
9 | }
10 |
11 | .span5 {
12 | height: 800px !important;
13 | overflow: scroll;
14 | }
15 | td {
16 | padding: 5px;
17 | }
18 |
19 | tr:hover { background: #efedf5; }
20 | td a {
21 | display: block;
22 | padding: 16px;
23 | }
24 |
--------------------------------------------------------------------------------
/archive/input/committee_metadata.sql:
--------------------------------------------------------------------------------
1 |
2 | DROP TABLE IF EXISTS committees;
3 |
4 | CREATE TABLE committees (
5 | id VARCHAR,
6 | state VARCHAR(2),
7 | chamber VARCHAR(10),
8 | committee VARCHAR,
9 | subcommittee VARCHAR,
10 | members JSON,
11 | sources VARCHAR,
12 | parent_id VARCHAR(10),
13 | created_at TIMESTAMP WITHOUT TIME ZONE,
14 | updated_at TIMESTAMP WITHOUT TIME ZONE,
15 | all_ids VARCHAR,
16 | level VARCHAR(5)
17 | );
18 |
--------------------------------------------------------------------------------
/archive/input/opensecrets/opensecrets_candidates.sql:
--------------------------------------------------------------------------------
1 | DROP TABLE IF EXISTS opensecrets.candidates;
2 |
3 | CREATE TABLE opensecrets.candidates (
4 | cycle INTEGER NOT NULL,
5 | fec_candidate_id VARCHAR(9) NOT NULL,
6 | candidate_id VARCHAR(9) NOT NULL,
7 | first_last_party VARCHAR(38) NOT NULL,
8 | party VARCHAR(7) NOT NULL,
9 | office_sought VARCHAR(4),
10 | office_held VARCHAR(4),
11 | currently_running BOOLEAN,
12 | VARCHAR(4),
13 | "RL" VARCHAR(4)
14 | );
15 |
--------------------------------------------------------------------------------
/html/bootstrap3/js/npm.js:
--------------------------------------------------------------------------------
1 | // This file is autogenerated via the `commonjs` Grunt task. You can require() this file in a CommonJS environment.
2 | require('../../js/transition.js')
3 | require('../../js/alert.js')
4 | require('../../js/button.js')
5 | require('../../js/carousel.js')
6 | require('../../js/collapse.js')
7 | require('../../js/dropdown.js')
8 | require('../../js/modal.js')
9 | require('../../js/tooltip.js')
10 | require('../../js/popover.js')
11 | require('../../js/scrollspy.js')
12 | require('../../js/tab.js')
13 | require('../../js/affix.js')
--------------------------------------------------------------------------------
/archive/input/bill_metadata.sql:
--------------------------------------------------------------------------------
1 |
2 | DROP TABLE IF EXISTS bill_metadata;
3 |
4 | CREATE TABLE bill_metadata (
5 | bill_id VARCHAR,
6 | title VARCHAR,
7 | alternate_titles JSON,
8 | versions VARCHAR,
9 | subjects VARCHAR,
10 | scraped_subjects VARCHAR,
11 | type VARCHAR,
12 | level VARCHAR,
13 | sponsors JSON,
14 | actions JSON,
15 | action_dates JSON,
16 | documents JSON,
17 | votes JSON,
18 | leg_id VARCHAR,
19 | state CHAR(2),
20 | chamber VARCHAR,
21 | session VARCHAR,
22 | all_ids VARCHAR,
23 | created_at TIMESTAMP WITHOUT TIME ZONE,
24 | updated_at TIMESTAMP WITHOUT TIME ZONE
25 | );
26 |
--------------------------------------------------------------------------------
/archive/input/lobbyists/compile_lobbyist_lists.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 |
4 | ## ILLINOIS ##
5 | #url=
6 |
7 |
8 | ## MICHIGAN ##
9 | # http://miboecfr.nictusa.com/cgi-bin/cfr/lobby_srch_res.cgi
10 |
11 | url=http://miboecfr.nictusa.com/cfr/dumpdata/aaarZaGrk/mi_lobby.sh
12 | wget -O michigan_lobbyists.txt --user-agent="jtwalsh@uchicago.edu" $url
13 |
14 | #sed -E 's/\t/,/g' michigan_lobbyists.csv | sed 's/#/ Number/g' | sed -E 's/\(MaxLen=(.){1,3}\)//g'
15 |
16 | # http://miboecfr.nictusa.com/cfr/dumpdata/aaa3AaiZp/mi_lobby.sh
17 |
18 | # second line of the file has metadata
19 | # the bottom of the file has garbage too
20 |
21 |
--------------------------------------------------------------------------------
/archive/input/campaign_contributions.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | source default_profile
4 |
5 | rm /mnt/data/sunlight/followthemoney/contributions.csv
6 |
7 | for state in AL AK AZ AR CA CO CT DE FL GA HI ID IL IN IA KS KY LA ME MD MA MI MN MS MO MT NE NV NH NJ NM NY NC ND OH OK OR PA RI SC SD TN TX UT VT VA WA WV WI WY
8 | do
9 | url="http://www.followthemoney.org/aaengine/aafetch.php?s=$state&law-ot=S,H&gro=d-id&APIKey=$FOLLOWTHEMONEYKEY&mode=csv"
10 | wget -O- --header="Accept: text/html" --user-agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:21.0) Gecko/20100101 Firefox/21.0" $url >> /mnt/data/sunlight/followthemoney/contributions.csv
11 | done
12 |
--------------------------------------------------------------------------------
/db/state_bill_index.json:
--------------------------------------------------------------------------------
1 | {
2 | "settings": {
3 | "index": {
4 | "number_of_shards": 1,
5 | "number_of_replicas": 0
6 | },
7 | "analysis": {
8 | "filter": {
9 | "my_shingle_filter": {
10 | "type": "shingle",
11 | "min_shingle_size": 2,
12 | "max_shingle_size": 4,
13 | "output_unigrams": "false"
14 | }
15 | },
16 | "analyzer": {
17 | "my_shingle_analyzer": {
18 | "type": "custom",
19 | "tokenizer": "standard",
20 | "filter": [
21 | "lowercase",
22 | "my_shingle_filter"
23 | ]
24 | }
25 | }
26 | }
27 | }
28 | }
29 |
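
A minimal sketch of applying these settings together with db/state_bill_mapping.json, assuming the elasticsearch-py 1.x client pinned in requirements.txt (the index name "state_bills" and the localhost address are illustrative):

    import json
    from elasticsearch import Elasticsearch

    es = Elasticsearch([{"host": "localhost", "port": 9200}])

    # Create the index with the shingle analyzer defined above ...
    with open("db/state_bill_index.json") as f:
        es.indices.create(index="state_bills", body=json.load(f))

    # ... then register the bill_document mapping, whose string fields use
    # my_shingle_analyzer for their "shingles" sub-fields.
    with open("db/state_bill_mapping.json") as f:
        es.indices.put_mapping(index="state_bills", doc_type="bill_document", body=json.load(f))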
--------------------------------------------------------------------------------
/archive/input/download_bulk_sunlight_files.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | ### DOWNLOAD BULK DATA ###
4 | eval $(cat /home/jwalsh/policy_diffusion/default_profile | sed 's/^/export /')
5 | state_abbrevs=$(psql -t -c "SELECT abbreviation FROM state_metadata WHERE bills_identified IS NULL AND abbreviation > 'l' ORDER BY abbreviation;")
6 | user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.65 Safari/537.36"
7 | month="07" # the first day of this month is the last day of records to download
8 | for i in $state_abbrevs; do
9 | urls="$urls -O http://static.openstates.org/downloads/2016-${month}-01-${i}-json.zip"
10 | done
11 | curl -A "$user_agent" $urls
12 |
13 |
--------------------------------------------------------------------------------
/scripts/model_legislation_network.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 | def func(x):
4 | x['weight'] = x.count()
5 | return x
6 | df = pd.read_csv("/Users/mattburg/Downloads/interest_groups_to_state_network_fixed.csv")
7 | df = df[df.score>100]
8 | df = df.groupby(df.edge_id).count()
9 |
10 | alec_total = 2208.
11 | alice_total = 1500.
12 |
13 | index = df.index
14 | ids = df['lobby_id'].tolist()
15 |
16 | print "Source,Target,Weight,Type"
17 | for x,y in zip(index,ids):
18 | s,t = x.split("_")
19 | if s == "alec":
20 | y = float(y)/alec_total
21 | elif s == "alice":
22 | y = float(y)/alice_total
23 | else:
24 | continue
25 | print "{0},{1},{2},{3}".format(s,t,y,"undirected")
26 |
27 |
--------------------------------------------------------------------------------
/archive/input/legislators.sql:
--------------------------------------------------------------------------------
1 |
2 | DROP TABLE IF EXISTS legislators;
3 |
4 | CREATE TABLE legislators (
5 | id VARCHAR,
6 | votesmart_id VARCHAR,
7 | transparencydata_id VARCHAR,
8 | first_name VARCHAR,
9 | middle_name VARCHAR,
10 | last_name VARCHAR,
11 | suffixes VARCHAR,
12 | full_name VARCHAR,
13 | party VARCHAR,
14 | active BOOLEAN,
15 | url VARCHAR,
16 | photo_url VARCHAR,
17 | office_address VARCHAR,
18 | office_phone VARCHAR,
19 | leg_id VARCHAR,
20 | chamber VARCHAR,
21 | district VARCHAR,
22 | state VARCHAR,
23 | offices JSON,
24 | email VARCHAR,
25 | roles JSON,
26 | old_roles JSON,
27 | all_legislative_ids VARCHAR,
28 | level VARCHAR,
29 | sources JSON,
30 | created_at TIMESTAMP WITHOUT TIME ZONE,
31 | updated_at TIMESTAMP WITHOUT TIME ZONE
32 | );
33 |
--------------------------------------------------------------------------------
/lid/etl/load_constitutions_into_elasticsearch.py:
--------------------------------------------------------------------------------
1 | #!/bin/python
2 |
3 | import time
4 | import glob
5 | import json
6 | import requests
7 | from io import open
8 | from elasticsearch import Elasticsearch
9 |
10 | files = glob.glob("*.txt")
11 | es = Elasticsearch([{'host': "54.244.236.175", 'port': 9200}])
12 |
13 | for file in files:
14 | print file
15 | state_year = file.split(".")[0]
16 | state = state_year[:-5]
17 | year = int(state_year[-4:])
18 | file_text = open(file, 'r', encoding='ISO-8859-1').read()
19 | json_object = {
20 | "document_type": "constitution",
21 | "state": state,
22 | "year": year,
23 | "constitution": file_text
24 | }
25 |
26 | es.index(index="constitutions", doc_type="constitution", id=state_year, body=json.dumps(json_object))
27 | time.sleep(1)
28 |
--------------------------------------------------------------------------------
/db/evaluation_mapping.json:
--------------------------------------------------------------------------------
1 | {
2 | "bill_document": {
3 | "dynamic": "false",
4 | "properties": {
5 | "bill_document_last": {
6 | "type": "string",
7 | "term_vector": "yes",
8 | "index": "analyzed",
9 | "_size": {
10 | "enabled": true,
11 | "store": true
12 | },
13 | "fields": {
14 | "shingles": {
15 | "type": "string",
16 | "analyzer": "my_shingle_analyzer"
17 | } }
18 | },
19 | "match": {
20 | "type": "string",
21 | "index": "not_analyzed"
22 | },
23 | "state": {
24 | "type": "string",
25 | "index": "not_analyzed"
26 | },
27 | "unique_id": {
28 | "type": "string",
29 | "index": "not_analyzed"
30 | }
31 | }
32 | }
33 | }
--------------------------------------------------------------------------------
/archive/input/Drakefile:
--------------------------------------------------------------------------------
1 | PROFILE:=default_profile
2 | %include $[PROFILE]
3 |
4 | psql()
5 | psql -v ON_ERROR_STOP=1 -f $[INPUT] && touch $[OUTPUT]
6 |
7 |
8 |
9 |
10 | ; GRAB STATE LEGISLATIVE METADATA FROM SUNLIGHT
11 | /mnt/data/sunlight/data/input/state_metadata.csv <- [-timestamp]
12 | ; input/./state_metadata.py | sed -E "s/u?'//g" > $OUTPUT
13 |
14 | ; CREATE TABLE / COPY FOR STATE METADATA
15 | ;psql/input/touch_state_metadata <- input/state_metadata.sql, data/input/state_metadata.csv [method:psql]
16 |
17 |
18 |
19 | ; CREATE TABLE FOR BILL METADATA
20 | ; (sql file creates the table; the python script pushes the data)
21 | ;psql/input/touch_bill_metadata <- input/bill_metadata.sql [method:psql]
22 |
23 | ; GRAB BILL METADATA FROM SUNLIGHT
24 | ;data/input/touch_bill_metadata <- input/download_bulk_sunlight_files.sh
25 | ; bash $INPUT && touch $OUTPUT
26 |
27 |
28 |
--------------------------------------------------------------------------------
/scripts/bill_to_bill_analysis.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import json
3 |
4 | with open('/Users/mattburg/Dropbox/bill_similarity_matrix.json') as data_file:
5 | data = json.load(data_file)
6 |
7 | #data = {'ca_1': [{'id': 'ks_2', 'score': 134, 'state': 'ks'}, {'id': 'wy_12', 'score': 80, 'state': 'wy'}],'wa_3': [{'id': 'ca_1', 'score': 20, 'state': 'ca'}, {'id': 'al_5', 'score': 40, 'state': 'al'}]}
8 |
9 |
10 | # Need a list of dictionaries to build the dataframe
11 | df_dict = {}
12 | df_list = []
13 | for item in data:
14 | for i in range(len(data[item])):
15 | state_1 = item[0:2]
16 | state_2 = data[item][i]['state']
17 | state_1_2 = '-'.join(sorted([state_1, state_2]))
18 | df_dict={
19 | 'state_1': item[0:2],
20 | 'state_2':data[item][i]['state'],
21 | 'score': data[item][i]['score'],
22 | 'state_1_2': state_1_2}
23 | df_list.append(df_dict)
24 |
25 |
26 | df = pd.DataFrame(df_list)
27 |
28 |
29 |
30 |
31 |
32 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2015 Data Science for Social Good
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
23 |
--------------------------------------------------------------------------------
/bashrc_lid:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # make sure path to this project is set
4 | if [ ! -n "$POLICY_DIFFUSION" ]; then
5 | echo "Error: \$POLICY_DIFFUSION environment variable is not set"
6 | return
7 | fi
8 |
9 | # ensure user specified directory for log files
10 | if [ ! -n "$LOGFILE_DIRECTORY" ]; then
11 | echo "Error: \$LOGFILE_DIRECTORY environment variable is not set"
12 | return
13 | fi
14 |
15 | # ensure users specified a directory for temporary files
16 | if [ ! -n "$TEMPFILE_DIRECTORY" ]; then
17 | echo "Error: \$TEMPFILE_DIRECTORY environment variable is not set"
18 | return
19 | fi
20 |
21 | # ensure users specified the IP address for the ElasticSearch instance
22 | if [ ! -n "$ELASTICSEARCH_IP" ]; then
23 | echo "Error: \$ELASTICSEARCH_IP environment variable is not set"
24 | return
25 | fi
26 |
27 | # add python code to path
28 | export PYTHONPATH=${POLICY_DIFFUSION}/lid:${PYTHONPATH}
29 | export PYTHONPATH=${POLICY_DIFFUSION}/lid/etl:${PYTHONPATH}
30 | export PYTHONPATH=${POLICY_DIFFUSION}/lid/utils:${PYTHONPATH}
31 | export PYTHONPATH=${POLICY_DIFFUSION}/lid/evaluation:${PYTHONPATH}
32 | export PYTHONPATH=${POLICY_DIFFUSION}/scripts:${PYTHONPATH}
33 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Drake log
2 | drake.log
3 |
4 | # Database info
5 | default_profile
6 |
7 | # Sunlight key
8 | .sunlight.*
9 |
10 | # IPython Notebook checkpoints
11 | .ipynb_checkpoints/
12 |
13 | # Byte-compiled / optimized / DLL files
14 | __pycache__/
15 | *.py[cod]
16 |
17 | # C extensions
18 | *.so
19 |
20 | # Distribution / packaging
21 | .Python
22 | env/
23 | build/
24 | develop-eggs/
25 | dist/
26 | downloads/
27 | eggs/
28 | lib/
29 | lib64/
30 | parts/
31 | sdist/
32 | var/
33 | *.egg-info/
34 | .installed.cfg
35 | *.egg
36 |
37 | # PyInstaller
38 | # Usually these files are written by a python script from a template
39 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
40 | *.manifest
41 | *.spec
42 |
43 | # Installer logs
44 | pip-log.txt
45 | pip-delete-this-directory.txt
46 |
47 | # Unit test / coverage reports
48 | htmlcov/
49 | .tox/
50 | .coverage
51 | .cache
52 | nosetests.xml
53 | coverage.xml
54 |
55 | # Translations
56 | *.mo
57 | *.pot
58 |
59 | # Django stuff:
60 | *.log
61 |
62 | # Sphinx documentation
63 | docs/_build/
64 |
65 | # PyBuilder
66 | target/
67 |
68 | # Drake
69 | drake.*
70 | .drake/
71 |
72 | #pycharm
73 | *.idea
74 |
75 |
76 |
77 |
--------------------------------------------------------------------------------
/scripts/compare_constitutions.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # Written for Python 2.7
4 |
5 | from lid import LID
6 | from text_alignment import AffineLocalAligner,LocalAligner
7 | import database
8 | import json
9 | import base64
10 | import codecs
11 | import re
12 | import logging
13 | import os
14 | import traceback
15 | import sys
16 | from database import ElasticConnection
17 | from elasticsearch import Elasticsearch
18 | import time
19 |
20 | def get_constitution_alignments(query_doc):
21 | result_docs = constitution_lidy.find_constitution_alignments(
22 | query_doc,
23 | document_type = "text",
24 | split_sections = True,
25 | query_document_id = "text")
26 | return result_docs
27 |
28 |
29 | def main():
30 |
31 | docs = ec.get_all_doc_ids('constitutions')
32 |
33 | for doc in docs:
34 | print doc
35 | doc_text = es_connection.get_source(index = 'constitutions', id = doc)['constitution']
36 | result_doc = get_constitution_alignments(doc_text)
37 |         open('/mnt/data/jwalsh/constitution_matches.json', 'a').write(json.dumps(result_doc) + "\n")
38 | time.sleep(1)
39 |
40 |
41 |
42 | if __name__ == "__main__":
43 | #elastic host ip
44 | ip_addy = os.environ['ELASTICSEARCH_IP']
45 |
46 | #instantiate lid,aligner and elasticsearch objects
47 | aligner = AffineLocalAligner(match_score=4, mismatch_score=-1, gap_start=-3, gap_extend = -1.5)
48 | ec = ElasticConnection(host = ip_addy)
49 | es_connection = Elasticsearch([{'host': ip_addy, 'port': 9200}])
50 |
51 |     query_results_limit = int(os.environ['QUERY_RESULTS_LIMIT'])
52 | constitution_lidy = LID(query_results_limit=query_results_limit, elastic_host=ip_addy,
53 | lucene_score_threshold=0.01, aligner=aligner)
54 |
55 | main()
56 |
57 |
--------------------------------------------------------------------------------
/archive/input/state_metadata.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 |
4 | from sunlight import openstates
5 | import psycopg2
6 | import csv
7 | import sys
8 | import re
9 |
10 |
11 |
12 | # GRAB DATABASE INFO FROM default_profile
13 | db_info = []
14 | with open('default_profile', 'rb') as db_file:
15 | reader = csv.reader(db_file, delimiter='=', quotechar='"')
16 | for row in reader:
17 | db_info.append(row[1])
18 |
19 |
20 | # CONNECT TO DATABASE
21 | conn = psycopg2.connect(host = db_info[0], database = db_info[1], user = db_info[2], password = db_info[3])
22 | cur = conn.cursor()
23 |
24 |
25 | # FUNCTION TO PARSE STATE METADATA
26 | def parse_state_metadata(state_metadata):
27 | name = state_metadata['name']
28 | abbreviation = state_metadata['abbreviation']
29 | if 'lower' in state_metadata['chambers']:
30 | lower_chamber_name = state_metadata['chambers']['lower']['name']
31 | lower_chamber_title = state_metadata['chambers']['lower']['title']
32 | else:
33 | lower_chamber_name = None
34 | lower_chamber_title = None
35 | upper_chamber_name = state_metadata['chambers']['upper']['name']
36 | upper_chamber_title = state_metadata['chambers']['upper']['title']
37 | feature_flags = ', '.join(state_metadata['feature_flags'])
38 | return((name, abbreviation, lower_chamber_name, lower_chamber_title,
39 |            upper_chamber_name, upper_chamber_title, feature_flags))
40 |
41 |
42 | # GRAB THE DATA FROM SUNLIGHT API
43 | state_metadata = openstates.all_metadata()
44 |
45 |
46 | # PARSE SUNLIGHT DATA AND WRITE TO POSTGRES
47 | temp_state_metadata = []
48 | for state in state_metadata:
49 | temp_state_metadata.append(parse_state_metadata(state))
50 |
51 | args_str = ','.join(cur.mogrify("(%s,%s,%s,%s,%s,%s,%s)", x) for x in temp_state_metadata)
52 | cur.execute("INSERT INTO state_metadata VALUES " + args_str)
53 | conn.commit()
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Legislative Influence Detector
2 |
3 | Legislators often lack the time to write bills, so they tend to rely on outside groups to help. Researchers and concerned citizens would like to know who’s writing legislative bills, but trying to read those bills, let alone trace their source, is tedious and time consuming. This is especially true at the state and local levels, where arguably more important policy decisions are made every day.
4 |
5 | This project provides tools to help analyze and access government bills. Using the Sunlight Foundation’s collection of state bills and model legislation scraped from lobbying groups around the country, we built tools to shed light on the origination and diffusion of policy ideas across states, the effectiveness of various lobbying organizations, and the democratic nature of individual bills, all in near real time.
6 |
7 | # How does it work?
8 |
9 | We use the Smith-Waterman local-alignment algorithm to find matching text across documents. The algorithm compares the documents word by word, adding points for matches and subtracting points for mismatches and gaps. Unfortunately, local alignment is far too slow to run on every pair of documents in a corpus this large; it could take thousands of years to finish analyzing the legislation. We improved the speed of the analysis by first limiting the number of documents that need to be compared. Elasticsearch, our database of choice for this project, efficiently computes Lucene relevance scores. When we use LID to search for a document, Elasticsearch quickly compares it against all others and returns the 100 most similar documents as measured by their Lucene scores. We then run the local-alignment algorithm on only those 100.
10 |
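A minimal sketch of the candidate-retrieval stage, assuming the elasticsearch 1.x Python client pinned in requirements.txt (the index name `state_bills`, the host, and the input file are illustrative; the `bill_document_last` field comes from db/state_bill_mapping.json):

```python
from elasticsearch import Elasticsearch

es = Elasticsearch([{"host": "localhost", "port": 9200}])
query_text = open("my_bill.txt").read()  # the document whose origins we want to trace

# Stage 1: Lucene scoring narrows the corpus to the ~100 most similar bills.
hits = es.search(
    index="state_bills",
    body={"size": 100,
          "query": {"match": {"bill_document_last": query_text}}},
)["hits"]["hits"]

# Stage 2: run the expensive Smith-Waterman local alignment only on these
# candidates (see text_alignment.py for the aligners LID uses).
candidate_texts = [hit["_source"]["bill_document_last"] for hit in hits]
```
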
11 | # How to use it?
12 |
13 | * The text_alignment.py file gives our implementation of the Smith-Waterman algorithm. Feel free to use it!
14 |
15 | # Important Files
16 |
17 | * text_alignment.py: contains our fast implementation of the Smith-Waterman algorithm (see the usage sketch below).
18 |
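A hedged usage sketch of the aligners, based on how they are instantiated in scripts/ and archive/prototype_text_alignment_algorithms.py (the exact constructor arguments and the structure of the returned `Alignment` object are defined in text_alignment.py, not reproduced in this listing):

```python
from text_alignment import LocalAligner, AffineLocalAligner

# Local aligner with a single linear gap penalty.
aligner = LocalAligner(match_score=3, mismatch_score=-1, gap_score=-2)

# Affine-gap variant used by the generate_*_matches.py scripts.
affine_aligner = AffineLocalAligner(match_score=4, mismatch_score=-1,
                                    gap_start=-3, gap_extend=-1.5)

left = "an act relating to the regulation of firearms".split()
right = "a bill relating to regulation of firearms and ammunition".split()

# align() takes lists of word lists (one list per document section) and
# returns an Alignment object holding the scored matching spans.
alignment = aligner.align([left], [right])
```
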
19 | ## Environment Variables
20 | * POLICY_DIFFUSION: path to this repository; used to build PYTHONPATH and log-file paths
21 | * LOGFILE_DIRECTORY: should not live inside the repository, to prevent repository bloat
22 | * TEMPFILE_DIRECTORY: stores files created temporarily while the algorithm runs
23 | * ELASTICSEARCH_IP: IP address of the Elasticsearch instance (read via `os.environ`, as shown below)
24 |
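The Python scripts read these values straight from the environment; a minimal sketch of the pattern used in scripts/compare_constitutions.py and the logging setup of the scripts/generate_*_matches.py scripts:

```python
import os

ip_addy = os.environ["ELASTICSEARCH_IP"]  # Elasticsearch host for ElasticConnection/LID
log_file = "{0}/logs/model_legislation_alignment.log".format(os.environ["POLICY_DIFFUSION"])
```
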
25 |
--------------------------------------------------------------------------------
/archive/input/committee_metadata.py:
--------------------------------------------------------------------------------
1 |
2 | from sunlight import openstates
3 | import psycopg2
4 | from psycopg2.extras import Json
5 | import json
6 | import csv
7 | import sys
8 | import re
9 | import os
10 |
11 |
12 | # GRAB DATABASE INFO FROM default_profile
13 | db_info = []
14 | with open('default_profile', 'rb') as db_file:
15 | reader = csv.reader(db_file, delimiter='=', quotechar='"')
16 | for row in reader:
17 | db_info.append(row[1])
18 |
19 |
20 | # CONNECT TO DATABASE
21 | conn = psycopg2.connect(host = db_info[0], database = db_info[1], user = db_info[2], password = db_info[3])
22 | cur = conn.cursor()
23 |
24 |
25 | # PARSE COMMITTEE METADATA
26 | def parse_committee_metadata(committee_metadata):
27 | id_ = committee_metadata['id']
28 | state = committee_metadata['state']
29 | chamber = committee_metadata['chamber']
30 | committee = committee_metadata['committee']
31 | subcommittee = committee_metadata['subcommittee']
32 | if len(committee_metadata['members']) > 0:
33 | members = Json(committee_metadata['members'][0])
34 | else:
35 | members = None
36 | sources = committee_metadata['sources'][0]['url']
37 | parent_id = committee_metadata['parent_id']
38 | created_at = committee_metadata['created_at']
39 | updated_at = committee_metadata['updated_at']
40 | if len(committee_metadata['all_ids']) > 0:
41 | all_ids = committee_metadata['all_ids'][0]
42 | else:
43 | all_ids = None
44 | if 'level' in committee_metadata:
45 | level = committee_metadata['level']
46 | else:
47 | level = None
48 |
49 | return((id_, state, chamber, committee, subcommittee, members,
50 | sources, parent_id, created_at, updated_at, all_ids, level))
51 |
52 |
53 |
54 | # GRAB COMMITTEE METADATA FROM FILES AND PUSH TO DATABASE
55 | temp_committee_metadata = []
56 | for path, subdirs, files in os.walk(r'/mnt/data/sunlight/openstates_unzipped/committees/'):
57 | for name in files:
58 | directory_file = os.path.join(path, name)
59 | with open(directory_file) as json_file:
60 | committee = json.load(json_file)
61 | parsed_data = parse_committee_metadata(committee)
62 | temp_committee_metadata.append(parsed_data)
63 |
64 | args_str = ','.join(cur.mogrify("(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)", x) for x in temp_committee_metadata)
65 | cur.execute("INSERT INTO committees VALUES " + args_str)
66 | conn.commit()
67 |
68 |
--------------------------------------------------------------------------------
/html/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
59 |
60 |
64 |
68 |
69 |
70 |
This is a test to see how we can compare query {{ query_string }}
71 |
This is a test to see how we can compare query and result text
72 |
73 |
74 |
75 |
--------------------------------------------------------------------------------
/scripts/generate_model_legislation_matches.py:
--------------------------------------------------------------------------------
1 | #!/opt/anaconda/bin/python
2 |
3 | from lid import LID
4 | from text_alignment import AffineLocalAligner,LocalAligner
5 | import database
6 | import json
7 | import base64
8 | import codecs
9 | import re
10 | import logging
11 | import os
12 | import traceback
13 | import sys
14 | from utils.general_utils import deadline,TimedOutExc
15 | import time
16 |
17 |
18 |
19 | @deadline(1000)
20 | def get_alignments(model_doc):
21 | result_docs = lidy.find_state_bill_alignments(model_doc['source'],document_type = "model_legislation",
22 | split_sections = True,query_document_id = model_doc['id'])
23 | return result_docs
24 |
25 |
26 | def test(model_doc):
27 | return model_doc
28 |
29 |
30 | if __name__ == "__main__":
31 |
32 | #elastic host ip
33 | ip_addy = "54.203.12.145"
34 |
35 |
36 |
37 | #configure logging
38 | logging.basicConfig(filename="{0}/logs/model_legislation_alignment.log".format(os.environ['POLICY_DIFFUSION']),
39 | level=logging.DEBUG)
40 | logging.getLogger('elasticsearch').setLevel(logging.ERROR)
41 | logging.getLogger('urllib3').setLevel(logging.ERROR)
42 | logging.getLogger('json').setLevel(logging.ERROR)
43 |
44 |
45 | #instantiate lid object
46 |
47 | aligner = AffineLocalAligner(match_score=4, mismatch_score=-1, gap_start=-3, gap_extend = -1.5)
48 |
49 | lidy = LID(query_results_limit=100,elastic_host = ip_addy,lucene_score_threshold = 0.1,aligner = aligner)
50 |
51 | for line in sys.stdin:
52 | model_doc = json.loads(line.strip())
53 |
54 | try:
55 | result_doc = get_alignments(model_doc)
56 | #result_doc = test(model_doc)
57 | print json.dumps(result_doc)
58 |
59 | except (KeyboardInterrupt, SystemExit):
60 | raise
61 | except TimedOutExc:
62 | m = "timeout error query_id {0}: {1}".format(model_doc['id'], trace_message)
63 | logging.error(m)
64 | print json.dumps({"query_document_id": model_doc['id'],"error":"timeout error"})
65 |
66 | except:
67 | trace_message = re.sub("\n+", "\t", traceback.format_exc())
68 | trace_message = re.sub("\s+", " ", trace_message)
69 | trace_message = "<<{0}>>".format(trace_message)
70 | m = "random error query_id {0}: {1}".format(model_doc['id'], trace_message)
71 | logging.error(m)
72 | print json.dumps({"query_document_id": model_doc['id'],"error":"trace_message"})
73 |
74 |
--------------------------------------------------------------------------------
/lid/utils/sunlight_utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import string
4 | import tempfile
5 | import importlib
6 | import subprocess
7 |
8 |
9 |
10 | PUNCTUATION = re.compile('[%s]' % re.escape(string.punctuation))
11 |
12 |
13 | def convert_pdf(filename, type='xml'):
14 | commands = {'text': ['pdftotext', '-layout', filename, '-'],
15 | 'text-nolayout': ['pdftotext', filename, '-'],
16 | 'xml': ['pdftohtml', '-xml', '-stdout', filename],
17 | 'html': ['pdftohtml', '-stdout', filename]}
18 | try:
19 | pipe = subprocess.Popen(commands[type], stdout=subprocess.PIPE,
20 | close_fds=True).stdout
21 | except OSError as e:
22 |         raise EnvironmentError("error running %s, missing executable? [%s]" %
23 |                                (' '.join(commands[type]), e))
24 | data = pipe.read()
25 | pipe.close()
26 | return data
27 |
28 |
29 | def pdfdata_to_text(data):
30 | with tempfile.NamedTemporaryFile(delete=True) as tmpf:
31 | tmpf.write(data)
32 | tmpf.flush()
33 | return convert_pdf(tmpf.name, 'text')
34 |
35 |
36 | def worddata_to_text(data):
37 | desc, txtfile = tempfile.mkstemp(prefix='tmp-worddata-', suffix='.txt')
38 | try:
39 | with tempfile.NamedTemporaryFile(delete=True) as tmpf:
40 | tmpf.write(data)
41 | tmpf.flush()
42 | subprocess.check_call(['timeout', '10', 'abiword',
43 | '--to=%s' % txtfile, tmpf.name])
44 | f = open(txtfile)
45 | text = f.read()
46 | tmpf.close()
47 | f.close()
48 | finally:
49 | os.remove(txtfile)
50 | os.close(desc)
51 | return text.decode('utf8')
52 |
53 |
54 | def text_after_line_numbers(lines):
55 | text = []
56 | for line in lines.splitlines():
57 | # real bill text starts with an optional space, line number
58 | # more spaces, then real text
59 | match = re.match('\s*\d+\s+(.*)', line)
60 | if match:
61 | text.append(match.group(1))
62 |
63 | # return all real bill text joined w/ newlines
64 | return '\n'.join(text).decode('utf-8', 'ignore')
65 |
66 |
67 | def plaintext(abbr, doc, doc_bytes):
68 | # use module to pull text out of the bytes
69 | module = importlib.import_module(abbr)
70 | text = module.extract_text(doc, doc_bytes)
71 |
72 | if not text:
73 | return
74 |
75 | if isinstance(text, unicode):
76 | text = text.encode('ascii', 'ignore')
77 | else:
78 | text = text.decode('utf8', 'ignore').encode('ascii', 'ignore')
79 | text = text.replace(u'\xa0', u' ') # nbsp -> sp
80 | text = PUNCTUATION.sub(' ', text) # strip punctuation
81 | text = re.sub('\s+', ' ', text) # collapse spaces
82 | return text
83 |
84 |
85 |
--------------------------------------------------------------------------------
/scripts/generate_bill_to_bill_matches.py:
--------------------------------------------------------------------------------
1 | #!/opt/anaconda/bin/python
2 |
3 | from lid import LID
4 | from text_alignment import AffineLocalAligner,LocalAligner
5 | import database
6 | import json
7 | import base64
8 | import codecs
9 | import re
10 | import logging
11 | import os
12 | import traceback
13 | import sys
14 | from utils.general_utils import deadline,TimedOutExc
15 | from database import ElasticConnection
16 | import time
17 |
18 | class NoneDocException(Exception):
19 | pass
20 |
21 |
22 | @deadline(1000)
23 | def get_alignments(query_doc,bill_id):
24 | result_docs = lidy.find_state_bill_alignments(query_doc,document_type = "state_bill",
25 | split_sections = True,state_id = bill_id[0:2],query_document_id = bill_id)
26 | return result_docs
27 |
28 |
29 | def test(model_doc):
30 | return model_doc
31 |
32 |
33 | if __name__ == "__main__":
34 |
35 | #elastic host ip
36 | ip_addy = "54.203.12.145"
37 |
38 | #configure logging
39 | logging.basicConfig(filename="{0}/logs/model_legislation_alignment.log".format(os.environ['POLICY_DIFFUSION']),
40 | level=logging.DEBUG)
41 | logging.getLogger('elasticsearch').setLevel(logging.ERROR)
42 | logging.getLogger('urllib3').setLevel(logging.ERROR)
43 | logging.getLogger('json').setLevel(logging.ERROR)
44 |
45 |
46 | #instantiate lid,aligner and elasticsearch objects
47 |
48 | aligner = AffineLocalAligner(match_score=4, mismatch_score=-1, gap_start=-3, gap_extend = -1.5)
49 |
50 | ec = ElasticConnection(host = ip_addy)
51 |
52 | lidy = LID(query_results_limit=100,elastic_host = ip_addy,lucene_score_threshold = 0.1,aligner = aligner)
53 |
54 | #for line in sys.stdin:
55 |
56 | try:
57 |
58 | bill_id = sys.argv[1]
59 | query_doc = ec.get_bill_by_id(bill_id)['bill_document_last']
60 |
61 | if query_doc is None:
62 | raise NoneDocException
63 |
64 | result_doc = get_alignments(query_doc,bill_id)
65 | logging.info("obtained alignments for {0}".format(bill_id))
66 | print json.dumps(result_doc)
67 |
68 | except (KeyboardInterrupt, SystemExit):
69 | raise
70 |
71 | except NoneDocException:
72 |
73 | m = "none doc error query_id {0}: {1}".format(bill_id, "None doc error")
74 | logging.error(m)
75 | print json.dumps({"query_document_id": bill_id,"error":"none doc error"})
76 |
77 | except TimedOutExc:
78 |
79 | m = "timeout error query_id {0}: {1}".format(bill_id, "timeout error")
80 | logging.error(m)
81 | print json.dumps({"query_document_id": bill_id,"error":"timeout error"})
82 |
83 | except:
84 |
85 | trace_message = re.sub("\n+", "\t", traceback.format_exc())
86 | trace_message = re.sub("\s+", " ", trace_message)
87 | trace_message = "<<{0}>>".format(trace_message)
88 | m = "random error query_id {0}: {1}".format(bill_id, trace_message)
89 | logging.error(m)
90 | print json.dumps({"query_document_id": bill_id,"error":"trace_message"})
91 |
--------------------------------------------------------------------------------
/scripts/model_legislation_to_bill_analysis.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import json
3 | from database import *
4 | import numpy as np
5 |
6 | #open json file
7 | alec_json = "/Users/eugeniagiraudy/Dropbox/DSSG/policy_diffusion/scripts/model_legislation_alignments.json"
8 |
9 | def create_bill_to_bill_matrix(jsonfile):
10 | '''
11 | Converts a json file with matching text between model legislation and bills into a
12 | dataframe.
13 |
14 | '''
15 | alignments = [json.loads(x.strip()) for x in open(jsonfile)]
16 | df_list = []
17 | for i in range(len(alignments)):
18 | left_id = alignments[i]['query_document_id']
19 | interest_group = left_id.split('_')
20 | interest_group = "_".join(interest_group[0:2])
21 | try:
22 | for result in alignments[i]['alignment_results']:
23 | right_id = result['document_id']
24 | score_list = []
25 | for j in range(len(result['alignments'])):
26 | score = result['alignments'][j]['score']
27 | score_list.append(score)
28 |                 #Need to decide whether we want the sum, the average, or the max
29 | score_max = max(score_list)
30 | df_list.append([interest_group, left_id,right_id,score_max,right_id[0:2],left_id+"_"+right_id,'undirected'])
31 | except KeyError:
32 | print left_id, 'failed'
33 | continue
34 | df = pd.DataFrame(df_list)
35 | df.columns = ['interst_group_id','model_legislation_id', 'unique_id','score_max','state','bill_ml_id','undirected']
36 | return df
37 |
38 |
39 | def grab_ids_for_data_frame(df):
40 | '''
41 |     Grabs bill dates from Elasticsearch and adds them to the dataframe.
42 |     Writes a csv of model-legislation-to-bill matches, with date_introduced and
43 |     date_signed, and returns the merged dataframe.
44 |
45 | Arguments:
46 | dataframe = data frame containing model legislation to bill analysis
47 |
48 | '''
49 | bill_id_list = df['unique_id']
50 | bill_id_list = bill_id_list.tolist()
51 |
52 | ec = ElasticConnection(host = '54.203.12.145', port = 9200)
53 |
54 | bill_dates = []
55 | bill_signed = []
56 | for bill in bill_id_list:
57 | bill_all = ec.get_bill_by_id(bill)
58 | date_introduced = bill_all['date_introduced']
59 | date_signed = bill_all['date_signed']
60 | bill_dates.append(date_introduced)
61 | bill_signed.append(date_signed)
62 | print bill
63 | bills_introd_signed = zip(bill_id_list, bill_dates, bill_signed)
64 | df_dates = pd.DataFrame(bills_introd_signed)
65 | df_dates.columns = ['unique_id', 'date_introduced', 'date_signed']
66 | df2 = pd.merge(df, df_dates, on='unique_id')
67 | #Drop duplicates from the merge
68 | df3 = df2.drop_duplicates('bill_ml_id')
69 |     df3.to_csv('./model_legislation_to_bills_max_score.csv')
70 |     return df3
71 |
72 | # Build the match dataframe and attach bill dates; df3 is used in the ALEC analysis below
73 | df3 = grab_ids_for_data_frame(create_bill_to_bill_matrix(alec_json))
74 | #Analysis of ALEC
75 |
76 | df_alec = df3[(df3.interst_group_id =='alec_bills')|(df3.interst_group_id=='alec_old')]
77 | #eliminate cases where two model legislations influence the same bill
78 | df_alec = df_alec.groupby(['unique_id']).max()
79 | date = df_alec['date_introduced']
80 | df_alec['year_introduced']=date.apply(lambda x:x.year)
81 | #eliminate cases where states may have two identical bills for a given year
82 | df_grouped = df_alec.groupby(['state', 'year_introduced', 'model_legislation_id']).max()
83 | df_grouped.to_csv('./alec_model_legislation_to_bills_max_score_unique.csv')
84 |
85 |
--------------------------------------------------------------------------------
/db/state_bill_mapping.json:
--------------------------------------------------------------------------------
1 | {
2 | "bill_document": {
3 | "dynamic": "false",
4 | "properties": {
5 | "actions": {
6 | "properties": {
7 | "action": {
8 | "type": "string",
9 | "index": "analyzed"
10 | },
11 | "actor": {
12 | "type": "string",
13 | "index": "analyzed"
14 | },
15 | "date": {
16 | "type": "date",
17 | "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd"
18 | },
19 | "type": {
20 | "type": "string",
21 | "index": "not_analyzed"
22 | }
23 | }
24 | },
25 | "bill_document_first": {
26 | "type": "string",
27 | "term_vector": "yes",
28 | "index": "analyzed",
29 | "_size": {
30 | "enabled": true,
31 | "store": true
32 | },
33 | "fields": {
34 | "shingles": {
35 | "type": "string",
36 | "analyzer": "my_shingle_analyzer"
37 | } }
38 | },
39 | "bill_document_last": {
40 | "type": "string",
41 | "term_vector": "yes",
42 | "index": "analyzed",
43 | "_size": {
44 | "enabled": true,
45 | "store": true
46 | },
47 | "fields": {
48 | "shingles": {
49 | "type": "string",
50 | "analyzer": "my_shingle_analyzer"
51 | } }
52 | },
53 | "bill_id": {
54 | "type": "string",
55 | "index": "not_analyzed"
56 | },
57 | "bill_title": {
58 | "type": "string",
59 | "term_vector": "yes",
60 | "index": "analyzed",
61 | "_size": {
62 | "enabled": true,
63 | "store": true
64 | },
65 | "fields": {
66 | "shingles": {
67 | "type": "string",
68 | "analyzer": "my_shingle_analyzer"
69 | } }
70 | },
71 | "bill_type": {
72 | "type": "string",
73 | "index": "not_analyzed"
74 | },
75 | "chamber": {
76 | "type": "string",
77 | "index": "not_analyzed"
78 | },
79 | "summary": {
80 | "type": "string",
81 | "term_vector": "yes",
82 | "index": "analyzed",
83 | "_size": {
84 | "enabled": true,
85 | "store": true
86 | },
87 | "fields": {
88 | "shingles": {
89 | "type": "string",
90 | "analyzer": "my_shingle_analyzer"
91 | } }
92 | },
93 | "short_title": {
94 | "type": "string",
95 | "term_vector": "yes",
96 | "index": "analyzed",
97 | "_size": {
98 | "enabled": true,
99 | "store": true
100 | },
101 | "fields": {
102 | "shingles": {
103 | "type": "string",
104 | "analyzer": "my_shingle_analyzer"
105 | } }
106 | },
107 | "date_created": {
108 | "type": "date",
109 | "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd"
110 | },
111 | "date_updated": {
112 | "type": "date",
113 | "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd"
114 | },
115 | "session": {
116 | "type": "string",
117 | "index": "analyzed"
118 | },
119 | "state": {
120 | "type": "string",
121 | "index": "not_analyzed"
122 | },
123 | "sunlight_id": {
124 | "type": "string",
125 | "index": "not_analyzed"
126 | },
127 | "unique_id": {
128 | "type": "string",
129 | "index": "not_analyzed"
130 | }
131 | }
132 | }
133 | }
--------------------------------------------------------------------------------
/archive/input/bill_metadata.py:
--------------------------------------------------------------------------------
1 |
2 | import psycopg2
3 | from psycopg2.extras import Json
4 | import json
5 | import csv
6 | import os
7 | import re
8 |
9 |
10 |
11 | # GRAB DATABASE INFO FROM default_profile
12 | db_info = []
13 | with open('/home/jwalsh/policy_diffusion/default_profile', 'rb') as db_file:
14 | reader = csv.reader(db_file, delimiter='=', quotechar='"')
15 | for row in reader:
16 | db_info.append(row[1])
17 |
18 |
19 | # CONNECT TO DATABASE
20 | conn = psycopg2.connect(host = db_info[0], database = db_info[1], user = db_info[2], password = db_info[3])
21 | cur = conn.cursor()
22 |
23 |
24 |
25 | # PARSE BILL METADATA FOR DATABASE INSERTION
26 | def parse_bill_metadata(bill_metadata):
27 | bill_id = bill_metadata['bill_id']
28 | title = bill_metadata['title']
29 | if len(bill_metadata['alternate_titles']) > 0:
30 | alternate_titles = Json(bill_metadata['alternate_titles'][0])
31 | else:
32 | alternate_titles = None
33 | if len(bill_metadata['versions']) > 0:
34 | versions = Json(bill_metadata['versions'][0])
35 | else:
36 | versions = None
37 | if 'subjects' in bill_metadata:
38 | if len(bill_metadata['subjects']) > 0:
39 | subjects = bill_metadata['subjects'][0]
40 | else:
41 | subjects = None
42 | else:
43 | subjects = None
44 | if 'scraped_subjects' in bill_metadata:
45 | if len(bill_metadata['scraped_subjects']) > 0:
46 | scraped_subjects = bill_metadata['scraped_subjects'][0]
47 | else:
48 | scraped_subjects = None
49 | else:
50 | scraped_subjects = None
51 | type_ = bill_metadata['type'][0]
52 | if 'level' in bill_metadata:
53 | level = bill_metadata['level']
54 | else:
55 | level = None
56 | if len(bill_metadata['sponsors']) > 0:
57 | sponsors = Json(bill_metadata['sponsors'][0])
58 | else:
59 | sponsors = None
60 | if len(bill_metadata['actions']) > 0:
61 | actions = Json(bill_metadata['actions'][0])
62 | else:
63 | actions = None
64 | if len(bill_metadata['action_dates']) > 0:
65 | action_dates = Json(bill_metadata['action_dates'])
66 | else:
67 | action_dates = None
68 | if len(bill_metadata['documents']) > 0:
69 | documents = Json(bill_metadata['documents'][0])
70 | else:
71 | documents = None
72 | if len(bill_metadata['votes']) > 0:
73 | votes = Json(bill_metadata['votes'][0])
74 | else:
75 | votes = None
76 | id_ = bill_metadata['id']
77 | state = bill_metadata['state']
78 | chamber = bill_metadata['chamber']
79 | session = bill_metadata['session']
80 |
81 | all_ids = bill_metadata['all_ids'][0]
82 | created_at = bill_metadata['created_at']
83 | updated_at = bill_metadata['updated_at']
84 |
85 | return((bill_id, title, alternate_titles, versions, subjects, scraped_subjects,
86 | type_, level, sponsors, actions, action_dates, documents, votes, id_, state,
87 | chamber, session, all_ids, created_at, updated_at))
88 |
89 |
90 |
91 | # GRAB BILL METADATA AND PUSH TO DATABASE
92 | temp_bill_metadata = []
93 | for path, subdirs, files in os.walk(r'/mnt/data/sunlight/openstates_unzipped/bills/'):
94 | for name in files:
95 | directory_file = os.path.join(path, name)
96 | with open(directory_file) as json_file:
97 | bill = json.load(json_file)
98 | parsed_data = parse_bill_metadata(bill)
99 | temp_bill_metadata.append(parsed_data)
100 | if len(temp_bill_metadata) == 10000 or name == files[len(files)-1]:
101 | args_str = ','.join(cur.mogrify("(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)", x) for x in temp_bill_metadata)
102 | cur.execute("INSERT INTO bill_metadata VALUES " + args_str)
103 | conn.commit()
104 | temp_bill_metadata = []
105 |
--------------------------------------------------------------------------------
/archive/prototype_text_alignment_algorithms.py:
--------------------------------------------------------------------------------
1 | from text_alignment import *
2 | from gensim.models import Word2Vec
3 | from evaluation.score_alignments import load_word2vec
4 | from scipy.spatial.distance import cosine
5 |
6 | class Word2VecLocalAligner(LocalAligner):
7 |
8 | def __init__(self,match_score = 3, mismatch_score = -1, gap_score = -2):
9 | LocalAligner.__init__(self, match_score, mismatch_score, gap_score)
10 | self.model = load_word2vec()
11 | self._algorithm_name = 'word2vec_local_alignment'
12 |
13 | def __str__(self):
14 |
15 | name_str = "{0} instance".format(self._algorithm_name)
16 | param_str_1 = "match_score = {0}".format(self.gap_score)
17 | param_str_2 = "mismatch_score = {0}".format(self.match_score)
18 | param_str_3 = "gap_score = {0}".format(self.mismatch_score)
19 | return "{0}: {1}, {2}, {3}".format(name_str,param_str_1,param_str_2,param_str_3)
20 |
21 |
22 | def align(self,left_sections,right_sections):
23 | '''
24 | description:
25 | find alignments between two documents using word2vec
26 | args:
27 | left_sections: a list of lists of words
28 | right_sections: a list of lists of words (usually just a list of a list of words)
29 |
30 | returns:
31 | alignment object
32 | '''
33 |
34 | alignments = []
35 | alignment_indices = []
36 |
37 | for left in left_sections:
38 | for right in right_sections:
39 |
40 | a_ints, b_ints, word_map = self._transform_text(left, right)
41 |
42 | score_matrix, pointer_matrix = self._compute_matrix(a_ints, b_ints,self.match_score,
43 | self.mismatch_score, self.gap_score, self.model)
44 |
45 | l, r, score, align_index = self._backtrace(a_ints, b_ints, score_matrix, pointer_matrix)
46 |
47 | reverse_word_map = {v:k for k,v in word_map.items()}
48 | reverse_word_map["-"] = "-"
49 | l = [reverse_word_map[w] for w in l]
50 | r = [reverse_word_map[w] for w in r]
51 |
52 | alignment_indices.append(align_index)
53 | alignments.append((score, l, r))
54 |
55 | left = reduce(lambda x,y:x+y,left_sections)
56 | right = reduce(lambda x,y:x+y,right_sections)
57 |
58 | return Alignment(left,right,alignments,alignment_indices)
59 |
60 |
61 | @jit
62 | def _compute_matrix(self, left, right, match_score, mismatch_score, gap_score, model):
63 | '''
64 | description:
65 | create matrix of optimal scores
66 | args:
67 | left: an array of integers
68 | right: an array of integers
69 | match_score: score for match in alignment
70 | mismatch_score: score for mismatch in alignment
71 | gap_start: score for first gap
72 | gap_extend: score for every gap
73 | model: word2vec model
74 | returns:
75 | three matrices required to construct optimal solution
76 | '''
77 | m = len(left) + 1
78 | n = len(right) + 1
79 | score_matrix = np.zeros((m, n),dtype = float)
80 | scores = np.zeros((4),dtype = float)
81 | pointer_matrix = np.zeros((m,n),dtype = int)
82 | for i in xrange(1, m):
83 | for j in xrange(1, n):
84 |
85 | if left[i-1] == right[j-1]:
86 | scores[1] = score_matrix[i-1,j-1] + match_score
87 | else:
88 | scores[1] = score_matrix[i-1,j-1] + mismatch_score*cosine(left[i-1], right[j-1])
89 |
90 | scores[2] = score_matrix[i, j - 1] + gap_score
91 |
92 | scores[3] = score_matrix[i - 1, j] + gap_score
93 |
94 | max_decision = np.argmax(scores)
95 |
96 | pointer_matrix[i,j] = max_decision
97 | score_matrix[i,j] = scores[max_decision]
98 |
99 | return score_matrix, pointer_matrix
--------------------------------------------------------------------------------
/archive/input/legislators.py:
--------------------------------------------------------------------------------
1 |
2 | import psycopg2
3 | from psycopg2.extras import Json
4 | import json
5 | import csv
6 | import sys
7 | import re
8 | import os
9 |
10 |
11 |
12 | # GRAB DATABASE INFO FROM default_profile
13 | db_info = []
14 | with open('/home/jwalsh/policy_diffusion/default_profile', 'rb') as db_file:
15 | reader = csv.reader(db_file, delimiter='=', quotechar='"')
16 | for row in reader:
17 | db_info.append(row[1])
18 |
19 |
20 | # CONNECT TO DATABASE
21 | conn = psycopg2.connect(host = db_info[0], database = db_info[1], user = db_info[2], password = db_info[3])
22 | cur = conn.cursor()
23 |
24 |
25 |
26 | # PARSE BILL METADATA FOR DATABASE INSERTION
27 | def parse_legislator_metadata(legislator_metadata):
28 | id_ = legislator_metadata['id']
29 | if 'votesmart_id' in legislator_metadata:
30 | votesmart_id = legislator_metadata['votesmart_id']
31 | else:
32 | votesmart_id = None
33 | if 'transparencydata_id' in legislator_metadata:
34 | transparencydata_id = legislator_metadata['transparencydata_id']
35 | else:
36 | transparencydata_id = None
37 | first_name = legislator_metadata['first_name']
38 | if len(legislator_metadata['middle_name']) > 0:
39 | middle_name = legislator_metadata['middle_name']
40 | else:
41 | middle_name = None
42 | last_name = legislator_metadata['last_name']
43 | if len(legislator_metadata['suffixes']) > 0:
44 | suffixes = legislator_metadata['suffixes']
45 | else:
46 | suffixes = None
47 | full_name = legislator_metadata['full_name']
48 | if 'party' in legislator_metadata:
49 | party = legislator_metadata['party']
50 | else:
51 | party = None
52 | active = legislator_metadata['active']
53 | if 'url' in legislator_metadata:
54 | url = legislator_metadata['url']
55 | else:
56 | url = None
57 | if 'photo_url' in legislator_metadata:
58 | photo_url = legislator_metadata['photo_url']
59 | else:
60 | photo_url = None
61 | if 'office_address' in legislator_metadata:
62 | office_address = legislator_metadata['office_address']
63 | else:
64 | office_address = None
65 | if 'office_phone' in legislator_metadata:
66 | office_phone = legislator_metadata['office_phone']
67 | else:
68 | office_phone = None
69 | leg_id = legislator_metadata['leg_id']
70 | if 'chamber' in legislator_metadata:
71 | chamber = legislator_metadata['chamber']
72 | else:
73 | chamber = None
74 | if 'district' in legislator_metadata:
75 | district = legislator_metadata['district']
76 | else:
77 | district = None
78 | state = legislator_metadata['state']
79 | if len(legislator_metadata['offices']) > 0:
80 | offices = Json(legislator_metadata['offices'][0])
81 | else:
82 | offices = None
83 | if 'email' in legislator_metadata:
84 | email = legislator_metadata['email']
85 | else:
86 | email = None
87 | if len(legislator_metadata['roles']) > 0:
88 | roles = Json(legislator_metadata['roles'][0])
89 | else:
90 | roles = None
91 | if 'old_roles' in legislator_metadata:
92 | old_roles = Json(legislator_metadata['old_roles'])
93 | else:
94 | old_roles = None
95 | all_legislative_ids = legislator_metadata['all_ids'][0]
96 | if 'level' in legislator_metadata:
97 | level = legislator_metadata['level']
98 | else:
99 | level = None
100 | if len(legislator_metadata['sources']) > 0:
101 | sources = Json(legislator_metadata['sources'][0])
102 | else:
103 | sources = None
104 | created_at = legislator_metadata['created_at']
105 | updated_at = legislator_metadata['updated_at']
106 |
107 | return((id_, votesmart_id, transparencydata_id,
108 | first_name, middle_name, last_name, suffixes, full_name,
109 | party, active, url, photo_url, office_address, office_phone,
110 | leg_id, chamber, district, state, offices, email,
111 | roles, old_roles, all_legislative_ids, level, sources,
112 | created_at, updated_at))
113 |
114 |
115 |
116 | # GRAB LEGISLATOR METADATA FROM SUNLIGHT AND PUSH TO DATABASE
117 | temp_legislator_metadata = []
118 | for path, subdirs, files in os.walk(r'/mnt/data/sunlight/openstates_unzipped/legislators/'):
119 | for name in files:
120 | directory_file = os.path.join(path, name)
121 | with open(directory_file) as json_file:
122 | legislator = json.load(json_file)
123 | parsed_data = parse_legislator_metadata(legislator)
124 | temp_legislator_metadata.append(parsed_data)
125 |
126 | args_str = ','.join(cur.mogrify("(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)", x) for x in temp_legislator_metadata)
127 | cur.execute("INSERT INTO legislators VALUES " + args_str)
128 | conn.commit()
129 |
130 |
--------------------------------------------------------------------------------
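
Both this script and archive/exploratory.R read database credentials out of a default_profile
file by splitting each line on '=' and keeping the values in the order host, database, user,
password. That file is not in the repo; a sketch of the format the parsing implies (key names
and values below are invented), together with the same parsing:

    import csv

    # hypothetical default_profile contents, one key=value pair per line
    sample_profile = ['host="db.example.org"',
                      'dbname="policy_diffusion"',
                      'user="jwalsh"',
                      'password="secret"']

    # same parsing as above: split on '=', strip quotes, keep the values in order
    reader = csv.reader(sample_profile, delimiter='=', quotechar='"')
    db_info = [row[1] for row in reader]
    print(db_info)  # ['db.example.org', 'policy_diffusion', 'jwalsh', 'secret']
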
/html/templates/searchdemo.html.jinja:
--------------------------------------------------------------------------------
[Note: the HTML/Jinja markup of this template was lost during extraction; only its visible
text survives: "Search Demo", "Demo", "Legislative Influence Detector — LID",
"Tracing Policy Ideas across Lobbyists and State Legislatures", "http://dssg.uchicago.edu",
"Step 1: Choose the type of documents you'd like to search", and
"Step 4: Scroll potential matches and click on the ones you'd like to investigate.
Green indicates likely matches. Red indicates likely not."]
--------------------------------------------------------------------------------
/archive/score_alignments.py:
--------------------------------------------------------------------------------
1 | '''
2 | Functions for scoring alignments
3 | '''
4 |
5 | from sklearn.feature_extraction.text import TfidfVectorizer
6 | from sklearn.metrics import jaccard_similarity_score
7 | import numpy as np
8 | import scipy as sp
9 | from database import *
10 | from gensim.models import Word2Vec
11 | from utils.general_utils import save_pickle
12 | import json
13 |
14 | def weight_length(alignment, left_length, right_length):
15 | print alignment
16 | return np.sum([a[0]*(len(a[1])/float(left_length))*(len(a[2])/float(right_length)) for a in alignment.alignments])
17 |
18 | def weight_tfidf(alignment, state_tfidf, left_state, right_state):
19 | '''
20 | state_tfidf: dictionary with tfidf scores by state
21 | '''
22 | f = StateTFIDF(state_tfidf)
23 |     return np.sum([np.mean(f.tfidf_score(a[1], a[2], left_state, right_state))*a[0] for a in alignment.alignments])
24 |
25 | def jaccard_coefficient(left, right):
26 | jaccard_scores = jaccard_similarity_score(left,right)
27 | return jaccard_scores
28 |
29 | def load_word2vec():
30 | model = Word2Vec.load_word2vec_format('/mnt/data/sunlight/GoogleNews-vectors-negative300.bin', binary=True)
31 |
32 | return model
33 |
34 | def word2vec_similarity(list_of_alignments, model):
35 | '''
36 | model is word2vec model
37 | '''
38 | distances = []
39 | for alignment in list_of_alignments:
40 | score, left, right = alignment
41 |
42 | word_distance_list = []
43 | for i in range(len(left)):
44 |
45 | if left[i] not in model or right[i] not in model:
46 | continue
47 |
48 | word_distance_list.append(model.similarity(left[i], right[i]))
49 |
50 | distances.append(np.mean(word_distance_list))
51 |
52 | return np.mean(distances)
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 | ####################################################################
62 | ##tfidf functions
63 |
64 | def tfidf_by_state(state, num_bills = 'all'):
65 | '''
66 | description:
67 | create dictionary of tfidf scores for a particular state
68 | args:
69 |         state: state abbreviation; num_bills: number of bills to run the algorithm on
70 | returns:
71 | dictionary of tfidf scores with words as keys
72 | '''
73 | es = ElasticConnection()
74 | state_bills = es.get_bills_by_state(state, num_bills)
75 | corpus = [bill['_source']['bill_document_last'] for bill in state_bills \
76 | if bill['_source']['bill_document_last'] != None]
77 |
78 | vectorizer = TfidfVectorizer()
79 | X = vectorizer.fit_transform(corpus)
80 | idf = vectorizer.idf_
81 | idf = vectorizer._tfidf.idf_
82 |
83 | return dict(zip(vectorizer.get_feature_names(), idf))
84 |
85 |
86 | def tfidf_all_bills():
87 | '''
88 | description:
89 |         create dictionary of tfidf scores over all bills in the index
90 |     args:
91 |         none
92 | returns:
93 | dictionary of tfidf scores with words as keys
94 | '''
95 | es = ElasticConnection()
96 | state_bills = es.get_all_bills()
97 | corpus = [bill['_source']['bill_document_last'] for bill in state_bills \
98 | if bill['_source']['bill_document_last'] != None]
99 |
100 | vectorizer = TfidfVectorizer()
101 | X = vectorizer.fit_transform(corpus)
102 | idf = vectorizer.idf_
103 | idf = vectorizer._tfidf.idf_
104 |
105 | return dict(zip(vectorizer.get_feature_names(), idf))
106 |
107 |
108 | def tfidf_by_all_states():
109 | states = ['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA', 'HI', 'ID', 'IL',
110 | 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO','MT', 'NE',
111 | 'NV', 'NH','NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD',
112 | 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY']
113 | states = map(lambda x : x.lower(), states)
114 |
115 | tfidf = {}
116 | for state in states:
117 | print 'working on ', state
118 | tfidf[state] = tfidf_by_state(state)
119 |
120 | return tfidf
121 |
122 |
123 | ####################################################################
124 | ##state tfidf object
125 | class StateTFIDF():
126 |
127 | def __init__(self, state_tfidf):
128 | self.state_tfidf = state_tfidf
129 |
130 | def find_tfidf(self, word, state):
131 | if state == 'model_legislation':
132 | return 0
133 | elif word == '-' or word not in self.state_tfidf[state]:
134 | return 0
135 | else:
136 | return self.state_tfidf[state][word]
137 |
138 | def tfidf_score(self, left, right, left_state, right_state):
139 | '''
140 |         gives the average tfidf for the left and right components of an alignment
141 | '''
142 | left_scores = []
143 | right_scores = []
144 |
145 | for i in range(len(left)):
146 | left_scores.append(self.find_tfidf(left[i], left_state)) #need function
147 | right_scores.append(self.find_tfidf(right[i], right_state))
148 |
149 |         if left_scores == [] or right_scores == []:
150 | return 0
151 | else:
152 | return np.mean(left_scores), np.mean(right_scores)
153 |
154 |
155 | def tfidf_by_alignments():
156 | alignments = []
157 | with open('bill_to_bill_alignments.txt') as f:
158 | for i,line in enumerate(f):
159 | print 'line ', i
160 | alignments.append(json.loads(line))
161 |     return alignments
162 | if __name__ == "__main__":
163 | tfidf = tfidf_all_bills()
164 | save_pickle(tfidf, 'tfidf_all_bills')
165 |
166 |
167 |
168 |
169 |
--------------------------------------------------------------------------------
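
weight_length above expects an object whose .alignments attribute is a list of
(score, left_tokens, right_tokens) tuples, as produced by the aligners in lid/text_alignment.py;
it rescales each alignment score by how much of each document it covers. A toy invocation
(the namedtuple stand-in and all numbers are made up, and the debug print is omitted):

    import numpy as np
    from collections import namedtuple

    # stand-in for the Alignment objects the aligners return; only .alignments is needed here
    Alignment = namedtuple('Alignment', ['left', 'right', 'alignments'])

    def weight_length(alignment, left_length, right_length):
        # same computation as in score_alignments.py, without the debug print
        return np.sum([a[0] * (len(a[1]) / float(left_length)) * (len(a[2]) / float(right_length))
                       for a in alignment.alignments])

    toy = Alignment(left=None, right=None,
                    alignments=[(12.0, ['the', 'act', 'shall'], ['this', 'act', 'shall'])])
    print(weight_length(toy, left_length=200, right_length=300))  # score scaled by coverage
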
/archive/exploratory.R:
--------------------------------------------------------------------------------
1 | library('RPostgreSQL')
2 | library('ggplot2')
3 |
4 | db_info <- read.csv('policy_diffusion/default_profile', sep='=', header=F, quote='', stringsAsFactors=F)
5 |
6 | # sessions
7 | drv <- dbDriver('PostgreSQL')
8 | con <- dbConnect(drv, user=db_info$V2[3], password=db_info$V2[4],
9 | dbname=db_info$V2[2], host=db_info$V2[1])
10 |
11 | # number of governments
12 | dbGetQuery(con, "SELECT COUNT(*) FROM (SELECT DISTINCT state FROM bill_metadata) AS a;")
13 |
14 | # list the governments
15 | dbGetQuery(con, "SELECT DISTINCT state FROM bill_metadata ORDER BY state;")
16 |
17 | # number of sessions
18 | dbGetQuery(con, "SELECT COUNT(*) FROM (SELECT DISTINCT state, session FROM bill_metadata ORDER BY state, session) AS a;")
19 |
20 | # sessions
21 | dbGetQuery(con, "SELECT DISTINCT state, session FROM bill_metadata ORDER BY session;")
22 |
23 | # oldest session per government
24 | dbGetQuery(con, "SELECT state, MIN(session) AS min_session FROM bill_metadata GROUP BY state ORDER BY state;")
25 |
26 | # newest session per government
27 | dbGetQuery(con, "SELECT state, MAX(session) AS max_session FROM bill_metadata GROUP BY state ORDER BY state;")
28 |
29 | # bills and resolutions by government session
30 | bills_and_resolutions <-
31 | dbGetQuery(con, "SELECT a.state,
32 | a.session,
33 | a.bill_freq,
34 | b.resolution_freq
35 | FROM (SELECT state, session, count(*) as bill_freq FROM bill_metadata WHERE type LIKE '%bill%' GROUP BY state, session) AS a,
36 | (SELECT state, session, count(*) as resolution_freq FROM bill_metadata WHERE type LIKE '%resolution%' GROUP BY state, session) AS b
37 | WHERE a.state = b.state AND
38 | a.session = b.session
39 | ORDER BY bill_freq DESC;")
40 |
41 | br_plt <- ggplot(bills_and_resolutions, aes(bill_freq, resolution_freq))
42 | br_plt + theme(axis.text=element_text(size=18),
43 | axis.title=element_text(size=18,face="bold")) +
44 | ylim(0, max(bills_and_resolutions$bill_freq)) +
45 | geom_point() +
46 | xlab("bills") +
47 | ylab("resolutions") +
48 | geom_abline(intercept=0, slope=1) +
49 | geom_text(data=subset(bills_and_resolutions, bill_freq > 5000),
50 | aes(bill_freq, resolution_freq, label=toupper(state)),
51 | vjust=-.5, size=8) +
52 | geom_text(data=subset(bills_and_resolutions, bill_freq < resolution_freq & bill_freq > 100),
53 | aes(bill_freq, resolution_freq, label=toupper(state)),
54 | vjust=-.5, size=8)
55 |
56 |
57 | # how many bills Sunlight scraped from each government after the second
58 | # year it started scraping that government
59 | bills_by_state_year <-
60 | dbGetQuery(con, "SELECT UPPER(c.state) as state,
61 | EXTRACT(YEAR FROM c.created_at) AS year,
62 | COUNT(*) AS freq
63 | FROM bill_metadata AS c,
64 | -- find minimum year
65 | (SELECT a.state,
66 | MIN(a.year) AS min_year
67 | FROM (SELECT state,
68 | EXTRACT(YEAR FROM created_at) AS year
69 | FROM bill_metadata) AS a
70 | GROUP BY state) as b
71 | WHERE c.state = b.state AND
72 | EXTRACT(YEAR FROM created_at) >= b.min_year
73 | GROUP BY c.state,
74 | EXTRACT(YEAR FROM c.created_at)
75 | ORDER BY c.state,
76 | EXTRACT(YEAR FROM c.created_at);")
77 |
78 | # we're missing data for some states in some years
79 | dbGetQuery(con, "SELECT c.state,
80 | c.year - 1 AS missing_year
81 | FROM (SELECT *,
82 | b.year - lag(b.year) OVER w AS gap
83 | FROM (SELECT a.state,
84 | a.year,
85 | COUNT(*)
86 | FROM (SELECT state,
87 | EXTRACT(YEAR FROM created_at) AS year
88 | FROM bill_metadata) AS a
89 | GROUP BY a.state,
90 | a.year
91 | ORDER BY a.state,
92 | a.year) AS b
93 | WINDOW w AS (ORDER BY b.state, b.year)) AS c
94 | WHERE c.gap > 1;")
95 |
96 | missing_values <- data.frame(state = c('MT', 'ND', 'NV', 'TX', 'TX'),
97 | year = c(2014, 2014, 2012, 2012, 2014),
98 | freq = rep(0,5))
99 | bills_by_state_year <- rbind(bills_by_state_year, missing_values)
100 | bills_by_state_year <- bills_by_state_year[ order(bills_by_state_year$state, bills_by_state_year$year), ]
101 |
102 | # New Jersey 2012 is wrong. Subtract 2013 number from total here: http://www.njleg.state.nj.us/bills/BillsByNumber.asp
103 | bills_by_state_year$freq[ bills_by_state_year$state == 'NJ' & bills_by_state_year$year == 2012 ] <- 6808
104 |
105 | sy_plt <- ggplot(bills_by_state_year, aes(year, freq, color=state))
106 | sy_plt + theme(legend.position="none",
107 | axis.text=element_text(size=18),
108 | axis.title=element_text(size=18,face="bold")) +
109 | geom_line(size=2) +
110 | ylab("frequency") +
111 | geom_text(data=data.frame(state=c('NJ', 'TX', 'NJ', 'NY', 'IL', 'TX'),
112 | year=c(2012, 2013, 2014, 2014, 2015.05, 2015),
113 | freq=c(6850, 11700, 7500, 13200, 7000, 10000)),
114 | aes(x=year, y=freq, label=state),
115 | vjust=-.5, size=7)
116 |
--------------------------------------------------------------------------------
/lid/etl/extractors.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 | from bs4 import BeautifulSoup
3 | from state_bill_extractors import bill_text_extractor
4 | import os
5 | import codecs
6 | import argparse
7 | import re
8 | import base64
9 | import json
10 | from tika import parser as tp
11 | import traceback
12 | import logging
13 | from config import DATA_PATH
14 |
15 | try:
16 | from os import scandir, walk
17 | except ImportError:
18 | from scandir import scandir, walk
19 |
20 |
21 |
22 | def get_first_and_last_bill_documents(json_obj):
23 | state_code = json_obj['state']
24 |
25 | bill_documents = []
26 | for v in range(2):
27 |
28 | try:
29 | bill_document = base64.b64decode(json_obj['versions'][v]['bill_document'])
30 | except:
31 | bill_documents.append(None)
32 | continue
33 |
34 | try:
35 | mimetype = json_obj['versions'][v]['mimetype']
36 |
37 | except KeyError:
38 | mimetype = json_obj['versions'][v]['+mimetype']
39 |
40 | url = json_obj['versions'][v]['url']
41 | # try to extract text with bill-specific extractor
42 | bill_text = bill_text_extractor(state_code, bill_document, mimetype, url)
43 |
44 | # if fails then try tika extractor as backup
45 | if not bill_text or len(bill_text) < 1000:
46 |
47 | try:
48 | bill_text = tp.from_buffer(bill_document)['content']
49 | #if extraction results in short text, most likely a fail
50 | if len(bill_text) < 1000:
51 | bill_text = None
52 | except Exception:
53 | bill_text = None
54 |
55 |
56 | bill_documents.append(bill_text)
57 |
58 | return bill_documents
59 |
60 |
61 |
62 | # extracts text from bill documents fetched from sunlight
63 | # and constructs new json obj with selected meta-data
64 | def extract_bill_document(bill_file_path):
65 | try:
66 |
67 | bill_dict = {}
68 | data_dict = json.loads(open(bill_file_path).read())
69 |
70 | #test whether a document is a bill or resolution
71 | bill_text_count = [1 for x in data_dict['type'] if "bill" in x.lower()]
72 |         good_bill_prefixes = ["A","AJ", "AJR","CACR","HB","S","HJR","ACA","HF","SF","HJ","SJ",
73 | "HJRCA","SJRCA","HSB","IP","LB","SB","SCA","SP"]
74 | if sum(bill_text_count) < 1 and data_dict['bill_id'].split()[0] not in good_bill_prefixes:
75 | return
76 |
77 |
78 |
79 |
80 | # extract first and last versions of bill document
81 | # and add to json dict
82 | bill_document_first, bill_document_last = get_first_and_last_bill_documents(data_dict)
83 | bill_dict['bill_document_first'] = bill_document_first
84 | bill_dict['bill_document_last'] = bill_document_last
85 |
86 | if bill_document_first == None or bill_document_last == None:
87 | logging.warning("failed to extract text for {0}".format(bill_file_path))
88 |
89 | else:
90 | logging.info("successfully extracted text for {0}".format(bill_file_path))
91 |
92 | # assign attributes that will be used
93 | bill_id = re.sub("\s+", "", data_dict['bill_id'])
94 | bill_dict['unique_id'] = "{0}_{1}_{2}".format(data_dict['state'], data_dict['session'], bill_id)
95 | bill_dict['bill_id'] = data_dict['bill_id']
96 | bill_dict['date_updated'] = data_dict['updated_at']
97 | bill_dict['session'] = data_dict['session']
98 | bill_dict['sunlight_id'] = data_dict['id']
99 | bill_dict['bill_title'] = data_dict['title']
100 | bill_dict['bill_type'] = data_dict['type']
101 | bill_dict['state'] = data_dict['state']
102 | bill_dict['chamber'] = data_dict['chamber']
103 | bill_dict['date_created'] = data_dict['created_at']
104 | bill_dict['actions'] = data_dict['actions']
105 | bill_dict['action_dates'] = data_dict['action_dates']
106 | bill_dict['date_introduced'] = data_dict['action_dates']['first']
107 | bill_dict['date_signed'] = data_dict['action_dates']['signed']
108 |
109 |
110 |             if "short_title" in data_dict.keys():
111 | bill_dict['short_title'] = data_dict['short_title']
112 | elif "+short_title" in data_dict.keys():
113 | bill_dict['short_title'] = data_dict['+short_title']
114 |
115 | else:
116 | bill_dict['short_title'] = None
117 |
118 | if "summary" in data_dict.keys():
119 | bill_dict['summary'] = data_dict['summary']
120 | else:
121 | bill_dict['summary'] = None
122 |
123 | return bill_dict
124 | except (KeyboardInterrupt, SystemExit):
125 | raise
126 | except Exception as e:
127 | trace_message = re.sub("\n+", "\t", traceback.format_exc())
128 | trace_message = re.sub("\s+", " ", trace_message)
129 | trace_message = "<<{0}>>".format(trace_message)
130 | m = "Failed to extract document for {0}: {1}".format(bill_file_path, trace_message)
131 | logging.error(m)
132 |
133 | if __name__ == "__main__":
134 |     parser = argparse.ArgumentParser(description='Extract text from scraped bill documents.')
135 |     parser.add_argument('command', help='command to run, options are: extract_bills')
136 | parser.add_argument('--data_path', dest='data_path', help="file path of data to be indexed ")
137 |
138 | args = parser.parse_args()
139 |
140 | #extracts text from bill documents and populates a json file with a json_object per row
141 | if args.command == "extract_bills":
142 | #configure logging
143 | logging.getLogger('tp').setLevel(logging.ERROR)
144 | logging.getLogger('requests').setLevel(logging.ERROR)
145 | logging.basicConfig(filename=os.environ['POLICY_DIFFUSION'] + '/logs/state_bill_extractor.log',
146 | level=logging.DEBUG)
147 |
148 | bill_files = []
149 | for dirname, dirnames, filenames in walk(args.data_path):
150 | for filename in filenames:
151 | bill_files.append(os.path.join(dirname, filename))
152 |
153 | outFile = codecs.open("{0}/extracted_data/extracted_bills.json".format(DATA_PATH), 'w')
154 | for i, bill_file in enumerate(bill_files):
155 | bill_json_obj = extract_bill_document(bill_file)
156 |
157 | outFile.write("{0}\n".format(json.dumps(bill_json_obj)))
158 |
159 | outFile.close()
160 |
161 |
162 |
163 | ##extracts text from model legislation
164 | def extract_model_legislation(json_file, encoded):
165 | '''
166 | Keyword Args:
167 | json_file: corresponds to json file with model legislation
168 | encoded: True/False if json file is b64 encoded
169 |
170 | returns:
171 | dictionary with url, date, and text of model legislation
172 |     description:
173 | extract text from model legislation
174 | '''
175 | data = []
176 | with open(json_file) as f:
177 | for line in f:
178 | data.append(json.loads(line))
179 |
180 | model_legislation = {}
181 | for i in range(len(data)):
182 | model_legislation[i] = data[i]
183 |
184 | if encoded == True:
185 | for i in range(len(model_legislation)):
186 | try:
187 | ml = model_legislation[i]['source']
188 | ml = base64.b64decode(ml)
189 | ml = tp.from_buffer(ml)
190 | model_legislation[i]['source'] = ml['content']
191 | except AttributeError:
192 | model_legislation[i]['source'] = None
193 | return model_legislation
194 |
195 | else:
196 | return model_legislation
197 |
198 |
199 |
200 |
--------------------------------------------------------------------------------
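
extract_model_legislation above expects a file with one JSON object per line, each carrying a
(typically base64-encoded) 'source' field of the kind written by bill_source_to_json in
lid/utils/general_utils.py. A small sketch of producing such a file (the file name, URL and
document body are invented):

    import base64
    import json

    record = {'url': 'http://example.org/model_bill.html',
              'date': '2015-07-01',
              'source': base64.b64encode(b'<html><body>Section 1. ...</body></html>').decode('ascii')}

    with open('model_legislation_sample.json', 'w') as f:
        f.write(json.dumps(record) + '\n')

    # extract_model_legislation('model_legislation_sample.json', encoded=True) would then
    # base64-decode each 'source', run Tika over it, and return the extracted text per record
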
/archive/tfidf_ranking.py:
--------------------------------------------------------------------------------
1 | from sklearn.feature_extraction.text import TfidfVectorizer
2 | import numpy as np
3 | import pickle
4 | from alignment_evaluation import *
5 | from database import *
6 | import time
7 |
8 | def calc_tfidf_alignments(alignments_list):
9 | '''
10 | arg:
11 | list of alignment objects
12 | returns:
13 | dictionary with tfi_idf scores
14 | '''
15 | corpus = [alignment[1] + alignment[2] \
16 | for alignments in alignments_list for alignment in alignments ]
17 | corpus = [' '.join(doc) for doc in corpus]
18 | vectorizer = TfidfVectorizer()
19 | X = vectorizer.fit_transform(corpus)
20 | idf = vectorizer.idf_
21 | idf = vectorizer._tfidf.idf_
22 | return dict(zip(vectorizer.get_feature_names(), idf))
23 |
24 |
25 | def rank_alignments(alignments_list):
26 | '''
27 | arg:
28 | list of alignment objects
29 | returns:
30 | list of alignment objects sorted by averaged tfi_idf score
31 | '''
32 | tfidf = calc_tfidf_alignments(alignments_list)
33 |
34 | not_in_dict = 0
35 | in_dict = 0
36 |
37 | alignments_tfidf = []
38 | for alignments in alignments_list:
39 | tfidf_scores = []
40 | for alignment in alignments:
41 | print alignment
42 |             for word in alignment[1]:
43 |                 if word.lower() in tfidf:
44 |                     tfidf_scores.append(tfidf[word.lower()])
45 |                     in_dict += 1
46 |                 if word != '-' and word.lower() not in tfidf:
47 |                     not_in_dict += 1
48 |             for word in alignment[2]:
49 |                 if word.lower() in tfidf:
50 |                     tfidf_scores.append(tfidf[word.lower()])
51 |                     in_dict += 1
52 |                 if word != '-' and word.lower() not in tfidf:
53 |                     not_in_dict += 1
54 | if tfidf_scores != []:
55 | alignments_tfidf.append((alignments, np.sum(tfidf_scores)))
56 | else:
57 | alignments_tfidf.append((alignments, 0))
58 |
59 | print "num not in dict: ", not_in_dict
60 | print "in dict: ", in_dict
61 |
62 | alignments_tfidf.sort(key = lambda x: x[1], reverse=True)
63 |
64 | return alignments_tfidf
65 |
66 |
67 | def tfidf_by_state(state, num_bills = 'all'):
68 | '''
69 | description:
70 |         create dictionary of tfidf scores for a particular state
71 |     args:
72 |         state: state abbreviation
73 |         num_bills: number of bills to run the algorithm on
74 | returns:
75 | dictionary of tfidf scores with words as keys
76 | '''
77 | es = ElasticConnection()
78 | state_bills = es.get_bills_by_state(state, num_bills)
79 | corpus = [bill['_source']['bill_document_last'] for bill in state_bills \
80 | if bill['_source']['bill_document_last'] != None]
81 |
82 | vectorizer = TfidfVectorizer()
83 | X = vectorizer.fit_transform(corpus)
84 | idf = vectorizer.idf_
85 | idf = vectorizer._tfidf.idf_
86 |
87 | return dict(zip(vectorizer.get_feature_names(), idf))
88 |
89 |
90 | def tfidf_by_all_states():
91 | states = ['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA', 'HI', 'ID', 'IL',
92 | 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO','MT', 'NE',
93 | 'NV', 'NH','NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD',
94 | 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY']
95 | states = map(lambda x : x.lower(), states)
96 |
97 | tfidf = {}
98 | for state in states:
99 | print 'working on ', state
100 | tfidf[state] = tfidf_by_state(state)
101 |
102 | return tfidf
103 |
104 |
105 | ####################################################################
106 | ##state tfidf object
107 | class StateTFIDF():
108 |
109 | def __init__(self, state_tfidf):
110 | self.state_tfidf = state_tfidf
111 |
112 | def find_tfidf(self, word, state):
113 | if state == 'model_legislation':
114 | return 0
115 | elif word == '-' or word not in self.state_tfidf[state]:
116 | return 0
117 | else:
118 | return self.state_tfidf[state][word]
119 |
120 | def tfidf_score(self, alignment_with_state):
121 | scores = []
122 | print 'alignment_with_state: ', alignment_with_state
123 | raw_input("Press Enter to continue...")
124 | alignment, left_state, right_state = alignment_with_state
125 | score, left, right = alignment[0] #TODO: make work for more than one alignment
126 |
127 | for i in range(len(left)):
128 | scores.append(self.find_tfidf(left[i], left_state)) #need function
129 | scores.append(self.find_tfidf(right[i], right_state))
130 |
131 | if scores == []:
132 | return 0
133 | else:
134 | return np.mean(scores)
135 |
136 |
137 | ####################################################################
138 | ##ranking functions
139 | def rank(alignments_list, functions):
140 |     '''
141 |     depending on the function used, alignments_list may contain the states of the alignments or not
142 |     '''
143 |     ranking = []
144 |     #keep track of the maximum value of each scoring function for normalization
145 |     max_function_values = np.zeros(len(functions))
146 |     for alignments in alignments_list:
147 |         scores = []
148 |         for i in range(len(functions)):
149 |             function = functions[i]
150 |             output = function(alignments)
151 |             scores.append(output)
152 | 
153 |             if max_function_values[i] < output:
154 |                 max_function_values[i] = output
155 | 
156 |         ranking.append((alignments, scores))
157 | 
158 |     final_ranking = []
159 |     for alignments, scores in ranking:
160 |         rank_value = []
161 |         scores_max = zip(scores, max_function_values)
162 | 
163 |         for score, maxim in scores_max:
164 |             rank_value.append(score / float(maxim))
165 | 
166 |         final_ranking.append((alignments[0][0], np.mean(rank_value)))
167 | 
168 |     final_ranking.sort(key = lambda x: x[1], reverse=True)
169 | 
170 |     return final_ranking
171 |
172 |
173 | def inspect_ranking(ranking):
174 | for alignments, tfidf in ranking:
175 | score, left, right = alignments
176 | for i in range(len(left)):
177 | print left[i], right[i]
178 | print 'alignment score: ', score
179 | print 'mean tfidf: ', tfidf
180 | raw_input("Press Enter to continue...")
181 | print '\n'
182 |
183 |
184 |
185 | if __name__ == '__main__':
186 |
187 |
188 | # tfidf = calc_tfidf(alignments_list)
189 |
190 | # alignments_tfidf = rank_alignments(alignments_list)
191 |
192 | # print 'testing speed of calculating tfidf per state'
193 |
194 | # t1 = time.time()
195 | # t=tfidf_state('al')
196 | # print 'alabama time: {0} seconds'.format(time.time()-t1)
197 |
198 | # t1 = time.time()
199 | # t=tfidf_state('ny')
200 | # print 'new york time: {0} seconds'.format(time.time()-t1)
201 |
202 | # print 'calculate tfidf by state...'
203 |
204 | # tfidf = tfidf_by_all_states()
205 |
206 | # with open('state_tfidfs.p', 'wb') as fp:
207 | # pickle.dump(tfidf, fp)
208 |
209 | print 'loading experiment and building alignment list...'
210 | with open('experiment.p', 'rb') as fp:
211 | e = pickle.load(fp)
212 |
213 | alignments_list = []
214 | for key, value in e.results.iteritems():
215 | i, j = key
216 | state_i = e.bills[i]['state']
217 | state_j = e.bills[j]['state']
218 | alignments_list.append((value['alignments'], state_i, state_j))
219 |
220 |
221 | with open('state_tfidfs.p', 'rb') as fp:
222 | tfidf = pickle.load(fp)
223 | f = StateTFIDF(tfidf)
224 |
225 | print 'calculating ranking...'
226 | ranking = rank(alignments_list, [f.tfidf_score])
227 | inspect_ranking(ranking)
228 |
229 |
--------------------------------------------------------------------------------
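
The tfidf_* helpers in this file and in archive/score_alignments.py all follow the same pattern:
fit a TfidfVectorizer over a corpus and zip the vocabulary with vectorizer.idf_ into a
word-to-idf dictionary (the second assignment from vectorizer._tfidf.idf_ is redundant, since
idf_ is the public attribute). A self-contained sketch of that pattern on a toy corpus
(requires scikit-learn of the vintage used here, which still has get_feature_names):

    from sklearn.feature_extraction.text import TfidfVectorizer

    corpus = ['an act relating to voter identification',
              'an act relating to school appropriations']

    vectorizer = TfidfVectorizer()
    vectorizer.fit_transform(corpus)

    # word -> inverse document frequency, as consumed by StateTFIDF.find_tfidf
    idf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
    print(idf['act'] < idf['voter'])  # True: shared words get a lower idf than distinctive ones
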
/archive/classifier.py:
--------------------------------------------------------------------------------
1 | from alignment_evaluation import alignment_features
2 | import numpy as np
3 | import nltk
4 | from sklearn import linear_model
5 | from sklearn.metrics import confusion_matrix, accuracy_score
6 |
7 | from score_alignments import StateTFIDF
8 | import json
9 | import argparse
10 | import os
11 | from database import ElasticConnection
12 | import random
13 | import codecs
14 | from utils.general_utils import alignment_tokenizer
15 | from utils.general_utils import UnicodeWriter
16 | from sklearn.metrics import jaccard_similarity_score
17 |
18 |
19 | def construct_training_set(alignments_file,out_file_name):
20 | """
21 | Args:
22 | alignments_file (file) -- file containing sample alignments
23 |
24 | out_file_name (string) -- name of training data file to write to
25 |
26 | Returns:
27 | None
28 | """
29 | ec = ElasticConnection(host= "54.203.12.145")
30 |
31 | training_examples = []
32 | for i,x in enumerate(alignments_file):
33 | json_obj = json.loads(x.strip())
34 |
35 | if "alignment_results" not in json_obj.keys():
36 | continue
37 |
38 | left_doc_id = json_obj['query_document_id']
39 | left_bill_title = ec.get_bill_by_id(left_doc_id)['bill_title']
40 |
41 | left_doc = json_obj['query_document']
42 | left_doc = reduce(lambda x,y:x+y,left_doc)
43 |
44 | left_doc_length = len(left_doc.split())
45 |
46 | for i,alignment_doc in enumerate(json_obj['alignment_results']):
47 |
48 | right_doc_id = alignment_doc['document_id']
49 | right_bill_title = ec.get_bill_by_id(right_doc_id)['bill_title']
50 |
51 | for alignment in alignment_doc['alignments']:
52 |
53 | left = alignment['left']
54 | right = alignment['right']
55 | left_start = alignment['left_start']
56 | right_start = alignment['right_start']
57 | left_end = alignment['left_end']
58 | right_end = alignment['right_end']
59 | score = alignment['score']
60 | training_examples.append([left_doc_id,right_doc_id,left_doc_length,left_start,right_start,left_end,
61 | right_end,score,left_bill_title,right_bill_title,
62 | " ".join(left)," ".join(right)])
63 |
64 |
65 | random.shuffle(training_examples)
66 |
67 | header = ["left_doc_id","right_doc_id","left_doc_length","left_start","right_start","left_end",
68 | "right_end","score","left_bill_title","right_bill_title","left","right"]
69 |
70 |
71 | k = 500
72 | with codecs.open(out_file_name, 'wb') as output_file:
73 | writer = UnicodeWriter(output_file, header)
74 | writer.writerow(header)
75 | for l in training_examples[0:k]:
76 | l = [unicode(x) for x in l]
77 | writer.writerow(l)
78 |
79 |
80 | return
156 |
157 | def features_matrix(alignment):
158 | right = alignment['right']
159 | left = alignment['left']
160 |     features = alignment_features(left, right)
161 |     features['left_tfidf'], features['right_tfidf'] = s.tfidf_score(left, right)
162 | features['score'] = alignment['score']
163 | features['label'] = alignment['label']
164 |
165 | return features
166 |
167 | def evaluate_model():
168 | data = list_alignments
169 | featuresets = [features_matrix(alignment) for alignment in data]
170 |
171 | data_list = [[value['avg_consec_match_length'], value['avg_gap_length_l'],
172 | value['avg_gap_length_r'], value['jaccard_score'],
173 | value['length'], value['num_gaps_l'], value['num_gaps_r'],
174 | value['num_matches'], value['num_mismatches'],
175 | value['score'], value['label']] for value in featuresets]
176 |
177 | alignment_data = np.array(data_list)
178 | alignment_y=alignment_data[:,-1]
179 | alignment_X=alignment_data[:,:-1]
180 |
181 | # A random permutation, to split the data randomly
182 | np.random.seed(0)
183 | indices = np.random.permutation(len(alignment_X))
184 | train_n = 5
185 | alignment_X_train = alignment_X[indices[:-train_n]]
186 | alignment_y_train = alignment_y[indices[:-train_n]]
187 | alignment_X_test = alignment_X[indices[-train_n:]]
188 | alignment_y_test = alignment_y[indices[-train_n:]]
189 |
190 | # Create and fit a logistic regression
191 | logistic = linear_model.LogisticRegression(C=1e5)
192 | logistic.fit(alignment_X_train, alignment_y_train)
193 | y_pred = logistic.predict(alignment_X_test)
194 |
195 | #Calculate accuracy
196 | accuracy_score(alignment_y_test, y_pred)
197 | cm = confusion_matrix(alignment_y_test, y_pred)
198 |
199 |
200 |
201 | def main():
202 | parser = argparse.ArgumentParser(description='Classifier to label aligned text as "substantive" ')
203 | parser.add_argument('command',
204 | help='command to run, options are: construct_training_set,train_model,evaluate_model')
205 | parser.add_argument('--alignment_samples_doc', dest='alignment_samples',
206 | help="file path to the alignment samples used to construct training set ")
207 | args = parser.parse_args()
208 |
209 | if args.command == "construct_training_set":
210 | construct_training_set(open(args.alignment_samples),
211 | os.environ['POLICY_DIFFUSION']+"/data/classifier/alignments_training_set.csv")
212 | elif args.command == "train_model":
213 | pass
214 | elif args.command == "evaluate_model":
215 | pass
216 | else:
217 | print args
218 | print "command not recognized, please enter construct_training_set,train_model,evaluate_model"
219 |
220 |
221 | if __name__ == "__main__":
222 | main()
223 |
224 |
225 |
226 |
227 |
228 |
229 |
230 |
231 |
232 |
233 |
234 |
235 |
--------------------------------------------------------------------------------
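
evaluate_model above flattens each labeled alignment into a numeric feature row, holds out the
last train_n shuffled rows (despite the name, train_n is the size of the held-out test set), and
scores a logistic regression on them. A self-contained version of that split-and-score step on
synthetic data (features and labels below are random placeholders):

    import numpy as np
    from sklearn import linear_model
    from sklearn.metrics import accuracy_score, confusion_matrix

    np.random.seed(0)
    X = np.random.rand(50, 10)                 # 50 alignments x 10 features (placeholders)
    y = (X[:, 0] + X[:, 1] > 1).astype(int)    # placeholder "substantive" labels

    indices = np.random.permutation(len(X))
    test_n = 5
    X_train, y_train = X[indices[:-test_n]], y[indices[:-test_n]]
    X_test, y_test = X[indices[-test_n:]], y[indices[-test_n:]]

    logistic = linear_model.LogisticRegression(C=1e5)
    logistic.fit(X_train, y_train)
    y_pred = logistic.predict(X_test)
    print(accuracy_score(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
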
/lid/utils/text_cleaning.py:
--------------------------------------------------------------------------------
1 | '''
2 | Clean text in ElasticSearch
3 | '''
4 |
5 | import elasticsearch
6 | import re
7 | import string
8 | import urllib2
9 | from elasticsearch import Elasticsearch
10 | from pprint import pprint
11 | import nltk
12 |
13 | #custom modules
14 | #from database import ElasticConnection
15 |
16 | def clean_text(text, lower = True):
17 | '''
18 |     variables:
19 |         text: string corresponding to text of bill
20 |         lower: if True, lowercase the text before cleaning
21 | 
22 |     returns:
23 |         string that is cleaned up text
24 |     description:
25 | clean text
26 | '''
27 | #make text lowercase
28 | if lower == True:
29 | text = text.lower()
30 |
31 | text = re.sub('\n[ ]*[0-9]+', '', text)
32 | text = re.sub('[ ]{2,}', u' ', text)
33 |
34 | #parse by line
35 | text_list = text.splitlines()
36 |
37 | #replace funky symbols and multiple new lines
38 | ntext_list = []
39 | for line in text_list:
40 | line = line.replace(u'\xa0', u' ')
41 | line = line.replace(u'>>', u' ')
42 | line = line.replace(u'\xa7', u' ')
43 | line = line.replace(u'\xe2', u' ')
44 | line = line.replace(u'\u201c', u' ')
45 | line = line.replace(u'\u201d', u' ')
46 | line = line.replace(u'\xbb', u' ')
47 | line = line.replace(u'\xa9', u' ')
48 | line = line.replace(u' ,', u',')
49 | line = line.replace(u'{ font-family: courier, arial, sans-serif; font-size: 10pt; } table { empty-cells:show; }', u' ')
50 | line = re.sub( '\s+', u' ', line)
51 | ntext_list.append(line)
52 | return (string.join(ntext_list, '\n'))
53 |
54 |
55 |
56 |
57 |
58 |
59 | def split_to_sections(cleantext,state):
60 | '''
61 | variables:
62 | cleantext: clean version of text of bill
63 | state: abbreviation of state ID
64 |
65 | returns:
66 | list of bill sections
67 |     description:
68 | splits bill text into sections
69 | '''
70 | if state == 'ak':
71 | chunked_list = cleantext.split("\n*")
72 | elif state in ('al','ar','mt','or','ri'):
73 | chunked_list = cleantext.split('\nsection')
74 | elif state in ('nm','tx'):
75 | chunked_list = cleantext.split('\n section')
76 | elif state in ('az','ia','nv', 'wa', 'vt'):
77 | chunked_list = cleantext.split('\nsec.')
78 | elif state in ('me', 'mi'):
79 | chunked_list = cleantext.split('\n sec.')
80 | elif state == 'co':
81 | chunked_list = re.split('[[0-9][0-9]\.section|[0-9]\.section', cleantext)
82 | elif state in ('de','fl','tn'):
83 | chunked_list = re.split('section\s[0-9][0-9]\.|section\s[0-9]\.', cleantext)
84 | elif state == 'ga':
85 | cleantext = re.sub('[0-9][0-9]\\n|[0-9]\\n', ' ', cleantext)
86 | chunked_list = re.split('\\nsection\s[0-9][0-9]|\\nsection\s[0-9]', cleantext)
87 | elif state in ('hi','sd','in'):
88 | chunked_list = re.split('\\n\ssection\s[0-9][0-9]\.|\\n\ssection\s[0-9]', cleantext)
89 | elif state == 'pa':
90 | chunked_list = re.split('section\s[0-9][0-9]\.|section\s[0-9]\.', cleantext)
91 | elif state in ('id', 'la', 'md', 'nd'):
92 | chunked_list = re.split('\\nsection\s[0-9][0-9]\.|\\nsection\s[0-9]\.', cleantext)
93 | elif state == 'il':
94 | cleantext = re.sub('\\n\s[0-9][0-9]|\\n\s[0-9]', ' ', cleantext)
95 | chunked_list = re.split('\\n\s\ssection\s', cleantext)
96 | elif state == 'sc':
97 | chunked_list = cleantext.split('\n \n')
98 | elif state == 'ks':
99 | chunked_list = re.split('\\nsection\s|sec\.', cleantext)
100 | elif state in ('ne', 'mn'):
101 | chunked_list = re.split('\ssection\s[0-9]\.|\ssec.\s[0-9][0-9]\.|\ssec.\s[0-9]\.', cleantext)
102 | elif state == 'ky':
103 | chunked_list = cleantext.split('\n\n\n section .')
104 | elif state == 'ms':
105 | chunked_list = cleantext.split('\n\n\n section ')
106 | elif state in ('ma', 'nc', 'oh','ut'):
107 | chunked_list = re.split('\ssection\s[0-9][0-9]\.|\ssection\s[0-9]\.', cleantext)
108 | elif state == 'mo':
109 | chunked_list = re.split('\\n\s[0-9][0-9]\.\s|\\n\s[0-9]\.\s', cleantext)
110 | elif state == 'nh':
111 | chunked_list = re.split('\n\n[0-9][0-9]\s|\n\n[0-9]\s', cleantext)
112 | elif state == 'nj':
113 | chunked_list = re.split('\\n\\n\s[0-9][0-9]\.\s|\\n\\n\s[0-9]\.\s', cleantext)
114 | elif state == 'ny':
115 | chunked_list = re.split('\ssection\s[0-9]\.|\.\ss\s[0-9]\.', cleantext)
116 | elif state == 'ok':
117 | chunked_list = re.split('\nsection\s\.\s', cleantext)
118 | elif state == 'va':
119 | chunked_list = re.split('(([A-Z])|[0-9][0-9])\.\s|(([A-Z])|[0-9])\.\s', cleantext)
120 | elif state == 'wi':
121 | chunked_list = re.split('\\n[0-9][0-9]section\s\\n|\\n[0-9]section\s\\n', cleantext)
122 | elif state == 'wv':
123 | chunked_list = re.split('\n\s\([a-z]\)\s', cleantext)
124 | elif state == 'wy':
125 | chunked_list = re.split('\ssection\s[0-9][0-9]\.|\ssection\s[0-9]\.', cleantext)
126 | elif state == 'ca':
127 | chunked_list = re.split('section\s[0-9]\.|sec.\s[0-9][0-9]\.|sec.\s[0-9]\.', cleantext)
128 | elif state == None:
129 | chunked_list = cleantext.split("\n")
130 | else:
131 | chunked_list = cleantext.split("\n")
132 |
133 | return chunked_list
134 |
135 | #Delete empty sections (run before deleting numbers in lines)
136 | def delete_empty_sections(chunked_list):
137 | '''
138 |     description: deletes empty elements in bills
139 | '''
140 | return [x for x in chunked_list if x is not None and len(x)>2]
141 |
142 | #Need to delete number lines for: OR, OK, NE, PA (run before deleting lines)
143 | def delete_numbers_in_lines (chunked_list):
144 | '''
145 |     description:
146 |         cleans pdf extractor errors where line numbers were included in the text
147 | '''
148 | re_string = '\\n\s[0-9][0-9]|\\n[0-9][0-9]|\\n[0-9]|\\n\s[0-9]'
149 | chunked_list = [re.sub(re_string,'',t) for t in chunked_list]
150 | return chunked_list
151 |
152 |
153 |
154 | #Delete multiple new lines for each section
155 | def delete_lines (chunked_list):
156 | '''
157 | description: deletes multiple lines and spaces for each section
158 | '''
159 | chunked_list = [re.sub( '\s+', ' ', x) for x in chunked_list]
160 | return chunked_list
161 |
162 |
163 |
164 | def clean_document(doc_text,doc_type = "text",split_to_section = False,**kwargs):
165 | """text -- document text
166 | doc_type --- the type of the document ( "state_bill", "model_legislation", "None") """
167 |
168 | if doc_type == "state_bill":
169 | doc_text = clean_text(doc_text)
170 | doc_text_sections = split_to_sections(doc_text,kwargs['state_id'])
171 | doc_text_sections = delete_empty_sections(doc_text_sections)
172 | if kwargs['state_id'] in ['or','ok','ne','pa']:
173 | doc_text_sections = delete_numbers_in_lines(doc_text_sections)
174 | doc_text_sections = delete_lines(doc_text_sections)
175 |
176 | elif doc_type == "model_legislation":
177 | doc_text = clean_text(doc_text)
178 | doc_text_sections = doc_text.split('\nsection')
179 | doc_text_sections = delete_empty_sections(doc_text_sections)
180 | doc_text_sections = delete_lines(doc_text_sections)
181 |
182 | elif doc_type == "text":
183 | doc_text = clean_text(doc_text)
184 | doc_text_sections = doc_text.split('\n')
185 | doc_text_sections = delete_empty_sections(doc_text_sections)
186 | doc_text_sections = delete_lines(doc_text_sections)
187 |
188 | if split_to_section == True:
189 | return doc_text_sections
190 | elif split_to_section == False:
191 | return [" ".join(doc_text_sections)]
192 |
193 | #delete boiler plate present in all alec exposed bills after "effective date"
194 | def delete_boiler_plate_alec_exposed (chunked_list):
195 | chunked_list = [re.sub('({effective date).*$', ' ', x) for x in chunked_list]
196 | chunked_list = chunked_list[1:]
197 | return chunked_list
198 |
200 | #good example is test_clean_text('va')
200 |
201 | def test_clean_text(state):
202 | es = Elasticsearch(['54.203.12.145:9200', '54.203.12.145:9200'], timeout=300)
203 | match = es.search(index="state_bills", body={"query": {"match": {'state': state}}})
204 | state_text = match['hits']['hits'][3]['_source']['bill_document_first']
205 |     cleaned_doc = clean_document(state_text,doc_type = "state_bill",state_id = state,split_to_section = False)
206 | return cleaned_doc
207 |
208 | def main():
209 | #Get data from elasticsearch to test
210 |
211 | print test_clean_text("mi")
212 |
213 | if __name__ == "__main__":
214 | main()
215 |
216 |
217 |
218 |
219 |
--------------------------------------------------------------------------------
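
clean_document above is the entry point the rest of the pipeline uses: it lowercases and
de-noises the raw text, applies the state-specific section splitting in split_to_sections, and
either returns the list of sections or rejoins them into one string. A small usage sketch
(the bill snippet is invented; this assumes the repo's Python 2 environment with the lid
package on the path, as in frontend.py):

    from utils.text_cleaning import clean_document

    raw_bill = ("SECTION 1. Short title.\n"
                "  2 This Act may be cited as the Example Act.\n"
                "SEC. 2. Definitions.\n"
                "  3 In this Act, the term state means any state.")

    # state-bill cleaning with Michigan's section-splitting rules, keeping the sections
    sections = clean_document(raw_bill, doc_type="state_bill", state_id="mi", split_to_section=True)

    # generic cleaning, flattened back into a single string
    flat = clean_document(raw_bill, doc_type="text", split_to_section=False)[0]
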
/lid/utils/general_utils.py:
--------------------------------------------------------------------------------
1 | import ujson
2 | import base64
3 | import urllib2
4 | import socket
5 | from ftplib import FTP, error_perm
6 | import re
7 | from StringIO import StringIO
8 | import time
9 | import multiprocessing
10 | import pickle
11 | import multiprocessing as mp
12 | import gc
13 | import signal
14 | import csv
15 | import codecs
16 | import cStringIO
17 |
18 | #######Code from http://www.filosophy.org/post/32/python_function_execution_deadlines__in_simple_examples/ #########
19 |
20 | class TimedOutExc(Exception):
21 | pass
22 |
23 | def deadline(timeout, *args):
24 |
25 | def decorate(f):
26 | def handler(signum, frame):
27 | raise TimedOutExc()
28 |
29 |         def new_f(*args):
30 | 
31 |             signal.signal(signal.SIGALRM, handler)
32 |             signal.alarm(timeout)
33 |             result = f(*args)
34 |             signal.alarm(0)
35 |             return result
36 | new_f.__name__ = f.__name__
37 | return new_f
38 | return decorate
39 |
40 | #######Code from http://www.filosophy.org/post/32/python_function_execution_deadlines__in_simple_examples/ #########
41 |
42 | class UTF8Recoder:
43 | """
44 | Iterator that reads an encoded stream and reencodes the input to UTF-8
45 | """
46 | def __init__(self, f, encoding):
47 | self.reader = codecs.getreader(encoding)(f)
48 |
49 | def __iter__(self):
50 | return self
51 |
52 | def next(self):
53 | return self.reader.next().encode("utf-8")
54 |
55 |
56 | class UnicodeReader():
57 | """
58 | A CSV reader which will iterate over lines in the CSV file "f",
59 | which is encoded in the given encoding.
60 | """
61 |
62 | def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
63 | f = UTF8Recoder(f, encoding)
64 | self.reader = csv.reader(f, dialect=dialect, **kwds)
65 |
66 | def next(self):
67 | row = self.reader.next()
68 | return [unicode(s, "utf-8") for s in row]
69 |
70 | def __iter__(self):
71 | return self
72 |
73 |
74 |
75 |
76 | class UnicodeWriter():
77 | def __init__(self, f, dialect=csv.excel, encoding="utf-8-sig", **kwds):
78 | self.queue = cStringIO.StringIO()
79 | self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
80 | self.stream = f
81 | self.encoder = codecs.getincrementalencoder(encoding)()
82 | def writerow(self, row):
83 | '''writerow(unicode) -> None
84 | This function takes a Unicode string and encodes it to the output.
85 | '''
86 | self.writer.writerow([s.encode("utf-8") for s in row])
87 | data = self.queue.getvalue()
88 | data = data.decode("utf-8")
89 | data = self.encoder.encode(data)
90 | self.stream.write(data)
91 | self.queue.truncate(0)
92 |
93 | def writerows(self, rows):
94 | for row in rows:
95 | self.writerow(row)
96 |
97 | #********DEPRECATED*************
98 | class WorkerPool():
99 |
100 | def __init__(self,num_workers=1,worker_timeout = 600):
101 |
102 | self._num_workers = num_workers
103 | self._worker_timeout = worker_timeout
104 | self._results = mp.Queue()
105 | self._pool = [None]*self._num_workers
106 | self._worker_times = [0.0]*self._num_workers
107 |
108 | def _assign_new_task(self,worker_id,input_args):
109 | p = self._pool[worker_id]
110 | p.join()
111 | arg = input_args.pop()
112 | new_p = mp.Process(target= func,args = (arg,self._results),name = ('process_'+str(worker_id)))
113 | new_p.start()
114 | self._pool[worker_id] = new_p
115 | self._worker_times[worker_id] = time.time()
116 |
117 | def work(self,func,input_args):
118 | worker_counter = 0
119 | #define wrapper function that queues result from input func
120 | def new_func(x):
121 | y = func(*x)
122 | self._results.put(y)
123 |
124 |
125 | while len(input_args) > 0 or ("running" in status):
126 |
127 | #assign new worker tasks to empty pool slots
128 | for i in range(self._num_workers):
129 |
130 | if len(input_args) > 0 and self._pool[i] is None:
131 | arg = input_args.pop(0)
132 | new_p = mp.Process(target= new_func,args = (arg,),name = ('process_'+str(i)))
133 | new_p.start()
134 | print worker_counter
135 | worker_counter+=1
136 | self._pool[i] = new_p
137 | self._worker_times[i] = time.time()
138 |
139 | time.sleep(0.1)
140 | status = self.check_pool_status(time.time())
141 | import numpy as np
142 | print time.time() - np.array(self._worker_times)
143 | for i in range(len(status)):
144 | if status[i] == "completed":
145 | p = self._pool[i]
146 | p.terminate()
147 | p.join()
148 | self._pool[i] = None
149 | del p
150 | elif status[i] == "timeout":
151 | p = self._pool[i]
152 | p.terminate()
153 | self._pool[i] = None
154 | print "terminated job ",p.name
155 | gc.collect()
156 |
157 | result_list = []
158 |
159 | while not self._results.empty():
160 | result_list.append( self._results.get() )
161 |
162 | return result_list
163 |
164 | #returns a list of bools indicating running status of each worker.
165 | #running,timeout,completed
166 | def check_pool_status(self,current_time):
167 | status_list = []
168 | for i in range(self._num_workers):
169 |
170 | worker = self._pool[i]
171 | if worker is None:
172 | status_list.append("closed")
173 |             elif worker.is_alive() and (current_time-self._worker_times[i]) < self._worker_timeout:
174 |                 status_list.append("running")
175 |             elif worker.is_alive() and (current_time-self._worker_times[i]) >= self._worker_timeout:
176 |                 status_list.append("timeout")
177 | elif not worker.is_alive():
178 | status_list.append("completed")
179 |
180 | return status_list
181 | # ********DEPRECATED*************
182 |
183 |
184 | def alignment_tokenizer(s,type = "space"):
185 | if type == "space":
186 | s = s.split(" ")
187 | return s
188 |
189 | #creates a searalized json object for bill sources
190 | def bill_source_to_json(url,source,date):
191 | jsonObj = {}
192 | jsonObj['url'] = url
193 | jsonObj['date'] = date
194 | jsonObj['source'] = base64.b64encode(source)
195 |
196 | return ujson.encode(jsonObj)
197 |
198 | #creates a json object for bill sources (not encoded)
199 | def bill_source_to_json_not_encoded(url,source,date):
200 | jsonObj = {}
201 | jsonObj['url'] = url
202 | jsonObj['date'] = date
203 | jsonObj['source'] = source
204 |
205 | return ujson.encode(jsonObj)
206 |
207 | #wrapper for urllib2.urlopen that catches URLERROR and socket error
208 | def fetch_url(url):
209 |
210 | #fetch ftp file
211 | if 'ftp://' in url:
212 |
213 | try:
214 | domain_pattern = re.compile("/[A-Za-z0-9\.]+")
215 | domain_name = domain_pattern.search(url).group(0)[1:]
216 | ftp = FTP(domain_name,timeout=10)
217 | ftp.login()
218 | file_name = "/".join(url.split("/")[3:])
219 |
220 | r = StringIO()
221 | ftp.retrbinary('RETR {0}'.format(file_name), r.write)
222 | document = r.getvalue()
223 | time.sleep(1)
224 |
225 | except (KeyboardInterrupt, SystemExit):
226 | raise
227 | except:
228 | document = None
229 |
230 |
231 | return document
232 |
233 | #fetch http file
234 | else:
235 |
236 | try:
237 | req = urllib2.urlopen(url,timeout=10)
238 | document = req.read()
239 | except (KeyboardInterrupt, SystemExit):
240 | raise
241 | except:
242 | document = None
243 |
244 | return document
245 |
246 | #used to find alignments in broader text
247 | def find_subsequence(s,q):
248 | '''
249 | is the list s contained in q in order and if it is what are indices
250 | '''
251 | for i in range(len(q)):
252 | T = True
253 | for j in range(len(s)):
254 | if s[j] != q[i+j]:
255 | T = False
256 | break
257 | if T:
258 | return (i, i + j + 1)
259 | return (0,0)
260 |
261 |
262 | def load_pickle(name):
263 | with open('{0}.p'.format(name),'rb') as fp:
264 | f =pickle.load(fp)
265 |
266 | return f
267 |
268 |
269 | def save_pickle(thing, name):
270 | with open('{0}.p'.format(name),'wb') as fp:
271 | pickle.dump(thing, fp)
272 |
--------------------------------------------------------------------------------
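
Two helpers above are used throughout the repo: the deadline decorator, which arms SIGALRM and
raises TimedOutExc if the wrapped call runs past its time limit, and find_subsequence, which
locates an aligned token list inside a longer document. A quick sketch of both (slow_job and the
token lists are made up; SIGALRM only exists on Unix, and the module itself is Python 2):

    import time
    from utils.general_utils import deadline, TimedOutExc, find_subsequence

    @deadline(2)                      # raise TimedOutExc if the call takes longer than 2 seconds
    def slow_job(seconds):
        time.sleep(seconds)
        return "finished"

    try:
        slow_job(5)
    except TimedOutExc:
        print("timed out")

    # where does the aligned snippet sit inside the full token list?
    doc = ['the', 'act', 'shall', 'take', 'effect', 'immediately']
    print(find_subsequence(['shall', 'take', 'effect'], doc))   # (2, 5)
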
/lid/frontend.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import os
3 | import pdb
4 | import sys
5 | import argparse
6 | import datetime as dt
7 | import time
8 | from collections import defaultdict
9 | import cherrypy
10 | from jinja2 import Environment, FileSystemLoader, Template
11 | import random
12 | import string
13 | import json
14 | from elasticsearch import Elasticsearch
15 | from database import ElasticConnection
16 | import re
17 | import nltk
18 | from utils.text_cleaning import clean_document
19 | from lid import LID
20 | from utils.general_utils import alignment_tokenizer
21 | from text_alignment import LocalAligner,AffineLocalAligner
22 |
23 |
24 |
25 | def get_alignment_highlight(text1,text2):
26 | aligns = align(text1, text2)
27 | alignment = aligns[0]
28 | seq1 = nltk.word_tokenize(text1)
29 | seq2 = nltk.word_tokenize(text2)
30 | align_clean_1, align_clean_2 = cleanAlignment(alignment)
31 | [i,j] = contains(align_clean_1, seq1)
32 | [k,l] = contains(align_clean_2, seq2)
33 | seq1.insert(i,"")
34 | seq1.insert(j,"")
35 | seq2.insert(k,"")
36 | seq2.insert(l,"")
37 |
38 | text1 = " ".join(seq1)
39 | text2 = " ".join(seq2)
40 |
41 | return text1,text2
42 |
43 |
44 |
45 | def markup_alignment_for_display(alignment_dict,left_text,right_text):
46 |
47 | left_text = left_text.split()
48 | right_text = right_text.split()
49 | l = alignment_dict['left']
50 | r = alignment_dict['right']
51 | left_start = alignment_dict['left_start']
52 | left_end = alignment_dict['left_end']
53 | right_start = alignment_dict['right_start']
54 | right_end = alignment_dict['right_end']
55 |
56 |
57 |
58 | #mark up l and r alignments with style
59 | l_styled = []
60 | r_styled = []
61 | temp_text = ""
62 | for i in range(len(l)):
63 | if l[i] == r[i] and l[i] != "-":
64 | temp_text+=l[i]
65 | temp_text+=" "
66 | if l[i] != r[i]:
67 | if len(temp_text)>0:
68 | temp_text = u"{0}".format(temp_text)
69 | l_styled.append(temp_text)
70 | r_styled.append(temp_text)
71 | temp_text = ""
72 | if l[i] != "-" and r[i] != "-":
73 | l_styled.append(u"{0}".format(l[i]))
74 | r_styled.append(u"{0}".format(r[i]))
75 | else:
76 | l_styled.append(l[i])
77 | r_styled.append(r[i])
78 |
79 | temp_text = u"{0}".format(temp_text)
80 | l_styled.append(temp_text)
81 | r_styled.append(temp_text)
82 |
83 | #l[i] = "{0}".format(l[i])
84 | #r[i] = "{0}".format(r[i])
85 |
86 | #l.insert(0,"")
87 | #l.append("")
88 | #r.insert(0,"")
89 | #r.append("")
90 |
91 |     padding = [u""]
92 |
93 | left_text = left_text[:left_start]+padding+l_styled+\
94 | padding+left_text[left_end:]
95 |
96 | right_text = right_text[:right_start]+padding+r_styled+padding\
97 | +right_text[right_end:]
98 |
99 | left_text = u" ".join(left_text)
100 | right_text = u" ".join(right_text)
101 |
102 | return left_text,right_text
103 |
104 |
105 |
106 |
107 | def markup_alignment_difference(l,r):
108 | l_styled = []
109 | r_styled = []
110 | temp_text = ""
111 | for i in range(len(l)):
112 | if l[i] != r[i]:
113 | l[i] = u"{0}".format(l[i])
114 | r[i] = u"{0}".format(r[i])
115 |
116 | return l,r
117 |
118 |
119 | class DemoWebserver(object):
120 |
121 | _cp_config = {
122 | 'tools.staticdir.on' : True,
123 | 'tools.staticdir.dir' : "{0}/html".format(os.environ['POLICY_DIFFUSION']),
124 | 'tools.staticdir.index' : '/templates/searchdemo.html.jinja',
125 | 'tools.sessions.on': True,
126 | }
127 |
128 |
129 |
130 | def __init__(self,elastic_connection):
131 | self.ec = elastic_connection
132 | self.lidy = LID(elastic_host = os.environ['ELASTICSEARCH_IP'],
133 | query_results_limit=os.environ['QUERY_RESULTS_LIMIT'])
134 |
135 | self.aligner = LocalAligner()
136 | #self.query_bill = "bill"
137 |
138 |
139 | @cherrypy.expose
140 | def searchdemo(self, query_string="proof of identity", query_bill = "bill", query_results=[]):
141 |
142 | query_string = re.sub('\"',' ',query_string)
143 |
144 | if query_bill == "model legislation":
145 |
146 | query_result = lidy.find_model_legislation_alignments(query_string, document_type="text",
147 | split_sections=False, query_document_id="front_end_query")
148 |
149 | results_to_show = []
150 |
151 | for result_doc in query_result['alignment_results']:
152 |
153 | meta_data = result_doc['document_id'].replace('old_bills', 'oldbills').split('_')
154 | meta_data = [meta_data[0].upper(),meta_data[1].upper(),meta_data[2]]
155 |
156 | result_text = ec.get_model_legislation_by_id(result_doc['document_id'])['source']
157 | result_text = re.sub('\"',' ',result_text)
158 |
159 | alignment = result_doc['alignments'][0]
160 | score = alignment['score']
161 |
162 | left,right = markup_alignment_for_display(alignment,
163 | query_string, result_text)
164 | left = re.sub('\"',' ',left)
165 | right = re.sub('\"',' ',right)
166 | results_to_show.append([score] + meta_data + [left,right])
167 |
168 | results_to_show.sort(key = lambda x:x[0],reverse = True)
169 |
170 | tmpl = env.get_template("searchdemo.html.jinja")
171 | c = {
172 | 'query_string': query_string,
173 | 'results_to_show': results_to_show,
174 | }
175 | return tmpl.render(**c)
176 |
177 |
178 | if query_bill == "constitution":
179 |
180 | query_result = constitution_lidy.find_constitution_alignments(query_string, document_type="text",
181 | split_sections=True, query_document_id="text")
182 |
183 | results_to_show = []
184 |
185 | for result_doc in query_result['alignment_results']:
186 |
187 | state = result_doc['document_id'][:-5].upper()
188 | year = result_doc['document_id'][-4:]
189 | meta_data = ["CONSTITUTION", state, year]
190 |
191 | result_text = ec.get_constitution_by_id(result_doc['document_id'])['constitution']
192 | result_text = re.sub('\"',' ',result_text)
193 | print result_text
194 |
195 | alignment = result_doc['alignments'][0]
196 | score = alignment['score']
197 |
198 | left,right = markup_alignment_for_display(alignment,
199 | query_string, result_text)
200 | left = re.sub('\"',' ',left)
201 | right = re.sub('\"',' ',right)
202 | results_to_show.append([score] + meta_data + [left,right])
203 |
204 | results_to_show.sort(key = lambda x:x[0],reverse = True)
205 |
206 | tmpl = env.get_template("searchdemo.html.jinja")
207 | c = {
208 | 'query_string': query_string,
209 | 'results_to_show': results_to_show,
210 | }
211 | return tmpl.render(**c)
212 |
213 |
214 | else:
215 | query_result = lidy.find_state_bill_alignments(query_string, document_type="text",
216 | split_sections=False, query_document_id="front_end_query")
217 |
218 | results_to_show = []
219 |
220 | for result_doc in query_result['alignment_results']:
221 |
222 | meta_data = result_doc['document_id'].split("_")
223 | meta_data = [meta_data[0].upper(),meta_data[1].upper(),meta_data[2]]
224 |
225 | result_text = ec.get_bill_by_id(result_doc['document_id'])['bill_document_last']
226 | result_text = re.sub('\"',' ',result_text)
227 |
228 | alignment = result_doc['alignments'][0]
229 | score = alignment['score']
230 |
231 | left,right = markup_alignment_for_display(alignment,
232 | query_string,result_text)
233 | left = re.sub('\"',' ',left)
234 | right = re.sub('\"',' ',right)
235 | results_to_show.append([score] + meta_data + [left,right])
236 |
237 | results_to_show.sort(key = lambda x:x[0],reverse = True)
238 |
239 | tmpl = env.get_template("searchdemo.html.jinja")
240 | c = {
241 | 'query_string': query_string,
242 | 'results_to_show': results_to_show,
243 | }
244 | return tmpl.render(**c)
245 |
246 |
247 |
248 | if __name__ == '__main__':
249 | policy_diffusion_path=os.environ['POLICY_DIFFUSION']
250 | ec_ip = os.environ['ELASTICSEARCH_IP']
251 | parser = argparse.ArgumentParser()
252 | parser.add_argument('--host', type=str, default='0.0.0.0')
253 | parser.add_argument('--port', type=int, default=29010)
254 | parser.add_argument('--elasticsearch_connection',default=u"{0}:9200".format(ec_ip))
255 | args = parser.parse_args()
256 |
257 | env = Environment(loader=FileSystemLoader("{0}/html/templates".format(policy_diffusion_path)))
258 |
259 | query_samples = [x.strip() for x in open("{0}/data/state_bill_samples.txt".format(policy_diffusion_path))]
260 |
261 | aligner = AffineLocalAligner(match_score=4, mismatch_score=-1, gap_start=-3, gap_extend=-1.5)
262 |
263 | ec = ElasticConnection(host = ec_ip)
264 |
265 | lidy = LID(query_results_limit=20, elastic_host=ec_ip,
266 | lucene_score_threshold=0.01, aligner=aligner)
267 |
268 | constitution_lidy = LID(query_results_limit=10000,
269 | elastic_host=ec_ip, lucene_score_threshold=0.01,
270 | aligner=aligner)
271 |
272 |
273 | es_host,es_port = args.elasticsearch_connection.split(":")
274 | cherrypy.config.update({'server.socket_port': args.port, 'server.socket_host': args.host})
275 | cherrypy.quickstart(DemoWebserver(ec), "/")
276 |
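# Minimal usage sketch (assumes the server above is running on the default
# host/port and that the `requests` package is installed; neither assumption
# is part of this module):
#
#   import requests
#   resp = requests.get("http://localhost:29010/searchdemo",
#                       params={"query_string": "proof of identity",
#                               "query_bill": "bill"})
#   # resp.text is the rendered searchdemo.html.jinja page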
--------------------------------------------------------------------------------
/lid/evaluation/bills_for_evaluation_set.py:
--------------------------------------------------------------------------------
1 | from elasticsearch import Elasticsearch
2 | import re
3 | import csv
4 | import urllib2
5 | import urllib
6 | from urllib import urlopen
7 | from tika import parser
8 | import pickle
9 |
10 |
11 | def create_bills(ls):
12 | '''
13 | args:
14 | ls: list of lists of urls that correspond to matches
15 |
16 | returns:
17 | dictionary grouped by matches
18 | '''
19 | k = 0
20 | bill_id = 0
21 | bills = {}
22 | bad_count = 0
23 | for urls in ls:
24 | for url,state in urls:
25 | try:
26 | print "bill_id: " + str(bill_id)
27 | bills[bill_id] = {}
28 | doc = urllib2.urlopen(url).read()
29 | text = parser.from_buffer(doc)['content']
30 | bills[bill_id]['url'] = url
31 | bills[bill_id]['text'] = text
32 | bills[bill_id]['match'] = k
33 | bills[bill_id]['state'] = state
34 | except:
35 | pass
36 | bad_count += 1
37 | print 'bad_count: ', bad_count
38 | bill_id += 1
39 | k += 1
40 |
41 | #get more evaluation bills
42 | eval_bills = grab_more_eval_bills()
43 | for more_bills in eval_bills:
44 | print 'bill_group: ', k
45 | k +=1
46 | for text, state in more_bills:
47 | bill_id += 1
48 | print 'bill_id: ', bill_id
49 |
50 | bills[bill_id] = {}
51 | bills[bill_id]['text'] = text
52 | bills[bill_id]['state'] = state
53 | bills[bill_id]['match'] = k
54 |
55 | try:
56 | for bill in bills.keys():
57 | if bills[bill] == {} or bills[bill]['text'] == '' \
58 | or bills[bill]['text'] == None:
59 |
60 | del bills[bill]
61 | except:
62 | pass
63 |
64 | return bills
65 |
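# Shape of the dictionary returned by create_bills (field names taken from the
# assignments above; the values shown are placeholders, not real data):
#
#   {0: {'url': 'http://...', 'text': '...extracted bill text...',
#        'match': 0, 'state': 'az'},
#    1: {...},
#    ...}
#
# 'match' is the group id shared by bills that are known matches of one another.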
66 | def get_bill_by_id(unique_id):
67 | es = Elasticsearch(['54.203.12.145:9200', '54.203.12.145:9200'], timeout=300)
68 | match = es.search(index="state_bills", body={"query": {"match": {'unique_id': unique_id}}})
69 | bill_text = match['hits']['hits'][0]['_source']['bill_document_first']
70 | return bill_text
71 |
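# Usage sketch: unique_id follows the "<state>_<session>_<billnumber>" pattern
# assembled in grab_more_eval_bills below, e.g. (hypothetical id)
#
#   text = get_bill_by_id('mo_2011_SB213')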
72 | def grab_more_eval_bills():
73 | with open('../../data/evaluation_set/bills_for_evaluation_set.csv') as f:
74 | bills_list = [row for row in csv.reader(f.read().splitlines())]
75 |
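# Column layout of bills_for_evaluation_set.csv (see data/evaluation_set):
# 0 topic, 1 state, 2 bill number, 3 year/session, 4 sponsor, 5 status, 6 url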
76 | bill_ids_list = []
77 | url_lists = []
78 | topic_list = []
79 | for i in range(len(bills_list)):
80 | state = bills_list[i][1]
81 | if state == 'ct':
82 | continue
83 | topic = bills_list[i][0]
84 | bill_number = bills_list[i][2]
85 | bill_number = re.sub(' ', '', bill_number)
86 | year = bills_list[i][3]
87 | url = bills_list[i][6]
88 | unique_id = str(state + '_' + year + '_' + bill_number)
89 | topic_list.append(topic)
90 | bill_ids_list.append(unique_id)
91 | url_lists.append(url)
92 |
93 | bills_ids = zip(bill_ids_list, url_lists)
94 |
95 | bad_count = 0
96 | bills_text, state_list = [], []
97 | kept_topics = []
98 | for i in range(len(bills_ids)):
99 | try:
100 | bill_text = get_bill_by_id(bills_ids[i][0])
101 | except IndexError:
102 | try:
103 | url = bills_ids[i][1]
104 | doc = urllib.urlopen(url).read()
105 | bill_text = parser.from_buffer(doc)['content']
106 | print url
107 | except IOError:
108 | bad_count += 1
109 | print 'bad_count: ', bad_count
110 | #skip this case
111 | continue
112 | bills_text.append(bill_text)
113 | state_list.append(bills_ids[i][0][0:2])
114 | kept_topics.append(topic_list[i])
115 |
116 | bills_state = zip(bills_text, state_list, kept_topics)
117 |
118 | bill_type_1 = []
119 | bill_type_2 = []
120 | for bill in bills_state:
121 | if bill[-1] == 'Adult Guardianship and Protective Proceedings Jurisdiction Act':
122 | bill_type_1.append((bill[0],bill[1]))
123 | else:
124 | bill_type_2.append((bill[0],bill[1]))
125 |
126 | return [bill_type_2, bill_type_1]
127 |
128 | def create_save_bills(bill_list):
129 | bills = create_bills(bill_list)
130 | with open('../../data/evaluation_set/labeled_bills.p', 'wb') as fp:
131 | pickle.dump(bills, fp)
132 |
133 | return bills
134 |
135 |
136 | if __name__ == '__main__':
137 | #each list in this list of lists contains bills that are matches
138 | similar_bills = [[('http://www.azleg.gov/legtext/52leg/1r/bills/hb2505p.pdf', 'az'),
139 | ('http://www.legis.state.ak.us/basis/get_bill_text.asp?hsid=SB0012B&session=29', 'ak' ),
140 | ('http://www.capitol.hawaii.gov/session2015/bills/HB9_.PDF', 'hi'),
141 | ('http://www.capitol.hawaii.gov/session2015/bills/HB1047_.PDF', 'hi'),
142 | ('http://flsenate.gov/Session/Bill/2015/1490/BillText/Filed/HTML','fl'),
143 | ('http://ilga.gov/legislation/fulltext.asp?DocName=09900SB1836&GA=99&SessionId=88&DocTypeId=SB&LegID=88673&DocNum=1836&GAID=13&Session=&print=true','il'),
144 | ('http://www.legis.la.gov/Legis/ViewDocument.aspx?d=933306', 'la'),
145 | ('http://mgaleg.maryland.gov/2015RS/bills/sb/sb0040f.pdf', 'md'),
146 | ('http://www.legislature.mi.gov/documents/2015-2016/billintroduced/House/htm/2015-HIB-4167.htm', 'mi'),
147 | ('https://www.revisor.mn.gov/bills/text.php?number=HF549&version=0&session=ls89&session_year=2015&session_number=0','mn'),
148 | ('http://www.njleg.state.nj.us/2014/Bills/A2500/2354_R2.HTM','nj'),
149 | ('http://assembly.state.ny.us/leg/?sh=printbill&bn=A735&term=2015','ny'),
150 | ('http://www.ncga.state.nc.us/Sessions/2015/Bills/House/HTML/H270v1.html','nc'),
151 | ('https://olis.leg.state.or.us/liz/2015R1/Downloads/MeasureDocument/HB2005/A-Engrossed','or'),
152 | ('https://olis.leg.state.or.us/liz/2015R1/Downloads/MeasureDocument/SB947/Introduced','or'),
153 | ('http://www.legis.state.pa.us/CFDOCS/Legis/PN/Public/btCheck.cfm?txtType=HTM&sessYr=2015&sessInd=0&billBody=H&billTyp=B&billNbr=0624&pn=0724', 'pa'),
154 | ('http://www.scstatehouse.gov/sess121_2015-2016/prever/172_20141203.htm','sc'),
155 | ('http://lawfilesext.leg.wa.gov/Biennium/2015-16/Htm/Bills/House%20Bills/1356.htm', 'wa'),
156 | ('http://www.legis.state.wv.us/Bill_Status/bills_text.cfm?billdoc=hb2874%20intr.htm&yr=2015&sesstype=RS&i=2874','wv'),
157 | ('http://www.legis.state.wv.us/Bill_Status/bills_text.cfm?billdoc=hb2874%20intr.htm&yr=2015&sesstype=RS&i=2874', 'wv'),
158 | # ('ftp://ftp.cga.ct.gov/2015/tob/h/2015HB-06784-R00-HB.htm','ct'),
159 | ('http://www.capitol.hawaii.gov/session2015/bills/SB129_.PDF','hi'),
160 | ('http://nebraskalegislature.gov/FloorDocs/104/PDF/Intro/LB493.pdf', 'ne'),
161 | ('http://www.gencourt.state.nh.us/legislation/2015/HB0600.html', 'nh')],
162 | [('http://alecexposed.org/w/images/2/2d/7K5-No_Sanctuary_Cities_for_Illegal_Immigrants_Act_Exposed.pdf', 'model_legislation'),
163 | ('http://www.kslegislature.org/li_2012/b2011_12/measures/documents/hb2578_00_0000.pdf', 'ks'),
164 | ('http://flsenate.gov/Session/Bill/2011/0237/BillText/Filed/HTML','fl'),
165 | ('http://openstates.org/al/bills/2012rs/SB211/','al'),
166 | ('http://le.utah.gov/~2011/bills/static/HB0497.html','ut'),
167 | ('http://webserver1.lsb.state.ok.us/cf_pdf/2013-14%20FLR/HFLR/HB1436%20HFLR.PDF','ok')],
168 | [('http://www.alec.org/model-legislation/the-disclosure-of-hydraulic-fracturing-fluid-composition-act/', 'model_legislation'),
169 | ('ftp://ftp.legis.state.tx.us/bills/82R/billtext/html/house_bills/HB03300_HB03399/HB03328S.htm', 'tx')],
170 | [('http://www.legislature.mi.gov/(S(ntrjry55mpj5pv55bv1wd155))/documents/2005-2006/billintroduced/House/htm/2005-HIB-5153.htm', 'mi'),
171 | ('http://www.schouse.gov/sess116_2005-2006/bills/4301.htm','sc'),
172 | ('http://www.lrc.ky.gov/record/06rs/SB38.htm', 'ky'),
173 | ('http://www.okhouse.gov/Legislation/BillFiles/hb2615cs%20db.PDF', 'ok'),
174 | ('http://state.tn.us/sos/acts/105/pub/pc0210.pdf', 'tn'),
175 | ('https://docs.legis.wisconsin.gov/2011/related/proposals/ab69', 'wi'),
176 | ('http://legisweb.state.wy.us/2008/Enroll/HB0137.pdf', 'wy'),
177 | ('http://www.kansas.gov/government/legislative/bills/2006/366.pdf', 'ks'),
178 | ('http://billstatus.ls.state.ms.us/documents/2006/pdf/SB/2400-2499/SB2426SG.pdf', 'mi')],
179 | [('http://www.alec.org/model-legislation/state-withdrawal-from-regional-climate-initiatives/', 'model_legislation'),
180 | ('http://www.legislature.mi.gov/documents/2011-2012/resolutionintroduced/House/htm/2011-HIR-0134.htm', 'mi'),
181 | ('http://www.nmlegis.gov/Sessions/11%20Regular/memorials/house/HJM024.html', 'nm')],
182 | [('http://alecexposed.org/w/images/9/90/7J1-Campus_Personal_Protection_Act_Exposed.pdf', 'model_legislation'),
183 | ('ftp://ftp.legis.state.tx.us/bills/831/billtext/html/house_bills/HB00001_HB00099/HB00056I.htm', 'tx')],
184 | # [
185 | # ('http://essexuu.org/ctstat.html', 'ct'), we don't have Connecticut
186 | # ('http://alisondb.legislature.state.al.us/alison/codeofalabama/constitution/1901/CA-170364.htm', 'al')],
187 | [('http://www.legis.state.ak.us/basis/get_bill_text.asp?hsid=HB0162A&session=27', 'ak'),
188 | ('https://legiscan.com/AL/text/HB19/id/327641/Alabama-2011-HB19-Enrolled.pdf', 'al'),
189 | ('http://www.leg.state.co.us/clics/clics2012a/csl.nsf/fsbillcont3/0039C9417C9D9D5D87257981007F3CC9?open&file=1111_01.pdf', 'co'),
190 | ('http://www.capitol.hawaii.gov/session2012/Bills/HB2221_.PDF', 'hi'),
191 | ('http://ilga.gov/legislation/fulltext.asp?DocName=09700HB3058&GA=97&SessionId=84&DocTypeId=HB&LegID=60409&DocNum=3058&GAID=11&Session=&print=true', 'il'),
192 | ('http://coolice.legis.iowa.gov/Legislation/84thGA/Bills/SenateFiles/Introduced/SF142.html', 'ia'),
193 | ('ftp://www.arkleg.state.ar.us/Bills/2011/Public/HB1797.pdf','ar'),
194 | ('http://billstatus.ls.state.ms.us/documents/2012/html/HB/0900-0999/HB0921SG.htm', 'ms'),
195 | ('http://www.leg.state.nv.us/Session/76th2011/Bills/SB/SB373.pdf', 'nv'),
196 | ('http://www.njleg.state.nj.us/2012/Bills/A1000/674_I1.HTM', 'nj'),
197 | ('http://webserver1.lsb.state.ok.us/cf_pdf/2011-12%20INT/hB/HB2821%20INT.PDF', 'ok'),
198 | ('http://www.legis.state.pa.us/CFDOCS/Legis/PN/Public/btCheck.cfm?txtType=PDF&sessYr=2011&sessInd=0&billBody=H&billTyp=B&billNbr=0934&pn=1003', 'pa'),
199 | ('http://www.capitol.tn.gov/Bills/107/Bill/SB0016.pdf', 'tn')],
200 | [('http://www.legislature.idaho.gov/idstat/Title39/T39CH6SECT39-608.htm', 'id'),
201 | ('http://www.legis.nd.gov/cencode/t12-1c20.pdf?20150708171557', 'nd')]
202 | ]
203 |
204 | bills = create_save_bills(similar_bills)
205 |
206 |
207 |
208 |
209 |
210 |
211 |
212 |
--------------------------------------------------------------------------------
/data/evaluation_set/bills_for_evaluation_set.csv:
--------------------------------------------------------------------------------
1 | Adult Guardianship and Protective Proceedings Jurisdiction Act,ms,HB 55,2011,Moak,Introduced,
Adult Guardianship and Protective Proceedings Jurisdiction Act,va,SB 750,2011,Howell,Enacted,
Adult Guardianship and Protective Proceedings Jurisdiction Act,ky,HB 164,2011 Regular Session,Marzian,Enacted,
Adult Guardianship and Protective Proceedings Jurisdiction Act,mo,SB 213,2011,Schaefer,Enacted,https://legiscan.com/MO/text/SB213/id/294359/Missouri-2011-SB213-Enrolled.pdf
Adult Guardianship and Protective Proceedings Jurisdiction Act,ar,SB 4,2011,Johnson,Enacted,
Adult Guardianship and Protective Proceedings Jurisdiction Act,ct,SB 1053,2011,,Introduced,
Adult Guardianship and Protective Proceedings Jurisdiction Act,sd,HB 1062,2011,Lust,Enacted,http://legis.sd.gov/docs/legsession/2011/Bills/HB1062HJU.pdf
Adult Guardianship and Protective Proceedings Jurisdiction Act,nm,SB 146,2011,Payne,Enacted,
Adult Guardianship and Protective Proceedings Jurisdiction Act,id,SB 1056,2011,,Enacted,http://legislature.idaho.gov/legislation/2011/S1056.pdf
Adult Guardianship and Protective Proceedings Jurisdiction Act,ma,H 2181,187th,Gobi,Introduced,
Adult Guardianship and Protective Proceedings Jurisdiction Act,me,LD 1377,2012,Sanborn,Enacted,http://www.mainelegislature.org/legis/bills/getPDF.asp?paper=HP1016&item=1&snum=125
Adult Guardianship and Protective Proceedings Jurisdiction Act,fl,HB 1431,2010,Schwartz,Introduced,http://static-lobbytools.s3.amazonaws.com/bills/2010/pdf/1431.pdf
Adult Guardianship and Protective Proceedings Jurisdiction Act,ri,HB7687,2014,Craven/McCaffrey,Introduced,
Adult Guardianship and Protective Proceedings Jurisdiction Act,ny,SB 7464,2012,Hannon,Introduced,https://legiscan.com/NY/text/S07464/id/646869/New_York-2011-S07464-Introduced.html
Adult Guardianship and Protective Proceedings Jurisdiction Act,wy,SB 39,2013,Ross,Enacted,https://legiscan.com/WY/text/SF0039/2013
Adult Guardianship and Protective Proceedings Jurisdiction Act,ma,H 1366,188th,Gobi,Introduced,
Adult Guardianship and Protective Proceedings Jurisdiction Act,ms,SB 2655,2013,Hopson,Introduced,
Adult Guardianship and Protective Proceedings Jurisdiction Act,sc,SB 1070,2010,Hayes,Enacted,http://www.scstatehouse.gov/sess118_2009-2010/prever/1070_20100120.htm
Adult Guardianship and Protective Proceedings Jurisdiction Act,az,HB 2426,49th-2nd-regular,Driggs,Enacted,
Adult Guardianship and Protective Proceedings Jurisdiction Act,ri,HB 5958,2011,Kennedy,Introduced,https://legiscan.com/RI/text/H5958/id/268260/Rhode_Island-2011-H5958-Draft.pdf
Adult Guardianship and Protective Proceedings Jurisdiction Act,nh,SB 209,2015,Stiles,Enacted,
Adult Guardianship and Protective Proceedings Jurisdiction Act,ri,SB525,2015,Lombardi/Craven,Introduced,
Adult Guardianship and Protective Proceedings Jurisdiction Act,ia,HF 734,2010,,Enacted,https://www.legis.iowa.gov/DOCS/IowaActs/83/2/pdf/Chapter_1086.pdf
Adult Guardianship and Protective Proceedings Jurisdiction Act,vt,SB 169,2010,Campbell,Introduced,https://legiscan.com/VT/text/S0169/id/384141/Vermont-2009-S0169-Introduced.pdf
Adult Guardianship and Protective Proceedings Jurisdiction Act,md,SB 231,2010,Kelley,Enacted,
Adult Guardianship and Protective Proceedings Jurisdiction Act,ok,SB 2204,2010,Ivester,Enacted,http://www.oklegislature.gov/cf_pdf/2009-10%20ENR/sb/sb2204%20enr.pdf
Adult Guardianship and Protective Proceedings Jurisdiction Act,va,SB 80,2010,Howell,Introduced,
Adult Guardianship and Protective Proceedings Jurisdiction Act,nj,AB 4253,2011,DiCicco,Introduced,http://www.njleg.state.nj.us/2010/Bills/A4500/4253_I1.HTM
Adult Guardianship and Protective Proceedings Jurisdiction Act,hi,SB 2318,2012,Chun,Enacted,https://legiscan.com/HI/text/SB2318/id/544560/Hawaii-2012-SB2318-Introduced.html
Adult Guardianship and Protective Proceedings Jurisdiction Act,pa,HB 1720,2012,Hennessey,Enacted,http://www.legis.state.pa.us/CFDOCS/Legis/PN/Public/btCheck.cfm?txtType=HTM&sessYr=2011&sessInd=0&billBody=H&billTyp=B&billNbr=1720&pn=2589
Adult Guardianship and Protective Proceedings Jurisdiction Act,ms,HB 191,2012,Moak,Introduced,
Adult Guardianship and Protective Proceedings Jurisdiction Act,ct,HB 5150,2012,,Enacted,
Adult Guardianship and Protective Proceedings Jurisdiction Act,nj,A 2628,215,Rudder,Enacted,
Adult Guardianship and Protective Proceedings Jurisdiction Act,mi,SB 466,2013-2014,Schuitmaker,Introduced,
Adult Guardianship and Protective Proceedings Jurisdiction Act,ma,S 2249,188th,,Enacted,
Adult Guardianship and Protective Proceedings Jurisdiction Act,ms,SB 2240,2014,Hopson,Enacted,
Adult Guardianship and Protective Proceedings Jurisdiction Act,ny,A 857,2013-2014,Weinstein,Introduced,
Adult Guardianship and Protective Proceedings Jurisdiction Act,tn,SB 444,2010,Stewart,Enacted,https://legiscan.com/TN/text/SB0444/id/461093/Tennessee-2009-SB0444-Draft.pdf
Adult Guardianship and Protective Proceedings Jurisdiction Act,tx,HB 2998,84,Rodriguez,Introduced,
Adult Guardianship and Protective Proceedings Jurisdiction Act,ga,SB 207,2015_16,McKoon,Introduced,
Adult Guardianship and Protective Proceedings Jurisdiction Act,nc,HB 817,2015,Hurley,Introduced,
Adult Guardianship and Protective Proceedings Jurisdiction Act,ct,SB 426,2010,,Introduced,http://www.cga.ct.gov/2010/FC/2010SB-00426-R000518-FC.htm
Adult Guardianship and Protective Proceedings Jurisdiction Act,mn,SF412,2009-2010,Moua,Enacted,
Adult Guardianship and Protective Proceedings Jurisdiction Act,nm,SB 497,2009,Wirth,Introduced,http://www.nmlegis.gov/Sessions/09%20Regular/bills/senate/SB0497.pdf
Adult Guardianship and Protective Proceedings Jurisdiction Act,nv,SB 313,75,,Enacted,
Adult Guardianship and Protective Proceedings Jurisdiction Act,ct,SB 576,2009,Doyle,Introduced,http://www.cga.ct.gov/2009/FC/pdf/2009SB-00576-R000752-FC.pdf
Adult Guardianship and Protective Proceedings Jurisdiction Act,tx,HB 2260,81,Truitt,Introduced,
Adult Guardianship and Protective Proceedings Jurisdiction Act,wa,HB 1261,2009,Goodman,Enacted,http://lawfilesext.leg.wa.gov/biennium/2009-10/Pdf/Bills/House%20Passed%20Legislature/1261-S.PL.pdf
Adult Guardianship and Protective Proceedings Jurisdiction Act,md,SB 122,2009,Kelley,Introduced,
Adult Guardianship and Protective Proceedings Jurisdiction Act,il,HB 759,96th,Ryg,Enacted,
Adult Guardianship and Protective Proceedings Jurisdiction Act,ky,HB 86,98th,Marzian,Introduced,https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&uact=8&ved=0CB4QFjAAahUKEwi939z7k_TGAhVVMYgKHasODFs&url=http%3A%2F%2Fwww.lrc.ky.gov%2Frecord%2F10rs%2FHB86%2Fbill.doc&ei=jGOyVb1p1eKgBKudsNgF&usg=AFQjCNHcJ0pa2RJG5jxy0CHbLYpUEAADEA&sig2=E7yw8zaghDIujs7uzqPhHQ
Adult Guardianship and Protective Proceedings Jurisdiction Act,ut,SB 122,2008,Hillyard,Enacted,http://le.utah.gov/~2008/bills/static/SB0122.html
Adult Guardianship and Protective Proceedings Jurisdiction Act,mo,HB 2105,2008,Cooper,Introduced,http://house.mo.gov/billtracking/bills081/billpdf/intro/HB2105I.PDF
Anatomical Gift Act (2006),ct,SB 250,2010,,Enacted,http://www.cga.ct.gov/2010/TOB/S/2010SB-00250-R00-SB.htm
Anatomical Gift Act (2006),ky,SB 4,2010,Williams,Enacted,http://www.lrc.ky.gov/record/10RS/SB4/bill.doc
Anatomical Gift Act (2006),md,HB 1451,2010,"Barve, Dumais",Introduced,http://mlis.state.md.us/2010rs/bills/hb/hb1451f.pdf
Anatomical Gift Act (2006),il,HB 2339,98th,Davis,Enacted,
Anatomical Gift Act (2006),pa,HB 2700,2009-2010,Petrarca,Introduced,
Anatomical Gift Act (2006),pa,SB750,2011-2012,Greenleaf/Petrarca,Introduced,
Anatomical Gift Act (2006),ma,S 1098,187th,Fargo,Introduced,
Anatomical Gift Act (2006),md,SB 756,2011,Kelley,Enacted,
Anatomical Gift Act (2006),pa,SB 180,2015-2016,Greenleaf,Introduced,
Anatomical Gift Act (2006),tx,HB 2027,81,Zerwas,Enacted,
Anatomical Gift Act (2006),ct,HB 6677,2009,,Introduced,http://www.cga.ct.gov/2009/FC/2009HB-06677-R000964-FC.htm
Anatomical Gift Act (2006),fl,SB 766,2009,,Introduced,http://static-lobbytools.s3.amazonaws.com/bills/2009/pdf/0766ER.pdf
Anatomical Gift Act (2006),oh,HB 529,2009,Wachtmann,Enacted,http://archives.legislature.state.oh.us/analysis.cfm?ID=127_HB_529&ACT=As%20Enrolled&hf=analyses127/08-hb529-127.htm
Anatomical Gift Act (2006),il,HB 1349,96th,Davis,Introduced,
Anatomical Gift Act (2006),ak,SB 181,2007,McGuire,Introduced,http://www.legis.state.ak.us/PDF/25/Bills/SB0181A.PDF
Anatomical Gift Act (2006),az,SB 1099,2007,Allen,Enacted,http://www.azleg.gov/legtext/48leg/1r/bills/sb1099h.htm
Anatomical Gift Act (2006),id,SB 1017,2007,,Enacted,http://legislature.idaho.gov/legislation/2007/S1017.html
Anatomical Gift Act (2006),ne,LB 1036,2010,Council,Enacted,http://www.nebraskalegislature.gov/FloorDocs/101/PDF/Final/LB1036.pdf
Anatomical Gift Act (2006),nh,HB 1430,2010,Foose,Enacted,http://www.nhliberty.org/bills/view/2010/HB1430
Anatomical Gift Act (2006),vt,S 205,2009-2010,Ayer,Enacted,
Anatomical Gift Act (2006),pa,SB 850,2013-2014,Greenleaf,Introduced,
Anatomical Gift Act (2006),wa,HB 1637,2008,Hinkle,Enacted,http://lawfilesext.leg.wa.gov/biennium/2007-08/Pdf/Bills/Session%20Laws/House/1637-S.SL.pdf
Anatomical Gift Act (2006),wi,SB 310,2008,Risser,Enacted,http://docs.legis.wisconsin.gov/2007/related/proposals/sb310
Anatomical Gift Act (2006),ca,AB 1689,2008,Lieber,Enacted,http://leginfo.legislature.ca.gov/faces/billNavClient.xhtml?bill_id=200720080AB1689
Anatomical Gift Act (2006),ga,SB 405,2008,Balfour,Enacted,http://www.legis.ga.gov/Legislation/20072008/84683.pdf
Anatomical Gift Act (2006),hi,HB 2139,2008,,Enacted,http://www.capitol.hawaii.gov/session2008/bills/HB2139_CD1_.pdf
Anatomical Gift Act (2006),ak,HB 196,2008,,Enacted,http://www.legis.state.ak.us/PDF/25/Bills/HB0196Z.PDF
Anatomical Gift Act (2006),va,HB 2684,2007,Frederick,Enacted,http://lis.virginia.gov/cgi-bin/legp604.exe?071+ful+CHAP0092
Anatomical Gift Act (2006),wa,HB 1637,2007,Hinkle,Introduced,http://lawfilesext.leg.wa.gov/biennium/2007-08/Pdf/Bills/House%20Passed%20Legislature/1637-S.PL.pdf
Anatomical Gift Act (2006),mo,SB 1139,2008,Dempsey,Enacted,http://www.senate.mo.gov/08info/pdf-bill/tat/SB1139.pdf
Anatomical Gift Act (2006),ms,HB 1075,2008,Holland,Enacted,https://www.donatelifems.org/HB1075SG.pdf
Anatomical Gift Act (2006),me,LD 1505,2008,Hobbins,Enacted,http://www.mainelegislature.org/legis/bills/bills_123rd/billpdfs/SP052801.pdf
Anatomical Gift Act (2006),mi,HB 4940,2008,Condino,Enacted,http://www.legislature.mi.gov/documents/2007-2008/publicact/pdf/2008-PA-0039.pdf
Anatomical Gift Act (2006),ny,SB 5154,2008,Hannon,Introduced,http://assembly.state.ny.us/leg/?default_fld=&bn=S05154&term=2007&Text=Y
Anatomical Gift Act (2006),nj,SB 754,2008,Codey,Enacted,http://www.njleg.state.nj.us/2008/Bills/PL08/50_.PDF
Anatomical Gift Act (2006),ia,SF 509,2007,,Enacted,http://coolice.legis.iowa.gov/legislation/82ndGA/enrolled/sf509.html
Anatomical Gift Act (2006),mn,SF 883,2007,Scheid,Enacted,https://www.revisor.mn.gov/bills/text.php?number=SF883&version=0&session_year=2007&session_number=0
Anatomical Gift Act (2006),mo,HB723,2007,Stevenson/Koster,Introduced,http://www.house.mo.gov/billtracking/bills071/billpdf/intro/HB0723I.PDF
Anatomical Gift Act (2006),nj,AB 3909,2007,Conaway,Introduced,http://www.njleg.state.nj.us/2006/Bills/A4000/3909_I1.HTM
Anatomical Gift Act (2006),nm,HB 1276,2007,Cervantes,Enacted,http://www.nmlegis.gov/Sessions/07%20Regular/final/HB1276.pdf
Anatomical Gift Act (2006),nc,HB 1372,2007,Folwell,Enacted,http://www.ncga.state.nc.us/Sessions/2007/Bills/House/PDF/H1372v6.pdf
Anatomical Gift Act (2006),nd,SB 2163,2007,Kilzer,Enacted,http://legis.nd.gov/assembly/60-2007/bill-text/HAUN0400.pdf
Anatomical Gift Act (2006),or,HB 3092,2007,,Enacted,https://olis.leg.state.or.us/liz/2007R1/Downloads/MeasureDocument/HB3092
Anatomical Gift Act (2006),tn,HB 1557,2007,Shepard,Enacted,http://state.tn.us/sos/acts/105/pub/pc0428.pdf
Anatomical Gift Act (2006),tx,SB 1597,2007,Janek,Introduced,http://www.legis.state.tx.us/tlodocs/80R/billtext/html/SB01597E.htm
Anatomical Gift Act (2006),ut,SB 92,2007,Hillyard,Enacted,http://le.utah.gov/~2007/bills/static/SB0092.html
--------------------------------------------------------------------------------
/lid/etl/scrapers.py:
--------------------------------------------------------------------------------
1 | import json
2 | import codecs
3 | import base64
4 | import logging
5 | import re
6 | import os
7 | import sys
8 | import multiprocessing
9 | import utils
10 | import random
11 | import argparse
12 | import traceback
13 | import urllib2
14 | from config import DATA_PATH
15 | from bs4 import BeautifulSoup
16 | from tika import parser as tp  # tp.from_file is used in scrape_alec_exposed_bills below
17 |
18 | try:
19 | from os import scandir, walk
20 | except ImportError:
21 | from scandir import scandir, walk
22 |
23 | BILL_SCRAPER_LOG = os.environ['POLICY_DIFFUSION'] + '/logs/bill_scraper.log'
24 |
25 |
26 | # scrapes all bills from the input data path
27 | def scrape_all_bills(bill_data_path, num_workers):
28 | logging.basicConfig(filename=BILL_SCRAPER_LOG, level=logging.DEBUG)
29 |
30 | bill_file_paths = []
31 | for dirname, dirnames, filenames in walk(bill_data_path):
32 | for filename in filenames:
33 | bill_file_paths.append(os.path.join(dirname, filename))
34 |
35 |
36 | scrape_bill_document_from_sunlight(bill_file_paths[0])
37 |
38 | random.shuffle(bill_file_paths)
39 |
40 | pool = multiprocessing.Pool(num_workers)
41 |
42 | print "fetch {0} urls from sunlight...".format(len(bill_file_paths))
43 | pool.map(scrape_bill_document_from_sunlight, bill_file_paths)
44 |
45 | print "finished fetching urls..."
46 |
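# Example invocation (hypothetical data path; see main() at the bottom of this
# module for the argument parser that dispatches to this function):
#
#   python scrapers.py scrape_bills_from_sunlight \
#       --data_path /path/to/openstates/bill_json --num_workers 10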
47 |
48 | # open individual json file and scrape bill document,
49 | # from the s3 server provided by sunlight foundation
50 | def scrape_bill_document_from_sunlight(file_path):
51 | try:
52 | file_path = file_path.strip()
53 |
54 | #define path to write file
55 | out_file_path = file_path.split("/bills")[-1]
56 | out_file_path = re.sub("\s+", "_", out_file_path)
57 | out_dir_root_path = "{0}/scraped_bills".format(DATA_PATH)
58 | out_file_name = "{0}{1}.json".format(out_dir_root_path, out_file_path)
59 |
60 | bill_json = json.loads(codecs.open(file_path, encoding="utf8").read())
61 |
62 | # filter versions to be only the first and last
63 | try:
64 | bill_json['versions'] = [bill_json['versions'][0], bill_json['versions'][-1]]
65 | except IndexError:
66 | return
67 |
68 | base_url = "{0}/{1}".format("http://static.openstates.org/documents", bill_json['state'])
69 | urls = ["{0}/{1}".format(base_url, x['doc_id']) for x in bill_json['versions']]
70 | source_urls = [x['url'] for x in bill_json['versions']]
71 |
72 | for i, url in enumerate(urls):
73 |
74 | bill_document = utils.fetch_url(url)
75 |
76 | #hash bill using base64
77 | if bill_document is not None:
78 | bill_document = base64.b64encode(bill_document)
79 | else:
80 | logging.error("file {0}, url {1}, version {2}, error: << {3} >>".format(
81 | file_path, url, i, "link error"))
82 |
83 | bill_json['versions'][i]['bill_document'] = bill_document
84 |
85 | if not os.path.exists(os.path.dirname(out_file_name)):
86 | os.makedirs(os.path.dirname(out_file_name))
87 | with codecs.open(out_file_name, "w", encoding="utf8") as f:
88 | f.write(json.dumps(bill_json))
89 |
90 | logging.info("successfully scraped bill: {0}".format(out_file_path))
91 |
92 | except Exception as e:
93 | trace_message = re.sub("\n+", "\t", traceback.format_exc())
94 | trace_message = re.sub("\s+", " ", trace_message)
95 | trace_message = "<<{0}>>".format(trace_message)
96 | m = "Failed to obtain documents for {0}: {1}".format(file_path, trace_message)
97 | logging.error(m)
98 |
99 | return
100 |
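# Sketch of the per-bill JSON this function expects (field names taken from the
# accesses above; values are placeholders):
#
#   {"state": "tx",
#    "versions": [{"doc_id": "...", "url": "http://..."}, ...],
#    ...}
#
# Each kept version gains a base64-encoded 'bill_document' field before the
# record is written under scraped_bills/.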
101 |
102 | # scrapes bill document from original source link
103 | # this is a backup if s3 doesn't work
104 | def scrape_bill_document_from_original_source(filePath):
105 | filePath = filePath.strip()
106 |
107 | outFilePath = "/".join(filePath.split("/")[7:])
108 | outFilePath = re.sub("\s+", "_", outFilePath)
109 | outDirRootPath = "/mnt/data/sunlight/dssg/scraped_bills_new"
110 | outFileName = "{0}/{1}.json".format(outDirRootPath, outFilePath)
111 |
112 | billFile = codecs.open(filePath, encoding="utf8").read()
113 | billJson = json.loads(billFile)
114 |
115 | # filters documents that are resolutions
116 | bill_text_count = [1 for x in billJson['type'] if "bill" in x.lower()]
117 | if sum(bill_text_count) < 1:
118 | return
119 |
120 | # filter versions to be only the first and last
121 | billJson['versions'] = [billJson['versions'][0], billJson['versions'][-1]]
122 |
123 | urls = [x['url'] for x in billJson['versions']]
124 |
125 | for i, url in enumerate(urls):
126 |
127 | billDocument = utils.fetch_url(url)
128 |
129 | if billDocument is not None:
130 | billDocument = base64.b64encode(billDocument)
131 | else:
132 | logging.error("file {0}, url {1}, version {2}, error: << {3} >>".format(filePath, url, i, "link error"))
133 |
134 | billJson['versions'][i]['bill_document'] = billDocument
135 |
136 | if not os.path.exists(os.path.dirname(outFileName)):
137 | os.makedirs(os.path.dirname(outFileName))
138 | with codecs.open(outFileName, "w", encoding="utf8") as f:
139 | f.write(json.dumps(billJson))
140 |
141 | logging.info("successfully scraped bill: {0}".format(outFilePath))
142 |
143 | return
144 |
145 |
146 | # scrapes model legislation from ALEC's official site
147 | # and the tracker website ALEC exposed
148 | def scrape_ALEC_model_legislation():
149 | url = 'http://www.alec.org/model-legislation/'
150 | response = urllib2.urlopen(url).read()
151 | bs = BeautifulSoup(response, 'html5')
152 |
153 | # Get all links from website
154 | ALEClist = []
155 | for link in bs.find_all('a'):
156 | if link.has_attr('href'):
157 | ALEClist.append(link.attrs['href'])
158 |
159 | # Filter list so that we have only the ones with model-legislation
160 | ALEClinks = []
161 | i = 0
162 | for i in range(0, len(ALEClist)):
163 | if ALEClist[i][20:38] == "model-legislation/":
164 | ALEClinks.append(ALEClist[i])
165 | i = i + 1
166 |
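# Note: the [20:38] slice assumes every link starts with "http://www.alec.org/"
# (20 characters), so characters 20-37 equal "model-legislation/" exactly for
# the model-bill links we want to keep.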
167 | # To get only unique links (get rid off duplicates)
168 | ALEClinks = set(ALEClinks)
169 |
170 | # Save to json file
171 | with open('{0}/data/model_legislation/alec_bills.json'.format(DATA_PATH), 'w') as f:
172 | for line in ALEClinks:
173 | source = urllib2.urlopen(line).read()
174 | url = line
175 | date = 2015
176 | Jsonbill = bill_source_to_json(url, source, date)
177 | f.write("{0}\n".format(Jsonbill))
178 |
179 | # Save old alec bills (from Center for the Media and Democracy)
180 | def scrape_alec_exposed_bills ():
181 | names = os.listdir('{0}/model_legislation/ALEC_exposed'.format(DATA_PATH))
182 | with open('alec_old_bills.json', 'w') as f2:
183 | for name in names:
184 | try:
185 | text = tp.from_file(name)
186 | source = text['content']
187 | except:
188 | source = None
189 | url = None
190 | date = '2010-2013'
191 | print name
192 | print source
193 | Jsonbill = bill_source_to_json_not_encoded(url, source, date)
194 | f2.write("{0}\n".format(Jsonbill))
195 |
196 |
197 | def scrape_CSG_model_legislation():
198 | url = 'http://www.csg.org/programs/policyprograms/SSL.aspx'
199 | doc = urllib2.urlopen(url).read()
200 | bs = BeautifulSoup(doc)
201 |
202 | links = []
203 | for link in bs.find_all('a'):
204 | if link.has_attr('href'):
205 | candidate = link.attrs['href']
206 | # links with pdf extension tend to be model bills
207 | if candidate[-4:] == ".pdf":
208 | links.append(candidate)
209 |
210 | # only keeps distinct links
211 | links2 = list(set(links))
212 |
213 | badCount = 0
214 | goodCount = 0
215 |
216 | with open('csg_bills.json', 'w') as f:
217 | for line in links2:
218 | try:
219 | url_key = {}
220 | source = urllib2.urlopen(line).read()
221 | Jsonbill = bill_source_to_json(line, source, None)
222 | f.write("{0}\n".format(Jsonbill))
223 | goodCount += 1
224 | except:
225 | badCount += 1
226 | print line
227 |
228 | print str(badCount) + " did not work"
229 |
230 |
231 | def scrape_ALICE_legislation():
232 | path = "/mnt/data/sunlight/dssg/model_legislation/links_"
233 | lines = []
234 | for i in [1, 2, 3]:
235 | filePath = path + str(i) + ".txt"
236 | with open(filePath) as f:
237 | lines.extend(f.read().splitlines())
238 |
239 | text = ''.join(lines)
240 | bs = BeautifulSoup(text)
241 |
242 | links = []
243 | for link in bs.find_all('a'):
244 | if link.has_attr('href'):
245 | links.append(link.attrs['href'])
246 |
247 |
248 | # grab pdfs from links
249 | billList = []
250 | for url in links:
251 | doc = urllib2.urlopen(url).read()
252 | bs = BeautifulSoup(doc)
253 |
254 | for link in bs.find_all('a'):
255 | if link.has_attr('href'):
256 | candidate = link.attrs['href']
257 | if candidate[-4:] == ".pdf": # links with pdf extension tend to be model bills
258 | billList.append("https://stateinnovation.org" + candidate)
259 |
260 | badCount = 0
261 | goodCount = 0
262 | with open('alice_bills.json', 'w') as f:
263 | for link in billList:
264 | # url_key = {}
265 | # source = urllib2.urlopen(link).read()
266 | # Jsonbill = bill_source_to_json(link, source, None)
267 | # f.write("{0}\n".format(Jsonbill))
268 | try:
269 | source = urllib2.urlopen(link).read()
270 | Jsonbill = bill_source_to_json(link, source, None)
271 | f.write("{0}\n".format(Jsonbill))
272 | goodCount += 1
273 | except:
274 | badCount += 1
275 |
276 | print str(badCount) + " did not work"
277 |
278 | def scrape_misc_legislation():
279 | # Access list of clean urls
280 | with open('/mnt/data/sunlight/dssg/model_legislation/clean_urls.txt',
281 | 'r') as f:
282 | links = f.read().splitlines()
283 |
284 | badCount = 0
285 | goodCount = 0
286 | with open('misc_bills.json', 'w') as jsonfile:
287 | for link in links:
288 | try:
289 | source = urllib2.urlopen(link).read()
290 | Jsonbill = bill_source_to_json(link, source, None)
291 | jsonfile.write("{0}\n".format(Jsonbill))
292 | goodCount += 1
293 | print goodCount
294 | except:
295 | badCount += 1
296 |
297 | print str(badCount) + " did not work"
298 | print str(goodCount) + " worked"
299 |
300 |
301 |
302 | def main():
303 |
304 | parser = argparse.ArgumentParser(description='module that contains functions to scrape legislative data '
305 | 'from the sunlight foundation and various '
306 | 'lobbying organizations')
307 | parser.add_argument('command', help='command to run, options are: \n scrape_bills_from_sunlight')
308 | parser.add_argument('--data_path', dest='data_path', help="file path of data to be indexed ")
309 | parser.add_argument('--num_workers', dest='num_workers',default = 10,
310 | type = int, help="number of worker processes to use")
311 |
312 | args = parser.parse_args()
313 |
314 | if args.command == "scrape_bills_from_sunlight":
315 | scrape_all_bills(args.data_path,args.num_workers)
316 | elif args.command == "scrape_ALEC_legislation":
317 | scrape_ALEC_model_legislation()
318 | elif args.command == "scrape_CSG_legislation":
319 | scrape_CSG_model_legislation()
320 | elif args.command == "scrape_ALICE_legislation":
321 | scrape_ALICE_legislation()
322 | elif args.command =="scrape_misc_legislation":
323 | scrape_misc_legislation()
324 | else:
325 | print("command not recognized, use -h flag to see list available commands")
326 |
327 |
328 |
329 | if __name__ == "__main__":
330 | main()
331 |
--------------------------------------------------------------------------------
/tests/text_alignment_tests.py:
--------------------------------------------------------------------------------
1 |
2 | import random
3 | import matplotlib.pyplot as plt
4 | import time
5 | import numpy as np
6 | from compiler.ast import flatten
7 | from alignment.sequence import Sequence
8 | from alignment.vocabulary import Vocabulary
9 | from alignment.sequencealigner import SimpleScoring, LocalSequenceAligner
10 | from utils.general_utils import find_subsequence
11 | from text_alignment import *
12 |
13 |
14 | #function from python package for testing results
15 | def seqToAlign(a, b, matchScore = 3, mismatchScore = -1, gapScore = -2):
16 | '''
17 | args:
18 | a: list of words
19 | b: list of words
20 | matchScore: num
21 | mismatchScore: num
22 | gapScore: num
23 | Returns:
24 | list of tuples (score, first, second) for the top alignments
25 | Description:
26 | helper function for finding alignments given a list of words
27 | '''
28 | # Create a vocabulary and encode the sequences.
29 | a = a[0]
30 | b = b[0]
31 | seq1 = Sequence(a)
32 | seq2 = Sequence(b)
33 | v = Vocabulary()
34 | aEncoded = v.encodeSequence(seq1)
35 | bEncoded = v.encodeSequence(seq2)
36 |
37 | # Create a scoring and align the sequences using local aligner.
38 | scoring = SimpleScoring(matchScore, mismatchScore)
39 | aligner = LocalSequenceAligner(scoring, gapScore)
40 | score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)
41 | alignments = [v.decodeSequenceAlignment(encoded) for encoded in encodeds]
42 |
43 | return [(a.score, list(a.first), list(a.second)) for a in alignments]
44 |
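# Usage sketch for the package-based helper above (toy input; scores use the
# default 3/-1/-2 scoring):
#
#   alignments = seqToAlign([['a', 'b', 'c', 'd']], [['a', 'b', 'x', 'd']])
#   score, left, right = alignments[0]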
45 |
46 | #testing functions
47 | def create_doc_test_cases():
48 | #tests
49 | t1 = [['a']*100]
50 | t2 = [['b']*50 + ['a','a','b']*50]
51 |
52 | s1 = [[1]*100]
53 | s2 = [[2]*50 + [1,1,2]*50]
54 |
55 | v1 = [np.array([0, 1, 2, 3, 4, 7, 6, 3, 2, 1, 3])]
56 | v2 = [np.array([0, 1, 2, 3, 4, 4, 5, 2, 1, 2, 2])]
57 |
58 | w1 = [np.array([7, 6, 3, 2, 1, 3, 0, 1, 2, 3, 4])]
59 | w2 = [np.array([4, 5, 2, 1, 2, 2, 0, 1, 2, 3, 4])]
60 |
61 | tests = [(t1,t2), (s1,s2),(v1,v2), (w1,w2), ([np.random.choice(5, 30)],[np.random.choice(5, 30)]), \
62 | ([np.array([1, 2, 0, 0, 1, 2, 3, 0, 1, 3, 0, 4, 3, 3, 0, 3, 0, 2, 0, 4, 3, 4, 2, \
63 | 1, 1, 1, 1, 1, 0, 1])], [np.array([2, 0, 3, 1, 2, 4, 0, 1, 3, 0, 1, 4, 1, 3, 1, 4, 0, 0, 1, 2, 4, 0, 0, \
64 | 2, 4, 1, 3, 2, 2, 4])])]
65 |
66 | return tests
67 |
68 |
69 | #LocalAligner algorithm tests
70 | def LocalAligner_unit_tests():
71 |
72 | def test_alignment(t1,t2):
73 | f = LocalAligner()
74 | alignment=f.align(t1,t2) #default score is 3,-1,-2
75 | score, l, r = alignment.alignments[0]
76 |
77 | #find score of recovered alignment
78 | align_score = f.alignment_score(l,r)
79 |
80 | #run package algorithm
81 | alignments = seqToAlign(t1,t2) #default score is 3,-1,-2
82 |
83 | if score == align_score and score == alignments[0][0]:
84 | print 'package, backtraced alignment, and alignment matrix consistent'
85 | else:
86 | print 'dp_alg_score: ' + str(score)
87 | print 'alignment_score: ' + str(align_score)
88 | print 'package_score: ' + str(alignments[0][0])
89 |
90 | #tests
91 | tests = create_doc_test_cases()
92 | for test in tests:
93 | z1, z2 = test
94 | test_alignment(z1,z2)
95 |
96 | f = LocalAligner()
97 | alignment=f.align(z1,z2) #default score is 3,-1,-2
98 |
99 | score, l, r = alignment.alignments[0]
100 |
101 | #run package algorithm
102 | alignments = seqToAlign(z1,z2) #default score is 3,-1,-2
103 |
104 | l_true, r_true = alignments[0][1:]
105 |
106 | for i in range(len(l)):
107 | if l[i] != l_true[i]:
108 | print 'not same sequence'
109 | break
110 |
111 | for i in range(len(r)):
112 | if r[i] != r_true[i]:
113 | print 'not same sequence'
114 | break
115 |
116 |
117 | def test_alignment(t1,t2, algorithm):
118 | f = algorithm()
119 | alignment=f.align(t1,t2) #default score is 3,-1,-2
120 | score, l, r = alignment.alignments[0]
121 |
122 | #find score of recovered alignment
123 | align_score = f.alignment_score(l,r)
124 |
125 | if score == align_score:
126 | print 'backtraced alignment and alignment matrix consistent'
127 | else:
128 | print 'backtraced alignment and alignment matrix not consistent'
129 | print 'dp_alg_score: ' + str(score)
130 | print 'alignment_score: ' + str(align_score)
131 |
132 | print 'left_alignment: ', l
133 | print 'right_alignment: ', r
134 |
135 |
136 | def generic_doc_unit_test(algorithm):
137 |
138 | tests = create_doc_test_cases()
139 | for test in tests:
140 | z1, z2 = test
141 | test_alignment(z1,z2, algorithm)
142 |
143 |
144 | def LocalAligner_speed_test():
145 |
146 | input_sizes = [np.exp2(p) for p in range(2,7)]
147 |
148 | average_our_times = []
149 | average_package_times = []
150 | for input_size in input_sizes:
151 | print input_size
152 | v1 = [np.random.randint(0,10,input_size)]
153 | v2 = [np.random.randint(0,10,input_size)]
154 | our_times = []
155 | package_times = []
156 | f = LocalAligner()
157 | for i in range(2):
158 | t1 = time.time()
159 | f.align(v1,v2)
160 | our_times.append(time.time()-t1)
161 |
162 | t2 = time.time()
163 | seqToAlign(v1,v2)
164 | package_times.append(time.time()-t2)
165 |
166 | average_our_times.append(np.mean(our_times))
167 | average_package_times.append(np.mean(package_times))
168 |
169 | plt.plot(input_sizes,average_package_times, color = 'b', label = 'package')
170 | plt.plot(input_sizes,average_our_times, color='r', label = 'our implementation')
171 | plt.legend(loc='upper right')
172 | plt.xlabel('input size')
173 | plt.ylim(0,0.02)
174 | plt.show()
175 |
176 |
177 | def generic_doc_speed_test(algorithm):
178 | '''
179 | compares speed of algorithm to local alignment algorithm
180 | '''
181 |
182 | input_sizes = [np.exp2(p) for p in range(2,7)]
183 |
184 | average_alg_times = []
185 | average_local_times = []
186 | for input_size in input_sizes:
187 | print input_size
188 | v1 = [np.random.randint(0,10,input_size)]
189 | v2 = [np.random.randint(0,10,input_size)]
190 | local_times = []
191 | alg_times = []
192 | f = LocalAligner()
193 | g = algorithm()
194 | for i in range(2):
195 | t1 = time.time()
196 | f.align(v1,v2)
197 | local_times.append(time.time()-t1)
198 |
199 | t2 = time.time()
200 | g.align(v1,v2)
201 | alg_times.append(time.time()-t2)
202 |
203 | average_local_times.append(np.mean(local_times))
204 | average_alg_times.append(np.mean(alg_times))
205 |
206 | return average_local_times, average_alg_times
207 |
208 |
209 | def doc_test_alignment_indices(algorithm):
210 | #tests
211 | tests = create_doc_test_cases()
212 |
213 | good_job = True
214 | for test in tests:
215 |
216 | left_text, right_text = test
217 | try:
218 | left_text[0] = left_text[0].tolist()
219 | right_text[0] = right_text[0].tolist()
220 | except:
221 | pass
222 | f = algorithm()
223 | Alignment = f.align(left_text,right_text)
224 | left, right = clean_alignment(Alignment.alignments[0])
225 |
226 |
227 | left_start, left_end = find_subsequence(left, flatten(left_text))
228 | right_start, right_end = find_subsequence(right, flatten(right_text))
229 |
230 | if Alignment.alignment_indices[0]['left_start'] != left_start or \
231 | Alignment.alignment_indices[0]['left_end'] != left_end or \
232 | Alignment.alignment_indices[0]['right_start'] != right_start or \
233 | Alignment.alignment_indices[0]['right_end'] != right_end:
234 |
235 | print 'alignment length: ', len(left)
236 |
237 | print 'indices are messed up'
238 |
239 | print 'left_start: ', Alignment.alignment_indices[0]['left_start']
240 | print 'true left_start: ', left_start
241 | print 'left_end: ', Alignment.alignment_indices[0]['left_end']
242 | print 'true left_end', left_end
243 | print '\n'
244 |
245 | print 'right_start: ', Alignment.alignment_indices[0]['right_start']
246 | print 'true right_start: ', right_start
247 | print 'right_end: ', Alignment.alignment_indices[0]['right_end']
248 | print 'true right_end: ', right_end
249 |
250 | print '\n'
251 |
252 | good_job = False
253 |
254 | if good_job:
255 | print 'indices worked'
256 |
257 |
258 | #SectionLocalAlignment Tests
259 | def create_section_tests():
260 | tests = create_doc_test_cases()
261 |
262 | #convert tests into sections so
263 | #that it makes sense for case
264 | left_test = []
265 | right_test = []
266 | for test1, test2 in tests:
267 | left_test.append(list(test1[0]))
268 | right_test.append(list(test2[0]))
269 |
270 | return left_test, right_test
271 |
272 |
273 | def section_unit_tests(Algorithm):
274 | left_test, right_test = create_section_tests()
275 |
276 | f = Algorithm()
277 | Alignment = f.align(left_test, [flatten(right_test)])
278 |
279 | good_job = True
280 | for score, left, right in Alignment.alignments:
281 | true_score = f.alignment_score(left, right)
282 | if true_score != score:
283 | print 'left: ', left
284 | print 'right: ', right
285 | print 'true alignment score: ', true_score
286 | print 'calculated score: ', score
287 | good_job = False
288 |
289 | if good_job:
290 | print "calculated alignment scores correctly"
291 |
292 |
293 | def section_speed_test():
294 |
295 | input_sizes = [np.exp2(p) for p in range(2,9)]
296 |
297 | average_local_times = []
298 | average_section_times = []
299 | for input_size in input_sizes:
300 | print input_size
301 | v1 = [np.random.randint(0,10,input_size)]
302 | v2 = [np.random.randint(0,10,input_size)]
303 |
304 | cut1 = random.randint(0,len(v1))
305 | cut2 = random.randint(cut1,len(v2))
306 | cut3 = random.randint(cut2,len(v2))
307 | w1 = [v1[0][:cut1], v1[0][cut1:cut2], v1[0][cut2:cut3]]
308 |
309 | local_times = []
310 | section_times = []
311 | for i in range(2):
312 | t1 = time.time()
313 | f = LocalAligner()
314 | f.align(v1,v2)
315 | local_times.append(time.time()-t1)
316 |
317 | t2 = time.time()
318 | f = LocalAligner()
319 | f.align(w1,v2)
320 | section_times.append(time.time()-t2)
321 |
322 | average_local_times.append(np.mean(local_times))
323 | average_section_times.append(np.mean(section_times))
324 |
325 | plt.plot(input_sizes,average_section_times, color = 'b', label = 'section local alignment')
326 | plt.plot(input_sizes,average_local_times, color='r', label = 'local alignment')
327 | plt.legend(loc='upper right')
328 | plt.xlabel('input size')
329 | plt.ylim(0,0.02)
330 | plt.show()
331 |
332 |
333 | def section_test_alignment_indices():
334 | left_test, right_test = create_section_tests()
335 | left_test_flattened = flatten(left_test)
336 | right_test_flattened = flatten(right_test)
337 |
338 | f = LocalAligner()
339 | Alignment = f.align(left_test, [right_test_flattened])
340 |
341 | good_job = True
342 | for i in range(len(Alignment.alignments)):
343 | left, right = clean_alignment(Alignment.alignments[i])
344 |
345 | print 'alignment length: ', len(left)
346 |
347 | left_start, left_end = find_subsequence(left, left_test_flattened)
348 | right_start, right_end = find_subsequence(right, right_test_flattened)
349 |
350 | if Alignment.alignment_indices[i]['left_start'] != left_start or \
351 | Alignment.alignment_indices[i]['left_end'] != left_end or \
352 | Alignment.alignment_indices[i]['right_start'] != right_start or \
353 | Alignment.alignment_indices[i]['right_end'] != right_end:
354 |
355 | print 'indices are messed up: '
356 |
357 | print 'left_start: ', Alignment.alignment_indices[i]['left_start']
358 | print 'true left_start: ', left_start
359 | print 'left_end: ', Alignment.alignment_indices[i]['left_end']
360 | print 'true left_end', left_end
361 | print '\n'
362 |
363 | print 'right_start: ', Alignment.alignment_indices[i]['right_start']
364 | print 'true right_start: ', right_start
365 | print 'right_end: ', Alignment.alignment_indices[i]['right_end']
366 | print 'true right_end: ', right_end
367 |
368 | print '\n'
369 |
370 | good_job = False
371 |
372 | if good_job:
373 | print 'indices worked'
374 |
375 |
376 | ############################################################
377 | ##helper functions
378 | def clean_alignment(alignment):
379 | '''
380 | arg:
381 | alignment object
382 | returns:
383 | 2 list of alignment words without the alignment symbol
384 | '''
385 | keep1 = []
386 | keep2 = []
387 | for item in alignment[1]:
388 | if item != '-':
389 | keep1.append(item)
390 |
391 | for item in alignment[2]:
392 | if item != '-':
393 | keep2.append(item)
394 |
395 | return (keep1, keep2)
396 |
397 |
398 | if __name__ == '__main__':
399 | print "running LocalAligner unit tests.... \n"
400 | LocalAligner_unit_tests()
401 |
402 | print "running LocalAligner speed tests.... \n"
403 | LocalAligner_speed_test()
404 |
405 | print "running LocalAligner index tests.... \n"
406 | doc_test_alignment_indices(LocalAligner)
407 |
408 | print "running AffineLocalAligner unit tests.... \n"
409 | generic_doc_unit_test(AffineLocalAligner)
410 |
411 | print "running AffineLocalAligner speed tests.... \n"
412 | generic_doc_speed_test(AffineLocalAligner)
413 |
414 | print "running section unit tests for localaligner.... \n"
415 | section_unit_tests(LocalAligner)
416 |
417 | print "running section unit tests for affinealigner.... \n"
418 | section_unit_tests(AffineLocalAligner)
419 |
420 | print "running section speed tests.... \n"
421 | section_speed_test()
422 |
423 | print 'running test on keeping track of indices for section algorithm..... \n'
424 | section_test_alignment_indices()
425 |
426 | print 'running speed test on Word2VecLocalAligner.... \n'
--------------------------------------------------------------------------------
/lid/alignment_classifier.py:
--------------------------------------------------------------------------------
1 | from alignment_evaluation import alignment_features
2 | import numpy as np
3 | import nltk
4 | from sklearn import linear_model
5 | from sklearn.metrics import confusion_matrix, accuracy_score
6 | import csv
7 | import json
8 | import argparse
9 | import os
10 | from database import ElasticConnection
11 | import random
12 | import codecs
13 | from sklearn.feature_extraction.text import TfidfVectorizer
14 | from utils.general_utils import alignment_tokenizer
15 | from utils.general_utils import UnicodeWriter,UnicodeReader
16 | import pickle
17 | from sklearn.metrics import jaccard_similarity_score,classification_report
18 | from sklearn.linear_model import LogisticRegression
19 | from sklearn.cross_validation import KFold
20 |
21 |
22 |
23 | '''Contains code for both the features and model of the alignment classifier used to classify alignments as
24 | substantive or boiler-plate'''
25 |
26 | def compute_tfidf_scores(alignment_data_path,pickle_file_name):
27 | count = 0
28 | alignment_docs = []
29 | for line in alignment_data_path:
30 | print count
31 | count += 1
32 | if count >= 100000:
33 | break
34 | json_obj = json.loads(line.strip())
35 |
36 |
37 | if "alignment_results" not in json_obj:
38 | continue
39 |
40 | for alignment_result in json_obj['alignment_results']:
41 | alignment_doc = []
42 | for section_alignment in alignment_result['alignments']:
43 | alignment_doc.extend([x for x in section_alignment['left'] if x not in ['-',None]])
44 | alignment_doc.extend([x for x in section_alignment['right'] if x not in ['-',None]])
45 | alignment_docs.append( " ".join(alignment_doc))
46 |
47 |
48 | vectorizer = TfidfVectorizer()
49 | X = vectorizer.fit_transform(alignment_docs)
50 | idf = vectorizer.idf_
51 |
52 | term_scores = zip(vectorizer.get_feature_names(), idf)
53 | term_dict = dict(term_scores)
54 | pickle_file = codecs.open(pickle_file_name,mode = "wb")
55 | pickle.dump(term_dict,pickle_file)
56 | return
57 |
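# Usage sketch (hypothetical file names; compute_tfidf_scores expects an open
# file of newline-delimited alignment JSON, as iterated above):
#
#   with codecs.open("alignments.json", encoding="utf8") as f:
#       compute_tfidf_scores(f, "idf_scores.p")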
58 |
59 |
60 | def construct_training_set(alignments_file,out_file_name,score_threshold = None):
61 | """
62 | Args:
63 | alignments_file (file) -- file containing sample alignments
64 |
65 | out_file_name (string) -- name of training data file to write to
66 |
67 | Returns:
68 | None
69 | """
70 | ec = ElasticConnection(host= "54.203.12.145")
71 |
72 | training_examples = []
73 | for i,x in enumerate(alignments_file):
74 | print i
75 | json_obj = json.loads(x.strip())
76 |
77 | if "alignment_results" not in json_obj.keys():
78 | continue
79 |
80 | left_doc_id = json_obj['query_document_id']
81 | left_bill_title = ec.get_bill_by_id(left_doc_id)['bill_title']
82 |
83 | left_doc = json_obj['query_document']
84 | left_doc = reduce(lambda x,y:x+y,left_doc)
85 |
86 | left_doc_length = len(left_doc.split())
87 |
88 | for i,alignment_doc in enumerate(json_obj['alignment_results']):
89 |
90 | right_doc_id = alignment_doc['document_id']
91 | right_bill_title = ec.get_bill_by_id(right_doc_id)['bill_title']
92 |
93 | for alignment in alignment_doc['alignments']:
94 |
95 | left = alignment['left']
96 | right = alignment['right']
97 | left_start = alignment['left_start']
98 | right_start = alignment['right_start']
99 | left_end = alignment['left_end']
100 | right_end = alignment['right_end']
101 | score = alignment['score']
102 | if score < score_threshold:
103 | continue
104 | training_examples.append([left_doc_id,right_doc_id,left_doc_length,left_start,right_start,left_end,
105 | right_end,score,left_bill_title,right_bill_title,
106 | " ".join(left)," ".join(right)])
107 |
108 |
109 | random.shuffle(training_examples)
110 |
111 | header = ["left_doc_id","right_doc_id","left_doc_length","left_start","right_start","left_end",
112 | "right_end","score","left_bill_title","right_bill_title","left","right"]
113 |
114 |
115 | k = 500
116 | with codecs.open(out_file_name, 'wb') as output_file:
117 | writer = UnicodeWriter(output_file, header)
118 | writer.writerow(header)
119 | for l in training_examples[0:k]:
120 | l = [unicode(x) for x in l]
121 | writer.writerow(l)
122 |
123 |
124 | return
125 |
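# Usage sketch (hypothetical file names and threshold):
#
#   with codecs.open("bill_to_bill_alignments.json", encoding="utf8") as f:
#       construct_training_set(f, "training_data_alignment_classifier.csv",
#                              score_threshold=100)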
126 |
127 |
128 | def evaluate_alignment_classifier(clf):
129 | """runs k-fold cross validation on the training set of an AlignmentClassifier instance (clf) to evaluate the classifier"""
130 |
131 | training_examples = []
132 | for line in csv.reader(clf._training_file):
133 | if ( len(line[10].split()) != len(line[11].split()) ) or line[12] not in ["0","1"]:
134 | continue
135 | if len(line[10]) <= 1 or len(line[11]) < 1:
136 | continue
137 | training_examples.append({"left":line[10].split(),"right":line[11].split(),"label":int(line[12])})
138 |
139 |
140 |
141 |
142 | random.shuffle(training_examples)
143 | X,y = clf.compute_feature_matrix(training_examples)
144 |
145 |
146 | X,y = np.array(X),np.array(y)
147 | kf = KFold(n=len(X), n_folds=4, shuffle=False,
148 | random_state=None)
149 | for train_index, test_index in kf:
150 | X_train, X_test = X[train_index], X[test_index]
151 | y_train, y_test = y[train_index], y[test_index]
152 | clf._model.fit(X_train,y_train)
153 | y_pred = clf._model.predict(X_test)
154 | print classification_report(y_test, y_pred)
155 |
156 | clf._model.fit(X,y)
157 | feat_names = ['length','num_gaps_l','num_gaps_r','num_mismatches','num_matches','avg_gap_length_l',
158 | 'avg_gap_length_r','avg_consec_match_length','jaccard_score','idf_mean','idf_median']
159 |
160 | for x in zip(feat_names,clf._model.coef_.ravel().tolist()):
161 | print x
162 |
163 |
164 |
165 |
166 | class AlignmentClassifier():
167 | """Classifier that labels alignments as either substantive (1) or boilerplate (0)"""
168 |
169 |
170 | def __init__(self,idf_file_path):
171 | """Keyword Args:
172 |
173 | idf_file_path: file path of the table that stores idf scores of the words
174 |
175 | """
176 | self._idf_score_dict = pickle.load(open(idf_file_path))
177 | self._training_file = codecs.open(os.environ['POLICY_DIFFUSION']+\
178 | "/data/training_data_alignment_classifier_bigger.csv",mode = "rU")
179 | self._model = LogisticRegression(penalty='l1')
180 |
181 | def compute_feature_matrix(self,training_examples):
182 | """Keywords Args:
183 |
184 | training_examples: list of dicts, where each dict contains an alignment ("left": left_text, "right": right_text)
185 | and its "label": 1 for substantive, 0 for boilerplate
186 |
187 | Returns:
188 |
189 | X: feature matrix
190 | y: labels
191 |
192 | """
193 |
194 | X = []
195 | y = []
196 | for training_example in training_examples:
197 | left = training_example['left']
198 | right = training_example['right']
199 | label = training_example['label']
200 | meta_features = self._compute_alignment_meta_features(left,right)
201 | idf_features = self._compute_idf_score(left,right)
202 | features = meta_features + idf_features
203 | X.append(features)
204 | y.append(label)
205 |
206 | return X,y
207 |
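    # A minimal usage sketch for compute_feature_matrix (illustrative values only; `clf` is a
    # hypothetical AlignmentClassifier instance):
    #
    #   examples = [{"left": ["the", "-", "act"], "right": ["the", "model", "act"], "label": 1}]
    #   X, y = clf.compute_feature_matrix(examples)
    #   # X holds one 11-element feature vector (9 alignment meta features, sorted by name,
    #   # followed by the idf mean and median); y == [1]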
208 |     def train_model(self):
209 |         """Fits self._model on the training examples in self._training_file
210 | 
211 |         Prints a k-fold cross-validation report and the learned coefficients,
212 |         then refits the model on the full training set.
213 | 
214 |         Returns:
215 |             None
216 | 
217 |         """
218 |
219 |
220 | training_examples = []
221 | for line in csv.reader(self._training_file):
222 | if ( len(line[10].split()) != len(line[11].split()) ) or line[12] not in ["0","1"]:
223 | continue
224 | if len(line[10]) <= 1 or len(line[11]) < 1:
225 | continue
226 | training_examples.append({"left":line[10].split(),"right":line[11].split(),"label":int(line[12])})
227 |
228 |         X,y = self.compute_feature_matrix(training_examples)
229 | 
230 |         # cross-validate for a quick evaluation, then refit on the full training set below
231 |         X,y = np.array(X),np.array(y)
232 |         kf = KFold(n=len(X), n_folds=4, shuffle=False,
233 |                                random_state=None)
234 |         for train_index, test_index in kf:
235 |             X_train, X_test = X[train_index], X[test_index]
236 |             y_train, y_test = y[train_index], y[test_index]
237 |             self._model.fit(X_train,y_train)
238 |             y_pred = self._model.predict(X_test)
239 |             print classification_report(y_test, y_pred)
240 | 
241 |         self._model.fit(X,y)
242 |         feat_names = ['avg_consec_match_length','avg_gap_length_l','avg_gap_length_r','jaccard_score','length',
243 |                       'num_gaps_l','num_gaps_r','num_matches','num_mismatches','idf_mean','idf_median']
244 |         # feature order mirrors the alphabetically sorted keys in _compute_alignment_meta_features plus the two idf scores
245 |         for x in zip(feat_names,self._model.coef_.ravel().tolist()):
246 |             print x
247 |
248 |
249 | def predict(self,alignment_example):
250 | """predicts label for alignment example
251 |
252 |
253 | Keyword Args:
254 |
255 | alignment_example: alignment [left,right] that needs to be labeled
256 |
257 |
258 | """
259 | pass
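        # A minimal sketch of what predict could do, reusing the feature pipeline above
        # (not part of the original code; the dummy label is only needed to satisfy
        # compute_feature_matrix's expected dict shape):
        #
        #   X, _ = self.compute_feature_matrix([{"left": alignment_example[0],
        #                                        "right": alignment_example[1],
        #                                        "label": 0}])
        #   return self._model.predict(np.array(X))[0]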
260 |
261 |
262 |
263 | def _compute_alignment_meta_features(self,left, right):
264 | '''
265 |         Takes the two sides of an alignment (left/right token lists, with '-' marking gaps) and produces meta features describing them
266 | '''
267 | #alignment features
268 | features = {}
269 | features['length'] = len(left)
270 | features['num_gaps_l'] = 0
271 | features['num_gaps_r'] = 0
272 | features['num_mismatches'] = 0
273 | features['num_matches'] = 0
274 | features['avg_gap_length_l'] = []
275 | features['avg_gap_length_r'] = []
276 | features['avg_consec_match_length'] = []
277 | features['jaccard_score'] = jaccard_similarity_score(left,right)
278 |
279 | #helper variables
280 | prev_gap_l = False
281 | prev_gap_r = False
282 | prev_match = False
283 | for i in range(len(left)):
284 | # print 'i: ', i
285 | # print 'features: ', features
286 | if left[i] == '-':
287 | features['num_gaps_l'] += 1
288 | if not prev_gap_l:
289 | features['avg_gap_length_l'].append(1)
290 | prev_gap_l = True
291 | else:
292 | features['avg_gap_length_l'][-1] += 1
293 | else:
294 | prev_gap_l = False
295 | if right[i] == '-':
296 | features['num_gaps_r'] += 1
297 | if not prev_gap_r:
298 | features['avg_gap_length_r'].append(1)
299 | prev_gap_r = True
300 | else:
301 | features['avg_gap_length_r'][-1] += 1
302 | else:
303 | prev_gap_r = False
304 | if left[i] != '-' and right[i] != '-':
305 | if left[i] != right[i]:
306 | features['num_mismatches'] += 1
307 | elif left[i] == right[i]:
308 | features['num_matches'] += 1
309 | if not prev_match:
310 | features['avg_consec_match_length'].append(1)
311 | prev_match = True
312 | else:
313 | features['avg_consec_match_length'][-1] += 1
314 | if left[i] != right[i]:
315 | prev_match = False
316 |
317 | if features['avg_gap_length_l'] != []:
318 | features['avg_gap_length_l'] = np.mean(features['avg_gap_length_l'])
319 | else:
320 | features['avg_gap_length_l'] = 0
321 | if features['avg_gap_length_r'] != []:
322 | features['avg_gap_length_r'] = np.mean(features['avg_gap_length_r'])
323 | else:
324 | features['avg_gap_length_r'] = 0
325 | if features['avg_consec_match_length'] != []:
326 | features['avg_consec_match_length'] = np.mean(features['avg_consec_match_length'])
327 | else:
328 | features['avg_consec_match_length'] = 0
329 |
330 | features = sorted(features.items(),key = lambda x:x[0],reverse= False)
331 | return [x[1] for x in features]
332 |
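    # A small worked example of the meta features above (illustrative tokens, not from the repo),
    # assuming '-' marks a gap in the aligned token lists:
    #
    #   left  = ['the', '-',     'quick', 'fox']
    #   right = ['the', 'brown', 'quick', 'dog']
    #
    #   yields length=4, num_gaps_l=1, num_gaps_r=0, num_matches=2, num_mismatches=1,
    #   avg_gap_length_l=1.0, avg_gap_length_r=0, avg_consec_match_length=1.0, plus the
    #   jaccard_score of the two token lists; the values are returned sorted by feature name.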
333 |
334 | def _compute_idf_score(self,left,right):
335 | idf_scores = []
336 |
337 | for w in left:
338 | if w in self._idf_score_dict:
339 | idf_scores.append(self._idf_score_dict[w])
340 |
341 | for w in right:
342 | if w in self._idf_score_dict:
343 | idf_scores.append(self._idf_score_dict[w])
344 |         if not idf_scores:
345 |             return [0.0, 0.0]
346 |         return [np.mean(idf_scores),np.median(idf_scores)]
347 |
348 |
349 |
350 |
351 | def main():
352 |     parser = argparse.ArgumentParser(description='Classifier to label aligned text as substantive (1) or boilerplate (0)')
353 |     parser.add_argument('command',
354 |                         help='command to run, options are: construct_training_set, compute_tfidf_scores, train_model, evaluate_model')
355 | parser.add_argument('--alignment_samples_doc', dest='alignment_samples',
356 | help="file path to the alignment samples used to construct training set ")
357 |
358 | args = parser.parse_args()
359 |
360 | if args.command == "construct_training_set":
361 | construct_training_set(open(args.alignment_samples),
362 | os.environ['POLICY_DIFFUSION']+"/data/classifier/alignments_training_set_high_scores.csv",50)
363 | elif args.command == "compute_tfidf_scores":
364 | alignments_file = codecs.open("/mnt/data/sunlight/dssg/alignment_results/bill_to_bill_alignments.txt",
365 | encoding = "utf8")
366 | out_file = "/mnt/data/sunlight/dssg/features/alignment_tfidf_scores.p"
367 | compute_tfidf_scores(alignments_file,out_file)
368 |
369 |
370 | elif args.command == "train_model":
371 | pass
372 | elif args.command == "evaluate_model":
373 | pass
374 | else:
375 | print args
376 |         print "command not recognized, please enter construct_training_set, compute_tfidf_scores, train_model or evaluate_model"
377 |
378 |
379 | if __name__ == "__main__":
380 | main()
381 |
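# Example invocations (a sketch; the alignment-samples path below is hypothetical):
#
#   python alignment_classifier.py construct_training_set --alignment_samples_doc /path/to/alignment_samples
#   python alignment_classifier.py compute_tfidf_scores
#
# train_model and evaluate_model are accepted commands but are currently no-ops in main().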
382 |
383 |
--------------------------------------------------------------------------------
/data/model_legislation_urls/clean_urls.txt:
--------------------------------------------------------------------------------
1 | http://publicpolicyalliance.org/legislation/model-alac-bill/
2 | http://www.svia.org/Relations/Legislation.aspx
3 | http://www.mpp.org/legislation/model-medical-marijuana-bill.html?referrer=https://www.google.com/
4 | http://www.bornfreeusa.org/downloads/pdf/Model_State_Legislation_Pets_Shops.pdf
5 | http://www.bornfreeusa.org/downloads/pdf/Model_City_Ordinance_Pet_Shops.pdf
6 | http://www.bornfreeusa.org/downloads/pdf/Model_City_Ordinance_Swap_Meets.pdf
7 | http://www.bornfreeusa.org/downloads/pdf/Model_Unweaned_Bird_Legislation.pdf
8 | http://www.bornfreeusa.org/downloads/pdf/Model_Unweaned_Bird_Legislation.pdf
9 | http://www.bornfreeusa.org/downloads/pdf/Model_State_Legislation_Traveling_Circus.pdf
10 | http://www.bornfreeusa.org/downloads/pdf/Model_City_Legislation_Traveling_Circus.pdf
11 | http://www.bornfreeusa.org/downloads/pdf/Model_State_Legislation_for_Display_of_Exotics.pdf
12 | http://www.bornfreeusa.org/downloads/pdf/Model_City_Legislation_for_Display_of_Exotics.
13 | http://www.bornfreeusa.org/downloads/pdf/Model_City_Ordinance_for_trapping.pdf
14 | http://www.bornfreeusa.org/downloads/pdf/Model_Wildlife_Feeding_Legislation.pdf
15 | http://images2.americanprogress.org/campus/web/ALEC_voter_ID_model_legislation.pdf
16 | http://www.publiccharters.org/wp-content/uploads/2014/01/ModelLaw_P7-wCVR_20110402T222341.pdf
17 | http://apps.americanbar.org/tax/groups/salt/ABA1_OFFICIAL_MODEL_ACT_REPORT_AS_ADOPTED_8-7-06.pdf
18 | http://www.justice.gov/olp/model-state-provisions-pimping-pandering-and-prostitution
19 | http://www.innocenceproject.org/free-innocent/improve-the-law/PreservationofBiologicalEvidencePrescriptiveModelBill2015.pdf
20 | http://www.innocenceproject.org/free-innocent/improve-the-law/PreservationofBiologicalEvidenceTaskForceKeyedtoNISTModelBillRB.pdf
21 | http://www.innocenceproject.org/free-innocent/improve-the-law/EWIDPrescriptiveModelBill2015.pdf
22 | http://www.innocenceproject.org/free-innocent/improve-the-law/EWIDStandardTaskForceModelBill2015.pdf
23 | http://www.innocenceproject.org/free-innocent/improve-the-law/RecordingofCustodialInterrogationsModelBill2015.pdf
24 | http://www.innocenceproject.org/free-innocent/improve-the-law/CompensationModelBill2015.pdf
25 | http://www.innocenceproject.org/free-innocent/improve-the-law/JailhouseInformantModelBill2015.pdf
26 | http://www.innocenceproject.org/free-innocent/improve-the-law/AccesstoPostConvictionDNATestingModelBill2015.pdf
27 | http://www.goodjobsfirst.org/sites/default/files/docs/pdf/GJF_model_disclosure.pdf
28 | http://www.goodjobsfirst.org/sites/default/files/docs/pdf/GJF_model_clawbacks.pdf
29 | http://www.goodjobsfirst.org/sites/default/files/docs/pdf/GJF_model_jobstandards.pdf
30 | http://www.goodjobsfirst.org/sites/default/files/docs/pdf/GJF_model_UEDB.pdf
31 | http://www.goodjobsfirst.org/sites/default/files/docs/pdf/GJF_model_complete.pdf
32 | http://www.pcia.com/images/Advocacy_Docs/PCIA_Model_State_Siting_Legislation_2012.pdf
33 | http://nepc.colorado.edu/files/NEPC-VirtSchool-2-LB-Bathon.pdf
34 | http://www.shallnot.org/legislation
35 | http://www.khi.org/assets/uploads/news/13359/goldwater_institute_right_to_try_model_legislation.pdf
36 | http://www.icmec.org/en_X1/pdf/Child_Pornography_Model_Law_English_7th_Edition_2012.pdf
37 | http://www.federationforchildren.org/wp-content/uploads/2013/12/Parental-Choice-Scholarship-Program-Act-MTE.pdf
38 | http://www.federationforchildren.org/wp-content/uploads/2013/12/Parental-Choice-Scholarship-Program-Act-UE.pdf
39 | http://www.federationforchildren.org/wp-content/uploads/2013/12/Parental-Choice-Scholarship-Program-Act-UEMTSA.pdf
40 | http://www.federationforchildren.org/wp-content/uploads/2013/12/Great-Schools-Tax-Credit-Program-Act.pdf
41 | http://www.federationforchildren.org/wp-content/uploads/2013/12/Education-Savings-Account-Act.pdf
42 | http://www.federationforchildren.org/wp-content/uploads/2013/12/Military-Family-Scholarship-Program-Act.pdf
43 | http://www.federationforchildren.org/wp-content/uploads/2013/12/Foster-Child-Scholarship-Program-Act.pdf
44 | http://greatlakescenter.org/docs/Policy_Briefs/Research-Based-Options/02-Trujillo_Turnarounds-LB.pdf
45 | http://www.academicfreedompetition.com/freedom.php
46 | http://www.dpcare.org/#!dpcc-model-legislation/c14ob
47 | http://toxicsinpackaging.org/model-legislation/model/
48 | http://www.adl.org/assets/pdf/combating-hate/Hate-Crimes-Law.pdf
49 | http://www.davidyerushalmilaw.com/CLE-Course-on-Draft-Uniform-Act--American-Laws-for-American-Courts-b25-p0.html%22
50 | https://www.aclu.org/model-act-regulating-use-wearable-body-cameras-law-enforcement
51 | http://www.ncsl.org/documents/standcomm/sccomfc/point_of_sale_model_bill2010.pdf
52 | http://object.cato.org/pdfs/model-tax-credit-legislation-schaeffer-cato.pdf
53 | http://inspectorsgeneral.org/files/2011/01/IG-Model-Legislation.pdf
54 | http://www.inacol.org/wp-content/uploads/2015/02/Principles-For-Model-Legislation-2012.pdf
55 | http://www.emacweb.org/index.php/mutualaidresources/intrastate-mutual-aid/modellegislation
56 | http://aldf.org/downloads/ALDF_Model_Laws_v15_0.pdf
57 | http://www.nationalpartnership.org/research-library/work-family/psd/model-paid-sick-and-safe-days-legislation.pdf
58 | http://www.nationalpartnership.org/research-library/work-family/psd/section-by-section-analysis-model-legislation.pdf
59 | http://www.nationalpartnership.org/research-library/work-family/psd/fact-sheet-model-legislation-main-points.pdf
60 | https://www.aapa.org/WorkArea/DownloadAsset.aspx?id=548
61 | http://www.indianasenaterepublicans.com/clientuploads/directory/publications/Sen%20David%20Long%20Article%20V%20Packet-Online.pdf
62 | https://www.mackinac.org/21341http://www.naso.org/Resources/Legislation/ModelLegislation.aspx
63 | http://www.federationforchildren.org/wp-content/uploads/2013/12/Great-Schools-Tax-Credit-Program-Act.pdf?e3490a
64 | http://www.federationforchildren.org/wp-content/uploads/2013/12/Parental-Choice-Scholarship-Program-Act-UEMTSA.pdf?e3490a
65 | http://www.federationforchildren.org/wp-content/uploads/2013/12/Parental-Choice-Scholarship-Program-Act-UE.pdf?e3490a
66 | http://www.federationforchildren.org/wp-content/uploads/2013/12/Parental-Choice-Scholarship-Program-Act-MTE.pdf?e3490a
67 | http://www.federationforchildren.org/wp-content/uploads/2013/12/Great-Schools-Tax-Credit-Program-Act.pdf?e3490a
68 | http://autisticadvocacy.org/wp-content/uploads/2014/03/OrganTransplantationModelLegislation.pdf
69 | https://s3.amazonaws.com/peacelegislation/PEACE+Act.pdf
70 | http://www.peaceonourstreets.com/peace
71 | http://www.splc.org/article/2000/07/student-press-law-center-model-legislation-to-protect-student-free-expression-rights
72 | http://www.splc.org/article/1998/09/student-press-law-center-model-guidelines-for-high-school-student-media
73 | http://www.splc.org/article/2009/02/splc-college-student-media-model-guidelines
74 | http://www.peaceonourstreets.com/hemp
75 | http://www.safeaccessnow.org/model_legislation
76 | http://www.constitutionproject.org/pdf/FINAL%20Model%20Legislation.PDF
77 | http://web.archive.org/web/20080725012036/http://innocenceproject.org/docs/Preservation_Evidence_Prescriptive_08.pdf
78 | http://web.archive.org/web/20080705090003/http://www.innocenceproject.org/docs/Preservation_Pilot_08.pdf
79 | http://web.archive.org/web/20080705180144/http://innocenceproject.org/docs/Preservation_Task_Force_08.pdf
80 | http://web.archive.org/web/20080807124850/http://www.innocenceproject.org/docs/Eye_ID_Prescriptive_08.pdf
81 | http://web.archive.org/web/20080705085403/http://www.innocenceproject.org/docs/Eyewitness_ID_Written_Policies_08.pdf
82 | http://web.archive.org/web/20080705085547/http://www.innocenceproject.org/docs/Eyewitness_ID_Task_Force_08.pdf
83 | http://web.archive.org/web/20111014124824/http://www.innocenceproject.org/docs/Recording_Custodial_Interrogations_08.pdf
84 | http://web.archive.org/web/20080725013155/http://innocenceproject.org/docs/CJR_Commission_08.pdf
85 | http://web.archive.org/web/20080705090212/http://www.innocenceproject.org/docs/Compensation08.pdf
86 | http://www.inta.org/Advocacy/Pages/WorldCustomsOrganizationModelLegislation.aspx
87 | https://www.ij.org/images/pdf_folder/legislative/business-reg-relief-act.pdf
88 | https://www.ij.org/images/pdf_folder/legislative/model-reporting-law.pdf
89 | https://www.ij.org/images/pdf_folder/legislative/business-reg-act.pdf
90 | https://www.ij.org/images/pdf_folder/legislative/anti-slapp-model.pdf
91 | https://www.ij.org/images/pdf_folder/legislative/model-ed-legislation.pdf
92 | https://www.ij.org/images/pdf_folder/legislative/ijmodeleconlib.pdf
93 | https://www.ij.org/images/pdf_folder/legislative/ijmodelforfeiturelaw.pdf
94 | http://www.nclc.org/images/pdf/arbitration/model-state-arb-act-2015.pdf
95 | http://www.nclc.org/images/pdf/debt_collection/model_family_financial_protection_act.pdf
96 | http://www.nclc.org/images/pdf/legislation/model_laws/state-model-law-2011.pdf
97 | http://www.nclc.org/images/pdf/foreclosure_mortgage/mediation/model-judicial.pdf
98 | http://www.gunlaws.com/ConstitutionalCarry.htm
99 | http://www.gunlaws.com/GFZ/GFZ-BillReview.htm
100 | http://www.gunlaws.com/HighSchoolMarksmanship.htm
101 | http://www.gunlaws.com/lostcry.htm
102 | http://www.gunlaws.com/PropertyInVehicleLaw.htm
103 | http://www.gunlaws.com/DefensiveDisplay.htm
104 | http://www.gunlaws.com/MontanaMadeGuns.htm
105 | http://www.gunlaws.com/BIDSvNICS.htm
106 | http://www.gunlaws.com/sunshin.htm
107 | http://www.gunlaws.com/911-Limited-Immunity.htm
108 | http://www.gunlaws.com/EnumeratedPowersAct.htm
109 | http://ncra.files.cms-plus.com/GovernmentRelations/FINAL%20Third-Party%20Contracting%20Model%20Legislation.pdf
110 | https://www.heartland.org/policy-documents/model-bill-parent-trigger
111 | http://www.glsen.org/sites/default/files/GLSEN%20state%20model%20legislation.pdf
112 | http://www.frc.org/onepagers/model-legislation-divorce-reform-for-families-with-children
113 | http://www.lac.org/toolkits/sealing/Model%20Expungement%20Statute.pdf
114 | http://www.hopeafterrapeconception.org/model-legislation.html
115 | https://algaonline.org/DocumentCenter/View/11
116 | http://www.nelp.org/content/uploads/2015/04/NELP-Model-Legislation-Work-Sharing.pdf
117 | http://www.flushthetpp.org/tpp-free-zone-model-legislation/
118 | https://www.proenglish.org/official-english/legislation/model-legislation.html
119 | http://www.nascla.org/nascla-model-legislation
120 | https://d3n8a8pro7vhmx.cloudfront.net/offnow/pages/224/attachments/original/1416085011/4th-Amendment-Protection-Act.pdf
121 | https://d3n8a8pro7vhmx.cloudfront.net/offnow/pages/224/attachments/original/1409604203/Electronic-Data-Privacy-Act.pdf?1409604203
122 | https://d3n8a8pro7vhmx.cloudfront.net/offnow/pages/224/attachments/original/1409603024/Freedom_from_Drone_Surveillance_Act.pdf?1409603024
123 | https://d3n8a8pro7vhmx.cloudfront.net/offnow/pages/224/attachments/original/1409604293/Freedom-from-Location-Surveillance-Act.pdf?1409604293
124 | https://d3n8a8pro7vhmx.cloudfront.net/offnow/pages/224/attachments/original/1409604293/Freedom-from-Location-Surveillance-Act.pdf?1409604293
125 | https://d3n8a8pro7vhmx.cloudfront.net/offnow/pages/224/attachments/original/1409603013/CHOICE_Act_of_2015.pdf?1409603013
126 | http://www.cchr.org/download-material/model-legislation.html
127 | http://www.cchr.org/sites/default/files/Electroshock_Model_Legislation.pdf
128 | http://www.cchr.org/sites/default/files/Deadly_Restraint_Model_Legislation.pdf
129 | http://www.cchr.org/sites/default/files/Involuntary_Commitment_Model_Legislation.pdf
130 | http://www.cchr.org/sites/default/files/Rape_Model_Legislation.pdf
131 | http://www.licenseportability.org/assets/pdf/Interstate-Medical-Licensure-Compact-(FINAL).pdf
132 | http://assets.aarp.org/rgcenter/consume/d17158_dwell.pdf
133 | http://ticas.org/sites/default/files/legacy/files/File/Model%20Tax%20Credit.pdf
134 | https://represent.us/wp-content/uploads/2015/04/AACA-Full-Provisions.pdf
135 | http://www.naiaonline.org/uploads/Main_Upload_Directory/NaiaPetFriendlyGuide.pdf
136 | http://www.naiaonline.org/pdfs/NAIA_%20Model_Animal_Control_Law_Final.pdf
137 | http://www.naiaonline.org/uploads/Main_Upload_Directory/naiaShelterReportingAct2014.pdf
138 | http://www.naiaonline.org/pdfs/ShelterImportAndReportingModel.pdf
139 | http://www.naiaonline.org/uploads/Main_Upload_Directory/DogPurchaserProtectionModelLaw.pdf
140 | http://www.naiaonline.org/articles/article/naia-resolution-supporting-animal-welfare#sthash.X3spi6jw.dpbs
141 | http://www.naic.org/documents/committees_b_exchanges_adopted_health_benefit_exchanges.pdf
142 | http://netchoice.org/wp-content/uploads/maiyn-online-safety-model-legislation-v2-6.pdf
143 | http://wdr.doleta.gov/directives/attach/UIPL/UIPL_20_12_att7.pdf
144 | http://wdr.doleta.gov/directives/attach/UIPL/UIPL_20_12_att8.pdf
145 | http://wdr.doleta.gov/directives/attach/UIPL/UIPL_20_12_att5.pdf
146 | http://wdr.doleta.gov/directives/attach/UIPL/UIPL_20_12_att1.pdf
147 | http://wdr.doleta.gov/directives/attach/UIPL/UIPL_20_12_att1.pdf
148 | http://wdr.doleta.gov/directives/attach/UIPL/UIPL_20_12_acc.pdf
149 | http://www.pwia.org/assets/cabinets/Cabinet474/PWIAModelLegislation.pdf
150 | http://www.nhcsl.org/2007-1.php
151 | http://legis.state.nm.us/Sessions/07%20Regular/final/SB0600.pdf
152 | http://www.nhcsl.org/model/HighSchoolOutcomesImprovementAct.pdf
153 | http://www.nhcsl.org/model/HighSchoolOutcomesImprovementAct.pdf
154 | http://gallery.mailchimp.com/c1a51befb8159efb3bbd1f2620f9e1/files/VRA_ModelResolution.pdf
155 | http://www.aamva.org/uploadedFiles/MainSite/Content/SolutionsBestPractices/BestPracticesModelLegislation(1)/ModelLaw_ExcessivelyRaisedVehicles.pdf
156 | http://www.aamva.org/uploadedFiles/MainSite/Content/SolutionsBestPractices/BestPracticesModelLegislation(1)/ModelLaw_LowSpeedVehicles.pdf
157 | http://www.aamva.org/uploadedFiles/MainSite/Content/SolutionsBestPractices/BestPracticesModelLegislation(1)/ModelLaw_ElectronicLienTitle.pdf
158 | http://www.aamva.org/uploadedFiles/MainSite/Content/SolutionsBestPractices/BestPracticesModelLegislation(1)/ModelLaw_AdvertisingonInternet.pdf
159 | http://www.aamva.org/uploadedFiles/MainSite/Content/SolutionsBestPractices/BestPracticesModelLegislation(1)/ModelLawAdvertisingOverInternetJurisdictionCourts.pdf
160 | http://www.aamva.org/uploadedFiles/MainSite/Content/SolutionsBestPractices/BestPracticesModelLegislation(1)/ModelLaw_UnfairTradePractices.pdf
161 | http://www.aamva.org/uploadedFiles/MainSite/Content/SolutionsBestPractices/BestPracticesModelLegislation(1)/ModelLaw_ProspectivePurchaserInquiryFeeForNMVTIS.pdf
162 | http://www.aamva.org/uploadedFiles/MainSite/Content/SolutionsBestPractices/BestPracticesModelLegislation(1)/ModelLaw_MotorCarrierStaggeredRegistration.pdf
163 | http://www.aamva.org/uploadedFiles/MainSite/Content/SolutionsBestPractices/BestPracticesModelLegislation(1)/ModelLaw_DisclosurePersnlInfoInMVRecords.pdf
164 | http://www.aamva.org/uploadedFiles/MainSite/Content/SolutionsBestPractices/BestPracticesModelLegislation(1)/ModelLaw_ContractingForMotorVehicleRegistration.pdf
165 | http://www.aamva.org/uploadedFiles/MainSite/Content/SolutionsBestPractices/BestPracticesModelLegislation(1)/ModelLaw_SalvageVehicleTitling.pdf
166 | http://www.aamva.org/uploadedFiles/MainSite/Content/SolutionsBestPractices/BestPracticesModelLegislation(1)/ModelLaw_MVInspectionsByIndependentContractors.pdf
167 | http://www.aamva.org/uploadedFiles/MainSite/Content/SolutionsBestPractices/BestPracticesModelLegislation(1)/ModelLaw_PrivatizationThirdParties.pdf
168 |
169 |
--------------------------------------------------------------------------------
/db/elasticsearch.yml:
--------------------------------------------------------------------------------
1 | ##################### Elasticsearch Configuration Example #####################
2 |
3 | # This file contains an overview of various configuration settings,
4 | # targeted at operations staff. Application developers should
5 | # consult the guide at .
6 | #
7 | # The installation procedure is covered at
8 | # .
9 | #
10 | # Elasticsearch comes with reasonable defaults for most settings,
11 | # so you can try it out without bothering with configuration.
12 | #
13 | # Most of the time, these defaults are just fine for running a production
14 | # cluster. If you're fine-tuning your cluster, or wondering about the
15 | # effect of a certain configuration option, please _do ask_ on the
16 | # mailing list or IRC channel [http://elasticsearch.org/community].
17 |
18 | # Any element in the configuration can be replaced with environment variables
19 | # by placing them in ${...} notation. For example:
20 | #
21 | #node.rack: ${RACK_ENV_VAR}
22 |
23 | # For information on supported formats and syntax for the config file, see
24 | #
25 |
26 |
27 | ################################### Cluster ###################################
28 |
29 | # Cluster name identifies your cluster for auto-discovery. If you're running
30 | # multiple clusters on the same network, make sure you're using unique names.
31 | #
32 | cluster.name: sunlightcluster
33 |
34 |
35 | #################################### Node #####################################
36 |
37 | # Node names are generated dynamically on startup, so you're relieved
38 | # from configuring them manually. You can tie this node to a specific name:
39 | #
40 | node.name: "sunlight_0"
41 |
42 | # Every node can be configured to allow or deny being eligible as the master,
43 | # and to allow or deny to store the data.
44 | #
45 | # Allow this node to be eligible as a master node (enabled by default):
46 | #
47 | #node.master: true
48 | #
49 | # Allow this node to store data (enabled by default):
50 | #
51 | #node.data: true
52 |
53 | # You can exploit these settings to design advanced cluster topologies.
54 | #
55 | # 1. You want this node to never become a master node, only to hold data.
56 | # This will be the "workhorse" of your cluster.
57 | #
58 | #node.master: false
59 | #node.data: true
60 | #
61 | # 2. You want this node to only serve as a master: to not store any data and
62 | # to have free resources. This will be the "coordinator" of your cluster.
63 | #
64 | #node.master: true
65 | #node.data: false
66 | #
67 | # 3. You want this node to be neither master nor data node, but
68 | # to act as a "search load balancer" (fetching data from nodes,
69 | # aggregating results, etc.)
70 | #
71 | #node.master: false
72 | #node.data: false
73 |
74 | # Use the Cluster Health API [http://localhost:9200/_cluster/health], the
75 | # Node Info API [http://localhost:9200/_nodes] or GUI tools
76 | # such as ,
77 | # ,
78 | # and
79 | # to inspect the cluster state.
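80 | #
81 | # A minimal Python sketch (an illustration, assuming the `requests` package and a node
82 | # listening on localhost:9200) for checking the cluster state from code:
83 | #
84 | #   import requests
85 | #   print requests.get("http://localhost:9200/_cluster/health").json()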
80 |
81 | # A node can have generic attributes associated with it, which can later be used
82 | # for customized shard allocation filtering, or allocation awareness. An attribute
83 | # is a simple key value pair, similar to node.key: value, here is an example:
84 | #
85 | #node.rack: rack314
86 |
87 | # By default, multiple nodes are allowed to start from the same installation location
88 | # to disable it, set the following:
89 | #node.max_local_storage_nodes: 1
90 |
91 |
92 | #################################### Index ####################################
93 |
94 | # You can set a number of options (such as shard/replica options, mapping
95 | # or analyzer definitions, translog settings, ...) for indices globally,
96 | # in this file.
97 | #
98 | # Note, that it makes more sense to configure index settings specifically for
99 | # a certain index, either when creating it or by using the index templates API.
100 | #
101 | # See and
102 | #
103 | # for more information.
104 |
105 | # Set the number of shards (splits) of an index (5 by default):
106 | #
107 | #index.number_of_shards: 5
108 |
109 | # Set the number of replicas (additional copies) of an index (1 by default):
110 | #
111 | #index.number_of_replicas: 1
112 |
113 | # Note, that for development on a local machine, with small indices, it usually
114 | # makes sense to "disable" the distributed features:
115 | #
116 | #index.number_of_shards: 1
117 | #index.number_of_replicas: 0
118 |
119 | # These settings directly affect the performance of index and search operations
120 | # in your cluster. Assuming you have enough machines to hold shards and
121 | # replicas, the rule of thumb is:
122 | #
123 | # 1. Having more *shards* enhances the _indexing_ performance and allows to
124 | # _distribute_ a big index across machines.
125 | # 2. Having more *replicas* enhances the _search_ performance and improves the
126 | # cluster _availability_.
127 | #
128 | # The "number_of_shards" is a one-time setting for an index.
129 | #
130 | # The "number_of_replicas" can be increased or decreased anytime,
131 | # by using the Index Update Settings API.
132 | #
133 | # Elasticsearch takes care about load balancing, relocating, gathering the
134 | # results from nodes, etc. Experiment with different settings to fine-tune
135 | # your setup.
136 |
137 | # Use the Index Status API () to inspect
138 | # the index status.
139 |
140 |
141 | #################################### Paths ####################################
142 |
143 | # Path to directory containing configuration (this file and logging.yml):
144 | #
145 | #path.conf: /path/to/conf
146 |
147 | # Path to directory where to store index data allocated for this node.
148 | #
149 | path.data: /mnt/elasticsearch/
150 | #
151 | # Can optionally include more than one location, causing data to be striped across
152 | # the locations (a la RAID 0) on a file level, favouring locations with most free
153 | # space on creation. For example:
154 | #
155 | #path.data: /path/to/data1,/path/to/data2
156 |
157 | # Path to temporary files:
158 | #
159 | #path.work: /path/to/work
160 |
161 | # Path to log files:
162 | #
163 | #path.logs: /mnt/data/sunlight/es_logs
164 |
165 | # Path to where plugins are installed:
166 | #
167 | #path.plugins: /path/to/plugins
168 |
169 |
170 | #################################### Plugin ###################################
171 |
172 | # If a plugin listed here is not installed for current node, the node will not start.
173 | #
174 | #plugin.mandatory: mapper-attachments,lang-groovy
175 |
176 |
177 | ################################### Memory ####################################
178 |
179 | # Elasticsearch performs poorly when JVM starts swapping: you should ensure that
180 | # it _never_ swaps.
181 | #
182 | # Set this property to true to lock the memory:
183 | #
184 | bootstrap.mlockall: true
185 |
186 | # Make sure that the ES_MIN_MEM and ES_MAX_MEM environment variables are set
187 | # to the same value, and that the machine has enough memory to allocate
188 | # for Elasticsearch, leaving enough memory for the operating system itself.
189 | #
190 | # You should also make sure that the Elasticsearch process is allowed to lock
191 | # the memory, eg. by using `ulimit -l unlimited`.
192 |
193 |
194 | ############################## Network And HTTP ###############################
195 |
196 | # Elasticsearch, by default, binds itself to the 0.0.0.0 address, and listens
197 | # on port [9200-9300] for HTTP traffic and on port [9300-9400] for node-to-node
198 | # communication. (the range means that if the port is busy, it will automatically
199 | # try the next port).
200 |
201 | # Set the bind address specifically (IPv4 or IPv6):
202 | #
203 | #network.bind_host: 192.168.0.1
204 |
205 | # Set the address other nodes will use to communicate with this node. If not
206 | # set, it is automatically derived. It must point to an actual IP address.
207 | #
208 | #network.publish_host: 192.168.0.1
209 |
210 | # Set both 'bind_host' and 'publish_host':
211 | #
212 | #network.host: 192.168.0.1
213 |
214 | # Set a custom port for the node to node communication (9300 by default):
215 | #
216 | #transport.tcp.port: 9300
217 |
218 | # Enable compression for all communication between nodes (disabled by default):
219 | #
220 | #transport.tcp.compress: true
221 |
222 | # Set a custom port to listen for HTTP traffic:
223 | #
224 | #http.port: 9200
225 |
226 | # Set a custom allowed content length:
227 | #
228 | #http.max_content_length: 100mb
229 |
230 | # Disable HTTP completely:
231 | #
232 | #http.enabled: false
233 |
234 |
235 | ################################### Gateway ###################################
236 |
237 | # The gateway allows for persisting the cluster state between full cluster
238 | # restarts. Every change to the state (such as adding an index) will be stored
239 | # in the gateway, and when the cluster starts up for the first time,
240 | # it will read its state from the gateway.
241 |
242 | # There are several types of gateway implementations. For more information, see
243 | # .
244 |
245 | # The default gateway type is the "local" gateway (recommended):
246 | #
247 | #gateway.type: local
248 |
249 | # Settings below control how and when to start the initial recovery process on
250 | # a full cluster restart (to reuse as much local data as possible when using shared
251 | # gateway).
252 |
253 | # Allow recovery process after N nodes in a cluster are up:
254 | #
255 | #gateway.recover_after_nodes: 1
256 |
257 | # Set the timeout to initiate the recovery process, once the N nodes
258 | # from previous setting are up (accepts time value):
259 | #
260 | #gateway.recover_after_time: 5m
261 |
262 | # Set how many nodes are expected in this cluster. Once these N nodes
263 | # are up (and recover_after_nodes is met), begin recovery process immediately
264 | # (without waiting for recover_after_time to expire):
265 | #
266 | #gateway.expected_nodes: 2
267 |
268 |
269 | ############################# Recovery Throttling #############################
270 |
271 | # These settings allow to control the process of shards allocation between
272 | # nodes during initial recovery, replica allocation, rebalancing,
273 | # or when adding and removing nodes.
274 |
275 | # Set the number of concurrent recoveries happening on a node:
276 | #
277 | # 1. During the initial recovery
278 | #
279 | #cluster.routing.allocation.node_initial_primaries_recoveries: 4
280 | #
281 | # 2. During adding/removing nodes, rebalancing, etc
282 | #
283 | #cluster.routing.allocation.node_concurrent_recoveries: 2
284 |
285 | # Set to throttle throughput when recovering (eg. 100mb, by default 20mb):
286 | #
287 | #indices.recovery.max_bytes_per_sec: 20mb
288 |
289 | # Set to limit the number of open concurrent streams when
290 | # recovering a shard from a peer:
291 | #
292 | #indices.recovery.concurrent_streams: 5
293 |
294 |
295 | ################################## Discovery ##################################
296 |
297 | # Discovery infrastructure ensures nodes can be found within a cluster
298 | # and master node is elected. Multicast discovery is the default.
299 |
300 | # Set to ensure a node sees N other master eligible nodes to be considered
301 | # operational within the cluster. This should be set to a quorum/majority of
302 | # the master-eligible nodes in the cluster.
303 | #
304 | #discovery.zen.minimum_master_nodes: 1
305 |
306 | # Set the time to wait for ping responses from other nodes when discovering.
307 | # Set this option to a higher value on a slow or congested network
308 | # to minimize discovery failures:
309 | #
310 | #discovery.zen.ping.timeout: 3s
311 |
312 | # For more information, see
313 | #
314 |
315 | # Unicast discovery allows to explicitly control which nodes will be used
316 | # to discover the cluster. It can be used when multicast is not present,
317 | # or to restrict the cluster communication-wise.
318 | #
319 | # 1. Disable multicast discovery (enabled by default):
320 | #
321 | #discovery.zen.ping.multicast.enabled: false
322 | #
323 | # 2. Configure an initial list of master nodes in the cluster
324 | # to perform discovery when new nodes (master or data) are started:
325 | #
326 | #discovery.zen.ping.unicast.hosts: ["host1", "host2:port"]
327 |
328 | # EC2 discovery allows to use AWS EC2 API in order to perform discovery.
329 | #
330 | # You have to install the cloud-aws plugin for enabling the EC2 discovery.
331 | #
332 | # For more information, see
333 | #
334 | #
335 | # See
336 | # for a step-by-step tutorial.
337 |
338 | # GCE discovery allows to use Google Compute Engine API in order to perform discovery.
339 | #
340 | # You have to install the cloud-gce plugin for enabling the GCE discovery.
341 | #
342 | # For more information, see .
343 |
344 | # Azure discovery allows to use Azure API in order to perform discovery.
345 | #
346 | # You have to install the cloud-azure plugin for enabling the Azure discovery.
347 | #
348 | # For more information, see .
349 |
350 | ################################## Slow Log ##################################
351 |
352 | # Shard level query and fetch threshold logging.
353 |
354 | #index.search.slowlog.threshold.query.warn: 10s
355 | #index.search.slowlog.threshold.query.info: 5s
356 | #index.search.slowlog.threshold.query.debug: 2s
357 | #index.search.slowlog.threshold.query.trace: 500ms
358 |
359 | #index.search.slowlog.threshold.fetch.warn: 1s
360 | #index.search.slowlog.threshold.fetch.info: 800ms
361 | #index.search.slowlog.threshold.fetch.debug: 500ms
362 | #index.search.slowlog.threshold.fetch.trace: 200ms
363 |
364 | #index.indexing.slowlog.threshold.index.warn: 10s
365 | #index.indexing.slowlog.threshold.index.info: 5s
366 | #index.indexing.slowlog.threshold.index.debug: 2s
367 | #index.indexing.slowlog.threshold.index.trace: 500ms
368 |
369 | ################################## GC Logging ################################
370 |
371 | #monitor.jvm.gc.young.warn: 1000ms
372 | #monitor.jvm.gc.young.info: 700ms
373 | #monitor.jvm.gc.young.debug: 400ms
374 |
375 | #monitor.jvm.gc.old.warn: 10s
376 | #monitor.jvm.gc.old.info: 5s
377 | #monitor.jvm.gc.old.debug: 2s
378 |
379 | ################################## Security ################################
380 |
381 | # Uncomment if you want to enable JSONP as a valid return transport on the
382 | # http server. With this enabled, it may pose a security risk, so disabling
383 | # it unless you need it is recommended (it is disabled by default).
384 | #
385 | #http.jsonp.enable: true
386 |
--------------------------------------------------------------------------------