├── .gitignore ├── LICENSE ├── README.md ├── archive ├── alignment_evaluation.py ├── classifier.py ├── exploratory.R ├── input │ ├── .download_bulk_sunlight_files.sh.swp │ ├── Drakefile │ ├── Sunlight_data.ipynb │ ├── bill_metadata.py │ ├── bill_metadata.sql │ ├── campaign_contributions.sh │ ├── committee_metadata.py │ ├── committee_metadata.sql │ ├── download_bulk_sunlight_files.sh │ ├── legislators.py │ ├── legislators.sql │ ├── lobbyists │ │ └── compile_lobbyist_lists.sh │ ├── opensecrets │ │ └── opensecrets_candidates.sql │ ├── state_metadata.py │ ├── state_metadata.sql │ └── unzip_bulk_files.sh ├── prototype_text_alignment_algorithms.py ├── score_alignments.py └── tfidf_ranking.py ├── bashrc_lid ├── data ├── bill_ids.txt ├── classifier │ └── training_data_alignment_classifier.csv ├── evaluation_set │ ├── bills_for_evaluation_set.csv │ └── labeled_bills.json ├── model_legislation_urls │ └── clean_urls.txt └── state_bill_samples.txt ├── db ├── elasticsearch.yml ├── evaluation_mapping.json ├── state_bill_index.json └── state_bill_mapping.json ├── html ├── bootstrap3 │ ├── css │ │ ├── .Rhistory │ │ ├── bootstrap-theme.css │ │ ├── bootstrap-theme.css.map │ │ ├── bootstrap-theme.min.css │ │ ├── bootstrap.css │ │ ├── bootstrap.css.map │ │ ├── bootstrap.min.css │ │ └── custom.css │ ├── fonts │ │ ├── glyphicons-halflings-regular.eot │ │ ├── glyphicons-halflings-regular.svg │ │ ├── glyphicons-halflings-regular.ttf │ │ ├── glyphicons-halflings-regular.woff │ │ └── glyphicons-halflings-regular.woff2 │ └── js │ │ ├── bootstrap.js │ │ ├── bootstrap.min.js │ │ └── npm.js ├── index.html └── templates │ └── searchdemo.html.jinja ├── lid ├── .DS_Store ├── __init__.py ├── alignment_classifier.py ├── config.py ├── database.py ├── etl │ ├── __init__.py │ ├── extractors.py │ ├── load_constitutions_into_elasticsearch.py │ ├── scrapers.py │ └── state_bill_extractors.py ├── evaluation │ ├── __init__.py │ ├── bills_for_evaluation_set.py │ └── lid_evaluation.py ├── frontend.py ├── lid.py ├── text_alignment.py └── utils │ ├── __init__.py │ ├── general_utils.py │ ├── sunlight_utils.py │ └── text_cleaning.py ├── readme.txt ├── requirements.txt ├── scripts ├── bill_to_bill_analysis.py ├── bill_to_bill_parallel.sh ├── compare_constitutions.py ├── generate_bill_to_bill_matches.py ├── generate_model_legislation_matches.py ├── model_legislation_network.py ├── model_legislation_parallel.sh └── model_legislation_to_bill_analysis.py └── tests └── text_alignment_tests.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Drake log 2 | drake.log 3 | 4 | # Database info 5 | default_profile 6 | 7 | # Sunlight key 8 | .sunlight.* 9 | 10 | # IPython Notebook checkpoints 11 | .ipynb_checkpoints/ 12 | 13 | # Byte-compiled / optimized / DLL files 14 | __pycache__/ 15 | *.py[cod] 16 | 17 | # C extensions 18 | *.so 19 | 20 | # Distribution / packaging 21 | .Python 22 | env/ 23 | build/ 24 | develop-eggs/ 25 | dist/ 26 | downloads/ 27 | eggs/ 28 | lib/ 29 | lib64/ 30 | parts/ 31 | sdist/ 32 | var/ 33 | *.egg-info/ 34 | .installed.cfg 35 | *.egg 36 | 37 | # PyInstaller 38 | # Usually these files are written by a python script from a template 39 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
40 | *.manifest 41 | *.spec 42 | 43 | # Installer logs 44 | pip-log.txt 45 | pip-delete-this-directory.txt 46 | 47 | # Unit test / coverage reports 48 | htmlcov/ 49 | .tox/ 50 | .coverage 51 | .cache 52 | nosetests.xml 53 | coverage.xml 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | 62 | # Sphinx documentation 63 | docs/_build/ 64 | 65 | # PyBuilder 66 | target/ 67 | 68 | # Drake 69 | drake.* 70 | .drake/ 71 | 72 | #pycharm 73 | *.idea 74 | 75 | 76 | 77 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Data Science for Social Good 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Legislative Influence Detector 2 | 3 | Legislators often lack the time to write bills, so they tend to rely on outside groups to help. Researchers and concerned citizens would like to know who’s writing legislative bills, but trying to read those bills, let alone trace their source, is tedious and time consuming. This is especially true at the state and local levels, where arguably more important policy decisions are made every day. 4 | 5 | This project provides tools to help analyze and access government bills. Using the Sunlight Foundation’s collection of state bills and model legislation scraped from lobbying groups from around the country, we built tools to shed light on the origination and diffusion of policy ideas around the country, the effectiveness of various lobbying organizations, and the democratic nature of individual bills, all in near real time. 6 | 7 | # How does it work? 8 | 9 | We use the Smith-Waterman local-alignment algorithm to find matching text across documents. This algorithm grabs pieces of text from each document and compares each word, adding points for matches and subtracting points for mismatches. Unfortunately, the local-alignment algorithm is too slow for large sets of text, such as ours. It could take the algorithm thousands of years to finish analyzing the legislation. We improved the speed of the analysis by first limiting the number of documents that need to be compared. 
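To make the scoring concrete, here is a minimal, self-contained sketch of that local-alignment recurrence on word sequences. It is illustrative only and not the repository's API; the project's tuned implementation lives in `lid/text_alignment.py`, and the match, mismatch, and gap scores shown are the defaults used by the prototype aligner in this repository.

```python
def smith_waterman_score(left_words, right_words, match=3, mismatch=-1, gap=-2):
    """Return the best local-alignment score between two lists of words."""
    m, n = len(left_words), len(right_words)
    # score[i][j] = best score of an alignment ending at left_words[i-1] / right_words[j-1]
    score = [[0] * (n + 1) for _ in range(m + 1)]
    best = 0
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            sub = match if left_words[i - 1] == right_words[j - 1] else mismatch
            score[i][j] = max(0,
                              score[i - 1][j - 1] + sub,  # align the two words
                              score[i - 1][j] + gap,      # gap in the right document
                              score[i][j - 1] + gap)      # gap in the left document
            best = max(best, score[i][j])
    return best

# Two bill fragments that share boilerplate language score highly:
print(smith_waterman_score("the department shall adopt rules to implement this act".split(),
                           "the department may adopt rules to implement this section".split()))
```

The quadratic cost of this dynamic program is why the candidate set has to be narrowed first.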
Elasticsearch, our database of choice for this project, efficiently calculates Lucene scores. When we use LID to search for a document, it quickly compares our document against all others and grabs the 100 most similar documents as measured by their Lucene scores. Then we run the local-alignment algorithm on those 100. 10 | 11 | # How to use it? 12 | 13 | * The text_alignment.py file gives our implementation of the Smith-Waterman algorithm. Feel free to use it! 14 | 15 | # Important Files 16 | 17 | * text_alignment.py: contains our fast implementation of the Smith-Waterman algorithm. 18 | 19 | ## Environment Variables 20 | * POLICY_DIFFUSION: path to this project 21 | * LOGFILE_DIRECTORY: should not be inside the repository, to prevent repository bloat 22 | * TEMPFILE_DIRECTORY: stores files created temporarily while the algorithm runs 23 | * ELASTICSEARCH_IP: IP address of the Elasticsearch instance 24 | 25 | -------------------------------------------------------------------------------- /archive/classifier.py: -------------------------------------------------------------------------------- 1 | from alignment_evaluation import alignment_features 2 | import numpy as np 3 | import nltk 4 | from sklearn import linear_model 5 | from sklearn.metrics import confusion_matrix, accuracy_score 6 | 7 | from score_alignments import StateTFIDF 8 | import json 9 | import argparse 10 | import os 11 | from database import ElasticConnection 12 | import random 13 | import codecs 14 | from utils.general_utils import alignment_tokenizer 15 | from utils.general_utils import UnicodeWriter 16 | from sklearn.metrics import jaccard_similarity_score 17 | 18 | 19 | def construct_training_set(alignments_file,out_file_name): 20 | """ 21 | Args: 22 | alignments_file (file) -- file containing sample alignments 23 | 24 | out_file_name (string) -- name of training data file to write to 25 | 26 | Returns: 27 | None 28 | """ 29 | ec = ElasticConnection(host= "54.203.12.145") 30 | 31 | training_examples = [] 32 | for i,x in enumerate(alignments_file): 33 | json_obj = json.loads(x.strip()) 34 | 35 | if "alignment_results" not in json_obj.keys(): 36 | continue 37 | 38 | left_doc_id = json_obj['query_document_id'] 39 | left_bill_title = ec.get_bill_by_id(left_doc_id)['bill_title'] 40 | 41 | left_doc = json_obj['query_document'] 42 | left_doc = reduce(lambda x,y:x+y,left_doc) 43 | 44 | left_doc_length = len(left_doc.split()) 45 | 46 | for i,alignment_doc in enumerate(json_obj['alignment_results']): 47 | 48 | right_doc_id = alignment_doc['document_id'] 49 | right_bill_title = ec.get_bill_by_id(right_doc_id)['bill_title'] 50 | 51 | for alignment in alignment_doc['alignments']: 52 | 53 | left = alignment['left'] 54 | right = alignment['right'] 55 | left_start = alignment['left_start'] 56 | right_start = alignment['right_start'] 57 | left_end = alignment['left_end'] 58 | right_end = alignment['right_end'] 59 | score = alignment['score'] 60 | training_examples.append([left_doc_id,right_doc_id,left_doc_length,left_start,right_start,left_end, 61 | right_end,score,left_bill_title,right_bill_title, 62 | " ".join(left)," ".join(right)]) 63 | 64 | 65 | random.shuffle(training_examples) 66 | 67 | header = ["left_doc_id","right_doc_id","left_doc_length","left_start","right_start","left_end", 68 | "right_end","score","left_bill_title","right_bill_title","left","right"] 69 | 70 | 71 | k = 500 72 | with codecs.open(out_file_name, 'wb') as output_file: 73 | writer = UnicodeWriter(output_file, header) 74 | writer.writerow(header) 75 | for l in training_examples[0:k]: 76 | l = [unicode(x) for x in l] 77 | 
writer.writerow(l) 78 | 79 | 80 | return 81 | 156 | 157 | def features_matrix(alignment): 158 | right = alignment['right'] 159 | left = alignment['left'] 160 | features = alignment_features(left, right) 161 | features['left_tfidf'], features['right_tfidf'] = s.tfidf_score(left, right) 162 | features['score'] = alignment['score'] 163 | features['label'] = alignment['label'] 164 | 165 | return features 166 | 167 | def evaluate_model(): 168 | data = list_alignments 169 | featuresets = [features_matrix(alignment) for alignment in data] 170 | 171 | data_list = [[value['avg_consec_match_length'], value['avg_gap_length_l'], 172 | value['avg_gap_length_r'], value['jaccard_score'], 173 | value['length'], value['num_gaps_l'], value['num_gaps_r'], 174 | value['num_matches'], value['num_mismatches'], 175 | value['score'], value['label']] for value in featuresets] 176 | 177 | alignment_data = np.array(data_list) 178 | alignment_y=alignment_data[:,-1] 179 | alignment_X=alignment_data[:,:-1] 180 | 181 | # A random permutation, to split the data randomly 182 | np.random.seed(0) 183 | indices = np.random.permutation(len(alignment_X)) 184 | train_n = 5 185 | alignment_X_train = 
alignment_X[indices[:-train_n]] 186 | alignment_y_train = alignment_y[indices[:-train_n]] 187 | alignment_X_test = alignment_X[indices[-train_n:]] 188 | alignment_y_test = alignment_y[indices[-train_n:]] 189 | 190 | # Create and fit a logistic regression 191 | logistic = linear_model.LogisticRegression(C=1e5) 192 | logistic.fit(alignment_X_train, alignment_y_train) 193 | y_pred = logistic.predict(alignment_X_test) 194 | 195 | #Calculate accuracy 196 | accuracy_score(alignment_y_test, y_pred) 197 | cm = confusion_matrix(alignment_y_test, y_pred) 198 | 199 | 200 | 201 | def main(): 202 | parser = argparse.ArgumentParser(description='Classifier to label aligned text as "substantive" ') 203 | parser.add_argument('command', 204 | help='command to run, options are: construct_training_set,train_model,evaluate_model') 205 | parser.add_argument('--alignment_samples_doc', dest='alignment_samples', 206 | help="file path to the alignment samples used to construct training set ") 207 | args = parser.parse_args() 208 | 209 | if args.command == "construct_training_set": 210 | construct_training_set(open(args.alignment_samples), 211 | os.environ['POLICY_DIFFUSION']+"/data/classifier/alignments_training_set.csv") 212 | elif args.command == "train_model": 213 | pass 214 | elif args.command == "evaluate_model": 215 | pass 216 | else: 217 | print args 218 | print "command not recognized, please enter construct_training_set,train_model,evaluate_model" 219 | 220 | 221 | if __name__ == "__main__": 222 | main() 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | -------------------------------------------------------------------------------- /archive/exploratory.R: -------------------------------------------------------------------------------- 1 | library('RPostgreSQL') 2 | library('ggplot2') 3 | 4 | db_info <- read.csv('policy_diffusion/default_profile', sep='=', header=F, quote='', stringsAsFactors=F) 5 | 6 | # sessions 7 | drv <- dbDriver('PostgreSQL') 8 | con <- dbConnect(drv, user=db_info$V2[3], password=db_info$V2[4], 9 | dbname=db_info$V2[2], host=db_info$V2[1]) 10 | 11 | # number of governments 12 | dbGetQuery(con, "SELECT COUNT(*) FROM (SELECT DISTINCT state FROM bill_metadata) AS a;") 13 | 14 | # list the governments 15 | dbGetQuery(con, "SELECT DISTINCT state FROM bill_metadata ORDER BY state;") 16 | 17 | # number of sessions 18 | dbGetQuery(con, "SELECT COUNT(*) FROM (SELECT DISTINCT state, session FROM bill_metadata ORDER BY state, session) AS a;") 19 | 20 | # sessions 21 | dbGetQuery(con, "SELECT DISTINCT state, session FROM bill_metadata ORDER BY session;") 22 | 23 | # oldest session per government 24 | dbGetQuery(con, "SELECT state, MIN(session) AS min_session FROM bill_metadata GROUP BY state ORDER BY state;") 25 | 26 | # newest session per government 27 | dbGetQuery(con, "SELECT state, MAX(session) AS max_session FROM bill_metadata GROUP BY state ORDER BY state;") 28 | 29 | # bills and resolutions by government session 30 | bills_and_resolutions <- 31 | dbGetQuery(con, "SELECT a.state, 32 | a.session, 33 | a.bill_freq, 34 | b.resolution_freq 35 | FROM (SELECT state, session, count(*) as bill_freq FROM bill_metadata WHERE type LIKE '%bill%' GROUP BY state, session) AS a, 36 | (SELECT state, session, count(*) as resolution_freq FROM bill_metadata WHERE type LIKE '%resolution%' GROUP BY state, session) AS b 37 | WHERE a.state = b.state AND 38 | a.session = b.session 39 | ORDER BY bill_freq DESC;") 40 | 41 | br_plt <- ggplot(bills_and_resolutions, aes(bill_freq, 
resolution_freq)) 42 | br_plt + theme(axis.text=element_text(size=18), 43 | axis.title=element_text(size=18,face="bold")) + 44 | ylim(0, max(bills_and_resolutions$bill_freq)) + 45 | geom_point() + 46 | xlab("bills") + 47 | ylab("resolutions") + 48 | geom_abline(intercept=0, slope=1) + 49 | geom_text(data=subset(bills_and_resolutions, bill_freq > 5000), 50 | aes(bill_freq, resolution_freq, label=toupper(state)), 51 | vjust=-.5, size=8) + 52 | geom_text(data=subset(bills_and_resolutions, bill_freq < resolution_freq & bill_freq > 100), 53 | aes(bill_freq, resolution_freq, label=toupper(state)), 54 | vjust=-.5, size=8) 55 | 56 | 57 | # how many bills Sunlight scraped from each government after the second 58 | # year it started scraping that government 59 | bills_by_state_year <- 60 | dbGetQuery(con, "SELECT UPPER(c.state) as state, 61 | EXTRACT(YEAR FROM c.created_at) AS year, 62 | COUNT(*) AS freq 63 | FROM bill_metadata AS c, 64 | -- find minimum year 65 | (SELECT a.state, 66 | MIN(a.year) AS min_year 67 | FROM (SELECT state, 68 | EXTRACT(YEAR FROM created_at) AS year 69 | FROM bill_metadata) AS a 70 | GROUP BY state) as b 71 | WHERE c.state = b.state AND 72 | EXTRACT(YEAR FROM created_at) >= b.min_year 73 | GROUP BY c.state, 74 | EXTRACT(YEAR FROM c.created_at) 75 | ORDER BY c.state, 76 | EXTRACT(YEAR FROM c.created_at);") 77 | 78 | # we're missing data for some states in some years 79 | dbGetQuery(con, "SELECT c.state, 80 | c.year - 1 AS missing_year 81 | FROM (SELECT *, 82 | b.year - lag(b.year) OVER w AS gap 83 | FROM (SELECT a.state, 84 | a.year, 85 | COUNT(*) 86 | FROM (SELECT state, 87 | EXTRACT(YEAR FROM created_at) AS year 88 | FROM bill_metadata) AS a 89 | GROUP BY a.state, 90 | a.year 91 | ORDER BY a.state, 92 | a.year) AS b 93 | WINDOW w AS (ORDER BY b.state, b.year)) AS c 94 | WHERE c.gap > 1;") 95 | 96 | missing_values <- data.frame(state = c('MT', 'ND', 'NV', 'TX', 'TX'), 97 | year = c(2014, 2014, 2012, 2012, 2014), 98 | freq = rep(0,5)) 99 | bills_by_state_year <- rbind(bills_by_state_year, missing_values) 100 | bills_by_state_year <- bills_by_state_year[ order(bills_by_state_year$state, bills_by_state_year$year), ] 101 | 102 | # New Jersey 2012 is wrong. 
Subtract 2013 number from total here: http://www.njleg.state.nj.us/bills/BillsByNumber.asp 103 | bills_by_state_year$freq[ bills_by_state_year$state == 'NJ' & bills_by_state_year$year == 2012 ] <- 6808 104 | 105 | sy_plt <- ggplot(bills_by_state_year, aes(year, freq, color=state)) 106 | sy_plt + theme(legend.position="none", 107 | axis.text=element_text(size=18), 108 | axis.title=element_text(size=18,face="bold")) + 109 | geom_line(size=2) + 110 | ylab("frequency") + 111 | geom_text(data=data.frame(state=c('NJ', 'TX', 'NJ', 'NY', 'IL', 'TX'), 112 | year=c(2012, 2013, 2014, 2014, 2015.05, 2015), 113 | freq=c(6850, 11700, 7500, 13200, 7000, 10000)), 114 | aes(x=year, y=freq, label=state), 115 | vjust=-.5, size=7) 116 | -------------------------------------------------------------------------------- /archive/input/.download_bulk_sunlight_files.sh.swp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/policy_diffusion/61d5885e809ab8d92db18d79f7a82b6461cf4343/archive/input/.download_bulk_sunlight_files.sh.swp -------------------------------------------------------------------------------- /archive/input/Drakefile: -------------------------------------------------------------------------------- 1 | PROFILE:=default_profile 2 | %include $[PROFILE] 3 | 4 | psql() 5 | psql -v ON_ERROR_STOP=1 -f $[INPUT] && touch $[OUTPUT] 6 | 7 | 8 | 9 | 10 | ; GRAB STATE LEGISLATIVE METADATA FROM SUNLIGHT 11 | /mnt/data/sunlight/data/input/state_metadata.csv <- [-timestamp] 12 | ; input/./state_metadata.py | sed -E "s/u?'//g" > $OUTPUT 13 | 14 | ; CREATE TABLE / COPY FOR STATE METADATA 15 | ;psql/input/touch_state_metadata <- input/state_metadata.sql, data/input/state_metadata.csv [method:psql] 16 | 17 | 18 | 19 | ; CREATE TABLE FOR BILL METADATA 20 | ; (sql file creates the table; the python script pushes the data) 21 | ;psql/input/touch_bill_metadata <- input/bill_metadata.sql [method:psql] 22 | 23 | ; GRAB BILL METADATA FROM SUNLIGHT 24 | ;data/input/touch_bill_metadata <- input/download_bulk_sunlight_files.sh 25 | ; bash $INPUT && touch $OUTPUT 26 | 27 | 28 | -------------------------------------------------------------------------------- /archive/input/bill_metadata.py: -------------------------------------------------------------------------------- 1 | 2 | import psycopg2 3 | from psycopg2.extras import Json 4 | import json 5 | import csv 6 | import os 7 | import re 8 | 9 | 10 | 11 | # GRAB DATABASE INFO FROM default_profile 12 | db_info = [] 13 | with open('/home/jwalsh/policy_diffusion/default_profile', 'rb') as db_file: 14 | reader = csv.reader(db_file, delimiter='=', quotechar='"') 15 | for row in reader: 16 | db_info.append(row[1]) 17 | 18 | 19 | # CONNECT TO DATABASE 20 | conn = psycopg2.connect(host = db_info[0], database = db_info[1], user = db_info[2], password = db_info[3]) 21 | cur = conn.cursor() 22 | 23 | 24 | 25 | # PARSE BILL METADATA FOR DATABASE INSERTION 26 | def parse_bill_metadata(bill_metadata): 27 | bill_id = bill_metadata['bill_id'] 28 | title = bill_metadata['title'] 29 | if len(bill_metadata['alternate_titles']) > 0: 30 | alternate_titles = Json(bill_metadata['alternate_titles'][0]) 31 | else: 32 | alternate_titles = None 33 | if len(bill_metadata['versions']) > 0: 34 | versions = Json(bill_metadata['versions'][0]) 35 | else: 36 | versions = None 37 | if 'subjects' in bill_metadata: 38 | if len(bill_metadata['subjects']) > 0: 39 | subjects = bill_metadata['subjects'][0] 40 | else: 41 | subjects = None 42 | else: 43 
| subjects = None 44 | if 'scraped_subjects' in bill_metadata: 45 | if len(bill_metadata['scraped_subjects']) > 0: 46 | scraped_subjects = bill_metadata['scraped_subjects'][0] 47 | else: 48 | scraped_subjects = None 49 | else: 50 | scraped_subjects = None 51 | type_ = bill_metadata['type'][0] 52 | if 'level' in bill_metadata: 53 | level = bill_metadata['level'] 54 | else: 55 | level = None 56 | if len(bill_metadata['sponsors']) > 0: 57 | sponsors = Json(bill_metadata['sponsors'][0]) 58 | else: 59 | sponsors = None 60 | if len(bill_metadata['actions']) > 0: 61 | actions = Json(bill_metadata['actions'][0]) 62 | else: 63 | actions = None 64 | if len(bill_metadata['action_dates']) > 0: 65 | action_dates = Json(bill_metadata['action_dates']) 66 | else: 67 | action_dates = None 68 | if len(bill_metadata['documents']) > 0: 69 | documents = Json(bill_metadata['documents'][0]) 70 | else: 71 | documents = None 72 | if len(bill_metadata['votes']) > 0: 73 | votes = Json(bill_metadata['votes'][0]) 74 | else: 75 | votes = None 76 | id_ = bill_metadata['id'] 77 | state = bill_metadata['state'] 78 | chamber = bill_metadata['chamber'] 79 | session = bill_metadata['session'] 80 | 81 | all_ids = bill_metadata['all_ids'][0] 82 | created_at = bill_metadata['created_at'] 83 | updated_at = bill_metadata['updated_at'] 84 | 85 | return((bill_id, title, alternate_titles, versions, subjects, scraped_subjects, 86 | type_, level, sponsors, actions, action_dates, documents, votes, id_, state, 87 | chamber, session, all_ids, created_at, updated_at)) 88 | 89 | 90 | 91 | # GRAB BILL METADATA AND PUSH TO DATABASE 92 | temp_bill_metadata = [] 93 | for path, subdirs, files in os.walk(r'/mnt/data/sunlight/openstates_unzipped/bills/'): 94 | for name in files: 95 | directory_file = os.path.join(path, name) 96 | with open(directory_file) as json_file: 97 | bill = json.load(json_file) 98 | parsed_data = parse_bill_metadata(bill) 99 | temp_bill_metadata.append(parsed_data) 100 | if len(temp_bill_metadata) == 10000 or name == files[len(files)-1]: 101 | args_str = ','.join(cur.mogrify("(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)", x) for x in temp_bill_metadata) 102 | cur.execute("INSERT INTO bill_metadata VALUES " + args_str) 103 | conn.commit() 104 | temp_bill_metadata = [] 105 | -------------------------------------------------------------------------------- /archive/input/bill_metadata.sql: -------------------------------------------------------------------------------- 1 | 2 | DROP TABLE IF EXISTS bill_metadata; 3 | 4 | CREATE TABLE bill_metadata ( 5 | bill_id VARCHAR, 6 | title VARCHAR, 7 | alternate_titles JSON, 8 | versions VARCHAR, 9 | subjects VARCHAR, 10 | scraped_subjects VARCHAR, 11 | type VARCHAR, 12 | level VARCHAR, 13 | sponsors JSON, 14 | actions JSON, 15 | action_dates JSON, 16 | documents JSON, 17 | votes JSON, 18 | leg_id VARCHAR, 19 | state CHAR(2), 20 | chamber VARCHAR, 21 | session VARCHAR, 22 | all_ids VARCHAR, 23 | created_at TIMESTAMP WITHOUT TIME ZONE, 24 | updated_at TIMESTAMP WITHOUT TIME ZONE 25 | ); 26 | -------------------------------------------------------------------------------- /archive/input/campaign_contributions.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | source default_profile 4 | 5 | rm /mnt/data/sunlight/followthemoney/contributions.csv 6 | 7 | for state in AL AK AZ AR CA CO CT DE FL GA HI ID IL IN IA KS KY LA ME MD MA MI MN MS MO MT NE NV NH NJ NM NY NC ND OH OK OR PA RI SC SD TN TX UT VT VA WA WV WI 
WY 8 | do 9 | url="http://www.followthemoney.org/aaengine/aafetch.php?s=$state&law-ot=S,H&gro=d-id&APIKey=$FOLLOWTHEMONEYKEY&mode=csv" 10 | wget -O- --header="Accept: text/html" --user-agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:21.0) Gecko/20100101 Firefox/21.0" $url >> /mnt/data/sunlight/followthemoney/contributions.csv 11 | done 12 | -------------------------------------------------------------------------------- /archive/input/committee_metadata.py: -------------------------------------------------------------------------------- 1 | 2 | from sunlight import openstates 3 | import psycopg2 4 | from psycopg2.extras import Json 5 | import json 6 | import csv 7 | import sys 8 | import re 9 | import os 10 | 11 | 12 | # GRAB DATABASE INFO FROM default_profile 13 | db_info = [] 14 | with open('default_profile', 'rb') as db_file: 15 | reader = csv.reader(db_file, delimiter='=', quotechar='"') 16 | for row in reader: 17 | db_info.append(row[1]) 18 | 19 | 20 | # CONNECT TO DATABASE 21 | conn = psycopg2.connect(host = db_info[0], database = db_info[1], user = db_info[2], password = db_info[3]) 22 | cur = conn.cursor() 23 | 24 | 25 | # PARSE COMMITTEE METADATA 26 | def parse_committee_metadata(committee_metadata): 27 | id_ = committee_metadata['id'] 28 | state = committee_metadata['state'] 29 | chamber = committee_metadata['chamber'] 30 | committee = committee_metadata['committee'] 31 | subcommittee = committee_metadata['subcommittee'] 32 | if len(committee_metadata['members']) > 0: 33 | members = Json(committee_metadata['members'][0]) 34 | else: 35 | members = None 36 | sources = committee_metadata['sources'][0]['url'] 37 | parent_id = committee_metadata['parent_id'] 38 | created_at = committee_metadata['created_at'] 39 | updated_at = committee_metadata['updated_at'] 40 | if len(committee_metadata['all_ids']) > 0: 41 | all_ids = committee_metadata['all_ids'][0] 42 | else: 43 | all_ids = None 44 | if 'level' in committee_metadata: 45 | level = committee_metadata['level'] 46 | else: 47 | level = None 48 | 49 | return((id_, state, chamber, committee, subcommittee, members, 50 | sources, parent_id, created_at, updated_at, all_ids, level)) 51 | 52 | 53 | 54 | # GRAB COMMITTEE METADATA FROM FILES AND PUSH TO DATABASE 55 | temp_committee_metadata = [] 56 | for path, subdirs, files in os.walk(r'/mnt/data/sunlight/openstates_unzipped/committees/'): 57 | for name in files: 58 | directory_file = os.path.join(path, name) 59 | with open(directory_file) as json_file: 60 | committee = json.load(json_file) 61 | parsed_data = parse_committee_metadata(committee) 62 | temp_committee_metadata.append(parsed_data) 63 | 64 | args_str = ','.join(cur.mogrify("(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)", x) for x in temp_committee_metadata) 65 | cur.execute("INSERT INTO committees VALUES " + args_str) 66 | conn.commit() 67 | 68 | -------------------------------------------------------------------------------- /archive/input/committee_metadata.sql: -------------------------------------------------------------------------------- 1 | 2 | DROP TABLE IF EXISTS committees; 3 | 4 | CREATE TABLE committees ( 5 | id VARCHAR, 6 | state VARCHAR(2), 7 | chamber VARCHAR(10), 8 | committee VARCHAR, 9 | subcommittee VARCHAR, 10 | members JSON, 11 | sources VARCHAR, 12 | parent_id VARCHAR(10), 13 | created_at TIMESTAMP WITHOUT TIME ZONE, 14 | updated_at TIMESTAMP WITHOUT TIME ZONE, 15 | all_ids VARCHAR, 16 | level VARCHAR(5) 17 | ); 18 | -------------------------------------------------------------------------------- 
/archive/input/download_bulk_sunlight_files.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ### DOWNLOAD BULK DATA ### 4 | eval $(cat /home/jwalsh/policy_diffusion/default_profile | sed 's/^/export /') 5 | state_abbrevs=$(psql -t -c "SELECT abbreviation FROM state_metadata WHERE bills_identified IS NULL AND abbreviation > 'l' ORDER BY abbreviation;") 6 | user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.65 Safari/537.36" 7 | month="07" # the first day of this month is the last day of records to download 8 | for i in $state_abbrevs; do 9 | urls="$urls -O http://static.openstates.org/downloads/2016-${month}-01-${i}-json.zip" 10 | done 11 | curl -A '$user_agent' $urls 12 | 13 | -------------------------------------------------------------------------------- /archive/input/legislators.py: -------------------------------------------------------------------------------- 1 | 2 | import psycopg2 3 | from psycopg2.extras import Json 4 | import json 5 | import csv 6 | import sys 7 | import re 8 | import os 9 | 10 | 11 | 12 | # GRAB DATABASE INFO FROM default_profile 13 | db_info = [] 14 | with open('/home/jwalsh/policy_diffusion/default_profile', 'rb') as db_file: 15 | reader = csv.reader(db_file, delimiter='=', quotechar='"') 16 | for row in reader: 17 | db_info.append(row[1]) 18 | 19 | 20 | # CONNECT TO DATABASE 21 | conn = psycopg2.connect(host = db_info[0], database = db_info[1], user = db_info[2], password = db_info[3]) 22 | cur = conn.cursor() 23 | 24 | 25 | 26 | # PARSE BILL METADATA FOR DATABASE INSERTION 27 | def parse_legislator_metadata(legislator_metadata): 28 | id_ = legislator_metadata['id'] 29 | if 'votesmart_id' in legislator_metadata: 30 | votesmart_id = legislator_metadata['votesmart_id'] 31 | else: 32 | votesmart_id = None 33 | if 'transparencydata_id' in legislator_metadata: 34 | transparencydata_id = legislator_metadata['transparencydata_id'] 35 | else: 36 | transparencydata_id = None 37 | first_name = legislator_metadata['first_name'] 38 | if len(legislator_metadata['middle_name']) > 0: 39 | middle_name = legislator_metadata['middle_name'] 40 | else: 41 | middle_name = None 42 | last_name = legislator_metadata['last_name'] 43 | if len(legislator_metadata['suffixes']) > 0: 44 | suffixes = legislator_metadata['suffixes'] 45 | else: 46 | suffixes = None 47 | full_name = legislator_metadata['full_name'] 48 | if 'party' in legislator_metadata: 49 | party = legislator_metadata['party'] 50 | else: 51 | party = None 52 | active = legislator_metadata['active'] 53 | if 'url' in legislator_metadata: 54 | url = legislator_metadata['url'] 55 | else: 56 | url = None 57 | if 'photo_url' in legislator_metadata: 58 | photo_url = legislator_metadata['photo_url'] 59 | else: 60 | photo_url = None 61 | if 'office_address' in legislator_metadata: 62 | office_address = legislator_metadata['office_address'] 63 | else: 64 | office_address = None 65 | if 'office_phone' in legislator_metadata: 66 | office_phone = legislator_metadata['office_phone'] 67 | else: 68 | office_phone = None 69 | leg_id = legislator_metadata['leg_id'] 70 | if 'chamber' in legislator_metadata: 71 | chamber = legislator_metadata['chamber'] 72 | else: 73 | chamber = None 74 | if 'district' in legislator_metadata: 75 | district = legislator_metadata['district'] 76 | else: 77 | district = None 78 | state = legislator_metadata['state'] 79 | if len(legislator_metadata['offices']) > 0: 80 | offices = 
Json(legislator_metadata['offices'][0]) 81 | else: 82 | offices = None 83 | if 'email' in legislator_metadata: 84 | email = legislator_metadata['email'] 85 | else: 86 | email = None 87 | if len(legislator_metadata['roles']) > 0: 88 | roles = Json(legislator_metadata['roles'][0]) 89 | else: 90 | roles = None 91 | if 'old_roles' in legislator_metadata: 92 | old_roles = Json(legislator_metadata['old_roles']) 93 | else: 94 | old_roles = None 95 | all_legislative_ids = legislator_metadata['all_ids'][0] 96 | if 'level' in legislator_metadata: 97 | level = legislator_metadata['level'] 98 | else: 99 | level = None 100 | if len(legislator_metadata['sources']) > 0: 101 | sources = Json(legislator_metadata['sources'][0]) 102 | else: 103 | sources = None 104 | created_at = legislator_metadata['created_at'] 105 | updated_at = legislator_metadata['updated_at'] 106 | 107 | return((id_, votesmart_id, transparencydata_id, 108 | first_name, middle_name, last_name, suffixes, full_name, 109 | party, active, url, photo_url, office_address, office_phone, 110 | leg_id, chamber, district, state, offices, email, 111 | roles, old_roles, all_legislative_ids, level, sources, 112 | created_at, updated_at)) 113 | 114 | 115 | 116 | # GRAB BILL METADATA FROM SUNLIGHT AND PUSH TO DATABASE 117 | temp_legislator_metadata = [] 118 | for path, subdirs, files in os.walk(r'/mnt/data/sunlight/openstates_unzipped/legislators/'): 119 | for name in files: 120 | directory_file = os.path.join(path, name) 121 | with open(directory_file) as json_file: 122 | legislator = json.load(json_file) 123 | parsed_data = parse_legislator_metadata(legislator) 124 | temp_legislator_metadata.append(parsed_data) 125 | 126 | args_str = ','.join(cur.mogrify("(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)", x) for x in temp_legislator_metadata) 127 | cur.execute("INSERT INTO legislators VALUES " + args_str) 128 | conn.commit() 129 | 130 | -------------------------------------------------------------------------------- /archive/input/legislators.sql: -------------------------------------------------------------------------------- 1 | 2 | DROP TABLE IF EXISTS legislators; 3 | 4 | CREATE TABLE legislators ( 5 | id VARCHAR, 6 | votesmart_id VARCHAR, 7 | transparencydata_id VARCHAR, 8 | first_name VARCHAR, 9 | middle_name VARCHAR, 10 | last_name VARCHAR, 11 | suffixes VARCHAR, 12 | full_name VARCHAR, 13 | party VARCHAR, 14 | active BOOLEAN, 15 | url VARCHAR, 16 | photo_url VARCHAR, 17 | office_address VARCHAR, 18 | office_phone VARCHAR, 19 | leg_id VARCHAR, 20 | chamber VARCHAR, 21 | district VARCHAR, 22 | state VARCHAR, 23 | offices JSON, 24 | email VARCHAR, 25 | roles JSON, 26 | old_roles JSON, 27 | all_legislative_ids VARCHAR, 28 | level VARCHAR, 29 | sources JSON, 30 | created_at TIMESTAMP WITHOUT TIME ZONE, 31 | updated_at TIMESTAMP WITHOUT TIME ZONE 32 | ); 33 | -------------------------------------------------------------------------------- /archive/input/lobbyists/compile_lobbyist_lists.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | 4 | ## ILLINOIS ## 5 | #url= 6 | 7 | 8 | ## MICHIGAN ## 9 | # http://miboecfr.nictusa.com/cgi-bin/cfr/lobby_srch_res.cgi 10 | 11 | url=http://miboecfr.nictusa.com/cfr/dumpdata/aaarZaGrk/mi_lobby.sh 12 | wget -O michigan_lobbyists.txt --user-agent="jtwalsh@uchicago.edu" $url 13 | 14 | #sed -E 's/\t/,/g' michigan_lobbyists.csv | sed 's/#/ Number/g' | sed -E 's/\(MaxLen=(.){1,3}\)//g' 15 | 16 | 
http://miboecfr.nictusa.com/cfr/dumpdata/aaa3AaiZp/mi_lobby.sh 17 | 18 | # second line of the file has metadata 19 | # the bottom of the file has garbage too 20 | 21 | -------------------------------------------------------------------------------- /archive/input/opensecrets/opensecrets_candidates.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS opensecrets.candidates; 2 | 3 | CREATE TABLE opensecrets.candidates ( 4 | cycle INTEGER NOT NULL, 5 | fec_candidate_id VARCHAR(9) NOT NULL, 6 | candidate_id VARCHAR(9) NOT NULL, 7 | first_last_party VARCHAR(38) NOT NULL, 8 | party VARCHAR(7) NOT NULL, 9 | office_sought VARCHAR(4), 10 | office_held VARCHAR(4), 11 | currently_running BOOLEAN, 12 | VARCHAR(4), 13 | "RL" VARCHAR(4) 14 | ); 15 | -------------------------------------------------------------------------------- /archive/input/state_metadata.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | 4 | from sunlight import openstates 5 | import psycopg2 6 | import csv 7 | import sys 8 | import re 9 | 10 | 11 | 12 | # GRAB DATABASE INFO FROM default_profile 13 | db_info = [] 14 | with open('default_profile', 'rb') as db_file: 15 | reader = csv.reader(db_file, delimiter='=', quotechar='"') 16 | for row in reader: 17 | db_info.append(row[1]) 18 | 19 | 20 | # CONNECT TO DATABASE 21 | conn = psycopg2.connect(host = db_info[0], database = db_info[1], user = db_info[2], password = db_info[3]) 22 | cur = conn.cursor() 23 | 24 | 25 | # FUNCTION TO PARSE STATE METADATA 26 | def parse_state_metadata(state_metadata): 27 | name = state_metadata['name'] 28 | abbreviation = state_metadata['abbreviation'] 29 | if 'lower' in state_metadata['chambers']: 30 | lower_chamber_name = state_metadata['chambers']['lower']['name'] 31 | lower_chamber_title = state_metadata['chambers']['lower']['title'] 32 | else: 33 | lower_chamber_name = None 34 | lower_chamber_title = None 35 | upper_chamber_name = state_metadata['chambers']['upper']['name'] 36 | upper_chamber_title = state_metadata['chambers']['upper']['title'] 37 | feature_flags = ', '.join(state_metadata['feature_flags']) 38 | return((name, abbreviation, lower_chamber_name, lower_chamber_title, 39 | upper_chamber_name, upper_chamber_name, feature_flags)) 40 | 41 | 42 | # GRAB THE DATA FROM SUNLIGHT API 43 | state_metadata = openstates.all_metadata() 44 | 45 | 46 | # PARSE SUNLIGHT DATA AND WRITE TO POSTGRES 47 | temp_state_metadata = [] 48 | for state in state_metadata: 49 | temp_state_metadata.append(parse_state_metadata(state)) 50 | 51 | args_str = ','.join(cur.mogrify("(%s,%s,%s,%s,%s,%s,%s)", x) for x in temp_state_metadata) 52 | cur.execute("INSERT INTO state_metadata VALUES " + args_str) 53 | conn.commit() -------------------------------------------------------------------------------- /archive/input/state_metadata.sql: -------------------------------------------------------------------------------- 1 | 2 | DROP TABLE IF EXISTS state_metadata; 3 | 4 | CREATE TABLE state_metadata ( 5 | name VARCHAR(20), 6 | abbreviation VARCHAR(2), 7 | lower_chamber_name VARCHAR(10), 8 | lower_chamber_title VARCHAR(15), 9 | upper_chamber_name VARCHAR(10), 10 | upper_chamber_title VARCHAR(15), 11 | feature_flags VARCHAR(50) 12 | ); 13 | -------------------------------------------------------------------------------- /archive/input/unzip_bulk_files.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 
2 | 3 | filenames=$(ls /mnt/data/sunlight/openstates_zipped_files/) 4 | 5 | for i in $filenames; do 6 | dir_name=$(echo ${i} | sed -E 's/201[0-9]-0[0-9]-0[0-9]-//g' | sed -E 's/-json.zip//g') 7 | unzip /mnt/data/sunlight/openstates_zipped_files/${i} -d /mnt/data/sunlight/openstates_unzipped/${dir_name} 8 | done 9 | -------------------------------------------------------------------------------- /archive/prototype_text_alignment_algorithms.py: -------------------------------------------------------------------------------- 1 | from text_alignment import * 2 | from gensim.models import Word2Vec 3 | from evaluation.score_alignments import load_word2vec 4 | from scipy.spatial.distance import cosine 5 | 6 | class Word2VecLocalAligner(LocalAligner): 7 | 8 | def __init__(self,match_score = 3, mismatch_score = -1, gap_score = -2): 9 | LocalAligner.__init__(self, match_score, mismatch_score, gap_score) 10 | self.model = load_word2vec() 11 | self._algorithm_name = 'word2vec_local_alignment' 12 | 13 | def __str__(self): 14 | 15 | name_str = "{0} instance".format(self._algorithm_name) 16 | param_str_1 = "match_score = {0}".format(self.match_score) 17 | param_str_2 = "mismatch_score = {0}".format(self.mismatch_score) 18 | param_str_3 = "gap_score = {0}".format(self.gap_score) 19 | return "{0}: {1}, {2}, {3}".format(name_str,param_str_1,param_str_2,param_str_3) 20 | 21 | 22 | def align(self,left_sections,right_sections): 23 | ''' 24 | description: 25 | find alignments between two documents using word2vec 26 | args: 27 | left_sections: a list of lists of words 28 | right_sections: a list of lists of words (usually just a list of a list of words) 29 | 30 | returns: 31 | alignment object 32 | ''' 33 | 34 | alignments = [] 35 | alignment_indices = [] 36 | 37 | for left in left_sections: 38 | for right in right_sections: 39 | 40 | a_ints, b_ints, word_map = self._transform_text(left, right) 41 | 42 | score_matrix, pointer_matrix = self._compute_matrix(a_ints, b_ints,self.match_score, 43 | self.mismatch_score, self.gap_score, self.model) 44 | 45 | l, r, score, align_index = self._backtrace(a_ints, b_ints, score_matrix, pointer_matrix) 46 | 47 | reverse_word_map = {v:k for k,v in word_map.items()} 48 | reverse_word_map["-"] = "-" 49 | l = [reverse_word_map[w] for w in l] 50 | r = [reverse_word_map[w] for w in r] 51 | 52 | alignment_indices.append(align_index) 53 | alignments.append((score, l, r)) 54 | 55 | left = reduce(lambda x,y:x+y,left_sections) 56 | right = reduce(lambda x,y:x+y,right_sections) 57 | 58 | return Alignment(left,right,alignments,alignment_indices) 59 | 60 | 61 | @jit 62 | def _compute_matrix(self, left, right, match_score, mismatch_score, gap_score, model): 63 | ''' 64 | description: 65 | create matrix of optimal scores 66 | args: 67 | left: an array of integers 68 | right: an array of integers 69 | match_score: score for match in alignment 70 | mismatch_score: score for mismatch in alignment 71 | gap_start: score for first gap 72 | gap_extend: score for every gap 73 | model: word2vec model 74 | returns: 75 | three matrices required to construct optimal solution 76 | ''' 77 | m = len(left) + 1 78 | n = len(right) + 1 79 | score_matrix = np.zeros((m, n),dtype = float) 80 | scores = np.zeros((4),dtype = float) 81 | pointer_matrix = np.zeros((m,n),dtype = int) 82 | for i in xrange(1, m): 83 | for j in xrange(1, n): 84 | 85 | if left[i-1] == right[j-1]: 86 | scores[1] = score_matrix[i-1,j-1] + match_score 87 | else: 88 | scores[1] = score_matrix[i-1,j-1] + mismatch_score*cosine(left[i-1], right[j-1]) 
89 | 90 | scores[2] = score_matrix[i, j - 1] + gap_score 91 | 92 | scores[3] = score_matrix[i - 1, j] + gap_score 93 | 94 | max_decision = np.argmax(scores) 95 | 96 | pointer_matrix[i,j] = max_decision 97 | score_matrix[i,j] = scores[max_decision] 98 | 99 | return score_matrix, pointer_matrix -------------------------------------------------------------------------------- /archive/score_alignments.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Functions for scoring alignments 3 | ''' 4 | 5 | from sklearn.feature_extraction.text import TfidfVectorizer 6 | from sklearn.metrics import jaccard_similarity_score 7 | import numpy as np 8 | import scipy as sp 9 | from database import * 10 | from gensim.models import Word2Vec 11 | from utils.general_utils import save_pickle 12 | import json 13 | 14 | def weight_length(alignment, left_length, right_length): 15 | print alignment 16 | return np.sum([a[0]*(len(a[1])/float(left_length))*(len(a[2])/float(right_length)) for a in alignment.alignments]) 17 | 18 | def weight_tfidf(alignment, state_tfidf, left_state, right_state): 19 | ''' 20 | state_tfidf: dictionary with tfidf scores by state 21 | ''' 22 | f = StateTFIDF(state_tfidf) 23 | return np.sum([f.tfidf_score(a, left_state, right_state)*a[0] for a in alignment.alignments]) 24 | 25 | def jaccard_coefficient(left, right): 26 | jaccard_scores = jaccard_similarity_score(left,right) 27 | return jaccard_scores 28 | 29 | def load_word2vec(): 30 | model = Word2Vec.load_word2vec_format('/mnt/data/sunlight/GoogleNews-vectors-negative300.bin', binary=True) 31 | 32 | return model 33 | 34 | def word2vec_similarity(list_of_alignments, model): 35 | ''' 36 | model is word2vec model 37 | ''' 38 | distances = [] 39 | for alignment in list_of_alignments: 40 | score, left, right = alignment 41 | 42 | word_distance_list = [] 43 | for i in range(len(left)): 44 | 45 | if left[i] not in model or right[i] not in model: 46 | continue 47 | 48 | word_distance_list.append(model.similarity(left[i], right[i])) 49 | 50 | distances.append(np.mean(word_distance_list)) 51 | 52 | return np.mean(distances) 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | #################################################################### 62 | ##tfidf functions 63 | 64 | def tfidf_by_state(state, num_bills = 'all'): 65 | ''' 66 | description: 67 | create dictionary of tfidf scores for a particular state 68 | args: 69 | num_bills: number of bills to run the algorithm open 70 | returns: 71 | dictionary of tfidf scores with words as keys 72 | ''' 73 | es = ElasticConnection() 74 | state_bills = es.get_bills_by_state(state, num_bills) 75 | corpus = [bill['_source']['bill_document_last'] for bill in state_bills \ 76 | if bill['_source']['bill_document_last'] != None] 77 | 78 | vectorizer = TfidfVectorizer() 79 | X = vectorizer.fit_transform(corpus) 80 | idf = vectorizer.idf_ 81 | idf = vectorizer._tfidf.idf_ 82 | 83 | return dict(zip(vectorizer.get_feature_names(), idf)) 84 | 85 | 86 | def tfidf_all_bills(): 87 | ''' 88 | description: 89 | create dictionary of tfidf scores for a particular state 90 | args: 91 | num_bills: number of bills to run the algorithm open 92 | returns: 93 | dictionary of tfidf scores with words as keys 94 | ''' 95 | es = ElasticConnection() 96 | state_bills = es.get_all_bills() 97 | corpus = [bill['_source']['bill_document_last'] for bill in state_bills \ 98 | if bill['_source']['bill_document_last'] != None] 99 | 100 | vectorizer = TfidfVectorizer() 101 | X = 
vectorizer.fit_transform(corpus) 102 | idf = vectorizer.idf_ 103 | idf = vectorizer._tfidf.idf_ 104 | 105 | return dict(zip(vectorizer.get_feature_names(), idf)) 106 | 107 | 108 | def tfidf_by_all_states(): 109 | states = ['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA', 'HI', 'ID', 'IL', 110 | 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO','MT', 'NE', 111 | 'NV', 'NH','NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 112 | 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY'] 113 | states = map(lambda x : x.lower(), states) 114 | 115 | tfidf = {} 116 | for state in states: 117 | print 'working on ', state 118 | tfidf[state] = tfidf_by_state(state) 119 | 120 | return tfidf 121 | 122 | 123 | #################################################################### 124 | ##state tfidf object 125 | class StateTFIDF(): 126 | 127 | def __init__(self, state_tfidf): 128 | self.state_tfidf = state_tfidf 129 | 130 | def find_tfidf(self, word, state): 131 | if state == 'model_legislation': 132 | return 0 133 | elif word == '-' or word not in self.state_tfidf[state]: 134 | return 0 135 | else: 136 | return self.state_tfidf[state][word] 137 | 138 | def tfidf_score(self, left, right, left_state, right_state): 139 | ''' 140 | gives average tfidf for a particular left and right components of alignment 141 | ''' 142 | left_scores = [] 143 | right_scores = [] 144 | 145 | for i in range(len(left)): 146 | left_scores.append(self.find_tfidf(left[i], left_state)) #need function 147 | right_scores.append(self.find_tfidf(right[i], right_state)) 148 | 149 | if scores == []: 150 | return 0 151 | else: 152 | return np.mean(left_scores), np.mean(right_scores) 153 | 154 | 155 | def tfidf_by_alignments(): 156 | alignments = [] 157 | with open('bill_to_bill_alignments.txt') as f: 158 | for i,line in enumerate(f): 159 | print 'line ', i 160 | alignments.append(json.loads(line)) 161 | 162 | if __name__ == "__main__": 163 | tfidf = tfidf_all_bills() 164 | save_pickle(tfidf, 'tfidf_all_bills') 165 | 166 | 167 | 168 | 169 | -------------------------------------------------------------------------------- /archive/tfidf_ranking.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_extraction.text import TfidfVectorizer 2 | import numpy as np 3 | import pickle 4 | from alignment_evaluation import * 5 | from database import * 6 | import time 7 | 8 | def calc_tfidf_alignments(alignments_list): 9 | ''' 10 | arg: 11 | list of alignment objects 12 | returns: 13 | dictionary with tfi_idf scores 14 | ''' 15 | corpus = [alignment[1] + alignment[2] \ 16 | for alignments in alignments_list for alignment in alignments ] 17 | corpus = [' '.join(doc) for doc in corpus] 18 | vectorizer = TfidfVectorizer() 19 | X = vectorizer.fit_transform(corpus) 20 | idf = vectorizer.idf_ 21 | idf = vectorizer._tfidf.idf_ 22 | return dict(zip(vectorizer.get_feature_names(), idf)) 23 | 24 | 25 | def rank_alignments(alignments_list): 26 | ''' 27 | arg: 28 | list of alignment objects 29 | returns: 30 | list of alignment objects sorted by averaged tfi_idf score 31 | ''' 32 | tfidf = calc_tfidf_alignments(alignments_list) 33 | 34 | not_in_dict = 0 35 | in_dict = 0 36 | 37 | alignments_tfidf = [] 38 | for alignments in alignments_list: 39 | tfidf_scores = [] 40 | for alignment in alignments: 41 | print alignment 42 | for word in alignment[1]: 43 | if word in tfidf: 44 | tfidf_scores.append(tfidf[word.lower()]) 45 | in_dict += 1 46 | if word != '-' and word 
not in tfidf: 47 | not_in_dict += 1 48 | for word in alignment[2]: 49 | if word in tfidf: 50 | tfidf_scores.append(tfidf[word.lower()]) 51 | in_dict += 1 52 | if word != '-' and word not in tfidf: 53 | not_in_dict += 1 54 | if tfidf_scores != []: 55 | alignments_tfidf.append((alignments, np.sum(tfidf_scores))) 56 | else: 57 | alignments_tfidf.append((alignments, 0)) 58 | 59 | print "num not in dict: ", not_in_dict 60 | print "in dict: ", in_dict 61 | 62 | alignments_tfidf.sort(key = lambda x: x[1], reverse=True) 63 | 64 | return alignments_tfidf 65 | 66 | 67 | def tfidf_by_state(state, num_bills = 'all'): 68 | ''' 69 | description: 70 | create dictionary of tfidf scores for a particular 71 | args: 72 | state 73 | num_bills: number of bills to run the algorithm open 74 | returns: 75 | dictionary of tfidf scores with words as keys 76 | ''' 77 | es = ElasticConnection() 78 | state_bills = es.get_bills_by_state(state, num_bills) 79 | corpus = [bill['_source']['bill_document_last'] for bill in state_bills \ 80 | if bill['_source']['bill_document_last'] != None] 81 | 82 | vectorizer = TfidfVectorizer() 83 | X = vectorizer.fit_transform(corpus) 84 | idf = vectorizer.idf_ 85 | idf = vectorizer._tfidf.idf_ 86 | 87 | return dict(zip(vectorizer.get_feature_names(), idf)) 88 | 89 | 90 | def tfidf_by_all_states(): 91 | states = ['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA', 'HI', 'ID', 'IL', 92 | 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO','MT', 'NE', 93 | 'NV', 'NH','NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 94 | 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY'] 95 | states = map(lambda x : x.lower(), states) 96 | 97 | tfidf = {} 98 | for state in states: 99 | print 'working on ', state 100 | tfidf[state] = tfidf_by_state(state) 101 | 102 | return tfidf 103 | 104 | 105 | #################################################################### 106 | ##state tfidf object 107 | class StateTFIDF(): 108 | 109 | def __init__(self, state_tfidf): 110 | self.state_tfidf = state_tfidf 111 | 112 | def find_tfidf(self, word, state): 113 | if state == 'model_legislation': 114 | return 0 115 | elif word == '-' or word not in self.state_tfidf[state]: 116 | return 0 117 | else: 118 | return self.state_tfidf[state][word] 119 | 120 | def tfidf_score(self, alignment_with_state): 121 | scores = [] 122 | print 'alignment_with_state: ', alignment_with_state 123 | raw_input("Press Enter to continue...") 124 | alignment, left_state, right_state = alignment_with_state 125 | score, left, right = alignment[0] #TODO: make work for more than one alignment 126 | 127 | for i in range(len(left)): 128 | scores.append(self.find_tfidf(left[i], left_state)) #need function 129 | scores.append(self.find_tfidf(right[i], right_state)) 130 | 131 | if scores == []: 132 | return 0 133 | else: 134 | return np.mean(scores) 135 | 136 | 137 | #################################################################### 138 | ##ranking functions 139 | def rank(alignments_list, functions): 140 | ''' 141 | depending on the function used, alignments_list may contain states of the alignments of not 142 | ''' 143 | ranking = [] 144 | for alignments in alignments_list: 145 | scores = [] 146 | #keep track of for normalization 147 | max_function_values = np.zeros((4)) 148 | 149 | for i in range(len(functions)): 150 | function = functions[i] 151 | output = function(alignments) 152 | scores.append(output) 153 | ranking.append((alignments, scores)) 154 | 155 | if max_function_values[i] < output: 156 
| max_function_values[i] = output 157 | 158 | final_ranking = [] 159 | for alignments, scores in ranking: 160 | rank_value = [] 161 | scores_max = zip(scores, max_function_values) 162 | 163 | for score, maxim in scores_max: 164 | rank_value.append(score / float(maxim)) 165 | 166 | final_ranking.append((alignments[0][0], np.mean(scores))) 167 | 168 | final_ranking.sort(key = lambda x: x[1], reverse=True) 169 | 170 | return final_ranking 171 | 172 | 173 | def inspect_ranking(ranking): 174 | for alignments, tfidf in ranking: 175 | score, left, right = alignments 176 | for i in range(len(left)): 177 | print left[i], right[i] 178 | print 'alignment score: ', score 179 | print 'mean tfidf: ', tfidf 180 | raw_input("Press Enter to continue...") 181 | print '\n' 182 | 183 | 184 | 185 | if __name__ == '__main__': 186 | 187 | 188 | # tfidf = calc_tfidf(alignments_list) 189 | 190 | # alignments_tfidf = rank_alignments(alignments_list) 191 | 192 | # print 'testing speed of calculating tfidf per state' 193 | 194 | # t1 = time.time() 195 | # t=tfidf_state('al') 196 | # print 'alabama time: {0} seconds'.format(time.time()-t1) 197 | 198 | # t1 = time.time() 199 | # t=tfidf_state('ny') 200 | # print 'new york time: {0} seconds'.format(time.time()-t1) 201 | 202 | # print 'calculate tfidf by state...' 203 | 204 | # tfidf = tfidf_by_all_states() 205 | 206 | # with open('state_tfidfs.p', 'wb') as fp: 207 | # pickle.dump(tfidf, fp) 208 | 209 | print 'loading experiment and building alignment list...' 210 | with open('experiment.p', 'rb') as fp: 211 | e = pickle.load(fp) 212 | 213 | alignments_list = [] 214 | for key, value in e.results.iteritems(): 215 | i, j = key 216 | state_i = e.bills[i]['state'] 217 | state_j = e.bills[j]['state'] 218 | alignments_list.append((value['alignments'], state_i, state_j)) 219 | 220 | 221 | with open('state_tfidfs.p', 'rb') as fp: 222 | tfidf = pickle.load(fp) 223 | f = StateTFIDF(tfidf) 224 | 225 | print 'calculating ranking...' 226 | ranking = rank(alignments_list, [f.tfidf_score]) 227 | inspect_ranking(ranking) 228 | 229 | -------------------------------------------------------------------------------- /bashrc_lid: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # make sure path to this project is set 4 | if [ ! -n "$POLICY_DIFFUSION" ]; then 5 | echo "Error: \$POLICY_DIFFUSION environment variable is not set" 6 | return 7 | fi 8 | 9 | # ensure user specified directory for log files 10 | if [ ! -n "$LOGFILE_DIRECTORY" ]; then 11 | echo "Error: \$LOGFILE_DIRECTORY environment variable is not set" 12 | return 13 | fi 14 | 15 | # ensure users specified a directory for temporary files 16 | if [ ! -n "$TEMPFILE_DIRECTORY" ]; then 17 | echo "Error: \$TEMPFILE_DIRECTORY environment variable is not set" 18 | return 19 | fi 20 | 21 | # ensure users specified the IP address for the ElasticSearch instance 22 | if [ ! 
-n "$ELASTICSEARCH_IP" ]; then 23 | echo "Error: \$ELASTICSEARCH_IP environment variable is not set" 24 | return 25 | fi 26 | 27 | # add python code to path 28 | export PYTHONPATH=${POLICY_DIFFUSION}/lid:${PYTHONPATH} 29 | export PYTHONPATH=${POLICY_DIFFUSION}/lid/etl:${PYTHONPATH} 30 | export PYTHONPATH=${POLICY_DIFFUSION}/lid/utils:${PYTHONPATH} 31 | export PYTHONPATH=${POLICY_DIFFUSION}/lid/evaluation:${PYTHONPATH} 32 | export PYTHONPATH=${POLICY_DIFFUSION}/scripts:${PYTHONPATH} 33 | -------------------------------------------------------------------------------- /data/evaluation_set/bills_for_evaluation_set.csv: -------------------------------------------------------------------------------- 1 | Adult Guardianship and Protective Proceedings Jurisdiction Act,ms,HB 55,2011,Moak,Introduced, Adult Guardianship and Protective Proceedings Jurisdiction Act,va,SB 750,2011,Howell,Enacted, Adult Guardianship and Protective Proceedings Jurisdiction Act,ky,HB 164,2011 Regular Session,Marzian,Enacted, Adult Guardianship and Protective Proceedings Jurisdiction Act,mo,SB 213,2011,Schaefer,Enacted,https://legiscan.com/MO/text/SB213/id/294359/Missouri-2011-SB213-Enrolled.pdf Adult Guardianship and Protective Proceedings Jurisdiction Act,ar,SB 4,2011,Johnson,Enacted, Adult Guardianship and Protective Proceedings Jurisdiction Act,ct,SB 1053,2011,,Introduced, Adult Guardianship and Protective Proceedings Jurisdiction Act,sd,HB 1062,2011,Lust,Enacted,http://legis.sd.gov/docs/legsession/2011/Bills/HB1062HJU.pdf Adult Guardianship and Protective Proceedings Jurisdiction Act,nm,SB 146,2011,Payne,Enacted, Adult Guardianship and Protective Proceedings Jurisdiction Act,id,SB 1056,2011,,Enacted,http://legislature.idaho.gov/legislation/2011/S1056.pdf Adult Guardianship and Protective Proceedings Jurisdiction Act,ma,H 2181,187th,Gobi,Introduced, Adult Guardianship and Protective Proceedings Jurisdiction Act,me,LD 1377,2012,Sanborn,Enacted,http://www.mainelegislature.org/legis/bills/getPDF.asp?paper=HP1016&item=1&snum=125 Adult Guardianship and Protective Proceedings Jurisdiction Act,fl,HB 1431,2010,Schwartz,Introduced,http://static-lobbytools.s3.amazonaws.com/bills/2010/pdf/1431.pdf Adult Guardianship and Protective Proceedings Jurisdiction Act,ri,HB7687,2014,Craven/McCaffrey,Introduced, Adult Guardianship and Protective Proceedings Jurisdiction Act,ny,SB 7464,2012,Hannon,Introduced,https://legiscan.com/NY/text/S07464/id/646869/New_York-2011-S07464-Introduced.html Adult Guardianship and Protective Proceedings Jurisdiction Act,wy,SB 39,2013,Ross,Enacted,https://legiscan.com/WY/text/SF0039/2013 Adult Guardianship and Protective Proceedings Jurisdiction Act,ma,H 1366,188th,Gobi,Introduced, Adult Guardianship and Protective Proceedings Jurisdiction Act,ms,SB 2655,2013,Hopson,Introduced, Adult Guardianship and Protective Proceedings Jurisdiction Act,sc,SB 1070,2010,Hayes,Enacted,http://www.scstatehouse.gov/sess118_2009-2010/prever/1070_20100120.htm Adult Guardianship and Protective Proceedings Jurisdiction Act,az,HB 2426,49th-2nd-regular,Driggs,Enacted, Adult Guardianship and Protective Proceedings Jurisdiction Act,ri,HB 5958,2011,Kennedy,Introduced,https://legiscan.com/RI/text/H5958/id/268260/Rhode_Island-2011-H5958-Draft.pdf Adult Guardianship and Protective Proceedings Jurisdiction Act,nh,SB 209,2015,Stiles,Enacted, Adult Guardianship and Protective Proceedings Jurisdiction Act,ri,SB525,2015,Lombardi/Craven,Introduced, Adult Guardianship and Protective Proceedings Jurisdiction Act,ia,HF 
734,2010,,Enacted,https://www.legis.iowa.gov/DOCS/IowaActs/83/2/pdf/Chapter_1086.pdf Adult Guardianship and Protective Proceedings Jurisdiction Act,vt,SB 169,2010,Campbell,Introduced,https://legiscan.com/VT/text/S0169/id/384141/Vermont-2009-S0169-Introduced.pdf Adult Guardianship and Protective Proceedings Jurisdiction Act,md,SB 231,2010,Kelley,Enacted, Adult Guardianship and Protective Proceedings Jurisdiction Act,ok,SB 2204,2010,Ivester,Enacted,http://www.oklegislature.gov/cf_pdf/2009-10%20ENR/sb/sb2204%20enr.pdf Adult Guardianship and Protective Proceedings Jurisdiction Act,va,SB 80,2010,Howell,Introduced, Adult Guardianship and Protective Proceedings Jurisdiction Act,nj,AB 4253,2011,DiCicco,Introduced,http://www.njleg.state.nj.us/2010/Bills/A4500/4253_I1.HTM Adult Guardianship and Protective Proceedings Jurisdiction Act,hi,SB 2318,2012,Chun,Enacted,https://legiscan.com/HI/text/SB2318/id/544560/Hawaii-2012-SB2318-Introduced.html Adult Guardianship and Protective Proceedings Jurisdiction Act,pa,HB 1720,2012,Hennessey,Enacted,http://www.legis.state.pa.us/CFDOCS/Legis/PN/Public/btCheck.cfm?txtType=HTM&sessYr=2011&sessInd=0&billBody=H&billTyp=B&billNbr=1720&pn=2589 Adult Guardianship and Protective Proceedings Jurisdiction Act,ms,HB 191,2012,Moak,Introduced, Adult Guardianship and Protective Proceedings Jurisdiction Act,ct,HB 5150,2012,,Enacted, Adult Guardianship and Protective Proceedings Jurisdiction Act,nj,A 2628,215,Rudder,Enacted, Adult Guardianship and Protective Proceedings Jurisdiction Act,mi,SB 466,2013-2014,Schuitmaker,Introduced, Adult Guardianship and Protective Proceedings Jurisdiction Act,ma,S 2249,188th,,Enacted, Adult Guardianship and Protective Proceedings Jurisdiction Act,ms,SB 2240,2014,Hopson,Enacted, Adult Guardianship and Protective Proceedings Jurisdiction Act,ny,A 857,2013-2014,Weinstein,Introduced, Adult Guardianship and Protective Proceedings Jurisdiction Act,tn,SB 444,2010,Stewart,Enacted,https://legiscan.com/TN/text/SB0444/id/461093/Tennessee-2009-SB0444-Draft.pdf Adult Guardianship and Protective Proceedings Jurisdiction Act,tx,HB 2998,84,Rodriguez,Introduced, Adult Guardianship and Protective Proceedings Jurisdiction Act,ga,SB 207,2015_16,McKoon,Introduced, Adult Guardianship and Protective Proceedings Jurisdiction Act,nc,HB 817,2015,Hurley,Introduced, Adult Guardianship and Protective Proceedings Jurisdiction Act,ct,SB 426,2010,,Introduced,http://www.cga.ct.gov/2010/FC/2010SB-00426-R000518-FC.htm Adult Guardianship and Protective Proceedings Jurisdiction Act,mn,SF412,2009-2010,Moua,Enacted, Adult Guardianship and Protective Proceedings Jurisdiction Act,nm,SB 497,2009,Wirth,Introduced,http://www.nmlegis.gov/Sessions/09%20Regular/bills/senate/SB0497.pdf Adult Guardianship and Protective Proceedings Jurisdiction Act,nv,SB 313,75,,Enacted, Adult Guardianship and Protective Proceedings Jurisdiction Act,ct,SB 576,2009,Doyle,Introduced,http://www.cga.ct.gov/2009/FC/pdf/2009SB-00576-R000752-FC.pdf Adult Guardianship and Protective Proceedings Jurisdiction Act,tx,HB 2260,81,Truitt,Introduced, Adult Guardianship and Protective Proceedings Jurisdiction Act,wa,HB 1261,2009,Goodman,Enacted,http://lawfilesext.leg.wa.gov/biennium/2009-10/Pdf/Bills/House%20Passed%20Legislature/1261-S.PL.pdf Adult Guardianship and Protective Proceedings Jurisdiction Act,md,SB 122,2009,Kelley,Introduced, Adult Guardianship and Protective Proceedings Jurisdiction Act,il,HB 759,96th,Ryg,Enacted, Adult Guardianship and Protective Proceedings Jurisdiction Act,ky,HB 
86,98th,Marzian,Introduced,https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&uact=8&ved=0CB4QFjAAahUKEwi939z7k_TGAhVVMYgKHasODFs&url=http%3A%2F%2Fwww.lrc.ky.gov%2Frecord%2F10rs%2FHB86%2Fbill.doc&ei=jGOyVb1p1eKgBKudsNgF&usg=AFQjCNHcJ0pa2RJG5jxy0CHbLYpUEAADEA&sig2=E7yw8zaghDIujs7uzqPhHQ Adult Guardianship and Protective Proceedings Jurisdiction Act,ut,SB 122,2008,Hillyard,Enacted,http://le.utah.gov/~2008/bills/static/SB0122.html Adult Guardianship and Protective Proceedings Jurisdiction Act,mo,HB 2105,2008,Cooper,Introduced,http://house.mo.gov/billtracking/bills081/billpdf/intro/HB2105I.PDF Anatomical Gift Act (2006),ct,SB 250,2010,,Enacted,http://www.cga.ct.gov/2010/TOB/S/2010SB-00250-R00-SB.htm Anatomical Gift Act (2006),ky,SB 4,2010,Williams,Enacted,http://www.lrc.ky.gov/record/10RS/SB4/bill.doc Anatomical Gift Act (2006),md,HB 1451,2010,"Barve, Dumais",Introduced,http://mlis.state.md.us/2010rs/bills/hb/hb1451f.pdf Anatomical Gift Act (2006),il,HB 2339,98th,Davis,Enacted, Anatomical Gift Act (2006),pa,HB 2700,2009-2010,Petrarca,Introduced, Anatomical Gift Act (2006),pa,SB750,2011-2012,Greenleaf/Petrarca,Introduced, Anatomical Gift Act (2006),ma,S 1098,187th,Fargo,Introduced, Anatomical Gift Act (2006),md,SB 756,2011,Kelley,Enacted, Anatomical Gift Act (2006),pa,SB 180,2015-2016,Greenleaf,Introduced, Anatomical Gift Act (2006),tx,HB 2027,81,Zerwas,Enacted, Anatomical Gift Act (2006),ct,HB 6677,2009,,Introduced,http://www.cga.ct.gov/2009/FC/2009HB-06677-R000964-FC.htm Anatomical Gift Act (2006),fl,SB 766,2009,,Introduced,http://static-lobbytools.s3.amazonaws.com/bills/2009/pdf/0766ER.pdf Anatomical Gift Act (2006),oh,HB 529,2009,Wachtmann,Enacted,http://archives.legislature.state.oh.us/analysis.cfm?ID=127_HB_529&ACT=As%20Enrolled&hf=analyses127/08-hb529-127.htm Anatomical Gift Act (2006),il,HB 1349,96th,Davis,Introduced, Anatomical Gift Act (2006),ak,SB 181,2007,McGuire,Introduced,http://www.legis.state.ak.us/PDF/25/Bills/SB0181A.PDF Anatomical Gift Act (2006),az,SB 1099,2007,Allen,Enacted,http://www.azleg.gov/legtext/48leg/1r/bills/sb1099h.htm Anatomical Gift Act (2006),id,SB 1017,2007,,Enacted,http://legislature.idaho.gov/legislation/2007/S1017.html Anatomical Gift Act (2006),ne,LB 1036,2010,Council,Enacted,http://www.nebraskalegislature.gov/FloorDocs/101/PDF/Final/LB1036.pdf Anatomical Gift Act (2006),nh,HB 1430,2010,Foose,Enacted,http://www.nhliberty.org/bills/view/2010/HB1430 Anatomical Gift Act (2006),vt,S 205,2009-2010,Ayer,Enacted, Anatomical Gift Act (2006),pa,SB 850,2013-2014,Greenleaf,Introduced, Anatomical Gift Act (2006),wa,HB 1637,2008,Hinkle,Enacted,http://lawfilesext.leg.wa.gov/biennium/2007-08/Pdf/Bills/Session%20Laws/House/1637-S.SL.pdf Anatomical Gift Act (2006),wi,SB 310,2008,Risser,Enacted,http://docs.legis.wisconsin.gov/2007/related/proposals/sb310 Anatomical Gift Act (2006),ca,AB 1689,2008,Lieber,Enacted,http://leginfo.legislature.ca.gov/faces/billNavClient.xhtml?bill_id=200720080AB1689 Anatomical Gift Act (2006),ga,SB 405,2008,Balfour,Enacted,http://www.legis.ga.gov/Legislation/20072008/84683.pdf Anatomical Gift Act (2006),hi,HB 2139,2008,,Enacted,http://www.capitol.hawaii.gov/session2008/bills/HB2139_CD1_.pdf Anatomical Gift Act (2006),ak,HB 196,2008,,Enacted,http://www.legis.state.ak.us/PDF/25/Bills/HB0196Z.PDF Anatomical Gift Act (2006),va,HB 2684,2007,Frederick,Enacted,http://lis.virginia.gov/cgi-bin/legp604.exe?071+ful+CHAP0092 Anatomical Gift Act (2006),wa,HB 
1637,2007,Hinkle,Introduced,http://lawfilesext.leg.wa.gov/biennium/2007-08/Pdf/Bills/House%20Passed%20Legislature/1637-S.PL.pdf Anatomical Gift Act (2006),mo,SB 1139,2008,Dempsey,Enacted,http://www.senate.mo.gov/08info/pdf-bill/tat/SB1139.pdf Anatomical Gift Act (2006),ms,HB 1075,2008,Holland,Enacted,https://www.donatelifems.org/HB1075SG.pdf Anatomical Gift Act (2006),me,LD 1505,2008,Hobbins,Enacted,http://www.mainelegislature.org/legis/bills/bills_123rd/billpdfs/SP052801.pdf Anatomical Gift Act (2006),mi,HB 4940,2008,Condino,Enacted,http://www.legislature.mi.gov/documents/2007-2008/publicact/pdf/2008-PA-0039.pdf Anatomical Gift Act (2006),ny,SB 5154,2008,Hannon,Introduced,http://assembly.state.ny.us/leg/?default_fld=&bn=S05154&term=2007&Text=Y Anatomical Gift Act (2006),nj,SB 754,2008,Codey,Enacted,http://www.njleg.state.nj.us/2008/Bills/PL08/50_.PDF Anatomical Gift Act (2006),ia,SF 509,2007,,Enacted,http://coolice.legis.iowa.gov/legislation/82ndGA/enrolled/sf509.html Anatomical Gift Act (2006),mn,SF 883,2007,Scheid,Enacted,https://www.revisor.mn.gov/bills/text.php?number=SF883&version=0&session_year=2007&session_number=0 Anatomical Gift Act (2006),mo,HB723,2007,Stevenson/Koster,Introduced,http://www.house.mo.gov/billtracking/bills071/billpdf/intro/HB0723I.PDF Anatomical Gift Act (2006),nj,AB 3909,2007,Conaway,Introduced,http://www.njleg.state.nj.us/2006/Bills/A4000/3909_I1.HTM Anatomical Gift Act (2006),nm,HB 1276,2007,Cervantes,Enacted,http://www.nmlegis.gov/Sessions/07%20Regular/final/HB1276.pdf Anatomical Gift Act (2006),nc,HB 1372,2007,Folwell,Enacted,http://www.ncga.state.nc.us/Sessions/2007/Bills/House/PDF/H1372v6.pdf Anatomical Gift Act (2006),nd,SB 2163,2007,Kilzer,Enacted,http://legis.nd.gov/assembly/60-2007/bill-text/HAUN0400.pdf Anatomical Gift Act (2006),or,HB 3092,2007,,Enacted,https://olis.leg.state.or.us/liz/2007R1/Downloads/MeasureDocument/HB3092 Anatomical Gift Act (2006),tn,HB 1557,2007,Shepard,Enacted,http://state.tn.us/sos/acts/105/pub/pc0428.pdf Anatomical Gift Act (2006),tx,SB 1597,2007,Janek,Introduced,http://www.legis.state.tx.us/tlodocs/80R/billtext/html/SB01597E.htm Anatomical Gift Act (2006),ut,SB 92,2007,Hillyard,Enacted,http://le.utah.gov/~2007/bills/static/SB0092.html -------------------------------------------------------------------------------- /data/model_legislation_urls/clean_urls.txt: -------------------------------------------------------------------------------- 1 | http://publicpolicyalliance.org/legislation/model-alac-bill/ 2 | http://www.svia.org/Relations/Legislation.aspx 3 | http://www.mpp.org/legislation/model-medical-marijuana-bill.html?referrer=https://www.google.com/ 4 | http://www.bornfreeusa.org/downloads/pdf/Model_State_Legislation_Pets_Shops.pdf 5 | http://www.bornfreeusa.org/downloads/pdf/Model_City_Ordinance_Pet_Shops.pdf 6 | http://www.bornfreeusa.org/downloads/pdf/Model_City_Ordinance_Swap_Meets.pdf 7 | http://www.bornfreeusa.org/downloads/pdf/Model_Unweaned_Bird_Legislation.pdf 8 | http://www.bornfreeusa.org/downloads/pdf/Model_Unweaned_Bird_Legislation.pdf 9 | http://www.bornfreeusa.org/downloads/pdf/Model_State_Legislation_Traveling_Circus.pdf 10 | http://www.bornfreeusa.org/downloads/pdf/Model_City_Legislation_Traveling_Circus.pdf 11 | http://www.bornfreeusa.org/downloads/pdf/Model_State_Legislation_for_Display_of_Exotics.pdf 12 | http://www.bornfreeusa.org/downloads/pdf/Model_City_Legislation_for_Display_of_Exotics. 
13 | http://www.bornfreeusa.org/downloads/pdf/Model_City_Ordinance_for_trapping.pdf 14 | http://www.bornfreeusa.org/downloads/pdf/Model_Wildlife_Feeding_Legislation.pdf 15 | http://images2.americanprogress.org/campus/web/ALEC_voter_ID_model_legislation.pdf 16 | http://www.publiccharters.org/wp-content/uploads/2014/01/ModelLaw_P7-wCVR_20110402T222341.pdf 17 | http://apps.americanbar.org/tax/groups/salt/ABA1_OFFICIAL_MODEL_ACT_REPORT_AS_ADOPTED_8-7-06.pdf 18 | http://www.justice.gov/olp/model-state-provisions-pimping-pandering-and-prostitution 19 | http://www.innocenceproject.org/free-innocent/improve-the-law/PreservationofBiologicalEvidencePrescriptiveModelBill2015.pdf 20 | http://www.innocenceproject.org/free-innocent/improve-the-law/PreservationofBiologicalEvidenceTaskForceKeyedtoNISTModelBillRB.pdf 21 | http://www.innocenceproject.org/free-innocent/improve-the-law/EWIDPrescriptiveModelBill2015.pdf 22 | http://www.innocenceproject.org/free-innocent/improve-the-law/EWIDStandardTaskForceModelBill2015.pdf 23 | http://www.innocenceproject.org/free-innocent/improve-the-law/RecordingofCustodialInterrogationsModelBill2015.pdf 24 | http://www.innocenceproject.org/free-innocent/improve-the-law/CompensationModelBill2015.pdf 25 | http://www.innocenceproject.org/free-innocent/improve-the-law/JailhouseInformantModelBill2015.pdf 26 | http://www.innocenceproject.org/free-innocent/improve-the-law/AccesstoPostConvictionDNATestingModelBill2015.pdf 27 | http://www.goodjobsfirst.org/sites/default/files/docs/pdf/GJF_model_disclosure.pdf 28 | http://www.goodjobsfirst.org/sites/default/files/docs/pdf/GJF_model_clawbacks.pdf 29 | http://www.goodjobsfirst.org/sites/default/files/docs/pdf/GJF_model_jobstandards.pdf 30 | http://www.goodjobsfirst.org/sites/default/files/docs/pdf/GJF_model_UEDB.pdf 31 | http://www.goodjobsfirst.org/sites/default/files/docs/pdf/GJF_model_complete.pdf 32 | http://www.pcia.com/images/Advocacy_Docs/PCIA_Model_State_Siting_Legislation_2012.pdf 33 | http://nepc.colorado.edu/files/NEPC-VirtSchool-2-LB-Bathon.pdf 34 | http://www.shallnot.org/legislation 35 | http://www.khi.org/assets/uploads/news/13359/goldwater_institute_right_to_try_model_legislation.pdf 36 | http://www.icmec.org/en_X1/pdf/Child_Pornography_Model_Law_English_7th_Edition_2012.pdf 37 | http://www.federationforchildren.org/wp-content/uploads/2013/12/Parental-Choice-Scholarship-Program-Act-MTE.pdf 38 | http://www.federationforchildren.org/wp-content/uploads/2013/12/Parental-Choice-Scholarship-Program-Act-UE.pdf 39 | http://www.federationforchildren.org/wp-content/uploads/2013/12/Parental-Choice-Scholarship-Program-Act-UEMTSA.pdf 40 | http://www.federationforchildren.org/wp-content/uploads/2013/12/Great-Schools-Tax-Credit-Program-Act.pdf 41 | http://www.federationforchildren.org/wp-content/uploads/2013/12/Education-Savings-Account-Act.pdf 42 | http://www.federationforchildren.org/wp-content/uploads/2013/12/Military-Family-Scholarship-Program-Act.pdf 43 | http://www.federationforchildren.org/wp-content/uploads/2013/12/Foster-Child-Scholarship-Program-Act.pdf 44 | http://greatlakescenter.org/docs/Policy_Briefs/Research-Based-Options/02-Trujillo_Turnarounds-LB.pdf 45 | http://www.academicfreedompetition.com/freedom.php 46 | http://www.dpcare.org/#!dpcc-model-legislation/c14ob 47 | http://toxicsinpackaging.org/model-legislation/model/ 48 | http://www.adl.org/assets/pdf/combating-hate/Hate-Crimes-Law.pdf 49 | http://www.davidyerushalmilaw.com/CLE-Course-on-Draft-Uniform-Act--American-Laws-for-American-Courts-b25-p0.html%22 50 | 
https://www.aclu.org/model-act-regulating-use-wearable-body-cameras-law-enforcement 51 | http://www.ncsl.org/documents/standcomm/sccomfc/point_of_sale_model_bill2010.pdf 52 | http://object.cato.org/pdfs/model-tax-credit-legislation-schaeffer-cato.pdf 53 | http://inspectorsgeneral.org/files/2011/01/IG-Model-Legislation.pdf 54 | http://www.inacol.org/wp-content/uploads/2015/02/Principles-For-Model-Legislation-2012.pdf 55 | http://www.emacweb.org/index.php/mutualaidresources/intrastate-mutual-aid/modellegislation 56 | http://aldf.org/downloads/ALDF_Model_Laws_v15_0.pdf 57 | http://www.nationalpartnership.org/research-library/work-family/psd/model-paid-sick-and-safe-days-legislation.pdf 58 | http://www.nationalpartnership.org/research-library/work-family/psd/section-by-section-analysis-model-legislation.pdf 59 | http://www.nationalpartnership.org/research-library/work-family/psd/fact-sheet-model-legislation-main-points.pdf 60 | https://www.aapa.org/WorkArea/DownloadAsset.aspx?id=548 61 | http://www.indianasenaterepublicans.com/clientuploads/directory/publications/Sen%20David%20Long%20Article%20V%20Packet-Online.pdf 62 | https://www.mackinac.org/21341http://www.naso.org/Resources/Legislation/ModelLegislation.aspx 63 | http://www.federationforchildren.org/wp-content/uploads/2013/12/Great-Schools-Tax-Credit-Program-Act.pdf?e3490a 64 | http://www.federationforchildren.org/wp-content/uploads/2013/12/Parental-Choice-Scholarship-Program-Act-UEMTSA.pdf?e3490a 65 | http://www.federationforchildren.org/wp-content/uploads/2013/12/Parental-Choice-Scholarship-Program-Act-UE.pdf?e3490a 66 | http://www.federationforchildren.org/wp-content/uploads/2013/12/Parental-Choice-Scholarship-Program-Act-MTE.pdf?e3490a 67 | http://www.federationforchildren.org/wp-content/uploads/2013/12/Great-Schools-Tax-Credit-Program-Act.pdf?e3490a 68 | http://autisticadvocacy.org/wp-content/uploads/2014/03/OrganTransplantationModelLegislation.pdf 69 | https://s3.amazonaws.com/peacelegislation/PEACE+Act.pdf 70 | http://www.peaceonourstreets.com/peace 71 | http://www.splc.org/article/2000/07/student-press-law-center-model-legislation-to-protect-student-free-expression-rights 72 | http://www.splc.org/article/1998/09/student-press-law-center-model-guidelines-for-high-school-student-media 73 | http://www.splc.org/article/2009/02/splc-college-student-media-model-guidelines 74 | http://www.peaceonourstreets.com/hemp 75 | http://www.safeaccessnow.org/model_legislation 76 | http://www.constitutionproject.org/pdf/FINAL%20Model%20Legislation.PDF 77 | http://web.archive.org/web/20080725012036/http://innocenceproject.org/docs/Preservation_Evidence_Prescriptive_08.pdf 78 | http://web.archive.org/web/20080705090003/http://www.innocenceproject.org/docs/Preservation_Pilot_08.pdf 79 | http://web.archive.org/web/20080705180144/http://innocenceproject.org/docs/Preservation_Task_Force_08.pdf 80 | http://web.archive.org/web/20080807124850/http://www.innocenceproject.org/docs/Eye_ID_Prescriptive_08.pdf 81 | http://web.archive.org/web/20080705085403/http://www.innocenceproject.org/docs/Eyewitness_ID_Written_Policies_08.pdf 82 | http://web.archive.org/web/20080705085547/http://www.innocenceproject.org/docs/Eyewitness_ID_Task_Force_08.pdf 83 | http://web.archive.org/web/20111014124824/http://www.innocenceproject.org/docs/Recording_Custodial_Interrogations_08.pdf 84 | http://web.archive.org/web/20080725013155/http://innocenceproject.org/docs/CJR_Commission_08.pdf 85 | 
http://web.archive.org/web/20080705090212/http://www.innocenceproject.org/docs/Compensation08.pdf 86 | http://www.inta.org/Advocacy/Pages/WorldCustomsOrganizationModelLegislation.aspx 87 | https://www.ij.org/images/pdf_folder/legislative/business-reg-relief-act.pdf 88 | https://www.ij.org/images/pdf_folder/legislative/model-reporting-law.pdf 89 | https://www.ij.org/images/pdf_folder/legislative/business-reg-act.pdf 90 | https://www.ij.org/images/pdf_folder/legislative/anti-slapp-model.pdf 91 | https://www.ij.org/images/pdf_folder/legislative/model-ed-legislation.pdf 92 | https://www.ij.org/images/pdf_folder/legislative/ijmodeleconlib.pdf 93 | https://www.ij.org/images/pdf_folder/legislative/ijmodelforfeiturelaw.pdf 94 | http://www.nclc.org/images/pdf/arbitration/model-state-arb-act-2015.pdf 95 | http://www.nclc.org/images/pdf/debt_collection/model_family_financial_protection_act.pdf 96 | http://www.nclc.org/images/pdf/legislation/model_laws/state-model-law-2011.pdf 97 | http://www.nclc.org/images/pdf/foreclosure_mortgage/mediation/model-judicial.pdf 98 | http://www.gunlaws.com/ConstitutionalCarry.htm 99 | http://www.gunlaws.com/GFZ/GFZ-BillReview.htm 100 | http://www.gunlaws.com/HighSchoolMarksmanship.htm 101 | http://www.gunlaws.com/lostcry.htm 102 | http://www.gunlaws.com/PropertyInVehicleLaw.htm 103 | http://www.gunlaws.com/DefensiveDisplay.htm 104 | http://www.gunlaws.com/MontanaMadeGuns.htm 105 | http://www.gunlaws.com/BIDSvNICS.htm 106 | http://www.gunlaws.com/sunshin.htm 107 | http://www.gunlaws.com/911-Limited-Immunity.htm 108 | http://www.gunlaws.com/EnumeratedPowersAct.htm 109 | http://ncra.files.cms-plus.com/GovernmentRelations/FINAL%20Third-Party%20Contracting%20Model%20Legislation.pdf 110 | https://www.heartland.org/policy-documents/model-bill-parent-trigger 111 | http://www.glsen.org/sites/default/files/GLSEN%20state%20model%20legislation.pdf 112 | http://www.frc.org/onepagers/model-legislation-divorce-reform-for-families-with-children 113 | http://www.lac.org/toolkits/sealing/Model%20Expungement%20Statute.pdf 114 | http://www.hopeafterrapeconception.org/model-legislation.html 115 | https://algaonline.org/DocumentCenter/View/11 116 | http://www.nelp.org/content/uploads/2015/04/NELP-Model-Legislation-Work-Sharing.pdf 117 | http://www.flushthetpp.org/tpp-free-zone-model-legislation/ 118 | https://www.proenglish.org/official-english/legislation/model-legislation.html 119 | http://www.nascla.org/nascla-model-legislation 120 | https://d3n8a8pro7vhmx.cloudfront.net/offnow/pages/224/attachments/original/1416085011/4th-Amendment-Protection-Act.pdf 121 | https://d3n8a8pro7vhmx.cloudfront.net/offnow/pages/224/attachments/original/1409604203/Electronic-Data-Privacy-Act.pdf?1409604203 122 | https://d3n8a8pro7vhmx.cloudfront.net/offnow/pages/224/attachments/original/1409603024/Freedom_from_Drone_Surveillance_Act.pdf?1409603024 123 | https://d3n8a8pro7vhmx.cloudfront.net/offnow/pages/224/attachments/original/1409604293/Freedom-from-Location-Surveillance-Act.pdf?1409604293 124 | https://d3n8a8pro7vhmx.cloudfront.net/offnow/pages/224/attachments/original/1409604293/Freedom-from-Location-Surveillance-Act.pdf?1409604293 125 | https://d3n8a8pro7vhmx.cloudfront.net/offnow/pages/224/attachments/original/1409603013/CHOICE_Act_of_2015.pdf?1409603013 126 | http://www.cchr.org/download-material/model-legislation.html 127 | http://www.cchr.org/sites/default/files/Electroshock_Model_Legislation.pdf 128 | http://www.cchr.org/sites/default/files/Deadly_Restraint_Model_Legislation.pdf 129 | 
http://www.cchr.org/sites/default/files/Involuntary_Commitment_Model_Legislation.pdf 130 | http://www.cchr.org/sites/default/files/Rape_Model_Legislation.pdf 131 | http://www.licenseportability.org/assets/pdf/Interstate-Medical-Licensure-Compact-(FINAL).pdf 132 | http://assets.aarp.org/rgcenter/consume/d17158_dwell.pdf 133 | http://ticas.org/sites/default/files/legacy/files/File/Model%20Tax%20Credit.pdf 134 | https://represent.us/wp-content/uploads/2015/04/AACA-Full-Provisions.pdf 135 | http://www.naiaonline.org/uploads/Main_Upload_Directory/NaiaPetFriendlyGuide.pdf 136 | http://www.naiaonline.org/pdfs/NAIA_%20Model_Animal_Control_Law_Final.pdf 137 | http://www.naiaonline.org/uploads/Main_Upload_Directory/naiaShelterReportingAct2014.pdf 138 | http://www.naiaonline.org/pdfs/ShelterImportAndReportingModel.pdf 139 | http://www.naiaonline.org/uploads/Main_Upload_Directory/DogPurchaserProtectionModelLaw.pdf 140 | http://www.naiaonline.org/articles/article/naia-resolution-supporting-animal-welfare#sthash.X3spi6jw.dpbs 141 | http://www.naic.org/documents/committees_b_exchanges_adopted_health_benefit_exchanges.pdf 142 | http://netchoice.org/wp-content/uploads/maiyn-online-safety-model-legislation-v2-6.pdf 143 | http://wdr.doleta.gov/directives/attach/UIPL/UIPL_20_12_att7.pdf 144 | http://wdr.doleta.gov/directives/attach/UIPL/UIPL_20_12_att8.pdf 145 | http://wdr.doleta.gov/directives/attach/UIPL/UIPL_20_12_att5.pdf 146 | http://wdr.doleta.gov/directives/attach/UIPL/UIPL_20_12_att1.pdf 147 | http://wdr.doleta.gov/directives/attach/UIPL/UIPL_20_12_att1.pdf 148 | http://wdr.doleta.gov/directives/attach/UIPL/UIPL_20_12_acc.pdf 149 | http://www.pwia.org/assets/cabinets/Cabinet474/PWIAModelLegislation.pdf 150 | http://www.nhcsl.org/2007-1.php 151 | http://legis.state.nm.us/Sessions/07%20Regular/final/SB0600.pdf 152 | http://www.nhcsl.org/model/HighSchoolOutcomesImprovementAct.pdf 153 | http://www.nhcsl.org/model/HighSchoolOutcomesImprovementAct.pdf 154 | http://gallery.mailchimp.com/c1a51befb8159efb3bbd1f2620f9e1/files/VRA_ModelResolution.pdf 155 | http://www.aamva.org/uploadedFiles/MainSite/Content/SolutionsBestPractices/BestPracticesModelLegislation(1)/ModelLaw_ExcessivelyRaisedVehicles.pdf 156 | http://www.aamva.org/uploadedFiles/MainSite/Content/SolutionsBestPractices/BestPracticesModelLegislation(1)/ModelLaw_LowSpeedVehicles.pdf 157 | http://www.aamva.org/uploadedFiles/MainSite/Content/SolutionsBestPractices/BestPracticesModelLegislation(1)/ModelLaw_ElectronicLienTitle.pdf 158 | http://www.aamva.org/uploadedFiles/MainSite/Content/SolutionsBestPractices/BestPracticesModelLegislation(1)/ModelLaw_AdvertisingonInternet.pdf 159 | http://www.aamva.org/uploadedFiles/MainSite/Content/SolutionsBestPractices/BestPracticesModelLegislation(1)/ModelLawAdvertisingOverInternetJurisdictionCourts.pdf 160 | http://www.aamva.org/uploadedFiles/MainSite/Content/SolutionsBestPractices/BestPracticesModelLegislation(1)/ModelLaw_UnfairTradePractices.pdf 161 | http://www.aamva.org/uploadedFiles/MainSite/Content/SolutionsBestPractices/BestPracticesModelLegislation(1)/ModelLaw_ProspectivePurchaserInquiryFeeForNMVTIS.pdf 162 | http://www.aamva.org/uploadedFiles/MainSite/Content/SolutionsBestPractices/BestPracticesModelLegislation(1)/ModelLaw_MotorCarrierStaggeredRegistration.pdf 163 | http://www.aamva.org/uploadedFiles/MainSite/Content/SolutionsBestPractices/BestPracticesModelLegislation(1)/ModelLaw_DisclosurePersnlInfoInMVRecords.pdf 164 | 
http://www.aamva.org/uploadedFiles/MainSite/Content/SolutionsBestPractices/BestPracticesModelLegislation(1)/ModelLaw_ContractingForMotorVehicleRegistration.pdf 165 | http://www.aamva.org/uploadedFiles/MainSite/Content/SolutionsBestPractices/BestPracticesModelLegislation(1)/ModelLaw_SalvageVehicleTitling.pdf 166 | http://www.aamva.org/uploadedFiles/MainSite/Content/SolutionsBestPractices/BestPracticesModelLegislation(1)/ModelLaw_MVInspectionsByIndependentContractors.pdf 167 | http://www.aamva.org/uploadedFiles/MainSite/Content/SolutionsBestPractices/BestPracticesModelLegislation(1)/ModelLaw_PrivatizationThirdParties.pdf 168 | 169 | -------------------------------------------------------------------------------- /db/elasticsearch.yml: -------------------------------------------------------------------------------- 1 | ##################### Elasticsearch Configuration Example ##################### 2 | 3 | # This file contains an overview of various configuration settings, 4 | # targeted at operations staff. Application developers should 5 | # consult the guide at . 6 | # 7 | # The installation procedure is covered at 8 | # . 9 | # 10 | # Elasticsearch comes with reasonable defaults for most settings, 11 | # so you can try it out without bothering with configuration. 12 | # 13 | # Most of the time, these defaults are just fine for running a production 14 | # cluster. If you're fine-tuning your cluster, or wondering about the 15 | # effect of certain configuration option, please _do ask_ on the 16 | # mailing list or IRC channel [http://elasticsearch.org/community]. 17 | 18 | # Any element in the configuration can be replaced with environment variables 19 | # by placing them in ${...} notation. For example: 20 | # 21 | #node.rack: ${RACK_ENV_VAR} 22 | 23 | # For information on supported formats and syntax for the config file, see 24 | # 25 | 26 | 27 | ################################### Cluster ################################### 28 | 29 | # Cluster name identifies your cluster for auto-discovery. If you're running 30 | # multiple clusters on the same network, make sure you're using unique names. 31 | # 32 | cluster.name: sunlightcluster 33 | 34 | 35 | #################################### Node ##################################### 36 | 37 | # Node names are generated dynamically on startup, so you're relieved 38 | # from configuring them manually. You can tie this node to a specific name: 39 | # 40 | node.name: "sunlight_0" 41 | 42 | # Every node can be configured to allow or deny being eligible as the master, 43 | # and to allow or deny to store the data. 44 | # 45 | # Allow this node to be eligible as a master node (enabled by default): 46 | # 47 | #node.master: true 48 | # 49 | # Allow this node to store data (enabled by default): 50 | # 51 | #node.data: true 52 | 53 | # You can exploit these settings to design advanced cluster topologies. 54 | # 55 | # 1. You want this node to never become a master node, only to hold data. 56 | # This will be the "workhorse" of your cluster. 57 | # 58 | #node.master: false 59 | #node.data: true 60 | # 61 | # 2. You want this node to only serve as a master: to not store any data and 62 | # to have free resources. This will be the "coordinator" of your cluster. 63 | # 64 | #node.master: true 65 | #node.data: false 66 | # 67 | # 3. You want this node to be neither master nor data node, but 68 | # to act as a "search load balancer" (fetching data from nodes, 69 | # aggregating results, etc.) 
70 | # 71 | #node.master: false 72 | #node.data: false 73 | 74 | # Use the Cluster Health API [http://localhost:9200/_cluster/health], the 75 | # Node Info API [http://localhost:9200/_nodes] or GUI tools 76 | # such as , 77 | # , 78 | # and 79 | # to inspect the cluster state. 80 | 81 | # A node can have generic attributes associated with it, which can later be used 82 | # for customized shard allocation filtering, or allocation awareness. An attribute 83 | # is a simple key value pair, similar to node.key: value, here is an example: 84 | # 85 | #node.rack: rack314 86 | 87 | # By default, multiple nodes are allowed to start from the same installation location 88 | # to disable it, set the following: 89 | #node.max_local_storage_nodes: 1 90 | 91 | 92 | #################################### Index #################################### 93 | 94 | # You can set a number of options (such as shard/replica options, mapping 95 | # or analyzer definitions, translog settings, ...) for indices globally, 96 | # in this file. 97 | # 98 | # Note, that it makes more sense to configure index settings specifically for 99 | # a certain index, either when creating it or by using the index templates API. 100 | # 101 | # See and 102 | # 103 | # for more information. 104 | 105 | # Set the number of shards (splits) of an index (5 by default): 106 | # 107 | #index.number_of_shards: 5 108 | 109 | # Set the number of replicas (additional copies) of an index (1 by default): 110 | # 111 | #index.number_of_replicas: 1 112 | 113 | # Note, that for development on a local machine, with small indices, it usually 114 | # makes sense to "disable" the distributed features: 115 | # 116 | #index.number_of_shards: 1 117 | #index.number_of_replicas: 0 118 | 119 | # These settings directly affect the performance of index and search operations 120 | # in your cluster. Assuming you have enough machines to hold shards and 121 | # replicas, the rule of thumb is: 122 | # 123 | # 1. Having more *shards* enhances the _indexing_ performance and allows to 124 | # _distribute_ a big index across machines. 125 | # 2. Having more *replicas* enhances the _search_ performance and improves the 126 | # cluster _availability_. 127 | # 128 | # The "number_of_shards" is a one-time setting for an index. 129 | # 130 | # The "number_of_replicas" can be increased or decreased anytime, 131 | # by using the Index Update Settings API. 132 | # 133 | # Elasticsearch takes care about load balancing, relocating, gathering the 134 | # results from nodes, etc. Experiment with different settings to fine-tune 135 | # your setup. 136 | 137 | # Use the Index Status API () to inspect 138 | # the index status. 139 | 140 | 141 | #################################### Paths #################################### 142 | 143 | # Path to directory containing configuration (this file and logging.yml): 144 | # 145 | #path.conf: /path/to/conf 146 | 147 | # Path to directory where to store index data allocated for this node. 148 | # 149 | path.data: /mnt/elasticsearch/ 150 | # 151 | # Can optionally include more than one location, causing data to be striped across 152 | # the locations (a la RAID 0) on a file level, favouring locations with most free 153 | # space on creation. 
For example: 154 | # 155 | #path.data: /path/to/data1,/path/to/data2 156 | 157 | # Path to temporary files: 158 | # 159 | #path.work: /path/to/work 160 | 161 | # Path to log files: 162 | # 163 | #path.logs: /mnt/data/sunlight/es_logs 164 | 165 | # Path to where plugins are installed: 166 | # 167 | #path.plugins: /path/to/plugins 168 | 169 | 170 | #################################### Plugin ################################### 171 | 172 | # If a plugin listed here is not installed for current node, the node will not start. 173 | # 174 | #plugin.mandatory: mapper-attachments,lang-groovy 175 | 176 | 177 | ################################### Memory #################################### 178 | 179 | # Elasticsearch performs poorly when JVM starts swapping: you should ensure that 180 | # it _never_ swaps. 181 | # 182 | # Set this property to true to lock the memory: 183 | # 184 | bootstrap.mlockall: true 185 | 186 | # Make sure that the ES_MIN_MEM and ES_MAX_MEM environment variables are set 187 | # to the same value, and that the machine has enough memory to allocate 188 | # for Elasticsearch, leaving enough memory for the operating system itself. 189 | # 190 | # You should also make sure that the Elasticsearch process is allowed to lock 191 | # the memory, eg. by using `ulimit -l unlimited`. 192 | 193 | 194 | ############################## Network And HTTP ############################### 195 | 196 | # Elasticsearch, by default, binds itself to the 0.0.0.0 address, and listens 197 | # on port [9200-9300] for HTTP traffic and on port [9300-9400] for node-to-node 198 | # communication. (the range means that if the port is busy, it will automatically 199 | # try the next port). 200 | 201 | # Set the bind address specifically (IPv4 or IPv6): 202 | # 203 | #network.bind_host: 192.168.0.1 204 | 205 | # Set the address other nodes will use to communicate with this node. If not 206 | # set, it is automatically derived. It must point to an actual IP address. 207 | # 208 | #network.publish_host: 192.168.0.1 209 | 210 | # Set both 'bind_host' and 'publish_host': 211 | # 212 | #network.host: 192.168.0.1 213 | 214 | # Set a custom port for the node to node communication (9300 by default): 215 | # 216 | #transport.tcp.port: 9300 217 | 218 | # Enable compression for all communication between nodes (disabled by default): 219 | # 220 | #transport.tcp.compress: true 221 | 222 | # Set a custom port to listen for HTTP traffic: 223 | # 224 | #http.port: 9200 225 | 226 | # Set a custom allowed content length: 227 | # 228 | #http.max_content_length: 100mb 229 | 230 | # Disable HTTP completely: 231 | # 232 | #http.enabled: false 233 | 234 | 235 | ################################### Gateway ################################### 236 | 237 | # The gateway allows for persisting the cluster state between full cluster 238 | # restarts. Every change to the state (such as adding an index) will be stored 239 | # in the gateway, and when the cluster starts up for the first time, 240 | # it will read its state from the gateway. 241 | 242 | # There are several types of gateway implementations. For more information, see 243 | # . 244 | 245 | # The default gateway type is the "local" gateway (recommended): 246 | # 247 | #gateway.type: local 248 | 249 | # Settings below control how and when to start the initial recovery process on 250 | # a full cluster restart (to reuse as much local data as possible when using shared 251 | # gateway). 
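# --- Added note (not part of the stock Elasticsearch example file) ---
# The shard/replica guidance above is what db/state_bill_index.json in this repository
# encodes: one shard, zero replicas, and a 2-4 word shingle analyzer. A minimal sketch of
# creating that index and attaching the bill mapping over the HTTP API, assuming the
# cluster is reachable at $ELASTICSEARCH_IP:9200 and the index is called "state_bills"
# (the index name here is an assumption for illustration, not taken from this file):
#
#   curl -XPUT "http://$ELASTICSEARCH_IP:9200/state_bills" \
#        --data-binary @db/state_bill_index.json
#   curl -XPUT "http://$ELASTICSEARCH_IP:9200/state_bills/_mapping/bill_document" \
#        --data-binary @db/state_bill_mapping.json
#
# With number_of_shards: 1 and number_of_replicas: 0 the index trades availability for
# simplicity on a single research node; replicas can be added later through the Index
# Update Settings API mentioned above.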
252 | 253 | # Allow recovery process after N nodes in a cluster are up: 254 | # 255 | #gateway.recover_after_nodes: 1 256 | 257 | # Set the timeout to initiate the recovery process, once the N nodes 258 | # from previous setting are up (accepts time value): 259 | # 260 | #gateway.recover_after_time: 5m 261 | 262 | # Set how many nodes are expected in this cluster. Once these N nodes 263 | # are up (and recover_after_nodes is met), begin recovery process immediately 264 | # (without waiting for recover_after_time to expire): 265 | # 266 | #gateway.expected_nodes: 2 267 | 268 | 269 | ############################# Recovery Throttling ############################# 270 | 271 | # These settings allow to control the process of shards allocation between 272 | # nodes during initial recovery, replica allocation, rebalancing, 273 | # or when adding and removing nodes. 274 | 275 | # Set the number of concurrent recoveries happening on a node: 276 | # 277 | # 1. During the initial recovery 278 | # 279 | #cluster.routing.allocation.node_initial_primaries_recoveries: 4 280 | # 281 | # 2. During adding/removing nodes, rebalancing, etc 282 | # 283 | #cluster.routing.allocation.node_concurrent_recoveries: 2 284 | 285 | # Set to throttle throughput when recovering (eg. 100mb, by default 20mb): 286 | # 287 | #indices.recovery.max_bytes_per_sec: 20mb 288 | 289 | # Set to limit the number of open concurrent streams when 290 | # recovering a shard from a peer: 291 | # 292 | #indices.recovery.concurrent_streams: 5 293 | 294 | 295 | ################################## Discovery ################################## 296 | 297 | # Discovery infrastructure ensures nodes can be found within a cluster 298 | # and master node is elected. Multicast discovery is the default. 299 | 300 | # Set to ensure a node sees N other master eligible nodes to be considered 301 | # operational within the cluster. This should be set to a quorum/majority of 302 | # the master-eligible nodes in the cluster. 303 | # 304 | #discovery.zen.minimum_master_nodes: 1 305 | 306 | # Set the time to wait for ping responses from other nodes when discovering. 307 | # Set this option to a higher value on a slow or congested network 308 | # to minimize discovery failures: 309 | # 310 | #discovery.zen.ping.timeout: 3s 311 | 312 | # For more information, see 313 | # 314 | 315 | # Unicast discovery allows to explicitly control which nodes will be used 316 | # to discover the cluster. It can be used when multicast is not present, 317 | # or to restrict the cluster communication-wise. 318 | # 319 | # 1. Disable multicast discovery (enabled by default): 320 | # 321 | #discovery.zen.ping.multicast.enabled: false 322 | # 323 | # 2. Configure an initial list of master nodes in the cluster 324 | # to perform discovery when new nodes (master or data) are started: 325 | # 326 | #discovery.zen.ping.unicast.hosts: ["host1", "host2:port"] 327 | 328 | # EC2 discovery allows to use AWS EC2 API in order to perform discovery. 329 | # 330 | # You have to install the cloud-aws plugin for enabling the EC2 discovery. 331 | # 332 | # For more information, see 333 | # 334 | # 335 | # See 336 | # for a step-by-step tutorial. 337 | 338 | # GCE discovery allows to use Google Compute Engine API in order to perform discovery. 339 | # 340 | # You have to install the cloud-gce plugin for enabling the GCE discovery. 341 | # 342 | # For more information, see . 343 | 344 | # Azure discovery allows to use Azure API in order to perform discovery. 
345 | # 346 | # You have to install the cloud-azure plugin for enabling the Azure discovery. 347 | # 348 | # For more information, see . 349 | 350 | ################################## Slow Log ################################## 351 | 352 | # Shard level query and fetch threshold logging. 353 | 354 | #index.search.slowlog.threshold.query.warn: 10s 355 | #index.search.slowlog.threshold.query.info: 5s 356 | #index.search.slowlog.threshold.query.debug: 2s 357 | #index.search.slowlog.threshold.query.trace: 500ms 358 | 359 | #index.search.slowlog.threshold.fetch.warn: 1s 360 | #index.search.slowlog.threshold.fetch.info: 800ms 361 | #index.search.slowlog.threshold.fetch.debug: 500ms 362 | #index.search.slowlog.threshold.fetch.trace: 200ms 363 | 364 | #index.indexing.slowlog.threshold.index.warn: 10s 365 | #index.indexing.slowlog.threshold.index.info: 5s 366 | #index.indexing.slowlog.threshold.index.debug: 2s 367 | #index.indexing.slowlog.threshold.index.trace: 500ms 368 | 369 | ################################## GC Logging ################################ 370 | 371 | #monitor.jvm.gc.young.warn: 1000ms 372 | #monitor.jvm.gc.young.info: 700ms 373 | #monitor.jvm.gc.young.debug: 400ms 374 | 375 | #monitor.jvm.gc.old.warn: 10s 376 | #monitor.jvm.gc.old.info: 5s 377 | #monitor.jvm.gc.old.debug: 2s 378 | 379 | ################################## Security ################################ 380 | 381 | # Uncomment if you want to enable JSONP as a valid return transport on the 382 | # http server. With this enabled, it may pose a security risk, so disabling 383 | # it unless you need it is recommended (it is disabled by default). 384 | # 385 | #http.jsonp.enable: true 386 | -------------------------------------------------------------------------------- /db/evaluation_mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "bill_document": { 3 | "dynamic": "false", 4 | "properties": { 5 | "bill_document_last": { 6 | "type": "string", 7 | "term_vector": "yes", 8 | "index": "analyzed", 9 | "_size": { 10 | "enabled": true, 11 | "store": true 12 | }, 13 | "fields": { 14 | "shingles": { 15 | "type": "string", 16 | "analyzer": "my_shingle_analyzer" 17 | } } 18 | }, 19 | "match": { 20 | "type": "string", 21 | "index": "not_analyzed" 22 | }, 23 | "state": { 24 | "type": "string", 25 | "index": "not_analyzed" 26 | }, 27 | "unique_id": { 28 | "type": "string", 29 | "index": "not_analyzed" 30 | } 31 | } 32 | } 33 | } -------------------------------------------------------------------------------- /db/state_bill_index.json: -------------------------------------------------------------------------------- 1 | { 2 | "settings": { 3 | "index": { 4 | "number_of_shards": 1, 5 | "number_of_replicas": 0 6 | }, 7 | "analysis": { 8 | "filter": { 9 | "my_shingle_filter": { 10 | "type": "shingle", 11 | "min_shingle_size": 2, 12 | "max_shingle_size": 4, 13 | "output_unigrams": "false" 14 | } 15 | }, 16 | "analyzer": { 17 | "my_shingle_analyzer": { 18 | "type": "custom", 19 | "tokenizer": "standard", 20 | "filter": [ 21 | "lowercase", 22 | "my_shingle_filter" 23 | ] 24 | } 25 | } 26 | } 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /db/state_bill_mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "bill_document": { 3 | "dynamic": "false", 4 | "properties": { 5 | "actions": { 6 | "properties": { 7 | "action": { 8 | "type": "string", 9 | "index": "analyzed" 10 | }, 11 | "actor": { 
12 | "type": "string", 13 | "index": "analyzed" 14 | }, 15 | "date": { 16 | "type": "date", 17 | "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd" 18 | }, 19 | "type": { 20 | "type": "string", 21 | "index": "not_analyzed" 22 | } 23 | } 24 | }, 25 | "bill_document_first": { 26 | "type": "string", 27 | "term_vector": "yes", 28 | "index": "analyzed", 29 | "_size": { 30 | "enabled": true, 31 | "store": true 32 | }, 33 | "fields": { 34 | "shingles": { 35 | "type": "string", 36 | "analyzer": "my_shingle_analyzer" 37 | } } 38 | }, 39 | "bill_document_last": { 40 | "type": "string", 41 | "term_vector": "yes", 42 | "index": "analyzed", 43 | "_size": { 44 | "enabled": true, 45 | "store": true 46 | }, 47 | "fields": { 48 | "shingles": { 49 | "type": "string", 50 | "analyzer": "my_shingle_analyzer" 51 | } } 52 | }, 53 | "bill_id": { 54 | "type": "string", 55 | "index": "not_analyzed" 56 | }, 57 | "bill_title": { 58 | "type": "string", 59 | "term_vector": "yes", 60 | "index": "analyzed", 61 | "_size": { 62 | "enabled": true, 63 | "store": true 64 | }, 65 | "fields": { 66 | "shingles": { 67 | "type": "string", 68 | "analyzer": "my_shingle_analyzer" 69 | } } 70 | }, 71 | "bill_type": { 72 | "type": "string", 73 | "index": "not_analyzed" 74 | }, 75 | "chamber": { 76 | "type": "string", 77 | "index": "not_analyzed" 78 | }, 79 | "summary": { 80 | "type": "string", 81 | "term_vector": "yes", 82 | "index": "analyzed", 83 | "_size": { 84 | "enabled": true, 85 | "store": true 86 | }, 87 | "fields": { 88 | "shingles": { 89 | "type": "string", 90 | "analyzer": "my_shingle_analyzer" 91 | } } 92 | }, 93 | "short_title": { 94 | "type": "string", 95 | "term_vector": "yes", 96 | "index": "analyzed", 97 | "_size": { 98 | "enabled": true, 99 | "store": true 100 | }, 101 | "fields": { 102 | "shingles": { 103 | "type": "string", 104 | "analyzer": "my_shingle_analyzer" 105 | } } 106 | }, 107 | "date_created": { 108 | "type": "date", 109 | "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd" 110 | }, 111 | "date_updated": { 112 | "type": "date", 113 | "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd" 114 | }, 115 | "session": { 116 | "type": "string", 117 | "index": "analyzed" 118 | }, 119 | "state": { 120 | "type": "string", 121 | "index": "not_analyzed" 122 | }, 123 | "sunlight_id": { 124 | "type": "string", 125 | "index": "not_analyzed" 126 | }, 127 | "unique_id": { 128 | "type": "string", 129 | "index": "not_analyzed" 130 | } 131 | } 132 | } 133 | } -------------------------------------------------------------------------------- /html/bootstrap3/css/.Rhistory: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/policy_diffusion/61d5885e809ab8d92db18d79f7a82b6461cf4343/html/bootstrap3/css/.Rhistory -------------------------------------------------------------------------------- /html/bootstrap3/css/custom.css: -------------------------------------------------------------------------------- 1 | mark { 2 | background-color: yellow; 3 | color: black; 4 | } 5 | 6 | .span3 { 7 | height: 250px !important; 8 | overflow: scroll; 9 | } 10 | 11 | .span5 { 12 | height: 800px !important; 13 | overflow: scroll; 14 | } 15 | td { 16 | padding: 5px; 17 | } 18 | 19 | tr:hover { background: #efedf5; } 20 | td a { 21 | display: block; 22 | padding: 16px; 23 | } 24 | -------------------------------------------------------------------------------- /html/bootstrap3/fonts/glyphicons-halflings-regular.eot: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dssg/policy_diffusion/61d5885e809ab8d92db18d79f7a82b6461cf4343/html/bootstrap3/fonts/glyphicons-halflings-regular.eot -------------------------------------------------------------------------------- /html/bootstrap3/fonts/glyphicons-halflings-regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/policy_diffusion/61d5885e809ab8d92db18d79f7a82b6461cf4343/html/bootstrap3/fonts/glyphicons-halflings-regular.ttf -------------------------------------------------------------------------------- /html/bootstrap3/fonts/glyphicons-halflings-regular.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/policy_diffusion/61d5885e809ab8d92db18d79f7a82b6461cf4343/html/bootstrap3/fonts/glyphicons-halflings-regular.woff -------------------------------------------------------------------------------- /html/bootstrap3/fonts/glyphicons-halflings-regular.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/policy_diffusion/61d5885e809ab8d92db18d79f7a82b6461cf4343/html/bootstrap3/fonts/glyphicons-halflings-regular.woff2 -------------------------------------------------------------------------------- /html/bootstrap3/js/npm.js: -------------------------------------------------------------------------------- 1 | // This file is autogenerated via the `commonjs` Grunt task. You can require() this file in a CommonJS environment. 2 | require('../../js/transition.js') 3 | require('../../js/alert.js') 4 | require('../../js/button.js') 5 | require('../../js/carousel.js') 6 | require('../../js/collapse.js') 7 | require('../../js/dropdown.js') 8 | require('../../js/modal.js') 9 | require('../../js/tooltip.js') 10 | require('../../js/popover.js') 11 | require('../../js/scrollspy.js') 12 | require('../../js/tab.js') 13 | require('../../js/affix.js') -------------------------------------------------------------------------------- /html/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | {% if page %}Page {{ page }}{% else %}Intro{% endif %} - Raccoon User Study Part 2 10 | 11 | 12 | 13 | 14 | 15 | 53 | 54 | 55 | Sunlight Demo 56 | 57 | 58 |
59 | 60 | 64 |
65 | 66 | 67 |
68 | 69 |
70 |
This is a test to see how we can compare query {{ query_string }}
71 |
This is a test to see how we can compare query and result text
72 |
73 | 74 | 75 | -------------------------------------------------------------------------------- /html/templates/searchdemo.html.jinja: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | Search Demo 11 | 12 | 13 | 14 | 15 | 21 | 22 | 23 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 75 | 76 | 77 | Demo 78 | 79 | 80 | 81 |
82 | 83 | 84 | 85 |
86 |

Legislative Influence Detector — LID

87 |

Tracing Policy Ideas across Lobbyists and State Legislatures

88 |

http://dssg.uchicago.edu

89 |
90 | 91 |
92 | 93 |

Step 1: Choose the type of documents you'd like to search

94 |
95 | Search state bills
96 | Search model legislation
97 | Search constitutions worldwide
98 |
99 |

Step 2: Copy and paste the text of the legislative bill, model legislation, or constitution you'd like to find matches for

100 | 101 |
102 |

Step 3: Press the submit button

103 | 104 |
105 | 106 |
107 |
108 | 109 | 110 | 111 | 112 | 113 | 114 |

Step 4: Scroll through the potential matches and click on the ones you'd like to investigate. Green indicates a likely match; red indicates an unlikely match.
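{# Added note, not part of the original template: the green/red colouring described in
   Step 4 appears to be driven by the results loop further down, where each row's first
   field (row.0) is treated as a match score and compared against a fixed cutoff
   (the template's own "row.0 >= 50" test shows green, otherwise red). Reading row.0 as
   an alignment score and 50 as the likely-match threshold is inferred from the template
   itself, not from project documentation. #}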

115 |
116 | 117 |
118 | 119 | 120 | {% for row in results_to_show %} 121 | 122 | 123 | 124 | {% if row.0 >= 50 %} 125 | 128 | {% elif row.0 <50 %} 129 | 132 | {% endif %} 133 | 135 | 136 | {% endfor %} 137 |
State/Org | Session/Type | Number
{{ row.1 }}{{ row.2 }}{{ row.3 }}
138 | 139 |
140 |
141 |


142 | 143 | 144 |
145 |

146 |         
147 | 148 | 149 |
150 |

151 | 	
152 | 153 | 154 | 155 | 156 | -------------------------------------------------------------------------------- /lid/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/policy_diffusion/61d5885e809ab8d92db18d79f7a82b6461cf4343/lid/.DS_Store -------------------------------------------------------------------------------- /lid/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/policy_diffusion/61d5885e809ab8d92db18d79f7a82b6461cf4343/lid/__init__.py -------------------------------------------------------------------------------- /lid/alignment_classifier.py: -------------------------------------------------------------------------------- 1 | from alignment_evaluation import alignment_features 2 | import numpy as np 3 | import nltk 4 | from sklearn import linear_model 5 | from sklearn.metrics import confusion_matrix, accuracy_score 6 | import csv 7 | import json 8 | import argparse 9 | import os 10 | from database import ElasticConnection 11 | import random 12 | import codecs 13 | from sklearn.feature_extraction.text import TfidfVectorizer 14 | from utils.general_utils import alignment_tokenizer 15 | from utils.general_utils import UnicodeWriter,UnicodeReader 16 | import pickle 17 | from sklearn.metrics import jaccard_similarity_score,classification_report 18 | from sklearn.linear_model import LogisticRegression 19 | from sklearn.cross_validation import KFold 20 | 21 | 22 | 23 | '''Contains code for both the features and model of the alignment classifier used to classify alignments as 24 | substantive or boiler-plate''' 25 | 26 | def compute_tfidf_scores(alignment_data_path,pickle_file_name): 27 | count = 0 28 | alignment_docs = [] 29 | for line in alignment_data_path: 30 | print count 31 | count += 1 32 | if count >= 100000: 33 | break 34 | json_obj = json.loads(line.strip()) 35 | 36 | 37 | if "alignment_results" not in json_obj: 38 | continue 39 | 40 | for alignment_result in json_obj['alignment_results']: 41 | alignment_doc = [] 42 | for section_alignment in alignment_result['alignments']: 43 | alignment_doc.extend([x for x in section_alignment['left'] if x not in ['-',None]]) 44 | alignment_doc.extend([x for x in section_alignment['right'] if x not in ['-',None]]) 45 | alignment_docs.append( " ".join(alignment_doc)) 46 | 47 | 48 | vectorizer = TfidfVectorizer() 49 | X = vectorizer.fit_transform(alignment_docs) 50 | idf = vectorizer.idf_ 51 | idf = vectorizer._tfidf.idf_ 52 | term_scores = zip(vectorizer.get_feature_names(), idf) 53 | term_dict = dict(term_scores) 54 | pickle_file = codecs.open(pickle_file_name,mode = "wb") 55 | pickle.dump(term_dict,pickle_file) 56 | return 57 | 58 | 59 | 60 | def construct_training_set(alignments_file,out_file_name,score_threshold = None): 61 | """ 62 | Args: 63 | alignments_file (file) -- file containing sample alignments 64 | 65 | out_file_name (string) -- name of training data file to write to 66 | 67 | Returns: 68 | None 69 | """ 70 | ec = ElasticConnection(host= "54.203.12.145") 71 | 72 | training_examples = [] 73 | for i,x in enumerate(alignments_file): 74 | print i 75 | json_obj = json.loads(x.strip()) 76 | 77 | if "alignment_results" not in json_obj.keys(): 78 | continue 79 | 80 | left_doc_id = json_obj['query_document_id'] 81 | left_bill_title = ec.get_bill_by_id(left_doc_id)['bill_title'] 82 | 83 | left_doc = json_obj['query_document'] 84 | left_doc = reduce(lambda x,y:x+y,left_doc) 85 | 
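# Added clarifying comment (not in the original source): reduce(lambda x,y: x+y, left_doc)
# above concatenates the query document's list of text sections into a single string, so
# left_doc_length below is the whitespace-token count of the whole query bill; that length
# is then written into every training example generated from this query document.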
86 | left_doc_length = len(left_doc.split()) 87 | 88 | for i,alignment_doc in enumerate(json_obj['alignment_results']): 89 | 90 | right_doc_id = alignment_doc['document_id'] 91 | right_bill_title = ec.get_bill_by_id(right_doc_id)['bill_title'] 92 | 93 | for alignment in alignment_doc['alignments']: 94 | 95 | left = alignment['left'] 96 | right = alignment['right'] 97 | left_start = alignment['left_start'] 98 | right_start = alignment['right_start'] 99 | left_end = alignment['left_end'] 100 | right_end = alignment['right_end'] 101 | score = alignment['score'] 102 | if score < score_threshold: 103 | continue 104 | training_examples.append([left_doc_id,right_doc_id,left_doc_length,left_start,right_start,left_end, 105 | right_end,score,left_bill_title,right_bill_title, 106 | " ".join(left)," ".join(right)]) 107 | 108 | 109 | random.shuffle(training_examples) 110 | 111 | header = ["left_doc_id","right_doc_id","left_doc_length","left_start","right_start","left_end", 112 | "right_end","score","left_bill_title","right_bill_title","left","right"] 113 | 114 | 115 | k = 500 116 | with codecs.open(out_file_name, 'wb') as output_file: 117 | writer = UnicodeWriter(output_file, header) 118 | writer.writerow(header) 119 | for l in training_examples[0:k]: 120 | l = [unicode(x) for x in l] 121 | writer.writerow(l) 122 | 123 | 124 | return 125 | 126 | 127 | 128 | def evaluate_alignment_classifier(): 129 | """runs k-fold cross validation on training set to evaluate classifier""" 130 | 131 | training_examples = [] 132 | for line in csv.reader(self._training_file): 133 | if ( len(line[10].split()) != len(line[11].split()) ) or line[12] not in ["0","1"]: 134 | continue 135 | if len(line[10]) <= 1 or len(line[11]) < 1: 136 | continue 137 | training_examples.append({"left":line[10].split(),"right":line[11].split(),"label":int(line[12])}) 138 | 139 | 140 | 141 | 142 | random.shuffle(training_examples) 143 | X,y = self.compute_feature_matrix(training_examples) 144 | 145 | self._model.fit(X_train,y_train) 146 | X,y = np.array(X),np.array(y) 147 | kf = KFold(n=len(X), n_folds=4, shuffle=False, 148 | random_state=None) 149 | for train_index, test_index in kf: 150 | X_train, X_test = X[train_index], X[test_index] 151 | y_train, y_test = y[train_index], y[test_index] 152 | self._model.fit(X_train,y_train) 153 | y_pred = self._model.predict(X_test) 154 | print classification_report(y_test, y_pred) 155 | 156 | self._model.fit(X,y) 157 | feat_names = ['length','num_gaps_l','num_gaps_l','num_gaps_r','num_mismatches','num_matches','avg_gap_length_l', 158 | 'avg_gap_length_r','avg_consec_match_length','jaccard_score','idf_mean','idf_medien'] 159 | 160 | for x in zip(feat_names,self._model.coef_.tolist()): 161 | print x 162 | 163 | 164 | 165 | 166 | class AlignmentClassifier(): 167 | """Classifier that labels alignments as either substantive (1) or boilerplate (0)""" 168 | 169 | 170 | def __init__(self,idf_file_path): 171 | """Keyword Args: 172 | 173 | idf_file_path: file path of the table that stores idf scores of the words 174 | 175 | """ 176 | self._idf_score_dict = pickle.load(open(idf_file_path)) 177 | self._training_file = codecs.open(os.environ['POLICY_DIFFUSION']+\ 178 | "/data/training_data_alignment_classifier_bigger.csv",mode = "rU") 179 | self._model = LogisticRegression(penalty='l1') 180 | 181 | def compute_feature_matrix(self,training_examples): 182 | """Keywords Args: 183 | 184 | training_examples: list of dicts, where each dict contains alignment: "left":left_text,"right":right_text 185 | and "label":label of 
alignment (1) substantive and boilerplate (0) 186 | 187 | Returns: 188 | 189 | X: feature matrix 190 | y: labels 191 | 192 | """ 193 | 194 | X = [] 195 | y = [] 196 | for training_example in training_examples: 197 | left = training_example['left'] 198 | right = training_example['right'] 199 | label = training_example['label'] 200 | meta_features = self._compute_alignment_meta_features(left,right) 201 | idf_features = self._compute_idf_score(left,right) 202 | features = meta_features + idf_features 203 | X.append(features) 204 | y.append(label) 205 | 206 | return X,y 207 | 208 | def train_model(self): 209 | """ Trains model using training examples in self._training_file and returns a trained model self._model 210 | 211 | Keywords Args: 212 | None 213 | 214 | Returns: 215 | None 216 | 217 | """ 218 | 219 | 220 | training_examples = [] 221 | for line in csv.reader(self._training_file): 222 | if ( len(line[10].split()) != len(line[11].split()) ) or line[12] not in ["0","1"]: 223 | continue 224 | if len(line[10]) <= 1 or len(line[11]) < 1: 225 | continue 226 | training_examples.append({"left":line[10].split(),"right":line[11].split(),"label":int(line[12])}) 227 | 228 | X,y = self.compute_feature_matrix(training_examples) 229 | 230 | self._model.fit(X_train,y_train) 231 | X,y = np.array(X),np.array(y) 232 | kf = KFold(n=len(X), n_folds=4, shuffle=False, 233 | random_state=None) 234 | for train_index, test_index in kf: 235 | X_train, X_test = X[train_index], X[test_index] 236 | y_train, y_test = y[train_index], y[test_index] 237 | self._model.fit(X_train,y_train) 238 | y_pred = self._model.predict(X_test) 239 | print classification_report(y_test, y_pred) 240 | 241 | self._model.fit(X,y) 242 | feat_names = ['length','num_gaps_l','num_gaps_l','num_gaps_r','num_mismatches','num_matches','avg_gap_length_l', 243 | 'avg_gap_length_r','avg_consec_match_length','jaccard_score','idf_mean','idf_medien'] 244 | 245 | for x in zip(feat_names,self._model.coef_.tolist()): 246 | print x 247 | 248 | 249 | def predict(self,alignment_example): 250 | """predicts label for alignment example 251 | 252 | 253 | Keyword Args: 254 | 255 | alignment_example: alignment [left,right] that needs to be labeled 256 | 257 | 258 | """ 259 | pass 260 | 261 | 262 | 263 | def _compute_alignment_meta_features(self,left, right): 264 | ''' 265 | This function takes as input two alignments and produce features of these 266 | ''' 267 | #alignment features 268 | features = {} 269 | features['length'] = len(left) 270 | features['num_gaps_l'] = 0 271 | features['num_gaps_r'] = 0 272 | features['num_mismatches'] = 0 273 | features['num_matches'] = 0 274 | features['avg_gap_length_l'] = [] 275 | features['avg_gap_length_r'] = [] 276 | features['avg_consec_match_length'] = [] 277 | features['jaccard_score'] = jaccard_similarity_score(left,right) 278 | 279 | #helper variables 280 | prev_gap_l = False 281 | prev_gap_r = False 282 | prev_match = False 283 | for i in range(len(left)): 284 | # print 'i: ', i 285 | # print 'features: ', features 286 | if left[i] == '-': 287 | features['num_gaps_l'] += 1 288 | if not prev_gap_l: 289 | features['avg_gap_length_l'].append(1) 290 | prev_gap_l = True 291 | else: 292 | features['avg_gap_length_l'][-1] += 1 293 | else: 294 | prev_gap_l = False 295 | if right[i] == '-': 296 | features['num_gaps_r'] += 1 297 | if not prev_gap_r: 298 | features['avg_gap_length_r'].append(1) 299 | prev_gap_r = True 300 | else: 301 | features['avg_gap_length_r'][-1] += 1 302 | else: 303 | prev_gap_r = False 304 | if left[i] != 
'-' and right[i] != '-': 305 | if left[i] != right[i]: 306 | features['num_mismatches'] += 1 307 | elif left[i] == right[i]: 308 | features['num_matches'] += 1 309 | if not prev_match: 310 | features['avg_consec_match_length'].append(1) 311 | prev_match = True 312 | else: 313 | features['avg_consec_match_length'][-1] += 1 314 | if left[i] != right[i]: 315 | prev_match = False 316 | 317 | if features['avg_gap_length_l'] != []: 318 | features['avg_gap_length_l'] = np.mean(features['avg_gap_length_l']) 319 | else: 320 | features['avg_gap_length_l'] = 0 321 | if features['avg_gap_length_r'] != []: 322 | features['avg_gap_length_r'] = np.mean(features['avg_gap_length_r']) 323 | else: 324 | features['avg_gap_length_r'] = 0 325 | if features['avg_consec_match_length'] != []: 326 | features['avg_consec_match_length'] = np.mean(features['avg_consec_match_length']) 327 | else: 328 | features['avg_consec_match_length'] = 0 329 | 330 | features = sorted(features.items(),key = lambda x:x[0],reverse= False) 331 | return [x[1] for x in features] 332 | 333 | 334 | def _compute_idf_score(self,left,right): 335 | idf_scores = [] 336 | 337 | for w in left: 338 | if w in self._idf_score_dict: 339 | idf_scores.append(self._idf_score_dict[w]) 340 | 341 | for w in right: 342 | if w in self._idf_score_dict: 343 | idf_scores.append(self._idf_score_dict[w]) 344 | 345 | 346 | return [np.mean(idf_scores),np.median(idf_scores)] 347 | 348 | 349 | 350 | 351 | def main(): 352 | parser = argparse.ArgumentParser(description='Classifier to label aligned text as "substantive" ') 353 | parser.add_argument('command', 354 | help='command to run, options are: construct_training_set,train_model,evaluate_model') 355 | parser.add_argument('--alignment_samples_doc', dest='alignment_samples', 356 | help="file path to the alignment samples used to construct training set ") 357 | 358 | args = parser.parse_args() 359 | 360 | if args.command == "construct_training_set": 361 | construct_training_set(open(args.alignment_samples), 362 | os.environ['POLICY_DIFFUSION']+"/data/classifier/alignments_training_set_high_scores.csv",50) 363 | elif args.command == "compute_tfidf_scores": 364 | alignments_file = codecs.open("/mnt/data/sunlight/dssg/alignment_results/bill_to_bill_alignments.txt", 365 | encoding = "utf8") 366 | out_file = "/mnt/data/sunlight/dssg/features/alignment_tfidf_scores.p" 367 | compute_tfidf_scores(alignments_file,out_file) 368 | 369 | 370 | elif args.command == "train_model": 371 | pass 372 | elif args.command == "evaluate_model": 373 | pass 374 | else: 375 | print args 376 | print "command not recognized, please enter construct_training_set,train_model,evaluate_model" 377 | 378 | 379 | if __name__ == "__main__": 380 | main() 381 | 382 | 383 | -------------------------------------------------------------------------------- /lid/config.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | #Global path variables 4 | DATA_PATH = "/mnt/elasticsearch/dssg" 5 | 6 | -------------------------------------------------------------------------------- /lid/etl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/policy_diffusion/61d5885e809ab8d92db18d79f7a82b6461cf4343/lid/etl/__init__.py -------------------------------------------------------------------------------- /lid/etl/extractors.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from bs4 
import BeautifulSoup 3 | from state_bill_extractors import bill_text_extractor 4 | import os 5 | import codecs 6 | import argparse 7 | import re 8 | import base64 9 | import json 10 | from tika import parser as tp 11 | import traceback 12 | import logging 13 | from config import DATA_PATH 14 | 15 | try: 16 | from os import scandir, walk 17 | except ImportError: 18 | from scandir import scandir, walk 19 | 20 | 21 | 22 | def get_first_and_last_bill_documents(json_obj): 23 | state_code = json_obj['state'] 24 | 25 | bill_documents = [] 26 | for v in range(2): 27 | 28 | try: 29 | bill_document = base64.b64decode(json_obj['versions'][v]['bill_document']) 30 | except: 31 | bill_documents.append(None) 32 | continue 33 | 34 | try: 35 | mimetype = json_obj['versions'][v]['mimetype'] 36 | 37 | except KeyError: 38 | mimetype = json_obj['versions'][v]['+mimetype'] 39 | 40 | url = json_obj['versions'][v]['url'] 41 | # try to extract text with bill-specific extractor 42 | bill_text = bill_text_extractor(state_code, bill_document, mimetype, url) 43 | 44 | # if fails then try tika extractor as backup 45 | if not bill_text or len(bill_text) < 1000: 46 | 47 | try: 48 | bill_text = tp.from_buffer(bill_document)['content'] 49 | #if extraction results in short text, most likely a fail 50 | if len(bill_text) < 1000: 51 | bill_text = None 52 | except Exception: 53 | bill_text = None 54 | 55 | 56 | bill_documents.append(bill_text) 57 | 58 | return bill_documents 59 | 60 | 61 | 62 | # extracts text from bill documents fetched from sunlight 63 | # and constructs new json obj with selected meta-data 64 | def extract_bill_document(bill_file_path): 65 | try: 66 | 67 | bill_dict = {} 68 | data_dict = json.loads(open(bill_file_path).read()) 69 | 70 | #test whether a document is a bill or resolution 71 | bill_text_count = [1 for x in data_dict['type'] if "bill" in x.lower()] 72 | good_bill_prefixes = ["A","AJ", "AJR","CACR","HB","S","HJR","ACA","HF","SF","HJ","SJ" 73 | "HJRCA","SJRCA","HSB","IP","LB","SB","SCA","SP"] 74 | if sum(bill_text_count) < 1 and data_dict['bill_id'].split()[0] not in good_bill_prefixes: 75 | return 76 | 77 | 78 | 79 | 80 | # extract first and last versions of bill document 81 | # and add to json dict 82 | bill_document_first, bill_document_last = get_first_and_last_bill_documents(data_dict) 83 | bill_dict['bill_document_first'] = bill_document_first 84 | bill_dict['bill_document_last'] = bill_document_last 85 | 86 | if bill_document_first == None or bill_document_last == None: 87 | logging.warning("failed to extract text for {0}".format(bill_file_path)) 88 | 89 | else: 90 | logging.info("successfully extracted text for {0}".format(bill_file_path)) 91 | 92 | # assign attributes that will be used 93 | bill_id = re.sub("\s+", "", data_dict['bill_id']) 94 | bill_dict['unique_id'] = "{0}_{1}_{2}".format(data_dict['state'], data_dict['session'], bill_id) 95 | bill_dict['bill_id'] = data_dict['bill_id'] 96 | bill_dict['date_updated'] = data_dict['updated_at'] 97 | bill_dict['session'] = data_dict['session'] 98 | bill_dict['sunlight_id'] = data_dict['id'] 99 | bill_dict['bill_title'] = data_dict['title'] 100 | bill_dict['bill_type'] = data_dict['type'] 101 | bill_dict['state'] = data_dict['state'] 102 | bill_dict['chamber'] = data_dict['chamber'] 103 | bill_dict['date_created'] = data_dict['created_at'] 104 | bill_dict['actions'] = data_dict['actions'] 105 | bill_dict['action_dates'] = data_dict['action_dates'] 106 | bill_dict['date_introduced'] = data_dict['action_dates']['first'] 107 | 
bill_dict['date_signed'] = data_dict['action_dates']['signed'] 108 | 109 | 110 | if "short_tite" in data_dict.keys(): 111 | bill_dict['short_title'] = data_dict['short_title'] 112 | elif "+short_title" in data_dict.keys(): 113 | bill_dict['short_title'] = data_dict['+short_title'] 114 | 115 | else: 116 | bill_dict['short_title'] = None 117 | 118 | if "summary" in data_dict.keys(): 119 | bill_dict['summary'] = data_dict['summary'] 120 | else: 121 | bill_dict['summary'] = None 122 | 123 | return bill_dict 124 | except (KeyboardInterrupt, SystemExit): 125 | raise 126 | except Exception as e: 127 | trace_message = re.sub("\n+", "\t", traceback.format_exc()) 128 | trace_message = re.sub("\s+", " ", trace_message) 129 | trace_message = "<<{0}>>".format(trace_message) 130 | m = "Failed to extract document for {0}: {1}".format(bill_file_path, trace_message) 131 | logging.error(m) 132 | 133 | if __name__ == "__main__": 134 | parser = argparse.ArgumentParser(description='Process some integers.') 135 | parser.add_argument('command', help='command to run, options are: build_index') 136 | parser.add_argument('--data_path', dest='data_path', help="file path of data to be indexed ") 137 | 138 | args = parser.parse_args() 139 | 140 | #extracts text from bill documents and populates a json file with a json_object per row 141 | if args.command == "extract_bills": 142 | #configure logging 143 | logging.getLogger('tp').setLevel(logging.ERROR) 144 | logging.getLogger('requests').setLevel(logging.ERROR) 145 | logging.basicConfig(filename=os.environ['POLICY_DIFFUSION'] + '/logs/state_bill_extractor.log', 146 | level=logging.DEBUG) 147 | 148 | bill_files = [] 149 | for dirname, dirnames, filenames in walk(args.data_path): 150 | for filename in filenames: 151 | bill_files.append(os.path.join(dirname, filename)) 152 | 153 | outFile = codecs.open("{0}/extracted_data/extracted_bills.json".format(DATA_PATH), 'w') 154 | for i, bill_file in enumerate(bill_files): 155 | bill_json_obj = extract_bill_document(bill_file) 156 | 157 | outFile.write("{0}\n".format(json.dumps(bill_json_obj))) 158 | 159 | outFile.close() 160 | 161 | 162 | 163 | ##extracts text from model legislation 164 | def extract_model_legislation(json_file, encoded): 165 | ''' 166 | Keyword Args: 167 | json_file: corresponds to json file with model legislation 168 | encoded: True/False if json file is b64 encoded 169 | 170 | returns: 171 | dictionary with url, date, and text of model legislation 172 | decription: 173 | extract text from model legislation 174 | ''' 175 | data = [] 176 | with open(json_file) as f: 177 | for line in f: 178 | data.append(json.loads(line)) 179 | 180 | model_legislation = {} 181 | for i in range(len(data)): 182 | model_legislation[i] = data[i] 183 | 184 | if encoded == True: 185 | for i in range(len(model_legislation)): 186 | try: 187 | ml = model_legislation[i]['source'] 188 | ml = base64.b64decode(ml) 189 | ml = tp.from_buffer(ml) 190 | model_legislation[i]['source'] = ml['content'] 191 | except AttributeError: 192 | model_legislation[i]['source'] = None 193 | return model_legislation 194 | 195 | else: 196 | return model_legislation 197 | 198 | 199 | 200 | -------------------------------------------------------------------------------- /lid/etl/load_constitutions_into_elasticsearch.py: -------------------------------------------------------------------------------- 1 | #!/bin/python 2 | 3 | import time 4 | import glob 5 | import json 6 | import requests 7 | from io import open 8 | from elasticsearch import Elasticsearch 9 | 10 
| files = glob.glob("*.txt") 11 | es = Elasticsearch([{'host': "54.244.236.175", 'port': 9200}]) 12 | 13 | for file in files: 14 | print file 15 | state_year = file.split(".")[0] 16 | state = state_year[:-5] 17 | year = int(state_year[-4:]) 18 | file_text = open(file, 'r', encoding='ISO-8859-1').read() 19 | json_object = { 20 | "document_type": "constitution", 21 | "state": state, 22 | "year": year, 23 | "constitution": file_text 24 | } 25 | 26 | es.index(index="constitutions", doc_type="constitution", id=state_year, body=json.dumps(json_object)) 27 | time.sleep(1) 28 | -------------------------------------------------------------------------------- /lid/etl/scrapers.py: -------------------------------------------------------------------------------- 1 | import json 2 | import codecs 3 | import base64 4 | import logging 5 | import re 6 | import os 7 | import sys 8 | import multiprocessing 9 | import utils 10 | import random 11 | import argparse 12 | import traceback 13 | import urllib2 14 | from config import DATA_PATH 15 | from bs4 import BeautifulSoup 16 | 17 | 18 | try: 19 | from os import scandir, walk 20 | except ImportError: 21 | from scandir import scandir, walk 22 | 23 | BILL_SCRAPER_LOG = os.environ['POLICY_DIFFUSION'] + '/logs/bill_scraper.log' 24 | 25 | 26 | # scrapes all bills from the input data path 27 | def scrape_all_bills(bill_data_path, num_workers): 28 | logging.basicConfig(filename=BILL_SCRAPER_LOG, level=logging.DEBUG) 29 | 30 | bill_file_paths = [] 31 | for dirname, dirnames, filenames in walk(bill_data_path): 32 | for filename in filenames: 33 | bill_file_paths.append(os.path.join(dirname, filename)) 34 | 35 | 36 | scrape_bill_document_from_sunlight(bill_file_paths[0]) 37 | 38 | random.shuffle(bill_file_paths) 39 | 40 | pool = multiprocessing.Pool(num_workers) 41 | 42 | print "fetch {0} urls from sunlight...".format(len(bill_file_paths)) 43 | pool.map(scrape_bill_document_from_sunlight, bill_file_paths) 44 | 45 | print "finished fetching urls..." 
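# Example invocation via main() below (the data path is a placeholder and
# should point at the directory of bulk Sunlight bill JSON files):
#
#   python scrapers.py scrape_bills_from_sunlight --data_path <bills_dir> --num_workers 10
#
# Each worker re-reads one bill JSON file, keeps only the first and last bill
# versions, fetches the raw documents from static.openstates.org, base64
# encodes them, and writes the enriched JSON under DATA_PATH/scraped_bills.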
46 | 47 | 48 | # open individual json file and scrape bill document, 49 | # from the s3 server provided by sunlight foundation 50 | def scrape_bill_document_from_sunlight(file_path): 51 | try: 52 | file_path = file_path.strip() 53 | 54 | #define path to write file 55 | out_file_path = file_path.split("/bills")[-1] 56 | out_file_path = re.sub("\s+", "_", out_file_path) 57 | out_dir_root_path = "{0}/scraped_bills".format(DATA_PATH) 58 | out_file_name = "{0}{1}.json".format(out_dir_root_path, out_file_path) 59 | 60 | bill_json = json.loads(codecs.open(file_path, encoding="utf8").read()) 61 | 62 | # filter versions to be only the first and last 63 | try: 64 | bill_json['versions'] = [bill_json['versions'][0], bill_json['versions'][-1]] 65 | except IndexError: 66 | return 67 | 68 | base_url = "{0}/{1}".format("http://static.openstates.org/documents", bill_json['state']) 69 | urls = ["{0}/{1}".format(base_url, x['doc_id']) for x in bill_json['versions']] 70 | source_urls = [x['url'] for x in bill_json['versions']] 71 | 72 | for i, url in enumerate(urls): 73 | 74 | bill_document = utils.fetch_url(url) 75 | 76 | #hash bill using base64 77 | if bill_document is not None: 78 | bill_document = base64.b64encode(bill_document) 79 | else: 80 | logging.error("file {0}, url {1}, version {2}, error: << {3} >>".format( 81 | file_path, url, i, "link error")) 82 | 83 | bill_json['versions'][i]['bill_document'] = bill_document 84 | 85 | if not os.path.exists(os.path.dirname(out_file_name)): 86 | os.makedirs(os.path.dirname(out_file_name)) 87 | with codecs.open(out_file_name, "w", encoding="utf8") as f: 88 | f.write(json.dumps(bill_json)) 89 | 90 | logging.info("successfully scraped bill: {0}".format(out_file_path)) 91 | 92 | except Exception as e: 93 | trace_message = re.sub("\n+", "\t", traceback.format_exc()) 94 | trace_message = re.sub("\s+", " ", trace_message) 95 | trace_message = "<<{0}>>".format(trace_message) 96 | m = "Failed to obtain documents for {0}: {1}".format(file_path, trace_message) 97 | logging.error(m) 98 | 99 | return 100 | 101 | 102 | # scrapes bill document from original source link 103 | # this is a backup if s3 doesn't work 104 | def scrape_bill_document_from_original_source(filePath): 105 | filePath = filePath.strip() 106 | 107 | outFilePath = "/".join(filePath.split("/")[7:]) 108 | outFilePath = re.sub("\s+", "_", outFilePath) 109 | outDirRootPath = "/mnt/data/sunlight/dssg/scraped_bills_new" 110 | outFileName = "{0}/{1}.json".format(outDirRootPath, outFilePath) 111 | 112 | billFile = codecs.open(filePath, encoding="utf8").read() 113 | billJson = json.loads(billFile) 114 | 115 | # filters documents that are resolutions 116 | bill_text_count = [1 for x in billJson['type'] if "bill" in x.lower()] 117 | if sum(bill_text_count) < 1: 118 | return 119 | 120 | # filter versions to be only the first and last 121 | billJson['versions'] = [billJson['versions'][0], billJson['versions'][-1]] 122 | 123 | urls = [x['url'] for x in billJson['versions']] 124 | 125 | for i, url in enumerate(urls): 126 | 127 | billDocument = utils.fetch_url(url) 128 | 129 | if billDocument is not None: 130 | billDocument = base64.b64encode(billDocument) 131 | else: 132 | logging.error("file {0}, url {1}, version {2}, error: << {3} >>".format(filePath, url, i, "link error")) 133 | 134 | billJson['versions'][i]['bill_document'] = billDocument 135 | 136 | if not os.path.exists(os.path.dirname(outFileName)): 137 | os.makedirs(os.path.dirname(outFileName)) 138 | with codecs.open(outFileName, "w", encoding="utf8") as f: 139 
| f.write(json.dumps(billJson)) 140 | 141 | logging.info("successfully scraped bill: {0}".format(outFilePath)) 142 | 143 | return 144 | 145 | 146 | # scrapes model legistlation from ALEC's official site 147 | # and the tracker website ALEC exposed 148 | def scrape_ALEC_model_legislation(): 149 | url = 'http://www.alec.org/model-legislation/' 150 | response = urllib2.urlopen(url).read() 151 | bs = BeautifulSoup(response, 'html5') 152 | 153 | # Get all links from website 154 | ALEClist = [] 155 | for link in bs.find_all('a'): 156 | if link.has_attr('href'): 157 | ALEClist.append(link.attrs['href']) 158 | 159 | # Filter list so that we have only the ones with model-legislation 160 | ALEClinks = [] 161 | i = 0 162 | for i in range(0, len(ALEClist)): 163 | if ALEClist[i][20:38] == "model-legislation/": 164 | ALEClinks.append(ALEClist[i]) 165 | i = i + 1 166 | 167 | # To get only unique links (get rid off duplicates) 168 | ALEClinks = set(ALEClinks) 169 | 170 | # Save to json file 171 | with open('{0}/data/model_legislation/alec_bills.json'.format(DATA_PATH, 'w')) as f: 172 | for line in ALEClinks: 173 | source = urllib2.urlopen(line).read() 174 | url = line 175 | date = 2015 176 | Jsonbill = bill_source_to_json(url, source, date) 177 | f.write("{0}\n".format(Jsonbill)) 178 | 179 | # Save old alec bills (from Center for the Media and Democracy) 180 | def scrape_alec_exposed_bills (): 181 | names = os.listdir('{0}/model_legislation/ALEC_exposed'.format(DATA_PATH)) 182 | with open('alec_old_bills.json', 'w') as f2: 183 | for name in names: 184 | try: 185 | text = tp.from_file(name) 186 | source = text['content'] 187 | except: 188 | source = None 189 | url = None 190 | date = '2010-2013' 191 | print name 192 | print source 193 | Jsonbill = bill_source_to_json_not_encoded(url, source, date) 194 | f2.write("{0}\n".format(Jsonbill)) 195 | 196 | 197 | def scrape_CSG_model_legislation(): 198 | url = 'http://www.csg.org/programs/policyprograms/SSL.aspx' 199 | doc = urllib2.urlopen(url).read() 200 | bs = BeautifulSoup(doc) 201 | 202 | links = [] 203 | for link in bs.find_all('a'): 204 | if link.has_attr('href'): 205 | candidate = link.attrs['href'] 206 | # links with pdf extension tend to be model bills 207 | if candidate[-4:] == ".pdf": 208 | links.append(candidate) 209 | 210 | # only keeps distinct links 211 | links2 = list(set(links)) 212 | 213 | badCount = 0 214 | goodCount = 0 215 | 216 | with open('csg_bills.json', 'w') as f: 217 | for line in links2: 218 | try: 219 | url_key = {} 220 | source = urllib2.urlopen(line).read() 221 | Jsonbill = bill_source_to_json(link, source, None) 222 | f.write("{0}\n".format(Jsonbill)) 223 | goodCount += 1 224 | except: 225 | badCount += 1 226 | print line 227 | 228 | print str(badCount) + " did not work" 229 | 230 | 231 | def scrape_ALICE_legislation(): 232 | path = "/mnt/data/sunlight/dssg/model_legislation/links_" 233 | lines = [] 234 | for i in [1, 2, 3]: 235 | filePath = path + str(i) + ".txt" 236 | with open(filePath) as f: 237 | lines.extend(f.read().splitlines()) 238 | 239 | text = ''.join(lines) 240 | bs = BeautifulSoup(text) 241 | 242 | links = [] 243 | for link in bs.find_all('a'): 244 | if link.has_attr('href'): 245 | links.append(link.attrs['href']) 246 | 247 | 248 | # grab pdfs from links 249 | billList = [] 250 | for url in links: 251 | doc = urllib2.urlopen(url).read() 252 | bs = BeautifulSoup(doc) 253 | 254 | for link in bs.find_all('a'): 255 | if link.has_attr('href'): 256 | candidate = link.attrs['href'] 257 | if candidate[-4:] == ".pdf": # links 
with pdf extension tend to be model bills 258 | billList.append("https://stateinnovation.org" + candidate) 259 | 260 | badCount = 0 261 | goodCount = 0 262 | with open('alice_bills.json', 'w') as f: 263 | for link in billList: 264 | # url_key = {} 265 | # source = urllib2.urlopen(link).read() 266 | # Jsonbill = bill_source_to_json(link, source, None) 267 | # f.write("{0}\n".format(Jsonbill)) 268 | try: 269 | source = urllib2.urlopen(link).read() 270 | Jsonbill = bill_source_to_json(link, source, None) 271 | f.write("{0}\n".format(Jsonbill)) 272 | goodCount += 1 273 | except: 274 | badCount += 1 275 | 276 | print str(badCount) + " did not work" 277 | 278 | def scrape_misc_legislation(): 279 | # Access list of clean urls 280 | with open('/mnt/data/sunlight/dssg/model_legislation/clean_urls.txt', 281 | 'r') as f: 282 | links = f.read().splitlines() 283 | 284 | badCount = 0 285 | goodCount = 0 286 | with open('misc_bills.json', 'w') as jsonfile: 287 | for link in links: 288 | try: 289 | source = urllib2.urlopen(link).read() 290 | Jsonbill = bill_source_to_json(link, source, None) 291 | jsonfile.write("{0}\n".format(Jsonbill)) 292 | goodCount += 1 293 | print goodCount 294 | except: 295 | badCount += 1 296 | 297 | print str(badCount) + " did not work" 298 | print str(goodCount) + " worked" 299 | 300 | 301 | 302 | def main(): 303 | 304 | parser = argparse.ArgumentParser(description='module that contains functions to scrape legislative data\ ' 305 | ' from sunlight foundation and various' 306 | 'lobbying organizations') 307 | parser.add_argument('command', help='command to run, options are: \n scrape_bills_from_sunlight') 308 | parser.add_argument('--data_path', dest='data_path', help="file path of data to be indexed ") 309 | parser.add_argument('--num_workers', dest='num_workers',default = 10, 310 | type = int, help="file path of data to be indexed ") 311 | 312 | args = parser.parse_args() 313 | 314 | if args.command == "scrape_bills_from_sunlight": 315 | scrape_all_bills(args.data_path,args.num_workers) 316 | elif args.command == "scrape_ALEC_legislation": 317 | scrape_ALEC_model_legislation() 318 | elif args.command == "scrape_CSG_legislation": 319 | scrape_CSG_model_legislation() 320 | elif args.command == "scrape_ALICE_legislation": 321 | scrape_ALICE_legislation() 322 | elif args.command =="scrape_misc_legislation": 323 | scrape_misc_legislation() 324 | else: 325 | print("command not recognized, use -h flag to see list available commands") 326 | 327 | 328 | 329 | if __name__ == "__main__": 330 | main() 331 | -------------------------------------------------------------------------------- /lid/evaluation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/policy_diffusion/61d5885e809ab8d92db18d79f7a82b6461cf4343/lid/evaluation/__init__.py -------------------------------------------------------------------------------- /lid/evaluation/bills_for_evaluation_set.py: -------------------------------------------------------------------------------- 1 | from elasticsearch import Elasticsearch 2 | import re 3 | import csv 4 | import urllib2 5 | import urllib 6 | from urllib import urlopen 7 | from tika import parser 8 | import pickle 9 | 10 | 11 | def create_bills(ls): 12 | ''' 13 | args: 14 | ls: list of lists of urls that correspond to matches 15 | 16 | returns: 17 | dictionary grouped by matches 18 | ''' 19 | k = 0 20 | bill_id = 0 21 | bills = {} 22 | bad_count = 0 23 | for urls in ls: 24 | for url,state in urls: 25 
| try: 26 | print "bill_id: " + str(bill_id) 27 | bills[bill_id] = {} 28 | doc = urllib2.urlopen(url).read() 29 | text = parser.from_buffer(doc)['content'] 30 | bills[bill_id]['url'] = url 31 | bills[bill_id]['text'] = text 32 | bills[bill_id]['match'] = k 33 | bills[bill_id]['state'] = state 34 | except: 35 | pass 36 | bad_count += 1 37 | print 'bad_count: ', bad_count 38 | bill_id += 1 39 | k += 1 40 | 41 | #get more evaluation bills 42 | eval_bills = grab_more_eval_bills() 43 | for more_bills in eval_bills: 44 | print 'bill_group: ' k 45 | k +=1 46 | for text, state in more_bills: 47 | bill_id += 1 48 | print 'bill_id: ', i 49 | 50 | bills[bill_id] = {} 51 | bills[bill_id]['text'] = text 52 | bills[bill_id]['state'] = state 53 | bills[bill_id]['match'] = k 54 | 55 | try: 56 | for bill in bills.keys(): 57 | if bills[bill] == {} or bills[bill]['text'] == '' \ 58 | or bills[bill]['text'] == None: 59 | 60 | del bills[bill] 61 | except: 62 | pass 63 | 64 | return bills 65 | 66 | def get_bill_by_id(unique_id): 67 | es = Elasticsearch(['54.203.12.145:9200', '54.203.12.145:9200'], timeout=300) 68 | match = es.search(index="state_bills", body={"query": {"match": {'unique_id': unique_id}}}) 69 | bill_text = match['hits']['hits'][0]['_source']['bill_document_first'] 70 | return bill_text 71 | 72 | def grab_more_eval_bills(): 73 | with open('../../data/evaluation_set/bills_for_evaluation_set.csv') as f: 74 | bills_list = [row for row in csv.reader(f.read().splitlines())] 75 | 76 | bill_ids_list = [] 77 | url_lists = [] 78 | topic_list = [] 79 | for i in range(len(bills_list)): 80 | state = bills_list[i][1] 81 | if state == 'ct': 82 | continue 83 | topic = bills_list[i][0] 84 | bill_number = bills_list[i][2] 85 | bill_number = re.sub(' ', '', bill_number) 86 | year = bills_list[i][3] 87 | url = bills_list[i][6] 88 | unique_id = str(state + '_' + year + '_' + bill_number) 89 | topic_list.append(topic) 90 | bill_ids_list.append(unique_id) 91 | url_lists.append(url) 92 | 93 | bills_ids = zip(bill_ids_list, url_lists) 94 | 95 | bad_count = 0 96 | bills_text = [] 97 | state_list = [] 98 | for i in range(len(bills_ids)): 99 | try: 100 | bill_text = get_bill_by_id(bills_ids[i][0]) 101 | except IndexError: 102 | try: 103 | url = bills_ids[i][1] 104 | doc = urllib.urlopen(url).read() 105 | bill_text = parser.from_buffer(doc)['content'] 106 | print url 107 | except IOError: 108 | bad_count += 1 109 | print 'bad_count: ', bad_count 110 | #skip this case 111 | continue 112 | bills_text.append(bill_text) 113 | state = bills_ids[i][0][0:2] 114 | state_list.append(state) 115 | 116 | bills_state = zip(bills_text, state_list, topic_list) 117 | 118 | bill_type_1 = [] 119 | bill_type_2 = [] 120 | for bill in bills_state: 121 | if bill[-1] == 'Adult Guardianship and Protective Proceedings Jurisdiction Act': 122 | bill_type_1.append((bill[0],bill[1])) 123 | else: 124 | bill_type_2.append((bill[0],bill[1])) 125 | 126 | return [bill_type_2, bill_type_1] 127 | 128 | def create_save_bills(bill_list): 129 | bills = create_bills(bill_list) 130 | with open('../../data/evaluation_set/labeled_bills.p', 'wb') as fp: 131 | pickle.dump(bills, fp) 132 | 133 | return bills 134 | 135 | 136 | if __name__ == '__main__': 137 | #each list in this list of lists contains bills that are matches 138 | similar_bills = [[('http://www.azleg.gov/legtext/52leg/1r/bills/hb2505p.pdf', 'az'), 139 | ('http://www.legis.state.ak.us/basis/get_bill_text.asp?hsid=SB0012B&session=29', 'ak' ), 140 | 
('http://www.capitol.hawaii.gov/session2015/bills/HB9_.PDF', 'hi'), 141 | ('http://www.capitol.hawaii.gov/session2015/bills/HB1047_.PDF', 'hi'), 142 | ('http://flsenate.gov/Session/Bill/2015/1490/BillText/Filed/HTML','fl'), 143 | ('http://ilga.gov/legislation/fulltext.asp?DocName=09900SB1836&GA=99&SessionId=88&DocTypeId=SB&LegID=88673&DocNum=1836&GAID=13&Session=&print=true','il'), 144 | ('http://www.legis.la.gov/Legis/ViewDocument.aspx?d=933306', 'la'), 145 | ('http://mgaleg.maryland.gov/2015RS/bills/sb/sb0040f.pdf', 'md'), 146 | ('http://www.legislature.mi.gov/documents/2015-2016/billintroduced/House/htm/2015-HIB-4167.htm', 'mi'), 147 | ('https://www.revisor.mn.gov/bills/text.php?number=HF549&version=0&session=ls89&session_year=2015&session_number=0','mn'), 148 | ('http://www.njleg.state.nj.us/2014/Bills/A2500/2354_R2.HTM','nj'), 149 | ('http://assembly.state.ny.us/leg/?sh=printbill&bn=A735&term=2015','ny'), 150 | ('http://www.ncga.state.nc.us/Sessions/2015/Bills/House/HTML/H270v1.html','nc'), 151 | ('https://olis.leg.state.or.us/liz/2015R1/Downloads/MeasureDocument/HB2005/A-Engrossed','or'), 152 | ('https://olis.leg.state.or.us/liz/2015R1/Downloads/MeasureDocument/SB947/Introduced','or'), 153 | ('http://www.legis.state.pa.us/CFDOCS/Legis/PN/Public/btCheck.cfm?txtType=HTM&sessYr=2015&sessInd=0&billBody=H&billTyp=B&billNbr=0624&pn=0724', 'pa'), 154 | ('http://www.scstatehouse.gov/sess121_2015-2016/prever/172_20141203.htm','sc'), 155 | ('http://lawfilesext.leg.wa.gov/Biennium/2015-16/Htm/Bills/House%20Bills/1356.htm', 'wa'), 156 | ('http://www.legis.state.wv.us/Bill_Status/bills_text.cfm?billdoc=hb2874%20intr.htm&yr=2015&sesstype=RS&i=2874','wv'), 157 | ('http://www.legis.state.wv.us/Bill_Status/bills_text.cfm?billdoc=hb2874%20intr.htm&yr=2015&sesstype=RS&i=2874', 'wv'), 158 | # ('ftp://ftp.cga.ct.gov/2015/tob/h/2015HB-06784-R00-HB.htm','ct'), 159 | ('http://www.capitol.hawaii.gov/session2015/bills/SB129_.PDF','hi'), 160 | ('http://nebraskalegislature.gov/FloorDocs/104/PDF/Intro/LB493.pdf', 'ne'), 161 | ('http://www.gencourt.state.nh.us/legislation/2015/HB0600.html', 'nh')], 162 | [('http://alecexposed.org/w/images/2/2d/7K5-No_Sanctuary_Cities_for_Illegal_Immigrants_Act_Exposed.pdf', 'model_legislation'), 163 | ('http://www.kslegislature.org/li_2012/b2011_12/measures/documents/hb2578_00_0000.pdf', 'ks'), 164 | ('http://flsenate.gov/Session/Bill/2011/0237/BillText/Filed/HTML','fl'), 165 | ('http://openstates.org/al/bills/2012rs/SB211/','al'), 166 | ('http://le.utah.gov/~2011/bills/static/HB0497.html','ut'), 167 | ('http://webserver1.lsb.state.ok.us/cf_pdf/2013-14%20FLR/HFLR/HB1436%20HFLR.PDF','ok')], 168 | [('http://www.alec.org/model-legislation/the-disclosure-of-hydraulic-fracturing-fluid-composition-act/', 'model_legislation'), 169 | ('ftp://ftp.legis.state.tx.us/bills/82R/billtext/html/house_bills/HB03300_HB03399/HB03328S.htm', 'tx')], 170 | [('http://www.legislature.mi.gov/(S(ntrjry55mpj5pv55bv1wd155))/documents/2005-2006/billintroduced/House/htm/2005-HIB-5153.htm', 'mi'), 171 | ('http://www.schouse.gov/sess116_2005-2006/bills/4301.htm','sc'), 172 | ('http://www.lrc.ky.gov/record/06rs/SB38.htm', 'ky'), 173 | ('http://www.okhouse.gov/Legislation/BillFiles/hb2615cs%20db.PDF', 'ok'), 174 | ('http://state.tn.us/sos/acts/105/pub/pc0210.pdf', 'tn'), 175 | ('https://docs.legis.wisconsin.gov/2011/related/proposals/ab69', 'wi'), 176 | ('http://legisweb.state.wy.us/2008/Enroll/HB0137.pdf', 'wy'), 177 | ('http://www.kansas.gov/government/legislative/bills/2006/366.pdf', 'ks'), 178 | 
('http://billstatus.ls.state.ms.us/documents/2006/pdf/SB/2400-2499/SB2426SG.pdf', 'mi')], 179 | [('http://www.alec.org/model-legislation/state-withdrawal-from-regional-climate-initiatives/', 'model_legislation'), 180 | ('http://www.legislature.mi.gov/documents/2011-2012/resolutionintroduced/House/htm/2011-HIR-0134.htm', 'mi'), 181 | ('http://www.nmlegis.gov/Sessions/11%20Regular/memorials/house/HJM024.html', 'nm')], 182 | [('http://alecexposed.org/w/images/9/90/7J1-Campus_Personal_Protection_Act_Exposed.pdf', 'model_legislation'), 183 | ('ftp://ftp.legis.state.tx.us/bills/831/billtext/html/house_bills/HB00001_HB00099/HB00056I.htm', 'tx')], 184 | # [ 185 | # ('http://essexuu.org/ctstat.html', 'ct'), we don't have connecituc 186 | # ('http://alisondb.legislature.state.al.us/alison/codeofalabama/constitution/1901/CA-170364.htm', 'al')], 187 | [('http://www.legis.state.ak.us/basis/get_bill_text.asp?hsid=HB0162A&session=27', 'ak'), 188 | ('https://legiscan.com/AL/text/HB19/id/327641/Alabama-2011-HB19-Enrolled.pdf', 'al'), 189 | ('http://www.leg.state.co.us/clics/clics2012a/csl.nsf/fsbillcont3/0039C9417C9D9D5D87257981007F3CC9?open&file=1111_01.pdf', 'co'), 190 | ('http://www.capitol.hawaii.gov/session2012/Bills/HB2221_.PDF', 'hi'), 191 | ('http://ilga.gov/legislation/fulltext.asp?DocName=09700HB3058&GA=97&SessionId=84&DocTypeId=HB&LegID=60409&DocNum=3058&GAID=11&Session=&print=true', 'il'), 192 | ('http://coolice.legis.iowa.gov/Legislation/84thGA/Bills/SenateFiles/Introduced/SF142.html', 'ia'), 193 | ('ftp://www.arkleg.state.ar.us/Bills/2011/Public/HB1797.pdf','ar'), 194 | ('http://billstatus.ls.state.ms.us/documents/2012/html/HB/0900-0999/HB0921SG.htm', 'ms'), 195 | ('http://www.leg.state.nv.us/Session/76th2011/Bills/SB/SB373.pdf', 'nv'), 196 | ('http://www.njleg.state.nj.us/2012/Bills/A1000/674_I1.HTM', 'nj'), 197 | ('http://webserver1.lsb.state.ok.us/cf_pdf/2011-12%20INT/hB/HB2821%20INT.PDF', 'ok'), 198 | ('http://www.legis.state.pa.us/CFDOCS/Legis/PN/Public/btCheck.cfm?txtType=PDF&sessYr=2011&sessInd=0&billBody=H&billTyp=B&billNbr=0934&pn=1003', 'pa'), 199 | ('http://www.capitol.tn.gov/Bills/107/Bill/SB0016.pdf', 'tn')], 200 | [('http://www.legislature.idaho.gov/idstat/Title39/T39CH6SECT39-608.htm', 'id'), 201 | ('http://www.legis.nd.gov/cencode/t12-1c20.pdf?20150708171557', 'nd')] 202 | ] 203 | 204 | bills = create_save_bills(similar_bills) 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | -------------------------------------------------------------------------------- /lid/frontend.py: -------------------------------------------------------------------------------- 1 | #/usr/bin/env python 2 | import os 3 | import pdb 4 | import sys 5 | import argparse 6 | import datetime as dt 7 | import time 8 | from collections import defaultdict 9 | import cherrypy 10 | from jinja2 import Environment, FileSystemLoader, Template 11 | import random 12 | import string 13 | import json 14 | from elasticsearch import Elasticsearch 15 | from database import ElasticConnection 16 | import re 17 | import nltk 18 | from utils.text_cleaning import clean_document 19 | from lid import LID 20 | from utils.general_utils import alignment_tokenizer 21 | from text_alignment import LocalAligner,AffineLocalAligner 22 | 23 | 24 | 25 | def get_alignment_highlight(text1,text2): 26 | aligns = align(text1, text2) 27 | alignment = aligns[0] 28 | seq1 = nltk.word_tokenize(text1) 29 | seq2 = nltk.word_tokenize(text2) 30 | align_clean_1, align_clean_2 = cleanAlignment(alignment) 31 | [i,j] = contains(align_clean_1, seq1) 32 | [k,l] 
= contains(align_clean_2, seq2) 33 | seq1.insert(i,"") 34 | seq1.insert(j,"") 35 | seq2.insert(k,"") 36 | seq2.insert(l,"") 37 | 38 | text1 = " ".join(seq1) 39 | text2 = " ".join(seq2) 40 | 41 | return text1,text2 42 | 43 | 44 | 45 | def markup_alignment_for_display(alignment_dict,left_text,right_text): 46 | 47 | left_text = left_text.split() 48 | right_text = right_text.split() 49 | l = alignment_dict['left'] 50 | r = alignment_dict['right'] 51 | left_start = alignment_dict['left_start'] 52 | left_end = alignment_dict['left_end'] 53 | right_start = alignment_dict['right_start'] 54 | right_end = alignment_dict['right_end'] 55 | 56 | 57 | 58 | #mark up l and r alignments with style 59 | l_styled = [] 60 | r_styled = [] 61 | temp_text = "" 62 | for i in range(len(l)): 63 | if l[i] == r[i] and l[i] != "-": 64 | temp_text+=l[i] 65 | temp_text+=" " 66 | if l[i] != r[i]: 67 | if len(temp_text)>0: 68 | temp_text = u"{0}".format(temp_text) 69 | l_styled.append(temp_text) 70 | r_styled.append(temp_text) 71 | temp_text = "" 72 | if l[i] != "-" and r[i] != "-": 73 | l_styled.append(u"{0}".format(l[i])) 74 | r_styled.append(u"{0}".format(r[i])) 75 | else: 76 | l_styled.append(l[i]) 77 | r_styled.append(r[i]) 78 | 79 | temp_text = u"{0}".format(temp_text) 80 | l_styled.append(temp_text) 81 | r_styled.append(temp_text) 82 | 83 | #l[i] = "{0}".format(l[i]) 84 | #r[i] = "{0}".format(r[i]) 85 | 86 | #l.insert(0,"") 87 | #l.append("") 88 | #r.insert(0,"") 89 | #r.append("") 90 | 91 | padding = [u"

"] 92 | 93 | left_text = left_text[:left_start]+padding+l_styled+\ 94 | padding+left_text[left_end:] 95 | 96 | right_text = right_text[:right_start]+padding+r_styled+padding\ 97 | +right_text[right_end:] 98 | 99 | left_text = u" ".join(left_text) 100 | right_text = u" ".join(right_text) 101 | 102 | return left_text,right_text 103 | 104 | 105 | 106 | 107 | def markup_alignment_difference(l,r): 108 | l_styled = [] 109 | r_styled = [] 110 | temp_text = "" 111 | for i in range(len(l)): 112 | if l[i] != r[i]: 113 | l[i] = u"{0}".format(l[i]) 114 | r[i] = u"{0}".format(r[i]) 115 | 116 | return l,r 117 | 118 | 119 | class DemoWebserver(object): 120 | 121 | _cp_config = { 122 | 'tools.staticdir.on' : True, 123 | 'tools.staticdir.dir' : "{0}/html".format(os.environ['POLICY_DIFFUSION']), 124 | 'tools.staticdir.index' : '/templates/searchdemo.html.jinja', 125 | 'tools.sessions.on': True, 126 | } 127 | 128 | 129 | 130 | def __init__(self,elastic_connection): 131 | self.ec = elastic_connection 132 | self.lidy = LID(elastic_host = os.environ['ELASTICSEARCH_IP'], 133 | query_results_limit=os.environ['QUERY_RESULTS_LIMIT']) 134 | 135 | self.aligner = LocalAligner() 136 | #self.query_bill = "bill" 137 | 138 | 139 | @cherrypy.expose 140 | def searchdemo(self, query_string="proof of identity", query_bill = "bill", query_results=[]): 141 | 142 | query_string = re.sub('\"',' ',query_string) 143 | 144 | if query_bill == "model legislation": 145 | 146 | query_result = lidy.find_model_legislation_alignments(query_string, document_type="text", 147 | split_sections=False, query_document_id="front_end_query") 148 | 149 | results_to_show = [] 150 | 151 | for result_doc in query_result['alignment_results']: 152 | 153 | meta_data = result_doc['document_id'].replace('old_bills', 'oldbills').split('_') 154 | meta_data = [meta_data[0].upper(),meta_data[1].upper(),meta_data[2]] 155 | 156 | result_text = ec.get_model_legislation_by_id(result_doc['document_id'])['source'] 157 | result_text = re.sub('\"',' ',result_text) 158 | 159 | alignment = result_doc['alignments'][0] 160 | score = alignment['score'] 161 | 162 | left,right = markup_alignment_for_display(alignment, 163 | query_string, result_text) 164 | left = re.sub('\"',' ',left) 165 | right = re.sub('\"',' ',right) 166 | results_to_show.append([score] + meta_data + [left,right]) 167 | 168 | results_to_show.sort(key = lambda x:x[0],reverse = True) 169 | 170 | tmpl = env.get_template("searchdemo.html.jinja") 171 | c = { 172 | 'query_string': query_string, 173 | 'results_to_show': results_to_show, 174 | } 175 | return tmpl.render(**c) 176 | 177 | 178 | if query_bill == "constitution": 179 | 180 | query_result = constitution_lidy.find_constitution_alignments(query_string, document_type="text", 181 | split_sections=True, query_document_id="text") 182 | 183 | results_to_show = [] 184 | 185 | for result_doc in query_result['alignment_results']: 186 | 187 | state = result_doc['document_id'][:-5].upper() 188 | year = result_doc['document_id'][-4:] 189 | meta_data = ["CONSTITUTION", state, year] 190 | 191 | result_text = ec.get_constitution_by_id(result_doc['document_id'])['constitution'] 192 | result_text = re.sub('\"',' ',result_text) 193 | print result_text 194 | 195 | alignment = result_doc['alignments'][0] 196 | score = alignment['score'] 197 | 198 | left,right = markup_alignment_for_display(alignment, 199 | query_string, result_text) 200 | left = re.sub('\"',' ',left) 201 | right = re.sub('\"',' ',right) 202 | results_to_show.append([score] + meta_data + [left,right]) 203 | 
204 | results_to_show.sort(key = lambda x:x[0],reverse = True) 205 | 206 | tmpl = env.get_template("searchdemo.html.jinja") 207 | c = { 208 | 'query_string': query_string, 209 | 'results_to_show': results_to_show, 210 | } 211 | return tmpl.render(**c) 212 | 213 | 214 | else: 215 | query_result = lidy.find_state_bill_alignments(query_string, document_type="text", 216 | split_sections=False, query_document_id="front_end_query") 217 | 218 | results_to_show = [] 219 | 220 | for result_doc in query_result['alignment_results']: 221 | 222 | meta_data = result_doc['document_id'].split("_") 223 | meta_data = [meta_data[0].upper(),meta_data[1].upper(),meta_data[2]] 224 | 225 | result_text = ec.get_bill_by_id(result_doc['document_id'])['bill_document_last'] 226 | result_text = re.sub('\"',' ',result_text) 227 | 228 | alignment = result_doc['alignments'][0] 229 | score = alignment['score'] 230 | 231 | left,right = markup_alignment_for_display(alignment, 232 | query_string,result_text) 233 | left = re.sub('\"',' ',left) 234 | right = re.sub('\"',' ',right) 235 | results_to_show.append([score] + meta_data + [left,right]) 236 | 237 | results_to_show.sort(key = lambda x:x[0],reverse = True) 238 | 239 | tmpl = env.get_template("searchdemo.html.jinja") 240 | c = { 241 | 'query_string': query_string, 242 | 'results_to_show': results_to_show, 243 | } 244 | return tmpl.render(**c) 245 | 246 | 247 | 248 | if __name__ == '__main__': 249 | policy_diffusion_path=os.environ['POLICY_DIFFUSION'] 250 | ec_ip = os.environ['ELASTICSEARCH_IP'] 251 | parser = argparse.ArgumentParser() 252 | parser.add_argument('--host', type=str, default='0.0.0.0') 253 | parser.add_argument('--port', type=int, default=29010) 254 | parser.add_argument('--elasticsearch_connection',default=u"{0}:9200".format(ec_ip)) 255 | args = parser.parse_args() 256 | 257 | env = Environment(loader=FileSystemLoader("{0}/html/templates".format(policy_diffusion_path))) 258 | 259 | query_samples = [x.strip() for x in open("{0}/data/state_bill_samples.txt".format(policy_diffusion_path))] 260 | 261 | aligner = AffineLocalAligner(match_score=4, mismatch_score=-1, gap_start=-3, gap_extend=-1.5) 262 | 263 | ec = ElasticConnection(host = ec_ip) 264 | 265 | lidy = LID(query_results_limit=20, elastic_host=ec_ip, 266 | lucene_score_threshold=0.01, aligner=aligner) 267 | 268 | constitution_lidy = LID(query_results_limit=10000, 269 | elastic_host=ec_ip, lucene_score_threshold=0.01, 270 | aligner=aligner) 271 | 272 | 273 | es_host,es_port = args.elasticsearch_connection.split(":") 274 | cherrypy.config.update({'server.socket_port': args.port, 'server.socket_host': args.host}) 275 | cherrypy.quickstart(DemoWebserver(ec), "/") 276 | -------------------------------------------------------------------------------- /lid/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/policy_diffusion/61d5885e809ab8d92db18d79f7a82b6461cf4343/lid/utils/__init__.py -------------------------------------------------------------------------------- /lid/utils/general_utils.py: -------------------------------------------------------------------------------- 1 | import ujson 2 | import base64 3 | import urllib2 4 | import socket 5 | from ftplib import FTP, error_perm 6 | import re 7 | from StringIO import StringIO 8 | import time 9 | import multiprocessing 10 | import pickle 11 | import multiprocessing as mp 12 | import gc 13 | import signal 14 | import csv 15 | import codecs 16 | import cStringIO 17 | 18 | 
#######Code from http://www.filosophy.org/post/32/python_function_execution_deadlines__in_simple_examples/ ######### 19 | 20 | class TimedOutExc(Exception): 21 | pass 22 | 23 | def deadline(timeout, *args): 24 | 25 | def decorate(f): 26 | def handler(signum, frame): 27 | raise TimedOutExc() 28 | 29 | def new_f(*args): 30 | 31 | signal.signal(signal.SIGALRM, handler) 32 | signal.alarm(timeout) 33 | return f(*args) 34 | signa.alarm(0) 35 | 36 | new_f.__name__ = f.__name__ 37 | return new_f 38 | return decorate 39 | 40 | #######Code from http://www.filosophy.org/post/32/python_function_execution_deadlines__in_simple_examples/ ######### 41 | 42 | class UTF8Recoder: 43 | """ 44 | Iterator that reads an encoded stream and reencodes the input to UTF-8 45 | """ 46 | def __init__(self, f, encoding): 47 | self.reader = codecs.getreader(encoding)(f) 48 | 49 | def __iter__(self): 50 | return self 51 | 52 | def next(self): 53 | return self.reader.next().encode("utf-8") 54 | 55 | 56 | class UnicodeReader(): 57 | """ 58 | A CSV reader which will iterate over lines in the CSV file "f", 59 | which is encoded in the given encoding. 60 | """ 61 | 62 | def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds): 63 | f = UTF8Recoder(f, encoding) 64 | self.reader = csv.reader(f, dialect=dialect, **kwds) 65 | 66 | def next(self): 67 | row = self.reader.next() 68 | return [unicode(s, "utf-8") for s in row] 69 | 70 | def __iter__(self): 71 | return self 72 | 73 | 74 | 75 | 76 | class UnicodeWriter(): 77 | def __init__(self, f, dialect=csv.excel, encoding="utf-8-sig", **kwds): 78 | self.queue = cStringIO.StringIO() 79 | self.writer = csv.writer(self.queue, dialect=dialect, **kwds) 80 | self.stream = f 81 | self.encoder = codecs.getincrementalencoder(encoding)() 82 | def writerow(self, row): 83 | '''writerow(unicode) -> None 84 | This function takes a Unicode string and encodes it to the output. 
85 | ''' 86 | self.writer.writerow([s.encode("utf-8") for s in row]) 87 | data = self.queue.getvalue() 88 | data = data.decode("utf-8") 89 | data = self.encoder.encode(data) 90 | self.stream.write(data) 91 | self.queue.truncate(0) 92 | 93 | def writerows(self, rows): 94 | for row in rows: 95 | self.writerow(row) 96 | 97 | #********DEPRECATED************* 98 | class WorkerPool(): 99 | 100 | def __init__(self,num_workers=1,worker_timeout = 600): 101 | 102 | self._num_workers = num_workers 103 | self._worker_timeout = worker_timeout 104 | self._results = mp.Queue() 105 | self._pool = [None]*self._num_workers 106 | self._worker_times = [0.0]*self._num_workers 107 | 108 | def _assign_new_task(self,worker_id,input_args): 109 | p = self._pool[worker_id] 110 | p.join() 111 | arg = input_args.pop() 112 | new_p = mp.Process(target= func,args = (arg,self._results),name = ('process_'+str(worker_id))) 113 | new_p.start() 114 | self._pool[worker_id] = new_p 115 | self._worker_times[worker_id] = time.time() 116 | 117 | def work(self,func,input_args): 118 | worker_counter = 0 119 | #define wrapper function that queues result from input func 120 | def new_func(x): 121 | y = func(*x) 122 | self._results.put(y) 123 | 124 | 125 | while len(input_args) > 0 or ("running" in status): 126 | 127 | #assign new worker tasks to empty pool slots 128 | for i in range(self._num_workers): 129 | 130 | if len(input_args) > 0 and self._pool[i] is None: 131 | arg = input_args.pop(0) 132 | new_p = mp.Process(target= new_func,args = (arg,),name = ('process_'+str(i))) 133 | new_p.start() 134 | print worker_counter 135 | worker_counter+=1 136 | self._pool[i] = new_p 137 | self._worker_times[i] = time.time() 138 | 139 | time.sleep(0.1) 140 | status = self.check_pool_status(time.time()) 141 | import numpy as np 142 | print time.time() - np.array(self._worker_times) 143 | for i in range(len(status)): 144 | if status[i] == "completed": 145 | p = self._pool[i] 146 | p.terminate() 147 | p.join() 148 | self._pool[i] = None 149 | del p 150 | elif status[i] == "timeout": 151 | p = self._pool[i] 152 | p.terminate() 153 | self._pool[i] = None 154 | print "terminated job ",p.name 155 | gc.collect() 156 | 157 | result_list = [] 158 | 159 | while not self._results.empty(): 160 | result_list.append( self._results.get() ) 161 | 162 | return result_list 163 | 164 | #returns a list of bools indicating running status of each worker. 
165 | #running,timeout,completed 166 | def check_pool_status(self,current_time): 167 | status_list = [] 168 | for i in range(self._num_workers): 169 | 170 | worker = self._pool[i] 171 | if worker is None: 172 | status_list.append("closed") 173 | elif worker.is_alive() and (current_time-self._worker_times[i]=self._worker_timeout): 176 | status_list.append("timeout") 177 | elif not worker.is_alive(): 178 | status_list.append("completed") 179 | 180 | return status_list 181 | # ********DEPRECATED************* 182 | 183 | 184 | def alignment_tokenizer(s,type = "space"): 185 | if type == "space": 186 | s = s.split(" ") 187 | return s 188 | 189 | #creates a searalized json object for bill sources 190 | def bill_source_to_json(url,source,date): 191 | jsonObj = {} 192 | jsonObj['url'] = url 193 | jsonObj['date'] = date 194 | jsonObj['source'] = base64.b64encode(source) 195 | 196 | return ujson.encode(jsonObj) 197 | 198 | #creates a json object for bill sources (not encoded) 199 | def bill_source_to_json_not_encoded(url,source,date): 200 | jsonObj = {} 201 | jsonObj['url'] = url 202 | jsonObj['date'] = date 203 | jsonObj['source'] = source 204 | 205 | return ujson.encode(jsonObj) 206 | 207 | #wrapper for urllib2.urlopen that catches URLERROR and socket error 208 | def fetch_url(url): 209 | 210 | #fetch ftp file 211 | if 'ftp://' in url: 212 | 213 | try: 214 | domain_pattern = re.compile("/[A-Za-z0-9\.]+") 215 | domain_name = domain_pattern.search(url).group(0)[1:] 216 | ftp = FTP(domain_name,timeout=10) 217 | ftp.login() 218 | file_name = "/".join(url.split("/")[3:]) 219 | 220 | r = StringIO() 221 | ftp.retrbinary('RETR {0}'.format(file_name), r.write) 222 | document = r.getvalue() 223 | time.sleep(1) 224 | 225 | except (KeyboardInterrupt, SystemExit): 226 | raise 227 | except: 228 | document = None 229 | 230 | 231 | return document 232 | 233 | #fetch http file 234 | else: 235 | 236 | try: 237 | req = urllib2.urlopen(url,timeout=10) 238 | document = req.read() 239 | except (KeyboardInterrupt, SystemExit): 240 | raise 241 | except: 242 | document = None 243 | 244 | return document 245 | 246 | #used to find alignments in broader text 247 | def find_subsequence(s,q): 248 | ''' 249 | is the list s contained in q in order and if it is what are indices 250 | ''' 251 | for i in range(len(q)): 252 | T = True 253 | for j in range(len(s)): 254 | if s[j] != q[i+j]: 255 | T = False 256 | break 257 | if T: 258 | return (i, i + j + 1) 259 | return (0,0) 260 | 261 | 262 | def load_pickle(name): 263 | with open('{0}.p'.format(name),'rb') as fp: 264 | f =pickle.load(fp) 265 | 266 | return f 267 | 268 | 269 | def save_pickle(thing, name): 270 | with open('{0}.p'.format(name),'wb') as fp: 271 | pickle.dump(thing, fp) 272 | -------------------------------------------------------------------------------- /lid/utils/sunlight_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import string 4 | import tempfile 5 | import importlib 6 | import subprocess 7 | 8 | 9 | 10 | PUNCTUATION = re.compile('[%s]' % re.escape(string.punctuation)) 11 | 12 | 13 | def convert_pdf(filename, type='xml'): 14 | commands = {'text': ['pdftotext', '-layout', filename, '-'], 15 | 'text-nolayout': ['pdftotext', filename, '-'], 16 | 'xml': ['pdftohtml', '-xml', '-stdout', filename], 17 | 'html': ['pdftohtml', '-stdout', filename]} 18 | try: 19 | pipe = subprocess.Popen(commands[type], stdout=subprocess.PIPE, 20 | close_fds=True).stdout 21 | except OSError as e: 22 | raise 
EnvironmentError("error running %s, missing executable? [%s]" % 23 | ' '.join(commands[type]), e) 24 | data = pipe.read() 25 | pipe.close() 26 | return data 27 | 28 | 29 | def pdfdata_to_text(data): 30 | with tempfile.NamedTemporaryFile(delete=True) as tmpf: 31 | tmpf.write(data) 32 | tmpf.flush() 33 | return convert_pdf(tmpf.name, 'text') 34 | 35 | 36 | def worddata_to_text(data): 37 | desc, txtfile = tempfile.mkstemp(prefix='tmp-worddata-', suffix='.txt') 38 | try: 39 | with tempfile.NamedTemporaryFile(delete=True) as tmpf: 40 | tmpf.write(data) 41 | tmpf.flush() 42 | subprocess.check_call(['timeout', '10', 'abiword', 43 | '--to=%s' % txtfile, tmpf.name]) 44 | f = open(txtfile) 45 | text = f.read() 46 | tmpf.close() 47 | f.close() 48 | finally: 49 | os.remove(txtfile) 50 | os.close(desc) 51 | return text.decode('utf8') 52 | 53 | 54 | def text_after_line_numbers(lines): 55 | text = [] 56 | for line in lines.splitlines(): 57 | # real bill text starts with an optional space, line number 58 | # more spaces, then real text 59 | match = re.match('\s*\d+\s+(.*)', line) 60 | if match: 61 | text.append(match.group(1)) 62 | 63 | # return all real bill text joined w/ newlines 64 | return '\n'.join(text).decode('utf-8', 'ignore') 65 | 66 | 67 | def plaintext(abbr, doc, doc_bytes): 68 | # use module to pull text out of the bytes 69 | module = importlib.import_module(abbr) 70 | text = module.extract_text(doc, doc_bytes) 71 | 72 | if not text: 73 | return 74 | 75 | if isinstance(text, unicode): 76 | text = text.encode('ascii', 'ignore') 77 | else: 78 | text = text.decode('utf8', 'ignore').encode('ascii', 'ignore') 79 | text = text.replace(u'\xa0', u' ') # nbsp -> sp 80 | text = PUNCTUATION.sub(' ', text) # strip punctuation 81 | text = re.sub('\s+', ' ', text) # collapse spaces 82 | return text 83 | 84 | 85 | -------------------------------------------------------------------------------- /lid/utils/text_cleaning.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Clean text in ElasticSearch 3 | ''' 4 | 5 | import elasticsearch 6 | import re 7 | import string 8 | import urllib2 9 | from elasticsearch import Elasticsearch 10 | from pprint import pprint 11 | import nltk 12 | 13 | #custom modules 14 | #from database import ElasticConnection 15 | 16 | def clean_text(text, lower = True): 17 | ''' 18 | variables: 19 | text: string corresponding to text of bill 20 | bill_name: string corresponding to bill_id 21 | 22 | returns: 23 | string that is cleaned up text 24 | decription: 25 | clean text 26 | ''' 27 | #make text lowercase 28 | if lower == True: 29 | text = text.lower() 30 | 31 | text = re.sub('\n[ ]*[0-9]+', '', text) 32 | text = re.sub('[ ]{2,}', u' ', text) 33 | 34 | #parse by line 35 | text_list = text.splitlines() 36 | 37 | #replace funky symbols and multiple new lines 38 | ntext_list = [] 39 | for line in text_list: 40 | line = line.replace(u'\xa0', u' ') 41 | line = line.replace(u'>>', u' ') 42 | line = line.replace(u'\xa7', u' ') 43 | line = line.replace(u'\xe2', u' ') 44 | line = line.replace(u'\u201c', u' ') 45 | line = line.replace(u'\u201d', u' ') 46 | line = line.replace(u'\xbb', u' ') 47 | line = line.replace(u'\xa9', u' ') 48 | line = line.replace(u' ,', u',') 49 | line = line.replace(u'{ font-family: courier, arial, sans-serif; font-size: 10pt; } table { empty-cells:show; }', u' ') 50 | line = re.sub( '\s+', u' ', line) 51 | ntext_list.append(line) 52 | return (string.join(ntext_list, '\n')) 53 | 54 | 55 | 56 | 57 | 58 | 59 | def 
split_to_sections(cleantext,state): 60 | ''' 61 | variables: 62 | cleantext: clean version of text of bill 63 | state: abbreviation of state ID 64 | 65 | returns: 66 | list of bill sections 67 | decription: 68 | splits bill text into sections 69 | ''' 70 | if state == 'ak': 71 | chunked_list = cleantext.split("\n*") 72 | elif state in ('al','ar','mt','or','ri'): 73 | chunked_list = cleantext.split('\nsection') 74 | elif state in ('nm','tx'): 75 | chunked_list = cleantext.split('\n section') 76 | elif state in ('az','ia','nv', 'wa', 'vt'): 77 | chunked_list = cleantext.split('\nsec.') 78 | elif state in ('me', 'mi'): 79 | chunked_list = cleantext.split('\n sec.') 80 | elif state == 'co': 81 | chunked_list = re.split('[[0-9][0-9]\.section|[0-9]\.section', cleantext) 82 | elif state in ('de','fl','tn'): 83 | chunked_list = re.split('section\s[0-9][0-9]\.|section\s[0-9]\.', cleantext) 84 | elif state == 'ga': 85 | cleantext = re.sub('[0-9][0-9]\\n|[0-9]\\n', ' ', cleantext) 86 | chunked_list = re.split('\\nsection\s[0-9][0-9]|\\nsection\s[0-9]', cleantext) 87 | elif state in ('hi','sd','in'): 88 | chunked_list = re.split('\\n\ssection\s[0-9][0-9]\.|\\n\ssection\s[0-9]', cleantext) 89 | elif state == 'pa': 90 | chunked_list = re.split('section\s[0-9][0-9]\.|section\s[0-9]\.', cleantext) 91 | elif state in ('id', 'la', 'md', 'nd'): 92 | chunked_list = re.split('\\nsection\s[0-9][0-9]\.|\\nsection\s[0-9]\.', cleantext) 93 | elif state == 'il': 94 | cleantext = re.sub('\\n\s[0-9][0-9]|\\n\s[0-9]', ' ', cleantext) 95 | chunked_list = re.split('\\n\s\ssection\s', cleantext) 96 | elif state == 'sc': 97 | chunked_list = cleantext.split('\n \n') 98 | elif state == 'ks': 99 | chunked_list = re.split('\\nsection\s|sec\.', cleantext) 100 | elif state in ('ne', 'mn'): 101 | chunked_list = re.split('\ssection\s[0-9]\.|\ssec.\s[0-9][0-9]\.|\ssec.\s[0-9]\.', cleantext) 102 | elif state == 'ky': 103 | chunked_list = cleantext.split('\n\n\n section .') 104 | elif state == 'ms': 105 | chunked_list = cleantext.split('\n\n\n section ') 106 | elif state in ('ma', 'nc', 'oh','ut'): 107 | chunked_list = re.split('\ssection\s[0-9][0-9]\.|\ssection\s[0-9]\.', cleantext) 108 | elif state == 'mo': 109 | chunked_list = re.split('\\n\s[0-9][0-9]\.\s|\\n\s[0-9]\.\s', cleantext) 110 | elif state == 'nh': 111 | chunked_list = re.split('\n\n[0-9][0-9]\s|\n\n[0-9]\s', cleantext) 112 | elif state == 'nj': 113 | chunked_list = re.split('\\n\\n\s[0-9][0-9]\.\s|\\n\\n\s[0-9]\.\s', cleantext) 114 | elif state == 'ny': 115 | chunked_list = re.split('\ssection\s[0-9]\.|\.\ss\s[0-9]\.', cleantext) 116 | elif state == 'ok': 117 | chunked_list = re.split('\nsection\s\.\s', cleantext) 118 | elif state == 'va': 119 | chunked_list = re.split('(([A-Z])|[0-9][0-9])\.\s|(([A-Z])|[0-9])\.\s', cleantext) 120 | elif state == 'wi': 121 | chunked_list = re.split('\\n[0-9][0-9]section\s\\n|\\n[0-9]section\s\\n', cleantext) 122 | elif state == 'wv': 123 | chunked_list = re.split('\n\s\([a-z]\)\s', cleantext) 124 | elif state == 'wy': 125 | chunked_list = re.split('\ssection\s[0-9][0-9]\.|\ssection\s[0-9]\.', cleantext) 126 | elif state == 'ca': 127 | chunked_list = re.split('section\s[0-9]\.|sec.\s[0-9][0-9]\.|sec.\s[0-9]\.', cleantext) 128 | elif state == None: 129 | chunked_list = cleantext.split("\n") 130 | else: 131 | chunked_list = cleantext.split("\n") 132 | 133 | return chunked_list 134 | 135 | #Delete empty sections (run before deleting numbers in lines) 136 | def delete_empty_sections(chunked_list): 137 | ''' 138 | decription: deletes empty 
elements in bills 139 | ''' 140 | return [x for x in chunked_list if x is not None and len(x)>2] 141 | 142 | #Need to delete number lines for: OR, OK, NE, PA (run before deleting lines) 143 | def delete_numbers_in_lines (chunked_list): 144 | ''' 145 | decription: 146 | cleans pdf extractor errors where number of lines were included in text 147 | ''' 148 | re_string = '\\n\s[0-9][0-9]|\\n[0-9][0-9]|\\n[0-9]|\\n\s[0-9]' 149 | chunked_list = [re.sub(re_string,'',t) for t in chunked_list] 150 | return chunked_list 151 | 152 | 153 | 154 | #Delete multiple new lines for each section 155 | def delete_lines (chunked_list): 156 | ''' 157 | description: deletes multiple lines and spaces for each section 158 | ''' 159 | chunked_list = [re.sub( '\s+', ' ', x) for x in chunked_list] 160 | return chunked_list 161 | 162 | 163 | 164 | def clean_document(doc_text,doc_type = "text",split_to_section = False,**kwargs): 165 | """text -- document text 166 | doc_type --- the type of the document ( "state_bill", "model_legislation", "None") """ 167 | 168 | if doc_type == "state_bill": 169 | doc_text = clean_text(doc_text) 170 | doc_text_sections = split_to_sections(doc_text,kwargs['state_id']) 171 | doc_text_sections = delete_empty_sections(doc_text_sections) 172 | if kwargs['state_id'] in ['or','ok','ne','pa']: 173 | doc_text_sections = delete_numbers_in_lines(doc_text_sections) 174 | doc_text_sections = delete_lines(doc_text_sections) 175 | 176 | elif doc_type == "model_legislation": 177 | doc_text = clean_text(doc_text) 178 | doc_text_sections = doc_text.split('\nsection') 179 | doc_text_sections = delete_empty_sections(doc_text_sections) 180 | doc_text_sections = delete_lines(doc_text_sections) 181 | 182 | elif doc_type == "text": 183 | doc_text = clean_text(doc_text) 184 | doc_text_sections = doc_text.split('\n') 185 | doc_text_sections = delete_empty_sections(doc_text_sections) 186 | doc_text_sections = delete_lines(doc_text_sections) 187 | 188 | if split_to_section == True: 189 | return doc_text_sections 190 | elif split_to_section == False: 191 | return [" ".join(doc_text_sections)] 192 | 193 | #delete boiler plate present in all alec exposed bills after "effective date" 194 | def delete_boiler_plate_alec_exposed (chunked_list): 195 | chunked_list = [re.sub('({effective date).*$', ' ', x) for x in chunked_list] 196 | chunked_list = chunked_list[1:] 197 | return chunked_list 198 | 199 | #good example is test_clean_text_for_alignment('va') 200 | 201 | def test_clean_text(state): 202 | es = Elasticsearch(['54.203.12.145:9200', '54.203.12.145:9200'], timeout=300) 203 | match = es.search(index="state_bills", body={"query": {"match": {'state': state}}}) 204 | state_text = match['hits']['hits'][3]['_source']['bill_document_first'] 205 | cleaned_doc = clean_document(state_text,doc_type = "state_bill",state_id = "mi",split_to_section = False) 206 | return cleaned_doc 207 | 208 | def main(): 209 | #Get data from elasticsearch to test 210 | 211 | print test_clean_text("mi") 212 | 213 | if __name__ == "__main__": 214 | main() 215 | 216 | 217 | 218 | 219 | -------------------------------------------------------------------------------- /readme.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/policy_diffusion/61d5885e809ab8d92db18d79f7a82b6461cf4343/readme.txt -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | 
jinja2==2.7.3 2 | elasticsearch==1.5 3 | enum34==1.0.4 4 | cherrypy==3.8.0 5 | numba==0.17.0 6 | ujson==1.33 7 | -------------------------------------------------------------------------------- /scripts/bill_to_bill_analysis.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import json 3 | 4 | with open('/Users/mattburg/Dropbox/bill_similarity_matrix.json') as data_file: 5 | data = json.load(data_file) 6 | 7 | #data = {'ca_1': [{'id': 'ks_2', 'score': 134, 'state': 'ks'}, {'id': 'wy_12', 'score': 80, 'state': 'wy'}],'wa_3': [{'id': 'ca_1', 'score': 20, 'state': 'ca'}, {'id': 'al_5', 'score': 40, 'state': 'al'}]} 8 | 9 | 10 | #Need list of dictionary to make it dataframe 11 | df_dict = {} 12 | df_list = [] 13 | for item in data: 14 | for i in range(len(data[item])): 15 | state_1 = item[0:2] 16 | state_2 = data[item][i]['state'] 17 | state_1_2 = '-'.join(sorted([state_1, state_2])) 18 | df_dict={ 19 | 'state_1': item[0:2], 20 | 'state_2':data[item][i]['state'], 21 | 'score': data[item][i]['score'], 22 | 'state_1_2': state_1_2} 23 | df_list.append(df_dict) 24 | 25 | 26 | df = pd.DataFrame(df_list) 27 | 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /scripts/bill_to_bill_parallel.sh: -------------------------------------------------------------------------------- 1 | cat /home/mburgess/policy_diffusion/data/bill_ids_random.txt | parallel --delay 0.1 \ 2 | --joblog /home/mburgess/bill_to_bill_alignments.log \ 3 | --tmpdir /mnt/data/sunlight/dssg/alignment_results/bill_to_bill_alignments \ 4 | --files \ 5 | /home/mburgess/policy_diffusion/scripts/generate_bill_to_bill_matches.py 6 | -------------------------------------------------------------------------------- /scripts/compare_constitutions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Written for Python 2.7 4 | 5 | from lid import LID 6 | from text_alignment import AffineLocalAligner,LocalAligner 7 | import database 8 | import json 9 | import base64 10 | import codecs 11 | import re 12 | import logging 13 | import os 14 | import traceback 15 | import sys 16 | from database import ElasticConnection 17 | from elasticsearch import Elasticsearch 18 | import time 19 | 20 | def get_constitution_alignments(query_doc): 21 | result_docs = constitution_lidy.find_constitution_alignments( 22 | query_doc, 23 | document_type = "text", 24 | split_sections = True, 25 | query_document_id = "text") 26 | return result_docs 27 | 28 | 29 | def main(): 30 | 31 | docs = ec.get_all_doc_ids('constitutions') 32 | 33 | for doc in docs: 34 | print doc 35 | doc_text = es_connection.get_source(index = 'constitutions', id = doc)['constitution'] 36 | result_doc = get_constitution_alignments(doc_text) 37 | open('/mnt/data/jwalsh/constitution_matches.json', 'a').write(json.dumps(result_doc)) 38 | time.sleep(1) 39 | 40 | 41 | 42 | if __name__ == "__main__": 43 | #elastic host ip 44 | ip_addy = os.environ['ELASTICSEARCH_IP'] 45 | 46 | #instantiate lid,aligner and elasticsearch objects 47 | aligner = AffineLocalAligner(match_score=4, mismatch_score=-1, gap_start=-3, gap_extend = -1.5) 48 | ec = ElasticConnection(host = ip_addy) 49 | es_connection = Elasticsearch([{'host': ip_addy, 'port': 9200}]) 50 | 51 | query_results_limit = os.environ['QUERY_RESULTS_LIMIT'] 52 | constitution_lidy = LID(query_results_limit=query_results_limit, elastic_host=ip_addy, 53 | lucene_score_threshold=0.01, 
aligner=aligner) 54 | 55 | main() 56 | 57 | -------------------------------------------------------------------------------- /scripts/generate_bill_to_bill_matches.py: -------------------------------------------------------------------------------- 1 | #!/opt/anaconda/bin/python 2 | 3 | from lid import LID 4 | from text_alignment import AffineLocalAligner,LocalAligner 5 | import database 6 | import json 7 | import base64 8 | import codecs 9 | import re 10 | import logging 11 | import os 12 | import traceback 13 | import sys 14 | from utils.general_utils import deadline,TimedOutExc 15 | from database import ElasticConnection 16 | import time 17 | 18 | class NoneDocException(Exception): 19 | pass 20 | 21 | 22 | @deadline(1000) 23 | def get_alignments(query_doc,bill_id): 24 | result_docs = lidy.find_state_bill_alignments(query_doc,document_type = "state_bill", 25 | split_sections = True,state_id = bill_id[0:2],query_document_id = bill_id) 26 | return result_docs 27 | 28 | 29 | def test(model_doc): 30 | return model_doc 31 | 32 | 33 | if __name__ == "__main__": 34 | 35 | #elastic host ip 36 | ip_addy = "54.203.12.145" 37 | 38 | #configure logging 39 | logging.basicConfig(filename="{0}/logs/model_legislation_alignment.log".format(os.environ['POLICY_DIFFUSION']), 40 | level=logging.DEBUG) 41 | logging.getLogger('elasticsearch').setLevel(logging.ERROR) 42 | logging.getLogger('urllib3').setLevel(logging.ERROR) 43 | logging.getLogger('json').setLevel(logging.ERROR) 44 | 45 | 46 | #instantiate lid,aligner and elasticsearch objects 47 | 48 | aligner = AffineLocalAligner(match_score=4, mismatch_score=-1, gap_start=-3, gap_extend = -1.5) 49 | 50 | ec = ElasticConnection(host = ip_addy) 51 | 52 | lidy = LID(query_results_limit=100,elastic_host = ip_addy,lucene_score_threshold = 0.1,aligner = aligner) 53 | 54 | #for line in sys.stdin: 55 | 56 | try: 57 | 58 | bill_id = sys.argv[1] 59 | query_doc = ec.get_bill_by_id(bill_id)['bill_document_last'] 60 | 61 | if query_doc is None: 62 | raise NoneDocException 63 | 64 | result_doc = get_alignments(query_doc,bill_id) 65 | logging.info("obtained alignments for {0}".format(bill_id)) 66 | print json.dumps(result_doc) 67 | 68 | except (KeyboardInterrupt, SystemExit): 69 | raise 70 | 71 | except NoneDocException: 72 | 73 | m = "none doc error query_id {0}: {1}".format(bill_id, "None doc error") 74 | logging.error(m) 75 | print json.dumps({"query_document_id": bill_id,"error":"none doc error"}) 76 | 77 | except TimedOutExc: 78 | 79 | m = "timeout error query_id {0}: {1}".format(bill_id, "timeout error") 80 | logging.error(m) 81 | print json.dumps({"query_document_id": bill_id,"error":"timeout error"}) 82 | 83 | except: 84 | 85 | trace_message = re.sub("\n+", "\t", traceback.format_exc()) 86 | trace_message = re.sub("\s+", " ", trace_message) 87 | trace_message = "<<{0}>>".format(trace_message) 88 | m = "random error query_id {0}: {1}".format(bill_id, trace_message) 89 | logging.error(m) 90 | print json.dumps({"query_document_id": bill_id,"error":"trace_message"}) 91 | -------------------------------------------------------------------------------- /scripts/generate_model_legislation_matches.py: -------------------------------------------------------------------------------- 1 | #!/opt/anaconda/bin/python 2 | 3 | from lid import LID 4 | from text_alignment import AffineLocalAligner,LocalAligner 5 | import database 6 | import json 7 | import base64 8 | import codecs 9 | import re 10 | import logging 11 | import os 12 | import traceback 13 | import sys 14 | from 
utils.general_utils import deadline,TimedOutExc 15 | import time 16 | 17 | 18 | 19 | @deadline(1000) 20 | def get_alignments(model_doc): 21 | result_docs = lidy.find_state_bill_alignments(model_doc['source'],document_type = "model_legislation", 22 | split_sections = True,query_document_id = model_doc['id']) 23 | return result_docs 24 | 25 | 26 | def test(model_doc): 27 | return model_doc 28 | 29 | 30 | if __name__ == "__main__": 31 | 32 | #elastic host ip 33 | ip_addy = "54.203.12.145" 34 | 35 | 36 | 37 | #configure logging 38 | logging.basicConfig(filename="{0}/logs/model_legislation_alignment.log".format(os.environ['POLICY_DIFFUSION']), 39 | level=logging.DEBUG) 40 | logging.getLogger('elasticsearch').setLevel(logging.ERROR) 41 | logging.getLogger('urllib3').setLevel(logging.ERROR) 42 | logging.getLogger('json').setLevel(logging.ERROR) 43 | 44 | 45 | #instantiate lid object 46 | 47 | aligner = AffineLocalAligner(match_score=4, mismatch_score=-1, gap_start=-3, gap_extend = -1.5) 48 | 49 | lidy = LID(query_results_limit=100,elastic_host = ip_addy,lucene_score_threshold = 0.1,aligner = aligner) 50 | 51 | for line in sys.stdin: 52 | model_doc = json.loads(line.strip()) 53 | 54 | try: 55 | result_doc = get_alignments(model_doc) 56 | #result_doc = test(model_doc) 57 | print json.dumps(result_doc) 58 | 59 | except (KeyboardInterrupt, SystemExit): 60 | raise 61 | except TimedOutExc: 62 | m = "timeout error query_id {0}: {1}".format(model_doc['id'], "timeout error") 63 | logging.error(m) 64 | print json.dumps({"query_document_id": model_doc['id'],"error":"timeout error"}) 65 | 66 | except: 67 | trace_message = re.sub("\n+", "\t", traceback.format_exc()) 68 | trace_message = re.sub("\s+", " ", trace_message) 69 | trace_message = "<<{0}>>".format(trace_message) 70 | m = "random error query_id {0}: {1}".format(model_doc['id'], trace_message) 71 | logging.error(m) 72 | print json.dumps({"query_document_id": model_doc['id'],"error": trace_message}) 73 | 74 | -------------------------------------------------------------------------------- /scripts/model_legislation_network.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | def func(x): 4 | x['weight'] = x.count() 5 | return x 6 | df = pd.read_csv("/Users/mattburg/Downloads/interest_groups_to_state_network_fixed.csv") 7 | df = df[df.score>100] 8 | df = df.groupby(df.edge_id).count() 9 | 10 | alec_total = 2208. 11 | alice_total = 1500.
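# The block below builds an undirected edge list from the grouped match counts:
# df was grouped by edge_id (strings like "alec_tx"), so df['lobby_id'] now holds the
# number of matching rows for each interest-group/state pair.  Each count is normalized
# by the group totals defined above (alec_total, alice_total -- presumably the number of
# model bills scanned per group) and printed as Source,Target,Weight,Type rows.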
12 | 13 | index = df.index 14 | ids = df['lobby_id'].tolist() 15 | 16 | print "Source,Target,Weight,Type" 17 | for x,y in zip(index,ids): 18 | s,t = x.split("_") 19 | if s == "alec": 20 | y = float(y)/alec_total 21 | elif s == "alice": 22 | y = float(y)/alice_total 23 | else: 24 | continue 25 | print "{0},{1},{2},{3}".format(s,t,y,"undirected") 26 | 27 | -------------------------------------------------------------------------------- /scripts/model_legislation_parallel.sh: -------------------------------------------------------------------------------- 1 | cat /mnt/data/sunlight/dssg/model_legislation/extracted_model_legislation.json | parallel --pipe --delay 1.0 \ 2 | --joblog /home/mburgess/model_legistlation_alignments.log \ 3 | --tmpdir /mnt/data/sunlight/dssg/alignment_results/model_legislation_alignments \ 4 | --files \ 5 | /home/mburgess/policy_diffusion/scripts/generate_model_legislation_matches.py 6 | -------------------------------------------------------------------------------- /scripts/model_legislation_to_bill_analysis.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import json 3 | from database import * 4 | import numpy as np 5 | 6 | #open json file 7 | alec_json = "/Users/eugeniagiraudy/Dropbox/DSSG/policy_diffusion/scripts/model_legislation_alignments.json" 8 | 9 | def create_bill_to_bill_matrix(jsonfile): 10 | ''' 11 | Converts a json file with matching text between model legislation and bills into a 12 | dataframe. 13 | 14 | ''' 15 | alignments = [json.loads(x.strip()) for x in open(jsonfile)] 16 | df_list = [] 17 | for i in range(len(alignments)): 18 | left_id = alignments[i]['query_document_id'] 19 | interest_group = left_id.split('_') 20 | interest_group = "_".join(interest_group[0:2]) 21 | try: 22 | for result in alignments[i]['alignment_results']: 23 | right_id = result['document_id'] 24 | score_list = [] 25 | for j in range(len(result['alignments'])): 26 | score = result['alignments'][j]['score'] 27 | score_list.append(score) 28 | #Need to decide whehter we want the sum, average, max 29 | score_max = max(score_list) 30 | df_list.append([interest_group, left_id,right_id,score_max,right_id[0:2],left_id+"_"+right_id,'undirected']) 31 | except KeyError: 32 | print left_id, 'failed' 33 | continue 34 | df = pd.DataFrame(df_list) 35 | df.columns = ['interst_group_id','model_legislation_id', 'unique_id','score_max','state','bill_ml_id','undirected'] 36 | return df 37 | 38 | 39 | def grab_ids_for_data_frame(df): 40 | ''' 41 | Grabs bill ids from ElasticSearch and adds it to a dataframe. 
42 | Outputs csv file with data frame containing model legislation to bills matches and 43 | information on date introduced and date signed 44 | 45 | Arguments: 46 | dataframe = data frame containing model legislation to bill analysis 47 | 48 | ''' 49 | bill_id_list = df['unique_id'] 50 | bill_id_list = bill_id_list.tolist() 51 | 52 | ec = ElasticConnection(host = '54.203.12.145', port = 9200) 53 | 54 | bill_dates = [] 55 | bill_signed = [] 56 | for bill in bill_id_list: 57 | bill_all = ec.get_bill_by_id(bill) 58 | date_introduced = bill_all['date_introduced'] 59 | date_signed = bill_all['date_signed'] 60 | bill_dates.append(date_introduced) 61 | bill_signed.append(date_signed) 62 | print bill 63 | bills_introd_signed = zip(bill_id_list, bill_dates, bill_signed) 64 | df_dates = pd.DataFrame(bills_introd_signed) 65 | df_dates.columns = ['unique_id', 'date_introduced', 'date_signed'] 66 | df2 = pd.merge(df, df_dates, on='unique_id') 67 | #Drop duplicates from the merge 68 | df3 = df2.drop_duplicates('bill_ml_id') 69 | return df3.to_csv('./model_legislation_to_bills_max_score.csv') 70 | 71 | 72 | 73 | 74 | #Analysis of ALEC 75 | 76 | df_alec = df3[(df3.interst_group_id =='alec_bills')|(df3.interst_group_id=='alec_old')] 77 | #eliminate cases where two model legislations influence the same bill 78 | df_alec = df_alec.groupby(['unique_id']).max() 79 | date = df_alec['date_introduced'] 80 | df_alec['year_introduced']=date.apply(lambda x:x.year) 81 | #eliminate cases wher states may have two identical bills for a given year 82 | df_grouped = df_alec.groupby(['state', 'year_introduced', 'model_legislation_id']).max() 83 | df_grouped.to_csv('./alec_model_legislation_to_bills_max_score_unique.csv') 84 | 85 | -------------------------------------------------------------------------------- /tests/text_alignment_tests.py: -------------------------------------------------------------------------------- 1 | 2 | import random 3 | import matplotlib.pyplot as plt 4 | import time 5 | import numpy as np 6 | from compiler.ast import flatten 7 | from alignment.sequence import Sequence 8 | from alignment.vocabulary import Vocabulary 9 | from alignment.sequencealigner import SimpleScoring, LocalSequenceAligner 10 | from utils.general_utils import find_subsequence 11 | from text_alignment import * 12 | 13 | 14 | #function from python package for testing results 15 | def seqToAlign(a, b, matchScore = 3, mismatchScore = -1, gapScore = -2): 16 | ''' 17 | args: 18 | a: list of words 19 | b: list of words 20 | matchScore: num 21 | mismatchScore: num 22 | gapScore: num 23 | Returns: 24 | o/w returns list of tuples with score and top alignments 25 | Description: 26 | helper function for finding alignments given a list of words 27 | ''' 28 | # Create a vocabulary and encode the sequences. 29 | a = a[0] 30 | b = b[0] 31 | seq1 = Sequence(a) 32 | seq2 = Sequence(b) 33 | v = Vocabulary() 34 | aEncoded = v.encodeSequence(seq1) 35 | bEncoded = v.encodeSequence(seq2) 36 | 37 | # Create a scoring and align the sequences using local aligner. 
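# SimpleScoring and LocalSequenceAligner come from the third-party 'alignment'
# package imported at the top of this file.  They perform Smith-Waterman style
# local alignment with per-token match/mismatch scores and a linear gap penalty,
# and serve here as a reference implementation against which the project's
# LocalAligner and AffineLocalAligner results are checked.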
38 | scoring = SimpleScoring(matchScore, mismatchScore) 39 | aligner = LocalSequenceAligner(scoring, gapScore) 40 | score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True) 41 | alignments = [v.decodeSequenceAlignment(encoded) for encoded in encodeds] 42 | 43 | return [(a.score, list(a.first), list(a.second)) for a in alignments] 44 | 45 | 46 | #testing functions 47 | def create_doc_test_cases(): 48 | #tests 49 | t1 = [['a']*100] 50 | t2 = [['b']*50 + ['a','a','b']*50] 51 | 52 | s1 = [[1]*100] 53 | s2 = [[2]*50 + [1,1,2]*50] 54 | 55 | v1 = [np.array([0, 1, 2, 3, 4, 7, 6, 3, 2, 1, 3])] 56 | v2 = [np.array([0, 1, 2, 3, 4, 4, 5, 2, 1, 2, 2])] 57 | 58 | w1 = [np.array([7, 6, 3, 2, 1, 3, 0, 1, 2, 3, 4])] 59 | w2 = [np.array([4, 5, 2, 1, 2, 2, 0, 1, 2, 3, 4])] 60 | 61 | tests = [(t1,t2), (s1,s2),(v1,v2), (w1,w2), ([np.random.choice(5, 30)],[np.random.choice(5, 30)]), \ 62 | ([np.array([1, 2, 0, 0, 1, 2, 3, 0, 1, 3, 0, 4, 3, 3, 0, 3, 0, 2, 0, 4, 3, 4, 2, \ 63 | 1, 1, 1, 1, 1, 0, 1])], [np.array([2, 0, 3, 1, 2, 4, 0, 1, 3, 0, 1, 4, 1, 3, 1, 4, 0, 0, 1, 2, 4, 0, 0, \ 64 | 2, 4, 1, 3, 2, 2, 4])])] 65 | 66 | return tests 67 | 68 | 69 | #LocalAligner algorithm tests 70 | def LocalAligner_unit_tests(): 71 | 72 | def test_alignment(t1,t2): 73 | f = LocalAligner() 74 | alignment=f.align(t1,t2) #default score is 3,-1,-2 75 | score, l, r = alignment.alignments[0] 76 | 77 | #find score of recovered alignment 78 | align_score = f.alignment_score(l,r) 79 | 80 | #run package algorithm 81 | alignments = seqToAlign(t1,t2) #default score is 3,-1,-2 82 | 83 | if score == align_score and score == alignments[0][0]: 84 | print 'package, backtraced alignment, and alignmnet matrix consistent' 85 | else: 86 | print 'dp_alg_score: ' + str(score) 87 | print 'alignment_score: ' + str(align_score) 88 | print 'package_score: ' + str(alignments[0][0]) 89 | 90 | #tests 91 | tests = create_doc_test_cases() 92 | for test in tests: 93 | z1, z2 = test 94 | test_alignment(z1,z2) 95 | 96 | f = LocalAligner() 97 | alignment=f.align(z1,z2) #default score is 3,-1,-2 98 | 99 | score, l, r = alignment.alignments[0] 100 | 101 | #run package algorithm 102 | alignments = seqToAlign(z1,z2) #default score is 3,-1,-2 103 | 104 | l_true, r_true = alignments[0][1:] 105 | 106 | for i in range(len(l)): 107 | if l[i] != l_true[i]: 108 | print 'not same sequence' 109 | break 110 | 111 | for i in range(len(r)): 112 | if r[i] != r_true[i]: 113 | print 'not same sequence' 114 | break 115 | 116 | 117 | def test_alignment(t1,t2, algorithm): 118 | f = algorithm() 119 | alignment=f.align(t1,t2) #default score is 3,-1,-2 120 | score, l, r = alignment.alignments[0] 121 | 122 | #find score of recovered alignment 123 | align_score = f.alignment_score(l,r) 124 | 125 | if score == align_score: 126 | print 'backtraced alignment and alignmnet matrix consistent' 127 | else: 128 | print 'backtraced alignment and alignmnet matrix not consistent' 129 | print 'dp_alg_score: ' + str(score) 130 | print 'alignment_score: ' + str(align_score) 131 | 132 | print 'left_alignment: ', l 133 | print 'right_alignment: ', r 134 | 135 | 136 | def generic_doc_unit_test(algorithm): 137 | 138 | tests = create_doc_test_cases() 139 | for test in tests: 140 | z1, z2 = test 141 | test_alignment(z1,z2, algorithm) 142 | 143 | 144 | def LocalAligner_speed_test(): 145 | 146 | input_sizes = [np.exp2(p) for p in range(2,7)] 147 | 148 | average_our_times = [] 149 | average_package_times = [] 150 | for input_size in input_sizes: 151 | print input_size 152 | v1 = 
[np.random.randint(0,10,input_size)] 153 | v2 = [np.random.randint(0,10,input_size)] 154 | our_times = [] 155 | package_times = [] 156 | f = LocalAligner() 157 | for i in range(2): 158 | t1 = time.time() 159 | f.align(v1,v2) 160 | our_times.append(time.time()-t1) 161 | 162 | t2 = time.time() 163 | seqToAlign(v1,v2) 164 | package_times.append(time.time()-t2) 165 | 166 | average_our_times.append(np.mean(our_times)) 167 | average_package_times.append(np.mean(package_times)) 168 | 169 | plt.plot(input_sizes,average_package_times, color = 'b', label = 'package') 170 | plt.plot(input_sizes,average_our_times, color='r', label = 'our implementation') 171 | plt.legend(loc='upper right') 172 | plt.xlabel('input size') 173 | plt.ylim(0,0.02) 174 | plt.show() 175 | 176 | 177 | def generic_doc_speed_test(algorithm): 178 | ''' 179 | compares speed of algorithm to local alignment algorithm 180 | ''' 181 | 182 | input_sizes = [np.exp2(p) for p in range(2,7)] 183 | 184 | average_alg_times = [] 185 | average_local_times = [] 186 | for input_size in input_sizes: 187 | print input_size 188 | v1 = [np.random.randint(0,10,input_size)] 189 | v2 = [np.random.randint(0,10,input_size)] 190 | local_times = [] 191 | alg_times = [] 192 | f = LocalAligner() 193 | g = algorithm() 194 | for i in range(2): 195 | t1 = time.time() 196 | f.align(v1,v2) 197 | local_times.append(time.time()-t1) 198 | 199 | t2 = time.time() 200 | g.align(v1,v2) 201 | alg_times.append(time.time()-t2) 202 | 203 | average_local_times.append(np.mean(local_times)) 204 | average_alg_times.append(np.mean(alg_times)) 205 | 206 | return average_local_times, average_alg_times 207 | 208 | 209 | def doc_test_alignment_indices(algorithm): 210 | #tests 211 | tests = create_doc_test_cases() 212 | 213 | good_job = True 214 | for test in tests: 215 | 216 | left_text, right_text = test 217 | try: 218 | left_text[0] = left_text[0].tolist() 219 | right_text[0] = right_text[0].tolist() 220 | except: 221 | pass 222 | f = algorithm() 223 | Alignment = f.align(left_text,right_text) 224 | left, right = clean_alignment(Alignment.alignments[0]) 225 | 226 | 227 | left_start, left_end = find_subsequence(left, flatten(left_text)) 228 | right_start, right_end = find_subsequence(right, flatten(right_text)) 229 | 230 | if Alignment.alignment_indices[0]['left_start'] != left_start or \ 231 | Alignment.alignment_indices[0]['left_end'] != left_end or \ 232 | Alignment.alignment_indices[0]['right_start'] != right_start or \ 233 | Alignment.alignment_indices[0]['right_end'] != right_end: 234 | 235 | print 'alignment length: ', len(left) 236 | 237 | print 'indices are messed up' 238 | 239 | print 'left_start: ', Alignment.alignment_indices[0]['left_start'] 240 | print 'true left_start: ', left_start 241 | print 'left_end: ', Alignment.alignment_indices[0]['left_end'] 242 | print 'true left_end', left_end 243 | print '\n' 244 | 245 | print 'right_start: ', Alignment.alignment_indices[0]['right_start'] 246 | print 'true right_start: ', right_start 247 | print 'right_end: ', Alignment.alignment_indices[0]['right_end'] 248 | print 'true right_end: ', right_end 249 | 250 | print '\n' 251 | 252 | good_job = False 253 | 254 | if good_job: 255 | print 'indices worked' 256 | 257 | 258 | #SectionLocalAlignment Tests 259 | def create_section_tests(): 260 | tests = create_doc_test_cases() 261 | 262 | #convert tests into sections so 263 | #that it makes sense for case 264 | left_test = [] 265 | right_test = [] 266 | for test1, test2 in tests: 267 | left_test.append(list(test1[0])) 268 | 
right_test.append(list(test2[0])) 269 | 270 | return left_test, right_test 271 | 272 | 273 | def section_unit_tests(Algorithm): 274 | left_test, right_test = create_section_tests() 275 | 276 | f = Algorithm() 277 | Alignment = f.align(left_test, [flatten(right_test)]) 278 | 279 | good_job = True 280 | for score, left, right in Alignment.alignments: 281 | true_score = f.alignment_score(left, right) 282 | if true_score != score: 283 | print 'left: ', left 284 | print 'right: ', right 285 | print 'true alignment score: ', true_score 286 | print 'calculated score: ', score 287 | good_job = False 288 | 289 | if good_job: 290 | print "calculated alignment scores correctly" 291 | 292 | 293 | def section_speed_test(): 294 | 295 | input_sizes = [np.exp2(p) for p in range(2,9)] 296 | 297 | average_local_times = [] 298 | average_section_times = [] 299 | for input_size in input_sizes: 300 | print input_size 301 | v1 = [np.random.randint(0,10,input_size)] 302 | v2 = [np.random.randint(0,10,input_size)] 303 | 304 | cut1 = random.randint(0,len(v1)) 305 | cut2 = random.randint(cut1,len(v2)) 306 | cut3 = random.randint(cut2,len(v2)) 307 | w1 = [v1[0][:cut1], v1[0][cut1:cut2], v1[0][cut2:cut3]] 308 | 309 | local_times = [] 310 | section_times = [] 311 | for i in range(2): 312 | t1 = time.time() 313 | f = LocalAligner() 314 | f.align(v1,v2) 315 | local_times.append(time.time()-t1) 316 | 317 | t2 = time.time() 318 | f = LocalAligner() 319 | f.align(w1,v2) 320 | section_times.append(time.time()-t2) 321 | 322 | average_local_times.append(np.mean(local_times)) 323 | average_section_times.append(np.mean(section_times)) 324 | 325 | plt.plot(input_sizes,average_section_times, color = 'b', label = 'section local alignment') 326 | plt.plot(input_sizes,average_local_times, color='r', label = 'local alignment') 327 | plt.legend(loc='upper right') 328 | plt.xlabel('input size') 329 | plt.ylim(0,0.02) 330 | plt.show() 331 | 332 | 333 | def section_test_alignment_indices(): 334 | left_test, right_test = create_section_tests() 335 | left_test_flattened = flatten(left_test) 336 | right_test_flattened = flatten(right_test) 337 | 338 | f = LocalAligner() 339 | Alignment = f.align(left_test, [right_test_flattened]) 340 | 341 | good_job = True 342 | for i in range(len(Alignment.alignments)): 343 | left, right = clean_alignment(Alignment.alignments[i]) 344 | 345 | print 'alignment length: ', len(left) 346 | 347 | left_start, left_end = find_subsequence(left, left_test_flattened) 348 | right_start, right_end = find_subsequence(right, right_test_flattened) 349 | 350 | if Alignment.alignment_indices[i]['left_start'] != left_start or \ 351 | Alignment.alignment_indices[i]['left_end'] != left_end or \ 352 | Alignment.alignment_indices[i]['right_start'] != right_start or \ 353 | Alignment.alignment_indices[i]['right_end'] != right_end: 354 | 355 | print 'indices are messed up: ' 356 | 357 | print 'left_start: ', Alignment.alignment_indices[i]['left_start'] 358 | print 'true left_start: ', left_start 359 | print 'left_end: ', Alignment.alignment_indices[i]['left_end'] 360 | print 'true left_end', left_end 361 | print '\n' 362 | 363 | print 'right_start: ', Alignment.alignment_indices[i]['right_start'] 364 | print 'true right_start: ', right_start 365 | print 'right_end: ', Alignment.alignment_indices[i]['right_end'] 366 | print 'true right_end: ', right_end 367 | 368 | print '\n' 369 | 370 | good_job = False 371 | 372 | if good_job: 373 | print 'indices worked' 374 | 375 | 376 | 
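# Worked example of the helpers used throughout these tests: each entry of
# Alignment.alignments is a tuple (score, left_tokens, right_tokens) in which gaps
# are marked with '-'.  clean_alignment (defined below) simply strips the gap symbol, e.g.
#     clean_alignment((7, ['a', '-', 'b'], ['a', 'c', 'b'])) -> (['a', 'b'], ['a', 'c', 'b'])
# find_subsequence then locates the cleaned tokens inside the flattened source
# documents, which is how the index tests above recover the start/end positions.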
############################################################ 377 | ##helper functions 378 | def clean_alignment(alignment): 379 | ''' 380 | arg: 381 | alignment object 382 | returns: 383 | 2 list of alignment words without the alignment symbol 384 | ''' 385 | keep1 = [] 386 | keep2 = [] 387 | for item in alignment[1]: 388 | if item != '-': 389 | keep1.append(item) 390 | 391 | for item in alignment[2]: 392 | if item != '-': 393 | keep2.append(item) 394 | 395 | return (keep1, keep2) 396 | 397 | 398 | if __name__ == '__main__': 399 | print "running LocalAligner unit tests.... \n" 400 | LocalAligner_unit_tests() 401 | 402 | print "running LocalAligner speed tests.... \n" 403 | LocalAligner_speed_test() 404 | 405 | print "running LocalAligner index tests.... \n" 406 | doc_test_alignment_indices(LocalAligner) 407 | 408 | print "running AffineLocalAligner unit tests.... \n" 409 | generic_doc_unit_test(AffineLocalAligner) 410 | 411 | print "running AffineLocalAligner speed tests.... \n" 412 | generic_doc_speed_test(AffineLocalAligner) 413 | 414 | print "running section unit tests for localaligner.... \n" 415 | section_unit_tests(LocalAligner) 416 | 417 | print "running section unit tests for affinealigner.... \n" 418 | section_unit_tests(AffineLocalAligner) 419 | 420 | print "running section speed tests.... \n" 421 | section_speed_test() 422 | 423 | print 'running test on keeping track of indices for section algorithm..... \n' 424 | section_test_alignment_indices() 425 | 426 | print 'running speed test on Word2VecLocalAligner.... \n' --------------------------------------------------------------------------------
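For reference, a minimal sketch of how the components above are typically wired together, following the pattern of scripts/generate_bill_to_bill_matches.py; the Elasticsearch host and bill id shown here are placeholders rather than values taken from the repository:

from lid import LID
from text_alignment import AffineLocalAligner
from database import ElasticConnection
import json

elastic_host = "127.0.0.1"   # placeholder Elasticsearch host
bill_id = "ca_AB_1"          # placeholder state bill id ("<state>_<bill>")

# affine-gap local aligner with the scoring used by the batch scripts
aligner = AffineLocalAligner(match_score=4, mismatch_score=-1,
                             gap_start=-3, gap_extend=-1.5)

# Elasticsearch connection and the LID query object
ec = ElasticConnection(host=elastic_host)
lidy = LID(query_results_limit=100, elastic_host=elastic_host,
           lucene_score_threshold=0.1, aligner=aligner)

# pull the bill text from the index and search for aligned passages
query_doc = ec.get_bill_by_id(bill_id)['bill_document_last']
result_doc = lidy.find_state_bill_alignments(query_doc,
                                              document_type="state_bill",
                                              split_sections=True,
                                              state_id=bill_id[0:2],
                                              query_document_id=bill_id)
print json.dumps(result_doc)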