├── readme.txt
├── lid
│   ├── __init__.py
│   ├── etl
│   │   ├── __init__.py
│   │   ├── load_constitutions_into_elasticsearch.py
│   │   ├── extractors.py
│   │   └── scrapers.py
│   ├── utils
│   │   ├── __init__.py
│   │   ├── sunlight_utils.py
│   │   ├── text_cleaning.py
│   │   └── general_utils.py
│   ├── evaluation
│   │   ├── __init__.py
│   │   └── bills_for_evaluation_set.py
│   ├── .DS_Store
│   ├── config.py
│   ├── frontend.py
│   └── alignment_classifier.py
├── html
│   ├── bootstrap3
│   │   ├── css
│   │   │   ├── .Rhistory
│   │   │   └── custom.css
│   │   ├── fonts
│   │   │   ├── glyphicons-halflings-regular.eot
│   │   │   ├── glyphicons-halflings-regular.ttf
│   │   │   ├── glyphicons-halflings-regular.woff
│   │   │   └── glyphicons-halflings-regular.woff2
│   │   └── js
│   │       └── npm.js
│   ├── index.html
│   └── templates
│       └── searchdemo.html.jinja
├── requirements.txt
├── archive
│   ├── input
│   │   ├── .download_bulk_sunlight_files.sh.swp
│   │   ├── unzip_bulk_files.sh
│   │   ├── state_metadata.sql
│   │   ├── committee_metadata.sql
│   │   ├── opensecrets
│   │   │   └── opensecrets_candidates.sql
│   │   ├── bill_metadata.sql
│   │   ├── lobbyists
│   │   │   └── compile_lobbyist_lists.sh
│   │   ├── campaign_contributions.sh
│   │   ├── download_bulk_sunlight_files.sh
│   │   ├── legislators.sql
│   │   ├── Drakefile
│   │   ├── state_metadata.py
│   │   ├── committee_metadata.py
│   │   ├── bill_metadata.py
│   │   └── legislators.py
│   ├── prototype_text_alignment_algorithms.py
│   ├── score_alignments.py
│   ├── exploratory.R
│   ├── tfidf_ranking.py
│   └── classifier.py
├── scripts
│   ├── bill_to_bill_parallel.sh
│   ├── model_legislation_parallel.sh
│   ├── model_legislation_network.py
│   ├── bill_to_bill_analysis.py
│   ├── compare_constitutions.py
│   ├── generate_model_legislation_matches.py
│   ├── generate_bill_to_bill_matches.py
│   └── model_legislation_to_bill_analysis.py
├── db
│   ├── state_bill_index.json
│   ├── evaluation_mapping.json
│   ├── state_bill_mapping.json
│   └── elasticsearch.yml
├── LICENSE
├── bashrc_lid
├── .gitignore
├── README.md
├── data
│   ├── evaluation_set
│   │   └── bills_for_evaluation_set.csv
│   └── model_legislation_urls
│       └── clean_urls.txt
└── tests
    └── text_alignment_tests.py
/readme.txt:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/lid/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/lid/etl/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/lid/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/lid/evaluation/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/html/bootstrap3/css/.Rhistory:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/lid/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/policy_diffusion/HEAD/lid/.DS_Store
--------------------------------------------------------------------------------
/lid/config.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | #Global path variables
4 | DATA_PATH = "/mnt/elasticsearch/dssg"
5 |
6 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | jinja2==2.7.3
2 | elasticsearch==1.5
3 | enum34==1.0.4
4 | cherrypy==3.8.0
5 | numba==0.17.0
6 | ujson==1.33
7 |
--------------------------------------------------------------------------------
/archive/input/.download_bulk_sunlight_files.sh.swp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/policy_diffusion/HEAD/archive/input/.download_bulk_sunlight_files.sh.swp
--------------------------------------------------------------------------------
/html/bootstrap3/fonts/glyphicons-halflings-regular.eot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/policy_diffusion/HEAD/html/bootstrap3/fonts/glyphicons-halflings-regular.eot
--------------------------------------------------------------------------------
/html/bootstrap3/fonts/glyphicons-halflings-regular.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/policy_diffusion/HEAD/html/bootstrap3/fonts/glyphicons-halflings-regular.ttf
--------------------------------------------------------------------------------
/html/bootstrap3/fonts/glyphicons-halflings-regular.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/policy_diffusion/HEAD/html/bootstrap3/fonts/glyphicons-halflings-regular.woff
--------------------------------------------------------------------------------
/html/bootstrap3/fonts/glyphicons-halflings-regular.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/policy_diffusion/HEAD/html/bootstrap3/fonts/glyphicons-halflings-regular.woff2
--------------------------------------------------------------------------------
/archive/input/unzip_bulk_files.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | filenames=$(ls /mnt/data/sunlight/openstates_zipped_files/)
4 |
5 | for i in $filenames; do
6 |     dir_name=$(echo "${i}" | sed -E 's/201[0-9]-0[0-9]-0[0-9]-//g' | sed -E 's/-json.zip//g')
7 | unzip /mnt/data/sunlight/openstates_zipped_files/${i} -d /mnt/data/sunlight/openstates_unzipped/${dir_name}
8 | done
9 |
--------------------------------------------------------------------------------
/scripts/bill_to_bill_parallel.sh:
--------------------------------------------------------------------------------
1 | cat /home/mburgess/policy_diffusion/data/bill_ids_random.txt | parallel --delay 0.1 \
2 | --joblog /home/mburgess/bill_to_bill_alignments.log \
3 | --tmpdir /mnt/data/sunlight/dssg/alignment_results/bill_to_bill_alignments \
4 | --files \
5 | /home/mburgess/policy_diffusion/scripts/generate_bill_to_bill_matches.py
6 |
--------------------------------------------------------------------------------
/archive/input/state_metadata.sql:
--------------------------------------------------------------------------------
1 |
2 | DROP TABLE IF EXISTS state_metadata;
3 |
4 | CREATE TABLE state_metadata (
5 | name VARCHAR(20),
6 | abbreviation VARCHAR(2),
7 | lower_chamber_name VARCHAR(10),
8 | lower_chamber_title VARCHAR(15),
9 | upper_chamber_name VARCHAR(10),
10 | upper_chamber_title VARCHAR(15),
11 | feature_flags VARCHAR(50)
12 | );
13 |
--------------------------------------------------------------------------------
/scripts/model_legislation_parallel.sh:
--------------------------------------------------------------------------------
1 | cat /mnt/data/sunlight/dssg/model_legislation/extracted_model_legislation.json | parallel --pipe --delay 1.0 \
2 |     --joblog /home/mburgess/model_legislation_alignments.log \
3 | --tmpdir /mnt/data/sunlight/dssg/alignment_results/model_legislation_alignments \
4 | --files \
5 | /home/mburgess/policy_diffusion/scripts/generate_model_legislation_matches.py
6 |
--------------------------------------------------------------------------------
/html/bootstrap3/css/custom.css:
--------------------------------------------------------------------------------
1 | mark {
2 | background-color: yellow;
3 | color: black;
4 | }
5 |
6 | .span3 {
7 | height: 250px !important;
8 | overflow: scroll;
9 | }
10 |
11 | .span5 {
12 | height: 800px !important;
13 | overflow: scroll;
14 | }
15 | td {
16 | padding: 5px;
17 | }
18 |
19 | tr:hover { background: #efedf5; }
20 | td a {
21 | display: block;
22 | padding: 16px;
23 | }
24 |
--------------------------------------------------------------------------------
/archive/input/committee_metadata.sql:
--------------------------------------------------------------------------------
1 |
2 | DROP TABLE IF EXISTS committees;
3 |
4 | CREATE TABLE committees (
5 | id VARCHAR,
6 | state VARCHAR(2),
7 | chamber VARCHAR(10),
8 | committee VARCHAR,
9 | subcommittee VARCHAR,
10 | members JSON,
11 | sources VARCHAR,
12 | parent_id VARCHAR(10),
13 | created_at TIMESTAMP WITHOUT TIME ZONE,
14 | updated_at TIMESTAMP WITHOUT TIME ZONE,
15 | all_ids VARCHAR,
16 | level VARCHAR(5)
17 | );
18 |
--------------------------------------------------------------------------------
/archive/input/opensecrets/opensecrets_candidates.sql:
--------------------------------------------------------------------------------
1 | DROP TABLE IF EXISTS opensecrets.candidates;
2 |
3 | CREATE TABLE opensecrets.candidates (
4 | cycle INTEGER NOT NULL,
5 | fec_candidate_id VARCHAR(9) NOT NULL,
6 | candidate_id VARCHAR(9) NOT NULL,
7 | first_last_party VARCHAR(38) NOT NULL,
8 | party VARCHAR(7) NOT NULL,
9 | office_sought VARCHAR(4),
10 | office_held VARCHAR(4),
11 | currently_running BOOLEAN,
12 | VARCHAR(4),
13 | "RL" VARCHAR(4)
14 | );
15 |
--------------------------------------------------------------------------------
/html/bootstrap3/js/npm.js:
--------------------------------------------------------------------------------
1 | // This file is autogenerated via the `commonjs` Grunt task. You can require() this file in a CommonJS environment.
2 | require('../../js/transition.js')
3 | require('../../js/alert.js')
4 | require('../../js/button.js')
5 | require('../../js/carousel.js')
6 | require('../../js/collapse.js')
7 | require('../../js/dropdown.js')
8 | require('../../js/modal.js')
9 | require('../../js/tooltip.js')
10 | require('../../js/popover.js')
11 | require('../../js/scrollspy.js')
12 | require('../../js/tab.js')
13 | require('../../js/affix.js')
--------------------------------------------------------------------------------
/archive/input/bill_metadata.sql:
--------------------------------------------------------------------------------
1 |
2 | DROP TABLE IF EXISTS bill_metadata;
3 |
4 | CREATE TABLE bill_metadata (
5 | bill_id VARCHAR,
6 | title VARCHAR,
7 | alternate_titles JSON,
8 | versions VARCHAR,
9 | subjects VARCHAR,
10 | scraped_subjects VARCHAR,
11 | type VARCHAR,
12 | level VARCHAR,
13 | sponsors JSON,
14 | actions JSON,
15 | action_dates JSON,
16 | documents JSON,
17 | votes JSON,
18 | leg_id VARCHAR,
19 | state CHAR(2),
20 | chamber VARCHAR,
21 | session VARCHAR,
22 | all_ids VARCHAR,
23 | created_at TIMESTAMP WITHOUT TIME ZONE,
24 | updated_at TIMESTAMP WITHOUT TIME ZONE
25 | );
26 |
--------------------------------------------------------------------------------
/archive/input/lobbyists/compile_lobbyist_lists.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 |
4 | ## ILLINOIS ##
5 | #url=
6 |
7 |
8 | ## MICHIGAN ##
9 | # http://miboecfr.nictusa.com/cgi-bin/cfr/lobby_srch_res.cgi
10 |
11 | url=http://miboecfr.nictusa.com/cfr/dumpdata/aaarZaGrk/mi_lobby.sh
12 | wget -O michigan_lobbyists.txt --user-agent="jtwalsh@uchicago.edu" $url
13 |
14 | #sed -E 's/\t/,/g' michigan_lobbyists.csv | sed 's/#/ Number/g' | sed -E 's/\(MaxLen=(.){1,3}\)//g'
15 |
16 | # http://miboecfr.nictusa.com/cfr/dumpdata/aaa3AaiZp/mi_lobby.sh
17 |
18 | # second line of the file has metadata
19 | # the bottom of the file has garbage too
20 |
21 |
--------------------------------------------------------------------------------
/archive/input/campaign_contributions.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | source default_profile
4 |
5 | rm /mnt/data/sunlight/followthemoney/contributions.csv
6 |
7 | for state in AL AK AZ AR CA CO CT DE FL GA HI ID IL IN IA KS KY LA ME MD MA MI MN MS MO MT NE NV NH NJ NM NY NC ND OH OK OR PA RI SC SD TN TX UT VT VA WA WV WI WY
8 | do
9 | url="http://www.followthemoney.org/aaengine/aafetch.php?s=$state&law-ot=S,H&gro=d-id&APIKey=$FOLLOWTHEMONEYKEY&mode=csv"
10 | wget -O- --header="Accept: text/html" --user-agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:21.0) Gecko/20100101 Firefox/21.0" $url >> /mnt/data/sunlight/followthemoney/contributions.csv
11 | done
12 |
--------------------------------------------------------------------------------
/db/state_bill_index.json:
--------------------------------------------------------------------------------
1 | {
2 | "settings": {
3 | "index": {
4 | "number_of_shards": 1,
5 | "number_of_replicas": 0
6 | },
7 | "analysis": {
8 | "filter": {
9 | "my_shingle_filter": {
10 | "type": "shingle",
11 | "min_shingle_size": 2,
12 | "max_shingle_size": 4,
13 | "output_unigrams": "false"
14 | }
15 | },
16 | "analyzer": {
17 | "my_shingle_analyzer": {
18 | "type": "custom",
19 | "tokenizer": "standard",
20 | "filter": [
21 | "lowercase",
22 | "my_shingle_filter"
23 | ]
24 | }
25 | }
26 | }
27 | }
28 | }
29 |
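
A minimal sketch of applying these settings together with db/state_bill_mapping.json, assuming the elasticsearch-py 1.x client pinned in requirements.txt (the index name "state_bills" and the localhost address are illustrative):

    import json
    from elasticsearch import Elasticsearch

    es = Elasticsearch([{"host": "localhost", "port": 9200}])

    # Create the index with the shingle analyzer defined above ...
    with open("db/state_bill_index.json") as f:
        es.indices.create(index="state_bills", body=json.load(f))

    # ... then register the bill_document mapping, whose string fields use
    # my_shingle_analyzer for their "shingles" sub-fields.
    with open("db/state_bill_mapping.json") as f:
        es.indices.put_mapping(index="state_bills", doc_type="bill_document", body=json.load(f))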
--------------------------------------------------------------------------------
/archive/input/download_bulk_sunlight_files.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | ### DOWNLOAD BULK DATA ###
4 | eval $(cat /home/jwalsh/policy_diffusion/default_profile | sed 's/^/export /')
5 | state_abbrevs=$(psql -t -c "SELECT abbreviation FROM state_metadata WHERE bills_identified IS NULL AND abbreviation > 'l' ORDER BY abbreviation;")
6 | user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.65 Safari/537.36"
7 | month="07" # the first day of this month is the last day of records to download
8 | for i in $state_abbrevs; do
9 | urls="$urls -O http://static.openstates.org/downloads/2016-${month}-01-${i}-json.zip"
10 | done
11 | curl -A "$user_agent" $urls
12 |
13 |
--------------------------------------------------------------------------------
/scripts/model_legislation_network.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 | def func(x):
4 | x['weight'] = x.count()
5 | return x
6 | df = pd.read_csv("/Users/mattburg/Downloads/interest_groups_to_state_network_fixed.csv")
7 | df = df[df.score>100]
8 | df = df.groupby(df.edge_id).count()
9 |
10 | alec_total = 2208.
11 | alice_total = 1500.
12 |
13 | index = df.index
14 | ids = df['lobby_id'].tolist()
15 |
16 | print "Source,Target,Weight,Type"
17 | for x,y in zip(index,ids):
18 | s,t = x.split("_")
19 | if s == "alec":
20 | y = float(y)/alec_total
21 | elif s == "alice":
22 | y = float(y)/alice_total
23 | else:
24 | continue
25 | print "{0},{1},{2},{3}".format(s,t,y,"undirected")
26 |
27 |
--------------------------------------------------------------------------------
/archive/input/legislators.sql:
--------------------------------------------------------------------------------
1 |
2 | DROP TABLE IF EXISTS legislators;
3 |
4 | CREATE TABLE legislators (
5 | id VARCHAR,
6 | votesmart_id VARCHAR,
7 | transparencydata_id VARCHAR,
8 | first_name VARCHAR,
9 | middle_name VARCHAR,
10 | last_name VARCHAR,
11 | suffixes VARCHAR,
12 | full_name VARCHAR,
13 | party VARCHAR,
14 | active BOOLEAN,
15 | url VARCHAR,
16 | photo_url VARCHAR,
17 | office_address VARCHAR,
18 | office_phone VARCHAR,
19 | leg_id VARCHAR,
20 | chamber VARCHAR,
21 | district VARCHAR,
22 | state VARCHAR,
23 | offices JSON,
24 | email VARCHAR,
25 | roles JSON,
26 | old_roles JSON,
27 | all_legislative_ids VARCHAR,
28 | level VARCHAR,
29 | sources JSON,
30 | created_at TIMESTAMP WITHOUT TIME ZONE,
31 | updated_at TIMESTAMP WITHOUT TIME ZONE
32 | );
33 |
--------------------------------------------------------------------------------
/lid/etl/load_constitutions_into_elasticsearch.py:
--------------------------------------------------------------------------------
1 | #!/bin/python
2 |
3 | import time
4 | import glob
5 | import json
6 | import requests
7 | from io import open
8 | from elasticsearch import Elasticsearch
9 |
10 | files = glob.glob("*.txt")
11 | es = Elasticsearch([{'host': "54.244.236.175", 'port': 9200}])
12 |
13 | for file in files:
14 | print file
15 | state_year = file.split(".")[0]
16 | state = state_year[:-5]
17 | year = int(state_year[-4:])
18 | file_text = open(file, 'r', encoding='ISO-8859-1').read()
19 | json_object = {
20 | "document_type": "constitution",
21 | "state": state,
22 | "year": year,
23 | "constitution": file_text
24 | }
25 |
26 | es.index(index="constitutions", doc_type="constitution", id=state_year, body=json.dumps(json_object))
27 | time.sleep(1)
28 |
--------------------------------------------------------------------------------
/db/evaluation_mapping.json:
--------------------------------------------------------------------------------
1 | {
2 | "bill_document": {
3 | "dynamic": "false",
4 | "properties": {
5 | "bill_document_last": {
6 | "type": "string",
7 | "term_vector": "yes",
8 | "index": "analyzed",
9 | "_size": {
10 | "enabled": true,
11 | "store": true
12 | },
13 | "fields": {
14 | "shingles": {
15 | "type": "string",
16 | "analyzer": "my_shingle_analyzer"
17 | } }
18 | },
19 | "match": {
20 | "type": "string",
21 | "index": "not_analyzed"
22 | },
23 | "state": {
24 | "type": "string",
25 | "index": "not_analyzed"
26 | },
27 | "unique_id": {
28 | "type": "string",
29 | "index": "not_analyzed"
30 | }
31 | }
32 | }
33 | }
--------------------------------------------------------------------------------
/archive/input/Drakefile:
--------------------------------------------------------------------------------
1 | PROFILE:=default_profile
2 | %include $[PROFILE]
3 |
4 | psql()
5 | psql -v ON_ERROR_STOP=1 -f $[INPUT] && touch $[OUTPUT]
6 |
7 |
8 |
9 |
10 | ; GRAB STATE LEGISLATIVE METADATA FROM SUNLIGHT
11 | /mnt/data/sunlight/data/input/state_metadata.csv <- [-timestamp]
12 | ; input/./state_metadata.py | sed -E "s/u?'//g" > $OUTPUT
13 |
14 | ; CREATE TABLE / COPY FOR STATE METADATA
15 | ;psql/input/touch_state_metadata <- input/state_metadata.sql, data/input/state_metadata.csv [method:psql]
16 |
17 |
18 |
19 | ; CREATE TABLE FOR BILL METADATA
20 | ; (sql file creates the table; the python script pushes the data)
21 | ;psql/input/touch_bill_metadata <- input/bill_metadata.sql [method:psql]
22 |
23 | ; GRAB BILL METADATA FROM SUNLIGHT
24 | ;data/input/touch_bill_metadata <- input/download_bulk_sunlight_files.sh
25 | ; bash $INPUT && touch $OUTPUT
26 |
27 |
28 |
--------------------------------------------------------------------------------
/scripts/bill_to_bill_analysis.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import json
3 |
4 | with open('/Users/mattburg/Dropbox/bill_similarity_matrix.json') as data_file:
5 | data = json.load(data_file)
6 |
7 | #data = {'ca_1': [{'id': 'ks_2', 'score': 134, 'state': 'ks'}, {'id': 'wy_12', 'score': 80, 'state': 'wy'}],'wa_3': [{'id': 'ca_1', 'score': 20, 'state': 'ca'}, {'id': 'al_5', 'score': 40, 'state': 'al'}]}
8 |
9 |
10 | # Need a list of dictionaries to build the dataframe
11 | df_dict = {}
12 | df_list = []
13 | for item in data:
14 | for i in range(len(data[item])):
15 | state_1 = item[0:2]
16 | state_2 = data[item][i]['state']
17 | state_1_2 = '-'.join(sorted([state_1, state_2]))
18 | df_dict={
19 | 'state_1': item[0:2],
20 | 'state_2':data[item][i]['state'],
21 | 'score': data[item][i]['score'],
22 | 'state_1_2': state_1_2}
23 | df_list.append(df_dict)
24 |
25 |
26 | df = pd.DataFrame(df_list)
27 |
28 |
29 |
30 |
31 |
32 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2015 Data Science for Social Good
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
23 |
--------------------------------------------------------------------------------
/bashrc_lid:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # make sure path to this project is set
4 | if [ ! -n "$POLICY_DIFFUSION" ]; then
5 | echo "Error: \$POLICY_DIFFUSION environment variable is not set"
6 | return
7 | fi
8 |
9 | # ensure user specified directory for log files
10 | if [ ! -n "$LOGFILE_DIRECTORY" ]; then
11 | echo "Error: \$LOGFILE_DIRECTORY environment variable is not set"
12 | return
13 | fi
14 |
15 | # ensure users specified a directory for temporary files
16 | if [ ! -n "$TEMPFILE_DIRECTORY" ]; then
17 | echo "Error: \$TEMPFILE_DIRECTORY environment variable is not set"
18 | return
19 | fi
20 |
21 | # ensure users specified the IP address for the ElasticSearch instance
22 | if [ ! -n "$ELASTICSEARCH_IP" ]; then
23 | echo "Error: \$ELASTICSEARCH_IP environment variable is not set"
24 | return
25 | fi
26 |
27 | # add python code to path
28 | export PYTHONPATH=${POLICY_DIFFUSION}/lid:${PYTHONPATH}
29 | export PYTHONPATH=${POLICY_DIFFUSION}/lid/etl:${PYTHONPATH}
30 | export PYTHONPATH=${POLICY_DIFFUSION}/lid/utils:${PYTHONPATH}
31 | export PYTHONPATH=${POLICY_DIFFUSION}/lid/evaluation:${PYTHONPATH}
32 | export PYTHONPATH=${POLICY_DIFFUSION}/scripts:${PYTHONPATH}
33 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Drake log
2 | drake.log
3 |
4 | # Database info
5 | default_profile
6 |
7 | # Sunlight key
8 | .sunlight.*
9 |
10 | # IPython Notebook checkpoints
11 | .ipynb_checkpoints/
12 |
13 | # Byte-compiled / optimized / DLL files
14 | __pycache__/
15 | *.py[cod]
16 |
17 | # C extensions
18 | *.so
19 |
20 | # Distribution / packaging
21 | .Python
22 | env/
23 | build/
24 | develop-eggs/
25 | dist/
26 | downloads/
27 | eggs/
28 | lib/
29 | lib64/
30 | parts/
31 | sdist/
32 | var/
33 | *.egg-info/
34 | .installed.cfg
35 | *.egg
36 |
37 | # PyInstaller
38 | # Usually these files are written by a python script from a template
39 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
40 | *.manifest
41 | *.spec
42 |
43 | # Installer logs
44 | pip-log.txt
45 | pip-delete-this-directory.txt
46 |
47 | # Unit test / coverage reports
48 | htmlcov/
49 | .tox/
50 | .coverage
51 | .cache
52 | nosetests.xml
53 | coverage.xml
54 |
55 | # Translations
56 | *.mo
57 | *.pot
58 |
59 | # Django stuff:
60 | *.log
61 |
62 | # Sphinx documentation
63 | docs/_build/
64 |
65 | # PyBuilder
66 | target/
67 |
68 | # Drake
69 | drake.*
70 | .drake/
71 |
72 | #pycharm
73 | *.idea
74 |
75 |
76 |
77 |
--------------------------------------------------------------------------------
/scripts/compare_constitutions.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # Written for Python 2.7
4 |
5 | from lid import LID
6 | from text_alignment import AffineLocalAligner,LocalAligner
7 | import database
8 | import json
9 | import base64
10 | import codecs
11 | import re
12 | import logging
13 | import os
14 | import traceback
15 | import sys
16 | from database import ElasticConnection
17 | from elasticsearch import Elasticsearch
18 | import time
19 |
20 | def get_constitution_alignments(query_doc):
21 | result_docs = constitution_lidy.find_constitution_alignments(
22 | query_doc,
23 | document_type = "text",
24 | split_sections = True,
25 | query_document_id = "text")
26 | return result_docs
27 |
28 |
29 | def main():
30 |
31 | docs = ec.get_all_doc_ids('constitutions')
32 |
33 | for doc in docs:
34 | print doc
35 | doc_text = es_connection.get_source(index = 'constitutions', id = doc)['constitution']
36 | result_doc = get_constitution_alignments(doc_text)
37 |         open('/mnt/data/jwalsh/constitution_matches.json', 'a').write(json.dumps(result_doc) + "\n")
38 | time.sleep(1)
39 |
40 |
41 |
42 | if __name__ == "__main__":
43 | #elastic host ip
44 | ip_addy = os.environ['ELASTICSEARCH_IP']
45 |
46 | #instantiate lid,aligner and elasticsearch objects
47 | aligner = AffineLocalAligner(match_score=4, mismatch_score=-1, gap_start=-3, gap_extend = -1.5)
48 | ec = ElasticConnection(host = ip_addy)
49 | es_connection = Elasticsearch([{'host': ip_addy, 'port': 9200}])
50 |
51 |     query_results_limit = int(os.environ['QUERY_RESULTS_LIMIT'])
52 | constitution_lidy = LID(query_results_limit=query_results_limit, elastic_host=ip_addy,
53 | lucene_score_threshold=0.01, aligner=aligner)
54 |
55 | main()
56 |
57 |
--------------------------------------------------------------------------------
/archive/input/state_metadata.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 |
4 | from sunlight import openstates
5 | import psycopg2
6 | import csv
7 | import sys
8 | import re
9 |
10 |
11 |
12 | # GRAB DATABASE INFO FROM default_profile
13 | db_info = []
14 | with open('default_profile', 'rb') as db_file:
15 | reader = csv.reader(db_file, delimiter='=', quotechar='"')
16 | for row in reader:
17 | db_info.append(row[1])
18 |
19 |
20 | # CONNECT TO DATABASE
21 | conn = psycopg2.connect(host = db_info[0], database = db_info[1], user = db_info[2], password = db_info[3])
22 | cur = conn.cursor()
23 |
24 |
25 | # FUNCTION TO PARSE STATE METADATA
26 | def parse_state_metadata(state_metadata):
27 | name = state_metadata['name']
28 | abbreviation = state_metadata['abbreviation']
29 | if 'lower' in state_metadata['chambers']:
30 | lower_chamber_name = state_metadata['chambers']['lower']['name']
31 | lower_chamber_title = state_metadata['chambers']['lower']['title']
32 | else:
33 | lower_chamber_name = None
34 | lower_chamber_title = None
35 | upper_chamber_name = state_metadata['chambers']['upper']['name']
36 | upper_chamber_title = state_metadata['chambers']['upper']['title']
37 | feature_flags = ', '.join(state_metadata['feature_flags'])
38 | return((name, abbreviation, lower_chamber_name, lower_chamber_title,
39 |            upper_chamber_name, upper_chamber_title, feature_flags))
40 |
41 |
42 | # GRAB THE DATA FROM SUNLIGHT API
43 | state_metadata = openstates.all_metadata()
44 |
45 |
46 | # PARSE SUNLIGHT DATA AND WRITE TO POSTGRES
47 | temp_state_metadata = []
48 | for state in state_metadata:
49 | temp_state_metadata.append(parse_state_metadata(state))
50 |
51 | args_str = ','.join(cur.mogrify("(%s,%s,%s,%s,%s,%s,%s)", x) for x in temp_state_metadata)
52 | cur.execute("INSERT INTO state_metadata VALUES " + args_str)
53 | conn.commit()
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Legislative Influence Detector
2 |
3 | Legislators often lack the time to write bills, so they tend to rely on outside groups to help. Researchers and concerned citizens would like to know who’s writing legislative bills, but trying to read those bills, let alone trace their source, is tedious and time consuming. This is especially true at the state and local levels, where arguably more important policy decisions are made every day.
4 |
5 | This project provides tools to help analyze and access government bills. Using the Sunlight Foundation’s collection of state bills and model legislation scraped from lobbying groups around the country, we built tools to shed light on the origination and diffusion of policy ideas across states, the effectiveness of various lobbying organizations, and the democratic nature of individual bills, all in near real time.
6 |
7 | # How does it work?
8 |
9 | We use the Smith-Waterman local-alignment algorithm to find matching text across documents. The algorithm compares the documents word by word, adding points for matches and subtracting points for mismatches and gaps. Unfortunately, local alignment is far too slow to run on every pair of documents in a corpus this large; it could take thousands of years to finish analyzing the legislation. We improved the speed of the analysis by first limiting the number of documents that need to be compared. Elasticsearch, our database of choice for this project, efficiently computes Lucene relevance scores. When we use LID to search for a document, Elasticsearch quickly compares it against all others and returns the 100 most similar documents as measured by their Lucene scores. We then run the local-alignment algorithm on only those 100.
10 |
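A minimal sketch of the candidate-retrieval stage, assuming the elasticsearch 1.x Python client pinned in requirements.txt (the index name `state_bills`, the host, and the input file are illustrative; the `bill_document_last` field comes from db/state_bill_mapping.json):

```python
from elasticsearch import Elasticsearch

es = Elasticsearch([{"host": "localhost", "port": 9200}])
query_text = open("my_bill.txt").read()  # the document whose origins we want to trace

# Stage 1: Lucene scoring narrows the corpus to the ~100 most similar bills.
hits = es.search(
    index="state_bills",
    body={"size": 100,
          "query": {"match": {"bill_document_last": query_text}}},
)["hits"]["hits"]

# Stage 2: run the expensive Smith-Waterman local alignment only on these
# candidates (see text_alignment.py for the aligners LID uses).
candidate_texts = [hit["_source"]["bill_document_last"] for hit in hits]
```
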
11 | # How to use it?
12 |
13 | * The text_alignment.py file gives our implementation of the Smith-Waterman algorithm. Feel free to use it!
14 |
15 | # Important Files
16 |
17 | * text_alignment.py: contains our fast implementation of the Smith-Waterman algorithm (see the usage sketch below).
18 |
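A hedged usage sketch of the aligners, based on how they are instantiated in scripts/ and archive/prototype_text_alignment_algorithms.py (the exact constructor arguments and the structure of the returned `Alignment` object are defined in text_alignment.py, not reproduced in this listing):

```python
from text_alignment import LocalAligner, AffineLocalAligner

# Local aligner with a single linear gap penalty.
aligner = LocalAligner(match_score=3, mismatch_score=-1, gap_score=-2)

# Affine-gap variant used by the generate_*_matches.py scripts.
affine_aligner = AffineLocalAligner(match_score=4, mismatch_score=-1,
                                    gap_start=-3, gap_extend=-1.5)

left = "an act relating to the regulation of firearms".split()
right = "a bill relating to regulation of firearms and ammunition".split()

# align() takes lists of word lists (one list per document section) and
# returns an Alignment object holding the scored matching spans.
alignment = aligner.align([left], [right])
```
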
19 | ## Environment Variables
20 | * POLICY_DIFFUSION: path to this repository; used to build PYTHONPATH and log-file paths
21 | * LOGFILE_DIRECTORY: should not live inside the repository, to prevent repository bloat
22 | * TEMPFILE_DIRECTORY: stores files created temporarily while the algorithm runs
23 | * ELASTICSEARCH_IP: IP address of the Elasticsearch instance (read via `os.environ`, as shown below)
24 |
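The Python scripts read these values straight from the environment; a minimal sketch of the pattern used in scripts/compare_constitutions.py and the logging setup of the scripts/generate_*_matches.py scripts:

```python
import os

ip_addy = os.environ["ELASTICSEARCH_IP"]  # Elasticsearch host for ElasticConnection/LID
log_file = "{0}/logs/model_legislation_alignment.log".format(os.environ["POLICY_DIFFUSION"])
```
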
25 |
--------------------------------------------------------------------------------
/archive/input/committee_metadata.py:
--------------------------------------------------------------------------------
1 |
2 | from sunlight import openstates
3 | import psycopg2
4 | from psycopg2.extras import Json
5 | import json
6 | import csv
7 | import sys
8 | import re
9 | import os
10 |
11 |
12 | # GRAB DATABASE INFO FROM default_profile
13 | db_info = []
14 | with open('default_profile', 'rb') as db_file:
15 | reader = csv.reader(db_file, delimiter='=', quotechar='"')
16 | for row in reader:
17 | db_info.append(row[1])
18 |
19 |
20 | # CONNECT TO DATABASE
21 | conn = psycopg2.connect(host = db_info[0], database = db_info[1], user = db_info[2], password = db_info[3])
22 | cur = conn.cursor()
23 |
24 |
25 | # PARSE COMMITTEE METADATA
26 | def parse_committee_metadata(committee_metadata):
27 | id_ = committee_metadata['id']
28 | state = committee_metadata['state']
29 | chamber = committee_metadata['chamber']
30 | committee = committee_metadata['committee']
31 | subcommittee = committee_metadata['subcommittee']
32 | if len(committee_metadata['members']) > 0:
33 | members = Json(committee_metadata['members'][0])
34 | else:
35 | members = None
36 | sources = committee_metadata['sources'][0]['url']
37 | parent_id = committee_metadata['parent_id']
38 | created_at = committee_metadata['created_at']
39 | updated_at = committee_metadata['updated_at']
40 | if len(committee_metadata['all_ids']) > 0:
41 | all_ids = committee_metadata['all_ids'][0]
42 | else:
43 | all_ids = None
44 | if 'level' in committee_metadata:
45 | level = committee_metadata['level']
46 | else:
47 | level = None
48 |
49 | return((id_, state, chamber, committee, subcommittee, members,
50 | sources, parent_id, created_at, updated_at, all_ids, level))
51 |
52 |
53 |
54 | # GRAB COMMITTEE METADATA FROM FILES AND PUSH TO DATABASE
55 | temp_committee_metadata = []
56 | for path, subdirs, files in os.walk(r'/mnt/data/sunlight/openstates_unzipped/committees/'):
57 | for name in files:
58 | directory_file = os.path.join(path, name)
59 | with open(directory_file) as json_file:
60 | committee = json.load(json_file)
61 | parsed_data = parse_committee_metadata(committee)
62 | temp_committee_metadata.append(parsed_data)
63 |
64 | args_str = ','.join(cur.mogrify("(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)", x) for x in temp_committee_metadata)
65 | cur.execute("INSERT INTO committees VALUES " + args_str)
66 | conn.commit()
67 |
68 |
--------------------------------------------------------------------------------
/html/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
59 |
60 |
64 |
68 |
69 |
70 |
This is a test to see how we can compare query {{ query_string }}
71 |
This is a test to see how we can compare query and result text
72 |
73 |
74 |
75 |
--------------------------------------------------------------------------------
/scripts/generate_model_legislation_matches.py:
--------------------------------------------------------------------------------
1 | #!/opt/anaconda/bin/python
2 |
3 | from lid import LID
4 | from text_alignment import AffineLocalAligner,LocalAligner
5 | import database
6 | import json
7 | import base64
8 | import codecs
9 | import re
10 | import logging
11 | import os
12 | import traceback
13 | import sys
14 | from utils.general_utils import deadline,TimedOutExc
15 | import time
16 |
17 |
18 |
19 | @deadline(1000)
20 | def get_alignments(model_doc):
21 | result_docs = lidy.find_state_bill_alignments(model_doc['source'],document_type = "model_legislation",
22 | split_sections = True,query_document_id = model_doc['id'])
23 | return result_docs
24 |
25 |
26 | def test(model_doc):
27 | return model_doc
28 |
29 |
30 | if __name__ == "__main__":
31 |
32 | #elastic host ip
33 | ip_addy = "54.203.12.145"
34 |
35 |
36 |
37 | #configure logging
38 | logging.basicConfig(filename="{0}/logs/model_legislation_alignment.log".format(os.environ['POLICY_DIFFUSION']),
39 | level=logging.DEBUG)
40 | logging.getLogger('elasticsearch').setLevel(logging.ERROR)
41 | logging.getLogger('urllib3').setLevel(logging.ERROR)
42 | logging.getLogger('json').setLevel(logging.ERROR)
43 |
44 |
45 | #instantiate lid object
46 |
47 | aligner = AffineLocalAligner(match_score=4, mismatch_score=-1, gap_start=-3, gap_extend = -1.5)
48 |
49 | lidy = LID(query_results_limit=100,elastic_host = ip_addy,lucene_score_threshold = 0.1,aligner = aligner)
50 |
51 | for line in sys.stdin:
52 | model_doc = json.loads(line.strip())
53 |
54 | try:
55 | result_doc = get_alignments(model_doc)
56 | #result_doc = test(model_doc)
57 | print json.dumps(result_doc)
58 |
59 | except (KeyboardInterrupt, SystemExit):
60 | raise
61 | except TimedOutExc:
62 | m = "timeout error query_id {0}: {1}".format(model_doc['id'], trace_message)
63 | logging.error(m)
64 | print json.dumps({"query_document_id": model_doc['id'],"error":"timeout error"})
65 |
66 | except:
67 | trace_message = re.sub("\n+", "\t", traceback.format_exc())
68 | trace_message = re.sub("\s+", " ", trace_message)
69 | trace_message = "<<{0}>>".format(trace_message)
70 | m = "random error query_id {0}: {1}".format(model_doc['id'], trace_message)
71 | logging.error(m)
72 | print json.dumps({"query_document_id": model_doc['id'],"error":"trace_message"})
73 |
74 |
--------------------------------------------------------------------------------
/lid/utils/sunlight_utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import string
4 | import tempfile
5 | import importlib
6 | import subprocess
7 |
8 |
9 |
10 | PUNCTUATION = re.compile('[%s]' % re.escape(string.punctuation))
11 |
12 |
13 | def convert_pdf(filename, type='xml'):
14 | commands = {'text': ['pdftotext', '-layout', filename, '-'],
15 | 'text-nolayout': ['pdftotext', filename, '-'],
16 | 'xml': ['pdftohtml', '-xml', '-stdout', filename],
17 | 'html': ['pdftohtml', '-stdout', filename]}
18 | try:
19 | pipe = subprocess.Popen(commands[type], stdout=subprocess.PIPE,
20 | close_fds=True).stdout
21 | except OSError as e:
22 |         raise EnvironmentError("error running %s, missing executable? [%s]" %
23 |                                (' '.join(commands[type]), e))
24 | data = pipe.read()
25 | pipe.close()
26 | return data
27 |
28 |
29 | def pdfdata_to_text(data):
30 | with tempfile.NamedTemporaryFile(delete=True) as tmpf:
31 | tmpf.write(data)
32 | tmpf.flush()
33 | return convert_pdf(tmpf.name, 'text')
34 |
35 |
36 | def worddata_to_text(data):
37 | desc, txtfile = tempfile.mkstemp(prefix='tmp-worddata-', suffix='.txt')
38 | try:
39 | with tempfile.NamedTemporaryFile(delete=True) as tmpf:
40 | tmpf.write(data)
41 | tmpf.flush()
42 | subprocess.check_call(['timeout', '10', 'abiword',
43 | '--to=%s' % txtfile, tmpf.name])
44 | f = open(txtfile)
45 | text = f.read()
46 | tmpf.close()
47 | f.close()
48 | finally:
49 | os.remove(txtfile)
50 | os.close(desc)
51 | return text.decode('utf8')
52 |
53 |
54 | def text_after_line_numbers(lines):
55 | text = []
56 | for line in lines.splitlines():
57 | # real bill text starts with an optional space, line number
58 | # more spaces, then real text
59 | match = re.match('\s*\d+\s+(.*)', line)
60 | if match:
61 | text.append(match.group(1))
62 |
63 | # return all real bill text joined w/ newlines
64 | return '\n'.join(text).decode('utf-8', 'ignore')
65 |
66 |
67 | def plaintext(abbr, doc, doc_bytes):
68 | # use module to pull text out of the bytes
69 | module = importlib.import_module(abbr)
70 | text = module.extract_text(doc, doc_bytes)
71 |
72 | if not text:
73 | return
74 |
75 | if isinstance(text, unicode):
76 | text = text.encode('ascii', 'ignore')
77 | else:
78 | text = text.decode('utf8', 'ignore').encode('ascii', 'ignore')
79 | text = text.replace(u'\xa0', u' ') # nbsp -> sp
80 | text = PUNCTUATION.sub(' ', text) # strip punctuation
81 | text = re.sub('\s+', ' ', text) # collapse spaces
82 | return text
83 |
84 |
85 |
--------------------------------------------------------------------------------
/scripts/generate_bill_to_bill_matches.py:
--------------------------------------------------------------------------------
1 | #!/opt/anaconda/bin/python
2 |
3 | from lid import LID
4 | from text_alignment import AffineLocalAligner,LocalAligner
5 | import database
6 | import json
7 | import base64
8 | import codecs
9 | import re
10 | import logging
11 | import os
12 | import traceback
13 | import sys
14 | from utils.general_utils import deadline,TimedOutExc
15 | from database import ElasticConnection
16 | import time
17 |
18 | class NoneDocException(Exception):
19 | pass
20 |
21 |
22 | @deadline(1000)
23 | def get_alignments(query_doc,bill_id):
24 | result_docs = lidy.find_state_bill_alignments(query_doc,document_type = "state_bill",
25 | split_sections = True,state_id = bill_id[0:2],query_document_id = bill_id)
26 | return result_docs
27 |
28 |
29 | def test(model_doc):
30 | return model_doc
31 |
32 |
33 | if __name__ == "__main__":
34 |
35 | #elastic host ip
36 | ip_addy = "54.203.12.145"
37 |
38 | #configure logging
39 | logging.basicConfig(filename="{0}/logs/model_legislation_alignment.log".format(os.environ['POLICY_DIFFUSION']),
40 | level=logging.DEBUG)
41 | logging.getLogger('elasticsearch').setLevel(logging.ERROR)
42 | logging.getLogger('urllib3').setLevel(logging.ERROR)
43 | logging.getLogger('json').setLevel(logging.ERROR)
44 |
45 |
46 | #instantiate lid,aligner and elasticsearch objects
47 |
48 | aligner = AffineLocalAligner(match_score=4, mismatch_score=-1, gap_start=-3, gap_extend = -1.5)
49 |
50 | ec = ElasticConnection(host = ip_addy)
51 |
52 | lidy = LID(query_results_limit=100,elastic_host = ip_addy,lucene_score_threshold = 0.1,aligner = aligner)
53 |
54 | #for line in sys.stdin:
55 |
56 | try:
57 |
58 | bill_id = sys.argv[1]
59 | query_doc = ec.get_bill_by_id(bill_id)['bill_document_last']
60 |
61 | if query_doc is None:
62 | raise NoneDocException
63 |
64 | result_doc = get_alignments(query_doc,bill_id)
65 | logging.info("obtained alignments for {0}".format(bill_id))
66 | print json.dumps(result_doc)
67 |
68 | except (KeyboardInterrupt, SystemExit):
69 | raise
70 |
71 | except NoneDocException:
72 |
73 | m = "none doc error query_id {0}: {1}".format(bill_id, "None doc error")
74 | logging.error(m)
75 | print json.dumps({"query_document_id": bill_id,"error":"none doc error"})
76 |
77 | except TimedOutExc:
78 |
79 | m = "timeout error query_id {0}: {1}".format(bill_id, "timeout error")
80 | logging.error(m)
81 | print json.dumps({"query_document_id": bill_id,"error":"timeout error"})
82 |
83 | except:
84 |
85 | trace_message = re.sub("\n+", "\t", traceback.format_exc())
86 | trace_message = re.sub("\s+", " ", trace_message)
87 | trace_message = "<<{0}>>".format(trace_message)
88 | m = "random error query_id {0}: {1}".format(bill_id, trace_message)
89 | logging.error(m)
90 | print json.dumps({"query_document_id": bill_id,"error":"trace_message"})
91 |
--------------------------------------------------------------------------------
/scripts/model_legislation_to_bill_analysis.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import json
3 | from database import *
4 | import numpy as np
5 |
6 | #open json file
7 | alec_json = "/Users/eugeniagiraudy/Dropbox/DSSG/policy_diffusion/scripts/model_legislation_alignments.json"
8 |
9 | def create_bill_to_bill_matrix(jsonfile):
10 | '''
11 | Converts a json file with matching text between model legislation and bills into a
12 | dataframe.
13 |
14 | '''
15 | alignments = [json.loads(x.strip()) for x in open(jsonfile)]
16 | df_list = []
17 | for i in range(len(alignments)):
18 | left_id = alignments[i]['query_document_id']
19 | interest_group = left_id.split('_')
20 | interest_group = "_".join(interest_group[0:2])
21 | try:
22 | for result in alignments[i]['alignment_results']:
23 | right_id = result['document_id']
24 | score_list = []
25 | for j in range(len(result['alignments'])):
26 | score = result['alignments'][j]['score']
27 | score_list.append(score)
28 |                 #Need to decide whether we want the sum, the average, or the max
29 | score_max = max(score_list)
30 | df_list.append([interest_group, left_id,right_id,score_max,right_id[0:2],left_id+"_"+right_id,'undirected'])
31 | except KeyError:
32 | print left_id, 'failed'
33 | continue
34 | df = pd.DataFrame(df_list)
35 | df.columns = ['interst_group_id','model_legislation_id', 'unique_id','score_max','state','bill_ml_id','undirected']
36 | return df
37 |
38 |
39 | def grab_ids_for_data_frame(df):
40 | '''
41 |     Grabs bill dates from Elasticsearch and adds them to the dataframe.
42 |     Writes a csv of model-legislation-to-bill matches, with date_introduced and
43 |     date_signed, and returns the merged dataframe.
44 |
45 | Arguments:
46 | dataframe = data frame containing model legislation to bill analysis
47 |
48 | '''
49 | bill_id_list = df['unique_id']
50 | bill_id_list = bill_id_list.tolist()
51 |
52 | ec = ElasticConnection(host = '54.203.12.145', port = 9200)
53 |
54 | bill_dates = []
55 | bill_signed = []
56 | for bill in bill_id_list:
57 | bill_all = ec.get_bill_by_id(bill)
58 | date_introduced = bill_all['date_introduced']
59 | date_signed = bill_all['date_signed']
60 | bill_dates.append(date_introduced)
61 | bill_signed.append(date_signed)
62 | print bill
63 | bills_introd_signed = zip(bill_id_list, bill_dates, bill_signed)
64 | df_dates = pd.DataFrame(bills_introd_signed)
65 | df_dates.columns = ['unique_id', 'date_introduced', 'date_signed']
66 | df2 = pd.merge(df, df_dates, on='unique_id')
67 | #Drop duplicates from the merge
68 | df3 = df2.drop_duplicates('bill_ml_id')
69 |     df3.to_csv('./model_legislation_to_bills_max_score.csv')
70 |     return df3
71 |
72 | # Build the match dataframe and attach bill dates; df3 is used in the ALEC analysis below
73 | df3 = grab_ids_for_data_frame(create_bill_to_bill_matrix(alec_json))
74 | #Analysis of ALEC
75 |
76 | df_alec = df3[(df3.interst_group_id =='alec_bills')|(df3.interst_group_id=='alec_old')]
77 | #eliminate cases where two model legislations influence the same bill
78 | df_alec = df_alec.groupby(['unique_id']).max()
79 | date = df_alec['date_introduced']
80 | df_alec['year_introduced']=date.apply(lambda x:x.year)
81 | #eliminate cases where states may have two identical bills for a given year
82 | df_grouped = df_alec.groupby(['state', 'year_introduced', 'model_legislation_id']).max()
83 | df_grouped.to_csv('./alec_model_legislation_to_bills_max_score_unique.csv')
84 |
85 |
--------------------------------------------------------------------------------
/db/state_bill_mapping.json:
--------------------------------------------------------------------------------
1 | {
2 | "bill_document": {
3 | "dynamic": "false",
4 | "properties": {
5 | "actions": {
6 | "properties": {
7 | "action": {
8 | "type": "string",
9 | "index": "analyzed"
10 | },
11 | "actor": {
12 | "type": "string",
13 | "index": "analyzed"
14 | },
15 | "date": {
16 | "type": "date",
17 | "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd"
18 | },
19 | "type": {
20 | "type": "string",
21 | "index": "not_analyzed"
22 | }
23 | }
24 | },
25 | "bill_document_first": {
26 | "type": "string",
27 | "term_vector": "yes",
28 | "index": "analyzed",
29 | "_size": {
30 | "enabled": true,
31 | "store": true
32 | },
33 | "fields": {
34 | "shingles": {
35 | "type": "string",
36 | "analyzer": "my_shingle_analyzer"
37 | } }
38 | },
39 | "bill_document_last": {
40 | "type": "string",
41 | "term_vector": "yes",
42 | "index": "analyzed",
43 | "_size": {
44 | "enabled": true,
45 | "store": true
46 | },
47 | "fields": {
48 | "shingles": {
49 | "type": "string",
50 | "analyzer": "my_shingle_analyzer"
51 | } }
52 | },
53 | "bill_id": {
54 | "type": "string",
55 | "index": "not_analyzed"
56 | },
57 | "bill_title": {
58 | "type": "string",
59 | "term_vector": "yes",
60 | "index": "analyzed",
61 | "_size": {
62 | "enabled": true,
63 | "store": true
64 | },
65 | "fields": {
66 | "shingles": {
67 | "type": "string",
68 | "analyzer": "my_shingle_analyzer"
69 | } }
70 | },
71 | "bill_type": {
72 | "type": "string",
73 | "index": "not_analyzed"
74 | },
75 | "chamber": {
76 | "type": "string",
77 | "index": "not_analyzed"
78 | },
79 | "summary": {
80 | "type": "string",
81 | "term_vector": "yes",
82 | "index": "analyzed",
83 | "_size": {
84 | "enabled": true,
85 | "store": true
86 | },
87 | "fields": {
88 | "shingles": {
89 | "type": "string",
90 | "analyzer": "my_shingle_analyzer"
91 | } }
92 | },
93 | "short_title": {
94 | "type": "string",
95 | "term_vector": "yes",
96 | "index": "analyzed",
97 | "_size": {
98 | "enabled": true,
99 | "store": true
100 | },
101 | "fields": {
102 | "shingles": {
103 | "type": "string",
104 | "analyzer": "my_shingle_analyzer"
105 | } }
106 | },
107 | "date_created": {
108 | "type": "date",
109 | "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd"
110 | },
111 | "date_updated": {
112 | "type": "date",
113 | "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd"
114 | },
115 | "session": {
116 | "type": "string",
117 | "index": "analyzed"
118 | },
119 | "state": {
120 | "type": "string",
121 | "index": "not_analyzed"
122 | },
123 | "sunlight_id": {
124 | "type": "string",
125 | "index": "not_analyzed"
126 | },
127 | "unique_id": {
128 | "type": "string",
129 | "index": "not_analyzed"
130 | }
131 | }
132 | }
133 | }
--------------------------------------------------------------------------------
/archive/input/bill_metadata.py:
--------------------------------------------------------------------------------
1 |
2 | import psycopg2
3 | from psycopg2.extras import Json
4 | import json
5 | import csv
6 | import os
7 | import re
8 |
9 |
10 |
11 | # GRAB DATABASE INFO FROM default_profile
12 | db_info = []
13 | with open('/home/jwalsh/policy_diffusion/default_profile', 'rb') as db_file:
14 | reader = csv.reader(db_file, delimiter='=', quotechar='"')
15 | for row in reader:
16 | db_info.append(row[1])
17 |
18 |
19 | # CONNECT TO DATABASE
20 | conn = psycopg2.connect(host = db_info[0], database = db_info[1], user = db_info[2], password = db_info[3])
21 | cur = conn.cursor()
22 |
23 |
24 |
25 | # PARSE BILL METADATA FOR DATABASE INSERTION
26 | def parse_bill_metadata(bill_metadata):
27 | bill_id = bill_metadata['bill_id']
28 | title = bill_metadata['title']
29 | if len(bill_metadata['alternate_titles']) > 0:
30 | alternate_titles = Json(bill_metadata['alternate_titles'][0])
31 | else:
32 | alternate_titles = None
33 | if len(bill_metadata['versions']) > 0:
34 | versions = Json(bill_metadata['versions'][0])
35 | else:
36 | versions = None
37 | if 'subjects' in bill_metadata:
38 | if len(bill_metadata['subjects']) > 0:
39 | subjects = bill_metadata['subjects'][0]
40 | else:
41 | subjects = None
42 | else:
43 | subjects = None
44 | if 'scraped_subjects' in bill_metadata:
45 | if len(bill_metadata['scraped_subjects']) > 0:
46 | scraped_subjects = bill_metadata['scraped_subjects'][0]
47 | else:
48 | scraped_subjects = None
49 | else:
50 | scraped_subjects = None
51 | type_ = bill_metadata['type'][0]
52 | if 'level' in bill_metadata:
53 | level = bill_metadata['level']
54 | else:
55 | level = None
56 | if len(bill_metadata['sponsors']) > 0:
57 | sponsors = Json(bill_metadata['sponsors'][0])
58 | else:
59 | sponsors = None
60 | if len(bill_metadata['actions']) > 0:
61 | actions = Json(bill_metadata['actions'][0])
62 | else:
63 | actions = None
64 | if len(bill_metadata['action_dates']) > 0:
65 | action_dates = Json(bill_metadata['action_dates'])
66 | else:
67 | action_dates = None
68 | if len(bill_metadata['documents']) > 0:
69 | documents = Json(bill_metadata['documents'][0])
70 | else:
71 | documents = None
72 | if len(bill_metadata['votes']) > 0:
73 | votes = Json(bill_metadata['votes'][0])
74 | else:
75 | votes = None
76 | id_ = bill_metadata['id']
77 | state = bill_metadata['state']
78 | chamber = bill_metadata['chamber']
79 | session = bill_metadata['session']
80 |
81 | all_ids = bill_metadata['all_ids'][0]
82 | created_at = bill_metadata['created_at']
83 | updated_at = bill_metadata['updated_at']
84 |
85 | return((bill_id, title, alternate_titles, versions, subjects, scraped_subjects,
86 | type_, level, sponsors, actions, action_dates, documents, votes, id_, state,
87 | chamber, session, all_ids, created_at, updated_at))
88 |
89 |
90 |
91 | # GRAB BILL METADATA AND PUSH TO DATABASE
92 | temp_bill_metadata = []
93 | for path, subdirs, files in os.walk(r'/mnt/data/sunlight/openstates_unzipped/bills/'):
94 | for name in files:
95 | directory_file = os.path.join(path, name)
96 | with open(directory_file) as json_file:
97 | bill = json.load(json_file)
98 | parsed_data = parse_bill_metadata(bill)
99 | temp_bill_metadata.append(parsed_data)
100 | if len(temp_bill_metadata) == 10000 or name == files[len(files)-1]:
101 | args_str = ','.join(cur.mogrify("(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)", x) for x in temp_bill_metadata)
102 | cur.execute("INSERT INTO bill_metadata VALUES " + args_str)
103 | conn.commit()
104 | temp_bill_metadata = []
105 |
--------------------------------------------------------------------------------
/archive/prototype_text_alignment_algorithms.py:
--------------------------------------------------------------------------------
1 | from text_alignment import *
2 | from gensim.models import Word2Vec
3 | from evaluation.score_alignments import load_word2vec
4 | from scipy.spatial.distance import cosine
5 |
6 | class Word2VecLocalAligner(LocalAligner):
7 |
8 | def __init__(self,match_score = 3, mismatch_score = -1, gap_score = -2):
9 | LocalAligner.__init__(self, match_score, mismatch_score, gap_score)
10 | self.model = load_word2vec()
11 | self._algorithm_name = 'word2vec_local_alignment'
12 |
13 | def __str__(self):
14 |
15 | name_str = "{0} instance".format(self._algorithm_name)
16 | param_str_1 = "match_score = {0}".format(self.gap_score)
17 | param_str_2 = "mismatch_score = {0}".format(self.match_score)
18 | param_str_3 = "gap_score = {0}".format(self.mismatch_score)
19 | return "{0}: {1}, {2}, {3}".format(name_str,param_str_1,param_str_2,param_str_3)
20 |
21 |
22 | def align(self,left_sections,right_sections):
23 | '''
24 | description:
25 | find alignments between two documents using word2vec
26 | args:
27 | left_sections: a list of lists of words
28 | right_sections: a list of lists of words (usually just a list of a list of words)
29 |
30 | returns:
31 | alignment object
32 | '''
33 |
34 | alignments = []
35 | alignment_indices = []
36 |
37 | for left in left_sections:
38 | for right in right_sections:
39 |
40 | a_ints, b_ints, word_map = self._transform_text(left, right)
41 |
42 | score_matrix, pointer_matrix = self._compute_matrix(a_ints, b_ints,self.match_score,
43 | self.mismatch_score, self.gap_score, self.model)
44 |
45 | l, r, score, align_index = self._backtrace(a_ints, b_ints, score_matrix, pointer_matrix)
46 |
47 | reverse_word_map = {v:k for k,v in word_map.items()}
48 | reverse_word_map["-"] = "-"
49 | l = [reverse_word_map[w] for w in l]
50 | r = [reverse_word_map[w] for w in r]
51 |
52 | alignment_indices.append(align_index)
53 | alignments.append((score, l, r))
54 |
55 | left = reduce(lambda x,y:x+y,left_sections)
56 | right = reduce(lambda x,y:x+y,right_sections)
57 |
58 | return Alignment(left,right,alignments,alignment_indices)
59 |
60 |
61 | @jit
62 | def _compute_matrix(self, left, right, match_score, mismatch_score, gap_score, model):
63 | '''
64 | description:
65 | create matrix of optimal scores
66 | args:
67 | left: an array of integers
68 | right: an array of integers
69 | match_score: score for match in alignment
70 | mismatch_score: score for mismatch in alignment
71 | gap_start: score for first gap
72 | gap_extend: score for every gap
73 | model: word2vec model
74 | returns:
75 | three matrices required to construct optimal solution
76 | '''
77 | m = len(left) + 1
78 | n = len(right) + 1
79 | score_matrix = np.zeros((m, n),dtype = float)
80 | scores = np.zeros((4),dtype = float)
81 | pointer_matrix = np.zeros((m,n),dtype = int)
82 | for i in xrange(1, m):
83 | for j in xrange(1, n):
84 |
85 | if left[i-1] == right[j-1]:
86 | scores[1] = score_matrix[i-1,j-1] + match_score
87 | else:
88 | scores[1] = score_matrix[i-1,j-1] + mismatch_score*cosine(left[i-1], right[j-1])
89 |
90 | scores[2] = score_matrix[i, j - 1] + gap_score
91 |
92 | scores[3] = score_matrix[i - 1, j] + gap_score
93 |
94 | max_decision = np.argmax(scores)
95 |
96 | pointer_matrix[i,j] = max_decision
97 | score_matrix[i,j] = scores[max_decision]
98 |
99 | return score_matrix, pointer_matrix
--------------------------------------------------------------------------------
/archive/input/legislators.py:
--------------------------------------------------------------------------------
1 |
2 | import psycopg2
3 | from psycopg2.extras import Json
4 | import json
5 | import csv
6 | import sys
7 | import re
8 | import os
9 |
10 |
11 |
12 | # GRAB DATABASE INFO FROM default_profile
13 | db_info = []
14 | with open('/home/jwalsh/policy_diffusion/default_profile', 'rb') as db_file:
15 | reader = csv.reader(db_file, delimiter='=', quotechar='"')
16 | for row in reader:
17 | db_info.append(row[1])
18 |
19 |
20 | # CONNECT TO DATABASE
21 | conn = psycopg2.connect(host = db_info[0], database = db_info[1], user = db_info[2], password = db_info[3])
22 | cur = conn.cursor()
23 |
24 |
25 |
26 | # PARSE BILL METADATA FOR DATABASE INSERTION
27 | def parse_legislator_metadata(legislator_metadata):
28 | id_ = legislator_metadata['id']
29 | if 'votesmart_id' in legislator_metadata:
30 | votesmart_id = legislator_metadata['votesmart_id']
31 | else:
32 | votesmart_id = None
33 | if 'transparencydata_id' in legislator_metadata:
34 | transparencydata_id = legislator_metadata['transparencydata_id']
35 | else:
36 | transparencydata_id = None
37 | first_name = legislator_metadata['first_name']
38 | if len(legislator_metadata['middle_name']) > 0:
39 | middle_name = legislator_metadata['middle_name']
40 | else:
41 | middle_name = None
42 | last_name = legislator_metadata['last_name']
43 | if len(legislator_metadata['suffixes']) > 0:
44 | suffixes = legislator_metadata['suffixes']
45 | else:
46 | suffixes = None
47 | full_name = legislator_metadata['full_name']
48 | if 'party' in legislator_metadata:
49 | party = legislator_metadata['party']
50 | else:
51 | party = None
52 | active = legislator_metadata['active']
53 | if 'url' in legislator_metadata:
54 | url = legislator_metadata['url']
55 | else:
56 | url = None
57 | if 'photo_url' in legislator_metadata:
58 | photo_url = legislator_metadata['photo_url']
59 | else:
60 | photo_url = None
61 | if 'office_address' in legislator_metadata:
62 | office_address = legislator_metadata['office_address']
63 | else:
64 | office_address = None
65 | if 'office_phone' in legislator_metadata:
66 | office_phone = legislator_metadata['office_phone']
67 | else:
68 | office_phone = None
69 | leg_id = legislator_metadata['leg_id']
70 | if 'chamber' in legislator_metadata:
71 | chamber = legislator_metadata['chamber']
72 | else:
73 | chamber = None
74 | if 'district' in legislator_metadata:
75 | district = legislator_metadata['district']
76 | else:
77 | district = None
78 | state = legislator_metadata['state']
79 | if len(legislator_metadata['offices']) > 0:
80 | offices = Json(legislator_metadata['offices'][0])
81 | else:
82 | offices = None
83 | if 'email' in legislator_metadata:
84 | email = legislator_metadata['email']
85 | else:
86 | email = None
87 | if len(legislator_metadata['roles']) > 0:
88 | roles = Json(legislator_metadata['roles'][0])
89 | else:
90 | roles = None
91 | if 'old_roles' in legislator_metadata:
92 | old_roles = Json(legislator_metadata['old_roles'])
93 | else:
94 | old_roles = None
95 | all_legislative_ids = legislator_metadata['all_ids'][0]
96 | if 'level' in legislator_metadata:
97 | level = legislator_metadata['level']
98 | else:
99 | level = None
100 | if len(legislator_metadata['sources']) > 0:
101 | sources = Json(legislator_metadata['sources'][0])
102 | else:
103 | sources = None
104 | created_at = legislator_metadata['created_at']
105 | updated_at = legislator_metadata['updated_at']
106 |
107 | return((id_, votesmart_id, transparencydata_id,
108 | first_name, middle_name, last_name, suffixes, full_name,
109 | party, active, url, photo_url, office_address, office_phone,
110 | leg_id, chamber, district, state, offices, email,
111 | roles, old_roles, all_legislative_ids, level, sources,
112 | created_at, updated_at))
113 |
114 |
115 |
116 | # GRAB LEGISLATOR METADATA FROM SUNLIGHT AND PUSH TO DATABASE
117 | temp_legislator_metadata = []
118 | for path, subdirs, files in os.walk(r'/mnt/data/sunlight/openstates_unzipped/legislators/'):
119 | for name in files:
120 | directory_file = os.path.join(path, name)
121 | with open(directory_file) as json_file:
122 | legislator = json.load(json_file)
123 | parsed_data = parse_legislator_metadata(legislator)
124 | temp_legislator_metadata.append(parsed_data)
125 |
126 | args_str = ','.join(cur.mogrify("(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)", x) for x in temp_legislator_metadata)
127 | cur.execute("INSERT INTO legislators VALUES " + args_str)
128 | conn.commit()
129 |
130 |
--------------------------------------------------------------------------------
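
Both this script and archive/exploratory.R read database credentials out of a default_profile
file by splitting each line on '=' and keeping the values in the order host, database, user,
password. That file is not in the repo; a sketch of the format the parsing implies (key names
and values below are invented), together with the same parsing:

    import csv

    # hypothetical default_profile contents, one key=value pair per line
    sample_profile = ['host="db.example.org"',
                      'dbname="policy_diffusion"',
                      'user="jwalsh"',
                      'password="secret"']

    # same parsing as above: split on '=', strip quotes, keep the values in order
    reader = csv.reader(sample_profile, delimiter='=', quotechar='"')
    db_info = [row[1] for row in reader]
    print(db_info)  # ['db.example.org', 'policy_diffusion', 'jwalsh', 'secret']
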
/html/templates/searchdemo.html.jinja:
--------------------------------------------------------------------------------
[Note: the HTML/Jinja markup of this template was lost during extraction; only its visible
text survives: "Search Demo", "Demo", "Legislative Influence Detector — LID",
"Tracing Policy Ideas across Lobbyists and State Legislatures", "http://dssg.uchicago.edu",
"Step 1: Choose the type of documents you'd like to search", and
"Step 4: Scroll potential matches and click on the ones you'd like to investigate.
Green indicates likely matches. Red indicates likely not."]
--------------------------------------------------------------------------------
/archive/score_alignments.py:
--------------------------------------------------------------------------------
1 | '''
2 | Functions for scoring alignments
3 | '''
4 |
5 | from sklearn.feature_extraction.text import TfidfVectorizer
6 | from sklearn.metrics import jaccard_similarity_score
7 | import numpy as np
8 | import scipy as sp
9 | from database import *
10 | from gensim.models import Word2Vec
11 | from utils.general_utils import save_pickle
12 | import json
13 |
14 | def weight_length(alignment, left_length, right_length):
15 | print alignment
16 | return np.sum([a[0]*(len(a[1])/float(left_length))*(len(a[2])/float(right_length)) for a in alignment.alignments])
17 |
18 | def weight_tfidf(alignment, state_tfidf, left_state, right_state):
19 | '''
20 | state_tfidf: dictionary with tfidf scores by state
21 | '''
22 | f = StateTFIDF(state_tfidf)
23 |     return np.sum([np.mean(f.tfidf_score(a[1], a[2], left_state, right_state))*a[0] for a in alignment.alignments])
24 |
25 | def jaccard_coefficient(left, right):
26 | jaccard_scores = jaccard_similarity_score(left,right)
27 | return jaccard_scores
28 |
29 | def load_word2vec():
30 | model = Word2Vec.load_word2vec_format('/mnt/data/sunlight/GoogleNews-vectors-negative300.bin', binary=True)
31 |
32 | return model
33 |
34 | def word2vec_similarity(list_of_alignments, model):
35 | '''
36 | model is word2vec model
37 | '''
38 | distances = []
39 | for alignment in list_of_alignments:
40 | score, left, right = alignment
41 |
42 | word_distance_list = []
43 | for i in range(len(left)):
44 |
45 | if left[i] not in model or right[i] not in model:
46 | continue
47 |
48 | word_distance_list.append(model.similarity(left[i], right[i]))
49 |
50 | distances.append(np.mean(word_distance_list))
51 |
52 | return np.mean(distances)
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 | ####################################################################
62 | ##tfidf functions
63 |
64 | def tfidf_by_state(state, num_bills = 'all'):
65 | '''
66 | description:
67 | create dictionary of tfidf scores for a particular state
68 | args:
69 |         state: state abbreviation; num_bills: number of bills to run the algorithm on
70 | returns:
71 | dictionary of tfidf scores with words as keys
72 | '''
73 | es = ElasticConnection()
74 | state_bills = es.get_bills_by_state(state, num_bills)
75 | corpus = [bill['_source']['bill_document_last'] for bill in state_bills \
76 | if bill['_source']['bill_document_last'] != None]
77 |
78 | vectorizer = TfidfVectorizer()
79 | X = vectorizer.fit_transform(corpus)
80 | idf = vectorizer.idf_
81 | idf = vectorizer._tfidf.idf_
82 |
83 | return dict(zip(vectorizer.get_feature_names(), idf))
84 |
85 |
86 | def tfidf_all_bills():
87 | '''
88 | description:
89 |         create dictionary of tfidf scores over all bills in the index
90 |     args:
91 |         none
92 | returns:
93 | dictionary of tfidf scores with words as keys
94 | '''
95 | es = ElasticConnection()
96 | state_bills = es.get_all_bills()
97 | corpus = [bill['_source']['bill_document_last'] for bill in state_bills \
98 | if bill['_source']['bill_document_last'] != None]
99 |
100 | vectorizer = TfidfVectorizer()
101 | X = vectorizer.fit_transform(corpus)
102 | idf = vectorizer.idf_
103 | idf = vectorizer._tfidf.idf_
104 |
105 | return dict(zip(vectorizer.get_feature_names(), idf))
106 |
107 |
108 | def tfidf_by_all_states():
109 | states = ['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA', 'HI', 'ID', 'IL',
110 | 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO','MT', 'NE',
111 | 'NV', 'NH','NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD',
112 | 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY']
113 | states = map(lambda x : x.lower(), states)
114 |
115 | tfidf = {}
116 | for state in states:
117 | print 'working on ', state
118 | tfidf[state] = tfidf_by_state(state)
119 |
120 | return tfidf
121 |
122 |
123 | ####################################################################
124 | ##state tfidf object
125 | class StateTFIDF():
126 |
127 | def __init__(self, state_tfidf):
128 | self.state_tfidf = state_tfidf
129 |
130 | def find_tfidf(self, word, state):
131 | if state == 'model_legislation':
132 | return 0
133 | elif word == '-' or word not in self.state_tfidf[state]:
134 | return 0
135 | else:
136 | return self.state_tfidf[state][word]
137 |
138 | def tfidf_score(self, left, right, left_state, right_state):
139 | '''
140 |         gives the average tfidf for the left and right components of an alignment
141 | '''
142 | left_scores = []
143 | right_scores = []
144 |
145 | for i in range(len(left)):
146 | left_scores.append(self.find_tfidf(left[i], left_state)) #need function
147 | right_scores.append(self.find_tfidf(right[i], right_state))
148 |
149 |         if left_scores == [] or right_scores == []:
150 | return 0
151 | else:
152 | return np.mean(left_scores), np.mean(right_scores)
153 |
154 |
155 | def tfidf_by_alignments():
156 | alignments = []
157 | with open('bill_to_bill_alignments.txt') as f:
158 | for i,line in enumerate(f):
159 | print 'line ', i
160 | alignments.append(json.loads(line))
161 |     return alignments
162 | if __name__ == "__main__":
163 | tfidf = tfidf_all_bills()
164 | save_pickle(tfidf, 'tfidf_all_bills')
165 |
166 |
167 |
168 |
169 |
--------------------------------------------------------------------------------
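
weight_length above expects an object whose .alignments attribute is a list of
(score, left_tokens, right_tokens) tuples, as produced by the aligners in lid/text_alignment.py;
it rescales each alignment score by how much of each document it covers. A toy invocation
(the namedtuple stand-in and all numbers are made up, and the debug print is omitted):

    import numpy as np
    from collections import namedtuple

    # stand-in for the Alignment objects the aligners return; only .alignments is needed here
    Alignment = namedtuple('Alignment', ['left', 'right', 'alignments'])

    def weight_length(alignment, left_length, right_length):
        # same computation as in score_alignments.py, without the debug print
        return np.sum([a[0] * (len(a[1]) / float(left_length)) * (len(a[2]) / float(right_length))
                       for a in alignment.alignments])

    toy = Alignment(left=None, right=None,
                    alignments=[(12.0, ['the', 'act', 'shall'], ['this', 'act', 'shall'])])
    print(weight_length(toy, left_length=200, right_length=300))  # score scaled by coverage
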
/archive/exploratory.R:
--------------------------------------------------------------------------------
1 | library('RPostgreSQL')
2 | library('ggplot2')
3 |
4 | db_info <- read.csv('policy_diffusion/default_profile', sep='=', header=F, quote='', stringsAsFactors=F)
5 |
6 | # sessions
7 | drv <- dbDriver('PostgreSQL')
8 | con <- dbConnect(drv, user=db_info$V2[3], password=db_info$V2[4],
9 | dbname=db_info$V2[2], host=db_info$V2[1])
10 |
11 | # number of governments
12 | dbGetQuery(con, "SELECT COUNT(*) FROM (SELECT DISTINCT state FROM bill_metadata) AS a;")
13 |
14 | # list the governments
15 | dbGetQuery(con, "SELECT DISTINCT state FROM bill_metadata ORDER BY state;")
16 |
17 | # number of sessions
18 | dbGetQuery(con, "SELECT COUNT(*) FROM (SELECT DISTINCT state, session FROM bill_metadata ORDER BY state, session) AS a;")
19 |
20 | # sessions
21 | dbGetQuery(con, "SELECT DISTINCT state, session FROM bill_metadata ORDER BY session;")
22 |
23 | # oldest session per government
24 | dbGetQuery(con, "SELECT state, MIN(session) AS min_session FROM bill_metadata GROUP BY state ORDER BY state;")
25 |
26 | # newest session per government
27 | dbGetQuery(con, "SELECT state, MAX(session) AS max_session FROM bill_metadata GROUP BY state ORDER BY state;")
28 |
29 | # bills and resolutions by government session
30 | bills_and_resolutions <-
31 | dbGetQuery(con, "SELECT a.state,
32 | a.session,
33 | a.bill_freq,
34 | b.resolution_freq
35 | FROM (SELECT state, session, count(*) as bill_freq FROM bill_metadata WHERE type LIKE '%bill%' GROUP BY state, session) AS a,
36 | (SELECT state, session, count(*) as resolution_freq FROM bill_metadata WHERE type LIKE '%resolution%' GROUP BY state, session) AS b
37 | WHERE a.state = b.state AND
38 | a.session = b.session
39 | ORDER BY bill_freq DESC;")
40 |
41 | br_plt <- ggplot(bills_and_resolutions, aes(bill_freq, resolution_freq))
42 | br_plt + theme(axis.text=element_text(size=18),
43 | axis.title=element_text(size=18,face="bold")) +
44 | ylim(0, max(bills_and_resolutions$bill_freq)) +
45 | geom_point() +
46 | xlab("bills") +
47 | ylab("resolutions") +
48 | geom_abline(intercept=0, slope=1) +
49 | geom_text(data=subset(bills_and_resolutions, bill_freq > 5000),
50 | aes(bill_freq, resolution_freq, label=toupper(state)),
51 | vjust=-.5, size=8) +
52 | geom_text(data=subset(bills_and_resolutions, bill_freq < resolution_freq & bill_freq > 100),
53 | aes(bill_freq, resolution_freq, label=toupper(state)),
54 | vjust=-.5, size=8)
55 |
56 |
57 | # how many bills Sunlight scraped from each government after the second
58 | # year it started scraping that government
59 | bills_by_state_year <-
60 | dbGetQuery(con, "SELECT UPPER(c.state) as state,
61 | EXTRACT(YEAR FROM c.created_at) AS year,
62 | COUNT(*) AS freq
63 | FROM bill_metadata AS c,
64 | -- find minimum year
65 | (SELECT a.state,
66 | MIN(a.year) AS min_year
67 | FROM (SELECT state,
68 | EXTRACT(YEAR FROM created_at) AS year
69 | FROM bill_metadata) AS a
70 | GROUP BY state) as b
71 | WHERE c.state = b.state AND
72 | EXTRACT(YEAR FROM created_at) >= b.min_year
73 | GROUP BY c.state,
74 | EXTRACT(YEAR FROM c.created_at)
75 | ORDER BY c.state,
76 | EXTRACT(YEAR FROM c.created_at);")
77 |
78 | # we're missing data for some states in some years
79 | dbGetQuery(con, "SELECT c.state,
80 | c.year - 1 AS missing_year
81 | FROM (SELECT *,
82 | b.year - lag(b.year) OVER w AS gap
83 | FROM (SELECT a.state,
84 | a.year,
85 | COUNT(*)
86 | FROM (SELECT state,
87 | EXTRACT(YEAR FROM created_at) AS year
88 | FROM bill_metadata) AS a
89 | GROUP BY a.state,
90 | a.year
91 | ORDER BY a.state,
92 | a.year) AS b
93 | WINDOW w AS (ORDER BY b.state, b.year)) AS c
94 | WHERE c.gap > 1;")
95 |
96 | missing_values <- data.frame(state = c('MT', 'ND', 'NV', 'TX', 'TX'),
97 | year = c(2014, 2014, 2012, 2012, 2014),
98 | freq = rep(0,5))
99 | bills_by_state_year <- rbind(bills_by_state_year, missing_values)
100 | bills_by_state_year <- bills_by_state_year[ order(bills_by_state_year$state, bills_by_state_year$year), ]
101 |
102 | # New Jersey 2012 is wrong. Subtract 2013 number from total here: http://www.njleg.state.nj.us/bills/BillsByNumber.asp
103 | bills_by_state_year$freq[ bills_by_state_year$state == 'NJ' & bills_by_state_year$year == 2012 ] <- 6808
104 |
105 | sy_plt <- ggplot(bills_by_state_year, aes(year, freq, color=state))
106 | sy_plt + theme(legend.position="none",
107 | axis.text=element_text(size=18),
108 | axis.title=element_text(size=18,face="bold")) +
109 | geom_line(size=2) +
110 | ylab("frequency") +
111 | geom_text(data=data.frame(state=c('NJ', 'TX', 'NJ', 'NY', 'IL', 'TX'),
112 | year=c(2012, 2013, 2014, 2014, 2015.05, 2015),
113 | freq=c(6850, 11700, 7500, 13200, 7000, 10000)),
114 | aes(x=year, y=freq, label=state),
115 | vjust=-.5, size=7)
116 |
--------------------------------------------------------------------------------
/lid/etl/extractors.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 | from bs4 import BeautifulSoup
3 | from state_bill_extractors import bill_text_extractor
4 | import os
5 | import codecs
6 | import argparse
7 | import re
8 | import base64
9 | import json
10 | from tika import parser as tp
11 | import traceback
12 | import logging
13 | from config import DATA_PATH
14 |
15 | try:
16 | from os import scandir, walk
17 | except ImportError:
18 | from scandir import scandir, walk
19 |
20 |
21 |
22 | def get_first_and_last_bill_documents(json_obj):
23 | state_code = json_obj['state']
24 |
25 | bill_documents = []
26 | for v in range(2):
27 |
28 | try:
29 | bill_document = base64.b64decode(json_obj['versions'][v]['bill_document'])
30 | except:
31 | bill_documents.append(None)
32 | continue
33 |
34 | try:
35 | mimetype = json_obj['versions'][v]['mimetype']
36 |
37 | except KeyError:
38 | mimetype = json_obj['versions'][v]['+mimetype']
39 |
40 | url = json_obj['versions'][v]['url']
41 | # try to extract text with bill-specific extractor
42 | bill_text = bill_text_extractor(state_code, bill_document, mimetype, url)
43 |
44 | # if fails then try tika extractor as backup
45 | if not bill_text or len(bill_text) < 1000:
46 |
47 | try:
48 | bill_text = tp.from_buffer(bill_document)['content']
49 | #if extraction results in short text, most likely a fail
50 | if len(bill_text) < 1000:
51 | bill_text = None
52 | except Exception:
53 | bill_text = None
54 |
55 |
56 | bill_documents.append(bill_text)
57 |
58 | return bill_documents
59 |
60 |
61 |
62 | # extracts text from bill documents fetched from sunlight
63 | # and constructs new json obj with selected meta-data
64 | def extract_bill_document(bill_file_path):
65 | try:
66 |
67 | bill_dict = {}
68 | data_dict = json.loads(open(bill_file_path).read())
69 |
70 | #test whether a document is a bill or resolution
71 | bill_text_count = [1 for x in data_dict['type'] if "bill" in x.lower()]
72 |         good_bill_prefixes = ["A","AJ", "AJR","CACR","HB","S","HJR","ACA","HF","SF","HJ","SJ",
73 | "HJRCA","SJRCA","HSB","IP","LB","SB","SCA","SP"]
74 | if sum(bill_text_count) < 1 and data_dict['bill_id'].split()[0] not in good_bill_prefixes:
75 | return
76 |
77 |
78 |
79 |
80 | # extract first and last versions of bill document
81 | # and add to json dict
82 | bill_document_first, bill_document_last = get_first_and_last_bill_documents(data_dict)
83 | bill_dict['bill_document_first'] = bill_document_first
84 | bill_dict['bill_document_last'] = bill_document_last
85 |
86 | if bill_document_first == None or bill_document_last == None:
87 | logging.warning("failed to extract text for {0}".format(bill_file_path))
88 |
89 | else:
90 | logging.info("successfully extracted text for {0}".format(bill_file_path))
91 |
92 | # assign attributes that will be used
93 | bill_id = re.sub("\s+", "", data_dict['bill_id'])
94 | bill_dict['unique_id'] = "{0}_{1}_{2}".format(data_dict['state'], data_dict['session'], bill_id)
95 | bill_dict['bill_id'] = data_dict['bill_id']
96 | bill_dict['date_updated'] = data_dict['updated_at']
97 | bill_dict['session'] = data_dict['session']
98 | bill_dict['sunlight_id'] = data_dict['id']
99 | bill_dict['bill_title'] = data_dict['title']
100 | bill_dict['bill_type'] = data_dict['type']
101 | bill_dict['state'] = data_dict['state']
102 | bill_dict['chamber'] = data_dict['chamber']
103 | bill_dict['date_created'] = data_dict['created_at']
104 | bill_dict['actions'] = data_dict['actions']
105 | bill_dict['action_dates'] = data_dict['action_dates']
106 | bill_dict['date_introduced'] = data_dict['action_dates']['first']
107 | bill_dict['date_signed'] = data_dict['action_dates']['signed']
108 |
109 |
110 |             if "short_title" in data_dict.keys():
111 | bill_dict['short_title'] = data_dict['short_title']
112 | elif "+short_title" in data_dict.keys():
113 | bill_dict['short_title'] = data_dict['+short_title']
114 |
115 | else:
116 | bill_dict['short_title'] = None
117 |
118 | if "summary" in data_dict.keys():
119 | bill_dict['summary'] = data_dict['summary']
120 | else:
121 | bill_dict['summary'] = None
122 |
123 | return bill_dict
124 | except (KeyboardInterrupt, SystemExit):
125 | raise
126 | except Exception as e:
127 | trace_message = re.sub("\n+", "\t", traceback.format_exc())
128 | trace_message = re.sub("\s+", " ", trace_message)
129 | trace_message = "<<{0}>>".format(trace_message)
130 | m = "Failed to extract document for {0}: {1}".format(bill_file_path, trace_message)
131 | logging.error(m)
132 |
133 | if __name__ == "__main__":
134 |     parser = argparse.ArgumentParser(description='Extract text from scraped bill documents.')
135 |     parser.add_argument('command', help='command to run, options are: extract_bills')
136 | parser.add_argument('--data_path', dest='data_path', help="file path of data to be indexed ")
137 |
138 | args = parser.parse_args()
139 |
140 | #extracts text from bill documents and populates a json file with a json_object per row
141 | if args.command == "extract_bills":
142 | #configure logging
143 | logging.getLogger('tp').setLevel(logging.ERROR)
144 | logging.getLogger('requests').setLevel(logging.ERROR)
145 | logging.basicConfig(filename=os.environ['POLICY_DIFFUSION'] + '/logs/state_bill_extractor.log',
146 | level=logging.DEBUG)
147 |
148 | bill_files = []
149 | for dirname, dirnames, filenames in walk(args.data_path):
150 | for filename in filenames:
151 | bill_files.append(os.path.join(dirname, filename))
152 |
153 | outFile = codecs.open("{0}/extracted_data/extracted_bills.json".format(DATA_PATH), 'w')
154 | for i, bill_file in enumerate(bill_files):
155 | bill_json_obj = extract_bill_document(bill_file)
156 |
157 | outFile.write("{0}\n".format(json.dumps(bill_json_obj)))
158 |
159 | outFile.close()
160 |
161 |
162 |
163 | ##extracts text from model legislation
164 | def extract_model_legislation(json_file, encoded):
165 | '''
166 | Keyword Args:
167 | json_file: corresponds to json file with model legislation
168 | encoded: True/False if json file is b64 encoded
169 |
170 | returns:
171 | dictionary with url, date, and text of model legislation
172 |     description:
173 | extract text from model legislation
174 | '''
175 | data = []
176 | with open(json_file) as f:
177 | for line in f:
178 | data.append(json.loads(line))
179 |
180 | model_legislation = {}
181 | for i in range(len(data)):
182 | model_legislation[i] = data[i]
183 |
184 | if encoded == True:
185 | for i in range(len(model_legislation)):
186 | try:
187 | ml = model_legislation[i]['source']
188 | ml = base64.b64decode(ml)
189 | ml = tp.from_buffer(ml)
190 | model_legislation[i]['source'] = ml['content']
191 | except AttributeError:
192 | model_legislation[i]['source'] = None
193 | return model_legislation
194 |
195 | else:
196 | return model_legislation
197 |
198 |
199 |
200 |
--------------------------------------------------------------------------------
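
extract_model_legislation above expects a file with one JSON object per line, each carrying a
(typically base64-encoded) 'source' field of the kind written by bill_source_to_json in
lid/utils/general_utils.py. A small sketch of producing such a file (the file name, URL and
document body are invented):

    import base64
    import json

    record = {'url': 'http://example.org/model_bill.html',
              'date': '2015-07-01',
              'source': base64.b64encode(b'<html><body>Section 1. ...</body></html>').decode('ascii')}

    with open('model_legislation_sample.json', 'w') as f:
        f.write(json.dumps(record) + '\n')

    # extract_model_legislation('model_legislation_sample.json', encoded=True) would then
    # base64-decode each 'source', run Tika over it, and return the extracted text per record
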
/archive/tfidf_ranking.py:
--------------------------------------------------------------------------------
1 | from sklearn.feature_extraction.text import TfidfVectorizer
2 | import numpy as np
3 | import pickle
4 | from alignment_evaluation import *
5 | from database import *
6 | import time
7 |
8 | def calc_tfidf_alignments(alignments_list):
9 | '''
10 | arg:
11 | list of alignment objects
12 | returns:
13 | dictionary with tfi_idf scores
14 | '''
15 | corpus = [alignment[1] + alignment[2] \
16 | for alignments in alignments_list for alignment in alignments ]
17 | corpus = [' '.join(doc) for doc in corpus]
18 | vectorizer = TfidfVectorizer()
19 | X = vectorizer.fit_transform(corpus)
20 | idf = vectorizer.idf_
21 | idf = vectorizer._tfidf.idf_
22 | return dict(zip(vectorizer.get_feature_names(), idf))
23 |
24 |
25 | def rank_alignments(alignments_list):
26 | '''
27 | arg:
28 | list of alignment objects
29 | returns:
30 | list of alignment objects sorted by averaged tfi_idf score
31 | '''
32 | tfidf = calc_tfidf_alignments(alignments_list)
33 |
34 | not_in_dict = 0
35 | in_dict = 0
36 |
37 | alignments_tfidf = []
38 | for alignments in alignments_list:
39 | tfidf_scores = []
40 | for alignment in alignments:
41 | print alignment
42 |             for word in alignment[1]:
43 |                 if word.lower() in tfidf:
44 |                     tfidf_scores.append(tfidf[word.lower()])
45 |                     in_dict += 1
46 |                 if word != '-' and word.lower() not in tfidf:
47 |                     not_in_dict += 1
48 |             for word in alignment[2]:
49 |                 if word.lower() in tfidf:
50 |                     tfidf_scores.append(tfidf[word.lower()])
51 |                     in_dict += 1
52 |                 if word != '-' and word.lower() not in tfidf:
53 |                     not_in_dict += 1
54 | if tfidf_scores != []:
55 | alignments_tfidf.append((alignments, np.sum(tfidf_scores)))
56 | else:
57 | alignments_tfidf.append((alignments, 0))
58 |
59 | print "num not in dict: ", not_in_dict
60 | print "in dict: ", in_dict
61 |
62 | alignments_tfidf.sort(key = lambda x: x[1], reverse=True)
63 |
64 | return alignments_tfidf
65 |
66 |
67 | def tfidf_by_state(state, num_bills = 'all'):
68 | '''
69 | description:
70 |         create dictionary of tfidf scores for a particular state
71 |     args:
72 |         state: state abbreviation
73 |         num_bills: number of bills to run the algorithm on
74 | returns:
75 | dictionary of tfidf scores with words as keys
76 | '''
77 | es = ElasticConnection()
78 | state_bills = es.get_bills_by_state(state, num_bills)
79 | corpus = [bill['_source']['bill_document_last'] for bill in state_bills \
80 | if bill['_source']['bill_document_last'] != None]
81 |
82 | vectorizer = TfidfVectorizer()
83 | X = vectorizer.fit_transform(corpus)
84 | idf = vectorizer.idf_
85 | idf = vectorizer._tfidf.idf_
86 |
87 | return dict(zip(vectorizer.get_feature_names(), idf))
88 |
89 |
90 | def tfidf_by_all_states():
91 | states = ['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA', 'HI', 'ID', 'IL',
92 | 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO','MT', 'NE',
93 | 'NV', 'NH','NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD',
94 | 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY']
95 | states = map(lambda x : x.lower(), states)
96 |
97 | tfidf = {}
98 | for state in states:
99 | print 'working on ', state
100 | tfidf[state] = tfidf_by_state(state)
101 |
102 | return tfidf
103 |
104 |
105 | ####################################################################
106 | ##state tfidf object
107 | class StateTFIDF():
108 |
109 | def __init__(self, state_tfidf):
110 | self.state_tfidf = state_tfidf
111 |
112 | def find_tfidf(self, word, state):
113 | if state == 'model_legislation':
114 | return 0
115 | elif word == '-' or word not in self.state_tfidf[state]:
116 | return 0
117 | else:
118 | return self.state_tfidf[state][word]
119 |
120 | def tfidf_score(self, alignment_with_state):
121 | scores = []
122 | print 'alignment_with_state: ', alignment_with_state
123 | raw_input("Press Enter to continue...")
124 | alignment, left_state, right_state = alignment_with_state
125 | score, left, right = alignment[0] #TODO: make work for more than one alignment
126 |
127 | for i in range(len(left)):
128 | scores.append(self.find_tfidf(left[i], left_state)) #need function
129 | scores.append(self.find_tfidf(right[i], right_state))
130 |
131 | if scores == []:
132 | return 0
133 | else:
134 | return np.mean(scores)
135 |
136 |
137 | ####################################################################
138 | ##ranking functions
139 | def rank(alignments_list, functions):
140 |     '''
141 |     depending on the function used, alignments_list may contain the states of the alignments or not
142 |     '''
143 |     ranking = []
144 |     #keep track of the maximum value of each scoring function for normalization
145 |     max_function_values = np.zeros(len(functions))
146 |     for alignments in alignments_list:
147 |         scores = []
148 |         for i in range(len(functions)):
149 |             function = functions[i]
150 |             output = function(alignments)
151 |             scores.append(output)
152 | 
153 |             if max_function_values[i] < output:
154 |                 max_function_values[i] = output
155 | 
156 |         ranking.append((alignments, scores))
157 | 
158 |     final_ranking = []
159 |     for alignments, scores in ranking:
160 |         rank_value = []
161 |         scores_max = zip(scores, max_function_values)
162 | 
163 |         for score, maxim in scores_max:
164 |             rank_value.append(score / float(maxim))
165 | 
166 |         final_ranking.append((alignments[0][0], np.mean(rank_value)))
167 | 
168 |     final_ranking.sort(key = lambda x: x[1], reverse=True)
169 | 
170 |     return final_ranking
171 |
172 |
173 | def inspect_ranking(ranking):
174 | for alignments, tfidf in ranking:
175 | score, left, right = alignments
176 | for i in range(len(left)):
177 | print left[i], right[i]
178 | print 'alignment score: ', score
179 | print 'mean tfidf: ', tfidf
180 | raw_input("Press Enter to continue...")
181 | print '\n'
182 |
183 |
184 |
185 | if __name__ == '__main__':
186 |
187 |
188 | # tfidf = calc_tfidf(alignments_list)
189 |
190 | # alignments_tfidf = rank_alignments(alignments_list)
191 |
192 | # print 'testing speed of calculating tfidf per state'
193 |
194 | # t1 = time.time()
195 | # t=tfidf_state('al')
196 | # print 'alabama time: {0} seconds'.format(time.time()-t1)
197 |
198 | # t1 = time.time()
199 | # t=tfidf_state('ny')
200 | # print 'new york time: {0} seconds'.format(time.time()-t1)
201 |
202 | # print 'calculate tfidf by state...'
203 |
204 | # tfidf = tfidf_by_all_states()
205 |
206 | # with open('state_tfidfs.p', 'wb') as fp:
207 | # pickle.dump(tfidf, fp)
208 |
209 | print 'loading experiment and building alignment list...'
210 | with open('experiment.p', 'rb') as fp:
211 | e = pickle.load(fp)
212 |
213 | alignments_list = []
214 | for key, value in e.results.iteritems():
215 | i, j = key
216 | state_i = e.bills[i]['state']
217 | state_j = e.bills[j]['state']
218 | alignments_list.append((value['alignments'], state_i, state_j))
219 |
220 |
221 | with open('state_tfidfs.p', 'rb') as fp:
222 | tfidf = pickle.load(fp)
223 | f = StateTFIDF(tfidf)
224 |
225 | print 'calculating ranking...'
226 | ranking = rank(alignments_list, [f.tfidf_score])
227 | inspect_ranking(ranking)
228 |
229 |
--------------------------------------------------------------------------------
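
The tfidf_* helpers in this file and in archive/score_alignments.py all follow the same pattern:
fit a TfidfVectorizer over a corpus and zip the vocabulary with vectorizer.idf_ into a
word-to-idf dictionary (the second assignment from vectorizer._tfidf.idf_ is redundant, since
idf_ is the public attribute). A self-contained sketch of that pattern on a toy corpus
(requires scikit-learn of the vintage used here, which still has get_feature_names):

    from sklearn.feature_extraction.text import TfidfVectorizer

    corpus = ['an act relating to voter identification',
              'an act relating to school appropriations']

    vectorizer = TfidfVectorizer()
    vectorizer.fit_transform(corpus)

    # word -> inverse document frequency, as consumed by StateTFIDF.find_tfidf
    idf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
    print(idf['act'] < idf['voter'])  # True: shared words get a lower idf than distinctive ones
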
/archive/classifier.py:
--------------------------------------------------------------------------------
1 | from alignment_evaluation import alignment_features
2 | import numpy as np
3 | import nltk
4 | from sklearn import linear_model
5 | from sklearn.metrics import confusion_matrix, accuracy_score
6 |
7 | from score_alignments import StateTFIDF
8 | import json
9 | import argparse
10 | import os
11 | from database import ElasticConnection
12 | import random
13 | import codecs
14 | from utils.general_utils import alignment_tokenizer
15 | from utils.general_utils import UnicodeWriter
16 | from sklearn.metrics import jaccard_similarity_score
17 |
18 |
19 | def construct_training_set(alignments_file,out_file_name):
20 | """
21 | Args:
22 | alignments_file (file) -- file containing sample alignments
23 |
24 | out_file_name (string) -- name of training data file to write to
25 |
26 | Returns:
27 | None
28 | """
29 | ec = ElasticConnection(host= "54.203.12.145")
30 |
31 | training_examples = []
32 | for i,x in enumerate(alignments_file):
33 | json_obj = json.loads(x.strip())
34 |
35 | if "alignment_results" not in json_obj.keys():
36 | continue
37 |
38 | left_doc_id = json_obj['query_document_id']
39 | left_bill_title = ec.get_bill_by_id(left_doc_id)['bill_title']
40 |
41 | left_doc = json_obj['query_document']
42 | left_doc = reduce(lambda x,y:x+y,left_doc)
43 |
44 | left_doc_length = len(left_doc.split())
45 |
46 | for i,alignment_doc in enumerate(json_obj['alignment_results']):
47 |
48 | right_doc_id = alignment_doc['document_id']
49 | right_bill_title = ec.get_bill_by_id(right_doc_id)['bill_title']
50 |
51 | for alignment in alignment_doc['alignments']:
52 |
53 | left = alignment['left']
54 | right = alignment['right']
55 | left_start = alignment['left_start']
56 | right_start = alignment['right_start']
57 | left_end = alignment['left_end']
58 | right_end = alignment['right_end']
59 | score = alignment['score']
60 | training_examples.append([left_doc_id,right_doc_id,left_doc_length,left_start,right_start,left_end,
61 | right_end,score,left_bill_title,right_bill_title,
62 | " ".join(left)," ".join(right)])
63 |
64 |
65 | random.shuffle(training_examples)
66 |
67 | header = ["left_doc_id","right_doc_id","left_doc_length","left_start","right_start","left_end",
68 | "right_end","score","left_bill_title","right_bill_title","left","right"]
69 |
70 |
71 | k = 500
72 | with codecs.open(out_file_name, 'wb') as output_file:
73 | writer = UnicodeWriter(output_file, header)
74 | writer.writerow(header)
75 | for l in training_examples[0:k]:
76 | l = [unicode(x) for x in l]
77 | writer.writerow(l)
78 |
79 |
80 | return
156 |
157 | def features_matrix(alignment):
158 | right = alignment['right']
159 | left = alignment['left']
160 |     features = alignment_features(left, right)
161 |     features['left_tfidf'], features['right_tfidf'] = s.tfidf_score(left, right)
162 | features['score'] = alignment['score']
163 | features['label'] = alignment['label']
164 |
165 | return features
166 |
167 | def evaluate_model():
168 | data = list_alignments
169 | featuresets = [features_matrix(alignment) for alignment in data]
170 |
171 | data_list = [[value['avg_consec_match_length'], value['avg_gap_length_l'],
172 | value['avg_gap_length_r'], value['jaccard_score'],
173 | value['length'], value['num_gaps_l'], value['num_gaps_r'],
174 | value['num_matches'], value['num_mismatches'],
175 | value['score'], value['label']] for value in featuresets]
176 |
177 | alignment_data = np.array(data_list)
178 | alignment_y=alignment_data[:,-1]
179 | alignment_X=alignment_data[:,:-1]
180 |
181 | # A random permutation, to split the data randomly
182 | np.random.seed(0)
183 | indices = np.random.permutation(len(alignment_X))
184 | train_n = 5
185 | alignment_X_train = alignment_X[indices[:-train_n]]
186 | alignment_y_train = alignment_y[indices[:-train_n]]
187 | alignment_X_test = alignment_X[indices[-train_n:]]
188 | alignment_y_test = alignment_y[indices[-train_n:]]
189 |
190 | # Create and fit a logistic regression
191 | logistic = linear_model.LogisticRegression(C=1e5)
192 | logistic.fit(alignment_X_train, alignment_y_train)
193 | y_pred = logistic.predict(alignment_X_test)
194 |
195 | #Calculate accuracy
196 | accuracy_score(alignment_y_test, y_pred)
197 | cm = confusion_matrix(alignment_y_test, y_pred)
198 |
199 |
200 |
201 | def main():
202 | parser = argparse.ArgumentParser(description='Classifier to label aligned text as "substantive" ')
203 | parser.add_argument('command',
204 | help='command to run, options are: construct_training_set,train_model,evaluate_model')
205 | parser.add_argument('--alignment_samples_doc', dest='alignment_samples',
206 | help="file path to the alignment samples used to construct training set ")
207 | args = parser.parse_args()
208 |
209 | if args.command == "construct_training_set":
210 | construct_training_set(open(args.alignment_samples),
211 | os.environ['POLICY_DIFFUSION']+"/data/classifier/alignments_training_set.csv")
212 | elif args.command == "train_model":
213 | pass
214 | elif args.command == "evaluate_model":
215 | pass
216 | else:
217 | print args
218 | print "command not recognized, please enter construct_training_set,train_model,evaluate_model"
219 |
220 |
221 | if __name__ == "__main__":
222 | main()
223 |
224 |
225 |
226 |
227 |
228 |
229 |
230 |
231 |
232 |
233 |
234 |
235 |
--------------------------------------------------------------------------------
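
evaluate_model above flattens each labeled alignment into a numeric feature row, holds out the
last train_n shuffled rows (despite the name, train_n is the size of the held-out test set), and
scores a logistic regression on them. A self-contained version of that split-and-score step on
synthetic data (features and labels below are random placeholders):

    import numpy as np
    from sklearn import linear_model
    from sklearn.metrics import accuracy_score, confusion_matrix

    np.random.seed(0)
    X = np.random.rand(50, 10)                 # 50 alignments x 10 features (placeholders)
    y = (X[:, 0] + X[:, 1] > 1).astype(int)    # placeholder "substantive" labels

    indices = np.random.permutation(len(X))
    test_n = 5
    X_train, y_train = X[indices[:-test_n]], y[indices[:-test_n]]
    X_test, y_test = X[indices[-test_n:]], y[indices[-test_n:]]

    logistic = linear_model.LogisticRegression(C=1e5)
    logistic.fit(X_train, y_train)
    y_pred = logistic.predict(X_test)
    print(accuracy_score(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
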
/lid/utils/text_cleaning.py:
--------------------------------------------------------------------------------
1 | '''
2 | Clean text in ElasticSearch
3 | '''
4 |
5 | import elasticsearch
6 | import re
7 | import string
8 | import urllib2
9 | from elasticsearch import Elasticsearch
10 | from pprint import pprint
11 | import nltk
12 |
13 | #custom modules
14 | #from database import ElasticConnection
15 |
16 | def clean_text(text, lower = True):
17 | '''
18 |     variables:
19 |         text: string corresponding to text of bill
20 |         lower: if True, lowercase the text before cleaning
21 | 
22 |     returns:
23 |         string that is cleaned up text
24 |     description:
25 | clean text
26 | '''
27 | #make text lowercase
28 | if lower == True:
29 | text = text.lower()
30 |
31 | text = re.sub('\n[ ]*[0-9]+', '', text)
32 | text = re.sub('[ ]{2,}', u' ', text)
33 |
34 | #parse by line
35 | text_list = text.splitlines()
36 |
37 | #replace funky symbols and multiple new lines
38 | ntext_list = []
39 | for line in text_list:
40 | line = line.replace(u'\xa0', u' ')
41 | line = line.replace(u'>>', u' ')
42 | line = line.replace(u'\xa7', u' ')
43 | line = line.replace(u'\xe2', u' ')
44 | line = line.replace(u'\u201c', u' ')
45 | line = line.replace(u'\u201d', u' ')
46 | line = line.replace(u'\xbb', u' ')
47 | line = line.replace(u'\xa9', u' ')
48 | line = line.replace(u' ,', u',')
49 | line = line.replace(u'{ font-family: courier, arial, sans-serif; font-size: 10pt; } table { empty-cells:show; }', u' ')
50 | line = re.sub( '\s+', u' ', line)
51 | ntext_list.append(line)
52 | return (string.join(ntext_list, '\n'))
53 |
54 |
55 |
56 |
57 |
58 |
59 | def split_to_sections(cleantext,state):
60 | '''
61 | variables:
62 | cleantext: clean version of text of bill
63 | state: abbreviation of state ID
64 |
65 | returns:
66 | list of bill sections
67 |     description:
68 | splits bill text into sections
69 | '''
70 | if state == 'ak':
71 | chunked_list = cleantext.split("\n*")
72 | elif state in ('al','ar','mt','or','ri'):
73 | chunked_list = cleantext.split('\nsection')
74 | elif state in ('nm','tx'):
75 | chunked_list = cleantext.split('\n section')
76 | elif state in ('az','ia','nv', 'wa', 'vt'):
77 | chunked_list = cleantext.split('\nsec.')
78 | elif state in ('me', 'mi'):
79 | chunked_list = cleantext.split('\n sec.')
80 | elif state == 'co':
81 | chunked_list = re.split('[[0-9][0-9]\.section|[0-9]\.section', cleantext)
82 | elif state in ('de','fl','tn'):
83 | chunked_list = re.split('section\s[0-9][0-9]\.|section\s[0-9]\.', cleantext)
84 | elif state == 'ga':
85 | cleantext = re.sub('[0-9][0-9]\\n|[0-9]\\n', ' ', cleantext)
86 | chunked_list = re.split('\\nsection\s[0-9][0-9]|\\nsection\s[0-9]', cleantext)
87 | elif state in ('hi','sd','in'):
88 | chunked_list = re.split('\\n\ssection\s[0-9][0-9]\.|\\n\ssection\s[0-9]', cleantext)
89 | elif state == 'pa':
90 | chunked_list = re.split('section\s[0-9][0-9]\.|section\s[0-9]\.', cleantext)
91 | elif state in ('id', 'la', 'md', 'nd'):
92 | chunked_list = re.split('\\nsection\s[0-9][0-9]\.|\\nsection\s[0-9]\.', cleantext)
93 | elif state == 'il':
94 | cleantext = re.sub('\\n\s[0-9][0-9]|\\n\s[0-9]', ' ', cleantext)
95 | chunked_list = re.split('\\n\s\ssection\s', cleantext)
96 | elif state == 'sc':
97 | chunked_list = cleantext.split('\n \n')
98 | elif state == 'ks':
99 | chunked_list = re.split('\\nsection\s|sec\.', cleantext)
100 | elif state in ('ne', 'mn'):
101 | chunked_list = re.split('\ssection\s[0-9]\.|\ssec.\s[0-9][0-9]\.|\ssec.\s[0-9]\.', cleantext)
102 | elif state == 'ky':
103 | chunked_list = cleantext.split('\n\n\n section .')
104 | elif state == 'ms':
105 | chunked_list = cleantext.split('\n\n\n section ')
106 | elif state in ('ma', 'nc', 'oh','ut'):
107 | chunked_list = re.split('\ssection\s[0-9][0-9]\.|\ssection\s[0-9]\.', cleantext)
108 | elif state == 'mo':
109 | chunked_list = re.split('\\n\s[0-9][0-9]\.\s|\\n\s[0-9]\.\s', cleantext)
110 | elif state == 'nh':
111 | chunked_list = re.split('\n\n[0-9][0-9]\s|\n\n[0-9]\s', cleantext)
112 | elif state == 'nj':
113 | chunked_list = re.split('\\n\\n\s[0-9][0-9]\.\s|\\n\\n\s[0-9]\.\s', cleantext)
114 | elif state == 'ny':
115 | chunked_list = re.split('\ssection\s[0-9]\.|\.\ss\s[0-9]\.', cleantext)
116 | elif state == 'ok':
117 | chunked_list = re.split('\nsection\s\.\s', cleantext)
118 | elif state == 'va':
119 | chunked_list = re.split('(([A-Z])|[0-9][0-9])\.\s|(([A-Z])|[0-9])\.\s', cleantext)
120 | elif state == 'wi':
121 | chunked_list = re.split('\\n[0-9][0-9]section\s\\n|\\n[0-9]section\s\\n', cleantext)
122 | elif state == 'wv':
123 | chunked_list = re.split('\n\s\([a-z]\)\s', cleantext)
124 | elif state == 'wy':
125 | chunked_list = re.split('\ssection\s[0-9][0-9]\.|\ssection\s[0-9]\.', cleantext)
126 | elif state == 'ca':
127 | chunked_list = re.split('section\s[0-9]\.|sec.\s[0-9][0-9]\.|sec.\s[0-9]\.', cleantext)
128 | elif state == None:
129 | chunked_list = cleantext.split("\n")
130 | else:
131 | chunked_list = cleantext.split("\n")
132 |
133 | return chunked_list
134 |
135 | #Delete empty sections (run before deleting numbers in lines)
136 | def delete_empty_sections(chunked_list):
137 | '''
138 |     description: deletes empty elements in bills
139 | '''
140 | return [x for x in chunked_list if x is not None and len(x)>2]
141 |
142 | #Need to delete number lines for: OR, OK, NE, PA (run before deleting lines)
143 | def delete_numbers_in_lines (chunked_list):
144 | '''
145 |     description:
146 |         cleans pdf extractor errors where line numbers were included in the text
147 | '''
148 | re_string = '\\n\s[0-9][0-9]|\\n[0-9][0-9]|\\n[0-9]|\\n\s[0-9]'
149 | chunked_list = [re.sub(re_string,'',t) for t in chunked_list]
150 | return chunked_list
151 |
152 |
153 |
154 | #Delete multiple new lines for each section
155 | def delete_lines (chunked_list):
156 | '''
157 | description: deletes multiple lines and spaces for each section
158 | '''
159 | chunked_list = [re.sub( '\s+', ' ', x) for x in chunked_list]
160 | return chunked_list
161 |
162 |
163 |
164 | def clean_document(doc_text,doc_type = "text",split_to_section = False,**kwargs):
165 | """text -- document text
166 | doc_type --- the type of the document ( "state_bill", "model_legislation", "None") """
167 |
168 | if doc_type == "state_bill":
169 | doc_text = clean_text(doc_text)
170 | doc_text_sections = split_to_sections(doc_text,kwargs['state_id'])
171 | doc_text_sections = delete_empty_sections(doc_text_sections)
172 | if kwargs['state_id'] in ['or','ok','ne','pa']:
173 | doc_text_sections = delete_numbers_in_lines(doc_text_sections)
174 | doc_text_sections = delete_lines(doc_text_sections)
175 |
176 | elif doc_type == "model_legislation":
177 | doc_text = clean_text(doc_text)
178 | doc_text_sections = doc_text.split('\nsection')
179 | doc_text_sections = delete_empty_sections(doc_text_sections)
180 | doc_text_sections = delete_lines(doc_text_sections)
181 |
182 | elif doc_type == "text":
183 | doc_text = clean_text(doc_text)
184 | doc_text_sections = doc_text.split('\n')
185 | doc_text_sections = delete_empty_sections(doc_text_sections)
186 | doc_text_sections = delete_lines(doc_text_sections)
187 |
188 | if split_to_section == True:
189 | return doc_text_sections
190 | elif split_to_section == False:
191 | return [" ".join(doc_text_sections)]
192 |
193 | #delete boiler plate present in all alec exposed bills after "effective date"
194 | def delete_boiler_plate_alec_exposed (chunked_list):
195 | chunked_list = [re.sub('({effective date).*$', ' ', x) for x in chunked_list]
196 | chunked_list = chunked_list[1:]
197 | return chunked_list
198 |
200 | #good example is test_clean_text('va')
200 |
201 | def test_clean_text(state):
202 | es = Elasticsearch(['54.203.12.145:9200', '54.203.12.145:9200'], timeout=300)
203 | match = es.search(index="state_bills", body={"query": {"match": {'state': state}}})
204 | state_text = match['hits']['hits'][3]['_source']['bill_document_first']
205 |     cleaned_doc = clean_document(state_text,doc_type = "state_bill",state_id = state,split_to_section = False)
206 | return cleaned_doc
207 |
208 | def main():
209 | #Get data from elasticsearch to test
210 |
211 | print test_clean_text("mi")
212 |
213 | if __name__ == "__main__":
214 | main()
215 |
216 |
217 |
218 |
219 |
--------------------------------------------------------------------------------
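
clean_document above is the entry point the rest of the pipeline uses: it lowercases and
de-noises the raw text, applies the state-specific section splitting in split_to_sections, and
either returns the list of sections or rejoins them into one string. A small usage sketch
(the bill snippet is invented; this assumes the repo's Python 2 environment with the lid
package on the path, as in frontend.py):

    from utils.text_cleaning import clean_document

    raw_bill = ("SECTION 1. Short title.\n"
                "  2 This Act may be cited as the Example Act.\n"
                "SEC. 2. Definitions.\n"
                "  3 In this Act, the term state means any state.")

    # state-bill cleaning with Michigan's section-splitting rules, keeping the sections
    sections = clean_document(raw_bill, doc_type="state_bill", state_id="mi", split_to_section=True)

    # generic cleaning, flattened back into a single string
    flat = clean_document(raw_bill, doc_type="text", split_to_section=False)[0]
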
/lid/utils/general_utils.py:
--------------------------------------------------------------------------------
1 | import ujson
2 | import base64
3 | import urllib2
4 | import socket
5 | from ftplib import FTP, error_perm
6 | import re
7 | from StringIO import StringIO
8 | import time
9 | import multiprocessing
10 | import pickle
11 | import multiprocessing as mp
12 | import gc
13 | import signal
14 | import csv
15 | import codecs
16 | import cStringIO
17 |
18 | #######Code from http://www.filosophy.org/post/32/python_function_execution_deadlines__in_simple_examples/ #########
19 |
20 | class TimedOutExc(Exception):
21 | pass
22 |
23 | def deadline(timeout, *args):
24 |
25 | def decorate(f):
26 | def handler(signum, frame):
27 | raise TimedOutExc()
28 |
29 |         def new_f(*args):
30 | 
31 |             signal.signal(signal.SIGALRM, handler)
32 |             signal.alarm(timeout)
33 |             result = f(*args)
34 |             signal.alarm(0)
35 |             return result
36 | new_f.__name__ = f.__name__
37 | return new_f
38 | return decorate
39 |
40 | #######Code from http://www.filosophy.org/post/32/python_function_execution_deadlines__in_simple_examples/ #########
41 |
42 | class UTF8Recoder:
43 | """
44 | Iterator that reads an encoded stream and reencodes the input to UTF-8
45 | """
46 | def __init__(self, f, encoding):
47 | self.reader = codecs.getreader(encoding)(f)
48 |
49 | def __iter__(self):
50 | return self
51 |
52 | def next(self):
53 | return self.reader.next().encode("utf-8")
54 |
55 |
56 | class UnicodeReader():
57 | """
58 | A CSV reader which will iterate over lines in the CSV file "f",
59 | which is encoded in the given encoding.
60 | """
61 |
62 | def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
63 | f = UTF8Recoder(f, encoding)
64 | self.reader = csv.reader(f, dialect=dialect, **kwds)
65 |
66 | def next(self):
67 | row = self.reader.next()
68 | return [unicode(s, "utf-8") for s in row]
69 |
70 | def __iter__(self):
71 | return self
72 |
73 |
74 |
75 |
76 | class UnicodeWriter():
77 | def __init__(self, f, dialect=csv.excel, encoding="utf-8-sig", **kwds):
78 | self.queue = cStringIO.StringIO()
79 | self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
80 | self.stream = f
81 | self.encoder = codecs.getincrementalencoder(encoding)()
82 | def writerow(self, row):
83 | '''writerow(unicode) -> None
84 | This function takes a Unicode string and encodes it to the output.
85 | '''
86 | self.writer.writerow([s.encode("utf-8") for s in row])
87 | data = self.queue.getvalue()
88 | data = data.decode("utf-8")
89 | data = self.encoder.encode(data)
90 | self.stream.write(data)
91 | self.queue.truncate(0)
92 |
93 | def writerows(self, rows):
94 | for row in rows:
95 | self.writerow(row)
96 |
97 | #********DEPRECATED*************
98 | class WorkerPool():
99 |
100 | def __init__(self,num_workers=1,worker_timeout = 600):
101 |
102 | self._num_workers = num_workers
103 | self._worker_timeout = worker_timeout
104 | self._results = mp.Queue()
105 | self._pool = [None]*self._num_workers
106 | self._worker_times = [0.0]*self._num_workers
107 |
108 | def _assign_new_task(self,worker_id,input_args):
109 | p = self._pool[worker_id]
110 | p.join()
111 | arg = input_args.pop()
112 | new_p = mp.Process(target= func,args = (arg,self._results),name = ('process_'+str(worker_id)))
113 | new_p.start()
114 | self._pool[worker_id] = new_p
115 | self._worker_times[worker_id] = time.time()
116 |
117 | def work(self,func,input_args):
118 | worker_counter = 0
119 | #define wrapper function that queues result from input func
120 | def new_func(x):
121 | y = func(*x)
122 | self._results.put(y)
123 |
124 |
125 | while len(input_args) > 0 or ("running" in status):
126 |
127 | #assign new worker tasks to empty pool slots
128 | for i in range(self._num_workers):
129 |
130 | if len(input_args) > 0 and self._pool[i] is None:
131 | arg = input_args.pop(0)
132 | new_p = mp.Process(target= new_func,args = (arg,),name = ('process_'+str(i)))
133 | new_p.start()
134 | print worker_counter
135 | worker_counter+=1
136 | self._pool[i] = new_p
137 | self._worker_times[i] = time.time()
138 |
139 | time.sleep(0.1)
140 | status = self.check_pool_status(time.time())
141 | import numpy as np
142 | print time.time() - np.array(self._worker_times)
143 | for i in range(len(status)):
144 | if status[i] == "completed":
145 | p = self._pool[i]
146 | p.terminate()
147 | p.join()
148 | self._pool[i] = None
149 | del p
150 | elif status[i] == "timeout":
151 | p = self._pool[i]
152 | p.terminate()
153 | self._pool[i] = None
154 | print "terminated job ",p.name
155 | gc.collect()
156 |
157 | result_list = []
158 |
159 | while not self._results.empty():
160 | result_list.append( self._results.get() )
161 |
162 | return result_list
163 |
164 | #returns a list of bools indicating running status of each worker.
165 | #running,timeout,completed
166 | def check_pool_status(self,current_time):
167 | status_list = []
168 | for i in range(self._num_workers):
169 |
170 | worker = self._pool[i]
171 | if worker is None:
172 | status_list.append("closed")
173 |             elif worker.is_alive() and (current_time-self._worker_times[i]) < self._worker_timeout:
174 |                 status_list.append("running")
175 |             elif worker.is_alive() and (current_time-self._worker_times[i]) >= self._worker_timeout:
176 |                 status_list.append("timeout")
177 | elif not worker.is_alive():
178 | status_list.append("completed")
179 |
180 | return status_list
181 | # ********DEPRECATED*************
182 |
183 |
184 | def alignment_tokenizer(s,type = "space"):
185 | if type == "space":
186 | s = s.split(" ")
187 | return s
188 |
189 | #creates a searalized json object for bill sources
190 | def bill_source_to_json(url,source,date):
191 | jsonObj = {}
192 | jsonObj['url'] = url
193 | jsonObj['date'] = date
194 | jsonObj['source'] = base64.b64encode(source)
195 |
196 | return ujson.encode(jsonObj)
197 |
198 | #creates a json object for bill sources (not encoded)
199 | def bill_source_to_json_not_encoded(url,source,date):
200 | jsonObj = {}
201 | jsonObj['url'] = url
202 | jsonObj['date'] = date
203 | jsonObj['source'] = source
204 |
205 | return ujson.encode(jsonObj)
206 |
207 | #wrapper for urllib2.urlopen that catches URLERROR and socket error
208 | def fetch_url(url):
209 |
210 | #fetch ftp file
211 | if 'ftp://' in url:
212 |
213 | try:
214 | domain_pattern = re.compile("/[A-Za-z0-9\.]+")
215 | domain_name = domain_pattern.search(url).group(0)[1:]
216 | ftp = FTP(domain_name,timeout=10)
217 | ftp.login()
218 | file_name = "/".join(url.split("/")[3:])
219 |
220 | r = StringIO()
221 | ftp.retrbinary('RETR {0}'.format(file_name), r.write)
222 | document = r.getvalue()
223 | time.sleep(1)
224 |
225 | except (KeyboardInterrupt, SystemExit):
226 | raise
227 | except:
228 | document = None
229 |
230 |
231 | return document
232 |
233 | #fetch http file
234 | else:
235 |
236 | try:
237 | req = urllib2.urlopen(url,timeout=10)
238 | document = req.read()
239 | except (KeyboardInterrupt, SystemExit):
240 | raise
241 | except:
242 | document = None
243 |
244 | return document
245 |
246 | #used to find alignments in broader text
247 | def find_subsequence(s,q):
248 | '''
249 | is the list s contained in q in order and if it is what are indices
250 | '''
251 | for i in range(len(q)):
252 | T = True
253 | for j in range(len(s)):
254 | if s[j] != q[i+j]:
255 | T = False
256 | break
257 | if T:
258 | return (i, i + j + 1)
259 | return (0,0)
260 |
261 |
262 | def load_pickle(name):
263 | with open('{0}.p'.format(name),'rb') as fp:
264 | f =pickle.load(fp)
265 |
266 | return f
267 |
268 |
269 | def save_pickle(thing, name):
270 | with open('{0}.p'.format(name),'wb') as fp:
271 | pickle.dump(thing, fp)
272 |
--------------------------------------------------------------------------------
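
Two helpers above are used throughout the repo: the deadline decorator, which arms SIGALRM and
raises TimedOutExc if the wrapped call runs past its time limit, and find_subsequence, which
locates an aligned token list inside a longer document. A quick sketch of both (slow_job and the
token lists are made up; SIGALRM only exists on Unix, and the module itself is Python 2):

    import time
    from utils.general_utils import deadline, TimedOutExc, find_subsequence

    @deadline(2)                      # raise TimedOutExc if the call takes longer than 2 seconds
    def slow_job(seconds):
        time.sleep(seconds)
        return "finished"

    try:
        slow_job(5)
    except TimedOutExc:
        print("timed out")

    # where does the aligned snippet sit inside the full token list?
    doc = ['the', 'act', 'shall', 'take', 'effect', 'immediately']
    print(find_subsequence(['shall', 'take', 'effect'], doc))   # (2, 5)
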
/lid/frontend.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import os
3 | import pdb
4 | import sys
5 | import argparse
6 | import datetime as dt
7 | import time
8 | from collections import defaultdict
9 | import cherrypy
10 | from jinja2 import Environment, FileSystemLoader, Template
11 | import random
12 | import string
13 | import json
14 | from elasticsearch import Elasticsearch
15 | from database import ElasticConnection
16 | import re
17 | import nltk
18 | from utils.text_cleaning import clean_document
19 | from lid import LID
20 | from utils.general_utils import alignment_tokenizer
21 | from text_alignment import LocalAligner,AffineLocalAligner
22 |
23 |
24 |
25 | def get_alignment_highlight(text1,text2):
26 | aligns = align(text1, text2)
27 | alignment = aligns[0]
28 | seq1 = nltk.word_tokenize(text1)
29 | seq2 = nltk.word_tokenize(text2)
30 | align_clean_1, align_clean_2 = cleanAlignment(alignment)
31 | [i,j] = contains(align_clean_1, seq1)
32 | [k,l] = contains(align_clean_2, seq2)
33 | seq1.insert(i,"")
34 | seq1.insert(j,"")
35 | seq2.insert(k,"")
36 | seq2.insert(l,"")
37 |
38 | text1 = " ".join(seq1)
39 | text2 = " ".join(seq2)
40 |
41 | return text1,text2
42 |
43 |
44 |
45 | def markup_alignment_for_display(alignment_dict,left_text,right_text):
46 |
47 | left_text = left_text.split()
48 | right_text = right_text.split()
49 | l = alignment_dict['left']
50 | r = alignment_dict['right']
51 | left_start = alignment_dict['left_start']
52 | left_end = alignment_dict['left_end']
53 | right_start = alignment_dict['right_start']
54 | right_end = alignment_dict['right_end']
55 |
56 |
57 |
58 | #mark up l and r alignments with style
59 | l_styled = []
60 | r_styled = []
61 | temp_text = ""
62 | for i in range(len(l)):
63 | if l[i] == r[i] and l[i] != "-":
64 | temp_text+=l[i]
65 | temp_text+=" "
66 | if l[i] != r[i]:
67 | if len(temp_text)>0:
68 | temp_text = u"{0}".format(temp_text)
69 | l_styled.append(temp_text)
70 | r_styled.append(temp_text)
71 | temp_text = ""
72 | if l[i] != "-" and r[i] != "-":
73 | l_styled.append(u"{0}".format(l[i]))
74 | r_styled.append(u"{0}".format(r[i]))
75 | else:
76 | l_styled.append(l[i])
77 | r_styled.append(r[i])
78 |
79 | temp_text = u"{0}".format(temp_text)
80 | l_styled.append(temp_text)
81 | r_styled.append(temp_text)
82 |
83 | #l[i] = "{0}".format(l[i])
84 | #r[i] = "{0}".format(r[i])
85 |
86 | #l.insert(0,"")
87 | #l.append("")
88 | #r.insert(0,"")
89 | #r.append("")
90 |
91 |     padding = [u""]
92 |
93 | left_text = left_text[:left_start]+padding+l_styled+\
94 | padding+left_text[left_end:]
95 |
96 | right_text = right_text[:right_start]+padding+r_styled+padding\
97 | +right_text[right_end:]
98 |
99 | left_text = u" ".join(left_text)
100 | right_text = u" ".join(right_text)
101 |
102 | return left_text,right_text
103 |
104 |
105 |
106 |
107 | def markup_alignment_difference(l,r):
108 | l_styled = []
109 | r_styled = []
110 | temp_text = ""
111 | for i in range(len(l)):
112 | if l[i] != r[i]:
113 | l[i] = u"{0}".format(l[i])
114 | r[i] = u"{0}".format(r[i])
115 |
116 | return l,r
117 |
118 |
119 | class DemoWebserver(object):
120 |
121 | _cp_config = {
122 | 'tools.staticdir.on' : True,
123 | 'tools.staticdir.dir' : "{0}/html".format(os.environ['POLICY_DIFFUSION']),
124 | 'tools.staticdir.index' : '/templates/searchdemo.html.jinja',
125 | 'tools.sessions.on': True,
126 | }
127 |
128 |
129 |
130 | def __init__(self,elastic_connection):
131 | self.ec = elastic_connection
132 | self.lidy = LID(elastic_host = os.environ['ELASTICSEARCH_IP'],
133 | query_results_limit=os.environ['QUERY_RESULTS_LIMIT'])
134 |
135 | self.aligner = LocalAligner()
136 | #self.query_bill = "bill"
137 |
138 |
139 | @cherrypy.expose
140 | def searchdemo(self, query_string="proof of identity", query_bill = "bill", query_results=[]):
141 |
142 | query_string = re.sub('\"',' ',query_string)
143 |
144 | if query_bill == "model legislation":
145 |
146 | query_result = lidy.find_model_legislation_alignments(query_string, document_type="text",
147 | split_sections=False, query_document_id="front_end_query")
148 |
149 | results_to_show = []
150 |
151 | for result_doc in query_result['alignment_results']:
152 |
153 | meta_data = result_doc['document_id'].replace('old_bills', 'oldbills').split('_')
154 | meta_data = [meta_data[0].upper(),meta_data[1].upper(),meta_data[2]]
155 |
156 | result_text = ec.get_model_legislation_by_id(result_doc['document_id'])['source']
157 | result_text = re.sub('\"',' ',result_text)
158 |
159 | alignment = result_doc['alignments'][0]
160 | score = alignment['score']
161 |
162 | left,right = markup_alignment_for_display(alignment,
163 | query_string, result_text)
164 | left = re.sub('\"',' ',left)
165 | right = re.sub('\"',' ',right)
166 | results_to_show.append([score] + meta_data + [left,right])
167 |
168 | results_to_show.sort(key = lambda x:x[0],reverse = True)
169 |
170 | tmpl = env.get_template("searchdemo.html.jinja")
171 | c = {
172 | 'query_string': query_string,
173 | 'results_to_show': results_to_show,
174 | }
175 | return tmpl.render(**c)
176 |
177 |
178 | if query_bill == "constitution":
179 |
180 | query_result = constitution_lidy.find_constitution_alignments(query_string, document_type="text",
181 | split_sections=True, query_document_id="text")
182 |
183 | results_to_show = []
184 |
185 | for result_doc in query_result['alignment_results']:
186 |
187 | state = result_doc['document_id'][:-5].upper()
188 | year = result_doc['document_id'][-4:]
189 | meta_data = ["CONSTITUTION", state, year]
190 |
191 | result_text = ec.get_constitution_by_id(result_doc['document_id'])['constitution']
192 | result_text = re.sub('\"',' ',result_text)
193 | print result_text
194 |
195 | alignment = result_doc['alignments'][0]
196 | score = alignment['score']
197 |
198 | left,right = markup_alignment_for_display(alignment,
199 | query_string, result_text)
200 | left = re.sub('\"',' ',left)
201 | right = re.sub('\"',' ',right)
202 | results_to_show.append([score] + meta_data + [left,right])
203 |
204 | results_to_show.sort(key = lambda x:x[0],reverse = True)
205 |
206 | tmpl = env.get_template("searchdemo.html.jinja")
207 | c = {
208 | 'query_string': query_string,
209 | 'results_to_show': results_to_show,
210 | }
211 | return tmpl.render(**c)
212 |
213 |
214 | else:
215 | query_result = lidy.find_state_bill_alignments(query_string, document_type="text",
216 | split_sections=False, query_document_id="front_end_query")
217 |
218 | results_to_show = []
219 |
220 | for result_doc in query_result['alignment_results']:
221 |
222 | meta_data = result_doc['document_id'].split("_")
223 | meta_data = [meta_data[0].upper(),meta_data[1].upper(),meta_data[2]]
224 |
225 | result_text = ec.get_bill_by_id(result_doc['document_id'])['bill_document_last']
226 | result_text = re.sub('\"',' ',result_text)
227 |
228 | alignment = result_doc['alignments'][0]
229 | score = alignment['score']
230 |
231 | left,right = markup_alignment_for_display(alignment,
232 | query_string,result_text)
233 | left = re.sub('\"',' ',left)
234 | right = re.sub('\"',' ',right)
235 | results_to_show.append([score] + meta_data + [left,right])
236 |
237 | results_to_show.sort(key = lambda x:x[0],reverse = True)
238 |
239 | tmpl = env.get_template("searchdemo.html.jinja")
240 | c = {
241 | 'query_string': query_string,
242 | 'results_to_show': results_to_show,
243 | }
244 | return tmpl.render(**c)
245 |
246 |
247 |
248 | if __name__ == '__main__':
249 | policy_diffusion_path=os.environ['POLICY_DIFFUSION']
250 | ec_ip = os.environ['ELASTICSEARCH_IP']
251 | parser = argparse.ArgumentParser()
252 | parser.add_argument('--host', type=str, default='0.0.0.0')
253 | parser.add_argument('--port', type=int, default=29010)
254 | parser.add_argument('--elasticsearch_connection',default=u"{0}:9200".format(ec_ip))
255 | args = parser.parse_args()
256 |
257 | env = Environment(loader=FileSystemLoader("{0}/html/templates".format(policy_diffusion_path)))
258 |
259 | query_samples = [x.strip() for x in open("{0}/data/state_bill_samples.txt".format(policy_diffusion_path))]
260 |
261 | aligner = AffineLocalAligner(match_score=4, mismatch_score=-1, gap_start=-3, gap_extend=-1.5)
262 |
263 | ec = ElasticConnection(host = ec_ip)
264 |
265 | lidy = LID(query_results_limit=20, elastic_host=ec_ip,
266 | lucene_score_threshold=0.01, aligner=aligner)
267 |
268 | constitution_lidy = LID(query_results_limit=10000,
269 | elastic_host=ec_ip, lucene_score_threshold=0.01,
270 | aligner=aligner)
271 |
272 |
273 | es_host,es_port = args.elasticsearch_connection.split(":")
274 | cherrypy.config.update({'server.socket_port': args.port, 'server.socket_host': args.host})
275 | cherrypy.quickstart(DemoWebserver(ec), "/")
276 |
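# Minimal usage sketch (assumes the server above is running on the default
# host/port and that the `requests` package is installed; neither assumption
# is part of this module):
#
#   import requests
#   resp = requests.get("http://localhost:29010/searchdemo",
#                       params={"query_string": "proof of identity",
#                               "query_bill": "bill"})
#   # resp.text is the rendered searchdemo.html.jinja page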
--------------------------------------------------------------------------------
/lid/evaluation/bills_for_evaluation_set.py:
--------------------------------------------------------------------------------
1 | from elasticsearch import Elasticsearch
2 | import re
3 | import csv
4 | import urllib2
5 | import urllib
6 | from urllib import urlopen
7 | from tika import parser
8 | import pickle
9 |
10 |
11 | def create_bills(ls):
12 | '''
13 | args:
14 | ls: list of lists of urls that correspond to matches
15 |
16 | returns:
17 | dictionary grouped by matches
18 | '''
19 | k = 0
20 | bill_id = 0
21 | bills = {}
22 | bad_count = 0
23 | for urls in ls:
24 | for url,state in urls:
25 | try:
26 | print "bill_id: " + str(bill_id)
27 | bills[bill_id] = {}
28 | doc = urllib2.urlopen(url).read()
29 | text = parser.from_buffer(doc)['content']
30 | bills[bill_id]['url'] = url
31 | bills[bill_id]['text'] = text
32 | bills[bill_id]['match'] = k
33 | bills[bill_id]['state'] = state
34 | except:
35 | pass
36 | bad_count += 1
37 | print 'bad_count: ', bad_count
38 | bill_id += 1
39 | k += 1
40 |
41 | #get more evaluation bills
42 | eval_bills = grab_more_eval_bills()
43 | for more_bills in eval_bills:
44 | print 'bill_group: ', k
45 | k +=1
46 | for text, state in more_bills:
47 | bill_id += 1
48 | print 'bill_id: ', bill_id
49 |
50 | bills[bill_id] = {}
51 | bills[bill_id]['text'] = text
52 | bills[bill_id]['state'] = state
53 | bills[bill_id]['match'] = k
54 |
55 | try:
56 | for bill in bills.keys():
57 | if bills[bill] == {} or bills[bill]['text'] == '' \
58 | or bills[bill]['text'] == None:
59 |
60 | del bills[bill]
61 | except:
62 | pass
63 |
64 | return bills
65 |
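# Shape of the dictionary returned by create_bills (field names taken from the
# assignments above; the values shown are placeholders, not real data):
#
#   {0: {'url': 'http://...', 'text': '...extracted bill text...',
#        'match': 0, 'state': 'az'},
#    1: {...},
#    ...}
#
# 'match' is the group id shared by bills that are known matches of one another.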
66 | def get_bill_by_id(unique_id):
67 | es = Elasticsearch(['54.203.12.145:9200', '54.203.12.145:9200'], timeout=300)
68 | match = es.search(index="state_bills", body={"query": {"match": {'unique_id': unique_id}}})
69 | bill_text = match['hits']['hits'][0]['_source']['bill_document_first']
70 | return bill_text
71 |
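# Usage sketch: unique_id follows the "<state>_<session>_<billnumber>" pattern
# assembled in grab_more_eval_bills below, e.g. (hypothetical id)
#
#   text = get_bill_by_id('mo_2011_SB213')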
72 | def grab_more_eval_bills():
73 | with open('../../data/evaluation_set/bills_for_evaluation_set.csv') as f:
74 | bills_list = [row for row in csv.reader(f.read().splitlines())]
75 |
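# Column layout of bills_for_evaluation_set.csv (see data/evaluation_set):
# 0 topic, 1 state, 2 bill number, 3 year/session, 4 sponsor, 5 status, 6 url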
76 | bill_ids_list = []
77 | url_lists = []
78 | topic_list = []
79 | for i in range(len(bills_list)):
80 | state = bills_list[i][1]
81 | if state == 'ct':
82 | continue
83 | topic = bills_list[i][0]
84 | bill_number = bills_list[i][2]
85 | bill_number = re.sub(' ', '', bill_number)
86 | year = bills_list[i][3]
87 | url = bills_list[i][6]
88 | unique_id = str(state + '_' + year + '_' + bill_number)
89 | topic_list.append(topic)
90 | bill_ids_list.append(unique_id)
91 | url_lists.append(url)
92 |
93 | bills_ids = zip(bill_ids_list, url_lists)
94 |
95 | bad_count = 0
96 | bills_text, state_list = [], []
97 | kept_topics = []
98 | for i in range(len(bills_ids)):
99 | try:
100 | bill_text = get_bill_by_id(bills_ids[i][0])
101 | except IndexError:
102 | try:
103 | url = bills_ids[i][1]
104 | doc = urllib.urlopen(url).read()
105 | bill_text = parser.from_buffer(doc)['content']
106 | print url
107 | except IOError:
108 | bad_count += 1
109 | print 'bad_count: ', bad_count
110 | #skip this case
111 | continue
112 | bills_text.append(bill_text)
113 | state_list.append(bills_ids[i][0][0:2])
114 | kept_topics.append(topic_list[i])
115 |
116 | bills_state = zip(bills_text, state_list, kept_topics)
117 |
118 | bill_type_1 = []
119 | bill_type_2 = []
120 | for bill in bills_state:
121 | if bill[-1] == 'Adult Guardianship and Protective Proceedings Jurisdiction Act':
122 | bill_type_1.append((bill[0],bill[1]))
123 | else:
124 | bill_type_2.append((bill[0],bill[1]))
125 |
126 | return [bill_type_2, bill_type_1]
127 |
128 | def create_save_bills(bill_list):
129 | bills = create_bills(bill_list)
130 | with open('../../data/evaluation_set/labeled_bills.p', 'wb') as fp:
131 | pickle.dump(bills, fp)
132 |
133 | return bills
134 |
135 |
136 | if __name__ == '__main__':
137 | #each list in this list of lists contains bills that are matches
138 | similar_bills = [[('http://www.azleg.gov/legtext/52leg/1r/bills/hb2505p.pdf', 'az'),
139 | ('http://www.legis.state.ak.us/basis/get_bill_text.asp?hsid=SB0012B&session=29', 'ak' ),
140 | ('http://www.capitol.hawaii.gov/session2015/bills/HB9_.PDF', 'hi'),
141 | ('http://www.capitol.hawaii.gov/session2015/bills/HB1047_.PDF', 'hi'),
142 | ('http://flsenate.gov/Session/Bill/2015/1490/BillText/Filed/HTML','fl'),
143 | ('http://ilga.gov/legislation/fulltext.asp?DocName=09900SB1836&GA=99&SessionId=88&DocTypeId=SB&LegID=88673&DocNum=1836&GAID=13&Session=&print=true','il'),
144 | ('http://www.legis.la.gov/Legis/ViewDocument.aspx?d=933306', 'la'),
145 | ('http://mgaleg.maryland.gov/2015RS/bills/sb/sb0040f.pdf', 'md'),
146 | ('http://www.legislature.mi.gov/documents/2015-2016/billintroduced/House/htm/2015-HIB-4167.htm', 'mi'),
147 | ('https://www.revisor.mn.gov/bills/text.php?number=HF549&version=0&session=ls89&session_year=2015&session_number=0','mn'),
148 | ('http://www.njleg.state.nj.us/2014/Bills/A2500/2354_R2.HTM','nj'),
149 | ('http://assembly.state.ny.us/leg/?sh=printbill&bn=A735&term=2015','ny'),
150 | ('http://www.ncga.state.nc.us/Sessions/2015/Bills/House/HTML/H270v1.html','nc'),
151 | ('https://olis.leg.state.or.us/liz/2015R1/Downloads/MeasureDocument/HB2005/A-Engrossed','or'),
152 | ('https://olis.leg.state.or.us/liz/2015R1/Downloads/MeasureDocument/SB947/Introduced','or'),
153 | ('http://www.legis.state.pa.us/CFDOCS/Legis/PN/Public/btCheck.cfm?txtType=HTM&sessYr=2015&sessInd=0&billBody=H&billTyp=B&billNbr=0624&pn=0724', 'pa'),
154 | ('http://www.scstatehouse.gov/sess121_2015-2016/prever/172_20141203.htm','sc'),
155 | ('http://lawfilesext.leg.wa.gov/Biennium/2015-16/Htm/Bills/House%20Bills/1356.htm', 'wa'),
156 | ('http://www.legis.state.wv.us/Bill_Status/bills_text.cfm?billdoc=hb2874%20intr.htm&yr=2015&sesstype=RS&i=2874','wv'),
157 | ('http://www.legis.state.wv.us/Bill_Status/bills_text.cfm?billdoc=hb2874%20intr.htm&yr=2015&sesstype=RS&i=2874', 'wv'),
158 | # ('ftp://ftp.cga.ct.gov/2015/tob/h/2015HB-06784-R00-HB.htm','ct'),
159 | ('http://www.capitol.hawaii.gov/session2015/bills/SB129_.PDF','hi'),
160 | ('http://nebraskalegislature.gov/FloorDocs/104/PDF/Intro/LB493.pdf', 'ne'),
161 | ('http://www.gencourt.state.nh.us/legislation/2015/HB0600.html', 'nh')],
162 | [('http://alecexposed.org/w/images/2/2d/7K5-No_Sanctuary_Cities_for_Illegal_Immigrants_Act_Exposed.pdf', 'model_legislation'),
163 | ('http://www.kslegislature.org/li_2012/b2011_12/measures/documents/hb2578_00_0000.pdf', 'ks'),
164 | ('http://flsenate.gov/Session/Bill/2011/0237/BillText/Filed/HTML','fl'),
165 | ('http://openstates.org/al/bills/2012rs/SB211/','al'),
166 | ('http://le.utah.gov/~2011/bills/static/HB0497.html','ut'),
167 | ('http://webserver1.lsb.state.ok.us/cf_pdf/2013-14%20FLR/HFLR/HB1436%20HFLR.PDF','ok')],
168 | [('http://www.alec.org/model-legislation/the-disclosure-of-hydraulic-fracturing-fluid-composition-act/', 'model_legislation'),
169 | ('ftp://ftp.legis.state.tx.us/bills/82R/billtext/html/house_bills/HB03300_HB03399/HB03328S.htm', 'tx')],
170 | [('http://www.legislature.mi.gov/(S(ntrjry55mpj5pv55bv1wd155))/documents/2005-2006/billintroduced/House/htm/2005-HIB-5153.htm', 'mi'),
171 | ('http://www.schouse.gov/sess116_2005-2006/bills/4301.htm','sc'),
172 | ('http://www.lrc.ky.gov/record/06rs/SB38.htm', 'ky'),
173 | ('http://www.okhouse.gov/Legislation/BillFiles/hb2615cs%20db.PDF', 'ok'),
174 | ('http://state.tn.us/sos/acts/105/pub/pc0210.pdf', 'tn'),
175 | ('https://docs.legis.wisconsin.gov/2011/related/proposals/ab69', 'wi'),
176 | ('http://legisweb.state.wy.us/2008/Enroll/HB0137.pdf', 'wy'),
177 | ('http://www.kansas.gov/government/legislative/bills/2006/366.pdf', 'ks'),
178 | ('http://billstatus.ls.state.ms.us/documents/2006/pdf/SB/2400-2499/SB2426SG.pdf', 'mi')],
179 | [('http://www.alec.org/model-legislation/state-withdrawal-from-regional-climate-initiatives/', 'model_legislation'),
180 | ('http://www.legislature.mi.gov/documents/2011-2012/resolutionintroduced/House/htm/2011-HIR-0134.htm', 'mi'),
181 | ('http://www.nmlegis.gov/Sessions/11%20Regular/memorials/house/HJM024.html', 'nm')],
182 | [('http://alecexposed.org/w/images/9/90/7J1-Campus_Personal_Protection_Act_Exposed.pdf', 'model_legislation'),
183 | ('ftp://ftp.legis.state.tx.us/bills/831/billtext/html/house_bills/HB00001_HB00099/HB00056I.htm', 'tx')],
184 | # [
185 | # ('http://essexuu.org/ctstat.html', 'ct'), we don't have Connecticut
186 | # ('http://alisondb.legislature.state.al.us/alison/codeofalabama/constitution/1901/CA-170364.htm', 'al')],
187 | [('http://www.legis.state.ak.us/basis/get_bill_text.asp?hsid=HB0162A&session=27', 'ak'),
188 | ('https://legiscan.com/AL/text/HB19/id/327641/Alabama-2011-HB19-Enrolled.pdf', 'al'),
189 | ('http://www.leg.state.co.us/clics/clics2012a/csl.nsf/fsbillcont3/0039C9417C9D9D5D87257981007F3CC9?open&file=1111_01.pdf', 'co'),
190 | ('http://www.capitol.hawaii.gov/session2012/Bills/HB2221_.PDF', 'hi'),
191 | ('http://ilga.gov/legislation/fulltext.asp?DocName=09700HB3058&GA=97&SessionId=84&DocTypeId=HB&LegID=60409&DocNum=3058&GAID=11&Session=&print=true', 'il'),
192 | ('http://coolice.legis.iowa.gov/Legislation/84thGA/Bills/SenateFiles/Introduced/SF142.html', 'ia'),
193 | ('ftp://www.arkleg.state.ar.us/Bills/2011/Public/HB1797.pdf','ar'),
194 | ('http://billstatus.ls.state.ms.us/documents/2012/html/HB/0900-0999/HB0921SG.htm', 'ms'),
195 | ('http://www.leg.state.nv.us/Session/76th2011/Bills/SB/SB373.pdf', 'nv'),
196 | ('http://www.njleg.state.nj.us/2012/Bills/A1000/674_I1.HTM', 'nj'),
197 | ('http://webserver1.lsb.state.ok.us/cf_pdf/2011-12%20INT/hB/HB2821%20INT.PDF', 'ok'),
198 | ('http://www.legis.state.pa.us/CFDOCS/Legis/PN/Public/btCheck.cfm?txtType=PDF&sessYr=2011&sessInd=0&billBody=H&billTyp=B&billNbr=0934&pn=1003', 'pa'),
199 | ('http://www.capitol.tn.gov/Bills/107/Bill/SB0016.pdf', 'tn')],
200 | [('http://www.legislature.idaho.gov/idstat/Title39/T39CH6SECT39-608.htm', 'id'),
201 | ('http://www.legis.nd.gov/cencode/t12-1c20.pdf?20150708171557', 'nd')]
202 | ]
203 |
204 | bills = create_save_bills(similar_bills)
205 |
206 |
207 |
208 |
209 |
210 |
211 |
212 |
--------------------------------------------------------------------------------
/data/evaluation_set/bills_for_evaluation_set.csv:
--------------------------------------------------------------------------------
1 | Adult Guardianship and Protective Proceedings Jurisdiction Act,ms,HB 55,2011,Moak,Introduced,
Adult Guardianship and Protective Proceedings Jurisdiction Act,va,SB 750,2011,Howell,Enacted,
Adult Guardianship and Protective Proceedings Jurisdiction Act,ky,HB 164,2011 Regular Session,Marzian,Enacted,
Adult Guardianship and Protective Proceedings Jurisdiction Act,mo,SB 213,2011,Schaefer,Enacted,https://legiscan.com/MO/text/SB213/id/294359/Missouri-2011-SB213-Enrolled.pdf
Adult Guardianship and Protective Proceedings Jurisdiction Act,ar,SB 4,2011,Johnson,Enacted,
Adult Guardianship and Protective Proceedings Jurisdiction Act,ct,SB 1053,2011,,Introduced,
Adult Guardianship and Protective Proceedings Jurisdiction Act,sd,HB 1062,2011,Lust,Enacted,http://legis.sd.gov/docs/legsession/2011/Bills/HB1062HJU.pdf
Adult Guardianship and Protective Proceedings Jurisdiction Act,nm,SB 146,2011,Payne,Enacted,
Adult Guardianship and Protective Proceedings Jurisdiction Act,id,SB 1056,2011,,Enacted,http://legislature.idaho.gov/legislation/2011/S1056.pdf
Adult Guardianship and Protective Proceedings Jurisdiction Act,ma,H 2181,187th,Gobi,Introduced,
Adult Guardianship and Protective Proceedings Jurisdiction Act,me,LD 1377,2012,Sanborn,Enacted,http://www.mainelegislature.org/legis/bills/getPDF.asp?paper=HP1016&item=1&snum=125
Adult Guardianship and Protective Proceedings Jurisdiction Act,fl,HB 1431,2010,Schwartz,Introduced,http://static-lobbytools.s3.amazonaws.com/bills/2010/pdf/1431.pdf
Adult Guardianship and Protective Proceedings Jurisdiction Act,ri,HB7687,2014,Craven/McCaffrey,Introduced,
Adult Guardianship and Protective Proceedings Jurisdiction Act,ny,SB 7464,2012,Hannon,Introduced,https://legiscan.com/NY/text/S07464/id/646869/New_York-2011-S07464-Introduced.html
Adult Guardianship and Protective Proceedings Jurisdiction Act,wy,SB 39,2013,Ross,Enacted,https://legiscan.com/WY/text/SF0039/2013
Adult Guardianship and Protective Proceedings Jurisdiction Act,ma,H 1366,188th,Gobi,Introduced,
Adult Guardianship and Protective Proceedings Jurisdiction Act,ms,SB 2655,2013,Hopson,Introduced,
Adult Guardianship and Protective Proceedings Jurisdiction Act,sc,SB 1070,2010,Hayes,Enacted,http://www.scstatehouse.gov/sess118_2009-2010/prever/1070_20100120.htm
Adult Guardianship and Protective Proceedings Jurisdiction Act,az,HB 2426,49th-2nd-regular,Driggs,Enacted,
Adult Guardianship and Protective Proceedings Jurisdiction Act,ri,HB 5958,2011,Kennedy,Introduced,https://legiscan.com/RI/text/H5958/id/268260/Rhode_Island-2011-H5958-Draft.pdf
Adult Guardianship and Protective Proceedings Jurisdiction Act,nh,SB 209,2015,Stiles,Enacted,
Adult Guardianship and Protective Proceedings Jurisdiction Act,ri,SB525,2015,Lombardi/Craven,Introduced,
Adult Guardianship and Protective Proceedings Jurisdiction Act,ia,HF 734,2010,,Enacted,https://www.legis.iowa.gov/DOCS/IowaActs/83/2/pdf/Chapter_1086.pdf
Adult Guardianship and Protective Proceedings Jurisdiction Act,vt,SB 169,2010,Campbell,Introduced,https://legiscan.com/VT/text/S0169/id/384141/Vermont-2009-S0169-Introduced.pdf
Adult Guardianship and Protective Proceedings Jurisdiction Act,md,SB 231,2010,Kelley,Enacted,
Adult Guardianship and Protective Proceedings Jurisdiction Act,ok,SB 2204,2010,Ivester,Enacted,http://www.oklegislature.gov/cf_pdf/2009-10%20ENR/sb/sb2204%20enr.pdf
Adult Guardianship and Protective Proceedings Jurisdiction Act,va,SB 80,2010,Howell,Introduced,
Adult Guardianship and Protective Proceedings Jurisdiction Act,nj,AB 4253,2011,DiCicco,Introduced,http://www.njleg.state.nj.us/2010/Bills/A4500/4253_I1.HTM
Adult Guardianship and Protective Proceedings Jurisdiction Act,hi,SB 2318,2012,Chun,Enacted,https://legiscan.com/HI/text/SB2318/id/544560/Hawaii-2012-SB2318-Introduced.html
Adult Guardianship and Protective Proceedings Jurisdiction Act,pa,HB 1720,2012,Hennessey,Enacted,http://www.legis.state.pa.us/CFDOCS/Legis/PN/Public/btCheck.cfm?txtType=HTM&sessYr=2011&sessInd=0&billBody=H&billTyp=B&billNbr=1720&pn=2589
Adult Guardianship and Protective Proceedings Jurisdiction Act,ms,HB 191,2012,Moak,Introduced,
Adult Guardianship and Protective Proceedings Jurisdiction Act,ct,HB 5150,2012,,Enacted,
Adult Guardianship and Protective Proceedings Jurisdiction Act,nj,A 2628,215,Rudder,Enacted,
Adult Guardianship and Protective Proceedings Jurisdiction Act,mi,SB 466,2013-2014,Schuitmaker,Introduced,
Adult Guardianship and Protective Proceedings Jurisdiction Act,ma,S 2249,188th,,Enacted,
Adult Guardianship and Protective Proceedings Jurisdiction Act,ms,SB 2240,2014,Hopson,Enacted,
Adult Guardianship and Protective Proceedings Jurisdiction Act,ny,A 857,2013-2014,Weinstein,Introduced,
Adult Guardianship and Protective Proceedings Jurisdiction Act,tn,SB 444,2010,Stewart,Enacted,https://legiscan.com/TN/text/SB0444/id/461093/Tennessee-2009-SB0444-Draft.pdf
Adult Guardianship and Protective Proceedings Jurisdiction Act,tx,HB 2998,84,Rodriguez,Introduced,
Adult Guardianship and Protective Proceedings Jurisdiction Act,ga,SB 207,2015_16,McKoon,Introduced,
Adult Guardianship and Protective Proceedings Jurisdiction Act,nc,HB 817,2015,Hurley,Introduced,
Adult Guardianship and Protective Proceedings Jurisdiction Act,ct,SB 426,2010,,Introduced,http://www.cga.ct.gov/2010/FC/2010SB-00426-R000518-FC.htm
Adult Guardianship and Protective Proceedings Jurisdiction Act,mn,SF412,2009-2010,Moua,Enacted,
Adult Guardianship and Protective Proceedings Jurisdiction Act,nm,SB 497,2009,Wirth,Introduced,http://www.nmlegis.gov/Sessions/09%20Regular/bills/senate/SB0497.pdf
Adult Guardianship and Protective Proceedings Jurisdiction Act,nv,SB 313,75,,Enacted,
Adult Guardianship and Protective Proceedings Jurisdiction Act,ct,SB 576,2009,Doyle,Introduced,http://www.cga.ct.gov/2009/FC/pdf/2009SB-00576-R000752-FC.pdf
Adult Guardianship and Protective Proceedings Jurisdiction Act,tx,HB 2260,81,Truitt,Introduced,
Adult Guardianship and Protective Proceedings Jurisdiction Act,wa,HB 1261,2009,Goodman,Enacted,http://lawfilesext.leg.wa.gov/biennium/2009-10/Pdf/Bills/House%20Passed%20Legislature/1261-S.PL.pdf
Adult Guardianship and Protective Proceedings Jurisdiction Act,md,SB 122,2009,Kelley,Introduced,
Adult Guardianship and Protective Proceedings Jurisdiction Act,il,HB 759,96th,Ryg,Enacted,
Adult Guardianship and Protective Proceedings Jurisdiction Act,ky,HB 86,98th,Marzian,Introduced,https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&uact=8&ved=0CB4QFjAAahUKEwi939z7k_TGAhVVMYgKHasODFs&url=http%3A%2F%2Fwww.lrc.ky.gov%2Frecord%2F10rs%2FHB86%2Fbill.doc&ei=jGOyVb1p1eKgBKudsNgF&usg=AFQjCNHcJ0pa2RJG5jxy0CHbLYpUEAADEA&sig2=E7yw8zaghDIujs7uzqPhHQ
Adult Guardianship and Protective Proceedings Jurisdiction Act,ut,SB 122,2008,Hillyard,Enacted,http://le.utah.gov/~2008/bills/static/SB0122.html
Adult Guardianship and Protective Proceedings Jurisdiction Act,mo,HB 2105,2008,Cooper,Introduced,http://house.mo.gov/billtracking/bills081/billpdf/intro/HB2105I.PDF
Anatomical Gift Act (2006),ct,SB 250,2010,,Enacted,http://www.cga.ct.gov/2010/TOB/S/2010SB-00250-R00-SB.htm
Anatomical Gift Act (2006),ky,SB 4,2010,Williams,Enacted,http://www.lrc.ky.gov/record/10RS/SB4/bill.doc
Anatomical Gift Act (2006),md,HB 1451,2010,"Barve, Dumais",Introduced,http://mlis.state.md.us/2010rs/bills/hb/hb1451f.pdf
Anatomical Gift Act (2006),il,HB 2339,98th,Davis,Enacted,
Anatomical Gift Act (2006),pa,HB 2700,2009-2010,Petrarca,Introduced,
Anatomical Gift Act (2006),pa,SB750,2011-2012,Greenleaf/Petrarca,Introduced,
Anatomical Gift Act (2006),ma,S 1098,187th,Fargo,Introduced,
Anatomical Gift Act (2006),md,SB 756,2011,Kelley,Enacted,
Anatomical Gift Act (2006),pa,SB 180,2015-2016,Greenleaf,Introduced,
Anatomical Gift Act (2006),tx,HB 2027,81,Zerwas,Enacted,
Anatomical Gift Act (2006),ct,HB 6677,2009,,Introduced,http://www.cga.ct.gov/2009/FC/2009HB-06677-R000964-FC.htm
Anatomical Gift Act (2006),fl,SB 766,2009,,Introduced,http://static-lobbytools.s3.amazonaws.com/bills/2009/pdf/0766ER.pdf
Anatomical Gift Act (2006),oh,HB 529,2009,Wachtmann,Enacted,http://archives.legislature.state.oh.us/analysis.cfm?ID=127_HB_529&ACT=As%20Enrolled&hf=analyses127/08-hb529-127.htm
Anatomical Gift Act (2006),il,HB 1349,96th,Davis,Introduced,
Anatomical Gift Act (2006),ak,SB 181,2007,McGuire,Introduced,http://www.legis.state.ak.us/PDF/25/Bills/SB0181A.PDF
Anatomical Gift Act (2006),az,SB 1099,2007,Allen,Enacted,http://www.azleg.gov/legtext/48leg/1r/bills/sb1099h.htm
Anatomical Gift Act (2006),id,SB 1017,2007,,Enacted,http://legislature.idaho.gov/legislation/2007/S1017.html
Anatomical Gift Act (2006),ne,LB 1036,2010,Council,Enacted,http://www.nebraskalegislature.gov/FloorDocs/101/PDF/Final/LB1036.pdf
Anatomical Gift Act (2006),nh,HB 1430,2010,Foose,Enacted,http://www.nhliberty.org/bills/view/2010/HB1430
Anatomical Gift Act (2006),vt,S 205,2009-2010,Ayer,Enacted,
Anatomical Gift Act (2006),pa,SB 850,2013-2014,Greenleaf,Introduced,
Anatomical Gift Act (2006),wa,HB 1637,2008,Hinkle,Enacted,http://lawfilesext.leg.wa.gov/biennium/2007-08/Pdf/Bills/Session%20Laws/House/1637-S.SL.pdf
Anatomical Gift Act (2006),wi,SB 310,2008,Risser,Enacted,http://docs.legis.wisconsin.gov/2007/related/proposals/sb310
Anatomical Gift Act (2006),ca,AB 1689,2008,Lieber,Enacted,http://leginfo.legislature.ca.gov/faces/billNavClient.xhtml?bill_id=200720080AB1689
Anatomical Gift Act (2006),ga,SB 405,2008,Balfour,Enacted,http://www.legis.ga.gov/Legislation/20072008/84683.pdf
Anatomical Gift Act (2006),hi,HB 2139,2008,,Enacted,http://www.capitol.hawaii.gov/session2008/bills/HB2139_CD1_.pdf
Anatomical Gift Act (2006),ak,HB 196,2008,,Enacted,http://www.legis.state.ak.us/PDF/25/Bills/HB0196Z.PDF
Anatomical Gift Act (2006),va,HB 2684,2007,Frederick,Enacted,http://lis.virginia.gov/cgi-bin/legp604.exe?071+ful+CHAP0092
Anatomical Gift Act (2006),wa,HB 1637,2007,Hinkle,Introduced,http://lawfilesext.leg.wa.gov/biennium/2007-08/Pdf/Bills/House%20Passed%20Legislature/1637-S.PL.pdf
Anatomical Gift Act (2006),mo,SB 1139,2008,Dempsey,Enacted,http://www.senate.mo.gov/08info/pdf-bill/tat/SB1139.pdf
Anatomical Gift Act (2006),ms,HB 1075,2008,Holland,Enacted,https://www.donatelifems.org/HB1075SG.pdf
Anatomical Gift Act (2006),me,LD 1505,2008,Hobbins,Enacted,http://www.mainelegislature.org/legis/bills/bills_123rd/billpdfs/SP052801.pdf
Anatomical Gift Act (2006),mi,HB 4940,2008,Condino,Enacted,http://www.legislature.mi.gov/documents/2007-2008/publicact/pdf/2008-PA-0039.pdf
Anatomical Gift Act (2006),ny,SB 5154,2008,Hannon,Introduced,http://assembly.state.ny.us/leg/?default_fld=&bn=S05154&term=2007&Text=Y
Anatomical Gift Act (2006),nj,SB 754,2008,Codey,Enacted,http://www.njleg.state.nj.us/2008/Bills/PL08/50_.PDF
Anatomical Gift Act (2006),ia,SF 509,2007,,Enacted,http://coolice.legis.iowa.gov/legislation/82ndGA/enrolled/sf509.html
Anatomical Gift Act (2006),mn,SF 883,2007,Scheid,Enacted,https://www.revisor.mn.gov/bills/text.php?number=SF883&version=0&session_year=2007&session_number=0
Anatomical Gift Act (2006),mo,HB723,2007,Stevenson/Koster,Introduced,http://www.house.mo.gov/billtracking/bills071/billpdf/intro/HB0723I.PDF
Anatomical Gift Act (2006),nj,AB 3909,2007,Conaway,Introduced,http://www.njleg.state.nj.us/2006/Bills/A4000/3909_I1.HTM
Anatomical Gift Act (2006),nm,HB 1276,2007,Cervantes,Enacted,http://www.nmlegis.gov/Sessions/07%20Regular/final/HB1276.pdf
Anatomical Gift Act (2006),nc,HB 1372,2007,Folwell,Enacted,http://www.ncga.state.nc.us/Sessions/2007/Bills/House/PDF/H1372v6.pdf
Anatomical Gift Act (2006),nd,SB 2163,2007,Kilzer,Enacted,http://legis.nd.gov/assembly/60-2007/bill-text/HAUN0400.pdf
Anatomical Gift Act (2006),or,HB 3092,2007,,Enacted,https://olis.leg.state.or.us/liz/2007R1/Downloads/MeasureDocument/HB3092
Anatomical Gift Act (2006),tn,HB 1557,2007,Shepard,Enacted,http://state.tn.us/sos/acts/105/pub/pc0428.pdf
Anatomical Gift Act (2006),tx,SB 1597,2007,Janek,Introduced,http://www.legis.state.tx.us/tlodocs/80R/billtext/html/SB01597E.htm
Anatomical Gift Act (2006),ut,SB 92,2007,Hillyard,Enacted,http://le.utah.gov/~2007/bills/static/SB0092.html
--------------------------------------------------------------------------------
/lid/etl/scrapers.py:
--------------------------------------------------------------------------------
1 | import json
2 | import codecs
3 | import base64
4 | import logging
5 | import re
6 | import os
7 | import sys
8 | import multiprocessing
9 | import utils
10 | import random
11 | import argparse
12 | import traceback
13 | import urllib2
14 | from config import DATA_PATH
15 | from bs4 import BeautifulSoup
16 | from tika import parser as tp  # tp.from_file is used in scrape_alec_exposed_bills below
17 |
18 | try:
19 | from os import scandir, walk
20 | except ImportError:
21 | from scandir import scandir, walk
22 |
23 | BILL_SCRAPER_LOG = os.environ['POLICY_DIFFUSION'] + '/logs/bill_scraper.log'
24 |
25 |
26 | # scrapes all bills from the input data path
27 | def scrape_all_bills(bill_data_path, num_workers):
28 | logging.basicConfig(filename=BILL_SCRAPER_LOG, level=logging.DEBUG)
29 |
30 | bill_file_paths = []
31 | for dirname, dirnames, filenames in walk(bill_data_path):
32 | for filename in filenames:
33 | bill_file_paths.append(os.path.join(dirname, filename))
34 |
35 |
36 | scrape_bill_document_from_sunlight(bill_file_paths[0])
37 |
38 | random.shuffle(bill_file_paths)
39 |
40 | pool = multiprocessing.Pool(num_workers)
41 |
42 | print "fetch {0} urls from sunlight...".format(len(bill_file_paths))
43 | pool.map(scrape_bill_document_from_sunlight, bill_file_paths)
44 |
45 | print "finished fetching urls..."
46 |
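# Example invocation (hypothetical data path; see main() at the bottom of this
# module for the argument parser that dispatches to this function):
#
#   python scrapers.py scrape_bills_from_sunlight \
#       --data_path /path/to/openstates/bill_json --num_workers 10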
47 |
48 | # open individual json file and scrape bill document,
49 | # from the s3 server provided by sunlight foundation
50 | def scrape_bill_document_from_sunlight(file_path):
51 | try:
52 | file_path = file_path.strip()
53 |
54 | #define path to write file
55 | out_file_path = file_path.split("/bills")[-1]
56 | out_file_path = re.sub("\s+", "_", out_file_path)
57 | out_dir_root_path = "{0}/scraped_bills".format(DATA_PATH)
58 | out_file_name = "{0}{1}.json".format(out_dir_root_path, out_file_path)
59 |
60 | bill_json = json.loads(codecs.open(file_path, encoding="utf8").read())
61 |
62 | # filter versions to be only the first and last
63 | try:
64 | bill_json['versions'] = [bill_json['versions'][0], bill_json['versions'][-1]]
65 | except IndexError:
66 | return
67 |
68 | base_url = "{0}/{1}".format("http://static.openstates.org/documents", bill_json['state'])
69 | urls = ["{0}/{1}".format(base_url, x['doc_id']) for x in bill_json['versions']]
70 | source_urls = [x['url'] for x in bill_json['versions']]
71 |
72 | for i, url in enumerate(urls):
73 |
74 | bill_document = utils.fetch_url(url)
75 |
76 | #hash bill using base64
77 | if bill_document is not None:
78 | bill_document = base64.b64encode(bill_document)
79 | else:
80 | logging.error("file {0}, url {1}, version {2}, error: << {3} >>".format(
81 | file_path, url, i, "link error"))
82 |
83 | bill_json['versions'][i]['bill_document'] = bill_document
84 |
85 | if not os.path.exists(os.path.dirname(out_file_name)):
86 | os.makedirs(os.path.dirname(out_file_name))
87 | with codecs.open(out_file_name, "w", encoding="utf8") as f:
88 | f.write(json.dumps(bill_json))
89 |
90 | logging.info("successfully scraped bill: {0}".format(out_file_path))
91 |
92 | except Exception as e:
93 | trace_message = re.sub("\n+", "\t", traceback.format_exc())
94 | trace_message = re.sub("\s+", " ", trace_message)
95 | trace_message = "<<{0}>>".format(trace_message)
96 | m = "Failed to obtain documents for {0}: {1}".format(file_path, trace_message)
97 | logging.error(m)
98 |
99 | return
100 |
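# Sketch of the per-bill JSON this function expects (field names taken from the
# accesses above; values are placeholders):
#
#   {"state": "tx",
#    "versions": [{"doc_id": "...", "url": "http://..."}, ...],
#    ...}
#
# Each kept version gains a base64-encoded 'bill_document' field before the
# record is written under scraped_bills/.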
101 |
102 | # scrapes bill document from original source link
103 | # this is a backup if s3 doesn't work
104 | def scrape_bill_document_from_original_source(filePath):
105 | filePath = filePath.strip()
106 |
107 | outFilePath = "/".join(filePath.split("/")[7:])
108 | outFilePath = re.sub("\s+", "_", outFilePath)
109 | outDirRootPath = "/mnt/data/sunlight/dssg/scraped_bills_new"
110 | outFileName = "{0}/{1}.json".format(outDirRootPath, outFilePath)
111 |
112 | billFile = codecs.open(filePath, encoding="utf8").read()
113 | billJson = json.loads(billFile)
114 |
115 | # filters documents that are resolutions
116 | bill_text_count = [1 for x in billJson['type'] if "bill" in x.lower()]
117 | if sum(bill_text_count) < 1:
118 | return
119 |
120 | # filter versions to be only the first and last
121 | billJson['versions'] = [billJson['versions'][0], billJson['versions'][-1]]
122 |
123 | urls = [x['url'] for x in billJson['versions']]
124 |
125 | for i, url in enumerate(urls):
126 |
127 | billDocument = utils.fetch_url(url)
128 |
129 | if billDocument is not None:
130 | billDocument = base64.b64encode(billDocument)
131 | else:
132 | logging.error("file {0}, url {1}, version {2}, error: << {3} >>".format(filePath, url, i, "link error"))
133 |
134 | billJson['versions'][i]['bill_document'] = billDocument
135 |
136 | if not os.path.exists(os.path.dirname(outFileName)):
137 | os.makedirs(os.path.dirname(outFileName))
138 | with codecs.open(outFileName, "w", encoding="utf8") as f:
139 | f.write(json.dumps(billJson))
140 |
141 | logging.info("successfully scraped bill: {0}".format(outFilePath))
142 |
143 | return
144 |
145 |
146 | # scrapes model legislation from ALEC's official site
147 | # and the tracker website ALEC exposed
148 | def scrape_ALEC_model_legislation():
149 | url = 'http://www.alec.org/model-legislation/'
150 | response = urllib2.urlopen(url).read()
151 | bs = BeautifulSoup(response, 'html5')
152 |
153 | # Get all links from website
154 | ALEClist = []
155 | for link in bs.find_all('a'):
156 | if link.has_attr('href'):
157 | ALEClist.append(link.attrs['href'])
158 |
159 | # Filter list so that we have only the ones with model-legislation
160 | ALEClinks = []
161 | i = 0
162 | for i in range(0, len(ALEClist)):
163 | if ALEClist[i][20:38] == "model-legislation/":
164 | ALEClinks.append(ALEClist[i])
165 | i = i + 1
166 |
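# Note: the [20:38] slice assumes every link starts with "http://www.alec.org/"
# (20 characters), so characters 20-37 equal "model-legislation/" exactly for
# the model-bill links we want to keep.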
167 | # To get only unique links (get rid off duplicates)
168 | ALEClinks = set(ALEClinks)
169 |
170 | # Save to json file
171 | with open('{0}/data/model_legislation/alec_bills.json'.format(DATA_PATH), 'w') as f:
172 | for line in ALEClinks:
173 | source = urllib2.urlopen(line).read()
174 | url = line
175 | date = 2015
176 | Jsonbill = bill_source_to_json(url, source, date)
177 | f.write("{0}\n".format(Jsonbill))
178 |
179 | # Save old alec bills (from Center for the Media and Democracy)
180 | def scrape_alec_exposed_bills ():
181 | names = os.listdir('{0}/model_legislation/ALEC_exposed'.format(DATA_PATH))
182 | with open('alec_old_bills.json', 'w') as f2:
183 | for name in names:
184 | try:
185 | text = tp.from_file(name)
186 | source = text['content']
187 | except:
188 | source = None
189 | url = None
190 | date = '2010-2013'
191 | print name
192 | print source
193 | Jsonbill = bill_source_to_json_not_encoded(url, source, date)
194 | f2.write("{0}\n".format(Jsonbill))
195 |
196 |
197 | def scrape_CSG_model_legislation():
198 | url = 'http://www.csg.org/programs/policyprograms/SSL.aspx'
199 | doc = urllib2.urlopen(url).read()
200 | bs = BeautifulSoup(doc)
201 |
202 | links = []
203 | for link in bs.find_all('a'):
204 | if link.has_attr('href'):
205 | candidate = link.attrs['href']
206 | # links with pdf extension tend to be model bills
207 | if candidate[-4:] == ".pdf":
208 | links.append(candidate)
209 |
210 | # only keeps distinct links
211 | links2 = list(set(links))
212 |
213 | badCount = 0
214 | goodCount = 0
215 |
216 | with open('csg_bills.json', 'w') as f:
217 | for line in links2:
218 | try:
219 | url_key = {}
220 | source = urllib2.urlopen(line).read()
221 | Jsonbill = bill_source_to_json(line, source, None)
222 | f.write("{0}\n".format(Jsonbill))
223 | goodCount += 1
224 | except:
225 | badCount += 1
226 | print line
227 |
228 | print str(badCount) + " did not work"
229 |
230 |
231 | def scrape_ALICE_legislation():
232 | path = "/mnt/data/sunlight/dssg/model_legislation/links_"
233 | lines = []
234 | for i in [1, 2, 3]:
235 | filePath = path + str(i) + ".txt"
236 | with open(filePath) as f:
237 | lines.extend(f.read().splitlines())
238 |
239 | text = ''.join(lines)
240 | bs = BeautifulSoup(text)
241 |
242 | links = []
243 | for link in bs.find_all('a'):
244 | if link.has_attr('href'):
245 | links.append(link.attrs['href'])
246 |
247 |
248 | # grab pdfs from links
249 | billList = []
250 | for url in links:
251 | doc = urllib2.urlopen(url).read()
252 | bs = BeautifulSoup(doc)
253 |
254 | for link in bs.find_all('a'):
255 | if link.has_attr('href'):
256 | candidate = link.attrs['href']
257 | if candidate[-4:] == ".pdf": # links with pdf extension tend to be model bills
258 | billList.append("https://stateinnovation.org" + candidate)
259 |
260 | badCount = 0
261 | goodCount = 0
262 | with open('alice_bills.json', 'w') as f:
263 | for link in billList:
264 | # url_key = {}
265 | # source = urllib2.urlopen(link).read()
266 | # Jsonbill = bill_source_to_json(link, source, None)
267 | # f.write("{0}\n".format(Jsonbill))
268 | try:
269 | source = urllib2.urlopen(link).read()
270 | Jsonbill = bill_source_to_json(link, source, None)
271 | f.write("{0}\n".format(Jsonbill))
272 | goodCount += 1
273 | except:
274 | badCount += 1
275 |
276 | print str(badCount) + " did not work"
277 |
278 | def scrape_misc_legislation():
279 | # Access list of clean urls
280 | with open('/mnt/data/sunlight/dssg/model_legislation/clean_urls.txt',
281 | 'r') as f:
282 | links = f.read().splitlines()
283 |
284 | badCount = 0
285 | goodCount = 0
286 | with open('misc_bills.json', 'w') as jsonfile:
287 | for link in links:
288 | try:
289 | source = urllib2.urlopen(link).read()
290 | Jsonbill = bill_source_to_json(link, source, None)
291 | jsonfile.write("{0}\n".format(Jsonbill))
292 | goodCount += 1
293 | print goodCount
294 | except:
295 | badCount += 1
296 |
297 | print str(badCount) + " did not work"
298 | print str(goodCount) + " worked"
299 |
300 |
301 |
302 | def main():
303 |
304 | parser = argparse.ArgumentParser(description='module that contains functions to scrape legislative data '
305 | 'from the sunlight foundation and various '
306 | 'lobbying organizations')
307 | parser.add_argument('command', help='command to run, options are: \n scrape_bills_from_sunlight')
308 | parser.add_argument('--data_path', dest='data_path', help="file path of data to be indexed ")
309 | parser.add_argument('--num_workers', dest='num_workers',default = 10,
310 | type = int, help="number of worker processes to use")
311 |
312 | args = parser.parse_args()
313 |
314 | if args.command == "scrape_bills_from_sunlight":
315 | scrape_all_bills(args.data_path,args.num_workers)
316 | elif args.command == "scrape_ALEC_legislation":
317 | scrape_ALEC_model_legislation()
318 | elif args.command == "scrape_CSG_legislation":
319 | scrape_CSG_model_legislation()
320 | elif args.command == "scrape_ALICE_legislation":
321 | scrape_ALICE_legislation()
322 | elif args.command =="scrape_misc_legislation":
323 | scrape_misc_legislation()
324 | else:
325 | print("command not recognized, use -h flag to see list available commands")
326 |
327 |
328 |
329 | if __name__ == "__main__":
330 | main()
331 |
--------------------------------------------------------------------------------
/tests/text_alignment_tests.py:
--------------------------------------------------------------------------------
1 |
2 | import random
3 | import matplotlib.pyplot as plt
4 | import time
5 | import numpy as np
6 | from compiler.ast import flatten
7 | from alignment.sequence import Sequence
8 | from alignment.vocabulary import Vocabulary
9 | from alignment.sequencealigner import SimpleScoring, LocalSequenceAligner
10 | from utils.general_utils import find_subsequence
11 | from text_alignment import *
12 |
13 |
14 | #function from python package for testing results
15 | def seqToAlign(a, b, matchScore = 3, mismatchScore = -1, gapScore = -2):
16 | '''
17 | args:
18 | a: list of words
19 | b: list of words
20 | matchScore: num
21 | mismatchScore: num
22 | gapScore: num
23 | Returns:
24 | list of tuples (score, first, second) for the top alignments
25 | Description:
26 | helper function for finding alignments given a list of words
27 | '''
28 | # Create a vocabulary and encode the sequences.
29 | a = a[0]
30 | b = b[0]
31 | seq1 = Sequence(a)
32 | seq2 = Sequence(b)
33 | v = Vocabulary()
34 | aEncoded = v.encodeSequence(seq1)
35 | bEncoded = v.encodeSequence(seq2)
36 |
37 | # Create a scoring and align the sequences using local aligner.
38 | scoring = SimpleScoring(matchScore, mismatchScore)
39 | aligner = LocalSequenceAligner(scoring, gapScore)
40 | score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)
41 | alignments = [v.decodeSequenceAlignment(encoded) for encoded in encodeds]
42 |
43 | return [(a.score, list(a.first), list(a.second)) for a in alignments]
44 |
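# Usage sketch for the package-based helper above (toy input; scores use the
# default 3/-1/-2 scoring):
#
#   alignments = seqToAlign([['a', 'b', 'c', 'd']], [['a', 'b', 'x', 'd']])
#   score, left, right = alignments[0]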
45 |
46 | #testing functions
47 | def create_doc_test_cases():
48 | #tests
49 | t1 = [['a']*100]
50 | t2 = [['b']*50 + ['a','a','b']*50]
51 |
52 | s1 = [[1]*100]
53 | s2 = [[2]*50 + [1,1,2]*50]
54 |
55 | v1 = [np.array([0, 1, 2, 3, 4, 7, 6, 3, 2, 1, 3])]
56 | v2 = [np.array([0, 1, 2, 3, 4, 4, 5, 2, 1, 2, 2])]
57 |
58 | w1 = [np.array([7, 6, 3, 2, 1, 3, 0, 1, 2, 3, 4])]
59 | w2 = [np.array([4, 5, 2, 1, 2, 2, 0, 1, 2, 3, 4])]
60 |
61 | tests = [(t1,t2), (s1,s2),(v1,v2), (w1,w2), ([np.random.choice(5, 30)],[np.random.choice(5, 30)]), \
62 | ([np.array([1, 2, 0, 0, 1, 2, 3, 0, 1, 3, 0, 4, 3, 3, 0, 3, 0, 2, 0, 4, 3, 4, 2, \
63 | 1, 1, 1, 1, 1, 0, 1])], [np.array([2, 0, 3, 1, 2, 4, 0, 1, 3, 0, 1, 4, 1, 3, 1, 4, 0, 0, 1, 2, 4, 0, 0, \
64 | 2, 4, 1, 3, 2, 2, 4])])]
65 |
66 | return tests
67 |
68 |
69 | #LocalAligner algorithm tests
70 | def LocalAligner_unit_tests():
71 |
72 | def test_alignment(t1,t2):
73 | f = LocalAligner()
74 | alignment=f.align(t1,t2) #default score is 3,-1,-2
75 | score, l, r = alignment.alignments[0]
76 |
77 | #find score of recovered alignment
78 | align_score = f.alignment_score(l,r)
79 |
80 | #run package algorithm
81 | alignments = seqToAlign(t1,t2) #default score is 3,-1,-2
82 |
83 | if score == align_score and score == alignments[0][0]:
84 | print 'package, backtraced alignment, and alignment matrix consistent'
85 | else:
86 | print 'dp_alg_score: ' + str(score)
87 | print 'alignment_score: ' + str(align_score)
88 | print 'package_score: ' + str(alignments[0][0])
89 |
90 | #tests
91 | tests = create_doc_test_cases()
92 | for test in tests:
93 | z1, z2 = test
94 | test_alignment(z1,z2)
95 |
96 | f = LocalAligner()
97 | alignment=f.align(z1,z2) #default score is 3,-1,-2
98 |
99 | score, l, r = alignment.alignments[0]
100 |
101 | #run package algorithm
102 | alignments = seqToAlign(z1,z2) #default score is 3,-1,-2
103 |
104 | l_true, r_true = alignments[0][1:]
105 |
106 | for i in range(len(l)):
107 | if l[i] != l_true[i]:
108 | print 'not same sequence'
109 | break
110 |
111 | for i in range(len(r)):
112 | if r[i] != r_true[i]:
113 | print 'not same sequence'
114 | break
115 |
116 |
117 | def test_alignment(t1,t2, algorithm):
118 | f = algorithm()
119 | alignment=f.align(t1,t2) #default score is 3,-1,-2
120 | score, l, r = alignment.alignments[0]
121 |
122 | #find score of recovered alignment
123 | align_score = f.alignment_score(l,r)
124 |
125 | if score == align_score:
126 | print 'backtraced alignment and alignment matrix consistent'
127 | else:
128 | print 'backtraced alignment and alignment matrix not consistent'
129 | print 'dp_alg_score: ' + str(score)
130 | print 'alignment_score: ' + str(align_score)
131 |
132 | print 'left_alignment: ', l
133 | print 'right_alignment: ', r
134 |
135 |
136 | def generic_doc_unit_test(algorithm):
137 |
138 | tests = create_doc_test_cases()
139 | for test in tests:
140 | z1, z2 = test
141 | test_alignment(z1,z2, algorithm)
142 |
143 |
144 | def LocalAligner_speed_test():
145 |
146 | input_sizes = [np.exp2(p) for p in range(2,7)]
147 |
148 | average_our_times = []
149 | average_package_times = []
150 | for input_size in input_sizes:
151 | print input_size
152 | v1 = [np.random.randint(0,10,input_size)]
153 | v2 = [np.random.randint(0,10,input_size)]
154 | our_times = []
155 | package_times = []
156 | f = LocalAligner()
157 | for i in range(2):
158 | t1 = time.time()
159 | f.align(v1,v2)
160 | our_times.append(time.time()-t1)
161 |
162 | t2 = time.time()
163 | seqToAlign(v1,v2)
164 | package_times.append(time.time()-t2)
165 |
166 | average_our_times.append(np.mean(our_times))
167 | average_package_times.append(np.mean(package_times))
168 |
169 | plt.plot(input_sizes,average_package_times, color = 'b', label = 'package')
170 | plt.plot(input_sizes,average_our_times, color='r', label = 'our implementation')
171 | plt.legend(loc='upper right')
172 | plt.xlabel('input size')
173 | plt.ylim(0,0.02)
174 | plt.show()
175 |
176 |
177 | def generic_doc_speed_test(algorithm):
178 | '''
179 | compares speed of algorithm to local alignment algorithm
180 | '''
181 |
182 | input_sizes = [np.exp2(p) for p in range(2,7)]
183 |
184 | average_alg_times = []
185 | average_local_times = []
186 | for input_size in input_sizes:
187 | print input_size
188 | v1 = [np.random.randint(0,10,input_size)]
189 | v2 = [np.random.randint(0,10,input_size)]
190 | local_times = []
191 | alg_times = []
192 | f = LocalAligner()
193 | g = algorithm()
194 | for i in range(2):
195 | t1 = time.time()
196 | f.align(v1,v2)
197 | local_times.append(time.time()-t1)
198 |
199 | t2 = time.time()
200 | g.align(v1,v2)
201 | alg_times.append(time.time()-t2)
202 |
203 | average_local_times.append(np.mean(local_times))
204 | average_alg_times.append(np.mean(alg_times))
205 |
206 | return average_local_times, average_alg_times
207 |
208 |
209 | def doc_test_alignment_indices(algorithm):
210 | #tests
211 | tests = create_doc_test_cases()
212 |
213 | good_job = True
214 | for test in tests:
215 |
216 | left_text, right_text = test
217 | try:
218 | left_text[0] = left_text[0].tolist()
219 | right_text[0] = right_text[0].tolist()
220 | except:
221 | pass
222 | f = algorithm()
223 | Alignment = f.align(left_text,right_text)
224 | left, right = clean_alignment(Alignment.alignments[0])
225 |
226 |
227 | left_start, left_end = find_subsequence(left, flatten(left_text))
228 | right_start, right_end = find_subsequence(right, flatten(right_text))
229 |
230 | if Alignment.alignment_indices[0]['left_start'] != left_start or \
231 | Alignment.alignment_indices[0]['left_end'] != left_end or \
232 | Alignment.alignment_indices[0]['right_start'] != right_start or \
233 | Alignment.alignment_indices[0]['right_end'] != right_end:
234 |
235 | print 'alignment length: ', len(left)
236 |
237 | print 'indices are messed up'
238 |
239 | print 'left_start: ', Alignment.alignment_indices[0]['left_start']
240 | print 'true left_start: ', left_start
241 | print 'left_end: ', Alignment.alignment_indices[0]['left_end']
242 | print 'true left_end', left_end
243 | print '\n'
244 |
245 | print 'right_start: ', Alignment.alignment_indices[0]['right_start']
246 | print 'true right_start: ', right_start
247 | print 'right_end: ', Alignment.alignment_indices[0]['right_end']
248 | print 'true right_end: ', right_end
249 |
250 | print '\n'
251 |
252 | good_job = False
253 |
254 | if good_job:
255 | print 'indices worked'
256 |
257 |
258 | #SectionLocalAlignment Tests
259 | def create_section_tests():
260 | tests = create_doc_test_cases()
261 |
262 | #convert tests into sections so
263 | #that it makes sense for case
264 | left_test = []
265 | right_test = []
266 | for test1, test2 in tests:
267 | left_test.append(list(test1[0]))
268 | right_test.append(list(test2[0]))
269 |
270 | return left_test, right_test
271 |
272 |
273 | def section_unit_tests(Algorithm):
274 | left_test, right_test = create_section_tests()
275 |
276 | f = Algorithm()
277 | Alignment = f.align(left_test, [flatten(right_test)])
278 |
279 | good_job = True
280 | for score, left, right in Alignment.alignments:
281 | true_score = f.alignment_score(left, right)
282 | if true_score != score:
283 | print 'left: ', left
284 | print 'right: ', right
285 | print 'true alignment score: ', true_score
286 | print 'calculated score: ', score
287 | good_job = False
288 |
289 | if good_job:
290 | print "calculated alignment scores correctly"
291 |
292 |
293 | def section_speed_test():
294 |
295 | input_sizes = [np.exp2(p) for p in range(2,9)]
296 |
297 | average_local_times = []
298 | average_section_times = []
299 | for input_size in input_sizes:
300 | print input_size
301 | v1 = [np.random.randint(0,10,input_size)]
302 | v2 = [np.random.randint(0,10,input_size)]
303 |
304 | cut1 = random.randint(0,len(v1))
305 | cut2 = random.randint(cut1,len(v2))
306 | cut3 = random.randint(cut2,len(v2))
307 | w1 = [v1[0][:cut1], v1[0][cut1:cut2], v1[0][cut2:cut3]]
308 |
309 | local_times = []
310 | section_times = []
311 | for i in range(2):
312 | t1 = time.time()
313 | f = LocalAligner()
314 | f.align(v1,v2)
315 | local_times.append(time.time()-t1)
316 |
317 | t2 = time.time()
318 | f = LocalAligner()
319 | f.align(w1,v2)
320 | section_times.append(time.time()-t2)
321 |
322 | average_local_times.append(np.mean(local_times))
323 | average_section_times.append(np.mean(section_times))
324 |
325 | plt.plot(input_sizes,average_section_times, color = 'b', label = 'section local alignment')
326 | plt.plot(input_sizes,average_local_times, color='r', label = 'local alignment')
327 | plt.legend(loc='upper right')
328 | plt.xlabel('input size')
329 | plt.ylim(0,0.02)
330 | plt.show()
331 |
332 |
333 | def section_test_alignment_indices():
334 | left_test, right_test = create_section_tests()
335 | left_test_flattened = flatten(left_test)
336 | right_test_flattened = flatten(right_test)
337 |
338 | f = LocalAligner()
339 | Alignment = f.align(left_test, [right_test_flattened])
340 |
341 | good_job = True
342 | for i in range(len(Alignment.alignments)):
343 | left, right = clean_alignment(Alignment.alignments[i])
344 |
345 | print 'alignment length: ', len(left)
346 |
347 | left_start, left_end = find_subsequence(left, left_test_flattened)
348 | right_start, right_end = find_subsequence(right, right_test_flattened)
349 |
350 | if Alignment.alignment_indices[i]['left_start'] != left_start or \
351 | Alignment.alignment_indices[i]['left_end'] != left_end or \
352 | Alignment.alignment_indices[i]['right_start'] != right_start or \
353 | Alignment.alignment_indices[i]['right_end'] != right_end:
354 |
355 | print 'indices are messed up: '
356 |
357 | print 'left_start: ', Alignment.alignment_indices[i]['left_start']
358 | print 'true left_start: ', left_start
359 | print 'left_end: ', Alignment.alignment_indices[i]['left_end']
360 | print 'true left_end', left_end
361 | print '\n'
362 |
363 | print 'right_start: ', Alignment.alignment_indices[i]['right_start']
364 | print 'true right_start: ', right_start
365 | print 'right_end: ', Alignment.alignment_indices[i]['right_end']
366 | print 'true right_end: ', right_end
367 |
368 | print '\n'
369 |
370 | good_job = False
371 |
372 | if good_job:
373 | print 'indices worked'
374 |
375 |
376 | ############################################################
377 | ##helper functions
378 | def clean_alignment(alignment):
379 | '''
380 | arg:
381 | alignment object
382 | returns:
383 | 2 list of alignment words without the alignment symbol
384 | '''
385 | keep1 = []
386 | keep2 = []
387 | for item in alignment[1]:
388 | if item != '-':
389 | keep1.append(item)
390 |
391 | for item in alignment[2]:
392 | if item != '-':
393 | keep2.append(item)
394 |
395 | return (keep1, keep2)
396 |
397 |
398 | if __name__ == '__main__':
399 | print "running LocalAligner unit tests.... \n"
400 | LocalAligner_unit_tests()
401 |
402 | print "running LocalAligner speed tests.... \n"
403 | LocalAligner_speed_test()
404 |
405 | print "running LocalAligner index tests.... \n"
406 | doc_test_alignment_indices(LocalAligner)
407 |
408 | print "running AffineLocalAligner unit tests.... \n"
409 | generic_doc_unit_test(AffineLocalAligner)
410 |
411 | print "running AffineLocalAligner speed tests.... \n"
412 | generic_doc_speed_test(AffineLocalAligner)
413 |
414 | print "running section unit tests for localaligner.... \n"
415 | section_unit_tests(LocalAligner)
416 |
417 | print "running section unit tests for affinealigner.... \n"
418 | section_unit_tests(AffineLocalAligner)
419 |
420 | print "running section speed tests.... \n"
421 | section_speed_test()
422 |
423 | print 'running test on keeping track of indices for section algorithm..... \n'
424 | section_test_alignment_indices()
425 |
426 | print 'running speed test on Word2VecLocalAligner.... \n'
--------------------------------------------------------------------------------
/lid/alignment_classifier.py:
--------------------------------------------------------------------------------
1 | from alignment_evaluation import alignment_features
2 | import numpy as np
3 | import nltk
4 | from sklearn import linear_model
5 | from sklearn.metrics import confusion_matrix, accuracy_score
6 | import csv
7 | import json
8 | import argparse
9 | import os
10 | from database import ElasticConnection
11 | import random
12 | import codecs
13 | from sklearn.feature_extraction.text import TfidfVectorizer
14 | from utils.general_utils import alignment_tokenizer
15 | from utils.general_utils import UnicodeWriter,UnicodeReader
16 | import pickle
17 | from sklearn.metrics import jaccard_similarity_score,classification_report
18 | from sklearn.linear_model import LogisticRegression
19 | from sklearn.cross_validation import KFold
20 |
21 |
22 |
23 | '''Contains code for both the features and model of the alignment classifier used to classify alignments as
24 | substantive or boiler-plate'''
25 |
26 | def compute_tfidf_scores(alignment_data_path,pickle_file_name):
27 | count = 0
28 | alignment_docs = []
29 | for line in alignment_data_path:
30 | print count
31 | count += 1
32 | if count >= 100000:
33 | break
34 | json_obj = json.loads(line.strip())
35 |
36 |
37 | if "alignment_results" not in json_obj:
38 | continue
39 |
40 | for alignment_result in json_obj['alignment_results']:
41 | alignment_doc = []
42 | for section_alignment in alignment_result['alignments']:
43 | alignment_doc.extend([x for x in section_alignment['left'] if x not in ['-',None]])
44 | alignment_doc.extend([x for x in section_alignment['right'] if x not in ['-',None]])
45 | alignment_docs.append( " ".join(alignment_doc))
46 |
47 |
48 | vectorizer = TfidfVectorizer()
49 | X = vectorizer.fit_transform(alignment_docs)
50 | idf = vectorizer.idf_
51 |
52 | term_scores = zip(vectorizer.get_feature_names(), idf)
53 | term_dict = dict(term_scores)
54 | pickle_file = codecs.open(pickle_file_name,mode = "wb")
55 | pickle.dump(term_dict,pickle_file)
56 | return
57 |
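# Usage sketch (hypothetical file names; compute_tfidf_scores expects an open
# file of newline-delimited alignment JSON, as iterated above):
#
#   with codecs.open("alignments.json", encoding="utf8") as f:
#       compute_tfidf_scores(f, "idf_scores.p")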
58 |
59 |
60 | def construct_training_set(alignments_file,out_file_name,score_threshold = None):
61 | """
62 | Args:
63 | alignments_file (file) -- file containing sample alignments
64 |
65 | out_file_name (string) -- name of training data file to write to
66 |
67 | Returns:
68 | None
69 | """
70 | ec = ElasticConnection(host= "54.203.12.145")
71 |
72 | training_examples = []
73 | for i,x in enumerate(alignments_file):
74 | print i
75 | json_obj = json.loads(x.strip())
76 |
77 | if "alignment_results" not in json_obj.keys():
78 | continue
79 |
80 | left_doc_id = json_obj['query_document_id']
81 | left_bill_title = ec.get_bill_by_id(left_doc_id)['bill_title']
82 |
83 | left_doc = json_obj['query_document']
84 | left_doc = reduce(lambda x,y:x+y,left_doc)
85 |
86 | left_doc_length = len(left_doc.split())
87 |
88 | for i,alignment_doc in enumerate(json_obj['alignment_results']):
89 |
90 | right_doc_id = alignment_doc['document_id']
91 | right_bill_title = ec.get_bill_by_id(right_doc_id)['bill_title']
92 |
93 | for alignment in alignment_doc['alignments']:
94 |
95 | left = alignment['left']
96 | right = alignment['right']
97 | left_start = alignment['left_start']
98 | right_start = alignment['right_start']
99 | left_end = alignment['left_end']
100 | right_end = alignment['right_end']
101 | score = alignment['score']
102 | if score < score_threshold:
103 | continue
104 | training_examples.append([left_doc_id,right_doc_id,left_doc_length,left_start,right_start,left_end,
105 | right_end,score,left_bill_title,right_bill_title,
106 | " ".join(left)," ".join(right)])
107 |
108 |
109 | random.shuffle(training_examples)
110 |
111 | header = ["left_doc_id","right_doc_id","left_doc_length","left_start","right_start","left_end",
112 | "right_end","score","left_bill_title","right_bill_title","left","right"]
113 |
114 |
115 | k = 500
116 | with codecs.open(out_file_name, 'wb') as output_file:
117 | writer = UnicodeWriter(output_file, header)
118 | writer.writerow(header)
119 | for l in training_examples[0:k]:
120 | l = [unicode(x) for x in l]
121 | writer.writerow(l)
122 |
123 |
124 | return
125 |
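# Usage sketch (hypothetical file names and threshold):
#
#   with codecs.open("bill_to_bill_alignments.json", encoding="utf8") as f:
#       construct_training_set(f, "training_data_alignment_classifier.csv",
#                              score_threshold=100)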
126 |
127 |
128 | def evaluate_alignment_classifier(clf):
129 | """runs k-fold cross validation on the training set of an AlignmentClassifier instance (clf) to evaluate the classifier"""
130 |
131 | training_examples = []
132 | for line in csv.reader(clf._training_file):
133 | if ( len(line[10].split()) != len(line[11].split()) ) or line[12] not in ["0","1"]:
134 | continue
135 | if len(line[10]) <= 1 or len(line[11]) < 1:
136 | continue
137 | training_examples.append({"left":line[10].split(),"right":line[11].split(),"label":int(line[12])})
138 |
139 |
140 |
141 |
142 | random.shuffle(training_examples)
143 | X,y = clf.compute_feature_matrix(training_examples)
144 |
145 |
146 | X,y = np.array(X),np.array(y)
147 | kf = KFold(n=len(X), n_folds=4, shuffle=False,
148 | random_state=None)
149 | for train_index, test_index in kf:
150 | X_train, X_test = X[train_index], X[test_index]
151 | y_train, y_test = y[train_index], y[test_index]
152 | clf._model.fit(X_train,y_train)
153 | y_pred = clf._model.predict(X_test)
154 | print classification_report(y_test, y_pred)
155 |
156 | clf._model.fit(X,y)
157 | feat_names = ['length','num_gaps_l','num_gaps_r','num_mismatches','num_matches','avg_gap_length_l',
158 | 'avg_gap_length_r','avg_consec_match_length','jaccard_score','idf_mean','idf_median']
159 |
160 | for x in zip(feat_names,clf._model.coef_.ravel().tolist()):
161 | print x
162 |
163 |
164 |
165 |
166 | class AlignmentClassifier():
167 | """Classifier that labels alignments as either substantive (1) or boilerplate (0)"""
168 |
169 |
170 | def __init__(self,idf_file_path):
171 | """Keyword Args:
172 |
173 | idf_file_path: file path of the table that stores idf scores of the words
174 |
175 | """
176 | self._idf_score_dict = pickle.load(open(idf_file_path))
177 | self._training_file = codecs.open(os.environ['POLICY_DIFFUSION']+\
178 | "/data/training_data_alignment_classifier_bigger.csv",mode = "rU")
179 | self._model = LogisticRegression(penalty='l1')
180 |
181 | def compute_feature_matrix(self,training_examples):
182 | """Keywords Args:
183 |
184 | training_examples: list of dicts, where each dict contains an alignment ("left": left_text, "right": right_text)
185 | and its "label": 1 for substantive, 0 for boilerplate
186 |
187 | Returns:
188 |
189 | X: feature matrix
190 | y: labels
191 |
192 | """
193 |
194 | X = []
195 | y = []
196 | for training_example in training_examples:
197 | left = training_example['left']
198 | right = training_example['right']
199 | label = training_example['label']
200 | meta_features = self._compute_alignment_meta_features(left,right)
201 | idf_features = self._compute_idf_score(left,right)
202 | features = meta_features + idf_features
203 | X.append(features)
204 | y.append(label)
205 |
206 | return X,y
207 |
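    # A minimal usage sketch for compute_feature_matrix (illustrative values only; `clf` is a
    # hypothetical AlignmentClassifier instance):
    #
    #   examples = [{"left": ["the", "-", "act"], "right": ["the", "model", "act"], "label": 1}]
    #   X, y = clf.compute_feature_matrix(examples)
    #   # X holds one 11-element feature vector (9 alignment meta features, sorted by name,
    #   # followed by the idf mean and median); y == [1]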
208 |     def train_model(self):
209 |         """Fits self._model on the training examples in self._training_file
210 | 
211 |         Prints a k-fold cross-validation report and the learned coefficients,
212 |         then refits the model on the full training set.
213 | 
214 |         Returns:
215 |             None
216 | 
217 |         """
218 |
219 |
220 | training_examples = []
221 | for line in csv.reader(self._training_file):
222 | if ( len(line[10].split()) != len(line[11].split()) ) or line[12] not in ["0","1"]:
223 | continue
224 | if len(line[10]) <= 1 or len(line[11]) < 1:
225 | continue
226 | training_examples.append({"left":line[10].split(),"right":line[11].split(),"label":int(line[12])})
227 |
228 |         X,y = self.compute_feature_matrix(training_examples)
229 | 
230 |         # cross-validate for a quick evaluation, then refit on the full training set below
231 |         X,y = np.array(X),np.array(y)
232 |         kf = KFold(n=len(X), n_folds=4, shuffle=False,
233 |                                random_state=None)
234 |         for train_index, test_index in kf:
235 |             X_train, X_test = X[train_index], X[test_index]
236 |             y_train, y_test = y[train_index], y[test_index]
237 |             self._model.fit(X_train,y_train)
238 |             y_pred = self._model.predict(X_test)
239 |             print classification_report(y_test, y_pred)
240 | 
241 |         self._model.fit(X,y)
242 |         feat_names = ['avg_consec_match_length','avg_gap_length_l','avg_gap_length_r','jaccard_score','length',
243 |                       'num_gaps_l','num_gaps_r','num_matches','num_mismatches','idf_mean','idf_median']
244 |         # feature order mirrors the alphabetically sorted keys in _compute_alignment_meta_features plus the two idf scores
245 |         for x in zip(feat_names,self._model.coef_.ravel().tolist()):
246 |             print x
247 |
248 |
249 | def predict(self,alignment_example):
250 | """predicts label for alignment example
251 |
252 |
253 | Keyword Args:
254 |
255 | alignment_example: alignment [left,right] that needs to be labeled
256 |
257 |
258 | """
259 | pass
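        # A minimal sketch of what predict could do, reusing the feature pipeline above
        # (not part of the original code; the dummy label is only needed to satisfy
        # compute_feature_matrix's expected dict shape):
        #
        #   X, _ = self.compute_feature_matrix([{"left": alignment_example[0],
        #                                        "right": alignment_example[1],
        #                                        "label": 0}])
        #   return self._model.predict(np.array(X))[0]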
260 |
261 |
262 |
263 | def _compute_alignment_meta_features(self,left, right):
264 | '''
265 |         Takes the two sides of an alignment (left/right token lists, with '-' marking gaps) and produces meta features describing them
266 | '''
267 | #alignment features
268 | features = {}
269 | features['length'] = len(left)
270 | features['num_gaps_l'] = 0
271 | features['num_gaps_r'] = 0
272 | features['num_mismatches'] = 0
273 | features['num_matches'] = 0
274 | features['avg_gap_length_l'] = []
275 | features['avg_gap_length_r'] = []
276 | features['avg_consec_match_length'] = []
277 | features['jaccard_score'] = jaccard_similarity_score(left,right)
278 |
279 | #helper variables
280 | prev_gap_l = False
281 | prev_gap_r = False
282 | prev_match = False
283 | for i in range(len(left)):
284 | # print 'i: ', i
285 | # print 'features: ', features
286 | if left[i] == '-':
287 | features['num_gaps_l'] += 1
288 | if not prev_gap_l:
289 | features['avg_gap_length_l'].append(1)
290 | prev_gap_l = True
291 | else:
292 | features['avg_gap_length_l'][-1] += 1
293 | else:
294 | prev_gap_l = False
295 | if right[i] == '-':
296 | features['num_gaps_r'] += 1
297 | if not prev_gap_r:
298 | features['avg_gap_length_r'].append(1)
299 | prev_gap_r = True
300 | else:
301 | features['avg_gap_length_r'][-1] += 1
302 | else:
303 | prev_gap_r = False
304 | if left[i] != '-' and right[i] != '-':
305 | if left[i] != right[i]:
306 | features['num_mismatches'] += 1
307 | elif left[i] == right[i]:
308 | features['num_matches'] += 1
309 | if not prev_match:
310 | features['avg_consec_match_length'].append(1)
311 | prev_match = True
312 | else:
313 | features['avg_consec_match_length'][-1] += 1
314 | if left[i] != right[i]:
315 | prev_match = False
316 |
317 | if features['avg_gap_length_l'] != []:
318 | features['avg_gap_length_l'] = np.mean(features['avg_gap_length_l'])
319 | else:
320 | features['avg_gap_length_l'] = 0
321 | if features['avg_gap_length_r'] != []:
322 | features['avg_gap_length_r'] = np.mean(features['avg_gap_length_r'])
323 | else:
324 | features['avg_gap_length_r'] = 0
325 | if features['avg_consec_match_length'] != []:
326 | features['avg_consec_match_length'] = np.mean(features['avg_consec_match_length'])
327 | else:
328 | features['avg_consec_match_length'] = 0
329 |
330 | features = sorted(features.items(),key = lambda x:x[0],reverse= False)
331 | return [x[1] for x in features]
332 |
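    # A small worked example of the meta features above (illustrative tokens, not from the repo),
    # assuming '-' marks a gap in the aligned token lists:
    #
    #   left  = ['the', '-',     'quick', 'fox']
    #   right = ['the', 'brown', 'quick', 'dog']
    #
    #   yields length=4, num_gaps_l=1, num_gaps_r=0, num_matches=2, num_mismatches=1,
    #   avg_gap_length_l=1.0, avg_gap_length_r=0, avg_consec_match_length=1.0, plus the
    #   jaccard_score of the two token lists; the values are returned sorted by feature name.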
333 |
334 | def _compute_idf_score(self,left,right):
335 | idf_scores = []
336 |
337 | for w in left:
338 | if w in self._idf_score_dict:
339 | idf_scores.append(self._idf_score_dict[w])
340 |
341 | for w in right:
342 | if w in self._idf_score_dict:
343 | idf_scores.append(self._idf_score_dict[w])
344 |         if not idf_scores:
345 |             return [0.0, 0.0]
346 |         return [np.mean(idf_scores),np.median(idf_scores)]
347 |
348 |
349 |
350 |
351 | def main():
352 |     parser = argparse.ArgumentParser(description='Classifier to label aligned text as substantive (1) or boilerplate (0)')
353 |     parser.add_argument('command',
354 |                         help='command to run, options are: construct_training_set, compute_tfidf_scores, train_model, evaluate_model')
355 | parser.add_argument('--alignment_samples_doc', dest='alignment_samples',
356 | help="file path to the alignment samples used to construct training set ")
357 |
358 | args = parser.parse_args()
359 |
360 | if args.command == "construct_training_set":
361 | construct_training_set(open(args.alignment_samples),
362 | os.environ['POLICY_DIFFUSION']+"/data/classifier/alignments_training_set_high_scores.csv",50)
363 | elif args.command == "compute_tfidf_scores":
364 | alignments_file = codecs.open("/mnt/data/sunlight/dssg/alignment_results/bill_to_bill_alignments.txt",
365 | encoding = "utf8")
366 | out_file = "/mnt/data/sunlight/dssg/features/alignment_tfidf_scores.p"
367 | compute_tfidf_scores(alignments_file,out_file)
368 |
369 |
370 | elif args.command == "train_model":
371 | pass
372 | elif args.command == "evaluate_model":
373 | pass
374 | else:
375 | print args
376 |         print "command not recognized, please enter construct_training_set, compute_tfidf_scores, train_model or evaluate_model"
377 |
378 |
379 | if __name__ == "__main__":
380 | main()
381 |
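# Example invocations (a sketch; the alignment-samples path below is hypothetical):
#
#   python alignment_classifier.py construct_training_set --alignment_samples_doc /path/to/alignment_samples
#   python alignment_classifier.py compute_tfidf_scores
#
# train_model and evaluate_model are accepted commands but are currently no-ops in main().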
382 |
383 |
--------------------------------------------------------------------------------
/data/model_legislation_urls/clean_urls.txt:
--------------------------------------------------------------------------------
1 | http://publicpolicyalliance.org/legislation/model-alac-bill/
2 | http://www.svia.org/Relations/Legislation.aspx
3 | http://www.mpp.org/legislation/model-medical-marijuana-bill.html?referrer=https://www.google.com/
4 | http://www.bornfreeusa.org/downloads/pdf/Model_State_Legislation_Pets_Shops.pdf
5 | http://www.bornfreeusa.org/downloads/pdf/Model_City_Ordinance_Pet_Shops.pdf
6 | http://www.bornfreeusa.org/downloads/pdf/Model_City_Ordinance_Swap_Meets.pdf
7 | http://www.bornfreeusa.org/downloads/pdf/Model_Unweaned_Bird_Legislation.pdf
8 | http://www.bornfreeusa.org/downloads/pdf/Model_Unweaned_Bird_Legislation.pdf
9 | http://www.bornfreeusa.org/downloads/pdf/Model_State_Legislation_Traveling_Circus.pdf
10 | http://www.bornfreeusa.org/downloads/pdf/Model_City_Legislation_Traveling_Circus.pdf
11 | http://www.bornfreeusa.org/downloads/pdf/Model_State_Legislation_for_Display_of_Exotics.pdf
12 | http://www.bornfreeusa.org/downloads/pdf/Model_City_Legislation_for_Display_of_Exotics.
13 | http://www.bornfreeusa.org/downloads/pdf/Model_City_Ordinance_for_trapping.pdf
14 | http://www.bornfreeusa.org/downloads/pdf/Model_Wildlife_Feeding_Legislation.pdf
15 | http://images2.americanprogress.org/campus/web/ALEC_voter_ID_model_legislation.pdf
16 | http://www.publiccharters.org/wp-content/uploads/2014/01/ModelLaw_P7-wCVR_20110402T222341.pdf
17 | http://apps.americanbar.org/tax/groups/salt/ABA1_OFFICIAL_MODEL_ACT_REPORT_AS_ADOPTED_8-7-06.pdf
18 | http://www.justice.gov/olp/model-state-provisions-pimping-pandering-and-prostitution
19 | http://www.innocenceproject.org/free-innocent/improve-the-law/PreservationofBiologicalEvidencePrescriptiveModelBill2015.pdf
20 | http://www.innocenceproject.org/free-innocent/improve-the-law/PreservationofBiologicalEvidenceTaskForceKeyedtoNISTModelBillRB.pdf
21 | http://www.innocenceproject.org/free-innocent/improve-the-law/EWIDPrescriptiveModelBill2015.pdf
22 | http://www.innocenceproject.org/free-innocent/improve-the-law/EWIDStandardTaskForceModelBill2015.pdf
23 | http://www.innocenceproject.org/free-innocent/improve-the-law/RecordingofCustodialInterrogationsModelBill2015.pdf
24 | http://www.innocenceproject.org/free-innocent/improve-the-law/CompensationModelBill2015.pdf
25 | http://www.innocenceproject.org/free-innocent/improve-the-law/JailhouseInformantModelBill2015.pdf
26 | http://www.innocenceproject.org/free-innocent/improve-the-law/AccesstoPostConvictionDNATestingModelBill2015.pdf
27 | http://www.goodjobsfirst.org/sites/default/files/docs/pdf/GJF_model_disclosure.pdf
28 | http://www.goodjobsfirst.org/sites/default/files/docs/pdf/GJF_model_clawbacks.pdf
29 | http://www.goodjobsfirst.org/sites/default/files/docs/pdf/GJF_model_jobstandards.pdf
30 | http://www.goodjobsfirst.org/sites/default/files/docs/pdf/GJF_model_UEDB.pdf
31 | http://www.goodjobsfirst.org/sites/default/files/docs/pdf/GJF_model_complete.pdf
32 | http://www.pcia.com/images/Advocacy_Docs/PCIA_Model_State_Siting_Legislation_2012.pdf
33 | http://nepc.colorado.edu/files/NEPC-VirtSchool-2-LB-Bathon.pdf
34 | http://www.shallnot.org/legislation
35 | http://www.khi.org/assets/uploads/news/13359/goldwater_institute_right_to_try_model_legislation.pdf
36 | http://www.icmec.org/en_X1/pdf/Child_Pornography_Model_Law_English_7th_Edition_2012.pdf
37 | http://www.federationforchildren.org/wp-content/uploads/2013/12/Parental-Choice-Scholarship-Program-Act-MTE.pdf
38 | http://www.federationforchildren.org/wp-content/uploads/2013/12/Parental-Choice-Scholarship-Program-Act-UE.pdf
39 | http://www.federationforchildren.org/wp-content/uploads/2013/12/Parental-Choice-Scholarship-Program-Act-UEMTSA.pdf
40 | http://www.federationforchildren.org/wp-content/uploads/2013/12/Great-Schools-Tax-Credit-Program-Act.pdf
41 | http://www.federationforchildren.org/wp-content/uploads/2013/12/Education-Savings-Account-Act.pdf
42 | http://www.federationforchildren.org/wp-content/uploads/2013/12/Military-Family-Scholarship-Program-Act.pdf
43 | http://www.federationforchildren.org/wp-content/uploads/2013/12/Foster-Child-Scholarship-Program-Act.pdf
44 | http://greatlakescenter.org/docs/Policy_Briefs/Research-Based-Options/02-Trujillo_Turnarounds-LB.pdf
45 | http://www.academicfreedompetition.com/freedom.php
46 | http://www.dpcare.org/#!dpcc-model-legislation/c14ob
47 | http://toxicsinpackaging.org/model-legislation/model/
48 | http://www.adl.org/assets/pdf/combating-hate/Hate-Crimes-Law.pdf
49 | http://www.davidyerushalmilaw.com/CLE-Course-on-Draft-Uniform-Act--American-Laws-for-American-Courts-b25-p0.html%22
50 | https://www.aclu.org/model-act-regulating-use-wearable-body-cameras-law-enforcement
51 | http://www.ncsl.org/documents/standcomm/sccomfc/point_of_sale_model_bill2010.pdf
52 | http://object.cato.org/pdfs/model-tax-credit-legislation-schaeffer-cato.pdf
53 | http://inspectorsgeneral.org/files/2011/01/IG-Model-Legislation.pdf
54 | http://www.inacol.org/wp-content/uploads/2015/02/Principles-For-Model-Legislation-2012.pdf
55 | http://www.emacweb.org/index.php/mutualaidresources/intrastate-mutual-aid/modellegislation
56 | http://aldf.org/downloads/ALDF_Model_Laws_v15_0.pdf
57 | http://www.nationalpartnership.org/research-library/work-family/psd/model-paid-sick-and-safe-days-legislation.pdf
58 | http://www.nationalpartnership.org/research-library/work-family/psd/section-by-section-analysis-model-legislation.pdf
59 | http://www.nationalpartnership.org/research-library/work-family/psd/fact-sheet-model-legislation-main-points.pdf
60 | https://www.aapa.org/WorkArea/DownloadAsset.aspx?id=548
61 | http://www.indianasenaterepublicans.com/clientuploads/directory/publications/Sen%20David%20Long%20Article%20V%20Packet-Online.pdf
62 | https://www.mackinac.org/21341http://www.naso.org/Resources/Legislation/ModelLegislation.aspx
63 | http://www.federationforchildren.org/wp-content/uploads/2013/12/Great-Schools-Tax-Credit-Program-Act.pdf?e3490a
64 | http://www.federationforchildren.org/wp-content/uploads/2013/12/Parental-Choice-Scholarship-Program-Act-UEMTSA.pdf?e3490a
65 | http://www.federationforchildren.org/wp-content/uploads/2013/12/Parental-Choice-Scholarship-Program-Act-UE.pdf?e3490a
66 | http://www.federationforchildren.org/wp-content/uploads/2013/12/Parental-Choice-Scholarship-Program-Act-MTE.pdf?e3490a
67 | http://www.federationforchildren.org/wp-content/uploads/2013/12/Great-Schools-Tax-Credit-Program-Act.pdf?e3490a
68 | http://autisticadvocacy.org/wp-content/uploads/2014/03/OrganTransplantationModelLegislation.pdf
69 | https://s3.amazonaws.com/peacelegislation/PEACE+Act.pdf
70 | http://www.peaceonourstreets.com/peace
71 | http://www.splc.org/article/2000/07/student-press-law-center-model-legislation-to-protect-student-free-expression-rights
72 | http://www.splc.org/article/1998/09/student-press-law-center-model-guidelines-for-high-school-student-media
73 | http://www.splc.org/article/2009/02/splc-college-student-media-model-guidelines
74 | http://www.peaceonourstreets.com/hemp
75 | http://www.safeaccessnow.org/model_legislation
76 | http://www.constitutionproject.org/pdf/FINAL%20Model%20Legislation.PDF
77 | http://web.archive.org/web/20080725012036/http://innocenceproject.org/docs/Preservation_Evidence_Prescriptive_08.pdf
78 | http://web.archive.org/web/20080705090003/http://www.innocenceproject.org/docs/Preservation_Pilot_08.pdf
79 | http://web.archive.org/web/20080705180144/http://innocenceproject.org/docs/Preservation_Task_Force_08.pdf
80 | http://web.archive.org/web/20080807124850/http://www.innocenceproject.org/docs/Eye_ID_Prescriptive_08.pdf
81 | http://web.archive.org/web/20080705085403/http://www.innocenceproject.org/docs/Eyewitness_ID_Written_Policies_08.pdf
82 | http://web.archive.org/web/20080705085547/http://www.innocenceproject.org/docs/Eyewitness_ID_Task_Force_08.pdf
83 | http://web.archive.org/web/20111014124824/http://www.innocenceproject.org/docs/Recording_Custodial_Interrogations_08.pdf
84 | http://web.archive.org/web/20080725013155/http://innocenceproject.org/docs/CJR_Commission_08.pdf
85 | http://web.archive.org/web/20080705090212/http://www.innocenceproject.org/docs/Compensation08.pdf
86 | http://www.inta.org/Advocacy/Pages/WorldCustomsOrganizationModelLegislation.aspx
87 | https://www.ij.org/images/pdf_folder/legislative/business-reg-relief-act.pdf
88 | https://www.ij.org/images/pdf_folder/legislative/model-reporting-law.pdf
89 | https://www.ij.org/images/pdf_folder/legislative/business-reg-act.pdf
90 | https://www.ij.org/images/pdf_folder/legislative/anti-slapp-model.pdf
91 | https://www.ij.org/images/pdf_folder/legislative/model-ed-legislation.pdf
92 | https://www.ij.org/images/pdf_folder/legislative/ijmodeleconlib.pdf
93 | https://www.ij.org/images/pdf_folder/legislative/ijmodelforfeiturelaw.pdf
94 | http://www.nclc.org/images/pdf/arbitration/model-state-arb-act-2015.pdf
95 | http://www.nclc.org/images/pdf/debt_collection/model_family_financial_protection_act.pdf
96 | http://www.nclc.org/images/pdf/legislation/model_laws/state-model-law-2011.pdf
97 | http://www.nclc.org/images/pdf/foreclosure_mortgage/mediation/model-judicial.pdf
98 | http://www.gunlaws.com/ConstitutionalCarry.htm
99 | http://www.gunlaws.com/GFZ/GFZ-BillReview.htm
100 | http://www.gunlaws.com/HighSchoolMarksmanship.htm
101 | http://www.gunlaws.com/lostcry.htm
102 | http://www.gunlaws.com/PropertyInVehicleLaw.htm
103 | http://www.gunlaws.com/DefensiveDisplay.htm
104 | http://www.gunlaws.com/MontanaMadeGuns.htm
105 | http://www.gunlaws.com/BIDSvNICS.htm
106 | http://www.gunlaws.com/sunshin.htm
107 | http://www.gunlaws.com/911-Limited-Immunity.htm
108 | http://www.gunlaws.com/EnumeratedPowersAct.htm
109 | http://ncra.files.cms-plus.com/GovernmentRelations/FINAL%20Third-Party%20Contracting%20Model%20Legislation.pdf
110 | https://www.heartland.org/policy-documents/model-bill-parent-trigger
111 | http://www.glsen.org/sites/default/files/GLSEN%20state%20model%20legislation.pdf
112 | http://www.frc.org/onepagers/model-legislation-divorce-reform-for-families-with-children
113 | http://www.lac.org/toolkits/sealing/Model%20Expungement%20Statute.pdf
114 | http://www.hopeafterrapeconception.org/model-legislation.html
115 | https://algaonline.org/DocumentCenter/View/11
116 | http://www.nelp.org/content/uploads/2015/04/NELP-Model-Legislation-Work-Sharing.pdf
117 | http://www.flushthetpp.org/tpp-free-zone-model-legislation/
118 | https://www.proenglish.org/official-english/legislation/model-legislation.html
119 | http://www.nascla.org/nascla-model-legislation
120 | https://d3n8a8pro7vhmx.cloudfront.net/offnow/pages/224/attachments/original/1416085011/4th-Amendment-Protection-Act.pdf
121 | https://d3n8a8pro7vhmx.cloudfront.net/offnow/pages/224/attachments/original/1409604203/Electronic-Data-Privacy-Act.pdf?1409604203
122 | https://d3n8a8pro7vhmx.cloudfront.net/offnow/pages/224/attachments/original/1409603024/Freedom_from_Drone_Surveillance_Act.pdf?1409603024
123 | https://d3n8a8pro7vhmx.cloudfront.net/offnow/pages/224/attachments/original/1409604293/Freedom-from-Location-Surveillance-Act.pdf?1409604293
124 | https://d3n8a8pro7vhmx.cloudfront.net/offnow/pages/224/attachments/original/1409604293/Freedom-from-Location-Surveillance-Act.pdf?1409604293
125 | https://d3n8a8pro7vhmx.cloudfront.net/offnow/pages/224/attachments/original/1409603013/CHOICE_Act_of_2015.pdf?1409603013
126 | http://www.cchr.org/download-material/model-legislation.html
127 | http://www.cchr.org/sites/default/files/Electroshock_Model_Legislation.pdf
128 | http://www.cchr.org/sites/default/files/Deadly_Restraint_Model_Legislation.pdf
129 | http://www.cchr.org/sites/default/files/Involuntary_Commitment_Model_Legislation.pdf
130 | http://www.cchr.org/sites/default/files/Rape_Model_Legislation.pdf
131 | http://www.licenseportability.org/assets/pdf/Interstate-Medical-Licensure-Compact-(FINAL).pdf
132 | http://assets.aarp.org/rgcenter/consume/d17158_dwell.pdf
133 | http://ticas.org/sites/default/files/legacy/files/File/Model%20Tax%20Credit.pdf
134 | https://represent.us/wp-content/uploads/2015/04/AACA-Full-Provisions.pdf
135 | http://www.naiaonline.org/uploads/Main_Upload_Directory/NaiaPetFriendlyGuide.pdf
136 | http://www.naiaonline.org/pdfs/NAIA_%20Model_Animal_Control_Law_Final.pdf
137 | http://www.naiaonline.org/uploads/Main_Upload_Directory/naiaShelterReportingAct2014.pdf
138 | http://www.naiaonline.org/pdfs/ShelterImportAndReportingModel.pdf
139 | http://www.naiaonline.org/uploads/Main_Upload_Directory/DogPurchaserProtectionModelLaw.pdf
140 | http://www.naiaonline.org/articles/article/naia-resolution-supporting-animal-welfare#sthash.X3spi6jw.dpbs
141 | http://www.naic.org/documents/committees_b_exchanges_adopted_health_benefit_exchanges.pdf
142 | http://netchoice.org/wp-content/uploads/maiyn-online-safety-model-legislation-v2-6.pdf
143 | http://wdr.doleta.gov/directives/attach/UIPL/UIPL_20_12_att7.pdf
144 | http://wdr.doleta.gov/directives/attach/UIPL/UIPL_20_12_att8.pdf
145 | http://wdr.doleta.gov/directives/attach/UIPL/UIPL_20_12_att5.pdf
146 | http://wdr.doleta.gov/directives/attach/UIPL/UIPL_20_12_att1.pdf
147 | http://wdr.doleta.gov/directives/attach/UIPL/UIPL_20_12_att1.pdf
148 | http://wdr.doleta.gov/directives/attach/UIPL/UIPL_20_12_acc.pdf
149 | http://www.pwia.org/assets/cabinets/Cabinet474/PWIAModelLegislation.pdf
150 | http://www.nhcsl.org/2007-1.php
151 | http://legis.state.nm.us/Sessions/07%20Regular/final/SB0600.pdf
152 | http://www.nhcsl.org/model/HighSchoolOutcomesImprovementAct.pdf
153 | http://www.nhcsl.org/model/HighSchoolOutcomesImprovementAct.pdf
154 | http://gallery.mailchimp.com/c1a51befb8159efb3bbd1f2620f9e1/files/VRA_ModelResolution.pdf
155 | http://www.aamva.org/uploadedFiles/MainSite/Content/SolutionsBestPractices/BestPracticesModelLegislation(1)/ModelLaw_ExcessivelyRaisedVehicles.pdf
156 | http://www.aamva.org/uploadedFiles/MainSite/Content/SolutionsBestPractices/BestPracticesModelLegislation(1)/ModelLaw_LowSpeedVehicles.pdf
157 | http://www.aamva.org/uploadedFiles/MainSite/Content/SolutionsBestPractices/BestPracticesModelLegislation(1)/ModelLaw_ElectronicLienTitle.pdf
158 | http://www.aamva.org/uploadedFiles/MainSite/Content/SolutionsBestPractices/BestPracticesModelLegislation(1)/ModelLaw_AdvertisingonInternet.pdf
159 | http://www.aamva.org/uploadedFiles/MainSite/Content/SolutionsBestPractices/BestPracticesModelLegislation(1)/ModelLawAdvertisingOverInternetJurisdictionCourts.pdf
160 | http://www.aamva.org/uploadedFiles/MainSite/Content/SolutionsBestPractices/BestPracticesModelLegislation(1)/ModelLaw_UnfairTradePractices.pdf
161 | http://www.aamva.org/uploadedFiles/MainSite/Content/SolutionsBestPractices/BestPracticesModelLegislation(1)/ModelLaw_ProspectivePurchaserInquiryFeeForNMVTIS.pdf
162 | http://www.aamva.org/uploadedFiles/MainSite/Content/SolutionsBestPractices/BestPracticesModelLegislation(1)/ModelLaw_MotorCarrierStaggeredRegistration.pdf
163 | http://www.aamva.org/uploadedFiles/MainSite/Content/SolutionsBestPractices/BestPracticesModelLegislation(1)/ModelLaw_DisclosurePersnlInfoInMVRecords.pdf
164 | http://www.aamva.org/uploadedFiles/MainSite/Content/SolutionsBestPractices/BestPracticesModelLegislation(1)/ModelLaw_ContractingForMotorVehicleRegistration.pdf
165 | http://www.aamva.org/uploadedFiles/MainSite/Content/SolutionsBestPractices/BestPracticesModelLegislation(1)/ModelLaw_SalvageVehicleTitling.pdf
166 | http://www.aamva.org/uploadedFiles/MainSite/Content/SolutionsBestPractices/BestPracticesModelLegislation(1)/ModelLaw_MVInspectionsByIndependentContractors.pdf
167 | http://www.aamva.org/uploadedFiles/MainSite/Content/SolutionsBestPractices/BestPracticesModelLegislation(1)/ModelLaw_PrivatizationThirdParties.pdf
168 |
169 |
--------------------------------------------------------------------------------
/db/elasticsearch.yml:
--------------------------------------------------------------------------------
1 | ##################### Elasticsearch Configuration Example #####################
2 |
3 | # This file contains an overview of various configuration settings,
4 | # targeted at operations staff. Application developers should
5 | # consult the guide at .
6 | #
7 | # The installation procedure is covered at
8 | # .
9 | #
10 | # Elasticsearch comes with reasonable defaults for most settings,
11 | # so you can try it out without bothering with configuration.
12 | #
13 | # Most of the time, these defaults are just fine for running a production
14 | # cluster. If you're fine-tuning your cluster, or wondering about the
15 | # effect of a certain configuration option, please _do ask_ on the
16 | # mailing list or IRC channel [http://elasticsearch.org/community].
17 |
18 | # Any element in the configuration can be replaced with environment variables
19 | # by placing them in ${...} notation. For example:
20 | #
21 | #node.rack: ${RACK_ENV_VAR}
22 |
23 | # For information on supported formats and syntax for the config file, see
24 | #
25 |
26 |
27 | ################################### Cluster ###################################
28 |
29 | # Cluster name identifies your cluster for auto-discovery. If you're running
30 | # multiple clusters on the same network, make sure you're using unique names.
31 | #
32 | cluster.name: sunlightcluster
33 |
34 |
35 | #################################### Node #####################################
36 |
37 | # Node names are generated dynamically on startup, so you're relieved
38 | # from configuring them manually. You can tie this node to a specific name:
39 | #
40 | node.name: "sunlight_0"
41 |
42 | # Every node can be configured to allow or deny being eligible as the master,
43 | # and to allow or deny to store the data.
44 | #
45 | # Allow this node to be eligible as a master node (enabled by default):
46 | #
47 | #node.master: true
48 | #
49 | # Allow this node to store data (enabled by default):
50 | #
51 | #node.data: true
52 |
53 | # You can exploit these settings to design advanced cluster topologies.
54 | #
55 | # 1. You want this node to never become a master node, only to hold data.
56 | # This will be the "workhorse" of your cluster.
57 | #
58 | #node.master: false
59 | #node.data: true
60 | #
61 | # 2. You want this node to only serve as a master: to not store any data and
62 | # to have free resources. This will be the "coordinator" of your cluster.
63 | #
64 | #node.master: true
65 | #node.data: false
66 | #
67 | # 3. You want this node to be neither master nor data node, but
68 | # to act as a "search load balancer" (fetching data from nodes,
69 | # aggregating results, etc.)
70 | #
71 | #node.master: false
72 | #node.data: false
73 |
74 | # Use the Cluster Health API [http://localhost:9200/_cluster/health], the
75 | # Node Info API [http://localhost:9200/_nodes] or GUI tools
76 | # such as ,
77 | # ,
78 | # and
79 | # to inspect the cluster state.
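80 | #
81 | # A minimal Python sketch (an illustration, assuming the `requests` package and a node
82 | # listening on localhost:9200) for checking the cluster state from code:
83 | #
84 | #   import requests
85 | #   print requests.get("http://localhost:9200/_cluster/health").json()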
80 |
81 | # A node can have generic attributes associated with it, which can later be used
82 | # for customized shard allocation filtering, or allocation awareness. An attribute
83 | # is a simple key value pair, similar to node.key: value, here is an example:
84 | #
85 | #node.rack: rack314
86 |
87 | # By default, multiple nodes are allowed to start from the same installation location
88 | # to disable it, set the following:
89 | #node.max_local_storage_nodes: 1
90 |
91 |
92 | #################################### Index ####################################
93 |
94 | # You can set a number of options (such as shard/replica options, mapping
95 | # or analyzer definitions, translog settings, ...) for indices globally,
96 | # in this file.
97 | #
98 | # Note, that it makes more sense to configure index settings specifically for
99 | # a certain index, either when creating it or by using the index templates API.
100 | #
101 | # See and
102 | #
103 | # for more information.
104 |
105 | # Set the number of shards (splits) of an index (5 by default):
106 | #
107 | #index.number_of_shards: 5
108 |
109 | # Set the number of replicas (additional copies) of an index (1 by default):
110 | #
111 | #index.number_of_replicas: 1
112 |
113 | # Note, that for development on a local machine, with small indices, it usually
114 | # makes sense to "disable" the distributed features:
115 | #
116 | #index.number_of_shards: 1
117 | #index.number_of_replicas: 0
118 |
119 | # These settings directly affect the performance of index and search operations
120 | # in your cluster. Assuming you have enough machines to hold shards and
121 | # replicas, the rule of thumb is:
122 | #
123 | # 1. Having more *shards* enhances the _indexing_ performance and allows to
124 | # _distribute_ a big index across machines.
125 | # 2. Having more *replicas* enhances the _search_ performance and improves the
126 | # cluster _availability_.
127 | #
128 | # The "number_of_shards" is a one-time setting for an index.
129 | #
130 | # The "number_of_replicas" can be increased or decreased anytime,
131 | # by using the Index Update Settings API.
132 | #
133 | # Elasticsearch takes care about load balancing, relocating, gathering the
134 | # results from nodes, etc. Experiment with different settings to fine-tune
135 | # your setup.
136 |
137 | # Use the Index Status API () to inspect
138 | # the index status.
139 |
140 |
141 | #################################### Paths ####################################
142 |
143 | # Path to directory containing configuration (this file and logging.yml):
144 | #
145 | #path.conf: /path/to/conf
146 |
147 | # Path to directory where to store index data allocated for this node.
148 | #
149 | path.data: /mnt/elasticsearch/
150 | #
151 | # Can optionally include more than one location, causing data to be striped across
152 | # the locations (a la RAID 0) on a file level, favouring locations with most free
153 | # space on creation. For example:
154 | #
155 | #path.data: /path/to/data1,/path/to/data2
156 |
157 | # Path to temporary files:
158 | #
159 | #path.work: /path/to/work
160 |
161 | # Path to log files:
162 | #
163 | #path.logs: /mnt/data/sunlight/es_logs
164 |
165 | # Path to where plugins are installed:
166 | #
167 | #path.plugins: /path/to/plugins
168 |
169 |
170 | #################################### Plugin ###################################
171 |
172 | # If a plugin listed here is not installed for current node, the node will not start.
173 | #
174 | #plugin.mandatory: mapper-attachments,lang-groovy
175 |
176 |
177 | ################################### Memory ####################################
178 |
179 | # Elasticsearch performs poorly when JVM starts swapping: you should ensure that
180 | # it _never_ swaps.
181 | #
182 | # Set this property to true to lock the memory:
183 | #
184 | bootstrap.mlockall: true
185 |
186 | # Make sure that the ES_MIN_MEM and ES_MAX_MEM environment variables are set
187 | # to the same value, and that the machine has enough memory to allocate
188 | # for Elasticsearch, leaving enough memory for the operating system itself.
189 | #
190 | # You should also make sure that the Elasticsearch process is allowed to lock
191 | # the memory, eg. by using `ulimit -l unlimited`.
192 |
193 |
194 | ############################## Network And HTTP ###############################
195 |
196 | # Elasticsearch, by default, binds itself to the 0.0.0.0 address, and listens
197 | # on port [9200-9300] for HTTP traffic and on port [9300-9400] for node-to-node
198 | # communication. (the range means that if the port is busy, it will automatically
199 | # try the next port).
200 |
201 | # Set the bind address specifically (IPv4 or IPv6):
202 | #
203 | #network.bind_host: 192.168.0.1
204 |
205 | # Set the address other nodes will use to communicate with this node. If not
206 | # set, it is automatically derived. It must point to an actual IP address.
207 | #
208 | #network.publish_host: 192.168.0.1
209 |
210 | # Set both 'bind_host' and 'publish_host':
211 | #
212 | #network.host: 192.168.0.1
213 |
214 | # Set a custom port for the node to node communication (9300 by default):
215 | #
216 | #transport.tcp.port: 9300
217 |
218 | # Enable compression for all communication between nodes (disabled by default):
219 | #
220 | #transport.tcp.compress: true
221 |
222 | # Set a custom port to listen for HTTP traffic:
223 | #
224 | #http.port: 9200
225 |
226 | # Set a custom allowed content length:
227 | #
228 | #http.max_content_length: 100mb
229 |
230 | # Disable HTTP completely:
231 | #
232 | #http.enabled: false
233 |
234 |
235 | ################################### Gateway ###################################
236 |
237 | # The gateway allows for persisting the cluster state between full cluster
238 | # restarts. Every change to the state (such as adding an index) will be stored
239 | # in the gateway, and when the cluster starts up for the first time,
240 | # it will read its state from the gateway.
241 |
242 | # There are several types of gateway implementations. For more information, see
243 | # .
244 |
245 | # The default gateway type is the "local" gateway (recommended):
246 | #
247 | #gateway.type: local
248 |
249 | # Settings below control how and when to start the initial recovery process on
250 | # a full cluster restart (to reuse as much local data as possible when using shared
251 | # gateway).
252 |
253 | # Allow recovery process after N nodes in a cluster are up:
254 | #
255 | #gateway.recover_after_nodes: 1
256 |
257 | # Set the timeout to initiate the recovery process, once the N nodes
258 | # from previous setting are up (accepts time value):
259 | #
260 | #gateway.recover_after_time: 5m
261 |
262 | # Set how many nodes are expected in this cluster. Once these N nodes
263 | # are up (and recover_after_nodes is met), begin recovery process immediately
264 | # (without waiting for recover_after_time to expire):
265 | #
266 | #gateway.expected_nodes: 2
267 |
268 |
269 | ############################# Recovery Throttling #############################
270 |
271 | # These settings allow to control the process of shards allocation between
272 | # nodes during initial recovery, replica allocation, rebalancing,
273 | # or when adding and removing nodes.
274 |
275 | # Set the number of concurrent recoveries happening on a node:
276 | #
277 | # 1. During the initial recovery
278 | #
279 | #cluster.routing.allocation.node_initial_primaries_recoveries: 4
280 | #
281 | # 2. During adding/removing nodes, rebalancing, etc
282 | #
283 | #cluster.routing.allocation.node_concurrent_recoveries: 2
284 |
285 | # Set to throttle throughput when recovering (eg. 100mb, by default 20mb):
286 | #
287 | #indices.recovery.max_bytes_per_sec: 20mb
288 |
289 | # Set to limit the number of open concurrent streams when
290 | # recovering a shard from a peer:
291 | #
292 | #indices.recovery.concurrent_streams: 5
293 |
294 |
295 | ################################## Discovery ##################################
296 |
297 | # Discovery infrastructure ensures nodes can be found within a cluster
298 | # and master node is elected. Multicast discovery is the default.
299 |
300 | # Set to ensure a node sees N other master eligible nodes to be considered
301 | # operational within the cluster. This should be set to a quorum/majority of
302 | # the master-eligible nodes in the cluster.
303 | #
304 | #discovery.zen.minimum_master_nodes: 1
305 |
306 | # Set the time to wait for ping responses from other nodes when discovering.
307 | # Set this option to a higher value on a slow or congested network
308 | # to minimize discovery failures:
309 | #
310 | #discovery.zen.ping.timeout: 3s
311 |
312 | # For more information, see
313 | #
314 |
315 | # Unicast discovery allows to explicitly control which nodes will be used
316 | # to discover the cluster. It can be used when multicast is not present,
317 | # or to restrict the cluster communication-wise.
318 | #
319 | # 1. Disable multicast discovery (enabled by default):
320 | #
321 | #discovery.zen.ping.multicast.enabled: false
322 | #
323 | # 2. Configure an initial list of master nodes in the cluster
324 | # to perform discovery when new nodes (master or data) are started:
325 | #
326 | #discovery.zen.ping.unicast.hosts: ["host1", "host2:port"]
327 |
328 | # EC2 discovery allows to use AWS EC2 API in order to perform discovery.
329 | #
330 | # You have to install the cloud-aws plugin for enabling the EC2 discovery.
331 | #
332 | # For more information, see
333 | #
334 | #
335 | # See
336 | # for a step-by-step tutorial.
337 |
338 | # GCE discovery allows to use Google Compute Engine API in order to perform discovery.
339 | #
340 | # You have to install the cloud-gce plugin for enabling the GCE discovery.
341 | #
342 | # For more information, see .
343 |
344 | # Azure discovery allows to use Azure API in order to perform discovery.
345 | #
346 | # You have to install the cloud-azure plugin for enabling the Azure discovery.
347 | #
348 | # For more information, see .
349 |
350 | ################################## Slow Log ##################################
351 |
352 | # Shard level query and fetch threshold logging.
353 |
354 | #index.search.slowlog.threshold.query.warn: 10s
355 | #index.search.slowlog.threshold.query.info: 5s
356 | #index.search.slowlog.threshold.query.debug: 2s
357 | #index.search.slowlog.threshold.query.trace: 500ms
358 |
359 | #index.search.slowlog.threshold.fetch.warn: 1s
360 | #index.search.slowlog.threshold.fetch.info: 800ms
361 | #index.search.slowlog.threshold.fetch.debug: 500ms
362 | #index.search.slowlog.threshold.fetch.trace: 200ms
363 |
364 | #index.indexing.slowlog.threshold.index.warn: 10s
365 | #index.indexing.slowlog.threshold.index.info: 5s
366 | #index.indexing.slowlog.threshold.index.debug: 2s
367 | #index.indexing.slowlog.threshold.index.trace: 500ms
368 |
369 | ################################## GC Logging ################################
370 |
371 | #monitor.jvm.gc.young.warn: 1000ms
372 | #monitor.jvm.gc.young.info: 700ms
373 | #monitor.jvm.gc.young.debug: 400ms
374 |
375 | #monitor.jvm.gc.old.warn: 10s
376 | #monitor.jvm.gc.old.info: 5s
377 | #monitor.jvm.gc.old.debug: 2s
378 |
379 | ################################## Security ################################
380 |
381 | # Uncomment if you want to enable JSONP as a valid return transport on the
382 | # http server. With this enabled, it may pose a security risk, so disabling
383 | # it unless you need it is recommended (it is disabled by default).
384 | #
385 | #http.jsonp.enable: true
386 |
--------------------------------------------------------------------------------