├── __init__.py
├── vis
├── __init__.py
├── bokeh_plots
│ ├── __init__.py
│ ├── test
│ │ ├── __init__.py
│ │ └── test_cross_filter.py
│ ├── utils.py
│ └── domains_dashboard.py
├── html
│ ├── img
│ │ ├── apply.png
│ │ ├── boost.png
│ │ ├── delete.jpg
│ │ ├── reload.png
│ │ ├── remove.png
│ │ ├── search.png
│ │ ├── nyu_stacked_black.png
│ │ └── engineering_long_black.png
│ ├── css
│ │ ├── dashboard_styles.css
│ │ ├── cross_filter.css
│ │ ├── jquery.urlive.css
│ │ ├── d3.slider.css
│ │ └── dropdowns-enhancement.min.css
│ ├── libs
│ │ ├── bootflat-2.0.4
│ │ │ └── fonts
│ │ │ │ ├── glyphicons-halflings-regular.eot
│ │ │ │ ├── glyphicons-halflings-regular.ttf
│ │ │ │ └── glyphicons-halflings-regular.woff
│ │ └── bootstrap-datetimepicker-4.15.35
│ │ │ └── css
│ │ │ └── bootstrap-datetimepicker.min.css
│ ├── cross_filter_plot_area.html
│ ├── js
│ │ ├── libs
│ │ │ ├── queue.min.js
│ │ │ ├── d3.lasso.min.js
│ │ │ └── jquery.urlive.js
│ │ ├── cross_filter.js
│ │ ├── topicvis.js
│ │ ├── utils.js
│ │ ├── crawlersigslots.js
│ │ ├── snippetsviewer.js
│ │ ├── bokeh_controller.js
│ │ ├── sigslot_core.js
│ │ └── tagsgallery.js
│ ├── base.html
│ ├── domains_dashboard.html
│ ├── cross_filter.html
│ ├── crawlervis.html
│ └── release.html
└── config.conf-in
├── models
└── __init__.py
├── elastic
├── test
│ ├── __init__.py
│ └── test_get_documents.py
├── .gitignore
├── __init__.py
├── scripts
│ ├── create_config_index.sh
│ ├── delete_index.sh
│ ├── put_mapping.sh
│ ├── create_index.sh
│ └── mapping.json
├── ddt_index_config_entries.json
├── delete_index.py
├── mapping_terms.json
├── delete.py
├── get_term_vectors.py
├── config.json
├── load_config.py
├── config.py
├── aggregations.py
├── mapping.json
├── get_config.py
├── create_index.py
├── README.md
├── add_documents.py
├── get_documents.py
├── get_mtermvectors.py
└── stopwords.txt
├── ranking
├── __init__.py
├── .gitignore
├── run.sh
├── preprocess.py
├── BayesianSets.py
├── get_bigrams_trigrams.py
├── tfidf.py
├── rank.py
├── extract_terms.py
└── word2vec.py
├── seeds_generator
├── __init__.py
├── src
│ ├── main
│ │ ├── config
│ │ │ └── queries.txt
│ │ └── java
│ │ │ └── page_downloader
│ │ │ ├── Download_Utils.java
│ │ │ ├── App.java
│ │ │ ├── Download_urls.java
│ │ │ ├── StartCrawl.java
│ │ │ ├── Extract.java
│ │ │ ├── Download.java
│ │ │ ├── BingSearch.java
│ │ │ ├── GoogleSearch.java
│ │ │ └── Crawl.java
│ └── test
│ │ └── java
│ │ └── page_downloader
│ │ └── AppTest.java
├── conf
│ └── config.properties
├── download.py
├── pom.xml
└── concat_nltk.py
├── online_classifier
├── __init__.py
├── tfidf_vector.py
├── online_classifier.py
└── tf_vector.py
├── logs
└── README.md
├── .dockerignore
├── run_demo.sh
├── conda.recipe
├── README.md
├── meta.yaml
└── build.sh
├── .gitignore
├── bin
├── ddt
└── ddt-dev
├── environment.yml
├── supervisord.conf
├── Dockerfile
├── Makefile
└── README.md
/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vis/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /elastic/test/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /ranking/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /seeds_generator/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vis/bokeh_plots/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /online_classifier/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vis/bokeh_plots/test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ranking/.gitignore: -------------------------------------------------------------------------------- 1 | selected_terms.txt 2 | exclude.txt -------------------------------------------------------------------------------- /logs/README.md: -------------------------------------------------------------------------------- 1 | Logfiles from supervisor processes go here 2 | -------------------------------------------------------------------------------- /seeds_generator/src/main/config/queries.txt: -------------------------------------------------------------------------------- 1 | explosive chemicals -------------------------------------------------------------------------------- /ranking/run.sh: -------------------------------------------------------------------------------- 1 | python rank.py ../lda_pipeline/data/lda_input.csv 3,4,7,28 2 | -------------------------------------------------------------------------------- /elastic/.gitignore: -------------------------------------------------------------------------------- 1 | /local 2 | /bin 3 | /include 4 | /lib/python* 5 | /build 6 | *.pyc 7 | -------------------------------------------------------------------------------- /elastic/__init__.py: -------------------------------------------------------------------------------- 1 | from config import es, es_server 2 | 3 | __export__ = ['es_server', 'es'] 4 | -------------------------------------------------------------------------------- /vis/html/img/apply.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool_deprecated/HEAD/vis/html/img/apply.png -------------------------------------------------------------------------------- /vis/html/img/boost.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool_deprecated/HEAD/vis/html/img/boost.png -------------------------------------------------------------------------------- /vis/html/img/delete.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool_deprecated/HEAD/vis/html/img/delete.jpg -------------------------------------------------------------------------------- /vis/html/img/reload.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool_deprecated/HEAD/vis/html/img/reload.png -------------------------------------------------------------------------------- /vis/html/img/remove.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool_deprecated/HEAD/vis/html/img/remove.png -------------------------------------------------------------------------------- /vis/html/img/search.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool_deprecated/HEAD/vis/html/img/search.png -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | vis/config.conf 2 | ranking/D_cbow_pdw_8B.pkl 3 | data/ 4 | vis/html/models/ 5 | *.pyc 6 | *.log 7 | *.class 8 | *.jar 9 | -------------------------------------------------------------------------------- /vis/html/img/nyu_stacked_black.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool_deprecated/HEAD/vis/html/img/nyu_stacked_black.png -------------------------------------------------------------------------------- /vis/html/img/engineering_long_black.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool_deprecated/HEAD/vis/html/img/engineering_long_black.png -------------------------------------------------------------------------------- /seeds_generator/conf/config.properties: -------------------------------------------------------------------------------- 1 | ACCOUNTKEY SYQ5NpERm7UmF1ZCdysXfQjS5wD41a27sSnBS5KReqA 2 | ACCOUNTKEY_GOOG AIzaSyADaHyjihNC3591IehV5pcmqK044jdrFEM 3 | CSE_ID_GOOG 016642719151054520299:gftwrd3ql-m 4 | -------------------------------------------------------------------------------- /vis/html/css/dashboard_styles.css: -------------------------------------------------------------------------------- 1 | body { 2 | background-color:transparent; 3 | } 4 | .bk-vbox { 5 | padding-left:1px; 6 | } 7 | 8 | .bk-data-table { 9 | margin: 0px 20px 20px 0px; 10 | } 11 | -------------------------------------------------------------------------------- /vis/html/libs/bootflat-2.0.4/fonts/glyphicons-halflings-regular.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool_deprecated/HEAD/vis/html/libs/bootflat-2.0.4/fonts/glyphicons-halflings-regular.eot -------------------------------------------------------------------------------- /vis/html/libs/bootflat-2.0.4/fonts/glyphicons-halflings-regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool_deprecated/HEAD/vis/html/libs/bootflat-2.0.4/fonts/glyphicons-halflings-regular.ttf -------------------------------------------------------------------------------- /vis/html/libs/bootflat-2.0.4/fonts/glyphicons-halflings-regular.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool_deprecated/HEAD/vis/html/libs/bootflat-2.0.4/fonts/glyphicons-halflings-regular.woff 
-------------------------------------------------------------------------------- /run_demo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo "Activating DDT environment..." 3 | source activate ddt 4 | 5 | echo "Using ElasticSearch at $ELASTICSEARCH_SERVER" 6 | 7 | echo "Starting services..." 8 | supervisord -c /ddt/supervisord.conf 9 | -------------------------------------------------------------------------------- /seeds_generator/src/main/java/page_downloader/Download_Utils.java: -------------------------------------------------------------------------------- 1 | public class Download_Utils{ 2 | public static String validate_url(String url){ 3 | if(!url.contains("http")) 4 | url = "http://" + url; 5 | return url; 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /conda.recipe/README.md: -------------------------------------------------------------------------------- 1 | This conda recipe was originally added by Continuum Analytics in July 2015 under the DDT license. 2 | 3 | You may need other recipes from https://github.com/memex-explorer/memex-explorer or packages from the memex channel. 4 | 5 | -------------------------------------------------------------------------------- /elastic/scripts/create_config_index.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | if [ $# -eq 0 ] 4 | then 5 | ELASTIC=http://localhost:9200 6 | else 7 | ELASTIC=$1 8 | fi 9 | 10 | ./create_index.sh config $ELASTIC 11 | ./put_mapping.sh config domains config.json $ELASTIC 12 | 13 | 14 | -------------------------------------------------------------------------------- /elastic/ddt_index_config_entries.json: -------------------------------------------------------------------------------- 1 | { 2 | "entries": [ 3 | { 4 | "id" : "1", 5 | "domain_name": "Gun Control", 6 | "index" : "gun_control" 7 | }, 8 | { 9 | "id" : "2", 10 | "domain_name": "Ebola", 11 | "index" : "ebola" 12 | } 13 | ] 14 | } 15 | 16 | -------------------------------------------------------------------------------- /elastic/delete_index.py: -------------------------------------------------------------------------------- 1 | from config import es as default_es 2 | from pprint import pprint 3 | 4 | def delete_index(es_index='', es=None): 5 | if es is None: 6 | es = default_es 7 | 8 | if es_index != "": 9 | res = es.indices.delete(index=es_index) 10 | 11 | -------------------------------------------------------------------------------- /seeds_generator/src/main/java/page_downloader/App.java: -------------------------------------------------------------------------------- 1 | package page_downloader; 2 | 3 | /** 4 | * Hello world! 5 | * 6 | */ 7 | public class App 8 | { 9 | public static void main( String[] args ) 10 | { 11 | System.out.println( "Hello World!" 
); 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | local 2 | bin 3 | include 4 | /lib 5 | python2.7 6 | /build 7 | /nltk_data 8 | config.conf 9 | results.txt 10 | *.pyc 11 | *~ 12 | *.#* 13 | *#* 14 | *.bak 15 | *log 16 | *.class 17 | *.jar 18 | *.out 19 | seeds_generator/target/* 20 | seeds_generator/conf/queries.txt 21 | ranking/D_cbow_pdw_8B.pkl 22 | data/ 23 | vis/html/models/ 24 | *.DS_Store 25 | .idea 26 | *.swp 27 | .cache/ 28 | -------------------------------------------------------------------------------- /elastic/scripts/delete_index.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | if [ $# -eq 0 ] 3 | then 4 | INDEX=memex 5 | else 6 | INDEX=$1 7 | fi 8 | 9 | if [ $# -gt 1 ] 10 | then 11 | TYPE=$2 12 | else 13 | TYPE=page 14 | fi 15 | 16 | if [ $# -gt 2 ] 17 | then 18 | ELASTIC=$3 19 | else 20 | ELASTIC=http://localhost:9200 21 | fi 22 | echo $INDEX 23 | 24 | curl -XDELETE "$ELASTIC/$INDEX/$TYPE"; echo 25 | -------------------------------------------------------------------------------- /elastic/mapping_terms.json: -------------------------------------------------------------------------------- 1 | { 2 | "terms": { 3 | "properties": { 4 | "term": { 5 | "type": "string" 6 | }, 7 | "index": { 8 | "type": "string" 9 | }, 10 | "doc_type": { 11 | "type": "string" 12 | }, 13 | "tf": { 14 | "type": "integer" 15 | }, 16 | "tag": { 17 | "type": "string" 18 | } 19 | } 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /bin/ddt: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPT_PATH="${BASH_SOURCE[0]}"; 4 | SCRIPT_DIR=$(dirname $SCRIPT_PATH) 5 | 6 | export NLTK_DATA=$SCRIPT_DIR/../lib/ddt/nltk_data 7 | export ACHE_HOME=$SCRIPT_DIR/../lib/ache/ 8 | export DDT_HOME=$SCRIPT_DIR/../lib/ddt 9 | # ugly, but DDT doesn't really have a concept of installs 10 | export PYTHONPATH=$SCRIPT_DIR/../lib/ddt:$PYTHONPATH 11 | 12 | python $SCRIPT_DIR/../lib/ddt/vis/server.py 13 | -------------------------------------------------------------------------------- /elastic/delete.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | from config import es as default_es 3 | from elasticsearch.exceptions import NotFoundError 4 | 5 | def delete(ids, es_index='memex', es_doc_type='page', es=None): 6 | if es is None: 7 | es = default_es 8 | 9 | for id in ids: 10 | try: 11 | es.delete(es_index, es_doc_type, id) 12 | except NotFoundError: 13 | continue 14 | -------------------------------------------------------------------------------- /vis/html/css/cross_filter.css: -------------------------------------------------------------------------------- 1 | .help-dropdown { 2 | padding-top:8px; 3 | } 4 | 5 | .help-dropdown-content { 6 | display: none; 7 | position: absolute; 8 | background-color: #f9f9f9; 9 | min-width: 360px; 10 | box-shadow: 0px 4px 8px 0px rgba(0,0,0,1); 11 | padding: 12px 16px; 12 | z-index: 1000; 13 | } 14 | .help-dropdown:hover .help-dropdown-content { 15 | display: block; 16 | } 17 | 18 | .bokeh_plot { 19 | padding-bottom:30px; 20 | } 21 | 22 | .bokeh_table { 23 | padding-bottom:10px; 24 | } 25 | -------------------------------------------------------------------------------- /elastic/scripts/put_mapping.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | if [ $# -eq 0 ] 3 | then 4 | INDEX=memex 5 | else 6 | INDEX=$1 7 | fi 8 | 9 | if [ $# -gt 1 ] 10 | then 11 | TYPE=$2 12 | echo $TYPE 13 | else 14 | TYPE=page 15 | fi 16 | 17 | if [ $# -gt 2 ] 18 | then 19 | MAPPING=$3 20 | else 21 | MAPPING='mapping.json' 22 | fi 23 | 24 | if [ $# -gt 3 ] 25 | then 26 | ELASTIC=$4 27 | else 28 | ELASTIC=http://localhost:9200 29 | fi 30 | 31 | curl -XPUT "$ELASTIC/$INDEX/$TYPE/_mapping?pretty=1" -d @$MAPPING 32 | -------------------------------------------------------------------------------- /vis/html/cross_filter_plot_area.html: -------------------------------------------------------------------------------- 1 | {% block content %} 2 | 3 |
3 | {{ widgets_script | safe }}
4 | {{ plots_script | safe }}
5 | {{ plots_div['queries'] | safe }}
6 | {{ plots_div['tags'] | safe }}
7 | {{ plots_div['hostnames'] | safe }}
8 | {{ plots_div['tlds'] | safe }}
9 | {{ plots_div['ts'] | safe }}
10 | 11 | {% endblock content %} 12 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: ddt 2 | 3 | channels: 4 | - memex 5 | - vida-nyu 6 | 7 | dependencies: 8 | - elasticsearch 9 | - supervisor 10 | - meld3 11 | - dateutil 12 | - cython >=0.22 13 | - ddt-word2vec 14 | - maven 15 | - nltk 16 | - scipy 17 | - numexpr >=2.4 18 | - scikit-learn >=0.16.1 19 | - pyelasticsearch >=1.2 20 | - cherrypy 21 | - requests 22 | - ache >=0.3.1 23 | - jinja2 24 | - bokeh=0.10.0 25 | - pyldavis=2.1.0 26 | - topik 27 | - functools32 28 | - networkx=1.11 29 | -------------------------------------------------------------------------------- /bin/ddt-dev: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPT_PATH="${BASH_SOURCE[0]}"; 4 | SCRIPT_DIR=$(dirname $SCRIPT_PATH) 5 | # ugly, but portable 6 | export DDT_HOME=$(python -c "import os, sys; sys.stdout.write(os.path.abspath('$SCRIPT_DIR/..')+'\n')") 7 | echo "DDT_HOME : $DDT_HOME" 8 | export NLTK_DATA=$DDT_HOME/nltk_data 9 | echo "NLTK_DATA : $NLTK_DATA" 10 | export ACHE_HOME=$(dirname $(which ache))/../lib/ache/ 11 | echo "ACHE_HOME : $ACHE_HOME" 12 | 13 | # ugly, but DDT doesn't really have a concept of installs 14 | export PYTHONPATH=$DDT_HOME:$PYTHONPATH 15 | echo "PYTHONPATH: $PYTHONPATH" 16 | 17 | python $DDT_HOME/vis/server.py 18 | -------------------------------------------------------------------------------- /elastic/scripts/create_index.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | if [ $# -eq 0 ] 3 | then 4 | INDEX=memex 5 | else 6 | INDEX=$1 7 | fi 8 | 9 | if [ $# -gt 1 ] 10 | then 11 | ELASTIC=$2 12 | else 13 | ELASTIC=http://localhost:9200 14 | fi 15 | 16 | curl -s -XPUT "$ELASTIC/$INDEX"; echo 17 | # -d '{ 18 | # "index" : { 19 | # "analysis":{ 20 | # "analyzer":{ 21 | # "html" : { 22 | # "type" : "custom", 23 | # "tokenizer" : "standard", 24 | # "filter" : ["lowercase" , "stop"], 25 | # "char_filter" : ["html_strip"] 26 | # } 27 | # } 28 | # } 29 | # } 30 | # }' 31 | -------------------------------------------------------------------------------- /vis/html/js/libs/queue.min.js: -------------------------------------------------------------------------------- 1 | !function(){function n(n){function e(){for(;i=ap;){var u=a++,e=c[u],o=t.call(e,1);o.push(l(u)),++p,e[0].apply(null,o)}}function l(n){return function(u,t){--p,null==s&&(null!=u?(s=u,a=d=0/0,o()):(c[n]=t,--d?i||e():o()))}}function o(){null!=s?m(s):f?m(s,c):m.apply(null,[s].concat(c))}var r,i,f,c=[],a=0,p=0,d=0,s=null,m=u;return n||(n=1/0),r={defer:function(){return s||(c.push(arguments),++d,e()),r},await:function(n){return m=n,f=!1,d||o(),r},awaitAll:function(n){return m=n,f=!0,d||o(),r}}}function u(){}var t=[].slice;n.version="1.0.7","function"==typeof define&&define.amd?define(function(){return n}):"object"==typeof module&&module.exports?module.exports=n:this.queue=n}(); -------------------------------------------------------------------------------- /conda.recipe/meta.yaml: -------------------------------------------------------------------------------- 1 | package: 2 | name: ddt 3 | version: 2.3.0 4 | 5 | build: 6 | number: 0 7 | has_prefix_files: 8 | - lib/ddt/vis/config.conf 9 | 10 | source: 11 | git_url: https://github.com/ViDA-NYU/domain_discovery_tool 12 | git_tag: 2.3 13 | 14 | requirements: 15 | build: 16 | - cython >=0.22 
17 | - ddt-word2vec 18 | - maven 19 | - nltk 20 | run: 21 | - scipy 22 | - cython >=0.22 23 | - numexpr >=2.4 24 | - scikit-learn >=0.16.1 25 | - pyelasticsearch >=1.2 26 | - nltk 27 | - cherrypy 28 | - requests 29 | - ddt-word2vec 30 | - ache >=0.3.1 31 | - functools32 32 | 33 | #about: 34 | # license: Apache? 35 | -------------------------------------------------------------------------------- /vis/config.conf-in: -------------------------------------------------------------------------------- 1 | [global] 2 | server.socket_host = 0.0.0.0 3 | server.socket_port = 8084 4 | server.thread_pool = 10 5 | 6 | [/] 7 | tools.staticdir.root = . 8 | tools.encode.on = True 9 | tools.gzip.on = True 10 | 11 | [/css] 12 | tools.staticdir.on = True 13 | tools.staticdir.dir = css 14 | 15 | [/js] 16 | tools.staticdir.on = True 17 | tools.staticdir.dir = js 18 | 19 | [/img] 20 | tools.staticdir.on = True 21 | tools.staticdir.dir = img 22 | 23 | [/models] 24 | tools.staticdir.on = True 25 | tools.staticdir.dir = models 26 | 27 | [/bootflat-2.0.4] 28 | tools.staticdir.on = True 29 | tools.staticdir.dir = libs/bootflat-2.0.4 30 | 31 | [/bootstrap-datetimepicker-4.15.35] 32 | tools.staticdir.on = True 33 | tools.staticdir.dir = libs/bootstrap-datetimepicker-4.15.35 -------------------------------------------------------------------------------- /online_classifier/tfidf_vector.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_extraction.text import TfidfTransformer 2 | from nltk import corpus 3 | 4 | from tf_vector import tf_vectorizer 5 | 6 | class tfidf_vectorizer(tf_vectorizer): 7 | 8 | def __init__(self, convert_to_ascii=False, max_features= 10000, ngram_range=(1,1)): 9 | self.tfidf_transformer = None 10 | tf_vectorizer.__init__(self, convert_to_ascii, max_features, ngram_range) 11 | 12 | def tfidf(self, data): 13 | [X_counts, features] = self.vectorize(data) 14 | if self.tfidf_transformer is None: 15 | self.tfidf_transformer = TfidfTransformer() 16 | X = self.tfidf_transformer.fit_transform(X_counts) 17 | else: 18 | X = self.tfidf_transformer.transform(X_counts) 19 | 20 | return [X, X_counts, features] 21 | 22 | -------------------------------------------------------------------------------- /seeds_generator/src/test/java/page_downloader/AppTest.java: -------------------------------------------------------------------------------- 1 | package page_downloader; 2 | 3 | import junit.framework.Test; 4 | import junit.framework.TestCase; 5 | import junit.framework.TestSuite; 6 | 7 | /** 8 | * Unit test for simple App. 
9 | */ 10 | public class AppTest 11 | extends TestCase 12 | { 13 | /** 14 | * Create the test case 15 | * 16 | * @param testName name of the test case 17 | */ 18 | public AppTest( String testName ) 19 | { 20 | super( testName ); 21 | } 22 | 23 | /** 24 | * @return the suite of tests being tested 25 | */ 26 | public static Test suite() 27 | { 28 | return new TestSuite( AppTest.class ); 29 | } 30 | 31 | /** 32 | * Rigorous Test :-) 33 | */ 34 | public void testApp() 35 | { 36 | assertTrue( true ); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /elastic/get_term_vectors.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | from os import environ 3 | 4 | from config import es as default_es 5 | 6 | es = default_es 7 | 8 | query = { 9 | "query": { 10 | "match_all": {} 11 | }, 12 | "fields": [] 13 | } 14 | res = es.search(body=query, 15 | index=environ['ELASTICSEARCH_INDEX'] if environ.get('ELASTICSEARCH_INDEX') else 'memex', 16 | doc_type=environ['ELASTICSEARCH_DOC_TYPE'] if environ.get('ELASTICSEARCH_DOC_TYPE') else 'page') 17 | 18 | hits = res['hits'] 19 | print 'Documents found: %d' % hits['total'] 20 | ids = [hit['_id'] for hit in hits['hits']] 21 | body={ 22 | "ids": ids, 23 | "parameters": { 24 | "fields": [ "text" ] 25 | } 26 | } 27 | res = es.mtermvectors(index='memex', 28 | doc_type='page', 29 | body=body) 30 | 31 | -------------------------------------------------------------------------------- /ranking/preprocess.py: -------------------------------------------------------------------------------- 1 | from nltk import word_tokenize 2 | from nltk.text import TextCollection 3 | from nltk import corpus 4 | 5 | from pprint import pprint 6 | 7 | ENGLISH_STOPWORDS = set(corpus.stopwords.words('english')) 8 | 9 | class TextPreprocess: 10 | def __init__(self,display=False): 11 | self.display=display 12 | 13 | def preprocess(self,text): 14 | #text = text.split(" "); 15 | text = word_tokenize(text) 16 | if self.display: 17 | print "After Tokenizing" 18 | print text 19 | print "\n\n" 20 | 21 | text=[w.strip().lower() for w in text if not w.strip() in ENGLISH_STOPWORDS and len(w.strip())>2] 22 | 23 | tc = TextCollection([text]) 24 | words = list(set(tc)) 25 | 26 | word_tf = {word: tc.tf(word, text) * len(text) for word in words} 27 | 28 | return word_tf 29 | -------------------------------------------------------------------------------- /ranking/BayesianSets.py: -------------------------------------------------------------------------------- 1 | from math import sqrt 2 | from numpy import * 3 | 4 | import sys 5 | 6 | reload(sys) 7 | sys.setdefaultencoding("utf-8") 8 | 9 | 10 | class BayesianSets: 11 | # D-> Query Set 12 | # X-> Data Set 13 | def score(self, D, X) : 14 | 15 | #Compute Bayesian Sets Parameters 16 | c = 2 17 | N = D.shape[0] 18 | T = concatenate((D,X)) 19 | m = divide(sum(T, axis=0),T.shape[0]) 20 | 21 | a = multiply(m, c) 22 | b = multiply(subtract(1,m),c) 23 | 24 | at = add(a,sum(D, axis=0)) 25 | bt = subtract(add(b,N),sum(D, axis=0)) 26 | 27 | C = sum(subtract(add(subtract(log(add(a,b)),log(add(add(a,b),N))), log(bt)), log (b))) 28 | 29 | q = transpose(add(subtract(subtract(log(at),log(a)),log(bt)), log(b))) 30 | 31 | score_X = transpose(add(C, dot(X,q))) 32 | 33 | return asarray(score_X) 34 | 35 | 
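--------------------------------------------------------------------------------
For reference, score() above matches the Bayesian Sets criterion of Ghahramani and Heller (NIPS 2005) with a Beta-Bernoulli model. Writing D for the query set (N rows), m for the column mean of the stacked data T = [D; X], and c = 2 as hard-coded above, the code computes

    \alpha = c\,m, \qquad \beta = c\,(1 - m)
    \tilde{\alpha} = \alpha + \sum_{i \in D} x_i, \qquad \tilde{\beta} = \beta + N - \sum_{i \in D} x_i
    q = \log\tilde{\alpha} - \log\alpha - \log\tilde{\beta} + \log\beta
    C = \sum_{j} \left[ \log(\alpha_j + \beta_j) - \log(\alpha_j + \beta_j + N) + \log\tilde{\beta}_j - \log\beta_j \right]
    \mathrm{score}(x) = C + q^{\top} x

so at and bt are the posterior Beta parameters and the returned vector holds the log membership score of each row of X against the query set.
-------------------------------------------------------------------------------- /elastic/config.json: 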
-------------------------------------------------------------------------------- 1 | { 2 | "domains" : { 3 | "_timestamp" : { 4 | "enabled" : true, 5 | "store" : true 6 | }, 7 | "properties" : { 8 | "domain_name" : { 9 | "type" : "string" 10 | }, 11 | "timestamp" : { 12 | "type" : "date" 13 | }, 14 | "index" : { 15 | "type" : "string" 16 | }, 17 | "doc_type": { 18 | "type": "string" 19 | }, 20 | "mapping":{ 21 | "properties": { 22 | "timestamp": { 23 | "type": "string" 24 | }, 25 | "text": { 26 | "type": "string" 27 | }, 28 | "html": { 29 | "type": "string" 30 | }, 31 | "tag":{ 32 | "type": "string" 33 | }, 34 | "content-type":{ 35 | "type": "string" 36 | } 37 | } 38 | }, 39 | "tag_colors": { 40 | "properties": { 41 | "index": { 42 | "type": "integer" 43 | }, 44 | "colors": { 45 | "type": "string" 46 | } 47 | } 48 | } 49 | } 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /supervisord.conf: -------------------------------------------------------------------------------- 1 | [supervisorctl] 2 | username=darpamemex 3 | 4 | [supervisord] 5 | childlogdir=logs 6 | logfile=supervisord.log ; (main log file;default $CWD/supervisord.log) 7 | logfile_maxbytes=50MB ; (max main logfile bytes b4 rotation;default 50MB) 8 | logfile_backups=10 ; (num of main logfile rotation backups;default 10) 9 | loglevel=info ; (log level;default info; others: debug,warn,trace) 10 | pidfile=supervisord.pid ; (supervisord pidfile;default supervisord.pid) 11 | nodaemon=true ; (start in foreground if true;default false) 12 | minfds=1024 ; (min. avail startup file descriptors;default 1024) 13 | minprocs=200 ; (min. avail process descriptors;default 200) 14 | 15 | [inet_http_server] 16 | port = 127.0.0.1:9001 17 | 18 | [program:elasticsearch] 19 | command=elasticsearch 20 | priority=1 21 | 22 | [program:ddt] 23 | command=bash ./bin/ddt-dev 24 | priority=2 25 | 26 | [rpcinterface:supervisor] 27 | supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface -------------------------------------------------------------------------------- /elastic/load_config.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | from datetime import datetime 4 | from add_documents import add_document 5 | 6 | from config import es as default_es 7 | 8 | def load_config(entries, es_index='config', es_doc_type='domains', es=None): 9 | 10 | if es is None: 11 | es = default_es 12 | 13 | add_document(entries, es_index, es_doc_type, es) 14 | 15 | if __name__ == "__main__": 16 | 17 | if len(sys.argv)>1: 18 | config_file = sys.argv[1] 19 | else: 20 | config_file = 'ddt_index_config_entries.json' 21 | 22 | if len(sys.argv)>2: 23 | es_index = sys.argv[2] 24 | else: 25 | es_index = 'config' 26 | 27 | if len(sys.argv)>3: 28 | es_doc_type = sys.argv[3] 29 | else: 30 | es_doc_type = 'domains' 31 | 32 | es = None 33 | if len(sys.argv)>4: 34 | es_host = sys.argv[4] 35 | from pyelasticsearch import ElasticSearch 36 | es = ElasticSearch(es_host) 37 | 38 | # parse the entries file before indexing; assumes the {"entries": [...]} 39 | # layout of ddt_index_config_entries.json and that add_document accepts 40 | # the resulting list of entry dicts 41 | with open(config_file) as f: 42 | entries = json.load(f)["entries"] 43 | 44 | load_config(entries, es_index, es_doc_type, es) 45 | 46 | -------------------------------------------------------------------------------- /elastic/config.py: -------------------------------------------------------------------------------- 1 | ''' 2 | provides access to elasticsearch server 3 | 4 | es_server - the name of the endpoint 5 | es - an Elasticsearch instance connected to es_server 6 | ''' 7 | 8 | from elasticsearch import Elasticsearch 9 | from os import environ 
10 | import certifi 11 | 12 | if environ.get('ELASTICSEARCH_SERVER'): 13 | es_server = environ['ELASTICSEARCH_SERVER'] 14 | else: 15 | es_server = 'http://localhost:9200/' 16 | 17 | print 'ELASTICSEARCH_SERVER ', es_server 18 | 19 | if environ.get('ELASTICSEARCH_USER'): 20 | es_user = environ['ELASTICSEARCH_USER'] 21 | else: 22 | es_user = "" 23 | 24 | print 'ELASTICSEARCH_USER ', es_user 25 | 26 | if environ.get('ELASTICSEARCH_PASSWD'): 27 | es_passwd = environ['ELASTICSEARCH_PASSWD'] 28 | else: 29 | es_passwd = "" 30 | 31 | if es_user: 32 | es = Elasticsearch([es_server], http_auth=(es_user, es_passwd), use_ssl=True, verify_certs=True, ca_certs=certifi.where(), timeout=100) 33 | else: 34 | es = Elasticsearch([es_server]) 35 | 36 | if environ.get('ELASTICSEARCH_DOC_TYPE'): 37 | es_doc_type = environ['ELASTICSEARCH_DOC_TYPE'] 38 | else: 39 | es_doc_type = 'page' 40 | 41 | 42 | -------------------------------------------------------------------------------- /vis/html/base.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | Domain Discovery Tool 11 | 12 | 13 | 24 |
25 | {% block content %} 26 | {% endblock content %} 27 |
28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /vis/html/js/cross_filter.js: -------------------------------------------------------------------------------- 1 | /** 2 | * This module handles the server callback to update the bokeh plots 3 | */ 4 | var crossFilterUpdate = function(){ 5 | var data_table_ids = ['urls', 'tlds', 'tags', 'queries']; 6 | 7 | setTimeout(function() { //need timeout to wait for class change 8 | var global_state = {}; 9 | for (i=0; i ${PREFIX}/lib/ddt/vis/config.conf 32 | 33 | cp -av vis/* ${PREFIX}/lib/ddt/vis 34 | 35 | cp -av bin/ddt ${PREFIX}/bin/ddt 36 | chmod +x ${PREFIX}/bin/ddt 37 | 38 | # ugly, but DDT hardcodes the location of word2vec here 39 | pushd ${PREFIX}/lib/ddt/ranking 40 | ln -s ../../../data/D_cbow_pdw_8B.pkl ./D_cbow_pdw_8B.pkl 41 | -------------------------------------------------------------------------------- /seeds_generator/src/main/java/page_downloader/Download_urls.java: -------------------------------------------------------------------------------- 1 | public class Download_urls { 2 | public Download_urls(){ 3 | } 4 | 5 | public void download(String[] urls, String es_index, String es_doc_type, String es_server){ 6 | Download download = new Download("uploaded", es_index, es_doc_type, es_server); 7 | 8 | for(String url: urls){ 9 | download.addTask(Download_Utils.validate_url(url)); 10 | } 11 | 12 | download.shutdown(); 13 | } 14 | 15 | public static void main(String[] args) { 16 | 17 | String urls_str = ""; //default 18 | String es_index = "memex"; 19 | String es_doc_type = "page"; 20 | String es_server = "localhost"; 21 | 22 | int i = 0; 23 | while (i < args.length){ 24 | String arg = args[i]; 25 | if(arg.equals("-u")){ 26 | urls_str = args[++i]; 27 | } else if(arg.equals("-i")){ 28 | es_index = args[++i]; 29 | } else if(arg.equals("-d")){ 30 | es_doc_type = args[++i]; 31 | } else if(arg.equals("-s")){ 32 | es_server = args[++i]; 33 | }else { 34 | System.out.println("Unrecognized option"); 35 | break; 36 | } 37 | ++i; 38 | } 39 | 40 | String[] urls = null; 41 | if(urls_str != null & !urls_str.isEmpty()) 42 | urls = urls_str.split(" "); 43 | 44 | Download_urls download_urls = new Download_urls(); 45 | download_urls.download(urls, es_index, es_doc_type, es_server); 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /seeds_generator/src/main/java/page_downloader/StartCrawl.java: -------------------------------------------------------------------------------- 1 | import java.util.ArrayList; 2 | import java.util.Arrays; 3 | 4 | public class StartCrawl { 5 | public static void main(String[] args) { 6 | 7 | String crawl = ""; //default 8 | String urls_str = ""; 9 | String top = "10"; 10 | String es_index = "memex"; 11 | String es_doc_type = "page"; 12 | String es_server = "localhost"; 13 | 14 | int i = 0; 15 | while (i < args.length){ 16 | String arg = args[i]; 17 | if(arg.equals("-c")){ 18 | crawl = args[++i]; 19 | } else if(arg.equals("-u")){ 20 | urls_str = args[++i]; 21 | } else if(arg.equals("-t")){ 22 | top = args[++i]; 23 | } else if(arg.equals("-i")){ 24 | es_index = args[++i]; 25 | } else if(arg.equals("-d")){ 26 | es_doc_type = args[++i]; 27 | } else if(arg.equals("-s")){ 28 | es_server = args[++i]; 29 | }else { 30 | System.out.println("Unrecognized option"); 31 | break; 32 | } 33 | ++i; 34 | } 35 | 36 | ArrayList urls = null; 37 | if(!urls_str.isEmpty()){ 38 | urls = new ArrayList(Arrays.asList(urls_str.split(","))); 39 | } 40 | 41 | 
Crawl c = new Crawl(es_index, es_doc_type, es_server); 42 | 43 | if(urls != null && crawl.equals("forward")) 44 | c.addForwardCrawlTask(urls, top); 45 | else if(urls != null && crawl.equals("backward")) 46 | c.addBackwardCrawlTask(urls, top); 47 | 48 | c.shutdown(); 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /vis/html/css/jquery.urlive.css: -------------------------------------------------------------------------------- 1 | /* 2 | * jquery.urlive.css v1.1.1, jQuery URLive 3 | * 4 | * View the plugin repository at: 5 | * https://github.com/markserbol/urlive 6 | * 7 | */ 8 | 9 | .urlive-container { 10 | color:inherit; 11 | text-decoration:none; 12 | display:block; 13 | width:auto; 14 | overflow:auto; 15 | position:relative; 16 | overflow:hidden; 17 | margin:0; 18 | font-size:12px; 19 | line-height:normal; 20 | } 21 | 22 | .urlive-link:hover { 23 | box-shadow:0 0 4px rgba(10,10,10,0.6); 24 | } 25 | 26 | .urlive-link * { 27 | display:block; 28 | } 29 | 30 | .urlive-image { 31 | width:100%; 32 | display:block; 33 | } 34 | 35 | .urlive-title { 36 | font-size:1.15em; 37 | font-weight:bold; 38 | } 39 | 40 | .urlive-description { 41 | font-size:1em; 42 | } 43 | 44 | .urlive-url { 45 | font-size:0.9em; 46 | overflow:hidden; 47 | white-space:nowrap; 48 | text-overflow:ellipsis; 49 | } 50 | 51 | .urlive-sitename, .urlive-type { 52 | display:none; 53 | } 54 | 55 | .urlive-img-wrapper { 56 | display: inline-block; 57 | float:left; 58 | margin-right:10px; 59 | max-width:80px; 60 | padding-left: 5px; 61 | } 62 | 63 | .urlive-text-wrapper { 64 | display: inline-block; 65 | overflow:auto; 66 | max-width:535px; 67 | } 68 | 69 | /* SMALL IMAGE STYLES*/ 70 | .urlive-img-small .urlive-img-wrapper { 71 | width:auto; 72 | max-width:80px; 73 | } 74 | 75 | /* LARGE IMAGE STYLES */ 76 | .urlive-img-large .urlive-img-wrapper { 77 | width:100%; 78 | max-width:none; 79 | float:none; 80 | } 81 | -------------------------------------------------------------------------------- /ranking/get_bigrams_trigrams.py: -------------------------------------------------------------------------------- 1 | from elasticsearch import Elasticsearch 2 | from elastic.get_documents import get_documents 3 | from online_classifier.tfidf_vector import tfidf_vectorizer 4 | import numpy as np 5 | import operator 6 | import math 7 | from sets import Set 8 | 9 | from nltk import corpus 10 | ENGLISH_STOPWORDS = set(corpus.stopwords.words('english')) 11 | 12 | MAX_PHRASES = 1000 13 | 14 | def get_bigrams_trigrams(text=[], termCount=20, es=None): 15 | 16 | bigram_vectorizer = tfidf_vectorizer(convert_to_ascii=True, ngram_range=(2,2)) 17 | trigram_vectorizer = tfidf_vectorizer(convert_to_ascii=True, ngram_range=(3,3)) 18 | 19 | [bigram_tfidf, bigram_tf, bi_corpus] = bigram_vectorizer.tfidf(text) 20 | [trigram_tfidf, trigram_tf, tri_corpus] = trigram_vectorizer.tfidf(text) 21 | 22 | N = np.shape(bigram_tfidf)[0] 23 | avg = np.divide(bigram_tfidf.sum(axis=0), N) 24 | sortedAvgIndices = np.argsort(avg)[::-1] 25 | top_bigrams = [bi_corpus[sortedAvgIndices[0,i]] for i in range(0, np.shape(sortedAvgIndices)[1])][0:termCount] 26 | 27 | N = np.shape(trigram_tfidf)[0] 28 | avg = np.divide(trigram_tfidf.sum(axis=0), N) 29 | sortedAvgIndices = np.argsort(avg)[::-1] 30 | top_trigrams = [tri_corpus[sortedAvgIndices[0,i]] for i in range(0, np.shape(sortedAvgIndices)[1])][0:termCount] 31 | 32 | return bigram_tfidf, trigram_tfidf, bigram_tf, trigram_tf, bi_corpus, tri_corpus, top_bigrams, top_trigrams 33 | 
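
# usage sketch (illustration only, not part of the original module): assumes
# scikit-learn and the nltk stopword corpus are installed; the 8-tuple return
# order is as defined in get_bigrams_trigrams() above
if __name__ == "__main__":
    docs = ["dangerous explosive chemicals stored onsite",
            "explosive chemicals require careful handling",
            "storing dangerous explosive chemicals requires permits"]
    results = get_bigrams_trigrams(text=docs, termCount=5)
    top_bigrams, top_trigrams = results[6], results[7]
    print top_bigrams
    print top_trigrams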
-------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # 2 | # Domain Discovery Tool Dockerfile 3 | # 4 | # https://github.com/ViDA-NYU/domain_discovery_tool 5 | # 6 | 7 | # Pull base image. 8 | FROM ubuntu:trusty 9 | 10 | # Install some dependencies and useful tools 11 | RUN apt-get update &&\ 12 | apt-get -y install\ 13 | build-essential\ 14 | openjdk-7-jdk\ 15 | wget curl vim 16 | 17 | # Install miniconda 18 | RUN echo 'export PATH=/opt/conda/bin:$PATH' > /etc/profile.d/conda.sh && \ 19 | wget --quiet http://repo.continuum.io/miniconda/Miniconda2-latest-Linux-x86_64.sh && \ 20 | /bin/bash /Miniconda2-latest-Linux-x86_64.sh -b -p /opt/conda && \ 21 | rm Miniconda2-latest-Linux-x86_64.sh && \ 22 | /opt/conda/bin/conda install --yes conda==3.14.1 23 | ENV PATH /opt/conda/bin:$PATH 24 | 25 | # Expose Domain Discovery Tool port 26 | EXPOSE 8084 27 | 28 | # Expose ElasticSearch ports 29 | EXPOSE 9200 30 | EXPOSE 9300 31 | 32 | # Expose Supervisord port 33 | EXPOSE 9001 34 | 35 | WORKDIR /ddt 36 | 37 | # Add build file 38 | ADD ./Makefile /ddt/Makefile 39 | 40 | # Install conda dependencies and download nltk data 41 | ADD ./environment.yml /ddt/environment.yml 42 | RUN make conda_env 43 | RUN make get_nltk_data 44 | 45 | # Compile Java app 46 | ADD ./seeds_generator /ddt/seeds_generator 47 | RUN make downloader_app 48 | 49 | # Add remaining python source files 50 | ADD . /ddt 51 | 52 | # Setup remaining configs 53 | RUN make cherrypy_config link_word2vec_data 54 | 55 | # Patch address to listen to external connections 56 | RUN sed -i "s#port = 127.0.0.1:9001#port = 0.0.0.0:9001#g" supervisord.conf 57 | 58 | CMD bash -c 'source activate ddt; /ddt/bin/ddt-dev' 59 | -------------------------------------------------------------------------------- /elastic/scripts/mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "page": { 3 | "_timestamp": { 4 | "enabled": true, 5 | "store": true 6 | }, 7 | "_id": { 8 | "path": "url" 9 | }, 10 | "properties": { 11 | "url": { 12 | "type": "string", 13 | "index": "not_analyzed" 14 | }, 15 | "text": { 16 | "type": "string", 17 | "term_vector": "yes" 18 | }, 19 | "html": { 20 | "type": "string", 21 | "index": "no" 22 | }, 23 | "query": { 24 | "type": "string" 25 | }, 26 | "retrieved": { 27 | "type": "date" 28 | }, 29 | "last_modified": { 30 | "type": "date" 31 | }, 32 | "length": { 33 | "type": "integer" 34 | }, 35 | "md5": { 36 | "type": "binary" 37 | }, 38 | "redirect": { 39 | "type": "string", 40 | "index": "not_analyzed" 41 | }, 42 | "relevance": { 43 | "type": "float" 44 | }, 45 | "thumbnail_name": { 46 | "type": "string" 47 | }, 48 | "thumbnail": { 49 | "type": "binary" 50 | }, 51 | "tag": { 52 | "type": "string" 53 | }, 54 | "class": { 55 | "type": "string" 56 | }, 57 | "doc_name": { 58 | "type": "string", 59 | "index": "not_analyzed" 60 | }, 61 | "doc_distance": { 62 | "type": "float" 63 | }, 64 | "topic_name": { 65 | "type": "string", 66 | "index": "not_analyzed" 67 | }, 68 | "x": { 69 | "type": "float" 70 | }, 71 | "y": { 72 | "type": "float" 73 | }, 74 | "topic_weight": { 75 | "type": "float" 76 | } 77 | } 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /vis/html/css/d3.slider.css: -------------------------------------------------------------------------------- 1 | .d3-slider { 2 | position: relative; 3 | font-family: 
Verdana,Arial,sans-serif; 4 | font-size: 1.1em; 5 | border: 1px solid #dadada; 6 | border-radius: 5px; 7 | z-index: 2; 8 | } 9 | 10 | .d3-slider-horizontal { 11 | height: .8em; 12 | } 13 | 14 | .d3-slider-range { 15 | background:#2980b9; 16 | left:0px; 17 | right:0px; 18 | height: 0.8em; 19 | position: absolute; 20 | } 21 | 22 | .d3-slider-range-vertical { 23 | background:#2980b9; 24 | left:0px; 25 | right:0px; 26 | position: absolute; 27 | top:0; 28 | } 29 | 30 | .d3-slider-vertical { 31 | width: .8em; 32 | height: 100px; 33 | } 34 | 35 | .d3-slider-handle { 36 | position: absolute; 37 | width: 1.2em; 38 | height: 1.2em; 39 | border: 1px solid #d3d3d3; 40 | border-radius: 4px; 41 | background: #eee; 42 | background: linear-gradient(to bottom, #eee 0%, #ddd 100%); 43 | z-index: 3; 44 | } 45 | 46 | .d3-slider-handle:hover { 47 | border: 1px solid #999999; 48 | } 49 | 50 | .d3-slider-horizontal .d3-slider-handle { 51 | top: -.3em; 52 | margin-left: -.6em; 53 | } 54 | 55 | .d3-slider-axis { 56 | position: relative; 57 | z-index: 1; 58 | height: 20px; 59 | } 60 | 61 | .d3-slider-axis-bottom { 62 | top: .8em; 63 | } 64 | 65 | .d3-slider-axis-right { 66 | left: .8em; 67 | } 68 | 69 | .d3-slider-axis path { 70 | stroke-width: 0; 71 | fill: none; 72 | } 73 | 74 | .d3-slider-axis line { 75 | fill: none; 76 | stroke: #aaa; 77 | shape-rendering: crispEdges; 78 | } 79 | 80 | .d3-slider-axis text { 81 | font-size: 11px; 82 | } 83 | 84 | .d3-slider-vertical .d3-slider-handle { 85 | left: -.25em; 86 | margin-left: 0; 87 | margin-bottom: -.6em; 88 | } 89 | -------------------------------------------------------------------------------- /seeds_generator/download.py: -------------------------------------------------------------------------------- 1 | import urllib2 2 | import sys 3 | from os import environ 4 | 5 | from subprocess import Popen, PIPE, STDOUT 6 | 7 | def encode( url): 8 | return urllib2.quote(url).replace("/", "%2F") 9 | 10 | def decode( url): 11 | return urllib2.unquote(url).replace("%2F", "/") 12 | 13 | def validate_url( url): 14 | s = url[:4] 15 | if s == "http": 16 | return url 17 | else: 18 | url = "http://" + url 19 | return url 20 | 21 | def get_downloaded_urls(inputfile): 22 | urls = [] 23 | with open(inputfile, 'r') as f: 24 | urls = f.readlines() 25 | urls = [url.strip() for url in urls] 26 | return urls 27 | 28 | def download(inputfile, es_index = "memex", es_doc_type = "page", es_host="http://localhost"): 29 | parts = es_host.split(':') 30 | if len(parts) == 2: 31 | es_host = parts[0] 32 | elif len(parts) == 3: 33 | es_host = parts[1] 34 | 35 | es_host = es_host.strip('/') 36 | 37 | print es_host 38 | 39 | query = "" 40 | with open('conf/queries.txt', 'r') as f: 41 | for line in f: 42 | query = line.strip() 43 | 44 | comm = "java -cp target/seeds_generator-1.0-SNAPSHOT-jar-with-dependencies.jar Download " \ 45 | + inputfile + ' "' + query +'" ' + es_index + " " + es_doc_type + " " + es_host 46 | 47 | print comm 48 | 49 | p=Popen(comm, shell=True, stdin=PIPE, stdout=PIPE, stderr=STDOUT) 50 | # output, errors = p.communicate() 51 | # print output 52 | # if not (errors == None): 53 | # print '*' * 80, '\n\n\n' 54 | # print errors 55 | 56 | def main(argv): 57 | if len(argv) != 1: 58 | print "Invalid arguments" 59 | print "python download.py inputfile" 60 | return 61 | inputfile=argv[0] 62 | 63 | download(inputfile) 64 | 65 | if __name__=="__main__": 66 | main(sys.argv[1:]) 67 | 
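--------------------------------------------------------------------------------
A minimal sketch of driving download() from another script (not part of the repo). It assumes the downloader jar has already been built with `mvn package`, that conf/queries.txt exists, and that the interpreter runs from the seeds_generator directory, since the jar and conf paths above are relative; 'seed_urls.txt' is a hypothetical file with one URL per line.

    from download import download

    # download() reduces the host string to a bare hostname,
    # so a full URL is accepted here
    download('seed_urls.txt', es_index='memex', es_doc_type='page',
             es_host='http://localhost:9200')

-------------------------------------------------------------------------------- /elastic/aggregations.py: 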
-------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | from os import environ 3 | from config import es as default_es 4 | 5 | def get_significant_terms(ids, termCount = 50, mapping=None, es_index='memex', es_doc_type='page', es=None): 6 | if es is None: 7 | es = default_es 8 | 9 | with open(environ['DDT_HOME']+'/elastic/stopwords.txt', 'r') as f: 10 | stopwords = [word.strip() for word in f.readlines()] 11 | 12 | query = { 13 | "query":{ 14 | "ids": { 15 | "values": ids 16 | } 17 | }, 18 | "aggregations" : { 19 | "significantTerms" : { 20 | "significant_terms" : { 21 | "field" : mapping["text"], 22 | "size" : termCount, 23 | "exclude": stopwords 24 | } 25 | }, 26 | }, 27 | "size": 0 28 | } 29 | 30 | res = es.search(body=query, index=es_index, doc_type=es_doc_type, timeout=30) 31 | 32 | return [item['key'] for item in res['aggregations']['significantTerms']['buckets'] if len(item['key']) > 2] 33 | 34 | # This returns the unique values of the field and the number of documents associated with that unique value 35 | def get_unique_values(field, size, es_index='memex', es_doc_type='page', es=None): 36 | if es is None: 37 | es = default_es 38 | 39 | 40 | query = { 41 | "size": 0, 42 | "aggs" : { 43 | "unique_values" : { 44 | "terms" : { "field" : field, 45 | "size": size} 46 | 47 | } 48 | } 49 | } 50 | res = es.search(body=query, index=es_index, doc_type=es_doc_type, timeout=30) 51 | 52 | return {item['key']:item['doc_count'] for item in res['aggregations']['unique_values']['buckets']} 53 | 54 | -------------------------------------------------------------------------------- /seeds_generator/src/main/java/page_downloader/Extract.java: -------------------------------------------------------------------------------- 1 | import java.io.*; 2 | import de.l3s.boilerpipe.extractors.KeepEverythingExtractor; 3 | import de.l3s.boilerpipe.sax.BoilerpipeSAXInput; 4 | import de.l3s.boilerpipe.sax.HTMLDocument; 5 | import de.l3s.boilerpipe.document.TextDocument; 6 | import de.l3s.boilerpipe.document.TextBlock; 7 | import java.net.URL; 8 | import java.util.*; 9 | import java.util.HashMap; 10 | import java.lang.String; 11 | import java.net.URLDecoder; 12 | import java.io.PrintWriter; 13 | 14 | public class Extract { 15 | public Map process(String html) 16 | { 17 | try{ 18 | HashMap map = new HashMap(); 19 | String content = ""; 20 | if(!html.contains("@empty@")){ 21 | content = KeepEverythingExtractor.INSTANCE.getText(html); 22 | } 23 | content = content.trim().replaceAll(" +", " "); 24 | content = content.replaceAll("[\n\"\t]", " "); 25 | content = content.replaceAll(",",""); 26 | content = content.toLowerCase(); 27 | 28 | map.put("content", content); 29 | 30 | HTMLDocument htmlDoc = new HTMLDocument(html); 31 | TextDocument doc = new BoilerpipeSAXInput(htmlDoc.toInputSource()).getTextDocument(); 32 | String title = doc.getTitle(); 33 | map.put("title", title); 34 | 35 | return map; 36 | } 37 | catch(Exception e){ 38 | System.err.println("process Exception" + e.getMessage()); 39 | } 40 | 41 | return null; 42 | } 43 | 44 | public static void main(String[] args) { 45 | Extract e = new Extract(); 46 | 47 | try{ 48 | BufferedReader br = 49 | new BufferedReader(new InputStreamReader(System.in)); 50 | 51 | String html = ""; 52 | String input; 53 | 54 | while((input=br.readLine())!=null){ 55 | html += input; 56 | } 57 | 58 | e.process(html); 59 | 60 | }catch(IOException io){ 61 | io.printStackTrace(); 62 | } 63 | } 64 | } 65 | 
-------------------------------------------------------------------------------- /elastic/mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "page": { 3 | "_timestamp": { 4 | "enabled": true, 5 | "store": true 6 | }, 7 | "_id": { 8 | "path": "url" 9 | }, 10 | "properties": { 11 | "url": { 12 | "type": "string", 13 | "index": "not_analyzed" 14 | }, 15 | "text": { 16 | "type": "string", 17 | "term_vector": "yes" 18 | }, 19 | "html": { 20 | "type": "string", 21 | "index": "no" 22 | }, 23 | "query": { 24 | "type": "string", 25 | "index": "not_analyzed" 26 | }, 27 | "retrieved": { 28 | "type": "date" 29 | }, 30 | "last_modified": { 31 | "type": "date" 32 | }, 33 | "length": { 34 | "type": "integer" 35 | }, 36 | "md5": { 37 | "type": "binary" 38 | }, 39 | "redirect": { 40 | "type": "string", 41 | "index": "not_analyzed" 42 | }, 43 | "relevance": { 44 | "type": "float" 45 | }, 46 | "thumbnail_name": { 47 | "type": "string" 48 | }, 49 | "thumbnail": { 50 | "type": "binary" 51 | }, 52 | "tag": { 53 | "type": "string", 54 | "index": "not_analyzed" 55 | }, 56 | "class": { 57 | "type": "string" 58 | }, 59 | "doc_name": { 60 | "type": "string", 61 | "index": "not_analyzed" 62 | }, 63 | "doc_distance": { 64 | "type": "float" 65 | }, 66 | "topic_name": { 67 | "type": "string", 68 | "index": "not_analyzed" 69 | }, 70 | "x": { 71 | "type": "float" 72 | }, 73 | "y": { 74 | "type": "float" 75 | }, 76 | "topic_weight": { 77 | "type": "float" 78 | }, 79 | "crawled_backward": { 80 | "type": "float" 81 | }, 82 | "crawled_forward": { 83 | "type": "float" 84 | } 85 | } 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /seeds_generator/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | page_downloader 5 | seeds_generator 6 | jar 7 | 1.0-SNAPSHOT 8 | seeds_generator 9 | http://maven.apache.org 10 | 11 | 12 | org.json 13 | json 14 | 20140107 15 | 16 | 17 | junit 18 | junit 19 | 3.8.1 20 | test 21 | 22 | 23 | org.apache.httpcomponents 24 | httpclient 25 | 4.4.1 26 | 27 | 28 | com.robbypond 29 | boilerpipe 30 | 1.2.3 31 | 32 | 33 | xerces 34 | xerces 35 | 2.4.0 36 | 37 | 38 | net.sourceforge.nekohtml 39 | nekohtml 40 | 1.9.13 41 | 42 | 43 | org.elasticsearch 44 | elasticsearch 45 | 1.5.2 46 | 47 | 48 | 49 | 50 | 51 | maven-assembly-plugin 52 | 2.5.4 53 | 54 | 55 | jar-with-dependencies 56 | 57 | 58 | 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /seeds_generator/src/main/java/page_downloader/Download.java: -------------------------------------------------------------------------------- 1 | import java.util.concurrent.Executors; 2 | import java.util.concurrent.ExecutorService; 3 | import java.io.FileReader; 4 | import java.io.BufferedReader; 5 | import java.io.IOException; 6 | import java.util.concurrent.TimeUnit; 7 | import org.elasticsearch.client.transport.TransportClient; 8 | import org.elasticsearch.common.transport.InetSocketTransportAddress; 9 | import org.elasticsearch.client.Client; 10 | 11 | public class Download { 12 | 13 | private String query = ""; 14 | private String es_index = "memex"; 15 | private String es_doc_type = "page"; 16 | private Client client = null; 17 | private int poolSize = 100; 18 | private ExecutorService downloaderService = Executors.newFixedThreadPool(poolSize); 19 | 20 | public Download(String query, String es_index, String es_doc_type, String es_host){ 21 | this.query = 
query; 22 | if(es_host.isEmpty()) 23 | es_host = "localhost"; 24 | else { 25 | String[] parts = es_host.split(":"); 26 | if (parts.length == 2) 27 | es_host = parts[0]; 28 | else if(parts.length == 3) 29 | es_host = parts[1]; 30 | 31 | es_host = es_host.replaceAll("/",""); 32 | } 33 | 34 | this.client = new TransportClient().addTransportAddress(new InetSocketTransportAddress(es_host, 9300)); 35 | 36 | if(!es_index.isEmpty()) 37 | this.es_index = es_index; 38 | if(!es_doc_type.isEmpty()) 39 | this.es_doc_type = es_doc_type; 40 | } 41 | 42 | public void setQuery(String query){ 43 | this.query = query; 44 | } 45 | 46 | public void addTask(String url){ 47 | downloaderService.execute(new Download_URL(url.trim(), this.query, this.es_index, this.es_doc_type, this.client)); 48 | } 49 | 50 | public void shutdown(){ 51 | try { 52 | downloaderService.shutdown(); 53 | //downloaderService.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS); 54 | downloaderService.awaitTermination(60 , TimeUnit.SECONDS); 55 | this.client.close(); 56 | } catch (InterruptedException e) { 57 | e.printStackTrace(); 58 | } 59 | } 60 | 61 | 62 | } 63 | -------------------------------------------------------------------------------- /online_classifier/online_classifier.py: -------------------------------------------------------------------------------- 1 | from sklearn import linear_model 2 | from sklearn.calibration import CalibratedClassifierCV 3 | 4 | from tfidf_vector import tfidf_vectorizer 5 | 6 | import numpy as np 7 | 8 | class OnlineClassifier: 9 | 10 | def __init__(self, max_features=10000): 11 | self.clf = None 12 | self.tfidf_vector = tfidf_vectorizer(convert_to_ascii=True, max_features = max_features) 13 | 14 | def vectorize(self, train, test=[]): 15 | [X_train, _, _] = self.tfidf_vector.tfidf(train) 16 | 17 | X_test = None 18 | if test: 19 | [X_test, _, _] = self.tfidf_vector.tfidf(test) 20 | 21 | return [X_train, X_test] 22 | 23 | def fit(self, X, Y): 24 | clf = linear_model.SGDClassifier(n_iter=1) 25 | try: 26 | clf.fit(X, Y) 27 | except ValueError as verr: 28 | print("Value error: {0}".format(verr)) 29 | return None 30 | self.clf = clf 31 | return clf 32 | 33 | def partialFit(self, X, Y): 34 | if self.clf is None: 35 | self.fit(X, Y) 36 | else: 37 | self.clf.partial_fit(X,Y) 38 | return self.clf 39 | 40 | def calibrate(self, X, Y): 41 | if self.clf != None: 42 | sigmoid = CalibratedClassifierCV(self.clf, cv=2, method='sigmoid') 43 | sigmoid.fit(X,Y) 44 | return sigmoid 45 | else: 46 | return None 47 | 48 | def calibrateScore(self, sigmoid, X, Y): 49 | return sigmoid.score(X,Y) 50 | 51 | def predictClass(self, X, sigmoid): 52 | return [self.clf.predict(X), sigmoid.predict(X), np.multiply(sigmoid.predict_proba(X),100)] 53 | 54 | def classify(self, train, train_labels, test, test_labels, partial=False): 55 | [X_train, X_test] = self.vectorize(train, test) 56 | if partial: 57 | clf = self.partialFit(X_train, train_labels) 58 | else: 59 | clf = self.fit(X_train, train_labels) 60 | # calibrate() and predictClass() use self.clf, set by fit()/partialFit() 61 | sigmoid = self.calibrate(X_train, train_labels) 62 | return self.predictClass(X_test, sigmoid) 63 | 64 | 65 | -------------------------------------------------------------------------------- /elastic/get_config.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from config import es as default_es 3 | 4 | def get_available_domains(es=None): 5 | if es is None: 6 | es = default_es 7 | 8 | query = { 9 | "query": { 10 | "match_all": {} 11 | }, 12 | } 13
| res = es.search(body=query, 14 | index='config', 15 | doc_type='domains', 16 | size=100 17 | ) 18 | 19 | hits = res['hits']['hits'] 20 | 21 | result = {} 22 | for hit in hits: 23 | result[hit['_id']] = hit['_source'] 24 | result[hit['_id']]['timestamp'] = long(convert_to_epoch(datetime.strptime(result[hit['_id']]['timestamp'], '%Y-%m-%dT%H:%M:%S.%f'))) 25 | 26 | return result 27 | 28 | def get_mapping(es=None): 29 | if es is None: 30 | es = default_es 31 | 32 | query = { 33 | "query": { 34 | "match_all": {} 35 | }, 36 | } 37 | res = es.search(body=query, 38 | index='config', 39 | doc_type='mapping', 40 | size=100 41 | ) 42 | 43 | hits = res['hits']['hits'] 44 | 45 | res = {} 46 | for hit in hits: 47 | res[hit['_source']['field']] = hit['_source']['value'] 48 | 49 | return res 50 | 51 | def get_tag_colors(es=None): 52 | if es is None: 53 | es = default_es 54 | 55 | query = { 56 | "query": { 57 | "match_all": {} 58 | } 59 | } 60 | res = es.search(body=query, 61 | index='config', 62 | doc_type='tag_colors', 63 | size=100 64 | ) 65 | 66 | hits = res['hits']['hits'] 67 | 68 | res = {} 69 | for hit in hits: 70 | res[hit['_id']] = {'index': hit['_source']['index']} 71 | res[hit['_id']]['colors'] = hit['_source']['colors'] 72 | 73 | return res 74 | 75 | 76 | def convert_to_epoch(dt): 77 | epoch = datetime.utcfromtimestamp(0) 78 | delta = dt - epoch 79 | return delta.total_seconds() 80 | 81 | if __name__ == "__main__": 82 | get_available_domains() 83 | 84 | 85 | -------------------------------------------------------------------------------- /ranking/tfidf.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from elastic.get_mtermvectors import getTermStatistics 3 | 4 | class tfidf: 5 | def __init__(self, opt_docs = None, rm_stopwords=True, rm_numbers=True, pos_tags=[], term_freq=1, mapping=None, es_index = 'memex', es_doc_type = 'page', es = None): 6 | self.documents = opt_docs 7 | self.corpus = None 8 | self.tfidfArray = None 9 | self.tfArray = None 10 | self.ttf = None 11 | self.mapping = mapping 12 | self.rm_stopwords = rm_stopwords 13 | self.rm_numbers = rm_numbers 14 | self.pos_tags = pos_tags 15 | self.es_index = es_index 16 | self.es_doc_type = es_doc_type 17 | self.es = es 18 | self.term_freq = term_freq 19 | 20 | if opt_docs != None: 21 | self.process(opt_docs) 22 | 23 | def getTopTerms(self,top): 24 | N = len(self.documents) 25 | avg = np.divide(np.sum(self.tfidfArray, axis=0), N) 26 | sortedAvgIndices = np.argsort(avg)[::-1] 27 | return [self.corpus[i] for i in sortedAvgIndices[0:top]] 28 | 29 | def getIndex(self, terms): 30 | index = [] 31 | for term in terms: 32 | if term.strip() in self.corpus: 33 | index.append(self.corpus.index(term.strip())) 34 | return index 35 | 36 | def getTfidfArray(self): 37 | return [self.documents, self.corpus, self.tfidfArray] 38 | 39 | def getTfArray(self): 40 | return [self.documents, self.corpus, self.tfArray] 41 | 42 | def getTtf(self): 43 | return self.ttf 44 | 45 | def getURLs(self, args): 46 | return self.documents 47 | 48 | def getTerms(self, indices): 49 | return [self.corpus[x] for x in indices] 50 | 51 | def process(self, documents): 52 | [data_tfidf, data_tf, data_ttf, corpus, urls] = getTermStatistics(documents, self.rm_stopwords, self.rm_numbers, self.pos_tags, self.term_freq, mapping=self.mapping, es_index=self.es_index, es_doc_type=self.es_doc_type, es=self.es) 53 | self.tfidfArray = data_tfidf 54 | self.tfArray = data_tf 55 | self.ttf = data_ttf 56 | self.corpus = corpus 57 | 
self.documents = urls 58 | -------------------------------------------------------------------------------- /elastic/create_index.py: -------------------------------------------------------------------------------- 1 | from os import environ 2 | import json 3 | 4 | from config import es as default_es 5 | 6 | def create_index(es_index='memex', mapping=environ['DDT_HOME']+'/elastic/mapping.json', es=None): 7 | if es is None: 8 | es = default_es 9 | 10 | json_page_data=open(mapping).read() 11 | 12 | page_mappings = json.loads(json_page_data) 13 | 14 | doctypes = {} 15 | for doc_type in page_mappings.keys(): 16 | doctypes[doc_type] = page_mappings[doc_type] 17 | 18 | mappings = { 19 | "mappings": doctypes 20 | } 21 | 22 | fields = es_index.lower().split(' ') 23 | es_index = '_'.join([item for item in fields if item not in '']) 24 | 25 | res = es.indices.create(index=es_index, body=mappings, ignore=400) 26 | 27 | es.indices.refresh(es_index) 28 | 29 | return res 30 | 31 | def create_terms_index(es_index='ddt_terms', es=None): 32 | if es is None: 33 | es = default_es 34 | 35 | json_terms_data=open(environ['DDT_HOME']+'/elastic/mapping_terms.json').read() 36 | 37 | terms_mappings = json.loads(json_terms_data) 38 | 39 | mappings = {"mappings": 40 | { 41 | "terms":terms_mappings["terms"] 42 | } 43 | } 44 | 45 | fields = es_index.lower().split(' ') 46 | es_index = '_'.join([item for item in fields if item not in '']) 47 | 48 | res = es.indices.create(index=es_index, body=mappings, ignore=400) 49 | 50 | es.indices.refresh(es_index) 51 | 52 | return res 53 | 54 | def create_config_index(es_index='config', es=None): 55 | if es is None: 56 | es = default_es 57 | 58 | json_config_data=open(environ['DDT_HOME']+'/elastic/config.json').read() 59 | 60 | config_mappings = json.loads(json_config_data) 61 | 62 | mappings = {"mappings": 63 | { 64 | "domains": config_mappings["domains"] 65 | } 66 | } 67 | 68 | fields = es_index.lower().split(' ') 69 | 70 | es_index = '_'.join([item for item in fields if item not in '']) 71 | 72 | res = es.indices.create(index=es_index, body=mappings, ignore=400) 73 | 74 | return res 75 | 76 | 77 | -------------------------------------------------------------------------------- /ranking/rank.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import tfidf 4 | import BayesianSets 5 | import numpy as np 6 | 7 | class rank: 8 | def results(self,table,query_urls, other_urls): 9 | 10 | [urls, corpus, data] = table.getTfidfArray() 11 | 12 | #Normalise the data 13 | col_sum_d = np.sum(data,axis=0) 14 | norm_d = np.divide(data, col_sum_d) 15 | 16 | indices = [urls.index(url) for url in query_urls] 17 | subquery_data = norm_d[indices, :] 18 | 19 | indices = [urls.index(url) for url in other_urls] 20 | other_data = norm_d[indices, :] 21 | 22 | # Check if any of the features are not present in any 23 | # of the query set documents 24 | check_for_zero = np.sum(subquery_data, axis=0) 25 | zero_indices = np.where(check_for_zero == 0)[0] 26 | 27 | if(len(zero_indices) > 0): 28 | # If features not present in query set documents 29 | # then remove them 30 | old_corpus = corpus 31 | corpus = [] 32 | [corpus.append(old_corpus[i]) for i in range(0,len(old_corpus)) if i not in zero_indices] 33 | 34 | subquery_data = np.delete(subquery_data, zero_indices, 1) 35 | other_data = np.delete(other_data, zero_indices, 1) 36 | 37 | bs = BayesianSets.BayesianSets() 38 | 39 | score = bs.score(subquery_data, other_data) 40 | 41 | indices = 
np.argsort(np.multiply(score,-1))
 42 | ranked_urls = [other_urls[index] for index in indices]
 43 | ranked_scores = [score[index] for index in indices]
 44 | return [ranked_urls,ranked_scores]
 45 | 
 46 | def main(argv):
 47 | if len(argv) != 2:
 48 | print "Invalid arguments"
 49 | print "python rank.py inputfile 0,1,2"
 50 | return
 51 | 
 52 | # File containing information of documents
 53 | input_file = argv[0]
 54 | # Most relevant documents
 55 | query_index = [int(i) for i in argv[1].split(',')]
 56 | ranker = rank()
 57 | [ranked_urls,scores] = ranker.results(input_file,query_index)  # results() expects (tfidf table, query_urls, other_urls); see above
 58 | 
 59 | for i in range(0,len(ranked_urls)):
 60 | print ranked_urls[i]," ", str(scores[i])
 61 | 
 62 | if __name__=="__main__":
 63 | main(sys.argv[1:])
 64 | 
-------------------------------------------------------------------------------- /elastic/README.md: --------------------------------------------------------------------------------
 1 | # ElasticSearch utility for MEMEX (Experimental)
 2 | 
 3 | Jean-Daniel Fekete, March 10th, 2015
 4 | Yamuna Krishnamurthy
 5 | 
 6 | Using ElasticSearch requires its installation first. Go to:
 7 | https://www.elastic.co/downloads/elasticsearch, get the version that fits your system. Install it and start the server.
 8 | It should work on port 9200 on localhost. The installed version should be higher than 1.4 to provide some of the features we need.
 9 | 
 10 | To debug and see the contents of the data in ElasticSearch, install the "Head" plugin:
 11 | ```
 12 | sudo elasticsearch/bin/plugin -install mobz/elasticsearch-head
 13 | ```
 14 | Then look at the contents of ElasticSearch by opening the url: http://localhost:9200/_plugin/head/
 15 | 
 16 | Also, install python >= 2.7.9, not python3.
 17 | 
 18 | Then, you can populate the database with html documents.
 19 | 
 20 | This directory contains python scripts for various operations with elasticsearch:
 21 | 
 22 | ## Methods for creating an index
 23 | 
 24 | ```
 25 | create_index.py
 26 | ```
 27 | 
 28 | ## Methods for adding and updating documents
 29 | ```
 30 | add_documents.py
 31 | ```
 32 | ## Methods to search documents
 33 | 
 34 | ```
 35 | search_documents.py
 36 | ```
 37 | ## Getting the term vectors
 38 | 
 39 | To perform its search, ElasticSearch maintains term vectors and computes TF/IDF on them. The information can be retrieved with the sample script:
 40 | ```
 41 | get_term_vectors.py
 42 | ```
 43 | ## Methods to get specific documents
 44 | 
 45 | ```
 46 | get_documents.py
 47 | ```
 48 | ## Methods to do aggregations
 49 | 
 50 | ```
 51 | aggregations.py
 52 | ```
 53 | 
 54 | ## Methods for deleting an index
 55 | 
 56 | ```
 57 | delete.py
 58 | ```
 59 | 
 60 | The shell scripts in the scripts directory can be used as follows for testing ElasticSearch:
 61 | 
 62 | ## Creating the ElasticSearch Index
 63 | 
 64 | A database is called an index in ElasticSearch. To create it, use the script `create_index.sh`:
 65 | ```
 66 | ./create_index.sh
 67 | ```
 68 | 
 69 | Then, a schema should be defined. An ElasticSearch schema is called a "mapping"; see `mapping.json` for an example. You can install it with the script:
 70 | ```
 71 | ./put_mapping.sh
 72 | ```
 73 | 
-------------------------------------------------------------------------------- /vis/html/js/topicvis.js: --------------------------------------------------------------------------------
 1 | (function(exports){
 2 | 
 3 | /**
 4 | * String to be used when grabbing the settings form DOM element.
 5 | */
 6 | var form = "#topicvis_settings_form";
 7 | var MIN_TOPICS = 2;
 8 | var MAX_TOPICS = 20;
 9 | 
 10 | 
 11 | /**
 12 | * Default settings for the topik visualizations.
 13 | */
 14 | exports.visSettings = {
 15 | tokenizer: "simple",
 16 | vectorizer: "bag_of_words",
 17 | model: "plsa",
 18 | ntopics: 2,
 19 | visualizer: "",
 20 | session: "",
 21 | };
 22 | 
 23 | 
 24 | /**
 25 | * Convert the values in the form to simple key-value pairs, in which the key
 26 | * is the html name of the input and the value is the value of the input.
 27 | */
 28 | exports.formToObject = function(form){
 29 | var objects = {};
 30 | var formData = $(form).serializeArray();
 31 | for(var i = 0; i < formData.length; i++){
 32 | objects[formData[i]["name"]] = formData[i]["value"]
 33 | }
 34 | if((objects.ntopics > MAX_TOPICS) || (objects.ntopics < MIN_TOPICS)){
 35 | $("#error_ntopics").css("display", "inline");
 36 | throw "ntopics must be a number between " + MIN_TOPICS + " and " + MAX_TOPICS + ".";
 37 | } else {
 38 | $("#error_ntopics").css("display", "none");
 39 | return objects;
 40 | }
 41 | }
 42 | 
 43 | 
 44 | /**
 45 | * Update visSettings with the new settings using jQuery.extend
 46 | */
 47 | exports.updateSettings = function(){
 48 | $.extend(true, exports.visSettings, exports.formToObject(form));
 49 | }
 50 | 
 51 | 
 52 | /**
 53 | * When either button is clicked, use the context dependent "this" to
 54 | * grab the value of the clicked button and update visSettings, then change
 55 | * the href on the link button to contain the vis settings parsed as URL
 56 | * parameters.
 57 | */
 58 | $("#ldavisPlot, #termitePlot").on("click", function(){
 59 | exports.visSettings.visualizer = $(this).attr("value");
 60 | exports.visSettings.session = JSON.stringify(exports.vis.sessionInfo());
 61 | var url = "/topicvis?" + $.param(exports.visSettings);
 62 | $(this).attr("href", url);
 63 | });
 64 | 
 65 | 
 66 | /**
 67 | * When the save button is clicked, update visSettings with the new values
 68 | * from the form.
 69 | */
 70 | $("#save_topicvis_settings").on("click", function(){
 71 | exports.updateSettings();
 72 | $("#topicVisSettingsModal").modal("hide");
 73 | });
 74 | 
 75 | })(this.TopicVis = {});
 76 | 
-------------------------------------------------------------------------------- /vis/html/js/utils.js: --------------------------------------------------------------------------------
 1 | /**
 2 | * @fileoverview Contains commonly used functions throughout the
 3 | * application.
 4 | * 
 5 | * @author (cesarpalomo@gmail.com) Cesar Palomo
 6 | */
 7 | var Utils = (function() {
 8 | var tooltipDiv = undefined;
 9 | var pressedKey = undefined;
 10 | 
 11 | // Creates a div for tooltip content.
 12 | var maybeCreateTooltip = function() {
 13 | if (tooltipDiv === undefined) {
 14 | tooltipDiv = d3.select('body')
 15 | .append('div')
 16 | .classed('tooltip', true)
 17 | .style('opacity', 1e-6);
 18 | }
 19 | };
 20 | 
 21 | // Registers window to listen for pressed keys.
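 // Usage note: only the most recent keydown is remembered, so a single key is
 // tracked at a time. For example, Utils.isKeyPressed(16) (keyCode 16 is
 // Shift) lets callers test for shift-modified interactions.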
 22 | d3.select(window).on('keydown', function() {
 23 | pressedKey = d3.event.keyCode;
 24 | });
 25 | d3.select(window).on('keyup', function() {
 26 | pressedKey = undefined;
 27 | });
 28 | 
 29 | var pub = {};
 30 | pub.parseFullDate = function(epochInSeconds) {
 31 | return moment.unix(epochInSeconds).format('MMMM Do YYYY h:mm a');
 32 | };
 33 | pub.parseDateTime = function(epochInSeconds) {
 34 | return moment.unix(epochInSeconds).format('MM/DD/YY h:mm a');
 35 | };
 36 | pub.toUTC = function(date) {
 37 | return (date.getUTCMonth()+1) + '/' + date.getUTCDate() + '/' + date.getUTCFullYear() + ' ' + date.getUTCHours() + ':' + date.getUTCMinutes() + ' UTC';
 38 | };
 39 | 
 40 | pub.showTooltip = function() {
 41 | maybeCreateTooltip();
 42 | tooltipDiv.transition()
 43 | .duration(500)
 44 | .style('opacity', 1);
 45 | };
 46 | pub.hideTooltip = function() {
 47 | maybeCreateTooltip();
 48 | tooltipDiv.transition()
 49 | .duration(500)
 50 | .style('opacity', 1e-6);
 51 | };
 52 | pub.updateTooltip = function(text, opt_x, opt_y) {
 53 | maybeCreateTooltip();
 54 | var x = opt_x || d3.event.pageX + 10;
 55 | var y = opt_y || d3.event.pageY - 25;
 56 | tooltipDiv
 57 | .text(text)
 58 | .style('left', x + 'px')
 59 | .style('top', y + 'px');
 60 | };
 61 | pub.setWaitCursorEnabled = function(enabled) {
 62 | d3.select('#mask')
 63 | .style('display', enabled ? 'block' : 'none')
 64 | .style('cursor', enabled ? 'wait' : 'pointer');
 65 | };
 66 | pub.getRandomInt = function(min, max) {
 67 | return Math.floor(Math.random() * (max - min)) + min;
 68 | };
 69 | pub.openInNewTab = function(url) {
 70 | var win = window.open(url, '_blank');
 71 | win.focus();
 72 | };
 73 | pub.isKeyPressed = function(key) {
 74 | return pressedKey === key;
 75 | };
 76 | return pub;
 77 | }());
 78 | 
-------------------------------------------------------------------------------- /Makefile: --------------------------------------------------------------------------------
 1 | # Makefile for Domain Discovery Tool development
 2 | # Type "make" or "make all" to build the complete development environment
 3 | # Type "make help" for a list of commands
 4 | 
 5 | # Phony targets and variables for the Makefile
 6 | .PHONY: all help conda_env downloader_app cherrypy_config get_nltk_data link_word2vec_data
 7 | SHELL := /bin/bash
 8 | CONDA_ROOT := $(shell conda info --root)
 9 | CONDA_ENV := $(CONDA_ROOT)/envs/ddt
 10 | 
 11 | CONDA_ENV_TARGET := $(CONDA_ENV)/conda-meta/history
 12 | DOWNLOADER_APP_TARGET := seeds_generator/target/seeds_generator-1.0-SNAPSHOT-jar-with-dependencies.jar
 13 | CHERRY_PY_CONFIG_TARGET := vis/config.conf
 14 | GET_NLTK_DATA_TARGET := nltk_data/corpora nltk_data/tokenizers
 15 | LINK_WORD2VEC_DATA_TARGET := ranking/D_cbow_pdw_8B.pkl
 16 | 
 17 | # Makefile commands, see below for actual builds
 18 | 
 19 | ## all : set up DDT development environment
 20 | all: conda_env downloader_app cherrypy_config get_nltk_data link_word2vec_data
 21 | 
 22 | ## help : show all commands.
 23 | # Note the double '##' in the line above: this is what's matched to produce
 24 | # the list of commands.
 25 | help : Makefile
 26 | @sed -n 's/^## //p' $<
 27 | 
 28 | ## conda_env : Install/update a conda environment with needed packages
 29 | conda_env: $(CONDA_ENV_TARGET)
 30 | 
 31 | ## downloader_app : Build the Java-based downloader application
 32 | downloader_app: $(DOWNLOADER_APP_TARGET)
 33 | 
 34 | ## cherrypy_config : Configure CherryPy (set absolute root environment)
 35 | cherrypy_config: $(CHERRY_PY_CONFIG_TARGET)
 36 | 
 37 | ## get_nltk_data : Download NLTK corpus and tokenizers
 38 | get_nltk_data: $(GET_NLTK_DATA_TARGET)
 39 | 
 40 | ## link_word2vec_data : Hardlink the word2vec data from the conda environment
 41 | link_word2vec_data: $(LINK_WORD2VEC_DATA_TARGET)
 42 | 
 43 | # Actual Target work here
 44 | 
 45 | $(CONDA_ENV_TARGET): environment.yml
 46 | conda env update
 47 | 
 48 | $(DOWNLOADER_APP_TARGET): $(CONDA_ENV_TARGET) seeds_generator/pom.xml $(wildcard seeds_generator/src/main/java/page_downloader/*.java)
 49 | source activate ddt; \
 50 | pushd seeds_generator; \
 51 | mvn compile assembly:single; \
 52 | popd
 53 | 
 54 | $(CHERRY_PY_CONFIG_TARGET): vis/config.conf-in
 55 | sed "s#tools.staticdir.root = .#tools.staticdir.root = ${PWD}/vis/html#g" vis/config.conf-in > vis/config.conf
 56 | 
 57 | $(GET_NLTK_DATA_TARGET): $(CONDA_ENV)
 58 | source activate ddt; \
 59 | python -m nltk.downloader -d ${PWD}/nltk_data stopwords brown punkt averaged_perceptron_tagger
 60 | 
 61 | $(LINK_WORD2VEC_DATA_TARGET): $(CONDA_ENV)/data/D_cbow_pdw_8B.pkl
 62 | ln $(CONDA_ENV)/data/D_cbow_pdw_8B.pkl ${PWD}/ranking
 63 | 
 64 | 
-------------------------------------------------------------------------------- /elastic/add_documents.py: --------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import sys
 4 | 
 5 | from config import es as default_es
 6 | 
 7 | from elasticsearch import helpers
 8 | 
 9 | def add_document(entries, es_index='memex', es_doc_type='page', es=None):
 10 | if es is None:
 11 | es = default_es
 12 | 
 13 | es_entries = []
 14 | for doc in entries:
 15 | entry = {"_index": es_index,
 16 | "_type": es_doc_type,
 17 | "_source": {k: v for k, v in doc.items() if k not in ['_id']} }
 18 | 
 19 | if '_id' in doc.keys():
 20 | entry['_id'] = doc['_id']
 21 | 
 22 | es_entries.append(entry)
 23 | 
 24 | helpers.bulk(es, es_entries, refresh=True)
 25 | 
 26 | def update_document(update_entries, es_index='memex', es_doc_type='page', es=None):
 27 | if es is None:
 28 | es = default_es
 29 | 
 30 | helpers.bulk(es, [{"_op_type": "update",
 31 | "_index": es_index,
 32 | "_type": es_doc_type,
 33 | "doc": value,
 34 | "doc_as_upsert": True,
 35 | "_id": key} for key, value in update_entries.items()], refresh=True, request_timeout=600)
 36 | 
 37 | def delete_document(delete_entries, es_index='memex', es_doc_type='page', es=None):
 38 | if es is None:
 39 | es = default_es
 40 | 
 41 | helpers.bulk(es, [{"_op_type": "delete",
 42 | "_index": es_index,
 43 | "_type": es_doc_type,
 44 | "_id": key} for key in delete_entries], refresh=True, request_timeout=600)
 45 | 
 46 | def refresh(es_index='memex', es_doc_type='page', es=None):
 47 | if es is None:
 48 | es = default_es
 49 | 
 50 | es.indices.refresh(index=es_index)
 51 | 
 52 | if __name__ == "__main__":
 53 | if len(sys.argv)>1:
 54 | inputfile = sys.argv[1]
 55 | urls = []
 56 | with open(inputfile, 'r') as f:
 57 | for line in f:
 58 | urls.append(line.strip())
 59 | else:
 60 | urls = [
 61 | 'http://en.wikipedia.org/wiki/Dark_internet',
 62 | 'http://www.dailymail.co.uk/.../article-3017888/...details-sold-dark-web.html',
 63 | 'http://en.wikipedia.org/wiki/Deep_Web',
 64 | 'http://www.rogerdavies.com/2011/06/dark-internet',
 65 | 'http://www.straightdope.com/.../read/3092/how-can-i-access-the-deep-dark-web'
 66 | ]
 67 | entries = []
 68 | for url in urls:
 69 | print 'Retrieving url %s' % url
 70 | # compute_index_entry is expected to be provided by the calling environment;
 71 | # it is not defined in this module.
 72 | e = compute_index_entry(url=url)
 73 | 
 74 | if e: entries.append(e)
 75 | 
 76 | if len(entries):
 77 | add_document(entries)
 78 | 
 79 | url = 'http://en.wikipedia.org/wiki/Dark_internet'
 80 | # update_document expects a dict keyed by document id
 81 | update_document({url: {'relevance': 1}})
 82 | 
-------------------------------------------------------------------------------- /seeds_generator/concat_nltk.py: --------------------------------------------------------------------------------
 1 | import sys
 2 | import os
 3 | import re
 4 | import nltk
 5 | import codecs
 6 | from elastic.get_documents import get_documents
 7 | 
 8 | ENGLISH_STOPWORDS = set(nltk.corpus.stopwords.words('english'))
 9 | NON_ENGLISH_STOPWORDS = set(nltk.corpus.stopwords.words()) - ENGLISH_STOPWORDS
 10 | 
 11 | STOPWORDS_DICT = {}
 12 | for lang in nltk.corpus.stopwords.fileids():
 13 | STOPWORDS_DICT[lang] = set(nltk.corpus.stopwords.words(lang))
 14 | 
 15 | def get_language(text):
 16 | words = set(nltk.wordpunct_tokenize(text.lower()))
 17 | return max(((lang, len(words & stopwords)) for lang, stopwords in STOPWORDS_DICT.items()), key = lambda x: x[1])[0]
 18 | 
 19 | 
 20 | def is_english(text):
 21 | text = text.lower()
 22 | words = set(nltk.wordpunct_tokenize(text))
 23 | return len(words & ENGLISH_STOPWORDS) > len(words & NON_ENGLISH_STOPWORDS)
 24 | 
 25 | def valid_words(text):
 26 | tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
 27 | words = tokenizer.tokenize(text)
 28 | filtered = [w for w in words if (not w.lower() in ENGLISH_STOPWORDS and len(w) > 2)]
 29 | return " ".join(filtered)
 30 | 
 31 | def process_text(content):
 32 | content = content.strip().replace(" +", " ");  # NOTE: str.replace is literal; " +" and "[\n\"\t]" are not treated as regexes
 33 | content = content.replace("[\n\"\t]", " ");
 34 | content = content.replace(",","");
 35 | content = content.lower();
 36 | return content
 37 | 
 38 | '''
 39 | KEY = re.compile("sex|woman|labor|slave|prostitution|organ|child|traffic|force")
 40 | def check_key_terms(content):
 41 | content = content.lower()
 42 | if KEY.search(content):
 43 | content = content.replace("\n", " ")
 44 | return content
 45 | else:
 46 | return ""
 47 | '''
 48 | 
 49 | def get_all_files(dirname):
 50 | print "Loading all filenames"
 51 | files = []
 52 | for [path, dirnames, filenames] in os.walk(dirname):
 53 | for filename in filenames:
 54 | files.append(path + "/" + filename)
 55 | print "Done loading files", len(files)
 56 | return files
 57 | 
 58 | def get_bag_of_words(urls):
 59 | docs = get_documents(urls)
 60 | bag_of_words = {}
 61 | for url in docs.keys():
 62 | bof = process_text(docs[url])
 63 | bof = valid_words(bof)
 64 | bag_of_words[url] = bof
 65 | return bag_of_words
 66 | 
 67 | def main(argv):
 68 | output = open(argv[0], "w")
 69 | len_count = 0  # Count of documents with fewer than 100 characters
 70 | count = 0
 71 | #for file in files:
 72 | for content in codecs.getreader("utf-8")(sys.stdin):
 73 | if (count % 1000) == 0:
 74 | print "all count:\t" + str(count) + "\tless-100 count:\t" + str(len_count)
 75 | count += 1
 76 | content = content.strip()
 77 | url, text = content.split("\t")
 78 | if not '@empty@' in text:
 79 | text = valid_words(text)
 80 | #if len(text) > 100:
 81 | # len_count += 1
 82 | output.write(url + ";" + text + "\n")
 83 | else:
 84 | output.write(url + ";\n")
 85 | output.close()
 86 | 
 87 | if __name__=="__main__":
 88 | main(sys.argv[1:])
 89 | 
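 90 | # Example usage (assuming stdin supplies one "url<TAB>text" pair per line,
 91 | # e.g. as produced by the page downloader; the file names are illustrative):
 92 | #   python concat_nltk.py bag_of_words.txt < url_text_pairs.tsv
 93 | # Each output line is "url;filtered_text", with stopwords, short tokens and
 94 | # '@empty@' documents handled as in main() above.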
-------------------------------------------------------------------------------- /ranking/extract_terms.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import tfidf 4 | import BayesianSets 5 | 6 | import numpy as np 7 | import scipy.sparse as sps 8 | 9 | class extract_terms: 10 | def __init__(self, tfidf): 11 | self.table = tfidf 12 | 13 | def getTopTerms(self,top): 14 | return self.table.getTopTerms(top) 15 | 16 | def results(self,query_terms): 17 | 18 | [urls, corpus, d] = self.table.getTfidfArray() 19 | 20 | if sps.issparse(d): 21 | d = d.toarray() 22 | 23 | query_index = self.getIndex(corpus, query_terms) 24 | 25 | #Normalise the data 26 | col_sum_d = np.sum(d, axis=0) 27 | norm_d = np.divide(d, col_sum_d) 28 | 29 | data = np.transpose(norm_d) 30 | 31 | # documents other than the relevant documents 32 | index = [x for x in range(0,len(data)) if x not in query_index] 33 | 34 | subquery_data = data[query_index,:] 35 | other_data = data[index,:] 36 | 37 | # Check if any of the features are not present in any 38 | # of the query set documents 39 | check_for_zero = np.sum(subquery_data, axis=0) 40 | zero_indices = np.where(check_for_zero == 0)[0] 41 | 42 | if(len(zero_indices) > 0): 43 | # If features not present in query set documents 44 | # then remove them 45 | subquery_data = np.delete(subquery_data, zero_indices, 1) 46 | other_data = np.delete(other_data, zero_indices, 1) 47 | 48 | bs = BayesianSets.BayesianSets() 49 | score = bs.score(subquery_data, other_data) 50 | 51 | rank_index = np.argsort(score)[::-1] 52 | 53 | offset_rank_index = [index[x] for x in rank_index] 54 | 55 | # Get the terms corresponding to the scored indices 56 | ranked_terms = self.table.getTerms(offset_rank_index) 57 | 58 | ranked_scores = [score[rank_index[i]] for i in range(0, len(score))] 59 | return [ranked_terms,ranked_scores] 60 | 61 | def getIndex(self, corpus, query_terms): 62 | indices = [] 63 | for term in query_terms: 64 | try: 65 | indices.append(corpus.index(term)) 66 | except ValueError: 67 | pass 68 | return indices 69 | 70 | 71 | def main(argv): 72 | if len(argv) != 2: 73 | print "Invalid arguments" 74 | print "python rank.py inputfile 0,1,2" 75 | return 76 | 77 | # File containing information of documents 78 | input_file = argv[0] 79 | # Most relevant documents 80 | query_index = [int(i) for i in argv[1].split(',')] 81 | ranker = extract_terms() 82 | [ranked_urls,scores] = ranker.results(input_file,query_index) 83 | 84 | for i in range(0,20): 85 | print ranked_urls[i]," ", str(scores[i]) 86 | 87 | if __name__=="__main__": 88 | main(sys.argv[1:]) 89 | -------------------------------------------------------------------------------- /vis/html/domains_dashboard.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 26 |
 27 | <div class="container">
 28 | {% if ((pages_script and pages_div) or (queries_script and queries_div)) %}
 29 | {% if ((pages_script and pages_div) and (endings_div and endings_script)) %}
 30 | <div class="row">
 31 | <div class="col-md-6"><h4>Page Statistics</h4>
 32 | {{ pages_script | safe }}
 33 | {{ pages_div | safe }}
 34 | </div>
 35 | <div class="col-md-6">
 36 | <h4>Endings Statistics</h4>
 37 | {{ endings_script | safe }}
 38 | {{ endings_div | safe }}
 39 | </div>
 40 | {% else %}
 41 | <div class="row">
 42 | <div>No page information available for this domain.</div>
 43 | </div>
 44 | {% endif %}
 45 | </div>
 46 | {% if (queries_script and queries_div) %}
 47 | <div class="row"><h4>Queries Statistics</h4>
 48 | {{ queries_script | safe }}
 49 | {{ queries_div | safe }}
 50 | {% else %}
 51 | <div class="row">No queries information available for this domain.
 52 | {% endif %}
 53 | </div>
 54 | {% else %}
 55 | <div class="row">
 56 | <div>No statistics available for this domain.</div>
 57 | </div>
 58 | {% endif %}
 59 | </div>
61 | 62 | 63 | -------------------------------------------------------------------------------- /vis/html/js/crawlersigslots.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @fileoverview Manager for signal slots throught the application. 3 | * Refer to this when creating new signals, and to connect slots. 4 | * 5 | * @author (cesarpalomo@gmail.com) Cesar Palomo 6 | */ 7 | 8 | 9 | 10 | /** 11 | * Manages signal slots for application UI. 12 | */ 13 | var SigSlots = (function() { 14 | ////// Signals definition is centralized here. 15 | __sig__.available_crawlers_list_loaded = function(crawlers) {}; 16 | __sig__.available_crawlers_list_reloaded = function(crawlers) {}; 17 | __sig__.available_proj_alg_list_loaded = function(proj_alg) {}; 18 | __sig__.new_pages_summary_fetched = function(summary, isFilter) {}; 19 | __sig__.previous_pages_summary_fetched = function(summary, isFilter) {}; 20 | __sig__.terms_summary_fetched = function(summary) {}; 21 | __sig__.term_focus = function(term, onFocus) {}; 22 | __sig__.term_toggle = function(term, shiftClick) {}; 23 | __sig__.terms_snippets_loaded = function(snippetsData) {}; 24 | __sig__.pages_loaded = function(pages) {}; 25 | __sig__.queries_loaded = function(queries) {}; 26 | __sig__.tags_loaded = function(tags) {}; 27 | __sig__.model_tags_loaded = function(tags) {}; 28 | __sig__.tags_colors_loaded = function(tagColors){}; 29 | __sig__.tag_focus = function(tag, onFocus) {}; 30 | __sig__.tag_clicked = function(tag) {}; 31 | __sig__.tag_action_clicked = function(tag, actionType, pages, refresh_plot) {}; 32 | __sig__.tag_individual_page_action_clicked = function(tag, actionType, page) {}; 33 | __sig__.brushed_pages_changed = function(pagesIndices) {}; 34 | 35 | __sig__.add_crawler = function(index_name) {}; 36 | __sig__.del_crawler = function(domains) {}; 37 | __sig__.query_enter = function(terms) {}; 38 | __sig__.filter_enter = function(terms) {}; 39 | __sig__.add_term = function(term) {}; 40 | __sig__.add_neg_term = function(term) {}; 41 | __sig__.delete_term = function(term) {}; 42 | __sig__.load_new_pages_summary = function(isFilter) {}; 43 | __sig__.set_pages_tags_completed = function(){}; 44 | __sig__.bokeh_insert_plot = function() {}; 45 | __sig__.update_online_classifier = function() {}; 46 | __sig__.update_online_classifier_completed = function(accuracy) {}; 47 | __sig__.build_hierarchy_filters = function(filters) {}; 48 | __sig__.new_tag_loaded = function(flag_newTag) {}; 49 | 50 | //__sig__.pages_labels_changed = function() {}; 51 | //__sig__.term_selected = function(term) {}; 52 | //__sig__.query_enter = function(query) {}; 53 | //__sig__.pages_do_ranking = function() {}; 54 | //__sig__.pages_extract_terms = function() {}; 55 | //__sig__.brushed_pages_changed = function(pagesIndices) {}; 56 | //__sig__.add_term_to_query_box = function(term) {}; 57 | 58 | var pub = {}; 59 | ////// CONNECTS SIGNALS TO SLOTS 60 | // e.g. 
SigSlots.connect(__sig__.eventHappened, myObject, myObject.onEventHappened); 61 | pub.connect = function( 62 | signal, slotInstance, slotMethod) { 63 | __sig__.connect( 64 | __sig__, signal, 65 | slotInstance, slotMethod); 66 | }; 67 | return pub; 68 | }()); 69 | -------------------------------------------------------------------------------- /elastic/test/test_get_documents.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | from mock import patch 4 | 5 | from ..get_documents import get_plotting_data 6 | 7 | @patch('domain_discovery_tool.elastic.config.es.search') 8 | def test_get_plotting_data(mock_es_search): 9 | mock_es_search.return_value = { 10 | u'_shards': {u'failed': 0, u'successful': 5, u'total': 5}, 11 | u'hits': {u'hits': [{u'_id': u'http://www.politico.com/story/2016/04/apple-hires-cynthia-hogan-221937', 12 | u'_index': u'apple', 13 | u'_score': 1.0, 14 | u'_type': u'page', 15 | u'fields': {u'query': [u'apple'], 16 | u'retrieved': [u'2016-04-16T00:06:35.292'], 17 | u'tag': [u'Relevant'], 18 | u'url': [u'http://www.politico.com/story/2016/04/apple-hires-cynthia-hogan-221937']}}, 19 | {u'_id': u'http://www.applevacations.com/', 20 | u'_index': u'apple', 21 | u'_score': 1.0, 22 | u'_type': u'page', 23 | u'fields': {u'query': [u'apple'], 24 | u'retrieved': [u'2016-04-16T00:06:36.135'], 25 | u'tag': [u'Irrelevant', u'Relevant'], 26 | u'url': [u'http://www.applevacations.com/']}}, 27 | {u'_id': u'http://www.reuters.com/article/us-apple-encryption-hearing-idUSKCN0XB2RU', 28 | u'_index': u'apple', 29 | u'_score': 1.0, 30 | u'_type': u'page', 31 | u'fields': {u'query': [u'apple'], 32 | u'retrieved': [u'2016-04-16T00:06:34.806'], 33 | u'url': [u'http://www.reuters.com/article/us-apple-encryption-hearing-idUSKCN0XB2RU']}}], 34 | u'max_score': 1.0, 35 | u'total': 285}, 36 | u'timed_out': False, 37 | u'took': 9} 38 | 39 | result = [ 40 | {u'query': [u'apple'], 41 | u'retrieved': [u'2016-04-16T00:06:35.292'], 42 | u'tag': [u'Relevant'], 43 | u'url': [u'http://www.politico.com/story/2016/04/apple-hires-cynthia-hogan-221937']}, 44 | {u'query': [u'apple'], 45 | u'retrieved': [u'2016-04-16T00:06:36.135'], 46 | u'tag': [u'Irrelevant', u'Relevant'], 47 | u'url': [u'http://www.applevacations.com/']}, 48 | {u'query': [u'apple'], 49 | u'retrieved': [u'2016-04-16T00:06:34.806'], 50 | u'url': [u'http://www.reuters.com/article/us-apple-encryption-hearing-idUSKCN0XB2RU']} 51 | ] 52 | 53 | assert get_plotting_data(u'potatoes', es=None) == result 54 | -------------------------------------------------------------------------------- /vis/bokeh_plots/utils.py: -------------------------------------------------------------------------------- 1 | from bokeh.plotting import figure 2 | from functools32 import wraps 3 | 4 | DATETIME_FORMAT = dict( 5 | microseconds=["%m/%d/%y %I:%M:%S %p"], 6 | milliseconds=["%m/%d/%y %I:%M:%S %p"], 7 | seconds=["%m/%d/%y %I:%M:%S %p"], 8 | minsec=["%m/%d/%y %I:%M:%S %p"], 9 | minutes=["%m/%d/%y %I:%M:%S %p"], 10 | hourmin=["%m/%d/%y %I:%M:%S %p"], 11 | hours=["%m/%d/%y %I:%M:%S %p"], 12 | days=["%m/%d/%y %I:%M:%S %p"], 13 | months=["%m/%d/%y %I:%M:%S %p"], 14 | years=["%m/%d/%y %I:%M:%S %p"], 15 | ) 16 | 17 | FONT = "Helvetica" 18 | FONT_SIZE = "10pt" 19 | 20 | NODATA_COLOR = "#eeeeee" 21 | GRAY = "#CCCCCC" 22 | DARK_GRAY = "#6B6B73" 23 | BLUE = '#67a9cf' 24 | RED = '#ef8a62' 25 | 26 | AXIS_FORMATS = dict( 27 | minor_tick_in=None, 28 | minor_tick_out=None, 29 | major_tick_in=None, 30 | 
major_label_text_font=FONT, 31 | major_label_text_font_size="8pt", 32 | axis_label_text_font=FONT, 33 | axis_label_text_font_style="italic", 34 | axis_label_text_font_size="8pt", 35 | 36 | axis_line_color=DARK_GRAY, 37 | major_tick_line_color=DARK_GRAY, 38 | major_label_text_color=DARK_GRAY, 39 | 40 | major_tick_line_cap="round", 41 | axis_line_cap="round", 42 | axis_line_width=1, 43 | major_tick_line_width=1, 44 | ) 45 | PLOT_FORMATS = dict( 46 | toolbar_location=None, 47 | logo=None, 48 | outline_line_color="#FFFFFF", 49 | title_text_font=FONT, 50 | title_text_align='center', 51 | title_text_color=DARK_GRAY, 52 | title_text_font_size="9pt", 53 | title_text_baseline='bottom', 54 | min_border_left=0, 55 | min_border_right=0, 56 | min_border_top=0, 57 | min_border_bottom=0, 58 | ) 59 | LINE_FORMATS = dict( 60 | line_cap='round', 61 | line_join='round', 62 | line_width=2 63 | ) 64 | FONT_PROPS_SM = dict( 65 | text_font=FONT, 66 | text_font_size='8pt', 67 | ) 68 | FONT_PROPS_MD = dict( 69 | text_font=FONT, 70 | text_font_size='10pt', 71 | ) 72 | FONT_PROPS_LG = dict( 73 | text_font=FONT, 74 | text_font_size='12pt', 75 | ) 76 | BLANK_AXIS = dict( 77 | minor_tick_in=None, 78 | minor_tick_out=None, 79 | major_tick_in=None, 80 | major_label_text_font=FONT, 81 | major_label_text_font_size="8pt", 82 | axis_label_text_font=FONT, 83 | axis_label_text_font_style="italic", 84 | axis_label_text_font_size="8pt", 85 | 86 | axis_line_color='white', 87 | major_tick_line_color='white', 88 | major_label_text_color='white', 89 | axis_label_text_color='white', 90 | 91 | major_tick_line_cap="round", 92 | axis_line_cap="round", 93 | axis_line_width=1, 94 | major_tick_line_width=1, 95 | ) 96 | 97 | def make_empty_plot(plot_width, plot_height): 98 | return figure(plot_width=plot_width, plot_height=plot_height, 99 | tools="", toolbar_location=None) 100 | 101 | def empty_plot_on_empty_df(func): 102 | @wraps(func) 103 | def wrapper(*args, **kwargs): 104 | if len(args[0]) == 0: 105 | return make_empty_plot(func.func_defaults[0], 106 | func.func_defaults[1]) 107 | return func(*args, **kwargs) 108 | return wrapper 109 | -------------------------------------------------------------------------------- /online_classifier/tf_vector.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_extraction.text import CountVectorizer 2 | from nltk import corpus 3 | 4 | class tf_vectorizer: 5 | 6 | def __init__(self, convert_to_ascii=False, max_features=10000, ngram_range=(1,1)): 7 | self.convert_to_ascii = convert_to_ascii 8 | self.count_vect = None 9 | self.max_features = max_features 10 | self.ngram_range = ngram_range 11 | self.ENGLISH_STOPWORDS = corpus.stopwords.words('english') 12 | 13 | def vectorize(self, data): 14 | X_counts = None 15 | 16 | if self.count_vect is None: 17 | self.count_vect = CountVectorizer(stop_words=self.ENGLISH_STOPWORDS, preprocessor=self.preprocess, strip_accents='ascii', ngram_range=self.ngram_range, max_features=self.max_features) 18 | X_counts = self.count_vect.fit_transform(data) 19 | else: 20 | X_counts = self.count_vect.transform(data) 21 | 22 | return [X_counts, self.count_vect.get_feature_names()] 23 | 24 | def tf(self, data): 25 | return self.vectorize(data) 26 | 27 | def preprocess(self, text): 28 | # Remove unwanted chars and new lines 29 | text = text.lower().replace(","," ").replace("__"," ").replace("(", " ").replace(")", " ").replace("[", " ").replace("]", " ").replace(".", " ").replace("/", " ").replace("\\", " ").replace("_", " 
").replace("#", " ").replace("-", " ").replace("+", " ").replace("%", " ").replace(";", " ").replace(":", " ").replace("'", " ").replace("\""," ").replace("^", " ") 30 | 31 | text = text.replace("\n"," ") 32 | 33 | if self.convert_to_ascii: 34 | # Convert to ascii 35 | ascii_text = [] 36 | for x in text.split(" "): 37 | try: 38 | ascii_text.append(x.encode('ascii', 'ignore')) 39 | except: 40 | continue 41 | 42 | text = " ".join(ascii_text) 43 | 44 | preprocessed_text = " ".join([word.strip() for word in text.split(" ") if len(word.strip()) > 2 and (word.strip() != "") and (self.isnumeric(word.strip()) == False) and self.notHtmlTag(word.strip()) and self.notMonth(word.strip())]) 45 | 46 | return preprocessed_text 47 | 48 | def notHtmlTag(self, word): 49 | html_tags = ["http", "html", "img", "images", "image", "index"] 50 | 51 | for tag in html_tags: 52 | if (tag in word) or (word in ["url", "com", "www", "www3", "admin", "backup", "content"]): 53 | return False 54 | 55 | return True 56 | 57 | def notMonth(self, word): 58 | month_tags = ["jan", "january", "feb", "february","mar", "march","apr", "april","may", "jun", "june", "jul", "july", "aug", "august","sep", "sept", "september","oct","october","nov","november","dec", "december","montag", "dienstag", "mittwoch", "donnerstag", "freitag", "samstag", "sontag"] 59 | 60 | if word in month_tags: 61 | return False 62 | 63 | return True 64 | 65 | def isnumeric(self, s): 66 | # Check if string is a numeric 67 | try: 68 | int(s.replace(".","").replace("-","").replace("+","")) 69 | return True 70 | except ValueError: 71 | try: 72 | long(s.replace(".","").replace("-","").replace("+","")) 73 | return True 74 | except ValueError: 75 | return False 76 | 77 | 78 | -------------------------------------------------------------------------------- /vis/html/js/snippetsviewer.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @fileoverview Manages a set of snippets for a frequent term appearing in URLs. 3 | * 4 | * @author (cesarpalomo@gmail.com) Cesar Palomo 5 | */ 6 | 7 | 8 | 9 | /** 10 | * Manages a set of snippets for a frequent term appearing in URLs. 11 | * 12 | * @param parentContainerId ID for div element for snippets viewer. 13 | */ 14 | var SnippetsViewer = function(parentContainerId) { 15 | this.parentContainerId = parentContainerId; 16 | 17 | // Items in viewer. 18 | this.items = []; 19 | }; 20 | 21 | 22 | /** 23 | * Clears all items in the viewer. 24 | */ 25 | SnippetsViewer.prototype.clear = function(lazyUpdate) { 26 | this.items = []; 27 | if (!lazyUpdate) { 28 | this.update(); 29 | } 30 | }; 31 | 32 | 33 | /** 34 | * Adds item to viewer: {term: xyz, snippet: abcd xyz nhonhonho} 35 | */ 36 | SnippetsViewer.prototype.addItem = function(snippet, lazyUpdate) { 37 | this.items.push(snippet); 38 | if (!lazyUpdate) { 39 | this.update(); 40 | } 41 | }; 42 | 43 | 44 | /** 45 | * Adds multiple items to viewer. 46 | */ 47 | SnippetsViewer.prototype.addItems = function(snippets, lazyUpdate) { 48 | this.items = this.items.concat(snippets); 49 | if (!lazyUpdate) { 50 | this.update(); 51 | } 52 | }; 53 | 54 | 55 | /** 56 | * Updates viewer. 57 | */ 58 | SnippetsViewer.prototype.update = function() { 59 | var viewer = this; 60 | var items = d3.select(this.parentContainerId) 61 | .selectAll('.item').data(this.items, function(item, i) { 62 | return item.term + '-' + i + '-' + item.snippet.substring(0, 30); 63 | }); 64 | 65 | // New items. 
66 | items.enter() 67 | .append('div') 68 | .classed('noselect', true) 69 | .classed('item', true) 70 | .on('click', function(item, i) { 71 | var elem = d3.select(this); 72 | elem.classed('dblclicked', !elem.classed('dblclicked')); 73 | viewer.onItemDoubleClick(item, i); 74 | window.open(item.url, '_blank'); 75 | }); 76 | 77 | // Removes missing items. 78 | items.exit().remove(); 79 | 80 | // Updates existing items. 81 | items 82 | .html(function(item, i) { 83 | return viewer.getItemInfo(item, i); 84 | }); 85 | items.each(function(item, i) { 86 | var tags = item.term['tags']; 87 | var isPositive = tags.indexOf('Positive') != -1; 88 | var isNegative = tags.indexOf('Negative') != -1; 89 | d3.select(this).selectAll('em') 90 | .classed('Positive', isPositive) 91 | .classed('Negative', isNegative); 92 | }); 93 | }; 94 | 95 | 96 | /** 97 | * Builds html content with info about an item in the viewer. 98 | */ 99 | SnippetsViewer.prototype.getItemInfo = function(item, i) { 100 | // TODO Add more details about term. 101 | return '

<div class="itemInfo">' + item.snippet + '</div>

';
 102 | };
 103 | 
 104 | 
 105 | /**
 106 | * Builds html content with buttons for labeling the relevancy of an item in
 107 | * the viewer, such as Yes, No, Maybe.
 108 | */
 109 | SnippetsViewer.prototype.getItemLabels = function(item, i) {
 110 | // TODO.
 111 | return '

<div class="itemLabels"><button>Yes</button> <button>No</button> <button>Maybe</button></div>

'; 112 | }; 113 | 114 | 115 | /** 116 | * Handles click in an item. 117 | */ 118 | SnippetsViewer.prototype.onItemClick = function(item, i) { 119 | // TODO. 120 | console.log('itemClicked ' + i); 121 | }; 122 | 123 | 124 | /** 125 | * Handles click in an item. 126 | */ 127 | SnippetsViewer.prototype.onItemDoubleClick = function(item, i) { 128 | // TODO. 129 | console.log('itemDoubleClicked ' + i); 130 | }; 131 | -------------------------------------------------------------------------------- /ranking/word2vec.py: -------------------------------------------------------------------------------- 1 | from pickle import load 2 | import numpy as np 3 | from os import environ 4 | 5 | from elastic.get_mtermvectors import getTermFrequency 6 | from preprocess import TextPreprocess 7 | from elastic.get_documents import get_documents_by_id 8 | 9 | class word2vec: 10 | def __init__(self, opt_docs = None, mapping=None, from_es=True, es_index = 'memex', es_doc_type = 'page', es = None): 11 | self.documents = opt_docs 12 | self.word2vec = None 13 | self.word_vec = None 14 | self.es = es 15 | 16 | if not from_es: 17 | f = open(environ['DDT_HOME']+'/ranking/D_cbow_pdw_8B.pkl', 'rb') 18 | self.word_vec = load(f) 19 | 20 | if opt_docs != None: 21 | self.process(opt_docs, mapping, es_index, es_doc_type, es) 22 | 23 | def get_word2vec(self): 24 | return [self.documents,self.word2vec] 25 | 26 | def get(self, word): 27 | if self.word_vec is None: 28 | results = get_documents_by_id([word], ["term"], "word_phrase_to_vec", "terms", self.es) 29 | if results is None: 30 | return None; 31 | else: 32 | return results[0]["term"][0] 33 | else: 34 | return self.word_vec.get(word) 35 | 36 | def process(self, documents, mapping=None, es_index = 'memex', es_doc_type = 'page', es = None): 37 | [data_tf, corpus, urls] = getTermFrequency(documents, mapping, es_index, es_doc_type, es) 38 | 39 | documents = urls 40 | 41 | word2vec_list_docs = [] 42 | urls = [] 43 | i = 0 44 | for doc in data_tf: 45 | if self.word_vec is None: 46 | results = get_documents_by_id(doc.keys(), ["term", "vector"], "word_phrase_to_vec", "terms", self.es) 47 | word_vec_doc = [res["vector"][0] for res in results] 48 | else: 49 | word_vec_doc = [self.word_vec[term] for term in doc.keys() if doc[term] >= 1 and not self.word_vec.get(term) is None] 50 | 51 | if word_vec_doc: 52 | m_word_vec = np.array(word_vec_doc).mean(axis=0) 53 | word2vec_list_docs.append(m_word_vec.tolist()) 54 | urls.append(documents[i]) 55 | i = i + 1 56 | 57 | self.documents = urls 58 | 59 | self.word2vec = np.array(word2vec_list_docs) 60 | 61 | return [self.documents,self.word2vec] 62 | 63 | def process_text(self, urls, documents): 64 | tp = TextPreprocess() 65 | 66 | word2vec_list_docs = [] 67 | final_urls = [] 68 | i = 0 69 | for text in documents: 70 | doc = tp.preprocess(text) 71 | if self.word_vec is None: 72 | terms = [term for term in doc.keys() if doc[term] > 5] 73 | results = get_documents_by_id(terms, ["term", "vector"], "word_phrase_to_vec", "terms", self.es) 74 | word_vec_doc = [res["vector"] for res in results] 75 | else: 76 | word_vec_doc = [self.word_vec[term] for term in doc.keys() if not self.word_vec.get(term) is None] 77 | #word_vec_doc = [self.word_vec[term]*doc[term] for term in doc.keys() if not self.word_vec.get(term) is None] 78 | 79 | if word_vec_doc: 80 | m_word_vec = np.array(word_vec_doc).mean(axis=0) 81 | #m_word_vec = np.sum(np.array(word_vec_doc), axis=0)/np.sum(np.array([doc[term] for term in doc.keys() if not self.word_vec.get(term) is None])) 82 | 
word2vec_list_docs.append(m_word_vec.tolist()) 83 | final_urls.append(urls[i]) 84 | i = i + 1 85 | 86 | self.documents = final_urls 87 | 88 | self.word2vec = np.array(word2vec_list_docs) 89 | 90 | return [self.documents,self.word2vec] 91 | -------------------------------------------------------------------------------- /vis/html/js/libs/d3.lasso.min.js: -------------------------------------------------------------------------------- 1 | d3.lasso=function(){function t(){function t(){u="",P.attr("d",null),v.attr("d",null),g=0,n[0].forEach(function(t){t.hoverSelected=!1,t.loopSelected=!1;var e=t.getBBox();t.lassoPoint={cx:Math.round(e.x+e.width/2),cy:Math.round(e.y+e.height/2),edges:{top:0,right:0,bottom:0,left:0},close_edges:{left:0,right:0}}}),1==a&&n.on("mouseover.lasso",function(){d3.select(this)[0][0].hoverSelected=!0}),i.start()}function c(){var t=d3.mouse(this)[0],a=d3.mouse(this)[1];""==u?(u=u+"M "+t+" "+a,h=[t,a],M.attr("cx",t).attr("cy",a).attr("r",7).attr("display",null)):u=u+" L "+t+" "+a,n[0].forEach(function(t){t.lassoPoint.close_edges={left:0,right:0}});var l=Math.sqrt(Math.pow(t-h[0],2)+Math.pow(a-h[1],2)),c="M "+t+" "+a+" L "+h[0]+" "+h[1];P.attr("d",u),o>=l?v.attr("display",null):v.attr("display","none"),s=o>=l?!0:!1;var d=u+"Z";x.attr("d",d);for(var y=P.node(),p=y.getTotalLength(),m=(y.getPointAtLength(g-1),g);p>=m;m++){var _=y.getPointAtLength(m),S={x:Math.round(100*_.x)/100,y:Math.round(100*_.y)/100},b=y.getPointAtLength(m-1),L={x:Math.round(100*b.x)/100,y:Math.round(100*b.y)/100};n[0].filter(function(t){var n;return t.lassoPoint.cy===S.y&&t.lassoPoint.cy!=L.y?(f={x:L.x,y:L.y},n=!1):t.lassoPoint.cy===S.y&&t.lassoPoint.cy===L.y?n=!1:t.lassoPoint.cy===L.y&&t.lassoPoint.cy!=S.y?n=e(t.lassoPoint.cy-S.y)!=e(t.lassoPoint.cy-f.y):(f={x:L.x,y:L.y},n=e(t.lassoPoint.cy-S.y)!=e(t.lassoPoint.cy-L.y)),n}).forEach(function(t){S.x>t.lassoPoint.cx&&(t.lassoPoint.edges.right=t.lassoPoint.edges.right+1),S.x=m;m++){var _=close_path_node.getPointAtLength(m),b=close_path_node.getPointAtLength(m-1);n[0].filter(function(t){return t.lassoPoint.cy==Math.round(_.y)}).forEach(function(t){Math.round(_.y)!=Math.round(b.y)&&Math.round(_.x)>t.lassoPoint.cx&&(t.lassoPoint.close_edges.right=1),Math.round(_.y)!=Math.round(b.y)&&Math.round(_.x)0&&(t.lassoPoint.edges.right+t.lassoPoint.close_edges.right)%2==1?!0:!1})}else n[0].forEach(function(t){t.loopSelected=!1});d3.selectAll(n[0].filter(function(t){return t.loopSelected&&s||t.hoverSelected})).attr("d",function(t){return t.possible=!0}),d3.selectAll(n[0].filter(function(t){return!(t.loopSelected&&s||t.hoverSelected)})).attr("d",function(t){return t.possible=!1}),i.draw(),g=p+1}function d(){n.on("mouseover.lasso",null),n.filter(function(t){return t.possible===!0}).attr("d",function(t){return t.selected=!0}),n.filter(function(t){return t.possible===!1}).attr("d",function(t){return t.selected=!1}),n.attr("d",function(t){return t.possible=!1}),P.attr("d",null),v.attr("d",null),M.attr("display","none"),i.end()}var u,h,f,g,y=d3.select(this[0][0]),p=y.append("g").attr("class","lasso"),P=p.append("path").attr("class","drawn"),v=p.append("path").attr("class","loop_close"),x=p.append("path").attr("display","none"),M=p.append("circle").attr("class","origin"),m=d3.behavior.drag().on("dragstart",t).on("drag",c).on("dragend",d);l.call(m)}function e(t){return t?0>t?-1:1:0}var n=null,o=75,r=!0,s=!1,a=!0,l=null,i={start:function(){},draw:function(){},end:function(){}};return t.items=function(e){return arguments.length?(n=e,n[0].forEach(function(t){var 
e=d3.select(t);"undefined"==typeof e.datum()?e.datum({possible:!1,selected:!1}):e.attr("d",function(t){return t.possible=!1,t.selected=!1,t})}),t):n},t.closePathDistance=function(e){return arguments.length?(o=e,t):o},t.closePathSelect=function(e){return arguments.length?(r=1==e?!0:!1,t):r},t.isPathClosed=function(e){return arguments.length?(s=1==e?!0:!1,t):s},t.hoverSelect=function(e){return arguments.length?(a=1==e?!0:!1,t):a},t.on=function(e,n){if(!arguments.length)return i;if(1===arguments.length)return i[e];var o=["start","draw","end"];return o.indexOf(e)>-1&&(i[e]=n),t},t.area=function(e){return arguments.length?(l=e,t):l},t}; 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Domain Discovery Tool 2 | 3 | This repository contains the Domain Discovery Tool (DDT) project. DDT is an interactive system that helps users explore and better understand a domain (or topic) as it is represented on the Web. It achieves this by integrating human insights with machine computation (data mining and machine learning) through visualization. DDT allows a domain expert to visualize and analyze pages returned by a search engine or a crawler, and easily provide feedback about relevance. This feedback, in turn, can be used to address two challenges: 4 | 5 | * Guide users in the process of domain understanding and help them construct effective queries to be issued to a search engine; and 6 | * Configure focused crawlers that efficiently search the Web for additional pages on the topic. DDT allows users to quickly select crawling seeds as well as positive and negatives required to create a page classifier for the focus topic. 7 | 8 | ## Installing on your machine 9 | 10 | Building and deploying the Domain Discovery Tool can either be done using its Makefile to create a local development environment, or automatically by conda or Docker for deployment. The conda build environment is currently only supported on 64-bit OS X and Linux. 11 | 12 | ### Local development 13 | 14 | First install conda, either through the Anaconda or miniconda installers provided by Continuum. You will also need Git and a Java Development Kit. These are system tools that are generally not provided by conda. 15 | 16 | Clone the DDT repository and enter it: 17 | 18 | ``` 19 | https://github.com/ViDA-NYU/domain_discovery_tool 20 | cd domain_discovery_tool 21 | ``` 22 | 23 | Use the `make` command to build DDT and download/install its dependencies. 24 | 25 | ``` 26 | make 27 | ``` 28 | 29 | After a successful installation, you can activate the DDT development environment: 30 | 31 | ``` 32 | source activate ddt 33 | ``` 34 | 35 | And (from the top-level `domain_discovery_tool` directory), start 36 | supervisord to run the web application and its associated services: 37 | 38 | ``` 39 | supervisord 40 | ``` 41 | 42 | Now you should be able to head to http://localhost:8084/ to interact 43 | with the tool. 44 | 45 | ### Docker development 46 | 47 | First, make sure you have Docker installed and running. Then, you can create an DDT image using the Dockerfile. Run the following command in the root folder of this project: 48 | 49 | docker build -t domain_discovery_tool . 50 | 51 | or download the latest published docker build (you do not need to clone the DDT repository in this case): 52 | 53 | docker pull vidanyu/ddt:latest 54 | 55 | Run the app using the Docker image that you just built (or pulled). 
This starts the elasticsearch and the DDT server: 56 | 57 | docker run -i -p 8084:8084 -p 9200:9200 -t /ddt/run_demo.sh 58 | 59 | To see the app running, go to: 60 | 61 | http://localhost:8084/seedcrawler 62 | 63 | Alternativaly, you can also specify an external ElasticSearch server address using an enviroment variable: 64 | 65 | docker run -p 8084:8084 -e "ELASTICSEARCH_SERVER=http://127.0.0.1:9200" -i -t 66 | 67 | ## Further Documentation 68 | 69 | [Detailed Description of the tool](https://s3.amazonaws.com/vida-nyu/DDT/domain_discovery_tool.pdf) 70 | 71 | [Demo Scripts and Videos](https://s3.amazonaws.com/vida-nyu/DDT/DomainDiscoveryToolDemoScripts.pdf) 72 | 73 | **Note:** To follow the demo videos download and use the following demo build version of DDT: 74 | 75 | ``` 76 | docker pull vidanyu/ddt:2.7.0-demo 77 | docker run -i -p 8084:8084 -p 9200:9200 -p 9001:9001 -t vidanyu/ddt:2.7.0-demo 78 | ``` 79 | 80 | ## Publication 81 | 82 | Yamuna Krishnamurthy, Kien Pham, Aecio Santos, and Juliana Friere. 2016. [Interactive Web Content Exploration for Domain Discovery](http://poloclub.gatech.edu/idea2016/papers/p64-krishnamurthy.pdf) (Interactive Data Exploration and Analytics ([IDEA](http://poloclub.gatech.edu/idea2016/)) Workshop at Knowledge Discovery and Data Mining ([KDD](http://www.kdd.org/kdd2016/)), San Francisco, CA). 83 | 84 | ## Contact 85 | 86 | DDT Development Team [ddt-dev@vgc.poly.edu] 87 | 88 | -------------------------------------------------------------------------------- /vis/bokeh_plots/test/test_cross_filter.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from pandas.util.testing import assert_frame_equal 4 | import pytest 5 | 6 | from ..cross_filter import (parse_es_response, calculate_query_correlation, 7 | calculate_graph_coords, duplicate_multi_rows, normalize) 8 | 9 | @pytest.fixture 10 | def es_response(): 11 | return [ 12 | {u'query': [u'apple', u'banana'], 13 | u'retrieved': [u'2016-04-16T00:06:35.292'], 14 | u'tag': [u'Relevant'], 15 | u'url': [u'http://www.politico.com/story/2016/04/apple-hires-cynthia-hogan-221937']}, 16 | {u'query': [u'apple', u'banana'], 17 | u'retrieved': [u'2016-04-16T00:06:36.135'], 18 | u'tag': [u'Irrelevant', u'Relevant'], 19 | u'url': [u'http://www.applevacations.com/']}, 20 | {u'query': [u'apple'], 21 | u'retrieved': [u'2016-04-16T00:06:34.806'], 22 | u'url': [u'http://www.reuters.com/article/us-apple-encryption-hearing-idUSKCN0XB2RU']}, 23 | {u'query': [u'banana'], 24 | u'retrieved': [u'2016-04-16T00:06:36.135'], 25 | u'tag': [u'Irrelevant', u'Relevant'], 26 | u'url': [u'http://www.bananavacations.com/']}, 27 | {u'query': [u'carrot'], 28 | u'retrieved': [u'2016-04-16T00:06:34.806'], 29 | u'url': [u'http://www.nytimes.com/article/us-apple-encryption-hearing-idUSKCN0XB2RU']} 30 | ] 31 | 32 | 33 | def test_parse_es_response(es_response): 34 | # note that data becomes ordered by `retrieved` field 35 | data = {'url': [u'http://www.reuters.com/article/us-apple-encryption-hearing-idUSKCN0XB2RU', 36 | u'http://www.nytimes.com/article/us-apple-encryption-hearing-idUSKCN0XB2RU', 37 | u'http://www.politico.com/story/2016/04/apple-hires-cynthia-hogan-221937', 38 | u'http://www.politico.com/story/2016/04/apple-hires-cynthia-hogan-221937', 39 | u'http://www.applevacations.com/', 40 | u'http://www.applevacations.com/', 41 | u'http://www.applevacations.com/', 42 | u'http://www.applevacations.com/', 43 | u'http://www.bananavacations.com/', 44 | 
u'http://www.bananavacations.com/'], 45 | 'query': [u'apple', u'carrot', u'apple', u'banana', u'apple', u'apple', u'banana', u'banana', u'banana', u'banana'], 46 | 'tag': ['Untagged', 'Untagged', u'Relevant', u'Relevant', u'Irrelevant', u'Relevant', u'Irrelevant', u'Relevant', u'Irrelevant', u'Relevant'], 47 | 'hostname': [u'reuters.com', u'nytimes.com', u'politico.com', u'politico.com', u'applevacations.com', u'applevacations.com', u'applevacations.com', u'applevacations.com', u'bananavacations.com', u'bananavacations.com'], 48 | 'tld': [u'.com', u'.com', u'.com', u'.com', u'.com', u'.com', u'.com', u'.com', u'.com', u'.com']} 49 | 50 | df = parse_es_response(es_response) 51 | 52 | assert df.to_dict('list') == data 53 | assert df.index.tz.tzname("") == 'UTC' 54 | 55 | def test_calculate_query_correlation(es_response): 56 | df = parse_es_response(es_response) 57 | 58 | graph = calculate_query_correlation(df, 'query') 59 | 60 | assert graph == {(u'apple', u'banana'): 1.0} 61 | 62 | def test_calculate_graph_coords(es_response): 63 | df = parse_es_response(es_response) 64 | 65 | graph = calculate_graph_coords(df, 'query') 66 | 67 | assert np.allclose(graph.x.tolist(), [-0.5, -0.5, 1.0]) 68 | assert np.allclose(graph.y.tolist(), [0.8660254037, -0.8660254037, 0.0]) 69 | assert graph.url.tolist() == [4, 5, 1] 70 | assert graph.index.tolist() == [u'apple', u'banana', u'carrot'] 71 | 72 | def test_duplicate_multitag_rows(es_response): 73 | df = parse_es_response(es_response) 74 | 75 | # Refactored to be called inside of parse_es_response 76 | # df = duplicate_multi_rows(df, 'tag') 77 | 78 | assert df.shape == (10,5) 79 | assert df.tag.tolist() == ['Untagged', 'Untagged', u'Relevant', u'Relevant', 80 | u'Irrelevant', u'Relevant', u'Irrelevant', 81 | u'Relevant', u'Irrelevant', u'Relevant'] 82 | 83 | def test_normalize(): 84 | assert np.allclose(normalize(pd.Series([1,2,3]), 3, 1.5).tolist(), [1.5, 2.0, 3.0]) 85 | -------------------------------------------------------------------------------- /vis/html/js/bokeh_controller.js: -------------------------------------------------------------------------------- 1 | /** 2 | * This module handles communication between the bokeh callbacks and the rest of 3 | * the DDT application. Many of these functions are helper functions called from 4 | * the bokeh CustomJS callbacks in `vis/bokeh_graphs/clustering.py`. 5 | */ 6 | (function(exports){ 7 | 8 | exports.inds = []; 9 | exports.plot = {}; 10 | 11 | // Takes urls and tags from Bokeh and changes their tags. 12 | exports.updateTags = function(selectedUrls, tag, action){ 13 | // Add the tag to tagsgallery if it does not exist. For example a custom tag 14 | exports.vis.tagsGallery.addItem(tag, false); 15 | exports.vis.tagsGallery.applyOrRemoveTag(tag, action, selectedUrls, false); 16 | } 17 | 18 | exports.addCustomTags = function(custom_tags){ 19 | for(var i in custom_tags){ 20 | if(custom_tags[i] != "Custom tags") 21 | exports.vis.tagsGallery.addItem(custom_tags[i], false); 22 | } 23 | } 24 | 25 | exports.crawlPages = function(selectedURLs, crawl_type){ 26 | exports.vis.crawlPages(selectedURLs, crawl_type); 27 | } 28 | 29 | // Shows the selected pages on the pageGallery below the plot. 30 | exports.showPages = function(inds){ 31 | exports.inds = inds; 32 | exports.vis.onBrushedPagesChanged(inds); 33 | } 34 | 35 | // Inserts the bokeh plot at the specified dom element. 
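 // (plotData is the server-rendered Bokeh markup, i.e. the plot's script and
 // div; it is injected into the #pages_landscape element selected below.)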
36 | exports.insertPlot = function(plotData){ 37 | $("#pages_landscape").html(plotData); 38 | } 39 | 40 | exports.BokehPlotKey = function(){ 41 | return Bokeh.index[Object.keys(Bokeh.index)[0]].model.children()[0] 42 | } 43 | 44 | 45 | exports.getGlyphRenderersByType = function(glyphType) { 46 | var allRenderers = exports.plot.get("renderers"); 47 | var renderers = []; 48 | $.each(exports.plot.get("renderers"), function(index, value) { 49 | if (value.attributes.hasOwnProperty("glyph") && value.attributes.glyph.type === glyphType) { 50 | renderers.push(value); 51 | } 52 | }); 53 | return renderers; 54 | }; 55 | 56 | 57 | exports.updatePlotColors = function(url, color) { 58 | var renderer = exports.getGlyphRenderersByType("Circle")[0]; 59 | var d = renderer.get("data_source").get("data"); 60 | var url_index = -1; 61 | var urls = [].concat.apply([], d.urls); 62 | for(var i in urls){ 63 | if(urls[i] == url){ 64 | url_index = i; 65 | break; 66 | } 67 | } 68 | d.color[url_index] = color; 69 | renderer.get("data_source").set("data", d); 70 | renderer.get("data_source").trigger("change"); 71 | }; 72 | 73 | 74 | // Gets the necessary javascript and HTML for rendering the bokeh plot into 75 | // the dom. 76 | exports.getPlotData = function(data){ 77 | Bokeh.index = {}; 78 | exports.insertPlot(data.plot); 79 | exports.plot = exports.BokehPlotKey() 80 | } 81 | 82 | 83 | exports.getEmptyPlot = function(){ 84 | $.ajax({ 85 | url: "/getEmptyBokehPlot", 86 | type: "GET", 87 | success: function(data){ 88 | exports.insertPlot(data); 89 | }, 90 | }); 91 | } 92 | 93 | exports.updateData = function(updated_tags){ 94 | // Update the data with the new tags 95 | var data = exports.vis.pagesLandscape.getPagesData(); 96 | for(var i in data){ 97 | var url = data[i]["url"]; 98 | if(updated_tags[url] != undefined){ 99 | data[i]["tags"] = updated_tags[url]["tags"]; 100 | exports.updatePlotColors(url, updated_tags[url]["color"]); 101 | } 102 | } 103 | exports.vis.pagesLandscape.setPagesData(data); 104 | exports.vis.pagesGallery.update(); 105 | } 106 | 107 | exports.clear = function(updated_tags){ 108 | exports.getEmptyPlot(); 109 | } 110 | 111 | // Connects getPlotData to the bokeh_insert_plot signal. 112 | SigSlots.connect(__sig__.bokeh_insert_plot, exports, exports.getPlotData); 113 | 114 | exports.getEmptyPlot(); 115 | 116 | // Statistics page functions and callbacks. 117 | $("#goto_statistics").on("click", function(){ 118 | var url = "/statistics?" + $.param({session: JSON.stringify(exports.vis.sessionInfo())}); 119 | $(this).attr("href", url); 120 | }); 121 | 122 | })(this.BokehPlots = {}); 123 | -------------------------------------------------------------------------------- /vis/html/cross_filter.html: -------------------------------------------------------------------------------- 1 | {% extends 'base.html' %} 2 | 3 | {% block navigation %} 4 | 18 | {% endblock navigation %} 19 | 20 | {% block content %} 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 |
30 |
31 |
{{ widgets_div['queries'] | safe }}
32 |
{{ widgets_div['tags'] | safe }}
33 |
{{ widgets_div['urls'] | safe }}
34 |
{{ widgets_div['tlds'] | safe }}
35 |
Start Date
36 |
37 |
38 | 39 | 40 | 41 | 42 |
43 |
44 |
End Date
45 |
46 |
47 | 48 | 49 | 50 | 51 |
52 |
53 |
54 |
55 |
56 | {% include 'cross_filter_plot_area.html' %} 57 |
58 |
59 |
60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 86 | {% endblock content %} 87 | -------------------------------------------------------------------------------- /vis/bokeh_plots/domains_dashboard.py: -------------------------------------------------------------------------------- 1 | from urlparse import urlparse 2 | from collections import Counter 3 | from operator import itemgetter 4 | import datetime 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import networkx as nx 9 | 10 | from bokeh.plotting import figure, show, output_file 11 | from bokeh.embed import components 12 | from bokeh.models import ColumnDataSource, HoverTool 13 | from bokeh.models.widgets import Panel, Tabs, Button, DataTable, DateFormatter, TableColumn 14 | from bokeh.models.widgets.layouts import HBox, VBox 15 | from bokeh.charts import Bar 16 | from bokeh.io import vform, vplot 17 | 18 | 19 | DOMAIN_PLOT_LIMIT = 10 20 | DOMAIN_TABLE_LIMIT = None 21 | 22 | ENDING_PLOT_LIMIT = 10 23 | ENDING_TABLE_LIMIT = None 24 | 25 | BAR_WIDTH = 0.4 26 | 27 | 28 | def pages_timeseries(response): 29 | parsed_dates = pd.Series(pd.to_datetime([x[1] for x in response]).order(), 30 | name="datetimes") 31 | hits = pd.Series(range(1, len(parsed_dates) + 1), name="hits") 32 | dates = pd.concat([hits, parsed_dates], axis=1).set_index("datetimes") 33 | dates = dates.resample("30S").dropna() 34 | plot = figure(plot_height=584, x_axis_type="datetime", x_axis_label="Time", 35 | y_axis_label="Fetched") 36 | plot.line(x=dates.index, y=dates["hits"]) 37 | return Panel(child=plot, title="Fetched") 38 | 39 | 40 | def endings_table(source): 41 | columns = [ 42 | TableColumn(field="x", title="Ending"), 43 | TableColumn(field="y", title="Count"), 44 | ] 45 | table = DataTable(source=source, 46 | columns=columns, width=400, height=280) 47 | return table 48 | 49 | 50 | def endings_plot(source): 51 | plot = Bar(source.data, values="y", label="x", 52 | title="Most Common URL Endings by Number", bar_width=BAR_WIDTH, 53 | height=584, xlabel="Endings", ylabel="Occurences") 54 | return plot 55 | 56 | 57 | def endings_dashboard(response): 58 | urls = [x[0][0] for x in response["pages"]] 59 | parsed_urls = [urlparse(x).hostname for x in urls] 60 | endings_counter = Counter([x[x.rfind("."):] for x in parsed_urls]).most_common(ENDING_PLOT_LIMIT) 61 | xendings = [x[0] for x in endings_counter] 62 | yendings = [y[1] for y in endings_counter] 63 | source = ColumnDataSource(data=dict(x=xendings, y=yendings)) 64 | 65 | table = VBox(children=[endings_table(source)]) 66 | plot = VBox(children=[endings_plot(source)]) 67 | return components(vplot(HBox(children=[table, plot]))) 68 | 69 | 70 | def domains_dashboard(response, extra_plots=None): 71 | """ 72 | Domains dashboard plot function. Takes an arguments for extra plots which 73 | will be added in a tab with the other plots. 74 | """ 75 | # Parsed Response Data 76 | urls = [x[0][0] for x in response["pages"]] 77 | parsed_urls = [urlparse(x).hostname for x in urls] 78 | 79 | # Domain names Bar chart. 
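# Counter.most_common(DOMAIN_PLOT_LIMIT) yields (hostname, count) pairs in descending order of count, e.g. [('example.com', 3), ('example.org', 1)].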
80 | domains_counter = Counter(parsed_urls).most_common(DOMAIN_PLOT_LIMIT) 81 | xdomains = [x[0] for x in domains_counter] 82 | ydomains = [y[1] for y in domains_counter] 83 | source_domains = ColumnDataSource(data=dict(x=xdomains, y=ydomains)) 84 | 85 | bar_domains = Bar(source_domains.data, values="y", label="x", title="Most Common Sites by Number", 86 | bar_width=BAR_WIDTH, height=584, xlabel="Sites", 87 | ylabel="Occurences") 88 | panel_domains = Panel(child=bar_domains, title="Sites") 89 | 90 | # Domain Information Table 91 | table_domains_counter = Counter(parsed_urls).most_common(DOMAIN_TABLE_LIMIT) 92 | xdomains_table = [x[0] for x in table_domains_counter] 93 | ydomains_table = [y[1] for y in table_domains_counter] 94 | source_table_domains = ColumnDataSource(data=dict(x=xdomains_table, 95 | y=ydomains_table)) 96 | 97 | columns_domain = [ 98 | TableColumn(field="x", title="Site Name"), 99 | TableColumn(field="y", title="Count"), 100 | ] 101 | data_table_domain = DataTable(source=source_table_domains, columns=columns_domain, width=400, 102 | height=280) 103 | 104 | # Add the plots and charts to a vform and organize them with VBox and HBox 105 | plot_tabs = Tabs(tabs=[panel_domains, extra_plots]) 106 | 107 | # Take the plot and table and arrange them in a hbox. 108 | vbox_tables = VBox(children=[data_table_domain]) 109 | vbox_plots = VBox(children=[plot_tabs]) 110 | hbox_dashboard = HBox(children=[vbox_tables, vbox_plots]) 111 | return components(vplot(hbox_dashboard)) 112 | -------------------------------------------------------------------------------- /seeds_generator/src/main/java/page_downloader/BingSearch.java: -------------------------------------------------------------------------------- 1 | import java.io.BufferedReader; 2 | import java.io.IOException; 3 | import java.io.InputStreamReader; 4 | import java.io.InputStream; 5 | import java.io.FileInputStream; 6 | import java.io.StringReader; 7 | import java.io.File; 8 | import java.io.FileReader; 9 | import java.io.PrintWriter; 10 | import java.net.HttpURLConnection; 11 | import java.net.MalformedURLException; 12 | import java.net.URL; 13 | import java.util.Properties; 14 | import java.util.ArrayList; 15 | import org.apache.commons.codec.binary.Base64; 16 | import org.xml.sax.InputSource; 17 | import org.w3c.dom.*; 18 | import javax.xml.parsers.DocumentBuilderFactory; 19 | import javax.xml.parsers.DocumentBuilder; 20 | 21 | public class BingSearch { 22 | 23 | private String accountKey; 24 | private Properties prop; 25 | 26 | public BingSearch(){ 27 | try{ 28 | prop = new Properties(); 29 | FileInputStream is = new FileInputStream("conf/config.properties"); 30 | prop.load(is); 31 | accountKey = prop.getProperty("ACCOUNTKEY"); 32 | } 33 | catch(Exception e){ 34 | e.printStackTrace(); 35 | prop = null; 36 | } 37 | } 38 | 39 | 40 | public ArrayList search(String query, String top, String es_index, String es_doc_type, String es_server){ 41 | System.out.println("Query: " + query); 42 | 43 | if (this.prop == null){ 44 | System.out.println("Error: config file is not loaded yet"); 45 | return null; 46 | } 47 | 48 | Download download = new Download(query, es_index, es_doc_type, es_server); 49 | 50 | ArrayList results = new ArrayList(); 51 | query = query.replaceAll(" ", "%20"); 52 | byte[] accountKeyBytes = Base64.encodeBase64((this.accountKey + ":" + this.accountKey).getBytes()); 53 | String accountKeyEnc = new String(accountKeyBytes); 54 | URL query_url; 55 | try { 56 | int chunk = 50; 57 | if (Integer.valueOf(top) < 50) 58 | chunk = 
Integer.valueOf(top); 59 | int skip_index = 0; 60 | while(chunk > 0){ 61 | query_url = new URL("https://api.datamarket.azure.com/Data.ashx/Bing/Search/v1/Web?Adult=%27Off%27&$skip=" + String.valueOf(skip_index*50) + "&Query=%27" + query + "%20filetype:html" + "%27&$top=" + String.valueOf(chunk)); 62 | System.out.println(query_url); 63 | 64 | HttpURLConnection conn = (HttpURLConnection)query_url.openConnection(); 65 | conn.setRequestMethod("GET"); 66 | conn.setRequestProperty("Authorization", "Basic " + accountKeyEnc); 67 | 68 | BufferedReader br = new BufferedReader(new InputStreamReader((conn.getInputStream()))); 69 | String output = ""; 70 | String line; 71 | while ((line = br.readLine()) != null) { 72 | output = output + line; 73 | } 74 | conn.disconnect(); 75 | 76 | DocumentBuilderFactory docBuilderFactory = DocumentBuilderFactory.newInstance(); 77 | DocumentBuilder docBuilder = docBuilderFactory.newDocumentBuilder(); 78 | InputSource is = new InputSource(new StringReader(output)); 79 | Document doc = docBuilder.parse(is); 80 | NodeList urls = doc.getElementsByTagName("d:Url"); 81 | int totalUrls = urls.getLength(); 82 | 83 | for (int i=0; i search(String query, String top, String es_index, String es_doc_type, String es_server){ 42 | System.out.println("Query: " + query); 43 | int nTop = Integer.valueOf(top); 44 | 45 | if (this.prop == null){ 46 | System.out.println("Error: config file is not loaded yet"); 47 | return null; 48 | } 49 | 50 | Download download = new Download(query, es_index, es_doc_type, es_server); 51 | 52 | ArrayList results = new ArrayList(); 53 | ArrayList titles = new ArrayList(); 54 | ArrayList snippets = new ArrayList(); 55 | query = "&num=" + String.valueOf(step) + "&key=" + accountKey + "&cx=" + cseID + "&q=" + query.replaceAll(" ", "%20"); 56 | URL query_url; 57 | 58 | try { 59 | int step = 10; //10 is the maximum number of results to return in each query 60 | for (int start = 1; start < nTop; start += step){ 61 | query_url = new URL("https://www.googleapis.com/customsearch/v1?start=" + String.valueOf(start) + query); 62 | System.out.println(query_url); 63 | 64 | HttpURLConnection conn = (HttpURLConnection)query_url.openConnection(); 65 | conn.setRequestMethod("GET"); 66 | BufferedReader br = new BufferedReader(new InputStreamReader((conn.getInputStream()))); 67 | String output = ""; 68 | String line; 69 | while ((line = br.readLine()) != null) { 70 | output = output + line; 71 | } 72 | conn.disconnect(); 73 | 74 | JSONObject obj = new JSONObject(output); 75 | JSONArray items = obj.getJSONArray("items"); 76 | 77 | for(int i=0; i < items.length(); ++i){ 78 | JSONObject item = items.getJSONObject(i); 79 | String link = (String)item.get("link"); 80 | results.add(link); 81 | download.addTask(link); 82 | 83 | titles.add((String)item.get("title")); 84 | snippets.add((String)item.get("snippet")); 85 | //All keys of the json object: snippet, htmlFormattedUrl, htmlTitle 86 | //kind, pagemap, displayLink, link, htmlSnippet, title, formatedUrl, cacheId 87 | } 88 | } 89 | 90 | } 91 | catch (MalformedURLException e) { 92 | e.printStackTrace(); 93 | } 94 | catch (IOException e) { 95 | e.printStackTrace(); 96 | } 97 | catch (Exception e){ 98 | e.printStackTrace(); 99 | } 100 | 101 | download.shutdown(); 102 | System.out.println("Number of results: " + String.valueOf(results.size())); 103 | 104 | //TODO: Return titles and snippets 105 | return results; 106 | } 107 | 108 | public static void main(String[] args) { 109 | 110 | String query = ""; //default 111 | String top = "50"; 
//default 112 | String es_index = "memex"; 113 | String es_doc_type = "page"; 114 | String es_server = "localhost"; 115 | 116 | int i = 0; 117 | while (i < args.length){ 118 | String arg = args[i]; 119 | if(arg.equals("-q")){ 120 | query = args[++i]; 121 | } else if(arg.equals("-t")){ 122 | top = args[++i]; 123 | } else if(arg.equals("-i")){ 124 | es_index = args[++i]; 125 | } else if(arg.equals("-d")){ 126 | es_doc_type = args[++i]; 127 | } else if(arg.equals("-s")){ 128 | es_server = args[++i]; 129 | }else { 130 | System.out.println("Unrecognized option"); 131 | break; 132 | } 133 | ++i; 134 | } 135 | 136 | System.out.println("Query = " + query); 137 | System.out.println("Get the top " + top + " results"); 138 | 139 | GoogleSearch bs = new GoogleSearch(); 140 | bs.search(query, top, es_index, es_doc_type, es_server); 141 | } 142 | } 143 | -------------------------------------------------------------------------------- /vis/html/crawlervis.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Crawler 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 |
40 | 41 |
42 |
43 | Crawler: 44 |
45 |
46 |
47 | 48 | 49 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 |
61 |
62 | 63 |
64 | Filter: 65 | 66 |

Cap:

67 | 68 |
69 | 70 | 71 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 |
83 |
84 | 85 |
86 | 87 |
88 | Pages 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | Tags 98 |
99 |
100 |
101 |
102 |
103 | 104 |
105 |
106 |
107 | 108 |
109 | 110 |
111 | Terms 112 |
113 |
114 |
115 |
116 |
117 |
118 | 119 | 125 | 126 | 127 | -------------------------------------------------------------------------------- /seeds_generator/src/main/java/page_downloader/Crawl.java: -------------------------------------------------------------------------------- 1 | import java.util.concurrent.Executors; 2 | import java.util.concurrent.ExecutorService; 3 | import java.io.FileReader; 4 | import java.io.BufferedReader; 5 | import java.io.IOException; 6 | import java.lang.InterruptedException; 7 | import java.util.concurrent.ExecutionException; 8 | import java.util.concurrent.TimeUnit; 9 | import org.elasticsearch.client.transport.TransportClient; 10 | import org.elasticsearch.common.transport.InetSocketTransportAddress; 11 | import org.elasticsearch.client.Client; 12 | import java.util.ArrayList; 13 | import java.util.Map; 14 | 15 | import org.elasticsearch.common.xcontent.XContentFactory; 16 | import org.elasticsearch.action.search.SearchResponse; 17 | import org.elasticsearch.search.SearchHit; 18 | import org.elasticsearch.action.search.SearchType; 19 | import org.elasticsearch.index.query.QueryBuilders; 20 | import org.elasticsearch.index.query.QueryBuilder; 21 | import org.elasticsearch.index.query.MissingFilterBuilder; 22 | import org.elasticsearch.index.query.FilterBuilders; 23 | import org.elasticsearch.action.index.IndexRequest; 24 | import org.elasticsearch.action.update.UpdateRequest; 25 | 26 | public class Crawl { 27 | 28 | private ArrayList urls = null; 29 | private String es_index = "memex"; 30 | private String es_doc_type = "page"; 31 | private String es_host = "localhost"; 32 | private Client client = null; 33 | private int poolSize = 100; 34 | private ExecutorService crawlForwardService = Executors.newFixedThreadPool(poolSize); 35 | private ExecutorService crawlBackwardService = Executors.newFixedThreadPool(poolSize); 36 | private int MAXSIZE = 100000; 37 | 38 | public Crawl(String es_index, String es_doc_type, String es_host){ 39 | if(es_host.isEmpty()) 40 | es_host = "localhost"; 41 | else { 42 | String[] parts = es_host.split(":"); 43 | if (parts.length == 2) 44 | es_host = parts[0]; 45 | else if(parts.length == 3) 46 | es_host = parts[1]; 47 | 48 | es_host = es_host.replaceAll("/",""); 49 | } 50 | 51 | this.es_host = es_host; 52 | 53 | this.client = new TransportClient().addTransportAddress(new InetSocketTransportAddress(es_host, 9300)); 54 | 55 | if(!es_index.isEmpty()) 56 | this.es_index = es_index; 57 | if(!es_doc_type.isEmpty()) 58 | this.es_doc_type = es_doc_type; 59 | 60 | } 61 | 62 | public void addForwardCrawlTask(ArrayList urls, String top){ 63 | try{ 64 | for (String f_url : urls) { 65 | SearchResponse searchResponse = client.prepareSearch(this.es_index) 66 | .setTypes(this.es_doc_type) 67 | .setSearchType(SearchType.DFS_QUERY_THEN_FETCH) 68 | .setFetchSource(new String[]{"url"}, null) 69 | .setQuery(QueryBuilders.termQuery("url", f_url)) 70 | .setSize(this.MAXSIZE) 71 | .setFrom(0).setExplain(true) 72 | .execute() 73 | .actionGet(); 74 | 75 | for (SearchHit hit : searchResponse.getHits()) { 76 | UpdateRequest updateRequest = new UpdateRequest(this.es_index, this.es_doc_type, hit.getId()) 77 | .doc(XContentFactory.jsonBuilder() 78 | .startObject() 79 | .field("crawled_forward", 1) 80 | .endObject()); 81 | this.client.update(updateRequest).get(); 82 | } 83 | } 84 | 85 | crawlForwardService.execute(new CrawlerInterface(urls, null, "forward", top, this.es_index, this.es_doc_type, this.es_host, this.client)); 86 | } catch (IOException e1) { 87 | // TODO Auto-generated catch 
block 88 | e1.printStackTrace(); 89 | } catch (InterruptedException e2) { 90 | // TODO Auto-generated catch block 91 | e2.printStackTrace(); 92 | } catch (ExecutionException e3) { 93 | // TODO Auto-generated catch block 94 | e3.printStackTrace(); 95 | } 96 | } 97 | 98 | public void addBackwardCrawlTask(ArrayList urls, String top){ 99 | try{ 100 | MissingFilterBuilder filter=FilterBuilders.missingFilter("crawled_backward"); 101 | QueryBuilder qb = QueryBuilders.filteredQuery(QueryBuilders.matchAllQuery(),filter); 102 | SearchResponse searchResponse = client.prepareSearch(this.es_index) 103 | .setTypes(this.es_doc_type) 104 | .setSearchType(SearchType.DFS_QUERY_THEN_FETCH) 105 | .setFetchSource(new String[]{"url", "crawled_backward"}, null) 106 | .setQuery(qb) 107 | .setSize(this.MAXSIZE) 108 | .setFrom(0).setExplain(true) 109 | .execute() 110 | .actionGet(); 111 | 112 | ArrayList not_crawled = new ArrayList(); 113 | for (SearchHit hit : searchResponse.getHits()) { 114 | Map map = hit.getSource(); 115 | String url = (String)map.get("url"); 116 | if(urls.contains(url)){ 117 | not_crawled.add(url); 118 | UpdateRequest updateRequest = new UpdateRequest(this.es_index, this.es_doc_type, hit.getId()) 119 | .doc(XContentFactory.jsonBuilder() 120 | .startObject() 121 | .field("crawled_backward", 1) 122 | .endObject()); 123 | this.client.update(updateRequest).get(); 124 | } 125 | } 126 | 127 | crawlBackwardService.execute(new CrawlerInterface(not_crawled, null, "backward", top, this.es_index, this.es_doc_type, this.es_host, this.client)); 128 | 129 | } catch (IOException e1) { 130 | // TODO Auto-generated catch block 131 | e1.printStackTrace(); 132 | } catch (InterruptedException e2) { 133 | // TODO Auto-generated catch block 134 | e2.printStackTrace(); 135 | } catch (ExecutionException e3) { 136 | // TODO Auto-generated catch block 137 | e3.printStackTrace(); 138 | } 139 | 140 | } 141 | 142 | public void shutdown(){ 143 | try { 144 | crawlForwardService.shutdown(); 145 | crawlBackwardService.shutdown(); 146 | crawlForwardService.awaitTermination(60 , TimeUnit.SECONDS); 147 | crawlBackwardService.awaitTermination(60 , TimeUnit.SECONDS); 148 | System.out.println("SHUTDOWN"); 149 | this.client.close(); 150 | } catch (InterruptedException e) { 151 | e.printStackTrace(); 152 | } 153 | } 154 | 155 | 156 | } 157 | -------------------------------------------------------------------------------- /vis/html/js/sigslot_core.js: -------------------------------------------------------------------------------- 1 | // Filename: sigslot_core.js 2 | // Purpose: provides an abstracted event handling system 3 | // Classes: NW_sigslot_registry, NW_SignalObj 4 | // Global Objects: __sig__ (aka, __signals_registry__) 5 | // Dependencies: none 6 | // Author: Alex Russell (slightlyoff@crhomium.org) 7 | 8 | // class definition for signal objects 9 | function NW_SignalObj(obj, fp){ 10 | this.fp = fp; 11 | this.obj = obj; 12 | this.slots = new Array(); 13 | this.addSlot = function(pobj, pfp){ 14 | var slot = null; 15 | if(__sig__.isSigFP(pfp)){ 16 | slot = __sig__.getSig(pfp); 17 | }else{ 18 | // whee! recursive data structures! 
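// (addSlot wraps each slot in its own NW_SignalObj and registers it, so a slot can act as a signal in turn and emit() can cascade down chains of slots.)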
19 | slot = new NW_SignalObj(pobj, pfp); 20 | __sig__.addSig(slot); 21 | } 22 | this.slots[this.slots.length]=slot; 23 | } 24 | 25 | this.rmSlot = function(pobj, pfp){ 26 | if(__sig__.isSigFP(pfp)){ 27 | var tslot = __sig__.getSig(pfp); 28 | for(var x in this.slots){ 29 | if(this.slots[x]==tslot){ 30 | delete this.slots[this.slots.length]; 31 | // make sure we only remove the first instance 32 | return true; 33 | } 34 | } 35 | }else{return false;} 36 | } 37 | } 38 | 39 | function NW_sigslot_registry(){ 40 | this.uID = 0; 41 | // conArr contains an arry of signal objects 42 | this.connArr = new Array(); 43 | 44 | // this method provides the mapping between signals and slots 45 | this.connect = function(sigObj, sigFP, slotObj, slotFP){ 46 | var isFound = this.isSigFP(sigFP); 47 | if(!isFound){ 48 | this.addSig(new NW_SignalObj(sigObj, sigFP)); 49 | } 50 | var csig = this.getSig(sigFP); 51 | csig.addSlot(slotObj, slotFP); 52 | } 53 | 54 | // this method provides the mapping between signals and slots 55 | this.disconnect = function(sigObj, sigFP, slotObj, slotFP){ 56 | var csig = this.getSig(sigFP); 57 | csig.rmSlot(slotObj, slotFP); 58 | } 59 | 60 | this.addSig = function(sigObj){ 61 | var cUID = this.uID++;// should be atomic anyway, but make sure 62 | this.connArr[cUID]=sigObj; 63 | } 64 | 65 | this.isSigFP = function(fp){ 66 | var isFound = false; 67 | for(var x in this.connArr){ if(this.connArr[x].fp == fp){isFound = true;} } 68 | return isFound; 69 | } 70 | 71 | this.getSig = function(fp){ 72 | for(var x in this.connArr){ if(this.connArr[x].fp == fp){return this.connArr[x];} } 73 | return null; 74 | } 75 | 76 | this.emit = function(fp){ 77 | for(x in this.connArr){ 78 | // find the signal object 79 | if(this.connArr[x].fp==fp){ 80 | var csig = this.connArr[x]; 81 | var args = arguments; 82 | var alen = args.length; 83 | // unroll the args array 84 | if(alen == 1){ 85 | (csig.fp).call(csig.obj); 86 | for(y in csig.slots){ 87 | this.emit(csig.slots[y].fp); 88 | } 89 | }else if(alen == 2){ 90 | (csig.fp).call(csig.obj, args[1]); 91 | for(y in csig.slots){ 92 | this.emit(csig.slots[y].fp, args[1]); 93 | } 94 | }else if(alen == 3){ 95 | (csig.fp).call(csig.obj, args[1], args[2]); 96 | for(y in csig.slots){ 97 | this.emit(csig.slots[y].fp, args[1], args[2]); 98 | } 99 | }else if(alen == 4){ 100 | (csig.fp).call(csig.obj, args[1], args[2], args[3]); 101 | for(y in csig.slots){ 102 | this.emit(csig.slots[y].fp, args[1], args[2], args[3]); 103 | } 104 | 105 | }else if(alen == 5){ 106 | (csig.fp).call(csig.obj, args[1], args[2], args[3], args[4]); 107 | for(y in csig.slots){ 108 | this.emit(csig.slots[y].fp, args[1], args[2], args[3], args[4]); 109 | } 110 | }else if(alen == 6){ 111 | (csig.fp).call(csig.obj, args[1], args[2], args[3], args[4], args[5]); 112 | for(y in csig.slots){ 113 | this.emit(csig.slots[y].fp, args[1], args[2], args[3], args[4], args[5]); 114 | } 115 | }else if(alen == 7){ 116 | (csig.fp).call(csig.obj, args[1], args[2], args[3], args[4], args[5], args[6]); 117 | for(y in csig.slots){ 118 | this.emit(csig.slots[y].fp, args[1], args[2], args[3], args[4], args[5], args[6]); 119 | } 120 | }else if(alen == 8){ 121 | (csig.fp).call(csig.obj, args[1], args[2], args[3], args[4], args[5], args[6], args[7]); 122 | for(y in csig.slots){ 123 | this.emit(csig.slots[y].fp, args[1], args[2], args[3], args[4], args[5], args[6], args[7]); 124 | } 125 | }else if(alen == 9){ 126 | (csig.fp).call(csig.obj, args[1], args[2], args[3], args[4], args[5], args[6], args[7], args[8]); 127 | for(y in 
csig.slots){ 128 | this.emit(csig.slots[y].fp, args[1], args[2], args[3], args[4], args[5], args[6], args[7], args[8]); 129 | } 130 | }else if(alen == 10){ 131 | (csig.fp).call(csig.obj, args[1], args[2], args[3], args[4], args[5], args[6], args[7], args[8], args[9]); 132 | for(y in csig.slots){ 133 | this.emit(csig.slots[y].fp, args[1], args[2], args[3], args[4], args[5], args[6], args[7], args[8], args[9]); 134 | } 135 | }else if(alen == 11){ 136 | (csig.fp).call(csig.obj, args[1], args[2], args[3], args[4], args[5], args[6], args[7], args[8], args[9], args[10]); 137 | for(y in csig.slots){ 138 | this.emit(csig.slots[y].fp, args[1], args[2], args[3], args[4], args[5], args[6], args[7], args[8], args[9], args[10]); 139 | } 140 | }else if(alen == 12){ 141 | (csig.fp).call(csig.obj, args[1], args[2], args[3], args[4], args[5], args[6], args[7], args[8], args[9], args[10], args[11]); 142 | for(y in csig.slots){ 143 | this.emit(csig.slots[y].fp, args[1], args[2], args[3], args[4], args[5], args[6], args[7], args[8], args[9], args[10], args[11]); 144 | } 145 | }else if(alen == 13){ 146 | // if your function needs more than 12 args, you need to learn how to write better code =) 147 | (csig.fp).call(csig.obj, args[1], args[2], args[3], args[4], args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12]); 148 | for(y in csig.slots){ 149 | this.emit(csig.slots[y].fp, args[1], args[2], args[3], args[4], args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12]); 150 | } 151 | } 152 | } 153 | } 154 | } 155 | } 156 | 157 | var __signals_registry__ = new NW_sigslot_registry(); 158 | var __sig__ = __signals_registry__; // alias 159 | -------------------------------------------------------------------------------- /elastic/get_documents.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | from os import environ 3 | import sys 4 | from config import es as default_es 5 | 6 | def get_documents(terms, term_field, fields=["text"], es_index='memex', es_doc_type='page', es=None): 7 | if es is None: 8 | es = default_es 9 | 10 | results = {} 11 | 12 | if len(terms) > 0: 13 | 14 | for term in terms: 15 | query = { 16 | "query": { 17 | "term": { 18 | term_field: term 19 | } 20 | }, 21 | "fields": fields 22 | } 23 | 24 | res = es.search(body=query, 25 | index=es_index, 26 | doc_type=es_doc_type) 27 | 28 | if res['hits']['hits']: 29 | hits = res['hits']['hits'] 30 | 31 | records = [] 32 | for hit in hits: 33 | record = {} 34 | if not hit.get('fields') is None: 35 | record = hit['fields'] 36 | record['id'] =hit['_id'] 37 | records.append(record) 38 | results[term] = records 39 | 40 | return results 41 | 42 | 43 | def get_more_like_this(urls, fields=[], pageCount=200, es_index='memex', es_doc_type='page', es=None): 44 | if es is None: 45 | es = default_es 46 | 47 | docs = [{"_index": es_index, "_type": es_doc_type, "_id": url} for url in urls] 48 | 49 | with open(environ['DDT_HOME']+'/elastic/stopwords.txt', 'r') as f: 50 | stopwords = [word.strip() for word in f.readlines()] 51 | 52 | query = { 53 | "query":{ 54 | "more_like_this": { 55 | "fields" : ["text"], 56 | "docs": docs, 57 | "min_term_freq": 1, 58 | "stop_words": stopwords 59 | } 60 | }, 61 | "fields": fields, 62 | "size": pageCount 63 | } 64 | 65 | res = es.search(body=query, index = es_index, doc_type = es_doc_type) 66 | hits = res['hits']['hits'] 67 | 68 | results = [] 69 | for hit in hits: 70 | fields = hit['fields'] 71 | fields['id'] = hit['_id'] 72 | fields['score'] 
= hit['_score'] 73 | results.append(fields) 74 | 75 | return results 76 | 77 | def get_most_recent_documents(opt_maxNumberOfPages = 200, mapping=None, fields = [], opt_filter = None, es_index = 'memex', es_doc_type = 'page', es = None): 78 | 79 | if mapping is None: 80 | print "No mappings found" 81 | return [] 82 | 83 | if es is None: 84 | es = default_es 85 | 86 | query = { 87 | "size": opt_maxNumberOfPages, 88 | "sort": [ 89 | { 90 | mapping["timestamp"]: { 91 | "order": "desc" 92 | } 93 | } 94 | ] 95 | } 96 | 97 | match_q = { 98 | "match_all": {} 99 | } 100 | 101 | if mapping.get("content_type") is not None: 102 | match_q = { 103 | "match": { 104 | mapping["content_type"]: "text/html" 105 | } 106 | } 107 | 108 | 109 | if opt_filter is None: 110 | query["query"] = { 111 | "filtered": { 112 | "query": match_q, 113 | "filter":{ 114 | "exists": { 115 | "field": mapping['text'] 116 | } 117 | } 118 | } 119 | } 120 | else: 121 | query["query"] = { 122 | "query_string": { 123 | "query": "(" + mapping['text'] + ":" + opt_filter.replace('"', '\\"') + ")" 124 | } 125 | } 126 | 127 | if len(fields) > 0: 128 | query["fields"] = fields 129 | 130 | res = es.search(body=query, index = es_index, doc_type = es_doc_type) 131 | hits = res['hits']['hits'] 132 | 133 | results = [] 134 | for hit in hits: 135 | fields = hit['fields'] 136 | fields['id'] = hit['_id'] 137 | results.append(fields) 138 | 139 | return results 140 | 141 | def get_all_ids(pageCount = 100000, fields=[], es_index = 'memex', es_doc_type = 'page', es = None): 142 | if es is None: 143 | es = default_es 144 | 145 | query = { 146 | "query": { 147 | "match_all": {} 148 | }, 149 | "fields": fields 150 | } 151 | 152 | try: 153 | res = es.search(body=query, index = es_index, doc_type = es_doc_type, size = pageCount, request_timeout=600) 154 | hits = res['hits']['hits'] 155 | 156 | results = [] 157 | for hit in hits: 158 | fields = hit['fields'] 159 | fields['id'] = hit['_id'] 160 | results.append(fields) 161 | 162 | return results 163 | except Exception: 164 | print "Unexpected error:", sys.exc_info()[0] 165 | print es_index 166 | return [] 167 | 168 | def get_documents_by_id(ids=[], fields=[], es_index = 'memex', es_doc_type = 'page', es = None): 169 | if es is None: 170 | es = default_es 171 | 172 | query = { 173 | "query": { 174 | "ids": { 175 | "values": ids 176 | } 177 | }, 178 | "fields": fields 179 | } 180 | 181 | res = es.search(body=query, index = es_index, doc_type = es_doc_type, size=len(ids)) 182 | 183 | hits = res['hits']['hits'] 184 | 185 | results = [] 186 | for hit in hits: 187 | if hit.get('fields'): 188 | fields = hit['fields'] 189 | fields['id'] = hit['_id'] 190 | results.append(fields) 191 | return results 192 | 193 | def get_plotting_data(pageCount=200, es_index = 'memex', es_doc_type = 'page', es = None): 194 | if es is None: 195 | es = default_es 196 | 197 | res = es.search(index=es_index, doc_type = es_doc_type, size=pageCount, fields=["retrieved", "url", "tag", "query"]) 198 | 199 | fields = [] 200 | for item in res['hits']['hits']: 201 | if item['fields'].get('tag') is not None: 202 | if "" in item['fields']['tag']: 203 | item['fields'].pop('tag') 204 | fields.append(item['fields']) 205 | 206 | return fields 207 | 208 | if __name__ == "__main__": 209 | urls = [] 210 | with open(environ['MEMEX_HOME']+'/seed_crawler/seeds_generator/results.txt', 'r') as f: 211 | urls = f.readlines() 212 | urls = [url.strip() for url in urls] 213 | 214 | docs = get_documents(urls, 'url') 215 |
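As a quick orientation to the helpers above, here is a minimal usage sketch for `get_documents` (a sketch only: the sample terms, the `query` term field, and the printed fields are illustrative assumptions, not part of the module):

```
# Illustrative sketch, not part of the module: look up pages whose `query`
# field matches each search term, retrieving their url and tag fields.
from elastic.get_documents import get_documents

results = get_documents(terms=['apple', 'banana'],
                        term_field='query',
                        fields=['url', 'tag'],
                        es_index='memex',
                        es_doc_type='page')

for term, records in results.items():
    for record in records:
        # Elasticsearch returns each requested field as a list; the helper
        # adds the document id under 'id'.
        print term, record['id'], record.get('url', [''])[0]
```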
-------------------------------------------------------------------------------- /vis/html/css/dropdowns-enhancement.min.css: -------------------------------------------------------------------------------- 1 | .dropdown-menu>li>label{display:block;padding:3px 20px;clear:both;font-weight:400;line-height:1.42857143;color:#333;white-space:nowrap}.dropdown-menu>li>label:hover,.dropdown-menu>li>label:focus{text-decoration:none;color:#262626;background-color:#f5f5f5}.dropdown-menu>li>input:checked~label,.dropdown-menu>li>input:checked~label:hover,.dropdown-menu>li>input:checked~label:focus,.dropdown-menu>.active>label,.dropdown-menu>.active>label:hover,.dropdown-menu>.active>label:focus{color:#fff;text-decoration:none;outline:0;background-color:#428bca}.dropdown-menu>li>input[disabled]~label,.dropdown-menu>li>input[disabled]~label:hover,.dropdown-menu>li>input[disabled]~label:focus,.dropdown-menu>.disabled>label,.dropdown-menu>.disabled>label:hover,.dropdown-menu>.disabled>label:focus{color:#999}.dropdown-menu>li>input[disabled]~label:hover,.dropdown-menu>li>input[disabled]~label:focus,.dropdown-menu>.disabled>label:hover,.dropdown-menu>.disabled>label:focus{text-decoration:none;background-color:transparent;background-image:none;filter:progid:DXImageTransform.Microsoft.gradient(enabled=false);cursor:not-allowed}.dropdown-menu>li>label{margin-bottom:0;cursor:pointer}.dropdown-menu>li>input[type=radio],.dropdown-menu>li>input[type=checkbox]{display:none;position:absolute;top:-9999em;left:-9999em}.dropdown-menu>li>label:focus,.dropdown-menu>li>input:focus~label{outline:thin dotted;outline:5px auto -webkit-focus-ring-color;outline-offset:-2px}.dropdown-menu.pull-right{right:0;left:auto}.dropdown-menu.pull-top{bottom:100%;top:auto;margin:0 0 2px;-webkit-box-shadow:0 -6px 12px rgba(0,0,0,.175);box-shadow:0 -6px 12px rgba(0,0,0,.175)}.dropdown-menu.pull-center{right:50%;left:auto}.dropdown-menu.pull-middle{right:100%;margin:0 2px 0 0;box-shadow:-5px 0 10px rgba(0,0,0,.2);left:auto}.dropdown-menu.pull-middle.pull-right{right:auto;left:100%;margin:0 0 0 2px;box-shadow:5px 0 10px rgba(0,0,0,.2)}.dropdown-menu.pull-middle.pull-center{right:50%;margin:0;box-shadow:0 0 10px rgba(0,0,0,.2)}.dropdown-menu.bullet{margin-top:8px}.dropdown-menu.bullet:before{width:0;height:0;content:'';display:inline-block;position:absolute;border-color:transparent;border-style:solid;-webkit-transform:rotate(360deg);border-width:0 7px 7px;border-bottom-color:#ccc;border-bottom-color:rgba(0,0,0,.15);top:-7px;left:9px}.dropdown-menu.bullet:after{width:0;height:0;content:'';display:inline-block;position:absolute;border-color:transparent;border-style:solid;-webkit-transform:rotate(360deg);border-width:0 6px 
6px;border-bottom-color:#fff;top:-6px;left:10px}.dropdown-menu.bullet.pull-right:before{left:auto;right:9px}.dropdown-menu.bullet.pull-right:after{left:auto;right:10px}.dropdown-menu.bullet.pull-top{margin-top:0;margin-bottom:8px}.dropdown-menu.bullet.pull-top:before{top:auto;bottom:-7px;border-bottom-width:0;border-top-width:7px;border-top-color:#ccc;border-top-color:rgba(0,0,0,.15)}.dropdown-menu.bullet.pull-top:after{top:auto;bottom:-6px;border-bottom:0;border-top-width:6px;border-top-color:#fff}.dropdown-menu.bullet.pull-center:before{left:auto;right:50%;margin-right:-7px}.dropdown-menu.bullet.pull-center:after{left:auto;right:50%;margin-right:-6px}.dropdown-menu.bullet.pull-middle{margin-right:8px}.dropdown-menu.bullet.pull-middle:before{top:50%;left:100%;right:auto;margin-top:-7px;border-right-width:0;border-bottom-color:transparent;border-top-width:7px;border-left-color:#ccc;border-left-color:rgba(0,0,0,.15)}.dropdown-menu.bullet.pull-middle:after{top:50%;left:100%;right:auto;margin-top:-6px;border-right-width:0;border-bottom-color:transparent;border-top-width:6px;border-left-color:#fff}.dropdown-menu.bullet.pull-middle.pull-right{margin-right:0;margin-left:8px}.dropdown-menu.bullet.pull-middle.pull-right:before{left:-7px;border-left-width:0;border-right-width:7px;border-right-color:#ccc;border-right-color:rgba(0,0,0,.15)}.dropdown-menu.bullet.pull-middle.pull-right:after{left:-6px;border-left-width:0;border-right-width:6px;border-right-color:#fff}.dropdown-menu.bullet.pull-middle.pull-center{margin-left:0;margin-right:0}.dropdown-menu.bullet.pull-middle.pull-center:before{border:0;display:none}.dropdown-menu.bullet.pull-middle.pull-center:after{border:0;display:none}.dropdown-submenu{position:relative}.dropdown-submenu>.dropdown-menu{top:0;left:100%;margin-top:-6px;margin-left:-1px;border-top-left-radius:0}.dropdown-submenu>a:before{display:block;float:right;width:0;height:0;content:"";margin-top:6px;margin-right:-8px;border-width:4px 0 4px 4px;border-style:solid;border-left-style:dashed;border-top-color:transparent;border-bottom-color:transparent}@media (max-width:767px){.navbar-nav .dropdown-submenu>a:before{margin-top:8px;border-color:inherit;border-style:solid;border-width:4px 4px 0;border-left-color:transparent;border-right-color:transparent}.navbar-nav .dropdown-submenu>a{padding-left:40px}.navbar-nav>.open>.dropdown-menu>.dropdown-submenu>.dropdown-menu>li>a,.navbar-nav>.open>.dropdown-menu>.dropdown-submenu>.dropdown-menu>li>label{padding-left:35px}.navbar-nav>.open>.dropdown-menu>.dropdown-submenu>.dropdown-menu>li>.dropdown-menu>li>a,.navbar-nav>.open>.dropdown-menu>.dropdown-submenu>.dropdown-menu>li>.dropdown-menu>li>label{padding-left:45px}.navbar-nav>.open>.dropdown-menu>.dropdown-submenu>.dropdown-menu>li>.dropdown-menu>li>.dropdown-menu>li>a,.navbar-nav>.open>.dropdown-menu>.dropdown-submenu>.dropdown-menu>li>.dropdown-menu>li>.dropdown-menu>li>label{padding-left:55px}.navbar-nav>.open>.dropdown-menu>.dropdown-submenu>.dropdown-menu>li>.dropdown-menu>li>.dropdown-menu>li>.dropdown-menu>li>a,.navbar-nav>.open>.dropdown-menu>.dropdown-submenu>.dropdown-menu>li>.dropdown-menu>li>.dropdown-menu>li>.dropdown-menu>li>label{padding-left:65px}.navbar-nav>.open>.dropdown-menu>.dropdown-submenu>.dropdown-menu>li>.dropdown-menu>li>.dropdown-menu>li>.dropdown-menu>li>.dropdown-menu>li>a,.navbar-nav>.open>.dropdown-menu>.dropdown-submenu>.dropdown-menu>li>.dropdown-menu>li>.dropdown-menu>li>.dropdown-menu>li>.dropdown-menu>li>label{padding-left:75px}}.navbar-default .navbar-nav 
.open>.dropdown-menu>.dropdown-submenu.open>a,.navbar-default .navbar-nav .open>.dropdown-menu>.dropdown-submenu.open>a:hover,.navbar-default .navbar-nav .open>.dropdown-menu>.dropdown-submenu.open>a:focus{background-color:#e7e7e7;color:#555}@media (max-width:767px){.navbar-default .navbar-nav .open>.dropdown-menu>.dropdown-submenu.open>a:before{border-top-color:#555}}.navbar-inverse .navbar-nav .open>.dropdown-menu>.dropdown-submenu.open>a,.navbar-inverse .navbar-nav .open>.dropdown-menu>.dropdown-submenu.open>a:hover,.navbar-inverse .navbar-nav .open>.dropdown-menu>.dropdown-submenu.open>a:focus{background-color:#080808;color:#fff}@media (max-width:767px){.navbar-inverse .navbar-nav .open>.dropdown-menu>.dropdown-submenu.open>a:before{border-top-color:#fff}} -------------------------------------------------------------------------------- /elastic/get_mtermvectors.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from sklearn.feature_extraction import DictVectorizer 3 | import nltk 4 | import math 5 | from sets import Set 6 | import time 7 | import numpy as np 8 | import operator 9 | 10 | from config import es as default_es 11 | from elastic.get_documents import get_documents_by_id 12 | 13 | ENGLISH_STOPWORDS = set(nltk.corpus.stopwords.words('english')) 14 | MAX_TERMS = 2000 15 | 16 | def pos_filter(pos_tags=['NN', 'NNS', 'NNP', 'NNPS', 'VBN', 'JJ'], docterms=[]): 17 | tagged = nltk.pos_tag(docterms) 18 | valid_words = [tag[0] for tag in tagged if tag[1] in pos_tags] 19 | return valid_words 20 | 21 | def tfidf(tf, df, n_doc): 22 | idf = math.log(n_doc / float(df)) 23 | return tf * idf 24 | 25 | def terms_from_es_json(doc, rm_stopwords=True, rm_numbers=True, termstatistics = False, term_freq = 0, mapping=None, es=None): 26 | terms = {} 27 | docterms = doc["term_vectors"][mapping['text']]["terms"] 28 | n_doc = doc["term_vectors"][mapping['text']]["field_statistics"]["doc_count"] 29 | valid_words = docterms.keys() 30 | 31 | if rm_stopwords: 32 | valid_words = [k for k in valid_words if k not in ENGLISH_STOPWORDS and (len(k) > 2)] 33 | 34 | if rm_numbers: 35 | valid_words = [k for k in valid_words if not k.lstrip('-').replace('.','',1).replace(',','',1).isdigit()] 36 | 37 | if termstatistics == True: 38 | terms = {term: {'tfidf':tfidf(docterms[term]["term_freq"], docterms[term]["doc_freq"], n_doc), 39 | 'tf': docterms[term]["term_freq"], 40 | 'ttf': docterms[term]["ttf"], 41 | } for term in valid_words if docterms[term]["ttf"] > term_freq 42 | } 43 | else: 44 | terms = { term: {'tf': docterms[term]['term_freq']} for term in valid_words if docterms[term]["term_freq"] > term_freq} 45 | 46 | # Restrict the number of terms for large documents 47 | if len(terms.keys()) > MAX_TERMS: 48 | sorted_terms = [] 49 | if termstatistics == True: 50 | terms_tfidf = {term:terms[term]["tfidf"] for term in terms.keys()} 51 | sorted_terms = sorted(terms_tfidf.items(), key=operator.itemgetter(1), reverse=True) 52 | else: 53 | terms_tf = {term:terms[term]["tf"] for term in terms.keys()} 54 | sorted_terms = sorted(terms_tf.items(), key=operator.itemgetter(1), reverse=True) 55 | 56 | terms = {item[0]: terms[item[0]] for item in sorted_terms[0:MAX_TERMS]} 57 | 58 | return terms 59 | 60 | 61 | def getTermFrequency(all_hits, rm_stopwords=True, rm_numbers=True, pos_tags=[], term_freq=0, mapping=None, es_index='memex', es_doc_type='page', es=None): 62 | if es is None: 63 | es = default_es 64 | 65 | docs = [] 66 | stats = [] 67 | corpus = [] 68 | 69 | once 
= True 70 | for i in range(0, len(all_hits), 10): 71 | hits = all_hits[i:i+10] 72 | 73 | term_res = es.mtermvectors(index=es_index, 74 | doc_type=es_doc_type, 75 | fields=mapping['text'], 76 | ids=hits) 77 | 78 | for doc in term_res['docs']: 79 | if doc.get('term_vectors'): 80 | if mapping['text'] in doc['term_vectors']: 81 | docs.append(doc['_id']) 82 | res = terms_from_es_json(doc=doc, rm_stopwords=rm_stopwords, rm_numbers=rm_numbers, term_freq=term_freq, mapping=mapping) 83 | stats.append(res) 84 | 85 | tfs = [] 86 | for stat in stats: 87 | tf={} 88 | tf={k:stat[k]['tf'] for k in stat.keys()} 89 | tfs.append(tf) 90 | 91 | v_tf = DictVectorizer() 92 | data = v_tf.fit_transform(tfs).toarray() 93 | corpus = v_tf.get_feature_names() 94 | 95 | if len(pos_tags) > 0: 96 | filtered_words = pos_filter(pos_tags, corpus) 97 | indices = [corpus.index(term) for term in corpus if term not in filtered_words] 98 | corpus = np.delete(corpus, indices) 99 | corpus = corpus.tolist() 100 | data = np.delete(data, indices, 1) 101 | 102 | return [data, corpus, docs] 103 | 104 | 105 | def getTermStatistics(all_hits, rm_stopwords=True, rm_numbers=True, pos_tags=[], term_freq=0, num_terms=MAX_TERMS, mapping=None, es_index='memex', es_doc_type='page', es=None): 106 | if es is None: 107 | es = default_es 108 | 109 | stats = [] 110 | docs = [] 111 | 112 | ttf = {} 113 | for i in range(0, len(all_hits), 10): 114 | hits = all_hits[i:i+10] 115 | 116 | term_res = es.mtermvectors(index=es_index, 117 | doc_type=es_doc_type, 118 | term_statistics=True, 119 | fields=mapping['text'], 120 | ids=hits) 121 | 122 | for doc in term_res['docs']: 123 | if doc.get('term_vectors'): 124 | if mapping['text'] in doc['term_vectors']: 125 | docs.append(doc['_id']) 126 | res = terms_from_es_json(doc=doc, rm_stopwords=rm_stopwords, rm_numbers=rm_numbers, termstatistics=True, term_freq=term_freq, mapping=mapping) 127 | stats.append(res) 128 | for k in res.keys(): 129 | ttf[k] = res[k]['ttf'] 130 | 131 | 132 | tfidfs = [] 133 | tfs = [] 134 | for stat in stats: 135 | tfidf={k: stat[k]['tfidf'] for k in stat.keys()} 136 | tfidfs.append(tfidf) 137 | tf={k:stat[k]['tf'] for k in stat.keys()} 138 | tfs.append(tf) 139 | 140 | v_tfidf = DictVectorizer() 141 | v_tf = DictVectorizer() 142 | 143 | data = v_tfidf.fit_transform(tfidfs).toarray() 144 | corpus = v_tfidf.get_feature_names() 145 | tf_data = v_tf.fit_transform(tfs).toarray() 146 | 147 | if len(pos_tags) > 0: 148 | filtered_words = pos_filter(pos_tags, corpus) 149 | indices = [corpus.index(term) for term in corpus if term not in filtered_words] 150 | corpus = np.delete(corpus, indices) 151 | corpus = corpus.tolist() 152 | data = np.delete(data, indices, 1) 153 | tf_data = np.delete(tf_data, indices, 1) 154 | 155 | if len(corpus) > MAX_TERMS: 156 | mean_tfidf = np.mean(data, axis=0) 157 | indices = np.argsort(mean_tfidf)[::-1] 158 | corpus = [corpus[i] for i in indices] 159 | data = data[:, indices] 160 | tf_data = tf_data[:, indices] 161 | 162 | ttf = {key:value for key, value in ttf.iteritems() if key in corpus} 163 | 164 | result = [data, tf_data, ttf, corpus, docs] 165 | 166 | del tfidfs 167 | del tfs 168 | 169 | return result 170 | 171 | -------------------------------------------------------------------------------- /vis/html/libs/bootstrap-datetimepicker-4.15.35/css/bootstrap-datetimepicker.min.css: -------------------------------------------------------------------------------- 1 | /*! 
2 | * Datetimepicker for Bootstrap 3 3 | * version : 4.15.35 4 | * https://github.com/Eonasdan/bootstrap-datetimepicker/ 5 | */.bootstrap-datetimepicker-widget{list-style:none}.bootstrap-datetimepicker-widget.dropdown-menu{margin:2px 0;padding:4px;width:19em}@media (min-width:768px){.bootstrap-datetimepicker-widget.dropdown-menu.timepicker-sbs{width:38em}}@media (min-width:992px){.bootstrap-datetimepicker-widget.dropdown-menu.timepicker-sbs{width:38em}}@media (min-width:1200px){.bootstrap-datetimepicker-widget.dropdown-menu.timepicker-sbs{width:38em}}.bootstrap-datetimepicker-widget.dropdown-menu:before,.bootstrap-datetimepicker-widget.dropdown-menu:after{content:'';display:inline-block;position:absolute}.bootstrap-datetimepicker-widget.dropdown-menu.bottom:before{border-left:7px solid transparent;border-right:7px solid transparent;border-bottom:7px solid #ccc;border-bottom-color:rgba(0,0,0,0.2);top:-7px;left:7px}.bootstrap-datetimepicker-widget.dropdown-menu.bottom:after{border-left:6px solid transparent;border-right:6px solid transparent;border-bottom:6px solid white;top:-6px;left:8px}.bootstrap-datetimepicker-widget.dropdown-menu.top:before{border-left:7px solid transparent;border-right:7px solid transparent;border-top:7px solid #ccc;border-top-color:rgba(0,0,0,0.2);bottom:-7px;left:6px}.bootstrap-datetimepicker-widget.dropdown-menu.top:after{border-left:6px solid transparent;border-right:6px solid transparent;border-top:6px solid white;bottom:-6px;left:7px}.bootstrap-datetimepicker-widget.dropdown-menu.pull-right:before{left:auto;right:6px}.bootstrap-datetimepicker-widget.dropdown-menu.pull-right:after{left:auto;right:7px}.bootstrap-datetimepicker-widget .list-unstyled{margin:0}.bootstrap-datetimepicker-widget a[data-action]{padding:6px 0}.bootstrap-datetimepicker-widget a[data-action]:active{box-shadow:none}.bootstrap-datetimepicker-widget .timepicker-hour,.bootstrap-datetimepicker-widget .timepicker-minute,.bootstrap-datetimepicker-widget .timepicker-second{width:54px;font-weight:bold;font-size:1.2em;margin:0}.bootstrap-datetimepicker-widget button[data-action]{padding:6px}.bootstrap-datetimepicker-widget .btn[data-action="incrementHours"]::after{position:absolute;width:1px;height:1px;margin:-1px;padding:0;overflow:hidden;clip:rect(0, 0, 0, 0);border:0;content:"Increment Hours"}.bootstrap-datetimepicker-widget .btn[data-action="incrementMinutes"]::after{position:absolute;width:1px;height:1px;margin:-1px;padding:0;overflow:hidden;clip:rect(0, 0, 0, 0);border:0;content:"Increment Minutes"}.bootstrap-datetimepicker-widget .btn[data-action="decrementHours"]::after{position:absolute;width:1px;height:1px;margin:-1px;padding:0;overflow:hidden;clip:rect(0, 0, 0, 0);border:0;content:"Decrement Hours"}.bootstrap-datetimepicker-widget .btn[data-action="decrementMinutes"]::after{position:absolute;width:1px;height:1px;margin:-1px;padding:0;overflow:hidden;clip:rect(0, 0, 0, 0);border:0;content:"Decrement Minutes"}.bootstrap-datetimepicker-widget .btn[data-action="showHours"]::after{position:absolute;width:1px;height:1px;margin:-1px;padding:0;overflow:hidden;clip:rect(0, 0, 0, 0);border:0;content:"Show Hours"}.bootstrap-datetimepicker-widget .btn[data-action="showMinutes"]::after{position:absolute;width:1px;height:1px;margin:-1px;padding:0;overflow:hidden;clip:rect(0, 0, 0, 0);border:0;content:"Show Minutes"}.bootstrap-datetimepicker-widget .btn[data-action="togglePeriod"]::after{position:absolute;width:1px;height:1px;margin:-1px;padding:0;overflow:hidden;clip:rect(0, 0, 0, 
0);border:0;content:"Toggle AM/PM"}.bootstrap-datetimepicker-widget .btn[data-action="clear"]::after{position:absolute;width:1px;height:1px;margin:-1px;padding:0;overflow:hidden;clip:rect(0, 0, 0, 0);border:0;content:"Clear the picker"}.bootstrap-datetimepicker-widget .btn[data-action="today"]::after{position:absolute;width:1px;height:1px;margin:-1px;padding:0;overflow:hidden;clip:rect(0, 0, 0, 0);border:0;content:"Set the date to today"}.bootstrap-datetimepicker-widget .picker-switch{text-align:center}.bootstrap-datetimepicker-widget .picker-switch::after{position:absolute;width:1px;height:1px;margin:-1px;padding:0;overflow:hidden;clip:rect(0, 0, 0, 0);border:0;content:"Toggle Date and Time Screens"}.bootstrap-datetimepicker-widget .picker-switch td{padding:0;margin:0;height:auto;width:auto;line-height:inherit}.bootstrap-datetimepicker-widget .picker-switch td span{line-height:2.5;height:2.5em;width:100%}.bootstrap-datetimepicker-widget table{width:100%;margin:0}.bootstrap-datetimepicker-widget table td,.bootstrap-datetimepicker-widget table th{text-align:center;border-radius:4px}.bootstrap-datetimepicker-widget table th{height:20px;line-height:20px;width:20px}.bootstrap-datetimepicker-widget table th.picker-switch{width:145px}.bootstrap-datetimepicker-widget table th.disabled,.bootstrap-datetimepicker-widget table th.disabled:hover{background:none;color:#777;cursor:not-allowed}.bootstrap-datetimepicker-widget table th.prev::after{position:absolute;width:1px;height:1px;margin:-1px;padding:0;overflow:hidden;clip:rect(0, 0, 0, 0);border:0;content:"Previous Month"}.bootstrap-datetimepicker-widget table th.next::after{position:absolute;width:1px;height:1px;margin:-1px;padding:0;overflow:hidden;clip:rect(0, 0, 0, 0);border:0;content:"Next Month"}.bootstrap-datetimepicker-widget table thead tr:first-child th{cursor:pointer}.bootstrap-datetimepicker-widget table thead tr:first-child th:hover{background:#eee}.bootstrap-datetimepicker-widget table td{height:54px;line-height:54px;width:54px}.bootstrap-datetimepicker-widget table td.cw{font-size:.8em;height:20px;line-height:20px;color:#777}.bootstrap-datetimepicker-widget table td.day{height:20px;line-height:20px;width:20px}.bootstrap-datetimepicker-widget table td.day:hover,.bootstrap-datetimepicker-widget table td.hour:hover,.bootstrap-datetimepicker-widget table td.minute:hover,.bootstrap-datetimepicker-widget table td.second:hover{background:#eee;cursor:pointer}.bootstrap-datetimepicker-widget table td.old,.bootstrap-datetimepicker-widget table td.new{color:#777}.bootstrap-datetimepicker-widget table td.today{position:relative}.bootstrap-datetimepicker-widget table td.today:before{content:'';display:inline-block;border:solid transparent;border-width:0 0 7px 7px;border-bottom-color:#337ab7;border-top-color:rgba(0,0,0,0.2);position:absolute;bottom:4px;right:4px}.bootstrap-datetimepicker-widget table td.active,.bootstrap-datetimepicker-widget table td.active:hover{background-color:#337ab7;color:#fff;text-shadow:0 -1px 0 rgba(0,0,0,0.25)}.bootstrap-datetimepicker-widget table td.active.today:before{border-bottom-color:#fff}.bootstrap-datetimepicker-widget table td.disabled,.bootstrap-datetimepicker-widget table td.disabled:hover{background:none;color:#777;cursor:not-allowed}.bootstrap-datetimepicker-widget table td span{display:inline-block;width:54px;height:54px;line-height:54px;margin:2px 1.5px;cursor:pointer;border-radius:4px}.bootstrap-datetimepicker-widget table td span:hover{background:#eee}.bootstrap-datetimepicker-widget table td 
span.active{background-color:#337ab7;color:#fff;text-shadow:0 -1px 0 rgba(0,0,0,0.25)}.bootstrap-datetimepicker-widget table td span.old{color:#777}.bootstrap-datetimepicker-widget table td span.disabled,.bootstrap-datetimepicker-widget table td span.disabled:hover{background:none;color:#777;cursor:not-allowed}.bootstrap-datetimepicker-widget.usetwentyfour td.hour{height:27px;line-height:27px}.bootstrap-datetimepicker-widget.wider{width:21em}.bootstrap-datetimepicker-widget .datepicker-decades .decade{line-height:1.8em !important}.input-group.date .input-group-addon{cursor:pointer}.sr-only{position:absolute;width:1px;height:1px;margin:-1px;padding:0;overflow:hidden;clip:rect(0, 0, 0, 0);border:0} -------------------------------------------------------------------------------- /vis/html/js/tagsgallery.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @fileoverview js Gallery of tags. 3 | * 4 | * @author (cesarpalomo@gmail.com) Cesar Palomo 5 | */ 6 | 7 | 8 | 9 | /** 10 | * Manages a list of tags used for pages (some predefined and some defined by user). 11 | * Interaction is possible through click on "tag selected" and "untag selected". 12 | * 13 | * @param parentContainerId ID for gallery parent div element. 14 | * @param predefinedTags list of predefined tags, with tag name. 15 | * @param tagsLogic mechanism to handle tags logic: some tags are not applicable, and some tags when 16 | * applied should trigger the removal of other tags (e.g. when Yes is applied, No is 17 | * removed). 18 | * Must be in the format: 19 | * { 20 | * 'TagName': { 21 | * applicable: true/false, 22 | * removable: true/false, 23 | * negate: ['Tag1', 'Tag2'], 24 | * }, 25 | * } 26 | */ 27 | var TagsGallery = function(parentContainerId, predefinedTags, tagsLogic) { 28 | this.parentContainerId = parentContainerId; 29 | 30 | // Predefined items in gallery. 31 | this.predefinedItems = predefinedTags; 32 | 33 | // User-defined items in gallery. 34 | this.userItems = []; 35 | 36 | // Handles tags logic. 37 | this.tagsLogic = tagsLogic; 38 | 39 | this.update(); 40 | }; 41 | 42 | 43 | /** 44 | * Clears list of items. 45 | */ 46 | TagsGallery.prototype.clear = function(lazyUpdate) { 47 | this.userItems = []; 48 | 49 | if (!lazyUpdate) { 50 | this.update(); 51 | } 52 | }; 53 | 54 | 55 | /** 56 | * Adds item to gallery. 57 | */ 58 | TagsGallery.prototype.addItem = function(tag, lazyUpdate) { 59 | if(this.predefinedItems.indexOf(tag) < 0) { 60 | if(this.userItems.indexOf(tag) < 0){ 61 | this.userItems.push(tag); 62 | this.tagsLogic[tag] = {'applicable': true, 'removable': true, negate: []}; 63 | if(this.tagsLogic["Neutral"]["negate"].indexOf(tag) < 0) 64 | this.tagsLogic["Neutral"]["negate"].push(tag); 65 | if (!lazyUpdate) { 66 | this.update(); 67 | } 68 | } 69 | } 70 | }; 71 | 72 | 73 | /** 74 | * Removes item from gallery. 75 | */ 76 | TagsGallery.prototype.removeItem = function(tag) { 77 | var index = this.userItems.indexOf(tag); 78 | if( index >= 0 && this.predefinedItems.indexOf(tag) < 0){ 79 | this.userItems.splice(index, 1); 80 | } 81 | } 82 | 83 | /** 84 | * Get items from gallery. 85 | */ 86 | TagsGallery.prototype.getCustomTags = function() { 87 | return this.userItems; 88 | } 89 | 90 | /** 91 | * Sets mechanism to handle tags logic: some tags are not applicable, and some tags when applied 92 | * should trigger the removal of other tags (e.g. when Yes is applied, No is removed). 
93 | * Logic must be in the format: 94 | * { 95 | * 'TagName': { 96 | * applicable: true/false, 97 | * removable: true/false, 98 | * negate: ['Tag1', 'Tag2'], 99 | * }, 100 | * } 101 | */ 102 | TagsGallery.prototype.setTagsLogic = function(tagsLogic) { 103 | // Handles tags logic. 104 | this.tagsLogic = tagsLogic; 105 | }; 106 | 107 | 108 | /** 109 | * Updates gallery. 110 | */ 111 | TagsGallery.prototype.update = function() { 112 | var gallery = this; 113 | this.items = this.predefinedItems.concat(this.userItems); 114 | 115 | var gallery = this; 116 | var items = d3.select(this.parentContainerId) 117 | .selectAll('.item').data(this.items, function(item, i) { 118 | return item + '-' + i; 119 | }); 120 | 121 | // Configures actions on images. 122 | items.each(function(item, i) { 123 | // Only clickable tags. 124 | var isApplicable = gallery.isTagApplicable(item); 125 | var isRemovable = gallery.isTagRemovable(item); 126 | 127 | if (isApplicable || isRemovable) { 128 | var itemElm = d3.select(this); 129 | itemElm.selectAll('img').each(function() { 130 | var img = d3.select(this); 131 | var actionType = img.attr('actionType'); 132 | if ((isApplicable && actionType == 'Apply') 133 | || (isRemovable && actionType == 'Remove')) { 134 | img 135 | .on('mouseover', function() { 136 | Utils.showTooltip(); 137 | }) 138 | .on('mousemove', function() { 139 | Utils.updateTooltip(actionType + ' tag "' + item + '"'); 140 | }) 141 | .on('mouseout', function() { 142 | Utils.hideTooltip(); 143 | }) 144 | .on('click', function() { 145 | gallery.onItemActionClick(item, i, actionType); 146 | event.stopPropagation(); 147 | }); 148 | } 149 | }); 150 | } 151 | }); 152 | }; 153 | 154 | 155 | /** 156 | * Returns whether a tag is applicable. 157 | */ 158 | TagsGallery.prototype.isTagApplicable = function(tag) { 159 | return tag in this.tagsLogic && this.tagsLogic[tag]['applicable']; 160 | }; 161 | 162 | 163 | /** 164 | * Returns whether a tag is removable. 165 | */ 166 | TagsGallery.prototype.isTagRemovable = function(tag) { 167 | return tag in this.tagsLogic && this.tagsLogic[tag]['removable']; 168 | }; 169 | 170 | 171 | /** 172 | * Builds html content with info about an item in the gallery. 173 | */ 174 | TagsGallery.prototype.getItemInfo = function(item, i) { 175 | return item; 176 | }; 177 | 178 | 179 | /** 180 | * Builds html content with buttons for labeling relevancy an item in the gallery, 181 | * such as Yes, No, Maybe. 182 | */ 183 | TagsGallery.prototype.getItemButtons = function(item, i) { 184 | var w = 12; 185 | var a = this.isTagApplicable(item) ? 'clickable' : 'not-clickable'; 186 | var r = this.isTagRemovable(item) ? 'clickable' : 'not-clickable'; 187 | return '' 188 | + ''; 189 | }; 190 | 191 | 192 | /** 193 | * Handles click in an item. 194 | */ 195 | TagsGallery.prototype.onItemClick = function(item, i) { 196 | __sig__.emit(__sig__.tag_clicked, item); 197 | }; 198 | 199 | 200 | /** 201 | * Handles item focus. 202 | */ 203 | TagsGallery.prototype.onItemFocus = function(item, i, onFocus) { 204 | __sig__.emit(__sig__.tag_focus, item, onFocus); 205 | }; 206 | 207 | 208 | /** 209 | * Handles click in an item. 210 | */ 211 | TagsGallery.prototype.onItemActionClick = function(item, i, actionType) { 212 | this.applyOrRemoveTag(item, actionType); 213 | }; 214 | 215 | 216 | /** 217 | * Applies or removes tag. 218 | */ 219 | TagsGallery.prototype.applyOrRemoveTag = function(tag, actionType, opt_pages, refresh_plot) { 220 | // Handles tags logic. 
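// (Applying a tag first emits 'Remove' for every tag in its negate list, e.g. applying 'Relevant' removes 'Irrelevant'; a virtual tag only triggers these negations and is never applied itself.)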
221 | if (tag in this.tagsLogic) { 222 | var logicForTag = this.tagsLogic[tag]; 223 | 224 | if (actionType == 'Apply') { 225 | // Removes tags in negate. 226 | for (var i in logicForTag.negate) { 227 | var negateTag = logicForTag.negate[i]; 228 | __sig__.emit(__sig__.tag_action_clicked, negateTag, 'Remove', opt_pages, refresh_plot); 229 | } 230 | if (logicForTag.applicable && !logicForTag.isVirtual) { 231 | __sig__.emit(__sig__.tag_action_clicked, tag, actionType, opt_pages, refresh_plot); 232 | } 233 | } else { 234 | // Removes tag when removable. 235 | if (logicForTag.removable) { 236 | __sig__.emit(__sig__.tag_action_clicked, tag, actionType, opt_pages, refresh_plot); 237 | } 238 | } 239 | } else { 240 | __sig__.emit(__sig__.tag_action_clicked, tag, actionType, opt_pages, refresh_plot); 241 | } 242 | }; 243 | 244 | 245 | 246 | /** 247 | * Returns applicable tags. 248 | */ 249 | TagsGallery.prototype.getApplicableTags = function() { 250 | var gallery = this; 251 | return this.items.filter(function(tag) { 252 | return gallery.isTagApplicable(tag); 253 | }); 254 | }; 255 | -------------------------------------------------------------------------------- /elastic/stopwords.txt: -------------------------------------------------------------------------------- 1 | a 2 | able 3 | about 4 | above 5 | abst 6 | accordance 7 | according 8 | accordingly 9 | across 10 | act 11 | actually 12 | added 13 | adj 14 | affected 15 | affecting 16 | affects 17 | after 18 | afterwards 19 | again 20 | against 21 | ah 22 | all 23 | almost 24 | alone 25 | along 26 | already 27 | also 28 | although 29 | always 30 | am 31 | among 32 | amongst 33 | an 34 | and 35 | announce 36 | another 37 | any 38 | anybody 39 | anyhow 40 | anymore 41 | anyone 42 | anything 43 | anyway 44 | anyways 45 | anywhere 46 | apparently 47 | approximately 48 | are 49 | aren 50 | arent 51 | arise 52 | around 53 | as 54 | aside 55 | ask 56 | asking 57 | at 58 | auth 59 | available 60 | away 61 | awfully 62 | b 63 | back 64 | be 65 | became 66 | because 67 | become 68 | becomes 69 | becoming 70 | been 71 | before 72 | beforehand 73 | begin 74 | beginning 75 | beginnings 76 | begins 77 | behind 78 | being 79 | believe 80 | below 81 | beside 82 | besides 83 | between 84 | beyond 85 | biol 86 | both 87 | brief 88 | briefly 89 | but 90 | by 91 | c 92 | ca 93 | came 94 | can 95 | cannot 96 | can't 97 | cause 98 | causes 99 | certain 100 | certainly 101 | co 102 | com 103 | come 104 | comes 105 | contain 106 | containing 107 | contains 108 | could 109 | couldnt 110 | d 111 | date 112 | did 113 | didn't 114 | different 115 | do 116 | does 117 | doesn't 118 | doing 119 | done 120 | don't 121 | down 122 | downwards 123 | due 124 | during 125 | e 126 | each 127 | ed 128 | edu 129 | effect 130 | eg 131 | eight 132 | eighty 133 | either 134 | else 135 | elsewhere 136 | end 137 | ending 138 | enough 139 | especially 140 | et 141 | et-al 142 | etc 143 | even 144 | ever 145 | every 146 | everybody 147 | everyone 148 | everything 149 | everywhere 150 | ex 151 | except 152 | f 153 | far 154 | few 155 | ff 156 | fifth 157 | first 158 | five 159 | fix 160 | followed 161 | following 162 | follows 163 | for 164 | former 165 | formerly 166 | forth 167 | found 168 | four 169 | from 170 | further 171 | furthermore 172 | g 173 | gave 174 | get 175 | gets 176 | getting 177 | give 178 | given 179 | gives 180 | giving 181 | go 182 | goes 183 | gone 184 | got 185 | gotten 186 | h 187 | had 188 | happens 189 | hardly 190 | has 191 | hasn't 192 | have 193 | haven't 194 | 
having 195 | he 196 | hed 197 | hence 198 | her 199 | here 200 | hereafter 201 | hereby 202 | herein 203 | heres 204 | hereupon 205 | hers 206 | herself 207 | hes 208 | hi 209 | hid 210 | him 211 | himself 212 | his 213 | hither 214 | home 215 | how 216 | howbeit 217 | however 218 | hundred 219 | i 220 | id 221 | ie 222 | if 223 | i'll 224 | im 225 | immediate 226 | immediately 227 | importance 228 | important 229 | in 230 | inc 231 | indeed 232 | index 233 | information 234 | instead 235 | into 236 | invention 237 | inward 238 | is 239 | isn't 240 | it 241 | itd 242 | it'll 243 | its 244 | itself 245 | i've 246 | j 247 | just 248 | k 249 | keep keeps 250 | kept 251 | kg 252 | km 253 | know 254 | known 255 | knows 256 | l 257 | largely 258 | last 259 | lately 260 | later 261 | latter 262 | latterly 263 | least 264 | less 265 | lest 266 | let 267 | lets 268 | like 269 | liked 270 | likely 271 | line 272 | little 273 | 'll 274 | look 275 | looking 276 | looks 277 | ltd 278 | m 279 | made 280 | mainly 281 | make 282 | makes 283 | many 284 | may 285 | maybe 286 | me 287 | mean 288 | means 289 | meantime 290 | meanwhile 291 | merely 292 | mg 293 | might 294 | million 295 | miss 296 | ml 297 | more 298 | moreover 299 | most 300 | mostly 301 | mr 302 | mrs 303 | much 304 | mug 305 | must 306 | my 307 | myself 308 | n 309 | na 310 | name 311 | namely 312 | nay 313 | nd 314 | near 315 | nearly 316 | necessarily 317 | necessary 318 | need 319 | needs 320 | neither 321 | never 322 | nevertheless 323 | new 324 | next 325 | nine 326 | ninety 327 | no 328 | nobody 329 | non 330 | none 331 | nonetheless 332 | noone 333 | nor 334 | normally 335 | nos 336 | not 337 | noted 338 | nothing 339 | now 340 | nowhere 341 | o 342 | obtain 343 | obtained 344 | obviously 345 | of 346 | off 347 | often 348 | oh 349 | ok 350 | okay 351 | old 352 | omitted 353 | on 354 | once 355 | one 356 | ones 357 | only 358 | onto 359 | or 360 | ord 361 | other 362 | others 363 | otherwise 364 | ought 365 | our 366 | ours 367 | ourselves 368 | out 369 | outside 370 | over 371 | overall 372 | owing 373 | own 374 | p 375 | page 376 | pages 377 | part 378 | particular 379 | particularly 380 | past 381 | per 382 | perhaps 383 | placed 384 | please 385 | plus 386 | poorly 387 | possible 388 | possibly 389 | potentially 390 | pp 391 | predominantly 392 | present 393 | previously 394 | primarily 395 | probably 396 | promptly 397 | proud 398 | provides 399 | put 400 | q 401 | que 402 | quickly 403 | quite 404 | qv 405 | r 406 | ran 407 | rather 408 | rd 409 | re 410 | readily 411 | really 412 | recent 413 | recently 414 | ref 415 | refs 416 | regarding 417 | regardless 418 | regards 419 | related 420 | relatively 421 | research 422 | respectively 423 | resulted 424 | resulting 425 | results 426 | right 427 | run 428 | s 429 | said 430 | same 431 | saw 432 | say 433 | saying 434 | says 435 | sec 436 | section 437 | see 438 | seeing 439 | seem 440 | seemed 441 | seeming 442 | seems 443 | seen 444 | self 445 | selves 446 | sent 447 | seven 448 | several 449 | shall 450 | she 451 | shed 452 | she'll 453 | shes 454 | should 455 | shouldn't 456 | show 457 | showed 458 | shown 459 | showns 460 | shows 461 | significant 462 | significantly 463 | similar 464 | similarly 465 | since 466 | six 467 | slightly 468 | so 469 | some 470 | somebody 471 | somehow 472 | someone 473 | somethan 474 | something 475 | sometime 476 | sometimes 477 | somewhat 478 | somewhere 479 | soon 480 | sorry 481 | specifically 482 | specified 483 | specify 484 | specifying 
485 | still 486 | stop 487 | strongly 488 | sub 489 | substantially 490 | successfully 491 | such 492 | sufficiently 493 | suggest 494 | sup 495 | sure t 496 | take 497 | taken 498 | taking 499 | tell 500 | tends 501 | th 502 | than 503 | thank 504 | thanks 505 | thanx 506 | that 507 | that'll 508 | thats 509 | that've 510 | the 511 | their 512 | theirs 513 | them 514 | themselves 515 | then 516 | thence 517 | there 518 | thereafter 519 | thereby 520 | thered 521 | therefore 522 | therein 523 | there'll 524 | thereof 525 | therere 526 | theres 527 | thereto 528 | thereupon 529 | there've 530 | these 531 | they 532 | theyd 533 | they'll 534 | theyre 535 | they've 536 | think 537 | this 538 | those 539 | thou 540 | though 541 | thoughh 542 | thousand 543 | throug 544 | through 545 | throughout 546 | thru 547 | thus 548 | til 549 | tip 550 | to 551 | together 552 | too 553 | took 554 | toward 555 | towards 556 | tried 557 | tries 558 | truly 559 | try 560 | trying 561 | ts 562 | twice 563 | two 564 | u 565 | un 566 | under 567 | unfortunately 568 | unless 569 | unlike 570 | unlikely 571 | until 572 | unto 573 | up 574 | upon 575 | ups 576 | us 577 | use 578 | used 579 | useful 580 | usefully 581 | usefulness 582 | uses 583 | using 584 | usually 585 | v 586 | value 587 | various 588 | 've 589 | very 590 | via 591 | viz 592 | vol 593 | vols 594 | vs 595 | w 596 | want 597 | wants 598 | was 599 | wasnt 600 | way 601 | we 602 | wed 603 | welcome 604 | we'll 605 | went 606 | were 607 | werent 608 | we've 609 | what 610 | whatever 611 | what'll 612 | whats 613 | when 614 | whence 615 | whenever 616 | where 617 | whereafter 618 | whereas 619 | whereby 620 | wherein 621 | wheres 622 | whereupon 623 | wherever 624 | whether 625 | which 626 | while 627 | whim 628 | whither 629 | who 630 | whod 631 | whoever 632 | whole 633 | who'll 634 | whom 635 | whomever 636 | whos 637 | whose 638 | why 639 | widely 640 | willing 641 | wish 642 | with 643 | within 644 | without 645 | wont 646 | words 647 | world 648 | would 649 | wouldnt 650 | www 651 | x 652 | y 653 | yes 654 | yet 655 | you 656 | youd 657 | you'll 658 | your 659 | youre 660 | yours 661 | yourself 662 | yourselves 663 | you've 664 | z 665 | zero 666 | -------------------------------------------------------------------------------- /vis/html/release.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Domain Discovery Tool Release Notes 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 |
Domain Discovery Tool v2.9 Release Notes

Features

  • Incorporated an online model learner that incrementally learns the domain model as the user annotates pages. The accuracy of the model indicates how complete the domain is. The model is also used to label the unlabeled pages, which shows the user the pages the model is most unsure of, so the user can annotate the smaller subset of pages that most affects the model's performance
  • Added a quality indicator for domain model construction
  • Restructured the interface to support multiple criteria for page filtering and to improve the workflow
  • Improved the handling of long pages

Domain Discovery Tool v2.8.5 Release Notes

Features

  • Added the ability to delete domains
  • Added settings to the model builder for selecting custom tags as either relevant or irrelevant when building the ACHE classifier

Domain Discovery Tool v2.8.4 Release Notes

Features

  • Crawl the backward and forward links of selected pages to extend the content
  • Topics dashboard to discover and visualize the distribution of topics in the domain content
  • Added the ability to apply custom tags to individual pages in the pages gallery
  • Choose between Google and Bing web searches
  • Assign different colors to custom tags

Domain Discovery Tool v2.8.3 Release Notes

Features

  • Added the ability to apply custom tags to pages
  • Filter pages by tags

Domain Discovery Tool v2.8.2 Release Notes

Features

  • Added a statistics dashboard that provides various aggregations of the DDT data, including queries and annotations

Domain Discovery Tool v2.8.1 Release Notes

Features

  • Replaced the D3 page-clustering visualization with Continuum's Bokeh plot

Domain Discovery Tool v2.8 Release Notes

Features

  • Added the ability to select the kind of pages to retrieve:
      • Most recent pages
      • Pages for specific queries
      • Relevant pages and pages similar to those
      • Irrelevant pages and pages similar to those

Domain Discovery Tool v2.7.1 Release Notes

Features

  • Fixed bugs in ranking and in accessing Elasticsearch data

Domain Discovery Tool v2.7.0 Release Notes

Features

  • Decoupled the server and client to support multiple users
  • Added the ability to upload a file with a list of URLs, or to enter a list of URLs in a text box, for which the pages are downloaded and can be viewed in DDT

Domain Discovery Tool v2.6.1 Release Notes

Features

  • Moved model building to the menu bar

Domain Discovery Tool v2.6.0 Release Notes

Features

  • Added a menu bar at the top with the logo and name of the tool, and moved the following into the menu:
      • the list of available domains and the option to add new domains
  • Changed the look and feel of the terms window:
      • All the words appear on the left
      • Custom words are added to the same list as the extracted ones
      • Custom words are distinguished by the delete icon before them, which allows deleting the custom terms
      • The positive and negative bars are both on the right
  • Updated the development environment to use conda
  • Updated the Docker build to use make instead of fab

Domain Discovery Tool v2.5.0 Release Notes

Features

  • Filter the corpora by date range
  • Added bigrams and trigrams to the term list
  • Added the ability to enter custom relevant and irrelevant terms

Domain Discovery Tool v2.4.0 Release Notes

Features

  • Ability to create a new domain
  • Query the web and store the downloaded pages in Elasticsearch
  • The download progress of the pages can be followed in the page summary window below the web search component
  • Update option to pull in newly downloaded pages and re-rank the terms
  • Visualize the page-clustering projections in the multidimensional visualization (MDV) window
  • Currently supported projection methods: PCA, t-SNE, K-Means
  • Lasso selection of pages in the MDV window
  • Zoom in the MDV window while the 'z' key is held down
  • Pages selected in the MDV window are displayed in the pages panel below it, each with a short snippet from the page, an image, and the URL
  • Shift+click on a page in the pages panel brings up a snapshot of the page
  • Clicking the URL in the pages panel opens the page in a new tab
  • After inspecting the pages, all selected pages can be tagged relevant or irrelevant as a group or individually
  • Ability to connect to existing corpora stored in Elasticsearch
  • Filter the corpora for documents containing specific query words
  • The number of filtered documents to retrieve can be set with the pageCap
  • When documents are filtered, the retrieved terms reflect the significant terms in the filtered documents
  • Update also retrieves the top 50 most frequent terms in the pages, which gives more insight into the content of the pages
  • The red and blue bars to the right and left of a term show the relative occurrence of the term in negative and positive pages, which helps in selecting the more discriminative terms
  • Hovering the mouse over a word displays snippets from the pages in which the selected term occurs, providing context for the term
  • Terms can be tagged positive or negative by single-click and double-click, respectively
  • After tagging, clicking update re-ranks the terms based on the previous tagging
  • Shift+click on a term enters it into the web search
  • These results are stored on the back end to build page classifiers for crawlers
  • Clicking the Model button builds the ACHE crawler's page classifier model, features, and seeds files
  • It also outputs the annotated pages as training data, which any other crawler can use to build its own model
  • Once the model is built, it can be downloaded and saved, and later retrieved to start a crawler
--------------------------------------------------------------------------------
/vis/html/js/libs/jquery.urlive.js:
--------------------------------------------------------------------------------
1 | /*
2 |  * jquery.urlive.js v1.1.1, jQuery URLive
3 |  *
4 |  * Copyright 2014 Mark Serbol.
5 |  * Use, reproduction, distribution, and modification of this code is subject to the terms and
6 |  * conditions of the MIT license, available at http://www.opensource.org/licenses/MIT.
7 |  *
8 |  * https://github.com/markserbol/urlive
9 |  *
10 |  */
11 | 
12 | ;(function($){
13 |   var defaults = {
14 |     container: '.urlive-container',
15 |     target: '_blank',
16 |     imageSize: 'auto',
17 |     render: true,
18 |     disableClick: false,
19 |     regexp: /((https?:\/\/)?[\w-@]+(\.[a-z]+)+\.?(:\d+)?(\/\S*)?)/i,
20 |     yqlSelect: '*',
21 |     callbacks: {
22 |       onStart: function() {},
23 |       onSuccess: function() {},
24 |       onFail: function() {},
25 |       noData: function() {},
26 |       onLoadEnd: function() {},
27 |       imgError: function() {},
28 |       onClick: function() {}
29 |     }
30 |   },
31 | 
32 |   xajax = (function(ajax){
33 |     var exRegex = RegExp(window.location.protocol + '//' + window.location.hostname),
34 |       yql_base_uri = 'http'+(/^https/.test(window.location.protocol)?'s':'') +
35 |         '://query.yahooapis.com/v1/public/yql?callback=?',
36 |       yql_query = 'select {SELECT} from html where url="{URL}" and xpath="*" and compat="html5"';
37 | 
38 |     return function(o) {
39 |       var url = (!/^https?:\/\//i.test(o.url)) ? window.location.protocol + '//' + o.url : o.url;
40 | 
41 |       if (/get/i.test(o.type) && !/json/i.test(o.dataType) && !exRegex.test(url) && /:\/\//.test(url)){
42 | 
43 |         o.url = yql_base_uri;
44 |         o.dataType = 'json';
45 |         o.data = {
46 |           q: yql_query.replace('{SELECT}', o.yqlSelect).replace(
47 |             '{URL}',
48 |             url + (o.data ? (/\?/.test(url) ? '&' : '?') + $.param(o.data) : '')
49 |           ),
50 |           format: 'xml'
51 |         };
52 | 
53 |         if (!o.success && o.complete) {
54 |           o.success = o.complete;
55 |           delete o.complete;
56 |         }
57 | 
58 |         o.success = (function(success){
59 |           return function(data){
60 |             if(success){
61 |               success.call(this, {
62 |                 responseText: (data.results[0] || '').replace(/<script[^>]+?\/>|<script(.|\s)*?\/script>/gi, '')  // strip script tags from the YQL payload
63 |               }, 'success');
64 |             }
65 | 
66 |           };
67 |         })(o.success);
68 | 
69 |       }
70 |       return ajax.apply(this, arguments);
71 |     };
72 | 
73 |   })($.ajax),
74 | 
75 |   findUrlive = function(){
76 |     var selector = $(this).data('urlive-container') || $(this);
77 |     return $(selector).find('.urlive-link');
78 |   },
79 | 
80 |   methods = {
81 |     init: function(options){
82 |       var opts = $.extend(true, defaults, options);
83 | 
84 |       return this.each(function(){
85 |         var el = $(this), url = undefined;
86 | 
87 |         el.data('urlive-container', opts.container);
88 | 
89 |         if ('url' in opts) {
90 |           url = opts.url;
91 |         } else {
92 |           if(el.is('a')){
93 |             url = el.attr('href');
94 |           }else{
95 |             var text = el.val() || el.text(),
96 |               regexp = opts.regexp,
97 |               email = /^(([^<>()[\]\\.,;:\s@\"]+(\.[^<>()[\]\\.,;:\s@\"]+)*)|(\".+\"))@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\])|(([a-zA-Z\-0-9]+\.)+[a-zA-Z]{2,}))$/;
98 | 
99 |             url = regexp.exec(text);
100 | 
101 |             url = (url && !email.test(url[0])) ? url[0] : null;
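// (illustrative) For text like "see http://example.com/page", regexp.exec()
// matches "http://example.com/page". A bare address such as "user@example.com"
// also matches the URL pattern, so the email test above filters it out.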
102 |           }
103 |         }
104 | 
105 |         if(url){
106 |           if(/\.(?:jpe?g|gif|png)/.test(url)){
107 |             var ti = url.substr(url.lastIndexOf('/') + 1);
108 |             draw({image:url, title:ti, url:url});
109 |           }else{
110 |             getData(url);
111 |           }
112 |         }
113 | 
114 |         function getData(url){
115 |           xajax({
116 |             url: url,
117 |             type: 'GET',
118 |             yqlSelect: opts.yqlSelect,
119 |             beforeSend: opts.callbacks.onStart
120 |           }).done(function(data){
121 |             if(!$.isEmptyObject(data.results)){
122 |               data = data.results[0];
123 | 
124 |               html = $('<div>',{html:data});
125 | 
126 |               get = function(prop){
127 |                 return html.find('[property="' + prop + '"]').attr('content')
128 |                   || html.find('[name="' + prop + '"]').attr('content')
129 |                   || html.find(prop).html() || html.find(prop).attr('src');
130 |               };
131 | 
132 |               set = {
133 |                 image: el.data('image') || get('og:image') || get('img'),
134 |                 title: el.data('title') || get('og:title') || get('title'),
135 |                 description: el.data('description') || get('og:description') || get('description'),
136 |                 url: el.data('url') || get('og:url') || url,
137 |                 type: el.data('type') || get('og:type'),
138 |                 sitename: el.data('site_name') || get('og:site_name')
139 |               };
140 | 
141 |               opts.callbacks.onSuccess(set);
142 | 
143 |               if(opts.render){
144 |                 draw(set);
145 |               }
146 | 
147 |             }else{
148 |               opts.callbacks.noData();
149 |               $.error('YQL request succeeded but with empty results', data);
150 | 
151 |             }
152 |           }).fail(function (jqXHR, textStatus, errorThrown) {
153 |             opts.callbacks.onFail();
154 |             $.error('YQL request error: ', textStatus, errorThrown);
155 |           });
156 |         }
157 | 
158 |         function draw(set){
159 |           //outer = $('<a>',{ 'class':'urlive-link', href: set.url, target: opts.target});
160 |           outer = $('<a>',{ 'class':'urlive-link urlive-container', href: set.url, target: opts.target});  // <a> inferred from href/target; urlive-link kept so findUrlive() and closest('.urlive-link') still match
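// Markup produced below (illustrative):
//   <a class="urlive-link urlive-container" href="..." target="_blank">
//     <div class="urlive-img-wrapper"><img class="urlive-image" src="..."></div>
//     <div class="urlive-text-wrapper"><span class="urlive-title">...</span>...</div>
//   </a>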
161 |           imgWrapper = $('<div>',{ 'class':'urlive-img-wrapper'});
162 |           textWrapper = $('<div>',{'class':'urlive-text-wrapper'});
163 | 
164 |           $.each(set, function(key, val){
165 |             if(val){
166 |               if(key == 'image'){
167 | 
168 |                 if(!/^(?:[a-z]+:)?\/\//i.test(val)){
169 |                   val = (!/^https?:\/\//i.test(set.url)) ? window.location.protocol + '//' + set.url + val : set.url + val;
170 |                 }
171 | 
172 |                 img = $('<img>', {src: val});
173 | 
174 |                 img.error(opts.callbacks.imgError);
175 | 
176 |                 img.appendTo(imgWrapper);
177 | 
178 |                 img.hide().load(function() {
179 |                   var imgW = $(this).width(),
180 |                     outer = $(this).closest('.urlive-link');
181 | 
182 |                   $(this).addClass('urlive-'+key).show();
183 | 
184 |                   if(opts.imageSize == 'auto'){
185 | 
186 |                     if(imgW >= outer.width()){
187 |                       outer.addClass('urlive-img-large');
188 |                     }else{
189 |                       outer.addClass('urlive-img-small');
190 |                     }
191 |                   }else if(opts.imageSize == 'large'){
192 |                     outer.addClass('urlive-img-large');
193 |                   }else if(opts.imageSize == 'small'){
194 |                     outer.addClass('urlive-img-small');
195 |                   }
196 | 
197 |                   opts.callbacks.onLoadEnd();
198 |                 });
199 | 
200 |               }else{
201 |                 elem = $('<span>', {'class':'urlive-'+key, text: val});
202 |                 elem.appendTo(textWrapper);
203 |               }
204 |             }
205 |           });
206 | 
207 |           outer.append(imgWrapper, textWrapper).appendTo(el.data('urlive-container'));
208 | 
209 |           outer.on('click', opts.callbacks.onClick);
210 | 
211 |           if(opts.disableClick){
212 |             outer.on('click', function(e){
213 |               e.preventDefault();
214 |             });
215 |           }
216 | 
217 |         }
218 | 
219 |       });
220 |     },
221 | 
222 |     close: function(duration){
223 |       var urlive = findUrlive.apply(this);
224 | 
225 |       urlive.fadeOut(duration);
226 |     },
227 | 
228 |     remove: function(duration){
229 |       var urlive = findUrlive.apply(this);
230 | 
231 |       if(duration){
232 |         urlive.fadeOut(duration, function(){
233 |           urlive.remove();
234 |         });
235 |       }else{
236 |         urlive.remove();
237 |       }
238 |     },
239 | 
240 |     open: function(duration){
241 |       var urlive = findUrlive.apply(this);
242 | 
243 |       urlive.fadeIn(duration);
244 |     },
245 | 
246 |     disable: function(){
247 |       var urlive = findUrlive.apply(this);
248 | 
249 |       urlive.on('click',function(e) {
250 |         e.preventDefault();
251 |       });
252 |     },
253 | 
254 |     enable: function(){
255 |       var urlive = findUrlive.apply(this);
256 | 
257 |       urlive.off('click');
258 |     }
259 | 
260 |   };
261 | 
262 |   $.fn.urlive = function(method){
263 |     if(methods[method]){
264 |       return methods[method].apply(this, Array.prototype.slice.call(arguments, 1));
265 |     }else if(typeof method === 'object' || !method){
266 |       return methods.init.apply(this, arguments);
267 |     }else{
268 |       $.error('Method "' + method + '" does not exist on jquery.urlive');
269 |     }
270 |   };
271 | 
272 | })(jQuery);
273 | 
--------------------------------------------------------------------------------