├── __init__.py
├── vis
├── __init__.py
├── bokeh_plots
│ ├── __init__.py
│ ├── test
│ │ ├── __init__.py
│ │ └── test_cross_filter.py
│ ├── utils.py
│ └── domains_dashboard.py
├── html
│ ├── img
│ │ ├── apply.png
│ │ ├── boost.png
│ │ ├── delete.jpg
│ │ ├── reload.png
│ │ ├── remove.png
│ │ ├── search.png
│ │ ├── nyu_stacked_black.png
│ │ └── engineering_long_black.png
│ ├── css
│ │ ├── dashboard_styles.css
│ │ ├── cross_filter.css
│ │ ├── jquery.urlive.css
│ │ ├── d3.slider.css
│ │ └── dropdowns-enhancement.min.css
│ ├── libs
│ │ ├── bootflat-2.0.4
│ │ │ └── fonts
│ │ │ │ ├── glyphicons-halflings-regular.eot
│ │ │ │ ├── glyphicons-halflings-regular.ttf
│ │ │ │ └── glyphicons-halflings-regular.woff
│ │ └── bootstrap-datetimepicker-4.15.35
│ │ │ └── css
│ │ │ └── bootstrap-datetimepicker.min.css
│ ├── cross_filter_plot_area.html
│ ├── js
│ │ ├── libs
│ │ │ ├── queue.min.js
│ │ │ ├── d3.lasso.min.js
│ │ │ └── jquery.urlive.js
│ │ ├── cross_filter.js
│ │ ├── topicvis.js
│ │ ├── utils.js
│ │ ├── crawlersigslots.js
│ │ ├── snippetsviewer.js
│ │ ├── bokeh_controller.js
│ │ ├── sigslot_core.js
│ │ └── tagsgallery.js
│ ├── base.html
│ ├── domains_dashboard.html
│ ├── cross_filter.html
│ ├── crawlervis.html
│ └── release.html
└── config.conf-in
├── models
└── __init__.py
├── elastic
├── test
│ ├── __init__.py
│ └── test_get_documents.py
├── .gitignore
├── __init__.py
├── scripts
│ ├── create_config_index.sh
│ ├── delete_index.sh
│ ├── put_mapping.sh
│ ├── create_index.sh
│ └── mapping.json
├── ddt_index_config_entries.json
├── delete_index.py
├── mapping_terms.json
├── delete.py
├── get_term_vectors.py
├── config.json
├── load_config.py
├── config.py
├── aggregations.py
├── mapping.json
├── get_config.py
├── create_index.py
├── README.md
├── add_documents.py
├── get_documents.py
├── get_mtermvectors.py
└── stopwords.txt
├── ranking
├── __init__.py
├── .gitignore
├── run.sh
├── preprocess.py
├── BayesianSets.py
├── get_bigrams_trigrams.py
├── tfidf.py
├── rank.py
├── extract_terms.py
└── word2vec.py
├── seeds_generator
├── __init__.py
├── src
│ ├── main
│ │ ├── config
│ │ │ └── queries.txt
│ │ └── java
│ │ │ └── page_downloader
│ │ │ ├── Download_Utils.java
│ │ │ ├── App.java
│ │ │ ├── Download_urls.java
│ │ │ ├── StartCrawl.java
│ │ │ ├── Extract.java
│ │ │ ├── Download.java
│ │ │ ├── BingSearch.java
│ │ │ ├── GoogleSearch.java
│ │ │ └── Crawl.java
│ └── test
│ │ └── java
│ │ └── page_downloader
│ │ └── AppTest.java
├── conf
│ └── config.properties
├── download.py
├── pom.xml
└── concat_nltk.py
├── online_classifier
├── __init__.py
├── tfidf_vector.py
├── online_classifier.py
└── tf_vector.py
├── logs
└── README.md
├── .dockerignore
├── run_demo.sh
├── conda.recipe
├── README.md
├── meta.yaml
└── build.sh
├── .gitignore
├── bin
├── ddt
└── ddt-dev
├── environment.yml
├── supervisord.conf
├── Dockerfile
├── Makefile
└── README.md
/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vis/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /elastic/test/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /ranking/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /seeds_generator/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vis/bokeh_plots/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /online_classifier/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vis/bokeh_plots/test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ranking/.gitignore: -------------------------------------------------------------------------------- 1 | selected_terms.txt 2 | exclude.txt -------------------------------------------------------------------------------- /logs/README.md: -------------------------------------------------------------------------------- 1 | Logfiles from supervisor processes go here 2 | -------------------------------------------------------------------------------- /seeds_generator/src/main/config/queries.txt: -------------------------------------------------------------------------------- 1 | explosive chemicals -------------------------------------------------------------------------------- /ranking/run.sh: -------------------------------------------------------------------------------- 1 | python rank.py ../lda_pipeline/data/lda_input.csv 3,4,7,28 2 | -------------------------------------------------------------------------------- /elastic/.gitignore: -------------------------------------------------------------------------------- 1 | /local 2 | /bin 3 | /include 4 | /lib/python* 5 | /build 6 | *.pyc 7 | -------------------------------------------------------------------------------- /elastic/__init__.py: -------------------------------------------------------------------------------- 1 | from config import es, es_server 2 | 3 | __export__ = ['es_server', 'es'] 4 | -------------------------------------------------------------------------------- /vis/html/img/apply.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool_deprecated/HEAD/vis/html/img/apply.png -------------------------------------------------------------------------------- /vis/html/img/boost.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool_deprecated/HEAD/vis/html/img/boost.png -------------------------------------------------------------------------------- /vis/html/img/delete.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool_deprecated/HEAD/vis/html/img/delete.jpg -------------------------------------------------------------------------------- /vis/html/img/reload.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool_deprecated/HEAD/vis/html/img/reload.png -------------------------------------------------------------------------------- /vis/html/img/remove.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool_deprecated/HEAD/vis/html/img/remove.png -------------------------------------------------------------------------------- /vis/html/img/search.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool_deprecated/HEAD/vis/html/img/search.png -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | vis/config.conf 2 | ranking/D_cbow_pdw_8B.pkl 3 | data/ 4 | vis/html/models/ 5 | *.pyc 6 | *.log 7 | *.class 8 | *.jar 9 | -------------------------------------------------------------------------------- /vis/html/img/nyu_stacked_black.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool_deprecated/HEAD/vis/html/img/nyu_stacked_black.png -------------------------------------------------------------------------------- /vis/html/img/engineering_long_black.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool_deprecated/HEAD/vis/html/img/engineering_long_black.png -------------------------------------------------------------------------------- /seeds_generator/conf/config.properties: -------------------------------------------------------------------------------- 1 | ACCOUNTKEY SYQ5NpERm7UmF1ZCdysXfQjS5wD41a27sSnBS5KReqA 2 | ACCOUNTKEY_GOOG AIzaSyADaHyjihNC3591IehV5pcmqK044jdrFEM 3 | CSE_ID_GOOG 016642719151054520299:gftwrd3ql-m 4 | -------------------------------------------------------------------------------- /vis/html/css/dashboard_styles.css: -------------------------------------------------------------------------------- 1 | body { 2 | background-color:transparent; 3 | } 4 | .bk-vbox { 5 | padding-left:1px; 6 | } 7 | 8 | .bk-data-table { 9 | margin: 0px 20px 20px 0px; 10 | } 11 | -------------------------------------------------------------------------------- /vis/html/libs/bootflat-2.0.4/fonts/glyphicons-halflings-regular.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool_deprecated/HEAD/vis/html/libs/bootflat-2.0.4/fonts/glyphicons-halflings-regular.eot -------------------------------------------------------------------------------- /vis/html/libs/bootflat-2.0.4/fonts/glyphicons-halflings-regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool_deprecated/HEAD/vis/html/libs/bootflat-2.0.4/fonts/glyphicons-halflings-regular.ttf -------------------------------------------------------------------------------- /vis/html/libs/bootflat-2.0.4/fonts/glyphicons-halflings-regular.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool_deprecated/HEAD/vis/html/libs/bootflat-2.0.4/fonts/glyphicons-halflings-regular.woff 
-------------------------------------------------------------------------------- /run_demo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo "Activating DDT environment..." 3 | source activate ddt 4 | 5 | echo "Using ElasticSearch at $ELASTICSEARCH_SERVER" 6 | 7 | echo "Starting services..." 8 | supervisord -c /ddt/supervisord.conf 9 | -------------------------------------------------------------------------------- /seeds_generator/src/main/java/page_downloader/Download_Utils.java: -------------------------------------------------------------------------------- 1 | public class Download_Utils{ 2 | public static String validate_url(String url){ 3 | if(!url.contains("http")) 4 | url = "http://" + url; 5 | return url; 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /conda.recipe/README.md: -------------------------------------------------------------------------------- 1 | This conda recipe was originally added by Continuum Analytics in July 2015 under the DDT license. 2 | 3 | You may need other recipes from https://github.com/memex-explorer/memex-explorer or packages from the memex channel. 4 | 5 | -------------------------------------------------------------------------------- /elastic/scripts/create_config_index.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | if [ $# -eq 0 ] 4 | then 5 | ELASTIC=http://localhost:9200 6 | else 7 | ELASTIC=$1 8 | fi 9 | 10 | ./create_index.sh config $ELASTIC 11 | ./put_mapping.sh config domains config.json $ELASTIC 12 | 13 | 14 | -------------------------------------------------------------------------------- /elastic/ddt_index_config_entries.json: -------------------------------------------------------------------------------- 1 | { 2 | "entries": [ 3 | { 4 | "id" : "1", 5 | "domain_name": "Gun Control", 6 | "index" : "gun_control" 7 | }, 8 | { 9 | "id" : "2", 10 | "domain_name": "Ebola", 11 | "index" : "ebola" 12 | } 13 | ] 14 | } 15 | 16 | -------------------------------------------------------------------------------- /elastic/delete_index.py: -------------------------------------------------------------------------------- 1 | from config import es as default_es 2 | from pprint import pprint 3 | 4 | def delete_index(es_index='', es=None): 5 | if es is None: 6 | es = default_es 7 | 8 | if es_index != "": 9 | res = es.indices.delete(index=es_index) 10 | 11 | -------------------------------------------------------------------------------- /seeds_generator/src/main/java/page_downloader/App.java: -------------------------------------------------------------------------------- 1 | package page_downloader; 2 | 3 | /** 4 | * Hello world! 5 | * 6 | */ 7 | public class App 8 | { 9 | public static void main( String[] args ) 10 | { 11 | System.out.println( "Hello World!" 
); 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | local 2 | bin 3 | include 4 | /lib 5 | python2.7 6 | /build 7 | /nltk_data 8 | config.conf 9 | results.txt 10 | *.pyc 11 | *~ 12 | *.#* 13 | *#* 14 | *.bak 15 | *log 16 | *.class 17 | *.jar 18 | *.out 19 | seeds_generator/target/* 20 | seeds_generator/conf/queries.txt 21 | ranking/D_cbow_pdw_8B.pkl 22 | data/ 23 | vis/html/models/ 24 | *.DS_Store 25 | .idea 26 | *.swp 27 | .cache/ 28 | -------------------------------------------------------------------------------- /elastic/scripts/delete_index.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | if [ $# -eq 0 ] 3 | then 4 | INDEX=memex 5 | else 6 | INDEX=$1 7 | fi 8 | 9 | if [ $# -gt 1 ] 10 | then 11 | TYPE=$2 12 | else 13 | TYPE=page 14 | fi 15 | 16 | if [ $# -gt 2 ] 17 | then 18 | ELASTIC=$3 19 | else 20 | ELASTIC=http://localhost:9200 21 | fi 22 | echo $INDEX 23 | 24 | curl -XDELETE "$ELASTIC/$INDEX/$TYPE"; echo 25 | -------------------------------------------------------------------------------- /elastic/mapping_terms.json: -------------------------------------------------------------------------------- 1 | { 2 | "terms": { 3 | "properties": { 4 | "term": { 5 | "type": "string" 6 | }, 7 | "index": { 8 | "type": "string" 9 | }, 10 | "doc_type": { 11 | "type": "string" 12 | }, 13 | "tf": { 14 | "type": "integer" 15 | }, 16 | "tag": { 17 | "type": "string" 18 | } 19 | } 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /bin/ddt: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPT_PATH="${BASH_SOURCE[0]}"; 4 | SCRIPT_DIR=$(dirname $SCRIPT_PATH) 5 | 6 | export NLTK_DATA=$SCRIPT_DIR/../lib/ddt/nltk_data 7 | export ACHE_HOME=$SCRIPT_DIR/../lib/ache/ 8 | export DDT_HOME=$SCRIPT_DIR/../lib/ddt 9 | # ugly, but DDT doesn't really have a concept of installs 10 | export PYTHONPATH=$SCRIPT_DIR/../lib/ddt:$PYTHONPATH 11 | 12 | python $SCRIPT_DIR/../lib/ddt/vis/server.py 13 | -------------------------------------------------------------------------------- /elastic/delete.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | from config import es as default_es 3 | from elasticsearch.exceptions import NotFoundError 4 | 5 | def delete(ids, es_index='memex', es_doc_type='page', es=None): 6 | if es is None: 7 | es = default_es 8 | 9 | for id in ids: 10 | try: 11 | es.delete(es_index, es_doc_type, id) 12 | except NotFoundError: 13 | continue 14 | -------------------------------------------------------------------------------- /vis/html/css/cross_filter.css: -------------------------------------------------------------------------------- 1 | .help-dropdown { 2 | padding-top:8px; 3 | } 4 | 5 | .help-dropdown-content { 6 | display: none; 7 | position: absolute; 8 | background-color: #f9f9f9; 9 | min-width: 360px; 10 | box-shadow: 0px 4px 8px 0px rgba(0,0,0,1); 11 | padding: 12px 16px; 12 | z-index: 1000; 13 | } 14 | .help-dropdown:hover .help-dropdown-content { 15 | display: block; 16 | } 17 | 18 | .bokeh_plot { 19 | padding-bottom:30px; 20 | } 21 | 22 | .bokeh_table { 23 | padding-bottom:10px; 24 | } 25 | -------------------------------------------------------------------------------- /elastic/scripts/put_mapping.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | if [ $# -eq 0 ] 3 | then 4 | INDEX=memex 5 | else 6 | INDEX=$1 7 | fi 8 | 9 | if [ $# -gt 1 ] 10 | then 11 | TYPE=$2 12 | echo $TYPE 13 | else 14 | TYPE=page 15 | fi 16 | 17 | if [ $# -gt 2 ] 18 | then 19 | MAPPING=$3 20 | else 21 | MAPPING='mapping.json' 22 | fi 23 | 24 | if [ $# -gt 3 ] 25 | then 26 | ELASTIC=$4 27 | else 28 | ELASTIC=http://localhost:9200 29 | fi 30 | 31 | curl -XPUT "$ELASTIC/$INDEX/$TYPE/_mapping?pretty=1" -d @$MAPPING 32 | -------------------------------------------------------------------------------- /vis/html/cross_filter_plot_area.html: -------------------------------------------------------------------------------- 1 | {% block content %} 2 | 3 |
3 | {{ widgets_script | safe }}
4 | {{ plots_script | safe }}
5 | {{ plots_div['queries'] | safe }}
6 | {{ plots_div['tags'] | safe }}
7 | {{ plots_div['hostnames'] | safe }}
8 | {{ plots_div['tlds'] | safe }}
9 | {{ plots_div['ts'] | safe }}
10 | 11 | {% endblock content %} 12 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: ddt 2 | 3 | channels: 4 | - memex 5 | - vida-nyu 6 | 7 | dependencies: 8 | - elasticsearch 9 | - supervisor 10 | - meld3 11 | - dateutil 12 | - cython >=0.22 13 | - ddt-word2vec 14 | - maven 15 | - nltk 16 | - scipy 17 | - numexpr >=2.4 18 | - scikit-learn >=0.16.1 19 | - pyelasticsearch >=1.2 20 | - cherrypy 21 | - requests 22 | - ache >=0.3.1 23 | - jinja2 24 | - bokeh=0.10.0 25 | - pyldavis=2.1.0 26 | - topik 27 | - functools32 28 | - networkx=1.11 29 | -------------------------------------------------------------------------------- /bin/ddt-dev: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPT_PATH="${BASH_SOURCE[0]}"; 4 | SCRIPT_DIR=$(dirname $SCRIPT_PATH) 5 | # ugly, but portable 6 | export DDT_HOME=$(python -c "import os, sys; sys.stdout.write(os.path.abspath('$SCRIPT_DIR/..')+'\n')") 7 | echo "DDT_HOME : $DDT_HOME" 8 | export NLTK_DATA=$DDT_HOME/nltk_data 9 | echo "NLTK_DATA : $NLTK_DATA" 10 | export ACHE_HOME=$(dirname $(which ache))/../lib/ache/ 11 | echo "ACHE_HOME : $ACHE_HOME" 12 | 13 | # ugly, but DDT doesn't really have a concept of installs 14 | export PYTHONPATH=$DDT_HOME:$PYTHONPATH 15 | echo "PYTHONPATH: $PYTHONPATH" 16 | 17 | python $DDT_HOME/vis/server.py 18 | -------------------------------------------------------------------------------- /elastic/scripts/create_index.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | if [ $# -eq 0 ] 3 | then 4 | INDEX=memex 5 | else 6 | INDEX=$1 7 | fi 8 | 9 | if [ $# -gt 1 ] 10 | then 11 | ELASTIC=$2 12 | else 13 | ELASTIC=http://localhost:9200 14 | fi 15 | 16 | curl -s -XPUT "$ELASTIC/$INDEX"; echo 17 | # -d '{ 18 | # "index" : { 19 | # "analysis":{ 20 | # "analyzer":{ 21 | # "html" : { 22 | # "type" : "custom", 23 | # "tokenizer" : "standard", 24 | # "filter" : ["lowercase" , "stop"], 25 | # "char_filter" : ["html_strip"] 26 | # } 27 | # } 28 | # } 29 | # } 30 | # }' 31 | -------------------------------------------------------------------------------- /vis/html/js/libs/queue.min.js: -------------------------------------------------------------------------------- 1 | !function(){function n(n){function e(){for(;i=ap;){var u=a++,e=c[u],o=t.call(e,1);o.push(l(u)),++p,e[0].apply(null,o)}}function l(n){return function(u,t){--p,null==s&&(null!=u?(s=u,a=d=0/0,o()):(c[n]=t,--d?i||e():o()))}}function o(){null!=s?m(s):f?m(s,c):m.apply(null,[s].concat(c))}var r,i,f,c=[],a=0,p=0,d=0,s=null,m=u;return n||(n=1/0),r={defer:function(){return s||(c.push(arguments),++d,e()),r},await:function(n){return m=n,f=!1,d||o(),r},awaitAll:function(n){return m=n,f=!0,d||o(),r}}}function u(){}var t=[].slice;n.version="1.0.7","function"==typeof define&&define.amd?define(function(){return n}):"object"==typeof module&&module.exports?module.exports=n:this.queue=n}(); -------------------------------------------------------------------------------- /conda.recipe/meta.yaml: -------------------------------------------------------------------------------- 1 | package: 2 | name: ddt 3 | version: 2.3.0 4 | 5 | build: 6 | number: 0 7 | has_prefix_files: 8 | - lib/ddt/vis/config.conf 9 | 10 | source: 11 | git_url: https://github.com/ViDA-NYU/domain_discovery_tool 12 | git_tag: 2.3 13 | 14 | requirements: 15 | build: 16 | - cython >=0.22 
17 | - ddt-word2vec 18 | - maven 19 | - nltk 20 | run: 21 | - scipy 22 | - cython >=0.22 23 | - numexpr >=2.4 24 | - scikit-learn >=0.16.1 25 | - pyelasticsearch >=1.2 26 | - nltk 27 | - cherrypy 28 | - requests 29 | - ddt-word2vec 30 | - ache >=0.3.1 31 | - functools32 32 | 33 | #about: 34 | # license: Apache? 35 | -------------------------------------------------------------------------------- /vis/config.conf-in: -------------------------------------------------------------------------------- 1 | [global] 2 | server.socket_host = 0.0.0.0 3 | server.socket_port = 8084 4 | server.thread_pool = 10 5 | 6 | [/] 7 | tools.staticdir.root = . 8 | tools.encode.on = True 9 | tools.gzip.on = True 10 | 11 | [/css] 12 | tools.staticdir.on = True 13 | tools.staticdir.dir = css 14 | 15 | [/js] 16 | tools.staticdir.on = True 17 | tools.staticdir.dir = js 18 | 19 | [/img] 20 | tools.staticdir.on = True 21 | tools.staticdir.dir = img 22 | 23 | [/models] 24 | tools.staticdir.on = True 25 | tools.staticdir.dir = models 26 | 27 | [/bootflat-2.0.4] 28 | tools.staticdir.on = True 29 | tools.staticdir.dir = libs/bootflat-2.0.4 30 | 31 | [/bootstrap-datetimepicker-4.15.35] 32 | tools.staticdir.on = True 33 | tools.staticdir.dir = libs/bootstrap-datetimepicker-4.15.35 -------------------------------------------------------------------------------- /online_classifier/tfidf_vector.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_extraction.text import TfidfTransformer 2 | from nltk import corpus 3 | 4 | from tf_vector import tf_vectorizer 5 | 6 | class tfidf_vectorizer(tf_vectorizer): 7 | 8 | def __init__(self, convert_to_ascii=False, max_features= 10000, ngram_range=(1,1)): 9 | self.tfidf_transformer = None 10 | tf_vectorizer.__init__(self, convert_to_ascii, max_features, ngram_range) 11 | 12 | def tfidf(self, data): 13 | [X_counts, features] = self.vectorize(data) 14 | if self.tfidf_transformer is None: 15 | self.tfidf_transformer = TfidfTransformer() 16 | X = self.tfidf_transformer.fit_transform(X_counts) 17 | else: 18 | X = self.tfidf_transformer.transform(X_counts) 19 | 20 | return [X, X_counts, features] 21 | 22 | -------------------------------------------------------------------------------- /seeds_generator/src/test/java/page_downloader/AppTest.java: -------------------------------------------------------------------------------- 1 | package page_downloader; 2 | 3 | import junit.framework.Test; 4 | import junit.framework.TestCase; 5 | import junit.framework.TestSuite; 6 | 7 | /** 8 | * Unit test for simple App. 
9 | */ 10 | public class AppTest 11 | extends TestCase 12 | { 13 | /** 14 | * Create the test case 15 | * 16 | * @param testName name of the test case 17 | */ 18 | public AppTest( String testName ) 19 | { 20 | super( testName ); 21 | } 22 | 23 | /** 24 | * @return the suite of tests being tested 25 | */ 26 | public static Test suite() 27 | { 28 | return new TestSuite( AppTest.class ); 29 | } 30 | 31 | /** 32 | * Rigorous Test :-) 33 | */ 34 | public void testApp() 35 | { 36 | assertTrue( true ); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /elastic/get_term_vectors.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | from os import environ 3 | 4 | from config import es as default_es 5 | 6 | es = default_es 7 | 8 | query = { 9 | "query": { 10 | "match_all": {} 11 | }, 12 | "fields": [] 13 | } 14 | res = es.search(body=query, 15 | index=environ['ELASTICSEARCH_INDEX'] if environ.get('ELASTICSEARCH_INDEX') else 'memex', 16 | doc_type=environ['ELASTICSEARCH_DOC_TYPE'] if environ.get('ELASTICSEARCH_DOC_TYPE') else 'page') 17 | 18 | hits = res['hits'] 19 | print 'Documents found: %d' % hits['total'] 20 | ids = [hit['_id'] for hit in hits['hits']] 21 | body={ 22 | "ids": ids, 23 | "parameters": { 24 | "fields": [ "text" ] 25 | } 26 | } 27 | res = es.mtermvectors(index='memex', 28 | doc_type='page', 29 | body=body) 30 | 31 | -------------------------------------------------------------------------------- /ranking/preprocess.py: -------------------------------------------------------------------------------- 1 | from nltk import word_tokenize 2 | from nltk.text import TextCollection 3 | from nltk import corpus 4 | 5 | from pprint import pprint 6 | 7 | ENGLISH_STOPWORDS = set(corpus.stopwords.words('english')) 8 | 9 | class TextPreprocess: 10 | def __init__(self,display=False): 11 | self.display=display 12 | 13 | def preprocess(self,text): 14 | #text = text.split(" "); 15 | text = word_tokenize(text) 16 | if self.display: 17 | print "After Tokenizing" 18 | print text 19 | print "\n\n" 20 | 21 | text=[w.strip().lower() for w in text if not w.strip() in ENGLISH_STOPWORDS and len(w.strip())>2] 22 | 23 | tc = TextCollection([text]) 24 | words = list(set(tc)) 25 | 26 | word_tf = {word: tc.tf(word, text) * len(text) for word in words} 27 | 28 | return word_tf 29 | -------------------------------------------------------------------------------- /ranking/BayesianSets.py: -------------------------------------------------------------------------------- 1 | from math import sqrt 2 | from numpy import * 3 | 4 | import sys 5 | 6 | reload(sys) 7 | sys.setdefaultencoding("utf-8") 8 | 9 | 10 | class BayesianSets: 11 | # D-> Query Set 12 | # X-> Data Set 13 | def score(self, D, X) : 14 | 15 | #Compute Bayesian Sets Parameters 16 | c = 2 17 | N = D.shape[0] 18 | T = concatenate((D,X)) 19 | m = divide(sum(T, axis=0),T.shape[0]) 20 | 21 | a = multiply(m, c) 22 | b = multiply(subtract(1,m),c) 23 | 24 | at = add(a,sum(D, axis=0)) 25 | bt = subtract(add(b,N),sum(D, axis=0)) 26 | 27 | C = sum(subtract(add(subtract(log(add(a,b)),log(add(add(a,b),N))), log(bt)), log (b))) 28 | 29 | q = transpose(add(subtract(subtract(log(at),log(a)),log(bt)), log(b))) 30 | 31 | score_X = transpose(add(C, dot(X,q))) 32 | 33 | return asarray(score_X) 34 | 35 | 
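--------------------------------------------------------------------------------
For reference, score() above matches the Bayesian Sets criterion of Ghahramani and Heller (NIPS 2005) with a Beta-Bernoulli model. Writing D for the query set (N rows), m for the column mean of the stacked data T = [D; X], and c = 2 as hard-coded above, the code computes

    \alpha = c\,m, \qquad \beta = c\,(1 - m)
    \tilde{\alpha} = \alpha + \sum_{i \in D} x_i, \qquad \tilde{\beta} = \beta + N - \sum_{i \in D} x_i
    q = \log\tilde{\alpha} - \log\alpha - \log\tilde{\beta} + \log\beta
    C = \sum_{j} \left[ \log(\alpha_j + \beta_j) - \log(\alpha_j + \beta_j + N) + \log\tilde{\beta}_j - \log\beta_j \right]
    \mathrm{score}(x) = C + q^{\top} x

so at and bt are the posterior Beta parameters and the returned vector holds the log membership score of each row of X against the query set.
-------------------------------------------------------------------------------- /elastic/config.json: 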
-------------------------------------------------------------------------------- 1 | { 2 | "domains" : { 3 | "_timestamp" : { 4 | "enabled" : true, 5 | "store" : true 6 | }, 7 | "properties" : { 8 | "domain_name" : { 9 | "type" : "string" 10 | }, 11 | "timestamp" : { 12 | "type" : "date" 13 | }, 14 | "index" : { 15 | "type" : "string" 16 | }, 17 | "doc_type": { 18 | "type": "string" 19 | }, 20 | "mapping":{ 21 | "properties": { 22 | "timestamp": { 23 | "type": "string" 24 | }, 25 | "text": { 26 | "type": "string" 27 | }, 28 | "html": { 29 | "type": "string" 30 | }, 31 | "tag":{ 32 | "type": "string" 33 | }, 34 | "content-type":{ 35 | "type": "string" 36 | } 37 | } 38 | }, 39 | "tag_colors": { 40 | "properties": { 41 | "index": { 42 | "type": "integer" 43 | }, 44 | "colors": { 45 | "type": "string" 46 | } 47 | } 48 | } 49 | } 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /supervisord.conf: -------------------------------------------------------------------------------- 1 | [supervisorctl] 2 | username=darpamemex 3 | 4 | [supervisord] 5 | childlogdir=logs 6 | logfile=supervisord.log ; (main log file;default $CWD/supervisord.log) 7 | logfile_maxbytes=50MB ; (max main logfile bytes b4 rotation;default 50MB) 8 | logfile_backups=10 ; (num of main logfile rotation backups;default 10) 9 | loglevel=info ; (log level;default info; others: debug,warn,trace) 10 | pidfile=supervisord.pid ; (supervisord pidfile;default supervisord.pid) 11 | nodaemon=true ; (start in foreground if true;default false) 12 | minfds=1024 ; (min. avail startup file descriptors;default 1024) 13 | minprocs=200 ; (min. avail process descriptors;default 200) 14 | 15 | [inet_http_server] 16 | port = 127.0.0.1:9001 17 | 18 | [program:elasticsearch] 19 | command=elasticsearch 20 | priority=1 21 | 22 | [program:ddt] 23 | command=bash ./bin/ddt-dev 24 | priority=2 25 | 26 | [rpcinterface:supervisor] 27 | supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface -------------------------------------------------------------------------------- /elastic/load_config.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | from datetime import datetime 4 | from add_documents import add_document 5 | 6 | from config import es as default_es 7 | 8 | def load_config(entries, es_index='config', es_doc_type='domains', es=None): 9 | 10 | if es is None: 11 | es = default_es 12 | 13 | add_document(entries, es_index, es_doc_type, es) 14 | 15 | if __name__ == "__main__": 16 | 17 | if len(sys.argv)>1: 18 | config_file = sys.argv[1] 19 | else: 20 | config_file = 'ddt_index_config_entries.json' 21 | 22 | if len(sys.argv)>2: 23 | es_index = sys.argv[2] 24 | else: 25 | es_index = 'config' 26 | 27 | if len(sys.argv)>3: 28 | es_doc_type = sys.argv[3] 29 | else: 30 | es_doc_type = 'domains' 31 | 32 | es = None 33 | if len(sys.argv)>4: 34 | es_host = sys.argv[4] 35 | from pyelasticsearch import ElasticSearch 36 | es = ElasticSearch(es_host) 37 | 38 | # parse the entries file before indexing; assumes the {"entries": [...]} 39 | # layout of ddt_index_config_entries.json and that add_document accepts 40 | # the resulting list of entry dicts 41 | with open(config_file) as f: 42 | entries = json.load(f)["entries"] 43 | 44 | load_config(entries, es_index, es_doc_type, es) 45 | 46 | -------------------------------------------------------------------------------- /elastic/config.py: -------------------------------------------------------------------------------- 1 | ''' 2 | provides access to elasticsearch server 3 | 4 | es_server - the name of the endpoint 5 | es - an Elasticsearch instance connected to es_server 6 | ''' 7 | 8 | from elasticsearch import Elasticsearch 9 | from os import environ 
10 | import certifi 11 | 12 | if environ.get('ELASTICSEARCH_SERVER'): 13 | es_server = environ['ELASTICSEARCH_SERVER'] 14 | else: 15 | es_server = 'http://localhost:9200/' 16 | 17 | print 'ELASTICSEARCH_SERVER ', es_server 18 | 19 | if environ.get('ELASTICSEARCH_USER'): 20 | es_user = environ['ELASTICSEARCH_USER'] 21 | else: 22 | es_user = "" 23 | 24 | print 'ELASTICSEARCH_USER ', es_user 25 | 26 | if environ.get('ELASTICSEARCH_PASSWD'): 27 | es_passwd = environ['ELASTICSEARCH_PASSWD'] 28 | else: 29 | es_passwd = "" 30 | 31 | if es_user: 32 | es = Elasticsearch([es_server], http_auth=(es_user, es_passwd), use_ssl=True, verify_certs=True, ca_certs=certifi.where(), timeout=100) 33 | else: 34 | es = Elasticsearch([es_server]) 35 | 36 | if environ.get('ELASTICSEARCH_DOC_TYPE'): 37 | es_doc_type = environ['ELASTICSEARCH_DOC_TYPE'] 38 | else: 39 | es_doc_type = 'page' 40 | 41 | 42 | -------------------------------------------------------------------------------- /vis/html/base.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | Domain Discovery Tool 11 | 12 | 13 | 24 |
25 | {% block content %} 26 | {% endblock content %} 27 |
28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /vis/html/js/cross_filter.js: -------------------------------------------------------------------------------- 1 | /** 2 | * This module handles the server callback to update the bokeh plots 3 | */ 4 | var crossFilterUpdate = function(){ 5 | var data_table_ids = ['urls', 'tlds', 'tags', 'queries']; 6 | 7 | setTimeout(function() { //need timeout to wait for class change 8 | var global_state = {}; 9 | for (i=0; i ${PREFIX}/lib/ddt/vis/config.conf 32 | 33 | cp -av vis/* ${PREFIX}/lib/ddt/vis 34 | 35 | cp -av bin/ddt ${PREFIX}/bin/ddt 36 | chmod +x ${PREFIX}/bin/ddt 37 | 38 | # ugly, but DDT hardcodes the location of word2vec here 39 | pushd ${PREFIX}/lib/ddt/ranking 40 | ln -s ../../../data/D_cbow_pdw_8B.pkl ./D_cbow_pdw_8B.pkl 41 | -------------------------------------------------------------------------------- /seeds_generator/src/main/java/page_downloader/Download_urls.java: -------------------------------------------------------------------------------- 1 | public class Download_urls { 2 | public Download_urls(){ 3 | } 4 | 5 | public void download(String[] urls, String es_index, String es_doc_type, String es_server){ 6 | Download download = new Download("uploaded", es_index, es_doc_type, es_server); 7 | 8 | for(String url: urls){ 9 | download.addTask(Download_Utils.validate_url(url)); 10 | } 11 | 12 | download.shutdown(); 13 | } 14 | 15 | public static void main(String[] args) { 16 | 17 | String urls_str = ""; //default 18 | String es_index = "memex"; 19 | String es_doc_type = "page"; 20 | String es_server = "localhost"; 21 | 22 | int i = 0; 23 | while (i < args.length){ 24 | String arg = args[i]; 25 | if(arg.equals("-u")){ 26 | urls_str = args[++i]; 27 | } else if(arg.equals("-i")){ 28 | es_index = args[++i]; 29 | } else if(arg.equals("-d")){ 30 | es_doc_type = args[++i]; 31 | } else if(arg.equals("-s")){ 32 | es_server = args[++i]; 33 | }else { 34 | System.out.println("Unrecognized option"); 35 | break; 36 | } 37 | ++i; 38 | } 39 | 40 | String[] urls = null; 41 | if(urls_str != null & !urls_str.isEmpty()) 42 | urls = urls_str.split(" "); 43 | 44 | Download_urls download_urls = new Download_urls(); 45 | download_urls.download(urls, es_index, es_doc_type, es_server); 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /seeds_generator/src/main/java/page_downloader/StartCrawl.java: -------------------------------------------------------------------------------- 1 | import java.util.ArrayList; 2 | import java.util.Arrays; 3 | 4 | public class StartCrawl { 5 | public static void main(String[] args) { 6 | 7 | String crawl = ""; //default 8 | String urls_str = ""; 9 | String top = "10"; 10 | String es_index = "memex"; 11 | String es_doc_type = "page"; 12 | String es_server = "localhost"; 13 | 14 | int i = 0; 15 | while (i < args.length){ 16 | String arg = args[i]; 17 | if(arg.equals("-c")){ 18 | crawl = args[++i]; 19 | } else if(arg.equals("-u")){ 20 | urls_str = args[++i]; 21 | } else if(arg.equals("-t")){ 22 | top = args[++i]; 23 | } else if(arg.equals("-i")){ 24 | es_index = args[++i]; 25 | } else if(arg.equals("-d")){ 26 | es_doc_type = args[++i]; 27 | } else if(arg.equals("-s")){ 28 | es_server = args[++i]; 29 | }else { 30 | System.out.println("Unrecognized option"); 31 | break; 32 | } 33 | ++i; 34 | } 35 | 36 | ArrayList urls = null; 37 | if(!urls_str.isEmpty()){ 38 | urls = new ArrayList(Arrays.asList(urls_str.split(","))); 39 | } 40 | 41 | 
Crawl c = new Crawl(es_index, es_doc_type, es_server); 42 | 43 | if(urls != null && crawl.equals("forward")) 44 | c.addForwardCrawlTask(urls, top); 45 | else if(urls != null && crawl.equals("backward")) 46 | c.addBackwardCrawlTask(urls, top); 47 | 48 | c.shutdown(); 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /vis/html/css/jquery.urlive.css: -------------------------------------------------------------------------------- 1 | /* 2 | * jquery.urlive.css v1.1.1, jQuery URLive 3 | * 4 | * View the plugin repository at: 5 | * https://github.com/markserbol/urlive 6 | * 7 | */ 8 | 9 | .urlive-container { 10 | color:inherit; 11 | text-decoration:none; 12 | display:block; 13 | width:auto; 14 | overflow:auto; 15 | position:relative; 16 | overflow:hidden; 17 | margin:0; 18 | font-size:12px; 19 | line-height:normal; 20 | } 21 | 22 | .urlive-link:hover { 23 | box-shadow:0 0 4px rgba(10,10,10,0.6); 24 | } 25 | 26 | .urlive-link * { 27 | display:block; 28 | } 29 | 30 | .urlive-image { 31 | width:100%; 32 | display:block; 33 | } 34 | 35 | .urlive-title { 36 | font-size:1.15em; 37 | font-weight:bold; 38 | } 39 | 40 | .urlive-description { 41 | font-size:1em; 42 | } 43 | 44 | .urlive-url { 45 | font-size:0.9em; 46 | overflow:hidden; 47 | white-space:nowrap; 48 | text-overflow:ellipsis; 49 | } 50 | 51 | .urlive-sitename, .urlive-type { 52 | display:none; 53 | } 54 | 55 | .urlive-img-wrapper { 56 | display: inline-block; 57 | float:left; 58 | margin-right:10px; 59 | max-width:80px; 60 | padding-left: 5px; 61 | } 62 | 63 | .urlive-text-wrapper { 64 | display: inline-block; 65 | overflow:auto; 66 | max-width:535px; 67 | } 68 | 69 | /* SMALL IMAGE STYLES*/ 70 | .urlive-img-small .urlive-img-wrapper { 71 | width:auto; 72 | max-width:80px; 73 | } 74 | 75 | /* LARGE IMAGE STYLES */ 76 | .urlive-img-large .urlive-img-wrapper { 77 | width:100%; 78 | max-width:none; 79 | float:none; 80 | } 81 | -------------------------------------------------------------------------------- /ranking/get_bigrams_trigrams.py: -------------------------------------------------------------------------------- 1 | from elasticsearch import Elasticsearch 2 | from elastic.get_documents import get_documents 3 | from online_classifier.tfidf_vector import tfidf_vectorizer 4 | import numpy as np 5 | import operator 6 | import math 7 | from sets import Set 8 | 9 | from nltk import corpus 10 | ENGLISH_STOPWORDS = set(corpus.stopwords.words('english')) 11 | 12 | MAX_PHRASES = 1000 13 | 14 | def get_bigrams_trigrams(text=[], termCount=20, es=None): 15 | 16 | bigram_vectorizer = tfidf_vectorizer(convert_to_ascii=True, ngram_range=(2,2)) 17 | trigram_vectorizer = tfidf_vectorizer(convert_to_ascii=True, ngram_range=(3,3)) 18 | 19 | [bigram_tfidf, bigram_tf, bi_corpus] = bigram_vectorizer.tfidf(text) 20 | [trigram_tfidf, trigram_tf, tri_corpus] = trigram_vectorizer.tfidf(text) 21 | 22 | N = np.shape(bigram_tfidf)[0] 23 | avg = np.divide(bigram_tfidf.sum(axis=0), N) 24 | sortedAvgIndices = np.argsort(avg)[::-1] 25 | top_bigrams = [bi_corpus[sortedAvgIndices[0,i]] for i in range(0, np.shape(sortedAvgIndices)[1])][0:termCount] 26 | 27 | N = np.shape(trigram_tfidf)[0] 28 | avg = np.divide(trigram_tfidf.sum(axis=0), N) 29 | sortedAvgIndices = np.argsort(avg)[::-1] 30 | top_trigrams = [tri_corpus[sortedAvgIndices[0,i]] for i in range(0, np.shape(sortedAvgIndices)[1])][0:termCount] 31 | 32 | return bigram_tfidf, trigram_tfidf, bigram_tf, trigram_tf, bi_corpus, tri_corpus, top_bigrams, top_trigrams 33 | 
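
# usage sketch (illustration only, not part of the original module): assumes
# scikit-learn and the nltk stopword corpus are installed; the 8-tuple return
# order is as defined in get_bigrams_trigrams() above
if __name__ == "__main__":
    docs = ["dangerous explosive chemicals stored onsite",
            "explosive chemicals require careful handling",
            "storing dangerous explosive chemicals requires permits"]
    results = get_bigrams_trigrams(text=docs, termCount=5)
    top_bigrams, top_trigrams = results[6], results[7]
    print top_bigrams
    print top_trigrams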
-------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # 2 | # Domain Discovery Tool Dockerfile 3 | # 4 | # https://github.com/ViDA-NYU/domain_discovery_tool 5 | # 6 | 7 | # Pull base image. 8 | FROM ubuntu:trusty 9 | 10 | # Install some dependencies and useful tools 11 | RUN apt-get update &&\ 12 | apt-get -y install\ 13 | build-essential\ 14 | openjdk-7-jdk\ 15 | wget curl vim 16 | 17 | # Install miniconda 18 | RUN echo 'export PATH=/opt/conda/bin:$PATH' > /etc/profile.d/conda.sh && \ 19 | wget --quiet http://repo.continuum.io/miniconda/Miniconda2-latest-Linux-x86_64.sh && \ 20 | /bin/bash /Miniconda2-latest-Linux-x86_64.sh -b -p /opt/conda && \ 21 | rm Miniconda2-latest-Linux-x86_64.sh && \ 22 | /opt/conda/bin/conda install --yes conda==3.14.1 23 | ENV PATH /opt/conda/bin:$PATH 24 | 25 | # Expose Domain Discovery Tool port 26 | EXPOSE 8084 27 | 28 | # Expose ElasticSearch ports 29 | EXPOSE 9200 30 | EXPOSE 9300 31 | 32 | # Expose Supervisord port 33 | EXPOSE 9001 34 | 35 | WORKDIR /ddt 36 | 37 | # Add build file 38 | ADD ./Makefile /ddt/Makefile 39 | 40 | # Install conda dependencies and download nltk data 41 | ADD ./environment.yml /ddt/environment.yml 42 | RUN make conda_env 43 | RUN make get_nltk_data 44 | 45 | # Compile Java app 46 | ADD ./seeds_generator /ddt/seeds_generator 47 | RUN make downloader_app 48 | 49 | # Add remaining python source files 50 | ADD . /ddt 51 | 52 | # Setup remaining configs 53 | RUN make cherrypy_config link_word2vec_data 54 | 55 | # Patch address to listen to external connections 56 | RUN sed -i "s#port = 127.0.0.1:9001#port = 0.0.0.0:9001#g" supervisord.conf 57 | 58 | CMD bash -c 'source activate ddt; /ddt/bin/ddt-dev' 59 | -------------------------------------------------------------------------------- /elastic/scripts/mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "page": { 3 | "_timestamp": { 4 | "enabled": true, 5 | "store": true 6 | }, 7 | "_id": { 8 | "path": "url" 9 | }, 10 | "properties": { 11 | "url": { 12 | "type": "string", 13 | "index": "not_analyzed" 14 | }, 15 | "text": { 16 | "type": "string", 17 | "term_vector": "yes" 18 | }, 19 | "html": { 20 | "type": "string", 21 | "index": "no" 22 | }, 23 | "query": { 24 | "type": "string" 25 | }, 26 | "retrieved": { 27 | "type": "date" 28 | }, 29 | "last_modified": { 30 | "type": "date" 31 | }, 32 | "length": { 33 | "type": "integer" 34 | }, 35 | "md5": { 36 | "type": "binary" 37 | }, 38 | "redirect": { 39 | "type": "string", 40 | "index": "not_analyzed" 41 | }, 42 | "relevance": { 43 | "type": "float" 44 | }, 45 | "thumbnail_name": { 46 | "type": "string" 47 | }, 48 | "thumbnail": { 49 | "type": "binary" 50 | }, 51 | "tag": { 52 | "type": "string" 53 | }, 54 | "class": { 55 | "type": "string" 56 | }, 57 | "doc_name": { 58 | "type": "string", 59 | "index": "not_analyzed" 60 | }, 61 | "doc_distance": { 62 | "type": "float" 63 | }, 64 | "topic_name": { 65 | "type": "string", 66 | "index": "not_analyzed" 67 | }, 68 | "x": { 69 | "type": "float" 70 | }, 71 | "y": { 72 | "type": "float" 73 | }, 74 | "topic_weight": { 75 | "type": "float" 76 | } 77 | } 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /vis/html/css/d3.slider.css: -------------------------------------------------------------------------------- 1 | .d3-slider { 2 | position: relative; 3 | font-family: 
Verdana,Arial,sans-serif; 4 | font-size: 1.1em; 5 | border: 1px solid #dadada; 6 | border-radius: 5px; 7 | z-index: 2; 8 | } 9 | 10 | .d3-slider-horizontal { 11 | height: .8em; 12 | } 13 | 14 | .d3-slider-range { 15 | background:#2980b9; 16 | left:0px; 17 | right:0px; 18 | height: 0.8em; 19 | position: absolute; 20 | } 21 | 22 | .d3-slider-range-vertical { 23 | background:#2980b9; 24 | left:0px; 25 | right:0px; 26 | position: absolute; 27 | top:0; 28 | } 29 | 30 | .d3-slider-vertical { 31 | width: .8em; 32 | height: 100px; 33 | } 34 | 35 | .d3-slider-handle { 36 | position: absolute; 37 | width: 1.2em; 38 | height: 1.2em; 39 | border: 1px solid #d3d3d3; 40 | border-radius: 4px; 41 | background: #eee; 42 | background: linear-gradient(to bottom, #eee 0%, #ddd 100%); 43 | z-index: 3; 44 | } 45 | 46 | .d3-slider-handle:hover { 47 | border: 1px solid #999999; 48 | } 49 | 50 | .d3-slider-horizontal .d3-slider-handle { 51 | top: -.3em; 52 | margin-left: -.6em; 53 | } 54 | 55 | .d3-slider-axis { 56 | position: relative; 57 | z-index: 1; 58 | height: 20px; 59 | } 60 | 61 | .d3-slider-axis-bottom { 62 | top: .8em; 63 | } 64 | 65 | .d3-slider-axis-right { 66 | left: .8em; 67 | } 68 | 69 | .d3-slider-axis path { 70 | stroke-width: 0; 71 | fill: none; 72 | } 73 | 74 | .d3-slider-axis line { 75 | fill: none; 76 | stroke: #aaa; 77 | shape-rendering: crispEdges; 78 | } 79 | 80 | .d3-slider-axis text { 81 | font-size: 11px; 82 | } 83 | 84 | .d3-slider-vertical .d3-slider-handle { 85 | left: -.25em; 86 | margin-left: 0; 87 | margin-bottom: -.6em; 88 | } 89 | -------------------------------------------------------------------------------- /seeds_generator/download.py: -------------------------------------------------------------------------------- 1 | import urllib2 2 | import sys 3 | from os import environ 4 | 5 | from subprocess import Popen, PIPE, STDOUT 6 | 7 | def encode( url): 8 | return urllib2.quote(url).replace("/", "%2F") 9 | 10 | def decode( url): 11 | return urllib2.unquote(url).replace("%2F", "/") 12 | 13 | def validate_url( url): 14 | s = url[:4] 15 | if s == "http": 16 | return url 17 | else: 18 | url = "http://" + url 19 | return url 20 | 21 | def get_downloaded_urls(inputfile): 22 | urls = [] 23 | with open(inputfile, 'r') as f: 24 | urls = f.readlines() 25 | urls = [url.strip() for url in urls] 26 | return urls 27 | 28 | def download(inputfile, es_index = "memex", es_doc_type = "page", es_host="http://localhost"): 29 | parts = es_host.split(':') 30 | if len(parts) == 2: 31 | es_host = parts[0] 32 | elif len(parts) == 3: 33 | es_host = parts[1] 34 | 35 | es_host = es_host.strip('/') 36 | 37 | print es_host 38 | 39 | query = "" 40 | with open('conf/queries.txt', 'r') as f: 41 | for line in f: 42 | query = line.strip() 43 | 44 | comm = "java -cp target/seeds_generator-1.0-SNAPSHOT-jar-with-dependencies.jar Download " \ 45 | + inputfile + ' "' + query +'" ' + es_index + " " + es_doc_type + " " + es_host 46 | 47 | print comm 48 | 49 | p=Popen(comm, shell=True, stdin=PIPE, stdout=PIPE, stderr=STDOUT) 50 | # output, errors = p.communicate() 51 | # print output 52 | # if not (errors == None): 53 | # print '*' * 80, '\n\n\n' 54 | # print errors 55 | 56 | def main(argv): 57 | if len(argv) != 1: 58 | print "Invalid arguments" 59 | print "python download.py inputfile" 60 | return 61 | inputfile=argv[0] 62 | 63 | download(inputfile) 64 | 65 | if __name__=="__main__": 66 | main(sys.argv[1:]) 67 | 
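--------------------------------------------------------------------------------
A minimal sketch of driving download() from another script (not part of the repo). It assumes the downloader jar has already been built with `mvn package`, that conf/queries.txt exists, and that the interpreter runs from the seeds_generator directory, since the jar and conf paths above are relative; 'seed_urls.txt' is a hypothetical file with one URL per line.

    from download import download

    # download() reduces the host string to a bare hostname,
    # so a full URL is accepted here
    download('seed_urls.txt', es_index='memex', es_doc_type='page',
             es_host='http://localhost:9200')

-------------------------------------------------------------------------------- /elastic/aggregations.py: 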
-------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | from os import environ 3 | from config import es as default_es 4 | 5 | def get_significant_terms(ids, termCount = 50, mapping=None, es_index='memex', es_doc_type='page', es=None): 6 | if es is None: 7 | es = default_es 8 | 9 | with open(environ['DDT_HOME']+'/elastic/stopwords.txt', 'r') as f: 10 | stopwords = [word.strip() for word in f.readlines()] 11 | 12 | query = { 13 | "query":{ 14 | "ids": { 15 | "values": ids 16 | } 17 | }, 18 | "aggregations" : { 19 | "significantTerms" : { 20 | "significant_terms" : { 21 | "field" : mapping["text"], 22 | "size" : termCount, 23 | "exclude": stopwords 24 | } 25 | }, 26 | }, 27 | "size": 0 28 | } 29 | 30 | res = es.search(body=query, index=es_index, doc_type=es_doc_type, timeout=30) 31 | 32 | return [item['key'] for item in res['aggregations']['significantTerms']['buckets'] if len(item['key']) > 2] 33 | 34 | # This returns the unique values of the field and the number of documents associated with that unique value 35 | def get_unique_values(field, size, es_index='memex', es_doc_type='page', es=None): 36 | if es is None: 37 | es = default_es 38 | 39 | 40 | query = { 41 | "size": 0, 42 | "aggs" : { 43 | "unique_values" : { 44 | "terms" : { "field" : field, 45 | "size": size} 46 | 47 | } 48 | } 49 | } 50 | res = es.search(body=query, index=es_index, doc_type=es_doc_type, timeout=30) 51 | 52 | return {item['key']:item['doc_count'] for item in res['aggregations']['unique_values']['buckets']} 53 | 54 | -------------------------------------------------------------------------------- /seeds_generator/src/main/java/page_downloader/Extract.java: -------------------------------------------------------------------------------- 1 | import java.io.*; 2 | import de.l3s.boilerpipe.extractors.KeepEverythingExtractor; 3 | import de.l3s.boilerpipe.sax.BoilerpipeSAXInput; 4 | import de.l3s.boilerpipe.sax.HTMLDocument; 5 | import de.l3s.boilerpipe.document.TextDocument; 6 | import de.l3s.boilerpipe.document.TextBlock; 7 | import java.net.URL; 8 | import java.util.*; 9 | import java.util.HashMap; 10 | import java.lang.String; 11 | import java.net.URLDecoder; 12 | import java.io.PrintWriter; 13 | 14 | public class Extract { 15 | public Map process(String html) 16 | { 17 | try{ 18 | HashMap map = new HashMap(); 19 | String content = ""; 20 | if(!html.contains("@empty@")){ 21 | content = KeepEverythingExtractor.INSTANCE.getText(html); 22 | } 23 | content = content.trim().replaceAll(" +", " "); 24 | content = content.replaceAll("[\n\"\t]", " "); 25 | content = content.replaceAll(",",""); 26 | content = content.toLowerCase(); 27 | 28 | map.put("content", content); 29 | 30 | HTMLDocument htmlDoc = new HTMLDocument(html); 31 | TextDocument doc = new BoilerpipeSAXInput(htmlDoc.toInputSource()).getTextDocument(); 32 | String title = doc.getTitle(); 33 | map.put("title", title); 34 | 35 | return map; 36 | } 37 | catch(Exception e){ 38 | System.err.println("process Exception" + e.getMessage()); 39 | } 40 | 41 | return null; 42 | } 43 | 44 | public static void main(String[] args) { 45 | Extract e = new Extract(); 46 | 47 | try{ 48 | BufferedReader br = 49 | new BufferedReader(new InputStreamReader(System.in)); 50 | 51 | String html = ""; 52 | String input; 53 | 54 | while((input=br.readLine())!=null){ 55 | html += input; 56 | } 57 | 58 | e.process(html); 59 | 60 | }catch(IOException io){ 61 | io.printStackTrace(); 62 | } 63 | } 64 | } 65 | 
-------------------------------------------------------------------------------- /elastic/mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "page": { 3 | "_timestamp": { 4 | "enabled": true, 5 | "store": true 6 | }, 7 | "_id": { 8 | "path": "url" 9 | }, 10 | "properties": { 11 | "url": { 12 | "type": "string", 13 | "index": "not_analyzed" 14 | }, 15 | "text": { 16 | "type": "string", 17 | "term_vector": "yes" 18 | }, 19 | "html": { 20 | "type": "string", 21 | "index": "no" 22 | }, 23 | "query": { 24 | "type": "string", 25 | "index": "not_analyzed" 26 | }, 27 | "retrieved": { 28 | "type": "date" 29 | }, 30 | "last_modified": { 31 | "type": "date" 32 | }, 33 | "length": { 34 | "type": "integer" 35 | }, 36 | "md5": { 37 | "type": "binary" 38 | }, 39 | "redirect": { 40 | "type": "string", 41 | "index": "not_analyzed" 42 | }, 43 | "relevance": { 44 | "type": "float" 45 | }, 46 | "thumbnail_name": { 47 | "type": "string" 48 | }, 49 | "thumbnail": { 50 | "type": "binary" 51 | }, 52 | "tag": { 53 | "type": "string", 54 | "index": "not_analyzed" 55 | }, 56 | "class": { 57 | "type": "string" 58 | }, 59 | "doc_name": { 60 | "type": "string", 61 | "index": "not_analyzed" 62 | }, 63 | "doc_distance": { 64 | "type": "float" 65 | }, 66 | "topic_name": { 67 | "type": "string", 68 | "index": "not_analyzed" 69 | }, 70 | "x": { 71 | "type": "float" 72 | }, 73 | "y": { 74 | "type": "float" 75 | }, 76 | "topic_weight": { 77 | "type": "float" 78 | }, 79 | "crawled_backward": { 80 | "type": "float" 81 | }, 82 | "crawled_forward": { 83 | "type": "float" 84 | } 85 | } 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /seeds_generator/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | page_downloader 5 | seeds_generator 6 | jar 7 | 1.0-SNAPSHOT 8 | seeds_generator 9 | http://maven.apache.org 10 | 11 | 12 | org.json 13 | json 14 | 20140107 15 | 16 | 17 | junit 18 | junit 19 | 3.8.1 20 | test 21 | 22 | 23 | org.apache.httpcomponents 24 | httpclient 25 | 4.4.1 26 | 27 | 28 | com.robbypond 29 | boilerpipe 30 | 1.2.3 31 | 32 | 33 | xerces 34 | xerces 35 | 2.4.0 36 | 37 | 38 | net.sourceforge.nekohtml 39 | nekohtml 40 | 1.9.13 41 | 42 | 43 | org.elasticsearch 44 | elasticsearch 45 | 1.5.2 46 | 47 | 48 | 49 | 50 | 51 | maven-assembly-plugin 52 | 2.5.4 53 | 54 | 55 | jar-with-dependencies 56 | 57 | 58 | 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /seeds_generator/src/main/java/page_downloader/Download.java: -------------------------------------------------------------------------------- 1 | import java.util.concurrent.Executors; 2 | import java.util.concurrent.ExecutorService; 3 | import java.io.FileReader; 4 | import java.io.BufferedReader; 5 | import java.io.IOException; 6 | import java.util.concurrent.TimeUnit; 7 | import org.elasticsearch.client.transport.TransportClient; 8 | import org.elasticsearch.common.transport.InetSocketTransportAddress; 9 | import org.elasticsearch.client.Client; 10 | 11 | public class Download { 12 | 13 | private String query = ""; 14 | private String es_index = "memex"; 15 | private String es_doc_type = "page"; 16 | private Client client = null; 17 | private int poolSize = 100; 18 | private ExecutorService downloaderService = Executors.newFixedThreadPool(poolSize); 19 | 20 | public Download(String query, String es_index, String es_doc_type, String es_host){ 21 | this.query = 
query; 22 | if(es_host.isEmpty()) 23 | es_host = "localhost"; 24 | else { 25 | String[] parts = es_host.split(":"); 26 | if (parts.length == 2) 27 | es_host = parts[0]; 28 | else if(parts.length == 3) 29 | es_host = parts[1]; 30 | 31 | es_host = es_host.replaceAll("/",""); 32 | } 33 | 34 | this.client = new TransportClient().addTransportAddress(new InetSocketTransportAddress(es_host, 9300)); 35 | 36 | if(!es_index.isEmpty()) 37 | this.es_index = es_index; 38 | if(!es_doc_type.isEmpty()) 39 | this.es_doc_type = es_doc_type; 40 | } 41 | 42 | public void setQuery(String query){ 43 | this.query = query; 44 | } 45 | 46 | public void addTask(String url){ 47 | downloaderService.execute(new Download_URL(url.trim(), this.query, this.es_index, this.es_doc_type, this.client)); 48 | } 49 | 50 | public void shutdown(){ 51 | try { 52 | downloaderService.shutdown(); 53 | //downloaderService.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS); 54 | downloaderService.awaitTermination(60 , TimeUnit.SECONDS); 55 | this.client.close(); 56 | } catch (InterruptedException e) { 57 | e.printStackTrace(); 58 | } 59 | } 60 | 61 | 62 | } 63 | -------------------------------------------------------------------------------- /online_classifier/online_classifier.py: -------------------------------------------------------------------------------- 1 | from sklearn import linear_model 2 | from sklearn.calibration import CalibratedClassifierCV 3 | 4 | from tfidf_vector import tfidf_vectorizer 5 | 6 | import numpy as np 7 | 8 | class OnlineClassifier: 9 | 10 | def __init__(self, max_features=10000): 11 | self.clf = None 12 | self.tfidf_vector = tfidf_vectorizer(convert_to_ascii=True, max_features = max_features) 13 | 14 | def vectorize(self, train, test=[]): 15 | [X_train, _, _] = self.tfidf_vector.tfidf(train) 16 | 17 | X_test = None 18 | if test: 19 | [X_test, _, _] = self.tfidf_vector.tfidf(test) 20 | 21 | return [X_train, X_test] 22 | 23 | def fit(self, X, Y): 24 | clf = linear_model.SGDClassifier(n_iter=1) 25 | try: 26 | clf.fit(X, Y) 27 | except ValueError as verr: 28 | print("Value error: {0}".format(verr)) 29 | return None 30 | self.clf = clf 31 | return clf 32 | 33 | def partialFit(self, X, Y): 34 | if self.clf is None: 35 | self.fit(X, Y) 36 | else: 37 | self.clf.partial_fit(X,Y) 38 | return self.clf 39 | 40 | def calibrate(self, X, Y): 41 | if self.clf != None: 42 | sigmoid = CalibratedClassifierCV(self.clf, cv=2, method='sigmoid') 43 | sigmoid.fit(X,Y) 44 | return sigmoid 45 | else: 46 | return None 47 | 48 | def calibrateScore(self, sigmoid, X, Y): 49 | return sigmoid.score(X,Y) 50 | 51 | def predictClass(self, X, sigmoid): 52 | return [self.clf.predict(X), sigmoid.predict(X), np.multiply(sigmoid.predict_proba(X),100)] 53 | 54 | def classify(self, train, train_labels, test, test_labels, partial=False): 55 | [X_train, X_test] = self.vectorize(train, test) 56 | if partial: 57 | clf = self.partialFit(X_train, train_labels) 58 | else: 59 | clf = self.fit(X_train, train_labels) 60 | # calibrate() and predictClass() use self.clf, set by fit()/partialFit() 61 | sigmoid = self.calibrate(X_train, train_labels) 62 | return self.predictClass(X_test, sigmoid) 63 | 64 | 65 | -------------------------------------------------------------------------------- /elastic/get_config.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from config import es as default_es 3 | 4 | def get_available_domains(es=None): 5 | if es is None: 6 | es = default_es 7 | 8 | query = { 9 | "query": { 10 | "match_all": {} 11 | }, 12 | } 13
| res = es.search(body=query, 14 | index='config', 15 | doc_type='domains', 16 | size=100 17 | ) 18 | 19 | hits = res['hits']['hits'] 20 | 21 | result = {} 22 | for hit in hits: 23 | result[hit['_id']] = hit['_source'] 24 | result[hit['_id']]['timestamp'] = long(convert_to_epoch(datetime.strptime(result[hit['_id']]['timestamp'], '%Y-%m-%dT%H:%M:%S.%f'))) 25 | 26 | return result 27 | 28 | def get_mapping(es=None): 29 | if es is None: 30 | es = default_es 31 | 32 | query = { 33 | "query": { 34 | "match_all": {} 35 | }, 36 | } 37 | res = es.search(body=query, 38 | index='config', 39 | doc_type='mapping', 40 | size=100 41 | ) 42 | 43 | hits = res['hits']['hits'] 44 | 45 | res = {} 46 | for hit in hits: 47 | res[hit['_source']['field']] = hit['_source']['value'] 48 | 49 | return res 50 | 51 | def get_tag_colors(es=None): 52 | if es is None: 53 | es = default_es 54 | 55 | query = { 56 | "query": { 57 | "match_all": {} 58 | } 59 | } 60 | res = es.search(body=query, 61 | index='config', 62 | doc_type='tag_colors', 63 | size=100 64 | ) 65 | 66 | hits = res['hits']['hits'] 67 | 68 | res = {} 69 | for hit in hits: 70 | res[hit['_id']] = {'index': hit['_source']['index']} 71 | res[hit['_id']]['colors'] = hit['_source']['colors'] 72 | 73 | return res 74 | 75 | 76 | def convert_to_epoch(dt): 77 | epoch = datetime.utcfromtimestamp(0) 78 | delta = dt - epoch 79 | return delta.total_seconds() 80 | 81 | if __name__ == "__main__": 82 | get_available_domains() 83 | 84 | 85 | -------------------------------------------------------------------------------- /ranking/tfidf.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from elastic.get_mtermvectors import getTermStatistics 3 | 4 | class tfidf: 5 | def __init__(self, opt_docs = None, rm_stopwords=True, rm_numbers=True, pos_tags=[], term_freq=1, mapping=None, es_index = 'memex', es_doc_type = 'page', es = None): 6 | self.documents = opt_docs 7 | self.corpus = None 8 | self.tfidfArray = None 9 | self.tfArray = None 10 | self.ttf = None 11 | self.mapping = mapping 12 | self.rm_stopwords = rm_stopwords 13 | self.rm_numbers = rm_numbers 14 | self.pos_tags = pos_tags 15 | self.es_index = es_index 16 | self.es_doc_type = es_doc_type 17 | self.es = es 18 | self.term_freq = term_freq 19 | 20 | if opt_docs != None: 21 | self.process(opt_docs) 22 | 23 | def getTopTerms(self,top): 24 | N = len(self.documents) 25 | avg = np.divide(np.sum(self.tfidfArray, axis=0), N) 26 | sortedAvgIndices = np.argsort(avg)[::-1] 27 | return [self.corpus[i] for i in sortedAvgIndices[0:top]] 28 | 29 | def getIndex(self, terms): 30 | index = [] 31 | for term in terms: 32 | if term.strip() in self.corpus: 33 | index.append(self.corpus.index(term.strip())) 34 | return index 35 | 36 | def getTfidfArray(self): 37 | return [self.documents, self.corpus, self.tfidfArray] 38 | 39 | def getTfArray(self): 40 | return [self.documents, self.corpus, self.tfArray] 41 | 42 | def getTtf(self): 43 | return self.ttf 44 | 45 | def getURLs(self, args): 46 | return self.documents 47 | 48 | def getTerms(self, indices): 49 | return [self.corpus[x] for x in indices] 50 | 51 | def process(self, documents): 52 | [data_tfidf, data_tf, data_ttf, corpus, urls] = getTermStatistics(documents, self.rm_stopwords, self.rm_numbers, self.pos_tags, self.term_freq, mapping=self.mapping, es_index=self.es_index, es_doc_type=self.es_doc_type, es=self.es) 53 | self.tfidfArray = data_tfidf 54 | self.tfArray = data_tf 55 | self.ttf = data_ttf 56 | self.corpus = corpus 57 | 
self.documents = urls 58 | -------------------------------------------------------------------------------- /elastic/create_index.py: -------------------------------------------------------------------------------- 1 | from os import environ 2 | import json 3 | 4 | from config import es as default_es 5 | 6 | def create_index(es_index='memex', mapping=environ['DDT_HOME']+'/elastic/mapping.json', es=None): 7 | if es is None: 8 | es = default_es 9 | 10 | json_page_data=open(mapping).read() 11 | 12 | page_mappings = json.loads(json_page_data) 13 | 14 | doctypes = {} 15 | for doc_type in page_mappings.keys(): 16 | doctypes[doc_type] = page_mappings[doc_type] 17 | 18 | mappings = { 19 | "mappings": doctypes 20 | } 21 | 22 | fields = es_index.lower().split(' ') 23 | es_index = '_'.join([item for item in fields if item not in '']) 24 | 25 | res = es.indices.create(index=es_index, body=mappings, ignore=400) 26 | 27 | es.indices.refresh(es_index) 28 | 29 | return res 30 | 31 | def create_terms_index(es_index='ddt_terms', es=None): 32 | if es is None: 33 | es = default_es 34 | 35 | json_terms_data=open(environ['DDT_HOME']+'/elastic/mapping_terms.json').read() 36 | 37 | terms_mappings = json.loads(json_terms_data) 38 | 39 | mappings = {"mappings": 40 | { 41 | "terms":terms_mappings["terms"] 42 | } 43 | } 44 | 45 | fields = es_index.lower().split(' ') 46 | es_index = '_'.join([item for item in fields if item not in '']) 47 | 48 | res = es.indices.create(index=es_index, body=mappings, ignore=400) 49 | 50 | es.indices.refresh(es_index) 51 | 52 | return res 53 | 54 | def create_config_index(es_index='config', es=None): 55 | if es is None: 56 | es = default_es 57 | 58 | json_config_data=open(environ['DDT_HOME']+'/elastic/config.json').read() 59 | 60 | config_mappings = json.loads(json_config_data) 61 | 62 | mappings = {"mappings": 63 | { 64 | "domains": config_mappings["domains"] 65 | } 66 | } 67 | 68 | fields = es_index.lower().split(' ') 69 | 70 | es_index = '_'.join([item for item in fields if item not in '']) 71 | 72 | res = es.indices.create(index=es_index, body=mappings, ignore=400) 73 | 74 | return res 75 | 76 | 77 | -------------------------------------------------------------------------------- /ranking/rank.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import tfidf 4 | import BayesianSets 5 | import numpy as np 6 | 7 | class rank: 8 | def results(self,table,query_urls, other_urls): 9 | 10 | [urls, corpus, data] = table.getTfidfArray() 11 | 12 | #Normalise the data 13 | col_sum_d = np.sum(data,axis=0) 14 | norm_d = np.divide(data, col_sum_d) 15 | 16 | indices = [urls.index(url) for url in query_urls] 17 | subquery_data = norm_d[indices, :] 18 | 19 | indices = [urls.index(url) for url in other_urls] 20 | other_data = norm_d[indices, :] 21 | 22 | # Check if any of the features are not present in any 23 | # of the query set documents 24 | check_for_zero = np.sum(subquery_data, axis=0) 25 | zero_indices = np.where(check_for_zero == 0)[0] 26 | 27 | if(len(zero_indices) > 0): 28 | # If features not present in query set documents 29 | # then remove them 30 | old_corpus = corpus 31 | corpus = [] 32 | [corpus.append(old_corpus[i]) for i in range(0,len(old_corpus)) if i not in zero_indices] 33 | 34 | subquery_data = np.delete(subquery_data, zero_indices, 1) 35 | other_data = np.delete(other_data, zero_indices, 1) 36 | 37 | bs = BayesianSets.BayesianSets() 38 | 39 | score = bs.score(subquery_data, other_data) 40 | 41 | indices = 
np.argsort(np.multiply(score,-1))
 42 | ranked_urls = [other_urls[index] for index in indices]
 43 | ranked_scores = [score[index] for index in indices]
 44 | return [ranked_urls,ranked_scores]
 45 | 
 46 | def main(argv):
 47 | if len(argv) != 2:
 48 | print "Invalid arguments"
 49 | print "python rank.py inputfile 0,1,2"
 50 | return
 51 | 
 52 | # File containing information of documents
 53 | input_file = argv[0]
 54 | # Most relevant documents
 55 | query_index = [int(i) for i in argv[1].split(',')]
 56 | ranker = rank()
 57 | [ranked_urls,scores] = ranker.results(input_file,query_index)  # results() expects (tfidf table, query_urls, other_urls); see above
 58 | 
 59 | for i in range(0,len(ranked_urls)):
 60 | print ranked_urls[i]," ", str(scores[i])
 61 | 
 62 | if __name__=="__main__":
 63 | main(sys.argv[1:])
 64 | 
-------------------------------------------------------------------------------- /elastic/README.md: --------------------------------------------------------------------------------
 1 | # ElasticSearch utility for MEMEX (Experimental)
 2 | 
 3 | Jean-Daniel Fekete, March 10th, 2015
 4 | Yamuna Krishnamurthy
 5 | 
 6 | Using ElasticSearch requires its installation first. Go to:
 7 | https://www.elastic.co/downloads/elasticsearch, get the version that fits your system. Install it and start the server.
 8 | It should work on port 9200 on localhost. The installed version should be higher than 1.4 to provide some of the features we need.
 9 | 
 10 | To debug and see the contents of the data in ElasticSearch, install the "Head" plugin:
 11 | ```
 12 | sudo elasticsearch/bin/plugin -install mobz/elasticsearch-head
 13 | ```
 14 | Then look at the contents of ElasticSearch by opening the url: http://localhost:9200/_plugin/head/
 15 | 
 16 | Also, install python >= 2.7.9, not python3.
 17 | 
 18 | Then, you can populate the database with html documents.
 19 | 
 20 | This directory contains python scripts for various operations with elasticsearch:
 21 | 
 22 | ## Methods for creating an index
 23 | 
 24 | ```
 25 | create_index.py
 26 | ```
 27 | 
 28 | ## Methods for adding and updating documents
 29 | ```
 30 | add_documents.py
 31 | ```
 32 | ## Methods to search documents
 33 | 
 34 | ```
 35 | search_documents.py
 36 | ```
 37 | ## Getting the term vectors
 38 | 
 39 | To perform its search, ElasticSearch maintains term vectors and computes TF/IDF on them. The information can be retrieved with the sample script:
 40 | ```
 41 | get_term_vectors.py
 42 | ```
 43 | ## Methods to get specific documents
 44 | 
 45 | ```
 46 | get_documents.py
 47 | ```
 48 | ## Methods to do aggregations
 49 | 
 50 | ```
 51 | aggregations.py
 52 | ```
 53 | 
 54 | ## Methods for deleting an index
 55 | 
 56 | ```
 57 | delete.py
 58 | ```
 59 | 
 60 | The shell scripts in the scripts directory can be used as follows for testing ElasticSearch:
 61 | 
 62 | ## Creating the ElasticSearch Index
 63 | 
 64 | A database is called an index in ElasticSearch. To create it, use the script `create_index.sh`:
 65 | ```
 66 | ./create_index.sh
 67 | ```
 68 | 
 69 | Then, a schema should be defined. An ElasticSearch schema is called a "mapping"; see `mapping.json` for an example. You can install it with the script:
 70 | ```
 71 | ./put_mapping.sh
 72 | ```
 73 | 
-------------------------------------------------------------------------------- /vis/html/js/topicvis.js: --------------------------------------------------------------------------------
 1 | (function(exports){
 2 | 
 3 | /**
 4 | * String to be used when grabbing the settings form DOM element.
 5 | */
 6 | var form = "#topicvis_settings_form";
 7 | var MIN_TOPICS = 2;
 8 | var MAX_TOPICS = 20;
 9 | 
 10 | 
 11 | /**
 12 | * Default settings for the topik visualizations.
 13 | */
 14 | exports.visSettings = {
 15 | tokenizer: "simple",
 16 | vectorizer: "bag_of_words",
 17 | model: "plsa",
 18 | ntopics: 2,
 19 | visualizer: "",
 20 | session: "",
 21 | };
 22 | 
 23 | 
 24 | /**
 25 | * Convert the values in the form to simple key-value pairs, in which the key
 26 | * is the html name of the input and the value is the value of the input.
 27 | */
 28 | exports.formToObject = function(form){
 29 | var objects = {};
 30 | var formData = $(form).serializeArray();
 31 | for(var i = 0; i < formData.length; i++){
 32 | objects[formData[i]["name"]] = formData[i]["value"]
 33 | }
 34 | if((objects.ntopics > MAX_TOPICS) || (objects.ntopics < MIN_TOPICS)){
 35 | $("#error_ntopics").css("display", "inline");
 36 | throw "ntopics must be a number between " + MIN_TOPICS + " and " + MAX_TOPICS + ".";
 37 | } else {
 38 | $("#error_ntopics").css("display", "none");
 39 | return objects;
 40 | }
 41 | }
 42 | 
 43 | 
 44 | /**
 45 | * Update visSettings with the new settings using jQuery.extend
 46 | */
 47 | exports.updateSettings = function(){
 48 | $.extend(true, exports.visSettings, exports.formToObject(form));
 49 | }
 50 | 
 51 | 
 52 | /**
 53 | * When either button is clicked, use the context dependent "this" to
 54 | * grab the value of the clicked button and update visSettings, then change
 55 | * the href on the link button to contain the vis settings parsed as URL
 56 | * parameters.
 57 | */
 58 | $("#ldavisPlot, #termitePlot").on("click", function(){
 59 | exports.visSettings.visualizer = $(this).attr("value");
 60 | exports.visSettings.session = JSON.stringify(exports.vis.sessionInfo());
 61 | var url = "/topicvis?" + $.param(exports.visSettings);
 62 | $(this).attr("href", url);
 63 | });
 64 | 
 65 | 
 66 | /**
 67 | * When the save button is clicked, update visSettings with the new values
 68 | * from the form.
 69 | */
 70 | $("#save_topicvis_settings").on("click", function(){
 71 | exports.updateSettings();
 72 | $("#topicVisSettingsModal").modal("hide");
 73 | });
 74 | 
 75 | })(this.TopicVis = {});
 76 | 
-------------------------------------------------------------------------------- /vis/html/js/utils.js: --------------------------------------------------------------------------------
 1 | /**
 2 | * @fileoverview Contains commonly used functions throughout the
 3 | * application.
 4 | * 
 5 | * @author (cesarpalomo@gmail.com) Cesar Palomo
 6 | */
 7 | var Utils = (function() {
 8 | var tooltipDiv = undefined;
 9 | var pressedKey = undefined;
 10 | 
 11 | // Creates a div for tooltip content.
 12 | var maybeCreateTooltip = function() {
 13 | if (tooltipDiv === undefined) {
 14 | tooltipDiv = d3.select('body')
 15 | .append('div')
 16 | .classed('tooltip', true)
 17 | .style('opacity', 1e-6);
 18 | }
 19 | };
 20 | 
 21 | // Registers window to listen for pressed keys.
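 // Usage note: only the most recent keydown is remembered, so a single key is
 // tracked at a time. For example, Utils.isKeyPressed(16) (keyCode 16 is
 // Shift) lets callers test for shift-modified interactions.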
 22 | d3.select(window).on('keydown', function() {
 23 | pressedKey = d3.event.keyCode;
 24 | });
 25 | d3.select(window).on('keyup', function() {
 26 | pressedKey = undefined;
 27 | });
 28 | 
 29 | var pub = {};
 30 | pub.parseFullDate = function(epochInSeconds) {
 31 | return moment.unix(epochInSeconds).format('MMMM Do YYYY h:mm a');
 32 | };
 33 | pub.parseDateTime = function(epochInSeconds) {
 34 | return moment.unix(epochInSeconds).format('MM/DD/YY h:mm a');
 35 | };
 36 | pub.toUTC = function(date) {
 37 | return (date.getUTCMonth()+1) + '/' + date.getUTCDate() + '/' + date.getUTCFullYear() + ' ' + date.getUTCHours() + ':' + date.getUTCMinutes() + ' UTC';
 38 | };
 39 | 
 40 | pub.showTooltip = function() {
 41 | maybeCreateTooltip();
 42 | tooltipDiv.transition()
 43 | .duration(500)
 44 | .style('opacity', 1);
 45 | };
 46 | pub.hideTooltip = function() {
 47 | maybeCreateTooltip();
 48 | tooltipDiv.transition()
 49 | .duration(500)
 50 | .style('opacity', 1e-6);
 51 | };
 52 | pub.updateTooltip = function(text, opt_x, opt_y) {
 53 | maybeCreateTooltip();
 54 | var x = opt_x || d3.event.pageX + 10;
 55 | var y = opt_y || d3.event.pageY - 25;
 56 | tooltipDiv
 57 | .text(text)
 58 | .style('left', x + 'px')
 59 | .style('top', y + 'px');
 60 | };
 61 | pub.setWaitCursorEnabled = function(enabled) {
 62 | d3.select('#mask')
 63 | .style('display', enabled ? 'block' : 'none')
 64 | .style('cursor', enabled ? 'wait' : 'pointer');
 65 | };
 66 | pub.getRandomInt = function(min, max) {
 67 | return Math.floor(Math.random() * (max - min)) + min;
 68 | };
 69 | pub.openInNewTab = function(url) {
 70 | var win = window.open(url, '_blank');
 71 | win.focus();
 72 | };
 73 | pub.isKeyPressed = function(key) {
 74 | return pressedKey === key;
 75 | };
 76 | return pub;
 77 | }());
 78 | 
-------------------------------------------------------------------------------- /Makefile: --------------------------------------------------------------------------------
 1 | # Makefile for Domain Discovery Tool development
 2 | # Type "make" or "make all" to build the complete development environment
 3 | # Type "make help" for a list of commands
 4 | 
 5 | # Phony targets and variables for the Makefile
 6 | .PHONY: all help conda_env downloader_app cherrypy_config get_nltk_data link_word2vec_data
 7 | SHELL := /bin/bash
 8 | CONDA_ROOT := $(shell conda info --root)
 9 | CONDA_ENV := $(CONDA_ROOT)/envs/ddt
 10 | 
 11 | CONDA_ENV_TARGET := $(CONDA_ENV)/conda-meta/history
 12 | DOWNLOADER_APP_TARGET := seeds_generator/target/seeds_generator-1.0-SNAPSHOT-jar-with-dependencies.jar
 13 | CHERRY_PY_CONFIG_TARGET := vis/config.conf
 14 | GET_NLTK_DATA_TARGET := nltk_data/corpora nltk_data/tokenizers
 15 | LINK_WORD2VEC_DATA_TARGET := ranking/D_cbow_pdw_8B.pkl
 16 | 
 17 | # Makefile commands, see below for actual builds
 18 | 
 19 | ## all : set up DDT development environment
 20 | all: conda_env downloader_app cherrypy_config get_nltk_data link_word2vec_data
 21 | 
 22 | ## help : show all commands.
 23 | # Note the double '##' in the line above: this is what's matched to produce
 24 | # the list of commands.
 25 | help : Makefile
 26 | @sed -n 's/^## //p' $<
 27 | 
 28 | ## conda_env : Install/update a conda environment with needed packages
 29 | conda_env: $(CONDA_ENV_TARGET)
 30 | 
 31 | ## downloader_app : Build the Java-based downloader application
 32 | downloader_app: $(DOWNLOADER_APP_TARGET)
 33 | 
 34 | ## cherrypy_config : Configure CherryPy (set absolute root environment)
 35 | cherrypy_config: $(CHERRY_PY_CONFIG_TARGET)
 36 | 
 37 | ## get_nltk_data : Download NLTK corpus and tokenizers
 38 | get_nltk_data: $(GET_NLTK_DATA_TARGET)
 39 | 
 40 | ## link_word2vec_data : Hardlink the word2vec data from the conda environment
 41 | link_word2vec_data: $(LINK_WORD2VEC_DATA_TARGET)
 42 | 
 43 | # Actual Target work here
 44 | 
 45 | $(CONDA_ENV_TARGET): environment.yml
 46 | conda env update
 47 | 
 48 | $(DOWNLOADER_APP_TARGET): $(CONDA_ENV_TARGET) seeds_generator/pom.xml $(wildcard seeds_generator/src/main/java/page_downloader/*.java)
 49 | source activate ddt; \
 50 | pushd seeds_generator; \
 51 | mvn compile assembly:single; \
 52 | popd
 53 | 
 54 | $(CHERRY_PY_CONFIG_TARGET): vis/config.conf-in
 55 | sed "s#tools.staticdir.root = .#tools.staticdir.root = ${PWD}/vis/html#g" vis/config.conf-in > vis/config.conf
 56 | 
 57 | $(GET_NLTK_DATA_TARGET): $(CONDA_ENV)
 58 | source activate ddt; \
 59 | python -m nltk.downloader -d ${PWD}/nltk_data stopwords brown punkt averaged_perceptron_tagger
 60 | 
 61 | $(LINK_WORD2VEC_DATA_TARGET): $(CONDA_ENV)/data/D_cbow_pdw_8B.pkl
 62 | ln $(CONDA_ENV)/data/D_cbow_pdw_8B.pkl ${PWD}/ranking
 63 | 
 64 | 
-------------------------------------------------------------------------------- /elastic/add_documents.py: --------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import sys
 4 | 
 5 | from config import es as default_es
 6 | 
 7 | from elasticsearch import helpers
 8 | 
 9 | def add_document(entries, es_index='memex', es_doc_type='page', es=None):
 10 | if es is None:
 11 | es = default_es
 12 | 
 13 | es_entries = []
 14 | for doc in entries:
 15 | entry = {"_index": es_index,
 16 | "_type": es_doc_type,
 17 | "_source": {k: v for k, v in doc.items() if k not in ['_id']} }
 18 | 
 19 | if '_id' in doc.keys():
 20 | entry['_id'] = doc['_id']
 21 | 
 22 | es_entries.append(entry)
 23 | 
 24 | helpers.bulk(es, es_entries, refresh=True)
 25 | 
 26 | def update_document(update_entries, es_index='memex', es_doc_type='page', es=None):
 27 | if es is None:
 28 | es = default_es
 29 | 
 30 | helpers.bulk(es, [{"_op_type": "update",
 31 | "_index": es_index,
 32 | "_type": es_doc_type,
 33 | "doc": value,
 34 | "doc_as_upsert": True,
 35 | "_id": key} for key, value in update_entries.items()], refresh=True, request_timeout=600)
 36 | 
 37 | def delete_document(delete_entries, es_index='memex', es_doc_type='page', es=None):
 38 | if es is None:
 39 | es = default_es
 40 | 
 41 | helpers.bulk(es, [{"_op_type": "delete",
 42 | "_index": es_index,
 43 | "_type": es_doc_type,
 44 | "_id": key} for key in delete_entries], refresh=True, request_timeout=600)
 45 | 
 46 | def refresh(es_index='memex', es_doc_type='page', es=None):
 47 | if es is None:
 48 | es = default_es
 49 | 
 50 | es.indices.refresh(index=es_index)
 51 | 
 52 | if __name__ == "__main__":
 53 | if len(sys.argv)>1:
 54 | inputfile = sys.argv[1]
 55 | urls = []
 56 | with open(inputfile, 'r') as f:
 57 | for line in f:
 58 | urls.append(line.strip())
 59 | else:
 60 | urls = [
 61 | 'http://en.wikipedia.org/wiki/Dark_internet',
 62 | 'http://www.dailymail.co.uk/.../article-3017888/...details-sold-dark-web.html',
 63 | 'http://en.wikipedia.org/wiki/Deep_Web',
 64 | 'http://www.rogerdavies.com/2011/06/dark-internet',
 65 | 'http://www.straightdope.com/.../read/3092/how-can-i-access-the-deep-dark-web'
 66 | ]
 67 | entries = []
 68 | for url in urls:
 69 | print 'Retrieving url %s' % url
 70 | # compute_index_entry is expected to be provided by the calling environment;
 71 | # it is not defined in this module.
 72 | e = compute_index_entry(url=url)
 73 | 
 74 | if e: entries.append(e)
 75 | 
 76 | if len(entries):
 77 | add_document(entries)
 78 | 
 79 | url = 'http://en.wikipedia.org/wiki/Dark_internet'
 80 | # update_document expects a dict keyed by document id
 81 | update_document({url: {'relevance': 1}})
 82 | 
-------------------------------------------------------------------------------- /seeds_generator/concat_nltk.py: --------------------------------------------------------------------------------
 1 | import sys
 2 | import os
 3 | import re
 4 | import nltk
 5 | import codecs
 6 | from elastic.get_documents import get_documents
 7 | 
 8 | ENGLISH_STOPWORDS = set(nltk.corpus.stopwords.words('english'))
 9 | NON_ENGLISH_STOPWORDS = set(nltk.corpus.stopwords.words()) - ENGLISH_STOPWORDS
 10 | 
 11 | STOPWORDS_DICT = {}
 12 | for lang in nltk.corpus.stopwords.fileids():
 13 | STOPWORDS_DICT[lang] = set(nltk.corpus.stopwords.words(lang))
 14 | 
 15 | def get_language(text):
 16 | words = set(nltk.wordpunct_tokenize(text.lower()))
 17 | return max(((lang, len(words & stopwords)) for lang, stopwords in STOPWORDS_DICT.items()), key = lambda x: x[1])[0]
 18 | 
 19 | 
 20 | def is_english(text):
 21 | text = text.lower()
 22 | words = set(nltk.wordpunct_tokenize(text))
 23 | return len(words & ENGLISH_STOPWORDS) > len(words & NON_ENGLISH_STOPWORDS)
 24 | 
 25 | def valid_words(text):
 26 | tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
 27 | words = tokenizer.tokenize(text)
 28 | filtered = [w for w in words if (not w.lower() in ENGLISH_STOPWORDS and len(w) > 2)]
 29 | return " ".join(filtered)
 30 | 
 31 | def process_text(content):
 32 | content = content.strip().replace(" +", " ");  # NOTE: str.replace is literal; " +" and "[\n\"\t]" are not treated as regexes
 33 | content = content.replace("[\n\"\t]", " ");
 34 | content = content.replace(",","");
 35 | content = content.lower();
 36 | return content
 37 | 
 38 | '''
 39 | KEY = re.compile("sex|woman|labor|slave|prostitution|organ|child|traffic|force")
 40 | def check_key_terms(content):
 41 | content = content.lower()
 42 | if KEY.search(content):
 43 | content = content.replace("\n", " ")
 44 | return content
 45 | else:
 46 | return ""
 47 | '''
 48 | 
 49 | def get_all_files(dirname):
 50 | print "Loading all filenames"
 51 | files = []
 52 | for [path, dirnames, filenames] in os.walk(dirname):
 53 | for filename in filenames:
 54 | files.append(path + "/" + filename)
 55 | print "Done loading files", len(files)
 56 | return files
 57 | 
 58 | def get_bag_of_words(urls):
 59 | docs = get_documents(urls)
 60 | bag_of_words = {}
 61 | for url in docs.keys():
 62 | bof = process_text(docs[url])
 63 | bof = valid_words(bof)
 64 | bag_of_words[url] = bof
 65 | return bag_of_words
 66 | 
 67 | def main(argv):
 68 | output = open(argv[0], "w")
 69 | len_count = 0  # Count of documents with fewer than 100 characters
 70 | count = 0
 71 | #for file in files:
 72 | for content in codecs.getreader("utf-8")(sys.stdin):
 73 | if (count % 1000) == 0:
 74 | print "all count:\t" + str(count) + "\tless-100 count:\t" + str(len_count)
 75 | count += 1
 76 | content = content.strip()
 77 | url, text = content.split("\t")
 78 | if not '@empty@' in text:
 79 | text = valid_words(text)
 80 | #if len(text) > 100:
 81 | # len_count += 1
 82 | output.write(url + ";" + text + "\n")
 83 | else:
 84 | output.write(url + ";\n")
 85 | output.close()
 86 | 
 87 | if __name__=="__main__":
 88 | main(sys.argv[1:])
 89 | 
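 90 | # Example usage (assuming stdin supplies one "url<TAB>text" pair per line,
 91 | # e.g. as produced by the page downloader; the file names are illustrative):
 92 | #   python concat_nltk.py bag_of_words.txt < url_text_pairs.tsv
 93 | # Each output line is "url;filtered_text", with stopwords, short tokens and
 94 | # '@empty@' documents handled as in main() above.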
-------------------------------------------------------------------------------- /ranking/extract_terms.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import tfidf 4 | import BayesianSets 5 | 6 | import numpy as np 7 | import scipy.sparse as sps 8 | 9 | class extract_terms: 10 | def __init__(self, tfidf): 11 | self.table = tfidf 12 | 13 | def getTopTerms(self,top): 14 | return self.table.getTopTerms(top) 15 | 16 | def results(self,query_terms): 17 | 18 | [urls, corpus, d] = self.table.getTfidfArray() 19 | 20 | if sps.issparse(d): 21 | d = d.toarray() 22 | 23 | query_index = self.getIndex(corpus, query_terms) 24 | 25 | #Normalise the data 26 | col_sum_d = np.sum(d, axis=0) 27 | norm_d = np.divide(d, col_sum_d) 28 | 29 | data = np.transpose(norm_d) 30 | 31 | # documents other than the relevant documents 32 | index = [x for x in range(0,len(data)) if x not in query_index] 33 | 34 | subquery_data = data[query_index,:] 35 | other_data = data[index,:] 36 | 37 | # Check if any of the features are not present in any 38 | # of the query set documents 39 | check_for_zero = np.sum(subquery_data, axis=0) 40 | zero_indices = np.where(check_for_zero == 0)[0] 41 | 42 | if(len(zero_indices) > 0): 43 | # If features not present in query set documents 44 | # then remove them 45 | subquery_data = np.delete(subquery_data, zero_indices, 1) 46 | other_data = np.delete(other_data, zero_indices, 1) 47 | 48 | bs = BayesianSets.BayesianSets() 49 | score = bs.score(subquery_data, other_data) 50 | 51 | rank_index = np.argsort(score)[::-1] 52 | 53 | offset_rank_index = [index[x] for x in rank_index] 54 | 55 | # Get the terms corresponding to the scored indices 56 | ranked_terms = self.table.getTerms(offset_rank_index) 57 | 58 | ranked_scores = [score[rank_index[i]] for i in range(0, len(score))] 59 | return [ranked_terms,ranked_scores] 60 | 61 | def getIndex(self, corpus, query_terms): 62 | indices = [] 63 | for term in query_terms: 64 | try: 65 | indices.append(corpus.index(term)) 66 | except ValueError: 67 | pass 68 | return indices 69 | 70 | 71 | def main(argv): 72 | if len(argv) != 2: 73 | print "Invalid arguments" 74 | print "python rank.py inputfile 0,1,2" 75 | return 76 | 77 | # File containing information of documents 78 | input_file = argv[0] 79 | # Most relevant documents 80 | query_index = [int(i) for i in argv[1].split(',')] 81 | ranker = extract_terms() 82 | [ranked_urls,scores] = ranker.results(input_file,query_index) 83 | 84 | for i in range(0,20): 85 | print ranked_urls[i]," ", str(scores[i]) 86 | 87 | if __name__=="__main__": 88 | main(sys.argv[1:]) 89 | -------------------------------------------------------------------------------- /vis/html/domains_dashboard.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 26 |
 27 | <div class="container">
 28 | {% if ((pages_script and pages_div) or (queries_script and queries_div)) %}
 29 | {% if ((pages_script and pages_div) and (endings_div and endings_script)) %}
 30 | <div class="row">
 31 | <div class="col-md-6"><h4>Page Statistics</h4>
 32 | {{ pages_script | safe }}
 33 | {{ pages_div | safe }}
 34 | </div>
 35 | <div class="col-md-6">
 36 | <h4>Endings Statistics</h4>
 37 | {{ endings_script | safe }}
 38 | {{ endings_div | safe }}
 39 | </div>
 40 | {% else %}
 41 | <div class="row">
 42 | <div>No page information available for this domain.</div>
 43 | </div>
 44 | {% endif %}
 45 | </div>
 46 | {% if (queries_script and queries_div) %}
 47 | <div class="row"><h4>Queries Statistics</h4>
 48 | {{ queries_script | safe }}
 49 | {{ queries_div | safe }}
 50 | {% else %}
 51 | <div class="row">No queries information available for this domain.
 52 | {% endif %}
 53 | </div>
 54 | {% else %}
 55 | <div class="row">
 56 | <div>No statistics available for this domain.</div>
 57 | </div>
 58 | {% endif %}
 59 | </div>
61 | 62 | 63 | -------------------------------------------------------------------------------- /vis/html/js/crawlersigslots.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @fileoverview Manager for signal slots throught the application. 3 | * Refer to this when creating new signals, and to connect slots. 4 | * 5 | * @author (cesarpalomo@gmail.com) Cesar Palomo 6 | */ 7 | 8 | 9 | 10 | /** 11 | * Manages signal slots for application UI. 12 | */ 13 | var SigSlots = (function() { 14 | ////// Signals definition is centralized here. 15 | __sig__.available_crawlers_list_loaded = function(crawlers) {}; 16 | __sig__.available_crawlers_list_reloaded = function(crawlers) {}; 17 | __sig__.available_proj_alg_list_loaded = function(proj_alg) {}; 18 | __sig__.new_pages_summary_fetched = function(summary, isFilter) {}; 19 | __sig__.previous_pages_summary_fetched = function(summary, isFilter) {}; 20 | __sig__.terms_summary_fetched = function(summary) {}; 21 | __sig__.term_focus = function(term, onFocus) {}; 22 | __sig__.term_toggle = function(term, shiftClick) {}; 23 | __sig__.terms_snippets_loaded = function(snippetsData) {}; 24 | __sig__.pages_loaded = function(pages) {}; 25 | __sig__.queries_loaded = function(queries) {}; 26 | __sig__.tags_loaded = function(tags) {}; 27 | __sig__.model_tags_loaded = function(tags) {}; 28 | __sig__.tags_colors_loaded = function(tagColors){}; 29 | __sig__.tag_focus = function(tag, onFocus) {}; 30 | __sig__.tag_clicked = function(tag) {}; 31 | __sig__.tag_action_clicked = function(tag, actionType, pages, refresh_plot) {}; 32 | __sig__.tag_individual_page_action_clicked = function(tag, actionType, page) {}; 33 | __sig__.brushed_pages_changed = function(pagesIndices) {}; 34 | 35 | __sig__.add_crawler = function(index_name) {}; 36 | __sig__.del_crawler = function(domains) {}; 37 | __sig__.query_enter = function(terms) {}; 38 | __sig__.filter_enter = function(terms) {}; 39 | __sig__.add_term = function(term) {}; 40 | __sig__.add_neg_term = function(term) {}; 41 | __sig__.delete_term = function(term) {}; 42 | __sig__.load_new_pages_summary = function(isFilter) {}; 43 | __sig__.set_pages_tags_completed = function(){}; 44 | __sig__.bokeh_insert_plot = function() {}; 45 | __sig__.update_online_classifier = function() {}; 46 | __sig__.update_online_classifier_completed = function(accuracy) {}; 47 | __sig__.build_hierarchy_filters = function(filters) {}; 48 | __sig__.new_tag_loaded = function(flag_newTag) {}; 49 | 50 | //__sig__.pages_labels_changed = function() {}; 51 | //__sig__.term_selected = function(term) {}; 52 | //__sig__.query_enter = function(query) {}; 53 | //__sig__.pages_do_ranking = function() {}; 54 | //__sig__.pages_extract_terms = function() {}; 55 | //__sig__.brushed_pages_changed = function(pagesIndices) {}; 56 | //__sig__.add_term_to_query_box = function(term) {}; 57 | 58 | var pub = {}; 59 | ////// CONNECTS SIGNALS TO SLOTS 60 | // e.g. 
SigSlots.connect(__sig__.eventHappened, myObject, myObject.onEventHappened); 61 | pub.connect = function( 62 | signal, slotInstance, slotMethod) { 63 | __sig__.connect( 64 | __sig__, signal, 65 | slotInstance, slotMethod); 66 | }; 67 | return pub; 68 | }()); 69 | -------------------------------------------------------------------------------- /elastic/test/test_get_documents.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | from mock import patch 4 | 5 | from ..get_documents import get_plotting_data 6 | 7 | @patch('domain_discovery_tool.elastic.config.es.search') 8 | def test_get_plotting_data(mock_es_search): 9 | mock_es_search.return_value = { 10 | u'_shards': {u'failed': 0, u'successful': 5, u'total': 5}, 11 | u'hits': {u'hits': [{u'_id': u'http://www.politico.com/story/2016/04/apple-hires-cynthia-hogan-221937', 12 | u'_index': u'apple', 13 | u'_score': 1.0, 14 | u'_type': u'page', 15 | u'fields': {u'query': [u'apple'], 16 | u'retrieved': [u'2016-04-16T00:06:35.292'], 17 | u'tag': [u'Relevant'], 18 | u'url': [u'http://www.politico.com/story/2016/04/apple-hires-cynthia-hogan-221937']}}, 19 | {u'_id': u'http://www.applevacations.com/', 20 | u'_index': u'apple', 21 | u'_score': 1.0, 22 | u'_type': u'page', 23 | u'fields': {u'query': [u'apple'], 24 | u'retrieved': [u'2016-04-16T00:06:36.135'], 25 | u'tag': [u'Irrelevant', u'Relevant'], 26 | u'url': [u'http://www.applevacations.com/']}}, 27 | {u'_id': u'http://www.reuters.com/article/us-apple-encryption-hearing-idUSKCN0XB2RU', 28 | u'_index': u'apple', 29 | u'_score': 1.0, 30 | u'_type': u'page', 31 | u'fields': {u'query': [u'apple'], 32 | u'retrieved': [u'2016-04-16T00:06:34.806'], 33 | u'url': [u'http://www.reuters.com/article/us-apple-encryption-hearing-idUSKCN0XB2RU']}}], 34 | u'max_score': 1.0, 35 | u'total': 285}, 36 | u'timed_out': False, 37 | u'took': 9} 38 | 39 | result = [ 40 | {u'query': [u'apple'], 41 | u'retrieved': [u'2016-04-16T00:06:35.292'], 42 | u'tag': [u'Relevant'], 43 | u'url': [u'http://www.politico.com/story/2016/04/apple-hires-cynthia-hogan-221937']}, 44 | {u'query': [u'apple'], 45 | u'retrieved': [u'2016-04-16T00:06:36.135'], 46 | u'tag': [u'Irrelevant', u'Relevant'], 47 | u'url': [u'http://www.applevacations.com/']}, 48 | {u'query': [u'apple'], 49 | u'retrieved': [u'2016-04-16T00:06:34.806'], 50 | u'url': [u'http://www.reuters.com/article/us-apple-encryption-hearing-idUSKCN0XB2RU']} 51 | ] 52 | 53 | assert get_plotting_data(u'potatoes', es=None) == result 54 | -------------------------------------------------------------------------------- /vis/bokeh_plots/utils.py: -------------------------------------------------------------------------------- 1 | from bokeh.plotting import figure 2 | from functools32 import wraps 3 | 4 | DATETIME_FORMAT = dict( 5 | microseconds=["%m/%d/%y %I:%M:%S %p"], 6 | milliseconds=["%m/%d/%y %I:%M:%S %p"], 7 | seconds=["%m/%d/%y %I:%M:%S %p"], 8 | minsec=["%m/%d/%y %I:%M:%S %p"], 9 | minutes=["%m/%d/%y %I:%M:%S %p"], 10 | hourmin=["%m/%d/%y %I:%M:%S %p"], 11 | hours=["%m/%d/%y %I:%M:%S %p"], 12 | days=["%m/%d/%y %I:%M:%S %p"], 13 | months=["%m/%d/%y %I:%M:%S %p"], 14 | years=["%m/%d/%y %I:%M:%S %p"], 15 | ) 16 | 17 | FONT = "Helvetica" 18 | FONT_SIZE = "10pt" 19 | 20 | NODATA_COLOR = "#eeeeee" 21 | GRAY = "#CCCCCC" 22 | DARK_GRAY = "#6B6B73" 23 | BLUE = '#67a9cf' 24 | RED = '#ef8a62' 25 | 26 | AXIS_FORMATS = dict( 27 | minor_tick_in=None, 28 | minor_tick_out=None, 29 | major_tick_in=None, 30 | 
major_label_text_font=FONT, 31 | major_label_text_font_size="8pt", 32 | axis_label_text_font=FONT, 33 | axis_label_text_font_style="italic", 34 | axis_label_text_font_size="8pt", 35 | 36 | axis_line_color=DARK_GRAY, 37 | major_tick_line_color=DARK_GRAY, 38 | major_label_text_color=DARK_GRAY, 39 | 40 | major_tick_line_cap="round", 41 | axis_line_cap="round", 42 | axis_line_width=1, 43 | major_tick_line_width=1, 44 | ) 45 | PLOT_FORMATS = dict( 46 | toolbar_location=None, 47 | logo=None, 48 | outline_line_color="#FFFFFF", 49 | title_text_font=FONT, 50 | title_text_align='center', 51 | title_text_color=DARK_GRAY, 52 | title_text_font_size="9pt", 53 | title_text_baseline='bottom', 54 | min_border_left=0, 55 | min_border_right=0, 56 | min_border_top=0, 57 | min_border_bottom=0, 58 | ) 59 | LINE_FORMATS = dict( 60 | line_cap='round', 61 | line_join='round', 62 | line_width=2 63 | ) 64 | FONT_PROPS_SM = dict( 65 | text_font=FONT, 66 | text_font_size='8pt', 67 | ) 68 | FONT_PROPS_MD = dict( 69 | text_font=FONT, 70 | text_font_size='10pt', 71 | ) 72 | FONT_PROPS_LG = dict( 73 | text_font=FONT, 74 | text_font_size='12pt', 75 | ) 76 | BLANK_AXIS = dict( 77 | minor_tick_in=None, 78 | minor_tick_out=None, 79 | major_tick_in=None, 80 | major_label_text_font=FONT, 81 | major_label_text_font_size="8pt", 82 | axis_label_text_font=FONT, 83 | axis_label_text_font_style="italic", 84 | axis_label_text_font_size="8pt", 85 | 86 | axis_line_color='white', 87 | major_tick_line_color='white', 88 | major_label_text_color='white', 89 | axis_label_text_color='white', 90 | 91 | major_tick_line_cap="round", 92 | axis_line_cap="round", 93 | axis_line_width=1, 94 | major_tick_line_width=1, 95 | ) 96 | 97 | def make_empty_plot(plot_width, plot_height): 98 | return figure(plot_width=plot_width, plot_height=plot_height, 99 | tools="", toolbar_location=None) 100 | 101 | def empty_plot_on_empty_df(func): 102 | @wraps(func) 103 | def wrapper(*args, **kwargs): 104 | if len(args[0]) == 0: 105 | return make_empty_plot(func.func_defaults[0], 106 | func.func_defaults[1]) 107 | return func(*args, **kwargs) 108 | return wrapper 109 | -------------------------------------------------------------------------------- /online_classifier/tf_vector.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_extraction.text import CountVectorizer 2 | from nltk import corpus 3 | 4 | class tf_vectorizer: 5 | 6 | def __init__(self, convert_to_ascii=False, max_features=10000, ngram_range=(1,1)): 7 | self.convert_to_ascii = convert_to_ascii 8 | self.count_vect = None 9 | self.max_features = max_features 10 | self.ngram_range = ngram_range 11 | self.ENGLISH_STOPWORDS = corpus.stopwords.words('english') 12 | 13 | def vectorize(self, data): 14 | X_counts = None 15 | 16 | if self.count_vect is None: 17 | self.count_vect = CountVectorizer(stop_words=self.ENGLISH_STOPWORDS, preprocessor=self.preprocess, strip_accents='ascii', ngram_range=self.ngram_range, max_features=self.max_features) 18 | X_counts = self.count_vect.fit_transform(data) 19 | else: 20 | X_counts = self.count_vect.transform(data) 21 | 22 | return [X_counts, self.count_vect.get_feature_names()] 23 | 24 | def tf(self, data): 25 | return self.vectorize(data) 26 | 27 | def preprocess(self, text): 28 | # Remove unwanted chars and new lines 29 | text = text.lower().replace(","," ").replace("__"," ").replace("(", " ").replace(")", " ").replace("[", " ").replace("]", " ").replace(".", " ").replace("/", " ").replace("\\", " ").replace("_", " 
").replace("#", " ").replace("-", " ").replace("+", " ").replace("%", " ").replace(";", " ").replace(":", " ").replace("'", " ").replace("\""," ").replace("^", " ") 30 | 31 | text = text.replace("\n"," ") 32 | 33 | if self.convert_to_ascii: 34 | # Convert to ascii 35 | ascii_text = [] 36 | for x in text.split(" "): 37 | try: 38 | ascii_text.append(x.encode('ascii', 'ignore')) 39 | except: 40 | continue 41 | 42 | text = " ".join(ascii_text) 43 | 44 | preprocessed_text = " ".join([word.strip() for word in text.split(" ") if len(word.strip()) > 2 and (word.strip() != "") and (self.isnumeric(word.strip()) == False) and self.notHtmlTag(word.strip()) and self.notMonth(word.strip())]) 45 | 46 | return preprocessed_text 47 | 48 | def notHtmlTag(self, word): 49 | html_tags = ["http", "html", "img", "images", "image", "index"] 50 | 51 | for tag in html_tags: 52 | if (tag in word) or (word in ["url", "com", "www", "www3", "admin", "backup", "content"]): 53 | return False 54 | 55 | return True 56 | 57 | def notMonth(self, word): 58 | month_tags = ["jan", "january", "feb", "february","mar", "march","apr", "april","may", "jun", "june", "jul", "july", "aug", "august","sep", "sept", "september","oct","october","nov","november","dec", "december","montag", "dienstag", "mittwoch", "donnerstag", "freitag", "samstag", "sontag"] 59 | 60 | if word in month_tags: 61 | return False 62 | 63 | return True 64 | 65 | def isnumeric(self, s): 66 | # Check if string is a numeric 67 | try: 68 | int(s.replace(".","").replace("-","").replace("+","")) 69 | return True 70 | except ValueError: 71 | try: 72 | long(s.replace(".","").replace("-","").replace("+","")) 73 | return True 74 | except ValueError: 75 | return False 76 | 77 | 78 | -------------------------------------------------------------------------------- /vis/html/js/snippetsviewer.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @fileoverview Manages a set of snippets for a frequent term appearing in URLs. 3 | * 4 | * @author (cesarpalomo@gmail.com) Cesar Palomo 5 | */ 6 | 7 | 8 | 9 | /** 10 | * Manages a set of snippets for a frequent term appearing in URLs. 11 | * 12 | * @param parentContainerId ID for div element for snippets viewer. 13 | */ 14 | var SnippetsViewer = function(parentContainerId) { 15 | this.parentContainerId = parentContainerId; 16 | 17 | // Items in viewer. 18 | this.items = []; 19 | }; 20 | 21 | 22 | /** 23 | * Clears all items in the viewer. 24 | */ 25 | SnippetsViewer.prototype.clear = function(lazyUpdate) { 26 | this.items = []; 27 | if (!lazyUpdate) { 28 | this.update(); 29 | } 30 | }; 31 | 32 | 33 | /** 34 | * Adds item to viewer: {term: xyz, snippet: abcd xyz nhonhonho} 35 | */ 36 | SnippetsViewer.prototype.addItem = function(snippet, lazyUpdate) { 37 | this.items.push(snippet); 38 | if (!lazyUpdate) { 39 | this.update(); 40 | } 41 | }; 42 | 43 | 44 | /** 45 | * Adds multiple items to viewer. 46 | */ 47 | SnippetsViewer.prototype.addItems = function(snippets, lazyUpdate) { 48 | this.items = this.items.concat(snippets); 49 | if (!lazyUpdate) { 50 | this.update(); 51 | } 52 | }; 53 | 54 | 55 | /** 56 | * Updates viewer. 57 | */ 58 | SnippetsViewer.prototype.update = function() { 59 | var viewer = this; 60 | var items = d3.select(this.parentContainerId) 61 | .selectAll('.item').data(this.items, function(item, i) { 62 | return item.term + '-' + i + '-' + item.snippet.substring(0, 30); 63 | }); 64 | 65 | // New items. 
66 | items.enter() 67 | .append('div') 68 | .classed('noselect', true) 69 | .classed('item', true) 70 | .on('click', function(item, i) { 71 | var elem = d3.select(this); 72 | elem.classed('dblclicked', !elem.classed('dblclicked')); 73 | viewer.onItemDoubleClick(item, i); 74 | window.open(item.url, '_blank'); 75 | }); 76 | 77 | // Removes missing items. 78 | items.exit().remove(); 79 | 80 | // Updates existing items. 81 | items 82 | .html(function(item, i) { 83 | return viewer.getItemInfo(item, i); 84 | }); 85 | items.each(function(item, i) { 86 | var tags = item.term['tags']; 87 | var isPositive = tags.indexOf('Positive') != -1; 88 | var isNegative = tags.indexOf('Negative') != -1; 89 | d3.select(this).selectAll('em') 90 | .classed('Positive', isPositive) 91 | .classed('Negative', isNegative); 92 | }); 93 | }; 94 | 95 | 96 | /** 97 | * Builds html content with info about an item in the viewer. 98 | */ 99 | SnippetsViewer.prototype.getItemInfo = function(item, i) { 100 | // TODO Add more details about term. 101 | return '

<div class="itemInfo">' + item.snippet + '</div>

';
 102 | };
 103 | 
 104 | 
 105 | /**
 106 | * Builds html content with buttons for labeling the relevancy of an item in
 107 | * the viewer, such as Yes, No, Maybe.
 108 | */
 109 | SnippetsViewer.prototype.getItemLabels = function(item, i) {
 110 | // TODO.
 111 | return '

<div class="itemLabels"><button>Yes</button> <button>No</button> <button>Maybe</button></div>

'; 112 | }; 113 | 114 | 115 | /** 116 | * Handles click in an item. 117 | */ 118 | SnippetsViewer.prototype.onItemClick = function(item, i) { 119 | // TODO. 120 | console.log('itemClicked ' + i); 121 | }; 122 | 123 | 124 | /** 125 | * Handles click in an item. 126 | */ 127 | SnippetsViewer.prototype.onItemDoubleClick = function(item, i) { 128 | // TODO. 129 | console.log('itemDoubleClicked ' + i); 130 | }; 131 | -------------------------------------------------------------------------------- /ranking/word2vec.py: -------------------------------------------------------------------------------- 1 | from pickle import load 2 | import numpy as np 3 | from os import environ 4 | 5 | from elastic.get_mtermvectors import getTermFrequency 6 | from preprocess import TextPreprocess 7 | from elastic.get_documents import get_documents_by_id 8 | 9 | class word2vec: 10 | def __init__(self, opt_docs = None, mapping=None, from_es=True, es_index = 'memex', es_doc_type = 'page', es = None): 11 | self.documents = opt_docs 12 | self.word2vec = None 13 | self.word_vec = None 14 | self.es = es 15 | 16 | if not from_es: 17 | f = open(environ['DDT_HOME']+'/ranking/D_cbow_pdw_8B.pkl', 'rb') 18 | self.word_vec = load(f) 19 | 20 | if opt_docs != None: 21 | self.process(opt_docs, mapping, es_index, es_doc_type, es) 22 | 23 | def get_word2vec(self): 24 | return [self.documents,self.word2vec] 25 | 26 | def get(self, word): 27 | if self.word_vec is None: 28 | results = get_documents_by_id([word], ["term"], "word_phrase_to_vec", "terms", self.es) 29 | if results is None: 30 | return None; 31 | else: 32 | return results[0]["term"][0] 33 | else: 34 | return self.word_vec.get(word) 35 | 36 | def process(self, documents, mapping=None, es_index = 'memex', es_doc_type = 'page', es = None): 37 | [data_tf, corpus, urls] = getTermFrequency(documents, mapping, es_index, es_doc_type, es) 38 | 39 | documents = urls 40 | 41 | word2vec_list_docs = [] 42 | urls = [] 43 | i = 0 44 | for doc in data_tf: 45 | if self.word_vec is None: 46 | results = get_documents_by_id(doc.keys(), ["term", "vector"], "word_phrase_to_vec", "terms", self.es) 47 | word_vec_doc = [res["vector"][0] for res in results] 48 | else: 49 | word_vec_doc = [self.word_vec[term] for term in doc.keys() if doc[term] >= 1 and not self.word_vec.get(term) is None] 50 | 51 | if word_vec_doc: 52 | m_word_vec = np.array(word_vec_doc).mean(axis=0) 53 | word2vec_list_docs.append(m_word_vec.tolist()) 54 | urls.append(documents[i]) 55 | i = i + 1 56 | 57 | self.documents = urls 58 | 59 | self.word2vec = np.array(word2vec_list_docs) 60 | 61 | return [self.documents,self.word2vec] 62 | 63 | def process_text(self, urls, documents): 64 | tp = TextPreprocess() 65 | 66 | word2vec_list_docs = [] 67 | final_urls = [] 68 | i = 0 69 | for text in documents: 70 | doc = tp.preprocess(text) 71 | if self.word_vec is None: 72 | terms = [term for term in doc.keys() if doc[term] > 5] 73 | results = get_documents_by_id(terms, ["term", "vector"], "word_phrase_to_vec", "terms", self.es) 74 | word_vec_doc = [res["vector"] for res in results] 75 | else: 76 | word_vec_doc = [self.word_vec[term] for term in doc.keys() if not self.word_vec.get(term) is None] 77 | #word_vec_doc = [self.word_vec[term]*doc[term] for term in doc.keys() if not self.word_vec.get(term) is None] 78 | 79 | if word_vec_doc: 80 | m_word_vec = np.array(word_vec_doc).mean(axis=0) 81 | #m_word_vec = np.sum(np.array(word_vec_doc), axis=0)/np.sum(np.array([doc[term] for term in doc.keys() if not self.word_vec.get(term) is None])) 82 | 
word2vec_list_docs.append(m_word_vec.tolist()) 83 | final_urls.append(urls[i]) 84 | i = i + 1 85 | 86 | self.documents = final_urls 87 | 88 | self.word2vec = np.array(word2vec_list_docs) 89 | 90 | return [self.documents,self.word2vec] 91 | -------------------------------------------------------------------------------- /vis/html/js/libs/d3.lasso.min.js: -------------------------------------------------------------------------------- 1 | d3.lasso=function(){function t(){function t(){u="",P.attr("d",null),v.attr("d",null),g=0,n[0].forEach(function(t){t.hoverSelected=!1,t.loopSelected=!1;var e=t.getBBox();t.lassoPoint={cx:Math.round(e.x+e.width/2),cy:Math.round(e.y+e.height/2),edges:{top:0,right:0,bottom:0,left:0},close_edges:{left:0,right:0}}}),1==a&&n.on("mouseover.lasso",function(){d3.select(this)[0][0].hoverSelected=!0}),i.start()}function c(){var t=d3.mouse(this)[0],a=d3.mouse(this)[1];""==u?(u=u+"M "+t+" "+a,h=[t,a],M.attr("cx",t).attr("cy",a).attr("r",7).attr("display",null)):u=u+" L "+t+" "+a,n[0].forEach(function(t){t.lassoPoint.close_edges={left:0,right:0}});var l=Math.sqrt(Math.pow(t-h[0],2)+Math.pow(a-h[1],2)),c="M "+t+" "+a+" L "+h[0]+" "+h[1];P.attr("d",u),o>=l?v.attr("display",null):v.attr("display","none"),s=o>=l?!0:!1;var d=u+"Z";x.attr("d",d);for(var y=P.node(),p=y.getTotalLength(),m=(y.getPointAtLength(g-1),g);p>=m;m++){var _=y.getPointAtLength(m),S={x:Math.round(100*_.x)/100,y:Math.round(100*_.y)/100},b=y.getPointAtLength(m-1),L={x:Math.round(100*b.x)/100,y:Math.round(100*b.y)/100};n[0].filter(function(t){var n;return t.lassoPoint.cy===S.y&&t.lassoPoint.cy!=L.y?(f={x:L.x,y:L.y},n=!1):t.lassoPoint.cy===S.y&&t.lassoPoint.cy===L.y?n=!1:t.lassoPoint.cy===L.y&&t.lassoPoint.cy!=S.y?n=e(t.lassoPoint.cy-S.y)!=e(t.lassoPoint.cy-f.y):(f={x:L.x,y:L.y},n=e(t.lassoPoint.cy-S.y)!=e(t.lassoPoint.cy-L.y)),n}).forEach(function(t){S.x>t.lassoPoint.cx&&(t.lassoPoint.edges.right=t.lassoPoint.edges.right+1),S.x=m;m++){var _=close_path_node.getPointAtLength(m),b=close_path_node.getPointAtLength(m-1);n[0].filter(function(t){return t.lassoPoint.cy==Math.round(_.y)}).forEach(function(t){Math.round(_.y)!=Math.round(b.y)&&Math.round(_.x)>t.lassoPoint.cx&&(t.lassoPoint.close_edges.right=1),Math.round(_.y)!=Math.round(b.y)&&Math.round(_.x)0&&(t.lassoPoint.edges.right+t.lassoPoint.close_edges.right)%2==1?!0:!1})}else n[0].forEach(function(t){t.loopSelected=!1});d3.selectAll(n[0].filter(function(t){return t.loopSelected&&s||t.hoverSelected})).attr("d",function(t){return t.possible=!0}),d3.selectAll(n[0].filter(function(t){return!(t.loopSelected&&s||t.hoverSelected)})).attr("d",function(t){return t.possible=!1}),i.draw(),g=p+1}function d(){n.on("mouseover.lasso",null),n.filter(function(t){return t.possible===!0}).attr("d",function(t){return t.selected=!0}),n.filter(function(t){return t.possible===!1}).attr("d",function(t){return t.selected=!1}),n.attr("d",function(t){return t.possible=!1}),P.attr("d",null),v.attr("d",null),M.attr("display","none"),i.end()}var u,h,f,g,y=d3.select(this[0][0]),p=y.append("g").attr("class","lasso"),P=p.append("path").attr("class","drawn"),v=p.append("path").attr("class","loop_close"),x=p.append("path").attr("display","none"),M=p.append("circle").attr("class","origin"),m=d3.behavior.drag().on("dragstart",t).on("drag",c).on("dragend",d);l.call(m)}function e(t){return t?0>t?-1:1:0}var n=null,o=75,r=!0,s=!1,a=!0,l=null,i={start:function(){},draw:function(){},end:function(){}};return t.items=function(e){return arguments.length?(n=e,n[0].forEach(function(t){var 
e=d3.select(t);"undefined"==typeof e.datum()?e.datum({possible:!1,selected:!1}):e.attr("d",function(t){return t.possible=!1,t.selected=!1,t})}),t):n},t.closePathDistance=function(e){return arguments.length?(o=e,t):o},t.closePathSelect=function(e){return arguments.length?(r=1==e?!0:!1,t):r},t.isPathClosed=function(e){return arguments.length?(s=1==e?!0:!1,t):s},t.hoverSelect=function(e){return arguments.length?(a=1==e?!0:!1,t):a},t.on=function(e,n){if(!arguments.length)return i;if(1===arguments.length)return i[e];var o=["start","draw","end"];return o.indexOf(e)>-1&&(i[e]=n),t},t.area=function(e){return arguments.length?(l=e,t):l},t}; 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Domain Discovery Tool 2 | 3 | This repository contains the Domain Discovery Tool (DDT) project. DDT is an interactive system that helps users explore and better understand a domain (or topic) as it is represented on the Web. It achieves this by integrating human insights with machine computation (data mining and machine learning) through visualization. DDT allows a domain expert to visualize and analyze pages returned by a search engine or a crawler, and easily provide feedback about relevance. This feedback, in turn, can be used to address two challenges: 4 | 5 | * Guide users in the process of domain understanding and help them construct effective queries to be issued to a search engine; and 6 | * Configure focused crawlers that efficiently search the Web for additional pages on the topic. DDT allows users to quickly select crawling seeds as well as positive and negatives required to create a page classifier for the focus topic. 7 | 8 | ## Installing on your machine 9 | 10 | Building and deploying the Domain Discovery Tool can either be done using its Makefile to create a local development environment, or automatically by conda or Docker for deployment. The conda build environment is currently only supported on 64-bit OS X and Linux. 11 | 12 | ### Local development 13 | 14 | First install conda, either through the Anaconda or miniconda installers provided by Continuum. You will also need Git and a Java Development Kit. These are system tools that are generally not provided by conda. 15 | 16 | Clone the DDT repository and enter it: 17 | 18 | ``` 19 | https://github.com/ViDA-NYU/domain_discovery_tool 20 | cd domain_discovery_tool 21 | ``` 22 | 23 | Use the `make` command to build DDT and download/install its dependencies. 24 | 25 | ``` 26 | make 27 | ``` 28 | 29 | After a successful installation, you can activate the DDT development environment: 30 | 31 | ``` 32 | source activate ddt 33 | ``` 34 | 35 | And (from the top-level `domain_discovery_tool` directory), start 36 | supervisord to run the web application and its associated services: 37 | 38 | ``` 39 | supervisord 40 | ``` 41 | 42 | Now you should be able to head to http://localhost:8084/ to interact 43 | with the tool. 44 | 45 | ### Docker development 46 | 47 | First, make sure you have Docker installed and running. Then, you can create an DDT image using the Dockerfile. Run the following command in the root folder of this project: 48 | 49 | docker build -t domain_discovery_tool . 50 | 51 | or download the latest published docker build (you do not need to clone the DDT repository in this case): 52 | 53 | docker pull vidanyu/ddt:latest 54 | 55 | Run the app using the Docker image that you just built (or pulled). 
This starts the elasticsearch and the DDT server: 56 | 57 | docker run -i -p 8084:8084 -p 9200:9200 -t /ddt/run_demo.sh 58 | 59 | To see the app running, go to: 60 | 61 | http://localhost:8084/seedcrawler 62 | 63 | Alternativaly, you can also specify an external ElasticSearch server address using an enviroment variable: 64 | 65 | docker run -p 8084:8084 -e "ELASTICSEARCH_SERVER=http://127.0.0.1:9200" -i -t 66 | 67 | ## Further Documentation 68 | 69 | [Detailed Description of the tool](https://s3.amazonaws.com/vida-nyu/DDT/domain_discovery_tool.pdf) 70 | 71 | [Demo Scripts and Videos](https://s3.amazonaws.com/vida-nyu/DDT/DomainDiscoveryToolDemoScripts.pdf) 72 | 73 | **Note:** To follow the demo videos download and use the following demo build version of DDT: 74 | 75 | ``` 76 | docker pull vidanyu/ddt:2.7.0-demo 77 | docker run -i -p 8084:8084 -p 9200:9200 -p 9001:9001 -t vidanyu/ddt:2.7.0-demo 78 | ``` 79 | 80 | ## Publication 81 | 82 | Yamuna Krishnamurthy, Kien Pham, Aecio Santos, and Juliana Friere. 2016. [Interactive Web Content Exploration for Domain Discovery](http://poloclub.gatech.edu/idea2016/papers/p64-krishnamurthy.pdf) (Interactive Data Exploration and Analytics ([IDEA](http://poloclub.gatech.edu/idea2016/)) Workshop at Knowledge Discovery and Data Mining ([KDD](http://www.kdd.org/kdd2016/)), San Francisco, CA). 83 | 84 | ## Contact 85 | 86 | DDT Development Team [ddt-dev@vgc.poly.edu] 87 | 88 | -------------------------------------------------------------------------------- /vis/bokeh_plots/test/test_cross_filter.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from pandas.util.testing import assert_frame_equal 4 | import pytest 5 | 6 | from ..cross_filter import (parse_es_response, calculate_query_correlation, 7 | calculate_graph_coords, duplicate_multi_rows, normalize) 8 | 9 | @pytest.fixture 10 | def es_response(): 11 | return [ 12 | {u'query': [u'apple', u'banana'], 13 | u'retrieved': [u'2016-04-16T00:06:35.292'], 14 | u'tag': [u'Relevant'], 15 | u'url': [u'http://www.politico.com/story/2016/04/apple-hires-cynthia-hogan-221937']}, 16 | {u'query': [u'apple', u'banana'], 17 | u'retrieved': [u'2016-04-16T00:06:36.135'], 18 | u'tag': [u'Irrelevant', u'Relevant'], 19 | u'url': [u'http://www.applevacations.com/']}, 20 | {u'query': [u'apple'], 21 | u'retrieved': [u'2016-04-16T00:06:34.806'], 22 | u'url': [u'http://www.reuters.com/article/us-apple-encryption-hearing-idUSKCN0XB2RU']}, 23 | {u'query': [u'banana'], 24 | u'retrieved': [u'2016-04-16T00:06:36.135'], 25 | u'tag': [u'Irrelevant', u'Relevant'], 26 | u'url': [u'http://www.bananavacations.com/']}, 27 | {u'query': [u'carrot'], 28 | u'retrieved': [u'2016-04-16T00:06:34.806'], 29 | u'url': [u'http://www.nytimes.com/article/us-apple-encryption-hearing-idUSKCN0XB2RU']} 30 | ] 31 | 32 | 33 | def test_parse_es_response(es_response): 34 | # note that data becomes ordered by `retrieved` field 35 | data = {'url': [u'http://www.reuters.com/article/us-apple-encryption-hearing-idUSKCN0XB2RU', 36 | u'http://www.nytimes.com/article/us-apple-encryption-hearing-idUSKCN0XB2RU', 37 | u'http://www.politico.com/story/2016/04/apple-hires-cynthia-hogan-221937', 38 | u'http://www.politico.com/story/2016/04/apple-hires-cynthia-hogan-221937', 39 | u'http://www.applevacations.com/', 40 | u'http://www.applevacations.com/', 41 | u'http://www.applevacations.com/', 42 | u'http://www.applevacations.com/', 43 | u'http://www.bananavacations.com/', 44 | 
u'http://www.bananavacations.com/'], 45 | 'query': [u'apple', u'carrot', u'apple', u'banana', u'apple', u'apple', u'banana', u'banana', u'banana', u'banana'], 46 | 'tag': ['Untagged', 'Untagged', u'Relevant', u'Relevant', u'Irrelevant', u'Relevant', u'Irrelevant', u'Relevant', u'Irrelevant', u'Relevant'], 47 | 'hostname': [u'reuters.com', u'nytimes.com', u'politico.com', u'politico.com', u'applevacations.com', u'applevacations.com', u'applevacations.com', u'applevacations.com', u'bananavacations.com', u'bananavacations.com'], 48 | 'tld': [u'.com', u'.com', u'.com', u'.com', u'.com', u'.com', u'.com', u'.com', u'.com', u'.com']} 49 | 50 | df = parse_es_response(es_response) 51 | 52 | assert df.to_dict('list') == data 53 | assert df.index.tz.tzname("") == 'UTC' 54 | 55 | def test_calculate_query_correlation(es_response): 56 | df = parse_es_response(es_response) 57 | 58 | graph = calculate_query_correlation(df, 'query') 59 | 60 | assert graph == {(u'apple', u'banana'): 1.0} 61 | 62 | def test_calculate_graph_coords(es_response): 63 | df = parse_es_response(es_response) 64 | 65 | graph = calculate_graph_coords(df, 'query') 66 | 67 | assert np.allclose(graph.x.tolist(), [-0.5, -0.5, 1.0]) 68 | assert np.allclose(graph.y.tolist(), [0.8660254037, -0.8660254037, 0.0]) 69 | assert graph.url.tolist() == [4, 5, 1] 70 | assert graph.index.tolist() == [u'apple', u'banana', u'carrot'] 71 | 72 | def test_duplicate_multitag_rows(es_response): 73 | df = parse_es_response(es_response) 74 | 75 | # Refactored to be called inside of parse_es_response 76 | # df = duplicate_multi_rows(df, 'tag') 77 | 78 | assert df.shape == (10,5) 79 | assert df.tag.tolist() == ['Untagged', 'Untagged', u'Relevant', u'Relevant', 80 | u'Irrelevant', u'Relevant', u'Irrelevant', 81 | u'Relevant', u'Irrelevant', u'Relevant'] 82 | 83 | def test_normalize(): 84 | assert np.allclose(normalize(pd.Series([1,2,3]), 3, 1.5).tolist(), [1.5, 2.0, 3.0]) 85 | -------------------------------------------------------------------------------- /vis/html/js/bokeh_controller.js: -------------------------------------------------------------------------------- 1 | /** 2 | * This module handles communication between the bokeh callbacks and the rest of 3 | * the DDT application. Many of these functions are helper functions called from 4 | * the bokeh CustomJS callbacks in `vis/bokeh_graphs/clustering.py`. 5 | */ 6 | (function(exports){ 7 | 8 | exports.inds = []; 9 | exports.plot = {}; 10 | 11 | // Takes urls and tags from Bokeh and changes their tags. 12 | exports.updateTags = function(selectedUrls, tag, action){ 13 | // Add the tag to tagsgallery if it does not exist. For example a custom tag 14 | exports.vis.tagsGallery.addItem(tag, false); 15 | exports.vis.tagsGallery.applyOrRemoveTag(tag, action, selectedUrls, false); 16 | } 17 | 18 | exports.addCustomTags = function(custom_tags){ 19 | for(var i in custom_tags){ 20 | if(custom_tags[i] != "Custom tags") 21 | exports.vis.tagsGallery.addItem(custom_tags[i], false); 22 | } 23 | } 24 | 25 | exports.crawlPages = function(selectedURLs, crawl_type){ 26 | exports.vis.crawlPages(selectedURLs, crawl_type); 27 | } 28 | 29 | // Shows the selected pages on the pageGallery below the plot. 30 | exports.showPages = function(inds){ 31 | exports.inds = inds; 32 | exports.vis.onBrushedPagesChanged(inds); 33 | } 34 | 35 | // Inserts the bokeh plot at the specified dom element. 
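 // (plotData is the server-rendered Bokeh markup, i.e. the plot's script and
 // div; it is injected into the #pages_landscape element selected below.)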
36 | exports.insertPlot = function(plotData){ 37 | $("#pages_landscape").html(plotData); 38 | } 39 | 40 | exports.BokehPlotKey = function(){ 41 | return Bokeh.index[Object.keys(Bokeh.index)[0]].model.children()[0] 42 | } 43 | 44 | 45 | exports.getGlyphRenderersByType = function(glyphType) { 46 | var allRenderers = exports.plot.get("renderers"); 47 | var renderers = []; 48 | $.each(exports.plot.get("renderers"), function(index, value) { 49 | if (value.attributes.hasOwnProperty("glyph") && value.attributes.glyph.type === glyphType) { 50 | renderers.push(value); 51 | } 52 | }); 53 | return renderers; 54 | }; 55 | 56 | 57 | exports.updatePlotColors = function(url, color) { 58 | var renderer = exports.getGlyphRenderersByType("Circle")[0]; 59 | var d = renderer.get("data_source").get("data"); 60 | var url_index = -1; 61 | var urls = [].concat.apply([], d.urls); 62 | for(var i in urls){ 63 | if(urls[i] == url){ 64 | url_index = i; 65 | break; 66 | } 67 | } 68 | d.color[url_index] = color; 69 | renderer.get("data_source").set("data", d); 70 | renderer.get("data_source").trigger("change"); 71 | }; 72 | 73 | 74 | // Gets the necessary javascript and HTML for rendering the bokeh plot into 75 | // the dom. 76 | exports.getPlotData = function(data){ 77 | Bokeh.index = {}; 78 | exports.insertPlot(data.plot); 79 | exports.plot = exports.BokehPlotKey() 80 | } 81 | 82 | 83 | exports.getEmptyPlot = function(){ 84 | $.ajax({ 85 | url: "/getEmptyBokehPlot", 86 | type: "GET", 87 | success: function(data){ 88 | exports.insertPlot(data); 89 | }, 90 | }); 91 | } 92 | 93 | exports.updateData = function(updated_tags){ 94 | // Update the data with the new tags 95 | var data = exports.vis.pagesLandscape.getPagesData(); 96 | for(var i in data){ 97 | var url = data[i]["url"]; 98 | if(updated_tags[url] != undefined){ 99 | data[i]["tags"] = updated_tags[url]["tags"]; 100 | exports.updatePlotColors(url, updated_tags[url]["color"]); 101 | } 102 | } 103 | exports.vis.pagesLandscape.setPagesData(data); 104 | exports.vis.pagesGallery.update(); 105 | } 106 | 107 | exports.clear = function(updated_tags){ 108 | exports.getEmptyPlot(); 109 | } 110 | 111 | // Connects getPlotData to the bokeh_insert_plot signal. 112 | SigSlots.connect(__sig__.bokeh_insert_plot, exports, exports.getPlotData); 113 | 114 | exports.getEmptyPlot(); 115 | 116 | // Statistics page functions and callbacks. 117 | $("#goto_statistics").on("click", function(){ 118 | var url = "/statistics?" + $.param({session: JSON.stringify(exports.vis.sessionInfo())}); 119 | $(this).attr("href", url); 120 | }); 121 | 122 | })(this.BokehPlots = {}); 123 | -------------------------------------------------------------------------------- /vis/html/cross_filter.html: -------------------------------------------------------------------------------- 1 | {% extends 'base.html' %} 2 | 3 | {% block navigation %} 4 | 18 | {% endblock navigation %} 19 | 20 | {% block content %} 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 |
30 |
31 |
{{ widgets_div['queries'] | safe }}
32 |
{{ widgets_div['tags'] | safe }}
33 |
{{ widgets_div['urls'] | safe }}
34 |
{{ widgets_div['tlds'] | safe }}
35 |
Start Date
36 |
37 |
38 | 39 | 40 | 41 | 42 |
43 |
44 |
End Date
45 |
46 |
47 | 48 | 49 | 50 | 51 |
52 |
53 |
54 |
55 |
56 | {% include 'cross_filter_plot_area.html' %} 57 |
58 |
59 |
60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 86 | {% endblock content %} 87 | -------------------------------------------------------------------------------- /vis/bokeh_plots/domains_dashboard.py: -------------------------------------------------------------------------------- 1 | from urlparse import urlparse 2 | from collections import Counter 3 | from operator import itemgetter 4 | import datetime 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import networkx as nx 9 | 10 | from bokeh.plotting import figure, show, output_file 11 | from bokeh.embed import components 12 | from bokeh.models import ColumnDataSource, HoverTool 13 | from bokeh.models.widgets import Panel, Tabs, Button, DataTable, DateFormatter, TableColumn 14 | from bokeh.models.widgets.layouts import HBox, VBox 15 | from bokeh.charts import Bar 16 | from bokeh.io import vform, vplot 17 | 18 | 19 | DOMAIN_PLOT_LIMIT = 10 20 | DOMAIN_TABLE_LIMIT = None 21 | 22 | ENDING_PLOT_LIMIT = 10 23 | ENDING_TABLE_LIMIT = None 24 | 25 | BAR_WIDTH = 0.4 26 | 27 | 28 | def pages_timeseries(response): 29 | parsed_dates = pd.Series(pd.to_datetime([x[1] for x in response]).order(), 30 | name="datetimes") 31 | hits = pd.Series(range(1, len(parsed_dates) + 1), name="hits") 32 | dates = pd.concat([hits, parsed_dates], axis=1).set_index("datetimes") 33 | dates = dates.resample("30S").dropna() 34 | plot = figure(plot_height=584, x_axis_type="datetime", x_axis_label="Time", 35 | y_axis_label="Fetched") 36 | plot.line(x=dates.index, y=dates["hits"]) 37 | return Panel(child=plot, title="Fetched") 38 | 39 | 40 | def endings_table(source): 41 | columns = [ 42 | TableColumn(field="x", title="Ending"), 43 | TableColumn(field="y", title="Count"), 44 | ] 45 | table = DataTable(source=source, 46 | columns=columns, width=400, height=280) 47 | return table 48 | 49 | 50 | def endings_plot(source): 51 | plot = Bar(source.data, values="y", label="x", 52 | title="Most Common URL Endings by Number", bar_width=BAR_WIDTH, 53 | height=584, xlabel="Endings", ylabel="Occurences") 54 | return plot 55 | 56 | 57 | def endings_dashboard(response): 58 | urls = [x[0][0] for x in response["pages"]] 59 | parsed_urls = [urlparse(x).hostname for x in urls] 60 | endings_counter = Counter([x[x.rfind("."):] for x in parsed_urls]).most_common(ENDING_PLOT_LIMIT) 61 | xendings = [x[0] for x in endings_counter] 62 | yendings = [y[1] for y in endings_counter] 63 | source = ColumnDataSource(data=dict(x=xendings, y=yendings)) 64 | 65 | table = VBox(children=[endings_table(source)]) 66 | plot = VBox(children=[endings_plot(source)]) 67 | return components(vplot(HBox(children=[table, plot]))) 68 | 69 | 70 | def domains_dashboard(response, extra_plots=None): 71 | """ 72 | Domains dashboard plot function. Takes an arguments for extra plots which 73 | will be added in a tab with the other plots. 74 | """ 75 | # Parsed Response Data 76 | urls = [x[0][0] for x in response["pages"]] 77 | parsed_urls = [urlparse(x).hostname for x in urls] 78 | 79 | # Domain names Bar chart. 
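# Counter.most_common(DOMAIN_PLOT_LIMIT) yields (hostname, count) pairs in descending order of count, e.g. [('example.com', 3), ('example.org', 1)].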
80 | domains_counter = Counter(parsed_urls).most_common(DOMAIN_PLOT_LIMIT) 81 | xdomains = [x[0] for x in domains_counter] 82 | ydomains = [y[1] for y in domains_counter] 83 | source_domains = ColumnDataSource(data=dict(x=xdomains, y=ydomains)) 84 | 85 | bar_domains = Bar(source_domains.data, values="y", label="x", title="Most Common Sites by Number", 86 | bar_width=BAR_WIDTH, height=584, xlabel="Sites", 87 | ylabel="Occurences") 88 | panel_domains = Panel(child=bar_domains, title="Sites") 89 | 90 | # Domain Information Table 91 | table_domains_counter = Counter(parsed_urls).most_common(DOMAIN_TABLE_LIMIT) 92 | xdomains_table = [x[0] for x in table_domains_counter] 93 | ydomains_table = [y[1] for y in table_domains_counter] 94 | source_table_domains = ColumnDataSource(data=dict(x=xdomains_table, 95 | y=ydomains_table)) 96 | 97 | columns_domain = [ 98 | TableColumn(field="x", title="Site Name"), 99 | TableColumn(field="y", title="Count"), 100 | ] 101 | data_table_domain = DataTable(source=source_table_domains, columns=columns_domain, width=400, 102 | height=280) 103 | 104 | # Add the plots and charts to a vform and organize them with VBox and HBox 105 | plot_tabs = Tabs(tabs=[panel_domains, extra_plots]) 106 | 107 | # Take the plot and table and arrange them in a hbox. 108 | vbox_tables = VBox(children=[data_table_domain]) 109 | vbox_plots = VBox(children=[plot_tabs]) 110 | hbox_dashboard = HBox(children=[vbox_tables, vbox_plots]) 111 | return components(vplot(hbox_dashboard)) 112 | -------------------------------------------------------------------------------- /seeds_generator/src/main/java/page_downloader/BingSearch.java: -------------------------------------------------------------------------------- 1 | import java.io.BufferedReader; 2 | import java.io.IOException; 3 | import java.io.InputStreamReader; 4 | import java.io.InputStream; 5 | import java.io.FileInputStream; 6 | import java.io.StringReader; 7 | import java.io.File; 8 | import java.io.FileReader; 9 | import java.io.PrintWriter; 10 | import java.net.HttpURLConnection; 11 | import java.net.MalformedURLException; 12 | import java.net.URL; 13 | import java.util.Properties; 14 | import java.util.ArrayList; 15 | import org.apache.commons.codec.binary.Base64; 16 | import org.xml.sax.InputSource; 17 | import org.w3c.dom.*; 18 | import javax.xml.parsers.DocumentBuilderFactory; 19 | import javax.xml.parsers.DocumentBuilder; 20 | 21 | public class BingSearch { 22 | 23 | private String accountKey; 24 | private Properties prop; 25 | 26 | public BingSearch(){ 27 | try{ 28 | prop = new Properties(); 29 | FileInputStream is = new FileInputStream("conf/config.properties"); 30 | prop.load(is); 31 | accountKey = prop.getProperty("ACCOUNTKEY"); 32 | } 33 | catch(Exception e){ 34 | e.printStackTrace(); 35 | prop = null; 36 | } 37 | } 38 | 39 | 40 | public ArrayList search(String query, String top, String es_index, String es_doc_type, String es_server){ 41 | System.out.println("Query: " + query); 42 | 43 | if (this.prop == null){ 44 | System.out.println("Error: config file is not loaded yet"); 45 | return null; 46 | } 47 | 48 | Download download = new Download(query, es_index, es_doc_type, es_server); 49 | 50 | ArrayList results = new ArrayList(); 51 | query = query.replaceAll(" ", "%20"); 52 | byte[] accountKeyBytes = Base64.encodeBase64((this.accountKey + ":" + this.accountKey).getBytes()); 53 | String accountKeyEnc = new String(accountKeyBytes); 54 | URL query_url; 55 | try { 56 | int chunk = 50; 57 | if (Integer.valueOf(top) < 50) 58 | chunk = 
Integer.valueOf(top); 59 | int skip_index = 0; 60 | while(chunk > 0){ 61 | query_url = new URL("https://api.datamarket.azure.com/Data.ashx/Bing/Search/v1/Web?Adult=%27Off%27&$skip=" + String.valueOf(skip_index*50) + "&Query=%27" + query + "%20filetype:html" + "%27&$top=" + String.valueOf(chunk)); 62 | System.out.println(query_url); 63 | 64 | HttpURLConnection conn = (HttpURLConnection)query_url.openConnection(); 65 | conn.setRequestMethod("GET"); 66 | conn.setRequestProperty("Authorization", "Basic " + accountKeyEnc); 67 | 68 | BufferedReader br = new BufferedReader(new InputStreamReader((conn.getInputStream()))); 69 | String output = ""; 70 | String line; 71 | while ((line = br.readLine()) != null) { 72 | output = output + line; 73 | } 74 | conn.disconnect(); 75 | 76 | DocumentBuilderFactory docBuilderFactory = DocumentBuilderFactory.newInstance(); 77 | DocumentBuilder docBuilder = docBuilderFactory.newDocumentBuilder(); 78 | InputSource is = new InputSource(new StringReader(output)); 79 | Document doc = docBuilder.parse(is); 80 | NodeList urls = doc.getElementsByTagName("d:Url"); 81 | int totalUrls = urls.getLength(); 82 | 83 | for (int i=0; i search(String query, String top, String es_index, String es_doc_type, String es_server){ 42 | System.out.println("Query: " + query); 43 | int nTop = Integer.valueOf(top); 44 | 45 | if (this.prop == null){ 46 | System.out.println("Error: config file is not loaded yet"); 47 | return null; 48 | } 49 | 50 | Download download = new Download(query, es_index, es_doc_type, es_server); 51 | 52 | ArrayList results = new ArrayList(); 53 | ArrayList titles = new ArrayList(); 54 | ArrayList snippets = new ArrayList(); 55 | query = "&num=" + String.valueOf(step) + "&key=" + accountKey + "&cx=" + cseID + "&q=" + query.replaceAll(" ", "%20"); 56 | URL query_url; 57 | 58 | try { 59 | int step = 10; //10 is the maximum number of results to return in each query 60 | for (int start = 1; start < nTop; start += step){ 61 | query_url = new URL("https://www.googleapis.com/customsearch/v1?start=" + String.valueOf(start) + query); 62 | System.out.println(query_url); 63 | 64 | HttpURLConnection conn = (HttpURLConnection)query_url.openConnection(); 65 | conn.setRequestMethod("GET"); 66 | BufferedReader br = new BufferedReader(new InputStreamReader((conn.getInputStream()))); 67 | String output = ""; 68 | String line; 69 | while ((line = br.readLine()) != null) { 70 | output = output + line; 71 | } 72 | conn.disconnect(); 73 | 74 | JSONObject obj = new JSONObject(output); 75 | JSONArray items = obj.getJSONArray("items"); 76 | 77 | for(int i=0; i < items.length(); ++i){ 78 | JSONObject item = items.getJSONObject(i); 79 | String link = (String)item.get("link"); 80 | results.add(link); 81 | download.addTask(link); 82 | 83 | titles.add((String)item.get("title")); 84 | snippets.add((String)item.get("snippet")); 85 | //All keys of the json object: snippet, htmlFormattedUrl, htmlTitle 86 | //kind, pagemap, displayLink, link, htmlSnippet, title, formatedUrl, cacheId 87 | } 88 | } 89 | 90 | } 91 | catch (MalformedURLException e) { 92 | e.printStackTrace(); 93 | } 94 | catch (IOException e) { 95 | e.printStackTrace(); 96 | } 97 | catch (Exception e){ 98 | e.printStackTrace(); 99 | } 100 | 101 | download.shutdown(); 102 | System.out.println("Number of results: " + String.valueOf(results.size())); 103 | 104 | //TODO: Return titles and snippets 105 | return results; 106 | } 107 | 108 | public static void main(String[] args) { 109 | 110 | String query = ""; //default 111 | String top = "50"; 
//default 112 | String es_index = "memex"; 113 | String es_doc_type = "page"; 114 | String es_server = "localhost"; 115 | 116 | int i = 0; 117 | while (i < args.length){ 118 | String arg = args[i]; 119 | if(arg.equals("-q")){ 120 | query = args[++i]; 121 | } else if(arg.equals("-t")){ 122 | top = args[++i]; 123 | } else if(arg.equals("-i")){ 124 | es_index = args[++i]; 125 | } else if(arg.equals("-d")){ 126 | es_doc_type = args[++i]; 127 | } else if(arg.equals("-s")){ 128 | es_server = args[++i]; 129 | }else { 130 | System.out.println("Unrecognized option"); 131 | break; 132 | } 133 | ++i; 134 | } 135 | 136 | System.out.println("Query = " + query); 137 | System.out.println("Get the top " + top + " results"); 138 | 139 | GoogleSearch bs = new GoogleSearch(); 140 | bs.search(query, top, es_index, es_doc_type, es_server); 141 | } 142 | } 143 | -------------------------------------------------------------------------------- /vis/html/crawlervis.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Crawler 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 |
40 | 41 |
42 |
43 | Crawler: 44 |
45 |
46 |
47 | 48 | 49 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 |
61 |
62 | 63 |
64 | Filter: 65 | 66 |

Cap:

67 | 68 |
69 | 70 | 71 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 |
83 |
84 | 85 |
86 | 87 |
88 | Pages 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | Tags 98 |
99 |
100 |
101 |
102 |
103 | 104 |
105 |
106 |
107 | 108 |
109 | 110 |
111 | Terms 112 |
113 |
114 |
115 |
116 |
117 |
118 | 119 | 125 | 126 | 127 | -------------------------------------------------------------------------------- /seeds_generator/src/main/java/page_downloader/Crawl.java: -------------------------------------------------------------------------------- 1 | import java.util.concurrent.Executors; 2 | import java.util.concurrent.ExecutorService; 3 | import java.io.FileReader; 4 | import java.io.BufferedReader; 5 | import java.io.IOException; 6 | import java.lang.InterruptedException; 7 | import java.util.concurrent.ExecutionException; 8 | import java.util.concurrent.TimeUnit; 9 | import org.elasticsearch.client.transport.TransportClient; 10 | import org.elasticsearch.common.transport.InetSocketTransportAddress; 11 | import org.elasticsearch.client.Client; 12 | import java.util.ArrayList; 13 | import java.util.Map; 14 | 15 | import org.elasticsearch.common.xcontent.XContentFactory; 16 | import org.elasticsearch.action.search.SearchResponse; 17 | import org.elasticsearch.search.SearchHit; 18 | import org.elasticsearch.action.search.SearchType; 19 | import org.elasticsearch.index.query.QueryBuilders; 20 | import org.elasticsearch.index.query.QueryBuilder; 21 | import org.elasticsearch.index.query.MissingFilterBuilder; 22 | import org.elasticsearch.index.query.FilterBuilders; 23 | import org.elasticsearch.action.index.IndexRequest; 24 | import org.elasticsearch.action.update.UpdateRequest; 25 | 26 | public class Crawl { 27 | 28 | private ArrayList urls = null; 29 | private String es_index = "memex"; 30 | private String es_doc_type = "page"; 31 | private String es_host = "localhost"; 32 | private Client client = null; 33 | private int poolSize = 100; 34 | private ExecutorService crawlForwardService = Executors.newFixedThreadPool(poolSize); 35 | private ExecutorService crawlBackwardService = Executors.newFixedThreadPool(poolSize); 36 | private int MAXSIZE = 100000; 37 | 38 | public Crawl(String es_index, String es_doc_type, String es_host){ 39 | if(es_host.isEmpty()) 40 | es_host = "localhost"; 41 | else { 42 | String[] parts = es_host.split(":"); 43 | if (parts.length == 2) 44 | es_host = parts[0]; 45 | else if(parts.length == 3) 46 | es_host = parts[1]; 47 | 48 | es_host = es_host.replaceAll("/",""); 49 | } 50 | 51 | this.es_host = es_host; 52 | 53 | this.client = new TransportClient().addTransportAddress(new InetSocketTransportAddress(es_host, 9300)); 54 | 55 | if(!es_index.isEmpty()) 56 | this.es_index = es_index; 57 | if(!es_doc_type.isEmpty()) 58 | this.es_doc_type = es_doc_type; 59 | 60 | } 61 | 62 | public void addForwardCrawlTask(ArrayList urls, String top){ 63 | try{ 64 | for (String f_url : urls) { 65 | SearchResponse searchResponse = client.prepareSearch(this.es_index) 66 | .setTypes(this.es_doc_type) 67 | .setSearchType(SearchType.DFS_QUERY_THEN_FETCH) 68 | .setFetchSource(new String[]{"url"}, null) 69 | .setQuery(QueryBuilders.termQuery("url", f_url)) 70 | .setSize(this.MAXSIZE) 71 | .setFrom(0).setExplain(true) 72 | .execute() 73 | .actionGet(); 74 | 75 | for (SearchHit hit : searchResponse.getHits()) { 76 | UpdateRequest updateRequest = new UpdateRequest(this.es_index, this.es_doc_type, hit.getId()) 77 | .doc(XContentFactory.jsonBuilder() 78 | .startObject() 79 | .field("crawled_forward", 1) 80 | .endObject()); 81 | this.client.update(updateRequest).get(); 82 | } 83 | } 84 | 85 | crawlForwardService.execute(new CrawlerInterface(urls, null, "forward", top, this.es_index, this.es_doc_type, this.es_host, this.client)); 86 | } catch (IOException e1) { 87 | // TODO Auto-generated catch 
block 88 | e1.printStackTrace(); 89 | } catch (InterruptedException e2) { 90 | // TODO Auto-generated catch block 91 | e2.printStackTrace(); 92 | } catch (ExecutionException e3) { 93 | // TODO Auto-generated catch block 94 | e3.printStackTrace(); 95 | } 96 | } 97 | 98 | public void addBackwardCrawlTask(ArrayList urls, String top){ 99 | try{ 100 | MissingFilterBuilder filter=FilterBuilders.missingFilter("crawled_backward"); 101 | QueryBuilder qb = QueryBuilders.filteredQuery(QueryBuilders.matchAllQuery(),filter); 102 | SearchResponse searchResponse = client.prepareSearch(this.es_index) 103 | .setTypes(this.es_doc_type) 104 | .setSearchType(SearchType.DFS_QUERY_THEN_FETCH) 105 | .setFetchSource(new String[]{"url", "crawled_backward"}, null) 106 | .setQuery(qb) 107 | .setSize(this.MAXSIZE) 108 | .setFrom(0).setExplain(true) 109 | .execute() 110 | .actionGet(); 111 | 112 | ArrayList not_crawled = new ArrayList(); 113 | for (SearchHit hit : searchResponse.getHits()) { 114 | Map map = hit.getSource(); 115 | String url = (String)map.get("url"); 116 | if(urls.contains(url)){ 117 | not_crawled.add(url); 118 | UpdateRequest updateRequest = new UpdateRequest(this.es_index, this.es_doc_type, hit.getId()) 119 | .doc(XContentFactory.jsonBuilder() 120 | .startObject() 121 | .field("crawled_backward", 1) 122 | .endObject()); 123 | this.client.update(updateRequest).get(); 124 | } 125 | } 126 | 127 | crawlBackwardService.execute(new CrawlerInterface(not_crawled, null, "backward", top, this.es_index, this.es_doc_type, this.es_host, this.client)); 128 | 129 | } catch (IOException e1) { 130 | // TODO Auto-generated catch block 131 | e1.printStackTrace(); 132 | } catch (InterruptedException e2) { 133 | // TODO Auto-generated catch block 134 | e2.printStackTrace(); 135 | } catch (ExecutionException e3) { 136 | // TODO Auto-generated catch block 137 | e3.printStackTrace(); 138 | } 139 | 140 | } 141 | 142 | public void shutdown(){ 143 | try { 144 | crawlForwardService.shutdown(); 145 | crawlBackwardService.shutdown(); 146 | crawlForwardService.awaitTermination(60 , TimeUnit.SECONDS); 147 | crawlBackwardService.awaitTermination(60 , TimeUnit.SECONDS); 148 | System.out.println("SHUTDOWN"); 149 | this.client.close(); 150 | } catch (InterruptedException e) { 151 | e.printStackTrace(); 152 | } 153 | } 154 | 155 | 156 | } 157 | -------------------------------------------------------------------------------- /vis/html/js/sigslot_core.js: -------------------------------------------------------------------------------- 1 | // Filename: sigslot_core.js 2 | // Purpose: provides an abstracted event handling system 3 | // Classes: NW_sigslot_registry, NW_SignalObj 4 | // Global Objects: __sig__ (aka, __signals_registry__) 5 | // Dependencies: none 6 | // Author: Alex Russell (slightlyoff@crhomium.org) 7 | 8 | // class definition for signal objects 9 | function NW_SignalObj(obj, fp){ 10 | this.fp = fp; 11 | this.obj = obj; 12 | this.slots = new Array(); 13 | this.addSlot = function(pobj, pfp){ 14 | var slot = null; 15 | if(__sig__.isSigFP(pfp)){ 16 | slot = __sig__.getSig(pfp); 17 | }else{ 18 | // whee! recursive data structures! 
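// (addSlot wraps each slot in its own NW_SignalObj and registers it, so a slot can act as a signal in turn and emit() can cascade down chains of slots.)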
19 | slot = new NW_SignalObj(pobj, pfp); 20 | __sig__.addSig(slot); 21 | } 22 | this.slots[this.slots.length]=slot; 23 | } 24 | 25 | this.rmSlot = function(pobj, pfp){ 26 | if(__sig__.isSigFP(pfp)){ 27 | var tslot = __sig__.getSig(pfp); 28 | for(var x in this.slots){ 29 | if(this.slots[x]==tslot){ 30 | delete this.slots[this.slots.length]; 31 | // make sure we only remove the first instance 32 | return true; 33 | } 34 | } 35 | }else{return false;} 36 | } 37 | } 38 | 39 | function NW_sigslot_registry(){ 40 | this.uID = 0; 41 | // conArr contains an arry of signal objects 42 | this.connArr = new Array(); 43 | 44 | // this method provides the mapping between signals and slots 45 | this.connect = function(sigObj, sigFP, slotObj, slotFP){ 46 | var isFound = this.isSigFP(sigFP); 47 | if(!isFound){ 48 | this.addSig(new NW_SignalObj(sigObj, sigFP)); 49 | } 50 | var csig = this.getSig(sigFP); 51 | csig.addSlot(slotObj, slotFP); 52 | } 53 | 54 | // this method provides the mapping between signals and slots 55 | this.disconnect = function(sigObj, sigFP, slotObj, slotFP){ 56 | var csig = this.getSig(sigFP); 57 | csig.rmSlot(slotObj, slotFP); 58 | } 59 | 60 | this.addSig = function(sigObj){ 61 | var cUID = this.uID++;// should be atomic anyway, but make sure 62 | this.connArr[cUID]=sigObj; 63 | } 64 | 65 | this.isSigFP = function(fp){ 66 | var isFound = false; 67 | for(var x in this.connArr){ if(this.connArr[x].fp == fp){isFound = true;} } 68 | return isFound; 69 | } 70 | 71 | this.getSig = function(fp){ 72 | for(var x in this.connArr){ if(this.connArr[x].fp == fp){return this.connArr[x];} } 73 | return null; 74 | } 75 | 76 | this.emit = function(fp){ 77 | for(x in this.connArr){ 78 | // find the signal object 79 | if(this.connArr[x].fp==fp){ 80 | var csig = this.connArr[x]; 81 | var args = arguments; 82 | var alen = args.length; 83 | // unroll the args array 84 | if(alen == 1){ 85 | (csig.fp).call(csig.obj); 86 | for(y in csig.slots){ 87 | this.emit(csig.slots[y].fp); 88 | } 89 | }else if(alen == 2){ 90 | (csig.fp).call(csig.obj, args[1]); 91 | for(y in csig.slots){ 92 | this.emit(csig.slots[y].fp, args[1]); 93 | } 94 | }else if(alen == 3){ 95 | (csig.fp).call(csig.obj, args[1], args[2]); 96 | for(y in csig.slots){ 97 | this.emit(csig.slots[y].fp, args[1], args[2]); 98 | } 99 | }else if(alen == 4){ 100 | (csig.fp).call(csig.obj, args[1], args[2], args[3]); 101 | for(y in csig.slots){ 102 | this.emit(csig.slots[y].fp, args[1], args[2], args[3]); 103 | } 104 | 105 | }else if(alen == 5){ 106 | (csig.fp).call(csig.obj, args[1], args[2], args[3], args[4]); 107 | for(y in csig.slots){ 108 | this.emit(csig.slots[y].fp, args[1], args[2], args[3], args[4]); 109 | } 110 | }else if(alen == 6){ 111 | (csig.fp).call(csig.obj, args[1], args[2], args[3], args[4], args[5]); 112 | for(y in csig.slots){ 113 | this.emit(csig.slots[y].fp, args[1], args[2], args[3], args[4], args[5]); 114 | } 115 | }else if(alen == 7){ 116 | (csig.fp).call(csig.obj, args[1], args[2], args[3], args[4], args[5], args[6]); 117 | for(y in csig.slots){ 118 | this.emit(csig.slots[y].fp, args[1], args[2], args[3], args[4], args[5], args[6]); 119 | } 120 | }else if(alen == 8){ 121 | (csig.fp).call(csig.obj, args[1], args[2], args[3], args[4], args[5], args[6], args[7]); 122 | for(y in csig.slots){ 123 | this.emit(csig.slots[y].fp, args[1], args[2], args[3], args[4], args[5], args[6], args[7]); 124 | } 125 | }else if(alen == 9){ 126 | (csig.fp).call(csig.obj, args[1], args[2], args[3], args[4], args[5], args[6], args[7], args[8]); 127 | for(y in 
csig.slots){ 128 | this.emit(csig.slots[y].fp, args[1], args[2], args[3], args[4], args[5], args[6], args[7], args[8]); 129 | } 130 | }else if(alen == 10){ 131 | (csig.fp).call(csig.obj, args[1], args[2], args[3], args[4], args[5], args[6], args[7], args[8], args[9]); 132 | for(y in csig.slots){ 133 | this.emit(csig.slots[y].fp, args[1], args[2], args[3], args[4], args[5], args[6], args[7], args[8], args[9]); 134 | } 135 | }else if(alen == 11){ 136 | (csig.fp).call(csig.obj, args[1], args[2], args[3], args[4], args[5], args[6], args[7], args[8], args[9], args[10]); 137 | for(y in csig.slots){ 138 | this.emit(csig.slots[y].fp, args[1], args[2], args[3], args[4], args[5], args[6], args[7], args[8], args[9], args[10]); 139 | } 140 | }else if(alen == 12){ 141 | (csig.fp).call(csig.obj, args[1], args[2], args[3], args[4], args[5], args[6], args[7], args[8], args[9], args[10], args[11]); 142 | for(y in csig.slots){ 143 | this.emit(csig.slots[y].fp, args[1], args[2], args[3], args[4], args[5], args[6], args[7], args[8], args[9], args[10], args[11]); 144 | } 145 | }else if(alen == 13){ 146 | // if your function needs more than 12 args, you need to learn how to write better code =) 147 | (csig.fp).call(csig.obj, args[1], args[2], args[3], args[4], args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12]); 148 | for(y in csig.slots){ 149 | this.emit(csig.slots[y].fp, args[1], args[2], args[3], args[4], args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12]); 150 | } 151 | } 152 | } 153 | } 154 | } 155 | } 156 | 157 | var __signals_registry__ = new NW_sigslot_registry(); 158 | var __sig__ = __signals_registry__; // alias 159 | -------------------------------------------------------------------------------- /elastic/get_documents.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | from os import environ 3 | import sys 4 | from config import es as default_es 5 | 6 | def get_documents(terms, term_field, fields=["text"], es_index='memex', es_doc_type='page', es=None): 7 | if es is None: 8 | es = default_es 9 | 10 | results = {} 11 | 12 | if len(terms) > 0: 13 | 14 | for term in terms: 15 | query = { 16 | "query": { 17 | "term": { 18 | term_field: term 19 | } 20 | }, 21 | "fields": fields 22 | } 23 | 24 | res = es.search(body=query, 25 | index=es_index, 26 | doc_type=es_doc_type) 27 | 28 | if res['hits']['hits']: 29 | hits = res['hits']['hits'] 30 | 31 | records = [] 32 | for hit in hits: 33 | record = {} 34 | if not hit.get('fields') is None: 35 | record = hit['fields'] 36 | record['id'] =hit['_id'] 37 | records.append(record) 38 | results[term] = records 39 | 40 | return results 41 | 42 | 43 | def get_more_like_this(urls, fields=[], pageCount=200, es_index='memex', es_doc_type='page', es=None): 44 | if es is None: 45 | es = default_es 46 | 47 | docs = [{"_index": es_index, "_type": es_doc_type, "_id": url} for url in urls] 48 | 49 | with open(environ['DDT_HOME']+'/elastic/stopwords.txt', 'r') as f: 50 | stopwords = [word.strip() for word in f.readlines()] 51 | 52 | query = { 53 | "query":{ 54 | "more_like_this": { 55 | "fields" : ["text"], 56 | "docs": docs, 57 | "min_term_freq": 1, 58 | "stop_words": stopwords 59 | } 60 | }, 61 | "fields": fields, 62 | "size": pageCount 63 | } 64 | 65 | res = es.search(body=query, index = es_index, doc_type = es_doc_type) 66 | hits = res['hits']['hits'] 67 | 68 | results = [] 69 | for hit in hits: 70 | fields = hit['fields'] 71 | fields['id'] = hit['_id'] 72 | fields['score'] 
= hit['_score'] 73 | results.append(fields) 74 | 75 | return results 76 | 77 | def get_most_recent_documents(opt_maxNumberOfPages = 200, mapping=None, fields = [], opt_filter = None, es_index = 'memex', es_doc_type = 'page', es = None): 78 | 79 | if mapping is None: 80 | print "No mappings found" 81 | return [] 82 | 83 | if es is None: 84 | es = default_es 85 | 86 | query = { 87 | "size": opt_maxNumberOfPages, 88 | "sort": [ 89 | { 90 | mapping["timestamp"]: { 91 | "order": "desc" 92 | } 93 | } 94 | ] 95 | } 96 | 97 | match_q = { 98 | "match_all": {} 99 | } 100 | 101 | if mapping.get("content_type") is not None: 102 | match_q = { 103 | "match": { 104 | mapping["content_type"]: "text/html" 105 | } 106 | } 107 | 108 | 109 | if opt_filter is None: 110 | query["query"] = { 111 | "filtered": { 112 | "query": match_q, 113 | "filter":{ 114 | "exists": { 115 | "field": mapping['text'] 116 | } 117 | } 118 | } 119 | } 120 | else: 121 | query["query"] = { 122 | "query_string": { 123 | "query": "(" + mapping['text'] + ":" + opt_filter.replace('"', '\\"') + ")" 124 | } 125 | } 126 | 127 | if len(fields) > 0: 128 | query["fields"] = fields 129 | 130 | res = es.search(body=query, index = es_index, doc_type = es_doc_type) 131 | hits = res['hits']['hits'] 132 | 133 | results = [] 134 | for hit in hits: 135 | fields = hit['fields'] 136 | fields['id'] = hit['_id'] 137 | results.append(fields) 138 | 139 | return results 140 | 141 | def get_all_ids(pageCount = 100000, fields=[], es_index = 'memex', es_doc_type = 'page', es = None): 142 | if es is None: 143 | es = default_es 144 | 145 | query = { 146 | "query": { 147 | "match_all": {} 148 | }, 149 | "fields": fields 150 | } 151 | 152 | try: 153 | res = es.search(body=query, index = es_index, doc_type = es_doc_type, size = pageCount, request_timeout=600) 154 | hits = res['hits']['hits'] 155 | 156 | results = [] 157 | for hit in hits: 158 | fields = hit['fields'] 159 | fields['id'] = hit['_id'] 160 | results.append(fields) 161 | 162 | return results 163 | except Exception: 164 | print "Unexpected error:", sys.exc_info()[0] 165 | print es_index 166 | return [] 167 | 168 | def get_documents_by_id(ids=[], fields=[], es_index = 'memex', es_doc_type = 'page', es = None): 169 | if es is None: 170 | es = default_es 171 | 172 | query = { 173 | "query": { 174 | "ids": { 175 | "values": ids 176 | } 177 | }, 178 | "fields": fields 179 | } 180 | 181 | res = es.search(body=query, index = es_index, doc_type = es_doc_type, size=len(ids)) 182 | 183 | hits = res['hits']['hits'] 184 | 185 | results = [] 186 | for hit in hits: 187 | if hit.get('fields'): 188 | fields = hit['fields'] 189 | fields['id'] = hit['_id'] 190 | results.append(fields) 191 | return results 192 | 193 | def get_plotting_data(pageCount=200, es_index = 'memex', es_doc_type = 'page', es = None): 194 | if es is None: 195 | es = default_es 196 | 197 | res = es.search(index=es_index, doc_type = es_doc_type, size=pageCount, fields=["retrieved", "url", "tag", "query"]) 198 | 199 | fields = [] 200 | for item in res['hits']['hits']: 201 | if item['fields'].get('tag') is not None: 202 | if "" in item['fields']['tag']: 203 | item['fields'].pop('tag') 204 | fields.append(item['fields']) 205 | 206 | return fields 207 | 208 | if __name__ == "__main__": 209 | urls = [] 210 | with open(environ['MEMEX_HOME']+'/seed_crawler/seeds_generator/results.txt', 'r') as f: 211 | urls = f.readlines() 212 | urls = [url.strip() for url in urls] 213 | 214 | docs = get_documents(urls, 'url') 215 |
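As a quick orientation to the helpers above, here is a minimal usage sketch for `get_documents` (a sketch only: the sample terms, the `query` term field, and the printed fields are illustrative assumptions, not part of the module):

```
# Illustrative sketch, not part of the module: look up pages whose `query`
# field matches each search term, retrieving their url and tag fields.
from elastic.get_documents import get_documents

results = get_documents(terms=['apple', 'banana'],
                        term_field='query',
                        fields=['url', 'tag'],
                        es_index='memex',
                        es_doc_type='page')

for term, records in results.items():
    for record in records:
        # Elasticsearch returns each requested field as a list; the helper
        # adds the document id under 'id'.
        print term, record['id'], record.get('url', [''])[0]
```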
-------------------------------------------------------------------------------- /vis/html/css/dropdowns-enhancement.min.css: -------------------------------------------------------------------------------- 1 | .dropdown-menu>li>label{display:block;padding:3px 20px;clear:both;font-weight:400;line-height:1.42857143;color:#333;white-space:nowrap}.dropdown-menu>li>label:hover,.dropdown-menu>li>label:focus{text-decoration:none;color:#262626;background-color:#f5f5f5}.dropdown-menu>li>input:checked~label,.dropdown-menu>li>input:checked~label:hover,.dropdown-menu>li>input:checked~label:focus,.dropdown-menu>.active>label,.dropdown-menu>.active>label:hover,.dropdown-menu>.active>label:focus{color:#fff;text-decoration:none;outline:0;background-color:#428bca}.dropdown-menu>li>input[disabled]~label,.dropdown-menu>li>input[disabled]~label:hover,.dropdown-menu>li>input[disabled]~label:focus,.dropdown-menu>.disabled>label,.dropdown-menu>.disabled>label:hover,.dropdown-menu>.disabled>label:focus{color:#999}.dropdown-menu>li>input[disabled]~label:hover,.dropdown-menu>li>input[disabled]~label:focus,.dropdown-menu>.disabled>label:hover,.dropdown-menu>.disabled>label:focus{text-decoration:none;background-color:transparent;background-image:none;filter:progid:DXImageTransform.Microsoft.gradient(enabled=false);cursor:not-allowed}.dropdown-menu>li>label{margin-bottom:0;cursor:pointer}.dropdown-menu>li>input[type=radio],.dropdown-menu>li>input[type=checkbox]{display:none;position:absolute;top:-9999em;left:-9999em}.dropdown-menu>li>label:focus,.dropdown-menu>li>input:focus~label{outline:thin dotted;outline:5px auto -webkit-focus-ring-color;outline-offset:-2px}.dropdown-menu.pull-right{right:0;left:auto}.dropdown-menu.pull-top{bottom:100%;top:auto;margin:0 0 2px;-webkit-box-shadow:0 -6px 12px rgba(0,0,0,.175);box-shadow:0 -6px 12px rgba(0,0,0,.175)}.dropdown-menu.pull-center{right:50%;left:auto}.dropdown-menu.pull-middle{right:100%;margin:0 2px 0 0;box-shadow:-5px 0 10px rgba(0,0,0,.2);left:auto}.dropdown-menu.pull-middle.pull-right{right:auto;left:100%;margin:0 0 0 2px;box-shadow:5px 0 10px rgba(0,0,0,.2)}.dropdown-menu.pull-middle.pull-center{right:50%;margin:0;box-shadow:0 0 10px rgba(0,0,0,.2)}.dropdown-menu.bullet{margin-top:8px}.dropdown-menu.bullet:before{width:0;height:0;content:'';display:inline-block;position:absolute;border-color:transparent;border-style:solid;-webkit-transform:rotate(360deg);border-width:0 7px 7px;border-bottom-color:#ccc;border-bottom-color:rgba(0,0,0,.15);top:-7px;left:9px}.dropdown-menu.bullet:after{width:0;height:0;content:'';display:inline-block;position:absolute;border-color:transparent;border-style:solid;-webkit-transform:rotate(360deg);border-width:0 6px 
6px;border-bottom-color:#fff;top:-6px;left:10px}.dropdown-menu.bullet.pull-right:before{left:auto;right:9px}.dropdown-menu.bullet.pull-right:after{left:auto;right:10px}.dropdown-menu.bullet.pull-top{margin-top:0;margin-bottom:8px}.dropdown-menu.bullet.pull-top:before{top:auto;bottom:-7px;border-bottom-width:0;border-top-width:7px;border-top-color:#ccc;border-top-color:rgba(0,0,0,.15)}.dropdown-menu.bullet.pull-top:after{top:auto;bottom:-6px;border-bottom:0;border-top-width:6px;border-top-color:#fff}.dropdown-menu.bullet.pull-center:before{left:auto;right:50%;margin-right:-7px}.dropdown-menu.bullet.pull-center:after{left:auto;right:50%;margin-right:-6px}.dropdown-menu.bullet.pull-middle{margin-right:8px}.dropdown-menu.bullet.pull-middle:before{top:50%;left:100%;right:auto;margin-top:-7px;border-right-width:0;border-bottom-color:transparent;border-top-width:7px;border-left-color:#ccc;border-left-color:rgba(0,0,0,.15)}.dropdown-menu.bullet.pull-middle:after{top:50%;left:100%;right:auto;margin-top:-6px;border-right-width:0;border-bottom-color:transparent;border-top-width:6px;border-left-color:#fff}.dropdown-menu.bullet.pull-middle.pull-right{margin-right:0;margin-left:8px}.dropdown-menu.bullet.pull-middle.pull-right:before{left:-7px;border-left-width:0;border-right-width:7px;border-right-color:#ccc;border-right-color:rgba(0,0,0,.15)}.dropdown-menu.bullet.pull-middle.pull-right:after{left:-6px;border-left-width:0;border-right-width:6px;border-right-color:#fff}.dropdown-menu.bullet.pull-middle.pull-center{margin-left:0;margin-right:0}.dropdown-menu.bullet.pull-middle.pull-center:before{border:0;display:none}.dropdown-menu.bullet.pull-middle.pull-center:after{border:0;display:none}.dropdown-submenu{position:relative}.dropdown-submenu>.dropdown-menu{top:0;left:100%;margin-top:-6px;margin-left:-1px;border-top-left-radius:0}.dropdown-submenu>a:before{display:block;float:right;width:0;height:0;content:"";margin-top:6px;margin-right:-8px;border-width:4px 0 4px 4px;border-style:solid;border-left-style:dashed;border-top-color:transparent;border-bottom-color:transparent}@media (max-width:767px){.navbar-nav .dropdown-submenu>a:before{margin-top:8px;border-color:inherit;border-style:solid;border-width:4px 4px 0;border-left-color:transparent;border-right-color:transparent}.navbar-nav .dropdown-submenu>a{padding-left:40px}.navbar-nav>.open>.dropdown-menu>.dropdown-submenu>.dropdown-menu>li>a,.navbar-nav>.open>.dropdown-menu>.dropdown-submenu>.dropdown-menu>li>label{padding-left:35px}.navbar-nav>.open>.dropdown-menu>.dropdown-submenu>.dropdown-menu>li>.dropdown-menu>li>a,.navbar-nav>.open>.dropdown-menu>.dropdown-submenu>.dropdown-menu>li>.dropdown-menu>li>label{padding-left:45px}.navbar-nav>.open>.dropdown-menu>.dropdown-submenu>.dropdown-menu>li>.dropdown-menu>li>.dropdown-menu>li>a,.navbar-nav>.open>.dropdown-menu>.dropdown-submenu>.dropdown-menu>li>.dropdown-menu>li>.dropdown-menu>li>label{padding-left:55px}.navbar-nav>.open>.dropdown-menu>.dropdown-submenu>.dropdown-menu>li>.dropdown-menu>li>.dropdown-menu>li>.dropdown-menu>li>a,.navbar-nav>.open>.dropdown-menu>.dropdown-submenu>.dropdown-menu>li>.dropdown-menu>li>.dropdown-menu>li>.dropdown-menu>li>label{padding-left:65px}.navbar-nav>.open>.dropdown-menu>.dropdown-submenu>.dropdown-menu>li>.dropdown-menu>li>.dropdown-menu>li>.dropdown-menu>li>.dropdown-menu>li>a,.navbar-nav>.open>.dropdown-menu>.dropdown-submenu>.dropdown-menu>li>.dropdown-menu>li>.dropdown-menu>li>.dropdown-menu>li>.dropdown-menu>li>label{padding-left:75px}}.navbar-default .navbar-nav 
.open>.dropdown-menu>.dropdown-submenu.open>a,.navbar-default .navbar-nav .open>.dropdown-menu>.dropdown-submenu.open>a:hover,.navbar-default .navbar-nav .open>.dropdown-menu>.dropdown-submenu.open>a:focus{background-color:#e7e7e7;color:#555}@media (max-width:767px){.navbar-default .navbar-nav .open>.dropdown-menu>.dropdown-submenu.open>a:before{border-top-color:#555}}.navbar-inverse .navbar-nav .open>.dropdown-menu>.dropdown-submenu.open>a,.navbar-inverse .navbar-nav .open>.dropdown-menu>.dropdown-submenu.open>a:hover,.navbar-inverse .navbar-nav .open>.dropdown-menu>.dropdown-submenu.open>a:focus{background-color:#080808;color:#fff}@media (max-width:767px){.navbar-inverse .navbar-nav .open>.dropdown-menu>.dropdown-submenu.open>a:before{border-top-color:#fff}} -------------------------------------------------------------------------------- /elastic/get_mtermvectors.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from sklearn.feature_extraction import DictVectorizer 3 | import nltk 4 | import math 5 | from sets import Set 6 | import time 7 | import numpy as np 8 | import operator 9 | 10 | from config import es as default_es 11 | from elastic.get_documents import get_documents_by_id 12 | 13 | ENGLISH_STOPWORDS = set(nltk.corpus.stopwords.words('english')) 14 | MAX_TERMS = 2000 15 | 16 | def pos_filter(pos_tags=['NN', 'NNS', 'NNP', 'NNPS', 'VBN', 'JJ'], docterms=[]): 17 | tagged = nltk.pos_tag(docterms) 18 | valid_words = [tag[0] for tag in tagged if tag[1] in pos_tags] 19 | return valid_words 20 | 21 | def tfidf(tf, df, n_doc): 22 | idf = math.log(n_doc / float(df)) 23 | return tf * idf 24 | 25 | def terms_from_es_json(doc, rm_stopwords=True, rm_numbers=True, termstatistics = False, term_freq = 0, mapping=None, es=None): 26 | terms = {} 27 | docterms = doc["term_vectors"][mapping['text']]["terms"] 28 | n_doc = doc["term_vectors"][mapping['text']]["field_statistics"]["doc_count"] 29 | valid_words = docterms.keys() 30 | 31 | if rm_stopwords: 32 | valid_words = [k for k in valid_words if k not in ENGLISH_STOPWORDS and (len(k) > 2)] 33 | 34 | if rm_numbers: 35 | valid_words = [k for k in valid_words if not k.lstrip('-').replace('.','',1).replace(',','',1).isdigit()] 36 | 37 | if termstatistics == True: 38 | terms = {term: {'tfidf':tfidf(docterms[term]["term_freq"], docterms[term]["doc_freq"], n_doc), 39 | 'tf': docterms[term]["term_freq"], 40 | 'ttf': docterms[term]["ttf"], 41 | } for term in valid_words if docterms[term]["ttf"] > term_freq 42 | } 43 | else: 44 | terms = { term: {'tf': docterms[term]['term_freq']} for term in valid_words if docterms[term]["term_freq"] > term_freq} 45 | 46 | # Restrict the number of terms for large documents 47 | if len(terms.keys()) > MAX_TERMS: 48 | sorted_terms = [] 49 | if termstatistics == True: 50 | terms_tfidf = {term:terms[term]["tfidf"] for term in terms.keys()} 51 | sorted_terms = sorted(terms_tfidf.items(), key=operator.itemgetter(1), reverse=True) 52 | else: 53 | terms_tf = {term:terms[term]["tf"] for term in terms.keys()} 54 | sorted_terms = sorted(terms_tf.items(), key=operator.itemgetter(1), reverse=True) 55 | 56 | terms = {item[0]: terms[item[0]] for item in sorted_terms[0:MAX_TERMS]} 57 | 58 | return terms 59 | 60 | 61 | def getTermFrequency(all_hits, rm_stopwords=True, rm_numbers=True, pos_tags=[], term_freq=0, mapping=None, es_index='memex', es_doc_type='page', es=None): 62 | if es is None: 63 | es = default_es 64 | 65 | docs = [] 66 | stats = [] 67 | corpus = [] 68 | 69 | once 
= True 70 | for i in range(0, len(all_hits), 10): 71 | hits = all_hits[i:i+10] 72 | 73 | term_res = es.mtermvectors(index=es_index, 74 | doc_type=es_doc_type, 75 | fields=mapping['text'], 76 | ids=hits) 77 | 78 | for doc in term_res['docs']: 79 | if doc.get('term_vectors'): 80 | if mapping['text'] in doc['term_vectors']: 81 | docs.append(doc['_id']) 82 | res = terms_from_es_json(doc=doc, rm_stopwords=rm_stopwords, rm_numbers=rm_numbers, term_freq=term_freq, mapping=mapping) 83 | stats.append(res) 84 | 85 | tfs = [] 86 | for stat in stats: 87 | tf={} 88 | tf={k:stat[k]['tf'] for k in stat.keys()} 89 | tfs.append(tf) 90 | 91 | v_tf = DictVectorizer() 92 | data = v_tf.fit_transform(tfs).toarray() 93 | corpus = v_tf.get_feature_names() 94 | 95 | if len(pos_tags) > 0: 96 | filtered_words = pos_filter(pos_tags, corpus) 97 | indices = [corpus.index(term) for term in corpus if term not in filtered_words] 98 | corpus = np.delete(corpus, indices) 99 | corpus = corpus.tolist() 100 | data = np.delete(data, indices, 1) 101 | 102 | return [data, corpus, docs] 103 | 104 | 105 | def getTermStatistics(all_hits, rm_stopwords=True, rm_numbers=True, pos_tags=[], term_freq=0, num_terms=MAX_TERMS, mapping=None, es_index='memex', es_doc_type='page', es=None): 106 | if es is None: 107 | es = default_es 108 | 109 | stats = [] 110 | docs = [] 111 | 112 | ttf = {} 113 | for i in range(0, len(all_hits), 10): 114 | hits = all_hits[i:i+10] 115 | 116 | term_res = es.mtermvectors(index=es_index, 117 | doc_type=es_doc_type, 118 | term_statistics=True, 119 | fields=mapping['text'], 120 | ids=hits) 121 | 122 | for doc in term_res['docs']: 123 | if doc.get('term_vectors'): 124 | if mapping['text'] in doc['term_vectors']: 125 | docs.append(doc['_id']) 126 | res = terms_from_es_json(doc=doc, rm_stopwords=rm_stopwords, rm_numbers=rm_numbers, termstatistics=True, term_freq=term_freq, mapping=mapping) 127 | stats.append(res) 128 | for k in res.keys(): 129 | ttf[k] = res[k]['ttf'] 130 | 131 | 132 | tfidfs = [] 133 | tfs = [] 134 | for stat in stats: 135 | tfidf={k: stat[k]['tfidf'] for k in stat.keys()} 136 | tfidfs.append(tfidf) 137 | tf={k:stat[k]['tf'] for k in stat.keys()} 138 | tfs.append(tf) 139 | 140 | v_tfidf = DictVectorizer() 141 | v_tf = DictVectorizer() 142 | 143 | data = v_tfidf.fit_transform(tfidfs).toarray() 144 | corpus = v_tfidf.get_feature_names() 145 | tf_data = v_tf.fit_transform(tfs).toarray() 146 | 147 | if len(pos_tags) > 0: 148 | filtered_words = pos_filter(pos_tags, corpus) 149 | indices = [corpus.index(term) for term in corpus if term not in filtered_words] 150 | corpus = np.delete(corpus, indices) 151 | corpus = corpus.tolist() 152 | data = np.delete(data, indices, 1) 153 | tf_data = np.delete(tf_data, indices, 1) 154 | 155 | if len(corpus) > MAX_TERMS: 156 | mean_tfidf = np.mean(data, axis=0) 157 | indices = np.argsort(mean_tfidf)[::-1] 158 | corpus = [corpus[i] for i in indices] 159 | data = data[:, indices] 160 | tf_data = tf_data[:, indices] 161 | 162 | ttf = {key:value for key, value in ttf.iteritems() if key in corpus} 163 | 164 | result = [data, tf_data, ttf, corpus, docs] 165 | 166 | del tfidfs 167 | del tfs 168 | 169 | return result 170 | 171 | -------------------------------------------------------------------------------- /vis/html/libs/bootstrap-datetimepicker-4.15.35/css/bootstrap-datetimepicker.min.css: -------------------------------------------------------------------------------- 1 | /*! 
2 | * Datetimepicker for Bootstrap 3 3 | * version : 4.15.35 4 | * https://github.com/Eonasdan/bootstrap-datetimepicker/ 5 | */.bootstrap-datetimepicker-widget{list-style:none}.bootstrap-datetimepicker-widget.dropdown-menu{margin:2px 0;padding:4px;width:19em}@media (min-width:768px){.bootstrap-datetimepicker-widget.dropdown-menu.timepicker-sbs{width:38em}}@media (min-width:992px){.bootstrap-datetimepicker-widget.dropdown-menu.timepicker-sbs{width:38em}}@media (min-width:1200px){.bootstrap-datetimepicker-widget.dropdown-menu.timepicker-sbs{width:38em}}.bootstrap-datetimepicker-widget.dropdown-menu:before,.bootstrap-datetimepicker-widget.dropdown-menu:after{content:'';display:inline-block;position:absolute}.bootstrap-datetimepicker-widget.dropdown-menu.bottom:before{border-left:7px solid transparent;border-right:7px solid transparent;border-bottom:7px solid #ccc;border-bottom-color:rgba(0,0,0,0.2);top:-7px;left:7px}.bootstrap-datetimepicker-widget.dropdown-menu.bottom:after{border-left:6px solid transparent;border-right:6px solid transparent;border-bottom:6px solid white;top:-6px;left:8px}.bootstrap-datetimepicker-widget.dropdown-menu.top:before{border-left:7px solid transparent;border-right:7px solid transparent;border-top:7px solid #ccc;border-top-color:rgba(0,0,0,0.2);bottom:-7px;left:6px}.bootstrap-datetimepicker-widget.dropdown-menu.top:after{border-left:6px solid transparent;border-right:6px solid transparent;border-top:6px solid white;bottom:-6px;left:7px}.bootstrap-datetimepicker-widget.dropdown-menu.pull-right:before{left:auto;right:6px}.bootstrap-datetimepicker-widget.dropdown-menu.pull-right:after{left:auto;right:7px}.bootstrap-datetimepicker-widget .list-unstyled{margin:0}.bootstrap-datetimepicker-widget a[data-action]{padding:6px 0}.bootstrap-datetimepicker-widget a[data-action]:active{box-shadow:none}.bootstrap-datetimepicker-widget .timepicker-hour,.bootstrap-datetimepicker-widget .timepicker-minute,.bootstrap-datetimepicker-widget .timepicker-second{width:54px;font-weight:bold;font-size:1.2em;margin:0}.bootstrap-datetimepicker-widget button[data-action]{padding:6px}.bootstrap-datetimepicker-widget .btn[data-action="incrementHours"]::after{position:absolute;width:1px;height:1px;margin:-1px;padding:0;overflow:hidden;clip:rect(0, 0, 0, 0);border:0;content:"Increment Hours"}.bootstrap-datetimepicker-widget .btn[data-action="incrementMinutes"]::after{position:absolute;width:1px;height:1px;margin:-1px;padding:0;overflow:hidden;clip:rect(0, 0, 0, 0);border:0;content:"Increment Minutes"}.bootstrap-datetimepicker-widget .btn[data-action="decrementHours"]::after{position:absolute;width:1px;height:1px;margin:-1px;padding:0;overflow:hidden;clip:rect(0, 0, 0, 0);border:0;content:"Decrement Hours"}.bootstrap-datetimepicker-widget .btn[data-action="decrementMinutes"]::after{position:absolute;width:1px;height:1px;margin:-1px;padding:0;overflow:hidden;clip:rect(0, 0, 0, 0);border:0;content:"Decrement Minutes"}.bootstrap-datetimepicker-widget .btn[data-action="showHours"]::after{position:absolute;width:1px;height:1px;margin:-1px;padding:0;overflow:hidden;clip:rect(0, 0, 0, 0);border:0;content:"Show Hours"}.bootstrap-datetimepicker-widget .btn[data-action="showMinutes"]::after{position:absolute;width:1px;height:1px;margin:-1px;padding:0;overflow:hidden;clip:rect(0, 0, 0, 0);border:0;content:"Show Minutes"}.bootstrap-datetimepicker-widget .btn[data-action="togglePeriod"]::after{position:absolute;width:1px;height:1px;margin:-1px;padding:0;overflow:hidden;clip:rect(0, 0, 0, 
0);border:0;content:"Toggle AM/PM"}.bootstrap-datetimepicker-widget .btn[data-action="clear"]::after{position:absolute;width:1px;height:1px;margin:-1px;padding:0;overflow:hidden;clip:rect(0, 0, 0, 0);border:0;content:"Clear the picker"}.bootstrap-datetimepicker-widget .btn[data-action="today"]::after{position:absolute;width:1px;height:1px;margin:-1px;padding:0;overflow:hidden;clip:rect(0, 0, 0, 0);border:0;content:"Set the date to today"}.bootstrap-datetimepicker-widget .picker-switch{text-align:center}.bootstrap-datetimepicker-widget .picker-switch::after{position:absolute;width:1px;height:1px;margin:-1px;padding:0;overflow:hidden;clip:rect(0, 0, 0, 0);border:0;content:"Toggle Date and Time Screens"}.bootstrap-datetimepicker-widget .picker-switch td{padding:0;margin:0;height:auto;width:auto;line-height:inherit}.bootstrap-datetimepicker-widget .picker-switch td span{line-height:2.5;height:2.5em;width:100%}.bootstrap-datetimepicker-widget table{width:100%;margin:0}.bootstrap-datetimepicker-widget table td,.bootstrap-datetimepicker-widget table th{text-align:center;border-radius:4px}.bootstrap-datetimepicker-widget table th{height:20px;line-height:20px;width:20px}.bootstrap-datetimepicker-widget table th.picker-switch{width:145px}.bootstrap-datetimepicker-widget table th.disabled,.bootstrap-datetimepicker-widget table th.disabled:hover{background:none;color:#777;cursor:not-allowed}.bootstrap-datetimepicker-widget table th.prev::after{position:absolute;width:1px;height:1px;margin:-1px;padding:0;overflow:hidden;clip:rect(0, 0, 0, 0);border:0;content:"Previous Month"}.bootstrap-datetimepicker-widget table th.next::after{position:absolute;width:1px;height:1px;margin:-1px;padding:0;overflow:hidden;clip:rect(0, 0, 0, 0);border:0;content:"Next Month"}.bootstrap-datetimepicker-widget table thead tr:first-child th{cursor:pointer}.bootstrap-datetimepicker-widget table thead tr:first-child th:hover{background:#eee}.bootstrap-datetimepicker-widget table td{height:54px;line-height:54px;width:54px}.bootstrap-datetimepicker-widget table td.cw{font-size:.8em;height:20px;line-height:20px;color:#777}.bootstrap-datetimepicker-widget table td.day{height:20px;line-height:20px;width:20px}.bootstrap-datetimepicker-widget table td.day:hover,.bootstrap-datetimepicker-widget table td.hour:hover,.bootstrap-datetimepicker-widget table td.minute:hover,.bootstrap-datetimepicker-widget table td.second:hover{background:#eee;cursor:pointer}.bootstrap-datetimepicker-widget table td.old,.bootstrap-datetimepicker-widget table td.new{color:#777}.bootstrap-datetimepicker-widget table td.today{position:relative}.bootstrap-datetimepicker-widget table td.today:before{content:'';display:inline-block;border:solid transparent;border-width:0 0 7px 7px;border-bottom-color:#337ab7;border-top-color:rgba(0,0,0,0.2);position:absolute;bottom:4px;right:4px}.bootstrap-datetimepicker-widget table td.active,.bootstrap-datetimepicker-widget table td.active:hover{background-color:#337ab7;color:#fff;text-shadow:0 -1px 0 rgba(0,0,0,0.25)}.bootstrap-datetimepicker-widget table td.active.today:before{border-bottom-color:#fff}.bootstrap-datetimepicker-widget table td.disabled,.bootstrap-datetimepicker-widget table td.disabled:hover{background:none;color:#777;cursor:not-allowed}.bootstrap-datetimepicker-widget table td span{display:inline-block;width:54px;height:54px;line-height:54px;margin:2px 1.5px;cursor:pointer;border-radius:4px}.bootstrap-datetimepicker-widget table td span:hover{background:#eee}.bootstrap-datetimepicker-widget table td 
span.active{background-color:#337ab7;color:#fff;text-shadow:0 -1px 0 rgba(0,0,0,0.25)}.bootstrap-datetimepicker-widget table td span.old{color:#777}.bootstrap-datetimepicker-widget table td span.disabled,.bootstrap-datetimepicker-widget table td span.disabled:hover{background:none;color:#777;cursor:not-allowed}.bootstrap-datetimepicker-widget.usetwentyfour td.hour{height:27px;line-height:27px}.bootstrap-datetimepicker-widget.wider{width:21em}.bootstrap-datetimepicker-widget .datepicker-decades .decade{line-height:1.8em !important}.input-group.date .input-group-addon{cursor:pointer}.sr-only{position:absolute;width:1px;height:1px;margin:-1px;padding:0;overflow:hidden;clip:rect(0, 0, 0, 0);border:0} -------------------------------------------------------------------------------- /vis/html/js/tagsgallery.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @fileoverview js Gallery of tags. 3 | * 4 | * @author (cesarpalomo@gmail.com) Cesar Palomo 5 | */ 6 | 7 | 8 | 9 | /** 10 | * Manages a list of tags used for pages (some predefined and some defined by user). 11 | * Interaction is possible through click on "tag selected" and "untag selected". 12 | * 13 | * @param parentContainerId ID for gallery parent div element. 14 | * @param predefinedTags list of predefined tags, with tag name. 15 | * @param tagsLogic mechanism to handle tags logic: some tags are not applicable, and some tags when 16 | * applied should trigger the removal of other tags (e.g. when Yes is applied, No is 17 | * removed). 18 | * Must be in the format: 19 | * { 20 | * 'TagName': { 21 | * applicable: true/false, 22 | * removable: true/false, 23 | * negate: ['Tag1', 'Tag2'], 24 | * }, 25 | * } 26 | */ 27 | var TagsGallery = function(parentContainerId, predefinedTags, tagsLogic) { 28 | this.parentContainerId = parentContainerId; 29 | 30 | // Predefined items in gallery. 31 | this.predefinedItems = predefinedTags; 32 | 33 | // User-defined items in gallery. 34 | this.userItems = []; 35 | 36 | // Handles tags logic. 37 | this.tagsLogic = tagsLogic; 38 | 39 | this.update(); 40 | }; 41 | 42 | 43 | /** 44 | * Clears list of items. 45 | */ 46 | TagsGallery.prototype.clear = function(lazyUpdate) { 47 | this.userItems = []; 48 | 49 | if (!lazyUpdate) { 50 | this.update(); 51 | } 52 | }; 53 | 54 | 55 | /** 56 | * Adds item to gallery. 57 | */ 58 | TagsGallery.prototype.addItem = function(tag, lazyUpdate) { 59 | if(this.predefinedItems.indexOf(tag) < 0) { 60 | if(this.userItems.indexOf(tag) < 0){ 61 | this.userItems.push(tag); 62 | this.tagsLogic[tag] = {'applicable': true, 'removable': true, negate: []}; 63 | if(this.tagsLogic["Neutral"]["negate"].indexOf(tag) < 0) 64 | this.tagsLogic["Neutral"]["negate"].push(tag); 65 | if (!lazyUpdate) { 66 | this.update(); 67 | } 68 | } 69 | } 70 | }; 71 | 72 | 73 | /** 74 | * Removes item from gallery. 75 | */ 76 | TagsGallery.prototype.removeItem = function(tag) { 77 | var index = this.userItems.indexOf(tag); 78 | if( index >= 0 && this.predefinedItems.indexOf(tag) < 0){ 79 | this.userItems.splice(index, 1); 80 | } 81 | } 82 | 83 | /** 84 | * Get items from gallery. 85 | */ 86 | TagsGallery.prototype.getCustomTags = function() { 87 | return this.userItems; 88 | } 89 | 90 | /** 91 | * Sets mechanism to handle tags logic: some tags are not applicable, and some tags when applied 92 | * should trigger the removal of other tags (e.g. when Yes is applied, No is removed). 
93 | * Logic must be in the format: 94 | * { 95 | * 'TagName': { 96 | * applicable: true/false, 97 | * removable: true/false, 98 | * negate: ['Tag1', 'Tag2'], 99 | * }, 100 | * } 101 | */ 102 | TagsGallery.prototype.setTagsLogic = function(tagsLogic) { 103 | // Handles tags logic. 104 | this.tagsLogic = tagsLogic; 105 | }; 106 | 107 | 108 | /** 109 | * Updates gallery. 110 | */ 111 | TagsGallery.prototype.update = function() { 112 | var gallery = this; 113 | this.items = this.predefinedItems.concat(this.userItems); 114 | 115 | var gallery = this; 116 | var items = d3.select(this.parentContainerId) 117 | .selectAll('.item').data(this.items, function(item, i) { 118 | return item + '-' + i; 119 | }); 120 | 121 | // Configures actions on images. 122 | items.each(function(item, i) { 123 | // Only clickable tags. 124 | var isApplicable = gallery.isTagApplicable(item); 125 | var isRemovable = gallery.isTagRemovable(item); 126 | 127 | if (isApplicable || isRemovable) { 128 | var itemElm = d3.select(this); 129 | itemElm.selectAll('img').each(function() { 130 | var img = d3.select(this); 131 | var actionType = img.attr('actionType'); 132 | if ((isApplicable && actionType == 'Apply') 133 | || (isRemovable && actionType == 'Remove')) { 134 | img 135 | .on('mouseover', function() { 136 | Utils.showTooltip(); 137 | }) 138 | .on('mousemove', function() { 139 | Utils.updateTooltip(actionType + ' tag "' + item + '"'); 140 | }) 141 | .on('mouseout', function() { 142 | Utils.hideTooltip(); 143 | }) 144 | .on('click', function() { 145 | gallery.onItemActionClick(item, i, actionType); 146 | event.stopPropagation(); 147 | }); 148 | } 149 | }); 150 | } 151 | }); 152 | }; 153 | 154 | 155 | /** 156 | * Returns whether a tag is applicable. 157 | */ 158 | TagsGallery.prototype.isTagApplicable = function(tag) { 159 | return tag in this.tagsLogic && this.tagsLogic[tag]['applicable']; 160 | }; 161 | 162 | 163 | /** 164 | * Returns whether a tag is removable. 165 | */ 166 | TagsGallery.prototype.isTagRemovable = function(tag) { 167 | return tag in this.tagsLogic && this.tagsLogic[tag]['removable']; 168 | }; 169 | 170 | 171 | /** 172 | * Builds html content with info about an item in the gallery. 173 | */ 174 | TagsGallery.prototype.getItemInfo = function(item, i) { 175 | return item; 176 | }; 177 | 178 | 179 | /** 180 | * Builds html content with buttons for labeling relevancy an item in the gallery, 181 | * such as Yes, No, Maybe. 182 | */ 183 | TagsGallery.prototype.getItemButtons = function(item, i) { 184 | var w = 12; 185 | var a = this.isTagApplicable(item) ? 'clickable' : 'not-clickable'; 186 | var r = this.isTagRemovable(item) ? 'clickable' : 'not-clickable'; 187 | return '' 188 | + ''; 189 | }; 190 | 191 | 192 | /** 193 | * Handles click in an item. 194 | */ 195 | TagsGallery.prototype.onItemClick = function(item, i) { 196 | __sig__.emit(__sig__.tag_clicked, item); 197 | }; 198 | 199 | 200 | /** 201 | * Handles item focus. 202 | */ 203 | TagsGallery.prototype.onItemFocus = function(item, i, onFocus) { 204 | __sig__.emit(__sig__.tag_focus, item, onFocus); 205 | }; 206 | 207 | 208 | /** 209 | * Handles click in an item. 210 | */ 211 | TagsGallery.prototype.onItemActionClick = function(item, i, actionType) { 212 | this.applyOrRemoveTag(item, actionType); 213 | }; 214 | 215 | 216 | /** 217 | * Applies or removes tag. 218 | */ 219 | TagsGallery.prototype.applyOrRemoveTag = function(tag, actionType, opt_pages, refresh_plot) { 220 | // Handles tags logic. 
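// (Applying a tag first emits 'Remove' for every tag in its negate list, e.g. applying 'Relevant' removes 'Irrelevant'; a virtual tag only triggers these negations and is never applied itself.)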
221 | if (tag in this.tagsLogic) { 222 | var logicForTag = this.tagsLogic[tag]; 223 | 224 | if (actionType == 'Apply') { 225 | // Removes tags in negate. 226 | for (var i in logicForTag.negate) { 227 | var negateTag = logicForTag.negate[i]; 228 | __sig__.emit(__sig__.tag_action_clicked, negateTag, 'Remove', opt_pages, refresh_plot); 229 | } 230 | if (logicForTag.applicable && !logicForTag.isVirtual) { 231 | __sig__.emit(__sig__.tag_action_clicked, tag, actionType, opt_pages, refresh_plot); 232 | } 233 | } else { 234 | // Removes tag when removable. 235 | if (logicForTag.removable) { 236 | __sig__.emit(__sig__.tag_action_clicked, tag, actionType, opt_pages, refresh_plot); 237 | } 238 | } 239 | } else { 240 | __sig__.emit(__sig__.tag_action_clicked, tag, actionType, opt_pages, refresh_plot); 241 | } 242 | }; 243 | 244 | 245 | 246 | /** 247 | * Returns applicable tags. 248 | */ 249 | TagsGallery.prototype.getApplicableTags = function() { 250 | var gallery = this; 251 | return this.items.filter(function(tag) { 252 | return gallery.isTagApplicable(tag); 253 | }); 254 | }; 255 | -------------------------------------------------------------------------------- /elastic/stopwords.txt: -------------------------------------------------------------------------------- 1 | a 2 | able 3 | about 4 | above 5 | abst 6 | accordance 7 | according 8 | accordingly 9 | across 10 | act 11 | actually 12 | added 13 | adj 14 | affected 15 | affecting 16 | affects 17 | after 18 | afterwards 19 | again 20 | against 21 | ah 22 | all 23 | almost 24 | alone 25 | along 26 | already 27 | also 28 | although 29 | always 30 | am 31 | among 32 | amongst 33 | an 34 | and 35 | announce 36 | another 37 | any 38 | anybody 39 | anyhow 40 | anymore 41 | anyone 42 | anything 43 | anyway 44 | anyways 45 | anywhere 46 | apparently 47 | approximately 48 | are 49 | aren 50 | arent 51 | arise 52 | around 53 | as 54 | aside 55 | ask 56 | asking 57 | at 58 | auth 59 | available 60 | away 61 | awfully 62 | b 63 | back 64 | be 65 | became 66 | because 67 | become 68 | becomes 69 | becoming 70 | been 71 | before 72 | beforehand 73 | begin 74 | beginning 75 | beginnings 76 | begins 77 | behind 78 | being 79 | believe 80 | below 81 | beside 82 | besides 83 | between 84 | beyond 85 | biol 86 | both 87 | brief 88 | briefly 89 | but 90 | by 91 | c 92 | ca 93 | came 94 | can 95 | cannot 96 | can't 97 | cause 98 | causes 99 | certain 100 | certainly 101 | co 102 | com 103 | come 104 | comes 105 | contain 106 | containing 107 | contains 108 | could 109 | couldnt 110 | d 111 | date 112 | did 113 | didn't 114 | different 115 | do 116 | does 117 | doesn't 118 | doing 119 | done 120 | don't 121 | down 122 | downwards 123 | due 124 | during 125 | e 126 | each 127 | ed 128 | edu 129 | effect 130 | eg 131 | eight 132 | eighty 133 | either 134 | else 135 | elsewhere 136 | end 137 | ending 138 | enough 139 | especially 140 | et 141 | et-al 142 | etc 143 | even 144 | ever 145 | every 146 | everybody 147 | everyone 148 | everything 149 | everywhere 150 | ex 151 | except 152 | f 153 | far 154 | few 155 | ff 156 | fifth 157 | first 158 | five 159 | fix 160 | followed 161 | following 162 | follows 163 | for 164 | former 165 | formerly 166 | forth 167 | found 168 | four 169 | from 170 | further 171 | furthermore 172 | g 173 | gave 174 | get 175 | gets 176 | getting 177 | give 178 | given 179 | gives 180 | giving 181 | go 182 | goes 183 | gone 184 | got 185 | gotten 186 | h 187 | had 188 | happens 189 | hardly 190 | has 191 | hasn't 192 | have 193 | haven't 194 | 
having 195 | he 196 | hed 197 | hence 198 | her 199 | here 200 | hereafter 201 | hereby 202 | herein 203 | heres 204 | hereupon 205 | hers 206 | herself 207 | hes 208 | hi 209 | hid 210 | him 211 | himself 212 | his 213 | hither 214 | home 215 | how 216 | howbeit 217 | however 218 | hundred 219 | i 220 | id 221 | ie 222 | if 223 | i'll 224 | im 225 | immediate 226 | immediately 227 | importance 228 | important 229 | in 230 | inc 231 | indeed 232 | index 233 | information 234 | instead 235 | into 236 | invention 237 | inward 238 | is 239 | isn't 240 | it 241 | itd 242 | it'll 243 | its 244 | itself 245 | i've 246 | j 247 | just 248 | k 249 | keep keeps 250 | kept 251 | kg 252 | km 253 | know 254 | known 255 | knows 256 | l 257 | largely 258 | last 259 | lately 260 | later 261 | latter 262 | latterly 263 | least 264 | less 265 | lest 266 | let 267 | lets 268 | like 269 | liked 270 | likely 271 | line 272 | little 273 | 'll 274 | look 275 | looking 276 | looks 277 | ltd 278 | m 279 | made 280 | mainly 281 | make 282 | makes 283 | many 284 | may 285 | maybe 286 | me 287 | mean 288 | means 289 | meantime 290 | meanwhile 291 | merely 292 | mg 293 | might 294 | million 295 | miss 296 | ml 297 | more 298 | moreover 299 | most 300 | mostly 301 | mr 302 | mrs 303 | much 304 | mug 305 | must 306 | my 307 | myself 308 | n 309 | na 310 | name 311 | namely 312 | nay 313 | nd 314 | near 315 | nearly 316 | necessarily 317 | necessary 318 | need 319 | needs 320 | neither 321 | never 322 | nevertheless 323 | new 324 | next 325 | nine 326 | ninety 327 | no 328 | nobody 329 | non 330 | none 331 | nonetheless 332 | noone 333 | nor 334 | normally 335 | nos 336 | not 337 | noted 338 | nothing 339 | now 340 | nowhere 341 | o 342 | obtain 343 | obtained 344 | obviously 345 | of 346 | off 347 | often 348 | oh 349 | ok 350 | okay 351 | old 352 | omitted 353 | on 354 | once 355 | one 356 | ones 357 | only 358 | onto 359 | or 360 | ord 361 | other 362 | others 363 | otherwise 364 | ought 365 | our 366 | ours 367 | ourselves 368 | out 369 | outside 370 | over 371 | overall 372 | owing 373 | own 374 | p 375 | page 376 | pages 377 | part 378 | particular 379 | particularly 380 | past 381 | per 382 | perhaps 383 | placed 384 | please 385 | plus 386 | poorly 387 | possible 388 | possibly 389 | potentially 390 | pp 391 | predominantly 392 | present 393 | previously 394 | primarily 395 | probably 396 | promptly 397 | proud 398 | provides 399 | put 400 | q 401 | que 402 | quickly 403 | quite 404 | qv 405 | r 406 | ran 407 | rather 408 | rd 409 | re 410 | readily 411 | really 412 | recent 413 | recently 414 | ref 415 | refs 416 | regarding 417 | regardless 418 | regards 419 | related 420 | relatively 421 | research 422 | respectively 423 | resulted 424 | resulting 425 | results 426 | right 427 | run 428 | s 429 | said 430 | same 431 | saw 432 | say 433 | saying 434 | says 435 | sec 436 | section 437 | see 438 | seeing 439 | seem 440 | seemed 441 | seeming 442 | seems 443 | seen 444 | self 445 | selves 446 | sent 447 | seven 448 | several 449 | shall 450 | she 451 | shed 452 | she'll 453 | shes 454 | should 455 | shouldn't 456 | show 457 | showed 458 | shown 459 | showns 460 | shows 461 | significant 462 | significantly 463 | similar 464 | similarly 465 | since 466 | six 467 | slightly 468 | so 469 | some 470 | somebody 471 | somehow 472 | someone 473 | somethan 474 | something 475 | sometime 476 | sometimes 477 | somewhat 478 | somewhere 479 | soon 480 | sorry 481 | specifically 482 | specified 483 | specify 484 | specifying 
485 | still 486 | stop 487 | strongly 488 | sub 489 | substantially 490 | successfully 491 | such 492 | sufficiently 493 | suggest 494 | sup 495 | sure t 496 | take 497 | taken 498 | taking 499 | tell 500 | tends 501 | th 502 | than 503 | thank 504 | thanks 505 | thanx 506 | that 507 | that'll 508 | thats 509 | that've 510 | the 511 | their 512 | theirs 513 | them 514 | themselves 515 | then 516 | thence 517 | there 518 | thereafter 519 | thereby 520 | thered 521 | therefore 522 | therein 523 | there'll 524 | thereof 525 | therere 526 | theres 527 | thereto 528 | thereupon 529 | there've 530 | these 531 | they 532 | theyd 533 | they'll 534 | theyre 535 | they've 536 | think 537 | this 538 | those 539 | thou 540 | though 541 | thoughh 542 | thousand 543 | throug 544 | through 545 | throughout 546 | thru 547 | thus 548 | til 549 | tip 550 | to 551 | together 552 | too 553 | took 554 | toward 555 | towards 556 | tried 557 | tries 558 | truly 559 | try 560 | trying 561 | ts 562 | twice 563 | two 564 | u 565 | un 566 | under 567 | unfortunately 568 | unless 569 | unlike 570 | unlikely 571 | until 572 | unto 573 | up 574 | upon 575 | ups 576 | us 577 | use 578 | used 579 | useful 580 | usefully 581 | usefulness 582 | uses 583 | using 584 | usually 585 | v 586 | value 587 | various 588 | 've 589 | very 590 | via 591 | viz 592 | vol 593 | vols 594 | vs 595 | w 596 | want 597 | wants 598 | was 599 | wasnt 600 | way 601 | we 602 | wed 603 | welcome 604 | we'll 605 | went 606 | were 607 | werent 608 | we've 609 | what 610 | whatever 611 | what'll 612 | whats 613 | when 614 | whence 615 | whenever 616 | where 617 | whereafter 618 | whereas 619 | whereby 620 | wherein 621 | wheres 622 | whereupon 623 | wherever 624 | whether 625 | which 626 | while 627 | whim 628 | whither 629 | who 630 | whod 631 | whoever 632 | whole 633 | who'll 634 | whom 635 | whomever 636 | whos 637 | whose 638 | why 639 | widely 640 | willing 641 | wish 642 | with 643 | within 644 | without 645 | wont 646 | words 647 | world 648 | would 649 | wouldnt 650 | www 651 | x 652 | y 653 | yes 654 | yet 655 | you 656 | youd 657 | you'll 658 | your 659 | youre 660 | yours 661 | yourself 662 | yourselves 663 | you've 664 | z 665 | zero 666 | -------------------------------------------------------------------------------- /vis/html/release.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Domain Discovery Tool Release Notes 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 |
Domain Discovery Tool v2.9 Release Notes

Features

  • Incorporated an online model learner that incrementally learns the domain model as the user annotates pages. The accuracy of the model indicates how complete the domain is. The model is also used to label the unlabeled pages, which shows the user the pages the model is most unsure of, so the user can annotate the smaller subset of pages that most affects the model's performance
  • Added a quality indicator for domain model construction
  • Restructured the interface to support multiple criteria for page filtering and to improve the workflow
  • Improved the handling of long pages

Domain Discovery Tool v2.8.5 Release Notes

Features

  • Added the ability to delete domains
  • Added settings to the model builder for selecting custom tags as either relevant or irrelevant when building the ACHE classifier

Domain Discovery Tool v2.8.4 Release Notes

Features

  • Crawl the backward and forward links of selected pages to extend the content
  • Topics dashboard to discover and visualize the distribution of topics in the domain content
  • Added the ability to apply custom tags to individual pages in the pages gallery
  • Choose between Google and Bing web searches
  • Assign different colors to custom tags

Domain Discovery Tool v2.8.3 Release Notes

Features

  • Added the ability to apply custom tags to pages
  • Filter pages by tags

Domain Discovery Tool v2.8.2 Release Notes

Features

  • Added a statistics dashboard that provides various aggregations of the DDT data, including queries and annotations

Domain Discovery Tool v2.8.1 Release Notes

Features

  • Replaced the D3 page-clustering visualization with Continuum's Bokeh plot

Domain Discovery Tool v2.8 Release Notes

Features

  • Added the ability to select the kind of pages to retrieve:
      • Most recent pages
      • Pages for specific queries
      • Relevant pages and pages similar to those
      • Irrelevant pages and pages similar to those

Domain Discovery Tool v2.7.1 Release Notes

Features

  • Fixed bugs in ranking and in accessing Elasticsearch data

Domain Discovery Tool v2.7.0 Release Notes

Features

  • Decoupled the server and client to support multiple users
  • Added the ability to upload a file with a list of URLs, or to enter a list of URLs in a text box, for which the pages are downloaded and can be viewed in DDT

Domain Discovery Tool v2.6.1 Release Notes

Features

  • Moved model building to the menu bar

Domain Discovery Tool v2.6.0 Release Notes

Features

  • Added a menu bar at the top with the logo and name of the tool, and moved the following into the menu:
      • the list of available domains and the option to add new domains
  • Changed the look and feel of the terms window:
      • All the words appear on the left
      • Custom words are added to the same list as the extracted ones
      • Custom words are distinguished by the delete icon before them, which allows deleting the custom terms
      • The positive and negative bars are both on the right
  • Updated the development environment to use conda
  • Updated the Docker build to use make instead of fab

Domain Discovery Tool v2.5.0 Release Notes

Features

  • Filter the corpora by date range
  • Added bigrams and trigrams to the term list
  • Added the ability to enter custom relevant and irrelevant terms

Domain Discovery Tool v2.4.0 Release Notes

Features

  • Ability to create a new domain
  • Query the web and store the downloaded pages in Elasticsearch
  • The download progress of the pages can be followed in the page summary window below the web search component
  • Update option to pull in newly downloaded pages and re-rank the terms
  • Visualize the page-clustering projections in the multidimensional visualization (MDV) window
  • Currently supported projection methods: PCA, t-SNE, K-Means
  • Lasso selection of pages in the MDV window
  • Zoom in the MDV window while the 'z' key is held down
  • Pages selected in the MDV window are displayed in the pages panel below it, each with a short snippet from the page, an image, and the URL
  • Shift+click on a page in the pages panel brings up a snapshot of the page
  • Clicking the URL in the pages panel opens the page in a new tab
  • After inspecting the pages, all selected pages can be tagged relevant or irrelevant as a group or individually
  • Ability to connect to existing corpora stored in Elasticsearch
  • Filter the corpora for documents containing specific query words
  • The number of filtered documents to retrieve can be set with the pageCap
  • When documents are filtered, the retrieved terms reflect the significant terms in the filtered documents
  • Update also retrieves the top 50 most frequent terms in the pages, which gives more insight into the content of the pages
  • The red and blue bars to the right and left of a term show the relative occurrence of the term in negative and positive pages, which helps in selecting the more discriminative terms
  • Hovering the mouse over a word displays snippets from the pages in which the selected term occurs, providing context for the term
  • Terms can be tagged positive or negative by single-click and double-click, respectively
  • After tagging, clicking update re-ranks the terms based on the previous tagging
  • Shift+click on a term enters it into the web search
  • These results are stored on the back end to build page classifiers for crawlers
  • Clicking the Model button builds the ACHE crawler's page classifier model, features, and seeds files
  • It also outputs the annotated pages as training data, which any other crawler can use to build its own model
  • Once the model is built, it can be downloaded and saved, and later retrieved to start a crawler
--------------------------------------------------------------------------------
/vis/html/js/libs/jquery.urlive.js:
--------------------------------------------------------------------------------
1 | /*
2 |  * jquery.urlive.js v1.1.1, jQuery URLive
3 |  *
4 |  * Copyright 2014 Mark Serbol.
5 |  * Use, reproduction, distribution, and modification of this code is subject to the terms and
6 |  * conditions of the MIT license, available at http://www.opensource.org/licenses/MIT.
7 |  *
8 |  * https://github.com/markserbol/urlive
9 |  *
10 |  */
11 | 
12 | ;(function($){
13 |   var defaults = {
14 |     container: '.urlive-container',
15 |     target: '_blank',
16 |     imageSize: 'auto',
17 |     render: true,
18 |     disableClick: false,
19 |     regexp: /((https?:\/\/)?[\w-@]+(\.[a-z]+)+\.?(:\d+)?(\/\S*)?)/i,
20 |     yqlSelect: '*',
21 |     callbacks: {
22 |       onStart: function() {},
23 |       onSuccess: function() {},
24 |       onFail: function() {},
25 |       noData: function() {},
26 |       onLoadEnd: function() {},
27 |       imgError: function() {},
28 |       onClick: function() {}
29 |     }
30 |   },
31 | 
32 |   xajax = (function(ajax){
33 |     var exRegex = RegExp(window.location.protocol + '//' + window.location.hostname),
34 |       yql_base_uri = 'http'+(/^https/.test(window.location.protocol)?'s':'') +
35 |         '://query.yahooapis.com/v1/public/yql?callback=?',
36 |       yql_query = 'select {SELECT} from html where url="{URL}" and xpath="*" and compat="html5"';
37 | 
38 |     return function(o) {
39 |       var url = (!/^https?:\/\//i.test(o.url)) ? window.location.protocol + '//' + o.url : o.url;
40 | 
41 |       if (/get/i.test(o.type) && !/json/i.test(o.dataType) && !exRegex.test(url) && /:\/\//.test(url)){
42 | 
43 |         o.url = yql_base_uri;
44 |         o.dataType = 'json';
45 |         o.data = {
46 |           q: yql_query.replace('{SELECT}', o.yqlSelect).replace(
47 |             '{URL}',
48 |             url + (o.data ? (/\?/.test(url) ? '&' : '?') + $.param(o.data) : '')
49 |           ),
50 |           format: 'xml'
51 |         };
52 | 
53 |         if (!o.success && o.complete) {
54 |           o.success = o.complete;
55 |           delete o.complete;
56 |         }
57 | 
58 |         o.success = (function(success){
59 |           return function(data){
60 |             if(success){
61 |               success.call(this, {
62 |                 responseText: (data.results[0] || '').replace(/<script[^>]+?\/>|<script(.|\s)*?\/script>/gi, '')  // strip script tags from the YQL payload
63 |               }, 'success');
64 |             }
65 | 
66 |           };
67 |         })(o.success);
68 | 
69 |       }
70 |       return ajax.apply(this, arguments);
71 |     };
72 | 
73 |   })($.ajax),
74 | 
75 |   findUrlive = function(){
76 |     var selector = $(this).data('urlive-container') || $(this);
77 |     return $(selector).find('.urlive-link');
78 |   },
79 | 
80 |   methods = {
81 |     init: function(options){
82 |       var opts = $.extend(true, defaults, options);
83 | 
84 |       return this.each(function(){
85 |         var el = $(this), url = undefined;
86 | 
87 |         el.data('urlive-container', opts.container);
88 | 
89 |         if ('url' in opts) {
90 |           url = opts.url;
91 |         } else {
92 |           if(el.is('a')){
93 |             url = el.attr('href');
94 |           }else{
95 |             var text = el.val() || el.text(),
96 |               regexp = opts.regexp,
97 |               email = /^(([^<>()[\]\\.,;:\s@\"]+(\.[^<>()[\]\\.,;:\s@\"]+)*)|(\".+\"))@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\])|(([a-zA-Z\-0-9]+\.)+[a-zA-Z]{2,}))$/;
98 | 
99 |             url = regexp.exec(text);
100 | 
101 |             url = (url && !email.test(url[0])) ? url[0] : null;
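// (illustrative) For text like "see http://example.com/page", regexp.exec()
// matches "http://example.com/page". A bare address such as "user@example.com"
// also matches the URL pattern, so the email test above filters it out.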
102 |           }
103 |         }
104 | 
105 |         if(url){
106 |           if(/\.(?:jpe?g|gif|png)/.test(url)){
107 |             var ti = url.substr(url.lastIndexOf('/') + 1);
108 |             draw({image:url, title:ti, url:url});
109 |           }else{
110 |             getData(url);
111 |           }
112 |         }
113 | 
114 |         function getData(url){
115 |           xajax({
116 |             url: url,
117 |             type: 'GET',
118 |             yqlSelect: opts.yqlSelect,
119 |             beforeSend: opts.callbacks.onStart
120 |           }).done(function(data){
121 |             if(!$.isEmptyObject(data.results)){
122 |               data = data.results[0];
123 | 
124 |               html = $('<div>',{html:data});
125 | 
126 |               get = function(prop){
127 |                 return html.find('[property="' + prop + '"]').attr('content')
128 |                   || html.find('[name="' + prop + '"]').attr('content')
129 |                   || html.find(prop).html() || html.find(prop).attr('src');
130 |               };
131 | 
132 |               set = {
133 |                 image: el.data('image') || get('og:image') || get('img'),
134 |                 title: el.data('title') || get('og:title') || get('title'),
135 |                 description: el.data('description') || get('og:description') || get('description'),
136 |                 url: el.data('url') || get('og:url') || url,
137 |                 type: el.data('type') || get('og:type'),
138 |                 sitename: el.data('site_name') || get('og:site_name')
139 |               };
140 | 
141 |               opts.callbacks.onSuccess(set);
142 | 
143 |               if(opts.render){
144 |                 draw(set);
145 |               }
146 | 
147 |             }else{
148 |               opts.callbacks.noData();
149 |               $.error('YQL request succeeded but with empty results', data);
150 | 
151 |             }
152 |           }).fail(function (jqXHR, textStatus, errorThrown) {
153 |             opts.callbacks.onFail();
154 |             $.error('YQL request error: ', textStatus, errorThrown);
155 |           });
156 |         }
157 | 
158 |         function draw(set){
159 |           //outer = $('<a>',{ 'class':'urlive-link', href: set.url, target: opts.target});
160 |           outer = $('<a>',{ 'class':'urlive-link urlive-container', href: set.url, target: opts.target});  // <a> inferred from href/target; urlive-link kept so findUrlive() and closest('.urlive-link') still match
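// Markup produced below (illustrative):
//   <a class="urlive-link urlive-container" href="..." target="_blank">
//     <div class="urlive-img-wrapper"><img class="urlive-image" src="..."></div>
//     <div class="urlive-text-wrapper"><span class="urlive-title">...</span>...</div>
//   </a>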
161 |           imgWrapper = $('<div>',{ 'class':'urlive-img-wrapper'});
162 |           textWrapper = $('<div>',{'class':'urlive-text-wrapper'});
163 | 
164 |           $.each(set, function(key, val){
165 |             if(val){
166 |               if(key == 'image'){
167 | 
168 |                 if(!/^(?:[a-z]+:)?\/\//i.test(val)){
169 |                   val = (!/^https?:\/\//i.test(set.url)) ? window.location.protocol + '//' + set.url + val : set.url + val;
170 |                 }
171 | 
172 |                 img = $('<img>', {src: val});
173 | 
174 |                 img.error(opts.callbacks.imgError);
175 | 
176 |                 img.appendTo(imgWrapper);
177 | 
178 |                 img.hide().load(function() {
179 |                   var imgW = $(this).width(),
180 |                     outer = $(this).closest('.urlive-link');
181 | 
182 |                   $(this).addClass('urlive-'+key).show();
183 | 
184 |                   if(opts.imageSize == 'auto'){
185 | 
186 |                     if(imgW >= outer.width()){
187 |                       outer.addClass('urlive-img-large');
188 |                     }else{
189 |                       outer.addClass('urlive-img-small');
190 |                     }
191 |                   }else if(opts.imageSize == 'large'){
192 |                     outer.addClass('urlive-img-large');
193 |                   }else if(opts.imageSize == 'small'){
194 |                     outer.addClass('urlive-img-small');
195 |                   }
196 | 
197 |                   opts.callbacks.onLoadEnd();
198 |                 });
199 | 
200 |               }else{
201 |                 elem = $('<span>', {'class':'urlive-'+key, text: val});
202 |                 elem.appendTo(textWrapper);
203 |               }
204 |             }
205 |           });
206 | 
207 |           outer.append(imgWrapper, textWrapper).appendTo(el.data('urlive-container'));
208 | 
209 |           outer.on('click', opts.callbacks.onClick);
210 | 
211 |           if(opts.disableClick){
212 |             outer.on('click', function(e){
213 |               e.preventDefault();
214 |             });
215 |           }
216 | 
217 |         }
218 | 
219 |       });
220 |     },
221 | 
222 |     close: function(duration){
223 |       var urlive = findUrlive.apply(this);
224 | 
225 |       urlive.fadeOut(duration);
226 |     },
227 | 
228 |     remove: function(duration){
229 |       var urlive = findUrlive.apply(this);
230 | 
231 |       if(duration){
232 |         urlive.fadeOut(duration, function(){
233 |           urlive.remove();
234 |         });
235 |       }else{
236 |         urlive.remove();
237 |       }
238 |     },
239 | 
240 |     open: function(duration){
241 |       var urlive = findUrlive.apply(this);
242 | 
243 |       urlive.fadeIn(duration);
244 |     },
245 | 
246 |     disable: function(){
247 |       var urlive = findUrlive.apply(this);
248 | 
249 |       urlive.on('click',function(e) {
250 |         e.preventDefault();
251 |       });
252 |     },
253 | 
254 |     enable: function(){
255 |       var urlive = findUrlive.apply(this);
256 | 
257 |       urlive.off('click');
258 |     }
259 | 
260 |   };
261 | 
262 |   $.fn.urlive = function(method){
263 |     if(methods[method]){
264 |       return methods[method].apply(this, Array.prototype.slice.call(arguments, 1));
265 |     }else if(typeof method === 'object' || !method){
266 |       return methods.init.apply(this, arguments);
267 |     }else{
268 |       $.error('Method "' + method + '" does not exist on jquery.urlive');
269 |     }
270 |   };
271 | 
272 | })(jQuery);
273 | 
--------------------------------------------------------------------------------