├── __init__.py
├── vis
├── __init__.py
├── bokeh_plots
│ ├── __init__.py
│ ├── test
│ │ ├── __init__.py
│ │ └── test_cross_filter.py
│ ├── utils.py
│ └── domains_dashboard.py
├── html
│ ├── img
│ │ ├── apply.png
│ │ ├── boost.png
│ │ ├── delete.jpg
│ │ ├── reload.png
│ │ ├── remove.png
│ │ ├── search.png
│ │ ├── nyu_stacked_black.png
│ │ └── engineering_long_black.png
│ ├── css
│ │ ├── dashboard_styles.css
│ │ ├── cross_filter.css
│ │ ├── jquery.urlive.css
│ │ ├── d3.slider.css
│ │ └── dropdowns-enhancement.min.css
│ ├── libs
│ │ ├── bootflat-2.0.4
│ │ │ └── fonts
│ │ │ │ ├── glyphicons-halflings-regular.eot
│ │ │ │ ├── glyphicons-halflings-regular.ttf
│ │ │ │ └── glyphicons-halflings-regular.woff
│ │ └── bootstrap-datetimepicker-4.15.35
│ │ │ └── css
│ │ │ └── bootstrap-datetimepicker.min.css
│ ├── cross_filter_plot_area.html
│ ├── js
│ │ ├── libs
│ │ │ ├── queue.min.js
│ │ │ ├── d3.lasso.min.js
│ │ │ └── jquery.urlive.js
│ │ ├── cross_filter.js
│ │ ├── topicvis.js
│ │ ├── utils.js
│ │ ├── crawlersigslots.js
│ │ ├── snippetsviewer.js
│ │ ├── bokeh_controller.js
│ │ ├── sigslot_core.js
│ │ └── tagsgallery.js
│ ├── base.html
│ ├── domains_dashboard.html
│ ├── cross_filter.html
│ ├── crawlervis.html
│ └── release.html
└── config.conf-in
├── models
└── __init__.py
├── elastic
├── test
│ ├── __init__.py
│ └── test_get_documents.py
├── .gitignore
├── __init__.py
├── scripts
│ ├── create_config_index.sh
│ ├── delete_index.sh
│ ├── put_mapping.sh
│ ├── create_index.sh
│ └── mapping.json
├── ddt_index_config_entries.json
├── delete_index.py
├── mapping_terms.json
├── delete.py
├── get_term_vectors.py
├── config.json
├── load_config.py
├── config.py
├── aggregations.py
├── mapping.json
├── get_config.py
├── create_index.py
├── README.md
├── add_documents.py
├── get_documents.py
├── get_mtermvectors.py
└── stopwords.txt
├── ranking
├── __init__.py
├── .gitignore
├── run.sh
├── preprocess.py
├── BayesianSets.py
├── get_bigrams_trigrams.py
├── tfidf.py
├── rank.py
├── extract_terms.py
└── word2vec.py
├── seeds_generator
├── __init__.py
├── src
│ ├── main
│ │ ├── config
│ │ │ └── queries.txt
│ │ └── java
│ │ │ └── page_downloader
│ │ │ ├── Download_Utils.java
│ │ │ ├── App.java
│ │ │ ├── Download_urls.java
│ │ │ ├── StartCrawl.java
│ │ │ ├── Extract.java
│ │ │ ├── Download.java
│ │ │ ├── BingSearch.java
│ │ │ ├── GoogleSearch.java
│ │ │ └── Crawl.java
│ └── test
│ │ └── java
│ │ └── page_downloader
│ │ └── AppTest.java
├── conf
│ └── config.properties
├── download.py
├── pom.xml
└── concat_nltk.py
├── online_classifier
├── __init__.py
├── tfidf_vector.py
├── online_classifier.py
└── tf_vector.py
├── logs
└── README.md
├── .dockerignore
├── run_demo.sh
├── conda.recipe
├── README.md
├── meta.yaml
└── build.sh
├── .gitignore
├── bin
├── ddt
└── ddt-dev
├── environment.yml
├── supervisord.conf
├── Dockerfile
├── Makefile
└── README.md
/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/vis/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/models/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/elastic/test/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/ranking/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/seeds_generator/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/vis/bokeh_plots/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/online_classifier/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/vis/bokeh_plots/test/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/ranking/.gitignore:
--------------------------------------------------------------------------------
1 | selected_terms.txt
2 | exclude.txt
--------------------------------------------------------------------------------
/logs/README.md:
--------------------------------------------------------------------------------
1 | Logfiles from supervisor processes go here
2 |
--------------------------------------------------------------------------------
/seeds_generator/src/main/config/queries.txt:
--------------------------------------------------------------------------------
1 | explosive chemicals
--------------------------------------------------------------------------------
/ranking/run.sh:
--------------------------------------------------------------------------------
1 | python rank.py ../lda_pipeline/data/lda_input.csv 3,4,7,28
2 |
--------------------------------------------------------------------------------
/elastic/.gitignore:
--------------------------------------------------------------------------------
1 | /local
2 | /bin
3 | /include
4 | /lib/python*
5 | /build
6 | *.pyc
7 |
--------------------------------------------------------------------------------
/elastic/__init__.py:
--------------------------------------------------------------------------------
"""Elasticsearch helpers; re-exports the shared client and its endpoint."""
from config import es, es_server

# ``__all__`` is the name Python consults for ``from elastic import *``;
# the previous ``__export__`` spelling has no effect on star-imports.
__all__ = ['es_server', 'es']
# Kept as an alias in case any caller reads the old attribute directly.
__export__ = __all__
4 |
--------------------------------------------------------------------------------
/vis/html/img/apply.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool_deprecated/HEAD/vis/html/img/apply.png
--------------------------------------------------------------------------------
/vis/html/img/boost.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool_deprecated/HEAD/vis/html/img/boost.png
--------------------------------------------------------------------------------
/vis/html/img/delete.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool_deprecated/HEAD/vis/html/img/delete.jpg
--------------------------------------------------------------------------------
/vis/html/img/reload.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool_deprecated/HEAD/vis/html/img/reload.png
--------------------------------------------------------------------------------
/vis/html/img/remove.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool_deprecated/HEAD/vis/html/img/remove.png
--------------------------------------------------------------------------------
/vis/html/img/search.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool_deprecated/HEAD/vis/html/img/search.png
--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
1 | vis/config.conf
2 | ranking/D_cbow_pdw_8B.pkl
3 | data/
4 | vis/html/models/
5 | *.pyc
6 | *.log
7 | *.class
8 | *.jar
9 |
--------------------------------------------------------------------------------
/vis/html/img/nyu_stacked_black.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool_deprecated/HEAD/vis/html/img/nyu_stacked_black.png
--------------------------------------------------------------------------------
/vis/html/img/engineering_long_black.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool_deprecated/HEAD/vis/html/img/engineering_long_black.png
--------------------------------------------------------------------------------
/seeds_generator/conf/config.properties:
--------------------------------------------------------------------------------
1 | ACCOUNTKEY SYQ5NpERm7UmF1ZCdysXfQjS5wD41a27sSnBS5KReqA
2 | ACCOUNTKEY_GOOG AIzaSyADaHyjihNC3591IehV5pcmqK044jdrFEM
3 | CSE_ID_GOOG 016642719151054520299:gftwrd3ql-m
4 |
--------------------------------------------------------------------------------
/vis/html/css/dashboard_styles.css:
--------------------------------------------------------------------------------
1 | body {
2 | background-color:transparent;
3 | }
4 | .bk-vbox {
5 | padding-left:1px;
6 | }
7 |
8 | .bk-data-table {
9 | margin: 0px 20px 20px 0px;
10 | }
11 |
--------------------------------------------------------------------------------
/vis/html/libs/bootflat-2.0.4/fonts/glyphicons-halflings-regular.eot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool_deprecated/HEAD/vis/html/libs/bootflat-2.0.4/fonts/glyphicons-halflings-regular.eot
--------------------------------------------------------------------------------
/vis/html/libs/bootflat-2.0.4/fonts/glyphicons-halflings-regular.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool_deprecated/HEAD/vis/html/libs/bootflat-2.0.4/fonts/glyphicons-halflings-regular.ttf
--------------------------------------------------------------------------------
/vis/html/libs/bootflat-2.0.4/fonts/glyphicons-halflings-regular.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool_deprecated/HEAD/vis/html/libs/bootflat-2.0.4/fonts/glyphicons-halflings-regular.woff
--------------------------------------------------------------------------------
/run_demo.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Demo entry point: activates the "ddt" environment, reports the
# Elasticsearch endpoint taken from $ELASTICSEARCH_SERVER, and then starts
# supervisord with the configuration at /ddt/supervisord.conf.

echo "Activating DDT environment..."
source activate ddt

echo "Using ElasticSearch at $ELASTICSEARCH_SERVER"

echo "Starting services..."
supervisord -c /ddt/supervisord.conf
9 |
--------------------------------------------------------------------------------
/seeds_generator/src/main/java/page_downloader/Download_Utils.java:
--------------------------------------------------------------------------------
/**
 * Static URL helpers.
 */
public class Download_Utils{
    /**
     * Ensures that a URL carries an explicit HTTP(S) scheme.
     *
     * The scheme is tested as a case-insensitive prefix. A plain
     * {@code url.contains("http")} test would skip the prefix for
     * scheme-less URLs that merely contain "http" somewhere
     * (e.g. "example.com/http-guide") and would double-prefix upper-case
     * schemes (e.g. "HTTP://example.com").
     *
     * @param url the URL to normalise; must not be null
     * @return {@code url} unchanged when it starts with "http://" or
     *         "https://" (ignoring case), otherwise "http://" + url
     */
    public static String validate_url(String url){
        boolean hasScheme = url.regionMatches(true, 0, "http://", 0, 7)
                || url.regionMatches(true, 0, "https://", 0, 8);
        if(!hasScheme)
            url = "http://" + url;
        return url;
    }
}
8 |
--------------------------------------------------------------------------------
/conda.recipe/README.md:
--------------------------------------------------------------------------------
1 | This conda recipe was originally added by Continuum Analytics in July 2015 under the DDT license
2 |
3 | You may need other recipes from https://github.com/memex-explorer/memex-explorer or packages from the memex channel
4 |
5 |
--------------------------------------------------------------------------------
/elastic/scripts/create_config_index.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Creates the "config" index and installs the "domains" mapping from
# config.json by delegating to create_index.sh and put_mapping.sh.
#
# Usage: create_config_index.sh [ELASTIC_URL]
#   ELASTIC_URL  Elasticsearch endpoint (default: http://localhost:9200)

# ${1-...} falls back to the default only when no argument was supplied,
# which is the same condition as an explicit "$# -eq 0" test.
ELASTIC=${1-http://localhost:9200}

./create_index.sh config $ELASTIC
./put_mapping.sh config domains config.json $ELASTIC
12 |
13 |
14 |
--------------------------------------------------------------------------------
/elastic/ddt_index_config_entries.json:
--------------------------------------------------------------------------------
1 | {
2 | "entries": [
3 | {
4 | "id" : "1",
5 | "domain_name": "Gun Control",
6 | "index" : "gun_control"
7 | },
8 | {
9 | "id" : "2",
10 | "domain_name": "Ebola",
11 | "index" : "ebola"
12 | }
13 | ]
14 | }
15 |
16 |
--------------------------------------------------------------------------------
/elastic/delete_index.py:
--------------------------------------------------------------------------------
1 | from config import es as default_es
2 | from pprint import pprint
3 |
def delete_index(es_index='', es=None):
    """Delete the Elasticsearch index named ``es_index``.

    Does nothing when ``es_index`` is the empty string.

    Args:
        es_index: Name of the index to delete.
        es: Client exposing ``indices.delete(index=...)``; the module-level
            default client is used when omitted.
    """
    if es_index == "":
        return

    client = default_es if es is None else es
    client.indices.delete(index=es_index)
10 |
11 |
--------------------------------------------------------------------------------
/seeds_generator/src/main/java/page_downloader/App.java:
--------------------------------------------------------------------------------
1 | package page_downloader;
2 |
/**
 * Placeholder entry point that prints a greeting to standard output.
 */
public class App
{
    /**
     * Prints "Hello World!" on its own line to System.out.
     *
     * @param args command-line arguments (unused)
     */
    public static void main( String[] args )
    {
        final String greeting = "Hello World!";
        System.out.println( greeting );
    }
}
14 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | local
2 | bin
3 | include
4 | /lib
5 | python2.7
6 | /build
7 | /nltk_data
8 | config.conf
9 | results.txt
10 | *.pyc
11 | *~
12 | *.#*
13 | *#*
14 | *.bak
15 | *log
16 | *.class
17 | *.jar
18 | *.out
19 | seeds_generator/target/*
20 | seeds_generator/conf/queries.txt
21 | ranking/D_cbow_pdw_8B.pkl
22 | data/
23 | vis/html/models/
24 | *.DS_Store
25 | .idea
26 | *.swp
27 | .cache/
28 |
--------------------------------------------------------------------------------
/elastic/scripts/delete_index.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Issues an HTTP DELETE for $ELASTIC/$INDEX/$TYPE.
#
# Usage: delete_index.sh [INDEX [TYPE [ELASTIC_URL]]]
#   INDEX        index name              (default: memex)
#   TYPE         document type           (default: page)
#   ELASTIC_URL  Elasticsearch endpoint  (default: http://localhost:9200)

# ${N-default} substitutes the default only when positional argument N was
# not supplied, which is what the "$#" comparisons expressed.
INDEX=${1-memex}
TYPE=${2-page}
ELASTIC=${3-http://localhost:9200}

echo $INDEX

curl -XDELETE "$ELASTIC/$INDEX/$TYPE"; echo
25 |
--------------------------------------------------------------------------------
/elastic/mapping_terms.json:
--------------------------------------------------------------------------------
1 | {
2 | "terms": {
3 | "properties": {
4 | "term": {
5 | "type": "string"
6 | },
7 | "index": {
8 | "type": "string"
9 | },
10 | "doc_type": {
11 | "type": "string"
12 | },
13 | "tf": {
14 | "type": "integer"
15 | },
16 | "tag": {
17 | "type": "string"
18 | }
19 | }
20 | }
21 | }
22 |
--------------------------------------------------------------------------------
/bin/ddt:
--------------------------------------------------------------------------------
#!/bin/bash
# Launcher for an installed DDT: derives the install layout from this
# script's own location, exports the environment variables below, and then
# runs lib/ddt/vis/server.py.

SCRIPT_PATH="${BASH_SOURCE[0]}"
# Every expansion is quoted so that an install prefix containing spaces
# survives word splitting.
SCRIPT_DIR=$(dirname "$SCRIPT_PATH")

export NLTK_DATA="$SCRIPT_DIR/../lib/ddt/nltk_data"
export ACHE_HOME="$SCRIPT_DIR/../lib/ache/"
export DDT_HOME="$SCRIPT_DIR/../lib/ddt"
# ugly, but DDT doesn't really have a concept of installs
export PYTHONPATH="$SCRIPT_DIR/../lib/ddt:$PYTHONPATH"

python "$SCRIPT_DIR/../lib/ddt/vis/server.py"
13 |
--------------------------------------------------------------------------------
/elastic/delete.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | from config import es as default_es
3 | from elasticsearch.exceptions import NotFoundError
4 |
def delete(ids, es_index='memex', es_doc_type='page', es=None):
    """Delete the documents with the given ids, skipping missing ones.

    Args:
        ids: Iterable of document ids to remove.
        es_index: Index holding the documents.
        es_doc_type: Document type of the documents.
        es: Client exposing ``delete(index, doc_type, id)``; the module-level
            default client is used when omitted.
    """
    client = default_es if es is None else es

    for doc_id in ids:
        try:
            client.delete(es_index, es_doc_type, doc_id)
        except NotFoundError:
            # A document that is already gone needs no further action.
            pass
14 |
--------------------------------------------------------------------------------
/vis/html/css/cross_filter.css:
--------------------------------------------------------------------------------
1 | .help-dropdown {
2 | padding-top:8px;
3 | }
4 |
5 | .help-dropdown-content {
6 | display: none;
7 | position: absolute;
8 | background-color: #f9f9f9;
9 | min-width: 360px;
10 | box-shadow: 0px 4px 8px 0px rgba(0,0,0,1);
11 | padding: 12px 16px;
12 | z-index: 1000;
13 | }
14 | .help-dropdown:hover .help-dropdown-content {
15 | display: block;
16 | }
17 |
18 | .bokeh_plot {
19 | padding-bottom:30px;
20 | }
21 |
22 | .bokeh_table {
23 | padding-bottom:10px;
24 | }
25 |
--------------------------------------------------------------------------------
/elastic/scripts/put_mapping.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Uploads a mapping file with an HTTP PUT to
# $ELASTIC/$INDEX/$TYPE/_mapping.
#
# Usage: put_mapping.sh [INDEX [TYPE [MAPPING_FILE [ELASTIC_URL]]]]
#   INDEX         index name              (default: memex)
#   TYPE          document type           (default: page)
#   MAPPING_FILE  JSON mapping to upload  (default: mapping.json)
#   ELASTIC_URL   Elasticsearch endpoint  (default: http://localhost:9200)

# ${N-default} substitutes the default only when positional argument N was
# not supplied, which is what the "$#" comparisons expressed.
INDEX=${1-memex}

# The type is echoed only when it was given explicitly on the command line.
if [ $# -gt 1 ]
then
    TYPE=$2
    echo $TYPE
else
    TYPE=page
fi

MAPPING=${3-mapping.json}
ELASTIC=${4-http://localhost:9200}

curl -XPUT "$ELASTIC/$INDEX/$TYPE/_mapping?pretty=1" -d @$MAPPING
32 |
--------------------------------------------------------------------------------
/vis/html/cross_filter_plot_area.html:
--------------------------------------------------------------------------------
1 | {% block content %}
2 |
3 |
{{ widgets_script | safe }}
4 | {{ plots_script | safe }}
5 | {{ plots_div['queries'] | safe }}
6 | {{ plots_div['tags'] | safe }}
7 | {{ plots_div['hostnames'] | safe }}
8 | {{ plots_div['tlds'] | safe }}
9 | {{ plots_div['ts'] | safe }}
10 |
11 | {% endblock content %}
12 |
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: ddt
2 |
3 | channels:
4 | - memex
5 | - vida-nyu
6 |
7 | dependencies:
8 | - elasticsearch
9 | - supervisor
10 | - meld3
11 | - dateutil
12 | - cython >=0.22
13 | - ddt-word2vec
14 | - maven
15 | - nltk
16 | - scipy
17 | - numexpr >=2.4
18 | - scikit-learn >=0.16.1
19 | - pyelasticsearch >=1.2
20 | - cherrypy
21 | - requests
22 | - ache >=0.3.1
23 | - jinja2
24 | - bokeh=0.10.0
25 | - pyldavis=2.1.0
26 | - topik
27 | - functools32
28 | - networkx=1.11
29 |
--------------------------------------------------------------------------------
/bin/ddt-dev:
--------------------------------------------------------------------------------
#!/bin/bash
# Development launcher: runs DDT from the directory tree that contains this
# script (DDT_HOME is the parent of the script's directory).

SCRIPT_PATH="${BASH_SOURCE[0]}"
SCRIPT_DIR=$(dirname "$SCRIPT_PATH")
# ugly, but portable
# The path is handed to Python through argv rather than spliced into the
# program text, so quotes or spaces in the path cannot break the snippet.
export DDT_HOME="$(python -c "import os, sys; sys.stdout.write(os.path.abspath(sys.argv[1])+'\n')" "$SCRIPT_DIR/..")"
echo "DDT_HOME : $DDT_HOME"
export NLTK_DATA="$DDT_HOME/nltk_data"
echo "NLTK_DATA : $NLTK_DATA"
export ACHE_HOME="$(dirname "$(which ache)")/../lib/ache/"
echo "ACHE_HOME : $ACHE_HOME"

# ugly, but DDT doesn't really have a concept of installs
export PYTHONPATH="$DDT_HOME:$PYTHONPATH"
echo "PYTHONPATH: $PYTHONPATH"

python "$DDT_HOME/vis/server.py"
18 |
--------------------------------------------------------------------------------
/elastic/scripts/create_index.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Creates an index with an HTTP PUT to $ELASTIC/$INDEX.
#
# Usage: create_index.sh [INDEX [ELASTIC_URL]]
#   INDEX        index name              (default: memex)
#   ELASTIC_URL  Elasticsearch endpoint  (default: http://localhost:9200)

# ${N-default} substitutes the default only when positional argument N was
# not supplied, which is what the "$#" comparisons expressed.
INDEX=${1-memex}
ELASTIC=${2-http://localhost:9200}

curl -s -XPUT "$ELASTIC/$INDEX"; echo
17 | # -d '{
18 | # "index" : {
19 | # "analysis":{
20 | # "analyzer":{
21 | # "html" : {
22 | # "type" : "custom",
23 | # "tokenizer" : "standard",
24 | # "filter" : ["lowercase" , "stop"],
25 | # "char_filter" : ["html_strip"]
26 | # }
27 | # }
28 | # }
29 | # }
30 | # }'
31 |
--------------------------------------------------------------------------------
/vis/html/js/libs/queue.min.js:
--------------------------------------------------------------------------------
1 | !function(){function n(n){function e(){for(;i=ap;){var u=a++,e=c[u],o=t.call(e,1);o.push(l(u)),++p,e[0].apply(null,o)}}function l(n){return function(u,t){--p,null==s&&(null!=u?(s=u,a=d=0/0,o()):(c[n]=t,--d?i||e():o()))}}function o(){null!=s?m(s):f?m(s,c):m.apply(null,[s].concat(c))}var r,i,f,c=[],a=0,p=0,d=0,s=null,m=u;return n||(n=1/0),r={defer:function(){return s||(c.push(arguments),++d,e()),r},await:function(n){return m=n,f=!1,d||o(),r},awaitAll:function(n){return m=n,f=!0,d||o(),r}}}function u(){}var t=[].slice;n.version="1.0.7","function"==typeof define&&define.amd?define(function(){return n}):"object"==typeof module&&module.exports?module.exports=n:this.queue=n}();
--------------------------------------------------------------------------------
/conda.recipe/meta.yaml:
--------------------------------------------------------------------------------
1 | package:
2 | name: ddt
3 | version: 2.3.0
4 |
5 | build:
6 | number: 0
7 | has_prefix_files:
8 | - lib/ddt/vis/config.conf
9 |
10 | source:
11 | git_url: https://github.com/ViDA-NYU/domain_discovery_tool
12 | git_tag: 2.3
13 |
14 | requirements:
15 | build:
16 | - cython >=0.22
17 | - ddt-word2vec
18 | - maven
19 | - nltk
20 | run:
21 | - scipy
22 | - cython >=0.22
23 | - numexpr >=2.4
24 | - scikit-learn >=0.16.1
25 | - pyelasticsearch >=1.2
26 | - nltk
27 | - cherrypy
28 | - requests
29 | - ddt-word2vec
30 | - ache >=0.3.1
31 | - functools32
32 |
33 | #about:
34 | # license: Apache?
35 |
--------------------------------------------------------------------------------
/vis/config.conf-in:
--------------------------------------------------------------------------------
1 | [global]
2 | server.socket_host = 0.0.0.0
3 | server.socket_port = 8084
4 | server.thread_pool = 10
5 |
6 | [/]
7 | tools.staticdir.root = .
8 | tools.encode.on = True
9 | tools.gzip.on = True
10 |
11 | [/css]
12 | tools.staticdir.on = True
13 | tools.staticdir.dir = css
14 |
15 | [/js]
16 | tools.staticdir.on = True
17 | tools.staticdir.dir = js
18 |
19 | [/img]
20 | tools.staticdir.on = True
21 | tools.staticdir.dir = img
22 |
23 | [/models]
24 | tools.staticdir.on = True
25 | tools.staticdir.dir = models
26 |
27 | [/bootflat-2.0.4]
28 | tools.staticdir.on = True
29 | tools.staticdir.dir = libs/bootflat-2.0.4
30 |
31 | [/bootstrap-datetimepicker-4.15.35]
32 | tools.staticdir.on = True
33 | tools.staticdir.dir = libs/bootstrap-datetimepicker-4.15.35
--------------------------------------------------------------------------------
/online_classifier/tfidf_vector.py:
--------------------------------------------------------------------------------
1 | from sklearn.feature_extraction.text import TfidfTransformer
2 | from nltk import corpus
3 |
4 | from tf_vector import tf_vectorizer
5 |
class tfidf_vectorizer(tf_vectorizer):
    """Count vectorizer that additionally applies TF-IDF weighting.

    The ``TfidfTransformer`` is created and fitted on the first call to
    :meth:`tfidf`; every later call reuses that same transformer.
    """

    def __init__(self, convert_to_ascii=False, max_features=10000, ngram_range=(1, 1)):
        # No transformer exists until tfidf() sees its first batch.
        self.tfidf_transformer = None
        tf_vectorizer.__init__(self, convert_to_ascii, max_features, ngram_range)

    def tfidf(self, data):
        """Vectorize ``data`` and weight the result with TF-IDF.

        Returns:
            A list ``[X, X_counts, features]`` holding the TF-IDF output, the
            first value returned by ``self.vectorize(data)`` and the second
            value returned by it.
        """
        counts, features = self.vectorize(data)

        transformer = self.tfidf_transformer
        if transformer is not None:
            # Already fitted by an earlier call: only transform.
            weighted = transformer.transform(counts)
        else:
            transformer = TfidfTransformer()
            self.tfidf_transformer = transformer
            weighted = transformer.fit_transform(counts)

        return [weighted, counts, features]
21 |
22 |
--------------------------------------------------------------------------------
/seeds_generator/src/test/java/page_downloader/AppTest.java:
--------------------------------------------------------------------------------
1 | package page_downloader;
2 |
3 | import junit.framework.Test;
4 | import junit.framework.TestCase;
5 | import junit.framework.TestSuite;
6 |
7 | /**
8 | * Unit test for simple App.
9 | */
10 | public class AppTest
11 | extends TestCase
12 | {
13 | /**
14 | * Create the test case
15 | *
16 | * @param testName name of the test case
17 | */
18 | public AppTest( String testName )
19 | {
20 | super( testName );
21 | }
22 |
23 | /**
24 | * @return the suite of tests being tested
25 | */
26 | public static Test suite()
27 | {
28 | return new TestSuite( AppTest.class );
29 | }
30 |
31 | /**
32 | * Rigourous Test :-)
33 | */
34 | public void testApp()
35 | {
36 | assertTrue( true );
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/elastic/get_term_vectors.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | from os import environ
3 |
4 | from config import es as default_es
5 |
es = default_es

# Resolve the target index and document type once so that the search and the
# term-vector request below address the same documents. An unset or empty
# environment variable falls back to the default.
es_index = environ.get('ELASTICSEARCH_INDEX') or 'memex'
es_doc_type = environ.get('ELASTICSEARCH_DOC_TYPE') or 'page'

# Match every document but request no fields: only the ids are used.
query = {
    "query": {
        "match_all": {}
    },
    "fields": []
}
# NOTE(review): search(query, index=...) and send_request(...) look like the
# pyelasticsearch client API -- confirm the client exported by config
# provides them.
res = es.search(query, index=es_index, doc_type=es_doc_type)

hits = res['hits']
print('Document found: %d' % hits['total'])
ids = [hit['_id'] for hit in hits['hits']]
body = {
    "ids": ids,
    "parameters": {
        "fields": ["text"]
    }
}
# The request path uses the same index/doc type as the search above; it was
# previously hard-coded to memex/page regardless of the environment.
res = es.send_request('POST',
                      [es_index, es_doc_type, '_mtermvectors'],
                      body=body, query_params={})
30 |
31 |
--------------------------------------------------------------------------------
/ranking/preprocess.py:
--------------------------------------------------------------------------------
1 | from nltk import word_tokenize
2 | from nltk.text import TextCollection
3 | from nltk import corpus
4 |
5 | from pprint import pprint
6 |
# Built once at import time; a set keeps the per-token membership test cheap.
ENGLISH_STOPWORDS = set(corpus.stopwords.words('english'))


class TextPreprocess:
    """Tokenize text and compute per-term frequencies of the retained terms."""

    def __init__(self, display=False):
        # When true, preprocess() prints the raw token list.
        self.display = display

    def preprocess(self, text):
        """Return a dict mapping each retained term to its frequency.

        Tokens are stripped and lower-cased; tokens of two characters or
        fewer and English stopwords are dropped.

        Args:
            text: The raw text to tokenize.

        Returns:
            ``{term: tc.tf(term, terms) * len(terms)}`` for every distinct
            retained term (presumably the occurrence count as a float --
            confirm against the NLTK ``TextCollection.tf`` documentation).
        """
        tokens = word_tokenize(text)
        if self.display:
            print("After Tokenizing")
            print(tokens)
            print("\n\n")

        # Lower-case *before* the stopword test: the NLTK stopword list is
        # lower-case, so testing the raw token let capitalised stopwords
        # such as "The" slip through.
        terms = []
        for token in tokens:
            term = token.strip().lower()
            if len(term) > 2 and term not in ENGLISH_STOPWORDS:
                terms.append(term)

        tc = TextCollection([terms])
        words = list(set(tc))

        word_tf = {word: tc.tf(word, terms) * len(terms) for word in words}

        return word_tf
29 |
--------------------------------------------------------------------------------
/ranking/BayesianSets.py:
--------------------------------------------------------------------------------
1 | from math import sqrt
2 | from numpy import *
3 |
4 | import sys
5 |
6 | reload(sys)
7 | sys.setdefaultencoding("utf-8")
8 |
9 |
class BayesianSets:
    """Scores data items against a query set (Bayesian Sets parameters)."""

    def score(self, D, X):
        """Return one score per row of ``X`` relative to the query set ``D``.

        Args:
            D: Query set, one item per row.
            X: Data set, one item per row, with the same columns as ``D``.

        Returns:
            ``asarray`` of the scores ``C + dot(X, q)``.
        """
        # Prior strength and query-set size.
        c = 2
        N = D.shape[0]

        # Column means over the query and data rows together.
        T = concatenate((D, X))
        m = divide(sum(T, axis=0), T.shape[0])

        # Prior parameters derived from the column means.
        a = multiply(m, c)
        b = multiply(subtract(1, m), c)

        # Parameters updated with the per-column sums of the query set.
        query_totals = sum(D, axis=0)
        at = add(a, query_totals)
        bt = subtract(add(b, N), query_totals)

        # Constant term shared by every row of X.
        ab = add(a, b)
        per_column = subtract(log(ab), log(add(ab, N)))
        per_column = add(per_column, log(bt))
        per_column = subtract(per_column, log(b))
        C = sum(per_column)

        # Per-column weights applied to each row of X.
        weights = subtract(log(at), log(a))
        weights = subtract(weights, log(bt))
        weights = add(weights, log(b))
        q = transpose(weights)

        score_X = transpose(add(C, dot(X, q)))

        return asarray(score_X)
34 |
35 |
--------------------------------------------------------------------------------
/elastic/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "domains" : {
3 | "_timestamp" : {
4 | "enabled" : true,
5 | "store" : true
6 | },
7 | "properties" : {
8 | "domain_name" : {
9 | "type" : "string"
10 | },
11 | "timestamp" : {
12 | "type" : "date"
13 | },
14 | "index" : {
15 | "type" : "string"
16 | },
17 | "doc_type": {
18 | "type": "string"
19 | },
20 | "mapping":{
21 | "properties": {
22 | "timestamp": {
23 | "type": "string"
24 | },
25 | "text": {
26 | "type": "string"
27 | },
28 | "html": {
29 | "type": "string"
30 | },
31 | "tag":{
32 | "type": "string"
33 | },
34 | "content-type":{
35 | "type": "string"
36 | }
37 | }
38 | },
39 | "tag_colors": {
40 | "properties": {
41 | "index": {
42 | "type": "integer"
43 | },
44 | "colors": {
45 | "type": "string"
46 | }
47 | }
48 | }
49 | }
50 | }
51 | }
52 |
--------------------------------------------------------------------------------
/supervisord.conf:
--------------------------------------------------------------------------------
1 | [supervisorctl]
2 | username=darpamemex
3 |
4 | [supervisord]
5 | childlogdir=logs
6 | logfile=supervisord.log ; (main log file;default $CWD/supervisord.log)
7 | logfile_maxbytes=50MB ; (max main logfile bytes b4 rotation;default 50MB)
8 | logfile_backups=10 ; (num of main logfile rotation backups;default 10)
9 | loglevel=info ; (log level;default info; others: debug,warn,trace)
10 | pidfile=supervisord.pid ; (supervisord pidfile;default supervisord.pid)
11 | nodaemon=true ; (start in foreground if true;default false)
12 | minfds=1024 ; (min. avail startup file descriptors;default 1024)
13 | minprocs=200 ; (min. avail process descriptors;default 200)
14 |
15 | [inet_http_server]
16 | port = 127.0.0.1:9001
17 |
18 | [program:elasticsearch]
19 | command=elasticsearch
20 | priority=1
21 |
22 | [program:ddt]
23 | command=bash ./bin/ddt-dev
24 | priority=2
25 |
26 | [rpcinterface:supervisor]
27 | supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface
--------------------------------------------------------------------------------
/elastic/load_config.py:
--------------------------------------------------------------------------------
1 | import json
2 | import sys
3 | from datetime import datetime
4 | from add_documents import add_document
5 |
6 | from config import es as default_es
7 |
def load_config(entries, es_index='config', es_doc_type='domains', es=None):
    """Hand ``entries`` to ``add_document`` for the given index and type.

    Args:
        entries: Passed through unchanged as the first argument of
            ``add_document``.
        es_index: Target index name.
        es_doc_type: Target document type.
        es: Client to use; the module-level default client is used when
            omitted.
    """
    client = default_es if es is None else es
    add_document(entries, es_index, es_doc_type, client)
14 |
if __name__ == "__main__":
    # Usage: load_config.py [CONFIG_FILE [ES_INDEX [ES_DOC_TYPE [ES_HOST]]]]
    argc = len(sys.argv)

    config_file = sys.argv[1] if argc > 1 else 'ddt_index_config_entries.json'
    es_index = sys.argv[2] if argc > 2 else 'config'
    es_doc_type = sys.argv[3] if argc > 3 else 'domains'

    if argc > 4:
        # The client library is imported only when a host is given.
        es_host = sys.argv[4]
        from pyelasticsearch import ElasticSearch
        es = ElasticSearch(es_host)
    else:
        # None makes load_config fall back to the default client.
        es = None

    # NOTE(review): the file *path* is passed as ``entries``; nothing here
    # parses the JSON file. Confirm that add_document accepts a path.
    load_config(config_file, es_index, es_doc_type, es)
39 |
40 |
--------------------------------------------------------------------------------
/elastic/config.py:
--------------------------------------------------------------------------------
1 | '''
2 | provides access to elasticsearch server
3 |
4 | es_server - the name of the endpoint
5 | es - an Elasticsearch instance connected to es_server
6 | '''
7 |
8 | from elasticsearch import Elasticsearch
9 | from os import environ
10 | import certifi
11 |
12 | if environ.get('ELASTICSEARCH_SERVER'):
13 | es_server = environ['ELASTICSEARCH_SERVER']
14 | else:
15 | es_server = 'http://localhost:9200/'
16 |
17 | print 'ELASTICSEARCH_SERVER ', es_server
18 |
19 | if environ.get('ELASTICSEARCH_USER'):
20 | es_user = environ['ELASTICSEARCH_USER']
21 | else:
22 | es_user = ""
23 |
24 | print 'ELASTICSEARCH_USER ', es_user
25 |
26 | if environ.get('ELASTICSEARCH_PASSWD'):
27 | es_passwd = environ['ELASTICSEARCH_PASSWD']
28 | else:
29 | es_passwd = ""
30 |
31 | if es_user:
32 | es = Elasticsearch([es_server], http_auth=(es_user, es_passwd), use_ssl=True, verify_certs=True, ca_certs=certifi.where(), timeout=100)
33 | else:
34 | es = Elasticsearch([es_server])
35 |
36 | if environ.get('ELASTICSEARCH_DOC_TYPE'):
37 | es_doc_type = environ['ELASTICSEARCH_DOC_TYPE']
38 | else:
39 | es_doc_type = 'page'
40 |
41 |
42 |
--------------------------------------------------------------------------------
/vis/html/base.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 | Domain Discovery Tool
11 |
12 |
13 |