├── server ├── __init__.py ├── config.conf-in └── server.py ├── docs ├── .gitignore ├── contact.rst ├── figures │ ├── search.png │ ├── sf_pm.png │ ├── terms.png │ ├── add_tag.png │ ├── domains.png │ ├── filters.png │ ├── pm_icon.png │ ├── query_web.png │ ├── tag_all.png │ ├── tag_one.png │ ├── deep_crawl.png │ ├── new_domain.png │ ├── saved_model.png │ ├── crawler_start.png │ ├── crawler_view.png │ ├── ddt_arch-new.png │ ├── deep_crawl_tag.png │ ├── delete_domain.png │ ├── empty_domain.png │ ├── focused_crawl.png │ ├── load_url_text.png │ ├── model_download.png │ ├── model_dropdown.png │ ├── multi_select.png │ ├── add_ebola_domain.png │ ├── fwd_back_single.png │ ├── load_urls_popup.png │ ├── add_domain_button.png │ ├── explore_data_view.png │ ├── delete_domain_button.png │ ├── load_multiple_queries.png │ ├── seedfinder_search_new.png │ ├── visualization_general.png │ ├── visualization_toolbar.png │ ├── annotated_pages_updated.png │ ├── visualization_toolbar1.png │ ├── visualization_toolbar_select.png │ ├── visualization_general_tagpages.png │ ├── visualization_general_wordcloud.png │ ├── visualization_toolbar_find_keyword.png │ ├── visualization_toolbar_lassoSelect.png │ ├── visualization_toolbar_transaltion.png │ ├── visualization_toolbar_find_keyword_result.png │ └── visualization_toolbar_lassoSelect_result.png ├── tag_individual.rst ├── tag_all.rst ├── tag_everything.rst ├── visualization_radviz_toolbar_translation.rst ├── visualization_radviz_toolbar.rst ├── run_crawler.rst ├── tag_multiselect.rst ├── custom.rst ├── how_to_annotate.rst ├── deep_crawl_tag.rst ├── publication.rst ├── visualization_radviz_toolbar_select.rst ├── del_domain.rst ├── visualization_radviz_toolbar_findKeyword.rst ├── annotations.rst ├── make.bat ├── Makefile ├── add_domain.rst ├── deep_crawl.rst ├── use.rst ├── focused_crawl.rst ├── visualization_radviz_lasso_selection.rst ├── seedfinder.rst ├── terms_summary.rst ├── tutorials.rst ├── create_model.rst ├── crawl_fwd_back.rst ├── index.rst ├── load_data.rst ├── filter.rst ├── ebola_urls.txt ├── visualization_radviz.rst ├── conf.py └── install.rst ├── client ├── src │ ├── index.css │ ├── images │ │ ├── dicon.png │ │ ├── qicon.png │ │ ├── ticon.png │ │ ├── ddt_logo.png │ │ ├── ddt_logo2.png │ │ ├── qicon_1.png │ │ ├── search_icon.png │ │ ├── nyu_logo_purple.png │ │ ├── images_not_available.png │ │ ├── nyu_logo_new_york_university.jpg │ │ └── logo.svg │ ├── index.js │ ├── App.test.js │ ├── css │ │ ├── Views.css │ │ └── Components.css │ ├── App.css │ ├── App.js │ ├── config │ │ └── routes.js │ ├── components │ │ ├── ScaleBar.js │ │ ├── Scatterplot.js │ │ ├── Domain.js │ │ ├── Monitoring.js │ │ ├── DomainInfo.js │ │ ├── Search.js │ │ ├── SidebarMenu.js │ │ ├── TermsSnippetViewer.js │ │ ├── Filters.js │ │ ├── CrawlingView.js │ │ ├── MultiselectTable.js │ │ ├── Terms.js │ │ ├── RadViz.js │ │ ├── Home.js │ │ └── Body.js │ └── utils │ │ ├── stopword-filter.js │ │ └── utils │ │ └── stopword-filter.js ├── public │ ├── font-awesome-4.7.0 │ │ ├── fonts │ │ │ ├── FontAwesome.otf │ │ │ ├── fontawesome-webfont.eot │ │ │ ├── fontawesome-webfont.ttf │ │ │ ├── fontawesome-webfont.woff │ │ │ └── fontawesome-webfont.woff2 │ │ ├── less │ │ │ ├── fixed-width.less │ │ │ ├── screen-reader.less │ │ │ ├── larger.less │ │ │ ├── list.less │ │ │ ├── core.less │ │ │ ├── stacked.less │ │ │ ├── font-awesome.less │ │ │ ├── bordered-pulled.less │ │ │ ├── rotated-flipped.less │ │ │ ├── path.less │ │ │ ├── animated.less │ │ │ └── mixins.less │ │ ├── scss │ │ │ ├── _fixed-width.scss │ │ │ ├── 
_screen-reader.scss │ │ │ ├── _larger.scss │ │ │ ├── _list.scss │ │ │ ├── _core.scss │ │ │ ├── font-awesome.scss │ │ │ ├── _stacked.scss │ │ │ ├── _bordered-pulled.scss │ │ │ ├── _rotated-flipped.scss │ │ │ ├── _path.scss │ │ │ ├── _animated.scss │ │ │ └── _mixins.scss │ │ └── HELP-US-OUT.txt │ └── index.html ├── fix_for_npm_child_process_issue.py └── package.json ├── bin ├── run_docker_ddt.zip ├── ddt ├── ddt-dev ├── run_ddt └── run_docker_ddt ├── environment.yml ├── docker-compose.yml ├── ache.yml ├── .gitignore ├── docker-compose.yml.ache ├── README.md ├── Dockerfile └── Makefile /server/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | html/ 2 | doctrees/ -------------------------------------------------------------------------------- /docs/contact.rst: -------------------------------------------------------------------------------- 1 | Contact 2 | ======= 3 | 4 | DDT Development Team [ddt-dev@vgc.poly.edu] 5 | -------------------------------------------------------------------------------- /client/src/index.css: -------------------------------------------------------------------------------- 1 | body { 2 | margin: 0; 3 | padding: 0; 4 | font-family: sans-serif; 5 | } 6 | -------------------------------------------------------------------------------- /bin/run_docker_ddt.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/bin/run_docker_ddt.zip -------------------------------------------------------------------------------- /docs/figures/search.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/docs/figures/search.png -------------------------------------------------------------------------------- /docs/figures/sf_pm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/docs/figures/sf_pm.png -------------------------------------------------------------------------------- /docs/figures/terms.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/docs/figures/terms.png -------------------------------------------------------------------------------- /docs/figures/add_tag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/docs/figures/add_tag.png -------------------------------------------------------------------------------- /docs/figures/domains.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/docs/figures/domains.png -------------------------------------------------------------------------------- /docs/figures/filters.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/docs/figures/filters.png -------------------------------------------------------------------------------- /docs/figures/pm_icon.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/docs/figures/pm_icon.png -------------------------------------------------------------------------------- /docs/figures/query_web.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/docs/figures/query_web.png -------------------------------------------------------------------------------- /docs/figures/tag_all.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/docs/figures/tag_all.png -------------------------------------------------------------------------------- /docs/figures/tag_one.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/docs/figures/tag_one.png -------------------------------------------------------------------------------- /client/src/images/dicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/client/src/images/dicon.png -------------------------------------------------------------------------------- /client/src/images/qicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/client/src/images/qicon.png -------------------------------------------------------------------------------- /client/src/images/ticon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/client/src/images/ticon.png -------------------------------------------------------------------------------- /docs/figures/deep_crawl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/docs/figures/deep_crawl.png -------------------------------------------------------------------------------- /docs/figures/new_domain.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/docs/figures/new_domain.png -------------------------------------------------------------------------------- /docs/figures/saved_model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/docs/figures/saved_model.png -------------------------------------------------------------------------------- /client/src/images/ddt_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/client/src/images/ddt_logo.png -------------------------------------------------------------------------------- /client/src/images/ddt_logo2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/client/src/images/ddt_logo2.png -------------------------------------------------------------------------------- /client/src/images/qicon_1.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/client/src/images/qicon_1.png -------------------------------------------------------------------------------- /docs/figures/crawler_start.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/docs/figures/crawler_start.png -------------------------------------------------------------------------------- /docs/figures/crawler_view.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/docs/figures/crawler_view.png -------------------------------------------------------------------------------- /docs/figures/ddt_arch-new.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/docs/figures/ddt_arch-new.png -------------------------------------------------------------------------------- /docs/figures/deep_crawl_tag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/docs/figures/deep_crawl_tag.png -------------------------------------------------------------------------------- /docs/figures/delete_domain.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/docs/figures/delete_domain.png -------------------------------------------------------------------------------- /docs/figures/empty_domain.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/docs/figures/empty_domain.png -------------------------------------------------------------------------------- /docs/figures/focused_crawl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/docs/figures/focused_crawl.png -------------------------------------------------------------------------------- /docs/figures/load_url_text.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/docs/figures/load_url_text.png -------------------------------------------------------------------------------- /docs/figures/model_download.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/docs/figures/model_download.png -------------------------------------------------------------------------------- /docs/figures/model_dropdown.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/docs/figures/model_dropdown.png -------------------------------------------------------------------------------- /docs/figures/multi_select.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/docs/figures/multi_select.png -------------------------------------------------------------------------------- /client/src/images/search_icon.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/client/src/images/search_icon.png -------------------------------------------------------------------------------- /docs/figures/add_ebola_domain.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/docs/figures/add_ebola_domain.png -------------------------------------------------------------------------------- /docs/figures/fwd_back_single.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/docs/figures/fwd_back_single.png -------------------------------------------------------------------------------- /docs/figures/load_urls_popup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/docs/figures/load_urls_popup.png -------------------------------------------------------------------------------- /docs/figures/add_domain_button.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/docs/figures/add_domain_button.png -------------------------------------------------------------------------------- /docs/figures/explore_data_view.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/docs/figures/explore_data_view.png -------------------------------------------------------------------------------- /client/src/images/nyu_logo_purple.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/client/src/images/nyu_logo_purple.png -------------------------------------------------------------------------------- /docs/figures/delete_domain_button.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/docs/figures/delete_domain_button.png -------------------------------------------------------------------------------- /docs/figures/load_multiple_queries.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/docs/figures/load_multiple_queries.png -------------------------------------------------------------------------------- /docs/figures/seedfinder_search_new.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/docs/figures/seedfinder_search_new.png -------------------------------------------------------------------------------- /docs/figures/visualization_general.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/docs/figures/visualization_general.png -------------------------------------------------------------------------------- /docs/figures/visualization_toolbar.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/docs/figures/visualization_toolbar.png -------------------------------------------------------------------------------- /docs/figures/annotated_pages_updated.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/docs/figures/annotated_pages_updated.png -------------------------------------------------------------------------------- /docs/figures/visualization_toolbar1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/docs/figures/visualization_toolbar1.png -------------------------------------------------------------------------------- /client/src/images/images_not_available.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/client/src/images/images_not_available.png -------------------------------------------------------------------------------- /docs/tag_individual.rst: -------------------------------------------------------------------------------- 1 | .. |tag_one| image:: figures/tag_one.png 2 | 3 | |tag_one| buttons, along each page, can be used to tag individual pages. 4 | -------------------------------------------------------------------------------- /docs/figures/visualization_toolbar_select.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/docs/figures/visualization_toolbar_select.png -------------------------------------------------------------------------------- /docs/figures/visualization_general_tagpages.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/docs/figures/visualization_general_tagpages.png -------------------------------------------------------------------------------- /docs/figures/visualization_general_wordcloud.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/docs/figures/visualization_general_wordcloud.png -------------------------------------------------------------------------------- /client/src/images/nyu_logo_new_york_university.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/client/src/images/nyu_logo_new_york_university.jpg -------------------------------------------------------------------------------- /docs/figures/visualization_toolbar_find_keyword.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/docs/figures/visualization_toolbar_find_keyword.png -------------------------------------------------------------------------------- /docs/figures/visualization_toolbar_lassoSelect.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/docs/figures/visualization_toolbar_lassoSelect.png -------------------------------------------------------------------------------- /docs/figures/visualization_toolbar_transaltion.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/docs/figures/visualization_toolbar_transaltion.png -------------------------------------------------------------------------------- /docs/tag_all.rst: -------------------------------------------------------------------------------- 1 | .. |tag_all| image:: figures/tag_all.png 2 | 3 | Use the |tag_all| buttons at the top of the list of pages to tag all pages in the current view 4 | -------------------------------------------------------------------------------- /client/public/font-awesome-4.7.0/fonts/FontAwesome.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/client/public/font-awesome-4.7.0/fonts/FontAwesome.otf -------------------------------------------------------------------------------- /docs/figures/visualization_toolbar_find_keyword_result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/docs/figures/visualization_toolbar_find_keyword_result.png -------------------------------------------------------------------------------- /docs/figures/visualization_toolbar_lassoSelect_result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/docs/figures/visualization_toolbar_lassoSelect_result.png -------------------------------------------------------------------------------- /client/public/font-awesome-4.7.0/fonts/fontawesome-webfont.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/client/public/font-awesome-4.7.0/fonts/fontawesome-webfont.eot -------------------------------------------------------------------------------- /client/public/font-awesome-4.7.0/fonts/fontawesome-webfont.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/client/public/font-awesome-4.7.0/fonts/fontawesome-webfont.ttf -------------------------------------------------------------------------------- /client/public/font-awesome-4.7.0/fonts/fontawesome-webfont.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/client/public/font-awesome-4.7.0/fonts/fontawesome-webfont.woff -------------------------------------------------------------------------------- /client/public/font-awesome-4.7.0/fonts/fontawesome-webfont.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/domain_discovery_tool/HEAD/client/public/font-awesome-4.7.0/fonts/fontawesome-webfont.woff2 -------------------------------------------------------------------------------- /client/public/font-awesome-4.7.0/less/fixed-width.less: -------------------------------------------------------------------------------- 1 | // Fixed Width Icons 2 | // ------------------------- 3 | .@{fa-css-prefix}-fw { 4 | width: (18em / 14); 5 | text-align: center; 6 | } 7 | -------------------------------------------------------------------------------- /client/public/font-awesome-4.7.0/less/screen-reader.less: 
-------------------------------------------------------------------------------- 1 | // Screen Readers 2 | // ------------------------- 3 | 4 | .sr-only { .sr-only(); } 5 | .sr-only-focusable { .sr-only-focusable(); } 6 |
-------------------------------------------------------------------------------- /client/public/font-awesome-4.7.0/scss/_fixed-width.scss: --------------------------------------------------------------------------------
1 | // Fixed Width Icons 2 | // ------------------------- 3 | .#{$fa-css-prefix}-fw { 4 | width: (18em / 14); 5 | text-align: center; 6 | } 7 |
-------------------------------------------------------------------------------- /client/public/font-awesome-4.7.0/scss/_screen-reader.scss: --------------------------------------------------------------------------------
1 | // Screen Readers 2 | // ------------------------- 3 | 4 | .sr-only { @include sr-only(); } 5 | .sr-only-focusable { @include sr-only-focusable(); } 6 |
-------------------------------------------------------------------------------- /client/src/index.js: --------------------------------------------------------------------------------
1 | import React from 'react'; 2 | import ReactDOM from 'react-dom'; 3 | import App from './App'; 4 | import './index.css'; 5 | 6 | ReactDOM.render( 7 | <App />, 8 | document.getElementById('root') 9 | ); 10 |
-------------------------------------------------------------------------------- /client/src/App.test.js: --------------------------------------------------------------------------------
1 | import React from 'react'; 2 | import ReactDOM from 'react-dom'; 3 | import App from './App'; 4 | 5 | it('renders without crashing', () => { 6 | const div = document.createElement('div'); 7 | ReactDOM.render(<App />, div); 8 | }); 9 |
-------------------------------------------------------------------------------- /docs/tag_everything.rst: --------------------------------------------------------------------------------
1 | If you want to tag all pages retrieved for a particular filter (across pagination), then check the **Select ALL results in paginations** checkbox below the page list at the top left. Then use the |tag_all| buttons to tag all the pages. 2 |
-------------------------------------------------------------------------------- /docs/visualization_radviz_toolbar_translation.rst: --------------------------------------------------------------------------------
1 | .. |toolbar_translation| image:: figures/visualization_toolbar_transaltion.png 2 | 3 | The |toolbar_translation| slider allows you to calibrate the degree of density or sparsity of the representations of the pages in the visualization. 4 |
-------------------------------------------------------------------------------- /docs/visualization_radviz_toolbar.rst: --------------------------------------------------------------------------------
1 | 2 | .. image:: figures/visualization_toolbar1.png 3 | :width: 700px 4 | :align: center 5 | :height: 80px 6 | :alt: alternate text 7 | 8 | This visualization has five controls for interacting with it. Each of them is described below. 9 |
-------------------------------------------------------------------------------- /docs/run_crawler.rst: --------------------------------------------------------------------------------
1 | Run Crawler 2 | ----------- 3 | 4 | Once a sufficiently good model is available or pages are tagged for a deep crawl, you can change from the **Explore Data View** to the **Crawler View** to start the crawls, as shown below: 5 | 6 | ..
image:: figures/crawler_view.png 7 | :width: 800px 8 | :align: center 9 | :height: 400px 10 | :alt: alternate text 11 | -------------------------------------------------------------------------------- /client/public/font-awesome-4.7.0/HELP-US-OUT.txt: -------------------------------------------------------------------------------- 1 | I hope you love Font Awesome. If you've found it useful, please do me a favor and check out my latest project, 2 | Fort Awesome (https://fortawesome.com). It makes it easy to put the perfect icons on your website. Choose from our awesome, 3 | comprehensive icon sets or copy and paste your own. 4 | 5 | Please. Check it out. 6 | 7 | -Dave Gandy 8 | -------------------------------------------------------------------------------- /server/config.conf-in: -------------------------------------------------------------------------------- 1 | [global] 2 | server.socket_host = 0.0.0.0 3 | server.socket_port = 8084 4 | server.thread_pool = 10 5 | 6 | [/] 7 | tools.staticdir.root = . 8 | tools.encode.on = True 9 | tools.gzip.on = True 10 | 11 | [/static] 12 | tools.staticdir.on = True 13 | tools.staticdir.dir = static 14 | 15 | [/models] 16 | tools.staticdir.on = True 17 | tools.staticdir.dir = models 18 | -------------------------------------------------------------------------------- /docs/tag_multiselect.rst: -------------------------------------------------------------------------------- 1 | Select multiple pages by keeping the **ctrl** key pressed and clicking on the pages that you want to select. When done with selecting pages, release the **ctrl** key. This will bring up a window where you can tag the pages as shown below: 2 | 3 | .. image:: figures/multi_select.png 4 | :width: 800px 5 | :align: center 6 | :height: 400px 7 | :alt: alternate text 8 | -------------------------------------------------------------------------------- /docs/custom.rst: -------------------------------------------------------------------------------- 1 | Custom tags can be added using Add Tag text box as shown below. Enter the custom tag in the Add Tag text box and press **enter** key. This adds the tag as a chip below the page info. This can be applied to individual, selected or all pages similar to relevant and irrelevant tags. 2 | 3 | .. image:: figures/add_tag.png 4 | :width: 800px 5 | :align: center 6 | :height: 400px 7 | :alt: alternate text 8 | -------------------------------------------------------------------------------- /bin/ddt: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPT_PATH="${BASH_SOURCE[0]}"; 4 | SCRIPT_DIR=$(dirname $SCRIPT_PATH) 5 | 6 | export NLTK_DATA=$SCRIPT_DIR/../lib/ddt/nltk_data 7 | export ACHE_HOME=$SCRIPT_DIR/../lib/ache/ 8 | export DDT_HOME=$SCRIPT_DIR/../lib/ddt 9 | # ugly, but DDT doesn't really have a concept of installs 10 | export PYTHONPATH=$SCRIPT_DIR/../lib/ddt:$PYTHONPATH 11 | 12 | python $SCRIPT_DIR/../lib/ddt/vis/server.py 13 | -------------------------------------------------------------------------------- /docs/how_to_annotate.rst: -------------------------------------------------------------------------------- 1 | In the **Explore Data View** you see the pages for the domain (based on any filters applied) in two ways: through **Snippets** and **Visualizations**, as shown below: 2 | 3 | .. 
image:: figures/annotated_pages_updated.png 4 | :width: 800px 5 | :align: center 6 | :height: 400px 7 | :alt: alternate text 8 | 9 | The different mechanisms for annotating pages through **Snippet** are: 10 |
-------------------------------------------------------------------------------- /client/src/css/Views.css: --------------------------------------------------------------------------------
1 | .url-link { 2 | font-size: 14px; 3 | color: #006621; 4 | margin-bottom: 4px; 5 | text-overflow: ellipsis; 6 | overflow: hidden; 7 | white-space: nowrap; 8 | display: block; 9 | } 10 | 11 | .url-title-link { 12 | height: 20px; 13 | font-size: 18px; 14 | color: #1a0dab; 15 | text-overflow: ellipsis; 16 | overflow: hidden; 17 | white-space: nowrap; 18 | display: block; 19 | } 20 |
-------------------------------------------------------------------------------- /docs/deep_crawl_tag.rst: --------------------------------------------------------------------------------
1 | Some tags, such as **Deep Crawl**, are pre-configured. The user can tag a page (or a group of pages) for deep crawl by choosing the tag from the Add Tag drop-down as shown below. For example, if the user wants to deep crawl all the uploaded pages, they can tag those pages as **Deep Crawl**. 2 | 3 | .. image:: figures/deep_crawl_tag.png 4 | :width: 800px 5 | :align: center 6 | :height: 400px 7 | :alt: alternate text 8 |
-------------------------------------------------------------------------------- /docs/publication.rst: --------------------------------------------------------------------------------
1 | Publication 2 | =========== 3 | 4 | Yamuna Krishnamurthy, Kien Pham, Aecio Santos, and Juliana Freire. 2016. `Interactive Web Content Exploration for Domain Discovery `_ (Interactive Data Exploration and Analytics (`IDEA `_) Workshop at Knowledge Discovery and Data Mining (`KDD `_), San Francisco, CA).
5 | -------------------------------------------------------------------------------- /client/src/App.css: -------------------------------------------------------------------------------- 1 | .App { 2 | text-align: center; 3 | } 4 | 5 | .App-logo { 6 | animation: App-logo-spin infinite 20s linear; 7 | height: 80px; 8 | } 9 | 10 | .App-header { 11 | background-color: #222; 12 | height: 150px; 13 | padding: 20px; 14 | color: white; 15 | } 16 | 17 | .App-intro { 18 | font-size: large; 19 | } 20 | 21 | @keyframes App-logo-spin { 22 | from { transform: rotate(0deg); } 23 | to { transform: rotate(360deg); } 24 | } 25 | -------------------------------------------------------------------------------- /client/public/font-awesome-4.7.0/less/larger.less: -------------------------------------------------------------------------------- 1 | // Icon Sizes 2 | // ------------------------- 3 | 4 | /* makes the font 33% larger relative to the icon container */ 5 | .@{fa-css-prefix}-lg { 6 | font-size: (4em / 3); 7 | line-height: (3em / 4); 8 | vertical-align: -15%; 9 | } 10 | .@{fa-css-prefix}-2x { font-size: 2em; } 11 | .@{fa-css-prefix}-3x { font-size: 3em; } 12 | .@{fa-css-prefix}-4x { font-size: 4em; } 13 | .@{fa-css-prefix}-5x { font-size: 5em; } 14 | -------------------------------------------------------------------------------- /client/public/font-awesome-4.7.0/scss/_larger.scss: -------------------------------------------------------------------------------- 1 | // Icon Sizes 2 | // ------------------------- 3 | 4 | /* makes the font 33% larger relative to the icon container */ 5 | .#{$fa-css-prefix}-lg { 6 | font-size: (4em / 3); 7 | line-height: (3em / 4); 8 | vertical-align: -15%; 9 | } 10 | .#{$fa-css-prefix}-2x { font-size: 2em; } 11 | .#{$fa-css-prefix}-3x { font-size: 3em; } 12 | .#{$fa-css-prefix}-4x { font-size: 4em; } 13 | .#{$fa-css-prefix}-5x { font-size: 5em; } 14 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: ddt 2 | 3 | channels: 4 | - conda-forge 5 | - memex 6 | - anaconda 7 | - vida-nyu 8 | 9 | dependencies: 10 | - dateutil 11 | - cython >=0.22 12 | - nltk 13 | - scipy 14 | - numexpr >=2.4 15 | - scikit-learn >=0.18.1 16 | - pyelasticsearch >=1.2 17 | - cherrypy 18 | - requests 19 | - ache ==0.9.0 20 | - functools32 21 | - jinja2 22 | - nodejs 23 | - futures 24 | - openjdk ==8.0.152 25 | - joblib 26 | -------------------------------------------------------------------------------- /client/public/font-awesome-4.7.0/less/list.less: -------------------------------------------------------------------------------- 1 | // List Icons 2 | // ------------------------- 3 | 4 | .@{fa-css-prefix}-ul { 5 | padding-left: 0; 6 | margin-left: @fa-li-width; 7 | list-style-type: none; 8 | > li { position: relative; } 9 | } 10 | .@{fa-css-prefix}-li { 11 | position: absolute; 12 | left: -@fa-li-width; 13 | width: @fa-li-width; 14 | top: (2em / 14); 15 | text-align: center; 16 | &.@{fa-css-prefix}-lg { 17 | left: (-@fa-li-width + (4em / 14)); 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /client/public/font-awesome-4.7.0/scss/_list.scss: -------------------------------------------------------------------------------- 1 | // List Icons 2 | // ------------------------- 3 | 4 | .#{$fa-css-prefix}-ul { 5 | padding-left: 0; 6 | margin-left: $fa-li-width; 7 | list-style-type: none; 8 | > li { position: relative; 
} 9 | } 10 | .#{$fa-css-prefix}-li { 11 | position: absolute; 12 | left: -$fa-li-width; 13 | width: $fa-li-width; 14 | top: (2em / 14); 15 | text-align: center; 16 | &.#{$fa-css-prefix}-lg { 17 | left: -$fa-li-width + (4em / 14); 18 | } 19 | } 20 |
-------------------------------------------------------------------------------- /docs/visualization_radviz_toolbar_select.rst: --------------------------------------------------------------------------------
1 | .. |toolbar_select| image:: figures/visualization_toolbar_select.png 2 | 3 | The |toolbar_select| radio buttons can be used to show or hide data on RadViz. 4 | 5 | **Show all:** Show all is selected by default in this visualization. It shows all the pages present in the data collection. 6 | **Hide selected:** This option hides the selected pages of the current view. 7 | **Hide unselected:** This option does the opposite of the previous one: it hides the pages that are not selected. 8 |
-------------------------------------------------------------------------------- /client/fix_for_npm_child_process_issue.py: --------------------------------------------------------------------------------
1 | # This is hopefully a temporary fix for the recent changes to npm 2 | # that break with the error "Module not found: Error: Cannot 3 | # resolve module 'child_process'" 4 | 5 | import json 6 | 7 | with open("node_modules/xmlhttprequest/package.json") as f: 8 | package_json = json.load(f) 9 | 10 | package_json["browser"] = {"child_process": False} 11 | 12 | with open("node_modules/xmlhttprequest/package.json", "w") as f: 13 | json.dump(package_json, f, indent=2) 14 | 15 | 16 |
-------------------------------------------------------------------------------- /docs/del_domain.rst: --------------------------------------------------------------------------------
1 | Delete Domain 2 | ------------- 3 | 4 | Domains can be deleted by clicking on the **Delete Domain** button. 5 | 6 | .. image:: figures/delete_domain.png 7 | :width: 800px 8 | :align: center 9 | :height: 400px 10 | :alt: alternate text 11 | 12 | On the **Deleting a domain** dialog, select the domains to be deleted in the list of current domains and click on the **Submit** button. They will no longer appear on the domains list.
13 | 14 | **NOTE: This will delete all the data collected for that domain.** 15 |
-------------------------------------------------------------------------------- /client/src/css/Components.css: --------------------------------------------------------------------------------
1 | .Menus-child{ 2 | margin: 0px 0px 0px 2px; 3 | } 4 | .View-body{ 5 | margin-left: -88px; 6 | margin-top: 8px; 7 | margin-right: -86px; 8 | margin-bottom: 8px; 9 | background-color: #FFFFFF; 10 | border-radius: 10px 10px 10px 10px 11 | } 12 | .Menus-body{ 13 | background-color: #FFFFFF; 14 | padding-top: 8px; 15 | margin-left: -11px; 16 | } 17 | 18 | .boder { 19 | width: 100%; 20 | background-color: #FFFFFF; 21 | } 22 | em{ 23 | font-weight: bold; 24 | font-style: normal; 25 | } 26 |
-------------------------------------------------------------------------------- /client/public/font-awesome-4.7.0/less/core.less: --------------------------------------------------------------------------------
1 | // Base Class Definition 2 | // ------------------------- 3 | 4 | .@{fa-css-prefix} { 5 | display: inline-block; 6 | font: normal normal normal @fa-font-size-base/@fa-line-height-base FontAwesome; // shortening font declaration 7 | font-size: inherit; // can't have font-size inherit on line above, so need to override 8 | text-rendering: auto; // optimizelegibility throws things off #1094 9 | -webkit-font-smoothing: antialiased; 10 | -moz-osx-font-smoothing: grayscale; 11 | 12 | } 13 |
-------------------------------------------------------------------------------- /client/public/font-awesome-4.7.0/scss/_core.scss: --------------------------------------------------------------------------------
1 | // Base Class Definition 2 | // ------------------------- 3 | 4 | .#{$fa-css-prefix} { 5 | display: inline-block; 6 | font: normal normal normal #{$fa-font-size-base}/#{$fa-line-height-base} FontAwesome; // shortening font declaration 7 | font-size: inherit; // can't have font-size inherit on line above, so need to override 8 | text-rendering: auto; // optimizelegibility throws things off #1094 9 | -webkit-font-smoothing: antialiased; 10 | -moz-osx-font-smoothing: grayscale; 11 | 12 | } 13 |
-------------------------------------------------------------------------------- /client/public/font-awesome-4.7.0/scss/font-awesome.scss: --------------------------------------------------------------------------------
1 | /*! 2 | * Font Awesome 4.7.0 by @davegandy - http://fontawesome.io - @fontawesome 3 | * License - http://fontawesome.io/license (Font: SIL OFL 1.1, CSS: MIT License) 4 | */ 5 | 6 | @import "variables"; 7 | @import "mixins"; 8 | @import "path"; 9 | @import "core"; 10 | @import "larger"; 11 | @import "fixed-width"; 12 | @import "list"; 13 | @import "bordered-pulled"; 14 | @import "animated"; 15 | @import "rotated-flipped"; 16 | @import "stacked"; 17 | @import "icons"; 18 | @import "screen-reader"; 19 |
-------------------------------------------------------------------------------- /docs/visualization_radviz_toolbar_findKeyword.rst: --------------------------------------------------------------------------------
1 | .. |toolbar_find_keyword| image:: figures/visualization_toolbar_find_keyword.png 2 | 3 | The |toolbar_find_keyword| auto-complete text field allows you to search for a keyword among all keywords, highlighting it in the visualization. A blue font color is used to highlight the keyword (see the figure below). This functionality is supported by an autocomplete process that uses all the keywords used to create the RadViz. 4 | 5 | ..
image:: figures/visualization_toolbar_find_keyword_result.png 6 | :width: 400px 7 | :align: center 8 | :height: 400px 9 | :alt: alternate text 10 | -------------------------------------------------------------------------------- /client/public/font-awesome-4.7.0/less/stacked.less: -------------------------------------------------------------------------------- 1 | // Stacked Icons 2 | // ------------------------- 3 | 4 | .@{fa-css-prefix}-stack { 5 | position: relative; 6 | display: inline-block; 7 | width: 2em; 8 | height: 2em; 9 | line-height: 2em; 10 | vertical-align: middle; 11 | } 12 | .@{fa-css-prefix}-stack-1x, .@{fa-css-prefix}-stack-2x { 13 | position: absolute; 14 | left: 0; 15 | width: 100%; 16 | text-align: center; 17 | } 18 | .@{fa-css-prefix}-stack-1x { line-height: inherit; } 19 | .@{fa-css-prefix}-stack-2x { font-size: 2em; } 20 | .@{fa-css-prefix}-inverse { color: @fa-inverse; } 21 | -------------------------------------------------------------------------------- /client/public/font-awesome-4.7.0/scss/_stacked.scss: -------------------------------------------------------------------------------- 1 | // Stacked Icons 2 | // ------------------------- 3 | 4 | .#{$fa-css-prefix}-stack { 5 | position: relative; 6 | display: inline-block; 7 | width: 2em; 8 | height: 2em; 9 | line-height: 2em; 10 | vertical-align: middle; 11 | } 12 | .#{$fa-css-prefix}-stack-1x, .#{$fa-css-prefix}-stack-2x { 13 | position: absolute; 14 | left: 0; 15 | width: 100%; 16 | text-align: center; 17 | } 18 | .#{$fa-css-prefix}-stack-1x { line-height: inherit; } 19 | .#{$fa-css-prefix}-stack-2x { font-size: 2em; } 20 | .#{$fa-css-prefix}-inverse { color: $fa-inverse; } 21 | -------------------------------------------------------------------------------- /client/public/font-awesome-4.7.0/less/font-awesome.less: -------------------------------------------------------------------------------- 1 | /*! 2 | * Font Awesome 4.7.0 by @davegandy - http://fontawesome.io - @fontawesome 3 | * License - http://fontawesome.io/license (Font: SIL OFL 1.1, CSS: MIT License) 4 | */ 5 | 6 | @import "variables.less"; 7 | @import "mixins.less"; 8 | @import "path.less"; 9 | @import "core.less"; 10 | @import "larger.less"; 11 | @import "fixed-width.less"; 12 | @import "list.less"; 13 | @import "bordered-pulled.less"; 14 | @import "animated.less"; 15 | @import "rotated-flipped.less"; 16 | @import "stacked.less"; 17 | @import "icons.less"; 18 | @import "screen-reader.less"; 19 | -------------------------------------------------------------------------------- /docs/annotations.rst: -------------------------------------------------------------------------------- 1 | Annotate Pages 2 | -------------- 3 | 4 | A model is created by annotating pages as **Relevant** or **Irrelevant** for the domain. Currently, the model can only distinguish between relevant and irrelevant pages. You can also annotate pages with custom tags. These can be later grouped as relevant or irrelevant when generating the model. Try to alternate between Steps 3a and 3b to build a model till you reach at least 100 pages for each. This will continuously build a model and you can see the accuracy of the model at the top right corner - **Domain Model Accuracy**. 
5 | 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /bin/ddt-dev: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPT_PATH="${BASH_SOURCE[0]}"; 4 | SCRIPT_DIR=$(dirname $SCRIPT_PATH) 5 | # ugly, but portable 6 | export DDT_HOME=$(python -c "import os, sys; sys.stdout.write(os.path.abspath('$SCRIPT_DIR/..')+'\n')") 7 | echo "DDT_HOME : $DDT_HOME" 8 | export NLTK_DATA=$DDT_HOME/nltk_data 9 | echo "NLTK_DATA : $NLTK_DATA" 10 | export ACHE_HOME=$(dirname $(which ache))/../lib/ache/ 11 | echo "ACHE_HOME : $ACHE_HOME" 12 | 13 | # ugly, but DDT doesn't really have a concept of installs 14 | export PYTHONPATH=$DD_API_HOME/../:$DD_API_HOME:$DD_API_HOME/lib/tsp-solver-master:$DDT_HOME:$DDT_HOME/server:$PYTHONPATH 15 | echo "PYTHONPATH: $PYTHONPATH" 16 | python $DDT_HOME/server/server.py 17 | -------------------------------------------------------------------------------- /client/public/font-awesome-4.7.0/less/bordered-pulled.less: -------------------------------------------------------------------------------- 1 | // Bordered & Pulled 2 | // ------------------------- 3 | 4 | .@{fa-css-prefix}-border { 5 | padding: .2em .25em .15em; 6 | border: solid .08em @fa-border-color; 7 | border-radius: .1em; 8 | } 9 | 10 | .@{fa-css-prefix}-pull-left { float: left; } 11 | .@{fa-css-prefix}-pull-right { float: right; } 12 | 13 | .@{fa-css-prefix} { 14 | &.@{fa-css-prefix}-pull-left { margin-right: .3em; } 15 | &.@{fa-css-prefix}-pull-right { margin-left: .3em; } 16 | } 17 | 18 | /* Deprecated as of 4.4.0 */ 19 | .pull-right { float: right; } 20 | .pull-left { float: left; } 21 | 22 | .@{fa-css-prefix} { 23 | &.pull-left { margin-right: .3em; } 24 | &.pull-right { margin-left: .3em; } 25 | } 26 | -------------------------------------------------------------------------------- /client/public/font-awesome-4.7.0/scss/_bordered-pulled.scss: -------------------------------------------------------------------------------- 1 | // Bordered & Pulled 2 | // ------------------------- 3 | 4 | .#{$fa-css-prefix}-border { 5 | padding: .2em .25em .15em; 6 | border: solid .08em $fa-border-color; 7 | border-radius: .1em; 8 | } 9 | 10 | .#{$fa-css-prefix}-pull-left { float: left; } 11 | .#{$fa-css-prefix}-pull-right { float: right; } 12 | 13 | .#{$fa-css-prefix} { 14 | &.#{$fa-css-prefix}-pull-left { margin-right: .3em; } 15 | &.#{$fa-css-prefix}-pull-right { margin-left: .3em; } 16 | } 17 | 18 | /* Deprecated as of 4.4.0 */ 19 | .pull-right { float: right; } 20 | .pull-left { float: left; } 21 | 22 | .#{$fa-css-prefix} { 23 | &.pull-left { margin-right: .3em; } 24 | &.pull-right { margin-left: .3em; } 25 | } 26 | -------------------------------------------------------------------------------- /client/public/font-awesome-4.7.0/less/rotated-flipped.less: -------------------------------------------------------------------------------- 1 | // Rotated & Flipped Icons 2 | // ------------------------- 3 | 4 | .@{fa-css-prefix}-rotate-90 { .fa-icon-rotate(90deg, 1); } 5 | .@{fa-css-prefix}-rotate-180 { .fa-icon-rotate(180deg, 2); } 6 | .@{fa-css-prefix}-rotate-270 { .fa-icon-rotate(270deg, 3); } 7 | 8 | .@{fa-css-prefix}-flip-horizontal { .fa-icon-flip(-1, 1, 0); } 9 | .@{fa-css-prefix}-flip-vertical { .fa-icon-flip(1, -1, 2); } 10 | 11 | // Hook for IE8-9 12 | // ------------------------- 13 | 14 | :root .@{fa-css-prefix}-rotate-90, 15 | :root .@{fa-css-prefix}-rotate-180, 16 | :root .@{fa-css-prefix}-rotate-270, 17 | 
:root .@{fa-css-prefix}-flip-horizontal, 18 | :root .@{fa-css-prefix}-flip-vertical { 19 | filter: none; 20 | } 21 | -------------------------------------------------------------------------------- /client/public/font-awesome-4.7.0/scss/_rotated-flipped.scss: -------------------------------------------------------------------------------- 1 | // Rotated & Flipped Icons 2 | // ------------------------- 3 | 4 | .#{$fa-css-prefix}-rotate-90 { @include fa-icon-rotate(90deg, 1); } 5 | .#{$fa-css-prefix}-rotate-180 { @include fa-icon-rotate(180deg, 2); } 6 | .#{$fa-css-prefix}-rotate-270 { @include fa-icon-rotate(270deg, 3); } 7 | 8 | .#{$fa-css-prefix}-flip-horizontal { @include fa-icon-flip(-1, 1, 0); } 9 | .#{$fa-css-prefix}-flip-vertical { @include fa-icon-flip(1, -1, 2); } 10 | 11 | // Hook for IE8-9 12 | // ------------------------- 13 | 14 | :root .#{$fa-css-prefix}-rotate-90, 15 | :root .#{$fa-css-prefix}-rotate-180, 16 | :root .#{$fa-css-prefix}-rotate-270, 17 | :root .#{$fa-css-prefix}-flip-horizontal, 18 | :root .#{$fa-css-prefix}-flip-vertical { 19 | filter: none; 20 | } 21 | -------------------------------------------------------------------------------- /bin/run_ddt: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPT_PATH="${BASH_SOURCE[0]}" 4 | SCRIPT_DIR=$(dirname $SCRIPT_PATH) 5 | 6 | # ugly, but portable 7 | export DD_API_HOME=/ddt/domain_discovery_API 8 | echo "DD_API_HOME : $DD_API_HOME" 9 | 10 | export DDT_HOME=/ddt/domain_discovery_tool 11 | echo "DDT_HOME : $DDT_HOME" 12 | 13 | export NLTK_DATA=$DDT_HOME/nltk_data 14 | echo "NLTK_DATA : $NLTK_DATA" 15 | 16 | # ugly, but DDT doesn't really have a concept of installs 17 | export PYTHONPATH=$DD_API_HOME/../:$DD_API_HOME:$DD_API_HOME/lib/tsp-solver-master:$DDT_HOME:$DDT_HOME/server:$PYTHONPATH 18 | echo "PYTHONPATH: $PYTHONPATH" 19 | 20 | #Run DDT 21 | cd $DDT_HOME 22 | source activate ddt 23 | 24 | export ACHE_HOME=$(dirname $(which ache))/../lib/ache/ 25 | echo "ACHE_HOME : $ACHE_HOME" 26 | 27 | python $DDT_HOME/server/server.py 28 | 29 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '2' 2 | services: 3 | elasticsearch: 4 | image: elasticsearch:1.6 5 | container_name: elastic 6 | environment: 7 | - xpack.security.enabled=false 8 | - cluster.name=docker-cluster 9 | - bootstrap.memory_lock=true 10 | ulimits: 11 | memlock: 12 | soft: -1 13 | hard: -1 14 | volumes: 15 | - ./data/:/usr/share/elasticsearch/data # elasticsearch data storage 16 | ports: 17 | - 9200:9200 18 | ddt: 19 | image: vidanyu/ddt:latest 20 | container_name: dd_tool 21 | environment: 22 | ELASTICSEARCH_SERVER: elasticsearch 23 | ports: 24 | - "8084:8084" 25 | - "8080:8080" 26 | links: 27 | - elasticsearch 28 | volumes: 29 | - ./data/:/ddt/domain_discovery_tool/server/data # ddt data storage 30 | -------------------------------------------------------------------------------- /client/public/font-awesome-4.7.0/less/path.less: -------------------------------------------------------------------------------- 1 | /* FONT PATH 2 | * -------------------------- */ 3 | 4 | @font-face { 5 | font-family: 'FontAwesome'; 6 | src: url('@{fa-font-path}/fontawesome-webfont.eot?v=@{fa-version}'); 7 | src: url('@{fa-font-path}/fontawesome-webfont.eot?#iefix&v=@{fa-version}') format('embedded-opentype'), 8 | url('@{fa-font-path}/fontawesome-webfont.woff2?v=@{fa-version}') 
format('woff2'), 9 | url('@{fa-font-path}/fontawesome-webfont.woff?v=@{fa-version}') format('woff'), 10 | url('@{fa-font-path}/fontawesome-webfont.ttf?v=@{fa-version}') format('truetype'), 11 | url('@{fa-font-path}/fontawesome-webfont.svg?v=@{fa-version}#fontawesomeregular') format('svg'); 12 | // src: url('@{fa-font-path}/FontAwesome.otf') format('opentype'); // used when developing fonts 13 | font-weight: normal; 14 | font-style: normal; 15 | } 16 | -------------------------------------------------------------------------------- /client/public/font-awesome-4.7.0/scss/_path.scss: -------------------------------------------------------------------------------- 1 | /* FONT PATH 2 | * -------------------------- */ 3 | 4 | @font-face { 5 | font-family: 'FontAwesome'; 6 | src: url('#{$fa-font-path}/fontawesome-webfont.eot?v=#{$fa-version}'); 7 | src: url('#{$fa-font-path}/fontawesome-webfont.eot?#iefix&v=#{$fa-version}') format('embedded-opentype'), 8 | url('#{$fa-font-path}/fontawesome-webfont.woff2?v=#{$fa-version}') format('woff2'), 9 | url('#{$fa-font-path}/fontawesome-webfont.woff?v=#{$fa-version}') format('woff'), 10 | url('#{$fa-font-path}/fontawesome-webfont.ttf?v=#{$fa-version}') format('truetype'), 11 | url('#{$fa-font-path}/fontawesome-webfont.svg?v=#{$fa-version}#fontawesomeregular') format('svg'); 12 | // src: url('#{$fa-font-path}/FontAwesome.otf') format('opentype'); // used when developing fonts 13 | font-weight: normal; 14 | font-style: normal; 15 | } 16 | -------------------------------------------------------------------------------- /client/public/font-awesome-4.7.0/less/animated.less: -------------------------------------------------------------------------------- 1 | // Animated Icons 2 | // -------------------------- 3 | 4 | .@{fa-css-prefix}-spin { 5 | -webkit-animation: fa-spin 2s infinite linear; 6 | animation: fa-spin 2s infinite linear; 7 | } 8 | 9 | .@{fa-css-prefix}-pulse { 10 | -webkit-animation: fa-spin 1s infinite steps(8); 11 | animation: fa-spin 1s infinite steps(8); 12 | } 13 | 14 | @-webkit-keyframes fa-spin { 15 | 0% { 16 | -webkit-transform: rotate(0deg); 17 | transform: rotate(0deg); 18 | } 19 | 100% { 20 | -webkit-transform: rotate(359deg); 21 | transform: rotate(359deg); 22 | } 23 | } 24 | 25 | @keyframes fa-spin { 26 | 0% { 27 | -webkit-transform: rotate(0deg); 28 | transform: rotate(0deg); 29 | } 30 | 100% { 31 | -webkit-transform: rotate(359deg); 32 | transform: rotate(359deg); 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /client/public/font-awesome-4.7.0/scss/_animated.scss: -------------------------------------------------------------------------------- 1 | // Spinning Icons 2 | // -------------------------- 3 | 4 | .#{$fa-css-prefix}-spin { 5 | -webkit-animation: fa-spin 2s infinite linear; 6 | animation: fa-spin 2s infinite linear; 7 | } 8 | 9 | .#{$fa-css-prefix}-pulse { 10 | -webkit-animation: fa-spin 1s infinite steps(8); 11 | animation: fa-spin 1s infinite steps(8); 12 | } 13 | 14 | @-webkit-keyframes fa-spin { 15 | 0% { 16 | -webkit-transform: rotate(0deg); 17 | transform: rotate(0deg); 18 | } 19 | 100% { 20 | -webkit-transform: rotate(359deg); 21 | transform: rotate(359deg); 22 | } 23 | } 24 | 25 | @keyframes fa-spin { 26 | 0% { 27 | -webkit-transform: rotate(0deg); 28 | transform: rotate(0deg); 29 | } 30 | 100% { 31 | -webkit-transform: rotate(359deg); 32 | transform: rotate(359deg); 33 | } 34 | } 35 | -------------------------------------------------------------------------------- 
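For reference, the fa-spin and fa-pulse classes defined in the stylesheets above animate any Font Awesome icon they are attached to. A minimal, hypothetical usage sketch (this component is not part of the repository) from a React component in the client might look like:

// Hypothetical example only: applying the Font Awesome 4.x animation classes
// defined above to render a continuously spinning loading indicator.
import React from 'react';

const LoadingIndicator = () => (
  <i className="fa fa-spinner fa-spin fa-2x" aria-hidden="true" />
);

export default LoadingIndicator;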
/client/src/App.js: -------------------------------------------------------------------------------- 1 | import React, { Component } from 'react'; 2 | //import logo from './logo.svg'; 3 | import './App.css'; 4 | import Domain from './components/Domain'; 5 | import Home from './components/Home'; 6 | var ReactRouter = require('react-router'); 7 | var Router = ReactRouter.Router; 8 | var Route = ReactRouter.Route; 9 | var hashHistory = ReactRouter.hashHistory; 10 | 11 | import MuiThemeProvider from 'material-ui/styles/MuiThemeProvider'; 12 | import injectTapEventPlugin from 'react-tap-event-plugin'; 13 | injectTapEventPlugin(); 14 | 15 | class App extends Component { 16 | 17 | render() { 18 | return ( 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | ); 27 | } 28 | } 29 | 30 | export default App; 31 | -------------------------------------------------------------------------------- /client/src/config/routes.js: -------------------------------------------------------------------------------- 1 | var React = require('react'); 2 | var ReactRouter = require('react-router'); 3 | var Router = ReactRouter.Router; 4 | var Route = ReactRouter.Route; 5 | var hashHistory = ReactRouter.hashHistory; 6 | var IndexRoute = ReactRouter.IndexRoute; 7 | var Main = require('../components/Main'); 8 | var Home = require("../components/Home"); 9 | var PromptContainer = require('../containers/PromptContainer'); 10 | var ConfirmBattleContainer = require('../containers/ConfirmBattleContainer'); 11 | 12 | var routes = ( 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | ); 22 | 23 | module.exports = routes; 24 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | set SPHINXPROJ=DomainDiscoveryTool 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 20 | echo.installed, then set the SPHINXBUILD environment variable to point 21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 22 | echo.may add the Sphinx directory to PATH. 23 | echo. 24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = DomainDiscoveryTool 8 | SOURCEDIR = . 9 | BUILDDIR = . 10 | PDFBUILDDIR = /tmp 11 | PDF = manual.pdf 12 | 13 | # Put it first so that "make" without argument is like "make help". 14 | help: 15 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 16 | 17 | .PHONY: help Makefile 18 | 19 | # Catch-all target: route all unknown targets to Sphinx using the new 20 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
21 | %: Makefile 22 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 23 | 24 | latexpdf: 25 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(PDFBUILDDIR)/latex 26 | @echo "Running LaTeX files through pdflatex..." 27 | make -C $(PDFBUILDDIR)/latex all-pdf 28 | cp $(PDFBUILDDIR)/latex/*.pdf $(PDF) 29 | @echo "pdflatex finished; see $(PDF)" 30 | -------------------------------------------------------------------------------- /docs/add_domain.rst: -------------------------------------------------------------------------------- 1 | Create Domain 2 | ------------- 3 | 4 | .. image:: figures/empty_domain.png 5 | :width: 800px 6 | :align: center 7 | :height: 400px 8 | :alt: alternate text 9 | 10 | Begin by adding a domain on the Domains page (the initial page), shown in the figure above, by clicking on the **Add Domain** button. A domain maintains the context of a domain discovery session. 11 | 12 | .. image:: figures/add_ebola_domain.png 13 | :width: 800px 14 | :align: center 15 | :height: 400px 16 | :alt: alternate text 17 | 18 | On the **Adding a domain** dialog shown in the figure above, enter the name of the domain you would like to create, for example **Ebola**, and click the **Submit** button. You should now see the new domain you added in the list of domains, as shown below. 19 | 20 | .. image:: figures/new_domain.png 21 | :width: 800px 22 | :align: center 23 | :height: 400px 24 | :alt: alternate text 25 | 26 | Once the domain is added, click on its name in the list of domains to collect, analyze and annotate web pages. 27 | -------------------------------------------------------------------------------- /docs/deep_crawl.rst: -------------------------------------------------------------------------------- 1 | In order to run a *Deep Crawl*, annotate the pages to be crawled with the tag *Deep Crawl* as described in `Tag for Deep Crawl `_. 2 | 3 | .. image:: figures/deep_crawl.png 4 | :width: 800px 5 | :align: center 6 | :height: 400px 7 | :alt: alternate text 8 | 9 | The figure above shows the Deep Crawl View. The list on the left shows all pages annotated as *Deep Crawl* in the Explore Data View. The table on the right shows recommendations of pages that can be added to the deep crawl by clicking on the **Add to Deep Crawl** button. If keyword terms have been added or annotated, recommendations are ranked by how many of those keywords they contain. Otherwise, domains are recommended by the number of pages they contain. 10 | 11 | The deep crawler can be started by clicking the **Start Crawler** button at the bottom. This starts a deep crawler with all the pages tagged for Deep Crawl. 12 | 13 | You can see the results of the crawled data in **Crawled Data** in the Filters Tab. While the crawler is running it can be monitored by clicking on the **Crawler Monitor** button. 14 | -------------------------------------------------------------------------------- /docs/use.rst: -------------------------------------------------------------------------------- 1 | How To 2 | ====== 3 | 4 | Now you should be able to head to http://:8084/ to interact with the tool. 5 | 6 | .. include:: add_domain.rst 7 | .. include:: del_domain.rst 8 | .. include:: load_data.rst 9 | .. include:: seedfinder.rst 10 | .. include:: crawl_fwd_back.rst 11 | .. include:: filter.rst 12 | .. include:: annotations.rst 13 | .. include:: how_to_annotate.rst 14 | 15 | Tag Individual Pages 16 | ******************** 17 | 18 | .. include:: tag_individual.rst 19 | 20 | Tag Selected Pages 21 | ****************** 22 | 23 | ..
include:: tag_multiselect.rst 24 | 25 | Tag All Pages in View 26 | ********************* 27 | 28 | .. include:: tag_all.rst 29 | 30 | Tag All Pages for Current Filter 31 | ******************************** 32 | 33 | .. include:: tag_everything.rst 34 | 35 | Custom Tag 36 | ********** 37 | 38 | .. include:: custom.rst 39 | 40 | Tag for Deep Crawl 41 | ****************** 42 | 43 | .. include:: deep_crawl_tag.rst 44 | 45 | .. include:: terms_summary.rst 46 | .. include:: visualization_radviz.rst 47 | 48 | .. include:: run_crawler.rst 49 | 50 | Deep Crawl 51 | ********** 52 | 53 | .. include:: deep_crawl.rst 54 | 55 | Focused Crawl 56 | ************* 57 | 58 | .. include:: focused_crawl.rst 59 | -------------------------------------------------------------------------------- /ache.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Example of configuration for crawling all pages of the web sites in the seeds 3 | # 4 | 5 | # Configure ELASTICSEARCH and FILES data formats 6 | target_storage.data_formats: 7 | - ELASTICSEARCH 8 | - WARC 9 | 10 | target_storage.data_format.elasticsearch.rest.hosts: 11 | - http://elasticsearch:9200 12 | 13 | # Enables "scope" to only crawl pages that belong to domains of seed URLs 14 | link_storage.link_strategy.use_scope: true 15 | 16 | # Perform breadth-search crawl 17 | link_storage.link_classifier.type: MaxDepthLinkClassifier 18 | link_storage.link_classifier.max_depth: 299 19 | 20 | # Select URLs from all domains during link selection phase, 21 | link_storage.link_selector: MaximizeWebsitesLinkSelector 22 | 23 | # Configure the minimum time interval (in milliseconds) to wait between requests 24 | # to the same host to avoid overloading servers. If you are crawling your own 25 | # web site, you can descrease this value to speed-up the crawl. 
26 | link_storage.scheduler.host_min_access_interval: 4000 27 | 28 | # Enables discovery of links using the Sitemaps protocol 29 | link_storage.download_sitemap_xml: false 30 | 31 | # Configure the User-Agent of the crawler 32 | crawler_manager.downloader.user_agent.name: ACHE 33 | crawler_manager.downloader.user_agent.url: https://github.com/ViDA-NYU/ache -------------------------------------------------------------------------------- /client/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "DDT_react", 3 | "version": "0.1.0", 4 | "private": true, 5 | "devDependencies": { 6 | "react-scripts": "0.7.0" 7 | }, 8 | "dependencies": { 9 | "d3": "^4.7.4", 10 | "d3-array": "^1.1.1", 11 | "d3-request": "^1.0.2", 12 | "d3-scale": "^1.0.3", 13 | "d3-selection": "^1.0.5", 14 | "immutable": "^3.8.1", 15 | "jquery": "^3.1.1", 16 | "material-ui": "^0.18.6", 17 | "radviz-component": "^1.0.54", 18 | "react": "^15.6.1", 19 | "react-bootstrap": "^0.30.6", 20 | "react-checkbox-tree": "^0.5.2", 21 | "react-dom": "^15.6.1", 22 | "react-faux-dom": "^3.0.1", 23 | "react-highlight-words": "^0.8.0", 24 | "react-motion": "^0.4.5", 25 | "react-paginate": "^4.4.2", 26 | "react-router": "^3.0.0", 27 | "react-search-bar": "^1.1.4", 28 | "react-select": "^1.0.0-rc.5", 29 | "react-side-bar": "^0.3.5", 30 | "react-sidebar": "^2.2.1", 31 | "react-swipeable-views": "^0.8.0", 32 | "react-tap-event-plugin": "^2.0.1", 33 | "react-vis": "^0.6.4", 34 | "spawn-sync": "^1.0.15" 35 | }, 36 | "scripts": { 37 | "start": "react-scripts start && react-scripts build", 38 | "build": "react-scripts build", 39 | "test": "react-scripts test --env=jsdom", 40 | "eject": "react-scripts eject" 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /bin/run_docker_ddt: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Stopping elastisearch container" 4 | docker stop elastic 5 | 6 | echo "Removing elastisearch container" 7 | docker rm elastic 8 | 9 | echo -n "Provide path to store domain data [Press ENTER for default path $(pwd)/data]: " 10 | read path 11 | 12 | echo $path 13 | 14 | if [[ $path == "" ]]; then 15 | path=$(pwd)/data 16 | fi 17 | 18 | echo "Starting elastisearch container" 19 | CID=$(docker run -d --name=elastic -p 9200:9200 -v $path:/usr/share/elasticsearch/data -t elasticsearch:1.6); 20 | docker inspect $CID > /dev/null 21 | ELASTIC_IP=$(docker inspect --format '{{ .NetworkSettings.IPAddress }}' $CID); 22 | echo "Elasticsearch running at = $ELASTIC_IP" 23 | 24 | echo "Stopping DD Tool container" 25 | docker stop dd_tool 26 | 27 | echo "Removing DD Tool container" 28 | docker rm dd_tool 29 | 30 | echo "Updating DDT Package" 31 | docker pull vidanyu/ddt:latest 32 | 33 | echo "Starting DD Tool container" 34 | docker run -i --name=dd_tool -p 8084:8084 -e "ELASTICSEARCH_SERVER=$ELASTIC_IP" -v $path:/ddt/domain_discovery_tool/server/data -t vidanyu/ddt:latest 35 | 36 | echo "Stopping elastisearch container" 37 | docker stop elastic 38 | 39 | echo "Removing elastisearch container" 40 | docker rm elastic 41 | 42 | echo "Stopping DD Tool container" 43 | docker stop dd_tool 44 | 45 | echo "Removing DD Tool container" 46 | docker rm dd_tool 47 | 48 | -------------------------------------------------------------------------------- /docs/focused_crawl.rst: -------------------------------------------------------------------------------- 1 | The figure below shows the Focused 
Crawler View: 2 | 3 | .. image:: figures/focused_crawl.png 4 | :width: 800px 5 | :align: center 6 | :height: 400px 7 | :alt: alternate text 8 | 9 | 1. In the 'Model Settings' on the left, select the tags that should be considered relevant (Positive) and irrelevant (Negative). If there are sufficient relevant and irrelevant pages (about 100 each), then you can start the crawler by clicking on the **Start Crawler** button. 10 | 2. If there are no irrelevant pages then a page classifier model cannot be built. Instead, you can upload keywords by clicking on 'Add Terms' in the Terms window, or annotate the terms extracted from the positive pages by clicking on them. If no annotated terms are available then the top 50 terms are used to build a regular expression model. 11 | 3. Once either a page classifier or a regex model can be built, start the focused crawler by clicking on the **Start Crawler** button. 12 | 13 | You can see the results of the crawled data in "Crawled Data" in the Filters Tab. While the crawler is running it can be monitored by clicking on the 'Crawler Monitor' button. 14 | 15 | The model info on the bottom right indicates how good the domain model is when both relevant and irrelevant pages have been annotated. The color bar shows the strength of the model based on the balance of relevant and irrelevant pages and the classifier accuracy of the model. 16 | -------------------------------------------------------------------------------- /docs/visualization_radviz_lasso_selection.rst: -------------------------------------------------------------------------------- 1 | 2 | .. image:: figures/visualization_toolbar_lassoSelect.png 3 | :width: 400px 4 | :align: center 5 | :height: 410px 6 | :alt: alternate text 7 | 8 | To create the lasso, simply drag a freehand selection around the pages located inside the circle in RadViz, much as you would outline something on a piece of paper with a pen or pencil. To start the lasso, click at the spot where you want to begin the selection, then keep the mouse button held down and drag to draw a freeform selection outline. To complete the selection, return to the spot where you began and release the mouse button. You don't necessarily have to return to the same spot you started from, but if you don't, RadViz will automatically close the selection for you by drawing a straight line from the point where you released the mouse button to the point where you began, so in most cases you will want to finish where you started. 9 | 10 | After the selection is complete, RadViz highlights, in bold, all the keywords along the circle that are relevant to the selected pages. The WordCloud in the top right corner is also updated using only the selected pages; the most frequent keywords are shown with a larger font size. In addition, the snippets of the selected pages are shown in the bottom right corner. 11 | 12 | .. image:: figures/visualization_toolbar_lassoSelect_result.png 13 | :width: 700px 14 | :align: center 15 | :height: 430px 16 | :alt: alternate text 17 | -------------------------------------------------------------------------------- /client/public/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 20 | Domain Discovery Tool 21 | 27 | 28 | 29 |
30 | 40 | 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | \#* 2 | \.#* 3 | *~* 4 | *.conf 5 | 6 | # Byte-compiled / optimized / DLL files 7 | __pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | 11 | # C extensions 12 | *.so 13 | 14 | # Distribution / packaging 15 | .Python 16 | env/ 17 | build/ 18 | develop-eggs/ 19 | dist/ 20 | downloads/ 21 | eggs/ 22 | .eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *,cover 51 | .hypothesis/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # IPython Notebook 75 | .ipynb_checkpoints 76 | 77 | # pyenv 78 | .python-version 79 | 80 | # celery beat schedule file 81 | celerybeat-schedule 82 | 83 | # dotenv 84 | .env 85 | 86 | # virtualenv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | 93 | # Rope project settings 94 | .ropeproject 95 | 96 | client/build 97 | client/node_modules 98 | nltk_data 99 | node-v6.9.1-linux-x64 100 | server/ranking/D_cbow_pdw_8B.pkl 101 | results.txt 102 | -------------------------------------------------------------------------------- /docs/seedfinder.rst: -------------------------------------------------------------------------------- 1 | SeedFinder 2 | ********** 3 | 4 | Instead of making multiple queries to Google/Bing yourself you can trigger automated keyword search on Google/Bing and collect more web pages for the domain using the SeedFinder. This requires a domain model. So once you have annoated sufficient pages, indicated by a non-zero accuracy on the top right corner, you can use the SeedFinder functionality. 5 | 6 | To start a SeedFinder search click on the SEEDFINDER tab. 7 | 8 | .. image:: figures/seedfinder_search_new.png 9 | :width: 800px 10 | :align: center 11 | :height: 600px 12 | :alt: alternate text 13 | 14 | Enter the initial search query keywords, for example **ebola treatment**, as shown in the figure above. The SeedFinder issues this query to Google/Bing. It applies the domain model to the pages returned by Google/Bing. From the pages labeled relevant by the domain model the SeedFinder extracts keywords to form new queries which it again issues to Google/Bing. This iterative process terminates when no more relevant pages are retrieved or the max number of queries configured is exceeded. 15 | 16 | You can monitor the status of the SeedFinder in the **Process Monitor** that can be be accessed by clicking on the |pm_icon| on the top as shown below: 17 | 18 | .. |pm_icon| image:: figures/pm_icon.png 19 | 20 | .. 
image:: figures/sf_pm.png 21 | :width: 800px 22 | :align: center 23 | :height: 600px 24 | :alt: alternate text 25 | 26 | You can also stop the SeedFinder process from the **Process Monitor** by clicking on the stop button shown along the corresponding process. 27 | 28 | All queries made are listed in the **Filters** Tab under **SeedFinder Queries**. These pages can now be analyzed and annotated just like the other web pages. 29 | -------------------------------------------------------------------------------- /client/src/components/ScaleBar.js: -------------------------------------------------------------------------------- 1 | // 2 | import React, {Component} from 'react'; 3 | 4 | class ScaleBar extends React.Component{ 5 | constructor(props){ 6 | super(props); 7 | this.state={ 8 | ratioAccuracy:0, 9 | }; 10 | } 11 | 12 | render(){ 13 | var ratioAccuracy = (this.props.ratioAccuracy===undefined || isNaN(this.props.ratioAccuracy))?0:this.props.ratioAccuracy; 14 | return( 15 | 16 | 17 | 18 | 19 | 20 | 21 | Ratio/Accuracy 22 | 23 | Poor 24 | 25 | 26 | Fair 27 | 28 | 29 | 30 | 31 | 32 | Good 33 | 34 | 35 | Excellent 36 | 37 | 38 | 39 | ); 40 | } 41 | } 42 | 43 | export default ScaleBar; 44 | -------------------------------------------------------------------------------- /docs/terms_summary.rst: -------------------------------------------------------------------------------- 1 | Extracted Terms Summary 2 | ----------------------- 3 | 4 | .. image:: figures/terms.png 5 | :width: 800px 6 | :align: center 7 | :height: 400px 8 | :alt: alternate text 9 | 10 | The most relevant terms and phrases (unigrams, bigrams and trigrams) are extracted from the pages in the current view of DDT and listed in the Terms Tab on the left panel, as shown in the figure above. This provides a summary of the pages currently in view. Initially, when there are no annotated terms, the top 40 terms with the highest TFIDF (term frequency-inverse document frequency) are selected. The terms are displayed with their frequency of occurrence in relevant (blue) and irrelevant (red) pages (bars to the right of the Terms panel). This helps the expert select terms that are more discriminative of relevant pages. 11 | 12 | Terms can be tagged as 'Positive' or 'Negative' by clicking them once or twice, respectively. The tags are stored in the active data source. When the update terms button is clicked, the positively and negatively annotated terms are used to re-rank the other terms. Terms help the expert understand and discover new information about the domains of interest. The terms can be used to refine the Web search or start new sub-topic searches. 13 | 14 | Custom relevant and irrelevant terms can be added by clicking the + button to boost the extraction of more relevant terms. These custom terms are distinguished by the delete icon before them, which can be clicked to delete the custom term. 15 | 16 | Hovering the mouse over a term in the Terms window displays the context in which it appears on the pages. This again helps the expert understand and disambiguate the relevant terms. Inspect the terms extracted in the "Terms" window. Clicking on the stop button pins the context to the corresponding term.
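As a rough illustration of how such a TF-IDF ranking can be computed, the sketch below uses scikit-learn; it is not DDT's actual implementation, and the ``pages`` list, the vectorizer settings and the cut-off of 40 terms are assumptions made only for the example::

    # Illustrative only: rank unigram/bigram/trigram terms of the pages in view by TF-IDF.
    from sklearn.feature_extraction.text import TfidfVectorizer

    def top_terms(pages, k=40):
        """Return the k terms with the highest average TF-IDF score across the page texts."""
        vectorizer = TfidfVectorizer(ngram_range=(1, 3), stop_words="english")
        tfidf = vectorizer.fit_transform(pages)   # rows are pages, columns are candidate terms
        scores = tfidf.mean(axis=0).A1            # average TF-IDF score per term
        terms = vectorizer.get_feature_names_out()
        ranked = sorted(zip(terms, scores), key=lambda pair: pair[1], reverse=True)
        return [term for term, _ in ranked[:k]]

    # Toy usage with two tiny "pages":
    print(top_terms(["ebola outbreak in west africa", "ebola symptoms and treatment"], k=5))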
17 | -------------------------------------------------------------------------------- /client/public/font-awesome-4.7.0/less/mixins.less: -------------------------------------------------------------------------------- 1 | // Mixins 2 | // -------------------------- 3 | 4 | .fa-icon() { 5 | display: inline-block; 6 | font: normal normal normal @fa-font-size-base/@fa-line-height-base FontAwesome; // shortening font declaration 7 | font-size: inherit; // can't have font-size inherit on line above, so need to override 8 | text-rendering: auto; // optimizelegibility throws things off #1094 9 | -webkit-font-smoothing: antialiased; 10 | -moz-osx-font-smoothing: grayscale; 11 | 12 | } 13 | 14 | .fa-icon-rotate(@degrees, @rotation) { 15 | -ms-filter: "progid:DXImageTransform.Microsoft.BasicImage(rotation=@{rotation})"; 16 | -webkit-transform: rotate(@degrees); 17 | -ms-transform: rotate(@degrees); 18 | transform: rotate(@degrees); 19 | } 20 | 21 | .fa-icon-flip(@horiz, @vert, @rotation) { 22 | -ms-filter: "progid:DXImageTransform.Microsoft.BasicImage(rotation=@{rotation}, mirror=1)"; 23 | -webkit-transform: scale(@horiz, @vert); 24 | -ms-transform: scale(@horiz, @vert); 25 | transform: scale(@horiz, @vert); 26 | } 27 | 28 | 29 | // Only display content to screen readers. A la Bootstrap 4. 30 | // 31 | // See: http://a11yproject.com/posts/how-to-hide-content/ 32 | 33 | .sr-only() { 34 | position: absolute; 35 | width: 1px; 36 | height: 1px; 37 | padding: 0; 38 | margin: -1px; 39 | overflow: hidden; 40 | clip: rect(0,0,0,0); 41 | border: 0; 42 | } 43 | 44 | // Use in conjunction with .sr-only to only display content when it's focused. 45 | // 46 | // Useful for "Skip to main content" links; see http://www.w3.org/TR/2013/NOTE-WCAG20-TECHS-20130905/G1 47 | // 48 | // Credit: HTML5 Boilerplate 49 | 50 | .sr-only-focusable() { 51 | &:active, 52 | &:focus { 53 | position: static; 54 | width: auto; 55 | height: auto; 56 | margin: 0; 57 | overflow: visible; 58 | clip: auto; 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /docker-compose.yml.ache: -------------------------------------------------------------------------------- 1 | version: '2' 2 | services: 3 | elasticsearch: 4 | image: elasticsearch:1.6 5 | container_name: elastic 6 | environment: 7 | - xpack.security.enabled=false 8 | - cluster.name=docker-cluster 9 | - bootstrap.memory_lock=true 10 | ulimits: 11 | memlock: 12 | soft: -1 13 | hard: -1 14 | volumes: 15 | - ./data/:/usr/share/elasticsearch/data # elasticsearch data storage 16 | ports: 17 | - 9200:9200 18 | ache_deep: 19 | image: vidanyu/ache:0.9.0 20 | entrypoint: /ache/bin/ache startServer -c /config/ -d /data 21 | container_name: ache_deep_crawl 22 | ports: 23 | - 8080:8080 24 | links: 25 | - elasticsearch 26 | volumes: 27 | - ./data/data-ache:/data 28 | - ./:/config 29 | ache_focused: 30 | image: vidanyu/ache:0.9.0 31 | entrypoint: /ache/bin/ache startServer -c /config/ -d /data 32 | container_name: ache_focused_crawl 33 | ports: 34 | - 8081:8080 35 | links: 36 | - elasticsearch 37 | - ache_deep 38 | volumes: 39 | - ./data/data-ache:/data 40 | - ./:/config 41 | ddt: 42 | image: vidanyu/ddt:latest 43 | container_name: dd_tool 44 | environment: 45 | ELASTICSEARCH_SERVER: elasticsearch 46 | ACHE_DEEP_CRAWLER_SERVER: ache_deep 47 | ACHE_DEEP_CRAWLER_PORT: 8080 48 | ACHE_FOCUSED_CRAWLER_SERVER: ache_focused 49 | ACHE_FOCUSED_CRAWLER_PORT: 8080 50 | ACHE_DEEP_CRAWLER_MONITOR_SERVER: localhost 51 | ACHE_DEEP_CRAWLER_MONITOR_PORT: 8080 52 | 
ACHE_FOCUSED_CRAWLER_MONITOR_SERVER: localhost 53 | ACHE_FOCUSED_CRAWLER_MONITOR_PORT: 8081 54 | ports: 55 | - 8084:8084 56 | links: 57 | - elasticsearch 58 | - ache_deep 59 | - ache_focused 60 | volumes: 61 | - ./data/:/ddt/domain_discovery_tool/server/data # ddt data storage 62 | -------------------------------------------------------------------------------- /docs/tutorials.rst: -------------------------------------------------------------------------------- 1 | Getting Started 2 | =============== 3 | 4 | Building Domain Index 5 | _____________________ 6 | 7 | Creating a domain specific index involves: 8 | 9 | - Uploading known relevant URL domains from which you would like to collect all URLs belonging to that domain. This is called a deep crawl. Follow steps 1,2 and 4, below, for this. 10 | - Create a domain model that can be used for a focused crawl (broad crawl). For this follow all the steps 1-4 below. 11 | 12 | Step 1 13 | ~~~~~~ 14 | 15 | .. include:: add_domain.rst 16 | 17 | Step 2 18 | ~~~~~~ 19 | 20 | .. include:: load_data.rst 21 | 22 | Step 3 23 | ~~~~~~ 24 | 25 | .. include:: annotations.rst 26 | 27 | Step 3a 28 | ******* 29 | 30 | Tag at least 100 **Relevant** pages for your domain. Refer `How to Annotate`_. 31 | 32 | Step 3b 33 | ******* 34 | 35 | Tag at least 100 **Irrelevant** pages for your domain. Refer `How to Annotate`_. 36 | 37 | 38 | How to Annotate 39 | *************** 40 | 41 | .. include:: how_to_annotate.rst 42 | 43 | Tag Individual Pages 44 | >>>>>>>>>>>>>>>>>>>> 45 | 46 | .. include:: tag_individual.rst 47 | 48 | Tag Selected Pages 49 | >>>>>>>>>>>>>>>>>> 50 | 51 | .. include:: tag_multiselect.rst 52 | 53 | Tag All Pages in View 54 | >>>>>>>>>>>>>>>>>>>>> 55 | 56 | .. include:: tag_all.rst 57 | 58 | Tag All Pages for Current Filter 59 | >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 60 | 61 | .. include:: tag_everything.rst 62 | 63 | Custom Tag 64 | >>>>>>>>>> 65 | 66 | .. include:: custom.rst 67 | 68 | Tag for Deep Crawl 69 | >>>>>>>>>>>>>>>>>> 70 | 71 | .. include:: deep_crawl_tag.rst 72 | 73 | Step 4 74 | ~~~~~~ 75 | 76 | .. include:: run_crawler.rst 77 | 78 | Step 4a 79 | ******* 80 | 81 | Deep Crawl 82 | >>>>>>>>>> 83 | 84 | .. include:: deep_crawl.rst 85 | 86 | Step 4b 87 | ******* 88 | 89 | Focused Crawl 90 | >>>>>>>>>>>>> 91 | 92 | .. 
include:: focused_crawl.rst 93 | 94 | 95 | -------------------------------------------------------------------------------- /client/public/font-awesome-4.7.0/scss/_mixins.scss: -------------------------------------------------------------------------------- 1 | // Mixins 2 | // -------------------------- 3 | 4 | @mixin fa-icon() { 5 | display: inline-block; 6 | font: normal normal normal #{$fa-font-size-base}/#{$fa-line-height-base} FontAwesome; // shortening font declaration 7 | font-size: inherit; // can't have font-size inherit on line above, so need to override 8 | text-rendering: auto; // optimizelegibility throws things off #1094 9 | -webkit-font-smoothing: antialiased; 10 | -moz-osx-font-smoothing: grayscale; 11 | 12 | } 13 | 14 | @mixin fa-icon-rotate($degrees, $rotation) { 15 | -ms-filter: "progid:DXImageTransform.Microsoft.BasicImage(rotation=#{$rotation})"; 16 | -webkit-transform: rotate($degrees); 17 | -ms-transform: rotate($degrees); 18 | transform: rotate($degrees); 19 | } 20 | 21 | @mixin fa-icon-flip($horiz, $vert, $rotation) { 22 | -ms-filter: "progid:DXImageTransform.Microsoft.BasicImage(rotation=#{$rotation}, mirror=1)"; 23 | -webkit-transform: scale($horiz, $vert); 24 | -ms-transform: scale($horiz, $vert); 25 | transform: scale($horiz, $vert); 26 | } 27 | 28 | 29 | // Only display content to screen readers. A la Bootstrap 4. 30 | // 31 | // See: http://a11yproject.com/posts/how-to-hide-content/ 32 | 33 | @mixin sr-only { 34 | position: absolute; 35 | width: 1px; 36 | height: 1px; 37 | padding: 0; 38 | margin: -1px; 39 | overflow: hidden; 40 | clip: rect(0,0,0,0); 41 | border: 0; 42 | } 43 | 44 | // Use in conjunction with .sr-only to only display content when it's focused. 45 | // 46 | // Useful for "Skip to main content" links; see http://www.w3.org/TR/2013/NOTE-WCAG20-TECHS-20130905/G1 47 | // 48 | // Credit: HTML5 Boilerplate 49 | 50 | @mixin sr-only-focusable { 51 | &:active, 52 | &:focus { 53 | position: static; 54 | width: auto; 55 | height: auto; 56 | margin: 0; 57 | overflow: visible; 58 | clip: auto; 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Documentation Status](https://readthedocs.org/projects/domain-discovery-tool/badge/?version=latest)](http://domain-discovery-tool.readthedocs.io/en/latest/?badge=latest) 2 | 3 | # Domain Discovery Tool (DDT) 4 | 5 | > NOTE: The code in this repository is currently unmaintained. 6 | 7 | This repository contains the Domain Discovery Tool (DDT) project. DDT is an interactive system that helps users explore and better understand a domain (or topic) as it is represented on the Web. It achieves this by integrating human insights with machine computation (data mining and machine learning) through visualization. DDT allows a domain expert to visualize and analyze pages returned by a search engine or a crawler, and easily provide feedback about relevance. 
DDT addresses important challenges: 8 | 9 | * It assists users in the process of domain understanding and discovery, guiding them to construct effective queries to be issued to a search engine to find additional relevant information; 10 | * It provides an easy-to-use interface whereby users can quickly provide feedback regarding the relevance of pages, which can then be used to create learning classifiers for the domains of interest; and 11 | * It supports the configuration and deployment of focused crawlers that automatically and efficiently search the Web for additional pages on the topic. DDT allows users to quickly select crawling seeds as well as the positive and negative examples required to create the page classifier for the focus topic. 12 | 13 | ## Documentation 14 | 15 | Documentation for installation and usage is available [HERE!](http://domain-discovery-tool.readthedocs.io/en/latest/). 16 | 17 | ## Publication 18 | 19 | Yamuna Krishnamurthy, Kien Pham, Aecio Santos, and Juliana Freire. 2016. [Interactive Web Content Exploration for Domain Discovery](http://poloclub.gatech.edu/idea2016/papers/p64-krishnamurthy.pdf) (Interactive Data Exploration and Analytics ([IDEA](http://poloclub.gatech.edu/idea2016/)) Workshop at Knowledge Discovery and Data Mining ([KDD](http://www.kdd.org/kdd2016/)), San Francisco, CA). 20 | 21 | -------------------------------------------------------------------------------- /client/src/components/Scatterplot.js: -------------------------------------------------------------------------------- 1 | import React, { Component } from 'react'; 2 | import 'react-vis/main.css'; 3 | import { 4 | XYPlot, 5 | XAxis, 6 | YAxis, 7 | VerticalGridLines, 8 | HorizontalGridLines, 9 | MarkSeries} from 'react-vis'; 10 | import {scaleOrdinal} from 'd3-scale'; //schemeCategory10 11 | 12 | class Scatterplot extends Component { 13 | constructor(props){ 14 | super(props); 15 | this.state = {}; 16 | this.colorscheme = scaleOrdinal([1,2,3]); 17 | } 18 | 19 | create_data(data){ 20 | let self=this; 21 | if (data){ 22 | let processed_data = {}; 23 | data.forEach((d)=>{ 24 | if (!processed_data[self.props.labelAcessor(d)]){ 25 | processed_data[self.props.labelAcessor(d)] = []; 26 | } 27 | processed_data[self.props.labelAcessor(d)].push({ 28 | "x": parseFloat(self.props.xAcessor(d)), 29 | "y": parseFloat(self.props.yAcessor(d)), 30 | }); 31 | }); 32 | return processed_data; 33 | } 34 | return {}; 35 | } 36 | 37 | render() { 38 | let mydata = this.create_data(this.props.data); 39 | return ( 40 |
41 |

{this.props.title}

42 | 43 | 44 | 45 | 46 | 47 | {Object.keys(mydata).map((k)=>{ 48 | return 49 | })} 50 | 51 |
52 | ); 53 | } 54 | } 55 | 56 | Scatterplot.defaultProps = { 57 | width: 400, 58 | height: 400, 59 | dotSize: 5, 60 | title: "title" 61 | }; 62 | 63 | 64 | export default Scatterplot; 65 | -------------------------------------------------------------------------------- /docs/create_model.rst: -------------------------------------------------------------------------------- 1 | Create Model 2 | ------------ 3 | 4 | DDT incrementally builds a model as the user `annotates `_ the retrieved pages. The accuracy of the domain model is displayed on the top right corner. It provides an indication of the model's coverage of the domain and how it is influenced by annotations. 5 | 6 | The domain model can be exported by clicking on the **Model** button on the top (this button will be disabled when there are not sufficient annotations to build the model and the model shows **Accuracy of onlineClassifier: 0 %**). This will show a drop-down as shown in the figure below: 7 | 8 | .. image:: figures/model_dropdown.png 9 | :width: 800px 10 | :align: center 11 | :height: 400px 12 | :alt: alternate text 13 | 14 | Click on **Create Model** to export the model. This should bring up a file explorer pop-up (make sure you enable pop-ups in your browser) as shown below. Save the compressed model file. 15 | 16 | .. image:: figures/model_download.png 17 | :width: 800px 18 | :align: center 19 | :height: 400px 20 | :alt: alternate text 21 | 22 | This saved model file contains the ACHE classifier model, the training data for the model and the initial seed list required for focused crawling, as shown in the figure below: 23 | 24 | .. image:: figures/saved_model.png 25 | :width: 800px 26 | :align: center 27 | :height: 400px 28 | :alt: alternate text 29 | 30 | 31 | Annotation 32 | ~~~~~~~~~~ 33 | 34 | Currently, pages can be annotated as Relevant, Irrelevant or Neutral using the respective |tag_all| buttons to tag all pages in the current view. The |tag_one| buttons can be used to tag individual pages. Annotations are used to build the domain model. 35 | 36 | .. |tag_all| image:: figures/tag_all.png 37 | 38 | .. |tag_one| image:: figures/tag_one.png 39 | 40 | Note: 41 | 42 | * At least 10 relevant and 10 irrelevant pages should be annotated to build the model. The more annotations, and hence the better the coverage of the domain, the better the domain model. 43 | * Ensure that the relevant and irrelevant page annotations are balanced for a better model. 44 | 45 | 46 | -------------------------------------------------------------------------------- /docs/crawl_fwd_back.rst: -------------------------------------------------------------------------------- 1 | Crawl Forward and Backward 2 | ************************** 3 | 4 | This allows the user to crawl one level forward or backward for all the selected URLs. 5 | 6 | **Forward Links -** Forward links are all the links contained in a given page. When you crawl one level forward, DDT downloads all the pages corresponding to the links contained in the page. 7 | 8 | **Backward Links -** Backward links are the links of all pages that contain a link to the given page. When you crawl one level backward, DDT first finds all the pages that contain a link to the selected page and then downloads all the pages corresponding to the links contained in all those backward-link pages. 9 | 10 | The motivation for backward and forward crawling is the assumption that pages linking to the selected page (back links) and links contained in the selected page (forward links) would be about a similar topic to the selected page.
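For intuition only, the sketch below shows what collecting the forward links of a single page can look like; it is not DDT's crawler code, and the use of the ``requests`` and ``BeautifulSoup`` libraries here is an assumption made for the example. Backward links, in contrast, cannot be read off the page itself and require a backlink index or search engine that records which pages point to a given URL::

    # Illustrative only: gather the forward (outgoing) links of one page.
    import requests
    from bs4 import BeautifulSoup
    from urllib.parse import urljoin

    def forward_links(url):
        """Return the absolute URLs of all links contained in the page at `url`."""
        html = requests.get(url, timeout=10).text
        soup = BeautifulSoup(html, "html.parser")
        return {urljoin(url, a["href"]) for a in soup.find_all("a", href=True)}

    # Crawling one level forward then amounts to downloading every page in forward_links(url).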
11 | 12 | Crawl Individual Pages 13 | <<<<<<<<<<<<<<<<<<<<<< 14 | 15 | .. |tag_one| image:: figures/fwd_back_single.png 16 | 17 | The |tag_one| buttons, shown along each page, can be used to crawl backward or forward links of individual pages. 18 | 19 | Crawl Selected Pages 20 | <<<<<<<<<<<<<<<<<<<< 21 | 22 | Select multiple pages by keeping the **ctrl** key pressed and clicking on the pages that you want to select. When you are done selecting pages, release the **ctrl** key. This will bring up a window where you can choose to crawl the pages forward or backward, as shown below: 23 | 24 | .. image:: figures/multi_select.png 25 | :width: 800px 26 | :align: center 27 | :height: 400px 28 | :alt: alternate text 29 | 30 | 31 | Crawl All Pages 32 | <<<<<<<<<<<<<<< 33 | 34 | Use the |tag_all| buttons at the top of the list of pages to crawl backward or forward links on all pages in the current view. 35 | 36 | Crawl All Pages for Current Filter 37 | <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< 38 | 39 | .. |tag_all| image:: figures/tag_all.png 40 | 41 | If you want to crawl forward or backward all pages retrieved for a particular filter (across pagination), then check the **Select ALL results in paginations** checkbox below the page list on the top left. Then use the |tag_all| buttons to crawl all the pages. 42 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # 2 | # Domain Discovery Tool Dockerfile 3 | # 4 | # 5 | 6 | # Pull base image. 7 | FROM ubuntu:latest 8 | 9 | # Install some dependencies 10 | RUN apt-get update && apt-get -y install git build-essential wget 11 | 12 | # Install miniconda 13 | RUN echo 'export PATH=/opt/conda/bin:$PATH' > /etc/profile.d/conda.sh && \ 14 | wget --quiet --no-check-certificate http://repo.continuum.io/miniconda/Miniconda2-latest-Linux-x86_64.sh && \ 15 | /bin/bash /Miniconda2-latest-Linux-x86_64.sh -b -p /opt/conda && \ 16 | rm Miniconda2-latest-Linux-x86_64.sh && \ 17 | /opt/conda/bin/conda install --yes conda==3.14.1 18 | ENV PATH /opt/conda/bin:$PATH 19 | 20 | RUN conda install -c conda conda-env 21 | 22 | RUN echo $PATH 23 | 24 | # Expose Domain Discovery Tool port 25 | EXPOSE 8084 26 | 27 | # Expose ACHE port 28 | EXPOSE 8080 29 | 30 | # Expose ElasticSearch ports 31 | EXPOSE 9200 32 | EXPOSE 9300 33 | 34 | # Expose Supervisord port 35 | EXPOSE 9001 36 | 37 | # Get domain_discovery_API repository 38 | RUN git clone https://github.com/ViDA-NYU/domain_discovery_API.git /ddt/domain_discovery_API 39 | 40 | WORKDIR /ddt/domain_discovery_API 41 | 42 | # Add build file 43 | RUN mv env_docker.yml environment.yml 44 | 45 | RUN make conda_env &&\ 46 | make downloader_app &&\ 47 | make link_word2vec_data &&\ 48 | make tsp_solver &&\ 49 | rm -rf /opt/conda/pkgs 50 | 51 | WORKDIR /ddt/domain_discovery_tool 52 | 53 | # Add build file 54 | ADD ./Makefile /ddt/domain_discovery_tool/Makefile 55 | 56 | # Install conda dependencies and download nltk data 57 | ADD ./environment.yml /ddt/domain_discovery_tool/environment.yml 58 | RUN make conda_env 59 | RUN make get_nltk_data 60 | 61 | # Add client source files 62 | ADD ./client /ddt/domain_discovery_tool/client 63 | 64 | RUN make get_react_install &&\ 65 | make get_react_build &&\ 66 | rm -rf /ddt/domain_discovery_tool/client/node_modules &&\ 67 | rm -rf /ddt/domain_discovery_tool/client/public 68 | 69 | # Add server source files 70 | ADD ./server /ddt/domain_discovery_tool/server 71 | 72 | # Setup remaining configs 73 | RUN
make cherrypy_config 74 | 75 | ADD ./bin/run_ddt /ddt/run_ddt 76 | 77 | CMD bash -c 'source activate ddt; /ddt/run_ddt' -------------------------------------------------------------------------------- /client/src/utils/stopword-filter.js: -------------------------------------------------------------------------------- 1 | const stopWordList = ["a","about","above","after","again","against","all","am", 2 | "an","and","any","are","aren't","as","at","be","because", 3 | "been","before","being","below","between","both","but", 4 | "by","can't","cannot","could","couldn't","did","didn't", 5 | "do","does","doesn't","doing","don't","down","during", 6 | "each","few","for","from","further","had","hadn't","has", 7 | "hasn't","have","haven't","having","he","he'd","he'll", 8 | "he's","her","here","here's","hers","herself","him", 9 | "himself","his","how","how's","i","i'd","i'll","i'm", 10 | "i've","if","in","into","is","isn't","it","it's","its", 11 | "itself","let's","me","more","most","mustn't","my", 12 | "myself","no","nor","not","of","off","on","once","only", 13 | "or","other","ought","our","ours", "ourselves","out", 14 | "over","own","same","shan't","she", "she'd","she'll", 15 | "she's","should","shouldn't","so","some", "such","than", 16 | "that","that's","the","their","theirs", "them", 17 | "themselves","then","there","there's","these", "they", 18 | "they'd","they'll","they're","they've","this","those", 19 | "through","to","too","under","until","up","very","was", 20 | "wasn't","we","we'd","we'll","we're","we've","were", 21 | "weren't","what","what's","when","when's","where", 22 | "where's","which","while","who","who's","whom","why", 23 | "why's","with","won't","would","wouldn't","you","you'd", 24 | "you'll","you're","you've","your","yours","yourself", 25 | "yourselves"]; 26 | 27 | function stopWordFilter(inputString) { 28 | return inputString.split(" ") 29 | .map(word => word.toLowerCase()) 30 | .filter(word => stopWordList.indexOf(word) === -1) 31 | .join(","); 32 | } 33 | 34 | export { stopWordFilter }; 35 | -------------------------------------------------------------------------------- /client/src/utils/utils/stopword-filter.js: -------------------------------------------------------------------------------- 1 | const stopWordList = ["a","about","above","after","again","against","all","am", 2 | "an","and","any","are","aren't","as","at","be","because", 3 | "been","before","being","below","between","both","but", 4 | "by","can't","cannot","could","couldn't","did","didn't", 5 | "do","does","doesn't","doing","don't","down","during", 6 | "each","few","for","from","further","had","hadn't","has", 7 | "hasn't","have","haven't","having","he","he'd","he'll", 8 | "he's","her","here","here's","hers","herself","him", 9 | "himself","his","how","how's","i","i'd","i'll","i'm", 10 | "i've","if","in","into","is","isn't","it","it's","its", 11 | "itself","let's","me","more","most","mustn't","my", 12 | "myself","no","nor","not","of","off","on","once","only", 13 | "or","other","ought","our","ours", "ourselves","out", 14 | "over","own","same","shan't","she", "she'd","she'll", 15 | "she's","should","shouldn't","so","some", "such","than", 16 | "that","that's","the","their","theirs", "them", 17 | "themselves","then","there","there's","these", "they", 18 | "they'd","they'll","they're","they've","this","those", 19 | "through","to","too","under","until","up","very","was", 20 | "wasn't","we","we'd","we'll","we're","we've","were", 21 | "weren't","what","what's","when","when's","where", 22 | 
"where's","which","while","who","who's","whom","why", 23 | "why's","with","won't","would","wouldn't","you","you'd", 24 | "you'll","you're","you've","your","yours","yourself", 25 | "yourselves"]; 26 | 27 | function stopWordFilter(inputString) { 28 | return inputString.split(" ") 29 | .map(word => word.toLowerCase()) 30 | .filter(word => stopWordList.indexOf(word) === -1) 31 | .join(","); 32 | } 33 | 34 | export { stopWordFilter }; 35 | -------------------------------------------------------------------------------- /client/src/images/logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | Domain Discovery Tool (DDT) Documentation 2 | ========================================= 3 | 4 | Domain Discovery is the process of acquiring, understanding and exploring data for a specific domain. Some example domains include human trafficking, illegal sale of weapons and micro-cap fraud. Before a user starts the domain discovery process, she has an "idea" of what she is looking for based on prior knowledge. During domain discovery, the user obtains additional knowledge about how the information she is looking for is represented on the Web. This new knowledge of the domain becomes prior knowledge, leading to an iterative process of domain discovery as illustrated in Figure 2. The goals of the domain discovery process are: 5 | 6 | * Help users learn about a domain and how (and where) it is represented on the Web. 7 | * Acquire a sufficient number of Web pages that capture the user's notion of the domain so that a computational model can be constructed to automatically recognize relevant content. 8 | 9 | .. image:: figures/ddt_arch-new.png 10 | :width: 600px 11 | :align: center 12 | :height: 300px 13 | :alt: alternate text 14 | 15 | The Domain Discovery Tool (DDT) is an interactive system that helps explore and better understand a domain (or topic) as it is represented on the Web. It achieves this by integrating human insights with machine computation (data mining and machine learning) through visualization. DDT allows a domain expert to visualize and analyze pages returned by a search engine or a crawler, and easily provide feedback about relevance. This feedback, in turn, can be used to address two challenges: 16 | 17 | * Assist users in the process of domain understanding and discovery, guiding them to construct effective queries to be issued to a search engine to find additional relevant information; 18 | * Provide an easy-to-use interface whereby users can quickly provide feedback regarding the relevance of pages which can then be used to create learning classifiers for the domains of interest; and 19 | * Support the configuration and deployment of focused crawlers that automatically and efficiently search the Web for additional pages on the topic. DDT allows users to quickly select crawling seeds as well as positive and negatives required to create the page classifier required for the focus topic. 20 | 21 | Contents 22 | ======== 23 | 24 | .. 
toctree:: 25 | :maxdepth: 2 26 | 27 | install 28 | tutorials 29 | use 30 | publication 31 | contact 32 | 33 | Links 34 | ======== 35 | 36 | * `GitHub repository `_ 37 | 38 | Indices and tables 39 | ================== 40 | 41 | * :ref:`genindex` 42 | * :ref:`modindex` 43 | * :ref:`search` 44 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Domain Discovery Tool development 2 | # Type "make" or "make all" to build the complete development environment 3 | # Type "make help" for a list of commands 4 | 5 | # Variables for the Makefile 6 | .PHONY = conda_environment cherrypy_config word2vec_data clean nltk_data 7 | SHELL := /bin/bash 8 | CONDA_ROOT := $(shell conda info --root) 9 | CONDA_ENV := $(CONDA_ROOT)/envs/ddt 10 | HOSTNAME := $(shell hostname) 11 | 12 | CONDA_ENV_TARGET := $(CONDA_ENV)/conda-meta/history 13 | CHERRY_PY_CONFIG_TARGET := server/config.conf 14 | GET_REACT_NPM_INSTALL := client/node_modules 15 | GET_REACT_NPM_BUILD := client/build/index.html 16 | GET_NLTK_DATA_TARGET := nltk_data 17 | 18 | # Makefile commands, see below for actual builds 19 | 20 | ## all : set up DDT development environment 21 | all: conda_env downloader_app cherrypy_config get_react_install get_react_build get_nltk_data 22 | 23 | ## help : show all commands. 24 | # Note the double '##' in the line above: this is what's matched to produce 25 | # the list of commands. 26 | help : Makefile 27 | @sed -n 's/^## //p' $< 28 | 29 | clean: 30 | rm -rf client/build; \ 31 | rm server/config.conf 32 | 33 | ## conda_env : Install/update a conda environment with needed packages 34 | conda_env: $(CONDA_ENV_TARGET) 35 | 36 | ## downloader_app : Build the Java-based downloader application 37 | downloader_app: $(DOWNLOADER_APP_TARGET) 38 | 39 | ## cherrypy_config : Configure CherryPy (set absolute root environment) 40 | cherrypy_config: $(CHERRY_PY_CONFIG_TARGET) 41 | 42 | ## get_nltk_data : Download NLTK corpus and tokenizers 43 | get_nltk_data: $(GET_NLTK_DATA_TARGET) 44 | 45 | ## get_react_install : Download react packages 46 | get_react_install: $(GET_REACT_NPM_INSTALL) 47 | 48 | ## get_react_build : Build react packages 49 | get_react_build: $(GET_REACT_NPM_BUILD) 50 | 51 | # Actual Target work here 52 | 53 | $(CONDA_ENV_TARGET): environment.yml 54 | conda env update 55 | 56 | $(CHERRY_PY_CONFIG_TARGET): server/config.conf-in 57 | sed "s#tools.staticdir.root = .#tools.staticdir.root = ${PWD}/client/build#g" server/config.conf-in > server/config.conf 58 | 59 | $(GET_NLTK_DATA_TARGET): 60 | source activate ddt; \ 61 | python -m nltk.downloader -d ${PWD}/nltk_data stopwords brown punkt averaged_perceptron_tagger 62 | 63 | $(GET_REACT_NPM_INSTALL): 64 | source activate ddt; \ 65 | pushd client; \ 66 | npm install radviz-component@1.0.54; \ 67 | npm install; \ 68 | python fix_for_npm_child_process_issue.py; \ 69 | popd 70 | 71 | 72 | $(GET_REACT_NPM_BUILD): 73 | source activate ddt; \ 74 | pushd client; \ 75 | npm run build; \ 76 | cp build/index.html build/domain_discovery_tool.html; \ 77 | cp -rf public/font-awesome-4.7.0 build/static; \ 78 | popd 79 | -------------------------------------------------------------------------------- /docs/load_data.rst: -------------------------------------------------------------------------------- 1 | Acquire Data 2 | ------------ 3 | 4 | Continuing with our example of the **Ebola** domain, we show here the methods of uploading data. 
Expand the Search tab on the left panel. You can add data to the domain in the following ways: 5 | 6 | Upload URLs 7 | *********** 8 | 9 | If you have a set of URLs of sites you already know, you can add them from the **LOAD** tab. You can upload the list of URLs in the text box, one fully qualified URL per line, as shown in the figure below: 10 | 11 | .. image:: figures/load_url_text.png 12 | :width: 800px 13 | :align: center 14 | :height: 400px 15 | :alt: alternate text 16 | 17 | You can also upload a file with the list of URLs by clicking on the **LOAD URLS FROM FILE** button. This will bring up a file explorer window where you can select the file to upload. *The list of fully qualified URLs should be entered one per line in the file*. For example: 18 | 19 | | http://www.plospathogens.org/article/info%3Adoi%2F10.1371%2Fjournal.ppat.1003065 20 | | https://bmcpsychiatry.biomedcentral.com/articles/10.1186/s12888-017-1280-8 21 | | http://www.cdph.ca.gov/programs/cder/Pages/Ebola.aspx 22 | 23 | Download an example URL list file for the ebola domain `HERE `_. Once the file is selected, you can upload the URLs by clicking on **RELEVANT**, **IRRELEVANT**, **NEUTRAL** or **Add Tag** (add a custom tag). This will annotate the pages correspondingly. 24 | 25 | .. image:: figures/load_urls_popup.png 26 | :width: 800px 27 | :align: center 28 | :height: 400px 29 | :alt: alternate text 30 | 31 | The uploaded URLs are listed in the **Filters** Tab under **Queries** as **Uploaded**. 32 | 33 | Web Search 34 | *********** 35 | 36 | You can do a keyword search on Google or Bing by clicking on the **WEB** tab. For example, “ebola symptoms”. All queries made are listed in the **Filters** Tab under **Queries**. 37 | 38 | .. image:: figures/query_web.png 39 | :width: 800px 40 | :align: center 41 | :height: 400px 42 | :alt: alternate text 43 | 44 | If you have multiple search queries, you can load them by clicking on the **Run Multiple Queries** button. This will bring up a window where you can either add the queries one per line in a textbox or upload a file that contains the search queries one per line. You can select the search engine to use (**Google** or **Bing**): 45 | 46 | .. image:: figures/load_multiple_queries.png 47 | :width: 800px 48 | :align: center 49 | :height: 400px 50 | :alt: alternate text 51 | 52 | Each of the queries will be issued on Google or Bing (as chosen) and the results made available for exploration and annotation in the **Filters** Tab under **Queries** as **Uploaded**. 53 | 54 | 55 | 56 | 57 | 58 | 59 | -------------------------------------------------------------------------------- /docs/filter.rst: -------------------------------------------------------------------------------- 1 | Explore Data (Filters) 2 | ---------------------- 3 | 4 | .. image:: figures/filters.png 5 | :width: 800px 6 | :align: center 7 | :height: 400px 8 | :alt: alternate text 9 | 10 | Once some pages are loaded into the domain, they can be analyzed and sliced with the various filters available in the Filters tab on the left panel. The available filters are: 11 | 12 | Queries 13 | ******* 14 | 15 | This lists all the web search queries and uploaded URLs made to date in the domain. You can select one or more of these queries to get pages for those specific queries. 16 | 17 | Tags 18 | **** 19 | 20 | This lists the annotations made to the data. Currently the annotations can be either **Relevant**, **Irrelevant** or **Neutral**.
21 | 22 | Domains 23 | ******* 24 | 25 | This lists all the top-level domains of all the pages in the domain. For example, the top-level domain for the URL https://ebolaresponse.un.org/data is **ebolaresponse.un.org**. 26 | 27 | Model Tags 28 | ********** 29 | 30 | You can expand the **Model Tags** and click the **Update Model Tags** button that appears below, to apply the domain model to a random selection of 500 unlabeled pages. The predicted labels for these 500 pages could be: 31 | 32 | * **Maybe Relevant:** These are pages that have been labeled relevant by the model with a high confidence. 33 | * **Maybe Irrelevant:** These are pages that have been labeled irrelevant by the model with a high confidence. 34 | * **Unsure:** These are pages that were marked relevant or irrelevant by the domain model but with low confidence. Experiments have shown that labeling these pages helps improve the domain model's ability to predict labels for similar pages with higher confidence. 35 | 36 | **NOTE:** Applying the model and showing the results will take a few seconds. 37 | 38 | Annotated Terms 39 | *************** 40 | 41 | This lists all the terms that are either added or uploaded in the Terms Tab. It also lists the terms from the extracted terms in the Terms Tab that are annotated. 42 | 43 | SeedFinder Queries 44 | ****************** 45 | 46 | This lists all the SeedFinder queries made to date in the domain. You can select one or more of these queries to get pages for those specific queries. 47 | 48 | Crawled Data 49 | ************ 50 | 51 | This lists the relevant and irrelevant crawled data. The relevant crawled data, **CD Relevant**, are those crawled pages that are labeled relevant by the domain model. The irrelevant crawled data, **CD Irrelevant**, are those crawled pages that are labeled irrelevant by the domain model. 52 | 53 | Search for Keywords 54 | ******************* 55 | 56 | .. image:: figures/search.png 57 | :width: 800px 58 | :align: center 59 | :height: 400px 60 | :alt: alternate text 61 | 62 | Search by keywords within the page content text. This search is available on the top right corner as shown in the figure above. It can be used along with the other filters. The keywords are searched not only in the content of the page but also in the title and URL of the page.
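As a rough sketch of what such a search can look like against the Elasticsearch index that backs DDT, the example below issues a ``multi_match`` query over the page text, title and URL; the index name, the field names and the use of the ``elasticsearch`` Python client are assumptions made for illustration, not DDT's actual query code::

    # Illustrative only: search keywords across page content, title and URL.
    from elasticsearch import Elasticsearch

    def keyword_search(keywords, index="ebola", size=50):
        es = Elasticsearch(["http://localhost:9200"])
        body = {
            "query": {
                "multi_match": {
                    "query": keywords,
                    "fields": ["text", "title", "url"],   # assumed field names
                }
            },
            "size": size,
        }
        hits = es.search(index=index, body=body)["hits"]["hits"]
        return [hit["_source"].get("url") for hit in hits]

    print(keyword_search("ebola treatment"))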
63 | 64 | 65 | 66 | -------------------------------------------------------------------------------- /client/src/components/Domain.js: -------------------------------------------------------------------------------- 1 | import React, { Component } from 'react'; 2 | import Body from './Body'; 3 | import Header from './Header'; 4 | 5 | class Domain extends Component { 6 | 7 | constructor(props) { 8 | super(props); 9 | this.state = { 10 | idDomain:'', 11 | deleteKeywordSignal:false, 12 | reloadBody:true, 13 | noModelAvailable:true, 14 | updateCrawlerData:"", 15 | filterKeyword:null, 16 | valueSelectedViewBody:1, 17 | statusCrawlers:[], 18 | }; 19 | }; 20 | 21 | componentWillMount(){ 22 | this.setState({idDomain: this.props.location.query.idDomain}); 23 | }; 24 | 25 | componentWillReceiveProps = (newProps, nextState) => { 26 | if(newProps.location.query.idDomain ===this.state.idDomain){ 27 | return; 28 | } 29 | this.setState({idDomain: this.props.location.query.idDomain}); 30 | 31 | }; 32 | 33 | shouldComponentUpdate(nextProps, nextState) { 34 | if(nextProps.location.query.idDomain ===this.state.idDomain){ 35 | return false; 36 | } 37 | return true; 38 | }; 39 | 40 | //This function is called from Header component when user applies a filter by keyword (serch by keyword) 41 | filterKeyword(newFilterKeyword){ 42 | this.setState({filterKeyword:newFilterKeyword, deleteKeywordSignal:false, reloadBody:true }); 43 | this.forceUpdate(); 44 | } 45 | //This function is called from the Body. Check if there is a available model for the current domain. 46 | availableCrawlerButton(noModelAvailable){ //false means that there is a available model 47 | this.setState({noModelAvailable:noModelAvailable,reloadBody:false }); 48 | this.forceUpdate(); 49 | } 50 | 51 | //This function is called from the Body. It just update the Header in order to clear the textfield associated with the search by keyword. 52 | deletedFilter(filter_Keyword){ 53 | this.setState({ deleteKeywordSignal:true, reloadBody:false }); 54 | this.forceUpdate(); 55 | } 56 | //if updateCrawlerData is true, then the filter 'crawler data' should be updated because the crawler is still running. 57 | updateFilterCrawlerData(updateCrawlerData, statusCrawlers){ 58 | this.setState({ updateCrawlerData:updateCrawlerData, reloadBody:true, statusCrawlers:statusCrawlers}); 59 | this.forceUpdate(); 60 | } 61 | 62 | selectedViewBody(valueViewBody){ 63 | this.setState({valueSelectedViewBody:valueViewBody}); 64 | this.forceUpdate(); 65 | } 66 | 67 | render() { 68 | 69 | return ( 70 |
71 |
72 | 73 |
74 | ); 75 | } 76 | } 77 | 78 | export default Domain; 79 | -------------------------------------------------------------------------------- /client/src/components/Monitoring.js: -------------------------------------------------------------------------------- 1 | import React, { Component } from 'react'; 2 | import { 3 | Table, 4 | TableBody, 5 | TableHeader, 6 | TableHeaderColumn, 7 | TableRow, 8 | TableRowColumn, 9 | } from 'material-ui/Table'; 10 | 11 | import RaisedButton from 'material-ui/RaisedButton'; 12 | import $ from 'jquery'; 13 | 14 | class Monitoring extends Component { 15 | 16 | constructor(props){ 17 | super(props); 18 | this.state={ 19 | processes:{}, 20 | } 21 | } 22 | componentWillMount(){ 23 | this.setState({processes: this.props.processes,}) 24 | } 25 | 26 | componentWillReceiveProps(nextProps){ 27 | this.setState({processes: nextProps.processes, }) 28 | } 29 | 30 | stopProcess(process, process_info){ 31 | $.post( 32 | '/stopProcess', 33 | {"process": process["process"], "process_info": JSON.stringify(process_info["process_row"])}, 34 | function(message) { 35 | if(message.includes("Crawler")) 36 | this.props.updateFilterCrawlerData("stopCrawler"); 37 | console.log(message); 38 | }.bind(this) 39 | ); 40 | } 41 | 42 | render(){ 43 | var rows = Object.keys(this.state.processes).map((process, index)=>{ 44 | return this.state.processes[process].map((process_row, index_row)=>{ 45 | var disableStop = true; 46 | if(process_row.status === "Running" || process_row.status === "Starting" || process_row.status === "Downloading") 47 | disableStop = false; 48 | if(process_row.status !== "Completed"){ 49 | return 50 | {process} 51 | {process_row.domain} 52 | {process_row.status} 53 | {process_row.description} 54 | 55 | ; 56 | }else return; 57 | }); 58 | }); 59 | return ( 60 | 61 | 62 | Process 63 | Domain 64 | Status 65 | Description 66 | Stop 67 | 68 | 69 | 70 | 71 | {rows} 72 | 73 |
); 74 | } 75 | } 76 | 77 | Monitoring.propTypes = { 78 | messageCrawler: React.PropTypes.string.isRequired, 79 | }; 80 | 81 | Monitoring.defaultProps = { 82 | widthProcess:100, 83 | widthDomain:150, 84 | widthStatus:150, 85 | widthDescription:150, 86 | widthStop:100, 87 | }; 88 | 89 | export default Monitoring; 90 | -------------------------------------------------------------------------------- /client/src/components/DomainInfo.js: -------------------------------------------------------------------------------- 1 | import React, {Component} from 'react'; 2 | import {Card, CardHeader, CardMedia} from 'material-ui/Card'; 3 | import Avatar from 'material-ui/Avatar'; 4 | import $ from 'jquery'; 5 | import Home from 'material-ui/svg-icons/action/home'; 6 | //import Bars from 'react-bars'; 7 | const styles = { 8 | card: { 9 | 10 | borderStyle: 'solid', 11 | borderColor: '#C09ED7', 12 | background: 'white', 13 | borderRadius: '0px 0px 0px 0px', 14 | borderWidth: '0px 0px 1px 0px' 15 | }, 16 | avatar:{ 17 | margin:'-4px 8px 0px 0px', 18 | }, 19 | cardHeader:{ 20 | background: '#DCCCE7', 21 | padding:'10px 1px 10px 6px', 22 | borderRadius: '0px 0px 0px 0px', 23 | }, 24 | cardMedia:{ 25 | background: '#DCCCE7', 26 | padding:'2px 4px 2px 4px', 27 | borderRadius: '0px 0px 0px 0px', 28 | border:'solid', 29 | borderColor: '#C09ED7', 30 | }, 31 | 32 | }; 33 | 34 | class DomainInfo extends Component{ 35 | 36 | constructor(props) { 37 | super(props); 38 | this.state = { 39 | expanded: this.props.statedCard, 40 | currentTags: '', 41 | }; 42 | }; 43 | 44 | getTags(){ 45 | $.post( 46 | '/getAvailableTags', 47 | {'session': JSON.stringify(this.props.session), 'event': 'Tags'}, 48 | function(tagsDomain) { 49 | this.setState({currentTags: tagsDomain['tags']}); 50 | }.bind(this) 51 | ); 52 | } 53 | 54 | componentWillMount = () => { 55 | this.getTags(); 56 | this.setState({expanded: this.props.statedCard, }); 57 | }; 58 | 59 | componentWillReceiveProps = (newProps) => { 60 | this.setState({expanded: this.props.statedCard}, function() { 61 | this.setState({expanded: this.props.statedCard}); 62 | }); 63 | }; 64 | 65 | handleExpandChange = (expanded) => { 66 | this.setState({expanded: expanded}); 67 | if(expanded){ 68 | this.getTags(); 69 | this.props.setActiveMenu(expanded, 2); 70 | } 71 | }; 72 | 73 | handleToggle = (event, toggle) => { 74 | console.log("handleToggle"); 75 | this.setState({expanded: toggle}); 76 | }; 77 | 78 | handleExpand = () => { 79 | console.log("expand"); 80 | this.setState({expanded: true}); 81 | }; 82 | 83 | handleReduce = () => { 84 | console.log("reduce"); 85 | this.setState({expanded: false}); 86 | }; 87 | 88 | render(){ 89 | return( 90 | 91 | } />} 94 | style={styles.cardHeader} 95 | actAsExpander={true} 96 | showExpandableButton={true} 97 | /> 98 | 99 |

Domain: {this.props.nameDomain}

100 |

Labeled data:

101 |

Relevant: {this.state.currentTags["Relevant"]}

102 |

Irrelevant: {this.state.currentTags["Irrelevant"]}

103 |

Neutral: {this.state.currentTags["Neutral"]}

104 |
105 |
106 | ) 107 | } 108 | } 109 | 110 | export default DomainInfo; 111 | -------------------------------------------------------------------------------- /client/src/components/Search.js: -------------------------------------------------------------------------------- 1 | // Filename: Search.js 2 | // Purpose: This is an intermediate component between Body.js and SearchTabs.js. It handles the changes in search card. 3 | // Author: Sonia Castelo (scastelo2@gmail.com) 4 | import React, {Component} from 'react'; 5 | import {Card, CardHeader, CardMedia} from 'material-ui/Card'; 6 | import SearchTabs from './SearchTabs'; 7 | import Avatar from 'material-ui/Avatar'; 8 | import Assignment from 'material-ui/svg-icons/action/assignment-returned'; 9 | 10 | const styles = { 11 | card: { 12 | 13 | borderStyle: 'solid', 14 | borderColor: '#C09ED7', 15 | background: 'white', 16 | borderRadius: '0px 0px 0px 0px', 17 | borderWidth: '0px 0px 1px 0px' 18 | }, 19 | avatar:{ 20 | margin:'-4px 8px 0px 0px', 21 | }, 22 | cardHeader:{ 23 | background: '#DCCCE7', 24 | padding:'10px 1px 10px 6px', 25 | borderRadius: '0px 0px 0px 0px', 26 | }, 27 | cardMedia:{ 28 | background: '#DCCCE7', 29 | padding:'2px 4px 2px 4px', 30 | borderRadius: '0px 0px 0px 0px', 31 | height: "200px", 32 | }, 33 | 34 | }; 35 | 36 | class Search extends Component{ 37 | 38 | constructor(props) { 39 | super(props); 40 | this.state = { 41 | expanded: this.props.statedCard, 42 | update:true, 43 | }; 44 | }; 45 | 46 | componentWillMount = () => { 47 | this.setState({expanded: this.props.statedCard, }); 48 | }; 49 | 50 | //Handling state's changes of search card. (expanded or reduced) 51 | componentWillReceiveProps = (newProps) => { 52 | this.setState({expanded: this.props.statedCard}, function() { 53 | this.setState({expanded: this.props.statedCard}); 54 | }); 55 | }; 56 | 57 | handleExpandChange = (expanded) => { 58 | this.setState({expanded: expanded}); 59 | if(expanded){ 60 | this.props.setActiveMenu(expanded, 0); 61 | } 62 | }; 63 | 64 | handleReduce = () => { 65 | this.setState({expanded: false}); 66 | }; 67 | 68 | //Send information about a query that is running (from SearchTabs.js to Body.js) 69 | updateStatusMessage(value, term){ 70 | this.setState({update:!value}); /// false = reload, true = dont reload the search area 71 | this.props.updateStatusMessage(value, term); 72 | } 73 | 74 | updatePages(pages){ 75 | this.props.updatePages(pages); 76 | } 77 | getQueryPages(term){ 78 | this.props.getQueryPages(term); 79 | } 80 | queryPagesDone(){ 81 | this.props.queryPagesDone(); 82 | } 83 | 84 | 85 | //Check if the component should be updated or not 86 | shouldComponentUpdate(){ 87 | return true; 88 | 89 | } 90 | 91 | render(){ 92 | return( 93 | 94 | } />} 97 | style={styles.cardHeader} 98 | actAsExpander={true} 99 | showExpandableButton={true} 100 | /> 101 | 102 | 103 | 104 | 105 | 106 | ) 107 | 108 | } 109 | } 110 | 111 | export default Search; 112 | -------------------------------------------------------------------------------- /docs/ebola_urls.txt: -------------------------------------------------------------------------------- 1 | http://www.plospathogens.org/article/info%3Adoi%2F10.1371%2Fjournal.ppat.1003065 2 | https://bmcpsychiatry.biomedcentral.com/articles/10.1186/s12888-017-1280-8 3 | http://www.cdph.ca.gov/programs/cder/Pages/Ebola.aspx 4 | http://abc7ny.com/health/preliminary-tests-on-conn-patient-negative-for-ebola/352911/ 5 | https://www.uptodate.com/contents/clinical-manifestations-and-diagnosis-of-ebola-virus-disease 6 | 
https://www.nhsinform.scot/illnesses-and-conditions/infections-and-poisoning/ebola-virus-disease 7 | http://www.humanillnesses.com/original/E-Ga/Ebola-Fever.html 8 | http://www.rgj.com/story/news/2014/11/07/passenger-monitored-possible-ebola-symptoms-reno/18674105/ 9 | http://food.ndtv.com/health/ebola-virus-origin-and-control-1659259 10 | http://amarillo.com/news/local-news/2015-01-09/travelers-under-ebola-quarantine-amarillo 11 | http://www.nydailynews.com/life-style/health/u-s-ebola-symptoms-travelers-airports-borders-article-1.1967272 12 | http://www.ebolavirusnet.com/signs-a-symptoms.html 13 | http://www.virusprecautions.com/ 14 | http://www.king5.com/news/health/seattle-biotech-company-working-to-cure-ebola-like-disease/428092069 15 | https://www.grainger.com/content/ebola 16 | https://www.cdc.gov/vhf/ebola/about.html 17 | http://kfor.com/2014/10/30/health-officials-know-the-difference-between-ebola-and-the-flu-symptoms/ 18 | http://www.mayoclinic.com/health/ebola-virus/ds00996/dsection=symptoms 19 | http://www.dallasnews.com/news/plano/2014/10/03/plano-isd-person-cdc-monitoring-not-a-student-has-no-ebola-symptoms 20 | http://www.livescience.com/48311-ebola-causes-symptoms-treatment.html 21 | http://www.thehealthsite.com/diseases-conditions/ebola-virus/001/ 22 | https://www.mooc-list.com/tags/ebola-symptoms 23 | http://app6.vocusgr.com/Tracking.aspx?Data=HHL%3D92%3C3%3D%26JDG%3C98%3C!OHL%3D8%2B62&RE=IN&RI=3280911&Preview=False&DistributionActionID=31660&Action=Follow+Link 24 | http://wjla.com/news/health/ebola-enterovirus-or-the-flu-which-symptoms-are-yours--108050 25 | https://www.sciencedaily.com/releases/2016/11/161115150637.htm 26 | https://www.washingtonpost.com/news/to-your-health/wp/2015/10/09/british-nurses-re-hospitalization-reports-of-blindness-and-other-post-ebola-syndrome-complications-haunt-survivors/ 27 | http://www.aha.org/advocacy-issues/emergreadiness/ebola/index.shtml 28 | https://ebolaresponse.un.org/data 29 | http://apps.who.int/ebola/ebola-situation-reports 30 | http://ebola.emedtv.com/ebola/ebola-symptoms.html 31 | https://www.cdc.gov/vhf/ebola/symptoms/ 32 | https://www.sharecare.com/health/ebola-virus/how-common-is-ebola 33 | http://www.who.int/csr/disease/ebola/ebola-6-months/sierra-leone/en/ 34 | http://www.who.int/csr/disease/ebola/situation-reports/en/ 35 | http://ebola.ici3d.org/ 36 | http://gotnews.com/?p=1536 37 | http://ebola.emedtv.com/ebola/ebola-cure.html 38 | http://www.tweenshealth.com/en/parents/ebola.html?WT.ac=ctg 39 | https://theconversation.com/explainer-what-is-ebola-virus-25071 40 | http://health.howstuffworks.com/diseases-conditions/infectious/ebola3.htm 41 | http://ebola.emedtv.com/ebola/ebola-treatment.html 42 | http://www.cnn.com/2014/04/11/health/ebola-fast-facts/ 43 | http://t.umblr.com/redirect?z=http%3A%2F%2Fen.wikipedia.org%2Fwiki%2FThe_Cars&t=MmU1MTRmZTVjZjllODFmMWRjNzUwYTBlNTRkODg1NWJjNTA0ODMzMSxUWGh5dDJzSQ%3D%3D& 44 | http://www.rxeconsult.com/healthcare-articles/Ebola-Virus-Outbreak-Symptoms-Treatment-and-Prevention-of-Ebola-Hemorrhagic-Fever-661/ 45 | https://www.aap.org/en-us/advocacy-and-policy/aap-health-initiatives/Children-and-Disasters/Pages/Ebola.aspx 46 | http://www.nbc-2.com/story/26782160/ebola-fact-sheet-statistics-and-timeline 47 | http://gotnews.com/ebola-nurse-boyfriend-reportedly-admitted-ebola-symptoms/ 48 | http://health.alot.com/conditions/ebola-symptoms--4986 49 | http://www.everydayhealth.com/ebola/guide/symptoms/ 50 | -------------------------------------------------------------------------------- 
/client/src/components/SidebarMenu.js: -------------------------------------------------------------------------------- 1 | import React, {Component} from 'react'; 2 | import { Row, Col} from 'react-bootstrap'; 3 | import DomainInfo from './DomainInfo'; 4 | import Search from './Search'; 5 | import Filters from './Filters'; 6 | import '../css/Components.css'; 7 | import Plus from 'material-ui/svg-icons/action/swap-horiz'; 8 | import FloatingActionButton from 'material-ui/FloatingActionButton'; 9 | 10 | const styles = { 11 | button:{ 12 | marginTop:20, 13 | paddingBottom:'-145px', 14 | marginBottom:'-545px', 15 | marginRight: 5, 16 | }, 17 | }; 18 | 19 | class SidebarMenu extends Component{ 20 | 21 | constructor(props) { 22 | super(props); 23 | this.state = { 24 | session:{}, 25 | }; 26 | } 27 | 28 | componentWillMount() { 29 | this.setState({session:this.props.session}); 30 | 31 | 32 | } 33 | 34 | updateSession(newSession){ 35 | this.props.updateSession(newSession); 36 | } 37 | 38 | closeMenu(){ 39 | this.setState({ 40 | size: 60, 41 | //iconDomainInfo:} />, 42 | //stateDomainInfoCard:false, 43 | open: !this.state.open, 44 | sizeAvatar:35, 45 | }); 46 | } 47 | 48 | openMenu(){ 49 | this.setState({ 50 | size: 350, 51 | iconDomainInfo:null, 52 | open: !this.state.open, 53 | sizeAvatar:25, 54 | }); 55 | } 56 | 57 | openDockMenu(){ 58 | if(this.state.open){ 59 | this.closeMenu(); 60 | this.setState({ 61 | stateDomainInfoCard:false, 62 | stateSearchCard:false, 63 | stateFiltersCard:false, 64 | });} 65 | else{ 66 | this.openMenu(); 67 | this.setState({ 68 | stateDomainInfoCard:false, 69 | stateSearchCard:false, 70 | stateFiltersCard:false, 71 | }); 72 | } 73 | } 74 | 75 | setActiveMenu (expanded, menu) { 76 | console.log("setActiveMenu " + expanded.toString() + " " + this.state.open.toString()); 77 | if(!this.state.open){ 78 | this.openMenu(); 79 | } 80 | var item = menu===0 ? this.setState({stateSearchCard: expanded, stateFiltersCard :!expanded, stateDomainInfoCard:!expanded}) : 81 | ( menu===1 ? this.setState({stateFiltersCard: expanded, stateSearchCard: !expanded, stateDomainInfoCard:!expanded}) : this.setState({ stateDomainInfoCard:expanded, stateFiltersCard: !expanded, stateSearchCard: !expanded})); 82 | } 83 | 84 | render(){ 85 | return ( 86 |
87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 |
104 | ) 105 | } 106 | } 107 | 108 | export default SidebarMenu; 109 | -------------------------------------------------------------------------------- /server/server.py: -------------------------------------------------------------------------------- 1 | import cherrypy 2 | from domain_discovery_API.server import Page 3 | #import domain_discovery_api.* as dd_api 4 | from ConfigParser import ConfigParser 5 | import json 6 | import os 7 | from threading import Lock 8 | import urlparse 9 | from domain_discovery_API.models.domain_discovery_model import DomainModel 10 | from domain_discovery_API.models.crawler_model import CrawlerModel 11 | from domain_discovery_API.models.RadvizModel import RadvizModel 12 | 13 | class DDTServer(Page): 14 | @staticmethod 15 | def getConfig(): 16 | # Parses file to prevent cherrypy from restarting when config.conf changes: after each request 17 | # it restarts saying config.conf changed, when it did not. 18 | config = ConfigParser() 19 | config.read(os.path.join(os.path.dirname(__file__), "config.conf")) 20 | 21 | configMap = {} 22 | for section in config.sections(): 23 | configMap[section] = {} 24 | for option in config.options(section): 25 | # Handles specific integer entries. 26 | val = config.get(section, option) 27 | if option == "server.socket_port" or option == "server.thread_pool": 28 | val = int(val) 29 | configMap[section][option] = val 30 | 31 | return configMap 32 | 33 | # Default constructor reading app config file. 34 | def __init__(self): 35 | path = os.path.dirname(os.path.realpath(__file__)) 36 | self._ddtModel = DomainModel(path) 37 | self._crawlerModel = CrawlerModel(path) 38 | self._ddtModel.setCrawlerModel(self._crawlerModel) 39 | self._radvizModel = RadvizModel(path) 40 | models = {"domain": self._ddtModel, "crawler": self._crawlerModel, "radviz": self._radvizModel} 41 | super(DDTServer, self).__init__(models, path) 42 | 43 | # Access to seed crawler vis. 44 | @cherrypy.expose 45 | def seedcrawler(self): 46 | # TODO Use SeedCrawlerModelAdapter self._crawler = SeedCrawlerModelAdapter() 47 | return open(os.path.join(self._HTML_DIR, u"index.html")) 48 | 49 | @cherrypy.expose 50 | def release(self): 51 | return open(os.path.join(self._HTML_DIR, u"release.html")) 52 | 53 | @cherrypy.expose 54 | def index(self): 55 | return self.seedcrawler() 56 | 57 | # Submits a web query for a list of terms, e.g. 'ebola disease' 58 | # @cherrypy.expose 59 | # def queryWeb(self, terms, session): 60 | # print "\n\n\n QUERY WEB IN REACT" 61 | # session = json.loads(session) 62 | # cherrypy.response.headers["Content-Type"] = "text/plain;" 63 | # # for res in self._model.queryWeb(terms, 20, session=session): 64 | # # print "\n\n\n SERVER QUERY WEB\n",res,"\n\n\n" 65 | 66 | # return self._model.queryWeb(terms, 20, session=session) 67 | # queryWeb._cp_config ={'response.stream':True} 68 | 69 | @cherrypy.expose 70 | def thing(self): 71 | cherrypy.response.headers['Content-Type'] = 'text/plain' 72 | # if not authorized(): 73 | # raise cherrypy.NotFound() 74 | def content(): 75 | print "\n\n\nRUNNING CONTENT\n\n\n" 76 | yield json.dumps({"first":"Hello"}) 77 | yield json.dumps({"first":"World"}) 78 | return content() 79 | thing._cp_config = {'response.stream': True} 80 | 81 | if __name__ == "__main__": 82 | server = DDTServer() 83 | 84 | # CherryPy always starts with app.root when trying to map request URIs 85 | # to objects, so we need to mount a request handler root. A request 86 | # to "/" will be mapped to HelloWorld().index(). 
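# Note: cherrypy.quickstart() below both mounts this handler at "/" and starts (and blocks on) the CherryPy engine, which is why the explicit engine.start()/engine.block() calls further down are left commented out.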
87 | app = cherrypy.quickstart(server, config=DDTServer.getConfig()) 88 | cherrypy.config.update(server.config) 89 | #app = cherrypy.tree.mount(page, "/", page.config) 90 | 91 | #if hasattr(cherrypy.engine, "signal_handler"): 92 | # cherrypy.engine.signal_handler.subscribe() 93 | #if hasattr(cherrypy.engine, "console_control_handler"): 94 | # cherrypy.engine.console_control_handler.subscribe() 95 | #cherrypy.engine.start() 96 | #cherrypy.engine.block() 97 | 98 | else: 99 | server = DDTServer() 100 | # This branch is for the test suite; you can ignore it. 101 | config = DDTServer.getConfig() 102 | app = cherrypy.tree.mount(server, config=config) 103 | -------------------------------------------------------------------------------- /docs/visualization_radviz.rst: -------------------------------------------------------------------------------- 1 | Visualization through RadViz 2 | ---------------------------- 3 | 4 | Select the **Visualization Tab** in the **Explore Data View** to see the multidimensional scaling visualization using RadViz. 5 | 6 | .. image:: figures/visualization_general.png 7 | :width: 800px 8 | :align: center 9 | :height: 400px 10 | :alt: alternate text 11 | 12 | RadViz is a data visualization that enables users to explore and analyze samples in a data set (such as a corpus of web pages in the case of DDT), represented as points in the visualization, in terms of similarity relations among semantic descriptors (keywords on the pages). Keywords are located along a circle, and pages are represented as points inside the circle. The more similar two pages are, the closer together they appear. Also, the greater the proximity of a page to a keyword, the greater the frequency of occurrence of that keyword in that page. This kind of analysis allows users to identify regions of interest in the data set according to the most relevant features of the sample. 13 | 14 | Explore Pages 15 | ************* 16 | 17 | To explore the pages in the visualization, first select the pages whose details you want to see. 18 | 19 | .. image:: figures/visualization_toolbar_lassoSelect.png 20 | :width: 400px 21 | :align: center 22 | :height: 410px 23 | :alt: alternate text 24 | 25 | Selection of a group of pages is done using lasso selection. For this, simply drag a freehand selection around the pages located in the circle in RadViz, similar to how you would outline something on a piece of paper with a pen or pencil. To start the lasso selection, click at the spot where you want to begin the selection, then continue holding your mouse button down and drag to draw a freeform selection outline. To complete the selection, simply release your mouse button. You don't necessarily have to return to the same spot you started from, but if you don't, RadViz will automatically close the selection for you by drawing a straight line from the point where you released your mouse button to the point where you began, so in most cases you will want to finish where you started. 26 | 27 | When the pages are selected, you will observe the following: 28 | 29 | - Keywords contained in the selected pages will be highlighted along the circle. 30 | - A WordCloud of all the top keywords contained in the selected pages is generated in the top right corner.
The font size of a keyword in the word cloud is proportional to the frequency of occurrence of that word. 31 | - Snippets of the selected pages are shown in the bottom right corner. 32 | 33 | Pages can also be tagged through RadViz. Drawing a lasso around any region of interest makes it easy to select a sub-group of pages, and the selected pages can then be tagged as ’Positive’, ’Negative’, or with a custom tag. 34 | 35 | ToolBar RadViz 36 | ************** 37 | 38 | .. image:: figures/visualization_toolbar1.png 39 | :width: 700px 40 | :align: center 41 | :height: 80px 42 | :alt: alternate text 43 | 44 | This visualization has five controls to interact with, whose functionality is described below. 45 | 46 | Showing data in RadViz 47 | <<<<<<<<<<<<<<<<<<<<<< 48 | 49 | .. |toolbar_select| image:: figures/visualization_toolbar_select.png 50 | 51 | The |toolbar_select| radio buttons can be used to show or hide data in RadViz. 52 | 53 | **Show all:** Show all is selected by default in this visualization. It shows all the pages present in the data collection. 54 | 55 | **Hide selected:** This option hides the selected pages in the current view. 56 | 57 | **Hide unselected:** This option hides the pages that are not selected. 58 | 59 | 60 | Translation in RadViz 61 | <<<<<<<<<<<<<<<<<<<<<< 62 | 63 | .. |toolbar_translation| image:: figures/visualization_toolbar_transaltion.png 64 | 65 | The |toolbar_translation| slider allows users to calibrate the degree of denseness or sparseness of the page representations in the visualization. 66 | 67 | Find Keyword in RadViz 68 | <<<<<<<<<<<<<<<<<<<<<< 69 | 70 | .. |toolbar_find_keyword| image:: figures/visualization_toolbar_find_keyword.png 71 | 72 | The |toolbar_find_keyword| auto-complete text field allows users to search for a keyword among all the keywords in the visualization. A blue font color is used to highlight the matched keyword (shown below). The autocompletion is built from all the keywords used in the current view of RadViz. 73 | 74 | .. image:: figures/visualization_toolbar_find_keyword_result.png 75 | :width: 400px 76 | :align: center 77 | :height: 400px 78 | :alt: alternate text 79 | -------------------------------------------------------------------------------- /client/src/components/TermsSnippetViewer.js: -------------------------------------------------------------------------------- 1 | // Filename: TermsSnippetViewer.js 2 | // Purpose: Shows snippets that belong to a specific term.
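// Data flow: the parent component passes the selected term and the current session down as props; this component POSTs them to the '/getTermSnippets' endpoint and renders each returned snippet context with d3, opening the source URL when a snippet is clicked.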
3 | // Author: Sonia Castelo (scastelo2@gmail.com) 4 | 5 | 6 | import React, { Component } from 'react'; 7 | import {scaleLinear} from 'd3-scale'; 8 | import {range} from 'd3-array'; 9 | import {select} from 'd3-selection'; 10 | import ReactFauxDom from 'react-faux-dom'; 11 | import $ from "jquery"; 12 | //import {select} from 'd3-selection'; 13 | //import ReactFauxDom from 'react-faux-dom'; 14 | 15 | 16 | class TermsSnippetViewer extends Component { 17 | constructor(props){ 18 | super(props); 19 | this.state = { 20 | term: [], 21 | }; 22 | this.update = this.update.bind(this); 23 | this.onLoadedTermsSnippets = this.onLoadedTermsSnippets.bind(this); 24 | 25 | //this.drawWordCloud = this.drawWordCloud.bind(this); 26 | } 27 | 28 | componentWillMount(){ 29 | this.setState({term:this.props.term}); 30 | this.getTermSnippets(this.props.term, this.props.session); 31 | //this.wordCloud = ReactFauxDom.createElement('div'); 32 | } 33 | 34 | componentWillReceiveProps(nextProps){ 35 | if(nextProps.focusContext && this.state.term == nextProps.focusTermContext){ 36 | return; 37 | } 38 | this.setState({term:nextProps.term}); 39 | this.getTermSnippets(nextProps.term, nextProps.session); 40 | 41 | } 42 | 43 | 44 | getTermSnippets(term, session){ 45 | $.post( 46 | '/getTermSnippets', 47 | {'term': term, 'session': JSON.stringify(session)}, 48 | function(data) { 49 | this.onLoadedTermsSnippets(data); 50 | }.bind(this)).fail(function() { 51 | console.log("Something wrong happen. Try again."); 52 | }.bind(this)); 53 | } 54 | 55 | // Responds to loaded terms snippets. 56 | onLoadedTermsSnippets(data) { 57 | var term = data.term; 58 | var tags = data.tags; 59 | var context = data.context; 60 | 61 | var termObj = {term: term, tags: tags}; 62 | 63 | var termSnippets = []; 64 | $.each(context, function(url, context){ 65 | var termSnippet = {}; 66 | termSnippet['term'] = termObj; 67 | termSnippet['url'] = url; 68 | termSnippet['snippet'] = context; 69 | termSnippets.push(termSnippet); 70 | }); 71 | 72 | //var lazyUpdate = true; 73 | //this.termsSnippetsViewer.clear(lazyUpdate); 74 | //this.termsSnippetsViewer.addItems(termSnippets); 75 | this.update(termSnippets); 76 | }; 77 | 78 | /** 79 | * Updates viewer. 80 | */ 81 | update(items_) { 82 | // Removes missing items. 83 | select("#termsSnippet").selectAll('*').remove(); 84 | 85 | var items = select("#termsSnippet") 86 | .attr("width", 300) 87 | .attr("height", 300) 88 | .selectAll('.item').data(items_, function(item, i) { 89 | //console.log(item.term.term + '-' + i + '-' + item.snippet.substring(0, 30)); 90 | return item.term.term + '-' + i + '-' + item.snippet.substring(0, 30); 91 | }); 92 | 93 | // New items. 94 | items.enter() 95 | .append('div') 96 | .classed('noselect', true) 97 | .classed('item', true) 98 | .html(function(item, i) { 99 | var snippet = (item.snippet.replace(//g, "")).replace("", ""); 100 | return '

' + snippet + '

'; 101 | }) 102 | .style('cursor', 'pointer') 103 | .on('click', function(item, i) { 104 | var elem = select(this); 105 | elem.classed('dblclicked', !elem.classed('dblclicked')); 106 | window.open(item.url, '_blank'); 107 | }); 108 | 109 | items.each(function(item, i) { 110 | console.log("---"); 111 | var tags = item.term['tags']; 112 | var isPositive = tags.indexOf('Positive') != -1; 113 | var isNegative = tags.indexOf('Negative') != -1; 114 | /*select(this).selectAll('em') 115 | .classed('Positive', isPositive) 116 | .classed('Negative', isNegative);*/ 117 | }); 118 | }; 119 | 120 | 121 | render() { 122 | if(this.state.term!==""){ 123 | return ( 124 |
125 |
126 | ); 127 | } 128 | else { 129 | return(
); 130 | } 131 | 132 | } 133 | } 134 | 135 | TermsSnippetViewer.propTypes = { 136 | width: React.PropTypes.number.isRequired, 137 | height: React.PropTypes.number.isRequired, 138 | }; 139 | 140 | TermsSnippetViewer.defaultProps = { 141 | width: 300, 142 | height: 300, 143 | }; 144 | 145 | 146 | 147 | export default TermsSnippetViewer; 148 | -------------------------------------------------------------------------------- /client/src/components/Filters.js: -------------------------------------------------------------------------------- 1 | import React, {Component} from 'react'; 2 | import {Card, CardHeader, CardMedia} from 'material-ui/Card'; 3 | import FiltersTabs from './FiltersTabs'; 4 | import Avatar from 'material-ui/Avatar'; 5 | import CheckList from 'material-ui/svg-icons/av/playlist-add-check'; 6 | 7 | const styles = { 8 | card: { 9 | background: 'white', 10 | borderRadius: '0px 0px 0px 0px', 11 | borderStyle: 'solid', 12 | borderColor: '#C09ED7', 13 | borderWidth: '1px 0px 1px 0px' 14 | 15 | }, 16 | avatar:{ 17 | margin:'-4px 8px 0px 0px', 18 | }, 19 | cardHeader:{ 20 | background: '#DCCCE7', 21 | padding:'10px 1px 10px 6px', 22 | borderRadius: '0px 0px 0px 0px', 23 | }, 24 | cardMedia:{ 25 | background: '#DCCCE7', 26 | padding:'0px 4px 2px 4px', 27 | borderRadius: '0px 0px 0px 0px', 28 | height: "500px", 29 | }, 30 | 31 | }; 32 | 33 | class Filters extends Component{ 34 | 35 | constructor(props) { 36 | super(props); 37 | this.state = { 38 | expanded: undefined, 39 | sessionString:"", 40 | session: undefined, 41 | sizeAvatar:undefined, 42 | checked_queries:[], 43 | checked_tags:[], 44 | }; 45 | this.queryFromSearch=true; 46 | } 47 | 48 | componentWillMount(){ 49 | this.setState({ 50 | expanded: this.props.statedCard, 51 | session:this.props.session, 52 | sessionString:JSON.stringify(this.props.session), 53 | sizeAvatar:this.props.sizeAvatar, 54 | }); 55 | } 56 | 57 | componentWillReceiveProps(nextProps) { 58 | this.queryFromSearch = (this.props.queryFromSearch ===undefined)?false:true; 59 | // Calculate new state 60 | if(nextProps.statedCard !== this.state.statedCard){ 61 | this.setState({expanded: nextProps.statedCard}, function() { 62 | this.setState({expanded: nextProps.statedCard}); 63 | }); 64 | } 65 | if(JSON.stringify(nextProps.session) !== this.state.sessionString || this.queryFromSearch){ 66 | this.setState({ 67 | session:nextProps.session, 68 | sessionString:JSON.stringify(this.props.session), 69 | }); 70 | } 71 | else{ 72 | return; 73 | } 74 | } 75 | 76 | shouldComponentUpdate(nextProps, nextState) { 77 | this.queryFromSearch = (this.props.queryFromSearch ===undefined)?false:true; 78 | //console.log("filter before shouldComponentUpdate"); 79 | //console.log(this.props.update); 80 | if(nextProps.updateCrawlerData==="updateCrawler" || nextProps.updateCrawlerData==="stopCrawler" || this.queryFromSearch || this.props.update || JSON.stringify(nextProps.session) !== this.state.sessionString || nextProps.statedCard !== this.state.statedCard || JSON.stringify(nextState.session) !== this.state.sessionString) { 81 | return true; 82 | } 83 | //console.log("filter after shouldComponentUpdate"); 84 | return false; 85 | } 86 | 87 | 88 | handleExpandChange = (expanded) => { 89 | this.setState({expanded: expanded}); 90 | if(expanded){ 91 | this.props.setActiveMenu(expanded, 1); 92 | } 93 | } 94 | 95 | handleToggle = (event, toggle) => { 96 | this.setState({expanded: toggle}); 97 | } 98 | 99 | handleExpand = () => { 100 | this.setState({expanded: true}); 101 | } 102 | 103 | handleReduce 
= () => { 104 | this.setState({expanded: false}); 105 | } 106 | 107 | //Send information about a query that is running (from FiltersTabs.js to Body.js) 108 | updateStatusMessage(value, term){ 109 | this.props.updateStatusMessage(value, term); 110 | } 111 | 112 | updateSession(newSession){ 113 | this.setState({ 114 | session:newSession, 115 | sessionString:JSON.stringify(newSession), 116 | }); 117 | this.props.updateSession(newSession); 118 | } 119 | 120 | deletedFilter(sessionTemp){ 121 | this.setState({ 122 | session:sessionTemp, sessionString: JSON.stringify(sessionTemp) 123 | }); 124 | this.props.deletedFilter(sessionTemp); 125 | } 126 | 127 | /*updateCheckedQueries(checkedQueries){ 128 | this.setState({ 129 | checked_queries:checkedQueries, 130 | }); 131 | } 132 | */ 133 | 134 | render(){ 135 | return( 136 | 137 | } />} 140 | style={styles.cardHeader} 141 | actAsExpander={true} 142 | showExpandableButton={true} 143 | /> 144 | 145 | 146 | 147 | 148 | ) 149 | } 150 | 151 | } 152 | 153 | export default Filters; 154 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # dd_api documentation build configuration file, created by 4 | # sphinx-quickstart on Fri Apr 7 14:41:38 2017. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | # If extensions (or modules to document with autodoc) are in another directory, 16 | # add these directories to sys.path here. If the directory is relative to the 17 | # documentation root, use os.path.abspath to make it absolute, like shown here. 18 | # 19 | import os 20 | import sys 21 | sys.path.insert(0, "../") 22 | 23 | 24 | # -- General configuration ------------------------------------------------ 25 | 26 | # If your documentation needs a minimal Sphinx version, state it here. 27 | # 28 | # needs_sphinx = '1.0' 29 | 30 | # Add any Sphinx extension module names here, as strings. They can be 31 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 32 | # ones. 33 | extensions = ['sphinx.ext.intersphinx', 34 | 'sphinx.ext.viewcode', 35 | 'sphinx.ext.githubpages' 36 | ] 37 | 38 | # Add any paths that contain templates here, relative to this directory. 39 | templates_path = ['_templates'] 40 | 41 | # The suffix(es) of source filenames. 42 | # You can specify multiple suffix as a list of string: 43 | # 44 | # source_suffix = ['.rst', '.md'] 45 | source_suffix = '.rst' 46 | 47 | # The master toctree document. 48 | master_doc = 'index' 49 | 50 | # General information about the project. 51 | project = u'Domain Discovery Tool' 52 | copyright = u'2017, New York University' 53 | author = u'New York University' 54 | 55 | # The version info for the project you're documenting, acts as replacement for 56 | # |version| and |release|, also used in various other places throughout the 57 | # built documents. 58 | # 59 | # The short X.Y version. 60 | version = u'1.0' 61 | # The full version, including alpha/beta/rc tags. 62 | release = u'1.0' 63 | 64 | # The language for content autogenerated by Sphinx. Refer to documentation 65 | # for a list of supported languages. 
66 | # 67 | # This is also used if you do content translation via gettext catalogs. 68 | # Usually you set "language" from the command line for these cases. 69 | language = None 70 | 71 | # List of patterns, relative to source directory, that match files and 72 | # directories to ignore when looking for source files. 73 | # This patterns also effect to html_static_path and html_extra_path 74 | exclude_patterns = [] 75 | 76 | # The name of the Pygments (syntax highlighting) style to use. 77 | pygments_style = 'sphinx' 78 | 79 | # If true, `todo` and `todoList` produce output, else they produce nothing. 80 | todo_include_todos = True 81 | 82 | 83 | # -- Options for HTML output ---------------------------------------------- 84 | 85 | # The theme to use for HTML and HTML Help pages. See the documentation for 86 | # a list of builtin themes. 87 | # 88 | html_theme = 'default' 89 | 90 | # Theme options are theme-specific and customize the look and feel of a theme 91 | # further. For a list of options available for each theme, see the 92 | # documentation. 93 | # 94 | # html_theme_options = {} 95 | 96 | # Add any paths that contain custom static files (such as style sheets) here, 97 | # relative to this directory. They are copied after the builtin static files, 98 | # so a file named "default.css" will overwrite the builtin "default.css". 99 | html_static_path = ['_static'] 100 | 101 | 102 | # -- Options for HTMLHelp output ------------------------------------------ 103 | 104 | # Output file base name for HTML help builder. 105 | htmlhelp_basename = 'Domain Discovery Tool' 106 | 107 | 108 | # -- Options for LaTeX output --------------------------------------------- 109 | 110 | latex_elements = { 111 | # The paper size ('letterpaper' or 'a4paper'). 112 | # 113 | # 'papersize': 'letterpaper', 114 | 115 | # The font size ('10pt', '11pt' or '12pt'). 116 | # 117 | # 'pointsize': '10pt', 118 | 119 | # Additional stuff for the LaTeX preamble. 120 | # 121 | # 'preamble': '', 122 | 123 | # Latex figure (float) alignment 124 | # 125 | # 'figure_align': 'htbp', 126 | } 127 | 128 | # Grouping the document tree into LaTeX files. List of tuples 129 | # (source start file, target name, title, 130 | # author, documentclass [howto, manual, or own class]). 131 | latex_documents = [ 132 | (master_doc, 'ddt.tex', u'ddt Documentation', 133 | u'Yamuna Krishnamurthy', 'manual'), 134 | ] 135 | 136 | 137 | # -- Options for manual page output --------------------------------------- 138 | 139 | # One entry per manual page. List of tuples 140 | # (source start file, name, description, authors, manual section). 141 | man_pages = [ 142 | (master_doc, 'ddt', u'ddt Documentation', 143 | [author], 1) 144 | ] 145 | 146 | 147 | # -- Options for Texinfo output ------------------------------------------- 148 | 149 | # Grouping the document tree into Texinfo files. List of tuples 150 | # (source start file, target name, title, author, 151 | # dir menu entry, description, category) 152 | texinfo_documents = [ 153 | (master_doc, 'ddt', u'ddt Documentation', 154 | author, 'ddt', 'One line description of project.', 155 | 'Miscellaneous'), 156 | ] 157 | 158 | # Example configuration for intersphinx: refer to the Python standard library. 
159 | intersphinx_mapping = {'https://docs.python.org/': None} 160 | 161 | -------------------------------------------------------------------------------- /client/src/components/CrawlingView.js: -------------------------------------------------------------------------------- 1 | import React, { Component } from 'react'; 2 | import { Col, Row} from 'react-bootstrap'; 3 | // From https://github.com/oliviertassinari/react-swipeable-views 4 | import Terms from './Terms'; 5 | import DeepCrawling from './DeepCrawling'; 6 | import FocusedCrawling from './FocusedCrawling'; 7 | import {Tabs, Tab} from 'material-ui/Tabs'; 8 | import SwipeableViews from 'react-swipeable-views'; 9 | import $ from 'jquery'; 10 | 11 | import MultiselectTable from './MultiselectTable'; 12 | 13 | const styles = { 14 | slide: { 15 | padding: 10, 16 | }, 17 | content: { 18 | marginTop: '5px', 19 | marginRight: '5px', 20 | marginBottom: '8px', 21 | marginLeft: '5px', 22 | backgroundColor: '#FFFFFF', 23 | borderRadius: '10px 10px 10px 10px', 24 | }, 25 | }; 26 | 27 | 28 | class CrawlingView extends Component { 29 | 30 | constructor(props) { 31 | super(props); 32 | this.state = { 33 | disableStopCrawlerSignal:true, 34 | disableAcheInterfaceSignal:true, 35 | disabledStartCrawler:false, //false 36 | disabledCreateModel:true, //false 37 | messageCrawler:"", 38 | openCreateModel: false, 39 | slideIndex: 0, 40 | pages:{}, 41 | openDialogLoadUrl: false, 42 | currentTags:undefined, 43 | deepCrawlableDomains: [], 44 | deepCrawlableDomainsFromTag: [], 45 | resetSelection: false, 46 | openLoadURLs: false, 47 | session:{}, 48 | crawlerServers: {}, 49 | }; 50 | 51 | } 52 | 53 | 54 | 55 | /** 56 | * Creating session to get the urls with deep crawl tag. 57 | * @method createSession 58 | * @param {string} domainId 59 | */ 60 | /*consultaQueries: {"search_engine":"GOOG","activeProjectionAlg":"Group by Correlation" 61 | ,"domainId":"AVWjx7ciIf40cqEj1ACn","pagesCap":"100","fromDate":null,"toDate":null, 62 | "filter":null,"pageRetrievalCriteria":"Most Recent","selected_morelike":"", 63 | "model":{"positive":"Relevant","negative":"Irrelevant"}}*/ 64 | createSession(domainId){ 65 | var session = {}; 66 | session['search_engine'] = "GOOG"; 67 | session['activeProjectionAlg'] = "Group by Correlation"; 68 | session['domainId'] = domainId; 69 | session['pagesCap'] = "5"; 70 | session['fromDate'] = null; 71 | session['toDate'] = null; 72 | session['filter'] = null; //null 73 | session['pageRetrievalCriteria'] = "Most Recent"; 74 | session['selected_morelike'] = ""; 75 | session['selected_queries']=""; 76 | session['selected_tlds']=""; 77 | session['selected_aterms']=""; 78 | session['selected_tags']=""; 79 | session['selected_model_tags']=""; 80 | session['selected_crawled_tags']=""; 81 | session['model'] = {}; 82 | session['model']['positive'] = ["Relevant"]; 83 | session['model']['negative'] = ["Irrelevant"]; 84 | session["from"]=0; 85 | return session; 86 | } 87 | /** 88 | * Set the deepCrawlableDomainsFromTag state for displaying the current tlds in deep crawler tag. 
89 | * @method componentWillMount 90 | * @param 91 | */ 92 | componentWillMount(){ 93 | var temp_session = this.createSession(this.props.domainId); 94 | this.setState({session: temp_session}); 95 | this.getCrawlerServers(); 96 | } 97 | 98 | handleChange = (value) => { 99 | this.setState({ 100 | slideIndex: value, 101 | //valueLoadUrls:[], 102 | //valueLoadUrlsFromTextField:[], 103 | }); 104 | } 105 | 106 | getCrawlerServers(){ 107 | $.post( 108 | '/getCrawlerServers', 109 | {}, 110 | (crawlerServers) => { 111 | console.log("CRAWLER SERVERS"); 112 | console.log(crawlerServers); 113 | this.setState({crawlerServers: crawlerServers}); 114 | this.forceUpdate(); 115 | } 116 | ).fail((error) => { 117 | console.log('getCrawlerServers FAILED ', error); 118 | }); 119 | } 120 | render() { 121 | var disableDeepCrawlerButton =false; 122 | var disableFocusedCrawlerButton = false; 123 | if(this.props.statusCrawlers!== undefined && this.props.statusCrawlers.length > 0){ 124 | // Crawler is executing 125 | this.props.statusCrawlers.forEach(function(obj){ 126 | if(obj.description==="deep"){ 127 | disableDeepCrawlerButton = (obj.status.toLowerCase() === "running")?true:false; 128 | } 129 | if(obj.description==="focused"){ 130 | disableFocusedCrawlerButton = (obj.status.toLowerCase() === "running")?true:false; 131 | } 132 | }.bind(this)); 133 | } 134 | 135 | return ( 136 |
137 | 142 | > 143 | 144 | 145 | 146 | 147 |
148 | 149 |
150 | 151 |
152 | 153 |
154 | 155 |
156 |
157 | ); 158 | } 159 | } 160 | 161 | export default CrawlingView; 162 | -------------------------------------------------------------------------------- /docs/install.rst: -------------------------------------------------------------------------------- 1 | Install and Run 2 | =============== 3 | 4 | You can install the system from source or using Docker. 5 | 6 | Docker Version 7 | -------------- 8 | 9 | You must have Docker installed (`Docker Installation for Mac `_ , `Docker Installation for Ubuntu `_). 10 | 11 | Background Mode 12 | ~~~~~~~~~~~~~~~ 13 | 14 | You must have docker-compose installed to run the background version. For Mac, docker-compose is included in the Docker installation. For Ubuntu, follow the instructions under the Linux tab in `docker compose install for linux `_. 15 | 16 | To run the Docker version in the background, download the following: 17 | 18 | **To run only DDT (no crawlers):** Download :download:`docker-compose.yml <../docker-compose.yml>`. 19 | 20 | **To run DDT, the deep crawler and the focused crawler:** Download the following files into the same directory: 21 | 22 | :download:`docker-compose.yml.ache <../docker-compose.yml.ache>`. Rename the downloaded **docker-compose.yml.ache** to **docker-compose.yml**. 23 | 24 | :download:`ache.yml <../ache.yml>` 25 | 26 | Now use the following commands to run DDT (and crawlers if applicable): 27 | 28 | >>> cd {path-to-downloaded-docker-compose.yml} 29 | >>> docker-compose up -d 30 | 31 | The above commands will start the elasticsearch and DDT processes (and crawlers if applicable). The elasticsearch and DDT (and crawler if applicable) data are stored in the directory {path-to-downloaded-docker-compose.yml}/data. 32 | 33 | You can check the output of the DDT tool using: 34 | 35 | >>> docker logs dd_tool 36 | 37 | You will see the message **"ENGINE Bus STARTED"** when DDT is running successfully. You can now use DDT. 38 | 39 | `Use Domain Discovery Tool `_ 40 | 41 | To shut down the processes, run: 42 | 43 | >>> cd {path-to-downloaded-docker-compose.yml} 44 | >>> docker-compose stop 45 | 46 | Interactive Mode 47 | ~~~~~~~~~~~~~~~~ 48 | 49 | To run the interactive Docker version, download the script :download:`run_docker_ddt <../bin/run_docker_ddt>` and run it: 50 | 51 | >>> cd {path-to-downloaded-run_docker_ddt} 52 | >>> chmod a+x run_docker_ddt 53 | >>> ./run_docker_ddt 54 | 55 | The above script will prompt you to enter a directory where you would like to persist all the web pages for the domains you create. You can enter the path to a directory on the host where you are running DDT, or just press **Enter** to use the default directory, which is {path-to-downloaded-run_docker_ddt}/data. The data is stored in the `elasticsearch `_ data format (you can later use this directory as the data directory for any Elasticsearch instance). The script will start Elasticsearch with the data directory provided. 56 | 57 | The script will then start DDT. You will see the message **"ENGINE Bus STARTED"** when DDT is running successfully. You can now use DDT. 58 | 59 | `Use Domain Discovery Tool `_ 60 | 61 | Troubleshooting 62 | ~~~~~~~~~~~~~~~~ 63 | 64 | In case you see the following error: 65 | 66 | >>> ERROR: for elasticsearch Cannot create container for service elasticsearch: Conflict. The container name "/elastic" is already in use by container b714e105ccbf3a6d5a718c76c2ce1e5a51ea6f10a5f4997a6e5b12b9c7faf50e. You have to remove (or rename) that container to be able to reuse that name.
67 | 68 | run the following command: 69 | 70 | >>> docker rm elastic 71 | 72 | In case you see the following error: 73 | 74 | >>> ERROR: for ddt Cannot create container for service ddt: Conflict. The container name "/dd_tool" is already in use by container 326881fda035692aa0a5c03ec808294aaad2f9fd816baa13270d2fe50e7e1e77. You have to remove (or rename) that container to be able to reuse that name. 75 | 76 | >>> docker rm dd_tool 77 | 78 | Local development 79 | ----------------- 80 | 81 | Building and deploying the Domain Discovery Tool can be done using its Makefile to create a local development environment. The conda build environment is currently only supported on 64-bit OS X and Linux. 82 | 83 | Install Conda 84 | ~~~~~~~~~~~~~~ 85 | 86 | First install `conda (anaconda) for python 2.7 `_. 87 | 88 | Install Java 89 | ~~~~~~~~~~~~~~ 90 | 91 | Install `JDK 1.8 `_. 92 | 93 | Install Elasticsearch 94 | ~~~~~~~~~~~~~~~~~~~~~ 95 | 96 | Download Elasticsearch 1.6.2 `here `_, extract the file, and run Elasticsearch: 97 | 98 | >>> cd {path-to-installed-Elasticsearch} 99 | >>> ./bin/elasticsearch 100 | 101 | Install Domain Discovery API 102 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 103 | 104 | >>> git clone https://github.com/ViDA-NYU/domain_discovery_API 105 | >>> cd domain_discovery_API 106 | 107 | The `make` command builds dd_api and downloads/installs its dependencies: 108 | 109 | >>> make 110 | 111 | 112 | Add domain_discovery_API to the environment: 113 | 114 | >>> export DD_API_HOME="{path-to-cloned-domain_discovery_API-repository}" 115 | 116 | Clone the DDT repository and enter it: 117 | 118 | >>> git clone https://github.com/ViDA-NYU/domain_discovery_tool 119 | >>> cd domain_discovery_tool 120 | 121 | Use the `make` command to build ddt and download/install its dependencies: 122 | 123 | >>> make 124 | 125 | After a successful installation, you can activate the DDT development environment: 126 | 127 | >>> source activate ddt 128 | 129 | Then, from the top-level `domain_discovery_tool` directory, execute: 130 | 131 | >>> ./bin/ddt-dev 132 | 133 | `Use Domain Discovery Tool `_ 134 | -------------------------------------------------------------------------------- /client/src/components/MultiselectTable.js: -------------------------------------------------------------------------------- 1 | import React, { Component } from 'react'; 2 | import PropTypes from 'prop-types'; 3 | 4 | import ReactPaginate from 'react-paginate'; 5 | 6 | import Checkbox from 'material-ui/Checkbox'; 7 | import { Table, TableBody, TableFooter, TableHeader, TableHeaderColumn, 8 | TableRow, TableRowColumn } from 'material-ui/Table'; 9 | 10 | // Table built on top of Material UI's Table component to avoid the Select all bug 11 | // TODO: Once Material UI releases the next version this can be discarded 12 | // completely.
Also, add more props for better flexibility 13 | class MultiselectTable extends Component { 14 | constructor(props) { 15 | super(props); 16 | this.state = { 17 | selectedRows: [], 18 | selectAll: false, 19 | currentPage: 0 20 | } 21 | 22 | 23 | this.perPage = 100; 24 | this.toggleSelectOrDeselectAll = this.toggleSelectOrDeselectAll.bind(this); 25 | this.onRowSelection = this.onRowSelection.bind(this); 26 | } 27 | 28 | componentWillReceiveProps(nextProps) { 29 | if(nextProps.resetSelection) 30 | this.setState({selectedRows: [], selectAll: false}); 31 | } 32 | 33 | /** 34 | * Manipulate this.state.selectedRows to mimick SelectAll / DeselectAll 35 | * @method toggleSelectOrDeselectAll (onClick event) 36 | * @param {Object} event 37 | */ 38 | toggleSelectOrDeselectAll(event) { 39 | let selectedRows = !this.state.selectAll ? 40 | this.props.rows.map((reco, index) => index) 41 | : 42 | [] 43 | 44 | this.setState({ 45 | selectAll: !this.state.selectAll, 46 | selectedRows 47 | }); 48 | this.props.onRowSelection && this.props.onRowSelection(selectedRows); 49 | } 50 | 51 | /** 52 | * Set the selectedRows state variable with the checked element id "recommendation-index" 53 | * and call the parent components method to manipulate data 54 | * @method onRowSelection (onClick event) 55 | * @param {number[]} selectedRows 56 | */ 57 | onRowSelection(event) { 58 | var selectedRows = this.state.selectedRows; 59 | var rowId = event.target.id.split("-")[1]; 60 | if(event.target.checked) 61 | selectedRows.push(parseInt(rowId)); 62 | else 63 | selectedRows.splice(selectedRows.indexOf(parseInt(rowId)), 1); 64 | 65 | this.setState({selectedRows, selectAll: false}); 66 | this.props.onRowSelection && this.props.onRowSelection(selectedRows); 67 | } 68 | 69 | render() { 70 | return ( 71 |
72 | 80 | 86 | 87 | 88 | 92 | 93 | { 94 | this.props.columnHeadings.map(column => 95 | 96 | {column} 97 | 98 | ) 99 | } 100 | 101 | 102 | 108 | { 109 | this.props.rows.slice( 110 | this.state.currentPage * this.perPage, 111 | (this.state.currentPage + 1) * this.perPage 112 | ).map((row, index) => 113 | 114 | 115 | 120 | 121 | {row[0]} 122 | {(row[1]['score'] === undefined)? '1, '+row[1]['count']: row[1]['score'].toFixed(3)+', '+row[1]['count']} 123 | 124 | ) 125 | } 126 | 127 | 128 |
129 |
130 | ...} 135 | breakClassName={"break-me"} 136 | pageCount={(this.props.rows || []).length/this.perPage} 137 | marginPagesDisplayed={1} 138 | pageRangeDisplayed={1} 139 | onPageChange={(page) => {this.setState({currentPage: page.selected})}} 140 | containerClassName={"pagination"} 141 | subContainerClassName={"pages pagination"} 142 | activeClassName={"active"} /> 143 |
144 |
145 | ) 146 | } 147 | } 148 | 149 | MultiselectTable.propTypes = { 150 | rows: PropTypes.array, 151 | columnHeadings: PropTypes.array, 152 | onRowSelection: PropTypes.func, 153 | resetSelection: PropTypes.bool 154 | } 155 | 156 | export default MultiselectTable; 157 | -------------------------------------------------------------------------------- /client/src/components/Terms.js: -------------------------------------------------------------------------------- 1 | // Filename: Terms.js 2 | // Purpose: This is an intermediate component between Body.js and TermsList.js. It handles the changes in Terms card. 3 | // Author: Sonia Castelo (scastelo2@gmail.com) 4 | import React, {Component} from 'react'; 5 | import {Card, CardHeader, CardMedia} from 'material-ui/Card'; 6 | import TermsList from './TermsList'; 7 | import Avatar from 'material-ui/Avatar'; 8 | import Assignment from 'material-ui/svg-icons/action/assignment-returned'; 9 | import Divider from 'material-ui/Divider'; 10 | import $ from 'jquery'; 11 | import CircularProgress from 'material-ui/CircularProgress'; 12 | 13 | 14 | class CircularProgressSimple extends React.Component{ 15 | render(){ 16 | return( 17 |
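{/* Material-UI CircularProgress spinner, rendered while the term list is still being fetched */}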
18 | 19 |
20 | );} 21 | } 22 | 23 | 24 | class Terms extends Component{ 25 | 26 | constructor(props) { 27 | super(props); 28 | this.state = { 29 | expanded: this.props.statedCard, 30 | update:true, 31 | listTerms: [], 32 | session:{}, 33 | sessionString:"", 34 | fromCrawling:false, 35 | }; 36 | }; 37 | 38 | componentWillMount = () => { 39 | this.setState({expanded: this.props.statedCard, session:this.props.session, sessionString:JSON.stringify(this.props.session) , fromCrawling:this.props.fromCrawling,}); 40 | this.loadTerms(); 41 | }; 42 | 43 | 44 | //Handling state's changes of search card. (expanded or reduced) 45 | componentWillReceiveProps = (nextProps) => { 46 | // Calculate new state 47 | if(nextProps.statedCard !== this.state.statedCard){ 48 | this.setState({expanded: nextProps.statedCard}, function() { 49 | this.setState({expanded: nextProps.statedCard}); 50 | }); 51 | } 52 | 53 | if(JSON.stringify(nextProps.session) !== this.state.sessionString && nextProps.statedCard){ 54 | this.setState({ 55 | session:nextProps.session, 56 | sessionString:JSON.stringify(this.props.session), 57 | listTerms: [], 58 | }); 59 | this.loadTerms(); 60 | } 61 | else{ 62 | return; 63 | } 64 | 65 | }; 66 | 67 | handleExpandChange = (expanded) => { 68 | this.setState({expanded: expanded}); 69 | if(expanded){ 70 | this.props.setActiveMenu(expanded, 3); 71 | 72 | } 73 | }; 74 | 75 | handleReduce = () => { 76 | this.setState({expanded: false}); 77 | }; 78 | 79 | 80 | loadTerms(){ 81 | var session = this.props.session; 82 | $.post( 83 | '/extractTerms', 84 | {'numberOfTerms': 40, 'session': JSON.stringify(session)}, 85 | function(summary) { 86 | var entries = []; 87 | entries = summary.map(function(w) { 88 | return {'word': w[0], 'posFreq': w[1], 'negFreq': w[2], 'tags': w[3]} 89 | }); 90 | this.setState({listTerms: entries}); 91 | this.updateTerms(entries); //sending new terms to FocusedCrawling component 92 | }.bind(this)).fail(function() { 93 | console.log("Something wrong happen. 
Try again."); 94 | }.bind(this)); 95 | }; 96 | 97 | //sending new terms to FocusedCrawling component 98 | updateTerms(updateListTerm){ 99 | if(this.props.updateTerms != undefined) 100 | this.props.updateTerms(updateListTerm); 101 | } 102 | 103 | updateListTermParent(updateListTerm){ 104 | this.setState({listTerms: updateListTerm}); 105 | this.updateTerms(updateListTerm); //sending new terms to FocusedCrawling component 106 | this.forceUpdate(); 107 | } 108 | 109 | //Check if the component should be updated or not 110 | shouldComponentUpdate(nextProps, nextState) { 111 | if(JSON.stringify(nextProps.session) !== this.state.sessionString || nextProps.statedCard !== this.state.statedCard || JSON.stringify(nextState.session) !== this.state.sessionString || this.props.focusedCrawlDomains) { 112 | return true; 113 | } 114 | return false; 115 | } 116 | 117 | 118 | render(){ 119 | 120 | const styles = { 121 | card: { 122 | borderStyle: 'solid', 123 | borderColor: '#C09ED7', 124 | background: 'white', 125 | borderRadius: '0px 0px 0px 0px', 126 | borderWidth: '0px 0px 0px 0px' 127 | }, 128 | avatar:{ 129 | margin:'-4px 8px 0px 0px', 130 | }, 131 | cardHeader:{ 132 | background: this.props.BackgroundColorTerm, //'#DCCCE7', 133 | padding:'10px 1px 10px 6px', 134 | borderRadius: '0px 0px 0px 0px', 135 | }, 136 | cardMedia:{ 137 | background: this.props.BackgroundColorTerm, 138 | padding:'2px 4px 2px 4px', 139 | borderRadius: '0px 0px 0px 0px', 140 | height: "390px", 141 | }, 142 | 143 | }; 144 | 145 | let terms = " "; 146 | if(this.state.listTerms.length>0){ 147 | terms = this.state.listTerms.map(function(w) { 148 | return

{w["word"]}

; 149 | }); 150 | } 151 | var isThereTerms = (this.state.listTerms.length>0)?:; 152 | var avatarElement = (this.props.renderAvatar)?} /> 153 | :null; 154 | return( 155 | 156 | 157 | 164 | 165 | 166 |
167 | {isThereTerms} 168 |
169 |
170 |
171 | ) 172 | } 173 | 174 | } 175 | 176 | export default Terms; 177 | -------------------------------------------------------------------------------- /client/src/components/RadViz.js: -------------------------------------------------------------------------------- 1 | // Filename: Header.js 2 | // Purpose: Shows just information about the current domain. From here, the user can change of domain too. 3 | //Dependencies: Body.js 4 | // Author: Sonia Castelo (scastelo2@gmail.com) 5 | 6 | import React, { Component } from 'react'; 7 | import Header from './Header'; 8 | import $ from 'jquery'; 9 | import {scaleOrdinal} from 'd3-scale'; 10 | import Dialog from 'material-ui/Dialog'; 11 | import FlatButton from 'material-ui/FlatButton'; 12 | import RadVizComponent from 'radviz-component'; 13 | import FiltersTabs from './FiltersTabs' 14 | class RadViz extends Component { 15 | 16 | constructor(props) { 17 | super(props); 18 | this.state = { 19 | flat:0, 20 | data:undefined, 21 | colors:undefined, 22 | originalData:undefined, 23 | dimNames:[], 24 | filterTerm:"", 25 | open:false, 26 | session:"", 27 | sessionString:{}, 28 | 29 | }; 30 | this.colorTags= [ "#9E9E9E", "#0D47A1", "#C62828"]; 31 | }; 32 | 33 | loadDataFromElasticSearch(session, filterTerm){ 34 | //var session = this.props.session; 35 | 36 | if(!(Object.keys(session).length === 0)){ 37 | //console.log(session); 38 | 39 | if(session['pagesCap']<100) 40 | session['pagesCap']=100; 41 | 42 | 43 | //session['pagesCap']='1000'; 44 | $.post( 45 | '/getRadvizPoints', 46 | {'session': JSON.stringify(session), filterByTerm: filterTerm}, 47 | function(es) { 48 | var data = JSON.parse(es); 49 | let numericalData = []; 50 | let dimNames = Object.keys(data); 51 | let scaleColor = scaleOrdinal(this.colorTags); 52 | let colors = []; 53 | data['Model Result'] = []; 54 | 55 | for (let i = 0; i < data['labels'].length; ++i){ 56 | data['Model Result'][i] = "neutral"; 57 | data['labels'][i]= data['labels'][i].split(','); 58 | //colors.push(scaleColor(data['tags'][0])); 59 | let aux = {}; 60 | for (let j = 0; j < dimNames.length-2; ++j){//except urls and labels 61 | aux[dimNames[j]] = parseFloat(data[dimNames[j]][i]); 62 | } 63 | numericalData.push(aux); 64 | } 65 | dimNames.push('Model Result'); 66 | $.post( 67 | '/computeTSP', 68 | { }, 69 | function(es) { 70 | let numericalDataTSP = []; 71 | var orderObj = JSON.parse(es); 72 | for (let i = 0; i < numericalData.length; ++i){ 73 | let aux = {}; 74 | for(var j in orderObj.cities){ 75 | aux[dimNames[orderObj.cities[j]]] = numericalData[i][dimNames[orderObj.cities[j]]]; 76 | } 77 | numericalDataTSP.push(aux); 78 | } 79 | this.setState({originalData: data, data:numericalDataTSP, colors:colors, flat:1, dimNames: dimNames, filterTerm: filterTerm}); 80 | //this.props.setDimNames(dimNames); 81 | }.bind(this) 82 | ); 83 | }.bind(this) 84 | ).fail(function() { 85 | this.setState({open: true}); 86 | }.bind(this)); 87 | } 88 | } 89 | 90 | componentWillMount(){ 91 | //console.log(this.props.session); 92 | //this.props.session['pagesCap']="100" 93 | // console.log(this.props.session); 94 | 95 | this.loadDataFromElasticSearch(this.props.session, this.state.filterTerm); 96 | //this.setState({ session:this.props.session, sessionString:JSON.stringify(this.props.session)}); 97 | }; 98 | 99 | componentWillReceiveProps = (newProps, nextState) => { 100 | if(newProps.reloadRadViz){ 101 | var reload = false; 102 | for (var i = 0; i < newProps.urlsToRadviz.length; i++) { 103 | for (let j = 0; j < 
this.state.originalData['urls'].length; ++j){ 104 | if(this.state.originalData['urls'][j] === newProps.urlsToRadviz[i]) { 105 | reload = true; 106 | break; 107 | } 108 | } 109 | } 110 | if(reload){ 111 | this.setState({ session:newProps.session, sessionString:JSON.stringify(newProps.session)}); 112 | this.loadDataFromElasticSearch(newProps.session, this.state.filterTerm); 113 | } 114 | } 115 | 116 | 117 | if(JSON.stringify(newProps.session) ===this.state.sessionString){ 118 | if(newProps.queryFromSearch){ 119 | this.setState({ session:newProps.session, sessionString:JSON.stringify(newProps.session)}); 120 | this.loadDataFromElasticSearch(newProps.session, this.state.filterTerm); 121 | } 122 | else{ 123 | return; 124 | } 125 | } 126 | this.setState({ session:newProps.session, sessionString:JSON.stringify(newProps.session)}); 127 | this.loadDataFromElasticSearch(newProps.session, this.state.filterTerm); 128 | }; 129 | 130 | updatePagesCap(newNroPAges){ 131 | 132 | var session = JSON.parse(JSON.stringify(this.props.session)); 133 | session['pagesCap'] = newNroPAges; 134 | this.loadDataFromElasticSearch(session, this.state.filterTerm); 135 | 136 | // this.setState({idDomain: this.props.location.query.idDomain}); 137 | 138 | } 139 | //Filter by terms (ex. ebola AND virus) 140 | filterKeyword(filterTerm){ 141 | this.loadDataFromElasticSearch(this.props.session, filterTerm); 142 | } 143 | 144 | handleOpen = () => { 145 | this.setState({open: true}); 146 | }; 147 | 148 | handleClose = () => { 149 | this.setState({open: false}); 150 | }; 151 | updateTagColor(){ 152 | this.props.updateTagColor(); 153 | } 154 | reloadFilters(){ 155 | this.props.reloadFilters(); 156 | }; 157 | updateOnlineAccuracy(accuracy){ 158 | this.props.updateOnlineAccuracy(accuracy); 159 | }; 160 | render() { 161 | if(!(Object.keys(this.state.session).length === 0)){ 162 | const actions = [ 163 | , 168 | ]; 169 | //console.log(this.state.session); 170 | return ( 171 |
172 | 173 |
174 | ); 175 | } 176 | else { 177 | return( 178 |
179 |
180 | ); 181 | } 182 | } 183 | } 184 | 185 | export default RadViz; 186 | -------------------------------------------------------------------------------- /client/src/components/Home.js: -------------------------------------------------------------------------------- 1 | import React, { Component } from 'react'; 2 | var ReactRouter = require('react-router'); 3 | var Link = ReactRouter.Link; 4 | 5 | import Checkbox from 'material-ui/Checkbox'; 6 | import {List, ListItem} from 'material-ui/List'; 7 | import Subheader from 'material-ui/Subheader'; 8 | import { Row, Col} from 'react-bootstrap'; 9 | //import {Toolbar, ToolbarGroup, ToolbarSeparator, ToolbarTitle} from 'material-ui/Toolbar'; 10 | 11 | import FlatButton from 'material-ui/FlatButton'; 12 | import Forward from 'material-ui/svg-icons/content/forward'; 13 | import AddBox from 'material-ui/svg-icons/content/add-box'; 14 | import DeleteForever from 'material-ui/svg-icons/action/delete-forever'; 15 | import {fullWhite} from 'material-ui/styles/colors'; 16 | import $ from 'jquery'; 17 | 18 | import AppBar from 'material-ui/AppBar'; 19 | import logoNYU from '../images/nyu_logo_purple.png'; 20 | import Dialog from 'material-ui/Dialog'; 21 | import TextField from 'material-ui/TextField'; 22 | import RaisedButton from 'material-ui/RaisedButton'; 23 | 24 | const styles = { 25 | listDomains:{ 26 | borderStyle: 'solid', 27 | borderColor: '#C09ED7', 28 | background: 'white', 29 | borderRadius: '0px 0px 0px 0px', 30 | borderWidth: '0px 0px 1px 0px', 31 | }, 32 | }; 33 | 34 | 35 | class Home extends Component { 36 | constructor(props){ 37 | super(props); 38 | this.state = { 39 | domains: undefined, 40 | openCreateDomain: false, 41 | openDeleteDomain: false, 42 | openDuplicateDomainName:false, 43 | newNameDomain:"", 44 | delDomains: {} 45 | }; 46 | this.focusTextField = this.focusTextField.bind(this); 47 | this.textInput = null; 48 | } 49 | 50 | getAvailableDomains(){ 51 | $.post( 52 | '/getAvailableDomains', 53 | {"type": "init"}, 54 | function(domains) { 55 | this.setState({domains: domains['crawlers']}); 56 | }.bind(this) 57 | ); 58 | } 59 | componentWillMount() { 60 | //Get domains. 61 | this.getAvailableDomains(); 62 | } 63 | 64 | handleOpenCreateDomain = () => { 65 | this.setState({openCreateDomain: true}); 66 | this.focusTextField(); 67 | }; 68 | 69 | handleCloseCreateDomain = () => { 70 | this.setState({openCreateDomain: false, openDuplicateDomainName:false, newNameDomain:"" }); 71 | }; 72 | 73 | handleOpenDeleteDomain = () => { 74 | this.setState({openDeleteDomain: true}); 75 | }; 76 | 77 | handleCloseDeleteDomain = () => { 78 | this.setState({openDeleteDomain: false}); 79 | }; 80 | 81 | handleCloseDuplicateDomainName = () => { 82 | this.setState({openDuplicateDomainName: false}); 83 | }; 84 | 85 | //Handling changes into TextField newNameDomain (updating TextField). 
86 | handleTextChangeNewNameDomain(e){ 87 | this.setState({ "newNameDomain": e.target.value}); 88 | } 89 | 90 | // Explicitly focus the text input using the raw DOM API 91 | focusTextField() { 92 | setTimeout(() => this.textInput.focus(), 100); 93 | } 94 | 95 | //Create a new domain 96 | createNewDomain(){ 97 | //createNewDomain 98 | var nameDomain= this.state.newNameDomain; 99 | var duplicateDomain = false; 100 | var mydata = this.state.domains; 101 | Object.keys(mydata).map((k, index)=>{ var name = mydata[k].name; if(name.trim().toLowerCase().replace(/\s+/g,"_") === nameDomain.trim().toLowerCase().replace(/\s+/g,"_")){ duplicateDomain = true;} }); // .trim() to remove last and first spaces from a string. 102 | if(!duplicateDomain){ 103 | $.post( 104 | '/addDomain', 105 | {'index_name': nameDomain}, 106 | function(domains) { 107 | this.setState({openCreateDomain: false, newNameDomain:"", openDuplicateDomainName:false }); 108 | this.getAvailableDomains(); 109 | this.forceUpdate(); 110 | }.bind(this) 111 | ); 112 | } 113 | else{ 114 | this.setState({openCreateDomain: true, newNameDomain:nameDomain, openDuplicateDomainName:true}); 115 | } 116 | }; 117 | 118 | //Delete selected domains 119 | deleteDomains(){ 120 | var delDomains= this.state.delDomains; 121 | $.post( 122 | '/delDomain', 123 | {'domains': JSON.stringify(delDomains)}, 124 | function(domains) { 125 | this.setState({openDeleteDomain: false, delDomains: {}}); 126 | this.getAvailableDomains(); 127 | this.forceUpdate(); 128 | }.bind(this) 129 | ); 130 | }; 131 | 132 | // Get all the domains selected for deletion 133 | addDelDomains(id,name){ 134 | var tempDelDomains = this.state.delDomains; 135 | tempDelDomains[id]=name; 136 | this.setState({delDomains:tempDelDomains}); 137 | } 138 | 139 | render(){ 140 | 141 | const actionsCreateDomain = [ 142 | , 147 | , 153 | ]; 154 | 155 | const actionsDeleteDomain = [ 156 | , 161 | , 167 | ]; 168 | 169 | const actionsDuplicateDomainName = [ 170 | , 175 | ]; 176 | 177 | if(this.state.domains!==undefined){ 178 | var mydata = this.state.domains; 179 | return ( 180 |
181 | Domain Discovery Tool } 184 | //iconElementLeft={} 185 | iconElementLeft={logo NYU} 186 | //onLeftIconButtonTouchTap={this.removeRecord.bind(this)} 187 | > 188 | 189 |
190 |
191 | 192 | 193 | 194 |

Domains

195 | {Object.keys(mydata).map((k, index)=>{ 196 | return 197 | } /> 200 | 201 | })} 202 |
203 | 204 | 205 | 206 | } 211 | style={{margin:'70px 10px 30px 10px'}} 212 | onTouchTap={this.handleOpenCreateDomain.bind(this)} 213 | /> 214 | } 219 | style={{margin:'70px 10px 30px 10px'}} 220 | onTouchTap={this.handleOpenDeleteDomain.bind(this)} 221 | /> 222 | 223 | 230 | { this.textInput = input;}} 232 | value={this.state.newNameDomain} 233 | onChange={this.handleTextChangeNewNameDomain.bind(this)} 234 | onKeyPress={(e) => {(e.key === 'Enter') ? this.createNewDomain(this) : null}} 235 | hintText="Enter the domain name." 236 | hintStyle={{ marginLeft:10}} 237 | inputStyle={{marginBottom:10, marginLeft:10, paddingRight:20}} 238 | /> 239 | 240 | 241 | 248 |
249 | {Object.keys(mydata).map((k, index)=>{ 250 | return 256 | })} 257 |
258 |
259 | 266 | The domain name was already entered. All domain names must be unique. Please try again. 267 | 268 | 269 |
270 |
271 |
272 |
273 | ); 274 | } 275 | return( 276 |
277 | ); 278 | } 279 | } 280 | 281 | Home.defaultProps = { 282 | backgroundColor:"#9A7BB0", 283 | }; 284 | 285 | export default Home; 286 | -------------------------------------------------------------------------------- /client/src/components/Body.js: -------------------------------------------------------------------------------- 1 | // Filename: Body.js 2 | // Purpose: Contains the filter and view components. Any change in the filter or view components must be propagated through Body.js. 3 | // Author: Sonia Castelo (scastelo2@gmail.com) 4 | import React, {Component} from 'react'; 5 | import { Row, Col} from 'react-bootstrap'; 6 | import DomainInfo from './DomainInfo'; 7 | import Search from './Search'; 8 | import Filters from './Filters'; 9 | import Terms from './Terms'; 10 | import Views from './Views'; 11 | import CrawlingView from './CrawlingView'; 12 | import '../css/Components.css'; 13 | import 'react-checkbox-tree/lib/react-checkbox-tree.css'; 14 | import 'react-select/dist/react-select.css'; 15 | import Sidebar from 'react-sidebar'; 16 | import Plus from 'material-ui/svg-icons/action/swap-horiz'; 17 | import FloatingActionButton from 'material-ui/FloatingActionButton'; 18 | import Snackbar from 'material-ui/Snackbar'; //Indicator that tells the user that pages are being downloaded 19 | 20 | import {Card, CardActions, CardHeader, CardText, CardMedia} from 'material-ui/Card'; 21 | import {List, ListItem} from 'material-ui/List'; 22 | import Subheader from 'material-ui/Subheader'; 23 | 24 | const styles = { 25 | button:{ 26 | //hoverColor:"#9c9ca4" 27 | marginTop:20, 28 | paddingBottom:'-145px', 29 | marginBottom:'-545px', 30 | marginRight: 5, 31 | }, 32 | contentHeaderMenuLink: { 33 | textDecoration: 'none', 34 | color: 'white', 35 | padding: 8, 36 | }, 37 | content: { 38 | marginTop: '68px', 39 | marginRight: '5px', 40 | marginBottom: '8px', 41 | marginLeft: '5px', 42 | backgroundColor: '#FFFFFF', 43 | borderRadius: '10px 10px 10px 10px', 44 | }, 45 | avatar:{ 46 | margin:'-4px 8px 0px 0px', 47 | }, 48 | 49 | // 50 | card: { 51 | 52 | borderStyle: 'solid', 53 | borderColor: '#C09ED7', 54 | background: 'white', 55 | borderRadius: '0px 0px 0px 0px', 56 | borderWidth: '0px 0px 1px 0px' 57 | }, 58 | cardHeader:{ 59 | background: '#DCCCE7', 60 | padding:'10px 1px 10px 6px', 61 | borderRadius: '0px 0px 0px 0px', 62 | }, 63 | cardMedia:{ 64 | background: '#DCCCE7', 65 | padding:'2px 4px 2px 4px', 66 | borderRadius: '0px 0px 0px 0px', 67 | height: "200px", 68 | }, 69 | }; 70 | 71 | 72 | 73 | class Body extends Component{ 74 | 75 | constructor(props) { 76 | super(props); 77 | this.state = { 78 | docked: true, 79 | open: true, 80 | transitions: true, 81 | touch: true, 82 | shadow: true, 83 | pullRight: false, 84 | touchHandleWidth: 20, 85 | dragToggleDistance: 30, 86 | size:350, 87 | iconDomainInfo:null, 88 | stateDomainInfoCard:false, 89 | stateSearchCard:false, 90 | offset:0, 91 | currentPagination:0, 92 | stateFiltersCard:false, 93 | stateTermsCard:false, 94 | sizeAvatar:25, 95 | currentDomain:'', 96 | sessionBody:{}, 97 | sessionString:"", 98 | pages:{}, 99 | update:false, 100 | runCurrentQuery: "*", 101 | intervalFuncId:undefined, 102 | stopApplyQueryOverView:false, //Allows interaction with the data (applying filters, tagging, etc.) while multiple searches are running.
103 | }; 104 | this.sessionB={}; 105 | } 106 | 107 | /*consultaQueries: {"search_engine":"GOOG","activeProjectionAlg":"Group by Correlation" 108 | ,"domainId":"AVWjx7ciIf40cqEj1ACn","pagesCap":"100","fromDate":null,"toDate":null, 109 | "filter":null,"pageRetrievalCriteria":"Most Recent","selected_morelike":"", 110 | "model":{"positive":"Relevant","negative":"Irrelevant"}}*/ 111 | createSession(domainId){ 112 | var session = {}; 113 | session['search_engine'] = "GOOG"; 114 | session['activeProjectionAlg'] = "Group by Correlation"; 115 | session['domainId'] = domainId; 116 | session['pagesCap'] = "100"; 117 | session['fromDate'] = null; 118 | session['toDate'] = null; 119 | session['filter'] = null; //null 120 | session['pageRetrievalCriteria'] = "Most Recent"; 121 | session['selected_morelike'] = ""; 122 | session['selected_queries']=""; 123 | session['selected_tlds']=""; 124 | session['selected_aterms']=""; 125 | session['selected_tags']=""; 126 | session['selected_model_tags']=""; 127 | session['selected_crawled_tags']=""; 128 | session['model'] = {}; 129 | session['model']['positive'] = "Relevant"; 130 | session['model']['negative'] = "Irrelevant"; 131 | 132 | 133 | 134 | return session; 135 | } 136 | 137 | //Get the queries, tags, and urls for a specific domain. 138 | componentWillMount() { 139 | this.setState({currentDomain: this.props.currentDomain, sessionBody: this.createSession(this.props.currentDomain), sessionString: JSON.stringify(this.createSession(this.props.currentDomain)) }); 140 | } 141 | 142 | //Handle updates of props (e.g. filters, session, etc.) 143 | componentWillReceiveProps = (newProps) => { 144 | if(newProps.reloadBody){ 145 | let sessionTemp = this.state.sessionBody; 146 | sessionTemp['filter']= (newProps.filterKeyword === '')?null:newProps.filterKeyword; 147 | this.setState({sessionBody: sessionTemp, sessionString: JSON.stringify(sessionTemp)}); 148 | } 149 | if(newProps.currentDomain === this.state.currentDomain){ 150 | return; 151 | } 152 | 153 | this.setState({currentDomain: this.props.currentDomain}); 154 | 155 | }; 156 | 157 | //Verify whether an update is necessary. 158 | shouldComponentUpdate(nextProps, nextState) { 159 | if (nextState.sessionString === this.state.sessionString) { 160 | if(nextProps.updateCrawlerData=="updateCrawler" || nextProps.updateCrawlerData=="stopCrawler" || (nextProps.filterKeyword !== null && nextProps.filterKeyword !== "") || nextState.stateDomainInfoCard!==this.state.stateDomainInfoCard || nextState.stateSearchCard!==this.state.stateSearchCard || nextState.stateTermsCard!==this.state.stateTermsCard || nextState.stateFiltersCard!==this.state.stateFiltersCard){ 161 | return true; 162 | } 163 | return false; 164 | } 165 | return true; 166 | } 167 | 168 | //Handling menus of DomainInfo, Search, and Filter Cards. 169 | closeMenu(){ 170 | this.setState({ 171 | size: 60, 172 | //iconDomainInfo:} />, 173 | //stateDomainInfoCard:false, 174 | open: !this.state.open, 175 | sizeAvatar:35, 176 | }); 177 | } 178 | 179 | //Handling menus of DomainInfo, Search, and Filter Cards. 180 | openMenu(){ 181 | this.setState({ 182 | size: 350, 183 | iconDomainInfo:null, 184 | open: !this.state.open, 185 | sizeAvatar:25, 186 | }); 187 | } 188 | 189 | //Handling close/open of DomainInfo, Search, and Filter Cards.
190 | openDockMenu(){ 191 | if(this.state.open){ 192 | this.closeMenu(); 193 | this.setState({ 194 | stateDomainInfoCard:false, 195 | stateSearchCard:false, 196 | stateFiltersCard:false, 197 | stateTermsCard:false, 198 | });} 199 | else{ 200 | this.openMenu(); 201 | this.setState({ 202 | stateDomainInfoCard:false, 203 | stateSearchCard:false, 204 | stateFiltersCard:false, 205 | stateTermsCard:false, 206 | }); 207 | } 208 | } 209 | 210 | setActiveMenu (expanded, menu) { 211 | if(!this.state.open){ 212 | this.openMenu(); 213 | } 214 | //stateSearchCard: menu=0 215 | //stateFiltersCard: menu=1 216 | //stateDomainInfoCard: menu=2 217 | //stateTermsCard: menu=3 218 | var item = menu===0 ? this.setState({stateSearchCard: expanded, stateFiltersCard :!expanded, stateDomainInfoCard:!expanded, stateTermsCard:!expanded}) : 219 | (menu===1 ? this.setState({stateFiltersCard: expanded, stateSearchCard: !expanded, stateDomainInfoCard:!expanded, stateTermsCard:!expanded}) : 220 | menu===2 ? this.setState({ stateDomainInfoCard:expanded, stateFiltersCard: !expanded, stateSearchCard: !expanded, stateTermsCard:!expanded}): 221 | this.setState({stateTermsCard:expanded, stateDomainInfoCard:!expanded, stateFiltersCard: !expanded, stateSearchCard: !expanded})); 222 | } 223 | 224 | //Called from the FiltersTabs component. Adds or removes a query or tag that was used to filter the data. 225 | deletedFilter(sessionTemp){ 226 | this.props.deletedFilter(sessionTemp["filter"]); 227 | this.setState({ 228 | sessionBody:sessionTemp, sessionString: JSON.stringify(sessionTemp), stopApplyQueryOverView:true, 229 | }); 230 | this.updateSession(sessionTemp); 231 | } 232 | 233 | // Update the pages that have changed (for example, pages returned from a web query) 234 | updatePages(pages){ 235 | this.setState({pages:pages}); 236 | } 237 | 238 | // Update the status message 239 | updateStatusMessage(value, term){ 240 | this.setState({update:value, runCurrentQuery: term}); 241 | this.forceUpdate(); 242 | } 243 | 244 | // Start a timer and fetch the pages for the given query as they become available on the server 245 | getQueryPages(term){ 246 | if(this.state.intervalFuncId !== undefined) 247 | this.queryPagesDone(); 248 | 249 | this.setState({stopApplyQueryOverView:false, intervalFuncId: window.setInterval(function() {this.applyFilterByQuery(term);}.bind(this), 1000)}); 250 | 251 | } 252 | 253 | applyFilterByQuery(term){ 254 | var session =this.state.sessionBody; 255 | if(!this.state.stopApplyQueryOverView){ 256 | session['newPageRetrievalCriteria'] = "one"; 257 | session['pageRetrievalCriteria'] = "Queries"; 258 | session['selected_queries']=term; 259 | } 260 | this.updateSession(session); 261 | 262 | } 263 | 264 | // Stop the timer once all downloaded pages for the query have been retrieved from the server 265 | queryPagesDone(){ 266 | window.clearInterval(this.state.intervalFuncId); 267 | this.setState({intervalFuncId:undefined, stopApplyQueryOverView:false,}); 268 | } 269 | 270 | //Update session 271 | updateSession(newSession){ 272 | this.setState({sessionBody: newSession , sessionString: JSON.stringify(newSession), stopApplyQueryOverView:true,}); 273 | this.forceUpdate(); 274 | } 275 | 276 | reloadFilters(){ 277 | this.setState({update:true}); 278 | this.forceUpdate(); 279 | this.setState({update:false}); 280 | }; 281 | 282 | availableCrawlerButton(isthereModel){ 283 | this.props.availableCrawlerButton(isthereModel); 284 | } 285 | 286 | // Update pagination 287 | handlePageClick(offset, currentPagination){ 288 |
this.setState({offset: offset, currentPagination:currentPagination}); 289 | } 290 | 291 | 292 | render(){ 293 | 294 | if(this.props.selectedViewBody===1) //explore data view 295 | { 296 | //console.log(this.state.sessionBody); 297 | //console.log("------body----------"); 298 | const sidebar = (
299 | 300 | 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | 317 | 318 |
319 | ); 320 | 321 | const sidebarProps = { 322 | sidebar: sidebar, 323 | docked: this.state.docked, 324 | sidebarClassName: 'custom-sidebar-class', 325 | open: this.state.open, 326 | touch: this.state.touch, 327 | shadow: this.state.shadow, 328 | pullRight: this.state.pullRight, 329 | touchHandleWidth: this.state.touchHandleWidth, 330 | dragToggleDistance: this.state.dragToggleDistance, 331 | transitions: this.state.transitions, 332 | onSetOpen: this.onSetOpen, 333 | }; 334 | 335 | return ( 336 | 337 |
338 | 339 | 340 | 341 |
342 | 347 |
348 | ) 349 | } 350 | else //crawling view 351 | { 352 | return( 353 |
354 | 355 |
356 | ) 357 | } 358 | 359 | 360 | } 361 | } 362 | 363 | 364 | 365 | export default Body; 366 | --------------------------------------------------------------------------------
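The components above (RadViz.js, Home.js, Body.js) all share the same client/server request pattern: the current session object, built by Body.createSession(), is serialized with JSON.stringify and sent to a server endpoint through jQuery's $.post, and the JSON response is parsed in the success callback. Below is a minimal sketch of that pattern, assuming the /getRadvizPoints endpoint used in RadViz.js; fetchRadvizPoints and onData are hypothetical names introduced here purely for illustration, and the example domainId is the one shown in the consultaQueries comment in Body.js.

// Minimal sketch of the client/server request pattern shared by RadViz.js,
// Home.js, and Body.js. The endpoint name and session fields are taken from
// the source above; fetchRadvizPoints and onData are hypothetical helpers.
import $ from 'jquery';

// Mirrors the session object built by Body.createSession().
const session = {
  search_engine: 'GOOG',
  activeProjectionAlg: 'Group by Correlation',
  domainId: 'AVWjx7ciIf40cqEj1ACn', // example domain id from the comment in Body.js
  pagesCap: '100',
  fromDate: null,
  toDate: null,
  filter: null,
  pageRetrievalCriteria: 'Most Recent',
  selected_morelike: '',
  selected_queries: '',
  selected_tags: '',
  model: { positive: 'Relevant', negative: 'Irrelevant' },
};

function fetchRadvizPoints(session, filterTerm, onData) {
  // The server expects the session serialized as a JSON string plus an
  // optional keyword filter (e.g. "ebola AND virus"); the response is JSON
  // text keyed by dimension name, with parallel 'urls' and 'labels' arrays
  // (see loadDataFromElasticSearch in RadViz.js).
  $.post(
    '/getRadvizPoints',
    { session: JSON.stringify(session), filterByTerm: filterTerm },
    function (es) {
      onData(JSON.parse(es));
    }
  ).fail(function () {
    console.error('getRadvizPoints request failed');
  });
}

// Usage: fetch the unfiltered points and list the returned dimensions.
fetchRadvizPoints(session, '', (data) => console.log(Object.keys(data)));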