├── docs
    ├── _data
    │   ├── terms.yml
    │   ├── glossary.yml
    │   ├── definitions.yml
    │   ├── tags.yml
    │   ├── topnav.yml
    │   ├── sidebars
    │   │   └── home_sidebar.yml
    │   └── alerts.yml
    ├── .gitignore
    ├── _layouts
    │   ├── none.html
    │   ├── page_print.html
    │   ├── default_print.html
    │   ├── page.html
    │   └── default.html
    ├── images
    │   ├── favicon.ico
    │   ├── doc_example.png
    │   ├── company_logo.png
    │   ├── workflowarrow.png
    │   ├── company_logo_big.png
    │   └── export_example.png
    ├── _includes
    │   ├── inline_image.html
    │   ├── callout.html
    │   ├── note.html
    │   ├── tip.html
    │   ├── important.html
    │   ├── warning.html
    │   ├── archive.html
    │   ├── image.html
    │   ├── search_google_custom.html
    │   ├── footer.html
    │   ├── google_analytics.html
    │   ├── toc.html
    │   ├── search_simple_jekyll.html
    │   ├── links.html
    │   ├── head_print.html
    │   ├── sidebar.html
    │   ├── topnav.html
    │   ├── initialize_shuffle.html
    │   └── head.html
    ├── fonts
    │   ├── FontAwesome.otf
    │   ├── fontawesome-webfont.eot
    │   ├── fontawesome-webfont.ttf
    │   ├── fontawesome-webfont.woff
    │   ├── glyphicons-halflings-regular.eot
    │   ├── glyphicons-halflings-regular.ttf
    │   ├── glyphicons-halflings-regular.woff
    │   └── glyphicons-halflings-regular.woff2
    ├── css
    │   ├── fonts
    │   │   ├── FontAwesome.otf
    │   │   ├── fontawesome-webfont.eot
    │   │   ├── fontawesome-webfont.ttf
    │   │   ├── fontawesome-webfont.woff
    │   │   └── fontawesome-webfont.woff2
    │   ├── boxshadowproperties.css
    │   ├── modern-business.css
    │   ├── theme-green.css
    │   ├── theme-blue.css
    │   ├── printstyles.css
    │   └── syntax.css
    ├── sidebar.json
    ├── Gemfile
    ├── tooltips.json
    ├── sitemap.xml
    ├── core.html
    ├── js
    │   ├── jquery.ba-throttle-debounce.min.js
    │   ├── customscripts.js
    │   ├── jquery.navgoco.min.js
    │   ├── toc.js
    │   └── jekyll-search.js
    ├── licenses
    │   ├── LICENSE
    │   └── LICENSE-BSD-NAVGOCO.txt
    ├── feed.xml
    ├── _config.yml
    ├── utils.html
    ├── Gemfile.lock
    ├── main.html
    ├── index.html
    └── tokenizer.html
├── keywords2vec
    ├── __init__.py
    ├── core.py
    ├── imports.py
    ├── _nbdev.py
    ├── utils.py
    ├── main.py
    └── tokenizer.py
├── MANIFEST.in
├── Makefile
├── .github
    └── workflows
    │   └── main.yml
├── setup.py
├── settings.ini
├── .gitignore
├── CONTRIBUTING.md
├── analyze
    ├── README.md
    ├── compare_to_ngrams.py
    └── vocab_size_results.csv
├── 20_utils.ipynb
├── README.md
├── LICENSE
├── index.ipynb
└── 30_main.ipynb


/docs/_data/terms.yml:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/docs/.gitignore:
--------------------------------------------------------------------------------
1 | _site/
2 | 


--------------------------------------------------------------------------------
/docs/_data/glossary.yml:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/docs/_data/definitions.yml:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/docs/_layouts/none.html:
--------------------------------------------------------------------------------
1 | ---
2 | ---
3 | {{content}}


--------------------------------------------------------------------------------
/keywords2vec/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.0.1"
2 | 


--------------------------------------------------------------------------------
/docs/_data/tags.yml:
--------------------------------------------------------------------------------
1 | allowed-tags:
2 |   - getting_started
3 |   - navigation
4 | 


--------------------------------------------------------------------------------
/docs/images/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dperezrada/keywords2vec/HEAD/docs/images/favicon.ico


--------------------------------------------------------------------------------
/docs/_includes/inline_image.html:
--------------------------------------------------------------------------------
1 | <img class="inline" src="images/{{include.file}}" alt="{{include.alt}}" />
2 | 


--------------------------------------------------------------------------------
/docs/fonts/FontAwesome.otf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dperezrada/keywords2vec/HEAD/docs/fonts/FontAwesome.otf


--------------------------------------------------------------------------------
/docs/images/doc_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dperezrada/keywords2vec/HEAD/docs/images/doc_example.png


--------------------------------------------------------------------------------
/docs/images/company_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dperezrada/keywords2vec/HEAD/docs/images/company_logo.png


--------------------------------------------------------------------------------
/docs/images/workflowarrow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dperezrada/keywords2vec/HEAD/docs/images/workflowarrow.png


--------------------------------------------------------------------------------
/docs/css/fonts/FontAwesome.otf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dperezrada/keywords2vec/HEAD/docs/css/fonts/FontAwesome.otf


--------------------------------------------------------------------------------
/docs/images/company_logo_big.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dperezrada/keywords2vec/HEAD/docs/images/company_logo_big.png


--------------------------------------------------------------------------------
/docs/images/export_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dperezrada/keywords2vec/HEAD/docs/images/export_example.png


--------------------------------------------------------------------------------
/docs/_includes/callout.html:
--------------------------------------------------------------------------------
1 | <div markdown="span" class="bs-callout bs-callout-{{include.type}}">{{include.content}}</div>
2 | 


--------------------------------------------------------------------------------
/docs/fonts/fontawesome-webfont.eot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dperezrada/keywords2vec/HEAD/docs/fonts/fontawesome-webfont.eot


--------------------------------------------------------------------------------
/docs/fonts/fontawesome-webfont.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dperezrada/keywords2vec/HEAD/docs/fonts/fontawesome-webfont.ttf


--------------------------------------------------------------------------------
/docs/fonts/fontawesome-webfont.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dperezrada/keywords2vec/HEAD/docs/fonts/fontawesome-webfont.woff


--------------------------------------------------------------------------------
/keywords2vec/core.py:
--------------------------------------------------------------------------------
1 | # AUTOGENERATED! DO NOT EDIT! File to edit: 00_core.ipynb (unless otherwise specified).
2 | 
3 | __all__ = []


--------------------------------------------------------------------------------
/docs/css/fonts/fontawesome-webfont.eot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dperezrada/keywords2vec/HEAD/docs/css/fonts/fontawesome-webfont.eot


--------------------------------------------------------------------------------
/docs/css/fonts/fontawesome-webfont.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dperezrada/keywords2vec/HEAD/docs/css/fonts/fontawesome-webfont.ttf


--------------------------------------------------------------------------------
/docs/css/fonts/fontawesome-webfont.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dperezrada/keywords2vec/HEAD/docs/css/fonts/fontawesome-webfont.woff


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include settings.ini
2 | include LICENSE
3 | include CONTRIBUTING.md
4 | include README.md
5 | recursive-exclude * __pycache__
6 | 


--------------------------------------------------------------------------------
/docs/css/fonts/fontawesome-webfont.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dperezrada/keywords2vec/HEAD/docs/css/fonts/fontawesome-webfont.woff2


--------------------------------------------------------------------------------
/docs/fonts/glyphicons-halflings-regular.eot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dperezrada/keywords2vec/HEAD/docs/fonts/glyphicons-halflings-regular.eot


--------------------------------------------------------------------------------
/docs/fonts/glyphicons-halflings-regular.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dperezrada/keywords2vec/HEAD/docs/fonts/glyphicons-halflings-regular.ttf


--------------------------------------------------------------------------------
/docs/fonts/glyphicons-halflings-regular.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dperezrada/keywords2vec/HEAD/docs/fonts/glyphicons-halflings-regular.woff


--------------------------------------------------------------------------------
/docs/fonts/glyphicons-halflings-regular.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dperezrada/keywords2vec/HEAD/docs/fonts/glyphicons-halflings-regular.woff2


--------------------------------------------------------------------------------
/docs/_includes/note.html:
--------------------------------------------------------------------------------
1 | <div markdown="span" class="alert alert-info" role="alert"><i class="fa fa-info-circle"></i> <b>Note:</b> {{include.content}}</div>
2 | 


--------------------------------------------------------------------------------
/docs/_includes/tip.html:
--------------------------------------------------------------------------------
1 | <div markdown="span" class="alert alert-success" role="alert"><i class="fa fa-check-square-o"></i> <b>Tip:</b> {{include.content}}</div>


--------------------------------------------------------------------------------
/docs/_includes/important.html:
--------------------------------------------------------------------------------
1 | <div markdown="span" class="alert alert-warning" role="alert"><i class="fa fa-warning"></i> <b>Important:</b> {{include.content}}</div>


--------------------------------------------------------------------------------
/docs/sidebar.json:
--------------------------------------------------------------------------------
1 | {
2 |   "keywords2vec": {
3 |     "Overview": "/",
4 |     "Tokenizer": "/tokenizer",
5 |     "Utils": "/utils",
6 |     "Main": "/main"
7 |   }
8 | }


--------------------------------------------------------------------------------
/docs/_includes/warning.html:
--------------------------------------------------------------------------------
1 | <div markdown="span" class="alert alert-danger" role="alert"><i class="fa fa-exclamation-circle"></i> <b>Warning:</b> {{include.content}}</div>


--------------------------------------------------------------------------------
/docs/Gemfile:
--------------------------------------------------------------------------------
1 | source "https://rubygems.org"
2 | 
3 | gem 'github-pages', group: :jekyll_plugins
4 | 
5 | 
6 | # Added at 2019-11-25 10:11:40 -0800 by jhoward:
7 | gem "jekyll", "~> 3.7"
8 | 


--------------------------------------------------------------------------------
/keywords2vec/imports.py:
--------------------------------------------------------------------------------
 1 | import gzip
 2 | import os
 3 | import re
 4 | 
 5 | import unidecode
 6 | import nltk
 7 | 
 8 | from stop_words import safe_get_stop_words
 9 | from annoy import AnnoyIndex
10 | 


--------------------------------------------------------------------------------
/docs/_data/topnav.yml:
--------------------------------------------------------------------------------
 1 | topnav:
 2 | - title: Topnav
 3 |   items:
 4 |     - title: GitHub
 5 |       external_url: https://github.com/dperezrada/keywords2vec
 6 | 
 7 | #Topnav dropdowns
 8 | topnav_dropdowns:
 9 | - title: Topnav dropdowns
10 |   folders:


--------------------------------------------------------------------------------
/docs/_includes/archive.html:
--------------------------------------------------------------------------------
 1 | ---
 2 | layout: default
 3 | type: archive
 4 | ---
 5 | 
 6 | <div class="post-header">
 7 |   <h1 class="post-title-main">{{ page.title }}</h1>
 8 | </div>
 9 | <div class="post-content">
10 | 
11 | {{ content }}
12 | </div>
13 | 
14 |  
15 | 
16 | 


--------------------------------------------------------------------------------
/docs/tooltips.json:
--------------------------------------------------------------------------------
 1 | ---
 2 | layout: null
 3 | search: exclude
 4 | ---
 5 | 
 6 | {
 7 | "entries":
 8 | [
 9 | {% for page in site.tooltips %}
10 | {
11 | "doc_id": "{{ page.doc_id }}",
12 | "body": "{{ page.content | strip_newlines | replace: '\', '\\\\' | replace: '"', '\\"' }}"
13 | } {% unless forloop.last %},{% endunless %}
14 | {% endfor %}
15 | ]
16 | }
17 | 
18 | 
19 | 
20 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | SRC = $(wildcard ./*.ipynb)
 2 | 
 3 | all: build docs clean
 4 | 
 5 | build: $(SRC)
 6 | 	nbdev_build_lib
 7 | 
 8 | docs: $(SRC)
 9 | 	nbdev_build_docs
10 | 	touch docs
11 | 
12 | test:
13 | 	nbdev_test_nbs
14 | 
15 | pypi: dist
16 | 	twine upload --repository pypi dist/*
17 | 
18 | dist: clean
19 | 	python setup.py sdist bdist_wheel
20 | 
21 | clean:
22 | 	nbdev_clean_nbs
23 | 	rm -rf dist
24 | 


--------------------------------------------------------------------------------
/docs/_includes/image.html:
--------------------------------------------------------------------------------
1 | <figure>{% if {{include.url}} %}<a class="no_icon" target="_blank" href="{{include.url}}">{% endif %}<img class="docimage" src="{{include.file}}" alt="{{include.alt}}" {% if {{include.max-width}} %}style="max-width: {{include.max-width}}px"{% endif %} />{% if {{include.url}} %}</a>{% endif %}{% if {{include.caption}} %}<figcaption>{{include.caption}}</figcaption>{% endif %}</figure>
2 | 


--------------------------------------------------------------------------------
/docs/_layouts/page_print.html:
--------------------------------------------------------------------------------
 1 | ---
 2 | layout: default_print
 3 | comments: true
 4 | ---
 5 | <div class="post-header">
 6 |     <h1 class="post-title-main" id="{{page.permalink | replace: '/', '' }}">{{ page.title }}</h1>
 7 | </div>
 8 | 
 9 | <div class="post-content">
10 | 
11 |     {% if page.summary %}
12 |     <div class="summary">{{page.summary}}</div>
13 |     {% endif %}
14 |     {{ content }}
15 | </div>
16 | 


--------------------------------------------------------------------------------
/docs/_includes/search_google_custom.html:
--------------------------------------------------------------------------------
 1 | <script>
 2 |   (function() {
 3 |     var cx = '{{site.google_search}}';
 4 |     var gcse = document.createElement('script');
 5 |     gcse.type = 'text/javascript';
 6 |     gcse.async = true;
 7 |     gcse.src = 'https://cse.google.com/cse.js?cx=' + cx;
 8 |     var s = document.getElementsByTagName('script')[0];
 9 |     s.parentNode.insertBefore(gcse, s);
10 |   })();
11 | </script>
12 | 
13 | <div id="gcs-search-container">
14 |     <gcse:search></gcse:search>
15 | </div>
16 | 
17 | 


--------------------------------------------------------------------------------
/docs/_includes/footer.html:
--------------------------------------------------------------------------------
 1 | <footer>
 2 |             <div class="row">
 3 |                 <div class="col-lg-12 footer">
 4 |                   <p><img src="{{ "/images/company_logo.png" | prepend: site.baseurl }}" alt="Company logo"/></p>
 5 |                &copy;{{ site.time | date: "%Y"  }} {{site.company_name}}. All rights reserved. <br />
 6 | {% if page.last_updated %}<span>Page last updated:</span> {{page.last_updated}}<br/>{% endif %} Site last generated: {{ site.time | date: "%b %-d, %Y"  }} <br />
 7 |                 </div>
 8 |             </div>
 9 | </footer>
10 | 


--------------------------------------------------------------------------------
/docs/_layouts/default_print.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html lang="en">
 3 | <html>
 4 | <head>
 5 |     {% include head_print.html %}
 6 | 
 7 | 
 8 | </head>
 9 | 
10 | <body class="{% if page.type == "title"%}title{% elsif page.type == "frontmatter" %}frontmatter{% elsif page.type == "first_page" %}first_page{% endif %} print">
11 | 
12 | <!-- Page Content -->
13 | <div class="container">
14 |     <!-- Content Column -->
15 |     <div class="col-md-9">
16 | 
17 |         {{content}}
18 |     </div>
19 | 
20 | </div>    <!-- /.container -->
21 | 
22 | </body>
23 | 
24 | </html>
25 | 
26 | 


--------------------------------------------------------------------------------
/docs/_includes/google_analytics.html:
--------------------------------------------------------------------------------
1 | <!-- the google_analytics_id gets auto inserted from the config file -->
2 | 
3 | {% if site.google_analytics %}
4 | 
5 | <script>(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)})(window,document,'script','//www.google-analytics.com/analytics.js','ga');ga('create','{{site.google_analytics}}','auto');ga('require','displayfeatures');ga('send','pageview');</script>
6 | {% endif %}


--------------------------------------------------------------------------------
/docs/sitemap.xml:
--------------------------------------------------------------------------------
 1 | ---
 2 | layout: none
 3 | search: exclude
 4 | ---
 5 | 
 6 | <?xml version="1.0" encoding="UTF-8"?>
 7 | <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
 8 |   {% for post in site.posts %}
 9 |   {% unless post.search == "exclude" %}
10 |   <url>
11 |     <loc>{{site.url}}{{post.url}}</loc>
12 |   </url>
13 |   {% endunless %}
14 |   {% endfor %}
15 | 
16 | 
17 |   {% for page in site.pages %}
18 |   {% unless page.search == "exclude" %}
19 |   <url>
20 |     <loc>{{site.url}}{{ page.url}}</loc>
21 |   </url>
22 |   {% endunless %}
23 |   {% endfor %}
24 | </urlset>


--------------------------------------------------------------------------------
/docs/core.html:
--------------------------------------------------------------------------------
 1 | ---
 2 | 
 3 | title: module name here
 4 | 
 5 | keywords: fastai
 6 | sidebar: home_sidebar
 7 | 
 8 | summary: "API details."
 9 | ---
10 | <!--
11 | 
12 | #################################################
13 | ### THIS FILE WAS AUTOGENERATED! DO NOT EDIT! ###
14 | #################################################
15 | # file to edit: 00_core.ipynb
16 | # command to build the docs after a change: nbdev_build_docs
17 | 
18 | -->
19 | 
20 | <div class="container" id="notebook-container">
21 |     
22 | <div class="cell border-box-sizing code_cell rendered">
23 | 
24 | </div>
25 | </div>
26 |  
27 | 
28 | 


--------------------------------------------------------------------------------
/docs/css/boxshadowproperties.css:
--------------------------------------------------------------------------------
 1 | /* box-shadow fonts return errors with prince, so extracting here to put in web output only */
 2 | 
 3 | #search-demo-container ul#results-container {
 4 |     box-shadow: 2px 3px 2px #dedede;
 5 | }
 6 | 
 7 | 
 8 | hr.shaded {
 9 |     box-shadow: inset 0 6px 6px -6px rgba(0,0,0,0.5);
10 | }
11 | 
12 | .videoThumbs img {
13 |     box-shadow: 2px 2px 1px #f0f0f0;
14 | }
15 | 
16 | .box {
17 |     box-shadow: 2px 2px 4px #dedede;
18 | }
19 | 
20 | @media (max-width: 1200px) {
21 |     .navbar-collapse {
22 |         box-shadow: inset 0 1px 0 rgba(255,255,255,0.1);
23 |     }
24 | }
25 | 


--------------------------------------------------------------------------------
/docs/_data/sidebars/home_sidebar.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | #################################################
 3 | ### THIS FILE WAS AUTOGENERATED! DO NOT EDIT! ###
 4 | #################################################
 5 | # Instead edit ../../sidebar.json
 6 | entries:
 7 | - folders:
 8 |   - folderitems:
 9 |     - output: web,pdf
10 |       title: Overview
11 |       url: /
12 |     - output: web,pdf
13 |       title: Tokenizer
14 |       url: /tokenizer
15 |     - output: web,pdf
16 |       title: Utils
17 |       url: /utils
18 |     - output: web,pdf
19 |       title: Main
20 |       url: /main
21 |     output: web
22 |     title: keywords2vec
23 |   output: web
24 |   title: Sidebar
25 | 


--------------------------------------------------------------------------------
/docs/js/jquery.ba-throttle-debounce.min.js:
--------------------------------------------------------------------------------
1 | /*
2 |  * jQuery throttle / debounce - v1.1 - 3/7/2010
3 |  * http://benalman.com/projects/jquery-throttle-debounce-plugin/
4 |  * 
5 |  * Copyright (c) 2010 "Cowboy" Ben Alman
6 |  * Dual licensed under the MIT and GPL licenses.
7 |  * http://benalman.com/about/license/
8 |  */
9 | (function(b,c){var $=b.jQuery||b.Cowboy||(b.Cowboy={}),a;$.throttle=a=function(e,f,j,i){var h,d=0;if(typeof f!=="boolean"){i=j;j=f;f=c}function g(){var o=this,m=+new Date()-d,n=arguments;function l(){d=+new Date();j.apply(o,n)}function k(){h=c}if(i&&!h){l()}h&&clearTimeout(h);if(i===c&&m>e){l()}else{if(f!==true){h=setTimeout(i?k:l,i===c?e-m:e)}}}if($.guid){g.guid=j.guid=j.guid||$.guid++}return g};$.debounce=function(d,e,f){return f===c?a(d,e,false):a(d,f,e!==false)}})(this);


--------------------------------------------------------------------------------
/docs/_includes/toc.html:
--------------------------------------------------------------------------------
 1 | 
 2 | <!-- this handles the automatic toc. use ## for subheads to auto-generate the on-page minitoc. if you use html tags, you must supply an ID for the heading element in order for it to appear in the minitoc. -->
 3 | <script>
 4 | $( document ).ready(function() {
 5 |   // Handler for .ready() called.
 6 | 
 7 | $('#toc').toc({ minimumHeaders: 0, listType: 'ul', showSpeed: 0, headers: 'h2,h3,h4' });
 8 | 
 9 | /* this offset helps account for the space taken up by the floating toolbar. */
10 | $('#toc').on('click', 'a', function() {
11 |   var target = $(this.getAttribute('href'))
12 |     , scroll_target = target.offset().top
13 | 
14 |   $(window).scrollTop(scroll_target - 10);
15 |   return false
16 | })
17 |   
18 | });
19 | </script>
20 | 
21 | <div id="toc"></div>
22 | 


--------------------------------------------------------------------------------
/docs/_includes/search_simple_jekyll.html:
--------------------------------------------------------------------------------
 1 | <div id="search-demo-container">
 2 |     <input type="text" id="search-input" placeholder="{{site.data.strings.search_placeholder_text}}">
 3 |     <ul id="results-container"></ul>
 4 | </div>
 5 | <script src="{{ "js/jekyll-search.js"}}" type="text/javascript"></script>
 6 | <script type="text/javascript">
 7 |         SimpleJekyllSearch.init({
 8 |             searchInput: document.getElementById('search-input'),
 9 |             resultsContainer: document.getElementById('results-container'),
10 |             dataSource: '{{ "search.json" }}',
11 |             searchResultTemplate: '<li><a href="{url}" title="{{page.title | escape }}">{title}</a></li>',
12 | noResultsText: '{{site.data.strings.search_no_results_text}}',
13 |         limit: 10,
14 |         fuzzy: true,
15 | })
16 | </script>
17 | 


--------------------------------------------------------------------------------
/docs/_data/alerts.yml:
--------------------------------------------------------------------------------
 1 | tip: '<div class="alert alert-success" role="alert"><i class="fa fa-check-square-o"></i> <b>Tip: </b>'
 2 | note: '<div class="alert alert-info" role="alert"><i class="fa fa-info-circle"></i> <b>Note: </b>'
 3 | important: '<div class="alert alert-warning" role="alert"><i class="fa fa-warning"></i> <b>Important: </b>'
 4 | warning: '<div class="alert alert-danger" role="alert"><i class="fa fa-exclamation-circle"></i> <b>Warning: </b>'
 5 | end: '</div>'
 6 | 
 7 | callout_danger: '<div class="bs-callout bs-callout-danger">'
 8 | callout_default: '<div class="bs-callout bs-callout-default">'
 9 | callout_primary: '<div class="bs-callout bs-callout-primary">'
10 | callout_success: '<div class="bs-callout bs-callout-success">'
11 | callout_info: '<div class="bs-callout bs-callout-info">'
12 | callout_warning: '<div class="bs-callout bs-callout-warning">'
13 | 
14 | hr_faded: '<hr class="faded"/>'
15 | hr_shaded: '<hr class="shaded"/>'


--------------------------------------------------------------------------------
/keywords2vec/_nbdev.py:
--------------------------------------------------------------------------------
 1 | # AUTOGENERATED BY NBDEV! DO NOT EDIT!
 2 | 
 3 | __all__ = ["index", "modules", "custom_doc_links", "git_url"]
 4 | 
 5 | index = {"NUMBERS_STOPWORDS": "10_tokenizer.ipynb",
 6 |          "prepare_stopwords": "10_tokenizer.ipynb",
 7 |          "tokenize_one": "10_tokenizer.ipynb",
 8 |          "get_nodes_for_ntlk": "10_tokenizer.ipynb",
 9 |          "tokenize_by_nltk": "10_tokenizer.ipynb",
10 |          "tokenize": "10_tokenizer.ipynb",
11 |          "parallel": "20_utils.ipynb",
12 |          "num_cpus": "20_utils.ipynb",
13 |          "open_file": "20_utils.ipynb",
14 |          "chunk_of_text": "20_utils.ipynb",
15 |          "get_file_chunks": "20_utils.ipynb",
16 |          "tokenize_file": "30_main.ipynb",
17 |          "train_model": "30_main.ipynb",
18 |          "similars_tree_from_model": "30_main.ipynb",
19 |          "get_similars": "30_main.ipynb",
20 |          "similars_tree": "30_main.ipynb"}
21 | 
22 | modules = ["tokenizer.py",
23 |            "utils.py",
24 |            "main.py"]
25 | 
26 | doc_url = "https://dperezrada.github.io/keywords2vec/"
27 | 
28 | git_url = "https://github.com/dperezrada/keywords2vec/tree/master/"
29 | 
30 | def custom_doc_links(name): return None


--------------------------------------------------------------------------------
/docs/licenses/LICENSE:
--------------------------------------------------------------------------------
 1 | /* This license pertains to the docs template, except for the Navgoco jQuery component. */
 2 | 
 3 | The MIT License (MIT)
 4 | 
 5 | Original theme: Copyright (c) 2016 Tom Johnson
 6 | Modifications: Copyright (c) 2017 onwards fast.ai, Inc
 7 | 
 8 | Permission is hereby granted, free of charge, to any person obtaining a copy
 9 | of this software and associated documentation files (the "Software"), to deal
10 | in the Software without restriction, including without limitation the rights
11 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 | copies of the Software, and to permit persons to whom the Software is
13 | furnished to do so, subject to the following conditions:
14 | 
15 | The above copyright notice and this permission notice shall be included in all
16 | copies or substantial portions of the Software.
17 | 
18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24 | SOFTWARE.
25 | 


--------------------------------------------------------------------------------
/.github/workflows/main.yml:
--------------------------------------------------------------------------------
 1 | name: CI
 2 | on: [push, pull_request]
 3 | jobs:
 4 |   build:
 5 |     runs-on: ubuntu-latest
 6 |     steps:
 7 |     - uses: actions/checkout@v1
 8 |     - uses: actions/setup-python@v1
 9 |       with:
10 |         python-version: '3.6'
11 |         architecture: 'x64'
12 |     - name: Install the library
13 |       run: |
14 |         pip install nbdev jupyter
15 |         pip install -e .
16 |     - name: Read all notebooks
17 |       run: |
18 |         nbdev_read_nbs
19 |     - name: Check if all notebooks are cleaned
20 |       run: |
21 |         echo "Check we are starting with clean git checkout"
22 |         if [ -n "$(git status -uno -s)" ]; then echo "git status is not clean"; false; fi
23 |         echo "Trying to strip out notebooks"
24 |         nbdev_clean_nbs
25 |         echo "Check that strip out was unnecessary"
26 |         git status -s # display the status to see which nbs need cleaning up
27 |         if [ -n "$(git status -uno -s)" ]; then echo -e "!!! Detected unstripped out notebooks\n!!!Remember to run nbdev_install_git_hooks"; false; fi
28 |     - name: Check if there is no diff library/notebooks
29 |       run: |
30 |         if [ -n "$(nbdev_diff_nbs)" ]; then echo -e "!!! Detected difference between the notebooks and the library"; false; fi
31 |     - name: Run tests
32 |       run: |
33 |         nbdev_test_nbs
34 | 


--------------------------------------------------------------------------------
/docs/feed.xml:
--------------------------------------------------------------------------------
 1 | ---
 2 | search: exclude
 3 | layout: none
 4 | ---
 5 | 
 6 | <?xml version="1.0" encoding="UTF-8"?>
 7 | <rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
 8 |     <channel>
 9 |         <title>{{ site.title | xml_escape }}</title>
10 |         <description>{{ site.description | xml_escape }}</description>
11 |         <link>{{ site.url }}/</link>
12 |         <atom:link href="{{ "/feed.xml" | prepend: site.url }}" rel="self" type="application/rss+xml"/>
13 |         <pubDate>{{ site.time | date_to_rfc822 }}</pubDate>
14 |         <lastBuildDate>{{ site.time | date_to_rfc822 }}</lastBuildDate>
15 |         <generator>Jekyll v{{ jekyll.version }}</generator>
16 |         {% for post in site.posts limit:10 %}
17 |         <item>
18 |             <title>{{ post.title | xml_escape }}</title>
19 |             <description>{{ post.content | xml_escape }}</description>
20 |             <pubDate>{{ post.date | date_to_rfc822 }}</pubDate>
21 |             <link>{{ post.url | prepend: site.url }}</link>
22 |             <guid isPermaLink="true">{{ post.url | prepend: site.url }}</guid>
23 |             {% for tag in post.tags %}
24 |             <category>{{ tag | xml_escape }}</category>
25 |             {% endfor %}
26 |             {% for tag in page.tags %}
27 |             <category>{{ cat | xml_escape }}</category>
28 |             {% endfor %}
29 |         </item>
30 |         {% endfor %}
31 |     </channel>
32 | </rss>
33 | 


--------------------------------------------------------------------------------
/docs/_config.yml:
--------------------------------------------------------------------------------
 1 | repository: dperezrada/keywords2vec
 2 | output: web
 3 | topnav_title: keywords2vec
 4 | site_title: keywords2vec
 5 | company_name: Daniel Pérez Rada
 6 | description: To generate a word2vec model, but using multi-word keywords instead of single words.
 7 | # Set to false to disable KaTeX math
 8 | use_math: true
 9 | # Add Google analytics id if you have one and want to use it here
10 | google_analytics:
11 | # See http://nbdev.fast.ai/search for help with adding Search
12 | google_search:
13 | 
14 | host: 127.0.0.1
15 | # the preview server used. Leave as is.
16 | port: 4000
17 | # the port where the preview is rendered.
18 | 
19 | exclude:
20 |   - .idea/
21 |   - .gitignore
22 |   - vendor
23 |  
24 | exclude: [vendor]
25 | 
26 | highlighter: rouge
27 | markdown: kramdown
28 | kramdown:
29 |  input: GFM
30 |  auto_ids: true
31 |  hard_wrap: false
32 |  syntax_highlighter: rouge
33 | 
34 | collections:
35 |   tooltips:
36 |     output: false
37 | 
38 | defaults:
39 |   -
40 |     scope:
41 |       path: ""
42 |       type: "pages"
43 |     values:
44 |       layout: "page"
45 |       comments: true
46 |       search: true
47 |       sidebar: home_sidebar
48 |       topnav: topnav
49 |   -
50 |     scope:
51 |       path: ""
52 |       type: "tooltips"
53 |     values:
54 |       layout: "page"
55 |       comments: true
56 |       search: true
57 |       tooltip: true
58 | 
59 | sidebars:
60 | - home_sidebar
61 | permalink: pretty
62 | 
63 | theme: jekyll-theme-cayman
64 | baseurl: /keywords2vec/


--------------------------------------------------------------------------------
/docs/_includes/links.html:
--------------------------------------------------------------------------------
 1 | {% comment %}Get links from each sidebar, as listed in the _config.yml file under sidebars{% endcomment %}
 2 | 
 3 | {% for sidebar in site.sidebars %}
 4 | {% for entry in site.data.sidebars[sidebar].entries %}
 5 | {% for folder in entry.folders %}
 6 | {% for folderitem in folder.folderitems %}
 7 | {% if folderitem.url contains "html#" %}
 8 | [{{folderitem.url | remove: "/" }}]: {{folderitem.url | remove: "/"}}
 9 | {% else %}
10 | [{{folderitem.url | remove: "/"  | remove: ".html"}}]: {{folderitem.url | remove: "/"}}
11 | {% endif %}
12 | {% for subfolders in folderitem.subfolders %}
13 | {% for subfolderitem in subfolders.subfolderitems %}
14 | [{{subfolderitem.url | remove: "/"  | remove: ".html"}}]: {{subfolderitem.url | remove: "/"}}
15 | {% endfor %}
16 | {% endfor %}
17 | {% endfor %}
18 | {% endfor %}
19 | {% endfor %}
20 | {% endfor %}
21 | 
22 | 
23 | {% comment %} Get links from topnav {% endcomment %}
24 | 
25 | {% for entry in site.data.topnav.topnav %}
26 | {% for item in entry.items %}
27 | {% if item.external_url == null %}
28 | [{{item.url | remove: "/" | remove: ".html"}}]: {{item.url | remove: "/"}}
29 | {% endif %}
30 | {% endfor %}
31 | {% endfor %}
32 | 
33 | {% comment %}Get links from topnav dropdowns {% endcomment %}
34 | 
35 | {% for entry in site.data.topnav.topnav_dropdowns %}
36 | {% for folder in entry.folders %}
37 | {% for folderitem in folder.folderitems %}
38 | {% if folderitem.external_url == null %}
39 | [{{folderitem.url | remove: "/"  | remove: ".html"}}]: {{folderitem.url | remove: "/"}}
40 | {% endif %}
41 | {% endfor %}
42 | {% endfor %}
43 | {% endfor %}
44 | 
45 | 


--------------------------------------------------------------------------------
/docs/css/modern-business.css:
--------------------------------------------------------------------------------
 1 | /*!
 2 |  * Start Bootstrap - Modern Business HTML Template (http://startbootstrap.com)
 3 |  * Code licensed under the Apache License v2.0.
 4 |  * For details, see http://www.apache.org/licenses/LICENSE-2.0.
 5 |  */
 6 | 
 7 | /* Global Styles */
 8 | 
 9 | html,
10 | body {
11 |     height: 100%;
12 | }
13 | 
14 | .img-portfolio {
15 |     margin-bottom: 30px;
16 | }
17 | 
18 | .img-hover:hover {
19 |     opacity: 0.8;
20 | }
21 | 
22 | /* Home Page Carousel */
23 | 
24 | header.carousel {
25 |     height: 50%;
26 | }
27 | 
28 | header.carousel .item,
29 | header.carousel .item.active,
30 | header.carousel .carousel-inner {
31 |     height: 100%;
32 | }
33 | 
34 | header.carousel .fill {
35 |     width: 100%;
36 |     height: 100%;
37 |     background-position: center;
38 |     background-size: cover;
39 | }
40 | 
41 | /* 404 Page Styles */
42 | 
43 | .error-404 {
44 |     font-size: 100px;
45 | }
46 | 
47 | /* Pricing Page Styles */
48 | 
49 | .price {
50 |     display: block;
51 |     font-size: 50px;
52 |     line-height: 50px;
53 | }
54 | 
55 | .price sup {
56 |     top: -20px;
57 |     left: 2px;
58 |     font-size: 20px;
59 | }
60 | 
61 | .period {
62 |     display: block;
63 |     font-style: italic;
64 | }
65 | 
66 | /* Footer Styles */
67 | 
68 | footer {
69 |     margin: 50px 0;
70 | }
71 | 
72 | /* Responsive Styles */
73 | 
74 | @media(max-width:991px) {
75 |     .client-img,
76 |     .img-related {
77 |         margin-bottom: 30px;
78 |     }
79 | }
80 | 
81 | @media(max-width:767px) {
82 |     .img-portfolio {
83 |         margin-bottom: 15px;
84 |     }
85 | 
86 |     header.carousel .carousel {
87 |         height: 70%;
88 |     }
89 | }
90 | 


--------------------------------------------------------------------------------
/docs/_includes/head_print.html:
--------------------------------------------------------------------------------
 1 | <meta charset="utf-8">
 2 | <meta http-equiv="X-UA-Compatible" content="IE=edge">
 3 | <meta name="viewport" content="width=device-width, initial-scale=1">
 4 | <meta name="description" content="{% if page.summary %}{{ page.summary | strip_html | strip_newlines | truncate: 160 }}{% endif %}">
 5 | <meta name="keywords" content="{{page.tags}}{% if page.tags %}, {% endif %} {{page.keywords}}">
 6 | <title>{% if page.homepage == true %} {{site.homepage_title}} {% elsif page.title %}{{ page.title }}{% endif %}  | {{ site.site_title }}</title>
 7 | 
 8 | 
 9 | <link rel="stylesheet" href="{{ "/css/syntax.css" | prepend: site.baseurl | prepend: site.url }}">
10 | <link rel="stylesheet" href="{{ "/css/font-awesome.min.css" | prepend: site.baseurl | prepend: site.url }}">
11 | <link rel="stylesheet" href="{{ "/css/bootstrap.min.css" | prepend: site.baseurl | prepend: site.url }}">
12 | <link rel="stylesheet" href="{{ "/css/modern-business.css" | prepend: site.baseurl | prepend: site.url }}">
13 | <link rel="stylesheet" href="{{ "/css/customstyles.css" | prepend: site.baseurl | prepend: site.url }}">
14 | <link rel="stylesheet" href="{{ "/css/theme-blue.css" | prepend: site.baseurl | prepend: site.url }}">
15 | <link rel="stylesheet" href="{{ "/css/syntax.css" | prepend: site.baseurl | prepend: site.url }}">
16 | <link rel="stylesheet" href="{{ "/css/printstyles.css" | prepend: site.baseurl }}">
17 | 
18 | <script>
19 |     Prince.addScriptFunc("datestamp", function() {
20 |         return "PDF last generated: {{ site.time | date: '%B %d, %Y' }}";
21 |     });
22 | </script>
23 | 
24 | <script>
25 |     Prince.addScriptFunc("guideName", function() {
26 |         return "{{site.print_title}} User Guide";
27 |     });
28 | </script>
29 | 


--------------------------------------------------------------------------------
/docs/licenses/LICENSE-BSD-NAVGOCO.txt:
--------------------------------------------------------------------------------
 1 | /* This license pertains to the Navgoco jQuery component used for the sidebar. */
 2 | 
 3 | Copyright (c) 2013, Christodoulos Tsoulloftas, http://www.komposta.net
 4 | All rights reserved.
 5 | 
 6 | Redistribution and use in source and binary forms, with or without modification,
 7 | are permitted provided that the following conditions are met:
 8 | 
 9 |    * Redistributions of source code must retain the above copyright notice,
10 |       this list of conditions and the following disclaimer.
11 |    * Redistributions in binary form must reproduce the above copyright notice,
12 |       this list of conditions and the following disclaimer in the documentation
13 |       and/or other materials provided with the distribution.
14 |    * Neither the name of the <Christodoulos Tsoulloftas> nor the names of its
15 |       contributors may be used to endorse or promote products derived from this
16 |       software without specific prior written permission.
17 | 
18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
19 |  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
21 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
22 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
23 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
25 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
26 | OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
27 | OF THE POSSIBILITY OF SUCH DAMAGE.


--------------------------------------------------------------------------------
/docs/_layouts/page.html:
--------------------------------------------------------------------------------
 1 | ---
 2 | layout: default
 3 | ---
 4 | 
 5 | <div class="post-header">
 6 |     <a id="{{page.title}}"></a>
 7 |     <h1 class="post-title-main">{{ page.title }}</h1>
 8 | </div>
 9 | 
10 | {% if page.simple_map == true %}
11 | 
12 | <script>
13 |     $(document).ready ( function(){
14 |         $('.box{{page.box_number}}').addClass('active');
15 |     });
16 | </script>
17 | 
18 | {% include custom/{{page.map_name}}.html %}
19 | 
20 | {% elsif page.complex_map == true %}
21 | 
22 | <script>
23 |     $(document).ready ( function(){
24 |         $('.modalButton{{page.box_number}}').addClass('active');
25 |     });
26 | </script>
27 | 
28 | {% include custom/{{page.map_name}}.html %}
29 | 
30 | {% endif %}
31 | 
32 | <div class="post-content">
33 | 
34 |    {% if page.summary %}
35 |     <div class="summary">{{page.summary}}</div>
36 |    {% endif %}
37 | 
38 |     {% unless page.toc == false %}
39 |     {% include toc.html %}
40 |     {% endunless %}
41 | 
42 | 
43 |     {% if site.github_editme_path %}
44 | 
45 |     <a target="_blank" href="https://github.com/{{site.github_editme_path}}{{page.path}}" class="btn btn-default githubEditButton" role="button"><i class="fa fa-github fa-lg"></i> Edit me</a>
46 | 
47 |     {% endif %}
48 | 
49 |    {{content}}
50 | 
51 |     <div class="tags">
52 |         {% if page.tags != null %}
53 |         <b>Tags: </b>
54 |         {% assign projectTags = site.data.tags.allowed-tags %}
55 |         {% for tag in page.tags %}
56 |         {% if projectTags contains tag %}
57 |         <a href="{{ "tag_" | append: tag | append: ".html" }}" class="btn btn-default navbar-btn cursorNorm" role="button">{{page.tagName}}{{tag}}</a>
58 |         {% endif %}
59 |         {% endfor %}
60 |         {% endif %}
61 |     </div>
62 | 
63 | </div>
64 | 
65 | {{site.data.alerts.hr_shaded}}
66 | 
67 | {% include footer.html %}
68 | 


--------------------------------------------------------------------------------
/docs/js/customscripts.js:
--------------------------------------------------------------------------------
 1 | $('#mysidebar').height($(".nav").height());
 2 | 
 3 | 
 4 | $( document ).ready(function() {
 5 | 
 6 |     //this script says, if the height of the viewport is greater than 800px, then insert affix class, which makes the nav bar float in a fixed
 7 |     // position as your scroll. if you have a lot of nav items, this height may not work for you.
 8 |     var h = $(window).height();
 9 |     //console.log (h);
10 |     if (h > 800) {
11 |         $( "#mysidebar" ).attr("class", "nav affix");
12 |     }
13 |     // activate tooltips. although this is a bootstrap js function, it must be activated this way in your theme.
14 |     $('[data-toggle="tooltip"]').tooltip({
15 |         placement : 'top'
16 |     });
17 | 
18 |     /**
19 |      * AnchorJS
20 |      */
21 |     anchors.add('h2,h3,h4,h5');
22 | 
23 | });
24 | 
25 | // needed for nav tabs on pages. See Formatting > Nav tabs for more details.
26 | // script from http://stackoverflow.com/questions/10523433/how-do-i-keep-the-current-tab-active-with-twitter-bootstrap-after-a-page-reload
27 | $(function() {
28 |     var json, tabsState;
29 |     $('a[data-toggle="pill"], a[data-toggle="tab"]').on('shown.bs.tab', function(e) {
30 |         var href, json, parentId, tabsState;
31 | 
32 |         tabsState = localStorage.getItem("tabs-state");
33 |         json = JSON.parse(tabsState || "{}");
34 |         parentId = $(e.target).parents("ul.nav.nav-pills, ul.nav.nav-tabs").attr("id");
35 |         href = $(e.target).attr('href');
36 |         json[parentId] = href;
37 | 
38 |         return localStorage.setItem("tabs-state", JSON.stringify(json));
39 |     });
40 | 
41 |     tabsState = localStorage.getItem("tabs-state");
42 |     json = JSON.parse(tabsState || "{}");
43 | 
44 |     $.each(json, function(containerId, href) {
45 |         return $("#" + containerId + " a[href=" + href + "]").tab('show');
46 |     });
47 | 
48 |     $("ul.nav.nav-pills, ul.nav.nav-tabs").each(function() {
49 |         var $this = $(this);
50 |         if (!json[$this.attr("id")]) {
51 |             return $this.find("a[data-toggle=tab]:first, a[data-toggle=pill]:first").tab("show");
52 |         }
53 |     });
54 | });
55 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from pkg_resources import parse_version
 2 | from configparser import ConfigParser
 3 | import setuptools
 4 | assert parse_version(setuptools.__version__)>=parse_version('36.2')
 5 | 
 6 | # note: all settings are in settings.ini; edit there, not here
 7 | config = ConfigParser(delimiters=['='])
 8 | config.read('settings.ini')
 9 | cfg = config['DEFAULT']
10 | 
11 | cfg_keys = 'version description keywords author author_email'.split()
12 | expected = cfg_keys + "lib_name user branch license status min_python audience language".split()
13 | for o in expected: assert o in cfg, "missing expected setting: {}".format(o)
14 | setup_cfg = {o:cfg[o] for o in cfg_keys}
15 | 
16 | licenses = {
17 |     'apache2': ('Apache Software License 2.0','OSI Approved :: Apache Software License'),
18 | }
19 | statuses = [ '1 - Planning', '2 - Pre-Alpha', '3 - Alpha',
20 |     '4 - Beta', '5 - Production/Stable', '6 - Mature', '7 - Inactive' ]
21 | py_versions = '2.0 2.1 2.2 2.3 2.4 2.5 2.6 2.7 3.0 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8'.split()
22 | 
23 | requirements = cfg.get('requirements','').split()
24 | dev_requirements = cfg.get('dev_requirements','').split()
25 | 
26 | lic = licenses[cfg['license']]
27 | min_python = cfg['min_python']
28 | 
29 | setuptools.setup(
30 |     name = cfg['lib_name'],
31 |     license = lic[0],
32 |     classifiers = [
33 |         'Development Status :: ' + statuses[int(cfg['status'])],
34 |         'Intended Audience :: ' + cfg['audience'].title(),
35 |         'License :: ' + lic[1],
36 |         'Natural Language :: ' + cfg['language'].title(),
37 |     ] + ['Programming Language :: Python :: '+o for o in py_versions[py_versions.index(min_python):]],
38 |     url = 'https://github.com/{}/{}'.format(cfg['user'],cfg['lib_name']),
39 |     packages = setuptools.find_packages(),
40 |     include_package_data = True,
41 |     install_requires = requirements,
42 |     extras_require = {
43 |         'dev': dev_requirements
44 |     },
45 |     python_requires  = '>=' + cfg['min_python'],
46 |     long_description = open('README.md').read(),
47 |     long_description_content_type = 'text/markdown',
48 |     zip_safe = False,
49 |     entry_points = { 'console_scripts': cfg.get('console_scripts','').split() },
50 |     **setup_cfg)
51 | 
52 | 


--------------------------------------------------------------------------------
/settings.ini:
--------------------------------------------------------------------------------
 1 | [DEFAULT]
 2 | # All sections below are required unless otherwise specified
 3 | lib_name = keywords2vec
 4 | user = dperezrada
 5 | description = To generate a word2vec model, but using multi-word keywords instead of single words.
 6 | keywords = word2vec,nlp,text-mining,phrase-extraction,keywords-extraction,multi-language
 7 | author = Daniel Pérez Rada
 8 | author_email = dperezrada@gmail.com
 9 | copyright = Daniel Pérez Rada
10 | branch = master
11 | version = 0.1.0
12 | min_python = 3.6
13 | audience = Developers
14 | language = English
15 | # Set to True if you want to create a more fancy sidebar.json than the default
16 | custom_sidebar = False
17 | # Add licenses and see current list in `setup.py`
18 | license = apache2
19 | # From 1-7: Planning Pre-Alpha Alpha Beta Production Mature Inactive
20 | status = 3
21 | 
22 | # Optional. Same format as setuptools requirements
23 | requirements = nltk Unidecode stop-words fastprogress fasttext annoy
24 | dev_requirements = nbdev jupyter ipywidgets matplotlib
25 | # Optional. Same format as setuptools console_scripts
26 | # console_scripts =
27 | 
28 | ###
29 | # You probably won't need to change anything under here,
30 | #   unless you have some special requirements
31 | ###
32 | 
33 | # Change to, e.g. "nbs", to put your notebooks in nbs dir instead of repo root
34 | nbs_path = .
35 | doc_path = docs
36 | 
37 | # Anything shown as '%(...)s' is substituted with that setting automatically
38 | doc_host =  https://%(user)s.github.io
39 | doc_baseurl = /%(lib_name)s/
40 | git_url = https://github.com/%(user)s/%(lib_name)s/tree/%(branch)s/
41 | lib_path = %(lib_name)s
42 | title = %(lib_name)s
43 | 
44 | #Optional advanced parameters
45 | #Monospace docstings: adds <pre> tags around the doc strings, preserving newlines/indentation.
46 | #monospace_docstrings = False
47 | #Test flags: introduce here the test flags you want to use separated by |
48 | #tst_flags =
49 | #Custom sidebar: customize sidebar.json yourself for advanced sidebars (False/True)
50 | #custom_sidebar =
51 | #Cell spacing: if you want cell blocks in code separated by more than one new line
52 | #cell_spacing =
53 | #Custom jekyll styles: if you want more jekyll styles than tip/important/warning, set them here
54 | #jekyll_styles = note,warning,tip,important
55 | 


--------------------------------------------------------------------------------
/keywords2vec/utils.py:
--------------------------------------------------------------------------------
 1 | # AUTOGENERATED! DO NOT EDIT! File to edit: 20_utils.ipynb (unless otherwise specified).
 2 | 
 3 | __all__ = ['parallel', 'num_cpus', 'open_file', 'chunk_of_text', 'get_file_chunks']
 4 | 
 5 | # Cell
 6 | from .imports import *
 7 | 
 8 | from fastprogress.fastprogress import progress_bar
 9 | from concurrent.futures import ProcessPoolExecutor, as_completed
10 | 
11 | 
12 | # Cell
13 | 
14 | # BEGIN From fastai
15 | def parallel(func, arr, max_workers=-1):
16 |     if max_workers == -1:
17 |         max_workers = num_cpus(2)
18 |     with ProcessPoolExecutor(max_workers=max_workers) as ex:
19 |         futures = [ex.submit(func, arr_el) for arr_el in arr]
20 |         results = []
21 |         for f in progress_bar(as_completed(futures), total=len(arr)):
22 |             results.append(f.result())
23 |         return results
24 | 
25 | def num_cpus(n_cpus):
26 |     try:
27 |         return len(os.sched_getaffinity(0))
28 |     except AttributeError:
29 |         return os.cpu_count()
30 |     if n_cpus > 0:
31 |         return n_cpus
32 |     """Get number of cpus."""
33 | # END From fastai
34 | 
35 | 
36 | def open_file(filepath, options):
37 |     if filepath[-3:] == ".gz":
38 |         return gzip.open(filepath, options)
39 |     return open(filepath, options)
40 | 
41 | 
42 | def chunk_of_text(_file, chunk_size=-1):
43 |     index = 0
44 |     if chunk_size == -1:
45 |         chunk_size = 200
46 |     while True:
47 |         line = _file.readline()
48 |         if not line:
49 |             break
50 |         for sentence in line.split("."):
51 |             if sentence.strip():
52 |                 yield sentence.strip()
53 |         if index >= chunk_size:
54 |             break
55 |         index += 1
56 | 
57 | 
58 | def get_file_chunks(start_index, filepath, lines_chunk, sample_size):
59 |     _file = open_file(filepath, 'rt')
60 |     texts = []
61 |     break_by_sample = False
62 |     while True:
63 |         next_n_lines = list(chunk_of_text(_file, lines_chunk))
64 |         texts.append("\n".join(next_n_lines) + "\n")
65 |         if not next_n_lines:
66 |             break
67 |         start_index += lines_chunk
68 |         if sample_size > 0 and start_index >= sample_size:
69 |             break_by_sample = True
70 |             break
71 |     _file.close()
72 |     return (start_index, texts, break_by_sample)
73 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | *.bak
  2 | .gitattributes
  3 | .last_checked
  4 | .gitconfig
  5 | *.bak
  6 | *.log
  7 | *~
  8 | ~*
  9 | _tmp*
 10 | tmp*
 11 | tags
 12 | 
 13 | # Byte-compiled / optimized / DLL files
 14 | __pycache__/
 15 | *.py[cod]
 16 | *$py.class
 17 | 
 18 | # C extensions
 19 | *.so
 20 | 
 21 | # Distribution / packaging
 22 | .Python
 23 | env/
 24 | build/
 25 | develop-eggs/
 26 | dist/
 27 | downloads/
 28 | eggs/
 29 | .eggs/
 30 | lib/
 31 | lib64/
 32 | parts/
 33 | sdist/
 34 | var/
 35 | wheels/
 36 | *.egg-info/
 37 | .installed.cfg
 38 | *.egg
 39 | 
 40 | # PyInstaller
 41 | #  Usually these files are written by a python script from a template
 42 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 43 | *.manifest
 44 | *.spec
 45 | 
 46 | # Installer logs
 47 | pip-log.txt
 48 | pip-delete-this-directory.txt
 49 | 
 50 | # Unit test / coverage reports
 51 | htmlcov/
 52 | .tox/
 53 | .coverage
 54 | .coverage.*
 55 | .cache
 56 | nosetests.xml
 57 | coverage.xml
 58 | *.cover
 59 | .hypothesis/
 60 | 
 61 | # Translations
 62 | *.mo
 63 | *.pot
 64 | 
 65 | # Django stuff:
 66 | *.log
 67 | local_settings.py
 68 | 
 69 | # Flask stuff:
 70 | instance/
 71 | .webassets-cache
 72 | 
 73 | # Scrapy stuff:
 74 | .scrapy
 75 | 
 76 | # Sphinx documentation
 77 | docs/_build/
 78 | 
 79 | # PyBuilder
 80 | target/
 81 | 
 82 | # Jupyter Notebook
 83 | .ipynb_checkpoints
 84 | 
 85 | # pyenv
 86 | .python-version
 87 | 
 88 | # celery beat schedule file
 89 | celerybeat-schedule
 90 | 
 91 | # SageMath parsed files
 92 | *.sage.py
 93 | 
 94 | # dotenv
 95 | .env
 96 | 
 97 | # virtualenv
 98 | .venv
 99 | venv/
100 | ENV/
101 | 
102 | # Spyder project settings
103 | .spyderproject
104 | .spyproject
105 | 
106 | # Rope project settings
107 | .ropeproject
108 | 
109 | # mkdocs documentation
110 | /site
111 | 
112 | # mypy
113 | .mypy_cache/
114 | 
115 | .vscode
116 | *.swp
117 | 
118 | # osx generated files
119 | .DS_Store
120 | .DS_Store?
121 | .Trashes
122 | ehthumbs.db
123 | Thumbs.db
124 | .idea
125 | 
126 | # pytest
127 | .pytest_cache
128 | 
129 | # tools/trust-doc-nbs
130 | docs_src/.last_checked
131 | 
132 | # symlinks to fastai
133 | docs_src/fastai
134 | tools/fastai
135 | 
136 | # link checker
137 | checklink/cookies.txt
138 | 
139 | # .gitconfig is now autogenerated
140 | .gitconfig
141 | 
142 | *.old
143 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # How to contribute
 2 | 
 3 | ## How to get started
 4 | 
 5 | Before anything else, please install the git hooks that run automatic scripts during each commit and merge to strip the notebooks of superfluous metadata (and avoid merge conflicts). After cloning the repository, run the following command inside it:
 6 | ```
 7 | nbdev_install_git_hooks
 8 | ```
 9 | 
10 | ## Did you find a bug?
11 | 
12 | * Ensure the bug was not already reported by searching on GitHub under Issues.
13 | * If you're unable to find an open issue addressing the problem, open a new one. Be sure to include a title and clear description, as much relevant information as possible, and a code sample or an executable test case demonstrating the expected behavior that is not occurring.
14 | * Be sure to add the complete error messages.
15 | 
16 | #### Did you write a patch that fixes a bug?
17 | 
18 | * Open a new GitHub pull request with the patch.
19 | * Ensure that your PR includes a test that fails without your patch, and pass with it.
20 | * Ensure the PR description clearly describes the problem and solution. Include the relevant issue number if applicable.
21 | 
22 | ## PR submission guidelines
23 | 
24 | * Keep each PR focused. While it's more convenient, do not combine several unrelated fixes together. Create as many branches as needing to keep each PR focused.
25 | * Do not mix style changes/fixes with "functional" changes. It's very difficult to review such PRs and it most likely get rejected.
26 | * Do not add/remove vertical whitespace. Preserve the original style of the file you edit as much as you can.
27 | * Do not turn an already submitted PR into your development playground. If after you submitted PR, you discovered that more work is needed - close the PR, do the required work and then submit a new PR. Otherwise each of your commits requires attention from maintainers of the project.
28 | * If, however, you submitted a PR and received a request for changes, you should proceed with commits inside that PR, so that the maintainer can see the incremental fixes and won't need to review the whole PR again. In the exception case where you realize it'll take many many commits to complete the requests, then it's probably best to close the PR, do the work and then submit it again. Use common sense where you'd choose one way over another.
29 | 
30 | ## Do you want to contribute to the documentation?
31 | 
32 | * Docs are automatically created from the notebooks in the nbs folder.
33 | 
34 | 


--------------------------------------------------------------------------------
/analyze/README.md:
--------------------------------------------------------------------------------
 1 | # Comparing vocab size
 2 | 
 3 | We generate a quick comparison with the size of the vocab, using the stopword tokenizer vs ngrams. To do this, we used bigquery.
 4 | You can take a look to the dataset [here](https://bigquery.cloud.google.com/dataset/api-project-380745743806:epistemonikos)
 5 | 
 6 | | ngrams             | keywords  | comp    |
 7 | |--------------------|-----------|---------|
 8 | | 1                  | 127,824   | 36%     |
 9 | | 1,2                | 1,360,550 | 388%    |
10 | | 1-3                | 3,204,099 | 914%    |
11 | | 1-4                | 4,461,930 | 1,272%  |
12 | | 1-5                | 5,133,619 | 1,464%  |
13 | |                    |           |         |
14 | | stopword tokenizer | 350,529   | 100%    | 
15 | 
16 | ## Reproduce
17 | If you wanted to reproduce the results
18 | 
19 | ### Get the data
20 | cd to this folder
21 | ```
22 | mkdir -p ../data/inputs
23 | wget "http://s3.amazonaws.com/episte-labs/episte_title_abstract.tsv.gz" -O ../data/inputs/episte_title_abstract.tsv.gz
24 | ```
25 | 
26 | ### Get all keywords
27 | 
28 | ```
29 | python compare_to_ngrams.py ../data/inputs/episte_title_abstract.tsv.gz| gzip > ../data/all_keywords.tsv.gz
30 | ```
31 | 
32 | ### upload to bigquery
33 | ```
34 | gsutil -o GSUtil:parallel_composite_upload_threshold=150M cp "data/all_found.tsv.gz" gs://episte-lab/all_tokens.tsv.gz
35 | 
36 | bq mk "api-project-380745743806:epistemonikos.all_keywords" tokenizer_name:string,keyword:string
37 | 
38 | bq load --replace --max_bad_records=40000 --field_delimiter="\t" --source_format=CSV "api-project-380745743806:epistemonikos.all_keywords" gs://episte-lab/all_tokens.tsv.gz
39 | ```
40 | 
41 | ### Count
42 | 
43 | In bigquery, execute the following query, using standard query language, and as destination table we set _epistemonikos.count_keywords_ table.
44 | 
45 | ```
46 | SELECT tokenizer_name, keyword, count(*) as repeat_count
47 | FROM `api-project-380745743806.epistemonikos.all_keywords`
48 | GROUP BY tokenizer_name, keyword
49 | ```
50 | 
51 | Then we query and set as destination table _epistemonikos.vocab_size_ with append, and changing the number 1 for 2, then 3, and so (a better query is needed later):
52 | 
53 | ```
54 | SELECT tokenizer_name, 1 as min_repeat, count(*) as vocab_size
55 | FROM `api-project-380745743806.epistemonikos.count_keywords`
56 | WHERE repeat_count >= 1
57 | GROUP BY tokenizer_name
58 | ORDER BY vocab_size DESC
59 | ```
60 | 
61 | Now you can get the vocab_size, and get a CSV like [this](vocab_size_results.csv):
62 | ```
63 | SELECT *
64 | FROM `api-project-380745743806.epistemonikos.vocab_size`
65 | ORDER BY min_repeat ASC, vocab_size DESC
66 | LIMIT 1000
67 | ```
68 | 
69 | ## The data
70 | The data is public [here](https://bigquery.cloud.google.com/dataset/api-project-380745743806:epistemonikos)
71 | 
72 | You can play around.


--------------------------------------------------------------------------------
/docs/css/theme-green.css:
--------------------------------------------------------------------------------
  1 | .summary {
  2 |     color: #808080;
  3 |     border-left: 5px solid #E50E51;
  4 |     font-size:16px;
  5 | }
  6 | 
  7 | 
  8 | h3 {color: #E50E51; }
  9 | h4 {color: #808080; }
 10 | 
 11 | .nav-tabs > li.active > a, .nav-tabs > li.active > a:hover, .nav-tabs > li.active > a:focus {
 12 |     background-color: #248ec2;
 13 |     color: white;
 14 | }
 15 | 
 16 | .nav > li.active > a {
 17 |     background-color: #72ac4a;
 18 | }
 19 | 
 20 | .nav > li > a:hover {
 21 |     background-color: #72ac4a;
 22 | }
 23 | 
 24 | div.navbar-collapse .dropdown-menu > li > a:hover {
 25 |     background-color: #72ac4a;
 26 | }
 27 | 
 28 | .navbar-inverse .navbar-nav>li>a, .navbar-inverse .navbar-brand {
 29 |     color: white;
 30 | }
 31 | 
 32 | .navbar-inverse .navbar-nav>li>a:hover, a.fa.fa-home.fa-lg.navbar-brand:hover {
 33 |      color: #f0f0f0;
 34 | }
 35 | 
 36 | .nav li.thirdlevel > a {
 37 |     background-color: #FAFAFA !important;
 38 |     color: #72ac4a;
 39 |     font-weight: bold;
 40 | }
 41 | 
 42 | a[data-toggle="tooltip"] {
 43 |     color: #649345;
 44 |     font-style: italic;
 45 |     cursor: default;
 46 | }
 47 | 
 48 | .navbar-inverse {
 49 |     background-color: #72ac4a;
 50 |     border-color: #5b893c;
 51 | }
 52 | 
 53 | .navbar-inverse .navbar-nav > .open > a, .navbar-inverse .navbar-nav > .open > a:hover, .navbar-inverse .navbar-nav > .open > a:focus {
 54 |     color: #5b893c;
 55 | }
 56 | 
 57 | .navbar-inverse .navbar-nav > .open > a, .navbar-inverse .navbar-nav > .open > a:hover, .navbar-inverse .navbar-nav > .open > a:focus {
 58 |     background-color: #5b893c;
 59 |     color: #ffffff;
 60 | }
 61 | 
 62 | /* not sure if using this ...*/
 63 | .navbar-inverse .navbar-collapse, .navbar-inverse .navbar-form {
 64 |     border-color: #72ac4a !important;
 65 | }
 66 | 
 67 | .btn-primary {
 68 |     color: #ffffff;
 69 |     background-color: #5b893c;
 70 |     border-color: #5b893c;
 71 | }
 72 | 
 73 | .btn-primary:hover,
 74 | .btn-primary:focus,
 75 | .btn-primary:active,
 76 | .btn-primary.active,
 77 | .open .dropdown-toggle.btn-primary {
 78 |     background-color: #72ac4a;
 79 |     border-color: #5b893c;
 80 | }
 81 | 
 82 | .printTitle {
 83 |     color: #5b893c !important;
 84 | }
 85 | 
 86 | body.print h1 {color: #5b893c !important; font-size:28px;}
 87 | body.print h2 {color: #595959 !important; font-size:24px;}
 88 | body.print h3 {color: #E50E51 !important; font-size:14px;}
 89 | body.print h4 {color: #679DCE !important; font-size:14px; font-style: italic;}
 90 | 
 91 | .anchorjs-link:hover {
 92 |     color: #4f7233;
 93 | }
 94 | 
 95 | div.sidebarTitle {
 96 |     color: #E50E51;
 97 | }
 98 | 
 99 | li.sidebarTitle {
100 |     margin-top:20px;
101 |     font-weight:normal;
102 |     font-size:130%;
103 |     color: #ED1951;
104 |     margin-bottom:10px;
105 |     margin-left: 5px;
106 | }
107 | 
108 | .navbar-inverse .navbar-toggle:focus, .navbar-inverse .navbar-toggle:hover {
109 |     background-color: #E50E51;
110 | }
111 | 


--------------------------------------------------------------------------------
/keywords2vec/main.py:
--------------------------------------------------------------------------------
 1 | # AUTOGENERATED! DO NOT EDIT! File to edit: 30_main.ipynb (unless otherwise specified).
 2 | 
 3 | __all__ = ['tokenize_file', 'train_model', 'similars_tree_from_model', 'get_similars', 'similars_tree']
 4 | 
 5 | # Cell
 6 | from .imports import *
 7 | 
 8 | from glob import glob
 9 | from functools import partial
10 | 
11 | import fasttext
12 | 
13 | from .utils import parallel, open_file, chunk_of_text, get_file_chunks
14 | from .tokenizer import tokenize
15 | 
16 | # Cell
17 | 
18 | def tokenize_file(
19 |     input_path, output_path="tokenized.txt", lang="en",
20 |     sample_size=-1, lines_chunks=-1, n_cpus=-1, keywords_w_stopwords=False
21 | ):
22 |     tokenize_wrapper = partial(
23 |         tokenize, lang=lang, text_output=True, merge=True, keywords_w_stopwords=keywords_w_stopwords
24 |     )
25 | 
26 |     index = 0
27 | 
28 |     with open(output_path, "wt") as _output:
29 |         for file_path in glob(input_path):
30 |             print("processing file:", file_path)
31 |             # We are going to split the text in chunks to show some progress.
32 |             new_index, text_chunks, break_by_sample = get_file_chunks(index, file_path, lines_chunks, sample_size)
33 |             index = new_index
34 |             results = parallel(tokenize_wrapper, text_chunks, n_cpus)
35 |             _output.write(
36 |                 ("\n".join(results) + "\n").replace(" ", "_").replace("!", " ")
37 |             )
38 |             if break_by_sample:
39 |                 break
40 |     return output_path
41 | 
42 | 
43 | def train_model(input_filename):
44 |     model = fasttext.train_unsupervised(input_filename, model='skipgram', maxn=0, dim=100, ws=5)
45 |     return model
46 | 
47 | def similars_tree_from_model(model, vector_size=100):
48 |     f = 100
49 |     t = AnnoyIndex(f, 'angular')  # Length of item vector that will be indexed
50 |     labels = model.labels
51 |     for index, label in enumerate(labels):
52 |         v = model[label]
53 |         t.add_item(index, v)
54 | 
55 |     t.build(10) # 10 trees
56 |     return labels, t
57 | 
58 | def get_similars(tree, labels, keyword, n_similars=10, show_score=False):
59 |     index = labels.index(keyword.replace(" ", "_"))
60 |     suggestions, scores = tree.get_nns_by_item(index, n=15, include_distances=True)
61 |     suggested_labels = [
62 |         labels[suggestion].replace("_", " ")
63 |         for suggestion in suggestions
64 |     ]
65 |     return suggested_labels
66 | 
67 | def similars_tree(
68 |     input_path, temp_tokenized_file="tmp_tokenized.txt", lang="en",
69 |     sample_size=-1, lines_chunks=-1, n_cpus=-1, keywords_w_stopwords=False
70 | ):
71 |     tokenize_file(
72 |         input_path=input_path, output_path=temp_tokenized_file, lang=lang,
73 |         sample_size=sample_size, lines_chunks=lines_chunks, n_cpus=n_cpus,
74 |         keywords_w_stopwords=keywords_w_stopwords
75 |     )
76 |     model = train_model(temp_tokenized_file)
77 |     labels, tree = similars_tree_from_model(model)
78 |     return labels, tree
79 | 


--------------------------------------------------------------------------------
/docs/_includes/sidebar.html:
--------------------------------------------------------------------------------
 1 | {% assign sidebar = site.data.sidebars[page.sidebar].entries %}
 2 | {% assign pageurl = page.url  | remove: ".html" %}
 3 | 
 4 | <ul id="mysidebar" class="nav">
 5 |   <li class="sidebarTitle">{{sidebar[0].product}} {{sidebar[0].version}}</li>
 6 |   {% for entry in sidebar %}
 7 |   {% for folder in entry.folders %}
 8 |   {% if folder.output contains "web" %}
 9 |   <li>
10 |       <a href="#">{{ folder.title }}</a>
11 |       <ul>
12 |           {% for folderitem in folder.folderitems %}
13 |           {% if folderitem.output contains "web" %}
14 |           {% if folderitem.external_url %}
15 |           <li><a href="{{folderitem.external_url}}" target="_blank">{{folderitem.title}}</a></li>
16 |           {% elsif pageurl == folderitem.url %}
17 |           <li class="active"><a href="{{folderitem.url | prepend: site.baseurl}}">{{folderitem.title}}</a></li>
18 |           {% elsif folderitem.type == "empty" %}
19 |           <li><a href="{{folderitem.url | prepend: site.baseurl}}">{{folderitem.title}}</a></li>
20 | 
21 |           {% else %}
22 |           <li><a href="{{folderitem.url | prepend: site.baseurl}}">{{folderitem.title}}</a></li>
23 |           {% endif %}
24 |           {% for subfolders in folderitem.subfolders %}
25 |           {% if subfolders.output contains "web" %}
26 |           <li class="subfolders">
27 |               <a href="#">{{ subfolders.title }}</a>
28 |               <ul>
29 |                   {% for subfolderitem in subfolders.subfolderitems %}
30 |                   {% if subfolderitem.output contains "web" %}
31 |                   {% if subfolderitem.external_url %}
32 |                   <li><a href="{{subfolderitem.external_url}}" target="_blank">{{subfolderitem.title}}</a></li>
33 |                   {% elsif pageurl == subfolderitem.url %}
34 |                   <li class="active"><a href="{{subfolderitem.url | prepend: site.baseurl}}">{{subfolderitem.title}}</a></li>
35 |                   {% else %}
36 |                   <li><a href="{{subfolderitem.url | prepend: site.baseurl}}">{{subfolderitem.title}}</a></li>
37 |                   {% endif %}
38 |                   {% endif %}
39 |                   {% endfor %}
40 |               </ul>
41 |           </li>
42 |           {% endif %}
43 |           {% endfor %}
44 |           {% endif %}
45 |           {% endfor %}
46 |       </ul>
47 |    </li>
48 |      {% endif %}
49 |       {% endfor %}
50 |       {% endfor %}
51 |       <!-- if you aren't using the accordion, uncomment this block:
52 |          <p class="external">
53 |              <a href="#" id="collapseAll">Collapse All</a> | <a href="#" id="expandAll">Expand All</a>
54 |          </p>
55 |          -->
56 | </ul>
57 | 
58 | <!-- this highlights the active parent class in the navgoco sidebar. this is critical so that the parent expands when you're viewing a page. This must appear below the sidebar code above. Otherwise, if placed inside customscripts.js, the script runs before the sidebar code runs and the class never gets inserted.-->
59 | <script>$("li.active").parents('li').toggleClass("active");</script>
60 | 


--------------------------------------------------------------------------------
/analyze/compare_to_ngrams.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import gzip
 3 | import sys
 4 | 
 5 | from multiprocessing import Pool, cpu_count
 6 | from functools import partial
 7 | 
 8 | 
 9 | from keywords_tokenizer import tokenize_one
10 | 
11 | def tokenize_simple(text):
12 |     text_part = text.lower()
13 | 
14 |     # Must be executed in order
15 |     regexs = [
16 |         ("’", "'"),
17 |         # Remove all non alpha, numeric, spaces, - or single quote
18 |         (r'([^a-z0-9\u00C0-\u1FFF\u2C00-\uD7FF \t\n\-\'])', "!!"),
19 |         # remove only words numbers
20 |         (r'(^|[ !])[0-9]+([ !]|$)', "!!"),
21 |         # remove hyphen-minus for keywords starting or ending with it
22 |         (r'((^|[ !])[\-\']+)|([\-\']+([ !]|$))', "!!"),
23 |         # remove spaces between !
24 |         (r' *! *', "!!"),
25 |         # generate multiple ! need for next regex
26 |         (r'!', "!!"),
27 |         # remove one character keyword
28 |         (r'(^|!)[^!\n](!|$)', "!!"),
29 |         # remove multiple ! (!!!!)
30 |         (r'!+', "!"),
31 |         # remove first and last !
32 |         (r'(^!+)|(!+$)', ""),
33 |         # replace spaces
34 |         (r'\s', "!"),
35 |     ]
36 |     for regex, replacement in regexs:
37 |         text_part = re.sub(regex, replacement, text_part, flags=re.M)
38 |     return text_part.split("!")
39 | 
40 | 
41 | def get_ngram(text, min_ngram=1, max_ngrams=6):
42 |     list_of_words = tokenize_simple(text)
43 |     ngrams = {}
44 |     for ngram in range(min_ngram, max_ngrams):
45 |         ngrams[ngram] = [
46 |             " ".join(list_of_words[i:i + ngram])
47 |             for i in iter(range(len(list_of_words) - ngram + 1))
48 |         ]
49 |     return ngrams
50 | 
51 | def process_batch_grams(texts):
52 |     cpu_num = max(1, cpu_count() - 1)
53 |     pool_queue = Pool(cpu_num)
54 | 
55 |     ngrams_rows = pool_queue.map(get_ngram, texts)
56 |     pool_queue.close()
57 |     for ngrams_found in ngrams_rows:
58 |         for ngram_num, keywords in ngrams_found.items():
59 |             for keyword in keywords:
60 |                 print("%s\t%s" % (ngram_num, keyword))
61 | 
62 | # Could be refactored later
63 | def process_batch_stopwords_tokenizer(texts):
64 |     cpu_num = max(1, cpu_count() - 1)
65 |     pool_queue = Pool(cpu_num)
66 | 
67 |     rows = pool_queue.map(tokenize_one, texts)
68 |     pool_queue.close()
69 |     for ngrams_found in rows:
70 |         for ngram_num, keywords in ngrams_found.items():
71 |             for keyword in keywords:
72 |                 print("%s\t%s" % ("sk", keyword))
73 | 
74 | def main():
75 |     ngrams = (1, 6)
76 |     batch_size = 10000
77 |     cpu_num = max(1, cpu_count() - 1)
78 |     pool_queue = Pool(cpu_num)
79 | 
80 |     texts = []
81 |     for index, line in enumerate(gzip.open(sys.argv[1], "rt")):
82 |         row = line[:-1].split("\t")
83 |         title = row[2]
84 |         abstract = row[3]
85 |         text = title + "." + abstract
86 |         if index > 0 and index % batch_size == 0:
87 |             process_batch_grams(texts)
88 |             process_batch_stopwords_tokenizer(texts)
89 |             texts = []
90 |             print(index, end="\r", file=sys.stderr)
91 |     if len(texts) > 0:
92 |         process_batch_grams(texts)
93 |         process_batch_stopwords_tokenizer(texts)
94 | 
95 | if __name__ == '__main__':
96 |     main()
97 | 


--------------------------------------------------------------------------------
/docs/css/theme-blue.css:
--------------------------------------------------------------------------------
  1 | .summary {
  2 |     color: #808080;
  3 |     border-left: 5px solid #ED1951;
  4 |     font-size:16px;
  5 | }
  6 | 
  7 | 
  8 | h3 {color: #000000; }
  9 | h4 {color: #000000; }
 10 | 
 11 | .nav-tabs > li.active > a, .nav-tabs > li.active > a:hover, .nav-tabs > li.active > a:focus {
 12 |     background-color: #248ec2;
 13 |     color: white;
 14 | }
 15 | 
 16 | .nav > li.active > a {
 17 |     background-color: #347DBE;
 18 | }
 19 | 
 20 | .nav > li > a:hover {
 21 |     background-color: #248ec2;
 22 | }
 23 | 
 24 | div.navbar-collapse .dropdown-menu > li > a:hover {
 25 |     background-color: #347DBE;
 26 | }
 27 | 
 28 | .nav li.thirdlevel > a {
 29 |     background-color: #FAFAFA !important;
 30 |     color: #248EC2;
 31 |     font-weight: bold;
 32 | }
 33 | 
 34 | a[data-toggle="tooltip"] {
 35 |     color: #649345;
 36 |     font-style: italic;
 37 |     cursor: default;
 38 | }
 39 | 
 40 | .navbar-inverse {
 41 |     background-color: #347DBE;
 42 |     border-color: #015CAE;
 43 | }
 44 | .navbar-inverse .navbar-nav>li>a, .navbar-inverse .navbar-brand {
 45 |     color: white;
 46 | }
 47 | 
 48 | .navbar-inverse .navbar-nav>li>a:hover, a.fa.fa-home.fa-lg.navbar-brand:hover {
 49 |      color: #f0f0f0;
 50 | }
 51 | 
 52 | a.navbar-brand:hover {
 53 |   color: #f0f0f0;
 54 | }
 55 | 
 56 | .navbar-inverse .navbar-nav > .open > a, .navbar-inverse .navbar-nav > .open > a:hover, .navbar-inverse .navbar-nav > .open > a:focus {
 57 |     color: #015CAE;
 58 | }
 59 | 
 60 | .navbar-inverse .navbar-nav > .open > a, .navbar-inverse .navbar-nav > .open > a:hover, .navbar-inverse .navbar-nav > .open > a:focus {
 61 |     background-color: #015CAE;
 62 |     color: #ffffff;
 63 | }
 64 | 
 65 | .navbar-inverse .navbar-collapse, .navbar-inverse .navbar-form {
 66 |     border-color: #248ec2 !important;
 67 | }
 68 | 
 69 | .btn-primary {
 70 |     color: #ffffff;
 71 |     background-color: #347DBE;
 72 |     border-color: #347DBE;
 73 | }
 74 | 
 75 | .navbar-inverse .navbar-nav > .active > a, .navbar-inverse .navbar-nav > .active > a:hover, .navbar-inverse .navbar-nav > .active > a:focus {
 76 |     background-color: #347DBE;
 77 | }
 78 | 
 79 | .btn-primary:hover,
 80 | .btn-primary:focus,
 81 | .btn-primary:active,
 82 | .btn-primary.active,
 83 | .open .dropdown-toggle.btn-primary {
 84 |     background-color: #248ec2;
 85 |     border-color: #347DBE;
 86 | }
 87 | 
 88 | .printTitle {
 89 |     color: #015CAE !important;
 90 | }
 91 | 
 92 | body.print h1 {color: #015CAE !important; font-size:28px !important;}
 93 | body.print h2 {color: #595959 !important; font-size:20px !important;}
 94 | body.print h3 {color: #E50E51 !important; font-size:14px !important;}
 95 | body.print h4 {color: #679DCE !important; font-size:14px; font-style: italic !important;}
 96 | 
 97 | .anchorjs-link:hover {
 98 |     color: #216f9b;
 99 | }
100 | 
101 | div.sidebarTitle {
102 |     color: #015CAE;
103 | }
104 | 
105 | li.sidebarTitle {
106 |   margin-top:20px;
107 |     font-weight:normal;
108 |     font-size:130%;
109 |     color: #ED1951;
110 |     margin-bottom:10px;
111 |     margin-left: 5px;
112 | 
113 | }
114 | 
115 | .navbar-inverse .navbar-toggle:focus, .navbar-inverse .navbar-toggle:hover {
116 |     background-color: #015CAE;
117 | }
118 | 
119 | .navbar-inverse .navbar-toggle {
120 |     border-color: #015CAE;
121 | }
122 | 


--------------------------------------------------------------------------------
/analyze/vocab_size_results.csv:
--------------------------------------------------------------------------------
 1 | tokenizer_name,min_repeat,vocab_size,perc_of_max
 2 | 5,1,"125,549,318",1
 3 | 4,1,"98,424,631",0.783951936720198
 4 | 3,1,"54,989,810",0.437993697424943
 5 | 2,1,"14,041,534",0.11184078275917
 6 | sk,1,"8,532,143",0.067958497393033
 7 | 1,1,"729,252",0.005808490333655
 8 | 5,2,"27,970,086",1
 9 | 4,2,"25,744,143",0.920417012661313
10 | 3,2,"18,448,509",0.659579988420486
11 | 2,2,"6,325,498",0.226152254233326
12 | sk,2,"2,850,645",0.101917634432729
13 | 1,2,"406,894",0.014547470465411
14 | 3,3,"8,372,240",1
15 | 4,3,"8,191,498",0.978411751215923
16 | 5,3,"6,085,108",0.726819584722846
17 | 2,3,"3,789,551",0.452632867667434
18 | sk,3,"1,342,343",0.160332599161037
19 | 1,3,"287,651",0.03435771072019
20 | 3,4,"5,828,969",1
21 | 4,4,"5,310,830",0.911109666220561
22 | 5,4,"3,769,273",0.646644886943128
23 | 2,4,"2,888,392",0.495523650923517
24 | sk,4,"970,502",0.166496339232547
25 | 1,4,"237,935",0.040819397049461
26 | 3,5,"4,226,673",1
27 | 4,5,"3,439,428",0.813743575620825
28 | 2,5,"2,300,277",0.544228758647759
29 | 5,5,"2,138,736",0.506009336421341
30 | sk,5,"723,230",0.171110942341648
31 | 1,5,"202,138",0.047824376288395
32 | 3,6,"3,402,188",1
33 | 4,6,"2,657,524",0.781122030881303
34 | 2,6,"1,951,263",0.573531797772492
35 | 5,6,"1,608,265",0.472714911698001
36 | sk,6,"598,500",0.175916204513096
37 | 1,6,"179,661",0.052807487416921
38 | 3,7,"2,794,275",1
39 | 4,7,"2,069,573",0.740647574057671
40 | 2,7,"1,687,958",0.604077265122438
41 | 5,7,"1,180,406",0.422437304846517
42 | sk,7,"502,072",0.17967880756189
43 | 1,7,"160,993",0.057615302717163
44 | 3,8,"2,395,570",1
45 | 4,8,"1,725,232",0.720175991517676
46 | 2,8,"1,502,163",0.627058695842743
47 | 5,8,"966,156",0.403309442011713
48 | sk,8,"439,701",0.183547548182687
49 | 1,8,"147,866",0.061724766965691
50 | 3,9,"2,078,106",1
51 | 4,9,"1,447,165",0.696386517338384
52 | 2,9,"1,349,900",0.649581878883945
53 | 5,9,"783,519",0.377035146426602
54 | sk,9,"388,047",0.186731090714333
55 | 1,9,"136,421",0.065646795687997
56 | 3,10,"1,843,549",1
57 | 4,10,"1,257,831",0.682287804663722
58 | 2,10,"1,232,726",0.668670048911095
59 | 5,10,"671,689",0.364345618152813
60 | sk,10,"350,529",0.190138152010063
61 | 1,10,"127,824",0.069335829967091
62 | 3,15,"1,162,809",1
63 | 2,15,"867,345",0.745904959455938
64 | 4,15,"730,842",0.628514227186064
65 | 5,15,"366,007",0.314761065660826
66 | sk,15,"236,877",0.203711013588646
67 | 1,15,"98,683",0.084866044208464
68 | 3,20,"843,965",1
69 | 2,20,"678,065",0.803427867269377
70 | 4,20,"504,692",0.598001101941431
71 | 5,20,"243,773",0.288842546788078
72 | sk,20,"181,448",0.214994697647414
73 | 1,20,"83,201",0.098583472063415
74 | 3,30,"539,543",1
75 | 2,30,"479,622",0.888941196531138
76 | 4,30,"301,530",0.558861851604043
77 | 5,30,"138,541",0.256774714897608
78 | sk,30,"125,883",0.233314119541909
79 | 1,30,"65,817",0.12198657011582
80 | 3,40,"392,935",1
81 | 2,40,"374,555",0.953223815643809
82 | 4,40,"209,929",0.534258846883072
83 | sk,40,"97,334",0.247710181073205
84 | 5,40,"93,242",0.237296244926006
85 | 1,40,"55,910",0.142288164709176
86 | 2,50,"309,045",1
87 | 3,50,"307,022",0.993454027730589
88 | 4,50,"158,562",0.513070912003106
89 | sk,50,"79,897",0.258528693232377
90 | 5,50,"68,758",0.222485398566552
91 | 1,50,"49,600",0.160494426378036
92 | 2,100,"169,444",1
93 | 3,100,"142,447",0.84067302471613
94 | 4,100,"66,126",0.390252826892661
95 | sk,100,"44,048",0.259956091688109
96 | 1,100,"34,234",0.202037251245249
97 | 5,100,"26,858",0.158506645263332
98 | 


--------------------------------------------------------------------------------
/docs/_includes/topnav.html:
--------------------------------------------------------------------------------
 1 | <!-- Navigation -->
 2 | <nav class="navbar navbar-inverse navbar-static-top">
 3 |     <div class="container topnavlinks">
 4 |         <div class="navbar-header">
 5 |             <button type="button" class="navbar-toggle" data-toggle="collapse" data-target="#bs-example-navbar-collapse-1">
 6 |                 <span class="sr-only">Toggle navigation</span>
 7 |                 <span class="icon-bar"></span>
 8 |                 <span class="icon-bar"></span>
 9 |                 <span class="icon-bar"></span>
10 |             </button>
11 |             <a class="fa fa-home fa-lg navbar-brand" href="index.html">&nbsp;<span class="projectTitle"> {{site.topnav_title}}</span></a>
12 |         </div>
13 |         <div class="collapse navbar-collapse" id="bs-example-navbar-collapse-1">
14 |             <ul class="nav navbar-nav navbar-right">
15 |                 <!-- toggle sidebar button -->
16 |                 <li><a id="tg-sb-link" href="#"><i id="tg-sb-icon" class="fa fa-toggle-on"></i> Nav</a></li>
17 |                 <!-- entries without drop-downs appear here -->
18 | 
19 | {% assign topnav = site.data[page.topnav] %}
20 | {% assign topnav_dropdowns = site.data[page.topnav].topnav_dropdowns %}
21 | 
22 |                 {% for entry in topnav.topnav %}
23 |                 {% for item in entry.items %}
24 |                 {% if item.external_url %}
25 |                 <li><a href="{{item.external_url}}" target="_blank">{{item.title}}</a></li>
26 |                 {% elsif page.url contains item.url %}
27 |                 <li class="active"><a href="{{item.url | remove: "/"}}">{{item.title}}</a></li>
28 |                 {% else %}
29 |                 <li><a href="{{item.url | remove: "/"}}">{{item.title}}</a></li>
30 |                 {% endif %}
31 |                 {% endfor %}
32 |                 {% endfor %}
33 |                 <!-- entries with drop-downs appear here -->
34 |                 <!-- conditional logic to control which topnav appears for the audience defined in the configuration file.-->
35 |                 {% for entry in topnav_dropdowns %}
36 |                 {% for folder in entry.folders %}
37 |                 <li class="dropdown">
38 |                     <a href="#" class="dropdown-toggle" data-toggle="dropdown">{{ folder.title }}<b class="caret"></b></a>
39 |                     <ul class="dropdown-menu">
40 |                         {% for folderitem in folder.folderitems %}
41 |                         {% if folderitem.external_url %}
42 |                         <li><a href="{{folderitem.external_url}}" target="_blank">{{folderitem.title}}</a></li>
43 |                         {% elsif page.url contains folderitem.url %}
44 |                         <li class="dropdownActive"><a href="{{folderitem.url |  remove: "/"}}">{{folderitem.title}}</a></li>
45 |                         {% else %}
46 |                         <li><a href="{{folderitem.url | remove: "/"}}">{{folderitem.title}}</a></li>
47 |                         {% endif %}
48 |                         {% endfor %}
49 |                     </ul>
50 |                 </li>
51 |                 {% endfor %}
52 |                 {% endfor %}
53 |                 {% if site.google_search %}
54 |                 <li>
55 |                     {% include search_google_custom.html %}
56 |                 </li>
57 |                 {% endif %}
58 |             </ul>
59 |         </div>
60 |         </div>
61 |         <!-- /.container -->
62 | </nav>
63 | 


--------------------------------------------------------------------------------
/docs/js/jquery.navgoco.min.js:
--------------------------------------------------------------------------------
1 | /*
2 |  * jQuery Navgoco Menus Plugin v0.2.1 (2014-04-11)
3 |  * https://github.com/tefra/navgoco
4 |  *
5 |  * Copyright (c) 2014 Chris T (@tefra)
6 |  * BSD - https://github.com/tefra/navgoco/blob/master/LICENSE-BSD
7 |  */
8 | !function(a){"use strict";var b=function(b,c,d){return this.el=b,this.$el=a(b),this.options=c,this.uuid=this.$el.attr("id")?this.$el.attr("id"):d,this.state={},this.init(),this};b.prototype={init:function(){var b=this;b._load(),b.$el.find("ul").each(function(c){var d=a(this);d.attr("data-index",c),b.options.save&&b.state.hasOwnProperty(c)?(d.parent().addClass(b.options.openClass),d.show()):d.parent().hasClass(b.options.openClass)?(d.show(),b.state[c]=1):d.hide()});var c=a("<span></span>").prepend(b.options.caretHtml),d=b.$el.find("li > a");b._trigger(c,!1),b._trigger(d,!0),b.$el.find("li:has(ul) > a").prepend(c)},_trigger:function(b,c){var d=this;b.on("click",function(b){b.stopPropagation();var e=c?a(this).next():a(this).parent().next(),f=!1;if(c){var g=a(this).attr("href");f=void 0===g||""===g||"#"===g}if(e=e.length>0?e:!1,d.options.onClickBefore.call(this,b,e),!c||e&&f)b.preventDefault(),d._toggle(e,e.is(":hidden")),d._save();else if(d.options.accordion){var h=d.state=d._parents(a(this));d.$el.find("ul").filter(":visible").each(function(){var b=a(this),c=b.attr("data-index");h.hasOwnProperty(c)||d._toggle(b,!1)}),d._save()}d.options.onClickAfter.call(this,b,e)})},_toggle:function(b,c){var d=this,e=b.attr("data-index"),f=b.parent();if(d.options.onToggleBefore.call(this,b,c),c){if(f.addClass(d.options.openClass),b.slideDown(d.options.slide),d.state[e]=1,d.options.accordion){var g=d.state=d._parents(b);g[e]=d.state[e]=1,d.$el.find("ul").filter(":visible").each(function(){var b=a(this),c=b.attr("data-index");g.hasOwnProperty(c)||d._toggle(b,!1)})}}else f.removeClass(d.options.openClass),b.slideUp(d.options.slide),d.state[e]=0;d.options.onToggleAfter.call(this,b,c)},_parents:function(b,c){var d={},e=b.parent(),f=e.parents("ul");return f.each(function(){var b=a(this),e=b.attr("data-index");return e?void(d[e]=c?b:1):!1}),d},_save:function(){if(this.options.save){var b={};for(var d in this.state)1===this.state[d]&&(b[d]=1);c[this.uuid]=this.state=b,a.cookie(this.options.cookie.name,JSON.stringify(c),this.options.cookie)}},_load:function(){if(this.options.save){if(null===c){var b=a.cookie(this.options.cookie.name);c=b?JSON.parse(b):{}}this.state=c.hasOwnProperty(this.uuid)?c[this.uuid]:{}}},toggle:function(b){var c=this,d=arguments.length;if(1>=d)c.$el.find("ul").each(function(){var d=a(this);c._toggle(d,b)});else{var e,f={},g=Array.prototype.slice.call(arguments,1);d--;for(var h=0;d>h;h++){e=g[h];var i=c.$el.find('ul[data-index="'+e+'"]').first();if(i&&(f[e]=i,b)){var j=c._parents(i,!0);for(var k in j)f.hasOwnProperty(k)||(f[k]=j[k])}}for(e in f)c._toggle(f[e],b)}c._save()},destroy:function(){a.removeData(this.$el),this.$el.find("li:has(ul) > a").unbind("click"),this.$el.find("li:has(ul) > a > span").unbind("click")}},a.fn.navgoco=function(c){if("string"==typeof c&&"_"!==c.charAt(0)&&"init"!==c)var d=!0,e=Array.prototype.slice.call(arguments,1);else c=a.extend({},a.fn.navgoco.defaults,c||{}),a.cookie||(c.save=!1);return this.each(function(f){var g=a(this),h=g.data("navgoco");h||(h=new b(this,d?a.fn.navgoco.defaults:c,f),g.data("navgoco",h)),d&&h[c].apply(h,e)})};var c=null;a.fn.navgoco.defaults={caretHtml:"",accordion:!1,openClass:"open",save:!0,cookie:{name:"navgoco",expires:!1,path:"/"},slide:{duration:400,easing:"swing"},onClickBefore:a.noop,onClickAfter:a.noop,onToggleBefore:a.noop,onToggleAfter:a.noop}}(jQuery);


--------------------------------------------------------------------------------
/docs/_includes/initialize_shuffle.html:
--------------------------------------------------------------------------------
  1 | <script type="text/javascript">
  2 | $(document).ready(function() {
  3 |     $('#toc').toc({ minimumHeaders: 0, listType: 'ul', showSpeed: 0, headers: 'h2,h3,h4' });
  4 | });
  5 | 
  6 | </script>
  7 | <!-- shuffle -->
  8 | <script>
  9 | var shuffleme = (function( $ ) {
 10 |   'use strict';
 11 | 
 12 |   var $grid = $('#grid'),
 13 |       $filterOptions = $('.filter-options'),
 14 |       $sizer = $grid.find('.shuffle_sizer'),
 15 | 
 16 |   init = function() {
 17 | 
 18 |     // None of these need to be executed synchronously
 19 |     setTimeout(function() {
 20 |       listen();
 21 |       setupFilters();
 22 |     }, 100);
 23 | 
 24 |     // instantiate the plugin
 25 |     $grid.shuffle({
 26 |       itemSelector: '[class*="col-"]',
 27 |       sizer: $sizer    
 28 |     });
 29 |   },
 30 | 
 31 |   // Set up button clicks
 32 |   setupFilters = function() {
 33 |     var $btns = $filterOptions.children();
 34 |     $btns.on('click', function() {
 35 |       var $this = $(this),
 36 |           isActive = $this.hasClass( 'active' ),
 37 |           group = isActive ? 'all' : $this.data('group');
 38 | 
 39 |       // Hide current label, show current label in title
 40 |       if ( !isActive ) {
 41 |         $('.filter-options .active').removeClass('active');
 42 |       }
 43 | 
 44 |       $this.toggleClass('active');
 45 | 
 46 |       // Filter elements
 47 |       $grid.shuffle( 'shuffle', group );
 48 |     });
 49 | 
 50 |     $btns = null;
 51 |   },
 52 | 
 53 |   // Re layout shuffle when images load. This is only needed
 54 |   // below 768 pixels because the .picture-item height is auto and therefore
 55 |   // the height of the picture-item is dependent on the image
 56 |   // I recommend using imagesloaded to determine when an image is loaded
 57 |   // but that doesn't support IE7
 58 |   listen = function() {
 59 |     var debouncedLayout = $.throttle( 300, function() {
 60 |       $grid.shuffle('update');
 61 |     });
 62 | 
 63 |     // Get all images inside shuffle
 64 |     $grid.find('img').each(function() {
 65 |       var proxyImage;
 66 | 
 67 |       // Image already loaded
 68 |       if ( this.complete && this.naturalWidth !== undefined ) {
 69 |         return;
 70 |       }
 71 | 
 72 |       // If none of the checks above matched, simulate loading on detached element.
 73 |       proxyImage = new Image();
 74 |       $( proxyImage ).on('load', function() {
 75 |         $(this).off('load');
 76 |         debouncedLayout();
 77 |       });
 78 | 
 79 |       proxyImage.src = this.src;
 80 |     });
 81 | 
 82 |     // Because this method doesn't seem to be perfect.
 83 |     setTimeout(function() {
 84 |       debouncedLayout();
 85 |     }, 500);
 86 |   };      
 87 | 
 88 |   return {
 89 |     init: init
 90 |   };
 91 | }( jQuery ));
 92 | 
 93 | 
 94 | 
 95 | $(document).ready(function() {
 96 |   shuffleme.init();
 97 | });
 98 | 
 99 |     </script>
100 | 
101 |     <!-- new attempt-->
102 | 
103 |     <script>
104 |     $(document).ready(function() {
105 |      
106 |     /* initialize shuffle plugin */
107 |     var $grid = $('#grid');
108 |          
109 |     $grid.shuffle({
110 |         itemSelector: '.item' // the selector for the items in the grid
111 |     });
112 |  
113 | });</script>
114 | 
115 | <script>
116 | $('#filter a').click(function (e) {
117 |     e.preventDefault();
118 |          
119 |     // set active class
120 |     $('#filter a').removeClass('active');
121 |     $(this).addClass('active');
122 |          
123 |     // get group name from clicked item
124 |     var groupName = $(this).attr('data-group');
125 |          
126 |     // reshuffle grid
127 |     $grid.shuffle('shuffle', groupName );
128 | });</script>
129 | 
130 | 
131 | 


--------------------------------------------------------------------------------
/docs/css/printstyles.css:
--------------------------------------------------------------------------------
  1 | 
  2 | /*body.print .container {max-width: 650px;}*/
  3 | 
  4 | body {
  5 |     font-size:14px;
  6 | }
  7 | .nav ul li a {border-top:0px; background-color:transparent; color: #808080; }
  8 | #navig a[href] {color: #595959 !important;}
  9 | table .table {max-width:650px;}
 10 | 
 11 | #navig li.sectionHead {font-weight: bold; font-size: 18px; color: #595959 !important; }
 12 | #navig li {font-weight: normal; }
 13 | 
 14 | #navig a[href]::after { content: leader(".") target-counter(attr(href), page); }
 15 | 
 16 | a[href]::after {
 17 |     content: " (page " target-counter(attr(href), page) ")"
 18 | }
 19 | 
 20 | a[href^="http:"]::after, a[href^="https:"]::after {
 21 |     content: "";
 22 | }
 23 | 
 24 | a[href] {
 25 |     color: blue !important;
 26 | }
 27 | a[href*="mailto"]::after, a[data-toggle="tooltip"]::after, a[href].noCrossRef::after {
 28 |     content: "";
 29 | }
 30 | 
 31 | 
 32 | @page {
 33 |     margin: 60pt 90pt 60pt 90pt;
 34 |     font-family: sans-serif;
 35 |     font-style:none;
 36 |     color: gray;
 37 | 
 38 | }
 39 | 
 40 | .printTitle {
 41 |     line-height:30pt;
 42 |     font-size:27pt;
 43 |     font-weight: bold;
 44 |     letter-spacing: -.5px;
 45 |     margin-bottom:25px;
 46 | }
 47 | 
 48 | .printSubtitle {
 49 |     font-size: 19pt;
 50 |     color: #cccccc !important;
 51 |     font-family: "Grotesque MT Light";
 52 |     line-height: 22pt;
 53 |     letter-spacing: -.5px;
 54 |     margin-bottom:20px;
 55 | }
 56 | .printTitleArea hr {
 57 |     color: #999999 !important;
 58 |     height: 2px;
 59 |     width: 100%;
 60 | }
 61 | 
 62 | .printTitleImage {
 63 |     max-width:300px;
 64 |     margin-bottom:200px;
 65 | }
 66 | 
 67 | 
 68 | .printTitleImage {
 69 |     max-width: 250px;
 70 | }
 71 | 
 72 | #navig {
 73 |     /*page-break-before: always;*/
 74 | }
 75 | 
 76 | .copyrightBoilerplate {
 77 |     page-break-before:always;
 78 |     font-size:14px;
 79 | }
 80 | 
 81 | .lastGeneratedDate {
 82 |     font-style: italic;
 83 |     font-size:14px;
 84 |     color: gray;
 85 | }
 86 | 
 87 | .alert a {
 88 |     text-decoration: none !important;
 89 | }
 90 | 
 91 | 
 92 | body.title { page: title }
 93 | 
 94 | @page title {
 95 |     @top-left {
 96 |         content: " ";
 97 |     }
 98 |     @top-right {
 99 |         content: " "
100 |     }
101 |     @bottom-right {
102 |         content: " ";
103 |     }
104 |     @bottom-left {
105 |         content: " ";
106 |     }
107 | }
108 | 
109 | body.frontmatter { page: frontmatter }
110 | body.frontmatter {counter-reset: page 1}
111 | 
112 | 
113 | @page frontmatter {
114 |     @top-left {
115 |         content: prince-script(guideName);
116 |     }
117 |     @top-right {
118 |         content: prince-script(datestamp);
119 |     }
120 |     @bottom-right {
121 |         content: counter(page, lower-roman);
122 |     }
123 |     @bottom-left {
124 |         content: "youremail@domain.com";   }
125 | }
126 | 
127 | body.first_page {counter-reset: page 1}
128 | 
129 | h1 { string-set: doctitle content() }
130 | 
131 | @page {
132 |     @top-left {
133 |         content: string(doctitle);
134 |         font-size: 11px;
135 |         font-style: italic;
136 |     }
137 |     @top-right {
138 |         content: prince-script(datestamp);
139 |         font-size: 11px;
140 |     }
141 | 
142 |     @bottom-right {
143 |         content: "Page " counter(page);
144 |         font-size: 11px;
145 |     }
146 |     @bottom-left {
147 |         content: prince-script(guideName);
148 |         font-size: 11px;
149 |     }
150 | }
151 | .alert {
152 |     background-color: #fafafa !important;
153 |     border-color: #dedede !important;
154 |     color: black;
155 | }
156 | 
157 | pre {
158 |     background-color: #fafafa;
159 | }
160 | 


--------------------------------------------------------------------------------
/docs/css/syntax.css:
--------------------------------------------------------------------------------
 1 | .highlight  { background: #ffffff; }
 2 | .highlight .c { color: #999988; font-style: italic } /* Comment */
 3 | .highlight .err { color: #a61717; background-color: #e3d2d2 } /* Error */
 4 | .highlight .k { font-weight: bold } /* Keyword */
 5 | .highlight .o { font-weight: bold } /* Operator */
 6 | .highlight .cm { color: #999988; font-style: italic } /* Comment.Multiline */
 7 | .highlight .cp { color: #999999; font-weight: bold } /* Comment.Preproc */
 8 | .highlight .c1 { color: #999988; font-style: italic } /* Comment.Single */
 9 | .highlight .cs { color: #999999; font-weight: bold; font-style: italic } /* Comment.Special */
10 | .highlight .gd { color: #000000; background-color: #ffdddd } /* Generic.Deleted */
11 | .highlight .gd .x { color: #000000; background-color: #ffaaaa } /* Generic.Deleted.Specific */
12 | .highlight .ge { font-style: italic } /* Generic.Emph */
13 | .highlight .gr { color: #aa0000 } /* Generic.Error */
14 | .highlight .gh { color: #999999 } /* Generic.Heading */
15 | .highlight .gi { color: #000000; background-color: #ddffdd } /* Generic.Inserted */
16 | .highlight .gi .x { color: #000000; background-color: #aaffaa } /* Generic.Inserted.Specific */
17 | .highlight .go { color: #888888 } /* Generic.Output */
18 | .highlight .gp { color: #555555 } /* Generic.Prompt */
19 | .highlight .gs { font-weight: bold } /* Generic.Strong */
20 | .highlight .gu { color: #aaaaaa } /* Generic.Subheading */
21 | .highlight .gt { color: #aa0000 } /* Generic.Traceback */
22 | .highlight .kc { font-weight: bold } /* Keyword.Constant */
23 | .highlight .kd { font-weight: bold } /* Keyword.Declaration */
24 | .highlight .kp { font-weight: bold } /* Keyword.Pseudo */
25 | .highlight .kr { font-weight: bold } /* Keyword.Reserved */
26 | .highlight .kt { color: #445588; font-weight: bold } /* Keyword.Type */
27 | .highlight .m { color: #009999 } /* Literal.Number */
28 | .highlight .s { color: #d14 } /* Literal.String */
29 | .highlight .na { color: #008080 } /* Name.Attribute */
30 | .highlight .nb { color: #0086B3 } /* Name.Builtin */
31 | .highlight .nc { color: #445588; font-weight: bold } /* Name.Class */
32 | .highlight .no { color: #008080 } /* Name.Constant */
33 | .highlight .ni { color: #800080 } /* Name.Entity */
34 | .highlight .ne { color: #990000; font-weight: bold } /* Name.Exception */
35 | .highlight .nf { color: #990000; font-weight: bold } /* Name.Function */
36 | .highlight .nn { color: #555555 } /* Name.Namespace */
37 | .highlight .nt { color: #000080 } /* Name.Tag */
38 | .highlight .nv { color: #008080 } /* Name.Variable */
39 | .highlight .ow { font-weight: bold } /* Operator.Word */
40 | .highlight .w { color: #bbbbbb } /* Text.Whitespace */
41 | .highlight .mf { color: #009999 } /* Literal.Number.Float */
42 | .highlight .mh { color: #009999 } /* Literal.Number.Hex */
43 | .highlight .mi { color: #009999 } /* Literal.Number.Integer */
44 | .highlight .mo { color: #009999 } /* Literal.Number.Oct */
45 | .highlight .sb { color: #d14 } /* Literal.String.Backtick */
46 | .highlight .sc { color: #d14 } /* Literal.String.Char */
47 | .highlight .sd { color: #d14 } /* Literal.String.Doc */
48 | .highlight .s2 { color: #d14 } /* Literal.String.Double */
49 | .highlight .se { color: #d14 } /* Literal.String.Escape */
50 | .highlight .sh { color: #d14 } /* Literal.String.Heredoc */
51 | .highlight .si { color: #d14 } /* Literal.String.Interpol */
52 | .highlight .sx { color: #d14 } /* Literal.String.Other */
53 | .highlight .sr { color: #009926 } /* Literal.String.Regex */
54 | .highlight .s1 { color: #d14 } /* Literal.String.Single */
55 | .highlight .ss { color: #990073 } /* Literal.String.Symbol */
56 | .highlight .bp { color: #999999 } /* Name.Builtin.Pseudo */
57 | .highlight .vc { color: #008080 } /* Name.Variable.Class */
58 | .highlight .vg { color: #008080 } /* Name.Variable.Global */
59 | .highlight .vi { color: #008080 } /* Name.Variable.Instance */
60 | .highlight .il { color: #009999 } /* Literal.Number.Integer.Long */


--------------------------------------------------------------------------------
/docs/js/toc.js:
--------------------------------------------------------------------------------
 1 | // https://github.com/ghiculescu/jekyll-table-of-contents
 2 | // this library modified by fastai to:
 3 | // - update the location.href with the correct anchor when a toc item is clicked on
 4 | (function($){
 5 |   $.fn.toc = function(options) {
 6 |     var defaults = {
 7 |       noBackToTopLinks: false,
 8 |       title: '',
 9 |       minimumHeaders: 3,
10 |       headers: 'h1, h2, h3, h4',
11 |       listType: 'ol', // values: [ol|ul]
12 |       showEffect: 'show', // values: [show|slideDown|fadeIn|none]
13 |       showSpeed: 'slow' // set to 0 to deactivate effect
14 |     },
15 |     settings = $.extend(defaults, options);
16 | 
17 |     var headers = $(settings.headers).filter(function() {
18 |       // get all headers with an ID
19 |       var previousSiblingName = $(this).prev().attr( "name" );
20 |       if (!this.id && previousSiblingName) {
21 |         this.id = $(this).attr( "id", previousSiblingName.replace(/\./g, "-") );
22 |       }
23 |       return this.id;
24 |     }), output = $(this);
25 |     if (!headers.length || headers.length < settings.minimumHeaders || !output.length) {
26 |       return;
27 |     }
28 | 
29 |     if (0 === settings.showSpeed) {
30 |       settings.showEffect = 'none';
31 |     }
32 | 
33 |     var render = {
34 |       show: function() { output.hide().html(html).show(settings.showSpeed); },
35 |       slideDown: function() { output.hide().html(html).slideDown(settings.showSpeed); },
36 |       fadeIn: function() { output.hide().html(html).fadeIn(settings.showSpeed); },
37 |       none: function() { output.html(html); }
38 |     };
39 | 
40 |     var get_level = function(ele) { return parseInt(ele.nodeName.replace("H", ""), 10); }
41 |     var highest_level = headers.map(function(_, ele) { return get_level(ele); }).get().sort()[0];
42 |     //var return_to_top = '<i class="glyphicon glyphicon-upload back-to-top"></i>';
43 |     // other nice icons that can be used instead: glyphicon-upload glyphicon-hand-up glyphicon-chevron-up glyphicon-menu-up glyphicon-triangle-top
44 |     var level = get_level(headers[0]),
45 |       this_level,
46 |       html = settings.title + " <"+settings.listType+">";
47 |     headers.on('click', function() {
48 |       if (!settings.noBackToTopLinks) {
49 |         var pos = $(window).scrollTop();
50 |         window.location.hash = this.id;
51 |         $(window).scrollTop(pos);
52 |       }
53 |     })
54 |     .addClass('clickable-header')
55 |     .each(function(_, header) {
56 |       base_url = window.location.href;
57 |       base_url = base_url.replace(/#.*$/, "");
58 |       this_level = get_level(header);
59 |       //if (!settings.noBackToTopLinks && this_level > 1) {
60 |       //  $(header).addClass('top-level-header').before(return_to_top);
61 |       //}
62 |       txt = header.textContent.split('¶')[0].split(/\[(test|source)\]/)[0];
63 |       if (!txt) {return;}
64 |       if (this_level === level) // same level as before; same indenting
65 |         html += "<li><a href='" + base_url + "#" + header.id + "'>" + txt + "</a>";
66 |       else if (this_level <= level){ // higher level than before; end parent ol
67 |         for(i = this_level; i < level; i++) {
68 |           html += "</li></"+settings.listType+">"
69 |         }
70 |         html += "<li><a href='" + base_url + "#" + header.id + "'>" + txt + "</a>";
71 |       }
72 |       else if (this_level > level) { // lower level than before; expand the previous to contain a ol
73 |         for(i = this_level; i > level; i--) {
74 |           html += "<"+settings.listType+">"+((i-level == 2) ? "<li class=\"hide_content\">" : "<li>")
75 |         }
76 |         html += "<a href='" + base_url + "#" + header.id + "'>" + txt + "</a>";
77 |       }
78 |       level = this_level; // update for the next one
79 |     });
80 |     html += "</"+settings.listType+">";
81 |     if (!settings.noBackToTopLinks) {
82 |       $(document).on('click', '.back-to-top', function() {
83 |         $(window).scrollTop(0);
84 |         window.location.hash = '';
85 |       });
86 |     }
87 | 
88 |     render[settings.showEffect]();
89 |   };
90 | })(jQuery);
91 | 


--------------------------------------------------------------------------------
/docs/_layouts/default.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | <html>
  3 | <head>
  4 |     {% include head.html %}
  5 |     <script>
  6 |         $(document).ready(function() {
  7 |             // Initialize navgoco with default options
  8 |             $("#mysidebar").navgoco({
  9 |                 caretHtml: '',
 10 |                 accordion: true,
 11 |                 openClass: 'active', // open
 12 |                 save: false, // leave false or nav highlighting doesn't work right
 13 |                 cookie: {
 14 |                     name: 'navgoco',
 15 |                     expires: false,
 16 |                     path: '/'
 17 |                 },
 18 |                 slide: {
 19 |                     duration: 400,
 20 |                     easing: 'swing'
 21 |                 }
 22 |             });
 23 | 
 24 |             $("#collapseAll").click(function(e) {
 25 |                 e.preventDefault();
 26 |                 $("#mysidebar").navgoco('toggle', false);
 27 |             });
 28 | 
 29 |             $("#expandAll").click(function(e) {
 30 |                 e.preventDefault();
 31 |                 $("#mysidebar").navgoco('toggle', true);
 32 |             });
 33 | 
 34 |             // activate menu items where href is matching to current page
 35 |             $("#mysidebar a[href='" + location.pathname.match(/\/([^\/]*)$/)[1] + "']")
 36 |                 .parents('li').addClass('active')
 37 |                 .parents('ul').css('display', 'block');
 38 |         });
 39 | 
 40 |     </script>
 41 |     <script>
 42 |         $(function () {
 43 |             $('[data-toggle="tooltip"]').tooltip()
 44 |         })
 45 |     </script>
 46 |     <script>
 47 |         $(document).ready(function() {
 48 |             $("#tg-sb-link").click(function() {
 49 |                 $("#tg-sb-sidebar").toggle();
 50 |                 $("#tg-sb-content").toggleClass('col-md-9');
 51 |                 $("#tg-sb-content").toggleClass('col-md-12');
 52 |                 $("#tg-sb-icon").toggleClass('fa-toggle-on');
 53 |                 $("#tg-sb-icon").toggleClass('fa-toggle-off');
 54 |             });
 55 |         });
 56 |     </script>
 57 |     {% if page.datatable == true %}
 58 |     <!-- Include the standard DataTables bits -->
 59 |     <link rel="stylesheet" type="text/css" href="//cdn.datatables.net/1.10.13/css/jquery.dataTables.css">
 60 |     <script type="text/javascript" charset="utf8" src="//cdn.datatables.net/1.10.13/js/jquery.dataTables.js"></script>
 61 |     <!-- First, this walks through the tables that occur between ...-begin
 62 |          and ...-end and add the "datatable" class to them.
 63 |          Then it invokes DataTable's standard initializer
 64 |          Credit here: http://www.beardedhacker.com/blog/2015/08/28/add-class-attribute-to-markdown-table/
 65 |       -->
 66 |     <script>
 67 |       $(document).ready(function(){
 68 |           $('div.datatable-begin').nextUntil('div.datatable-end', 'table').addClass('display');
 69 |           $('table.display').DataTable( {
 70 |               paging: true,
 71 |               stateSave: true,
 72 |               searching: true
 73 |           });
 74 |        });
 75 |     </script>
 76 |     {% endif %}
 77 | 
 78 | </head>
 79 | <body>
 80 | {% include topnav.html %}
 81 | <!-- Page Content -->
 82 | <div class="container">
 83 |   <div id="main">
 84 |     <!-- Content Row -->
 85 |     <div class="row">
 86 |         {% assign content_col_size = "col-md-12" %}
 87 |         {% unless page.hide_sidebar %}
 88 |             <!-- Sidebar Column -->
 89 |             <div class="col-md-3" id="tg-sb-sidebar">
 90 |                 {% include sidebar.html %}
 91 |             </div>
 92 |             {% assign content_col_size = "col-md-9" %}
 93 |         {% endunless %}
 94 | 
 95 |         <!-- Content Column -->
 96 |         <div class="{{content_col_size}}" id="tg-sb-content">
 97 |             {{content}}
 98 |         </div>
 99 |     <!-- /.row -->
100 | </div>
101 | <!-- /.container -->
102 | </div>
103 | <!-- /#main -->
104 |     </div>
105 | 
106 | </body>
107 | {% if site.google_analytics %}
108 | {% include google_analytics.html %}
109 | {% endif %}
110 | </html>
111 | 


--------------------------------------------------------------------------------
/20_utils.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "# default_exp utils"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "markdown",
 14 |    "metadata": {},
 15 |    "source": [
 16 |     "# Utils\n",
 17 |     "\n",
 18 |     "> some utils :) "
 19 |    ]
 20 |   },
 21 |   {
 22 |    "cell_type": "code",
 23 |    "execution_count": null,
 24 |    "metadata": {},
 25 |    "outputs": [
 26 |     {
 27 |      "name": "stderr",
 28 |      "output_type": "stream",
 29 |      "text": [
 30 |       "/home/dperezrada/anaconda3/envs/keywords2vec/lib/python3.7/site-packages/fastprogress/fastprogress.py:102: UserWarning: Couldn't import ipywidgets properly, progress bar will use console behavior\n",
 31 |       "  warn(\"Couldn't import ipywidgets properly, progress bar will use console behavior\")\n"
 32 |      ]
 33 |     }
 34 |    ],
 35 |    "source": [
 36 |     "#export\n",
 37 |     "from keywords2vec.imports import *\n",
 38 |     "\n",
 39 |     "from fastprogress.fastprogress import progress_bar\n",
 40 |     "from concurrent.futures import ProcessPoolExecutor, as_completed\n"
 41 |    ]
 42 |   },
 43 |   {
 44 |    "cell_type": "code",
 45 |    "execution_count": null,
 46 |    "metadata": {},
 47 |    "outputs": [],
 48 |    "source": [
 49 |     "#export\n",
 50 |     "\n",
 51 |     "# BEGIN From fastai\n",
 52 |     "def parallel(func, arr, max_workers=-1):\n",
 53 |     "    if max_workers == -1:\n",
 54 |     "        max_workers = num_cpus(2)\n",
 55 |     "    with ProcessPoolExecutor(max_workers=max_workers) as ex:\n",
 56 |     "        futures = [ex.submit(func, arr_el) for arr_el in arr]\n",
 57 |     "        results = []\n",
 58 |     "        for f in progress_bar(as_completed(futures), total=len(arr)):\n",
 59 |     "            results.append(f.result())\n",
 60 |     "        return results\n",
 61 |     "\n",
 62 |     "def num_cpus(n_cpus):\n",
 63 |     "    try:\n",
 64 |     "        return len(os.sched_getaffinity(0))\n",
 65 |     "    except AttributeError:\n",
 66 |     "        return os.cpu_count()\n",
 67 |     "    if n_cpus > 0:\n",
 68 |     "        return n_cpus\n",
 69 |     "    \"\"\"Get number of cpus.\"\"\"\n",
 70 |     "# END From fastai\n",
 71 |     "\n",
 72 |     "\n",
 73 |     "def open_file(filepath, options):\n",
 74 |     "    if filepath[-3:] == \".gz\":\n",
 75 |     "        return gzip.open(filepath, options)\n",
 76 |     "    return open(filepath, options)\n",
 77 |     "\n",
 78 |     "\n",
 79 |     "def chunk_of_text(_file, chunk_size=-1):\n",
 80 |     "    index = 0\n",
 81 |     "    if chunk_size == -1:\n",
 82 |     "        chunk_size = 200\n",
 83 |     "    while True:\n",
 84 |     "        line = _file.readline()\n",
 85 |     "        if not line:\n",
 86 |     "            break\n",
 87 |     "        for sentence in line.split(\".\"):\n",
 88 |     "            if sentence.strip():\n",
 89 |     "                yield sentence.strip()\n",
 90 |     "        if index >= chunk_size:\n",
 91 |     "            break\n",
 92 |     "        index += 1\n",
 93 |     "\n",
 94 |     "\n",
 95 |     "def get_file_chunks(start_index, filepath, lines_chunk, sample_size):\n",
 96 |     "    _file = open_file(filepath, 'rt')\n",
 97 |     "    texts = []\n",
 98 |     "    break_by_sample = False\n",
 99 |     "    while True:\n",
100 |     "        next_n_lines = list(chunk_of_text(_file, lines_chunk))\n",
101 |     "        texts.append(\"\\n\".join(next_n_lines) + \"\\n\")\n",
102 |     "        if not next_n_lines:\n",
103 |     "            break\n",
104 |     "        start_index += lines_chunk\n",
105 |     "        if sample_size > 0 and start_index >= sample_size:\n",
106 |     "            break_by_sample = True\n",
107 |     "            break\n",
108 |     "    _file.close()\n",
109 |     "    return (start_index, texts, break_by_sample)\n"
110 |    ]
111 |   },
112 |   {
113 |    "cell_type": "code",
114 |    "execution_count": null,
115 |    "metadata": {},
116 |    "outputs": [],
117 |    "source": []
118 |   }
119 |  ],
120 |  "metadata": {
121 |   "kernelspec": {
122 |    "display_name": "Python 3",
123 |    "language": "python",
124 |    "name": "python3"
125 |   }
126 |  },
127 |  "nbformat": 4,
128 |  "nbformat_minor": 4
129 | }
130 | 


--------------------------------------------------------------------------------
/docs/utils.html:
--------------------------------------------------------------------------------
  1 | ---
  2 | 
  3 | title: Utils
  4 | 
  5 | keywords: fastai
  6 | sidebar: home_sidebar
  7 | 
  8 | summary: "some utils :) "
  9 | ---
 10 | <!--
 11 | 
 12 | #################################################
 13 | ### THIS FILE WAS AUTOGENERATED! DO NOT EDIT! ###
 14 | #################################################
 15 | # file to edit: 20_utils.ipynb
 16 | # command to build the docs after a change: nbdev_build_docs
 17 | 
 18 | -->
 19 | 
 20 | <div class="container" id="notebook-container">
 21 |     {% raw %}
 22 |         
 23 | <div class="cell border-box-sizing code_cell rendered">
 24 | 
 25 | </div>
 26 | <div class="cell border-box-sizing code_cell rendered">
 27 | 
 28 | </div>
 29 | <div class="cell border-box-sizing code_cell rendered">
 30 | 
 31 | </div>
 32 | <div class="cell border-box-sizing code_cell rendered">
 33 | 
 34 | <div class="output_wrapper">
 35 | <div class="output">
 36 | 
 37 | <div class="output_area">
 38 | 
 39 | 
 40 | <div class="output_markdown rendered_html output_subarea ">
 41 | <h4 id="parallel" class="doc_header"><code>parallel</code><a href="https://github.com/dperezrada/keywords2vec/tree/master/keywords2vec/utils.py#L15" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>parallel</code>(<strong><code>func</code></strong>, <strong><code>arr</code></strong>, <strong><code>max_workers</code></strong>=<em><code>-1</code></em>)</p>
 42 | </blockquote>
 43 | 
 44 | </div>
 45 | 
 46 | </div>
 47 | 
 48 | </div>
 49 | </div>
 50 | 
 51 | </div>
 52 | <div class="cell border-box-sizing code_cell rendered">
 53 | 
 54 | <div class="output_wrapper">
 55 | <div class="output">
 56 | 
 57 | <div class="output_area">
 58 | 
 59 | 
 60 | <div class="output_markdown rendered_html output_subarea ">
 61 | <h4 id="num_cpus" class="doc_header"><code>num_cpus</code><a href="https://github.com/dperezrada/keywords2vec/tree/master/keywords2vec/utils.py#L25" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>num_cpus</code>(<strong><code>n_cpus</code></strong>)</p>
 62 | </blockquote>
 63 | 
 64 | </div>
 65 | 
 66 | </div>
 67 | 
 68 | </div>
 69 | </div>
 70 | 
 71 | </div>
 72 | <div class="cell border-box-sizing code_cell rendered">
 73 | 
 74 | <div class="output_wrapper">
 75 | <div class="output">
 76 | 
 77 | <div class="output_area">
 78 | 
 79 | 
 80 | <div class="output_markdown rendered_html output_subarea ">
 81 | <h4 id="open_file" class="doc_header"><code>open_file</code><a href="https://github.com/dperezrada/keywords2vec/tree/master/keywords2vec/utils.py#L36" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>open_file</code>(<strong><code>filepath</code></strong>, <strong><code>options</code></strong>)</p>
 82 | </blockquote>
 83 | 
 84 | </div>
 85 | 
 86 | </div>
 87 | 
 88 | </div>
 89 | </div>
 90 | 
 91 | </div>
 92 | <div class="cell border-box-sizing code_cell rendered">
 93 | 
 94 | <div class="output_wrapper">
 95 | <div class="output">
 96 | 
 97 | <div class="output_area">
 98 | 
 99 | 
100 | <div class="output_markdown rendered_html output_subarea ">
101 | <h4 id="chunk_of_text" class="doc_header"><code>chunk_of_text</code><a href="https://github.com/dperezrada/keywords2vec/tree/master/keywords2vec/utils.py#L42" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>chunk_of_text</code>(<strong><code>_file</code></strong>, <strong><code>chunk_size</code></strong>=<em><code>-1</code></em>)</p>
102 | </blockquote>
103 | 
104 | </div>
105 | 
106 | </div>
107 | 
108 | </div>
109 | </div>
110 | 
111 | </div>
112 | <div class="cell border-box-sizing code_cell rendered">
113 | 
114 | <div class="output_wrapper">
115 | <div class="output">
116 | 
117 | <div class="output_area">
118 | 
119 | 
120 | <div class="output_markdown rendered_html output_subarea ">
121 | <h4 id="get_file_chunks" class="doc_header"><code>get_file_chunks</code><a href="https://github.com/dperezrada/keywords2vec/tree/master/keywords2vec/utils.py#L58" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>get_file_chunks</code>(<strong><code>start_index</code></strong>, <strong><code>filepath</code></strong>, <strong><code>lines_chunk</code></strong>, <strong><code>sample_size</code></strong>)</p>
122 | </blockquote>
123 | 
124 | </div>
125 | 
126 | </div>
127 | 
128 | </div>
129 | </div>
130 | 
131 | </div>
132 |     {% endraw %}
133 | </div>
134 |  
135 | 
136 | 


--------------------------------------------------------------------------------
/docs/_includes/head.html:
--------------------------------------------------------------------------------
 1 | <meta charset="utf-8">
 2 | <meta http-equiv="X-UA-Compatible" content="IE=edge">
 3 | <meta name="viewport" content="width=device-width, initial-scale=1">
 4 | <meta name="description" content="{% if page.summary %}{{ page.summary | strip_html | strip_newlines | truncate: 160 }}{% endif %}">
 5 | <meta name="keywords" content="{{page.tags}}{% if page.tags %}, {% endif %} {{page.keywords}}">
 6 | <title>{{ page.title }} | {{ site.site_title }}</title>
 7 | <link rel="stylesheet" href="{{ "/css/syntax.css" | prepend: site.baseurl }}">
 8 | 
 9 | <link rel="stylesheet" type="text/css" href="https://maxcdn.bootstrapcdn.com/font-awesome/4.7.0/css/font-awesome.min.css">
10 | <!--<link rel="stylesheet" type="text/css" href="/css/bootstrap.min.css">-->
11 | <link rel="stylesheet" href="{{ "/css/modern-business.css" | prepend: site.baseurl }}">
12 | <!-- Latest compiled and minified CSS -->
13 | <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/css/bootstrap.min.css" integrity="sha384-BVYiiSIFeK1dGmJRAkycuHAHRg32OmUcww7on3RYdg4Va+PmSTsz/K68vbdEjh4u" crossorigin="anonymous">
14 | <link rel="stylesheet" href="{{ "/css/customstyles.css" | prepend: site.baseurl }}">
15 | <link rel="stylesheet" href="{{ "/css/boxshadowproperties.css" | prepend: site.baseurl }}">
16 | <!-- most color styles are extracted out to here -->
17 | <link rel="stylesheet" href="{{ "/css/theme-blue.css" | prepend: site.baseurl }}">
18 | 
19 | <script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/2.1.4/jquery.min.js"></script>
20 | 
21 | <script src="https://cdnjs.cloudflare.com/ajax/libs/jquery-cookie/1.4.1/jquery.cookie.min.js"></script>
22 | <script src="{{ "/js/jquery.navgoco.min.js" | prepend: site.baseurl }}"></script>
23 | 
24 | {% if site.use_math %}
25 | <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.11.1/dist/katex.min.css" integrity="sha384-zB1R0rpPzHqg7Kpt0Aljp8JPLqbXI3bhnPWROx27a9N0Ll6ZP/+DiW/UqRcLbRjq" crossorigin="anonymous">
26 | <script defer src="https://cdn.jsdelivr.net/npm/katex@0.11.1/dist/katex.min.js" integrity="sha384-y23I5Q6l+B6vatafAwxRu/0oK/79VlbSz7Q9aiSZUvyWYIYsd+qj+o24G5ZU2zJz" crossorigin="anonymous"></script>
27 | <script defer src="https://cdn.jsdelivr.net/npm/katex@0.11.1/dist/contrib/auto-render.min.js" integrity="sha384-kWPLUVMOks5AQFrykwIup5lo0m3iMkkHrD0uJ4H5cjeGihAutqP0yW0J6dpFiVkI" crossorigin="anonymous"></script>
28 | <script>
29 | document.addEventListener("DOMContentLoaded", function() {
30 |   renderMathInElement( document.body, {
31 |     delimiters: [
32 |       {left: "$$", right: "$$", display: true},
33 |       {left: "[%", right: "%]", display: true},
34 |       {left: "$", right: "$", display: false}
35 |     ]}
36 |   );
37 | });
38 | </script>
39 | {% endif %}
40 | 
41 | <!-- Latest compiled and minified JavaScript -->
42 | <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/js/bootstrap.min.js" integrity="sha384-Tc5IQib027qvyjSMfHjOMaLkfuWVxZxUPnCJA7l2mCWNIpG9mGCD8wGNIcPD7Txa" crossorigin="anonymous"></script>
43 | <!-- Anchor.js -->
44 | <script src="https://cdnjs.cloudflare.com/ajax/libs/anchor-js/2.0.0/anchor.min.js"></script>
45 | <script src="{{ "/js/toc.js" | prepend: site.baseurl }}"></script>
46 | <script src="{{ "/js/customscripts.js" | prepend: site.baseurl }}"></script>
47 | 
48 | <link rel="shortcut icon" href="{{ "/images/favicon.ico?"  | prepend: site.baseurl }}">
49 | 
50 | <!-- HTML5 Shim and Respond.js IE8 support of HTML5 elements and media queries -->
51 | <!-- WARNING: Respond.js doesn't work if you view the page via file:// -->
52 | <!--[if lt IE 9]>
53 | <script src="https://oss.maxcdn.com/libs/html5shiv/3.7.0/html5shiv.js"></script>
54 | <script src="https://oss.maxcdn.com/libs/respond.js/1.4.2/respond.min.js"></script>
55 | <![endif]-->
56 | 
57 | <link rel="alternate" type="application/rss+xml" title="{{ site.title }}" href="{{ "/feed.xml" | prepend: site.baseurl | prepend: site.url }}">
58 | 
59 | <!-- Twitter cards -->
60 | {% if site.twitter_username %}
61 | <meta name="twitter:site"    content="@{{ site.twitter_username }}">
62 | <meta name="twitter:creator" content="@{{ page.author }}">
63 | <meta name="twitter:title"   content="{{ page.title }}">
64 | {% endif %}
65 | 
66 | {% if page.summary %}
67 | <meta name="twitter:description" content="{{ page.summary }}">
68 | {% else %}
69 | <meta name="twitter:description" content="{{ site.description }}">
70 | {% endif %}
71 | 
72 | {% if page.image %}
73 | <meta name="twitter:card"  content="summary_large_image">
74 | <meta name="twitter:image" content="{{ site.url }}{{ page.image }}">
75 | {% else %}
76 | <!-- <meta name="twitter:card"  content="summary"> -->
77 | <!-- <meta name="twitter:image" content="{{ site.title_image }}"> -->
78 | {% endif %}
79 | <!-- end of Twitter cards -->
80 | 
81 | 
82 | 
83 | 


--------------------------------------------------------------------------------
/docs/js/jekyll-search.js:
--------------------------------------------------------------------------------
1 | !function e(t,n,r){function s(o,u){if(!n[o]){if(!t[o]){var a="function"==typeof require&&require;if(!u&&a)return a(o,!0);if(i)return i(o,!0);throw new Error("Cannot find module '"+o+"'")}var f=n[o]={exports:{}};t[o][0].call(f.exports,function(e){var n=t[o][1][e];return s(n?n:e)},f,f.exports,e,t,n,r)}return n[o].exports}for(var i="function"==typeof require&&require,o=0;o<r.length;o++)s(r[o]);return s}({1:[function(require,module){module.exports=function(){function receivedResponse(xhr){return 200==xhr.status&&4==xhr.readyState}function handleResponse(xhr,callback){xhr.onreadystatechange=function(){if(receivedResponse(xhr))try{callback(null,JSON.parse(xhr.responseText))}catch(err){callback(err,null)}}}var self=this;self.load=function(location,callback){var xhr=window.XMLHttpRequest?new XMLHttpRequest:new ActiveXObject("Microsoft.XMLHTTP");xhr.open("GET",location,!0),handleResponse(xhr,callback),xhr.send()}}},{}],2:[function(require,module){function FuzzySearchStrategy(){function createFuzzyRegExpFromString(string){return new RegExp(string.split("").join(".*?"),"gi")}var self=this;self.matches=function(string,crit){return"string"!=typeof string?!1:(string=string.trim(),!!string.match(createFuzzyRegExpFromString(crit)))}}module.exports=new FuzzySearchStrategy},{}],3:[function(require,module){function LiteralSearchStrategy(){function doMatch(string,crit){return string.toLowerCase().indexOf(crit.toLowerCase())>=0}var self=this;self.matches=function(string,crit){return"string"!=typeof string?!1:(string=string.trim(),doMatch(string,crit))}}module.exports=new LiteralSearchStrategy},{}],4:[function(require,module){module.exports=function(){function findMatches(store,crit,strategy){for(var data=store.get(),i=0;i<data.length&&matches.length<limit;i++)findMatchesInObject(data[i],crit,strategy);return matches}function findMatchesInObject(obj,crit,strategy){for(var key in obj)if(strategy.matches(obj[key],crit)){matches.push(obj);break}}function getSearchStrategy(){return fuzzy?fuzzySearchStrategy:literalSearchStrategy}var self=this,matches=[],fuzzy=!1,limit=10,fuzzySearchStrategy=require("./SearchStrategies/fuzzy"),literalSearchStrategy=require("./SearchStrategies/literal");self.setFuzzy=function(_fuzzy){fuzzy=!!_fuzzy},self.setLimit=function(_limit){limit=parseInt(_limit,10)||limit},self.search=function(data,crit){return crit?(matches.length=0,findMatches(data,crit,getSearchStrategy())):[]}}},{"./SearchStrategies/fuzzy":2,"./SearchStrategies/literal":3}],5:[function(require,module){module.exports=function(_store){function isObject(obj){return!!obj&&"[object Object]"==Object.prototype.toString.call(obj)}function isArray(obj){return!!obj&&"[object Array]"==Object.prototype.toString.call(obj)}function addObject(data){return store.push(data),data}function addArray(data){for(var added=[],i=0;i<data.length;i++)isObject(data[i])&&added.push(addObject(data[i]));return added}var self=this,store=[];isArray(_store)&&addArray(_store),self.clear=function(){return store.length=0,store},self.get=function(){return store},self.put=function(data){return isObject(data)?addObject(data):isArray(data)?addArray(data):void 0}}},{}],6:[function(require,module){module.exports=function(){var self=this,templatePattern=/\{(.*?)\}/g;self.setTemplatePattern=function(newTemplatePattern){templatePattern=newTemplatePattern},self.render=function(t,data){return t.replace(templatePattern,function(match,prop){return data[prop]||match})}}},{}],7:[function(require){!function(window){"use strict";function SimpleJekyllSearch(){function initWithJSON(){store.put(opt.dataSource),registerInput()}function initWithURL(url){jsonLoader.load(url,function(err,json){err?throwError("failed to get JSON ("+url+")"):(store.put(json),registerInput())})}function throwError(message){throw new Error("SimpleJekyllSearch --- "+message)}function validateOptions(_opt){for(var i=0;i<requiredOptions.length;i++){var req=requiredOptions[i];_opt[req]||throwError("You must specify a "+req)}}function assignOptions(_opt){for(var option in opt)opt[option]=_opt[option]||opt[option]}function isJSON(json){try{return json instanceof Object&&JSON.parse(JSON.stringify(json))}catch(e){return!1}}function emptyResultsContainer(){opt.resultsContainer.innerHTML=""}function appendToResultsContainer(text){opt.resultsContainer.innerHTML+=text}function registerInput(){opt.searchInput.addEventListener("keyup",function(e){return 0==e.target.value.length?void emptyResultsContainer():void render(searcher.search(store,e.target.value))})}function render(results){if(emptyResultsContainer(),0==results.length)return appendToResultsContainer(opt.noResultsText);for(var i=0;i<results.length;i++)appendToResultsContainer(templater.render(opt.searchResultTemplate,results[i]))}var self=this,requiredOptions=["searchInput","resultsContainer","dataSource"],opt={searchInput:null,resultsContainer:null,dataSource:[],searchResultTemplate:'<li><a href="{url}" title="{desc}">{title}</a></li>',noResultsText:"No results found",limit:10,fuzzy:!1};self.init=function(_opt){validateOptions(_opt),assignOptions(_opt),isJSON(opt.dataSource)?initWithJSON(opt.dataSource):initWithURL(opt.dataSource)}}var Searcher=require("./Searcher"),Templater=require("./Templater"),Store=require("./Store"),JSONLoader=require("./JSONLoader"),searcher=new Searcher,templater=new Templater,store=new Store,jsonLoader=new JSONLoader;window.SimpleJekyllSearch=new SimpleJekyllSearch}(window,document)},{"./JSONLoader":1,"./Searcher":4,"./Store":5,"./Templater":6}]},{},[7]);
2 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # keywords2vec
  2 | > A simple and fast way to generate a word2vec model, with multi-word keywords instead of single words.
  3 | 
  4 | 
  5 | ## Example result
  6 | 
  7 | Finding similar keywords for "__obesity__"
  8 | 
  9 | | index | term                        |
 10 | |-------|-----------------------------|
 11 | | 0     | overweight                  |
 12 | | 1     | obese                       |
 13 | | 2     | physical inactivity         |
 14 | | 3     | excess weight               |
 15 | | 4     | obese adults                |
 16 | | 5     | high bmi                    |
 17 | | 6     | obese adults                |
 18 | | 7     | obese people                |
 19 | | 8     | obesity-related outcomes    |
 20 | | 9     | obesity among children      |
 21 | | 10    | poor sleep quality          |
 22 | | 11    | ssbs                        |
 23 | | 12    | obese populations           |
 24 | | 13    | cardiometabolic risk        |
 25 | | 14    | abdominal obesity           |
 26 | 
 27 | 
 28 | ## Install
 29 | 
 30 | `pip install keywords2vec`
 31 | 
 32 | ## How to use
 33 | 
 34 | Lets download some example data
 35 | 
 36 | ```
 37 | data_filepath = "epistemonikos_data_sample.tsv.gz"
 38 | 
 39 | !wget "https://s3.amazonaws.com/episte-labs/epistemonikos_data_sample.tsv.gz" -O "{data_filepath}"
 40 | ```
 41 | 
 42 | Import
 43 | 
 44 | ```
 45 | from keywords2vec.main import similars_tree, get_similars
 46 | ```
 47 | 
 48 | 
 49 | We create the model.
 50 | 
 51 | ```
 52 | labels, tree = similars_tree(data_filepath)
 53 | ```
 54 | 
 55 | More info, take a look [here](30_main.ipynb)
 56 | 
 57 | 
 58 | 
 59 | Then we can get the most similars keywords
 60 | 
 61 | ```
 62 | get_similars(tree, labels, "obesity")
 63 | ```
 64 | 
 65 | 
 66 | 
 67 | 
 68 |     ['obesity',
 69 |      'overweight',
 70 |      'obese',
 71 |      'physical inactivity',
 72 |      'excess weight',
 73 |      'high bmi',
 74 |      'obese adults',
 75 |      'obese people',
 76 |      'obesity-related outcomes',
 77 |      'obesity among children',
 78 |      'poor sleep quality',
 79 |      'ssbs',
 80 |      'obese populations',
 81 |      'cardiometabolic risk',
 82 |      'abdominal obesity']
 83 | 
 84 | 
 85 | 
 86 | ```
 87 | get_similars(tree, labels, "heart failure")
 88 | ```
 89 | 
 90 | 
 91 | 
 92 | 
 93 |     ['heart failure',
 94 |      'hf',
 95 |      'chf',
 96 |      'chronic heart failure',
 97 |      'reduced ejection fraction',
 98 |      'unstable angina',
 99 |      'peripheral vascular disease',
100 |      'peripheral arterial disease',
101 |      'angina',
102 |      'congestive heart failure',
103 |      'left ventricular systolic dysfunction',
104 |      'acute coronary syndrome',
105 |      'heart failure patients',
106 |      'acute myocardial infarction',
107 |      'left ventricular dysfunction']
108 | 
109 | 
110 | 
111 | ### Motivation
112 | 
113 | The idea started in the Epistemonikos database [www.epistemonikos.org](https://www.epistemonikos.org), a database of scientific articles for people making decisions concerning clinical or health-policy questions. In this context the scientific/health language used is complex. You can easily find keywords like:
114 | 
115 |  * asthma
116 |  * heart failure
117 |  * medial compartment knee osteoarthritis
118 |  * preserved left ventricular systolic function
119 |  * non-selective non-steroidal anti-inflammatory drugs
120 |  
121 | We tried some approaches to find those keywords, like ngrams, ngrams + tf-idf, identify entities, among others. But we didn't get really good results.
122 | 
123 | 
124 | ### Our approach
125 | 
126 | We found that tokenizing using stopwords + non word characters was really useful for "finding" the keywords. An example:
127 | 
128 | * input: "Timing of replacement therapy for acute renal failure after cardiac surgery"
129 | * output: [
130 | 	"timing",
131 | 	"replacement therapy",
132 | 	"acute renal failure",
133 | 	"cardiac surgery"
134 | ]
135 | 
136 | So we basically split the text when we find:
137 |  * a stopword
138 |  * a non word character(/,!?. etc) (except from - and ')
139 | 
140 | That's it.
141 | 
142 | But as there were some problem with some keywords that cointain stopwords, like:
143 |  * Vitamin A
144 |  * Hepatitis A
145 |  * Web of Science
146 | 
147 | So we decided to add another method (nltk with some grammar definition) to cover most of the cases. To use this, you need to add the parameter `keywords_w_stopwords=True`, this method is approx 20x slower.
148 | 
149 | ### References
150 | 
151 | Seem to be an old idea (2004):
152 | 
153 | *Mihalcea, Rada, and Paul Tarau. "Textrank: Bringing order into text." Proceedings of the 2004 conference on empirical methods in natural language processing. 2004.*
154 | 
155 | Reading an implementation of textrank, I realize they used stopwords to separate and create the graph. Then I though in using it as tokenizer for word2vec
156 | 
157 | As pointed by @deliprao in this [twitter thread](https://twitter.com/jeremyphoward/status/1094025901371621376). It's also used by Rake (2010):
158 | 
159 | *Rose, Stuart & Engel, Dave & Cramer, Nick & Cowley, Wendy. (2010). Automatic Keyword Extraction from Individual Documents. 10.1002/9780470689646.ch1.*
160 | 
161 | As noted by @astent in the Twitter thread, this concept is called chinking (chunking by exclusion)
162 | [https://www.nltk.org/book/ch07.html#Chinking](https://www.nltk.org/book/ch07.html#Chinking)
163 | 
164 | 
165 | ### Multi-lingual
166 | We worked in an implementation, that could be used in multiple languages. Of course not all languages are sutable for using this approach. We have tried with good results in English, Spanish and Portuguese
167 | 
168 | 
169 | ## Try it online
170 | 
171 | You can try it [here](http://54.196.169.11/episte/) (takes time to load, lowercase only, doesn't work in mobile yet) MPV :)
172 | 
173 | These embedding were created using 827,341 title/abstract from @epistemonikos database.
174 | With keywords that repeat at least 10 times. The total vocab is 349,080 keywords (really manageable number)
175 | 
176 | ## Vocab size
177 | 
178 | One of the main benefit of this method, is the size of the vocabulary. 
179 | For example, using keywords that repeat at least 10 times, for the Epistemonikos dataset (827,341 title/abstract), we got the following vocab size:
180 | 
181 | | ngrams             | keywords  | comp    |
182 | |--------------------|-----------|---------|
183 | | 1                  | 127,824   | 36%     |
184 | | 1,2                | 1,360,550 | 388%    |
185 | | 1-3                | 3,204,099 | 914%    |
186 | | 1-4                | 4,461,930 | 1,272%  |
187 | | 1-5                | 5,133,619 | 1,464%  |
188 | |                    |           |         |
189 | | stopword tokenizer | 350,529   | 100%    |
190 | 
191 | More information regarding the comparison, take a look to the folder [analyze](analyze).
192 | 
193 | 
194 | ## Credits
195 | 
196 | This project has been created using [nbdev](https://github.com/fastai/nbdev)
197 | 


--------------------------------------------------------------------------------
/keywords2vec/tokenizer.py:
--------------------------------------------------------------------------------
  1 | # AUTOGENERATED! DO NOT EDIT! File to edit: 10_tokenizer.ipynb (unless otherwise specified).
  2 | 
  3 | __all__ = ['NUMBERS_STOPWORDS', 'prepare_stopwords', 'tokenize_one', 'get_nodes_for_ntlk', 'tokenize_by_nltk',
  4 |            'tokenize']
  5 | 
  6 | # Cell
  7 | from .imports import *
  8 | 
  9 | # Cell
 10 | NUMBERS_STOPWORDS = {
 11 |     "en": [
 12 |         "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen", "seventeen", "eighteen", "nineteen", "twenty", "twenty-one", "twenty-two", "twenty-three", "twenty-four", "twenty-five", "twenty-six", "twenty-seven", "twenty-eight", "twenty-nine", "thirty", "thirty-one", "thirty-two", "thirty-three", "thirty-four", "thirty-five", "thirty-six", "thirty-seven", "thirty-eight", "thirty-nine", "forty", "forty-one", "forty-two", "forty-three", "forty-four", "forty-five", "forty-six", "forty-seven", "forty-eight", "forty-nine", "fifty", "fifty-one", "fifty-two", "fifty-three", "fifty-four", "fifty-five", "fifty-six", "fifty-seven", "fifty-eight", "fifty-nine", "sixty", "sixty-one", "sixty-two", "sixty-three", "sixty-four", "sixty-five", "sixty-six", "sixty-seven", "sixty-eight", "sixty-nine", "seventy", "seventy-one", "seventy-two", "seventy-three", "seventy-four", "seventy-five", "seventy-six", "seventy-seven", "seventy-eight", "seventy-nine", "eighty", "eighty-one", "eighty-two", "eighty-three", "eighty-four", "eighty-five", "eighty-six", "eighty-seven", "eighty-eight", "eighty-nine", "ninety", "ninety-one", "ninety-two", "ninety-three", "ninety-four", "ninety-five", "ninety-six", "ninety-seven", "ninety-eight", "ninety-nine"
 13 |     ],
 14 |     "es": []
 15 | }
 16 | 
 17 | # Cell
 18 | def prepare_stopwords(stopwords=None, additional_stopwords=None, lang="en"):
 19 |     if stopwords is None:
 20 |         stopwords = safe_get_stop_words(lang) + (NUMBERS_STOPWORDS.get(lang) or [])
 21 |     if additional_stopwords:
 22 |         stopwords += additional_stopwords
 23 |     return [
 24 |         stopword
 25 |         for stopword in stopwords
 26 |         if stopword
 27 |     ]
 28 | 
 29 | 
 30 | def tokenize_one(text, stopwords=None, additional_stopwords=None, lang="en", split_by_stopwords=True):
 31 |     stopwords = prepare_stopwords(stopwords, additional_stopwords, lang)
 32 |     text_part = text.lower()
 33 | 
 34 |     regexs = []
 35 |     if split_by_stopwords:
 36 |         # Remove all stopwords by a !, we are searching for the stopword (bounded)
 37 |         regexs.append(
 38 |             ("\\b" + "\\b|\\b".join(stopwords), "!!")
 39 |         )
 40 |     # Must be executed in order
 41 |     regexs += [
 42 |         ("’", "'"),
 43 |         # Remove all non alpha, numeric, spaces, - or single quote
 44 |         (r'([^a-z0-9\u00C0-\u1FFF\u2C00-\uD7FF \n\-\'])', "!!"),
 45 |         # remove only words numbers
 46 |         (r'(^|[ !])[\-0-9]+([ !]|$)', "!!"),
 47 |         # remove hyphen-minus for keywords starting or ending with it
 48 |         (r'((^|[ !])[\-\']+)|([\-\']+([ !]|$))', "!!"),
 49 |         # remove spaces between !
 50 |         (r' *! *', "!!"),
 51 |         # generate multiple ! need for next regex
 52 |         (r'!', "!!"),
 53 |         # remove one character keyword
 54 |         (r'(^|!)[^!\n](!|$)', "!!"),
 55 |         # remove multiple ! (!!!!)
 56 |         (r'!+', "!"),
 57 |         # remove first and last !
 58 |         (r'(^!+)|(!+$)', "")
 59 |     ]
 60 | 
 61 |     for regex, replacement in regexs:
 62 |         text_part = re.sub(regex, replacement, text_part, flags=re.M)
 63 |     return text_part
 64 | 
 65 | 
 66 | # Second option to tokenize the information
 67 | def get_nodes_for_ntlk(parent, stopwords, valid_labels):
 68 |     keywords = []
 69 |     for node in parent:
 70 |         if type(node) is nltk.Tree:
 71 |             if node.label() in valid_labels:
 72 |                 phrase = " ".join([key.lower() for key, value in node.leaves()])
 73 |                 phrase = unidecode.unidecode(phrase)
 74 |                 for subtree in node.subtrees():
 75 |                     subtree_keywords = get_nodes_for_ntlk(subtree, stopwords, valid_labels)
 76 |                     keywords.extend(subtree_keywords)
 77 |                 if phrase not in stopwords:
 78 |                     pattern = re.compile(r"([^\s\w-]|_)+")
 79 |                     phrase = pattern.sub('', phrase).strip()
 80 |                     keywords.append(phrase)
 81 |     return keywords
 82 | 
 83 | 
 84 | def tokenize_by_nltk(text, stopwords=None, additional_stopwords=None, lang="en"):
 85 |     stopwords = prepare_stopwords(stopwords, additional_stopwords, lang)
 86 |     # grammar = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'
 87 |     grammar = r"""
 88 |         PHRASE1: {<NN.*>+ <DT>*}
 89 |         PHRASE2: {<JJ> <PHRASE1>+}
 90 |         PHRASE3: {<JJ>+ <PHRASE2>}
 91 |         PHRASE4: {(<JJ>* <NN.*>+ <IN>) <PHRASE3>}
 92 |     """
 93 |     valid_labels = ["PHRASE1", "PHRASE2", "PHRASE3", "PHRASE4", "KT"]
 94 | 
 95 |     chunker = nltk.RegexpParser(grammar, loop=5)
 96 |     chunker2 = nltk.RegexpParser(r"KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}", loop=5)
 97 |     output = ""
 98 | 
 99 |     for line in text.splitlines():
100 |         sentences = nltk.sent_tokenize(line)
101 |         sentences = [nltk.word_tokenize(sent) for sent in sentences]
102 |         sentences = [nltk.pos_tag(sent) for sent in sentences]
103 |         keyphrases = []
104 |         relevant_words = []
105 |         for sentence in sentences:
106 |             keyphrases.append(chunker.parse(sentence))
107 |             keyphrases.append(chunker2.parse(sentence))
108 |         for elem in keyphrases:
109 |             relevant_words += get_nodes_for_ntlk(elem, stopwords, valid_labels)
110 |         output += "!".join(relevant_words) + "!"
111 | 
112 |     #output = re.sub("\\b" + "\\b|!".join(stopwords), "!", output, flags=re.M).lower()
113 |     output = tokenize_one(output, split_by_stopwords=False)
114 |     return output
115 | 
116 | 
117 | def tokenize(text, text_output=False, lang="en", keywords_w_stopwords=False, merge=True):
118 |     outputs = []
119 |     tokenizers = [tokenize_one]
120 |     if lang == "en" and keywords_w_stopwords:
121 |         tokenizers.append(tokenize_by_nltk)
122 | 
123 |     for tokenizer_el in tokenizers:
124 |         outputs.append(
125 |             tokenizer_el(
126 |                 text,
127 |                 lang=lang
128 |             )
129 |         )
130 |     if text_output:
131 |         if merge:
132 |             return "!".join(outputs)
133 |         else:
134 |             return outputs
135 |     keywords = [
136 |         [
137 |             keyword.strip()
138 |             for phrase in re.split("\r\n|\n", output)
139 |             for keyword in phrase.split("!")
140 |         ]
141 |         for output in outputs
142 |     ]
143 |     if merge:
144 |         return [item for sublist in keywords for item in sublist]
145 |     else:
146 |         return keywords


--------------------------------------------------------------------------------
/docs/Gemfile.lock:
--------------------------------------------------------------------------------
  1 | GEM
  2 |   remote: https://rubygems.org/
  3 |   specs:
  4 |     activesupport (4.2.11.1)
  5 |       i18n (~> 0.7)
  6 |       minitest (~> 5.1)
  7 |       thread_safe (~> 0.3, >= 0.3.4)
  8 |       tzinfo (~> 1.1)
  9 |     addressable (2.7.0)
 10 |       public_suffix (>= 2.0.2, < 5.0)
 11 |     coffee-script (2.4.1)
 12 |       coffee-script-source
 13 |       execjs
 14 |     coffee-script-source (1.11.1)
 15 |     colorator (1.1.0)
 16 |     commonmarker (0.17.13)
 17 |       ruby-enum (~> 0.5)
 18 |     concurrent-ruby (1.1.5)
 19 |     dnsruby (1.61.3)
 20 |       addressable (~> 2.5)
 21 |     em-websocket (0.5.1)
 22 |       eventmachine (>= 0.12.9)
 23 |       http_parser.rb (~> 0.6.0)
 24 |     ethon (0.12.0)
 25 |       ffi (>= 1.3.0)
 26 |     eventmachine (1.2.7)
 27 |     execjs (2.7.0)
 28 |     faraday (0.17.0)
 29 |       multipart-post (>= 1.2, < 3)
 30 |     ffi (1.11.3)
 31 |     forwardable-extended (2.6.0)
 32 |     gemoji (3.0.1)
 33 |     github-pages (202)
 34 |       activesupport (= 4.2.11.1)
 35 |       github-pages-health-check (= 1.16.1)
 36 |       jekyll (= 3.8.5)
 37 |       jekyll-avatar (= 0.6.0)
 38 |       jekyll-coffeescript (= 1.1.1)
 39 |       jekyll-commonmark-ghpages (= 0.1.6)
 40 |       jekyll-default-layout (= 0.1.4)
 41 |       jekyll-feed (= 0.11.0)
 42 |       jekyll-gist (= 1.5.0)
 43 |       jekyll-github-metadata (= 2.12.1)
 44 |       jekyll-mentions (= 1.4.1)
 45 |       jekyll-optional-front-matter (= 0.3.0)
 46 |       jekyll-paginate (= 1.1.0)
 47 |       jekyll-readme-index (= 0.2.0)
 48 |       jekyll-redirect-from (= 0.14.0)
 49 |       jekyll-relative-links (= 0.6.0)
 50 |       jekyll-remote-theme (= 0.4.0)
 51 |       jekyll-sass-converter (= 1.5.2)
 52 |       jekyll-seo-tag (= 2.5.0)
 53 |       jekyll-sitemap (= 1.2.0)
 54 |       jekyll-swiss (= 0.4.0)
 55 |       jekyll-theme-architect (= 0.1.1)
 56 |       jekyll-theme-cayman (= 0.1.1)
 57 |       jekyll-theme-dinky (= 0.1.1)
 58 |       jekyll-theme-hacker (= 0.1.1)
 59 |       jekyll-theme-leap-day (= 0.1.1)
 60 |       jekyll-theme-merlot (= 0.1.1)
 61 |       jekyll-theme-midnight (= 0.1.1)
 62 |       jekyll-theme-minimal (= 0.1.1)
 63 |       jekyll-theme-modernist (= 0.1.1)
 64 |       jekyll-theme-primer (= 0.5.3)
 65 |       jekyll-theme-slate (= 0.1.1)
 66 |       jekyll-theme-tactile (= 0.1.1)
 67 |       jekyll-theme-time-machine (= 0.1.1)
 68 |       jekyll-titles-from-headings (= 0.5.1)
 69 |       jemoji (= 0.10.2)
 70 |       kramdown (= 1.17.0)
 71 |       liquid (= 4.0.0)
 72 |       listen (= 3.1.5)
 73 |       mercenary (~> 0.3)
 74 |       minima (= 2.5.0)
 75 |       nokogiri (>= 1.10.4, < 2.0)
 76 |       rouge (= 3.11.0)
 77 |       terminal-table (~> 1.4)
 78 |     github-pages-health-check (1.16.1)
 79 |       addressable (~> 2.3)
 80 |       dnsruby (~> 1.60)
 81 |       octokit (~> 4.0)
 82 |       public_suffix (~> 3.0)
 83 |       typhoeus (~> 1.3)
 84 |     html-pipeline (2.12.2)
 85 |       activesupport (>= 2)
 86 |       nokogiri (>= 1.4)
 87 |     http_parser.rb (0.6.0)
 88 |     i18n (0.9.5)
 89 |       concurrent-ruby (~> 1.0)
 90 |     jekyll (3.8.5)
 91 |       addressable (~> 2.4)
 92 |       colorator (~> 1.0)
 93 |       em-websocket (~> 0.5)
 94 |       i18n (~> 0.7)
 95 |       jekyll-sass-converter (~> 1.0)
 96 |       jekyll-watch (~> 2.0)
 97 |       kramdown (~> 1.14)
 98 |       liquid (~> 4.0)
 99 |       mercenary (~> 0.3.3)
100 |       pathutil (~> 0.9)
101 |       rouge (>= 1.7, < 4)
102 |       safe_yaml (~> 1.0)
103 |     jekyll-avatar (0.6.0)
104 |       jekyll (~> 3.0)
105 |     jekyll-coffeescript (1.1.1)
106 |       coffee-script (~> 2.2)
107 |       coffee-script-source (~> 1.11.1)
108 |     jekyll-commonmark (1.3.1)
109 |       commonmarker (~> 0.14)
110 |       jekyll (>= 3.7, < 5.0)
111 |     jekyll-commonmark-ghpages (0.1.6)
112 |       commonmarker (~> 0.17.6)
113 |       jekyll-commonmark (~> 1.2)
114 |       rouge (>= 2.0, < 4.0)
115 |     jekyll-default-layout (0.1.4)
116 |       jekyll (~> 3.0)
117 |     jekyll-feed (0.11.0)
118 |       jekyll (~> 3.3)
119 |     jekyll-gist (1.5.0)
120 |       octokit (~> 4.2)
121 |     jekyll-github-metadata (2.12.1)
122 |       jekyll (~> 3.4)
123 |       octokit (~> 4.0, != 4.4.0)
124 |     jekyll-mentions (1.4.1)
125 |       html-pipeline (~> 2.3)
126 |       jekyll (~> 3.0)
127 |     jekyll-optional-front-matter (0.3.0)
128 |       jekyll (~> 3.0)
129 |     jekyll-paginate (1.1.0)
130 |     jekyll-readme-index (0.2.0)
131 |       jekyll (~> 3.0)
132 |     jekyll-redirect-from (0.14.0)
133 |       jekyll (~> 3.3)
134 |     jekyll-relative-links (0.6.0)
135 |       jekyll (~> 3.3)
136 |     jekyll-remote-theme (0.4.0)
137 |       addressable (~> 2.0)
138 |       jekyll (~> 3.5)
139 |       rubyzip (>= 1.2.1, < 3.0)
140 |     jekyll-sass-converter (1.5.2)
141 |       sass (~> 3.4)
142 |     jekyll-seo-tag (2.5.0)
143 |       jekyll (~> 3.3)
144 |     jekyll-sitemap (1.2.0)
145 |       jekyll (~> 3.3)
146 |     jekyll-swiss (0.4.0)
147 |     jekyll-theme-architect (0.1.1)
148 |       jekyll (~> 3.5)
149 |       jekyll-seo-tag (~> 2.0)
150 |     jekyll-theme-cayman (0.1.1)
151 |       jekyll (~> 3.5)
152 |       jekyll-seo-tag (~> 2.0)
153 |     jekyll-theme-dinky (0.1.1)
154 |       jekyll (~> 3.5)
155 |       jekyll-seo-tag (~> 2.0)
156 |     jekyll-theme-hacker (0.1.1)
157 |       jekyll (~> 3.5)
158 |       jekyll-seo-tag (~> 2.0)
159 |     jekyll-theme-leap-day (0.1.1)
160 |       jekyll (~> 3.5)
161 |       jekyll-seo-tag (~> 2.0)
162 |     jekyll-theme-merlot (0.1.1)
163 |       jekyll (~> 3.5)
164 |       jekyll-seo-tag (~> 2.0)
165 |     jekyll-theme-midnight (0.1.1)
166 |       jekyll (~> 3.5)
167 |       jekyll-seo-tag (~> 2.0)
168 |     jekyll-theme-minimal (0.1.1)
169 |       jekyll (~> 3.5)
170 |       jekyll-seo-tag (~> 2.0)
171 |     jekyll-theme-modernist (0.1.1)
172 |       jekyll (~> 3.5)
173 |       jekyll-seo-tag (~> 2.0)
174 |     jekyll-theme-primer (0.5.3)
175 |       jekyll (~> 3.5)
176 |       jekyll-github-metadata (~> 2.9)
177 |       jekyll-seo-tag (~> 2.0)
178 |     jekyll-theme-slate (0.1.1)
179 |       jekyll (~> 3.5)
180 |       jekyll-seo-tag (~> 2.0)
181 |     jekyll-theme-tactile (0.1.1)
182 |       jekyll (~> 3.5)
183 |       jekyll-seo-tag (~> 2.0)
184 |     jekyll-theme-time-machine (0.1.1)
185 |       jekyll (~> 3.5)
186 |       jekyll-seo-tag (~> 2.0)
187 |     jekyll-titles-from-headings (0.5.1)
188 |       jekyll (~> 3.3)
189 |     jekyll-watch (2.2.1)
190 |       listen (~> 3.0)
191 |     jemoji (0.10.2)
192 |       gemoji (~> 3.0)
193 |       html-pipeline (~> 2.2)
194 |       jekyll (~> 3.0)
195 |     kramdown (1.17.0)
196 |     liquid (4.0.0)
197 |     listen (3.1.5)
198 |       rb-fsevent (~> 0.9, >= 0.9.4)
199 |       rb-inotify (~> 0.9, >= 0.9.7)
200 |       ruby_dep (~> 1.2)
201 |     mercenary (0.3.6)
202 |     mini_portile2 (2.4.0)
203 |     minima (2.5.0)
204 |       jekyll (~> 3.5)
205 |       jekyll-feed (~> 0.9)
206 |       jekyll-seo-tag (~> 2.1)
207 |     minitest (5.13.0)
208 |     multipart-post (2.1.1)
209 |     nokogiri (1.10.8)
210 |       mini_portile2 (~> 2.4.0)
211 |     octokit (4.14.0)
212 |       sawyer (~> 0.8.0, >= 0.5.3)
213 |     pathutil (0.16.2)
214 |       forwardable-extended (~> 2.6)
215 |     public_suffix (3.1.1)
216 |     rb-fsevent (0.10.3)
217 |     rb-inotify (0.10.0)
218 |       ffi (~> 1.0)
219 |     rouge (3.11.0)
220 |     ruby-enum (0.7.2)
221 |       i18n
222 |     ruby_dep (1.5.0)
223 |     rubyzip (2.0.0)
224 |     safe_yaml (1.0.5)
225 |     sass (3.7.4)
226 |       sass-listen (~> 4.0.0)
227 |     sass-listen (4.0.0)
228 |       rb-fsevent (~> 0.9, >= 0.9.4)
229 |       rb-inotify (~> 0.9, >= 0.9.7)
230 |     sawyer (0.8.2)
231 |       addressable (>= 2.3.5)
232 |       faraday (> 0.8, < 2.0)
233 |     terminal-table (1.8.0)
234 |       unicode-display_width (~> 1.1, >= 1.1.1)
235 |     thread_safe (0.3.6)
236 |     typhoeus (1.3.1)
237 |       ethon (>= 0.9.0)
238 |     tzinfo (1.2.5)
239 |       thread_safe (~> 0.1)
240 |     unicode-display_width (1.6.0)
241 | 
242 | PLATFORMS
243 |   ruby
244 | 
245 | DEPENDENCIES
246 |   github-pages
247 |   jekyll (~> 3.7)
248 | 
249 | BUNDLED WITH
250 |    2.0.2
251 | 


--------------------------------------------------------------------------------
/docs/main.html:
--------------------------------------------------------------------------------
  1 | ---
  2 | 
  3 | title: Main
  4 | 
  5 | keywords: fastai
  6 | sidebar: home_sidebar
  7 | 
  8 | summary: "This are the main functions, where we are going to "
  9 | ---
 10 | <!--
 11 | 
 12 | #################################################
 13 | ### THIS FILE WAS AUTOGENERATED! DO NOT EDIT! ###
 14 | #################################################
 15 | # file to edit: 30_main.ipynb
 16 | # command to build the docs after a change: nbdev_build_docs
 17 | 
 18 | -->
 19 | 
 20 | <div class="container" id="notebook-container">
 21 |     {% raw %}
 22 |         
 23 | <div class="cell border-box-sizing code_cell rendered">
 24 | 
 25 | </div>
 26 | <div class="cell border-box-sizing code_cell rendered">
 27 | <div class="input">
 28 | 
 29 | <div class="inner_cell">
 30 |     <div class="input_area">
 31 | <div class=" highlight hl-ipython3"><pre><span></span><span class="kn">from</span> <span class="nn">nbdev.showdoc</span> <span class="kn">import</span> <span class="o">*</span>
 32 | </pre></div>
 33 | 
 34 |     </div>
 35 | </div>
 36 | </div>
 37 | 
 38 | </div>
 39 | <div class="cell border-box-sizing code_cell rendered">
 40 | 
 41 | </div>
 42 | <div class="cell border-box-sizing code_cell rendered">
 43 | 
 44 | </div>
 45 | <div class="cell border-box-sizing code_cell rendered">
 46 | 
 47 | <div class="output_wrapper">
 48 | <div class="output">
 49 | 
 50 | <div class="output_area">
 51 | 
 52 | 
 53 | <div class="output_markdown rendered_html output_subarea ">
 54 | <h4 id="similars_tree" class="doc_header"><code>similars_tree</code><a href="https://github.com/dperezrada/keywords2vec/tree/master/keywords2vec/main.py#L67" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>similars_tree</code>(<strong><code>input_path</code></strong>, <strong><code>temp_tokenized_file</code></strong>=<em><code>'tmp_tokenized.txt'</code></em>, <strong><code>lang</code></strong>=<em><code>'en'</code></em>, <strong><code>sample_size</code></strong>=<em><code>-1</code></em>, <strong><code>lines_chunks</code></strong>=<em><code>-1</code></em>, <strong><code>n_cpus</code></strong>=<em><code>-1</code></em>, <strong><code>keywords_w_stopwords</code></strong>=<em><code>False</code></em>)</p>
 55 | </blockquote>
 56 | 
 57 | </div>
 58 | 
 59 | </div>
 60 | 
 61 | </div>
 62 | </div>
 63 | 
 64 | </div>
 65 | <div class="cell border-box-sizing code_cell rendered">
 66 | <div class="input">
 67 | 
 68 | <div class="inner_cell">
 69 |     <div class="input_area">
 70 | <div class=" highlight hl-ipython3"><pre><span></span><span class="n">data_url</span> <span class="o">=</span> <span class="s2">&quot;https://s3.amazonaws.com/episte-labs/epistemonikos_data_sample.tsv.gz&quot;</span>
 71 | <span class="n">data_filepath</span> <span class="o">=</span> <span class="s2">&quot;epistemonikos_data_sample.tsv.gz&quot;</span>
 72 | <span class="n">tokenized_filepath</span> <span class="o">=</span> <span class="s2">&quot;tokenized_epistemonikos_data.txt&quot;</span>
 73 | <span class="o">!</span>wget <span class="s2">&quot;{data_url}&quot;</span> -O <span class="s2">&quot;{data_filepath}&quot;</span>
 74 | </pre></div>
 75 | 
 76 |     </div>
 77 | </div>
 78 | </div>
 79 | 
 80 | <div class="output_wrapper">
 81 | <div class="output">
 82 | 
 83 | <div class="output_area">
 84 | 
 85 | <div class="output_subarea output_stream output_stdout output_text">
 86 | <pre>--2020-02-25 11:52:04--  https://s3.amazonaws.com/episte-labs/epistemonikos_data_sample.tsv.gz
 87 | Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.240.38
 88 | Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.240.38|:443... connected.
 89 | HTTP request sent, awaiting response... 200 OK
 90 | Length: 21510551 (21M) [application/gzip]
 91 | Saving to: ‘epistemonikos_data_sample.tsv.gz’
 92 | 
 93 | epistemonikos_data_ 100%[===================&gt;]  20.51M  1.76MB/s    in 12s     
 94 | 
 95 | 2020-02-25 11:52:17 (1.70 MB/s) - ‘epistemonikos_data_sample.tsv.gz’ saved [21510551/21510551]
 96 | 
 97 | </pre>
 98 | </div>
 99 | </div>
100 | 
101 | </div>
102 | </div>
103 | 
104 | </div>
105 | <div class="cell border-box-sizing code_cell rendered">
106 | 
107 | <div class="output_wrapper">
108 | <div class="output">
109 | 
110 | <div class="output_area">
111 | 
112 | 
113 | <div class="output_markdown rendered_html output_subarea ">
114 | <h4 id="tokenize_file" class="doc_header"><code>tokenize_file</code><a href="https://github.com/dperezrada/keywords2vec/tree/master/keywords2vec/main.py#L18" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>tokenize_file</code>(<strong><code>input_path</code></strong>, <strong><code>output_path</code></strong>=<em><code>'tokenized.txt'</code></em>, <strong><code>lang</code></strong>=<em><code>'en'</code></em>, <strong><code>sample_size</code></strong>=<em><code>-1</code></em>, <strong><code>lines_chunks</code></strong>=<em><code>-1</code></em>, <strong><code>n_cpus</code></strong>=<em><code>-1</code></em>, <strong><code>keywords_w_stopwords</code></strong>=<em><code>False</code></em>)</p>
115 | </blockquote>
116 | 
117 | </div>
118 | 
119 | </div>
120 | 
121 | </div>
122 | </div>
123 | 
124 | </div>
125 | <div class="cell border-box-sizing code_cell rendered">
126 | <div class="input">
127 | 
128 | <div class="inner_cell">
129 |     <div class="input_area">
130 | <div class=" highlight hl-ipython3"><pre><span></span><span class="n">tokenize_file</span><span class="p">(</span><span class="n">data_filepath</span><span class="p">,</span> <span class="n">tokenized_filepath</span><span class="p">)</span>
131 | </pre></div>
132 | 
133 |     </div>
134 | </div>
135 | </div>
136 | 
137 | <div class="output_wrapper">
138 | <div class="output">
139 | 
140 | <div class="output_area">
141 | 
142 | <div class="output_subarea output_stream output_stdout output_text">
143 | <pre>processing file: epistemonikos_data_sample.tsv.gz
144 | </pre>
145 | </div>
146 | </div>
147 | 
148 | <div class="output_area">
149 | 
150 | 
151 | <div class="output_html rendered_html output_subarea ">
152 | 
153 |     <div>
154 |         <style>
155 |             /* Turns off some styling */
156 |             progress {
157 |                 /* gets rid of default border in Firefox and Opera. */
158 |                 border: none;
159 |                 /* Needs to be in here for Safari polyfill so background images work as expected. */
160 |                 background-size: auto;
161 |             }
162 |             .progress-bar-interrupted, .progress-bar-interrupted::-webkit-progress-bar {
163 |                 background: #F44336;
164 |             }
165 |         </style>
166 |       <progress value='201' class='' max='201', style='width:300px; height:20px; vertical-align: middle;'></progress>
167 |       100.00% [201/201 00:16<00:00]
168 |     </div>
169 |     
170 | </div>
171 | 
172 | </div>
173 | 
174 | <div class="output_area">
175 | 
176 | 
177 | 
178 | <div class="output_text output_subarea output_execute_result">
179 | <pre>&#39;tokenized_epistemonikos_data.txt&#39;</pre>
180 | </div>
181 | 
182 | </div>
183 | 
184 | </div>
185 | </div>
186 | 
187 | </div>
188 | <div class="cell border-box-sizing code_cell rendered">
189 | 
190 | <div class="output_wrapper">
191 | <div class="output">
192 | 
193 | <div class="output_area">
194 | 
195 | 
196 | <div class="output_markdown rendered_html output_subarea ">
197 | <h4 id="train_model" class="doc_header"><code>train_model</code><a href="https://github.com/dperezrada/keywords2vec/tree/master/keywords2vec/main.py#L43" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>train_model</code>(<strong><code>input_filename</code></strong>)</p>
198 | </blockquote>
199 | 
200 | </div>
201 | 
202 | </div>
203 | 
204 | </div>
205 | </div>
206 | 
207 | </div>
208 | <div class="cell border-box-sizing code_cell rendered">
209 | <div class="input">
210 | 
211 | <div class="inner_cell">
212 |     <div class="input_area">
213 | <div class=" highlight hl-ipython3"><pre><span></span><span class="n">model</span> <span class="o">=</span> <span class="n">train_model</span><span class="p">(</span><span class="n">tokenized_filepath</span><span class="p">)</span>
214 | </pre></div>
215 | 
216 |     </div>
217 | </div>
218 | </div>
219 | 
220 | </div>
221 | <div class="cell border-box-sizing code_cell rendered">
222 | 
223 | <div class="output_wrapper">
224 | <div class="output">
225 | 
226 | <div class="output_area">
227 | 
228 | 
229 | <div class="output_markdown rendered_html output_subarea ">
230 | <h4 id="similars_tree_from_model" class="doc_header"><code>similars_tree_from_model</code><a href="https://github.com/dperezrada/keywords2vec/tree/master/keywords2vec/main.py#L47" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>similars_tree_from_model</code>(<strong><code>model</code></strong>, <strong><code>vector_size</code></strong>=<em><code>100</code></em>)</p>
231 | </blockquote>
232 | 
233 | </div>
234 | 
235 | </div>
236 | 
237 | </div>
238 | </div>
239 | 
240 | </div>
241 | <div class="cell border-box-sizing code_cell rendered">
242 | <div class="input">
243 | 
244 | <div class="inner_cell">
245 |     <div class="input_area">
246 | <div class=" highlight hl-ipython3"><pre><span></span><span class="n">labels</span><span class="p">,</span> <span class="n">tree</span> <span class="o">=</span> <span class="n">similars_tree_from_model</span><span class="p">(</span><span class="n">model</span><span class="p">)</span>
247 | </pre></div>
248 | 
249 |     </div>
250 | </div>
251 | </div>
252 | 
253 | </div>
254 | <div class="cell border-box-sizing code_cell rendered">
255 | 
256 | <div class="output_wrapper">
257 | <div class="output">
258 | 
259 | <div class="output_area">
260 | 
261 | 
262 | <div class="output_markdown rendered_html output_subarea ">
263 | <h4 id="get_similars" class="doc_header"><code>get_similars</code><a href="https://github.com/dperezrada/keywords2vec/tree/master/keywords2vec/main.py#L58" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>get_similars</code>(<strong><code>tree</code></strong>, <strong><code>labels</code></strong>, <strong><code>keyword</code></strong>, <strong><code>n_similars</code></strong>=<em><code>10</code></em>, <strong><code>show_score</code></strong>=<em><code>False</code></em>)</p>
264 | </blockquote>
265 | 
266 | </div>
267 | 
268 | </div>
269 | 
270 | </div>
271 | </div>
272 | 
273 | </div>
274 | <div class="cell border-box-sizing code_cell rendered">
275 | <div class="input">
276 | 
277 | <div class="inner_cell">
278 |     <div class="input_area">
279 | <div class=" highlight hl-ipython3"><pre><span></span><span class="n">get_similars</span><span class="p">(</span><span class="n">tree</span><span class="p">,</span> <span class="n">labels</span><span class="p">,</span> <span class="s2">&quot;obesity&quot;</span><span class="p">)</span>
280 | </pre></div>
281 | 
282 |     </div>
283 | </div>
284 | </div>
285 | 
286 | <div class="output_wrapper">
287 | <div class="output">
288 | 
289 | <div class="output_area">
290 | 
291 | 
292 | 
293 | <div class="output_text output_subarea output_execute_result">
294 | <pre>[&#39;obesity&#39;,
295 |  &#39;overweight&#39;,
296 |  &#39;obese children&#39;,
297 |  &#39;ssbs&#39;,
298 |  &#39;poor sleep quality&#39;,
299 |  &#39;metabolic syndrome&#39;,
300 |  &#39;obesity among children&#39;,
301 |  &#39;dental caries&#39;,
302 |  &#39;physical inactivity&#39;,
303 |  &#39;obesity may&#39;,
304 |  &#39;sedentary behaviour&#39;,
305 |  &#39;food allergy&#39;,
306 |  &#39;sugar-sweetened beverages&#39;,
307 |  &#39;worldwide prevalence&#39;,
308 |  &#39;known risk factor&#39;]</pre>
309 | </div>
310 | 
311 | </div>
312 | 
313 | </div>
314 | </div>
315 | 
316 | </div>
317 |     {% endraw %}
318 | </div>
319 |  
320 | 
321 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/index.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "#hide\n",
 10 |     "from keywords2vec.main import *"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "markdown",
 15 |    "metadata": {},
 16 |    "source": [
 17 |     "# keywords2vec\n",
 18 |     "\n",
 19 |     "> A simple and fast way to generate a word2vec model, with multi-word keywords instead of single words.\n"
 20 |    ]
 21 |   },
 22 |   {
 23 |    "cell_type": "markdown",
 24 |    "metadata": {},
 25 |    "source": [
 26 |     "## Example result"
 27 |    ]
 28 |   },
 29 |   {
 30 |    "cell_type": "markdown",
 31 |    "metadata": {},
 32 |    "source": [
 33 |     "Finding similar keywords for \"__obesity__\"\n",
 34 |     "\n",
 35 |     "| index | term                        |\n",
 36 |     "|-------|-----------------------------|\n",
 37 |     "| 0     | overweight                  |\n",
 38 |     "| 1     | obese                       |\n",
 39 |     "| 2     | physical inactivity         |\n",
 40 |     "| 3     | excess weight               |\n",
 41 |     "| 4     | obese adults                |\n",
 42 |     "| 5     | high bmi                    |\n",
 43 |     "| 6     | obese adults                |\n",
 44 |     "| 7     | obese people                |\n",
 45 |     "| 8     | obesity-related outcomes    |\n",
 46 |     "| 9     | obesity among children      |\n",
 47 |     "| 10    | poor sleep quality          |\n",
 48 |     "| 11    | ssbs                        |\n",
 49 |     "| 12    | obese populations           |\n",
 50 |     "| 13    | cardiometabolic risk        |\n",
 51 |     "| 14    | abdominal obesity           |\n"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "markdown",
 56 |    "metadata": {},
 57 |    "source": [
 58 |     "## Install"
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "markdown",
 63 |    "metadata": {},
 64 |    "source": [
 65 |     "`pip install keywords2vec`"
 66 |    ]
 67 |   },
 68 |   {
 69 |    "cell_type": "markdown",
 70 |    "metadata": {},
 71 |    "source": [
 72 |     "## How to use"
 73 |    ]
 74 |   },
 75 |   {
 76 |    "cell_type": "markdown",
 77 |    "metadata": {},
 78 |    "source": [
 79 |     "Lets download some example data"
 80 |    ]
 81 |   },
 82 |   {
 83 |    "cell_type": "code",
 84 |    "execution_count": null,
 85 |    "metadata": {},
 86 |    "outputs": [],
 87 |    "source": [
 88 |     "data_filepath = \"epistemonikos_data_sample.tsv.gz\"\n",
 89 |     "\n",
 90 |     "!wget \"https://s3.amazonaws.com/episte-labs/epistemonikos_data_sample.tsv.gz\" -O \"{data_filepath}\""
 91 |    ]
 92 |   },
 93 |   {
 94 |    "cell_type": "markdown",
 95 |    "metadata": {},
 96 |    "source": [
 97 |     "We create the model. If you need the vectors, take a look [here](30_main.ipynb)"
 98 |    ]
 99 |   },
100 |   {
101 |    "cell_type": "code",
102 |    "execution_count": null,
103 |    "metadata": {},
104 |    "outputs": [
105 |     {
106 |      "name": "stdout",
107 |      "output_type": "stream",
108 |      "text": [
109 |       "processing file: epistemonikos_data_sample.tsv.gz\n"
110 |      ]
111 |     },
112 |     {
113 |      "data": {
114 |       "text/html": [
115 |        "\n",
116 |        "    <div>\n",
117 |        "        <style>\n",
118 |        "            /* Turns off some styling */\n",
119 |        "            progress {\n",
120 |        "                /* gets rid of default border in Firefox and Opera. */\n",
121 |        "                border: none;\n",
122 |        "                /* Needs to be in here for Safari polyfill so background images work as expected. */\n",
123 |        "                background-size: auto;\n",
124 |        "            }\n",
125 |        "            .progress-bar-interrupted, .progress-bar-interrupted::-webkit-progress-bar {\n",
126 |        "                background: #F44336;\n",
127 |        "            }\n",
128 |        "        </style>\n",
129 |        "      <progress value='201' class='' max='201', style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
130 |        "      100.00% [201/201 00:19<00:00]\n",
131 |        "    </div>\n",
132 |        "    "
133 |       ],
134 |       "text/plain": [
135 |        "<IPython.core.display.HTML object>"
136 |       ]
137 |      },
138 |      "metadata": {},
139 |      "output_type": "display_data"
140 |     }
141 |    ],
142 |    "source": [
143 |     "labels, tree = similars_tree(data_filepath)"
144 |    ]
145 |   },
146 |   {
147 |    "cell_type": "markdown",
148 |    "metadata": {},
149 |    "source": [
150 |     "Then we can get the most similars keywords"
151 |    ]
152 |   },
153 |   {
154 |    "cell_type": "code",
155 |    "execution_count": null,
156 |    "metadata": {},
157 |    "outputs": [
158 |     {
159 |      "data": {
160 |       "text/plain": [
161 |        "['obesity',\n",
162 |        " 'overweight',\n",
163 |        " 'obese',\n",
164 |        " 'physical inactivity',\n",
165 |        " 'excess weight',\n",
166 |        " 'high bmi',\n",
167 |        " 'obese adults',\n",
168 |        " 'obese people',\n",
169 |        " 'obesity-related outcomes',\n",
170 |        " 'obesity among children',\n",
171 |        " 'poor sleep quality',\n",
172 |        " 'ssbs',\n",
173 |        " 'obese populations',\n",
174 |        " 'cardiometabolic risk',\n",
175 |        " 'abdominal obesity']"
176 |       ]
177 |      },
178 |      "execution_count": null,
179 |      "metadata": {},
180 |      "output_type": "execute_result"
181 |     }
182 |    ],
183 |    "source": [
184 |     "get_similars(tree, labels, \"obesity\")"
185 |    ]
186 |   },
187 |   {
188 |    "cell_type": "code",
189 |    "execution_count": null,
190 |    "metadata": {},
191 |    "outputs": [
192 |     {
193 |      "data": {
194 |       "text/plain": [
195 |        "['heart failure',\n",
196 |        " 'hf',\n",
197 |        " 'chf',\n",
198 |        " 'chronic heart failure',\n",
199 |        " 'reduced ejection fraction',\n",
200 |        " 'unstable angina',\n",
201 |        " 'peripheral vascular disease',\n",
202 |        " 'peripheral arterial disease',\n",
203 |        " 'angina',\n",
204 |        " 'congestive heart failure',\n",
205 |        " 'left ventricular systolic dysfunction',\n",
206 |        " 'acute coronary syndrome',\n",
207 |        " 'heart failure patients',\n",
208 |        " 'acute myocardial infarction',\n",
209 |        " 'left ventricular dysfunction']"
210 |       ]
211 |      },
212 |      "execution_count": null,
213 |      "metadata": {},
214 |      "output_type": "execute_result"
215 |     }
216 |    ],
217 |    "source": [
218 |     "get_similars(tree, labels, \"heart failure\")"
219 |    ]
220 |   },
221 |   {
222 |    "cell_type": "markdown",
223 |    "metadata": {},
224 |    "source": [
225 |     "### Motivation\n",
226 |     "\n",
227 |     "The idea started in the Epistemonikos database [www.epistemonikos.org](https://www.epistemonikos.org), a database of scientific articles for people making decisions concerning clinical or health-policy questions. In this context the scientific/health language used is complex. You can easily find keywords like:\n",
228 |     "\n",
229 |     " * asthma\n",
230 |     " * heart failure\n",
231 |     " * medial compartment knee osteoarthritis\n",
232 |     " * preserved left ventricular systolic function\n",
233 |     " * non-selective non-steroidal anti-inflammatory drugs\n",
234 |     " \n",
235 |     "We tried some approaches to find those keywords, like ngrams, ngrams + tf-idf, identify entities, among others. But we didn't get really good results.\n"
236 |    ]
237 |   },
238 |   {
239 |    "cell_type": "markdown",
240 |    "metadata": {},
241 |    "source": [
242 |     "### Our approach\n",
243 |     "\n",
244 |     "We found that tokenizing using stopwords + non word characters was really useful for \"finding\" the keywords. An example:\n",
245 |     "\n",
246 |     "* input: \"Timing of replacement therapy for acute renal failure after cardiac surgery\"\n",
247 |     "* output: [\n",
248 |     "\t\"timing\",\n",
249 |     "\t\"replacement therapy\",\n",
250 |     "\t\"acute renal failure\",\n",
251 |     "\t\"cardiac surgery\"\n",
252 |     "]\n",
253 |     "\n",
254 |     "So we basically split the text when we find:\n",
255 |     " * a stopword\n",
256 |     " * a non word character(/,!?. etc) (except from - and ')\n",
257 |     "\n",
258 |     "That's it.\n",
259 |     "\n",
260 |     "But as there were some problem with some keywords that cointain stopwords, like:\n",
261 |     " * Vitamin A\n",
262 |     " * Hepatitis A\n",
263 |     " * Web of Science\n",
264 |     "\n",
265 |     "So we decided to add another method (nltk with some grammar definition) to cover most of the cases. To use this, you need to add the parameter `keywords_w_stopwords=True`, this method is approx 20x slower."
266 |    ]
267 |   },
268 |   {
269 |    "cell_type": "markdown",
270 |    "metadata": {},
271 |    "source": [
272 |     "### References\n",
273 |     "\n",
274 |     "Seem to be an old idea (2004):\n",
275 |     "\n",
276 |     "*Mihalcea, Rada, and Paul Tarau. \"Textrank: Bringing order into text.\" Proceedings of the 2004 conference on empirical methods in natural language processing. 2004.*\n",
277 |     "\n",
278 |     "Reading an implementation of textrank, I realize they used stopwords to separate and create the graph. Then I though in using it as tokenizer for word2vec\n",
279 |     "\n",
280 |     "As pointed by @deliprao in this [twitter thread](https://twitter.com/jeremyphoward/status/1094025901371621376). It's also used by Rake (2010):\n",
281 |     "\n",
282 |     "*Rose, Stuart & Engel, Dave & Cramer, Nick & Cowley, Wendy. (2010). Automatic Keyword Extraction from Individual Documents. 10.1002/9780470689646.ch1.*\n",
283 |     "\n",
284 |     "As noted by @astent in the Twitter thread, this concept is called chinking (chunking by exclusion)\n",
285 |     "[https://www.nltk.org/book/ch07.html#Chinking](https://www.nltk.org/book/ch07.html#Chinking)\n"
286 |    ]
287 |   },
288 |   {
289 |    "cell_type": "markdown",
290 |    "metadata": {},
291 |    "source": [
292 |     "### Multi-lingual\n",
293 |     "We worked in an implementation, that could be used in multiple languages. Of course not all languages are sutable for using this approach. We have tried with good results in English, Spanish and Portuguese\n"
294 |    ]
295 |   },
296 |   {
297 |    "cell_type": "markdown",
298 |    "metadata": {},
299 |    "source": [
300 |     "## Try it online\n",
301 |     "\n",
302 |     "You can try it [here](http://54.196.169.11/episte/) (takes time to load, lowercase only, doesn't work in mobile yet) MPV :)\n",
303 |     "\n",
304 |     "These embedding were created using 827,341 title/abstract from @epistemonikos database.\n",
305 |     "With keywords that repeat at least 10 times. The total vocab is 349,080 keywords (really manageable number)"
306 |    ]
307 |   },
308 |   {
309 |    "cell_type": "markdown",
310 |    "metadata": {},
311 |    "source": [
312 |     "## Vocab size\n",
313 |     "\n",
314 |     "One of the main benefit of this method, is the size of the vocabulary. \n",
315 |     "For example, using keywords that repeat at least 10 times, for the Epistemonikos dataset (827,341 title/abstract), we got the following vocab size:\n",
316 |     "\n",
317 |     "| ngrams             | keywords  | comp    |\n",
318 |     "|--------------------|-----------|---------|\n",
319 |     "| 1                  | 127,824   | 36%     |\n",
320 |     "| 1,2                | 1,360,550 | 388%    |\n",
321 |     "| 1-3                | 3,204,099 | 914%    |\n",
322 |     "| 1-4                | 4,461,930 | 1,272%  |\n",
323 |     "| 1-5                | 5,133,619 | 1,464%  |\n",
324 |     "|                    |           |         |\n",
325 |     "| stopword tokenizer | 350,529   | 100%    |\n",
326 |     "\n",
327 |     "More information regarding the comparison, take a look to the folder [analyze](analyze).\n"
328 |    ]
329 |   },
330 |   {
331 |    "cell_type": "markdown",
332 |    "metadata": {},
333 |    "source": [
334 |     "## Credits"
335 |    ]
336 |   },
337 |   {
338 |    "cell_type": "markdown",
339 |    "metadata": {},
340 |    "source": [
341 |     "This project has been created using [nbdev](https://github.com/fastai/nbdev)"
342 |    ]
343 |   },
344 |   {
345 |    "cell_type": "code",
346 |    "execution_count": null,
347 |    "metadata": {},
348 |    "outputs": [],
349 |    "source": []
350 |   }
351 |  ],
352 |  "metadata": {
353 |   "kernelspec": {
354 |    "display_name": "Python 3",
355 |    "language": "python",
356 |    "name": "python3"
357 |   }
358 |  },
359 |  "nbformat": 4,
360 |  "nbformat_minor": 2
361 | }
362 | 


--------------------------------------------------------------------------------
/30_main.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "# default_exp main"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": null,
 15 |    "metadata": {},
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "from nbdev.showdoc import *"
 19 |    ]
 20 |   },
 21 |   {
 22 |    "cell_type": "markdown",
 23 |    "metadata": {},
 24 |    "source": [
 25 |     "# Main\n",
 26 |     "\n",
 27 |     "> This are the main functions, where we are going to "
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "code",
 32 |    "execution_count": null,
 33 |    "metadata": {},
 34 |    "outputs": [],
 35 |    "source": [
 36 |     "# export \n",
 37 |     "from keywords2vec.imports import *\n",
 38 |     "\n",
 39 |     "from glob import glob\n",
 40 |     "from functools import partial\n",
 41 |     "\n",
 42 |     "import fasttext\n",
 43 |     "\n",
 44 |     "from keywords2vec.utils import parallel, open_file, chunk_of_text, get_file_chunks\n",
 45 |     "from keywords2vec.tokenizer import tokenize"
 46 |    ]
 47 |   },
 48 |   {
 49 |    "cell_type": "code",
 50 |    "execution_count": null,
 51 |    "metadata": {},
 52 |    "outputs": [],
 53 |    "source": [
 54 |     "#export\n",
 55 |     "\n",
 56 |     "def tokenize_file(\n",
 57 |     "    input_path, output_path=\"tokenized.txt\", lang=\"en\",\n",
 58 |     "    sample_size=-1, lines_chunks=-1, n_cpus=-1, keywords_w_stopwords=False\n",
 59 |     "):\n",
 60 |     "    tokenize_wrapper = partial(\n",
 61 |     "        tokenize, lang=lang, text_output=True, merge=True, keywords_w_stopwords=keywords_w_stopwords\n",
 62 |     "    )\n",
 63 |     "\n",
 64 |     "    index = 0\n",
 65 |     "\n",
 66 |     "    with open(output_path, \"wt\") as _output:\n",
 67 |     "        for file_path in glob(input_path):\n",
 68 |     "            print(\"processing file:\", file_path)\n",
 69 |     "            # We are going to split the text in chunks to show some progress.\n",
 70 |     "            new_index, text_chunks, break_by_sample = get_file_chunks(index, file_path, lines_chunks, sample_size)\n",
 71 |     "            index = new_index\n",
 72 |     "            results = parallel(tokenize_wrapper, text_chunks, n_cpus)\n",
 73 |     "            _output.write(\n",
 74 |     "                (\"\\n\".join(results) + \"\\n\").replace(\" \", \"_\").replace(\"!\", \" \")\n",
 75 |     "            )\n",
 76 |     "            if break_by_sample:\n",
 77 |     "                break\n",
 78 |     "    return output_path\n",
 79 |     "\n",
 80 |     "\n",
 81 |     "def train_model(input_filename):\n",
 82 |     "    model = fasttext.train_unsupervised(input_filename, model='skipgram', maxn=0, dim=100, ws=5)\n",
 83 |     "    return model\n",
 84 |     "\n",
 85 |     "def similars_tree_from_model(model, vector_size=100):\n",
 86 |     "    f = 100\n",
 87 |     "    t = AnnoyIndex(f, 'angular')  # Length of item vector that will be indexed\n",
 88 |     "    labels = model.labels\n",
 89 |     "    for index, label in enumerate(labels):\n",
 90 |     "        v = model[label]\n",
 91 |     "        t.add_item(index, v)\n",
 92 |     "\n",
 93 |     "    t.build(10) # 10 trees\n",
 94 |     "    return labels, t\n",
 95 |     "\n",
 96 |     "def get_similars(tree, labels, keyword, n_similars=10, show_score=False):\n",
 97 |     "    index = labels.index(keyword.replace(\" \", \"_\"))\n",
 98 |     "    suggestions, scores = tree.get_nns_by_item(index, n=15, include_distances=True)\n",
 99 |     "    suggested_labels = [\n",
100 |     "        labels[suggestion].replace(\"_\", \" \")\n",
101 |     "        for suggestion in suggestions\n",
102 |     "    ]\n",
103 |     "    return suggested_labels\n",
104 |     "\n",
105 |     "def similars_tree(\n",
106 |     "    input_path, temp_tokenized_file=\"tmp_tokenized.txt\", lang=\"en\",\n",
107 |     "    sample_size=-1, lines_chunks=-1, n_cpus=-1, keywords_w_stopwords=False\n",
108 |     "):\n",
109 |     "    tokenize_file(\n",
110 |     "        input_path=input_path, output_path=temp_tokenized_file, lang=lang,\n",
111 |     "        sample_size=sample_size, lines_chunks=lines_chunks, n_cpus=n_cpus,\n",
112 |     "        keywords_w_stopwords=keywords_w_stopwords\n",
113 |     "    )\n",
114 |     "    model = train_model(temp_tokenized_file)\n",
115 |     "    labels, tree = similars_tree_from_model(model)\n",
116 |     "    return labels, tree\n",
117 |     "    "
118 |    ]
119 |   },
120 |   {
121 |    "cell_type": "code",
122 |    "execution_count": null,
123 |    "metadata": {},
124 |    "outputs": [
125 |     {
126 |      "name": "stdout",
127 |      "output_type": "stream",
128 |      "text": [
129 |       "--2020-02-25 11:52:04--  https://s3.amazonaws.com/episte-labs/epistemonikos_data_sample.tsv.gz\n",
130 |       "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.240.38\n",
131 |       "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.240.38|:443... connected.\n",
132 |       "HTTP request sent, awaiting response... 200 OK\n",
133 |       "Length: 21510551 (21M) [application/gzip]\n",
134 |       "Saving to: ‘epistemonikos_data_sample.tsv.gz’\n",
135 |       "\n",
136 |       "epistemonikos_data_ 100%[===================>]  20.51M  1.76MB/s    in 12s     \n",
137 |       "\n",
138 |       "2020-02-25 11:52:17 (1.70 MB/s) - ‘epistemonikos_data_sample.tsv.gz’ saved [21510551/21510551]\n",
139 |       "\n"
140 |      ]
141 |     }
142 |    ],
143 |    "source": [
144 |     "data_url = \"https://s3.amazonaws.com/episte-labs/epistemonikos_data_sample.tsv.gz\"\n",
145 |     "data_filepath = \"epistemonikos_data_sample.tsv.gz\"\n",
146 |     "tokenized_filepath = \"tokenized_epistemonikos_data.txt\"\n",
147 |     "!wget \"{data_url}\" -O \"{data_filepath}\""
148 |    ]
149 |   },
150 |   {
151 |    "cell_type": "code",
152 |    "execution_count": null,
153 |    "metadata": {},
154 |    "outputs": [
155 |     {
156 |      "data": {
157 |       "text/markdown": [
158 |        "<h4 id=\"tokenize_file\" class=\"doc_header\"><code>tokenize_file</code><a href=\"__main__.py#L3\" class=\"source_link\" style=\"float:right\">[source]</a></h4>\n",
159 |        "\n",
160 |        "> <code>tokenize_file</code>(**`input_path`**, **`output_path`**=*`'tokenized.txt'`*, **`lang`**=*`'en'`*, **`sample_size`**=*`-1`*, **`lines_chunks`**=*`-1`*, **`n_cpus`**=*`-1`*, **`keywords_w_stopwords`**=*`False`*)\n",
161 |        "\n"
162 |       ],
163 |       "text/plain": [
164 |        "<IPython.core.display.Markdown object>"
165 |       ]
166 |      },
167 |      "metadata": {},
168 |      "output_type": "display_data"
169 |     }
170 |    ],
171 |    "source": [
172 |     "show_doc(tokenize_file)"
173 |    ]
174 |   },
175 |   {
176 |    "cell_type": "code",
177 |    "execution_count": null,
178 |    "metadata": {},
179 |    "outputs": [
180 |     {
181 |      "name": "stdout",
182 |      "output_type": "stream",
183 |      "text": [
184 |       "processing file: epistemonikos_data_sample.tsv.gz\n"
185 |      ]
186 |     },
187 |     {
188 |      "data": {
189 |       "text/html": [
190 |        "\n",
191 |        "    <div>\n",
192 |        "        <style>\n",
193 |        "            /* Turns off some styling */\n",
194 |        "            progress {\n",
195 |        "                /* gets rid of default border in Firefox and Opera. */\n",
196 |        "                border: none;\n",
197 |        "                /* Needs to be in here for Safari polyfill so background images work as expected. */\n",
198 |        "                background-size: auto;\n",
199 |        "            }\n",
200 |        "            .progress-bar-interrupted, .progress-bar-interrupted::-webkit-progress-bar {\n",
201 |        "                background: #F44336;\n",
202 |        "            }\n",
203 |        "        </style>\n",
204 |        "      <progress value='201' class='' max='201', style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
205 |        "      100.00% [201/201 00:16<00:00]\n",
206 |        "    </div>\n",
207 |        "    "
208 |       ],
209 |       "text/plain": [
210 |        "<IPython.core.display.HTML object>"
211 |       ]
212 |      },
213 |      "metadata": {},
214 |      "output_type": "display_data"
215 |     },
216 |     {
217 |      "data": {
218 |       "text/plain": [
219 |        "'tokenized_epistemonikos_data.txt'"
220 |       ]
221 |      },
222 |      "execution_count": null,
223 |      "metadata": {},
224 |      "output_type": "execute_result"
225 |     }
226 |    ],
227 |    "source": [
228 |     "tokenize_file(data_filepath, tokenized_filepath)"
229 |    ]
230 |   },
231 |   {
232 |    "cell_type": "code",
233 |    "execution_count": null,
234 |    "metadata": {},
235 |    "outputs": [
236 |     {
237 |      "data": {
238 |       "text/markdown": [
239 |        "<h4 id=\"train_model\" class=\"doc_header\"><code>train_model</code><a href=\"__main__.py#L28\" class=\"source_link\" style=\"float:right\">[source]</a></h4>\n",
240 |        "\n",
241 |        "> <code>train_model</code>(**`input_filename`**)\n",
242 |        "\n"
243 |       ],
244 |       "text/plain": [
245 |        "<IPython.core.display.Markdown object>"
246 |       ]
247 |      },
248 |      "metadata": {},
249 |      "output_type": "display_data"
250 |     }
251 |    ],
252 |    "source": [
253 |     "show_doc(train_model)"
254 |    ]
255 |   },
256 |   {
257 |    "cell_type": "code",
258 |    "execution_count": null,
259 |    "metadata": {},
260 |    "outputs": [],
261 |    "source": [
262 |     "model = train_model(tokenized_filepath)"
263 |    ]
264 |   },
265 |   {
266 |    "cell_type": "code",
267 |    "execution_count": null,
268 |    "metadata": {},
269 |    "outputs": [
270 |     {
271 |      "data": {
272 |       "text/markdown": [
273 |        "<h4 id=\"similars_tree_from_model\" class=\"doc_header\"><code>similars_tree_from_model</code><a href=\"__main__.py#L32\" class=\"source_link\" style=\"float:right\">[source]</a></h4>\n",
274 |        "\n",
275 |        "> <code>similars_tree_from_model</code>(**`model`**, **`vector_size`**=*`100`*)\n",
276 |        "\n"
277 |       ],
278 |       "text/plain": [
279 |        "<IPython.core.display.Markdown object>"
280 |       ]
281 |      },
282 |      "metadata": {},
283 |      "output_type": "display_data"
284 |     }
285 |    ],
286 |    "source": [
287 |     "show_doc(similars_tree_from_model)"
288 |    ]
289 |   },
290 |   {
291 |    "cell_type": "code",
292 |    "execution_count": null,
293 |    "metadata": {},
294 |    "outputs": [],
295 |    "source": [
296 |     "labels, tree = similars_tree_from_model(model)"
297 |    ]
298 |   },
299 |   {
300 |    "cell_type": "code",
301 |    "execution_count": null,
302 |    "metadata": {},
303 |    "outputs": [
304 |     {
305 |      "data": {
306 |       "text/markdown": [
307 |        "<h4 id=\"get_similars\" class=\"doc_header\"><code>get_similars</code><a href=\"__main__.py#L43\" class=\"source_link\" style=\"float:right\">[source]</a></h4>\n",
308 |        "\n",
309 |        "> <code>get_similars</code>(**`tree`**, **`labels`**, **`keyword`**, **`n_similars`**=*`10`*, **`show_score`**=*`False`*)\n",
310 |        "\n"
311 |       ],
312 |       "text/plain": [
313 |        "<IPython.core.display.Markdown object>"
314 |       ]
315 |      },
316 |      "metadata": {},
317 |      "output_type": "display_data"
318 |     }
319 |    ],
320 |    "source": [
321 |     "show_doc(get_similars)"
322 |    ]
323 |   },
324 |   {
325 |    "cell_type": "code",
326 |    "execution_count": null,
327 |    "metadata": {},
328 |    "outputs": [
329 |     {
330 |      "data": {
331 |       "text/plain": [
332 |        "['obesity',\n",
333 |        " 'overweight',\n",
334 |        " 'obese children',\n",
335 |        " 'ssbs',\n",
336 |        " 'poor sleep quality',\n",
337 |        " 'metabolic syndrome',\n",
338 |        " 'obesity among children',\n",
339 |        " 'dental caries',\n",
340 |        " 'physical inactivity',\n",
341 |        " 'obesity may',\n",
342 |        " 'sedentary behaviour',\n",
343 |        " 'food allergy',\n",
344 |        " 'sugar-sweetened beverages',\n",
345 |        " 'worldwide prevalence',\n",
346 |        " 'known risk factor']"
347 |       ]
348 |      },
349 |      "execution_count": null,
350 |      "metadata": {},
351 |      "output_type": "execute_result"
352 |     }
353 |    ],
354 |    "source": [
355 |     "get_similars(tree, labels, \"obesity\")"
356 |    ]
357 |   },
358 |   {
359 |    "cell_type": "code",
360 |    "execution_count": null,
361 |    "metadata": {},
362 |    "outputs": [],
363 |    "source": []
364 |   }
365 |  ],
366 |  "metadata": {
367 |   "kernelspec": {
368 |    "display_name": "Python 3",
369 |    "language": "python",
370 |    "name": "python3"
371 |   }
372 |  },
373 |  "nbformat": 4,
374 |  "nbformat_minor": 4
375 | }
376 | 


--------------------------------------------------------------------------------
/docs/index.html:
--------------------------------------------------------------------------------
  1 | ---
  2 | 
  3 | title: keywords2vec
  4 | 
  5 | keywords: fastai
  6 | sidebar: home_sidebar
  7 | 
  8 | summary: "A simple and fast way to generate a word2vec model, with multi-word keywords instead of single words."
  9 | ---
 10 | <!--
 11 | 
 12 | #################################################
 13 | ### THIS FILE WAS AUTOGENERATED! DO NOT EDIT! ###
 14 | #################################################
 15 | # file to edit: index.ipynb
 16 | # command to build the docs after a change: nbdev_build_docs
 17 | 
 18 | -->
 19 | 
 20 | <div class="container" id="notebook-container">
 21 |     {% raw %}
 22 |         
 23 | <div class="cell border-box-sizing code_cell rendered">
 24 | 
 25 | </div>
 26 | <div class="cell border-box-sizing text_cell rendered"><div class="inner_cell">
 27 | <div class="text_cell_render border-box-sizing rendered_html">
 28 | <h2 id="Example-result">Example result<a class="anchor-link" href="#Example-result">&#182;</a></h2>
 29 | </div>
 30 | </div>
 31 | </div>
 32 | <div class="cell border-box-sizing text_cell rendered"><div class="inner_cell">
 33 | <div class="text_cell_render border-box-sizing rendered_html">
 34 | <p>Finding similar keywords for "<strong>obesity</strong>"</p>
 35 | <table>
 36 | <thead><tr>
 37 | <th>index</th>
 38 | <th>term</th>
 39 | </tr>
 40 | </thead>
 41 | <tbody>
 42 | <tr>
 43 | <td>0</td>
 44 | <td>overweight</td>
 45 | </tr>
 46 | <tr>
 47 | <td>1</td>
 48 | <td>obese</td>
 49 | </tr>
 50 | <tr>
 51 | <td>2</td>
 52 | <td>physical inactivity</td>
 53 | </tr>
 54 | <tr>
 55 | <td>3</td>
 56 | <td>excess weight</td>
 57 | </tr>
 58 | <tr>
 59 | <td>4</td>
 60 | <td>obese adults</td>
 61 | </tr>
 62 | <tr>
 63 | <td>5</td>
 64 | <td>high bmi</td>
 65 | </tr>
 66 | <tr>
 67 | <td>6</td>
 68 | <td>obese adults</td>
 69 | </tr>
 70 | <tr>
 71 | <td>7</td>
 72 | <td>obese people</td>
 73 | </tr>
 74 | <tr>
 75 | <td>8</td>
 76 | <td>obesity-related outcomes</td>
 77 | </tr>
 78 | <tr>
 79 | <td>9</td>
 80 | <td>obesity among children</td>
 81 | </tr>
 82 | <tr>
 83 | <td>10</td>
 84 | <td>poor sleep quality</td>
 85 | </tr>
 86 | <tr>
 87 | <td>11</td>
 88 | <td>ssbs</td>
 89 | </tr>
 90 | <tr>
 91 | <td>12</td>
 92 | <td>obese populations</td>
 93 | </tr>
 94 | <tr>
 95 | <td>13</td>
 96 | <td>cardiometabolic risk</td>
 97 | </tr>
 98 | <tr>
 99 | <td>14</td>
100 | <td>abdominal obesity</td>
101 | </tr>
102 | </tbody>
103 | </table>
104 | 
105 | </div>
106 | </div>
107 | </div>
108 | <div class="cell border-box-sizing text_cell rendered"><div class="inner_cell">
109 | <div class="text_cell_render border-box-sizing rendered_html">
110 | <h2 id="Install">Install<a class="anchor-link" href="#Install">&#182;</a></h2>
111 | </div>
112 | </div>
113 | </div>
114 | <div class="cell border-box-sizing text_cell rendered"><div class="inner_cell">
115 | <div class="text_cell_render border-box-sizing rendered_html">
116 | <p><code>pip install keywords2vec</code></p>
117 | 
118 | </div>
119 | </div>
120 | </div>
121 | <div class="cell border-box-sizing text_cell rendered"><div class="inner_cell">
122 | <div class="text_cell_render border-box-sizing rendered_html">
123 | <h2 id="How-to-use">How to use<a class="anchor-link" href="#How-to-use">&#182;</a></h2>
124 | </div>
125 | </div>
126 | </div>
127 | <div class="cell border-box-sizing text_cell rendered"><div class="inner_cell">
128 | <div class="text_cell_render border-box-sizing rendered_html">
129 | <p>Lets download some example data</p>
130 | 
131 | </div>
132 | </div>
133 | </div>
134 | <div class="cell border-box-sizing code_cell rendered">
135 | <div class="input">
136 | 
137 | <div class="inner_cell">
138 |     <div class="input_area">
139 | <div class=" highlight hl-ipython3"><pre><span></span><span class="n">data_filepath</span> <span class="o">=</span> <span class="s2">&quot;epistemonikos_data_sample.tsv.gz&quot;</span>
140 | 
141 | <span class="o">!</span>wget <span class="s2">&quot;https://s3.amazonaws.com/episte-labs/epistemonikos_data_sample.tsv.gz&quot;</span> -O <span class="s2">&quot;{data_filepath}&quot;</span>
142 | </pre></div>
143 | 
144 |     </div>
145 | </div>
146 | </div>
147 | 
148 | </div>
149 | <div class="cell border-box-sizing text_cell rendered"><div class="inner_cell">
150 | <div class="text_cell_render border-box-sizing rendered_html">
151 | <p>We create the model. If you need the vectors, take a look <a href="30_main.ipynb">here</a></p>
152 | 
153 | </div>
154 | </div>
155 | </div>
156 | <div class="cell border-box-sizing code_cell rendered">
157 | <div class="input">
158 | 
159 | <div class="inner_cell">
160 |     <div class="input_area">
161 | <div class=" highlight hl-ipython3"><pre><span></span><span class="n">labels</span><span class="p">,</span> <span class="n">tree</span> <span class="o">=</span> <span class="n">similars_tree</span><span class="p">(</span><span class="n">data_filepath</span><span class="p">)</span>
162 | </pre></div>
163 | 
164 |     </div>
165 | </div>
166 | </div>
167 | 
168 | <div class="output_wrapper">
169 | <div class="output">
170 | 
171 | <div class="output_area">
172 | 
173 | <div class="output_subarea output_stream output_stdout output_text">
174 | <pre>processing file: epistemonikos_data_sample.tsv.gz
175 | </pre>
176 | </div>
177 | </div>
178 | 
179 | <div class="output_area">
180 | 
181 | 
182 | <div class="output_html rendered_html output_subarea ">
183 | 
184 |     <div>
185 |         <style>
186 |             /* Turns off some styling */
187 |             progress {
188 |                 /* gets rid of default border in Firefox and Opera. */
189 |                 border: none;
190 |                 /* Needs to be in here for Safari polyfill so background images work as expected. */
191 |                 background-size: auto;
192 |             }
193 |             .progress-bar-interrupted, .progress-bar-interrupted::-webkit-progress-bar {
194 |                 background: #F44336;
195 |             }
196 |         </style>
197 |       <progress value='201' class='' max='201', style='width:300px; height:20px; vertical-align: middle;'></progress>
198 |       100.00% [201/201 00:19<00:00]
199 |     </div>
200 |     
201 | </div>
202 | 
203 | </div>
204 | 
205 | </div>
206 | </div>
207 | 
208 | </div>
209 | <div class="cell border-box-sizing text_cell rendered"><div class="inner_cell">
210 | <div class="text_cell_render border-box-sizing rendered_html">
211 | <p>Then we can get the most similars keywords</p>
212 | 
213 | </div>
214 | </div>
215 | </div>
216 | <div class="cell border-box-sizing code_cell rendered">
217 | <div class="input">
218 | 
219 | <div class="inner_cell">
220 |     <div class="input_area">
221 | <div class=" highlight hl-ipython3"><pre><span></span><span class="n">get_similars</span><span class="p">(</span><span class="n">tree</span><span class="p">,</span> <span class="n">labels</span><span class="p">,</span> <span class="s2">&quot;obesity&quot;</span><span class="p">)</span>
222 | </pre></div>
223 | 
224 |     </div>
225 | </div>
226 | </div>
227 | 
228 | <div class="output_wrapper">
229 | <div class="output">
230 | 
231 | <div class="output_area">
232 | 
233 | 
234 | 
235 | <div class="output_text output_subarea output_execute_result">
236 | <pre>[&#39;obesity&#39;,
237 |  &#39;overweight&#39;,
238 |  &#39;obese&#39;,
239 |  &#39;physical inactivity&#39;,
240 |  &#39;excess weight&#39;,
241 |  &#39;high bmi&#39;,
242 |  &#39;obese adults&#39;,
243 |  &#39;obese people&#39;,
244 |  &#39;obesity-related outcomes&#39;,
245 |  &#39;obesity among children&#39;,
246 |  &#39;poor sleep quality&#39;,
247 |  &#39;ssbs&#39;,
248 |  &#39;obese populations&#39;,
249 |  &#39;cardiometabolic risk&#39;,
250 |  &#39;abdominal obesity&#39;]</pre>
251 | </div>
252 | 
253 | </div>
254 | 
255 | </div>
256 | </div>
257 | 
258 | </div>
259 | <div class="cell border-box-sizing code_cell rendered">
260 | <div class="input">
261 | 
262 | <div class="inner_cell">
263 |     <div class="input_area">
264 | <div class=" highlight hl-ipython3"><pre><span></span><span class="n">get_similars</span><span class="p">(</span><span class="n">tree</span><span class="p">,</span> <span class="n">labels</span><span class="p">,</span> <span class="s2">&quot;heart failure&quot;</span><span class="p">)</span>
265 | </pre></div>
266 | 
267 |     </div>
268 | </div>
269 | </div>
270 | 
271 | <div class="output_wrapper">
272 | <div class="output">
273 | 
274 | <div class="output_area">
275 | 
276 | 
277 | 
278 | <div class="output_text output_subarea output_execute_result">
279 | <pre>[&#39;heart failure&#39;,
280 |  &#39;hf&#39;,
281 |  &#39;chf&#39;,
282 |  &#39;chronic heart failure&#39;,
283 |  &#39;reduced ejection fraction&#39;,
284 |  &#39;unstable angina&#39;,
285 |  &#39;peripheral vascular disease&#39;,
286 |  &#39;peripheral arterial disease&#39;,
287 |  &#39;angina&#39;,
288 |  &#39;congestive heart failure&#39;,
289 |  &#39;left ventricular systolic dysfunction&#39;,
290 |  &#39;acute coronary syndrome&#39;,
291 |  &#39;heart failure patients&#39;,
292 |  &#39;acute myocardial infarction&#39;,
293 |  &#39;left ventricular dysfunction&#39;]</pre>
294 | </div>
295 | 
296 | </div>
297 | 
298 | </div>
299 | </div>
300 | 
301 | </div>
302 | <div class="cell border-box-sizing text_cell rendered"><div class="inner_cell">
303 | <div class="text_cell_render border-box-sizing rendered_html">
304 | <h3 id="Motivation">Motivation<a class="anchor-link" href="#Motivation">&#182;</a></h3><p>The idea started in the Epistemonikos database <a href="https://www.epistemonikos.org">www.epistemonikos.org</a>, a database of scientific articles for people making decisions concerning clinical or health-policy questions. In this context the scientific/health language used is complex. You can easily find keywords like:</p>
305 | <ul>
306 | <li>asthma</li>
307 | <li>heart failure</li>
308 | <li>medial compartment knee osteoarthritis</li>
309 | <li>preserved left ventricular systolic function</li>
310 | <li>non-selective non-steroidal anti-inflammatory drugs</li>
311 | </ul>
312 | <p>We tried some approaches to find those keywords, like ngrams, ngrams + tf-idf, identify entities, among others. But we didn't get really good results.</p>
313 | 
314 | </div>
315 | </div>
316 | </div>
317 | <div class="cell border-box-sizing text_cell rendered"><div class="inner_cell">
318 | <div class="text_cell_render border-box-sizing rendered_html">
319 | <h3 id="Our-approach">Our approach<a class="anchor-link" href="#Our-approach">&#182;</a></h3><p>We found that tokenizing using stopwords + non word characters was really useful for "finding" the keywords. An example:</p>
320 | <ul>
321 | <li>input: "Timing of replacement therapy for acute renal failure after cardiac surgery"</li>
322 | <li>output: [
323 |   "timing",
324 |   "replacement therapy",
325 |   "acute renal failure",
326 |   "cardiac surgery"
327 | ]</li>
328 | </ul>
329 | <p>So we basically split the text when we find:</p>
330 | <ul>
331 | <li>a stopword</li>
332 | <li>a non word character(/,!?. etc) (except from - and ')</li>
333 | </ul>
334 | <p>That's it.</p>
335 | <p>But as there were some problem with some keywords that cointain stopwords, like:</p>
336 | <ul>
337 | <li>Vitamin A</li>
338 | <li>Hepatitis A</li>
339 | <li>Web of Science</li>
340 | </ul>
341 | <p>So we decided to add another method (nltk with some grammar definition) to cover most of the cases. To use this, you need to add the parameter <code>keywords_w_stopwords=True</code>, this method is approx 20x slower.</p>
342 | 
343 | </div>
344 | </div>
345 | </div>
346 | <div class="cell border-box-sizing text_cell rendered"><div class="inner_cell">
347 | <div class="text_cell_render border-box-sizing rendered_html">
348 | <h3 id="References">References<a class="anchor-link" href="#References">&#182;</a></h3><p>Seem to be an old idea (2004):</p>
349 | <p><em>Mihalcea, Rada, and Paul Tarau. "Textrank: Bringing order into text." Proceedings of the 2004 conference on empirical methods in natural language processing. 2004.</em></p>
350 | <p>Reading an implementation of textrank, I realize they used stopwords to separate and create the graph. Then I though in using it as tokenizer for word2vec</p>
351 | <p>As pointed by @deliprao in this <a href="https://twitter.com/jeremyphoward/status/1094025901371621376">twitter thread</a>. It's also used by Rake (2010):</p>
352 | <p><em>Rose, Stuart &amp; Engel, Dave &amp; Cramer, Nick &amp; Cowley, Wendy. (2010). Automatic Keyword Extraction from Individual Documents. 10.1002/9780470689646.ch1.</em></p>
353 | <p>As noted by @astent in the Twitter thread, this concept is called chinking (chunking by exclusion)
354 | <a href="https://www.nltk.org/book/ch07.html#Chinking">https://www.nltk.org/book/ch07.html#Chinking</a></p>
355 | 
356 | </div>
357 | </div>
358 | </div>
359 | <div class="cell border-box-sizing text_cell rendered"><div class="inner_cell">
360 | <div class="text_cell_render border-box-sizing rendered_html">
361 | <h3 id="Multi-lingual">Multi-lingual<a class="anchor-link" href="#Multi-lingual">&#182;</a></h3><p>We worked in an implementation, that could be used in multiple languages. Of course not all languages are sutable for using this approach. We have tried with good results in English, Spanish and Portuguese</p>
362 | 
363 | </div>
364 | </div>
365 | </div>
366 | <div class="cell border-box-sizing text_cell rendered"><div class="inner_cell">
367 | <div class="text_cell_render border-box-sizing rendered_html">
368 | <h2 id="Try-it-online">Try it online<a class="anchor-link" href="#Try-it-online">&#182;</a></h2><p>You can try it <a href="http://54.196.169.11/episte/">here</a> (takes time to load, lowercase only, doesn't work in mobile yet) MPV :)</p>
369 | <p>These embedding were created using 827,341 title/abstract from @epistemonikos database.
370 | With keywords that repeat at least 10 times. The total vocab is 349,080 keywords (really manageable number)</p>
371 | 
372 | </div>
373 | </div>
374 | </div>
375 | <div class="cell border-box-sizing text_cell rendered"><div class="inner_cell">
376 | <div class="text_cell_render border-box-sizing rendered_html">
377 | <h2 id="Vocab-size">Vocab size<a class="anchor-link" href="#Vocab-size">&#182;</a></h2><p>One of the main benefit of this method, is the size of the vocabulary. 
378 | For example, using keywords that repeat at least 10 times, for the Epistemonikos dataset (827,341 title/abstract), we got the following vocab size:</p>
379 | <table>
380 | <thead><tr>
381 | <th>ngrams</th>
382 | <th>keywords</th>
383 | <th>comp</th>
384 | </tr>
385 | </thead>
386 | <tbody>
387 | <tr>
388 | <td>1</td>
389 | <td>127,824</td>
390 | <td>36%</td>
391 | </tr>
392 | <tr>
393 | <td>1,2</td>
394 | <td>1,360,550</td>
395 | <td>388%</td>
396 | </tr>
397 | <tr>
398 | <td>1-3</td>
399 | <td>3,204,099</td>
400 | <td>914%</td>
401 | </tr>
402 | <tr>
403 | <td>1-4</td>
404 | <td>4,461,930</td>
405 | <td>1,272%</td>
406 | </tr>
407 | <tr>
408 | <td>1-5</td>
409 | <td>5,133,619</td>
410 | <td>1,464%</td>
411 | </tr>
412 | <tr>
413 | <td></td>
414 | <td></td>
415 | <td></td>
416 | </tr>
417 | <tr>
418 | <td>stopword tokenizer</td>
419 | <td>350,529</td>
420 | <td>100%</td>
421 | </tr>
422 | </tbody>
423 | </table>
424 | <p>More information regarding the comparison, take a look to the folder <a href="analyze">analyze</a>.</p>
425 | 
426 | </div>
427 | </div>
428 | </div>
429 | <div class="cell border-box-sizing text_cell rendered"><div class="inner_cell">
430 | <div class="text_cell_render border-box-sizing rendered_html">
431 | <h2 id="Credits">Credits<a class="anchor-link" href="#Credits">&#182;</a></h2>
432 | </div>
433 | </div>
434 | </div>
435 | <div class="cell border-box-sizing text_cell rendered"><div class="inner_cell">
436 | <div class="text_cell_render border-box-sizing rendered_html">
437 | <p>This project has been created using <a href="https://github.com/fastai/nbdev">nbdev</a></p>
438 | 
439 | </div>
440 | </div>
441 | </div>
442 |     {% endraw %}
443 | </div>
444 |  
445 | 
446 | 


--------------------------------------------------------------------------------
/docs/tokenizer.html:
--------------------------------------------------------------------------------
  1 | ---
  2 | 
  3 | title: Tokenizer
  4 | 
  5 | keywords: fastai
  6 | sidebar: home_sidebar
  7 | 
  8 | summary: "We are going to tokenize using 2 different strategies. The first one is using stopwords (read the main README for more information). And the second one, is a nltk grammar regexp parser."
  9 | ---
 10 | <!--
 11 | 
 12 | #################################################
 13 | ### THIS FILE WAS AUTOGENERATED! DO NOT EDIT! ###
 14 | #################################################
 15 | # file to edit: 10_tokenizer.ipynb
 16 | # command to build the docs after a change: nbdev_build_docs
 17 | 
 18 | -->
 19 | 
 20 | <div class="container" id="notebook-container">
 21 |     {% raw %}
 22 |         
 23 | <div class="cell border-box-sizing code_cell rendered">
 24 | 
 25 | </div>
 26 | <div class="cell border-box-sizing code_cell rendered">
 27 | <div class="input">
 28 | 
 29 | <div class="inner_cell">
 30 |     <div class="input_area">
 31 | <div class=" highlight hl-ipython3"><pre><span></span><span class="kn">from</span> <span class="nn">nbdev.showdoc</span> <span class="kn">import</span> <span class="o">*</span>
 32 | </pre></div>
 33 | 
 34 |     </div>
 35 | </div>
 36 | </div>
 37 | 
 38 | </div>
 39 | <div class="cell border-box-sizing code_cell rendered">
 40 | 
 41 | </div>
 42 | <div class="cell border-box-sizing code_cell rendered">
 43 | 
 44 | </div>
 45 | <div class="cell border-box-sizing code_cell rendered">
 46 | 
 47 | </div>
 48 | <div class="cell border-box-sizing code_cell rendered">
 49 | 
 50 | <div class="output_wrapper">
 51 | <div class="output">
 52 | 
 53 | <div class="output_area">
 54 | 
 55 | 
 56 | <div class="output_markdown rendered_html output_subarea ">
 57 | <h4 id="prepare_stopwords" class="doc_header"><code>prepare_stopwords</code><a href="https://github.com/dperezrada/keywords2vec/tree/master/keywords2vec/tokenizer.py#L18" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>prepare_stopwords</code>(<strong><code>stopwords</code></strong>=<em><code>None</code></em>, <strong><code>additional_stopwords</code></strong>=<em><code>None</code></em>, <strong><code>lang</code></strong>=<em><code>'en'</code></em>)</p>
 58 | </blockquote>
 59 | 
 60 | </div>
 61 | 
 62 | </div>
 63 | 
 64 | </div>
 65 | </div>
 66 | 
 67 | </div>
 68 | <div class="cell border-box-sizing code_cell rendered">
 69 | 
 70 | <div class="output_wrapper">
 71 | <div class="output">
 72 | 
 73 | <div class="output_area">
 74 | 
 75 | 
 76 | <div class="output_markdown rendered_html output_subarea ">
 77 | <h4 id="tokenize_one" class="doc_header"><code>tokenize_one</code><a href="https://github.com/dperezrada/keywords2vec/tree/master/keywords2vec/tokenizer.py#L30" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>tokenize_one</code>(<strong><code>text</code></strong>, <strong><code>stopwords</code></strong>=<em><code>None</code></em>, <strong><code>additional_stopwords</code></strong>=<em><code>None</code></em>, <strong><code>lang</code></strong>=<em><code>'en'</code></em>, <strong><code>split_by_stopwords</code></strong>=<em><code>True</code></em>)</p>
 78 | </blockquote>
 79 | 
 80 | </div>
 81 | 
 82 | </div>
 83 | 
 84 | </div>
 85 | </div>
 86 | 
 87 | </div>
 88 | <div class="cell border-box-sizing code_cell rendered">
 89 | 
 90 | <div class="output_wrapper">
 91 | <div class="output">
 92 | 
 93 | <div class="output_area">
 94 | 
 95 | 
 96 | <div class="output_markdown rendered_html output_subarea ">
 97 | <h4 id="get_nodes_for_ntlk" class="doc_header"><code>get_nodes_for_ntlk</code><a href="https://github.com/dperezrada/keywords2vec/tree/master/keywords2vec/tokenizer.py#L67" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>get_nodes_for_ntlk</code>(<strong><code>parent</code></strong>, <strong><code>stopwords</code></strong>, <strong><code>valid_labels</code></strong>)</p>
 98 | </blockquote>
 99 | 
100 | </div>
101 | 
102 | </div>
103 | 
104 | </div>
105 | </div>
106 | 
107 | </div>
108 | <div class="cell border-box-sizing code_cell rendered">
109 | 
110 | <div class="output_wrapper">
111 | <div class="output">
112 | 
113 | <div class="output_area">
114 | 
115 | 
116 | <div class="output_markdown rendered_html output_subarea ">
117 | <h4 id="tokenize_by_nltk" class="doc_header"><code>tokenize_by_nltk</code><a href="https://github.com/dperezrada/keywords2vec/tree/master/keywords2vec/tokenizer.py#L84" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>tokenize_by_nltk</code>(<strong><code>text</code></strong>, <strong><code>stopwords</code></strong>=<em><code>None</code></em>, <strong><code>additional_stopwords</code></strong>=<em><code>None</code></em>, <strong><code>lang</code></strong>=<em><code>'en'</code></em>)</p>
118 | </blockquote>
119 | 
120 | </div>
121 | 
122 | </div>
123 | 
124 | </div>
125 | </div>
126 | 
127 | </div>
128 | <div class="cell border-box-sizing code_cell rendered">
129 | 
130 | <div class="output_wrapper">
131 | <div class="output">
132 | 
133 | <div class="output_area">
134 | 
135 | 
136 | <div class="output_markdown rendered_html output_subarea ">
137 | <h4 id="tokenize" class="doc_header"><code>tokenize</code><a href="https://github.com/dperezrada/keywords2vec/tree/master/keywords2vec/tokenizer.py#L117" class="source_link" style="float:right">[source]</a></h4><blockquote><p><code>tokenize</code>(<strong><code>text</code></strong>, <strong><code>text_output</code></strong>=<em><code>False</code></em>, <strong><code>lang</code></strong>=<em><code>'en'</code></em>, <strong><code>keywords_w_stopwords</code></strong>=<em><code>False</code></em>, <strong><code>merge</code></strong>=<em><code>True</code></em>)</p>
138 | </blockquote>
139 | 
140 | </div>
141 | 
142 | </div>
143 | 
144 | </div>
145 | </div>
146 | 
147 | </div>
148 | <div class="cell border-box-sizing code_cell rendered">
149 | <div class="input">
150 | 
151 | <div class="inner_cell">
152 |     <div class="input_area">
153 | <div class=" highlight hl-ipython3"><pre><span></span><span class="c1">#from_wikipedia</span>
154 | <span class="n">example_text</span> <span class="o">=</span> <span class="s2">&quot;&quot;&quot;The modern sovereign state of Chile is among South America&#39;s most economically and socially stable and prosperous nations, with a high-income economy and high living standards.[11][12] It leads Latin American nations in rankings of human development, competitiveness, income per capita, globalization, state of peace, economic freedom, and low perception of corruption.[13] It also ranks high regionally in sustainability of the state, and democratic development.[14] Currently it also has the lowest homicide rate in the Americas after Canada. Chile is a founding member of the United Nations, the Union of South American Nations (UNASUR), the Community of Latin American and Caribbean States (CELAC) and the Pacific Alliance, and joined the Organisation for Economic Co-operation and Development (OECD) in 2010.&quot;&quot;&quot;</span>
155 | <span class="n">tokenized_text</span> <span class="o">=</span> <span class="n">tokenize</span><span class="p">(</span><span class="n">example_text</span><span class="p">)</span>
156 | </pre></div>
157 | 
158 |     </div>
159 | </div>
160 | </div>
161 | 
162 | </div>
163 | <div class="cell border-box-sizing text_cell rendered"><div class="inner_cell">
164 | <div class="text_cell_render border-box-sizing rendered_html">
165 | <p>Tokenizing the text, by default use two methods and merge them, so you might see duplicated keywords.</p>
166 | 
167 | </div>
168 | </div>
169 | </div>
170 | <div class="cell border-box-sizing code_cell rendered">
171 | <div class="input">
172 | 
173 | <div class="inner_cell">
174 |     <div class="input_area">
175 | <div class=" highlight hl-ipython3"><pre><span></span><span class="n">tokenized_text</span><span class="p">[</span><span class="mi">0</span><span class="p">:</span><span class="mi">15</span><span class="p">]</span>
176 | </pre></div>
177 | 
178 |     </div>
179 | </div>
180 | </div>
181 | 
182 | <div class="output_wrapper">
183 | <div class="output">
184 | 
185 | <div class="output_area">
186 | 
187 | 
188 | 
189 | <div class="output_text output_subarea output_execute_result">
190 | <pre>[&#39;modern sovereign state&#39;,
191 |  &#39;chile&#39;,
192 |  &#34;among south america&#39;s&#34;,
193 |  &#39;economically&#39;,
194 |  &#39;socially stable&#39;,
195 |  &#39;prosperous nations&#39;,
196 |  &#39;high-income economy&#39;,
197 |  &#39;high living standards&#39;,
198 |  &#39;leads latin american nations&#39;,
199 |  &#39;rankings&#39;,
200 |  &#39;human development&#39;,
201 |  &#39;competitiveness&#39;,
202 |  &#39;income per capita&#39;,
203 |  &#39;globalization&#39;,
204 |  &#39;state&#39;]</pre>
205 | </div>
206 | 
207 | </div>
208 | 
209 | </div>
210 | </div>
211 | 
212 | </div>
213 | <div class="cell border-box-sizing text_cell rendered"><div class="inner_cell">
214 | <div class="text_cell_render border-box-sizing rendered_html">
215 | <p>If you wanted to how see the result from each method</p>
216 | 
217 | </div>
218 | </div>
219 | </div>
220 | <div class="cell border-box-sizing code_cell rendered">
221 | <div class="input">
222 | 
223 | <div class="inner_cell">
224 |     <div class="input_area">
225 | <div class=" highlight hl-ipython3"><pre><span></span><span class="n">tokenized_text</span> <span class="o">=</span> <span class="n">tokenize</span><span class="p">(</span><span class="n">example_text</span><span class="p">,</span> <span class="n">merge</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span>
226 | </pre></div>
227 | 
228 |     </div>
229 | </div>
230 | </div>
231 | 
232 | </div>
233 | <div class="cell border-box-sizing text_cell rendered"><div class="inner_cell">
234 | <div class="text_cell_render border-box-sizing rendered_html">
235 | <p>Tokenized using only stopwords</p>
236 | 
237 | </div>
238 | </div>
239 | </div>
240 | <div class="cell border-box-sizing code_cell rendered">
241 | <div class="input">
242 | 
243 | <div class="inner_cell">
244 |     <div class="input_area">
245 | <div class=" highlight hl-ipython3"><pre><span></span><span class="n">tokenized_text</span><span class="p">[</span><span class="mi">0</span><span class="p">][</span><span class="mi">0</span><span class="p">:</span><span class="mi">15</span><span class="p">]</span>
246 | </pre></div>
247 | 
248 |     </div>
249 | </div>
250 | </div>
251 | 
252 | <div class="output_wrapper">
253 | <div class="output">
254 | 
255 | <div class="output_area">
256 | 
257 | 
258 | 
259 | <div class="output_text output_subarea output_execute_result">
260 | <pre>[&#39;modern sovereign state&#39;,
261 |  &#39;chile&#39;,
262 |  &#34;among south america&#39;s&#34;,
263 |  &#39;economically&#39;,
264 |  &#39;socially stable&#39;,
265 |  &#39;prosperous nations&#39;,
266 |  &#39;high-income economy&#39;,
267 |  &#39;high living standards&#39;,
268 |  &#39;leads latin american nations&#39;,
269 |  &#39;rankings&#39;,
270 |  &#39;human development&#39;,
271 |  &#39;competitiveness&#39;,
272 |  &#39;income per capita&#39;,
273 |  &#39;globalization&#39;,
274 |  &#39;state&#39;]</pre>
275 | </div>
276 | 
277 | </div>
278 | 
279 | </div>
280 | </div>
281 | 
282 | </div>
283 | <div class="cell border-box-sizing text_cell rendered"><div class="inner_cell">
284 | <div class="text_cell_render border-box-sizing rendered_html">
285 | <p>Tokenized using only nltk, this method complement to find keywords with stopwords</p>
286 | 
287 | </div>
288 | </div>
289 | </div>
290 | <div class="cell border-box-sizing code_cell rendered">
291 | <div class="input">
292 | 
293 | <div class="inner_cell">
294 |     <div class="input_area">
295 | <div class=" highlight hl-ipython3"><pre><span></span><span class="n">tokenized_text</span><span class="p">[</span><span class="mi">1</span><span class="p">][</span><span class="mi">0</span><span class="p">:</span><span class="mi">15</span><span class="p">]</span>
296 | </pre></div>
297 | 
298 |     </div>
299 | </div>
300 | </div>
301 | 
302 | <div class="output_wrapper">
303 | <div class="output">
304 | 
305 | <div class="output_area">
306 | 
307 | 
308 | 
309 | <div class="output_text output_subarea output_execute_result">
310 | <pre>[&#39;sovereign state&#39;,
311 |  &#39;modern sovereign state&#39;,
312 |  &#39;chile&#39;,
313 |  &#39;south america&#39;,
314 |  &#39;nations&#39;,
315 |  &#39;prosperous nations&#39;,
316 |  &#39;economy&#39;,
317 |  &#39;high-income economy&#39;,
318 |  &#39;living standards&#39;,
319 |  &#39;high living standards&#39;,
320 |  &#39;modern sovereign state of chile&#39;,
321 |  &#39;south america&#39;,
322 |  &#39;prosperous nations&#39;,
323 |  &#39;high-income economy&#39;,
324 |  &#39;high living standards&#39;]</pre>
325 | </div>
326 | 
327 | </div>
328 | 
329 | </div>
330 | </div>
331 | 
332 | </div>
333 | <div class="cell border-box-sizing text_cell rendered"><div class="inner_cell">
334 | <div class="text_cell_render border-box-sizing rendered_html">
335 | <p>We recommend to use them with the default options</p>
336 | 
337 | </div>
338 | </div>
339 | </div>
340 | <div class="cell border-box-sizing text_cell rendered"><div class="inner_cell">
341 | <div class="text_cell_render border-box-sizing rendered_html">
342 | <p>Tokenize and return plain text</p>
343 | 
344 | </div>
345 | </div>
346 | </div>
347 | <div class="cell border-box-sizing code_cell rendered">
348 | <div class="input">
349 | 
350 | <div class="inner_cell">
351 |     <div class="input_area">
352 | <div class=" highlight hl-ipython3"><pre><span></span><span class="n">tokenized_text</span> <span class="o">=</span> <span class="n">tokenize</span><span class="p">(</span><span class="n">example_text</span><span class="p">,</span> <span class="n">text_output</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
353 | <span class="n">tokenized_text</span>
354 | </pre></div>
355 | 
356 |     </div>
357 | </div>
358 | </div>
359 | 
360 | <div class="output_wrapper">
361 | <div class="output">
362 | 
363 | <div class="output_area">
364 | 
365 | 
366 | 
367 | <div class="output_text output_subarea output_execute_result">
368 | <pre>&#34;modern sovereign state!chile!among south america&#39;s!economically!socially stable!prosperous nations!high-income economy!high living standards!leads latin american nations!rankings!human development!competitiveness!income per capita!globalization!state!peace!economic freedom!low perception!corruption!also ranks high regionally!sustainability!state!democratic development!currently!also!lowest homicide rate!americas!canada!chile!founding member!united nations!union!south american nations!unasur!community!latin american!caribbean states!celac!pacific alliance!joined!organisation!economic co-operation!development!oecd!sovereign state!modern sovereign state!chile!south america!nations!prosperous nations!economy!high-income economy!living standards!high living standards!modern sovereign state of chile!south america!prosperous nations!high-income economy!high living standards!nations!american nations!nations!latin american nations!rankings!development!human development!competitiveness!income!capita!globalization!state!peace!freedom!economic freedom!perception!low perception!corruption!latin american nations in rankings!human development!competitiveness!income per capita!globalization!state of peace!economic freedom!low perception of corruption!sustainability!state!development!democratic development!sustainability!state!democratic development!currently!homicide rate!americas!canada!currently!homicide rate!americas after canada!chile!member!founding member!united nations!union!south!nations!american nations!unasur!community!states!caribbean states!celac!pacific alliance!organisation!economic co-operation!development!oecd!chile!founding member!united nations!union of south!american nations!unasur!community!caribbean states!celac!pacific alliance!organisation for economic co-operation!development!oecd&#34;</pre>
369 | </div>
370 | 
371 | </div>
372 | 
373 | </div>
374 | </div>
375 | 
376 | </div>
377 |     {% endraw %}
378 | </div>
379 |  
380 | 
381 | 


--------------------------------------------------------------------------------