--------------------------------------------------------------------------------
/keywords2vec/_nbdev.py:
--------------------------------------------------------------------------------
1 | # AUTOGENERATED BY NBDEV! DO NOT EDIT!
2 |
3 | __all__ = ["index", "modules", "custom_doc_links", "git_url"]
4 |
5 | index = {"NUMBERS_STOPWORDS": "10_tokenizer.ipynb",
6 | "prepare_stopwords": "10_tokenizer.ipynb",
7 | "tokenize_one": "10_tokenizer.ipynb",
8 | "get_nodes_for_ntlk": "10_tokenizer.ipynb",
9 | "tokenize_by_nltk": "10_tokenizer.ipynb",
10 | "tokenize": "10_tokenizer.ipynb",
11 | "parallel": "20_utils.ipynb",
12 | "num_cpus": "20_utils.ipynb",
13 | "open_file": "20_utils.ipynb",
14 | "chunk_of_text": "20_utils.ipynb",
15 | "get_file_chunks": "20_utils.ipynb",
16 | "tokenize_file": "30_main.ipynb",
17 | "train_model": "30_main.ipynb",
18 | "similars_tree_from_model": "30_main.ipynb",
19 | "get_similars": "30_main.ipynb",
20 | "similars_tree": "30_main.ipynb"}
21 |
22 | modules = ["tokenizer.py",
23 | "utils.py",
24 | "main.py"]
25 |
26 | doc_url = "https://dperezrada.github.io/keywords2vec/"
27 |
28 | git_url = "https://github.com/dperezrada/keywords2vec/tree/master/"
29 |
30 | def custom_doc_links(name): return None
--------------------------------------------------------------------------------
/docs/licenses/LICENSE:
--------------------------------------------------------------------------------
1 | /* This license pertains to the docs template, except for the Navgoco jQuery component. */
2 |
3 | The MIT License (MIT)
4 |
5 | Original theme: Copyright (c) 2016 Tom Johnson
6 | Modifications: Copyright (c) 2017 onwards fast.ai, Inc
7 |
8 | Permission is hereby granted, free of charge, to any person obtaining a copy
9 | of this software and associated documentation files (the "Software"), to deal
10 | in the Software without restriction, including without limitation the rights
11 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 | copies of the Software, and to permit persons to whom the Software is
13 | furnished to do so, subject to the following conditions:
14 |
15 | The above copyright notice and this permission notice shall be included in all
16 | copies or substantial portions of the Software.
17 |
18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24 | SOFTWARE.
25 |
--------------------------------------------------------------------------------
/.github/workflows/main.yml:
--------------------------------------------------------------------------------
1 | name: CI
2 | on: [push, pull_request]
3 | jobs:
4 | build:
5 | runs-on: ubuntu-latest
6 | steps:
7 | - uses: actions/checkout@v1
8 | - uses: actions/setup-python@v1
9 | with:
10 | python-version: '3.6'
11 | architecture: 'x64'
12 | - name: Install the library
13 | run: |
14 | pip install nbdev jupyter
15 | pip install -e .
16 | - name: Read all notebooks
17 | run: |
18 | nbdev_read_nbs
19 | - name: Check if all notebooks are cleaned
20 | run: |
21 | echo "Check we are starting with clean git checkout"
22 | if [ -n "$(git status -uno -s)" ]; then echo "git status is not clean"; false; fi
23 | echo "Trying to strip out notebooks"
24 | nbdev_clean_nbs
25 | echo "Check that strip out was unnecessary"
26 | git status -s # display the status to see which nbs need cleaning up
27 | if [ -n "$(git status -uno -s)" ]; then echo -e "!!! Detected unstripped out notebooks\n!!!Remember to run nbdev_install_git_hooks"; false; fi
28 | - name: Check if there is no diff library/notebooks
29 | run: |
30 | if [ -n "$(nbdev_diff_nbs)" ]; then echo -e "!!! Detected difference between the notebooks and the library"; false; fi
31 | - name: Run tests
32 | run: |
33 | nbdev_test_nbs
34 |
--------------------------------------------------------------------------------
/docs/feed.xml:
--------------------------------------------------------------------------------
1 | ---
2 | search: exclude
3 | layout: none
4 | ---
5 |
6 |
7 |
33 |
--------------------------------------------------------------------------------
/docs/_config.yml:
--------------------------------------------------------------------------------
1 | repository: dperezrada/keywords2vec
2 | output: web
3 | topnav_title: keywords2vec
4 | site_title: keywords2vec
5 | company_name: Daniel Pérez Rada
6 | description: To generate a word2vec model, but using multi-word keywords instead of single words.
7 | # Set to false to disable KaTeX math
8 | use_math: true
9 | # Add Google analytics id if you have one and want to use it here
10 | google_analytics:
11 | # See http://nbdev.fast.ai/search for help with adding Search
12 | google_search:
13 |
14 | host: 127.0.0.1
15 | # the preview server used. Leave as is.
16 | port: 4000
17 | # the port where the preview is rendered.
18 |
19 | exclude:
20 | - .idea/
21 | - .gitignore
22 | - vendor
23 |
24 |
25 |
26 | highlighter: rouge
27 | markdown: kramdown
28 | kramdown:
29 | input: GFM
30 | auto_ids: true
31 | hard_wrap: false
32 | syntax_highlighter: rouge
33 |
34 | collections:
35 | tooltips:
36 | output: false
37 |
38 | defaults:
39 | -
40 | scope:
41 | path: ""
42 | type: "pages"
43 | values:
44 | layout: "page"
45 | comments: true
46 | search: true
47 | sidebar: home_sidebar
48 | topnav: topnav
49 | -
50 | scope:
51 | path: ""
52 | type: "tooltips"
53 | values:
54 | layout: "page"
55 | comments: true
56 | search: true
57 | tooltip: true
58 |
59 | sidebars:
60 | - home_sidebar
61 | permalink: pretty
62 |
63 | theme: jekyll-theme-cayman
64 | baseurl: /keywords2vec/
--------------------------------------------------------------------------------
/docs/_includes/links.html:
--------------------------------------------------------------------------------
1 | {% comment %}Get links from each sidebar, as listed in the _config.yml file under sidebars{% endcomment %}
2 |
3 | {% for sidebar in site.sidebars %}
4 | {% for entry in site.data.sidebars[sidebar].entries %}
5 | {% for folder in entry.folders %}
6 | {% for folderitem in folder.folderitems %}
7 | {% if folderitem.url contains "html#" %}
8 | [{{folderitem.url | remove: "/" }}]: {{folderitem.url | remove: "/"}}
9 | {% else %}
10 | [{{folderitem.url | remove: "/" | remove: ".html"}}]: {{folderitem.url | remove: "/"}}
11 | {% endif %}
12 | {% for subfolders in folderitem.subfolders %}
13 | {% for subfolderitem in subfolders.subfolderitems %}
14 | [{{subfolderitem.url | remove: "/" | remove: ".html"}}]: {{subfolderitem.url | remove: "/"}}
15 | {% endfor %}
16 | {% endfor %}
17 | {% endfor %}
18 | {% endfor %}
19 | {% endfor %}
20 | {% endfor %}
21 |
22 |
23 | {% comment %} Get links from topnav {% endcomment %}
24 |
25 | {% for entry in site.data.topnav.topnav %}
26 | {% for item in entry.items %}
27 | {% if item.external_url == null %}
28 | [{{item.url | remove: "/" | remove: ".html"}}]: {{item.url | remove: "/"}}
29 | {% endif %}
30 | {% endfor %}
31 | {% endfor %}
32 |
33 | {% comment %}Get links from topnav dropdowns {% endcomment %}
34 |
35 | {% for entry in site.data.topnav.topnav_dropdowns %}
36 | {% for folder in entry.folders %}
37 | {% for folderitem in folder.folderitems %}
38 | {% if folderitem.external_url == null %}
39 | [{{folderitem.url | remove: "/" | remove: ".html"}}]: {{folderitem.url | remove: "/"}}
40 | {% endif %}
41 | {% endfor %}
42 | {% endfor %}
43 | {% endfor %}
44 |
45 |
--------------------------------------------------------------------------------
/docs/css/modern-business.css:
--------------------------------------------------------------------------------
1 | /*!
2 | * Start Bootstrap - Modern Business HTML Template (http://startbootstrap.com)
3 | * Code licensed under the Apache License v2.0.
4 | * For details, see http://www.apache.org/licenses/LICENSE-2.0.
5 | */
6 |
7 | /* Global Styles */
8 |
9 | html,
10 | body {
11 | height: 100%;
12 | }
13 |
14 | .img-portfolio {
15 | margin-bottom: 30px;
16 | }
17 |
18 | .img-hover:hover {
19 | opacity: 0.8;
20 | }
21 |
22 | /* Home Page Carousel */
23 |
24 | header.carousel {
25 | height: 50%;
26 | }
27 |
28 | header.carousel .item,
29 | header.carousel .item.active,
30 | header.carousel .carousel-inner {
31 | height: 100%;
32 | }
33 |
34 | header.carousel .fill {
35 | width: 100%;
36 | height: 100%;
37 | background-position: center;
38 | background-size: cover;
39 | }
40 |
41 | /* 404 Page Styles */
42 |
43 | .error-404 {
44 | font-size: 100px;
45 | }
46 |
47 | /* Pricing Page Styles */
48 |
49 | .price {
50 | display: block;
51 | font-size: 50px;
52 | line-height: 50px;
53 | }
54 |
55 | .price sup {
56 | top: -20px;
57 | left: 2px;
58 | font-size: 20px;
59 | }
60 |
61 | .period {
62 | display: block;
63 | font-style: italic;
64 | }
65 |
66 | /* Footer Styles */
67 |
68 | footer {
69 | margin: 50px 0;
70 | }
71 |
72 | /* Responsive Styles */
73 |
74 | @media(max-width:991px) {
75 | .client-img,
76 | .img-related {
77 | margin-bottom: 30px;
78 | }
79 | }
80 |
81 | @media(max-width:767px) {
82 | .img-portfolio {
83 | margin-bottom: 15px;
84 | }
85 |
86 | header.carousel .carousel {
87 | height: 70%;
88 | }
89 | }
90 |
--------------------------------------------------------------------------------
/docs/_includes/head_print.html:
--------------------------------------------------------------------------------
1 |
17 |
18 |
23 |
24 |
29 |
--------------------------------------------------------------------------------
/docs/licenses/LICENSE-BSD-NAVGOCO.txt:
--------------------------------------------------------------------------------
1 | /* This license pertains to the Navgoco jQuery component used for the sidebar. */
2 |
3 | Copyright (c) 2013, Christodoulos Tsoulloftas, http://www.komposta.net
4 | All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or without modification,
7 | are permitted provided that the following conditions are met:
8 |
9 | * Redistributions of source code must retain the above copyright notice,
10 | this list of conditions and the following disclaimer.
11 | * Redistributions in binary form must reproduce the above copyright notice,
12 | this list of conditions and the following disclaimer in the documentation
13 | and/or other materials provided with the distribution.
14 |     * Neither the name of the <organization> nor the names of its
15 | contributors may be used to endorse or promote products derived from this
16 | software without specific prior written permission.
17 |
18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
21 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
22 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
23 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
25 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
26 | OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
27 | OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/docs/_layouts/page.html:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | ---
4 |
5 |
9 |
10 | {% if page.simple_map == true %}
11 |
12 |
17 |
18 | {% include custom/{{page.map_name}}.html %}
19 |
20 | {% elsif page.complex_map == true %}
21 |
22 |
27 |
28 | {% include custom/{{page.map_name}}.html %}
29 |
30 | {% endif %}
31 |
32 |
33 |
34 | {% if page.summary %}
35 |
{{page.summary}}
36 | {% endif %}
37 |
38 | {% unless page.toc == false %}
39 | {% include toc.html %}
40 | {% endunless %}
41 |
42 |
43 | {% if site.github_editme_path %}
44 |
45 |
Edit me
46 |
47 | {% endif %}
48 |
49 | {{content}}
50 |
51 |
62 |
63 |
64 |
65 | {{site.data.alerts.hr_shaded}}
66 |
67 | {% include footer.html %}
68 |
--------------------------------------------------------------------------------
/docs/js/customscripts.js:
--------------------------------------------------------------------------------
1 | $('#mysidebar').height($(".nav").height());
2 |
3 |
4 | $( document ).ready(function() {
5 |
6 | //this script says, if the height of the viewport is greater than 800px, then insert affix class, which makes the nav bar float in a fixed
7 |     // position as you scroll. if you have a lot of nav items, this height may not work for you.
8 | var h = $(window).height();
9 | //console.log (h);
10 | if (h > 800) {
11 | $( "#mysidebar" ).attr("class", "nav affix");
12 | }
13 | // activate tooltips. although this is a bootstrap js function, it must be activated this way in your theme.
14 | $('[data-toggle="tooltip"]').tooltip({
15 | placement : 'top'
16 | });
17 |
18 | /**
19 | * AnchorJS
20 | */
21 | anchors.add('h2,h3,h4,h5');
22 |
23 | });
24 |
25 | // needed for nav tabs on pages. See Formatting > Nav tabs for more details.
26 | // script from http://stackoverflow.com/questions/10523433/how-do-i-keep-the-current-tab-active-with-twitter-bootstrap-after-a-page-reload
27 | $(function() {
28 | var json, tabsState;
29 | $('a[data-toggle="pill"], a[data-toggle="tab"]').on('shown.bs.tab', function(e) {
30 | var href, json, parentId, tabsState;
31 |
32 | tabsState = localStorage.getItem("tabs-state");
33 | json = JSON.parse(tabsState || "{}");
34 | parentId = $(e.target).parents("ul.nav.nav-pills, ul.nav.nav-tabs").attr("id");
35 | href = $(e.target).attr('href');
36 | json[parentId] = href;
37 |
38 | return localStorage.setItem("tabs-state", JSON.stringify(json));
39 | });
40 |
41 | tabsState = localStorage.getItem("tabs-state");
42 | json = JSON.parse(tabsState || "{}");
43 |
44 | $.each(json, function(containerId, href) {
45 | return $("#" + containerId + " a[href=" + href + "]").tab('show');
46 | });
47 |
48 | $("ul.nav.nav-pills, ul.nav.nav-tabs").each(function() {
49 | var $this = $(this);
50 | if (!json[$this.attr("id")]) {
51 | return $this.find("a[data-toggle=tab]:first, a[data-toggle=pill]:first").tab("show");
52 | }
53 | });
54 | });
55 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from pkg_resources import parse_version
2 | from configparser import ConfigParser
3 | import setuptools
4 | assert parse_version(setuptools.__version__)>=parse_version('36.2')
5 |
6 | # note: all settings are in settings.ini; edit there, not here
7 | config = ConfigParser(delimiters=['='])
8 | config.read('settings.ini')
9 | cfg = config['DEFAULT']
10 |
11 | cfg_keys = 'version description keywords author author_email'.split()
12 | expected = cfg_keys + "lib_name user branch license status min_python audience language".split()
13 | for o in expected: assert o in cfg, "missing expected setting: {}".format(o)
14 | setup_cfg = {o:cfg[o] for o in cfg_keys}
15 |
16 | licenses = {
17 | 'apache2': ('Apache Software License 2.0','OSI Approved :: Apache Software License'),
18 | }
19 | statuses = [ '1 - Planning', '2 - Pre-Alpha', '3 - Alpha',
20 | '4 - Beta', '5 - Production/Stable', '6 - Mature', '7 - Inactive' ]
21 | py_versions = '2.0 2.1 2.2 2.3 2.4 2.5 2.6 2.7 3.0 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8'.split()
22 |
23 | requirements = cfg.get('requirements','').split()
24 | dev_requirements = cfg.get('dev_requirements','').split()
25 |
26 | lic = licenses[cfg['license']]
27 | min_python = cfg['min_python']
28 |
29 | setuptools.setup(
30 | name = cfg['lib_name'],
31 | license = lic[0],
32 | classifiers = [
33 | 'Development Status :: ' + statuses[int(cfg['status'])],
34 | 'Intended Audience :: ' + cfg['audience'].title(),
35 | 'License :: ' + lic[1],
36 | 'Natural Language :: ' + cfg['language'].title(),
37 | ] + ['Programming Language :: Python :: '+o for o in py_versions[py_versions.index(min_python):]],
38 | url = 'https://github.com/{}/{}'.format(cfg['user'],cfg['lib_name']),
39 | packages = setuptools.find_packages(),
40 | include_package_data = True,
41 | install_requires = requirements,
42 | extras_require = {
43 | 'dev': dev_requirements
44 | },
45 | python_requires = '>=' + cfg['min_python'],
46 | long_description = open('README.md').read(),
47 | long_description_content_type = 'text/markdown',
48 | zip_safe = False,
49 | entry_points = { 'console_scripts': cfg.get('console_scripts','').split() },
50 | **setup_cfg)
51 |
52 |
--------------------------------------------------------------------------------
/settings.ini:
--------------------------------------------------------------------------------
1 | [DEFAULT]
2 | # All sections below are required unless otherwise specified
3 | lib_name = keywords2vec
4 | user = dperezrada
5 | description = To generate a word2vec model, but using multi-word keywords instead of single words.
6 | keywords = word2vec,nlp,text-mining,phrase-extraction,keywords-extraction,multi-language
7 | author = Daniel Pérez Rada
8 | author_email = dperezrada@gmail.com
9 | copyright = Daniel Pérez Rada
10 | branch = master
11 | version = 0.1.0
12 | min_python = 3.6
13 | audience = Developers
14 | language = English
15 | # Set to True if you want to create a more fancy sidebar.json than the default
16 | custom_sidebar = False
17 | # Add licenses and see current list in `setup.py`
18 | license = apache2
19 | # From 1-7: Planning Pre-Alpha Alpha Beta Production Mature Inactive
20 | status = 3
21 |
22 | # Optional. Same format as setuptools requirements
23 | requirements = nltk Unidecode stop-words fastprogress fasttext annoy
24 | dev_requirements = nbdev jupyter ipywidgets matplotlib
25 | # Optional. Same format as setuptools console_scripts
26 | # console_scripts =
27 |
28 | ###
29 | # You probably won't need to change anything under here,
30 | # unless you have some special requirements
31 | ###
32 |
33 | # Change to, e.g. "nbs", to put your notebooks in nbs dir instead of repo root
34 | nbs_path = .
35 | doc_path = docs
36 |
37 | # Anything shown as '%(...)s' is substituted with that setting automatically
38 | doc_host = https://%(user)s.github.io
39 | doc_baseurl = /%(lib_name)s/
40 | git_url = https://github.com/%(user)s/%(lib_name)s/tree/%(branch)s/
41 | lib_path = %(lib_name)s
42 | title = %(lib_name)s
43 |
44 | #Optional advanced parameters
45 | #Monospace docstrings: adds <pre> tags around the doc strings, preserving newlines/indentation.
46 | #monospace_docstrings = False
47 | #Test flags: introduce here the test flags you want to use separated by |
48 | #tst_flags =
49 | #Custom sidebar: customize sidebar.json yourself for advanced sidebars (False/True)
50 | #custom_sidebar =
51 | #Cell spacing: if you want cell blocks in code separated by more than one new line
52 | #cell_spacing =
53 | #Custom jekyll styles: if you want more jekyll styles than tip/important/warning, set them here
54 | #jekyll_styles = note,warning,tip,important
55 |
--------------------------------------------------------------------------------
/keywords2vec/utils.py:
--------------------------------------------------------------------------------
1 | # AUTOGENERATED! DO NOT EDIT! File to edit: 20_utils.ipynb (unless otherwise specified).
2 |
3 | __all__ = ['parallel', 'num_cpus', 'open_file', 'chunk_of_text', 'get_file_chunks']
4 |
5 | # Cell
6 | from .imports import *
7 |
8 | from fastprogress.fastprogress import progress_bar
9 | from concurrent.futures import ProcessPoolExecutor, as_completed
10 |
11 |
12 | # Cell
13 |
14 | # BEGIN From fastai
15 | def parallel(func, arr, max_workers=-1):
16 | if max_workers == -1:
17 | max_workers = num_cpus(2)
18 | with ProcessPoolExecutor(max_workers=max_workers) as ex:
19 | futures = [ex.submit(func, arr_el) for arr_el in arr]
20 | results = []
21 | for f in progress_bar(as_completed(futures), total=len(arr)):
22 | results.append(f.result())
23 | return results
24 |
25 | def num_cpus(n_cpus):
26 |     """Get the number of cpus, or return n_cpus if explicitly given."""
27 |     if n_cpus > 0:
28 |         return n_cpus
29 |     try:
30 |         return len(os.sched_getaffinity(0))
31 |     except AttributeError:
32 |         return os.cpu_count()
33 | # END From fastai
34 |
35 |
36 | def open_file(filepath, options):
37 | if filepath[-3:] == ".gz":
38 | return gzip.open(filepath, options)
39 | return open(filepath, options)
40 |
41 |
42 | def chunk_of_text(_file, chunk_size=-1):
43 | index = 0
44 | if chunk_size == -1:
45 | chunk_size = 200
46 | while True:
47 | line = _file.readline()
48 | if not line:
49 | break
50 | for sentence in line.split("."):
51 | if sentence.strip():
52 | yield sentence.strip()
53 | if index >= chunk_size:
54 | break
55 | index += 1
56 |
57 |
58 | def get_file_chunks(start_index, filepath, lines_chunk, sample_size):
59 | _file = open_file(filepath, 'rt')
60 | texts = []
61 | break_by_sample = False
62 | while True:
63 | next_n_lines = list(chunk_of_text(_file, lines_chunk))
64 |         if not next_n_lines:
65 |             break
66 |         texts.append("\n".join(next_n_lines) + "\n")
67 | start_index += lines_chunk
68 | if sample_size > 0 and start_index >= sample_size:
69 | break_by_sample = True
70 | break
71 | _file.close()
72 | return (start_index, texts, break_by_sample)
73 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.bak
2 | .gitattributes
3 | .last_checked
4 | .gitconfig
5 | *.bak
6 | *.log
7 | *~
8 | ~*
9 | _tmp*
10 | tmp*
11 | tags
12 |
13 | # Byte-compiled / optimized / DLL files
14 | __pycache__/
15 | *.py[cod]
16 | *$py.class
17 |
18 | # C extensions
19 | *.so
20 |
21 | # Distribution / packaging
22 | .Python
23 | env/
24 | build/
25 | develop-eggs/
26 | dist/
27 | downloads/
28 | eggs/
29 | .eggs/
30 | lib/
31 | lib64/
32 | parts/
33 | sdist/
34 | var/
35 | wheels/
36 | *.egg-info/
37 | .installed.cfg
38 | *.egg
39 |
40 | # PyInstaller
41 | # Usually these files are written by a python script from a template
42 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
43 | *.manifest
44 | *.spec
45 |
46 | # Installer logs
47 | pip-log.txt
48 | pip-delete-this-directory.txt
49 |
50 | # Unit test / coverage reports
51 | htmlcov/
52 | .tox/
53 | .coverage
54 | .coverage.*
55 | .cache
56 | nosetests.xml
57 | coverage.xml
58 | *.cover
59 | .hypothesis/
60 |
61 | # Translations
62 | *.mo
63 | *.pot
64 |
65 | # Django stuff:
66 | *.log
67 | local_settings.py
68 |
69 | # Flask stuff:
70 | instance/
71 | .webassets-cache
72 |
73 | # Scrapy stuff:
74 | .scrapy
75 |
76 | # Sphinx documentation
77 | docs/_build/
78 |
79 | # PyBuilder
80 | target/
81 |
82 | # Jupyter Notebook
83 | .ipynb_checkpoints
84 |
85 | # pyenv
86 | .python-version
87 |
88 | # celery beat schedule file
89 | celerybeat-schedule
90 |
91 | # SageMath parsed files
92 | *.sage.py
93 |
94 | # dotenv
95 | .env
96 |
97 | # virtualenv
98 | .venv
99 | venv/
100 | ENV/
101 |
102 | # Spyder project settings
103 | .spyderproject
104 | .spyproject
105 |
106 | # Rope project settings
107 | .ropeproject
108 |
109 | # mkdocs documentation
110 | /site
111 |
112 | # mypy
113 | .mypy_cache/
114 |
115 | .vscode
116 | *.swp
117 |
118 | # osx generated files
119 | .DS_Store
120 | .DS_Store?
121 | .Trashes
122 | ehthumbs.db
123 | Thumbs.db
124 | .idea
125 |
126 | # pytest
127 | .pytest_cache
128 |
129 | # tools/trust-doc-nbs
130 | docs_src/.last_checked
131 |
132 | # symlinks to fastai
133 | docs_src/fastai
134 | tools/fastai
135 |
136 | # link checker
137 | checklink/cookies.txt
138 |
139 | # .gitconfig is now autogenerated
140 | .gitconfig
141 |
142 | *.old
143 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # How to contribute
2 |
3 | ## How to get started
4 |
5 | Before anything else, please install the git hooks that run automatic scripts during each commit and merge to strip the notebooks of superfluous metadata (and avoid merge conflicts). After cloning the repository, run the following command inside it:
6 | ```
7 | nbdev_install_git_hooks
8 | ```
9 |
10 | ## Did you find a bug?
11 |
12 | * Ensure the bug was not already reported by searching on GitHub under Issues.
13 | * If you're unable to find an open issue addressing the problem, open a new one. Be sure to include a title and clear description, as much relevant information as possible, and a code sample or an executable test case demonstrating the expected behavior that is not occurring.
14 | * Be sure to add the complete error messages.
15 |
16 | #### Did you write a patch that fixes a bug?
17 |
18 | * Open a new GitHub pull request with the patch.
19 | * Ensure that your PR includes a test that fails without your patch, and passes with it.
20 | * Ensure the PR description clearly describes the problem and solution. Include the relevant issue number if applicable.
21 |
22 | ## PR submission guidelines
23 |
24 | * Keep each PR focused. While it may be more convenient, do not combine several unrelated fixes in one PR. Create as many branches as needed to keep each PR focused.
25 | * Do not mix style changes/fixes with "functional" changes. Such PRs are very difficult to review and will most likely get rejected.
26 | * Do not add/remove vertical whitespace. Preserve the original style of the file you edit as much as you can.
27 | * Do not turn an already submitted PR into your development playground. If, after you submit a PR, you discover that more work is needed, close the PR, do the required work, and then submit a new PR. Otherwise each of your commits requires attention from the maintainers of the project.
28 | * If, however, you submitted a PR and received a request for changes, you should proceed with commits inside that PR, so that the maintainer can see the incremental fixes and won't need to review the whole PR again. In the exceptional case where you realize it will take many commits to complete the requested changes, it's probably best to close the PR, do the work, and then submit it again. Use common sense to choose one way over the other.
29 |
30 | ## Do you want to contribute to the documentation?
31 |
32 | * Docs are automatically created from the notebooks in the nbs folder.
33 |
34 |
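If you change the notebooks, you can rebuild the library and the docs locally before opening a PR. The exact commands depend on your nbdev version; with the nbdev v1 tooling used in the CI workflow, this would roughly be:
```
nbdev_build_lib    # regenerate the keywords2vec/ modules from the notebooks
nbdev_build_docs   # regenerate the docs/ folder
nbdev_test_nbs     # run the notebook tests, as CI does
```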
--------------------------------------------------------------------------------
/analyze/README.md:
--------------------------------------------------------------------------------
1 | # Comparing vocab size
2 |
3 | We generated a quick comparison of vocabulary size using the stopword tokenizer vs. n-grams. To do this, we used BigQuery.
4 | You can take a look at the dataset [here](https://bigquery.cloud.google.com/dataset/api-project-380745743806:epistemonikos)
5 |
6 | | ngrams             | vocab size | relative size |
7 | |--------------------|-----------|---------|
8 | | 1 | 127,824 | 36% |
9 | | 1,2 | 1,360,550 | 388% |
10 | | 1-3 | 3,204,099 | 914% |
11 | | 1-4 | 4,461,930 | 1,272% |
12 | | 1-5 | 5,133,619 | 1,464% |
13 | | | | |
14 | | stopword tokenizer | 350,529 | 100% |
15 |
16 | ## Reproduce
17 | If you want to reproduce the results:
18 |
19 | ### Get the data
20 | cd to this folder
21 | ```
22 | mkdir -p ../data/inputs
23 | wget "http://s3.amazonaws.com/episte-labs/episte_title_abstract.tsv.gz" -O ../data/inputs/episte_title_abstract.tsv.gz
24 | ```
25 |
26 | ### Get all keywords
27 |
28 | ```
29 | python compare_to_ngrams.py ../data/inputs/episte_title_abstract.tsv.gz| gzip > ../data/all_keywords.tsv.gz
30 | ```
31 |
32 | ### Upload to BigQuery
33 | ```
34 | gsutil -o GSUtil:parallel_composite_upload_threshold=150M cp "../data/all_keywords.tsv.gz" gs://episte-lab/all_tokens.tsv.gz
35 |
36 | bq mk "api-project-380745743806:epistemonikos.all_keywords" tokenizer_name:string,keyword:string
37 |
38 | bq load --replace --max_bad_records=40000 --field_delimiter="\t" --source_format=CSV "api-project-380745743806:epistemonikos.all_keywords" gs://episte-lab/all_tokens.tsv.gz
39 | ```
40 |
41 | ### Count
42 |
43 | In BigQuery, execute the following query using standard SQL, setting the _epistemonikos.count_keywords_ table as the destination table.
44 |
45 | ```
46 | SELECT tokenizer_name, keyword, count(*) as repeat_count
47 | FROM `api-project-380745743806.epistemonikos.all_keywords`
48 | GROUP BY tokenizer_name, keyword
49 | ```
50 |
51 | Then run the following query, setting _epistemonikos.vocab_size_ as the destination table (with append), changing the number 1 to 2, then 3, and so on (a better query is needed later; a scripted version is sketched below):
52 |
53 | ```
54 | SELECT tokenizer_name, 1 as min_repeat, count(*) as vocab_size
55 | FROM `api-project-380745743806.epistemonikos.count_keywords`
56 | WHERE repeat_count >= 1
57 | GROUP BY tokenizer_name
58 | ORDER BY vocab_size DESC
59 | ```
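Instead of editing the query by hand for each threshold, the repeated runs could also be scripted with the `bq` CLI. This is only a rough sketch (it assumes the `bq` tool is configured for the project and appends to the same destination table used above):
```
for min_repeat in 1 2 3 4 5 6 7 8 9 10 15 20 30 40 50 100; do
    bq query --use_legacy_sql=false --append_table \
        --destination_table="api-project-380745743806:epistemonikos.vocab_size" \
        "SELECT tokenizer_name, ${min_repeat} as min_repeat, count(*) as vocab_size
         FROM \`api-project-380745743806.epistemonikos.count_keywords\`
         WHERE repeat_count >= ${min_repeat}
         GROUP BY tokenizer_name
         ORDER BY vocab_size DESC"
done
```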
60 |
61 | Now you can get the vocab sizes and export a CSV like [this one](vocab_size_results.csv):
62 | ```
63 | SELECT *
64 | FROM `api-project-380745743806.epistemonikos.vocab_size`
65 | ORDER BY min_repeat ASC, vocab_size DESC
66 | LIMIT 1000
67 | ```
68 |
69 | ## The data
70 | The data is public [here](https://bigquery.cloud.google.com/dataset/api-project-380745743806:epistemonikos)
71 |
72 | You can play around with it.
--------------------------------------------------------------------------------
/docs/css/theme-green.css:
--------------------------------------------------------------------------------
1 | .summary {
2 | color: #808080;
3 | border-left: 5px solid #E50E51;
4 | font-size:16px;
5 | }
6 |
7 |
8 | h3 {color: #E50E51; }
9 | h4 {color: #808080; }
10 |
11 | .nav-tabs > li.active > a, .nav-tabs > li.active > a:hover, .nav-tabs > li.active > a:focus {
12 | background-color: #248ec2;
13 | color: white;
14 | }
15 |
16 | .nav > li.active > a {
17 | background-color: #72ac4a;
18 | }
19 |
20 | .nav > li > a:hover {
21 | background-color: #72ac4a;
22 | }
23 |
24 | div.navbar-collapse .dropdown-menu > li > a:hover {
25 | background-color: #72ac4a;
26 | }
27 |
28 | .navbar-inverse .navbar-nav>li>a, .navbar-inverse .navbar-brand {
29 | color: white;
30 | }
31 |
32 | .navbar-inverse .navbar-nav>li>a:hover, a.fa.fa-home.fa-lg.navbar-brand:hover {
33 | color: #f0f0f0;
34 | }
35 |
36 | .nav li.thirdlevel > a {
37 | background-color: #FAFAFA !important;
38 | color: #72ac4a;
39 | font-weight: bold;
40 | }
41 |
42 | a[data-toggle="tooltip"] {
43 | color: #649345;
44 | font-style: italic;
45 | cursor: default;
46 | }
47 |
48 | .navbar-inverse {
49 | background-color: #72ac4a;
50 | border-color: #5b893c;
51 | }
52 |
53 | .navbar-inverse .navbar-nav > .open > a, .navbar-inverse .navbar-nav > .open > a:hover, .navbar-inverse .navbar-nav > .open > a:focus {
54 | color: #5b893c;
55 | }
56 |
57 | .navbar-inverse .navbar-nav > .open > a, .navbar-inverse .navbar-nav > .open > a:hover, .navbar-inverse .navbar-nav > .open > a:focus {
58 | background-color: #5b893c;
59 | color: #ffffff;
60 | }
61 |
62 | /* not sure if using this ...*/
63 | .navbar-inverse .navbar-collapse, .navbar-inverse .navbar-form {
64 | border-color: #72ac4a !important;
65 | }
66 |
67 | .btn-primary {
68 | color: #ffffff;
69 | background-color: #5b893c;
70 | border-color: #5b893c;
71 | }
72 |
73 | .btn-primary:hover,
74 | .btn-primary:focus,
75 | .btn-primary:active,
76 | .btn-primary.active,
77 | .open .dropdown-toggle.btn-primary {
78 | background-color: #72ac4a;
79 | border-color: #5b893c;
80 | }
81 |
82 | .printTitle {
83 | color: #5b893c !important;
84 | }
85 |
86 | body.print h1 {color: #5b893c !important; font-size:28px;}
87 | body.print h2 {color: #595959 !important; font-size:24px;}
88 | body.print h3 {color: #E50E51 !important; font-size:14px;}
89 | body.print h4 {color: #679DCE !important; font-size:14px; font-style: italic;}
90 |
91 | .anchorjs-link:hover {
92 | color: #4f7233;
93 | }
94 |
95 | div.sidebarTitle {
96 | color: #E50E51;
97 | }
98 |
99 | li.sidebarTitle {
100 | margin-top:20px;
101 | font-weight:normal;
102 | font-size:130%;
103 | color: #ED1951;
104 | margin-bottom:10px;
105 | margin-left: 5px;
106 | }
107 |
108 | .navbar-inverse .navbar-toggle:focus, .navbar-inverse .navbar-toggle:hover {
109 | background-color: #E50E51;
110 | }
111 |
--------------------------------------------------------------------------------
/keywords2vec/main.py:
--------------------------------------------------------------------------------
1 | # AUTOGENERATED! DO NOT EDIT! File to edit: 30_main.ipynb (unless otherwise specified).
2 |
3 | __all__ = ['tokenize_file', 'train_model', 'similars_tree_from_model', 'get_similars', 'similars_tree']
4 |
5 | # Cell
6 | from .imports import *
7 |
8 | from glob import glob
9 | from functools import partial
10 |
11 | import fasttext
12 |
13 | from .utils import parallel, open_file, chunk_of_text, get_file_chunks
14 | from .tokenizer import tokenize
15 |
16 | # Cell
17 |
18 | def tokenize_file(
19 | input_path, output_path="tokenized.txt", lang="en",
20 | sample_size=-1, lines_chunks=-1, n_cpus=-1, keywords_w_stopwords=False
21 | ):
22 | tokenize_wrapper = partial(
23 | tokenize, lang=lang, text_output=True, merge=True, keywords_w_stopwords=keywords_w_stopwords
24 | )
25 |
26 | index = 0
27 |
28 | with open(output_path, "wt") as _output:
29 | for file_path in glob(input_path):
30 | print("processing file:", file_path)
31 | # We are going to split the text in chunks to show some progress.
32 | new_index, text_chunks, break_by_sample = get_file_chunks(index, file_path, lines_chunks, sample_size)
33 | index = new_index
34 | results = parallel(tokenize_wrapper, text_chunks, n_cpus)
35 | _output.write(
36 | ("\n".join(results) + "\n").replace(" ", "_").replace("!", " ")
37 | )
38 | if break_by_sample:
39 | break
40 | return output_path
41 |
42 |
43 | def train_model(input_filename):
44 | model = fasttext.train_unsupervised(input_filename, model='skipgram', maxn=0, dim=100, ws=5)
45 | return model
46 |
47 | def similars_tree_from_model(model, vector_size=100):
48 |     f = vector_size
49 | t = AnnoyIndex(f, 'angular') # Length of item vector that will be indexed
50 | labels = model.labels
51 | for index, label in enumerate(labels):
52 | v = model[label]
53 | t.add_item(index, v)
54 |
55 | t.build(10) # 10 trees
56 | return labels, t
57 |
58 | def get_similars(tree, labels, keyword, n_similars=10, show_score=False):
59 | index = labels.index(keyword.replace(" ", "_"))
60 |     suggestions, scores = tree.get_nns_by_item(index, n=n_similars, include_distances=True)
61 | suggested_labels = [
62 | labels[suggestion].replace("_", " ")
63 | for suggestion in suggestions
64 | ]
65 |     return list(zip(suggested_labels, scores)) if show_score else suggested_labels
66 |
67 | def similars_tree(
68 | input_path, temp_tokenized_file="tmp_tokenized.txt", lang="en",
69 | sample_size=-1, lines_chunks=-1, n_cpus=-1, keywords_w_stopwords=False
70 | ):
71 | tokenize_file(
72 | input_path=input_path, output_path=temp_tokenized_file, lang=lang,
73 | sample_size=sample_size, lines_chunks=lines_chunks, n_cpus=n_cpus,
74 | keywords_w_stopwords=keywords_w_stopwords
75 | )
76 | model = train_model(temp_tokenized_file)
77 | labels, tree = similars_tree_from_model(model)
78 | return labels, tree
79 |
--------------------------------------------------------------------------------
/docs/_includes/sidebar.html:
--------------------------------------------------------------------------------
1 | {% assign sidebar = site.data.sidebars[page.sidebar].entries %}
2 | {% assign pageurl = page.url | remove: ".html" %}
3 |
4 |
57 |
58 |
59 |
60 |
--------------------------------------------------------------------------------
/analyze/compare_to_ngrams.py:
--------------------------------------------------------------------------------
1 | import re
2 | import gzip
3 | import sys
4 |
5 | from multiprocessing import Pool, cpu_count
6 | from functools import partial
7 |
8 |
9 | from keywords_tokenizer import tokenize_one
10 |
11 | def tokenize_simple(text):
12 | text_part = text.lower()
13 |
14 | # Must be executed in order
15 | regexs = [
16 | ("’", "'"),
17 | # Remove all non alpha, numeric, spaces, - or single quote
18 | (r'([^a-z0-9\u00C0-\u1FFF\u2C00-\uD7FF \t\n\-\'])', "!!"),
19 | # remove only words numbers
20 | (r'(^|[ !])[0-9]+([ !]|$)', "!!"),
21 | # remove hyphen-minus for keywords starting or ending with it
22 | (r'((^|[ !])[\-\']+)|([\-\']+([ !]|$))', "!!"),
23 | # remove spaces between !
24 | (r' *! *', "!!"),
25 | # generate multiple ! need for next regex
26 | (r'!', "!!"),
27 | # remove one character keyword
28 | (r'(^|!)[^!\n](!|$)', "!!"),
29 | # remove multiple ! (!!!!)
30 | (r'!+', "!"),
31 | # remove first and last !
32 | (r'(^!+)|(!+$)', ""),
33 | # replace spaces
34 | (r'\s', "!"),
35 | ]
36 | for regex, replacement in regexs:
37 | text_part = re.sub(regex, replacement, text_part, flags=re.M)
38 | return text_part.split("!")
39 |
40 |
41 | def get_ngram(text, min_ngram=1, max_ngrams=6):
42 | list_of_words = tokenize_simple(text)
43 | ngrams = {}
44 | for ngram in range(min_ngram, max_ngrams):
45 | ngrams[ngram] = [
46 | " ".join(list_of_words[i:i + ngram])
47 | for i in iter(range(len(list_of_words) - ngram + 1))
48 | ]
49 | return ngrams
50 |
51 | def process_batch_grams(texts):
52 | cpu_num = max(1, cpu_count() - 1)
53 | pool_queue = Pool(cpu_num)
54 |
55 | ngrams_rows = pool_queue.map(get_ngram, texts)
56 | pool_queue.close()
57 | for ngrams_found in ngrams_rows:
58 | for ngram_num, keywords in ngrams_found.items():
59 | for keyword in keywords:
60 | print("%s\t%s" % (ngram_num, keyword))
61 |
62 | # Could be refactored later
63 | def process_batch_stopwords_tokenizer(texts):
64 | cpu_num = max(1, cpu_count() - 1)
65 | pool_queue = Pool(cpu_num)
66 |
67 | rows = pool_queue.map(tokenize_one, texts)
68 | pool_queue.close()
69 | for ngrams_found in rows:
70 | for ngram_num, keywords in ngrams_found.items():
71 | for keyword in keywords:
72 | print("%s\t%s" % ("sk", keyword))
73 |
74 | def main():
75 | ngrams = (1, 6)
76 | batch_size = 10000
77 | cpu_num = max(1, cpu_count() - 1)
78 | pool_queue = Pool(cpu_num)
79 |
80 | texts = []
81 | for index, line in enumerate(gzip.open(sys.argv[1], "rt")):
82 | row = line[:-1].split("\t")
83 | title = row[2]
84 | abstract = row[3]
85 |         texts.append(title + "." + abstract)
86 | if index > 0 and index % batch_size == 0:
87 | process_batch_grams(texts)
88 | process_batch_stopwords_tokenizer(texts)
89 | texts = []
90 | print(index, end="\r", file=sys.stderr)
91 | if len(texts) > 0:
92 | process_batch_grams(texts)
93 | process_batch_stopwords_tokenizer(texts)
94 |
95 | if __name__ == '__main__':
96 | main()
97 |
--------------------------------------------------------------------------------
/docs/css/theme-blue.css:
--------------------------------------------------------------------------------
1 | .summary {
2 | color: #808080;
3 | border-left: 5px solid #ED1951;
4 | font-size:16px;
5 | }
6 |
7 |
8 | h3 {color: #000000; }
9 | h4 {color: #000000; }
10 |
11 | .nav-tabs > li.active > a, .nav-tabs > li.active > a:hover, .nav-tabs > li.active > a:focus {
12 | background-color: #248ec2;
13 | color: white;
14 | }
15 |
16 | .nav > li.active > a {
17 | background-color: #347DBE;
18 | }
19 |
20 | .nav > li > a:hover {
21 | background-color: #248ec2;
22 | }
23 |
24 | div.navbar-collapse .dropdown-menu > li > a:hover {
25 | background-color: #347DBE;
26 | }
27 |
28 | .nav li.thirdlevel > a {
29 | background-color: #FAFAFA !important;
30 | color: #248EC2;
31 | font-weight: bold;
32 | }
33 |
34 | a[data-toggle="tooltip"] {
35 | color: #649345;
36 | font-style: italic;
37 | cursor: default;
38 | }
39 |
40 | .navbar-inverse {
41 | background-color: #347DBE;
42 | border-color: #015CAE;
43 | }
44 | .navbar-inverse .navbar-nav>li>a, .navbar-inverse .navbar-brand {
45 | color: white;
46 | }
47 |
48 | .navbar-inverse .navbar-nav>li>a:hover, a.fa.fa-home.fa-lg.navbar-brand:hover {
49 | color: #f0f0f0;
50 | }
51 |
52 | a.navbar-brand:hover {
53 | color: #f0f0f0;
54 | }
55 |
56 | .navbar-inverse .navbar-nav > .open > a, .navbar-inverse .navbar-nav > .open > a:hover, .navbar-inverse .navbar-nav > .open > a:focus {
57 | color: #015CAE;
58 | }
59 |
60 | .navbar-inverse .navbar-nav > .open > a, .navbar-inverse .navbar-nav > .open > a:hover, .navbar-inverse .navbar-nav > .open > a:focus {
61 | background-color: #015CAE;
62 | color: #ffffff;
63 | }
64 |
65 | .navbar-inverse .navbar-collapse, .navbar-inverse .navbar-form {
66 | border-color: #248ec2 !important;
67 | }
68 |
69 | .btn-primary {
70 | color: #ffffff;
71 | background-color: #347DBE;
72 | border-color: #347DBE;
73 | }
74 |
75 | .navbar-inverse .navbar-nav > .active > a, .navbar-inverse .navbar-nav > .active > a:hover, .navbar-inverse .navbar-nav > .active > a:focus {
76 | background-color: #347DBE;
77 | }
78 |
79 | .btn-primary:hover,
80 | .btn-primary:focus,
81 | .btn-primary:active,
82 | .btn-primary.active,
83 | .open .dropdown-toggle.btn-primary {
84 | background-color: #248ec2;
85 | border-color: #347DBE;
86 | }
87 |
88 | .printTitle {
89 | color: #015CAE !important;
90 | }
91 |
92 | body.print h1 {color: #015CAE !important; font-size:28px !important;}
93 | body.print h2 {color: #595959 !important; font-size:20px !important;}
94 | body.print h3 {color: #E50E51 !important; font-size:14px !important;}
95 | body.print h4 {color: #679DCE !important; font-size:14px; font-style: italic !important;}
96 |
97 | .anchorjs-link:hover {
98 | color: #216f9b;
99 | }
100 |
101 | div.sidebarTitle {
102 | color: #015CAE;
103 | }
104 |
105 | li.sidebarTitle {
106 | margin-top:20px;
107 | font-weight:normal;
108 | font-size:130%;
109 | color: #ED1951;
110 | margin-bottom:10px;
111 | margin-left: 5px;
112 |
113 | }
114 |
115 | .navbar-inverse .navbar-toggle:focus, .navbar-inverse .navbar-toggle:hover {
116 | background-color: #015CAE;
117 | }
118 |
119 | .navbar-inverse .navbar-toggle {
120 | border-color: #015CAE;
121 | }
122 |
--------------------------------------------------------------------------------
/analyze/vocab_size_results.csv:
--------------------------------------------------------------------------------
1 | tokenizer_name,min_repeat,vocab_size,perc_of_max
2 | 5,1,"125,549,318",1
3 | 4,1,"98,424,631",0.783951936720198
4 | 3,1,"54,989,810",0.437993697424943
5 | 2,1,"14,041,534",0.11184078275917
6 | sk,1,"8,532,143",0.067958497393033
7 | 1,1,"729,252",0.005808490333655
8 | 5,2,"27,970,086",1
9 | 4,2,"25,744,143",0.920417012661313
10 | 3,2,"18,448,509",0.659579988420486
11 | 2,2,"6,325,498",0.226152254233326
12 | sk,2,"2,850,645",0.101917634432729
13 | 1,2,"406,894",0.014547470465411
14 | 3,3,"8,372,240",1
15 | 4,3,"8,191,498",0.978411751215923
16 | 5,3,"6,085,108",0.726819584722846
17 | 2,3,"3,789,551",0.452632867667434
18 | sk,3,"1,342,343",0.160332599161037
19 | 1,3,"287,651",0.03435771072019
20 | 3,4,"5,828,969",1
21 | 4,4,"5,310,830",0.911109666220561
22 | 5,4,"3,769,273",0.646644886943128
23 | 2,4,"2,888,392",0.495523650923517
24 | sk,4,"970,502",0.166496339232547
25 | 1,4,"237,935",0.040819397049461
26 | 3,5,"4,226,673",1
27 | 4,5,"3,439,428",0.813743575620825
28 | 2,5,"2,300,277",0.544228758647759
29 | 5,5,"2,138,736",0.506009336421341
30 | sk,5,"723,230",0.171110942341648
31 | 1,5,"202,138",0.047824376288395
32 | 3,6,"3,402,188",1
33 | 4,6,"2,657,524",0.781122030881303
34 | 2,6,"1,951,263",0.573531797772492
35 | 5,6,"1,608,265",0.472714911698001
36 | sk,6,"598,500",0.175916204513096
37 | 1,6,"179,661",0.052807487416921
38 | 3,7,"2,794,275",1
39 | 4,7,"2,069,573",0.740647574057671
40 | 2,7,"1,687,958",0.604077265122438
41 | 5,7,"1,180,406",0.422437304846517
42 | sk,7,"502,072",0.17967880756189
43 | 1,7,"160,993",0.057615302717163
44 | 3,8,"2,395,570",1
45 | 4,8,"1,725,232",0.720175991517676
46 | 2,8,"1,502,163",0.627058695842743
47 | 5,8,"966,156",0.403309442011713
48 | sk,8,"439,701",0.183547548182687
49 | 1,8,"147,866",0.061724766965691
50 | 3,9,"2,078,106",1
51 | 4,9,"1,447,165",0.696386517338384
52 | 2,9,"1,349,900",0.649581878883945
53 | 5,9,"783,519",0.377035146426602
54 | sk,9,"388,047",0.186731090714333
55 | 1,9,"136,421",0.065646795687997
56 | 3,10,"1,843,549",1
57 | 4,10,"1,257,831",0.682287804663722
58 | 2,10,"1,232,726",0.668670048911095
59 | 5,10,"671,689",0.364345618152813
60 | sk,10,"350,529",0.190138152010063
61 | 1,10,"127,824",0.069335829967091
62 | 3,15,"1,162,809",1
63 | 2,15,"867,345",0.745904959455938
64 | 4,15,"730,842",0.628514227186064
65 | 5,15,"366,007",0.314761065660826
66 | sk,15,"236,877",0.203711013588646
67 | 1,15,"98,683",0.084866044208464
68 | 3,20,"843,965",1
69 | 2,20,"678,065",0.803427867269377
70 | 4,20,"504,692",0.598001101941431
71 | 5,20,"243,773",0.288842546788078
72 | sk,20,"181,448",0.214994697647414
73 | 1,20,"83,201",0.098583472063415
74 | 3,30,"539,543",1
75 | 2,30,"479,622",0.888941196531138
76 | 4,30,"301,530",0.558861851604043
77 | 5,30,"138,541",0.256774714897608
78 | sk,30,"125,883",0.233314119541909
79 | 1,30,"65,817",0.12198657011582
80 | 3,40,"392,935",1
81 | 2,40,"374,555",0.953223815643809
82 | 4,40,"209,929",0.534258846883072
83 | sk,40,"97,334",0.247710181073205
84 | 5,40,"93,242",0.237296244926006
85 | 1,40,"55,910",0.142288164709176
86 | 2,50,"309,045",1
87 | 3,50,"307,022",0.993454027730589
88 | 4,50,"158,562",0.513070912003106
89 | sk,50,"79,897",0.258528693232377
90 | 5,50,"68,758",0.222485398566552
91 | 1,50,"49,600",0.160494426378036
92 | 2,100,"169,444",1
93 | 3,100,"142,447",0.84067302471613
94 | 4,100,"66,126",0.390252826892661
95 | sk,100,"44,048",0.259956091688109
96 | 1,100,"34,234",0.202037251245249
97 | 5,100,"26,858",0.158506645263332
98 |
--------------------------------------------------------------------------------
/docs/_includes/topnav.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
13 |
14 |
15 |
16 | Nav
17 |
18 |
19 | {% assign topnav = site.data[page.topnav] %}
20 | {% assign topnav_dropdowns = site.data[page.topnav].topnav_dropdowns %}
21 |
22 | {% for entry in topnav.topnav %}
23 | {% for item in entry.items %}
24 | {% if item.external_url %}
25 | {{item.title}}
26 | {% elsif page.url contains item.url %}
27 | {{item.title}}
28 | {% else %}
29 | {{item.title}}
30 | {% endif %}
31 | {% endfor %}
32 | {% endfor %}
33 |
34 |
35 | {% for entry in topnav_dropdowns %}
36 | {% for folder in entry.folders %}
37 |
38 | {{ folder.title }}
39 |
50 |
51 | {% endfor %}
52 | {% endfor %}
53 | {% if site.google_search %}
54 |
55 | {% include search_google_custom.html %}
56 |
57 | {% endif %}
58 |
59 |
60 |
61 |
62 |
63 |
--------------------------------------------------------------------------------
/docs/js/jquery.navgoco.min.js:
--------------------------------------------------------------------------------
1 | /*
2 | * jQuery Navgoco Menus Plugin v0.2.1 (2014-04-11)
3 | * https://github.com/tefra/navgoco
4 | *
5 | * Copyright (c) 2014 Chris T (@tefra)
6 | * BSD - https://github.com/tefra/navgoco/blob/master/LICENSE-BSD
7 | */
8 | !function(a){"use strict";var b=function(b,c,d){return this.el=b,this.$el=a(b),this.options=c,this.uuid=this.$el.attr("id")?this.$el.attr("id"):d,this.state={},this.init(),this};b.prototype={init:function(){var b=this;b._load(),b.$el.find("ul").each(function(c){var d=a(this);d.attr("data-index",c),b.options.save&&b.state.hasOwnProperty(c)?(d.parent().addClass(b.options.openClass),d.show()):d.parent().hasClass(b.options.openClass)?(d.show(),b.state[c]=1):d.hide()});var c=a(" ").prepend(b.options.caretHtml),d=b.$el.find("li > a");b._trigger(c,!1),b._trigger(d,!0),b.$el.find("li:has(ul) > a").prepend(c)},_trigger:function(b,c){var d=this;b.on("click",function(b){b.stopPropagation();var e=c?a(this).next():a(this).parent().next(),f=!1;if(c){var g=a(this).attr("href");f=void 0===g||""===g||"#"===g}if(e=e.length>0?e:!1,d.options.onClickBefore.call(this,b,e),!c||e&&f)b.preventDefault(),d._toggle(e,e.is(":hidden")),d._save();else if(d.options.accordion){var h=d.state=d._parents(a(this));d.$el.find("ul").filter(":visible").each(function(){var b=a(this),c=b.attr("data-index");h.hasOwnProperty(c)||d._toggle(b,!1)}),d._save()}d.options.onClickAfter.call(this,b,e)})},_toggle:function(b,c){var d=this,e=b.attr("data-index"),f=b.parent();if(d.options.onToggleBefore.call(this,b,c),c){if(f.addClass(d.options.openClass),b.slideDown(d.options.slide),d.state[e]=1,d.options.accordion){var g=d.state=d._parents(b);g[e]=d.state[e]=1,d.$el.find("ul").filter(":visible").each(function(){var b=a(this),c=b.attr("data-index");g.hasOwnProperty(c)||d._toggle(b,!1)})}}else f.removeClass(d.options.openClass),b.slideUp(d.options.slide),d.state[e]=0;d.options.onToggleAfter.call(this,b,c)},_parents:function(b,c){var d={},e=b.parent(),f=e.parents("ul");return f.each(function(){var b=a(this),e=b.attr("data-index");return e?void(d[e]=c?b:1):!1}),d},_save:function(){if(this.options.save){var b={};for(var d in this.state)1===this.state[d]&&(b[d]=1);c[this.uuid]=this.state=b,a.cookie(this.options.cookie.name,JSON.stringify(c),this.options.cookie)}},_load:function(){if(this.options.save){if(null===c){var b=a.cookie(this.options.cookie.name);c=b?JSON.parse(b):{}}this.state=c.hasOwnProperty(this.uuid)?c[this.uuid]:{}}},toggle:function(b){var c=this,d=arguments.length;if(1>=d)c.$el.find("ul").each(function(){var d=a(this);c._toggle(d,b)});else{var e,f={},g=Array.prototype.slice.call(arguments,1);d--;for(var h=0;d>h;h++){e=g[h];var i=c.$el.find('ul[data-index="'+e+'"]').first();if(i&&(f[e]=i,b)){var j=c._parents(i,!0);for(var k in j)f.hasOwnProperty(k)||(f[k]=j[k])}}for(e in f)c._toggle(f[e],b)}c._save()},destroy:function(){a.removeData(this.$el),this.$el.find("li:has(ul) > a").unbind("click"),this.$el.find("li:has(ul) > a > span").unbind("click")}},a.fn.navgoco=function(c){if("string"==typeof c&&"_"!==c.charAt(0)&&"init"!==c)var d=!0,e=Array.prototype.slice.call(arguments,1);else c=a.extend({},a.fn.navgoco.defaults,c||{}),a.cookie||(c.save=!1);return this.each(function(f){var g=a(this),h=g.data("navgoco");h||(h=new b(this,d?a.fn.navgoco.defaults:c,f),g.data("navgoco",h)),d&&h[c].apply(h,e)})};var c=null;a.fn.navgoco.defaults={caretHtml:"",accordion:!1,openClass:"open",save:!0,cookie:{name:"navgoco",expires:!1,path:"/"},slide:{duration:400,easing:"swing"},onClickBefore:a.noop,onClickAfter:a.noop,onToggleBefore:a.noop,onToggleAfter:a.noop}}(jQuery);
--------------------------------------------------------------------------------
/docs/_includes/initialize_shuffle.html:
--------------------------------------------------------------------------------
1 |
7 |
8 |
100 |
101 |
102 |
103 |
114 |
115 |
129 |
130 |
131 |
--------------------------------------------------------------------------------
/docs/css/printstyles.css:
--------------------------------------------------------------------------------
1 |
2 | /*body.print .container {max-width: 650px;}*/
3 |
4 | body {
5 | font-size:14px;
6 | }
7 | .nav ul li a {border-top:0px; background-color:transparent; color: #808080; }
8 | #navig a[href] {color: #595959 !important;}
9 | table .table {max-width:650px;}
10 |
11 | #navig li.sectionHead {font-weight: bold; font-size: 18px; color: #595959 !important; }
12 | #navig li {font-weight: normal; }
13 |
14 | #navig a[href]::after { content: leader(".") target-counter(attr(href), page); }
15 |
16 | a[href]::after {
17 | content: " (page " target-counter(attr(href), page) ")"
18 | }
19 |
20 | a[href^="http:"]::after, a[href^="https:"]::after {
21 | content: "";
22 | }
23 |
24 | a[href] {
25 | color: blue !important;
26 | }
27 | a[href*="mailto"]::after, a[data-toggle="tooltip"]::after, a[href].noCrossRef::after {
28 | content: "";
29 | }
30 |
31 |
32 | @page {
33 | margin: 60pt 90pt 60pt 90pt;
34 | font-family: sans-serif;
35 | font-style:none;
36 | color: gray;
37 |
38 | }
39 |
40 | .printTitle {
41 | line-height:30pt;
42 | font-size:27pt;
43 | font-weight: bold;
44 | letter-spacing: -.5px;
45 | margin-bottom:25px;
46 | }
47 |
48 | .printSubtitle {
49 | font-size: 19pt;
50 | color: #cccccc !important;
51 | font-family: "Grotesque MT Light";
52 | line-height: 22pt;
53 | letter-spacing: -.5px;
54 | margin-bottom:20px;
55 | }
56 | .printTitleArea hr {
57 | color: #999999 !important;
58 | height: 2px;
59 | width: 100%;
60 | }
61 |
62 | .printTitleImage {
63 | max-width:300px;
64 | margin-bottom:200px;
65 | }
66 |
67 |
68 | .printTitleImage {
69 | max-width: 250px;
70 | }
71 |
72 | #navig {
73 | /*page-break-before: always;*/
74 | }
75 |
76 | .copyrightBoilerplate {
77 | page-break-before:always;
78 | font-size:14px;
79 | }
80 |
81 | .lastGeneratedDate {
82 | font-style: italic;
83 | font-size:14px;
84 | color: gray;
85 | }
86 |
87 | .alert a {
88 | text-decoration: none !important;
89 | }
90 |
91 |
92 | body.title { page: title }
93 |
94 | @page title {
95 | @top-left {
96 | content: " ";
97 | }
98 | @top-right {
99 | content: " "
100 | }
101 | @bottom-right {
102 | content: " ";
103 | }
104 | @bottom-left {
105 | content: " ";
106 | }
107 | }
108 |
109 | body.frontmatter { page: frontmatter }
110 | body.frontmatter {counter-reset: page 1}
111 |
112 |
113 | @page frontmatter {
114 | @top-left {
115 | content: prince-script(guideName);
116 | }
117 | @top-right {
118 | content: prince-script(datestamp);
119 | }
120 | @bottom-right {
121 | content: counter(page, lower-roman);
122 | }
123 | @bottom-left {
124 | content: "youremail@domain.com"; }
125 | }
126 |
127 | body.first_page {counter-reset: page 1}
128 |
129 | h1 { string-set: doctitle content() }
130 |
131 | @page {
132 | @top-left {
133 | content: string(doctitle);
134 | font-size: 11px;
135 | font-style: italic;
136 | }
137 | @top-right {
138 | content: prince-script(datestamp);
139 | font-size: 11px;
140 | }
141 |
142 | @bottom-right {
143 | content: "Page " counter(page);
144 | font-size: 11px;
145 | }
146 | @bottom-left {
147 | content: prince-script(guideName);
148 | font-size: 11px;
149 | }
150 | }
151 | .alert {
152 | background-color: #fafafa !important;
153 | border-color: #dedede !important;
154 | color: black;
155 | }
156 |
157 | pre {
158 | background-color: #fafafa;
159 | }
160 |
--------------------------------------------------------------------------------
/docs/css/syntax.css:
--------------------------------------------------------------------------------
1 | .highlight { background: #ffffff; }
2 | .highlight .c { color: #999988; font-style: italic } /* Comment */
3 | .highlight .err { color: #a61717; background-color: #e3d2d2 } /* Error */
4 | .highlight .k { font-weight: bold } /* Keyword */
5 | .highlight .o { font-weight: bold } /* Operator */
6 | .highlight .cm { color: #999988; font-style: italic } /* Comment.Multiline */
7 | .highlight .cp { color: #999999; font-weight: bold } /* Comment.Preproc */
8 | .highlight .c1 { color: #999988; font-style: italic } /* Comment.Single */
9 | .highlight .cs { color: #999999; font-weight: bold; font-style: italic } /* Comment.Special */
10 | .highlight .gd { color: #000000; background-color: #ffdddd } /* Generic.Deleted */
11 | .highlight .gd .x { color: #000000; background-color: #ffaaaa } /* Generic.Deleted.Specific */
12 | .highlight .ge { font-style: italic } /* Generic.Emph */
13 | .highlight .gr { color: #aa0000 } /* Generic.Error */
14 | .highlight .gh { color: #999999 } /* Generic.Heading */
15 | .highlight .gi { color: #000000; background-color: #ddffdd } /* Generic.Inserted */
16 | .highlight .gi .x { color: #000000; background-color: #aaffaa } /* Generic.Inserted.Specific */
17 | .highlight .go { color: #888888 } /* Generic.Output */
18 | .highlight .gp { color: #555555 } /* Generic.Prompt */
19 | .highlight .gs { font-weight: bold } /* Generic.Strong */
20 | .highlight .gu { color: #aaaaaa } /* Generic.Subheading */
21 | .highlight .gt { color: #aa0000 } /* Generic.Traceback */
22 | .highlight .kc { font-weight: bold } /* Keyword.Constant */
23 | .highlight .kd { font-weight: bold } /* Keyword.Declaration */
24 | .highlight .kp { font-weight: bold } /* Keyword.Pseudo */
25 | .highlight .kr { font-weight: bold } /* Keyword.Reserved */
26 | .highlight .kt { color: #445588; font-weight: bold } /* Keyword.Type */
27 | .highlight .m { color: #009999 } /* Literal.Number */
28 | .highlight .s { color: #d14 } /* Literal.String */
29 | .highlight .na { color: #008080 } /* Name.Attribute */
30 | .highlight .nb { color: #0086B3 } /* Name.Builtin */
31 | .highlight .nc { color: #445588; font-weight: bold } /* Name.Class */
32 | .highlight .no { color: #008080 } /* Name.Constant */
33 | .highlight .ni { color: #800080 } /* Name.Entity */
34 | .highlight .ne { color: #990000; font-weight: bold } /* Name.Exception */
35 | .highlight .nf { color: #990000; font-weight: bold } /* Name.Function */
36 | .highlight .nn { color: #555555 } /* Name.Namespace */
37 | .highlight .nt { color: #000080 } /* Name.Tag */
38 | .highlight .nv { color: #008080 } /* Name.Variable */
39 | .highlight .ow { font-weight: bold } /* Operator.Word */
40 | .highlight .w { color: #bbbbbb } /* Text.Whitespace */
41 | .highlight .mf { color: #009999 } /* Literal.Number.Float */
42 | .highlight .mh { color: #009999 } /* Literal.Number.Hex */
43 | .highlight .mi { color: #009999 } /* Literal.Number.Integer */
44 | .highlight .mo { color: #009999 } /* Literal.Number.Oct */
45 | .highlight .sb { color: #d14 } /* Literal.String.Backtick */
46 | .highlight .sc { color: #d14 } /* Literal.String.Char */
47 | .highlight .sd { color: #d14 } /* Literal.String.Doc */
48 | .highlight .s2 { color: #d14 } /* Literal.String.Double */
49 | .highlight .se { color: #d14 } /* Literal.String.Escape */
50 | .highlight .sh { color: #d14 } /* Literal.String.Heredoc */
51 | .highlight .si { color: #d14 } /* Literal.String.Interpol */
52 | .highlight .sx { color: #d14 } /* Literal.String.Other */
53 | .highlight .sr { color: #009926 } /* Literal.String.Regex */
54 | .highlight .s1 { color: #d14 } /* Literal.String.Single */
55 | .highlight .ss { color: #990073 } /* Literal.String.Symbol */
56 | .highlight .bp { color: #999999 } /* Name.Builtin.Pseudo */
57 | .highlight .vc { color: #008080 } /* Name.Variable.Class */
58 | .highlight .vg { color: #008080 } /* Name.Variable.Global */
59 | .highlight .vi { color: #008080 } /* Name.Variable.Instance */
60 | .highlight .il { color: #009999 } /* Literal.Number.Integer.Long */
--------------------------------------------------------------------------------
/docs/js/toc.js:
--------------------------------------------------------------------------------
1 | // https://github.com/ghiculescu/jekyll-table-of-contents
2 | // this library modified by fastai to:
3 | // - update the location.href with the correct anchor when a toc item is clicked on
4 | (function($){
5 | $.fn.toc = function(options) {
6 | var defaults = {
7 | noBackToTopLinks: false,
8 | title: '',
9 | minimumHeaders: 3,
10 | headers: 'h1, h2, h3, h4',
11 | listType: 'ol', // values: [ol|ul]
12 | showEffect: 'show', // values: [show|slideDown|fadeIn|none]
13 | showSpeed: 'slow' // set to 0 to deactivate effect
14 | },
15 | settings = $.extend(defaults, options);
16 |
17 | var headers = $(settings.headers).filter(function() {
18 | // get all headers with an ID
19 | var previousSiblingName = $(this).prev().attr( "name" );
20 | if (!this.id && previousSiblingName) {
21 | this.id = $(this).attr( "id", previousSiblingName.replace(/\./g, "-") );
22 | }
23 | return this.id;
24 | }), output = $(this);
25 | if (!headers.length || headers.length < settings.minimumHeaders || !output.length) {
26 | return;
27 | }
28 |
29 | if (0 === settings.showSpeed) {
30 | settings.showEffect = 'none';
31 | }
32 |
33 | var render = {
34 | show: function() { output.hide().html(html).show(settings.showSpeed); },
35 | slideDown: function() { output.hide().html(html).slideDown(settings.showSpeed); },
36 | fadeIn: function() { output.hide().html(html).fadeIn(settings.showSpeed); },
37 | none: function() { output.html(html); }
38 | };
39 |
40 | var get_level = function(ele) { return parseInt(ele.nodeName.replace("H", ""), 10); }
41 | var highest_level = headers.map(function(_, ele) { return get_level(ele); }).get().sort()[0];
42 | //var return_to_top = ' ';
43 | // other nice icons that can be used instead: glyphicon-upload glyphicon-hand-up glyphicon-chevron-up glyphicon-menu-up glyphicon-triangle-top
44 | var level = get_level(headers[0]),
45 | this_level,
46 | html = settings.title + " <"+settings.listType+">";
47 | headers.on('click', function() {
48 | if (!settings.noBackToTopLinks) {
49 | var pos = $(window).scrollTop();
50 | window.location.hash = this.id;
51 | $(window).scrollTop(pos);
52 | }
53 | })
54 | .addClass('clickable-header')
55 | .each(function(_, header) {
56 | base_url = window.location.href;
57 | base_url = base_url.replace(/#.*$/, "");
58 | this_level = get_level(header);
59 | //if (!settings.noBackToTopLinks && this_level > 1) {
60 | // $(header).addClass('top-level-header').before(return_to_top);
61 | //}
62 | txt = header.textContent.split('¶')[0].split(/\[(test|source)\]/)[0];
63 | if (!txt) {return;}
64 | if (this_level === level) // same level as before; same indenting
65 | html += "" + txt + " ";
66 | else if (this_level <= level){ // higher level than before; end parent ol
67 | for(i = this_level; i < level; i++) {
68 | html += " "+settings.listType+">"
69 | }
70 | html += "" + txt + " ";
71 | }
72 | else if (this_level > level) { // lower level than before; expand the previous to contain a ol
73 | for(i = this_level; i > level; i--) {
74 | html += "<"+settings.listType+">"+((i-level == 2) ? "" : " ")
75 | }
76 | html += "" + txt + " ";
77 | }
78 | level = this_level; // update for the next one
79 | });
80 | html += ""+settings.listType+">";
81 | if (!settings.noBackToTopLinks) {
82 | $(document).on('click', '.back-to-top', function() {
83 | $(window).scrollTop(0);
84 | window.location.hash = '';
85 | });
86 | }
87 |
88 | render[settings.showEffect]();
89 | };
90 | })(jQuery);
91 |
--------------------------------------------------------------------------------
/docs/_layouts/default.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | {% include head.html %}
5 |
41 |
46 |
57 | {% if page.datatable == true %}
58 |
59 |
60 |
61 |
66 |
76 | {% endif %}
77 |
78 |
79 |
80 | {% include topnav.html %}
81 |
82 |
83 |
84 |
85 |
86 | {% assign content_col_size = "col-md-12" %}
87 | {% unless page.hide_sidebar %}
88 |
89 |
92 | {% assign content_col_size = "col-md-9" %}
93 | {% endunless %}
94 |
95 |
96 |
97 | {{content}}
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 | {% if site.google_analytics %}
108 | {% include google_analytics.html %}
109 | {% endif %}
110 |
111 |
--------------------------------------------------------------------------------
/20_utils.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# default_exp utils"
10 | ]
11 | },
12 | {
13 | "cell_type": "markdown",
14 | "metadata": {},
15 | "source": [
16 | "# Utils\n",
17 | "\n",
18 |     "> Utility helpers for parallel processing and file chunking :)"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "metadata": {},
25 | "outputs": [
26 | {
27 | "name": "stderr",
28 | "output_type": "stream",
29 | "text": [
30 | "/home/dperezrada/anaconda3/envs/keywords2vec/lib/python3.7/site-packages/fastprogress/fastprogress.py:102: UserWarning: Couldn't import ipywidgets properly, progress bar will use console behavior\n",
31 | " warn(\"Couldn't import ipywidgets properly, progress bar will use console behavior\")\n"
32 | ]
33 | }
34 | ],
35 | "source": [
36 | "#export\n",
37 | "from keywords2vec.imports import *\n",
38 | "\n",
39 | "from fastprogress.fastprogress import progress_bar\n",
40 | "from concurrent.futures import ProcessPoolExecutor, as_completed\n"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": null,
46 | "metadata": {},
47 | "outputs": [],
48 | "source": [
49 | "#export\n",
50 | "\n",
51 | "# BEGIN From fastai\n",
52 | "def parallel(func, arr, max_workers=-1):\n",
53 | " if max_workers == -1:\n",
54 |     "        max_workers = num_cpus(max_workers)\n",
55 | " with ProcessPoolExecutor(max_workers=max_workers) as ex:\n",
56 | " futures = [ex.submit(func, arr_el) for arr_el in arr]\n",
57 | " results = []\n",
58 | " for f in progress_bar(as_completed(futures), total=len(arr)):\n",
59 | " results.append(f.result())\n",
60 | " return results\n",
61 | "\n",
62 |     "def num_cpus(n_cpus):\n",
63 |     "    \"\"\"Get number of cpus.\"\"\"\n",
64 |     "    if n_cpus > 0:\n",
65 |     "        return n_cpus\n",
66 |     "    try:\n",
67 |     "        return len(os.sched_getaffinity(0))\n",
68 |     "    except AttributeError:\n",
69 |     "        return os.cpu_count()\n",
70 | "# END From fastai\n",
71 | "\n",
72 | "\n",
73 | "def open_file(filepath, options):\n",
74 | " if filepath[-3:] == \".gz\":\n",
75 | " return gzip.open(filepath, options)\n",
76 | " return open(filepath, options)\n",
77 | "\n",
78 | "\n",
79 | "def chunk_of_text(_file, chunk_size=-1):\n",
80 | " index = 0\n",
81 | " if chunk_size == -1:\n",
82 | " chunk_size = 200\n",
83 | " while True:\n",
84 | " line = _file.readline()\n",
85 | " if not line:\n",
86 | " break\n",
87 | " for sentence in line.split(\".\"):\n",
88 | " if sentence.strip():\n",
89 | " yield sentence.strip()\n",
90 | " if index >= chunk_size:\n",
91 | " break\n",
92 | " index += 1\n",
93 | "\n",
94 | "\n",
95 | "def get_file_chunks(start_index, filepath, lines_chunk, sample_size):\n",
96 | " _file = open_file(filepath, 'rt')\n",
97 | " texts = []\n",
98 | " break_by_sample = False\n",
99 | " while True:\n",
100 | " next_n_lines = list(chunk_of_text(_file, lines_chunk))\n",
101 | " texts.append(\"\\n\".join(next_n_lines) + \"\\n\")\n",
102 | " if not next_n_lines:\n",
103 | " break\n",
104 | " start_index += lines_chunk\n",
105 | " if sample_size > 0 and start_index >= sample_size:\n",
106 | " break_by_sample = True\n",
107 | " break\n",
108 | " _file.close()\n",
109 | " return (start_index, texts, break_by_sample)\n"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": null,
115 | "metadata": {},
116 | "outputs": [],
117 | "source": []
118 | }
119 | ],
120 | "metadata": {
121 | "kernelspec": {
122 | "display_name": "Python 3",
123 | "language": "python",
124 | "name": "python3"
125 | }
126 | },
127 | "nbformat": 4,
128 | "nbformat_minor": 4
129 | }
130 |
--------------------------------------------------------------------------------
/docs/utils.html:
--------------------------------------------------------------------------------
1 | ---
2 |
3 | title: Utils
4 |
5 | keywords: fastai
6 | sidebar: home_sidebar
7 |
8 | summary: "Utility helpers for parallel processing and file chunking :)"
9 | ---
10 |
19 |
20 |
21 | {% raw %}
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
parallel(func , arr , max_workers =-1 )
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
num_cpus(n_cpus )
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
open_file(filepath , options )
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
chunk_of_text(_file , chunk_size =-1 )
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
get_file_chunks(start_index , filepath , lines_chunk , sample_size )
122 |
123 |
124 |
125 |
126 |
127 |
128 |
129 |
130 |
131 |
132 | {% endraw %}
133 |
134 |
135 |
136 |
--------------------------------------------------------------------------------
/docs/_includes/head.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | {{ page.title }} | {{ site.site_title }}
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 | {% if site.use_math %}
25 |
26 |
27 |
28 |
39 | {% endif %}
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
56 |
57 |
58 |
59 |
60 | {% if site.twitter_username %}
61 |
62 |
63 |
64 | {% endif %}
65 |
66 | {% if page.summary %}
67 |
68 | {% else %}
69 |
70 | {% endif %}
71 |
72 | {% if page.image %}
73 |
74 |
75 | {% else %}
76 |
77 |
78 | {% endif %}
79 |
80 |
81 |
82 |
83 |
--------------------------------------------------------------------------------
/docs/js/jekyll-search.js:
--------------------------------------------------------------------------------
1 | !function e(t,n,r){function s(o,u){if(!n[o]){if(!t[o]){var a="function"==typeof require&&require;if(!u&&a)return a(o,!0);if(i)return i(o,!0);throw new Error("Cannot find module '"+o+"'")}var f=n[o]={exports:{}};t[o][0].call(f.exports,function(e){var n=t[o][1][e];return s(n?n:e)},f,f.exports,e,t,n,r)}return n[o].exports}for(var i="function"==typeof require&&require,o=0;o=0}var self=this;self.matches=function(string,crit){return"string"!=typeof string?!1:(string=string.trim(),doMatch(string,crit))}}module.exports=new LiteralSearchStrategy},{}],4:[function(require,module){module.exports=function(){function findMatches(store,crit,strategy){for(var data=store.get(),i=0;i{title} ',noResultsText:"No results found",limit:10,fuzzy:!1};self.init=function(_opt){validateOptions(_opt),assignOptions(_opt),isJSON(opt.dataSource)?initWithJSON(opt.dataSource):initWithURL(opt.dataSource)}}var Searcher=require("./Searcher"),Templater=require("./Templater"),Store=require("./Store"),JSONLoader=require("./JSONLoader"),searcher=new Searcher,templater=new Templater,store=new Store,jsonLoader=new JSONLoader;window.SimpleJekyllSearch=new SimpleJekyllSearch}(window,document)},{"./JSONLoader":1,"./Searcher":4,"./Store":5,"./Templater":6}]},{},[7]);
2 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # keywords2vec
2 | > A simple and fast way to generate a word2vec model, with multi-word keywords instead of single words.
3 |
4 |
5 | ## Example result
6 |
7 | Finding similar keywords for "__obesity__"
8 |
9 | | index | term |
10 | |-------|-----------------------------|
11 | | 0 | overweight |
12 | | 1 | obese |
13 | | 2 | physical inactivity |
14 | | 3 | excess weight |
15 | | 4 | obese adults |
16 | | 5 | high bmi |
17 | | 6 | obese adults |
18 | | 7 | obese people |
19 | | 8 | obesity-related outcomes |
20 | | 9 | obesity among children |
21 | | 10 | poor sleep quality |
22 | | 11 | ssbs |
23 | | 12 | obese populations |
24 | | 13 | cardiometabolic risk |
25 | | 14 | abdominal obesity |
26 |
27 |
28 | ## Install
29 |
30 | `pip install keywords2vec`
31 |
32 | ## How to use
33 |
34 | Let's download some example data
35 |
36 | ```
37 | data_filepath = "epistemonikos_data_sample.tsv.gz"
38 |
39 | !wget "https://s3.amazonaws.com/episte-labs/epistemonikos_data_sample.tsv.gz" -O "{data_filepath}"
40 | ```
41 |
42 | Import
43 |
44 | ```
45 | from keywords2vec.main import similars_tree, get_similars
46 | ```
47 |
48 |
49 | We create the model.
50 |
51 | ```
52 | labels, tree = similars_tree(data_filepath)
53 | ```
54 |
55 | For more info, take a look [here](30_main.ipynb).
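
`similars_tree` wraps a few lower-level steps. If you need the intermediate outputs (for example the trained vectors), the pipeline looks roughly like the sketch below; the return values of `train_model` and `similars_tree_from_model` are assumptions here, so check [30_main.ipynb](30_main.ipynb) for the details.

```
from keywords2vec.main import tokenize_file, train_model, similars_tree_from_model

# 1. Tokenize the raw file into multi-word keywords (returns the output path)
tokenized_path = tokenize_file(data_filepath, output_path="tokenized.txt")

# 2. Train the embeddings on the tokenized file (assumed: returns the trained model)
model = train_model(tokenized_path)

# 3. Build the similarity index (assumed: returns the same labels/tree pair as similars_tree)
labels, tree = similars_tree_from_model(model)
```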
56 |
57 |
58 |
59 | Then we can get the most similar keywords
60 |
61 | ```
62 | get_similars(tree, labels, "obesity")
63 | ```
64 |
65 |
66 |
67 |
68 | ['obesity',
69 | 'overweight',
70 | 'obese',
71 | 'physical inactivity',
72 | 'excess weight',
73 | 'high bmi',
74 | 'obese adults',
75 | 'obese people',
76 | 'obesity-related outcomes',
77 | 'obesity among children',
78 | 'poor sleep quality',
79 | 'ssbs',
80 | 'obese populations',
81 | 'cardiometabolic risk',
82 | 'abdominal obesity']
83 |
84 |
85 |
86 | ```
87 | get_similars(tree, labels, "heart failure")
88 | ```
89 |
90 |
91 |
92 |
93 | ['heart failure',
94 | 'hf',
95 | 'chf',
96 | 'chronic heart failure',
97 | 'reduced ejection fraction',
98 | 'unstable angina',
99 | 'peripheral vascular disease',
100 | 'peripheral arterial disease',
101 | 'angina',
102 | 'congestive heart failure',
103 | 'left ventricular systolic dysfunction',
104 | 'acute coronary syndrome',
105 | 'heart failure patients',
106 | 'acute myocardial infarction',
107 | 'left ventricular dysfunction']
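
`get_similars` also accepts `n_similars` and `show_score` parameters (defaults are `n_similars=10` and `show_score=False`), so a rough sketch for retrieving more neighbours together with their scores would be:

```
get_similars(tree, labels, "heart failure", n_similars=20, show_score=True)
```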
108 |
109 |
110 |
111 | ### Motivation
112 |
113 | The idea started in the Epistemonikos database [www.epistemonikos.org](https://www.epistemonikos.org), a database of scientific articles for people making decisions concerning clinical or health-policy questions. In this context the scientific/health language used is complex. You can easily find keywords like:
114 |
115 | * asthma
116 | * heart failure
117 | * medial compartment knee osteoarthritis
118 | * preserved left ventricular systolic function
119 | * non-selective non-steroidal anti-inflammatory drugs
120 |
121 | We tried several approaches to find those keywords, like ngrams, ngrams + tf-idf, and entity recognition, among others. But we didn't get really good results.
122 |
123 |
124 | ### Our approach
125 |
126 | We found that tokenizing using stopwords + non-word characters was really useful for "finding" the keywords. An example:
127 |
128 | * input: "Timing of replacement therapy for acute renal failure after cardiac surgery"
129 | * output: [
130 | "timing",
131 | "replacement therapy",
132 | "acute renal failure",
133 | "cardiac surgery"
134 | ]
135 |
136 | So we basically split the text when we find:
137 | * a stopword
138 | * a non-word character (/, !, ?, . etc), except for - and '
139 |
140 | That's it.
141 |
142 | But there were some problems with keywords that contain stopwords, like:
143 | * Vitamin A
144 | * Hepatitis A
145 | * Web of Science
146 |
147 | So we decided to add another method (nltk with a grammar definition) to cover most of these cases. To use it, add the parameter `keywords_w_stopwords=True`; note that this method is approx. 20x slower.
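
As a rough sketch, the tokenizer can also be called directly (the exact output depends on the stopword lists installed; the expected result below just mirrors the example above):

```
from keywords2vec.tokenizer import tokenize

# Fast path: split on stopwords and non-word characters only
tokenize("Timing of replacement therapy for acute renal failure after cardiac surgery")
# -> ['timing', 'replacement therapy', 'acute renal failure', 'cardiac surgery']

# Slower path: also run the NLTK grammar pass, so keywords that
# contain stopwords (e.g. "hepatitis a") have a chance to survive
tokenize("Hepatitis A vaccination in children", keywords_w_stopwords=True)
```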
148 |
149 | ### References
150 |
151 | It seems to be an old idea (2004):
152 |
153 | *Mihalcea, Rada, and Paul Tarau. "Textrank: Bringing order into text." Proceedings of the 2004 conference on empirical methods in natural language processing. 2004.*
154 |
155 | Reading an implementation of TextRank, I realized they used stopwords to split the text and build the graph. Then I thought of using that as the tokenizer for word2vec.
156 |
157 | As pointed out by @deliprao in this [Twitter thread](https://twitter.com/jeremyphoward/status/1094025901371621376), it's also used by Rake (2010):
158 |
159 | *Rose, Stuart & Engel, Dave & Cramer, Nick & Cowley, Wendy. (2010). Automatic Keyword Extraction from Individual Documents. 10.1002/9780470689646.ch1.*
160 |
161 | As noted by @astent in the Twitter thread, this concept is called chinking (chunking by exclusion)
162 | [https://www.nltk.org/book/ch07.html#Chinking](https://www.nltk.org/book/ch07.html#Chinking)
163 |
164 |
165 | ### Multi-lingual
166 | We worked on an implementation that can be used in multiple languages. Of course, not all languages are suitable for this approach. We have tried it with good results in English, Spanish and Portuguese.
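
For example, assuming the Spanish stopword list is available (the stopwords come from `safe_get_stop_words`), switching languages is just the `lang` parameter; the sentence below is only illustrative:

```
from keywords2vec.tokenizer import tokenize

tokenize("Tratamiento de reemplazo para la insuficiencia renal aguda", lang="es")
```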
167 |
168 |
169 | ## Try it online
170 |
171 | You can try it [here](http://54.196.169.11/episte/) (takes time to load, lowercase only, doesn't work on mobile yet). It's an MVP :)
172 |
173 | These embeddings were created using 827,341 titles/abstracts from the @epistemonikos database,
174 | with keywords that appear at least 10 times. The total vocab is 349,080 keywords (a really manageable number).
175 |
176 | ## Vocab size
177 |
178 | One of the main benefits of this method is the size of the vocabulary.
179 | For example, using keywords that repeat at least 10 times, for the Epistemonikos dataset (827,341 title/abstract), we got the following vocab size:
180 |
181 | | ngrams | keywords | relative size |
182 | |--------------------|-----------|---------|
183 | | 1 | 127,824 | 36% |
184 | | 1,2 | 1,360,550 | 388% |
185 | | 1-3 | 3,204,099 | 914% |
186 | | 1-4 | 4,461,930 | 1,272% |
187 | | 1-5 | 5,133,619 | 1,464% |
188 | | | | |
189 | | stopword tokenizer | 350,529 | 100% |
190 |
191 | For more information regarding the comparison, take a look at the folder [analyze](analyze).
192 |
193 |
194 | ## Credits
195 |
196 | This project has been created using [nbdev](https://github.com/fastai/nbdev)
197 |
--------------------------------------------------------------------------------
/keywords2vec/tokenizer.py:
--------------------------------------------------------------------------------
1 | # AUTOGENERATED! DO NOT EDIT! File to edit: 10_tokenizer.ipynb (unless otherwise specified).
2 |
3 | __all__ = ['NUMBERS_STOPWORDS', 'prepare_stopwords', 'tokenize_one', 'get_nodes_for_ntlk', 'tokenize_by_nltk',
4 | 'tokenize']
5 |
6 | # Cell
7 | from .imports import *
8 |
9 | # Cell
10 | NUMBERS_STOPWORDS = {
11 | "en": [
12 | "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen", "seventeen", "eighteen", "nineteen", "twenty", "twenty-one", "twenty-two", "twenty-three", "twenty-four", "twenty-five", "twenty-six", "twenty-seven", "twenty-eight", "twenty-nine", "thirty", "thirty-one", "thirty-two", "thirty-three", "thirty-four", "thirty-five", "thirty-six", "thirty-seven", "thirty-eight", "thirty-nine", "forty", "forty-one", "forty-two", "forty-three", "forty-four", "forty-five", "forty-six", "forty-seven", "forty-eight", "forty-nine", "fifty", "fifty-one", "fifty-two", "fifty-three", "fifty-four", "fifty-five", "fifty-six", "fifty-seven", "fifty-eight", "fifty-nine", "sixty", "sixty-one", "sixty-two", "sixty-three", "sixty-four", "sixty-five", "sixty-six", "sixty-seven", "sixty-eight", "sixty-nine", "seventy", "seventy-one", "seventy-two", "seventy-three", "seventy-four", "seventy-five", "seventy-six", "seventy-seven", "seventy-eight", "seventy-nine", "eighty", "eighty-one", "eighty-two", "eighty-three", "eighty-four", "eighty-five", "eighty-six", "eighty-seven", "eighty-eight", "eighty-nine", "ninety", "ninety-one", "ninety-two", "ninety-three", "ninety-four", "ninety-five", "ninety-six", "ninety-seven", "ninety-eight", "ninety-nine"
13 | ],
14 | "es": []
15 | }
16 |
17 | # Cell
18 | def prepare_stopwords(stopwords=None, additional_stopwords=None, lang="en"):
19 | if stopwords is None:
20 | stopwords = safe_get_stop_words(lang) + (NUMBERS_STOPWORDS.get(lang) or [])
21 | if additional_stopwords:
22 | stopwords += additional_stopwords
23 | return [
24 | stopword
25 | for stopword in stopwords
26 | if stopword
27 | ]
28 |
29 |
30 | def tokenize_one(text, stopwords=None, additional_stopwords=None, lang="en", split_by_stopwords=True):
31 | stopwords = prepare_stopwords(stopwords, additional_stopwords, lang)
32 | text_part = text.lower()
33 |
34 | regexs = []
35 | if split_by_stopwords:
36 | # Remove all stopwords by a !, we are searching for the stopword (bounded)
37 | regexs.append(
38 | ("\\b" + "\\b|\\b".join(stopwords), "!!")
39 | )
40 | # Must be executed in order
41 | regexs += [
42 | ("’", "'"),
43 | # Remove all non alpha, numeric, spaces, - or single quote
44 | (r'([^a-z0-9\u00C0-\u1FFF\u2C00-\uD7FF \n\-\'])', "!!"),
45 | # remove only words numbers
46 | (r'(^|[ !])[\-0-9]+([ !]|$)', "!!"),
47 | # remove hyphen-minus for keywords starting or ending with it
48 | (r'((^|[ !])[\-\']+)|([\-\']+([ !]|$))', "!!"),
49 | # remove spaces between !
50 | (r' *! *', "!!"),
51 | # generate multiple ! need for next regex
52 | (r'!', "!!"),
53 | # remove one character keyword
54 | (r'(^|!)[^!\n](!|$)', "!!"),
55 | # remove multiple ! (!!!!)
56 | (r'!+', "!"),
57 | # remove first and last !
58 | (r'(^!+)|(!+$)', "")
59 | ]
60 |
61 | for regex, replacement in regexs:
62 | text_part = re.sub(regex, replacement, text_part, flags=re.M)
63 | return text_part
64 |
65 |
66 | # Second option to tokenize the information
67 | def get_nodes_for_ntlk(parent, stopwords, valid_labels):
68 | keywords = []
69 | for node in parent:
70 | if type(node) is nltk.Tree:
71 | if node.label() in valid_labels:
72 | phrase = " ".join([key.lower() for key, value in node.leaves()])
73 | phrase = unidecode.unidecode(phrase)
74 | for subtree in node.subtrees():
75 | subtree_keywords = get_nodes_for_ntlk(subtree, stopwords, valid_labels)
76 | keywords.extend(subtree_keywords)
77 | if phrase not in stopwords:
78 | pattern = re.compile(r"([^\s\w-]|_)+")
79 | phrase = pattern.sub('', phrase).strip()
80 | keywords.append(phrase)
81 | return keywords
82 |
83 |
84 | def tokenize_by_nltk(text, stopwords=None, additional_stopwords=None, lang="en"):
85 | stopwords = prepare_stopwords(stopwords, additional_stopwords, lang)
86 |     # grammar = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'
87 | grammar = r"""
88 | PHRASE1: {+ *}
89 | PHRASE2: { +}
90 | PHRASE3: {+ }
91 | PHRASE4: {(* + ) }
92 | """
93 | valid_labels = ["PHRASE1", "PHRASE2", "PHRASE3", "PHRASE4", "KT"]
94 |
95 | chunker = nltk.RegexpParser(grammar, loop=5)
96 |     chunker2 = nltk.RegexpParser(r"KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}", loop=5)
97 | output = ""
98 |
99 | for line in text.splitlines():
100 | sentences = nltk.sent_tokenize(line)
101 | sentences = [nltk.word_tokenize(sent) for sent in sentences]
102 | sentences = [nltk.pos_tag(sent) for sent in sentences]
103 | keyphrases = []
104 | relevant_words = []
105 | for sentence in sentences:
106 | keyphrases.append(chunker.parse(sentence))
107 | keyphrases.append(chunker2.parse(sentence))
108 | for elem in keyphrases:
109 | relevant_words += get_nodes_for_ntlk(elem, stopwords, valid_labels)
110 | output += "!".join(relevant_words) + "!"
111 |
112 | #output = re.sub("\\b" + "\\b|!".join(stopwords), "!", output, flags=re.M).lower()
113 | output = tokenize_one(output, split_by_stopwords=False)
114 | return output
115 |
116 |
117 | def tokenize(text, text_output=False, lang="en", keywords_w_stopwords=False, merge=True):
118 | outputs = []
119 | tokenizers = [tokenize_one]
120 | if lang == "en" and keywords_w_stopwords:
121 | tokenizers.append(tokenize_by_nltk)
122 |
123 | for tokenizer_el in tokenizers:
124 | outputs.append(
125 | tokenizer_el(
126 | text,
127 | lang=lang
128 | )
129 | )
130 | if text_output:
131 | if merge:
132 | return "!".join(outputs)
133 | else:
134 | return outputs
135 | keywords = [
136 | [
137 | keyword.strip()
138 | for phrase in re.split("\r\n|\n", output)
139 | for keyword in phrase.split("!")
140 | ]
141 | for output in outputs
142 | ]
143 | if merge:
144 | return [item for sublist in keywords for item in sublist]
145 | else:
146 | return keywords
--------------------------------------------------------------------------------
/docs/Gemfile.lock:
--------------------------------------------------------------------------------
1 | GEM
2 | remote: https://rubygems.org/
3 | specs:
4 | activesupport (4.2.11.1)
5 | i18n (~> 0.7)
6 | minitest (~> 5.1)
7 | thread_safe (~> 0.3, >= 0.3.4)
8 | tzinfo (~> 1.1)
9 | addressable (2.7.0)
10 | public_suffix (>= 2.0.2, < 5.0)
11 | coffee-script (2.4.1)
12 | coffee-script-source
13 | execjs
14 | coffee-script-source (1.11.1)
15 | colorator (1.1.0)
16 | commonmarker (0.17.13)
17 | ruby-enum (~> 0.5)
18 | concurrent-ruby (1.1.5)
19 | dnsruby (1.61.3)
20 | addressable (~> 2.5)
21 | em-websocket (0.5.1)
22 | eventmachine (>= 0.12.9)
23 | http_parser.rb (~> 0.6.0)
24 | ethon (0.12.0)
25 | ffi (>= 1.3.0)
26 | eventmachine (1.2.7)
27 | execjs (2.7.0)
28 | faraday (0.17.0)
29 | multipart-post (>= 1.2, < 3)
30 | ffi (1.11.3)
31 | forwardable-extended (2.6.0)
32 | gemoji (3.0.1)
33 | github-pages (202)
34 | activesupport (= 4.2.11.1)
35 | github-pages-health-check (= 1.16.1)
36 | jekyll (= 3.8.5)
37 | jekyll-avatar (= 0.6.0)
38 | jekyll-coffeescript (= 1.1.1)
39 | jekyll-commonmark-ghpages (= 0.1.6)
40 | jekyll-default-layout (= 0.1.4)
41 | jekyll-feed (= 0.11.0)
42 | jekyll-gist (= 1.5.0)
43 | jekyll-github-metadata (= 2.12.1)
44 | jekyll-mentions (= 1.4.1)
45 | jekyll-optional-front-matter (= 0.3.0)
46 | jekyll-paginate (= 1.1.0)
47 | jekyll-readme-index (= 0.2.0)
48 | jekyll-redirect-from (= 0.14.0)
49 | jekyll-relative-links (= 0.6.0)
50 | jekyll-remote-theme (= 0.4.0)
51 | jekyll-sass-converter (= 1.5.2)
52 | jekyll-seo-tag (= 2.5.0)
53 | jekyll-sitemap (= 1.2.0)
54 | jekyll-swiss (= 0.4.0)
55 | jekyll-theme-architect (= 0.1.1)
56 | jekyll-theme-cayman (= 0.1.1)
57 | jekyll-theme-dinky (= 0.1.1)
58 | jekyll-theme-hacker (= 0.1.1)
59 | jekyll-theme-leap-day (= 0.1.1)
60 | jekyll-theme-merlot (= 0.1.1)
61 | jekyll-theme-midnight (= 0.1.1)
62 | jekyll-theme-minimal (= 0.1.1)
63 | jekyll-theme-modernist (= 0.1.1)
64 | jekyll-theme-primer (= 0.5.3)
65 | jekyll-theme-slate (= 0.1.1)
66 | jekyll-theme-tactile (= 0.1.1)
67 | jekyll-theme-time-machine (= 0.1.1)
68 | jekyll-titles-from-headings (= 0.5.1)
69 | jemoji (= 0.10.2)
70 | kramdown (= 1.17.0)
71 | liquid (= 4.0.0)
72 | listen (= 3.1.5)
73 | mercenary (~> 0.3)
74 | minima (= 2.5.0)
75 | nokogiri (>= 1.10.4, < 2.0)
76 | rouge (= 3.11.0)
77 | terminal-table (~> 1.4)
78 | github-pages-health-check (1.16.1)
79 | addressable (~> 2.3)
80 | dnsruby (~> 1.60)
81 | octokit (~> 4.0)
82 | public_suffix (~> 3.0)
83 | typhoeus (~> 1.3)
84 | html-pipeline (2.12.2)
85 | activesupport (>= 2)
86 | nokogiri (>= 1.4)
87 | http_parser.rb (0.6.0)
88 | i18n (0.9.5)
89 | concurrent-ruby (~> 1.0)
90 | jekyll (3.8.5)
91 | addressable (~> 2.4)
92 | colorator (~> 1.0)
93 | em-websocket (~> 0.5)
94 | i18n (~> 0.7)
95 | jekyll-sass-converter (~> 1.0)
96 | jekyll-watch (~> 2.0)
97 | kramdown (~> 1.14)
98 | liquid (~> 4.0)
99 | mercenary (~> 0.3.3)
100 | pathutil (~> 0.9)
101 | rouge (>= 1.7, < 4)
102 | safe_yaml (~> 1.0)
103 | jekyll-avatar (0.6.0)
104 | jekyll (~> 3.0)
105 | jekyll-coffeescript (1.1.1)
106 | coffee-script (~> 2.2)
107 | coffee-script-source (~> 1.11.1)
108 | jekyll-commonmark (1.3.1)
109 | commonmarker (~> 0.14)
110 | jekyll (>= 3.7, < 5.0)
111 | jekyll-commonmark-ghpages (0.1.6)
112 | commonmarker (~> 0.17.6)
113 | jekyll-commonmark (~> 1.2)
114 | rouge (>= 2.0, < 4.0)
115 | jekyll-default-layout (0.1.4)
116 | jekyll (~> 3.0)
117 | jekyll-feed (0.11.0)
118 | jekyll (~> 3.3)
119 | jekyll-gist (1.5.0)
120 | octokit (~> 4.2)
121 | jekyll-github-metadata (2.12.1)
122 | jekyll (~> 3.4)
123 | octokit (~> 4.0, != 4.4.0)
124 | jekyll-mentions (1.4.1)
125 | html-pipeline (~> 2.3)
126 | jekyll (~> 3.0)
127 | jekyll-optional-front-matter (0.3.0)
128 | jekyll (~> 3.0)
129 | jekyll-paginate (1.1.0)
130 | jekyll-readme-index (0.2.0)
131 | jekyll (~> 3.0)
132 | jekyll-redirect-from (0.14.0)
133 | jekyll (~> 3.3)
134 | jekyll-relative-links (0.6.0)
135 | jekyll (~> 3.3)
136 | jekyll-remote-theme (0.4.0)
137 | addressable (~> 2.0)
138 | jekyll (~> 3.5)
139 | rubyzip (>= 1.2.1, < 3.0)
140 | jekyll-sass-converter (1.5.2)
141 | sass (~> 3.4)
142 | jekyll-seo-tag (2.5.0)
143 | jekyll (~> 3.3)
144 | jekyll-sitemap (1.2.0)
145 | jekyll (~> 3.3)
146 | jekyll-swiss (0.4.0)
147 | jekyll-theme-architect (0.1.1)
148 | jekyll (~> 3.5)
149 | jekyll-seo-tag (~> 2.0)
150 | jekyll-theme-cayman (0.1.1)
151 | jekyll (~> 3.5)
152 | jekyll-seo-tag (~> 2.0)
153 | jekyll-theme-dinky (0.1.1)
154 | jekyll (~> 3.5)
155 | jekyll-seo-tag (~> 2.0)
156 | jekyll-theme-hacker (0.1.1)
157 | jekyll (~> 3.5)
158 | jekyll-seo-tag (~> 2.0)
159 | jekyll-theme-leap-day (0.1.1)
160 | jekyll (~> 3.5)
161 | jekyll-seo-tag (~> 2.0)
162 | jekyll-theme-merlot (0.1.1)
163 | jekyll (~> 3.5)
164 | jekyll-seo-tag (~> 2.0)
165 | jekyll-theme-midnight (0.1.1)
166 | jekyll (~> 3.5)
167 | jekyll-seo-tag (~> 2.0)
168 | jekyll-theme-minimal (0.1.1)
169 | jekyll (~> 3.5)
170 | jekyll-seo-tag (~> 2.0)
171 | jekyll-theme-modernist (0.1.1)
172 | jekyll (~> 3.5)
173 | jekyll-seo-tag (~> 2.0)
174 | jekyll-theme-primer (0.5.3)
175 | jekyll (~> 3.5)
176 | jekyll-github-metadata (~> 2.9)
177 | jekyll-seo-tag (~> 2.0)
178 | jekyll-theme-slate (0.1.1)
179 | jekyll (~> 3.5)
180 | jekyll-seo-tag (~> 2.0)
181 | jekyll-theme-tactile (0.1.1)
182 | jekyll (~> 3.5)
183 | jekyll-seo-tag (~> 2.0)
184 | jekyll-theme-time-machine (0.1.1)
185 | jekyll (~> 3.5)
186 | jekyll-seo-tag (~> 2.0)
187 | jekyll-titles-from-headings (0.5.1)
188 | jekyll (~> 3.3)
189 | jekyll-watch (2.2.1)
190 | listen (~> 3.0)
191 | jemoji (0.10.2)
192 | gemoji (~> 3.0)
193 | html-pipeline (~> 2.2)
194 | jekyll (~> 3.0)
195 | kramdown (1.17.0)
196 | liquid (4.0.0)
197 | listen (3.1.5)
198 | rb-fsevent (~> 0.9, >= 0.9.4)
199 | rb-inotify (~> 0.9, >= 0.9.7)
200 | ruby_dep (~> 1.2)
201 | mercenary (0.3.6)
202 | mini_portile2 (2.4.0)
203 | minima (2.5.0)
204 | jekyll (~> 3.5)
205 | jekyll-feed (~> 0.9)
206 | jekyll-seo-tag (~> 2.1)
207 | minitest (5.13.0)
208 | multipart-post (2.1.1)
209 | nokogiri (1.10.8)
210 | mini_portile2 (~> 2.4.0)
211 | octokit (4.14.0)
212 | sawyer (~> 0.8.0, >= 0.5.3)
213 | pathutil (0.16.2)
214 | forwardable-extended (~> 2.6)
215 | public_suffix (3.1.1)
216 | rb-fsevent (0.10.3)
217 | rb-inotify (0.10.0)
218 | ffi (~> 1.0)
219 | rouge (3.11.0)
220 | ruby-enum (0.7.2)
221 | i18n
222 | ruby_dep (1.5.0)
223 | rubyzip (2.0.0)
224 | safe_yaml (1.0.5)
225 | sass (3.7.4)
226 | sass-listen (~> 4.0.0)
227 | sass-listen (4.0.0)
228 | rb-fsevent (~> 0.9, >= 0.9.4)
229 | rb-inotify (~> 0.9, >= 0.9.7)
230 | sawyer (0.8.2)
231 | addressable (>= 2.3.5)
232 | faraday (> 0.8, < 2.0)
233 | terminal-table (1.8.0)
234 | unicode-display_width (~> 1.1, >= 1.1.1)
235 | thread_safe (0.3.6)
236 | typhoeus (1.3.1)
237 | ethon (>= 0.9.0)
238 | tzinfo (1.2.5)
239 | thread_safe (~> 0.1)
240 | unicode-display_width (1.6.0)
241 |
242 | PLATFORMS
243 | ruby
244 |
245 | DEPENDENCIES
246 | github-pages
247 | jekyll (~> 3.7)
248 |
249 | BUNDLED WITH
250 | 2.0.2
251 |
--------------------------------------------------------------------------------
/docs/main.html:
--------------------------------------------------------------------------------
1 | ---
2 |
3 | title: Main
4 |
5 | keywords: fastai
6 | sidebar: home_sidebar
7 |
8 | summary: "These are the main functions."
9 | ---
10 |
19 |
20 |
21 | {% raw %}
22 |
23 |
24 |
25 |
26 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
similars_tree(input_path , temp_tokenized_file ='tmp_tokenized.txt' , lang ='en' , sample_size =-1 , lines_chunks =-1 , n_cpus =-1 , keywords_w_stopwords =False )
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
--2020-02-25 11:52:04-- https://s3.amazonaws.com/episte-labs/epistemonikos_data_sample.tsv.gz
87 | Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.240.38
88 | Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.240.38|:443... connected.
89 | HTTP request sent, awaiting response... 200 OK
90 | Length: 21510551 (21M) [application/gzip]
91 | Saving to: ‘epistemonikos_data_sample.tsv.gz’
92 |
93 | epistemonikos_data_ 100%[===================>] 20.51M 1.76MB/s in 12s
94 |
95 | 2020-02-25 11:52:17 (1.70 MB/s) - ‘epistemonikos_data_sample.tsv.gz’ saved [21510551/21510551]
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
tokenize_file(input_path , output_path ='tokenized.txt' , lang ='en' , sample_size =-1 , lines_chunks =-1 , n_cpus =-1 , keywords_w_stopwords =False )
115 |
116 |
117 |
118 |
119 |
120 |
121 |
122 |
123 |
124 |
125 |
126 |
136 |
137 |
138 |
139 |
140 |
141 |
142 |
143 |
processing file: epistemonikos_data_sample.tsv.gz
144 |
145 |
146 |
147 |
148 |
149 |
150 |
151 |
152 |
153 |
154 |
166 |
167 | 100.00% [201/201 00:16<00:00]
168 |
169 |
170 |
171 |
172 |
173 |
174 |
175 |
176 |
177 |
178 |
179 |
'tokenized_epistemonikos_data.txt'
180 |
181 |
182 |
183 |
184 |
185 |
186 |
187 |
188 |
189 |
190 |
191 |
192 |
193 |
194 |
195 |
196 |
197 |
train_model(input_filename )
198 |
199 |
200 |
201 |
202 |
203 |
204 |
205 |
206 |
207 |
208 |
221 |
222 |
223 |
224 |
225 |
226 |
227 |
228 |
229 |
230 |
similars_tree_from_model(model , vector_size =100 )
231 |
232 |
233 |
234 |
235 |
236 |
237 |
238 |
239 |
240 |
241 |
254 |
255 |
256 |
257 |
258 |
259 |
260 |
261 |
262 |
263 |
get_similars(tree , labels , keyword , n_similars =10 , show_score =False )
264 |
265 |
266 |
267 |
268 |
269 |
270 |
271 |
272 |
273 |
274 |
275 |
285 |
286 |
287 |
288 |
289 |
290 |
291 |
292 |
293 |
294 |
['obesity',
295 | 'overweight',
296 | 'obese children',
297 | 'ssbs',
298 | 'poor sleep quality',
299 | 'metabolic syndrome',
300 | 'obesity among children',
301 | 'dental caries',
302 | 'physical inactivity',
303 | 'obesity may',
304 | 'sedentary behaviour',
305 | 'food allergy',
306 | 'sugar-sweetened beverages',
307 | 'worldwide prevalence',
308 | 'known risk factor']
309 |
310 |
311 |
312 |
313 |
314 |
315 |
316 |
317 | {% endraw %}
318 |
319 |
320 |
321 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/index.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "#hide\n",
10 | "from keywords2vec.main import *"
11 | ]
12 | },
13 | {
14 | "cell_type": "markdown",
15 | "metadata": {},
16 | "source": [
17 | "# keywords2vec\n",
18 | "\n",
19 | "> A simple and fast way to generate a word2vec model, with multi-word keywords instead of single words.\n"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {},
25 | "source": [
26 | "## Example result"
27 | ]
28 | },
29 | {
30 | "cell_type": "markdown",
31 | "metadata": {},
32 | "source": [
33 | "Finding similar keywords for \"__obesity__\"\n",
34 | "\n",
35 | "| index | term |\n",
36 | "|-------|-----------------------------|\n",
37 | "| 0 | overweight |\n",
38 | "| 1 | obese |\n",
39 | "| 2 | physical inactivity |\n",
40 | "| 3 | excess weight |\n",
41 | "| 4 | obese adults |\n",
42 | "| 5 | high bmi |\n",
43 | "| 6 | obese adults |\n",
44 | "| 7 | obese people |\n",
45 | "| 8 | obesity-related outcomes |\n",
46 | "| 9 | obesity among children |\n",
47 | "| 10 | poor sleep quality |\n",
48 | "| 11 | ssbs |\n",
49 | "| 12 | obese populations |\n",
50 | "| 13 | cardiometabolic risk |\n",
51 | "| 14 | abdominal obesity |\n"
52 | ]
53 | },
54 | {
55 | "cell_type": "markdown",
56 | "metadata": {},
57 | "source": [
58 | "## Install"
59 | ]
60 | },
61 | {
62 | "cell_type": "markdown",
63 | "metadata": {},
64 | "source": [
65 | "`pip install keywords2vec`"
66 | ]
67 | },
68 | {
69 | "cell_type": "markdown",
70 | "metadata": {},
71 | "source": [
72 | "## How to use"
73 | ]
74 | },
75 | {
76 | "cell_type": "markdown",
77 | "metadata": {},
78 | "source": [
79 |     "Let's download some example data"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": null,
85 | "metadata": {},
86 | "outputs": [],
87 | "source": [
88 | "data_filepath = \"epistemonikos_data_sample.tsv.gz\"\n",
89 | "\n",
90 | "!wget \"https://s3.amazonaws.com/episte-labs/epistemonikos_data_sample.tsv.gz\" -O \"{data_filepath}\""
91 | ]
92 | },
93 | {
94 | "cell_type": "markdown",
95 | "metadata": {},
96 | "source": [
97 | "We create the model. If you need the vectors, take a look [here](30_main.ipynb)"
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": null,
103 | "metadata": {},
104 | "outputs": [
105 | {
106 | "name": "stdout",
107 | "output_type": "stream",
108 | "text": [
109 | "processing file: epistemonikos_data_sample.tsv.gz\n"
110 | ]
111 | },
112 | {
113 | "data": {
114 | "text/html": [
115 | "\n",
116 | " \n",
117 | " \n",
129 | "
\n",
130 | " 100.00% [201/201 00:19<00:00]\n",
131 | "
\n",
132 | " "
133 | ],
134 | "text/plain": [
135 | ""
136 | ]
137 | },
138 | "metadata": {},
139 | "output_type": "display_data"
140 | }
141 | ],
142 | "source": [
143 | "labels, tree = similars_tree(data_filepath)"
144 | ]
145 | },
146 | {
147 | "cell_type": "markdown",
148 | "metadata": {},
149 | "source": [
150 |     "Then we can get the most similar keywords"
151 | ]
152 | },
153 | {
154 | "cell_type": "code",
155 | "execution_count": null,
156 | "metadata": {},
157 | "outputs": [
158 | {
159 | "data": {
160 | "text/plain": [
161 | "['obesity',\n",
162 | " 'overweight',\n",
163 | " 'obese',\n",
164 | " 'physical inactivity',\n",
165 | " 'excess weight',\n",
166 | " 'high bmi',\n",
167 | " 'obese adults',\n",
168 | " 'obese people',\n",
169 | " 'obesity-related outcomes',\n",
170 | " 'obesity among children',\n",
171 | " 'poor sleep quality',\n",
172 | " 'ssbs',\n",
173 | " 'obese populations',\n",
174 | " 'cardiometabolic risk',\n",
175 | " 'abdominal obesity']"
176 | ]
177 | },
178 | "execution_count": null,
179 | "metadata": {},
180 | "output_type": "execute_result"
181 | }
182 | ],
183 | "source": [
184 | "get_similars(tree, labels, \"obesity\")"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": null,
190 | "metadata": {},
191 | "outputs": [
192 | {
193 | "data": {
194 | "text/plain": [
195 | "['heart failure',\n",
196 | " 'hf',\n",
197 | " 'chf',\n",
198 | " 'chronic heart failure',\n",
199 | " 'reduced ejection fraction',\n",
200 | " 'unstable angina',\n",
201 | " 'peripheral vascular disease',\n",
202 | " 'peripheral arterial disease',\n",
203 | " 'angina',\n",
204 | " 'congestive heart failure',\n",
205 | " 'left ventricular systolic dysfunction',\n",
206 | " 'acute coronary syndrome',\n",
207 | " 'heart failure patients',\n",
208 | " 'acute myocardial infarction',\n",
209 | " 'left ventricular dysfunction']"
210 | ]
211 | },
212 | "execution_count": null,
213 | "metadata": {},
214 | "output_type": "execute_result"
215 | }
216 | ],
217 | "source": [
218 | "get_similars(tree, labels, \"heart failure\")"
219 | ]
220 | },
221 | {
222 | "cell_type": "markdown",
223 | "metadata": {},
224 | "source": [
225 | "### Motivation\n",
226 | "\n",
227 | "The idea started in the Epistemonikos database [www.epistemonikos.org](https://www.epistemonikos.org), a database of scientific articles for people making decisions concerning clinical or health-policy questions. In this context the scientific/health language used is complex. You can easily find keywords like:\n",
228 | "\n",
229 | " * asthma\n",
230 | " * heart failure\n",
231 | " * medial compartment knee osteoarthritis\n",
232 | " * preserved left ventricular systolic function\n",
233 | " * non-selective non-steroidal anti-inflammatory drugs\n",
234 | " \n",
235 |     "We tried several approaches to find those keywords, like ngrams, ngrams + tf-idf, and entity recognition, among others. But we didn't get really good results.\n"
236 | ]
237 | },
238 | {
239 | "cell_type": "markdown",
240 | "metadata": {},
241 | "source": [
242 | "### Our approach\n",
243 | "\n",
244 |     "We found that tokenizing using stopwords + non-word characters was really useful for \"finding\" the keywords. An example:\n",
245 | "\n",
246 | "* input: \"Timing of replacement therapy for acute renal failure after cardiac surgery\"\n",
247 | "* output: [\n",
248 | "\t\"timing\",\n",
249 | "\t\"replacement therapy\",\n",
250 | "\t\"acute renal failure\",\n",
251 | "\t\"cardiac surgery\"\n",
252 | "]\n",
253 | "\n",
254 | "So we basically split the text when we find:\n",
255 | " * a stopword\n",
256 |     " * a non-word character (/, !, ?, . etc), except for - and '\n",
257 | "\n",
258 | "That's it.\n",
259 | "\n",
260 |     "But there were some problems with keywords that contain stopwords, like:\n",
261 | " * Vitamin A\n",
262 | " * Hepatitis A\n",
263 | " * Web of Science\n",
264 | "\n",
265 |     "So we decided to add another method (nltk with a grammar definition) to cover most of these cases. To use it, add the parameter `keywords_w_stopwords=True`; note that this method is approx. 20x slower."
266 | ]
267 | },
268 | {
269 | "cell_type": "markdown",
270 | "metadata": {},
271 | "source": [
272 | "### References\n",
273 | "\n",
274 |     "It seems to be an old idea (2004):\n",
275 | "\n",
276 | "*Mihalcea, Rada, and Paul Tarau. \"Textrank: Bringing order into text.\" Proceedings of the 2004 conference on empirical methods in natural language processing. 2004.*\n",
277 | "\n",
278 |     "Reading an implementation of TextRank, I realized they used stopwords to split the text and build the graph. Then I thought of using that as the tokenizer for word2vec.\n",
279 | "\n",
280 |     "As pointed out by @deliprao in this [Twitter thread](https://twitter.com/jeremyphoward/status/1094025901371621376), it's also used by Rake (2010):\n",
281 | "\n",
282 | "*Rose, Stuart & Engel, Dave & Cramer, Nick & Cowley, Wendy. (2010). Automatic Keyword Extraction from Individual Documents. 10.1002/9780470689646.ch1.*\n",
283 | "\n",
284 | "As noted by @astent in the Twitter thread, this concept is called chinking (chunking by exclusion)\n",
285 | "[https://www.nltk.org/book/ch07.html#Chinking](https://www.nltk.org/book/ch07.html#Chinking)\n"
286 | ]
287 | },
288 | {
289 | "cell_type": "markdown",
290 | "metadata": {},
291 | "source": [
292 | "### Multi-lingual\n",
293 |     "We worked on an implementation that can be used in multiple languages. Of course, not all languages are suitable for this approach. We have tried it with good results in English, Spanish and Portuguese.\n"
294 | ]
295 | },
296 | {
297 | "cell_type": "markdown",
298 | "metadata": {},
299 | "source": [
300 | "## Try it online\n",
301 | "\n",
302 |     "You can try it [here](http://54.196.169.11/episte/) (takes time to load, lowercase only, doesn't work on mobile yet). It's an MVP :)\n",
303 | "\n",
304 |     "These embeddings were created using 827,341 titles/abstracts from the @epistemonikos database,\n",
305 |     "with keywords that appear at least 10 times. The total vocab is 349,080 keywords (a really manageable number)."
306 | ]
307 | },
308 | {
309 | "cell_type": "markdown",
310 | "metadata": {},
311 | "source": [
312 | "## Vocab size\n",
313 | "\n",
314 |     "One of the main benefits of this method is the size of the vocabulary. \n",
315 | "For example, using keywords that repeat at least 10 times, for the Epistemonikos dataset (827,341 title/abstract), we got the following vocab size:\n",
316 | "\n",
317 | "| ngrams | keywords | comp |\n",
318 | "|--------------------|-----------|---------|\n",
319 | "| 1 | 127,824 | 36% |\n",
320 | "| 1,2 | 1,360,550 | 388% |\n",
321 | "| 1-3 | 3,204,099 | 914% |\n",
322 | "| 1-4 | 4,461,930 | 1,272% |\n",
323 | "| 1-5 | 5,133,619 | 1,464% |\n",
324 | "| | | |\n",
325 | "| stopword tokenizer | 350,529 | 100% |\n",
326 | "\n",
327 | "More information regarding the comparison, take a look to the folder [analyze](analyze).\n"
328 | ]
329 | },
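{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch of how such a count can be reproduced from the tokenized output (assuming one underscore-joined keyword per whitespace-separated token; `vocab_size` is a hypothetical helper, not part of the library):\n",
"\n",
"```python\n",
"from collections import Counter\n",
"\n",
"def vocab_size(tokenized_path, min_count=10):\n",
"    counts = Counter()\n",
"    with open(tokenized_path) as tokenized:\n",
"        for line in tokenized:\n",
"            counts.update(line.split())\n",
"    # keep only keywords that appear at least `min_count` times\n",
"    return sum(1 for freq in counts.values() if freq >= min_count)\n",
"\n",
"vocab_size(\"tokenized_epistemonikos_data.txt\")\n",
"```"
]
},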
330 | {
331 | "cell_type": "markdown",
332 | "metadata": {},
333 | "source": [
334 | "## Credits"
335 | ]
336 | },
337 | {
338 | "cell_type": "markdown",
339 | "metadata": {},
340 | "source": [
341 | "This project has been created using [nbdev](https://github.com/fastai/nbdev)"
342 | ]
343 | },
344 | {
345 | "cell_type": "code",
346 | "execution_count": null,
347 | "metadata": {},
348 | "outputs": [],
349 | "source": []
350 | }
351 | ],
352 | "metadata": {
353 | "kernelspec": {
354 | "display_name": "Python 3",
355 | "language": "python",
356 | "name": "python3"
357 | }
358 | },
359 | "nbformat": 4,
360 | "nbformat_minor": 2
361 | }
362 |
--------------------------------------------------------------------------------
/30_main.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# default_exp main"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": null,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "from nbdev.showdoc import *"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {},
24 | "source": [
25 | "# Main\n",
26 | "\n",
27 | "> This are the main functions, where we are going to "
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": null,
33 | "metadata": {},
34 | "outputs": [],
35 | "source": [
36 | "# export \n",
37 | "from keywords2vec.imports import *\n",
38 | "\n",
39 | "from glob import glob\n",
40 | "from functools import partial\n",
41 | "\n",
42 | "import fasttext\n",
43 | "\n",
44 | "from keywords2vec.utils import parallel, open_file, chunk_of_text, get_file_chunks\n",
45 | "from keywords2vec.tokenizer import tokenize"
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": null,
51 | "metadata": {},
52 | "outputs": [],
53 | "source": [
54 | "#export\n",
55 | "\n",
56 | "def tokenize_file(\n",
57 | " input_path, output_path=\"tokenized.txt\", lang=\"en\",\n",
58 | " sample_size=-1, lines_chunks=-1, n_cpus=-1, keywords_w_stopwords=False\n",
59 | "):\n",
60 | " tokenize_wrapper = partial(\n",
61 | " tokenize, lang=lang, text_output=True, merge=True, keywords_w_stopwords=keywords_w_stopwords\n",
62 | " )\n",
63 | "\n",
64 | " index = 0\n",
65 | "\n",
66 | " with open(output_path, \"wt\") as _output:\n",
67 | " for file_path in glob(input_path):\n",
68 | " print(\"processing file:\", file_path)\n",
69 | " # We are going to split the text in chunks to show some progress.\n",
70 | " new_index, text_chunks, break_by_sample = get_file_chunks(index, file_path, lines_chunks, sample_size)\n",
71 | " index = new_index\n",
72 | " results = parallel(tokenize_wrapper, text_chunks, n_cpus)\n",
73 | " _output.write(\n",
74 | " (\"\\n\".join(results) + \"\\n\").replace(\" \", \"_\").replace(\"!\", \" \")\n",
75 | " )\n",
76 | " if break_by_sample:\n",
77 | " break\n",
78 | " return output_path\n",
79 | "\n",
80 | "\n",
81 | "def train_model(input_filename):\n",
82 | " model = fasttext.train_unsupervised(input_filename, model='skipgram', maxn=0, dim=100, ws=5)\n",
83 | " return model\n",
84 | "\n",
85 | "def similars_tree_from_model(model, vector_size=100):\n",
86 | " f = 100\n",
87 | " t = AnnoyIndex(f, 'angular') # Length of item vector that will be indexed\n",
88 | " labels = model.labels\n",
89 | " for index, label in enumerate(labels):\n",
90 | " v = model[label]\n",
91 | " t.add_item(index, v)\n",
92 | "\n",
93 | " t.build(10) # 10 trees\n",
94 | " return labels, t\n",
95 | "\n",
96 | "def get_similars(tree, labels, keyword, n_similars=10, show_score=False):\n",
97 | " index = labels.index(keyword.replace(\" \", \"_\"))\n",
98 | " suggestions, scores = tree.get_nns_by_item(index, n=15, include_distances=True)\n",
99 | " suggested_labels = [\n",
100 | " labels[suggestion].replace(\"_\", \" \")\n",
101 | " for suggestion in suggestions\n",
102 | " ]\n",
103 | " return suggested_labels\n",
104 | "\n",
105 | "def similars_tree(\n",
106 | " input_path, temp_tokenized_file=\"tmp_tokenized.txt\", lang=\"en\",\n",
107 | " sample_size=-1, lines_chunks=-1, n_cpus=-1, keywords_w_stopwords=False\n",
108 | "):\n",
109 | " tokenize_file(\n",
110 | " input_path=input_path, output_path=temp_tokenized_file, lang=lang,\n",
111 | " sample_size=sample_size, lines_chunks=lines_chunks, n_cpus=n_cpus,\n",
112 | " keywords_w_stopwords=keywords_w_stopwords\n",
113 | " )\n",
114 | " model = train_model(temp_tokenized_file)\n",
115 | " labels, tree = similars_tree_from_model(model)\n",
116 | " return labels, tree\n",
117 | " "
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": null,
123 | "metadata": {},
124 | "outputs": [
125 | {
126 | "name": "stdout",
127 | "output_type": "stream",
128 | "text": [
129 | "--2020-02-25 11:52:04-- https://s3.amazonaws.com/episte-labs/epistemonikos_data_sample.tsv.gz\n",
130 | "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.240.38\n",
131 | "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.240.38|:443... connected.\n",
132 | "HTTP request sent, awaiting response... 200 OK\n",
133 | "Length: 21510551 (21M) [application/gzip]\n",
134 | "Saving to: ‘epistemonikos_data_sample.tsv.gz’\n",
135 | "\n",
136 | "epistemonikos_data_ 100%[===================>] 20.51M 1.76MB/s in 12s \n",
137 | "\n",
138 | "2020-02-25 11:52:17 (1.70 MB/s) - ‘epistemonikos_data_sample.tsv.gz’ saved [21510551/21510551]\n",
139 | "\n"
140 | ]
141 | }
142 | ],
143 | "source": [
144 | "data_url = \"https://s3.amazonaws.com/episte-labs/epistemonikos_data_sample.tsv.gz\"\n",
145 | "data_filepath = \"epistemonikos_data_sample.tsv.gz\"\n",
146 | "tokenized_filepath = \"tokenized_epistemonikos_data.txt\"\n",
147 | "!wget \"{data_url}\" -O \"{data_filepath}\""
148 | ]
149 | },
150 | {
151 | "cell_type": "code",
152 | "execution_count": null,
153 | "metadata": {},
154 | "outputs": [
155 | {
156 | "data": {
157 | "text/markdown": [
158 | "\n",
159 | "\n",
160 | "> tokenize_file(**`input_path`**, **`output_path`**=*`'tokenized.txt'`*, **`lang`**=*`'en'`*, **`sample_size`**=*`-1`*, **`lines_chunks`**=*`-1`*, **`n_cpus`**=*`-1`*, **`keywords_w_stopwords`**=*`False`*)\n",
161 | "\n"
162 | ],
163 | "text/plain": [
164 | ""
165 | ]
166 | },
167 | "metadata": {},
168 | "output_type": "display_data"
169 | }
170 | ],
171 | "source": [
172 | "show_doc(tokenize_file)"
173 | ]
174 | },
175 | {
176 | "cell_type": "code",
177 | "execution_count": null,
178 | "metadata": {},
179 | "outputs": [
180 | {
181 | "name": "stdout",
182 | "output_type": "stream",
183 | "text": [
184 | "processing file: epistemonikos_data_sample.tsv.gz\n"
185 | ]
186 | },
187 | {
188 | "data": {
189 | "text/html": [
190 | "\n",
191 | " \n",
192 | " \n",
204 | "
\n",
205 | " 100.00% [201/201 00:16<00:00]\n",
206 | "
\n",
207 | " "
208 | ],
209 | "text/plain": [
210 | ""
211 | ]
212 | },
213 | "metadata": {},
214 | "output_type": "display_data"
215 | },
216 | {
217 | "data": {
218 | "text/plain": [
219 | "'tokenized_epistemonikos_data.txt'"
220 | ]
221 | },
222 | "execution_count": null,
223 | "metadata": {},
224 | "output_type": "execute_result"
225 | }
226 | ],
227 | "source": [
228 | "tokenize_file(data_filepath, tokenized_filepath)"
229 | ]
230 | },
231 | {
232 | "cell_type": "code",
233 | "execution_count": null,
234 | "metadata": {},
235 | "outputs": [
236 | {
237 | "data": {
238 | "text/markdown": [
239 | "\n",
240 | "\n",
241 | "> train_model(**`input_filename`**)\n",
242 | "\n"
243 | ],
244 | "text/plain": [
245 | ""
246 | ]
247 | },
248 | "metadata": {},
249 | "output_type": "display_data"
250 | }
251 | ],
252 | "source": [
253 | "show_doc(train_model)"
254 | ]
255 | },
256 | {
257 | "cell_type": "code",
258 | "execution_count": null,
259 | "metadata": {},
260 | "outputs": [],
261 | "source": [
262 | "model = train_model(tokenized_filepath)"
263 | ]
264 | },
265 | {
266 | "cell_type": "code",
267 | "execution_count": null,
268 | "metadata": {},
269 | "outputs": [
270 | {
271 | "data": {
272 | "text/markdown": [
273 | "\n",
274 | "\n",
275 | "> similars_tree_from_model(**`model`**, **`vector_size`**=*`100`*)\n",
276 | "\n"
277 | ],
278 | "text/plain": [
279 | ""
280 | ]
281 | },
282 | "metadata": {},
283 | "output_type": "display_data"
284 | }
285 | ],
286 | "source": [
287 | "show_doc(similars_tree_from_model)"
288 | ]
289 | },
290 | {
291 | "cell_type": "code",
292 | "execution_count": null,
293 | "metadata": {},
294 | "outputs": [],
295 | "source": [
296 | "labels, tree = similars_tree_from_model(model)"
297 | ]
298 | },
299 | {
300 | "cell_type": "code",
301 | "execution_count": null,
302 | "metadata": {},
303 | "outputs": [
304 | {
305 | "data": {
306 | "text/markdown": [
307 | "\n",
308 | "\n",
309 | "> get_similars(**`tree`**, **`labels`**, **`keyword`**, **`n_similars`**=*`10`*, **`show_score`**=*`False`*)\n",
310 | "\n"
311 | ],
312 | "text/plain": [
313 | ""
314 | ]
315 | },
316 | "metadata": {},
317 | "output_type": "display_data"
318 | }
319 | ],
320 | "source": [
321 | "show_doc(get_similars)"
322 | ]
323 | },
324 | {
325 | "cell_type": "code",
326 | "execution_count": null,
327 | "metadata": {},
328 | "outputs": [
329 | {
330 | "data": {
331 | "text/plain": [
332 | "['obesity',\n",
333 | " 'overweight',\n",
334 | " 'obese children',\n",
335 | " 'ssbs',\n",
336 | " 'poor sleep quality',\n",
337 | " 'metabolic syndrome',\n",
338 | " 'obesity among children',\n",
339 | " 'dental caries',\n",
340 | " 'physical inactivity',\n",
341 | " 'obesity may',\n",
342 | " 'sedentary behaviour',\n",
343 | " 'food allergy',\n",
344 | " 'sugar-sweetened beverages',\n",
345 | " 'worldwide prevalence',\n",
346 | " 'known risk factor']"
347 | ]
348 | },
349 | "execution_count": null,
350 | "metadata": {},
351 | "output_type": "execute_result"
352 | }
353 | ],
354 | "source": [
355 | "get_similars(tree, labels, \"obesity\")"
356 | ]
357 | },
358 | {
359 | "cell_type": "code",
360 | "execution_count": null,
361 | "metadata": {},
362 | "outputs": [],
363 | "source": []
364 | }
365 | ],
366 | "metadata": {
367 | "kernelspec": {
368 | "display_name": "Python 3",
369 | "language": "python",
370 | "name": "python3"
371 | }
372 | },
373 | "nbformat": 4,
374 | "nbformat_minor": 4
375 | }
376 |
--------------------------------------------------------------------------------
/docs/index.html:
--------------------------------------------------------------------------------
1 | ---
2 |
3 | title: keywords2vec
4 |
5 | keywords: fastai
6 | sidebar: home_sidebar
7 |
8 | summary: "A simple and fast way to generate a word2vec model, with multi-word keywords instead of single words."
9 | ---
10 |
19 |
20 |
21 | {% raw %}
22 |
23 |
24 |
25 |
26 |
27 |
28 |
Example result¶
29 |
30 |
31 |
32 |
33 |
34 |
Finding similar keywords for "obesity "
35 |
36 |
37 | index
38 | term
39 |
40 |
41 |
42 |
43 | 0
44 | overweight
45 |
46 |
47 | 1
48 | obese
49 |
50 |
51 | 2
52 | physical inactivity
53 |
54 |
55 | 3
56 | excess weight
57 |
58 |
59 | 4
60 | obese adults
61 |
62 |
63 | 5
64 | high bmi
65 |
66 |
67 | 6
68 | obese adults
69 |
70 |
71 | 7
72 | obese people
73 |
74 |
75 | 8
76 | obesity-related outcomes
77 |
78 |
79 | 9
80 | obesity among children
81 |
82 |
83 | 10
84 | poor sleep quality
85 |
86 |
87 | 11
88 | ssbs
89 |
90 |
91 | 12
92 | obese populations
93 |
94 |
95 | 13
96 | cardiometabolic risk
97 |
98 |
99 | 14
100 | abdominal obesity
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
114 |
115 |
116 |
pip install keywords2vec
117 |
118 |
119 |
120 |
121 |
127 |
128 |
129 |
Let's download some example data
130 |
131 |
132 |
133 |
134 |
149 |
150 |
151 |
We create the model. If you need the vectors, take a look here
152 |
153 |
154 |
155 |
156 |
157 |
167 |
168 |
169 |
170 |
171 |
172 |
173 |
174 |
processing file: epistemonikos_data_sample.tsv.gz
175 |
176 |
177 |
178 |
179 |
180 |
181 |
182 |
183 |
184 |
185 |
197 |
198 | 100.00% [201/201 00:19<00:00]
199 |
200 |
201 |
202 |
203 |
204 |
205 |
206 |
207 |
208 |
209 |
210 |
211 |
Then we can get the most similar keywords
212 |
213 |
214 |
215 |
216 |
217 |
227 |
228 |
229 |
230 |
231 |
232 |
233 |
234 |
235 |
236 |
['obesity',
237 | 'overweight',
238 | 'obese',
239 | 'physical inactivity',
240 | 'excess weight',
241 | 'high bmi',
242 | 'obese adults',
243 | 'obese people',
244 | 'obesity-related outcomes',
245 | 'obesity among children',
246 | 'poor sleep quality',
247 | 'ssbs',
248 | 'obese populations',
249 | 'cardiometabolic risk',
250 | 'abdominal obesity']
251 |
252 |
253 |
254 |
255 |
256 |
257 |
258 |
259 |
260 |
270 |
271 |
272 |
273 |
274 |
275 |
276 |
277 |
278 |
279 |
['heart failure',
280 | 'hf',
281 | 'chf',
282 | 'chronic heart failure',
283 | 'reduced ejection fraction',
284 | 'unstable angina',
285 | 'peripheral vascular disease',
286 | 'peripheral arterial disease',
287 | 'angina',
288 | 'congestive heart failure',
289 | 'left ventricular systolic dysfunction',
290 | 'acute coronary syndrome',
291 | 'heart failure patients',
292 | 'acute myocardial infarction',
293 | 'left ventricular dysfunction']
294 |
295 |
296 |
297 |
298 |
299 |
300 |
301 |
302 |
303 |
304 |
Motivation¶ The idea started in the Epistemonikos database www.epistemonikos.org , a database of scientific articles for people making decisions concerning clinical or health-policy questions. In this context the scientific/health language used is complex. You can easily find keywords like:
305 |
306 | asthma
307 | heart failure
308 | medial compartment knee osteoarthritis
309 | preserved left ventricular systolic function
310 | non-selective non-steroidal anti-inflammatory drugs
311 |
312 |
We tried some approaches to find those keywords, like ngrams, ngrams + tf-idf, identifying entities, among others. But we didn't get really good results.
313 |
314 |
315 |
316 |
317 |
318 |
319 |
Our approach¶ We found that tokenizing using stopwords + non word characters was really useful for "finding" the keywords. An example:
320 |
321 | input: "Timing of replacement therapy for acute renal failure after cardiac surgery"
322 | output: [
323 | "timing",
324 | "replacement therapy",
325 | "acute renal failure",
326 | "cardiac surgery"
327 | ]
328 |
329 |
So we basically split the text when we find:
330 |
331 | a stopword
332 | a non-word character (/, !, ?, . etc.), except for - and '
333 |
334 |
That's it.
335 |
But there were some problems with keywords that contain stopwords, like:
336 |
337 | Vitamin A
338 | Hepatitis A
339 | Web of Science
340 |
341 |
So we decided to add another method (nltk with a grammar definition) to cover most of those cases. To use it, add the parameter keywords_w_stopwords=True; note that this method is approx. 20x slower.
342 |
343 |
344 |
345 |
346 |
347 |
348 |
References¶ It seems to be an old idea (2004):
349 |
Mihalcea, Rada, and Paul Tarau. "Textrank: Bringing order into text." Proceedings of the 2004 conference on empirical methods in natural language processing. 2004.
350 |
Reading an implementation of TextRank, I realized they used stopwords to split the text and create the graph. Then I thought of using the same idea as a tokenizer for word2vec.
351 |
As pointed out by @deliprao in this Twitter thread, it's also used by RAKE (2010):
352 |
Rose, Stuart & Engel, Dave & Cramer, Nick & Cowley, Wendy. (2010). Automatic Keyword Extraction from Individual Documents. 10.1002/9780470689646.ch1.
353 |
As noted by @astent in the Twitter thread, this concept is called chinking (chunking by exclusion)
354 | https://www.nltk.org/book/ch07.html#Chinking
355 |
356 |
357 |
358 |
359 |
360 |
361 |
Multi-lingual¶ We worked on an implementation that can be used in multiple languages. Of course, not all languages are suitable for this approach. We have tried it with good results in English, Spanish and Portuguese.
362 |
363 |
364 |
365 |
366 |
367 |
368 |
Try it online¶ You can try it here (takes time to load, lowercase only, doesn't work on mobile yet). It's an MVP :)
369 |
These embeddings were created using 827,341 titles/abstracts from the @epistemonikos database,
370 | with keywords that repeat at least 10 times. The total vocab is 349,080 keywords (a really manageable number).
371 |
372 |
373 |
374 |
375 |
376 |
377 |
Vocab size¶ One of the main benefits of this method is the size of the vocabulary.
378 | For example, using keywords that repeat at least 10 times, for the Epistemonikos dataset (827,341 titles/abstracts), we got the following vocab sizes:
379 |
380 |
381 | ngrams
382 | keywords
383 | relative size
384 |
385 |
386 |
387 |
388 | 1
389 | 127,824
390 | 36%
391 |
392 |
393 | 1,2
394 | 1,360,550
395 | 388%
396 |
397 |
398 | 1-3
399 | 3,204,099
400 | 914%
401 |
402 |
403 | 1-4
404 | 4,461,930
405 | 1,272%
406 |
407 |
408 | 1-5
409 | 5,133,619
410 | 1,464%
411 |
412 |
413 |
414 |
415 |
416 |
417 |
418 | stopword tokenizer
419 | 350,529
420 | 100%
421 |
422 |
423 |
424 |
For more information regarding the comparison, take a look at the analyze folder.
425 |
426 |
427 |
428 |
429 |
435 |
436 |
437 |
This project has been created using nbdev
438 |
439 |
440 |
441 |
442 | {% endraw %}
443 |
444 |
445 |
446 |
--------------------------------------------------------------------------------
/docs/tokenizer.html:
--------------------------------------------------------------------------------
1 | ---
2 |
3 | title: Tokenizer
4 |
5 | keywords: fastai
6 | sidebar: home_sidebar
7 |
8 | summary: "We are going to tokenize using 2 different strategies. The first one uses stopwords (read the main README for more information). The second one is an nltk grammar regexp parser."
9 | ---
10 |
19 |
20 |
21 | {% raw %}
22 |
23 |
24 |
25 |
26 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
prepare_stopwords(stopwords =None , additional_stopwords =None , lang ='en' )
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
tokenize_one(text , stopwords =None , additional_stopwords =None , lang ='en' , split_by_stopwords =True )
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
get_nodes_for_ntlk(parent , stopwords , valid_labels )
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 |
tokenize_by_nltk(text , stopwords =None , additional_stopwords =None , lang ='en' )
118 |
119 |
120 |
121 |
122 |
123 |
124 |
125 |
126 |
127 |
128 |
129 |
130 |
131 |
132 |
133 |
134 |
135 |
136 |
137 |
tokenize(text , text_output =False , lang ='en' , keywords_w_stopwords =False , merge =True )
138 |
139 |
140 |
141 |
142 |
143 |
144 |
145 |
146 |
147 |
148 |
163 |
164 |
165 |
Tokenizing the text: by default it uses two methods and merges the results, so you might see duplicated keywords.
166 |
167 |
168 |
169 |
170 |
171 |
181 |
182 |
183 |
184 |
185 |
186 |
187 |
188 |
189 |
190 |
['modern sovereign state',
191 | 'chile',
192 | "among south america's",
193 | 'economically',
194 | 'socially stable',
195 | 'prosperous nations',
196 | 'high-income economy',
197 | 'high living standards',
198 | 'leads latin american nations',
199 | 'rankings',
200 | 'human development',
201 | 'competitiveness',
202 | 'income per capita',
203 | 'globalization',
204 | 'state']
205 |
206 |
207 |
208 |
209 |
210 |
211 |
212 |
213 |
214 |
215 |
If you want to see the result from each method:
216 |
217 |
218 |
219 |
220 |
233 |
234 |
235 |
Tokenized using only stopwords
236 |
237 |
238 |
239 |
240 |
241 |
251 |
252 |
253 |
254 |
255 |
256 |
257 |
258 |
259 |
260 |
['modern sovereign state',
261 | 'chile',
262 | "among south america's",
263 | 'economically',
264 | 'socially stable',
265 | 'prosperous nations',
266 | 'high-income economy',
267 | 'high living standards',
268 | 'leads latin american nations',
269 | 'rankings',
270 | 'human development',
271 | 'competitiveness',
272 | 'income per capita',
273 | 'globalization',
274 | 'state']
275 |
276 |
277 |
278 |
279 |
280 |
281 |
282 |
283 |
284 |
285 |
Tokenized using only nltk; this method complements the first one by finding keywords that contain stopwords
286 |
287 |
288 |
289 |
290 |
291 |
301 |
302 |
303 |
304 |
305 |
306 |
307 |
308 |
309 |
310 |
['sovereign state',
311 | 'modern sovereign state',
312 | 'chile',
313 | 'south america',
314 | 'nations',
315 | 'prosperous nations',
316 | 'economy',
317 | 'high-income economy',
318 | 'living standards',
319 | 'high living standards',
320 | 'modern sovereign state of chile',
321 | 'south america',
322 | 'prosperous nations',
323 | 'high-income economy',
324 | 'high living standards']
325 |
326 |
327 |
328 |
329 |
330 |
331 |
332 |
333 |
334 |
335 |
We recommend using them with the default options.
336 |
337 |
338 |
339 |
340 |
341 |
342 |
Tokenize and return plain text
343 |
344 |
345 |
346 |
347 |
348 |
359 |
360 |
361 |
362 |
363 |
364 |
365 |
366 |
367 |
368 |
"modern sovereign state!chile!among south america's!economically!socially stable!prosperous nations!high-income economy!high living standards!leads latin american nations!rankings!human development!competitiveness!income per capita!globalization!state!peace!economic freedom!low perception!corruption!also ranks high regionally!sustainability!state!democratic development!currently!also!lowest homicide rate!americas!canada!chile!founding member!united nations!union!south american nations!unasur!community!latin american!caribbean states!celac!pacific alliance!joined!organisation!economic co-operation!development!oecd!sovereign state!modern sovereign state!chile!south america!nations!prosperous nations!economy!high-income economy!living standards!high living standards!modern sovereign state of chile!south america!prosperous nations!high-income economy!high living standards!nations!american nations!nations!latin american nations!rankings!development!human development!competitiveness!income!capita!globalization!state!peace!freedom!economic freedom!perception!low perception!corruption!latin american nations in rankings!human development!competitiveness!income per capita!globalization!state of peace!economic freedom!low perception of corruption!sustainability!state!development!democratic development!sustainability!state!democratic development!currently!homicide rate!americas!canada!currently!homicide rate!americas after canada!chile!member!founding member!united nations!union!south!nations!american nations!unasur!community!states!caribbean states!celac!pacific alliance!organisation!economic co-operation!development!oecd!chile!founding member!united nations!union of south!american nations!unasur!community!caribbean states!celac!pacific alliance!organisation for economic co-operation!development!oecd"
369 |
370 |
371 |
372 |
373 |
374 |
375 |
376 |
377 | {% endraw %}
378 |
379 |
380 |
381 |
--------------------------------------------------------------------------------