├── .dockerignore ├── .gitattributes ├── .github └── workflows │ └── test.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CHANGELOG.md ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE ├── MANIFEST.in ├── PURPOSE.md ├── README.md ├── STYLE.md ├── dataset ├── Superheroes NLP Dataset │ ├── README.md │ ├── create_dataset.py │ ├── create_download_file.py │ └── helper.py ├── bbcsport.csv ├── countries.geojson └── superheroes_nlp_dataset.csv ├── docker-compose.yml ├── docs ├── Makefile ├── source │ ├── conf.py │ ├── index.rst │ ├── nlp.rst │ ├── preprocessing.rst │ ├── representation.rst │ └── visualization.rst └── to_docusaurus.py ├── examples ├── README.md ├── README.md.ipynb └── getting-started.ipynb ├── github ├── demo.gif ├── logo.png ├── scatterplot_bbcsport.svg ├── scatterplot_bbcsport_kmeans.svg └── screencast.gif ├── requirements.txt ├── scripts ├── check.sh ├── format.sh ├── install-hooks.sh ├── install.sh ├── pre-commit.sh ├── push_pip.sh ├── test_coverage.sh ├── tests.sh └── update_documentation.sh ├── setup.cfg ├── setup.py ├── tests ├── README.md ├── __init__.py ├── conftest.py ├── test_helpers.py ├── test_indexes.py ├── test_nlp.py ├── test_preprocessing.py ├── test_representation.py ├── test_types.py └── test_visualization.py ├── texthero ├── __init__.py ├── _helper.py ├── _types.py ├── nlp.py ├── preprocessing.py ├── representation.py ├── stopwords.py └── visualization.py ├── vercel.json └── website ├── blog ├── 2017-10-24-texthero-welcome.md ├── 2020-04-27-rename-columns-pandas.md ├── 2020-04-27-text-preprocessing-with-pandas.md ├── 2020-05-03-text-mining-with-python.md ├── 2020-05-03-text-unsupervised-learning.md └── 2020-05-08-bar-run-chart-python.md ├── build.sh ├── core ├── AnnouncementBar.js ├── Footer.js ├── Showcase.js └── annonucement-bar.css ├── docs ├── api-nlp.md ├── api-preprocessing.md ├── api-representation.md ├── api-visualization.md ├── api │ ├── texthero.nlp.dependency_parse.md │ ├── texthero.nlp.named_entities.md │ ├── texthero.nlp.noun_chunks.md │ ├── texthero.preprocessing.clean.md │ ├── texthero.preprocessing.drop_no_content.md │ ├── texthero.preprocessing.get_default_pipeline.md │ ├── texthero.preprocessing.has_content.md │ ├── texthero.preprocessing.remove_angle_brackets.md │ ├── texthero.preprocessing.remove_brackets.md │ ├── texthero.preprocessing.remove_curly_brackets.md │ ├── texthero.preprocessing.remove_diacritics.md │ ├── texthero.preprocessing.remove_digits.md │ ├── texthero.preprocessing.remove_html_tags.md │ ├── texthero.preprocessing.remove_punctuation.md │ ├── texthero.preprocessing.remove_round_brackets.md │ ├── texthero.preprocessing.remove_square_brackets.md │ ├── texthero.preprocessing.remove_stopwords.md │ ├── texthero.preprocessing.remove_urls.md │ ├── texthero.preprocessing.remove_whitespace.md │ ├── texthero.preprocessing.replace_punctuation.md │ ├── texthero.preprocessing.replace_stopwords.md │ ├── texthero.preprocessing.replace_urls.md │ ├── texthero.preprocessing.stem.md │ ├── texthero.preprocessing.tokenize.md │ ├── texthero.representation.dbscan.md │ ├── texthero.representation.kmeans.md │ ├── texthero.representation.meanshift.md │ ├── texthero.representation.nmf.md │ ├── texthero.representation.pca.md │ ├── texthero.representation.term_frequency.md │ ├── texthero.representation.tfidf.md │ ├── texthero.representation.tsne.md │ ├── texthero.visualization.scatterplot.md │ ├── texthero.visualization.top_words.md │ └── texthero.visualization.wordcloud.md ├── assets │ └── texthero.png ├── getting-started-installation.md ├── 
getting-started-preprocessing.md ├── getting-started.md └── tutorial-tfidf.md ├── package.json ├── pages └── en │ ├── help.js │ ├── index.js │ ├── index_original.js │ └── users.js ├── sidebars.json ├── siteConfig.js ├── static ├── css │ ├── announcement-bar.css │ ├── code-block-buttons.css │ ├── custom.css │ ├── pygments.css │ └── sphinx_basic.css ├── figure │ └── scatterplot_bccsport_kmeans.svg ├── img │ ├── T.png │ ├── favicon.png │ ├── logo_v2.png │ ├── logo_v2_transparent.png │ ├── oss_logo.png │ ├── scatterplot_bccsport.svg │ ├── undraw_code_review.svg │ ├── undraw_monitor.svg │ ├── undraw_note_list.svg │ ├── undraw_online.svg │ ├── undraw_open_source.svg │ ├── undraw_operating_system.svg │ ├── undraw_react.svg │ ├── undraw_tweetstorm.svg │ └── undraw_youtube_tutorial.svg └── js │ ├── analytics.js │ ├── code-block-buttons.js │ └── start_highlight.js └── vercel.json /.dockerignore: -------------------------------------------------------------------------------- 1 | */node_modules 2 | *.log 3 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | website/* linguist-documentation 2 | Dockerfile -linguist-vendored 3 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: [master] 6 | pull_request: 7 | 8 | concurrency: 9 | group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.event.pull_request.number || github.sha }} 10 | cancel-in-progress: true 11 | 12 | jobs: 13 | test: 14 | runs-on: ubuntu-latest 15 | strategy: 16 | matrix: 17 | python-version: ["3.8", "3.9", "3.10", "3.11"] 18 | steps: 19 | - name: Checkout project 20 | uses: actions/checkout@v3 21 | 22 | - name: Set up Python ${{ matrix.python-version }} 23 | uses: actions/setup-python@v3 24 | with: 25 | python-version: ${{ matrix.python-version }} 26 | 27 | - name: Set up venv 28 | shell: bash 29 | run: | 30 | python3 -m pip install --upgrade pip setuptools 31 | python3 -m venv .venv 32 | 33 | - name: Install project 34 | shell: bash 35 | run: | 36 | source .venv/bin/activate 37 | python3 -m pip install ".[dev]" 38 | 39 | - name: Test 40 | run: .venv/bin/python3 -m pytest --cov=texthero --cov-report=term-missing --cov-report xml --cov-branch 41 | 42 | - name: Upload coverage reports to Codecov 43 | uses: codecov/codecov-action@v3 44 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ############################# 2 | # Texthero 3 | ############################# 4 | 5 | build 6 | dist 7 | __pycache__ 8 | .DS_Store 9 | .cache 10 | 11 | .ipynb_checkpoints/ 12 | 13 | *.log 14 | 15 | .idea 16 | 17 | alpha 18 | 19 | texthero.egg-info 20 | 21 | ############################# 22 | # Dataset 23 | ############################# 24 | 25 | raw 26 | download.sh 27 | 28 | 29 | ############################# 30 | # Docusaurus and website 31 | ############################# 32 | 33 | website/translated_docs 34 | website/build/ 35 | website/yarn.lock 36 | website/node_modules 37 | website/i18n/* 38 | 39 | node_modules 40 | 41 | lib/core/metadata.js 42 | lib/core/MetadataBlog.js 43 | 44 | ############################# 45 | # GITHUB gitignore 46 | ############################# 47 | 48 | # Byte-compiled / 
optimized / DLL files 49 | __pycache__/ 50 | *.py[cod] 51 | *$py.class 52 | 53 | # C extensions 54 | *.so 55 | 56 | # Distribution / packaging 57 | .Python 58 | build/ 59 | develop-eggs/ 60 | dist/ 61 | downloads/ 62 | eggs/ 63 | .eggs/ 64 | lib/ 65 | lib64/ 66 | parts/ 67 | sdist/ 68 | var/ 69 | wheels/ 70 | share/python-wheels/ 71 | *.egg-info/ 72 | .installed.cfg 73 | *.egg 74 | MANIFEST 75 | 76 | # PyInstaller 77 | # Usually these files are written by a python script from a template 78 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 79 | *.manifest 80 | *.spec 81 | 82 | # Installer logs 83 | pip-log.txt 84 | pip-delete-this-directory.txt 85 | 86 | # Unit test / coverage reports 87 | htmlcov/ 88 | .tox/ 89 | .nox/ 90 | .coverage 91 | .coverage.* 92 | .cache 93 | nosetests.xml 94 | coverage.xml 95 | *.cover 96 | *.py,cover 97 | .hypothesis/ 98 | .pytest_cache/ 99 | cover/ 100 | 101 | # Translations 102 | *.mo 103 | *.pot 104 | 105 | # Django stuff: 106 | *.log 107 | local_settings.py 108 | db.sqlite3 109 | db.sqlite3-journal 110 | 111 | # Flask stuff: 112 | instance/ 113 | .webassets-cache 114 | 115 | # Scrapy stuff: 116 | .scrapy 117 | 118 | # Sphinx documentation 119 | docs/_build/ 120 | 121 | # PyBuilder 122 | .pybuilder/ 123 | target/ 124 | 125 | # Jupyter Notebook 126 | .ipynb_checkpoints 127 | 128 | # IPython 129 | profile_default/ 130 | ipython_config.py 131 | 132 | # pyenv 133 | # For a library or package, you might want to ignore these files since the code is 134 | # intended to run in multiple environments; otherwise, check them in: 135 | # .python-version 136 | 137 | # pipenv 138 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 139 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 140 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 141 | # install all needed dependencies. 142 | #Pipfile.lock 143 | 144 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 145 | __pypackages__/ 146 | 147 | # Celery stuff 148 | celerybeat-schedule 149 | celerybeat.pid 150 | 151 | # SageMath parsed files 152 | *.sage.py 153 | 154 | # Environments 155 | .env 156 | .venv 157 | env/ 158 | venv/ 159 | ENV/ 160 | env.bak/ 161 | venv.bak/ 162 | 163 | # Spyder project settings 164 | .spyderproject 165 | .spyproject 166 | 167 | # Rope project settings 168 | .ropeproject 169 | 170 | # mkdocs documentation 171 | /site 172 | 173 | # mypy 174 | .mypy_cache/ 175 | .dmypy.json 176 | dmypy.json 177 | 178 | # Pyre type checker 179 | .pyre/ 180 | 181 | # pytype static type analyzer 182 | .pytype/ 183 | 184 | # Cython debug symbols 185 | cython_debug/ 186 | docs/source/api 187 | 188 | 189 | # Hide vs code hidden files 190 | .vs_code 191 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/ambv/black 3 | rev: stable 4 | hooks: 5 | - id: black 6 | language_version: python3 -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # 2020-04-20 2 | 3 | * Version 1.0 4 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM node:lts 2 | 3 | WORKDIR /app/website 4 | 5 | EXPOSE 3000 35729 6 | COPY ./docs /app/docs 7 | COPY ./website /app/website 8 | RUN yarn install 9 | 10 | CMD ["yarn", "start"] 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2020 Texthero 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | -------------------------------------------------------------------------------- /PURPOSE.md: -------------------------------------------------------------------------------- 1 | # PURPOSE 2 | 3 | This document attempts at defining the purpose of Texthero and it's future enhancements. 
4 | 5 | ### Motivation 6 | 7 | We believe the text mining and text analytics community is missing both a space for learning how to deal with all the different NLP/text mining/text analytics tools and a simple python package based on Pandas to work with text data effortlessly. 8 | 9 | What is missing is a clear "universal text mining and data analysis documentation", and providing it is the main purpose of **texthero**. 10 | 11 | ### Objective 12 | 13 | The objective of Texthero can be decomposed into two parts: 14 | 15 | 1. **Offer an efficient tool to deal with text-based datasets** (the texthero python package). Texthero is mainly a teaching tool and therefore easy to use and understand, but at the same time it is quite efficient and should be able to handle large quantities of data. 16 | 17 | 2. **Provide support to newcomers in the NLP world** so that they can efficiently learn all the main core topics (tf-idf, text cleaning, regular expressions, etc.). As many other tutorials already exist, the main approach is to redirect users to valuable resources and explain any missing points in more depth. This part is done mainly through the *tutorials* on texthero.org. 18 | 19 | 20 | ### Channels 21 | 22 | 1. **Github repository**: development of the texthero python package. The README should mainly discuss the PyPI package and not the extra tutorials. 23 | 24 | 2. **Texthero.org** 25 | The website acts both as the official documentation for the python package and as a source of information on how to deal with textual data. 26 | 27 | - **Getting Started**: a 4/5-page document that explains how to use the Texthero tool. The tutorials assume a very basic understanding of the main topics (representation, tf-idf, word2vec, etc.) but at the same time provide links to internal (tutorials) and external resources. 28 | 29 | - **Tutorials**: a blog-like collection of articles related to NLP and text mining. This includes tutorials on how to use certain texthero tools and on how parts of the Texthero code have been developed, as well as extra articles related to other areas of text analytics. Tutorials should focus on how to analyze large quantities of text. 30 | 31 | - **?**: open to any request. For ideas, open a new issue and/or contact jonathan.besomi__AT__gmail.com 32 | 33 | 34 | ### Python package 35 | 36 | For future development, it is important to have a clear idea of the purpose of Texthero as a python package. 37 | 38 | 39 | **Package core purpose** 40 | 41 | The goal is to extract insights from the whole corpus, i.e. the collection of documents, and not from a single element. 42 | 43 | Generally, a corpus is composed of a __long__ collection of documents, and therefore the techniques used need to be efficient enough to deal with a large amount of text. 44 | 45 | **Neural network** 46 | 47 | Texthero functions (as of now) do not make use of neural network solutions. The main reason is that there is no need for them, as mature libraries already exist (PyTorch and TensorFlow, to name a few). 48 | 49 | What Texthero offers is a tool to be used alongside any other machine learning library. Ideally, texthero should be used before applying any "sophisticated" approach to a dataset, in order to first better understand the underlying data before applying any complex model. 50 | 51 | 52 | Note: a text corpus, or collection of documents, always needs to be in the form of a Pandas Series; "do that on a text corpus" and "do that on a Pandas Series" refer to the same act.
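To make this Series-in/Series-out convention concrete, here is a minimal sketch of the intended workflow. It is illustrative only: the column names and example sentences are made up, and it assumes `clean` and `tfidf` behave as in the current getting-started guide (accepting a text Pandas Series and returning a Pandas Series).

```python
import pandas as pd
import texthero as hero

# A text corpus is always handled as a Pandas Series: one document per row.
df = pd.DataFrame(
    {"text": ["Texthero works on Pandas Series.", "Series in, Series out!"]}
)

# Each function takes a Series and returns a Series, so calls can be chained
# with .pipe and the result can always be appended as a new DataFrame column.
df["clean_text"] = df["text"].pipe(hero.clean)
df["tfidf"] = df["clean_text"].pipe(hero.tfidf)
```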
53 | 54 | **Common usage**: 55 | - Clean a text Pandas Series 56 | - Tokenize a text Pandas Series 57 | - Represent a text Pandas Series 58 | - Benchmark on very simple models (Bayes ?) if changes improved the models 59 | - Understand a text without the need for using complex models such as Transformers. 60 | - Extract the main facts from a Pandas Series 61 | 62 | 63 | **Naive Pandas Support** 64 | 65 | Most of texthero python functions should accept as an argument a Pandas Series and return a Pandas Series. This permits to chain the different functions and also always append the Series to a Pandas Column. 66 | 67 | Few exceptions: 68 | - When representing the data, the results might be very sparse, in this case, the returned value is a _Sparse_ Pandas Series. It's important to underline the difference in the documentation. 69 | 70 | - The "visualization" module might return visualization such as the count of top words. An alternative would be to add a custom `hero` accessor to access this kind of features. 71 | 72 | -------------------------------------------------------------------------------- /STYLE.md: -------------------------------------------------------------------------------- 1 | # Texthero Style 2 | 3 | Color palette: 4 | 5 | (Mango Tango, orange): ff8c42 6 | (Corn, yellow): fff275 7 | (Green blue): 3f88c5 8 | (Crimson, red): d7263d 9 | (Oxford blue): 02182b 10 | 11 | 12 | Orange stronger for menubar: ff7b26 13 | -------------------------------------------------------------------------------- /dataset/Superheroes NLP Dataset/README.md: -------------------------------------------------------------------------------- 1 | # Superheroes NLP Dataset 2 | 3 | A playground dataset to learn and practice NLP, text mining and data analysis while having fun. 4 | 5 | The same dataset can be found on Kaggle: [Superheroes NLP Dataset](https://www.kaggle.com/jonathanbesomi/superheroes-nlp-dataset). 6 | 7 | All data have been scraped with python from [Superhero Database](https://www.superherodb.com/), credits belongs to them. 8 | 9 | ## Dataset summary 10 | 11 | Size: 8 MB. 12 | 13 | Num. columns: 81. 14 | 15 | Num. superheroes: 1447. 16 | 17 | Main columns: 18 | - name 19 | - real_name 20 | - full_name 21 | - overall_score - how powerful is the superhero according to superherodb. 22 | - *history_text* - Superhero's history. 23 | - *powers_text* - Description of superhero's powers 24 | - intelligence_score 25 | - strength_score 26 | - speed_score 27 | - durability_score 28 | - power_score 29 | - combat_score 30 | - alter_egos - List of alternative personality 31 | - aliases 32 | - creator - _DC Comics_ or _Marvel Comics_ for instance. 33 | - alignment - Is the character good or bad? 34 | - occupation 35 | - type_race 36 | - height 37 | - weight 38 | - eye_color 39 | - hair_color 40 | - skin_color 41 | 42 | 43 | ## Getting started 44 | 45 | You can download the complete dataset directly from Github here: [Superheroes NLP Dataset](https://github.com/jbesomi/texthero/tree/master/dataset/Superheroes%20NLP%20Dataset/data). 46 | 47 | If you feel lazy, you can also import it directly from pandas: 48 | 49 | ```python 50 | import pandas as pd 51 | 52 | df = pd.read_csv("https://raw.githubusercontent.com/jbesomi/texthero/master/dataset/superheroes_nlp_dataset.csv") 53 | 54 | df.head() 55 | ``` 56 | 57 | ```bash 58 | name real_name full_name overall_score ... has_durability has_stamina has_agility has_super_strength 59 | 0 3-D Man Delroy Garrett, Jr. Delroy Garrett, Jr. 6 ... 
0.0 0.0 0.0 1.0 60 | 1 514A (Gotham) Bruce Wayne NaN 10 ... 1.0 0.0 0.0 1.0 61 | 2 A-Bomb Richard Milhouse Jones Richard Milhouse Jones 20 ... 1.0 1.0 1.0 1.0 62 | 3 Aa Aa NaN 12 ... 0.0 0.0 0.0 0.0 63 | 4 Aaron Cash Aaron Cash Aaron Cash 5 ... 0.0 0.0 0.0 0.0 64 | 65 | [5 rows x 81 columns] 66 | ``` 67 | -------------------------------------------------------------------------------- /dataset/Superheroes NLP Dataset/create_dataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | Create the dataset from the raw data 3 | """ 4 | 5 | import helper as h 6 | import glob 7 | import pandas as pd 8 | from tqdm import tqdm 9 | import ast 10 | 11 | DOWNLOAD_DIR = "./data/raw/" 12 | 13 | all_files = glob.glob(DOWNLOAD_DIR + "*.html") 14 | 15 | # Get unique names: 16 | all_about = glob.glob(DOWNLOAD_DIR + "*_about.html") 17 | 18 | ids = [h.get_id_from_about(a) for a in all_about] 19 | 20 | dataset = [] 21 | 22 | for id in tqdm(ids): 23 | 24 | filename_about = DOWNLOAD_DIR + id + "_about.html" 25 | filename_history = DOWNLOAD_DIR + id + "_history.html" 26 | filename_powers = DOWNLOAD_DIR + id + "_powers.html" 27 | 28 | try: 29 | h.get_soup(filename_about) 30 | except: 31 | print(filename_about) 32 | 33 | data_about = h.get_soup(filename_about) 34 | data_history = h.get_soup(filename_history) 35 | data_powers = h.get_soup(filename_powers) 36 | 37 | row = h.merge_data(data_about, data_history, data_powers) 38 | 39 | dataset.append(row) 40 | 41 | df = pd.DataFrame(dataset) 42 | 43 | df.columns = df.columns.str.lower() 44 | # Clean dataset 45 | 46 | 47 | def clean_teams(df): 48 | df["teams"] = df["teams"].astype(str) 49 | df["teams"] = df["teams"].str.replace("\nNo teams added.", "no_team") 50 | 51 | df["teams"] = df["teams"].str.replace("\n", "").str.strip() 52 | return df 53 | 54 | 55 | df = clean_teams(df) 56 | 57 | # lowercase all columns 58 | df.columns = df.columns.str.lower().str.replace(" ", "_") 59 | 60 | 61 | # Rename columns 62 | df = df.rename(columns={"type_/_race": "type_race"}) 63 | 64 | power_score = dict( 65 | intelligence="intelligence_score", 66 | strength="strength_score", 67 | speed="speed_score", 68 | durability="durability_score", 69 | power="power_score", 70 | combat="combat_score", 71 | ) 72 | 73 | df = df.rename(columns=power_score) 74 | 75 | df = df.rename(columns=dict(hist_content="history_text", powers_content="powers_text")) 76 | 77 | # Reorder columns 78 | df = df[ 79 | [ 80 | "name", 81 | "real_name", 82 | "full_name", 83 | "overall_score", 84 | "history_text", 85 | "powers_text", 86 | "intelligence_score", 87 | "strength_score", 88 | "speed_score", 89 | "durability_score", 90 | "power_score", 91 | "combat_score", 92 | "superpowers", 93 | "alter_egos", 94 | "aliases", 95 | "place_of_birth", 96 | "first_appearance", 97 | "creator", 98 | "alignment", 99 | "occupation", 100 | "base", 101 | "teams", 102 | "relatives", 103 | "gender", 104 | "type_race", 105 | "height", 106 | "weight", 107 | "eye_color", 108 | "hair_color", 109 | "skin_color", 110 | "img", 111 | ] 112 | ] 113 | 114 | # Extract 'superpowers' data 115 | 116 | df_superpowers = ( 117 | df["superpowers"].apply(pd.Series).stack().pipe(pd.get_dummies).sum(level=0) 118 | ) 119 | 120 | # Keep only most 50 common superpowers 121 | common_superpowers = df_superpowers.sum(axis=0).sort_values().tail(50).index 122 | df_superpowers = df_superpowers[common_superpowers] 123 | df_superpowers.columns = df_superpowers.columns.str.lower().str.replace(" ", "_") 124 | df_superpowers = 
df_superpowers.add_prefix("has_") 125 | 126 | df = df.join(df_superpowers) 127 | 128 | 129 | # Split aliases 130 | df["aliases"] = df["aliases"].str.split("\n") 131 | 132 | print(df.shape) 133 | 134 | # Keep only rows where 'history_text' or 'powers_text' is not null. 135 | df = df[ 136 | ~(df["history_text"].str.strip() == "") | ~(df["powers_text"].str.strip() == "") 137 | ] 138 | 139 | print(df.shape) 140 | 141 | 142 | df.to_csv("./data/superheroes_nlp_dataset.csv", index=False) 143 | -------------------------------------------------------------------------------- /dataset/Superheroes NLP Dataset/create_download_file.py: -------------------------------------------------------------------------------- 1 | """ 2 | Create a "download.sh" file containing a list of all http url that needs to be downloaded. 3 | """ 4 | 5 | import helper as h 6 | 7 | # NUM_PAGE = 1 8 | # data_char = h.get_data("https://www.superherodb.com/characters/?page_nr={}".format(NUM_PAGE)) 9 | # superhero_links = h.get_superheroes_links(data_char) 10 | 11 | 12 | # Get all superheroes link 13 | 14 | TOTAL_PAGES = 33 15 | all_links = [] 16 | 17 | for p in range(1, 33 + 1): 18 | data_char = h.get_data( 19 | "https://www.superherodb.com/characters/?page_nr={}".format(p) 20 | ) 21 | all_links += h.get_superheroes_links(data_char) 22 | 23 | 24 | DOWNLOAD_DIR = "./data/raw/" 25 | 26 | file_content = "" 27 | command = "wget {} -t 5 --limit-rate=20K --show-progress -O {}\n" 28 | 29 | file_content += "#!/bin/sh\n\n\n" 30 | file_content += "mkdir -p {}\n\n\n".format(DOWNLOAD_DIR) 31 | 32 | filename_set = [] 33 | 34 | for link in all_links: 35 | 36 | filename = DOWNLOAD_DIR + link.split("/")[-3] 37 | 38 | # Download about 39 | filename_about = filename + "_about.html" 40 | file_content += command.format(link, filename_about) 41 | 42 | # Download history 43 | filename_history = filename + "_history.html" 44 | file_content += command.format(link + "history/", filename_history) 45 | 46 | # Download powers 47 | filename_powers = filename + "_powers.html" 48 | file_content += command.format(link + "powers/", filename_powers) 49 | 50 | file_content += "\n" 51 | 52 | filename_set.append(filename) 53 | 54 | print("There are ", len(filename_set), " files.") 55 | print("There are ", len(set(filename_set)), "unique files.") 56 | 57 | # with open("download.sh", "w") as file: 58 | # file.write(file_content) 59 | -------------------------------------------------------------------------------- /dataset/Superheroes NLP Dataset/helper.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helper functions to scrape all superrheroes data from 'https://www.superherodb.com/' 3 | """ 4 | 5 | from bs4 import BeautifulSoup 6 | import urllib3 7 | import pandas as pd 8 | from collections import defaultdict 9 | import requests 10 | import re 11 | 12 | 13 | def get_data(url): 14 | """ 15 | Return BeautifulSoup html object. 16 | """ 17 | r = requests.get(url) 18 | data = BeautifulSoup(r.text, "lxml") 19 | return data 20 | 21 | 22 | def get_superheroes_links(data): 23 | herolinks = [] 24 | 25 | home_url = "https://www.superherodb.com" 26 | 27 | for all_li in data.find_all(class_="list"): 28 | for link in all_li.find_all("li"): 29 | for hero in link.find_all("a"): 30 | herolinks.append(home_url + hero["href"]) 31 | return herolinks 32 | 33 | 34 | def get_id_from_about(filename): 35 | """ 36 | Extract id from local filename. 
37 | """ 38 | return filename.replace("_about.html", "").split("/")[-1] 39 | 40 | 41 | def get_soup(filename): 42 | with open(filename, "rb") as f: 43 | file = f.read() 44 | return BeautifulSoup(file, "lxml") 45 | 46 | 47 | """ 48 | Get data 49 | """ 50 | 51 | 52 | def get_data(url): 53 | r = requests.get(url) 54 | data = BeautifulSoup(r.text, "lxml") 55 | return data 56 | 57 | 58 | """ 59 | About 60 | """ 61 | 62 | 63 | def get_image(data_about): 64 | 65 | img = data_about.find(class_="portrait").find("img") 66 | if img: 67 | return dict(img=img["src"]) 68 | else: 69 | return dict(img=None) 70 | 71 | 72 | def get_name_real_name(data_about): 73 | name = data_about.find("h1").text 74 | real_name = data_about.find("h2").text 75 | return dict(name=name, real_name=real_name) 76 | 77 | 78 | def get_overall_score(data_about): 79 | return dict(overall_score=data_about.find(href="#class-info").text) 80 | 81 | 82 | def get_power_stats(data_about): 83 | 84 | scripts = data_about.findAll("script") 85 | # Find script containng the 'stats_shdb' 86 | script = next( 87 | (s.text for s in scripts if s.text.strip().startswith("var stats_shdb = [")) 88 | ) 89 | # Extract the list of powers 90 | values = re.findall(r"(\d+)", script.split(";")[0]) 91 | values = [int(v) for v in values] 92 | 93 | labels = data_about.find(class_="stat-holder").findAll("label") 94 | labels = [l.text for l in labels] 95 | 96 | return dict(zip(labels, values)) 97 | 98 | 99 | def get_super_powers(data_about): 100 | superpowers = data_about.find("h3", text="Super Powers").findParent().findAll("a") 101 | superpowers = [s.text for s in superpowers] 102 | return dict(superpowers=superpowers) 103 | 104 | 105 | def get_all_links(td): 106 | links = td.findAll("a") 107 | links = [a.text for a in links] 108 | return links 109 | 110 | 111 | def get_origin(data_about): 112 | 113 | data = data_about.find("h3", text="Origin").findNext() 114 | 115 | origin = {} 116 | 117 | for row in data.find_all("tr"): 118 | key = row.find_all("td")[0].text 119 | value = row.find_all("td")[1] 120 | 121 | if "alter egos" in key.lower(): 122 | origin[key] = get_all_links(value) 123 | else: 124 | origin[key] = value.text 125 | return origin 126 | 127 | 128 | def get_connections(data_about): 129 | data = data_about.find("h3", text="Connections").findNext() 130 | 131 | connections = {} 132 | 133 | for row in data.find_all("tr"): 134 | key = row.find_all("td")[0].text 135 | value = row.find_all("td")[1] 136 | 137 | if "Teams" in key: 138 | connections[key] = get_all_links(value) 139 | else: 140 | connections[key] = value.text 141 | 142 | return connections 143 | 144 | 145 | def get_appearance(data_about): 146 | table = data_about.find("h3", text="Appearance").findParent() 147 | labels = table.findAll(class_="table-label") 148 | return dict([(l.text, l.findNext().text) for l in labels]) 149 | 150 | 151 | """ 152 | History 153 | """ 154 | 155 | 156 | def get_history(data_history): 157 | content = data_history.find(class_="text-columns-2") 158 | title = content.find("h3").text 159 | subtitles = [s.text for s in content.findAll("h4")] 160 | content = " ".join([p.text for p in content.findAll("p")]).replace("\s+", " ") 161 | return {"hist_title": title, "hist_subtitles": subtitles, "hist_content": content} 162 | 163 | 164 | """ 165 | Powers 166 | """ 167 | 168 | 169 | def get_powers(data_powers): 170 | content = data_powers.find_all(class_="col-8")[1] 171 | title = content.find("h3").text 172 | subtitles = [s.text for s in content.findAll("h4")] 173 | content = " 
".join([p.text for p in content.findAll("p")]).replace("\s+", " ") 174 | return { 175 | "powers_title": title, 176 | "powers_subtitles": subtitles, 177 | "powers_content": content, 178 | } 179 | 180 | 181 | """ 182 | Merge all 183 | """ 184 | 185 | 186 | def merge_data(data_about, data_history, data_powers): 187 | 188 | data = {} 189 | 190 | # Get from about page 191 | data.update(get_image(data_about)) 192 | data.update(get_name_real_name(data_about)) 193 | data.update(get_overall_score(data_about)) 194 | data.update(get_power_stats(data_about)) 195 | data.update(get_super_powers(data_about)) 196 | data.update(get_origin(data_about)) 197 | data.update(get_connections(data_about)) 198 | data.update(get_appearance(data_about)) 199 | 200 | # Get history data 201 | data.update(get_history(data_history)) 202 | 203 | # Get powers data 204 | data.update(get_powers(data_powers)) 205 | 206 | return data 207 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | 3 | services: 4 | docusaurus: 5 | build: . 6 | ports: 7 | - 3000:3000 8 | - 35729:35729 9 | volumes: 10 | - ./docs:/app/docs 11 | - ./website/blog:/app/website/blog 12 | - ./website/core:/app/website/core 13 | - ./website/i18n:/app/website/i18n 14 | - ./website/pages:/app/website/pages 15 | - ./website/static:/app/website/static 16 | - ./website/sidebars.json:/app/website/sidebars.json 17 | - ./website/siteConfig.js:/app/website/siteConfig.js 18 | working_dir: /app/website 19 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = -q 6 | SPHINXBUILD = sphinx-build 7 | SOURCEDIR = source 8 | BUILDDIR = _build 9 | 10 | # Put it first so that "make" without argument is like "make help". 11 | help: 12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 13 | 14 | .PHONY: help Makefile 15 | 16 | # Catch-all target: route all unknown targets to Sphinx using the new 17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 18 | %: Makefile 19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 20 | @./to_docusaurus.py 21 | @rsync -u ./_build/html/_static/basic.css ../website/static/css/sphinx_basic.css 22 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
12 | # 13 | import os 14 | import sys 15 | 16 | import matplotlib 17 | 18 | sys.path.insert(0, os.path.abspath(".")) 19 | 20 | 21 | # -- Project information ----------------------------------------------------- 22 | 23 | project = "texthero" 24 | copyright = "" # will not be used. 25 | author = "" # will not be used. 26 | 27 | # The full version, including alpha/beta/rc tags 28 | release = "" # will not be used. 29 | 30 | 31 | # -- General configuration --------------------------------------------------- 32 | 33 | # Add any Sphinx extension module names here, as strings. They can be 34 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 35 | # ones. 36 | extensions = [ 37 | "numpydoc", 38 | "sphinx.ext.autodoc", # automatically construct the documentation. 39 | "sphinx.ext.autosummary", 40 | # prefer numpydoc at sphinx.ext.napoleon as it looks nicer. 41 | "sphinx.ext.intersphinx", 42 | "matplotlib.sphinxext.plot_directive", 43 | ] 44 | 45 | # Add any paths that contain templates here, relative to this directory. 46 | templates_path = ["_templates"] 47 | 48 | # List of patterns, relative to source directory, that match files and 49 | # directories to ignore when looking for source files. 50 | # This pattern also affects html_static_path and html_extra_path. 51 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "*.md"] 52 | 53 | add_module_names = False 54 | 55 | autosummary_generate = True 56 | 57 | autodoc_typehints = "none" 58 | 59 | 60 | intersphinx_mapping = {"python": ("https://docs.python.org/3", None)} 61 | 62 | # -- Options for HTML output ------------------------------------------------- 63 | 64 | # The theme to use for HTML and HTML Help pages. See the documentation for 65 | # a list of builtin themes. 66 | # 67 | html_theme = "pydata_sphinx_theme" # "alabaster", "pydata_sphinx_theme" 68 | 69 | # html_theme_options = {"nosidebar": "true"} 70 | 71 | # html_use_index = False # Create an extra page containing the index. 72 | 73 | # html_show_sourcelink = False 74 | 75 | # html_file_suffix = ".md" later 76 | 77 | # html_show_copyright = False 78 | 79 | # html_show_sphinx = False 80 | 81 | # html_domain_indices = False 82 | 83 | # Add any paths that contain custom static files (such as style sheets) here, 84 | # relative to this directory. They are copied after the builtin static files, 85 | # so a file named "default.css" will overwrite the builtin "default.css". 86 | html_static_path = [] 87 | 88 | 89 | html_css_files = [ 90 | "css/pigments.css", 91 | "css/custom.css", 92 | ] 93 | 94 | autodoc_typehints = "none" 95 | 96 | source_suffix = [".rst"] 97 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | ************ 2 | Texthero API 3 | ************ 4 | 5 | Preprocessing 6 | ============= 7 | 8 | .. toctree:: 9 | 10 | preprocessing 11 | 12 | NLP 13 | ============== 14 | 15 | .. toctree:: 16 | 17 | nlp 18 | 19 | Representation 20 | ============== 21 | 22 | .. toctree:: 23 | 24 | representation 25 | 26 | Visualization 27 | ============= 28 | 29 | .. toctree:: 30 | 31 | visualization 32 | -------------------------------------------------------------------------------- /docs/source/nlp.rst: -------------------------------------------------------------------------------- 1 | .. automodule:: texthero.nlp 2 | 3 | .. 
autosummary:: 4 | :toctree: api 5 | 6 | named_entities 7 | noun_chunks 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /docs/source/preprocessing.rst: -------------------------------------------------------------------------------- 1 | .. automodule:: texthero.preprocessing 2 | 3 | .. autosummary:: 4 | :toctree: api 5 | 6 | clean 7 | drop_no_content 8 | get_default_pipeline 9 | has_content 10 | remove_angle_brackets 11 | remove_brackets 12 | remove_curly_brackets 13 | remove_diacritics 14 | remove_digits 15 | remove_html_tags 16 | remove_punctuation 17 | remove_round_brackets 18 | remove_square_brackets 19 | remove_stopwords 20 | remove_urls 21 | replace_urls 22 | remove_whitespace 23 | replace_punctuation 24 | replace_stopwords 25 | tokenize 26 | 27 | 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /docs/source/representation.rst: -------------------------------------------------------------------------------- 1 | .. automodule:: texthero.representation 2 | 3 | .. autosummary:: 4 | :toctree: api 5 | 6 | dbscan 7 | kmeans 8 | meanshift 9 | nmf 10 | pca 11 | term_frequency 12 | tfidf 13 | tsne 14 | 15 | 16 | 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /docs/source/visualization.rst: -------------------------------------------------------------------------------- 1 | .. automodule:: texthero.visualization 2 | 3 | 4 | .. autosummary:: 5 | :toctree: api 6 | 7 | scatterplot 8 | top_words 9 | wordcloud 10 | 11 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /docs/to_docusaurus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Takes the output from Sphinx, clean it and send it to Docusaurus. 5 | 6 | 1. Get four main modules from _build/html/ 7 | - Extract only the 'body' html and store it as a md file under 8 | ./website/docs/api-{module_name}.md 9 | 10 | 2. Get all files under _build/html/api/ 11 | - Extract 'body' html and store it as a md file under 12 | ./website/docs/api/{filenames}.md 13 | 14 | 3. Update 'sidebars.json' with the new markdown files 15 | - Update the 'api' section. 16 | - Add each module under a sub-directory. 17 | """ 18 | 19 | 20 | """ 21 | Takes all relevant html files from the html output sphinx folder, parse it with Beautifulsoup, remove unnecessary html data (such as
) and 22 | save a markdown file. 23 | """ 24 | 25 | from bs4 import BeautifulSoup 26 | import glob 27 | from pathlib import Path 28 | from typing import List 29 | import re 30 | import json 31 | 32 | """ 33 | PARAMETERS 34 | """ 35 | 36 | MODULES = ["preprocessing", "nlp", "representation", "visualization"] 37 | ROOT_HTML_DIRECTORY = "./_build/html" 38 | ROOT_MD_DIRECTORY = "../website/docs/" 39 | SIDEBARS_FILEPATH = "../website/sidebars.json" 40 | """ 41 | Helper functions 42 | """ 43 | 44 | 45 | def get_content(soup): 46 | return soup.find("main").find("div") 47 | 48 | 49 | def add_docusaurus_metadata(content: str, id: str, title: str, hide_title) -> str: 50 | """ 51 | Add docusaurus metadata into content. 52 | """ 53 | return f"---\nid: {id}\ntitle: {title}\nhide_title: {hide_title}\n---\n\n" + content 54 | 55 | 56 | def fix_href(soup, module: str): 57 | """ 58 | Fix internal href to be compatible with docusaurus. 59 | """ 60 | 61 | for a in soup.find_all("a", {"class": "reference internal"}, href=True): 62 | a["href"] = re.sub("^texthero\.", f"/docs/{module}/", a["href"]) 63 | a["href"] = a["href"].lower() 64 | return soup 65 | 66 | 67 | def to_md( 68 | in_html_filepath: str, out_md_filepath: str, id: str, title: str, hide_title: str 69 | ) -> None: 70 | """ 71 | Convert Sphinx-generated html to md. 72 | 73 | Parameters 74 | ---------- 75 | in_html_filepath : str 76 | input html file. Example: ./_build/html/preprocessing.html 77 | out_md_filepath : str 78 | output html file. Example: ../website/docs/preprocessing.md 79 | id : str 80 | Docusaurus document id 81 | title : str 82 | Docusaurus title id 83 | hide_title : str ("true" or "false") 84 | Whether to hide title in Docusaurus. 85 | 86 | """ 87 | 88 | with open(in_html_filepath, "r") as f: 89 | soup = BeautifulSoup(f.read(), "html.parser") 90 | body = get_content(soup) 91 | 92 | with open(out_md_filepath, "w") as f: 93 | content = add_docusaurus_metadata(str(body), id, title, hide_title) 94 | f.write(content) 95 | 96 | 97 | def get_html(module: str) -> List[str]: 98 | """Return all html files on the html/module folder""" 99 | files = glob.glob(f"./html/{module}/*.html") 100 | # remove ./html/module 101 | return [f.replace(f"./html/{module}/texthero.", "") for f in files] 102 | 103 | 104 | def get_prettified_module_name(module_name: str): 105 | """ 106 | Return a prettified version of the module name. 
107 | 108 | Examples 109 | -------- 110 | >>> get_title("preprocessing") 111 | Preprocessing 112 | >>> get_title("nlp") 113 | NLP 114 | """ 115 | module_name = module_name.lower().strip() 116 | if module_name == "nlp": 117 | return "NLP" 118 | else: 119 | return module_name.capitalize() 120 | 121 | 122 | """ 123 | Update sidebars and markdown files 124 | """ 125 | 126 | # make sure folder exists 127 | Path(ROOT_MD_DIRECTORY).mkdir(parents=True, exist_ok=True) 128 | Path(ROOT_MD_DIRECTORY + "api").mkdir(parents=True, exist_ok=True) 129 | 130 | api_sidebars = {} 131 | 132 | for m in MODULES: 133 | in_html_filename = f"{ROOT_HTML_DIRECTORY}/{m}.html" 134 | out_md_filename = f"{ROOT_MD_DIRECTORY}/api-{m}.md" 135 | id = "api-" + m.lower().strip() 136 | title = get_prettified_module_name(m) 137 | 138 | hide_title = "false" 139 | 140 | # initialize api_sidebars 141 | api_sidebars[title] = [id] 142 | 143 | to_md(in_html_filename, out_md_filename, id, title, hide_title) 144 | 145 | 146 | for a in glob.glob("./_build/html/api/*.html"): 147 | object_name = a.split("/")[-1].replace(".html", "") 148 | 149 | id = object_name 150 | (_, module_name, fun_name) = object_name.split(".") 151 | 152 | title = f"{module_name}.{fun_name}" 153 | 154 | module_name = get_prettified_module_name(module_name) 155 | 156 | hide_title = "true" 157 | 158 | api_sidebars[module_name].sort() 159 | 160 | api_sidebars[module_name] = api_sidebars[module_name] + ["api/" + id] 161 | 162 | in_html_filename = f"{ROOT_HTML_DIRECTORY}/api/{object_name}.html" 163 | out_md_filename = f"{ROOT_MD_DIRECTORY}/api/{object_name}.md" 164 | 165 | to_md(in_html_filename, out_md_filename, id, title, hide_title) 166 | 167 | 168 | # Load, update and save again sidebars.json 169 | with open(SIDEBARS_FILEPATH) as js: 170 | sidebars = json.load(js) 171 | 172 | sidebars["api"] = api_sidebars 173 | 174 | with open(SIDEBARS_FILEPATH, "w") as f: 175 | json.dump(sidebars, f, indent=2) 176 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | - `getting-started.ipynb` should contains the exact same code shown on the [getting started](https://texthero.org/docs/getting-started) doc page. 
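For readers who want a quick picture before opening the notebook, the core of that getting-started flow looks roughly like the sketch below. Treat it as illustrative: the notebook and the doc page are the authoritative versions, and the dataset URL and the `text`/`topic` column names refer to the repository's `bbcsport.csv` example.

```python
import pandas as pd
import texthero as hero

# Load the BBC Sport dataset shipped with the repository.
df = pd.read_csv("https://github.com/jbesomi/texthero/raw/master/dataset/bbcsport.csv")

# Clean the text, represent it with TF-IDF and reduce it to two dimensions with PCA.
df["pca"] = (
    df["text"]
    .pipe(hero.clean)
    .pipe(hero.tfidf)
    .pipe(hero.pca)
)

# Visualize the documents, colored by their topic.
hero.scatterplot(df, "pca", color="topic", title="PCA BBC Sport news")
```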
4 | -------------------------------------------------------------------------------- /github/demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jbesomi/texthero/37d09f0299fe14329b4fae5002c3a1950e4f563e/github/demo.gif -------------------------------------------------------------------------------- /github/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jbesomi/texthero/37d09f0299fe14329b4fae5002c3a1950e4f563e/github/logo.png -------------------------------------------------------------------------------- /github/screencast.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jbesomi/texthero/37d09f0299fe14329b4fae5002c3a1950e4f563e/github/screencast.gif -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /scripts/check.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # if any command inside script returns error, exit and return that error 4 | set -e 5 | 6 | # ensure that we're always inside the root of our application 7 | cd "${0%/*}/.." 8 | 9 | cd scripts 10 | 11 | echo "Format code." 12 | ./format.sh 13 | 14 | 15 | echo "Update documentation." 16 | ./update_documentation.sh 17 | 18 | 19 | echo "Test code." 20 | ./tests.sh 21 | 22 | #cd website 23 | #npm run build 24 | -------------------------------------------------------------------------------- /scripts/format.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | cd .. 4 | 5 | black texthero 6 | black tests 7 | -------------------------------------------------------------------------------- /scripts/install-hooks.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | GIT_DIR=$(git rev-parse --git-dir) 4 | 5 | echo "Installing hooks..." 6 | # this command creates symlink to our pre-commit script 7 | ln -sf ../../scripts/pre-commit.sh $GIT_DIR/hooks/pre-commit 8 | echo "Done!" 9 | -------------------------------------------------------------------------------- /scripts/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | cd .. 4 | 5 | pip3 install -e . 6 | -------------------------------------------------------------------------------- /scripts/pre-commit.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Run check. 4 | ./scripts/check.sh 5 | 6 | # $? stores exit value of the last command 7 | if [ $? -ne 0 ]; then 8 | echo "All tests must pass before commit." 9 | exit 1 10 | fi 11 | -------------------------------------------------------------------------------- /scripts/push_pip.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | echo "Formatting code ..." 4 | ./format.sh 5 | 6 | echo "Checking code ..." 7 | ./check.sh 8 | 9 | echo "Updating doc ..." 10 | cd ../docs/ 11 | make html 12 | ./to_docusaurus.py 13 | cd .. 
14 | 15 | python3 setup.py sdist bdist_wheel 16 | twine upload --skip-existing dist/* 17 | -------------------------------------------------------------------------------- /scripts/test_coverage.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | cd .. 4 | 5 | coverage run -m unittest discover -s tests -t . 6 | 7 | coverage report -m 8 | coverage html 9 | -------------------------------------------------------------------------------- /scripts/tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | cd .. 4 | 5 | python3 -m unittest discover -s tests -t . 6 | -------------------------------------------------------------------------------- /scripts/update_documentation.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | cd ../docs 4 | make html 5 | ./to_docusaurus.py 6 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = texthero 3 | version = 1.0.9 4 | description = Text preprocessing, representation and visualization from zero to hero. 5 | author = Jonathan Besomi 6 | license = MIT 7 | long_description = file: README.md 8 | long_description_content_type = text/markdown 9 | classifiers = 10 | Development Status :: 3 - Alpha 11 | License :: OSI Approved :: MIT License 12 | Intended Audience :: Developers 13 | Programming Language :: Python 14 | Natural Language :: English 15 | Topic :: Scientific/Engineering 16 | keywords = 17 | text mining 18 | text preprocessing 19 | text representation 20 | text visualization 21 | url = https://github.com/jbesomi/texthero 22 | project_urls = 23 | Documentation = https://texthero.org/ 24 | Source Code = https://github.com/jbesomi/texthero 25 | Bug Tracker = https://github.com/jbesomi/texthero/issues 26 | [options] 27 | packages = find: 28 | python_requires = >=3.6.1 29 | install_requires = 30 | numpy>=1.17 31 | scikit-learn>=0.22 32 | spacy<3.0.0 33 | tqdm>=4.3, <5 34 | nltk>=3.3, <4 35 | plotly>=4.2.0, <5 36 | pandas>=1.0.2, <2 37 | wordcloud>=1.5.0, <2 38 | gensim>4.0, <5 39 | matplotlib>=3.1.0, <3.7 40 | # TODO pick the correct version. 
41 | [options.extras_require] 42 | dev = 43 | black==19.10b0 44 | pytest>=4.0.0 45 | pytest-cov 46 | Sphinx>=3.0.3 47 | sphinx-markdown-builder>=0.5.4 48 | recommonmark>=0.6.0 49 | nbsphinx 50 | parameterized>=0.7.4 51 | coverage 52 | pre-commit 53 | pandas>=1.1.0 54 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | if __name__ == "__main__": 4 | setuptools.setup() 5 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | # TESTS 2 | 3 | "In most cases, missing type hints in third-party packages is not something you want to be bothered with so you can silence these messages:" 4 | 5 | => mypy --ignore-missing-imports 6 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import pandas as pd 3 | 4 | 5 | class PandasTestCase(unittest.TestCase): 6 | def assertDataframeEqual(self, a, b, msg): 7 | try: 8 | pd.testing.assert_frame_equal(a, b) 9 | except AssertionError as e: 10 | raise self.failureException(msg) from e 11 | 12 | def assertSeriesEqual(self, a, b, msg): 13 | try: 14 | pd.testing.assert_series_equal(a, b) 15 | except AssertionError as e: 16 | raise self.failureException(msg) from e 17 | 18 | def setUp(self): 19 | self.addTypeEqualityFunc(pd.DataFrame, self.assertDataframeEqual) 20 | self.addTypeEqualityFunc(pd.Series, self.assertSeriesEqual) 21 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | def pytest_addoption(parser): 5 | parser.addoption( 6 | "--no-skip-broken", 7 | action="store_true", 8 | default=False, 9 | help="run tests marked as broken", 10 | ) 11 | 12 | 13 | def pytest_configure(config): 14 | config.addinivalue_line("markers", "skip_broken: mark test broken") 15 | 16 | 17 | def pytest_collection_modifyitems(config, items): 18 | if config.getoption("--no-skip-broken"): 19 | return 20 | 21 | skip_broken = pytest.mark.skip(reason="test marked as broken") 22 | for item in items: 23 | if "skip_broken" in item.keywords: 24 | item.add_marker(skip_broken) 25 | 26 | 27 | def broken_case(*params): 28 | return pytest.param(*params, marks=(pytest.mark.skip_broken)) 29 | -------------------------------------------------------------------------------- /tests/test_helpers.py: -------------------------------------------------------------------------------- 1 | """ 2 | Unit-tests for the helper module. 3 | """ 4 | 5 | import pandas as pd 6 | import numpy as np 7 | 8 | from . import PandasTestCase 9 | import doctest 10 | import unittest 11 | import warnings 12 | 13 | from texthero import _helper 14 | 15 | """ 16 | Doctests. 17 | """ 18 | 19 | 20 | def load_tests(loader, tests, ignore): 21 | tests.addTests(doctest.DocTestSuite(_helper)) 22 | return tests 23 | 24 | 25 | """ 26 | Test Decorators. 27 | """ 28 | 29 | 30 | class TestHelpers(PandasTestCase): 31 | """ 32 | handle_nans. 
33 | """ 34 | 35 | def test_handle_nans(self): 36 | s = pd.Series(["Test", np.nan, pd.NA]) 37 | 38 | @_helper.handle_nans(replace_nans_with="This was a NAN") 39 | def f(s): 40 | return s 41 | 42 | s_true = pd.Series(["Test", "This was a NAN", "This was a NAN"]) 43 | 44 | with warnings.catch_warnings(): 45 | warnings.simplefilter("ignore") 46 | self.assertEqual(f(s), s_true) 47 | 48 | with self.assertWarns(Warning): 49 | f(s) 50 | 51 | def test_handle_nans_no_nans_in_input(self): 52 | s = pd.Series(["Test"]) 53 | 54 | @_helper.handle_nans(replace_nans_with="This was a NAN") 55 | def f(s): 56 | return s 57 | 58 | s_true = pd.Series(["Test"]) 59 | 60 | self.assertEqual(f(s), s_true) 61 | 62 | # This is not in test_indexes.py as it requires a custom test case. 63 | def test_handle_nans_index(self): 64 | s = pd.Series(["Test", np.nan, pd.NA], index=[4, 5, 6]) 65 | 66 | @_helper.handle_nans(replace_nans_with="This was a NAN") 67 | def f(s): 68 | return s 69 | 70 | s_true = pd.Series( 71 | ["Test", "This was a NAN", "This was a NAN"], index=[4, 5, 6] 72 | ) 73 | 74 | with warnings.catch_warnings(): 75 | warnings.simplefilter("ignore") 76 | self.assertTrue(f(s).index.equals(s_true.index)) 77 | -------------------------------------------------------------------------------- /tests/test_indexes.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from texthero import nlp, visualization, preprocessing, representation 3 | 4 | import pytest 5 | 6 | from . import PandasTestCase 7 | import unittest 8 | import string 9 | from parameterized import parameterized 10 | 11 | from .conftest import broken_case 12 | 13 | 14 | # Define valid inputs for different functions. 15 | s_text = pd.Series(["Test"], index=[5]) 16 | s_tokenized_lists = pd.Series([["Test", "Test2"], ["Test3"]], index=[5, 6]) 17 | s_numeric = pd.Series([5.0], index=[5]) 18 | s_numeric_lists = pd.Series([[5.0, 5.0], [6.0, 6.0]], index=[5, 6]) 19 | 20 | # Define all test cases. Every test case is a list 21 | # of [name of test case, function to test, tuple of valid input for the function]. 22 | # First argument of valid input has to be the Pandas Series where we 23 | # want to keep the index. If this is different for a function, a separate 24 | # test case has to implemented in the class below. 25 | # The tests will be run by AbstractIndexTest below through the @parameterized 26 | # decorator. 27 | # The names will be expanded automatically, so e.g. "named_entities" 28 | # creates test cases test_correct_index_named_entities and test_incorrect_index_named_entities. 
29 | 30 | test_cases_nlp = [ 31 | ["named_entities", nlp.named_entities, (s_text,)], 32 | ["noun_chunks", nlp.noun_chunks, (s_text,)], 33 | ["stem", nlp.stem, (s_text,)], 34 | ] 35 | 36 | test_cases_preprocessing = [ 37 | ["fillna", preprocessing.fillna, (s_text,)], 38 | ["lowercase", preprocessing.lowercase, (s_text,)], 39 | ["replace_digits", preprocessing.replace_digits, (s_text, "")], 40 | ["remove_digits", preprocessing.remove_digits, (s_text,)], 41 | ["replace_punctuation", preprocessing.replace_punctuation, (s_text, "")], 42 | ["remove_punctuation", preprocessing.remove_punctuation, (s_text,)], 43 | ["remove_diacritics", preprocessing.remove_diacritics, (s_text,)], 44 | ["remove_whitespace", preprocessing.remove_whitespace, (s_text,)], 45 | ["replace_stopwords", preprocessing.replace_stopwords, (s_text, "")], 46 | ["remove_stopwords", preprocessing.remove_stopwords, (s_text,)], 47 | ["clean", preprocessing.clean, (s_text,)], 48 | ["remove_round_brackets", preprocessing.remove_round_brackets, (s_text,)], 49 | ["remove_curly_brackets", preprocessing.remove_curly_brackets, (s_text,)], 50 | ["remove_square_brackets", preprocessing.remove_square_brackets, (s_text,)], 51 | ["remove_angle_brackets", preprocessing.remove_angle_brackets, (s_text,)], 52 | ["remove_brackets", preprocessing.remove_brackets, (s_text,)], 53 | ["remove_html_tags", preprocessing.remove_html_tags, (s_text,)], 54 | ["tokenize", preprocessing.tokenize, (s_text,)], 55 | broken_case("phrases", preprocessing.phrases, (s_tokenized_lists,)), 56 | ["replace_urls", preprocessing.replace_urls, (s_text, "")], 57 | ["remove_urls", preprocessing.remove_urls, (s_text,)], 58 | ["replace_tags", preprocessing.replace_tags, (s_text, "")], 59 | ["remove_tags", preprocessing.remove_tags, (s_text,)], 60 | ] 61 | 62 | test_cases_representation = [ 63 | broken_case("count", representation.count, (s_tokenized_lists,),), 64 | broken_case("term_frequency", representation.term_frequency, (s_tokenized_lists,),), 65 | broken_case("tfidf", representation.tfidf, (s_tokenized_lists,),), 66 | ["pca", representation.pca, (s_numeric_lists, 0)], 67 | ["nmf", representation.nmf, (s_numeric_lists,)], 68 | broken_case("tsne", representation.tsne, (s_numeric_lists,)), 69 | ["kmeans", representation.kmeans, (s_numeric_lists, 1)], 70 | ["dbscan", representation.dbscan, (s_numeric_lists,)], 71 | ["meanshift", representation.meanshift, (s_numeric_lists,)], 72 | ] 73 | 74 | test_cases = test_cases_nlp + test_cases_preprocessing + test_cases_representation 75 | 76 | 77 | class TestAbstractIndex: 78 | """ 79 | Class for index test cases. Tests for all cases 80 | in test_cases whether the input's index is correctly 81 | preserved by the function. Some function's tests 82 | are implemented manually as they take different inputs. 83 | 84 | """ 85 | 86 | """ 87 | Tests defined in test_cases above. 
88 | """ 89 | 90 | @pytest.mark.parametrize("name, test_function, valid_input", test_cases) 91 | def test_correct_index(self, name, test_function, valid_input): 92 | s = valid_input[0] 93 | result_s = test_function(*valid_input) 94 | t_same_index = pd.Series(s.values, s.index) 95 | assert result_s.index.equals(t_same_index.index) 96 | 97 | @pytest.mark.parametrize("name, test_function, valid_input", test_cases) 98 | def test_incorrect_index(self, name, test_function, valid_input): 99 | s = valid_input[0] 100 | result_s = test_function(*valid_input) 101 | t_different_index = pd.Series(s.values, index=None) 102 | assert not result_s.index.equals(t_different_index.index) 103 | -------------------------------------------------------------------------------- /tests/test_nlp.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from texthero import nlp 4 | 5 | from . import PandasTestCase 6 | import doctest 7 | import unittest 8 | import string 9 | 10 | """ 11 | Test doctest 12 | """ 13 | 14 | 15 | def load_tests(loader, tests, ignore): 16 | tests.addTests(doctest.DocTestSuite(nlp)) 17 | return tests 18 | 19 | 20 | class TestNLP(PandasTestCase): 21 | """ 22 | Named entity. 23 | """ 24 | 25 | def test_named_entities(self): 26 | s = pd.Series("New York is a big city") 27 | s_true = pd.Series([[("New York", "GPE", 0, 8)]]) 28 | self.assertEqual(nlp.named_entities(s), s_true) 29 | 30 | """ 31 | Noun chunks. 32 | """ 33 | 34 | def test_noun_chunks(self): 35 | s = pd.Series("Today is such a beautiful day") 36 | s_true = pd.Series( 37 | [[("Today", "NP", 0, 5), ("such a beautiful day", "NP", 9, 29)]] 38 | ) 39 | self.assertEqual(nlp.noun_chunks(s), s_true) 40 | 41 | """ 42 | Count sentences. 43 | """ 44 | 45 | def test_count_sentences(self): 46 | s = pd.Series("I think ... it counts correctly. Doesn't it? Great!") 47 | s_true = pd.Series(3) 48 | self.assertEqual(nlp.count_sentences(s), s_true) 49 | 50 | def test_count_sentences_numeric(self): 51 | s = pd.Series([13.0, 42.0]) 52 | self.assertRaises(TypeError, nlp.count_sentences, s) 53 | 54 | def test_count_sentences_missing_value(self): 55 | s = pd.Series(["Test.", np.nan]) 56 | self.assertRaises(TypeError, nlp.count_sentences, s) 57 | 58 | def test_count_sentences_index(self): 59 | s = pd.Series(["Test"], index=[5]) 60 | counted_sentences_s = nlp.count_sentences(s) 61 | t_same_index = pd.Series([""], index=[5]) 62 | 63 | self.assertTrue(counted_sentences_s.index.equals(t_same_index.index)) 64 | 65 | def test_count_sentences_wrong_index(self): 66 | s = pd.Series(["Test", "Test"], index=[5, 6]) 67 | counted_sentences_s = nlp.count_sentences(s) 68 | t_different_index = pd.Series(["", ""], index=[5, 7]) 69 | 70 | self.assertFalse(counted_sentences_s.index.equals(t_different_index.index)) 71 | 72 | """ 73 | POS tagging. 
74 | """ 75 | 76 | def test_pos(self): 77 | s = pd.Series(["Today is such a beautiful day", "São Paulo is a great city"]) 78 | pos_tagging = nlp.pos_tag(s) 79 | s_true = pd.Series( 80 | [ 81 | [ 82 | ("Today", "NOUN", "NN", 0, 5), 83 | ("is", "AUX", "VBZ", 6, 8), 84 | ("such", "DET", "PDT", 9, 13), 85 | ("a", "DET", "DT", 14, 15), 86 | ("beautiful", "ADJ", "JJ", 16, 25), 87 | ("day", "NOUN", "NN", 26, 29), 88 | ], 89 | [ 90 | ("São", "PROPN", "NNP", 0, 3), 91 | ("Paulo", "PROPN", "NNP", 4, 9), 92 | ("is", "AUX", "VBZ", 10, 12), 93 | ("a", "DET", "DT", 13, 14), 94 | ("great", "ADJ", "JJ", 15, 20), 95 | ("city", "NOUN", "NN", 21, 25), 96 | ], 97 | ] 98 | ) 99 | 100 | self.assertEqual(pos_tagging, s_true) 101 | -------------------------------------------------------------------------------- /tests/test_types.py: -------------------------------------------------------------------------------- 1 | """ 2 | Unit-tests for the types module. 3 | """ 4 | 5 | import pandas as pd 6 | import numpy as np 7 | 8 | from . import PandasTestCase 9 | import doctest 10 | import unittest 11 | 12 | from texthero import _types 13 | 14 | """ 15 | Doctests. 16 | """ 17 | 18 | 19 | def load_tests(loader, tests, ignore): 20 | tests.addTests(doctest.DocTestSuite(_types)) 21 | return tests 22 | 23 | 24 | class TestTypes(PandasTestCase): 25 | """ 26 | InputSeries. 27 | """ 28 | 29 | def test_inputseries_function_executes_correctly(self): 30 | @_types.InputSeries(_types.TextSeries) 31 | def f(s, t): 32 | return t 33 | 34 | s = pd.Series("I'm a TextSeries") 35 | t = "test" 36 | self.assertEqual(f(s, t), t) 37 | 38 | def test_inputseries_wrong_type(self): 39 | @_types.InputSeries(_types.TextSeries) 40 | def f(s): 41 | pass 42 | 43 | self.assertRaises(TypeError, f, pd.Series([["token", "ized"]])) 44 | 45 | def test_inputseries_correct_type_textseries(self): 46 | @_types.InputSeries(_types.TextSeries) 47 | def f(s): 48 | pass 49 | 50 | try: 51 | f(pd.Series("I'm a TextSeries")) 52 | except TypeError: 53 | self.fail("Failed although input type is correct.") 54 | 55 | def test_inputseries_correct_type_tokenseries(self): 56 | @_types.InputSeries(_types.TokenSeries) 57 | def f(s): 58 | pass 59 | 60 | try: 61 | f(pd.Series([["token", "ized"]])) 62 | except TypeError: 63 | self.fail("Failed although input type is correct.") 64 | 65 | def test_inputseries_correct_type_vectorseries(self): 66 | @_types.InputSeries(_types.VectorSeries) 67 | def f(s): 68 | pass 69 | 70 | try: 71 | f(pd.Series([[0.0, 1.0]])) 72 | except TypeError: 73 | self.fail("Failed although input type is correct.") 74 | 75 | def test_inputseries_correct_type_DataFrame(self): 76 | @_types.InputSeries(_types.DataFrame) 77 | def f(s): 78 | pass 79 | 80 | try: 81 | f(pd.DataFrame([[1, 2, 3]], columns=["a", "b", "c"], dtype="Sparse",)) 82 | except TypeError: 83 | self.fail("Failed although input type is correct.") 84 | 85 | def test_inputseries_correct_type_first_value_is_nan_TextSeries(self): 86 | @_types.InputSeries(_types.TextSeries) 87 | def f(s): 88 | pass 89 | 90 | try: 91 | f(pd.Series([np.nan, pd.NA, "I'm a TextSeries"])) 92 | except TypeError: 93 | self.fail("Failed although input type is correct.") 94 | 95 | def test_inputseries_correct_type_first_value_is_nan_TokenSeries(self): 96 | @_types.InputSeries(_types.TokenSeries) 97 | def f(s): 98 | pass 99 | 100 | try: 101 | f(pd.Series([np.nan, pd.NA, ["Token", "Series"]])) 102 | except TypeError: 103 | self.fail("Failed although input type is correct.") 104 | 105 | def 
test_inputseries_correct_type_first_value_is_nan_VectorSeries(self): 106 | @_types.InputSeries(_types.VectorSeries) 107 | def f(s): 108 | pass 109 | 110 | try: 111 | f(pd.Series([np.nan, pd.NA, [0, 1, 2]])) 112 | except TypeError: 113 | self.fail("Failed although input type is correct.") 114 | 115 | def test_several_possible_types_correct_type(self): 116 | @_types.InputSeries([_types.DataFrame, _types.VectorSeries]) 117 | def f(x): 118 | pass 119 | 120 | try: 121 | f(pd.DataFrame([[1, 2, 3]], columns=["a", "b", "c"], dtype="Sparse",)) 122 | 123 | f(pd.Series([[1.0, 2.0]])) 124 | 125 | except TypeError: 126 | self.fail("Failed although input type is correct.") 127 | 128 | def test_several_possible_types_wrong_type(self): 129 | @_types.InputSeries([_types.DataFrame, _types.VectorSeries]) 130 | def f(x): 131 | pass 132 | 133 | self.assertRaises(TypeError, f, pd.Series([["token", "ized"]])) 134 | -------------------------------------------------------------------------------- /tests/test_visualization.py: -------------------------------------------------------------------------------- 1 | import string 2 | 3 | import pandas as pd 4 | import doctest 5 | 6 | from texthero import visualization 7 | from . import PandasTestCase 8 | 9 | 10 | """ 11 | Test doctest 12 | """ 13 | 14 | 15 | def load_tests(loader, tests, ignore): 16 | tests.addTests(doctest.DocTestSuite(visualization)) 17 | return tests 18 | 19 | 20 | class TestVisualization(PandasTestCase): 21 | """ 22 | Test scatterplot. 23 | """ 24 | 25 | def test_scatterplot_dimension_too_high(self): 26 | s = pd.Series([[1, 2, 3, 4], [1, 2, 3, 4]]) 27 | df = pd.DataFrame(s) 28 | self.assertRaises(ValueError, visualization.scatterplot, df, col=0) 29 | 30 | def test_scatterplot_dimension_too_low(self): 31 | s = pd.Series([[1], [1]]) 32 | df = pd.DataFrame(s) 33 | self.assertRaises(ValueError, visualization.scatterplot, df, col=0) 34 | 35 | def test_scatterplot_return_figure(self): 36 | s = pd.Series([[1, 2, 3], [1, 2, 3]]) 37 | df = pd.DataFrame(s) 38 | ret = visualization.scatterplot(df, col=0, return_figure=True) 39 | self.assertIsNotNone(ret) 40 | 41 | """ 42 | Test top_words. 43 | """ 44 | 45 | def test_top_words(self): 46 | s = pd.Series("one two two three three three") 47 | s_true = pd.Series([1, 3, 2], index=["one", "three", "two"]) 48 | self.assertEqual(visualization.top_words(s).sort_index(), s_true) 49 | 50 | def test_top_words_space_char(self): 51 | s = pd.Series("one \n\t") 52 | s_true = pd.Series([1], index=["one"]) 53 | self.assertEqual(visualization.top_words(s), s_true) 54 | 55 | def test_top_words_punctuation_between(self): 56 | s = pd.Series("can't hello-world u.s.a") 57 | s_true = pd.Series([1, 1, 1], index=["can't", "hello-world", "u.s.a"]) 58 | self.assertEqual(visualization.top_words(s).sort_index(), s_true) 59 | 60 | def test_top_words_remove_external_punctuation(self): 61 | s = pd.Series("stop. please!") 62 | s_true = pd.Series([1, 1], index=["please", "stop"]) 63 | self.assertEqual(visualization.top_words(s).sort_index(), s_true) 64 | 65 | def test_top_words_digits(self): 66 | s = pd.Series("123 hello h1n1") 67 | s_true = pd.Series([1, 1, 1], index=["123", "h1n1", "hello"]) 68 | self.assertEqual(visualization.top_words(s).sort_index(), s_true) 69 | 70 | def test_top_words_digits_punctuation(self): 71 | s = pd.Series("123. 
.321 -h1n1 -cov2") 72 | s_true = pd.Series([1, 1, 1, 1], index=["123", "321", "cov2", "h1n1"]) 73 | self.assertEqual(visualization.top_words(s).sort_index(), s_true) 74 | 75 | """ 76 | Test worcloud 77 | """ 78 | 79 | def test_wordcloud(self): 80 | s = pd.Series("one two three") 81 | self.assertEqual(visualization.wordcloud(s), None) 82 | -------------------------------------------------------------------------------- /texthero/__init__.py: -------------------------------------------------------------------------------- 1 | """Texthero: python toolkit for text preprocessing, representation and visualization. 2 | 3 | 4 | 5 | """ 6 | from . import preprocessing 7 | from .preprocessing import * 8 | 9 | from . import representation 10 | from .representation import * 11 | 12 | from . import visualization 13 | from .visualization import * 14 | 15 | from . import nlp 16 | from .nlp import * 17 | 18 | from . import stopwords 19 | -------------------------------------------------------------------------------- /texthero/_helper.py: -------------------------------------------------------------------------------- 1 | """ 2 | Useful helper functions for the texthero library. 3 | """ 4 | 5 | import pandas as pd 6 | import functools 7 | import warnings 8 | 9 | 10 | """ 11 | Warnings. 12 | """ 13 | 14 | _warning_nans_in_input = ( 15 | "There are NaNs (missing values) in the given input series." 16 | " They were replaced with appropriate values before the function" 17 | " was applied. Consider using hero.fillna to replace those NaNs yourself" 18 | " or hero.drop_no_content to remove them." 19 | ) 20 | 21 | 22 | """ 23 | Decorators. 24 | """ 25 | 26 | 27 | def handle_nans(replace_nans_with): 28 | """ 29 | Decorator to handle NaN values in a function's input. 30 | 31 | Using the decorator, if there are NaNs in the input, 32 | they are replaced with replace_nans_with 33 | and a warning is printed. 34 | 35 | The function must take as first input a Pandas Series. 36 | 37 | Examples 38 | -------- 39 | >>> from texthero._helper import handle_nans 40 | >>> import pandas as pd 41 | >>> import numpy as np 42 | >>> @handle_nans(replace_nans_with="I was missing!") 43 | ... def replace_b_with_c(s): 44 | ... return s.str.replace("b", "c") 45 | >>> s_with_nan = pd.Series(["Test b", np.nan]) 46 | >>> replace_b_with_c(s_with_nan) # doctest: +SKIP 47 | 0 Test c 48 | 1 I was missing! 49 | dtype: object 50 | """ 51 | 52 | def decorator(func): 53 | @functools.wraps(func) 54 | def wrapper(*args, **kwargs): 55 | 56 | # Get first input argument (the series) and replace the NaNs. 57 | s = args[0] 58 | if s.isna().values.any(): 59 | warnings.warn(_warning_nans_in_input, UserWarning) 60 | s = s.fillna(value=replace_nans_with) 61 | 62 | # Put the series back into the input. 63 | if args[1:]: 64 | args = (s,) + args[1:] 65 | else: 66 | args = (s,) 67 | 68 | # Apply function as usual. 69 | return func(*args, **kwargs) 70 | 71 | return wrapper 72 | 73 | return decorator 74 | -------------------------------------------------------------------------------- /texthero/stopwords.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import spacy 3 | 4 | try: 5 | # If not present, download NLTK stopwords. 
6 | nltk.data.find("corpora/stopwords") 7 | except LookupError: 8 | nltk.download("stopwords") 9 | 10 | from nltk.corpus import stopwords as nltk_en_stopwords 11 | from spacy.lang.en import stop_words as spacy_en_stopwords 12 | 13 | DEFAULT = set(nltk_en_stopwords.words("english")) 14 | NLTK_EN = DEFAULT 15 | SPACY_EN = spacy_en_stopwords.STOP_WORDS 16 | -------------------------------------------------------------------------------- /vercel.json: -------------------------------------------------------------------------------- 1 | { 2 | "github": { 3 | "silent": true 4 | } 5 | } 6 | -------------------------------------------------------------------------------- /website/blog/2017-10-24-texthero-welcome.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Texthero welcome 3 | author: Jonathan Besomi 4 | --- 5 | 6 | # Texthero welcome. 7 | 8 | 9 | Welcome to Texthero. 10 | 11 | Texthero is a python package for working with text-based dataset with ease. 12 | 13 | You can start from the online [documentation](https://texthero.org/docs/). 14 | 15 | This tab is a work in progress, soon interesting articles will pop-up. Stay tuned. 16 | -------------------------------------------------------------------------------- /website/blog/2020-04-27-rename-columns-pandas.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Groupby and rename columns in pandas 3 | author: Jonathan Besomi 4 | unlisted: True 5 | --- 6 | 7 | 8 | 9 | ## Groupby and rename columns in pandas 10 | 11 | ``` 12 | df.groupby(['artist']).mean().stack().rename_axis(['one', 'bar']).reset_index(name='ooo') 13 | ``` 14 | 15 | ``` 16 | df_empath = ( 17 | df_empath.groupby(['artist']) 18 | .max() 19 | .stack() 20 | .rename_axis(['artist', 'sentiment']) 21 | .reset_index(name='r') 22 | ) 23 | ``` 24 | -------------------------------------------------------------------------------- /website/blog/2020-04-27-text-preprocessing-with-pandas.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Text preprocessing with Pandas and Texthero. 3 | author: Jonathan Besomi 4 | unlisted: True 5 | --- 6 | 7 | ## Text preprocessing with Pandas and Texthero. 8 | -------------------------------------------------------------------------------- /website/blog/2020-05-03-text-mining-with-python.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Text mining with python 3 | author: Jonathan Besomi 4 | unlisted: True 5 | --- 6 | 7 | # Text mining with python 8 | -------------------------------------------------------------------------------- /website/blog/2020-05-03-text-unsupervised-learning.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Unsupervised text learning 3 | author: Jonathan Besomi 4 | unlisted: True 5 | --- 6 | 7 | ## Pandas and Texthero. 8 | 9 | 10 | ## Introduction 11 | 12 | - Pandas 13 | - Texthero 14 | - Unsupervised learning 15 | - Dataset: 16 | 17 | ## Text representation 18 | 19 | - TF-IDF 20 | - Count 21 | - Word2Vec 22 | 23 | ## If it is due to identical points in the dataset, removing these points may help. 24 | 25 | 26 | ## Clustering 27 | 28 | ## Unsupervised 29 | - k-means 30 | - Totally Random Trees embedding 31 | 32 | ## Semi-supervised 33 | - LDA 34 | 35 | ## Dimensionality reduction 36 | To visualize the structure of a dataset, the dimension must be reduced somehow. 
37 | 38 | - PCA 39 | - NCA 40 | - Multi-dimensional Scaling, a technique used for analyzing similarity or dissimilarity data. It attempts to model dissimilarity or similarity of data as distance in a geometric vector spaces. 41 | - t-sne 42 | 43 | ## 44 | -------------------------------------------------------------------------------- /website/blog/2020-05-08-bar-run-chart-python.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Bar run chart in python 3 | author: Jonathan Besomi 4 | unlisted: True 5 | --- 6 | 7 | # Bar run chart in python 8 | -------------------------------------------------------------------------------- /website/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | npm run build 4 | -------------------------------------------------------------------------------- /website/core/AnnouncementBar.js: -------------------------------------------------------------------------------- 1 | import React, {useState, useEffect} from 'react'; 2 | 3 | // import styles from './announcement-bar.css'; 4 | 5 | const STORAGE_DISMISS_KEY = 'docusaurus.announcement.dismiss'; 6 | const STORAGE_ID_KEY = 'docusaurus.announcement.id'; 7 | 8 | function AnnouncementBar() { 9 | 10 | 11 | //const {id, content, backgroundColor, textColor} = { 12 | // id: 'supportus', 13 | // content: 14 | // '⭐️ If you like Docusaurus, give it a star on GitHub! ⭐️', 15 | // }; 16 | 17 | const id = "supportus" 18 | const content = "⭐️ If you like Docusaurus" 19 | 20 | const [isClosed, setClosed] = useState(true); 21 | const handleClose = () => { 22 | localStorage.setItem(STORAGE_DISMISS_KEY, true); 23 | setClosed(true); 24 | }; 25 | 26 | useEffect(() => { 27 | const viewedId = localStorage.getItem(STORAGE_ID_KEY); 28 | const isNewAnnouncement = id !== viewedId; 29 | 30 | localStorage.setItem(STORAGE_ID_KEY, id); 31 | 32 | if (isNewAnnouncement) { 33 | localStorage.setItem(STORAGE_DISMISS_KEY, false); 34 | } 35 | 36 | if ( 37 | isNewAnnouncement || 38 | localStorage.getItem(STORAGE_DISMISS_KEY) === 'false' 39 | ) { 40 | setClosed(false); 41 | } 42 | }, []); 43 | 44 | if (!content || isClosed) { 45 | return null; 46 | } 47 | 48 | return ( 49 |Common NLP tasks such as named_entities, noun_chunks, etc.
| Function | Description |
| --- | --- |
| `named_entities` | Return named-entities. |
| `noun_chunks` | Return noun_chunks, group of consecutive words that belong together. |
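As a quick, hedged sketch of how the two functions above are called (the sample sentence is only an illustration; see the per-function pages below for details):

```python
import pandas as pd
import texthero as hero

s = pd.Series("Barack Obama was born in Hawaii")

# Each row becomes a list of (text, label, start character, end character) tuples.
print(hero.named_entities(s)[0])

# Each row becomes a list of (chunk, label, start character, end character) tuples.
print(hero.noun_chunks(s)[0])
```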
Map words into vectors using different algorithms such as TF-IDF, word2vec or GloVe.

| Function | Description |
| --- | --- |
| `dbscan` | Perform DBSCAN clustering. |
| `kmeans` | Perform K-means clustering algorithm. |
| `meanshift` | Perform mean shift clustering. |
| `nmf` | Perform non-negative matrix factorization. |
| `pca` | Perform principal component analysis on the given Pandas Series. |
| `term_frequency` | Represent a text-based Pandas Series using term_frequency. |
| `tfidf` | Represent a text-based Pandas Series using TF-IDF. |
| `tsne` | Perform TSNE on the given pandas series. |
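A minimal, hedged sketch of how these functions are usually chained on a DataFrame (the column names are illustrative, not part of the API):

```python
import pandas as pd
import texthero as hero

df = pd.DataFrame({"text": ["Football is great", "Python is great", "Football and Python"]})

# Clean, represent with TF-IDF, reduce to 2 dimensions, then cluster.
df["clean_text"] = hero.clean(df["text"])
df["tfidf"] = hero.tfidf(df["clean_text"])
df["pca"] = hero.pca(df["tfidf"])
df["kmeans_labels"] = hero.kmeans(df["tfidf"], n_clusters=2)
```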
Visualize insights and statistics of a text-based Pandas DataFrame.

| Function | Description |
| --- | --- |
| `scatterplot` | Show scatterplot using python plotly scatter. |
| `top_words` | Return a pandas series with index the top words and as value the count. |
| `wordcloud` | Plot wordcloud image using WordCloud from word_cloud package. |
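A short, hedged sketch combining the three functions (the `topic` column is hypothetical and only used for coloring the plot):

```python
import pandas as pd
import texthero as hero

df = pd.DataFrame({"text": ["Football is great", "Python is great", "Football and Python"]})
df["pca"] = hero.pca(hero.tfidf(hero.clean(df["text"])))
df["topic"] = ["sport", "tech", "mixed"]

# 2-dimensional scatterplot of the PCA space, colored by topic.
hero.scatterplot(df, col="pca", color="topic", title="PCA space")

# Frequency-based views of the raw text.
hero.wordcloud(df["text"])
print(hero.top_words(df["text"]).head())
```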
named_entities
(s, package='spacy')¶Return named-entities.
14 |Return a Pandas Series where each row contains a list of tuples containing information regarding the given named entities.
15 |Tuple: (entity name, entity label, starting character, ending character)
16 |Under the hood, named_entities makes use of spaCy named entity recognition.
17 |PERSON: People, including fictional.
NORP: Nationalities or religious or political groups.
FAC: Buildings, airports, highways, bridges, etc.
ORG : Companies, agencies, institutions, etc.
GPE: Countries, cities, states.
LOC: Non-GPE locations, mountain ranges, bodies of water.
PRODUCT: Objects, vehicles, foods, etc. (Not services.)
EVENT: Named hurricanes, battles, wars, sports events, etc.
WORK_OF_ART: Titles of books, songs, etc.
LAW: Named documents made into laws.
LANGUAGE: Any named language.
DATE: Absolute or relative dates or periods.
TIME: Times smaller than a day.
PERCENT: Percentage, including ”%“.
MONEY: Monetary values, including unit.
QUANTITY: Measurements, as of weight or distance.
ORDINAL: “first”, “second”, etc.
CARDINAL: Numerals that do not fall under another type.
Examples
41 |>>> import texthero as hero
42 | >>> import pandas as pd
43 | >>> s = pd.Series("Yesterday I was in NY with Bill de Blasio")
44 | >>> hero.named_entities(s)[0]
45 | [('Yesterday', 'DATE', 0, 9), ('NY', 'GPE', 19, 21), ('Bill de Blasio', 'PERSON', 27, 41)]
46 |
clean
(s: pandas.core.series.Series, pipeline=None) → pandas.core.series.Series¶Pre-process a text-based Pandas Series.
14 |texthero.preprocessing.fillna()
texthero.preprocessing.lowercase()
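A hedged usage sketch of the clean function described above (the series content is only an example; the custom pipeline shown is one assumption, any list of preprocessing functions works):

```python
import pandas as pd
import texthero as hero
from texthero import preprocessing

s = pd.Series(["  Texthero 123 cleans <b>messy</b> text!  ", None])

# Default pipeline (fillna, lowercase and the other steps listed above).
print(hero.clean(s))

# Custom pipeline: pass any list of preprocessing functions.
custom_pipeline = [preprocessing.fillna, preprocessing.lowercase, preprocessing.remove_whitespace]
print(hero.clean(s, pipeline=custom_pipeline))
```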
drop_no_content
(s: pandas.core.series.Series)¶Drop all rows without content.
14 |Drop all rows from the given Pandas Series where has_content()
is False.
Examples
16 |>>> s = pd.Series(["content", np.nan, "\t\n", " "])
17 | >>> drop_no_content(s)
18 | 0 content
19 | dtype: object
20 |
get_default_pipeline
() → List[Callable[[pandas.core.series.Series], pandas.core.series.Series]]¶Return a list containing all the methods used in the default cleaning pipeline.
14 |texthero.preprocessing.fillna()
texthero.preprocessing.lowercase()
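A small, hedged sketch of how the returned list can be reused and extended (appending remove_urls here is only an illustration):

```python
import pandas as pd
import texthero as hero
from texthero import preprocessing

pipeline = preprocessing.get_default_pipeline()
pipeline.append(preprocessing.remove_urls)

s = pd.Series("Visit https://texthero.org for more examples 123")
print(hero.clean(s, pipeline=pipeline))
```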
has_content
(s: pandas.core.series.Series)¶Return a Boolean Pandas Series indicating whether each row has content.
14 |Examples
15 |>>> s = pd.Series(["content", np.nan, "\t\n", " "])
16 | >>> has_content(s)
17 | 0 True
18 | 1 False
19 | 2 False
20 | 3 False
21 | dtype: bool
22 |
remove_angle_brackets
(s: pandas.core.series.Series)¶Remove content within angle brackets <> and the angle brackets.
14 |See also
16 |remove_brackets()
remove_round_brackets()
remove_curly_brackets()
remove_square_brackets()
Examples
24 |>>> s = pd.Series("Texthero <is not a superhero!>")
25 | >>> remove_angle_brackets(s)
26 | 0 Texthero
27 | dtype: object
28 |
remove_brackets
(s: pandas.core.series.Series)¶Remove content within brackets and the brackets themselves.
14 |Remove content from any kind of brackets, (), [], {}, <>.
15 |See also
17 |remove_round_brackets()
remove_curly_brackets()
remove_square_brackets()
remove_angle_brackets()
Examples
25 |>>> s = pd.Series("Texthero (round) [square] [curly] [angle]")
26 | >>> remove_brackets(s)
27 | 0 Texthero
28 | dtype: object
29 |
remove_curly_brackets
(s: pandas.core.series.Series)¶Remove content within curly brackets {} and the curly brackets.
14 |See also
16 |remove_brackets()
remove_angle_brackets()
remove_round_brackets()
remove_square_brackets()
Examples
24 |>>> s = pd.Series("Texthero {is not a superhero!}")
25 | >>> remove_curly_brackets(s)
26 | 0 Texthero
27 | dtype: object
28 |
remove_diacritics
(input: pandas.core.series.Series) → pandas.core.series.Series¶Remove all diacritics and accents.
14 |Remove all diacritics and accents from any word and characters from the given Pandas Series. Return a cleaned version of the Pandas Series.
15 |Examples
16 |>>> import texthero as hero
17 | >>> import pandas as pd
18 | >>> s = pd.Series("Noël means Christmas in French")
19 | >>> hero.remove_diacritics(s)
20 | 0 Noel means Christmas in French
21 | dtype: object
22 |
remove_digits
(input: pandas.core.series.Series, only_blocks=True) → pandas.core.series.Series¶Remove all digits and replace them with a single space.
14 |By default, only removes “blocks” of digits. For instance, 1234 falcon9 becomes ` falcon9`.
15 |When the argument only_blocks is set to False, remove any digits.
16 |See also replace_digits()
to replace digits with another string.
Remove only blocks of digits.
22 |Examples
27 |>>> import texthero as hero
28 | >>> import pandas as pd
29 | >>> s = pd.Series("7ex7hero is fun 1111")
30 | >>> hero.preprocessing.remove_digits(s)
31 | 0 7ex7hero is fun
32 | dtype: object
33 | >>> hero.preprocessing.remove_digits(s, only_blocks=False)
34 | 0 ex hero is fun
35 | dtype: object
36 |
Remove html tags from the given Pandas Series.
14 |Remove all html tags of the type <.*?> such as <html>, <p>, <div class="hello">, remove all html escapes of type &nbsp; and return a cleaned Pandas Series.
15 |Examples
16 |>>> s = pd.Series("<html><h1>Title</h1></html>")
17 | >>> remove_html_tags(s)
18 | 0 Title
19 | dtype: object
20 |
remove_punctuation
(input: pandas.core.series.Series) → pandas.core.series.Series¶Replace all punctuation with a single space (” “).
14 |remove_punctuation removes all punctuation from the given Pandas Series and replaces it with a single space. It considers as punctuation characters all string.punctuation
symbols !”#$%&’()*+,-./:;<=>?@[]^_`{|}~).
See also replace_punctuation()
to replace punctuation with a custom symbol.
Examples
17 |>>> import texthero as hero
18 | >>> import pandas as pd
19 | >>> s = pd.Series("Finnaly.")
20 | >>> hero.remove_punctuation(s)
21 | 0 Finnaly
22 | dtype: object
23 |
remove_round_brackets
(s: pandas.core.series.Series)¶Remove content within parentheses () and the parentheses themselves.
14 |See also
16 |remove_brackets()
remove_angle_brackets()
remove_curly_brackets()
remove_square_brackets()
Examples
24 |>>> s = pd.Series("Texthero (is not a superhero!)")
25 | >>> remove_round_brackets(s)
26 | 0 Texthero
27 | dtype: object
28 |
remove_square_brackets
(s: pandas.core.series.Series)¶Remove content within square brackets [] and the square brackets.
14 |See also
16 |remove_brackets()
remove_angle_brackets()
remove_round_brackets()
remove_curly_brackets()
Examples
24 |>>> s = pd.Series("Texthero [is not a superhero!]")
25 | >>> remove_square_brackets(s)
26 | 0 Texthero
27 | dtype: object
28 |
remove_stopwords
(input: pandas.core.series.Series, stopwords: Union[Set[str], NoneType] = None, remove_str_numbers=False) → pandas.core.series.Series¶Remove all instances of words.
14 |By default uses NLTK’s english stopwords of 179 words:
15 |Set of stopword strings to remove. If not passed, by default it uses NLTK English stopwords.
20 |Examples
25 |Using default NLTK list of stopwords:
26 |>>> import texthero as hero
27 | >>> import pandas as pd
28 | >>> s = pd.Series("Texthero is not only for the heroes")
29 | >>> hero.remove_stopwords(s)
30 | 0 Texthero heroes
31 | dtype: object
32 |
Add custom words into the default list of stopwords:
35 |>>> import texthero as hero
36 | >>> from texthero import stopwords
37 | >>> import pandas as pd
38 | >>> default_stopwords = stopwords.DEFAULT
39 | >>> custom_stopwords = default_stopwords.union(set(["heroes"]))
40 | >>> s = pd.Series("Texthero is not only for the heroes")
41 | >>> hero.remove_stopwords(s, custom_stopwords)
42 | 0 Texthero
43 | dtype: object
44 |
remove_urls
(s: pandas.core.series.Series) → pandas.core.series.Series¶Remove all urls from a given Pandas Series.
14 |remove_urls removes any urls and replaces them with a single space.
15 |See also
17 |texthero.preprocessing.replace_urls()
Examples
22 |>>> import texthero as hero
23 | >>> import pandas as pd
24 | >>> s = pd.Series("Go to: https://example.com")
25 | >>> hero.remove_urls(s)
26 | 0 Go to:
27 | dtype: object
28 |
remove_whitespace
(input: pandas.core.series.Series) → pandas.core.series.Series¶Remove any extra white spaces.
14 |Remove any extra whitespace in the given Pandas Series. Also removes newlines, tabs, and any other form of space.
15 |Useful when there is a need to visualize a Pandas Series and most cells have many newlines or other kind of space characters.
16 |Examples
17 |>>> import texthero as hero
18 | >>> import pandas as pd
19 | >>> s = pd.Series("Title \n Subtitle \t ...")
20 | >>> hero.remove_whitespace(s)
21 | 0 Title Subtitle ...
22 | dtype: object
23 |
replace_punctuation
(input: pandas.core.series.Series, symbol: str = ' ') → pandas.core.series.Series¶Replace all punctuation with a given symbol.
14 |replace_punctuation replaces all punctuation in the given Pandas Series with a custom symbol. It considers as punctuation characters all string.punctuation
symbols !”#$%&’()*+,-./:;<=>?@[]^_`{|}~).
Symbol to use as replacement for all string punctuation.
20 |Examples
25 |>>> import texthero as hero
26 | >>> import pandas as pd
27 | >>> s = pd.Series("Finnaly.")
28 | >>> hero.replace_punctuation(s, " <PUNCT> ")
29 | 0 Finnaly <PUNCT>
30 | dtype: object
31 |
replace_stopwords
(input: pandas.core.series.Series, symbol: str, stopwords: Union[Set[str], NoneType] = None) → pandas.core.series.Series¶Replace all instances of words with symbol.
14 |By default uses NLTK’s english stopwords of 179 words.
15 |Character(s) to replace words with.
20 |Set of stopwords string to remove. If not passed, by default it used NLTK English stopwords.
22 |Examples
27 |>>> s = pd.Series("the book of the jungle")
28 | >>> replace_stopwords(s, "X")
29 | 0 X book X X jungle
30 | dtype: object
31 |
replace_urls
(s: pandas.core.series.Series, symbol: str) → pandas.core.series.Series¶Replace all urls with the given symbol.
14 |replace_urls replaces any urls in the given Pandas Series with the given symbol.
15 |See also
17 |texthero.preprocessing.remove_urls()
Examples
22 |>>> import texthero as hero
23 | >>> import pandas as pd
24 | >>> s = pd.Series("Go to: https://example.com")
25 | >>> hero.replace_urls(s, "<URL>")
26 | 0 Go to: <URL>
27 | dtype: object
28 |
stem
(input: pandas.core.series.Series, stem='snowball', language='english') → pandas.core.series.Series¶Stem series using either porter or snowball NLTK stemmers.
14 |The act of stemming means removing the end of a word with a heuristic process. It is useful in contexts where the meaning of the word matters more than its exact derivation. Stemming is very efficient and well suited when the given dataset is large.
15 |texthero.preprocessing.stem make use of two NLTK stemming algorithms known as nltk.stem.SnowballStemmer
and nltk.stem.PorterStemmer
. SnowballStemmer should be used when the Pandas Series contains non-English text, as it has multilanguage support.
Stemming algorithm. It can be either ‘snowball’ or ‘porter’
21 |Supported languages: danish, dutch, english, finnish, french, german , hungarian, italian, norwegian, portuguese, romanian, russian, spanish and swedish.
23 |Notes
28 |By default NLTK stemming algorithms lowercase all text.
29 |Examples
30 |>>> import texthero as hero
31 | >>> import pandas as pd
32 | >>> s = pd.Series("I used to go \t\n running.")
33 | >>> hero.preprocessing.stem(s)
34 | 0 i use to go running.
35 | dtype: object
36 |
tokenize
(s: pandas.core.series.Series) → pandas.core.series.Series¶Tokenize each row of the given Series.
14 |Tokenize each row of the given Pandas Series and return a Pandas Series where each row contains a list of tokens.
15 |Algorithm: add a space around any punctuation symbol, except when the symbol lies between two alphanumeric characters, then split on spaces.
17 |Examples
18 |>>> import texthero as hero
19 | >>> import pandas as pd
20 | >>> s = pd.Series(["Today you're looking great!"])
21 | >>> hero.tokenize(s)
22 | 0 [Today, you're, looking, great, !]
23 | dtype: object
24 |
kmeans
(s: pandas.core.series.Series, n_clusters=5, init='k-means++', n_init=10, max_iter=300, tol=0.0001, precompute_distances='auto', verbose=0, random_state=None, copy_x=True, n_jobs=- 1, algorithm='auto')¶Perform K-means clustering algorithm.
14 |pca
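No example is given above, so here is a hedged sketch, assuming the Series has already been turned into numeric vectors (for instance with tfidf and pca from the representation module):

```python
import pandas as pd
import texthero as hero

s = pd.Series(["football game", "tennis match", "python code", "java code"])

# Represent the text first, then cluster the resulting vectors.
vectors = hero.pca(hero.tfidf(hero.clean(s)))

# One cluster id per row.
labels = hero.kmeans(vectors, n_clusters=2)
print(labels)
```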
(s, n_components=2)¶Perform principal component analysis on the given Pandas Series.
14 |In general, pca should be called after the text has already been represented.
15 |Number of components to keep. If n_components is not set or None, all components are kept.
20 |Examples
25 |>>> import texthero as hero
26 | >>> import pandas as pd
27 | >>> s = pd.Series(["Sentence one", "Sentence two"])
28 |
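The snippet above stops before the actual call; a hedged continuation (the exact numeric output depends on the TF-IDF weighting, so it is omitted) could look like:

```python
import pandas as pd
import texthero as hero

s = pd.Series(["Sentence one", "Sentence two"])

# Represent the text, then keep the first 2 principal components.
s_pca = hero.pca(hero.tfidf(s), n_components=2)
print(s_pca)
```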
term_frequency
(s: pandas.core.series.Series, max_features: Union[int, NoneType] = None, return_feature_names=False)¶Represent a text-based Pandas Series using term_frequency.
14 |Maximum number of features to keep.
19 |If True, return a tuple (term_frequency_series, features_names)
21 |Examples
26 |>>> import texthero as hero
27 | >>> import pandas as pd
28 | >>> s = pd.Series(["Sentence one", "Sentence two"])
29 | >>> hero.term_frequency(s)
30 | 0 [1, 1, 0]
31 | 1 [1, 0, 1]
32 | dtype: object
33 |
To return the features_names:
36 |>>> import texthero as hero
37 | >>> import pandas as pd
38 | >>> s = pd.Series(["Sentence one", "Sentence two"])
39 | >>> hero.term_frequency(s, return_feature_names=True)
40 | (0 [1, 1, 0]
41 | 1 [1, 0, 1]
42 | dtype: object, ['Sentence', 'one', 'two'])
43 |
tfidf
(s: pandas.core.series.Series, max_features=None, min_df=1, return_feature_names=False)¶Represent a text-based Pandas Series using TF-IDF.
14 |Maximum number of features to keep.
19 |When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold.
21 |If True, return a tuple (tfidf_series, features_names)
23 |Examples
28 |>>> import texthero as hero
29 | >>> import pandas as pd
30 | >>> s = pd.Series(["Sentence one", "Sentence two"])
31 | >>> hero.tfidf(s)
32 | 0 [0.5797386715376657, 0.8148024746671689, 0.0]
33 | 1 [0.5797386715376657, 0.0, 0.8148024746671689]
34 | dtype: object
35 |
To return the feature_names:
38 |>>> import texthero as hero
39 | >>> import pandas as pd
40 | >>> s = pd.Series(["Sentence one", "Sentence two"])
41 | >>> hero.tfidf(s, return_feature_names=True)
42 | (0 [0.5797386715376657, 0.8148024746671689, 0.0]
43 | 1 [0.5797386715376657, 0.0, 0.8148024746671689]
44 | dtype: object, ['Sentence', 'one', 'two'])
45 |
tsne
(s: pandas.core.series.Series, n_components=2, perplexity=30.0, early_exaggeration=12.0, learning_rate=200.0, n_iter=1000, n_iter_without_progress=300, min_grad_norm=1e-07, metric='euclidean', init='random', verbose=0, random_state=None, method='barnes_hut', angle=0.5, n_jobs=- 1)¶Perform TSNE on the given pandas series.
14 |Number of components to keep. If n_components is not set or None, all components are kept.
19 |scatterplot
(df: pandas.core.frame.DataFrame, col: str, color: str = None, hover_data=None, title='', return_figure=False)¶Show scatterplot using python plotly scatter.
14 |The name of the column of the DataFrame used for x and y axis.
19 |top_words
(s: pandas.core.series.Series, normalize=False) → pandas.core.series.Series¶Return a pandas series with index the top words and as value the count.
14 |Tokenization: split by space and remove all punctuations that are not between characters.
15 |When set to true, return normalized values.
19 |wordcloud
(s: pandas.core.series.Series, font_path: str = None, width: int = 400, height: int = 200, max_words=200, mask=None, contour_width=0, contour_color='PAPAYAWHIP', background_color='PAPAYAWHIP', relative_scaling='auto', colormap=None, return_figure=False)¶Plot wordcloud image using WordCloud from word_cloud package.
14 |Most of the arguments are very similar if not equal to the mother function. In contrast, all words are taken into account when computing the wordcloud, including stopwords. They can be easily removed with preprocessing.remove_stopwords.
15 |Words are computed using generate_from_frequencies.
16 |Font path to the font that will be used (OTF or TTF). Defaults to DroidSansMono path on a Linux machine. If you are on another OS or don’t have this font, you need to adjust this path.
21 |Width of the canvas.
23 |Height of the canvas.
25 |The maximum number of words.
27 |When set, gives a binary mask on where to draw words. When set, width and height will be ignored and the shape of mask will be used instead. All white (#FF or #FFFFFF) entries will be considered "masked out" while other entries will be free to draw on.
29 |If mask is not None and contour_width > 0, draw the mask contour.
31 |Mask contour color.
33 |Smallest font size to use. Will stop when there is no more room in this size.
35 |Background color for the word cloud image.
37 |Maximum font size for the largest word. If None, height of the image is used.
Importance of relative word frequencies for font-size. With relative_scaling=0, only word-ranks are considered. With relative_scaling=1, a word that is twice as frequent will have twice the size. If you want to consider the word frequencies and not only their rank, relative_scaling around .5 often looks good. If 'auto' it will be set to 0.5 unless repeat is true, in which case it will be set to 0.
47 |Matplotlib colormap to randomly draw colors from for each word.
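A hedged example of the most common parameters (font availability differs per OS, so font_path is left at its default here; the sample text is only an illustration):

```python
import pandas as pd
import texthero as hero

s = pd.Series(["texthero makes text preprocessing representation and visualization easy"])

# Basic word cloud with a custom canvas size and colormap.
hero.wordcloud(s, width=600, height=300, max_words=100, background_color="white", colormap="viridis")
```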
49 |This project is maintained by a dedicated group of people.
47 |This project is used by all these people
186 |This project is used by many folks
35 |Are you using this project?
38 | 39 | Add your company 40 | 41 |