├── tests
│   ├── __init__.py
│   ├── test_grams.py
│   ├── test_vectorize_spacy.py
│   ├── test_read_file.py
│   ├── test_stopwords.py
│   ├── test_transform.py
│   └── test_clean.py
├── signs
│   ├── grams
│   │   ├── __init__.py
│   │   └── grams.py
│   ├── utils
│   │   ├── __init__.py
│   │   ├── html_print.py
│   │   ├── file.py
│   │   ├── converters.py
│   │   └── stopwords.py
│   ├── models
│   │   ├── __init__.py
│   │   └── mlp.py
│   ├── similarity
│   │   ├── __init__.py
│   │   ├── similarity.py
│   │   └── doc_similarity.py
│   ├── commands
│   │   ├── __init__.py
│   │   ├── verbatims.py
│   │   ├── autoprep.py
│   │   ├── describe.py
│   │   ├── transform.py
│   │   ├── embeds.py
│   │   ├── stopwords.py
│   │   ├── docsimilarity.py
│   │   ├── load_vectors.py
│   │   ├── clean.py
│   │   └── preds.py
│   ├── preprocess
│   │   ├── __init__.py
│   │   ├── tokenize.py
│   │   ├── embedding_matrix.py
│   │   ├── embedding_index.py
│   │   └── transforms.py
│   ├── vectorize
│   │   ├── __init__.py
│   │   ├── spacy.py
│   │   ├── sentence_embeddings.py
│   │   ├── gensim_doc2vec_train.py
│   │   └── huggingface_embeddings.py
│   └── __init__.py
├── .coveragerc
├── logo.png
├── requirements.txt
├── .github
│   ├── PULL_REQUEST_TEMPLATE.md
│   ├── workflows
│   │   ├── ci-deploy.yml
│   │   └── ci-push.yml
│   └── ISSUE_TEMPLATE
│       ├── feature-request.md
│       ├── support-request.md
│       └── bug-report.md
├── test-ci.py
├── LICENSE
├── .gitignore
├── setup.py
├── README.md
├── examples
│   ├── Cleaning Documents for NLP with Signs.ipynb
│   ├── Rapid NLP Development Examples with Signs.ipynb
│   └── Fake News Classification Signs Embeds Pipeline and Hyperparameter Optimization for Keras and TF2.ipynb
└── CONTRIBUTING.md
/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/signs/grams/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/signs/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/signs/models/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/signs/similarity/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/signs/commands/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/signs/preprocess/__init__.py:
--------------------------------------------------------------------------------
1 | from .transforms import *
--------------------------------------------------------------------------------
/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 |
3 | omit =
4 | signs/tests/*
--------------------------------------------------------------------------------
/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/autonomio/signs/HEAD/logo.png
--------------------------------------------------------------------------------
/signs/vectorize/__init__.py:
--------------------------------------------------------------------------------
1 | from .huggingface_embeddings import HuggingfaceEmbeddings
2 | from .sentence_embeddings import SentenceEmbeddings
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | kerasplotlib
2 | wrangle
3 | numpy
4 | pandas
5 | cython
6 | spacy
7 | keras
8 | gensim
9 | ipython
10 | sentence_transformers
--------------------------------------------------------------------------------
/tests/test_grams.py:
--------------------------------------------------------------------------------
1 | def test_grams():
2 |
3 | from signs import Grams
4 |
5 | docs = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
6 |
7 | grams = Grams(docs)
8 |
9 | grams.combinations()
10 | grams.flexgrams()
11 | grams.ngrams()
12 | grams.skipgrams()
13 | grams.sobolgrams(n=50)
14 |
--------------------------------------------------------------------------------
/tests/test_vectorize_spacy.py:
--------------------------------------------------------------------------------
1 | def test_vectorize_spacy():
2 |
3 | from signs.vectorize.spacy import spacy_load, spacy_word2vec
4 |
5 | model = spacy_load()
6 | model = spacy_load('en')
7 |
8 | test = round(sum(spacy_word2vec('test', model)), 2)
9 | test2 = round(sum(spacy_word2vec('testing sentence', model)), 2)
10 |
--------------------------------------------------------------------------------
/signs/utils/html_print.py:
--------------------------------------------------------------------------------
1 | def html_print(text, title=''):
2 |
3 | from IPython.core.display import display, HTML
4 |
5 | # create title for the content
6 | display(HTML("" + str(title) + ""))
7 |
8 | # create content
9 | html = display(HTML("" + text + ""))
10 |
11 | return html
12 |
--------------------------------------------------------------------------------
/signs/utils/file.py:
--------------------------------------------------------------------------------
1 | def read_file(filename):
2 |
3 | '''Takes as input a file where each row will become
4 | a document in the output list
5 |
6 | filename : str
7 | Local file with content
8 |
9 | '''
10 |
11 | with open(filename) as f:
12 | content = f.readlines()
13 |
14 | content = [x for x in content]
15 |
16 | return content
17 |
--------------------------------------------------------------------------------
/tests/test_read_file.py:
--------------------------------------------------------------------------------
1 | def test_read_file():
2 |
3 | from signs.utils.file import read_file
4 |
5 | s = "here is test \n that goes into a file \n"
6 | f = open('temp.txt', 'w')
7 | f.write(s)
8 | f.close()
9 |
10 | out = read_file('temp.txt')
11 | if out != ['here is test \n', ' that goes into a file \n']:
12 | raise ValueError('read_file() test failed')
13 |
--------------------------------------------------------------------------------
/signs/utils/converters.py:
--------------------------------------------------------------------------------
1 | def embeds_to_text(x_test, word_index):
2 |
3 | '''EMBEDS TO TEXT CONVERTER
4 |
5 | Takes in Keras padded sequences and converts them back to text.
6 | Note that the resulting strings are limited by max_words
7 | from tokenization.'''
8 |
9 | l = []
10 |
11 | for i in range(len(x_test)):
12 |
13 | inv_map = {v: k for k, v in word_index.items()}
14 | words = list(map(inv_map.get, x_test[i]))
15 | sentence = ' '.join([x for x in words if x is not None])
16 | l.append(sentence)
17 |
18 | return l
19 |
--------------------------------------------------------------------------------
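
A minimal sketch of embeds_to_text() above; the word_index and the padded integer sequences are hypothetical stand-ins for the output of tokenize():

    from signs.utils.converters import embeds_to_text

    # hypothetical word index as produced by tokenize()
    word_index = {'jack': 1, 'is': 2, 'a': 3, 'cat': 4}

    # padded sequences; id 0 is padding and maps back to nothing
    x_test = [[1, 2, 3, 4, 0, 0], [4, 2, 1, 0, 0, 0]]

    print(embeds_to_text(x_test, word_index))
    # ['jack is a cat', 'cat is jack']
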
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | ## You want to make a PR to Signs
2 |
3 | Thanks so much :) First, please take a moment to carefully check through
4 | the below items:
5 |
6 | #### Sanity
7 |
8 | - [ ] I'm aware of the implications of the proposed changes
9 | - [ ] Code is [PEP8](https://www.python.org/dev/peps/pep-0008/)
10 | - [ ] I'm making the PR to `master`
11 |
12 | #### Tests
13 |
14 | - [ ] Changes have gone through actual use testing
15 | - [ ] All local tests have passed (run ./test-ci.py in /signs)
16 | - [ ] Tests have been updated to reflect the changes
17 |
18 |
19 |
--------------------------------------------------------------------------------
/signs/commands/verbatims.py:
--------------------------------------------------------------------------------
1 | class Verbatims:
2 |
3 | def __init__(self, text):
4 |
5 | self.text = text
6 | self.counter = len(text)
7 |
8 | def verbatims(self, keyword, no_of_words=1):
9 |
10 | # the output
11 | out = []
12 |
13 | # process the doc word-by-word
14 | for i in range(self.counter):
15 | try:
16 | if self.text[i] == keyword:
17 | words = self.text[i-no_of_words:i+1+no_of_words]
18 | out.append(' '.join(words))
19 | except IndexError:
20 | pass
21 |
22 | return out
23 |
--------------------------------------------------------------------------------
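
A minimal usage sketch for Verbatims above, assuming the input has already been tokenized (for example with Transform().tokens_flat()):

    from signs import Verbatims

    tokens = ['jack', 'is', 'a', 'green', 'cat', 'with', 'a', 'hat']

    v = Verbatims(tokens)

    # keyword in context: one word on each side of 'cat'
    print(v.verbatims('cat', no_of_words=1))
    # ['green cat with']
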
/signs/commands/autoprep.py:
--------------------------------------------------------------------------------
1 | class AutoPrep:
2 |
3 | def __init__(self, docs):
4 |
5 | self.docs = self._format(docs)
6 | self.docs = self._clean()
7 |
8 | def _format(self, docs):
9 |
10 | # input is a single string
11 | if isinstance(docs, str):
12 | return [docs]
13 | # input is a list of strings
14 | if isinstance(docs[0], str):
15 | return docs
16 | # input is a list of lists, each containing a string
17 | if isinstance(docs[0], list):
18 | return [doc[0] for doc in docs]
19 |
20 | def _clean(self):
21 |
22 | from signs import Clean
23 | return [[Clean(doc).text] for doc in self.docs]
24 |
--------------------------------------------------------------------------------
/test-ci.py:
--------------------------------------------------------------------------------
1 | from tests.test_clean import test_clean
2 | from tests.test_grams import test_grams
3 | from tests.test_stopwords import test_stopwords
4 | from tests.test_vectorize_spacy import test_vectorize_spacy
5 | from tests.test_read_file import test_read_file
6 | from tests.test_transform import test_transform
7 |
8 | print('Testing Clean() ...')
9 | test_clean()
10 |
11 | print('Testing Grams() ...')
12 | test_grams()
13 |
14 | print('Testing Stopwords() ...')
15 | test_stopwords()
16 |
17 | print('Testing Transform() ...')
18 | test_transform()
19 |
20 | print('Testing vectorize_spacy()')
21 | test_vectorize_spacy()
22 |
23 | print('Testing read_file()')
24 | test_read_file()
25 |
--------------------------------------------------------------------------------
/signs/preprocess/tokenize.py:
--------------------------------------------------------------------------------
1 | def tokenize(texts, num_words=None, maxlen=12, padding='post'):
2 |
3 | '''TOKENIZE
4 |
5 | Creates a word index and returns the padded docs.
6 |
7 | '''
8 |
9 | from keras.preprocessing.text import Tokenizer
10 | from keras.preprocessing.sequence import pad_sequences
11 |
12 | tokenizer = Tokenizer(num_words=num_words)
13 | tokenizer.fit_on_texts(texts)
14 | encoded_docs = tokenizer.texts_to_sequences(texts)
15 | word_index = tokenizer.word_index
16 |
17 | x = pad_sequences(encoded_docs, maxlen=maxlen, padding=padding)
18 | vocab_size = max(list(word_index.values())) + 1
19 |
20 | return x, vocab_size, word_index
21 |
--------------------------------------------------------------------------------
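
A minimal sketch of tokenize() above with two illustrative documents:

    from signs.preprocess.tokenize import tokenize

    docs = ['jack is a green cat', 'the cat has a hat']

    x, vocab_size, word_index = tokenize(docs, maxlen=6)

    print(x.shape)            # (2, 6) padded integer sequences
    print(vocab_size)         # number of distinct words + 1
    print(word_index['cat'])  # integer id assigned to 'cat'
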
/signs/similarity/similarity.py:
--------------------------------------------------------------------------------
1 | def similarity(self, word1, word2):
2 |
3 | '''WORD SIMILARITY
4 | Takes as input two words and returns similarity.
5 |
6 | PARAMS
7 | ------
8 | word1 :: a string
9 | word2 :: another string
10 |
11 |
12 | EXAMPLE
13 | -------
14 | get = Signs(backend='gensim',
15 | trained_vectors="e:/signs_dev/vectors/glove/glove.twitter.27B.25d.txt")
16 | get.similarity('donald', 'trump')
17 |
18 | '''
19 |
20 | if self.backend == 'gensim':
21 | return self.model.similarity(word1, word2)
22 |
23 | if self.backend == 'spacy':
24 |
25 | a = self.model(word1)
26 | b = self.model(word2)
27 |
28 | return a.similarity(b)
29 |
--------------------------------------------------------------------------------
/signs/preprocess/embedding_matrix.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | def embedding_matrix(word_index,
5 | vector_dims,
6 | embeddings_index,
7 | max_num_words=None):
8 |
9 | if max_num_words is None:
10 | max_num_words = len(word_index) + 1
11 |
12 | num_words = min(max_num_words, len(word_index) + 1)
13 | embedding_matrix = np.zeros((num_words, vector_dims))
14 |
15 | for word, i in word_index.items():
16 | if i >= num_words:
17 | continue
18 | embedding_vector = embeddings_index.get(word)
19 |
20 | if embedding_vector is not None:
21 | embedding_matrix[i] = embedding_vector
22 |
23 | return embedding_matrix
24 |
--------------------------------------------------------------------------------
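
A minimal sketch of embedding_matrix() above with a hand-made word index and embeddings index; in practice these come from tokenize() and embedding_index():

    import numpy as np

    from signs.preprocess.embedding_matrix import embedding_matrix

    word_index = {'cat': 1, 'hat': 2}
    embeddings_index = {'cat': np.array([0.1, 0.2], dtype='float32'),
                        'hat': np.array([0.3, 0.4], dtype='float32')}

    matrix = embedding_matrix(word_index, 2, embeddings_index)

    print(matrix.shape)  # (3, 2); row 0 stays all zeros for padding
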
/.github/workflows/ci-deploy.yml:
--------------------------------------------------------------------------------
1 | name: Upload Python Package
2 |
3 | on:
4 | release:
5 | types: [created]
6 |
7 | jobs:
8 | deploy:
9 | runs-on: ubuntu-latest
10 | steps:
11 | - uses: actions/checkout@v1
12 | - name: Set up Python
13 | uses: actions/setup-python@v1
14 | with:
15 | python-version: '3.x'
16 | - name: Install dependencies
17 | run: |
18 | python -m pip install --upgrade pip
19 | pip install setuptools wheel twine
20 | - name: Build and publish
21 | env:
22 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
23 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
24 | run: |
25 | python setup.py sdist bdist_wheel
26 | twine upload dist/*
27 |
--------------------------------------------------------------------------------
/tests/test_stopwords.py:
--------------------------------------------------------------------------------
1 | def test_stopwords():
2 |
3 | from signs import Stopwords, Transform
4 |
5 | doc = ' Jack is a green 😂😂😂 cat... \n with a hat \n '
6 |
7 | # transform doc/s to the right format
8 | tokens = Transform([doc]).tokens()
9 |
10 | # filter the docs
11 | filtered_tokens = Stopwords(tokens)
12 |
13 | # then access the filtered docs
14 | filtered_tokens.docs
15 |
16 | # set minimum length for words
17 | Stopwords(tokens, min_length=3)
18 |
19 | # set maximum threshold for words (accept all words above this)
20 | Stopwords(tokens, max_threshold=8)
21 |
22 | # add custom words
23 | Stopwords(tokens, add_stopwords=['jack'])
24 |
25 | # just use custom words
26 | Stopwords(tokens, common_stopwords=False, add_stopwords=['jack']).docs
--------------------------------------------------------------------------------
/signs/vectorize/spacy.py:
--------------------------------------------------------------------------------
1 | def spacy_load(trained_vectors='en'):
2 |
3 | '''LOAD SPACY MODEL
4 |
5 | The default option 'en' loads the spacy default vectors.
6 | Alternatively, a local vector file or one of the
7 | other spacy models can be loaded.
8 |
9 | '''
10 |
11 | import spacy
12 |
13 | if trained_vectors == 'en':
14 |
15 | import en_core_web_sm
16 | return en_core_web_sm.load()
17 |
18 | else:
19 | return spacy.load(trained_vectors)
20 |
21 |
22 | def spacy_word2vec(string, model):
23 |
24 | '''GET VECTOR REPRESENTATION
25 |
26 | Takes as input a string (word or sentence) and
27 | returns the vector representation based on trained vectors.
28 |
29 | '''
30 |
31 | return model(string).vector
32 |
--------------------------------------------------------------------------------
/signs/vectorize/sentence_embeddings.py:
--------------------------------------------------------------------------------
1 | from sentence_transformers import SentenceTransformer
2 |
3 |
4 | class SentenceEmbeddings:
5 | """
6 | Loads a model available from sbert.net
7 | Arguments:
8 | checkpoint|str| model name available from sbert.net
9 | """
10 |
11 | def __init__(self, checkpoint="paraphrase-MiniLM-L6-v2"):
12 | self.checkpoint = checkpoint
13 | self.model = SentenceTransformer(checkpoint)
14 |
15 | def generate_sentence_embeddings(self, docs):
16 | """
17 | Generates sentence embeddings
18 | Arguments:
19 | `Docs`| `String` or `List` of `Strings`| The docs to be encoded
20 | Returns:
21 | list of embeddings
22 |
23 | """
24 | return self.model.encode(docs)
25 |
--------------------------------------------------------------------------------
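
A minimal usage sketch for SentenceEmbeddings above; the default paraphrase-MiniLM-L6-v2 checkpoint is downloaded from sbert.net on first use:

    from signs.vectorize import SentenceEmbeddings

    se = SentenceEmbeddings()

    embeddings = se.generate_sentence_embeddings(['jack is a green cat',
                                                  'the cat has a hat'])

    print(embeddings.shape)  # (2, embedding_dim)
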
/signs/__init__.py:
--------------------------------------------------------------------------------
1 | import warnings
2 | warnings.simplefilter('ignore')
3 |
4 | from .commands.embeds import Embeds
5 | from .commands.clean import Clean
6 | from .commands.preds import Preds
7 | from .commands.load_vectors import LoadVectors
8 | from .commands.verbatims import Verbatims
9 | from .commands.describe import Describe
10 | from .commands.transform import Transform
11 | from .commands.docsimilarity import DocSimilarity
12 | from .commands.stopwords import Stopwords
13 | from .commands.autoprep import AutoPrep
14 |
15 | from .grams.grams import Grams
16 | from .vectorize.gensim_doc2vec_train import gensim_doc2vec_train as TrainDoc2Vec
17 | import signs.preprocess
18 |
19 | __all__ = ['Embeds', 'Clean', 'Preds', 'LoadVectors', 'Grams', 'Describe', 'Transform']
20 |
21 | del commands, utils, grams, vectorize, signs
22 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature-request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature Request
3 | about: I want to suggest a new feature
4 |
5 | ---
6 |
7 | Thanks a lot for suggesting a feature to Signs. Please take a moment to go through the below checklist to provide context in a way that makes it easy to take your request forward.
8 |
9 | #### 1) I think Signs should add
10 |
11 | *A description of the feature with as much detail as you believe is valuable*
12 |
13 | #### 2) Once implemented, I can see how this feature will
14 |
15 | *Explain how researchers will benefit from having this feature in Signs*
16 |
17 | #### 3) I believe this feature is
18 |
19 | - [ ] critically important
20 | - [ ] must have
21 | - [ ] nice to have
22 |
23 | #### 4) Given the chance, I'd be happy to make a PR for this feature
24 |
25 | - [ ] definitely
26 | - [ ] possibly
27 | - [ ] unlikely
28 |
29 | ---
30 |
--------------------------------------------------------------------------------
/signs/preprocess/embedding_index.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | def embedding_index(trained_vectors):
5 |
6 | '''KERAS EMBEDDING INDEX
7 |
8 | Takes in trained vectors (e.g. GloVe) and returns
9 | a Keras embeddings index.
10 |
11 | PARAMS
12 | ------
13 |
14 | trained_vectors :: a file with the trained vectors
15 |
16 | '''
17 |
18 | embeddings_index = {}
19 |
20 | with open(trained_vectors, encoding='utf-8') as f:
21 |
22 | for line in f:
23 |
24 | values = line.split()
25 | word = values[0]
26 | try:
27 | coefs = np.asarray(values[1:], dtype='float32')
28 | except ValueError:
29 | continue
30 | embeddings_index[word] = coefs
31 |
32 | vector_dims = len(list(embeddings_index.values())[0])
33 |
34 | return embeddings_index, vector_dims
35 |
--------------------------------------------------------------------------------
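
A minimal sketch of embedding_index() above; 'glove.twitter.27B.25d.txt' is a hypothetical local GloVe file with one word and its vector per line:

    from signs.preprocess.embedding_index import embedding_index

    embeddings_index, vector_dims = embedding_index('glove.twitter.27B.25d.txt')

    print(vector_dims)                  # e.g. 25 for the 25d GloVe vectors
    print(embeddings_index['cat'][:5])  # first dimensions of one word vector
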
/tests/test_transform.py:
--------------------------------------------------------------------------------
1 | def test_transform():
2 |
3 | from signs import Transform
4 |
5 | doc = ' Jack is a green 😂😂😂 cat... \n with a hat \n '
6 |
7 | docs = Transform([doc])
8 |
9 | # return the original docs
10 | docs.docs()
11 |
12 | # return the original docs cleaned
13 | docs.docs(True)
14 |
15 | # return original docs but flattened
16 | docs.docs_flat()
17 |
18 | # return original docs flattened and clean
19 | docs.docs_flat(True)
20 |
21 | # return original docs in a single string blob
22 | docs.docs_string()
23 |
24 | # return original docs in single string blob cleaned
25 | docs.docs_string(True)
26 |
27 | # return tokenized version of docs
28 | docs.tokens()
29 |
30 | # return tokenized version cleaned
31 | docs.tokens(True)
32 |
33 | # return tokenized and flattened
34 | docs.tokens_flat()
35 |
36 | # return tokenized and flattened clean
37 | docs.tokens_flat(True)
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/support-request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Support
3 | about: I want to ask for support
4 |
5 | ---
6 |
7 | #### 1) Confirm the below
8 |
9 | - [ ] My Python version is 3.5 or higher
10 |
11 | #### 2) Include the output of:
12 |
13 | `signs.__version__`
14 |
15 | #### 3) Explain clearly what you are trying to achieve
16 |
17 | *A description of your specific use-case and what you hope to achieve with it*
18 |
19 | #### 4) Explain what you have already tried
20 |
21 | *An outline of the steps that you have already taken so far*
22 |
23 | #### 5) Provide a code-complete reference
24 |
25 | - [ ] My support question includes the full code I'm trying to execute
26 | - [ ] I've answered questions 1, 2, 3 and 4 above
27 | - [ ] My support question includes a link to a sample of the data
28 |
29 | **A self-contained Jupyter Notebook, Google Colab, or similar is highly preferred and will speed up helping you with your issue.**
30 |
31 | ---
32 |
--------------------------------------------------------------------------------
/signs/vectorize/gensim_doc2vec_train.py:
--------------------------------------------------------------------------------
1 | def gensim_doc2vec_train(docs):
2 |
3 | '''Trains a gensim doc2vec model based on a training corpus.
4 | Returns the trained model and the training docs.
5 |
6 | NOTE: the input docs format is list-of-lists where each
7 | sublist consists of a tokenized document.
8 |
9 | EXAMPLE:
10 |
11 | model, train_corpus = gensim_doc2vec_train(docs[:400])
12 | test_corpus = docs[400:]
13 |
14 | '''
15 |
16 | from gensim.models.doc2vec import Doc2Vec, TaggedDocument
17 |
18 | documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(docs)]
19 |
20 | model = Doc2Vec(vector_size=50,
21 | window=2,
22 | min_count=2,
23 | workers=4,
24 | epochs=50)
25 |
26 | model.build_vocab(documents)
27 |
28 | model.train(documents,
29 | total_examples=model.corpus_count,
30 | epochs=model.epochs)
31 |
32 | return model, documents
--------------------------------------------------------------------------------
/tests/test_clean.py:
--------------------------------------------------------------------------------
1 | def test_clean():
2 |
3 | from signs import Clean
4 |
5 | doc = ' Jack is a green 😂😂😂 cat... \n with a hat \n '
6 |
7 | # create the object
8 | cleaned = Clean(doc)
9 |
10 | # access the text
11 | cleaned.text
12 |
13 | # you could of course also directly do
14 | Clean(doc).text
15 |
16 | # create the object
17 | cleaned = Clean(doc, auto=False)
18 |
19 | # make text all caps
20 | cleaned.caps()
21 |
22 | # make text all lower
23 | cleaned.low()
24 |
25 | # decode the text
26 | cleaned.decod()
27 |
28 | # remove emojis
29 | cleaned.emoji()
30 |
31 | # remove leading and trailing whitespace
32 | cleaned.leadtrail()
33 |
34 | # remove all whitespace
35 | cleaned.whitespace()
36 |
37 | # remove linebreaks
38 | cleaned.linebreaks()
39 |
40 | # remove links
41 | cleaned.links()
42 |
43 | # remove punctuation
44 | cleaned.punct()
45 |
46 | # remove arbitrary string
47 | cleaned.string('is a green')
48 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 Autonomio
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/signs/vectorize/huggingface_embeddings.py:
--------------------------------------------------------------------------------
1 | from transformers import AutoModel, AutoTokenizer
2 |
3 |
4 | class HuggingfaceEmbeddings:
5 | """
6 | loads a `model` and `tokenizer` from huggingface from the given
7 | checkpoint if in local storage or huggingface hub
8 | Arguments:
9 | Checkpoint|str|Checkpoint name from huggingface model hub
10 | """
11 |
12 | def __init__(self, checkpoint="prajjwal1/bert-small"):
13 | self.checkpoint = checkpoint
14 | self.tokenizer = AutoTokenizer.from_pretrained(self.checkpoint)
15 | self.model = AutoModel.from_pretrained(self.checkpoint)
16 |
17 | def generate_huggingface_embeddings(self, docs):
18 | """
19 | Generate huggingface embeddings.
20 | Arguments:
21 | docs|`string` or a `list` of `strings`|The docs to be encoded
22 | returns:
23 | tuple of tensors
24 |
25 | """
26 | model = self.model
27 | tokenizer = self.tokenizer
28 | inputs = tokenizer(docs, padding=True,
29 | truncation=True, return_tensors="pt")
30 | outputs = model(**inputs)
31 | return outputs.last_hidden_state
32 |
--------------------------------------------------------------------------------
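
A minimal usage sketch for HuggingfaceEmbeddings above; the mean pooling over the token dimension is an assumption for turning the last hidden states into one vector per doc, not something the class does itself:

    from signs.vectorize import HuggingfaceEmbeddings

    hf = HuggingfaceEmbeddings()  # downloads prajjwal1/bert-small on first use

    hidden = hf.generate_huggingface_embeddings(['jack is a green cat',
                                                 'the cat has a hat'])

    # (batch, tokens, hidden) -> (batch, hidden) via mean pooling
    doc_vectors = hidden.mean(dim=1)
    print(doc_vectors.shape)
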
/.github/workflows/ci-push.yml:
--------------------------------------------------------------------------------
1 | name: Push
2 |
3 | on: [push]
4 |
5 | jobs:
6 | build:
7 | name: Runtimes
8 | runs-on: ${{ matrix.os }}
9 | strategy:
10 | max-parallel: 9
11 | matrix:
12 | python-version: [3.7, 3.8]
13 | os: [ubuntu-latest, macos-latest]
14 |
15 | steps:
16 | - uses: actions/checkout@v1
17 | - name: Python ${{ matrix.python-version }}
18 | uses: actions/setup-python@v1
19 | with:
20 | python-version: ${{ matrix.python-version }}
21 | - name: Dependencies
22 | run: |
23 | export MPLBACKEND=agg
24 | pip install --upgrade pip
25 | pip install cython
26 | pip install tensorflow
27 | pip install coveralls
28 | pip install -r requirements.txt
29 | python -m spacy download en
30 | - name: Style
31 | run: |
32 | pip install flake8
33 | flake8 . --count --select=E9,F63,F7 --show-source --statistics
34 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
35 | - name: Tests
36 | run: |
37 | export MPLBACKEND=agg
38 | coverage run --source=signs ./test-ci.py
39 | - name: Coverage
40 | env:
41 | COVERALLS_REPO_TOKEN: ${{ secrets.COVERALLS_REPO_TOKEN }}
42 | run: |
43 | coveralls
44 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug-report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug Report
3 | about: I want to report something that is broken
4 |
5 | ---
6 |
7 | Thank you very much for reporting a bug on Signs. Before you do, please go through the below checklist carefully and make sure to prepare your bug report in a way that facilitates effective handling of the matter.
8 |
9 | #### 1) Confirm the below
10 |
11 | - [ ] My Python version is 3.5 or higher
12 | - [ ] I have searched through the issues [Issues](https://github.com/autonomio/signs/issues) for a duplicate
13 |
14 | #### 2) Include the output of:
15 |
16 | `signs.__version__`
17 |
18 | #### 3) Explain clearly what you expect to happen
19 |
20 | *A description of what you tried to do and what you thought should happen.*
21 |
22 | #### 4) Explain what actually happened
23 |
24 | *A description of the issue in Signs that you had identified*
25 |
26 | #### 5) Provide a code-complete reference
27 |
28 | - [ ] The command causing the issue is included
29 | - [ ] A sample of the data (or link to the data) is included
30 | - [ ] A full error message / trace is included
31 |
32 | NOTE: If the data is sensitive and can't be shared, [create dummy data](https://scikit-learn.org/stable/modules/classes.html#samples-generator) that mimics it.
33 |
34 | **A self-contained Jupyter Notebook, Google Colab, or similar is highly preferred and will speed up helping you with your issue.**
35 |
36 | ---
37 |
--------------------------------------------------------------------------------
/signs/commands/describe.py:
--------------------------------------------------------------------------------
1 | class Describe:
2 |
3 | def __init__(self, tokens):
4 |
5 | '''Takes in a list with tokens.'''
6 |
7 | from signs.preprocess import lists_to_list
8 |
9 | if isinstance(tokens[0], str):
10 | self.tokens = tokens
11 | if isinstance(tokens[0], list):
12 | self.tokens = lists_to_list(tokens)
13 |
14 | def get_counts(self):
15 |
16 | '''Get word frequencies for the aggregate
17 | of all the tokens in Describe() input.'''
18 |
19 | from collections import Counter
20 |
21 | # return the counted values as dictionary
22 | return dict(Counter(self.tokens).most_common())
23 |
24 | def get_gram_counts(self, ngram=3, skip=0):
25 |
26 | '''Returns ngram counts. Supports skipgrams.
27 |
28 | ngram : int
29 | The value for 'n' in ngram. For example, 2 for bigram.
30 | skip : int
31 | The value for skipgrams. For example, 3 to skip three tokens.
32 |
33 | Counts are computed over the tokens given to Describe().
34 | '''
35 |
36 | from collections import Counter
37 | from signs import Grams, preprocess
38 |
39 | temp = [Grams(self.tokens).ngrams(ngram, skip)]
40 | temp = preprocess.lists_to_list(temp)
41 | temp = [' '.join(i) for i in temp]
42 |
43 | return dict(Counter(temp).most_common())
44 |
--------------------------------------------------------------------------------
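
A minimal usage sketch for Describe above, using flattened tokens from Transform:

    from signs import Describe, Transform

    tokens = Transform(['jack is a green cat with a hat']).tokens_flat()

    d = Describe(tokens)

    print(d.get_counts())              # word frequencies
    print(d.get_gram_counts(ngram=2))  # bigram counts
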
/signs/models/mlp.py:
--------------------------------------------------------------------------------
1 | def mlp(x,
2 | y,
3 | vocab_size,
4 | vector_dims,
5 | embedding_matrix,
6 | epochs=50,
7 | layers=0,
8 | dropout=0,
9 | batch_size=10,
10 | inner_neurons=None,
11 | loss='binary_crossentropy',
12 | x_test=None,
13 | y_test=None):
14 |
15 | '''Trains a basic MLP style neural network
16 | with embedding_matrix from Embeds().layer()'''
17 |
18 | from keras.models import Sequential
19 | from keras.layers import Dense, Dropout, Flatten, Embedding
20 | from kerasplotlib import TrainingLog
21 |
22 | model = Sequential()
23 | model.add(Embedding(vocab_size,
24 | vector_dims,
25 | weights=[embedding_matrix],
26 | input_length=x.shape[1],
27 | trainable=False))
28 | model.add(Flatten())
29 | model.add(Dropout(dropout))
30 |
31 | # add layers
32 | if layers > 0:
33 | for i in range(layers):
34 | model.add(Dense(inner_neurons))
35 | model.add(Dropout(dropout))
36 |
37 | model.add(Dense(1, activation='sigmoid'))
38 | model.compile(optimizer='adam', loss=loss, metrics=['acc'])
39 | model.fit(x, y,
40 | epochs=epochs,
41 | batch_size=batch_size,
42 | verbose=0,
43 | validation_split=.3,
44 | callbacks=[TrainingLog()])
45 | # loss, accuracy = model.evaluate(x, y, verbose=0)
46 |
47 | return model
48 |
--------------------------------------------------------------------------------
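
A minimal sketch of wiring mlp() above to Embeds(); the GloVe file path and the binary labels are hypothetical:

    import numpy as np

    from signs import Embeds
    from signs.models.mlp import mlp

    docs = ['jack is a green cat', 'the cat has a hat', 'dogs chase cats']
    labels = np.array([1, 0, 1])  # hypothetical binary labels

    e = Embeds('glove.twitter.27B.25d.txt')  # hypothetical local vector file
    embedding_layer, x = e.layer(docs)

    model = mlp(x, labels,
                vocab_size=e.vocab_size,
                vector_dims=e.vector_dims,
                embedding_matrix=e.embedding_matrix,
                epochs=5)
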
/signs/preprocess/transforms.py:
--------------------------------------------------------------------------------
1 | def lists_to_list(list_of_lists):
2 |
3 | '''Takes in list of lists and makes it into a flat list'''
4 |
5 | return [item for sublist in list_of_lists for item in sublist]
6 |
7 |
8 | def strings_to_tokens(docs):
9 |
10 | '''Converts list-of-lists where each sublist
11 | is a string, to list-of-lists where each sublist
12 | consists of tokens'''
13 |
14 | return [doc[0].split() for doc in docs if doc[0] is not None]
15 |
16 |
17 | def docs_to_list(docs):
18 |
19 | '''convert a list of sentences to a list of words'''
20 |
21 | return [word for line in docs for word in line.split()]
22 |
23 |
24 | def docs_to_blob(docs):
25 |
26 | '''takes in list of words or sentences
27 | and returns a single blob of text.'''
28 |
29 | return ' '.join(word for word in docs)
30 |
31 |
32 | def lists_to_blob(docs):
33 |
34 | return ' '.join([''.join(doc) for doc in docs if doc is not None])
35 |
36 |
37 | def create_tokens(docs, flatten=True, clean=True):
38 |
39 | '''Takes in a list-of-lists where each sublist
40 | contains a string. For example, a paragraph.
41 |
42 | Returns a single list where strings have
43 | been converted into tokens.'''
44 |
45 | from signs import Clean
46 |
47 | if clean:
48 | docs = [[Clean(doc, auto=True).text] for doc in docs]
49 |
50 | tokens = strings_to_tokens(docs)
51 |
52 | if flatten:
53 | return lists_to_list(tokens)
54 | else:
55 | return tokens
56 |
--------------------------------------------------------------------------------
/signs/commands/transform.py:
--------------------------------------------------------------------------------
1 | class Transform:
2 |
3 | def __init__(self, documents):
4 |
5 | '''Accepts input docs as list-of-lists where
6 | each sublist consists of a string value representing
7 | a single document, for example a paragraph.'''
8 |
9 | self._documents = documents
10 |
11 | def docs(self, clean=True, stopwords=True):
12 |
13 | '''Returns list-of-lists of strings'''
14 |
15 | from signs import Clean
16 |
17 | if clean:
18 | return [[Clean(doc, auto=True).text] for doc in self._documents]
19 |
20 | return self._documents
21 |
22 | def tokens(self, clean=True, stopwords=True):
23 |
24 | '''Returns a list-of-lists of tokens'''
25 |
26 | from signs.preprocess import create_tokens
27 |
28 | return create_tokens(self._documents, flatten=False, clean=clean)
29 |
30 | def tokens_flat(self, clean=True, stopwords=True):
31 |
32 | '''Returns a flattened list of tokens'''
33 |
34 | from signs.preprocess import create_tokens
35 |
36 | return create_tokens(self._documents, flatten=True, clean=clean)
37 |
38 | def docs_flat(self, clean=True, stopwords=True):
39 |
40 | '''Returns a flattened list of strings'''
41 |
42 | from signs.preprocess import lists_to_list
43 |
44 | return lists_to_list(self._documents)
45 |
46 | def docs_string(self, clean=True, stopwords=True):
47 |
48 | '''Returns a single string with blobbed documents'''
49 |
50 | from signs import Clean
51 | from signs.preprocess import lists_to_blob
52 |
53 | if clean:
54 | return lists_to_blob(
55 | [Clean(doc, auto=True).text for doc in self._documents])
56 | else:
57 | return lists_to_blob(self._documents)
58 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | .vscode/
7 |
8 | .DS_Store
9 |
10 | # C extensions
11 | *.so
12 |
13 | # Distribution / packaging
14 | .Python
15 | build/
16 | develop-eggs/
17 | dist/
18 | downloads/
19 | eggs/
20 | .eggs/
21 | lib/
22 | lib64/
23 | parts/
24 | sdist/
25 | var/
26 | wheels/
27 | *.egg-info/
28 | .installed.cfg
29 | *.egg
30 | MANIFEST
31 |
32 | # PyInstaller
33 | # Usually these files are written by a python script from a template
34 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
35 | *.manifest
36 | *.spec
37 |
38 | # Installer logs
39 | pip-log.txt
40 | pip-delete-this-directory.txt
41 |
42 | # Unit test / coverage reports
43 | htmlcov/
44 | .tox/
45 | .coverage
46 | .coverage.*
47 | .cache
48 | nosetests.xml
49 | coverage.xml
50 | *.cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 |
63 | # Flask stuff:
64 | instance/
65 | .webassets-cache
66 |
67 | # Scrapy stuff:
68 | .scrapy
69 |
70 | # Sphinx documentation
71 | docs/_build/
72 |
73 | # PyBuilder
74 | target/
75 |
76 | # Jupyter Notebook
77 | .ipynb_checkpoints
78 |
79 | # pyenv
80 | .python-version
81 |
82 | # celery beat schedule file
83 | celerybeat-schedule
84 |
85 | # SageMath parsed files
86 | *.sage.py
87 |
88 | # Environments
89 | .env
90 | .venv
91 | env/
92 | venv/
93 | ENV/
94 | env.bak/
95 | venv.bak/
96 |
97 | # Spyder project settings
98 | .spyderproject
99 | .spyproject
100 |
101 | # Rope project settings
102 | .ropeproject
103 |
104 | # mkdocs documentation
105 | /site
106 |
107 | # mypy
108 | .mypy_cache/
109 |
--------------------------------------------------------------------------------
/signs/commands/embeds.py:
--------------------------------------------------------------------------------
1 | from ..preprocess.embedding_index import embedding_index
2 | from ..preprocess.embedding_matrix import embedding_matrix
3 | from ..preprocess.tokenize import tokenize
4 |
5 |
6 | class Embeds:
7 |
8 | def __init__(self, trained_vectors):
9 |
10 | '''Takes as input trained vectors and returns an object which can be
11 | used to create a Keras Embedding layer given input documents.
12 |
13 | e = Embeds('glove.twitter.27B.25d.tx')
14 | embedding_layer = e.layer(docs)
15 |
16 | '''
17 |
18 | self.t_v = trained_vectors
19 | self.embeddings_index, self.vector_dims = embedding_index(self.t_v)
20 |
21 | def _matrix(self):
22 |
23 | '''Helper function for processing input docs for the
24 | Keras Embeds layer'''
25 |
26 | self.x, self.vocab_size, self.word_index = tokenize(self.docs)
27 | self.embedding_matrix = embedding_matrix(self.word_index,
28 | self.vector_dims,
29 | self.embeddings_index)
30 |
31 | def layer(self, docs):
32 |
33 | '''Takes as input a series or array of documents and returns
34 | a Keras embeddings layer and x data.
35 |
36 | USE:
37 |
38 | embeds_layer, x = e.layer(docs)
39 |
40 | then in Keras model:
41 |
42 | model.add(embeds_layer)
43 | model.fit(x=x)
44 |
45 | '''
46 |
47 | from keras.layers import Embedding
48 |
49 | self.docs = docs
50 |
51 | self._matrix()
52 |
53 | embedding_layer = Embedding(self.vocab_size,
54 | self.vector_dims,
55 | weights=[self.embedding_matrix],
56 | input_length=self.x.shape[1],
57 | trainable=False)
58 |
59 | return embedding_layer, self.x
60 |
--------------------------------------------------------------------------------
/signs/similarity/doc_similarity.py:
--------------------------------------------------------------------------------
1 | def similarity_docs(doc, model):
2 |
3 | '''Computes similarity scores for docs in the
4 | training corpus based on input doc.'''
5 |
6 | vecs = model.infer_vector(doc)
7 | sims = model.docvecs.most_similar([vecs], topn=len(model.docvecs))
8 |
9 | return dict(sims)
10 |
11 |
12 | def seen_similarity_matrix(model):
13 |
14 | '''Computes a similarity matrix for all the docs
15 | in the training corpus or in a set of documents
16 | based on the trained model.'''
17 |
18 | import numpy as np
19 |
20 | similarities = []
21 | out = []
22 |
23 | for i in range(len(model.docvecs)):
24 |
25 | # get similarities
26 | sims = model.docvecs.most_similar([model.docvecs[i]],
27 | topn=len(model.docvecs))
28 |
29 | # add to a list in dictionary format
30 | similarities.append(dict(sims))
31 |
32 | for sims in similarities:
33 |
34 | temp = []
35 |
36 | for i in range(len(sims)):
37 | temp.append(sims[i])
38 |
39 | out.append(temp)
40 |
41 | return np.array(out)
42 |
43 |
44 | def unseen_similarity_matrix(model, docs):
45 |
46 | '''Computes a similarity matrix for unseen documents
47 | based on a trained doc2vec model.'''
48 |
49 | import numpy as np
50 |
51 | out = []
52 |
53 | for doc in docs:
54 | temp = []
55 | for i in range(len(docs)):
56 | sim = model.docvecs.similarity_unseen_docs(model, doc, docs[i])
57 | temp.append(sim)
58 | out.append(temp)
59 |
60 | return np.array(out)
61 |
62 |
63 | def vector_spatial_distance(model, doc1, doc2):
64 |
65 | '''>> EXPERIMENTAL <<
66 |
67 | Computes spatial distance cosine for two documents
68 | based on a trained model.'''
69 |
70 | import scipy
71 |
72 | vec1 = model.infer_vector(doc1)
73 | vec2 = model.infer_vector(doc2)
74 |
75 | return 1 - scipy.spatial.distance.cosine(vec1, vec2)
76 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | #
3 | # Copyright (C) 2018 Mikko Kotila
4 |
5 | import os
6 |
7 | DESCRIPTION = "Signs Text Processing for Deep Learning"
8 | LONG_DESCRIPTION = """\
9 | Signs is a utility for text preprocessing, vectorizing, and analysis
10 | such as semantic similarity, mainly for the purpose of using unstructured
11 | data in deep learning models.
12 | """
13 |
14 | DISTNAME = 'signs'
15 | MAINTAINER = 'Mikko Kotila'
16 | MAINTAINER_EMAIL = 'mailme@mikkokotila.com'
17 | URL = 'http://autonom.io'
18 | LICENSE = 'MIT'
19 | DOWNLOAD_URL = 'https://github.com/autonomio/signs/'
20 | VERSION = '0.3.2'
21 |
22 | try:
23 | from setuptools import setup
24 | _has_setuptools = True
25 | except ImportError:
26 | from distutils.core import setup
27 |
28 | install_requires = ['kerasplotlib',
29 | 'wrangle',
30 | 'pandas',
31 | 'numpy',
32 | 'cython',
33 | 'spacy',
34 | 'gensim',
35 | 'keras',
36 | 'ipython']
37 |
38 | if __name__ == "__main__":
39 |
40 | setup(name=DISTNAME,
41 | author=MAINTAINER,
42 | author_email=MAINTAINER_EMAIL,
43 | maintainer=MAINTAINER,
44 | maintainer_email=MAINTAINER_EMAIL,
45 | description=DESCRIPTION,
46 | long_description=LONG_DESCRIPTION,
47 | license=LICENSE,
48 | url=URL,
49 | version=VERSION,
50 | download_url=DOWNLOAD_URL,
51 | install_requires=install_requires,
52 | packages=['signs',
53 | 'signs.commands',
54 | 'signs.preprocess',
55 | 'signs.vectorize',
56 | 'signs.grams',
57 | 'signs.utils',
58 | 'signs.models',
59 | 'signs.similarity'],
60 |
61 | classifiers=[
62 | 'Intended Audience :: Science/Research',
63 | 'Programming Language :: Python :: 3.6',
64 | 'License :: OSI Approved :: MIT License',
65 | 'Topic :: Scientific/Engineering :: Human Machine Interfaces',
66 | 'Topic :: Scientific/Engineering :: Artificial Intelligence',
67 | 'Topic :: Scientific/Engineering :: Mathematics',
68 | 'Operating System :: POSIX',
69 | 'Operating System :: Unix',
70 | 'Operating System :: MacOS'])
71 |
72 | os.system("python -m spacy download en")
73 |
--------------------------------------------------------------------------------
/signs/utils/stopwords.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | def stopwords():
4 |
5 | nltk_stop = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', 'couldn', 'didn', 'doesn', 'hadn', 'hasn', 'haven', 'isn', 'ma', 'mightn', 'mustn', 'needn', 'shan', 'shouldn', 'wasn', 'weren', 'won', 'wouldn']
6 | twitter_stop = ['RT', 'http', 'https', 'rt', 'via']
7 | generic_stop = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z','1','2','3','4','5','6','7','8','9','0','was','are','going','used','&','has','dont','amp', 'the','be','and','of','a','in','to','is','i','an', 'have','to','it','I','that','for','you','why','were', 'he','with','on','do','say','this','they','had','been', 'at','but','we','his','from','that','not','\"the', 'by','she','or','as','what','go','their','did', 'can','who','get','if','would','her','all','you.', 'my','make','about','know','will','as','up','let', 'one','time','there','so','when','which','them','did', 'some','me','take','out','into','just','see','him', 'your','come','could','now','than','like','other', 'how','then','its','our','two','more','these','don\'t', 'want','way','look','first','also','new','because', 'day','more','use','no','man','find','here','thing', 'give','many','well','only','those','tell','one', 'very','her','even','back','any','good','through', 'us','there','down','may','should','over','still', 'try','in','as','last','ask','need','too','feel', 'three','when','never','become','between','high','each', 'really','something','most','another','much','own','both', 'out','leave','put','old','while','mean','it\'s', 'im','u','i\'m','said','de','http','https','got', 'didn\'t','doesnt','didnt\…','i\'d','can\'t','doesn\'t', 'isn\'t','\“what\’s','ever','again.','()','thought', 'before','after','2016','knows','everyone','every','please']
8 |
9 | return nltk_stop + twitter_stop + generic_stop
--------------------------------------------------------------------------------
/signs/commands/stopwords.py:
--------------------------------------------------------------------------------
1 | class Stopwords:
2 |
3 | def __init__(self,
4 | docs,
5 | common_stopwords=True,
6 | add_stopwords=[],
7 | min_length=2,
8 | max_threshold=10):
9 |
10 | '''Accepts as input a list-of-lists where
11 | each sublist is a document represented in tokens.
12 |
13 | docs : list (of lists)
14 | Tokenized documents.
15 | common_stopwords : bool
16 | If a comprehensive list of stopwords should be used. If set
17 | to False then add_stopwords can't be empty.
18 | add_stopwords : list or None
19 | If a list of words is provided, then those will be used
20 | as well as common_stopwords unless it's set to False.
21 | min_length : int
22 | Drop all words below this length.
23 | max_threshold : int
24 | Keep all words that are at least this long.
25 |
26 | '''
27 |
28 | import string
29 |
30 | self.common_stopwords = common_stopwords
31 | self.add_stopwords = add_stopwords
32 | self.min_length = min_length
33 | self.max_threshold = max_threshold
34 |
35 | self.string = string
36 | self.stopwords = self.stopword_index()
37 | self.docs = docs
38 |
39 | for i in range(len(docs)):
40 | self.docs[i] = self.check_stopwords(docs[i])
41 |
42 | def stopword_index(self):
43 |
44 | import numpy as np
45 |
46 | out = []
47 |
48 | if self.common_stopwords is True:
49 | from signs.utils.stopwords import stopwords
50 | stopword_list = np.unique(stopwords()).tolist()
51 | stopword_list += self.add_stopwords
52 | else:
53 | stopword_list = self.add_stopwords
54 |
55 | for word in stopword_list:
56 | if len(word) > 1:
57 | out.append(word.lower())
58 |
59 | stopword_dict = {}
60 |
61 | for word in out:
62 |
63 | if word[0] in self.string.ascii_letters:
64 | try:
65 | stopword_dict[word[0]].append(word)
66 | except KeyError:
67 | stopword_dict[word[0]] = [word]
68 |
69 | return stopword_dict
70 |
71 | def check_stopwords(self, doc):
72 |
73 | out = []
74 |
75 | for word in doc:
76 |
77 | # always keep words at least max_threshold characters long
78 | if len(word) >= self.max_threshold:
79 | out.append(word)
80 |
81 | # drop words at most min_length characters long
82 | elif len(word) <= self.min_length:
83 | continue
84 |
85 | elif word[0] not in self.stopwords.keys():
86 | out.append(word)
87 |
88 | elif word not in self.stopwords[word[0]]:
89 | out.append(word)
90 |
91 | return out
92 |
--------------------------------------------------------------------------------
/signs/commands/docsimilarity.py:
--------------------------------------------------------------------------------
1 | class DocSimilarity:
2 |
3 | def __init__(self, model, docs):
4 |
5 | from signs.similarity import doc_similarity as sims
6 | from signs.utils.html_print import html_print
7 |
8 | self._sims = sims
9 | self._model = model
10 | self._docs = docs
11 | self._html_print = html_print
12 |
13 | def seen_matrix(self):
14 |
15 | '''creates a 2d matrix with similarities'''
16 |
17 | return self._sims.seen_similarity_matrix(self._model)
18 |
19 | def unseen_matrix(self, docs):
20 |
21 | '''same as above but for unseen docs'''
22 |
23 | return self._sims.unseen_similarity_matrix(self._model, docs)
24 |
25 | def similar_docs(self, doc):
26 |
27 | '''for comparing a single doc against all seen docs'''
28 | return self._sims.similarity_docs(doc, self._model)
29 |
30 | def spatial_distance(self, doc1, doc2):
31 |
32 | '''for comparing two unseen or seen docs'''
33 | return self._sims.vector_spatial_distance(self._model,
34 | doc1,
35 | doc2)
36 |
37 | def preview_results(self, docs=None):
38 |
39 | if docs is None:
40 | # get the keys from docs with similarities
41 | similarities = self._get_similarities(self._docs.docs())
42 | else:
43 | similarities = self._get_similarities(docs)
44 |
45 | # print out the highest and lowest match
46 | self._print_highest(similarities)
47 | self._print_lowest(similarities)
48 |
49 |
50 | def _get_similarities(self, docs):
51 |
52 | import random
53 |
54 | # pick a random document
55 | doc_id = random.randint(0, len(docs) - 1)
56 |
57 | # find similar documents
58 | similarities = self._sims.similarity_docs(docs[doc_id], self._model)
59 |
60 | return similarities
61 |
62 | def _print_highest(self, similarities):
63 |
64 | # create content
65 | text = self._docs.docs(False)[list(similarities.keys())[0]][0]
66 |
67 | # create similarity value and round
68 | similarity = similarities[list(similarities.keys())[0]]
69 | similarity = round(similarity, 4)
70 |
71 | # parse together the title
72 | title = "HIGHEST MATCH : " + str(similarity)
73 |
74 | # print it out
75 | self._html_print(text, title)
76 |
77 |
78 | def _print_lowest(self, similarities):
79 |
80 | # create content
81 | text = self._docs.docs(False)[list(similarities.keys())[-1]][0]
82 |
83 | # create similarity value and round
84 | similarity = similarities[list(similarities.keys())[-1]]
85 | similarity = round(similarity, 4)
86 |
87 | # parse together the title
88 | title = "LOWEST MATCH : " + str(similarity)
89 |
90 | # print it out
91 | self._html_print(text, title)
--------------------------------------------------------------------------------
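
A minimal usage sketch for DocSimilarity above, training a doc2vec model with TrainDoc2Vec (gensim_doc2vec_train) on a few illustrative docs:

    from signs import DocSimilarity, TrainDoc2Vec, Transform

    docs = Transform(['jack is a green cat',
                      'the cat has a hat',
                      'dogs chase cats'])

    # train a doc2vec model on the tokenized docs
    model, corpus = TrainDoc2Vec(docs.tokens())

    ds = DocSimilarity(model, docs)

    print(ds.seen_matrix())                   # similarity matrix for the training docs
    print(ds.similar_docs(['green', 'cat']))  # one doc against all seen docs
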
/signs/commands/load_vectors.py:
--------------------------------------------------------------------------------
1 | class LoadVectors:
2 |
3 | '''Loads pre-trained vectors through a spacy or gensim backend.
4 |
5 | The default option 'en' loads the spacy default vectors.
6 | Alternatively, a local vector file or one of the
7 | other spacy models can be loaded.
8 |
9 | trained_vectors :: name or path to the pre-trained vector file.
10 | If set to 'en' then spacy small web vectors
11 | will be used.
12 | backend :: either 'spacy' or 'gensim'
13 | mode :: applies to backend 'gensim'. The options are 'glove', 'word2vec',
14 | and 'fasttext'
15 | temp :: filename for glove to word2vec conversion
16 | binary :: if the vector file format is binary or not
17 |
18 | '''
19 |
20 | def __init__(self,
21 | trained_vectors='en',
22 | backend='spacy',
23 | mode=None,
24 | temp='signs_temp.sig',
25 | binary=False):
26 |
27 | self.backend = backend
28 | self.mode = mode
29 | self.temp = temp
30 | self.binary = binary
31 | self.trained_vectors = trained_vectors
32 | self._null = self.load_model()
33 |
34 | def load_model(self):
35 |
36 | import os
37 |
38 | if self.backend == 'spacy':
39 |
40 | import spacy as sp
41 |
42 | if self.trained_vectors == 'en':
43 |
44 | import en_core_web_sm
45 | self.model = en_core_web_sm.load()
46 |
47 | else:
48 | self.model = sp.load(self.trained_vectors)
49 |
50 | elif self.backend == 'gensim':
51 |
52 | from gensim.models import KeyedVectors, FastText
53 | from gensim.scripts.glove2word2vec import glove2word2vec
54 |
55 | if self.mode is None:
56 | self.mode = 'glove'
57 |
58 | if self.mode == 'glove':
59 | glove2word2vec(self.trained_vectors, self.temp)
60 | self.model = KeyedVectors.load_word2vec_format(self.temp,
61 | binary=self.binary)
62 | os.remove(self.temp)
63 |
64 | elif self.mode == 'word2vec':
65 | self.model = KeyedVectors.load_word2vec_format(self.trained_vectors,
66 | binary=self.binary,
67 | encoding='latin-1')
68 | elif self.mode == 'fasttext':
69 | self.model = FastText.load_fasttext_format(self.trained_vectors,
70 | encoding='latin-1')
71 |
72 | def vectors(self, word):
73 |
74 | '''GET VECTOR REPRESENTATION
75 |
76 | Takes as input a string (word or sentence) and
77 | returns the vector representation based on trained vectors.
78 |
79 | '''
80 |
81 | if self.backend == 'spacy':
82 | return self.model(word).vector
83 |
84 | elif self.backend == 'gensim':
85 | return self.model[word]
86 |
--------------------------------------------------------------------------------
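
A minimal usage sketch for LoadVectors above with the default spacy backend, assuming the small English spacy model is installed; the gensim variant is shown with a hypothetical GloVe file path:

    from signs import LoadVectors

    # spacy backend with the small English model
    lv = LoadVectors(trained_vectors='en', backend='spacy')
    print(lv.vectors('cat')[:5])

    # gensim backend with a local GloVe file (hypothetical path)
    # lv = LoadVectors('glove.twitter.27B.25d.txt', backend='gensim', mode='glove')
    # print(lv.vectors('cat')[:5])
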
/signs/commands/clean.py:
--------------------------------------------------------------------------------
1 | class Clean:
2 |
3 | def __init__(self, text, auto=True, remove_string=''):
4 |
5 | '''Allows several document cleaning workflows either
6 | individually, or as an automated pipeline. Accepts
7 | string value or list with single string value.'''
8 |
9 | # a custom string to be removed
10 | self._remove_string = remove_string
11 |
12 | # input is a string inside a list
13 | if isinstance(text, list):
14 | self.text = text[0]
15 | else:
16 | self.text = text
17 |
18 | if auto:
19 | self.automated()
20 |
21 | def automated(self):
22 |
23 | self.text = self.nones()
24 | self.text = self.decod()
25 | self.text = self.low()
26 | self.text = self.links()
27 | self.text = self.emoji()
28 | self.text = self.punct()
29 | self.text = self.linebreaks()
30 | self.text = self.string()
31 | self.text = self.leadtrail()
32 | self.text = self.whitespace()
33 |
34 | def nones(self, replace_with='NA'):
35 |
36 | '''Replaces a None text value
37 | with an arbitrary replacement string;
38 | otherwise returns the text unchanged.'''
39 |
40 | if self.text is None:
41 | return replace_with
42 | else:
43 | return self.text
44 |
45 | def string(self, remove_string=None):
46 |
47 | '''remove arbitrary string'''
48 |
49 | if remove_string is None:
50 | remove_string = self._remove_string
51 |
52 | return self.text.replace(remove_string, '')
53 |
54 | def whitespace(self):
55 |
56 | '''Remove extra whitespaces'''
57 |
58 | return ' '.join(self.text.split())
59 |
60 | def low(self):
61 |
62 | '''make string lowercase'''
63 |
64 | return self.text.lower()
65 |
66 | def caps(self):
67 |
68 | '''make string uppercase'''
69 |
70 | return self.text.upper()
71 |
72 | def punct(self):
73 |
74 | '''remove special characters'''
75 |
76 | import re
77 | import string
78 |
79 | return re.sub('['+string.punctuation+']', '', self.text)
80 |
81 | def leadtrail(self):
82 |
83 | '''remove trailing and leading whitespace
84 |
85 | NOTE: this also removes the last line break'''
86 |
87 | return self.text.strip()
88 |
89 | def linebreaks(self):
90 |
91 | '''remove linebreaks'''
92 |
93 | return self.text.replace('\n', ' ').replace('\r', '')
94 |
95 | def decod(self):
96 |
97 | '''decode binary'''
98 | try:
99 | return self.text.decode()
100 | except AttributeError:
101 | return self.text
102 |
103 | def links(self):
104 |
105 | '''remove links'''
106 |
107 | import re
108 |
109 | return re.sub(r'http\S+', '', self.text)
110 |
111 | def emoji(self):
112 |
113 | '''remove emojis'''
114 |
115 | import re
116 |
117 | emoji_pattern = re.compile("["
118 | u"\U0001F600-\U0001F64F"
119 | u"\U0001F300-\U0001F5FF"
120 | u"\U0001F680-\U0001F6FF"
121 | u"\U0001F1E0-\U0001F1FF"
122 | u"\U00002702-\U000027B0"
123 | u"\U000024C2-\U0001F251"
124 | "]+", flags=re.UNICODE)
125 | return emoji_pattern.sub(r'', self.text)
126 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Computational Text Processing for Humans
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 | Signs •
23 | Key Features •
24 | Examples •
25 | Install •
26 | Support •
27 | Docs •
28 | Issues •
29 | License •
30 | Download
31 |
32 |
33 |
34 | If you want the simplest possible way to generate embeddings for deep learning models without sacrificing the power of state-of-the-art vector models, Signs is for you.
35 |
36 |
37 |
38 |
39 |
40 |
41 | ### Signs
42 |
43 | Signs is a set of tools for text preparation, vectorization and processing that radically simplifies the raw-text-to-Keras-embeddings workflow. Signs unifies the Gensim and SpaCy vectorization backends for Keras users and provides an easy-to-use vectorization solution for otherwise complex workflows. Signs replaces the dozens of lines of redundant code currently required to transform raw text into a Keras Embedding layer.
44 |
45 | ### Key Features
46 |
47 | - unifies Gensim and SpaCy vectorization backends
48 | - supports common pretrained vector models: [GloVe](https://nlp.stanford.edu/projects/glove/), [Fasttext](https://fasttext.cc/docs/en/english-vectors.html), and [word2vec](https://en.wikipedia.org/wiki/Word2vec)
49 | - removes much of the NLP learning curve
50 | - adds no more than a few lines of code to your workflow
51 | - from text to Keras embedding layer in a single command
52 | - train, save, and load custom vectors
53 | - evaluate results after training a Keras prediction model
54 | - powerful text preprocessing features
55 | - allows completely automated text preprocessing
56 |
57 |
58 | ### Examples
59 |
60 | [Get the source](https://gist.github.com/mikkokotila/ad0138788c93bef2b71eaccc9c95701b) for the example below.
61 |
62 |
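A minimal sketch of the core workflow, pieced together from the example notebooks (it assumes a local copy of pretrained GloVe vectors, and that `Embeds.layer()` accepts a list of document strings the same way it accepts a pandas Series in the notebooks):

```python
import signs

# clean a raw document with the automated pipeline
doc = signs.Clean(' Jack is a green 😂 cat... \n with a hat ').text

# load pretrained GloVe vectors (path is illustrative) and build a Keras
# embedding layer plus the model-ready x data in one step
embeds = signs.Embeds('glove.twitter.27B.25d.txt')
embedding_layer, x = embeds.layer([doc])
```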
63 |
64 | Several example notebooks with common workflows can be found [here](https://github.com/autonomio/signs/tree/master/examples).
65 |
66 | ### Install
67 |
68 | Stable version:
69 |
70 | #### `pip install signs`
71 |
72 | Daily development version:
73 |
74 | #### `pip install git+https://github.com/autonomio/signs.git@daily-dev`
75 |
76 | ### Support
77 |
78 | If you want to ask a **"how can I use Signs to..."** question, the right place is [StackOverflow](https://stackoverflow.com/questions/ask).
79 |
80 | If you found a bug or want to suggest a feature, check the [issues](https://github.com/autonomio/signs/issues) or [create](https://github.com/autonomio/signs/issues/new/choose) a new issue.
81 |
82 |
83 | ### License
84 |
85 | [MIT License](https://github.com/autonomio/signs/blob/master/LICENSE)
86 |
87 |
88 |
89 |
--------------------------------------------------------------------------------
/signs/commands/preds.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from ..utils.converters import embeds_to_text
3 |
4 | class Preds:
5 |
6 | def __init__(self, x_test, y_test, word_index, model):
7 |
8 | # temp values
9 | self.model = model
10 | self.x_test = x_test
11 | self.y_test = y_test
12 | self.word_index = word_index
13 |
14 | self.results = self._preds_df()
15 |
16 | # delete temp values
17 | del self.word_index, self.y_test, self.x_test, self.model
18 |
19 | def _preds_df(self):
20 |
21 | '''
22 | x_test :: the test data (already embedded)
23 | y_test :: truth values for the test data
24 | word_index :: the word index used for creating the embeddings
25 |
26 | '''
27 |
28 | results = pd.DataFrame({
29 | 'text': embeds_to_text(self.x_test, self.word_index),
30 | 'pred': [i[0] for i in self.model.predict(self.x_test)],
31 | 'truth': self.y_test
32 | })
33 |
34 | return results
35 |
36 | def _printing(self, data, title, n, notebook):
37 |
38 | from kerasplotlib.text import text
39 |
40 | data.drop_duplicates(inplace=True)
41 |
42 | if notebook:
43 | text(data, 'text', title=title)
44 | else:
45 | print(title)
46 | for t in data['text'].head(n):
47 | print(t)
48 | print('\n')
49 |
50 |
51 | def summary(self, sensitivity=.1, n=5, notebook=True):
52 |
53 | '''Provides a summary of the classification results for the model
54 |
55 | sensitivity : float
56 | Cut-off point for deciding between positive and negative.
57 | n : int
58 | Number of samples to display.
59 | notebook : bool
60 | Use html printing for notebooks.
61 |
62 | '''
63 |
64 | pos = self.results[self.results.pred > 1 - sensitivity].sample(n, replace=True)
65 | self._printing(pos, 'CLEAR POSITIVE', n, notebook)
66 |
67 | mid = self.results[self.results.pred.between(0.5 - (sensitivity / 2), 0.5 + (sensitivity / 2))].sample(n, replace=True)
68 | self._printing(mid, 'CLOSE CALL', n, notebook)
69 |
70 | neg = self.results[self.results.pred < sensitivity].sample(n, replace=True)
71 | self._printing(neg, 'CLEAR NEGATIVE', n, notebook)
72 |
73 |
74 | def misses(self, sensitivity=.5, n=5, notebook=True):
75 |
76 | '''Random examples of false positives and false negatives
77 |
78 | sensitivity : float
79 | Cut-off point for deciding between positive and negative.
80 | n : int
81 | Number of samples to display.
82 | notebook : bool
83 | Use html printing for notebooks.
84 |
85 | '''
86 |
87 | # false positive
88 | fp = self.results[(self.results.truth == 0) & (self.results.pred > 1 - sensitivity)].sample(n, replace=True)
89 | self._printing(fp, 'FALSE POSITIVE', n, notebook)
90 |
91 | # false negatives
92 | fn = self.results[(self.results.truth == 1) & (self.results.pred < sensitivity)].sample(n, replace=True)
93 | self._printing(fn, 'FALSE NEGATIVE', n, notebook)
94 |
95 | def hits(self, sensitivity=.1, n=5, notebook=True):
96 |
97 | '''Random examples of true positives and true negatives
98 |
99 | sensitivity : float
100 | Cut-off point for deciding between positive and negative.
101 | n : int
102 | Number of samples to display.
103 | notebook : bool
104 | Use html printing for notebooks.
105 |
106 | '''
107 |
108 | # true positive
109 | tp = self.results[(self.results.truth == 1) & (self.results.pred > 1 - sensitivity)].sample(n, replace=True)
110 | self._printing(tp, 'TRUE POSITIVE', n, notebook)
111 |
112 | # true negatives
113 | tn = self.results[(self.results.truth == 0) & (self.results.pred < sensitivity)].sample(n, replace=True)
114 | self._printing(tn, 'TRUE NEGATIVE', n, notebook)
115 |
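116 | # A minimal usage sketch (illustrative only), following the example notebook:
117 | # given a trained Keras model, the embedded validation data, the truth values,
118 | # and the word index used to build the embeddings (e.g. embeds.word_index):
119 | #
120 | #   preds = Preds(x_val, y_val, embeds.word_index, model)
121 | #   preds.results.head()   # text / pred / truth dataframe
122 | #   preds.hits()           # sampled true positives and true negatives
123 | #   preds.misses()         # sampled false positives and false negatives
124 | #   preds.summary()        # clear positives, close calls, clear negatives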
--------------------------------------------------------------------------------
/signs/grams/grams.py:
--------------------------------------------------------------------------------
1 | class Grams:
2 |
3 | def __init__(self, text):
4 |
5 | '''Takes in a list of words representing a document
6 | and returns a list of ngrams or skipgrams.
7 |
8 | text : list
9 | A list of words representing a document.
10 |
11 | '''
12 |
13 | self._text = text
14 | self._counter = len(text)
15 |
16 | def ngrams(self, ngram=2, skip=0):
17 |
18 | '''Produce ngrams or skipgrams.
19 |
20 | ngram: int
21 | Number of words per gram.
22 | skip : int
23 | The number of words to skip for each gram'''
24 |
25 | # the output
26 | out = []
27 |
28 | # process the doc word-by-word
29 | for i in range(self._counter):
30 | try:
31 | # create the list for single instance of grams
32 | words = [self._text[i]]
33 |
34 | # find the grams
35 | for ii in range(ngram - 1):
36 | words.append(self._text[i + ii + 1 + skip])
37 |
38 | # add to the output
39 | out.append(words)
40 |
41 | # handle the case where end is near
42 | except IndexError:
43 | pass
44 |
45 | return out
46 |
47 | def combinations(self, length=3):
48 |
49 | '''Produce every possible combination of n length.
50 |
51 | length | int | length of the combinations
52 | '''
53 |
54 | import itertools
55 |
56 | out = []
57 |
58 | for i in list(itertools.combinations(self._text, length)):
59 | if len(i) == len(set(i)):
60 | out.append(i)
61 |
62 | return out
63 |
64 | def flexgrams(self, length=3, flex_n=1):
65 |
66 | '''Simple flexgram where all combinations of a certain length
67 | are produced and then flex_n items are randomly dropped from
68 | each, so the resulting grams have length minus flex_n items.
69 |
70 | length | int | length of the resulting combinations
71 | flex_n | int | the number of items to randomly drop
72 |
73 | '''
74 | import random
75 | import itertools
76 |
77 | combinations = list(itertools.combinations(self._text, length))
78 |
79 | out = []
80 |
81 | for i in [random.sample(i, k=length-flex_n) for i in combinations]:
82 | if len(i) == len(set(i)):
83 | if i not in out:
84 | out.append(i)
85 |
86 | return out
87 |
88 | def sobolgrams(self, length=3, n=2):
89 |
90 | '''Sobolgram is a random variation of a flexgram, where
91 | instead of following a sequential order of strings and then
92 | randomly dropping words, there is no order at all and grams
93 | are formed based on random pick within the doc.
94 |
95 | length | int | length of the resulting combinations
96 | n | int | number of grams to draw
97 |
98 | '''
99 |
100 | import random
101 |
102 | out = []
103 |
104 | while len(out) < n:
105 |
106 | gram = random.sample(self._text, k=length)
107 |
108 | if len(set(gram)) == length:
109 | if gram not in out:
110 | out.append(gram)
111 |
112 | return out
113 |
114 | def skipgrams(self, length=3, skip_nth=2):
115 |
116 | '''Skipgrams where every combination of a certain
117 | length is produced and then the item at position
118 | skip_nth is dropped from each combination.
119 |
120 | length | int | length of the combinations before the drop
121 | skip_nth | int | index of the item to drop from each combination
122 |
123 | '''
124 |
125 | import itertools
126 | import numpy as np
127 |
128 | combinations = list(itertools.combinations(self._text, length))
129 |
130 | # create indices of the positions to keep
131 | indices = list(range(length))
132 | indices.remove(skip_nth)
133 |
134 | out = []
135 |
136 | for i in [np.array(i)[indices].tolist() for i in combinations]:
137 | if i not in out:
138 | out.append(i)
139 |
140 | return out
141 |
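142 | # A minimal usage sketch (illustrative only), assuming a tokenized document
143 | # such as the output of Transform().tokens_flat() in the example notebooks:
144 | #
145 | #   tokens = ['jack', 'is', 'a', 'green', 'cat']
146 | #   Grams(tokens).ngrams(2)        # bigrams, e.g. [['jack', 'is'], ['is', 'a'], ...]
147 | #   Grams(tokens).ngrams(3, 2)     # trigrams with a 2-step skip
148 | #   Grams(tokens).combinations(3)  # every 3-word combination without repeated words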
--------------------------------------------------------------------------------
/examples/Cleaning Documents for NLP with Signs.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "
Signs is a set of tools for text preparation, vectorization and processing. Below is provided a set of examples that cover many of the commonly used workflows. "
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "import signs as signs"
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {},
22 | "source": [
23 | "### Cleaning | `signs.Clean()`\n",
24 | "\n",
25 | "While `signs.Transform()` has `auto=True` for automatic cleaning, sometimes it's useful to explicitly clean documents. Let's start with an automated example."
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": null,
31 | "metadata": {},
32 | "outputs": [],
33 | "source": [
34 | "doc = ' Jack is a green 😂😂😂 cat... \\n with a hat \\n '"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": null,
40 | "metadata": {},
41 | "outputs": [],
42 | "source": [
43 | "# create the object\n",
44 | "cleaned = signs.Clean(doc)\n",
45 | "\n",
46 | "# access the text\n",
47 | "cleaned.text\n",
48 | "\n",
49 | "# you could of course also directly do\n",
50 | "signs.Clean(doc).text"
51 | ]
52 | },
53 | {
54 | "cell_type": "markdown",
55 | "metadata": {},
56 | "source": [
57 | "All the cleaning operations can be accessed individually, and not all cleaning operations are included in the automatic processing."
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": null,
63 | "metadata": {},
64 | "outputs": [],
65 | "source": [
66 | "# create the object\n",
67 | "cleaned = signs.Clean(doc, auto=False)"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": null,
73 | "metadata": {},
74 | "outputs": [],
75 | "source": [
76 | "# make text all caps\n",
77 | "cleaned.caps()\n",
78 | "\n",
79 | "# make text all lower\n",
80 | "cleaned.low()\n",
81 | "\n",
82 | "# decode the text\n",
83 | "cleaned.decod()\n",
84 | "\n",
85 | "# remove emojis\n",
86 | "cleaned.emoji()\n",
87 | "\n",
88 | "# remove leading and trailing whitespace\n",
89 | "cleaned.leadtrail()\n",
90 | "\n",
91 | "# remove all whitespace\n",
92 | "cleaned.whitespace()\n",
93 | "\n",
94 | "# remove linebreaks\n",
95 | "cleaned.linebreaks()\n",
96 | "\n",
97 | "# remove links\n",
98 | "cleaned.links()\n",
99 | "\n",
100 | "# remove punctuation\n",
101 | "cleaned.punct()\n",
102 | "\n",
103 | "# remove arbitrary string\n",
104 | "cleaned.string('is a green')"
105 | ]
106 | },
107 | {
108 | "cell_type": "markdown",
109 | "metadata": {},
110 | "source": [
111 | "### Remove common words | `signs.Stopwords()`\n",
112 | "\n",
113 | "**Signs** another common operation for data cleaning involves removing a list of words from the documents. \n",
114 | "\n",
115 | "For this purpose we have to transform the documents into a list-of-lists where each sublist consist of a tokenized document. This easily done with `signs.Transform()`. Because `signs.Transform()` expects as input a set of documents, and only have one, we have to wrap the document in a list."
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": null,
121 | "metadata": {},
122 | "outputs": [],
123 | "source": [
124 | "# transform doc/s to the right format\n",
125 | "tokens = signs.Transform([doc]).tokens()\n",
126 | "\n",
127 | "# filter the docs\n",
128 | "filtered_tokens = signs.Stopwords(tokens)\n",
129 | "\n",
130 | "# then access the filtered docs\n",
131 | "filtered_tokens.docs"
132 | ]
133 | },
134 | {
135 | "cell_type": "markdown",
136 | "metadata": {},
137 | "source": [
138 | "`signs.Stopwords()` allows several options for customization."
139 | ]
140 | },
141 | {
142 | "cell_type": "code",
143 | "execution_count": null,
144 | "metadata": {},
145 | "outputs": [],
146 | "source": [
147 | "# set minimum length for words\n",
148 | "signs.Stopwords(tokens, min_length=3)\n",
149 | "\n",
150 | "# set maximum threshold for words (accept all words above this)\n",
151 | "signs.Stopwords(tokens, max_threshold=8)\n",
152 | "\n",
153 | "# add custom words\n",
154 | "signs.Stopwords(tokens, add_stopwords=['jack'])\n",
155 | "\n",
156 | "# just use custom words\n",
157 | "signs.Stopwords(tokens, common_stopwords=False, add_stopwords=['jack']).docs"
158 | ]
159 | },
160 | {
161 | "cell_type": "markdown",
162 | "metadata": {},
163 | "source": [
164 | "Bare in mind that all operations in `signs.Stopwords()` are destructive."
165 | ]
166 | }
167 | ],
168 | "metadata": {
169 | "kernelspec": {
170 | "display_name": "Python 3",
171 | "language": "python",
172 | "name": "python3"
173 | },
174 | "language_info": {
175 | "codemirror_mode": {
176 | "name": "ipython",
177 | "version": 3
178 | },
179 | "file_extension": ".py",
180 | "mimetype": "text/x-python",
181 | "name": "python",
182 | "nbconvert_exporter": "python",
183 | "pygments_lexer": "ipython3",
184 | "version": "3.6.6"
185 | }
186 | },
187 | "nbformat": 4,
188 | "nbformat_minor": 2
189 | }
190 |
--------------------------------------------------------------------------------
/examples/Rapid NLP Development Examples with Signs.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "
Signs is a set of tools for text preparation, vectorization and processing. Below is provided a set of examples that cover many of the commonly used workflows. "
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "import signs as signs"
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {},
22 | "source": [
23 | "First, we will read some data."
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "import dedomena as da\n",
33 | "docs = da.apis.pubmed('cervical cancer', 500, True)"
34 | ]
35 | },
36 | {
37 | "cell_type": "markdown",
38 | "metadata": {},
39 | "source": [
40 | "### Transformation | `signs.Transform()`\n",
41 | "\n",
42 | "Then next, let's clean up the data. `signs.Transform()` allows the systematic creation of all important data formats from a single class object. If `clean=True` then the following preprocessing tasks will be performed:\n",
43 | "\n",
44 | "- force lower case\n",
45 | "- remove urls\n",
46 | "- remove emojis\n",
47 | "- remove punctuation\n",
48 | "- remove linebreaks\n",
49 | "- remove leading and traing whitespace\n",
50 | "- comprehensive stopwords filtering\n",
51 | "- decode from binary"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "docs = signs.Transform(docs)"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": null,
66 | "metadata": {},
67 | "outputs": [],
68 | "source": [
69 | "# return the original docs\n",
70 | "docs.docs()\n",
71 | "\n",
72 | "# return the original docs cleaned\n",
73 | "docs.docs(True)\n",
74 | "\n",
75 | "# return original docs but flattened\n",
76 | "docs.docs_flat()\n",
77 | "\n",
78 | "# return original docs flattened and clean\n",
79 | "docs.docs_flat(True)\n",
80 | "\n",
81 | "# return original docs in a single string blob\n",
82 | "docs.docs_string()\n",
83 | "\n",
84 | "# return original docs in single string blob cleaned\n",
85 | "docs.docs_string(True)\n",
86 | "\n",
87 | "# return tokenized version of docs\n",
88 | "docs.tokens()\n",
89 | "\n",
90 | "# return tokenized version cleaned\n",
91 | "docs.tokens(True)\n",
92 | "\n",
93 | "# return tokenized and flattend\n",
94 | "docs.tokens_flat()\n",
95 | "\n",
96 | "# return tokenized and flattened clean\n",
97 | "docs.tokens_flat(True)"
98 | ]
99 | },
100 | {
101 | "cell_type": "markdown",
102 | "metadata": {},
103 | "source": [
104 | "All of the the following examples will utilize one of these data formats by calling the class object `docs` we have created above. It's better to always ingest the original docs into `signs.Transform` to minimize overhead while being sure of format compliance."
105 | ]
106 | },
107 | {
108 | "cell_type": "markdown",
109 | "metadata": {},
110 | "source": [
111 | "### Text Statistics | `signs.Stopwords()`\n",
112 | "\n",
113 | "**Signs** allows stopword removal against an arbitrary sized list of stopwords in roughly 10,000 documents per second."
114 | ]
115 | },
116 | {
117 | "cell_type": "code",
118 | "execution_count": null,
119 | "metadata": {},
120 | "outputs": [],
121 | "source": [
122 | "filtered_tokens = signs.Stopwords(docs.tokens())"
123 | ]
124 | },
125 | {
126 | "cell_type": "markdown",
127 | "metadata": {},
128 | "source": [
129 | "### Text Statistics | `signs.Describe()`\n",
130 | "**Signs** provides common text analytics functionalities under the Describe() class. "
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": null,
136 | "metadata": {},
137 | "outputs": [],
138 | "source": [
139 | "# read documents\n",
140 | "desc = signs.Describe(docs.tokens_flat())"
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": null,
146 | "metadata": {},
147 | "outputs": [],
148 | "source": [
149 | "desc.get_counts()\n",
150 | "desc.get_gram_counts(3, 1)"
151 | ]
152 | },
153 | {
154 | "cell_type": "markdown",
155 | "metadata": {},
156 | "source": [
157 | "### Grams | `signs.Grams()`\n",
158 | "**Signs** provides access to ngrams and skipgrams through `signs.Grams()`. "
159 | ]
160 | },
161 | {
162 | "cell_type": "code",
163 | "execution_count": null,
164 | "metadata": {},
165 | "outputs": [],
166 | "source": [
167 | "# bigrams\n",
168 | "signs.Grams(docs.tokens_flat()).ngrams(2)\n",
169 | "\n",
170 | "# trigrams\n",
171 | "signs.Grams(docs.tokens_flat()).ngrams(3)\n",
172 | "\n",
173 | "# trigram with 2-step skipgram\n",
174 | "signs.Grams(docs.tokens_flat()).ngrams(3, 2)"
175 | ]
176 | },
177 | {
178 | "cell_type": "markdown",
179 | "metadata": {},
180 | "source": [
181 | "### Snippets of text | `signs.Verbatims()`\n",
182 | "Another helpful feature is exracting verbatims based on a keyword and boundary through `signs.Verbatims()`. "
183 | ]
184 | },
185 | {
186 | "cell_type": "code",
187 | "execution_count": null,
188 | "metadata": {},
189 | "outputs": [],
190 | "source": [
191 | "signs.Verbatims(docs.tokens_flat()).verbatims('cell')"
192 | ]
193 | },
194 | {
195 | "cell_type": "markdown",
196 | "metadata": {},
197 | "source": [
198 | "### Document Vectors | `signs.TrainDoc2Vec()`"
199 | ]
200 | },
201 | {
202 | "cell_type": "code",
203 | "execution_count": null,
204 | "metadata": {},
205 | "outputs": [],
206 | "source": [
207 | "# then train the Doc2Vec model\n",
208 | "model, train_corpus = signs.TrainDoc2Vec(docs.tokens()[:450])"
209 | ]
210 | },
211 | {
212 | "cell_type": "markdown",
213 | "metadata": {},
214 | "source": [
215 | "### Document Similarity | `signs.DocSimilarity()`\n",
216 | "\n",
217 | "There are several document similarity options available. Examples for each are provided below.\n",
218 | "\n",
219 | "- similarity matrix for seen documents\n",
220 | "- similarity matrix for unseen documents\n",
221 | "- similarity between a single unseen document and seen docs\n",
222 | "- spatial distance between two documents, seen or unseen"
223 | ]
224 | },
225 | {
226 | "cell_type": "code",
227 | "execution_count": null,
228 | "metadata": {},
229 | "outputs": [],
230 | "source": [
231 | "sims = signs.DocSimilarity(model, docs)"
232 | ]
233 | },
234 | {
235 | "cell_type": "markdown",
236 | "metadata": {},
237 | "source": [
238 | "There are several options for getting the similarities:\n",
239 | "\n",
240 | "- `similar_docs()` for any document to all training documents\n",
241 | "- `spatial_distance()` for any document to any document\n",
242 | "- `seen_matrix()` for a 2d similarity matrix for all training documents\n",
243 | "- `unseen_matrix()` for a 2d similarity matrix for any set of documents\n",
244 | "\n",
245 | "Note that `unseen_matrix()` might take time as the matrix grows. Example of use as below:"
246 | ]
247 | },
248 | {
249 | "cell_type": "code",
250 | "execution_count": null,
251 | "metadata": {},
252 | "outputs": [],
253 | "source": [
254 | "sims.similar_docs(docs.tokens()[1])\n",
255 | "\n",
256 | "sims.spatial_distance(doc1=docs.tokens()[451],\n",
257 | " doc2=docs.tokens()[452])\n",
258 | "\n",
259 | "sims.seen_matrix()\n",
260 | "\n",
261 | "sims.unseen_matrix(docs.tokens())"
262 | ]
263 | },
264 | {
265 | "cell_type": "markdown",
266 | "metadata": {},
267 | "source": [
268 | "Finally, there is a method for previewing the most and least similar documents as a reference. This is done with `sims.preview_results()`."
269 | ]
270 | }
271 | ],
272 | "metadata": {
273 | "kernelspec": {
274 | "display_name": "Python 3",
275 | "language": "python",
276 | "name": "python3"
277 | },
278 | "language_info": {
279 | "codemirror_mode": {
280 | "name": "ipython",
281 | "version": 3
282 | },
283 | "file_extension": ".py",
284 | "mimetype": "text/x-python",
285 | "name": "python",
286 | "nbconvert_exporter": "python",
287 | "pygments_lexer": "ipython3",
288 | "version": "3.6.6"
289 | }
290 | },
291 | "nbformat": 4,
292 | "nbformat_minor": 2
293 | }
294 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to Autonomio
2 |
3 | First I want to thank you A LOT for considering / taking the effort to contribute code to Signs. Below you will find some simple and mostly obvious guidelines on how to do it in the most valuable way.
4 |
5 | In one picture, all contributions should keep in mind that great software is built like...
6 |
7 |
8 |
9 |
10 | 1. [Ways to Contribute](#ways-to-contribute)
11 |
12 | 1.1. [Code](#code)
13 |
14 | 1.2. [Ideas](#ideas)
15 |
16 | 1.3. [Testing](#testing)
17 |
18 | 1.4. [Something Else](#something)
19 |
20 | 1.5. [Documentation](#documentation)
21 |
22 | 1.6. [Examples](#examples)
23 |
24 | 2. [Important Precautions for Code Contributions](#precautions)
25 |
26 | 2.1. [Planning](#code)
27 |
28 | 2.2. [Testing](#ideas)
29 |
30 | 2.3. [Documentation](#docs_for_review)
31 |
32 | 3. [Reviewing Pull Requests](#review)
33 |
34 | 4. [Specific Guidelines for Github](#github)
35 |
36 | ## 1. Ways to contribute
37 |
38 | There are several ways programmers, data scientists and others can contribute to Signs.
39 |
40 | #### 1.1. Contributing Code
41 |
42 | ##### 1.1.0. Note on Philosophy and Style
43 |
44 | **SIGNS DEV PHILOSOPHY**
45 |
46 | - Doing is more interesting than achieving
47 | - Having fun is more important than being productive
48 | - User docs are as important as features
49 | - Testing is more important than building
50 | - Testing should be focused on function not coverage
51 | - Creating great stuff (that works) takes a long time
52 |
53 | **CODING STYLE GUIDELINES**
54 |
55 | We follow pep8. Because [reading docs](http://legacy.python.org/dev/peps/pep-0008/) and particularly [style guides](http://legacy.python.org/dev/peps/pep-0008/) more or less suck, we use Atom and the amazing Linter plugin so we don't have to.
56 |
57 | **MORE STYLE GUIDELINES**
58 |
59 | We also make the best effort in moving towards following pep20:
60 |
61 | - Beautiful is better than ugly.
62 | - Explicit is better than implicit.
63 | - Simple is better than complex.
64 | - Complex is better than complicated.
65 | - Flat is better than nested.
66 | - Sparse is better than dense.
67 | - Readability counts.
68 | - Special cases aren't special enough to break the rules.
69 | - Although practicality beats purity.
70 | - Errors should never pass silently.
71 | - Unless explicitly silenced.
72 | - In the face of ambiguity, refuse the temptation to guess.
73 | - There should be one-- and preferably only one --obvious way to do it.
74 | - Although that way may not be obvious at first unless you're Dutch.
75 | - Now is better than never.
76 | - Although never is often better than right now.
77 | - If the implementation is hard to explain, it's a bad idea.
78 | - If the implementation is easy to explain, it may be a good idea.
79 | - Namespaces are one honking great idea -- let's do more of those!
80 |
81 | ##### 1.1.1. Contribute to Open Issues
82 |
83 | It will be great if you can contribute some code to Signs. To do this, the best way is to:
84 |
85 | 1) check out the [open issues](https://github.com/autonomio/signs/issues)
86 | 2) join the conversation and share your willingness to contribute
87 | 3) somebody will help you get started / provide more details if needed
88 | 4) fork [the current dev](https://github.com/autonomio/signs/issues#fork-destination-box) branch
89 | 5) make your changes to your own fork/repo
90 | 6) test, test, test
91 | 7) if it's a new feature, make changes to test_script.py accordingly
92 | 8) make sure that Travis build passes
93 | 9) come back and make a pull request
94 |
95 | What we really try to avoid, is being this guy...
96 |
97 |
98 |
99 | **Committing Code**
100 |
101 | In the root of your local you will find a commit script, you can run it with:
102 |
103 | ./commit
104 |
105 | Do this before committing as it ensures that basic sanity checks are done before commits.
106 |
107 | #### 1.1.2. Contribute to a New Idea
108 |
109 | Same as above, but start by [creating a new issue](https://github.com/autonomio/signs/issues/new) to open a discussion on the idea you have for contribution.
110 |
111 | ### 1.2. Contributing Ideas
112 |
113 | In case you don't want to contribute code, but have a feature request or some other idea, that is a great contribution as well and will be much appreciated. You can do it by [creating a new issue](https://github.com/autonomio/signs/issues/new).
114 |
115 |
116 |
117 | ### 1.3. Contributing Testing
118 |
119 | In case you don't want to contribute code, contributing testing is a great way to help and will be much appreciated.
120 |
121 | **Testing comes in two forms:**
122 |
123 | #### 1.3.1 actual testing
124 |
125 | Just use Autonomio for any open challenge you are working on. Or pick one from [Kaggle](https://www.kaggle.com/competitions).
126 |
127 | 1) Work with Autonomio in data science challenges
128 | 2) Try a lot of different things
129 | 3) [Report issues](https://github.com/autonomio/core-module/issues/new) as you may find them
130 |
131 | #### 1.3.2 improving code coverage
132 |
133 | We're using [Coveralls](https://coveralls.io) for code coverage testing, and even the smallest contributions to this end help a great deal.
134 |
135 | 1) Follow the instructions in section 1.1 and 1.3.1
136 | 2) Use your own fork to see how the results improve in comparison to [current Master](https://coveralls.io/github/autonomio/core-module)
137 |
138 | ### 1.4. Contributing Something Else
139 |
140 | Last but not least, if there is something you want to do that was not covered in the above sections, please share more by [creating a new issue](https://github.com/autonomio/core-module/issues/new).
141 |
142 |
143 | ### 1.5. Contributing to Manual / Documentation
144 |
145 | We're using [Readthedocs](http://readthedocs.io) for documentation. See the latest [Autonomio documentation build](http://autonomio.readthedocs.io) for a reference of current status. The goal is to have comprehensive documentation, enough so that 100% of the actual practical capability is covered in terms of clear instructions. The documentation is automatically built from changes to [docs/index.rst](https://github.com/autonomio/core-module/tree/master/docs).
146 |
147 | To contribute to the Manual, you have two options:
148 |
149 | - The pro way: follow the steps in section 1.1
150 | - A simpler way: follow the steps in section 1.4
151 |
152 |
153 |
154 | ### 1.6. Contributing Examples
155 |
156 | One of the most useful ways to contribute is when you use Autonomio for an actual project / challenge, and then write a blog post about your experience with code examples.
157 |
158 | ## 2. Important Precautions for Code Contributions
159 |
160 | ### 2.1. Planning the Change
161 |
162 | Before even thinking about making any changes to actual code:
163 |
164 | 1) Define what is happening now (what needs to be changed)
165 | 2) Define what is happening differently (once the code is changed)
166 | 3) Use text search to find which files / functions are affected
167 | 4) Make sure that you understand what each function is doing in relation to the change
168 |
169 | ### 2.2. Testing the Change
170 |
171 | Never ever, under any circumstances, commit code that is not thoroughly tested:
172 |
173 | 1) Run through the code changes and ask yourself if it makes sense
174 | 2) Create a clean environment and install from your fork:
175 |
176 | pip install git+http://your-fork-repo-address.git
177 |
178 | 3) Perform all the commands where your changes are involved and note them down
179 | 4) Change the test_script.py in the repo root with the commands from step 3
180 | 5) Make sure that code coverage is not becoming lower
181 | 6) Make sure that Travis build is passed
182 |
183 | In terms of code coverage, 100% coverage for your changes (so that overall coverage does not drop at all) should always be the case. If you can't do that, then at least explain the possible caveats you've made in your commit details and also in the comments section of the pull request you are making.
184 |
185 | Once you've gone through all these steps, take a short break, come back and ask yourself the question:
186 |
187 | "WHAT COULD GO WRONG?"
188 |
189 | ### 2.3. Documentation standards
190 |
191 | Because Autonomio is based on creating a very high level abstraction, and a lot of "magic" takes place without the user knowing about it, it's very important that we are thorough in our documentation. This needs to take place to the extent that users like the one who made the following comment are happy:
192 |
193 | > Strong documentation and tutorials, with an emphasis on data prep. Don't abstract things at the cost of understanding what is going on behind the hood. I want to know how it all works, and what is being done to my data - I just don't want to have to code every step.
194 |
195 | This is the kind of situation we are trying to create; a lot of mundane but hard or time consuming to do things are taken care of on behalf of the user, but if the user so desires, he or she can get a full (and clear!) picture of what's being done.
196 |
197 | In order to pass code review (required for merging pull requests), together with your code in the same pull request you must provide the following updates to the documentation [docs/index.rst](https://github.com/mikkokotila/core-module/blob/master/docs/index.rst).
198 |
199 | 1) What does it do (a high level overview)
200 | 1.1) some use examples
201 | 2) What data input it requires
202 | 2.1) some use examples
203 | 3) What it outputs
204 | 4) What parameters are there (a high level overview)
205 | 4.1) What options each parameter has
206 | 4.2) What is the function of each parameter option
207 |
208 | In short summary, the goal is that the user can completely understand 100% of the functioning of the features. If something is done so that the user can’t see it when they are running the commands, we have to explain exactly and thoroughly what kind of automations are taking place.
209 |
210 | ### 3. Reviewing Pull Requests
211 |
212 | If you've been assigned as a reviewer of a given pull request, unless you've been explicitly asked to do so, **DON'T MERGE**; just approve the review and share what you think in the comments. If you don't have any comments, just confirm with a comment that you don't have any. While this is kind of obvious, don't start reviewing before you can see all the tests have passed ;)
213 |
214 | ### 4. General points on using Github
215 |
216 | 1) First things first, make sure you understand [this](https://guides.github.com/introduction/flow/index.html) 100%
217 | 2) Also make sure that you clearly understand everything that is said [here](https://blog.hartleybrody.com/git-small-teams/)
218 | 3) Working on your local machine, only have one folder (the git remote)
219 | 4) Load it as module with:
220 |
221 | import sys
222 | sys.path.insert(0, '/home/autonomio/dev/core-module')
223 |
224 | 5) Frequently fetch origin to make sure you get latest changes from other people
225 | 6) Don’t work in separate forks, but in branches
226 | 7) Keep commits as small as possible
227 | 8) Make clear commit messages (explain what you actually are changing)
228 | 9) Unless working on something completely new on a separate branch, never start a new day without fetching origin
229 | 10) Make sure to update test script before pushing
230 |
231 | For Mac users Github desktop is pretty fantastic. For Linux users the GUIs are not so fantastic. Atom looks like a good cross-platform option.
232 |
--------------------------------------------------------------------------------
/examples/Fake News Classification Signs Embeds Pipeline and Hyperparameter Optimization for Keras and TF2.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "
Signs is a set of tools for text preparation, vectorization and processing. Below is provided a set of examples that cover many of the commonly used workflows. "
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [
15 | {
16 | "name": "stderr",
17 | "output_type": "stream",
18 | "text": [
19 | "Using TensorFlow backend.\n"
20 | ]
21 | }
22 | ],
23 | "source": [
24 | "import sys\n",
25 | "sys.path.insert(0, '/Users/mikko/Documents/GitHub/talos/')\n",
26 | "sys.path.insert(0, '/Users/mikko/Documents/GitHub/signs/')\n",
27 | "sys.path.insert(0, '/Users/mikko/Documents/GitHub/dedomena/')\n",
28 | "\n",
29 | "from pandas import read_csv\n",
30 | "\n",
31 | "import signs\n",
32 | "import wrangle as wr\n",
33 | "from kerasplotlib import TrainingLog\n",
34 | "import talos\n",
35 | "\n",
36 | "%matplotlib inline"
37 | ]
38 | },
39 | {
40 | "cell_type": "markdown",
41 | "metadata": {},
42 | "source": [
43 | "Let's read some data first and the title of each document for training a model. We're going to use a fake news dataset for these examples."
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 2,
49 | "metadata": {},
50 | "outputs": [],
51 | "source": [
52 | "import dedomena\n",
53 | "df = dedomena.datasets.autonomio('fake_news')\n",
54 | "\n",
55 | "# prepared the documents from the dataset\n",
56 | "docs = df.title[:1000].astype(str)\n",
57 | "\n",
58 | "# create y data\n",
59 | "y = df.label[:1000].values"
60 | ]
61 | },
62 | {
63 | "cell_type": "markdown",
64 | "metadata": {},
65 | "source": [
66 | "### Embeddings | `signs.Embeds()`\n",
67 | "**Signs** provides a very convinient way to create embeddings for a TF/Keras model. You can read in any pretrained vectors from one of the supported vector types:\n",
68 | "\n",
69 | "- GloVe\n",
70 | "- Word2Vec\n",
71 | "- FastText"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": 3,
77 | "metadata": {},
78 | "outputs": [],
79 | "source": [
80 | "embeds = signs.Embeds(\"/Volumes/KINGSTON/glove.twitter.27B.25d.txt\")"
81 | ]
82 | },
83 | {
84 | "cell_type": "markdown",
85 | "metadata": {},
86 | "source": [
87 | "Contained within the created `embeds` object, we now have the Keras embedding layer which we can use to ingest our documents."
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": 4,
93 | "metadata": {},
94 | "outputs": [],
95 | "source": [
96 | "# here we also get the embedding layer for keras\n",
97 | "embedding_layer, x = embeds.layer(docs)"
98 | ]
99 | },
100 | {
101 | "cell_type": "markdown",
102 | "metadata": {},
103 | "source": [
104 | "Now that we have our `x` data and the embeddings layer for the model ready, we can finally split the data before moving onto the model. We will use 30% of the data to validate the results after the hyperparameter scanning process is finished."
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": 5,
110 | "metadata": {},
111 | "outputs": [],
112 | "source": [
113 | "import wrangle\n",
114 | "x_train, y_train, x_val, y_val = wrangle.array_split(x, y, .3)"
115 | ]
116 | },
117 | {
118 | "cell_type": "markdown",
119 | "metadata": {},
120 | "source": [
121 | "For hyperparameter optimization we're going to use another Autonomio solution, Talos."
122 | ]
123 | },
124 | {
125 | "cell_type": "code",
126 | "execution_count": 6,
127 | "metadata": {},
128 | "outputs": [],
129 | "source": [
130 | "from keras.models import Sequential\n",
131 | "from keras.layers import Dense, Dropout, Flatten\n",
132 | "\n",
133 | "\n",
134 | "def fake_news(x_train, y_train, x_val, y_val, params):\n",
135 | "\n",
136 | " model = Sequential()\n",
137 | " model.add(params['embedding_layer'])\n",
138 | " model.add(Flatten())\n",
139 | " model.add(Dropout(params['dropout']))\n",
140 | "\n",
141 | " model.add(Dense(1, activation='sigmoid'))\n",
142 | " model.compile(optimizer=params['optimizer'],\n",
143 | " loss=params['losses'],\n",
144 | " metrics=['acc'])\n",
145 | "\n",
146 | " out = model.fit(x_train, y_train,\n",
147 | " epochs=params['epochs'],\n",
148 | " batch_size=params['batch_size'],\n",
149 | " verbose=0,\n",
150 | " validation_split=.3,\n",
151 | " validation_data=[x_val, y_val])\n",
152 | " \n",
153 | " return out, model"
154 | ]
155 | },
156 | {
157 | "cell_type": "markdown",
158 | "metadata": {},
159 | "source": [
160 | "In addition to the input model, Talos requires us to provide a parameter dictionary with the parameters for the experiment."
161 | ]
162 | },
163 | {
164 | "cell_type": "code",
165 | "execution_count": 7,
166 | "metadata": {},
167 | "outputs": [],
168 | "source": [
169 | "params = {'embedding_layer': [embedding_layer],\n",
170 | " 'batch_size': (10, 30, 5),\n",
171 | " 'epochs': [50],\n",
172 | " 'dropout': (0.1, 0.3, 10),\n",
173 | " 'optimizer': ['Adam', 'Nadam'],\n",
174 | " 'losses': ['binary_crossentropy', 'logcosh'],\n",
175 | " 'activation':['relu', 'elu']}"
176 | ]
177 | },
178 | {
179 | "cell_type": "markdown",
180 | "metadata": {},
181 | "source": [
182 | "The really cool thing here is that we could try different embedding layers (based on different trained vectors for example) as part of the experiment. "
183 | ]
184 | },
185 | {
186 | "cell_type": "code",
187 | "execution_count": 8,
188 | "metadata": {},
189 | "outputs": [
190 | {
191 | "name": "stderr",
192 | "output_type": "stream",
193 | "text": [
194 | "100%|██████████| 40/40 [05:49<00:00, 10.16s/it]\n"
195 | ]
196 | }
197 | ],
198 | "source": [
199 | "h = talos.Scan(x_train, y_train,\n",
200 | " params=params,\n",
201 | " experiment_name='fake_news_test',\n",
202 | " model=fake_news,\n",
203 | " fraction_limit=0.1,\n",
204 | " clear_session=False)"
205 | ]
206 | },
207 | {
208 | "cell_type": "markdown",
209 | "metadata": {},
210 | "source": [
211 | "### Predictions | `signs.Preds()`\n",
212 | "Next, let's put the best model from the experiment into use to see how the results look like. For this, we need to first find the best model from the Talos `Scan()` object."
213 | ]
214 | },
215 | {
216 | "cell_type": "code",
217 | "execution_count": 13,
218 | "metadata": {},
219 | "outputs": [],
220 | "source": [
221 | "# get the best model from the experiment\n",
222 | "model = h.best_model()\n",
223 | "\n",
224 | "# prepare the predictions object\n",
225 | "preds = signs.Preds(x_val, y_val, embeds.word_index, model)"
226 | ]
227 | },
228 | {
229 | "cell_type": "markdown",
230 | "metadata": {},
231 | "source": [
232 | "There are several ways we can learn more about the model. "
233 | ]
234 | },
235 | {
236 | "cell_type": "code",
237 | "execution_count": 17,
238 | "metadata": {},
239 | "outputs": [
240 | {
241 | "data": {
242 | "text/html": [
243 | "TRUE POSITIVE
"
244 | ],
245 | "text/plain": [
246 | ""
247 | ]
248 | },
249 | "metadata": {},
250 | "output_type": "display_data"
251 | },
252 | {
253 | "data": {
254 | "text/html": [
255 | "no intention of attacking anyone this is absurd ' says vladimir putin"
256 | ],
257 | "text/plain": [
258 | ""
259 | ]
260 | },
261 | "metadata": {},
262 | "output_type": "display_data"
263 | },
264 | {
265 | "data": {
266 | "text/html": [
267 | "why polls showing hillary in the lead are useless and misleading cartoon"
268 | ],
269 | "text/plain": [
270 | ""
271 | ]
272 | },
273 | "metadata": {},
274 | "output_type": "display_data"
275 | },
276 | {
277 | "data": {
278 | "text/html": [
279 | "carter attempting to fool the american public about veterans caught red handed"
280 | ],
281 | "text/plain": [
282 | ""
283 | ]
284 | },
285 | "metadata": {},
286 | "output_type": "display_data"
287 | },
288 | {
289 | "data": {
290 | "text/html": [
291 | "this is the man militarized police at standing rock are working for"
292 | ],
293 | "text/plain": [
294 | ""
295 | ]
296 | },
297 | "metadata": {},
298 | "output_type": "display_data"
299 | },
300 | {
301 | "data": {
302 | "text/html": [
303 | "to be unable to use linkedin services — company russia news now"
304 | ],
305 | "text/plain": [
306 | ""
307 | ]
308 | },
309 | "metadata": {},
310 | "output_type": "display_data"
311 | },
312 | {
313 | "data": {
314 | "text/html": [
315 | "TRUE NEGATIVE
"
316 | ],
317 | "text/plain": [
318 | ""
319 | ]
320 | },
321 | "metadata": {},
322 | "output_type": "display_data"
323 | },
324 | {
325 | "data": {
326 | "text/html": [
327 | "a sunny spirit in the face of hardship the new york times"
328 | ],
329 | "text/plain": [
330 | ""
331 | ]
332 | },
333 | "metadata": {},
334 | "output_type": "display_data"
335 | },
336 | {
337 | "data": {
338 | "text/html": [
339 | "a pioneer in women’s hairstyling dies at 94 the new york times"
340 | ],
341 | "text/plain": [
342 | ""
343 | ]
344 | },
345 | "metadata": {},
346 | "output_type": "display_data"
347 | },
348 | {
349 | "data": {
350 | "text/html": [
351 | "failed to disclose income from russia linked entities the new york times"
352 | ],
353 | "text/plain": [
354 | ""
355 | ]
356 | },
357 | "metadata": {},
358 | "output_type": "display_data"
359 | },
360 | {
361 | "data": {
362 | "text/html": [
363 | "branch might keep parts of the health law the new york times"
364 | ],
365 | "text/plain": [
366 | ""
367 | ]
368 | },
369 | "metadata": {},
370 | "output_type": "display_data"
371 | },
372 | {
373 | "data": {
374 | "text/html": [
375 | "world’s smallest machines awarded nobel prize in chemistry the new york times"
376 | ],
377 | "text/plain": [
378 | ""
379 | ]
380 | },
381 | "metadata": {},
382 | "output_type": "display_data"
383 | }
384 | ],
385 | "source": [
386 | "# Examples of true positives and true negatives\n",
387 | "preds.hits()"
388 | ]
389 | },
390 | {
391 | "cell_type": "code",
392 | "execution_count": 18,
393 | "metadata": {},
394 | "outputs": [
395 | {
396 | "data": {
397 | "text/html": [
398 | "FALSE POSITIVE
"
399 | ],
400 | "text/plain": [
401 | ""
402 | ]
403 | },
404 | "metadata": {},
405 | "output_type": "display_data"
406 | },
407 | {
408 | "data": {
409 | "text/html": [
410 | "goodbye for good to black sabbath the new york times"
411 | ],
412 | "text/plain": [
413 | ""
414 | ]
415 | },
416 | "metadata": {},
417 | "output_type": "display_data"
418 | },
419 | {
420 | "data": {
421 | "text/html": [
422 | "russia u s missile defense ’poses deep risk’ to security of asia"
423 | ],
424 | "text/plain": [
425 | ""
426 | ]
427 | },
428 | "metadata": {},
429 | "output_type": "display_data"
430 | },
431 | {
432 | "data": {
433 | "text/html": [
434 | "cnn statement distances network from buzzfeed fake news dossier breitbart"
435 | ],
436 | "text/plain": [
437 | ""
438 | ]
439 | },
440 | "metadata": {},
441 | "output_type": "display_data"
442 | },
443 | {
444 | "data": {
445 | "text/html": [
446 | "suicides by chicago police officers skyrocket"
447 | ],
448 | "text/plain": [
449 | ""
450 | ]
451 | },
452 | "metadata": {},
453 | "output_type": "display_data"
454 | },
455 | {
456 | "data": {
457 | "text/html": [
458 | "public employees and the shadow world of american carnage"
459 | ],
460 | "text/plain": [
461 | ""
462 | ]
463 | },
464 | "metadata": {},
465 | "output_type": "display_data"
466 | },
467 | {
468 | "data": {
469 | "text/html": [
470 | "FALSE NEGATIVE
"
471 | ],
472 | "text/plain": [
473 | ""
474 | ]
475 | },
476 | "metadata": {},
477 | "output_type": "display_data"
478 | },
479 | {
480 | "data": {
481 | "text/html": [
482 | "yikes megyn kelly receives rude awakening reminded she’s replaceable"
483 | ],
484 | "text/plain": [
485 | ""
486 | ]
487 | },
488 | "metadata": {},
489 | "output_type": "display_data"
490 | },
491 | {
492 | "data": {
493 | "text/html": [
494 | "texas elector expects massive corruption related to the electoral college vote"
495 | ],
496 | "text/plain": [
497 | ""
498 | ]
499 | },
500 | "metadata": {},
501 | "output_type": "display_data"
502 | },
503 | {
504 | "data": {
505 | "text/html": [
506 | "unsc members fail to agree on new zealand draft resolution on aleppo"
507 | ],
508 | "text/plain": [
509 | ""
510 | ]
511 | },
512 | "metadata": {},
513 | "output_type": "display_data"
514 | },
515 | {
516 | "data": {
517 | "text/html": [
518 | "saudi arabia announces date certain for the application of vat"
519 | ],
520 | "text/plain": [
521 | ""
522 | ]
523 | },
524 | "metadata": {},
525 | "output_type": "display_data"
526 | },
527 | {
528 | "data": {
529 | "text/html": [
530 | "putin’s pro trump online trolls just spilled the beans to samantha bee"
531 | ],
532 | "text/plain": [
533 | ""
534 | ]
535 | },
536 | "metadata": {},
537 | "output_type": "display_data"
538 | }
539 | ],
540 | "source": [
541 | "# Examples of false positives and false negatives\n",
542 | "preds.misses()"
543 | ]
544 | },
545 | {
546 | "cell_type": "code",
547 | "execution_count": 20,
548 | "metadata": {},
549 | "outputs": [
550 | {
551 | "data": {
552 | "text/html": [
553 | "\n",
554 | "\n",
567 | "
\n",
568 | " \n",
569 | " \n",
570 | " | \n",
571 | " text | \n",
572 | " pred | \n",
573 | " truth | \n",
574 | "
\n",
575 | " \n",
576 | " \n",
577 | " \n",
578 | " | 0 | \n",
579 | " donald trump obama thanksgiving your weekend b... | \n",
580 | " 0.040221 | \n",
581 | " 0 | \n",
582 | "
\n",
583 | " \n",
584 | " | 1 | \n",
585 | " orcs of a different domain fighting with heart... | \n",
586 | " 0.002642 | \n",
587 | " 0 | \n",
588 | "
\n",
589 | " \n",
590 | " | 2 | \n",
591 | " loserpalooza 9 craziest scenes from anti trump... | \n",
592 | " 0.183427 | \n",
593 | " 0 | \n",
594 | "
\n",
595 | " \n",
596 | " | 3 | \n",
597 | " descubre que ha llevado siempre un trozo de le... | \n",
598 | " 0.239980 | \n",
599 | " 1 | \n",
600 | "
\n",
601 | " \n",
602 | " | 4 | \n",
603 | " builds 150 million war chest doubling donald t... | \n",
604 | " 0.007459 | \n",
605 | " 0 | \n",
606 | "
\n",
607 | " \n",
608 | "
\n",
609 | "
"
610 | ],
611 | "text/plain": [
612 | " text pred truth\n",
613 | "0 donald trump obama thanksgiving your weekend b... 0.040221 0\n",
614 | "1 orcs of a different domain fighting with heart... 0.002642 0\n",
615 | "2 loserpalooza 9 craziest scenes from anti trump... 0.183427 0\n",
616 | "3 descubre que ha llevado siempre un trozo de le... 0.239980 1\n",
617 | "4 builds 150 million war chest doubling donald t... 0.007459 0"
618 | ]
619 | },
620 | "execution_count": 20,
621 | "metadata": {},
622 | "output_type": "execute_result"
623 | }
624 | ],
625 | "source": [
626 | "# display the full results\n",
627 | "preds.results.head()"
628 | ]
629 | },
630 | {
631 | "cell_type": "code",
632 | "execution_count": 22,
633 | "metadata": {},
634 | "outputs": [
635 | {
636 | "data": {
637 | "text/html": [
638 | "CLEAR POSITIVE
"
639 | ],
640 | "text/plain": [
641 | ""
642 | ]
643 | },
644 | "metadata": {},
645 | "output_type": "display_data"
646 | },
647 | {
648 | "data": {
649 | "text/html": [
650 | "world’s first zero emissions hydrogen powered passenger train unveiled in germany"
651 | ],
652 | "text/plain": [
653 | ""
654 | ]
655 | },
656 | "metadata": {},
657 | "output_type": "display_data"
658 | },
659 | {
660 | "data": {
661 | "text/html": [
662 | "they are trying to “flip the electoral college” to block trump’s win"
663 | ],
664 | "text/plain": [
665 | ""
666 | ]
667 | },
668 | "metadata": {},
669 | "output_type": "display_data"
670 | },
671 | {
672 | "data": {
673 | "text/html": [
674 | "vandal outs himself in shameless new video taunts police seeking him out"
675 | ],
676 | "text/plain": [
677 | ""
678 | ]
679 | },
680 | "metadata": {},
681 | "output_type": "display_data"
682 | },
683 | {
684 | "data": {
685 | "text/html": [
686 | "cina e arabia saudita domano l'egemonia del dollaro di ariel noyola rodríguez"
687 | ],
688 | "text/plain": [
689 | ""
690 | ]
691 | },
692 | "metadata": {},
693 | "output_type": "display_data"
694 | },
695 | {
696 | "data": {
697 | "text/html": [
698 | "tory councillor say homeless people should be ‘eliminated’"
699 | ],
700 | "text/plain": [
701 | ""
702 | ]
703 | },
704 | "metadata": {},
705 | "output_type": "display_data"
706 | },
707 | {
708 | "data": {
709 | "text/html": [
710 | "CLOSE CALL
"
711 | ],
712 | "text/plain": [
713 | ""
714 | ]
715 | },
716 | "metadata": {},
717 | "output_type": "display_data"
718 | },
719 | {
720 | "data": {
721 | "text/html": [
722 | "coalition u s troops fighting in mosul offensive ’come under fire’"
723 | ],
724 | "text/plain": [
725 | ""
726 | ]
727 | },
728 | "metadata": {},
729 | "output_type": "display_data"
730 | },
731 | {
732 | "data": {
733 | "text/html": [
734 | "spirit cooking the most disturbing podesta email yet warning graphic content"
735 | ],
736 | "text/plain": [
737 | ""
738 | ]
739 | },
740 | "metadata": {},
741 | "output_type": "display_data"
742 | },
743 | {
744 | "data": {
745 | "text/html": [
746 | "saudi arabia announces date certain for the application of vat"
747 | ],
748 | "text/plain": [
749 | ""
750 | ]
751 | },
752 | "metadata": {},
753 | "output_type": "display_data"
754 | },
755 | {
756 | "data": {
757 | "text/html": [
758 | "chuck todd ’buzzfeed did donald trump a political favor’ breitbart"
759 | ],
760 | "text/plain": [
761 | ""
762 | ]
763 | },
764 | "metadata": {},
765 | "output_type": "display_data"
766 | },
767 | {
768 | "data": {
769 | "text/html": [
770 | "of muslims and immigrants to promote policies that ’undermine’ our values breitbart"
771 | ],
772 | "text/plain": [
773 | ""
774 | ]
775 | },
776 | "metadata": {},
777 | "output_type": "display_data"
778 | },
779 | {
780 | "data": {
781 | "text/html": [
782 | "CLEAR NEGATIVE
"
783 | ],
784 | "text/plain": [
785 | ""
786 | ]
787 | },
788 | "metadata": {},
789 | "output_type": "display_data"
790 | },
791 | {
792 | "data": {
793 | "text/html": [
794 | "next year is primed for the big show the new york times"
795 | ],
796 | "text/plain": [
797 | ""
798 | ]
799 | },
800 | "metadata": {},
801 | "output_type": "display_data"
802 | },
803 | {
804 | "data": {
805 | "text/html": [
806 | "radiohead’s ‘a moon shaped pool ’ patient perfectionism the new york times"
807 | ],
808 | "text/plain": [
809 | ""
810 | ]
811 | },
812 | "metadata": {},
813 | "output_type": "display_data"
814 | },
815 | {
816 | "data": {
817 | "text/html": [
818 | "american drivers regain appetite for gas guzzlers the new york times"
819 | ],
820 | "text/plain": [
821 | ""
822 | ]
823 | },
824 | "metadata": {},
825 | "output_type": "display_data"
826 | },
827 | {
828 | "data": {
829 | "text/html": [
830 | "buffett stake suggests apple is all grown up the new york times"
831 | ],
832 | "text/plain": [
833 | ""
834 | ]
835 | },
836 | "metadata": {},
837 | "output_type": "display_data"
838 | }
839 | ],
840 | "source": [
841 | "# a summary of model predictions\n",
842 | "preds.summary()"
843 | ]
844 | },
845 | {
846 | "cell_type": "markdown",
847 | "metadata": {},
848 | "source": [
849 | "### Evaluation | `signs.Evaluate()`\n",
850 | "Finally, let's perform some objective evaluation of the results to see how well the model is doing. We will use Talos for doing this."
851 | ]
852 | },
853 | {
854 | "cell_type": "code",
855 | "execution_count": 30,
856 | "metadata": {},
857 | "outputs": [
858 | {
859 | "data": {
860 | "text/plain": [
861 | "[0.870967741935484,\n",
862 | " 0.819672131147541,\n",
863 | " 0.7878787878787877,\n",
864 | " 0.84375,\n",
865 | " 0.8666666666666667]"
866 | ]
867 | },
868 | "execution_count": 30,
869 | "metadata": {},
870 | "output_type": "execute_result"
871 | }
872 | ],
873 | "source": [
874 | "evl = ta.Evaluate(h)\n",
875 | "evl.evaluate(x_val, y_val, mode='binary')"
876 | ]
877 | }
878 | ],
879 | "metadata": {
880 | "kernelspec": {
881 | "display_name": "Python 3",
882 | "language": "python",
883 | "name": "python3"
884 | },
885 | "language_info": {
886 | "codemirror_mode": {
887 | "name": "ipython",
888 | "version": 3
889 | },
890 | "file_extension": ".py",
891 | "mimetype": "text/x-python",
892 | "name": "python",
893 | "nbconvert_exporter": "python",
894 | "pygments_lexer": "ipython3",
895 | "version": "3.6.6"
896 | }
897 | },
898 | "nbformat": 4,
899 | "nbformat_minor": 2
900 | }
901 |
--------------------------------------------------------------------------------