├── .gitignore ├── LICENSE ├── README.md ├── nate ├── __init__.py ├── cooc │ ├── __init__.py │ ├── cooc_class.py │ └── cooc_offsets.py ├── docnet │ ├── __init__.py │ └── docnet.py ├── edgeburst │ ├── __init__.py │ ├── burst_class.py │ ├── burst_mixin.py │ ├── export.py │ ├── pybursts.py │ └── visualize_bursts.py ├── importers │ ├── __init__.py │ ├── dataframe_importers.py │ ├── edgelist_importers.py │ ├── named_tuple_generator.py │ ├── nate_class.py │ ├── raw_importers.py │ └── timestamp_process.py ├── netplus │ ├── __init__.py │ └── netplus.py ├── semnet │ ├── __init__.py │ └── semnet.py ├── socnet │ ├── __init__.py │ ├── alters.py │ ├── centralities.py │ ├── dissimilarities.py │ ├── old_temsna │ │ ├── combine_covariates.py │ │ ├── create_author_covariates.py │ │ ├── extract_coauthor.py │ │ ├── generate_meta_strings.py │ │ ├── spacy_process │ │ │ ├── spacy_gpu.py │ │ │ ├── spacy_new.py │ │ │ └── spacy_processing_mp.py │ │ └── temsna_dependencies_sparse.png │ └── socnet_class.py ├── svonet │ ├── Arial.ttf │ ├── __init__.py │ ├── degree_over_time.py │ ├── graph_svo.py │ ├── svo.py │ ├── svo_burst_animate.py │ ├── svo_degree_over_time.py │ ├── svo_offsets.py │ ├── svoburst_class.py │ └── svonet_class.py └── utils │ ├── __init__.py │ ├── mp_helpers.py │ ├── network_helpers.py │ ├── nlp_helpers.py │ └── text_helpers.py ├── setup.py └── tests └── importers ├── conftest.py ├── test_dfimporters.py ├── test_namedtuples.py ├── test_nate.py ├── test_rawimporters.py ├── test_times.py └── textfiles ├── 1.txt ├── 2.txt └── 3.txt /.gitignore: -------------------------------------------------------------------------------- 1 | planning/ 2 | archive/ 3 | testing.py 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | pip-wheel-metadata/ 28 | share/python-wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | MANIFEST 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | *.py,cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 99 | __pypackages__/ 100 | 101 | # Celery stuff 102 | celerybeat-schedule 103 | celerybeat.pid 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ 134 | 135 | # 136 | hand/ 137 | 138 | # 139 | .vscode/ 140 | .vscode 141 | 142 | # 143 | *.org 144 | *.el 145 | 146 | # 147 | output/* 148 | !*.gitkeep 149 | 150 | # 151 | data/* 152 | .vscode -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 John McLevey 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # nate (Network Analysis + Text) 4 | 5 | *Research at the intersection of **social network analysis** and applied **natural language processing**.* 6 | 7 | `nate` is a Python package developed by [NETLAB](https://uwaterloo.ca/networks-lab/) at the [University of Waterloo](https://uwaterloo.ca/) to facilitate research at the intersection of social network analysis / network science and applied natural language processing. It scales efficiently to large and complex datasets. 8 | 9 | `nate` seamlessly connects natural language processing workflows built on state-of-the-art models from [`spacy`](https://github.com/explosion/spaCy) with network analysis workflows built on packages such as [`networkx`](https://networkx.github.io/), [`igraph`](https://igraph.org/python/) for Python, and [`graph-tool`](https://graph-tool.skewed.de/). Its carefully designed data structures bridge these two research workflows, and it offers a set of tools for quickly producing descriptive reports and visualizations.
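A minimal sketch of a typical workflow follows. The file name and column names below are placeholders, and the `cooc_pipeline()` call is shown without arguments for illustration only; see the importer and pipeline docstrings for the exact signatures.

```python
import pandas as pd
import nate

# Load any dataframe that has a text column and a timestamp column.
df = pd.read_csv("my_data.csv")  # placeholder path
nt = nate.import_dataframe(df, text="text", time="created_at")

# Build term co-occurrence offsets, then run Kleinberg burst detection.
cooc = nt.cooc_pipeline()
bursts = cooc.cooc_to_burst(s=2, gamma=1)

# Inspect the detected bursts as a pandas dataframe.
print(bursts.export_df().head())
```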
10 | 11 | # Installation 12 | 13 | ## GitHub 14 | 15 | If you want access to the most recent development version of `nate`, you can install it from the source code in this repository. 16 | 17 | `git clone https://github.com/UWNETLAB/nate.git && cd nate && pip install -e .` 18 | 19 | # Documentation 20 | 21 | * Binder-enabled documentation coming soon... 22 | 23 | # Asking Questions and Getting Help 24 | 25 | It is not always possible for us to provide help via email. Instead, we encourage you to use the Github Issue Tracker. By answering your questions (or fixing bugs you find) in public, we can also help other members of the research community. 26 | 27 | # Selected Features 28 | 29 | * Coming soon... 30 | -------------------------------------------------------------------------------- /nate/__init__.py: -------------------------------------------------------------------------------- 1 | from .importers.dataframe_importers import import_csv, import_dataframe, import_excel 2 | from .importers.raw_importers import import_files, import_text, import_dict_of_dicts -------------------------------------------------------------------------------- /nate/cooc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UWNETLAB/Nate/9029670d5804f478cac2e83d77ff86ff2a7266c2/nate/cooc/__init__.py -------------------------------------------------------------------------------- /nate/cooc/cooc_class.py: -------------------------------------------------------------------------------- 1 | """Definition of the `Cooc` pipeline, for co-occurence analysis. 2 | 3 | This module defines the `Cooc` pipeline, which contains a dictionary of 4 | the results from the co-occurence analysis which is conducted when 5 | this class is instantiated from the `Nate` class's `cooc_pipeline()` 6 | method. 7 | 8 | This class contains useful information in its own right, but primarily 9 | serves as an intermediary for the `Bursts` class, which can be 10 | instantiated using this class's `cooc_to_bursts()` method. 11 | """ 12 | from typing import Dict, Union, List 13 | from ..edgeburst.burst_mixin import BurstMixin 14 | from nate.edgeburst.burst_class import Bursts 15 | 16 | 17 | class Cooc(BurstMixin): 18 | """The main object in the `Cooc` pipeline. 19 | 20 | Attributes: 21 | offset_dict (Dict): A dictionary with term-term pairs as keys and a list 22 | of times when they occur as values. 23 | lookup (Dict): A dictionary with the integer representation of a term as 24 | key and the string representation as value. 25 | minimum_offsets (int): The minimum number of 'offsets' - or occurrences 26 | in the dataset - a given token/term pair must have had in order to 27 | be retained. 28 | from_svo (Bool): A flag to be passed to future steps in the pipeline 29 | marking whether the data descended from an SVO class. 30 | [Should be removed on future development.] 31 | """ 32 | 33 | def __init__(self, 34 | offset_dict: Dict, 35 | lookup: Dict, 36 | minimum_offsets: int = 20): 37 | self.offset_dict = offset_dict 38 | self.lookup = lookup 39 | self.minimum_offsets = minimum_offsets 40 | self.from_svo = False 41 | 42 | 43 | def __getitem__(self, index: Union[slice, int, tuple]): 44 | """Called when `Cooc` is accessed using indexing or slicing. 45 | 46 | Args: 47 | index (slice): A range of integers used to retrieve corresponding entries 48 | in the `offset_dict` attribute. 49 | 50 | Returns: 51 | List: A list of named tuples, each corresponding to one row in the dataset. 
52 | """ 53 | 54 | if isinstance(index, slice) or isinstance(index, int): 55 | return list(self.offset_dict.items())[index] 56 | else: 57 | return self.offset_dict[index] 58 | 59 | 60 | def cooc_to_burst(self, s=2, gamma=1): 61 | """Returns an instance of the `Bursts` class. 62 | 63 | Args: 64 | s (float, optional): s parameter for tuning Kleinberg algorithm. 65 | Higher values make it more difficult for bursts to move up the 66 | burst hierarchy. Defaults to 2. 67 | gamma (float, optional): gamma parameter for tuning Kleinberg 68 | algorithm. Higher values make it more difficult for activity to 69 | be considered a burst. Defaults to 1. 70 | 71 | Returns: 72 | Bursts: An instance of the `Bursts` class containing data from the 73 | instance of the `Cooc` class it was called from. 74 | """ 75 | offset_dict_strings, edge_burst_dict_strings, s, gamma, from_svo, lookup = self.burst_detection( 76 | s, gamma) 77 | 78 | return Bursts(offset_dict_strings, edge_burst_dict_strings, s, gamma, 79 | from_svo, lookup) 80 | -------------------------------------------------------------------------------- /nate/cooc/cooc_offsets.py: -------------------------------------------------------------------------------- 1 | """Builds the offset dictionary for the cooc pipeline. 2 | 3 | The dictionary is of the form {(word1, word2):[time1,time2,...], ...}. 4 | """ 5 | 6 | import pandas as pd 7 | from time import time as marktime 8 | from typing import List 9 | from ..utils.mp_helpers import mp 10 | from itertools import groupby, combinations, chain 11 | from collections import defaultdict 12 | 13 | 14 | def cooc_offsets(processed_list: List, time: List, minimum_offsets): 15 | """Generates the offset_dict for the `Cooc` pipeline. 16 | 17 | Args: 18 | processed_list (List): A list of lists, where each entry in the outer 19 | list represents a text, and the entries of each inner list are 20 | the tokens found in those texts in string form. 21 | time (List): A list of times for when each text was written. 22 | minimum_offsets (int): The minimum number of 'offsets' - or occurrences 23 | in the dataset - a given token/term pair must have in order to 24 | be retained. 25 | 26 | Returns: 27 | Dict: The offset dictionary for the `Cooc` class, with word-word pairs 28 | in integer format as keys and a list of offsets (occurence 29 | timestamps) as values. 30 | Dict: A lookup dictionary for each word in the corpus, with the integer 31 | representation as key and the string representation as value. 32 | """ 33 | print("Generating Offsets:") 34 | 35 | start = marktime() 36 | 37 | # send list of documents to text_to_int so that cooc function can work with integers for memory and processing efficiency 38 | word_ints, lookup = text_to_int(processed_list) 39 | 40 | # multiprocess the cooc function on the list of integers 41 | offset_dict = mp(word_ints, cooc, time) 42 | 43 | 44 | # recreate the dictionary of offsets, pruning all those with a less occurrences than the minimum_offsets threshold 45 | offsets = { 46 | k: v for k, v in offset_dict.items() if len(v) >= minimum_offsets 47 | } 48 | 49 | print("Finished offset generation in {} seconds".format( 50 | round(marktime() - start))) 51 | print("Commencing timestamp deduplication...") 52 | 53 | # kleinberg requires that timestamps be unique - increment simultaneous occurrences by 1 millisecond. 
54 | # Note: it's possible that some dataset will require this to be microseconds, if term pairs appear more than 999 times at once 55 | for item in offsets.keys(): 56 | offsets[item].sort() 57 | offsets[item] = [ 58 | g + i * 0.001 59 | for k, group in groupby(offsets[item]) 60 | for i, g in enumerate(group) 61 | ] 62 | 63 | print("finished timestamp deduplication in {} seconds".format( 64 | round(marktime() - start))) 65 | 66 | print("Finished Generating Offsets. Returning offset dictionary.") 67 | 68 | return offsets, lookup 69 | 70 | 71 | def text_to_int(processed_list): 72 | """Converts every word in a list of texts into an integer representation. 73 | 74 | After conversion to the integer representation, the tokens of the text are 75 | no longer in the same order. This function should only be used on texts 76 | where the distance between tokens in the source text is not relevant. It 77 | should only be used on texts where token co-occurence _in the same document_ 78 | is relevant. 79 | 80 | Args: 81 | processed_list (List): A list of texts, where each text is a 82 | list of tokens (strings). 83 | 84 | Returns: 85 | List: A list of texts, where each text is a list of tokens (integers). 86 | Dict: A lookup dict, to convert integer representations of tokens 87 | to strings. It is of the form {i:s} where i is the integer 88 | representation of the token, and s is the string representation. 89 | """ 90 | 91 | # sort string tokens in each text, keeping only unique words 92 | sorted_texts = [sorted(set(x)) for x in processed_list] 93 | 94 | # create a sorted list of all unique words in the corpus, used for the lookup dictionary 95 | flat_text = sorted(set(list(chain(*sorted_texts)))) 96 | 97 | del processed_list 98 | 99 | # create dataframe with 1 column ('word') of words in the corpus 100 | df = pd.DataFrame({'word': flat_text}) 101 | 102 | del flat_text 103 | 104 | # use the dataframe index as the identifier for each word, casting to a dictionary 105 | word_dict = df.reset_index().set_index('word')['index'].to_dict() 106 | 107 | # invert the dictionary, making word integers the keys, and words the values 108 | lookup_dict = {v: k for k, v in word_dict.items()} 109 | 110 | # create a list (documents) of lists (words in each document) integer representation of the corpus 111 | word_ints = [[word_dict[word] for word in text] for text in sorted_texts] 112 | 113 | del word_dict 114 | 115 | return word_ints, lookup_dict 116 | 117 | 118 | def cooc(time, word_ints): 119 | """Generates co-occurence pairs from documents and their timestamps. 120 | 121 | Args: 122 | time (List): A list of of the times each text in word_ints was written. 123 | word_ints (List): A list of lists, where each entry in the outer list 124 | represents a text, and the entries of each inner list are the 125 | integer representations of tokens found in those texts (as 126 | produced by text_to_int). 127 | 128 | Returns: 129 | Dict: A dictionary with token-token pairs as keys and a list of 130 | occurence timestamps as values. 
131 | """ 132 | 133 | # use defaultdict so that dictionary entries are created if they don't exist already 134 | offset_dict = defaultdict(list) 135 | 136 | # iterate through each document and its timestamp 137 | for text, timestamp in zip(word_ints, time): 138 | 139 | # use combinations to find all word-pairs in the current document 140 | keys = list(combinations(text, 2)) 141 | 142 | # add current timestamp to list of timestamps (dictionary value) for each word-pair (dictionary key) found in current document 143 | for key in keys: 144 | offset_dict[key].append(timestamp) 145 | 146 | return offset_dict 147 | -------------------------------------------------------------------------------- /nate/docnet/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UWNETLAB/Nate/9029670d5804f478cac2e83d77ff86ff2a7266c2/nate/docnet/__init__.py -------------------------------------------------------------------------------- /nate/docnet/docnet.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a MODULE docstring 3 | """ 4 | # Coming soon... 5 | -------------------------------------------------------------------------------- /nate/edgeburst/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UWNETLAB/Nate/9029670d5804f478cac2e83d77ff86ff2a7266c2/nate/edgeburst/__init__.py -------------------------------------------------------------------------------- /nate/edgeburst/burst_class.py: -------------------------------------------------------------------------------- 1 | """Definition of the `Bursts` class, for analysis of bursty term relations. 2 | 3 | While `BurstMixin` provides the actual burst detection functionality, 4 | this module provides export and plotting functions to facilitate further 5 | analysis. 6 | """ 7 | from nate.edgeburst import pybursts 8 | from ..utils.mp_helpers import mp 9 | from .visualize_bursts import plot_bursts 10 | from .export import df_export, max_bursts_export 11 | from nate.edgeburst import visualize_bursts 12 | from typing import Tuple, Dict, Callable, Union 13 | 14 | 15 | def get_bursts(s, gamma, offset_list): 16 | """Sends Kleinberg parameters and offset_list to pybursts.""" 17 | burst_list = pybursts.process(offset_list, s, gamma) 18 | 19 | return burst_list 20 | 21 | 22 | def detect_bursts(offsets, s=2, gamma=1): 23 | """Returns dictionary with bursting terms as keys and burst data as values. 24 | 25 | Args: 26 | offsets (Dict): A dictionary of offsets, with keys being edge 27 | objects and the values being lists of occurence times. 28 | s (float, optional): s parameter for tuning Kleinberg algorithm. 29 | Higher values make it more difficult for bursts to move up the 30 | burst hierarchy. Defaults to 2. 31 | gamma (float, optional): gamma parameter for tuning Kleinberg 32 | algorithm. Higher values make it more difficult for activity to 33 | be considered a burst. Defaults to 1. 34 | 35 | Returns: 36 | Dict: A dictionary of bursts, with keys being edge objects and values 37 | being lists of burst data. Each burst is in the format 38 | [intensity, start_time, end_time]. 39 | """ 40 | key_list = list(offsets.keys()) 41 | offset_list = list(offsets.values()) 42 | 43 | burst_list = mp(offset_list, get_bursts, s, gamma) 44 | 45 | edge_bursts = dict(zip(key_list, burst_list)) 46 | 47 | return edge_bursts 48 | 49 | 50 | class Bursts(): 51 | """The core burst detection class. 
52 | 53 | This class provides all burst analysis functionality, including export 54 | and plotting abilities. 55 | 56 | Attributes: 57 | offset_dict (Dict): A dictionary with edge objects in string format 58 | as keys and occurrence times as values. 59 | edge_burst_dict (Dict): A dictionary with edge objects in string format 60 | as keys and a list of bursts as values. The burst lists are in the 61 | format [intensity, start_time, end_time]. 62 | s (float, optional): s parameter for tuning Kleinberg algorithm. 63 | Higher values make it more difficult for bursts to move up the 64 | burst hierarchy. Changing this parameter after object instantiation 65 | does not change the object's data. 66 | gamma (float, optional): gamma parameter for tuning Kleinberg 67 | algorithm. Higher values make it more difficult for activity to 68 | be considered a burst. Changing this parameter after object 69 | instantiation does not change the object's data. 70 | from_svo (Bool): A flag that determines whether the pipeline should be 71 | configured for bursts of SVOs. 72 | lookup (Dict): A lookup dictionary for terms, with integer 73 | representations as keys and string representations as values. 74 | """ 75 | 76 | def __init__(self, offset_dict, edge_burst_dict, s, gamma, from_svo, 77 | lookup): 78 | self.offset_dict: dict = offset_dict 79 | self.edge_burst_dict: dict = edge_burst_dict 80 | self.s = s 81 | self.gamma = gamma 82 | self.from_svo = from_svo # flag that determines whether the pipeline should be configured for bursts of SVOs 83 | self.bdf = None 84 | self.odf = None 85 | self.lookup = lookup 86 | 87 | def __getitem__(self, index: Union[slice, int, tuple]): 88 | """Called when `Bursts` is accessed using indexing or slicing. 89 | 90 | Args: 91 | index (Union[slice, int, tuple]): An integer, slice, or edge tuple used 92 | to retrieve corresponding entries in the `edge_burst_dict` attribute. 93 | 94 | Returns: 95 | List: A list of (edge, burst list) pairs, or the burst list for a 96 | single edge if an edge tuple is passed. 97 | """ 98 | 99 | if isinstance(index, slice) or isinstance(index, int): 100 | return list(self.edge_burst_dict.items())[index] 101 | else: 102 | return self.edge_burst_dict[index] 103 | 104 | 105 | def export_df(self): 106 | """Exports burst data to a dataframe. 107 | 108 | Returns: 109 | pandas.Dataframe: A dataframe containing all bursts data. 110 | 111 | The returned dataframe has the following columns: 112 | - Column(s) representing the edge objects (terms), whose names 113 | depend on the object the `Bursts` object was formed from. 114 | - 'bursts': A dict mapping (interval_start, interval_end) pairs to intensities. 115 | - 'term_id' (int): The id of the edge object in the dataset. 116 | This will match the index. 117 | - 'interval_start' (datetime): The start of the burst. 118 | - 'interval_end' (datetime): The end of the burst. 119 | - 'intensity' (int): The intensity of the burst. 120 | """ 121 | return df_export(self.edge_burst_dict, self.offset_dict, self.from_svo) 122 | 123 | def export_max_bursts(self): 124 | """Returns a dict with edges as keys and all max bursts as values.""" 125 | return max_bursts_export(self.edge_burst_dict, self.from_svo) 126 | 127 | def to_pandas(self, key: Tuple, unit='s') -> Tuple[Dict, Dict]: 128 | """Exports bursts and offsets to separate dataframes for a given key. 129 | 130 | TODO: refactor the wrapped function (visualize_bursts.to_pandas) 131 | so that it is not SVO specific. Should not be much of an issue. 132 | 133 | Args: 134 | key (Tuple): The edge for which burst and offset data will 135 | be extracted.
136 | unit (str, optional): The unit to be passed to pd.to_datetime. 137 | Defaults to 's'. 138 | 139 | Returns: 140 | Tuple[pandas.Dataframe, pandas.Dataframe]: The first dataframe 141 | contains burst data. The second dataframe contains offset data. 142 | 143 | The first dataframe has the following columns: 144 | - 'level' (int): The level of the burst. 145 | - 'start' (datetime): The start time of the burst. 146 | - 'end' (datetime): The end time of the burst. 147 | - 'svo' (string): The edge for which the dataframe contains 148 | data. 149 | 150 | The second dataframe has the following columns: 151 | - 'Date' (int): The date of the occurence. 152 | - 'Year' (int): The year of occurence. 153 | - 'Month' (int): The month of the occurence. 154 | - 'Day' (int): The day of the occurence. 155 | - 'svo' (string): The edge for which the dataframe contains 156 | data. 157 | """ 158 | 159 | offsets = self.offset_dict[key] 160 | bursts = self.edge_burst_dict[key] 161 | 162 | return visualize_bursts.to_pandas(bursts, offsets, key, unit) 163 | 164 | def plot_bursts(self, 165 | key: Tuple, 166 | unit='s', 167 | lowest_level=0, 168 | title=True, 169 | daterange=None, 170 | xrangeoffsets=3): 171 | """Plots the occurences and bursts of the given key. 172 | 173 | TODO: Refactor wrapped function so that it is not SVO specific. 174 | 175 | Args: 176 | key (Tuple): The key whose burst data to plot. 177 | unit (str, optional): The unit to be passed to pd.to_datetime. 178 | Defaults to 's'. 179 | lowest_level (int, optional): If passed, includes bursts only if 180 | they are greater than the given lowest level. Defaults to 0. 181 | title (Bool, optional): If True, include the name of SVO as the 182 | title of the figure. Defaults to True. 183 | daterange (Tuple[str,str], optional): If passed, only bursts in the 184 | range daterange[0] to daterange[1] will be plotted. The dates 185 | must be passed as strings in the format 'year-month-day'. 186 | Defaults to None. 187 | xrangeoffsets (int, optional): The number of days to add before the 188 | minimum date and after the maximum date. Used to 'pad' the plot. 189 | Defaults to 3. 190 | """ 191 | bdf, odf = self.to_pandas(key, unit) 192 | 193 | visualize_bursts.plot_bursts(odf=odf, 194 | bdf=bdf, 195 | lowest_level=lowest_level, 196 | title=True, 197 | daterange=daterange, 198 | xrangeoffsets=xrangeoffsets, 199 | s=self.s, 200 | gamma=self.gamma) 201 | 202 | # def create_burst_plot(self, token_pairs, zoom_level = 0, output_path = False, plot_size_x = 20, plot_size_y = 10, plot_vertically = False, num_ticks = 10, rug_alpha = 0.35, dark = True): 203 | # """ 204 | # `token_pair` accepts either a tuple or a list of tuples corresponding to one of the token-token pairs in the edge_burst_dict dictionary. 205 | # If a list of valid token pairs is provided, one separate plot for each of the token pairs is produced. 206 | 207 | # `zoom_level` (default = 0) splits the burst structure for each provided token-token pair into a series of separate bursts hierarchies, omitting any levels 208 | # below the indicated zoom_level. 
A zoom level of 0 does not omit any of the bursts (including the baseline burst, which spans the entirety of the supplied data) 209 | # """ 210 | # if isinstance(token_pairs, tuple): 211 | # token_pairs = [token_pairs] 212 | 213 | # for entry in token_pairs: 214 | 215 | # plot_title = "'{}' + '{}' - Full Plot (s = {}, gamma = {})".format(entry[0], entry[1], self.s, self.gamma) 216 | 217 | # plot_bursts(self.offset_dict[entry], self.edge_burst_dict[entry], plot_title, output_path, plot_size_x, plot_size_y, plot_vertically, num_ticks, rug_alpha, dark) 218 | 219 | # if zoom_level > 0: # When the zoom level is 0, we can just pass everything directly into the plotting function. 220 | # offsets = self.offset_dict[entry] 221 | # bursts = self.edge_burst_dict[entry] 222 | 223 | # burst_stack = [] 224 | # temp_burst_stack = [] 225 | 226 | # for burst in bursts: 227 | # if burst[0] < zoom_level: 228 | # pass 229 | # elif burst[0] == zoom_level: 230 | # if len(temp_burst_stack) > 0: 231 | # burst_stack.append(temp_burst_stack) 232 | # temp_burst_stack = [] 233 | # temp_burst_stack.append(burst) 234 | # else: 235 | # temp_burst_stack.append(burst) 236 | 237 | # if len(temp_burst_stack) > 0: 238 | # burst_stack.append(temp_burst_stack) 239 | 240 | # offset_stack = [] 241 | 242 | # for burst in burst_stack: 243 | # low = burst[0][1] 244 | # high = burst[0][2] 245 | # temp_offset_stack = [] 246 | # for offset in offsets: 247 | # if low <= offset and offset <= high: 248 | # temp_offset_stack.append(offset) 249 | # offset_stack.append(temp_offset_stack) 250 | 251 | # assert len(burst_stack) == len(offset_stack) 252 | 253 | # for i in range(0, len(burst_stack)): 254 | # plot_title = ("'{}' + '{}' - Zoom Level {}, Slice {} of {} (s = {}, gamma = {})".format(entry[0], entry[1], zoom_level, i+1, len(burst_stack), self.s, self.gamma)) 255 | 256 | # plot_bursts(offset_stack[i], burst_stack[i], plot_title, output_path, plot_size_x, plot_size_y, plot_vertically, num_ticks, rug_alpha, dark) 257 | -------------------------------------------------------------------------------- /nate/edgeburst/burst_mixin.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a MODULE docstring 3 | """ 4 | from .export import all_bursts_export, offsets_export 5 | from .burst_class import Bursts, detect_bursts 6 | 7 | 8 | class BurstMixin(): 9 | 10 | def __init__(self): 11 | self.offset_dict: dict 12 | self.lookup: dict 13 | self.from_svo: bool 14 | 15 | def burst_detection(self, s: float = 2, gamma: float = 1): 16 | """Returns an object of the class `bursts`. 17 | 18 | This method is used to detect bursts for _all_ of the term-term pairs 19 | in the offset dictionary generated when this class (`edge_burst`) was 20 | instantiated. 21 | 22 | This method is best employed as an exploratory tool for identifying 23 | unusually bursty term pairs, or groups of term pairs with correlated 24 | burst patterns. 25 | 26 | Since calling this method on an entire dataset can consume significant 27 | amounts of time and memory, this method only allows for one value of 28 | s and one value of gamma. 29 | 30 | If you wish to detect bursts using a variety of different values for 31 | the s and gamma parameters, instead utilize the `multi_bursts` method 32 | contained in this class. 33 | 34 | Args: 35 | s (float, optional): s parameter for tuning Kleinberg algorithm. 36 | Higher values make it more difficult for bursts to move up the 37 | burst hierarchy. Defaults to 2. 
38 | gamma (float, optional): gamma parameter for tuning Kleinberg 39 | algorithm. Higher values make it more difficult for activity to 40 | be considered a burst. Defaults to 1. 41 | 42 | Returns: 43 | Dict: The object's `offset_dict`, with integer keys converted 44 | to strings. 45 | Dict: A dictionary with string representations of terms as keys 46 | and lists of burst data as values. 47 | float: The s value passed as a parameter. 48 | float: The gamma value passed as a parameter. 49 | Bool: A flag passed to other functions in the pipeline to configure 50 | it for SVO data. 51 | Dict: A dictionary that maps integer representations of terms 52 | (as keys) to string representations as values. 53 | """ 54 | 55 | # use offsets_export to return a dictionary with terms as keys in string format and list of time offsets as values 56 | offset_dict_strings = offsets_export(self.offset_dict, self.lookup, 57 | self.from_svo) 58 | 59 | # use detect_bursts to return a dictionary with terms as keys in integer format and list of nested burst data as values 60 | edge_burst_dict_int = detect_bursts(self.offset_dict, s, gamma) 61 | 62 | # same as above, but convert keys from integers to the string values they represent 63 | edge_burst_dict_strings = all_bursts_export(edge_burst_dict_int, 64 | self.lookup, self.from_svo) 65 | 66 | return offset_dict_strings, edge_burst_dict_strings, s, gamma, self.from_svo, self.lookup 67 | 68 | # def multi_burst(self, token_pairs, s, gamma): 69 | # """ 70 | # The lists passed to s and gamma must be exactly the same length. 71 | 72 | # Returns a dictionary where keys are strings containing two numbers separated by an underscore, corresponding to the s and gamma values for the run, respectively. 73 | # The values of each entry in the dictionary consists of {SOMETHING} 74 | # """ 75 | # assert len(s) == len(gamma) 76 | 77 | # run_dict = {} 78 | # offset_subset_dict = {} 79 | 80 | # for token_pair in token_pairs: 81 | # offset_subset_dict[token_pair] = self.offset_dict[token_pair] 82 | 83 | # for i in range(0,len(s)): 84 | # run_name = "{}_{}".format(str(s[i]), str(gamma[i])) 85 | # run_result = Bursts(self.offset_dict,self.lookup, s[i], gamma[i], self.from_svo, self.lookup) 86 | # run_dict[run_name] = run_result 87 | 88 | # return run_dict 89 | -------------------------------------------------------------------------------- /nate/edgeburst/export.py: -------------------------------------------------------------------------------- 1 | """Exports burst data to other data structures.""" 2 | import pandas as pd 3 | import numpy as np 4 | import os 5 | import itertools 6 | import pickle 7 | from itertools import groupby 8 | 9 | 10 | def df_export(bursts, offsets, from_svo=False): 11 | """Exports the burst data to a dataframe. 12 | 13 | TODO: remove offsets parameter, as it is not used to generate the dataframe 14 | (as far as I can tell). 15 | 16 | TODO: does the 'bursts' column need to be kept for every edge entry? 
17 | """ 18 | key_list = [] 19 | burst_list = [] 20 | offset_list = [] 21 | for k, v in bursts.items(): 22 | key_list.append(k) 23 | burst_list.append(v) 24 | offset_list.append(offsets[k]) 25 | 26 | if from_svo == True: 27 | df = pd.DataFrame() 28 | df['svo'] = key_list 29 | 30 | 31 | intensities = max_intensities(burst_list) 32 | 33 | else: 34 | 35 | df = pd.DataFrame.from_records(key_list, columns=['word1', 'word2']) 36 | 37 | intensities = max_intensities(burst_list) 38 | 39 | df['bursts'] = intensities 40 | 41 | full_df = flatten(df, intensities) 42 | return full_df 43 | 44 | 45 | def max_intensities(burst_list): 46 | """Removes all but the max intensity for each burst interval.""" 47 | max_bursts = [{(j, k): i for i, j, k in x} for x in burst_list] 48 | 49 | return max_bursts 50 | 51 | 52 | def flatten(df, intensities): 53 | """Flattens burst data into dataframe columns. 54 | 55 | Depends on the df being in the same order as the list of intensities. 56 | """ 57 | term_id_list = [] 58 | interval_start_list = [] 59 | interval_end_list = [] 60 | intensity_list = [] 61 | 62 | for i, term in enumerate(intensities): 63 | for interval, intensity in term.items(): 64 | term_id_list.append(i) 65 | interval_start_list.append(interval[0]) 66 | interval_end_list.append(interval[1]) 67 | intensity_list.append(intensity) 68 | 69 | temp_df = pd.DataFrame() 70 | temp_df['term_id'], temp_df['interval_start'], temp_df['interval_end'], temp_df['intensity'] =\ 71 | term_id_list, interval_start_list, interval_end_list, intensity_list 72 | 73 | return_df = pd.merge(df, temp_df, left_index=True, right_on='term_id') 74 | 75 | return_df = return_df.sort_values(by=['intensity'], ascending=False) 76 | 77 | return return_df 78 | 79 | 80 | def max_bursts_export(bursts, from_svo=False): 81 | """Returns a dict with term as key and maximum intensity burst as value. 82 | 83 | TODO: make this function export what it means to. As of now, it returns 84 | a dict with all bursts as values. 
85 | """ 86 | key_list = [] 87 | burst_list = [] 88 | 89 | for k, v in bursts.items(): 90 | key_list.append(k) 91 | burst_list.append(v) 92 | 93 | if from_svo: 94 | df = pd.DataFrame() 95 | df['svo'] = key_list 96 | 97 | intensities = max_intensities(burst_list) 98 | 99 | max_bursts = {df['svo'][x]: intensities[x] for x in df.index} 100 | else: 101 | 102 | df = pd.DataFrame.from_records(key_list, columns=['word1', 'word2']) 103 | 104 | intensities = max_intensities(burst_list) 105 | 106 | max_bursts = { 107 | (df['word1'][x], df['word2'][x]): intensities[x] for x in df.index 108 | } 109 | 110 | return max_bursts 111 | 112 | 113 | def all_bursts_export(bursts, lookup, from_svo=False): 114 | """Converts the keys of the `bursts` dictionary from ints to strings.""" 115 | key_list = [] 116 | burst_list = [] 117 | 118 | for k, v in bursts.items(): 119 | key_list.append(k) 120 | burst_list.append(v) 121 | 122 | if from_svo: 123 | df = pd.DataFrame() 124 | df['svo_#'] = key_list 125 | df['svo'] = df['svo_#'].map(lookup) 126 | 127 | all_bursts = {df['svo'][x]: burst_list[x] for x in df.index} 128 | else: 129 | df = pd.DataFrame.from_records(key_list, columns=['word1_#', 'word2_#']) 130 | df['word1'] = df['word1_#'].map(lookup) 131 | df['word2'] = df['word2_#'].map(lookup) 132 | 133 | all_bursts = { 134 | (df['word1'][x], df['word2'][x]): burst_list[x] for x in df.index 135 | } 136 | 137 | return all_bursts 138 | 139 | 140 | def offsets_export(offsets, lookup, from_svo=False): 141 | """Converts the keys of the `offsets` dictionary from ints to strings. 142 | 143 | TODO: This does exactly the same thing as all_bursts_export above: 144 | the differences between the two datastructures aren't relevant to 145 | replacing their keys with strings. 146 | """ 147 | key_list = [] 148 | offset_list = [] 149 | 150 | for k, _ in offsets.items(): 151 | key_list.append(k) 152 | offset_list.append(offsets[k]) 153 | 154 | if from_svo: 155 | df = pd.DataFrame() 156 | df['svo_#'] = key_list 157 | df['svo'] = df['svo_#'].map(lookup) 158 | 159 | offsets = {df['svo'][x]: offset_list[x] for x in df.index} 160 | 161 | else: 162 | df = pd.DataFrame.from_records(key_list, columns=['word1_#', 'word2_#']) 163 | df['word1'] = df['word1_#'].map(lookup) 164 | df['word2'] = df['word2_#'].map(lookup) 165 | 166 | offsets = { 167 | (df['word1'][x], df['word2'][x]): offset_list[x] for x in df.index 168 | } 169 | 170 | return offsets 171 | -------------------------------------------------------------------------------- /nate/edgeburst/pybursts.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a MODULE docstring. This module is adapted from the pybursts package, which is an implementation of Kleinberg's 3 | burst detection algorithm by Renzo Poddighe: https://pypi.org/project/pybursts/ 4 | Changes are primarily to increase performance by moving object creation outside of loops and using numba just-in-time 5 | compilation to perform mathematical calculations in C. 
6 | The process function was also added to perform burst detection on a list of documents 7 | """ 8 | import numpy as np 9 | import math 10 | import numba 11 | from numba import njit, jit 12 | 13 | 14 | def single(offsets, s=2, gamma=1): 15 | 16 | if s <= 1: 17 | raise ValueError("s must be greater than 1!") 18 | if gamma <= 0: 19 | raise ValueError("gamma must be positive!") 20 | if len(offsets) < 1: 21 | raise ValueError("offsets must be non-empty!") 22 | 23 | offsets = np.array(offsets, dtype=object) 24 | 25 | if offsets.size == 1: 26 | bursts = np.array([0, offsets[0], offsets[0]], ndmin=2, dtype=object) 27 | return bursts 28 | 29 | offsets = np.sort(offsets) 30 | gaps = np.diff(offsets) 31 | 32 | if not np.all(gaps): 33 | raise ValueError("Input cannot contain events with zero time between!") 34 | 35 | T = np.sum(gaps) 36 | n = np.size(gaps) 37 | g_hat = T / n 38 | 39 | k = int( 40 | math.ceil(float(1 + math.log(T, s) + math.log(1 / np.amin(gaps), s)))) 41 | 42 | gamma_log_n = gamma * math.log(n) 43 | 44 | alpha_function = np.vectorize(lambda x: s**x / g_hat) 45 | alpha = alpha_function(np.arange(k)) 46 | 47 | C = np.repeat(float("inf"), k) 48 | 49 | C[0] = 0 50 | 51 | q = np.empty((k, 0)) 52 | for t in range(n): 53 | C_prime = np.repeat(float("inf"), k) 54 | q_prime = np.empty((k, t + 1)) 55 | q_prime.fill(np.nan) 56 | k_range = np.arange(0, k) 57 | C_temp = C[k_range] 58 | gaps_t = gaps[t] 59 | for j in range(k): 60 | tau_arr = tau(k_range, j, gamma_log_n) 61 | cost = np.add(C_temp, tau_arr) 62 | el = min_cost(cost) 63 | alpha_temp = alpha[j] 64 | f_j_t = f(alpha_temp, gaps_t) 65 | 66 | if f_j_t > 0: 67 | C_prime[j] = cost[el] - math.log(f_j_t) 68 | 69 | if t > 0: 70 | q_prime[j, :t] = q[el, :] 71 | 72 | q_prime[j, t] = j + 1 73 | 74 | C = C_prime 75 | q = q_prime 76 | 77 | j = np.argmin(C) 78 | q = q[j, :] 79 | 80 | prev_q = 0 81 | 82 | N = int(0) 83 | for t in range(n): 84 | if q[t] > prev_q: 85 | N = N + q[t] - prev_q 86 | prev_q = q[t] 87 | 88 | bursts = np.array([ 89 | np.repeat(np.newaxis, N), 90 | np.repeat(offsets[0], N), 91 | np.repeat(offsets[0], N) 92 | ], 93 | ndmin=2, 94 | dtype=object).transpose() 95 | 96 | burst_counter = -1 97 | prev_q = 0 98 | stack = np.repeat(np.newaxis, N) 99 | stack_counter = -1 100 | for t in range(n): 101 | if q[t] > prev_q: 102 | num_levels_opened = q[t] - prev_q 103 | for i in range(int(num_levels_opened)): 104 | burst_counter += 1 105 | bursts[burst_counter, 0] = int(prev_q + i) 106 | bursts[burst_counter, 1] = offsets[t] 107 | stack_counter += 1 108 | stack[stack_counter] = burst_counter 109 | elif q[t] < prev_q: 110 | num_levels_closed = prev_q - q[t] 111 | for i in range(int(num_levels_closed)): 112 | bursts[stack[stack_counter], 2] = offsets[t] 113 | stack_counter -= 1 114 | prev_q = q[t] 115 | 116 | while stack_counter >= 0: 117 | bursts[stack[stack_counter], 2] = offsets[n] 118 | stack_counter -= 1 119 | 120 | burst_lists = [] 121 | 122 | for burst in bursts: 123 | burst_lists.append(burst.tolist()) 124 | 125 | return bursts 126 | 127 | 128 | @njit 129 | def f(alpha, x): 130 | 131 | return alpha * math.exp(-alpha * x) 132 | 133 | 134 | @njit 135 | def min_cost(cost): 136 | 137 | return np.argmin(cost) 138 | 139 | 140 | @njit(cache=False) 141 | def tau(i, j, gamma_log_n): 142 | 143 | return np.where(i >= j, 0, ((j - i) * gamma_log_n)) 144 | 145 | 146 | @jit(forceobj=True) 147 | def process(offset_list, s, gamma): 148 | 149 | bursts = [single(x, s, gamma) for x in offset_list] 150 | return bursts 151 | 
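A small usage sketch for the `process` function above (an editor-added illustration, not part of the original module): the offset lists here are invented, and each returned row follows this module's [level, start, end] burst format.

# Illustrative example: run Kleinberg burst detection on two offset lists.
# Offsets within each list must not contain duplicate timestamps, since
# `single` raises a ValueError when two events share the same time.
if __name__ == "__main__":
    example_offsets = [
        [1.0, 2.0, 2.5, 2.8, 3.0, 9.0, 20.0],
        [4.0, 5.0, 5.5, 6.0, 30.0],
    ]
    for term_bursts in process(example_offsets, s=2, gamma=1):
        # Each row: [burst level, burst start offset, burst end offset]
        print(term_bursts)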
-------------------------------------------------------------------------------- /nate/edgeburst/visualize_bursts.py: -------------------------------------------------------------------------------- 1 | """Visualizes burst data.""" 2 | 3 | import pandas as pd 4 | import matplotlib as mpl 5 | import matplotlib.pyplot as plt 6 | import matplotlib.dates as mdates 7 | 8 | 9 | def to_pandas(ebursts, offsets, svo, unit='s'): 10 | """Exports burst and offset data to dataframes for a single term. 11 | 12 | ebursts is an edgebust dict from the SVO object 13 | offsets is an offsets dict from the SVO object 14 | """ 15 | svos = " | ".join(svo) 16 | 17 | bdf = pd.DataFrame(ebursts) 18 | bdf[1] = pd.to_datetime(bdf[1], unit=unit) 19 | bdf[2] = pd.to_datetime(bdf[2], unit=unit) 20 | bdf.columns = ['level', 'start', 'end'] 21 | bdf['svo'] = svos 22 | 23 | odf = pd.DataFrame() 24 | i = pd.to_datetime(offsets, unit='s') 25 | odf['Date'], odf['Year'], odf['Month'], odf[ 26 | 'Day'] = i.date, i.year, i.month, i.day 27 | odf = odf.set_index(i) 28 | odf['svo'] = svos 29 | 30 | return bdf, odf 31 | 32 | 33 | 34 | def plot_bursts(odf, 35 | bdf, 36 | lowest_level=0, 37 | title=True, 38 | daterange=None, 39 | xrangeoffsets=3, 40 | s=None, 41 | gamma=None): 42 | """Plots burst and offset data. 43 | 44 | odf = an offsets dataframe 45 | bdf = an edgeburst dataframe 46 | lowest_level = subset the burst dataframe with bursts greater than or equal to the specified level 47 | daterange = a tuple with two elements: a start date and end date as *strings*. format is 'year-month-day' 48 | xrangeoffsets = the number of days to add before and after the min and max x dates 49 | """ 50 | 51 | svo_title = str(set(bdf['svo']).pop()) 52 | 53 | fig, (axa, axb) = plt.subplots(2, sharey=False, sharex=True) 54 | fig.set_figwidth(10) 55 | fig.set_figheight(6) 56 | 57 | formatter = mdates.DateFormatter("%b %d\n%Y") 58 | axb.xaxis.set_major_formatter(formatter) 59 | 60 | # offsets plot 61 | day_freq = odf.resample('D').size() 62 | axa.plot(day_freq, color='#32363A') 63 | axa.xaxis.set_major_formatter(formatter) 64 | axa.xaxis_date() 65 | axa.tick_params(axis='both', which='both', length=0) 66 | axa.set_ylabel('Daily offsets') 67 | if daterange: 68 | axa.set_xlim(pd.Timestamp(daterange[0]), pd.Timestamp(daterange[1])) 69 | 70 | # bursts plot 71 | 72 | days = [day_freq.index[0]] 73 | levels = [0] 74 | 75 | for i in range(1, len(day_freq.index)): 76 | 77 | period_start = odf.resample('D').size().index[i - 1] 78 | period_end = odf.resample('D').size().index[i] 79 | 80 | max_burst = set() 81 | 82 | days.append(period_end) 83 | 84 | for j in range(len(bdf)): 85 | 86 | burst_start = bdf['start'][j] 87 | burst_end = bdf['end'][j] 88 | level = bdf['level'][j] 89 | 90 | if burst_end < period_start or period_end < burst_start : 91 | pass 92 | else: 93 | max_burst.add(level) 94 | 95 | levels.append(max(max_burst)) 96 | 97 | finaldf = pd.DataFrame({"start": days, "level": levels}) 98 | 99 | if lowest_level > 0: 100 | bdf = bdf[bdf['level'] >= lowest_level] 101 | xmin = min(bdf['start']) 102 | xmax = max(bdf['start']) 103 | 104 | if xmin == xmax: 105 | raise Exception("There must be at least two bursts at or above the specified level. 
Try reducing the `lowest_level` parameter.") 106 | 107 | daterange = ((xmin + pd.DateOffset(days=2)).date(), (xmax + pd.DateOffset(days=2)).date()) 108 | 109 | # bursts plot 110 | axb.bar(finaldf['start'], finaldf['level'], color='#32363A', width=1) 111 | 112 | if s != None and gamma != None: 113 | axb.set_ylabel(r'Burst levels (s = {}, $\gamma$ = {})'.format(s, gamma)) 114 | else: 115 | axb.set_ylabel('Burst level') 116 | 117 | axb.tick_params(axis='both', which='both', length=0) 118 | 119 | if daterange: 120 | axb.set_xlim(pd.Timestamp(daterange[0]), pd.Timestamp(daterange[1])) 121 | 122 | fig.tight_layout(rect=[0, 0.03, 1, 0.95]) 123 | 124 | if title is True: 125 | fig.suptitle(f'{svo_title}', fontsize=12, ha='center') 126 | -------------------------------------------------------------------------------- /nate/importers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UWNETLAB/Nate/9029670d5804f478cac2e83d77ff86ff2a7266c2/nate/importers/__init__.py -------------------------------------------------------------------------------- /nate/importers/dataframe_importers.py: -------------------------------------------------------------------------------- 1 | """`Nate` importers involving pandas. 2 | 3 | This module provides common importers for the `Nate` class. They use existing 4 | pandas import functionality as an interface to `Nate`. These importers are the 5 | reccomended way to import data into `Nate`, unless the user needs to import data 6 | in ways not covered by this module's functionality. 7 | """ 8 | 9 | import pandas 10 | from typing import List, Union 11 | from .named_tuple_generator import tupleize 12 | from .nate_class import Nate 13 | from .timestamp_process import convert_times 14 | 15 | 16 | def process_dataframe(temp_data, 17 | text: str, 18 | unique_id: str = None, 19 | time: str = None, 20 | twitter_times: bool = False, 21 | columns_to_keep: List = []): 22 | """Builds a nate object from a dataframe.""" 23 | series_dict = {} 24 | special_column_list = [(text, "text"), (unique_id, "unique_id"), 25 | (time, "times")] 26 | 27 | for special_column, special_column_name in special_column_list: 28 | if special_column != None: 29 | temp_column = temp_data[special_column] 30 | temp_column.name = special_column_name 31 | series_dict[special_column_name] = temp_column.tolist() 32 | 33 | for covariate_column in columns_to_keep: 34 | temp_column = temp_data[covariate_column] 35 | temp_column.name = covariate_column 36 | series_dict[covariate_column] = temp_column.tolist() 37 | 38 | if time != None: 39 | try: 40 | series_dict['time'] = convert_times(series_dict['times']) 41 | del series_dict['times'] 42 | except: 43 | series_dict['time'] = series_dict['times'] 44 | del series_dict['times'] 45 | 46 | return Nate(tupleize(series_dict)) 47 | 48 | 49 | def import_dataframe(input_dataframe: pandas.DataFrame, 50 | text: str, 51 | unique_id: str = None, 52 | time: str = None, 53 | twitter_times: bool = False, 54 | columns_to_keep: List = []): 55 | """Imports a pandas dataframe into nate. 56 | 57 | Args: 58 | input_dataframe (pandas.DataFrame): The dataframe to be loaded. 59 | text (str): The name of the column containing the text data to be 60 | analyzed with nate. Required for all uses of nate. 61 | unique_id (str, optional): The name of the column containing unique 62 | identifiers (e.g. a unique name or hash ID#). Required 63 | for some uses of nate (e.g. Divsim). 
64 | time (str, optional): The name of the column containing the time the 65 | observation was recorded. Required for some uses of 66 | nate (e.g. edge_burst). 67 | columns_to_keep (list, optional): A list of column names indicating 68 | which columns not specified elsewhere (e.g. for the 69 | time parameter) are kept. 70 | 71 | Returns: 72 | Nate: an instance of the `Nate` class containing all data from the 73 | columns specified in the parameters. 74 | 75 | The columns indicated in the text, unique_id, and time parameters will 76 | be renamed to 'text', 'unique_id', and 'time', accordingly. The names 77 | of the columns listed in 'columns_to_keep' will be preserved as-is. 78 | """ 79 | 80 | if time!= None and twitter_times == False: 81 | input_dataframe = input_dataframe.astype({time: 'str'}) 82 | input_dataframe[time] = pandas.to_datetime(input_dataframe[time], infer_datetime_format=True) 83 | return process_dataframe(input_dataframe, text, unique_id, time, twitter_times, 84 | columns_to_keep) 85 | 86 | 87 | def import_csv(file_paths: Union[List, str], 88 | text: str, 89 | unique_id: str = None, 90 | time: str = None, 91 | twitter_times: bool = False, 92 | columns_to_keep: List = [], 93 | observation_threshold=0): 94 | """Imports a comma-separated values file (.csv) into `nate`. 95 | 96 | This function uses pre-existing pandas functionality to read in a 97 | comma-separated value file (.csv) into `nate`. 98 | 99 | Args: 100 | file_path (str or path-like): The location of the file to 101 | be loaded from disk. 102 | text (str): The name of the column containing the text 103 | data to be analyzed with nate. Required for all uses of nate. 104 | unique_id (str, optional): The name of the column containing unique 105 | identifiers (e.g. a unique name or hash ID#). Required for 106 | some uses of nate (e.g. Divsim). 107 | time (str, optional): The name of the column containing the time the 108 | observation was recorded. Required for some uses of nate 109 | (e.g. edgeburst). 110 | columns_to_keep (list, optional): A list of column names indicating 111 | which columns not specified elsewhere (e.g. for the time 112 | parameter) are kept. 113 | observation_threshold (int, optional): An integer indicating how many 114 | observations to include in the imported data, at minimum. 115 | Once the number of rows in the imported dataset exceeds this value, 116 | the importer will not import the next file in the list of 117 | file paths passed to `file_path`. Has no effect if a string 118 | or path-like object is passed to `file_paths`. 119 | 120 | Returns: 121 | Nate: an instance of the `Nate` class containing all data from the 122 | columns specified in the parameters. 123 | 124 | The columns indicated in the text, unique_id, and time parameters will 125 | be renamed to 'text', 'unique_id', and 'time', accordingly. The names of the 126 | columns listed in 'columns_to_keep' will be preserved as-is. 127 | 128 | Note that this function is only equipped to handle pre-processed .csv 129 | files that are ready to be loaded into a pandas dataframe with no 130 | additional manipulation. If the data requires any kind of special 131 | treatment, prudent users will first load their data using pandas 132 | directly into python, and then use the 'import_dataframe' function 133 | to load their data into nate. 
134 | """ 135 | columns_to_import = [*columns_to_keep] 136 | 137 | for special_column in [text, unique_id, time]: 138 | if special_column != None: 139 | columns_to_import.append(special_column) 140 | 141 | dtypes = {} 142 | 143 | if time!= None: 144 | dtypes[time] = "str" 145 | 146 | if isinstance(file_paths, list): 147 | df_list = [] 148 | total_len = 0 149 | for entry in file_paths: 150 | temp_df = pandas.read_csv(entry, usecols=columns_to_import, dtype = dtypes) 151 | df_list.append(temp_df) 152 | 153 | if observation_threshold != 0: 154 | total_len += len(temp_df) 155 | if total_len >= observation_threshold: 156 | break 157 | 158 | temp_data = pandas.concat(df_list) 159 | 160 | elif isinstance(file_paths, str): 161 | 162 | temp_data = pandas.read_csv(file_paths, usecols=columns_to_import, dtype = dtypes) 163 | 164 | else: 165 | raise TypeError("file_paths must be either string or list of strings") 166 | 167 | if time!= None and twitter_times == False: 168 | temp_data = temp_data.astype({time: 'str'}) 169 | temp_data[time] = pandas.to_datetime(temp_data[time], infer_datetime_format=True) 170 | 171 | return process_dataframe(temp_data, text, unique_id, time, twitter_times, columns_to_keep) 172 | 173 | 174 | def import_excel(file_paths: Union[List, str], 175 | text: str, 176 | unique_id: str = None, 177 | time: str = None, 178 | twitter_times: bool = False, 179 | columns_to_keep: List = [], 180 | observation_threshold=0): 181 | """Imports an excel file (.xlsx) into nate. 182 | 183 | This function uses pre-existing pandas functionality to read in an excel 184 | file (.xlsx) into nate. 185 | 186 | Args: 187 | file_path (str or path-like): The location of the file to be 188 | loaded from disk. 189 | text (str): The name of the column containing the text data 190 | to be analyzed with nate. Required for all uses of nate. 191 | unique_id (str, optional): The name of the column containing unique 192 | identifiers (e.g. a unique name or hash ID#). Required for 193 | some uses of Nate. 194 | time (str, optional): The name of the column containing the time the 195 | observation was recorded. Required for some uses of nate 196 | (e.g. edge_burst). 197 | columns_to_keep (list, optional): A list of column names indicating 198 | which columns not specified elsewhere (e.g. for the time 199 | parameter) are kept. 200 | observation_threshold (int, optional): An integer indicating how many 201 | observations to include in the imported data, at minimum. Once 202 | the number of rows in the imported dataset exceeds this value, 203 | the importer will not import the next file in the list of file 204 | paths passed to `file_path`. Has no effect if a string or 205 | path-like object is passed to `file_paths`. 206 | 207 | Returns: 208 | A `Nate` object containing all data from the columns specified in 209 | the parameters. 210 | 211 | The columns indicated in the text, unique_id, and time parameters will be 212 | renamed to 'text', 'unique_id', and 'time', accordingly. The names of the 213 | columns listed in 'columns_to_keep' will be preserved as-is. 214 | 215 | Note that this function is only equipped to handle pre-processed .xlsx 216 | files that are ready to be loaded into a pandas dataframe with no 217 | additional manipulation. If the data requires any kind of special 218 | treatment, prudent users will first load their data using pandas 219 | directly into python, and then use the 'import_dataframe' function to 220 | load their data into nate. 
221 | """ 222 | columns_to_import = [*columns_to_keep] 223 | 224 | for special_column in [text, unique_id, time]: 225 | if special_column != None: 226 | columns_to_import.append(special_column) 227 | 228 | dtypes = {} 229 | 230 | if time!= None: 231 | dtypes[time] = "str" 232 | 233 | print(columns_to_import) 234 | print(columns_to_keep) 235 | 236 | if isinstance(file_paths, list): 237 | df_list = [] 238 | total_len = 0 239 | for entry in file_paths: 240 | temp_df = pandas.read_excel(entry, usecols=columns_to_import, dtype = dtypes) 241 | df_list.append(temp_df) 242 | 243 | if observation_threshold != 0: 244 | total_len += len(temp_df) 245 | if total_len >= observation_threshold: 246 | break 247 | 248 | temp_data = pandas.concat(df_list) 249 | 250 | elif isinstance(file_paths, str): 251 | 252 | temp_data = pandas.read_excel(file_paths, usecols=columns_to_import, dtype = dtypes) 253 | 254 | else: 255 | raise TypeError("file_paths must be either string or list of strings") 256 | 257 | if time!= None and twitter_times == False: 258 | temp_data = temp_data.astype({time: 'str'}) 259 | temp_data[time] = pandas.to_datetime(temp_data[time], infer_datetime_format=True) 260 | 261 | return process_dataframe(temp_data, text, unique_id, time, twitter_times, columns_to_keep) 262 | -------------------------------------------------------------------------------- /nate/importers/edgelist_importers.py: -------------------------------------------------------------------------------- 1 | """`Nate` importers for edgelists.""" 2 | 3 | import pandas 4 | from .named_tuple_generator import tupleize 5 | 6 | 7 | def process_edgelist(temp_data, From, To, Weight=None): 8 | """Turns an edgelist in a dataframe into a list of NamedTuples.""" 9 | 10 | series_dict = {} 11 | 12 | special_column_list = [(From, "From"), (To, "To"), (Weight, "Weight")] 13 | 14 | for special_column, special_column_name in special_column_list: 15 | if special_column != None: 16 | temp_column = temp_data[special_column] 17 | temp_column.name = special_column_name 18 | series_dict[special_column_name] = temp_column.tolist() 19 | 20 | return tupleize(series_dict, "edge") 21 | 22 | 23 | class EdgelistMixin(): 24 | """Provides edgelist functionality to objects in nate.""" 25 | 26 | def add_edges_from_csv(self, file_path, From, To, Weight=None): 27 | """Imports an edgelist from a .csv file into `nate`. 28 | 29 | This function sets the self.edgelist attribute to a list of 30 | NamedTuples, with each tuple representing one edge. 31 | 32 | Args: 33 | file_path (str): The location of the file to be loaded from disk. 34 | From (str): The name of the column containing the origin of 35 | the edge. 36 | To (str): The name of the column containing the destination of 37 | the edge. 38 | Weight (str, optional): The column containing the edge's weight. 39 | 40 | Note that the capitalized arguments are a result of 'from' being a 41 | reserved keyword in Python. 42 | """ 43 | 44 | col_list = [From, To] 45 | 46 | if Weight != None: 47 | col_list.append(Weight) 48 | 49 | temp_data = pandas.read_csv(file_path, usecols=col_list) 50 | 51 | self.edgelist = process_edgelist(temp_data, 52 | From=From, 53 | To=To, 54 | Weight=Weight) 55 | 56 | def add_edges_from_dataframe(self, dataframe, From, To, Weight=None): 57 | """Imports an edgelist from a dataframe into `nate`. 58 | 59 | This function sets the self.edgelist attribute to a list of 60 | NamedTuples, with each tuple representing one edge. 
61 | 62 | Args: 63 | dataframe (pandas.Dataframe): The dataframe from which to extract 64 | the edgelist. 65 | From (str): The name of the column containing the origin of 66 | the edge. 67 | To (str): The name of the column containing the destination of 68 | the edge. 69 | Weight (str, optional): The column containing the edge's weight. 70 | 71 | Note that the capitalized arguments are a result of 'from' being a 72 | reserved keyword in Python. 73 | """ 74 | self.edgelist = process_edgelist(dataframe, 75 | From=From, 76 | To=To, 77 | Weight=Weight) 78 | -------------------------------------------------------------------------------- /nate/importers/named_tuple_generator.py: -------------------------------------------------------------------------------- 1 | """Implements extra NamedTuple functionality.""" 2 | 3 | from collections import namedtuple 4 | from typing import List, NamedTuple 5 | 6 | 7 | def define_named_tuple(observation_name, attribute_names: List[str]): 8 | """Creates a new subclass of NamedTuple.""" 9 | output_tuple = namedtuple(observation_name, attribute_names) 10 | 11 | return output_tuple 12 | 13 | 14 | def create_observation_list(observation_name: str, 15 | **kwargs) -> List[NamedTuple]: 16 | """Creates an observation list of NamedTuples. 17 | 18 | This function builds a new NamedTuple type from the lists passed as 19 | kwargs, with each field given the name of the keyword it was passed with. 20 | 21 | This function requires that all lists passed as kwargs are the same length. 22 | 23 | Args: 24 | observation_name (str): The name given to the new NamedTuple type. 25 | **kwargs: Lists containing data for each observation. The keyword 26 | passed with each list will become the name of that field in the 27 | resulting NamedTuple type. 28 | 29 | Returns: 30 | List[NamedTuple]: A list of NamedTuples, with each tuple corresponding 31 | to one observation. 32 | 33 | Raises: 34 | Exception: If the lists passed as kwargs are not the same length. 
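    Example (illustrative):

        >>> create_observation_list("obs", text=["a", "b"], time=[10, 20])
        [obs(text='a', time=10), obs(text='b', time=20)]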
35 | """ 36 | custom_named_tuple = define_named_tuple(observation_name, 37 | list(kwargs.keys())) 38 | 39 | #Length check: all of the lists fed in MUST be of the same length 40 | 41 | arg_lengths = [len(arg) for arg in kwargs.values()] 42 | arg_length = set(arg_lengths) 43 | 44 | if len(arg_length) != 1: 45 | raise Exception("Not all of the input data is the same length.") 46 | 47 | observation_list = [] 48 | 49 | for i in range(0, arg_length.pop()): 50 | 51 | variables = [] 52 | 53 | for arg in kwargs: 54 | variables.append(kwargs[arg][i]) 55 | 56 | observation_list.append(custom_named_tuple(*variables)) 57 | 58 | return observation_list 59 | 60 | 61 | def tupleize(series_dict, tuple_name="obs"): 62 | """Creates an observation list of NamedTuples.""" 63 | kwarg_dict = {} 64 | 65 | keys = [i for i in series_dict.keys()] 66 | 67 | for i in range(0, len(keys)): 68 | kwarg_dict[keys[i]] = list(series_dict[keys[i]]) 69 | 70 | return create_observation_list(tuple_name, **kwarg_dict) 71 | -------------------------------------------------------------------------------- /nate/importers/raw_importers.py: -------------------------------------------------------------------------------- 1 | """Import text, and only text, directly into `Nate`.""" 2 | 3 | from typing import List, Union 4 | from .named_tuple_generator import define_named_tuple 5 | from .nate_class import Nate 6 | from .timestamp_process import convert_time 7 | 8 | text_only_namedtuple = define_named_tuple('obs', ['text']) 9 | 10 | 11 | def import_text(strings): 12 | """Directly imports a string (or a list of strings) into `nate`. 13 | 14 | Args: 15 | strings (Union(str, List[str])): A string or a list of strings. 16 | 17 | Returns: 18 | Nate: An instance of the `Nate` class. 19 | """ 20 | if isinstance(strings, str): 21 | strings = [strings] 22 | 23 | return Nate([text_only_namedtuple(string) for string in strings]) 24 | 25 | 26 | def import_files(files): 27 | """Directly imports a text file (or list of text files) into `nate`. 28 | 29 | Args: 30 | files (Union(str, List[str])): A filename or list of filenames to be 31 | loaded from disk. 32 | 33 | Returns: 34 | Nate: A `Nate` object containing only the text data given. 35 | """ 36 | if isinstance(files, str): 37 | files = [files] 38 | 39 | obs_list = [] 40 | 41 | for filepath in files: 42 | with open(filepath, 'r', encoding='utf-8') as stream: 43 | obs_list.append( 44 | text_only_namedtuple(stream.read().replace('\n', ' '))) 45 | 46 | return Nate(obs_list) 47 | 48 | 49 | def import_dict_of_dicts(dictionary, text, time=None, values_to_keep=[]): 50 | """Imports a dict of dicts into `nate`. 51 | 52 | Args: 53 | dictionary (Dict): A dict of dicts, with the keys of the outer dict 54 | corresponding to unique observation ids. 55 | text (str): The name of the text entry in each inner dict. 56 | time (str, optional): The name of the time entry in each inner dict. 57 | values_to_keep (List[str], optional): A list of keys which appear in 58 | all inner dicts. The values will be kept in the resulting `Nate` 59 | object. 60 | 61 | Returns: 62 | Nate: An instance of the `Nate` class. 
63 | """ 64 | 65 | lookup_list = [text] 66 | named_list = ['unique_id', 'text'] 67 | 68 | if time != None: 69 | lookup_list.append(time) 70 | named_list.append('time') 71 | 72 | lookup_list.extend(values_to_keep) 73 | named_list.extend(values_to_keep) 74 | 75 | dict_namedtuple = define_named_tuple('obs', named_list) 76 | 77 | obs_list = [] 78 | 79 | for key, subdict in dictionary.items(): 80 | filtered_values = [] 81 | for value in lookup_list: 82 | 83 | value_to_append = subdict[value] 84 | 85 | if value == 'time': 86 | value_to_append = convert_time(value_to_append) 87 | 88 | filtered_values.append(value_to_append) 89 | 90 | obs_list.append(dict_namedtuple(key, *filtered_values)) 91 | 92 | return Nate(obs_list) 93 | 94 | 95 | def import_lists(text: List, time: List = None, unique_id: List = None): 96 | """Imports a number of list into `nate`. 97 | 98 | [Note: it might be a good idea to add a **kwargs parameter so that 99 | users can pass arbitrary other lists, similar to values_to_keep above.] 100 | 101 | Args: 102 | text (List): A list of strings. 103 | time (List, optional): A list containing the times each observation 104 | was recorded. 105 | unique_id (List, optional): The list containing unique 106 | identifiers (e.g. a unique name or hash ID#). 107 | 108 | Returns: 109 | Nate: An instance of the `Nate` class. 110 | """ 111 | pass 112 | -------------------------------------------------------------------------------- /nate/importers/timestamp_process.py: -------------------------------------------------------------------------------- 1 | """ 2 | Process and reformat times for consistency across Nate. 3 | """ 4 | from datetime import datetime, timezone 5 | 6 | 7 | def convert_times(times, timezone=timezone.utc): 8 | """Convert all times to POSIX timestamps.""" 9 | timestamps = [] 10 | 11 | for time in times: 12 | dt = datetime.strptime(time, "%m/%d/%Y %H:%M") 13 | timestamps.append(int(dt.replace(tzinfo=timezone).timestamp())) 14 | 15 | return timestamps 16 | 17 | 18 | def convert_time(time, timezone=timezone.utc): 19 | """Convert a single time to POSIX timestamp.""" 20 | dt = datetime.strptime(time, "%m/%d/%Y %H:%M") 21 | return int(dt.replace(tzinfo=timezone).timestamp()) 22 | -------------------------------------------------------------------------------- /nate/netplus/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UWNETLAB/Nate/9029670d5804f478cac2e83d77ff86ff2a7266c2/nate/netplus/__init__.py -------------------------------------------------------------------------------- /nate/netplus/netplus.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a MODULE docstring 3 | """ 4 | 5 | # Coming soon... 6 | -------------------------------------------------------------------------------- /nate/semnet/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UWNETLAB/Nate/9029670d5804f478cac2e83d77ff86ff2a7266c2/nate/semnet/__init__.py -------------------------------------------------------------------------------- /nate/semnet/semnet.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a MODULE docstring 3 | """ 4 | 5 | # Coming soon... 
6 | -------------------------------------------------------------------------------- /nate/socnet/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UWNETLAB/Nate/9029670d5804f478cac2e83d77ff86ff2a7266c2/nate/socnet/__init__.py -------------------------------------------------------------------------------- /nate/socnet/alters.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a MODULE docstring 3 | """ 4 | 5 | import pandas as pd 6 | import networkx as nx 7 | from collections import namedtuple 8 | 9 | alter_tuple = namedtuple('alters', ['vertex', 'betweenness', 'closeness', 'eigenvector']) 10 | 11 | 12 | def find_alters(edgelist) -> dict: 13 | G = nx.Graph() 14 | 15 | G = nx.from_pandas_edgelist(pd.DataFrame(edgelist, columns = ['From', 'To']), source='From', target='To') 16 | 17 | authorlist = [entry.From for entry in edgelist] 18 | authorlist.extend([entry.To for entry in edgelist]) 19 | author_dict = {item: [] for item in set(authorlist)} 20 | 21 | for author in author_dict: 22 | alter_list = list(G.neighbors(author)) 23 | alter_2_list = [] 24 | for alter in alter_list: 25 | alters_2 = list(G.neighbors(alter)) 26 | alter_2_list.extend(alters_2) 27 | 28 | alter_list = list(set(alter_list)) 29 | alter_2_list = list(set(alter_2_list)) 30 | 31 | alter_2_list.remove(author) 32 | for alter in alter_list: 33 | if alter in alter_2_list: 34 | alter_2_list.remove(alter) 35 | 36 | author_dict[author] = [alter_list, alter_2_list] 37 | 38 | return author_dict 39 | -------------------------------------------------------------------------------- /nate/socnet/centralities.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a MODULE docstring 3 | """ 4 | from importlib.util import find_spec 5 | import igraph 6 | from collections import namedtuple 7 | 8 | cent_tuple = namedtuple('centralities', ['vertex', 'betweenness', 'closeness', 'eigenvector']) 9 | 10 | def compute_centralities(tuples, force_igraph = False): 11 | """ 12 | This is a docstring 13 | """ 14 | 15 | if find_spec('graph_tool') != None and force_igraph == False: 16 | print("using graph-tool") 17 | return gt_cents(tuples) 18 | 19 | elif find_spec('igraph') != None: 20 | print("using igraph") 21 | return igraph_cents(tuples) 22 | 23 | else: 24 | raise Exception("Please ensure that either graph_tool or python_igraph are installed.") 25 | 26 | def gt_cents(tuples): 27 | """ 28 | This is a docstring 29 | """ 30 | author_lookup = {} 31 | author_number = 0 32 | 33 | for entry in tuples: 34 | for i in range(2): 35 | if entry[i] not in author_lookup: 36 | author_lookup[entry[i]] = author_number 37 | author_number += 1 38 | 39 | import graph_tool as gt 40 | from graph_tool.centrality import betweenness, closeness, eigenvector 41 | 42 | G = gt.Graph(directed=False) 43 | 44 | for edge in tuples: 45 | G.add_edge(author_lookup[edge[0]], author_lookup[edge[1]], add_missing=True) 46 | 47 | betweenness_vertex, _ = betweenness(G) 48 | closeness_vertex = closeness(G) 49 | _, eigenvector_vertex = eigenvector(G) 50 | 51 | return_list = [] 52 | 53 | for k, v in author_lookup.items(): 54 | cent = cent_tuple( 55 | k, 56 | betweenness_vertex[v], 57 | closeness_vertex[v], 58 | eigenvector_vertex[v] 59 | ) 60 | return_list.append(cent) 61 | 62 | return return_list 63 | 64 | def igraph_cents(tuples): 65 | """ 66 | This is a docstring 67 | """ 68 | G = igraph.Graph.TupleList(tuples, 
directed = False) 69 | 70 | vertex_list = G.vs() 71 | between_list = G.betweenness(directed=False) 72 | close_list = G.closeness(normalized=True) 73 | eigen_list = G.eigenvector_centrality(directed=False, scale=True) 74 | 75 | return_list = [] 76 | 77 | for i in range(len(vertex_list)): 78 | cent = cent_tuple( 79 | vertex_list[i]['name'], 80 | between_list[i], 81 | close_list[i], 82 | eigen_list[i] 83 | ) 84 | return_list.append(cent) 85 | 86 | return return_list 87 | -------------------------------------------------------------------------------- /nate/socnet/dissimilarities.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | def find_dissimilarities(): 5 | 6 | return -------------------------------------------------------------------------------- /nate/socnet/old_temsna/combine_covariates.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import metaknowledge as mk 3 | import networkx as nx 4 | import pickle 5 | 6 | node_covariates = pd.read_csv("../input/node_covariates.csv") 7 | sim_scores = pd.read_csv("../input/sim_scores.csv") 8 | centralities = pd.read_csv("../input/centralities.csv") 9 | 10 | node_covariates.rename(columns={'Unnamed: 0': 'author'}, inplace=True) 11 | 12 | node_covariates.head() 13 | 14 | sim_scores.head() 15 | 16 | sim_scores = sim_scores.merge(node_covariates, 17 | left_on="author", 18 | right_on="author") 19 | 20 | sim_scores.columns = [ 21 | "author", "dissim_alters", "dissim_alters_2", "alter_dissim_avg", 22 | "bridge_dissim_avg", "first_ring_dissim_avg", "num_citations", "num_papers", 23 | "career_start", "num_alter1", "num_alter2" 24 | ] 25 | 26 | sim_scores = sim_scores.merge(centralities, left_on="author", right_on="author") 27 | 28 | sim_scores.to_csv("../output/author_covariates.csv", index=False) 29 | -------------------------------------------------------------------------------- /nate/socnet/old_temsna/create_author_covariates.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pickle 3 | 4 | alter_list = pd.read_pickle("../input/alter_lists.pkl") 5 | alter_list = alter_list.set_index('author').to_dict() 6 | nodes = pd.read_csv("../input/coauthorship_nodeAttributes.csv") 7 | with open("../input/author_metadata.pkl", "rb") as pkl: 8 | author_metadata = pickle.load(pkl) 9 | 10 | num_citations = {} 11 | num_papers = {} 12 | career_start = {} 13 | num_alter1 = {} 14 | num_alter2 = {} 15 | 16 | for author in author_metadata: 17 | num_citations[author] = author_metadata[author]["wosTimesCited"] 18 | num_papers[author] = author_metadata[author]["num_papers"] = len( 19 | author_metadata[author]["wosString"]) 20 | author_metadata[author]["year"] = list( 21 | filter(None, author_metadata[author]["year"])) 22 | try: 23 | career_start[author] = min( 24 | [int(i) for i in author_metadata[author]["year"]]) 25 | except ValueError: 26 | career_start[author] = 2018 27 | try: 28 | num_alter1[author] = len(alter_list['alter'][author]) 29 | except KeyError: 30 | pass 31 | try: 32 | num_alter2[author] = len(alter_list['alter_2'][author]) 33 | except KeyError: 34 | pass 35 | 36 | covariates = pd.DataFrame.from_dict(num_citations, orient='index') 37 | 38 | covariates['num_citations'] = pd.Series(num_citations) 39 | covariates['num_papers'] = pd.Series(num_papers) 40 | covariates['career_start'] = pd.Series(career_start) 41 | covariates['num_alter1'] = pd.Series(num_alter1) 42 | 
covariates['num_alter2'] = pd.Series(num_alter2) 43 | 44 | covariates = covariates.drop(columns=[0]) 45 | 46 | covariates.to_csv("../output/node_covariates.csv") 47 | -------------------------------------------------------------------------------- /nate/socnet/old_temsna/extract_coauthor.py: -------------------------------------------------------------------------------- 1 | import metaknowledge as mk 2 | import pandas as pd 3 | import pickle 4 | import community 5 | import networkx as nx 6 | import yaml 7 | # Web of Science Field Codes 8 | # AF (Full Author) 9 | # TI (Title) 10 | # ID (WoS Keywords) 11 | # DE (Author keywords) 12 | # AB (Abstracts) 13 | # TC (Times Cited) 14 | # PY (Year Published) 15 | 16 | RCY = mk.RecordCollection('../input/', cached=False) 17 | 18 | RC = RCY.yearSplit(2008, 2019) 19 | 20 | coauth = RC.networkCoAuthor() 21 | mk.writeGraph(coauth, 'coauthorship') 22 | 23 | wos_dict = RC.makeDict( 24 | onlyTheseTags=["UT", "AF", "AU", "TI", "ID", "DE", "AB", "TC", "SO", "PY"], 25 | longNames=True, 26 | numAuthors=False, 27 | genderCounts=False) 28 | 29 | author_dict = {} 30 | 31 | abs_dict = {} 32 | 33 | cites_dict = {} 34 | 35 | for i in range(0, len(wos_dict['wosString'])): 36 | wosID = wos_dict['wosString'][i] 37 | 38 | try: 39 | abs_dict[wosID] = { 40 | "abstract": wos_dict['abstract'][i], 41 | "title": wos_dict['title'][i], 42 | "keywords": [], 43 | } 44 | 45 | cites_dict[wosID] = { 46 | "cites": wos_dict['wosTimesCited'][i], 47 | "year": wos_dict['year'][i], 48 | } 49 | 50 | abs_keywords = [] 51 | try: 52 | abs_keywords.extend(wos_dict['keywords'][i]) 53 | except TypeError: 54 | pass 55 | 56 | try: 57 | abs_keywords.extend(wos_dict['authKeywords'][i]) 58 | except TypeError: 59 | pass 60 | 61 | abs_dict[wosID]['keywords'] = list(set(x.lower() for x in abs_keywords)) 62 | 63 | except TypeError: 64 | pass 65 | 66 | try: 67 | for author in wos_dict['authorsFull'][i]: 68 | if author in coauth: 69 | if author not in author_dict: 70 | author_dict[author] = { 71 | "wosString": [], 72 | "title": [], 73 | "keywords": [], 74 | "abstract": [], 75 | "wosTimesCited": 0, 76 | "journal": [], 77 | "year": [], 78 | "community": 0, 79 | } 80 | 81 | combined_keywords = [] 82 | combined_keywords2 = [] 83 | try: 84 | combined_keywords.extend(wos_dict["keywords"][i]) 85 | except TypeError: 86 | pass 87 | try: 88 | combined_keywords.extend(wos_dict["authKeywords"][i]) 89 | except TypeError: 90 | pass 91 | 92 | for keyword in combined_keywords: 93 | combined_keywords2.append(keyword.lower()) 94 | 95 | combined_keywords2 = list(set(combined_keywords2)) 96 | 97 | author_dict[author]["wosString"].append( 98 | wos_dict["wosString"][i]) 99 | author_dict[author]["title"].append(wos_dict["title"][i]) 100 | author_dict[author]["keywords"] = combined_keywords2 101 | author_dict[author]["abstract"].append(wos_dict["abstract"][i]) 102 | author_dict[author]["wosTimesCited"] += ( 103 | wos_dict["wosTimesCited"][i]) 104 | author_dict[author]["journal"].append(wos_dict["journal"][i]) 105 | author_dict[author]["year"].append(wos_dict["year"][i]) 106 | except TypeError: 107 | pass 108 | 109 | with open("author_metadata.pkl", "wb") as handle: 110 | pickle.dump(author_dict, handle) 111 | 112 | with open("comm_abs.pkl", "wb") as handle: 113 | pickle.dump(abs_dict, handle) 114 | 115 | with open("cites_dict.yaml", "w") as stream: 116 | yaml.dump(cites_dict, stream, default_flow_style=False) 117 | -------------------------------------------------------------------------------- 
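# Illustrative sketch of the author_metadata dictionary that extract_coauthor.py
# (above) pickles and generate_meta_strings.py (below) reads. The field names
# come from the script; the author key and values here are hypothetical:
#
#     author_metadata["Doe, Jane"] = {
#         "wosString": ["WOS:000123456789"],
#         "title": ["An Example Title"],
#         "keywords": ["network analysis", "text"],
#         "abstract": ["An example abstract ..."],
#         "wosTimesCited": 12,
#         "journal": ["An Example Journal"],
#         "year": ["2015"],
#         "community": 0,
#     }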
/nate/socnet/old_temsna/generate_meta_strings.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pickle 3 | 4 | with open('../input/author_metadata.pkl', "rb") as pkl: 5 | author_dict = pickle.load(pkl) 6 | 7 | meta_string_dict = {} 8 | 9 | for author in author_dict: 10 | 11 | meta_string = "" 12 | journals = "" 13 | 14 | try: 15 | for title in author_dict[author]["title"]: 16 | meta_string = meta_string + title + " " 17 | except TypeError: 18 | pass 19 | try: 20 | for keyword in author_dict[author]["keywords"]: 21 | meta_string = meta_string + keyword + " " 22 | except TypeError: 23 | pass 24 | try: 25 | for abstract in author_dict[author]["abstract"]: 26 | meta_string = meta_string + abstract + " " 27 | except TypeError: 28 | pass 29 | try: 30 | for journal in author_dict[author]["journal"]: 31 | journals = journals + journal + " " 32 | except TypeError: 33 | pass 34 | 35 | meta_string_dict[author] = { 36 | "meta_string": meta_string, 37 | "journals": journals, 38 | } 39 | 40 | with open("../output/generated_meta_strings.pkl", "wb") as pkl: 41 | pickle.dump(meta_string_dict, pkl) 42 | -------------------------------------------------------------------------------- /nate/socnet/old_temsna/spacy_process/spacy_new.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import os 4 | import spacy 5 | import pickle 6 | from joblib import dump, load, Parallel, delayed, cpu_count 7 | from joblib import parallel_backend 8 | import warnings 9 | warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim') 10 | from gensim.models.phrases import Phrases, Phraser 11 | from sklearn.feature_extraction.text import TfidfVectorizer 12 | from sklearn.metrics.pairwise import cosine_similarity 13 | from toolz import partition_all 14 | import itertools 15 | import time 16 | from tqdm import tqdm 17 | from gensim.utils import simple_preprocess 18 | 19 | from scipy.sparse import vstack 20 | 21 | from numpy.lib.stride_tricks import as_strided # for removing the diagonal (self-self comparison) in a matrix 22 | 23 | from sklearn.metrics.pairwise import linear_kernel # equal to cosine_similarity for L2 normalized data 24 | 25 | from sklearn import preprocessing 26 | 27 | from yaml import load 28 | 29 | 30 | def mp(items, function, cpu, *args): 31 | batch_size = round( 32 | len(items) / 33 | cpu) # split the list of items so that each CPU receives one batch 34 | partitions = partition_all(batch_size, items) 35 | temp = Parallel(n_jobs=cpu, max_nbytes=None)(delayed(function)( 36 | v, *args) for v in partitions) #executes the function on each batch 37 | results = list( 38 | itertools.chain(*temp) 39 | ) # joblib.delayed returns a list of lists (ie. 
list of each batch result), concatenate them 40 | return results 41 | 42 | 43 | # same as above, but when 2 lists of results are needed 44 | def mp2(items, function, cpu, *args): 45 | batch_size = round(len(items) / cpu) 46 | partitions = partition_all(batch_size, items) 47 | temp = Parallel(n_jobs=cpu, max_nbytes=None)( 48 | delayed(function)(v, *args) for v in partitions) 49 | results1, results2 = zip(*temp) 50 | results1 = list(itertools.chain(*results1)) 51 | results2 = list(itertools.chain(*results2)) 52 | return results1, results2 53 | 54 | 55 | # ibid 56 | def mp3(items, function, cpu, *args): 57 | batch_size = round(len(items) / cpu) 58 | partitions = partition_all(batch_size, items) 59 | temp = Parallel(n_jobs=cpu, max_nbytes=None)( 60 | delayed(function)(v, *args) for v in partitions) 61 | results1, results2, results3 = zip(*temp) 62 | results1 = list(itertools.chain(*results1)) 63 | results2 = list(itertools.chain(*results2)) 64 | results3 = list(itertools.chain(*results3)) 65 | return results1, results2, results3 66 | 67 | 68 | def mp_shared(items, function, cpu, *args): 69 | batch_size = round( 70 | len(items) / 71 | cpu) # split the list of items so that each CPU receives one batch 72 | partitions = partition_all(batch_size, items) 73 | temp = Parallel(n_jobs=cpu, require='sharedmem', max_nbytes=None)( 74 | delayed(function)(v, *args) 75 | for v in partitions) #executes the function on each batch 76 | results = list( 77 | itertools.chain(*temp) 78 | ) # joblib.delayed returns a list of lists (ie. list of each batch result), concatenate them 79 | return results 80 | 81 | 82 | def mp2_shared(items, function, cpu, *args): 83 | batch_size = round(len(items) / cpu) 84 | partitions = partition_all(batch_size, items) 85 | temp = Parallel(n_jobs=cpu, require='sharedmem', max_nbytes=None)( 86 | delayed(function)(v, *args) for v in partitions) 87 | results1, results2 = zip(*temp) 88 | results1 = list(itertools.chain(*results1)) 89 | results2 = list(itertools.chain(*results2)) 90 | return results1, results2 91 | 92 | 93 | def mp3_shared(items, function, cpu, *args): 94 | batch_size = round(len(items) / cpu) 95 | partitions = partition_all(batch_size, items) 96 | temp = Parallel(n_jobs=cpu, require='sharedmem', max_nbytes=None)( 97 | delayed(function)(v, *args) for v in partitions) 98 | results1, results2, results3 = zip(*temp) 99 | results1 = list(itertools.chain(*results1)) 100 | results2 = list(itertools.chain(*results2)) 101 | results3 = list(itertools.chain(*results3)) 102 | return results1, results2, results3 103 | 104 | 105 | def dissim_rba(auth_list, auth_alt_dict, auth_alt_dict_2, auth_vectors): 106 | rb_avg_dissims = [] 107 | ring_avg_dissims = [] 108 | bridge_avg_dissims = [] 109 | for batch_list in batch(auth_list, 50): 110 | comp_list = [] 111 | for author in batch_list: 112 | comp_list += [author] 113 | comp_list += auth_alt_dict[author] 114 | comp_list += auth_alt_dict_2[author] 115 | comp_list = sorted(list(set(comp_list))) 116 | comp_dict = {k: v for v, k in enumerate(comp_list)} 117 | comp_vectors = [] 118 | for member in comp_list: 119 | comp_vectors.append(auth_vectors[member]) 120 | v_array = vstack(comp_vectors) 121 | dissim_matrix = v_array @ v_array.T 122 | dissim_matrix = dissim_matrix.todense() 123 | 124 | for author in batch_list: 125 | 126 | rb_dissims = [] 127 | ring_dissims = [] 128 | bridge_dissims = [] 129 | if len(auth_alt_dict[author]) > 0: 130 | alter_list = auth_alt_dict[author] 131 | 132 | for alter in alter_list: 133 | if len(auth_alt_dict[alter]) > 
1: 134 | alter_2_list = auth_alt_dict[alter] 135 | ring_list = list_common(alter_list, alter_2_list) 136 | bridge_list = list_difference(alter_2_list, alter_list) 137 | alter_2_list_trim = [ 138 | x for x in alter_2_list if x != author 139 | ] 140 | bridge_list_trim = [ 141 | x for x in bridge_list if x != author 142 | ] 143 | if len(alter_2_list_trim) > 0: 144 | alter_dissim = create_average_dissim( 145 | alter, alter_2_list_trim, comp_dict, 146 | dissim_matrix) 147 | rb_dissims.append(1 - alter_dissim) 148 | if len(ring_list) > 0: 149 | alter_dissim = create_average_dissim( 150 | alter, ring_list, comp_dict, dissim_matrix) 151 | ring_dissims.append(1 - alter_dissim) 152 | if len(bridge_list_trim) > 0: 153 | alter_dissim = create_average_dissim( 154 | alter, bridge_list_trim, comp_dict, 155 | dissim_matrix) 156 | bridge_dissims.append(1 - alter_dissim) 157 | 158 | if len(rb_dissims) > 0: 159 | rb_avg_dissims.append(np.round(np.average(rb_dissims), 3)) 160 | else: 161 | rb_avg_dissims.append('NA') 162 | 163 | if len(ring_dissims) > 0: 164 | ring_avg_dissims.append(np.round(np.average(ring_dissims), 3)) 165 | else: 166 | ring_avg_dissims.append('NA') 167 | 168 | if len(bridge_dissims) > 0: 169 | bridge_avg_dissims.append( 170 | np.round(np.average(bridge_dissims), 3)) 171 | else: 172 | bridge_avg_dissims.append('NA') 173 | 174 | return (rb_avg_dissims, ring_avg_dissims, bridge_avg_dissims) 175 | 176 | 177 | def group_avg_dissim(members, vectors): 178 | member_vectors = [] 179 | for member in members: 180 | member_vectors.append(vectors[member]) 181 | v_array = vstack(member_vectors) 182 | group_dissim = 1 - linear_kernel(v_array) 183 | m = group_dissim.shape[0] 184 | s0, s1 = group_dissim.strides 185 | dissim_avg = np.round( 186 | np.average( 187 | as_strided(group_dissim.ravel()[1:], 188 | shape=(m - 1, m), 189 | strides=(s0 + s1, s1)).reshape(m, -1)), 3) 190 | 191 | return dissim_avg 192 | 193 | 194 | # perform NLP on a list of texts, requires NLP object from main() function (note for future work: NLP object can't be pickled using 195 | # python's pickle module (fast), so there may be performance gains possible by sorting this out re: disabling Loky in mp() functions) 196 | def spacy_process(texts, nlp): 197 | processed_list = [] 198 | copyright_stops = ['elsevier', 'right', 'rights', '(c)', 199 | 'ltd'] # domain specific stop words to remove 200 | allowed_postags = ['NOUN', 'PROPN'] # parts of speech to keep 201 | for doc in nlp.pipe( 202 | texts 203 | ): # nlp.pipe sends texts to spacy_process in batches for efficiency. 
Default is 128 (should experiment) 204 | processed = [] 205 | for token in doc: 206 | if token.is_stop == False and len( 207 | token) > 1: # don't bother with single char tokens 208 | if token.text not in copyright_stops and token.pos_ in allowed_postags: 209 | processed.append( 210 | token.lemma_ 211 | ) # keeping lemmatized version of each NOUN and PROPN 212 | processed = ' '.join( 213 | processed 214 | ) # concat the tokens of the document with whitespace between 215 | processed_list.append( 216 | processed 217 | ) # add the doc's processed words to the list of processed documents 218 | return processed_list 219 | 220 | 221 | # same as above, but with a small batch size for memory constraints 222 | def spacy_process_large(texts, nlp): 223 | processed_list = [] 224 | copyright_stops = ['elsevier', 'right', 'rights', '(c)', 'ltd'] 225 | allowed_postags = ['NOUN', 'PROPN'] 226 | for doc in nlp.pipe(texts, batch_size=1): 227 | processed = [] 228 | for token in doc: 229 | if token.is_stop == False and len(token) > 1: 230 | if token.text not in copyright_stops and token.pos_ in allowed_postags: 231 | processed.append(token.lemma_) 232 | processed = ' '.join(processed) 233 | processed_list.append(processed) 234 | return processed_list 235 | 236 | 237 | # bigram detection on a list of texts using sklearn's Phrases module. Note: test whether creating trigrams is as simple as calling 238 | # this process on the text again 239 | def bigram_process(texts): 240 | words = [ 241 | simple_preprocess(x, deacc=False) for x in texts 242 | ] # very efficient preprocessing into tokens based on white space only 243 | phrases = Phrases(words, min_count=1, threshold=0.8, 244 | scoring='npmi') # bigram model training 245 | bigram = Phraser( 246 | phrases) # creates a leaner specialized version of the bigram model 247 | bigrams = list( 248 | bigram[words]) # concatenate words into bigrams (ie. 
word1_word2) 249 | bigrams = [' '.join(words) for words in bigrams] 250 | return bigrams 251 | 252 | 253 | def list_difference(list1, list2): 254 | return (list(set(list1) - set(list2))) 255 | 256 | 257 | def list_common(list1, list2): 258 | return (list(set(list1).intersection(list2))) 259 | 260 | 261 | #not used for now 262 | 263 | 264 | def batch(batch_list, n=1): 265 | l = len(batch_list) 266 | for ndx in range(0, l, n): 267 | yield batch_list[ndx:min(ndx + n, l)] 268 | 269 | 270 | def create_average_dissim(ego, alters, index_dict, matrix): 271 | dissims = [] 272 | ego_idx = index_dict[ego] 273 | for alter in alters: 274 | alter_idx = index_dict[alter] 275 | 276 | dissim = matrix[ego_idx, alter_idx] 277 | 278 | dissims.append(dissim) 279 | dissim_avg = np.round(np.average(dissims), 3) 280 | return dissim_avg 281 | 282 | 283 | def dissim_alters(auth_list, auth_alt_dict, auth_alt_dict_2, auth_vectors): 284 | alters_avg_dissims = [] 285 | alters_2_avg_dissims = [] 286 | for batch_list in batch(auth_list, 4): 287 | comp_list = [] 288 | for author in batch_list: 289 | comp_list += [author] 290 | if author in auth_alt_dict and len(auth_alt_dict[author]) > 0: 291 | comp_list += auth_alt_dict[author] 292 | if author in auth_alt_dict and len(auth_alt_dict_2[author]) > 0: 293 | comp_list += auth_alt_dict_2[author] 294 | comp_list = sorted(list(set(comp_list))) 295 | comp_dict = {k: v for v, k in enumerate(comp_list)} 296 | comp_vectors = [] 297 | for member in comp_list: 298 | comp_vectors.append(auth_vectors[member]) 299 | v_array = vstack(comp_vectors) 300 | dissim_matrix = v_array @ v_array.T 301 | dissim_matrix = dissim_matrix.todense() 302 | 303 | for author in batch_list: 304 | if author in auth_alt_dict and len(auth_alt_dict[author]) > 0: 305 | alter_list = auth_alt_dict[author] 306 | alter_dissim = create_average_dissim(author, alter_list, 307 | comp_dict, dissim_matrix) 308 | alters_avg_dissims.append(1 - alter_dissim) 309 | else: 310 | alters_avg_dissims.append('NA') 311 | if author in auth_alt_dict_2 and len(auth_alt_dict_2[author]) > 0: 312 | alter_list = auth_alt_dict_2[author] 313 | alter_dissim = create_average_dissim(author, alter_list, 314 | comp_dict, dissim_matrix) 315 | alters_2_avg_dissims.append(1 - alter_dissim) 316 | else: 317 | alters_2_avg_dissims.append('NA') 318 | 319 | return (alters_avg_dissims, alters_2_avg_dissims) 320 | 321 | 322 | def single_avg_dissim(ego, alter_list, vectors): 323 | ego_vector = vectors[ego] 324 | alter_vectors = [] 325 | if len(alter_list) > 1: 326 | for alter in alter_list: # create list of word vectors for each alter in the list 327 | alter_vectors.append(vectors[alter]) 328 | v_array = vstack( 329 | alter_vectors 330 | ) # stack the list of vectors into a numpy array of shape 1 x the number of alters 331 | ego_dissim = 1 - linear_kernel( 332 | ego_vector, v_array 333 | ) # pairwise comparison of author vector to all vectors in the array 334 | dissim_avg = np.round(np.average(ego_dissim), 335 | 3) # average the above results 336 | else: 337 | alter = alter_list[0] # if author has only 1 alter, no vstack is needed 338 | dissim_avg = np.round( 339 | np.average(1 - linear_kernel(ego_vector, vectors[alter])), 3) 340 | return dissim_avg 341 | 342 | 343 | #not used for now 344 | # def group_avg_dissim(members, vectors): 345 | # member_vectors = [] 346 | # for member in members: 347 | # member_vectors.append(vectors[member]) 348 | # v_array = vstack(member_vectors) 349 | # group_dissim = 1 - linear_kernel(v_array) 350 | # m = group_dissim.shape[0] 
351 | # s0,s1 = group_dissim.strides 352 | # dissim_avg = np.round(np.average(as_strided(group_dissim.ravel()[1:], shape=(m-1,m), strides=(s0+s1,s1)).reshape(m,-1)), 3) 353 | 354 | # return dissim_avg 355 | 356 | 357 | def main( 358 | ): #execute all functions within main to protect against multiprocessing infinite feedback loop 359 | 360 | if cpu_count() >= 8: #to avoid overtaxing Brad, save some cores 361 | cpu = 10 362 | else: 363 | cpu = cpu_count() 364 | 365 | with open( 366 | '../input/generated_meta_strings.pkl', "rb" 367 | ) as pkl: # dictionary with authors as keys and their strings as values 368 | auth_strings = pickle.load(pkl) 369 | 370 | with open( 371 | '../input/alter_lists.pkl', "rb" 372 | ) as pkl: # dataframe with author column, alters column, and alters_2 column 373 | alter_lists = pickle.load(pkl) 374 | 375 | auth_alt_dict = dict(zip(alter_lists.author, 376 | alter_lists.alter)) # dict of {auth:alter list} 377 | auth_alt_dict_2 = dict( 378 | zip(alter_lists.author, 379 | alter_lists.alter_2)) # dict of {auth: alter_2 list} 380 | auth_list = sorted(list(auth_strings.keys()))[:] # list of author names 381 | 382 | abs_list = [] # list of author strings to process 383 | 384 | # NOTE: this is only safe because the auth_strings dict hasn't been modified. Should be modified for posterity 385 | for author in auth_list: 386 | abs_list.append(auth_strings[author]["meta_string"]) 387 | 388 | del auth_strings 389 | 390 | bigram_text = bigram_process( 391 | abs_list) # find and concatenate bigrams in the author string list 392 | 393 | # load spacy model, disable unnecessary parser and named entity recog for performance 394 | #spacy.require_gpu() 395 | nlp = spacy.load('en', disable=['parser', 'ner']) 396 | 397 | #nlp.max_length = 10000000 # community strings are very large, may cause memory problems on modest PCs - needs rethinking 398 | 399 | # send bigrammed text and spacy function + its required variables to multiprocess function for execution 400 | processed_list = mp(bigram_text, spacy_process, cpu, nlp) 401 | vectorizer = TfidfVectorizer(max_df=0.5, 402 | min_df=3, 403 | stop_words='english', 404 | norm='l2') 405 | matrix = vectorizer.fit_transform( 406 | processed_list) # Tfidf vectors for each author string 407 | auth_vectors = dict(zip(auth_list, 408 | matrix)) # creat a dict of {author : tfidf vector} 409 | 410 | #create a dataframe by sending list of authors and the dissim function + its required variables to multiprocess function 411 | sim_df = pd.DataFrame() 412 | sim_df['author'] = pd.Series(auth_list) 413 | sim_df['dissim_alters'], sim_df['dissim_alters_2'] = pd.Series( 414 | mp2_shared(auth_list, dissim_alters, cpu, auth_alt_dict, 415 | auth_alt_dict_2, auth_vectors)).array 416 | sim_df['alter_dissim_avg'], sim_df['bridge_dissim_avg'], sim_df['first_ring_dissim_avg'] =\ 417 | pd.Series(mp3_shared(auth_list, dissim_rba, cpu, auth_alt_dict, auth_alt_dict_2, auth_vectors)).array 418 | 419 | sim_df.to_csv('../output/sim_scores.csv', index=False) 420 | 421 | 422 | if __name__ == '__main__': 423 | main() 424 | -------------------------------------------------------------------------------- /nate/socnet/old_temsna/temsna_dependencies_sparse.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UWNETLAB/Nate/9029670d5804f478cac2e83d77ff86ff2a7266c2/nate/socnet/old_temsna/temsna_dependencies_sparse.png -------------------------------------------------------------------------------- /nate/socnet/socnet_class.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | This module accepts a social network that has text attributes for nodes and outputs 3 | the same social network with similarity values between i,j as an edge attribute 4 | """ 5 | from nate.socnet.centralities import compute_centralities 6 | from nate.socnet.alters import find_alters 7 | from nate.socnet.dissimilarities import find_dissimilarities 8 | 9 | class SOCnet(): 10 | def __init__(self, data, edgelist): 11 | self.data = data 12 | self.edgelist = edgelist 13 | self.centralities = compute_centralities(edgelist) 14 | self.alters = find_alters(edgelist) 15 | self.dissimilarities = None 16 | -------------------------------------------------------------------------------- /nate/svonet/Arial.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UWNETLAB/Nate/9029670d5804f478cac2e83d77ff86ff2a7266c2/nate/svonet/Arial.ttf -------------------------------------------------------------------------------- /nate/svonet/__init__.py: -------------------------------------------------------------------------------- 1 | from .svo import findSVOs 2 | -------------------------------------------------------------------------------- /nate/svonet/degree_over_time.py: -------------------------------------------------------------------------------- 1 | from nate.svonet.graph_svo import generate_ticks, find_max_burst 2 | import networkx as nx 3 | import stop_words as sw 4 | import copy 5 | import pandas as pd 6 | import matplotlib as mpl 7 | import matplotlib.pyplot as plt 8 | import matplotlib.dates as mdates 9 | from matplotlib.ticker import MaxNLocator 10 | import numpy as np 11 | 12 | 13 | class DegreeOverTimeMixIn(): 14 | 15 | def __init__(self): 16 | self.offset_dict: dict 17 | self.edge_burst_dict: dict 18 | self.s: int 19 | self.gamma: int 20 | self.from_svo: bool 21 | self.lookup: dict 22 | 23 | def top_degree(self, 24 | number_of_slices: int = 8, 25 | list_top: int = 10, 26 | minimum_burst_level: int = 0, 27 | degree_type="both", 28 | remove_stop_words=True): 29 | """[summary] 30 | 31 | Args: 32 | number_of_slices (int, optional): [description]. Defaults to 20. 33 | list_top (int, optional): [description]. Defaults to 10. 34 | degree_type (str, optional): Type of degree calculation to use. 35 | Must be one of "in", "out", or "both". Defaults to "both". 
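            minimum_burst_level (int, optional): The minimum burst intensity
                an edge must exceed within a time slice for it to count
                toward degree. Defaults to 0.
            remove_stop_words (bool, optional): Whether to filter English
                stop words out of the returned degree lists. Defaults to
                True.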
36 | 37 | Returns: 38 | [type]: [description] 39 | """ 40 | 41 | if degree_type != "in" and degree_type != "out" and degree_type != "both": 42 | raise Exception( 43 | "`degree_type` must be one of 'in', 'out', or 'both'") 44 | 45 | # Create list of time slices: 46 | 47 | offset_set = set() 48 | 49 | for key in self.offset_dict: 50 | for offset in self.offset_dict[key]: 51 | offset_set.add(offset) 52 | 53 | time_slices, time_labels = generate_ticks( 54 | offset_set, number_of_ticks=(number_of_slices)) 55 | 56 | # Create network consisting of all Subjects and Objects: 57 | 58 | G = nx.DiGraph() 59 | 60 | for entry in self.edge_burst_dict: 61 | G.add_node(entry[0]) 62 | G.add_node(entry[-1]) 63 | 64 | # Iterate over time slices 65 | 66 | top_degree_by_slice = {} 67 | 68 | for i in range(1, len(time_slices)): 69 | graphCopy = copy.deepcopy(G) 70 | 71 | for key in self.edge_burst_dict: 72 | burst_level = find_max_burst(self.edge_burst_dict[key], 73 | time_slices[i - 1], time_slices[i]) 74 | 75 | if burst_level > minimum_burst_level: 76 | graphCopy.add_edge(key[0], key[-1]) 77 | 78 | if degree_type == "in": 79 | degree_list = list(graphCopy.in_degree) 80 | elif degree_type == "out": 81 | degree_list = list(graphCopy.out_degree) 82 | elif degree_type == "both": 83 | degree_list = list(graphCopy.degree) 84 | 85 | degree_list.sort(key=lambda x: x[1], reverse=True) 86 | 87 | if remove_stop_words: 88 | stops = sw.get_stop_words("english") 89 | degree_list = [ 90 | item for item in degree_list if item[0] not in stops 91 | ] 92 | 93 | top_degree_by_slice[time_labels[i]] = degree_list[0:list_top] 94 | 95 | return top_degree_by_slice 96 | 97 | def specific_degree(self, 98 | tokens: list, 99 | number_of_slices: int = 15, 100 | minimum_burst_level: int = 0, 101 | degree_type="both", 102 | remove_stop_words=False): 103 | """[summary] 104 | 105 | Args: 106 | tokens (list): [description] 107 | number_of_slices (int, optional): [description]. Defaults to 20. 108 | minimum_burst_level (int, optional): [description]. Defaults to 0. 109 | degree_type (str, optional): [description]. Defaults to "both". 110 | remove_stop_words (bool, optional): [description]. Defaults to False. 111 | 112 | Returns: 113 | [type]: [description] 114 | """ 115 | 116 | if isinstance(tokens, list) == False: 117 | tokens = [tokens] 118 | 119 | full_lists = self.top_degree(number_of_slices=number_of_slices, 120 | list_top=None, 121 | minimum_burst_level=minimum_burst_level, 122 | degree_type=degree_type, 123 | remove_stop_words=remove_stop_words) 124 | 125 | token_rank_dict = {} 126 | 127 | for day in full_lists: 128 | v = [item for item in full_lists[day] if item[0] in tokens] 129 | token_rank_dict[day] = v 130 | 131 | return token_rank_dict 132 | 133 | def plot_top_degree(self, 134 | number_of_slices: int = 8, 135 | list_top: int = 10, 136 | minimum_burst_level: int = 0, 137 | degree_type="both", 138 | remove_stop_words=True, 139 | filename: str = False, 140 | ): 141 | """[summary] 142 | 143 | Args: 144 | number_of_slices (int, optional): [description]. Defaults to 20. 145 | list_top (int, optional): [description]. Defaults to 10. 146 | minimum_burst_level (int, optional): [description]. Defaults to 0. 147 | degree_type (str, optional): [description]. Defaults to "both". 148 | remove_stop_words (bool, optional): [description]. Defaults to True. 
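            filename (str, optional): If provided, each time slice's chart is
                saved to disk as '<filename><slice number>.pdf' instead of
                being displayed. Defaults to False.

        Example (illustrative; assumes `my_svoburst` is an object exposing
        this mixin with `offset_dict` and `edge_burst_dict` already populated):

            my_svoburst.plot_top_degree(number_of_slices=8,
                                        list_top=10,
                                        degree_type="both")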
149 | """ 150 | 151 | data = self.top_degree(number_of_slices=number_of_slices, 152 | list_top=list_top, 153 | minimum_burst_level=minimum_burst_level, 154 | degree_type=degree_type, 155 | remove_stop_words=remove_stop_words) 156 | 157 | print(data) 158 | 159 | date_names = [] 160 | time_slices = [] 161 | 162 | for k, v in data.items(): 163 | date_names.append(k) 164 | time_slices.append(v) 165 | 166 | for i in range(1, len(date_names)): 167 | 168 | x = np.arange(list_top) 169 | values = [] 170 | names = [] 171 | 172 | for top_degrees in time_slices[i]: 173 | values.append(top_degrees[1]) 174 | names.append(top_degrees[0]) 175 | 176 | values.reverse() 177 | names.reverse() 178 | 179 | if np.sum(values) > 0: 180 | fig, ax = plt.subplots() 181 | fig.set_figwidth(6) 182 | fig.set_figheight(10) 183 | fig.suptitle('{} to {}'.format(date_names[i - 1], 184 | date_names[i]), 185 | fontsize=12, ha="center") 186 | ax.xaxis.set_major_locator(MaxNLocator(integer=True)) 187 | plt.barh(x, values, color='#32363A') 188 | plt.yticks(x, names) 189 | 190 | if filename: 191 | plt.savefig(str(filename) + str(i) + ".pdf") 192 | else: 193 | plt.show() 194 | else: 195 | print("No nodes with degree > 0 in this time slice.") 196 | 197 | def plot_specific_degree(self, 198 | tokens: list, 199 | number_of_slices: int = 15, 200 | minimum_burst_level: int = 0, 201 | degree_type="both", 202 | plot_type="line", 203 | remove_stop_words=False, 204 | filename: str = False,): 205 | """[summary] 206 | 207 | Args: 208 | tokens (list): [description] 209 | number_of_slices (int, optional): [description]. Defaults to 20. 210 | minimum_burst_level (int, optional): [description]. Defaults to 0. 211 | degree_type (str, optional): [description]. Defaults to "both". 212 | plot_type (str, optional): [description]. Defaults to "line". 213 | remove_stop_words (bool, optional): [description]. Defaults to False. 
214 | 215 | Raises: 216 | Exception: [description] 217 | """ 218 | 219 | if isinstance(tokens, list) == False: 220 | tokens = [tokens] 221 | 222 | if plot_type != "line" and plot_type != "bar": 223 | raise Exception("`plot_type` must be one of 'line' or 'bar'") 224 | 225 | data = self.specific_degree(tokens=tokens, 226 | number_of_slices=number_of_slices, 227 | minimum_burst_level=minimum_burst_level, 228 | degree_type=degree_type, 229 | remove_stop_words=remove_stop_words) 230 | 231 | inverted_dict = {} 232 | 233 | for token in tokens: 234 | full_list = [] 235 | 236 | for date, degree_list in data.items(): 237 | degree = [item[1] for item in degree_list if item[0] == token] 238 | full_list.append((date, degree[0])) 239 | 240 | inverted_dict[token] = full_list 241 | 242 | x = np.arange(number_of_slices) 243 | 244 | for k, v in inverted_dict.items(): 245 | 246 | values = [item[1] for item in v] 247 | dates = [item[0].replace(", ", "\n") for item in v] 248 | 249 | fig, ax = plt.subplots() 250 | fig.set_figwidth(10) 251 | fig.set_figheight(6) 252 | fig.suptitle("'{}'".format(k), fontsize=12, ha="center") 253 | ax.yaxis.set_major_locator(MaxNLocator(integer=True)) 254 | if plot_type == "bar": 255 | plt.bar(x, values, color='#32363A') 256 | elif plot_type == "line": 257 | plt.plot(x, values, color='#32363A') 258 | plt.xticks(x, dates) 259 | 260 | if filename: 261 | plt.savefig(str(filename) + str(k) + ".pdf") 262 | else: 263 | plt.show() 264 | -------------------------------------------------------------------------------- /nate/svonet/graph_svo.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a MODULE docstring 3 | """ 4 | 5 | import networkx as nx 6 | from PIL import Image 7 | from os import remove 8 | from typing import Tuple, List 9 | from datetime import datetime 10 | 11 | color_dict = { 12 | 0: "#F62D2D", 13 | 1: "#D3212D", 14 | 2: "#A2264B", 15 | 3: "#722B6A", 16 | 4: "#412F88", 17 | 5: "#1F0033", 18 | 6: "#000000" 19 | } 20 | 21 | 22 | def generate_ticks(offsets, number_of_ticks=10) -> Tuple[List[int], List[str]]: 23 | """[summary] 24 | 25 | Args: 26 | offsets ([type]): [description] 27 | number_of_ticks (int, optional): [description]. Defaults to 10. 
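    Example (illustrative, with two POSIX timestamps exactly one year apart):

        >>> ticks, labels = generate_ticks({1546300800, 1577836800},
        ...                                number_of_ticks=2)
        >>> labels
        ['Jan 01, 2019', 'Jul 02, 2019', 'Jan 01, 2020']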
28 | 29 | Returns: 30 | Tuple[List[int], List[str]]: [description] 31 | """ 32 | 33 | rawdif = max(offsets) - min(offsets) 34 | 35 | divdiff = rawdif / number_of_ticks 36 | 37 | chunk_size = round(divdiff) 38 | 39 | tick_positions: List[int] = [] 40 | 41 | for i in range(0, number_of_ticks + 1): 42 | tick_positions.append(int(min(offsets) + (i * chunk_size))) 43 | 44 | tick_labels: List[str] = [] 45 | 46 | for tick in tick_positions: 47 | 48 | time_label = datetime.utcfromtimestamp(tick).strftime("%b %d, %Y") 49 | 50 | tick_labels.append(time_label) 51 | 52 | return tick_positions, tick_labels 53 | 54 | 55 | def find_max_burst(burst_list: list, offset_start, offset_end): 56 | """[summary] 57 | 58 | Args: 59 | burst_list (list): [description] 60 | offset_start ([type]): [description] 61 | offset_end ([type]): [description] 62 | 63 | Returns: 64 | [type]: [description] 65 | """ 66 | 67 | burst_levels = set() 68 | burst_levels.add(0) 69 | 70 | for burst in burst_list: 71 | if burst[2] < offset_start or offset_end < burst[1]: #offset_start < burst[1] < offset_end or offset_start < burst[2] < offset_end: 72 | pass 73 | else: 74 | burst_levels.add(burst[0]) 75 | 76 | return max(burst_levels) 77 | 78 | 79 | class SVOgraphMixin(): 80 | 81 | def get_giant_component(self): 82 | """[summary] 83 | 84 | Returns: 85 | [type]: [description] 86 | """ 87 | 88 | G = nx.DiGraph() 89 | 90 | svo_list = self.edge_burst_dict 91 | 92 | for entry in svo_list: 93 | G.add_edge(entry[0], entry[2], label=" " + entry[1]) 94 | 95 | return G.subgraph(max(nx.weakly_connected_components(G), 96 | key=len)).copy() 97 | 98 | def save_svo_graph(self, 99 | term_list, 100 | use_giant=False, 101 | file_name=None, 102 | return_networkx=False): 103 | """[summary] 104 | 105 | Args: 106 | term_list ([type]): [description] 107 | use_giant (bool, optional): [description]. Defaults to False. 108 | file_name ([type], optional): [description]. Defaults to None. 109 | return_networkx (bool, optional): [description]. Defaults to False. 110 | 111 | Returns: 112 | [type]: [description] 113 | """ 114 | 115 | G = nx.DiGraph() 116 | 117 | if isinstance(term_list, str): 118 | term_list = [term_list] 119 | 120 | svo_list = self.edge_burst_dict 121 | 122 | for entry in svo_list: 123 | include = False 124 | for entry_part in entry: 125 | if entry_part in term_list: 126 | include = True 127 | 128 | for term in term_list: 129 | if term in entry_part or entry_part in term: 130 | include = True 131 | 132 | if include: 133 | G.add_edge(entry[0], entry[2], label=" " + entry[1]) 134 | 135 | for entry in G: 136 | G.nodes[entry]['style'] = 'filled' 137 | G.nodes[entry]['fillcolor'] = 'cadetblue2' 138 | 139 | toPdot = nx.drawing.nx_pydot.to_pydot 140 | N = toPdot(G) 141 | 142 | if return_networkx: 143 | return G 144 | else: 145 | if file_name == None: 146 | file_name = "_".join(term_list) 147 | 148 | N.write(file_name + "_svo_visualization.png", 149 | prog='dot', 150 | format='png') 151 | 152 | def create_svo_animation(self, 153 | term_list, 154 | use_giant=False, 155 | num_ticks=20, 156 | delay_per_tick=3, 157 | file_name="test", 158 | remove_images=True): 159 | """[summary] 160 | 161 | Args: 162 | term_list ([type]): [description] 163 | use_giant (bool, optional): [description]. Defaults to False. 164 | num_ticks (int, optional): [description]. Defaults to 20. 165 | delay_per_tick (int, optional): [description]. Defaults to 3. 166 | file_name (str, optional): [description]. Defaults to "test". 167 | remove_images (bool, optional): [description]. 
Defaults to True. 168 | """ 169 | 170 | file_name = str(file_name) 171 | 172 | if use_giant: 173 | G = self.get_giant_component() 174 | else: 175 | G = self.save_svo_graph(self, term_list, return_networkx=True) 176 | 177 | offset_list = set() 178 | svo_keys = [] 179 | 180 | for edge in G.edges: 181 | G[edge[0]][edge[1]]['burst_last'] = -100 182 | G[edge[0]][edge[1]]['burst_level'] = 0 183 | G[edge[0]][edge[1]]['color'] = "black" 184 | G[edge[0]][edge[1]]['penwidth'] = 1 185 | label = G.get_edge_data(edge[0], edge[1])['label'] 186 | key = (edge[0], label[1:], edge[1]) 187 | offsets = self.offset_dict[key] 188 | offset_list.add(min(offsets)) 189 | offset_list.add(max(offsets)) 190 | svo_keys.append(key) 191 | 192 | time_slices, time_labels = generate_ticks(offset_list, num_ticks) 193 | 194 | initial_graph = nx.drawing.nx_pydot.to_pydot(G) 195 | 196 | graphs = [initial_graph] 197 | 198 | for i in range(1, len(time_slices)): 199 | # The following lines are for functionality not yet implemented: we can cause the nodes - not just the edges - to show their burst patterns 200 | # bursting_nodes = set() 201 | # cooling_nodes = set() 202 | # inactive_nodes = set() 203 | for key in svo_keys: 204 | 205 | burst_level = find_max_burst(self.edge_burst_dict[key], 206 | time_slices[i - 1], time_slices[i]) 207 | 208 | G[key[0]][key[2]]['burst_level'] = burst_level 209 | 210 | if burst_level > 0: 211 | G[key[0]][key[2]]['burst_last'] = i 212 | # print(key[0]) 213 | # print(key[1]) 214 | # print(key[2]) 215 | # print(i) 216 | 217 | distance = i - G[key[0]][key[2]]['burst_last'] 218 | 219 | color = color_dict[min([distance, 6])] 220 | penwidth = max([6 - distance, 0.5]) 221 | 222 | G[key[0]][key[2]]['penwidth'] = penwidth 223 | G[key[0]][key[2]]['color'] = color 224 | 225 | subgraph = nx.drawing.nx_pydot.to_pydot(G) 226 | 227 | graphs.append(subgraph) 228 | 229 | filenames = [] 230 | 231 | for i in range(len(graphs)): 232 | this_file = file_name + "_" + str(i) + ".png" 233 | filenames.append(this_file) 234 | 235 | graphs[i].write_png(this_file) 236 | 237 | images = [] 238 | 239 | for name in filenames: 240 | images.append(Image.open(name)) 241 | 242 | images[0].save(file_name + ".gif", 243 | save_all=True, 244 | append_images=images[1:], 245 | optimize=False, 246 | duration=len(images * delay_per_tick), 247 | loop=0) 248 | 249 | if remove_images: 250 | for file_ in filenames: 251 | remove(file_) 252 | -------------------------------------------------------------------------------- /nate/svonet/svo.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a MODULE docstring. This module is a modification of the `enhanced-subject-verb-object-extraction` 3 | package by Rock de Vocht: https://github.com/peter3125/enhanced-subject-verb-object-extraction 4 | Changes are primarily to allow filtering for subjects and objects included in optional lists of spaCy's named 5 | entity tags, as well as including those tags in the output tuple. 6 | Note that this module must receive sentences one at a time, otherwise a passive sentence will flag 7 | all subsequent sentences as passive, reversing subject and object order incorrectly. 8 | """ 9 | 10 | # 11 | # Copyright 2017 Peter de Vocht 12 | # 13 | # Licensed under the Apache License, Version 2.0 (the "License"); 14 | # you may not use this file except in compliance with the License. 
15 | # You may obtain a copy of the License at 16 | # 17 | # http://www.apache.org/licenses/LICENSE-2.0 18 | # 19 | # Unless required by applicable law or agreed to in writing, software 20 | # distributed under the License is distributed on an "AS IS" BASIS, 21 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 22 | # See the License for the specific language governing permissions and 23 | # limitations under the License. 24 | 25 | # dependency markers for subjects 26 | SUBJECTS = {"nsubj", "nsubjpass", "csubj", "csubjpass", "agent", "expl"} 27 | # dependency markers for objects 28 | OBJECTS = {"dobj", "dative", "attr", "oprd"} 29 | # POS tags that will break adjoining items 30 | BREAKER_POS = {"CCONJ", "VERB"} 31 | # words that are negations 32 | NEGATIONS = {"no", "not", "n't", "never", "none"} 33 | 34 | sub_ner_tags = False 35 | obj_ner_tags = False 36 | sub_ent_types = [] 37 | obj_ent_types = [] 38 | 39 | 40 | # does dependency set contain any coordinating conjunctions? 41 | def contains_conj(depSet): 42 | 43 | return "and" in depSet or "or" in depSet or "nor" in depSet or \ 44 | "but" in depSet or "yet" in depSet or "so" in depSet or "for" in depSet 45 | 46 | 47 | # get subs joined by conjunctions 48 | def _get_subs_from_conjunctions(subs): 49 | 50 | more_subs = [] 51 | for sub in subs: 52 | # rights is a generator 53 | rights = list(sub.rights) 54 | rightDeps = {tok.lower_ for tok in rights} 55 | if contains_conj(rightDeps): 56 | if sub_ner_tags: 57 | more_subs.extend([ 58 | tok for tok in rights 59 | if tok.dep_ in SUBJECTS and tok.ent_type_ in sub_ner_tags 60 | ]) 61 | else: 62 | more_subs.extend([ 63 | tok for tok in rights 64 | if tok.dep_ in SUBJECTS or tok.pos_ == "NOUN" 65 | ]) 66 | if len(more_subs) > 0: 67 | more_subs.extend(_get_subs_from_conjunctions(more_subs)) 68 | return more_subs 69 | 70 | 71 | # get objects joined by conjunctions 72 | def _get_objs_from_conjunctions(objs): 73 | 74 | more_objs = [] 75 | for obj in objs: 76 | # rights is a generator 77 | rights = list(obj.rights) 78 | rightDeps = {tok.lower_ for tok in rights} 79 | if contains_conj(rightDeps): 80 | if obj_ner_tags: 81 | more_objs.extend([ 82 | tok for tok in rights 83 | if (tok.dep_ in OBJECTS and tok.ent_type_ in obj_ner_tags) 84 | or (tok.pos_ == "NOUN" and tok.ent_type_ in obj_ner_tags) 85 | ]) 86 | else: 87 | more_objs.extend([ 88 | tok for tok in rights 89 | if tok.dep_ in OBJECTS or tok.pos_ == "NOUN" 90 | ]) 91 | if len(more_objs) > 0: 92 | more_objs.extend(_get_objs_from_conjunctions(more_objs)) 93 | return more_objs 94 | 95 | 96 | # find sub dependencies 97 | def _find_subs(tok): 98 | 99 | head = tok.head 100 | while head.pos_ != "VERB" and head.pos_ != "NOUN" and head.head != head: 101 | head = head.head 102 | if head.pos_ == "VERB": 103 | if sub_ner_tags: 104 | subs = [ 105 | tok for tok in head.lefts 106 | if tok.dep_ == "SUB" and tok.ent_type_ in sub_ner_tags 107 | ] 108 | else: 109 | subs = [tok for tok in head.lefts if tok.dep_ == "SUB"] 110 | if len(subs) > 0: 111 | verb_negated = _is_negated(head) 112 | subs.extend(_get_subs_from_conjunctions(subs)) 113 | return subs, verb_negated 114 | elif head.head != head: 115 | return _find_subs(head) 116 | elif sub_ner_tags and head.ent_type_ in sub_ner_tags: 117 | return [head], _is_negated(tok) 118 | elif not sub_ner_tags and head.pos_ == "NOUN": 119 | return [head], _is_negated(tok) 120 | return [], False 121 | 122 | 123 | # is the tok set's left or right negated? 
124 | def _is_negated(tok): 125 | 126 | parts = list(tok.lefts) + list(tok.rights) 127 | for dep in parts: 128 | if dep.lower_ in NEGATIONS: 129 | return True 130 | return False 131 | 132 | 133 | # get all the verbs on tokens with negation marker 134 | def _find_svs(tokens): 135 | 136 | svs = [] 137 | verbs = [tok for tok in tokens if tok.pos_ == "VERB"] 138 | for v in verbs: 139 | subs, verbNegated = _get_all_subs(v) 140 | if len(subs) > 0: 141 | for sub in subs: 142 | svs.append( 143 | (sub.orth_, "!" + v.orth_ if verbNegated else v.orth_)) 144 | return svs 145 | 146 | 147 | # get grammatical objects for a given set of dependencies (including passive sentences) 148 | def _get_objs_from_prepositions(deps, is_pas): 149 | 150 | objs = [] 151 | for dep in deps: 152 | if obj_ner_tags: 153 | if dep.pos_ == "ADP" and (dep.dep_ == "prep" or 154 | (is_pas and dep.dep_ == "agent")): 155 | objs.extend([ 156 | tok for tok in dep.rights 157 | if (tok.dep_ in OBJECTS and tok.ent_type_ in obj_ner_tags) 158 | ]) 159 | #(is_pas and tok.ent_type_ in obj_ner_tags and tok.dep_ == 'pobj')]) #temporarily disabled 160 | else: 161 | if dep.pos_ == "ADP" and (dep.dep_ == "prep" or 162 | (is_pas and dep.dep_ == "agent")): 163 | objs.extend([ 164 | tok for tok in dep.rights if tok.dep_ in OBJECTS or 165 | (tok.pos_ == "PRON" and tok.lower_ == "me") or 166 | (is_pas and tok.dep_ == 'pobj') 167 | ]) 168 | return objs 169 | 170 | 171 | # get objects from the dependencies using the attribute dependency 172 | # *NOTE* disabled for unknown reason in _get_all_objs, this needs NER option if it should be enabled 173 | def _get_objs_from_attrs(deps, is_pas): 174 | 175 | for dep in deps: 176 | if dep.pos_ == "NOUN" and dep.dep_ == "attr": 177 | verbs = [tok for tok in dep.rights if tok.pos_ == "VERB"] 178 | if len(verbs) > 0: 179 | for v in verbs: 180 | rights = list(v.rights) 181 | objs = [tok for tok in rights if tok.dep_ in OBJECTS] 182 | objs.extend(_get_objs_from_prepositions(rights, is_pas)) 183 | if len(objs) > 0: 184 | return v, objs 185 | return None, None 186 | 187 | 188 | # xcomp; open complement - verb has no subject 189 | def _get_obj_from_xcomp(deps, is_pas): 190 | 191 | for dep in deps: 192 | if dep.pos_ == "VERB" and dep.dep_ == "xcomp": 193 | v = dep 194 | rights = list(v.rights) 195 | if obj_ner_tags: 196 | objs = [ 197 | tok for tok in rights 198 | if tok.dep_ in OBJECTS and tok.ent_type_ in obj_ner_tags 199 | ] 200 | else: 201 | objs = [tok for tok in rights if tok.dep_ in OBJECTS] 202 | objs.extend(_get_objs_from_prepositions(rights, is_pas)) 203 | if len(objs) > 0: 204 | return v, objs 205 | return None, None 206 | 207 | 208 | # get all functional subjects adjacent to the verb passed in 209 | def _get_all_subs(v): 210 | 211 | verb_negated = _is_negated(v) 212 | if sub_ner_tags: 213 | subs = [ 214 | tok for tok in v.lefts if tok.dep_ in SUBJECTS and 215 | tok.ent_type_ in sub_ner_tags and tok.pos_ != "DET" 216 | ] 217 | else: 218 | subs = [ 219 | tok for tok in v.lefts if tok.dep_ in SUBJECTS and tok.pos_ != "DET" 220 | ] 221 | if len(subs) > 0: 222 | subs.extend(_get_subs_from_conjunctions(subs)) 223 | else: 224 | foundSubs, verb_negated = _find_subs(v) 225 | subs.extend(foundSubs) 226 | 227 | global sub_ent_types 228 | sub_ent_types = [sub.ent_type_ for sub in subs] 229 | 230 | return subs, verb_negated 231 | 232 | 233 | # is the token a verb? 
(excluding auxiliary verbs) 234 | def _is_non_aux_verb(tok): 235 | 236 | return tok.pos_ == "VERB" and (tok.dep_ != "aux" and tok.dep_ != "auxpass") 237 | 238 | 239 | # return the verb to the right of this verb in a CCONJ relationship if applicable 240 | # returns a tuple, first part True|False and second part the modified verb if True 241 | def _right_of_verb_is_conj_verb(v): 242 | 243 | # rights is a generator 244 | rights = list(v.rights) 245 | 246 | # VERB CCONJ VERB (e.g. he beat and hurt me) 247 | if len(rights) > 1 and rights[0].pos_ == 'CCONJ': 248 | for tok in rights[1:]: 249 | if _is_non_aux_verb(tok): 250 | return True, tok 251 | 252 | return False, v 253 | 254 | 255 | # get all objects for an active/passive sentence 256 | def _get_all_objs(v, is_pas): 257 | 258 | # rights is a generator 259 | rights = list(v.rights) 260 | if obj_ner_tags: 261 | objs = [ 262 | tok for tok in rights 263 | if (tok.dep_ in OBJECTS and tok.ent_type_ in obj_ner_tags) or 264 | (is_pas and tok.dep_ == 'pobj' and tok.ent_type_ in obj_ner_tags) 265 | ] 266 | else: 267 | objs = [ 268 | tok for tok in rights 269 | if tok.dep_ in OBJECTS or (is_pas and tok.dep_ == 'pobj') 270 | ] 271 | objs.extend(_get_objs_from_prepositions(rights, is_pas)) 272 | 273 | #potentialNewVerb, potentialNewObjs = _get_objs_from_attrs(rights) 274 | #if potentialNewVerb is not None and potentialNewObjs is not None and len(potentialNewObjs) > 0: 275 | # objs.extend(potentialNewObjs) 276 | # v = potentialNewVerb 277 | 278 | potential_new_verb, potential_new_objs = _get_obj_from_xcomp(rights, is_pas) 279 | if potential_new_verb is not None and potential_new_objs is not None and len( 280 | potential_new_objs) > 0: 281 | objs.extend(potential_new_objs) 282 | v = potential_new_verb 283 | if len(objs) > 0: 284 | objs.extend(_get_objs_from_conjunctions(objs)) 285 | 286 | global obj_ent_types 287 | obj_ent_types = [obj.ent_type_ for obj in objs] 288 | 289 | return v, objs 290 | 291 | 292 | # return true if the sentence is passive - at he moment a sentence is assumed passive if it has an auxpass verb 293 | def _is_passive(tokens): 294 | 295 | for tok in tokens: 296 | if tok.dep_ == "auxpass": 297 | return True 298 | return False 299 | 300 | 301 | # resolve a 'that' where/if appropriate 302 | def _get_that_resolution(toks): 303 | 304 | for tok in toks: 305 | if 'that' in [t.orth_ for t in tok.lefts]: 306 | return tok.head 307 | return toks 308 | 309 | 310 | # simple stemmer using lemmas 311 | def _get_lemma(word: str): 312 | 313 | tokens = word #nlp(word) 314 | if len(tokens) == 1: 315 | return tokens[0].lemma_ 316 | return word 317 | 318 | 319 | # print information for displaying all kinds of things of the parse tree 320 | def printDeps(toks): 321 | 322 | for tok in toks: 323 | print(tok.orth_, tok.dep_, tok.pos_, tok.head.orth_, 324 | [t.orth_ for t in tok.lefts], [t.orth_ for t in tok.rights]) 325 | 326 | 327 | # expand an obj / subj np using its chunk 328 | def expand(item, tokens, visited): 329 | 330 | if item.lower_ == 'that': 331 | item = _get_that_resolution(tokens) 332 | 333 | parts = [] 334 | 335 | if hasattr(item, 'lefts'): 336 | for part in item.lefts: 337 | if part.pos_ in BREAKER_POS: 338 | break 339 | if not part.lower_ in NEGATIONS: 340 | parts.append(part) 341 | 342 | parts.append(item) 343 | 344 | if hasattr(item, 'rights'): 345 | for part in item.rights: 346 | if part.pos_ in BREAKER_POS: 347 | break 348 | if not part.lower_ in NEGATIONS: 349 | parts.append(part) 350 | 351 | if hasattr(parts[-1], 'rights'): 352 | for item2 
in parts[-1].rights: 353 | if item2.pos_ == "DET" or item2.pos_ == "NOUN": 354 | if item2.i not in visited: 355 | visited.add(item2.i) 356 | parts.extend(expand(item2, tokens, visited)) 357 | break 358 | 359 | return parts 360 | 361 | 362 | # convert a list of tokens to a string 363 | def to_str(tokens): 364 | 365 | return ' '.join([item.text for item in tokens]) 366 | 367 | 368 | # find verbs and their subjects / objects to create SVOs, detect passive/active sentences 369 | def findSVOs(tokens, sub_tags=False, obj_tags=False): 370 | global sub_ner_tags 371 | sub_ner_tags = sub_tags 372 | global obj_ner_tags 373 | obj_ner_tags = obj_tags 374 | svos = [] 375 | is_pas = _is_passive(tokens) 376 | verbs = [tok for tok in tokens if _is_non_aux_verb(tok)] 377 | visited = set() # recursion detection 378 | sub_ent_types = [] 379 | obj_ent_types = [] 380 | for v in verbs: 381 | subs, verbNegated = _get_all_subs(v) 382 | # hopefully there are subs, if not, don't examine this verb any longer 383 | if len(subs) > 0: 384 | isConjVerb, conjV = _right_of_verb_is_conj_verb(v) 385 | if isConjVerb: 386 | v2, objs = _get_all_objs(conjV, is_pas) 387 | for sub in subs: 388 | for obj in objs: 389 | objNegated = _is_negated(obj) 390 | if is_pas: # reverse object / subject for passive 391 | svos.append( 392 | (to_str(expand(obj, tokens, 393 | visited)), "!" + v.lemma_ 394 | if verbNegated or objNegated else v.lemma_, 395 | to_str(expand(sub, tokens, visited)))) 396 | sub_ent_types.append(sub.ent_type_) 397 | obj_ent_types.append(obj.ent_type_) 398 | svos.append( 399 | (to_str(expand(obj, tokens, 400 | visited)), "!" + v2.lemma_ 401 | if verbNegated or objNegated else v2.lemma_, 402 | to_str(expand(sub, tokens, visited)))) 403 | sub_ent_types.append(sub.ent_type_) 404 | obj_ent_types.append(obj.ent_type_) 405 | else: 406 | svos.append( 407 | (to_str(expand(sub, tokens, 408 | visited)), "!" + v.lower_ 409 | if verbNegated or objNegated else v.lower_, 410 | to_str(expand(obj, tokens, visited)))) 411 | sub_ent_types.append(sub.ent_type_) 412 | obj_ent_types.append(obj.ent_type_) 413 | svos.append( 414 | (to_str(expand(sub, tokens, 415 | visited)), "!" + v2.lower_ 416 | if verbNegated or objNegated else v2.lower_, 417 | to_str(expand(obj, tokens, visited)))) 418 | sub_ent_types.append(sub.ent_type_) 419 | obj_ent_types.append(obj.ent_type_) 420 | else: 421 | v, objs = _get_all_objs(v, is_pas) 422 | for sub in subs: 423 | for obj in objs: 424 | objNegated = _is_negated(obj) 425 | if is_pas: # reverse object / subject for passive 426 | svos.append( 427 | (to_str(expand(obj, tokens, 428 | visited)), "!" + v.lemma_ 429 | if verbNegated or objNegated else v.lemma_, 430 | to_str(expand(sub, tokens, visited)))) 431 | sub_ent_types.append(sub.ent_type_) 432 | obj_ent_types.append(obj.ent_type_) 433 | else: 434 | svos.append( 435 | (to_str(expand(sub, tokens, 436 | visited)), "!" 
+ v.lower_ 437 | if verbNegated or objNegated else v.lower_, 438 | to_str(expand(obj, tokens, visited)))) 439 | sub_ent_types.append(sub.ent_type_) 440 | obj_ent_types.append(obj.ent_type_) 441 | 442 | return (svos, sub_ent_types, obj_ent_types) 443 | -------------------------------------------------------------------------------- /nate/svonet/svo_degree_over_time.py: -------------------------------------------------------------------------------- 1 | from nate.svonet.graph_svo import generate_ticks, find_max_burst 2 | import networkx as nx 3 | import stop_words as sw 4 | import copy 5 | import pandas as pd 6 | import matplotlib as mpl 7 | import matplotlib.pyplot as plt 8 | import matplotlib.dates as mdates 9 | from matplotlib.ticker import MaxNLocator 10 | import numpy as np 11 | from multiprocessing import Process, Queue 12 | from os import cpu_count 13 | 14 | 15 | def get_degree_for_slice( 16 | q: Queue, 17 | G, 18 | edge_burst_dict, 19 | time_slice_start, 20 | time_slice_end, 21 | minimum_burst_level, 22 | stops, 23 | overlap_threshold, 24 | return_edge_overlaps, 25 | list_top, 26 | time_label): 27 | graphCopy = copy.deepcopy(G) 28 | 29 | for key in edge_burst_dict: 30 | burst_level = find_max_burst(edge_burst_dict[key], time_slice_start, time_slice_end) 31 | 32 | if burst_level > minimum_burst_level: 33 | for node in graphCopy.nodes(): 34 | for j in [0, -1]: 35 | for k in [0, -1]: 36 | if key[j] == node[k] and key[j] not in stops: 37 | overlap = len(set(key).intersection(set(node))) 38 | if overlap >= overlap_threshold: 39 | graphCopy.add_edge(key, node, overlap=overlap) 40 | 41 | graphCopy.remove_edges_from(nx.selfloop_edges(graphCopy)) 42 | 43 | 44 | degree_list = list(graphCopy.degree) 45 | 46 | degree_list.sort(key=lambda x: x[1], reverse=True) 47 | 48 | degree_list = degree_list[0:list_top] 49 | 50 | overlap_list = [] 51 | 52 | if return_edge_overlaps: 53 | 54 | for entry in degree_list[0:list_top]: 55 | overlap_sum = [] 56 | for edge in graphCopy.edges(entry[0]): 57 | overlap_sum.append(graphCopy.edges[edge]['overlap']) 58 | 59 | if len(overlap_sum) > 0: 60 | avg = round(sum(overlap_sum) / len(overlap_sum), 2) 61 | else: 62 | avg = 0 63 | 64 | overlap_list.append((entry[0], avg)) 65 | 66 | if return_edge_overlaps: 67 | q.put((time_label, time_slice_end, degree_list, overlap_list)) 68 | else: 69 | q.put((time_label, time_slice_end, degree_list)) 70 | 71 | 72 | class SVODegreeOverTimeMixin(): 73 | 74 | def __init__(self): 75 | self.offset_dict:dict 76 | self.edge_burst_dict:dict 77 | self.s: int 78 | self.gamma: int 79 | self.from_svo: bool 80 | self.lookup: dict 81 | 82 | 83 | def top_svo_degree( 84 | self, 85 | number_of_slices: int = 8, 86 | list_top: int = 10, 87 | minimum_burst_level: int = 0, 88 | return_edge_overlaps: bool = True, 89 | overlap_threshold: int = 1): 90 | """[summary] 91 | 92 | Args: 93 | number_of_slices (int, optional): [description]. Defaults to 20. 94 | list_top (int, optional): [description]. Defaults to 10. 95 | minimum_burst_level (int, optional): [description]. Defaults to 0. 96 | return_edge_overlaps (bool, optional): [description]. Defaults to True. 97 | overlap_threshold (int, optional): [description]. Defaults to 1. 
98 | 99 | Raises: 100 | Exception: [description] 101 | 102 | Returns: 103 | [type]: [description] 104 | """ 105 | 106 | if overlap_threshold > 2 or overlap_threshold < 1: 107 | raise Exception("Overlap Filter must be 1 or 2.") 108 | 109 | stops = sw.get_stop_words("english") 110 | 111 | # Create list of time slices: 112 | 113 | offset_set = set() 114 | 115 | for key in self.offset_dict: 116 | for offset in self.offset_dict[key]: 117 | offset_set.add(offset) 118 | 119 | time_slices, time_labels = generate_ticks(offset_set, number_of_ticks=(number_of_slices)) 120 | 121 | # Create network consisting of all Subjects and Objects: 122 | 123 | G = nx.Graph() 124 | 125 | for entry in self.edge_burst_dict: 126 | G.add_node(entry) 127 | 128 | if list_top == None: 129 | list_top = len(self.edge_burst_dict) 130 | 131 | # Iterate over time slices 132 | 133 | q = Queue() 134 | 135 | processes = [] 136 | 137 | for i in range(1, len(time_slices)): 138 | 139 | time_slice_start = time_slices[i-1] 140 | time_slice_end = time_slices[i] 141 | time_label = time_labels[i] 142 | 143 | t = Process( 144 | target = get_degree_for_slice, 145 | args= ( 146 | q, 147 | G, 148 | self.edge_burst_dict, 149 | time_slice_start, 150 | time_slice_end, 151 | minimum_burst_level, 152 | stops, 153 | overlap_threshold, 154 | return_edge_overlaps, 155 | list_top, 156 | time_label 157 | ) 158 | ) 159 | 160 | processes.append(t) 161 | t.start() 162 | 163 | result_list = [] 164 | 165 | for i in range(1, len(time_slices)): 166 | result_list.append(q.get()) 167 | 168 | 169 | top_degree_by_slice = {} 170 | edge_overlap = {} 171 | 172 | result_list = sorted(result_list, key = lambda x: x[1]) 173 | 174 | for result in result_list: 175 | time_label = result[0] 176 | degree_list = result[2] 177 | top_degree_by_slice[time_label] = degree_list 178 | if return_edge_overlaps: 179 | edge_overlap[time_label] = result[3] 180 | 181 | if return_edge_overlaps: 182 | return top_degree_by_slice, edge_overlap 183 | else: 184 | return top_degree_by_slice 185 | 186 | def specific_svo_degree(self, 187 | tokens: list, 188 | number_of_slices: int = 15, 189 | minimum_burst_level: int = 0, 190 | overlap_threshold: int = 1): 191 | """[summary] 192 | 193 | Args: 194 | tokens (list): [description] 195 | number_of_slices (int, optional): [description]. Defaults to 20. 196 | minimum_burst_level (int, optional): [description]. Defaults to 0. 197 | overlap_threshold (int, optional): [description]. Defaults to 1. 198 | 199 | Returns: 200 | [type]: [description] 201 | """ 202 | 203 | if isinstance(tokens, list) == False: 204 | tokens = [tokens] 205 | 206 | full_lists = self.top_svo_degree(number_of_slices=number_of_slices, 207 | list_top=None, 208 | minimum_burst_level=minimum_burst_level, 209 | return_edge_overlaps=False, 210 | overlap_threshold=overlap_threshold, 211 | ) 212 | 213 | 214 | token_rank_dict = {} 215 | 216 | for day in full_lists: 217 | v = [item for item in full_lists[day] if item[0] in tokens] 218 | token_rank_dict[day] = v 219 | 220 | return token_rank_dict 221 | 222 | def plot_top_svo_degree( 223 | self, 224 | number_of_slices: int = 8, 225 | list_top: int = 10, 226 | minimum_burst_level: int = 0, 227 | overlap_threshold: int = 1, 228 | filename: str = False,): 229 | """[summary] 230 | 231 | Args: 232 | number_of_slices (int, optional): [description]. Defaults to 20. 233 | list_top (int, optional): [description]. Defaults to 10. 234 | minimum_burst_level (int, optional): [description]. Defaults to 0. 
235 | overlap_threshold (int, optional): [description]. Defaults to 1. 236 | """ 237 | 238 | data = self.top_svo_degree( 239 | number_of_slices = number_of_slices, 240 | list_top = list_top, 241 | minimum_burst_level = minimum_burst_level, 242 | return_edge_overlaps = False, 243 | overlap_threshold=overlap_threshold,) 244 | 245 | date_names = [] 246 | time_slices = [] 247 | 248 | for k, v in data.items(): 249 | date_names.append(k) 250 | time_slices.append(v) 251 | 252 | for i in range(1, len(date_names)): 253 | 254 | x = np.arange(list_top) 255 | values = [] 256 | names = [] 257 | 258 | for top_degrees in time_slices[i]: 259 | values.append(top_degrees[1]) 260 | names.append(top_degrees[0]) 261 | 262 | values.reverse() 263 | names.reverse() 264 | 265 | fig, ax = plt.subplots() 266 | fig.set_figwidth(6) 267 | fig.set_figheight(10) 268 | fig.suptitle('{} to {}'.format(date_names[i-1], date_names[i]), fontsize=12, ha="center") 269 | ax.xaxis.set_major_locator(MaxNLocator(integer=True)) 270 | plt.barh(x, values, color='#32363A') 271 | plt.yticks(x, names) 272 | if filename: 273 | plt.savefig(str(filename) + str(i) + ".pdf") 274 | else: 275 | plt.show() 276 | 277 | def plot_specific_svo_degree(self, 278 | tokens: list, 279 | number_of_slices: int = 15, 280 | minimum_burst_level: int = 0, 281 | overlap_threshold: int = 1, 282 | plot_type="line", 283 | filename: str = False,): 284 | 285 | if isinstance(tokens, list) == False: 286 | tokens = [tokens] 287 | 288 | if plot_type != "line" and plot_type != "bar": 289 | raise Exception("`plot_type` must be one of 'line' or 'bar'") 290 | 291 | data = self.specific_svo_degree(tokens=tokens, 292 | number_of_slices=number_of_slices, 293 | minimum_burst_level=minimum_burst_level, 294 | overlap_threshold=overlap_threshold, 295 | ) 296 | 297 | inverted_dict = {} 298 | 299 | for token in tokens: 300 | full_list = [] 301 | 302 | for date, degree_list in data.items(): 303 | degree = [item[1] for item in degree_list if item[0] == token] 304 | full_list.append((date, degree[0])) 305 | 306 | inverted_dict[token] = full_list 307 | 308 | x = np.arange(number_of_slices) 309 | 310 | for k, v in inverted_dict.items(): 311 | 312 | values = [item[1] for item in v] 313 | dates = [item[0].replace(", ", "\n") for item in v] 314 | 315 | fig, ax = plt.subplots() 316 | fig.set_figwidth(10) 317 | fig.set_figheight(6) 318 | fig.suptitle("'{}'".format(k), fontsize=12, ha="center") 319 | ax.yaxis.set_major_locator(MaxNLocator(integer=True)) 320 | if plot_type == "bar": 321 | plt.bar(x, values, color='#32363A') 322 | elif plot_type == "line": 323 | plt.plot(x, values, color='#32363A') 324 | plt.xticks(x, dates) 325 | if filename: 326 | plt.savefig(str(filename) + str(k) + ".pdf") 327 | else: 328 | plt.show() -------------------------------------------------------------------------------- /nate/svonet/svo_offsets.py: -------------------------------------------------------------------------------- 1 | """Generates the offset dictionary for the SVO pipeline.""" 2 | from time import time as marktime 3 | from typing import List 4 | from itertools import groupby 5 | from collections import defaultdict 6 | 7 | 8 | def generate_svo_offsets(svo_list: List, time: List, minimum_offsets): 9 | """Creates offset dictionary and int-to-string lookup for SVO format.""" 10 | print("Generating Offsets:") 11 | 12 | start = marktime() 13 | 14 | svo_dict = defaultdict(list) 15 | for i, svo in enumerate(svo_list): 16 | svo_dict[svo].append(time[i]) 17 | 18 | svo_int_dict, lookup = text_to_int(svo_dict) 19 
| 20 | # prune SVOs, excluding those with fewer occurrences than specified by minimum_offsets 21 | offsets = { 22 | k: v for k, v in svo_int_dict.items() if len(v) >= minimum_offsets 23 | } 24 | 25 | print("Finished offset generation in {} seconds".format( 26 | round(marktime() - start))) 27 | print("Commencing timestamp deduplication...") 28 | 29 | # increment simultaneous occurrences by 1 millisecond to satisfy Kleinberg requirements 30 | for item in offsets.keys(): 31 | offsets[item].sort() 32 | offsets[item] = [ 33 | g + i * 0.001 34 | for k, group in groupby(offsets[item]) 35 | for i, g in enumerate(group) 36 | ] 37 | 38 | print("finished timestamp deduplication in {} seconds".format( 39 | round(marktime() - start))) 40 | 41 | print("Finished Generating Offsets. Returning offset dictionary.") 42 | 43 | return offsets, lookup 44 | 45 | 46 | def text_to_int(svo_dict): 47 | """Converts SVO terms to integers, and generates a lookup dictionary.""" 48 | svo_int_dict = defaultdict(list) 49 | lookup_dict = defaultdict(tuple) 50 | i = 0 51 | for k, v in svo_dict.items(): 52 | svo_int_dict[i] = v 53 | lookup_dict[i] = k 54 | i = i + 1 55 | 56 | return svo_int_dict, lookup_dict 57 | -------------------------------------------------------------------------------- /nate/svonet/svoburst_class.py: -------------------------------------------------------------------------------- 1 | from nate.edgeburst.burst_class import Bursts 2 | from nate.svonet.degree_over_time import DegreeOverTimeMixIn 3 | from nate.svonet.svo_degree_over_time import SVODegreeOverTimeMixin 4 | 5 | 6 | class SVOburst(Bursts, DegreeOverTimeMixIn, SVODegreeOverTimeMixin): 7 | """ 8 | Creates an SVOburst class object containing data about SVO terms that burst over time. 9 | 10 | Attributes: 11 | offset_dict (Dict): A dictionary with terms as keys, and a list 12 | of offsets (occurrences) as values. 13 | edge_burst_dict (Dict): A dictionary with terms as keys and nested 14 | burst data as values. 15 | s (float): s parameter for tuning Kleinberg algorithm. Higher values 16 | make it more difficult for bursts to move up the burst hierarchy. 17 | gamma (float): gamma parameter for tuning Kleinberg algorithm. Higher 18 | values make it more difficult for activity to be considered a 19 | burst. 20 | from_svo (bool): A flag to alert other functions to the SVO pipeline. 21 | lookup (dict): A dictionary with integers as keys and the SVO terms 22 | they represent as values. 23 | """ 24 | 25 | def __init__(self, offset_dict, edge_burst_dict, s, gamma, from_svo, 26 | lookup): 27 | 28 | self.offset_dict: dict = offset_dict 29 | self.edge_burst_dict: dict = edge_burst_dict 30 | self.s = s 31 | self.gamma = gamma 32 | self.from_svo = from_svo 33 | self.bdf = None 34 | self.odf = None 35 | self.lookup = lookup 36 | 37 | def animate(self, pos = False, offscreen = True, time_interval = False, new_burst_halo = True, dpi = 300): 38 | """Creates an animation of the network of SVO bursts over time. 39 | 40 | The function will either create an onscreen animation window, or 41 | dump each frame to disk. The function requires graph-tool to be 42 | installed and able to be imported. 43 | 44 | Args: 45 | pos (object, optional): A graph-tool `pos` vertex 46 | property map to specify layout. If passed, the map will be 47 | used to create the graph layout. Otherwise, one will be 48 | generated. Defaults to False. 49 | offscreen (Bool, optional): Whether to generate the animation 50 | offscreen. 
If True, the frames will be dumped to disk in 51 | the directory `./data/frames`. if False, the animation will 52 | be shown onscreen. Defaults to True. 53 | time_interval (int, optional): Specifes a custom time step 54 | interval in seconds. Defaults to 86400 (one day). 55 | new_burst_halo (): not used in animate_graph 56 | dpi (int): not used in animate_graph 57 | """ 58 | # check if graph-tool and other requirements are able to be imported 59 | try: 60 | from nate.svonet.svo_burst_animate import prepare_df, build_graph, animate_graph 61 | 62 | df = prepare_df(self.edge_burst_dict, self.offset_dict) 63 | graph = build_graph(df, pos, time_interval) 64 | animate_graph(graph, pos, offscreen, new_burst_halo, dpi) 65 | 66 | except ImportError: 67 | print("Graph-tool does not appear to be installed or importable") 68 | -------------------------------------------------------------------------------- /nate/svonet/svonet_class.py: -------------------------------------------------------------------------------- 1 | """Definition of the `SVOnet` class, for subject-verb-object analysis. 2 | 3 | This module defines the `SVOnet` class, [description of SVO pipeline]. 4 | """ 5 | 6 | from nate.svonet.svo import findSVOs 7 | import pandas as pd 8 | from nate.utils.mp_helpers import mp 9 | from nate.utils.text_helpers import is_ascii 10 | from typing import List, Dict 11 | from nate.svonet.svo_offsets import generate_svo_offsets 12 | from nate.edgeburst.burst_mixin import BurstMixin 13 | from nate.svonet.degree_over_time import DegreeOverTimeMixIn 14 | from nate.svonet.svoburst_class import SVOburst 15 | 16 | 17 | def process_svo(sub_tags, obj_tags, doc): 18 | """Detects SVOs in a document after spaCy has processed it. 19 | 20 | Custom pipeline component for spaCY. 21 | 22 | TODO: move this to utils, where it is used. 23 | """ 24 | sentences = [x.string.strip() for x in doc.sents] # list of raw sentences in the document 25 | svo_items = [findSVOs(x, sub_tags, obj_tags) for x in doc.sents] # detect SVOs sentence-by-sentence in the document 26 | 27 | return (sentences, svo_items) 28 | 29 | 30 | class SVOnet(BurstMixin): 31 | """Provides data cleanup, export functions, and burst detection. 32 | 33 | Attributes: 34 | doc_ids (List): A list of document ids, determining which document 35 | the SVO at index i came from. 36 | sent_ids (List): A list of sentence ids, determining which sentence 37 | the SVO at index i came from. 38 | sentences (List): The sentence that the SVO was pulled from. 39 | svo_items (List): The entire SVO item. 40 | times (List): The time that the SVO's source document was written. 41 | subjects (List): The SVO at index i's subject. 42 | verbs (List): The SVO at index i's verb. 43 | objects (List): The SVO at index i's object 44 | sub_ent_types (List): The SVO at index i's subject entity type. 45 | obj_ent_types (List): The SVO at index i's object entity type. 
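        Example (illustrative sketch only, not a fixed API; assumes a spaCy
        pipeline `nlp` that sets sentence boundaries, parallel lists `texts`
        and `timestamps`, and `sub_tags` / `obj_tags` that are either False
        or lists of spaCy entity labels such as ["PERSON", "ORG"]):

            results = [process_svo(sub_tags, obj_tags, doc) for doc in nlp.pipe(texts)]
            sentences = [sents for sents, _ in results]
            svo_items = [items for _, items in results]
            svonet = SVOnet(sentences, svo_items, timestamps)
            svo_bursts = svonet.svo_to_burst(minimum_offsets=20, s=2, gamma=1)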
46 | """ 47 | 48 | def __init__(self, sentences, svo_items, timestamps): 49 | 50 | self.doc_ids = [] 51 | self.sent_ids = [] 52 | self.sentences = [] 53 | self.svo_items = [] 54 | if timestamps: 55 | self.times = [] 56 | self.subjects = [] 57 | self.verbs = [] 58 | self.objects = [] 59 | self.sub_ent_types = [] 60 | self.obj_ent_types = [] 61 | 62 | # this somewhat obtuse code chunk flattens the heavily nested data format returned by the `svo` module 63 | for i, doc in enumerate(sentences): 64 | for j, sent in enumerate(doc): 65 | if len(svo_items[i][j][0]) > 0: 66 | for k, svo_item in enumerate(svo_items[i][j][0]): 67 | if is_ascii(svo_item[0]) and is_ascii( 68 | svo_item[1]) and is_ascii(svo_item[2]): 69 | svo_item = (svo_item[0].lower(), 70 | svo_item[1].lower(), 71 | svo_item[2].lower()) 72 | self.doc_ids.append(i) 73 | self.sent_ids.append(j) 74 | self.sentences.append(sent) 75 | if timestamps: 76 | self.times.append(timestamps[i]) 77 | self.svo_items.append(svo_item) 78 | self.subjects.append(svo_item[0]) 79 | self.verbs.append(svo_item[1]) 80 | self.objects.append(svo_item[2]) 81 | self.sub_ent_types.append(svo_items[i][j][1][k]) 82 | self.obj_ent_types.append(svo_items[i][j][2][k]) 83 | 84 | self.from_svo = True 85 | 86 | def svo_to_df(self, tidy=True): 87 | """Outputs a pandas dataframe with all SVOs and their timestamps. 88 | 89 | If tidy is set to True, each SVO will have its own line in the dataframe. 90 | If tidy is set to False, identical SVOs will be grouped and their 91 | document ids, timestamps, and datetimes will be aggregated into lists 92 | in the dataframe. 93 | 94 | Args: 95 | tidy (Bool, optional): Whether to output a tidy or non-tidy 96 | dataframe, the differences between which are documented above. 97 | Defaults to True. 98 | 99 | Returns: 100 | pandas.Dataframe: A dataframe containing data for all detected SVOs, 101 | including their associated timestamps (if present). 102 | 103 | The outputted dataframe will have the following columns: 104 | - 'doc_ids' (int) : A list of document ids, determining which 105 | document the SVO at index i came from. 106 | - 'sent_ids' (int): A list of sentence ids, determining which 107 | sentence 108 | the SVO at index i came from. 109 | - 'sentences' (string): The sentence that the SVO was pulled from. 110 | - 'svo' (Tuple): The entire SVO item. 111 | - 'times' (datetime): The time that the SVO's source document 112 | was written. 113 | - 'subjects' (string): The SVO at index i's subject. 114 | - 'verbs' (string): The SVO at index i's verb. 115 | - 'objects' (string): The SVO at index i's object 116 | - 'sub_ent_types' (string): The SVO at index i's subject entity 117 | type. 118 | - 'obj_ent_types' (string): The SVO at index i's object entity 119 | type. 
120 | """ 121 | df = pd.DataFrame() 122 | 123 | df['doc_id'], df['sent_id'], df['sentence'], df['svo'] =\ 124 | self.doc_ids, self.sent_ids, self.sentences, self.svo_items 125 | if self.times: 126 | df['timestamp'] = self.times 127 | df['subject'], df['sub_type'], df['verb'], df['object'], df[ 128 | 'obj_type'] = self.subjects, self.sub_ent_types, self.verbs, self.objects, self.obj_ent_types 129 | if self.times: 130 | df['datetime'] = pd.to_datetime(df['timestamp'], unit='s') 131 | 132 | if tidy == False and self.times: 133 | df = df.groupby('svo')['doc_id', 'timestamp', 'datetime'].agg(list) 134 | elif tidy == False: 135 | df = df.groupby('svo')['doc_id'].agg(list) 136 | 137 | return df 138 | 139 | def svo_to_burst(self, minimum_offsets=20, s=2, gamma=1) -> SVOburst: 140 | """Initiates burst detection on data contained in the SVOnet class. 141 | 142 | This function requires that the object was instantiates with a list 143 | of times. 144 | 145 | Args: 146 | minimum_offsets (int, optional): The minimum number of occurences 147 | of an SVO in the dataset for it to be included in the bursts 148 | calculation. Lower values include more of the dataset, at the 149 | cost of longer processing time. Defaults to 20. 150 | s (float, optional): s parameter for tuning Kleinberg algorithm. 151 | Higher values make it more difficult for bursts to move up the 152 | burst hierarchy. Defaults to 2. 153 | gamma (float, optional): gamma parameter for tuning Kleinberg 154 | algorithm. Higher values make it more difficult for activity to 155 | be considered a burst. Defaults to 1. 156 | 157 | Returns: 158 | SVOburst: An SVOburst object for exporting, visualizing, and otherwise 159 | manipulating burst data for the data contained in this class. 160 | """ 161 | if not self.times: 162 | print("Burst detection requires timestamps") 163 | return None 164 | 165 | # send offset_dict and lookup dictionary to svo_offset generating function 166 | self.offset_dict, self.lookup = generate_svo_offsets( 167 | self.svo_items, self.times, minimum_offsets) 168 | 169 | offset_dict_strings, edge_burst_dict_strings, s, gamma, from_svo, lookup = self.burst_detection( 170 | s, gamma) 171 | 172 | return SVOburst(offset_dict=offset_dict_strings, 173 | edge_burst_dict=edge_burst_dict_strings, 174 | s=s, 175 | gamma=gamma, 176 | from_svo=from_svo, 177 | lookup=lookup) 178 | -------------------------------------------------------------------------------- /nate/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UWNETLAB/Nate/9029670d5804f478cac2e83d77ff86ff2a7266c2/nate/utils/__init__.py -------------------------------------------------------------------------------- /nate/utils/mp_helpers.py: -------------------------------------------------------------------------------- 1 | """Utilities for multiprocessing.""" 2 | from joblib import Parallel, delayed, cpu_count 3 | from itertools import chain 4 | from spacy.util import minibatch 5 | from functools import partial 6 | from typing import Union, List, Dict 7 | 8 | 9 | def mp(items, function, *args) -> Union[List, Dict]: 10 | """Applies a function to a list or dict of items, using multiprocessing. 11 | 12 | This is a convenience function for generalized multiprocessing of any 13 | function that deals with a list or dictionary of items. The functions 14 | passed to `mp` must accept the list of items to be processed at the end 15 | of their function call, with optional arguments first. 
*args can be any 16 | number of optional arguments accepted by the function that will be 17 | multiprocessed. On Windows, functions must be defined outside of the 18 | current python file and imported, to avoid infinite recursion. 19 | """ 20 | if cpu_count() >= 10: #to avoid overtaxing Brad, save some cores 21 | cpu = 10 22 | else: 23 | cpu = cpu_count() 24 | 25 | batch_size = round(len(items) / cpu) 26 | partitions = minibatch(items, size=batch_size) 27 | executor = Parallel(n_jobs=cpu, 28 | backend="multiprocessing", 29 | prefer="processes") 30 | do = delayed(partial(function, *args)) 31 | tasks = (do(batch) for batch in partitions) 32 | temp = executor(tasks) 33 | 34 | # todo: add error catch/message for zero results 35 | 36 | if isinstance(temp[0], dict): 37 | results = {} 38 | for batch in temp: 39 | for key, value in batch.items(): 40 | results.setdefault(key, []).extend(value) 41 | elif isinstance(temp[0], (list, tuple)): 42 | results = list(chain(*temp)) 43 | 44 | return results 45 | 46 | 47 | def mp2(items, function, *args): 48 | """Applies a function to a list, returning two lists of results. 49 | 50 | This is the same as `mp` but used when two lists of results need to be 51 | returned. Will perhaps be generalized for any number of results in the 52 | future. Does not currently work for dictionaries. 53 | """ 54 | if cpu_count() >= 10: #to avoid overtaxing Brad, save some cores 55 | cpu = 10 56 | else: 57 | cpu = cpu_count() 58 | 59 | batch_size = round(len(items) / cpu) 60 | partitions = minibatch(items, size=batch_size) 61 | executor = Parallel(n_jobs=cpu, 62 | backend="multiprocessing", 63 | prefer="processes") 64 | do = delayed(partial(function, *args)) 65 | tasks = (do(batch) for batch in partitions) 66 | temp = executor(tasks) 67 | results1, results2 = zip(*temp) 68 | results1 = list(chain(*results1)) 69 | results2 = list(chain(*results2)) 70 | return results1, results2 71 | -------------------------------------------------------------------------------- /nate/utils/network_helpers.py: -------------------------------------------------------------------------------- 1 | # CREDIT CHAIN OF DEVS FOR THIS... INCLUDING MALCOLM... 2 | ''' 3 | This module implements the disparity filter to compute a significance score of edge weights in networks. 4 | Forked from: https://github.com/aekpalakorn/python-backbone-network/blob/master/backbone.py 5 | With the following changes: 6 | - formatted to pylint standards 7 | - architected as a module with no code that runs on load 8 | - broke large functions into smaller ones 9 | - copy all nodes so that completely disconnected nodes aren't removed and so that node attributes are not removed 10 | - copy all the original edge attributes so that they are not removed 11 | - bug fix: changed G.in_degree(G.successors(u)[0]) to G.in_degree(list(G.successors(u))[0]) 12 | ''' 13 | 14 | import networkx as nx 15 | import numpy as np 16 | from scipy import integrate 17 | 18 | 19 | def get_graph_backbone(G, alpha_t=0.8): 20 | '''Gets the backbone of a given graph `G`.''' 21 | G_disp = compute_disparity_filter(G) 22 | G_backbone = apply_disparity_filter(G_disp, alpha_t, cut_mode='or') 23 | return G_backbone 24 | 25 | 26 | def compute_disparity_filter(G, weight='weight'): 27 | ''' Compute significance scores (alpha) for weighted edges in G as defined in Serrano et al. 2009 28 | Args 29 | G: Weighted NetworkX graph 30 | Returns 31 | Weighted graph with a significance score (alpha) assigned to each edge 32 | References 33 | M. A. Serrano et al. 
(2009) Extracting the Multiscale backbone of complex weighted networks. PNAS, 106:16, pp. 6483-6488. 34 | ''' 35 | return compute_disparity_filter_directed(G, weight) \ 36 | if nx.is_directed(G) else \ 37 | compute_disparity_filter_undirected(G, weight) 38 | 39 | 40 | def compute_disparity_filter_directed(G, weight='weight'): 41 | '''See docstring for `compute_disparity_filter`.''' 42 | N = nx.DiGraph() 43 | N.add_nodes_from(G.nodes(data=True)) 44 | for u in G: 45 | 46 | k_out = G.out_degree(u) 47 | k_in = G.in_degree(u) 48 | 49 | if k_out > 1: 50 | sum_w_out = sum( 51 | np.absolute(G[u][v][weight]) for v in G.successors(u)) 52 | for v in G.successors(u): 53 | w = G[u][v][weight] 54 | p_ij_out = float(np.absolute(w)) / sum_w_out 55 | alpha_ij_out = 1 - (k_out - 1) * integrate.quad( 56 | lambda x: (1 - x)**(k_out - 2), 0, p_ij_out)[0] # pylint: disable=cell-var-from-loop 57 | N.add_edge(u, v, alpha_out=float('%.4f' % alpha_ij_out)) 58 | N[u][v].update(G[u][v]) 59 | 60 | elif k_out == 1 and G.in_degree(list(G.successors(u))[0]) == 1: 61 | #we need to keep the connection as it is the only way to maintain the connectivity of the network 62 | v = list(G.successors(u))[0] 63 | N.add_edge(u, v, alpha_out=0., alpha_in=0.) 64 | N[u][v].update(G[u][v]) 65 | #there is no need to do the same for the k_in, since the link is built already from the tail 66 | 67 | if k_in > 1: 68 | sum_w_in = sum( 69 | np.absolute(G[v][u][weight]) for v in G.predecessors(u)) 70 | for v in G.predecessors(u): 71 | w = G[v][u][weight] 72 | p_ij_in = float(np.absolute(w)) / sum_w_in 73 | alpha_ij_in = 1 - (k_in - 1) * integrate.quad( 74 | lambda x: (1 - x)**(k_in - 2), 0, p_ij_in)[0] # pylint: disable=cell-var-from-loop 75 | N.add_edge(v, u, alpha_in=float('%.4f' % alpha_ij_in)) 76 | N[v][u].update(G[v][u]) 77 | return N 78 | 79 | 80 | def compute_disparity_filter_undirected(G, weight='weight'): 81 | '''See docstring for `compute_disparity_filter`.''' 82 | B = nx.Graph() 83 | B.add_nodes_from(G.nodes(data=True)) 84 | for u in G: 85 | k = len(G[u]) 86 | if k > 1: 87 | sum_w = sum(np.absolute(G[u][v][weight]) for v in G[u]) 88 | for v in G[u]: 89 | w = G[u][v][weight] 90 | p_ij = float(np.absolute(w)) / sum_w 91 | alpha_ij = 1 - (k - 1) * integrate.quad( 92 | lambda x: (1 - x)**(k - 2), 0, p_ij)[0] # pylint: disable=cell-var-from-loop 93 | B.add_edge(u, v, alpha=float('%.4f' % alpha_ij)) 94 | B[u][v].update(G[u][v]) 95 | return B 96 | 97 | 98 | def apply_disparity_filter(G, alpha_t=0.8, cut_mode='or'): 99 | ''' Performs a cut of the graph previously filtered through the disparity_filter function. 100 | Args 101 | ---- 102 | G: Weighted NetworkX graph 103 | alpha_t: double (default='0.4') 104 | The threshold for the alpha parameter that is used to select the surviving edges. 105 | It has to be a number between 0 and 1. 106 | cut_mode: string (default='or') 107 | Possible strings: 'or', 'and'. 108 | It applies only to directed graphs. It represents the logic operation to filter out edges 109 | that do not pass the threshold value, combining the alpha_in and alpha_out attributes 110 | resulting from the disparity_filter function. 111 | Returns 112 | ------- 113 | B: Weighted NetworkX graph 114 | The resulting graph contains only edges that survived from the filtering with the alpha_t threshold 115 | References 116 | --------- 117 | .. M. A. Serrano et al. (2009) Extracting the Multiscale backbone of complex weighted networks. PNAS, 106:16, pp. 6483-6488. 
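    Example
    -------
    An illustrative sketch (any weighted NetworkX graph whose edge weights are
    stored under the 'weight' attribute will do):

        G_alpha = compute_disparity_filter(G)
        backbone = apply_disparity_filter(G_alpha, alpha_t=0.8, cut_mode='or')
        # equivalently, using the wrapper defined at the top of this module:
        backbone = get_graph_backbone(G, alpha_t=0.8)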
118 | ''' 119 | return apply_disparity_filter_directed(G, alpha_t, cut_mode) \ 120 | if nx.is_directed(G) else \ 121 | apply_disparity_filter_undirected(G, alpha_t) 122 | 123 | 124 | def apply_disparity_filter_directed(G, alpha_t=0.8, cut_mode='or'): 125 | '''See the docstring for the `apply_disparity_filter` function.''' 126 | B = nx.DiGraph() 127 | B.add_nodes_from(G.nodes(data=True)) 128 | for u, v, w in G.edges(data=True): 129 | try: 130 | alpha_in = w['alpha_in'] 131 | except KeyError: #there is no alpha_in, so we assign 1. It will never pass the cut 132 | alpha_in = 1 133 | try: 134 | alpha_out = w['alpha_out'] 135 | except KeyError: #there is no alpha_out, so we assign 1. It will never pass the cut 136 | alpha_out = 1 137 | 138 | if cut_mode == 'or': 139 | if alpha_in < alpha_t or alpha_out < alpha_t: 140 | B.add_edge(u, v) 141 | B[u][v].update(G[u][v]) 142 | elif cut_mode == 'and': 143 | if alpha_in < alpha_t and alpha_out < alpha_t: 144 | B.add_edge(u, v) 145 | B[u][v].update(G[u][v]) 146 | return B 147 | 148 | 149 | def apply_disparity_filter_undirected(G, alpha_t=0.8): 150 | '''See the docstring for the `apply_disparity_filter` function.''' 151 | B = nx.Graph() 152 | B.add_nodes_from(G.nodes(data=True)) 153 | for u, v, w in G.edges(data=True): 154 | 155 | try: 156 | alpha = w['alpha'] 157 | except KeyError: #there is no alpha, so we assign 1. It will never pass the cut 158 | alpha = 1 159 | 160 | if alpha < alpha_t: 161 | B.add_edge(u, v) 162 | B[u][v].update(G[u][v]) 163 | return B 164 | -------------------------------------------------------------------------------- /nate/utils/nlp_helpers.py: -------------------------------------------------------------------------------- 1 | """Utilities for NLP, mainly using spaCy.""" 2 | import spacy 3 | from spacy.pipeline import merge_entities 4 | from .mp_helpers import mp 5 | from tok import sent_tokenize 6 | from gensim.models.phrases import Phrases, Phraser 7 | from itertools import chain 8 | from ..svonet.svonet_class import process_svo 9 | 10 | # Everything from this point down was moved from the `text_helpers` module 11 | 12 | 13 | def spacy_process(nlp, joined, sub_tags, obj_tags, texts): 14 | """Processes texts in spaCy. 15 | 16 | Primary point of access to spaCy. Requires the NLP model object to be 17 | passed, as well as the texts to be processed. Setting joined to True 18 | will combine tokens into strings, separated by white space. If the 19 | svo_component is detected, will also accept subject tags and object 20 | tags to be passed to `process_svo` 21 | """ 22 | if 'svo_component' in nlp.pipe_names: 23 | processed_list = [ 24 | doc for doc in nlp.pipe(texts, 25 | component_cfg={ 26 | 'svo_component': { 27 | 'sub_tags': sub_tags, 28 | 'obj_tags': obj_tags 29 | } 30 | }) 31 | ] 32 | elif joined == True: 33 | processed_list = [' '.join(doc) for doc in nlp.pipe(texts)] 34 | else: 35 | processed_list = [doc for doc in nlp.pipe(texts)] 36 | return processed_list 37 | 38 | 39 | def default_filter_lemma(doc): # to do: make this user-configurable 40 | """Filters spaCy pipeline. 41 | 42 | This is the default filter to be used in the spaCy pipeline for tasks 43 | that don't involve SVO. 
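    Example (illustrative sketch of one way to wire this component into a
    spaCy 2.x pipeline; the exact wiring used elsewhere in the package may
    differ):

        nlp = spacy.load("en_core_web_sm")
        nlp.add_pipe(merge_entities)
        nlp.add_pipe(default_filter_lemma, name="filter_lemmas", last=True)
        filtered = [tokens for tokens in nlp.pipe(texts)]
        # each item is a list of kept bigram tokens and lowercased lemmas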
44 | """ 45 | proc = [] 46 | for token in doc: 47 | if '_' in token.text and len(token) > 2 and token.is_ascii: 48 | proc.append(token.text) 49 | if token.is_alpha and len(token) >2 and token.is_stop is False and token.is_ascii: 50 | proc.append(token.lemma_.lower()) 51 | 52 | return proc 53 | 54 | 55 | def custom_spacy_component(doc): 56 | """ 57 | Placeholder/example for a custom spaCy pipeline component 58 | """ 59 | return [ 60 | token.lemma_.lower() 61 | for token in doc 62 | if token.is_stop == False and token.is_ascii 63 | ] 64 | 65 | 66 | def svo_component(doc, sub_tags, obj_tags): 67 | """Processes text in the SVO pipeline. 68 | 69 | TODO: Why does this function only wrap around process_svo? Consider 70 | moving wrapped function here. 71 | """ 72 | doc = process_svo(sub_tags, obj_tags, doc) 73 | return doc 74 | 75 | 76 | def bigram_process(texts, trigrams, bigram_threshold, tokenized=True): 77 | """Uses gensim to detect bigrams and trigrams. 78 | 79 | Expects a list of texts. See gensim documentation for explanations 80 | of parameters: https://radimrehurek.com/gensim/models/phrases.html 81 | """ 82 | sentences = [sent_tokenize(text) for text in texts] # gensim needs documents to come in as a list of sentences 83 | all_sentences = list(chain(*sentences)) # flatten list of sentences for training purposes 84 | model = Phrases(all_sentences, min_count=1, threshold=bigram_threshold, scoring='npmi') # train the model 85 | bigrammer = Phraser(model) # create more efficient applicator of trained model 86 | bigrammed_list = [[bigrammer[sent] for sent in doc] for doc in sentences] # apply the model to the original texts 87 | if trigrams == True: # gensim detects trigrams by stacking bigram detection on text with detected bigrams 88 | trigram_model = Phrases(bigrammer[all_sentences], min_count=1, threshold=bigram_threshold, scoring='npmi') 89 | trigrammer = Phraser(trigram_model) 90 | bigrammed_list = [[trigrammer[bigrammer[sent]] for sent in doc] for doc in sentences] 91 | bigrammed_list = [list(chain(*x)) for x in bigrammed_list] 92 | # option to return text in original form, but with underscores between bigrams 93 | if tokenized == False: 94 | bigrammed_list = [' '.join(doc) for doc in bigrammed_list] 95 | 96 | return bigrammed_list 97 | -------------------------------------------------------------------------------- /nate/utils/text_helpers.py: -------------------------------------------------------------------------------- 1 | """Utilities for manipulation of plain text.""" 2 | 3 | import pandas as pd 4 | import re 5 | 6 | 7 | def window_text(string_of_text, window_lr=3): 8 | """Creates a list of windowed strings. 9 | 10 | This function splits a string into tokens on each space. Then it iterates 11 | over each token and takes add n words to a new list where n = the number of 12 | ``window_lr`` * 2 + 1. This is because ``window_lr'' is the number of 13 | words to grab to the left AND to the right of each token in the string. 14 | If ``window_lr'' = 2, then it will take the token itself, 2 words to the 15 | left of the token, and 2 words to the right of a token. The result is a 16 | window of 5 words. As a result of this design decision, the smallest window 17 | possible is 3 words, which can be given by ``window_lr'' = 1. Finally, the 18 | windows at the start and end of a text string will be smaller than the rest 19 | because they will have fewer words at the start (nothing / less to the left) 20 | and at the end (nothing / less to the right). 
This function is designed to 21 | take in a string. If the string is pre-processed (which it should be), make 22 | sure it is receiving a string, not tokenized from another package, like 23 | spacy or nltk. 24 | 25 | The output of this function is a new list of windowed strings. It can be 26 | fed into functions like construct_conet() to construct a co-occurrence 27 | network where co-occurrence happens between words within a moving window. 28 | Obviously, this is the function that makes the windows, not the 29 | co-occurrence network. 30 | """ 31 | tokens = string_of_text.split() 32 | for _ in tokens: 33 | context = [] 34 | for index in range(len(tokens)): 35 | start = max(0, index - window_lr) 36 | finish = min(len(tokens), index + window_lr + 1)  # include window_lr tokens to the right 37 | left = " ".join(tokens[start:index]) 38 | right = " ".join(tokens[index + 1:finish]) 39 | context.append("{} {} {}".format(left, tokens[index], right)) 40 | return context 41 | 42 | 43 | def search_entities(raw_text_string, search): 44 | """Searches for known entities in a string. 45 | 46 | Helper function for construct_entity_conet(). Iterates over a list 47 | of entities and looks to see if they are present in a given text 48 | string. If they are, then it will append the entity to a list for 49 | each text. These lists of ents appearing in texts can be used to 50 | construct a network of entities that co-occur within texts. 51 | """ 52 | ents = [] 53 | for entity in search: 54 | if entity.lower() in raw_text_string.lower(): 55 | ents.append(entity.lower()) 56 | return ents 57 | 58 | 59 | def adjmat_to_wel(adjmat, remove_self_loops=True): 60 | """ Accepts an adjacency matrix and outputs a weighted edgelist.""" 61 | # `adjmat` is expected to be a pandas DataFrame with node labels as its index and columns 62 | adjmat.fillna(0, inplace=True) 63 | 64 | if remove_self_loops is True: 65 | # zero out the diagonal 66 | for i in adjmat.index: 67 | adjmat.loc[i, i] = 0 68 | else: 69 | pass 70 | 71 | wel = [('i', 'j', 'Weight')] 72 | for source in adjmat.index.values: 73 | for target in adjmat.index.values: 74 | if adjmat[source][target] > 0: 75 | wel.append((target, source, adjmat[source][target])) 76 | return wel 77 | 78 | 79 | def write_topics(model, feature_names, no_top_words, filename='topics.txt'): 80 | """ 81 | Writes the top `no_top_words` words for each topic in a fitted topic model to `filename`, one topic per line.
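    Example (illustrative sketch; assumes a fitted scikit-learn-style topic
    model exposing `components_`, e.g. LatentDirichletAllocation, plus the
    vectorizer used to build its document-term matrix `dtm`):

        lda = LatentDirichletAllocation(n_components=10).fit(dtm)
        write_topics(lda, vectorizer.get_feature_names(), no_top_words=10)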
82 | """ 83 | with open(filename, 'w') as f: 84 | for topic_idx, topic in enumerate(model.components_): 85 | f.write("Topic {}: ".format(topic_idx)) 86 | f.write(" ".join([ 87 | feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1] 88 | ])) 89 | f.write('\n') 90 | 91 | def is_ascii(s): 92 | """Determines if a string is encoded in ascii.""" 93 | try: 94 | s.encode('ascii') 95 | except UnicodeEncodeError: 96 | return False 97 | return True 98 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name="nate", 5 | version="0.0.1", 6 | install_requires=[ 7 | "pandas>=0.25.0", 8 | "spacy", 9 | #"python-igraph>=0.8.0", 10 | "tok", 11 | "numba", 12 | "joblib", 13 | "matplotlib", 14 | "networkx", 15 | "pillow", 16 | "stop_words", 17 | "gensim" 18 | ], # A bunch of things will need to go here; we'll have to do an audit of every package we use 19 | packages = find_packages(), 20 | include_package_data=True, 21 | author = "John McLevey, Tyler Crick, Pierson Browne", # likely more later 22 | description = "nate (Network Analysis with TExt).", 23 | url="http://networkslab.org", 24 | classifiers=( 25 | "Programming Language :: Python :: 3", 26 | "License :: OSI Approved :: MIT License", 27 | "Operating System :: OS Independent", 28 | ) 29 | ) 30 | -------------------------------------------------------------------------------- /tests/importers/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import pandas as pd 3 | 4 | @pytest.fixture(scope="module") 5 | def df(): 6 | df = pd.read_csv("tests/ira_data/IRAhandle_tweets_1.csv") 7 | return df 8 | 9 | @pytest.fixture(scope="module") 10 | def df11(): 11 | df = pd.read_csv("tests/ira_data/IRAhandle_tweets_11.csv") 12 | return df 13 | 14 | @pytest.fixture(scope="module") 15 | def empty_df(df): 16 | return pd.DataFrame(columns=df.columns) 17 | 18 | @pytest.fixture 19 | def dict_of_dicts_text(df): 20 | return {df["tweet_id"][i]: {"text": df["content"][i]} for i in range(0,10)} 21 | -------------------------------------------------------------------------------- /tests/importers/test_dfimporters.py: -------------------------------------------------------------------------------- 1 | import nate.importers.dataframe_importers as tst 2 | from nate.importers.timestamp_process import convert_times 3 | import pytest 4 | import pandas as pd 5 | 6 | # fixtures for import_csv 7 | @pytest.fixture 8 | def csv_file(): 9 | return "tests/ira_data/IRAhandle_tweets_1.csv" 10 | 11 | @pytest.fixture 12 | def csv_files(): 13 | return ["tests/ira_data/IRAhandle_tweets_1.csv", 14 | "tests/ira_data/IRAhandle_tweets_11.csv"] 15 | 16 | # fixtures for import_excel 17 | @pytest.fixture 18 | def excel_file(): 19 | return "tests/ira_data/[..]" 20 | 21 | @pytest.fixture 22 | def excel_files(): 23 | return ["tests/ira_data/IRAhandle_tweets_1.xlsx", 24 | "tests/ira_data/IRAhandle_tweets_11.xlsx"] 25 | 26 | # tests for process_dataframe 27 | def test_process_dataframe_empty(empty_df): 28 | nt = tst.process_dataframe(empty_df, "content", "tweet_id", "publish_date", 29 | columns_to_keep=["account_category"]) 30 | assert nt.list_texts() == [] 31 | assert nt.list_ids() == [] 32 | assert nt.list_times() == [] 33 | assert nt.list_column("account_category") == [] 34 | 35 | def test_process_dataframe_full(df): 36 | nt = 
tst.process_dataframe(df, "content", "tweet_id", "publish_date", 37 | columns_to_keep=["account_category"]) 38 | assert nt.list_texts(0,5) == df["content"][0:5].tolist() 39 | assert nt.list_ids(0,5) == df["tweet_id"][0:5].tolist() 40 | assert nt.list_times(0,5) == convert_times(df["publish_date"][0:5].tolist()) 41 | assert nt.list_column("account_category",0,5) == df["account_category"][0:5].tolist() 42 | 43 | # tests for import_dataframe (wrapper around process_dataframe) 44 | def test_import_dataframe_empty(empty_df): 45 | nt = tst.import_dataframe(empty_df, "content", "tweet_id", "publish_date", 46 | columns_to_keep=["account_category"]) 47 | assert nt.list_texts() == [] 48 | assert nt.list_ids() == [] 49 | assert nt.list_times() == [] 50 | assert nt.list_column("account_category") == [] 51 | 52 | # tests for import_csv 53 | def test_import_csv_string(csv_file, df): 54 | nt = tst.import_csv(csv_file, "content", "tweet_id", "publish_date", 55 | columns_to_keep=["account_category"]) 56 | assert nt.list_texts(0,5) == df["content"][0:5].tolist() 57 | assert nt.list_ids(0,5) == df["tweet_id"][0:5].tolist() 58 | assert nt.list_times(0,5) == convert_times(df["publish_date"][0:5].tolist()) 59 | assert nt.list_column("account_category",0,5) == df["account_category"][0:5].tolist() 60 | 61 | def test_import_csv_list(csv_files, df, df11): 62 | nt = tst.import_csv(csv_files, "content", "tweet_id", "publish_date", 63 | columns_to_keep=["account_category"]) 64 | assert nt.list_texts(0,5) == df["content"][0:5].tolist() 65 | assert nt.list_ids(0,5) == df["tweet_id"][0:5].tolist() 66 | assert nt.list_times(0,5) == convert_times(df["publish_date"][0:5].tolist()) 67 | assert nt.list_column("account_category",0,5) == df["account_category"][0:5].tolist() 68 | assert nt.list_texts(243891, 243896) == df11["content"][0:5].tolist() 69 | assert nt.list_ids(243891, 243896) == df11["tweet_id"][0:5].tolist() 70 | assert nt.list_times(243891, 243896) == convert_times(df11["publish_date"][0:5].tolist()) 71 | assert nt.list_column("account_category", 243891, 243896) == df11["account_category"][0:5].tolist() 72 | 73 | # tests for import_excel 74 | # TODO: add xlsx files. Issues saving them through python. 
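# One possible way to generate the missing .xlsx fixtures from the CSVs already
# used above (illustrative sketch only, not part of the suite; writing .xlsx
# files requires the openpyxl dependency):
#
#     for name in ["IRAhandle_tweets_1", "IRAhandle_tweets_11"]:
#         pd.read_csv(f"tests/ira_data/{name}.csv").to_excel(
#             f"tests/ira_data/{name}.xlsx", index=False)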
75 | def test_import_excel_string(excel_file): 76 | nt = tst.import_excel(excel_file, "content", "tweet_id", "publish_date", 77 | columns_to_keep=["account_category"]) 78 | assert nt.list_texts(0,5) == df["content"][0:5].tolist() 79 | assert nt.list_ids(0,5) == df["tweet_id"][0:5].tolist() 80 | assert nt.list_times(0,5) == convert_times(df["publish_date"][0:5].tolist()) 81 | assert nt.list_column("account_category",0,5) == df["account_category"][0:5].tolist() 82 | 83 | def test_import_excel_strings(excel_files): 84 | nt = tst.import_excel(excel_files, "content", "tweet_id", "publish_date", 85 | columns_to_keep=["account_category"]) 86 | assert nt.list_texts(0,5) == df["content"][0:5].tolist() 87 | assert nt.list_ids(0,5) == df["tweet_id"][0:5].tolist() 88 | assert nt.list_times(0,5) == convert_times(df["publish_date"][0:5].tolist()) 89 | assert nt.list_column("account_category",0,5) == df["account_category"][0:5].tolist() 90 | assert nt.list_texts(243891, 243896) == df11["content"][0:5].tolist() 91 | assert nt.list_ids(243891, 243896) == df11["tweet_id"][0:5].tolist() 92 | assert nt.list_times(243891, 243896) == convert_times(df11["publish_date"][0:5].tolist()) 93 | assert nt.list_column("account_category", 243891, 243896) == df11["account_category"][0:5].tolist() 94 | -------------------------------------------------------------------------------- /tests/importers/test_namedtuples.py: -------------------------------------------------------------------------------- 1 | import nate.importers.named_tuple_generator as tst 2 | from collections import namedtuple 3 | import pytest 4 | 5 | # fixtures for create_observation_list 6 | @pytest.fixture 7 | def list_of_lists(): 8 | return [["January", "February", "March"], 9 | [1, 2, 3], 10 | ["JA", "FE", "MR"]] 11 | 12 | @pytest.fixture 13 | def created_obs_list(list_of_lists): 14 | return tst.create_observation_list("Month", name=list_of_lists[0], 15 | number=list_of_lists[1], 16 | abbr=list_of_lists[2]) 17 | @pytest.fixture 18 | def uneven_list_of_lists(): 19 | return [["January", "February", "March", "April"], 20 | [1, 2], 21 | ["JA", "FE", "MR"]] 22 | 23 | # fixtures for tupleize 24 | @pytest.fixture 25 | def series_dict(list_of_lists): 26 | return {"name":list_of_lists[0], "number":list_of_lists[1], "abbr":list_of_lists[2]} 27 | 28 | @pytest.fixture 29 | def series_dict_tuple(series_dict): 30 | return {k: tuple(v) for k, v in series_dict.items()} 31 | 32 | # tests for create_observation_list 33 | def test_create_observation_list_names(created_obs_list): 34 | assert created_obs_list[0]._fields == ("name", "number", "abbr") 35 | assert created_obs_list[0].name == "January" 36 | 37 | def test_create_observation_list_contents(created_obs_list): 38 | assert created_obs_list == [("January", 1, "JA"), 39 | ("February", 2, "FE"), 40 | ("March", 3, "MR")] 41 | 42 | def test_create_observation_list_exn(uneven_list_of_lists): 43 | try: 44 | tst.create_observation_list("Month", name=uneven_list_of_lists[0], 45 | number=uneven_list_of_lists[1], 46 | abbr=uneven_list_of_lists[2]) 47 | except Exception as exn: 48 | assert exn.args[0] == "Not all of the input data is the same length." 
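# An equivalent check using pytest.raises (illustrative sketch; unlike the
# try/except version above, it also fails if no exception is raised at all):
def test_create_observation_list_exn_raises(uneven_list_of_lists):
    with pytest.raises(Exception,
                       match="Not all of the input data is the same length."):
        tst.create_observation_list("Month",
                                    name=uneven_list_of_lists[0],
                                    number=uneven_list_of_lists[1],
                                    abbr=uneven_list_of_lists[2])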
49 |
50 |
51 | # tests for tupleize
52 | def test_tupleize_names(series_dict):
53 |     obs_list = tst.tupleize(series_dict)
54 |     assert obs_list[0]._fields == ("name", "number", "abbr")
55 |     assert obs_list[0].name == "January"
56 |
57 | def test_tupleize_lists(series_dict):
58 |     obs_list = tst.tupleize(series_dict)
59 |     assert obs_list[1].name == "February"
60 |     assert obs_list == [("January", 1, "JA"),
61 |                         ("February", 2, "FE"),
62 |                         ("March", 3, "MR")]
63 |
64 | def test_tupleize_tuples(series_dict_tuple):
65 |     obs_list = tst.tupleize(series_dict_tuple)
66 |     assert obs_list[1].name == "February"
67 |     assert obs_list == [("January", 1, "JA"),
68 |                         ("February", 2, "FE"),
69 |                         ("March", 3, "MR")]
70 |
--------------------------------------------------------------------------------
/tests/importers/test_nate.py:
--------------------------------------------------------------------------------
1 | import nate.importers.nate_class as tst
2 | from nate.importers.dataframe_importers import import_dataframe
3 | from nate.importers.raw_importers import import_dict_of_dicts
4 | import pytest
5 | from pprint import pformat
6 |
7 | @pytest.fixture
8 | def nate_empty_obj(empty_df):
9 |     nt = import_dataframe(empty_df, "content", "tweet_id", "publish_date",
10 |                           columns_to_keep=["account_category"])
11 |     return nt
12 |
13 | @pytest.fixture
14 | def nate_full_obj(df):
15 |     nt = import_dataframe(df, "content", "tweet_id", "publish_date",
16 |                           columns_to_keep=["account_category"])
17 |     return nt
18 |
19 | @pytest.fixture
20 | def nate_text_only(dict_of_dicts_text):
21 |     return import_dict_of_dicts(dict_of_dicts_text, "text")
22 |
23 | # test __call__
24 | def test_call_empty(nate_empty_obj, capsys):
25 |     nate_empty_obj()
26 |     captured = capsys.readouterr()
27 |     assert captured.out == "[]\n"
28 |
29 | def test_call(nate_full_obj, capsys):
30 |     nate_full_obj()
31 |     captured = capsys.readouterr()
32 |     assert captured.out == pformat(nate_full_obj.data[0:5]) + "\n"
33 |
34 | def test_call_nums(nate_full_obj, capsys):
35 |     nate_full_obj(2,9)
36 |     captured = capsys.readouterr()
37 |     assert captured.out == pformat(nate_full_obj.data[2:9]) + "\n"
38 |
39 |
40 | # test __getitem__
41 | def test_getitem_empty(nate_empty_obj):
42 |     with pytest.raises(IndexError):
43 |         nate_empty_obj[0]
44 |
45 | def test_getitem(nate_full_obj):
46 |     assert nate_full_obj[0] == nate_full_obj.data[0]
47 |     assert nate_full_obj[0:5] == nate_full_obj.data[0:5]
48 |     assert nate_full_obj[-1] == nate_full_obj.data[-1]
49 |
50 | # test head
51 | def test_head_empty(nate_empty_obj, capsys):
52 |     nate_empty_obj.head()
53 |     captured = capsys.readouterr()
54 |     assert captured.out == "[]\n"
55 |
56 | def test_head(nate_full_obj, capsys):
57 |     nate_full_obj.head()
58 |     captured = capsys.readouterr()
59 |     assert captured.out == pformat(nate_full_obj.data[0:5]) + "\n"
60 |
61 | def test_head_nums(nate_full_obj, capsys):
62 |     nate_full_obj.head(2,9)
63 |     captured = capsys.readouterr()
64 |     assert captured.out == pformat(nate_full_obj.data[2:9]) + "\n"
65 |
66 | # test list_texts
67 | def test_list_texts_empty(nate_empty_obj):
68 |     assert nate_empty_obj.list_texts() == []
69 |
70 | def test_list_texts(nate_full_obj):
71 |     text_list = [i.text for i in nate_full_obj.data[0:5]]
72 |     assert nate_full_obj.list_texts() == text_list
73 |
74 | # test list_times
75 | def test_list_times_empty(nate_empty_obj):
76 |     assert nate_empty_obj.list_times() == []
77 |
78 | def test_list_times_exn(nate_text_only):
79 |     with pytest.raises(AttributeError):
80 |         nate_text_only.list_times()
81 |
82 | def test_list_times(nate_full_obj):
83 |     times_list = [i.time for i in nate_full_obj.data[0:5]]
84 |     assert nate_full_obj.list_times() == times_list
85 |
86 | # test list_ids
87 | def test_list_ids_empty(nate_empty_obj):
88 |     assert nate_empty_obj.list_ids() == []
89 |
90 | def test_list_ids_exn(nate_text_only):
91 |     with pytest.raises(AttributeError):
92 |         nate_text_only.list_ids()
93 |
94 | def test_list_ids(nate_full_obj):
95 |     id_list = [i.id for i in nate_full_obj.data[0:5]]
96 |     assert nate_full_obj.list_ids() == id_list
97 |
--------------------------------------------------------------------------------
/tests/importers/test_rawimporters.py:
--------------------------------------------------------------------------------
1 | from nate.importers.nate_class import Nate
2 | import nate.importers.raw_importers as tst
3 | import pytest
4 |
5 | # fixtures for import_text
6 | @pytest.fixture
7 | def string():
8 |     return "Nate is a cool package!"
9 |
10 | @pytest.fixture
11 | def list_of_strings(df):
12 |     return df["content"][0:10].values.tolist()
13 |
14 | # fixtures for import_files
15 | @pytest.fixture
16 | def file():
17 |     return "tests/importers/textfiles/1.txt"
18 |
19 | @pytest.fixture
20 | def list_of_files():
21 |     return ["tests/importers/textfiles/1.txt",
22 |             "tests/importers/textfiles/2.txt",
23 |             "tests/importers/textfiles/3.txt"]
24 |
25 | # fixtures for import_dict_of_dicts
26 |
27 | # see conftest.py for dict_of_dicts_text
28 |
29 | @pytest.fixture
30 | def dict_of_dicts_cols(df):
31 |     return {df["tweet_id"][i]: {"text": df["content"][i],
32 |                                 "account": df.author[i]} for i in range(0,10)}
33 |
34 | # test import_text
35 | def test_import_text_string(string):
36 |     nt = tst.import_text(string)
37 |     assert nt.list_texts() == [string]
38 |
39 | def test_import_text_strings(list_of_strings):
40 |     nt = tst.import_text(list_of_strings)
41 |     assert nt.list_texts() == list_of_strings
42 |
43 | # test import_files
44 | def test_import_files_single(file):
45 |     nt = tst.import_files(file)
46 |     with open(file, 'r') as stream:
47 |         string = stream.read().replace("\n", " ")
48 |     assert nt.list_texts() == [string]
49 |
50 | def test_import_files_list(list_of_files):
51 |     nt = tst.import_files(list_of_files)
52 |
53 |     strings = []
54 |     for file in list_of_files:
55 |         with open(file, 'r') as stream:
56 |             strings.append(stream.read().replace("\n", " "))
57 |
58 |     assert nt.list_texts() == strings
59 |
60 | # test import_dict_of_dicts
61 | def test_dict_of_dicts_texts(dict_of_dicts_text):
62 |     nt = tst.import_dict_of_dicts(dict_of_dicts_text, "text")
63 |     ids = nt.list_ids()
64 |     texts = nt.list_texts()
65 |     for i in range(0,10):
66 |         assert texts[i] == dict_of_dicts_text[ids[i]]["text"]
67 |
68 |
69 | def test_dict_of_dicts_cols(dict_of_dicts_cols):
70 |     nt = tst.import_dict_of_dicts(dict_of_dicts_cols, "text", values_to_keep=["account"])
71 |     ids = nt.list_ids()
72 |     texts = nt.list_texts()
73 |     accounts = nt.list_column("account", end=10)
74 |     for i in range(0,10):
75 |         assert texts[i] == dict_of_dicts_cols[ids[i]]["text"]
76 |         assert accounts[i] == dict_of_dicts_cols[ids[i]]["account"]
77 |
--------------------------------------------------------------------------------
/tests/importers/test_times.py:
--------------------------------------------------------------------------------
1 | import nate.importers.timestamp_process as tst
2 | import pytest
3 | import pandas as pd
4 | from datetime import timezone, timedelta
5 |
6 | # fixtures to test 
convert_time 7 | @pytest.fixture 8 | def time_0(): 9 | return "1/1/1970 00:00" 10 | 11 | @pytest.fixture 12 | def time_1(): 13 | return "11/12/2019 13:35" 14 | 15 | 16 | # fixtures to test convert_times 17 | @pytest.fixture 18 | def times_empty(): 19 | return [] 20 | 21 | @pytest.fixture 22 | def times_0(): 23 | return ["1/1/1970 00:00", "1/1/1970 00:02", "1/1/1970 01:01"] 24 | 25 | @pytest.fixture 26 | def times_1(df): 27 | return df["publish_date"][0:3] 28 | 29 | # tests for convert_time 30 | def test_convert_time_0(time_0): 31 | assert tst.convert_time(time_0) == 0 32 | 33 | def test_convert_time_1(time_1): 34 | assert tst.convert_time(time_1) == 1573565700 35 | 36 | def test_convert_time_timezone(time_0): 37 | assert tst.convert_time(time_0, timezone(timedelta(hours=-3))) == 10800 38 | 39 | # tests for convert_times 40 | def test_convert_times_empty(times_empty): 41 | assert tst.convert_times(times_empty) == [] 42 | 43 | def test_convert_times_0(times_0): 44 | assert tst.convert_times(times_0) == [0,120,3660] 45 | 46 | def test_convert_times_1(times_1): 47 | assert tst.convert_times(times_1) == [1506887880, 1506897780, 1506898200] 48 | 49 | def test_convert_times_timezone(times_0): 50 | assert tst.convert_times(times_0, 51 | timezone(timedelta(hours=-3))) == [0+10800, 120+10800, 3660+10800] 52 | -------------------------------------------------------------------------------- /tests/importers/textfiles/1.txt: -------------------------------------------------------------------------------- 1 | How promotion excellent curiosity yet attempted happiness. Gay prosperous impression had conviction. For every delay death ask style. Me mean able my by in they. Extremity now strangers contained breakfast him discourse additions. Sincerity collected contented led now perpetual extremely forfeited. 2 | 3 | Bringing unlocked me an striking ye perceive. Mr by wound hours oh happy. Me in resolution pianoforte continuing we. Most my no spot felt by no. He he in forfeited furniture sweetness he arranging. Me tedious so to behaved written account ferrars moments. Too objection for elsewhere her preferred allowance her. Marianne shutters mr steepest to me. Up mr ignorant produced distance although is sociable blessing. Ham whom call all lain like. 4 | 5 | Demesne far hearted suppose venture excited see had has. Dependent on so extremely delivered by. Yet no jokes worse her why. Bed one supposing breakfast day fulfilled off depending questions. Whatever boy her exertion his extended. Ecstatic followed handsome drawings entirely mrs one yet outweigh. Of acceptance insipidity remarkably is invitation. 6 | 7 | Is at purse tried jokes china ready decay an. Small its shy way had woody downs power. To denoting admitted speaking learning my exercise so in. Procured shutters mr it feelings. To or three offer house begin taken am at. As dissuade cheerful overcame so of friendly he indulged unpacked. Alteration connection to so as collecting me. Difficult in delivered extensive at direction allowance. Alteration put use diminution can considered sentiments interested discretion. An seeing feebly stairs am branch income me unable. 8 | 9 | He my polite be object oh change. Consider no mr am overcame yourself throwing sociable children. Hastily her totally conduct may. My solid by stuff first smile fanny. Humoured how advanced mrs elegance sir who. Home sons when them dine do want to. Estimating themselves unsatiable imprudence an he at an. Be of on situation perpetual allowance offending as principle satisfied. 
Improved carriage securing are desirous too. 10 | 11 | So by colonel hearted ferrars. Draw from upon here gone add one. He in sportsman household otherwise it perceived instantly. Is inquiry no he several excited am. Called though excuse length ye needed it he having. Whatever throwing we on resolved entrance together graceful. Mrs assured add private married removed believe did she. 12 | 13 | Breakfast agreeable incommode departure it an. By ignorant at on wondered relation. Enough at tastes really so cousin am of. Extensive therefore supported by extremity of contented. Is pursuit compact demesne invited elderly be. View him she roof tell her case has sigh. Moreover is possible he admitted sociable concerns. By in cold no less been sent hard hill. 14 | 15 | Started his hearted any civilly. So me by marianne admitted speaking. Men bred fine call ask. Cease one miles truth day above seven. Suspicion sportsmen provision suffering mrs saw engrossed something. Snug soon he on plan in be dine some. 16 | 17 | Effect if in up no depend seemed. Ecstatic elegance gay but disposed. We me rent been part what. An concluded sportsman offending so provision mr education. Bed uncommonly his discovered for estimating far. Equally he minutes my hastily. Up hung mr we give rest half. Painful so he an comfort is manners. 18 | 19 | An country demesne message it. Bachelor domestic extended doubtful as concerns at. Morning prudent removal an letters by. On could my in order never it. Or excited certain sixteen it to parties colonel. Depending conveying direction has led immediate. Law gate her well bed life feet seen rent. On nature or no except it sussex. 20 | -------------------------------------------------------------------------------- /tests/importers/textfiles/2.txt: -------------------------------------------------------------------------------- 1 | Not him old music think his found enjoy merry. Listening acuteness dependent at or an. Apartments thoroughly unsatiable terminated sex how themselves. She are ten hours wrong walls stand early. Domestic perceive on an ladyship extended received do. Why jennings our whatever his learning gay perceive. Is against no he without subject. Bed connection unreserved preference partiality not unaffected. Years merit trees so think in hoped we as. 2 | 3 | Whether article spirits new her covered hastily sitting her. Money witty books nor son add. Chicken age had evening believe but proceed pretend mrs. At missed advice my it no sister. Miss told ham dull knew see she spot near can. Spirit her entire her called. 4 | 5 | Up unpacked friendly ecstatic so possible humoured do. Ample end might folly quiet one set spoke her. We no am former valley assure. Four need spot ye said we find mile. Are commanded him convinced dashwoods did estimable forfeited. Shy celebrated met sentiments she reasonably but. Proposal its disposed eat advanced marriage sociable. Drawings led greatest add subjects endeavor gay remember. Principles one yet assistance you met impossible. 6 | 7 | On recommend tolerably my belonging or am. Mutual has cannot beauty indeed now sussex merely you. It possible no husbands jennings ye offended packages pleasant he. Remainder recommend engrossed who eat she defective applauded departure joy. Get dissimilar not introduced day her apartments. Fully as taste he mr do smile abode every. Luckily offered article led lasting country minutes nor old. Happen people things oh is oppose up parish effect. Law handsome old outweigh humoured far appetite. 
8 | 9 | Post no so what deal evil rent by real in. But her ready least set lived spite solid. September how men saw tolerably two behaviour arranging. She offices for highest and replied one venture pasture. Applauded no discovery in newspaper allowance am northward. Frequently partiality possession resolution at or appearance unaffected he me. Engaged its was evident pleased husband. Ye goodness felicity do disposal dwelling no. First am plate jokes to began of cause an scale. Subjects he prospect elegance followed no overcame possible it on. 10 | 11 | Forfeited you engrossed but gay sometimes explained. Another as studied it to evident. Merry sense given he be arise. Conduct at an replied removal an amongst. Remaining determine few her two cordially admitting old. Sometimes strangers his ourselves her depending you boy. Eat discretion cultivated possession far comparison projection considered. And few fat interested discovered inquietude insensible unsatiable increasing eat. 12 | 13 | He moonlight difficult engrossed an it sportsmen. Interested has all devonshire difficulty gay assistance joy. Unaffected at ye of compliment alteration to. Place voice no arise along to. Parlors waiting so against me no. Wishing calling are warrant settled was luckily. Express besides it present if at an opinion visitor. 14 | 15 | Smallest directly families surprise honoured am an. Speaking replying mistress him numerous she returned feelings may day. Evening way luckily son exposed get general greatly. Zealously prevailed be arranging do. Set arranging too dejection september happiness. Understood instrument or do connection no appearance do invitation. Dried quick round it or order. Add past see west felt did any. Say out noise you taste merry plate you share. My resolve arrived is we chamber be removal. 16 | 17 | Much did had call new drew that kept. Limits expect wonder law she. Now has you views woman noisy match money rooms. To up remark it eldest length oh passed. Off because yet mistake feeling has men. Consulted disposing to moonlight ye extremity. Engage piqued in on coming. 18 | 19 | Is we miles ready he might going. Own books built put civil fully blind fanny. Projection appearance at of admiration no. As he totally cousins warrant besides ashamed do. Therefore by applauded acuteness supported affection it. Except had sex limits county enough the figure former add. Do sang my he next mr soon. It merely waited do unable. 20 | 21 | -------------------------------------------------------------------------------- /tests/importers/textfiles/3.txt: -------------------------------------------------------------------------------- 1 | Advantage old had otherwise sincerity dependent additions. It in adapted natural hastily is justice. Six draw you him full not mean evil. Prepare garrets it expense windows shewing do an. She projection advantages resolution son indulgence. Part sure on no long life am at ever. In songs above he as drawn to. Gay was outlived peculiar rendered led six. 2 | 3 | Am terminated it excellence invitation projection as. She graceful shy believed distance use nay. Lively is people so basket ladies window expect. Supply as so period it enough income he genius. Themselves acceptance bed sympathize get dissimilar way admiration son. Design for are edward regret met lovers. This are calm case roof and. 4 | 5 | Extended kindness trifling remember he confined outlived if. Assistance sentiments yet unpleasing say. Open they an busy they my such high. 
An active dinner wishes at unable hardly no talked on. Immediate him her resolving his favourite. Wished denote abroad at branch at. 6 | 7 | In show dull give need so held. One order all scale sense her gay style wrote. Incommode our not one ourselves residence. Shall there whose those stand she end. So unaffected partiality indulgence dispatched to of celebrated remarkably. Unfeeling are had allowance own perceived abilities. 8 | 9 | Up branch to easily missed by do. Admiration considered acceptance too led one melancholy expression. Are will took form the nor true. Winding enjoyed minuter her letters evident use eat colonel. He attacks observe mr cottage inquiry am examine gravity. Are dear but near left was. Year kept on over so as this of. She steepest doubtful betrayed formerly him. Active one called uneasy our seeing see cousin tastes its. Ye am it formed indeed agreed relied piqued. 10 | 11 | Or neglected agreeable of discovery concluded oh it sportsman. Week to time in john. Son elegance use weddings separate. Ask too matter formed county wicket oppose talent. He immediate sometimes or to dependent in. Everything few frequently discretion surrounded did simplicity decisively. Less he year do with no sure loud. 12 | 13 | Dwelling and speedily ignorant any steepest. Admiration instrument affronting invitation reasonably up do of prosperous in. Shy saw declared age debating ecstatic man. Call in so want pure rank am dear were. Remarkably to continuing in surrounded diminution on. In unfeeling existence objection immediate repulsive on he in. Imprudence comparison uncommonly me he difficulty diminution resolution. Likewise proposal differed scarcely dwelling as on raillery. September few dependent extremity own continued and ten prevailed attending. Early to weeks we could. 14 | 15 | Quick six blind smart out burst. Perfectly on furniture dejection determine my depending an to. Add short water court fat. Her bachelor honoured perceive securing but desirous ham required. Questions deficient acuteness to engrossed as. Entirely led ten humoured greatest and yourself. Besides ye country on observe. She continue appetite endeavor she judgment interest the met. For she surrounded motionless fat resolution may. 16 | 17 | Improve ashamed married expense bed her comfort pursuit mrs. Four time took ye your as fail lady. Up greatest am exertion or marianne. Shy occasional terminated insensible and inhabiting gay. So know do fond to half on. Now who promise was justice new winding. In finished on he speaking suitable advanced if. Boy happiness sportsmen say prevailed offending concealed nor was provision. Provided so as doubtful on striking required. Waiting we to compass assured. 18 | 19 | She exposed painted fifteen are noisier mistake led waiting. Surprise not wandered speedily husbands although yet end. Are court tiled cease young built fat one man taken. We highest ye friends is exposed equally in. Ignorant had too strictly followed. Astonished as travelling assistance or unreserved oh pianoforte ye. Five with seen put need tore add neat. Bringing it is he returned received raptures. 20 | --------------------------------------------------------------------------------
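Taken together, the importer tests above pin down a small, consistent surface: `import_dataframe` plus the `list_texts`, `list_ids`, `list_times`, `list_column`, and `head` accessors, with timestamps normalized to epoch seconds. The sketch below exercises that surface end to end; the toy DataFrame is illustrative only and is not one of the repository's fixtures.

```python
import pandas as pd
from nate.importers.dataframe_importers import import_dataframe

# Illustrative frame mirroring the column names used in the test fixtures;
# it is not part of the repository's test data.
frame = pd.DataFrame({
    "tweet_id": [1, 2],
    "content": ["first post", "second post"],
    "publish_date": ["1/1/1970 00:00", "1/1/1970 00:02"],
    "account_category": ["a", "b"],
})

nt = import_dataframe(frame, "content", "tweet_id", "publish_date",
                      columns_to_keep=["account_category"])
nt.head()                                  # pretty-prints the first observations
print(nt.list_texts())                     # -> ['first post', 'second post']
print(nt.list_times())                     # -> [0, 120]  (epoch seconds, per test_times.py)
print(nt.list_column("account_category"))  # -> ['a', 'b']
```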