├── .gitignore ├── LICENSE ├── README.md ├── nate ├── __init__.py ├── cooc │ ├── __init__.py │ ├── cooc_class.py │ └── cooc_offsets.py ├── docnet │ ├── __init__.py │ └── docnet.py ├── edgeburst │ ├── __init__.py │ ├── burst_class.py │ ├── burst_mixin.py │ ├── export.py │ ├── pybursts.py │ └── visualize_bursts.py ├── importers │ ├── __init__.py │ ├── dataframe_importers.py │ ├── edgelist_importers.py │ ├── named_tuple_generator.py │ ├── nate_class.py │ ├── raw_importers.py │ └── timestamp_process.py ├── netplus │ ├── __init__.py │ └── netplus.py ├── semnet │ ├── __init__.py │ └── semnet.py ├── socnet │ ├── __init__.py │ ├── alters.py │ ├── centralities.py │ ├── dissimilarities.py │ ├── old_temsna │ │ ├── combine_covariates.py │ │ ├── create_author_covariates.py │ │ ├── extract_coauthor.py │ │ ├── generate_meta_strings.py │ │ ├── spacy_process │ │ │ ├── spacy_gpu.py │ │ │ ├── spacy_new.py │ │ │ └── spacy_processing_mp.py │ │ └── temsna_dependencies_sparse.png │ └── socnet_class.py ├── svonet │ ├── Arial.ttf │ ├── __init__.py │ ├── degree_over_time.py │ ├── graph_svo.py │ ├── svo.py │ ├── svo_burst_animate.py │ ├── svo_degree_over_time.py │ ├── svo_offsets.py │ ├── svoburst_class.py │ └── svonet_class.py └── utils │ ├── __init__.py │ ├── mp_helpers.py │ ├── network_helpers.py │ ├── nlp_helpers.py │ └── text_helpers.py ├── setup.py └── tests └── importers ├── conftest.py ├── test_dfimporters.py ├── test_namedtuples.py ├── test_nate.py ├── test_rawimporters.py ├── test_times.py └── textfiles ├── 1.txt ├── 2.txt └── 3.txt /.gitignore: -------------------------------------------------------------------------------- 1 | planning/ 2 | archive/ 3 | testing.py 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | pip-wheel-metadata/ 28 | share/python-wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | MANIFEST 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | *.py,cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 99 | __pypackages__/ 100 | 101 | # Celery stuff 102 | celerybeat-schedule 103 | celerybeat.pid 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ 134 | 135 | # 136 | hand/ 137 | 138 | # 139 | .vscode/ 140 | .vscode 141 | 142 | # 143 | *.org 144 | *.el 145 | 146 | # 147 | output/* 148 | !*.gitkeep 149 | 150 | # 151 | data/* 152 | .vscode -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 John McLevey 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # nate (Network Analysis + Text) 4 | 5 | *Research at the intersection of **social network analysis** and applied **natural language processing**.* 6 | 7 | `nate` is a Python package developed by [NETLAB](https://uwaterloo.ca/networks-lab/) at the [University of Waterloo](https://uwaterloo.ca/) to facilitate research at the intersection of social network analysis / network science and applied natural language processing. It scales efficiently to large and complex datasets. 8 | 9 | `nate` seamlessly connects natural language processing workflows built on state-of-the-art models from [`spacy`](https://github.com/explosion/spaCy) with network analysis workflows built on packages such as [`networkx`](https://networkx.github.io/), [`igraph`](https://igraph.org/python/) for Python, and [`graph-tool`](https://graph-tool.skewed.de/). Its carefully designed data structures bridge these two research workflows, and it offers a set of tools for quickly producing descriptive reports and visualizations.
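A minimal sketch of a typical workflow follows. The file name and column names below are placeholders, and the `cooc_pipeline()` call is shown without arguments for illustration only; see the importer and pipeline docstrings for the exact signatures.

```python
import pandas as pd
import nate

# Load any dataframe that has a text column and a timestamp column.
df = pd.read_csv("my_data.csv")  # placeholder path
nt = nate.import_dataframe(df, text="text", time="created_at")

# Build term co-occurrence offsets, then run Kleinberg burst detection.
cooc = nt.cooc_pipeline()
bursts = cooc.cooc_to_burst(s=2, gamma=1)

# Inspect the detected bursts as a pandas dataframe.
print(bursts.export_df().head())
```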
10 | 11 | # Installation 12 | 13 | ## GitHub 14 | 15 | If you want access to the most recent development version of `nate`, you can install it from the source code in this repository. 16 | 17 | `git clone https://github.com/UWNETLAB/nate.git && cd nate && pip install -e .` 18 | 19 | # Documentation 20 | 21 | * Binder-enabled documentation coming soon... 22 | 23 | # Asking Questions and Getting Help 24 | 25 | It is not always possible for us to provide help via email. Instead, we encourage you to use the Github Issue Tracker. By answering your questions (or fixing bugs you find) in public, we can also help other members of the research community. 26 | 27 | # Selected Features 28 | 29 | * Coming soon... 30 | -------------------------------------------------------------------------------- /nate/__init__.py: -------------------------------------------------------------------------------- 1 | from .importers.dataframe_importers import import_csv, import_dataframe, import_excel 2 | from .importers.raw_importers import import_files, import_text, import_dict_of_dicts -------------------------------------------------------------------------------- /nate/cooc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UWNETLAB/Nate/9029670d5804f478cac2e83d77ff86ff2a7266c2/nate/cooc/__init__.py -------------------------------------------------------------------------------- /nate/cooc/cooc_class.py: -------------------------------------------------------------------------------- 1 | """Definition of the `Cooc` pipeline, for co-occurence analysis. 2 | 3 | This module defines the `Cooc` pipeline, which contains a dictionary of 4 | the results from the co-occurence analysis which is conducted when 5 | this class is instantiated from the `Nate` class's `cooc_pipeline()` 6 | method. 7 | 8 | This class contains useful information in its own right, but primarily 9 | serves as an intermediary for the `Bursts` class, which can be 10 | instantiated using this class's `cooc_to_bursts()` method. 11 | """ 12 | from typing import Dict, Union, List 13 | from ..edgeburst.burst_mixin import BurstMixin 14 | from nate.edgeburst.burst_class import Bursts 15 | 16 | 17 | class Cooc(BurstMixin): 18 | """The main object in the `Cooc` pipeline. 19 | 20 | Attributes: 21 | offset_dict (Dict): A dictionary with term-term pairs as keys and a list 22 | of times when they occur as values. 23 | lookup (Dict): A dictionary with the integer representation of a term as 24 | key and the string representation as value. 25 | minimum_offsets (int): The minimum number of 'offsets' - or occurrences 26 | in the dataset - a given token/term pair must have had in order to 27 | be retained. 28 | from_svo (Bool): A flag to be passed to future steps in the pipeline 29 | marking whether the data descended from an SVO class. 30 | [Should be removed on future development.] 31 | """ 32 | 33 | def __init__(self, 34 | offset_dict: Dict, 35 | lookup: Dict, 36 | minimum_offsets: int = 20): 37 | self.offset_dict = offset_dict 38 | self.lookup = lookup 39 | self.minimum_offsets = minimum_offsets 40 | self.from_svo = False 41 | 42 | 43 | def __getitem__(self, index: Union[slice, int, tuple]): 44 | """Called when `Cooc` is accessed using indexing or slicing. 45 | 46 | Args: 47 | index (slice): A range of integers used to retrieve corresponding entries 48 | in the `offset_dict` attribute. 49 | 50 | Returns: 51 | List: A list of named tuples, each corresponding to one row in the dataset. 
52 | """ 53 | 54 | if isinstance(index, slice) or isinstance(index, int): 55 | return list(self.offset_dict.items())[index] 56 | else: 57 | return self.offset_dict[index] 58 | 59 | 60 | def cooc_to_burst(self, s=2, gamma=1): 61 | """Returns an instance of the `Bursts` class. 62 | 63 | Args: 64 | s (float, optional): s parameter for tuning Kleinberg algorithm. 65 | Higher values make it more difficult for bursts to move up the 66 | burst hierarchy. Defaults to 2. 67 | gamma (float, optional): gamma parameter for tuning Kleinberg 68 | algorithm. Higher values make it more difficult for activity to 69 | be considered a burst. Defaults to 1. 70 | 71 | Returns: 72 | Bursts: An instance of the `Bursts` class containing data from the 73 | instance of the `Cooc` class it was called from. 74 | """ 75 | offset_dict_strings, edge_burst_dict_strings, s, gamma, from_svo, lookup = self.burst_detection( 76 | s, gamma) 77 | 78 | return Bursts(offset_dict_strings, edge_burst_dict_strings, s, gamma, 79 | from_svo, lookup) 80 | -------------------------------------------------------------------------------- /nate/cooc/cooc_offsets.py: -------------------------------------------------------------------------------- 1 | """Builds the offset dictionary for the cooc pipeline. 2 | 3 | The dictionary is of the form {(word1, word2):[time1,time2,...], ...}. 4 | """ 5 | 6 | import pandas as pd 7 | from time import time as marktime 8 | from typing import List 9 | from ..utils.mp_helpers import mp 10 | from itertools import groupby, combinations, chain 11 | from collections import defaultdict 12 | 13 | 14 | def cooc_offsets(processed_list: List, time: List, minimum_offsets): 15 | """Generates the offset_dict for the `Cooc` pipeline. 16 | 17 | Args: 18 | processed_list (List): A list of lists, where each entry in the outer 19 | list represents a text, and the entries of each inner list are 20 | the tokens found in those texts in string form. 21 | time (List): A list of times for when each text was written. 22 | minimum_offsets (int): The minimum number of 'offsets' - or occurrences 23 | in the dataset - a given token/term pair must have in order to 24 | be retained. 25 | 26 | Returns: 27 | Dict: The offset dictionary for the `Cooc` class, with word-word pairs 28 | in integer format as keys and a list of offsets (occurence 29 | timestamps) as values. 30 | Dict: A lookup dictionary for each word in the corpus, with the integer 31 | representation as key and the string representation as value. 32 | """ 33 | print("Generating Offsets:") 34 | 35 | start = marktime() 36 | 37 | # send list of documents to text_to_int so that cooc function can work with integers for memory and processing efficiency 38 | word_ints, lookup = text_to_int(processed_list) 39 | 40 | # multiprocess the cooc function on the list of integers 41 | offset_dict = mp(word_ints, cooc, time) 42 | 43 | 44 | # recreate the dictionary of offsets, pruning all those with a less occurrences than the minimum_offsets threshold 45 | offsets = { 46 | k: v for k, v in offset_dict.items() if len(v) >= minimum_offsets 47 | } 48 | 49 | print("Finished offset generation in {} seconds".format( 50 | round(marktime() - start))) 51 | print("Commencing timestamp deduplication...") 52 | 53 | # kleinberg requires that timestamps be unique - increment simultaneous occurrences by 1 millisecond. 
54 | # Note: it's possible that some dataset will require this to be microseconds, if term pairs appear more than 999 times at once 55 | for item in offsets.keys(): 56 | offsets[item].sort() 57 | offsets[item] = [ 58 | g + i * 0.001 59 | for k, group in groupby(offsets[item]) 60 | for i, g in enumerate(group) 61 | ] 62 | 63 | print("finished timestamp deduplication in {} seconds".format( 64 | round(marktime() - start))) 65 | 66 | print("Finished Generating Offsets. Returning offset dictionary.") 67 | 68 | return offsets, lookup 69 | 70 | 71 | def text_to_int(processed_list): 72 | """Converts every word in a list of texts into an integer representation. 73 | 74 | After conversion to the integer representation, the tokens of the text are 75 | no longer in the same order. This function should only be used on texts 76 | where the distance between tokens in the source text is not relevant. It 77 | should only be used on texts where token co-occurence _in the same document_ 78 | is relevant. 79 | 80 | Args: 81 | processed_list (List): A list of texts, where each text is a 82 | list of tokens (strings). 83 | 84 | Returns: 85 | List: A list of texts, where each text is a list of tokens (integers). 86 | Dict: A lookup dict, to convert integer representations of tokens 87 | to strings. It is of the form {i:s} where i is the integer 88 | representation of the token, and s is the string representation. 89 | """ 90 | 91 | # sort string tokens in each text, keeping only unique words 92 | sorted_texts = [sorted(set(x)) for x in processed_list] 93 | 94 | # create a sorted list of all unique words in the corpus, used for the lookup dictionary 95 | flat_text = sorted(set(list(chain(*sorted_texts)))) 96 | 97 | del processed_list 98 | 99 | # create dataframe with 1 column ('word') of words in the corpus 100 | df = pd.DataFrame({'word': flat_text}) 101 | 102 | del flat_text 103 | 104 | # use the dataframe index as the identifier for each word, casting to a dictionary 105 | word_dict = df.reset_index().set_index('word')['index'].to_dict() 106 | 107 | # invert the dictionary, making word integers the keys, and words the values 108 | lookup_dict = {v: k for k, v in word_dict.items()} 109 | 110 | # create a list (documents) of lists (words in each document) integer representation of the corpus 111 | word_ints = [[word_dict[word] for word in text] for text in sorted_texts] 112 | 113 | del word_dict 114 | 115 | return word_ints, lookup_dict 116 | 117 | 118 | def cooc(time, word_ints): 119 | """Generates co-occurence pairs from documents and their timestamps. 120 | 121 | Args: 122 | time (List): A list of of the times each text in word_ints was written. 123 | word_ints (List): A list of lists, where each entry in the outer list 124 | represents a text, and the entries of each inner list are the 125 | integer representations of tokens found in those texts (as 126 | produced by text_to_int). 127 | 128 | Returns: 129 | Dict: A dictionary with token-token pairs as keys and a list of 130 | occurence timestamps as values. 
131 | """ 132 | 133 | # use defaultdict so that dictionary entries are created if they don't exist already 134 | offset_dict = defaultdict(list) 135 | 136 | # iterate through each document and its timestamp 137 | for text, timestamp in zip(word_ints, time): 138 | 139 | # use combinations to find all word-pairs in the current document 140 | keys = list(combinations(text, 2)) 141 | 142 | # add current timestamp to list of timestamps (dictionary value) for each word-pair (dictionary key) found in current document 143 | for key in keys: 144 | offset_dict[key].append(timestamp) 145 | 146 | return offset_dict 147 | -------------------------------------------------------------------------------- /nate/docnet/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UWNETLAB/Nate/9029670d5804f478cac2e83d77ff86ff2a7266c2/nate/docnet/__init__.py -------------------------------------------------------------------------------- /nate/docnet/docnet.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a MODULE docstring 3 | """ 4 | # Coming soon... 5 | -------------------------------------------------------------------------------- /nate/edgeburst/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UWNETLAB/Nate/9029670d5804f478cac2e83d77ff86ff2a7266c2/nate/edgeburst/__init__.py -------------------------------------------------------------------------------- /nate/edgeburst/burst_class.py: -------------------------------------------------------------------------------- 1 | """Definition of the `Bursts` class, for analysis of bursty term relations. 2 | 3 | While `BurstMixin` provides the actual burst detection functionality, 4 | this module provides export and plotting functions to facilitate further 5 | analysis. 6 | """ 7 | from nate.edgeburst import pybursts 8 | from ..utils.mp_helpers import mp 9 | from .visualize_bursts import plot_bursts 10 | from .export import df_export, max_bursts_export 11 | from nate.edgeburst import visualize_bursts 12 | from typing import Tuple, Dict, Callable, Union 13 | 14 | 15 | def get_bursts(s, gamma, offset_list): 16 | """Sends Kleinberg parameters and offset_list to pybursts.""" 17 | burst_list = pybursts.process(offset_list, s, gamma) 18 | 19 | return burst_list 20 | 21 | 22 | def detect_bursts(offsets, s=2, gamma=1): 23 | """Returns dictionary with bursting terms as keys and burst data as values. 24 | 25 | Args: 26 | offsets (Dict): A dictionary of offsets, with keys being edge 27 | objects and the values being lists of occurence times. 28 | s (float, optional): s parameter for tuning Kleinberg algorithm. 29 | Higher values make it more difficult for bursts to move up the 30 | burst hierarchy. Defaults to 2. 31 | gamma (float, optional): gamma parameter for tuning Kleinberg 32 | algorithm. Higher values make it more difficult for activity to 33 | be considered a burst. Defaults to 1. 34 | 35 | Returns: 36 | Dict: A dictionary of bursts, with keys being edge objects and values 37 | being lists of burst data. Each burst is in the format 38 | [intensity, start_time, end_time]. 39 | """ 40 | key_list = list(offsets.keys()) 41 | offset_list = list(offsets.values()) 42 | 43 | burst_list = mp(offset_list, get_bursts, s, gamma) 44 | 45 | edge_bursts = dict(zip(key_list, burst_list)) 46 | 47 | return edge_bursts 48 | 49 | 50 | class Bursts(): 51 | """The core burst detection class. 
52 | 53 | This class provides all burst analysis functionality, including export 54 | and plotting abilities. 55 | 56 | Attributes: 57 | offset_dict (Dict): A dictionary with edge objects in string format 58 | as keys and occurrence times as values. 59 | edge_burst_dict (Dict): A dictionary with edge objects in string format 60 | as keys and a list of bursts as values. The burst lists are in the 61 | format [intensity, start_time, end_time]. 62 | s (float, optional): s parameter for tuning Kleinberg algorithm. 63 | Higher values make it more difficult for bursts to move up the 64 | burst hierarchy. Changing this parameter after object instantiation 65 | does not change the object's data. 66 | gamma (float, optional): gamma parameter for tuning Kleinberg 67 | algorithm. Higher values make it more difficult for activity to 68 | be considered a burst. Changing this parameter after object 69 | instantiation does not change the object's data. 70 | from_svo (Bool): A flag that determines whether the pipeline should be 71 | configured for bursts of SVOs. 72 | lookup (Dict): A lookup dictionary for terms, with integer 73 | representations as keys and string representations as values. 74 | """ 75 | 76 | def __init__(self, offset_dict, edge_burst_dict, s, gamma, from_svo, 77 | lookup): 78 | self.offset_dict: dict = offset_dict 79 | self.edge_burst_dict: dict = edge_burst_dict 80 | self.s = s 81 | self.gamma = gamma 82 | self.from_svo = from_svo # flag that determines whether the pipeline should be configured for bursts of SVOs 83 | self.bdf = None 84 | self.odf = None 85 | self.lookup = lookup 86 | 87 | def __getitem__(self, index: Union[slice, int, tuple]): 88 | """Called when `Bursts` is accessed using indexing or slicing. 89 | 90 | Args: 91 | index (Union[slice, int, tuple]): An integer, slice, or edge tuple used 92 | to retrieve corresponding entries in the `edge_burst_dict` attribute. 93 | 94 | Returns: 95 | List: A list of (edge, burst list) pairs, or the burst list for a 96 | single edge if an edge tuple is passed. 97 | """ 98 | 99 | if isinstance(index, slice) or isinstance(index, int): 100 | return list(self.edge_burst_dict.items())[index] 101 | else: 102 | return self.edge_burst_dict[index] 103 | 104 | 105 | def export_df(self): 106 | """Exports burst data to a dataframe. 107 | 108 | Returns: 109 | pandas.Dataframe: A dataframe containing all bursts data. 110 | 111 | The returned dataframe has the following columns: 112 | - Column(s) representing the edge objects (terms), whose names 113 | depend on the object the `Bursts` object was formed from. 114 | - 'bursts': A dict mapping (interval_start, interval_end) pairs to intensities. 115 | - 'term_id' (int): The id of the edge object in the dataset. 116 | This will match the index. 117 | - 'interval_start' (datetime): The start of the burst. 118 | - 'interval_end' (datetime): The end of the burst. 119 | - 'intensity' (int): The intensity of the burst. 120 | """ 121 | return df_export(self.edge_burst_dict, self.offset_dict, self.from_svo) 122 | 123 | def export_max_bursts(self): 124 | """Returns a dict with edges as keys and all max bursts as values.""" 125 | return max_bursts_export(self.edge_burst_dict, self.from_svo) 126 | 127 | def to_pandas(self, key: Tuple, unit='s') -> Tuple[Dict, Dict]: 128 | """Exports bursts and offsets to separate dataframes for a given key. 129 | 130 | TODO: refactor the wrapped function (visualize_bursts.to_pandas) 131 | so that it is not SVO specific. Should not be much of an issue. 132 | 133 | Args: 134 | key (Tuple): The edge for which burst and offset data will 135 | be extracted.
136 | unit (str, optional): The unit to be passed to pd.to_datetime. 137 | Defaults to 's'. 138 | 139 | Returns: 140 | Tuple[pandas.Dataframe, pandas.Dataframe]: The first dataframe 141 | contains burst data. The second dataframe contains offset data. 142 | 143 | The first dataframe has the following columns: 144 | - 'level' (int): The level of the burst. 145 | - 'start' (datetime): The start time of the burst. 146 | - 'end' (datetime): The end time of the burst. 147 | - 'svo' (string): The edge for which the dataframe contains 148 | data. 149 | 150 | The second dataframe has the following columns: 151 | - 'Date' (int): The date of the occurence. 152 | - 'Year' (int): The year of occurence. 153 | - 'Month' (int): The month of the occurence. 154 | - 'Day' (int): The day of the occurence. 155 | - 'svo' (string): The edge for which the dataframe contains 156 | data. 157 | """ 158 | 159 | offsets = self.offset_dict[key] 160 | bursts = self.edge_burst_dict[key] 161 | 162 | return visualize_bursts.to_pandas(bursts, offsets, key, unit) 163 | 164 | def plot_bursts(self, 165 | key: Tuple, 166 | unit='s', 167 | lowest_level=0, 168 | title=True, 169 | daterange=None, 170 | xrangeoffsets=3): 171 | """Plots the occurences and bursts of the given key. 172 | 173 | TODO: Refactor wrapped function so that it is not SVO specific. 174 | 175 | Args: 176 | key (Tuple): The key whose burst data to plot. 177 | unit (str, optional): The unit to be passed to pd.to_datetime. 178 | Defaults to 's'. 179 | lowest_level (int, optional): If passed, includes bursts only if 180 | they are greater than the given lowest level. Defaults to 0. 181 | title (Bool, optional): If True, include the name of SVO as the 182 | title of the figure. Defaults to True. 183 | daterange (Tuple[str,str], optional): If passed, only bursts in the 184 | range daterange[0] to daterange[1] will be plotted. The dates 185 | must be passed as strings in the format 'year-month-day'. 186 | Defaults to None. 187 | xrangeoffsets (int, optional): The number of days to add before the 188 | minimum date and after the maximum date. Used to 'pad' the plot. 189 | Defaults to 3. 190 | """ 191 | bdf, odf = self.to_pandas(key, unit) 192 | 193 | visualize_bursts.plot_bursts(odf=odf, 194 | bdf=bdf, 195 | lowest_level=lowest_level, 196 | title=True, 197 | daterange=daterange, 198 | xrangeoffsets=xrangeoffsets, 199 | s=self.s, 200 | gamma=self.gamma) 201 | 202 | # def create_burst_plot(self, token_pairs, zoom_level = 0, output_path = False, plot_size_x = 20, plot_size_y = 10, plot_vertically = False, num_ticks = 10, rug_alpha = 0.35, dark = True): 203 | # """ 204 | # `token_pair` accepts either a tuple or a list of tuples corresponding to one of the token-token pairs in the edge_burst_dict dictionary. 205 | # If a list of valid token pairs is provided, one separate plot for each of the token pairs is produced. 206 | 207 | # `zoom_level` (default = 0) splits the burst structure for each provided token-token pair into a series of separate bursts hierarchies, omitting any levels 208 | # below the indicated zoom_level. 
A zoom level of 0 does not omit any of the bursts (including the baseline burst, which spans the entirety of the supplied data) 209 | # """ 210 | # if isinstance(token_pairs, tuple): 211 | # token_pairs = [token_pairs] 212 | 213 | # for entry in token_pairs: 214 | 215 | # plot_title = "'{}' + '{}' - Full Plot (s = {}, gamma = {})".format(entry[0], entry[1], self.s, self.gamma) 216 | 217 | # plot_bursts(self.offset_dict[entry], self.edge_burst_dict[entry], plot_title, output_path, plot_size_x, plot_size_y, plot_vertically, num_ticks, rug_alpha, dark) 218 | 219 | # if zoom_level > 0: # When the zoom level is 0, we can just pass everything directly into the plotting function. 220 | # offsets = self.offset_dict[entry] 221 | # bursts = self.edge_burst_dict[entry] 222 | 223 | # burst_stack = [] 224 | # temp_burst_stack = [] 225 | 226 | # for burst in bursts: 227 | # if burst[0] < zoom_level: 228 | # pass 229 | # elif burst[0] == zoom_level: 230 | # if len(temp_burst_stack) > 0: 231 | # burst_stack.append(temp_burst_stack) 232 | # temp_burst_stack = [] 233 | # temp_burst_stack.append(burst) 234 | # else: 235 | # temp_burst_stack.append(burst) 236 | 237 | # if len(temp_burst_stack) > 0: 238 | # burst_stack.append(temp_burst_stack) 239 | 240 | # offset_stack = [] 241 | 242 | # for burst in burst_stack: 243 | # low = burst[0][1] 244 | # high = burst[0][2] 245 | # temp_offset_stack = [] 246 | # for offset in offsets: 247 | # if low <= offset and offset <= high: 248 | # temp_offset_stack.append(offset) 249 | # offset_stack.append(temp_offset_stack) 250 | 251 | # assert len(burst_stack) == len(offset_stack) 252 | 253 | # for i in range(0, len(burst_stack)): 254 | # plot_title = ("'{}' + '{}' - Zoom Level {}, Slice {} of {} (s = {}, gamma = {})".format(entry[0], entry[1], zoom_level, i+1, len(burst_stack), self.s, self.gamma)) 255 | 256 | # plot_bursts(offset_stack[i], burst_stack[i], plot_title, output_path, plot_size_x, plot_size_y, plot_vertically, num_ticks, rug_alpha, dark) 257 | -------------------------------------------------------------------------------- /nate/edgeburst/burst_mixin.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a MODULE docstring 3 | """ 4 | from .export import all_bursts_export, offsets_export 5 | from .burst_class import Bursts, detect_bursts 6 | 7 | 8 | class BurstMixin(): 9 | 10 | def __init__(self): 11 | self.offset_dict: dict 12 | self.lookup: dict 13 | self.from_svo: bool 14 | 15 | def burst_detection(self, s: float = 2, gamma: float = 1): 16 | """Returns an object of the class `bursts`. 17 | 18 | This method is used to detect bursts for _all_ of the term-term pairs 19 | in the offset dictionary generated when this class (`edge_burst`) was 20 | instantiated. 21 | 22 | This method is best employed as an exploratory tool for identifying 23 | unusually bursty term pairs, or groups of term pairs with correlated 24 | burst patterns. 25 | 26 | Since calling this method on an entire dataset can consume significant 27 | amounts of time and memory, this method only allows for one value of 28 | s and one value of gamma. 29 | 30 | If you wish to detect bursts using a variety of different values for 31 | the s and gamma parameters, instead utilize the `multi_bursts` method 32 | contained in this class. 33 | 34 | Args: 35 | s (float, optional): s parameter for tuning Kleinberg algorithm. 36 | Higher values make it more difficult for bursts to move up the 37 | burst hierarchy. Defaults to 2. 
38 | gamma (float, optional): gamma parameter for tuning Kleinberg 39 | algorithm. Higher values make it more difficult for activity to 40 | be considered a burst. Defaults to 1. 41 | 42 | Returns: 43 | Dict: The object's `offset_dict`, with integer keys converted 44 | to strings. 45 | Dict: A dictionary with string representations of terms as keys 46 | and lists of burst data as values. 47 | float: The s value passed as a parameter. 48 | float: The gamma value passed as a parameter. 49 | Bool: A flag passed to other functions in the pipeline to configure 50 | it for SVO data. 51 | Dict: A dictionary that maps integer representations of terms 52 | (as keys) to string representations as values. 53 | """ 54 | 55 | # use offsets_export to return a dictionary with terms as keys in string format and list of time offsets as values 56 | offset_dict_strings = offsets_export(self.offset_dict, self.lookup, 57 | self.from_svo) 58 | 59 | # use detect_bursts to return a dictionary with terms as keys in integer format and list of nested burst data as values 60 | edge_burst_dict_int = detect_bursts(self.offset_dict, s, gamma) 61 | 62 | # same as above, but convert keys from integers to the string values they represent 63 | edge_burst_dict_strings = all_bursts_export(edge_burst_dict_int, 64 | self.lookup, self.from_svo) 65 | 66 | return offset_dict_strings, edge_burst_dict_strings, s, gamma, self.from_svo, self.lookup 67 | 68 | # def multi_burst(self, token_pairs, s, gamma): 69 | # """ 70 | # The lists passed to s and gamma must be exactly the same length. 71 | 72 | # Returns a dictionary where keys are strings containing two numbers separated by an underscore, corresponding to the s and gamma values for the run, respectively. 73 | # The values of each entry in the dictionary consists of {SOMETHING} 74 | # """ 75 | # assert len(s) == len(gamma) 76 | 77 | # run_dict = {} 78 | # offset_subset_dict = {} 79 | 80 | # for token_pair in token_pairs: 81 | # offset_subset_dict[token_pair] = self.offset_dict[token_pair] 82 | 83 | # for i in range(0,len(s)): 84 | # run_name = "{}_{}".format(str(s[i]), str(gamma[i])) 85 | # run_result = Bursts(self.offset_dict,self.lookup, s[i], gamma[i], self.from_svo, self.lookup) 86 | # run_dict[run_name] = run_result 87 | 88 | # return run_dict 89 | -------------------------------------------------------------------------------- /nate/edgeburst/export.py: -------------------------------------------------------------------------------- 1 | """Exports burst data to other data structures.""" 2 | import pandas as pd 3 | import numpy as np 4 | import os 5 | import itertools 6 | import pickle 7 | from itertools import groupby 8 | 9 | 10 | def df_export(bursts, offsets, from_svo=False): 11 | """Exports the burst data to a dataframe. 12 | 13 | TODO: remove offsets parameter, as it is not used to generate the dataframe 14 | (as far as I can tell). 15 | 16 | TODO: does the 'bursts' column need to be kept for every edge entry? 
17 | """ 18 | key_list = [] 19 | burst_list = [] 20 | offset_list = [] 21 | for k, v in bursts.items(): 22 | key_list.append(k) 23 | burst_list.append(v) 24 | offset_list.append(offsets[k]) 25 | 26 | if from_svo == True: 27 | df = pd.DataFrame() 28 | df['svo'] = key_list 29 | 30 | 31 | intensities = max_intensities(burst_list) 32 | 33 | else: 34 | 35 | df = pd.DataFrame.from_records(key_list, columns=['word1', 'word2']) 36 | 37 | intensities = max_intensities(burst_list) 38 | 39 | df['bursts'] = intensities 40 | 41 | full_df = flatten(df, intensities) 42 | return full_df 43 | 44 | 45 | def max_intensities(burst_list): 46 | """Removes all but the max intensity for each burst interval.""" 47 | max_bursts = [{(j, k): i for i, j, k in x} for x in burst_list] 48 | 49 | return max_bursts 50 | 51 | 52 | def flatten(df, intensities): 53 | """Flattens burst data into dataframe columns. 54 | 55 | Depends on the df being in the same order as the list of intensities. 56 | """ 57 | term_id_list = [] 58 | interval_start_list = [] 59 | interval_end_list = [] 60 | intensity_list = [] 61 | 62 | for i, term in enumerate(intensities): 63 | for interval, intensity in term.items(): 64 | term_id_list.append(i) 65 | interval_start_list.append(interval[0]) 66 | interval_end_list.append(interval[1]) 67 | intensity_list.append(intensity) 68 | 69 | temp_df = pd.DataFrame() 70 | temp_df['term_id'], temp_df['interval_start'], temp_df['interval_end'], temp_df['intensity'] =\ 71 | term_id_list, interval_start_list, interval_end_list, intensity_list 72 | 73 | return_df = pd.merge(df, temp_df, left_index=True, right_on='term_id') 74 | 75 | return_df = return_df.sort_values(by=['intensity'], ascending=False) 76 | 77 | return return_df 78 | 79 | 80 | def max_bursts_export(bursts, from_svo=False): 81 | """Returns a dict with term as key and maximum intensity burst as value. 82 | 83 | TODO: make this function export what it means to. As of now, it returns 84 | a dict with all bursts as values. 
85 | """ 86 | key_list = [] 87 | burst_list = [] 88 | 89 | for k, v in bursts.items(): 90 | key_list.append(k) 91 | burst_list.append(v) 92 | 93 | if from_svo: 94 | df = pd.DataFrame() 95 | df['svo'] = key_list 96 | 97 | intensities = max_intensities(burst_list) 98 | 99 | max_bursts = {df['svo'][x]: intensities[x] for x in df.index} 100 | else: 101 | 102 | df = pd.DataFrame.from_records(key_list, columns=['word1', 'word2']) 103 | 104 | intensities = max_intensities(burst_list) 105 | 106 | max_bursts = { 107 | (df['word1'][x], df['word2'][x]): intensities[x] for x in df.index 108 | } 109 | 110 | return max_bursts 111 | 112 | 113 | def all_bursts_export(bursts, lookup, from_svo=False): 114 | """Converts the keys of the `bursts` dictionary from ints to strings.""" 115 | key_list = [] 116 | burst_list = [] 117 | 118 | for k, v in bursts.items(): 119 | key_list.append(k) 120 | burst_list.append(v) 121 | 122 | if from_svo: 123 | df = pd.DataFrame() 124 | df['svo_#'] = key_list 125 | df['svo'] = df['svo_#'].map(lookup) 126 | 127 | all_bursts = {df['svo'][x]: burst_list[x] for x in df.index} 128 | else: 129 | df = pd.DataFrame.from_records(key_list, columns=['word1_#', 'word2_#']) 130 | df['word1'] = df['word1_#'].map(lookup) 131 | df['word2'] = df['word2_#'].map(lookup) 132 | 133 | all_bursts = { 134 | (df['word1'][x], df['word2'][x]): burst_list[x] for x in df.index 135 | } 136 | 137 | return all_bursts 138 | 139 | 140 | def offsets_export(offsets, lookup, from_svo=False): 141 | """Converts the keys of the `offsets` dictionary from ints to strings. 142 | 143 | TODO: This does exactly the same thing as all_bursts_export above: 144 | the differences between the two datastructures aren't relevant to 145 | replacing their keys with strings. 146 | """ 147 | key_list = [] 148 | offset_list = [] 149 | 150 | for k, _ in offsets.items(): 151 | key_list.append(k) 152 | offset_list.append(offsets[k]) 153 | 154 | if from_svo: 155 | df = pd.DataFrame() 156 | df['svo_#'] = key_list 157 | df['svo'] = df['svo_#'].map(lookup) 158 | 159 | offsets = {df['svo'][x]: offset_list[x] for x in df.index} 160 | 161 | else: 162 | df = pd.DataFrame.from_records(key_list, columns=['word1_#', 'word2_#']) 163 | df['word1'] = df['word1_#'].map(lookup) 164 | df['word2'] = df['word2_#'].map(lookup) 165 | 166 | offsets = { 167 | (df['word1'][x], df['word2'][x]): offset_list[x] for x in df.index 168 | } 169 | 170 | return offsets 171 | -------------------------------------------------------------------------------- /nate/edgeburst/pybursts.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a MODULE docstring. This module is adapted from the pybursts package, which is an implementation of Kleinberg's 3 | burst detection algorithm by Renzo Poddighe: https://pypi.org/project/pybursts/ 4 | Changes are primarily to increase performance by moving object creation outside of loops and using numba just-in-time 5 | compilation to perform mathematical calculations in C. 
6 | The process function was also added to perform burst detection on a list of documents 7 | """ 8 | import numpy as np 9 | import math 10 | import numba 11 | from numba import njit, jit 12 | 13 | 14 | def single(offsets, s=2, gamma=1): 15 | 16 | if s <= 1: 17 | raise ValueError("s must be greater than 1!") 18 | if gamma <= 0: 19 | raise ValueError("gamma must be positive!") 20 | if len(offsets) < 1: 21 | raise ValueError("offsets must be non-empty!") 22 | 23 | offsets = np.array(offsets, dtype=object) 24 | 25 | if offsets.size == 1: 26 | bursts = np.array([0, offsets[0], offsets[0]], ndmin=2, dtype=object) 27 | return bursts 28 | 29 | offsets = np.sort(offsets) 30 | gaps = np.diff(offsets) 31 | 32 | if not np.all(gaps): 33 | raise ValueError("Input cannot contain events with zero time between!") 34 | 35 | T = np.sum(gaps) 36 | n = np.size(gaps) 37 | g_hat = T / n 38 | 39 | k = int( 40 | math.ceil(float(1 + math.log(T, s) + math.log(1 / np.amin(gaps), s)))) 41 | 42 | gamma_log_n = gamma * math.log(n) 43 | 44 | alpha_function = np.vectorize(lambda x: s**x / g_hat) 45 | alpha = alpha_function(np.arange(k)) 46 | 47 | C = np.repeat(float("inf"), k) 48 | 49 | C[0] = 0 50 | 51 | q = np.empty((k, 0)) 52 | for t in range(n): 53 | C_prime = np.repeat(float("inf"), k) 54 | q_prime = np.empty((k, t + 1)) 55 | q_prime.fill(np.nan) 56 | k_range = np.arange(0, k) 57 | C_temp = C[k_range] 58 | gaps_t = gaps[t] 59 | for j in range(k): 60 | tau_arr = tau(k_range, j, gamma_log_n) 61 | cost = np.add(C_temp, tau_arr) 62 | el = min_cost(cost) 63 | alpha_temp = alpha[j] 64 | f_j_t = f(alpha_temp, gaps_t) 65 | 66 | if f_j_t > 0: 67 | C_prime[j] = cost[el] - math.log(f_j_t) 68 | 69 | if t > 0: 70 | q_prime[j, :t] = q[el, :] 71 | 72 | q_prime[j, t] = j + 1 73 | 74 | C = C_prime 75 | q = q_prime 76 | 77 | j = np.argmin(C) 78 | q = q[j, :] 79 | 80 | prev_q = 0 81 | 82 | N = int(0) 83 | for t in range(n): 84 | if q[t] > prev_q: 85 | N = N + q[t] - prev_q 86 | prev_q = q[t] 87 | 88 | bursts = np.array([ 89 | np.repeat(np.newaxis, N), 90 | np.repeat(offsets[0], N), 91 | np.repeat(offsets[0], N) 92 | ], 93 | ndmin=2, 94 | dtype=object).transpose() 95 | 96 | burst_counter = -1 97 | prev_q = 0 98 | stack = np.repeat(np.newaxis, N) 99 | stack_counter = -1 100 | for t in range(n): 101 | if q[t] > prev_q: 102 | num_levels_opened = q[t] - prev_q 103 | for i in range(int(num_levels_opened)): 104 | burst_counter += 1 105 | bursts[burst_counter, 0] = int(prev_q + i) 106 | bursts[burst_counter, 1] = offsets[t] 107 | stack_counter += 1 108 | stack[stack_counter] = burst_counter 109 | elif q[t] < prev_q: 110 | num_levels_closed = prev_q - q[t] 111 | for i in range(int(num_levels_closed)): 112 | bursts[stack[stack_counter], 2] = offsets[t] 113 | stack_counter -= 1 114 | prev_q = q[t] 115 | 116 | while stack_counter >= 0: 117 | bursts[stack[stack_counter], 2] = offsets[n] 118 | stack_counter -= 1 119 | 120 | burst_lists = [] 121 | 122 | for burst in bursts: 123 | burst_lists.append(burst.tolist()) 124 | 125 | return bursts 126 | 127 | 128 | @njit 129 | def f(alpha, x): 130 | 131 | return alpha * math.exp(-alpha * x) 132 | 133 | 134 | @njit 135 | def min_cost(cost): 136 | 137 | return np.argmin(cost) 138 | 139 | 140 | @njit(cache=False) 141 | def tau(i, j, gamma_log_n): 142 | 143 | return np.where(i >= j, 0, ((j - i) * gamma_log_n)) 144 | 145 | 146 | @jit(forceobj=True) 147 | def process(offset_list, s, gamma): 148 | 149 | bursts = [single(x, s, gamma) for x in offset_list] 150 | return bursts 151 | 
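A small usage sketch for the `process` function above (an editor-added illustration, not part of the original module): the offset lists here are invented, and each returned row follows this module's [level, start, end] burst format.

# Illustrative example: run Kleinberg burst detection on two offset lists.
# Offsets within each list must not contain duplicate timestamps, since
# `single` raises a ValueError when two events share the same time.
if __name__ == "__main__":
    example_offsets = [
        [1.0, 2.0, 2.5, 2.8, 3.0, 9.0, 20.0],
        [4.0, 5.0, 5.5, 6.0, 30.0],
    ]
    for term_bursts in process(example_offsets, s=2, gamma=1):
        # Each row: [burst level, burst start offset, burst end offset]
        print(term_bursts)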
-------------------------------------------------------------------------------- /nate/edgeburst/visualize_bursts.py: -------------------------------------------------------------------------------- 1 | """Visualizes burst data.""" 2 | 3 | import pandas as pd 4 | import matplotlib as mpl 5 | import matplotlib.pyplot as plt 6 | import matplotlib.dates as mdates 7 | 8 | 9 | def to_pandas(ebursts, offsets, svo, unit='s'): 10 | """Exports burst and offset data to dataframes for a single term. 11 | 12 | ebursts is an edgebust dict from the SVO object 13 | offsets is an offsets dict from the SVO object 14 | """ 15 | svos = " | ".join(svo) 16 | 17 | bdf = pd.DataFrame(ebursts) 18 | bdf[1] = pd.to_datetime(bdf[1], unit=unit) 19 | bdf[2] = pd.to_datetime(bdf[2], unit=unit) 20 | bdf.columns = ['level', 'start', 'end'] 21 | bdf['svo'] = svos 22 | 23 | odf = pd.DataFrame() 24 | i = pd.to_datetime(offsets, unit='s') 25 | odf['Date'], odf['Year'], odf['Month'], odf[ 26 | 'Day'] = i.date, i.year, i.month, i.day 27 | odf = odf.set_index(i) 28 | odf['svo'] = svos 29 | 30 | return bdf, odf 31 | 32 | 33 | 34 | def plot_bursts(odf, 35 | bdf, 36 | lowest_level=0, 37 | title=True, 38 | daterange=None, 39 | xrangeoffsets=3, 40 | s=None, 41 | gamma=None): 42 | """Plots burst and offset data. 43 | 44 | odf = an offsets dataframe 45 | bdf = an edgeburst dataframe 46 | lowest_level = subset the burst dataframe with bursts greater than or equal to the specified level 47 | daterange = a tuple with two elements: a start date and end date as *strings*. format is 'year-month-day' 48 | xrangeoffsets = the number of days to add before and after the min and max x dates 49 | """ 50 | 51 | svo_title = str(set(bdf['svo']).pop()) 52 | 53 | fig, (axa, axb) = plt.subplots(2, sharey=False, sharex=True) 54 | fig.set_figwidth(10) 55 | fig.set_figheight(6) 56 | 57 | formatter = mdates.DateFormatter("%b %d\n%Y") 58 | axb.xaxis.set_major_formatter(formatter) 59 | 60 | # offsets plot 61 | day_freq = odf.resample('D').size() 62 | axa.plot(day_freq, color='#32363A') 63 | axa.xaxis.set_major_formatter(formatter) 64 | axa.xaxis_date() 65 | axa.tick_params(axis='both', which='both', length=0) 66 | axa.set_ylabel('Daily offsets') 67 | if daterange: 68 | axa.set_xlim(pd.Timestamp(daterange[0]), pd.Timestamp(daterange[1])) 69 | 70 | # bursts plot 71 | 72 | days = [day_freq.index[0]] 73 | levels = [0] 74 | 75 | for i in range(1, len(day_freq.index)): 76 | 77 | period_start = odf.resample('D').size().index[i - 1] 78 | period_end = odf.resample('D').size().index[i] 79 | 80 | max_burst = set() 81 | 82 | days.append(period_end) 83 | 84 | for j in range(len(bdf)): 85 | 86 | burst_start = bdf['start'][j] 87 | burst_end = bdf['end'][j] 88 | level = bdf['level'][j] 89 | 90 | if burst_end < period_start or period_end < burst_start : 91 | pass 92 | else: 93 | max_burst.add(level) 94 | 95 | levels.append(max(max_burst)) 96 | 97 | finaldf = pd.DataFrame({"start": days, "level": levels}) 98 | 99 | if lowest_level > 0: 100 | bdf = bdf[bdf['level'] >= lowest_level] 101 | xmin = min(bdf['start']) 102 | xmax = max(bdf['start']) 103 | 104 | if xmin == xmax: 105 | raise Exception("There must be at least two bursts at or above the specified level. 
Try reducing the `lowest_level` parameter.") 106 | 107 | daterange = ((xmin + pd.DateOffset(days=2)).date(), (xmax + pd.DateOffset(days=2)).date()) 108 | 109 | # bursts plot 110 | axb.bar(finaldf['start'], finaldf['level'], color='#32363A', width=1) 111 | 112 | if s != None and gamma != None: 113 | axb.set_ylabel(r'Burst levels (s = {}, $\gamma$ = {})'.format(s, gamma)) 114 | else: 115 | axb.set_ylabel('Burst level') 116 | 117 | axb.tick_params(axis='both', which='both', length=0) 118 | 119 | if daterange: 120 | axb.set_xlim(pd.Timestamp(daterange[0]), pd.Timestamp(daterange[1])) 121 | 122 | fig.tight_layout(rect=[0, 0.03, 1, 0.95]) 123 | 124 | if title is True: 125 | fig.suptitle(f'{svo_title}', fontsize=12, ha='center') 126 | -------------------------------------------------------------------------------- /nate/importers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UWNETLAB/Nate/9029670d5804f478cac2e83d77ff86ff2a7266c2/nate/importers/__init__.py -------------------------------------------------------------------------------- /nate/importers/dataframe_importers.py: -------------------------------------------------------------------------------- 1 | """`Nate` importers involving pandas. 2 | 3 | This module provides common importers for the `Nate` class. They use existing 4 | pandas import functionality as an interface to `Nate`. These importers are the 5 | reccomended way to import data into `Nate`, unless the user needs to import data 6 | in ways not covered by this module's functionality. 7 | """ 8 | 9 | import pandas 10 | from typing import List, Union 11 | from .named_tuple_generator import tupleize 12 | from .nate_class import Nate 13 | from .timestamp_process import convert_times 14 | 15 | 16 | def process_dataframe(temp_data, 17 | text: str, 18 | unique_id: str = None, 19 | time: str = None, 20 | twitter_times: bool = False, 21 | columns_to_keep: List = []): 22 | """Builds a nate object from a dataframe.""" 23 | series_dict = {} 24 | special_column_list = [(text, "text"), (unique_id, "unique_id"), 25 | (time, "times")] 26 | 27 | for special_column, special_column_name in special_column_list: 28 | if special_column != None: 29 | temp_column = temp_data[special_column] 30 | temp_column.name = special_column_name 31 | series_dict[special_column_name] = temp_column.tolist() 32 | 33 | for covariate_column in columns_to_keep: 34 | temp_column = temp_data[covariate_column] 35 | temp_column.name = covariate_column 36 | series_dict[covariate_column] = temp_column.tolist() 37 | 38 | if time != None: 39 | try: 40 | series_dict['time'] = convert_times(series_dict['times']) 41 | del series_dict['times'] 42 | except: 43 | series_dict['time'] = series_dict['times'] 44 | del series_dict['times'] 45 | 46 | return Nate(tupleize(series_dict)) 47 | 48 | 49 | def import_dataframe(input_dataframe: pandas.DataFrame, 50 | text: str, 51 | unique_id: str = None, 52 | time: str = None, 53 | twitter_times: bool = False, 54 | columns_to_keep: List = []): 55 | """Imports a pandas dataframe into nate. 56 | 57 | Args: 58 | input_dataframe (pandas.DataFrame): The dataframe to be loaded. 59 | text (str): The name of the column containing the text data to be 60 | analyzed with nate. Required for all uses of nate. 61 | unique_id (str, optional): The name of the column containing unique 62 | identifiers (e.g. a unique name or hash ID#). Required 63 | for some uses of nate (e.g. Divsim). 
64 | time (str, optional): The name of the column containing the time the 65 | observation was recorded. Required for some uses of 66 | nate (e.g. edge_burst). 67 | columns_to_keep (list, optional): A list of column names indicating 68 | which columns not specified elsewhere (e.g. for the 69 | time parameter) are kept. 70 | 71 | Returns: 72 | Nate: an instance of the `Nate` class containing all data from the 73 | columns specified in the parameters. 74 | 75 | The columns indicated in the text, unique_id, and time parameters will 76 | be renamed to 'text', 'unique_id', and 'time', accordingly. The names 77 | of the columns listed in 'columns_to_keep' will be preserved as-is. 78 | """ 79 | 80 | if time!= None and twitter_times == False: 81 | input_dataframe = input_dataframe.astype({time: 'str'}) 82 | input_dataframe[time] = pandas.to_datetime(input_dataframe[time], infer_datetime_format=True) 83 | return process_dataframe(input_dataframe, text, unique_id, time, twitter_times, 84 | columns_to_keep) 85 | 86 | 87 | def import_csv(file_paths: Union[List, str], 88 | text: str, 89 | unique_id: str = None, 90 | time: str = None, 91 | twitter_times: bool = False, 92 | columns_to_keep: List = [], 93 | observation_threshold=0): 94 | """Imports a comma-separated values file (.csv) into `nate`. 95 | 96 | This function uses pre-existing pandas functionality to read in a 97 | comma-separated value file (.csv) into `nate`. 98 | 99 | Args: 100 | file_path (str or path-like): The location of the file to 101 | be loaded from disk. 102 | text (str): The name of the column containing the text 103 | data to be analyzed with nate. Required for all uses of nate. 104 | unique_id (str, optional): The name of the column containing unique 105 | identifiers (e.g. a unique name or hash ID#). Required for 106 | some uses of nate (e.g. Divsim). 107 | time (str, optional): The name of the column containing the time the 108 | observation was recorded. Required for some uses of nate 109 | (e.g. edgeburst). 110 | columns_to_keep (list, optional): A list of column names indicating 111 | which columns not specified elsewhere (e.g. for the time 112 | parameter) are kept. 113 | observation_threshold (int, optional): An integer indicating how many 114 | observations to include in the imported data, at minimum. 115 | Once the number of rows in the imported dataset exceeds this value, 116 | the importer will not import the next file in the list of 117 | file paths passed to `file_path`. Has no effect if a string 118 | or path-like object is passed to `file_paths`. 119 | 120 | Returns: 121 | Nate: an instance of the `Nate` class containing all data from the 122 | columns specified in the parameters. 123 | 124 | The columns indicated in the text, unique_id, and time parameters will 125 | be renamed to 'text', 'unique_id', and 'time', accordingly. The names of the 126 | columns listed in 'columns_to_keep' will be preserved as-is. 127 | 128 | Note that this function is only equipped to handle pre-processed .csv 129 | files that are ready to be loaded into a pandas dataframe with no 130 | additional manipulation. If the data requires any kind of special 131 | treatment, prudent users will first load their data using pandas 132 | directly into python, and then use the 'import_dataframe' function 133 | to load their data into nate. 
134 | """ 135 | columns_to_import = [*columns_to_keep] 136 | 137 | for special_column in [text, unique_id, time]: 138 | if special_column != None: 139 | columns_to_import.append(special_column) 140 | 141 | dtypes = {} 142 | 143 | if time!= None: 144 | dtypes[time] = "str" 145 | 146 | if isinstance(file_paths, list): 147 | df_list = [] 148 | total_len = 0 149 | for entry in file_paths: 150 | temp_df = pandas.read_csv(entry, usecols=columns_to_import, dtype = dtypes) 151 | df_list.append(temp_df) 152 | 153 | if observation_threshold != 0: 154 | total_len += len(temp_df) 155 | if total_len >= observation_threshold: 156 | break 157 | 158 | temp_data = pandas.concat(df_list) 159 | 160 | elif isinstance(file_paths, str): 161 | 162 | temp_data = pandas.read_csv(file_paths, usecols=columns_to_import, dtype = dtypes) 163 | 164 | else: 165 | raise TypeError("file_paths must be either string or list of strings") 166 | 167 | if time!= None and twitter_times == False: 168 | temp_data = temp_data.astype({time: 'str'}) 169 | temp_data[time] = pandas.to_datetime(temp_data[time], infer_datetime_format=True) 170 | 171 | return process_dataframe(temp_data, text, unique_id, time, twitter_times, columns_to_keep) 172 | 173 | 174 | def import_excel(file_paths: Union[List, str], 175 | text: str, 176 | unique_id: str = None, 177 | time: str = None, 178 | twitter_times: bool = False, 179 | columns_to_keep: List = [], 180 | observation_threshold=0): 181 | """Imports an excel file (.xlsx) into nate. 182 | 183 | This function uses pre-existing pandas functionality to read in an excel 184 | file (.xlsx) into nate. 185 | 186 | Args: 187 | file_path (str or path-like): The location of the file to be 188 | loaded from disk. 189 | text (str): The name of the column containing the text data 190 | to be analyzed with nate. Required for all uses of nate. 191 | unique_id (str, optional): The name of the column containing unique 192 | identifiers (e.g. a unique name or hash ID#). Required for 193 | some uses of Nate. 194 | time (str, optional): The name of the column containing the time the 195 | observation was recorded. Required for some uses of nate 196 | (e.g. edge_burst). 197 | columns_to_keep (list, optional): A list of column names indicating 198 | which columns not specified elsewhere (e.g. for the time 199 | parameter) are kept. 200 | observation_threshold (int, optional): An integer indicating how many 201 | observations to include in the imported data, at minimum. Once 202 | the number of rows in the imported dataset exceeds this value, 203 | the importer will not import the next file in the list of file 204 | paths passed to `file_path`. Has no effect if a string or 205 | path-like object is passed to `file_paths`. 206 | 207 | Returns: 208 | A `Nate` object containing all data from the columns specified in 209 | the parameters. 210 | 211 | The columns indicated in the text, unique_id, and time parameters will be 212 | renamed to 'text', 'unique_id', and 'time', accordingly. The names of the 213 | columns listed in 'columns_to_keep' will be preserved as-is. 214 | 215 | Note that this function is only equipped to handle pre-processed .xlsx 216 | files that are ready to be loaded into a pandas dataframe with no 217 | additional manipulation. If the data requires any kind of special 218 | treatment, prudent users will first load their data using pandas 219 | directly into python, and then use the 'import_dataframe' function to 220 | load their data into nate. 
221 | """ 222 | columns_to_import = [*columns_to_keep] 223 | 224 | for special_column in [text, unique_id, time]: 225 | if special_column != None: 226 | columns_to_import.append(special_column) 227 | 228 | dtypes = {} 229 | 230 | if time!= None: 231 | dtypes[time] = "str" 232 | 233 | print(columns_to_import) 234 | print(columns_to_keep) 235 | 236 | if isinstance(file_paths, list): 237 | df_list = [] 238 | total_len = 0 239 | for entry in file_paths: 240 | temp_df = pandas.read_excel(entry, usecols=columns_to_import, dtype = dtypes) 241 | df_list.append(temp_df) 242 | 243 | if observation_threshold != 0: 244 | total_len += len(temp_df) 245 | if total_len >= observation_threshold: 246 | break 247 | 248 | temp_data = pandas.concat(df_list) 249 | 250 | elif isinstance(file_paths, str): 251 | 252 | temp_data = pandas.read_excel(file_paths, usecols=columns_to_import, dtype = dtypes) 253 | 254 | else: 255 | raise TypeError("file_paths must be either string or list of strings") 256 | 257 | if time!= None and twitter_times == False: 258 | temp_data = temp_data.astype({time: 'str'}) 259 | temp_data[time] = pandas.to_datetime(temp_data[time], infer_datetime_format=True) 260 | 261 | return process_dataframe(temp_data, text, unique_id, time, twitter_times, columns_to_keep) 262 | -------------------------------------------------------------------------------- /nate/importers/edgelist_importers.py: -------------------------------------------------------------------------------- 1 | """`Nate` importers for edgelists.""" 2 | 3 | import pandas 4 | from .named_tuple_generator import tupleize 5 | 6 | 7 | def process_edgelist(temp_data, From, To, Weight=None): 8 | """Turns an edgelist in a dataframe into a list of NamedTuples.""" 9 | 10 | series_dict = {} 11 | 12 | special_column_list = [(From, "From"), (To, "To"), (Weight, "Weight")] 13 | 14 | for special_column, special_column_name in special_column_list: 15 | if special_column != None: 16 | temp_column = temp_data[special_column] 17 | temp_column.name = special_column_name 18 | series_dict[special_column_name] = temp_column.tolist() 19 | 20 | return tupleize(series_dict, "edge") 21 | 22 | 23 | class EdgelistMixin(): 24 | """Provides edgelist functionality to objects in nate.""" 25 | 26 | def add_edges_from_csv(self, file_path, From, To, Weight=None): 27 | """Imports an edgelist from a .csv file into `nate`. 28 | 29 | This function sets the self.edgelist attribute to a list of 30 | NamedTuples, with each tuple representing one edge. 31 | 32 | Args: 33 | file_path (str): The location of the file to be loaded from disk. 34 | From (str): The name of the column containing the origin of 35 | the edge. 36 | To (str): The name of the column containing the destination of 37 | the edge. 38 | Weight (str, optional): The column containing the edge's weight. 39 | 40 | Note that the capitalized arguments are a result of 'from' being a 41 | reserved keyword in Python. 42 | """ 43 | 44 | col_list = [From, To] 45 | 46 | if Weight != None: 47 | col_list.append(Weight) 48 | 49 | temp_data = pandas.read_csv(file_path, usecols=col_list) 50 | 51 | self.edgelist = process_edgelist(temp_data, 52 | From=From, 53 | To=To, 54 | Weight=Weight) 55 | 56 | def add_edges_from_dataframe(self, dataframe, From, To, Weight=None): 57 | """Imports an edgelist from a dataframe into `nate`. 58 | 59 | This function sets the self.edgelist attribute to a list of 60 | NamedTuples, with each tuple representing one edge. 
61 | 62 | Args: 63 | dataframe (pandas.Dataframe): The dataframe from which to extract 64 | the edgelist. 65 | From (str): The name of the column containing the origin of 66 | the edge. 67 | To (str): The name of the column containing the destination of 68 | the edge. 69 | Weight (str, optional): The column containing the edge's weight. 70 | 71 | Note that the capitalized arguments are a result of 'from' being a 72 | reserved keyword in Python. 73 | """ 74 | self.edgelist = process_edgelist(dataframe, 75 | From=From, 76 | To=To, 77 | Weight=Weight) 78 | -------------------------------------------------------------------------------- /nate/importers/named_tuple_generator.py: -------------------------------------------------------------------------------- 1 | """Implements extra NamedTuple functionality.""" 2 | 3 | from collections import namedtuple 4 | from typing import List, NamedTuple 5 | 6 | 7 | def define_named_tuple(observation_name, attribute_names: List[str]): 8 | """Creates a new subclass of NamedTuple.""" 9 | output_tuple = namedtuple(observation_name, attribute_names) 10 | 11 | return output_tuple 12 | 13 | 14 | def create_observation_list(observation_name: str, 15 | **kwargs) -> List[NamedTuple]: 16 | """Creates an observation list of NamedTuples. 17 | 18 | This function builds a new NamedTuple type from the lists passed as 19 | kwargs, with each field given the name of the keyword it was passed with. 20 | 21 | This function requires that all lists passed as kwargs are the same length. 22 | 23 | Args: 24 | observation_name (str): The name given to the new NamedTuple type. 25 | **kwargs: Lists containing data for each observation. The keyword 26 | passed with each list will become the name of that field in the 27 | resulting NamedTuple type. 28 | 29 | Returns: 30 | List[NamedTuple]: A list of NamedTuples, with each tuple corresponding 31 | to one observation. 32 | 33 | Raises: 34 | Exception: If the lists passed as kwargs are not the same length. 
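    Example (illustrative):

        >>> create_observation_list("obs", text=["a", "b"], time=[10, 20])
        [obs(text='a', time=10), obs(text='b', time=20)]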
35 | """ 36 | custom_named_tuple = define_named_tuple(observation_name, 37 | list(kwargs.keys())) 38 | 39 | #Length check: all of the lists fed in MUST be of the same length 40 | 41 | arg_lengths = [len(arg) for arg in kwargs.values()] 42 | arg_length = set(arg_lengths) 43 | 44 | if len(arg_length) != 1: 45 | raise Exception("Not all of the input data is the same length.") 46 | 47 | observation_list = [] 48 | 49 | for i in range(0, arg_length.pop()): 50 | 51 | variables = [] 52 | 53 | for arg in kwargs: 54 | variables.append(kwargs[arg][i]) 55 | 56 | observation_list.append(custom_named_tuple(*variables)) 57 | 58 | return observation_list 59 | 60 | 61 | def tupleize(series_dict, tuple_name="obs"): 62 | """Creates an observation list of NamedTuples.""" 63 | kwarg_dict = {} 64 | 65 | keys = [i for i in series_dict.keys()] 66 | 67 | for i in range(0, len(keys)): 68 | kwarg_dict[keys[i]] = list(series_dict[keys[i]]) 69 | 70 | return create_observation_list(tuple_name, **kwarg_dict) 71 | -------------------------------------------------------------------------------- /nate/importers/raw_importers.py: -------------------------------------------------------------------------------- 1 | """Import text, and only text, directly into `Nate`.""" 2 | 3 | from typing import List, Union 4 | from .named_tuple_generator import define_named_tuple 5 | from .nate_class import Nate 6 | from .timestamp_process import convert_time 7 | 8 | text_only_namedtuple = define_named_tuple('obs', ['text']) 9 | 10 | 11 | def import_text(strings): 12 | """Directly imports a string (or a list of strings) into `nate`. 13 | 14 | Args: 15 | strings (Union(str, List[str])): A string or a list of strings. 16 | 17 | Returns: 18 | Nate: An instance of the `Nate` class. 19 | """ 20 | if isinstance(strings, str): 21 | strings = [strings] 22 | 23 | return Nate([text_only_namedtuple(string) for string in strings]) 24 | 25 | 26 | def import_files(files): 27 | """Directly imports a text file (or list of text files) into `nate`. 28 | 29 | Args: 30 | files (Union(str, List[str])): A filename or list of filenames to be 31 | loaded from disk. 32 | 33 | Returns: 34 | Nate: A `Nate` object containing only the text data given. 35 | """ 36 | if isinstance(files, str): 37 | files = [files] 38 | 39 | obs_list = [] 40 | 41 | for filepath in files: 42 | with open(filepath, 'r', encoding='utf-8') as stream: 43 | obs_list.append( 44 | text_only_namedtuple(stream.read().replace('\n', ' '))) 45 | 46 | return Nate(obs_list) 47 | 48 | 49 | def import_dict_of_dicts(dictionary, text, time=None, values_to_keep=[]): 50 | """Imports a dict of dicts into `nate`. 51 | 52 | Args: 53 | dictionary (Dict): A dict of dicts, with the keys of the outer dict 54 | corresponding to unique observation ids. 55 | text (str): The name of the text entry in each inner dict. 56 | time (str, optional): The name of the time entry in each inner dict. 57 | values_to_keep (List[str], optional): A list of keys which appear in 58 | all inner dicts. The values will be kept in the resulting `Nate` 59 | object. 60 | 61 | Returns: 62 | Nate: An instance of the `Nate` class. 
63 | """ 64 | 65 | lookup_list = [text] 66 | named_list = ['unique_id', 'text'] 67 | 68 | if time != None: 69 | lookup_list.append(time) 70 | named_list.append('time') 71 | 72 | lookup_list.extend(values_to_keep) 73 | named_list.extend(values_to_keep) 74 | 75 | dict_namedtuple = define_named_tuple('obs', named_list) 76 | 77 | obs_list = [] 78 | 79 | for key, subdict in dictionary.items(): 80 | filtered_values = [] 81 | for value in lookup_list: 82 | 83 | value_to_append = subdict[value] 84 | 85 | if value == 'time': 86 | value_to_append = convert_time(value_to_append) 87 | 88 | filtered_values.append(value_to_append) 89 | 90 | obs_list.append(dict_namedtuple(key, *filtered_values)) 91 | 92 | return Nate(obs_list) 93 | 94 | 95 | def import_lists(text: List, time: List = None, unique_id: List = None): 96 | """Imports a number of list into `nate`. 97 | 98 | [Note: it might be a good idea to add a **kwargs parameter so that 99 | users can pass arbitrary other lists, similar to values_to_keep above.] 100 | 101 | Args: 102 | text (List): A list of strings. 103 | time (List, optional): A list containing the times each observation 104 | was recorded. 105 | unique_id (List, optional): The list containing unique 106 | identifiers (e.g. a unique name or hash ID#). 107 | 108 | Returns: 109 | Nate: An instance of the `Nate` class. 110 | """ 111 | pass 112 | -------------------------------------------------------------------------------- /nate/importers/timestamp_process.py: -------------------------------------------------------------------------------- 1 | """ 2 | Process and reformat times for consistency across Nate. 3 | """ 4 | from datetime import datetime, timezone 5 | 6 | 7 | def convert_times(times, timezone=timezone.utc): 8 | """Convert all times to POSIX timestamps.""" 9 | timestamps = [] 10 | 11 | for time in times: 12 | dt = datetime.strptime(time, "%m/%d/%Y %H:%M") 13 | timestamps.append(int(dt.replace(tzinfo=timezone).timestamp())) 14 | 15 | return timestamps 16 | 17 | 18 | def convert_time(time, timezone=timezone.utc): 19 | """Convert a single time to POSIX timestamp.""" 20 | dt = datetime.strptime(time, "%m/%d/%Y %H:%M") 21 | return int(dt.replace(tzinfo=timezone).timestamp()) 22 | -------------------------------------------------------------------------------- /nate/netplus/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UWNETLAB/Nate/9029670d5804f478cac2e83d77ff86ff2a7266c2/nate/netplus/__init__.py -------------------------------------------------------------------------------- /nate/netplus/netplus.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a MODULE docstring 3 | """ 4 | 5 | # Coming soon... 6 | -------------------------------------------------------------------------------- /nate/semnet/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UWNETLAB/Nate/9029670d5804f478cac2e83d77ff86ff2a7266c2/nate/semnet/__init__.py -------------------------------------------------------------------------------- /nate/semnet/semnet.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a MODULE docstring 3 | """ 4 | 5 | # Coming soon... 
6 | -------------------------------------------------------------------------------- /nate/socnet/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UWNETLAB/Nate/9029670d5804f478cac2e83d77ff86ff2a7266c2/nate/socnet/__init__.py -------------------------------------------------------------------------------- /nate/socnet/alters.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a MODULE docstring 3 | """ 4 | 5 | import pandas as pd 6 | import networkx as nx 7 | from collections import namedtuple 8 | 9 | alter_tuple = namedtuple('alters', ['vertex', 'betweenness', 'closeness', 'eigenvector']) 10 | 11 | 12 | def find_alters(edgelist) -> dict: 13 | G = nx.Graph() 14 | 15 | G = nx.from_pandas_edgelist(pd.DataFrame(edgelist, columns = ['From', 'To']), source='From', target='To') 16 | 17 | authorlist = [entry.From for entry in edgelist] 18 | authorlist.extend([entry.To for entry in edgelist]) 19 | author_dict = {item: [] for item in set(authorlist)} 20 | 21 | for author in author_dict: 22 | alter_list = list(G.neighbors(author)) 23 | alter_2_list = [] 24 | for alter in alter_list: 25 | alters_2 = list(G.neighbors(alter)) 26 | alter_2_list.extend(alters_2) 27 | 28 | alter_list = list(set(alter_list)) 29 | alter_2_list = list(set(alter_2_list)) 30 | 31 | alter_2_list.remove(author) 32 | for alter in alter_list: 33 | if alter in alter_2_list: 34 | alter_2_list.remove(alter) 35 | 36 | author_dict[author] = [alter_list, alter_2_list] 37 | 38 | return author_dict 39 | -------------------------------------------------------------------------------- /nate/socnet/centralities.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a MODULE docstring 3 | """ 4 | from importlib.util import find_spec 5 | import igraph 6 | from collections import namedtuple 7 | 8 | cent_tuple = namedtuple('centralities', ['vertex', 'betweenness', 'closeness', 'eigenvector']) 9 | 10 | def compute_centralities(tuples, force_igraph = False): 11 | """ 12 | This is a docstring 13 | """ 14 | 15 | if find_spec('graph_tool') != None and force_igraph == False: 16 | print("using graph-tool") 17 | return gt_cents(tuples) 18 | 19 | elif find_spec('igraph') != None: 20 | print("using igraph") 21 | return igraph_cents(tuples) 22 | 23 | else: 24 | raise Exception("Please ensure that either graph_tool or python_igraph are installed.") 25 | 26 | def gt_cents(tuples): 27 | """ 28 | This is a docstring 29 | """ 30 | author_lookup = {} 31 | author_number = 0 32 | 33 | for entry in tuples: 34 | for i in range(2): 35 | if entry[i] not in author_lookup: 36 | author_lookup[entry[i]] = author_number 37 | author_number += 1 38 | 39 | import graph_tool as gt 40 | from graph_tool.centrality import betweenness, closeness, eigenvector 41 | 42 | G = gt.Graph(directed=False) 43 | 44 | for edge in tuples: 45 | G.add_edge(author_lookup[edge[0]], author_lookup[edge[1]], add_missing=True) 46 | 47 | betweenness_vertex, _ = betweenness(G) 48 | closeness_vertex = closeness(G) 49 | _, eigenvector_vertex = eigenvector(G) 50 | 51 | return_list = [] 52 | 53 | for k, v in author_lookup.items(): 54 | cent = cent_tuple( 55 | k, 56 | betweenness_vertex[v], 57 | closeness_vertex[v], 58 | eigenvector_vertex[v] 59 | ) 60 | return_list.append(cent) 61 | 62 | return return_list 63 | 64 | def igraph_cents(tuples): 65 | """ 66 | This is a docstring 67 | """ 68 | G = igraph.Graph.TupleList(tuples, 
directed = False) 69 | 70 | vertex_list = G.vs() 71 | between_list = G.betweenness(directed=False) 72 | close_list = G.closeness(normalized=True) 73 | eigen_list = G.eigenvector_centrality(directed=False, scale=True) 74 | 75 | return_list = [] 76 | 77 | for i in range(len(vertex_list)): 78 | cent = cent_tuple( 79 | vertex_list[i]['name'], 80 | between_list[i], 81 | close_list[i], 82 | eigen_list[i] 83 | ) 84 | return_list.append(cent) 85 | 86 | return return_list 87 | -------------------------------------------------------------------------------- /nate/socnet/dissimilarities.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | def find_dissimilarities(): 5 | 6 | return -------------------------------------------------------------------------------- /nate/socnet/old_temsna/combine_covariates.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import metaknowledge as mk 3 | import networkx as nx 4 | import pickle 5 | 6 | node_covariates = pd.read_csv("../input/node_covariates.csv") 7 | sim_scores = pd.read_csv("../input/sim_scores.csv") 8 | centralities = pd.read_csv("../input/centralities.csv") 9 | 10 | node_covariates.rename(columns={'Unnamed: 0': 'author'}, inplace=True) 11 | 12 | node_covariates.head() 13 | 14 | sim_scores.head() 15 | 16 | sim_scores = sim_scores.merge(node_covariates, 17 | left_on="author", 18 | right_on="author") 19 | 20 | sim_scores.columns = [ 21 | "author", "dissim_alters", "dissim_alters_2", "alter_dissim_avg", 22 | "bridge_dissim_avg", "first_ring_dissim_avg", "num_citations", "num_papers", 23 | "career_start", "num_alter1", "num_alter2" 24 | ] 25 | 26 | sim_scores = sim_scores.merge(centralities, left_on="author", right_on="author") 27 | 28 | sim_scores.to_csv("../output/author_covariates.csv", index=False) 29 | -------------------------------------------------------------------------------- /nate/socnet/old_temsna/create_author_covariates.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pickle 3 | 4 | alter_list = pd.read_pickle("../input/alter_lists.pkl") 5 | alter_list = alter_list.set_index('author').to_dict() 6 | nodes = pd.read_csv("../input/coauthorship_nodeAttributes.csv") 7 | with open("../input/author_metadata.pkl", "rb") as pkl: 8 | author_metadata = pickle.load(pkl) 9 | 10 | num_citations = {} 11 | num_papers = {} 12 | career_start = {} 13 | num_alter1 = {} 14 | num_alter2 = {} 15 | 16 | for author in author_metadata: 17 | num_citations[author] = author_metadata[author]["wosTimesCited"] 18 | num_papers[author] = author_metadata[author]["num_papers"] = len( 19 | author_metadata[author]["wosString"]) 20 | author_metadata[author]["year"] = list( 21 | filter(None, author_metadata[author]["year"])) 22 | try: 23 | career_start[author] = min( 24 | [int(i) for i in author_metadata[author]["year"]]) 25 | except ValueError: 26 | career_start[author] = 2018 27 | try: 28 | num_alter1[author] = len(alter_list['alter'][author]) 29 | except KeyError: 30 | pass 31 | try: 32 | num_alter2[author] = len(alter_list['alter_2'][author]) 33 | except KeyError: 34 | pass 35 | 36 | covariates = pd.DataFrame.from_dict(num_citations, orient='index') 37 | 38 | covariates['num_citations'] = pd.Series(num_citations) 39 | covariates['num_papers'] = pd.Series(num_papers) 40 | covariates['career_start'] = pd.Series(career_start) 41 | covariates['num_alter1'] = pd.Series(num_alter1) 42 | 
covariates['num_alter2'] = pd.Series(num_alter2) 43 | 44 | covariates = covariates.drop(columns=[0]) 45 | 46 | covariates.to_csv("../output/node_covariates.csv") 47 | -------------------------------------------------------------------------------- /nate/socnet/old_temsna/extract_coauthor.py: -------------------------------------------------------------------------------- 1 | import metaknowledge as mk 2 | import pandas as pd 3 | import pickle 4 | import community 5 | import networkx as nx 6 | import yaml 7 | # Web of Science Field Codes 8 | # AF (Full Author) 9 | # TI (Title) 10 | # ID (WoS Keywords) 11 | # DE (Author keywords) 12 | # AB (Abstracts) 13 | # TC (Times Cited) 14 | # PY (Year Published) 15 | 16 | RCY = mk.RecordCollection('../input/', cached=False) 17 | 18 | RC = RCY.yearSplit(2008, 2019) 19 | 20 | coauth = RC.networkCoAuthor() 21 | mk.writeGraph(coauth, 'coauthorship') 22 | 23 | wos_dict = RC.makeDict( 24 | onlyTheseTags=["UT", "AF", "AU", "TI", "ID", "DE", "AB", "TC", "SO", "PY"], 25 | longNames=True, 26 | numAuthors=False, 27 | genderCounts=False) 28 | 29 | author_dict = {} 30 | 31 | abs_dict = {} 32 | 33 | cites_dict = {} 34 | 35 | for i in range(0, len(wos_dict['wosString'])): 36 | wosID = wos_dict['wosString'][i] 37 | 38 | try: 39 | abs_dict[wosID] = { 40 | "abstract": wos_dict['abstract'][i], 41 | "title": wos_dict['title'][i], 42 | "keywords": [], 43 | } 44 | 45 | cites_dict[wosID] = { 46 | "cites": wos_dict['wosTimesCited'][i], 47 | "year": wos_dict['year'][i], 48 | } 49 | 50 | abs_keywords = [] 51 | try: 52 | abs_keywords.extend(wos_dict['keywords'][i]) 53 | except TypeError: 54 | pass 55 | 56 | try: 57 | abs_keywords.extend(wos_dict['authKeywords'][i]) 58 | except TypeError: 59 | pass 60 | 61 | abs_dict[wosID]['keywords'] = list(set(x.lower() for x in abs_keywords)) 62 | 63 | except TypeError: 64 | pass 65 | 66 | try: 67 | for author in wos_dict['authorsFull'][i]: 68 | if author in coauth: 69 | if author not in author_dict: 70 | author_dict[author] = { 71 | "wosString": [], 72 | "title": [], 73 | "keywords": [], 74 | "abstract": [], 75 | "wosTimesCited": 0, 76 | "journal": [], 77 | "year": [], 78 | "community": 0, 79 | } 80 | 81 | combined_keywords = [] 82 | combined_keywords2 = [] 83 | try: 84 | combined_keywords.extend(wos_dict["keywords"][i]) 85 | except TypeError: 86 | pass 87 | try: 88 | combined_keywords.extend(wos_dict["authKeywords"][i]) 89 | except TypeError: 90 | pass 91 | 92 | for keyword in combined_keywords: 93 | combined_keywords2.append(keyword.lower()) 94 | 95 | combined_keywords2 = list(set(combined_keywords2)) 96 | 97 | author_dict[author]["wosString"].append( 98 | wos_dict["wosString"][i]) 99 | author_dict[author]["title"].append(wos_dict["title"][i]) 100 | author_dict[author]["keywords"] = combined_keywords2 101 | author_dict[author]["abstract"].append(wos_dict["abstract"][i]) 102 | author_dict[author]["wosTimesCited"] += ( 103 | wos_dict["wosTimesCited"][i]) 104 | author_dict[author]["journal"].append(wos_dict["journal"][i]) 105 | author_dict[author]["year"].append(wos_dict["year"][i]) 106 | except TypeError: 107 | pass 108 | 109 | with open("author_metadata.pkl", "wb") as handle: 110 | pickle.dump(author_dict, handle) 111 | 112 | with open("comm_abs.pkl", "wb") as handle: 113 | pickle.dump(abs_dict, handle) 114 | 115 | with open("cites_dict.yaml", "w") as stream: 116 | yaml.dump(cites_dict, stream, default_flow_style=False) 117 | -------------------------------------------------------------------------------- 
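# Illustrative sketch of the author_metadata dictionary that extract_coauthor.py
# (above) pickles and generate_meta_strings.py (below) reads. The field names
# come from the script; the author key and values here are hypothetical:
#
#     author_metadata["Doe, Jane"] = {
#         "wosString": ["WOS:000123456789"],
#         "title": ["An Example Title"],
#         "keywords": ["network analysis", "text"],
#         "abstract": ["An example abstract ..."],
#         "wosTimesCited": 12,
#         "journal": ["An Example Journal"],
#         "year": ["2015"],
#         "community": 0,
#     }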
/nate/socnet/old_temsna/generate_meta_strings.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pickle 3 | 4 | with open('../input/author_metadata.pkl', "rb") as pkl: 5 | author_dict = pickle.load(pkl) 6 | 7 | meta_string_dict = {} 8 | 9 | for author in author_dict: 10 | 11 | meta_string = "" 12 | journals = "" 13 | 14 | try: 15 | for title in author_dict[author]["title"]: 16 | meta_string = meta_string + title + " " 17 | except TypeError: 18 | pass 19 | try: 20 | for keyword in author_dict[author]["keywords"]: 21 | meta_string = meta_string + keyword + " " 22 | except TypeError: 23 | pass 24 | try: 25 | for abstract in author_dict[author]["abstract"]: 26 | meta_string = meta_string + abstract + " " 27 | except TypeError: 28 | pass 29 | try: 30 | for journal in author_dict[author]["journal"]: 31 | journals = journals + journal + " " 32 | except TypeError: 33 | pass 34 | 35 | meta_string_dict[author] = { 36 | "meta_string": meta_string, 37 | "journals": journals, 38 | } 39 | 40 | with open("../output/generated_meta_strings.pkl", "wb") as pkl: 41 | pickle.dump(meta_string_dict, pkl) 42 | -------------------------------------------------------------------------------- /nate/socnet/old_temsna/spacy_process/spacy_new.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import os 4 | import spacy 5 | import pickle 6 | from joblib import dump, load, Parallel, delayed, cpu_count 7 | from joblib import parallel_backend 8 | import warnings 9 | warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim') 10 | from gensim.models.phrases import Phrases, Phraser 11 | from sklearn.feature_extraction.text import TfidfVectorizer 12 | from sklearn.metrics.pairwise import cosine_similarity 13 | from toolz import partition_all 14 | import itertools 15 | import time 16 | from tqdm import tqdm 17 | from gensim.utils import simple_preprocess 18 | 19 | from scipy.sparse import vstack 20 | 21 | from numpy.lib.stride_tricks import as_strided # for removing the diagonal (self-self comparison) in a matrix 22 | 23 | from sklearn.metrics.pairwise import linear_kernel # equal to cosine_similarity for L2 normalized data 24 | 25 | from sklearn import preprocessing 26 | 27 | from yaml import load 28 | 29 | 30 | def mp(items, function, cpu, *args): 31 | batch_size = round( 32 | len(items) / 33 | cpu) # split the list of items so that each CPU receives one batch 34 | partitions = partition_all(batch_size, items) 35 | temp = Parallel(n_jobs=cpu, max_nbytes=None)(delayed(function)( 36 | v, *args) for v in partitions) #executes the function on each batch 37 | results = list( 38 | itertools.chain(*temp) 39 | ) # joblib.delayed returns a list of lists (ie. 
list of each batch result), concatenate them 40 | return results 41 | 42 | 43 | # same as above, but when 2 lists of results are needed 44 | def mp2(items, function, cpu, *args): 45 | batch_size = round(len(items) / cpu) 46 | partitions = partition_all(batch_size, items) 47 | temp = Parallel(n_jobs=cpu, max_nbytes=None)( 48 | delayed(function)(v, *args) for v in partitions) 49 | results1, results2 = zip(*temp) 50 | results1 = list(itertools.chain(*results1)) 51 | results2 = list(itertools.chain(*results2)) 52 | return results1, results2 53 | 54 | 55 | # ibid 56 | def mp3(items, function, cpu, *args): 57 | batch_size = round(len(items) / cpu) 58 | partitions = partition_all(batch_size, items) 59 | temp = Parallel(n_jobs=cpu, max_nbytes=None)( 60 | delayed(function)(v, *args) for v in partitions) 61 | results1, results2, results3 = zip(*temp) 62 | results1 = list(itertools.chain(*results1)) 63 | results2 = list(itertools.chain(*results2)) 64 | results3 = list(itertools.chain(*results3)) 65 | return results1, results2, results3 66 | 67 | 68 | def mp_shared(items, function, cpu, *args): 69 | batch_size = round( 70 | len(items) / 71 | cpu) # split the list of items so that each CPU receives one batch 72 | partitions = partition_all(batch_size, items) 73 | temp = Parallel(n_jobs=cpu, require='sharedmem', max_nbytes=None)( 74 | delayed(function)(v, *args) 75 | for v in partitions) #executes the function on each batch 76 | results = list( 77 | itertools.chain(*temp) 78 | ) # joblib.delayed returns a list of lists (ie. list of each batch result), concatenate them 79 | return results 80 | 81 | 82 | def mp2_shared(items, function, cpu, *args): 83 | batch_size = round(len(items) / cpu) 84 | partitions = partition_all(batch_size, items) 85 | temp = Parallel(n_jobs=cpu, require='sharedmem', max_nbytes=None)( 86 | delayed(function)(v, *args) for v in partitions) 87 | results1, results2 = zip(*temp) 88 | results1 = list(itertools.chain(*results1)) 89 | results2 = list(itertools.chain(*results2)) 90 | return results1, results2 91 | 92 | 93 | def mp3_shared(items, function, cpu, *args): 94 | batch_size = round(len(items) / cpu) 95 | partitions = partition_all(batch_size, items) 96 | temp = Parallel(n_jobs=cpu, require='sharedmem', max_nbytes=None)( 97 | delayed(function)(v, *args) for v in partitions) 98 | results1, results2, results3 = zip(*temp) 99 | results1 = list(itertools.chain(*results1)) 100 | results2 = list(itertools.chain(*results2)) 101 | results3 = list(itertools.chain(*results3)) 102 | return results1, results2, results3 103 | 104 | 105 | def dissim_rba(auth_list, auth_alt_dict, auth_alt_dict_2, auth_vectors): 106 | rb_avg_dissims = [] 107 | ring_avg_dissims = [] 108 | bridge_avg_dissims = [] 109 | for batch_list in batch(auth_list, 50): 110 | comp_list = [] 111 | for author in batch_list: 112 | comp_list += [author] 113 | comp_list += auth_alt_dict[author] 114 | comp_list += auth_alt_dict_2[author] 115 | comp_list = sorted(list(set(comp_list))) 116 | comp_dict = {k: v for v, k in enumerate(comp_list)} 117 | comp_vectors = [] 118 | for member in comp_list: 119 | comp_vectors.append(auth_vectors[member]) 120 | v_array = vstack(comp_vectors) 121 | dissim_matrix = v_array @ v_array.T 122 | dissim_matrix = dissim_matrix.todense() 123 | 124 | for author in batch_list: 125 | 126 | rb_dissims = [] 127 | ring_dissims = [] 128 | bridge_dissims = [] 129 | if len(auth_alt_dict[author]) > 0: 130 | alter_list = auth_alt_dict[author] 131 | 132 | for alter in alter_list: 133 | if len(auth_alt_dict[alter]) > 
1: 134 | alter_2_list = auth_alt_dict[alter] 135 | ring_list = list_common(alter_list, alter_2_list) 136 | bridge_list = list_difference(alter_2_list, alter_list) 137 | alter_2_list_trim = [ 138 | x for x in alter_2_list if x != author 139 | ] 140 | bridge_list_trim = [ 141 | x for x in bridge_list if x != author 142 | ] 143 | if len(alter_2_list_trim) > 0: 144 | alter_dissim = create_average_dissim( 145 | alter, alter_2_list_trim, comp_dict, 146 | dissim_matrix) 147 | rb_dissims.append(1 - alter_dissim) 148 | if len(ring_list) > 0: 149 | alter_dissim = create_average_dissim( 150 | alter, ring_list, comp_dict, dissim_matrix) 151 | ring_dissims.append(1 - alter_dissim) 152 | if len(bridge_list_trim) > 0: 153 | alter_dissim = create_average_dissim( 154 | alter, bridge_list_trim, comp_dict, 155 | dissim_matrix) 156 | bridge_dissims.append(1 - alter_dissim) 157 | 158 | if len(rb_dissims) > 0: 159 | rb_avg_dissims.append(np.round(np.average(rb_dissims), 3)) 160 | else: 161 | rb_avg_dissims.append('NA') 162 | 163 | if len(ring_dissims) > 0: 164 | ring_avg_dissims.append(np.round(np.average(ring_dissims), 3)) 165 | else: 166 | ring_avg_dissims.append('NA') 167 | 168 | if len(bridge_dissims) > 0: 169 | bridge_avg_dissims.append( 170 | np.round(np.average(bridge_dissims), 3)) 171 | else: 172 | bridge_avg_dissims.append('NA') 173 | 174 | return (rb_avg_dissims, ring_avg_dissims, bridge_avg_dissims) 175 | 176 | 177 | def group_avg_dissim(members, vectors): 178 | member_vectors = [] 179 | for member in members: 180 | member_vectors.append(vectors[member]) 181 | v_array = vstack(member_vectors) 182 | group_dissim = 1 - linear_kernel(v_array) 183 | m = group_dissim.shape[0] 184 | s0, s1 = group_dissim.strides 185 | dissim_avg = np.round( 186 | np.average( 187 | as_strided(group_dissim.ravel()[1:], 188 | shape=(m - 1, m), 189 | strides=(s0 + s1, s1)).reshape(m, -1)), 3) 190 | 191 | return dissim_avg 192 | 193 | 194 | # perform NLP on a list of texts, requires NLP object from main() function (note for future work: NLP object can't be pickled using 195 | # python's pickle module (fast), so there may be performance gains possible by sorting this out re: disabling Loky in mp() functions) 196 | def spacy_process(texts, nlp): 197 | processed_list = [] 198 | copyright_stops = ['elsevier', 'right', 'rights', '(c)', 199 | 'ltd'] # domain specific stop words to remove 200 | allowed_postags = ['NOUN', 'PROPN'] # parts of speech to keep 201 | for doc in nlp.pipe( 202 | texts 203 | ): # nlp.pipe sends texts to spacy_process in batches for efficiency. 
Default is 128 (should experiment) 204 | processed = [] 205 | for token in doc: 206 | if token.is_stop == False and len( 207 | token) > 1: # don't bother with single char tokens 208 | if token.text not in copyright_stops and token.pos_ in allowed_postags: 209 | processed.append( 210 | token.lemma_ 211 | ) # keeping lemmatized version of each NOUN and PROPN 212 | processed = ' '.join( 213 | processed 214 | ) # concat the tokens of the document with whitespace between 215 | processed_list.append( 216 | processed 217 | ) # add the doc's processed words to the list of processed documents 218 | return processed_list 219 | 220 | 221 | # same as above, but with a small batch size for memory constraints 222 | def spacy_process_large(texts, nlp): 223 | processed_list = [] 224 | copyright_stops = ['elsevier', 'right', 'rights', '(c)', 'ltd'] 225 | allowed_postags = ['NOUN', 'PROPN'] 226 | for doc in nlp.pipe(texts, batch_size=1): 227 | processed = [] 228 | for token in doc: 229 | if token.is_stop == False and len(token) > 1: 230 | if token.text not in copyright_stops and token.pos_ in allowed_postags: 231 | processed.append(token.lemma_) 232 | processed = ' '.join(processed) 233 | processed_list.append(processed) 234 | return processed_list 235 | 236 | 237 | # bigram detection on a list of texts using sklearn's Phrases module. Note: test whether creating trigrams is as simple as calling 238 | # this process on the text again 239 | def bigram_process(texts): 240 | words = [ 241 | simple_preprocess(x, deacc=False) for x in texts 242 | ] # very efficient preprocessing into tokens based on white space only 243 | phrases = Phrases(words, min_count=1, threshold=0.8, 244 | scoring='npmi') # bigram model training 245 | bigram = Phraser( 246 | phrases) # creates a leaner specialized version of the bigram model 247 | bigrams = list( 248 | bigram[words]) # concatenate words into bigrams (ie. 
word1_word2) 249 | bigrams = [' '.join(words) for words in bigrams] 250 | return bigrams 251 | 252 | 253 | def list_difference(list1, list2): 254 | return (list(set(list1) - set(list2))) 255 | 256 | 257 | def list_common(list1, list2): 258 | return (list(set(list1).intersection(list2))) 259 | 260 | 261 | #not used for now 262 | 263 | 264 | def batch(batch_list, n=1): 265 | l = len(batch_list) 266 | for ndx in range(0, l, n): 267 | yield batch_list[ndx:min(ndx + n, l)] 268 | 269 | 270 | def create_average_dissim(ego, alters, index_dict, matrix): 271 | dissims = [] 272 | ego_idx = index_dict[ego] 273 | for alter in alters: 274 | alter_idx = index_dict[alter] 275 | 276 | dissim = matrix[ego_idx, alter_idx] 277 | 278 | dissims.append(dissim) 279 | dissim_avg = np.round(np.average(dissims), 3) 280 | return dissim_avg 281 | 282 | 283 | def dissim_alters(auth_list, auth_alt_dict, auth_alt_dict_2, auth_vectors): 284 | alters_avg_dissims = [] 285 | alters_2_avg_dissims = [] 286 | for batch_list in batch(auth_list, 4): 287 | comp_list = [] 288 | for author in batch_list: 289 | comp_list += [author] 290 | if author in auth_alt_dict and len(auth_alt_dict[author]) > 0: 291 | comp_list += auth_alt_dict[author] 292 | if author in auth_alt_dict and len(auth_alt_dict_2[author]) > 0: 293 | comp_list += auth_alt_dict_2[author] 294 | comp_list = sorted(list(set(comp_list))) 295 | comp_dict = {k: v for v, k in enumerate(comp_list)} 296 | comp_vectors = [] 297 | for member in comp_list: 298 | comp_vectors.append(auth_vectors[member]) 299 | v_array = vstack(comp_vectors) 300 | dissim_matrix = v_array @ v_array.T 301 | dissim_matrix = dissim_matrix.todense() 302 | 303 | for author in batch_list: 304 | if author in auth_alt_dict and len(auth_alt_dict[author]) > 0: 305 | alter_list = auth_alt_dict[author] 306 | alter_dissim = create_average_dissim(author, alter_list, 307 | comp_dict, dissim_matrix) 308 | alters_avg_dissims.append(1 - alter_dissim) 309 | else: 310 | alters_avg_dissims.append('NA') 311 | if author in auth_alt_dict_2 and len(auth_alt_dict_2[author]) > 0: 312 | alter_list = auth_alt_dict_2[author] 313 | alter_dissim = create_average_dissim(author, alter_list, 314 | comp_dict, dissim_matrix) 315 | alters_2_avg_dissims.append(1 - alter_dissim) 316 | else: 317 | alters_2_avg_dissims.append('NA') 318 | 319 | return (alters_avg_dissims, alters_2_avg_dissims) 320 | 321 | 322 | def single_avg_dissim(ego, alter_list, vectors): 323 | ego_vector = vectors[ego] 324 | alter_vectors = [] 325 | if len(alter_list) > 1: 326 | for alter in alter_list: # create list of word vectors for each alter in the list 327 | alter_vectors.append(vectors[alter]) 328 | v_array = vstack( 329 | alter_vectors 330 | ) # stack the list of vectors into a numpy array of shape 1 x the number of alters 331 | ego_dissim = 1 - linear_kernel( 332 | ego_vector, v_array 333 | ) # pairwise comparison of author vector to all vectors in the array 334 | dissim_avg = np.round(np.average(ego_dissim), 335 | 3) # average the above results 336 | else: 337 | alter = alter_list[0] # if author has only 1 alter, no vstack is needed 338 | dissim_avg = np.round( 339 | np.average(1 - linear_kernel(ego_vector, vectors[alter])), 3) 340 | return dissim_avg 341 | 342 | 343 | #not used for now 344 | # def group_avg_dissim(members, vectors): 345 | # member_vectors = [] 346 | # for member in members: 347 | # member_vectors.append(vectors[member]) 348 | # v_array = vstack(member_vectors) 349 | # group_dissim = 1 - linear_kernel(v_array) 350 | # m = group_dissim.shape[0] 
351 | # s0,s1 = group_dissim.strides 352 | # dissim_avg = np.round(np.average(as_strided(group_dissim.ravel()[1:], shape=(m-1,m), strides=(s0+s1,s1)).reshape(m,-1)), 3) 353 | 354 | # return dissim_avg 355 | 356 | 357 | def main( 358 | ): #execute all functions within main to protect against multiprocessing infinite feedback loop 359 | 360 | if cpu_count() >= 8: #to avoid overtaxing Brad, save some cores 361 | cpu = 10 362 | else: 363 | cpu = cpu_count() 364 | 365 | with open( 366 | '../input/generated_meta_strings.pkl', "rb" 367 | ) as pkl: # dictionary with authors as keys and their strings as values 368 | auth_strings = pickle.load(pkl) 369 | 370 | with open( 371 | '../input/alter_lists.pkl', "rb" 372 | ) as pkl: # dataframe with author column, alters column, and alters_2 column 373 | alter_lists = pickle.load(pkl) 374 | 375 | auth_alt_dict = dict(zip(alter_lists.author, 376 | alter_lists.alter)) # dict of {auth:alter list} 377 | auth_alt_dict_2 = dict( 378 | zip(alter_lists.author, 379 | alter_lists.alter_2)) # dict of {auth: alter_2 list} 380 | auth_list = sorted(list(auth_strings.keys()))[:] # list of author names 381 | 382 | abs_list = [] # list of author strings to process 383 | 384 | # NOTE: this is only safe because the auth_strings dict hasn't been modified. Should be modified for posterity 385 | for author in auth_list: 386 | abs_list.append(auth_strings[author]["meta_string"]) 387 | 388 | del auth_strings 389 | 390 | bigram_text = bigram_process( 391 | abs_list) # find and concatenate bigrams in the author string list 392 | 393 | # load spacy model, disable unnecessary parser and named entity recog for performance 394 | #spacy.require_gpu() 395 | nlp = spacy.load('en', disable=['parser', 'ner']) 396 | 397 | #nlp.max_length = 10000000 # community strings are very large, may cause memory problems on modest PCs - needs rethinking 398 | 399 | # send bigrammed text and spacy function + its required variables to multiprocess function for execution 400 | processed_list = mp(bigram_text, spacy_process, cpu, nlp) 401 | vectorizer = TfidfVectorizer(max_df=0.5, 402 | min_df=3, 403 | stop_words='english', 404 | norm='l2') 405 | matrix = vectorizer.fit_transform( 406 | processed_list) # Tfidf vectors for each author string 407 | auth_vectors = dict(zip(auth_list, 408 | matrix)) # creat a dict of {author : tfidf vector} 409 | 410 | #create a dataframe by sending list of authors and the dissim function + its required variables to multiprocess function 411 | sim_df = pd.DataFrame() 412 | sim_df['author'] = pd.Series(auth_list) 413 | sim_df['dissim_alters'], sim_df['dissim_alters_2'] = pd.Series( 414 | mp2_shared(auth_list, dissim_alters, cpu, auth_alt_dict, 415 | auth_alt_dict_2, auth_vectors)).array 416 | sim_df['alter_dissim_avg'], sim_df['bridge_dissim_avg'], sim_df['first_ring_dissim_avg'] =\ 417 | pd.Series(mp3_shared(auth_list, dissim_rba, cpu, auth_alt_dict, auth_alt_dict_2, auth_vectors)).array 418 | 419 | sim_df.to_csv('../output/sim_scores.csv', index=False) 420 | 421 | 422 | if __name__ == '__main__': 423 | main() 424 | -------------------------------------------------------------------------------- /nate/socnet/old_temsna/temsna_dependencies_sparse.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UWNETLAB/Nate/9029670d5804f478cac2e83d77ff86ff2a7266c2/nate/socnet/old_temsna/temsna_dependencies_sparse.png -------------------------------------------------------------------------------- /nate/socnet/socnet_class.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | This module accepts a social network that has text attributes for nodes and outputs 3 | the same social network with similarity values between i,j as an edge attribute 4 | """ 5 | from nate.socnet.centralities import compute_centralities 6 | from nate.socnet.alters import find_alters 7 | from nate.socnet.dissimilarities import find_dissimilarities 8 | 9 | class SOCnet(): 10 | def __init__(self, data, edgelist): 11 | self.data = data 12 | self.edgelist = edgelist 13 | self.centralities = compute_centralities(edgelist) 14 | self.alters = find_alters(edgelist) 15 | self.dissimilarities = None 16 | -------------------------------------------------------------------------------- /nate/svonet/Arial.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UWNETLAB/Nate/9029670d5804f478cac2e83d77ff86ff2a7266c2/nate/svonet/Arial.ttf -------------------------------------------------------------------------------- /nate/svonet/__init__.py: -------------------------------------------------------------------------------- 1 | from .svo import findSVOs 2 | -------------------------------------------------------------------------------- /nate/svonet/degree_over_time.py: -------------------------------------------------------------------------------- 1 | from nate.svonet.graph_svo import generate_ticks, find_max_burst 2 | import networkx as nx 3 | import stop_words as sw 4 | import copy 5 | import pandas as pd 6 | import matplotlib as mpl 7 | import matplotlib.pyplot as plt 8 | import matplotlib.dates as mdates 9 | from matplotlib.ticker import MaxNLocator 10 | import numpy as np 11 | 12 | 13 | class DegreeOverTimeMixIn(): 14 | 15 | def __init__(self): 16 | self.offset_dict: dict 17 | self.edge_burst_dict: dict 18 | self.s: int 19 | self.gamma: int 20 | self.from_svo: bool 21 | self.lookup: dict 22 | 23 | def top_degree(self, 24 | number_of_slices: int = 8, 25 | list_top: int = 10, 26 | minimum_burst_level: int = 0, 27 | degree_type="both", 28 | remove_stop_words=True): 29 | """[summary] 30 | 31 | Args: 32 | number_of_slices (int, optional): [description]. Defaults to 20. 33 | list_top (int, optional): [description]. Defaults to 10. 34 | degree_type (str, optional): Type of degree calculation to use. 35 | Must be one of "in", "out", or "both". Defaults to "both". 
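            minimum_burst_level (int, optional): The minimum burst intensity
                an edge must exceed within a time slice for it to count
                toward degree. Defaults to 0.
            remove_stop_words (bool, optional): Whether to filter English
                stop words out of the returned degree lists. Defaults to
                True.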
36 | 37 | Returns: 38 | [type]: [description] 39 | """ 40 | 41 | if degree_type != "in" and degree_type != "out" and degree_type != "both": 42 | raise Exception( 43 | "`degree_type` must be one of 'in', 'out', or 'both'") 44 | 45 | # Create list of time slices: 46 | 47 | offset_set = set() 48 | 49 | for key in self.offset_dict: 50 | for offset in self.offset_dict[key]: 51 | offset_set.add(offset) 52 | 53 | time_slices, time_labels = generate_ticks( 54 | offset_set, number_of_ticks=(number_of_slices)) 55 | 56 | # Create network consisting of all Subjects and Objects: 57 | 58 | G = nx.DiGraph() 59 | 60 | for entry in self.edge_burst_dict: 61 | G.add_node(entry[0]) 62 | G.add_node(entry[-1]) 63 | 64 | # Iterate over time slices 65 | 66 | top_degree_by_slice = {} 67 | 68 | for i in range(1, len(time_slices)): 69 | graphCopy = copy.deepcopy(G) 70 | 71 | for key in self.edge_burst_dict: 72 | burst_level = find_max_burst(self.edge_burst_dict[key], 73 | time_slices[i - 1], time_slices[i]) 74 | 75 | if burst_level > minimum_burst_level: 76 | graphCopy.add_edge(key[0], key[-1]) 77 | 78 | if degree_type == "in": 79 | degree_list = list(graphCopy.in_degree) 80 | elif degree_type == "out": 81 | degree_list = list(graphCopy.out_degree) 82 | elif degree_type == "both": 83 | degree_list = list(graphCopy.degree) 84 | 85 | degree_list.sort(key=lambda x: x[1], reverse=True) 86 | 87 | if remove_stop_words: 88 | stops = sw.get_stop_words("english") 89 | degree_list = [ 90 | item for item in degree_list if item[0] not in stops 91 | ] 92 | 93 | top_degree_by_slice[time_labels[i]] = degree_list[0:list_top] 94 | 95 | return top_degree_by_slice 96 | 97 | def specific_degree(self, 98 | tokens: list, 99 | number_of_slices: int = 15, 100 | minimum_burst_level: int = 0, 101 | degree_type="both", 102 | remove_stop_words=False): 103 | """[summary] 104 | 105 | Args: 106 | tokens (list): [description] 107 | number_of_slices (int, optional): [description]. Defaults to 20. 108 | minimum_burst_level (int, optional): [description]. Defaults to 0. 109 | degree_type (str, optional): [description]. Defaults to "both". 110 | remove_stop_words (bool, optional): [description]. Defaults to False. 111 | 112 | Returns: 113 | [type]: [description] 114 | """ 115 | 116 | if isinstance(tokens, list) == False: 117 | tokens = [tokens] 118 | 119 | full_lists = self.top_degree(number_of_slices=number_of_slices, 120 | list_top=None, 121 | minimum_burst_level=minimum_burst_level, 122 | degree_type=degree_type, 123 | remove_stop_words=remove_stop_words) 124 | 125 | token_rank_dict = {} 126 | 127 | for day in full_lists: 128 | v = [item for item in full_lists[day] if item[0] in tokens] 129 | token_rank_dict[day] = v 130 | 131 | return token_rank_dict 132 | 133 | def plot_top_degree(self, 134 | number_of_slices: int = 8, 135 | list_top: int = 10, 136 | minimum_burst_level: int = 0, 137 | degree_type="both", 138 | remove_stop_words=True, 139 | filename: str = False, 140 | ): 141 | """[summary] 142 | 143 | Args: 144 | number_of_slices (int, optional): [description]. Defaults to 20. 145 | list_top (int, optional): [description]. Defaults to 10. 146 | minimum_burst_level (int, optional): [description]. Defaults to 0. 147 | degree_type (str, optional): [description]. Defaults to "both". 148 | remove_stop_words (bool, optional): [description]. Defaults to True. 
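            filename (str, optional): If provided, each time slice's chart is
                saved to disk as '<filename><slice number>.pdf' instead of
                being displayed. Defaults to False.

        Example (illustrative; assumes `my_svoburst` is an object exposing
        this mixin with `offset_dict` and `edge_burst_dict` already populated):

            my_svoburst.plot_top_degree(number_of_slices=8,
                                        list_top=10,
                                        degree_type="both")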
149 | """ 150 | 151 | data = self.top_degree(number_of_slices=number_of_slices, 152 | list_top=list_top, 153 | minimum_burst_level=minimum_burst_level, 154 | degree_type=degree_type, 155 | remove_stop_words=remove_stop_words) 156 | 157 | print(data) 158 | 159 | date_names = [] 160 | time_slices = [] 161 | 162 | for k, v in data.items(): 163 | date_names.append(k) 164 | time_slices.append(v) 165 | 166 | for i in range(1, len(date_names)): 167 | 168 | x = np.arange(list_top) 169 | values = [] 170 | names = [] 171 | 172 | for top_degrees in time_slices[i]: 173 | values.append(top_degrees[1]) 174 | names.append(top_degrees[0]) 175 | 176 | values.reverse() 177 | names.reverse() 178 | 179 | if np.sum(values) > 0: 180 | fig, ax = plt.subplots() 181 | fig.set_figwidth(6) 182 | fig.set_figheight(10) 183 | fig.suptitle('{} to {}'.format(date_names[i - 1], 184 | date_names[i]), 185 | fontsize=12, ha="center") 186 | ax.xaxis.set_major_locator(MaxNLocator(integer=True)) 187 | plt.barh(x, values, color='#32363A') 188 | plt.yticks(x, names) 189 | 190 | if filename: 191 | plt.savefig(str(filename) + str(i) + ".pdf") 192 | else: 193 | plt.show() 194 | else: 195 | print("No nodes with degree > 0 in this time slice.") 196 | 197 | def plot_specific_degree(self, 198 | tokens: list, 199 | number_of_slices: int = 15, 200 | minimum_burst_level: int = 0, 201 | degree_type="both", 202 | plot_type="line", 203 | remove_stop_words=False, 204 | filename: str = False,): 205 | """[summary] 206 | 207 | Args: 208 | tokens (list): [description] 209 | number_of_slices (int, optional): [description]. Defaults to 20. 210 | minimum_burst_level (int, optional): [description]. Defaults to 0. 211 | degree_type (str, optional): [description]. Defaults to "both". 212 | plot_type (str, optional): [description]. Defaults to "line". 213 | remove_stop_words (bool, optional): [description]. Defaults to False. 
214 | 215 | Raises: 216 | Exception: [description] 217 | """ 218 | 219 | if isinstance(tokens, list) == False: 220 | tokens = [tokens] 221 | 222 | if plot_type != "line" and plot_type != "bar": 223 | raise Exception("`plot_type` must be one of 'line' or 'bar'") 224 | 225 | data = self.specific_degree(tokens=tokens, 226 | number_of_slices=number_of_slices, 227 | minimum_burst_level=minimum_burst_level, 228 | degree_type=degree_type, 229 | remove_stop_words=remove_stop_words) 230 | 231 | inverted_dict = {} 232 | 233 | for token in tokens: 234 | full_list = [] 235 | 236 | for date, degree_list in data.items(): 237 | degree = [item[1] for item in degree_list if item[0] == token] 238 | full_list.append((date, degree[0])) 239 | 240 | inverted_dict[token] = full_list 241 | 242 | x = np.arange(number_of_slices) 243 | 244 | for k, v in inverted_dict.items(): 245 | 246 | values = [item[1] for item in v] 247 | dates = [item[0].replace(", ", "\n") for item in v] 248 | 249 | fig, ax = plt.subplots() 250 | fig.set_figwidth(10) 251 | fig.set_figheight(6) 252 | fig.suptitle("'{}'".format(k), fontsize=12, ha="center") 253 | ax.yaxis.set_major_locator(MaxNLocator(integer=True)) 254 | if plot_type == "bar": 255 | plt.bar(x, values, color='#32363A') 256 | elif plot_type == "line": 257 | plt.plot(x, values, color='#32363A') 258 | plt.xticks(x, dates) 259 | 260 | if filename: 261 | plt.savefig(str(filename) + str(k) + ".pdf") 262 | else: 263 | plt.show() 264 | -------------------------------------------------------------------------------- /nate/svonet/graph_svo.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a MODULE docstring 3 | """ 4 | 5 | import networkx as nx 6 | from PIL import Image 7 | from os import remove 8 | from typing import Tuple, List 9 | from datetime import datetime 10 | 11 | color_dict = { 12 | 0: "#F62D2D", 13 | 1: "#D3212D", 14 | 2: "#A2264B", 15 | 3: "#722B6A", 16 | 4: "#412F88", 17 | 5: "#1F0033", 18 | 6: "#000000" 19 | } 20 | 21 | 22 | def generate_ticks(offsets, number_of_ticks=10) -> Tuple[List[int], List[str]]: 23 | """[summary] 24 | 25 | Args: 26 | offsets ([type]): [description] 27 | number_of_ticks (int, optional): [description]. Defaults to 10. 
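    Example (illustrative, with two POSIX timestamps exactly one year apart):

        >>> ticks, labels = generate_ticks({1546300800, 1577836800},
        ...                                number_of_ticks=2)
        >>> labels
        ['Jan 01, 2019', 'Jul 02, 2019', 'Jan 01, 2020']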
28 | 29 | Returns: 30 | Tuple[List[int], List[str]]: [description] 31 | """ 32 | 33 | rawdif = max(offsets) - min(offsets) 34 | 35 | divdiff = rawdif / number_of_ticks 36 | 37 | chunk_size = round(divdiff) 38 | 39 | tick_positions: List[int] = [] 40 | 41 | for i in range(0, number_of_ticks + 1): 42 | tick_positions.append(int(min(offsets) + (i * chunk_size))) 43 | 44 | tick_labels: List[str] = [] 45 | 46 | for tick in tick_positions: 47 | 48 | time_label = datetime.utcfromtimestamp(tick).strftime("%b %d, %Y") 49 | 50 | tick_labels.append(time_label) 51 | 52 | return tick_positions, tick_labels 53 | 54 | 55 | def find_max_burst(burst_list: list, offset_start, offset_end): 56 | """[summary] 57 | 58 | Args: 59 | burst_list (list): [description] 60 | offset_start ([type]): [description] 61 | offset_end ([type]): [description] 62 | 63 | Returns: 64 | [type]: [description] 65 | """ 66 | 67 | burst_levels = set() 68 | burst_levels.add(0) 69 | 70 | for burst in burst_list: 71 | if burst[2] < offset_start or offset_end < burst[1]: #offset_start < burst[1] < offset_end or offset_start < burst[2] < offset_end: 72 | pass 73 | else: 74 | burst_levels.add(burst[0]) 75 | 76 | return max(burst_levels) 77 | 78 | 79 | class SVOgraphMixin(): 80 | 81 | def get_giant_component(self): 82 | """[summary] 83 | 84 | Returns: 85 | [type]: [description] 86 | """ 87 | 88 | G = nx.DiGraph() 89 | 90 | svo_list = self.edge_burst_dict 91 | 92 | for entry in svo_list: 93 | G.add_edge(entry[0], entry[2], label=" " + entry[1]) 94 | 95 | return G.subgraph(max(nx.weakly_connected_components(G), 96 | key=len)).copy() 97 | 98 | def save_svo_graph(self, 99 | term_list, 100 | use_giant=False, 101 | file_name=None, 102 | return_networkx=False): 103 | """[summary] 104 | 105 | Args: 106 | term_list ([type]): [description] 107 | use_giant (bool, optional): [description]. Defaults to False. 108 | file_name ([type], optional): [description]. Defaults to None. 109 | return_networkx (bool, optional): [description]. Defaults to False. 110 | 111 | Returns: 112 | [type]: [description] 113 | """ 114 | 115 | G = nx.DiGraph() 116 | 117 | if isinstance(term_list, str): 118 | term_list = [term_list] 119 | 120 | svo_list = self.edge_burst_dict 121 | 122 | for entry in svo_list: 123 | include = False 124 | for entry_part in entry: 125 | if entry_part in term_list: 126 | include = True 127 | 128 | for term in term_list: 129 | if term in entry_part or entry_part in term: 130 | include = True 131 | 132 | if include: 133 | G.add_edge(entry[0], entry[2], label=" " + entry[1]) 134 | 135 | for entry in G: 136 | G.nodes[entry]['style'] = 'filled' 137 | G.nodes[entry]['fillcolor'] = 'cadetblue2' 138 | 139 | toPdot = nx.drawing.nx_pydot.to_pydot 140 | N = toPdot(G) 141 | 142 | if return_networkx: 143 | return G 144 | else: 145 | if file_name == None: 146 | file_name = "_".join(term_list) 147 | 148 | N.write(file_name + "_svo_visualization.png", 149 | prog='dot', 150 | format='png') 151 | 152 | def create_svo_animation(self, 153 | term_list, 154 | use_giant=False, 155 | num_ticks=20, 156 | delay_per_tick=3, 157 | file_name="test", 158 | remove_images=True): 159 | """[summary] 160 | 161 | Args: 162 | term_list ([type]): [description] 163 | use_giant (bool, optional): [description]. Defaults to False. 164 | num_ticks (int, optional): [description]. Defaults to 20. 165 | delay_per_tick (int, optional): [description]. Defaults to 3. 166 | file_name (str, optional): [description]. Defaults to "test". 167 | remove_images (bool, optional): [description]. 
Defaults to True. 168 | """ 169 | 170 | file_name = str(file_name) 171 | 172 | if use_giant: 173 | G = self.get_giant_component() 174 | else: 175 | G = self.save_svo_graph(self, term_list, return_networkx=True) 176 | 177 | offset_list = set() 178 | svo_keys = [] 179 | 180 | for edge in G.edges: 181 | G[edge[0]][edge[1]]['burst_last'] = -100 182 | G[edge[0]][edge[1]]['burst_level'] = 0 183 | G[edge[0]][edge[1]]['color'] = "black" 184 | G[edge[0]][edge[1]]['penwidth'] = 1 185 | label = G.get_edge_data(edge[0], edge[1])['label'] 186 | key = (edge[0], label[1:], edge[1]) 187 | offsets = self.offset_dict[key] 188 | offset_list.add(min(offsets)) 189 | offset_list.add(max(offsets)) 190 | svo_keys.append(key) 191 | 192 | time_slices, time_labels = generate_ticks(offset_list, num_ticks) 193 | 194 | initial_graph = nx.drawing.nx_pydot.to_pydot(G) 195 | 196 | graphs = [initial_graph] 197 | 198 | for i in range(1, len(time_slices)): 199 | # The following lines are for functionality not yet implemented: we can cause the nodes - not just the edges - to show their burst patterns 200 | # bursting_nodes = set() 201 | # cooling_nodes = set() 202 | # inactive_nodes = set() 203 | for key in svo_keys: 204 | 205 | burst_level = find_max_burst(self.edge_burst_dict[key], 206 | time_slices[i - 1], time_slices[i]) 207 | 208 | G[key[0]][key[2]]['burst_level'] = burst_level 209 | 210 | if burst_level > 0: 211 | G[key[0]][key[2]]['burst_last'] = i 212 | # print(key[0]) 213 | # print(key[1]) 214 | # print(key[2]) 215 | # print(i) 216 | 217 | distance = i - G[key[0]][key[2]]['burst_last'] 218 | 219 | color = color_dict[min([distance, 6])] 220 | penwidth = max([6 - distance, 0.5]) 221 | 222 | G[key[0]][key[2]]['penwidth'] = penwidth 223 | G[key[0]][key[2]]['color'] = color 224 | 225 | subgraph = nx.drawing.nx_pydot.to_pydot(G) 226 | 227 | graphs.append(subgraph) 228 | 229 | filenames = [] 230 | 231 | for i in range(len(graphs)): 232 | this_file = file_name + "_" + str(i) + ".png" 233 | filenames.append(this_file) 234 | 235 | graphs[i].write_png(this_file) 236 | 237 | images = [] 238 | 239 | for name in filenames: 240 | images.append(Image.open(name)) 241 | 242 | images[0].save(file_name + ".gif", 243 | save_all=True, 244 | append_images=images[1:], 245 | optimize=False, 246 | duration=len(images * delay_per_tick), 247 | loop=0) 248 | 249 | if remove_images: 250 | for file_ in filenames: 251 | remove(file_) 252 | -------------------------------------------------------------------------------- /nate/svonet/svo.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a MODULE docstring. This module is a modification of the `enhanced-subject-verb-object-extraction` 3 | package by Rock de Vocht: https://github.com/peter3125/enhanced-subject-verb-object-extraction 4 | Changes are primarily to allow filtering for subjects and objects included in optional lists of spaCy's named 5 | entity tags, as well as including those tags in the output tuple. 6 | Note that this module must receive sentences one at a time, otherwise a passive sentence will flag 7 | all subsequent sentences as passive, reversing subject and object order incorrectly. 8 | """ 9 | 10 | # 11 | # Copyright 2017 Peter de Vocht 12 | # 13 | # Licensed under the Apache License, Version 2.0 (the "License"); 14 | # you may not use this file except in compliance with the License. 
15 | # You may obtain a copy of the License at 16 | # 17 | # http://www.apache.org/licenses/LICENSE-2.0 18 | # 19 | # Unless required by applicable law or agreed to in writing, software 20 | # distributed under the License is distributed on an "AS IS" BASIS, 21 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 22 | # See the License for the specific language governing permissions and 23 | # limitations under the License. 24 | 25 | # dependency markers for subjects 26 | SUBJECTS = {"nsubj", "nsubjpass", "csubj", "csubjpass", "agent", "expl"} 27 | # dependency markers for objects 28 | OBJECTS = {"dobj", "dative", "attr", "oprd"} 29 | # POS tags that will break adjoining items 30 | BREAKER_POS = {"CCONJ", "VERB"} 31 | # words that are negations 32 | NEGATIONS = {"no", "not", "n't", "never", "none"} 33 | 34 | sub_ner_tags = False 35 | obj_ner_tags = False 36 | sub_ent_types = [] 37 | obj_ent_types = [] 38 | 39 | 40 | # does dependency set contain any coordinating conjunctions? 41 | def contains_conj(depSet): 42 | 43 | return "and" in depSet or "or" in depSet or "nor" in depSet or \ 44 | "but" in depSet or "yet" in depSet or "so" in depSet or "for" in depSet 45 | 46 | 47 | # get subs joined by conjunctions 48 | def _get_subs_from_conjunctions(subs): 49 | 50 | more_subs = [] 51 | for sub in subs: 52 | # rights is a generator 53 | rights = list(sub.rights) 54 | rightDeps = {tok.lower_ for tok in rights} 55 | if contains_conj(rightDeps): 56 | if sub_ner_tags: 57 | more_subs.extend([ 58 | tok for tok in rights 59 | if tok.dep_ in SUBJECTS and tok.ent_type_ in sub_ner_tags 60 | ]) 61 | else: 62 | more_subs.extend([ 63 | tok for tok in rights 64 | if tok.dep_ in SUBJECTS or tok.pos_ == "NOUN" 65 | ]) 66 | if len(more_subs) > 0: 67 | more_subs.extend(_get_subs_from_conjunctions(more_subs)) 68 | return more_subs 69 | 70 | 71 | # get objects joined by conjunctions 72 | def _get_objs_from_conjunctions(objs): 73 | 74 | more_objs = [] 75 | for obj in objs: 76 | # rights is a generator 77 | rights = list(obj.rights) 78 | rightDeps = {tok.lower_ for tok in rights} 79 | if contains_conj(rightDeps): 80 | if obj_ner_tags: 81 | more_objs.extend([ 82 | tok for tok in rights 83 | if (tok.dep_ in OBJECTS and tok.ent_type_ in obj_ner_tags) 84 | or (tok.pos_ == "NOUN" and tok.ent_type_ in obj_ner_tags) 85 | ]) 86 | else: 87 | more_objs.extend([ 88 | tok for tok in rights 89 | if tok.dep_ in OBJECTS or tok.pos_ == "NOUN" 90 | ]) 91 | if len(more_objs) > 0: 92 | more_objs.extend(_get_objs_from_conjunctions(more_objs)) 93 | return more_objs 94 | 95 | 96 | # find sub dependencies 97 | def _find_subs(tok): 98 | 99 | head = tok.head 100 | while head.pos_ != "VERB" and head.pos_ != "NOUN" and head.head != head: 101 | head = head.head 102 | if head.pos_ == "VERB": 103 | if sub_ner_tags: 104 | subs = [ 105 | tok for tok in head.lefts 106 | if tok.dep_ == "SUB" and tok.ent_type_ in sub_ner_tags 107 | ] 108 | else: 109 | subs = [tok for tok in head.lefts if tok.dep_ == "SUB"] 110 | if len(subs) > 0: 111 | verb_negated = _is_negated(head) 112 | subs.extend(_get_subs_from_conjunctions(subs)) 113 | return subs, verb_negated 114 | elif head.head != head: 115 | return _find_subs(head) 116 | elif sub_ner_tags and head.ent_type_ in sub_ner_tags: 117 | return [head], _is_negated(tok) 118 | elif not sub_ner_tags and head.pos_ == "NOUN": 119 | return [head], _is_negated(tok) 120 | return [], False 121 | 122 | 123 | # is the tok set's left or right negated? 
124 | def _is_negated(tok): 125 | 126 | parts = list(tok.lefts) + list(tok.rights) 127 | for dep in parts: 128 | if dep.lower_ in NEGATIONS: 129 | return True 130 | return False 131 | 132 | 133 | # get all the verbs on tokens with negation marker 134 | def _find_svs(tokens): 135 | 136 | svs = [] 137 | verbs = [tok for tok in tokens if tok.pos_ == "VERB"] 138 | for v in verbs: 139 | subs, verbNegated = _get_all_subs(v) 140 | if len(subs) > 0: 141 | for sub in subs: 142 | svs.append( 143 | (sub.orth_, "!" + v.orth_ if verbNegated else v.orth_)) 144 | return svs 145 | 146 | 147 | # get grammatical objects for a given set of dependencies (including passive sentences) 148 | def _get_objs_from_prepositions(deps, is_pas): 149 | 150 | objs = [] 151 | for dep in deps: 152 | if obj_ner_tags: 153 | if dep.pos_ == "ADP" and (dep.dep_ == "prep" or 154 | (is_pas and dep.dep_ == "agent")): 155 | objs.extend([ 156 | tok for tok in dep.rights 157 | if (tok.dep_ in OBJECTS and tok.ent_type_ in obj_ner_tags) 158 | ]) 159 | #(is_pas and tok.ent_type_ in obj_ner_tags and tok.dep_ == 'pobj')]) #temporarily disabled 160 | else: 161 | if dep.pos_ == "ADP" and (dep.dep_ == "prep" or 162 | (is_pas and dep.dep_ == "agent")): 163 | objs.extend([ 164 | tok for tok in dep.rights if tok.dep_ in OBJECTS or 165 | (tok.pos_ == "PRON" and tok.lower_ == "me") or 166 | (is_pas and tok.dep_ == 'pobj') 167 | ]) 168 | return objs 169 | 170 | 171 | # get objects from the dependencies using the attribute dependency 172 | # *NOTE* disabled for unknown reason in _get_all_objs, this needs NER option if it should be enabled 173 | def _get_objs_from_attrs(deps, is_pas): 174 | 175 | for dep in deps: 176 | if dep.pos_ == "NOUN" and dep.dep_ == "attr": 177 | verbs = [tok for tok in dep.rights if tok.pos_ == "VERB"] 178 | if len(verbs) > 0: 179 | for v in verbs: 180 | rights = list(v.rights) 181 | objs = [tok for tok in rights if tok.dep_ in OBJECTS] 182 | objs.extend(_get_objs_from_prepositions(rights, is_pas)) 183 | if len(objs) > 0: 184 | return v, objs 185 | return None, None 186 | 187 | 188 | # xcomp; open complement - verb has no subject 189 | def _get_obj_from_xcomp(deps, is_pas): 190 | 191 | for dep in deps: 192 | if dep.pos_ == "VERB" and dep.dep_ == "xcomp": 193 | v = dep 194 | rights = list(v.rights) 195 | if obj_ner_tags: 196 | objs = [ 197 | tok for tok in rights 198 | if tok.dep_ in OBJECTS and tok.ent_type_ in obj_ner_tags 199 | ] 200 | else: 201 | objs = [tok for tok in rights if tok.dep_ in OBJECTS] 202 | objs.extend(_get_objs_from_prepositions(rights, is_pas)) 203 | if len(objs) > 0: 204 | return v, objs 205 | return None, None 206 | 207 | 208 | # get all functional subjects adjacent to the verb passed in 209 | def _get_all_subs(v): 210 | 211 | verb_negated = _is_negated(v) 212 | if sub_ner_tags: 213 | subs = [ 214 | tok for tok in v.lefts if tok.dep_ in SUBJECTS and 215 | tok.ent_type_ in sub_ner_tags and tok.pos_ != "DET" 216 | ] 217 | else: 218 | subs = [ 219 | tok for tok in v.lefts if tok.dep_ in SUBJECTS and tok.pos_ != "DET" 220 | ] 221 | if len(subs) > 0: 222 | subs.extend(_get_subs_from_conjunctions(subs)) 223 | else: 224 | foundSubs, verb_negated = _find_subs(v) 225 | subs.extend(foundSubs) 226 | 227 | global sub_ent_types 228 | sub_ent_types = [sub.ent_type_ for sub in subs] 229 | 230 | return subs, verb_negated 231 | 232 | 233 | # is the token a verb? 
(excluding auxiliary verbs) 234 | def _is_non_aux_verb(tok): 235 | 236 | return tok.pos_ == "VERB" and (tok.dep_ != "aux" and tok.dep_ != "auxpass") 237 | 238 | 239 | # return the verb to the right of this verb in a CCONJ relationship if applicable 240 | # returns a tuple, first part True|False and second part the modified verb if True 241 | def _right_of_verb_is_conj_verb(v): 242 | 243 | # rights is a generator 244 | rights = list(v.rights) 245 | 246 | # VERB CCONJ VERB (e.g. he beat and hurt me) 247 | if len(rights) > 1 and rights[0].pos_ == 'CCONJ': 248 | for tok in rights[1:]: 249 | if _is_non_aux_verb(tok): 250 | return True, tok 251 | 252 | return False, v 253 | 254 | 255 | # get all objects for an active/passive sentence 256 | def _get_all_objs(v, is_pas): 257 | 258 | # rights is a generator 259 | rights = list(v.rights) 260 | if obj_ner_tags: 261 | objs = [ 262 | tok for tok in rights 263 | if (tok.dep_ in OBJECTS and tok.ent_type_ in obj_ner_tags) or 264 | (is_pas and tok.dep_ == 'pobj' and tok.ent_type_ in obj_ner_tags) 265 | ] 266 | else: 267 | objs = [ 268 | tok for tok in rights 269 | if tok.dep_ in OBJECTS or (is_pas and tok.dep_ == 'pobj') 270 | ] 271 | objs.extend(_get_objs_from_prepositions(rights, is_pas)) 272 | 273 | #potentialNewVerb, potentialNewObjs = _get_objs_from_attrs(rights) 274 | #if potentialNewVerb is not None and potentialNewObjs is not None and len(potentialNewObjs) > 0: 275 | # objs.extend(potentialNewObjs) 276 | # v = potentialNewVerb 277 | 278 | potential_new_verb, potential_new_objs = _get_obj_from_xcomp(rights, is_pas) 279 | if potential_new_verb is not None and potential_new_objs is not None and len( 280 | potential_new_objs) > 0: 281 | objs.extend(potential_new_objs) 282 | v = potential_new_verb 283 | if len(objs) > 0: 284 | objs.extend(_get_objs_from_conjunctions(objs)) 285 | 286 | global obj_ent_types 287 | obj_ent_types = [obj.ent_type_ for obj in objs] 288 | 289 | return v, objs 290 | 291 | 292 | # return true if the sentence is passive - at he moment a sentence is assumed passive if it has an auxpass verb 293 | def _is_passive(tokens): 294 | 295 | for tok in tokens: 296 | if tok.dep_ == "auxpass": 297 | return True 298 | return False 299 | 300 | 301 | # resolve a 'that' where/if appropriate 302 | def _get_that_resolution(toks): 303 | 304 | for tok in toks: 305 | if 'that' in [t.orth_ for t in tok.lefts]: 306 | return tok.head 307 | return toks 308 | 309 | 310 | # simple stemmer using lemmas 311 | def _get_lemma(word: str): 312 | 313 | tokens = word #nlp(word) 314 | if len(tokens) == 1: 315 | return tokens[0].lemma_ 316 | return word 317 | 318 | 319 | # print information for displaying all kinds of things of the parse tree 320 | def printDeps(toks): 321 | 322 | for tok in toks: 323 | print(tok.orth_, tok.dep_, tok.pos_, tok.head.orth_, 324 | [t.orth_ for t in tok.lefts], [t.orth_ for t in tok.rights]) 325 | 326 | 327 | # expand an obj / subj np using its chunk 328 | def expand(item, tokens, visited): 329 | 330 | if item.lower_ == 'that': 331 | item = _get_that_resolution(tokens) 332 | 333 | parts = [] 334 | 335 | if hasattr(item, 'lefts'): 336 | for part in item.lefts: 337 | if part.pos_ in BREAKER_POS: 338 | break 339 | if not part.lower_ in NEGATIONS: 340 | parts.append(part) 341 | 342 | parts.append(item) 343 | 344 | if hasattr(item, 'rights'): 345 | for part in item.rights: 346 | if part.pos_ in BREAKER_POS: 347 | break 348 | if not part.lower_ in NEGATIONS: 349 | parts.append(part) 350 | 351 | if hasattr(parts[-1], 'rights'): 352 | for item2 
in parts[-1].rights: 353 | if item2.pos_ == "DET" or item2.pos_ == "NOUN": 354 | if item2.i not in visited: 355 | visited.add(item2.i) 356 | parts.extend(expand(item2, tokens, visited)) 357 | break 358 | 359 | return parts 360 | 361 | 362 | # convert a list of tokens to a string 363 | def to_str(tokens): 364 | 365 | return ' '.join([item.text for item in tokens]) 366 | 367 | 368 | # find verbs and their subjects / objects to create SVOs, detect passive/active sentences 369 | def findSVOs(tokens, sub_tags=False, obj_tags=False): 370 | global sub_ner_tags 371 | sub_ner_tags = sub_tags 372 | global obj_ner_tags 373 | obj_ner_tags = obj_tags 374 | svos = [] 375 | is_pas = _is_passive(tokens) 376 | verbs = [tok for tok in tokens if _is_non_aux_verb(tok)] 377 | visited = set() # recursion detection 378 | sub_ent_types = [] 379 | obj_ent_types = [] 380 | for v in verbs: 381 | subs, verbNegated = _get_all_subs(v) 382 | # hopefully there are subs, if not, don't examine this verb any longer 383 | if len(subs) > 0: 384 | isConjVerb, conjV = _right_of_verb_is_conj_verb(v) 385 | if isConjVerb: 386 | v2, objs = _get_all_objs(conjV, is_pas) 387 | for sub in subs: 388 | for obj in objs: 389 | objNegated = _is_negated(obj) 390 | if is_pas: # reverse object / subject for passive 391 | svos.append( 392 | (to_str(expand(obj, tokens, 393 | visited)), "!" + v.lemma_ 394 | if verbNegated or objNegated else v.lemma_, 395 | to_str(expand(sub, tokens, visited)))) 396 | sub_ent_types.append(sub.ent_type_) 397 | obj_ent_types.append(obj.ent_type_) 398 | svos.append( 399 | (to_str(expand(obj, tokens, 400 | visited)), "!" + v2.lemma_ 401 | if verbNegated or objNegated else v2.lemma_, 402 | to_str(expand(sub, tokens, visited)))) 403 | sub_ent_types.append(sub.ent_type_) 404 | obj_ent_types.append(obj.ent_type_) 405 | else: 406 | svos.append( 407 | (to_str(expand(sub, tokens, 408 | visited)), "!" + v.lower_ 409 | if verbNegated or objNegated else v.lower_, 410 | to_str(expand(obj, tokens, visited)))) 411 | sub_ent_types.append(sub.ent_type_) 412 | obj_ent_types.append(obj.ent_type_) 413 | svos.append( 414 | (to_str(expand(sub, tokens, 415 | visited)), "!" + v2.lower_ 416 | if verbNegated or objNegated else v2.lower_, 417 | to_str(expand(obj, tokens, visited)))) 418 | sub_ent_types.append(sub.ent_type_) 419 | obj_ent_types.append(obj.ent_type_) 420 | else: 421 | v, objs = _get_all_objs(v, is_pas) 422 | for sub in subs: 423 | for obj in objs: 424 | objNegated = _is_negated(obj) 425 | if is_pas: # reverse object / subject for passive 426 | svos.append( 427 | (to_str(expand(obj, tokens, 428 | visited)), "!" + v.lemma_ 429 | if verbNegated or objNegated else v.lemma_, 430 | to_str(expand(sub, tokens, visited)))) 431 | sub_ent_types.append(sub.ent_type_) 432 | obj_ent_types.append(obj.ent_type_) 433 | else: 434 | svos.append( 435 | (to_str(expand(sub, tokens, 436 | visited)), "!" 
+ v.lower_ 437 | if verbNegated or objNegated else v.lower_, 438 | to_str(expand(obj, tokens, visited)))) 439 | sub_ent_types.append(sub.ent_type_) 440 | obj_ent_types.append(obj.ent_type_) 441 | 442 | return (svos, sub_ent_types, obj_ent_types) 443 | -------------------------------------------------------------------------------- /nate/svonet/svo_degree_over_time.py: -------------------------------------------------------------------------------- 1 | from nate.svonet.graph_svo import generate_ticks, find_max_burst 2 | import networkx as nx 3 | import stop_words as sw 4 | import copy 5 | import pandas as pd 6 | import matplotlib as mpl 7 | import matplotlib.pyplot as plt 8 | import matplotlib.dates as mdates 9 | from matplotlib.ticker import MaxNLocator 10 | import numpy as np 11 | from multiprocessing import Process, Queue 12 | from os import cpu_count 13 | 14 | 15 | def get_degree_for_slice( 16 | q: Queue, 17 | G, 18 | edge_burst_dict, 19 | time_slice_start, 20 | time_slice_end, 21 | minimum_burst_level, 22 | stops, 23 | overlap_threshold, 24 | return_edge_overlaps, 25 | list_top, 26 | time_label): 27 | graphCopy = copy.deepcopy(G) 28 | 29 | for key in edge_burst_dict: 30 | burst_level = find_max_burst(edge_burst_dict[key], time_slice_start, time_slice_end) 31 | 32 | if burst_level > minimum_burst_level: 33 | for node in graphCopy.nodes(): 34 | for j in [0, -1]: 35 | for k in [0, -1]: 36 | if key[j] == node[k] and key[j] not in stops: 37 | overlap = len(set(key).intersection(set(node))) 38 | if overlap >= overlap_threshold: 39 | graphCopy.add_edge(key, node, overlap=overlap) 40 | 41 | graphCopy.remove_edges_from(nx.selfloop_edges(graphCopy)) 42 | 43 | 44 | degree_list = list(graphCopy.degree) 45 | 46 | degree_list.sort(key=lambda x: x[1], reverse=True) 47 | 48 | degree_list = degree_list[0:list_top] 49 | 50 | overlap_list = [] 51 | 52 | if return_edge_overlaps: 53 | 54 | for entry in degree_list[0:list_top]: 55 | overlap_sum = [] 56 | for edge in graphCopy.edges(entry[0]): 57 | overlap_sum.append(graphCopy.edges[edge]['overlap']) 58 | 59 | if len(overlap_sum) > 0: 60 | avg = round(sum(overlap_sum) / len(overlap_sum), 2) 61 | else: 62 | avg = 0 63 | 64 | overlap_list.append((entry[0], avg)) 65 | 66 | if return_edge_overlaps: 67 | q.put((time_label, time_slice_end, degree_list, overlap_list)) 68 | else: 69 | q.put((time_label, time_slice_end, degree_list)) 70 | 71 | 72 | class SVODegreeOverTimeMixin(): 73 | 74 | def __init__(self): 75 | self.offset_dict:dict 76 | self.edge_burst_dict:dict 77 | self.s: int 78 | self.gamma: int 79 | self.from_svo: bool 80 | self.lookup: dict 81 | 82 | 83 | def top_svo_degree( 84 | self, 85 | number_of_slices: int = 8, 86 | list_top: int = 10, 87 | minimum_burst_level: int = 0, 88 | return_edge_overlaps: bool = True, 89 | overlap_threshold: int = 1): 90 | """[summary] 91 | 92 | Args: 93 | number_of_slices (int, optional): [description]. Defaults to 20. 94 | list_top (int, optional): [description]. Defaults to 10. 95 | minimum_burst_level (int, optional): [description]. Defaults to 0. 96 | return_edge_overlaps (bool, optional): [description]. Defaults to True. 97 | overlap_threshold (int, optional): [description]. Defaults to 1. 
98 | 99 | Raises: 100 | Exception: [description] 101 | 102 | Returns: 103 | [type]: [description] 104 | """ 105 | 106 | if overlap_threshold > 2 or overlap_threshold < 1: 107 | raise Exception("Overlap Filter must be 1 or 2.") 108 | 109 | stops = sw.get_stop_words("english") 110 | 111 | # Create list of time slices: 112 | 113 | offset_set = set() 114 | 115 | for key in self.offset_dict: 116 | for offset in self.offset_dict[key]: 117 | offset_set.add(offset) 118 | 119 | time_slices, time_labels = generate_ticks(offset_set, number_of_ticks=(number_of_slices)) 120 | 121 | # Create network consisting of all Subjects and Objects: 122 | 123 | G = nx.Graph() 124 | 125 | for entry in self.edge_burst_dict: 126 | G.add_node(entry) 127 | 128 | if list_top == None: 129 | list_top = len(self.edge_burst_dict) 130 | 131 | # Iterate over time slices 132 | 133 | q = Queue() 134 | 135 | processes = [] 136 | 137 | for i in range(1, len(time_slices)): 138 | 139 | time_slice_start = time_slices[i-1] 140 | time_slice_end = time_slices[i] 141 | time_label = time_labels[i] 142 | 143 | t = Process( 144 | target = get_degree_for_slice, 145 | args= ( 146 | q, 147 | G, 148 | self.edge_burst_dict, 149 | time_slice_start, 150 | time_slice_end, 151 | minimum_burst_level, 152 | stops, 153 | overlap_threshold, 154 | return_edge_overlaps, 155 | list_top, 156 | time_label 157 | ) 158 | ) 159 | 160 | processes.append(t) 161 | t.start() 162 | 163 | result_list = [] 164 | 165 | for i in range(1, len(time_slices)): 166 | result_list.append(q.get()) 167 | 168 | 169 | top_degree_by_slice = {} 170 | edge_overlap = {} 171 | 172 | result_list = sorted(result_list, key = lambda x: x[1]) 173 | 174 | for result in result_list: 175 | time_label = result[0] 176 | degree_list = result[2] 177 | top_degree_by_slice[time_label] = degree_list 178 | if return_edge_overlaps: 179 | edge_overlap[time_label] = result[3] 180 | 181 | if return_edge_overlaps: 182 | return top_degree_by_slice, edge_overlap 183 | else: 184 | return top_degree_by_slice 185 | 186 | def specific_svo_degree(self, 187 | tokens: list, 188 | number_of_slices: int = 15, 189 | minimum_burst_level: int = 0, 190 | overlap_threshold: int = 1): 191 | """[summary] 192 | 193 | Args: 194 | tokens (list): [description] 195 | number_of_slices (int, optional): [description]. Defaults to 20. 196 | minimum_burst_level (int, optional): [description]. Defaults to 0. 197 | overlap_threshold (int, optional): [description]. Defaults to 1. 198 | 199 | Returns: 200 | [type]: [description] 201 | """ 202 | 203 | if isinstance(tokens, list) == False: 204 | tokens = [tokens] 205 | 206 | full_lists = self.top_svo_degree(number_of_slices=number_of_slices, 207 | list_top=None, 208 | minimum_burst_level=minimum_burst_level, 209 | return_edge_overlaps=False, 210 | overlap_threshold=overlap_threshold, 211 | ) 212 | 213 | 214 | token_rank_dict = {} 215 | 216 | for day in full_lists: 217 | v = [item for item in full_lists[day] if item[0] in tokens] 218 | token_rank_dict[day] = v 219 | 220 | return token_rank_dict 221 | 222 | def plot_top_svo_degree( 223 | self, 224 | number_of_slices: int = 8, 225 | list_top: int = 10, 226 | minimum_burst_level: int = 0, 227 | overlap_threshold: int = 1, 228 | filename: str = False,): 229 | """[summary] 230 | 231 | Args: 232 | number_of_slices (int, optional): [description]. Defaults to 20. 233 | list_top (int, optional): [description]. Defaults to 10. 234 | minimum_burst_level (int, optional): [description]. Defaults to 0. 
235 | overlap_threshold (int, optional): [description]. Defaults to 1. 236 | """ 237 | 238 | data = self.top_svo_degree( 239 | number_of_slices = number_of_slices, 240 | list_top = list_top, 241 | minimum_burst_level = minimum_burst_level, 242 | return_edge_overlaps = False, 243 | overlap_threshold=overlap_threshold,) 244 | 245 | date_names = [] 246 | time_slices = [] 247 | 248 | for k, v in data.items(): 249 | date_names.append(k) 250 | time_slices.append(v) 251 | 252 | for i in range(1, len(date_names)): 253 | 254 | x = np.arange(list_top) 255 | values = [] 256 | names = [] 257 | 258 | for top_degrees in time_slices[i]: 259 | values.append(top_degrees[1]) 260 | names.append(top_degrees[0]) 261 | 262 | values.reverse() 263 | names.reverse() 264 | 265 | fig, ax = plt.subplots() 266 | fig.set_figwidth(6) 267 | fig.set_figheight(10) 268 | fig.suptitle('{} to {}'.format(date_names[i-1], date_names[i]), fontsize=12, ha="center") 269 | ax.xaxis.set_major_locator(MaxNLocator(integer=True)) 270 | plt.barh(x, values, color='#32363A') 271 | plt.yticks(x, names) 272 | if filename: 273 | plt.savefig(str(filename) + str(i) + ".pdf") 274 | else: 275 | plt.show() 276 | 277 | def plot_specific_svo_degree(self, 278 | tokens: list, 279 | number_of_slices: int = 15, 280 | minimum_burst_level: int = 0, 281 | overlap_threshold: int = 1, 282 | plot_type="line", 283 | filename: str = False,): 284 | 285 | if isinstance(tokens, list) == False: 286 | tokens = [tokens] 287 | 288 | if plot_type != "line" and plot_type != "bar": 289 | raise Exception("`plot_type` must be one of 'line' or 'bar'") 290 | 291 | data = self.specific_svo_degree(tokens=tokens, 292 | number_of_slices=number_of_slices, 293 | minimum_burst_level=minimum_burst_level, 294 | overlap_threshold=overlap_threshold, 295 | ) 296 | 297 | inverted_dict = {} 298 | 299 | for token in tokens: 300 | full_list = [] 301 | 302 | for date, degree_list in data.items(): 303 | degree = [item[1] for item in degree_list if item[0] == token] 304 | full_list.append((date, degree[0])) 305 | 306 | inverted_dict[token] = full_list 307 | 308 | x = np.arange(number_of_slices) 309 | 310 | for k, v in inverted_dict.items(): 311 | 312 | values = [item[1] for item in v] 313 | dates = [item[0].replace(", ", "\n") for item in v] 314 | 315 | fig, ax = plt.subplots() 316 | fig.set_figwidth(10) 317 | fig.set_figheight(6) 318 | fig.suptitle("'{}'".format(k), fontsize=12, ha="center") 319 | ax.yaxis.set_major_locator(MaxNLocator(integer=True)) 320 | if plot_type == "bar": 321 | plt.bar(x, values, color='#32363A') 322 | elif plot_type == "line": 323 | plt.plot(x, values, color='#32363A') 324 | plt.xticks(x, dates) 325 | if filename: 326 | plt.savefig(str(filename) + str(k) + ".pdf") 327 | else: 328 | plt.show() -------------------------------------------------------------------------------- /nate/svonet/svo_offsets.py: -------------------------------------------------------------------------------- 1 | """Generates the offset dictionary for the SVO pipeline.""" 2 | from time import time as marktime 3 | from typing import List 4 | from itertools import groupby 5 | from collections import defaultdict 6 | 7 | 8 | def generate_svo_offsets(svo_list: List, time: List, minimum_offsets): 9 | """Creates offset dictionary and int-to-string lookup for SVO format.""" 10 | print("Generating Offsets:") 11 | 12 | start = marktime() 13 | 14 | svo_dict = defaultdict(list) 15 | for i, svo in enumerate(svo_list): 16 | svo_dict[svo].append(time[i]) 17 | 18 | svo_int_dict, lookup = text_to_int(svo_dict) 19 
| 20 | # prune SVOs, excluding those with fewer occurrences than specified by minimum_offsets 21 | offsets = { 22 | k: v for k, v in svo_int_dict.items() if len(v) >= minimum_offsets 23 | } 24 | 25 | print("Finished offset generation in {} seconds".format( 26 | round(marktime() - start))) 27 | print("Commencing timestamp deduplication...") 28 | 29 | # increment simultaneous occurrences by 1 millisecond to satisfy Kleinberg requirements 30 | for item in offsets.keys(): 31 | offsets[item].sort() 32 | offsets[item] = [ 33 | g + i * 0.001 34 | for k, group in groupby(offsets[item]) 35 | for i, g in enumerate(group) 36 | ] 37 | 38 | print("finished timestamp deduplication in {} seconds".format( 39 | round(marktime() - start))) 40 | 41 | print("Finished Generating Offsets. Returning offset dictionary.") 42 | 43 | return offsets, lookup 44 | 45 | 46 | def text_to_int(svo_dict): 47 | """Converts SVO terms to integers, and generates a lookup dictionary.""" 48 | svo_int_dict = defaultdict(list) 49 | lookup_dict = defaultdict(tuple) 50 | i = 0 51 | for k, v in svo_dict.items(): 52 | svo_int_dict[i] = v 53 | lookup_dict[i] = k 54 | i = i + 1 55 | 56 | return svo_int_dict, lookup_dict 57 | -------------------------------------------------------------------------------- /nate/svonet/svoburst_class.py: -------------------------------------------------------------------------------- 1 | from nate.edgeburst.burst_class import Bursts 2 | from nate.svonet.degree_over_time import DegreeOverTimeMixIn 3 | from nate.svonet.svo_degree_over_time import SVODegreeOverTimeMixin 4 | 5 | 6 | class SVOburst(Bursts, DegreeOverTimeMixIn, SVODegreeOverTimeMixin): 7 | """ 8 | Creates an SVOburst class object containing data about SVO terms that burst over time. 9 | 10 | Attributes: 11 | offset_dict (Dict): A dictionary with terms as keys, and a list 12 | of offsets (occurrences) as values. 13 | edge_burst_dict (Dict): A dictionary with terms as keys and nested 14 | burst data as values. 15 | s (float): s parameter for tuning Kleinberg algorithm. Higher values 16 | make it more difficult for bursts to move up the burst hierarchy. 17 | gamma (float): gamma parameter for tuning Kleinberg algorithm. Higher 18 | values make it more difficult for activity to be considered a 19 | burst. 20 | from_svo (bool): A flag to alert other functions to the SVO pipeline. 21 | lookup (dict): A dictionary with integers as keys and the SVO terms 22 | they represent as values. 23 | """ 24 | 25 | def __init__(self, offset_dict, edge_burst_dict, s, gamma, from_svo, 26 | lookup): 27 | 28 | self.offset_dict: dict = offset_dict 29 | self.edge_burst_dict: dict = edge_burst_dict 30 | self.s = s 31 | self.gamma = gamma 32 | self.from_svo = from_svo 33 | self.bdf = None 34 | self.odf = None 35 | self.lookup = lookup 36 | 37 | def animate(self, pos = False, offscreen = True, time_interval = False, new_burst_halo = True, dpi = 300): 38 | """Creates an animation of the network of SVO bursts over time. 39 | 40 | The function will either create an onscreen animation window, or 41 | dump each frame to disk. The function requires graph-tool to be 42 | installed and able to be imported. 43 | 44 | Args: 45 | pos (object, optional): A graph-tool `pos` vertex 46 | property map to specify layout. If passed, the map will be 47 | used to create the graph layout. Otherwise, one will be 48 | generated. Defaults to False. 49 | offscreen (Bool, optional): Whether to generate the animation 50 | offscreen. 
If True, the frames will be dumped to disk in 51 | the directory `./data/frames`. if False, the animation will 52 | be shown onscreen. Defaults to True. 53 | time_interval (int, optional): Specifes a custom time step 54 | interval in seconds. Defaults to 86400 (one day). 55 | new_burst_halo (): not used in animate_graph 56 | dpi (int): not used in animate_graph 57 | """ 58 | # check if graph-tool and other requirements are able to be imported 59 | try: 60 | from nate.svonet.svo_burst_animate import prepare_df, build_graph, animate_graph 61 | 62 | df = prepare_df(self.edge_burst_dict, self.offset_dict) 63 | graph = build_graph(df, pos, time_interval) 64 | animate_graph(graph, pos, offscreen, new_burst_halo, dpi) 65 | 66 | except ImportError: 67 | print("Graph-tool does not appear to be installed or importable") 68 | -------------------------------------------------------------------------------- /nate/svonet/svonet_class.py: -------------------------------------------------------------------------------- 1 | """Definition of the `SVOnet` class, for subject-verb-object analysis. 2 | 3 | This module defines the `SVOnet` class, [description of SVO pipeline]. 4 | """ 5 | 6 | from nate.svonet.svo import findSVOs 7 | import pandas as pd 8 | from nate.utils.mp_helpers import mp 9 | from nate.utils.text_helpers import is_ascii 10 | from typing import List, Dict 11 | from nate.svonet.svo_offsets import generate_svo_offsets 12 | from nate.edgeburst.burst_mixin import BurstMixin 13 | from nate.svonet.degree_over_time import DegreeOverTimeMixIn 14 | from nate.svonet.svoburst_class import SVOburst 15 | 16 | 17 | def process_svo(sub_tags, obj_tags, doc): 18 | """Detects SVOs in a document after spaCy has processed it. 19 | 20 | Custom pipeline component for spaCY. 21 | 22 | TODO: move this to utils, where it is used. 23 | """ 24 | sentences = [x.string.strip() for x in doc.sents] # list of raw sentences in the document 25 | svo_items = [findSVOs(x, sub_tags, obj_tags) for x in doc.sents] # detect SVOs sentence-by-sentence in the document 26 | 27 | return (sentences, svo_items) 28 | 29 | 30 | class SVOnet(BurstMixin): 31 | """Provides data cleanup, export functions, and burst detection. 32 | 33 | Attributes: 34 | doc_ids (List): A list of document ids, determining which document 35 | the SVO at index i came from. 36 | sent_ids (List): A list of sentence ids, determining which sentence 37 | the SVO at index i came from. 38 | sentences (List): The sentence that the SVO was pulled from. 39 | svo_items (List): The entire SVO item. 40 | times (List): The time that the SVO's source document was written. 41 | subjects (List): The SVO at index i's subject. 42 | verbs (List): The SVO at index i's verb. 43 | objects (List): The SVO at index i's object 44 | sub_ent_types (List): The SVO at index i's subject entity type. 45 | obj_ent_types (List): The SVO at index i's object entity type. 
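        Example (illustrative sketch only, not a fixed API; assumes a spaCy
        pipeline `nlp` that sets sentence boundaries, parallel lists `texts`
        and `timestamps`, and `sub_tags` / `obj_tags` that are either False
        or lists of spaCy entity labels such as ["PERSON", "ORG"]):

            results = [process_svo(sub_tags, obj_tags, doc) for doc in nlp.pipe(texts)]
            sentences = [sents for sents, _ in results]
            svo_items = [items for _, items in results]
            svonet = SVOnet(sentences, svo_items, timestamps)
            svo_bursts = svonet.svo_to_burst(minimum_offsets=20, s=2, gamma=1)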
46 | """ 47 | 48 | def __init__(self, sentences, svo_items, timestamps): 49 | 50 | self.doc_ids = [] 51 | self.sent_ids = [] 52 | self.sentences = [] 53 | self.svo_items = [] 54 | if timestamps: 55 | self.times = [] 56 | self.subjects = [] 57 | self.verbs = [] 58 | self.objects = [] 59 | self.sub_ent_types = [] 60 | self.obj_ent_types = [] 61 | 62 | # this somewhat obtuse code chunk flattens the heavily nested data format returned by the `svo` module 63 | for i, doc in enumerate(sentences): 64 | for j, sent in enumerate(doc): 65 | if len(svo_items[i][j][0]) > 0: 66 | for k, svo_item in enumerate(svo_items[i][j][0]): 67 | if is_ascii(svo_item[0]) and is_ascii( 68 | svo_item[1]) and is_ascii(svo_item[2]): 69 | svo_item = (svo_item[0].lower(), 70 | svo_item[1].lower(), 71 | svo_item[2].lower()) 72 | self.doc_ids.append(i) 73 | self.sent_ids.append(j) 74 | self.sentences.append(sent) 75 | if timestamps: 76 | self.times.append(timestamps[i]) 77 | self.svo_items.append(svo_item) 78 | self.subjects.append(svo_item[0]) 79 | self.verbs.append(svo_item[1]) 80 | self.objects.append(svo_item[2]) 81 | self.sub_ent_types.append(svo_items[i][j][1][k]) 82 | self.obj_ent_types.append(svo_items[i][j][2][k]) 83 | 84 | self.from_svo = True 85 | 86 | def svo_to_df(self, tidy=True): 87 | """Outputs a pandas dataframe with all SVOs and their timestamps. 88 | 89 | If tidy is set to True, each SVO will have its own line in the dataframe. 90 | If tidy is set to False, identical SVOs will be grouped and their 91 | document ids, timestamps, and datetimes will be aggregated into lists 92 | in the dataframe. 93 | 94 | Args: 95 | tidy (Bool, optional): Whether to output a tidy or non-tidy 96 | dataframe, the differences between which are documented above. 97 | Defaults to True. 98 | 99 | Returns: 100 | pandas.Dataframe: A dataframe containing data for all detected SVOs, 101 | including their associated timestamps (if present). 102 | 103 | The outputted dataframe will have the following columns: 104 | - 'doc_ids' (int) : A list of document ids, determining which 105 | document the SVO at index i came from. 106 | - 'sent_ids' (int): A list of sentence ids, determining which 107 | sentence 108 | the SVO at index i came from. 109 | - 'sentences' (string): The sentence that the SVO was pulled from. 110 | - 'svo' (Tuple): The entire SVO item. 111 | - 'times' (datetime): The time that the SVO's source document 112 | was written. 113 | - 'subjects' (string): The SVO at index i's subject. 114 | - 'verbs' (string): The SVO at index i's verb. 115 | - 'objects' (string): The SVO at index i's object 116 | - 'sub_ent_types' (string): The SVO at index i's subject entity 117 | type. 118 | - 'obj_ent_types' (string): The SVO at index i's object entity 119 | type. 
120 | """ 121 | df = pd.DataFrame() 122 | 123 | df['doc_id'], df['sent_id'], df['sentence'], df['svo'] =\ 124 | self.doc_ids, self.sent_ids, self.sentences, self.svo_items 125 | if self.times: 126 | df['timestamp'] = self.times 127 | df['subject'], df['sub_type'], df['verb'], df['object'], df[ 128 | 'obj_type'] = self.subjects, self.sub_ent_types, self.verbs, self.objects, self.obj_ent_types 129 | if self.times: 130 | df['datetime'] = pd.to_datetime(df['timestamp'], unit='s') 131 | 132 | if tidy == False and self.times: 133 | df = df.groupby('svo')['doc_id', 'timestamp', 'datetime'].agg(list) 134 | elif tidy == False: 135 | df = df.groupby('svo')['doc_id'].agg(list) 136 | 137 | return df 138 | 139 | def svo_to_burst(self, minimum_offsets=20, s=2, gamma=1) -> SVOburst: 140 | """Initiates burst detection on data contained in the SVOnet class. 141 | 142 | This function requires that the object was instantiates with a list 143 | of times. 144 | 145 | Args: 146 | minimum_offsets (int, optional): The minimum number of occurences 147 | of an SVO in the dataset for it to be included in the bursts 148 | calculation. Lower values include more of the dataset, at the 149 | cost of longer processing time. Defaults to 20. 150 | s (float, optional): s parameter for tuning Kleinberg algorithm. 151 | Higher values make it more difficult for bursts to move up the 152 | burst hierarchy. Defaults to 2. 153 | gamma (float, optional): gamma parameter for tuning Kleinberg 154 | algorithm. Higher values make it more difficult for activity to 155 | be considered a burst. Defaults to 1. 156 | 157 | Returns: 158 | SVOburst: An SVOburst object for exporting, visualizing, and otherwise 159 | manipulating burst data for the data contained in this class. 160 | """ 161 | if not self.times: 162 | print("Burst detection requires timestamps") 163 | return None 164 | 165 | # send offset_dict and lookup dictionary to svo_offset generating function 166 | self.offset_dict, self.lookup = generate_svo_offsets( 167 | self.svo_items, self.times, minimum_offsets) 168 | 169 | offset_dict_strings, edge_burst_dict_strings, s, gamma, from_svo, lookup = self.burst_detection( 170 | s, gamma) 171 | 172 | return SVOburst(offset_dict=offset_dict_strings, 173 | edge_burst_dict=edge_burst_dict_strings, 174 | s=s, 175 | gamma=gamma, 176 | from_svo=from_svo, 177 | lookup=lookup) 178 | -------------------------------------------------------------------------------- /nate/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UWNETLAB/Nate/9029670d5804f478cac2e83d77ff86ff2a7266c2/nate/utils/__init__.py -------------------------------------------------------------------------------- /nate/utils/mp_helpers.py: -------------------------------------------------------------------------------- 1 | """Utilities for multiprocessing.""" 2 | from joblib import Parallel, delayed, cpu_count 3 | from itertools import chain 4 | from spacy.util import minibatch 5 | from functools import partial 6 | from typing import Union, List, Dict 7 | 8 | 9 | def mp(items, function, *args) -> Union[List, Dict]: 10 | """Applies a function to a list or dict of items, using multiprocessing. 11 | 12 | This is a convenience function for generalized multiprocessing of any 13 | function that deals with a list or dictionary of items. The functions 14 | passed to `mp` must accept the list of items to be processed at the end 15 | of their function call, with optional arguments first. 
*args can be any 16 | number of optional arguments accepted by the function that will be 17 | multiprocessed. On Windows, functions must be defined outside of the 18 | current python file and imported, to avoid infinite recursion. 19 | """ 20 | if cpu_count() >= 10: #to avoid overtaxing Brad, save some cores 21 | cpu = 10 22 | else: 23 | cpu = cpu_count() 24 | 25 | batch_size = round(len(items) / cpu) 26 | partitions = minibatch(items, size=batch_size) 27 | executor = Parallel(n_jobs=cpu, 28 | backend="multiprocessing", 29 | prefer="processes") 30 | do = delayed(partial(function, *args)) 31 | tasks = (do(batch) for batch in partitions) 32 | temp = executor(tasks) 33 | 34 | # todo: add error catch/message for zero results 35 | 36 | if isinstance(temp[0], dict): 37 | results = {} 38 | for batch in temp: 39 | for key, value in batch.items(): 40 | results.setdefault(key, []).extend(value) 41 | elif isinstance(temp[0], (list, tuple)): 42 | results = list(chain(*temp)) 43 | 44 | return results 45 | 46 | 47 | def mp2(items, function, *args): 48 | """Applies a function to a list, returning two lists of results. 49 | 50 | This is the same as `mp` but used when two lists of results need to be 51 | returned. Will perhaps be generalized for any number of results in the 52 | future. Does not currently work for dictionaries. 53 | """ 54 | if cpu_count() >= 10: #to avoid overtaxing Brad, save some cores 55 | cpu = 10 56 | else: 57 | cpu = cpu_count() 58 | 59 | batch_size = round(len(items) / cpu) 60 | partitions = minibatch(items, size=batch_size) 61 | executor = Parallel(n_jobs=cpu, 62 | backend="multiprocessing", 63 | prefer="processes") 64 | do = delayed(partial(function, *args)) 65 | tasks = (do(batch) for batch in partitions) 66 | temp = executor(tasks) 67 | results1, results2 = zip(*temp) 68 | results1 = list(chain(*results1)) 69 | results2 = list(chain(*results2)) 70 | return results1, results2 71 | -------------------------------------------------------------------------------- /nate/utils/network_helpers.py: -------------------------------------------------------------------------------- 1 | # CREDIT CHAIN OF DEVS FOR THIS... INCLUDING MALCOLM... 2 | ''' 3 | This module implements the disparity filter to compute a significance score of edge weights in networks. 4 | Forked from: https://github.com/aekpalakorn/python-backbone-network/blob/master/backbone.py 5 | With the following changes: 6 | - formatted to pylint standards 7 | - architected as a module with no code that runs on load 8 | - broke large functions into smaller ones 9 | - copy all nodes so that completely disconnected nodes aren't removed and so that node attributes are not removed 10 | - copy all the original edge attributes so that they are not removed 11 | - bug fix: changed G.in_degree(G.successors(u)[0]) to G.in_degree(list(G.successors(u))[0]) 12 | ''' 13 | 14 | import networkx as nx 15 | import numpy as np 16 | from scipy import integrate 17 | 18 | 19 | def get_graph_backbone(G, alpha_t=0.8): 20 | '''Gets the backbone of a given graph `G`.''' 21 | G_disp = compute_disparity_filter(G) 22 | G_backbone = apply_disparity_filter(G_disp, alpha_t, cut_mode='or') 23 | return G_backbone 24 | 25 | 26 | def compute_disparity_filter(G, weight='weight'): 27 | ''' Compute significance scores (alpha) for weighted edges in G as defined in Serrano et al. 2009 28 | Args 29 | G: Weighted NetworkX graph 30 | Returns 31 | Weighted graph with a significance score (alpha) assigned to each edge 32 | References 33 | M. A. Serrano et al. 
(2009) Extracting the Multiscale backbone of complex weighted networks. PNAS, 106:16, pp. 6483-6488. 34 | ''' 35 | return compute_disparity_filter_directed(G, weight) \ 36 | if nx.is_directed(G) else \ 37 | compute_disparity_filter_undirected(G, weight) 38 | 39 | 40 | def compute_disparity_filter_directed(G, weight='weight'): 41 | '''See docstring for `compute_disparity_filter`.''' 42 | N = nx.DiGraph() 43 | N.add_nodes_from(G.nodes(data=True)) 44 | for u in G: 45 | 46 | k_out = G.out_degree(u) 47 | k_in = G.in_degree(u) 48 | 49 | if k_out > 1: 50 | sum_w_out = sum( 51 | np.absolute(G[u][v][weight]) for v in G.successors(u)) 52 | for v in G.successors(u): 53 | w = G[u][v][weight] 54 | p_ij_out = float(np.absolute(w)) / sum_w_out 55 | alpha_ij_out = 1 - (k_out - 1) * integrate.quad( 56 | lambda x: (1 - x)**(k_out - 2), 0, p_ij_out)[0] # pylint: disable=cell-var-from-loop 57 | N.add_edge(u, v, alpha_out=float('%.4f' % alpha_ij_out)) 58 | N[u][v].update(G[u][v]) 59 | 60 | elif k_out == 1 and G.in_degree(list(G.successors(u))[0]) == 1: 61 | #we need to keep the connection as it is the only way to maintain the connectivity of the network 62 | v = list(G.successors(u))[0] 63 | N.add_edge(u, v, alpha_out=0., alpha_in=0.) 64 | N[u][v].update(G[u][v]) 65 | #there is no need to do the same for the k_in, since the link is built already from the tail 66 | 67 | if k_in > 1: 68 | sum_w_in = sum( 69 | np.absolute(G[v][u][weight]) for v in G.predecessors(u)) 70 | for v in G.predecessors(u): 71 | w = G[v][u][weight] 72 | p_ij_in = float(np.absolute(w)) / sum_w_in 73 | alpha_ij_in = 1 - (k_in - 1) * integrate.quad( 74 | lambda x: (1 - x)**(k_in - 2), 0, p_ij_in)[0] # pylint: disable=cell-var-from-loop 75 | N.add_edge(v, u, alpha_in=float('%.4f' % alpha_ij_in)) 76 | N[v][u].update(G[v][u]) 77 | return N 78 | 79 | 80 | def compute_disparity_filter_undirected(G, weight='weight'): 81 | '''See docstring for `compute_disparity_filter`.''' 82 | B = nx.Graph() 83 | B.add_nodes_from(G.nodes(data=True)) 84 | for u in G: 85 | k = len(G[u]) 86 | if k > 1: 87 | sum_w = sum(np.absolute(G[u][v][weight]) for v in G[u]) 88 | for v in G[u]: 89 | w = G[u][v][weight] 90 | p_ij = float(np.absolute(w)) / sum_w 91 | alpha_ij = 1 - (k - 1) * integrate.quad( 92 | lambda x: (1 - x)**(k - 2), 0, p_ij)[0] # pylint: disable=cell-var-from-loop 93 | B.add_edge(u, v, alpha=float('%.4f' % alpha_ij)) 94 | B[u][v].update(G[u][v]) 95 | return B 96 | 97 | 98 | def apply_disparity_filter(G, alpha_t=0.8, cut_mode='or'): 99 | ''' Performs a cut of the graph previously filtered through the disparity_filter function. 100 | Args 101 | ---- 102 | G: Weighted NetworkX graph 103 | alpha_t: double (default='0.4') 104 | The threshold for the alpha parameter that is used to select the surviving edges. 105 | It has to be a number between 0 and 1. 106 | cut_mode: string (default='or') 107 | Possible strings: 'or', 'and'. 108 | It applies only to directed graphs. It represents the logic operation to filter out edges 109 | that do not pass the threshold value, combining the alpha_in and alpha_out attributes 110 | resulting from the disparity_filter function. 111 | Returns 112 | ------- 113 | B: Weighted NetworkX graph 114 | The resulting graph contains only edges that survived from the filtering with the alpha_t threshold 115 | References 116 | --------- 117 | .. M. A. Serrano et al. (2009) Extracting the Multiscale backbone of complex weighted networks. PNAS, 106:16, pp. 6483-6488. 
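    Example
    -------
    An illustrative sketch (any weighted NetworkX graph whose edge weights are
    stored under the 'weight' attribute will do):

        G_alpha = compute_disparity_filter(G)
        backbone = apply_disparity_filter(G_alpha, alpha_t=0.8, cut_mode='or')
        # equivalently, using the wrapper defined at the top of this module:
        backbone = get_graph_backbone(G, alpha_t=0.8)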
118 | ''' 119 | return apply_disparity_filter_directed(G, alpha_t, cut_mode) \ 120 | if nx.is_directed(G) else \ 121 | apply_disparity_filter_undirected(G, alpha_t) 122 | 123 | 124 | def apply_disparity_filter_directed(G, alpha_t=0.8, cut_mode='or'): 125 | '''See the docstring for the `apply_disparity_filter` function.''' 126 | B = nx.DiGraph() 127 | B.add_nodes_from(G.nodes(data=True)) 128 | for u, v, w in G.edges(data=True): 129 | try: 130 | alpha_in = w['alpha_in'] 131 | except KeyError: #there is no alpha_in, so we assign 1. It will never pass the cut 132 | alpha_in = 1 133 | try: 134 | alpha_out = w['alpha_out'] 135 | except KeyError: #there is no alpha_out, so we assign 1. It will never pass the cut 136 | alpha_out = 1 137 | 138 | if cut_mode == 'or': 139 | if alpha_in < alpha_t or alpha_out < alpha_t: 140 | B.add_edge(u, v) 141 | B[u][v].update(G[u][v]) 142 | elif cut_mode == 'and': 143 | if alpha_in < alpha_t and alpha_out < alpha_t: 144 | B.add_edge(u, v) 145 | B[u][v].update(G[u][v]) 146 | return B 147 | 148 | 149 | def apply_disparity_filter_undirected(G, alpha_t=0.8): 150 | '''See the docstring for the `apply_disparity_filter` function.''' 151 | B = nx.Graph() 152 | B.add_nodes_from(G.nodes(data=True)) 153 | for u, v, w in G.edges(data=True): 154 | 155 | try: 156 | alpha = w['alpha'] 157 | except KeyError: #there is no alpha, so we assign 1. It will never pass the cut 158 | alpha = 1 159 | 160 | if alpha < alpha_t: 161 | B.add_edge(u, v) 162 | B[u][v].update(G[u][v]) 163 | return B 164 | -------------------------------------------------------------------------------- /nate/utils/nlp_helpers.py: -------------------------------------------------------------------------------- 1 | """Utilities for NLP, mainly using spaCy.""" 2 | import spacy 3 | from spacy.pipeline import merge_entities 4 | from .mp_helpers import mp 5 | from tok import sent_tokenize 6 | from gensim.models.phrases import Phrases, Phraser 7 | from itertools import chain 8 | from ..svonet.svonet_class import process_svo 9 | 10 | # Everything from this point down was moved from the `text_helpers` module 11 | 12 | 13 | def spacy_process(nlp, joined, sub_tags, obj_tags, texts): 14 | """Processes texts in spaCy. 15 | 16 | Primary point of access to spaCy. Requires the NLP model object to be 17 | passed, as well as the texts to be processed. Setting joined to True 18 | will combine tokens into strings, separated by white space. If the 19 | svo_component is detected, will also accept subject tags and object 20 | tags to be passed to `process_svo` 21 | """ 22 | if 'svo_component' in nlp.pipe_names: 23 | processed_list = [ 24 | doc for doc in nlp.pipe(texts, 25 | component_cfg={ 26 | 'svo_component': { 27 | 'sub_tags': sub_tags, 28 | 'obj_tags': obj_tags 29 | } 30 | }) 31 | ] 32 | elif joined == True: 33 | processed_list = [' '.join(doc) for doc in nlp.pipe(texts)] 34 | else: 35 | processed_list = [doc for doc in nlp.pipe(texts)] 36 | return processed_list 37 | 38 | 39 | def default_filter_lemma(doc): # to do: make this user-configurable 40 | """Filters spaCy pipeline. 41 | 42 | This is the default filter to be used in the spaCy pipeline for tasks 43 | that don't involve SVO. 
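    Example (illustrative sketch of one way to wire this component into a
    spaCy 2.x pipeline; the exact wiring used elsewhere in the package may
    differ):

        nlp = spacy.load("en_core_web_sm")
        nlp.add_pipe(merge_entities)
        nlp.add_pipe(default_filter_lemma, name="filter_lemmas", last=True)
        filtered = [tokens for tokens in nlp.pipe(texts)]
        # each item is a list of kept bigram tokens and lowercased lemmas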
44 | """ 45 | proc = [] 46 | for token in doc: 47 | if '_' in token.text and len(token) > 2 and token.is_ascii: 48 | proc.append(token.text) 49 | if token.is_alpha and len(token) >2 and token.is_stop is False and token.is_ascii: 50 | proc.append(token.lemma_.lower()) 51 | 52 | return proc 53 | 54 | 55 | def custom_spacy_component(doc): 56 | """ 57 | Placeholder/example for a custom spaCy pipeline component 58 | """ 59 | return [ 60 | token.lemma_.lower() 61 | for token in doc 62 | if token.is_stop == False and token.is_ascii 63 | ] 64 | 65 | 66 | def svo_component(doc, sub_tags, obj_tags): 67 | """Processes text in the SVO pipeline. 68 | 69 | TODO: Why does this function only wrap around process_svo? Consider 70 | moving wrapped function here. 71 | """ 72 | doc = process_svo(sub_tags, obj_tags, doc) 73 | return doc 74 | 75 | 76 | def bigram_process(texts, trigrams, bigram_threshold, tokenized=True): 77 | """Uses gensim to detect bigrams and trigrams. 78 | 79 | Expects a list of texts. See gensim documentation for explanations 80 | of parameters: https://radimrehurek.com/gensim/models/phrases.html 81 | """ 82 | sentences = [sent_tokenize(text) for text in texts] # gensim needs documents to come in as a list of sentences 83 | all_sentences = list(chain(*sentences)) # flatten list of sentences for training purposes 84 | model = Phrases(all_sentences, min_count=1, threshold=bigram_threshold, scoring='npmi') # train the model 85 | bigrammer = Phraser(model) # create more efficient applicator of trained model 86 | bigrammed_list = [[bigrammer[sent] for sent in doc] for doc in sentences] # apply the model to the original texts 87 | if trigrams == True: # gensim detects trigrams by stacking bigram detection on text with detected bigrams 88 | trigram_model = Phrases(bigrammer[all_sentences], min_count=1, threshold=bigram_threshold, scoring='npmi') 89 | trigrammer = Phraser(trigram_model) 90 | bigrammed_list = [[trigrammer[bigrammer[sent]] for sent in doc] for doc in sentences] 91 | bigrammed_list = [list(chain(*x)) for x in bigrammed_list] 92 | # option to return text in original form, but with underscores between bigrams 93 | if tokenized == False: 94 | bigrammed_list = [' '.join(doc) for doc in bigrammed_list] 95 | 96 | return bigrammed_list 97 | -------------------------------------------------------------------------------- /nate/utils/text_helpers.py: -------------------------------------------------------------------------------- 1 | """Utilities for manipulation of plain text.""" 2 | 3 | import pandas as pd 4 | import re 5 | 6 | 7 | def window_text(string_of_text, window_lr=3): 8 | """Creates a list of windowed strings. 9 | 10 | This function splits a string into tokens on each space. Then it iterates 11 | over each token and takes add n words to a new list where n = the number of 12 | ``window_lr`` * 2 + 1. This is because ``window_lr'' is the number of 13 | words to grab to the left AND to the right of each token in the string. 14 | If ``window_lr'' = 2, then it will take the token itself, 2 words to the 15 | left of the token, and 2 words to the right of a token. The result is a 16 | window of 5 words. As a result of this design decision, the smallest window 17 | possible is 3 words, which can be given by ``window_lr'' = 1. Finally, the 18 | windows at the start and end of a text string will be smaller than the rest 19 | because they will have fewer words at the start (nothing / less to the left) 20 | and at the end (nothing / less to the right). 
This function is designed to 21 | take in a string. If the string is pre-processed (which it should be), make 22 | sure it is receiving a string, not tokenized from another package, like 23 | spacy or nltk. 24 | 25 | The output of this function is a new list of windowed strings. It can be 26 | fed into functions like construct_conet() to construct a co-occurrence 27 | network where co-occurrence happens between words within a moving window. 28 | Obviously, this is the function that makes the windows, not the 29 | co-occurrence network. 30 | """ 31 | tokens = string_of_text.split() 32 | for _ in tokens: 33 | context = [] 34 | for index in range(len(tokens)): 35 | start = max(0, index - window_lr) 36 | finish = min(len(tokens), index + window_lr + 1)  # include window_lr tokens to the right 37 | left = " ".join(tokens[start:index]) 38 | right = " ".join(tokens[index + 1:finish]) 39 | context.append("{} {} {}".format(left, tokens[index], right)) 40 | return context 41 | 42 | 43 | def search_entities(raw_text_string, search): 44 | """Searches for known entities in a string. 45 | 46 | Helper function for construct_entity_conet(). Iterates over a list 47 | of entities and looks to see if they are present in a given text 48 | string. If they are, then it will append the entity to a list for 49 | each text. These lists of ents appearing in texts can be used to 50 | construct a network of entities that co-occur within texts. 51 | """ 52 | ents = [] 53 | for entity in search: 54 | if entity.lower() in raw_text_string.lower(): 55 | ents.append(entity.lower()) 56 | return ents 57 | 58 | 59 | def adjmat_to_wel(adjmat, remove_self_loops=True): 60 | """ Accepts an adjacency matrix and outputs a weighted edgelist.""" 61 | # `adjmat` is expected to be a pandas DataFrame with node labels as its index and columns 62 | adjmat.fillna(0, inplace=True) 63 | 64 | if remove_self_loops is True: 65 | # zero out the diagonal 66 | for i in adjmat.index: 67 | adjmat.loc[i, i] = 0 68 | else: 69 | pass 70 | 71 | wel = [('i', 'j', 'Weight')] 72 | for source in adjmat.index.values: 73 | for target in adjmat.index.values: 74 | if adjmat[source][target] > 0: 75 | wel.append((target, source, adjmat[source][target])) 76 | return wel 77 | 78 | 79 | def write_topics(model, feature_names, no_top_words, filename='topics.txt'): 80 | """ 81 | Writes the top `no_top_words` words for each topic in a fitted topic model to `filename`, one topic per line.
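    Example (illustrative sketch; assumes a fitted scikit-learn-style topic
    model exposing `components_`, e.g. LatentDirichletAllocation, plus the
    vectorizer used to build its document-term matrix `dtm`):

        lda = LatentDirichletAllocation(n_components=10).fit(dtm)
        write_topics(lda, vectorizer.get_feature_names(), no_top_words=10)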
82 | """ 83 | with open(filename, 'w') as f: 84 | for topic_idx, topic in enumerate(model.components_): 85 | f.write("Topic {}: ".format(topic_idx)) 86 | f.write(" ".join([ 87 | feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1] 88 | ])) 89 | f.write('\n') 90 | 91 | def is_ascii(s): 92 | """Determines if a string is encoded in ascii.""" 93 | try: 94 | s.encode('ascii') 95 | except UnicodeEncodeError: 96 | return False 97 | return True 98 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name="nate", 5 | version="0.0.1", 6 | install_requires=[ 7 | "pandas>=0.25.0", 8 | "spacy", 9 | #"python-igraph>=0.8.0", 10 | "tok", 11 | "numba", 12 | "joblib", 13 | "matplotlib", 14 | "networkx", 15 | "pillow", 16 | "stop_words", 17 | "gensim" 18 | ], # A bunch of things will need to go here; we'll have to do an audit of every package we use 19 | packages = find_packages(), 20 | include_package_data=True, 21 | author = "John McLevey, Tyler Crick, Pierson Browne", # likely more later 22 | description = "nate (Network Analysis with TExt).", 23 | url="http://networkslab.org", 24 | classifiers=( 25 | "Programming Language :: Python :: 3", 26 | "License :: OSI Approved :: MIT License", 27 | "Operating System :: OS Independent", 28 | ) 29 | ) 30 | -------------------------------------------------------------------------------- /tests/importers/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import pandas as pd 3 | 4 | @pytest.fixture(scope="module") 5 | def df(): 6 | df = pd.read_csv("tests/ira_data/IRAhandle_tweets_1.csv") 7 | return df 8 | 9 | @pytest.fixture(scope="module") 10 | def df11(): 11 | df = pd.read_csv("tests/ira_data/IRAhandle_tweets_11.csv") 12 | return df 13 | 14 | @pytest.fixture(scope="module") 15 | def empty_df(df): 16 | return pd.DataFrame(columns=df.columns) 17 | 18 | @pytest.fixture 19 | def dict_of_dicts_text(df): 20 | return {df["tweet_id"][i]: {"text": df["content"][i]} for i in range(0,10)} 21 | -------------------------------------------------------------------------------- /tests/importers/test_dfimporters.py: -------------------------------------------------------------------------------- 1 | import nate.importers.dataframe_importers as tst 2 | from nate.importers.timestamp_process import convert_times 3 | import pytest 4 | import pandas as pd 5 | 6 | # fixtures for import_csv 7 | @pytest.fixture 8 | def csv_file(): 9 | return "tests/ira_data/IRAhandle_tweets_1.csv" 10 | 11 | @pytest.fixture 12 | def csv_files(): 13 | return ["tests/ira_data/IRAhandle_tweets_1.csv", 14 | "tests/ira_data/IRAhandle_tweets_11.csv"] 15 | 16 | # fixtures for import_excel 17 | @pytest.fixture 18 | def excel_file(): 19 | return "tests/ira_data/[..]" 20 | 21 | @pytest.fixture 22 | def excel_files(): 23 | return ["tests/ira_data/IRAhandle_tweets_1.xlsx", 24 | "tests/ira_data/IRAhandle_tweets_11.xlsx"] 25 | 26 | # tests for process_dataframe 27 | def test_process_dataframe_empty(empty_df): 28 | nt = tst.process_dataframe(empty_df, "content", "tweet_id", "publish_date", 29 | columns_to_keep=["account_category"]) 30 | assert nt.list_texts() == [] 31 | assert nt.list_ids() == [] 32 | assert nt.list_times() == [] 33 | assert nt.list_column("account_category") == [] 34 | 35 | def test_process_dataframe_full(df): 36 | nt = 
tst.process_dataframe(df, "content", "tweet_id", "publish_date", 37 | columns_to_keep=["account_category"]) 38 | assert nt.list_texts(0,5) == df["content"][0:5].tolist() 39 | assert nt.list_ids(0,5) == df["tweet_id"][0:5].tolist() 40 | assert nt.list_times(0,5) == convert_times(df["publish_date"][0:5].tolist()) 41 | assert nt.list_column("account_category",0,5) == df["account_category"][0:5].tolist() 42 | 43 | # tests for import_dataframe (wrapper around process_dataframe) 44 | def test_import_dataframe_empty(empty_df): 45 | nt = tst.import_dataframe(empty_df, "content", "tweet_id", "publish_date", 46 | columns_to_keep=["account_category"]) 47 | assert nt.list_texts() == [] 48 | assert nt.list_ids() == [] 49 | assert nt.list_times() == [] 50 | assert nt.list_column("account_category") == [] 51 | 52 | # tests for import_csv 53 | def test_import_csv_string(csv_file, df): 54 | nt = tst.import_csv(csv_file, "content", "tweet_id", "publish_date", 55 | columns_to_keep=["account_category"]) 56 | assert nt.list_texts(0,5) == df["content"][0:5].tolist() 57 | assert nt.list_ids(0,5) == df["tweet_id"][0:5].tolist() 58 | assert nt.list_times(0,5) == convert_times(df["publish_date"][0:5].tolist()) 59 | assert nt.list_column("account_category",0,5) == df["account_category"][0:5].tolist() 60 | 61 | def test_import_csv_list(csv_files, df, df11): 62 | nt = tst.import_csv(csv_files, "content", "tweet_id", "publish_date", 63 | columns_to_keep=["account_category"]) 64 | assert nt.list_texts(0,5) == df["content"][0:5].tolist() 65 | assert nt.list_ids(0,5) == df["tweet_id"][0:5].tolist() 66 | assert nt.list_times(0,5) == convert_times(df["publish_date"][0:5].tolist()) 67 | assert nt.list_column("account_category",0,5) == df["account_category"][0:5].tolist() 68 | assert nt.list_texts(243891, 243896) == df11["content"][0:5].tolist() 69 | assert nt.list_ids(243891, 243896) == df11["tweet_id"][0:5].tolist() 70 | assert nt.list_times(243891, 243896) == convert_times(df11["publish_date"][0:5].tolist()) 71 | assert nt.list_column("account_category", 243891, 243896) == df11["account_category"][0:5].tolist() 72 | 73 | # tests for import_excel 74 | # TODO: add xlsx files. Issues saving them through python. 
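# One possible way to generate the missing .xlsx fixtures from the CSVs already
# used above (illustrative sketch only, not part of the suite; writing .xlsx
# files requires the openpyxl dependency):
#
#     for name in ["IRAhandle_tweets_1", "IRAhandle_tweets_11"]:
#         pd.read_csv(f"tests/ira_data/{name}.csv").to_excel(
#             f"tests/ira_data/{name}.xlsx", index=False)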
75 | def test_import_excel_string(excel_file): 76 | nt = tst.import_excel(excel_file, "content", "tweet_id", "publish_date", 77 | columns_to_keep=["account_category"]) 78 | assert nt.list_texts(0,5) == df["content"][0:5].tolist() 79 | assert nt.list_ids(0,5) == df["tweet_id"][0:5].tolist() 80 | assert nt.list_times(0,5) == convert_times(df["publish_date"][0:5].tolist()) 81 | assert nt.list_column("account_category",0,5) == df["account_category"][0:5].tolist() 82 | 83 | def test_import_excel_strings(excel_files): 84 | nt = tst.import_excel(excel_files, "content", "tweet_id", "publish_date", 85 | columns_to_keep=["account_category"]) 86 | assert nt.list_texts(0,5) == df["content"][0:5].tolist() 87 | assert nt.list_ids(0,5) == df["tweet_id"][0:5].tolist() 88 | assert nt.list_times(0,5) == convert_times(df["publish_date"][0:5].tolist()) 89 | assert nt.list_column("account_category",0,5) == df["account_category"][0:5].tolist() 90 | assert nt.list_texts(243891, 243896) == df11["content"][0:5].tolist() 91 | assert nt.list_ids(243891, 243896) == df11["tweet_id"][0:5].tolist() 92 | assert nt.list_times(243891, 243896) == convert_times(df11["publish_date"][0:5].tolist()) 93 | assert nt.list_column("account_category", 243891, 243896) == df11["account_category"][0:5].tolist() 94 | -------------------------------------------------------------------------------- /tests/importers/test_namedtuples.py: -------------------------------------------------------------------------------- 1 | import nate.importers.named_tuple_generator as tst 2 | from collections import namedtuple 3 | import pytest 4 | 5 | # fixtures for create_observation_list 6 | @pytest.fixture 7 | def list_of_lists(): 8 | return [["January", "February", "March"], 9 | [1, 2, 3], 10 | ["JA", "FE", "MR"]] 11 | 12 | @pytest.fixture 13 | def created_obs_list(list_of_lists): 14 | return tst.create_observation_list("Month", name=list_of_lists[0], 15 | number=list_of_lists[1], 16 | abbr=list_of_lists[2]) 17 | @pytest.fixture 18 | def uneven_list_of_lists(): 19 | return [["January", "February", "March", "April"], 20 | [1, 2], 21 | ["JA", "FE", "MR"]] 22 | 23 | # fixtures for tupleize 24 | @pytest.fixture 25 | def series_dict(list_of_lists): 26 | return {"name":list_of_lists[0], "number":list_of_lists[1], "abbr":list_of_lists[2]} 27 | 28 | @pytest.fixture 29 | def series_dict_tuple(series_dict): 30 | return {k: tuple(v) for k, v in series_dict.items()} 31 | 32 | # tests for create_observation_list 33 | def test_create_observation_list_names(created_obs_list): 34 | assert created_obs_list[0]._fields == ("name", "number", "abbr") 35 | assert created_obs_list[0].name == "January" 36 | 37 | def test_create_observation_list_contents(created_obs_list): 38 | assert created_obs_list == [("January", 1, "JA"), 39 | ("February", 2, "FE"), 40 | ("March", 3, "MR")] 41 | 42 | def test_create_observation_list_exn(uneven_list_of_lists): 43 | try: 44 | tst.create_observation_list("Month", name=uneven_list_of_lists[0], 45 | number=uneven_list_of_lists[1], 46 | abbr=uneven_list_of_lists[2]) 47 | except Exception as exn: 48 | assert exn.args[0] == "Not all of the input data is the same length." 
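# An equivalent check using pytest.raises (illustrative sketch; unlike the
# try/except version above, it also fails if no exception is raised at all):
def test_create_observation_list_exn_raises(uneven_list_of_lists):
    with pytest.raises(Exception,
                       match="Not all of the input data is the same length."):
        tst.create_observation_list("Month",
                                    name=uneven_list_of_lists[0],
                                    number=uneven_list_of_lists[1],
                                    abbr=uneven_list_of_lists[2])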
49 |
50 |
51 | # tests for tupleize
52 | def test_tupleize_names(series_dict):
53 |     obs_list = tst.tupleize(series_dict)
54 |     assert obs_list[0]._fields == ("name", "number", "abbr")
55 |     assert obs_list[0].name == "January"
56 |
57 | def test_tupleize_lists(series_dict):
58 |     obs_list = tst.tupleize(series_dict)
59 |     assert obs_list[1].name == "February"
60 |     assert obs_list == [("January", 1, "JA"),
61 |                         ("February", 2, "FE"),
62 |                         ("March", 3, "MR")]
63 |
64 | def test_tupleize_tuples(series_dict_tuple):
65 |     obs_list = tst.tupleize(series_dict_tuple)
66 |     assert obs_list[1].name == "February"
67 |     assert obs_list == [("January", 1, "JA"),
68 |                         ("February", 2, "FE"),
69 |                         ("March", 3, "MR")]
70 |
--------------------------------------------------------------------------------
/tests/importers/test_nate.py:
--------------------------------------------------------------------------------
1 | import nate.importers.nate_class as tst
2 | from nate.importers.dataframe_importers import import_dataframe
3 | from nate.importers.raw_importers import import_dict_of_dicts
4 | import pytest
5 | from pprint import pformat
6 |
7 | @pytest.fixture
8 | def nate_empty_obj(empty_df):
9 |     nt = import_dataframe(empty_df, "content", "tweet_id", "publish_date",
10 |                           columns_to_keep=["account_category"])
11 |     return nt
12 |
13 | @pytest.fixture
14 | def nate_full_obj(df):
15 |     nt = import_dataframe(df, "content", "tweet_id", "publish_date",
16 |                           columns_to_keep=["account_category"])
17 |     return nt
18 |
19 | @pytest.fixture
20 | def nate_text_only(dict_of_dicts_text):
21 |     return import_dict_of_dicts(dict_of_dicts_text, "text")
22 |
23 | # test __call__
24 | def test_call_empty(nate_empty_obj, capsys):
25 |     nate_empty_obj()
26 |     captured = capsys.readouterr()
27 |     assert captured.out == "[]\n"
28 |
29 | def test_call(nate_full_obj, capsys):
30 |     nate_full_obj()
31 |     captured = capsys.readouterr()
32 |     assert captured.out == pformat(nate_full_obj.data[0:5]) + "\n"
33 |
34 | def test_call_nums(nate_full_obj, capsys):
35 |     nate_full_obj(2,9)
36 |     captured = capsys.readouterr()
37 |     assert captured.out == pformat(nate_full_obj.data[2:9]) + "\n"
38 |
39 |
40 | # test __getitem__
41 | def test_getitem_empty(nate_empty_obj):
42 |     with pytest.raises(IndexError):
43 |         nate_empty_obj[0]
44 |
45 | def test_getitem(nate_full_obj):
46 |     assert nate_full_obj[0] == nate_full_obj.data[0]
47 |     assert nate_full_obj[0:5] == nate_full_obj.data[0:5]
48 |     assert nate_full_obj[-1] == nate_full_obj.data[-1]
49 |
50 | # test head
51 | def test_head_empty(nate_empty_obj, capsys):
52 |     nate_empty_obj.head()
53 |     captured = capsys.readouterr()
54 |     assert captured.out == "[]\n"
55 |
56 | def test_head(nate_full_obj, capsys):
57 |     nate_full_obj.head()
58 |     captured = capsys.readouterr()
59 |     assert captured.out == pformat(nate_full_obj.data[0:5]) + "\n"
60 |
61 | def test_head_nums(nate_full_obj, capsys):
62 |     nate_full_obj.head(2,9)
63 |     captured = capsys.readouterr()
64 |     assert captured.out == pformat(nate_full_obj.data[2:9]) + "\n"
65 |
66 | # test list_texts
67 | def test_list_texts_empty(nate_empty_obj):
68 |     assert nate_empty_obj.list_texts() == []
69 |
70 | def test_list_texts(nate_full_obj):
71 |     text_list = [i.text for i in nate_full_obj.data[0:5]]
72 |     assert nate_full_obj.list_texts() == text_list
73 |
74 | # test list_times
75 | def test_list_times_empty(nate_empty_obj):
76 |     assert nate_empty_obj.list_times() == []
77 |
78 | def test_list_times_exn(nate_text_only):
79 |     with pytest.raises(AttributeError):
80 |         nate_text_only.list_times()
81 |
82 | def test_list_times(nate_full_obj):
83 |     times_list = [i.time for i in nate_full_obj.data[0:5]]
84 |     assert nate_full_obj.list_times() == times_list
85 |
86 | # test list_ids
87 | def test_list_ids_empty(nate_empty_obj):
88 |     assert nate_empty_obj.list_ids() == []
89 |
90 | def test_list_ids_exn(nate_text_only):
91 |     with pytest.raises(AttributeError):
92 |         nate_text_only.list_ids()
93 |
94 | def test_list_ids(nate_full_obj):
95 |     id_list = [i.id for i in nate_full_obj.data[0:5]]
96 |     assert nate_full_obj.list_ids() == id_list
97 |
--------------------------------------------------------------------------------
/tests/importers/test_rawimporters.py:
--------------------------------------------------------------------------------
1 | from nate.importers.nate_class import Nate
2 | import nate.importers.raw_importers as tst
3 | import pytest
4 |
5 | # fixtures for import_text
6 | @pytest.fixture
7 | def string():
8 |     return "Nate is a cool package!"
9 |
10 | @pytest.fixture
11 | def list_of_strings(df):
12 |     return df["content"][0:10].values.tolist()
13 |
14 | # fixtures for import_files
15 | @pytest.fixture
16 | def file():
17 |     return "tests/importers/textfiles/1.txt"
18 |
19 | @pytest.fixture
20 | def list_of_files():
21 |     return ["tests/importers/textfiles/1.txt",
22 |             "tests/importers/textfiles/2.txt",
23 |             "tests/importers/textfiles/3.txt"]
24 |
25 | # fixtures for import_dict_of_dicts
26 |
27 | # see conftest.py for dict_of_dicts_text
28 |
29 | @pytest.fixture
30 | def dict_of_dicts_cols(df):
31 |     return {df["tweet_id"][i]: {"text": df["content"][i],
32 |                                 "account": df.author[i]} for i in range(0,10)}
33 |
34 | # test import_text
35 | def test_import_text_string(string):
36 |     nt = tst.import_text(string)
37 |     assert nt.list_texts() == [string]
38 |
39 | def test_import_text_strings(list_of_strings):
40 |     nt = tst.import_text(list_of_strings)
41 |     assert nt.list_texts() == list_of_strings
42 |
43 | # test import_files
44 | def test_import_files_single(file):
45 |     nt = tst.import_files(file)
46 |     with open(file, 'r') as stream:
47 |         string = stream.read().replace("\n", " ")
48 |     assert nt.list_texts() == [string]
49 |
50 | def test_import_files_list(list_of_files):
51 |     nt = tst.import_files(list_of_files)
52 |
53 |     strings = []
54 |     for file in list_of_files:
55 |         with open(file, 'r') as stream:
56 |             strings.append(stream.read().replace("\n", " "))
57 |
58 |     assert nt.list_texts() == strings
59 |
60 | # test import_dict_of_dicts
61 | def test_dict_of_dicts_texts(dict_of_dicts_text):
62 |     nt = tst.import_dict_of_dicts(dict_of_dicts_text, "text")
63 |     ids = nt.list_ids()
64 |     texts = nt.list_texts()
65 |     for i in range(0,10):
66 |         assert texts[i] == dict_of_dicts_text[ids[i]]["text"]
67 |
68 |
69 | def test_dict_of_dicts_cols(dict_of_dicts_cols):
70 |     nt = tst.import_dict_of_dicts(dict_of_dicts_cols, "text", values_to_keep=["account"])
71 |     ids = nt.list_ids()
72 |     texts = nt.list_texts()
73 |     accounts = nt.list_column("account", end=10)
74 |     for i in range(0,10):
75 |         assert texts[i] == dict_of_dicts_cols[ids[i]]["text"]
76 |         assert accounts[i] == dict_of_dicts_cols[ids[i]]["account"]
77 |
--------------------------------------------------------------------------------
/tests/importers/test_times.py:
--------------------------------------------------------------------------------
1 | import nate.importers.timestamp_process as tst
2 | import pytest
3 | import pandas as pd
4 | from datetime import timezone, timedelta
5 |
6 | # fixtures to test 
convert_time 7 | @pytest.fixture 8 | def time_0(): 9 | return "1/1/1970 00:00" 10 | 11 | @pytest.fixture 12 | def time_1(): 13 | return "11/12/2019 13:35" 14 | 15 | 16 | # fixtures to test convert_times 17 | @pytest.fixture 18 | def times_empty(): 19 | return [] 20 | 21 | @pytest.fixture 22 | def times_0(): 23 | return ["1/1/1970 00:00", "1/1/1970 00:02", "1/1/1970 01:01"] 24 | 25 | @pytest.fixture 26 | def times_1(df): 27 | return df["publish_date"][0:3] 28 | 29 | # tests for convert_time 30 | def test_convert_time_0(time_0): 31 | assert tst.convert_time(time_0) == 0 32 | 33 | def test_convert_time_1(time_1): 34 | assert tst.convert_time(time_1) == 1573565700 35 | 36 | def test_convert_time_timezone(time_0): 37 | assert tst.convert_time(time_0, timezone(timedelta(hours=-3))) == 10800 38 | 39 | # tests for convert_times 40 | def test_convert_times_empty(times_empty): 41 | assert tst.convert_times(times_empty) == [] 42 | 43 | def test_convert_times_0(times_0): 44 | assert tst.convert_times(times_0) == [0,120,3660] 45 | 46 | def test_convert_times_1(times_1): 47 | assert tst.convert_times(times_1) == [1506887880, 1506897780, 1506898200] 48 | 49 | def test_convert_times_timezone(times_0): 50 | assert tst.convert_times(times_0, 51 | timezone(timedelta(hours=-3))) == [0+10800, 120+10800, 3660+10800] 52 | -------------------------------------------------------------------------------- /tests/importers/textfiles/1.txt: -------------------------------------------------------------------------------- 1 | How promotion excellent curiosity yet attempted happiness. Gay prosperous impression had conviction. For every delay death ask style. Me mean able my by in they. Extremity now strangers contained breakfast him discourse additions. Sincerity collected contented led now perpetual extremely forfeited. 2 | 3 | Bringing unlocked me an striking ye perceive. Mr by wound hours oh happy. Me in resolution pianoforte continuing we. Most my no spot felt by no. He he in forfeited furniture sweetness he arranging. Me tedious so to behaved written account ferrars moments. Too objection for elsewhere her preferred allowance her. Marianne shutters mr steepest to me. Up mr ignorant produced distance although is sociable blessing. Ham whom call all lain like. 4 | 5 | Demesne far hearted suppose venture excited see had has. Dependent on so extremely delivered by. Yet no jokes worse her why. Bed one supposing breakfast day fulfilled off depending questions. Whatever boy her exertion his extended. Ecstatic followed handsome drawings entirely mrs one yet outweigh. Of acceptance insipidity remarkably is invitation. 6 | 7 | Is at purse tried jokes china ready decay an. Small its shy way had woody downs power. To denoting admitted speaking learning my exercise so in. Procured shutters mr it feelings. To or three offer house begin taken am at. As dissuade cheerful overcame so of friendly he indulged unpacked. Alteration connection to so as collecting me. Difficult in delivered extensive at direction allowance. Alteration put use diminution can considered sentiments interested discretion. An seeing feebly stairs am branch income me unable. 8 | 9 | He my polite be object oh change. Consider no mr am overcame yourself throwing sociable children. Hastily her totally conduct may. My solid by stuff first smile fanny. Humoured how advanced mrs elegance sir who. Home sons when them dine do want to. Estimating themselves unsatiable imprudence an he at an. Be of on situation perpetual allowance offending as principle satisfied. 
Improved carriage securing are desirous too. 10 | 11 | So by colonel hearted ferrars. Draw from upon here gone add one. He in sportsman household otherwise it perceived instantly. Is inquiry no he several excited am. Called though excuse length ye needed it he having. Whatever throwing we on resolved entrance together graceful. Mrs assured add private married removed believe did she. 12 | 13 | Breakfast agreeable incommode departure it an. By ignorant at on wondered relation. Enough at tastes really so cousin am of. Extensive therefore supported by extremity of contented. Is pursuit compact demesne invited elderly be. View him she roof tell her case has sigh. Moreover is possible he admitted sociable concerns. By in cold no less been sent hard hill. 14 | 15 | Started his hearted any civilly. So me by marianne admitted speaking. Men bred fine call ask. Cease one miles truth day above seven. Suspicion sportsmen provision suffering mrs saw engrossed something. Snug soon he on plan in be dine some. 16 | 17 | Effect if in up no depend seemed. Ecstatic elegance gay but disposed. We me rent been part what. An concluded sportsman offending so provision mr education. Bed uncommonly his discovered for estimating far. Equally he minutes my hastily. Up hung mr we give rest half. Painful so he an comfort is manners. 18 | 19 | An country demesne message it. Bachelor domestic extended doubtful as concerns at. Morning prudent removal an letters by. On could my in order never it. Or excited certain sixteen it to parties colonel. Depending conveying direction has led immediate. Law gate her well bed life feet seen rent. On nature or no except it sussex. 20 | -------------------------------------------------------------------------------- /tests/importers/textfiles/2.txt: -------------------------------------------------------------------------------- 1 | Not him old music think his found enjoy merry. Listening acuteness dependent at or an. Apartments thoroughly unsatiable terminated sex how themselves. She are ten hours wrong walls stand early. Domestic perceive on an ladyship extended received do. Why jennings our whatever his learning gay perceive. Is against no he without subject. Bed connection unreserved preference partiality not unaffected. Years merit trees so think in hoped we as. 2 | 3 | Whether article spirits new her covered hastily sitting her. Money witty books nor son add. Chicken age had evening believe but proceed pretend mrs. At missed advice my it no sister. Miss told ham dull knew see she spot near can. Spirit her entire her called. 4 | 5 | Up unpacked friendly ecstatic so possible humoured do. Ample end might folly quiet one set spoke her. We no am former valley assure. Four need spot ye said we find mile. Are commanded him convinced dashwoods did estimable forfeited. Shy celebrated met sentiments she reasonably but. Proposal its disposed eat advanced marriage sociable. Drawings led greatest add subjects endeavor gay remember. Principles one yet assistance you met impossible. 6 | 7 | On recommend tolerably my belonging or am. Mutual has cannot beauty indeed now sussex merely you. It possible no husbands jennings ye offended packages pleasant he. Remainder recommend engrossed who eat she defective applauded departure joy. Get dissimilar not introduced day her apartments. Fully as taste he mr do smile abode every. Luckily offered article led lasting country minutes nor old. Happen people things oh is oppose up parish effect. Law handsome old outweigh humoured far appetite. 
8 | 9 | Post no so what deal evil rent by real in. But her ready least set lived spite solid. September how men saw tolerably two behaviour arranging. She offices for highest and replied one venture pasture. Applauded no discovery in newspaper allowance am northward. Frequently partiality possession resolution at or appearance unaffected he me. Engaged its was evident pleased husband. Ye goodness felicity do disposal dwelling no. First am plate jokes to began of cause an scale. Subjects he prospect elegance followed no overcame possible it on. 10 | 11 | Forfeited you engrossed but gay sometimes explained. Another as studied it to evident. Merry sense given he be arise. Conduct at an replied removal an amongst. Remaining determine few her two cordially admitting old. Sometimes strangers his ourselves her depending you boy. Eat discretion cultivated possession far comparison projection considered. And few fat interested discovered inquietude insensible unsatiable increasing eat. 12 | 13 | He moonlight difficult engrossed an it sportsmen. Interested has all devonshire difficulty gay assistance joy. Unaffected at ye of compliment alteration to. Place voice no arise along to. Parlors waiting so against me no. Wishing calling are warrant settled was luckily. Express besides it present if at an opinion visitor. 14 | 15 | Smallest directly families surprise honoured am an. Speaking replying mistress him numerous she returned feelings may day. Evening way luckily son exposed get general greatly. Zealously prevailed be arranging do. Set arranging too dejection september happiness. Understood instrument or do connection no appearance do invitation. Dried quick round it or order. Add past see west felt did any. Say out noise you taste merry plate you share. My resolve arrived is we chamber be removal. 16 | 17 | Much did had call new drew that kept. Limits expect wonder law she. Now has you views woman noisy match money rooms. To up remark it eldest length oh passed. Off because yet mistake feeling has men. Consulted disposing to moonlight ye extremity. Engage piqued in on coming. 18 | 19 | Is we miles ready he might going. Own books built put civil fully blind fanny. Projection appearance at of admiration no. As he totally cousins warrant besides ashamed do. Therefore by applauded acuteness supported affection it. Except had sex limits county enough the figure former add. Do sang my he next mr soon. It merely waited do unable. 20 | 21 | -------------------------------------------------------------------------------- /tests/importers/textfiles/3.txt: -------------------------------------------------------------------------------- 1 | Advantage old had otherwise sincerity dependent additions. It in adapted natural hastily is justice. Six draw you him full not mean evil. Prepare garrets it expense windows shewing do an. She projection advantages resolution son indulgence. Part sure on no long life am at ever. In songs above he as drawn to. Gay was outlived peculiar rendered led six. 2 | 3 | Am terminated it excellence invitation projection as. She graceful shy believed distance use nay. Lively is people so basket ladies window expect. Supply as so period it enough income he genius. Themselves acceptance bed sympathize get dissimilar way admiration son. Design for are edward regret met lovers. This are calm case roof and. 4 | 5 | Extended kindness trifling remember he confined outlived if. Assistance sentiments yet unpleasing say. Open they an busy they my such high. 
An active dinner wishes at unable hardly no talked on. Immediate him her resolving his favourite. Wished denote abroad at branch at. 6 | 7 | In show dull give need so held. One order all scale sense her gay style wrote. Incommode our not one ourselves residence. Shall there whose those stand she end. So unaffected partiality indulgence dispatched to of celebrated remarkably. Unfeeling are had allowance own perceived abilities. 8 | 9 | Up branch to easily missed by do. Admiration considered acceptance too led one melancholy expression. Are will took form the nor true. Winding enjoyed minuter her letters evident use eat colonel. He attacks observe mr cottage inquiry am examine gravity. Are dear but near left was. Year kept on over so as this of. She steepest doubtful betrayed formerly him. Active one called uneasy our seeing see cousin tastes its. Ye am it formed indeed agreed relied piqued. 10 | 11 | Or neglected agreeable of discovery concluded oh it sportsman. Week to time in john. Son elegance use weddings separate. Ask too matter formed county wicket oppose talent. He immediate sometimes or to dependent in. Everything few frequently discretion surrounded did simplicity decisively. Less he year do with no sure loud. 12 | 13 | Dwelling and speedily ignorant any steepest. Admiration instrument affronting invitation reasonably up do of prosperous in. Shy saw declared age debating ecstatic man. Call in so want pure rank am dear were. Remarkably to continuing in surrounded diminution on. In unfeeling existence objection immediate repulsive on he in. Imprudence comparison uncommonly me he difficulty diminution resolution. Likewise proposal differed scarcely dwelling as on raillery. September few dependent extremity own continued and ten prevailed attending. Early to weeks we could. 14 | 15 | Quick six blind smart out burst. Perfectly on furniture dejection determine my depending an to. Add short water court fat. Her bachelor honoured perceive securing but desirous ham required. Questions deficient acuteness to engrossed as. Entirely led ten humoured greatest and yourself. Besides ye country on observe. She continue appetite endeavor she judgment interest the met. For she surrounded motionless fat resolution may. 16 | 17 | Improve ashamed married expense bed her comfort pursuit mrs. Four time took ye your as fail lady. Up greatest am exertion or marianne. Shy occasional terminated insensible and inhabiting gay. So know do fond to half on. Now who promise was justice new winding. In finished on he speaking suitable advanced if. Boy happiness sportsmen say prevailed offending concealed nor was provision. Provided so as doubtful on striking required. Waiting we to compass assured. 18 | 19 | She exposed painted fifteen are noisier mistake led waiting. Surprise not wandered speedily husbands although yet end. Are court tiled cease young built fat one man taken. We highest ye friends is exposed equally in. Ignorant had too strictly followed. Astonished as travelling assistance or unreserved oh pianoforte ye. Five with seen put need tore add neat. Bringing it is he returned received raptures. 20 | --------------------------------------------------------------------------------
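Taken together, the importer tests above pin down a small, consistent surface: `import_dataframe` plus the `list_texts`, `list_ids`, `list_times`, `list_column`, and `head` accessors, with timestamps normalized to epoch seconds. The sketch below exercises that surface end to end; the toy DataFrame is illustrative only and is not one of the repository's fixtures.

```python
import pandas as pd
from nate.importers.dataframe_importers import import_dataframe

# Illustrative frame mirroring the column names used in the test fixtures;
# it is not part of the repository's test data.
frame = pd.DataFrame({
    "tweet_id": [1, 2],
    "content": ["first post", "second post"],
    "publish_date": ["1/1/1970 00:00", "1/1/1970 00:02"],
    "account_category": ["a", "b"],
})

nt = import_dataframe(frame, "content", "tweet_id", "publish_date",
                      columns_to_keep=["account_category"])
nt.head()                                  # pretty-prints the first observations
print(nt.list_texts())                     # -> ['first post', 'second post']
print(nt.list_times())                     # -> [0, 120]  (epoch seconds, per test_times.py)
print(nt.list_column("account_category"))  # -> ['a', 'b']
```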