├── .editorconfig ├── .gitignore ├── LICENSE.txt ├── README.md ├── bin └── textplot ├── notes └── mental-maps │ ├── figures │ ├── battle-v-dance.png │ ├── borodino-austerlitz.png │ ├── borodino-v-austerlitz.png │ ├── horse-histogram.png │ ├── horse-kde.png │ ├── horse-offsets.png │ ├── horse-v-galloped.png │ ├── horse-v-rode.png │ ├── kernels.png │ ├── peace.png │ ├── war-v-peace.png │ └── war.png │ ├── index.md │ └── networks │ ├── divine-comedy.jpg │ ├── leaves-of-grass.jpg │ ├── moby-dick.jpg │ ├── notes-from-underground.jpg │ ├── odyssey.jpg │ ├── origin-of-species.jpg │ ├── walden.jpg │ ├── war-and-peace.jpg │ └── winters-tale.jpg ├── requirements.txt ├── setup.py ├── test ├── matrix │ ├── __init__.py │ ├── test_anchored_pairs.py │ ├── test_index.py │ └── test_set_get.py ├── text │ ├── __init__.py │ ├── fixtures │ │ └── stopwords.txt │ ├── test_load_stopwords.py │ ├── test_most_frequent_terms.py │ ├── test_term_count_buckets.py │ ├── test_term_counts.py │ ├── test_tokenize.py │ └── test_unstem.py └── utils │ ├── __init__.py │ ├── test_sort_dict.py │ ├── test_tokenize.py │ └── test_window.py └── textplot ├── __init__.py ├── data └── stopwords.txt ├── graphs.py ├── helpers.py ├── matrix.py ├── text.py └── utils.py /.editorconfig: -------------------------------------------------------------------------------- 1 | 2 | # EditorConfig: http://EditorConfig.org 3 | 4 | root = true 5 | 6 | [*] 7 | indent_style = space 8 | trim_trailing_whitespace = true 9 | end_of_line = lf 10 | insert_final_newline = true 11 | max_line_length = 80 12 | indent_size = 4 13 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.egg-info 3 | env 4 | build 5 | dist 6 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014-2015 David McClure 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Textplot 2 | 3 | **_War and Peace_** (click to zoom) 4 | 5 | ![War and Peace](notes/mental-maps/networks/war-and-peace.jpg) 6 | 7 | **Textplot** is a little program that converts a document into a network of terms, with the goal of teasing out information about the high-level topic structure of the text. For each term: 8 | 9 | 1. Get the set of offsets in the document where the term appears. 10 | 11 | 1. Using [kernel density estimation](http://en.wikipedia.org/wiki/Kernel_density_estimation), compute a probability density function (PDF) that represents the word's distribution across the document. Eg, from _War and Peace_: 12 | 13 | ![War and Peace](notes/mental-maps/figures/war.png) 14 | 15 | 1. Compute a similarity score between the term's PDF and the PDFs of all other terms in the document, based on [Bray-Curtis](http://en.wikipedia.org/wiki/Bray%E2%80%93Curtis_dissimilarity) dissimilarity (the score is 1 minus the dissimilarity, so identical distributions score 1.0). This measures the extent to which two words appear in the same locations. 16 | 17 | 1. Sort this list of similarities in descending order to get a custom "topic" for the term. Skim off the top N words (usually 10-20) to get the strongest links. Here's "napoleon": 18 | 19 | ```bash 20 | [('napoleon', 1.0), 21 | ('war', 0.65319871313854128), 22 | ('military', 0.64782349297012154), 23 | ('men', 0.63958189887106576), 24 | ('order', 0.63636730075877446), 25 | ('general', 0.62621616907584432), 26 | ('russia', 0.62233286026418089), 27 | ('king', 0.61854160459241103), 28 | ('single', 0.61630514751638699), 29 | ('killed', 0.61262010905310182), 30 | ('peace', 0.60775702746632576), 31 | ('contrary', 0.60750138486684579), 32 | ('number', 0.59936009740377516), 33 | ('accompanied', 0.59748552019874168), 34 | ('clear', 0.59661288775164523), 35 | ('force', 0.59657370362505935), 36 | ('army', 0.59584331507492383), 37 | ('authority', 0.59523854206807647), 38 | ('troops', 0.59293965397478188), 39 | ('russian', 0.59077308177196441)] 40 | ``` 41 | 42 | 1. Shovel all of these links into a network and export a GML file. 43 | 44 | ## Generating graphs 45 | 46 | There are two ways to create graphs - you can use the `textplot` executable from the command line, or, if you want to tinker around with the underlying NetworkX graph instance, you can fire up a Python shell and use the `build_graph()` helper directly. 47 | 48 | Either way, first install Textplot. With PyPI: 49 | 50 | `pip install textplot` 51 | 52 | Or, clone the repo and install the package manually: 53 | 54 | ```bash 55 | pyvenv env 56 | . env/bin/activate 57 | pip install -r requirements.txt 58 | python setup.py install 59 | ``` 60 | 61 | ### From the command line 62 | 63 | Then, from the command line, generate graphs with: 64 | 65 | `textplot generate [IN_PATH] [OUT_PATH] [OPTIONS]` 66 | 67 | Where the input is a regular `.txt` file, and the output is a [`.gml`](http://en.wikipedia.org/wiki/Graph_Modelling_Language) file. So, if you're working with _War and Peace_: 68 | 69 | `textplot generate war-and-peace.txt war-and-peace.gml` 70 | 71 | The `generate` command takes these options: 72 | 73 | - **`--term_depth=1000` (int)** - The number of terms to include in the network. For now, Textplot takes the top N most frequent terms, after stopwords are removed. 74 | 75 | - **`--skim_depth=10` (int)** - The number of connections (edges) to skim off the top of the "topics" computed for each word.
76 | 77 | - **`--d_weights` (flag)** - By default, terms that appear in similar locations in the document will be connected by edges with "heavy" weights, the semantics expected by force-directed layout algorithms like Force Atlas 2 in Gephi. If this flag is passed, the weights will be inverted - use this if you want to do any kind of pathfinding analysis on the graph, where it's generally assumed that edge weights represent _distance_ or _cost_. 78 | 79 | - **`--bandwidth=2000` (int)** - The [bandwidth](http://en.wikipedia.org/wiki/Kernel_density_estimation#Bandwidth_selection) for the kernel density estimation. This controls the "smoothness" of the curve. 2000 is a sensible default for long novels, but bump it down if you're working with shorter texts. 80 | 81 | - **`--samples=1000` (int)** - The number of equally-spaced points on the X-axis where the kernel density is sampled. 1000 is almost always enough, unless you're working with a huge document. 82 | 83 | - **`--kernel=gaussian` (str)** - The kernel function. The [scikit-learn implementation](http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KernelDensity.html) also supports `tophat`, `epanechnikov`, `exponential`, `linear`, and `cosine`. 84 | 85 | ### From a Python shell 86 | 87 | Or, fire up a Python shell and import `build_graph()` directly: 88 | 89 | ```bash 90 | In [1]: from textplot.helpers import build_graph 91 | 92 | In [2]: g = build_graph('war-and-peace.txt') 93 | 94 | Tokenizing text... 95 | Extracted 573064 tokens 96 | 97 | Indexing terms: 98 | [################################] 124750/124750 - 00:00:06 99 | 100 | Generating graph: 101 | [################################] 500/500 - 00:00:03 102 | ``` 103 | 104 | `build_graph()` returns an instance of `textplot.graphs.Skimmer`, which gives access to an instance of `networkx.Graph`. Eg, to get degree centralities: 105 | 106 | ```bash 107 | In [3]: import networkx as nx 108 | In [4]: nx.degree_centrality(g.graph) 109 | ``` 110 | 111 | --- 112 | 113 | Textplot uses **[numpy](http://www.numpy.org)**, **[scipy](http://www.scipy.org)**, **[scikit-learn](http://scikit-learn.org)**, **[matplotlib](http://matplotlib.org)**, **[networkx](http://networkx.github.io)**, and **[clint](https://github.com/kennethreitz/clint)**. 114 | -------------------------------------------------------------------------------- /bin/textplot: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | 4 | import click 5 | 6 | from textplot.helpers import build_graph 7 | 8 | 9 | @click.group() 10 | def textplot(): 11 | pass 12 | 13 | 14 | @textplot.command() 15 | 16 | @click.argument('in_path', type=click.Path()) 17 | @click.argument('out_path', type=click.Path()) 18 | 19 | @click.option( 20 | '--term_depth', 21 | default=1000, 22 | help='The total number of terms in the network.' 23 | ) 24 | 25 | @click.option( 26 | '--skim_depth', 27 | default=10, 28 | help='The number of words each word is connected to in the network.' 29 | ) 30 | 31 | @click.option( 32 | '--d_weights', 33 | is_flag=True, 34 | help='If set, connect "close" terms with low edge weights.' 35 | ) 36 | 37 | @click.option( 38 | '--bandwidth', 39 | default=2000, 40 | help='The kernel bandwidth.' 41 | ) 42 | 43 | @click.option( 44 | '--samples', 45 | default=1000, 46 | help='The number of times the kernel density is sampled.'
47 | ) 48 | 49 | @click.option( 50 | '--kernel', 51 | default='gaussian', 52 | help='The kernel function.', 53 | type=click.Choice([ 54 | 'gaussian', 55 | 'tophat', 56 | 'epanechnikov', 57 | 'exponential', 58 | 'linear', 59 | 'cosine' 60 | ]) 61 | ) 62 | 63 | def generate(in_path, out_path, **kwargs): 64 | 65 | """ 66 | Convert a text into a GML file. 67 | """ 68 | 69 | g = build_graph(in_path, **kwargs) 70 | g.write_gml(out_path) 71 | 72 | 73 | if __name__ == '__main__': 74 | textplot() 75 | -------------------------------------------------------------------------------- /notes/mental-maps/figures/battle-v-dance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidmcclure/textplot/889b949a637d99097ecec44ed4bfee53b1964dee/notes/mental-maps/figures/battle-v-dance.png -------------------------------------------------------------------------------- /notes/mental-maps/figures/borodino-austerlitz.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidmcclure/textplot/889b949a637d99097ecec44ed4bfee53b1964dee/notes/mental-maps/figures/borodino-austerlitz.png -------------------------------------------------------------------------------- /notes/mental-maps/figures/borodino-v-austerlitz.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidmcclure/textplot/889b949a637d99097ecec44ed4bfee53b1964dee/notes/mental-maps/figures/borodino-v-austerlitz.png -------------------------------------------------------------------------------- /notes/mental-maps/figures/horse-histogram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidmcclure/textplot/889b949a637d99097ecec44ed4bfee53b1964dee/notes/mental-maps/figures/horse-histogram.png -------------------------------------------------------------------------------- /notes/mental-maps/figures/horse-kde.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidmcclure/textplot/889b949a637d99097ecec44ed4bfee53b1964dee/notes/mental-maps/figures/horse-kde.png -------------------------------------------------------------------------------- /notes/mental-maps/figures/horse-offsets.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidmcclure/textplot/889b949a637d99097ecec44ed4bfee53b1964dee/notes/mental-maps/figures/horse-offsets.png -------------------------------------------------------------------------------- /notes/mental-maps/figures/horse-v-galloped.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidmcclure/textplot/889b949a637d99097ecec44ed4bfee53b1964dee/notes/mental-maps/figures/horse-v-galloped.png -------------------------------------------------------------------------------- /notes/mental-maps/figures/horse-v-rode.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidmcclure/textplot/889b949a637d99097ecec44ed4bfee53b1964dee/notes/mental-maps/figures/horse-v-rode.png -------------------------------------------------------------------------------- /notes/mental-maps/figures/kernels.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/davidmcclure/textplot/889b949a637d99097ecec44ed4bfee53b1964dee/notes/mental-maps/figures/kernels.png -------------------------------------------------------------------------------- /notes/mental-maps/figures/peace.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidmcclure/textplot/889b949a637d99097ecec44ed4bfee53b1964dee/notes/mental-maps/figures/peace.png -------------------------------------------------------------------------------- /notes/mental-maps/figures/war-v-peace.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidmcclure/textplot/889b949a637d99097ecec44ed4bfee53b1964dee/notes/mental-maps/figures/war-v-peace.png -------------------------------------------------------------------------------- /notes/mental-maps/figures/war.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidmcclure/textplot/889b949a637d99097ecec44ed4bfee53b1964dee/notes/mental-maps/figures/war.png -------------------------------------------------------------------------------- /notes/mental-maps/index.md: -------------------------------------------------------------------------------- 1 | 2 | ## (Mental) maps of texts 3 | 4 | ![War and Peace](networks/war-and-peace.jpg) 5 | 6 | **_War and Peace_** (click to zoom) 7 | 8 | Earlier in the summer, I was thinking about the way that words distribute inside of long texts - the way they slosh around, ebb and flow, clump together in some parts but not others. Some words don't really do this at all - they're spaced evenly throughout the document, and their distribution doesn't say much about the overall structure of the text. This is certainly true for stopwords like "the" or "an," but it's also true for words that carry more semantic information but aren't really associated with any particular content matter. For example, think of words like "next" or "perhaps" - they're generic terms, free-agents that could be used in almost any context. 9 | 10 | Other words, though, have a really strong semantic focus - they occur unevenly, and they tend to hang together with other words that orbit around a shared topic. For example, think of a long novel like _War and Peace_, which contains dozens of different conceptual threads. There are battles, dances, hunts, meals, duels, salons, parlors - and, in the broadest sense, the "war" sections and the "peace" sections. Some words are really closely associated with some of these topics but not others. If you open to a random page and see words like "Natasha," "Sonya," "mother," "love," or "tender," it's a pretty good bet that you're in a peace-y section. But if you see words like "Napoleon," "war," "military," "general," or "order," it's probably a war section. Or, at a more granular level, if you see words like "historian" or "clock" or "inevitable," there's a good chance it's one of those pesky historiographic essays. 11 | 12 | To borrow Franco Moretti's term, I was looking for a way to "operationalize" these distributions - some kind of lightweight, flexible statistic that would capture the structure of the locations of a term inside a document, ideally in a way that would make it easy to compare it with with the locations of other words. 
I started poking around, and quickly discovered that if you know anything about statistics (I really don't, so take all of this with a grain of salt), there's a really simple and obvious way to do this - a [kernel density estimate](http://en.wikipedia.org/wiki/Kernel_density_estimation), which takes a collection of observed data points and works backward to approximate a probability density function that, if you sampled it the same number of times, would produce more or less the same set of data. 13 | 14 | Kernel density estimation (KDE) is really easy to reason about - unlike the math behind something like topic modeling, which gets complicated pretty fast, KDE is basically just simple arithmetic. Think of the text as a big X-axis, where each integer corresponds to a word position in the text - "novel time," as Matt Jockers calls it. So, for _War and Peace_, the text would stretch from the origin to the X-axis offset of 573,064, the number of words in the text. Then, any word can be plotted just by laying down ticks on the X-axis at each location where the word shows up in the document. For example, here's "horse" in _War and Peace_: 15 | 16 | !["Horse" instance offsets](figures/horse-offsets.png) 17 | 18 | An obvious first step is to create a simple histogram: 19 | 20 | ![Histogram of "horse"](figures/horse-histogram.png) 21 | 22 | A kernel density estimate is the same idea, except, instead of just chopping the X-axis up into a set of bins and counting the points, each point is represented as a "[kernel](http://en.wikipedia.org/wiki/Kernel_(statistics))" function. A kernel is just some kind of weighting function that models a decay in intensity around the point. At the very simplest, it could be something like the uniform kernel, which just converts the point into a rectangular region over the X-axis, but most applications use something smoother like the Epanechnikov or Gaussian functions. The important thing, though, is that the kernel transforms the point into a _range_ or _interval_ of significance, instead of just a one-dimensional dot. This is nice because it maps well onto basic intuitions about the "scope" of a word in a text. When you come across a word, _where_ exactly does it have significance? Definitely right there, where it appears, but not _just_ there - it also makes sense to think of a kind of "relevance" or "meaning energy" that dissipates around the word, slowly at first across the immediately surrounding words and then more quickly as the distance increases. 23 | 24 | Anyway, once all of the kernels are in place, estimating the density function is just a matter of stepping through each position on the X-axis and adding up the values of all the kernel functions at that particular location. This gives a composite curve that captures the overall distribution of the term. Here's "horse" again: 25 | 26 | ![Density estimate of "horse"](figures/horse-kde.png) 27 | 28 | This makes it possible to visually confirm the earlier intuitions about the groups of words that tend to hang together in the text. Here's the peace-y cluster from above: 29 | 30 | ![Peace-y terms](figures/peace.png) 31 | 32 | And the war-y cluster: 33 | 34 | ![War-y terms](figures/war.png) 35 | 36 | And all together, which shakes out the contours of the two general categories. When one goes up, the other goes down: 37 | 38 | ![War vs. Peace](figures/war-v-peace.png)
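In code, that kernel-summing arithmetic really is just a few lines. Here's a minimal sketch of the idea - not textplot's actual implementation (textplot hands the estimation off to scikit-learn's `KernelDensity`), and the offsets in the example are invented - just one Gaussian bump per occurrence, summed over a grid of sample points:

```python
# Minimal KDE sketch: one Gaussian bump per occurrence of a term, summed over
# a grid of sample points along the "novel time" axis. Offsets are made up.
import numpy as np

def kde(offsets, text_length, bandwidth=2000, samples=1000):
    xs = np.linspace(0, text_length, samples)
    # Evaluate a Gaussian centered on each offset at every sample point.
    bumps = np.exp(-0.5 * ((xs[None, :] - np.array(offsets)[:, None]) / bandwidth) ** 2)
    density = bumps.sum(axis=0)
    return density / density.sum()  # normalize so the samples sum to 1

# E.g., a term that appears at three (hypothetical) offsets in a 573,064-word text:
curve = kde([12000, 250000, 251500], 573064)
```

The `bandwidth` and `samples` defaults here just mirror the `--bandwidth` and `--samples` options described above.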
39 | 40 | ### "More like this" 41 | 42 | These are fun to look at, but the real payoff is that the density functions make it easy to compute a really precise, fine-grained similarity score that measures the extent to which any two words appear in the same locations in the text. Since the end result is just a regular probability density function, we can make use of any of the dozens of statistical tests that measure the closeness of two distributions (see [this paper](http://csis.pace.edu/ctappert/dps/d861-12/session4-p2.pdf) for a really good survey of the options). One of the simplest and most efficient ways to do this is just to measure the size of the geometric overlap between the two distributions. This gives a score between 0 and 1, where 0 would mean that the two words appear in completely different parts of the text, and 1 would mean that the words appear in _exactly_ the same places (ie, they're the same word). For example, how similar is "horse" to "rode"? 43 | 44 | !["horse" vs. "rode"](figures/horse-v-rode.png) 45 | 46 | Very close - their density functions have about an 80% overlap, which puts "rode" just a bit closer than "galloped," which weighs in at ~0.78: 47 | 48 | !["horse" vs. "galloped"](figures/horse-v-galloped.png) 49 | 50 | Or, at the opposite end of the spectrum, words that show up in very different parts of the document will have much less overlap, and the score will edge towards 0. For example, battles and dances don't have much to do with each other: 51 | 52 | !["battle" vs. "dance"](figures/battle-v-dance.png) 53 | 54 | This, then, points to an interesting next step - for any given word, you can compute its similarity score with _every other word in the text_, and then sort the results in descending order to create a kind of "more-like-this" list.
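For curves sampled on the same grid and normalized to sum to 1, that overlap is easy to compute - and under that normalization it works out to 1 minus the Bray-Curtis dissimilarity that textplot indexes. A rough sketch, reusing the hypothetical `kde()` curves from above (the `more_like_this()` helper is invented here for illustration, not textplot's API):

```python
# Overlap between two sampled, normalized density curves (0..1). With both
# curves summing to 1, overlap == 1 - Bray-Curtis dissimilarity.
import numpy as np

def overlap(pdf1, pdf2):
    return np.minimum(pdf1, pdf2).sum()

def more_like_this(densities, anchor, n=20):
    """Rank every term by how closely its curve overlaps the anchor's curve."""
    scores = {term: overlap(densities[anchor], pdf) for term, pdf in densities.items()}
    return sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:n]

# densities would map each term to its sampled curve, e.g.:
# densities = {'horse': kde(...), 'rode': kde(...), ...}
# more_like_this(densities, 'horse')
```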
For example, here are the twenty words that distribute most closely with "Napoleon," all clearly related to war, conquest, power, etc: 55 | 56 | ```bash 57 | [('napoleon', 1.0), 58 | ('war', 0.65319871313854128), 59 | ('military', 0.64782349297012154), 60 | ('men', 0.63958189887106576), 61 | ('order', 0.63636730075877446), 62 | ('general', 0.62621616907584432), 63 | ('russia', 0.62233286026418089), 64 | ('king', 0.61854160459241103), 65 | ('single', 0.61630514751638699), 66 | ('killed', 0.61262010905310182), 67 | ('peace', 0.60775702746632576), 68 | ('contrary', 0.60750138486684579), 69 | ('number', 0.59936009740377516), 70 | ('accompanied', 0.59748552019874168), 71 | ('clear', 0.59661288775164523), 72 | ('force', 0.59657370362505935), 73 | ('army', 0.59584331507492383), 74 | ('authority', 0.59523854206807647), 75 | ('troops', 0.59293965397478188), 76 | ('russian', 0.59077308177196441)] 77 | ``` 78 | 79 | Or, at the other end of the spectrum, "Natasha" sits atop a stack of very Natasha-esque words related to family, emotion, youth, and general peace-time happiness (with the exception of "sad," which, presumably, is the unhappy endings with Anatole and Andrei): 80 | 81 | ```bash 82 | [('natasha', 1.0), 83 | ('sonya', 0.70886263341693823), 84 | ('countess', 0.69992603393549424), 85 | ('mother', 0.69396076158543107), 86 | ('love', 0.69394361206264776), 87 | ('tender', 0.69022062349028213), 88 | ('family', 0.63830887117531232), 89 | ('marry', 0.63600169904982695), 90 | ('secret', 0.6352113995040839), 91 | ('happy', 0.63179263139217623), 92 | ('girl', 0.62577947223072128), 93 | ('flushed', 0.61694787819224595), 94 | ('rapturous', 0.61229277139972438), 95 | ('sad', 0.6121299034400407), 96 | ('happened', 0.60853750169005538), 97 | ('invited', 0.60431370654414285), 98 | ('parents', 0.60292426299430668), 99 | ('jumped', 0.59803596295531403), 100 | ('realized', 0.59801227498210729), 101 | ('lady', 0.596816756054939)] 102 | ``` 103 | 104 | By skimming off the strongest links at the top of the stack, you end up with a custom little "distribution topic" for the word, a community of siblings that intuitively hang together. It's sort of like really simple, "intra-document" form of topic modeling. 105 | 106 | ### Twisty little passages 107 | 108 | The cool thing about this, though, is that it makes it possible to traverse the internal topic structure of the document, instead of just sliding back and forth on the linear axis of words. For example, once you've computed the sibling community for "napoleon," you can then do the same thing for any of the other words in the stack. If you take the second word, for example - "war" - and compute _its_ sibling community, you'll see many of the same words again. But, since the distribution of "war" is a bit different, other terms will start to creep into view. Each time you do this, the semantic field will shift to center most closely on the anchoring word at the top of the stack. And, as you do this again and again, you start to traverse into completely different domains of meaning. The terms related to family under the "Natasha" topic - "mother," "marry," "love," "family" - can be followed into a more abstract cluster of words related to people in general - "relations," "act," "person," "people." Which, then, bleeds into Tolstoy's theory of history, which largely takes the form of _military_ history, which makes the hop over into the words about war - "military," "men," "general," and, of course, "napoleon." 
Each sibling community is like a room in a maze, and each of the words is like a door that leads into an adjacent room that occupies a similar but slightly different place in the overall organization of the document. 109 | 110 | This fascinates me because it _de-linearizes_ the text - which, I think, is closer to the form it takes when it's staged in the mind of a reader. Texts are one-dimensional lines, but we don't really think of texts as lines, or at least not _just_ as lines. We think of them as landscapes, diagrams, networks, maps - clusters of characters, scenes, ideas, emotional valences, and color palettes, all set in relation to one another and wired up in lots of different ways. Notions of "proximity" or "closeness" become divorced from the literal, X-axis positions of things in the document. In _War and Peace_, for example, I think of the battles at Borodino and Austerlitz as being very "close" to one another, in the sense that they're the two major military set pieces in the plot. In fact, though, they're actually very "distant" in terms of where they actually appear in the text - they're separated by about 300,000 words, and their density functions only have an overlap of ~0.32, meaning, essentially, that they _don't_ overlap with each other about 70% of the time: 111 | 112 | !["borodino" vs. "austerlitz"](figures/borodino-v-austerlitz.png) 113 | 114 | So, how to operationalize that "conceptual" closeness? It turns out that this can be captured really easily just by building out a comprehensive network that traces out all of the connections between all the words at once. The basic idea here - converting a text into a network - is an old one. Lots of projects have experimented with representing a text as a _social_ network, a set of relationships between characters who speak to one another or appear together in the same sections of the text. And lots of other projects have looked into different ways of representing _all_ the terms in a text, like I'm doing here. Back in 2011, a really interesting project called [TexTexture](http://textexture.com) devised a method for visualizing the relationships between words that appear within a 2- or 5-word radius in the document. As I'll show in a moment, though, I think there are some interesting advantages to using the density functions as the underlying statistic - the distributions tease out a kind of architectural "blueprint" of the document, which often maps onto the cognitive experience of the text in interesting ways. 115 | 116 | Anyway, once we've laid down all the piping to compute the little distribution topic for a word, it's easy to do this for _all_ of the words, and then shovel the strongest connections into the network.
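In sketch form, the "shoveling" step is just a loop over the terms, with each term wired to the strongest members of its distribution topic. This reuses the hypothetical `more_like_this()` helper from above, and `skim_depth` mirrors the `--skim_depth` option - an illustration of the idea, not textplot's actual `Skimmer` code:

```python
# Connect each term to the top of its "distribution topic", using networkx.
import networkx as nx

def build_network(densities, skim_depth=10):
    g = nx.Graph()
    for term in densities:
        for sibling, score in more_like_this(densities, term, n=skim_depth + 1):
            if sibling != term:
                # Heavier weight = closer. Store 1 - score instead if you want
                # weights to behave like distances (the --d_weights idea).
                g.add_edge(term, sibling, weight=score)
    return g

# With distance-style weights, nx.shortest_path(g, 'austerlitz', 'borodino',
# weight='weight') runs Dijkstra and recovers the single hop through 'battle'
# described below.
```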
For example, if we take the top 10 strongest links, the "napoleon" topic would result in these edges: 117 | 118 | ```bash 119 | 'napoleon' -> 'war' (0.65319871313854128) 120 | 'napoleon' -> 'military' (0.64782349297012154) 121 | 'napoleon' -> 'men' (0.63958189887106576) 122 | 'napoleon' -> 'order' (0.63636730075877446) 123 | 'napoleon' -> 'general' (0.62621616907584432) 124 | 'napoleon' -> 'russia' (0.62233286026418089) 125 | 'napoleon' -> 'king' (0.61854160459241103) 126 | 'napoleon' -> 'single' (0.61630514751638699) 127 | 'napoleon' -> 'killed' (0.61262010905310182) 128 | 'napoleon' -> 'peace' (0.60775702746632576) 129 | ``` 130 | 131 | Once this is in place, we get access to the whole scientific literature of graph-theoretic concepts, and the conceptual relationship between "austerlitz" and "borodino" falls out really easily - we can use Dijkstra's algorithm to get the shortest path between the two, which, unsurprisingly, makes just a single hop through the word "battle": 132 | 133 | ```bash 134 | 'austerlitz' -> 'battle' -> 'borodino' 135 | ``` 136 | 137 | With a path length of ~`1.12`, which puts "borodino" as the 17th closest word to "austerlitz" out of the 1000 most frequent words in the text, closer than **98%** of the list, even though they only co-occur about 30% of the time: 138 | 139 | ```bash 140 | [('austerlitz', 0), 141 | ('campaign', 0.65615563830253976), 142 | ('military', 0.66679539911035457), 143 | ('success', 0.67787007836939139), 144 | ('proposed', 0.67865808794484395), 145 | ('general', 0.68398944996815592), 146 | ('emperor', 0.68560221976349289), 147 | ('suggested', 0.68648086875769576), 148 | ('battle', 0.68844748754808149), 149 | ('war', 0.6976220338260869), 150 | ('kutuzov', 0.70172001357385272), 151 | ('men', 1.0728368565805544), 152 | ('army', 1.0815712054823732), 153 | ('russian', 1.1133724933839888), 154 | ('commander', 1.1138451527645024), 155 | ('sovereign', 1.1151396260899902), 156 | ('french', 1.1172586830800189), 157 | ('borodino', 1.1183098845396797), <--- 158 | ('chief', 1.1219820113396164), 159 | ('day', 1.1225800115684308)] 160 | ``` 161 | 162 | ### Mapping the maze 163 | 164 | This is useful as a confirmation that the network is capturing something real about the text. But it's sort of like stumbling through one little passage in the labyrinth with a torch, tracing out a single thread of connection in the document. What you really want is to be able to zoom back and see a bird's-eye view of the entire thing at once, to wrap your head around the complete set of relations that bind all of the words together. This is a perfect task job for any of the off-the-shelf network layout algorithms, which treat all of the nodes as "particles" that repel one another by default, but which are bound together by a set of attractive forces exerted by the edges that connect them. Force Atlas 2 in Gephi works well - _War and Peace_ unfolds into a huge, spindly triangle: 165 | 166 | **[Click to zoom](http://textplot.s3-website-us-west-1.amazonaws.com/#mental-maps/war-and-peace)** 167 | 168 | ![War and Peace](networks/war-and-peace.jpg) 169 | 170 | _(Click the bold links to auto-focus the zoomed image in a new tab.)_ 171 | 172 | **War** to the left, **peace** to the right, and **history** on top, between the two. Of course, the "on top" has no meaning in and of itself, since the orientation of the layout is random - here and elsewhere, I've rotated the final render to make it easy on the eyes. 
What does have meaning, though, is the _relative_ position of the words, the relationships between the regions - that history is "between" war and peace, in this case. 173 | 174 | This makes it possible to position different elements of text as they relate to the high-level categories - kind of like a big, nerdy, literary "where's Waldo." This can lead to some interesting insights, though - things that I'm not sure that I would have come up with on my own. For example, look at the huge amount of space between "**Napoleon**" and "**Bonaparte**," which I would have expected to hang together pretty closely. "Napoleon" sits along the top left shoulder of the triangle, along the gradient between "battle" and "history," in the middle of a **section related to military strategy and tactics** ("military," "plan," "campaign," "men," "group"). Whereas "Bonaparte" is way down at the bottom of the triangle, almost exactly in the middle of the gradient running between war and peace, just shy of a **cluster of words related to the aristocratic salon** ("Anna," "Pavlovna," "sitting," "laughing") and right next to "**company**," which has the perfect polysemy to bind the two sides together - _social_ company to the right, and the _military_ company to the left. The two names enact different roles in the text - "Napoleon" is the man himself, winning battles and participating in the abstract notion of history, and "Bonaparte" is the Russian imagination of the man, a name whispered at parties in Moscow and St. Petersburg. Pierre, meanwhile, shows up near the connection point with the history cluster, surrounded by words of **spiritual anxiety and questing** - "doubt," "soul," "time," "considered." Anatole is in the **furthest reaches of the peace section**, right next to "visitors" (he was one) and "daughters" (he eloped with one). **Rostov** and **Andrei** (Andrew, in the Garnett translation) are at the bottom center, right near "Bonaparte" in the bridge between war and peace. The **women and children**, meanwhile, are almost completely confined to the peace cluster - Natasha, Marya, Sonya, Anna, Helene, along with basically all words about or related to women - "lady," "girl," "mother," "countess," "daughter," etc. Women essentially _instantiate_ peace, and have very little interaction with history or war - it's almost as much _War and Women_ as _War and Peace_. 175 | 176 | Also, take a look at the gradients that run between the conceptual extremes - the means by which the different sections transmute into one another. For example, look again at the bottom center of the network, near "Bonaparte," right where war crosses over into peace. How is that transition actually accomplished? If you look closely, there's a cluster of terms right between the two related to the **body and physical contact** - "lips," "hand," "fingers," "touched," "eyes," "face," "shoulders," "arm," "foot," "legs." Which, it seems, are used to describe both the physicality of military life and the niceties of Russian high society - the embraces, clasps, arms over shoulders, pats on backs, etc. War becomes peace by way of the _body_, which is subject both to the violence of war and the sensuality of peace. Or, more broadly, look at the left and right sides of the triangle, the gradients running from **war to history** on the left and **peace to history** on the right. In both cases, these are also gradients from _concrete to general_, specific to abstract.
The individual women and children that represent the furthest extreme of the peace corner give way to a cluster of terms about **family in general** - "children," "wife," "husband," "family" - before rising up into the history cluster by way of "**life**" and "**live**." On the right side, terms related to the **specifics of battle** - "guns," "flank," "line," "borodino," "battle" - give way to Napoleon's cluster of words related to **strategy and tactics** - "plan," "military," "campaign," "strength," "number" - which then join the history section by way of "**direction**." It's a big diagram of the _idea_ of the text. 177 | 178 | Here's the _Odyssey_: 179 | 180 | ![Odyssey](networks/odyssey.jpg) 181 | 182 | Here, instead of war/peace, it's an opposition between land and sea, home and away. At the bottom is **Ithaca, Penelope, the suitors, the world of people, civilization, conflict**; at the top, the world of the "raft," the home away from home, the natural world, the **physical and metaphorical space between Troy and Ithaca** - "waves," "sea," "wind," "island," "cave," "shore," the cyclops, the sirens. Compare this with the architecture of _Walden_, which takes the form of a long, narrow pillar of words, which also span a gradient between land/civilization and water/wilderness: 183 | 184 | ![Walden](networks/walden.jpg) 185 | 186 | The world of **Concord** is at the bottom - "civilization," "enterprise," "comforts," "luxury," "dollars," "fashion." As you move up, this gives way to Thoreau's narrative about his attempt to build **his own, simplified version of this world** - "roof," "built," "dwelling," "simple." Which in turn bleeds into the world of his **day-to-day existence at Walden**, anchored around the word "**day**" - "hoeing" the field, "planting beans," "singing" to himself, "sitting", "thinking." Then the network crosses over completely into the **world of the pond** - "water," "surface," "depth," "waves," and "walden." Remarkably, at the very top of the network, along with "lake" and "shore," is "**_boat_**," which is eerily similar to the "**raft**" on top of the _Odyssey_ - the most extreme removal from human civilization, the smallest outpost of habitable space. Both enact the same dialectic - between a world of men on land, and a world of solitude out in the midst of some kind of watery wilderness. 187 | 188 | The _Divine Comedy_ looks almost exactly like _Walden_, except Concord/Walden is replaced with **hell** / **heaven**, with, fittingly enough, "**christ**" perched on top of the whole thing: 189 | 190 | ![Divine Comedy](networks/divine-comedy.jpg) 191 | 192 | Shakespeare's _The Winter's Tale_ unwinds into a big, crisply-differentiated stack that runs from **Leontes' psychotic jealousy in Sicily** at the bottom ("tyrant," "accusation," "traitor") to the **pastoral joy of Perdita's life in Bohemia**: 193 | 194 | ![The Winter's Tale](networks/winters-tale.jpg) 195 | 196 | _Moby-Dick_ is roughly a big square - **New Bedford and the world of land** on the bottom ("room," "bed," "house," "landlord," "island"), **Ahab, Starbuck, and Moby Dick** to the right, the **history of whaling** on the left (whales in the abstract), and the **hunting of whales** at the top (whales in the flesh). Amusingly, it kind of looks like a big whale: 197 | 198 | ![Moby-Dick](networks/moby-dick.jpg) 199 | 200 | It's kind of like reading literary x-rays (hopefully not tea leaves).
Here's _Notes from Underground_, which, like the text, splits along the center into two sections - the existentialist rant of "**Underground**" on the left, the adventures with **Zverkov** and **Liza** from "Apropos of the Wet Snow" on the right: 201 | 202 | ![Notes from Underground](networks/notes-from-underground.jpg) 203 | 204 | Here's the _Origin of Species_, which I've only read in small parts. But, it's actually interesting to do this with a text that you _don't_ know, and see what you can infer about the organization of the document. _Origin of Species_ gives a lot of structure to chew on: 205 | 206 | ![Origin of Species](networks/origin-of-species.jpg) 207 | 208 | ### Failures, null results 209 | 210 | The big weakness with this, of course, is that it doesn't work nearly as well with texts that don't naturally split up into these kinds of cleanly-defined sections. For example, take a look at _Leaves of Grass_: 211 | 212 | ![Leaves of Grass](networks/leaves-of-grass.jpg) 213 | 214 | It's more scrambled, less differentiated, less obviously "accurate" than the tidy triangle of _War and Peace_ or the cosmological pillar of the _Divine Comedy_. If you squint at it for a few minutes, it starts to assemble into some recognizable constellations of meaning, but it's much more of an interpretive exertion to make a case for how the lines should be drawn. Two regions of meaning are fairly clear - on top, a section about **war** ("soldiers," "battle," "camp," "armies," "war"), and, at the bottom left, a big, diffuse shoulder of the network related to the **body, sensuality, sexuality** - "neck," "fingers," "limbs," "flesh," "kiss," "touch," "hand," "happy." The right side of the network doesn't hold together as well, but, if this post weren't already much too long, I'd argue that lots of things on the right side converge on a shared preoccupation about _time_ - "**eidolons**," from the inscription of the same name about how the actions and accomplishments of people are ground into shadows over time; "**pioneers**," from "Pioneers! O Pioneers," one of the triumphalist narratives about the inevitability of American expansion in the west; and a cluster of terms related to **geopolitics and deep time** - "universe," "nation," "modern," "centuries," "globe," "liberty," "kings," "America," "mighty." This is Whitman looking back at Europe and forward to what he sees as an American future, both in a political and cultural sense but also in terms of his own relationship, as a poet, to literary and intellectual tradition. It's Whitman thinking about how things change over time. (If you buy this, the war/body/time triad starts to look interestingly similar to war/peace/history). 215 | 216 | But, this is much more of a stretch - it's muddled, less legible. In one way, this probably just reflects something true about _Leaves of Grass_ - it's more finely chopped, more heterogeneous, more evenly mixed than something like _War and Peace_. But I think this is also exposing a weakness in the technique - my intuition is that there are, actually, some really distinct topic clusters that should be surfaced out of _Leaves of Grass_, and I wonder if they're getting buried by the simplistic way that I'm picking the words that get included in the network. Right now, I just take the top X most frequent words (excluding stopwords), and compute the relations among just those words.
The problem with this, I think, is that it doesn't do anything to filter out words that are very evenly distributed - words that aren't "typical" of any particular topic. Which, since they're similar to everything, act like binding agents that lock down the network and prevent it from differentiating into a more useful map of the document. This happens to a lesser degree in all of the networks, which tend to have a big clump of words in the center that don't really get pulled out towards the more conceptually focused regions at the edges. Or, to borrow again from the terminology of topic modeling, I wonder if there's a way to automatically pick the words that anchor the most "interpretable" or "coherent" distribution topics - the terms that serve as the most reliable markers for whether or not a given topic is "active" at some given point in the text. In _War and Peace_, for example, "**battle**" should score very highly, since it's way off at the edge of the war region, but words like "**make**" or "**began**" should get low scores, since they end up right in the middle of the network and don't glom onto any particular thread of meaning in the text. You want the "clumpy" words - terms that appear very frequently, but also very unevenly.
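For what it's worth, here's one speculative way that "clumpiness" could be operationalized - a sketch only, reusing the hypothetical normalized curves from the earlier snippets, and not something textplot currently does:

```python
# Score a term by how far its sampled density curve departs from a perfectly
# even distribution, weighted by how often the term appears.
import numpy as np

def clumpiness(pdf, count):
    uniform = np.full_like(pdf, 1 / len(pdf))        # what a perfectly even term would look like
    unevenness = 0.5 * np.abs(pdf - uniform).sum()   # total variation distance, 0..1
    return count * unevenness

# Ranking candidate terms by clumpiness(densities[term], counts[term]) instead
# of raw frequency might keep "battle" and drop fillers like "make" or "began".
```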
217 | 218 | Anyway, this was fun, but I'm still just stabbing around in the dark, for the most part. The visualizations are interesting as a type of critical "deformance," to borrow a word from Jerry McGann, Lisa Samuels, and Steve Ramsay - derivative texts, the products of an algorithmic sieve that can confirm or challenge intuitions about the originals. In the long run, though, I'm actually more interested in the question of whether this kind of network information could be tapped to get a more general understanding of the "shapes" of texts in the aggregate, at the scale of hundreds or thousands of documents instead of just a handful. Would it be possible to move beyond the visual close reading, fun as it might be, and find a way to _classify_ and _compare_ the networks, in the same way that something like the Bray-Curtis dissimilarity makes it possible to operationalize the distributions of the individual words? 219 | 220 | For example, the _Divine Comedy_ and _Walden_ just _look_ almost identical to the eye - but how to capture that quantitatively? What exactly is the underlying similarity? Is it something real, or is it just a coincidence? Could it be boiled out as some kind of portable, lightweight, network-theoretical measurement? Maybe some notion of the "width" or "breadth" of the text - the maximum distance between nodes, the extent to which the text traverses across a semantic space without looping back on itself? If this is computable - what other texts are "wide" or "long" in this way? Do they cohere at a literary level? Do they all tend to be "oppositional" or "dialectical" texts like the _Divine Comedy_ and _Walden_ - heaven/hell, nature/wilderness, land/water? Maybe they're all "travelogues," in a conceptual sense - narratives about a continuous, gradual movement from one pole to another, which prevents the network from folding back on itself and wiring up a shorter distance between the most removed terms? What other new taxonomies and measurements of "shape" might be possible? Maybe some measure of "modularity" or "clumpiness," the extent to which a text separates out into discrete, self-contained little circuits of meaning? How many shapes are there? Does every text have a unique shape, or are they all variations on a small set of archetypes (the line, the triangle, the loop, etc.)? If patterns do exist, and if they could be quantified - (how) do they correlate with authors, periods, regions, genres, gender, nationality, identity? 221 | -------------------------------------------------------------------------------- /notes/mental-maps/networks/divine-comedy.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidmcclure/textplot/889b949a637d99097ecec44ed4bfee53b1964dee/notes/mental-maps/networks/divine-comedy.jpg -------------------------------------------------------------------------------- /notes/mental-maps/networks/leaves-of-grass.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidmcclure/textplot/889b949a637d99097ecec44ed4bfee53b1964dee/notes/mental-maps/networks/leaves-of-grass.jpg -------------------------------------------------------------------------------- /notes/mental-maps/networks/moby-dick.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidmcclure/textplot/889b949a637d99097ecec44ed4bfee53b1964dee/notes/mental-maps/networks/moby-dick.jpg -------------------------------------------------------------------------------- /notes/mental-maps/networks/notes-from-underground.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidmcclure/textplot/889b949a637d99097ecec44ed4bfee53b1964dee/notes/mental-maps/networks/notes-from-underground.jpg -------------------------------------------------------------------------------- /notes/mental-maps/networks/odyssey.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidmcclure/textplot/889b949a637d99097ecec44ed4bfee53b1964dee/notes/mental-maps/networks/odyssey.jpg -------------------------------------------------------------------------------- /notes/mental-maps/networks/origin-of-species.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidmcclure/textplot/889b949a637d99097ecec44ed4bfee53b1964dee/notes/mental-maps/networks/origin-of-species.jpg -------------------------------------------------------------------------------- /notes/mental-maps/networks/walden.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidmcclure/textplot/889b949a637d99097ecec44ed4bfee53b1964dee/notes/mental-maps/networks/walden.jpg -------------------------------------------------------------------------------- /notes/mental-maps/networks/war-and-peace.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidmcclure/textplot/889b949a637d99097ecec44ed4bfee53b1964dee/notes/mental-maps/networks/war-and-peace.jpg -------------------------------------------------------------------------------- /notes/mental-maps/networks/winters-tale.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidmcclure/textplot/889b949a637d99097ecec44ed4bfee53b1964dee/notes/mental-maps/networks/winters-tale.jpg -------------------------------------------------------------------------------- /requirements.txt:
-------------------------------------------------------------------------------- 1 | ipython 2 | scikit-learn 3 | numpy 4 | scipy 5 | matplotlib 6 | nltk 7 | networkx 8 | clint 9 | pytest 10 | click 11 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from setuptools import setup, find_packages 4 | 5 | 6 | setup( 7 | 8 | name='textplot', 9 | version='0.1.1', 10 | description='(Mental) maps of texts.', 11 | url='https://github.com/davidmcclure/textplot', 12 | license='MIT', 13 | author='David McClure', 14 | author_email='davidwilliammcclure@gmail.com', 15 | scripts=['bin/textplot'], 16 | packages=find_packages(), 17 | package_data={'textplot': ['data/*']}, 18 | 19 | install_requires=[ 20 | 'scikit-learn', 21 | 'numpy', 22 | 'scipy', 23 | 'matplotlib', 24 | 'nltk', 25 | 'networkx', 26 | 'clint', 27 | 'pytest', 28 | 'click', 29 | ] 30 | 31 | ) 32 | -------------------------------------------------------------------------------- /test/matrix/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidmcclure/textplot/889b949a637d99097ecec44ed4bfee53b1964dee/test/matrix/__init__.py -------------------------------------------------------------------------------- /test/matrix/test_anchored_pairs.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from textplot.text import Text 4 | from textplot.matrix import Matrix 5 | 6 | 7 | def test_anchored_pairs(): 8 | 9 | """ 10 | For a given anchor term, anchored_pairs() should return an ordered map of 11 | term -> distance for all other indexed terms. 12 | """ 13 | 14 | t = Text('aa bb cc dd') 15 | m = Matrix() 16 | 17 | m.index(t) 18 | 19 | pairs = m.anchored_pairs('aa') 20 | 21 | assert list(pairs.keys()) == ['bb', 'cc', 'dd'] 22 | assert pairs['bb'] > pairs['cc'] > pairs['dd'] 23 | -------------------------------------------------------------------------------- /test/matrix/test_index.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from textplot.text import Text 4 | from textplot.matrix import Matrix 5 | 6 | 7 | def test_index(): 8 | 9 | """ 10 | index() should index the Bray-Curtis distances between terms. 11 | """ 12 | 13 | t = Text('aa bb cc') 14 | m = Matrix() 15 | 16 | m.index(t) 17 | 18 | assert m.get_pair('aa', 'bb') == t.score_braycurtis('aa', 'bb') 19 | assert m.get_pair('aa', 'cc') == t.score_braycurtis('aa', 'cc') 20 | assert m.get_pair('bb', 'cc') == t.score_braycurtis('bb', 'cc') 21 | 22 | 23 | def test_term_subset(): 24 | 25 | """ 26 | When a subset of terms is passed, just those terms should be indexed. 27 | """ 28 | 29 | t = Text('aa bb cc') 30 | m = Matrix() 31 | 32 | m.index(t, ['aa', 'bb']) 33 | 34 | # Should index 'aa' and 'bb'. 35 | assert m.get_pair('aa', 'bb') == t.score_braycurtis('aa', 'bb') 36 | 37 | # Should ignore 'cc'. 38 | assert not m.get_pair('aa', 'cc') 39 | assert not m.get_pair('bb', 'cc') 40 | -------------------------------------------------------------------------------- /test/matrix/test_set_get.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from textplot.matrix import Matrix 4 | 5 | 6 | def test_set_pair(): 7 | 8 | """ 9 | set_pair() should set the value under an order-independent key. 
10 | """ 11 | 12 | m = Matrix() 13 | m.set_pair('a', 'b', 1) 14 | 15 | assert m.get_pair('a', 'b') == 1 16 | assert m.get_pair('b', 'a') == 1 17 | 18 | 19 | def test_update_key_set(): 20 | 21 | """ 22 | Keys should be added to a set of stored keys. 23 | """ 24 | 25 | m = Matrix() 26 | m.set_pair('a', 'b', 1) 27 | m.set_pair('a', 'c', 2) 28 | 29 | assert m.keys == set(['a', 'b', 'c']) 30 | 31 | 32 | def test_missing_key(): 33 | 34 | """ 35 | If an unindexed key pair is passed, return None. 36 | """ 37 | 38 | m = Matrix() 39 | m.set_pair('a', 'b', 1) 40 | 41 | assert m.get_pair('a', 'c') == None 42 | -------------------------------------------------------------------------------- /test/text/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidmcclure/textplot/889b949a637d99097ecec44ed4bfee53b1964dee/test/text/__init__.py -------------------------------------------------------------------------------- /test/text/fixtures/stopwords.txt: -------------------------------------------------------------------------------- 1 | sa 2 | sb 3 | sc 4 | -------------------------------------------------------------------------------- /test/text/test_load_stopwords.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import pkgutil 4 | import os 5 | 6 | from textplot.text import Text 7 | 8 | 9 | def test_default_file(): 10 | 11 | """ 12 | When no path is passed to load_stopwords(), the default file in the 13 | textplot module should be loaded. 14 | """ 15 | 16 | defaults = set( 17 | pkgutil 18 | .get_data('textplot', 'data/stopwords.txt') 19 | .decode('utf8') 20 | .splitlines() 21 | ) 22 | 23 | t = Text('test') 24 | 25 | assert t.stopwords == defaults 26 | 27 | 28 | def test_custom_file(): 29 | 30 | """ 31 | Load a custom file, when a path is passed. 32 | """ 33 | 34 | path = os.path.join( 35 | os.path.dirname(__file__), 36 | 'fixtures/stopwords.txt' 37 | ) 38 | 39 | t = Text('test', stopwords=path) 40 | 41 | assert t.stopwords == set(['sa', 'sb', 'sc']) 42 | -------------------------------------------------------------------------------- /test/text/test_most_frequent_terms.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from textplot.text import Text 4 | from collections import OrderedDict 5 | 6 | 7 | def test_most_frequent_terms(): 8 | 9 | """ 10 | most_frequent_terms() should return the N most frequent terms. 11 | """ 12 | 13 | t = Text('aa bb bb cc cc cc') 14 | 15 | # Top 2 words are 'cc' and 'bb' 16 | assert t.most_frequent_terms(2) == set(['cc', 'bb']) 17 | 18 | 19 | def test_merge_smallest_bucket(): 20 | 21 | """ 22 | Say 1000 gets passed as the depth, and the 1000th term in the term-counts 23 | dictionary has a count of 10. But, there are 20 other terms that also show 24 | up 10 times in the text. In this case, all of the terms in this smallest 25 | bucket should be included, so as not to arbitrarily omit words that appear 26 | with the same frequency as words that do get included. 
27 | """ 28 | 29 | t = Text('aa bb bb cc cc dd dd dd') 30 | 31 | # Top 2 words are 'cc' and 'bb' 32 | assert t.most_frequent_terms(2) == set(['dd', 'cc', 'bb']) 33 | -------------------------------------------------------------------------------- /test/text/test_term_count_buckets.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from textplot.text import Text 4 | from collections import OrderedDict 5 | 6 | 7 | def test_term_count_buckets(): 8 | 9 | """ 10 | term_count_buckets() should map integer counts to the list of terms in the 11 | text that appear that many times. 12 | """ 13 | 14 | t = Text('aa bb bb cc cc dd dd dd') 15 | 16 | assert t.term_count_buckets() == { 17 | 1: ['aa'], 18 | 2: ['bb', 'cc'], 19 | 3: ['dd'] 20 | } 21 | -------------------------------------------------------------------------------- /test/text/test_term_counts.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from textplot.text import Text 4 | from collections import OrderedDict 5 | 6 | 7 | def test_term_counts(): 8 | 9 | """ 10 | term_counts() should return a map of term -> count. 11 | """ 12 | 13 | t = Text('aa bb bb cc cc cc') 14 | 15 | assert t.term_counts() == OrderedDict([ 16 | ('cc', 3), 17 | ('bb', 2), 18 | ('aa', 1) 19 | ]) 20 | -------------------------------------------------------------------------------- /test/text/test_tokenize.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from textplot.text import Text 4 | 5 | 6 | def test_set_tokens(): 7 | 8 | """ 9 | tokenize() should record individual tokens. 10 | """ 11 | 12 | t = Text('aa bb cc') 13 | 14 | assert t.tokens[0]['unstemmed'] == 'aa' 15 | assert t.tokens[1]['unstemmed'] == 'bb' 16 | assert t.tokens[2]['unstemmed'] == 'cc' 17 | assert len(t.tokens) == 3 18 | 19 | 20 | def test_set_term_offsets(): 21 | 22 | """ 23 | During tokenization, store map of token -> offsets positions. 24 | """ 25 | 26 | t = Text('aa bb aa bb') 27 | 28 | assert t.terms['aa'] == [0, 2] 29 | assert t.terms['bb'] == [1, 3] 30 | 31 | 32 | def test_ignore_stopwords(): 33 | 34 | """ 35 | Stopwords should be represented as None in the token list. 36 | """ 37 | 38 | t = Text('aa the bb an cc') 39 | 40 | assert t.tokens[0]['unstemmed'] == 'aa' 41 | assert t.tokens[1] == None 42 | assert t.tokens[2]['unstemmed'] == 'bb' 43 | assert t.tokens[3] == None 44 | assert t.tokens[4]['unstemmed'] == 'cc' 45 | assert len(t.tokens) == 5 46 | -------------------------------------------------------------------------------- /test/text/test_unstem.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from textplot.text import Text 4 | from nltk.stem import PorterStemmer 5 | 6 | 7 | def test_unstem(): 8 | 9 | """ 10 | Given a word stem, unstem() should return the most frequently- occurring 11 | unstemmed variant in the text. 
12 | """ 13 | 14 | # cat > cats 15 | t = Text('cat cat cats') 16 | assert t.unstem('cat') == 'cat' 17 | 18 | # cats > cat 19 | t = Text('cat cat cats cats cats') 20 | assert t.unstem('cat') == 'cats' 21 | -------------------------------------------------------------------------------- /test/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidmcclure/textplot/889b949a637d99097ecec44ed4bfee53b1964dee/test/utils/__init__.py -------------------------------------------------------------------------------- /test/utils/test_sort_dict.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from textplot.utils import sort_dict 4 | from collections import OrderedDict 5 | 6 | 7 | def test_descending(): 8 | 9 | """ 10 | sort_dict() should sort an ordered dictionary in descending order. 11 | """ 12 | 13 | d = OrderedDict([ 14 | ('a', 1), 15 | ('b', 2), 16 | ('c', 3) 17 | ]) 18 | 19 | desc = sort_dict(d) 20 | 21 | assert list(desc.items()) == [ 22 | ('c', 3), 23 | ('b', 2), 24 | ('a', 1) 25 | ] 26 | 27 | 28 | def test_ascending(): 29 | 30 | """ 31 | When desc=False is passed, sort in ascending order. 32 | """ 33 | 34 | d = OrderedDict([ 35 | ('c', 3), 36 | ('b', 2), 37 | ('a', 1) 38 | ]) 39 | 40 | asc = sort_dict(d, desc=False) 41 | 42 | assert list(asc.items()) == [ 43 | ('a', 1), 44 | ('b', 2), 45 | ('c', 3) 46 | ] 47 | -------------------------------------------------------------------------------- /test/utils/test_tokenize.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from textplot.utils import tokenize 4 | 5 | 6 | def assert_abc(text): 7 | 8 | """ 9 | Assert tokens aa/bb/cc. 10 | 11 | Args: 12 | text (str): A raw text string. 13 | """ 14 | 15 | tokens = tokenize(text) 16 | 17 | aa = next(tokens) 18 | assert aa['stemmed'] == 'aa' 19 | assert aa['unstemmed'] == 'aa' 20 | assert aa['offset'] == 0 21 | 22 | bb = next(tokens) 23 | assert bb['stemmed'] == 'bb' 24 | assert bb['unstemmed'] == 'bb' 25 | assert bb['offset'] == 1 26 | 27 | cc = next(tokens) 28 | assert cc['stemmed'] == 'cc' 29 | assert cc['unstemmed'] == 'cc' 30 | assert cc['offset'] == 2 31 | 32 | 33 | def test_tokenize(): 34 | 35 | """ 36 | tokenize() should yield token dicts from a string. 37 | """ 38 | 39 | assert_abc('aa bb cc') 40 | 41 | 42 | def test_ignore_non_letters(): 43 | 44 | """ 45 | tokenize() should ignore non [a-z] characters. 46 | """ 47 | 48 | assert_abc('aa. 12 bb? 34 cc!') 49 | 50 | 51 | def test_stem(): 52 | 53 | """ 54 | Stemm-able tokens should be stemmed. 55 | """ 56 | 57 | text = 'happy lovely days' 58 | 59 | tokens = tokenize(text) 60 | 61 | t1 = next(tokens) 62 | assert t1['stemmed'] == 'happi' 63 | assert t1['unstemmed'] == 'happy' 64 | assert t1['offset'] == 0 65 | 66 | t2 = next(tokens) 67 | assert t2['stemmed'] == 'love' 68 | assert t2['unstemmed'] == 'lovely' 69 | assert t2['offset'] == 1 70 | 71 | t3 = next(tokens) 72 | assert t3['stemmed'] == 'day' 73 | assert t3['unstemmed'] == 'days' 74 | assert t3['offset'] == 2 75 | 76 | 77 | def test_ignore_case(): 78 | 79 | """ 80 | Tokens should be downcased. 
81 | """ 82 | 83 | text = 'One TWO ThReE' 84 | 85 | tokens = tokenize(text) 86 | 87 | t1 = next(tokens) 88 | assert t1['stemmed'] == 'one' 89 | assert t1['unstemmed'] == 'one' 90 | assert t1['offset'] == 0 91 | 92 | t2 = next(tokens) 93 | assert t2['stemmed'] == 'two' 94 | assert t2['unstemmed'] == 'two' 95 | assert t2['offset'] == 1 96 | 97 | t2 = next(tokens) 98 | assert t2['stemmed'] == 'three' 99 | assert t2['unstemmed'] == 'three' 100 | assert t2['offset'] == 2 101 | -------------------------------------------------------------------------------- /test/utils/test_window.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from textplot.utils import window 4 | 5 | 6 | def test_window(): 7 | 8 | """ 9 | window() should generate a sliding window over an iterable. 10 | """ 11 | 12 | itr = [1, 2, 3, 4, 5, 6] 13 | w = window(itr, 3) 14 | 15 | assert next(w) == (1, 2, 3) 16 | assert next(w) == (2, 3, 4) 17 | assert next(w) == (3, 4, 5) 18 | assert next(w) == (4, 5, 6) 19 | -------------------------------------------------------------------------------- /textplot/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidmcclure/textplot/889b949a637d99097ecec44ed4bfee53b1964dee/textplot/__init__.py -------------------------------------------------------------------------------- /textplot/data/stopwords.txt: -------------------------------------------------------------------------------- 1 | a 2 | a's 3 | able 4 | about 5 | above 6 | according 7 | accordingly 8 | across 9 | actually 10 | after 11 | afterwards 12 | again 13 | against 14 | ain't 15 | all 16 | allow 17 | allows 18 | almost 19 | alone 20 | along 21 | already 22 | also 23 | although 24 | always 25 | am 26 | among 27 | amongst 28 | an 29 | and 30 | another 31 | any 32 | anybody 33 | anyhow 34 | anyone 35 | anything 36 | anyway 37 | anyways 38 | anywhere 39 | apart 40 | appear 41 | appreciate 42 | appropriate 43 | are 44 | aren't 45 | around 46 | as 47 | aside 48 | ask 49 | asking 50 | associated 51 | at 52 | available 53 | away 54 | awfully 55 | b 56 | be 57 | became 58 | because 59 | become 60 | becomes 61 | becoming 62 | been 63 | before 64 | beforehand 65 | behind 66 | being 67 | believe 68 | below 69 | beside 70 | besides 71 | best 72 | better 73 | between 74 | beyond 75 | both 76 | brief 77 | but 78 | by 79 | c 80 | c'mon 81 | c's 82 | came 83 | can 84 | can't 85 | cannot 86 | cant 87 | cause 88 | causes 89 | certain 90 | certainly 91 | changes 92 | clearly 93 | co 94 | com 95 | come 96 | comes 97 | concerning 98 | consequently 99 | consider 100 | considering 101 | contain 102 | containing 103 | contains 104 | corresponding 105 | could 106 | couldn't 107 | course 108 | currently 109 | d 110 | definitely 111 | described 112 | despite 113 | did 114 | didn't 115 | different 116 | do 117 | does 118 | doesn't 119 | doing 120 | don't 121 | done 122 | down 123 | downwards 124 | during 125 | e 126 | each 127 | edu 128 | eg 129 | eight 130 | either 131 | else 132 | elsewhere 133 | enough 134 | entirely 135 | especially 136 | et 137 | etc 138 | even 139 | ever 140 | every 141 | everybody 142 | everyone 143 | everything 144 | everywhere 145 | ex 146 | exactly 147 | example 148 | except 149 | f 150 | far 151 | few 152 | fifth 153 | first 154 | five 155 | followed 156 | following 157 | follows 158 | for 159 | former 160 | formerly 161 | forth 162 | four 163 | from 164 | further 165 | furthermore 166 | g 167 | get 168 | gets 
169 | getting 170 | given 171 | gives 172 | go 173 | goes 174 | going 175 | gone 176 | got 177 | gotten 178 | greetings 179 | h 180 | had 181 | hadn't 182 | happens 183 | hardly 184 | has 185 | hasn't 186 | have 187 | haven't 188 | having 189 | he 190 | he's 191 | hello 192 | help 193 | hence 194 | her 195 | here 196 | here's 197 | hereafter 198 | hereby 199 | herein 200 | hereupon 201 | hers 202 | herself 203 | hi 204 | him 205 | himself 206 | his 207 | hither 208 | hopefully 209 | how 210 | howbeit 211 | however 212 | i 213 | i'd 214 | i'll 215 | i'm 216 | i've 217 | ie 218 | if 219 | ignored 220 | immediate 221 | in 222 | inasmuch 223 | inc 224 | indeed 225 | indicate 226 | indicated 227 | indicates 228 | inner 229 | insofar 230 | instead 231 | into 232 | inward 233 | is 234 | isn't 235 | it 236 | it'd 237 | it'll 238 | it's 239 | its 240 | itself 241 | j 242 | just 243 | k 244 | keep 245 | keeps 246 | kept 247 | know 248 | knows 249 | known 250 | l 251 | last 252 | lately 253 | later 254 | latter 255 | latterly 256 | least 257 | less 258 | lest 259 | let 260 | let's 261 | like 262 | liked 263 | likely 264 | little 265 | look 266 | looking 267 | looks 268 | ltd 269 | m 270 | mainly 271 | many 272 | may 273 | maybe 274 | me 275 | mean 276 | meanwhile 277 | merely 278 | might 279 | more 280 | moreover 281 | most 282 | mostly 283 | much 284 | must 285 | my 286 | myself 287 | n 288 | name 289 | namely 290 | nd 291 | near 292 | nearly 293 | necessary 294 | need 295 | needs 296 | neither 297 | never 298 | nevertheless 299 | new 300 | next 301 | nine 302 | no 303 | nobody 304 | non 305 | none 306 | noone 307 | nor 308 | normally 309 | not 310 | nothing 311 | novel 312 | now 313 | nowhere 314 | o 315 | obviously 316 | of 317 | off 318 | often 319 | oh 320 | ok 321 | okay 322 | old 323 | on 324 | once 325 | one 326 | ones 327 | only 328 | onto 329 | or 330 | other 331 | others 332 | otherwise 333 | ought 334 | our 335 | ours 336 | ourselves 337 | out 338 | outside 339 | over 340 | overall 341 | own 342 | p 343 | particular 344 | particularly 345 | per 346 | perhaps 347 | placed 348 | please 349 | plus 350 | possible 351 | presumably 352 | probably 353 | provides 354 | q 355 | que 356 | quite 357 | qv 358 | r 359 | rather 360 | rd 361 | re 362 | really 363 | reasonably 364 | regarding 365 | regardless 366 | regards 367 | relatively 368 | respectively 369 | right 370 | s 371 | said 372 | same 373 | saw 374 | say 375 | saying 376 | says 377 | second 378 | secondly 379 | see 380 | seeing 381 | seem 382 | seemed 383 | seeming 384 | seems 385 | seen 386 | self 387 | selves 388 | sensible 389 | sent 390 | serious 391 | seriously 392 | seven 393 | several 394 | shall 395 | she 396 | should 397 | shouldn't 398 | since 399 | six 400 | so 401 | some 402 | somebody 403 | somehow 404 | someone 405 | something 406 | sometime 407 | sometimes 408 | somewhat 409 | somewhere 410 | soon 411 | sorry 412 | specified 413 | specify 414 | specifying 415 | still 416 | sub 417 | such 418 | sup 419 | sure 420 | t 421 | t's 422 | take 423 | taken 424 | tell 425 | tends 426 | th 427 | than 428 | thank 429 | thanks 430 | thanx 431 | that 432 | that's 433 | thats 434 | the 435 | their 436 | theirs 437 | them 438 | themselves 439 | then 440 | thence 441 | there 442 | there's 443 | thereafter 444 | thereby 445 | therefore 446 | therein 447 | theres 448 | thereupon 449 | these 450 | they 451 | they'd 452 | they'll 453 | they're 454 | they've 455 | think 456 | third 457 | this 458 | thorough 459 | thoroughly 460 | those 461 | 
though 462 | three 463 | through 464 | throughout 465 | thru 466 | thus 467 | to 468 | together 469 | too 470 | took 471 | toward 472 | towards 473 | tried 474 | tries 475 | truly 476 | try 477 | trying 478 | twice 479 | two 480 | u 481 | un 482 | under 483 | unfortunately 484 | unless 485 | unlikely 486 | until 487 | unto 488 | up 489 | upon 490 | us 491 | use 492 | used 493 | useful 494 | uses 495 | using 496 | usually 497 | uucp 498 | v 499 | value 500 | various 501 | very 502 | via 503 | viz 504 | vs 505 | w 506 | want 507 | wants 508 | was 509 | wasn't 510 | way 511 | we 512 | we'd 513 | we'll 514 | we're 515 | we've 516 | welcome 517 | well 518 | went 519 | were 520 | weren't 521 | what 522 | what's 523 | whatever 524 | when 525 | whence 526 | whenever 527 | where 528 | where's 529 | whereafter 530 | whereas 531 | whereby 532 | wherein 533 | whereupon 534 | wherever 535 | whether 536 | which 537 | while 538 | whither 539 | who 540 | who's 541 | whoever 542 | whole 543 | whom 544 | whose 545 | why 546 | will 547 | willing 548 | wish 549 | with 550 | within 551 | without 552 | won't 553 | wonder 554 | would 555 | would 556 | wouldn't 557 | x 558 | y 559 | yes 560 | yet 561 | you 562 | you'd 563 | you'll 564 | you're 565 | you've 566 | your 567 | yours 568 | yourself 569 | yourselves 570 | z 571 | zero 572 | -------------------------------------------------------------------------------- /textplot/graphs.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import networkx as nx 4 | import matplotlib.pyplot as plt 5 | 6 | from abc import ABCMeta, abstractmethod 7 | from clint.textui.progress import bar 8 | 9 | 10 | class Graph(metaclass=ABCMeta): 11 | 12 | 13 | def __init__(self): 14 | 15 | """ 16 | Initialize the graph. 17 | """ 18 | 19 | self.graph = nx.Graph() 20 | 21 | 22 | @abstractmethod 23 | def build(self): 24 | pass 25 | 26 | 27 | def draw_spring(self, **kwargs): 28 | 29 | """ 30 | Render a spring layout. 31 | """ 32 | 33 | nx.draw_spring( 34 | self.graph, 35 | with_labels=True, 36 | font_size=10, 37 | edge_color='#dddddd', 38 | node_size=0, 39 | **kwargs 40 | ) 41 | 42 | plt.show() 43 | 44 | 45 | def write_gml(self, path): 46 | 47 | """ 48 | Write a GML file. 49 | 50 | Args: 51 | path (str): The file path. 52 | """ 53 | 54 | nx.write_gml(self.graph, path) 55 | 56 | 57 | def write_graphml(self, path): 58 | 59 | """ 60 | Write a GraphML file. 61 | 62 | Args: 63 | path (str): The file path. 64 | """ 65 | 66 | nx.write_graphml(self.graph, path) 67 | 68 | 69 | class Skimmer(Graph): 70 | 71 | 72 | def build(self, text, matrix, skim_depth=10, d_weights=False): 73 | 74 | """ 75 | 1. For each term in the passed matrix, score its KDE similarity with 76 | all other indexed terms. 77 | 78 | 2. With the ordered stack of similarities in hand, skim off the top X 79 | pairs and add them as edges. 80 | 81 | Args: 82 | text (Text): The source text instance. 83 | matrix (Matrix): An indexed term matrix. 84 | skim_depth (int): The number of siblings for each term. 85 | d_weights (bool): If true, give "close" words low edge weights. 86 | """ 87 | 88 | for anchor in bar(matrix.keys): 89 | 90 | n1 = text.unstem(anchor) 91 | 92 | # Heaviest pair scores: 93 | pairs = matrix.anchored_pairs(anchor).items() 94 | for term, weight in list(pairs)[:skim_depth]: 95 | 96 | # If edges represent distance, use the complement of the raw 97 | # score, so that similar words are connected by "short" edges. 
98 | if d_weights: weight = 1-weight 99 | 100 | n2 = text.unstem(term) 101 | 102 | # NetworkX does not handle numpy types when writing graphml, 103 | # so we cast the weight to a regular float. 104 | self.graph.add_edge(n1, n2, weight=float(weight)) 105 | -------------------------------------------------------------------------------- /textplot/helpers.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import click 4 | 5 | from textplot.text import Text 6 | from textplot.graphs import Skimmer 7 | from textplot.matrix import Matrix 8 | 9 | 10 | def build_graph(path, term_depth=1000, skim_depth=10, 11 | d_weights=False, **kwargs): 12 | 13 | """ 14 | Tokenize a text, index a term matrix, and build out a graph. 15 | 16 | Args: 17 | path (str): The file path. 18 | term_depth (int): Consider the N most frequent terms. 19 | skim_depth (int): Connect each word to the N closest siblings. 20 | d_weights (bool): If true, give "close" nodes low weights. 21 | 22 | Returns: 23 | Skimmer: The indexed graph. 24 | """ 25 | 26 | # Tokenize text. 27 | click.echo('\nTokenizing text...') 28 | t = Text.from_file(path) 29 | click.echo('Extracted %d tokens' % len(t.tokens)) 30 | 31 | m = Matrix() 32 | 33 | # Index the term matrix. 34 | click.echo('\nIndexing terms:') 35 | m.index(t, t.most_frequent_terms(term_depth), **kwargs) 36 | 37 | g = Skimmer() 38 | 39 | # Construct the network. 40 | click.echo('\nGenerating graph:') 41 | g.build(t, m, skim_depth, d_weights) 42 | 43 | return g 44 | -------------------------------------------------------------------------------- /textplot/matrix.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import numpy as np 4 | import textplot.utils as utils 5 | 6 | from itertools import combinations 7 | from clint.textui.progress import bar 8 | from scipy.misc import comb 9 | from collections import OrderedDict 10 | 11 | 12 | class Matrix: 13 | 14 | 15 | def __init__(self): 16 | 17 | """ 18 | Initialize the underlying dictionary. 19 | """ 20 | 21 | self.clear() 22 | 23 | 24 | def clear(self): 25 | 26 | """ 27 | Reset the pair mappings and key set. 28 | """ 29 | 30 | self.keys = set() 31 | self.pairs = {} 32 | 33 | 34 | def key(self, term1, term2): 35 | 36 | """ 37 | Get an order-independent key for a pair of terms. 38 | 39 | Args: 40 | term1 (str) 41 | term2 (str) 42 | 43 | Returns: 44 | str: The dictionary key. 45 | """ 46 | 47 | return tuple(sorted((term1, term2))) 48 | 49 | 50 | def set_pair(self, term1, term2, value, **kwargs): 51 | 52 | """ 53 | Set the value for a pair of terms. 54 | 55 | Args: 56 | term1 (str) 57 | term2 (str) 58 | value (mixed) 59 | """ 60 | 61 | key = self.key(term1, term2) 62 | self.keys.update([term1, term2]) 63 | self.pairs[key] = value 64 | 65 | 66 | def get_pair(self, term1, term2): 67 | 68 | """ 69 | Get the value for a pair of terms. 70 | 71 | Args: 72 | term1 (str) 73 | term2 (str) 74 | 75 | Returns: 76 | The stored value. 77 | """ 78 | 79 | key = self.key(term1, term2) 80 | return self.pairs.get(key, None) 81 | 82 | 83 | def index(self, text, terms=None, **kwargs): 84 | 85 | """ 86 | Index all term pair distances. 87 | 88 | Args: 89 | text (Text): The source text. 90 | terms (list): Terms to index. 91 | """ 92 | 93 | self.clear() 94 | 95 | # By default, use all terms. 
96 | terms = terms or text.terms.keys() 97 | 98 | pairs = combinations(terms, 2) 99 | count = comb(len(terms), 2) 100 | 101 | for t1, t2 in bar(pairs, expected_size=count, every=1000): 102 | 103 | # Set the Bray-Curtis distance. 104 | score = text.score_braycurtis(t1, t2, **kwargs) 105 | self.set_pair(t1, t2, score) 106 | 107 | 108 | def anchored_pairs(self, anchor): 109 | 110 | """ 111 | Get distances between an anchor term and all other terms. 112 | 113 | Args: 114 | anchor (str): The anchor term. 115 | 116 | Returns: 117 | OrderedDict: The distances, in descending order. 118 | """ 119 | 120 | pairs = OrderedDict() 121 | 122 | for term in self.keys: 123 | score = self.get_pair(anchor, term) 124 | if score: pairs[term] = score 125 | 126 | return utils.sort_dict(pairs) 127 | -------------------------------------------------------------------------------- /textplot/text.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import os 4 | import re 5 | import matplotlib.pyplot as plt 6 | import textplot.utils as utils 7 | import numpy as np 8 | import pkgutil 9 | 10 | from nltk.stem import PorterStemmer 11 | from sklearn.neighbors import KernelDensity 12 | from collections import OrderedDict, Counter 13 | from scipy.spatial import distance 14 | from scipy import ndimage 15 | from functools import lru_cache 16 | 17 | 18 | class Text: 19 | 20 | 21 | @classmethod 22 | def from_file(cls, path): 23 | 24 | """ 25 | Create a text from a file. 26 | 27 | Args: 28 | path (str): The file path. 29 | """ 30 | 31 | with open(path, 'r', errors='replace') as f: 32 | return cls(f.read()) 33 | 34 | 35 | def __init__(self, text, stopwords=None): 36 | 37 | """ 38 | Store the raw text, tokenize. 39 | 40 | Args: 41 | text (str): The raw text string. 42 | stopwords (str): A custom stopwords list path. 43 | """ 44 | 45 | self.text = text 46 | self.load_stopwords(stopwords) 47 | self.tokenize() 48 | 49 | 50 | def load_stopwords(self, path): 51 | 52 | """ 53 | Load a set of stopwords. 54 | 55 | Args: 56 | path (str): The stopwords file path. 57 | """ 58 | 59 | if path: 60 | with open(path) as f: 61 | self.stopwords = set(f.read().splitlines()) 62 | 63 | else: 64 | self.stopwords = set( 65 | pkgutil 66 | .get_data('textplot', 'data/stopwords.txt') 67 | .decode('utf8') 68 | .splitlines() 69 | ) 70 | 71 | 72 | def tokenize(self): 73 | 74 | """ 75 | Tokenize the text. 76 | """ 77 | 78 | self.tokens = [] 79 | self.terms = OrderedDict() 80 | 81 | # Generate tokens. 82 | for token in utils.tokenize(self.text): 83 | 84 | # Ignore stopwords. 85 | if token['unstemmed'] in self.stopwords: 86 | self.tokens.append(None) 87 | 88 | else: 89 | 90 | # Token: 91 | self.tokens.append(token) 92 | 93 | # Term: 94 | offsets = self.terms.setdefault(token['stemmed'], []) 95 | offsets.append(token['offset']) 96 | 97 | 98 | def term_counts(self): 99 | 100 | """ 101 | Returns: 102 | OrderedDict: An ordered dictionary of term counts. 103 | """ 104 | 105 | counts = OrderedDict() 106 | for term in self.terms: 107 | counts[term] = len(self.terms[term]) 108 | 109 | return utils.sort_dict(counts) 110 | 111 | 112 | def term_count_buckets(self): 113 | 114 | """ 115 | Returns: 116 | dict: A dictionary that maps occurrence counts to the terms that 117 | appear that many times in the text. 
118 | """ 119 | 120 | buckets = {} 121 | for term, count in self.term_counts().items(): 122 | if count in buckets: buckets[count].append(term) 123 | else: buckets[count] = [term] 124 | 125 | return buckets 126 | 127 | 128 | def most_frequent_terms(self, depth): 129 | 130 | """ 131 | Get the X most frequent terms in the text, and then probe down to get 132 | any other terms that have the same count as the last term. 133 | 134 | Args: 135 | depth (int): The number of terms. 136 | 137 | Returns: 138 | set: The set of frequent terms. 139 | """ 140 | 141 | counts = self.term_counts() 142 | 143 | # Get the top X terms and the instance count of the last word. 144 | top_terms = set(list(counts.keys())[:depth]) 145 | end_count = list(counts.values())[:depth][-1] 146 | 147 | # Merge in all other words that appear that number of times, so 148 | # that we don't truncate the last bucket - eg, half of the words that 149 | # appear 5 times, but not the other half. 150 | 151 | bucket = self.term_count_buckets()[end_count] 152 | return top_terms.union(set(bucket)) 153 | 154 | 155 | def unstem(self, term): 156 | 157 | """ 158 | Given a stemmed term, get the most common unstemmed variant. 159 | 160 | Args: 161 | term (str): A stemmed term. 162 | 163 | Returns: 164 | str: The unstemmed token. 165 | """ 166 | 167 | originals = [] 168 | for i in self.terms[term]: 169 | originals.append(self.tokens[i]['unstemmed']) 170 | 171 | mode = Counter(originals).most_common(1) 172 | return mode[0][0] 173 | 174 | 175 | @lru_cache(maxsize=None) 176 | def kde(self, term, bandwidth=2000, samples=1000, kernel='gaussian'): 177 | 178 | """ 179 | Estimate the kernel density of the instances of term in the text. 180 | 181 | Args: 182 | term (str): A stemmed term. 183 | bandwidth (int): The kernel bandwidth. 184 | samples (int): The number of evenly-spaced sample points. 185 | kernel (str): The kernel function. 186 | 187 | Returns: 188 | np.array: The density estimate. 189 | """ 190 | 191 | # Get the offsets of the term instances. 192 | terms = np.array(self.terms[term])[:, np.newaxis] 193 | 194 | # Fit the density estimator on the terms. 195 | kde = KernelDensity(kernel=kernel, bandwidth=bandwidth).fit(terms) 196 | 197 | # Score an evenly-spaced array of samples. 198 | x_axis = np.linspace(0, len(self.tokens), samples)[:, np.newaxis] 199 | scores = kde.score_samples(x_axis) 200 | 201 | # Scale the scores to integrate to 1. 202 | return np.exp(scores) * (len(self.tokens) / samples) 203 | 204 | 205 | def score_intersect(self, term1, term2, **kwargs): 206 | 207 | """ 208 | Compute the geometric area of the overlap between the kernel density 209 | estimates of two terms. 210 | 211 | Args: 212 | term1 (str) 213 | term2 (str) 214 | 215 | Returns: float 216 | """ 217 | 218 | t1_kde = self.kde(term1, **kwargs) 219 | t2_kde = self.kde(term2, **kwargs) 220 | 221 | # Integrate the overlap. 222 | overlap = np.minimum(t1_kde, t2_kde) 223 | return np.trapz(overlap) 224 | 225 | 226 | def score_cosine(self, term1, term2, **kwargs): 227 | 228 | """ 229 | Compute a weighting score based on the cosine distance between the 230 | kernel density estimates of two terms. 
231 | 232 | Args: 233 | term1 (str) 234 | term2 (str) 235 | 236 | Returns: float 237 | """ 238 | 239 | t1_kde = self.kde(term1, **kwargs) 240 | t2_kde = self.kde(term2, **kwargs) 241 | 242 | return 1-distance.cosine(t1_kde, t2_kde) 243 | 244 | 245 | def score_braycurtis(self, term1, term2, **kwargs): 246 | 247 | """ 248 | Compute a weighting score based on the Bray-Curtis distance between 249 | the kernel density estimates of two terms. 250 | 251 | Args: 252 | term1 (str) 253 | term2 (str) 254 | 255 | Returns: float 256 | """ 257 | 258 | t1_kde = self.kde(term1, **kwargs) 259 | t2_kde = self.kde(term2, **kwargs) 260 | 261 | return 1-distance.braycurtis(t1_kde, t2_kde) 262 | 263 | 264 | def plot_term_kdes(self, words, **kwargs): 265 | 266 | """ 267 | Plot kernel density estimates for multiple words. 268 | 269 | Args: 270 | words (list): A list of unstemmed terms. 271 | """ 272 | 273 | stem = PorterStemmer().stem 274 | 275 | for word in words: 276 | kde = self.kde(stem(word), **kwargs) 277 | plt.plot(kde) 278 | 279 | plt.show() 280 | -------------------------------------------------------------------------------- /textplot/utils.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import re 4 | import numpy as np 5 | import functools 6 | 7 | from collections import OrderedDict 8 | from nltk.stem import PorterStemmer 9 | from itertools import islice 10 | 11 | 12 | def tokenize(text): 13 | 14 | """ 15 | Yield tokens. 16 | 17 | Args: 18 | text (str): The original text. 19 | 20 | Yields: 21 | dict: The next token. 22 | """ 23 | 24 | stem = PorterStemmer().stem 25 | tokens = re.finditer('[a-z]+', text.lower()) 26 | 27 | for offset, match in enumerate(tokens): 28 | 29 | # Get the raw token. 30 | unstemmed = match.group(0) 31 | 32 | yield { # Emit the token. 33 | 'stemmed': stem(unstemmed), 34 | 'unstemmed': unstemmed, 35 | 'offset': offset 36 | } 37 | 38 | 39 | def sort_dict(d, desc=True): 40 | 41 | """ 42 | Sort an ordered dictionary by value, descending. 43 | 44 | Args: 45 | d (OrderedDict): An ordered dictionary. 46 | desc (bool): If true, sort desc. 47 | 48 | Returns: 49 | OrderedDict: The sorted dictionary. 50 | """ 51 | 52 | sort = sorted(d.items(), key=lambda x: x[1], reverse=desc) 53 | return OrderedDict(sort) 54 | 55 | 56 | def window(seq, n=2): 57 | 58 | """ 59 | Yield a sliding window over an iterable. 60 | 61 | Args: 62 | seq (iter): The sequence. 63 | n (int): The window width. 64 | 65 | Yields: 66 | tuple: The next window. 67 | """ 68 | 69 | it = iter(seq) 70 | result = tuple(islice(it, n)) 71 | 72 | if len(result) == n: 73 | yield result 74 | 75 | for token in it: 76 | result = result[1:] + (token,) 77 | yield result 78 | --------------------------------------------------------------------------------
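The modules above can also be driven straight from Python, without the command-line entry point. Here is a minimal sketch using the `build_graph()` helper in `textplot/helpers.py`; the input path is hypothetical (any UTF-8 plain-text file will do), and it assumes an environment where the project's dependencies import cleanly (note that `textplot/matrix.py` imports `comb` from `scipy.misc`, which newer SciPy releases have moved to `scipy.special`).

```python
# Minimal usage sketch of the build_graph() helper. The file names are
# placeholders; substitute any plain-text input and output path.
from textplot.helpers import build_graph

g = build_graph(
    'war-and-peace.txt',  # hypothetical input file
    term_depth=500,       # keep the 500 most frequent non-stopword terms
    skim_depth=10,        # connect each term to its 10 strongest siblings
)

# build_graph() returns a Skimmer wrapping a networkx.Graph, so the usual
# NetworkX API is available on g.graph.
print(g.graph.number_of_nodes(), g.graph.number_of_edges())

# Export for Gephi, or eyeball a quick spring layout.
g.write_gml('war-and-peace.gml')
g.draw_spring()
```

Because `build_graph()` forwards extra keyword arguments through `Matrix.index()` into `Text.kde()`, KDE parameters such as `bandwidth` or `samples` can be passed in the same call, e.g. `build_graph('war-and-peace.txt', bandwidth=5000)`.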
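For finer-grained control, the same pipeline can be run one step at a time. The sketch below uses a toy string so it runs instantly (a real input would be book-length), and it reproduces for a single anchor term what `Skimmer.build()` does for every indexed term.

```python
# Step-by-step sketch of the pipeline behind Skimmer.build(), on a toy text.
from textplot.text import Text
from textplot.matrix import Matrix

t = Text('the war began and the war ended while peace lingered on')

# Score every pair of frequent terms with 1 - Bray-Curtis dissimilarity
# between their kernel density estimates.
m = Matrix()
m.index(t, t.most_frequent_terms(10))

# anchored_pairs() returns the other indexed terms ordered by score
# (descending), i.e. the custom "topic" for the anchor term.
for term, score in list(m.anchored_pairs('war').items())[:5]:
    print(t.unstem(term), round(score, 3))
```

Note that `Matrix.index()` and `anchored_pairs()` operate on stemmed terms, which is why the loop calls `t.unstem()` before printing.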
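Finally, the edge weights themselves come from `Text.kde()` and `Text.score_braycurtis()`. The snippet below spells that arithmetic out on two synthetic density curves; the curves are stand-ins (an assumption) for real `kde()` output, which is likewise scaled so that it sums to roughly one.

```python
# What score_braycurtis() computes, worked out on two synthetic PDFs.
import numpy as np
from scipy.spatial import distance

x = np.linspace(0, 1, 1000)
pdf_a = np.exp(-((x - 0.30) ** 2) / 0.01)  # term A concentrated near 0.30
pdf_b = np.exp(-((x - 0.35) ** 2) / 0.01)  # term B concentrated near 0.35

# Normalize so each curve sums to 1, mirroring the scaling in Text.kde().
pdf_a /= pdf_a.sum()
pdf_b /= pdf_b.sum()

# Bray-Curtis dissimilarity is sum(|a - b|) / sum(a + b); the edge weight is
# its complement, so heavily overlapping distributions score near 1.
similarity = 1 - distance.braycurtis(pdf_a, pdf_b)
print(round(similarity, 3))  # roughly 0.72 for these two curves
```

Two terms that appear in the same parts of a document produce nearly identical density curves and therefore a score close to 1.0; terms confined to different regions of the text score close to 0.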