├── .gitignore
├── AUTHORS
├── LICENSE
├── MANIFEST.in
├── README.md
├── setup.cfg
├── setup.py
├── tests
│   └── __init__.py
└── uts
    ├── __init__.py
    ├── c99.py
    ├── texttiling.py
    └── utils.py
/.gitignore:
--------------------------------------------------------------------------------
*.pyc
.DS_Store
build/
dist/
*egg-info/
*.bak
--------------------------------------------------------------------------------
/AUTHORS:
--------------------------------------------------------------------------------
uts is written and maintained by Liang Wang and various contributors:

Development Lead
~~~~~~~~~~~~~~~~

- Liang Wang


Patches and Suggestions
~~~~~~~~~~~~~~~~~~~~~~~

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
Copyright (C) 2016 by Liang Wang and individual contributors.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
recursive-include tests *.py

include AUTHORS
include LICENSE
include README.md
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
## Unsupervised Text Segmentation

### Install

For `python 2.x`:

    sudo pip install uts

For `python 3.x`:

    sudo pip3 install uts

### Usage

Each element of `boundary` is 1 if a new segment starts at the corresponding
position in `document`, and 0 otherwise; the first element is always 1.

```python
import uts

document = ['this is a good day', 'good day means good weather',
            'I love computer science', 'computer science is cool']
model = uts.C99(window=2)
boundary = model.segment(document)
print(boundary)
# output: [1, 0, 1, 0]
```
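
The package also exports a `TextTiling` segmenter with the same `segment()`
interface. A minimal sketch (same document as above; the returned list uses
the same 0/1 boundary format, but the actual values depend on the input):

```python
import uts

document = ['this is a good day', 'good day means good weather',
            'I love computer science', 'computer science is cool']
model = uts.TextTiling(window=5)  # window=5 is the default
boundary = model.segment(document)
print(boundary)
```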
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
[nosetests]
with-coverage = true
cover-package = uts
cover-html = true
cover-erase = true
cover-inclusive = true
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
import os
from setuptools import setup, find_packages

from uts import __version__

def read(fname):
    return open(os.path.join(os.path.dirname(__file__), fname)).read()

requirements = ['numpy>=1.10.4']

setup(
    name = "uts",
    version = ".".join(map(str, __version__)),
    description = "Python package for unsupervised text segmentation",
    long_description = read('README.md'),
    url = 'https://github.com/intfloat/uts',
    license = 'MIT',
    author = 'Liang Wang',
    author_email = 'wangliangpeking@gmail.com',
    packages = find_packages(exclude=['tests']),
    include_package_data = True,
    classifiers = [
        'Development Status :: 3 - Alpha',
        'Environment :: Console',
        'Intended Audience :: Developers',
        'Intended Audience :: Information Technology',
        'License :: OSI Approved :: MIT License',
        'Operating System :: OS Independent',
        'Programming Language :: Python',
    ],
    install_requires = requirements,
    tests_require = [],
)
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
(empty)
--------------------------------------------------------------------------------
/uts/__init__.py:
--------------------------------------------------------------------------------
__version__ = (0, 0, 4)

from .c99 import C99
from .texttiling import TextTiling
--------------------------------------------------------------------------------
/uts/c99.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# A Python implementation of the C99 algorithm for topic segmentation
from collections import Counter
import numpy as np
from .utils import *

class C99:
    """
    Reference:
        "Advances in domain independent linear text segmentation"
    """

    def __init__(self, window=4, std_coeff=1.2, tokenizer=EnglishTokenizer()):
        """
        window: int, window size for local similarity ranking
        std_coeff: double, threshold to determine boundary, see paper for more details
        tokenizer: an object with a tokenize() method,
            which takes a string as argument and returns a sequence of tokens.
        """
        self.window = window
        self.sim = None
        self.rank = None
        self.sm = None
        self.std_coeff = std_coeff
        self.tokenizer = tokenizer

    def segment(self, document):
        """
        document: list[str]
        return list[int],
            the i-th element denotes whether a boundary exists right before
            paragraph i (0-indexed)
        """
        assert(len(document) > 0 and len([d for d in document if not isinstance(d, str)]) == 0)
        if len(document) < 3:
            return [1] + [0 for _ in range(len(document) - 1)]
        # step 1, preprocessing
        n = len(document)
        self.window = min(self.window, n)
        cnts = [Counter(self.tokenizer.tokenize(document[i])) for i in range(n)]

        # step 2, compute similarity matrix
        self.sim = np.zeros((n, n))
        for i in range(n):
            for j in range(i, n):
                self.sim[i][j] = cosine_sim(cnts[i], cnts[j])
                self.sim[j][i] = self.sim[i][j]
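
        # Illustrative note (not in the original source): with the README
        # example, 'this is a good day' and 'good day means good weather'
        # share only the words 'good' and 'day', so their similarity is
        #   (1*2 + 1*1) / (sqrt(5) * sqrt(7)) ~= 0.507
        # while sentences with no words in common score 0.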

        # step 3, compute rank matrix & sum matrix
        self.rank = np.zeros((n, n))
        for i in range(n):
            for j in range(i, n):
                r1 = max(0, i - self.window + 1)
                r2 = min(n - 1, i + self.window - 1)
                c1 = max(0, j - self.window + 1)
                c2 = min(n - 1, j + self.window - 1)
                sublist = self.sim[r1:(r2 + 1), c1:(c2 + 1)].flatten()
                lowlist = [x for x in sublist if x < self.sim[i][j]]
                self.rank[i][j] = 1.0 * len(lowlist) / ((r2 - r1 + 1) * (c2 - c1 + 1))
                self.rank[j][i] = self.rank[i][j]

        self.sm = np.zeros((n, n))
        # O(n^4) solution
        # for i in range(n):
        #     for j in range(i, n):
        #         self.sm[i][j] = sum(self.rank[i:(j + 1), i:(j + 1)].flatten())
        #         self.sm[j][i] = self.sm[i][j]
        # O(n^2) solution
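        # Explanatory note (not in the original source): prefix_sm[i][j] below
        # accumulates the sum of rank[0..i][0..j] via the standard 2D
        # inclusion-exclusion recurrence
        #   P[i][j] = rank[i][j] + P[i-1][j] + P[i][j-1] - P[i-1][j-1]
        # so the sum of rank over any square [i..j] x [i..j] can then be read
        # off in O(1) as P[j][j] - P[i-1][j] - P[j][i-1] + P[i-1][i-1].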
        prefix_sm = np.zeros((n, n))
        for i in range(n):
            for j in range(n):
                prefix_sm[i][j] = self.rank[i][j]
                if i - 1 >= 0: prefix_sm[i][j] += prefix_sm[i - 1][j]
                if j - 1 >= 0: prefix_sm[i][j] += prefix_sm[i][j - 1]
                if i - 1 >= 0 and j - 1 >= 0: prefix_sm[i][j] -= prefix_sm[i - 1][j - 1]
        for i in range(n):
            for j in range(i, n):
                if i == 0:
                    self.sm[i][j] = prefix_sm[j][j]
                else:
                    self.sm[i][j] = prefix_sm[j][j] - prefix_sm[i - 1][j] \
                                    - prefix_sm[j][i - 1] + prefix_sm[i - 1][i - 1]
                self.sm[j][i] = self.sm[i][j]

        # step 4, determine boundaries
        D = 1.0 * self.sm[0][n - 1] / (n * n)
        darr, region_arr, idx = [D], [Region(0, n - 1, self.sm)], []
        sum_region, sum_area = float(self.sm[0][n - 1]), float(n * n)
        for i in range(n - 1):
            mx, pos = -1e9, -1
            for j, region in enumerate(region_arr):
                if region.l == region.r:
                    continue
                region.split(self.sm)
                den = sum_area - region.area + region.lch.area + region.rch.area
                cur = (sum_region - region.tot + region.lch.tot + region.rch.tot) / den
                if cur > mx:
                    mx, pos = cur, j
            assert(pos >= 0)
            tmp = region_arr[pos]
            region_arr[pos] = tmp.rch
            region_arr.insert(pos, tmp.lch)
            sum_region += tmp.lch.tot + tmp.rch.tot - tmp.tot
            sum_area += tmp.lch.area + tmp.rch.area - tmp.area
            darr.append(sum_region / sum_area)
            idx.append(tmp.best_pos)

        dgrad = [(darr[i + 1] - darr[i]) for i in range(len(darr) - 1)]

        # optional step, smooth gradient
        smooth_dgrad = [dgrad[i] for i in range(len(dgrad))]
        if len(dgrad) > 1:
            smooth_dgrad[0] = (dgrad[0] * 2 + dgrad[1]) / 3.0
            smooth_dgrad[-1] = (dgrad[-1] * 2 + dgrad[-2]) / 3.0
        for i in range(1, len(dgrad) - 1):
            smooth_dgrad[i] = (dgrad[i - 1] + 2 * dgrad[i] + dgrad[i + 1]) / 4.0
        dgrad = smooth_dgrad

        avg, stdev = np.average(dgrad), np.std(dgrad)
        cutoff = avg + self.std_coeff * stdev
        assert(len(idx) == len(dgrad))
        above_cutoff_idx = [i for i in range(len(dgrad)) if dgrad[i] >= cutoff]
        if len(above_cutoff_idx) == 0:
            boundary = []
        else:
            boundary = idx[:max(above_cutoff_idx) + 1]
        ret = [0 for _ in range(n)]
        for i in boundary:
            ret[i] = 1
            # boundaries should not be too close to each other
            for j in range(i - 1, i + 2):
                if j >= 0 and j < n and j != i and ret[j] == 1:
                    ret[i] = 0
                    break
        return [1] + ret[:-1]

class Region:
    """
    Used to denote a rectangular region of the similarity matrix;
    never instantiate this class outside the package.
    """
    def __init__(self, l, r, sm_matrix):
        assert(r >= l)
        self.tot = sm_matrix[l][r]
        self.l = l
        self.r = r
        self.area = (r - l + 1)**2
        self.lch, self.rch, self.best_pos = None, None, -1

    def split(self, sm_matrix):
        if self.best_pos >= 0:
            return
        if self.l == self.r:
            self.best_pos = self.l
            return
        assert(self.r > self.l)
        mx, pos = -1e9, -1
        for i in range(self.l, self.r):
            carea = (i - self.l + 1)**2 + (self.r - i)**2
            cur = (sm_matrix[self.l][i] + sm_matrix[i + 1][self.r]) / carea
            if cur > mx:
                mx, pos = cur, i
        assert(pos >= self.l and pos < self.r)
        self.lch = Region(self.l, pos, sm_matrix)
        self.rch = Region(pos + 1, self.r, sm_matrix)
        self.best_pos = pos
--------------------------------------------------------------------------------
/uts/texttiling.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# Based on the NLTK TextTiling implementation with some modifications
import re
import math
import numpy as np
from collections import Counter
from copy import deepcopy
from .utils import *

class TextTiling:
    """
    Reference:
        "TextTiling: Segmenting Text into Multi-paragraph Subtopic Passages"
    """

    def __init__(self, window=5, tokenizer=EnglishTokenizer()):
        """
        window: int, window size for similarity computation
        tokenizer: an object with a tokenize() method,
            which takes a string as argument and returns a sequence of tokens.
        """
        self.window = window
        self.tokenizer = tokenizer

    def segment(self, document):
        """
        document: list[str]
        return list[int],
            the i-th element denotes whether a boundary exists right before
            paragraph i (0-indexed)
        """
        # ensure document is not empty and every element is an instance of str
        assert(len(document) > 0 and len([d for d in document if not isinstance(d, str)]) == 0)
        # step 1, do preprocessing
        n = len(document)
        # integer division so the window stays an int under Python 3
        self.window = max(min(self.window, n // 3), 1)
        cnts = [Counter(self.tokenizer.tokenize(document[i])) for i in range(n)]

        # step 2, calculate gap score
        gap_score = [0 for _ in range(n)]
        for i in range(n):
            sz = min(min(i + 1, n - i - 1), self.window)
            lcnt, rcnt = Counter(), Counter()
            for j in range(i - sz + 1, i + 1):
                lcnt += cnts[j]
            for j in range(i + 1, i + sz + 1):
                rcnt += cnts[j]
            gap_score[i] = cosine_sim(lcnt, rcnt)
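
        # Explanatory note (not in the original source): a gap with low
        # similarity sitting between two high-similarity plateaus is a likely
        # topic boundary. The depth score below measures exactly that: climb
        # left and right from position i to the nearest local maxima of
        # gap_score (lval and rval), then score the valley depth as
        #   depth[i] = lval + rval - 2 * gap_score[i]
        # Deeper valleys indicate stronger boundaries.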
21 | """ 22 | self.window = window 23 | self.tokenizer = tokenizer 24 | 25 | def segment(self, document): 26 | """ 27 | document: list[str] 28 | return list[int], 29 | i-th element denotes whether exists a boundary right before paragraph i(0 indexed) 30 | """ 31 | # ensure document is not empty and every element is an instance of str 32 | assert(len(document) > 0 and len([d for d in document if not isinstance(d, str)]) == 0) 33 | # step 1, do preprocessing 34 | n = len(document) 35 | self.window = max(min(self.window, n / 3), 1) 36 | cnts = [Counter(self.tokenizer.tokenize(document[i])) for i in range(n)] 37 | 38 | # step 2, calculate gap score 39 | gap_score = [0 for _ in range(n)] 40 | for i in range(n): 41 | sz = min(min(i + 1, n - i - 1), self.window) 42 | lcnt, rcnt = Counter(), Counter() 43 | for j in range(i - sz + 1, i + 1): 44 | lcnt += cnts[j] 45 | for j in range(i + 1, i + sz + 1): 46 | rcnt += cnts[j] 47 | gap_score[i] = cosine_sim(lcnt, rcnt) 48 | 49 | # step 3, calculate depth score 50 | depth_score = [0 for _ in range(n)] 51 | for i in range(n): 52 | if i < self.window or i + self.window >= n: 53 | continue 54 | ptr = i - 1 55 | while ptr >= 0 and gap_score[ptr] >= gap_score[ptr + 1]: 56 | ptr -= 1 57 | lval = gap_score[ptr + 1] 58 | ptr = i + 1 59 | while ptr < n and gap_score[ptr] >= gap_score[ptr - 1]: 60 | ptr += 1 61 | rval = gap_score[ptr - 1] 62 | depth_score[i] = lval + rval - 2 * gap_score[i] 63 | 64 | # step 4, smooth depth score with fixed window size 3 65 | smooth_dep_score = [0 for _ in range(n)] 66 | for i in range(n): 67 | if i - 1 < 0 or i + 1 >= n: 68 | smooth_dep_score[i] = depth_score[i] 69 | else: 70 | smooth_dep_score[i] = np.average(depth_score[(i - 1):(i + 2)]) 71 | 72 | # step 5, determine boundaries 73 | boundaries = [0 for _ in range(n)] 74 | avg = np.average(smooth_dep_score) 75 | stdev = np.std(smooth_dep_score) 76 | cutoff = avg - stdev / 2.0 77 | 78 | depth_tuples = list(zip(smooth_dep_score, list(range(len(smooth_dep_score))))) 79 | depth_tuples.sort() 80 | depth_tuples.reverse() 81 | hp = [x for x in depth_tuples if (x[0] > cutoff)] 82 | for dt in hp: 83 | boundaries[dt[1]] = 1 84 | for i in range(dt[1] - 4, dt[1] + 4 + 1): 85 | if i != dt[1] and i >= 0 and i < n and boundaries[i] == 1: 86 | boundaries[dt[1]] = 0 87 | break 88 | return [1] + boundaries[:-1] 89 | -------------------------------------------------------------------------------- /uts/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | from collections import Counter 4 | def cosine_sim(c1, c2): 5 | try: 6 | # works for Counter 7 | n1 = np.sqrt(sum([x * x for x in list(c1.values())])) 8 | n2 = np.sqrt(sum([x * x for x in list(c2.values())])) 9 | num = sum([c1[key] * c2[key] for key in c1]) 10 | except: 11 | # works for ordinary list 12 | assert(len(c1) == len(c2)) 13 | n1 = np.sqrt(sum([x * x for x in c1])) 14 | n2 = np.sqrt(sum([x * x for x in c2])) 15 | num = sum([c1[i] * c2[i] for i in range(len(c1))]) 16 | try: 17 | if n1 * n2 < 1e-9: # divide by zero case 18 | return 0 19 | return num / (n1 * n2) 20 | except: 21 | return 0 22 | 23 | class EnglishTokenizer: 24 | """ 25 | A tokenizer is a class with tokenize(text) method 26 | """ 27 | def __init__(self): 28 | pass 29 | 30 | def tokenize(self, text): 31 | return text.lower().split() 32 | --------------------------------------------------------------------------------