├── .gitignore
├── AUTHORS
├── LICENSE
├── MANIFEST.in
├── README.md
├── setup.cfg
├── setup.py
├── tests
│   └── __init__.py
└── uts
    ├── __init__.py
    ├── c99.py
    ├── texttiling.py
    └── utils.py
/.gitignore:
--------------------------------------------------------------------------------
*.pyc
.DS_Store
build/
dist/
*egg-info/
*.bak
--------------------------------------------------------------------------------
/AUTHORS:
--------------------------------------------------------------------------------
uts is written and maintained by Liang Wang and various contributors:

Development Lead
~~~~~~~~~~~~~~~~

- Liang Wang


Patches and Suggestions
~~~~~~~~~~~~~~~~~~~~~~~

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
Copyright (C) 2016 by Liang Wang and individual contributors.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
recursive-include tests *.py

include AUTHORS
include LICENSE
include README.md
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
## Unsupervised Text Segmentation

### Install

For `python 2.x`:

    sudo pip install uts

For `python 3.x`:

    sudo pip3 install uts

### Usage

Each element of `boundary` is 1 if a new segment starts at the corresponding
position in `document`, and 0 otherwise; the first element is always 1.

```python
import uts

document = ['this is a good day', 'good day means good weather',
            'I love computer science', 'computer science is cool']
model = uts.C99(window=2)
boundary = model.segment(document)
print(boundary)
# output: [1, 0, 1, 0]
```
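
The package also exports a `TextTiling` segmenter with the same `segment()`
interface. A minimal sketch (same document as above; the returned list uses
the same 0/1 boundary format, but the actual values depend on the input):

```python
import uts

document = ['this is a good day', 'good day means good weather',
            'I love computer science', 'computer science is cool']
model = uts.TextTiling(window=5)  # window=5 is the default
boundary = model.segment(document)
print(boundary)
```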
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
[nosetests]
with-coverage = true
cover-package = uts
cover-html = true
cover-erase = true
cover-inclusive = true
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
import os
from setuptools import setup, find_packages

from uts import __version__

def read(fname):
    return open(os.path.join(os.path.dirname(__file__), fname)).read()

requirements = ['numpy>=1.10.4']

setup(
    name = "uts",
    version = ".".join(map(str, __version__)),
    description = "Python package for unsupervised text segmentation",
    long_description = read('README.md'),
    url = 'https://github.com/intfloat/uts',
    license = 'MIT',
    author = 'Liang Wang',
    author_email = 'wangliangpeking@gmail.com',
    packages = find_packages(exclude=['tests']),
    include_package_data = True,
    classifiers = [
        'Development Status :: 3 - Alpha',
        'Environment :: Console',
        'Intended Audience :: Developers',
        'Intended Audience :: Information Technology',
        'License :: OSI Approved :: MIT License',
        'Operating System :: OS Independent',
        'Programming Language :: Python',
    ],
    install_requires = requirements,
    tests_require = [],
)
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
(empty)
--------------------------------------------------------------------------------
/uts/__init__.py:
--------------------------------------------------------------------------------
__version__ = (0, 0, 4)

from .c99 import C99
from .texttiling import TextTiling
--------------------------------------------------------------------------------
/uts/c99.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# A Python implementation of the C99 algorithm for topic segmentation
from collections import Counter
import numpy as np
from .utils import *

class C99:
    """
    Reference:
        "Advances in domain independent linear text segmentation"
    """

    def __init__(self, window=4, std_coeff=1.2, tokenizer=EnglishTokenizer()):
        """
        window: int, window size for local similarity ranking
        std_coeff: double, threshold to determine boundary, see paper for more details
        tokenizer: an object with a tokenize() method,
            which takes a string as argument and returns a sequence of tokens.
        """
        self.window = window
        self.sim = None
        self.rank = None
        self.sm = None
        self.std_coeff = std_coeff
        self.tokenizer = tokenizer

    def segment(self, document):
        """
        document: list[str]
        return list[int],
            the i-th element denotes whether a boundary exists right before
            paragraph i (0-indexed)
        """
        assert(len(document) > 0 and len([d for d in document if not isinstance(d, str)]) == 0)
        if len(document) < 3:
            return [1] + [0 for _ in range(len(document) - 1)]
        # step 1, preprocessing
        n = len(document)
        self.window = min(self.window, n)
        cnts = [Counter(self.tokenizer.tokenize(document[i])) for i in range(n)]

        # step 2, compute similarity matrix
        self.sim = np.zeros((n, n))
        for i in range(n):
            for j in range(i, n):
                self.sim[i][j] = cosine_sim(cnts[i], cnts[j])
                self.sim[j][i] = self.sim[i][j]
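
        # Illustrative note (not in the original source): with the README
        # example, 'this is a good day' and 'good day means good weather'
        # share only the words 'good' and 'day', so their similarity is
        #   (1*2 + 1*1) / (sqrt(5) * sqrt(7)) ~= 0.507
        # while sentences with no words in common score 0.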

        # step 3, compute rank matrix & sum matrix
        self.rank = np.zeros((n, n))
        for i in range(n):
            for j in range(i, n):
                r1 = max(0, i - self.window + 1)
                r2 = min(n - 1, i + self.window - 1)
                c1 = max(0, j - self.window + 1)
                c2 = min(n - 1, j + self.window - 1)
                sublist = self.sim[r1:(r2 + 1), c1:(c2 + 1)].flatten()
                lowlist = [x for x in sublist if x < self.sim[i][j]]
                self.rank[i][j] = 1.0 * len(lowlist) / ((r2 - r1 + 1) * (c2 - c1 + 1))
                self.rank[j][i] = self.rank[i][j]

        self.sm = np.zeros((n, n))
        # O(n^4) solution
        # for i in range(n):
        #     for j in range(i, n):
        #         self.sm[i][j] = sum(self.rank[i:(j + 1), i:(j + 1)].flatten())
        #         self.sm[j][i] = self.sm[i][j]
        # O(n^2) solution
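        # Explanatory note (not in the original source): prefix_sm[i][j] below
        # accumulates the sum of rank[0..i][0..j] via the standard 2D
        # inclusion-exclusion recurrence
        #   P[i][j] = rank[i][j] + P[i-1][j] + P[i][j-1] - P[i-1][j-1]
        # so the sum of rank over any square [i..j] x [i..j] can then be read
        # off in O(1) as P[j][j] - P[i-1][j] - P[j][i-1] + P[i-1][i-1].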
        prefix_sm = np.zeros((n, n))
        for i in range(n):
            for j in range(n):
                prefix_sm[i][j] = self.rank[i][j]
                if i - 1 >= 0: prefix_sm[i][j] += prefix_sm[i - 1][j]
                if j - 1 >= 0: prefix_sm[i][j] += prefix_sm[i][j - 1]
                if i - 1 >= 0 and j - 1 >= 0: prefix_sm[i][j] -= prefix_sm[i - 1][j - 1]
        for i in range(n):
            for j in range(i, n):
                if i == 0:
                    self.sm[i][j] = prefix_sm[j][j]
                else:
                    self.sm[i][j] = prefix_sm[j][j] - prefix_sm[i - 1][j] \
                                    - prefix_sm[j][i - 1] + prefix_sm[i - 1][i - 1]
                self.sm[j][i] = self.sm[i][j]

        # step 4, determine boundaries
        D = 1.0 * self.sm[0][n - 1] / (n * n)
        darr, region_arr, idx = [D], [Region(0, n - 1, self.sm)], []
        sum_region, sum_area = float(self.sm[0][n - 1]), float(n * n)
        for i in range(n - 1):
            mx, pos = -1e9, -1
            for j, region in enumerate(region_arr):
                if region.l == region.r:
                    continue
                region.split(self.sm)
                den = sum_area - region.area + region.lch.area + region.rch.area
                cur = (sum_region - region.tot + region.lch.tot + region.rch.tot) / den
                if cur > mx:
                    mx, pos = cur, j
            assert(pos >= 0)
            tmp = region_arr[pos]
            region_arr[pos] = tmp.rch
            region_arr.insert(pos, tmp.lch)
            sum_region += tmp.lch.tot + tmp.rch.tot - tmp.tot
            sum_area += tmp.lch.area + tmp.rch.area - tmp.area
            darr.append(sum_region / sum_area)
            idx.append(tmp.best_pos)

        dgrad = [(darr[i + 1] - darr[i]) for i in range(len(darr) - 1)]

        # optional step, smooth gradient
        smooth_dgrad = [dgrad[i] for i in range(len(dgrad))]
        if len(dgrad) > 1:
            smooth_dgrad[0] = (dgrad[0] * 2 + dgrad[1]) / 3.0
            smooth_dgrad[-1] = (dgrad[-1] * 2 + dgrad[-2]) / 3.0
        for i in range(1, len(dgrad) - 1):
            smooth_dgrad[i] = (dgrad[i - 1] + 2 * dgrad[i] + dgrad[i + 1]) / 4.0
        dgrad = smooth_dgrad

        avg, stdev = np.average(dgrad), np.std(dgrad)
        cutoff = avg + self.std_coeff * stdev
        assert(len(idx) == len(dgrad))
        above_cutoff_idx = [i for i in range(len(dgrad)) if dgrad[i] >= cutoff]
        if len(above_cutoff_idx) == 0:
            boundary = []
        else:
            boundary = idx[:max(above_cutoff_idx) + 1]
        ret = [0 for _ in range(n)]
        for i in boundary:
            ret[i] = 1
            # boundaries should not be too close to each other
            for j in range(i - 1, i + 2):
                if j >= 0 and j < n and j != i and ret[j] == 1:
                    ret[i] = 0
                    break
        return [1] + ret[:-1]

class Region:
    """
    Used to denote a rectangular region of the similarity matrix;
    never instantiate this class outside the package.
    """
    def __init__(self, l, r, sm_matrix):
        assert(r >= l)
        self.tot = sm_matrix[l][r]
        self.l = l
        self.r = r
        self.area = (r - l + 1)**2
        self.lch, self.rch, self.best_pos = None, None, -1

    def split(self, sm_matrix):
        if self.best_pos >= 0:
            return
        if self.l == self.r:
            self.best_pos = self.l
            return
        assert(self.r > self.l)
        mx, pos = -1e9, -1
        for i in range(self.l, self.r):
            carea = (i - self.l + 1)**2 + (self.r - i)**2
            cur = (sm_matrix[self.l][i] + sm_matrix[i + 1][self.r]) / carea
            if cur > mx:
                mx, pos = cur, i
        assert(pos >= self.l and pos < self.r)
        self.lch = Region(self.l, pos, sm_matrix)
        self.rch = Region(pos + 1, self.r, sm_matrix)
        self.best_pos = pos
--------------------------------------------------------------------------------
/uts/texttiling.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# Based on the NLTK TextTiling implementation with some modifications
import re
import math
import numpy as np
from collections import Counter
from copy import deepcopy
from .utils import *

class TextTiling:
    """
    Reference:
        "TextTiling: Segmenting Text into Multi-paragraph Subtopic Passages"
    """

    def __init__(self, window=5, tokenizer=EnglishTokenizer()):
        """
        window: int, window size for similarity computation
        tokenizer: an object with a tokenize() method,
            which takes a string as argument and returns a sequence of tokens.
        """
        self.window = window
        self.tokenizer = tokenizer

    def segment(self, document):
        """
        document: list[str]
        return list[int],
            the i-th element denotes whether a boundary exists right before
            paragraph i (0-indexed)
        """
        # ensure document is not empty and every element is an instance of str
        assert(len(document) > 0 and len([d for d in document if not isinstance(d, str)]) == 0)
        # step 1, do preprocessing
        n = len(document)
        # integer division so the window stays an int under Python 3
        self.window = max(min(self.window, n // 3), 1)
        cnts = [Counter(self.tokenizer.tokenize(document[i])) for i in range(n)]

        # step 2, calculate gap score
        gap_score = [0 for _ in range(n)]
        for i in range(n):
            sz = min(min(i + 1, n - i - 1), self.window)
            lcnt, rcnt = Counter(), Counter()
            for j in range(i - sz + 1, i + 1):
                lcnt += cnts[j]
            for j in range(i + 1, i + sz + 1):
                rcnt += cnts[j]
            gap_score[i] = cosine_sim(lcnt, rcnt)
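
        # Explanatory note (not in the original source): a gap with low
        # similarity sitting between two high-similarity plateaus is a likely
        # topic boundary. The depth score below measures exactly that: climb
        # left and right from position i to the nearest local maxima of
        # gap_score (lval and rval), then score the valley depth as
        #   depth[i] = lval + rval - 2 * gap_score[i]
        # Deeper valleys indicate stronger boundaries.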
21 | """ 22 | self.window = window 23 | self.tokenizer = tokenizer 24 | 25 | def segment(self, document): 26 | """ 27 | document: list[str] 28 | return list[int], 29 | i-th element denotes whether exists a boundary right before paragraph i(0 indexed) 30 | """ 31 | # ensure document is not empty and every element is an instance of str 32 | assert(len(document) > 0 and len([d for d in document if not isinstance(d, str)]) == 0) 33 | # step 1, do preprocessing 34 | n = len(document) 35 | self.window = max(min(self.window, n / 3), 1) 36 | cnts = [Counter(self.tokenizer.tokenize(document[i])) for i in range(n)] 37 | 38 | # step 2, calculate gap score 39 | gap_score = [0 for _ in range(n)] 40 | for i in range(n): 41 | sz = min(min(i + 1, n - i - 1), self.window) 42 | lcnt, rcnt = Counter(), Counter() 43 | for j in range(i - sz + 1, i + 1): 44 | lcnt += cnts[j] 45 | for j in range(i + 1, i + sz + 1): 46 | rcnt += cnts[j] 47 | gap_score[i] = cosine_sim(lcnt, rcnt) 48 | 49 | # step 3, calculate depth score 50 | depth_score = [0 for _ in range(n)] 51 | for i in range(n): 52 | if i < self.window or i + self.window >= n: 53 | continue 54 | ptr = i - 1 55 | while ptr >= 0 and gap_score[ptr] >= gap_score[ptr + 1]: 56 | ptr -= 1 57 | lval = gap_score[ptr + 1] 58 | ptr = i + 1 59 | while ptr < n and gap_score[ptr] >= gap_score[ptr - 1]: 60 | ptr += 1 61 | rval = gap_score[ptr - 1] 62 | depth_score[i] = lval + rval - 2 * gap_score[i] 63 | 64 | # step 4, smooth depth score with fixed window size 3 65 | smooth_dep_score = [0 for _ in range(n)] 66 | for i in range(n): 67 | if i - 1 < 0 or i + 1 >= n: 68 | smooth_dep_score[i] = depth_score[i] 69 | else: 70 | smooth_dep_score[i] = np.average(depth_score[(i - 1):(i + 2)]) 71 | 72 | # step 5, determine boundaries 73 | boundaries = [0 for _ in range(n)] 74 | avg = np.average(smooth_dep_score) 75 | stdev = np.std(smooth_dep_score) 76 | cutoff = avg - stdev / 2.0 77 | 78 | depth_tuples = list(zip(smooth_dep_score, list(range(len(smooth_dep_score))))) 79 | depth_tuples.sort() 80 | depth_tuples.reverse() 81 | hp = [x for x in depth_tuples if (x[0] > cutoff)] 82 | for dt in hp: 83 | boundaries[dt[1]] = 1 84 | for i in range(dt[1] - 4, dt[1] + 4 + 1): 85 | if i != dt[1] and i >= 0 and i < n and boundaries[i] == 1: 86 | boundaries[dt[1]] = 0 87 | break 88 | return [1] + boundaries[:-1] 89 | -------------------------------------------------------------------------------- /uts/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | from collections import Counter 4 | def cosine_sim(c1, c2): 5 | try: 6 | # works for Counter 7 | n1 = np.sqrt(sum([x * x for x in list(c1.values())])) 8 | n2 = np.sqrt(sum([x * x for x in list(c2.values())])) 9 | num = sum([c1[key] * c2[key] for key in c1]) 10 | except: 11 | # works for ordinary list 12 | assert(len(c1) == len(c2)) 13 | n1 = np.sqrt(sum([x * x for x in c1])) 14 | n2 = np.sqrt(sum([x * x for x in c2])) 15 | num = sum([c1[i] * c2[i] for i in range(len(c1))]) 16 | try: 17 | if n1 * n2 < 1e-9: # divide by zero case 18 | return 0 19 | return num / (n1 * n2) 20 | except: 21 | return 0 22 | 23 | class EnglishTokenizer: 24 | """ 25 | A tokenizer is a class with tokenize(text) method 26 | """ 27 | def __init__(self): 28 | pass 29 | 30 | def tokenize(self, text): 31 | return text.lower().split() 32 | --------------------------------------------------------------------------------