├── tests
│   ├── __init__.py
│   ├── output
│   │   └── .hgkeep
│   ├── README.markdown
│   ├── api.py
│   ├── task.py
│   ├── runtests.py
│   ├── test.py
│   └── models.py
├── detdup
│   ├── services
│   │   ├── __init__.py
│   │   ├── api.py
│   │   └── task.py
│   ├── __init__.py
│   ├── features
│   │   ├── __init__.py
│   │   └── default.py
│   ├── data_model
│   │   ├── __init__.py
│   │   ├── base.py
│   │   └── fake_item_ids.py
│   ├── utils.py
│   └── core.py
├── TODO.markdown
├── .travis.yml
├── .gitignore
├── setup.py
├── tox.ini
├── LICENSE
├── USAGE.markdown
├── README.markdown
└── keynote.markdown
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/tests/output/.hgkeep:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/detdup/services/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/detdup/__init__.py:
--------------------------------------------------------------------------------
1 | from .core import DetDupCore
2 | 
--------------------------------------------------------------------------------
/detdup/features/__init__.py:
--------------------------------------------------------------------------------
1 | from .default import DefaultFeatures
2 | 
--------------------------------------------------------------------------------
/TODO.markdown:
--------------------------------------------------------------------------------
1 | * Support int type item\_id
2 | * In addition to pymongo, support more ORMs
3 | 
--------------------------------------------------------------------------------
/detdup/data_model/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | from .base import DetDupDataModel
4 | 
--------------------------------------------------------------------------------
/tests/README.markdown:
--------------------------------------------------------------------------------
1 | run
2 | ===================
3 | ```bash
4 | pip install nose
5 | nosetests
6 | ```
7 | 
--------------------------------------------------------------------------------
/tests/api.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | from tests.task import *
4 | from detdup.services.api import DetDupApi
5 | 
6 | dda = DetDupApi(detdup_opts)
7 | dda.init_api()
8 | 
--------------------------------------------------------------------------------
/detdup/utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | import os
4 | import glob, sys
5 | from bson.objectid import ObjectId
6 | from termcolor import colored, cprint
7 | from datetime import datetime
8 | import logging
9 | 
10 | from etl_utils import cpickle_cache, process_notifier, HashUtils
11 | hashvalue_with_sorted = HashUtils.hashvalue_with_sorted
12 | 
13 | # system info
14 | import multiprocessing
15 | max_process_count = ('Darwin' in os.uname()) and (multiprocessing.cpu_count()-1) or 8
16 | 
--------------------------------------------------------------------------------
/tests/task.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | import os, sys
4 | current_dir = 
os.path.dirname(os.path.dirname(os.path.realpath(__file__))) 5 | sys.path.append(current_dir) 6 | 7 | from detdup.services.task import DetDupTask 8 | 9 | from tests.models import cache_dir, OriginalModel, CleanModel, PLFeature 10 | 11 | detdup_opts = { 12 | "process_count" : 3, 13 | 14 | "cache_dir" : cache_dir, 15 | 16 | "original_model" : OriginalModel, 17 | "items_model" : CleanModel, 18 | 19 | "features" : [PLFeature], 20 | 21 | "query_check_columns" : ["desc"], 22 | } 23 | ddt = DetDupTask(detdup_opts) 24 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | env: 4 | global: 5 | - PIP_DOWNLOAD_CACHE=$HOME/.pip-cache 6 | matrix: 7 | - TOXENV=py27 8 | 9 | sudo: false 10 | 11 | cache: 12 | - $HOME/.pip-cache 13 | 14 | install: 15 | - pip install coveralls 16 | - pip install tox 17 | 18 | before_script: 19 | # allow ssh loopback 20 | - ssh-keygen -t rsa -N '' -C '' -f ~/.ssh/id_rsa 21 | - cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys 22 | - ssh -o StrictHostKeyChecking=no localhost true 23 | 24 | script: 25 | - python setup.py install 26 | - nosetests 27 | - coverage run --source=detdup setup.py test 28 | 29 | after_failure: 30 | 31 | after_success: 32 | - coveralls 33 | 34 | branches: 35 | only: 36 | - master 37 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | bin/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # Installer logs 26 | pip-log.txt 27 | pip-delete-this-directory.txt 28 | 29 | # Unit test / coverage reports 30 | htmlcov/ 31 | .tox/ 32 | .coverage 33 | .cache 34 | nosetests.xml 35 | coverage.xml 36 | 37 | # Translations 38 | *.mo 39 | 40 | # Mr Developer 41 | .mr.developer.cfg 42 | .project 43 | .pydevproject 44 | 45 | # Rope 46 | .ropeproject 47 | 48 | # Django stuff: 49 | *.log 50 | *.pot 51 | 52 | # Sphinx documentation 53 | docs/_build/ 54 | *.db 55 | *.0 56 | *.cPickle 57 | *.json 58 | 59 | .coveralls.yml 60 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | name='detdup', 5 | version='0.0.2', 6 | url='http://github.com/17zuoye/detdup/', 7 | license='MIT', 8 | author='David Chen', 9 | author_email=''.join(reversed("moc.liamg@emojvm")), 10 | description='Detect duplicated items.', 11 | long_description='Detect duplicated items.', 12 | packages=['detdup', 'detdup/data_model', 'detdup/features', 'detdup/services'], 13 | include_package_data=True, 14 | zip_safe=False, 15 | platforms='any', 16 | install_requires=[ 17 | 'etl_utils >= 0.1.7', 18 | 'peewee', 19 | 'pymongo', 20 | 'sqlitebck', 21 | 'mongomock', 22 | 'termcolor', 23 | 'model_cache >=0.0.9', 24 | ], 25 | classifiers=[ 26 | 'Intended Audience :: Developers', 27 | 'Operating System :: OS Independent', 28 | 'Programming Language :: Python', 29 | 'Topic :: Software Development :: Libraries :: Python Modules' 30 | ], 31 | ) 32 | 
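The files above already show the whole wiring: tests/task.py builds the `detdup_opts` configuration, tests/api.py turns it into a `DetDupApi`, and setup.py declares the runtime dependencies. A minimal end-to-end sketch, assuming the same `detdup_opts` dict and the record shape produced by `generate_record()` in tests/models.py:

```python
# Sketch only: mirrors the flow exercised by tests/test.py.
from detdup.services.task import DetDupTask
from detdup.services.api import DetDupApi

ddt = DetDupTask(detdup_opts)
ddt.extract()   # cold start: pull data and build the ModelCache plus feature trees
ddt.train()     # pre-deduplicate the whole corpus, writing detdup.json into cache_dir

dda = DetDupApi(detdup_opts)
dda.init_api()  # warm start: load the feature trees for online queries

record = {u"content": u"CoffeeScript programming language"}  # no _id, so a fake item id is created
print dda.detect_duplicated_items(record)                    # item_ids considered duplicates of the record
```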
--------------------------------------------------------------------------------
/tests/runtests.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # Copyright 2012-2015 Spotify AB
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 | 
18 | import warnings
19 | 
20 | import nose
21 | 
22 | if __name__ == '__main__':
23 |     with warnings.catch_warnings():
24 |         warnings.simplefilter("default")
25 |         warnings.filterwarnings(
26 |             "ignore",
27 |             message='(.*)outputs has no custom(.*)',
28 |             category=UserWarning
29 |         )
30 |         nose.main()
31 | 
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | [tox]
2 | envlist = py27
3 | skipsdist = True
4 | 
5 | [testenv]
6 | usedevelop = True
7 | deps=
8 |     coverage>=3.6,<3.999
9 |     coveralls
10 | setenv =
11 |     COVERAGE_PROCESS_START={toxinidir}/.coveragerc
12 |     FULL_COVERAGE=true
13 | commands =
14 |     python --version
15 |     pip install nose
16 |     coverage run tests/runtests.py -v {posargs:}
17 |     coverage combine
18 |     coveralls
19 | 
20 | [testenv:pep8]
21 | deps = pep8
22 | commands = pep8 --ignore E501,E402,E731 detdup tests
23 | 
24 | [testenv:autopep8]
25 | deps = autopep8
26 | commands = autopep8 --ignore E309,E501 -a -i -r detdup tests
27 | 
28 | [testenv:isort]
29 | deps = isort
30 | commands = isort -w 120 -rc detdup tests
31 | 
32 | [testenv:clean]
33 | commands=
34 |     coverage erase
35 | 
36 | [testenv:stats]
37 | commands=
38 |     coverage report
39 |     coverage html
40 | 
41 | 
42 | [testenv:docs]
43 | # Build documentation using sphinx.
44 | # Call this using `tox -e docs`.
45 | deps =
46 | commands =
47 | 
48 | 
49 | whitelist_externals =
50 |     cp
51 |     mv
52 |     sed
53 |     rm
54 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 | 
3 | Copyright (c) 2014 一起作业
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /tests/test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os, time, sys 4 | os.system("rm -rf tests/output/*") 5 | 6 | current_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) 7 | sys.path.insert(0, current_dir) 8 | 9 | from tests.models import * 10 | from tests.api import * 11 | 12 | import unittest 13 | class TestDetDup(unittest.TestCase): 14 | def test_detdup(self): 15 | # 1. 冷启动全流程 16 | ddt.extract() 17 | ddt.train() 18 | 19 | # 2. 热启动全流程 20 | print "\n"*80 21 | similar_ids = [i1.item_id for i1 in CleanModel.values() if i1.item_content == 'Python'] 22 | self.assertTrue(dda.is_all_duplicated(similar_ids)) 23 | print "\n"*80 24 | 25 | temp_record = generate_record("CoffeeScript") 26 | target_id = [i1.item_id for i1 in CleanModel.values() if i1.item_content == 'CoffeeScript'][0] 27 | result = dda.detect_duplicated_items(temp_record) 28 | self.assertTrue(str(target_id) in result, "Found duplicated CoffeeScript") 29 | assert len(result), 2 30 | 31 | assert dda.detect_duplicated_items(temp_record), 2 # 再排重一次 32 | dup_storage = CleanModel.fake_item_ids_store.storage 33 | self.assertEqual(dup_storage.select().count(), 2, "Find twice") 34 | self.assertEqual(dup_storage.select().where(dup_storage.is_deleted==True).count(), 1, "Find is_deleted only one, the last one will be deleted at the next time") 35 | 36 | if __name__ == '__main__': unittest.main() 37 | #import pdb; pdb.set_trace() 38 | -------------------------------------------------------------------------------- /USAGE.markdown: -------------------------------------------------------------------------------- 1 | Requirements 2 | ---------------------- 3 | 1. Install required python library from requirements.txt 4 | 2. storage: 1. sqlite library, 2. cPickle, 3. redis(optional) 5 | 6 | 项目架构流程 7 | ---------------------- 8 | 具体代码流程见 services/task.py 9 | 10 | ### extract 11 | 12 | * 为要处理的数据 继承DetDupDataModel类, 提供 多种数据特征 和 清洗后的文本内容, 具体见文档注释。 13 | * 并导入特征数据库。 14 | 15 | ### train 16 | 17 | * 去除特征数据库里没有同类的 18 | * 检测重复条目 19 | * 合并并打印结果列表 20 | * 检测召回率 21 | 22 | Usage 23 | ---------------------- 24 | 1. 接口服务见 detdup/services 25 | 2. 示例见 tests 26 | 27 | 文本相似度 性能数据 28 | ----------------------- 29 | 1. 文本相似度在 0.95时,排重几乎全是正确的, 重复元素有3199个, 组有1463个。 30 | 2. 
文本相似度在 0.90时,排重一点点错误,重复元素有3297个, 组有1507个。 31 | 32 | 相当于重复元素多了98个, 重复组多了44个, 重复[组]90-95之间多了 44 / 1463.0 = 3.0%, 重复元素90-100%元素约为 7.4%。 33 | 在文本相似度为90%时,误判率大概在 重复元素 19 / 3297.0 = 0.57%, 重复组在 9 / 1507.0 = 0.59%; 34 | 35 | 性能和总数以及重复元素总量成线性增长关系。 36 | 37 | 90万数据 38 | data_extract 13分钟, 8核 39 | data_index 11分钟, 1核 40 | data_index_remove_unneeded 1分钟, 1核 41 | data_detdup 5.5分钟, 8核 42 | 43 | 160万数据 44 | data_extract 26-32分钟, 8核,比上面慢的原因是90万数据是用SSD读的。 45 | data_index 25分钟, 1核 46 | data_index_remove_unneeded 1分钟, 1核 47 | data_detdup 5.75分钟, 8核 48 | 49 | 读取数据 编程接口 50 | ----------------------- 51 | ```txt 52 | >>> import json 53 | >>> data = json.load(open("detdup.json", "rb")) 54 | >>> data.result[0:3] 55 | [[a3c67f3da591b518cb535bd7, 76d6aeed4b31b569310db1a6], [e05f6e6da5aff02a81411342, 75a8e395b87ad910e0cef062], 56 | [75e7db33f06264d80c77b669, 99b6ef2b6a32d2f8317763fc, 770e993816f258edc7f3fe6b],] 57 | ``` 58 | -------------------------------------------------------------------------------- /tests/models.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | root_dir = os.path.dirname(os.path.abspath(__file__)) 5 | cache_dir = root_dir + '/output' 6 | 7 | import mongomock 8 | OriginalModel = mongomock.Connection().db.OriginalModel 9 | # 这里只有Python和JavaScript是重复的 10 | pls = (u"Ruby Rails Python Python Java JavaScript JavaScript CoffeeScript Julia" 11 | u"Juila Closure Scala Groovy Objective-C C C++ Perl PHP Haskell Erlang R").split(u" ") 12 | 13 | def generate_record(name, idx=None): 14 | d1 = {u"content" : name + u" programming language"} 15 | if idx: d1[u"subject_id"] = idx 16 | return d1 17 | 18 | for idx, pl in enumerate(pls): 19 | OriginalModel.insert(generate_record(pl, idx)) 20 | 21 | from detdup.data_model import DetDupDataModel 22 | from model_cache import ModelCache 23 | 24 | @ModelCache.connect(OriginalModel, storage_type='sqlite', \ 25 | cache_dir=cache_dir, \ 26 | included_class=DetDupDataModel) 27 | class CleanModel(): 28 | def init__load_data(self, record): 29 | if u"item_id" not in dir(self): self.item_id = record[u'_id'] 30 | self.item_id = unicode(self.item_id) 31 | self.item_content = record[u'content'].split(u" ")[0] 32 | 33 | self.desc = u" ".join(record[u'content'].split(u" ")[1:]) 34 | self.typename = "pl" 35 | 36 | from detdup.features.default import DefaultFeatures 37 | class PLFeature(DefaultFeatures): 38 | def post_init(self): 39 | self.typename = 'pl' 40 | 41 | self.custom_features = { 42 | 'desc' : str, 43 | } 44 | -------------------------------------------------------------------------------- /detdup/services/api.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from .task import * 4 | from ..data_model.fake_item_ids import FakeItemIds 5 | 6 | class DetDupApi(DetDupTask): 7 | 8 | def init_api(self, storage_type='disk'): 9 | self.items_model.bind_a_fake_item_ids_store() 10 | 11 | self.core = self.new_detdup_core(storage_type) 12 | self.core.should_reduce_items = False 13 | 14 | def is_all_duplicated(self, item_ids): 15 | """ Check item_ids is same to each other """ 16 | if item_ids <= 1: return True 17 | 18 | result = list() 19 | for item_id1 in item_ids: 20 | ids = self.core.detect_duplicated_items_verbose(item_id1) 21 | ids.remove(item_id1) 22 | result.extend(ids) 23 | return sorted(set(result)) == sorted(item_ids) 24 | 25 | def process_record(self, record): 26 | # 被动式清理 老的数据,在它的下一次数据请求前 27 | 
self.items_model.fake_item_ids_store.remove_all() 28 | 29 | # NOTE 这里就不能直接往 items.cPickleCache 里直接写了 30 | # 1. insert item 31 | item1 = self.items_model(record) 32 | 33 | # 参考 .task.py 34 | self.items_model.feed_data([item1]) # 提供给 #[] 使用 35 | 36 | # 2. append to indexes 37 | self.core.feed_items([item1], persist=False) 38 | 39 | return item1 40 | 41 | def query_item_features(self, record): 42 | item1 = self.process_record(record) 43 | 44 | return item1.inspect() 45 | 46 | 47 | def detect_duplicated_items(self, record): 48 | item1 = self.process_record(record) 49 | 50 | # 3. query duplicated 51 | result = self.core.detect_duplicated_items_verbose(item1.item_id) 52 | 53 | # 4. remove self 54 | result.remove(item1.item_id) 55 | 56 | return result 57 | -------------------------------------------------------------------------------- /README.markdown: -------------------------------------------------------------------------------- 1 | DetDup 2 | ====================== 3 | [![Build Status](https://img.shields.io/travis/17zuoye/detdup/master.svg?style=flat)](https://travis-ci.org/17zuoye/detdup) 4 | [![Coverage Status](https://coveralls.io/repos/17zuoye/detdup/badge.svg)](https://coveralls.io/r/17zuoye/detdup) 5 | [![Health](https://landscape.io/github/17zuoye/detdup/master/landscape.svg?style=flat)](https://landscape.io/github/17zuoye/detdup/master) 6 | [![Download](https://img.shields.io/pypi/dm/detdup.svg?style=flat)](https://pypi.python.org/pypi/detdup) 7 | [![License](https://img.shields.io/pypi/l/detdup.svg?style=flat)](https://pypi.python.org/pypi/detdup) 8 | 9 | Detect duplicated items. 内容排重框架。 10 | 11 | Usage 12 | ---------------------- 13 | 见 USAGE.markdown 14 | 15 | 演讲稿 16 | ------------- 17 | https://speakerdeck.com/mvj3/detdup 18 | 19 | 内容排重功能列表 20 | ---------------------- 21 | 1. 返回 重复题目列表。 22 | 2. 发送题目ID,服务器端载入对应题库到内存中,查找和该项重复的条目,并返回题目ID列表。 23 | 24 | 常见内容重复特征 25 | ---------------------- 26 | 1. [长度] 基本相似或相等, 两者长度的平方根相差不超过1。 27 | 2. [重复] 在任意位置, 多个逗号, 空格, s字符等。 28 | 3. [同义] 全角半角编码。分隔符号不同。am, 'm。 29 | 4. [顺序] 内部句子位子换了,比如从连线题里抽取的数据 30 | 31 | 导致不能使用基于分词的倒排索引。 32 | 33 | 召回率 和 正确率 34 | ---------------------- 35 | 召回率: 如果特征抽取不是太准确的话,会导致有些groups漏了一两个。 36 | 正确率: 几乎100%的,因为是按原文本相似度算的。 37 | 38 | DetDup 和 simhash, shingling 的关系。 39 | ---------------------- 40 | 1. 其中 1 和 2 的功能 类似与simhash里 把文本变短为01序列的局部敏感hash 以及分块快速检索比较。 41 | simhash不利于 题库排重的原因见 #参考文献# , 这边几十个字符占很大比例, simhash适合于大网页的排重, 42 | 而且simhash调hash参数应该比较繁琐和难以解释。 43 | 2. 3 类似于 shingling, 区别是 shingling 用的是分词, 这边直接比较全部字符。 44 | 以兼容类似 `_range` 和 `orange` 的比较。 45 | 46 | 文本相似度定义 47 | ---------------------- 48 | 把两段文本共同的字母都取出来 除以 两者文本的总长度得出的比率。比如 of 和 off 的文本相似度为 4 / 5 = 80% 49 | 50 | 参考文献 51 | ----------------------- 52 | [海量数据相似度计算之simhash和海明距离](http://www.lanceyan.com/tech/arch/simhash_hamming_distance_similarity.html) 53 | 54 | ```txt 55 | 2、通过大量测试,simhash用于比较大文本,比如500字以上效果都还蛮好,距离小于3的基本都是相似,误判率也比较低。但是如果我们处理的是微博信息,最多也就140个字,使用simhash的效果并不那么理想。看如下图,在距离为3时是一个比较折中的点,在距离为10时效果已经很差了,不过我们测试短文本很多看起来相似的距离确实为10。如果使用距离为3,短文本大量重复信息不会被过滤,如果使用距离为10,长文本的错误率也非常高,如何解决? 56 | ``` 57 | -------------------------------------------------------------------------------- /detdup/data_model/base.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from etl_utils import StringUtils, UnicodeUtils, cached_property 4 | from bson.objectid import ObjectId 5 | import os, math, json 6 | from .fake_item_ids import FakeItemIds 7 | 8 | class DetDupDataModel(object): 9 | """ 10 | Usage: 11 | 1. 
implement #init__load_data 12 | 2. DetDupDataModel.feed_data, #[] 13 | """ 14 | 15 | core = None 16 | fake_item_ids_store = None 17 | 18 | @classmethod 19 | def bind_a_fake_item_ids_store(cls): 20 | FakeItemIds(cls) 21 | 22 | def check_valid_id_in_record(self, record): return u'_id' in record 23 | 24 | def init__before(self, record): 25 | # ensure_an_item_id_or_fake 26 | if not self.check_valid_id_in_record(record): 27 | self.item_id = unicode(ObjectId()) 28 | self.fake_item_ids_store.insert(self.item_id, self.dump_record(record)) 29 | 30 | def init__after(self, record): 31 | # 会把全角都转换为半角 32 | self.item_content = UnicodeUtils.stringQ2B(self.item_content) 33 | 34 | # common extract data part 35 | info1 = StringUtils.frequence_chars_info(self.item_content, lambda len1 : len1 * 0.75) 36 | self.uniq_chars__len = info1['uniq_chars__len'] 37 | self.sorted_freq_chars = info1['sorted_freq_chars'] 38 | self.sqrt_chars__len = int(round(math.sqrt(len(self.item_content)))) 39 | 40 | def inspect(self): 41 | info = [] 42 | for col1 in type(self).attr_columns(): 43 | if not hasattr(self, col1): continue # 是别的feature才有的 44 | val1 = getattr(self, col1) 45 | if isinstance(val1, unicode): val1 = val1.encode("UTF-8") 46 | info.append(' '.join([col1.rjust(24, ' '), ":", str(val1), ';'])) 47 | if type(val1) in [str, unicode]: 48 | info.append(' '.join(["".rjust(26, ' '), str(len(val1))])) 49 | info.append("\n") 50 | print "\n".join(info) 51 | return info 52 | 53 | @classmethod 54 | def attr_columns(cls): 55 | """ 枚举 ETL 出来的所有字段 """ 56 | 57 | _attr_columns = [] 58 | for feature1 in cls.core.features: 59 | _attr_columns.extend(feature1.table_columns()) 60 | _attr_columns = list(set(_attr_columns)) 61 | _attr_columns.remove('item_id') 62 | _attr_columns.insert(0, 'item_id') 63 | _attr_columns.append('item_content') 64 | return _attr_columns 65 | -------------------------------------------------------------------------------- /detdup/data_model/fake_item_ids.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from peewee import SqliteDatabase, Model, CharField, BooleanField, TextField, TimeField 4 | import os 5 | import datetime 6 | 7 | 8 | class FakeItemIds(object): 9 | """ 10 | Manage fake item ids and their cache and index. 11 | 12 | Client send item with no item id, so we need to make a fake item id for it, cause index db need one. 13 | 14 | TODO: while just not delete it after query immediately, not until the next query coming. 15 | """ 16 | 17 | def __init__(self, data_model): 18 | self.data_model = data_model 19 | self.data_model.fake_item_ids_store = self 20 | 21 | assert self.data_model.cache_dir, "FakeItemIds need cache_dir from data_model!" 
22 | sqlite_path = os.path.join(self.data_model.cache_dir, "fake_item_ids_store.db") 23 | 24 | sqlite_database = SqliteDatabase(sqlite_path, check_same_thread=False) 25 | 26 | class FakeItemIdsStore(Model): 27 | is_deleted = BooleanField(default=False) # mark processed or duplicated items 28 | item_id = CharField() 29 | item_content_json = TextField() 30 | created_at = TimeField(default=datetime.datetime.now) 31 | 32 | class Meta: 33 | database = sqlite_database 34 | self.storage = FakeItemIdsStore 35 | 36 | if not self.storage.table_exists(): 37 | self.storage.create_table() 38 | sqlite_database.create_index(self.storage, "is_deleted item_id".split(" ")) 39 | 40 | def insert(self, item_id, item_content_json=None): 41 | self.storage.create(item_id=item_id, 42 | item_content_json=item_content_json) 43 | 44 | def remove(self, item_id): 45 | print "[删除]", item_id 46 | # 1. 软删除 fake_item_ids_store 记录 47 | self.storage.update(is_deleted=True).where(self.storage.item_id == str(item_id)).execute() 48 | 49 | # 2. 从feature索引中删除 50 | item1 = self.data_model.get(item_id, None) 51 | if item1 is None: 52 | return False # compact with unkonw error TODO 20150430 53 | table = self.data_model.core.select_feature(item1).features_tree 54 | table.delete().where(table.item_id == str(item_id)).execute() 55 | 56 | # 3. 从items_model中删除 57 | del self.data_model[item_id] 58 | return True 59 | 60 | def remove_all(self): 61 | delete_scope = self.storage.select().where( 62 | self.storage.is_deleted == eval("False")) # compact with PEP E712 63 | for i1 in delete_scope: 64 | self.remove(i1.item_id) 65 | -------------------------------------------------------------------------------- /keynote.markdown: -------------------------------------------------------------------------------- 1 | 标题 100 pt 2 | 次标题 80 pt 3 | 脚注 36 pt 4 | 列表 50 pt 5 | 列表缩进 15 pt 6 | 列表大字 30, 18 7 | 8 | # DetDup 9 | Detect duplicated items. 10 | 11 | 重复示意图 12 | 13 | # Agenda 14 | 1. "重复内容"的定义 15 | 2. 两两比较复杂度 16 | 3. 相似性算法挑选 17 | 4. 软件工程架构和优化 18 | 19 | # "重复内容"的定义 20 | 1. [长度] 基本相似或相等, 两者长度的平方根相差不超过1。 21 | 2. [重复] 在任意位置, 多个逗号, 空格, s字符等。 22 | 3. [同义] 全角半角编码。分隔符号不同。am, 'm。 23 | 4. [顺序] 内部句子位子换了,比如从连线题里抽取的数据。 24 | 25 | 原始字符 VS 分词: 文本越小,分词效果的差异越大。 26 | 27 | # 相似性算法挑选 28 | AGoodnightGoodmorning勾选 29 | AGoodnightGoodmorning圈选 30 | 31 | 分词 10/12 # => 83.33% 32 | unicode 44/46 # => 95.65% 33 | 34 | # "两两比较"时间复杂度 35 | 一个朴素的问题 36 | 37 | n log n 38 | 39 | # simhash 40 | 选择的特征都很大 41 | TODO 42 | 43 | 44 | shingle(瓦), 4-grams 45 | { (a,rose,is,a), (rose,is,a,rose), (is,a,rose,is), (a,rose,is,a), (rose,is,a,rose) } = { (a,rose,is,a), (rose,is,a,rose), (is,a,rose,is) } 46 | 47 | 48 | # 软件架构 49 | 50 | Task API 51 | 52 | Core 53 | 54 | ModelCache Features-Trees 55 | 56 | 57 | Task = [`extract`, `train`] 58 | API = [`is_all_duplicated`, `process_record`, `query_item_features`, `detect_duplicated_items`] 59 | Core = 管理Features-Trees 60 | 61 | # 配置特征 62 | 通用 63 | `uniq_chars__len 64 | sqrt_chars__len 65 | sorted_freq_chars` 66 | 67 | 业务 68 | `options_uniq_chars__len 69 | options_sorted_freq_chars 70 | options__len 71 | ...` 72 | 73 | from detdup.features.default import DefaultFeatures 74 | class PLFeature(DefaultFeatures): 75 | """ programming language """ 76 | def post_init(self): 77 | # 在特征数据库级别划分 78 | self.typename = 'pl' 79 | 80 | self.custom_features = { 81 | 'desc' : str, 82 | } 83 | 84 | # 数据准备 85 | 操作 extract => build features-trees and model-cache 86 | 存储 cPickle sqlite and ModelCache 87 | 88 | # 预先排重 89 | 1. 
选出需要排重的item-ids 90 | `SELECT 91 | t1."sorted_freq_chars", group_concat(t1."item_id") AS item_ids 92 | FROM 93 | "DefaultFeaturesTree" AS t1 94 | GROUP BY 95 | t1."sorted_freq_chars` 96 | 97 | 2. 给每一个item划分排重域 98 | `SELECT 99 | t1."id", t1."uniq_chars__len", t1."sqrt_chars__len", t1."sorted_freq_chars", t1."item_id", t1."desc" 100 | FROM 101 | "Desc" AS t1 102 | WHERE 103 | (((((t1."uniq_chars__len" >= 3)) AND (t1."uniq_chars__len" <= 9)) AND (t1."sorted_freq_chars" = 'hn')) AND (t1."desc" = 'programming language')) 104 | 105 | 3. 排重缓存。 106 | 107 | item1 => [item1, item2, item3] 108 | item2 => 缓存命中(ItemsGroupAndIndexes) 109 | 110 | # 实时排重 111 | 112 | 放入排重特征库中比对 113 | 1. 临时(FakeItemIds) 114 | 2. 永久 115 | 116 | # 软件工程优化 117 | 1. 多进程数据清洗 118 | 2. sqlitebck 内存磁盘相互拷贝 119 | 3. 动态定义特征数据库表 120 | 121 | # 性能数据 122 | ----------------------- 123 | 1. 文本相似度在 0.95时,排重几乎全是正确的, 重复元素有3199个, 组有1463个。 124 | 2. 文本相似度在 0.90时,排重一点点错误,重复元素有3297个, 组有1507个。 125 | 126 | 相当于重复元素多了98个, 重复组多了44个, 重复[组]90-95之间多了 44 / 1463.0 = 3.0%, 重复元素90-100%元素约为 7.4%。 127 | 在文本相似度为90%时,误判率大概在 重复元素 19 / 3297.0 = 0.57%, 重复组在 9 / 1507.0 = 0.59%; 128 | 129 | 性能和总数以及重复元素总量成线性增长关系。 130 | 131 | # 其他开源项目 132 | 133 | fill_broken_words model_cache phrase_recognizer tfidf article_segment region_unit_recognizer compare_word etl_utils split_block 134 | 135 | 136 | 137 | http://weibo.com/1665335994/Alp0uAOL9?type=comment&sudaref=www.aszxqw.com 138 | http://dl.acm.org/citation.cfm?id=509965 139 | http://www.lanceyan.com/tech/arch/simhash_hamming_distance_similarity.html 140 | http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.78.7794&rep=rep1&type=pdf 141 | simhash与Google的网页去重 http://leoncom.org/?p=650607 对比较域进行优化,没看懂。 142 | http://www.aszxqw.com/work/2014/01/30/simhash-shi-xian-xiang-jie.html simhash算法原理及实现 143 | -------------------------------------------------------------------------------- /detdup/services/task.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # TODO 多进程 有木桶最短效应。但是目前没法解决变量共享。 4 | 5 | import json 6 | import math 7 | import time 8 | 9 | from ..utils import * 10 | from ..core import DetDupCore 11 | 12 | class DetDupTask(object): 13 | default_attrs = [ 14 | "process_count", 15 | 16 | "cache_dir", 17 | 18 | "original_model", 19 | "items_model", 20 | 21 | "features", 22 | 23 | "query_check_columns", 24 | ] 25 | 26 | def __init__(self, opts): 27 | for key1 in DetDupTask.default_attrs: 28 | setattr(self, key1, opts.get(key1, None)) 29 | self.process_count = self.process_count or max_process_count 30 | 31 | self.items_model = opts['items_model'] 32 | self.items_model.cache_dir = self.cache_dir 33 | self.items_model.datadict_type = "sqlite" 34 | 35 | self.result_cPickle = os.path.join(self.cache_dir, "detdup.cPickle") 36 | self.result_json = os.path.join(self.cache_dir, "detdup.json") 37 | 38 | def new_detdup_core(self, storage_type='memory'): 39 | """ new queryinterface """ 40 | # 每个instance只能被generator, 否则这些进程同时访问一个IO, 那么所有CPU就都处于等待IO中不动了 41 | 42 | detdup = DetDupCore(self.cache_dir, self.items_model) 43 | 44 | detdup.is_inspect_detail = False 45 | 46 | detdup.storage_type = storage_type 47 | 48 | for feature1 in self.features: 49 | detdup.plug_features(feature1()) 50 | 51 | # 确保不会覆盖 .original.db 52 | if (detdup.storage_type == 'memory') and detdup.feeded(): 53 | detdup.load_features_from_db() 54 | 55 | self.items_model.core = detdup 56 | 57 | return detdup 58 | 59 | def extract(self): 60 | self.items_model.pull_data() 61 | 62 | """ 重新生成请全部删除 model.db 和 
features.db 文件 """ 63 | cprint("[建立 self.items_model 索引] ...", "blue") 64 | core = self.new_detdup_core() 65 | 66 | tmp_items = [] 67 | def write(tmp_items): 68 | core.feed_items(tmp_items) 69 | return [] 70 | 71 | for item_id1, item1 in self.items_model.iteritems(): 72 | tmp_items.append(item1) 73 | if len(tmp_items) >= 10000: 74 | tmp_items = write(tmp_items) 75 | tmp_items = write(tmp_items) 76 | 77 | def train(self): 78 | core = self.new_detdup_core() 79 | 80 | def delete_item_ids(table, item_ids_1): 81 | step = 100 82 | for i1 in xrange(0, len(item_ids_1), step): 83 | table.delete().where(table.item_id << item_ids_1[i1:i1+step]).execute() 84 | 85 | pickle_filename = os.path.join(self.cache_dir, "detdup.cPickle") 86 | def load_result_func(): 87 | core = self.new_detdup_core('memory') 88 | 89 | for feature1 in core.features: 90 | table = feature1.features_tree 91 | 92 | # 1. 先排除一定是不重复的 93 | candidate_list, uniq_list = feature1.divided_into_two_parts() 94 | 95 | delete_item_ids(table, uniq_list) 96 | 97 | # 2. 删除内容空白的条目 98 | table.delete().where(table.uniq_chars__len == 0).execute() 99 | 100 | core.candidate_dup_count = table.select().count() 101 | 102 | # 3. 正式排重 103 | for item1 in process_notifier(feature1.features_tree.select()): 104 | dup_ids = core.detect_duplicated_items_verbose(item1.item_id, verbose=True) 105 | delete_item_ids(table, dup_ids) 106 | 107 | return core.result 108 | result = cpickle_cache(pickle_filename, load_result_func) 109 | json.dump(result.result_json(), open(self.result_json, 'wb')) 110 | 111 | # def data_check(self): 112 | """ 验证正确率,随机找个重复的,遍历全库比对。 """ 113 | """ 114 | from random import randrange 115 | from etl_utils import String 116 | cases_count = 10 117 | 118 | recall_program_count = 0 119 | recall_real_count = float(cases_count) 120 | 121 | for idx in xrange(cases_count): 122 | result = cpickle_cache(self.result_cPickle, lambda : True) 123 | core = self.new_detdup_core('memory') 124 | 125 | total_count = len(result.result) 126 | if not total_count: 127 | print 'NO DUPLICATION FOUND!' 
128 | return 129 | program_similar_ids = result.result[randrange(total_count)] 130 | 131 | # 随机抽取的一个item 132 | current_item = self.items_model[program_similar_ids[0]] 133 | 134 | print "Begin to find %s's similar item_ids" % current_item.item_id 135 | real_similar_ids = set([current_item.item_id]) 136 | program_similar_ids = set(program_similar_ids) 137 | 138 | table = core.select_feature(current_item).features_tree 139 | basic_query = table.item_id != str(current_item.item_id) 140 | for column1 in self.query_check_columns: 141 | basic_query = basic_query & (getattr(table, column1) == getattr(current_item, column1)) 142 | scope = table.select().where(basic_query) #.dicts() 143 | 144 | for i1 in process_notifier(scope): 145 | rate1 = String.calculate_text_similarity(current_item.item_content, self.items_model[i1.item_id].item_content)['similarity_rate'] 146 | if rate1 > core.similarity_rate: 147 | real_similar_ids.add(i1.item_id) 148 | print "real_similar_ids :", sorted([str(i1) for i1 in real_similar_ids]) 149 | print "program_similar_ids:", sorted([str(i1) for i1 in program_similar_ids]) 150 | print 151 | if real_similar_ids == program_similar_ids: 152 | recall_program_count += 1 153 | print "recall :", recall_program_count / recall_real_count 154 | """ 155 | -------------------------------------------------------------------------------- /detdup/core.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from .utils import * 4 | 5 | from etl_utils import String, Speed, BufferLogger, ItemsGroupAndIndexes 6 | 7 | # TODO 检测 item.typename() 存在 8 | 9 | from .features import DefaultFeatures 10 | 11 | class DetDupCore(object): 12 | """ 13 | Detect duplicated items, use decision tree. 14 | 15 | Usage: 16 | ----------- 17 | """ 18 | 19 | similarity_rate = 0.90 20 | 21 | def __init__(self, features_dir, detdup_data_model): 22 | self.features_dir = features_dir 23 | 24 | self.model = detdup_data_model 25 | 26 | self.features = [DefaultFeatures()] 27 | self.features_map = dict() 28 | 29 | self.storage_type = ['memory', 'disk'][0] 30 | 31 | self.is_logger = True 32 | self.is_inspect_detail = False 33 | self.buffer_logger = BufferLogger(os.path.join(self.features_dir, 'process.log')) 34 | 35 | self.result = ItemsGroupAndIndexes() 36 | self.count = 0 37 | 38 | self.candidate_dup_count = None 39 | 40 | def select_feature(self, item1): 41 | f1 = item1.typename 42 | if not isinstance(f1, str) and not isinstance(f1, unicode): f1 = f1() 43 | return self.features_map[f1].insert_item(item1) 44 | 45 | def feeded(self): 46 | for feature1 in self.features: 47 | # 这个Feature是否有效 48 | if not feature1.link_to_detdup: 49 | continue 50 | # 之前已经导出数据库啦?! 51 | if os.path.exists(feature1.sqlite3db_path()): 52 | return True 53 | return False 54 | 55 | def load_features_from_db(self): 56 | for feature1 in self.features: feature1.load_features_tree() 57 | 58 | def dump_features_from_memory(self): 59 | for feature1 in self.features: feature1.dump_features_tree() 60 | 61 | def feed_items(self, obj, persist=True): 62 | """ Feed items to features """ 63 | # 1. insert it into memory 64 | [self.select_feature(item1).feed_item() for item1 in process_notifier(obj)] 65 | # 2. backup into files fully! 66 | if persist: 67 | self.dump_features_from_memory() 68 | return self 69 | 70 | def plug_features(self, features1): 71 | """ 72 | 1. Plug features, and bind typename to classify items 73 | 2. 
init features tree, memory or disk 74 | """ 75 | if not isinstance(features1, list): features1 = [features1] 76 | self.features.extend(features1) 77 | for f1 in self.features: 78 | f1.link_to_detdup = self 79 | f1.build_features_tree() 80 | 81 | for f1 in self.features: 82 | self.features_map[f1.typename] = f1 83 | return self 84 | 85 | time_sql = 0 86 | time_calculate_text_similarity = 0 87 | time_fetch_content = 0 88 | 89 | def detect_duplicated_items(self, item1): 90 | feature1 = self.select_feature(item1) 91 | speed = Speed() 92 | 93 | t1 = datetime.now() 94 | item_ids = feature1.fetch_matched_item_ids() 95 | t2 = datetime.now(); self.time_sql += (t2 - t1).total_seconds(); 96 | 97 | # 4. 看看题目相似度 98 | # 相似度得大于 95% 99 | new_ids = list() 100 | for item_id1 in item_ids: 101 | # 2. 排除自己 102 | if item_id1 == unicode(item1.item_id): continue 103 | 104 | if item_id1 not in self.model: 105 | # 删除不一致数据, 以在self.model里为准 106 | feature1.delete_item_ids([item_id1]) 107 | continue 108 | 109 | t11 = datetime.now() 110 | content1 = self.model[item_id1].item_content 111 | t12 = datetime.now(); self.time_fetch_content += (t12 - t11).total_seconds(); 112 | 113 | t11 = datetime.now() 114 | res1 = String.calculate_text_similarity(item1.item_content, 115 | content1, 116 | inspect=True, 117 | skip_special_chars=True, 118 | similar_rate_baseline=self.similarity_rate) 119 | t12 = datetime.now(); self.time_calculate_text_similarity += (t12 - t11).total_seconds(); 120 | 121 | if res1['similarity_rate'] > self.similarity_rate: 122 | new_ids.append(item_id1) 123 | self.buffer_logger.append(res1['info']) 124 | self.buffer_logger.inspect() 125 | print "字符串相似度 [前]", (len(item_ids) - 1), "个,[后]", len(new_ids), "个" 126 | 127 | item_ids = new_ids 128 | 129 | # 如果要排除已处理过为排重的 130 | speed.tick().inspect() 131 | 132 | print "self.time_sql", self.time_sql 133 | print "self.time_calculate_text_similarity", self.time_calculate_text_similarity 134 | print "self.time_fetch_content", self.time_fetch_content 135 | 136 | return item_ids 137 | 138 | def detect_duplicated_items_verbose(self, item_id1, verbose=False): 139 | self.count += 1 140 | print "\n"*5, "从", self.candidate_dup_count, "个候选题目中 排重第", self.count, "个题目。", item_id1 141 | 142 | # 如果结果已经计算出来 143 | if self.result.exists(item_id1): 144 | return self.result.find(item_id1) 145 | 146 | self.buffer_logger.append("-"*80) 147 | self.buffer_logger.append("要处理的记录") 148 | 149 | item1 = self.model[item_id1] 150 | if verbose: item1.inspect() 151 | 152 | self.buffer_logger.append("") 153 | item_ids = self.detect_duplicated_items(item1) 154 | self.buffer_logger.append("疑似和", item1.item_id, "重复的条目有", len(item_ids), "个") 155 | for item_id1 in item_ids: 156 | if verbose: self.model[item_id1].inspect() 157 | self.buffer_logger.append("") 158 | 159 | # 输出日志 160 | if (len(item_ids) > 0) and self.is_logger: 161 | self.buffer_logger.inspect() 162 | else: 163 | self.buffer_logger.clear() 164 | 165 | item_ids.append(unicode(item1.item_id)) 166 | 167 | # 有重复结果,就存储一下 168 | if len(item_ids) > 1: 169 | self.result.add([i1 for i1 in item_ids]) 170 | 171 | return item_ids 172 | -------------------------------------------------------------------------------- /detdup/features/default.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from etl_utils import cpickle_cache, process_notifier 4 | import os 5 | import sqlitebck 6 | import sqlite3 7 | import shutil 8 | from peewee import fn as peewee_fn 9 | 10 | class DefaultFeatures(object): 
11 | """ 12 | Default processing features. 13 | 14 | Notice: 15 | 1. process only one current_item at a time. 16 | 17 | Inteface: 18 | 1. post_init: add your self.custom_features and self.typename 19 | 20 | Regenerate cache: 21 | 1. delete *.db and *.idnamecache 22 | 23 | """ 24 | 25 | def __init__(self): 26 | self.link_to_detdup = None 27 | 28 | self.current_item = None 29 | self.typename = 'default' 30 | 31 | self.default_features = { 32 | "uniq_chars__len" : int, 33 | "sqrt_chars__len" : int, 34 | "sorted_freq_chars" : str, 35 | # TODO sorted_mid_freq_chars 36 | } 37 | self.custom_features = dict() 38 | self.features_tree = None 39 | self.uniq_chars__len = 3 40 | 41 | # private 42 | self.columns = [] 43 | 44 | self.post_init() 45 | 46 | def post_init(self): 47 | """ 48 | e.g. 49 | 50 | self.typename = 'choices' 51 | self.custom_features = { 52 | 'answers_length': int, 53 | 'sorted_answers_hash': str 54 | } 55 | """ 56 | pass 57 | 58 | def table_columns(self): 59 | if self.columns: return self.columns 60 | self.columns = self.features_tree._meta.get_field_names() 61 | self.columns.remove('id') 62 | return self.columns 63 | 64 | def insert_item(self, q1): 65 | self.current_item = q1 66 | return self 67 | 68 | def feed_item(self, item1=None): 69 | self.current_item = item1 or self.current_item 70 | # query item_content through self.features_tree is slow, so add build a hash index directly. 71 | data_json = {k1: getattr(self.current_item, k1) for k1 in self.table_columns()} 72 | self.features_tree.create(**data_json) 73 | 74 | def delete_item_ids(self, oids): 75 | self.features_tree.delete().where(self.features_tree.item_id << oids).execute() 76 | 77 | def fetch_matched_item_ids(self): 78 | """ 79 | 如果不是一个一个从DefaultFeatures遍历查,然后筛选,那么就浪费了每次SQL查询的时间。 80 | 81 | 因为有范围查找的原因,所以不能直接在特征数据库GroupBy基础上直接再排重,还是得一个一个排重。 82 | """ 83 | item1 = self.current_item 84 | 85 | sqrt_chars__len_range_query_left = self.features_tree.sqrt_chars__len >= (item1.sqrt_chars__len - 1) 86 | sqrt_chars__len_range_query_right = self.features_tree.sqrt_chars__len <= (item1.sqrt_chars__len + 1) 87 | 88 | # uniq_chars__len range query 89 | # NOTE 不用对数 是因为已经唯一过了。 90 | uniq_chars__len_range_query_left = self.features_tree.uniq_chars__len >= (item1.uniq_chars__len - self.uniq_chars__len) 91 | uniq_chars__len_range_query_right = self.features_tree.uniq_chars__len <= (item1.uniq_chars__len + self.uniq_chars__len) 92 | 93 | # sorted_freq_chars equal query 94 | sorted_freq_chars = self.features_tree.sorted_freq_chars == item1.sorted_freq_chars 95 | 96 | default_query = uniq_chars__len_range_query_left & uniq_chars__len_range_query_right & sorted_freq_chars 97 | 98 | # extend query 99 | for feature_k1 in self.custom_features: 100 | # support custom int attribute query 101 | if isinstance(self.custom_features[feature_k1], int): 102 | feature_v1 = self.custom_features[feature_k1] 103 | delta_query1 = (getattr(self.features_tree, feature_k1) >= (getattr(item1, feature_k1) - feature_v1)) 104 | delta_query2 = (getattr(self.features_tree, feature_k1) <= (getattr(item1, feature_k1) + feature_v1)) 105 | delta_query = delta_query1 & delta_query2 106 | else: # str 107 | delta_query = (getattr(self.features_tree, feature_k1) == getattr(item1, feature_k1)) 108 | default_query = default_query & delta_query 109 | 110 | ffs = self.features_tree.select().where(default_query) 111 | 112 | return [f1.item_id for f1 in ffs] 113 | 114 | def build_features_tree(self): 115 | from peewee import SqliteDatabase, Model, IntegerField, CharField, BooleanField 
116 | 117 | # built or connect database 118 | sqlite_path = { 119 | "memory" : ":memory:", 120 | "disk" : self.sqlite3db_path(), 121 | }[self.link_to_detdup.storage_type] 122 | sqlite_database = SqliteDatabase(sqlite_path, check_same_thread=False) 123 | 124 | class BaseFeaturesTree(Model): 125 | uniq_chars__len = IntegerField(default=0) 126 | sqrt_chars__len = IntegerField(default=0) 127 | sorted_freq_chars = CharField() 128 | # TODO support item_id as int or str type 129 | item_id = CharField() 130 | 131 | class Meta: 132 | database = sqlite_database 133 | self.features_tree = BaseFeaturesTree 134 | 135 | tablename = "_".join(self.custom_features).capitalize() or "DefaultFeaturesTree" 136 | 137 | # If customize more features 138 | if self.custom_features: 139 | self.features_tree = type(tablename, (BaseFeaturesTree,), dict()) 140 | for feature_k1 in self.custom_features: 141 | # http://stackoverflow.com/questions/22358489/dynamically-define-fields-in-a-peewee-model 142 | feature_v1 = self.custom_features[feature_k1] 143 | # Compact with (int) instance 144 | if type(feature_v1) is int: feature_v1 = int 145 | field1 = {int: IntegerField, str: CharField}[feature_v1]() 146 | field1.add_to_class(self.features_tree, feature_k1) 147 | 148 | self.features_tree._meta.db_table = tablename 149 | 150 | # create table and indexes 151 | if not self.features_tree.table_exists(): 152 | self.features_tree.create_table() 153 | sqlite_database.create_index(self.features_tree, "item_id".split(" ")) 154 | 155 | # TODO 让大str在前面,加快索引搜索速度 156 | index_columns = self.default_features.keys() + self.custom_features.keys() 157 | sqlite_database.create_index(self.features_tree, index_columns) 158 | 159 | print "[build_features_tree]", self.features_tree, "self.default_features :", self.default_features, "self.custom_features :", self.custom_features 160 | print 161 | 162 | def dump_features_tree(self): self.copy_features_tree('memory_to_file') 163 | def load_features_tree(self): self.copy_features_tree('file_to_memory') 164 | def copy_features_tree(self, schema='memory_to_file'): 165 | # TODO reduce file disk size 166 | if not self.link_to_detdup: return False 167 | # 1. 
copy database 168 | backup_conn = sqlite3.connect(self.sqlite3db_path()) 169 | current_conn = self.features_tree._meta.database.get_conn() 170 | if schema == 'memory_to_file': _from = current_conn; _to = backup_conn 171 | if schema == 'file_to_memory': _from = backup_conn ; _to = current_conn 172 | if not (_from or _to): raise Exception("schema don't match!") 173 | sqlitebck.copy(_from, _to) 174 | backup_conn.close() 175 | 176 | print "loaded %s by %s" % (self.sqlite3db_path(), schema) 177 | 178 | def sqlite3db_path(self): return os.path.join(self.link_to_detdup.features_dir, self.typename + '.db') 179 | 180 | def divided_into_two_parts(self): 181 | """ 重复毕竟是小部分, 所以_set只存可能是重复的ID列表 """ 182 | candidate_set = set([]) 183 | uniq_set = set([]) 184 | 185 | feature1_dict = dict(self.default_features.items() + self.custom_features.items()) 186 | 187 | # group by 不支持范围,比如整数范围查询 188 | group_by_columns = [f1 for f1 in feature1_dict if feature1_dict[f1] == str] 189 | 190 | if group_by_columns: 191 | table = self.features_tree 192 | group_by_query = [getattr(table, f1) for f1 in group_by_columns] 193 | group_concat = [peewee_fn.group_concat(table.item_id).alias('item_ids')] 194 | 195 | group_by_sql = table.select(*(group_concat)).group_by(*group_by_query) 196 | for i1 in process_notifier(group_by_sql): 197 | items_len = len(i1.item_ids) 198 | if items_len > 24: 199 | candidate_set = candidate_set | set(i1.item_ids.split(",")) 200 | elif items_len == 24: # 只有一个object_id 201 | uniq_set.add(i1.item_ids) 202 | else: 203 | raise Exception("item_ids is invalid") 204 | else: 205 | print feature1, "has none features" 206 | 207 | return (list(candidate_set), list(uniq_set)) 208 | --------------------------------------------------------------------------------
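A closing note on the similarity gate used throughout detdup/core.py (`similarity_rate = 0.90`): README.markdown defines text similarity as the characters shared by two texts divided by their combined length, e.g. `of` vs `off` gives 4 / 5 = 80%. The real computation is `String.calculate_text_similarity` from etl_utils (with `skip_special_chars` and a similarity baseline); the sketch below only illustrates the definition:

```python
# -*- coding: utf-8 -*-
# Illustration of the README.markdown similarity definition, not the etl_utils implementation.
from collections import Counter

def naive_similarity_rate(text_a, text_b):
    shared = Counter(text_a) & Counter(text_b)    # per-character overlap (multiset intersection)
    common_len = 2 * sum(shared.values())         # shared characters counted in both texts
    return common_len / float(len(text_a) + len(text_b))

assert abs(naive_similarity_rate(u"of", u"off") - 0.8) < 1e-9  # the README's of/off example: 4 / 5 = 80%
```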