├── .idea ├── .name ├── scopes │ └── scope_settings.xml ├── encodings.xml ├── modules.xml ├── check_file_system.iml ├── vcs.xml └── misc.xml ├── app ├── crawler.py ├── run.py ├── views.py ├── __init__.py ├── static │ └── uploads │ │ ├── woai4.txt │ │ ├── woai5.txt │ │ └── woai6.txt ├── config.ini ├── templates │ ├── upload.html │ └── results.html ├── read_config.py ├── get_key_word.py ├── file_sim_hash.py ├── db.py └── simhash.py ├── testunit ├── __init__.py └── file_test.py └── README.mdgit /.idea/.name: -------------------------------------------------------------------------------- 1 | check_file_system -------------------------------------------------------------------------------- /app/crawler.py: -------------------------------------------------------------------------------- 1 | # coding=utf8 2 | -------------------------------------------------------------------------------- /testunit/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'li' 2 | -------------------------------------------------------------------------------- /app/run.py: -------------------------------------------------------------------------------- 1 | __author__ = 'li' 2 | from app import app 3 | 4 | app.run(debug='true') -------------------------------------------------------------------------------- /app/views.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lrcUnlimited/check_file_system/HEAD/app/views.py -------------------------------------------------------------------------------- /app/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'li' 2 | from flask import Flask 3 | 4 | app = Flask(__name__) 5 | 6 | from app import views -------------------------------------------------------------------------------- /app/static/uploads/woai4.txt: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/lrcUnlimited/check_file_system/HEAD/app/static/uploads/woai4.txt -------------------------------------------------------------------------------- /app/static/uploads/woai5.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lrcUnlimited/check_file_system/HEAD/app/static/uploads/woai5.txt -------------------------------------------------------------------------------- /app/static/uploads/woai6.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lrcUnlimited/check_file_system/HEAD/app/static/uploads/woai6.txt -------------------------------------------------------------------------------- /app/config.ini: -------------------------------------------------------------------------------- 1 | [database] 2 | db_user = root 3 | db_password = 19901023 4 | db_ip = 127.0.0.1 5 | db_port = 3306 6 | db_name = filedict -------------------------------------------------------------------------------- /.idea/scopes/scope_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | -------------------------------------------------------------------------------- /README.mdgit: -------------------------------------------------------------------------------- 1 | "# check_file_system" initgit add README.mdgit commit -m "first commit"git remote add origin https://github.com/lrcUnlimited/check_file_system.gitgit push -u origin master 2 | -------------------------------------------------------------------------------- /.idea/encodings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /testunit/file_test.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: gb2312 -*- 2 | __author__ = 'li' 3 | import os 4 | import sys 5 | import codecs 6 | 7 | from app import simhash 8 | f = codecs.open('.././app/static/uploads','r','gb2312') 9 | str=f.read() 10 | print simhash.simhash(str) 11 | 12 | f.close() 13 | print(str) 14 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/check_file_system.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /app/templates/upload.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 上传文件 6 | 7 | 8 |
9 |
10 | 11 |
# 12 | 13 | 14 |
# --------------------------------------------------------------------------------
# /.idea/vcs.xml:
# --------------------------------------------------------------------------------
# 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |
# --------------------------------------------------------------------------------
# /app/read_config.py:
# --------------------------------------------------------------------------------

# -*- coding: UTF-8 -*-
"""Small wrapper around an INI file, exposing a shared ``config`` object."""
__author__ = 'li'
import sys
import os
import time

try:
    import ConfigParser  # Python 2 module name
except ImportError:
    # Python 3 renamed the module; alias it so the rest of the code is unchanged.
    import configparser as ConfigParser


class Config:
    """Read and update options of the INI file located at ``path``.

    The file is parsed once, when the object is created.
    """

    def __init__(self, path):
        """Remember ``path`` and load it into a fresh ``ConfigParser``."""
        self.path = path
        self.cf = ConfigParser.ConfigParser()
        self.cf.read(self.path)

    def get(self, field, key):
        """Return option ``key`` of section ``field``.

        Returns the empty string when the section or option is missing or
        the value cannot be interpolated.
        """
        try:
            return self.cf.get(field, key)
        except ConfigParser.Error:
            # Only parser errors mean "no usable value"; anything else
            # (e.g. KeyboardInterrupt) must not be swallowed.
            return ""

    def set(self, field, key, value):
        """Set option ``key`` of section ``field`` and rewrite the file.

        Returns True on success and False when the option could not be set
        or the file could not be written.
        """
        try:
            self.cf.set(field, key, value)
            # The context manager guarantees the handle is flushed and closed.
            with open(self.path, 'w') as handle:
                self.cf.write(handle)
        except (ConfigParser.Error, EnvironmentError, TypeError, ValueError):
            return False
        return True


config = Config('config.ini')

# --------------------------------------------------------------------------------
# /app/get_key_word.py:
# --------------------------------------------------------------------------------

# -*- coding: UTF-8 -*-
__author__ = 'li'
# Extracts the keywords of an article with the jieba Chinese word
# segmentation library.
import codecs


def get_key_hash(v):
    """Return a 64-bit hash code for the keyword ``v``.

    A variable-length version of Python's builtin string hash: the value is
    seeded from the first character, every character is folded in with
    multiplier 1000003 under a 64-bit mask, and the length is XOR-ed in last.
    An empty (or None) keyword hashes to 0.
    """
    if not v:
        return 0
    multiplier = 1000003
    mask = 2 ** 64 - 1
    x = ord(v[0]) << 7
    for c in v:
        x = ((x * multiplier) ^ ord(c)) & mask
    # x is masked to a non-negative value, so the builtin hash's special
    # case for -1 can never apply here.
    x ^= len(v)
    return x


def _extract_sample_tags():
    """Return jieba's top 20 weighted keywords of the bundled sample file."""
    # Imported here so that importing this module for get_key_hash() does not
    # require jieba or the sample file to be present.
    import jieba
    import jieba.analyse

    with codecs.open('static/uploads/woai4.txt', 'r', 'gb2312') as sample:
        content = sample.read()
    return jieba.analyse.extract_tags(content, 20, True)


if __name__ == "__main__":
    tags = _extract_sample_tags()
    fingerprint = bin(get_key_hash('我很乐意将扩大开放空间爱的色放金卡交电费卡积分卡金卡黛珊就付款'))
    print(len(fingerprint))
-------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 17 | -------------------------------------------------------------------------------- /app/templates/results.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 21 | 文件查重结果 22 | 23 | 24 | 25 |
26 | 27 | {{sourcefilecontent}} 28 |
29 |
30 | 31 | {{result}} 32 |
# 33 | 34 |
# --------------------------------------------------------------------------------
# /app/file_sim_hash.py:
# --------------------------------------------------------------------------------

# -*- coding: utf-8 -*-
__author__ = 'li'

import codecs
from app import simhash
from app import db

# Computes the hash code of a file and looks up the closest stored file.


def get_simlar_file(filepath):
    """Find the stored file whose hash is closest to the file at ``filepath``.

    The file is read as gb2312 text and hashed with ``simhash.simhash``. Its
    hash is compared against every hash returned by ``db.get_data()``, and
    the file's own hash is then saved through ``db.save_file_hash``.

    Returns a tuple ``(minfilepath, filecontent, min_distance, simpercent)``.
    ``minfilepath`` is '' and ``simpercent`` is 0 when no stored hash is
    closer than the initial threshold of 64 bits.
    """
    # The context manager closes the file even if decoding fails.
    with codecs.open(filepath, 'r', 'gb2312') as f:
        filecontent = f.read()
    # Hash of the current file.
    filehash = simhash.simhash(filecontent)
    # Hashes already stored in the database; tolerate a failed query.
    dict_hash = db.get_data() or {}
    # Track the smallest Hamming distance to any stored hash.
    minfilepath = ''
    min_distance = 64
    for k, v in dict_hash.items():
        distance = filehash.hamming_distance(int(v))
        if distance < min_distance:
            min_distance = distance
            minfilepath = k
    # Store the decimal text of the hash; it is read back above with int(v).
    db.save_file_hash((filepath, str(filehash)))
    if minfilepath == '':
        simpercent = 0
    else:
        simpercent = filehash.similarity(int(dict_hash[minfilepath]))
    return (minfilepath, filecontent, min_distance, simpercent)

# --------------------------------------------------------------------------------
# /app/db.py:
# --------------------------------------------------------------------------------

# -*- coding: UTF-8 -*-
__author__ = 'li'
import MySQLdb
from read_config import config

user = config.get('database', 'db_user')
pwd = config.get('database', 'db_password')
host = config.get('database', 'db_ip')
db = config.get('database', 'db_name')
port = int(config.get('database', 'db_port'))
select_sql = 'select * from filehash'


def get_data():
    """Load the stored hashes from the ``filehash`` table.

    Returns a dict mapping column 1 of each row to column 2 (presumably
    filename -> filehash, matching the INSERT in save_file_hash; verify
    against the table schema). Returns an empty dict when the query fails;
    a failure to connect still propagates to the caller.
    """
    cnx = MySQLdb.connect(user=user, passwd=pwd, port=port, host=host, db=db, charset="utf8")
    cursor = cnx.cursor()
    try:
        cursor.execute(select_sql)
        return dict((row[1], row[2]) for row in cursor.fetchall())
    except Exception as err:
        print("query database' failed.")
        # Format the exception itself: generic exceptions have no ``msg``.
        print("Error: {}".format(err))
        return {}
    finally:
        cursor.close()
        cnx.close()


# Save a file's hash value to the database.
def save_file_hash(param):
    """Insert one ``(filename, filehash)`` row into the ``filehash`` table.

    ``param`` is passed to ``cursor.execute`` as the query parameters.
    Errors are reported on stdout and not re-raised.
    """
    cnx = None
    cursor = None
    try:
        cnx = MySQLdb.connect(user=user, passwd=pwd, port=port, host=host, db=db, charset="utf8")
        cursor = cnx.cursor()
        sql = 'INSERT INTO filehash (filename, filehash) VALUES (%s, %s)'
        cursor.execute(sql, param)
        cnx.commit()
    except Exception as err:
        print('error')
        print("Error: {}".format(err))
    finally:
        # Either handle may be missing if connecting failed part-way.
        if cursor is not None:
            cursor.close()
        if cnx is not None:
            cnx.close()

# --------------------------------------------------------------------------------
# /app/simhash.py:
# --------------------------------------------------------------------------------

#!/usr/bin/env python
# -*- coding=utf-8 -*-
"""simhash algorithm for comparing similar files; a distance of less than 3
means the two files are very similar."""


class simhash():
    """Charikar simhash fingerprint of an iterable of tokens.

    ``tokens`` is iterated item by item, so a plain string is hashed one
    character at a time. ``hashbits`` is the width of the fingerprint.
    """

    def __init__(self, tokens='', hashbits=128):
        self.hashbits = hashbits
        self.hash = self.simhash(tokens)

    def __str__(self):
        return str(self.hash)

    def __long__(self):
        return long(self.hash)

    def __float__(self):
        return float(self.hash)

    def simhash(self, tokens):
        """Return the ``hashbits``-wide fingerprint of ``tokens``."""
        v = [0] * self.hashbits

        for t in [self._string_hash(x) for x in tokens]:
            for i in range(self.hashbits):
                # A set bit votes +1 for that position, a clear bit votes -1.
                if t & (1 << i):
                    v[i] += 1
                else:
                    v[i] -= 1

        # The fingerprint has a 1 in every position whose vote total is >= 0.
        fingerprint = 0
        for i in range(self.hashbits):
            if v[i] >= 0:
                fingerprint += 1 << i
        return fingerprint

    def _string_hash(self, v):
        """Return a ``hashbits``-wide hash of the token ``v``.

        A variable-length version of Python's builtin string hash. The empty
        token hashes to 0.
        """
        if v == "":
            return 0
        m = 1000003
        mask = 2 ** self.hashbits - 1
        x = ord(v[0]) << 7
        for c in v:
            x = ((x * m) ^ ord(c)) & mask
        # x is masked to a non-negative value, so the builtin hash's special
        # case for -1 can never apply here.
        x ^= len(v)
        return x

    def hamming_distance(self, other_hash):
        """Return the number of bits that differ from ``other_hash``.

        ``other_hash`` may be an integer or another ``simhash`` instance.
        """
        other_value = getattr(other_hash, 'hash', other_hash)
        x = (self.hash ^ other_value) & ((1 << self.hashbits) - 1)
        tot = 0
        while x:
            tot += 1
            x &= x - 1  # clears the lowest set bit
        return tot

    def similarity(self, other_hash):
        """Return the ratio of the smaller hash value to the larger one.

        Both values are compared as floats. Equal values, including two
        zero hashes, give 1.0 rather than dividing by zero.
        """
        a = float(self.hash)
        b = float(other_hash)
        if a == b:
            return 1.0
        if a > b:
            return b / a
        return a / b

# 70 | 71 | 72 |
# --------------------------------------------------------------------------------