├── .idea
├── .name
├── scopes
│ └── scope_settings.xml
├── encodings.xml
├── modules.xml
├── check_file_system.iml
├── vcs.xml
└── misc.xml
├── app
├── crawler.py
├── run.py
├── views.py
├── __init__.py
├── static
│ └── uploads
│ │ ├── woai4.txt
│ │ ├── woai5.txt
│ │ └── woai6.txt
├── config.ini
├── templates
│ ├── upload.html
│ └── results.html
├── read_config.py
├── get_key_word.py
├── file_sim_hash.py
├── db.py
└── simhash.py
├── testunit
├── __init__.py
└── file_test.py
└── README.mdgit
/.idea/.name:
--------------------------------------------------------------------------------
1 | check_file_system
--------------------------------------------------------------------------------
/app/crawler.py:
--------------------------------------------------------------------------------
1 | # coding=utf8
2 |
--------------------------------------------------------------------------------
/testunit/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'li'
2 |
--------------------------------------------------------------------------------
/app/run.py:
--------------------------------------------------------------------------------
1 | __author__ = 'li'
2 | from app import app
3 |
# Start Flask's built-in development server as soon as this script runs.
# `debug` is a boolean flag; the original passed the string 'true', which only
# worked because any non-empty string is truthy. Debug mode enables the
# reloader and interactive debugger and must not be used in production.
app.run(debug=True)
--------------------------------------------------------------------------------
/app/views.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lrcUnlimited/check_file_system/HEAD/app/views.py
--------------------------------------------------------------------------------
/app/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'li'
2 | from flask import Flask
3 |
# The single Flask application object shared by the whole package.
app = Flask(__name__)

# Imported last on purpose: this import runs only once `app` exists
# (presumably views registers its routes on `app` -- verify in views.py).
from app import views
--------------------------------------------------------------------------------
/app/static/uploads/woai4.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lrcUnlimited/check_file_system/HEAD/app/static/uploads/woai4.txt
--------------------------------------------------------------------------------
/app/static/uploads/woai5.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lrcUnlimited/check_file_system/HEAD/app/static/uploads/woai5.txt
--------------------------------------------------------------------------------
/app/static/uploads/woai6.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lrcUnlimited/check_file_system/HEAD/app/static/uploads/woai6.txt
--------------------------------------------------------------------------------
/app/config.ini:
--------------------------------------------------------------------------------
1 | [database]
2 | db_user = root
3 | db_password = 19901023
4 | db_ip = 127.0.0.1
5 | db_port = 3306
6 | db_name = filedict
--------------------------------------------------------------------------------
/.idea/scopes/scope_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
--------------------------------------------------------------------------------
/README.mdgit:
--------------------------------------------------------------------------------
1 | # check_file_system -- repository setup: git init; git add README.md; git commit -m "first commit"; git remote add origin https://github.com/lrcUnlimited/check_file_system.git; git push -u origin master
2 |
--------------------------------------------------------------------------------
/.idea/encodings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/testunit/file_test.py:
--------------------------------------------------------------------------------
1 | # -*- coding: gb2312 -*-
2 | __author__ = 'li'
3 | import os
4 | import sys
5 | import codecs
6 |
7 | from app import simhash
# Manual smoke test: read one uploaded sample document (GB2312-encoded),
# print its simhash fingerprint, then print the raw text.
#
# The original opened the `uploads` directory itself, which cannot be read as
# a file; point at one of the sample files inside it instead.
# NOTE(review): woai4.txt is assumed to be the intended sample -- confirm.
with codecs.open('.././app/static/uploads/woai4.txt', 'r', 'gb2312') as sample_file:
    # `content` replaces the original variable name `str`, which shadowed the builtin.
    content = sample_file.read()

print(simhash.simhash(content))
print(content)
14 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/check_file_system.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/app/templates/upload.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | 上传文件
6 |
7 |
8 |
12 |
13 |
14 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
--------------------------------------------------------------------------------
/app/read_config.py:
--------------------------------------------------------------------------------
1 |
2 | # -*- coding: UTF-8 -*-
3 | __author__ = 'li'
4 | import sys, os, time, ConfigParser
5 |
6 |
class Config:
    """Best-effort wrapper around an INI file: lookups and updates never raise.

    The file at ``path`` is parsed once on construction. ``ConfigParser.read``
    silently ignores files it cannot open, so a missing file simply results
    in an empty configuration.
    """

    def __init__(self, path):
        """Remember ``path`` and parse the INI file found there (if any)."""
        self.path = path
        self.cf = ConfigParser.ConfigParser()
        self.cf.read(self.path)

    def get(self, field, key):
        """Return the value of option ``key`` in section ``field``.

        Returns the empty string when the section or option is missing, or
        when the lookup fails for any other reason.
        """
        try:
            return self.cf.get(field, key)
        except Exception:
            # Unlike the original bare `except:`, this keeps the "never
            # raise" contract while letting KeyboardInterrupt and SystemExit
            # propagate.
            return ""

    def set(self, field, key, value):
        """Set option ``key`` in section ``field`` and rewrite the file.

        Returns True on success and False when the update or the write
        fails (for example, when the section does not exist).
        """
        try:
            # Update the in-memory parser first: if this fails, the file on
            # disk is never opened for writing and so is not truncated.
            self.cf.set(field, key, value)
            # The context manager guarantees the handle is flushed and
            # closed; the original leaked the file object.
            with open(self.path, 'w') as config_file:
                self.cf.write(config_file)
        except Exception:
            return False
        return True
28 |
29 |
# Shared configuration instance used by the rest of the package.
# The path is resolved next to this module rather than against the current
# working directory; a relative 'config.ini' is silently treated as empty
# whenever the process is started from another directory.
config = Config(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.ini'))
--------------------------------------------------------------------------------
/app/get_key_word.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 | __author__ = 'li'
3 | """
4 | 利用jieba中文分词组件,对文章的关键词进行抽取
5 | """
6 | import jieba, codecs
7 | import jieba.analyse
8 |
# Load the sample document (GB2312-encoded). The context manager closes the
# handle, which the original one-liner left open.
with codecs.open('static/uploads/woai4.txt', 'r', 'gb2312') as sample_file:
    content = sample_file.read()

# Extract keywords from the document with jieba.
# NOTE(review): the positional arguments are presumably topK=20 and
# withWeight=True -- confirm against the installed jieba version.
tags = jieba.analyse.extract_tags(content, 20, True)
12 |
13 |
14 | # 获取关键词的hash码
def get_key_hash(v):
    """Return a 64-bit hash code for the keyword ``v``.

    A variable-length version of Python's builtin string hash: start from the
    first character shifted left by 7, fold in every character with a
    multiply-and-xor step masked to 64 bits, then xor in the length.

    Args:
        v: the keyword; each element must be acceptable to ``ord``.

    Returns:
        0 for the empty string, otherwise a non-negative integer.
    """
    if v == "":
        return 0

    multiplier = 1000003
    mask = 2 ** 64 - 1

    x = ord(v[0]) << 7
    for c in v:
        x = ((x * multiplier) ^ ord(c)) & mask

    # Both the masked accumulator and len(v) are non-negative, so the result
    # can never be -1; the original "-1 becomes -2" remap was unreachable and
    # has been dropped.
    return x ^ len(v)
29 |
30 |
# Demo: hash a sample sentence and print the length of its binary string
# form (the value produced by bin() includes the leading '0b').
fingerprint = bin(
    get_key_hash('我很乐意将扩大开放空间爱的色放金卡交电费卡积分卡金卡黛珊就付款')
)
print(len(fingerprint))
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
--------------------------------------------------------------------------------
/app/templates/results.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
21 | 文件查重结果
22 |
23 |
24 |
25 |
26 |
27 | {{sourcefilecontent}}
28 |
29 |
30 |
31 | {{result}}
32 |
33 |
34 |
--------------------------------------------------------------------------------
/app/file_sim_hash.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | __author__ = 'li'
3 |
4 | import codecs
5 | from app import simhash
6 | from app import db
7 |
8 | """
9 | 获取文件的hash码
10 | """
11 |
12 |
def get_simlar_file(filepath):
    """Find the stored file whose simhash is closest to the file at ``filepath``.

    The file is read as GB2312 text and fingerprinted, then compared (by
    Hamming distance) with every fingerprint returned by ``db.get_data()``.
    The new fingerprint is saved to the database after the comparison.

    Args:
        filepath: path of the uploaded file to check.

    Returns:
        A tuple ``(minfilepath, filecontent, min_distance, simpercent)``:
        the closest stored file name ('' when none qualified), the decoded
        file content, the smallest distance found (64 when none qualified),
        and the similarity reported by ``simhash.similarity`` (0 when none
        qualified).
    """
    # Close the handle deterministically; the original never closed it.
    with codecs.open(filepath, 'r', 'gb2312') as source_file:
        filecontent = source_file.read()

    # Fingerprint of the current file.
    filehash = simhash.simhash(filecontent)

    # Fingerprints already stored in the database. get_data() can fall
    # through without a return value when the query fails, so treat a None
    # result as "nothing stored" instead of crashing on .items().
    dict_hash = db.get_data() or {}

    # Track the stored file with the smallest Hamming distance.
    # NOTE(review): 64 is the starting threshold although simhash defaults to
    # 128-bit fingerprints, so stored files at distance >= 64 are never
    # reported -- confirm this cut-off is intended.
    minfilepath = ''
    min_distance = 64
    for stored_path, stored_hash in dict_hash.items():
        distance = filehash.hamming_distance(int(stored_hash))
        if distance < min_distance:
            min_distance = distance
            minfilepath = stored_path

    # Persist the new fingerprint as its decimal string, the same text the
    # object's __str__ produces, so the DB driver receives a plain string
    # rather than an arbitrary object.
    db.save_file_hash((filepath, str(filehash)))

    if minfilepath == '':
        simpercent = 0
    else:
        simpercent = filehash.similarity(int(dict_hash[minfilepath]))
    return (minfilepath, filecontent, min_distance, simpercent)
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
--------------------------------------------------------------------------------
/app/db.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 | __author__ = 'li'
3 | import MySQLdb
4 | from read_config import config
5 |
# Connection settings, looked up once at import time in the [database]
# section of the configuration loaded by read_config.
_DB_SECTION = 'database'
user = config.get(_DB_SECTION, 'db_user')
pwd = config.get(_DB_SECTION, 'db_password')
host = config.get(_DB_SECTION, 'db_ip')
db = config.get(_DB_SECTION, 'db_name')
# The port is stored as text in the config file and converted to an int here.
port = int(config.get(_DB_SECTION, 'db_port'))

# Query used to load every stored fingerprint row.
select_sql = 'select * from filehash'
12 | """
13 | get data from db
14 |
15 | """
16 |
17 |
def get_data():
    """Load every stored fingerprint from the database.

    Runs ``select_sql`` and maps column 1 of each row to column 2.
    NOTE(review): these are presumably the filename and filehash columns;
    the query is ``select *``, so this depends on the table's column order.

    Returns:
        dict built from the fetched rows, or an empty dict when the query
        fails (the error is printed).

    Raises:
        Whatever ``MySQLdb.connect`` or ``cnx.cursor()`` raises; only errors
        from executing and reading the query are handled here.
    """
    cnx = MySQLdb.connect(user=user, passwd=pwd, port=port, host=host, db=db, charset="utf8")
    try:
        cursor = cnx.cursor()
        try:
            cursor.execute(select_sql)
            return {row[1]: row[2] for row in cursor.fetchall()}
        except Exception as err:
            print("query database' failed.")
            # Format the exception itself: the original read `err.msg`,
            # an attribute most exceptions do not have, so the handler
            # raised AttributeError and hid the real error.
            print("Error: {}".format(err))
            # Explicit empty result instead of an implicit None, so callers
            # can iterate the return value unconditionally.
            return {}
        finally:
            cursor.close()
    finally:
        # Runs even when creating the cursor fails, so the connection is
        # never leaked.
        cnx.close()
34 |
35 |
36 | # 保存文件hash值到数据库
def save_file_hash(param):
    """Insert one (filename, filehash) row into the ``filehash`` table.

    Best-effort: any failure is printed and swallowed, never raised.

    Args:
        param: 2-item sequence ``(filename, filehash)`` bound to the
            parameterized INSERT statement.
    """
    # Pre-bind both names so the cleanup below is safe even when connecting
    # fails; the original hit an unbound-variable error in `finally` in that
    # case, masking the real problem.
    cnx = None
    cursor = None
    try:
        cnx = MySQLdb.connect(user=user, passwd=pwd, port=port, host=host, db=db, charset="utf8")
        cursor = cnx.cursor()
        sql = 'INSERT INTO filehash (filename, filehash) VALUES (%s, %s)'
        cursor.execute(sql, param)
        cnx.commit()
    except Exception as err:
        # Report what went wrong; the original printed a bare 'error'.
        print("error: {}".format(err))
    finally:
        if cursor is not None:
            cursor.close()
        if cnx is not None:
            cnx.close()
50 |
51 |
52 |
--------------------------------------------------------------------------------
/app/simhash.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding=utf-8 -*-
3 | """相似文件比较的simhash算法,distance小于3说明两个文件十分相似"""
4 |
5 |
class simhash():
    """Charikar simhash fingerprint of a sequence of tokens.

    Per the module docstring, a Hamming distance below 3 between two
    fingerprints indicates that the two files are very similar.

    Attributes (set in ``__init__``):
        hashbits: width of the fingerprint in bits.
        hash: the fingerprint, a non-negative integer below 2 ** hashbits.
    """

    def __init__(self, tokens='', hashbits=128):
        """Fingerprint ``tokens`` (any iterable of strings; a plain string is
        iterated element by element) using ``hashbits`` bits."""
        self.hashbits = hashbits
        self.hash = self.simhash(tokens)

    def __str__(self):
        return str(self.hash)

    def __int__(self):
        # Lets int(obj) work, including on Python 3 where __long__ is unused.
        return int(self.hash)

    def __long__(self):
        return long(self.hash)

    def __float__(self):
        return float(self.hash)

    def simhash(self, tokens):
        """Return the Charikar simhash of ``tokens`` as an integer.

        Every token votes +1 on each bit set in its hash and -1 on each bit
        that is clear; the fingerprint has a 1 wherever the vote total is
        greater than or equal to zero (so an empty input yields all ones).
        """
        bit_positions = range(self.hashbits)
        votes = [0] * self.hashbits

        for token in tokens:
            token_hash = self._string_hash(token)
            for i in bit_positions:
                votes[i] += 1 if token_hash & (1 << i) else -1

        fingerprint = 0
        for i, vote in enumerate(votes):
            if vote >= 0:
                fingerprint |= 1 << i
        return fingerprint

    def _string_hash(self, v):
        """Return a ``hashbits``-wide hash of the string ``v`` (0 for "").

        A variable-length version of Python's builtin string hash.
        """
        if v == "":
            return 0

        multiplier = 1000003
        mask = 2 ** self.hashbits - 1

        x = ord(v[0]) << 7
        for c in v:
            x = ((x * multiplier) ^ ord(c)) & mask

        # The masked accumulator and len(v) are both non-negative, so the
        # result is never -1; the original "-1 becomes -2" remap was dead code.
        return x ^ len(v)

    def hamming_distance(self, other_hash):
        """Return the number of differing bits between this fingerprint and
        ``other_hash`` (an integer, or anything ``int()`` accepts, including
        another ``simhash`` instance)."""
        diff = (self.hash ^ int(other_hash)) & ((1 << self.hashbits) - 1)
        count = 0
        while diff:
            count += 1
            diff &= diff - 1  # clear the lowest set bit
        return count

    def similarity(self, other_hash):
        """Return the ratio smaller / larger of the two fingerprints taken as
        numbers, a float in [0, 1].

        Equal fingerprints give 1.0, including the case where both are 0,
        which raised ZeroDivisionError in the original.
        """
        a = float(self.hash)
        b = float(other_hash)
        if a == b:
            return 1.0
        if a > b:
            return b / a
        return a / b
70 |
71 |
72 |
--------------------------------------------------------------------------------