.
675 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # bookmarks-classifier
2 |
3 | A classifier that sorts the links in a browser bookmarks file into categories.
4 |
5 | ### Environment
6 |
7 | - Linux/macOS/Windows
8 | - Python 3
9 |
10 | ### How to use it
11 |
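12 | 1. Export the bookmarks file from your browser and save it as `bookmarks.html` in the directory you run the script from.
13 | 2. Run `bookmarks.py` with Python 3; it writes a new bookmarks file named `bookmarks_new.html`.
14 | 3. Import the new bookmarks file (`bookmarks_new.html`) into your browser.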
15 |
16 | ### To be continued
17 |
18 | Planned: a classification algorithm that works without the configuration files.
19 |
20 | ### More
21 |
22 | Contributions of any kind are welcome, and pull requests are appreciated. Thanks!
23 |
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ybbz/bookmarks-classifier/4fec33ac0c12fd9dbf7cf43370d19deb8ff02878/__init__.py
--------------------------------------------------------------------------------
/bookmarks.py:
--------------------------------------------------------------------------------
1 | import re, json
2 |
3 | # input: path of the bookmarks file exported from the browser
4 | html = 'bookmarks.html'
5 | # output: path of the regrouped bookmarks file this script writes
6 | html_new = 'bookmarks_new.html'
7 | # working lists filled in by the parsing loop and by classify()
8 | link_list = []  # (domain, link, text) tuples parsed from the input file
9 | link_list_new = [[] for i in range(6)]  # one bucket of (link, text) tuples per category index; 6 matches the number of entries in classify_type.txt
10 | # classifier config: maps a domain name to its category name
11 | category_dict = json.load(open('classify.txt', 'r'))  # NOTE(review): the file handle is never closed explicitly
12 | # category config: maps a category name to its bucket index in link_list_new
13 | type_dict = json.load(open('classify_type.txt', 'r'))  # NOTE(review): the file handle is never closed explicitly
14 | # inverse of type_dict: maps a bucket index back to its category name
15 | type_dict_reverse = dict(zip(type_dict.values(), type_dict.keys()))
16 |
17 |
18 | # classify(list): sort parsed bookmarks into the per-category buckets of the global link_list_new; returns nothing
19 | def classify(list):  # list: iterable of (domain, link, text) tuples; note the name shadows the builtin `list`
20 | for domain, link, text in list:
21 | if domain not in category_dict:
22 | cate = 'other'  # domains missing from classify.txt fall back to the 'other' category
23 | else:
24 | cate = category_dict[domain]
25 | link_item_new = (link, text)  # the domain is only used for the lookup and is not kept
26 | link_list_new[type_dict[cate]].append(link_item_new)  # type_dict turns the category name into a bucket index; KeyError if cate is not a key of type_dict
27 | print(link_list_new)
28 | print('classify:' + str(len(link_list_new)))  # len() here is the number of buckets, not the number of bookmarks classified
29 |
30 |
31 | # read the original bookmarks html and extract the domain, link and text of each bookmark entry
32 | with open(html, 'r') as f_origin:
33 | lines = re.findall('(.*?)', f_origin.read(), re.S)  # NOTE(review): as written the pattern has no delimiters and matches only empty strings; it looks like it lost its HTML tag delimiters - confirm against the original file
34 | print('Total:' + str(len(lines)))
35 | for line in lines:
36 | domain = re.findall('://[a-zA-Z0-9]*\.(.*?)\.', line, re.S)  # captures the text between the first and second '.' after '://' (e.g. 'google' in 'https://www.google.com')
37 | link = re.findall('HREF="(.*?)"', line, re.S)  # captures the value of each HREF attribute
38 | text = re.findall('">(.*?)', line, re.S)  # NOTE(review): nothing follows the lazy group, so it always captures ''; a closing delimiter appears to be missing - confirm
39 | if len(domain) > 0 and len(link) > 0 and len(text) > 0:  # keep only entries where all three patterns matched
40 | link_item = (domain[0], link[0], text[0])  # first match of each pattern
41 | link_list.append(link_item)
42 | print(link_list)
43 | print('Filter:' + str(len(link_list)))
44 | classify(link_list)  # distributes the tuples into the buckets of link_list_new
45 |
46 | # write the results to a new bookmarks html: a header followed by one group per category bucket
47 | with open(html_new, 'w') as f_new:  # the whole document is accumulated in `group` and written once at the end
48 | group = '\n' \
49 | + '\n' \
50 | + 'Bookmarks\n' \
51 | + 'Bookmarks
\n' \
52 | + '\n'  # NOTE(review): several literals in this block are split across lines and look like they lost HTML markup - confirm against the original file before editing
53 | for i, item in enumerate(link_list_new):  # i is the bucket index, item the list of (link, text) tuples in that bucket
54 | group += '\t
' + type_dict_reverse[i] + '
\n\t\n'
55 | for j in item:  # j is one (link, text) tuple; as shown here only j[1] (the text) is emitted
56 | one = '\t\t
- ' + j[1] + '\n'
57 | group += one  # append the entry to the document
58 | group += '\t
\n'
59 | group += '
\n'
60 | f_new.write(group)  # single write of the assembled document
61 |
--------------------------------------------------------------------------------
/classify.txt:
--------------------------------------------------------------------------------
1 | {
2 | "google":"tech",
3 | "facebook":"tech",
4 | "baidu":"tech",
5 | "sougou":"tech",
6 | "163":"tech",
7 | "youtube":"tech",
8 | "github":"tech",
9 |
10 | "zhihu":"IT",
11 | "csdn":"IT",
12 | "toutiao":"IT",
13 | "segmentfault":"IT",
14 | "tuicool":"IT",
15 | "ftqq":"IT",
16 | "jianshu":"IT",
17 | "jobbole":"IT",
18 | "sdk":"IT",
19 | "guokr":"IT",
20 | "douban":"IT",
21 | "oschina":"IT",
22 | "leiphone":"IT",
23 |
24 | "weibo":"social",
25 | "bilibili":"social",
26 | "qq":"social",
27 | "weixin":"social",
28 |
29 | "cnblogs":"blog",
30 | "liaoxuefeng":"blog",
31 | "yiibai":"blog",
32 | "runoob":"blog",
33 | "ybbz":"blog",
34 | "codingpy":"blog",
35 | "trinea":"blog",
36 |
37 | "taobao":"shop",
38 | "jd":"shop",
39 | "dangdang":"shop",
40 | "meituan":"shop",
41 | "amazon":"shop",
42 | "xiaomi":"shop",
43 | "uniqlo":"shop"
44 | }
--------------------------------------------------------------------------------
/classify_type.txt:
--------------------------------------------------------------------------------
1 | {
2 | "other":0,
3 | "tech":1,
4 | "IT":2,
5 | "social":3,
6 | "blog":4,
7 | "shop":5
8 | }
--------------------------------------------------------------------------------