├── .gitignore
├── .idea
├── .name
├── compiler.xml
├── copyright
│ └── profiles_settings.xml
├── libraries
│ ├── Maven__com_google_guava_guava_19_0.xml
│ ├── Maven__junit_junit_4_12.xml
│ └── Maven__org_hamcrest_hamcrest_core_1_3.xml
├── misc.xml
├── modules.xml
├── uiDesigner.xml
├── vcs.xml
└── workspace.xml
├── .travis.yml
├── BloomFilter.iml
├── LICENSE
├── README.md
├── pom.xml
├── src
├── main
│ ├── java
│ │ └── com
│ │ │ └── ruyuapp
│ │ │ ├── BloomFilter.java
│ │ │ ├── CachedBloomFilter.java
│ │ │ └── CountBloomFilter.java
│ └── resources
│ │ ├── bloom-filter.JPG
│ │ └── cached-bloom-filter.JPG
└── test
│ └── java
│ └── com
│ └── ruyuapp
│ └── BloomFilterTest.java
└── target
├── classes
└── com
│ └── ruyuapp
│ ├── BloomFilter.class
│ ├── CachedBloomFilter.class
│ └── CountBloomFilter.class
└── test-classes
└── com
└── ruyuapp
└── BloomFilterTest.class
/.gitignore:
--------------------------------------------------------------------------------
1 | *.class
2 |
3 | # Mobile Tools for Java (J2ME)
4 | .mtj.tmp/
5 |
6 | # Package Files #
7 | *.jar
8 | *.war
9 | *.ear
10 |
11 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
12 | hs_err_pid*
13 |
--------------------------------------------------------------------------------
/.idea/.name:
--------------------------------------------------------------------------------
1 | BloomFilter
--------------------------------------------------------------------------------
/.idea/compiler.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
--------------------------------------------------------------------------------
/.idea/copyright/profiles_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
--------------------------------------------------------------------------------
/.idea/libraries/Maven__com_google_guava_guava_19_0.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
--------------------------------------------------------------------------------
/.idea/libraries/Maven__junit_junit_4_12.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
--------------------------------------------------------------------------------
/.idea/libraries/Maven__org_hamcrest_hamcrest_core_1_3.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
12 |
15 |
16 |
17 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/uiDesigner.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | -
6 |
7 |
8 | -
9 |
10 |
11 | -
12 |
13 |
14 | -
15 |
16 |
17 | -
18 |
19 |
20 |
21 |
22 |
23 | -
24 |
25 |
26 |
27 |
28 |
29 | -
30 |
31 |
32 |
33 |
34 |
35 | -
36 |
37 |
38 |
39 |
40 |
41 | -
42 |
43 |
44 |
45 |
46 | -
47 |
48 |
49 |
50 |
51 | -
52 |
53 |
54 |
55 |
56 | -
57 |
58 |
59 |
60 |
61 | -
62 |
63 |
64 |
65 |
66 | -
67 |
68 |
69 |
70 |
71 | -
72 |
73 |
74 | -
75 |
76 |
77 |
78 |
79 | -
80 |
81 |
82 |
83 |
84 | -
85 |
86 |
87 |
88 |
89 | -
90 |
91 |
92 |
93 |
94 | -
95 |
96 |
97 |
98 |
99 | -
100 |
101 |
102 | -
103 |
104 |
105 | -
106 |
107 |
108 | -
109 |
110 |
111 | -
112 |
113 |
114 |
115 |
116 | -
117 |
118 |
119 | -
120 |
121 |
122 |
123 |
124 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/workspace.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 | true
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
122 |
123 |
124 |
125 |
126 |
127 |
128 |
129 |
130 |
131 |
132 |
133 |
134 |
135 |
136 |
137 |
138 |
139 |
140 |
141 |
142 |
143 |
144 |
145 |
146 |
147 |
148 |
149 |
150 |
151 |
152 |
153 |
154 |
155 |
156 |
157 |
158 |
159 |
160 |
161 |
162 |
163 |
164 |
165 |
166 |
167 |
168 |
169 |
170 |
171 |
172 |
173 |
174 |
175 |
176 |
177 |
178 |
179 |
180 |
181 |
182 |
183 |
184 |
185 |
186 |
187 |
188 |
189 |
190 |
191 |
192 |
193 |
194 |
195 |
196 |
197 |
198 |
199 |
200 |
201 |
202 |
203 |
204 |
205 |
206 |
207 |
208 |
209 |
210 |
211 |
212 |
213 |
214 |
215 |
216 |
217 |
218 |
219 |
220 |
221 |
222 |
223 |
224 |
225 |
226 |
227 |
228 |
229 |
230 |
231 |
232 |
233 |
234 |
235 |
236 |
237 |
238 |
239 |
240 |
241 |
242 |
243 |
244 |
245 |
246 |
247 |
248 |
249 |
250 |
251 |
252 |
253 |
254 |
255 |
256 |
257 |
258 |
259 |
260 |
261 |
262 |
263 |
264 |
265 |
266 |
267 |
268 |
269 |
270 |
271 |
272 |
273 |
274 |
275 |
276 |
277 |
278 |
279 |
280 |
281 |
282 |
283 |
284 |
285 |
286 |
287 |
288 |
289 |
290 |
291 |
292 |
293 |
294 |
295 |
296 |
297 |
298 |
299 |
300 |
301 |
302 |
303 |
304 |
305 |
306 |
307 |
308 |
309 |
310 |
311 |
312 |
313 |
314 |
315 |
316 |
317 |
318 |
319 |
320 |
321 |
322 |
323 |
324 |
325 |
326 |
327 |
328 |
329 |
330 |
331 |
332 |
333 |
334 |
335 |
336 |
337 |
338 |
339 |
340 |
341 |
342 |
343 |
344 |
345 |
346 |
347 |
348 |
349 |
350 |
351 |
352 |
353 |
354 |
355 |
356 |
357 |
358 |
359 |
360 |
361 |
362 |
363 |
364 |
365 |
366 |
367 |
368 |
369 |
370 |
371 |
372 |
373 |
374 |
375 |
376 |
377 |
378 |
379 |
380 |
381 |
382 |
383 |
384 |
385 |
386 |
387 |
388 |
389 |
390 |
391 |
392 |
393 |
394 |
395 |
396 |
397 |
398 |
399 |
400 |
401 |
402 |
403 |
404 |
405 |
406 |
407 |
408 |
409 |
410 |
411 |
412 |
413 |
414 |
415 |
416 |
417 |
418 |
419 |
420 |
421 |
422 |
423 |
424 |
425 |
426 |
427 |
428 |
429 |
430 |
431 |
432 |
433 |
434 |
435 |
436 |
437 |
438 |
439 |
440 |
441 |
442 |
443 |
444 |
445 |
446 |
447 |
448 |
449 |
450 |
451 |
452 |
453 |
454 |
455 |
456 |
457 |
458 |
459 |
460 |
461 |
462 |
463 |
464 |
465 |
466 |
467 |
468 |
469 |
470 |
471 |
472 |
473 |
474 |
475 |
476 |
477 |
478 |
479 |
480 |
481 |
482 |
483 |
484 |
485 |
486 |
487 |
488 |
489 |
490 |
491 |
492 |
493 |
494 |
495 |
496 |
497 |
498 |
499 |
500 |
501 |
502 |
503 |
504 |
505 |
506 |
507 |
508 |
509 |
510 |
511 |
512 |
513 |
514 |
515 |
516 |
517 |
518 |
519 |
520 |
521 |
522 |
523 |
524 |
525 |
526 |
527 |
528 |
529 |
530 |
531 |
532 |
533 |
534 |
535 |
536 |
537 |
538 |
539 |
540 |
541 |
542 |
543 |
544 |
545 |
546 |
547 |
548 |
549 |
550 |
551 |
552 |
553 |
554 |
555 |
556 |
557 |
558 |
559 |
560 |
561 |
562 |
563 |
564 |
565 |
566 |
567 |
568 |
569 |
570 |
571 |
572 |
573 |
574 |
575 |
576 |
577 |
578 |
579 |
580 |
581 |
582 |
583 |
584 |
585 |
586 |
587 |
588 |
589 |
590 |
591 |
592 |
593 |
594 |
595 |
596 |
597 |
598 |
599 |
600 |
601 |
602 |
603 |
604 |
605 |
606 |
607 |
608 | true
609 |
610 |
611 |
612 |
613 |
614 |
615 |
616 |
617 |
618 |
619 |
620 |
621 |
622 |
623 |
624 |
625 |
626 |
627 |
628 |
629 |
630 |
631 |
632 |
633 |
634 |
635 |
636 |
637 |
638 |
639 |
640 |
641 |
642 |
643 |
644 |
645 |
646 |
647 |
648 |
649 |
650 |
651 |
652 |
653 |
654 |
655 |
656 |
657 |
658 |
659 |
660 |
661 |
662 |
663 |
664 |
665 |
666 |
667 | 1456139843252
668 |
669 | 1456139843252
670 |
671 |
672 |
673 |
674 |
675 |
676 |
677 |
678 |
679 | 1457867886664
680 |
681 |
682 | 1457867886664
683 |
684 |
685 | 1466145967908
686 |
687 |
688 | 1466145967908
689 |
690 |
691 | 1466146054024
692 |
693 |
694 | 1466146054024
695 |
696 |
697 | 1466146179821
698 |
699 |
700 | 1466146179821
701 |
702 |
703 | 1466146906437
704 |
705 |
706 | 1466146906437
707 |
708 |
709 | 1466146915045
710 |
711 |
712 | 1466146915045
713 |
714 |
715 | 1466146957713
716 |
717 |
718 | 1466146957713
719 |
720 |
721 | 1466147087706
722 |
723 |
724 | 1466147087706
725 |
726 |
727 | 1466147586814
728 |
729 |
730 | 1466147586814
731 |
732 |
733 | 1466147857194
734 |
735 |
736 | 1466147857194
737 |
738 |
739 | 1466148227014
740 |
741 |
742 | 1466148227014
743 |
744 |
745 | 1466148864164
746 |
747 |
748 | 1466148864164
749 |
750 |
751 | 1466149332233
752 |
753 |
754 | 1466149332233
755 |
756 |
757 | 1466149417420
758 |
759 |
760 | 1466149417420
761 |
762 |
763 | 1466150842597
764 |
765 |
766 | 1466150842597
767 |
768 |
769 | 1466150994852
770 |
771 |
772 | 1466150994852
773 |
774 |
775 | 1466151549503
776 |
777 |
778 | 1466151549503
779 |
780 |
781 | 1466152245396
782 |
783 |
784 | 1466152245396
785 |
786 |
787 | 1466152511295
788 |
789 |
790 | 1466152511295
791 |
792 |
793 | 1466152620873
794 |
795 |
796 | 1466152620873
797 |
798 |
799 | 1466152664544
800 |
801 |
802 | 1466152664544
803 |
804 |
805 |
806 |
807 |
808 |
809 |
810 |
811 |
812 |
813 |
814 |
815 |
816 |
817 |
818 |
819 |
820 |
821 |
822 |
823 |
824 |
825 |
826 |
827 |
828 |
829 |
830 |
831 |
832 |
833 |
834 |
835 |
836 |
837 |
838 |
839 |
840 |
841 |
842 |
843 |
844 |
845 |
846 |
847 |
848 |
849 |
850 |
851 |
852 |
853 |
854 |
855 |
856 |
857 |
858 |
859 |
860 |
861 |
862 |
863 |
864 |
865 |
866 |
867 |
868 |
869 |
870 |
871 |
872 |
873 |
874 |
875 |
876 |
877 |
878 |
879 |
880 |
881 |
882 |
883 |
884 |
885 |
886 |
887 |
888 |
889 |
890 |
891 |
892 |
893 |
894 |
895 |
896 |
897 |
898 |
899 |
900 |
901 |
902 |
903 |
904 |
905 |
906 |
907 |
908 |
909 |
910 |
911 |
912 |
913 |
914 |
915 |
916 |
917 |
918 |
919 |
920 |
921 |
922 |
923 |
924 |
925 |
926 |
927 |
928 |
929 |
930 |
931 |
932 |
933 |
934 |
935 |
936 |
937 |
938 |
939 |
940 |
941 |
942 |
943 |
944 |
945 |
946 |
947 |
948 |
949 |
950 |
951 |
952 |
953 |
954 |
955 |
956 |
957 |
958 |
959 |
960 |
961 |
962 |
963 |
964 |
965 |
966 |
967 |
968 |
969 |
970 |
971 |
972 |
973 |
974 |
975 |
976 |
977 |
978 |
979 |
980 |
981 |
982 |
983 |
984 |
985 |
986 |
987 |
988 |
989 |
990 |
991 |
992 |
993 |
994 |
995 |
996 |
997 |
998 |
999 |
1000 |
1001 |
1002 |
1003 |
1004 |
1005 |
1006 |
1007 |
1008 |
1009 |
1010 |
1011 |
1012 |
1013 |
1014 |
1015 |
1016 |
1017 |
1018 |
1019 |
1020 |
1021 |
1022 |
1023 |
1024 |
1025 |
1026 |
1027 |
1028 |
1029 |
1030 |
1031 |
1032 |
1033 |
1034 |
1035 |
1036 |
1037 |
1038 |
1039 |
1040 |
1041 |
1042 |
1043 |
1044 |
1045 |
1046 |
1047 |
1048 |
1049 |
1050 |
1051 |
1052 |
1053 |
1054 |
1055 |
1056 |
1057 |
1058 |
1059 |
1060 |
1061 |
1062 |
1063 |
1064 |
1065 |
1066 |
1067 |
1068 |
1069 |
1070 |
1071 |
1072 |
1073 |
1074 |
1075 |
1076 |
1077 |
1078 |
1079 |
1080 |
1081 |
1082 |
1083 |
1084 |
1085 |
1086 |
1087 |
1088 |
1089 |
1090 |
1091 |
1092 |
1093 |
1094 |
1095 |
1096 |
1097 |
1098 |
1099 |
1100 |
1101 |
1102 |
1103 |
1104 |
1105 |
1106 |
1107 |
1108 |
1109 |
1110 |
1111 |
1112 |
1113 |
1114 |
1115 |
1116 |
1117 |
1118 |
1119 |
1120 |
1121 |
1122 |
1123 |
1124 |
1125 |
1126 |
1127 |
1128 |
1129 |
1130 |
1131 |
1132 |
1133 |
1134 |
1135 |
1136 |
1137 |
1138 |
1139 |
1140 |
1141 |
1142 |
1143 |
1144 |
1145 |
1146 |
1147 |
1148 |
1149 |
1150 |
1151 |
1152 |
1153 |
1154 |
1155 |
1156 |
1157 |
1158 |
1159 |
1160 |
1161 |
1162 |
1163 |
1164 |
1165 |
1166 |
1167 |
1168 |
1169 |
1170 |
1171 |
1172 |
1173 |
1174 |
1175 |
1176 |
1177 |
1178 |
1179 |
1180 |
1181 |
1182 |
1183 |
1184 |
1185 |
1186 |
1187 |
1188 |
1189 |
1190 |
1191 |
1192 |
1193 |
1194 |
1195 |
1196 |
1197 |
1198 |
1199 |
1200 |
1201 | No facets are configured
1202 |
1203 |
1204 |
1205 |
1206 |
1207 |
1208 |
1209 |
1210 |
1211 |
1212 |
1213 |
1214 |
1215 |
1216 |
1217 |
1218 |
1219 |
1220 |
1221 |
1222 |
1223 |
1224 | 1.7
1225 |
1226 |
1227 |
1228 |
1229 |
1230 |
1231 |
1232 |
1233 |
1234 |
1235 |
1236 | BloomFilter
1237 |
1238 |
1239 |
1240 |
1241 |
1242 |
1243 |
1244 |
1245 |
1246 |
1247 |
1248 |
1249 | 1.8
1250 |
1251 |
1252 |
1253 |
1254 |
1255 |
1256 |
1257 |
1258 |
1259 |
1260 |
1261 | Maven: com.google.guava:guava:19.0
1262 |
1263 |
1264 |
1265 |
1266 |
1267 |
1268 |
1269 |
1270 |
1271 |
1272 |
1273 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: java
2 | jdk:
3 | - oraclejdk8
--------------------------------------------------------------------------------
/BloomFilter.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2016 Xiaocheng Guo
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## BloomFilter
2 | [Build status (Travis CI)](https://travis-ci.org/letcheng/BloomFilter)
3 | [Release (JitPack)](https://jitpack.io/#letcheng/BloomFilter)
4 |
5 | Bloom Filter && Count Bloom Filter && Cached Bloom Filter
6 |
7 | ### Bloom Filter
8 |
9 | - 原理
10 |
11 | 
12 |
13 |
14 | - 性能
15 | + 添加元素: 0.943s, 53022.269353128315 元素/s
16 | + 测试已经存在的元素: 0.907s, 55126.79162072767 元素/s
17 | + 测试不存在的元素: 0.519s, 96339.11368015414 元素/s
18 |
19 | ### Cached Bloom Filter
20 |
21 | - 原理
22 |
23 | 
24 |
25 | > 可以进行高效缓存替换的 Bloom Filter 数据结构,可应用于爬虫的 URL 去重。在采用雪球爬行策略的过程中,一个爬虫线程在一定时间范围内遇到的链接往往集中在同一个 URL 集合中,这时可以采用 CachedBloomFilter 数据结构。
26 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 | 4.0.0
6 |
7 | com.github.letcheng
8 | BloomFilter
9 | 1.0
10 |
11 |
12 |
13 | junit
14 | junit
15 | 4.12
16 |
17 |
18 | com.google.guava
19 | guava
20 | 19.0
21 |
22 |
23 |
24 |
25 |
26 |
27 | org.apache.maven.plugins
28 | maven-compiler-plugin
29 |
30 | 1.8
31 | 1.8
32 |
33 |
34 |
35 |
36 |
37 |
--------------------------------------------------------------------------------
/src/main/java/com/ruyuapp/BloomFilter.java:
--------------------------------------------------------------------------------
1 | package com.ruyuapp;
2 |
3 | import java.nio.charset.Charset;
4 | import java.security.MessageDigest;
5 | import java.security.NoSuchAlgorithmException;
6 | import java.util.BitSet;
7 | import java.util.Collection;
8 |
/**
 * Bloom filter backed by a {@link BitSet}.
 *
 * <p>An element is reduced to bytes (the UTF-8 encoding of its {@code toString()} value, or a
 * caller-supplied byte array), hashed {@code k} times with a salted message digest, and each
 * hash value selects one of the {@code m} bits. {@link #contains(byte[])} may report false
 * positives but never false negatives for elements that were added and not cleared.
 *
 * <p>Only access to the shared {@link #digestFunction} is synchronized; the bit set and the
 * counters are not guarded by any lock.
 *
 * @param <E> element type; elements are hashed through their {@code toString()} form
 * @author Letcheng on 2016/2/23.
 */
public class BloomFilter<E> {

    protected BitSet bitset;
    protected int m;      // number of bits in the filter
    protected double c;   // bits per element when the filter holds n_max elements
    protected int n_max;  // maximum number of elements the filter is sized for
    protected int n;      // number of elements added so far
    protected int k;      // number of hash functions

    public static final Charset charset = Charset.forName("UTF-8");

    // In most cases MD5 gives good accuracy; SHA1 can be chosen instead.
    public static final String hashName = "MD5";
    public static final MessageDigest digestFunction;

    static {
        try {
            digestFunction = MessageDigest.getInstance(hashName);
        } catch (NoSuchAlgorithmException e) {
            // Fail at class-load time with the real cause instead of leaving the field null
            // and hitting a NullPointerException on the first hash computation.
            throw new IllegalStateException("Digest algorithm not available: " + hashName, e);
        }
    }

    /**
     * Creates a filter sized as {@code ceil(c * n_max)} bits.
     *
     * @param c     bits per element at full capacity
     * @param n_max maximum number of elements the filter is sized for
     * @param k     number of hash functions
     */
    public BloomFilter(double c, int n_max, int k) {
        this.n_max = n_max;
        this.k = k;
        this.c = c;
        this.m = (int) Math.ceil(c * n_max);
        this.n = 0;
        this.bitset = new BitSet(m);
    }

    /**
     * Creates a filter with an explicit bit count.
     *
     * @param m     number of bits
     * @param n_max maximum number of elements the filter is sized for
     * @param k     number of hash functions
     */
    public BloomFilter(int m, int n_max, int k) {
        this.n_max = n_max;
        this.k = k;
        this.m = m;
        this.c = 1.0d * m / n_max;
        this.n = 0;
        this.bitset = new BitSet(m);
    }

    /**
     * Creates a filter with {@code m} bits and derives the number of hash functions from
     * {@code k = ln(2) * (m / n_max)}, rounded to the nearest integer and never less than 1.
     *
     * <p>Delegates to the {@code (int m, int n_max, int k)} constructor so the bit count is
     * exactly {@code m}; going through the {@code double c} constructor would recompute it as
     * {@code ceil((m / n_max) * n_max)}, which floating-point rounding can push off by one.
     *
     * @param m     number of bits
     * @param n_max maximum number of elements the filter is sized for
     */
    public BloomFilter(int m, int n_max) {
        this(m,
             n_max,
             // k == 0 would make contains() return true for every input, so clamp to 1.
             Math.max(1, (int) Math.round((m / (double) n_max) * Math.log(2.0))));
    }

    /**
     * Most commonly used constructor: sizes the filter from a target false-positive rate.
     * Uses {@code k = ceil(-log2(fpp))} and {@code c = k / ln(2)}.
     *
     * @param fpp   desired false-positive probability
     * @param n_max maximum number of elements the filter is sized for
     */
    public BloomFilter(double fpp, int n_max) {
        this(Math.ceil(-(Math.log(fpp) / Math.log(2))) / Math.log(2), // c = k / ln(2)
             n_max,
             (int) Math.ceil(-(Math.log(fpp) / Math.log(2))));        // k = ceil(-log2(fpp))
    }

    /**
     * Produces {@code hashes} integer hash values for {@code data}.
     *
     * <p>Each digest is split into consecutive 4-byte groups, each read as a big-endian
     * {@code int}. When one digest does not yield enough values, the data is digested again
     * with the next salt byte (0, 1, 2, ...) fed in first.
     *
     * @param data   bytes to hash
     * @param hashes number of hash values to produce
     * @return array of {@code hashes} values; entries may be negative
     */
    public static int[] createHashes(byte[] data, int hashes) {
        int[] result = new int[hashes];

        int produced = 0;
        byte salt = 0;
        while (produced < hashes) {
            byte[] digest;
            // The MessageDigest instance is shared and stateful, so update+digest must be atomic.
            synchronized (digestFunction) {
                digestFunction.update(salt);
                salt++;
                digest = digestFunction.digest(data);
            }

            for (int group = 0; group < digest.length / 4 && produced < hashes; group++) {
                int h = 0;
                for (int j = group * 4; j < group * 4 + 4; j++) {
                    h <<= 8;
                    h |= ((int) digest[j]) & 0xFF;
                }
                result[produced] = h;
                produced++;
            }
        }
        return result;
    }

    /**
     * Computes the expected false-positive probability once {@code n_max} elements are stored.
     *
     * @return expected false-positive probability at full capacity
     */
    public double maxFpp() {
        return getFpp(n_max);
    }

    /**
     * Computes the expected false-positive probability for a given element count using
     * {@code (1 - e^(-k * n / m)) ^ k}.
     *
     * @param n number of stored elements to evaluate the formula for
     * @return expected false-positive probability
     */
    public double getFpp(double n) {
        return Math.pow((1 - Math.exp(-k * (double) n / (double) m)), k);
    }

    /**
     * Computes the expected false-positive probability for the current element count.
     *
     * @return expected false-positive probability
     */
    public double getFpp() {
        return getFpp(n);
    }

    /**
     * @return number of hash functions
     */
    public int getK() {
        return k;
    }

    /**
     * Resets the filter: clears every bit and sets the element count to zero.
     */
    public void clear() {
        bitset.clear();
        n = 0;
    }

    /**
     * Adds an object to the filter; the UTF-8 bytes of its {@code toString()} value are hashed.
     *
     * @param element object to add
     */
    public void add(E element) {
        add(element.toString().getBytes(charset));
    }

    /**
     * Adds a byte array to the filter by setting one bit per hash function.
     *
     * @param bytes bytes to add
     */
    public void add(byte[] bytes) {
        int[] hashes = createHashes(bytes, k);
        for (int hash : hashes) {
            bitset.set(Math.abs(hash % m), true);
        }
        n++; // one more element stored
    }

    /**
     * Adds every object of a collection to the filter.
     *
     * @param c elements to add
     */
    public void addAll(Collection<? extends E> c) {
        for (E element : c) {
            add(element);
        }
    }

    /**
     * Tests whether an object may have been added. A {@code true} result can be a false
     * positive; {@link #getFpp()} gives the expected probability of that.
     *
     * @param element object to look up
     * @return {@code false} if at least one of its bits is unset, {@code true} otherwise
     */
    public boolean contains(E element) {
        return contains(element.toString().getBytes(charset));
    }

    /**
     * Tests whether a byte array may have been added. A {@code true} result can be a false
     * positive; {@link #getFpp()} gives the expected probability of that.
     *
     * @param bytes bytes to look up
     * @return {@code false} if at least one of its bits is unset, {@code true} otherwise
     */
    public boolean contains(byte[] bytes) {
        int[] hashes = createHashes(bytes, k);
        for (int hash : hashes) {
            // A single unset bit proves the element was not added; all bits set proves nothing.
            if (!bitset.get(Math.abs(hash % m))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Tests a whole collection.
     *
     * @param c elements to check
     * @return {@code false} as soon as one element fails {@link #contains(Object)},
     *         {@code true} otherwise
     */
    public boolean containsAll(Collection<? extends E> c) {
        for (E element : c) {
            if (!contains(element)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Reads a single bit of the filter.
     *
     * @param bit bit index
     * @return value of that bit
     */
    public boolean getBit(int bit) {
        return bitset.get(bit);
    }

    /**
     * Writes a single bit of the filter.
     *
     * @param bit   bit index
     * @param value {@code true} to mark the bit as set, {@code false} to mark it as unset
     */
    public void setBit(int bit, boolean value) {
        bitset.set(bit, value);
    }

    /**
     * @return the backing bit set itself (not a copy)
     */
    public BitSet getBitSet() {
        return bitset;
    }

    /**
     * @return number of bits {@code m} in the filter
     */
    public int size() {
        return this.m;
    }

    /**
     * @return number of elements added so far
     */
    public int count() {
        return this.n;
    }

    /**
     * @return maximum number of elements the filter was sized for
     */
    public int getNMax() {
        return n_max;
    }

    /**
     * @return bits per element at full capacity, as fixed by the constructor
     */
    public double getC() {
        return this.c;
    }

    /**
     * Computes the bits per element for the current element count.
     *
     * @return {@code m / n}; positive infinity while the filter has {@code m > 0} and no elements
     */
    public double getBitsPerElement() {
        return this.m / (double) n;
    }
}
--------------------------------------------------------------------------------
/src/main/java/com/ruyuapp/CachedBloomFilter.java:
--------------------------------------------------------------------------------
1 | package com.ruyuapp;
2 |
3 | import java.util.*;
4 | import java.util.concurrent.LinkedBlockingQueue;
5 |
6 | /**
7 | *
8 | * 可以进行替换策略的Bloom Filter数据结构实现
9 | *
10 | * @author Letcheng on 2016/2/24.
11 | */
12 | public class CachedBloomFilter extends CountBloomFilter {
13 |
14 | private double t;
15 | private Queue elements = new LinkedBlockingQueue(); // 对应CBF存储的元素
16 |
/**
 * Creates a cached filter with an explicit bit count and sets the eviction threshold
 * {@code t} to {@code m / 14.5}.
 *
 * @param m     number of bits
 * @param n_max maximum number of elements the filter is sized for
 * @param k     number of hash functions
 */
public CachedBloomFilter(int m, int n_max, int k) {
    super(m, n_max, k);
    this.t = m / 14.5;
}
21 |
/**
 * Creates a cached filter sized from a target false-positive rate and sets the eviction
 * threshold {@code t} to {@code m / 14.5}, where {@code m} is the bit count chosen by the
 * superclass.
 *
 * @param fpp   desired false-positive probability
 * @param n_max maximum number of elements the filter is sized for
 */
public CachedBloomFilter(double fpp, int n_max) {
    super(fpp, n_max);
    this.t = m / 14.5;
}
26 |
27 | @Override
28 | public void add(E element) {
29 | elements.add(element);
30 | super.add(element);
31 | if(c_m > t){ // 执行移除策略
32 | int tmp = n/3;
33 | for(int i=0;i extends BloomFilter {
10 |
11 | protected int count[];
12 | protected int c_m = 0; // 当前使用的位数
/**
 * Creates a counting filter with an explicit bit count and one counter per bit.
 *
 * @param m     number of bits (and counters)
 * @param n_max maximum number of elements the filter is sized for
 * @param k     number of hash functions
 */
public CountBloomFilter(int m, int n_max, int k) {
    super(m, n_max, k);
    count = new int[super.m];
}
17 |
/**
 * Creates a counting filter sized from a target false-positive rate, with one counter per
 * bit of the superclass filter.
 *
 * @param fpp   desired false-positive probability
 * @param n_max maximum number of elements the filter is sized for
 */
public CountBloomFilter(double fpp, int n_max) {
    super(fpp, n_max);
    count = new int[super.m];
}
22 |
23 | @Override
24 | public void add(byte[] bytes) {
25 | int[] hashes = createHashes(bytes, k);
26 | for (int hash : hashes) {
27 | bitset.set(Math.abs(hash % m), true); //使用K个Hash函数映射到1位
28 | if(count[Math.abs(hash % m)]==0){
29 | c_m++;
30 | }
31 | count[Math.abs(hash % m)] ++;
32 | }
33 | n++;//添加了一个元素
34 | }
35 |
36 | /**
37 | * 移除元素
38 | * @param element
39 | */
40 | public void remove(E element) {
41 | if(element!=null){
42 | remove(element.toString().getBytes(charset));
43 | }
44 | }
45 |
46 | /**
47 | * 移除字节数组
48 | * @param bytes
49 | */
50 | public void remove(byte[] bytes){
51 | int[] hashes = createHashes(bytes, k);
52 | for (int hash : hashes) {
53 | if(--count[Math.abs(hash % m)] == 0){ //如果数据为空,则将标志位也归位
54 | c_m --;
55 | bitset.clear(Math.abs(hash % m));
56 | }
57 | }
58 | n--;
59 | }
60 | @Override
61 | public void clear() {
62 | this.count = new int[super.m];
63 | super.clear();
64 | }
65 |
66 | public int[] getCount(){
67 | return this.count;
68 | }
69 | }
70 |
--------------------------------------------------------------------------------
/src/main/resources/bloom-filter.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/letcheng/BloomFilter/73467cb61f4f48817cac2c1d54e11289f72f24c0/src/main/resources/bloom-filter.JPG
--------------------------------------------------------------------------------
/src/main/resources/cached-bloom-filter.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/letcheng/BloomFilter/73467cb61f4f48817cac2c1d54e11289f72f24c0/src/main/resources/cached-bloom-filter.JPG
--------------------------------------------------------------------------------
/src/test/java/com/ruyuapp/BloomFilterTest.java:
--------------------------------------------------------------------------------
1 | package com.ruyuapp;
2 |
3 | import com.google.common.hash.Funnels;
4 | import com.ruyuapp.BloomFilter;
5 | import org.junit.After;
6 | import org.junit.Before;
7 | import org.junit.Test;
8 |
9 | import java.nio.charset.Charset;
10 | import java.util.ArrayList;
11 | import java.util.List;
12 | import java.util.Random;
13 |
14 | /**
15 | * @author Letcheng on 2016/2/23.
16 | */
17 | public class BloomFilterTest {
18 |
19 | private int total = 100000; //测试元素的总数
20 |
21 | private List existingElements = null;
22 | private List nonExistingElements = null;
23 |
24 | private void printStat(long start, long end) {
25 | double diff = (end - start) / 1000.0;
26 | System.out.println(diff + "s, " + (total / diff) + " 元素/s");
27 | }
28 |
29 | @Before
30 | public void prepare(){
31 |
32 | final Random r = new Random();
33 | existingElements = new ArrayList(total);
34 | for (int i = 0; i < total; i++) {
35 | existingElements.add(Double.toString(r.nextDouble()));
36 | }
37 |
38 | nonExistingElements = new ArrayList(total);
39 | for (int i = 0; i < total; i++) {
40 | nonExistingElements.add(Double.toString(r.nextDouble()));
41 | }
42 |
43 | }
44 |
45 | @Test
46 | public void test(){
47 |
48 | double fpp = 0.001d;
49 |
50 | BloomFilter ruyu_bf = new BloomFilter(fpp, total);
51 | com.google.common.hash.BloomFilter google_bf = com.google.common.hash.BloomFilter.create(Funnels.stringFunnel(Charset.defaultCharset()), total, fpp);
52 |
53 | // 添加元素
54 | System.out.print("Ruyu Bloom Filter添加元素: ");
55 | long start = System.currentTimeMillis();
56 | for (int i = 0; i < total; i++) {
57 | ruyu_bf.add(existingElements.get(i));
58 | }
59 | long end = System.currentTimeMillis();
60 | printStat(start, end);
61 |
62 | System.out.print("Google Bloom Filter添加元素: ");
63 | start = System.currentTimeMillis();
64 | for (int i = 0; i < total; i++) {
65 | google_bf.put(existingElements.get(i));
66 | }
67 | end = System.currentTimeMillis();
68 | printStat(start, end);
69 |
70 | //测试已经存在的元素
71 | System.out.print("Ruyu Bloom Filter测试已经存在的元素: ");
72 | start = System.currentTimeMillis();
73 | for (int i = 0; i < total; i++) {
74 | ruyu_bf.contains(existingElements.get(i));
75 | }
76 | end = System.currentTimeMillis();
77 | printStat(start, end);
78 |
79 | System.out.print("Google Bloom Filter测试已经存在的元素: ");
80 | start = System.currentTimeMillis();
81 | for (int i = 0; i < total; i++) {
82 | ruyu_bf.contains(existingElements.get(i));
83 | }
84 | end = System.currentTimeMillis();
85 | printStat(start, end);
86 |
87 | //测试不存在的元素
88 | System.out.print("Ruyu Bloom Filter 测试不存在的元素: ");
89 | start = System.currentTimeMillis();
90 | for (int i = 0; i < total; i++) {
91 | ruyu_bf.contains(nonExistingElements.get(i));
92 | }
93 | end = System.currentTimeMillis();
94 | printStat(start, end);
95 |
96 | System.out.print("Google Bloom Filter 测试不存在的元素: ");
97 | start = System.currentTimeMillis();
98 | for (int i = 0; i < total; i++) {
99 | ruyu_bf.contains(nonExistingElements.get(i));
100 | }
101 | end = System.currentTimeMillis();
102 | printStat(start, end);
103 |
104 | }
105 |
106 | @Test
107 | public void test2(){
108 |
109 | int count = 0;
110 |
111 | CachedBloomFilter cbf = new CachedBloomFilter(0.01,total);
112 | //BloomFilter bf = new BloomFilter(0.01,total);
113 |
114 | existingElements.forEach(x->{
115 | cbf.add(x);
116 | //bf.add(x);
117 | });
118 |
119 | for (int i = 0; i < total; i++) {
120 | if(!cbf.contains(existingElements.get(i))){
121 | count++;
122 | };
123 | /*if(!bf.contains(existingElements.get(i))){
124 | count++;
125 | };*/
126 | }
127 | System.out.println(count);
128 | }
129 |
130 | }
131 |
--------------------------------------------------------------------------------
/target/classes/com/ruyuapp/BloomFilter.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/letcheng/BloomFilter/73467cb61f4f48817cac2c1d54e11289f72f24c0/target/classes/com/ruyuapp/BloomFilter.class
--------------------------------------------------------------------------------
/target/classes/com/ruyuapp/CachedBloomFilter.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/letcheng/BloomFilter/73467cb61f4f48817cac2c1d54e11289f72f24c0/target/classes/com/ruyuapp/CachedBloomFilter.class
--------------------------------------------------------------------------------
/target/classes/com/ruyuapp/CountBloomFilter.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/letcheng/BloomFilter/73467cb61f4f48817cac2c1d54e11289f72f24c0/target/classes/com/ruyuapp/CountBloomFilter.class
--------------------------------------------------------------------------------
/target/test-classes/com/ruyuapp/BloomFilterTest.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/letcheng/BloomFilter/73467cb61f4f48817cac2c1d54e11289f72f24c0/target/test-classes/com/ruyuapp/BloomFilterTest.class
--------------------------------------------------------------------------------