├── .gitignore ├── .idea ├── .name ├── compiler.xml ├── copyright │ └── profiles_settings.xml ├── libraries │ ├── Maven__com_google_guava_guava_19_0.xml │ ├── Maven__junit_junit_4_12.xml │ └── Maven__org_hamcrest_hamcrest_core_1_3.xml ├── misc.xml ├── modules.xml ├── uiDesigner.xml ├── vcs.xml └── workspace.xml ├── .travis.yml ├── BloomFilter.iml ├── LICENSE ├── README.md ├── pom.xml ├── src ├── main │ ├── java │ │ └── com │ │ │ └── ruyuapp │ │ │ ├── BloomFilter.java │ │ │ ├── CachedBloomFilter.java │ │ │ └── CountBloomFilter.java │ └── resources │ │ ├── bloom-filter.JPG │ │ └── cached-bloom-filter.JPG └── test │ └── java │ └── com │ └── ruyuapp │ └── BloomFilterTest.java └── target ├── classes └── com │ └── ruyuapp │ ├── BloomFilter.class │ ├── CachedBloomFilter.class │ └── CountBloomFilter.class └── test-classes └── com └── ruyuapp └── BloomFilterTest.class /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | 3 | # Mobile Tools for Java (J2ME) 4 | .mtj.tmp/ 5 | 6 | # Package Files # 7 | *.jar 8 | *.war 9 | *.ear 10 | 11 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 12 | hs_err_pid* 13 | -------------------------------------------------------------------------------- /.idea/.name: -------------------------------------------------------------------------------- 1 | BloomFilter -------------------------------------------------------------------------------- /.idea/compiler.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /.idea/copyright/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 
-------------------------------------------------------------------------------- /.idea/libraries/Maven__com_google_guava_guava_19_0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__junit_junit_4_12.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_hamcrest_hamcrest_core_1_3.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 12 | 15 | 16 | 17 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/uiDesigner.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 
99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/workspace.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 20 | 21 | 22 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 80 | 81 | 82 | 84 | 85 | 105 | 106 | 107 | 108 | true 109 | 110 | 111 | 116 | 121 | 122 | 123 | 125 | 126 | 127 | 128 | 129 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 166 | 167 | 168 | 169 | 172 | 173 | 176 | 177 | 178 | 179 | 182 | 183 | 186 | 187 | 190 | 191 | 192 | 193 | 196 | 197 | 200 | 201 | 204 | 205 | 208 | 209 | 212 | 213 | 214 | 215 | 218 | 219 | 222 | 223 | 226 | 227 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 267 | 268 | 269 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | 290 | 293 | 294 | 295 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | 319 | 320 | 321 | 335 | 336 | 337 | 338 | 339 | 340 | 341 | 342 | 345 | 346 | 347 | 361 | 362 | 363 | 364 | 365 | 366 | 367 | 368 | 371 | 372 | 373 | 387 | 388 | 389 | 390 | 391 | 392 | 393 | 397 | 398 | 404 | 405 | 406 | 419 | 420 | 421 | 438 | 439 | 460 | 
473 | 474 | 483 | 487 | 488 | 489 | 490 | 491 | 492 | 493 | 494 | 495 | 496 | 497 | 498 | 499 | 500 | 514 | 515 | 516 | 517 | 518 | 519 | 520 | 521 | 522 | 523 | 524 | 525 | 526 | 527 | 528 | 545 | 546 | 547 | 548 | 549 | 556 | 557 | 558 | 559 | 560 | 561 | 562 | 563 | 564 | 565 | 583 | 590 | 591 | 592 | 593 | 594 | 595 | 596 | 597 | 598 | 599 | 600 | 601 | 602 | 603 | 604 | 605 | 606 | 607 | 608 | true 609 | 610 | 611 | 612 | 613 | 614 | 615 | 616 | 617 | 618 | 619 | 620 | 639 | 640 | 641 | 642 | 643 | 644 | 645 | 646 | 647 | 648 | 649 | 650 | 651 | 652 | 653 | 654 | 655 | 656 | 657 | 658 | 659 | 660 | 661 | 662 | 663 | 664 | 665 | 666 | 667 | 1456139843252 668 | 678 | 679 | 1457867886664 680 | 684 | 685 | 1466145967908 686 | 690 | 691 | 1466146054024 692 | 696 | 697 | 1466146179821 698 | 702 | 703 | 1466146906437 704 | 708 | 709 | 1466146915045 710 | 714 | 715 | 1466146957713 716 | 720 | 721 | 1466147087706 722 | 726 | 727 | 1466147586814 728 | 732 | 733 | 1466147857194 734 | 738 | 739 | 1466148227014 740 | 744 | 745 | 1466148864164 746 | 750 | 751 | 1466149332233 752 | 756 | 757 | 1466149417420 758 | 762 | 763 | 1466150842597 764 | 768 | 769 | 1466150994852 770 | 774 | 775 | 1466151549503 776 | 780 | 781 | 1466152245396 782 | 786 | 787 | 1466152511295 788 | 792 | 793 | 1466152620873 794 | 798 | 799 | 1466152664544 800 | 804 | 807 | 808 | 810 | 811 | 812 | 813 | 814 | 815 | 816 | 817 | 818 | 819 | 820 | 821 | 822 | 823 | 824 | 825 | 826 | 827 | 828 | 829 | 830 | 831 | 832 | 833 | 834 | 835 | 836 | 837 | 838 | 839 | 840 | 841 | 842 | 843 | 844 | 845 | 846 | 847 | 848 | 849 | 850 | 853 | 856 | 857 | 858 | 860 | 861 | 862 | 863 | 864 | 865 | 866 | 867 | 869 | 870 | 871 | 873 | 874 | 875 | 876 | 879 | 880 | 881 | 882 | 883 | 884 | 885 | 886 | 887 | 888 | 889 | 890 | 891 | 892 | 893 | 894 | 895 | 896 | 897 | 898 | 899 | 900 | 901 | 902 | 903 | 904 | 905 | 906 | 907 | 908 | 909 | 910 | 911 | 912 | 913 | 914 | 915 | 916 | 917 | 918 | 919 | 920 | 921 | 922 | 923 | 924 | 
925 | 926 | 927 | 928 | 929 | 930 | 931 | 932 | 933 | 934 | 935 | 936 | 937 | 938 | 939 | 940 | 941 | 942 | 943 | 944 | 945 | 946 | 947 | 948 | 949 | 950 | 951 | 952 | 953 | 954 | 955 | 956 | 957 | 958 | 959 | 960 | 961 | 962 | 963 | 964 | 965 | 966 | 967 | 968 | 969 | 970 | 971 | 972 | 973 | 974 | 975 | 976 | 977 | 978 | 979 | 980 | 981 | 982 | 983 | 984 | 985 | 986 | 987 | 988 | 989 | 990 | 991 | 992 | 993 | 994 | 995 | 996 | 997 | 998 | 999 | 1000 | 1001 | 1002 | 1003 | 1004 | 1005 | 1006 | 1007 | 1008 | 1009 | 1010 | 1011 | 1012 | 1013 | 1014 | 1015 | 1016 | 1017 | 1018 | 1019 | 1020 | 1021 | 1022 | 1023 | 1024 | 1025 | 1026 | 1027 | 1028 | 1029 | 1030 | 1031 | 1032 | 1033 | 1034 | 1035 | 1036 | 1037 | 1038 | 1039 | 1040 | 1041 | 1042 | 1043 | 1044 | 1045 | 1046 | 1047 | 1048 | 1049 | 1050 | 1051 | 1052 | 1053 | 1054 | 1055 | 1056 | 1057 | 1058 | 1059 | 1060 | 1061 | 1062 | 1063 | 1064 | 1065 | 1066 | 1067 | 1068 | 1069 | 1070 | 1071 | 1072 | 1073 | 1074 | 1075 | 1076 | 1077 | 1078 | 1079 | 1080 | 1081 | 1082 | 1083 | 1084 | 1085 | 1086 | 1087 | 1088 | 1089 | 1090 | 1091 | 1092 | 1093 | 1094 | 1095 | 1096 | 1097 | 1098 | 1099 | 1100 | 1101 | 1102 | 1103 | 1104 | 1105 | 1106 | 1107 | 1108 | 1109 | 1110 | 1111 | 1112 | 1113 | 1114 | 1115 | 1116 | 1117 | 1118 | 1119 | 1120 | 1121 | 1122 | 1123 | 1124 | 1125 | 1126 | 1127 | 1128 | 1129 | 1130 | 1131 | 1132 | 1133 | 1134 | 1135 | 1136 | 1137 | 1138 | 1139 | 1140 | 1141 | 1142 | 1143 | 1144 | 1145 | 1146 | 1147 | 1148 | 1149 | 1150 | 1151 | 1152 | 1153 | 1154 | 1155 | 1156 | 1157 | 1158 | 1159 | 1160 | 1161 | 1162 | 1163 | 1164 | 1165 | 1166 | 1167 | 1168 | 1169 | 1170 | 1171 | 1172 | 1173 | 1174 | 1175 | 1176 | 1177 | 1178 | 1179 | 1180 | 1181 | 1182 | 1183 | 1184 | 1185 | 1186 | 1187 | 1188 | 1189 | 1190 | 1191 | 1196 | 1197 | 1198 | 1199 | 1200 | 1201 | No facets are configured 1202 | 1203 | 1208 | 1209 | 1210 | 1211 | 1212 | 1213 | 1214 | 1219 | 1220 | 1221 | 1222 | 1223 | 1224 | 1.7 1225 | 1226 | 1231 | 1232 | 
1233 | 1234 | 1235 | 1236 | BloomFilter 1237 | 1238 | 1244 | 1245 | 1246 | 1247 | 1248 | 1249 | 1.8 1250 | 1251 | 1256 | 1257 | 1258 | 1259 | 1260 | 1261 | Maven: com.google.guava:guava:19.0 1262 | 1263 | 1268 | 1269 | 1270 | 1271 | 1272 | 1273 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: java 2 | jdk: 3 | - oraclejdk8 -------------------------------------------------------------------------------- /BloomFilter.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Xiaocheng Guo 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## BloomFilter 2 | [![Build Status](https://travis-ci.org/letcheng/BloomFilter.svg?branch=master)](https://travis-ci.org/letcheng/BloomFilter) 3 | [![Release](https://jitpack.io/v/letcheng/BloomFilter.svg)](https://jitpack.io/#letcheng/BloomFilter) 4 | 5 | Bloom Filter && Count Bloom Filter && Cached Bloom Filter 6 | 7 | ### Bloom Filter 8 | 9 | - 原理 10 | 11 | ![image](https://github.com/letcheng/BloomFilter/raw/master/src/main/resources/bloom-filter.JPG) 12 | 13 | 14 | - 性能 15 | + 添加元素: 0.943s, 53022.269353128315 元素/s 16 | + 测试已经存在的元素: 0.907s, 55126.79162072767 元素/s 17 | + 测试不存在的元素: 0.519s, 96339.11368015414 元素/s 18 | 19 | ### Cached Bloom Filter 20 | 21 | - 原理 22 | 23 | ![image](https://github.com/letcheng/BloomFilter/raw/master/src/main/resources/cached-bloom-filter.JPG) 24 | 25 | > 可以进行高效缓存替换的Bloom Filter数据结构,可以应用于爬虫的URL去重中,在以雪球爬行策略过程中,一个爬虫线程在一定时间范围内遇到的链接集中在一个URL集合中的。这时可以采用CachedBloomFilter数据结构。 26 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | com.github.letcheng 8 | BloomFilter 9 | 1.0 10 | 11 | 12 | 13 | junit 14 | junit 15 | 4.12 16 | 17 | 18 | com.google.guava 19 | guava 20 | 19.0 21 | 22 | 23 | 24 | 25 | 26 | 27 | org.apache.maven.plugins 28 | maven-compiler-plugin 29 | 30 | 1.8 31 | 1.8 32 | 33 | 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /src/main/java/com/ruyuapp/BloomFilter.java: 
import java.nio.charset.Charset;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.BitSet;
import java.util.Collection;

/**
 * Bloom filter backed by a {@link BitSet}.
 *
 * <p>Elements are hashed with {@code k} salted MD5-derived hash values, each of which selects
 * one of {@code m} bits. A lookup that finds any of those bits unset proves the element was
 * never added; a lookup that finds them all set may be a false positive.
 *
 * <p>Hash creation synchronizes on the shared {@link #digestFunction}; the bit set and the
 * counters are not synchronized, so concurrent mutation needs external locking.
 *
 * @param <E> element type; elements are hashed via their {@code toString()} representation
 * @author Letcheng on 2016/2/23.
 */
public class BloomFilter<E> {

    protected BitSet bitset;
    protected int m;        // number of bits in the filter
    protected double c;     // bits per element when the filter holds n_max elements
    protected int n_max;    // maximum number of elements the filter was sized for
    protected int n;        // number of elements added so far
    protected int k;        // number of hash functions

    public static final Charset charset = Charset.forName("UTF-8");

    // MD5 gives a good distribution in most cases; SHA1 could be used instead.
    public static final String hashName = "MD5";
    public static final MessageDigest digestFunction = newDigest();

    /**
     * Creates the shared digest instance.
     *
     * <p>Fails loudly instead of leaving {@link #digestFunction} null, which would only
     * surface later as a NullPointerException inside {@link #createHashes(byte[], int)}.
     */
    private static MessageDigest newDigest() {
        try {
            return MessageDigest.getInstance(hashName);
        } catch (NoSuchAlgorithmException e) {
            throw new IllegalStateException("Hash algorithm not available: " + hashName, e);
        }
    }

    /** Rejects dimensions that cannot produce a working filter. */
    private static void checkDimensions(int m, int k) {
        if (m <= 0) {
            throw new IllegalArgumentException("number of bits must be positive, got " + m);
        }
        if (k <= 0) {
            throw new IllegalArgumentException("number of hash functions must be positive, got " + k);
        }
    }

    /** Optimal k = ln(2) * (m / n_max), never less than one hash function. */
    private static int optimalK(int m, int n_max) {
        return Math.max(1, (int) Math.round((m / (double) n_max) * Math.log(2.0)));
    }

    /** k = ceil(-log2(fpp)); requires 0 < fpp < 1. */
    private static int hashCountFor(double fpp) {
        if (!(fpp > 0.0 && fpp < 1.0)) {
            throw new IllegalArgumentException("fpp must be in (0, 1), got " + fpp);
        }
        return (int) Math.ceil(-(Math.log(fpp) / Math.log(2)));
    }

    /**
     * Builds a filter from a bits-per-element ratio.
     *
     * @param c     bits per element; the filter gets {@code ceil(c * n_max)} bits
     * @param n_max maximum number of elements the filter is sized for
     * @param k     number of hash functions
     * @throws IllegalArgumentException if the resulting bit count or {@code k} is not positive
     */
    public BloomFilter(double c, int n_max, int k) {
        this.n_max = n_max;
        this.k = k;
        this.c = c;
        this.m = (int) Math.ceil(c * n_max);
        this.n = 0;
        checkDimensions(this.m, this.k);
        this.bitset = new BitSet(this.m);
    }

    /**
     * Builds a filter with an explicit bit count.
     *
     * @param m     number of bits
     * @param n_max maximum number of elements the filter is sized for
     * @param k     number of hash functions
     * @throws IllegalArgumentException if {@code m} or {@code k} is not positive
     */
    public BloomFilter(int m, int n_max, int k) {
        this.n_max = n_max;
        this.k = k;
        this.m = m;
        this.c = 1.0d * m / n_max;
        this.n = 0;
        checkDimensions(this.m, this.k);
        this.bitset = new BitSet(this.m);
    }

    /**
     * Builds a filter with an explicit bit count and the optimal number of hash functions,
     * {@code k = ln(2) * (m / n_max)}, clamped to at least 1.
     *
     * <p>Delegates to the {@code (int, int, int)} constructor so that {@code m} is kept
     * exactly rather than being recomputed through a floating-point round trip.
     *
     * @param m     number of bits
     * @param n_max maximum number of elements the filter is sized for
     * @throws IllegalArgumentException if {@code m} is not positive
     */
    public BloomFilter(int m, int n_max) {
        this(m, n_max, optimalK(m, n_max));
    }

    /**
     * The most commonly used constructor: sizes the filter for a target false-positive rate.
     * Uses {@code k = ceil(-log2(fpp))} and {@code c = k / ln(2)}.
     *
     * @param fpp   desired false-positive probability, strictly between 0 and 1
     * @param n_max maximum number of elements the filter is sized for
     * @throws IllegalArgumentException if {@code fpp} is outside (0, 1) or the resulting
     *                                  bit count is not positive
     */
    public BloomFilter(double fpp, int n_max) {
        this(hashCountFor(fpp) / Math.log(2), n_max, hashCountFor(fpp));
    }

    /**
     * Produces {@code hashes} 32-bit hash values for {@code data}.
     *
     * <p>Each digest round is salted with an incrementing byte and yields
     * {@code digest.length / 4} big-endian ints; rounds repeat until enough values exist.
     *
     * @param data   bytes to hash
     * @param hashes number of hash values to produce
     * @return array of {@code hashes} values (may be negative)
     */
    public static int[] createHashes(byte[] data, int hashes) {
        int[] result = new int[hashes];

        int k = 0;
        byte salt = 0;
        while (k < hashes) {
            byte[] digest;
            // MessageDigest is stateful, so the shared instance is used under a lock.
            synchronized (digestFunction) {
                digestFunction.update(salt);
                salt++;
                digest = digestFunction.digest(data);
            }

            for (int i = 0; i < digest.length / 4 && k < hashes; i++) {
                int h = 0;
                for (int j = (i * 4); j < (i * 4) + 4; j++) {
                    h <<= 8;
                    h |= ((int) digest[j]) & 0xFF;
                }
                result[k] = h;
                k++;
            }
        }
        return result;
    }

    /**
     * @return the expected false-positive rate once {@code n_max} elements have been added
     */
    public double maxFpp() {
        return getFpp(n_max);
    }

    /**
     * Expected false-positive rate for a given element count: {@code (1 - e^(-k*n/m))^k}.
     *
     * @param n number of elements assumed to be in the filter
     * @return the expected false-positive probability
     */
    public double getFpp(double n) {
        return Math.pow((1 - Math.exp(-k * (double) n / (double) m)), k);
    }

    /**
     * @return the expected false-positive rate for the current element count
     */
    public double getFpp() {
        return getFpp(n);
    }

    /**
     * @return the number of hash functions
     */
    public int getK() {
        return k;
    }

    /**
     * Resets the filter: clears every bit and the element counter.
     */
    public void clear() {
        bitset.clear();
        n = 0;
    }

    /**
     * Adds an element; its {@code toString()} value, encoded as UTF-8, is what gets hashed.
     *
     * @param element element to add; must not be null
     * @throws NullPointerException if {@code element} is null
     */
    public void add(E element) {
        add(element.toString().getBytes(charset));
    }

    /**
     * Adds a byte array to the filter.
     *
     * @param bytes bytes to hash and record
     */
    public void add(byte[] bytes) {
        int[] hashes = createHashes(bytes, k);
        for (int hash : hashes) {
            // |hash % m| < m, so Math.abs cannot overflow here.
            bitset.set(Math.abs(hash % m), true);
        }
        n++; // one more element recorded
    }

    /**
     * Adds every element of a collection.
     *
     * @param c elements to add
     */
    public void addAll(Collection<? extends E> c) {
        for (E element : c) {
            add(element);
        }
    }

    /**
     * Tests whether an element may have been added. A {@code false} result is definitive;
     * a {@code true} result can be a false positive (see {@link #getFpp()}).
     *
     * @param element element to look up; must not be null
     * @return false if the element was definitely never added, true otherwise
     * @throws NullPointerException if {@code element} is null
     */
    public boolean contains(E element) {
        return contains(element.toString().getBytes(charset));
    }

    /**
     * Tests whether a byte array may have been added. A {@code false} result is definitive;
     * a {@code true} result can be a false positive (see {@link #getFpp()}).
     *
     * @param bytes bytes to look up
     * @return false if any of the k bits is unset, true otherwise
     */
    public boolean contains(byte[] bytes) {
        int[] hashes = createHashes(bytes, k);
        for (int hash : hashes) {
            if (!bitset.get(Math.abs(hash % m))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Tests whether every element of a collection may have been added.
     *
     * @param c elements to check
     * @return false as soon as one element is definitely absent, true otherwise
     */
    public boolean containsAll(Collection<? extends E> c) {
        for (E element : c) {
            if (!contains(element)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Reads a single bit of the filter.
     *
     * @param bit bit index
     * @return the value of that bit
     */
    public boolean getBit(int bit) {
        return bitset.get(bit);
    }

    /**
     * Writes a single bit of the filter.
     *
     * @param bit   bit index
     * @param value true to mark the bit as set, false to clear it
     */
    public void setBit(int bit, boolean value) {
        bitset.set(bit, value);
    }

    /**
     * @return the backing bit set itself (not a copy)
     */
    public BitSet getBitSet() {
        return bitset;
    }

    /**
     * @return the number of bits in the filter
     */
    public int size() {
        return this.m;
    }

    /**
     * @return the number of elements added so far
     */
    public int count() {
        return this.n;
    }

    /**
     * @return the maximum number of elements the filter was sized for
     */
    public int getNMax() {
        return n_max;
    }

    /**
     * @return bits per element when the filter is full, as fixed at construction time
     */
    public double getC() {
        return this.c;
    }

    /**
     * @return bits per element for the current element count; the division is done in
     *         floating point, so an empty filter yields {@code Infinity}
     */
    public double getBitsPerElement() {
        return this.m / (double) n;
    }
}
11 | */ 12 | public class CachedBloomFilter extends CountBloomFilter { 13 | 14 | private double t; 15 | private Queue elements = new LinkedBlockingQueue(); // 对应CBF存储的元素 16 | 17 | public CachedBloomFilter(int m,int n_max,int k){ 18 | super(m,n_max,k); 19 | t = m/14.5; 20 | } 21 | 22 | public CachedBloomFilter(double fpp, int n_max) { 23 | super(fpp, n_max); 24 | t = m/14.5; 25 | } 26 | 27 | @Override 28 | public void add(E element) { 29 | elements.add(element); 30 | super.add(element); 31 | if(c_m > t){ // 执行移除策略 32 | int tmp = n/3; 33 | for(int i=0;i extends BloomFilter { 10 | 11 | protected int count[]; 12 | protected int c_m = 0; // 当前使用的位数 13 | public CountBloomFilter(int m,int n_max,int k){ 14 | super(m,n_max,k); 15 | this.count = new int[super.m]; 16 | } 17 | 18 | public CountBloomFilter(double fpp, int n_max) { 19 | super(fpp, n_max); 20 | this.count = new int[super.m]; 21 | } 22 | 23 | @Override 24 | public void add(byte[] bytes) { 25 | int[] hashes = createHashes(bytes, k); 26 | for (int hash : hashes) { 27 | bitset.set(Math.abs(hash % m), true); //使用K个Hash函数映射到1位 28 | if(count[Math.abs(hash % m)]==0){ 29 | c_m++; 30 | } 31 | count[Math.abs(hash % m)] ++; 32 | } 33 | n++;//添加了一个元素 34 | } 35 | 36 | /** 37 | * 移除元素 38 | * @param element 39 | */ 40 | public void remove(E element) { 41 | if(element!=null){ 42 | remove(element.toString().getBytes(charset)); 43 | } 44 | } 45 | 46 | /** 47 | * 移除字节数组 48 | * @param bytes 49 | */ 50 | public void remove(byte[] bytes){ 51 | int[] hashes = createHashes(bytes, k); 52 | for (int hash : hashes) { 53 | if(--count[Math.abs(hash % m)] == 0){ //如果数据为空,则将标志位也归位 54 | c_m --; 55 | bitset.clear(Math.abs(hash % m)); 56 | } 57 | } 58 | n--; 59 | } 60 | @Override 61 | public void clear() { 62 | this.count = new int[super.m]; 63 | super.clear(); 64 | } 65 | 66 | public int[] getCount(){ 67 | return this.count; 68 | } 69 | } 70 | -------------------------------------------------------------------------------- 
/src/main/resources/bloom-filter.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/letcheng/BloomFilter/73467cb61f4f48817cac2c1d54e11289f72f24c0/src/main/resources/bloom-filter.JPG -------------------------------------------------------------------------------- /src/main/resources/cached-bloom-filter.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/letcheng/BloomFilter/73467cb61f4f48817cac2c1d54e11289f72f24c0/src/main/resources/cached-bloom-filter.JPG -------------------------------------------------------------------------------- /src/test/java/com/ruyuapp/BloomFilterTest.java: -------------------------------------------------------------------------------- 1 | package com.ruyuapp; 2 | 3 | import com.google.common.hash.Funnels; 4 | import com.ruyuapp.BloomFilter; 5 | import org.junit.After; 6 | import org.junit.Before; 7 | import org.junit.Test; 8 | 9 | import java.nio.charset.Charset; 10 | import java.util.ArrayList; 11 | import java.util.List; 12 | import java.util.Random; 13 | 14 | /** 15 | * @author Letcheng on 2016/2/23. 
16 | */ 17 | public class BloomFilterTest { 18 | 19 | private int total = 100000; //测试元素的总数 20 | 21 | private List existingElements = null; 22 | private List nonExistingElements = null; 23 | 24 | private void printStat(long start, long end) { 25 | double diff = (end - start) / 1000.0; 26 | System.out.println(diff + "s, " + (total / diff) + " 元素/s"); 27 | } 28 | 29 | @Before 30 | public void prepare(){ 31 | 32 | final Random r = new Random(); 33 | existingElements = new ArrayList(total); 34 | for (int i = 0; i < total; i++) { 35 | existingElements.add(Double.toString(r.nextDouble())); 36 | } 37 | 38 | nonExistingElements = new ArrayList(total); 39 | for (int i = 0; i < total; i++) { 40 | nonExistingElements.add(Double.toString(r.nextDouble())); 41 | } 42 | 43 | } 44 | 45 | @Test 46 | public void test(){ 47 | 48 | double fpp = 0.001d; 49 | 50 | BloomFilter ruyu_bf = new BloomFilter(fpp, total); 51 | com.google.common.hash.BloomFilter google_bf = com.google.common.hash.BloomFilter.create(Funnels.stringFunnel(Charset.defaultCharset()), total, fpp); 52 | 53 | // 添加元素 54 | System.out.print("Ruyu Bloom Filter添加元素: "); 55 | long start = System.currentTimeMillis(); 56 | for (int i = 0; i < total; i++) { 57 | ruyu_bf.add(existingElements.get(i)); 58 | } 59 | long end = System.currentTimeMillis(); 60 | printStat(start, end); 61 | 62 | System.out.print("Google Bloom Filter添加元素: "); 63 | start = System.currentTimeMillis(); 64 | for (int i = 0; i < total; i++) { 65 | google_bf.put(existingElements.get(i)); 66 | } 67 | end = System.currentTimeMillis(); 68 | printStat(start, end); 69 | 70 | //测试已经存在的元素 71 | System.out.print("Ruyu Bloom Filter测试已经存在的元素: "); 72 | start = System.currentTimeMillis(); 73 | for (int i = 0; i < total; i++) { 74 | ruyu_bf.contains(existingElements.get(i)); 75 | } 76 | end = System.currentTimeMillis(); 77 | printStat(start, end); 78 | 79 | System.out.print("Google Bloom Filter测试已经存在的元素: "); 80 | start = System.currentTimeMillis(); 81 | for (int i = 0; i < 
total; i++) { 82 | ruyu_bf.contains(existingElements.get(i)); 83 | } 84 | end = System.currentTimeMillis(); 85 | printStat(start, end); 86 | 87 | //测试不存在的元素 88 | System.out.print("Ruyu Bloom Filter 测试不存在的元素: "); 89 | start = System.currentTimeMillis(); 90 | for (int i = 0; i < total; i++) { 91 | ruyu_bf.contains(nonExistingElements.get(i)); 92 | } 93 | end = System.currentTimeMillis(); 94 | printStat(start, end); 95 | 96 | System.out.print("Google Bloom Filter 测试不存在的元素: "); 97 | start = System.currentTimeMillis(); 98 | for (int i = 0; i < total; i++) { 99 | ruyu_bf.contains(nonExistingElements.get(i)); 100 | } 101 | end = System.currentTimeMillis(); 102 | printStat(start, end); 103 | 104 | } 105 | 106 | @Test 107 | public void test2(){ 108 | 109 | int count = 0; 110 | 111 | CachedBloomFilter cbf = new CachedBloomFilter(0.01,total); 112 | //BloomFilter bf = new BloomFilter(0.01,total); 113 | 114 | existingElements.forEach(x->{ 115 | cbf.add(x); 116 | //bf.add(x); 117 | }); 118 | 119 | for (int i = 0; i < total; i++) { 120 | if(!cbf.contains(existingElements.get(i))){ 121 | count++; 122 | }; 123 | /*if(!bf.contains(existingElements.get(i))){ 124 | count++; 125 | };*/ 126 | } 127 | System.out.println(count); 128 | } 129 | 130 | } 131 | -------------------------------------------------------------------------------- /target/classes/com/ruyuapp/BloomFilter.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/letcheng/BloomFilter/73467cb61f4f48817cac2c1d54e11289f72f24c0/target/classes/com/ruyuapp/BloomFilter.class -------------------------------------------------------------------------------- /target/classes/com/ruyuapp/CachedBloomFilter.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/letcheng/BloomFilter/73467cb61f4f48817cac2c1d54e11289f72f24c0/target/classes/com/ruyuapp/CachedBloomFilter.class 
-------------------------------------------------------------------------------- /target/classes/com/ruyuapp/CountBloomFilter.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/letcheng/BloomFilter/73467cb61f4f48817cac2c1d54e11289f72f24c0/target/classes/com/ruyuapp/CountBloomFilter.class -------------------------------------------------------------------------------- /target/test-classes/com/ruyuapp/BloomFilterTest.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/letcheng/BloomFilter/73467cb61f4f48817cac2c1d54e11289f72f24c0/target/test-classes/com/ruyuapp/BloomFilterTest.class --------------------------------------------------------------------------------