├── .gitmodules ├── .gitignore ├── test ├── CMakeLists.txt ├── testdata │ ├── news_content.3 │ ├── news_content.4 │ ├── news_content │ └── news_content.2 ├── unittest │ ├── TJenkins.cpp │ ├── CMakeLists.txt │ ├── gtest_main.cpp │ └── TSimhash.cpp └── load_test.cpp ├── benchmark ├── lib │ ├── utils.h │ ├── vals.h │ └── Simhasher_benchmark.h ├── CMakeLists.txt └── benchmarking.cpp ├── .github └── workflows │ ├── stale-issues.yml │ └── cmake.yml ├── CMakeLists.txt ├── README.md └── include └── simhash ├── Simhasher.hpp └── jenkins.h /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "submodules/cppjieba"] 2 | path = submodules/cppjieba 3 | url = https://github.com/yanyiwu/cppjieba.git 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | tags 2 | *swp 3 | *.out 4 | *.o 5 | *.d 6 | *.ut 7 | log 8 | *.log 9 | *.pyc 10 | *.pyo 11 | *.a 12 | t.cpp 13 | build 14 | *.un~ 15 | -------------------------------------------------------------------------------- /test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/test) 2 | 3 | ADD_EXECUTABLE(load_test load_test.cpp) 4 | ADD_SUBDIRECTORY(unittest) 5 | -------------------------------------------------------------------------------- /test/testdata/news_content.3: -------------------------------------------------------------------------------- 1 |  中新网金华11月13日电(见习记者 胡丰盛 实习生 李婷婷)13日,浙江省金华市金东公安分局发布通报称:11月7日17时许,该区多湖街道里秧田村的一对夫妻发生争吵,女方往自己身上浇汽油,并溅到了怀里的4岁儿子多多,之后,在客厅里燃烧的煤炉不慎将汽油点燃,一家三人被大面积烧伤,儿子多多重伤身亡。   据悉,这对夫妇名为张秋生和沈丹,分别是48岁和40岁。两人都是各自离过婚再组成家庭的,婚后生下一个儿子,小名叫多多,今年4岁,在上幼儿园。   然而婚后,这对夫妇家庭关系却不和睦。近日,张秋生提出要离婚,而沈丹则提出条件,要把儿子多多一起带走,对此张秋生坚决不同意。   
11月7日下午16时左右,沈丹将汽油灌入饮料瓶中,欲以自杀威胁张秋生。但二人随后言语不合,发生争吵。争吵过程中,女方拿出汽油往自己身上浇,并溅到了她抱着的儿子身上。两人推搡过程中,在客厅里燃烧的煤炉不慎将汽油点燃,并发生大火,造成了三人不同面积的烧伤。   随后,金华金东区多湖派出所民警在接到报警后迅速赶到现场,立即将三人送往医院救治,并进行立案调查。   据了解,10日,多多因伤重不幸身亡,张秋生、沈丹夫妇仍在医院治疗。   11月11日,沈丹因涉嫌过失致人死亡被金东公安分局监视居住 2 | -------------------------------------------------------------------------------- /test/testdata/news_content.4: -------------------------------------------------------------------------------- 1 |  中新网金华11月13日电(见习记者 胡丰盛 实习生 李婷婷)13日,浙江省金华市金东公安分局发布通报称:11月7日17时许,该区多湖街道里秧田村的一对夫妻发生争吵,女方往自己身上浇汽油,并溅到了怀里的4岁儿子多多,之后,在客厅里燃烧的煤炉不慎将汽油点燃,一家三人被大面积烧伤,儿子多多重伤身亡。   据悉,这对夫妇名为张秋生和沈丹,分别是48岁和40岁。两人都是各自离过婚再组成家庭的,婚后生下一个儿子,小名叫多多,今年4岁,在上幼儿园。   然而婚后,这对夫妇家庭关系却不和睦。近日,张秋生提出要离婚,而沈丹则提出条件,要把儿子多多一起带走,对此张秋生坚决不同意。   11月7日下午16时左右,沈丹将汽油灌入饮料瓶中,欲以自杀威胁张秋生。但二人随后言语不合,发生争吵。争吵过程中,女方拿出汽油往自己身上浇,并溅到了她抱着的儿子身上。两人推搡过程中,在客厅里燃烧的煤炉不慎将汽油点燃,并发生大火,造成了三人不同面积的烧伤。   随后,金华金东区多湖派出所民警在接到报警后迅速赶到现场,立即将三人送往医院救治,并进行立案调查。   据了解,10日,多多因伤重不幸身亡,张秋生、沈丹夫妇仍在医院治疗。   11月11日,沈丹因涉嫌过失致人死亡被金东公安分局监视居住。 2 | -------------------------------------------------------------------------------- /test/unittest/TJenkins.cpp: -------------------------------------------------------------------------------- 1 | #include "simhash/jenkins.h" 2 | #include "gtest/gtest.h" 3 | #include 4 | #include 5 | #include 6 | 7 | using namespace std; 8 | using namespace simhash; 9 | 10 | TEST(JenkinsTest, Test1) 11 | { 12 | jenkins hasher; 13 | ifstream ifs("../test/testdata/news_content"); 14 | string doc((istreambuf_iterator(ifs)), istreambuf_iterator()); 15 | //cout<< doc< 9 | #include 10 | #include 11 | 12 | 13 | namespace simhash_benchmark { 14 | 15 | 16 | std::vector > fake_keywords( 17 | int32_t num) { 18 | std::vector > keywords; 19 | double score = 1.0 / (double)num; 20 | for (int32_t i = 0; i < num; ++i) { 21 | std::string keyword = "fake_" + std::to_string(i); 22 | keywords.emplace_back(std::make_pair(keyword, score)); 23 | } 24 | return keywords; 25 | } 26 | 27 | 28 | } // namespace simhash_benchmark 29 | 30 | 31 | 
#endif 32 | -------------------------------------------------------------------------------- /benchmark/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # file: CMakeLists.txt 2 | # date: 2022-01-18 3 | # Builds the optional `benchmarking` executable; only active when BENCHMARK is set and CMake is newer than 3.16.0. 4 | 5 | if(${CMAKE_VERSION} VERSION_GREATER "3.16.0" 6 | AND BENCHMARK) 7 | message( 8 | STATUS 9 | "CMake version is eligible for benchmarking and benchmarking flag is '${BENCHMARK}'.") 10 | 11 | include(FetchContent) 12 | set(FETCHCONTENT_QUIET FALSE) 13 | # Fetch Google Benchmark from the canonical upstream repository rather than a third-party mirror. 14 | FetchContent_Declare(googlebenchmark 15 | GIT_REPOSITORY https://github.com/google/benchmark.git 16 | GIT_TAG v1.6.1 17 | GIT_PROGRESS TRUE 18 | ) 19 | set(BENCHMARK_ENABLE_GTEST_TESTS OFF) 20 | FetchContent_MakeAvailable(googlebenchmark) 21 | 22 | add_executable(benchmarking ./benchmarking.cpp) 23 | target_link_libraries(benchmarking benchmark::benchmark) 24 | endif() 25 | -------------------------------------------------------------------------------- /.github/workflows/stale-issues.yml: -------------------------------------------------------------------------------- 1 | name: Close Stale Issues 2 | 3 | on: 4 | schedule: 5 | - cron: '0 13 12 */3 *' # Run every 12th day of every 3rd month 6 | 7 | jobs: 8 | stale: 9 | runs-on: ubuntu-latest 10 | permissions: 11 | issues: write 12 | pull-requests: write 13 | 14 | steps: 15 | - uses: actions/stale@v5 16 | with: 17 | repo-token: ${{ secrets.GITHUB_TOKEN }} 18 | stale-issue-message: 'This issue has not been updated for over 1 year and will be marked as stale. If the issue still exists, please comment or update the issue, otherwise it will be closed after 7 days.' 19 | close-issue-message: 'This issue has been automatically closed due to inactivity. If the issue still exists, please reopen it.' 
20 | days-before-issue-stale: 365 21 | days-before-issue-close: 7 22 | stale-issue-label: 'Stale' 23 | exempt-issue-labels: 'pinned,security' 24 | operations-per-run: 100 25 | -------------------------------------------------------------------------------- /test/load_test.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "simhash/Simhasher.hpp" 5 | 6 | using namespace simhash; 7 | 8 | void LoadTest(size_t times = 2000) 9 | { 10 | Simhasher simhasher("../submodules/cppjieba/dict/jieba.dict.utf8", "../submodules/cppjieba/dict/hmm_model.utf8", "../submodules/cppjieba/dict/idf.utf8", "../submodules/cppjieba/dict/stop_words.utf8"); 11 | uint64_t value; 12 | string doc; 13 | 14 | ifstream ifs("../test/testdata/news_content.2"); 15 | assert(ifs); 16 | doc << ifs; 17 | long beginTime = clock(); 18 | for(size_t i = 0; i < times; i ++) 19 | { 20 | printf("process [%3.0lf %%]\r", 100.0*(i+1)/times); 21 | fflush(stdout); 22 | simhasher.make(doc, 5, value); 23 | } 24 | long endTime = clock(); 25 | printf("\nextract: [%.3lf seconds]time consumed.\n", double(endTime - beginTime)/CLOCKS_PER_SEC); 26 | } 27 | 28 | int main(int argc, char ** argv) 29 | { 30 | LoadTest(); 31 | return EXIT_SUCCESS; 32 | } 33 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.14) 2 | 3 | PROJECT(simhash) 4 | 5 | # Define a variable to check if this is the top-level project 6 | if(NOT DEFINED simhash_TOP_LEVEL_PROJECT) 7 | if(CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR) 8 | set(simhash_TOP_LEVEL_PROJECT ON) 9 | else() 10 | set(simhash_TOP_LEVEL_PROJECT OFF) 11 | endif() 12 | endif() 13 | 14 | option(simhash_BUILD_TESTS "Build simhash tests" ${simhash_TOP_LEVEL_PROJECT}) 15 | 16 | if(simhash_BUILD_TESTS) 17 | #OPTION(BENCHMARK "Build benchmarking 
library" ON) 18 | 19 | INCLUDE_DIRECTORIES(${PROJECT_SOURCE_DIR}/submodules/cppjieba/include 20 | ${PROJECT_SOURCE_DIR}/submodules/cppjieba/deps/limonp/include 21 | ${PROJECT_SOURCE_DIR}/include) 22 | 23 | ADD_DEFINITIONS(-O3 -g) 24 | 25 | if(NOT DEFINED CMAKE_CXX_STANDARD) 26 | set(CMAKE_CXX_STANDARD 11) 27 | endif() 28 | 29 | ADD_SUBDIRECTORY(test) 30 | ADD_SUBDIRECTORY(benchmark) 31 | 32 | ENABLE_TESTING() 33 | ADD_TEST(NAME ./test/test.run COMMAND ./test/test.run) 34 | ADD_TEST(NAME ./test/load_test COMMAND ./test/load_test) 35 | endif() 36 | -------------------------------------------------------------------------------- /.github/workflows/cmake.yml: -------------------------------------------------------------------------------- 1 | name: CMake 2 | 3 | on: 4 | push: 5 | pull_request: 6 | 7 | jobs: 8 | build: 9 | # The CMake configure and build commands are platform agnostic and should work equally well on Windows or Mac. 10 | # You can convert this to a matrix build if you need cross-platform coverage. 
11 | # See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix 12 | runs-on: ${{ matrix.os }} 13 | strategy: 14 | matrix: 15 | os: [ 16 | ubuntu-22.04, 17 | ubuntu-24.04, 18 | macos-13, 19 | macos-14, 20 | windows-2019, 21 | windows-2022, 22 | ] 23 | cpp_version: [11, 14, 17, 20] 24 | build_type: [Release, Debug] 25 | 26 | steps: 27 | - uses: actions/checkout@v3 28 | with: 29 | submodules: true 30 | fetch-depth: 0 31 | 32 | - name: Update submodules 33 | run: | 34 | git submodule update --init --recursive --remote 35 | cd submodules/cppjieba 36 | git checkout master 37 | 38 | - name: Configure CMake 39 | run: cmake -B ${{github.workspace}}/build -DBUILD_TESTING=ON -DCMAKE_CXX_STANDARD=${{matrix.cpp_version}} -DCMAKE_BUILD_TYPE=${{matrix.build_type}} 40 | 41 | - name: Build 42 | run: cmake --build ${{github.workspace}}/build --config ${{matrix.build_type}} 43 | 44 | - name: Test 45 | working-directory: ${{github.workspace}}/build 46 | run: ctest -C ${{matrix.build_type}} --output-on-failure --verbose 47 | 48 | -------------------------------------------------------------------------------- /benchmark/benchmarking.cpp: -------------------------------------------------------------------------------- 1 | /// file: benchmarking.cpp 2 | /// date: 2022-01-17 3 | 4 | 5 | #include 6 | #include "./lib/Simhasher_benchmark.h" 7 | 8 | using namespace simhash_benchmark; 9 | 10 | 11 | BENCHMARK(BENCHMARK_Simhasher_extract_text50_top5); 12 | BENCHMARK(BENCHMARK_Simhasher_extract_text50_top10); 13 | BENCHMARK(BENCHMARK_Simhasher_extract_text50_top15); 14 | BENCHMARK(BENCHMARK_Simhasher_extract_text50_top20); 15 | 16 | BENCHMARK(BENCHMARK_Simhasher_extract_text500_top5); 17 | 18 | 19 | BENCHMARK(BENCHMARK_Simhasher_make_text50_top5); 20 | BENCHMARK(BENCHMARK_Simhasher_make_text50_top10); 21 | BENCHMARK(BENCHMARK_Simhasher_make_text50_top15); 22 | BENCHMARK(BENCHMARK_Simhasher_make_text50_top20); 23 | 24 | 25 | 
BENCHMARK(BENCHMARK_Simhasher_binaryStringToUint64); 26 | BENCHMARK(BENCHMARK_Simhasher_toBinaryString); 27 | 28 | 29 | BENCHMARK(BENCHMARK_Simhasher_make_from_predefined_keywords5); 30 | BENCHMARK(BENCHMARK_Simhasher_make_from_predefined_keywords10); 31 | BENCHMARK(BENCHMARK_Simhasher_make_from_predefined_keywords20); 32 | BENCHMARK(BENCHMARK_Simhasher_make_from_predefined_keywords50); 33 | BENCHMARK(BENCHMARK_Simhasher_make_from_predefined_keywords100); 34 | BENCHMARK(BENCHMARK_Simhasher_make_from_predefined_keywords200); 35 | BENCHMARK(BENCHMARK_Simhasher_make_from_predefined_keywords500); 36 | BENCHMARK(BENCHMARK_Simhasher_make_from_predefined_keywords1000); 37 | 38 | 39 | BENCHMARK(BENCHMARK_Simhasher_binaryStringToUint64_isEqual); 40 | BENCHMARK(BENCHMARK_Simhasher_binaryStringToUint64_isEqual_10k); 41 | BENCHMARK(BENCHMARK_Simhasher_binaryStringToUint64_isEqual_1000k); 42 | 43 | 44 | BENCHMARK_MAIN(); 45 | -------------------------------------------------------------------------------- /test/unittest/gtest_main.cpp: -------------------------------------------------------------------------------- 1 | // Copyright 2006, Google Inc. 2 | // All rights reserved. 3 | // 4 | // Redistribution and use in source and binary forms, with or without 5 | // modification, are permitted provided that the following conditions are 6 | // met: 7 | // 8 | // * Redistributions of source code must retain the above copyright 9 | // notice, this list of conditions and the following disclaimer. 10 | // * Redistributions in binary form must reproduce the above 11 | // copyright notice, this list of conditions and the following disclaimer 12 | // in the documentation and/or other materials provided with the 13 | // distribution. 14 | // * Neither the name of Google Inc. nor the names of its 15 | // contributors may be used to endorse or promote products derived from 16 | // this software without specific prior written permission. 
17 | // 18 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | 30 | #include 31 | 32 | #include "gtest/gtest.h" 33 | 34 | GTEST_API_ int main(int argc, char **argv) { 35 | std::cout << "Running main() from gtest_main.cc\n"; 36 | 37 | testing::InitGoogleTest(&argc, argv); 38 | return RUN_ALL_TESTS(); 39 | } 40 | -------------------------------------------------------------------------------- /benchmark/lib/vals.h: -------------------------------------------------------------------------------- 1 | /// file: vals.h 2 | /// date: 2022-01-18 3 | 4 | 5 | #ifndef __VALS_H__ 6 | #define __VALS_H__ 7 | 8 | #include 9 | #include "simhash/Simhasher.hpp" 10 | #include "./utils.h" 11 | 12 | 13 | using namespace simhash; 14 | 15 | 16 | namespace simhash_benchmark { 17 | 18 | 19 | Simhasher simhasher( 20 | "../submodules/cppjieba/dict/jieba.dict.utf8", "../submodules/cppjieba/dict/hmm_model.utf8", 21 | "../submodules/cppjieba/dict/idf.utf8", "../submodules/cppjieba/dict/stop_words.utf8"); 22 | 23 | 24 | /// Text with length 50 chars. 25 | std::string text50 = 26 | "我是蓝翔技工拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上总经理,出任CEO,走上人生巅峰。"; 27 | 28 | 29 | /// Text with length 500. 
30 | /// Ref to 'https://mp.weixin.qq.com/s/Zdwb7Qo-044fVzO1Di2krw'. 31 | std::string text500 = R"( 32 | 火山主要是因地壳板块推挤,岩浆中的气体压力到达一定程度造成喷发所致。海底火山与陆地火山原理相同,都是地球内部能量在地表的一种释放,也都有死火山与活火山之分。据不完全统计,目前全球的海底火山超过2万多座,其中太平洋拥有50%以上,是地球上火山最多、最密集的地方。 33 | 34 | 汤加位于世界第二深海沟汤加海沟(最深10882米)以及太平洋板块与印度—澳大利亚板块俯冲边界,是环太平洋地震带的地壳活动强烈区。地壳强烈活动使得汤加频繁遭遇地震和海底火山喷发。最近几年来,汤加里氏6级以上的大地震就发生过多次。 35 | 36 | 汤加的洪阿哈阿帕伊岛是汤加附近海底、汤加—克尔马德克群岛火山弧的一部分,坐落在新西兰东北向北延伸至斐济的地壳俯冲带。该处地壳十分活跃,火山近年多次喷发。汤加首席地质学家库拉认为,上一次喷发是在2021年12月20日,但那次的威力并不大,此次喷发释放的能量为上一次的7倍以上。 37 | 38 | 海底火山喷发的破坏是全方位的。喷发出来的火山气体,包括甲烷和硫化物等会溶到水中,对附近海底生态造成毁灭性打击。火山喷发导致汤加全境迅速被火山灰遮盖,空气和水严重污染,引发的海啸导致海水倒灌,居民无法正常生活。 39 | 40 | 由于汤加的海底电缆在火山喷发中断裂,导致汤加瞬间“失联”,目前灾情与人员伤亡数字仍无法统计。 41 | )"; 42 | 43 | 44 | // Simhash bin strings. 45 | std::string simhash_bin_str1 = "100010110110"; 46 | std::string simhash_bin_str2 = "110001110011"; 47 | 48 | 49 | // Pre-defined keywords 50 | std::vector > keywords5 = fake_keywords(5); 51 | std::vector > keywords10 = fake_keywords(10); 52 | std::vector > keywords20 = fake_keywords(20); 53 | std::vector > keywords50 = fake_keywords(50); 54 | std::vector > keywords100 = fake_keywords(100); 55 | std::vector > keywords200 = fake_keywords(200); 56 | std::vector > keywords500 = fake_keywords(500); 57 | std::vector > keywords1000 = fake_keywords(1000); 58 | 59 | 60 | } // namespace simhash_benchmark 61 | 62 | #endif 63 | -------------------------------------------------------------------------------- /test/testdata/news_content: -------------------------------------------------------------------------------- 1 | 日军侵占南京后,国民政府虽西迁重庆,但政府机关大部和军事统帅部却在武汉,武汉实际上成为当时全国军事、政治、经济的中心。1937年12月13日,国民政府军事委员会拟定保卫武汉作战计划。在徐州失守后,即调整部署,先后调集约130个师和各型飞机200余架、各型舰艇及布雷小轮30余艘,共100万余人,利用大别山、鄱阳湖和长江两岸地区有利地形,组织防御,保卫武汉。由第五战区司令长官李宗仁(7月中旬~9月中旬由白崇禧代理)指挥所部负责江北防务;第九战区司令长官陈诚指挥所部负责江南防务。另以第一战区在平汉铁路(今北京一汉口)的郑州至信阳段以西地区,防备华北日军南下;第三战区在安徽芜湖、安庆间的长江南岸和江西南昌以东地区,防备日军经浙赣铁路(杭州一株洲)向粤汉铁路(广州一武昌)迂回。   
1938年5月日军攻陷徐州后,决定先以一部兵力攻占安庆,作为进攻武汉的前进基地,然后以主力沿淮河进攻大别山以北地区,由武胜关攻取武汉,另以一部沿长江西进。后因黄河决口,被迫中止沿淮河主攻武汉的计划,改以主力沿长江两岸进攻。6月初,第6师从合肥南下;波田支队(相当于旅)由芜湖溯江西进,向安庆进攻。守军第26、第27集团军节节阻击。12日波田支队攻占安庆,继续沿江西进。13日第6师攻占桐城后,转向西南方向进攻,17日陷潜山。至7月初,日军在江北占领太湖、望江以东,在江南占领江西湖口以东的长江沿岸地区。4日,日军华中派遣军调整战斗序列,由其司令官?俊六指挥第2、第11集团军,负责对武汉的作战。以第11集团军沿长江两岸主攻武汉;第2集团军沿大别山北麓助攻武汉。日军先后投入作战的兵力共9个师、1个旅、2个支队(相当于旅)和2个野战重炮旅、2个战车团,航空兵3个飞行团各型飞机300余架,海军第3舰队各型舰艇120余艘,共约25万人。另以华中派遣军直辖的5个师分别担任对上海、南京、杭州等地区的警备任务,以巩固后方,保障此次作战。   长江南岸地区作战 第九战区以第1兵团在鄱阳湖西岸地区,第2兵团在江西星子、九江至码头镇之线组织防御。日军第11集团军主力沿长江南岸地区进攻,7月23日,波田支队在九江东面的姑塘登岸。守军第2兵团以第70、第64军等部协同第8军实施反击,由于日军第106师继续登岸,26日九江失守。波田支队沿长江西进,8月10日,在瑞昌东北的港口登岸,向瑞昌进攻。第3集团军在第32军团增援下奋力抗击。后因日军第9师加入战斗,守军力战不支,24日瑞昌失守。第9师和波田支队继续沿长江西进,同时以第27师向箬溪方向进犯。第30集团军和第18军等部在瑞昌-武宁公路沿途地区逐次抗击,相持月余,至10月5日,日军第27师攻占箬溪后,转向西北进攻,18日陷湖北辛潭铺(属阳新),向金牛(今属大冶)方向进犯。在此期间,守军第31集团军和第32军团等部在瑞昌以西地区节节抵抗沿长江西进的日军,至9月24日,码头镇、富池口(属阳新)先后陷落。第2兵团组织第6、第54、第75、第98军和第26、第30军团等部在阳新地区加强防御,战至10月22日,阳新、大冶、鄂城(今鄂州)相继失守,日军第9师和波田支队向武昌逼近。   当西进日军进攻瑞昌的同时,第106师从九江沿南浔铁路(南昌-九江)南犯。守军第1兵团第29军团和第4、第8军等部依托庐山两侧及南浔铁路北段的有利地形进行顽强抗击,日军进攻受挫。8月20日,日军第101师从湖口横渡鄱阳湖增援,突破第25军防线,攻占星子,协同第106师企图攻占德安,夺取南昌,以保障西进日军的南侧安全。第1兵团总司令薛岳以第66、第74、第4、第29军等部协同第25军在德安以北的隘口、马回岭地区与之激战,双方成胶着状态。9月底,日军第106师第123、第145、第147团和第101师第149团孤军深入,进至德安西面万家岭地区。薛岳指挥第4、第66、第74军等部从侧后迂回,将其包围。日军第27师一部增援,在万家岭西面白水街地区被第32军等部击退。10月7日,中国军队发起总攻,激战三昼夜,多次击败日军反扑。日军由于孤立无援,补给断绝,战至10日,4个团大部被歼。史称万家岭大捷。   长江北岸地区作战 7月24日,日军第11集团军第6师从安徽潜山向太湖进攻,相继突破第31、第68军防线,至8月3日,先后攻占太湖、宿松、黄梅(属湖北)等地,继续西进。第五战区第4兵团以主力在湖北广济(今梅川)、田家镇、浠水地区准备迎击日军,第11集团军和第68军固守黄梅西北一线,调第21、第26、第29集团军由潜山、黄梅西北山区南下侧击日军,至28日先后收复太湖、宿松。第11集团军和第68军乘势反攻,未果,退至广济地区,协同第26、第86、第55军等部继续抗击日军。第4兵团令第21、第29集团军自黄梅西北实施侧击,未能阻止日军,至9月17日广济、武穴相继沦陷。接着日军围攻田家镇要塞。第4兵团以守备要塞的第2军并加强第87军一部固守阵地,以第26、第48、第86军在外围策应作战,攻击日军侧背,激战旬余,终因阵地被日军优势火力摧毁,伤亡甚重,29日田家镇要塞失守。日军继续进攻,10月19日陷浠水,24日占黄陂,直逼汉口。   大别山北麓地区作战 
第五战区第3兵团以第51军和第19军团第77军在安徽六安、霍山地区,第71军在富金山、固始(属河南)地区,第2集团军在河南商城、湖北麻城地区,第27军团第59军在河南潢川地区,第17军团在信阳地区组织防御。8月下旬,日军第2集团军从合肥分南北两路进攻。南路第13师于29日突破第77军防线攻占霍山,向叶家集方向进犯。第71军和第2集团军在叶家集附近的富金山至商城一带依托既设阵地顽强抵抗。日军第13师受挫,得第16师增援,9月16日攻占商城。守军退守商城以南打船店、沙窝地区,凭借大别山各要隘,顽强抵抗,至10月24日,日军逼近麻城。北路日军第10师于8月28日突破第51军防线攻占六安后,强渡淠河和史河,9月6日进占固始,继续西进。第27军团第59军在春河集(属固始)、潢川一带组织抗击,鏖战旬余,19日潢川失守。21日日军第10师突破第17军团第45军阵地,攻占罗山,继续西进,在信阳以东地区遭第17军团反击,被迫撤回罗山。日军第2集团军以第3师增援,协同第10师向信阳进攻。10月6日,一部迂回信阳以南,攻占平汉铁路上的柳林站。12日日军第2集团军攻占信阳,然后沿平汉铁路南下,协同第11集团军进攻武汉。   中国军队由于处处设防,分兵把守,且未掌握强有力的预备队,没有充分发动群众,破坏对方交通线,因此,未能重创日军。在日军已达成对武汉包围的情况下,为保存力量,中国军队不得不于10月25日弃守该城。日军26日占领武昌、汉口,27日占领汉阳。   武汉保卫战,是抗日战争战略防御阶段规模最大的一次战役,中国军队英勇抗击,消耗了日军有生力量(日军承认伤亡共3万余人),迟滞了日军行动。日军虽然攻占了武汉,但其速战速决,迫国民政府屈服以结束战争的战略企图并未达到。此后,抗日战争进入战略相持阶段。    2 | -------------------------------------------------------------------------------- /test/testdata/news_content.2: -------------------------------------------------------------------------------- 1 | 日军侵占南京后,国民政府虽西迁重庆,但政府机关大部和军事统帅部却在武汉,武汉实际上成为当时全国军事、政治、经济的中心。1937年12月13日,国民政府军事委员会拟定保卫武汉作战计划。在徐州失守后,即调整部署,先后调集约130个师和各型飞机200余架、各型舰艇及布雷小轮30余艘,共100万余人,利用大别山、鄱阳湖和长江两岸地区有利地形,组织防御,保卫武汉。由第五战区司令长官李宗仁(7月中旬~9月中旬由白崇禧代理)指挥所部负责江北防务;第九战区司令长官陈诚指挥所部负责江南防务。另以第一战区在平汉铁路(今北京一汉口)的郑州至信阳段以西地区,防备华北日军南下;第三战区在安徽芜湖、安庆间的长江南岸和江西南昌以东地区,防备日军经浙赣铁路(杭州一株洲)向粤汉铁路(广州一武昌)迂回。   1938年5月日军攻陷徐州后,决定先以一部兵力攻占安庆,作为进攻武汉的前进基地,然后以主力沿淮河进攻大别山以北地区,由武胜关攻取武汉,另以一部沿长江西进。后因黄河决口,被迫中止沿淮河主攻武汉的计划,改以主力沿长江两岸进攻。6月初,第6师从合肥南下;波田支队(相当于旅)由芜湖溯江西进,向安庆进攻。守军第26、第27集团军节节阻击。12日波田支队攻占安庆,继续沿江西进。13日第6师攻占桐城后,转向西南方向进攻,17日陷潜山。至7月初,日军在江北占领太湖、望江以东,在江南占领江西湖口以东的长江沿岸地区。4日,日军华中派遣军调整战斗序列,由其司令官?俊六指挥第2、第11集团军,负责对武汉的作战。以第11集团军沿长江两岸主攻武汉;第2集团军沿大别山北麓助攻武汉。日军先后投入作战的兵力共9个师、1个旅、2个支队(相当于旅)和2个野战重炮旅、2个战车团,航空兵3个飞行团各型飞机300余架,海军第3舰队各型舰艇120余艘,共约25万人。另以华中派遣军直辖的5个师分别担任对上海、南京、杭州等地区的警备任务,以巩固后方,保障此次作战。   长江南岸地区作战 
第九战区以第1兵团在鄱阳湖西岸地区,第2兵团在江西星子、九江至码头镇之线组织防御。日军第11集团军主力沿长江南岸地区进攻,7月23日,波田支队在九江东面的姑塘登岸。守军第2兵团以第70、第64军等部协同第8军实施反击,由于日军第106师继续登岸,26日九江失守。波田支队沿长江西进,8月10日,在瑞昌东北的港口登岸,向瑞昌进攻。第3集团军在第32军团增援下奋力抗击。后因日军第9师加入战斗,守军力战不支,24日瑞昌失守。第9师和波田支队继续沿长江西进,同时以第27师向箬溪方向进犯。第30集团军和第18军等部在瑞昌-武宁公路沿途地区逐次抗击,相持月余,至10月5日,日军第27师攻占箬溪后,转向西北进攻,18日陷湖北辛潭铺(属阳新),向金牛(今属大冶)方向进犯。在此期间,守军第31集团军和第32军团等部在瑞昌以西地区节节抵抗沿长江西进的日军,至9月24日,码头镇、富池口(属阳新)先后陷落。第2兵团组织第6、第54、第75、第98军和第26、第30军团等部在阳新地区加强防御,战至10月22日,阳新、大冶、鄂城(今鄂州)相继失守,日军第9师和波田支队向武昌逼近。   当西进日军进攻瑞昌的同时,第106师从九江沿南浔铁路(南昌-九江)南犯。守军第1兵团第29军团和第4、第8军等部依托庐山两侧及南浔铁路北段的有利地形进行顽强抗击,日军进攻受挫。8月20日,日军第101师从湖口横渡鄱阳湖增援,突破第25军防线,攻占星子,协同第106师企图攻占德安,夺取南昌,以保障西进日军的南侧安全。第1兵团总司令薛岳以第66、第74、第4、第29军等部协同第25军在德安以北的隘口、马回岭地区与之激战,双方成胶着状态。9月底,日军第106师第123、第145、第147团和第101师第149团孤军深入,进至德安西面万家岭地区。薛岳指挥第4、第66、第74军等部从侧后迂回,将其包围。日军第27师一部增援,在万家岭西面白水街地区被第32军等部击退。10月7日,中国军队发起总攻,激战三昼夜,多次击败日军反扑。日军由于孤立无援,补给断绝,战至10日,4个团大部被歼。史称万家岭大捷。   长江北岸地区作战 7月24日,日军第11集团军第6师从安徽潜山向太湖进攻,相继突破第31、第68军防线,至8月3日,先后攻占太湖、宿松、黄梅(属湖北)等地,继续西进。第五战区第4兵团以主力在湖北广济(今梅川)、田家镇、浠水地区准备迎击日军,第11集团军和第68军固守黄梅西北一线,调第21、第26、第29集团军由潜山、黄梅西北山区南下侧击日军,至28日先后收复太湖、宿松。第11集团军和第68军乘势反攻,未果,退至广济地区,协同第26、第86、第55军等部继续抗击日军。第4兵团令第21、第29集团军自黄梅西北实施侧击,未能阻止日军,至9月17日广济、武穴相继沦陷。接着日军围攻田家镇要塞。第4兵团以守备要塞的第2军并加强第87军一部固守阵地,以第26、第48、第86军在外围策应作战,攻击日军侧背,激战旬余,终因阵地被日军优势火力摧毁,伤亡甚重,29日田家镇要塞失守。日军继续进攻,10月19日陷浠水,24日占黄陂,直逼汉口。   大别山北麓地区作战 第五战区第3兵团以第51军和第19军团第77军在安徽六安、霍山地区,第71军在富金山、固始(属河南)地区,第2集团军在河南商城、湖北麻城地区,第27军团第59军在河南潢川地区,第17军团在信阳地区组织防御。8月下旬,日军第2集团军从合肥分南北两路进攻。南路第13师于29日突破第77军防线攻占霍山,向叶家集方向进犯。第71军和第2集团军在叶家集附近的富金山至商城一带依托既设阵地顽强抵抗。日军第13师受挫,得第16师增援,9月16日攻占商城。守军退守商城以南打船店、沙窝地区,凭借大别山各要隘,顽强抵抗,至10月24日,日军逼近麻城。北路日军第10师于8月28日突破第51军防线攻占六安后,强渡淠河和史河,9月6日进占固始,继续西进。第27军团第59军在春河集(属固始)、潢川一带组织抗击,鏖战旬余,19日潢川失守。21日日军第10师突破第17军团第45军阵地,攻占罗山,继续西进,在信阳以东地区遭第17军团反击,被迫撤回罗山。日军第2集团军以第3师增援,协同第10师向信阳进攻。10月6日,一部迂回信阳以南,攻占平汉铁路上的柳林站。12日日军第2集团军攻占信阳,然后沿平汉铁路南下,协同第11集团军进攻武汉。   中国军队由于处处设防,分兵把守,且未掌握强有力的预备队,没有充分发动群众,破坏对方交通线,因此,未能重创日军。在日军已达成对武汉包围的情况下,为保存力量,中国军队不得不于10月25日弃守该城。日军26日占领武昌、汉口,27日占领汉阳。   
武汉保卫战,是抗日战争战略防御阶段规模最大的一次战役,中国军队英勇抗击,消耗了日军有生力量(日军承认伤亡共3万余人),迟滞了日军行动。日军虽然攻占了武汉,但其速战速决,迫国民政府屈服以结束战争的战略企图并未达到。此后,抗日战争进入战略相持阶段。    2 | -------------------------------------------------------------------------------- /test/unittest/TSimhash.cpp: -------------------------------------------------------------------------------- 1 | #include "simhash/Simhasher.hpp" 2 | #include "gtest/gtest.h" 3 | 4 | using namespace simhash; 5 | 6 | static bool loadFile2Str(const string& filePath, string& s) 7 | { 8 | ifstream ifs(filePath.c_str()); 9 | if(!ifs) 10 | { 11 | return false; 12 | } 13 | s << ifs; 14 | return true; 15 | } 16 | 17 | TEST(SimhasherTest, Test1) 18 | { 19 | Simhasher shash("../submodules/cppjieba/dict/jieba.dict.utf8", "../submodules/cppjieba/dict/hmm_model.utf8", "../submodules/cppjieba/dict/idf.utf8", "../submodules/cppjieba/dict/stop_words.utf8"); 20 | { 21 | string s; 22 | ASSERT_TRUE(loadFile2Str("../test/testdata/news_content", s)); 23 | vector > v64s; 24 | ASSERT_TRUE(shash.make(s, 5, v64s)); 25 | string res; 26 | res << v64s; 27 | //ASSERT_EQ(res, "[\"15142046212652221781:318.452\", \"2117559126361955906:151.001\", \"14814508906697812479:140.87\", \"13004687738940023035:106.978\", \"18264030747823598625:103.739\"]"); 28 | ASSERT_EQ(res, "[15142046212652221781:318.452, 2117559126361955906:151.001, 14814508906697812479:140.87, 13004687738940023035:106.978, 18264030747823598625:103.739]"); 29 | } 30 | { 31 | uint64_t u64; 32 | string s; 33 | ASSERT_TRUE(loadFile2Str("../test/testdata/news_content", s)); 34 | ASSERT_TRUE(shash.make(s, 5, u64)); 35 | string res; 36 | res << u64; 37 | ASSERT_EQ("15286165794479097173", res); 38 | } 39 | { 40 | uint64_t u1, u2; 41 | string s1, s2; 42 | ASSERT_TRUE(loadFile2Str("../test/testdata/news_content", s1)); 43 | ASSERT_TRUE(shash.make(s1, 50, u1)); 44 | ASSERT_TRUE(loadFile2Str("../test/testdata/news_content.2", s2)); 45 | ASSERT_TRUE(shash.make(s2, 50, u2)); 46 | ASSERT_EQ(u1, u2); 47 | } 48 | { 49 | uint64_t u1, u2; 50 | 
string s1, s2; 51 | ASSERT_TRUE(loadFile2Str("../test/testdata/news_content.3", s1)); 52 | ASSERT_TRUE(shash.make(s1, 50, u1)); 53 | ASSERT_TRUE(loadFile2Str("../test/testdata/news_content.4", s2)); 54 | ASSERT_TRUE(shash.make(s2, 50, u2)); 55 | ASSERT_EQ(u1, u2); 56 | } 57 | { 58 | uint64_t u1, u2; 59 | string s1, s2; 60 | ASSERT_TRUE(loadFile2Str("../test/testdata/news_content.2", s1)); 61 | ASSERT_TRUE(shash.make(s1, 50, u1)); 62 | ASSERT_TRUE(loadFile2Str("../test/testdata/news_content.3", s2)); 63 | ASSERT_TRUE(shash.make(s2, 50, u2)); 64 | ASSERT_NE(u1, u2); 65 | } 66 | { 67 | uint64_t u1; 68 | const char * const sentence = "你好世界"; 69 | //vector > res; 70 | shash.make(sentence, 3, u1); 71 | string s; 72 | s << u1; 73 | ASSERT_EQ(s, "17676873585679812141"); 74 | } 75 | } 76 | 77 | TEST(SimhasherTest, Test2) 78 | { 79 | uint64_t u1, u2; 80 | u1 = Simhasher::binaryStringToUint64("100010110110"); 81 | u2 = Simhasher::binaryStringToUint64("110001110011"); 82 | ASSERT_FALSE(Simhasher::isEqual(u1, u2)); // default 3 83 | ASSERT_TRUE(Simhasher::isEqual(u1, u2, 5)); 84 | u1 = Simhasher::binaryStringToUint64("100010110110"); 85 | u2 = Simhasher::binaryStringToUint64("110010110111"); 86 | ASSERT_TRUE(Simhasher::isEqual(u1, u2)); 87 | 88 | } 89 | 90 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 专门针对中文文档的simhash算法库 2 | 3 | [![Test](https://github.com/yanyiwu/simhash/actions/workflows/cmake.yml/badge.svg)](https://github.com/yanyiwu/simhash/actions/workflows/cmake.yml) 4 | [![Platform](https://img.shields.io/badge/platform-Linux,macOS-green.svg?style=flat)](https://github.com/yanyiwu/simhash) 5 | [![Author](https://img.shields.io/badge/author-@yanyiwu-blue.svg?style=flat)](http://yanyiwu.com/) 6 | [![License](https://img.shields.io/badge/license-MIT-yellow.svg?style=flat)](http://yanyiwu.mit-license.org) 7 | 
[![Tag](https://img.shields.io/github/v/tag/yanyiwu/simhash.svg)](https://github.com/yanyiwu/simhash/releases) 8 | 9 | 10 | ## 简介 11 | 12 | 此项目用来对中文文档计算出对应的 simhash 值。 simhash 是谷歌用来进行文本去重的算法,现在广泛应用在文本处理中。 13 | 14 | 详见[simhash算法原理及实现] 15 | 16 | ## 特性 17 | 18 | + 使用 [CppJieba] 作为分词器和关键词抽取器 19 | + 使用 [jenkins] 作为 hash 函数 20 | + `hpp` 风格,所有源码都在 `.hpp` 文件里面,方便使用。 `没有链接,就没有伤害。` 21 | + 本项目的副产品项目:[simhash\_server] 提供了简单的 simhash HTTP 服务。 22 | 23 | ## 依赖 24 | 25 | * g++ (version >= 4.1 recommended), or clang++. 26 | 27 | ## 用法 28 | 29 | ```sh 30 | git clone --recurse-submodules https://github.com/yanyiwu/simhash.git 31 | cd simhash 32 | mkdir build 33 | cd build 34 | cmake .. 35 | make 36 | ``` 37 | 38 | 测试 39 | 40 | ``` 41 | make test 42 | ``` 43 | 44 | ### 演示 45 | 46 | ``` 47 | 文本:"我是蓝翔技工拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上总经理,出任CEO,走上人生巅峰。" 48 | 关键词序列是: ["蓝翔:11.7392", "CEO:11.7392", "升职:10.8562", "加薪:10.6426", "手扶拖拉机:10.0089"] 49 | simhash值是: 17831459094038722629 50 | 100010110110和110001110011 simhash值的相等判断如下: 51 | 海明距离阈值默认设置为3,则isEqual结果为:0 52 | 海明距离阈值默认设置为5,则isEqual结果为:1 53 | ``` 54 | 55 | 详情请看 [demo](https://github.com/yanyiwu/simhash-demo) 56 | 57 | ### Benchmark 58 | ```sh 59 | ./benchmark/benchmarking 60 | ``` 61 | 结果如下: 62 | ``` 63 | Running ./benchmark/benchmarking 64 | Run on (16 X 2494.14 MHz CPU s) 65 | CPU Caches: 66 | L1 Data 32 KiB (x16) 67 | L1 Instruction 32 KiB (x16) 68 | L2 Unified 4096 KiB (x16) 69 | L3 Unified 36608 KiB (x1) 70 | Load Average: 0.07, 0.04, 0.03 71 | ***WARNING*** Library was built as DEBUG. Timings may be affected. 
72 | ------------------------------------------------------------------------------------------------- 73 | Benchmark Time CPU Iterations 74 | ------------------------------------------------------------------------------------------------- 75 | BENCHMARK_Simhasher_extract_text50_top5 13478 ns 13478 ns 52013 76 | BENCHMARK_Simhasher_extract_text50_top10 13843 ns 13843 ns 50833 77 | BENCHMARK_Simhasher_extract_text50_top15 13929 ns 13929 ns 49488 78 | BENCHMARK_Simhasher_extract_text50_top20 13842 ns 13842 ns 50541 79 | BENCHMARK_Simhasher_extract_text500_top5 184074 ns 184067 ns 3775 80 | BENCHMARK_Simhasher_make_text50_top5 14457 ns 14457 ns 48341 81 | BENCHMARK_Simhasher_make_text50_top10 15170 ns 15169 ns 46203 82 | BENCHMARK_Simhasher_make_text50_top15 15585 ns 15585 ns 44903 83 | BENCHMARK_Simhasher_make_text50_top20 15743 ns 15742 ns 44466 84 | BENCHMARK_Simhasher_binaryStringToUint64 0.000 ns 0.000 ns 1000000000 85 | BENCHMARK_Simhasher_toBinaryString 63.9 ns 63.9 ns 10937009 86 | BENCHMARK_Simhasher_make_from_predefined_keywords5 423 ns 423 ns 1644823 87 | BENCHMARK_Simhasher_make_from_predefined_keywords10 735 ns 735 ns 950156 88 | BENCHMARK_Simhasher_make_from_predefined_keywords20 1364 ns 1364 ns 508935 89 | BENCHMARK_Simhasher_make_from_predefined_keywords50 7876 ns 7875 ns 89006 90 | BENCHMARK_Simhasher_make_from_predefined_keywords100 21409 ns 21409 ns 32743 91 | BENCHMARK_Simhasher_make_from_predefined_keywords200 47469 ns 47468 ns 14728 92 | BENCHMARK_Simhasher_make_from_predefined_keywords500 124316 ns 124314 ns 5627 93 | BENCHMARK_Simhasher_make_from_predefined_keywords1000 251336 ns 251329 ns 2785 94 | BENCHMARK_Simhasher_binaryStringToUint64_isEqual 0.000 ns 0.000 ns 1000000000 95 | BENCHMARK_Simhasher_binaryStringToUint64_isEqual_10k 0.000 ns 0.000 ns 1000000000 96 | BENCHMARK_Simhasher_binaryStringToUint64_isEqual_1000k 0.000 ns 0.000 ns 1000000000 97 | ``` 98 | 99 | 100 | 101 | 
[simhash算法原理及实现]:http://yanyiwu.com/work/2014/01/30/simhash-shi-xian-xiang-jie.html 102 | [CppJieba]:https://github.com/yanyiwu/cppjieba 103 | [jenkins]:https://github.com/seomoz/simhash-cpp/blob/master/src/hashes/jenkins.h 104 | [simhash\_server]:https://github.com/yanyiwu/simhash_server 105 | 106 | 107 | 108 | -------------------------------------------------------------------------------- /benchmark/lib/Simhasher_benchmark.h: -------------------------------------------------------------------------------- 1 | /// file: Simhasher_benchmark.h 2 | /// date: 2022-01-17 3 | /// Google Benchmark cases for simhash::Simhasher. Return values that would otherwise be discarded are passed to benchmark::DoNotOptimize so the compiler cannot drop the call being timed. 4 | 5 | #ifndef __SIMHASHER_BENCHMARK_H__ 6 | #define __SIMHASHER_BENCHMARK_H__ 7 | #include 8 | #include "simhash/Simhasher.hpp" 9 | 10 | #include "./vals.h" 11 | 12 | using namespace simhash; 13 | 14 | 15 | namespace simhash_benchmark { 16 | 17 | /// Shared output slots written by the extract/make benchmarks below. 18 | int32_t top_n_keywords = 5; 19 | uint64_t simhash64 = 0; 20 | std::vector > keywords; 21 | 22 | 23 | static void BENCHMARK_Simhasher_extract_text50_top5( 24 | benchmark::State& state) { 25 | keywords.clear(); 26 | for (auto _ : state) { 27 | simhasher.extract(text50, keywords, 5); 28 | } 29 | } 30 | 31 | 32 | static void BENCHMARK_Simhasher_extract_text50_top10( 33 | benchmark::State& state) { 34 | keywords.clear(); 35 | for (auto _ : state) { 36 | simhasher.extract(text50, keywords, 10); 37 | } 38 | } 39 | 40 | 41 | static void BENCHMARK_Simhasher_extract_text50_top15( 42 | benchmark::State& state) { 43 | keywords.clear(); 44 | for (auto _ : state) { 45 | simhasher.extract(text50, keywords, 15); 46 | } 47 | } 48 | 49 | 50 | static void BENCHMARK_Simhasher_extract_text50_top20( 51 | benchmark::State& state) { 52 | keywords.clear(); 53 | for (auto _ : state) { 54 | simhasher.extract(text50, keywords, 20); 55 | } 56 | } 57 | 58 | 59 | static void BENCHMARK_Simhasher_extract_text500_top5( 60 | benchmark::State& state) { 61 | keywords.clear(); 62 | for (auto _ : state) { 63 | simhasher.extract(text500, keywords, 5); 64 | } 65 | } 66 | 67 | 68 | static void 
BENCHMARK_Simhasher_make_text50_top5( 69 | benchmark::State& state) { 70 | keywords.clear(); 71 | for (auto _ : state) { 72 | simhasher.make(text50, 5, simhash64); 73 | } 74 | } 75 | 76 | 77 | static void BENCHMARK_Simhasher_make_text50_top10( 78 | benchmark::State& state) { 79 | keywords.clear(); 80 | for (auto _ : state) { 81 | simhasher.make(text50, 10, simhash64); 82 | } 83 | } 84 | 85 | 86 | static void BENCHMARK_Simhasher_make_text50_top15( 87 | benchmark::State& state) { 88 | keywords.clear(); 89 | for (auto _ : state) { 90 | simhasher.make(text50, 15, simhash64); 91 | } 92 | } 93 | 94 | 95 | static void BENCHMARK_Simhasher_make_text50_top20( 96 | benchmark::State& state) { 97 | keywords.clear(); 98 | for (auto _ : state) { 99 | simhasher.make(text50, 20, simhash64); 100 | } 101 | } 102 | 103 | /// The parsed value is kept alive with DoNotOptimize so the compiler cannot discard the call. 104 | static void BENCHMARK_Simhasher_binaryStringToUint64( 105 | benchmark::State& state) { 106 | for (auto _ : state) { 107 | benchmark::DoNotOptimize(Simhasher::binaryStringToUint64(simhash_bin_str1)); 108 | } 109 | } 110 | 111 | 112 | static void BENCHMARK_Simhasher_toBinaryString( 113 | benchmark::State& state) { 114 | std::string res = ""; 115 | for (auto _ : state) { 116 | Simhasher::toBinaryString(999999999, res); 117 | } 118 | } 119 | 120 | 121 | static void BENCHMARK_Simhasher_make_from_predefined_keywords5( 122 | benchmark::State& state) { 123 | for (auto _ : state) { 124 | Simhasher::make_from_predefined_keywords(keywords5); 125 | } 126 | } 127 | 128 | 129 | static void BENCHMARK_Simhasher_make_from_predefined_keywords10( 130 | benchmark::State& state) { 131 | for (auto _ : state) { 132 | Simhasher::make_from_predefined_keywords(keywords10); 133 | } 134 | } 135 | 136 | 137 | static void BENCHMARK_Simhasher_make_from_predefined_keywords20( 138 | benchmark::State& state) { 139 | for (auto _ : state) { 140 | Simhasher::make_from_predefined_keywords(keywords20); 141 | } 142 | } 143 | 144 | 145 | static void BENCHMARK_Simhasher_make_from_predefined_keywords50( 146 | benchmark::State& 
state) { 147 | for (auto _ : state) { 148 | Simhasher::make_from_predefined_keywords(keywords50); 149 | } 150 | } 151 | 152 | 153 | static void BENCHMARK_Simhasher_make_from_predefined_keywords100( 154 | benchmark::State& state) { 155 | for (auto _ : state) { 156 | Simhasher::make_from_predefined_keywords(keywords100); 157 | } 158 | } 159 | 160 | 161 | static void BENCHMARK_Simhasher_make_from_predefined_keywords200( 162 | benchmark::State& state) { 163 | for (auto _ : state) { 164 | Simhasher::make_from_predefined_keywords(keywords200); 165 | } 166 | } 167 | 168 | 169 | static void BENCHMARK_Simhasher_make_from_predefined_keywords500( 170 | benchmark::State& state) { 171 | for (auto _ : state) { 172 | Simhasher::make_from_predefined_keywords(keywords500); 173 | } 174 | } 175 | 176 | 177 | static void BENCHMARK_Simhasher_make_from_predefined_keywords1000( 178 | benchmark::State& state) { 179 | for (auto _ : state) { 180 | Simhasher::make_from_predefined_keywords(keywords1000); 181 | } 182 | } 183 | 184 | /// Parses both fingerprints and compares them; the comparison result is kept alive with DoNotOptimize. 185 | static void BENCHMARK_Simhasher_binaryStringToUint64_isEqual( 186 | benchmark::State& state) { 187 | for (auto _ : state) { 188 | benchmark::DoNotOptimize(Simhasher::isEqual( 189 | Simhasher::binaryStringToUint64(simhash_bin_str1), 190 | Simhasher::binaryStringToUint64(simhash_bin_str2))); 191 | } 192 | } 193 | 194 | /// Repeats the parse-and-compare pair n times; each result is kept alive with DoNotOptimize. 195 | void isEqual_n_times(int32_t n) { 196 | for (int32_t i = 0; i < n; ++i) { 197 | benchmark::DoNotOptimize(Simhasher::isEqual( 198 | Simhasher::binaryStringToUint64(simhash_bin_str1), 199 | Simhasher::binaryStringToUint64(simhash_bin_str2))); 200 | } 201 | } 202 | 203 | 204 | static void BENCHMARK_Simhasher_binaryStringToUint64_isEqual_10k( 205 | benchmark::State& state) { 206 | for (auto _ : state) { isEqual_n_times(10000); } 207 | } 208 | 209 | 210 | static void BENCHMARK_Simhasher_binaryStringToUint64_isEqual_1000k( 211 | benchmark::State& state) { 212 | for (auto _ : state) { isEqual_n_times(1000000); } 213 | } 214 | 215 | 216 | } // namespace simhash_benchmark 217 | 218 | #endif 219 | 220 | 221 | 
-------------------------------------------------------------------------------- /include/simhash/Simhasher.hpp: -------------------------------------------------------------------------------- 1 | #ifndef SIMHASH_SIMHASHER_HPP 2 | #define SIMHASH_SIMHASHER_HPP 3 | 4 | #include "cppjieba/Jieba.hpp" 5 | #include "jenkins.h" 6 | 7 | namespace simhash 8 | { 9 | using namespace cppjieba; 10 | class Simhasher 11 | { 12 | private: 13 | enum{BITS_LENGTH = 64}; 14 | jenkins _hasher; 15 | cppjieba::Jieba _jieba; 16 | public: 17 | Simhasher(const string& dictPath = "", const string& modelPath = "", const string& idfPath = "", const string& stopWords = ""): _jieba(dictPath, modelPath, "", idfPath, stopWords) 18 | {} 19 | ~Simhasher(){}; 20 | 21 | bool extract(const string& text, vector > & res, size_t topN) const 22 | { 23 | _jieba.extractor.Extract(text, res, topN); 24 | return true; 25 | } 26 | bool make(const string& text, size_t topN, vector >& res) const 27 | { 28 | vector > wordweights; 29 | if(!extract(text, wordweights, topN)) 30 | { 31 | XLOG(ERROR) << "extract failed"; 32 | return false; 33 | } 34 | res.resize(wordweights.size()); 35 | for(size_t i = 0; i < res.size(); i++) 36 | { 37 | res[i].first = _hasher(wordweights[i].first.c_str(), wordweights[i].first.size(), 0); 38 | res[i].second = wordweights[i].second; 39 | } 40 | 41 | return true; 42 | } 43 | 44 | bool make(const string& text, size_t topN, uint64_t& v64) const 45 | { 46 | vector > hashvalues; 47 | if(!make(text, topN, hashvalues)) 48 | { 49 | return false; 50 | } 51 | vector weights(BITS_LENGTH, 0.0); 52 | const uint64_t u64_1(1); 53 | for(size_t i = 0; i < hashvalues.size(); i++) 54 | { 55 | for(size_t j = 0; j < BITS_LENGTH; j++) 56 | { 57 | weights [j] += (((u64_1 << j) & hashvalues[i].first) ? 
1: -1) * hashvalues[i].second; 58 | } 59 | } 60 | 61 | v64 = 0; 62 | for(size_t j = 0; j < BITS_LENGTH; j++) 63 | { 64 | if(weights[j] > 0.0) 65 | { 66 | v64 |= (u64_1 << j); 67 | } 68 | } 69 | 70 | return true; 71 | } 72 | 73 | /** 74 | * @brief 75 | * Directly calculate weighted hash of pre-defined keywords. 76 | */ 77 | static uint64_t make_from_predefined_keywords(const std::vector< std::pair >& keywords) { 78 | uint64_t v64 = 0; 79 | std::vector< std::pair > hashvalues; 80 | jenkins _tmp_hasher; 81 | 82 | hashvalues.resize(keywords.size()); 83 | for (int32_t i = 0; i < keywords.size(); ++i) { 84 | hashvalues[i].first = _tmp_hasher(keywords[i].first.c_str(), keywords[i].first.size(), 0); 85 | hashvalues[i].second = keywords[i].second; 86 | } 87 | 88 | vector weights(BITS_LENGTH, 0.0); 89 | const uint64_t u64_1(1); 90 | for(size_t i = 0; i < hashvalues.size(); i++) 91 | { 92 | for(size_t j = 0; j < BITS_LENGTH; j++) 93 | { 94 | weights [j] += (((u64_1 << j) & hashvalues[i].first) ? 1: -1) * hashvalues[i].second; 95 | } 96 | } 97 | 98 | for(size_t j = 0; j < BITS_LENGTH; j++) 99 | { 100 | if(weights[j] > 0.0) 101 | { 102 | v64 |= (u64_1 << j); 103 | } 104 | } 105 | 106 | return v64; 107 | } 108 | 109 | static bool isEqual(uint64_t lhs, uint64_t rhs, unsigned short n = 3) 110 | { 111 | unsigned short cnt = 0; 112 | lhs ^= rhs; 113 | #if defined(__GNUC__) || defined(__clang__) 114 | cnt = __builtin_popcountll(lhs); 115 | #else 116 | /* 117 | * FIXME: There are actually also builtin_popcount-like functions 118 | * in other compilers.. 119 | * Anyway..Here, we just roll back to look-up table (8bits). 
120 | */ 121 | static const int pop_lut[256] = { 122 | 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 123 | 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 124 | 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 125 | 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 126 | 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 127 | 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 128 | 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 129 | 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 130 | 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 131 | 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 132 | 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 133 | 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 134 | 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 135 | 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 136 | 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 137 | 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8, 138 | }; 139 | 140 | while(lhs && cnt <= n) 141 | { 142 | cnt += pop_lut[lhs & 0xff]; 143 | lhs >>= 8; 144 | } 145 | #endif 146 | if(cnt <= n) 147 | { 148 | return true; 149 | } 150 | return false; 151 | } 152 | 153 | static void toBinaryString(uint64_t req, string& res) 154 | { 155 | res.resize(64); 156 | for(signed i = 63; i >= 0; i--) 157 | { 158 | req & 1 ? 
res[i] = '1' : res[i] = '0'; 159 | req >>= 1; 160 | } 161 | } 162 | 163 | static uint64_t binaryStringToUint64(const string& bin) 164 | { 165 | uint64_t res = 0; 166 | for(size_t i = 0; i < bin.size(); i++) 167 | { 168 | res <<= 1; 169 | if(bin[i] == '1') 170 | { 171 | res += 1; 172 | } 173 | } 174 | return res; 175 | } 176 | 177 | }; 178 | } 179 | 180 | #endif 181 | 182 | 183 | -------------------------------------------------------------------------------- /include/simhash/jenkins.h: -------------------------------------------------------------------------------- 1 | #ifndef SIMHASH_HASHES_JENKINS_H 2 | #define SIMHASH_HASHES_JENKINS_H 3 | 4 | /* This code was taken from http://www.burtleburtle.net/bob/c/lookup3.c, and 5 | * under a public domain licence on May 25, 2012, reproduced below: 6 | * ----------------------------------------------------------------------------- 7 | * lookup3.c, by Bob Jenkins, May 2006, Public Domain. 8 | * 9 | * These are functions for producing 32-bit hashes for hash table lookup. 10 | * hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final() 11 | * are externally useful functions. Routines to test the hash are included 12 | * if SELF_TEST is defined. You can use this free for any purpose. It's in 13 | * the public domain. It has no warranty. 14 | * 15 | * You probably want to use hashlittle(). hashlittle() and hashbig() 16 | * hash byte arrays. hashlittle() is is faster than hashbig() on 17 | * little-endian machines. Intel and AMD are little-endian machines. 18 | * On second thought, you probably want hashlittle2(), which is identical to 19 | * hashlittle() except it returns two 32-bit hashes for the price of one. 20 | * You could implement hashbig2() if you wanted but I haven't bothered here. 
/* (lookup3.c notes, continued from the header comment that precedes this point)
 *
 * If you want to find a hash of, say, exactly 7 integers, do
 *   a = i1; b = i2; c = i3;
 *   mix(a,b,c);
 *   a += i4; b += i5; c += i6;
 *   mix(a,b,c);
 *   a += i7;
 *   final(a,b,c);
 * then use c as the hash value. If you have a variable length array of
 * 4-byte integers to hash, use hashword(). If you have a byte array (like
 * a character string), use hashlittle(). If you have several byte arrays, or
 * a mix of things, see the comments above hashlittle().
 *
 * Why is this so big? I read 12 bytes at a time into 3 4-byte integers,
 * then mix those integers. This is fast (you can do a lot more thorough
 * mixing with 12*3 instructions on 3 integers than you can with 3 instructions
 * on 1 byte), but shoehorning those bytes into integers efficiently is messy.
 * -----------------------------------------------------------------------------
 */

#include <stddef.h>     /* defines size_t */
#include <stdint.h>     /* defines uint32_t etc */

/*
 * HASH_LITTLE_ENDIAN / HASH_BIG_ENDIAN are still defined for any code that
 * tests them, but the hash below no longer consults them: it always reads the
 * key one byte at a time in little-endian order.
 */
#ifdef _WIN32
# define HASH_LITTLE_ENDIAN 1
# define HASH_BIG_ENDIAN 0
#else
# include <sys/param.h>  /* attempt to define endianness */
# if defined(__linux__) || defined(linux)  /* bare `linux` is unset in strict -std= modes */
#  include <endian.h>    /* attempt to define endianness */
# endif
#endif

/*
 * My best guess at if you are big-endian or little-endian. This may
 * need adjustment. Skipped when the _WIN32 branch above already decided, so
 * the macros are never defined twice.
 */
#ifndef HASH_LITTLE_ENDIAN
# if (defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && \
      __BYTE_ORDER == __LITTLE_ENDIAN) || \
     (defined(i386) || defined(__i386__) || defined(__i486__) || \
      defined(__i586__) || defined(__i686__) || defined(vax) || defined(MIPSEL))
#  define HASH_LITTLE_ENDIAN 1
#  define HASH_BIG_ENDIAN 0
# elif (defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && \
        __BYTE_ORDER == __BIG_ENDIAN) || \
       (defined(sparc) || defined(POWERPC) || defined(mc68000) || defined(sel))
#  define HASH_LITTLE_ENDIAN 0
#  define HASH_BIG_ENDIAN 1
# else
#  define HASH_LITTLE_ENDIAN 0
#  define HASH_BIG_ENDIAN 0
# endif
#endif

//#warning "Jenkins"


#include <stdint.h>

namespace simhash {
    /**
     * 64-bit string hash built on Bob Jenkins' lookup3 hashlittle2().
     *
     * The mixing steps that lookup3.c provides as the macros rot(), mix() and
     * final() are private static functions here, so including this header no
     * longer defines those three names as macros.
     */
    struct jenkins {
        /**
         * Hashes `len` bytes starting at `data`.
         *
         * @param data bytes to hash; not read when `len` is 0
         * @param len  number of bytes
         * @param s    64-bit seed: its upper 32 bits seed the primary hash
         *             word, its lower 32 bits the secondary one
         * @return (primary hash << 32) | secondary hash
         */
        uint64_t operator()(const char* data, size_t len, uint64_t s) const {
            uint32_t a = static_cast<uint32_t>(s >> 32);
            uint32_t b = static_cast<uint32_t>(s & 0xffffffff);
            hashlittle2(data, len, &a, &b);
            return (static_cast<uint64_t>(a) << 32) | static_cast<uint64_t>(b);
        }

        /*
         * hashlittle2: return 2 32-bit hash values
         *
         * This is identical to hashlittle(), except it returns two 32-bit hash
         * values instead of just one. This is good enough for hash table
         * lookup with 2^^64 buckets, or if you want a second hash if you're
         * not happy with the first, or if you want a probably-unique 64-bit
         * ID for the key. *pc is better mixed than *pb, so use *pc first.
         * If you want a 64-bit value do something like:
         * "*pc + (((uint64_t)*pb) <<32)".
         *
         * The key is consumed strictly byte by byte, assembled in
         * little-endian order. That is what the original byte-wise branch
         * did, and the aligned 16/32-bit branches computed the same sums on
         * little-endian machines, so hash values are unchanged. Reading bytes
         * only means no access past key[length-1] and no access through a
         * mis-typed pointer.
         */
        void hashlittle2(
            const void *key,        /* the key to hash */
            size_t length,          /* length of the key */
            uint32_t *pc,           // IN: primary initval, OUT: primary hash
            uint32_t *pb) const {   // IN: secondary initval, OUT: secondary
            const uint8_t *k = static_cast<const uint8_t *>(key);
            uint32_t a, b, c;       /* internal state */

            /* Set up the internal state */
            a = b = c = 0xdeadbeef + static_cast<uint32_t>(length) + *pc;
            c += *pb;

            /* all but the last block: 12 bytes feed (a,b,c), then mix */
            while (length > 12) {
                a += load_le32(k);
                b += load_le32(k + 4);
                c += load_le32(k + 8);
                mix_block(a, b, c);
                length -= 12;
                k += 12;
            }

            /* zero length strings require no mixing */
            if (length == 0) {
                *pc = c;
                *pb = b;
                return;
            }

            /*
             * last block (1..12 bytes): bytes 0-3 go to a, 4-7 to b, 8-11 to c,
             * each at bit offset 8 * (index within its word)
             */
            uint32_t tail[3] = {0, 0, 0};
            for (size_t i = 0; i < length; ++i) {
                tail[i >> 2] += static_cast<uint32_t>(k[i]) << (8 * (i & 3));
            }
            a += tail[0];
            b += tail[1];
            c += tail[2];

            final_mix(a, b, c);
            *pc = c;
            *pb = b;
        }

    private:
        /* Rotate x left by k bits; callers pass 0 < k < 32. */
        static uint32_t rotl32(uint32_t x, unsigned k) {
            return (x << k) | (x >> (32 - k));
        }

        /* Assemble four bytes into a 32-bit value, least significant first. */
        static uint32_t load_le32(const uint8_t *p) {
            return static_cast<uint32_t>(p[0])
                 | (static_cast<uint32_t>(p[1]) << 8)
                 | (static_cast<uint32_t>(p[2]) << 16)
                 | (static_cast<uint32_t>(p[3]) << 24);
        }

        /* lookup3 mix(): reversibly mix three 32-bit values. */
        static void mix_block(uint32_t &a, uint32_t &b, uint32_t &c) {
            a -= c;  a ^= rotl32(c, 4);   c += b;
            b -= a;  b ^= rotl32(a, 6);   a += c;
            c -= b;  c ^= rotl32(b, 8);   b += a;
            a -= c;  a ^= rotl32(c, 16);  c += b;
            b -= a;  b ^= rotl32(a, 19);  a += c;
            c -= b;  c ^= rotl32(b, 4);   b += a;
        }

        /* lookup3 final(): final mixing of three 32-bit values into c. */
        static void final_mix(uint32_t &a, uint32_t &b, uint32_t &c) {
            c ^= b;  c -= rotl32(b, 14);
            a ^= c;  a -= rotl32(c, 11);
            b ^= a;  b -= rotl32(a, 25);
            c ^= b;  c -= rotl32(b, 16);
            a ^= c;  a -= rotl32(c, 4);
            b ^= a;  b -= rotl32(a, 14);
            c ^= b;  c -= rotl32(b, 24);
        }
    };
}

#endif