├── .arcconfig ├── .arclint ├── .clang-format ├── .github └── workflows │ └── CI.yml ├── .gitignore ├── CMakeLists.txt ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── TODO ├── analytics ├── CMakeLists.txt ├── README.md └── example.cpp ├── config ├── CMakeLists.txt ├── arxiv-harvester.json ├── config.h.in ├── doxyfile.in ├── mathhub-harvester.json └── zbl-harvester.json ├── data └── zbl │ └── zbl1.harvest ├── doc ├── XLSearch.pdf ├── aisc06.pdf ├── cade21.pdf ├── cicm12.pdf ├── images │ └── structure-2011-06.jpg ├── mir12.pdf ├── mkm07.pdf ├── mkm08.pdf ├── mws05.pdf └── ntcir10.pdf ├── scripts ├── CMakeLists.txt ├── cmake-modules │ ├── FindHtmlCxx.cmake │ ├── FindJson.cmake │ ├── FindLevelDb.cmake │ ├── FindMicroHttpd.cmake │ └── FindSnappy.cmake ├── elastic-search │ ├── config.sh │ ├── es-env.sh │ ├── run-bulk │ └── run-setup ├── other │ ├── ntcir-11-wiki │ │ ├── get-NTCIR-answers.rb │ │ ├── queries.xml │ │ └── submission │ │ │ └── KWARC-ntcir11-wiki.txt │ └── ntcir-11 │ │ ├── .gitignore │ │ ├── README.md │ │ ├── get-NTCIR-answers.rb │ │ ├── queries.pdf │ │ ├── queries.xml │ │ ├── random_hits.txt │ │ └── submission │ │ └── KWARC_default.tsv ├── send-mws-query.sh └── sysv │ ├── CMakeLists.txt │ ├── mws-config.in │ └── mwsd-service.sh.in ├── src ├── common │ ├── CMakeLists.txt │ ├── socket │ │ ├── CMakeLists.txt │ │ ├── InSocket.cpp │ │ ├── InSocket.hpp │ │ ├── OutSocket.cpp │ │ └── OutSocket.hpp │ ├── thread │ │ ├── CMakeLists.txt │ │ ├── ThreadWrapper.cpp │ │ └── ThreadWrapper.hpp │ ├── types │ │ ├── CMakeLists.txt │ │ ├── ControlSequence.cpp │ │ ├── ControlSequence.hpp │ │ ├── DataFormat.cpp │ │ ├── DataFormat.hpp │ │ ├── IdDictionary.hpp │ │ ├── Parcelable.cpp │ │ └── Parcelable.hpp │ └── utils │ │ ├── CMakeLists.txt │ │ ├── ContainerIterator.hpp │ │ ├── FlagParser.cpp │ │ ├── FlagParser.hpp │ │ ├── Path.hpp │ │ ├── TimeStamp.hpp │ │ ├── compiler_defs.h │ │ ├── fmemopen.c │ │ ├── getBoolType.cpp │ │ ├── getBoolType.hpp │ │ ├── getSockAddrLog.cpp │ │ 
├── getSockAddrLog.hpp │ │ ├── memstream.h │ │ ├── mmap.c │ │ ├── mmap.h │ │ ├── open_memstream.c │ │ ├── save_pid_file.c │ │ ├── save_pid_file.h │ │ ├── util.cpp │ │ └── util.hpp ├── crawler │ ├── CMakeLists.txt │ ├── crawler │ │ ├── CMakeLists.txt │ │ ├── MwsCrawler.cpp │ │ └── MwsCrawler.hpp │ ├── crawlerd.cpp │ ├── daemon │ │ ├── CMakeLists.txt │ │ ├── CrawlerDaemon.cpp │ │ ├── CrawlerDaemon.hpp │ │ └── GetResponse.html │ ├── docs2harvest.cpp │ ├── harvests2json.cpp │ ├── parser │ │ ├── CMakeLists.txt │ │ ├── MathParser.cpp │ │ ├── MathParser.hpp │ │ ├── XmlParser.cpp │ │ └── XmlParser.hpp │ ├── types │ │ ├── CMakeLists.txt │ │ ├── Robotstxt.cpp │ │ ├── Robotstxt.hpp │ │ └── SharedQueue.hpp │ └── utils │ │ ├── CMakeLists.txt │ │ ├── MwsGetMath.cpp │ │ ├── MwsGetMath.hpp │ │ ├── MwsMathMLTags.txt │ │ ├── Page.cpp │ │ └── Page.hpp └── mws │ ├── CMakeLists.txt │ ├── analytics │ ├── CMakeLists.txt │ ├── analytics.cpp │ └── analytics.hpp │ ├── daemon │ ├── CMakeLists.txt │ ├── Daemon.cpp │ ├── Daemon.hpp │ ├── GenericHttpResponses.hpp │ ├── HarvestQueryHandler.cpp │ ├── HarvestQueryHandler.hpp │ ├── IndexQueryHandler.cpp │ ├── IndexQueryHandler.hpp │ ├── QueryHandler.hpp │ ├── SchemaQueryHandler.cpp │ ├── SchemaQueryHandler.hpp │ └── microhttpd_linux.h │ ├── dbc │ ├── CMakeLists.txt │ ├── CrawlDb.hpp │ ├── DbQueryManager.cpp │ ├── DbQueryManager.hpp │ ├── FormulaDb.hpp │ ├── LevCrawlDb.cpp │ ├── LevCrawlDb.hpp │ ├── LevFormulaDb.cpp │ ├── LevFormulaDb.hpp │ ├── MemCrawlDb.cpp │ ├── MemCrawlDb.hpp │ ├── MemFormulaDb.cpp │ └── MemFormulaDb.hpp │ ├── index │ ├── CMakeLists.txt │ ├── CallbackIndexIterator.hpp │ ├── ExpressionEncoder.cpp │ ├── ExpressionEncoder.hpp │ ├── IndexAccessor.hpp │ ├── IndexBuilder.cpp │ ├── IndexBuilder.hpp │ ├── IndexIterator.hpp │ ├── IndexLoader.cpp │ ├── IndexLoader.hpp │ ├── IndexWriter.cpp │ ├── IndexWriter.hpp │ ├── MeaningDictionary.hpp │ ├── TmpIndex.cpp │ ├── TmpIndex.hpp │ ├── TmpIndexAccessor.hpp │ ├── encoded_token.h │ ├── index.h 
│ ├── memsector.c │ └── memsector.h │ ├── mws-index.cpp │ ├── mwsd.cpp │ ├── query │ ├── CMakeLists.txt │ ├── SchemaEngine.cpp │ ├── SchemaEngine.hpp │ ├── SearchContext.cpp │ ├── SearchContext.hpp │ ├── engine.c │ └── engine.h │ ├── schemad.cpp │ ├── types │ ├── Answer.hpp │ ├── CMakeLists.txt │ ├── CmmlToken.cpp │ ├── CmmlToken.hpp │ ├── ExprSchema.hpp │ ├── FormulaPath.hpp │ ├── GenericAnswer.hpp │ ├── MwsAnswset.hpp │ ├── MwsSubst.hpp │ ├── Query.hpp │ ├── SchemaAnswset.hpp │ └── VectorMap.hpp │ └── xmlparser │ ├── CMakeLists.txt │ ├── MwsIdsResponseFormatter.cpp │ ├── MwsIdsResponseFormatter.hpp │ ├── MwsJsonResponseFormatter.cpp │ ├── MwsJsonResponseFormatter.hpp │ ├── MwsXmlResponseFormatter.cpp │ ├── MwsXmlResponseFormatter.hpp │ ├── SchemaJsonResponseFormatter.cpp │ ├── SchemaJsonResponseFormatter.hpp │ ├── SchemaXmlResponseFormatter.cpp │ ├── SchemaXmlResponseFormatter.hpp │ ├── processMwsHarvest.cpp │ ├── processMwsHarvest.hpp │ ├── readMwsQuery.cpp │ ├── readMwsQuery.hpp │ ├── xmlparser.cpp │ └── xmlparser.hpp ├── test ├── data │ ├── MwsQuery1.xml │ ├── SchemaQuery.xml │ ├── astro-ph0001197.html │ ├── ci_renaming.harvest │ ├── data1.harvest │ ├── data2.harvest │ ├── data3.harvest │ ├── data4.harvest │ ├── empty.harvest │ ├── eq_ambiguity.harvest │ └── zbl4138077.xhtml └── src │ ├── common │ ├── CMakeLists.txt │ ├── types │ │ ├── CMakeLists.txt │ │ └── Parcelable.cpp │ └── utils │ │ ├── CMakeLists.txt │ │ └── common_utils_mmap.c │ └── mws │ ├── CMakeLists.txt │ ├── dbc │ ├── CMakeLists.txt │ ├── MemCrawlDb.cpp │ └── MemFormulaDb.cpp │ ├── index │ ├── CMakeLists.txt │ ├── TmpIndexNode_exportToMemsector.cpp │ ├── ci_renaming.cpp │ └── loadMwsHarvestFromFdTest.cpp │ ├── mws-integration-test.sh.in │ ├── parser │ ├── CMakeLists.txt │ ├── readMwsQueryTest.cpp │ └── writeXmlAnswsetTest.cpp │ └── query │ ├── CMakeLists.txt │ ├── engine_novars0.cpp │ ├── engine_qvar0.cpp │ ├── engine_qvar_hvar0.cpp │ ├── engine_rep_qvar0.cpp │ ├── engine_rep_qvar1.cpp │ ├── 
engine_rep_qvars0.cpp │ ├── engine_rep_qvars1.cpp │ ├── engine_rep_qvars2.cpp │ ├── engine_rootqvar0.cpp │ ├── engine_tester.hpp │ ├── range_query.cpp │ ├── schema_decoder.cpp │ ├── schema_engine_tester.hpp │ ├── schema_hashing.cpp │ └── schema_reducer.cpp └── third_party ├── cmake-modules └── FindICU.cmake ├── common ├── CMakeLists.txt └── crc32 │ ├── CMakeLists.txt │ ├── crc32.c │ └── crc32 │ └── crc32.h └── crawler ├── CMakeLists.txt └── googleurl ├── CMakeLists.txt ├── LICENSE.txt ├── README.txt └── src ├── basictypes.h ├── gurl.cc ├── gurl.h ├── logging.h ├── macros.h ├── scoped_ptr.h ├── string16.cc ├── string16.h ├── url_canon.h ├── url_canon_etc.cc ├── url_canon_filesystemurl.cc ├── url_canon_fileurl.cc ├── url_canon_host.cc ├── url_canon_icu.cc ├── url_canon_icu.h ├── url_canon_internal.cc ├── url_canon_internal.h ├── url_canon_internal_file.h ├── url_canon_ip.cc ├── url_canon_ip.h ├── url_canon_mailtourl.cc ├── url_canon_path.cc ├── url_canon_pathurl.cc ├── url_canon_query.cc ├── url_canon_relative.cc ├── url_canon_stdstring.h ├── url_canon_stdurl.cc ├── url_common.h ├── url_file.h ├── url_parse.cc ├── url_parse.h ├── url_parse_file.cc ├── url_parse_internal.h ├── url_util.cc ├── url_util.h └── url_util_internal.h /.arcconfig: -------------------------------------------------------------------------------- 1 | { 2 | "project_id" : "mathwebsearch", 3 | "conduit_uri" : "http://phab.code4fun.de/" 4 | } 5 | -------------------------------------------------------------------------------- /.arclint: -------------------------------------------------------------------------------- 1 | { 2 | "exclude": [ 3 | "(^third_party/)", 4 | "(^data/)", 5 | "(^test/data/)", 6 | "(^\\.)" 7 | ], 8 | "linters": { 9 | "cpplint": { 10 | "type": "cpplint", 11 | "include": "(\\.cpp$|\\.hpp|\\.c|\\.h)" 12 | }, 13 | "filename": { 14 | "type": "filename" 15 | }, 16 | "generated": { 17 | "type": "generated" 18 | }, 19 | "merge-conflict": { 20 | "type": "merge-conflict" 21 | }, 22 | 
"nolint": { 23 | "type": "nolint" 24 | }, 25 | "text": { 26 | "type": "text" 27 | }, 28 | "spelling": { 29 | "type": "spelling" 30 | } 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: Google 2 | AccessModifierOffset: -3 3 | IndentCaseLabels: false 4 | IndentWidth: 4 5 | PointerBindsToType: true 6 | DerivePointerBinding: false 7 | -------------------------------------------------------------------------------- /.github/workflows/CI.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | buildlinux: 7 | runs-on: ubuntu-latest 8 | name: 'Ubuntu / ${{ matrix.compiler }}' 9 | strategy: 10 | matrix: 11 | compiler: [gcc] 12 | steps: 13 | - uses: actions/checkout@v2 14 | - name: Setup dependencies 15 | run: | 16 | sudo apt-get update && \ 17 | sudo apt-get -y install \ 18 | cmake \ 19 | make \ 20 | pkg-config \ 21 | libmicrohttpd-dev \ 22 | libxml2-dev \ 23 | libleveldb-dev \ 24 | libsnappy-dev \ 25 | libjson-c-dev \ 26 | libhtmlcxx-dev \ 27 | libicu-dev \ 28 | libcurl4-gnutls-dev \ 29 | netcat \ 30 | curl 31 | 32 | - run: make 33 | env: 34 | CC: ${{ matrix.compiler }} 35 | - run: make test 36 | env: 37 | CC: ${{ matrix.compiler }} 38 | buildmac: 39 | runs-on: macos-latest 40 | name: 'macOS / ${{ matrix.compiler }}' 41 | strategy: 42 | matrix: 43 | compiler: [gcc, clang] 44 | steps: 45 | - uses: actions/checkout@v2 46 | - name: Setup dependencies 47 | run: | 48 | brew install \ 49 | cmake \ 50 | curl \ 51 | gcc \ 52 | gnutls \ 53 | htmlcxx \ 54 | icu4c \ 55 | json-c \ 56 | leveldb \ 57 | libmicrohttpd \ 58 | libxml2 \ 59 | make \ 60 | netcat \ 61 | pkg-config \ 62 | snappy \ 63 | && brew link --force icu4c 64 | 65 | - run: make 66 | env: 67 | CC: ${{ matrix.compiler }} 68 | - run: make test 69 | env: 70 | CC: ${{ 
matrix.compiler }} 71 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files 2 | *.slo 3 | *.lo 4 | *.o 5 | 6 | # Compiled Dynamic libraries 7 | *.so 8 | *.dylib 9 | 10 | # Compiled Static libraries 11 | *.lai 12 | *.la 13 | *.a 14 | 15 | # Build directory 16 | bin/ 17 | 18 | # others 19 | .DS_Store 20 | 21 | # Eclipse CBP 22 | .project 23 | 24 | # qtCreator 25 | CMakeLists.txt.user* 26 | 27 | # vim temporary files 28 | *.swp 29 | *.swo 30 | 31 | # ctags 32 | tags 33 | 34 | # idea 35 | .idea 36 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | ### Dockerfile for MathWebSearch 2 | 3 | ## Builder Image 4 | FROM debian:stretch-slim as builder 5 | 6 | # Install dependencies 7 | RUN apt-get update && apt-get -y install \ 8 | cmake \ 9 | g++ \ 10 | make \ 11 | pkg-config \ 12 | libmicrohttpd-dev \ 13 | libxml2-dev \ 14 | libleveldb-dev \ 15 | libsnappy-dev \ 16 | libjson-c-dev \ 17 | libhtmlcxx-dev \ 18 | libgnutls28-dev \ 19 | libicu-dev \ 20 | libcurl4-gnutls-dev \ 21 | doxygen \ 22 | netcat \ 23 | curl \ 24 | && apt-get clean 25 | 26 | ## Setup file structure under /mws 27 | ADD analytics/ /mws/analytics 28 | ADD config/ mws/config 29 | ADD scripts/ mws/scripts 30 | ADD data /mws/data 31 | ADD src /mws/src 32 | ADD test/ mws/test 33 | ADD third_party/ mws/third_party/ 34 | 35 | ADD .arcconfig /mws/ 36 | ADD .arclint /mws/ 37 | ADD .clang-format /mws/ 38 | ADD CMakeLists.txt /mws/ 39 | ADD Makefile /mws/ 40 | 41 | # Build and install into /install 42 | WORKDIR /mws/ 43 | RUN make all test 44 | 45 | ## 46 | ## add a runtime image 47 | FROM debian:stretch-slim 48 | 49 | # Install runtime libraries 50 | RUN apt-get update && apt-get --no-install-recommends -y install \ 51 | libmicrohttpd12 \ 52 | 
libxml2 \ 53 | libleveldb1v5 \ 54 | libsnappy1v5 \ 55 | libjson-c3 \ 56 | libhtmlcxx3v5 \ 57 | libgnutlsxx28 \ 58 | libicu57 \ 59 | libcurl3-gnutls \ 60 | && apt-get clean 61 | 62 | ## Setup file structure under /mws 63 | ADD config/ mws/config 64 | ADD scripts/ mws/scripts 65 | ADD README.md /mws 66 | ADD LICENSE /mws/ 67 | COPY --from=builder /mws/bin/ /mws/bin 68 | 69 | ## And expand the path variable 70 | ENV HOST="0.0.0.0" 71 | ENV PATH="/mws/bin:${PATH}" 72 | 73 | # Add a /data/ volume and a port to run on 74 | VOLUME /data/ 75 | EXPOSE 8080 76 | 77 | # Run the MWS Daemon 78 | CMD "/mws/bin/mwsd" "-I" "/data/" "-p" "8080" -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2010-2014 KWARC Group 3 | # 4 | # This file is part of MathWebSearch. 5 | # 6 | # MathWebSearch is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # MathWebSearch is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with MathWebSearch. If not, see . 18 | # 19 | # 20 | # Makefile -- 21 | # 22 | 23 | all: bin/cmake_bootstrap_success 24 | @cd bin && make --no-print-directory $@ 25 | 26 | clean: 27 | @rm -rf bin/ build/ 28 | 29 | config: bin/CMakeCache.txt 30 | @ccmake bin/ 31 | 32 | test: bin/cmake_bootstrap_success all 33 | @cd bin && make --no-print-directory $@ 34 | 35 | # CMake setup 36 | bin/CMakeCache.txt: 37 | @mkdir -p bin 38 | -@cd bin && cmake .. 
39 | 40 | # CMake successful setup 41 | bin/cmake_bootstrap_success: 42 | @mkdir -p bin 43 | @cd bin && cmake .. 44 | @touch bin/cmake_bootstrap_success 45 | 46 | # Forward targets to cmake generated makefile 47 | %: bin/cmake_bootstrap_success 48 | @cd bin && make --no-print-directory $@ 49 | 50 | .PHONY: all clean config test 51 | .SECONDARY: bin/cmake_bootstrap_success 52 | -------------------------------------------------------------------------------- /TODO: -------------------------------------------------------------------------------- 1 | WONTFIX: 2 | - memleak in xmlparser/write (when xmlOutputFlush fails) is singular and 3 | caused by the text buffer within __xmlRaiseError not being freed 4 | -------------------------------------------------------------------------------- /analytics/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2010-2013 KWARC Group 3 | # 4 | # This file is part of MathWebSearch. 5 | # 6 | # MathWebSearch is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # MathWebSearch is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with MathWebSearch. If not, see . 
18 | # 19 | # 20 | # analytics/CMakeLists.txt -- 21 | # 22 | 23 | # Dependencies 24 | 25 | # Includes 26 | 27 | # Flags 28 | 29 | # Sources 30 | FILE(GLOB SOURCES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cpp") 31 | 32 | # Binaries 33 | FOREACH(source ${SOURCES}) 34 | GET_FILENAME_COMPONENT(SourceName ${source} NAME_WE) 35 | # Generate Binaries 36 | ADD_EXECUTABLE(analyze_${SourceName} ${source}) 37 | TARGET_LINK_LIBRARIES(analyze_${SourceName} 38 | mwsanalytics 39 | commonutils 40 | ) 41 | ENDFOREACH(source) 42 | -------------------------------------------------------------------------------- /analytics/README.md: -------------------------------------------------------------------------------- 1 | MathWebSearch Analytics 2 | ======================= 3 | 4 | About 5 | ----- 6 | This module provides an interface for analyzing all expressions contained 7 | in an MWS index. 8 | 9 | Workflow 10 | -------- 11 | 12 | Create a C++ source file following this format and save it in 13 | the [analytics/](.) directory. For documentation on the API 14 | functions, see [analytics.hpp](/src/mws/analytics/analytics.hpp). 15 | 16 | ``` cpp 17 | #include "mws/analytics/analytics.hpp" 18 | 19 | namespace mws { 20 | namespace analytics { 21 | 22 | AnalyticsStatus analyze_begin(const index_handle_t* index, 23 | const inode_t* root) { 24 | // ... 25 | return ANALYTICS_OK; 26 | } 27 | 28 | AnalyticsStatus analyze_expression(const types::CmmlToken* cmmlToken, 29 | uint32_t num_hits) { 30 | // ... 31 | return ANALYTICS_OK; 32 | } 33 | 34 | void analyze_end() { 35 | // ... 36 | } 37 | 38 | } // namespace analytics 39 | } // namespace mws 40 | ``` 41 | 42 | When building MWS, each source file in `analytics/` automatically generates 43 | an executable target. For example, `example.cpp` generates 44 | `bin/analytics/analyze_example`. 
This can be run as: 45 | 46 | bin/analytics/analyze_example -I /path/to/saved/index/ 47 | 48 | To generate an index, run: 49 | 50 | bin/mws-index -I /path/to/harvests -o /path/to/save/index 51 | 52 | -------------------------------------------------------------------------------- /analytics/example.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (C) 2010-2013 KWARC Group 4 | 5 | This file is part of MathWebSearch. 6 | 7 | MathWebSearch is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | MathWebSearch is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with MathWebSearch. If not, see <http://www.gnu.org/licenses/>. 19 | 20 | */ 21 | /** 22 | * This is an example use case of the analytics API. 23 | * This program computes the average depth and size of expressions, 24 | * as well as the number of unique expressions in the index. 
25 | * 26 | * @brief Simple expression analytics 27 | * @file example.cpp 28 | * @author Corneliu Prodescu 29 | * @date 20 Jun 2014 30 | */ 31 | 32 | #include <cinttypes> 33 | #include <cstdio> 34 | 35 | #include "common/utils/compiler_defs.h" 36 | #include "mws/analytics/analytics.hpp" 37 | 38 | namespace mws { 39 | namespace analytics { 40 | // Running totals accumulated across analyze_expression() calls. 41 | static uint64_t total_hits; 42 | static uint64_t unique_hits; 43 | static double depth_sum; 44 | static double size_sum; 45 | // Prints the child count of the index root; the index handle itself is unused. 46 | AnalyticsStatus analyze_begin(const index_handle_t* index, 47 | const inode_t* root) { 48 | UNUSED(index); 49 | printf("Root has %" PRIu64 " children\n", root->size); 50 | return ANALYTICS_OK; 51 | } 52 | // Folds one expression into the running totals (hits, count, depth, size). 53 | AnalyticsStatus analyze_expression(const types::CmmlToken* cmmlToken, 54 | uint32_t num_hits) { 55 | if (verbose) { 56 | PRINT_LOG("Analyzing expression %s\n", cmmlToken->toString().c_str()); 57 | } 58 | total_hits += num_hits; 59 | unique_hits++; 60 | depth_sum += cmmlToken->getExprDepth(); 61 | size_sum += cmmlToken->getExprSize(); 62 | return ANALYTICS_OK; 63 | } 64 | // Prints the totals and per-expression averages; averages are 0 (not NaN) when nothing was analyzed. 65 | void analyze_end() { 66 | printf("Index contains %" PRIu64 " hits\n", total_hits); 67 | printf("Index contains %" PRIu64 " unique expressions\n", unique_hits); 68 | printf("Average expression depth is %f\n", unique_hits ? depth_sum / unique_hits : 0.0); 69 | printf("Average expression size is %f\n", unique_hits ? size_sum / unique_hits : 0.0); 70 | } 71 | 72 | } // namespace analytics 73 | } // namespace mws 74 | -------------------------------------------------------------------------------- /config/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2010-2013 KWARC Group 3 | # 4 | # This file is part of MathWebSearch. 5 | # 6 | # MathWebSearch is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 
10 | # 11 | # MathWebSearch is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with MathWebSearch. If not, see . 18 | # 19 | CONFIGURE_FILE(config.h.in 20 | ${PROJECT_BINARY_DIR}/build-gen/config.h) 21 | CONFIGURE_FILE(doxyfile.in doxyfile) 22 | -------------------------------------------------------------------------------- /config/arxiv-harvester.json: -------------------------------------------------------------------------------- 1 | { 2 | "shouldSaveData" : true, 3 | "textWithMathXpath" : "//body", 4 | "documentIdXpath" : "", 5 | "metadata" : { 6 | "title" : "//title", 7 | "author" : "//span[@class='ltx_authors']" 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /config/config.h.in: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (C) 2010-2013 KWARC Group 4 | 5 | This file is part of MathWebSearch. 6 | 7 | MathWebSearch is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | MathWebSearch is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with MathWebSearch. If not, see . 
19 | 20 | */ 21 | /** 22 | * @file config.h 23 | * @brief Configuration header 24 | */ 25 | 26 | #ifndef _CONFIG_CONFIG_H 27 | #define _CONFIG_CONFIG_H 28 | 29 | // CMake Exported 30 | // The ${...} placeholders below are filled in by CONFIGURE_FILE (see config/CMakeLists.txt). 31 | #define MWS_NAME "${PROJECT_NAME}" 32 | #define MWS_VERSION "${MWS_FULL_VERSION}" 33 | #define MATHMLTAGS_PATH "${PROJECT_BINARY_DIR}/src/crawler/utils/MwsMathMLTags.txt" 34 | #define GETRESPONSE_PATH "${PROJECT_BINARY_DIR}/src/crawler/daemon/GetResponse.html" 35 | #define HARVESTFILES_PATH "${PROJECT_SOURCE_DIR}/data" 36 | #define MWS_TESTDATA_PATH "${PROJECT_SOURCE_DIR}/test/data" 37 | // Build identification string: name, version and compile date/time. 38 | #define MWS_BUILD MWS_NAME "-" MWS_VERSION " as of " __DATE__ " " __TIME__ 39 | 40 | // Common 41 | 42 | //! Extension of Mws Harvest files 43 | #define DEFAULT_MWS_HARVEST_EXTENSION "harvest" 44 | 45 | // MWS Daemon 46 | #define DEFAULT_MWS_PORT 9090 47 | 48 | // MWS Query 49 | // NOTE(review): DEFAULT_QUERY_RESULT_TOTAL (100000) is larger than MAX_QUERY_RESULT_TOTAL (12000) -- confirm this is intended. 50 | /// Number of computed hits (returned or not) 51 | #define MAX_QUERY_RESULT_TOTAL 12000 52 | #define DEFAULT_QUERY_RESULT_TOTAL 100000 53 | 54 | /// Number of requested results 55 | #define MAX_QUERY_RESULT_SIZE 100 56 | #define DEFAULT_QUERY_RESULT_SIZE 30 57 | 58 | /// Query offset 59 | #define MAX_QUERY_OFFSET 12000 60 | #define DEFAULT_QUERY_OFFSET 0 61 | 62 | /// Request total number of results 63 | #define DEFAULT_QUERY_TOTALREQ true 64 | 65 | // Index 66 | #define INDEX_MEMSECTOR_FILE "index.memsector" 67 | #define MEANING_DICTIONARY_FILE "meanings.dat" 68 | #define CRAWL_DB_FILE "crawl.db" 69 | #define FORMULA_DB_FILE "formula.db" 70 | // Becomes a #define or a commented-out #undef depending on whether APPLY_RESTRICTIONS is set at configure time. 71 | #cmakedefine APPLY_RESTRICTIONS 72 | 73 | // Schema Daemon 74 | #define DEFAULT_SCHEMA_PORT 9080 75 | 76 | // Schema Query 77 | #define DEFAULT_SCHEMA_DEPTH 3 78 | 79 | #endif // _CONFIG_CONFIG_H 80 | -------------------------------------------------------------------------------- /config/mathhub-harvester.json: -------------------------------------------------------------------------------- 1 | { 2 | "shouldSaveData" : true, 3 | "textWithMathXpath" : "//body", 4 | 
"documentIdXpath" : "//meta[@name='mmturi']/@content", 5 | "metadata" : { 6 | "title" : "//title", 7 | "url" : "//meta[@name='url']/@content", 8 | "mmturi" : "//meta[@name='mmturi']/@content" 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /config/zbl-harvester.json: -------------------------------------------------------------------------------- 1 | { 2 | "shouldSaveData" : true, 3 | "documentIdXpath" : "//*[local-name()='span' and @class='number']", 4 | "textWithMathXpath" : "//*[local-name()='div' and @class='review-body']", 5 | "metadata" : { 6 | "title" : "//*[local-name()='div' and @class='title']", 7 | "author" : "//*[local-name()='span' and @class='author']", 8 | "language" : "//*[local-name()='div' and @class='language']", 9 | "class" : "//*[local-name()='div' and @class='class']", 10 | "keywords" : "//*[local-name()='div' and @class='keywords']", 11 | "doctype" : "//*[local-name()='div' and @class='doctype']" 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /doc/XLSearch.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MathWebSearch/mws/35306ab7fd8a01b39426881cbe8088197e023f36/doc/XLSearch.pdf -------------------------------------------------------------------------------- /doc/aisc06.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MathWebSearch/mws/35306ab7fd8a01b39426881cbe8088197e023f36/doc/aisc06.pdf -------------------------------------------------------------------------------- /doc/cade21.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MathWebSearch/mws/35306ab7fd8a01b39426881cbe8088197e023f36/doc/cade21.pdf -------------------------------------------------------------------------------- /doc/cicm12.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/MathWebSearch/mws/35306ab7fd8a01b39426881cbe8088197e023f36/doc/cicm12.pdf -------------------------------------------------------------------------------- /doc/images/structure-2011-06.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MathWebSearch/mws/35306ab7fd8a01b39426881cbe8088197e023f36/doc/images/structure-2011-06.jpg -------------------------------------------------------------------------------- /doc/mir12.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MathWebSearch/mws/35306ab7fd8a01b39426881cbe8088197e023f36/doc/mir12.pdf -------------------------------------------------------------------------------- /doc/mkm07.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MathWebSearch/mws/35306ab7fd8a01b39426881cbe8088197e023f36/doc/mkm07.pdf -------------------------------------------------------------------------------- /doc/mkm08.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MathWebSearch/mws/35306ab7fd8a01b39426881cbe8088197e023f36/doc/mkm08.pdf -------------------------------------------------------------------------------- /doc/mws05.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MathWebSearch/mws/35306ab7fd8a01b39426881cbe8088197e023f36/doc/mws05.pdf -------------------------------------------------------------------------------- /doc/ntcir10.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MathWebSearch/mws/35306ab7fd8a01b39426881cbe8088197e023f36/doc/ntcir10.pdf 
-------------------------------------------------------------------------------- /scripts/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2010-2013 KWARC Group 3 | # 4 | # This file is part of MathWebSearch. 5 | # 6 | # MathWebSearch is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # MathWebSearch is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with MathWebSearch. If not, see . 18 | # 19 | 20 | ADD_SUBDIRECTORY(sysv) 21 | -------------------------------------------------------------------------------- /scripts/cmake-modules/FindHtmlCxx.cmake: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2010-2013 KWARC Group 3 | # 4 | # This file is part of MathWebSearch. 5 | # 6 | # MathWebSearch is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # MathWebSearch is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with MathWebSearch. If not, see . 
18 | # 19 | # Find htmlcxx headers and libraries. 20 | # 21 | # HTMLCXX_INCLUDE_DIRS - where to find htmlcxx/html/tree.h, etc. 22 | # HTMLCXX_LIBRARIES - List of libraries when using HTMLCXX. 23 | # HTMLCXX_FOUND - True if HTMLCXX found. 24 | 25 | IF (HTMLCXX_INCLUDE_DIRS AND HTMLCXX_LIBRARIES) 26 | SET(HTMLCXX_FIND_QUIETLY TRUE) 27 | ENDIF (HTMLCXX_INCLUDE_DIRS AND HTMLCXX_LIBRARIES) 28 | 29 | FIND_PACKAGE (PkgConfig QUIET) 30 | IF (PKGCONFIG_FOUND) 31 | PKG_CHECK_MODULES(PC_HTMLCXX QUIET htmlcxx) 32 | ENDIF (PKGCONFIG_FOUND) 33 | 34 | FIND_PATH(HTMLCXX_INCLUDE_DIRS 35 | NAMES 36 | htmlcxx/html/tree.h 37 | PATHS 38 | ${PC_HTMLCXX_INCLUDE_DIRS} 39 | /usr/local/include 40 | /usr/include 41 | $ENV{HTMLCXX} 42 | $ENV{HTMLCXX}/include 43 | ) 44 | 45 | FIND_LIBRARY(HTMLCXX_LIBRARIES 46 | NAMES 47 | htmlcxx 48 | PATHS 49 | ${PC_HTMLCXX_LIBRARY_DIRS} 50 | /usr/local/lib 51 | /usr/lib 52 | $ENV{HTMLCXX} 53 | $ENV{HTMLCXX}/lib 54 | ) 55 | 56 | # handle the QUIETLY and REQUIRED arguments and set *_FOUND 57 | INCLUDE(FindPackageHandleStandardArgs) 58 | FIND_PACKAGE_HANDLE_STANDARD_ARGS(HTMLCXX DEFAULT_MSG HTMLCXX_LIBRARIES HTMLCXX_INCLUDE_DIRS) 59 | 60 | MARK_AS_ADVANCED(HTMLCXX_INCLUDE_DIRS HTMLCXX_LIBRARIES) 61 | -------------------------------------------------------------------------------- /scripts/cmake-modules/FindJson.cmake: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2010-2013 KWARC Group 3 | # 4 | # This file is part of MathWebSearch. 5 | # 6 | # MathWebSearch is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 
#
# Locate the json-c headers and library.
#
# Result variables:
#   JSON_FOUND        - system has LibJson
#   JSON_INCLUDE_DIRS - Json include directory (contains json-c/json.h)
#   JSON_LIBRARIES    - Link these to use Json
#
# The JSON environment variable may point at a custom install prefix.

# Both results are already known (e.g. cached): keep this run quiet.
IF (JSON_INCLUDE_DIRS AND JSON_LIBRARIES)
    SET(JSON_FIND_QUIETLY TRUE)
ENDIF (JSON_INCLUDE_DIRS AND JSON_LIBRARIES)

FIND_PACKAGE (PkgConfig QUIET)
IF (PKGCONFIG_FOUND)
    # The header and library searches below look for "json-c" (with "json"
    # as the alternative library name), so ask pkg-config for both module
    # names as well. PKG_SEARCH_MODULE uses the first module that exists,
    # whereas PKG_CHECK_MODULES would require all listed modules.
    PKG_SEARCH_MODULE(PC_JSON QUIET json-c json)
ENDIF (PKGCONFIG_FOUND)

FIND_PATH(JSON_INCLUDE_DIRS
    NAMES
        json-c/json.h
        json-c/json_object.h
    PATHS
        ${PC_JSON_INCLUDE_DIRS}
        /usr/include
        /usr/local/include
        $ENV{JSON}
        $ENV{JSON}/include
)

FIND_LIBRARY(JSON_LIBRARIES
    NAMES
        json-c json
    PATHS
        ${PC_JSON_LIBRARY_DIRS}
        /usr/lib
        /usr/local/lib
        $ENV{JSON}
        $ENV{JSON}/lib
)

# handle the QUIETLY and REQUIRED arguments and set *_FOUND
INCLUDE(FindPackageHandleStandardArgs)
FIND_PACKAGE_HANDLE_STANDARD_ARGS(JSON DEFAULT_MSG JSON_LIBRARIES JSON_INCLUDE_DIRS)

MARK_AS_ADVANCED(JSON_INCLUDE_DIRS JSON_LIBRARIES)
#
# Locate the LevelDB headers and library.
#
# Result variables:
#   LEVELDB_FOUND       - system has LevelDB
#   LEVELDB_INCLUDE_DIR - directory that contains leveldb/db.h
#   LEVELDB_LIBRARIES   - link these to use LevelDB
#
# The LEVELDB environment variable may point at a custom install prefix.
# No compiler definitions are produced by this module.

# Both results are already known (e.g. cached): keep this run quiet.
if(LEVELDB_INCLUDE_DIR AND LEVELDB_LIBRARIES)
  set(LEVELDB_FIND_QUIETLY TRUE)
endif()

find_path(LEVELDB_INCLUDE_DIR
  NAMES
    leveldb/db.h
  HINTS
    /usr/include
    /usr/local/include
    $ENV{LEVELDB}
    $ENV{LEVELDB}/include
)

find_library(LEVELDB_LIBRARIES
  NAMES
    leveldb
    leveldb.dll.a
    leveldb.a
  HINTS
    /usr/lib
    /usr/local/lib
    $ENV{LEVELDB}
    $ENV{LEVELDB}/lib
)

# Honour QUIET / REQUIRED and derive the *_FOUND result from the two lookups.
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(LevelDb DEFAULT_MSG LEVELDB_LIBRARIES LEVELDB_INCLUDE_DIR)

mark_as_advanced(LEVELDB_INCLUDE_DIR LEVELDB_LIBRARIES)
#
# Locate the GNU libmicrohttpd headers and library.
#
# Result variables:
#   MICROHTTPD_FOUND        - system has MicroHttpd
#   MICROHTTPD_INCLUDE_DIRS - the MicroHttpd include directory
#   MICROHTTPD_LIBRARIES    - Link these to use MicroHttpd
#   MICROHTTPD_DEFINITIONS  - Compiler switches required for using MicroHttpd
#                             ("-DMICROHTTPD_DEPRECATED" for versions < 0.9)
#
# The MICROHTTPD environment variable may point at a custom install prefix.

# Both results are already known (e.g. cached): keep this run quiet.
IF (MICROHTTPD_INCLUDE_DIRS AND MICROHTTPD_LIBRARIES)
    SET(MICROHTTPD_FIND_QUIETLY TRUE)
ENDIF (MICROHTTPD_INCLUDE_DIRS AND MICROHTTPD_LIBRARIES)

FIND_PACKAGE (PkgConfig REQUIRED QUIET)
PKG_CHECK_MODULES (PC_MICROHTTPD QUIET libmicrohttpd)

FIND_PATH(MICROHTTPD_INCLUDE_DIRS
    NAMES
        microhttpd.h
    PATH_SUFFIXES
        microhttpd
    HINTS
        ${PC_MICROHTTPD_INCLUDE_DIRS}
        /usr/include
        /usr/local/include
        $ENV{MICROHTTPD}
        $ENV{MICROHTTPD}/include
)

FIND_LIBRARY(MICROHTTPD_LIBRARIES
    NAMES
        # "microhttd.a" in the previous revision was a typo and could never match.
        microhttpd microhttpd.dll.a microhttpd.a
    HINTS
        ${PC_MICROHTTPD_LIBRARY_DIRS}
        /usr/lib
        /usr/local/lib
        $ENV{MICROHTTPD}
        $ENV{MICROHTTPD}/lib
)

SET(MICROHTTPD_DEFINITIONS "")
# Test the variable by name: expanding it first breaks when it is unset.
IF (PC_MICROHTTPD_FOUND)
    # VERSION_LESS compares dotted version strings component-wise. The
    # numeric LESS operator does not understand values such as "0.4.6",
    # so the deprecated-API switch was never enabled for them.
    IF (PC_MICROHTTPD_VERSION VERSION_LESS 0.9)
        SET(MICROHTTPD_DEFINITIONS "-DMICROHTTPD_DEPRECATED")
    ENDIF (PC_MICROHTTPD_VERSION VERSION_LESS 0.9)
ENDIF (PC_MICROHTTPD_FOUND)

# handle the QUIETLY and REQUIRED arguments and set *_FOUND
INCLUDE(FindPackageHandleStandardArgs)
FIND_PACKAGE_HANDLE_STANDARD_ARGS(MicroHttpd DEFAULT_MSG MICROHTTPD_LIBRARIES MICROHTTPD_INCLUDE_DIRS)

MARK_AS_ADVANCED(MICROHTTPD_INCLUDE_DIRS MICROHTTPD_LIBRARIES MICROHTTPD_DEFINITIONS)
#!/bin/bash
#
# Send Elasticsearch bulk files to a group of local instances.
#
# Every file given on the command line is split into 100-line pieces and each
# piece is POSTed to the _bulk endpoint of a randomly chosen instance
# listening on a port in [PORT, PORT + NUM_INSTANCES - 1]. HOST, PORT, INDEX
# and NUM_INSTANCES come from config.sh next to this script.
#
# A fully processed file is marked with "<file>.ok"; the first failing
# response is saved to "<file>.err". Files carrying either marker are skipped.

curr_dir=${0%/*}
source "$curr_dir/config.sh"

MIN_PORT=$PORT
MAX_PORT=$((MIN_PORT + NUM_INSTANCES - 1))

if [ $# -lt 1 ]; then
    cat << EOF
Expected at least 1 bulk file
Usage:
    $0 <bulk_file> ...
EOF
    exit 1
fi

# Scratch directory of the file currently being processed. It is created with
# mktemp so that concurrent runs do not wipe each other's pieces, and removed
# on every exit path by the trap below.
TMP_DIR=""
cleanup() {
    if [ -n "$TMP_DIR" ]; then
        rm -rf "$TMP_DIR"
    fi
}
trap cleanup EXIT

while [ $# -gt 0 ]; do
    if [ -f "$1.ok" ] || [ -f "$1.err" ]; then
        echo Skipping "$1": already processed
    else
        TMP_DIR="$(mktemp -d "${TMPDIR:-/tmp}/bulk-split.XXXXXX")" || exit 1
        split --lines 100 "$1" "$TMP_DIR/file"

        failed=0
        for file in "$TMP_DIR"/*; do
            # Use a dedicated variable so the configured PORT stays intact
            # for the final refresh request.
            port=$(shuf -i "$MIN_PORT-$MAX_PORT" -n 1)
            OUTPUT="$(curl -s -S -XPOST "$HOST:$port/_bulk?index=$INDEX&type=doc" --data-binary "@$file" 2>&1)"
            if ! echo "$OUTPUT" | grep -q '"errors":false'; then
                echo Error at "$1"
                # Quoted so the saved response keeps its original whitespace.
                echo "$OUTPUT" > "$1.err"
                failed=1
                break
            fi
        done

        rm -rf "$TMP_DIR"
        TMP_DIR=""

        if [ "$failed" -eq 0 ]; then
            echo Processed "$1"
            touch "$1.ok"
        fi
    fi

    shift
done

# Make everything that was just indexed visible to searches.
curl -s -XPOST "$HOST:$PORT/$INDEX/_refresh" &> /dev/null
#!/usr/bin/ruby -w
#
# Builds a results file from the answers of a locally running TeMa server.
#
# For every topic in QUERY_LOCATION the formulas of the topic are posted to
# TEMA_URL one by one; the hits of all formulas of a topic are merged, ranked
# by score and written to OUTPUT as tab-separated lines.

require 'rexml/document'    # xml parsing
include REXML
require 'open-uri'          # params escaping
require 'net/http'          # get requests
require 'json'              # tema answer parsing

OUTPUT = "./KWARC.txt"      # file the ranked results are written to
RUN = "default"             # suffix of the run tag ("KWARC_<RUN>") in each output line

$output = File.new(OUTPUT, "w");

# topic number => { hit id => best score seen so far }
$results = {}

# Merges the hits of one server response into $results[num].
#
# num        - topic number the response belongs to
# resp       - response body; parsed as JSON and expected to hold a "hits" list
# hasFormula - not used by this method
#
# A hit id that was already recorded keeps the larger of the two scores.
# Responses without a "hits" entry are ignored.
def register_result num, resp, hasFormula
    $results[num] ||= {}
    resp_json = JSON.parse resp
    hits = resp_json["hits"]
    return if hits.nil?

    hits.each do |hit|
        fileName = hit["id"]
        score = hit["score"]
        currScore = $results[num][fileName]
        if currScore
            $results[num][fileName] = [currScore, score].max
        else
            $results[num][fileName] = score
        end
    end
end

# One ranked answer: the id of the hit and its score.
class Hit
    attr_accessor :fileName, :score

    def initialize fileName, score
        @fileName, @score = fileName, score
    end
end

# Returns the hits recorded for topic num, sorted by descending score and
# limited to the first 1000 entries.
def process_results num
    ret = []
    $results[num].each { |k, v| ret << Hit.new(k,v) }
    puts "Processed results for #{num}"
    $stdout.flush
    ret.sort! {|h1, h2| h2.score <=> h1.score }
    ret[0...1000]
end


QUERY_LOCATION = "./queries.xml"        # topics file read by this script
TEMA_URL = "http://localhost:8901/"     # server the queries are posted to
PARAM_FROM = 0                          # value of the "from" form field
PARAM_SIZE = 100                        # value of the "size" form field

queries_xml = File.new(QUERY_LOCATION)
queries_doc = Document.new(queries_xml)

topics = XPath.match(queries_doc, "/topics/*")
topics.each do |topic|
    num = XPath.match(topic, "./num[1]").first.text
    # Prefer the m:apply children of the formulas; fall back to m:cerror
    # when a topic has none.
    formulas = XPath.match(topic, "./query/formula/m:math/m:semantics/m:apply").map(&:to_s)
    if formulas.empty?
        formulas = XPath.match(topic, "./query/formula/m:math/m:semantics/m:cerror").map(&:to_s)
    end

    tema_url = URI.parse TEMA_URL

    # One request per formula; the text part of the query is left empty.
    formulas.each do |fmla|
        resp = Net::HTTP.post_form(tema_url, {
            'text' => "",
            'math' => "#{fmla}",
            'from' => "#{PARAM_FROM}",
            'size' => "#{PARAM_SIZE}"
        });

        register_result num, resp.body, (fmla != "")
        puts "Registered result #{num}"
        $stdout.flush
    end

end

# Output line layout: topic number, the constant 1, hit id, 1-based rank,
# score, run tag.
$results.keys.each do |num|
    hits = process_results num

    next if hits.nil?
    hits.each_with_index do |h, rank|
        $output.puts "#{num}\t1\t#{h.fileName}\t#{rank + 1}\t#{h.score}\tKWARC_#{RUN}"
    end
end
$output.close
9 | -------------------------------------------------------------------------------- /scripts/other/ntcir-11/queries.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MathWebSearch/mws/35306ab7fd8a01b39426881cbe8088197e023f36/scripts/other/ntcir-11/queries.pdf -------------------------------------------------------------------------------- /scripts/send-mws-query.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh -e 2 | 3 | if [ "$#" -ne 4 ]; then 4 | cat << EOF 5 | Usage: $0 6 | Example: 7 | > $0 9090 0 5 "" 8 | 9 | This script sends a query to a MWS rest interface running locally. 10 | If no query is provided, the default is which should 11 | return all indexed expressions (up to a limit). 12 | 13 | EOF 14 | exit 1 15 | fi 16 | 17 | PORT="$1" 18 | HOST="http://localhost:$PORT" 19 | LIMITMIN="$2" 20 | LIMITSIZE="$3" 21 | QUERY="$4" 22 | 23 | curl -s -S "http://localhost:$PORT" -d " 24 | 31 | 32 | $QUERY 33 | 34 | 35 | " 36 | -------------------------------------------------------------------------------- /scripts/sysv/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2010-2013 KWARC Group 3 | # 4 | # This file is part of MathWebSearch. 5 | # 6 | # MathWebSearch is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # MathWebSearch is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with MathWebSearch. 
If not, see . 18 | # 19 | CONFIGURE_FILE(mws-config.in mws-config @ONLY) 20 | INSTALL(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/mws-config 21 | DESTINATION bin) 22 | 23 | CONFIGURE_FILE(mwsd-service.sh.in mwsd-service.sh @ONLY) 24 | INSTALL(FILES ${CMAKE_CURRENT_BINARY_DIR}/mwsd-service.sh 25 | DESTINATION ${CMAKE_INSTALL_PREFIX}/${SYSCONFDIR}) 26 | 27 | INSTALL(DIRECTORY 28 | DESTINATION ${CMAKE_INSTALL_PREFIX}/${SYSCONFDIR}/configs) 29 | -------------------------------------------------------------------------------- /src/common/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2010-2013 KWARC Group 3 | # 4 | # This file is part of MathWebSearch. 5 | # 6 | # MathWebSearch is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # MathWebSearch is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with MathWebSearch. If not, see . 18 | # 19 | # Common Modules 20 | ADD_SUBDIRECTORY( socket ) # commonsocket 21 | ADD_SUBDIRECTORY( thread ) # commonthread 22 | ADD_SUBDIRECTORY( types ) # commontypes 23 | ADD_SUBDIRECTORY( utils ) # communutils 24 | 25 | -------------------------------------------------------------------------------- /src/common/socket/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2010-2013 KWARC Group 3 | # 4 | # This file is part of MathWebSearch. 
5 | # 6 | # MathWebSearch is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # MathWebSearch is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with MathWebSearch. If not, see . 18 | # 19 | # 20 | # src/common/socket/CMakeLists.txt -- 21 | # 22 | # 18 Jun 2011 23 | # c.prodescu@jacobs-university.de 24 | # 25 | 26 | # Module name 27 | SET(MODULE "commonsocket") 28 | 29 | # Dependencies 30 | 31 | # Includes 32 | 33 | # Flags 34 | 35 | # Sources 36 | FILE( GLOB SOURCES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cpp" "*.hpp") 37 | 38 | # Binaries 39 | ADD_LIBRARY( ${MODULE} ${SOURCES}) 40 | TARGET_LINK_LIBRARIES(${MODULE} 41 | commonutils) 42 | -------------------------------------------------------------------------------- /src/common/thread/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2010-2013 KWARC Group 3 | # 4 | # This file is part of MathWebSearch. 5 | # 6 | # MathWebSearch is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # MathWebSearch is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 
15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with MathWebSearch. If not, see . 18 | # 19 | # 20 | # src/common/thread/CMakeLists.txt -- 21 | # 22 | # 18 Jun 2011 23 | # c.prodescu@jacobs-university.de 24 | # 25 | 26 | # Module name 27 | SET(MODULE "commonthread") 28 | 29 | # Dependencies 30 | FIND_PACKAGE( Threads REQUIRED ) 31 | 32 | # Includes 33 | 34 | # Flags 35 | 36 | # Sources 37 | FILE( GLOB SOURCES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cpp" "*.hpp") 38 | 39 | # Binaries 40 | ADD_LIBRARY( ${MODULE} ${SOURCES} ) 41 | TARGET_LINK_LIBRARIES(${MODULE} 42 | ${CMAKE_THREAD_LIBS_INIT} 43 | ) 44 | -------------------------------------------------------------------------------- /src/common/types/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2010-2013 KWARC Group 3 | # 4 | # This file is part of MathWebSearch. 5 | # 6 | # MathWebSearch is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # MathWebSearch is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with MathWebSearch. If not, see . 
18 | # 19 | # 20 | # src/common/types/CMakeLists.txt -- 21 | # 22 | # 31 Jul 2011 23 | # c.prodescu@jacobs-university.de 24 | # 25 | 26 | # Module name 27 | SET(MODULE "commontypes") 28 | 29 | # Dependencies 30 | 31 | # Includes 32 | 33 | # Flags 34 | 35 | # Sources 36 | FILE( GLOB SOURCES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cpp" "*.hpp") 37 | 38 | # Binaries 39 | ADD_LIBRARY( ${MODULE} ${SOURCES}) 40 | TARGET_LINK_LIBRARIES(${MODULE} 41 | commonutils) 42 | -------------------------------------------------------------------------------- /src/common/types/DataFormat.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (C) 2010-2013 KWARC Group 4 | 5 | This file is part of MathWebSearch. 6 | 7 | MathWebSearch is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | MathWebSearch is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with MathWebSearch. If not, see . 19 | 20 | */ 21 | /** 22 | * @brief File containing the implementation of the DataFormat class. 
23 | * 24 | * @file DataFormat.cpp 25 | * @author Corneliu-Claudiu Prodescu 26 | * @date 30 Jul 2011 27 | * 28 | * License: GPL v3 29 | * 30 | */ 31 | 32 | // System includes 33 | 34 | #include 35 | 36 | // Local includes 37 | 38 | #include "common/types/DataFormat.hpp" 39 | 40 | std::ostream& operator<<(std::ostream& out, DataFormat dataFormat) { 41 | switch (dataFormat) { 42 | case DATAFORMAT_XML: 43 | out << "text/xml"; 44 | break; 45 | case DATAFORMAT_JSON: 46 | out << "application/json"; 47 | break; 48 | default: 49 | out << DEFAULT_MIME_TYPE; 50 | break; 51 | } 52 | 53 | return out; 54 | } 55 | -------------------------------------------------------------------------------- /src/common/types/DataFormat.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (C) 2010-2013 KWARC Group 4 | 5 | This file is part of MathWebSearch. 6 | 7 | MathWebSearch is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | MathWebSearch is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with MathWebSearch. If not, see . 19 | 20 | */ 21 | #ifndef _DATAFORMAT_HPP 22 | #define _DATAFORMAT_HPP 23 | 24 | /** 25 | * @brief File containing the header of the DataFormat class. 
// Constants

/// MIME type that operator<< writes for every DataFormat value that has no
/// dedicated MIME type of its own.
const std::string DEFAULT_MIME_TYPE = "text/xml";

/// Formats that can be written to a stream as a MIME type via operator<<.
enum DataFormat {
    DATAFORMAT_DEFAULT,  ///< written as DEFAULT_MIME_TYPE
    DATAFORMAT_UNKNOWN,  ///< written as DEFAULT_MIME_TYPE
    DATAFORMAT_XML,      ///< written as "text/xml"
    DATAFORMAT_JSON,     ///< written as "application/json"
};

/**
 * @brief Writes the MIME type that corresponds to a DataFormat.
 * @param out is the stream to write to.
 * @param dataFormat is the format whose MIME type is written.
 * @return the stream passed in.
 */
std::ostream& operator<<(std::ostream& out, DataFormat dataFormat);
18 | # 19 | # 20 | # src/common/utils/CMakeLists.txt -- 21 | # 22 | # 18 Jun 2011 23 | # c.prodescu@jacobs-university.de 24 | # 25 | 26 | # Module name 27 | SET(MODULE "commonutils") 28 | 29 | # Dependencies 30 | 31 | # Includes 32 | 33 | # Flags 34 | 35 | # Sources 36 | FILE( GLOB SOURCES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cpp" "*.c" "*.hpp" "*.h") 37 | 38 | # Binaries 39 | ADD_LIBRARY( ${MODULE} ${SOURCES}) 40 | -------------------------------------------------------------------------------- /src/common/utils/ContainerIterator.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (C) 2010-2014 KWARC Group 4 | 5 | This file is part of MathWebSearch. 6 | 7 | MathWebSearch is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | MathWebSearch is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with MathWebSearch. If not, see . 
namespace common {
namespace utils {

/**
 * @brief Cursor over the half-open iterator range [begin, end).
 *
 * Bundles a current position with the end of the range, so that code
 * receiving the cursor can test for exhaustion without knowing the
 * container the iterators came from.
 *
 * @tparam Iterator is a copyable iterator type supporting `!=` and `++`.
 */
template <class Iterator>
class ContainerIterator {
    Iterator _current;
    Iterator _end;

 public:
    /**
     * @param begin is the first position of the range.
     * @param end is the past-the-end position of the range.
     */
    ContainerIterator(const Iterator& begin, const Iterator& end)
        : _current(begin), _end(end) {}

    /// @return true while the cursor points at an element of the range.
    bool isValid() const { return (_current != _end); }

    /// Moves the cursor one position forward. Requires isValid().
    void next() { ++_current; }

    /**
     * @return true if there is an element after the current one.
     *
     * An exhausted cursor has no next element; it is answered without
     * stepping past the end, which would be undefined behaviour.
     */
    bool hasNext() const {
        if (!(_current != _end)) {
            return false;
        }
        Iterator lookahead = _current;
        ++lookahead;
        return (lookahead != _end);
    }

    /// @return the iterator at the current position.
    Iterator& get() { return _current; }
    /// @return the iterator at the current position.
    const Iterator& get() const { return _current; }
};

}  // namespace utils
}  // namespace common
// Macros
#define TIMESTAMP_MAXBUFSZ 20                    // 19 characters + terminating NUL
#define TIMESTAMP_ERR "yyyy-mm-dd hh:mm:ss"      // returned when the time is unavailable

/**
 * @brief Method to get the current time.
 * @return a string with the current GMT time as "yyyy-mm-dd hh:mm:ss", or
 * the literal placeholder TIMESTAMP_ERR if the current time cannot be
 * obtained or converted.
 */
inline std::string
TimeStamp()
{
    char        buffer[TIMESTAMP_MAXBUFSZ];
    time_t      currentRawTime;
    struct tm   gmtTime;

    // Return the placeholder unchanged on failure: formatting it with "%20s"
    // into a 20 byte buffer padded it to 20 characters and cut off the last one.
    if (time(&currentRawTime) == static_cast<time_t>(-1) ||
        gmtime_r(&currentRawTime, &gmtTime) == NULL)
    {
        return std::string(TIMESTAMP_ERR);
    }

    // snprintf never writes more than sizeof(buffer) bytes and always
    // terminates the result.
    snprintf(buffer,
             sizeof(buffer),
             "%04d-%02d-%02d %02d:%02d:%02d",
             gmtTime.tm_year + 1900,    // tm_year counts from 1900
             gmtTime.tm_mon  + 1,       // tm_mon is 0-based
             gmtTime.tm_mday,
             gmtTime.tm_hour,
             gmtTime.tm_min,
             gmtTime.tm_sec);

    return std::string(buffer);
}
11 | 12 | MathWebSearch is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with MathWebSearch. If not, see . 19 | 20 | */ 21 | /** 22 | * @brief File containing the implementation of getBoolType utility function 23 | * 24 | * @file getBoolType.cpp 25 | * @author Corneliu-Claudiu Prodescu 26 | * @date 22 Jun 2011 27 | * 28 | * License: GPL v3 29 | */ 30 | 31 | #include 32 | using std::string; 33 | #include 34 | using std::make_pair; 35 | using std::pair; 36 | 37 | #include "common/utils/getBoolType.hpp" 38 | 39 | namespace common { 40 | namespace utils { 41 | 42 | BoolType getBoolType(string str) { 43 | BoolType result; 44 | int i; 45 | const pair values[] = { 46 | make_pair("yes", BOOL_YES), make_pair("1", BOOL_YES), 47 | make_pair("no", BOOL_NO), make_pair("0", BOOL_NO), 48 | make_pair("", BOOL_DEFAULT), // Flag to end array 49 | }; 50 | 51 | result = BOOL_DEFAULT; 52 | 53 | // Lower-casing the buffer 54 | for (i = 0; str[i] != '\0'; i++) { 55 | str[i] = tolower(str[i]); 56 | } 57 | 58 | // Iterating through values 59 | for (i = 0; values[i].second != BOOL_DEFAULT; i++) { 60 | if (str == values[i].first) { 61 | result = values[i].second; 62 | break; 63 | } 64 | } 65 | 66 | return result; 67 | } 68 | 69 | } // namespace utils 70 | } // namespace common 71 | -------------------------------------------------------------------------------- /src/common/utils/getBoolType.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (C) 2010-2013 KWARC Group 4 | 5 | This file is part of MathWebSearch. 
namespace common {
namespace utils {

/**
 * Tri-state outcome of parsing a textual boolean.
 */
enum BoolType {
    BOOL_DEFAULT,  // input matched none of the recognised spellings
    BOOL_YES,      // input was an affirmative spelling ("yes", "1")
    BOOL_NO,       // input was a negative spelling ("no", "0")
};

/**
 * @brief Method to parse a string and get the boolean value.
 * @param str is a string containing a bool value ("1", "0", "yes", "no");
 *            matching is case-insensitive
 * @return a BoolType value corresponding to the input, BOOL_DEFAULT when
 *         the input is not recognised.
 */
BoolType getBoolType(std::string);

}  // namespace utils
}  // namespace common
6 | 7 | MathWebSearch is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | MathWebSearch is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with MathWebSearch. If not, see . 19 | 20 | */ 21 | /** 22 | * @brief File containing the getSockAddrLog utility function 23 | * 24 | * @file getSockAddrLog.cpp 25 | * @author Corneliu-Claudiu Prodescu 26 | * @date 25 Jun 2011 27 | * 28 | * License: GPL v3 29 | */ 30 | 31 | // System includes 32 | 33 | #include // C standard IO headers 34 | #include // C network db operations 35 | #include // C++ string header 36 | #include // C main socket headers 37 | 38 | #include "common/utils/compiler_defs.h" 39 | 40 | // Local includes 41 | 42 | #include "common/utils/TimeStamp.hpp" // MWS TimeStamp utility function 43 | 44 | // Macros 45 | 46 | #define RESULT_MAX_SZ 81 47 | #define HOSTNAME_MAX_SZ 40 48 | #define SERVICE_MAX_SZ 40 49 | 50 | // Namespaces 51 | 52 | using namespace std; 53 | 54 | string getSockAddrLog(const sockaddr* sockAddr, const socklen_t sockLen) { 55 | char hostname[HOSTNAME_MAX_SZ]; 56 | char service[SERVICE_MAX_SZ]; 57 | int ret; 58 | char result[RESULT_MAX_SZ]; 59 | 60 | // Getting hostname and service 61 | ret = getnameinfo(sockAddr, sockLen, hostname, HOSTNAME_MAX_SZ, service, 62 | SERVICE_MAX_SZ, 0); 63 | if (ret != 0) { 64 | PRINT_WARN("Error at getnameinfo: %s\n", gai_strerror(ret)); 65 | } else { 66 | snprintf(result, RESULT_MAX_SZ, 67 | "%19s " 68 | "%35s" 69 | "%25s\n", 70 | TimeStamp().c_str(), hostname, service); 71 | } 72 | 73 | return 
(string)result; 74 | } 75 | -------------------------------------------------------------------------------- /src/common/utils/getSockAddrLog.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (C) 2010-2013 KWARC Group 4 | 5 | This file is part of MathWebSearch. 6 | 7 | MathWebSearch is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | MathWebSearch is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with MathWebSearch. If not, see . 19 | 20 | */ 21 | #ifndef _GETSOCKADDRLOG_HPP 22 | #define _GETSOCKADDRLOG_HPP 23 | 24 | /** 25 | * @brief File containing the getSockAddrLog utility function 26 | * 27 | * @file getSockAddrLog.hpp 28 | * @author Corneliu-Claudiu Prodescu 29 | * @date 25 Jun 2011 30 | * 31 | * License: GPL v3 32 | */ 33 | 34 | // System includes 35 | 36 | #include // C++ string header 37 | #include // C main socket headers 38 | 39 | // Macros 40 | #define TIMESTAMP_MAXBUFSZ 20 41 | #define TIMESTAMP_ERR "yyyy-mm-dd hh:mm:ss" 42 | 43 | /** 44 | * @brief Method to a log about a socket address. 45 | * @param sockAddr is the address of the socket. 46 | * @param sockLen is the length of the address. 47 | * @return a string with the respective log as "yyyy-mm-dd hostname service". 
BEGIN_DECLS

/*
 * Declarations that are only visible when __APPLE__ is defined.
 * NOTE(review): presumably these mirror the POSIX functions of the same
 * names and are implemented by open_memstream.c and fmemopen.c in this
 * directory -- confirm against those sources.
 */
FILE * open_memstream(char **cp, size_t *lenp);
FILE * fmemopen(void *buf, size_t size, const char *mode);

END_DECLS
/*--------------------------------------------------------------------------*/
/* Type declarations                                                        */
/*--------------------------------------------------------------------------*/

/*
 * Handle describing one mapped file; filled in by mmap_create / mmap_load
 * and consumed by mmap_unload / mmap_remove.
 * NOTE(review): size is 32-bit while mmap_create takes an off_t size --
 * confirm how sizes above UINT32_MAX are meant to be handled.
 */
typedef struct mmap_handle_s {
    const char* path;   /* presumably the path of the mapped file -- confirm */
    char* start_addr;   /* presumably the first byte of the mapping -- confirm */
    uint32_t size;      /* presumably the mapping length in bytes -- confirm */
} mmap_handle_t;

/*--------------------------------------------------------------------------*/
/* Methods                                                                  */
/*--------------------------------------------------------------------------*/

BEGIN_DECLS

/**
 * Create and mmap a read/writeable file
 *
 * @param path  file to create
 * @param size  requested size of the file
 * @param flags NOTE(review): presumably extra flags for the mmap call --
 *              confirm in mmap.c
 * @param mmap  handle that receives the result
 *
 * @return 0 on success
 * @return -1 on failure
 */
int mmap_create(const char* path, off_t size, int flags, mmap_handle_t* mmap);

/**
 * Load a read-only mmap of a file
 *
 * @param path  file to map
 * @param flags NOTE(review): presumably extra flags for the mmap call --
 *              confirm in mmap.c
 * @param mmap  handle that receives the result
 *
 * @return 0 on success
 * @return -1 on failure
 */
int mmap_load(const char* path, int flags, mmap_handle_t* mmap);

/**
 * NOTE(review): presumably releases the mapping described by the handle
 * and leaves the file in place -- confirm in mmap.c
 *
 * @return 0 on success
 * @return -1 on failure
 */
int mmap_unload(mmap_handle_t* mmap_handle);

/**
 * NOTE(review): presumably releases the mapping and also deletes the
 * backing file -- confirm in mmap.c
 *
 * @return 0 on success
 * @return -1 on failure
 */
int mmap_remove(mmap_handle_t* mmap_handle);

END_DECLS
/**
 * Write the PID of the calling process, in decimal, to the file at path.
 * The file is opened in "w" mode, so existing content is replaced.
 *
 * @param path location of the PID file
 * @return the result of fclose() when the PID was written,
 *         -1 when the file cannot be opened or written
 */
int
save_pid_file(const char *path) {
    FILE* fp = fopen(path, "w");
    FAIL_ON(fp == NULL);

    /* The space before PRIu64 keeps the literal valid when this file is
     * built as C++11 or later, where "%"PRIu64 reads as a literal suffix. */
    if (fprintf(fp, "%" PRIu64, (uint64_t) getpid()) < 0) {
        /* Do not leak the stream when the write itself fails. */
        (void) fclose(fp);
        goto fail;
    }

    /* Buffered data is flushed here, so late write errors show up in the
     * return value. */
    return fclose(fp);

fail:
    return -1;
}
BEGIN_DECLS

/**
 * @brief Write the PID of the calling process to a file
 *
 * The file is opened for writing (replacing previous content) and receives
 * the PID as a decimal number.
 *
 * @param path location of the PID file
 *
 * @return the result of closing the file (0 on success)
 * @return -1 when the file cannot be opened
 */
int
save_pid_file(const char *path);

END_DECLS
#
# src/crawler/crawler/CMakeLists.txt --
#
# 15 Aug 2011
# d.hasegan@jacobs-university.de
#

# Module name
SET(MODULE "crawlercrawler")

# Sources
# (globbed when CMake runs: re-run CMake after adding or removing files)
FILE( GLOB SOURCES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cpp" "*.hpp")
ADD_LIBRARY( ${MODULE} ${SOURCES})

# Dependencies

# Includes
# (PUBLIC: targets linking this module get the include path as well)
TARGET_INCLUDE_DIRECTORIES( ${MODULE} PUBLIC "${GOOGLEURL_INCLUDES}" )

# Flags

# Binaries
TARGET_LINK_LIBRARIES(${MODULE}
        crawlertypes
        crawlerutils
        commonutils)
#
# src/crawler/daemon/CMakeLists.txt --
#
# 21 Aug 2012
# d.hasegan@jacobs-university.de
#

# Module name
SET(MODULE "crawlerdaemon")

# Sources
# (globbed when CMake runs: re-run CMake after adding or removing files)
FILE( GLOB SOURCES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cpp" "*.hpp")
ADD_LIBRARY( ${MODULE} ${SOURCES})

# Dependencies
FIND_PACKAGE(MicroHttpd REQUIRED)

# Includes
TARGET_INCLUDE_DIRECTORIES(${MODULE} PUBLIC ${MICROHTTPD_INCLUDE_DIRS})
# Copy the HTML form from the source directory into the matching build
# directory (CMake variable references inside the file are substituted)
CONFIGURE_FILE(GetResponse.html GetResponse.html)

# Flags
# (ADD_DEFINITIONS applies to the whole directory, not only to ${MODULE})
ADD_DEFINITIONS(${MICROHTTPD_DEFINITIONS})


# Binaries
TARGET_LINK_LIBRARIES(${MODULE}
        commonthread
        commonutils
        crawlertypes
        ${MICROHTTPD_LIBRARIES})

Use the crawler:

5 |
6 |
7 | Start URL (REQUIRED):
8 |
9 | Count (OPTIONAL):
10 |
11 | Don't crawl (OPTIONAL):
12 | 13 | 14 | 15 |
1:
16 | more 17 |
25 | 26 |
27 |
#
# src/crawler/parser/CMakeLists.txt --
#
# 26 Nov 2013
# c.prodescu@jacobs-university.de
#

# Module name
SET(MODULE "crawlerparser")

# Sources
# (globbed when CMake runs: re-run CMake after adding or removing files)
FILE( GLOB SOURCES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cpp" "*.c" "*.hpp" "*.h")
ADD_LIBRARY( ${MODULE} ${SOURCES})

# Dependencies
FIND_PACKAGE (LibXml2 REQUIRED)

# Includes
# (PUBLIC: targets linking this module get the libxml2 include path as well)
TARGET_INCLUDE_DIRECTORIES( ${MODULE} PUBLIC "${LIBXML2_INCLUDE_DIR}" )


# Binaries
TARGET_LINK_LIBRARIES(${MODULE}
    commontypes
    commonutils
    ${LIBXML2_LIBRARIES}
)
/**
 * Result of harvesting one document.
 * NOTE(review): presumably dataElement is the serialized data entry and
 * contentMathElements the extracted content-MathML expressions -- confirm
 * in MathParser.cpp.
 */
struct Harvest {
    std::string dataElement;
    std::vector contentMathElements;
};

/**
 * Settings that drive createHarvestFromDocument().
 */
struct HarvesterConfiguration {
    // One named metadata value and the XPath expression used to locate it
    struct MetadataItem {
        std::string name;
        std::string xpath;
    };

    HarvesterConfiguration();
    // Textual rendering of the configuration
    std::string toString() const;

    bool shouldSaveData;
    std::string data_id;
    // NOTE(review): presumably XPath selecting the document identifier --
    // confirm
    std::string documentIdXpath;
    // NOTE(review): presumably XPath selecting the text-with-math content --
    // confirm
    std::string textWithMathXpath;
    std::vector metadataItems;
};

/**
 * @param path path to the HTML or XHTML file
 * @param config harvester settings (see HarvesterConfiguration)
 *
 * @return harvest data and expressions of the document
 *
 * @throw runtime_error when HTML/XHTML parsing fails
 */
Harvest createHarvestFromDocument(const std::string& path,
                                  const HarvesterConfiguration& config);
#
# src/crawler/types/CMakeLists.txt --
#
# 15 Aug 2011
# d.hasegan@jacobs-university.de
#

# Module name
SET(MODULE "crawlertypes")

# Sources
# (only *.cpp is globbed here; re-run CMake after adding or removing files)
FILE( GLOB SOURCES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cpp")
ADD_LIBRARY( ${MODULE} ${SOURCES})

# Dependencies

# Includes
# (PUBLIC: targets linking this module get the include path as well)
TARGET_INCLUDE_DIRECTORIES( ${MODULE} PUBLIC "${GOOGLEURL_INCLUDES}" )

# Flags

# Binaries
TARGET_LINK_LIBRARIES(${MODULE}
        ${GOOGLEURL_LIBRARIES}
        crawlerutils
)
6 | 7 | MathWebSearch is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | MathWebSearch is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with MathWebSearch. If not, see . 19 | 20 | */ 21 | /** 22 | * Class to download and check the Robots.txt of a website 23 | * 24 | * author: Daniel Hasegan 25 | * date: 15 Aug 2012 26 | */ 27 | #ifndef _ROBOTSTXT_HPP 28 | #define _ROBOTSTXT_HPP 29 | 30 | // System includes 31 | #include 32 | 33 | // Third party includes 34 | #include "gurl.h" 35 | 36 | /** 37 | * Robotstxt class keeps track of allowed and dissalowed links in a page 38 | * TODO: Also keeps track of the time between downloads 39 | */ 40 | class Robotstxt 41 | { 42 | 43 | private: 44 | GURL startUrl; // Url of the Robots.txt page 45 | std::vector < GURL > not_allowed; // The list of links that are not allowed 46 | 47 | public: 48 | /** 49 | * Robotstxt Constructor with the Starting URL 50 | * Searches for the Crawler with any name 51 | * @param startUrl the Starting Url of the Page Crawler 52 | */ 53 | Robotstxt(GURL startUrl); 54 | 55 | /** 56 | * Destructor of Robotstxt 57 | */ 58 | ~Robotstxt(); 59 | 60 | /** 61 | * Adds a url to avoid later in Crawling 62 | * @param url the url we want to avoid in our Crawling 63 | */ 64 | void dont_allow( std::string url ); 65 | 66 | /** 67 | * Query if the Page is allowed by the robots.txt file 68 | * @param url The url to be checked in GURL format 69 | * @return 1 if it is allowed, 0 if the link should not be downloaded 70 | */ 71 | int is_allowed_by_robots_txt( 
/**
 * One crawl request waiting in the shared queue.
 */
struct QueueNode
{
    std::string urlstart;                // URL the crawl starts from
    int count;                           // NOTE(review): presumably the page limit -- confirm
    int dontcrawlnr;                     // NOTE(review): presumably the size of dontcrawl -- confirm
    std::vector<std::string> dontcrawl;  // URLs excluded from the crawl
};

/**
 * Queue of crawl requests together with the mutex meant to guard it.
 *
 * The struct does no locking itself: users lock `lock` around every access
 * to `sharedQueue`.
 *
 * NOTE(review): the mutex is never destroyed and the struct is copyable;
 * confirm instances are long-lived and never copied.
 */
struct SharedQueue
{
    std::queue<QueueNode> sharedQueue;
    pthread_mutex_t lock;
    SharedQueue()
    {
        // A NULL attribute pointer selects the default mutex attributes,
        // the same as a freshly initialised pthread_mutexattr_t, without
        // leaving an attribute object behind that is never destroyed.
        pthread_mutex_init(&lock, NULL);
    }
};
#
# src/crawler/utils/CMakeLists.txt --
#
# 15 Aug 2011
# d.hasegan@jacobs-university.de
#

# Module name
SET(MODULE "crawlerutils")

# Sources
# (globbed when CMake runs: re-run CMake after adding or removing files)
FILE( GLOB SOURCES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cpp" "*.c" "*.hpp" "*.h")
ADD_LIBRARY( ${MODULE} ${SOURCES})

# Dependencies
FIND_PACKAGE (LibXml2 REQUIRED)
FIND_PACKAGE (CURL REQUIRED)
FIND_PACKAGE (HtmlCxx REQUIRED)

# Includes
# (libxml2 headers are PRIVATE: they are not propagated to dependents)
TARGET_INCLUDE_DIRECTORIES( "${MODULE}" PUBLIC "${GOOGLEURL_INCLUDES}" )
TARGET_INCLUDE_DIRECTORIES( "${MODULE}" PRIVATE "${LIBXML2_INCLUDE_DIR}" )
TARGET_INCLUDE_DIRECTORIES( "${MODULE}" PUBLIC "${CURL_INCLUDE_DIRS}" )
TARGET_INCLUDE_DIRECTORIES( "${MODULE}" PUBLIC "${HTMLCXX_INCLUDE_DIRS}" )

# Copy the MathML tag list into the matching build directory
configure_file ( MwsMathMLTags.txt MwsMathMLTags.txt )
# Flags


# Binaries
TARGET_LINK_LIBRARIES(${MODULE}
        ${LIBXML2_LIBRARIES}
        ${CURL_LIBRARIES}
        ${HTMLCXX_LIBRARIES}
        ${GOOGLEURL_LIBRARIES}
        commonutils
)
// Global variable
// NOTE(review): presumably the set of known MathML tag names, filled by
// store_mathml_tags() from MwsMathMLTags.txt -- confirm in MwsGetMath.cpp
extern std::set < std::string > mathml_tags;

namespace mws {
// HTML
// NOTE(review): the int results below look like boolean flags (non-zero
// meaning "yes") -- confirm in MwsGetMath.cpp
int isHTML(Page& page);
// MathML formulas found in an HTML page, one string per formula
std::vector< std::string > get_math_html(Page& page);
void store_mathml_tags();

// XHTML
int isXHTML(Page& page);
// MathML formulas found in XHTML content, one string per formula
std::vector< std::string > get_math_xhtml(const std::string& content, const std::string& url);

// GENERAL
// NOTE(review): presumably dispatches to the HTML or XHTML extractor
// depending on the page type -- confirm
std::vector< std::string > get_math(Page& page);
int is_good_xml(std::string xml);

} // namespace mws
# MWS Modules
# (the trailing comment on each line is the library target built there)
ADD_SUBDIRECTORY(analytics)     # mwsanalytics
ADD_SUBDIRECTORY(daemon)        # mwsdaemon
ADD_SUBDIRECTORY(dbc)           # mwsdbc
ADD_SUBDIRECTORY(index)         # mwsindex
ADD_SUBDIRECTORY(query)         # mwsquery
ADD_SUBDIRECTORY(types)         # mwstypes
ADD_SUBDIRECTORY(xmlparser)     # mwsxmlparser

# Main MWS executable
# (the only target in this file with an install rule)
ADD_EXECUTABLE( mwsd mwsd.cpp )
TARGET_LINK_LIBRARIES( mwsd
        mwsdaemon
        commonutils
)
INSTALL(TARGETS mwsd DESTINATION bin)

# MWS index builder
ADD_EXECUTABLE(mws-index mws-index.cpp)
TARGET_LINK_LIBRARIES( mws-index
        mwsindex
        commonutils
)

# SchemaSearch executable
ADD_EXECUTABLE(schemad schemad.cpp)
TARGET_LINK_LIBRARIES( schemad
        mwsdaemon
        commonutils
)

# Output executables at the root of build tree
SET_PROPERTY( TARGET mwsd mws-index schemad
        PROPERTY RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}
)
# Module name
SET(MODULE "mwsanalytics")

# Dependencies

# Includes

# Flags

# Sources
# (globbed when CMake runs: re-run CMake after adding or removing files)
FILE(GLOB SOURCES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cpp" "*.hpp")

# Binaries
ADD_LIBRARY(${MODULE} ${SOURCES})
TARGET_LINK_LIBRARIES(${MODULE}
        mwsindex
        mwstypes
        commonutils
)
19 | 20 | */ 21 | #ifndef _MWS_ANALYTICS_ANALYTICS_HPP 22 | #define _MWS_ANALYTICS_ANALYTICS_HPP 23 | 24 | /** 25 | * @brief Analytics interface 26 | * @file analytics.hpp 27 | * @author Corneliu Prodescu 28 | * @date 20 June 2014 29 | */ 30 | 31 | #include "mws/types/CmmlToken.hpp" 32 | #include "mws/index/index.h" 33 | 34 | namespace mws { 35 | namespace analytics { 36 | 37 | enum AnalyticsStatus { 38 | ANALYTICS_OK, 39 | ANALYTICS_STOP, 40 | }; 41 | 42 | /** 43 | * @brief Callback which runs before any expression is analyzed 44 | * @param index Compressed index handler 45 | * @param root Root node of the index 46 | * @return ANALYTICS_OK if the analytics job should continue 47 | * @return ANALYTICS_STOP if the analytics job should stop 48 | */ 49 | AnalyticsStatus analyze_begin(const index_handle_t* index, 50 | const inode_t* root); 51 | /** 52 | * @brief Callback for each expression in the index 53 | * @param cmmlToken ContentMathML expression 54 | * @param num_hits Number of formulae containing this expression 55 | * @return ANALYTICS_OK if the analytics job should continue 56 | * @return ANALYTICS_STOP if the analytics job should stop 57 | */ 58 | AnalyticsStatus analyze_expression(const types::CmmlToken* cmmlToken, 59 | uint32_t num_hits); 60 | /** 61 | * @brief Callback which runs after all expressions have been analyzed 62 | */ 63 | void analyze_end(); 64 | 65 | } // namespace analytics 66 | } // namespace mws 67 | 68 | #endif // _MWS_ANALYTICS_ANALYTICS_HPP 69 | -------------------------------------------------------------------------------- /src/mws/daemon/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2010-2013 KWARC Group 3 | # 4 | # This file is part of MathWebSearch. 
5 | # 6 | # MathWebSearch is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # MathWebSearch is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with MathWebSearch. If not, see <http://www.gnu.org/licenses/>. 18 | # 19 | # 20 | # src/mws/daemon/CMakeLists.txt -- 21 | # 22 | # 18 Jun 2011 23 | # c.prodescu@jacobs-university.de 24 | # 25 | 26 | # Module name 27 | SET(MODULE "mwsdaemon") 28 | 29 | # Sources (every .cpp and .hpp in this directory) 30 | FILE( GLOB SOURCES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cpp" "*.hpp") 31 | ADD_LIBRARY( ${MODULE} ${SOURCES} ) 32 | 33 | # Dependencies 34 | FIND_PACKAGE(MicroHttpd REQUIRED) 35 | 36 | # Includes (PUBLIC: also exposed to targets linking this module) 37 | TARGET_INCLUDE_DIRECTORIES( ${MODULE} PUBLIC ${MICROHTTPD_INCLUDE_DIRS}) 38 | 39 | # Flags (ADD_DEFINITIONS is directory-scoped, so it also applies to the target created above) 40 | ADD_DEFINITIONS(${MICROHTTPD_DEFINITIONS}) 41 | 42 | 43 | 44 | # Binaries 45 | TARGET_LINK_LIBRARIES(${MODULE} 46 | ${MICROHTTPD_LIBRARIES} 47 | mwsdbc 48 | mwsindex 49 | mwsquery 50 | mwstypes 51 | commonutils 52 | ) 53 | -------------------------------------------------------------------------------- /src/mws/daemon/Daemon.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (C) 2010-2013 KWARC Group 4 | 5 | This file is part of MathWebSearch. 6 | 7 | MathWebSearch is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version.
11 | 12 | MathWebSearch is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with MathWebSearch. If not, see . 19 | 20 | */ 21 | #ifndef _MWS_DAEMON_DAEMON_HPP 22 | #define _MWS_DAEMON_DAEMON_HPP 23 | 24 | /** 25 | * @brief File containing the header of the Daemon class. 26 | * @file Daemon.hpp 27 | * @author Radu Hambasan 28 | * @date 10 Mar 2014 29 | * 30 | * License: GPL v3 31 | */ 32 | 33 | #include 34 | 35 | #include "mws/daemon/microhttpd_linux.h" 36 | #include "mws/daemon/QueryHandler.hpp" 37 | 38 | namespace mws { 39 | namespace daemon { 40 | 41 | class Daemon { 42 | public: 43 | struct Config { 44 | uint16_t port; 45 | bool enableIpv6; 46 | 47 | Config(); 48 | }; 49 | 50 | /** 51 | * @brief Daemon 52 | * @param queryHandler query handler used for each request 53 | * 54 | * Daemon owns the query handler 55 | */ 56 | Daemon(QueryHandler* queryHandler, const Config& config = Config()); 57 | ~Daemon(); 58 | 59 | private: 60 | std::unique_ptr _queryHandler; 61 | struct MHD_Daemon* _mhd; 62 | 63 | DISALLOW_COPY_AND_ASSIGN(Daemon); 64 | }; 65 | 66 | } // namespace daemon 67 | } // namespace mws 68 | 69 | #endif 70 | -------------------------------------------------------------------------------- /src/mws/daemon/HarvestQueryHandler.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (C) 2010-2014 KWARC Group 4 | 5 | This file is part of MathWebSearch. 6 | 7 | MathWebSearch is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 
11 | 12 | MathWebSearch is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with MathWebSearch. If not, see . 19 | 20 | */ 21 | #ifndef _MWS_DAEMON_HARVESTQUERYHANDLER_HPP 22 | #define _MWS_DAEMON_HARVESTQUERYHANDLER_HPP 23 | 24 | /** 25 | * @brief HarvestQueryHandler interface 26 | * @file HarvestQueryHandler.hpp 27 | * @author Corneliu-Claudiu Prodescu 28 | * @date 15 Jun 2014 29 | */ 30 | 31 | #include "mws/daemon/QueryHandler.hpp" 32 | #include "mws/dbc/MemFormulaDb.hpp" 33 | #include "mws/dbc/MemCrawlDb.hpp" 34 | #include "mws/index/TmpIndex.hpp" 35 | #include "mws/index/MeaningDictionary.hpp" 36 | #include "mws/index/IndexBuilder.hpp" 37 | 38 | namespace mws { 39 | namespace daemon { 40 | 41 | class HarvestQueryHandler : public QueryHandler { 42 | public: 43 | explicit HarvestQueryHandler(const index::HarvesterConfiguration& config); 44 | ~HarvestQueryHandler(); 45 | 46 | GenericAnswer* handleQuery(types::Query* query); 47 | 48 | private: 49 | index::MeaningDictionary _meaningDictionary; 50 | dbc::MemCrawlDb _crawlDb; 51 | dbc::MemFormulaDb _formulaDb; 52 | index::TmpIndex _index; 53 | index::ExpressionEncoder::Config _encodingConfig; 54 | 55 | DISALLOW_COPY_AND_ASSIGN(HarvestQueryHandler); 56 | }; 57 | 58 | } // namespace daemon 59 | } // namespace mws 60 | 61 | #endif // _MWS_DAEMON_HARVESTQUERYHANDLER_HPP 62 | -------------------------------------------------------------------------------- /src/mws/daemon/IndexQueryHandler.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (C) 2010-2013 KWARC Group 4 | 5 | This file is part of MathWebSearch. 
6 | 7 | MathWebSearch is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | MathWebSearch is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with MathWebSearch. If not, see . 19 | 20 | */ 21 | #ifndef _MWS_DAEMON_INDEXQUERYHANDLER_HPP 22 | #define _MWS_DAEMON_INDEXQUERYHANDLER_HPP 23 | 24 | /** 25 | * @author Corneliu-Claudiu Prodescu 26 | * @date 18 Jun 2011 27 | * 28 | * @edited Radu Hambasan 29 | * @date 18 Feb 2014 30 | * 31 | * @edited Corneliu Prodescu 32 | * @date 21 May 2014 33 | * 34 | * License: GPL v3 35 | * 36 | */ 37 | 38 | #include 39 | #include 40 | 41 | #include "common/utils/compiler_defs.h" 42 | #include "mws/daemon/QueryHandler.hpp" 43 | #include "mws/index/ExpressionEncoder.hpp" 44 | #include "mws/index/IndexLoader.hpp" 45 | #include "mws/types/GenericAnswer.hpp" 46 | 47 | namespace mws { 48 | namespace daemon { 49 | 50 | class IndexQueryHandler : public QueryHandler { 51 | public: 52 | struct Config { 53 | index::ExpressionEncoder::Config encoding; 54 | bool useExperimentalQueryEngine; 55 | 56 | Config() : useExperimentalQueryEngine(false) {} 57 | }; 58 | 59 | IndexQueryHandler(const std::string& indexPath, 60 | const Config& config = Config()); 61 | ~IndexQueryHandler(); 62 | 63 | GenericAnswer* handleQuery(types::Query* query); 64 | 65 | private: 66 | index::IndexLoader _index; 67 | Config _config; 68 | 69 | DISALLOW_COPY_AND_ASSIGN(IndexQueryHandler); 70 | }; 71 | 72 | } // namespace daemon 73 | } // namespace mws 74 | 75 | #endif // _MWS_DAEMON_INDEXQUERYHANDLER_HPP 76 | 
-------------------------------------------------------------------------------- /src/mws/daemon/QueryHandler.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (C) 2010-2013 KWARC Group 4 | 5 | This file is part of MathWebSearch. 6 | 7 | MathWebSearch is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | MathWebSearch is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with MathWebSearch. If not, see . 19 | 20 | */ 21 | #ifndef _MWS_DAEMON_QUERYHANDLER_HPP 22 | #define _MWS_DAEMON_QUERYHANDLER_HPP 23 | 24 | #include "mws/types/Query.hpp" 25 | #include "mws/types/GenericAnswer.hpp" 26 | 27 | namespace mws { 28 | namespace daemon { 29 | 30 | class QueryHandler { 31 | public: 32 | virtual ~QueryHandler() {} 33 | virtual GenericAnswer* handleQuery(types::Query* query) = 0; 34 | }; 35 | 36 | } // namespace daemon 37 | } // namespace mws 38 | 39 | #endif // _MWS_DAEMON_QUERYHANDLER_HPP 40 | -------------------------------------------------------------------------------- /src/mws/daemon/SchemaQueryHandler.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (C) 2010-2014 KWARC Group 4 | 5 | This file is part of MathWebSearch. 6 | 7 | MathWebSearch is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 
11 | 12 | MathWebSearch is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with MathWebSearch. If not, see . 19 | 20 | */ 21 | #ifndef _MWS_DAEMON_SCHEMAQUERYHANDLER_HPP 22 | #define _MWS_DAEMON_SCHEMAQUERYHANDLER_HPP 23 | 24 | /** @brief SchemaQueryHandler declaration: a QueryHandler that stores an ExpressionEncoder::Config 25 | * @author Radu Hambasan 26 | * @date 30 Dec 2014 27 | * NOTE(review): in this copy the targets of the two system includes below and the element type of the std::vector parameter of getSubstitutions are missing; restore them from the original file */ 28 | 29 | #include 30 | #include 31 | 32 | #include "mws/types/CmmlToken.hpp" 33 | #include "mws/daemon/QueryHandler.hpp" 34 | #include "mws/index/ExpressionEncoder.hpp" 35 | using mws::index::ExpressionEncoder; 36 | #include "mws/query/SchemaEngine.hpp" 37 | #include "mws/types/GenericAnswer.hpp" 38 | 39 | namespace mws { 40 | namespace daemon { 41 | 42 | class SchemaQueryHandler : public QueryHandler { 43 | public: 44 | SchemaQueryHandler(const ExpressionEncoder::Config& encodingConfig = 45 | ExpressionEncoder::Config()); 46 | 47 | ~SchemaQueryHandler(); 48 | 49 | GenericAnswer* handleQuery(types::Query* query); 50 | 51 | private: 52 | void getSubstitutions(types::CmmlToken* exprRoot, 53 | types::CmmlToken* schemaRoot, 54 | std::vector* substitutions); 55 | 56 | index::ExpressionEncoder::Config _encodingConfig; 57 | DISALLOW_COPY_AND_ASSIGN(SchemaQueryHandler); 58 | }; 59 | 60 | } // namespace daemon 61 | } // namespace mws 62 | 63 | #endif // _MWS_DAEMON_SCHEMAQUERYHANDLER_HPP 64 | -------------------------------------------------------------------------------- /src/mws/daemon/microhttpd_linux.h: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (C) 2010-2013 KWARC Group 4 | 5 | This file is part of MathWebSearch.
6 | 7 | MathWebSearch is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | MathWebSearch is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with MathWebSearch. If not, see . 19 | 20 | */ 21 | /** 22 | * @brief complete MicroHTTPd headers for GNU/Linux 23 | * 24 | * @file microhttpd_linux.h 25 | * 26 | * @author Corneliu-Claudiu Prodescu 27 | * @date 2012-02-08 28 | * 29 | * License: GPL v3 30 | */ 31 | 32 | #ifndef _MICROHTTPD_LINUX_H 33 | #define _MICROHTTPD_LINUX_H 34 | 35 | // Headers needed to be included before MicroHTTPDd native header 36 | 37 | #include // C variable argument list 38 | #include // C standard integer types 39 | #include // C select API 40 | #include // C sockets API 41 | 42 | // MicroHTTPd native header 43 | 44 | #include // MicroHTTPd library headers 45 | 46 | #endif // ! _MICROHTTPD_LINUX_H 47 | -------------------------------------------------------------------------------- /src/mws/dbc/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2010-2013 KWARC Group 3 | # 4 | # This file is part of MathWebSearch. 5 | # 6 | # MathWebSearch is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 
10 | # 11 | # MathWebSearch is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with MathWebSearch. If not, see <http://www.gnu.org/licenses/>. 18 | # 19 | # 20 | # src/mws/dbc/CMakeLists.txt -- 21 | # 22 | # 18 Jun 2011 23 | # c.prodescu@jacobs-university.de 24 | # 25 | 26 | # Module name 27 | SET(MODULE "mwsdbc") 28 | 29 | # Sources (every .cpp and .hpp in this directory) 30 | FILE( GLOB SOURCES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cpp" "*.hpp") 31 | ADD_LIBRARY( ${MODULE} ${SOURCES}) 32 | 33 | # Dependencies 34 | FIND_PACKAGE(Threads REQUIRED) 35 | FIND_PACKAGE(LevelDb REQUIRED) 36 | FIND_PACKAGE(Snappy REQUIRED) 37 | 38 | # Includes (LevelDb headers are PUBLIC, i.e. also exposed to dependents; Snappy headers are PRIVATE) 39 | TARGET_INCLUDE_DIRECTORIES( ${MODULE} PUBLIC ${LEVELDB_INCLUDE_DIR}) 40 | TARGET_INCLUDE_DIRECTORIES( ${MODULE} PRIVATE ${SNAPPY_INCLUDE_DIR}) 41 | 42 | # Flags 43 | 44 | # Binaries 45 | TARGET_LINK_LIBRARIES(${MODULE} 46 | mwstypes 47 | commontypes 48 | commonutils 49 | ${LEVELDB_LIBRARIES} 50 | ${SNAPPY_LIBRARIES} 51 | ${CMAKE_THREAD_LIBS_INIT} 52 | ) 53 | -------------------------------------------------------------------------------- /src/mws/dbc/CrawlDb.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (C) 2010-2013 KWARC Group 4 | 5 | This file is part of MathWebSearch. 6 | 7 | MathWebSearch is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | MathWebSearch is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with MathWebSearch. If not, see . 19 | 20 | */ 21 | #ifndef _MWS_DBC_CRAWLDB_HPP 22 | #define _MWS_DBC_CRAWLDB_HPP 23 | 24 | /** 25 | * @file CrawlDb.hpp 26 | * @brief Crawl Database interface 27 | * @date 12 Nov 2013 28 | */ 29 | 30 | #include 31 | 32 | namespace mws { 33 | namespace dbc { 34 | 35 | typedef uint32_t CrawlId; 36 | const CrawlId CRAWLID_NULL = 0; 37 | 38 | typedef std::string CrawlData; 39 | const CrawlData CRAWLDATA_NULL = CrawlData(); 40 | 41 | class CrawlDb { 42 | public: 43 | virtual ~CrawlDb() {} 44 | 45 | /** 46 | * @brief insert crawled data 47 | * @param crawlId id of the crawl element 48 | * @param crawlData data associated with the crawl element 49 | * @throw exception 50 | */ 51 | virtual CrawlId putData(const CrawlData& crawlData) = 0; 52 | 53 | /** 54 | * @brief get crawled data 55 | * @param crawlId id of the crawl element 56 | * @return CrawlData corresponding to crawlId 57 | * @throw NotFound or I/O exceptions 58 | */ 59 | virtual const CrawlData getData(const CrawlId& crawlId) = 0; 60 | }; 61 | 62 | } // namespace dbc 63 | } // namespace mws 64 | 65 | #endif // _MWS_DBC_CRAWLDB_HPP 66 | -------------------------------------------------------------------------------- /src/mws/dbc/DbQueryManager.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (C) 2010-2013 KWARC Group 4 | 5 | This file is part of MathWebSearch. 6 | 7 | MathWebSearch is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 
11 | 12 | MathWebSearch is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with MathWebSearch. If not, see . 19 | 20 | */ 21 | /** 22 | * @file DbQueryManager.cpp 23 | * @brief DbQueryManager implementation. query() wraps the caller's DbAnswerCallback in a QueryCallback and passes it to FormulaDb::queryFormula; for every hit the wrapper hands the callback the formula path plus the crawl data fetched with CrawlDb::getData, or CRAWLDATA_NULL when the crawl id is CRAWLID_NULL or no CrawlDb was supplied. The return value of queryFormula is returned unchanged. NOTE(review): the targets of the six system includes below are missing in this copy. 24 | * @author cprodescu 25 | */ 26 | 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | 33 | #include 34 | 35 | #include "common/utils/compiler_defs.h" 36 | #include "mws/types/MwsAnswset.hpp" 37 | #include "mws/types/Answer.hpp" 38 | #include "mws/dbc/DbQueryManager.hpp" 39 | 40 | namespace mws { 41 | namespace dbc { 42 | 43 | DbQueryManager::DbQueryManager(CrawlDb* crawlDb, FormulaDb* formulaDb) 44 | : mCrawlDb(crawlDb), mFormulaDb(formulaDb) {} 45 | 46 | int DbQueryManager::query(types::FormulaId formulaId, unsigned limitMin, 47 | unsigned limitSize, 48 | DbAnswerCallback dbAnswerCallback) { 49 | QueryCallback formulaQueryCallback = [dbAnswerCallback, this]( 50 | const CrawlId& crawlId, const types::FormulaPath& formulaPath) { 51 | if (crawlId != CRAWLID_NULL && mCrawlDb) { 52 | return dbAnswerCallback(formulaPath, 53 | this->mCrawlDb->getData(crawlId)); 54 | } else { 55 | return dbAnswerCallback(formulaPath, CRAWLDATA_NULL); 56 | } 57 | }; 58 | return mFormulaDb->queryFormula(formulaId, limitMin, limitSize, 59 | formulaQueryCallback); 60 | } 61 | 62 | } // namespace dbc 63 | } // namespace mws 64 | -------------------------------------------------------------------------------- /src/mws/dbc/DbQueryManager.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (C) 2010-2013 KWARC Group 4 | 5 | This file is part of MathWebSearch.
6 | 7 | MathWebSearch is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | MathWebSearch is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with MathWebSearch. If not, see . 19 | 20 | */ 21 | /** 22 | * @file DbQueryManager.hpp 23 | * @brief DbQueryManager API 24 | * @date 2 Dec 2013 25 | * @author cprodescu 26 | */ 27 | #ifndef _MWS_DBC_DBQUERYMANAGER_HPP 28 | #define _MWS_DBC_DBQUERYMANAGER_HPP 29 | 30 | #include 31 | 32 | #include "common/utils/compiler_defs.h" 33 | #include "mws/dbc/CrawlDb.hpp" 34 | #include "mws/dbc/FormulaDb.hpp" 35 | 36 | namespace mws { 37 | namespace dbc { 38 | 39 | typedef std::function 40 | DbAnswerCallback; 41 | 42 | class DbQueryManager { 43 | CrawlDb* mCrawlDb; 44 | FormulaDb* mFormulaDb; 45 | 46 | public: 47 | DbQueryManager(CrawlDb* crawlDb, FormulaDb* formulaDb); 48 | 49 | int query(types::FormulaId formulaId, 50 | unsigned limitMin, 51 | unsigned limitSize, 52 | DbAnswerCallback dbAnswerCallback); 53 | 54 | private: 55 | DISALLOW_COPY_AND_ASSIGN(DbQueryManager); 56 | }; 57 | 58 | } // namespace dbc 59 | } // namespace mws 60 | 61 | #endif // _MWS_DBC_DBQUERYMANAGER_HPP 62 | -------------------------------------------------------------------------------- /src/mws/dbc/FormulaDb.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (C) 2010-2013 KWARC Group 4 | 5 | This file is part of MathWebSearch. 
6 | 7 | MathWebSearch is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | MathWebSearch is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with MathWebSearch. If not, see . 19 | 20 | */ 21 | #ifndef _MWS_DBC_FORMULADB_HPP 22 | #define _MWS_DBC_FORMULADB_HPP 23 | 24 | /** 25 | * @file FormulaDb.hpp 26 | * @brief Expression Database interface 27 | * @date 12 Nov 2013 28 | */ 29 | 30 | #include 31 | 32 | #include "mws/types/FormulaPath.hpp" 33 | #include "mws/dbc/CrawlDb.hpp" 34 | 35 | namespace mws { namespace dbc { 36 | 37 | typedef std::function QueryCallback; 39 | 40 | class FormulaDb { 41 | public: 42 | virtual ~FormulaDb() {} 43 | 44 | /** 45 | * @brief insert formula in database 46 | * @param formulaId id of a leaf node in the index 47 | * @param crawlId id corresponding to the crawled data 48 | * @param formulaPath path within the crawled data corresponding to this 49 | * formula 50 | * @return 0 on success and -1 on failure. 51 | */ 52 | virtual int insertFormula(const mws::types::FormulaId& formulaId, 53 | const CrawlId& crawlId, 54 | const mws::types::FormulaPath& formulaPath) = 0; 55 | 56 | /** 57 | * @brief query formula in database 58 | * @param formulaId id of a leaf node in the index 59 | * @param limitMin 60 | * @param limitSize 61 | * @param queryCallback 62 | * @return 0 on success and -1 on failure. 
63 | */ 64 | virtual int queryFormula(const mws::types::FormulaId& formulaId, 65 | unsigned limitMin, 66 | unsigned limitSize, 67 | QueryCallback queryCallback) = 0; 68 | }; 69 | 70 | } } 71 | 72 | #endif // _MWS_DBC_FORMULADB_HPP 73 | -------------------------------------------------------------------------------- /src/mws/dbc/LevCrawlDb.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (C) 2010-2013 KWARC Group 4 | 5 | This file is part of MathWebSearch. 6 | 7 | MathWebSearch is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | MathWebSearch is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with MathWebSearch. If not, see . 
19 | 20 | */ 21 | 22 | #ifndef _MWS_DBC_LEVCRAWLDB_HPP 23 | #define _MWS_DBC_LEVCRAWLDB_HPP 24 | 25 | /** 26 | * @file LevCrawlDb.hpp 27 | * @brief LevelDb Crawl Database API 28 | * @author Radu Hambasan 29 | * @date 11 Dec 2013 30 | * NOTE(review): the targets of the three system includes below are missing in this copy. The class holds a raw leveldb::DB pointer and declares a destructor but no copy control; confirm it is never copied. */ 31 | 32 | 33 | #include 34 | #include 35 | #include 36 | 37 | #include "mws/dbc/CrawlDb.hpp" 38 | 39 | namespace mws { 40 | namespace dbc { 41 | 42 | class LevCrawlDb : public CrawlDb { 43 | public: 44 | LevCrawlDb(); 45 | virtual ~LevCrawlDb(); 46 | 47 | /** @throw runtime_error */ 48 | void open(const char* path); 49 | /** @throw runtime_error */ 50 | void create_new(const char* path, bool deleteIfExists); 51 | 52 | /** 53 | * @brief insert crawled data 54 | * @param crawlData data associated with the crawl element 55 | * @return id of the crawl element 56 | * 57 | *@throw runtime_error 58 | */ 59 | virtual CrawlId putData(const CrawlData& crawlData); 60 | 61 | /** 62 | * @brief get crawled data 63 | * @param crawlId id of the crawl element 64 | * @return CrawlData corresponding to crawlId 65 | * @throw NotFound or I/O exceptions 66 | */ 67 | virtual const CrawlData getData(const CrawlId& crawlId); 68 | 69 | private: 70 | leveldb::DB* mDatabase; 71 | CrawlId mNextCrawlId; 72 | }; 73 | 74 | } // namespace dbc 75 | } // namespace mws 76 | 77 | #endif // _MWS_DBC_LEVCRAWLDB_HPP 78 | -------------------------------------------------------------------------------- /src/mws/dbc/LevFormulaDb.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (C) 2010-2013 KWARC Group 4 | 5 | This file is part of MathWebSearch. 6 | 7 | MathWebSearch is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version.
11 | 12 | MathWebSearch is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with MathWebSearch. If not, see . 19 | 20 | */ 21 | #ifndef _MWS_DBC_LEVFORMULADB_HPP 22 | #define _MWS_DBC_LEVFORMULADB_HPP 23 | 24 | /** 25 | * @file LevFormulaDb.hpp 26 | * @brief LevelDb Formula Database declarations 27 | * @author Radu Hambasan 28 | * @date 11 Dec 2013 29 | * NOTE(review): the targets of the three system includes below are missing in this copy. The class holds a raw leveldb::DB pointer and declares a destructor but no copy control; confirm it is never copied. */ 30 | 31 | #include 32 | 33 | #include 34 | #include 35 | 36 | #include "mws/dbc/FormulaDb.hpp" 37 | 38 | namespace mws { namespace dbc { 39 | 40 | class LevFormulaDb : public FormulaDb { 41 | public: 42 | LevFormulaDb(); 43 | virtual ~LevFormulaDb(); 44 | 45 | /**@throw runtime_error*/ 46 | void open(const char* path); 47 | 48 | /**@throw runtime_error*/ 49 | void create_new(const char* path, bool deleteIfExists); 50 | 51 | virtual int insertFormula(const types::FormulaId& formulaId, 52 | const CrawlId& crawlId, 53 | const types::FormulaPath& formulaPath); 54 | 55 | virtual int queryFormula(const types::FormulaId& formulaId, 56 | unsigned limitMin, 57 | unsigned limitSize, 58 | QueryCallback queryCallback); 59 | 60 | private: 61 | leveldb::DB* mDatabase; 62 | uint32_t mCounter; 63 | }; 64 | 65 | } // namespace dbc 66 | } // namespace mws 67 | 68 | #endif // _MWS_DBC_LEVFORMULADB_HPP 69 | -------------------------------------------------------------------------------- /src/mws/dbc/MemCrawlDb.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (C) 2010-2013 KWARC Group 4 | 5 | This file is part of MathWebSearch.
6 | 7 | MathWebSearch is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | MathWebSearch is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with MathWebSearch. If not, see . 19 | 20 | */ 21 | 22 | /** 23 | * @file MemCrawlDb.cpp 24 | * @brief Crawl Data Memory Database implementation 25 | * @date 12 Nov 2013 26 | */ 27 | 28 | #include 29 | #include 30 | #include 31 | using std::to_string; 32 | 33 | #include "mws/dbc/MemCrawlDb.hpp" 34 | 35 | using namespace std; 36 | 37 | namespace mws { 38 | namespace dbc { 39 | 40 | MemCrawlDb::MemCrawlDb() : mNextCrawlId(CRAWLID_NULL) {} 41 | 42 | /** @throw std::exception */ 43 | CrawlId MemCrawlDb::putData(const CrawlData& crawlData) { 44 | const CrawlId crawlId = ++mNextCrawlId; 45 | auto ret = mData.insert(make_pair(crawlId, crawlData)); 46 | if (!ret.second) { 47 | throw std::runtime_error("Duplicate entry at crawlId = " + 48 | to_string(crawlId)); 49 | } 50 | 51 | return crawlId; 52 | } 53 | 54 | /** @throw std::exception */ 55 | const CrawlData MemCrawlDb::getData(const CrawlId& crawlId) { 56 | auto it = mData.find(crawlId); 57 | if (it != mData.end()) { 58 | return it->second; 59 | } else { 60 | throw runtime_error("No data corresponding to crawlId = " + 61 | to_string(crawlId)); 62 | } 63 | } 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /src/mws/dbc/MemCrawlDb.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (C) 2010-2013 KWARC Group 4 | 5 | This 
file is part of MathWebSearch. 6 | 7 | MathWebSearch is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | MathWebSearch is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with MathWebSearch. If not, see . 19 | 20 | */ 21 | #ifndef _MWS_DBC_MEMCRAWLDB_HPP 22 | #define _MWS_DBC_MEMCRAWLDB_HPP 23 | 24 | /** 25 | * @file MemCrawlDb.hpp 26 | * @brief Crawl Data Memory Database declarations 27 | * @date 12 Nov 2013 28 | */ 29 | 30 | #include 31 | 32 | #include "mws/dbc/CrawlDb.hpp" 33 | 34 | namespace mws { namespace dbc { 35 | 36 | class MemCrawlDb : public CrawlDb { 37 | public: 38 | MemCrawlDb(); 39 | 40 | /** 41 | * @brief insert crawled data 42 | * @param crawlId id of the crawl element 43 | * @param crawlData data associated with the crawl element 44 | * @throw exception 45 | */ 46 | virtual CrawlId putData(const CrawlData& crawlData); 47 | 48 | /** 49 | * @brief get crawled data 50 | * @param crawlId id of the crawl element 51 | * @return CrawlData corresponding to crawlId 52 | * @throw NotFound or I/O exceptions 53 | */ 54 | virtual const CrawlData getData(const CrawlId& crawlId); 55 | 56 | private: 57 | std::map mData; 58 | CrawlId mNextCrawlId; 59 | }; 60 | 61 | } } 62 | 63 | #endif // _MWS_DBC_MEMCRAWLDB_HPP 64 | -------------------------------------------------------------------------------- /src/mws/dbc/MemFormulaDb.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (C) 2010-2013 KWARC Group 4 | 5 | This file is part of MathWebSearch. 
6 | 7 | MathWebSearch is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | MathWebSearch is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with MathWebSearch. If not, see . 19 | 20 | */ 21 | /** 22 | * @file MemFormulaDb.cpp 23 | * @brief Formula Memory Database implementation 24 | * @date 12 Nov 2013 25 | */ 26 | 27 | #include "mws/dbc/MemFormulaDb.hpp" 28 | 29 | using namespace std; 30 | 31 | namespace mws { 32 | namespace dbc { 33 | /** Appends a (crawlId, formulaPath) record to the list kept for formulaId, creating the list on first use. @return always 0 */ 34 | int MemFormulaDb::insertFormula(const types::FormulaId& formulaId, 35 | const CrawlId& crawlId, 36 | const types::FormulaPath& formulaPath) { 37 | MemFormulaDb::FormulaInfo formulaInfo; 38 | 39 | formulaInfo.crawlId = crawlId; 40 | formulaInfo.formulaPath = formulaPath; 41 | 42 | mData[formulaId].push_back(formulaInfo); 43 | 44 | return 0; 45 | } 46 | /** Calls queryCallback on at most limitSize records stored for formulaId, starting at offset limitMin. @return 0 when finished (also when formulaId is unknown or limitMin is past the end), -1 as soon as the callback returns non-zero */ 47 | int MemFormulaDb::queryFormula(const types::FormulaId& formulaId, 48 | unsigned limitMin, unsigned limitSize, 49 | QueryCallback queryCallback) { 50 | auto ret = mData.find(formulaId); 51 | if (ret == mData.end()) return 0; 52 | const auto& formulaInfos = ret->second; /* bind by reference: the stored vector is only read, so copying it per query is wasted work */ 53 | if (limitMin >= formulaInfos.size()) return 0; 54 | auto it = formulaInfos.begin() + limitMin; 55 | 56 | for (unsigned i = 0; i < limitSize && it != formulaInfos.end(); ++i, ++it) { 57 | if (queryCallback(it->crawlId, it->formulaPath) != 0) return -1; 58 | } 59 | 60 | return 0; 61 | } 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /src/mws/dbc/MemFormulaDb.hpp:
-------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (C) 2010-2013 KWARC Group 4 | 5 | This file is part of MathWebSearch. 6 | 7 | MathWebSearch is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | MathWebSearch is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with MathWebSearch. If not, see . 19 | 20 | */ 21 | #ifndef _MWS_DBC_MEMFORMULADB_HPP 22 | #define _MWS_DBC_MEMFORMULADB_HPP 23 | 24 | /** 25 | * @file MemFormulaDb.hpp 26 | * @brief Formula Memory Database declarations 27 | * @date 12 Nov 2013 28 | */ 29 | 30 | #include "mws/dbc/FormulaDb.hpp" 31 | 32 | #include 33 | #include 34 | 35 | namespace mws { namespace dbc { 36 | 37 | /** @brief FormulaDb implementation that keeps every inserted record in the in-memory member mData */ 38 | class MemFormulaDb : public FormulaDb { 39 | public: 40 | virtual int insertFormula(const types::FormulaId& formulaId, 41 | const CrawlId& crawlId, 42 | const types::FormulaPath& formulaPath); 43 | 44 | virtual int queryFormula(const types::FormulaId& formulaId, 45 | unsigned limitMin, 46 | unsigned limitSize, 47 | QueryCallback queryCallback); 48 | private: /** One stored record: a crawl id together with a formula path */ 49 | struct FormulaInfo { 50 | CrawlId crawlId; 51 | types::FormulaPath formulaPath; 52 | }; 53 | 54 | std::map > mData; 55 | }; 56 | 57 | } } 58 | 59 | #endif // _MWS_DBC_MEMFORMULADB_HPP 60 | -------------------------------------------------------------------------------- /src/mws/index/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2010-2013 KWARC Group 3 | # 4 | # This file is part of
MathWebSearch. 5 | # 6 | # MathWebSearch is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # MathWebSearch is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with MathWebSearch. If not, see . 18 | # 19 | # 20 | # src/mws/index/CMakeLists.txt -- 21 | # 22 | # 30 Jan 2013 23 | # c.prodescu@jacobs-university.de 24 | # 25 | 26 | # Module name 27 | SET(MODULE "mwsindex") 28 | 29 | # Sources (every .cpp/.c/.hpp/.h file in this directory is globbed into the library) 30 | FILE( GLOB SOURCES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cpp" "*.c" "*.hpp" "*.h") 31 | ADD_LIBRARY( ${MODULE} ${SOURCES} ) 32 | 33 | # Dependencies 34 | 35 | # Includes 36 | TARGET_INCLUDE_DIRECTORIES( ${MODULE} PRIVATE ${CRC32_INCLUDES}) 37 | 38 | # Flags 39 | 40 | # Binaries (libraries this module links against) 41 | TARGET_LINK_LIBRARIES(${MODULE} 42 | mwsdbc 43 | mwstypes 44 | mwsxmlparser 45 | commontypes 46 | commonutils 47 | ${CRC32_LIBRARIES} 48 | ) 49 | -------------------------------------------------------------------------------- /src/mws/index/CallbackIndexIterator.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (C) 2010-2014 KWARC Group 4 | 5 | This file is part of MathWebSearch. 6 | 7 | MathWebSearch is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version.
11 | 12 | MathWebSearch is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with MathWebSearch. If not, see . 19 | 20 | */ 21 | #ifndef _MWS_INDEX_CALLBACKINDEXITERATOR_HPP 22 | #define _MWS_INDEX_CALLBACKINDEXITERATOR_HPP 23 | 24 | /** 25 | * @brief Index Iterator with push/pop callbacks 26 | * @file CallbackIndexIterator.hpp 27 | * @author Corneliu-Claudiu Prodescu 28 | * @date 22 Jun 2014 29 | */ 30 | 31 | #include 32 | 33 | #include "mws/index/IndexIterator.hpp" 34 | 35 | namespace mws { 36 | namespace index { 37 | /** IndexIterator whose onPush/onPop hooks forward to the two callbacks stored at construction */ 38 | template 39 | class CallbackIndexIterator : public IndexIterator { 40 | typedef std::function Callback; 41 | Callback _onPush; 42 | Callback _onPop; 43 | 44 | public: /** Passes index and root to the IndexIterator base and keeps copies of both callbacks */ 45 | CallbackIndexIterator(typename Accessor::Index* index, 46 | typename Accessor::Node* root, 47 | Callback onPushCallback, Callback onPopCallback) 48 | : IndexIterator(index, root), 49 | _onPush(onPushCallback), 50 | _onPop(onPopCallback) {} 51 | 52 | protected: /** Each hook simply invokes the matching stored callback with the iterator it receives */ 53 | virtual void onPush(typename Accessor::Iterator iterator) { 54 | _onPush(iterator); 55 | } 56 | virtual void onPop(typename Accessor::Iterator iterator) { 57 | _onPop(iterator); 58 | } 59 | }; 60 | 61 | } // namespace index 62 | } // namespace mws 63 | 64 | #endif // _MWS_INDEX_CALLBACKINDEXITERATOR_HPP 65 | -------------------------------------------------------------------------------- /src/mws/index/IndexLoader.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (C) 2010-2013 KWARC Group 4 | 5 | This file is part of MathWebSearch.
6 | 7 | MathWebSearch is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | MathWebSearch is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with MathWebSearch. If not, see . 19 | 20 | */ 21 | #ifndef _MWS_INDEX_INDEXLOADER_HPP 22 | #define _MWS_INDEX_INDEXLOADER_HPP 23 | 24 | /** 25 | * @file IndexLoader.hpp 26 | * @brief Indexing Manager 27 | * @date 18 Nov 2013 28 | */ 29 | 30 | #include 31 | #include 32 | 33 | #include "mws/types/CmmlToken.hpp" 34 | #include "mws/dbc/FormulaDb.hpp" 35 | #include "mws/dbc/CrawlDb.hpp" 36 | #include "mws/dbc/DbQueryManager.hpp" 37 | #include "mws/index/MeaningDictionary.hpp" 38 | #include "mws/index/index.h" 39 | 40 | namespace mws { 41 | namespace index { 42 | /** Options accepted by IndexLoader; includeHits defaults to true. NOTE(review): what includeHits switches on is decided in IndexLoader.cpp - confirm there */ 43 | struct LoadingOptions { 44 | bool includeHits; 45 | LoadingOptions() : includeHits(true) {} 46 | }; 47 | /** Non-copyable (DISALLOW_COPY_AND_ASSIGN) holder of a loaded index and its databases */ 48 | class IndexLoader { 49 | public: 50 | /** 51 | * @brief Load an index stored on disk 52 | * @param indexPath path of the stored index 53 | * @param options loading options; a default-constructed LoadingOptions is used when omitted 54 | */ IndexLoader(const std::string& indexPath, 54 | const LoadingOptions& options = LoadingOptions()); 55 | 56 | ~IndexLoader(); 57 | /** Accessors returning raw pointers. NOTE(review): presumably non-owning views of the members below - confirm in IndexLoader.cpp */ 58 | dbc::FormulaDb* getFormulaDb(); 59 | dbc::DbQueryManager* getDbQueryManager(); 60 | index_handle_t* getIndexHandle(); 61 | index::MeaningDictionary* getMeaningDictionary(); 62 | 63 | private: 64 | index::MeaningDictionary m_meaningDictionary; 65 | std::unique_ptr m_formulaDb; 66 | std::unique_ptr m_crawlDb; 67 | std::unique_ptr m_dbQueryManager; 68 | index_handle_t m_index; 69 | memsector_handle_t m_memsectorHandler; 70 | 71 | DISALLOW_COPY_AND_ASSIGN(IndexLoader);
72 | }; 73 | 74 | } // namespace index 75 | } // namespace mws 76 | 77 | #endif // _MWS_INDEX_INDEXLOADER_HPP 78 | -------------------------------------------------------------------------------- /src/mws/index/IndexWriter.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (C) 2010-2014 KWARC Group 4 | 5 | This file is part of MathWebSearch. 6 | 7 | MathWebSearch is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | MathWebSearch is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with MathWebSearch. If not, see .
19 | 20 | */ 21 | #ifndef _MWS_INDEX_INDEXWRITER_HPP 22 | #define _MWS_INDEX_INDEXWRITER_HPP 23 | 24 | /** 25 | * @file IndexWriter.hpp 26 | * 27 | * @author Radu Hambasan 28 | * @date 29 May 2014 29 | */ 30 | 31 | #include 32 | #include 33 | 34 | #include "mws/index/IndexBuilder.hpp" 35 | #include "mws/index/ExpressionEncoder.hpp" 36 | 37 | namespace mws { 38 | namespace index { 39 | /** Settings consumed by createCompressedIndex. NOTE(review): deleteOldData has no initializer, so a default-initialized IndexConfiguration leaves it indeterminate until the caller assigns it */ 40 | struct IndexConfiguration { 41 | HarvesterConfiguration harvester; 42 | std::string dataPath; 43 | bool deleteOldData; 44 | }; 45 | 46 | /** 47 | * @brief Write an index, and associated data to disk 48 | * @param config harvester settings, data path and deleteOldData flag 49 | * @return 0 on success, 1 if an error occurs 50 | */ 51 | int createCompressedIndex(const IndexConfiguration& config); 52 | 53 | } // namespace index 54 | } // namespace mws 55 | 56 | #endif // _MWS_INDEX_INDEXWRITER_HPP 57 | -------------------------------------------------------------------------------- /src/mws/index/MeaningDictionary.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (C) 2010-2013 KWARC Group 4 | 5 | This file is part of MathWebSearch. 6 | 7 | MathWebSearch is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | MathWebSearch is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with MathWebSearch. If not, see .
19 | 20 | */ 21 | #ifndef _MWS_INDEX_MEANINGDICTIONARY_HPP 22 | #define _MWS_INDEX_MEANINGDICTIONARY_HPP 23 | 24 | /** 25 | * @brief Meaning Dictionary Class 26 | * @file MeaningDictionary.hpp 27 | * @author Corneliu-Claudiu Prodescu 28 | * @date 07 Jul 2011 29 | * 30 | * License: GPL v3 31 | * 32 | */ 33 | 34 | #include "common/types/IdDictionary.hpp" 35 | #include "mws/types/CmmlToken.hpp" 36 | #include "mws/index/encoded_token.h" 37 | 38 | namespace mws { 39 | namespace index { 40 | 41 | typedef common::types::IdDictionary 42 | MeaningDictionary; 43 | 44 | } // namespace index 45 | } // namespace mws 46 | 47 | #endif // _MWS_INDEX_MEANINGDICTIONARY_HPP 48 | -------------------------------------------------------------------------------- /src/mws/mws-index.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (C) 2010-2013 KWARC Group 4 | 5 | This file is part of MathWebSearch. 6 | 7 | MathWebSearch is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | MathWebSearch is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with MathWebSearch. If not, see .
19 | 20 | */ 21 | /** 22 | * @file mws-index.cpp 23 | * @brief mws-index executable 24 | * @author Radu Hambasan 25 | * @date 18 Jan 2014 26 | */ 27 | 28 | #include 29 | #include 30 | using std::exception; 31 | #include 32 | using std::string; 33 | #include 34 | 35 | #include "common/utils/FlagParser.hpp" 36 | using common::utils::FlagParser; 37 | #include "mws/index/IndexWriter.hpp" 38 | using mws::index::IndexConfiguration; 39 | using mws::index::createCompressedIndex; 40 | /** Parses the command-line flags into an IndexConfiguration and runs createCompressedIndex on it. Flags (short / long): o / output-directory and I / include-harvest-path are required and take an argument; r / recursive and c / enable-ci-renaming are optional switches; e / harvest-file-extension is optional and overrides the default extension "harvest". @return EXIT_FAILURE (after printing the usage text to stderr) when flag parsing fails, otherwise the value returned by createCompressedIndex */ 41 | int main(int argc, char* argv[]) { 42 | IndexConfiguration indexConfig{}; /* value-initialize: fields this function never assigns (e.g. deleteOldData) start as false/zero instead of indeterminate */ 43 | 44 | FlagParser::addFlag('o', "output-directory", FLAG_REQ, ARG_REQ); 45 | FlagParser::addFlag('I', "include-harvest-path", FLAG_REQ, ARG_REQ); 46 | FlagParser::addFlag('r', "recursive", FLAG_OPT, ARG_NONE); 47 | FlagParser::addFlag('e', "harvest-file-extension", FLAG_OPT, ARG_REQ); 48 | FlagParser::addFlag('c', "enable-ci-renaming", FLAG_OPT, ARG_NONE); 49 | 50 | if (FlagParser::parse(argc, argv) != 0) { 51 | fprintf(stderr, "%s", FlagParser::getUsage().c_str()); 52 | return EXIT_FAILURE; 53 | } 54 | 55 | indexConfig.harvester.fileExtension = "harvest"; 56 | if (FlagParser::hasArg('e')) { 57 | indexConfig.harvester.fileExtension = FlagParser::getArg('e'); 58 | } 59 | 60 | indexConfig.harvester.recursive = FlagParser::hasArg('r'); 61 | 62 | indexConfig.harvester.paths = FlagParser::getArgs('I'); 63 | indexConfig.harvester.encoding.renameCi = FlagParser::hasArg('c'); 64 | indexConfig.dataPath = FlagParser::getArg('o'); 65 | 66 | return createCompressedIndex(indexConfig); 67 | } 68 | -------------------------------------------------------------------------------- /src/mws/query/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2010-2013 KWARC Group 3 | # 4 | # This file is part of MathWebSearch.
5 | # 6 | # MathWebSearch is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # MathWebSearch is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with MathWebSearch. If not, see . 18 | # 19 | # 20 | # src/mws/query/CMakeLists.txt -- 21 | # 22 | # 18 Jun 2011 23 | # c.prodescu@jacobs-university.de 24 | # 25 | 26 | # Module name 27 | SET(MODULE "mwsquery") 28 | 29 | # Dependencies 30 | 31 | # Includes 32 | 33 | # Flags 34 | 35 | # Sources (every .cpp/.c/.hpp/.h file in this directory is globbed into the library) 36 | FILE( GLOB SOURCES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cpp" "*.c" "*.hpp" "*.h") 37 | 38 | # Binaries (the library target and the libraries it links against) 39 | ADD_LIBRARY( ${MODULE} ${SOURCES}) 40 | TARGET_LINK_LIBRARIES(${MODULE} 41 | mwsdbc 42 | mwsindex 43 | mwstypes 44 | commonutils 45 | ) 46 | -------------------------------------------------------------------------------- /src/mws/query/engine.h: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (C) 2010-2013 KWARC Group 4 | 5 | This file is part of MathWebSearch. 6 | 7 | MathWebSearch is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | MathWebSearch is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details.
16 | 17 | You should have received a copy of the GNU General Public License 18 | along with MathWebSearch. If not, see . 19 | 20 | */ 21 | /** 22 | * @brief Query engine API 23 | * @file engine.h 24 | * @date 21 Feb 2013 25 | * 26 | * License: GPLv3 27 | */ 28 | 29 | #ifndef __MWS_QUERY_ENGINE_H 30 | #define __MWS_QUERY_ENGINE_H 31 | 32 | #include "mws/index/index.h" 33 | #include "mws/index/encoded_token.h" 34 | #include "common/utils/compiler_defs.h" 35 | 36 | /*--------------------------------------------------------------------------*/ 37 | /* Type declarations */ 38 | /*--------------------------------------------------------------------------*/ 39 | /* Value a result callback returns to the engine. NOTE(review): presumably CONTINUE keeps iterating while STOP/ERROR end the run - confirm in the engine implementation */ 40 | typedef enum result_cb_return_e { 41 | QUERY_CONTINUE, 42 | QUERY_STOP, 43 | QUERY_ERROR 44 | } result_cb_return_t; 45 | 46 | /* TODO report the unifying instantiation */ 47 | typedef result_cb_return_t (*result_callback_t)(void* handle, 48 | const leaf_t* leaf); 49 | 50 | /*--------------------------------------------------------------------------*/ 51 | /* Methods */ 52 | /*--------------------------------------------------------------------------*/ 53 | 54 | BEGIN_DECLS 55 | 56 | int query_engine_run(index_handle_t* RESTRICT index, 57 | encoded_formula_t* RESTRICT query, result_callback_t cb, 58 | void* RESTRICT cb_handle); 59 | 60 | END_DECLS 61 | 62 | #endif // !__MWS_QUERY_ENGINE_H 63 | -------------------------------------------------------------------------------- /src/mws/types/Answer.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (C) 2010-2013 KWARC Group 4 | 5 | This file is part of MathWebSearch. 6 | 7 | MathWebSearch is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version.
11 | 12 | MathWebSearch is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with MathWebSearch. If not, see . 19 | 20 | */ 21 | #ifndef _MWS_TYPES_ANSWER_HPP 22 | #define _MWS_TYPES_ANSWER_HPP 23 | 24 | /** 25 | * @brief MWS Answer type 26 | * 27 | * @file Answer.hpp 28 | * @author Corneliu Claudiu Prodescu 29 | * @date 27 Apr 2011 30 | * 31 | * License: GPL v3 32 | * 33 | */ 34 | 35 | #include 36 | 37 | namespace mws { 38 | namespace types { 39 | 40 | /** 41 | * @brief A single answer, held as three strings: uri, xpath and data 42 | */ 43 | struct Answer { 44 | std::string uri; 45 | std::string xpath; 46 | std::string data; 47 | }; 48 | 49 | } // namespace types 50 | } // namespace mws 51 | 52 | #endif // _MWS_TYPES_ANSWER_HPP 53 | -------------------------------------------------------------------------------- /src/mws/types/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2010-2013 KWARC Group 3 | # 4 | # This file is part of MathWebSearch. 5 | # 6 | # MathWebSearch is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # MathWebSearch is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with MathWebSearch. If not, see .
18 | # 19 | # 20 | # src/mws/types/CMakeLists.txt -- 21 | # 22 | # 18 Jun 2011 23 | # c.prodescu@jacobs-university.de 24 | # 25 | 26 | # Module name 27 | SET(MODULE "mwstypes") 28 | 29 | # Dependencies 30 | 31 | # Includes 32 | 33 | # Flags 34 | 35 | # Sources 36 | FILE( GLOB SOURCES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cpp" "*.hpp") 37 | 38 | # Binaries 39 | ADD_LIBRARY( ${MODULE} ${SOURCES}) 40 | TARGET_LINK_LIBRARIES(${MODULE} 41 | commontypes 42 | commonutils 43 | ) 44 | -------------------------------------------------------------------------------- /src/mws/types/ExprSchema.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (C) 2010-2013 KWARC Group 4 | 5 | This file is part of MathWebSearch. 6 | 7 | MathWebSearch is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | MathWebSearch is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with MathWebSearch. If not, see . 19 | 20 | */ 21 | #ifndef _MWS_TYPES_EXPRSCHEMA_HPP 22 | #define _MWS_TYPES_EXPRSCHEMA_HPP 23 | 24 | /** 25 | * @author Radu Hambasan 26 | * @date 30 Dec 2014 27 | * 28 | * License: GPL v3 29 | * 30 | */ 31 | 32 | #include 33 | #include 34 | 35 | #include "mws/types/CmmlToken.hpp" 36 | 37 | namespace mws { 38 | namespace types { 39 | /** One expression schema. The constructor sets root to nullptr and coverage to 0; the struct declares no destructor, so it never frees root itself */ 40 | struct ExprSchema { 41 | CmmlToken* root; 42 | size_t coverage; 43 | /** Formulae belonging to this schema class.
44 | * Only the index of a formula is included, not the whole formula */ 45 | std::vector formulae; 46 | /** 47 | * hrefs to PMML ids that have been substituted with qvars 48 | */ 49 | std::vector subst; 50 | 51 | ExprSchema() : root(nullptr), coverage(0) {} 52 | }; 53 | 54 | } // namespace types 55 | } // namespace mws 56 | 57 | #endif // _MWS_TYPES_EXPRSCHEMA_HPP 58 | -------------------------------------------------------------------------------- /src/mws/types/GenericAnswer.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (C) 2010-2015 KWARC Group 4 | 5 | This file is part of MathWebSearch. 6 | 7 | MathWebSearch is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | MathWebSearch is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with MathWebSearch. If not, see . 19 | 20 | */ 21 | #ifndef _GENERICANSWER_HPP 22 | #define _GENERICANSWER_HPP 23 | 24 | /** 25 | * @author Radu Hambasan 26 | * @date 30 Dec 2014 27 | */ 28 | 29 | namespace mws { 30 | 31 | /** 32 | * @brief Generic interface used to answer queries 33 | * 34 | * Declares only a virtual destructor, so derived answers can be destroyed through a GenericAnswer pointer. 35 | */ struct GenericAnswer { 36 | virtual ~GenericAnswer() {} 37 | }; 38 | 39 | } // namespace mws 40 | 41 | #endif // _GENERICANSWER_HPP 42 | -------------------------------------------------------------------------------- /src/mws/types/MwsAnswset.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (C) 2010-2013 KWARC Group 4 | 5 | This file is part of MathWebSearch.
6 | 7 | MathWebSearch is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | MathWebSearch is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with MathWebSearch. If not, see . 19 | 20 | */ 21 | #ifndef _MWSANSWSET_HPP 22 | #define _MWSANSWSET_HPP 23 | 24 | /** 25 | * @brief File containing the header of the MwsAnswset class. 26 | * 27 | * @file MwsAnswset.hpp 28 | * @author Corneliu-Claudiu Prodescu 29 | * @date 27 Apr 2011 30 | */ 31 | 32 | #include 33 | #include 34 | #include 35 | #include 36 | 37 | #include "mws/types/Answer.hpp" 38 | #include "GenericAnswer.hpp" 39 | #include "mws/types/FormulaPath.hpp" 40 | 41 | namespace mws { 42 | 43 | /** 44 | * @brief Answer Set 45 | * 46 | * Owns the objects pointed to by answers: the destructor deletes each of them. NOTE(review): copying is not disabled, and a copy would delete the same pointers twice - confirm no caller copies this struct. */ 47 | struct MwsAnswset : GenericAnswer { 48 | /// Vector containing the MWS Answers 49 | std::vector answers; 50 | /// Total number of solutions in the index 51 | int total; 52 | /// Vector containing the qvar names 53 | std::vector qvarNames; 54 | /// Vector containing the qvar relative xpaths 55 | std::vector qvarXpaths; 56 | /// Set with the FormulaIds 57 | std::set ids; 58 | /// Duration for retrieving results (in ms) 59 | time_t time; 60 | 61 | MwsAnswset() : total(0), time(0) {} 62 | 63 | ~MwsAnswset() { 64 | for (auto answer : answers) { 65 | delete answer; 66 | } 67 | } 68 | }; 69 | 70 | } // namespace mws 71 | 72 | #endif // _MWSANSWSET_HPP 73 | -------------------------------------------------------------------------------- /src/mws/types/MwsSubst.hpp:
-------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (C) 2010-2013 KWARC Group 4 | 5 | This file is part of MathWebSearch. 6 | 7 | MathWebSearch is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | MathWebSearch is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with MathWebSearch. If not, see . 19 | 20 | */ 21 | #ifndef _MWSSUBST_HPP 22 | #define _MWSSUBST_HPP 23 | 24 | /** 25 | * @brief File containing the header of the MWS Substitution class. 26 | * 27 | * @file MwsSubst.hpp 28 | * @author Corneliu-Claudiu Prodescu 29 | * @date 31 May 2011 30 | * 31 | * License: GPL v3 32 | * 33 | */ 34 | 35 | // System includes 36 | 37 | #include // C++ vector class header 38 | 39 | // Local includes 40 | 41 | namespace mws { 42 | 43 | /** 44 | * @brief Datatype used to store a MWS Substitution 45 | */ 46 | struct MwsSubst { 47 | /// Xpaths of the substituted qvars 48 | std::vector qvarXpaths; 49 | 50 | MwsSubst() { 51 | // Nothing to do here 52 | } 53 | 54 | ~MwsSubst() { 55 | // Nothing to do here 56 | } 57 | 58 | private: /** The copy constructor is declared private, so code outside MwsSubst cannot copy-construct it (copy assignment is not restricted here) */ 59 | MwsSubst(const MwsSubst&); 60 | }; 61 | } // namespace mws 62 | 63 | #endif // _MWSSUBST_HPP 64 | -------------------------------------------------------------------------------- /src/mws/types/SchemaAnswset.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (C) 2010-2015 KWARC Group 4 | 5 | This file is part of MathWebSearch.
6 | 7 | MathWebSearch is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | MathWebSearch is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with MathWebSearch. If not, see . 19 | 20 | */ 21 | #ifndef _SCHEMAANSWSET_HPP 22 | #define _SCHEMAANSWSET_HPP 23 | 24 | /** 25 | * @author Radu Hambasan 26 | * @date 30 Dec 2014 27 | */ 28 | 29 | #include 30 | 31 | #include "mws/types/CmmlToken.hpp" 32 | #include "mws/types/ExprSchema.hpp" 33 | #include "GenericAnswer.hpp" 34 | 35 | namespace mws { 36 | /** @brief Answer made of expression schemata; the destructor deletes the root token of every schema stored in schemata */ 37 | struct SchemaAnswset : GenericAnswer { 38 | std::vector schemata; 39 | /// Total number of found schemata (some might have been dropped) 40 | int total; 41 | 42 | SchemaAnswset() : total(0) {} 43 | 44 | ~SchemaAnswset() { 45 | for (mws::types::ExprSchema& sch : schemata) { 46 | delete sch.root; 47 | } 48 | } 49 | }; 50 | 51 | } // namespace mws 52 | 53 | #endif // _SCHEMAANSWSET_HPP 54 | -------------------------------------------------------------------------------- /src/mws/xmlparser/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2010-2013 KWARC Group 3 | # 4 | # This file is part of MathWebSearch. 5 | # 6 | # MathWebSearch is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version.
10 | # 11 | # MathWebSearch is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with MathWebSearch. If not, see . 18 | # 19 | # 20 | # src/mws/xmlparser/CMakeLists.txt -- 21 | # 22 | # 18 Jun 2011 23 | # c.prodescu@jacobs-university.de 24 | # 25 | 26 | # Module name 27 | SET(MODULE "mwsxmlparser") 28 | 29 | # Sources (every .cpp/.hpp file in this directory is globbed into the library) 30 | FILE( GLOB SOURCES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cpp" "*.hpp") 31 | ADD_LIBRARY( ${MODULE} ${SOURCES}) 32 | 33 | 34 | # Dependencies (LibXml2 and Json are both REQUIRED) 35 | FIND_PACKAGE (LibXml2 REQUIRED) 36 | FIND_PACKAGE (Json REQUIRED) 37 | 38 | # Includes 39 | TARGET_INCLUDE_DIRECTORIES( ${MODULE} PRIVATE "${LIBXML2_INCLUDE_DIR}" ) 40 | TARGET_INCLUDE_DIRECTORIES( ${MODULE} PRIVATE "${JSON_INCLUDE_DIRS}" ) 41 | 42 | # Flags 43 | 44 | # Binaries (libraries this module links against) 45 | TARGET_LINK_LIBRARIES(${MODULE} 46 | mwsdbc 47 | mwstypes 48 | commonutils 49 | ${JSON_LIBRARIES} 50 | ${LIBXML2_LIBRARIES}) 51 | -------------------------------------------------------------------------------- /src/mws/xmlparser/MwsIdsResponseFormatter.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (C) 2010-2013 KWARC Group 4 | 5 | This file is part of MathWebSearch. 6 | 7 | MathWebSearch is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | MathWebSearch is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details.
16 | 17 | You should have received a copy of the GNU General Public License 18 | along with MathWebSearch. If not, see . 19 | 20 | */ 21 | #ifndef _MWS_PARSER_MWSIDSRESPONSE_FORMATTER_HPP 22 | #define _MWS_PARSER_MWSIDSRESPONSE_FORMATTER_HPP 23 | 24 | /** 25 | * @brief MwsIdsResponseFormatter 26 | * 27 | * @file MwsIdsResponseFormatter.hpp 28 | * @author Corneliu-Claudiu Prodescu 29 | * @date 30 Jul 2011 30 | * 31 | * @edited Radu Hambasan 32 | * @date 20 Mar 2014 33 | * License: GPL v3 34 | * 35 | */ 36 | 37 | #include "mws/types/GenericAnswer.hpp" 38 | #include "mws/types/Query.hpp" 39 | 40 | namespace mws { 41 | namespace parser { 42 | 43 | struct MwsIdsResponseFormatter : public types::Query::ResponseFormatter { 44 | static MwsIdsResponseFormatter instance; 45 | 46 | virtual const char* getContentType() const; 47 | virtual int writeData(const GenericAnswer* ans, FILE* output) const; 48 | }; 49 | 50 | extern MwsIdsResponseFormatter* RESPONSE_FORMATTER_MWS_IDS; 51 | 52 | } // namespace parser 53 | } // namespace mws 54 | 55 | #endif // _MWS_PARSER_MWSIDSRESPONSE_FORMATTER_HPP 56 | -------------------------------------------------------------------------------- /src/mws/xmlparser/MwsJsonResponseFormatter.hpp: -------------------------------------------------------------------------------- 1 |  2 | /* 3 | 4 | Copyright (C) 2010-2013 KWARC Group 5 | 6 | This file is part of MathWebSearch. 7 | 8 | MathWebSearch is free software: you can redistribute it and/or modify 9 | it under the terms of the GNU General Public License as published by 10 | the Free Software Foundation, either version 3 of the License, or 11 | (at your option) any later version. 12 | 13 | MathWebSearch is distributed in the hope that it will be useful, 14 | but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | GNU General Public License for more details. 
17 | 18 | You should have received a copy of the GNU General Public License 19 | along with MathWebSearch. If not, see . 20 | 21 | */ 22 | #ifndef _MWS_PARSER_MWSJSONRESPONSE_FORMATTER_HPP 23 | #define _MWS_PARSER_MWSJSONRESPONSE_FORMATTER_HPP 24 | 25 | /** 26 | * @file MwsJsonResponseFormatter.hpp 27 | * @author Corneliu-Claudiu Prodescu 28 | * @date 30 Jul 2011 29 | * 30 | * @edited Radu Hambasan 31 | * @date 20 Mar 2014 32 | * License: GPL v3 33 | * 34 | */ 35 | 36 | #include "mws/types/Query.hpp" 37 | #include "mws/types/GenericAnswer.hpp" 38 | namespace mws { 39 | namespace parser { 40 | 41 | struct MwsJsonResponseFormatter : public types::Query::ResponseFormatter { 42 | static MwsJsonResponseFormatter instance; 43 | 44 | virtual const char* getContentType() const; 45 | virtual int writeData(const GenericAnswer* ans, FILE* output) const; 46 | }; 47 | 48 | extern MwsJsonResponseFormatter* RESPONSE_FORMATTER_MWS_JSON; 49 | 50 | } // namespace parser 51 | } // namespace mws 52 | 53 | #endif // _MWS_PARSER_MWSJSONRESPONSE_FORMATTER_HPP 54 | -------------------------------------------------------------------------------- /src/mws/xmlparser/MwsXmlResponseFormatter.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (C) 2010-2013 KWARC Group 4 | 5 | This file is part of MathWebSearch. 6 | 7 | MathWebSearch is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | MathWebSearch is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with MathWebSearch. 
If not, see . 19 | 20 | */ 21 | #ifndef _MWS_PARSER_MWSXMLRESPONSE_FORMATTER_HPP 22 | #define _MWS_PARSER_MWSXMLRESPONSE_FORMATTER_HPP 23 | 24 | /** 25 | * @file MwsXmlResponseFormatter.hpp 26 | * @author Corneliu-Claudiu Prodescu 27 | * @date 30 Jul 2011 28 | * 29 | * @edited Radu Hambasan 30 | * @date 20 Mar 2014 31 | * License: GPL v3 32 | * 33 | */ 34 | 35 | #include "mws/types/GenericAnswer.hpp" 36 | #include "mws/types/Query.hpp" 37 | 38 | namespace mws { 39 | namespace parser { 40 | 41 | struct MwsXmlResponseFormatter : public types::Query::ResponseFormatter { 42 | static MwsXmlResponseFormatter instance; 43 | 44 | virtual const char* getContentType() const; 45 | virtual int writeData(const GenericAnswer* ans, FILE* output) const; 46 | }; 47 | 48 | extern const MwsXmlResponseFormatter* RESPONSE_FORMATTER_MWS_XML; 49 | 50 | } // namespace parser 51 | } // namespace mws 52 | 53 | #endif // _MWS_PARSER_MWSXMLRESPONSE_FORMATTER_HPP 54 | -------------------------------------------------------------------------------- /src/mws/xmlparser/SchemaJsonResponseFormatter.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (C) 2010-2013 KWARC Group 4 | 5 | This file is part of MathWebSearch. 6 | 7 | MathWebSearch is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | MathWebSearch is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with MathWebSearch. If not, see . 
19 | 20 | */ 21 | #ifndef _MWS_PARSER_SCHEMAJSONRESPONSE_FORMATTER_HPP 22 | #define _MWS_PARSER_SCHEMAJSONRESPONSE_FORMATTER_HPP 23 | 24 | /** 25 | * @author Radu Hambasan 26 | * @date 28 Jan 2015 27 | * License: GPL v3 28 | * 29 | */ 30 | 31 | #include 32 | #include 33 | 34 | #include "mws/types/CmmlToken.hpp" 35 | #include "mws/types/Query.hpp" 36 | #include "mws/types/GenericAnswer.hpp" 37 | 38 | namespace mws { 39 | namespace parser { 40 | 41 | struct SchemaJsonResponseFormatter : public types::Query::ResponseFormatter { 42 | static SchemaJsonResponseFormatter instance; 43 | 44 | virtual const char* getContentType() const; 45 | virtual int writeData(const GenericAnswer* ans, FILE* output) const; 46 | }; 47 | 48 | extern SchemaJsonResponseFormatter* RESPONSE_FORMATTER_SCHEMA_JSON; 49 | 50 | } // namespace parser 51 | } // namespace mws 52 | 53 | #endif // _MWS_PARSER_SCHEMAJSONRESPONSE_FORMATTER_HPP 54 | -------------------------------------------------------------------------------- /src/mws/xmlparser/SchemaXmlResponseFormatter.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (C) 2010-2013 KWARC Group 4 | 5 | This file is part of MathWebSearch. 6 | 7 | MathWebSearch is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | MathWebSearch is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with MathWebSearch. If not, see . 
19 | 20 | */ 21 | #ifndef _MWS_PARSER_SCHEMAXMLRESPONSE_FORMATTER_HPP 22 | #define _MWS_PARSER_SCHEMAXMLRESPONSE_FORMATTER_HPP 23 | 24 | /** 25 | * @author Radu Hambasan 26 | * @date 30 Dec 2014 27 | * 28 | * License: GPL v3 29 | * 30 | */ 31 | 32 | #include 33 | #include "mws/types/CmmlToken.hpp" 34 | #include "mws/types/GenericAnswer.hpp" 35 | #include "mws/types/Query.hpp" 36 | 37 | namespace mws { 38 | namespace parser { 39 | 40 | struct SchemaXmlResponseFormatter : public types::Query::ResponseFormatter { 41 | static SchemaXmlResponseFormatter instance; 42 | 43 | virtual const char* getContentType() const; 44 | virtual int writeData(const GenericAnswer* ans, FILE* output) const; 45 | 46 | private: 47 | int printCmmlToken(const types::CmmlToken* root, xmlTextWriter* wrt) const; 48 | }; 49 | 50 | extern const SchemaXmlResponseFormatter* RESPONSE_FORMATTER_SCHEMA_XML; 51 | 52 | } // namespace parser 53 | } // namespace mws 54 | 55 | #endif // _MWS_PARSER_SCHEMAXMLRESPONSE_FORMATTER_HPP 56 | -------------------------------------------------------------------------------- /src/mws/xmlparser/readMwsQuery.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (C) 2010-2013 KWARC Group 4 | 5 | This file is part of MathWebSearch. 6 | 7 | MathWebSearch is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | MathWebSearch is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with MathWebSearch. If not, see . 
19 | 20 | */ 21 | #ifndef _READMWSQUERY_HPP 22 | #define _READMWSQUERY_HPP 23 | 24 | /** 25 | * @brief File containing the header of the readMwsQuery function 26 | * 27 | * @file readMwsQuery.hpp 28 | * @author Corneliu-Claudiu Prodescu 29 | * @date 18 Apr 2011 30 | * 31 | * @edited Radu Hambasan 32 | * @date 20 Mar 2014 33 | * 34 | * License: GPL v3 35 | * 36 | */ 37 | 38 | #include 39 | #include "mws/types/Query.hpp" 40 | 41 | namespace mws { 42 | namespace xmlparser { 43 | 44 | enum QueryMode { 45 | QUERY_MWS, 46 | QUERY_SCHEMA 47 | }; 48 | 49 | /** 50 | * @brief Function to read a Query from an input FILE stream. 51 | * @param file is the stream from where to read. 52 | * @param mode is the query mode (QUERY_MWS by default, or QUERY_SCHEMA). 53 | * @return a pointer to the Query read, or NULL in case of failure. 54 | */ 55 | mws::types::Query* readMwsQuery(FILE* file, QueryMode mode = QUERY_MWS); 56 | 57 | } // namespace xmlparser 58 | } // namespace mws 59 | 60 | #endif // _READMWSQUERY_HPP 61 | -------------------------------------------------------------------------------- /src/mws/xmlparser/xmlparser.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (C) 2010-2013 KWARC Group 4 | 5 | This file is part of MathWebSearch. 6 | 7 | MathWebSearch is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | MathWebSearch is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with MathWebSearch. If not, see <http://www.gnu.org/licenses/>.
19 | 20 | */ 21 | /** 22 | * @file xmlparser.cpp 23 | * @author Corneliu-Claudiu Prodescu 24 | * @date 05 May 2011 25 | * 26 | * 27 | */ 28 | 29 | #include 30 | #include 31 | 32 | #include "mws/xmlparser/xmlparser.hpp" 33 | 34 | namespace mws { 35 | namespace parser { 36 | 37 | int initxmlparser() { 38 | // Initializing the library and checking potential ABI mismatches between 39 | // the version it was compiled for and the actual shared library used. 40 | LIBXML_TEST_VERSION; 41 | 42 | // Register xmlCleanupParser to be called at program exit 43 | return atexit(xmlCleanupParser); 44 | } 45 | 46 | } // namespace parser 47 | } // namespace mws 48 | -------------------------------------------------------------------------------- /src/mws/xmlparser/xmlparser.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (C) 2010-2013 KWARC Group 4 | 5 | This file is part of MathWebSearch. 6 | 7 | MathWebSearch is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | MathWebSearch is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with MathWebSearch. If not, see . 19 | 20 | */ 21 | #ifndef _MWS_PARSER_XMLPARSER_HPP 22 | #define _MWS_PARSER_XMLPARSER_HPP 23 | 24 | /** 25 | * @file xmlparser.hpp 26 | * @author Corneliu-Claudiu Prodescu 27 | * @date 05 May 2011 28 | * 29 | */ 30 | 31 | namespace mws { 32 | namespace parser { 33 | 34 | /** 35 | * @brief Method to initialize the xmlparser module of Math Web Search. 
36 | * @return 0 if xmlCleanupParser was registered to be called at program exit 37 | * @return a non-zero value if that registration failed 38 | * (the value is the result of atexit, not a threading indicator) 39 | */ 40 | int initxmlparser(); 41 | 42 | } // namespace parser 43 | } // namespace mws 44 | 45 | #endif // _MWS_PARSER_XMLPARSER_HPP 46 | -------------------------------------------------------------------------------- /test/data/MwsQuery1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | subscript 10 | R 11 | s 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | subscript 20 | R 21 | s 22 | 23 | 24 | subscript 25 | xaa 26 | 27 | 28 | 29 | x 30 | f 31 | 32 | 33 | 34 | 35 | 36 | 37 | subscript 38 | R 39 | s 40 | 41 | 0 42 | 43 | 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /test/data/ci_renaming.harvest: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | + 6 | a 7 | b 8 | 9 | 10 | 11 | 12 | 13 | + 14 | x 15 | y 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /test/data/data1.harvest: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | subscript 9 | 10 | 11 | 12 | x 13 | 0 14 | 15 | 16 | 17 | 18 | 1 19 | 20 | superscript 21 | x 22 | 2 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /test/data/data2.harvest: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | bar 5 | 6 | 9 | 10 | 11 | subscript 12 | 13 | 14 | 15 | x 16 | 0 17 | 18 | 19 | 20 | 21 | 1 22 | 23 | superscript 24 | x 25 | 2 26 | 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /test/data/data3.harvest:
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | bar 5 | 6 | 9 | 10 | 11 | subscript 12 | 13 | 14 | 15 | x 16 | 0 17 | 18 | 19 | 20 | 21 | 1 22 | 23 | superscript 24 | x 25 | 2 26 | 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /test/data/data4.harvest: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | CONTENT 5 | 6 | 7 | 119 8 | 9 | 10 | -------------------------------------------------------------------------------- /test/data/empty.harvest: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MathWebSearch/mws/35306ab7fd8a01b39426881cbe8088197e023f36/test/data/empty.harvest -------------------------------------------------------------------------------- /test/data/eq_ambiguity.harvest: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | P 7 | eq 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /test/src/common/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2010-2013 KWARC Group 3 | # 4 | # This file is part of MathWebSearch. 5 | # 6 | # MathWebSearch is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # MathWebSearch is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with MathWebSearch. 
If not, see . 18 | # 19 | ADD_SUBDIRECTORY( utils ) 20 | ADD_SUBDIRECTORY( types ) 21 | -------------------------------------------------------------------------------- /test/src/common/types/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2010-2013 KWARC Group 3 | # 4 | # This file is part of MathWebSearch. 5 | # 6 | # MathWebSearch is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # MathWebSearch is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with MathWebSearch. If not, see . 
18 | # 19 | # 20 | # test/src/common/types/CMakeLists.txt -- 21 | # 22 | # 10 Dec 2013 23 | # c.prodescu@jacobs-university.de 24 | # 25 | 26 | # Dependencies 27 | 28 | # Includes 29 | 30 | # Flags 31 | 32 | # Sources 33 | FILE( GLOB SOURCES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cpp" "*.c") 34 | 35 | # Binaries 36 | FOREACH(source ${SOURCES}) 37 | GET_FILENAME_COMPONENT(SourceName ${source} NAME_WE) 38 | # Generate Binaries 39 | ADD_EXECUTABLE(${SourceName} ${source}) 40 | TARGET_LINK_LIBRARIES(${SourceName} 41 | commontypes 42 | commonutils 43 | mwstypes) 44 | # Add test 45 | SET(TestName "test_${SourceName}") 46 | ADD_TEST(${TestName} ${SourceName}) 47 | ENDFOREACH(source) 48 | -------------------------------------------------------------------------------- /test/src/common/utils/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2010-2013 KWARC Group 3 | # 4 | # This file is part of MathWebSearch. 5 | # 6 | # MathWebSearch is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # MathWebSearch is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with MathWebSearch. If not, see . 
18 | # 19 | # 20 | # test/src/common/utils/CMakeLists.txt -- 21 | # 22 | # 19 Jun 2011 23 | # c.prodescu@jacobs-university.de 24 | # 25 | 26 | # Dependencies 27 | 28 | # Includes 29 | 30 | # Flags 31 | 32 | # Sources 33 | FILE( GLOB SOURCES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cpp" "*.c") 34 | 35 | # Binaries 36 | FOREACH(source ${SOURCES}) 37 | GET_FILENAME_COMPONENT(SourceName ${source} NAME_WE) 38 | # Generate Binaries 39 | ADD_EXECUTABLE(${SourceName} ${source}) 40 | TARGET_LINK_LIBRARIES(${SourceName} 41 | commonutils) 42 | # Add test 43 | SET(TestName "test_${SourceName}") 44 | ADD_TEST(${TestName} ${SourceName}) 45 | ENDFOREACH(source) 46 | -------------------------------------------------------------------------------- /test/src/common/utils/common_utils_mmap.c: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (C) 2010-2013 KWARC Group 4 | 5 | This file is part of MathWebSearch. 6 | 7 | MathWebSearch is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | MathWebSearch is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with MathWebSearch. If not, see . 
19 | 20 | */ 21 | /** 22 | * @file common_utils_mmap.c 23 | * @brief Test of the mmap helpers: create, unload, load and remove a mapped file 24 | */ 25 | 26 | #include 27 | #include 28 | 29 | #include "common/utils/compiler_defs.h" 30 | #include "common/utils/mmap.h" 31 | 32 | 33 | #define TMPFILE_PATH "/tmp/test.map" 34 | #define TMPFILE_SIZE 4 * 1024 35 | 36 | 37 | int main() { 38 | mmap_handle_t m; 39 | 40 | /* ensure the file does not exist */ 41 | FAIL_ON(unlink(TMPFILE_PATH) != 0 && errno != ENOENT); 42 | 43 | /* create and map read-write */ 44 | FAIL_ON(mmap_create(TMPFILE_PATH, TMPFILE_SIZE, MAP_PRIVATE, &m) != 0); 45 | 46 | /* unmap */ 47 | FAIL_ON(mmap_unload(&m) != 0); 48 | 49 | /* map read-only */ 50 | FAIL_ON(mmap_load(TMPFILE_PATH, MAP_PRIVATE, &m) != 0); 51 | 52 | /* remove mmapped file */ 53 | FAIL_ON(mmap_remove(&m) != 0); 54 | 55 | return 0; 56 | 57 | fail: 58 | return -1; 59 | } 60 | -------------------------------------------------------------------------------- /test/src/mws/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2010-2013 KWARC Group 3 | # 4 | # This file is part of MathWebSearch. 5 | # 6 | # MathWebSearch is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # MathWebSearch is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with MathWebSearch. If not, see <http://www.gnu.org/licenses/>.
18 | # 19 | ADD_SUBDIRECTORY(dbc) 20 | ADD_SUBDIRECTORY(index) 21 | ADD_SUBDIRECTORY(parser) 22 | ADD_SUBDIRECTORY(query) 23 | 24 | CONFIGURE_FILE(mws-integration-test.sh.in mws-integration-test.sh @ONLY) 25 | ADD_TEST(NAME integration_queries 26 | COMMAND mws-integration-test.sh) 27 | SET_TESTS_PROPERTIES(integration_queries PROPERTIES TIMEOUT 10) 28 | -------------------------------------------------------------------------------- /test/src/mws/dbc/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2010-2013 KWARC Group 3 | # 4 | # This file is part of MathWebSearch. 5 | # 6 | # MathWebSearch is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # MathWebSearch is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with MathWebSearch. If not, see . 
18 | # 19 | # 20 | # test/src/mws/dbc/CMakeLists.txt -- 21 | # 22 | # 19 Jun 2013 23 | # c.prodescu@jacobs-university.de 24 | # 25 | 26 | # Dependencies 27 | 28 | # Includes 29 | 30 | # Flags 31 | 32 | # Sources 33 | FILE( GLOB SOURCES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cpp" "*.c") 34 | 35 | # Binaries 36 | FOREACH(source ${SOURCES}) 37 | GET_FILENAME_COMPONENT(SourceName ${source} NAME_WE) 38 | # Generate Binaries 39 | ADD_EXECUTABLE(${SourceName} ${source}) 40 | TARGET_LINK_LIBRARIES(${SourceName} 41 | mwsdbc 42 | mwstypes 43 | commonutils) 44 | # Add test 45 | SET(TestName "test_${SourceName}") 46 | ADD_TEST(${TestName} ${SourceName}) 47 | ENDFOREACH(source) 48 | -------------------------------------------------------------------------------- /test/src/mws/dbc/MemCrawlDb.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (C) 2010-2013 KWARC Group 4 | 5 | This file is part of MathWebSearch. 6 | 7 | MathWebSearch is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | MathWebSearch is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with MathWebSearch. If not, see . 
19 | 20 | */ 21 | /** 22 | * @file MemCrawlDb.cpp 23 | * 24 | */ 25 | 26 | #include 27 | 28 | #include "mws/dbc/MemCrawlDb.hpp" 29 | using mws::dbc::CrawlId; 30 | using mws::dbc::CrawlData; 31 | using mws::dbc::CrawlDb; 32 | using mws::dbc::MemCrawlDb; 33 | 34 | #include "common/utils/compiler_defs.h" 35 | 36 | int main() { 37 | CrawlDb* crawlDb = new MemCrawlDb(); 38 | CrawlData crawlData = "foobar"; 39 | 40 | CrawlId crawlId = crawlDb->putData(crawlData); 41 | 42 | // Check if data is inserted 43 | FAIL_ON((crawlData = crawlDb->getData(crawlId)) != "foobar"); 44 | 45 | // Check for false positives 46 | try { 47 | CrawlData data = crawlDb->getData(42); 48 | goto fail; 49 | } catch (...) { 50 | // ignore 51 | } 52 | 53 | delete crawlDb; 54 | 55 | return 0; 56 | 57 | fail: 58 | return -1; 59 | } 60 | -------------------------------------------------------------------------------- /test/src/mws/dbc/MemFormulaDb.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (C) 2010-2013 KWARC Group 4 | 5 | This file is part of MathWebSearch. 6 | 7 | MathWebSearch is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | MathWebSearch is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with MathWebSearch. If not, see . 
19 | 20 | */ 21 | /** 22 | * @file MemFormulaDb.cpp 23 | * 24 | */ 25 | 26 | #include 27 | using std::vector; 28 | 29 | #include "common/utils/compiler_defs.h" 30 | #include "mws/dbc/CrawlDb.hpp" 31 | using mws::dbc::CrawlId; 32 | #include "mws/dbc/FormulaDb.hpp" 33 | using mws::dbc::FormulaDb; 34 | #include "mws/dbc/MemFormulaDb.hpp" 35 | using mws::dbc::MemFormulaDb; 36 | #include "mws/types/FormulaPath.hpp" 37 | using mws::types::FormulaPath; 38 | 39 | struct FormulaInfo { 40 | CrawlId crawlId; 41 | FormulaPath formulaPath; 42 | }; 43 | 44 | vector g_infos{{0, FormulaPath("id1", "0")}, 45 | {3, FormulaPath("id2", "1")}, 46 | {2, FormulaPath("id3", "4")}}; 47 | 48 | const int TEST_START_IDX = 1; 49 | 50 | static int queryCallback(const CrawlId& crawlId, 51 | const FormulaPath& formulaPath) { 52 | static int i = TEST_START_IDX; 53 | 54 | const FormulaInfo& info = g_infos[i++]; 55 | 56 | FAIL_ON(info.crawlId != crawlId); 57 | FAIL_ON(info.formulaPath != formulaPath); 58 | 59 | return 0; 60 | 61 | fail: 62 | return -1; 63 | } 64 | 65 | int main() { 66 | FormulaDb* formulaDb = new MemFormulaDb(); 67 | 68 | for (auto it = g_infos.begin(); it != g_infos.end(); it++) { 69 | FAIL_ON(formulaDb->insertFormula(0, it->crawlId, it->formulaPath) != 0); 70 | } 71 | 72 | FAIL_ON(formulaDb->queryFormula(0, TEST_START_IDX, 2, queryCallback) != 0); 73 | 74 | delete formulaDb; 75 | 76 | return 0; 77 | 78 | fail: 79 | return -1; 80 | } 81 | -------------------------------------------------------------------------------- /test/src/mws/index/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2010-2013 KWARC Group 3 | # 4 | # This file is part of MathWebSearch. 
5 | # 6 | # MathWebSearch is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # MathWebSearch is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with MathWebSearch. If not, see . 18 | # 19 | # 20 | # test/src/mws/index/CMakeLists.txt -- 21 | # 22 | # 19 Jun 2011 23 | # c.prodescu@jacobs-university.de 24 | # 25 | 26 | # Dependencies 27 | 28 | # Includes 29 | INCLUDE_DIRECTORIES( "${LIBXML2_INCLUDE_DIR}" ) 30 | 31 | # Flags 32 | 33 | # Sources 34 | FILE( GLOB SOURCES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cpp" "*.c") 35 | 36 | FOREACH(source ${SOURCES}) 37 | GET_FILENAME_COMPONENT(SourceName ${source} NAME_WE) 38 | # Generate Binaries 39 | ADD_EXECUTABLE(${SourceName} ${source}) 40 | TARGET_LINK_LIBRARIES(${SourceName} 41 | mwsdbc 42 | mwsindex 43 | mwsxmlparser 44 | commonutils 45 | ${LIBXML2_LIBRARIES}) 46 | # Add test 47 | SET(TestName "test_${SourceName}") 48 | ADD_TEST(${TestName} ${SourceName}) 49 | ENDFOREACH(source) 50 | -------------------------------------------------------------------------------- /test/src/mws/mws-integration-test.sh.in: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | cd @PROJECT_SOURCE_DIR@ 3 | MWSD_PID="$$" 4 | MWSD_PORT="4444" 5 | 6 | QVAR_QUERY="" 7 | QVAR_TOTAL="28328" 8 | 9 | test_query() { 10 | QUERY="$1" 11 | TOTAL="$2" 12 | 13 | MWS_RESPONSE=`./scripts/send-mws-query.sh $MWSD_PORT 0 0 "$QUERY"` 14 | 15 | # check response 16 | echo $MWS_RESPONSE | grep -q "total=\"$TOTAL\"" || { 17 | echo Wrong output 
for query "$QUERY" - expected "$TOTAL" 18 | # terminate mwsd if answer is wrong 19 | kill -KILL $MWSD_PID 20 | } 21 | } 22 | 23 | query_and_close_mwsd() { 24 | while kill -s 0 $MWSD_PID &> /dev/null; do 25 | # wait for mwsd port to be up 26 | nc -z localhost $MWSD_PORT 2>&1 > /dev/null || continue 27 | 28 | # run tests 29 | test_query "" "28304" 30 | test_query "" "82" 31 | test_query "" "0" 32 | test_query "" "276" 33 | 34 | # terminate mwsd gracefully 35 | kill -INT $MWSD_PID 36 | exit 0 37 | done 38 | } 39 | 40 | query_and_close_mwsd & 41 | exec bin/mwsd -I data/zbl -p $MWSD_PORT 42 | -------------------------------------------------------------------------------- /test/src/mws/parser/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2010-2013 KWARC Group 3 | # 4 | # This file is part of MathWebSearch. 5 | # 6 | # MathWebSearch is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # MathWebSearch is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with MathWebSearch. If not, see . 
18 | # 19 | # 20 | # test/src/mws/parser/CMakeLists.txt -- 21 | # 22 | # 19 Jun 2011 23 | # c.prodescu@jacobs-university.de 24 | # 25 | 26 | # Dependencies 27 | FIND_PACKAGE (LibXml2 REQUIRED) 28 | 29 | # Includes 30 | INCLUDE_DIRECTORIES( "${LIBXML2_INCLUDE_DIR}" ) 31 | 32 | # Flags 33 | 34 | # Sources 35 | FILE( GLOB SOURCES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cpp" "*.c") 36 | 37 | # Binaries: one executable and one test (named test_<source name>) per source file 38 | FOREACH(source ${SOURCES}) 39 | GET_FILENAME_COMPONENT(SourceName ${source} NAME_WE) 40 | # Generate Binaries 41 | ADD_EXECUTABLE(${SourceName} ${source}) 42 | TARGET_LINK_LIBRARIES(${SourceName} 43 | mwsxmlparser 44 | commonutils 45 | ${LIBXML2_LIBRARIES}) 46 | # Add test 47 | SET(TestName "test_${SourceName}") 48 | ADD_TEST(${TestName} ${SourceName}) 49 | ENDFOREACH(source) 50 | -------------------------------------------------------------------------------- /test/src/mws/parser/readMwsQueryTest.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (C) 2010-2013 KWARC Group 4 | 5 | This file is part of MathWebSearch. 6 | 7 | MathWebSearch is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | MathWebSearch is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with MathWebSearch. If not, see . 
19 | 20 | */ 21 | /** 22 | * @brief Testing for the readMwsQuery function - implementation 23 | * 24 | * @file readMwsQueryTest.cpp 25 | * @author Prodescu Corneliu-Claudiu 26 | * @date 27 Apr 2011 27 | * 28 | * License: GPL v3 29 | * 30 | */ 31 | 32 | #include // Primitive System datatypes 33 | #include // POSIX File characteristics 34 | #include // File control operations 35 | #include // LibXML parser header 36 | #include // C++ String header 37 | #include 38 | #include 39 | 40 | #include "mws/xmlparser/readMwsQuery.hpp" 41 | #include "common/utils/compiler_defs.h" 42 | 43 | #include "build-gen/config.h" 44 | 45 | // Namespaces 46 | 47 | using namespace std; 48 | using namespace mws; 49 | // Test entry point: opens MwsQuery1.xml under MWS_TESTDATA_PATH, parses it with xmlparser::readMwsQuery and checks the parsed query's warnings, attrResultMaxSize, attrResultLimitMin and token count. Returns EXIT_SUCCESS, or EXIT_FAILURE when a FAIL_ON check fails (FAIL_ON is defined outside this file and presumably jumps to the fail label -- confirm in compiler_defs.h). 50 | int main() { 51 | types::Query* result; 52 | FILE* file; 53 | const char* xmlfile = "MwsQuery1.xml"; 54 | string xml_path = (string)MWS_TESTDATA_PATH + "/" + (string)xmlfile; 55 | 56 | file = fopen(xml_path.c_str(), "r"); 57 | FAIL_ON(file == nullptr); 58 | 59 | result = xmlparser::readMwsQuery(file); 60 | 61 | FAIL_ON(result == nullptr); 62 | FAIL_ON(result->warnings != 0); 63 | FAIL_ON(result->attrResultMaxSize != 24); 64 | FAIL_ON(result->attrResultLimitMin != 1); 65 | FAIL_ON(result->tokens.size() != (size_t)1); 66 | 67 | delete result; 68 | 69 | fclose(file); 70 | 71 | (void)xmlCleanupParser(); 72 | 73 | return EXIT_SUCCESS; 74 | // NOTE: the failure path below returns without fclose(file), delete result or xmlCleanupParser(). 75 | fail: 76 | return EXIT_FAILURE; 77 | } 78 | -------------------------------------------------------------------------------- /test/src/mws/parser/writeXmlAnswsetTest.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (C) 2010-2013 KWARC Group 4 | 5 | This file is part of MathWebSearch. 6 | 7 | MathWebSearch is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 
11 | 12 | MathWebSearch is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with MathWebSearch. If not, see . 19 | 20 | */ 21 | /** 22 | * @brief Testing for the MWS XML response formatter (writeData) - implementation 23 | * 24 | * @file writeXmlAnswsetTest.cpp 25 | * @author Prodescu Corneliu-Claudiu 26 | * @date 27 Apr 2011 27 | * 28 | * License: GPL v3 29 | * 30 | */ 31 | 32 | #include // Primitive System datatypes 33 | #include // POSIX File characteristics 34 | #include // File control operations 35 | #include // C++ String header 36 | #include 37 | #include 38 | 39 | #include "mws/types/Answer.hpp" 40 | #include "mws/types/MwsAnswset.hpp" 41 | #include "mws/xmlparser/MwsXmlResponseFormatter.hpp" 42 | using mws::parser::RESPONSE_FORMATTER_MWS_XML; 43 | #include "common/utils/compiler_defs.h" 44 | 45 | // Macros 46 | 47 | #define TMP_PATH "/tmp/" 48 | // Test entry point: builds an MwsAnswset holding one Answer, writes it to a file under TMP_PATH through RESPONSE_FORMATTER_MWS_XML->writeData and expects the call to return 260 (presumably the number of bytes written -- confirm in MwsXmlResponseFormatter). Returns EXIT_SUCCESS, or EXIT_FAILURE when a FAIL_ON check fails. 49 | int main() { 50 | using mws::MwsAnswset; 51 | using std::string; 52 | using mws::types::Answer; 53 | // TMP_PATH already ends in '/', so the path below is "/tmp//MwsAnswset1.xml" (the doubled slash is harmless on POSIX). 54 | const string xml_path = (string)TMP_PATH + "/MwsAnswset1.xml"; 55 | 56 | auto answer = new Answer(); 57 | answer->data = "lalala"; 58 | answer->uri = "http://foo"; 59 | answer->xpath = "//*[1]"; 60 | MwsAnswset answset; 61 | answset.answers.push_back(answer); 62 | 63 | FILE* file = fopen(xml_path.c_str(), "w"); 64 | FAIL_ON(file == nullptr); 65 | FAIL_ON(RESPONSE_FORMATTER_MWS_XML->writeData(&answset, file) != 260); 66 | 67 | (void)fclose(file); 68 | 69 | return EXIT_SUCCESS; 70 | // NOTE: when the writeData check fails, the path below returns with `file` still open. 71 | fail: 72 | return EXIT_FAILURE; 73 | } 74 | -------------------------------------------------------------------------------- /test/src/mws/query/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 
2010-2013 KWARC Group 3 | # 4 | # This file is part of MathWebSearch. 5 | # 6 | # MathWebSearch is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # MathWebSearch is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with MathWebSearch. If not, see . 18 | # 19 | # 20 | # test/src/mws/query/CMakeLists.txt -- 21 | # 22 | 23 | # Dependencies 24 | 25 | # Includes 26 | 27 | # Flags 28 | 29 | # Sources 30 | FILE( GLOB SOURCES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cpp" "*.c") 31 | 32 | # Binaries 33 | FOREACH(source ${SOURCES}) 34 | GET_FILENAME_COMPONENT(SourceName ${source} NAME_WE) 35 | # Generate Binaries 36 | ADD_EXECUTABLE(${SourceName} ${source}) 37 | TARGET_LINK_LIBRARIES(${SourceName} 38 | mwsindex 39 | mwsquery 40 | commonutils) 41 | # Add test 42 | SET(TestName "test_${SourceName}") 43 | ADD_TEST(${TestName} ${SourceName}) 44 | ENDFOREACH(source) 45 | -------------------------------------------------------------------------------- /test/src/mws/query/engine_rep_qvars2.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (C) 2010-2013 KWARC Group 4 | 5 | This file is part of MathWebSearch. 6 | 7 | MathWebSearch is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 
11 | 12 | MathWebSearch is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with MathWebSearch. If not, see . 19 | 20 | */ 21 | /** 22 | * @file engine_rep_qvars2.cpp 23 | * 24 | */ 25 | 26 | #include 27 | #include 28 | 29 | #include "engine_tester.hpp" 30 | 31 | using namespace mws; 32 | using namespace std; 33 | 34 | /* 35 | 36 | index: f(h,h,f): (apply,4) (f,0) (h,0) (h,0) (f,0) 37 | query: P(Q,Q,P): (apply,4) (P,0) (Q,0) (Q,0) (P,0) 38 | 39 | 1 solution expected 40 | 41 | */ 42 | static int g_num_hits; 43 | // Test fixture: builds a TmpIndex holding f, h and f(h,h,f), incrementing each inserted leaf's solutions count once. 44 | struct Tester { 45 | static index::TmpIndex* create_test_MwsIndexNode() { 46 | auto data = new index::TmpIndex(); 47 | index::TmpLeafNode* leaf; 48 | 49 | leaf = data->insertData({f_tok}); 50 | leaf->solutions++; 51 | leaf = data->insertData({h_tok}); 52 | leaf->solutions++; 53 | leaf = data->insertData({apply4_tok, f_tok, h_tok, h_tok, f_tok}); 54 | leaf->solutions++; 55 | 56 | return data; 57 | } 58 | }; 59 | // Builds the 5-token query (apply,4) (P,0) (Q,0) (Q,0) (P,0); result.data is allocated with new[] and is not released anywhere in this file. 60 | static encoded_formula_t create_test_query() { 61 | encoded_formula_t result; 62 | 63 | result.data = new encoded_token_t[5]; 64 | result.size = 5; 65 | result.data[0] = apply4_tok; 66 | result.data[1] = P_tok; 67 | result.data[2] = Q_tok; 68 | result.data[3] = Q_tok; 69 | result.data[4] = P_tok; 70 | 71 | return result; 72 | } 73 | // Per-hit callback: ignores its arguments, counts the hit in g_num_hits and returns QUERY_CONTINUE. 74 | static result_cb_return_t result_callback(void* handle, const leaf_t* leaf) { 75 | UNUSED(handle); 76 | UNUSED(leaf); 77 | 78 | g_num_hits++; 79 | 80 | return QUERY_CONTINUE; 81 | } 82 | // Runs the query against the test index through query_engine_tester and expects exactly one hit. 83 | int main() { 84 | mws::index::TmpIndex* index = Tester::create_test_MwsIndexNode(); 85 | encoded_formula_t query = create_test_query(); 86 | 87 | FAIL_ON(query_engine_tester(index, &query, result_callback, nullptr) == 88 | EXIT_FAILURE); 89 | 
FAIL_ON(g_num_hits != 1); 90 | 91 | return EXIT_SUCCESS; 92 | 93 | fail: 94 | return EXIT_FAILURE; 95 | } 96 | -------------------------------------------------------------------------------- /test/src/mws/query/engine_rootqvar0.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (C) 2010-2013 KWARC Group 4 | 5 | This file is part of MathWebSearch. 6 | 7 | MathWebSearch is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | MathWebSearch is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with MathWebSearch. If not, see . 
19 | 20 | */ 21 | /** 22 | * @file engine_rootqvar0.cpp 23 | * 24 | */ 25 | 26 | #include 27 | #include 28 | 29 | #include "engine_tester.hpp" 30 | 31 | using namespace mws; 32 | using namespace std; 33 | 34 | /* 35 | 36 | index: f(h,h,t): (apply,4) (f,0) (h,0) (h,0) (t,0) 37 | query: P : (P,0) 38 | 39 | 4 solutions expected: f, h, t, f(h,h,t) 40 | 41 | */ 42 | static int g_num_hits; 43 | 44 | struct Tester { 45 | static index::TmpIndex* create_test_MwsIndexNode() { 46 | auto data = new index::TmpIndex(); 47 | index::TmpLeafNode* leaf; 48 | 49 | leaf = data->insertData({f_tok}); 50 | leaf->solutions++; 51 | leaf = data->insertData({t_tok}); 52 | leaf->solutions++; 53 | leaf = data->insertData({h_tok}); 54 | leaf->solutions++; 55 | leaf = data->insertData({apply4_tok, f_tok, h_tok, h_tok, t_tok}); 56 | leaf->solutions++; 57 | 58 | return data; 59 | } 60 | }; 61 | 62 | static encoded_formula_t create_test_query() { 63 | encoded_formula_t result; 64 | 65 | result.data = new encoded_token_t[5]; 66 | result.size = 1; 67 | result.data[0] = P_tok; 68 | 69 | return result; 70 | } 71 | 72 | static result_cb_return_t result_callback(void* handle, const leaf_t* leaf) { 73 | UNUSED(handle); 74 | UNUSED(leaf); 75 | 76 | g_num_hits++; 77 | 78 | return QUERY_CONTINUE; 79 | } 80 | 81 | int main() { 82 | mws::index::TmpIndex* index = Tester::create_test_MwsIndexNode(); 83 | encoded_formula_t query = create_test_query(); 84 | 85 | FAIL_ON(query_engine_tester(index, &query, result_callback, nullptr) == 86 | EXIT_FAILURE); 87 | FAIL_ON(g_num_hits != 4); 88 | 89 | return EXIT_SUCCESS; 90 | 91 | fail: 92 | return EXIT_FAILURE; 93 | } 94 | -------------------------------------------------------------------------------- /test/src/mws/query/schema_hashing.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (C) 2010-2015 KWARC Group 4 | 5 | This file is part of MathWebSearch. 
6 | 7 | MathWebSearch is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | MathWebSearch is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with MathWebSearch. If not, see . 19 | 20 | */ 21 | /** 22 | * @author Radu Hambasan 23 | * @date 28 Dec 2014 24 | * 25 | */ 26 | 27 | #include 28 | #include 29 | #include 30 | 31 | #include "mws/index/MeaningDictionary.hpp" 32 | using mws::index::MeaningDictionary; 33 | #include "schema_engine_tester.hpp" 34 | #include "mws/query/SchemaEngine.hpp" 35 | 36 | using namespace mws; 37 | using namespace query; 38 | using namespace std; 39 | 40 | /* 41 | 42 | query #1: f(t,t,t): (apply,4) (f,0) (t,0) (t,0) (t,0) 43 | query #2: f(t,t,t): (apply,4) (f,0) (t,0) (t,0) (t,0) 44 | query #3: f(cn,t,t): (apply,4) (f,0) (cn,0) (t,0) (t,0) 45 | query #4: cn: (cn,0) 46 | query #5: g(t,t,t): (apply,4) (g,0) (t,0) (t,0) (t,0) 47 | 48 | At depth 0, we should have 1 schema: 49 | ?x (the 0 unification) 50 | 51 | At depth 1, we should have 3 schemata: 52 | (apply,4) (f,0) 53 | (cn,0) 54 | (apply,4) (g,0) 55 | 56 | At depth 2, we should have 4 schemata. 
57 | (apply,4) (f,0) (t,0) (t,0) (t,0) 58 | (apply,4) (f,0) (t,0) (t,0) (t,0) 59 | (apply,4) (f,0) (cn,0) (t,0) (t,0) 60 | (cn,0) 61 | (apply,4) (g,0) (t,0) (t,0) (t,0) 62 | 63 | */ 64 | 65 | // Builds the five test expressions (queries #1-#5 in the comment above), in that order; #1 and #2 are identical. 66 | static vector create_test_exprs() { 67 | vector exprs( 68 | {{apply4_tok, f_tok, t_tok, t_tok, t_tok}, 69 | {apply4_tok, f_tok, t_tok, t_tok, t_tok}, 70 | {apply4_tok, f_tok, cn_tok, t_tok, t_tok}, 71 | {cn_tok}, 72 | {apply4_tok, g_tok, t_tok, t_tok, t_tok}}); 73 | return exprs; 74 | } 75 | // Test entry point: calls Tester::test_expr_hashing three times, with (1, 0), (3, 1) and (4, 2). Going by the comment above these are (expected schema count, depth) -- confirm the argument order in schema_engine_tester.hpp. 76 | int main() { 77 | vector exprs = create_test_exprs(); 78 | 79 | FAIL_ON(Tester::test_expr_hashing(exprs, 1, 0) != EXIT_SUCCESS); 80 | FAIL_ON(Tester::test_expr_hashing(exprs, 3, 1) != EXIT_SUCCESS); 81 | FAIL_ON(Tester::test_expr_hashing(exprs, 4, 2) != EXIT_SUCCESS); 82 | 83 | return EXIT_SUCCESS; 84 | 85 | fail: 86 | return EXIT_FAILURE; 87 | } 88 | -------------------------------------------------------------------------------- /third_party/common/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2010-2013 KWARC Group 3 | # 4 | # This file is part of MathWebSearch. 5 | # 6 | # MathWebSearch is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # MathWebSearch is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with MathWebSearch. If not, see . 
18 | # 19 | # 3rd party libraries 20 | 21 | # CRC32: build the bundled library and export its include dir and target name to the parent scope 22 | ADD_SUBDIRECTORY(crc32) 23 | SET(CRC32_INCLUDES "${CMAKE_CURRENT_SOURCE_DIR}/crc32" PARENT_SCOPE) 24 | SET(CRC32_LIBRARIES crc32 PARENT_SCOPE) 25 | -------------------------------------------------------------------------------- /third_party/common/crc32/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2010-2013 KWARC Group 3 | # 4 | # This file is part of MathWebSearch. 5 | # 6 | # MathWebSearch is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # MathWebSearch is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with MathWebSearch. If not, see . 18 | # 19 | # third_party/common/crc32 -- 20 | 21 | # Sources 22 | FILE( GLOB SOURCES "*.c") 23 | 24 | # Targets 25 | ADD_LIBRARY( crc32 ${SOURCES} ) 26 | -------------------------------------------------------------------------------- /third_party/common/crc32/crc32/crc32.h: -------------------------------------------------------------------------------- 1 | /*- 2 | * COPYRIGHT (C) 1986 Gary S. Brown. You may use this program, or 3 | * code or tables extracted from it, as desired without restriction. 4 | * 5 | * First, the polynomial itself and its table of feedback terms. The 6 | * polynomial is 7 | * X^32+X^26+X^23+X^22+X^16+X^12+X^11+X^10+X^8+X^7+X^5+X^4+X^2+X^1+X^0 8 | * 9 | * Note that we take it "backwards" and put the highest-order term in 10 | * the lowest-order bit. 
The X^32 term is "implied"; the LSB is the 11 | * X^31 term, etc. The X^0 term (usually shown as "+1") results in 12 | * the MSB being 1 13 | * 14 | * Note that the usual hardware shift register implementation, which 15 | * is what we're using (we're merely optimizing it by doing eight-bit 16 | * chunks at a time) shifts bits into the lowest-order term. In our 17 | * implementation, that means shifting towards the right. Why do we 18 | * do it this way? Because the calculated CRC must be transmitted in 19 | * order from highest-order term to lowest-order term. UARTs transmit 20 | * characters in order from LSB to MSB. By storing the CRC this way 21 | * we hand it to the UART in the order low-byte to high-byte; the UART 22 | * sends each low-bit to hight-bit; and the result is transmission bit 23 | * by bit from highest- to lowest-order term without requiring any bit 24 | * shuffling on our part. Reception works similarly 25 | * 26 | * The feedback terms table consists of 256, 32-bit entries. Notes 27 | * 28 | * The table can be generated at runtime if desired; code to do so 29 | * is shown later. It might not be obvious, but the feedback 30 | * terms simply represent the results of eight shift/xor opera 31 | * tions for all combinations of data and CRC register values 32 | * 33 | * The values must be right-shifted by eight bits by the "updcrc 34 | * logic; the shift must be unsigned (bring in zeroes). On some 35 | * hardware you could probably optimize the shift in assembler by 36 | * using byte-swap instructions 37 | * polynomial $edb88320 38 | * 39 | * 40 | * CRC32 code derived from work by Gary S. Brown. 
41 | */ 42 | 43 | #ifndef THIRD_PARTY_COMMON_CRC32_H 44 | #define THIRD_PARTY_COMMON_CRC32_H 45 | 46 | #include 47 | #include 48 | 49 | uint32_t crc32(uint32_t crc, const void *buf, size_t size); 50 | 51 | #endif // THIRD_PARTY_COMMON_CRC32_H 52 | -------------------------------------------------------------------------------- /third_party/crawler/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2010-2013 KWARC Group 3 | # 4 | # This file is part of MathWebSearch. 5 | # 6 | # MathWebSearch is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # MathWebSearch is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with MathWebSearch. If not, see . 18 | # 19 | # 3rd party libraries 20 | 21 | # Google URL 22 | ADD_SUBDIRECTORY(googleurl) 23 | SET(GOOGLEURL_FOUND true PARENT_SCOPE) 24 | SET(GOOGLEURL_INCLUDES 25 | "${CMAKE_CURRENT_SOURCE_DIR}/googleurl/src" PARENT_SCOPE) 26 | SET(GOOGLEURL_LIBRARIES googleurl PARENT_SCOPE) 27 | -------------------------------------------------------------------------------- /third_party/crawler/googleurl/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2010-2013 KWARC Group 3 | # 4 | # This file is part of MathWebSearch. 
5 | # 6 | # MathWebSearch is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # MathWebSearch is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with MathWebSearch. If not, see . 18 | # 19 | 20 | # Sources 21 | FILE( GLOB SOURCES "src/*.cc") 22 | 23 | # Targets 24 | ADD_LIBRARY( googleurl ${SOURCES} ) 25 | 26 | # External dependencies 27 | FIND_PACKAGE( ICU REQUIRED ) 28 | FIND_PACKAGE( Threads REQUIRED ) 29 | 30 | # disable warnings from 3rd party code 31 | SET( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -w" ) 32 | SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -w" ) 33 | 34 | # Includes 35 | TARGET_INCLUDE_DIRECTORIES( googleurl PUBLIC ${ICU_INCLUDE_DIRS} ) 36 | 37 | # Dependencies 38 | TARGET_LINK_LIBRARIES( googleurl 39 | ${CMAKE_THREAD_LIBS_INIT} 40 | ${ICU_LIBRARIES} 41 | ) 42 | -------------------------------------------------------------------------------- /third_party/crawler/googleurl/src/macros.h: -------------------------------------------------------------------------------- 1 | // Copyright 2010 Google 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // you may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
11 | // See the License for the specific language governing permissions and 12 | // limitations under the License. 13 | 14 | #ifndef BASE_MACROS_H_ 15 | #define BASE_MACROS_H_ 16 | 17 | // DISALLOW_COPY_AND_ASSIGN disallows the copy and operator= functions. 18 | // It goes in the private: declarations in a class. 19 | #define DISALLOW_COPY_AND_ASSIGN(TypeName) \ 20 | TypeName(const TypeName&); \ 21 | void operator=(const TypeName&) 22 | 23 | #endif // BASE_MACROS_H_ 24 | -------------------------------------------------------------------------------- /third_party/crawler/googleurl/src/url_common.h: -------------------------------------------------------------------------------- 1 | // Copyright 2010, Google Inc. 2 | // All rights reserved. 3 | // 4 | // Redistribution and use in source and binary forms, with or without 5 | // modification, are permitted provided that the following conditions are 6 | // met: 7 | // 8 | // * Redistributions of source code must retain the above copyright 9 | // notice, this list of conditions and the following disclaimer. 10 | // * Redistributions in binary form must reproduce the above 11 | // copyright notice, this list of conditions and the following disclaimer 12 | // in the documentation and/or other materials provided with the 13 | // distribution. 14 | // * Neither the name of Google Inc. nor the names of its 15 | // contributors may be used to endorse or promote products derived from 16 | // this software without specific prior written permission. 17 | // 18 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | // A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 22 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | 30 | #ifndef GOOGLEURL_SRC_URL_COMMON_H__ 31 | #define GOOGLEURL_SRC_URL_COMMON_H__ 32 | 33 | #if !defined(GURL_IMPLEMENTATION) 34 | #define GURL_IMPLEMENTATION 0 35 | #endif 36 | 37 | #if defined(GURL_DLL) 38 | #if defined(WIN32) 39 | #if GURL_IMPLEMENTATION 40 | #define GURL_API __declspec(dllexport) 41 | #else 42 | #define GURL_API __declspec(dllimport) 43 | #endif 44 | #else 45 | // Non-Windows DLLs. 46 | #define GURL_API __attribute__((visibility("default"))) 47 | #endif 48 | #else 49 | // Not a DLL. 50 | #define GURL_API 51 | #endif 52 | 53 | #endif // GOOGLEURL_SRC_URL_COMMON_H__ 54 | 55 | -------------------------------------------------------------------------------- /third_party/crawler/googleurl/src/url_util_internal.h: -------------------------------------------------------------------------------- 1 | // Copyright 2011, Google Inc. 2 | // All rights reserved. 3 | // 4 | // Redistribution and use in source and binary forms, with or without 5 | // modification, are permitted provided that the following conditions are 6 | // met: 7 | // 8 | // * Redistributions of source code must retain the above copyright 9 | // notice, this list of conditions and the following disclaimer. 
10 | // * Redistributions in binary form must reproduce the above 11 | // copyright notice, this list of conditions and the following disclaimer 12 | // in the documentation and/or other materials provided with the 13 | // distribution. 14 | // * Neither the name of Google Inc. nor the names of its 15 | // contributors may be used to endorse or promote products derived from 16 | // this software without specific prior written permission. 17 | // 18 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | 30 | #ifndef GOOGLEURL_SRC_URL_UTIL_INTERNAL_H__ 31 | #define GOOGLEURL_SRC_URL_UTIL_INTERNAL_H__ 32 | 33 | #include 34 | 35 | #include "string16.h" 36 | #include "url_common.h" 37 | #include "url_parse.h" 38 | 39 | namespace url_util { 40 | 41 | extern const char kFileScheme[]; 42 | extern const char kFileSystemScheme[]; 43 | extern const char kMailtoScheme[]; 44 | 45 | // Given a string and a range inside the string, compares it to the given 46 | // lower-case |compare_to| buffer. 
47 | bool CompareSchemeComponent(const char* spec, 48 | const url_parse::Component& component, 49 | const char* compare_to); 50 | bool CompareSchemeComponent(const char16* spec, 51 | const url_parse::Component& component, 52 | const char* compare_to); 53 | 54 | } // namespace url_util 55 | 56 | #endif // GOOGLEURL_SRC_URL_UTIL_INTERNAL_H__ 57 | --------------------------------------------------------------------------------