├── .bowerrc ├── .dockerignore ├── .editorconfig ├── .flake8.ini ├── .github ├── CONTRIBUTING.md ├── ISSUE_TEMPLATE.md └── PULL_REQUEST_TEMPLATE.md ├── .gitignore ├── .travis.yml ├── .yaydoc.yml ├── Dockerfile ├── LICENSE ├── Procfile ├── README.md ├── app.json ├── app ├── __init__.py ├── query_cache.py ├── scrapers │ ├── __init__.py │ ├── ask.py │ ├── baidu.py │ ├── bing.py │ ├── dailymotion.py │ ├── duckduckgo.py │ ├── exalead.py │ ├── generalized.py │ ├── google.py │ ├── mojeek.py │ ├── parsijoo.py │ ├── quora.py │ ├── twitter.py │ ├── yahoo.py │ └── youtube.py ├── server.py ├── static │ ├── css │ │ └── styles.css │ └── images │ │ ├── ask_icon.ico │ │ ├── baidu_icon.ico │ │ ├── bing_icon.ico │ │ ├── dailymotion_icon.png │ │ ├── duckduckgo_icon.png │ │ ├── exalead_icon.png │ │ ├── favicon.ico │ │ ├── forkme_right_green_007200.png │ │ ├── foss_asia.png │ │ ├── google_icon.png │ │ ├── mojeek_icon.png │ │ ├── parsijoo_icon.png │ │ ├── quora_icon.png │ │ ├── ripple.gif │ │ ├── twitter_icon.png │ │ ├── yahoo_icon.ico │ │ └── youtube_icon.png └── templates │ └── index.html ├── bandit.yml ├── bower.json ├── codecov.yml ├── docker-compose.yml ├── docs └── installation │ ├── docker.md │ ├── heroku.md │ └── local.md ├── manifest.yml ├── package-lock.json ├── package.json ├── requirements-dev.txt ├── requirements.txt ├── runtime.txt ├── scalingo.json └── test ├── __init__.py ├── test_ask.py ├── test_baidu.py ├── test_bing.py ├── test_duckduckgo.py ├── test_generalized.py ├── test_google.py ├── test_mojeek.py ├── test_parsijoo.py ├── test_quora.py ├── test_server.py ├── test_twitter.py ├── test_yahoo.py └── test_youtube.py /.bowerrc: -------------------------------------------------------------------------------- 1 | { 2 | "directory" : "app/static/bower_components" 3 | } -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | .git 2 | .dockerignore 3 | 4 | ##################### 5 | # .gitignore copied 6 | ##################### 7 | 8 | .cache/* 9 | __pycache__/* 10 | node_modules/* 11 | .coverage 12 | .idea/* 13 | *.pyc 14 | *.py.bak 15 | *.swp 16 | app/static/bower_components/ -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | charset = utf-8 5 | indent_style = space 6 | indent_size = 4 7 | end_of_line = lf 8 | insert_final_newline = true 9 | trim_trailing_whitespace = true 10 | 11 | [*.html] 12 | indent_size = 4 13 | 14 | [*.js] 15 | indent_size = 4 16 | 17 | [*.css] 18 | indent_size = 4 -------------------------------------------------------------------------------- /.flake8.ini: -------------------------------------------------------------------------------- 1 | [flake8] 2 | exclude = 3 | .git, 4 | __pycache__ 5 | -------------------------------------------------------------------------------- /.github/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ## Contributions Best Practices 2 | 3 | **Commits** 4 | * Write clear meaningful git commit messages (Do read http://chris.beams.io/posts/git-commit/) 5 | * Make sure your PR's description contains GitHub's special keyword references that automatically close the related issue when the PR is merged. 
(More info at https://github.com/blog/1506-closing-issues-via-pull-requests ) 6 | * When you make very minor changes to a PR of yours (for example, fixing a failing Travis build, some small style corrections or minor changes requested by reviewers) make sure you squash your commits afterwards so that you don't have an absurd number of commits for a very small fix. (Learn how to squash at https://davidwalsh.name/squash-commits-git ) 7 | * When you're submitting a PR for a UI-related issue, it would be really awesome if you add a screenshot of your change or a link to a deployment where it can be tested out along with your PR. It makes it very easy for the reviewers and you'll also get reviews quicker. 8 | 9 | **Code Styleguide** 10 | * Do follow the .editorconfig file regarding code style (It's mandatory). 11 | * For more information regarding the .editorconfig file, see [editorconfig](http://editorconfig.org/#download) 12 | 13 | **Feature Requests and Bug Reports** 14 | * When you file a feature request or when you are submitting a bug report to the [issue tracker](https://github.com/fossasia/query-server/issues), make sure you add steps to reproduce it, especially if that bug is some weird/rare one. 15 | 16 | **Join the development** 17 | * Before you join development, please set up the project on your local machine, run it and go through the application completely. Press on any button you can find and see where it leads to. Explore. (Don't worry ... Nothing will happen to the app or to you due to the exploring :wink: Only thing that will happen is, you'll be more familiar with what is where and might even get some cool ideas on how to improve various aspects of the app.) 18 | * If you would like to work on an issue, drop in a comment at the issue. If it is already assigned to someone, but there is no sign of any work being done, please feel free to drop in a comment so that the issue can be assigned to you if the previous assignee has dropped it entirely. 19 | 20 | Do read the [Open Source Developer Guide and Best Practices at FOSSASIA](https://blog.fossasia.org/open-source-developer-guide-and-best-practices-at-fossasia). 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 2 | **I'm submitting a ...** 3 | - [ ] bug report 4 | - [ ] feature request 5 | 6 | **Current behavior:** 7 | 8 | 9 | **Expected behavior:** 10 | 11 | 12 | **Steps to reproduce:** 13 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 2 | Fixes # 3 | 4 | #### Checklist 5 | 6 | - [ ] I have read the [Contribution & Best practices Guide](https://blog.fossasia.org/open-source-developer-guide-and-best-practices-at-fossasia) and my PR follows them. 7 | - [ ] My branch is up-to-date with the Upstream `master` branch. 
8 | - [ ] I have added necessary documentation (if appropriate) 9 | 10 | #### Changes proposed in this pull request: 11 | 12 | - 13 | - 14 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .cache/* 2 | __pycache__/* 3 | venv/* 4 | node_modules/* 5 | app/static/bower_components/* 6 | .coverage 7 | .idea/* 8 | *.pyc 9 | *.py.bak 10 | *.swp 11 | Pipfile 12 | Pipfile.lock 13 | .vscode/* 14 | 15 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | python: 4 | - "2.7" 5 | - "3.6" 6 | 7 | services: 8 | - mongodb 9 | 10 | install: 11 | - pip install codecov flake8 pytest 12 | - pip install -r requirements-dev.txt 13 | - pip install -r requirements.txt 14 | 15 | before_script: 16 | - flake8 . --count --max-complexity=16 --show-source --statistics 17 | 18 | script: 19 | - python -m app.server > /dev/null & 20 | - pytest --cov=./ 21 | - kill $(lsof -t -i:7001) 22 | 23 | after_success: 24 | - bash <(curl -s https://codecov.io/bash) 25 | -------------------------------------------------------------------------------- /.yaydoc.yml: -------------------------------------------------------------------------------- 1 | metadata: 2 | author: FOSSASIA 3 | projectname: "Query Server" 4 | version: development 5 | build: 6 | theme: 7 | name: sphinx_fossasia_theme 8 | source: . 9 | publish: 10 | ghpages: 11 | docurl: query-server.fossasia.org 12 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM node:boron 2 | MAINTAINER Afroz Ahamad 3 | 4 | RUN mkdir -p /usr/src/app 5 | WORKDIR /usr/src/app 6 | 7 | RUN apt-get update && apt-get install -y --no-install-recommends \ 8 | build-essential \ 9 | python3-dev \ 10 | libpq-dev \ 11 | libevent-dev \ 12 | libmagic-dev \ 13 | python3-pip && apt-get clean -y 14 | 15 | # copy requirements 16 | COPY package.json /usr/src/app/ 17 | COPY bower.json /usr/src/app/ 18 | COPY .bowerrc /usr/src/app 19 | COPY requirements.txt /usr/src/app/ 20 | 21 | # install requirements 22 | RUN npm install 23 | RUN npm install --global bower 24 | RUN bower --allow-root install 25 | RUN pip3 install -r requirements.txt 26 | 27 | # Bundle app source 28 | COPY . /usr/src/app 29 | 30 | EXPOSE 7001 31 | 32 | CMD [ "python3", "app/server.py" ] 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Procfile: -------------------------------------------------------------------------------- 1 | web: python app/server.py -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Query-Server 2 | 3 | [![Build Status](https://travis-ci.org/fossasia/query-server.svg?branch=master)](https://travis-ci.org/fossasia/query-server) 4 | [![Dependency Status](https://david-dm.org/fossasia/query-server.svg)](https://david-dm.org/fossasia/query-server) 5 | [![Join the chat at https://gitter.im/fossasia/query-server](https://badges.gitter.im/fossasia/query-server.svg)](https://gitter.im/fossasia/query-server?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) 6 | [![codecov](https://codecov.io/gh/fossasia/query-server/branch/master/graph/badge.svg)](https://codecov.io/gh/fossasia/query-server) 7 | 8 | The query server can be used to search a keyword/phrase on a search engine (Google, Yahoo, Bing, Ask, DuckDuckGo, Baidu, Exalead, Quora, Parsijoo, Dailymotion, Mojeek and YouTube) and get the results as `json`, `xml` or `csv`. The tool also stores the searched query string in a MongoDB database for analytical purposes. 
9 | 10 | [![Deploy to Docker Cloud](https://files.cloud.docker.com/images/deploy-to-dockercloud.svg)](https://cloud.docker.com/stack/deploy/?repo=https://github.com/fossasia/query-server) [![Deploy](https://www.herokucdn.com/deploy/button.svg)](https://heroku.com/deploy?template=https://github.com/fossasia/query-server) [![Deploy on Scalingo](https://cdn.scalingo.com/deploy/button.svg)](https://my.scalingo.com/deploy?source=https://github.com/fossasia/query-server#master) [![Deploy to Bluemix](https://bluemix.net/deploy/button.png)](https://bluemix.net/deploy?repository=https://github.com/fossasia/query-server&branch=master) 11 | 12 | ## Table of Contents 13 | 14 | - [Test Deployment](#test-deployment) 15 | - [API](#api) 16 | - [Error Codes](#error-codes) 17 | - [Dependencies](#dependencies) 18 | - [Installation](#installation) 19 | - [Contribute](#contribute) 20 | 21 | ## Test Deployment 22 | 23 | A test deployment of the project is available here: https://query-server.herokuapp.com 24 | 25 | ## API 26 | 27 | The API provided by query-server is as follows: 28 | 29 | ` GET /api/v1/search/<search-engine>?query=query&format=format ` 30 | 31 | > *search-engine* : [`google`, `ask`, `bing`, `duckduckgo`, `yahoo`, `baidu`, `exalead`, `quora`, `youtube`, `parsijoo`, `mojeek`, `dailymotion`] 32 | 33 | > *query* : query can be any string 34 | 35 | > *format* : [`json`, `xml`, `csv`] 36 | 37 | A sample query : `/api/v1/search/bing?query=fossasia&format=xml&num=10` (the optional `num` parameter sets how many results are returned and defaults to 10) 38 | 39 | ## Error Codes 40 | 404 Not Found : Incorrect Search Engine, Zero Response 41 | 400 Bad Request : query and/or format is not in the correct format 42 | 500 Internal Server Error : Server Error from Search Engine 43 | 44 | ## Dependencies 45 | 46 | * [MongoDB](https://www.mongodb.com) 47 | * [Python 2.7](https://python.org) 48 | * [BeautifulSoup4](https://www.crummy.com/software/BeautifulSoup/bs4/doc) 49 | * [dicttoxml](https://github.com/quandyfactory/dicttoxml) 50 | * [Flask](http://flask.pocoo.org) 51 | * [pymongo](https://api.mongodb.com/python/current) 52 | * [requests](http://docs.python-requests.org) 53 | * [Node.js](https://nodejs.org/en) 54 | * [bower.io](https://bower.io) 55 | 56 | ## Installation 57 | 58 | 1. [Local Installation](/docs/installation/local.md) 59 | 60 | 2. [Deployment on Heroku](/docs/installation/heroku.md) 61 | 62 | 3. [Deployment with Docker](/docs/installation/docker.md) 63 | 64 | 65 | ## Contribute 66 | 67 | Found an issue? Post it in the [issue tracker](https://github.com/fossasia/query-server/issues). For pull requests, please read [Open Source Developer Guide and Best Practices at FOSSASIA](https://blog.fossasia.org/open-source-developer-guide-and-best-practices-at-fossasia/). 68 | 69 | ## License 70 | 71 | This project is currently licensed under the Apache License version 2.0. A copy of `LICENSE` should be present along with the source code. To obtain the software under a different license, please contact [FOSSASIA](http://blog.fossasia.org/contact/). 
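## Example Request

A minimal sketch of calling the API from Python (assuming a locally running instance on the default port `7001`; the engine, query and parameter values here are placeholders you can swap freely):

```python
import requests

# Ask the local query-server for 5 Bing results about "fossasia" as JSON.
response = requests.get(
    "http://localhost:7001/api/v1/search/bing",
    params={"query": "fossasia", "format": "json", "num": 5},
)
response.raise_for_status()

for result in response.json():
    print(result["title"], "->", result["link"])
```

For this request each result is a dictionary with a `title`, a `link` and usually a `desc` field; other engines and query types may return slightly different keys.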
72 | 73 | -------------------------------------------------------------------------------- /app.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "query-server", 3 | "description": "Query server that stores a query string on a server.", 4 | "repository": "https://github.com/fossasia/query-server/", 5 | "logo": "http://labs.fossasia.org/images/fossasia.png", 6 | "keywords": [ 7 | "query-server", 8 | "fossasia" 9 | ], 10 | "addons": [ 11 | { 12 | "plan": "mongolab:sandbox", 13 | "as": "MONGO" 14 | } 15 | ], 16 | "buildpacks": [ 17 | { 18 | "url": "heroku/python" 19 | }, 20 | { 21 | "url": "heroku/nodejs" 22 | } 23 | ], 24 | "scripts": { 25 | "postinstall": "bower install" 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /app/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fossasia/query-server/51c98716e1ec9cde6023a6b3ac2eb3081daa9e82/app/__init__.py -------------------------------------------------------------------------------- /app/query_cache.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | query_cache.py -- Implements a caching system for the query server based on MongoDB 5 | 6 | Before sending a query to a remote search engine, use lookup() to see if results 7 | from that same search engine and query are already in the cache. If so, then 8 | print a cache hit message and return the cached results. If not, then use store() 9 | to write the search engine, query, query results, and a creation datetime into 10 | the cache. MongoDB will use the datetime to automatically delete outdated 11 | query results. 12 | 13 | Ideas for improvement: 14 | * Add a lookup_count to see how often the cache actually saves us time. 
15 | """ 16 | 17 | import datetime as dt 18 | import os 19 | 20 | from pymongo import DESCENDING, MongoClient 21 | from pymongo.errors import OperationFailure 22 | 23 | client = MongoClient(os.environ.get('MONGO_URI', 'mongodb://localhost:27017/')) 24 | db = client['query-server-v2'] 25 | db = db['queries'] # Automatically delete records that are older than one day 26 | try: 27 | db.create_index([('createdAt', DESCENDING)], 28 | expireAfterSeconds=60 * 60 * 24) 29 | except OperationFailure: 30 | pass # Database index already exists 31 | 32 | 33 | def lookup(url): 34 | """return search result if the URL is in the db or None on a cache miss.""" 35 | data = db.find_one({'url': url}) or {} 36 | return data.get('links', None) 37 | 38 | 39 | def store(url, links): 40 | """write the URL, the links, and a UTC timestamp into the database.""" 41 | db.delete_many({'url': url}) # remove all records for this URL 42 | db.insert({'url': url, 'links': links, 'createdAt': dt.datetime.utcnow()}) 43 | 44 | 45 | if __name__ == '__main__': 46 | url = 'test_url' 47 | print(lookup(url)) 48 | store(url, 'a b c d e'.split()) 49 | print(lookup(url)) 50 | -------------------------------------------------------------------------------- /app/scrapers/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from .ask import Ask 4 | from .baidu import Baidu 5 | from .bing import Bing 6 | from .dailymotion import DailyMotion 7 | from .duckduckgo import DuckDuckGo 8 | from .exalead import ExaLead 9 | from .google import Google 10 | from .mojeek import Mojeek 11 | from .parsijoo import Parsijoo 12 | from .quora import Quora 13 | from .twitter import Twitter 14 | from .yahoo import Yahoo 15 | from .youtube import Youtube 16 | 17 | scrapers = { 18 | 'ask': Ask(), 19 | 'baidu': Baidu(), 20 | 'bing': Bing(), 21 | 'dailymotion': DailyMotion(), 22 | 'duckduckgo': DuckDuckGo(), 23 | 'exalead': ExaLead(), 24 | 'google': Google(), 25 | 'mojeek': Mojeek(), 26 | 'parsijoo': Parsijoo(), 27 | 'quora': Quora(), 28 | 'twitter': Twitter(), 29 | 'yahoo': Yahoo(), 30 | 'youtube': Youtube() 31 | } 32 | 33 | 34 | def small_test(): 35 | assert isinstance(scrapers['google'].search('fossasia', 1), list) 36 | 37 | 38 | def feed_gen(query, engine, count=10, qtype=''): 39 | engine = engine.lower() 40 | # provide temporary backwards compatibility for old names 41 | old_names = {'ubaidu': 'baidu', 42 | 'vdailymotion': 'dailymotion', 43 | 'tyoutube': 'youtube'} 44 | engine = old_names.get(engine, engine) 45 | if engine in ('quora', 'youtube'): 46 | urls = scrapers[engine].search_without_count(query) 47 | else: 48 | urls = scrapers[engine].search(query, count, qtype) 49 | return urls 50 | -------------------------------------------------------------------------------- /app/scrapers/ask.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from .generalized import Scraper 3 | 4 | 5 | class Ask(Scraper): 6 | """Scrapper class for Ask""" 7 | def __init__(self): 8 | Scraper.__init__(self) 9 | self.url = 'http://ask.com/web' 10 | self.videoURL = 'https://www.ask.com/youtube' 11 | self.defaultStart = 1 12 | self.startKey = 'page' 13 | self.name = 'ask' 14 | 15 | @staticmethod 16 | def next_start(current_start, prev_results): 17 | return current_start + 1 18 | 19 | @staticmethod 20 | def parse_response(soup): 21 | """ Parse the response and return set of urls 22 | Returns: urls (list) 23 | 
[[Tile1,url1], [Title2, url2],..] 24 | """ 25 | urls = [] 26 | if soup.find('div', class_='PartialSearchResults-noresults'): 27 | return None 28 | for div in soup.findAll('div', class_='PartialSearchResults-item'): 29 | title = div.div.a.text 30 | url = div.div.a['href'] 31 | try: 32 | p = div.find('p', class_='PartialSearchResults-item-abstract') 33 | desc = p.text.replace('\n', '') 34 | urls.append({'title': title, 'link': url, 'desc': desc}) 35 | except Exception: 36 | urls.append({'title': title, 'link': url}) 37 | print('Ask parsed: ' + str(urls)) 38 | return urls 39 | 40 | @staticmethod 41 | def parse_video_response(soup): 42 | """ Parse response and returns the urls 43 | 44 | Returns: urls (list) 45 | [[Tile1, url1], [Title2, url2], ...] 46 | """ 47 | urls = [] 48 | for div in soup.findAll('div', attrs={'class': 'v-info'}): 49 | title = div.div.find('a').getText() 50 | url = 'https' + div.div.a.get('href') 51 | desc = div.find('div', attrs={'class': 'desc'}).getText() 52 | urls.append({ 53 | 'title': title, 54 | 'link': url, 55 | 'desc': desc 56 | }) 57 | 58 | print('Ask parsed: ' + str(urls)) 59 | 60 | return urls 61 | -------------------------------------------------------------------------------- /app/scrapers/baidu.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from .generalized import Scraper 3 | 4 | 5 | class Baidu(Scraper): 6 | """Scrapper class for Baidu""" 7 | 8 | def __init__(self): 9 | Scraper.__init__(self) 10 | self.url = 'https://www.baidu.com/s' 11 | self.newsURL = 'http://news.baidu.com/ns' 12 | self.defaultStart = 0 13 | self.queryKey = 'word' 14 | self.startKey = 'pn' 15 | self.name = 'baidu' 16 | 17 | @staticmethod 18 | def parse_response(soup): 19 | """ Parse the response and return set of urls 20 | Returns: urls (list) 21 | [[Tile1,url1], [Title2, url2],..] 22 | """ 23 | urls = [] 24 | for div in soup.findAll('div', {'class': 'result'}): 25 | title = div.h3.a.getText() 26 | url = div.h3.a['href'] 27 | urls.append({'title': title, 'link': url}) 28 | 29 | print('Baidu parsed: ' + str(urls)) 30 | 31 | return urls 32 | 33 | @staticmethod 34 | def parse_news_response(soup): 35 | """ Parse the response and return set of urls 36 | Returns: urls (list) 37 | [[Tile1,url1], [Title2, url2],..] 38 | """ 39 | urls = [] 40 | for h3 in soup.findAll('h3', {'class': 'c-title'}): 41 | title = h3.a.getText() 42 | link = h3.a.get('href') 43 | urls.append({'title': title, 'link': link}) 44 | 45 | print('Baidu parsed: ' + str(urls)) 46 | 47 | return urls 48 | -------------------------------------------------------------------------------- /app/scrapers/bing.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from .generalized import Scraper 3 | 4 | 5 | class Bing(Scraper): 6 | """Scrapper class for Bing""" 7 | 8 | def __init__(self): 9 | Scraper.__init__(self) 10 | self.url = 'http://www.bing.com/search' 11 | self.videoURL = 'https://www.bing.com/videos/search' 12 | self.imageURL = 'https://www.bing.com/images/search' 13 | self.newsURL = 'https://www.bing.com/news/search' 14 | self.defaultStart = 1 15 | self.startKey = 'first' 16 | self.name = 'bing' 17 | 18 | @staticmethod 19 | def parse_response(soup): 20 | """ Parses the reponse and return set of urls 21 | Returns: urls (list) 22 | [[Tile1,url1], [Title2, url2],..] 
23 | """ 24 | urls = [] 25 | for li in soup.findAll('li', {'class': 'b_algo'}): 26 | title = li.h2.text.replace('\n', '').replace(' ', '') 27 | url = li.h2.a['href'] 28 | desc = li.find('p').text 29 | url_entry = {'title': title, 30 | 'link': url, 31 | 'desc': desc} 32 | urls.append(url_entry) 33 | 34 | print('Bing parsed: ' + str(urls)) 35 | 36 | return urls 37 | 38 | @staticmethod 39 | def parse_video_response(soup): 40 | """ Parse response and returns the urls 41 | 42 | Returns: urls (list) 43 | [[Tile1, url1], [Title2, url2], ...] 44 | """ 45 | urls = [] 46 | for a in soup.findAll('a', attrs={'class': 'mc_vtvc_link'}): 47 | title = a.get('aria-label').split(' Duration')[0] 48 | url = 'https://www.bing.com' + a.get('href') 49 | urls.append({ 50 | 'title': title, 51 | 'link': url 52 | }) 53 | 54 | print('Bing parsed: ' + str(urls)) 55 | 56 | return urls 57 | 58 | @staticmethod 59 | def parse_image_response(soup): 60 | """ Parse response and returns the urls 61 | 62 | Returns: urls (list) 63 | [[url1], [url2], ...] 64 | """ 65 | urls = [] 66 | for a in soup.findAll('a', attrs={'class': 'iusc'}): 67 | url = 'https://www.bing.com' + a.get('href') 68 | urls.append({ 69 | 'link': url 70 | }) 71 | 72 | print('Bing parsed: ' + str(urls)) 73 | 74 | return urls 75 | 76 | @staticmethod 77 | def parse_news_response(soup): 78 | """ Parses the reponse and return set of urls 79 | Returns: urls (list) 80 | [[Tile1,url1], [Title2, url2],..] 81 | """ 82 | urls = [] 83 | for div in soup.findAll('div', {'class': 't_s'}): 84 | link = div.find('a', {'class': 'title'}) 85 | url = link['href'] 86 | title = link.getText() 87 | title = title.replace('\n', '').replace(' ', '') 88 | desc = div.find('div', {'class': 'snippet'}).getText() 89 | desc = desc.replace('\n', '').replace(' ', '') 90 | url_entry = {'title': title, 91 | 'link': url, 92 | 'desc': desc} 93 | urls.append(url_entry) 94 | 95 | print('Bing parsed: ' + str(urls)) 96 | 97 | return urls 98 | -------------------------------------------------------------------------------- /app/scrapers/dailymotion.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from .generalized import Scraper 3 | import json 4 | 5 | 6 | class DailyMotion(Scraper): 7 | """Scraper class for DailyMotion""" 8 | 9 | def __init__(self): 10 | Scraper.__init__(self) 11 | self.url = 'https://api.dailymotion.com/videos/' 12 | self.queryKey = 'search' 13 | self.startKey = 'page' 14 | self.defaultStart = 1 15 | self.name = 'dailymotion' 16 | 17 | @staticmethod 18 | def parse_response(soup): 19 | """ Parse the response and return set of urls 20 | Returns: urls (list) 21 | [[Tile1,url1], [Title2, url2],..] 
22 | """ 23 | urls = [] 24 | 25 | video_list = json.loads(str(soup))['list'] 26 | for item in video_list: 27 | title = item['title'] 28 | link = 'https://www.dailymotion.com/video/' + str(item['id']) 29 | urls.append({'title': title, 'link': link}) 30 | 31 | print('Dailymotion parsed: ' + str(urls)) 32 | 33 | return urls 34 | -------------------------------------------------------------------------------- /app/scrapers/duckduckgo.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from .generalized import Scraper 3 | 4 | 5 | class DuckDuckGo(Scraper): 6 | """Scrapper class for DuckDuckGo""" 7 | 8 | def __init__(self): 9 | Scraper.__init__(self) 10 | self.url = 'https://duckduckgo.com/html' 11 | self.defaultStart = 0 12 | self.startKey = 's' 13 | self.name = 'duckduckgo' 14 | 15 | @staticmethod 16 | def parse_response(soup): 17 | """ Parse the response and return set of urls 18 | Returns: urls (list) 19 | [[Tile1,url1], [Title2, url2],..] 20 | """ 21 | urls = [] 22 | for links in soup.findAll('a', {'class': 'result__a'}): 23 | urls.append({'title': links.getText(), 24 | 'link': links.get('href')}) 25 | 26 | print('DuckDuckGo parsed: ' + str(urls)) 27 | 28 | return urls 29 | -------------------------------------------------------------------------------- /app/scrapers/exalead.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from .generalized import Scraper 3 | 4 | 5 | class ExaLead(Scraper): 6 | """Scraper class for ExaLead""" 7 | 8 | def __init__(self): 9 | Scraper.__init__(self) 10 | self.url = 'https://www.exalead.com/search/web/results/' 11 | self.defaultStart = 0 12 | self.startKey = 'start_index' 13 | self.name = 'exalead' 14 | 15 | @staticmethod 16 | def parse_response(soup): 17 | """ Parse the response and return set of urls 18 | Returns: urls (list) 19 | [[Tile1,url1], [Title2, url2],..] 
20 | """ 21 | urls = [] 22 | for a in soup.findAll('a', {'class': 'title'}): 23 | urls.append({ 24 | 'title': a.getText(), 25 | 'link': a.get('href') 26 | }) 27 | print('Exalead parsed: ' + str(urls)) 28 | return urls 29 | -------------------------------------------------------------------------------- /app/scrapers/generalized.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import requests 3 | from bs4 import BeautifulSoup 4 | 5 | VID_SCRAPERS = ('ask', 'bing', 'parsijoo', 'yahoo') 6 | ISCH_SCRAPERS = ('bing', 'parsijoo', 'yahoo') 7 | NEWS_SCRAPERS = ('baidu', 'bing', 'parsijoo', 'mojeek') 8 | 9 | 10 | class Scraper: 11 | """Generalized scraper""" 12 | url = '' 13 | startKey = '' 14 | queryKey = 'q' 15 | defaultStart = 0 16 | qtype = '' 17 | headers = { 18 | 'User-Agent': ( 19 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) ' 20 | 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 ' 21 | 'Safari/537.36' 22 | ) 23 | } 24 | 25 | def __init__(self): 26 | self.name = "general" 27 | pass 28 | 29 | def get_page(self, query, startIndex=0, qtype=''): 30 | """ Fetch the google search results page 31 | Returns : Results Page 32 | """ 33 | url = self.url 34 | if qtype == 'vid' and self.name in VID_SCRAPERS: 35 | url = self.videoURL 36 | elif qtype == 'isch' and self.name in ISCH_SCRAPERS: 37 | url = self.imageURL 38 | elif qtype == 'news' and self.name in NEWS_SCRAPERS: 39 | url = self.newsURL 40 | payload = {self.queryKey: query, self.startKey: startIndex, 41 | self.qtype: qtype} 42 | if self.name == 'mojeek' and qtype == 'news': 43 | payload['fmt'] = 'news' 44 | response = requests.get(url, headers=self.headers, params=payload) 45 | print(response.url) 46 | return response 47 | 48 | @staticmethod 49 | def parse_response(soup): 50 | raise NotImplementedError 51 | 52 | @staticmethod 53 | def parse_video_response(soup): 54 | raise NotImplementedError 55 | 56 | @staticmethod 57 | def next_start(current_start, prev_results): 58 | return current_start + len(prev_results) 59 | 60 | def search(self, query, num_results, qtype=''): 61 | """ 62 | Search for the query and return set of urls 63 | Returns: list 64 | """ 65 | urls = [] 66 | current_start = self.defaultStart 67 | 68 | while (len(urls) < num_results): 69 | response = self.get_page(query, current_start, qtype) 70 | soup = BeautifulSoup(response.text, 'html.parser') 71 | new_results = self.call_appropriate_parser(qtype, soup) 72 | if new_results is None: 73 | break 74 | urls.extend(new_results) 75 | current_start = self.next_start(current_start, new_results) 76 | return urls[: num_results] 77 | 78 | def call_appropriate_parser(self, qtype, soup): 79 | new_results = '' 80 | if qtype == 'vid' and self.name in VID_SCRAPERS: 81 | new_results = self.parse_video_response(soup) 82 | elif qtype == 'isch' and self.name in ISCH_SCRAPERS: 83 | new_results = self.parse_image_response(soup) 84 | elif qtype == 'news' and self.name in NEWS_SCRAPERS: 85 | new_results = self.parse_news_response(soup) 86 | else: 87 | new_results = self.parse_response(soup) 88 | return new_results 89 | 90 | def search_without_count(self, query): 91 | """ 92 | Search for the query and return set of urls 93 | Returns: list 94 | """ 95 | urls = [] 96 | payload = {self.queryKey: query} 97 | response = requests.get(self.url, headers=self.headers, params=payload) 98 | soup = BeautifulSoup(response.text, 'html.parser') 99 | urls = self.parse_response(soup) 100 | return urls 101 | 
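# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): a minimal example of
# how a new engine plugs into the Scraper base class above. Everything below
# is hypothetical -- the engine name, URL and the 'result-link' CSS class are
# placeholders, not a real service. A subclass only sets url/startKey/name and
# implements parse_response(); Scraper.search() then handles pagination via
# get_page() and next_start(). To expose the engine through the HTTP API it
# would also be registered in the `scrapers` dict in app/scrapers/__init__.py.
class ExampleEngine(Scraper):
    """Scraper class for a hypothetical example engine (illustration only)"""

    def __init__(self):
        Scraper.__init__(self)
        self.url = 'https://searx.example.org/search'  # hypothetical endpoint
        self.defaultStart = 0
        self.startKey = 'start'
        self.name = 'example'

    @staticmethod
    def parse_response(soup):
        """Map the result page to the [{'title': ..., 'link': ...}, ...] shape
        the rest of the app expects; returning None stops pagination."""
        urls = []
        for a in soup.findAll('a', {'class': 'result-link'}):  # hypothetical class
            urls.append({'title': a.getText(), 'link': a.get('href')})
        return urls or None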
-------------------------------------------------------------------------------- /app/scrapers/google.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from .generalized import Scraper 3 | 4 | 5 | class Google(Scraper): 6 | """Scrapper class for Google""" 7 | 8 | def __init__(self): 9 | Scraper.__init__(self) 10 | self.url = 'https://www.google.com/search' 11 | self.defaultStart = 0 12 | self.startKey = 'start' 13 | self.qtype = 'tbm' 14 | self.name = 'google' 15 | 16 | @staticmethod 17 | def next_start(current_start, prev_results): 18 | return current_start + len(prev_results) 19 | 20 | @staticmethod 21 | def parse_response(soup): 22 | """ 23 | Parses the response and returns set of urls 24 | Returns: urls (list) 25 | [[Tile1,url1], [Title2, url2],..] 26 | """ 27 | urls = [] 28 | for h3 in soup.findAll('h3', {'class': 'r'}): 29 | links = h3.find('a') 30 | urls.append({'title': links.getText(), 'link': links.get('href')}) 31 | 32 | print('Google parsed: ' + str(urls)) 33 | 34 | return urls 35 | -------------------------------------------------------------------------------- /app/scrapers/mojeek.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from .generalized import Scraper 3 | 4 | 5 | class Mojeek(Scraper): 6 | """Scraper class for Mojeek""" 7 | 8 | def __init__(self): 9 | Scraper.__init__(self) 10 | self.url = 'https://www.mojeek.co.uk/search' 11 | self.newsURL = 'https://www.mojeek.co.uk/search' 12 | self.defaultStart = 1 13 | self.startKey = 's' 14 | self.name = 'mojeek' 15 | 16 | @staticmethod 17 | def parse_response(soup): 18 | """ Parse the response and return set of urls 19 | Returns: urls (list) 20 | [[Tile1,url1], [Title2, url2],..] 21 | """ 22 | urls = [] 23 | for a in soup.findAll('a', {'class': 'ob'}): 24 | title = a.getText() 25 | url = a.get('href') 26 | urls.append({'title': title, 'link': url}) 27 | 28 | print('Mojeek parsed: ' + str(urls)) 29 | 30 | return urls 31 | 32 | @staticmethod 33 | def parse_news_response(soup): 34 | """ Parse response and returns the urls 35 | 36 | Returns: urls (list) 37 | [[url1], [url2], ...] 38 | """ 39 | urls = [] 40 | for a in soup.findAll('a', attrs={'class': 'ob'}): 41 | title = a.getText() 42 | url = a.get('href') 43 | urls.append({ 44 | 'title': title, 45 | 'link': url 46 | }) 47 | 48 | print('Mojeek parsed: ' + str(urls)) 49 | 50 | return urls 51 | -------------------------------------------------------------------------------- /app/scrapers/parsijoo.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from .generalized import Scraper 3 | try: 4 | from urllib.parse import unquote # Python 3 5 | except ImportError: 6 | from urllib import unquote # Python 2 7 | 8 | 9 | class Parsijoo(Scraper): 10 | """Scraper class for Parsijoo""" 11 | 12 | def __init__(self): 13 | Scraper.__init__(self) 14 | self.url = 'https://parsijoo.ir/web' 15 | self.imageURL = 'https://image.parsijoo.ir/image' 16 | self.videoURL = 'https://video.parsijoo.ir/video' 17 | self.newsURL = 'http://khabar.parsijoo.ir/search/' 18 | self.defaultStart = 0 19 | self.newsStart = 1 20 | self.startKey = 'co' 21 | self.name = 'parsijoo' 22 | 23 | @staticmethod 24 | def parse_response(soup): 25 | """ Parse the response and return set of urls 26 | Returns: urls (list) 27 | [[Tile1,url1], [Title2, url2],..] 
28 | """ 29 | urls = [] 30 | for div in soup.findAll('div', {'class': 'result'}): 31 | result_title = div.find('span', {'class': 'result-title'}) 32 | title = result_title.getText()[23:-1] 33 | link = result_title.find('a').get('href') 34 | desc = div.find('span', {'class': 'result-desc'}).getText()[35:-1] 35 | urls.append({'title': title, 'link': link, 'desc': desc}) 36 | 37 | print('Parsijoo parsed: ' + str(urls)) 38 | 39 | return urls 40 | 41 | @staticmethod 42 | def parse_video_response(soup): 43 | """ Parse response and returns the urls 44 | 45 | Returns: urls (list) 46 | [[Tile1, url1], [Title2, url2], ...] 47 | """ 48 | urls = [] 49 | for a in soup.findAll('a', attrs={'class': 'over-page'}): 50 | title = a.get('title') 51 | url = 'https://video.parsijoo.ir' + a.get('href') 52 | urls.append({ 53 | 'title': title, 54 | 'link': url 55 | }) 56 | 57 | print('Parsijoo parsed: ' + str(urls)) 58 | 59 | return urls 60 | 61 | @staticmethod 62 | def parse_image_response(soup): 63 | """ Parse response and returns the urls 64 | 65 | Returns: urls (list) 66 | [[url1], [url2], ...] 67 | """ 68 | urls = [] 69 | for div in soup.find_all('div', class_='image-container overflow'): 70 | a = div.find('a') 71 | url = 'https://image.parsijoo.ir' + a.get('href') 72 | urls.append({ 73 | 'link': url 74 | }) 75 | 76 | print('Parsijoo parsed: ' + str(urls)) 77 | 78 | return urls 79 | 80 | @staticmethod 81 | def parse_news_response(soup): 82 | """ Parse the response and return set of urls 83 | Returns: urls (list) 84 | [[Tile1,url1], [Title2, url2],..] 85 | """ 86 | urls = [] 87 | for div in soup.findAll('div', {'class': 'news-title-link'}): 88 | title = div.a.getText() 89 | link = unquote(div.a.get('href')) 90 | urls.append({'title': title, 'link': link}) 91 | 92 | print('Parsijoo parsed: ' + str(urls)) 93 | 94 | return urls 95 | -------------------------------------------------------------------------------- /app/scrapers/quora.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from .generalized import Scraper 3 | 4 | 5 | class Quora(Scraper): 6 | """Scrapper class for Quora""" 7 | 8 | def __init__(self): 9 | Scraper.__init__(self) 10 | self.url = 'https://www.quora.com/search' 11 | self.name = 'quora' 12 | 13 | @staticmethod 14 | def parse_response(soup): 15 | """ Parse the response and return set of urls 16 | Returns: urls (list) 17 | [[Tile1,url1], [Title2, url2],..] 18 | """ 19 | urls = [] 20 | for a in soup.findAll('a', {'class': 'question_link'}): 21 | link = 'https://www.quora.com' + str(a.get('href')) 22 | urls.append({'title': a.getText(), 'link': link}) 23 | 24 | print('Quora parsed: ' + str(urls)) 25 | 26 | return urls 27 | -------------------------------------------------------------------------------- /app/scrapers/twitter.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from .generalized import Scraper 3 | 4 | 5 | class Twitter(Scraper): 6 | """This scraper takes a query and a count and returns the results of 7 | a Twitter search which is executed via the Loklak API""" 8 | 9 | def __init__(self): 10 | Scraper.__init__(self) 11 | self.loklakURL = 'http://api.loklak.org/api/search.json?q=' 12 | 13 | def search(self, query, num_results, qtype=''): 14 | """ Makes a GET request to Loklak API and returns the URLs 15 | Returns: urls (list) 16 | [[Title1,url1], [Title2, url2],..] 
17 | """ 18 | encodedQuery = requests.utils.quote(query, safe='') 19 | url = self.loklakURL+encodedQuery 20 | 21 | responses = requests.get(url).json() 22 | 23 | tweets = [] 24 | for response in responses['statuses']: 25 | tweets.append({'link': response['link'], 'text': response['text']}) 26 | 27 | print('Twitter parsed: ' + str(tweets)) 28 | 29 | return tweets[:num_results] 30 | -------------------------------------------------------------------------------- /app/scrapers/yahoo.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from .generalized import Scraper 3 | import re 4 | try: 5 | from urllib.parse import unquote # Python 3 6 | except ImportError: 7 | from urllib import unquote # Python 2 8 | 9 | 10 | class Yahoo(Scraper): 11 | """Scrapper class for Yahoo""" 12 | 13 | def __init__(self): 14 | Scraper.__init__(self) 15 | self.url = 'https://search.yahoo.com/search' 16 | self.videoURL = 'https://video.search.yahoo.com/search/video' 17 | self.imageURL = 'https://images.search.yahoo.com/search/images' 18 | self.newsURL = 'https://news.search.yahoo.com/search' 19 | self.defaultStart = 1 20 | self.startKey = 'b' 21 | self.name = 'yahoo' 22 | 23 | @staticmethod 24 | def parse_response(soup): 25 | """ Parse response and returns the urls 26 | 27 | Returns: urls (list) 28 | [[Tile1, url1], [Title2, url2], ...] 29 | """ 30 | urls = [] 31 | for h in soup.findAll('h3', attrs={'class': 'title'}): 32 | t = h.findAll('a', attrs={'class': ' ac-algo fz-l ac-21th lh-24'}) 33 | for y in t: 34 | r = y.get('href') 35 | f = r.split('RU=') 36 | e = f[-1].split('/RK=2') 37 | u = unquote(e[0]) 38 | urls.append({ 39 | 'title': y.getText(), 40 | 'link': u 41 | }) 42 | 43 | print('Yahoo parsed: ' + str(urls)) 44 | 45 | return urls 46 | 47 | @staticmethod 48 | def parse_video_response(soup): 49 | """ Parse response and returns the urls 50 | 51 | Returns: urls (list) 52 | [[Tile1, url1], [Title2, url2], ...] 53 | """ 54 | urls = [] 55 | for h in soup.findAll('li', attrs={'class': 'vr vres'}): 56 | t = h.find('a', attrs={'class': 'ng'}) 57 | r = t.get('data-rurl') 58 | titleDiv = t.find('div', attrs={'class': 'v-meta bx-bb'}) 59 | title = titleDiv.find('h3').getText() 60 | urls.append({ 61 | 'title': title, 62 | 'link': r 63 | }) 64 | 65 | print('Yahoo parsed: ' + str(urls)) 66 | 67 | return urls 68 | 69 | @staticmethod 70 | def parse_image_response(soup): 71 | """ Parse response and returns the urls 72 | 73 | Returns: urls (list) 74 | [[Tile1, url1], [Title2, url2], ...] 75 | """ 76 | urls = [] 77 | for h in soup.findAll('li', attrs={'class': 'ld'}): 78 | t = h.find('a') 79 | r = t.get('aria-label') 80 | cleanr = re.compile('<.*?>') 81 | r = re.sub(cleanr, '', r) 82 | cleanl = re.compile('&#[\d]+(;)') 83 | r = re.sub(cleanl, '\'', r) 84 | img = t.find('img', attrs={'class': 'process'}) 85 | url = img.get('data-src') 86 | urls.append({ 87 | 'title': r, 88 | 'link': url 89 | }) 90 | 91 | print('Yahoo parsed: ' + str(urls)) 92 | 93 | return urls 94 | 95 | @staticmethod 96 | def parse_news_response(soup): 97 | """ Parse response and returns the urls 98 | Returns: urls (list) 99 | [[Tile1, url1], [Title2, url2], ...] 
100 | """ 101 | urls = [] 102 | for div in soup.findAll('div', attrs={'class': 'dd algo NewsArticle'}): 103 | link = div.find('a', attrs={'class': 'fz-m'}) 104 | descDiv = div.find('div', attrs={'class': 'compText'}) 105 | unparsedURL = link.get('href') 106 | urlSearch = re.search('/RU=(.*?)/', unparsedURL, re.I) 107 | url = unquote(urlSearch.group(1)) 108 | urls.append({ 109 | 'title': link.getText(), 110 | 'link': url, 111 | 'desc': descDiv.find('p').getText() 112 | }) 113 | 114 | print('Yahoo parsed: ' + str(urls)) 115 | 116 | return urls 117 | -------------------------------------------------------------------------------- /app/scrapers/youtube.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from .generalized import Scraper 3 | 4 | 5 | class Youtube(Scraper): 6 | """Scraper class for Youtube""" 7 | 8 | def __init__(self): 9 | Scraper.__init__(self) 10 | self.url = 'https://www.youtube.com/results' 11 | self.queryKey = 'search_query' 12 | self.name = 'youtube' 13 | 14 | @staticmethod 15 | def parse_response(soup): 16 | """ Parse the response and return list of urls 17 | Returns: urls (list) 18 | [[Title1,url1], [Title2, url2],..] 19 | """ 20 | urls = [] 21 | for a in soup.findAll('a'): 22 | if a.get('href').startswith('/watch?'): 23 | link = 'https://www.youtube.com' + str(a.get('href')) 24 | if not a.getText().startswith('\n\n'): 25 | urls.append({'title': a.getText(), 'link': link}) 26 | else: 27 | continue 28 | 29 | print('Youtube parsed: ' + str(urls)) 30 | 31 | return urls 32 | -------------------------------------------------------------------------------- /app/server.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from argparse import ArgumentParser 4 | 5 | from defusedxml.minidom import parseString 6 | from dicttoxml import dicttoxml 7 | from flask import (Flask, Response, abort, jsonify, make_response, 8 | render_template, request) 9 | 10 | try: 11 | from scrapers import feed_gen, scrapers 12 | except Exception as e: 13 | from app.scrapers import feed_gen, scrapers 14 | 15 | DISABLE_CACHE = True # Temporarily disable the MongoDB cache 16 | if DISABLE_CACHE: 17 | def lookup(url): 18 | return False 19 | 20 | def store(url, links): 21 | pass 22 | else: 23 | from query_cache import lookup, store 24 | 25 | app = Flask(__name__) 26 | err = "" 27 | 28 | errorObj = { 29 | 'type': 'Internal Server Error', 30 | 'status_code': 500, 31 | 'error': 'Could not parse the page due to Internal Server Error' 32 | } 33 | 34 | 35 | @app.route('/') 36 | def index(): 37 | return render_template('index.html', engines_list=sorted(scrapers.keys())) 38 | 39 | 40 | def bad_request(error): 41 | message = {'Error': error[1], 'Status Code': error[0]} 42 | response = dicttoxml(message) if error[2] == 'xml' else json.dumps(message) 43 | return make_response(response, error[0]) 44 | 45 | 46 | @app.route('/api/v1/search/<search_engine>', methods=['GET']) 47 | def search(search_engine): 48 | try: 49 | count = int(request.args.get('num', 10)) 50 | qformat = request.args.get('format', 'json').lower() 51 | qtype = request.args.get('type', '') 52 | if qformat not in ('json', 'xml', 'csv'): 53 | abort(400, 'Not Found - undefined format') 54 | 55 | engine = search_engine 56 | if engine not in scrapers: 57 | error = [404, 'Incorrect search engine', engine] 58 | return bad_request(error) 59 | 60 | query = request.args.get('query') 61 | if not query: 62 | error = [400, 'Not Found - 
missing query', qformat] 63 | return bad_request(error) 64 | 65 | # first see if we can get the results for the cache 66 | engine_and_query = engine + ':' + query 67 | result = lookup(engine_and_query) 68 | if result: 69 | print("cache hit: {}".format(engine_and_query)) 70 | else: 71 | result = feed_gen(query, engine, count, qtype) 72 | if result: 73 | # store the result in the cache to speed up future searches 74 | store(engine_and_query, result) 75 | else: 76 | error = [404, 'No response', engine_and_query] 77 | return bad_request(error) 78 | 79 | try: 80 | unicode # unicode is undefined in Python 3 so NameError is raised 81 | for line in result: 82 | line['link'] = line['link'].encode('utf-8') 83 | if 'title' in line: 84 | line['title'] = line['title'].encode('utf-8') 85 | if 'desc' in line: 86 | line['desc'] = line['desc'].encode('utf-8') 87 | except NameError: 88 | pass # Python 3 strings are already Unicode 89 | if qformat == 'json': 90 | return jsonify(result) 91 | elif qformat == 'csv': 92 | csvfeed = '"' 93 | csvfeed += '","'.join(result[0].keys()) 94 | for line in result: 95 | csvfeed += '"\n"' 96 | csvfeed += '","'.join(line.values()) 97 | csvfeed += '"' 98 | return Response(csvfeed) 99 | 100 | xmlfeed = dicttoxml(result, custom_root='channel', attr_type=False) 101 | xmlfeed = parseString(xmlfeed).toprettyxml() 102 | return Response(xmlfeed, mimetype='application/xml') 103 | except Exception as e: 104 | print(e) 105 | return jsonify(errorObj) 106 | 107 | 108 | @app.after_request 109 | def set_header(r): 110 | r.headers["Cache-Control"] = "no-cache" 111 | return r 112 | 113 | 114 | if __name__ == '__main__': 115 | port = int(os.environ.get('PORT', 7001)) 116 | parser = ArgumentParser() 117 | help_msg = "Start the server in development mode with debug=True" 118 | parser.add_argument("--dev", help=help_msg, action="store_true") 119 | args = parser.parse_args() 120 | app.run(host='0.0.0.0', port=port, debug=args.dev) 121 | -------------------------------------------------------------------------------- /app/static/css/styles.css: -------------------------------------------------------------------------------- 1 | html { 2 | height: 100%; 3 | box-sizing: border-box; 4 | } 5 | 6 | body { 7 | padding-top: 0px; 8 | font: DroidSansMono; 9 | position: relative; 10 | margin: 0; 11 | padding-bottom: 6rem; 12 | min-height: 100%; 13 | font-family: Droid Sans Mono; 14 | } 15 | 16 | .github-fork-ribbon { 17 | position: absolute; 18 | top: 0; 19 | right: 0; 20 | border: 0; 21 | } 22 | 23 | .queryArea { 24 | min-width: 100%; 25 | } 26 | 27 | .custom{ 28 | padding-left: 0px!important; 29 | padding-right: 0px!important; 30 | } 31 | 32 | .responseType { 33 | display: inline-flex; 34 | } 35 | 36 | .engineDrop { 37 | padding: 0; 38 | margin-right: 30px; 39 | margin-left: 16px; 40 | margin-bottom: 10px; 41 | } 42 | 43 | .qType { 44 | padding: 0; 45 | margin-right: 30px; 46 | margin-left: 16px; 47 | } 48 | 49 | .miscInfo { 50 | margin: 0 auto; 51 | display: table; 52 | } 53 | 54 | .typeButton { 55 | padding: 0px 10px 10px; 56 | } 57 | 58 | .qCount { 59 | display: inline-block; 60 | margin: 0; 61 | min-width: 50%; 62 | } 63 | 64 | #submit { 65 | margin-top: 20px; 66 | } 67 | 68 | #a1 { 69 | cursor: pointer; 70 | } 71 | 72 | #search { 73 | padding: 0; 74 | margin: 0, auto; 75 | } 76 | 77 | #feed { 78 | display: none; 79 | font-size: 1.2em; 80 | margin: 0em 2em 0 2em; 81 | padding: -3em 0 -3em 0; 82 | z-index: -100000; 83 | } 84 | 85 | #load { 86 | position: relative; 87 | display: none; 88 | left: 50%; 89 
| top: 50%; 90 | margin: auto; 91 | } 92 | 93 | p { 94 | font-size: medium; 95 | } 96 | 97 | .jumbotron p { 98 | font-weight: 300; 99 | font-size: 1.5em; 100 | display: inline-block; 101 | box-shadow: aliceblue; 102 | } 103 | 104 | .btn-group-vertical { 105 | display: inline-flex; 106 | } 107 | 108 | .btn-toolbar { 109 | margin-top: 1em; 110 | } 111 | 112 | .btn { 113 | border: none; 114 | color: white; 115 | padding: 0.5em 1.5em; 116 | text-align: center; 117 | text-decoration: ghostwhite; 118 | display: inline-block; 119 | font-size: 1.4em; 120 | border-radius: 0.4em; 121 | } 122 | 123 | .btn-outline { 124 | background-color: transparent; 125 | color: inherit; 126 | transition: all .5s; 127 | border: 1px #428bca solid; 128 | } 129 | 130 | .btn-outline:hover { 131 | color: #286090; 132 | background-color: rgba(0, 0, 0, 0.09); 133 | } 134 | 135 | .search { 136 | margin: 0.3em; 137 | } 138 | 139 | #tweet { 140 | font-size: large; 141 | color:#0084b4; 142 | background-color:#fff; 143 | border: 2px solid #0084b4; 144 | } 145 | 146 | 147 | #tweet:hover{ 148 | font-size: large; 149 | background-color:#0084b4; 150 | color:#fff; 151 | } 152 | 153 | #fb { 154 | font-size: large; 155 | background-color:#fff; 156 | 157 | color:#3b5998; 158 | border: 2px solid #3b5998; 159 | } 160 | 161 | 162 | #fb:hover{ 163 | font-size: large; 164 | background-color:#3b5998; 165 | color:#fff; 166 | } 167 | 168 | .panel-heading { 169 | font-weight: 300; 170 | font-size: 1.5em; 171 | } 172 | 173 | .panel-body { 174 | line-height: 25px; 175 | margin: 0 auto; 176 | } 177 | 178 | .panel-body .label { 179 | font-size: 1em; 180 | } 181 | 182 | .footer { 183 | bottom:0; 184 | width: 100%; 185 | height: 100px; 186 | } 187 | 188 | 189 | .footer .text-muted { 190 | margin: 20px 0; 191 | } 192 | 193 | 194 | /* Media Queries */ 195 | 196 | @media only screen and (min-device-width: 320px) and (max-device-width: 480px) { 197 | .github-fork-ribbon img { 198 | width: 120px; 199 | } 200 | h1 { 201 | font-size: 2.5em; 202 | } 203 | .jumbotron p { 204 | font-size: 1.5em; 205 | } 206 | #query { 207 | width: 90%; 208 | } 209 | .btn-group { 210 | margin-top: 10px; 211 | line-height: 20px; 212 | } 213 | } 214 | -------------------------------------------------------------------------------- /app/static/images/ask_icon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fossasia/query-server/51c98716e1ec9cde6023a6b3ac2eb3081daa9e82/app/static/images/ask_icon.ico -------------------------------------------------------------------------------- /app/static/images/baidu_icon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fossasia/query-server/51c98716e1ec9cde6023a6b3ac2eb3081daa9e82/app/static/images/baidu_icon.ico -------------------------------------------------------------------------------- /app/static/images/bing_icon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fossasia/query-server/51c98716e1ec9cde6023a6b3ac2eb3081daa9e82/app/static/images/bing_icon.ico -------------------------------------------------------------------------------- /app/static/images/dailymotion_icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fossasia/query-server/51c98716e1ec9cde6023a6b3ac2eb3081daa9e82/app/static/images/dailymotion_icon.png 
-------------------------------------------------------------------------------- /app/static/images/duckduckgo_icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fossasia/query-server/51c98716e1ec9cde6023a6b3ac2eb3081daa9e82/app/static/images/duckduckgo_icon.png -------------------------------------------------------------------------------- /app/static/images/exalead_icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fossasia/query-server/51c98716e1ec9cde6023a6b3ac2eb3081daa9e82/app/static/images/exalead_icon.png -------------------------------------------------------------------------------- /app/static/images/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fossasia/query-server/51c98716e1ec9cde6023a6b3ac2eb3081daa9e82/app/static/images/favicon.ico -------------------------------------------------------------------------------- /app/static/images/forkme_right_green_007200.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fossasia/query-server/51c98716e1ec9cde6023a6b3ac2eb3081daa9e82/app/static/images/forkme_right_green_007200.png -------------------------------------------------------------------------------- /app/static/images/foss_asia.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fossasia/query-server/51c98716e1ec9cde6023a6b3ac2eb3081daa9e82/app/static/images/foss_asia.png -------------------------------------------------------------------------------- /app/static/images/google_icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fossasia/query-server/51c98716e1ec9cde6023a6b3ac2eb3081daa9e82/app/static/images/google_icon.png -------------------------------------------------------------------------------- /app/static/images/mojeek_icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fossasia/query-server/51c98716e1ec9cde6023a6b3ac2eb3081daa9e82/app/static/images/mojeek_icon.png -------------------------------------------------------------------------------- /app/static/images/parsijoo_icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fossasia/query-server/51c98716e1ec9cde6023a6b3ac2eb3081daa9e82/app/static/images/parsijoo_icon.png -------------------------------------------------------------------------------- /app/static/images/quora_icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fossasia/query-server/51c98716e1ec9cde6023a6b3ac2eb3081daa9e82/app/static/images/quora_icon.png -------------------------------------------------------------------------------- /app/static/images/ripple.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fossasia/query-server/51c98716e1ec9cde6023a6b3ac2eb3081daa9e82/app/static/images/ripple.gif -------------------------------------------------------------------------------- /app/static/images/twitter_icon.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/fossasia/query-server/51c98716e1ec9cde6023a6b3ac2eb3081daa9e82/app/static/images/twitter_icon.png -------------------------------------------------------------------------------- /app/static/images/yahoo_icon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fossasia/query-server/51c98716e1ec9cde6023a6b3ac2eb3081daa9e82/app/static/images/yahoo_icon.ico -------------------------------------------------------------------------------- /app/static/images/youtube_icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fossasia/query-server/51c98716e1ec9cde6023a6b3ac2eb3081daa9e82/app/static/images/youtube_icon.png -------------------------------------------------------------------------------- /app/templates/index.html: -------------------------------------------------------------------------------- [The markup of this 295-line template did not survive the text extraction; only fragments of its visible text remain. From those fragments and the selectors in app/static/css/styles.css, the page is titled "query-server · FOSSASIA" and appears to contain a "Fork me on GitHub" ribbon, the tagline "Query server that stores a query string on a server.", the search form (query box, search-engine dropdown, response-format and query-type selectors), contribution links (Gitter badge, "Tweet about this" and "Share on Facebook" buttons) and a "Git Commit/Version number" field. See the repository for the full template.]
-------------------------------------------------------------------------------- /bandit.yml: -------------------------------------------------------------------------------- 1 | skips: ['B101'] 2 | -------------------------------------------------------------------------------- /bower.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "query-server", 3 | "description": "A simple server to store query strings", 4 | "main": "", 5 | "authors": [ 6 | "enigmaeth " 7 | ], 8 | "license": "MIT", 9 | "homepage": "https://github.com/fossasia/query-server", 10 | "ignore": [ 11 | "**/.*", 12 | "node_modules", 13 | "bower_components", 14 | "test", 15 | "tests" 16 | ], 17 | "dependencies": { 18 | "bootstrap": "^3.3.7", 19 | "jquery": "^3.1.1", 20 | "json3": "^3.3.2", 21 | "prismjs": "^1.6.0" 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | codecov: 2 | notify: 3 | require_ci_to_pass: yes 4 | 5 | coverage: 6 | precision: 2 7 | round: down 8 | range: "70...100" 9 | 10 | status: 11 | project: yes 12 | patch: yes 13 | changes: no 14 | 15 | comment: 16 | layout: "reach, diff, flags, files, footer" 17 | behavior: default 18 | require_changes: no 19 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "2" 2 | services: 3 | web: 4 | build: . 5 | ports: 6 | - "7001:7001" 7 | links: 8 | - mongo 9 | environment: 10 | MONGO_URI: mongodb://mongo:27017/query-server # use the linked "mongo" service name, not localhost 11 | 12 | mongo: 13 | image: mongo 14 | volumes: 15 | - /data/mongodb/db:/data/db 16 | ports: 17 | - "27017:27017" -------------------------------------------------------------------------------- /docs/installation/docker.md: -------------------------------------------------------------------------------- 1 | # Docker 2 | 3 | [![Deploy to Docker Cloud](https://files.cloud.docker.com/images/deploy-to-dockercloud.svg)](https://cloud.docker.com/stack/deploy/?repo=https://github.com/fossasia/query-server) 4 | 5 | * Get the latest version of Docker. See the [official site](https://docs.docker.com/engine/installation/) for installation info for your platform. 6 | 7 | * Install the latest version of docker-compose. Windows and Mac users should already have docker-compose, as it is part of the Docker Toolbox. For Linux users, see the 8 | [official guide](https://docs.docker.com/compose/install/). 9 | 10 | * Run `docker` and `docker-compose` in a terminal to see if they are properly installed. 11 | 12 | * Clone the project and cd into it. 13 | 14 | ```bash 15 | git clone https://github.com/fossasia/query-server.git && cd query-server 16 | ``` 17 | 18 | * In the terminal window, run `docker build -t query-server:latest .` to build the query-server Docker image. This process can take some time. 19 | 20 | * After the build is done, run `docker run -d -p 7001:7001 query-server` to start the server. 
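As a quick sanity check (an addition to this guide, assuming you kept the default `7001:7001` port mapping above), you can call the search API from the host once the container is up. The route, port and parameters below mirror the ones exercised in `test/test_server.py`; the exact results returned depend on the upstream search engine:

```bash
# Ask the containerized API to search Google for "fossasia" and return JSON
curl "http://localhost:7001/api/v1/search/google?query=fossasia&format=json"
```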
21 | -------------------------------------------------------------------------------- /docs/installation/heroku.md: -------------------------------------------------------------------------------- 1 | # Heroku 2 | 3 | One-click Heroku deployment is available: 4 | 5 | [![Deploy](https://www.herokucdn.com/deploy/button.svg)](https://heroku.com/deploy?template=https://github.com/fossasia/query-server) 6 | 7 | ### Steps for Manual Deployment 8 | 9 | * We need to install the Heroku command-line tools on our machine. Type the following in your Linux terminal: 10 | * ```wget -O- https://toolbelt.heroku.com/install-ubuntu.sh | sh``` 11 | This installs the Heroku Toolbelt on your machine so you can access Heroku from the command line. Windows users can install it from [here](https://devcenter.heroku.com/articles/heroku-cli#windows) 12 | * Next we need to log in to Heroku (assuming that you have already created an account). Type the following in the terminal: 13 | * ```heroku login``` (for Windows users on Cygwin or Git Bash: ```winpty heroku login```) 14 | * Enter your credentials and log in. 15 | * Once logged in, we need to create a space on the Heroku server for our application. This is done with the following command: 16 | * ```heroku create``` 17 | * Add the Node.js buildpack to the app 18 | * ```heroku buildpacks:add --index 1 heroku/nodejs``` 19 | * Add the Python buildpack to the app 20 | * ```heroku buildpacks:add --index 2 heroku/python``` 21 | * Check that both buildpacks are set on the app 22 | * ```heroku buildpacks``` 23 | It should return: 24 | 25 | > 1. heroku/nodejs 26 | > 2. heroku/python 27 | 28 | * Then we deploy the code to Heroku. 29 | * ```git push heroku master``` or 30 | * ```git push heroku yourbranch:master``` if you are on a branch other than master 31 | -------------------------------------------------------------------------------- /docs/installation/local.md: -------------------------------------------------------------------------------- 1 | # Local Development Setup 2 | 3 | The instructions on this page will guide you in setting up a local development environment on your system. 4 | 5 | To start, fork Query-Server to your own GitHub account. Then clone it to your local system. 6 | 7 | ```sh 8 | git clone -b master https://github.com/<username>/query-server.git 9 | ``` 10 | 11 | Add an upstream remote so that you can push your patched branches for starting a PR. 12 | 13 | ```sh 14 | cd query-server 15 | git remote add upstream https://github.com/fossasia/query-server.git 16 | ``` 17 | 18 | Make sure you have [Node.js](https://nodejs.org/en/) installed. 19 | Running this tool requires installing both the Node.js and the Python dependencies. 20 | 21 | ``` 22 | npm install -g bower 23 | bower install 24 | pip install virtualenv 25 | virtualenv venv 26 | . venv/bin/activate # Linux 27 | venv\Scripts\activate # Windows 28 | pip install -r requirements.txt 29 | ``` 30 | 31 | Alternatively, use [`pipenv`](https://docs.pipenv.org) instead of `pip` and `virtualenv` separately: 
32 | 33 | ``` 34 | npm install -g bower 35 | bower install 36 | pip install pipenv 37 | pipenv --two # To setup python 2 virtual environment 38 | pipenv install -r requirements.txt 39 | pipenv shell # To activate virtual environment 40 | ``` 41 | 42 | To set up MongoDB on your server : 43 | 44 | ```bash 45 | sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 7F0CEB10 46 | echo "deb http://repo.mongodb.org/apt/ubuntu "$(lsb_release -sc)"/mongodb-org/3.0 multiverse" | sudo tee /etc/apt/sources.list.d/mongodb-org-3.0.list 47 | sudo apt-get update 48 | sudo apt-get install -y mongodb 49 | sudo service mongodb start 50 | ``` 51 | 52 | To run the project on a local machine. 53 | 54 | For development mode (with debugger active), use the following command 55 | ```sh 56 | python app/server.py --dev 57 | ``` 58 | 59 | To run the project on a production machine. 60 | 61 | ```sh 62 | python app/server.py 63 | ``` 64 | 65 | ## Preferred Development Workflow 66 | 67 | 1. Get the latest copy of code from upstream. 68 | 69 | ```sh 70 | git pull upstream master 71 | ``` 72 | 73 | 2. Once you get assigned an issue, create a new branch from `master`. 74 | 75 | ```sh 76 | git checkout -b XXX-mock-issue # XXX is the issue number 77 | ``` 78 | 79 | 3. Work on your patch, test it and when it's done, push it to your fork. 80 | 81 | ```sh 82 | git push origin XXX-mock-issue 83 | ``` 84 | 85 | 4. File a PR and wait for the maintainers to suggest reviews or in the best case 86 | merge the PR. Then just update `master` of your local clone. 87 | 88 | ```sh 89 | git pull upstream master 90 | ``` 91 | 92 | And then loop back again. For contribution guidelines, refer [here](https://github.com/fossasia/query-server/blob/master/.github/CONTRIBUTING.md) 93 | -------------------------------------------------------------------------------- /manifest.yml: -------------------------------------------------------------------------------- 1 | applications: 2 | - name: query-server 3 | memory: 256M 4 | command: python mongo-app.py 5 | buildpack: https://github.com/cloudfoundry/python-buildpack.git 6 | instances: 2 7 | services: 8 | - todo-mongo-db -------------------------------------------------------------------------------- /package-lock.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "query-server", 3 | "version": "0.1.0", 4 | "lockfileVersion": 1, 5 | "requires": true, 6 | "dependencies": { 7 | "bower": { 8 | "version": "1.8.2", 9 | "resolved": "https://registry.npmjs.org/bower/-/bower-1.8.2.tgz", 10 | "integrity": "sha1-rfU1KcjUrwLvJPuNU0HBQZ0z4vc=" 11 | }, 12 | "ci-info": { 13 | "version": "1.1.2", 14 | "resolved": "https://registry.npmjs.org/ci-info/-/ci-info-1.1.2.tgz", 15 | "integrity": "sha512-uTGIPNx/nSpBdsF6xnseRXLLtfr9VLqkz8ZqHXr3Y7b6SftyRxBGjwMtJj1OhNbmlc1wZzLNAlAcvyIiE8a6ZA==", 16 | "dev": true 17 | }, 18 | "husky": { 19 | "version": "0.14.3", 20 | "resolved": "https://registry.npmjs.org/husky/-/husky-0.14.3.tgz", 21 | "integrity": "sha512-e21wivqHpstpoiWA/Yi8eFti8E+sQDSS53cpJsPptPs295QTOQR0ZwnHo2TXy1XOpZFD9rPOd3NpmqTK6uMLJA==", 22 | "dev": true, 23 | "requires": { 24 | "is-ci": "1.1.0", 25 | "normalize-path": "1.0.0", 26 | "strip-indent": "2.0.0" 27 | } 28 | }, 29 | "is-ci": { 30 | "version": "1.1.0", 31 | "resolved": "https://registry.npmjs.org/is-ci/-/is-ci-1.1.0.tgz", 32 | "integrity": "sha512-c7TnwxLePuqIlxHgr7xtxzycJPegNHFuIrBkwbf8hc58//+Op1CqFkyS+xnIMkwn9UsJIwc174BIjkyBmSpjKg==", 33 | "dev": true, 34 | "requires": { 35 | "ci-info": 
"1.1.2" 36 | } 37 | }, 38 | "normalize-path": { 39 | "version": "1.0.0", 40 | "resolved": "https://registry.npmjs.org/normalize-path/-/normalize-path-1.0.0.tgz", 41 | "integrity": "sha1-MtDkcvkf80VwHBWoMRAY07CpA3k=", 42 | "dev": true 43 | }, 44 | "strip-indent": { 45 | "version": "2.0.0", 46 | "resolved": "https://registry.npmjs.org/strip-indent/-/strip-indent-2.0.0.tgz", 47 | "integrity": "sha1-XvjbKV0B5u1sv3qrlpmNeCJSe2g=", 48 | "dev": true 49 | } 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "query-server", 3 | "version": "0.1.0", 4 | "dependencies": { 5 | "bower": "^1.8.0" 6 | }, 7 | "scripts": { 8 | "postinstall": "bower install", 9 | "precommit": "flake8 . --max-line-length=85" 10 | }, 11 | "devDependencies": { 12 | "husky": "^0.14.3" 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | autopep8==1.4 2 | coverage>=4.3.4 3 | coveralls>=1.1 4 | pytest>=3.0.6 5 | pytest-cov>=2.4.0 6 | mock>=2.0.0 -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4>=4.5.1 2 | dicttoxml>=1.7.4 3 | feedgen>=0.5.1 4 | Flask>=0.12 5 | futures>=3.0.5 6 | html5lib>=0.9999999 7 | Jinja2>=2.9.5 8 | lxml>=3.7.2 9 | pymongo>=3.6.0 10 | requests>=2.13.0 11 | webencodings>=0.5 12 | defusedxml>=0.5.0 13 | -------------------------------------------------------------------------------- /runtime.txt: -------------------------------------------------------------------------------- 1 | python-2.7.14 -------------------------------------------------------------------------------- /scalingo.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Query-Server", 3 | "description": "The query server can be used to search a keyword/phrase on a search engine (Google, Yahoo, Bing, Ask and DuckDuckGo) and get the results as json or xml.", 4 | "logo": "https://scalingo.com/logo.svg", 5 | "repository": "https://github.com/fossasia/query-server", 6 | "website": "https://query-server.herokuapp.com/", 7 | "env": { 8 | }, 9 | "addons": ["scalingo-redis"] 10 | } 11 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fossasia/query-server/51c98716e1ec9cde6023a6b3ac2eb3081daa9e82/test/__init__.py -------------------------------------------------------------------------------- /test/test_ask.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | 3 | from app.scrapers import Ask 4 | 5 | 6 | def test_next_start(): 7 | assert 3 == Ask().next_start(2, None) 8 | 9 | 10 | def test_parse_response_for_none(): 11 | html_text = """
12 |
13 |

No results for:

14 |

44754546546545545465465f4654f654654

15 |

Please try again.

16 |
17 |
""" 18 | stub_soup = BeautifulSoup(html_text, 'html.parser') 19 | resp = Ask().parse_response(stub_soup) 20 | assert resp is None 21 | 22 | 23 | def test_parse_response_with_desc(): 24 | html_div = """
25 |
26 | mock_title 28 |
29 |

mock_desc

30 |
""" 31 | stub_soup_div = BeautifulSoup(html_div, 'html.parser') 32 | resp = Ask().parse_response(stub_soup_div) 33 | expected_resp = [ 34 | { 35 | 'link': u'mock_url', 36 | 'title': u'mock_title', 37 | 'desc': u'mock_desc' 38 | } 39 | ] 40 | assert resp == expected_resp 41 | 42 | 43 | def test_parse_response_without_desc(): 44 | html_div = """
45 |
46 | mock_title 48 |
49 |
""" 50 | stub_soup_div = BeautifulSoup(html_div, 'html.parser') 51 | resp = Ask().parse_response(stub_soup_div) 52 | expected_resp = [ 53 | { 54 | 'link': u'mock_url', 55 | 'title': u'mock_title' 56 | } 57 | ] 58 | assert resp == expected_resp 59 | 60 | 61 | def test_parse_video_response(): 62 | html_div = """
64 |
mock_desc
65 |
""" 66 | stub_soup_div = BeautifulSoup(html_div, 'html.parser') 67 | resp = Ask().parse_video_response(stub_soup_div) 68 | url_video = 'https' + 'mock_url' 69 | expected_resp = [ 70 | { 71 | 'link': url_video, 72 | 'title': u'mock_title', 73 | 'desc': u'mock_desc' 74 | } 75 | ] 76 | assert resp == expected_resp 77 | -------------------------------------------------------------------------------- /test/test_baidu.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | 3 | from app.scrapers import Baidu 4 | 5 | 6 | def test_parse_response(): 7 | html_text = """

8 | mock_title 9 |

""" 10 | dummy_soup = BeautifulSoup(html_text, 'html.parser') 11 | resp = Baidu().parse_response(dummy_soup) 12 | expected_resp = [{ 13 | 'title': u'mock_title', 14 | 'link': u'mock_url' 15 | }] 16 | assert resp == expected_resp 17 | 18 | 19 | def test_parse_news_response(): 20 | html_text = """

21 | mock_title 22 |

""" 23 | dummy_soup = BeautifulSoup(html_text, 'html.parser') 24 | resp = Baidu().parse_news_response(dummy_soup) 25 | expected_resp = [{ 26 | 'title': u'mock_title', 27 | 'link': u'mock_url' 28 | }] 29 | assert resp == expected_resp 30 | -------------------------------------------------------------------------------- /test/test_bing.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | 3 | from app.scrapers import Bing 4 | 5 | 6 | def test_parse_response(): 7 | html_text = """
  • 8 |

    mock_title

    9 |

    mock_desc

    10 |
  • """ 11 | dummy_soup = BeautifulSoup(html_text, 'html.parser') 12 | resp = Bing().parse_response(dummy_soup) 13 | expected_resp = [{ 14 | 'title': u'mock_title', 15 | 'link': u'mock_url', 16 | 'desc': u'mock_desc' 17 | }] 18 | assert resp == expected_resp 19 | 20 | 21 | def test_parse_image_response(): 22 | html_text = """mock_title""" 23 | dummy_soup = BeautifulSoup(html_text, 'html.parser') 24 | resp = Bing().parse_image_response(dummy_soup) 25 | link_image = 'https://www.bing.com' + 'mock_url' 26 | expected_resp = [{ 27 | 'link': link_image 28 | }] 29 | assert resp == expected_resp 30 | 31 | 32 | def test_parse_video_response(): 33 | html_text = """""" 35 | dummy_soup = BeautifulSoup(html_text, 'html.parser') 36 | resp = Bing().parse_video_response(dummy_soup) 37 | link_video = 'https://www.bing.com' + 'mock_url' 38 | expected_resp = [{ 39 | 'title': u'mock_title', 40 | 'link': link_video, 41 | }] 42 | assert resp == expected_resp 43 | 44 | 45 | def test_parse_news_response(): 46 | html_text = """
    48 | mock_desc
    """ 49 | dummy_soup = BeautifulSoup(html_text, 'html.parser') 50 | resp = Bing().parse_news_response(dummy_soup) 51 | expected_resp = [{ 52 | 'title': u'mock_title', 53 | 'link': u'mock_url', 54 | 'desc': u'mock_desc', 55 | }] 56 | assert resp == expected_resp 57 | -------------------------------------------------------------------------------- /test/test_duckduckgo.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | 3 | from app.scrapers import DuckDuckGo 4 | 5 | 6 | def test_parse_response(): 7 | html_text = """

    8 | mock_title 9 |

    """ 10 | dummy_soup = BeautifulSoup(html_text, 'html.parser') 11 | resp = DuckDuckGo().parse_response(dummy_soup) 12 | expected_resp = [{ 13 | 'title': u'mock_title', 14 | 'link': u'mock_url' 15 | }] 16 | assert resp == expected_resp 17 | -------------------------------------------------------------------------------- /test/test_generalized.py: -------------------------------------------------------------------------------- 1 | from mock import patch 2 | import pytest 3 | 4 | from app.scrapers.generalized import Scraper 5 | 6 | 7 | @patch('requests.models.Response') 8 | @patch('app.scrapers.generalized.requests.get') 9 | def test_get_page(mock_request_get, mock_response): 10 | mock_request_get.return_value = mock_response 11 | mock_response.url = "Mock Url" 12 | response = Scraper().get_page("dummy_query") 13 | assert response == mock_response 14 | expected_payload = {'q': 'dummy_query', '': ''} 15 | expected_headers = { 16 | 'User-Agent': ( 17 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) ' 18 | 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 ' 19 | 'Safari/537.36' 20 | ) 21 | } 22 | mock_request_get.assert_called_with( 23 | '', headers=expected_headers, params=expected_payload) 24 | 25 | 26 | def test_parse_response(): 27 | with pytest.raises(NotImplementedError): 28 | Scraper().parse_response(None) 29 | 30 | 31 | def test_next_start(): 32 | dummy_prev_results = ['dummy_value'] 33 | if not Scraper().next_start(3, dummy_prev_results) == 4: 34 | raise AssertionError() 35 | 36 | 37 | @patch('app.scrapers.generalized.Scraper.parse_response') 38 | @patch('app.scrapers.generalized.Scraper.get_page') 39 | @patch('requests.models.Response') 40 | def test_search(mock_resp, mock_get_page, mock_parse_resp): 41 | mock_get_page.return_value = mock_resp 42 | mock_resp.text = "Mock response" 43 | expected_resp = [{ 44 | 'title': 'mock_title', 45 | 'link': 'mock_url' 46 | }] 47 | # assuming parse_response is being implemented by 48 | # classes inheriting Scraper. 
Thus, returning dummy 49 | # response instead of raising NotImplementedError 50 | mock_parse_resp.return_value = expected_resp 51 | resp = Scraper().search('dummy_query', 1) 52 | assert resp == expected_resp 53 | 54 | 55 | @patch('app.scrapers.generalized.Scraper.get_page') 56 | @patch('requests.models.Response') 57 | def test_search_parsed_response_none(mock_resp, mock_get): 58 | mock_get.return_value = mock_resp 59 | mock_resp.text = "Mock Response" 60 | with patch('app.scrapers.generalized.Scraper.parse_response', 61 | return_value=None): 62 | resp = Scraper().search('dummy_query', 1) 63 | assert resp == [] 64 | 65 | 66 | @patch('app.scrapers.generalized.requests.get') 67 | @patch('app.scrapers.generalized.Scraper.parse_response') 68 | @patch('requests.models.Response') 69 | def test_search_without_count(mock_resp, mock_parse_resp, mock_get): 70 | mock_get.return_value = mock_resp 71 | mock_resp.text = 'mock response' 72 | expected_resp = [{ 73 | 'title': 'mock_title', 74 | 'link': 'mock_url' 75 | }] 76 | expected_payload = {'q': 'dummy_query'} 77 | expected_headers = { 78 | 'User-Agent': ( 79 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) ' 80 | 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 ' 81 | 'Safari/537.36' 82 | ) 83 | } 84 | mock_parse_resp.return_value = expected_resp 85 | resp = Scraper().search_without_count('dummy_query') 86 | assert resp == expected_resp 87 | mock_get.assert_called_with( 88 | '', headers=expected_headers, params=expected_payload) 89 | -------------------------------------------------------------------------------- /test/test_google.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | 3 | from app.scrapers import Google 4 | 5 | 6 | def test_parse_response(): 7 | html_text = """

    8 | mock_title 9 |

    """ 10 | dummy_soup = BeautifulSoup(html_text, 'html.parser') 11 | expected_resp = [{ 12 | 'title': u'mock_title', 13 | 'link': u'mock_url' 14 | }] 15 | resp = Google().parse_response(dummy_soup) 16 | assert resp == expected_resp 17 | -------------------------------------------------------------------------------- /test/test_mojeek.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | 3 | from app.scrapers import Mojeek 4 | 5 | 6 | def test_parse_response(): 7 | html_text = 'mock_title' 8 | dummy_soup = BeautifulSoup(html_text, 'html.parser') 9 | expected_resp = [{ 10 | 'title': u'mock_title', 11 | 'link': u'mock_url' 12 | }] 13 | resp = Mojeek().parse_response(dummy_soup) 14 | assert resp == expected_resp 15 | 16 | 17 | def test_parse_news_response(): 18 | html_text = 'mock_title' 19 | dummy_soup = BeautifulSoup(html_text, 'html.parser') 20 | expected_resp = [{ 21 | 'title': u'mock_title', 22 | 'link': u'mock_url' 23 | }] 24 | resp = Mojeek().parse_news_response(dummy_soup) 25 | assert resp == expected_resp 26 | -------------------------------------------------------------------------------- /test/test_parsijoo.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | 3 | from app.scrapers import Parsijoo 4 | 5 | 6 | def test_parse_response(): 7 | html_text = """
    8 | 9 | """ + " " * 22 + """mock_title 10 | mock_url 11 | """ + " " * 34 + """ mock_desc 12 | mock_similar 14 |
    """ 15 | dummy_soup = BeautifulSoup(html_text, 'html.parser') 16 | expected_resp = [{ 17 | 'title': u'mock_title', 18 | 'link': u'mock_url', 19 | 'desc': u'mock_desc' 20 | }] 21 | resp = Parsijoo().parse_response(dummy_soup) 22 | assert resp == expected_resp 23 | 24 | 25 | def test_parse_video_response(): 26 | html_text = """mock_title""" 28 | dummy_soup = BeautifulSoup(html_text, 'html.parser') 29 | url = 'https://video.parsijoo.ir' + "mock_url" 30 | expected_resp = [{ 31 | 'title': u'mock_title', 32 | 'link': url, 33 | }] 34 | resp = Parsijoo().parse_video_response(dummy_soup) 35 | assert resp == expected_resp 36 | 37 | 38 | def test_parse_image_response(): 39 | html_text = """""" 41 | dummy_soup = BeautifulSoup(html_text, 'html.parser') 42 | image_url = 'https://image.parsijoo.ir' + 'mock_url' 43 | expected_resp = [{ 44 | 'link': image_url, 45 | }] 46 | resp = Parsijoo().parse_image_response(dummy_soup) 47 | assert resp == expected_resp 48 | 49 | 50 | def test_parse_news_response(): 51 | html_text = """""" 54 | dummy_soup = BeautifulSoup(html_text, 'html.parser') 55 | expected_resp = [{ 56 | 'title': u'mock_title', 57 | 'link': u'mock_url' 58 | }] 59 | resp = Parsijoo().parse_news_response(dummy_soup) 60 | assert resp == expected_resp 61 | -------------------------------------------------------------------------------- /test/test_quora.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | 3 | from app.scrapers import Quora 4 | 5 | 6 | def test_parse_response(): 7 | html_text = ("") 10 | dummy_soup = BeautifulSoup(html_text, 'html.parser') 11 | expected_resp = [{ 12 | 'title': u'mock_title', 13 | 'link': u'https://www.quora.com/mock_url' 14 | }] 15 | resp = Quora().parse_response(dummy_soup) 16 | assert resp == expected_resp 17 | -------------------------------------------------------------------------------- /test/test_server.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | import pytest 5 | import requests 6 | from defusedxml import ElementTree 7 | from mock import patch 8 | 9 | from app.scrapers import small_test 10 | from app.server import app 11 | 12 | REASON = 'Do you have query-server running on http://127.0.0.1:7001 ?' 13 | TRAVIS_CI = os.getenv('TRAVIS', False) # Running in Travis CI? 
14 | 15 | 16 | @pytest.mark.xfail(not TRAVIS_CI, reason=REASON) 17 | def test_small_test(): 18 | small_test() 19 | 20 | 21 | @pytest.mark.xfail(not TRAVIS_CI, reason=REASON) 22 | def test_invalid_url_api_call(): 23 | response = requests.get('http://localhost:7001/api/v1/search/invalid_url') 24 | assert response.json()['Status Code'] == 404 25 | 26 | 27 | def make_engine_api_call(engine_name): 28 | url = 'http://localhost:7001/api/v1/search/' + engine_name 29 | assert requests.get(url).json()['Status Code'] == 400 30 | 31 | 32 | @pytest.mark.xfail(not TRAVIS_CI, reason=REASON) 33 | def test_engine_api_calls(engine_names=None): 34 | engines = """ask baidu bing dailymotion duckduckgo exalead google 35 | mojeek parsijoo quora yahoo youtube""".split() 36 | for engine_name in (engine_names or engines): 37 | make_engine_api_call(engine_name) 38 | 39 | 40 | def test_api_index(): 41 | assert app.test_client().get('/').status_code == 200 42 | 43 | 44 | @patch('app.server.abort') 45 | def test_api_search_invalid_qformat(mock_abort): 46 | url = '/api/v1/search/google?query=fossasia&format=invalid' 47 | app.test_client().get(url) 48 | mock_abort.assert_called_with(400, 'Not Found - undefined format') 49 | 50 | 51 | @patch('app.server.bad_request', return_value="Mock Response") 52 | def test_api_search_invalid_engine(mock_bad_request): 53 | url = '/api/v1/search/invalid?query=fossasia' 54 | resp = app.test_client().get(url).get_data().decode('utf-8') 55 | mock_bad_request.assert_called_with( 56 | [404, 'Incorrect search engine', 'invalid']) 57 | assert resp == "Mock Response" 58 | 59 | 60 | @patch('app.server.bad_request', return_value="Mock Response") 61 | def test_api_search_missing_query(mock_bad_request): 62 | # invalid url with query parameter missing 63 | url = '/api/v1/search/google' 64 | resp = app.test_client().get(url).get_data().decode('utf-8') 65 | mock_bad_request.assert_called_with( 66 | [400, 'Not Found - missing query', 'json']) 67 | assert resp == "Mock Response" 68 | 69 | 70 | @patch('app.server.bad_request', return_value="Mock Response") 71 | def test_api_search_for_no_response(mock_bad_request): 72 | url = '/api/v1/search/google?query=fossasia' 73 | with patch('app.server.lookup', return_value=None): 74 | with patch('app.server.feed_gen', return_value=None): 75 | resp = app.test_client().get(url).get_data().decode('utf-8') 76 | mock_bad_request.assert_called_with([404, 'No response', 77 | 'google:fossasia']) 78 | assert resp == "Mock Response" 79 | 80 | 81 | def test_api_search_for_cache_hit(): 82 | url = '/api/v1/search/google?query=fossasia' 83 | mock_result = [{'title': 'mock_title', 'link': 'mock_link'}] 84 | with patch('app.server.lookup', return_value=mock_result): 85 | resp = app.test_client().get(url).get_data().decode('utf-8') 86 | assert json.loads(resp) == mock_result 87 | 88 | 89 | @patch('app.server.feed_gen') 90 | @patch('app.server.lookup') 91 | def test_api_search_for_format(mock_lookup, mock_feed_gen): 92 | for qformat in ['json', 'csv', 'xml']: 93 | url = '/api/v1/search/google?query=fossasia&format=' + qformat 94 | mock_result = [ 95 | { 96 | 'title': 'mock_title', 97 | 'link': 'mock_link', 98 | 'desc': 'mock_desc' 99 | } 100 | ] 101 | mock_lookup.return_value = None 102 | mock_feed_gen.return_value = mock_result 103 | resp = app.test_client().get(url).get_data().decode('utf-8') 104 | expected_resp = expected_response_for_format(qformat) 105 | if qformat == 'json': 106 | resp = json.loads(resp) 107 | elif qformat == 'xml': 108 | resp = resp.replace('\t', 
'').replace('\n', '') 109 | resp = get_json_equivalent_from_xml_feed(resp) 110 | expected_resp = get_json_equivalent_from_xml_feed(expected_resp) 111 | elif qformat == 'csv': 112 | resp = get_json_equivalent_from_csv_feed(resp) 113 | expected_resp = get_json_equivalent_from_csv_feed(expected_resp) 114 | assert expected_resp == resp 115 | 116 | 117 | def expected_response_for_format(qformat): 118 | if qformat == 'json': 119 | return [ 120 | {'title': 'mock_title', 121 | 'link': 'mock_link', 122 | 'desc': 'mock_desc'} 123 | ] 124 | elif qformat == 'csv': 125 | return '"link","title","desc"\n"mock_link","mock_title","mock_desc"' 126 | elif qformat == 'xml': 127 | return ('' 128 | 'mock_descmock_link' 129 | 'mock_title') 130 | 131 | 132 | def get_json_equivalent_from_csv_feed(feed): 133 | keys_feed1 = feed.split('\n')[0].split(',') 134 | json_result = [] 135 | for row_index, row in enumerate(feed.split('\n')): 136 | if row_index == 0: 137 | continue 138 | entry = {} 139 | for index, value in enumerate(row.split(',')): 140 | entry[keys_feed1[index].replace('"', '')] = value.replace('"', '') 141 | json_result.append(entry) 142 | return json_result 143 | 144 | 145 | def get_json_equivalent_from_xml_feed(feed): 146 | def internal_iter(tree, accum): 147 | if tree is None: 148 | return accum 149 | 150 | if tree.getchildren(): 151 | accum[tree.tag] = {} 152 | for each in tree.getchildren(): 153 | result = internal_iter(each, {}) 154 | if each.tag in accum[tree.tag]: 155 | if not isinstance(accum[tree.tag][each.tag], list): 156 | accum[tree.tag][each.tag] = [ 157 | accum[tree.tag][each.tag] 158 | ] 159 | accum[tree.tag][each.tag].append(result[each.tag]) 160 | else: 161 | accum[tree.tag].update(result) 162 | else: 163 | accum[tree.tag] = tree.text 164 | 165 | return accum 166 | 167 | return internal_iter(ElementTree.fromstring(feed), {}) 168 | -------------------------------------------------------------------------------- /test/test_twitter.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from mock import patch, MagicMock 4 | 5 | from app.scrapers import Twitter 6 | 7 | 8 | @patch('requests.models.Response') 9 | @patch('app.scrapers.twitter.requests.get') 10 | def test_search(mock_requests_get, mock_response): 11 | dummy_json = json.loads('''{ 12 | "aggregations": {}, 13 | "readme_3": "mock_data", 14 | "statuses": [ 15 | { 16 | "hosts_count": 1, 17 | "links": [ 18 | "http://Phimp.Me" 19 | ], 20 | "text": "mock_text", 21 | "retweet_count": 0, 22 | "source_type": "TWITTER", 23 | "link": "mock_link", 24 | "links_count": 1 25 | } 26 | ] 27 | }''') 28 | expected_resp = [ 29 | { 30 | 'text': u'mock_text', 31 | 'link': u'mock_link' 32 | } 33 | ] 34 | mock_requests_get.return_value = mock_response 35 | mock_response.json = MagicMock(return_value=dummy_json) 36 | resp = Twitter().search('dummy_query', 1) 37 | assert expected_resp == resp 38 | -------------------------------------------------------------------------------- /test/test_yahoo.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | 3 | from app.scrapers import Yahoo 4 | 5 | 6 | def test_parse_response(): 7 | html_text = ('

    mock_title

    ') 10 | dummy_soup = BeautifulSoup(html_text, 'html.parser') 11 | expected_resp = [{ 12 | 'title': u'mock_title', 13 | 'link': u'mock_url' 14 | }] 15 | resp = Yahoo().parse_response(dummy_soup) 16 | assert resp == expected_resp 17 | 18 | 19 | def test_parse_image_response(): 20 | html_text = """
  • 21 | """ 22 | dummy_soup = BeautifulSoup(html_text, 'html.parser') 23 | expected_resp = [{ 24 | 'title': u'mock_title', 25 | 'link': u'mock_url' 26 | }] 27 | resp = Yahoo().parse_image_response(dummy_soup) 28 | assert resp == expected_resp 29 | 30 | 31 | def test_parse_news_response(): 32 | html_text = '
    ' \ 33 | '

    mock_title

    mock_desc'\ 36 | '

    ' 37 | dummy_soup = BeautifulSoup(html_text, 'html.parser') 38 | expected_resp = [{ 39 | 'title': u'mock_title', 40 | 'link': u'mock_url', 41 | 'desc': u'mock_desc' 42 | }] 43 | resp = Yahoo().parse_news_response(dummy_soup) 44 | assert resp == expected_resp 45 | -------------------------------------------------------------------------------- /test/test_youtube.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | 3 | from app.scrapers import Youtube 4 | 5 | 6 | def test_parse_response(): 7 | html_text = ('mock_channelmock_title') 12 | dummy_soup = BeautifulSoup(html_text, 'html.parser') 13 | expected_resp = [{ 14 | 'title': u'mock_title', 15 | 'link': u'https://www.youtube.com/watch?v=mock' 16 | }] 17 | resp = Youtube().parse_response(dummy_soup) 18 | assert resp == expected_resp 19 | --------------------------------------------------------------------------------