├── .gitignore ├── README.md ├── app.py ├── config ├── Caddyfile └── gunicorn.py ├── docs └── 00.png ├── items ├── __init__.py ├── baidu.py ├── bing.py └── google.py ├── requirements.txt ├── settings.py └── wsgi.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### Python template 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | .hypothesis/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | .static_storage/ 58 | .media/ 59 | local_settings.py 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | 77 | # pyenv 78 | .python-version 79 | 80 | # celery beat schedule file 81 | celerybeat-schedule 82 | 83 | # SageMath parsed files 84 | *.sage.py 85 | 86 | # Environments 87 | .env 88 | .venv 89 | env/ 90 | venv/ 91 | ENV/ 92 | env.bak/ 93 | venv.bak/ 94 | 95 | # Spyder project settings 96 | .spyderproject 97 | .spyproject 98 | 99 | # Rope project settings 100 | .ropeproject 101 | 102 | # mkdocs documentation 103 | /site 104 | 105 | # mypy 106 | .mypy_cache/ 107 | .html/ 108 | 109 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## toapi-search 2 | 3 | ### What is toapi-search? 4 | 5 | This project uses [Toapi](https://github.com/gaojiuli/toapi) to build a friendly and robust API from Google, Bing, Baidu, So, DuckDuckGo etc. 6 | 7 | 8 | ``` shell 9 | 10 | # or git clone https://github.com/toapi/toapi-search 11 | toapi new toapi/toapi-search 12 | cd toapi-search 13 | # toapi run 14 | python wsgi.py 15 | ``` 16 | 17 | Then, everything is done, the following content will show on screen: 18 | 19 | ![RUN](./docs/00.png) 20 | 21 | ### Usage: 22 | 23 | Once the server is started, you can get JSON data from toapi-search: 24 | 25 | Visit `http://0.0.0.0:5000/_items/` 26 | 27 | ``` json 28 | 29 | { 30 | "/:wd": [ 31 | "google", 32 | "bing", 33 | "baidu" 34 | ] 35 | } 36 | 37 | ``` 38 | 39 | > http://0.0.0.0:5000/python 40 | 41 | ``` json 42 | 43 | { 44 | "baidu": [ 45 | { 46 | "title": "Welcome to Python.org", 47 | "url": "http://www.baidu.com/link?url=g2_i_ThdQ0aA4WYZ5sIj5Lt3rly1xBA7XY0IkBza_W7DBcn_jJam4k1F9qiFBwZB" 48 | }, 49 | { 50 | "title": "Download Python | Python.org", 51 | "url": "http://www.baidu.com/link?url=nb5pPKHJv403lz96-4EztfUBtWhiw6VDP-HPQrHVexuN8YoXaDnJILQl_Jy8r22j" 52 | }, 53 | { 54 | "title": "Python 基础教程 | 菜鸟教程", 55 | "url": "http://www.baidu.com/link?url=ISKIBt6yMmDv6TM6rH9OvbyzM1j8r-3ZI6LcLg_w8-0BbWBC-OMaLFjl_JiKAkX88gcrlrvAUjcKNtG2Yxs5xa" 56 | }, 57 | { 58 | "title": "Python教程 - 廖雪峰的官方网站", 59 | "url": "http://www.baidu.com/link?url=acz2VD5xN9J6e7R2GDE1vxa9ThpxF3uGHCPqQXvez04bUGUsxxz1S2PqnNhZWQ0ZBVQXTfIdwkInoVR1KmL-6solFSfCoM3C7TDkT5OdeTRK1ttSYRrbtv87-tufAVY9" 60 | }, 61 | { 62 | "title": "你是如何自学 Python 的? - 知乎", 63 | "url": "http://www.baidu.com/link?url=ERdlWXCJNqIfvj-bKT8spkUeF6ORHWshUy6WdsRR2f6y9XWRK9tHg-2aWCxbndVpiXFf4Rn7zdy6LM_8wcz6gq" 64 | }, 65 | { 66 | "title": "Python Releases for Windows | Python.org", 67 | "url": "http://www.baidu.com/link?url=-gL2jvZJVDsWI9aXZE4LO9G5IkMRbxApwf-yHMw9fLyYWXJ-_gylkX9jDNjKfH3EGTn0WocHOQ8-lNamkr9ega" 68 | }, 69 | { 70 | "title": "Python - 伯乐在线", 71 | "url": "http://www.baidu.com/link?url=R0yId0pR9fQMXq615mHSkjNOtpq59wJOx5RWvFrV4Lfd6Ql26MV_teRi519oOqef" 72 | }, 73 | { 74 | "title": "Python 简介 | 菜鸟教程", 75 | "url": "http://www.baidu.com/link?url=HBE46hWbBQim7NbuG7KukYnLSscD3YK4MdNY8MimB5Xq6OgLdkdeAcVYoDKrJTnAngnzcJx-oY-JC6fb5z1edq" 76 | } 77 | ], 78 | "bing": [ 79 | { 80 | "title": "Python - Official Site", 81 | "url": "https://www.python.org/" 82 | }, 83 | { 84 | "title": "Python (programming language) - Wikipedia", 85 | "url": "https://en.wikipedia.org/wiki/Python_%28programming_language%29" 86 | }, 87 | { 88 | "title": "Python - Tutorial", 89 | "url": "https://www.tutorialspoint.com/python/" 90 | }, 91 | { 92 | "title": "Python - Wikipedia", 93 | "url": "https://en.wikipedia.org/wiki/Python" 94 | }, 95 | { 96 | "title": "Python - Free download and software reviews - …", 97 | "url": "http://download.cnet.com/Python/3000-2069_4-10080057.html" 98 | }, 99 | { 100 | "title": "2. Built-in Functions — Python 3.6.4 documentation", 101 | "url": "https://docs.python.org/3/library/functions.html" 102 | }, 103 | { 104 | "title": "The Python Standard Library — Python 3.6.4 …", 105 | "url": "https://docs.python.org/3/library/index.html" 106 | }, 107 | { 108 | "title": "Learn Python - Free Interactive Python Tutorial", 109 | "url": "https://www.learnpython.org/" 110 | }, 111 | { 112 | "title": "Python - Basic Operators - tutorialspoint.com", 113 | "url": "http://www.tutorialspoint.com/python/python_basic_operators.htm" 114 | }, 115 | { 116 | "title": "Learn Python | Codecademy", 117 | "url": "https://www.codecademy.com/learn/learn-python" 118 | } 119 | ], 120 | "google": [ 121 | { 122 | "title": "Welcome to Python.org", 123 | "url": "https://www.python.org/" 124 | }, 125 | { 126 | "title": "Python (programming language) - Wikipedia", 127 | "url": "https://en.wikipedia.org/wiki/Python_(programming_language)" 128 | }, 129 | { 130 | "title": "Python | Codecademy", 131 | "url": "https://www.codecademy.com/en/tracks/python" 132 | }, 133 | { 134 | "title": "Python - Learn Python | Codecademy", 135 | "url": "https://www.codecademy.com/learn/learn-python" 136 | }, 137 | { 138 | "title": "Python Tutorial", 139 | "url": "https://www.tutorialspoint.com/python/" 140 | }, 141 | { 142 | "title": "Python · GitHub", 143 | "url": "https://github.com/python" 144 | }, 145 | { 146 | "title": "Images for python", 147 | "url": null 148 | }, 149 | { 150 | "title": "", 151 | "url": "" 152 | } 153 | ] 154 | } 155 | 156 | ``` 157 | 158 | ### Deploy: 159 | 160 | We recommend that you use Caddy(Nginx) + Gunicorn 161 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | from toapi import Api 2 | from items.google import Google 3 | from items.bing import Bing 4 | from items.baidu import Baidu 5 | from settings import MySettings 6 | 7 | api = Api(settings=MySettings) 8 | 9 | api.register(Google) 10 | api.register(Bing) 11 | api.register(Baidu) 12 | 13 | if __name__ == '__main__': 14 | api.serve() 15 | -------------------------------------------------------------------------------- /config/Caddyfile: -------------------------------------------------------------------------------- 1 | www.toapi-search.com { 2 | proxy / 127.0.0.1:5000 3 | timeouts none 4 | gzip 5 | } 6 | 7 | toapi-search.com { 8 | redir http://www.toapi-search.com 9 | } -------------------------------------------------------------------------------- /config/gunicorn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | 4 | WORKERS = os.getenv('WORKERS', 4) 5 | 6 | bind = '0.0.0.0:5000' 7 | max_requests = 1000 8 | worker_class = 'gevent' 9 | workers = WORKERS 10 | preload_app = True 11 | graceful_timeout = 30 12 | preload = True 13 | -------------------------------------------------------------------------------- /docs/00.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toapi/toapi-search/3b30949b4a42563b266df0912065550503444480/docs/00.png -------------------------------------------------------------------------------- /items/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toapi/toapi-search/3b30949b4a42563b266df0912065550503444480/items/__init__.py -------------------------------------------------------------------------------- /items/baidu.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from toapi import Css 3 | 4 | from .bing import Bing 5 | 6 | 7 | class Baidu(Bing): 8 | __name__ = 'baidu' 9 | __base_url__ = 'http://www.baidu.com' 10 | 11 | url = Css('h3.t a', attr='href') 12 | title = Css('h3.t a') 13 | 14 | class Meta: 15 | source = Css('div.result') 16 | route = {'/:wd': '/s?wd=:wd&ie=utf-8&vf_bl=1'} 17 | -------------------------------------------------------------------------------- /items/bing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from toapi import Css, Item 3 | 4 | 5 | class Bing(Item): 6 | __name__ = 'bing' 7 | __base_url__ = 'https://www.bing.com' 8 | 9 | url = Css('h2 a', attr='href') 10 | title = Css('h2 a') 11 | 12 | def clean_url(self, url): 13 | if isinstance(url, list) and len(url): 14 | url = url[0].get('href') 15 | return url if url else '' 16 | 17 | def clean_title(self, title): 18 | if isinstance(title, list) and len(title): 19 | text = '' 20 | for node in title[0].itertext(): 21 | text += node 22 | title = text.strip() 23 | return title if title else '' 24 | 25 | class Meta: 26 | source = Css('li.b_algo') 27 | route = {'/:wd': '/search?q=:wd&ensearch=1'} 28 | -------------------------------------------------------------------------------- /items/google.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from urllib.parse import urlparse, parse_qs 3 | 4 | from toapi import Css, Item 5 | 6 | 7 | class Google(Item): 8 | __name__ = 'google' 9 | __base_url__ = 'https://www.google.com' 10 | 11 | url = Css('h3.r > a', attr='href') 12 | title = Css('h3.r > a') 13 | 14 | def clean_url(self, url): 15 | if isinstance(url, list) and len(url): 16 | url = url[0].get('href') 17 | return self.filter_link(link=url) if url else '' 18 | 19 | def clean_title(self, title): 20 | if isinstance(title, list) and len(title): 21 | text = '' 22 | for node in title[0].itertext(): 23 | text += node 24 | title = text.strip() 25 | return title if title else '' 26 | 27 | @classmethod 28 | def filter_link(cls, link): 29 | """ 30 | Returns None if the link doesn't yield a valid result. 31 | Token from https://github.com/MarioVilas/google 32 | :return: a valid result 33 | """ 34 | try: 35 | # Valid results are absolute URLs not pointing to a Google domain 36 | # like images.google.com or googleusercontent.com 37 | o = urlparse(link, 'http') 38 | if o.netloc: 39 | return link 40 | # Decode hidden URLs. 41 | if link.startswith('/url?'): 42 | link = parse_qs(o.query)['q'][0] 43 | # Valid results are absolute URLs not pointing to a Google domain 44 | # like images.google.com or googleusercontent.com 45 | o = urlparse(link, 'http') 46 | if o.netloc: 47 | return link 48 | # Otherwise, or on error, return None. 49 | except Exception as e: 50 | return '' 51 | 52 | class Meta: 53 | source = Css('div.g') 54 | route = { 55 | '/:wd': '/search?hl=en&q=:wd&btnG=Search&gbv=1', 56 | } 57 | web = { 58 | "with_ajax": False, 59 | "request_config": { 60 | 'headers': { 61 | 'User-Agent': "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)" 62 | }, 63 | 'proxies': { 64 | 'http': '0.0.0.0:8118', 65 | 'https': '0.0.0.0:8118' 66 | } 67 | }, 68 | "headers": None 69 | } 70 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | gunicorn 2 | gevent 3 | toapi -------------------------------------------------------------------------------- /settings.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from toapi.cache import RedisCache, PickleSerializer 4 | from toapi.settings import Settings 5 | 6 | 7 | class MySettings(Settings): 8 | """ 9 | Create custom configuration 10 | http://www.toapi.org/topics/settings/ 11 | """ 12 | 13 | cache = { 14 | 'cache_class': RedisCache, 15 | 'cache_config': { 16 | 'host': '127.0.0.1', 17 | 'port': 6379, 18 | 'db': 0 19 | }, 20 | 'serializer': PickleSerializer, 21 | 'ttl': 10000 22 | } 23 | storage = { 24 | "PATH": os.getcwd(), 25 | "DB_URL": None 26 | } 27 | web = { 28 | "with_ajax": False, 29 | "request_config": { 30 | 'headers': { 31 | 'User-Agent': "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)" 32 | } 33 | }, 34 | "headers": None 35 | } 36 | -------------------------------------------------------------------------------- /wsgi.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from app import api 4 | from gunicorn.app.base import Application 5 | 6 | 7 | class MyApplication(Application): 8 | def load_config(self): 9 | self.load_config_from_module_name_or_filename( 10 | "config/gunicorn.py") 11 | 12 | def load(self): 13 | return api.server.app 14 | 15 | 16 | if __name__ == '__main__': 17 | MyApplication().run() 18 | --------------------------------------------------------------------------------