├── config ├── __init__.py └── routes.py ├── oreilly ├── __init__.py ├── handlers │ ├── __init__.py │ ├── books.py │ ├── base.py │ ├── serializers.py │ └── api.py ├── models │ ├── __init__.py │ └── books.py └── cron_jobs │ ├── __init__.py │ ├── parsers.py │ ├── routes.py │ └── handlers.py ├── static └── img │ └── favicon.ico ├── cron.yaml ├── index.yaml ├── app.yaml ├── main.py ├── LICENSE ├── .gitignore └── README.md /config/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /oreilly/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /oreilly/handlers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /oreilly/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /oreilly/cron_jobs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /oreilly/cron_jobs/parsers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | """Parsers module.""" 5 | -------------------------------------------------------------------------------- /static/img/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/erikjhordan-rey/api-oreilly-free-books/HEAD/static/img/favicon.ico -------------------------------------------------------------------------------- /cron.yaml: -------------------------------------------------------------------------------- 1 | cron: 2 | - description: daily summary job 3 | url: /jobs/retrieve_books/ 4 | schedule: every 1 hours 5 | -------------------------------------------------------------------------------- /index.yaml: -------------------------------------------------------------------------------- 1 | indexes: 2 | 3 | - kind: Book 4 | properties: 5 | - name: __key__ 6 | direction: desc 7 | 8 | - kind: Book 9 | properties: 10 | - name: title 11 | direction: asc 12 | 13 | - kind: Category 14 | properties: 15 | - name: category 16 | 17 | - kind: Category 18 | properties: 19 | - name: subcategory -------------------------------------------------------------------------------- /app.yaml: -------------------------------------------------------------------------------- 1 | # GCP Settings 2 | api_version: 1 3 | skip_files: 4 | - ^(.*/)?.*\.py[co]$ 5 | 6 | # Runtime 7 | runtime: python27 8 | threadsafe: yes 9 | 10 | # URLs 11 | handlers: 12 | 13 | # Favicon 14 | - url: /favicon\.ico 15 | static_files: static/img/favicon.ico 16 | upload: bp_content/themes/default/static/favicon.ico 17 | secure: always 18 | 19 | # Main URLs module 20 | - url: /.* 21 | script: main.app 22 | secure: always 23 | 24 | # Third Party Libraries 25 | libraries: 26 | - name: lxml 27 | version: "2.3" 28 | -------------------------------------------------------------------------------- /oreilly/handlers/books.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | """Books - API module.""" 5 | 6 | # Handlers 7 | from .base import APIHandler 8 | 9 | # Models 10 | from ..models.books import Book 11 | 12 | # Serializers 13 | from .serializers import book_serializer 14 | 15 | 16 | class BooksListHandler(APIHandler): 17 | """Books list Handler.""" 18 | 19 | def get(self): 20 | """Return a list with all books.""" 21 | q = Book.query() 22 | books = [book_serializer(b) for b in q] 23 | return self.reponse(books) 24 | -------------------------------------------------------------------------------- /oreilly/handlers/base.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | """Handlers - Base module.""" 5 | 6 | # WebApp 2 7 | from webapp2 import RequestHandler 8 | 9 | # Utilities 10 | import json 11 | 12 | 13 | class APIHandler(RequestHandler): 14 | """API Handler. 15 | 16 | Setup request and reponse objects. 17 | """ 18 | 19 | def reponse(self, data): 20 | """Set Content-Type and JSON content.""" 21 | response = self.response 22 | response.headers['Content-Type'] = 'application/json' 23 | json.dump(data, response.out) 24 | return response 25 | -------------------------------------------------------------------------------- /oreilly/handlers/serializers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | """Books - Serializers.""" 5 | 6 | # Models 7 | from ..models.books import Category 8 | 9 | 10 | def book_serializer(book): 11 | """Given a book object, return a python dictionary.""" 12 | return { 13 | 'title': book.title, 14 | 'url': book.url, 15 | 'thumbnail': book.thumbnail, 16 | 'description': book.description, 17 | 'pdf': book.pdf, 18 | 'mobi': book.mobi, 19 | 'epub': book.epub, 20 | 'category': book.category.get().name, 21 | 'subcategory': book.subcategory.get().name if book.subcategory else None, 22 | } 23 | -------------------------------------------------------------------------------- /oreilly/cron_jobs/routes.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | """Cron jobs routes module.""" 5 | 6 | # WebApp 2 7 | from webapp2_extras.routes import RedirectRoute 8 | 9 | # Handlers 10 | from . import handlers 11 | 12 | 13 | _routes = [ 14 | 15 | RedirectRoute( 16 | template='/jobs/retrieve_books', 17 | handler=handlers.RetrieveBooksHandler, 18 | name='jobs__retrieve_books', 19 | strict_slash=True 20 | ), 21 | 22 | ] 23 | 24 | 25 | def get_routes(): 26 | """Return routes.""" 27 | return _routes 28 | 29 | 30 | def add_routes(app): 31 | """Add routes to app.""" 32 | for r in _routes: 33 | app.router.add(r) 34 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | u"""O’Reilly's Free Books API. 5 | 6 | REST API that exposing scrapped data from O’Reilly's 7 | Free Books website (http://www.oreilly.com/programming/free/). 8 | """ 9 | 10 | # Python libraries 11 | import os 12 | import sys 13 | 14 | # Third party libraries 15 | sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'oreilly')) 16 | 17 | # WebApp 2 18 | import webapp2 19 | 20 | # Routes 21 | from config import routes 22 | from oreilly.cron_jobs import routes as cron_jobs_routes 23 | 24 | # App initialization. 25 | app = webapp2.WSGIApplication(debug=os.environ.get('DEBUG', 'prod') == 'dev') 26 | 27 | routes.add_routes(app) 28 | cron_jobs_routes.add_routes(app) 29 | -------------------------------------------------------------------------------- /oreilly/handlers/api.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | """Handlers - API module.""" 5 | 6 | from .base import APIHandler 7 | 8 | 9 | class MenuHandler(APIHandler): 10 | """Menu Handler.""" 11 | 12 | def get(self): 13 | """Return API available methods.""" 14 | base_url = self.request.host_url + '{endpoint}' 15 | return self.reponse([ 16 | { 17 | 'name': 'API endpoints list', 18 | 'allowed_methods': ['GET'], 19 | 'url': base_url.format(endpoint='/') 20 | }, 21 | { 22 | 'name': 'Books list', 23 | 'allowed_methods': ['GET'], 24 | 'url': base_url.format(endpoint='/books') 25 | } 26 | ]) 27 | -------------------------------------------------------------------------------- /config/routes.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | """Main routes module.""" 5 | 6 | # WebApp 2 7 | from webapp2_extras.routes import RedirectRoute 8 | 9 | from oreilly.handlers import api as api_handlers 10 | from oreilly.handlers import books as books_handlers 11 | 12 | _routes = [ 13 | 14 | RedirectRoute( 15 | template='/', 16 | handler=api_handlers.MenuHandler, 17 | name='menu', 18 | strict_slash=True 19 | ), 20 | 21 | RedirectRoute( 22 | template='/books', 23 | handler=books_handlers.BooksListHandler, 24 | name='books_list', 25 | strict_slash=True 26 | ), 27 | 28 | ] 29 | 30 | 31 | def get_routes(): 32 | """Return routes.""" 33 | return _routes 34 | 35 | 36 | def add_routes(app): 37 | """Add routes to app.""" 38 | for r in _routes: 39 | app.router.add(r) 40 | -------------------------------------------------------------------------------- /oreilly/models/books.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | """Models - Books.""" 5 | 6 | # App Engine Extensions 7 | from google.appengine.ext import ndb 8 | 9 | 10 | class Category(ndb.Model): 11 | """Category/Subcategory model class.""" 12 | 13 | name = ndb.StringProperty() 14 | category = ndb.BooleanProperty(default=False) 15 | subcategory = ndb.BooleanProperty(default=False) 16 | 17 | 18 | class Book(ndb.Model): 19 | """Book reference model class.""" 20 | 21 | category = ndb.KeyProperty(kind=Category) 22 | subcategory = ndb.KeyProperty(kind=Category) 23 | 24 | title = ndb.StringProperty(required=True) 25 | url = ndb.StringProperty(required=True) 26 | 27 | thumbnail = ndb.StringProperty() 28 | description = ndb.StringProperty() 29 | 30 | # Download URLs 31 | pdf = ndb.StringProperty() 32 | mobi = ndb.StringProperty() 33 | epub = ndb.StringProperty() 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 Inventive 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 0 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | 91 | # OS X 92 | .DS_Store 93 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # O'Reilly free programming books - Web API 2 | 3 | This project was inspired in [erikcaffrey's](https://github.com/erikcaffrey/api-oreilly-free-books/tree/api-oreilly-free-v1) web application project 4 | used for expose [O'Reilly free programming ebooks](http://www.oreilly.com/programming/free/) via a web API. 5 | 6 | ## How does it works? 7 | 8 | It is written in Python 2.7 using webapp2 and it's designed to be executed on [**Google App Engine**](https://cloud.google.com/appengine/). 9 | It also make use of [**Google Cloud Datastore**](https://cloud.google.com/datastore/) to persist books data and a there is 10 | a [**cron job**](https://cloud.google.com/appengine/docs/python/config/cron) scheduled to run every 1 hour to go to O'Reilly's website 11 | and update books data by doing web scrapping. 12 | 13 | That's basically it. 14 | 15 | ## About the API 16 | 17 | It is hosted on [https://oreilly-api.appspot.com/](https://oreilly-api.appspot.com/) and the main resource is 18 | [**/books**](https://oreilly-api.appspot.com/books) which after a GET request will expose you a 19 | JSON list with all available books. Each list element will look this: 20 | 21 | | Field | Type | Description | 22 | |-------------|-------------|-------------------------------------------------------------------| 23 | | title | String | Book title | 24 | | url | String | Book details URL | 25 | | thumbnail | String | Book thumbnail URL | 26 | | description | String | Book description | 27 | | category | String | Book category | 28 | | subcategory | String/null | Book subcategory. This can be either the subcategory name or null | 29 | | pdf | String/null | Download URL (PDF). This can be either null or a URL | 30 | | mobi | String/null | Download URL (MOBI). This can be either null or a URL | 31 | | epub | String/null | Download URL (ePub). This can be either null or a URL | 32 | 33 | ## Usage 34 | These instructions will get you a copy of the project up and running on your local machine for development 35 | and testing purposes. See deployment for notes on how to deploy the project on a live system. 36 | 37 | 1. Install Google App Engine SDK for Python from [https://cloud.google.com/appengine/downloads](https://cloud.google.com/appengine/downloads) 38 | 2. Install lxml your local environment with: `$ pip install lxml==2.3` 39 | 3. Run with: `$ dev_appserver.py .` 40 | 41 | ## Deployment 42 | Once you have set up your gcloud CLI in your local environment as well as your project in Google App Engine's console, 43 | you can simply run this for deployment. 44 | 45 | ``` 46 | $ glcoud app deploy 47 | ``` 48 | 49 | # Built with 50 | 51 | * [lxml](http://lxml.de/): Used for HTML processing 52 | * [webapp2](http://lxml.de/): Python Web micro-framework. 53 | 54 | ## Contributors 55 | * [erikcaffrey](https://github.com/erikcaffrey)(Erik Jhordan Rey) 56 | * [pablotrinidad](https://github.com/pablotrinidad) (Pablo Trinidad) 57 | 58 | ## License 59 | This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details 60 | -------------------------------------------------------------------------------- /oreilly/cron_jobs/handlers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | """Cron Jobs module.""" 5 | 6 | # Google App Engine 7 | from google.appengine.ext import ndb 8 | 9 | # Handlers 10 | from ..handlers.base import APIHandler 11 | 12 | # Models 13 | from ..models.books import Category, Book 14 | 15 | # Utilities 16 | from lxml import html 17 | import urllib2 18 | 19 | 20 | class RetrieveBooksHandler(APIHandler): 21 | """Menu Handler.""" 22 | 23 | source_url = 'http://www.oreilly.com/programming/free/' 24 | 25 | def append_book(self, book, category, subcategory=None): 26 | """Add book to datastore.""" 27 | # Get properties by attribute value. 28 | title = book.get('title', '') 29 | url = book.get('href', '').split('?')[0] 30 | description = book.get('data-content', '') 31 | thumbnail = book.xpath('.//img')[0].get('src', '') 32 | 33 | # NOQA: If URL ends with .csp, will replace it for the corresponding extension. 34 | if url.endswith('.csp'): 35 | pdf = url.replace('.csp', '.pdf') 36 | epub = url.replace('.csp', '.epub') 37 | mobi = url.replace('.csp', '.mobi') 38 | # Else, will set to None 39 | else: 40 | pdf = None 41 | epub = None 42 | mobi = None 43 | 44 | # Create Book entity. 45 | book = Book( 46 | title=title, 47 | url=url, 48 | description=description, 49 | category=category.key, 50 | thumbnail=thumbnail, 51 | pdf=pdf, 52 | epub=epub, 53 | mobi=mobi, 54 | ) 55 | if subcategory: 56 | book.subcategory = subcategory.key 57 | book.put() 58 | 59 | def get(self): 60 | """Return API available methods.""" 61 | # Remove old data 62 | cs = Category.query() 63 | ndb.delete_multi([x.key for x in cs]) 64 | bs = Book.query() 65 | ndb.delete_multi([x.key for x in bs]) 66 | 67 | # Count variables 68 | categories_count = 0 69 | subcategories_count = 0 70 | books_count = 0 71 | 72 | # NOQA: lxml tree from web page response and search for category boxes. 73 | tree = html.fromstring(urllib2.urlopen(self.source_url).read()) 74 | category_boxes = tree.xpath('//div[@class="callout-row"]') 75 | 76 | for category_box in category_boxes: 77 | 78 | # Get box title by filtering all H3 tags without attributes. 79 | category_name = category_box.xpath('.//h3[not(@*)]/text()')[0] 80 | 81 | # Create category entity and update counters. 82 | category = Category(name=category_name, category=True, subcategory=False) # NOQA 83 | category.put() 84 | categories_count += 1 85 | 86 | # Get subcategories by looking for all divs with the right class. 87 | subcategories = category_box.xpath('.//div[not(@class="product-row cover-showcase")]') # NOQA 88 | book_expression = './/a' 89 | 90 | # If categories, iterate each and append book with subcategory 91 | for subcategory_box in subcategories: 92 | # Ger subcategory title by filtering the first H3 tag. 93 | subcategory_name = subcategory_box.xpath('.//h3/text()')[0] 94 | 95 | # Create category entity and update counters. 96 | subcategory = Category(name=subcategory_name, category=False, subcategory=True) # NOQA 97 | subcategory.put() 98 | subcategories_count += 1 99 | 100 | books = subcategory_box.xpath(book_expression) 101 | for book in books: 102 | self.append_book(book, category, subcategory) 103 | books_count += 1 104 | 105 | # Else, get books and append each with category only 106 | if not subcategories: 107 | books = category_box.xpath(book_expression) 108 | for book in books: 109 | self.append_book(book, category) 110 | books_count += 1 111 | 112 | return self.reponse({ 113 | 'status': 'ok', 114 | 'books_added': books_count, 115 | 'categories_added': categories_count, 116 | 'subcategories_added': subcategories_count, 117 | }) 118 | --------------------------------------------------------------------------------