├── config
    ├── __init__.py
    └── routes.py
├── oreilly
    ├── __init__.py
    ├── handlers
    │   ├── __init__.py
    │   ├── books.py
    │   ├── base.py
    │   ├── serializers.py
    │   └── api.py
    ├── models
    │   ├── __init__.py
    │   └── books.py
    └── cron_jobs
    │   ├── __init__.py
    │   ├── parsers.py
    │   ├── routes.py
    │   └── handlers.py
├── static
    └── img
    │   └── favicon.ico
├── cron.yaml
├── index.yaml
├── app.yaml
├── main.py
├── LICENSE
├── .gitignore
└── README.md


/config/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/oreilly/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/oreilly/handlers/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/oreilly/models/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/oreilly/cron_jobs/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/oreilly/cron_jobs/parsers.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # -*- coding: utf-8 -*-
3 | 
4 | """Parsers module."""
5 | 


--------------------------------------------------------------------------------
/static/img/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/erikjhordan-rey/api-oreilly-free-books/HEAD/static/img/favicon.ico


--------------------------------------------------------------------------------
/cron.yaml:
--------------------------------------------------------------------------------
cron:
# The URL carries no trailing slash: the application route is declared as
# '/jobs/retrieve_books' with strict_slash=True, so the slashed variant only
# answers with a redirect, which the cron service does not count as success.
- description: hourly O'Reilly free books refresh
  url: /jobs/retrieve_books
  schedule: every 1 hours
5 | 


--------------------------------------------------------------------------------
/index.yaml:
--------------------------------------------------------------------------------
 1 | indexes:
 2 | 
 3 | - kind: Book
 4 |   properties:
 5 |     - name: __key__
 6 |       direction: desc
 7 | 
 8 | - kind: Book
 9 |   properties:
10 |     - name: title
11 |       direction: asc
12 | 
13 | - kind: Category
14 |   properties:
15 |     - name: category
16 | 
17 | - kind: Category
18 |   properties:
19 |     - name: subcategory


--------------------------------------------------------------------------------
/app.yaml:
--------------------------------------------------------------------------------
 1 | # GCP Settings
 2 | api_version: 1
 3 | skip_files:
 4 |   - ^(.*/)?.*\.py[co]$
 5 | 
 6 | # Runtime
 7 | runtime: python27
 8 | threadsafe: yes
 9 | 
10 | # URLs
handlers:

  # Favicon
  - url: /favicon\.ico
    static_files: static/img/favicon.ico
    # `upload` is a regular expression selecting the files deployed for this
    # handler; it has to match the path named in `static_files`.
    upload: static/img/favicon\.ico
    secure: always

  # Cron job endpoints: they wipe and rebuild the datastore, so only
  # administrators (and the App Engine cron service) may call them.
  - url: /jobs/.*
    script: main.app
    login: admin
    secure: always

  # Main URLs module
  - url: /.*
    script: main.app
    secure: always
23 | 
24 | # Third Party Libraries
25 | libraries:
26 |   - name: lxml
27 |     version: "2.3"
28 | 


--------------------------------------------------------------------------------
/oreilly/handlers/books.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | """Books - API module."""
 5 | 
 6 | # Handlers
 7 | from .base import APIHandler
 8 | 
 9 | # Models
10 | from ..models.books import Book
11 | 
12 | # Serializers
13 | from .serializers import book_serializer
14 | 
15 | 
class BooksListHandler(APIHandler):
    """Handler for the books collection endpoint."""

    def get(self):
        """Serialize every stored Book and send the list as JSON."""
        payload = []
        for entity in Book.query():
            payload.append(book_serializer(entity))
        return self.reponse(payload)
24 | 


--------------------------------------------------------------------------------
/oreilly/handlers/base.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | """Handlers - Base module."""
 5 | 
 6 | # WebApp 2
 7 | from webapp2 import RequestHandler
 8 | 
 9 | # Utilities
10 | import json
11 | 
12 | 
class APIHandler(RequestHandler):
    """Base class for handlers that answer with JSON."""

    def reponse(self, data):
        """Write *data* as JSON to the response and return the response.

        The method name keeps its historical spelling because the other
        handlers call it under this name.

        Args:
            data: any JSON-serializable value.
        """
        self.response.headers['Content-Type'] = 'application/json'
        self.response.out.write(json.dumps(data))
        return self.response
25 | 


--------------------------------------------------------------------------------
/oreilly/handlers/serializers.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | """Books - Serializers."""
 5 | 
 6 | # Models
 7 | from ..models.books import Category
 8 | 
 9 | 
def _linked_name(key):
    """Return the ``name`` of the entity behind *key*, or None.

    None is returned both when no key is set and when the key no longer
    resolves to a stored entity (``key.get()`` returns None), e.g. while the
    categories are being rebuilt.
    """
    if not key:
        return None
    entity = key.get()
    if entity is None:
        return None
    return entity.name


def book_serializer(book):
    """Given a book object, return a python dictionary.

    Args:
        book: object exposing the Book attributes (``title``, ``url``,
            ``thumbnail``, ``description``, ``pdf``, ``mobi``, ``epub``,
            ``category`` and ``subcategory``); the last two are keys of
            entities having a ``name`` attribute, or None.

    Returns:
        A JSON-serializable dict. ``category`` and ``subcategory`` hold the
        referenced entity's name, or None when the reference is missing or
        dangling.
    """
    return {
        'title': book.title,
        'url': book.url,
        'thumbnail': book.thumbnail,
        'description': book.description,
        'pdf': book.pdf,
        'mobi': book.mobi,
        'epub': book.epub,
        'category': _linked_name(book.category),
        'subcategory': _linked_name(book.subcategory),
    }
23 | 


--------------------------------------------------------------------------------
/oreilly/cron_jobs/routes.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | """Cron jobs routes module."""
 5 | 
 6 | # WebApp 2
 7 | from webapp2_extras.routes import RedirectRoute
 8 | 
 9 | # Handlers
10 | from . import handlers
11 | 
12 | 
# Routes served for scheduled jobs. With strict_slash=True the URL variant
# with a different trailing slash is redirected to the template path.
_routes = [
    RedirectRoute('/jobs/retrieve_books',
                  handler=handlers.RetrieveBooksHandler,
                  name='jobs__retrieve_books',
                  strict_slash=True),
]
23 | 
24 | 
def get_routes():
    """Return the cron jobs route list.

    The module-level list itself is returned, not a copy.
    """
    return _routes
28 | 
29 | 
def add_routes(app):
    """Register every cron jobs route on the router of *app*."""
    for route in _routes:
        app.router.add(route)
34 | 


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | u"""O’Reilly's Free Books API.
 5 | 
 6 | REST API exposing scraped data from O’Reilly's
 7 | Free Books website (http://www.oreilly.com/programming/free/).
 8 | """
 9 | 
10 | # Python libraries
11 | import os
12 | import sys
13 | 
14 | # Third party libraries
15 | sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'oreilly'))
16 | 
17 | # WebApp 2
18 | import webapp2
19 | 
20 | # Routes
21 | from config import routes
22 | from oreilly.cron_jobs import routes as cron_jobs_routes
23 | 
# App initialization: the API routes are registered first, then the cron job
# routes. Debug mode is enabled only when the DEBUG environment variable
# equals 'dev'.
app = webapp2.WSGIApplication(
    routes=routes.get_routes() + cron_jobs_routes.get_routes(),
    debug=os.environ.get('DEBUG', 'prod') == 'dev',
)
29 | 


--------------------------------------------------------------------------------
/oreilly/handlers/api.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | """Handlers - API module."""
 5 | 
 6 | from .base import APIHandler
 7 | 
 8 | 
 9 | class MenuHandler(APIHandler):
10 |     """Menu Handler."""
11 | 
12 |     def get(self):
13 |         """Return API available methods."""
14 |         base_url = self.request.host_url + '{endpoint}'
15 |         return self.reponse([
16 |             {
17 |                 'name': 'API endpoints list',
18 |                 'allowed_methods': ['GET'],
19 |                 'url': base_url.format(endpoint='/')
20 |             },
21 |             {
22 |                 'name': 'Books list',
23 |                 'allowed_methods': ['GET'],
24 |                 'url': base_url.format(endpoint='/books')
25 |             }
26 |         ])
27 | 


--------------------------------------------------------------------------------
/config/routes.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | """Main routes module."""
 5 | 
 6 | # WebApp 2
 7 | from webapp2_extras.routes import RedirectRoute
 8 | 
 9 | from oreilly.handlers import api as api_handlers
10 | from oreilly.handlers import books as books_handlers
11 | 
# Public API routes. With strict_slash=True the URL variant with a different
# trailing slash is redirected to the template path.
_routes = [
    RedirectRoute('/',
                  handler=api_handlers.MenuHandler,
                  name='menu',
                  strict_slash=True),
    RedirectRoute('/books',
                  handler=books_handlers.BooksListHandler,
                  name='books_list',
                  strict_slash=True),
]
29 | 
30 | 
def get_routes():
    """Return the main route list.

    The module-level list itself is returned, not a copy.
    """
    return _routes
34 | 
35 | 
def add_routes(app):
    """Register every main route on the router of *app*."""
    for route in _routes:
        app.router.add(route)
40 | 


--------------------------------------------------------------------------------
/oreilly/models/books.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | """Models - Books."""
 5 | 
 6 | # App Engine Extensions
 7 | from google.appengine.ext import ndb
 8 | 
 9 | 
class Category(ndb.Model):
    """Datastore model shared by categories and subcategories.

    The two boolean flags record which of the two roles an entity plays;
    both default to False.
    """

    # Name of the category or subcategory.
    name = ndb.StringProperty()
    # Role flag for categories.
    category = ndb.BooleanProperty(default=False)
    # Role flag for subcategories.
    subcategory = ndb.BooleanProperty(default=False)
16 | 
17 | 
class Book(ndb.Model):
    """Book reference model class.

    Stores the metadata of one book together with references to the
    Category entities it is filed under.
    """

    # References to Category entities; both are optional.
    category = ndb.KeyProperty(kind=Category)
    subcategory = ndb.KeyProperty(kind=Category)

    title = ndb.StringProperty(required=True)
    url = ndb.StringProperty(required=True)

    thumbnail = ndb.StringProperty()
    # Free-form text: TextProperty is unindexed and has no length limit,
    # whereas an indexed StringProperty rejects values over 1500 bytes.
    description = ndb.TextProperty()

    # Download URLs
    pdf = ndb.StringProperty()
    mobi = ndb.StringProperty()
    epub = ndb.StringProperty()
34 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2016 Inventive
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 0
23 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | 
 6 | # C extensions
 7 | *.so
 8 | 
 9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | 
27 | # PyInstaller
28 | #  Usually these files are written by a python script from a template
29 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 | 
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 | 
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 | 
48 | # Translations
49 | *.mo
50 | *.pot
51 | 
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 | 
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 | 
60 | # Scrapy stuff:
61 | .scrapy
62 | 
63 | # Sphinx documentation
64 | docs/_build/
65 | 
66 | # PyBuilder
67 | target/
68 | 
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 | 
72 | # pyenv
73 | .python-version
74 | 
75 | # celery beat schedule file
76 | celerybeat-schedule
77 | 
78 | # dotenv
79 | .env
80 | 
81 | # virtualenv
82 | venv/
83 | ENV/
84 | 
85 | # Spyder project settings
86 | .spyderproject
87 | 
88 | # Rope project settings
89 | .ropeproject
90 | 
91 | # OS X
92 | .DS_Store
93 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # O'Reilly free programming books - Web API
 2 | 
 3 | This project was inspired by [erikcaffrey's](https://github.com/erikcaffrey/api-oreilly-free-books/tree/api-oreilly-free-v1) web application project,
 4 | used to expose [O'Reilly free programming ebooks](http://www.oreilly.com/programming/free/) via a web API.
 5 | 
 6 | ## How does it work?
 7 | 
 8 | It is written in Python 2.7 using webapp2 and it's designed to be executed on [**Google App Engine**](https://cloud.google.com/appengine/).
 9 | It also makes use of [**Google Cloud Datastore**](https://cloud.google.com/datastore/) to persist books data, and there is
10 | a [**cron job**](https://cloud.google.com/appengine/docs/python/config/cron) scheduled to run every hour that goes to O'Reilly's website
11 | and updates the books data by web scraping.
12 | 
13 | That's basically it.
14 | 
15 | ## About the API
16 | 
17 | It is hosted on [https://oreilly-api.appspot.com/](https://oreilly-api.appspot.com/) and the main resource is
18 | [**/books**](https://oreilly-api.appspot.com/books), which responds to a GET request with a
19 | JSON list of all available books. Each list element will look like this:
20 | 
21 | | Field       | Type        | Description                                                       |
22 | |-------------|-------------|-------------------------------------------------------------------|
23 | | title       | String      | Book title                                                        |
24 | | url         | String      | Book details URL                                                  |
25 | | thumbnail   | String      | Book thumbnail URL                                                |
26 | | description | String      | Book description                                                  |
27 | | category    | String      | Book category                                                     |
28 | | subcategory | String/null | Book subcategory. This can be either the subcategory name or null |
29 | | pdf         | String/null | Download URL (PDF). This can be either null or a URL              |
30 | | mobi        | String/null | Download URL (MOBI). This can be either null or a URL             |
31 | | epub        | String/null | Download URL (ePub). This can be either null or a URL             |
32 | 
33 | ## Usage
34 | These instructions will get you a copy of the project up and running on your local machine for development
35 | and testing purposes. See deployment for notes on how to deploy the project on a live system.
36 | 
37 | 1. Install Google App Engine SDK for Python from [https://cloud.google.com/appengine/downloads](https://cloud.google.com/appengine/downloads)
38 | 2. Install lxml in your local environment with: `$ pip install lxml==2.3`
39 | 3. Run with: `$ dev_appserver.py .`
40 | 
41 | ## Deployment
42 | Once you have set up your gcloud CLI in your local environment as well as your project in Google App Engine's console,
43 | you can simply run this for deployment.
44 | 
45 | ```
46 | $ gcloud app deploy
47 | ```
48 | 
49 | ## Built with
50 | 
51 | * [lxml](http://lxml.de/): Used for HTML processing
52 | * [webapp2](https://webapp2.readthedocs.io/): Python Web micro-framework.
53 | 
54 | ## Contributors
55 | * [erikcaffrey](https://github.com/erikcaffrey) (Erik Jhordan Rey)
56 | * [pablotrinidad](https://github.com/pablotrinidad) (Pablo Trinidad)
57 | 
58 | ## License
59 | This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details
60 | 


--------------------------------------------------------------------------------
/oreilly/cron_jobs/handlers.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | """Cron Jobs module."""
  5 | 
  6 | # Google App Engine
  7 | from google.appengine.ext import ndb
  8 | 
  9 | # Handlers
 10 | from ..handlers.base import APIHandler
 11 | 
 12 | # Models
 13 | from ..models.books import Category, Book
 14 | 
 15 | # Utilities
 16 | from lxml import html
 17 | import urllib2
 18 | 
 19 | 
 20 | class RetrieveBooksHandler(APIHandler):
 21 |     """Menu Handler."""
 22 | 
 23 |     source_url = 'http://www.oreilly.com/programming/free/'
 24 | 
 25 |     def append_book(self, book, category, subcategory=None):
 26 |         """Add book to datastore."""
 27 |         # Get properties by attribute value.
 28 |         title = book.get('title', '')
 29 |         url = book.get('href', '').split('?')[0]
 30 |         description = book.get('data-content', '')
 31 |         thumbnail = book.xpath('.//img')[0].get('src', '')
 32 | 
 33 |         # NOQA: If URL ends with .csp, will replace it for the corresponding extension.
 34 |         if url.endswith('.csp'):
 35 |             pdf = url.replace('.csp', '.pdf')
 36 |             epub = url.replace('.csp', '.epub')
 37 |             mobi = url.replace('.csp', '.mobi')
 38 |         # Else, will set to None
 39 |         else:
 40 |             pdf = None
 41 |             epub = None
 42 |             mobi = None
 43 | 
 44 |         # Create Book entity.
 45 |         book = Book(
 46 |             title=title,
 47 |             url=url,
 48 |             description=description,
 49 |             category=category.key,
 50 |             thumbnail=thumbnail,
 51 |             pdf=pdf,
 52 |             epub=epub,
 53 |             mobi=mobi,
 54 |         )
 55 |         if subcategory:
 56 |             book.subcategory = subcategory.key
 57 |         book.put()
 58 | 
 59 |     def get(self):
 60 |         """Return API available methods."""
 61 |         # Remove old data
 62 |         cs = Category.query()
 63 |         ndb.delete_multi([x.key for x in cs])
 64 |         bs = Book.query()
 65 |         ndb.delete_multi([x.key for x in bs])
 66 | 
 67 |         # Count variables
 68 |         categories_count = 0
 69 |         subcategories_count = 0
 70 |         books_count = 0
 71 | 
 72 |         # NOQA: lxml tree from web page response and search for category boxes.
 73 |         tree = html.fromstring(urllib2.urlopen(self.source_url).read())
 74 |         category_boxes = tree.xpath('//div[@class="callout-row"]')
 75 | 
 76 |         for category_box in category_boxes:
 77 | 
 78 |             # Get box title by filtering all H3 tags without attributes.
 79 |             category_name = category_box.xpath('.//h3[not(@*)]/text()')[0]
 80 | 
 81 |             # Create category entity and update counters.
 82 |             category = Category(name=category_name, category=True, subcategory=False)  # NOQA
 83 |             category.put()
 84 |             categories_count += 1
 85 | 
 86 |             # Get subcategories by looking for all divs with the right class.
 87 |             subcategories = category_box.xpath('.//div[not(@class="product-row cover-showcase")]')  # NOQA
 88 |             book_expression = './/a'
 89 | 
 90 |             # If categories, iterate each and append book with subcategory
 91 |             for subcategory_box in subcategories:
 92 |                 # Ger subcategory title by filtering the first H3 tag.
 93 |                 subcategory_name = subcategory_box.xpath('.//h3/text()')[0]
 94 | 
 95 |                 # Create category entity and update counters.
 96 |                 subcategory = Category(name=subcategory_name, category=False, subcategory=True)  # NOQA
 97 |                 subcategory.put()
 98 |                 subcategories_count += 1
 99 | 
100 |                 books = subcategory_box.xpath(book_expression)
101 |                 for book in books:
102 |                     self.append_book(book, category, subcategory)
103 |                     books_count += 1
104 | 
105 |             # Else, get books and append each with category only
106 |             if not subcategories:
107 |                 books = category_box.xpath(book_expression)
108 |                 for book in books:
109 |                     self.append_book(book, category)
110 |                     books_count += 1
111 | 
112 |         return self.reponse({
113 |             'status': 'ok',
114 |             'books_added': books_count,
115 |             'categories_added': categories_count,
116 |             'subcategories_added': subcategories_count,
117 |         })
118 | 


--------------------------------------------------------------------------------