├── pocket_tagger
    ├── __init__.py
    ├── logger.py
    ├── pocket_api_client.py
    ├── scraper.py
    ├── pocket_tagger.py
    └── language_service_client.py
├── LICENSE
├── examples
    └── example-1.py
├── setup.py
├── .gitignore
└── README.md


/pocket_tagger/__init__.py:
--------------------------------------------------------------------------------
1 | from .pocket_tagger import PocketTagger
2 | 


--------------------------------------------------------------------------------
/pocket_tagger/logger.py:
--------------------------------------------------------------------------------
1 | import logging
2 | 
class Log:
    """Factory for the loggers used by the package modules."""

    @classmethod
    def get_logger(cls, name):
        """Return the logger registered under ``name``.

        ``logging.basicConfig`` is invoked with level INFO on every call
        before the logger is looked up.
        """
        logging.basicConfig(level=logging.INFO)
        named_logger = logging.getLogger(name)
        return named_logger
8 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019 Harshit Sanghvi
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/pocket_tagger/pocket_api_client.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import logging
 3 | 
 4 | from pocket import Pocket, PocketException
 5 | 
 6 | from .logger import Log
 7 | 
 8 | logger = Log.get_logger(__name__)
 9 | 
class PocketAPIClient:
    """Small wrapper around the ``pocket`` client for fetching articles and tagging them."""

    pocket_client = None

    def __init__(self, consumer_key, access_token):
        """Create the underlying ``Pocket`` client from the API credentials."""
        self.pocket_client = Pocket(consumer_key, access_token)

    def get_articles_data(self, *args, **kwargs):
        """Fetch articles from Pocket.

        All arguments are forwarded unchanged to ``Pocket.get``. For the list
        of optional parameters the API supports see
        https://getpocket.com/developer/docs/v3/retrieve

        Returns:
            The ``list`` entry of the response. An empty dict is returned when
            the request fails or when the entry is missing or empty, so callers
            can always iterate the result with ``.items()``.
        """
        try:
            response, _headers = self.pocket_client.get(*args, **kwargs)
        except PocketException as e:
            logger.error(e)
            return {}
        # Any falsy payload (missing key, empty dict, empty list) becomes {}.
        return response.get('list') or {}

    def add_tags_to_articles(self, articles_with_tags, replace=False):
        """Send the tags of every article to Pocket as one bulk request.

        Args:
            articles_with_tags: Mapping of Pocket item id to article data. The
                tags are read from each article's ``tags`` entry.
            replace: When true, queue ``tags_replace`` actions instead of
                ``tags_add``. Articles without tags are skipped in both modes,
                so existing tags are never wiped because nothing was found.
        """
        if not articles_with_tags:
            return
        try:
            queued = 0
            pocket_instance = self.pocket_client
            # Start a bulk operation: each call queues one action and hands
            # back the client to chain the next one on.
            for item_id, data in articles_with_tags.items():
                tags = data.get('tags')
                if not tags:
                    continue
                if replace:
                    pocket_instance = pocket_instance.tags_replace(item_id, tags)
                else:
                    pocket_instance = pocket_instance.tags_add(item_id, tags)
                queued += 1

            if not queued:
                logger.info('No tags to add.')
                return

            # and commit
            self.pocket_client.commit()
            logger.info('Added the tags to articles.')
        except PocketException as e:
            logger.error(e)
39 | 


--------------------------------------------------------------------------------
/examples/example-1.py:
--------------------------------------------------------------------------------
 1 | import datetime
 2 | today = datetime.date.today()
 3 | 
 4 | from pocket_tagger.pocket_tagger import PocketTagger
 5 | 
 6 | def tag_em():
 7 |     try:
 8 |         # Google Cloud - Enable Natural Language Processing API for a project, and get your service account API key.
 9 |         # Save it as gcloud_credentials_file.json
10 | 
11 |         # Pocket API - Create a credentials.py file with the following lines
12 |         '''
13 |         pocket_credentials = {
14 |             'consumer_key': 'your-consumer-key',
15 |             'access_token': 'your-access-token'
16 |         }
17 |         '''
18 |         from credentials import pocket_credentials
19 | 
20 |         tagger = PocketTagger(gcloud_credentials_file='gcloud_credentials_file.json',
21 |                         consumer_key=pocket_credentials.get('consumer_key'),
22 |                         access_token=pocket_credentials.get('access_token'))
23 | 
24 |         # For list of optional parameters the API supports - https://getpocket.com/developer/docs/v3/retrieve
25 |         articles = tagger.get_articles_from_api(count=10, offset=10, detailType='complete')
26 |         # Alternatively you can load the articles from file if you saved them previously using save_articles_to_file
27 |         # articles = tagger.get_articles_from_file('20190621.json')
28 | 
29 |         # Generate tags for each article
30 |         articles_with_tags = tagger.get_tags_for_articles(articles)
31 | 
32 |         # Save the articles with tags to file
33 |         tagger.save_articles_to_file(today.strftime('%Y%m%d-with-tags.json'), articles_with_tags)
34 | 
35 |         # You can skip this step if you want to do a dry run. Verify the tags in the file we generated in the previous step.
36 |         tagger.add_tags_to_articles(articles_with_tags)
37 | 
38 |     except Exception as e:
39 |         print(e)
40 | 
41 | tag_em()
42 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import subprocess
 2 | import sys
 3 | import os
 4 | 
 5 | from os import path
 6 | from io import open
 7 | from setuptools import setup
 8 | 
 9 | # io.open is needed for projects that support Python 2.7
10 | # It ensures open() defaults to text mode with universal newlines,
11 | # and accepts an argument to specify the text encoding
12 | # Python 3 only projects can skip this import
13 | from io import open
14 | 
here = path.abspath(path.dirname(__file__))

# Get the long description from the README file
with open(path.join(here, 'README.md'), encoding='utf-8') as f:
    long_description = f.read()

# When bumping the version
# 1. Update version number in this file
# 2. Generate package tar file - python setup.py sdist
# 3. Publish the package - twine upload dist/*
# 4. Tag the commit with same version number and push the tag to github

setup(
    name='pocket-tagger',
    version='0.1.1',
    description='Tag your pocket articles from getpocket.com automatically using NLP',
    long_description=long_description,
    long_description_content_type='text/markdown',
    url='https://github.com/sanghviharshit/pocket-tagger',
    author='Harshit Sanghvi',
    author_email='hello@sanghviharshit.com',
    classifiers=[
        'Development Status :: 4 - Beta',
        'Intended Audience :: Developers',
        'Intended Audience :: End Users/Desktop',
        'Operating System :: OS Independent',
        'Topic :: Software Development :: Libraries :: Python Modules',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.4',
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
    ],
    license='MIT',
    keywords='getpocket, pocket, api, articles, automatic, suggested, tag, natural language processing, nlp',
    packages=['pocket_tagger'],
    install_requires=[
        # Provides ``google.cloud.language``. The package uses the
        # ``language.enums`` / ``language.types`` modules of the 1.x releases.
        'google-cloud-language<2',
        'pocket',
        'requests',
        # Distribution that ships the ``bs4`` module.
        'beautifulsoup4',
        # Parser backend the scraper asks for: BeautifulSoup(html, 'html5lib').
        'html5lib',
    ],
    project_urls={
        'Bug Reports': 'https://github.com/sanghviharshit/pocket-tagger/issues',
        'Say Thanks!': 'https://saythanks.io/to/sanghviharshit',
        'Source': 'https://github.com/sanghviharshit/pocket-tagger',
    },
)
60 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | pip-wheel-metadata/
 24 | share/python-wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | 
 53 | # Translations
 54 | *.mo
 55 | *.pot
 56 | 
 57 | # Django stuff:
 58 | *.log
 59 | local_settings.py
 60 | db.sqlite3
 61 | db.sqlite3-journal
 62 | 
 63 | # Flask stuff:
 64 | instance/
 65 | .webassets-cache
 66 | 
 67 | # Scrapy stuff:
 68 | .scrapy
 69 | 
 70 | # Sphinx documentation
 71 | docs/_build/
 72 | 
 73 | # PyBuilder
 74 | target/
 75 | 
 76 | # Jupyter Notebook
 77 | .ipynb_checkpoints
 78 | 
 79 | # IPython
 80 | profile_default/
 81 | ipython_config.py
 82 | 
 83 | # pyenv
 84 | .python-version
 85 | 
 86 | # pipenv
 87 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 88 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 89 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 90 | #   install all needed dependencies.
 91 | #Pipfile.lock
 92 | 
 93 | # celery beat schedule file
 94 | celerybeat-schedule
 95 | 
 96 | # SageMath parsed files
 97 | *.sage.py
 98 | 
 99 | # Environments
100 | .env
101 | .venv
102 | env/
103 | venv/
104 | ENV/
105 | env.bak/
106 | venv.bak/
107 | 
108 | # Spyder project settings
109 | .spyderproject
110 | .spyproject
111 | 
112 | # Rope project settings
113 | .ropeproject
114 | 
115 | # mkdocs documentation
116 | /site
117 | 
118 | # mypy
119 | .mypy_cache/
120 | .dmypy.json
121 | dmypy.json
122 | 
123 | # Pyre type checker
124 | .pyre/
125 | 
126 | # OS
127 | .DS_Store
128 | 
129 | # Project specific
130 | data/
131 | scripts/
132 | *.json
133 | credentials.py
134 | 


--------------------------------------------------------------------------------
/pocket_tagger/scraper.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | import logging
 3 | 
 4 | from bs4 import BeautifulSoup
 5 | 
 6 | from .logger import Log
 7 | 
 8 | logger = Log.get_logger(__name__)
 9 | 
class Scraper:
    """Downloads a web page and extracts its title, description and visible text."""

    # User agent sent with every page request.
    user_agent = 'Mozilla/5.0 (sanghviharshit/Auto Pocket tagger) Chrome/18 Safari/535.19'

    def get_webpage_content(self, url, timeout=30):
        """Download ``url`` and return its title, description and cleaned text.

        Args:
            url: Address of the page to download.
            timeout: Seconds passed to ``requests.get`` so an unresponsive
                server cannot block the run indefinitely.

        Returns:
            dict with the keys ``title``, ``description`` and ``text``. Title
            and description are empty strings when they cannot be extracted.

        Errors raised by ``requests.get`` or the HTML parser are not caught
        here and propagate to the caller.
        """
        # Make the request and build a BeautifulSoup object from the HTML
        response = requests.get(url, headers={'User-Agent': self.user_agent}, timeout=timeout)
        soup = BeautifulSoup(response.text, 'html5lib')

        title = ''
        description = ''
        # Best effort: a failure here must not prevent the text extraction.
        try:
            title = self._extract_title(soup)
            logger.info('         Title: {}'.format(title))

            description = self._extract_description(soup)
            logger.info('         Description: {}'.format(description))
        except Exception as e:
            logger.warning('         ({}) Could not find title/description. {}'.format(url, e))

        return {
            'title': title,
            'description': description,
            'text': self.get_clean_text(soup),
        }

    def _extract_title(self, soup):
        """Return the text of ``<title>``, else of the first ``<h1>``, else ''."""
        if soup.title:
            return soup.title.get_text()
        if soup.h1:
            return soup.h1.get_text()
        return ''

    def _extract_description(self, soup):
        """Return the description meta tag's content, else the first ``<h2>`` text, else ''."""
        description = ''
        meta = soup.find('meta', attrs={'name': 'description'})
        # Pages without the meta tag must still reach the <h2> fallback.
        if meta is not None:
            description = meta.attrs.get('content') or ''
        if not description and soup.h2:
            description = soup.h2.get_text()
        return description

    def get_clean_text(self, soup):
        """Return the body text of ``soup`` with scripts, styles and blank lines removed.

        The ``<script>`` and ``<style>`` elements are removed from ``soup``
        itself. An empty string is returned when the document has no body.
        """
        # kill all script and style elements
        for element in soup(['script', 'style']):
            element.decompose()

        body = soup.body
        if body is None:
            return ''

        # break into lines and remove leading and trailing space on each
        lines = (line.strip() for line in body.get_text().splitlines())
        # break multi-headlines (separated by double spaces) into a line each
        chunks = (phrase.strip() for line in lines for phrase in line.split('  '))
        # drop blank lines
        return '\n'.join(chunk for chunk in chunks if chunk)
69 | 


--------------------------------------------------------------------------------
/pocket_tagger/pocket_tagger.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import time
 3 | import random
 4 | import logging
 5 | 
 6 | from .pocket_api_client import PocketAPIClient
 7 | from .scraper import Scraper
 8 | from .language_service_client import LanguageServiceClient
 9 | from .logger import Log
10 | 
11 | logger = Log.get_logger(__name__)
12 | 
class PocketTaggerException(Exception):
    """Error raised by PocketTagger, e.g. when the Pocket credentials are missing."""
15 | 
class PocketTagger:
    """Generates tags for Pocket articles and writes them back to Pocket.

    Combines the Pocket API client, the web page scraper and the language
    service client.
    """

    pocket_client = None

    def __init__(self, consumer_key=None, access_token=None, gcloud_credentials_file=None):
        """Set up the scraper, the language client and, if possible, the Pocket client.

        The Pocket client is only created when both ``consumer_key`` and
        ``access_token`` are given.
        """
        if consumer_key is not None and access_token is not None:
            self.pocket_client = self.get_pocket_client(consumer_key, access_token)
        self.scraper = Scraper()
        self.language_service_client = LanguageServiceClient(gcloud_credentials_file)

    def get_pocket_client(self, consumer_key=None, access_token=None):
        """Return the Pocket API client, creating and remembering it on first use.

        Raises:
            PocketTaggerException: If no client exists yet and a credential is missing.
        """
        if self.pocket_client:
            return self.pocket_client
        if consumer_key is None or access_token is None:
            raise PocketTaggerException(
                'Pocket consumer_key and access_token are required to use the Pocket API')
        self.pocket_client = PocketAPIClient(consumer_key, access_token)
        return self.pocket_client

    def get_articles_from_api(self, *args, **kwargs):
        """Fetch articles from Pocket; all arguments are forwarded to the API client."""
        return self.get_pocket_client().get_articles_data(*args, **kwargs)

    def add_tags_to_articles(self, articles_with_tags, replace=False):
        """Send the ``tags`` of each article to Pocket; ``replace`` is forwarded to the API client."""
        self.get_pocket_client().add_tags_to_articles(articles_with_tags, replace=replace)

    def get_articles_from_file(self, fileName):
        """Load articles from a JSON file; any failure is logged and yields ``{}``."""
        try:
            with open(fileName, 'r', encoding='utf-8') as infile:
                return json.load(infile)
        except Exception as e:
            logger.error('({}) {}'.format(fileName, e))
            return {}

    def save_articles_to_file(self, file_name, articles):
        """Write ``articles`` as JSON to ``file_name``."""
        with open(file_name, 'w', encoding='utf-8') as outfile:
            json.dump(articles, outfile)

    def get_tags_for_articles(self, articles, *args, **kwargs):
        """Scrape every article and store the generated tags under its ``tags`` key.

        Args:
            articles: Mapping of Pocket item id to article data. The URL is
                read from each article's ``given_url`` entry.
            *args, **kwargs: Forwarded to the language client's
                ``get_tags_from_webpage_content`` (e.g. threshold overrides).

        Returns:
            The same mapping, updated in place. An article whose page cannot
            be fetched or analysed gets an empty tag list. ``{}`` is returned
            when there are no articles.
        """
        if not articles:
            logger.warning('No articles fetched from Pocket')
            return articles or {}

        total_articles = len(articles)
        for index, data in enumerate(articles.values(), start=1):
            url = data.get('given_url')
            tags = []

            try:
                logger.info('({}/{}) {}'.format(index, total_articles, url))
                webpage_content = self.scraper.get_webpage_content(url)
                if webpage_content:
                    tags = self.language_service_client.get_tags_from_webpage_content(webpage_content, *args, **kwargs)
            except Exception as e:
                # One unreachable or unparsable page must not abort the batch.
                logger.error('         ({}) {}'.format(url, e))

            if tags:
                logger.info('         Tags: {}'.format(', '.join(tags)))
            else:
                logger.warning('         ({}) No Tags found'.format(url))
            data['tags'] = tags

        return articles
81 | 


--------------------------------------------------------------------------------
/pocket_tagger/language_service_client.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import os
 3 | import math
 4 | import logging
 5 | 
 6 | from google.cloud import language
 7 | 
 8 | from .logger import Log
 9 | 
10 | logger = Log.get_logger(__name__)
11 | 
class LanguageServiceClient:
    """Derives tags from web page content with the Google Cloud Natural Language API."""

    # An entity / category is kept only when its score is strictly above these.
    entity_salience_threshold = 0.7
    category_confidence_threshold = 0.3
    # Upper bound, in UTF-8 bytes, on the text sent for classification.
    max_content_bytes = 128000

    def __init__(self, crendentials_file=None):
        """Create the Google Cloud language client.

        Args:
            crendentials_file: Optional path to a service account key file;
                when given it is exported as GOOGLE_APPLICATION_CREDENTIALS.
                (The parameter name keeps its original spelling so existing
                keyword callers continue to work.)
        """
        if crendentials_file:
            os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = crendentials_file

        # Initialize google cloud language service client
        self.client = language.LanguageServiceClient()

    def get_tags_from_webpage_content(self, webpage_content, thresholds=None):
        """Return the de-duplicated entity and category tags for a page.

        Args:
            webpage_content: dict with the keys ``title``, ``description`` and ``text``.
            thresholds: Optional dict that may contain
                ``entity_salience_threshold`` and/or
                ``category_confidence_threshold``. Values given here are
                stored on this instance and stay in effect for later calls.

        Returns:
            list of tags, entities first, in first-seen order.
        """
        self._apply_thresholds(thresholds)

        entities = self.get_entities_from_content(webpage_content)
        categories = self.get_categories_from_content(webpage_content)
        return list(dict.fromkeys(entities + categories))  # Remove Duplicates

    def _apply_thresholds(self, thresholds):
        """Store the threshold overrides found in ``thresholds`` on this instance."""
        overrides = thresholds or {}
        for name in ('entity_salience_threshold', 'category_confidence_threshold'):
            value = overrides.get(name)
            # Compare against None so that an explicit threshold of 0 is honoured.
            if value is not None:
                setattr(self, name, value)

    @staticmethod
    def _truncate_to_bytes(text, max_bytes):
        """Return ``text`` cut so that its UTF-8 encoding is at most ``max_bytes`` long."""
        encoded = text.encode('utf-8', errors='replace')
        if len(encoded) <= max_bytes:
            return text
        # A multi-byte character split by the cut is dropped, not garbled.
        return encoded[:max_bytes].decode('utf-8', errors='ignore')

    def get_categories_from_content(self, webpage_content):
        """Classify the title, description and text and return the category labels.

        Every category whose confidence exceeds
        ``category_confidence_threshold`` contributes each '/'-separated part
        of its name as one label.
        """
        doc_content = '. '.join([webpage_content['title'], webpage_content['description'], webpage_content['text']])
        doc_content = self._truncate_to_bytes(doc_content, self.max_content_bytes)

        document = language.types.Document(
            content=doc_content,
            # language='en',
            type=language.enums.Document.Type.PLAIN_TEXT,
            # type=language.enums.Document.Type.HTML,
        )

        response = self.client.classify_text(document)

        categories = []
        logger.debug('         Categories: ')
        for category in response.categories:
            accepted = category.confidence > self.category_confidence_threshold
            if accepted:
                categories.extend(label for label in category.name.split('/') if label)
            # 'X' marks candidates rejected for being at or below the threshold.
            logger.debug('            {} {}: {}'.format(' ' if accepted else 'X', category.name, category.confidence))
        return categories

    def get_entities_from_content(self, webpage_content):
        """Analyse the title and description and return the salient entity names.

        Every entity whose salience exceeds ``entity_salience_threshold`` is
        returned, title-cased.
        """
        document = language.types.Document(
            content='. '.join([webpage_content['title'], webpage_content['description']]),
            # language='en',
            type=language.enums.Document.Type.PLAIN_TEXT,
        )
        response = self.client.analyze_entities(
            document=document,
            encoding_type='UTF32',
        )

        entities = []
        logger.debug('         Entities: ')
        for entity in response.entities:
            accepted = entity.salience > self.entity_salience_threshold
            entity_name = entity.name.title()
            if accepted:
                entities.append(entity_name)
            logger.debug('            {} {}: {}'.format(' ' if accepted else 'X', entity_name, entity.salience))
        return entities
82 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Auto Pocket Tagger
  2 | 
  3 | Use Google cloud's Natural Language Processing API to automatically analyze the webpage from articles saved in your Pocket list, derive tags/keywords based on the content of the page, and add tags to the articles in Pocket list for free.
  4 | 
  5 | > Pocket offers a suggested-tags service on its paid Premium plan. You can find more about it [here](https://help.getpocket.com/article/906-pocket-premium-suggested-tags). It still requires the manual work of adding the tags to each article one by one. This package automates all of it for free.
  6 | 
  7 | ## Features
  8 | - Uses [Python wrapper](https://github.com/tapanpandita/pocket) for [Pocket API](http://getpocket.com/api/docs) to retrieve articles in the `My List`
  9 | - Uses [Beautiful Soup](https://www.crummy.com/software/BeautifulSoup/) to scrape webpages
 10 | - Uses Google Cloud's [Natural Language Processing API](https://cloud.google.com/natural-language/) to generate list of categories and entities from the content of the webpage
 11 | - Uses Pocket API to add tags to articles in your `My List`
 12 | 
 13 | 
 14 | ## Usage
 15 | 
 16 | ### Installation
 17 | 
 18 | #### Install published version from pypi
 19 | ```shell
 20 | $ pip install pocket-tagger
 21 | ```
 22 | 
 23 | #### Install latest version from git
 24 | ```shell
 25 | $ pip install git+https://github.com/sanghviharshit/pocket-tagger
 26 | ```
 27 | 
 28 | 
 29 | ### Prerequisites
 30 | #### [Google Cloud](https://cloud.google.com/natural-language/docs/quickstart)
 31 | 
 32 | This package relies on the Google Cloud Natural Language API, which requires billing to be enabled on your project.
 33 | You can find the quickstart instructions [here](https://cloud.google.com/natural-language/docs/quickstart).
 34 | **Options:**
 35 | 1. Create a service account and download the credentials file - https://cloud.google.com/video-intelligence/docs/common/auth
 36 | ```python
 37 | tagger = PocketTagger(gcloud_credentials_file="gcloud_credentials_file.json")
 38 | ```
 39 | 2. or configure gcloud locally - https://cloud.google.com/sdk/gcloud/reference/init
 40 | ```python
 41 | tagger = PocketTagger()
 42 | ```
 43 | 
 44 | #### [Pocket API](https://getpocket.com/developer/)
 45 | 
 46 | To fetch the list of articles and add tags, you need a developer key from [here](https://getpocket.com/developer/).
 47 | Create a new Application with `modify` and `retrieve` permissions. Save the Consumer Key and Access Token.
 48 | ```python
 49 | tagger = PocketTagger(consumer_key='your-consumer-key',
 50 |                 access_token='your-access-token')
 51 | ```
 52 | 
 53 | ### [Examples](./examples)
 54 | 
 55 | ```python
 56 | # Initialize PocketTagger with GCloud and Pocket API Credentials
 57 | tagger = PocketTagger(gcloud_credentials_file="gcloud_credentials_file.json",
 58 |                 consumer_key='pocket-consumer-key',
 59 |                 access_token='pocket-access-token')
 60 | 
 61 | # Check https://getpocket.com/developer/docs/v3/retrieve for additional list of options you can pass for retrieving pocket list
 62 | articles = tagger.get_articles_from_api(count=10, offset=10, detailType='complete')
 63 | 
 64 | # Alternatively you can load the articles from file if you saved them previously using save_articles_to_file
 65 | # articles = tagger.get_articles_from_file("20190621.json")
 66 | # Generate tags for each article
 67 | articles_with_tags = tagger.get_tags_for_articles(articles)
 68 | 
 69 | # Save the articles with tags to file. You can use this file to verify it looks good before running the final step to tag the articles.
 70 | tagger.save_articles_to_file(today.strftime('%Y%m%d-with-tags.json'), articles_with_tags)
 71 | 
 72 | # You can skip this step if you want to do a dry run. Verify the tags in the file we generated in the previous step.
 73 | tagger.add_tags_to_articles(articles_with_tags)
 74 | ```
 75 | 
 76 | ### Optional overrides
 77 | You can override the default thresholds for [entity](https://cloud.google.com/natural-language/docs/reference/rest/v1/Entity) salience
 78 | and [category](https://cloud.google.com/natural-language/docs/reference/rest/v1/ClassificationCategory) confidence:
 79 | 
 80 | ```python
 81 | thresholds = {
 82 |   'entity_salience_threshold': 0.7,
 83 |   'category_confidence_threshold': 0.3
 84 | }
 85 | articles_with_tags = tagger.get_tags_for_articles(articles, thresholds)
 86 | ```
 87 | 
 88 | ## Sample
 89 | 
 90 | Sample output from running it against my 490-item Pocket list
 91 | > `X` under Entities or Categories denotes candidates that the NLP client returned but that we skipped because they didn't meet the threshold. The last line, `Tags: abc, xyz`, lists the tags pocket-tagger added for each URL.
 92 | 
 93 | ```
 94 | (1/490) https://www.reddit.com/r/explainlikeimfive/comments/bvweym/eli5_why_do_coffee_drinkers_feel_more_clear/?utm_source=share&utm_medium=ios_app
 95 |          Title: ELI5: Why do coffee drinkers feel more clear headed after consuming caffeine? Why do some get a headache without it? Does caffeine cause any permanent brain changes and can the brain go back to 'normal' after years of caffeine use? : explainlikeimfive
 96 |          Description: r/explainlikeimfive: **Explain Like I'm Five is the best forum and archive on the internet for layperson-friendly explanations.** &nbsp; Don't Panic!
 97 |          Entities:
 98 |             X Coffee Drinkers: 0.2438652664422989
 99 |             X Eli5: 0.14941969513893127
100 |             X Caffeine: 0.12065556645393372
101 |             X Caffeine: 0.0874909833073616
102 |             X Some: 0.06917785853147507
103 |             X Headache: 0.0606028214097023
104 |             X Brain: 0.03606536239385605
105 |             X Explainlikeimfive: 0.033727116882801056
106 |             X Brain Changes: 0.03211209550499916
107 |             X Caffeine Use: 0.029848895967006683
108 |             X R: 0.02966366335749626
109 |             X Forum: 0.028598546981811523
110 |             X Internet: 0.022404097020626068
111 |             X Archive: 0.022404097020626068
112 |             X Explainlikeimfive: 0.017647551372647285
113 |             X Don'T Panic: 0.009302889928221703
114 |             X Five: 0.007013489492237568
115 |             X Five: 0.0
116 |          Categories:
117 |               /Food & Drink/Beverages/Coffee & Tea: 0.6700000166893005
118 |          Tags: Food & Drink, Beverages, Coffee & Tea
119 | (2/490) https://www.reddit.com/r/television/comments/bnpwe3/enjoy_three_full_minutes_of_the_cast_of_game_of/?utm_source=share&utm_medium=ios_app
120 |          Title: Enjoy three full minutes of the cast of 'Game of Thrones' expressing disappointment in Season 8. : television
121 |          Description: r/television:
122 |          Entities:
123 |             X Cast: 0.31218624114990234
124 |             X Disappointment: 0.20341947674751282
125 |             X Season: 0.20341947674751282
126 |             X Game Of Thrones: 0.13265934586524963
127 |             X Television: 0.08712445199489594
128 |             X Television: 0.06119102984666824
129 |             X 8: 0.0
130 |             X Three: 0.0
131 |          Categories:
132 |               /Arts & Entertainment/TV & Video/TV Shows & Programs: 0.75
133 |          Tags: Arts & Entertainment, TV & Video, TV Shows & Programs
134 | (3/490) https://www.reddit.com/r/homeautomation/comments/awvf5r/local_realtime_person_detection_for_rtsp_cameras/
135 |          Title: Local realtime person detection for RTSP cameras : homeautomation
136 |          Description: r/homeautomation: A subreddit focused on automating your home, housework or household activity. Sensors, switches, cameras, locks, etc. Any …
137 |          Entities:
138 |             X Realtime Person Detection: 0.3057926297187805
139 |             X Homeautomation: 0.15315502882003784
140 |             X Cameras: 0.14035314321517944
141 |             X Rtsp: 0.07461880147457123
142 |             X Homeautomation: 0.051411159336566925
143 |             X Home: 0.047811269760131836
144 |             X Housework: 0.04366889223456383
145 |             X Subreddit: 0.04183248057961464
146 |             X R: 0.04132793843746185
147 |             X Cameras: 0.032860007137060165
148 |             X Locks: 0.028899790719151497
149 |             X Household Activity: 0.012798599898815155
150 |             X Switches: 0.012735127471387386
151 |             X Sensors: 0.012735127471387386
152 |          Categories:
153 |               /Computers & Electronics: 0.7900000214576721
154 |          Tags: Computers & Electronics
155 | ```
156 | 
157 | ## References
158 | - [Pocket API Wrapper for Python](https://github.com/tapanpandita/pocket)
159 | - [Pocket API Docs](http://getpocket.com/api/docs)
160 | - [Google Cloud Natural Language Processing](https://cloud.google.com/natural-language/)
161 | - [Beautiful Soup](https://www.crummy.com/software/BeautifulSoup/)
162 | - [Complete list of content categories from Google Natural Language API](https://cloud.google.com/natural-language/docs/categories)
163 | 
164 | ## Analytics
165 | [![Analytics](https://ga-beacon.appspot.com/UA-59542024-4/pocket-tagger/)](https://github.com/igrigorik/ga-beacon)
166 | 


--------------------------------------------------------------------------------