├── .gitignore
├── Pipfile
├── indexing.py
├── readme.md
└── requirements.txt


/.gitignore:
--------------------------------------------------------------------------------
1 | Pipfile.lock
2 | *.db
3 | *.json
4 | urls.txt


--------------------------------------------------------------------------------
/Pipfile:
--------------------------------------------------------------------------------
 1 | [[source]]
 2 | url = "https://pypi.org/simple"
 3 | verify_ssl = true
 4 | name = "pypi"
 5 | 
 6 | [packages]
 7 | oauth2client = "*"
 8 | httplib2 = "*"
 9 | google-api-python-client = "*"
10 | sqlalchemy = "*"
11 | 
12 | [dev-packages]
13 | 
14 | [requires]
15 | python_version = "3.9"
16 | 


--------------------------------------------------------------------------------
/indexing.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | from datetime import datetime
  4 | from oauth2client.service_account import ServiceAccountCredentials
  5 | from googleapiclient.discovery import build
  6 | from googleapiclient.http import BatchHttpRequest
  7 | import httplib2
  8 | from sqlalchemy import create_engine, Boolean, Column, DateTime, Integer, String, func
  9 | from sqlalchemy.orm import sessionmaker
 10 | from sqlalchemy.orm import declarative_base
 11 | from datetime import datetime, timedelta
 12 | 
 13 | Base = declarative_base()
 14 | 
 15 | class URL(Base):
 16 |     __tablename__ = 'urls'
 17 | 
 18 |     id = Column(Integer, primary_key=True)
 19 |     url = Column(String, unique=True, nullable=False)
 20 |     last_submitted = Column(DateTime, nullable=True)
 21 |     index_checked_date = Column(DateTime, nullable=True)
 22 |     is_indexed = Column(Boolean, default=False)
 23 |     
 24 | class Quota(Base):
 25 |     __tablename__ = 'quota'
 26 | 
 27 |     id = Column(Integer, primary_key=True)
 28 |     date = Column(String, nullable=False)
 29 |     count = Column(Integer, nullable=False)
 30 | 
 31 | class Log(Base):
 32 |     __tablename__ = 'logs'
 33 | 
 34 |     id = Column(Integer, primary_key=True)
 35 |     timestamp = Column(DateTime, default=datetime.utcnow)
 36 |     log_type = Column(String, nullable=False)
 37 |     message = Column(String, nullable=False)
 38 | 
 39 | class BulkIndexer:
 40 |     def __init__(self):
 41 |         """
 42 |         Initialize the BulkIndexer class.
 43 |         """
 44 |         self.engine = create_engine('sqlite:///database.db')
 45 |         Base.metadata.create_all(self.engine)
 46 |         self.Session = sessionmaker(bind=self.engine)
 47 |         
 48 |         scopes = ['https://www.googleapis.com/auth/indexing']
 49 |         credentials = ServiceAccountCredentials.from_json_keyfile_name('credentials.json', scopes=scopes)
 50 |         http = credentials.authorize(httplib2.Http())
 51 |         self.service = build('indexing', 'v3', http=http)
 52 |         self.urls = self.load_urls()
 53 |         self.quota = self.get_quota()
 54 | 
 55 |     def index(self):
 56 |         """
 57 |         Index all URLs in the database, up to the daily quota limit.
 58 |         """
 59 |         batch = self.service.new_batch_http_request(callback=self.insert_event)
 60 |         for url in self.urls:
 61 |             if self.quota >= 200:
 62 |                 print('Quota exceeded. Please try again tomorrow.')
 63 |                 return
 64 |             batch.add(self.service.urlNotifications().publish(body={"url": url.url, "type": "URL_UPDATED"}))
 65 |         batch.execute()
 66 | 
 67 |     def load_urls(self, limit=200):
 68 |         """
 69 |         Load URLs from the database.
 70 |         """
 71 |         session = self.Session()
 72 |         urls = session.query(URL).limit(limit).all()
 73 |         session.close()
 74 |         return urls
 75 |     
 76 |     def load_unindexed_urls(self):
 77 |         """
 78 |         Load URLs from the database that have not been indexed.
 79 |         """
 80 |         session = self.Session()
 81 |         urls = session.query(URL).filter(URL.is_indexed == False).filter(URL.index_checked_date == None or URL.index_checked_date < datetime.now() - timedelta(days=7)).all()
 82 |         session.close()
 83 |         return urls
 84 | 
 85 |     def insert_event(self, request_id, response, exception):
 86 |         """
 87 |         Callback function for the batch request.
 88 |         """
 89 |         if exception is not None:
 90 |             self.log(str(exception), 'error')
 91 |         else:
 92 |             url = response['urlNotificationMetadata']['url']
 93 |             self.log(url, 'submitted')
 94 |             self.update_url(url, datetime.now())
 95 |             print(f'Requested indexing of {url}')
 96 |         self.update_quota()
 97 | 
 98 |     def remove_url(self, url):
 99 |         """
100 |         Remove a URL from the database.
101 |         """
102 |         session = self.Session()
103 |         try:
104 |             session.query(URL).filter_by(url=url).delete()
105 |             session.commit()
106 |         finally:
107 |             session.close()
108 | 
109 |     def update_url(self, url, last_submitted):
110 |         """
111 |         Update the last_submitted field for a given URL.
112 |         """
113 |         session = self.Session()
114 |         try:
115 |             session.query(URL).filter_by(url=url).update({'last_submitted': last_submitted})
116 |             session.commit()
117 |         finally:
118 |             session.close()
119 | 
120 |     def add_urls(self, new_urls):
121 |         """
122 |         Add new URLs to the database.
123 |         """
124 |         session = self.Session()
125 |         try:
126 |             for url in new_urls:
127 |                 exists = session.query(URL).filter_by(url=url).first()
128 |                 if not exists:
129 |                     session.add(URL(url=url))
130 |             session.commit()
131 |         finally:
132 |             session.close()
133 | 
134 |     def load_urls_from_file(self, file_path='urls.txt'):
135 |         """
136 |         Load URLs from a file, add them to the database, and then clear the file.
137 |         """
138 |         with open(file_path, 'r') as f:
139 |             urls = f.read().splitlines()
140 | 
141 |         self.add_urls(urls)
142 | 
143 |         with open(file_path, 'w') as f:
144 |             f.write('')
145 | 
146 |     def get_quota(self):
147 |         """
148 |         Get the number of URLs submitted today.
149 |         """
150 |         today = datetime.now().strftime('%Y-%m-%d')
151 |         session = self.Session()
152 |         quota = session.query(Quota).filter_by(date=today).first()
153 | 
154 |         if quota is None:
155 |             session.add(Quota(date=today, count=0))
156 |             session.commit()
157 |             return 0
158 |         else:
159 |             return quota.count
160 | 
161 |     def update_quota(self):
162 |         """
163 |         Increment the number of URLs submitted today.
164 |         """
165 |         today = datetime.now().strftime('%Y-%m-%d')
166 |         session = self.Session()
167 |         try:
168 |             quota = session.query(Quota).filter_by(date=today).first()
169 |             if quota:
170 |                 quota.count += 1
171 |             else:
172 |                 session.add(Quota(date=today, count=1))
173 |             session.commit()
174 |         finally:
175 |             session.close()
176 | 
177 |     def log(self, message, log_type):
178 |         """
179 |         Log a message to the database.
180 |         """
181 |         session = self.Session()
182 |         try:
183 |             new_log = Log(log_type=log_type, message=message)
184 |             session.add(new_log)
185 |             session.commit()
186 |         except Exception as e:
187 |             print(f"Failed to log message. Error: {e}")
188 |         finally:
189 |             session.close()
190 | 
191 |     def check_indexing(self):
192 |         """
193 |         Check URLs to see if they are indexed.
194 | 
195 |         @todo: implement a method to check the indexed status of the URLs.
196 |         
197 |         Options:
198 |         1. use the DataForSEO API and perform site: and inurl: searches
199 |         2. Build a custom scraper to check the indexed status of the URLs using scraperapi.com
200 |         """
201 |         pass
202 | 
203 | if __name__ == '__main__':
204 |     indexer = BulkIndexer()
205 |     
206 |     if len(sys.argv) < 2:
207 |         print("Usage: indexing.py <command> [<args>]")
208 |         print("Commands:")
209 |         print("   index     Run the indexing process")
210 |         print("   load      Load URLs from a given file to the DB")
211 |         sys.exit(1)
212 | 
213 |     command = sys.argv[1]
214 | 
215 |     if command == "index":
216 |         indexer.index()
217 |     elif command == "load":
218 |         if len(sys.argv) < 3:
219 |             print("Please provide a file name to load URLs from.")
220 |             sys.exit(1)
221 |         file_path = sys.argv[2]
222 |         indexer.load_urls_from_file(file_path)
223 |         print(f"URLs loaded from {file_path} into the database.")
224 |     else:
225 |         print(f"Unknown command: {command}")
226 | 


--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
 1 | # Bulk URL Indexer
 2 | 
 3 | This script can help you to submit URLs in bulk to the Google Indexing API. The API is free to use and you can use this tool to manage URLs, track the daily quota, and log submissions and errors.
 4 | 
 5 | ## Prerequisites
 6 | 
 7 | I use pipenv to manage my virtual environments. If you don't have it, you can find info on how to install it [here](https://pipenv.pypa.io/en/latest/installation/).
 8 | 
 9 | To install the dependencies, run the following command:
10 | 
11 | ```
12 | pipenv install
13 | ```
14 | 
15 | If you are using a different virtual environment manager, you can use the `requirements.txt` file to install the dependencies:
16 | 
17 | ```
18 | pip install -r requirements.txt
19 | ```
20 | 
21 | **Important**: Ensure you have the Google Service Account credentials and save them as `credentials.json` in the project directory.
22 | 
23 | Follow the instructions [in this thread](https://x.com/iannuttall/status/1706606218129928655?s=20) to make sure you have the correct permissions for the service account.
24 | 
25 | ## Usage
26 | 
27 | If you're using pipenv, enter a shell in the virtual environment:
28 | 
29 | ```
30 | pipenv shell
31 | ```
32 | 
33 | If you are using a different virtual environment manager, activate the virtual environment however you normally would.
34 | 
35 | ### 1. Adding URLs
36 | 
37 | To add URLs to the queue, run the following command:
38 | 
39 | ```
40 | python indexing.py load urls.txt
41 | ```
42 | 
43 | Make sure the text file contains one URL per line and exists in the project directory.
44 | 
45 | *Note*: This action will also clear the file after loading its URLs.
46 | 
47 | ### 2. Indexing URLs
48 | 
49 | To submit URLs to the Indexing API, run the following command:
50 | 
51 | ```
52 | python indexing.py index
53 | ```
54 | 
55 | This will submit URLs to the Indexing API until the daily quota is reached. If the quota is reached, the script will stop and the remaining URLs will be saved for the next day.
56 | 
57 | ## Database Overview
58 | 
59 | This script uses an SQLite database to manage:
60 | 
61 | - **URLs**: Track each URL, its indexed status, and submission details.
62 | - **Quota**: Daily counter of submitted URLs.
63 | - **Logs**: Detailed logs for every URL submission, ensuring you always know what's happening.
64 | 
65 | ## To Do
66 | 
67 | To make this script better, I'd like to add the following features:
68 | 
69 | - Ability to use more GCP projects and multiple service accounts to increase the daily quota.
70 | - Add option to check the index status of URLs before submitting them.
71 | - Get sitemaps from Search Console and use them to generate a list of URLs to submit.
72 | 
73 | ## Contributing
74 | 
75 | If you have any suggestions or ideas for improving this to make the internal linking suggestions more accurate, you can fork this repo and submit a pull request. I would love to hear your ideas!


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | -i https://pypi.org/simple
 2 | cachetools==5.3.1; python_version >= '3.7'
 3 | certifi==2023.7.22; python_version >= '3.6'
 4 | charset-normalizer==3.2.0; python_full_version >= '3.7.0'
 5 | google-api-core==2.12.0; python_version >= '3.7'
 6 | google-api-python-client==2.101.0; python_version >= '3.7'
 7 | google-auth==2.23.0; python_version >= '3.7'
 8 | google-auth-httplib2==0.1.1
 9 | googleapis-common-protos==1.60.0; python_version >= '3.7'
10 | httplib2==0.22.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
11 | idna==3.4; python_version >= '3.5'
12 | oauth2client==4.1.3
13 | protobuf==4.24.3; python_version >= '3.7'
14 | pyasn1==0.5.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'
15 | pyasn1-modules==0.3.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'
16 | pyparsing==3.1.1; python_version >= '3.1'
17 | requests==2.31.0; python_version >= '3.7'
18 | rsa==4.9; python_version >= '3.6' and python_version < '4'
19 | six==1.16.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
20 | sqlalchemy==2.0.21; python_version >= '3.7'
21 | typing-extensions==4.8.0; python_version >= '3.8'
22 | uritemplate==4.1.1; python_version >= '3.6'
23 | urllib3==1.26.16; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'
24 | 


--------------------------------------------------------------------------------