├── .gitignore
├── requirements.txt
├── README.md
├── models.py
└── it-ebooks.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*__pycache__

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
-e git+https://git.eddyhintze.com/xtream1101/scraper-lib@master#egg=scraper_lib
-e git+https://github.com/xtream1101/scraper-monitor-lib@master#egg=scraper_monitor
cutil
sqlalchemy

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# scrape-itebooks

Developed using Python 3.4

Scrapes https://it-ebooks.info/ and saves every book on the site. It does not currently backfill older uploads, but it will keep up with the latest ones.

*I am working on a way to backfill all of the data.*

A config file must be passed in, like so: `python3 it-ebooks.py -c ~/scrapers.conf`

See what the conf file needs to contain here: https://git.eddyhintze.com/xtream1101/scraper-lib

This scraper also requires this section in the config:
```
[it-ebooks]
# `scraper_key` is only needed if `scraper-monitor` is enabled
scraper_key =
```

## Setup

Run `pip3 install -r requirements.txt`
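For reference, a minimal sketch of what `~/scrapers.conf` might contain. Only the `[database]` `uri` (read by `models.py`) and the `[it-ebooks]` section shown above are included; any additional keys that `scraper-lib` or `scraper-monitor` require are documented at the link above, and the connection string is only an example:
```
[database]
# Any SQLAlchemy connection string; a postgres URI lets the scraper use its own schema
uri = postgresql://user:password@localhost/scrapers

[it-ebooks]
# Only needed if `scraper-monitor` is enabled
scraper_key =
```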
--------------------------------------------------------------------------------
/models.py:
--------------------------------------------------------------------------------
from sqlalchemy import create_engine, Column, Integer, String, DateTime, Text
from sqlalchemy.schema import CreateSchema
from sqlalchemy.orm import sessionmaker
from sqlalchemy.orm.exc import NoResultFound
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.exc import ProgrammingError, IntegrityError
from scraper_lib import raw_config

Base = declarative_base()

SCHEMA = 'it-ebooks'
# Used when a schema cannot be used (non-postgres databases)
table_prefix = ''

if not raw_config.get('database', 'uri').startswith('postgres'):
    # Fall back to prefixing table names; build the prefix before clearing SCHEMA
    table_prefix = SCHEMA + '_'
    SCHEMA = None


class Book(Base):
    __tablename__ = table_prefix + 'book'
    __table_args__ = {'schema': SCHEMA}
    id = Column(Integer, primary_key=True, autoincrement=True)
    book_id = Column(Integer, unique=True)
    file_location = Column(String(300))
    file_cover_location = Column(String(300))
    file_cover_source = Column(String(200))
    description = Column(Text)
    file_source = Column(String(200))
    format = Column(String(10))
    isbn = Column(String(20))
    language = Column(String(20))
    pages = Column(Integer)
    publisher = Column(String(100))
    title = Column(String(512))
    subtitle = Column(String(1024))
    year = Column(Integer)
    author = Column(String(200))
    time_collected = Column(DateTime)


class Setting(Base):
    __tablename__ = table_prefix + 'setting'
    __table_args__ = {'schema': SCHEMA}
    id = Column(Integer, primary_key=True)
    book_last_ran = Column(DateTime)
    book_last_id = Column(Integer)
    bit = Column(Integer, unique=True)


engine = create_engine(raw_config.get('database', 'uri'))

if raw_config.get('database', 'uri').startswith('postgres'):
    try:
        engine.execute(CreateSchema(SCHEMA))
    except ProgrammingError:
        # Schema already exists
        pass

Base.metadata.create_all(engine)

Base.metadata.bind = engine

DBSession = sessionmaker(bind=engine)

db_session = DBSession()

try:
    # Create the single settings row (bit == 0) used to track scraper state
    new_setting = Setting()
    new_setting.bit = 0
    db_session.add(new_setting)
    db_session.commit()
except IntegrityError:
    # Settings row has already been created
    db_session.rollback()
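For context, a short sketch (not part of the repository) of how the session and models defined above get used; it mirrors the lookups that `it-ebooks.py` performs in `get_last_scraped()` and `log_last_scraped()`:
```
# Hypothetical usage sketch only; it-ebooks.py below does this for real.
from models import db_session, Book, Setting, NoResultFound

# Most recently collected book (highest it-ebooks book id), if any
last_book = db_session.query(Book).order_by(Book.book_id.desc()).first()

try:
    # The single settings row created at import time (bit == 0)
    settings_row = db_session.query(Setting).filter(Setting.bit == 0).one()
    print(last_book.title if last_book else None, settings_row.book_last_id)
except NoResultFound:
    pass
```
Everything in `models.py` runs at import time, so simply importing `models` connects to the database, creates the schema/tables if needed, and seeds the settings row.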
--------------------------------------------------------------------------------
/it-ebooks.py:
--------------------------------------------------------------------------------
import re
import sys
import time
import cutil
import signal
import logging
from models import db_session, Setting, Book, NoResultFound
from scraper_monitor import scraper_monitor
from scraper_lib import Scraper, Web

# Create logger for this script
logger = logging.getLogger(__name__)


class Worker:

    def __init__(self, web, book_id):
        """
        Worker Profile

        Run for each item that needs parsing
        Each thread has a web instance that is used for parsing
        """
        # `web` is what utilizes the profiles and proxying
        self.web = web
        self.book_id = str(book_id)

        # Get the site's content as a BeautifulSoup object
        url = '{base_url}/book/{book_id}/'.format(base_url=self.web.scraper.BASE_URL,
                                                  book_id=self.book_id)
        soup = self.web.get_site(url, page_format='html')
        if soup is None:
            logger.warning("Response was None for url {url}".format(url=url))
            return

        if soup.find('img', {'alt': 'Page Not Found'}):
            logger.warning("Page Not Found: {url}".format(url=url))
            return

        logger.info("Getting book {book_id}".format(book_id=self.book_id))

        data = self.parse_book(soup)

        if data is not None:
            # Add raw data to db
            self.web.scraper.insert_data(data)

        # Add success count to stats. Keeps track of how much ref data has been parsed
        self.web.scraper.track_stat('ref_data_success_count', 1)

        # Take it easy on the site
        time.sleep(1)

    def parse_book(self, content):
        """
        :return: Dict of the book's details
        """
        cover_source = content.find('img', {'itemprop': 'image'})['src'].strip()
        try:
            subtitle = content.find('h3').getText().strip()
        except AttributeError:
            subtitle = None

        try:
            file_source = content.find('a', {'href': re.compile('http://filepi.com')})['href']
        except (AttributeError, TypeError):
            file_source = None

        parsed_data = {'book_id': self.book_id,
                       'file_location': None,
                       'file_cover_location': None,
                       'file_cover_source': self.web.scraper.BASE_URL + cover_source,
                       'description': content.find('span', {'itemprop': 'description'}).getText().strip(),
                       'file_source': file_source,
                       'format': content.find(attrs={'itemprop': 'bookFormat'}).getText().strip().lower(),
                       'isbn': content.find(attrs={'itemprop': 'isbn'}).getText().strip(),
                       'language': content.find(attrs={'itemprop': 'inLanguage'}).getText().strip(),
                       'pages': content.find(attrs={'itemprop': 'numberOfPages'}).getText().strip(),
                       'publisher': content.find(attrs={'itemprop': 'publisher'}).getText().strip(),
                       'title': content.find('h1', {'itemprop': 'name'}).getText().strip(),
                       'subtitle': subtitle,
                       'year': content.find(attrs={'itemprop': 'datePublished'}).getText().strip(),
                       'author': content.find(attrs={'itemprop': 'author'}).getText().strip(),
                       'time_collected': cutil.get_datetime(),
                       }

        # Download the book and its cover image
        base_filename = '{last_nums}/{book_id}/{book_id}'\
            .format(last_nums=self.book_id[-2:], book_id=self.book_id)

        book_filename = '{base_filename}_book.{ext}'.format(base_filename=base_filename,
                                                            ext=parsed_data.get('format'))
        cover_ext = cutil.get_file_ext(parsed_data.get('file_cover_source'))
        book_cover_filename = '{base_filename}_cover{ext}'.format(base_filename=base_filename,
                                                                  ext=cover_ext)
        parsed_data['file_cover_location'] = self.web.download(parsed_data.get('file_cover_source'),
                                                               book_cover_filename)

        header = {'Referer': self.web.scraper.BASE_URL}
        if parsed_data.get('file_source') is not None:
            parsed_data['file_location'] = self.web.download(parsed_data.get('file_source'),
                                                             book_filename,
                                                             header=header)

        return parsed_data


class ItEbooks(Scraper):

    def __init__(self, config_file=None):
        super().__init__('itebooks')

        self.BASE_URL = 'http://it-ebooks.info'
        self.book_ids = self.get_latest_books()
        self.last_id_scraped = self.get_last_scraped()

    def start(self):
        """
        Send the ref data to the worker threads
        """
        if len(self.book_ids) == 0:
            logger.critical("No books found in the latest upload section")
            return

        if self.book_ids[-1] <= self.last_id_scraped:
            # No need to continue
            logger.info("Already have the newest book")
            return

        # Log how many items in total we will be parsing
        self.stats['ref_data_count'] = len(self.book_ids)

        # Only ever use 1 thread here
        self.thread_profile(1, 'requests', self.book_ids, Worker)

    def get_latest_books(self):
        """
        Get the latest uploaded book ids and return them as a list
        """
        logger.info("Get latest upload ids")

        tmp_web = Web(self, 'requests')

        # Load the home page (HTML) to find the latest uploads
        try:
            soup = tmp_web.get_site(self.BASE_URL, page_format='html')
        except Exception:
            logger.critical("Problem loading home page to get latest uploads", exc_info=True)
            sys.exit(1)

        book_list_raw = soup.find_all("td", {"width": 120})
        book_list = []
        for book in book_list_raw:
            try:
                book_id_raw = book.find('a').get('href').split('/')[2]
                book_list.append(int(book_id_raw))
            except ValueError:
                logger.error("Could not get book id from {book_id_raw}".format(book_id_raw=book_id_raw))

        book_list.sort()
        return book_list

    def get_last_scraped(self):
        """
        Get the id of the last book scraped
        """
        last_scraped_id = db_session.query(Setting).filter(Setting.bit == 0).one().book_last_id

        if last_scraped_id is None:
            last_scraped_id = 0

        return last_scraped_id

    def log_last_scraped(self):
        try:
            try:
                last_book_id = db_session.query(Book).order_by(Book.book_id.desc()).first()
                if last_book_id is not None:
                    setting = db_session.query(Setting).filter(Setting.bit == 0).one()
                    setting.book_last_id = last_book_id.book_id
                    setting.book_last_ran = cutil.get_datetime()

                    db_session.add(setting)
                    db_session.commit()
            except NoResultFound:
                # If there is no raw data then no books were collected
                pass

        except Exception:
            logger.exception("Problem logging last book scraped")

    def insert_data(self, data):
        """
        Will handle inserting data into the database
        """
        try:
            # Check if book is in database; if so update, else create
            try:
                book = db_session.query(Book).filter(Book.book_id == data.get('book_id')).one()
            except NoResultFound:
                book = Book()

            book.title = data.get('title')
            book.subtitle = data.get('subtitle')
            book.author = data.get('author')
            book.year = data.get('year')
            book.pages = data.get('pages')
            book.language = data.get('language')
            book.publisher = data.get('publisher')
            book.isbn = data.get('isbn')
            book.format = data.get('format')
            book.description = data.get('description')
            book.file_source = data.get('file_source')
            book.file_cover_source = data.get('file_cover_source')
            book.file_location = data.get('file_location')
            book.file_cover_location = data.get('file_cover_location')
            book.book_id = data.get('book_id')
            book.time_collected = data.get('time_collected')

            db_session.add(book)
            db_session.commit()
            # self.track_stat('rows_added_to_db', rows_affected)

        except Exception:
            db_session.rollback()
            logger.exception("Error adding to db {data}".format(data=data))


def sigint_handler(signal, frame):
    logger.critical("Keyboard Interrupt")
    sys.exit(0)


if __name__ == '__main__':
    signal.signal(signal.SIGINT, sigint_handler)

    try:
        scraper = ItEbooks()
        try:
            scraper.start()
            scraper.cleanup()

        except Exception:
            logger.critical("Main Error", exc_info=True)

    except Exception:
        logger.critical("Setup Error", exc_info=True)

    finally:
        try:
            scraper.log_last_scraped()
            # Log stats
            scraper_monitor.stop(total_urls=scraper.stats['total_urls'],
                                 ref_data_count=scraper.stats['ref_data_count'],
                                 ref_data_success_count=scraper.stats['ref_data_success_count'],
                                 rows_added_to_db=scraper.stats['rows_added_to_db'])

        except NameError:
            # `scraper` was never created (setup failed), so there is nothing to log
            scraper_monitor.stop()

        except Exception:
            logger.critical("Scraper Monitor Stop Error", exc_info=True)
            scraper_monitor.stop()

--------------------------------------------------------------------------------