├── .gitignore
├── requirements.txt
├── README.md
├── models.py
└── it-ebooks.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*__pycache__

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
-e git+https://git.eddyhintze.com/xtream1101/scraper-lib@master#egg=scraper_lib
-e git+https://github.com/xtream1101/scraper-monitor-lib@master#egg=scraper_monitor
cutil
sqlalchemy

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# scrape-itebooks

Developed using Python 3.4

Scrapes https://it-ebooks.info/ and saves every book on the site. It does not currently backfill older uploads, but it will keep up with the latest ones.

*I am working on a way to backfill all of the data.*

A config file must be passed in, like so: `python3 it-ebooks.py -c ~/scrapers.conf`

See what the conf file needs to contain here: https://git.eddyhintze.com/xtream1101/scraper-lib

This scraper also requires this section in the config:
```
[it-ebooks]
# `scraper_key` is only needed if `scraper-monitor` is enabled
scraper_key =
```

## Setup

Run `pip3 install -r requirements.txt`
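For reference, a minimal sketch of what `~/scrapers.conf` might contain. Only the `[database]` `uri` (read by `models.py`) and the `[it-ebooks]` section shown above are included; any additional keys that `scraper-lib` or `scraper-monitor` require are documented at the link above, and the connection string is only an example:
```
[database]
# Any SQLAlchemy connection string; a postgres URI lets the scraper use its own schema
uri = postgresql://user:password@localhost/scrapers

[it-ebooks]
# Only needed if `scraper-monitor` is enabled
scraper_key =
```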
--------------------------------------------------------------------------------
/models.py:
--------------------------------------------------------------------------------
from sqlalchemy import create_engine, Column, Integer, String, DateTime, Text
from sqlalchemy.schema import CreateSchema
from sqlalchemy.orm import sessionmaker
from sqlalchemy.orm.exc import NoResultFound
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.exc import ProgrammingError, IntegrityError
from scraper_lib import raw_config

Base = declarative_base()

SCHEMA = 'it-ebooks'
# Used when a schema cannot be used (non-postgres databases)
table_prefix = ''

if not raw_config.get('database', 'uri').startswith('postgres'):
    # Fall back to prefixing table names; build the prefix before clearing SCHEMA
    table_prefix = SCHEMA + '_'
    SCHEMA = None


class Book(Base):
    __tablename__ = table_prefix + 'book'
    __table_args__ = {'schema': SCHEMA}
    id = Column(Integer, primary_key=True, autoincrement=True)
    book_id = Column(Integer, unique=True)
    file_location = Column(String(300))
    file_cover_location = Column(String(300))
    file_cover_source = Column(String(200))
    description = Column(Text)
    file_source = Column(String(200))
    format = Column(String(10))
    isbn = Column(String(20))
    language = Column(String(20))
    pages = Column(Integer)
    publisher = Column(String(100))
    title = Column(String(512))
    subtitle = Column(String(1024))
    year = Column(Integer)
    author = Column(String(200))
    time_collected = Column(DateTime)


class Setting(Base):
    __tablename__ = table_prefix + 'setting'
    __table_args__ = {'schema': SCHEMA}
    id = Column(Integer, primary_key=True)
    book_last_ran = Column(DateTime)
    book_last_id = Column(Integer)
    bit = Column(Integer, unique=True)


engine = create_engine(raw_config.get('database', 'uri'))

if raw_config.get('database', 'uri').startswith('postgres'):
    try:
        engine.execute(CreateSchema(SCHEMA))
    except ProgrammingError:
        # Schema already exists
        pass

Base.metadata.create_all(engine)

Base.metadata.bind = engine

DBSession = sessionmaker(bind=engine)

db_session = DBSession()

try:
    # Create the single settings row (bit == 0) used to track scraper state
    new_setting = Setting()
    new_setting.bit = 0
    db_session.add(new_setting)
    db_session.commit()
except IntegrityError:
    # Settings row has already been created
    db_session.rollback()
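For context, a short sketch (not part of the repository) of how the session and models defined above get used; it mirrors the lookups that `it-ebooks.py` performs in `get_last_scraped()` and `log_last_scraped()`:
```
# Hypothetical usage sketch only; it-ebooks.py below does this for real.
from models import db_session, Book, Setting, NoResultFound

# Most recently collected book (highest it-ebooks book id), if any
last_book = db_session.query(Book).order_by(Book.book_id.desc()).first()

try:
    # The single settings row created at import time (bit == 0)
    settings_row = db_session.query(Setting).filter(Setting.bit == 0).one()
    print(last_book.title if last_book else None, settings_row.book_last_id)
except NoResultFound:
    pass
```
Everything in `models.py` runs at import time, so simply importing `models` connects to the database, creates the schema/tables if needed, and seeds the settings row.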
--------------------------------------------------------------------------------
/it-ebooks.py:
--------------------------------------------------------------------------------
import re
import sys
import time
import cutil
import signal
import logging
from models import db_session, Setting, Book, NoResultFound
from scraper_monitor import scraper_monitor
from scraper_lib import Scraper, Web

# Create logger for this script
logger = logging.getLogger(__name__)


class Worker:

    def __init__(self, web, book_id):
        """
        Worker Profile

        Run for each item that needs parsing
        Each thread has a web instance that is used for parsing
        """
        # `web` is what utilizes the profiles and proxying
        self.web = web
        self.book_id = str(book_id)

        # Get the site's content as a BeautifulSoup object
        url = '{base_url}/book/{book_id}/'.format(base_url=self.web.scraper.BASE_URL,
                                                  book_id=self.book_id)
        soup = self.web.get_site(url, page_format='html')
        if soup is None:
            logger.warning("Response was None for url {url}".format(url=url))
            return

        if soup.find('img', {'alt': 'Page Not Found'}):
            logger.warning("Page Not Found: {url}".format(url=url))
            return

        logger.info("Getting book {book_id}".format(book_id=self.book_id))

        data = self.parse_book(soup)

        if data is not None:
            # Add raw data to db
            self.web.scraper.insert_data(data)

        # Add success count to stats. Keeps track of how much ref data has been parsed
        self.web.scraper.track_stat('ref_data_success_count', 1)

        # Take it easy on the site
        time.sleep(1)

    def parse_book(self, content):
        """
        :return: Dict of the book's details
        """
        cover_source = content.find('img', {'itemprop': 'image'})['src'].strip()
        try:
            subtitle = content.find('h3').getText().strip()
        except AttributeError:
            subtitle = None

        try:
            file_source = content.find('a', {'href': re.compile('http://filepi.com')})['href']
        except (AttributeError, TypeError):
            file_source = None

        parsed_data = {'book_id': self.book_id,
                       'file_location': None,
                       'file_cover_location': None,
                       'file_cover_source': self.web.scraper.BASE_URL + cover_source,
                       'description': content.find('span', {'itemprop': 'description'}).getText().strip(),
                       'file_source': file_source,
                       'format': content.find(attrs={'itemprop': 'bookFormat'}).getText().strip().lower(),
                       'isbn': content.find(attrs={'itemprop': 'isbn'}).getText().strip(),
                       'language': content.find(attrs={'itemprop': 'inLanguage'}).getText().strip(),
                       'pages': content.find(attrs={'itemprop': 'numberOfPages'}).getText().strip(),
                       'publisher': content.find(attrs={'itemprop': 'publisher'}).getText().strip(),
                       'title': content.find('h1', {'itemprop': 'name'}).getText().strip(),
                       'subtitle': subtitle,
                       'year': content.find(attrs={'itemprop': 'datePublished'}).getText().strip(),
                       'author': content.find(attrs={'itemprop': 'author'}).getText().strip(),
                       'time_collected': cutil.get_datetime(),
                       }

        # Download the book and its cover image
        base_filename = '{last_nums}/{book_id}/{book_id}'\
            .format(last_nums=self.book_id[-2:], book_id=self.book_id)

        book_filename = '{base_filename}_book.{ext}'.format(base_filename=base_filename,
                                                            ext=parsed_data.get('format'))
        cover_ext = cutil.get_file_ext(parsed_data.get('file_cover_source'))
        book_cover_filename = '{base_filename}_cover{ext}'.format(base_filename=base_filename,
                                                                  ext=cover_ext)
        parsed_data['file_cover_location'] = self.web.download(parsed_data.get('file_cover_source'),
                                                               book_cover_filename)

        header = {'Referer': self.web.scraper.BASE_URL}
        if parsed_data.get('file_source') is not None:
            parsed_data['file_location'] = self.web.download(parsed_data.get('file_source'),
                                                             book_filename,
                                                             header=header)

        return parsed_data


class ItEbooks(Scraper):

    def __init__(self, config_file=None):
        super().__init__('itebooks')

        self.BASE_URL = 'http://it-ebooks.info'
        self.book_ids = self.get_latest_books()
        self.last_id_scraped = self.get_last_scraped()

    def start(self):
        """
        Send the ref data to the worker threads
        """
        if len(self.book_ids) == 0:
            logger.critical("No books found in the latest upload section")
            return

        if self.book_ids[-1] <= self.last_id_scraped:
            # No need to continue
            logger.info("Already have the newest book")
            return

        # Log how many items in total we will be parsing
        self.stats['ref_data_count'] = len(self.book_ids)

        # Only ever use 1 thread here
        self.thread_profile(1, 'requests', self.book_ids, Worker)

    def get_latest_books(self):
        """
        Get the latest uploaded book ids and return them as a list
        """
        logger.info("Get latest upload ids")

        tmp_web = Web(self, 'requests')

        # Load the home page (HTML) to find the latest uploads
        try:
            soup = tmp_web.get_site(self.BASE_URL, page_format='html')
        except Exception:
            logger.critical("Problem loading home page to get latest uploads", exc_info=True)
            sys.exit(1)

        book_list_raw = soup.find_all("td", {"width": 120})
        book_list = []
        for book in book_list_raw:
            try:
                book_id_raw = book.find('a').get('href').split('/')[2]
                book_list.append(int(book_id_raw))
            except ValueError:
                logger.error("Could not get book id from {book_id_raw}".format(book_id_raw=book_id_raw))

        book_list.sort()
        return book_list

    def get_last_scraped(self):
        """
        Get the id of the last book scraped
        """
        last_scraped_id = db_session.query(Setting).filter(Setting.bit == 0).one().book_last_id

        if last_scraped_id is None:
            last_scraped_id = 0

        return last_scraped_id

    def log_last_scraped(self):
        try:
            try:
                last_book_id = db_session.query(Book).order_by(Book.book_id.desc()).first()
                if last_book_id is not None:
                    setting = db_session.query(Setting).filter(Setting.bit == 0).one()
                    setting.book_last_id = last_book_id.book_id
                    setting.book_last_ran = cutil.get_datetime()

                    db_session.add(setting)
                    db_session.commit()
            except NoResultFound:
                # If there is no raw data then no books were collected
                pass

        except Exception:
            logger.exception("Problem logging last book scraped")

    def insert_data(self, data):
        """
        Will handle inserting data into the database
        """
        try:
            # Check if book is in database; if so update, else create
            try:
                book = db_session.query(Book).filter(Book.book_id == data.get('book_id')).one()
            except NoResultFound:
                book = Book()

            book.title = data.get('title')
            book.subtitle = data.get('subtitle')
            book.author = data.get('author')
            book.year = data.get('year')
            book.pages = data.get('pages')
            book.language = data.get('language')
            book.publisher = data.get('publisher')
            book.isbn = data.get('isbn')
            book.format = data.get('format')
            book.description = data.get('description')
            book.file_source = data.get('file_source')
            book.file_cover_source = data.get('file_cover_source')
            book.file_location = data.get('file_location')
            book.file_cover_location = data.get('file_cover_location')
            book.book_id = data.get('book_id')
            book.time_collected = data.get('time_collected')

            db_session.add(book)
            db_session.commit()
            # self.track_stat('rows_added_to_db', rows_affected)

        except Exception:
            db_session.rollback()
            logger.exception("Error adding to db {data}".format(data=data))


def sigint_handler(signal, frame):
    logger.critical("Keyboard Interrupt")
    sys.exit(0)


if __name__ == '__main__':
    signal.signal(signal.SIGINT, sigint_handler)

    try:
        scraper = ItEbooks()
        try:
            scraper.start()
            scraper.cleanup()

        except Exception:
            logger.critical("Main Error", exc_info=True)

    except Exception:
        logger.critical("Setup Error", exc_info=True)

    finally:
        try:
            scraper.log_last_scraped()
            # Log stats
            scraper_monitor.stop(total_urls=scraper.stats['total_urls'],
                                 ref_data_count=scraper.stats['ref_data_count'],
                                 ref_data_success_count=scraper.stats['ref_data_success_count'],
                                 rows_added_to_db=scraper.stats['rows_added_to_db'])

        except NameError:
            # `scraper` was never created (setup failed), so there is nothing to log
            scraper_monitor.stop()

        except Exception:
            logger.critical("Scraper Monitor Stop Error", exc_info=True)
            scraper_monitor.stop()

--------------------------------------------------------------------------------