├── DAILY.txt ├── 4HOURLY.txt ├── Scraper_config.py ├── TODO.txt ├── README.md ├── Utils.py ├── Entry.py └── Scraper.py /DAILY.txt: -------------------------------------------------------------------------------- 1 | 2 | Colissimo Suivi - Cadeau de Noel de Remy et Clara 3 | http://www.colissimo.fr/portail_colissimo/suivre.do?language=fr_FR 4 | post:parcelnumber=NUMERO_DE_SUIVI 5 | root_div_id:leftColumn 6 | category:Suivi 7 | runid:day 8 | 9 | __END__ 10 | 11 | -------------------------------------------------------------------------------- /4HOURLY.txt: -------------------------------------------------------------------------------- 1 | 2 | Engadget 3 | http://www.engadget.com 4 | root_div_class:blogroll 5 | runid:4hour 6 | 7 | Engadget Espagnol 8 | http://es.engadget.com 9 | root_div_class:content 10 | runid:4hour 11 | 12 | Dangerous Prototypes 13 | http://dangerousprototypes.com/ 14 | root_div_id:content-area 15 | runid:4hour 16 | 17 | __END__ 18 | 19 | Grenoble Startup Weekend [SIGNUP] 20 | http://www.eventbrite.com/event/3614159047 21 | when:DAILY(20h) 22 | proc:CODE_200 23 | 24 | 25 | when:WEEKLY(FRIDAY 20h) 26 | 27 | -------------------------------------------------------------------------------- /Scraper_config.py: -------------------------------------------------------------------------------- 1 | 2 | Scraper_config = { 3 | # Parser to use: 4 | 'PARSER' : None, 5 | #'PARSER' : 'html.parser', 6 | #'PARSER' : 'lxml' 7 | #'PARSER' : 'xml' 8 | #'PARSER' : 'html5lib' 9 | 10 | # Put your e-mail address here: 11 | 'SEND_TO' : 'scraper@jean-dupont.net', 12 | 13 | # Put the hostname or ip address of your service providers SMTP server here: 14 | 'SMTP_HOST' : 'smtp.provider.fr', 15 | 16 | # USER credentials for smtp provider stored in files 17 | 'SMTP_HOST_USER_FILE' : '/home/ubuntu/.CREDENTIALS/mail.user.raw', 18 | 'SMTP_HOST_PWD_FILE' : '/home/ubuntu/.CREDENTIALS/mail.app.raw', 19 | 20 | # Put the desired sender e-mail address here: no need to change this 21 | 'SENDER_EMAIL': 'scraper_cron@scraper.net', 22 | 23 | # Put the desired sender name: no need to change this 24 | 'SENDER_NAME' : 'Scraper', 25 | 26 | # If diffs to send are less than this number, don't send an e-mail, probably just junk: no need to change this 27 | 'SEND_MAIL_MIN_BYTES' : 30, 28 | 'SEND_MAIL_MIN_LINES' : 1, 29 | 30 | # Send a separate e-mail of diffsfor each site: no need to change this 31 | 'SEND_MAIL_INDIVIDUAL' : True, 32 | 33 | # Send diffs of all sites in just one e-mail: no need to change this 34 | 'SEND_MAIL_GLOBAL' : False, 35 | 36 | # Send error e-mail for each site which encounters an error in processing: no need to change this 37 | 'SEND_ERROR_MAIL_INDIVIDUAL' : False, 38 | 39 | # Send one error e-mail for all sites which encounter errors in processing: no need to change this 40 | 'SEND_ERROR_MAIL_GLOBAL' : False, 41 | 42 | # Send an e-mail summary of all sites processed: no need to change this 43 | 'SEND_MAIL_SUMMARY' : False, 44 | } 45 | 46 | -------------------------------------------------------------------------------- /TODO.txt: -------------------------------------------------------------------------------- 1 | 2 | TODO: 3 | 4 | *** Add Mail throttling strategies ... if e-mail errors backoff ... 5 | 6 | NEXT: 7 | Complete README.md file with usage/install/configuration instructions 8 | Create an INSTALL.txt file 9 | 10 | NEW TAGS in file list: 11 | mailfrom: 12 | subject: 13 | exclude: 14 | remove images?? 
15 | minbytes: 16 | 17 | Reflect on "GROUPS" => "SCRAPE_RUN_ID" 18 | 19 | make find_all for root_div_* => more general (tag, attribute, value) and recursive check ... 20 | 21 | link-only mode: linkOnly:True -> detect change, provide link (as changedetection.com) 22 | 23 | Move sendmail_INDIVIDUAL into diff_page, move Prev/Next to return value, not included in INDIV mail 24 | 25 | GLOBAL mails: 26 | - Add index to o/p ... 27 | - with fwd links and stats (num lines / delta lines) 28 | - Implement Prev/Next 29 | - Divider markers allowing ctrl-F searching 30 | - Show category markers allowing ctrl-F searching 31 | 32 | Add categories / select by category / order by category 33 | -daily_general 34 | -daily_linux 35 | -daily_rpi 36 | -... 37 | Option to send e-mail by site, e-mail by category 38 | Develop options in full list 39 | 40 | Accept gzip 41 | 42 | METHOD: 43 | - get Latest Page => save to file 44 | - parse Page / extract section => save to file 45 | - diff against old section 46 | - include new text of diff against old section 47 | 48 | Create HTML(only HTML?) pages of changed content 49 | 50 | E-Mail HTML page, or link to page, with 51 | - reduced size images (or css/js to do this?) 52 | - text-only (based on MODE) 53 | - "include" changes as change segments (how? js/jquery?) 54 | - grouped changes by date change? by category? 55 | - ability to hide/delete content 56 | - keep change log by category/url/... 57 | 58 | Move to OpenShift 59 | - cron 60 | 61 | 62 | Tags: 63 | when: When to check ... 64 | actions: MAIL?, MAIL_LINK, GROUP, ... 65 | login: ???? salted sha1 password hash 66 | diff: text, html(, xml?) 67 | format: General page format - text, html, Blogger, WordPress, ... 68 | blogperday: Updates frequency (for Blogs to know if diffs are worthwhile) 69 | e.g. Engadget => blogperday: 10 70 | e.g. Infrequent => blogperday: 0.1 71 | rootid: tag id to be used as content root (else all document) 72 | filterids:tag ids to be used to filter content 73 | category: general category for sorting entries 74 | group: specific category for grouping and sorting entries 75 | images: none small tiny local remote 76 | 77 | 78 | 79 | 80 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Scraper 2 | ======= 3 | 4 | Python-based Web Scraper script 5 | 6 | Scraper is a Python-script to perform web scraping. 7 | 8 | The intended functionality is to monitor web-sites specified in a text-file, 9 | detecting changes and sending changes as fragments of HTML by e-mail. 10 | 11 | History: 12 | -------- 13 | 14 | 2012-07-19: Creation of github archive 15 | Checkin of first code version (needs to be cleaned up to be used by you ...!) 16 | 17 | FAQ: 18 | ---- 19 | 20 | - Why don't I just use an RSS reader for this? 21 | - Many web pages of interest don't have an RSS feed 22 | - I like to recieve notifications by e-mail not by other means 23 | - I've never liked using RSS readers 24 | - At some point maybe I'll allow other forms of notification 25 | 26 | - Why reinvent the wheel? 27 | - I needed a good example application to teach myself Python 28 | - I didn't find the wheel I was looking for 29 | 30 | NOTES: 31 | ------ 32 | 33 | **UNICODE problems notes:** 34 | http://stackoverflow.com/questions/492483/setting-the-correct-encoding-when-piping-stdout-in-python 35 | A rule of thumb is: Always use unicode internally. decode what you receive, encode what you send. 
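For example, a minimal sketch of that rule (illustrative only, not code from this repository; file names are placeholders):

    # decode what you receive ...
    raw_bytes = open('downloaded_page.html', 'rb').read()
    text = raw_bytes.decode('utf-8', 'ignore')      # work with unicode internally

    # ... encode what you send
    with open('output.html', 'wb') as f:
        f.write(text.encode('utf-8'))
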
36 | 37 | INSTALLATION: 38 | ------------- 39 | 40 | **Dependencies** 41 | 42 | This software is currently developed and run using Python running under Debian "Wheezy" 43 | (more precisely "Raspbian" running on a Raspberry Pi => http://www.raspberrypi.org). 44 | 45 | It should have no problem running with most Unices and even under Windows. 46 | 47 | - Python 48 | This software is currently developed using Python 2.7.3 49 | 50 | - Beautiful Soup: (error tolerant) HTML parsing software 51 | This software is currently developed using bs4 52 | 53 | - *Other Python modules?* (requests?) 54 | 55 | **To install** 56 | - Make sure you have Python, BeautifulSoup installed 57 | 58 | - Copy the *.py files to a suitable directory 59 | 60 | - Modify Scraper_Config.py to set the appropriate SMTP and SENDER parameters to be used for sending results e-mail 61 | 62 | - Create your own LIST.txt listing sites to be scraped, use TEST_LIST.txt as an example 63 | See below for explanation of syntax. 64 | 65 | LIST SYNTAX: 66 | ------------ 67 | 68 | Each URL to be monitored has an entry of the form 69 | Entry Title 70 | tag1: value1 71 | tag2: value2 72 | tag3: value3 73 | 74 | Where the "Entry Title" appears at the beginning of a line, serving as a delimiter between entries 75 | (it is also strongly recommended that a blank line appears before this title line for readability) 76 | 77 | The tag lines must be indented with spaces (arbitrary but at least 1). 78 | 79 | *Acceptable tags are the following:* 80 | - http, https - to specify the URL to be monitored 81 | In this case the "tag:value" pair represents the full URL, e.g. http://mysite.com 82 | 83 | - root__ - to specify a subsection of the document to be treated 84 | By identifying that all useful content is contained within a
tag for 85 | example, we can only interest ourselves in content within this tag. 86 | The line to specify this would be: 87 | root_div_id:content 88 | 89 | This can allow to avoid unwanted sections of a page. 90 | 91 | Currently we use only the first corresponding entry (as multiple matching tags may exist) 92 | 93 | If no root tag is specified then the whole is used. 94 | 95 | - runid - Used to specify the runid associated with this entry 96 | Typically the script will be run from cron with a specified period such as -week 97 | which automatically sets the runid to week 98 | 99 | A specific runid can also be set on the command-line using the -id argument. 100 | 101 | - filename_base - The software saves web pages, or the specified root section into a file 102 | with name based upon the specified URL. 103 | This option allows to specify another more readable base name for the file(s). 104 | 105 | - category - Arbitrary categories can be used to assign to entries 106 | These can be used to specify a subset of entries to treat using the "-c " argument 107 | 108 | - enabled - By default all entries are enabled, but they may be enabled/disabled by 109 | setting this value to true or false 110 | 111 | - debug - Set debug mode for this entry, overides global debug mode setting 112 | Equivalent to -debug commd-line arg which sets global debug mode for all entries 113 | 114 | - dinfo - Set dinfo mode for this entry, overides global dinfo mode setting 115 | Equivalent to -dinfo commd-line arg which sets global dinfo mode for all entries 116 | 117 | Setting dinfo mode allows to include debugging info in the result e-mails, e.g. 118 | indicating if root_* entries were matched or not. 119 | 120 | - parser - By default the Beautiful Soup default parser is used, but it is recommended 121 | to specify a particular parser for each entry. 122 | 123 | Available parsers are 'html.parser', 'lxml', 'xml', 'html5lib'. 124 | For more info please refer to the BeautifulSoup documentation here: 125 | http://www.crummy.com/software/BeautifulSoup/bs4/doc/#specifying-the-parser-to-use 126 | 127 | - action - Default action to perform is to determine differences compared to a previous run. 128 | Another action is possible 'email_selection' which sends the whole selection, 129 | not just differences 130 | 131 | - mailto - By default mails are sent to the SEND_TO address configured in Scraper_Config.py 132 | Alternatively, a different value can be set for a particular entry using this value. 
133 | 134 | - mailto+ - As mailto, but also sends mail to the SEND_TO address configured in 135 | Scraper_Config.py 136 | 137 | - proc - TODO 138 | 139 | - when - TODO 140 | 141 | Running the scraper: 142 | -------------------- 143 | 144 | To list entries: 145 | all entries: 146 | ./Scraper.py -l LIST.txt 147 | 148 | all entries for weekly run: 149 | ./Scraper.py -week -l LIST.txt 150 | 151 | all entries for weekly run, and runid week_thurs: 152 | ./Scraper.py -week -id week_thurs -l LIST.txt 153 | 154 | all entries for category electronics: 155 | ./Scraper.py -c electronics -l LIST.txt 156 | 157 | all entries for category electronics, with arduino in the url: 158 | ./Scraper.py -c electronics -u arduino -l LIST.txt 159 | 160 | all entries for category electronics, with arduino in the entry tile: 161 | ./Scraper.py -c electronics -e arduino -l LIST.txt 162 | 163 | To obtain differences: 164 | 165 | For 4 hourly checks: 166 | ./Scraper.py -hour4 -get -diff -l 4HOURLY.txt 167 | 168 | For daily checks: 169 | ./Scraper.py -day -get -diff -l FULL_LIST.txt 170 | 171 | For weekly checks: 172 | ./Scraper.py -week -get -diff -l FULL_LIST.txt 173 | 174 | Command-line invocations: 175 | ------------------------ 176 | 177 | To be done later, in the meantime please refer to the following cron entries. 178 | 179 | Example crontab entries: 180 | ------------------------ 181 | 182 | ################################################################################ 183 | ## Screen scraping: 184 | 185 | SCRAPER_DIR=/home/user/usr/cron/SCRAPER 186 | SCRAPER_VAR=/home/user/var 187 | SCRAPER=/home/user/usr/cron/SCRAPER/Scraper.py 188 | 189 | # Get pages: 190 | 191 | #Roughly every 4 hours (for pages which change often): 192 | 01 03,07,12,15,19 * * * $SCRAPER -hour4 -get -diff -l $SCRAPER_DIR/4HOURLY.txt >> $SCRAPER_VAR/SCRAPER_hour.log 2>&1 193 | 194 | #Daily: 195 | 03 06 * * * $SCRAPER -day -get -diff -l $SCRAPER_DIR/FULL_LIST.txt >> $SCRAPER_VAR/SCRAPER_day.log 2>&1 196 | 197 | #Weekly: Thursday 198 | 20 19 * * 4 $SCRAPER -week -get -diff -l $SCRAPER_DIR/OTHER_PEOPLE.txt >> $SCRAPER_VAR/SCRAPER_week_OTHERS.log 2>&1 199 | 200 | #Weekly: Thursday 201 | 02 12 * * 4 $SCRAPER -week -get -diff -l $SCRAPER_DIR/FULL_LIST.txt >> $SCRAPER_VAR/SCRAPER_week.log 2>&1 202 | 203 | #Weekly: Friday-WE 204 | 02 17 * * 5 $SCRAPER -week -id weekend -get -diff -l $SCRAPER_DIR/FULL_LIST.txt >> $SCRAPER_VAR/SCRAPER_weekend.log 2>&1 205 | 206 | #Monthly: 1st day of month 207 | 02 05 11 * * $SCRAPER -month -get -diff -l $SCRAPER_DIR/FULL_LIST.txt >> $SCRAPER_VAR/SCRAPER_month.log 2>&1 208 | 209 | 210 | -------------------------------------------------------------------------------- /Utils.py: -------------------------------------------------------------------------------- 1 | 2 | import unicodedata 3 | 4 | import socket 5 | import smtplib 6 | 7 | NLTK=False 8 | try: 9 | import nltk 10 | NLTK=True 11 | except ImportError: 12 | print("No nltk module available") 13 | 14 | import traceback 15 | 16 | import datetime 17 | from datetime import date 18 | 19 | SENDER_EMAIL=None 20 | SENDER_NAME=None 21 | SEND_MAIL_MIN_BYTES=None 22 | SEND_MAIL_MIN_LINES=None 23 | 24 | TEST_MODE=False 25 | 26 | FMT_DATE='%G_%B_%d' 27 | FMT_DATEHOUR='%G_%B_%d_%Hh' 28 | FMT_DATETIME='%G_%B_%d_%Hh%Mm%S' 29 | 30 | DATE=datetime.datetime.now().strftime(FMT_DATE) 31 | DATEHOUR=datetime.datetime.now().strftime(FMT_DATEHOUR) 32 | DATETIME=datetime.datetime.now().strftime(FMT_DATETIME) 33 | 34 | 
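# Illustrative note (added, not part of the original file): the strftime formats
# above produce names such as "2012_July_19" (FMT_DATE), "2012_July_19_06h"
# (FMT_DATEHOUR) and "2012_July_19_06h03m05" (FMT_DATETIME); Scraper.py appends
# DATE or DATEHOUR to its cache directory to name the per-run download directories.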
################################################################################ 35 | # encode2Ascii(lines): 36 | 37 | def encode2Ascii(lines): 38 | ret="" 39 | 40 | if (str(type(lines)) == ""): 41 | #print("STR") 42 | return lines 43 | 44 | # return lines 45 | try: 46 | return lines.encode('ascii','ignore') 47 | except: 48 | return lines 49 | 50 | try: 51 | text = unicodedata.normalize('NFKD', lines).encode('ascii','ignore') 52 | except: 53 | text = unicodedata.normalize('NFKD', lines).encode('utf-8','ignore') 54 | 55 | return text 56 | 57 | ret = '' 58 | for i in range(0, len(text)): 59 | if (i % 2) == 0: 60 | ret = ret + text[i] 61 | 62 | return ret; 63 | 64 | for char in lines: 65 | try: 66 | unicodedata.normalize('NFKD', char).encode('ascii','ignore') 67 | byte=char 68 | #byte = unicodedata.normalize('NFKD', char).encode('ascii','ignore') 69 | #byte = char.encode("ascii") 70 | except: 71 | byte = "." 72 | ret = ret + byte 73 | 74 | return ret 75 | 76 | ################################################################################ 77 | # readFile(filename): 78 | 79 | def readFile(filename): 80 | """Read a file""" 81 | 82 | lines = [line.rstrip() for line in open(filename)] 83 | 84 | return lines 85 | 86 | ################################################################################ 87 | # writeFile(filename): 88 | 89 | def writeFile(filename, text): 90 | """Write a file""" 91 | 92 | try: 93 | with open(filename, 'wb') as file: 94 | file.write(text) 95 | 96 | except: 97 | print("ERROR: in writeFile("+filename+"):" + traceback.format_exc()) 98 | raise 99 | 100 | ################################################################################ 101 | # def isAscii(mystring): 102 | 103 | def isAscii(mystring): 104 | try: 105 | mystring.decode('ascii') 106 | except UnicodeDecodeError: 107 | return "it was not a ascii-encoded unicode string" 108 | else: 109 | return "It may have been an ascii-encoded unicode string" 110 | 111 | ################################################################################ 112 | # def sendmail( entry, to, body, select_entries, category, period, name, runid): 113 | 114 | def sendmail( entry, to, body, select_entries, category, period, name, runid): 115 | print(f"sendmail( {entry}, {to}, {body}, {select_entries}, {category}, {period}, {name}, {runid})") 116 | 117 | print(f"type(body)={type(body)}") # 118 | if str(type(body)) == "": 119 | body=body.decode('ascii','ignore') 120 | print(f"AFTER decode('ascii'): type(body)={type(body)}") 121 | 122 | try: 123 | body=''.join(body) 124 | print("len") 125 | num_body_bytes = len(str(body)) 126 | print("count") 127 | num_body_lines = body.count('\n') 128 | except: 129 | print("join failed") 130 | num_body_bytes=0 131 | num_body_lines=0 132 | 133 | try: 134 | print("-----------------------------------------------------------------------------------") 135 | #print(f"str(body)={str(body)}") 136 | #print("-----------------------------------------------------------------------------------") 137 | print(f"num_body_bytes={num_body_bytes}") 138 | print(f"num_body_lines={num_body_lines}") 139 | print("body_info") 140 | body_info = "HTML TEXT: " 141 | print("body_info+") 142 | body_info = body_info + \ 143 | str(num_body_bytes) + " body bytes    " + \ 144 | str(num_body_lines) + " body lines
" 145 | print("OK") 146 | except: 147 | body="" 148 | body_info="" 149 | num_body_bytes = 0 150 | num_body_lines=0 151 | 152 | if NLTK: 153 | try: 154 | text = nltk.clean_html(body) 155 | plain_bytes = len(text) 156 | plain_lines = text.count('\n') 157 | body_info = body_info + "PLAIN TEXT: " + \ 158 | str(plain_bytes) + " plain bytes    " + \ 159 | str(plain_lines) + " plain lines
" 160 | except: 161 | print("Failed to calculate plaintext size using NLTK") 162 | 163 | if (num_body_bytes < SEND_MAIL_MIN_BYTES): 164 | print(f"{type(name)}=>{name}") 165 | print(f"{type(num_body_bytes)}=>{num_body_bytes}") 166 | print(f"{type(SEND_MAIL_MIN_BYTES)}=>{SEND_MAIL_MIN_BYTES}") 167 | #print("**** Not sending mail as num bytes="+str(num_body_bytes)+"< min("+str(SEND_MAIL_MIN_BYTES)+") [" + name + "]") 168 | print(f"**** Not sending mail as num bytes={num_body_bytes} < min({SEND_MAIL_MIN_BYTES}) [{name}]") 169 | return 170 | else: 171 | #print("**** Sending mail as num bytes="+str(num_body_bytes)+">= min("+str(SEND_MAIL_MIN_BYTES)+") [" + name + "]") 172 | print(f"**** Sending mail as num bytes={num_body_bytes} >= min({SEND_MAIL_MIN_BYTES}) [{name}]") 173 | 174 | print(f"SEND_MAIL_MIN_LINES={SEND_MAIL_MIN_LINES}") 175 | if (num_body_lines < SEND_MAIL_MIN_LINES): 176 | print(f"**** Not sending mail as num lines={num_body_lines} < min({SEND_MAIL_MIN_LINES} ) [{name}]") 177 | return 178 | else: 179 | print(f"**** Sending mail as num lines={num_body_lines} >= min({SEND_MAIL_MIN_LINES} ) [{name}]") 180 | 181 | if (entry != None): 182 | #entry_info ="

<hr/><b>Entry info:</b><br/>\n" 183 |         entry_info ="<b>Entry info:</b>\n" 184 |         entry_info = entry_info + "     url: " 185 |         entry_info = entry_info + "<a href=" + entry.url + "> " + entry.url + "</a>" 186 |         entry_info = entry_info + "<br/>\n" 187 | 188 |         for key in entry.fields.keys(): 189 |             entry_info = entry_info + "    " + key + ": " + entry.fields.get(key) + "<br/>\n" 190 | 191 |         debug_info_text="" 192 |         if (entry.dinfo): 193 |             debug_info_text = "<br/>" + entry.dinfo_text 194 | 195 |         #body = entry_info + "<br/>" + str(num_body_bytes) + " body bytes<br/><br/>
" + debug_info_text + body 196 | body = entry_info + body_info + debug_info_text + body 197 | 198 | if ('mailto' in entry.fields): 199 | #to = [ entry.fields.mailto ] 200 | to = entry.fields.get('mailto') 201 | 202 | if ('mailto+' in entry.fields): 203 | #to.append( entry.fields.mailto ) 204 | to.append( entry.fields.get('mailto+') ) 205 | 206 | if (runid == None): 207 | print("ERROR: in sendmail() runid is unset: " + traceback.format_exc()) 208 | #runid=runids.get(period) 209 | runid="__NO_RUNID__" 210 | 211 | subject=f'[{runid}]<{num_body_bytes}c, {num_body_lines}l>: {name}' 212 | if TEST_MODE: 213 | subject='[TEST_MODE]: ' + subject 214 | 215 | if (select_entries): 216 | subject = subject + '<' + select_entries + '> ' 217 | 218 | if (category): 219 | subject = subject + '[' + category + ']' 220 | 221 | subject = subject + '[' + DATEHOUR + '] ' 222 | 223 | headers=[ 'MIME-Version: 1.0\n', 'Content-type: text/html\n' ] 224 | 225 | _sendmail( to, headers, body, subject) 226 | 227 | ################################################################################ 228 | # def _sendmail(to, headers, body, subject="Scraper", sender=SENDER_EMAIL, sender_name=SENDER_NAME): 229 | 230 | #def _sendmail(to, headers, body, subject="Scraper", sender=SENDER_EMAIL, sender_name=SENDER_NAME): 231 | def _sendmail(to, headers, body, subject="Scraper"): 232 | 233 | sender=SENDER_EMAIL 234 | sender_name=SENDER_NAME 235 | 236 | print("SENDER_EMAIL=" + SENDER_EMAIL) 237 | print("sender="+sender) 238 | print("sender_name="+sender_name) 239 | 240 | message = "From: " + sender_name 241 | message = message + " <"+sender+">\n" + "To: " 242 | message = message + ' '.join(to) 243 | message = message + "\nSubject: " + subject + "\n" 244 | 245 | for header in headers: 246 | message = message + header 247 | 248 | message = message + "\n\n" + body 249 | 250 | num_body_bytes = len(body) 251 | by = "[ with " + str(num_body_bytes) + " bytes]" 252 | 253 | #sender = 'from@mjbright.net' 254 | #receivers = ['mjbrightfr@gmail.com'] 255 | 256 | PWD=None 257 | USER=None 258 | if SMTP_HOST_PWD_FILE: PWD=readFile(SMTP_HOST_PWD_FILE)[0].strip() 259 | if SMTP_HOST_USER_FILE: USER=readFile(SMTP_HOST_USER_FILE)[0].strip() 260 | print(f"PWD={PWD} USER={USER}") 261 | 262 | try: 263 | if PWD: 264 | s = smtplib.SMTP('smtp.gmail.com', 587) 265 | s.set_debuglevel(2) 266 | s.starttls() 267 | s.ehlo() 268 | else: 269 | s = smtplib.SMTP(SMTP_HOST) 270 | except smtplib.SMTPException: 271 | print(f"**** Error: unable to connect to smtp host {SMTP_HOST}") 272 | 273 | if PWD: 274 | try: 275 | s.login(USER, PWD) 276 | except smtplib.SMTPException: 277 | print(f"**** Error: unable to login({USER}, {PWD}) to smtp host {SMTP_HOST}") 278 | 279 | try: 280 | s.sendmail(sender, to, message) 281 | print("**** Sent email to <" + ' '.join(to) + "> " + by + " " + subject) 282 | except smtplib.SMTPException: 283 | print("**** Error: unable to send email" + by + " " + subject) 284 | 285 | 286 | -------------------------------------------------------------------------------- /Entry.py: -------------------------------------------------------------------------------- 1 | 2 | from bs4 import BeautifulSoup 3 | 4 | # optional parser: 5 | import html5lib 6 | 7 | import urllib 8 | import difflib 9 | 10 | import os 11 | import traceback 12 | import gzip # For check_file_not_gzipped() 13 | 14 | #from Utils import * 15 | import Utils as u 16 | 17 | #import unicodedata 18 | 19 | class Entry: 20 | globalRunID=None 21 | Parser=None 22 | 23 | def __init__(self): 24 | self.fields 
= {} 25 | self.debug=False 26 | self.dinfo=False 27 | self.dinfo_text="" 28 | 29 | ################################################################################ 30 | # get(self, field): 31 | 32 | def get(self, field): 33 | if field in self.fields: 34 | return self.fields[field] 35 | 36 | #if (self.fields[field]): 37 | #return self.fields[field] 38 | 39 | #if (self.fields.get(field)): 40 | #return self.fields.get(field) 41 | 42 | return None 43 | 44 | ################################################################################ 45 | # set(self, field): 46 | 47 | def set(self, field, value): 48 | self.fields.set(field, value) 49 | 50 | ################################################################################ 51 | # createFileName(self): 52 | 53 | def createFileName(self): 54 | 55 | if (self.get('filename_base')): 56 | return self.get('filename_base') 57 | 58 | file = self.url 59 | file = file.replace("http://", "") 60 | file = file.replace("https://", "") 61 | file = file.replace("/", "_") 62 | file = file.replace("?", "_") 63 | file = file.replace("&", "_") 64 | 65 | #return file[0:100] 66 | return file 67 | 68 | ################################################################################ 69 | # get_page(self, DOWNLOAD_DIR): 70 | 71 | def get_page(self, DOWNLOAD_DIR): 72 | op_file = DOWNLOAD_DIR + "/" + self.createFileName() 73 | 74 | # Configure User-Agents: 75 | #TODO: read this from Scraper_config: 76 | UAs = dict({ 77 | 'ffox5': 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12' 78 | }) 79 | 80 | ua = UAs.get('ffox5') # TODO: browser-configurable 81 | 82 | #HACK: remove trailing " in url used to distinguish multiple entries with same URL: 83 | #url = str(self.get('url')).rstrip('\"') 84 | url = str(self.url).rstrip('\"') 85 | print("url:" + url) 86 | 87 | try: 88 | opener = urllib.request.build_opener() 89 | except urllib.error.HTTPError as e: 90 | print(f"Failed to open url {url}") 91 | return 92 | 93 | try: 94 | opener.addheaders = [('User-agent', ua)] 95 | req = opener.open(url, timeout=30) 96 | #req = urllib.urlopen(url, headers={'User-Agent' : ua}) 97 | 98 | CHUNK = 16 * 1024 99 | with open(op_file, 'wb') as fp: 100 | while True: 101 | chunk = req.read(CHUNK) 102 | if not chunk: break 103 | 104 | #### # Strip chars > 128 (replace with space): 105 | #### for i in range(0, len(chunk)): 106 | #### if (ord(chunk[i]) > 128): 107 | #### chunk[i]=' ' 108 | fp.write(chunk) 109 | 110 | except urllib.error.HTTPError as e: 111 | print(e.fp.read()) 112 | 113 | except urllib.error.URLError as e: 114 | if isinstance(e.reason, socket.timeout): 115 | #raise MyException("Connection timedout - error: %r" % e) 116 | print("Connection timedout - error: %r" % e) 117 | else: 118 | # reraise the original error 119 | # raise 120 | #print(e.fp.read()) 121 | print("URL Error") 122 | 123 | except: 124 | print("ERROR: in get_page:" + traceback.format_exc()) 125 | #raise 126 | 127 | try: 128 | self.check_file_not_gzipped(op_file) 129 | except: 130 | print("ERROR: in get_page - failed gzip checking - " + traceback.format_exc()) 131 | 132 | #except urllib.Error as e: 133 | #print("urllib.Error: " + e.fp.read()) 134 | 135 | 136 | ################################################################################ 137 | # def get_subtree_from_html(self, file, html, tag, attribute_name, attribute_value): 138 | 139 | def get_subtree_from_html(self, file, html, tag, attribute_name, attribute_value): 140 | value = None 141 | 142 | entry_key = tag 
+ "_" + attribute_name 143 | 144 | search = "<" + tag + " " + attribute_name + "='" + attribute_value + "'>" 145 | search_text = "<" + tag + " " + attribute_name + "='" + attribute_value + "'>" 146 | 147 | #print("Getting content from root " + entry_key + "='" + attribute_value +"'") 148 | print("Getting content from root " + search + " tag") 149 | 150 | try: 151 | attrs=dict() 152 | attrs[attribute_name]=attribute_value 153 | 154 | print("main = html.find_all(" + tag + ", attrs={" + attribute_name + " : " + attribute_value + "})") 155 | main = html.find_all(tag, attrs) 156 | 157 | # Would be good "sometimes" to show failed matches also - for now only show actual MATCH: 158 | if (len(main) > 0): 159 | self.dinfo_text = self.dinfo_text + "MATCHED " + str(len(main)) + " element(s) for root '" + search_text + "' tag
\n" 160 | print("MATCHED " + str(len(main)) + " element(s) for root '" + search + "' tag
\n") 161 | 162 | if (len(main) > 1): 163 | print("WARN: matched on more than 1 " + search + " tag") 164 | 165 | if (len(main) == 0): 166 | raise Exception("Not", " found") 167 | 168 | #print(repr(main)) 169 | 170 | contents=main[0].contents # Return contents of first match only 171 | #self.dinfo_text = self.dinfo_text + "MATCHED 1/" + str(len(main)) + " element(s) for root '" + search_text + "' tag [" + str(len(str(contents))) + " bytes]
\n" 172 | 173 | if self.debug: 174 | file = file + "." + entry_key + ".selection" 175 | print("Writing selection file: " + file) 176 | u.writeFile(file, str(contents)) 177 | 178 | return contents 179 | 180 | except: 181 | #self.dinfo_text = self.dinfo_text + "FAILED to match root '" + search_text + "' tag
\n" 182 | print("ERROR: Failed to find root at " + search + " tag") 183 | if self.debug: 184 | print(traceback.format_exc()) 185 | self.dinfo_text = self.dinfo_text + traceback.format_exc() + "
\n" 186 | raise 187 | 188 | ################################################################################ 189 | # def parse_page(self, DIR): 190 | 191 | def parse_page(self, DIR): 192 | 193 | url = self.get('url') 194 | print("--->parse_page(" + str(url) + ")") 195 | file = DIR + "/" + self.createFileName() 196 | 197 | if (not os.path.exists(file)): 198 | print("No such dir/file as '"+file+"'") 199 | return 200 | 201 | if (not os.path.isfile(file)): 202 | print("No such file as '"+file+"'") 203 | return 204 | 205 | print("--->parse_file(" + file + ")") 206 | 207 | text = '' 208 | #f = open(file, "r") 209 | #text = f.read(10000000) # 10 MBy ! 210 | #f.close() 211 | f = open(file, "rb") 212 | # See: https://stackoverflow.com/questions/33054527/typeerror-a-bytes-like-object-is-required-not-str-when-writing-to-a-file-in 213 | #f = open(file, "r") 214 | text = f.read(10000000) # 10 MBy ! 215 | text = u.encode2Ascii(text) 216 | f.close() 217 | ''' 218 | f = open(file, "rb") 219 | # See: https://stackoverflow.com/questions/33054527/typeerror-a-bytes-like-object-is-required-not-str-when-writing-to-a-file-in 220 | #f = open(file, "r") 221 | text = f.read(10000000) # 10 MBy ! 222 | text = u.encode2Ascii(text) 223 | f.close() 224 | ''' 225 | 226 | try: 227 | parser=Entry.Parser 228 | if (self.get('parser')): 229 | parser=self.get('parser') 230 | 231 | 232 | print("soup = BeautifulSoup(text, " + str(parser) +")") 233 | 234 | if (parser == None): 235 | soup = BeautifulSoup(text) 236 | else: 237 | if (parser == "html5lib"): 238 | soup = BeautifulSoup(text, html5lib) 239 | else: 240 | soup = BeautifulSoup(text, parser) 241 | 242 | except: 243 | print("ERROR: Failed to parse html file: " + file) 244 | print(traceback.format_exc()) 245 | #return '
Failed to parse ' + file + '\n' + ''.join(open(file).readlines()) 246 | return '
Failed to parse ' + file + '\n' + text 247 | 248 | try: 249 | print("Original encoding = " + str(soup.originalEncoding)) 250 | except: 251 | print("Original encoding = ") 252 | 253 | body = soup.body 254 | 255 | if (body == None): 256 | return "" 257 | 258 | self.dinfo_text = self.dinfo_text + " Searching in file '" + file + "'
\n" 259 | 260 | ############################################################ 261 | ## Try first root_div_class, root_div_id entries if present: 262 | 263 | for key in self.fields: 264 | if (key[0:5] == "root_"): 265 | attr_val=self.fields[key] 266 | 267 | parts=key.split("_") 268 | tag=parts[1] 269 | attr=parts[2] 270 | 271 | try: 272 | return self.get_subtree_from_html(file, body, tag, attr, attr_val) 273 | except: 274 | if (attr == "class"): 275 | attr="id" 276 | 277 | try: 278 | return self.get_subtree_from_html(file, body, tag, attr, attr_val) 279 | except: 280 | pass 281 | 282 | root_div_class = None 283 | if ('root_div_class' in self.fields): 284 | root_div_class = self.get('root_div_class') 285 | try: 286 | return self.get_subtree_from_html(file, body, 'div', 'class', root_div_class) 287 | except: 288 | if (not 'root_div_id' in self.fields): 289 | print("Trying as 'root_div_id'") 290 | self.fields['root_div_id'] = root_div_class 291 | 292 | root_div_id = None 293 | if ('root_div_id' in self.fields): 294 | root_div_id = self.get('root_div_id') 295 | 296 | try: 297 | return self.get_subtree_from_html(file, body, 'div', 'id', root_div_id) 298 | except: 299 | pass 300 | 301 | ############################################################ 302 | ## Then try root_div_class, root_div_id as 'content': 303 | 304 | if (not root_div_class == 'content'): 305 | root_div_class = 'content' 306 | try: 307 | return self.get_subtree_from_html(file, body, 'div', 'class', root_div_class) 308 | except: 309 | pass 310 | 311 | if (not root_div_id == 'content'): 312 | root_div_id='content' 313 | try: 314 | return self.get_subtree_from_html(file, body, 'div', 'id', root_div_id) 315 | except: 316 | pass 317 | 318 | ############################################################ 319 | ## Then try body 320 | if (body): 321 | self.dinfo_text = self.dinfo_text + "Used full body
\n" 322 | return body.contents 323 | 324 | ############################################################ 325 | ## If all else fails return nothing! 326 | print("Returning NO content") 327 | return ""; 328 | 329 | #print(main.prettify()) 330 | #print(repr(soup.prettify())) 331 | 332 | ################################################################################ 333 | # def diff_page(self, classId, NEW_DIR, OLD_DIR, email_attrs): 334 | 335 | def diff_page(self, classId, NEW_DIR, OLD_DIR, email_attrs): 336 | itemno=0 337 | 338 | new_lines = self.parse_page(NEW_DIR) 339 | try: 340 | new_lines = str(new_lines) # to UTF-8 341 | except: 342 | print("ERROR: Failed to str(NEW page)") 343 | raise 344 | #return "" 345 | 346 | try: 347 | new_lines = ''.join(new_lines) 348 | new_lines = new_lines.decode("utf8") 349 | except: 350 | print("ERROR: Failed to decode NEW page to 'utf8'") 351 | #raise 352 | #return "" 353 | 354 | if ((new_lines != "") and email_attrs['SEND_MAIL_INDIVIDUAL']): 355 | #body = ''.join(lines.readlines()) 356 | #body = new_lines 357 | body = u.encode2Ascii(new_lines) 358 | 359 | if (('action' in self.fields) and (self.get('action') == "email_selection")): 360 | print("email_selection") 361 | 362 | select_entries=email_attrs['select_entries'] 363 | category=email_attrs['category'] 364 | period=email_attrs['period'] 365 | name=email_attrs['name'] 366 | send_to= [ email_attrs['SEND_TO'] ] 367 | u.sendmail( self, send_to, body, select_entries, category, period, "SELECT: " + name, Entry.globalRunID) 368 | return "" 369 | 370 | try: 371 | old_lines = self.parse_page(OLD_DIR) 372 | except: 373 | print("ERROR: Failed to parse_page(OLD page)") 374 | raise 375 | 376 | try: 377 | old_lines = str(old_lines) # to UTF-8 378 | except: 379 | print("ERROR: Failed to str(OLD page)") 380 | old_lines = "" 381 | #raise 382 | #return "" 383 | 384 | try: 385 | old_lines = ''.join(old_lines) 386 | old_lines = old_lines.decode("utf8") 387 | except: 388 | print("ERROR: Failed to decode OLD page to 'utf8'") 389 | #raise 390 | #return "" 391 | 392 | 393 | file = NEW_DIR + "/" + self.createFileName() + ".new.prediff" 394 | u.writeFile(file, u.encode2Ascii(new_lines)) 395 | file = NEW_DIR + "/" + self.createFileName() + ".old.prediff" 396 | u.writeFile(file, u.encode2Ascii(old_lines)) 397 | 398 | print(" diff("+str(len(old_lines))+" old bytes vs. "+str(len(new_lines))+" new bytes)") 399 | diff_text = difflib.unified_diff(old_lines.split("\n"), new_lines.split("\n")) 400 | #print(" ==> "+str(len(diff))+" bytes different") 401 | 402 | if self.debug: 403 | try: 404 | #### file = NEW_DIR + "/" + self.createFileName() + ".diff" 405 | print("Writing diff file: " + file) 406 | debug_diff_text = diff_text[:] # Deepcopy !! 407 | debug_diff_text = ' '.join(list(debug_diff_text)) 408 | print("debug_diff_text len="+str(len(debug_diff_text))) 409 | debug_diff_text = u.encode2Ascii(debug_diff_text) 410 | print("debug_diff_text len="+str(len(debug_diff_text))) 411 | u.writeFile(file, debug_diff_text) 412 | except: 413 | print("ERROR: failed to write diff file: " + traceback.format_exc()) 414 | 415 | show_new_only=True 416 | show_new_only=False 417 | 418 | div_page_diffs = "
\n
\n" 419 | ##if (itemno > 0): 420 | ##item=str(itemno) 421 | ##div_page_diffs = div_page_diffs + " Prev\n" 422 | ##nextno=str(itemno+2) 423 | ##div_page_diffs = div_page_diffs + "Next\n" 424 | 425 | itemno = itemno +1 426 | item=str(itemno) 427 | div_page_diffs = div_page_diffs + " \n" 428 | div_page_diffs = div_page_diffs + "

"+classId+"

\n" 429 | 430 | page_diffs = "" 431 | 432 | for d in diff_text: 433 | d = d.encode("utf8", "ignore") 434 | d = d.decode() 435 | #d = d.encode() 436 | 437 | # Ignore initial '+++' line: 438 | if (d.find("+++") == 0): 439 | continue 440 | 441 | # Ignore position '@@' lines: 442 | if (d.find("@@") == 0): 443 | continue 444 | 445 | # Ignore removed lines: 446 | if (d.find("-") == 0): 447 | continue 448 | 449 | d = self.substitute_local_links(d) 450 | 451 | # Remove leading '+' from new/modified lines: 452 | if (d.find("+") == 0): 453 | d = d.replace("+","",1).replace("u[\"","",1) 454 | if (show_new_only): 455 | page_diffs = page_diffs + d + "\n"; 456 | #print(d) 457 | continue 458 | #print(d.replace("+","",1)) 459 | #print(d) 460 | 461 | if ( not show_new_only): 462 | # Print new/modified/"old context" lines: 463 | page_diffs = page_diffs + d + "\n"; 464 | #print(d) 465 | 466 | print(" ==> "+str(len(page_diffs))+" NEW bytes different") 467 | 468 | if (page_diffs == ""): 469 | return "" 470 | 471 | if self.debug: 472 | try: 473 | file = NEW_DIR + "/" + self.createFileName() + ".diff.NEW" 474 | print("Writing diff file: " + file) 475 | debug_page_diffs = page_diffs[:] # Deepcopy !! 476 | debug_page_diffs = ' '.join(list(debug_page_diffs)) 477 | print("debug_page_diffs len="+str(len(debug_page_diffs))) 478 | debug_page_diffs = u.encode2Ascii(debug_page_diffs) 479 | print("debug_page_diffs len="+str(len(debug_page_diffs))) 480 | u.writeFile(file, debug_page_diffs) 481 | except: 482 | print("ERROR: failed to write diff file: " + traceback.format_exc()) 483 | page_diffs = div_page_diffs + page_diffs + "
<
\n\n" 484 | 485 | if ((page_diffs != "") and email_attrs['SEND_MAIL_INDIVIDUAL']): 486 | #body = ''.join(lines.readlines()) 487 | body = page_diffs.encode('utf-8') 488 | 489 | select_entries=email_attrs['select_entries'] 490 | category=email_attrs['category'] 491 | period=email_attrs['period'] 492 | name=email_attrs['name'] 493 | send_to= [ email_attrs['SEND_TO'] ] 494 | u.sendmail( self, send_to, body, select_entries, category, period, name, Entry.globalRunID) 495 | 496 | return page_diffs 497 | 498 | ################################################################################ 499 | # def check_file_not_gzipped(self, file): 500 | 501 | def check_file_not_gzipped(self, file): 502 | 503 | byte1 = 0 504 | byte2 = 0 505 | 506 | with open(file, 'rb') as fp: 507 | byte1 = ord(fp.read(1)) 508 | byte2 = ord(fp.read(1)) 509 | 510 | if (byte1 == 0x1f) and (byte2 == 0x8b): 511 | print("File '" + file + "' is gzip-compressed, uncompressing ...") 512 | ifp = gzip.open(file, 'rb') 513 | content = ifp.read() 514 | ifp.close() 515 | 516 | u.writeFile(file, content) 517 | 518 | ################################################################################ 519 | # def substitute_local_links(self, d): 520 | 521 | def substitute_local_links(self, d): 522 | 523 | file_slash=d.find('href="/') 524 | 525 | if (file_slash < 0): 526 | file_slash=d.find("href='/") 527 | 528 | if (file_slash < 0): 529 | return d 530 | 531 | slash=self.url.find("/") 532 | 533 | protocol = self.url[:slash-1] 534 | addr = self.url[slash+2:] 535 | slash3 = addr.find("/") 536 | 537 | #print("PROTOCOL="+protocol) 538 | #print("ADDR="+addr) 539 | #print("slash="+str(slash)) 540 | #print(d) 541 | 542 | orig = d 543 | 544 | rootUrl = protocol + "://" + addr[:slash3] + "/" 545 | #print("rootUrl="+rootUrl) 546 | 547 | d = d.replace("href='/", "href='"+rootUrl) 548 | d = d.replace('href="/', 'href="'+rootUrl) 549 | 550 | #d = d.replace("href='/", "href='"+self.url) 551 | #d = d.replace('href="/', 'href="'+self.url) 552 | 553 | #if (orig != d): 554 | #print("orig("+orig+")=>"+d) 555 | 556 | return d 557 | 558 | -------------------------------------------------------------------------------- /Scraper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | import re 4 | import requests,sys,os 5 | import traceback 6 | 7 | from datetime import date, timedelta 8 | 9 | # Used for converting unicode to ASCII (??): 10 | import unicodedata 11 | 12 | ## Import Scraper config from this Python module 13 | from Scraper_config import Scraper_config 14 | 15 | from Entry import Entry 16 | 17 | import Utils as u 18 | 19 | DATE=u.DATE 20 | DATETIME=u.DATETIME 21 | DATEHOUR=u.DATEHOUR 22 | FMT_DATE=u.FMT_DATE 23 | FMT_DATETIME=u.FMT_DATETIME 24 | FMT_DATEHOUR=u.FMT_DATEHOUR 25 | 26 | ################################################################################ 27 | 28 | DEBUG_MODE=False 29 | DEBUG_INFO=False 30 | TEST_MODE=False 31 | 32 | SAVE_ERRORS=list() 33 | 34 | ################################################################################ 35 | # Entry filtering 36 | 37 | select_entries=None 38 | select_urls=None 39 | 40 | category=None 41 | 42 | ################################################################################ 43 | # Differencing period: 44 | 45 | 46 | HOUR=1 47 | HOUR2=2 48 | HOUR4=4 49 | 50 | DAY=10 51 | DAY2=20 52 | 53 | WEEK=100 54 | WEEK2=200 55 | 56 | MONTH=1000 57 | MONTH2=2000 58 | 59 | period=DAY 60 | runids = dict({ 61 | HOUR: 'hour', 62 | HOUR2: '2hour', 63 | 
HOUR4: '4hour', 64 | DAY: 'day', 65 | DAY2: '2day', 66 | WEEK: 'week', 67 | WEEK2: '2week', 68 | MONTH: 'month', 69 | MONTH2: '2month', 70 | }) 71 | 72 | 73 | ################################################################################ 74 | # E-mail config: 75 | 76 | EMAIL_CONFIG_KEYS=list({ 77 | 'SEND_TO', 'SENDER_EMAIL', 'SENDER_NAME', 'SMTP_HOST', 78 | 'SEND_MAIL_MIN_BYTES', 'SEND_MAIL_MIN_LINES', 79 | 'SMTP_HOST_PWD_FILE', 'SMTP_HOST_USER_FILE', 80 | 'SEND_MAIL_INDIVIDUAL', 'SEND_MAIL_GLOBAL', 81 | 'SEND_ERROR_MAIL_INDIVIDUAL', 'SEND_ERROR_MAIL_GLOBAL', 82 | 'SEND_MAIL_SUMMARY' 83 | }) 84 | 85 | for key in EMAIL_CONFIG_KEYS: 86 | if (not key in Scraper_config): 87 | print("Entry for config item '" + key + "' is missing from Scraper_config") 88 | exit(255) 89 | 90 | u.SENDER_EMAIL = Scraper_config['SENDER_EMAIL'] 91 | u.SENDER_NAME = Scraper_config['SENDER_NAME'] 92 | u.SEND_MAIL_MIN_BYTES = Scraper_config['SEND_MAIL_MIN_BYTES'] 93 | u.SEND_MAIL_MIN_LINES = Scraper_config['SEND_MAIL_MIN_LINES'] 94 | u.SMTP_HOST = Scraper_config['SMTP_HOST'] 95 | u.SMTP_HOST_PWD_FILE = Scraper_config['SMTP_HOST_PWD_FILE'] 96 | u.SMTP_HOST_USER_FILE = Scraper_config['SMTP_HOST_USER_FILE'] 97 | 98 | SEND_TO = Scraper_config['SEND_TO'] 99 | SEND_MAIL_INDIVIDUAL = Scraper_config['SEND_MAIL_INDIVIDUAL'] 100 | SEND_MAIL_GLOBAL = Scraper_config['SEND_MAIL_GLOBAL'] 101 | SEND_ERROR_MAIL_INDIVIDUAL = Scraper_config['SEND_ERROR_MAIL_INDIVIDUAL'] 102 | SEND_ERROR_MAIL_GLOBAL = Scraper_config['SEND_ERROR_MAIL_GLOBAL'] 103 | SEND_MAIL_SUMMARY = Scraper_config['SEND_MAIL_SUMMARY'] 104 | 105 | ################################################################################ 106 | # debug(line): 107 | 108 | debug_flag=False 109 | debug_flag=True 110 | debug_readUrlList=debug_flag 111 | 112 | def debug(line): 113 | if (debug_flag): 114 | print("DEBUG: " + line) 115 | 116 | ############################################################################### 117 | # def getTimeString(tdelta, FMT): 118 | 119 | def getTimeString(tdelta, FMT): 120 | return (date.today() + tdelta).strftime(FMT) 121 | 122 | ################################################################################ 123 | # def filterSortEntries(entries, select_entries, select_urls, category, runid): 124 | 125 | def filterSortEntries(entries, select_entries, select_urls, category, runid): 126 | 127 | #TODO: sort by category / name 128 | 129 | filtered_entries = dict() 130 | 131 | DEBUG_MODE_FILTER=True 132 | DEBUG_MODE_FILTER=False 133 | 134 | #print("runid="+runid) 135 | for key in entries.keys(): 136 | url=key 137 | entry=entries[key] 138 | name=entry.name 139 | 140 | e_runid=None 141 | if (entry.fields.get('runid')): 142 | e_runid=entry.fields.get('runid').lower() 143 | 144 | e_category=None 145 | if (entry.fields.get('category')): 146 | e_category=entry.fields.get('category') 147 | 148 | enabled=True 149 | if (entry.fields.get('enabled')): 150 | e_enabled=entry.fields.get('enabled').lower() 151 | 152 | enabled=False 153 | if (e_enabled == 'true'): 154 | enabled=True 155 | 156 | if (enabled == False): 157 | if DEBUG_MODE_FILTER: 158 | print("DISABLED: " + url) 159 | continue 160 | 161 | if (select_entries and name.lower().find(select_entries.lower()) == -1): 162 | if DEBUG_MODE_FILTER: 163 | print("SELECT_ENTRIES: " + select_entries + " not found " + url) 164 | continue 165 | 166 | if (select_urls and url.lower().find(select_urls.lower()) == -1): 167 | if DEBUG_MODE_FILTER: 168 | print("SELECT_URLS: " + select_urls + " not found " + url) 169 | continue 170 
| 171 | if category: 172 | if (e_category == None): 173 | if DEBUG_MODE_FILTER: 174 | print("CATEGORY: " + category + ", no category in entry " + url) 175 | continue 176 | if (e_category != category): 177 | if DEBUG_MODE_FILTER: 178 | print("CATEGORY: " + category + " != " + e_category + " category in entry " + url) 179 | continue 180 | 181 | if runid: 182 | if (e_runid == None): 183 | if DEBUG_MODE_FILTER: 184 | print("RUNID: " + runid + ", no runid in entry " + url) 185 | continue 186 | if (e_runid != runid.lower()): 187 | if DEBUG_MODE_FILTER: 188 | print("RUNID: " + runid + " != " + e_runid + " runid in entry " + url) 189 | continue 190 | 191 | filtered_entries[url]=entry 192 | 193 | num_entries=len(entries) 194 | num_filtered_entries=len(filtered_entries) 195 | if (num_entries != num_filtered_entries): 196 | print("filterEntries returned "+str(num_filtered_entries)+" from initial " + str(num_entries) + " entries") 197 | 198 | return filtered_entries 199 | 200 | 201 | ################################################################################ 202 | # def hexdump(src, length=8): 203 | 204 | def hexdump(src, start=0, count=-1, length=16): 205 | result = [] 206 | 207 | digits = 4 if isinstance(src, unicode) else 2 208 | 209 | if (count == -1): 210 | count=len(src) 211 | 212 | if (count > len(src)): 213 | count=len(src) 214 | 215 | for i in xrange(start, start+count, length): 216 | s = src[i:i+length] 217 | 218 | hexa = b' '.join(["%0*X" % (digits, ord(x)) for x in s]) 219 | text = b''.join([x if 0x20 <= ord(x) < 0x7F else b'.' for x in s]) 220 | 221 | result.append( b"%04X %-*s %s" % (i, length*(digits + 1), hexa, text) ) 222 | 223 | return b'\n'.join(result) 224 | 225 | 226 | ################################################################################ 227 | # def printBuffer(label, buffer, start, count): 228 | 229 | def printBuffer(label, buffer, start, count): 230 | print(label + "\n" + hexdump(buffer, start, count, 16)) 231 | 232 | 233 | ################################################################################ 234 | # def mkdirp(directory): 235 | 236 | def mkdirp(directory): 237 | if not os.path.isdir(directory): 238 | os.makedirs(directory) 239 | 240 | ################################################################################ 241 | # get_pages(entries, DOWNLOAD_DIR): 242 | 243 | def get_pages(entries, DOWNLOAD_DIR): 244 | 245 | mkdirp(DOWNLOAD_DIR) 246 | 247 | for key in entries.keys(): 248 | url=key 249 | entry=entries[key] 250 | name=entry.name 251 | print("\nGET: " + name + " => <" + url + ">") 252 | 253 | try: 254 | entry.get_page(DOWNLOAD_DIR) 255 | except: 256 | print("\n**** UNCAUGHT EXCEPTION on get_page(): " + traceback.format_exc()) 257 | 258 | ################################################################################ 259 | # getUrlId(url): 260 | 261 | def getUrlId(url): 262 | classId = url.replace("http://", "") 263 | classId = classId.replace("https://", "") 264 | classId = classId.replace("/", "_") 265 | 266 | return classId 267 | 268 | ################################################################################ 269 | # parse_pages(entries, DIR): 270 | 271 | def parse_pages(entries, DIR): 272 | for url in entries.keys(): 273 | name=entry.name 274 | print(name + " => <" + url + ">") 275 | 276 | return entry.parse_page(DIR) 277 | 278 | ################################################################################ 279 | # def cleanText(text): 280 | 281 | def cleanText(text): 282 | 283 | by=0 284 | line=1 285 | linepos=0 286 | return 
u.encode2Ascii(text) 287 | 288 | print("cleantext("+str(len(text))+" bytes)") 289 | 290 | for byte in text: 291 | by = by + 1 292 | if (ord(byte) > 0xa): 293 | line = line + 1 294 | linepos=0 295 | continue 296 | 297 | linepos = linepos + 1 298 | if (ord(byte) > 128): 299 | hexstr=strformat("0x%x", ord(byte)) 300 | print("Found big number " + hexstr +" at by " + by + " at line"+line+"@"+linepos) 301 | byte=' ' 302 | 303 | text = text + byte 304 | 305 | return text 306 | 307 | ################################################################################ 308 | # readUrlList(filename): 309 | 310 | def readUrlList(filename): 311 | debug_flag=debug_readUrlList 312 | print(f"reading url list from file {filename}") 313 | 314 | file_lines = u.readFile(filename) 315 | 316 | # entries=(#Entries keyed by URL) 317 | entries=dict() 318 | 319 | url_match='^https?://' 320 | p_url = re.compile(url_match) 321 | 322 | empty_match='^\s*$' 323 | p_empty = re.compile(empty_match) 324 | 325 | comment_match='^\s*#' 326 | p_comment = re.compile(comment_match) 327 | 328 | end_match='^__END__$' 329 | p_end = re.compile(end_match) 330 | 331 | ###################################################################### 332 | ## Read all lines, adding entries to dictionary: 333 | 334 | entry_no=1; 335 | line_no=0; 336 | entries_started=False; 337 | 338 | entry = Entry() 339 | entry.url=None 340 | entry.name='entry'+str(entry_no)+'_line'+str(line_no) 341 | entry.fields['name']='entry'+str(entry_no)+'_line'+str(line_no) 342 | entry.debug=DEBUG_MODE 343 | entry.dinfo=DEBUG_INFO 344 | 345 | skip_until_empty_lines=False 346 | for file_line in file_lines: 347 | line_no = line_no+1 348 | debug("LINE"+str(line_no)+": "+file_line) 349 | 350 | ######################################## 351 | ## Skip comment lines: 352 | if p_comment.match(file_line): 353 | continue 354 | 355 | ######################################## 356 | ## Empty lines delimit entries: 357 | if (p_empty.match(file_line) or p_end.match(file_line)): 358 | url = entry.url 359 | #print("END OF ENTRY") 360 | 361 | if skip_until_empty_lines: 362 | debug("IGNORING lines after error") 363 | continue 364 | 365 | # Ignore if empty-line before 1st entry: 366 | if (p_empty.match(file_line) and (not entries_started)): 367 | debug("IGNORING empty-lines before 1st entry") 368 | continue 369 | 370 | if p_end.match(file_line): 371 | break 372 | 373 | if (url == None): 374 | continue 375 | #print("No url defined for entry"+str(entry_no)+" ending at line "+str(line_no)) 376 | #exit(-1) 377 | 378 | if (url in entries): 379 | full_error = "Entry already defined for url <{}> in entry <{}> ending at line {}".format(url, str(entry_no), str(line_no)) 380 | u.sendmail( entry, [ SEND_TO ], full_error, [], category, period, "ERROR: Duplicate url", runid) 381 | 382 | # skip rest of entry lines: 383 | skip_until_empty_lines=True 384 | continue 385 | #exit(-1) 386 | 387 | if (entry.get('debug') and ((entry.get('debug').lower == "true") or (entry.get('debug').lower == "enabled"))): 388 | entry.debug=True 389 | 390 | if (entry.get('dinfo') and ((entry.get('dinfo').lower == "true") or (entry.get('dinfo').lower == "enabled"))): 391 | entry.dinfo=True 392 | 393 | debug("Adding entry#"+str(entry_no)) 394 | entries[url]=entry 395 | entry_no = entry_no+1 396 | 397 | entry = Entry() 398 | entry.url=None 399 | entry.debug=DEBUG_MODE 400 | entry.dinfo=DEBUG_INFO 401 | entry.name='entry'+str(entry_no)+'_line'+str(line_no) 402 | entry.fields['name']='entry'+str(entry_no)+'_line'+str(line_no) 403 | continue 
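        # Descriptive note (added, not in the original code): lines reaching this
        # point belong to the current entry: a line with no leading space is taken
        # as the entry title, an indented http(s)://... line becomes entry.url, and
        # any other indented "tag:value" line is stored in entry.fields[tag].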
404 | 405 | skip_until_empty_lines=False 406 | 407 | ######################################## 408 | ## Detect title lines: (No spaces before line) 409 | if (file_line.find(" ") != 0): 410 | entry.fields['name']=file_line 411 | entry.name=file_line 412 | entries_started=True; 413 | continue 414 | 415 | file_line=file_line.lstrip() 416 | entries_started=True; 417 | 418 | ######################################## 419 | ## Detect url lines: 420 | if (p_url.match(file_line)): 421 | entry.url=file_line 422 | continue 423 | 424 | ######################################## 425 | ## Treat other lines: 426 | elements = file_line.split(":") 427 | name = elements[0] 428 | value = ":".join(elements[1:]) 429 | entry.fields[name]=value 430 | 431 | return entries 432 | 433 | 434 | ################################################################################ 435 | # def diff_pages(entries, NEW_DIR, OLD_DIR): 436 | 437 | def diff_pages(entries, NEW_DIR, OLD_DIR): 438 | 439 | global period 440 | 441 | diff_pages = "" 442 | 443 | for url in entries.keys(): 444 | entry=entries[url] 445 | name=entry.name 446 | print(40 * '_') 447 | print("\nDIFF: " + name + " => <" + url + ">") 448 | 449 | classId=getUrlId(url) 450 | 451 | email_attrs=dict() 452 | email_attrs['select_entries']=select_entries 453 | email_attrs['category']=category 454 | email_attrs['period']=period 455 | email_attrs['name']=name 456 | email_attrs['SEND_TO']=SEND_TO 457 | email_attrs['SEND_MAIL_INDIVIDUAL']=SEND_MAIL_INDIVIDUAL 458 | 459 | page = "" 460 | try: 461 | page = entry.diff_page(classId, NEW_DIR, OLD_DIR, email_attrs) 462 | except: 463 | error = "ERROR: on diff_page("+url+")" + traceback.format_exc() 464 | print(error) 465 | 466 | full_error= "
" + traceback.format_exc() + "
" 467 | full_error_header=" Errors for '"+name+"'
" 468 | 469 | SAVE_ERRORS.append(full_error_header+full_error) 470 | 471 | if entry.debug: 472 | TEST='just testing' 473 | u.sendmail( entry, [ 'mjbrightfr@gmail.com' ], TEST, [], 'category', 'period', TEST, 'runid') 474 | u.sendmail( entry, [ SEND_TO ], full_error, select_entries, category, period, "ERROR: " + name, runid) 475 | 476 | diff_pages = diff_pages + page 477 | 478 | return diff_pages 479 | 480 | ################################################################################ 481 | # def showlist(entries): 482 | 483 | def showlist(entries): 484 | 485 | print("\nEntries: " + str(len(entries)) + " entries (filtered)") 486 | 487 | for key in entries.keys(): 488 | url=key 489 | value=entries[key] 490 | name=value.name 491 | print(name + " => <" + url + ">") 492 | 493 | print("\nFinished list of " + str(len(entries)) + " entries (filtered)") 494 | print 495 | 496 | ################################################################################ 497 | # CMD-LINE ARGS: 498 | 499 | # FOR DEBUGGING: 500 | TEXT=u.readFile("/etc/hosts") 501 | 502 | ''' 503 | FORCE TEST sendmail: 504 | entry=Entry() 505 | entry.url='url' 506 | u.sendmail( entry, [ 'mjbrightfr@gmail.com' ], TEXT, [], 'category', 'period', TEXT, 'runid') 507 | ''' 508 | 509 | args=sys.argv 510 | 511 | print(80 * '_') 512 | print("Programe started at: " + u.DATETIME + " as:") 513 | print(' '.join(args)) 514 | 515 | 516 | ifile='LIST.txt' 517 | 518 | operations=[ 'list' ] 519 | 520 | ofile='GLOBAL_OP.html' 521 | 522 | # Used for DIFF_page 523 | DIR0=None 524 | DIR1=None 525 | 526 | runid=None 527 | 528 | a=0 529 | while a < (len(args)-1): 530 | a=a+1 531 | opt=args[a] 532 | 533 | if opt == "-u": 534 | a=a+1 535 | select_urls=args[a] 536 | continue 537 | 538 | if opt == "-e": 539 | a=a+1 540 | select_entries=args[a] 541 | continue 542 | 543 | if opt == "-c": 544 | a=a+1 545 | category=args[a] 546 | continue 547 | 548 | if opt == "-id": 549 | a=a+1 550 | runid=args[a] 551 | continue 552 | 553 | if opt == "-allid": 554 | a=a+1 555 | runid=None 556 | continue 557 | 558 | if opt == "-o": 559 | a=a+1 560 | ofile=args[a] 561 | continue 562 | 563 | if opt == "-l": 564 | a=a+1 565 | ifile=args[a] 566 | continue 567 | 568 | if opt == "-parser": 569 | a=a+1 570 | Entry.Parser = args[a] 571 | continue 572 | 573 | if opt == "-dinfo": 574 | print("Setting DEBUG_INFO to True") 575 | DEBUG_INFO=True 576 | continue 577 | 578 | if opt == "-debug": 579 | print("Setting DEBUG_MODE to True") 580 | DEBUG_MODE=True 581 | continue 582 | 583 | if opt == "-test": 584 | print("Setting TEST_MODE to True") 585 | TEST_MODE=True 586 | continue 587 | 588 | if opt == "-local": 589 | operations.append('parse_local') 590 | continue 591 | 592 | if opt == "-get": 593 | operations.append('get_pages') 594 | continue 595 | 596 | if opt == "-diff": 597 | operations.append('diff_page') 598 | continue 599 | 600 | if opt == "-DIFF": 601 | operations.append('DIFF_page') 602 | a=a+1 603 | DIR0=args[a] 604 | a=a+1 605 | DIR1=args[a] 606 | continue 607 | 608 | ######################################## 609 | ## Period options: 610 | 611 | if opt == "-hour": 612 | period=HOUR 613 | runid=runids[period] 614 | continue 615 | 616 | if opt == "-hour2": 617 | period=HOUR2 618 | runid=runids[period] 619 | continue 620 | 621 | if opt == "-hour4": 622 | period=HOUR4 623 | runid=runids[period] 624 | continue 625 | 626 | if opt == "-day": 627 | period=DAY 628 | runid=runids[period] 629 | continue 630 | 631 | if opt == "-day2": 632 | period=DAY2 633 | runid=runids[period] 634 | continue 
635 | 636 | if opt == "-week": 637 | period=WEEK 638 | runid=runids[period] 639 | continue 640 | 641 | if opt == "-week2": 642 | period=WEEK2 643 | runid=runids[period] 644 | continue 645 | 646 | if opt == "-month": 647 | period=MONTH 648 | runid=runids[period] 649 | continue 650 | 651 | if opt == "-month2": 652 | period=MONTH2 653 | runid=runids[period] 654 | continue 655 | 656 | ######################################## 657 | ## Mail options: 658 | 659 | if opt == "-maili": 660 | SEND_MAIL_INDIVIDUAL=True 661 | continue 662 | 663 | if opt == "-nomail": 664 | SEND_MAIL_INDIVIDUAL=False 665 | SEND_MAIL_GLOBAL=False 666 | continue 667 | 668 | if opt == "-nomaili": 669 | SEND_MAIL_INDIVIDUAL=False 670 | continue 671 | 672 | if opt == "-mailg": 673 | SEND_MAIL_GLOBAL=True 674 | continue 675 | 676 | if opt == "-nomailg": 677 | SEND_MAIL_GLOBAL=False 678 | continue 679 | 680 | print("Unknown option '"+opt+"'") 681 | exit(255) 682 | 683 | ################################################################################ 684 | # MAIN: 685 | 686 | HOME=os.getenv("HOME") 687 | 688 | Entry.globalRunID=runid 689 | 690 | CACHE=HOME + "/var/SCRAPER-CACHE/" 691 | if TEST_MODE: 692 | CACHE=HOME + "/var/SCRAPER-CACHE-TEST/" 693 | 694 | LATEST=CACHE + "LATEST" 695 | 696 | if (not os.path.exists(CACHE)): 697 | os.makedirs(CACHE) 698 | 699 | new_dir="UNKNOWN" 700 | old_dir="UNKNOWN" 701 | 702 | if period == HOUR: 703 | new_dir = CACHE + DATEHOUR 704 | old_dir = CACHE + getTimeString(timedelta(hours=-1), FMT_DATEHOUR) 705 | 706 | if period == HOUR2: 707 | new_dir = CACHE + DATEHOUR 708 | old_dir = CACHE + getTimeString(timedelta(hours=-2), FMT_DATEHOUR) 709 | 710 | if period == HOUR4: 711 | new_dir = CACHE + DATEHOUR 712 | old_dir = CACHE + getTimeString(timedelta(hours=-4), FMT_DATEHOUR) 713 | 714 | if period == DAY: 715 | new_dir = CACHE + DATE 716 | old_dir = CACHE + getTimeString(timedelta(days=-1), FMT_DATE) 717 | 718 | if period == DAY2: 719 | new_dir = CACHE + DATE 720 | old_dir = CACHE + getTimeString(timedelta(days=-2), FMT_DATE) 721 | 722 | if period == WEEK: 723 | new_dir = CACHE + DATE 724 | old_dir = CACHE + getTimeString(timedelta(days=-7), FMT_DATE) 725 | 726 | if period == WEEK2: 727 | new_dir = CACHE + DATE 728 | old_dir = CACHE + getTimeString(timedelta(days=-14), FMT_DATE) 729 | 730 | if period == MONTH: 731 | new_dir = CACHE + DATE 732 | old_dir = CACHE + getTimeString(timedelta(days=-30), FMT_DATE) 733 | 734 | if period == MONTH2: 735 | new_dir = CACHE + DATE 736 | old_dir = CACHE + getTimeString(timedelta(days=-60), FMT_DATE) 737 | 738 | entries = readUrlList(ifile) 739 | 740 | entries = filterSortEntries(entries, select_entries, select_urls, category, runid) 741 | 742 | for oper in operations: 743 | 744 | if (oper == "list"): 745 | showlist(entries) 746 | 747 | if (oper == "get_pages"): 748 | if (not os.path.exists(new_dir)): 749 | print("os.makedirs("+new_dir+")") 750 | os.makedirs(new_dir) 751 | 752 | get_pages(entries, new_dir) 753 | 754 | if (oper == "parse_local"): 755 | parse_pages(entries, "PAGES") 756 | 757 | if (oper == "diff_page") or (oper == "DIFF_page"): 758 | dir1=new_dir 759 | dir0=old_dir 760 | if (oper == "DIFF_page"): 761 | dir1=DIR1 762 | dir0=DIR0 763 | 764 | diff_pages_op = diff_pages(entries, dir1, dir0) 765 | 766 | if (len(SAVE_ERRORS) > 0): 767 | SEND_MAIL_SUMMARY=True 768 | 769 | if (SEND_MAIL_GLOBAL): 770 | with open(ofile, 'w') as f: 771 | f.writelines(diff_pages_op) 772 | 773 | lines = open(ofile, 'r') 774 | #lines = strip_tags(lines) 775 | 776 | body='' 777 | 
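            # Note (added, not in the original code): the lines below assemble the
            # outgoing mail: with SEND_MAIL_GLOBAL the concatenated per-site diffs
            # are written to ofile, read back and sent as one "GLOBAL" message,
            # prefixed by any saved errors; otherwise, if errors were recorded,
            # only a "SUMMARY" error mail is sent.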
if SEND_MAIL_SUMMARY and (len(SAVE_ERRORS) > 0): 778 | body = '

<hr/><b>Errors:</b><br/>' + ' '.join(SAVE_ERRORS) + '<hr/>
' 779 | 780 | body = body + ''.join(lines.readlines()) 781 | u.sendmail( None, [ SEND_TO], body, select_entries, category, period, "GLOBAL", runid) 782 | 783 | elif SEND_MAIL_SUMMARY and (len(SAVE_ERRORS) > 0): 784 | body = '

<hr/><b>Errors:</b><br/>' + ' '.join(SAVE_ERRORS) + '<hr/>
' 785 | 786 | u.sendmail( None, [ SEND_TO], body, select_entries, category, period, "SUMMARY", runid) 787 | 788 | 789 | exit(0) 790 | 791 | --------------------------------------------------------------------------------