├── docs
│   ├── GUI interface.ppt
│   └── Google Search GUI.jpg
├── .gitattributes
├── README.md
├── .gitignore
├── Get_google_link_results.py
├── Python_google_search_gui.py
└── Python_Google_Search.py

/docs/GUI interface.ppt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/spidezad/google_search_module/HEAD/docs/GUI interface.ppt
--------------------------------------------------------------------------------
/docs/Google Search GUI.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/spidezad/google_search_module/HEAD/docs/Google Search GUI.jpg
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 | 
4 | # Custom for Visual Studio
5 | *.cs diff=csharp
6 | *.sln merge=union
7 | *.csproj merge=union
8 | *.vbproj merge=union
9 | *.fsproj merge=union
10 | *.dbproj merge=union
11 | 
12 | # Standard to msysgit
13 | *.doc diff=astextplain
14 | *.DOC diff=astextplain
15 | *.docx diff=astextplain
16 | *.DOCX diff=astextplain
17 | *.dot diff=astextplain
18 | *.DOT diff=astextplain
19 | *.pdf diff=astextplain
20 | *.PDF diff=astextplain
21 | *.rtf diff=astextplain
22 | *.RTF diff=astextplain
23 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | google_search_module
2 | ====================
3 | 
4 | Retrieve Google search results using Python
5 | 
6 | The program obtains the result links from the Google search page, and each link is then crawled separately using Scrapy. In this way, users have more flexibility in extracting information from the individual websites. At present, only the title and meta description are scraped from each website. The other advantage is that it reduces the dependency on Google HTML tag changes.
7 | 
8 | The script depends on Scrapy and yaml (for unicode handling). Both can be installed using pip.
9 | 
10 | The scripts are divided into two parts. The main script to run is Python_Google_Search.py. Get_google_link_results.py is the Scrapy spider that crawls either the Google search page or the individual websites; the switch depends on the JSON settings file created.
11 | 
12 | More information can be obtained from: http://wp.me/p4nnkg-1i
13 | 
14 | The GUI version requires the following module: https://github.com/spidezad/Extract_specified_txt_fr_files
15 | See the docs folder for the GUI display. 
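
For the command-line version, the overall flow is sketched below, condensed from the `__main__` block of Python_Google_Search.py (a rough sketch; the search phrase, output paths and the `scrapy runspider` call are the defaults hard-coded in the scripts):

```python
from Python_Google_Search import gsearch_url_form_class

hh = gsearch_url_form_class('tokyo go')   # a str, or a list of search phrases
hh.set_num_of_search_results(100)         # rounded up to pages of 100 results
hh.formed_search_url()                    # build the google search url(s)
hh.set_setting_to_json_file(hh.prepare_data_for_json_store())

# then run the spider on the settings file, e.g.
#   scrapy runspider Get_google_link_results.py
# a second pass with hh.data_format_switch = 2 crawls the individual result links
```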
16 | For more details on the GUI, pls see the http://simplypython.wordpress.com/2014/07/12/scaping-google-results-using-python-gui-version/ 17 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ################# 2 | ## Eclipse 3 | ################# 4 | 5 | *.pydevproject 6 | .project 7 | .metadata 8 | bin/ 9 | tmp/ 10 | *.tmp 11 | *.bak 12 | *.swp 13 | *~.nib 14 | local.properties 15 | .classpath 16 | .settings/ 17 | .loadpath 18 | 19 | # External tool builders 20 | .externalToolBuilders/ 21 | 22 | # Locally stored "Eclipse launch configurations" 23 | *.launch 24 | 25 | # CDT-specific 26 | .cproject 27 | 28 | # PDT-specific 29 | .buildpath 30 | 31 | 32 | ################# 33 | ## Visual Studio 34 | ################# 35 | 36 | ## Ignore Visual Studio temporary files, build results, and 37 | ## files generated by popular Visual Studio add-ons. 38 | 39 | # User-specific files 40 | *.suo 41 | *.user 42 | *.sln.docstates 43 | 44 | # Build results 45 | 46 | [Dd]ebug/ 47 | [Rr]elease/ 48 | x64/ 49 | build/ 50 | [Bb]in/ 51 | [Oo]bj/ 52 | 53 | # MSTest test Results 54 | [Tt]est[Rr]esult*/ 55 | [Bb]uild[Ll]og.* 56 | 57 | *_i.c 58 | *_p.c 59 | *.ilk 60 | *.meta 61 | *.obj 62 | *.pch 63 | *.pdb 64 | *.pgc 65 | *.pgd 66 | *.rsp 67 | *.sbr 68 | *.tlb 69 | *.tli 70 | *.tlh 71 | *.tmp 72 | *.tmp_proj 73 | *.log 74 | *.vspscc 75 | *.vssscc 76 | .builds 77 | *.pidb 78 | *.log 79 | *.scc 80 | 81 | # Visual C++ cache files 82 | ipch/ 83 | *.aps 84 | *.ncb 85 | *.opensdf 86 | *.sdf 87 | *.cachefile 88 | 89 | # Visual Studio profiler 90 | *.psess 91 | *.vsp 92 | *.vspx 93 | 94 | # Guidance Automation Toolkit 95 | *.gpState 96 | 97 | # ReSharper is a .NET coding add-in 98 | _ReSharper*/ 99 | *.[Rr]e[Ss]harper 100 | 101 | # TeamCity is a build add-in 102 | _TeamCity* 103 | 104 | # DotCover is a Code Coverage Tool 105 | *.dotCover 106 | 107 | # NCrunch 108 | *.ncrunch* 109 | .*crunch*.local.xml 110 | 111 | # Installshield output folder 112 | [Ee]xpress/ 113 | 114 | # DocProject is a documentation generator add-in 115 | DocProject/buildhelp/ 116 | DocProject/Help/*.HxT 117 | DocProject/Help/*.HxC 118 | DocProject/Help/*.hhc 119 | DocProject/Help/*.hhk 120 | DocProject/Help/*.hhp 121 | DocProject/Help/Html2 122 | DocProject/Help/html 123 | 124 | # Click-Once directory 125 | publish/ 126 | 127 | # Publish Web Output 128 | *.Publish.xml 129 | *.pubxml 130 | 131 | # NuGet Packages Directory 132 | ## TODO: If you have NuGet Package Restore enabled, uncomment the next line 133 | #packages/ 134 | 135 | # Windows Azure Build Output 136 | csx 137 | *.build.csdef 138 | 139 | # Windows Store app package directory 140 | AppPackages/ 141 | 142 | # Others 143 | sql/ 144 | *.Cache 145 | ClientBin/ 146 | [Ss]tyle[Cc]op.* 147 | ~$* 148 | *~ 149 | *.dbmdl 150 | *.[Pp]ublish.xml 151 | *.pfx 152 | *.publishsettings 153 | 154 | # RIA/Silverlight projects 155 | Generated_Code/ 156 | 157 | # Backup & report files from converting an old project file to a newer 158 | # Visual Studio version. 
Backup files are not needed, because we have git ;-) 159 | _UpgradeReport_Files/ 160 | Backup*/ 161 | UpgradeLog*.XML 162 | UpgradeLog*.htm 163 | 164 | # SQL Server files 165 | App_Data/*.mdf 166 | App_Data/*.ldf 167 | 168 | ############# 169 | ## Windows detritus 170 | ############# 171 | 172 | # Windows image file caches 173 | Thumbs.db 174 | ehthumbs.db 175 | 176 | # Folder config file 177 | Desktop.ini 178 | 179 | # Recycle Bin used on file shares 180 | $RECYCLE.BIN/ 181 | 182 | # Mac crap 183 | .DS_Store 184 | 185 | 186 | ############# 187 | ## Python 188 | ############# 189 | 190 | *.py[co] 191 | 192 | # Packages 193 | *.egg 194 | *.egg-info 195 | dist/ 196 | build/ 197 | eggs/ 198 | parts/ 199 | var/ 200 | sdist/ 201 | develop-eggs/ 202 | .installed.cfg 203 | 204 | # Installer logs 205 | pip-log.txt 206 | 207 | # Unit test / coverage reports 208 | .coverage 209 | .tox 210 | 211 | #Translations 212 | *.mo 213 | 214 | #Mr Developer 215 | .mr.developer.cfg 216 | 217 | ############# 218 | ## Python wx 219 | ############# 220 | *.wxg 221 | 222 | ############### 223 | ## Extra 224 | ############### 225 | backup -------------------------------------------------------------------------------- /Get_google_link_results.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 3 | ############################################# 4 | 5 | Module to get google search results by using Scrapy (Spider module) 6 | Author: Tan Kok Hua (Guohua tan) 7 | Email: spider123@gmail.com 8 | Revised date: Apr 11 2014 9 | 10 | ############################################## 11 | 12 | Scrapy Spider Module to 13 | 1) Scrape the google link results from the google search page 14 | 2) Scrape the individual links for the title and description. 15 | 16 | Updates: 17 | Apr 17 2014: Cater for mutliple google search 18 | Apr 12 2014: Resolve issues with unrecognise character 19 | : Add in join_list_of_str function and remove_whitespace_fr_raw 20 | : Add in paragraph process 21 | : Add in option to extract text from paragraph 22 | 23 | TODO: Enable injecting of some tag for xml collection 24 | 25 | ''' 26 | 27 | import re 28 | import os 29 | import sys 30 | import json 31 | import string 32 | import yaml 33 | from scrapy.spider import Spider 34 | from scrapy.selector import Selector 35 | from Python_Google_Search import gsearch_url_form_class 36 | 37 | # Options 38 | GS_LINK_JSON_FILE = r'C:\data\temp\output' 39 | RESULT_FILE = r'c:\data\temp\htmlread_1.txt' 40 | 41 | ENABLE_TEXT_SUMMARIZE = 0 # For NLTK to look into the text for details. 42 | ENABLE_PARAGRAPH_STORED = 1 # Store website content to file. 
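# Illustrative example (an assumption based on gsearch_url_form_class.prepare_data_for_json_store)
# of the settings json that Python_Google_Search.py writes and this spider reads back in below:
#   {"Name": "Search",
#    "Domain": ["www.google.com"],
#    "SearchUrl": ["https://www.google.com/search?q=...&num=100&start=0"],
#    "type_of_parse": "google_search"}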
43 | 44 | class GoogleSearch(Spider): 45 | 46 | # Save the result file: Always in the same file 47 | # re -start at each section of the run 48 | with open(RESULT_FILE,'w') as f: 49 | f.write('') 50 | print 'Restart the log file' 51 | 52 | with open(GS_LINK_JSON_FILE,'w') as f: 53 | f.write('') 54 | print 'Restart the GS_LINK_JSON_FILE file' 55 | 56 | # for retrieving the settings from json file 57 | search_class = gsearch_url_form_class("") 58 | setting_data = search_class.retrieved_setting_fr_json_file() 59 | 60 | # Parameters set used for spider crawling 61 | name = setting_data['Name'] 62 | allowed_domains = setting_data['Domain'] 63 | start_urls = setting_data['SearchUrl'] 64 | 65 | def join_list_of_str(self,list_of_str, joined_chars= '...'): 66 | ''' 67 | Function to combine a list of str to one long str 68 | list of str --> str 69 | 70 | ''' 71 | return joined_chars.join([n for n in list_of_str]) 72 | 73 | def remove_whitespace_fr_raw(self,raw_input): 74 | ''' 75 | Remove unnecessary white space such as \r,\t,\n 76 | str raw_input --> str 77 | 78 | ''' 79 | 80 | for n in ['\n','\t','\r']: 81 | raw_input = raw_input.replace(n,'') 82 | return raw_input 83 | 84 | def combine_all_url_link_for_multiple_search(self,more_url_list): 85 | ''' 86 | Combine all the url link list in the event of mutliple search. 87 | list more_url_list --> none 88 | get from Json file and eventually dump all back 89 | ''' 90 | 91 | with open(GS_LINK_JSON_FILE, "r") as outfile: 92 | setting_data = yaml.load(outfile) 93 | 94 | if setting_data is None or not setting_data.has_key('output_url'): 95 | setting_data = dict() 96 | setting_data['output_url'] = [] 97 | 98 | with open(GS_LINK_JSON_FILE, "w") as outfile: 99 | json.dump({'output_url': setting_data['output_url']+more_url_list}, outfile, indent=4) 100 | 101 | 102 | 103 | def parse(self, response): 104 | ''' 105 | Required function for spider to crawl 106 | Run two different type of parsing depending on the json keyword type of parse 107 | if type of parse == google_search --> get list of links from google results 108 | if type of parse == general --> get the meta information for each site 109 | ''' 110 | 111 | if self.setting_data['type_of_parse'] == 'google_search': 112 | print 113 | print 'For google search parsing' 114 | 115 | ## Get the selector for xpath parsing 116 | sel = Selector(response) 117 | google_search_links_list = sel.xpath('//h3/a/@href').extract() 118 | google_search_links_list = [re.search('q=(.*)&sa',n).group(1) for n in google_search_links_list if re.search('q=(.*)&sa',n)] 119 | 120 | ## Display a list of the result link 121 | for n in google_search_links_list: 122 | print n 123 | 124 | ## Dump all results to file 125 | self.combine_all_url_link_for_multiple_search(google_search_links_list) 126 | 127 | if self.setting_data['type_of_parse'] == 'general': 128 | 129 | print 130 | print 'general website processing' 131 | sel = Selector(response) 132 | 133 | ## Get meta info from website 134 | title = sel.xpath('//title/text()').extract() 135 | if len(title)>0: 136 | title = title[0].encode(errors='replace') #replace any unknown character with ? 137 | contents = sel.xpath('/html/head/meta[@name="description"]/@content').extract() 138 | if len(contents)>0: 139 | contents = contents[0].encode(errors='replace') #replace any unknown character with ? 
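            # NOTE: if the xpath queries above match nothing, title stays an empty
            # list rather than a string, and the f.write(title + '\n') further down
            # would raise a TypeError; contents is wrapped in str() so it is safe.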
140 | 141 | if ENABLE_PARAGRAPH_STORED: 142 | paragraph_list = sel.xpath('//p/text()').extract() 143 | para_str = self.join_list_of_str(paragraph_list, joined_chars= '..') 144 | para_str = para_str.encode(errors='replace') 145 | para_str = self.remove_whitespace_fr_raw(para_str) 146 | 147 | 148 | print 149 | print title 150 | print 151 | print contents 152 | 153 | ## Dump results to text file 154 | with open(RESULT_FILE,'a') as f: 155 | f.write('\n') 156 | f.write('#'*20) 157 | f.write('\n') 158 | f.write(title + '\n') 159 | f.write(response.url) 160 | for n in range(2): f.write('\n') 161 | f.write(str(contents)) 162 | for n in range(2): f.write('\n') 163 | if ENABLE_PARAGRAPH_STORED: 164 | f.write(para_str) 165 | f.write('\n') 166 | f.write('#'*20) 167 | for n in range(2): f.write('\n') 168 | 169 | print 170 | print 'Completed' 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | -------------------------------------------------------------------------------- /Python_google_search_gui.py: -------------------------------------------------------------------------------- 1 | """ 2 | GUI for the python google search. 3 | Allow more easy viewing of results. 4 | 5 | TODO: 6 | Set limit to the page scroller 7 | Input the multiple search options 8 | 9 | Learning: 10 | Trigger event\ 11 | http://stackoverflow.com/questions/747781/wxpython-calling-an-event-manually 12 | 13 | 14 | 15 | """ 16 | 17 | 18 | import os 19 | import sys 20 | import wx 21 | from Python_Google_Search import gsearch_url_form_class 22 | 23 | import Extract_specified_txt_fr_files 24 | 25 | 26 | class MyPanel(wx.Panel): 27 | def __init__(self, parent,*args, **kwds): 28 | self.parent = parent 29 | wx.Panel.__init__(self, parent, *args, **kwds) 30 | 31 | ## parameters 32 | self.page_scroller_result = dict() 33 | 34 | self.dummy_panel = wx.Panel(self, wx.ID_ANY) 35 | 36 | ## input search results. 37 | self.search_input_txtctrl = wx.TextCtrl(self, wx.ID_ANY, "") 38 | 39 | ## button for executing the search results 40 | self.search_btn = wx.Button(self, wx.ID_ANY, "search") 41 | self.search_btn.Bind(wx.EVT_BUTTON, self.run_search) 42 | 43 | ## incremental button for the page viewing 44 | self.page_scroller = wx.SpinCtrl(self, -1, "", (30, 50)) 45 | self.page_scroller.SetRange(1,100) 46 | self.page_scroller.SetValue(1) 47 | self.Bind(wx.EVT_SPINCTRL, self.OnSpin, self.page_scroller) 48 | self.Bind(wx.EVT_TEXT, self.OnText, self.page_scroller) 49 | 50 | ## Display of search results 51 | self.results_txtctrl = wx.TextCtrl(self, wx.ID_ANY, "", 52 | style = wx.TE_MULTILINE| wx.TE_RICH2|wx.TE_WORDWRAP) 53 | 54 | ## for notes taking . 
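        ## (notes typed here are saved via the File > Save / Save As menu handlers
        ##  in MyFrame, which write notes_txtctrl.GetValue() to the chosen file)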
55 | self.notes_txtctrl = wx.TextCtrl(self, wx.ID_ANY, "", 56 | style = wx.TE_MULTILINE| wx.TE_RICH2|wx.TE_WORDWRAP) 57 | 58 | self.__do_layout() 59 | 60 | 61 | def __do_layout(self): 62 | 63 | sizer_1 = wx.BoxSizer(wx.VERTICAL) 64 | sizer_2 = wx.BoxSizer(wx.HORIZONTAL) 65 | mid_portion_sizer = wx.BoxSizer(wx.HORIZONTAL) 66 | 67 | sizer_2.Add(self.search_input_txtctrl, 5, wx.ALL | wx.EXPAND, 7) 68 | sizer_2.Add(self.search_btn, 0, wx.ALL | wx.EXPAND, 7) 69 | 70 | mid_portion_sizer.Add(self.page_scroller,1, wx.ALL | wx.EXPAND, 7) 71 | mid_portion_sizer.Add((-1,-1),4, wx.ALL | wx.EXPAND, 7) 72 | 73 | sizer_1.Add(sizer_2, 1, wx.ALL | wx.EXPAND, 3) 74 | sizer_1.Add(mid_portion_sizer, 1, wx.EXPAND, 0) 75 | sizer_1.Add(self.results_txtctrl, 10, wx.ALL | wx.EXPAND, 10) 76 | sizer_1.Add(self.notes_txtctrl, 5, wx.ALL | wx.EXPAND, 10) 77 | 78 | self.SetSizer(sizer_1) 79 | sizer_1.Fit(self) 80 | 81 | def OnSpin(self, evt): 82 | """Page Scroller function: on scroll. Scroll to correct page""" 83 | target_output = self.page_scroller_result[self.page_scroller.GetValue()] 84 | self.results_txtctrl.SetValue(target_output) 85 | 86 | def OnText(self, evt): 87 | """Page Scroller function: on enter text. text to correct page""" 88 | target_output = self.page_scroller_result[self.page_scroller.GetValue()] 89 | self.results_txtctrl.SetValue(target_output) 90 | 91 | def trigger_scroller_event(self): 92 | """Manually trigger the event for the self.page_scroller to display the first set of result""" 93 | evt = wx.PyCommandEvent(wx.EVT_TEXT.typeId,self.page_scroller.GetId()) 94 | self.GetEventHandler().ProcessEvent(evt) 95 | 96 | def run_search(self, evt): 97 | """Run the google search """ 98 | search_input = self.search_input_txtctrl.GetValue() 99 | self.execute_google_search(str(search_input)) 100 | self.set_result_to_dict_for_page_scroller() 101 | self.clear_result_screen() 102 | self.trigger_scroller_event() 103 | 104 | def execute_google_search(self, search_input): 105 | """Run the full google search""" 106 | 107 | print search_input 108 | 109 | # User options 110 | NUM_SEARCH_RESULTS = 5 # number of search results returned 111 | BYPASS_GOOGLE_SEARCH = 0 # if this is active, bypass searching 112 | NUM_RESULTS_TO_PROCESS = 30 # specify the number of results url to crawl 113 | 114 | print 'Start search' 115 | 116 | ## Parameters setting 117 | search_words = search_input 118 | #search_words = ['best area to stay in tokyo','cheap place to stay in tokyo'] 119 | GS_LINK_JSON_FILE = r'C:\data\temp\output' #must be same as the get_google_link_results.py 120 | 121 | # spider store location, depend on user input 122 | spider_file_path = r'C:\pythonuserfiles\google_search_module' 123 | spider_filename = 'Get_google_link_results.py' 124 | 125 | ## Google site link scrape 126 | if not BYPASS_GOOGLE_SEARCH: 127 | print 'Get the google search results links' 128 | hh = gsearch_url_form_class(search_words) 129 | hh.data_format_switch = 1 130 | hh.set_results_num_str(NUM_SEARCH_RESULTS) 131 | hh.formed_search_url() 132 | ## Set the setting for json 133 | temp_data_for_store = hh.prepare_data_for_json_store() 134 | hh.set_setting_to_json_file(temp_data_for_store) 135 | new_project_cmd = 'scrapy settings -s DEPTH_LIMIT=1 & cd "%s" & scrapy runspider %s' %(spider_file_path,spider_filename) 136 | os.system(new_project_cmd) 137 | 138 | ## Scape list of results link 139 | print 'Start scrape individual results' 140 | data = hh.retrieved_setting_fr_json_file(GS_LINK_JSON_FILE) 141 | 142 | ##check if proper url --> must at least start 
with http 143 | url_links_fr_search = [n for n in data['output_url'] if n.startswith('http')] 144 | 145 | ## Switch to the second seach 146 | hh.data_format_switch = 2 147 | 148 | ## Optional limit the results displayed 149 | hh.sp_search_url_list = url_links_fr_search[:NUM_RESULTS_TO_PROCESS]#keep the results to 10.Can be removed 150 | 151 | ## Set the setting for json 152 | temp_data_for_store = hh.prepare_data_for_json_store() 153 | hh.set_setting_to_json_file(temp_data_for_store) 154 | 155 | ## Run the crawler -- and remove the pause if do not wish to see contents of the command prompt 156 | new_project_cmd = 'scrapy settings -s DEPTH_LIMIT=1 & cd "%s" & scrapy runspider %s' %(spider_file_path,spider_filename) 157 | os.system(new_project_cmd) 158 | 159 | print 'Completed' 160 | 161 | def display_result_to_screen(self): 162 | """Read the data from the file and display in at the screen""" 163 | with open(r'c:\data\temp\htmlread_1.txt','r') as f: 164 | all_data = f.readlines() 165 | self.results_txtctrl.SetValue(self.join_list_of_str(all_data)) 166 | 167 | def clear_result_screen(self): 168 | """Function to clear result screen (self.results_txtctrl""" 169 | self.results_txtctrl.SetValue('') 170 | 171 | def set_result_to_dict_for_page_scroller(self): 172 | """Store all result in dict to be used for page scroller""" 173 | key_symbol = '###' 174 | combined_result_list,self.page_scroller_result = Extract_specified_txt_fr_files.para_extract(r'c:\data\temp\htmlread_1.txt', key_symbol, overlapping = 0 ) 175 | 176 | def join_list_of_str(self,list_of_str, joined_chars= ''): 177 | ''' 178 | Function to combine a list of str to one long str 179 | list of str --> str 180 | 181 | ''' 182 | return joined_chars.join([n for n in list_of_str]) 183 | 184 | 185 | 186 | class MyFrame(wx.Frame): 187 | def __init__(self, parent, ID, title): 188 | 189 | wx.Frame.__init__(self, parent, ID, title,pos=(150, 50), size=(550, 620))#size and position 190 | self.top_panel = MyPanel(self) 191 | 192 | ## Parameters 193 | self.save_notes_filename = '' 194 | 195 | ## Add in menu 196 | menuBar = wx.MenuBar() 197 | 198 | ## Menu keys 199 | menu1 = wx.Menu() 200 | menu1.Append(101, "Save", "") 201 | menu1.Append(102, "Save As", "") 202 | 203 | menuBar.Append(menu1, "File") 204 | self.SetMenuBar(menuBar) 205 | 206 | self.Bind(wx.EVT_MENU, self.menu_save_notes, id=101) 207 | self.Bind(wx.EVT_MENU, self.menu_save_as_notes, id=102) 208 | 209 | ## Menu functions --> need a function to retrieve the data 210 | def menu_save_notes(self, evt): 211 | """Save the notes options to the same file""" 212 | if self.save_notes_filename == '': 213 | ## do a save as if the file name not specified. 
214 | self.menu_save_as_notes(8888)#dummy 215 | else: 216 | self.write_notes_data_to_file() 217 | 218 | def menu_save_as_notes(self, evt): 219 | """Open a dialog to save the file""" 220 | 221 | dlg = wx.FileDialog( 222 | self, message="Save file as ...", defaultDir=r'c:\data\temp', 223 | defaultFile="", style=wx.SAVE 224 | ) 225 | 226 | 227 | dlg.SetFilterIndex(2) 228 | 229 | if dlg.ShowModal() == wx.ID_OK: 230 | path = dlg.GetPath() 231 | self.save_notes_filename = path 232 | 233 | dlg.Destroy() 234 | self.write_notes_data_to_file() 235 | 236 | def write_notes_data_to_file(self): 237 | """ Write all the notes to file""" 238 | with open(self.save_notes_filename, 'w') as f: 239 | f.write(self.top_panel.notes_txtctrl.GetValue()) 240 | 241 | 242 | 243 | 244 | class MyApp(wx.App): 245 | def __init__(self): 246 | wx.App.__init__(self,redirect =False) 247 | self.frame= MyFrame(None,wx.ID_ANY, "Google Search") 248 | self.SetTopWindow(self.frame) 249 | self.frame.Show() 250 | 251 | def run(): 252 | try: 253 | app = MyApp() 254 | app.MainLoop() 255 | except Exception,e: 256 | print e 257 | del app#make sure to include this 258 | 259 | 260 | if __name__ == '__main__': 261 | 262 | run() 263 | 264 | -------------------------------------------------------------------------------- /Python_Google_Search.py: -------------------------------------------------------------------------------- 1 | ''' 2 | ############################################# 3 | 4 | Module to get google search results by using Scrapy 5 | Author: Tan Kok Hua (Guohua tan) 6 | Email: spider123@gmail.com 7 | Revised date: Jul 18 2014 8 | 9 | ############################################## 10 | 11 | Usage: 12 | Retrieve the google results links from google search site using Scrapy 13 | For each link, use scrapy to crawl the title and contents. 14 | By minimizing the information retrieved from the google search site, allows more independent control of info extract 15 | and also reduce the dependency on google format/tag change. 16 | 17 | Uses the windows platform. called the scrapy crawler from command line. 18 | 19 | Required Modules: 20 | YAML --> for the clean html, resolve unicode. 21 | Scrapy --> for scraping website, make use of scrapy crawler 22 | 23 | Updates: 24 | Jul 17 2014: Rm set_results_num_str function as the result per page fixed to 100 per page. 25 | Apr 16 2014: re arrange the self.reformat_search_for_spaces function to formed_individual_url function 26 | : Add in capability to handle multiple search items 27 | Apr 11 2014: Add in users parameters 28 | Apr 09 2014: Add in modify_search_key function 29 | 30 | TODO: 31 | Add in advanced google search 32 | http://www.johntedesco.net/blog/2012/06/21/how-to-solve-impossible-problems-daniel-russells-awesome-google-search-techniques/ 33 | Time out when scraping --> some times scrape a lot of one website... need to cut down 34 | 35 | Handling mulitple search and mulitple page 36 | if not enough search, need to filter off similar ones. 37 | 38 | BUGS: 39 | If the total results from google is less than specified, it will loop over. 40 | 41 | LEARNING: 42 | The "&start=101" will determine the page number. 43 | 44 | ''' 45 | 46 | import re, os, sys, math 47 | import json 48 | 49 | import yaml 50 | from scrapy.spider import Spider 51 | from scrapy.selector import Selector 52 | 53 | 54 | class gsearch_url_form_class(object): 55 | ''' 56 | Class for constructing the url to be used in search. 
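    Typical flow (see the __main__ block below): construct with a search phrase
    (str) or a list of phrases, call formed_search_url() to build the google
    search urls, then prepare_data_for_json_store() and set_setting_to_json_file()
    to hand the settings to the Scrapy spider in Get_google_link_results.py.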
57 | ''' 58 | def __init__(self, google_search_keyword = '' ): 59 | ''' 60 | Take in the search key word and transform it to the google search url 61 | str/list google_search_keyword --> None 62 | Able to take in a list or str, 63 | if str will set to self.g_search_key 64 | else set to self.g_search_key_list 65 | 66 | #ie - Sets the character encoding that is used to interpret the query string 67 | #oe - Sets the character encoding that is used to encode the results 68 | #aq -? 69 | #num -1,10,100 results displayed per page, default use 100 per page in this case. 70 | #client -- temp maintain to be firefox 71 | 72 | 73 | TODO: 74 | #with different agent --randomize this 75 | #take care of situation where the catchpa come out 76 | #may need to turn off personalize search pws = 0 77 | ''' 78 | 79 | if type(google_search_keyword) == str: 80 | self.g_search_key = google_search_keyword 81 | self.multiple_search_enabled = 0 82 | elif type(google_search_keyword) == list: 83 | self.g_search_key_list = google_search_keyword 84 | self.g_search_key = '' 85 | self.multiple_search_enabled = 1 86 | else: 87 | print 'google_search_keyword not of type str or list' 88 | raise 89 | 90 | ## user defined parameters 91 | self.search_results_num = 100 #set to any variable 92 | 93 | ## url construct string text 94 | self.prefix_of_search_text = "https://www.google.com/search?q=" 95 | self.postfix_of_search_text = '&ie=utf-8&oe=utf-8&aq=t&rls=org.mozilla:en-US:official&client=firefox-a&channel=fflb&num=100'# non changable text 96 | 97 | ## Type of crawler. 98 | self.data_format_switch = 1 # 1 - google site crawler, 2 - individual site crawler 99 | 100 | #storage of the various parameters 101 | self.setting_json_file = r'c:\data\temp\google_search' 102 | self.spider_name = 'Search' 103 | self.sp_allowed_domain = ['www.google.com'] 104 | self.sp_search_url_list = []#place to put the search results 105 | 106 | 107 | def reformat_search_for_spaces(self): 108 | """ 109 | Method call immediately at the initialization stages 110 | get rid of the spaces and replace by the "+" 111 | Use in search term. Eg: "Cookie fast" to "Cookie+fast" 112 | 113 | steps: 114 | strip any lagging spaces if present 115 | replace the self.g_search_key 116 | """ 117 | self.g_search_key = self.g_search_key.rstrip().replace(' ', '+') 118 | 119 | def set_num_of_search_results(self, num_search): 120 | """ Method to set the number of search results. Will be round in multiple of 100. 121 | Args: 122 | num_search (int): Number of search results to display. Must be int. 123 | 124 | """ 125 | assert num_search > 0 126 | self.search_results_num = num_search 127 | 128 | def calculate_num_page_to_scan(self): 129 | """Calculate the num of page to scan, assume 100 results per page. 130 | Based on user defined self.search_results_num. 131 | Estimate the number of page needed to scan in multiple of hundred. 132 | 133 | """ 134 | if self.search_results_num <1: 135 | print "search results specified is not valid." 
136 | raise 137 | 138 | self.pages_to_scan = int(math.ceil(self.search_results_num/100.0)) 139 | 140 | def modify_search_key(self, purpose): 141 | ''' 142 | This allow modification to the search key according to purpose 143 | str purpose --> none (set to self.g_search_key) 144 | purpose: 'def' Get definition of word 145 | ''' 146 | if purpose == 'def': 147 | self.g_search_key = 'define+' + self.g_search_key 148 | else: 149 | print 'purpose unknown: do nothing' 150 | pass ## no changes if the purpose is not defined 151 | 152 | def formed_search_url(self): 153 | ''' 154 | Handle the different type of search: either one selected key phrases or multiple search items 155 | Depend on the input (self.multiple_search_enabled) will call the different function. 156 | Set to self.sp_search_url_list 157 | ''' 158 | if not self.multiple_search_enabled: 159 | return self.formed_individual_search_url() 160 | else: 161 | return self.formed_multiple_search_url() 162 | 163 | def formed_page_num(self, page_index): 164 | """ Method to form part of the url where the page num is included. 165 | Args: 166 | page_num (int): page num in int to be formed. Will convert to multiple of 100. 167 | for example page_index 1 will require "&start=100". 168 | Start page begin with index 0 169 | Returns: 170 | (str): return part of the url. 171 | 172 | """ 173 | return "&start=%i" %(page_index*100) 174 | 175 | 176 | def formed_individual_search_url(self): 177 | ''' 178 | Function to get the formed url for search 179 | need the page num 180 | none --> str output_url_str 181 | set to tthe self.output_url_str and also return the string 182 | also set to self.sp_search_url_list 183 | 184 | ''' 185 | ## scan the number of results needed 186 | self.calculate_num_page_to_scan() 187 | 188 | ## convert the input search result 189 | self.reformat_search_for_spaces() 190 | 191 | self.sp_search_url_list = [] 192 | for n in range(0,self.pages_to_scan,1): 193 | self.output_url_str = self.prefix_of_search_text + self.g_search_key + \ 194 | self.postfix_of_search_text +\ 195 | self.formed_page_num(n) 196 | self.sp_search_url_list.append(self.output_url_str) 197 | 198 | return self.sp_search_url_list 199 | 200 | ## !!! 201 | def formed_multiple_search_url(self): 202 | ''' 203 | Function to create multiple search url by querying a list of phrases. 
204 | For running consecutive search 205 | Use the formed_search_url to create individual search and store them in list 206 | 207 | ''' 208 | temp_url_list = [] 209 | ## get the individual url 210 | for n in self.g_search_key_list: 211 | ## set the individual key 212 | self.g_search_key = n 213 | temp_url_list= temp_url_list + self.formed_individual_search_url() 214 | 215 | self.sp_search_url_list = temp_url_list 216 | return temp_url_list 217 | 218 | def prepare_data_for_json_store(self,additonal_parm_dict = {}): 219 | ''' 220 | orgainized the data set for storing (trigger by self.data_format_switch) 221 | none, dict additonal_parm_dict --> dict 222 | prepare a dict for read in to json --> a parameters to control the type of data input 223 | store and return as a dict 224 | additonal_parm_dict will add more user setting data to the data for storage 225 | 226 | inject a variable that differentiate between google search and other random website 227 | ''' 228 | if self.data_format_switch == 1: 229 | temp_data = {'Name':self.spider_name, 'Domain':self.sp_allowed_domain, 230 | 'SearchUrl':self.sp_search_url_list, 'type_of_parse':'google_search'} 231 | 232 | elif self.data_format_switch == 2: 233 | temp_data = {'Name':'random target website', 'Domain':[], 234 | 'SearchUrl':self.sp_search_url_list,'type_of_parse':'general'} 235 | else: 236 | raise 237 | 238 | temp_data.update(additonal_parm_dict) 239 | return temp_data 240 | 241 | def print_list_of_data_format_for_json(self): 242 | ''' 243 | Function to print out the various list of format prepared based on the data format switch (self.data_format_switch) 244 | None --> None 245 | ''' 246 | print '1 -- google search \n 2 -- random website domain' 247 | 248 | def set_setting_to_json_file(self, data_dict): 249 | ''' 250 | Function to set the various setting to json file 251 | dict data_dict --> none 252 | List of parameters to store (mainly for the spider to crawl 253 | name, allowed domains also in list (may leave blanks??), search url (list to be more than one???) 
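        Illustrative example of data_dict for a google-search run (as produced by
        prepare_data_for_json_store with data_format_switch = 1):
            {'Name': 'Search', 'Domain': ['www.google.com'],
             'SearchUrl': [...], 'type_of_parse': 'google_search'}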
254 | 255 | ''' 256 | with open(self.setting_json_file, "w") as outfile: 257 | json.dump(data_dict, outfile, indent=4) 258 | 259 | def retrieved_setting_fr_json_file(self, filename = ''): 260 | ''' 261 | Function to retrieve the various setting from the json file specified by the self.setting_json_file 262 | None --> json object setting_data 263 | set the various parameters 264 | 265 | ''' 266 | if filename =='': 267 | filename = self.setting_json_file 268 | 269 | with open(filename, "r") as infile: 270 | setting_data = yaml.load(infile) 271 | 272 | return setting_data 273 | 274 | if __name__ == '__main__': 275 | 276 | ''' 277 | Running the google search 278 | 279 | ''' 280 | # User options 281 | NUM_SEARCH_RESULTS = 125 # number of search results returned 282 | BYPASS_GOOGLE_SEARCH = 0 # if this is active, bypass searching 283 | NUM_RESULTS_TO_PROCESS = 5 # specify the number of results url to crawl 284 | 285 | print 'Start search' 286 | 287 | ## Parameters setting 288 | search_words = 'tokyo go' 289 | #search_words = ['best area to stay in tokyo','cheap place to stay in tokyo'] 290 | GS_LINK_JSON_FILE = r'C:\data\temp\output' #must be same as the get_google_link_results.py 291 | 292 | # spider store location, depend on user input 293 | spider_file_path = r'C:\pythonuserfiles\google_search_module' 294 | spider_filename = 'Get_google_link_results.py' 295 | 296 | ## Google site link scrape 297 | if not BYPASS_GOOGLE_SEARCH: 298 | print 'Get the google search results links' 299 | hh = gsearch_url_form_class(search_words) 300 | hh.set_num_of_search_results(NUM_SEARCH_RESULTS) 301 | hh.data_format_switch = 1 302 | hh.formed_search_url() 303 | 304 | ## Set the setting for json 305 | temp_data_for_store = hh.prepare_data_for_json_store() 306 | hh.set_setting_to_json_file(temp_data_for_store) 307 | new_project_cmd = 'scrapy settings -s DEPTH_LIMIT=1 & cd "%s" & scrapy runspider %s & pause' %(spider_file_path,spider_filename) 308 | os.system(new_project_cmd) 309 | 310 | ## Scape list of results link 311 | print 'Start scrape individual results' 312 | data = hh.retrieved_setting_fr_json_file(GS_LINK_JSON_FILE) 313 | 314 | ##check if proper url --> must at least start with http 315 | url_links_fr_search = [n for n in data['output_url'] if n.startswith('http')] 316 | 317 | ## Switch to the second seach 318 | hh.data_format_switch = 2 319 | 320 | ## Optional limit the results displayed 321 | hh.sp_search_url_list = url_links_fr_search[:NUM_RESULTS_TO_PROCESS]#keep the results to 10.Can be removed 322 | 323 | ## Set the setting for json 324 | temp_data_for_store = hh.prepare_data_for_json_store() 325 | hh.set_setting_to_json_file(temp_data_for_store) 326 | 327 | ## Run the crawler -- and remove the pause if do not wish to see contents of the command prompt 328 | new_project_cmd = 'scrapy settings -s DEPTH_LIMIT=1 & cd "%s" & scrapy runspider %s & pause' %(spider_file_path,spider_filename) 329 | os.system(new_project_cmd) 330 | 331 | print 'Completed' 332 | 333 | 334 | 335 | 336 | 337 | 338 | --------------------------------------------------------------------------------
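
Worked example of the url construction in Python_Google_Search.py (a sketch using
the defaults from the __main__ block; not part of the original repository):

    from Python_Google_Search import gsearch_url_form_class

    hh = gsearch_url_form_class('tokyo go')
    hh.set_num_of_search_results(125)   # ceil(125/100) -> 2 pages to scan
    urls = hh.formed_search_url()
    # urls[0] ends with ...q=tokyo+go...&num=100&start=0
    # urls[1] ends with ...q=tokyo+go...&num=100&start=100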