├── .gitignore ├── LICENSE ├── README.md ├── distributed.py ├── etl.py ├── extends.py ├── project.xml ├── sample.py └── spider.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | .idea -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # etlpy 2 | ##designed by desert 3 | a smart stream-like crawler & etl python library 4 | 5 | ##1. Introduction 6 | etlpy is a configuration-file-driven tool for data collection (crawling) and data cleaning. 7 | 8 | Writing crawler and data-cleaning code by hand is always tedious, so that code should be generated by a tool instead. etlpy was created to solve exactly this problem. 9 | 10 | With a visual, graphical design tool, you can quickly build crawler and data-cleaning workflows, save them as an xml file, and have the etlpy engine parse that file to produce the final data. 11 | 12 | ##2. Usage 13 | Using it is very simple: 14 | ``` 15 | from etl import ETLTool 16 | tool = ETLTool(); 17 | tool.LoadProject('project.xml', '数据清洗ETL-大众点评'); 18 | datas = tool.RefreshDatas(); 19 | for r in datas: 20 | print(r) 21 | ``` 22 | RefreshDatas returns a generator; iterating over it in a for loop reads all of the data automatically. 23 | 24 | ##3. How it works 25 | Modules come in several kinds: generation, filtering, sorting, transformation, and execution. 26 | 27 | Using Python generators, the different modules can be chained into a pipeline; the data (Python dicts) is processed and consumed as it flows along that pipeline. 28 | 29 | The graphical tool is written in C# and uses Linq, a technique similar to Python generators; the original idea comes from Lisp s-expressions.
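To make the pipeline idea concrete, here is a minimal, self-contained sketch in plain Python (these function names and the URL are invented for illustration and are not etlpy's actual classes): every stage takes a generator of dicts and yields a new one, so chaining stages defines the whole flow lazily.

```
# Each stage consumes a generator of dicts and yields dicts onward;
# nothing executes until the final generator is iterated.
def generate_pages(_):
    # generator stage: emit one dict per page number
    for i in range(1, 6):
        yield {'page': i}

def add_url(rows):
    # transformer stage: derive a new column from an existing one
    for r in rows:
        r['url'] = 'http://www.example.com/list?page=%d' % r['page']
        yield r

def keep_even(rows):
    # filter stage: only pass rows that satisfy a predicate
    for r in rows:
        if r['page'] % 2 == 0:
            yield r

pipeline = keep_even(add_url(generate_pages(None)))
for row in pipeline:   # the pipeline runs only when it is consumed
    print(row)
```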
30 | 31 | ##4. Use cases 32 | Crawling, computation, cleaning: any data whose processing fits this computational paradigm can be handled with it. 33 | -------------------------------------------------------------------------------- /distributed.py: -------------------------------------------------------------------------------- 1 | import sys; 2 | from queue import Queue 3 | from multiprocessing.managers import BaseManager 4 | import etl; 5 | import json 6 | import extends; 7 | import time; 8 | authkey= "etlpy".encode('utf-8') 9 | timeout=1; 10 | rpc_port=8888 11 | 12 | class ETLJob: 13 | def __init__(self,project,jobname,config,id): 14 | self.project= project; 15 | self.jobname=jobname; 16 | self.config=config; 17 | self.id= id; 18 | 19 | class JobResult: 20 | def __init__(self,name,count,id): 21 | self.name=name; 22 | self.count=count; 23 | self.id=id; 24 | 25 | class Master: 26 | 27 | def __init__(self,project,jobname): 28 | # queue of jobs dispatched to workers 29 | self.dispatched_job_queue = Queue() 30 | # queue of finished jobs 31 | self.finished_job_queue = Queue() 32 | self.project= project; 33 | self.jobname=jobname; 34 | self.maxprocess= 10; 35 | 36 | def get_dispatched_job_queue(self): 37 | return self.dispatched_job_queue 38 | 39 | def get_finished_job_queue(self): 40 | return self.finished_job_queue 41 | 42 | def start(self,skip=0): 43 | # register the dispatched and finished job queues on the network 44 | BaseManager.register('get_dispatched_job_queue', callable=self.get_dispatched_job_queue) 45 | BaseManager.register('get_finished_job_queue', callable=self.get_finished_job_queue) 46 | 47 | # listen on the port and start the service 48 | manager = BaseManager(address=('0.0.0.0', rpc_port), authkey=authkey) 49 | manager.start() 50 | 51 | # fetch the queues through the methods registered above 52 | dispatched_jobs = manager.get_dispatched_job_queue() 53 | finished_jobs = manager.get_finished_job_queue() 54 | 55 | job_id = 0 56 | module= self.project.modules[self.jobname]; 57 | 58 | proj=json.loads(json.dumps(etl.convert_dict(self.project,self.project.__defaultdict__), ensure_ascii=False)) 59 | while True: 60 | for task in etl.parallel_map(module): 61 | job_id = job_id + 1 62 | if job_id1: 133 | ip=argv[1]; 134 | if len(argv)>2: 135 | port=int(argv[2]); 136 | slave= Slave(); 137 | slave.start(True,ip,port); 138 | 139 | 140 | -------------------------------------------------------------------------------- /etl.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | __author__ = 'zhaoyiming' 3 | import re; 4 | import extends 5 | import urllib 6 | import spider; 7 | import json; 8 | import html 9 | import xml.etree.ElementTree as ET 10 | import csv 11 | 12 | import os; 13 | 14 | intattrs = re.compile('Max|Min|Count|Index|Interval|Position'); 15 | boolre = re.compile('^(One|Can|Is)|Enable|Should|Have|Revert'); 16 | rescript = re.compile('Regex|Number') 17 | 18 | 19 | def SetAttr(etl, key, value): 20 | if key in ['Group','Type']: 21 | return 22 | 23 | if intattrs.search(key) is not None: 24 | try: 25 | t = int(value); 26 | setattr(etl, key, t); 27 | except ValueError: 28 | print('it is a ValueError') 29 | setattr(etl, key, value); 30
| elif boolre.search(key) is not None: 31 | setattr(etl, key, True if value == 'True' else False); 32 | else: 33 | setattr(etl, key, value); 34 | 35 | def getMatchCount(mat): 36 | return mat.lastindex if mat.lastindex is not None else 1; 37 | 38 | class ETLTool(extends.EObject): 39 | def __init__(self): 40 | self.Enabled=True; 41 | self.Column = '' 42 | def process(self, data): 43 | return data 44 | def init(self): 45 | pass; 46 | 47 | class Transformer(ETLTool): 48 | def __init__(self): 49 | super(Transformer, self).__init__() 50 | self.IsMultiYield=False 51 | self.NewColumn=''; 52 | self.OneOutput=True; 53 | self.OneInput = False; 54 | 55 | def transform(self,data): 56 | pass; 57 | def process(self,data): 58 | if self.IsMultiYield: # one to many 59 | for r in data: 60 | for p in self.transform( r): 61 | yield extends.MergeQuery(p, r,self.NewColumn); 62 | return; 63 | for d in data: # one to one 64 | if self.OneOutput: 65 | if self.Column not in d or self.Column not in d: 66 | yield d; 67 | continue; 68 | item = d[self.Column] if self.OneInput else d; 69 | res = self.transform(item) 70 | key= self.NewColumn if self.NewColumn!='' else self.Column; 71 | d[key]=res; 72 | else: 73 | self.transform( d) 74 | yield d; 75 | 76 | class Executor(ETLTool): 77 | def execute(self,data): 78 | pass; 79 | def process(self,data): 80 | for r in data: 81 | self.execute(r); 82 | yield r; 83 | 84 | 85 | class Filter(ETLTool): 86 | def __init__(self): 87 | super(Filter, self).__init__() 88 | self.Revert=False; 89 | def filter(self,data): 90 | 91 | return True; 92 | 93 | def process(self, data): 94 | for r in data: 95 | item = None; 96 | if self.Column in r: 97 | item = r[self.Column]; 98 | if item is None and self.__class__ != NullFT: 99 | continue; 100 | result = self.filter( item) 101 | if result == True and self.Revert == False: 102 | yield r; 103 | elif result == False and self.Revert == True: 104 | yield r; 105 | 106 | class Generator(ETLTool): 107 | def __init__(self): 108 | super(Generator, self).__init__() 109 | self.MergeType='Append' 110 | self.Position=0; 111 | def generate(self,generator): 112 | pass; 113 | 114 | def process(self, generator): 115 | if generator is None: 116 | return self.generate(None); 117 | else: 118 | if self.MergeType=='Append': 119 | return extends.Append(generator,self.process(None)); 120 | elif self.MergeType=='Merge': 121 | return extends.Merge(generator, self.process(None)); 122 | else: 123 | return extends.Cross(generator,self.generate) 124 | 125 | 126 | 127 | class ConnectorBase(ETLTool): 128 | def __init__(self): 129 | super(ConnectorBase, self).__init__() 130 | self.Connector = ''; 131 | self.ExecuteType = 'OnlyInsert' 132 | self.filetype = ''; 133 | 134 | def init(self): 135 | self.connector= self.__proj__.connectors[self.Connector]; 136 | if self.connector.TypeName=='MongoDBConnector': 137 | import pymongo 138 | client = pymongo.MongoClient(self.connector.ConnectString); 139 | db = client[self.connector.DBName]; 140 | self.Table = db[self.TableName]; 141 | else: 142 | path = self.TableName; 143 | filetype = path.split('.')[-1].lower(); 144 | encode = 'utf-8'; 145 | self.file = open(path, type, encoding=encode) 146 | self.filetype = filetype; 147 | 148 | 149 | class DbEX(ConnectorBase): 150 | def __init__(self): 151 | super(DbEX, self).__init__() 152 | self.TableName='' 153 | 154 | 155 | 156 | 157 | def process(self,datas): 158 | if self.connector.TypeName == 'MongoDBConnector': 159 | etype = self.ExecuteType; 160 | table = self.Table; 161 | work = {'OnlyInsert': 
lambda d: table.save(d),'InsertOrUpdate':lambda d: table.save(d)}; 162 | for data in datas: 163 | work[etype](data); 164 | yield data; 165 | else: 166 | 167 | if self.filetype in ['csv', 'txt']: 168 | field = extends.getkeys(datas); 169 | self.writer = csv.DictWriter(self.file, field, delimiter=sp, lineterminator='\n') 170 | self.writer.writeheader() 171 | for data in datas: 172 | self.writer.writerow(data); 173 | yield data; 174 | elif self.filetype == 'json': 175 | self.file.write('[') 176 | for data in datas: 177 | json.dump(data, self.file, ensure_ascii=False) 178 | self.file.write(','); 179 | yield data; 180 | self.file.write(']') 181 | self.file.close(); 182 | 183 | 184 | class DBGE(ConnectorBase): 185 | 186 | def generate(self,data): 187 | if self.Connector=='MongoDBConnector': 188 | for data in self.Table.find(): 189 | yield data; 190 | else: 191 | if self.filetype in ['csv', 'txt']: 192 | sp = ',' if self.filetype == 'csv' else '\t'; 193 | reader = csv.DictReader(self.file, delimiter=sp) 194 | for r in reader: 195 | yield r; 196 | elif self.filetype == 'json': 197 | items = json.load(self.file); 198 | for r in items: 199 | yield r; 200 | 201 | def process(self, generator): 202 | if generator is None: 203 | return self.generate(None); 204 | else: 205 | if self.MergeType == 'Append': 206 | return extends.Append(generator, self.process(None)); 207 | elif self.MergeType == 'Merge': 208 | return extends.Merge(generator, self.process(None)); 209 | else: 210 | return extends.Cross(generator, self.generate) 211 | 212 | 213 | def setValue(data,etl,value): 214 | if etl.NewColumn!='': 215 | data[etl.NewColumn]=value; 216 | else: 217 | data[etl.Column]=value; 218 | 219 | class RegexFT(Filter): 220 | 221 | def init(self): 222 | self.Regex = re.compile(self.Script); 223 | self.Count=1; 224 | 225 | def filter(self,data): 226 | v = self.Regex.findall(data); 227 | if v is None: 228 | return False; 229 | else: 230 | return self.Count <= len(v) 231 | 232 | class RangeFT(Filter): 233 | 234 | def filter(self,item): 235 | f = float(item) 236 | return self.Min <= f <= self.Max; 237 | 238 | class RepeatFT(Filter): 239 | 240 | def init(self): 241 | self.set=set(); 242 | def filter(self,data): 243 | if data in self.set: 244 | return False; 245 | else: 246 | self.set.add(data); 247 | return True; 248 | 249 | class NullFT(Filter): 250 | 251 | def filter(self,data): 252 | if data is None: 253 | return False; 254 | if isinstance(data, str): 255 | return data.strip() != ''; 256 | return True; 257 | 258 | 259 | class AddNewTF(Transformer): 260 | 261 | def transform(self,data): 262 | return self.NewValue; 263 | 264 | 265 | class AutoIndexTF(Transformer): 266 | def init(self): 267 | super(AutoIndexTF, self).__init__() 268 | self.currindex = 0; 269 | def transform(self, data): 270 | self.currindex += 1; 271 | return self.currindex; 272 | 273 | 274 | class RenameTF(Transformer): 275 | 276 | def __init__(self): 277 | super(RenameTF, self).__init__() 278 | self.OneOutput = False; 279 | def transform(self, data): 280 | if not self.Column in data: 281 | return; 282 | item = data[self.Column]; 283 | del data[self.Column]; 284 | if self.NewColumn != "": 285 | data[self.NewColumn] = item; 286 | 287 | class DeleteTF(Transformer): 288 | def __init__(self): 289 | super(DeleteTF, self).__init__() 290 | self.OneOutput = False; 291 | def transform(self, data): 292 | if self.Column in data: 293 | del data[self.Column]; 294 | 295 | class HtmlTF(Transformer): 296 | def __init__(self): 297 | super(HtmlTF, self).__init__() 298 | 
self.OneInput=True; 299 | 300 | def transform(self, data): 301 | return html.escape(data) if self.ConvertType == 'Encode' else html.unescape(data); 302 | 303 | 304 | class UrlTF(Transformer): 305 | def __init__(self): 306 | super(UrlTF, self).__init__() 307 | self.OneInput = True; 308 | def transform(self, data): 309 | if self.ConvertType == 'Encode': 310 | url = data.encode('utf-8'); 311 | return urllib.parse.quote(url); 312 | else: 313 | return urllib.parse.unquote(data); 314 | 315 | 316 | class RegexSplitTF(Transformer): 317 | def transform(self, data): 318 | items = re.split(self.Regex, data) 319 | if len(items) <= self.Index: 320 | return data; 321 | if not self.FromBack: 322 | return items[self.Index]; 323 | else: 324 | index = len(items) - self.Index - 1; 325 | if index < 0: 326 | return data; 327 | else: 328 | return items[index]; 329 | return items[index]; 330 | 331 | class MergeTF(Transformer): 332 | def __init__(self): 333 | super(MergeTF, self).__init__() 334 | self.Format='{0}' 335 | self.MergeWith='' 336 | def transform(self, data): 337 | if self.MergeWith == '': 338 | columns = []; 339 | else: 340 | columns = [str(data[r]) for r in self.MergeWith.split(' ')] 341 | columns.insert(0, data[self.Column] if self.Column in data else ''); 342 | res = self.Format; 343 | for i in range(len(columns)): 344 | res = res.replace('{' + str(i) + '}', str(columns[i])) 345 | return res; 346 | 347 | 348 | 349 | 350 | class RegexTF(Transformer): 351 | def __init__(self): 352 | super(RegexTF, self).__init__() 353 | self.Script = ''; 354 | self.OneInput = True; 355 | 356 | def init(self): 357 | self.Regex = re.compile(self.Script); 358 | def transform(self, data): 359 | item = re.findall(self.Regex, str(data)); 360 | if self.Index < 0: 361 | return ''; 362 | if len(item) <= self.Index: 363 | return ''; 364 | else: 365 | r = item[self.Index]; 366 | return r if isinstance(r, str) else r[0]; 367 | 368 | class ReReplaceTF(RegexTF): 369 | 370 | def transform(self, data): 371 | return re.sub(self.Regex, self.ReplaceText, data); 372 | 373 | class NumberTF(RegexTF): 374 | def __init__(self): 375 | super(NumberTF, self).__init__() 376 | self.Script='' #TODO 377 | 378 | def transform(self, data): 379 | t = super(NumberTF,self).transform( data); 380 | if t is not None and t != '': 381 | return int(t); 382 | return t; 383 | 384 | class SplitTF(Transformer): 385 | def __init__(self): 386 | super(SplitTF, self).__init__() 387 | self.SplitChar=''; 388 | self.OneInput = True; 389 | 390 | 391 | def transform(self, data): 392 | splits = self.SplitChar.split(' '); 393 | sp = splits[0] 394 | if sp == '': 395 | return data; 396 | 397 | r = data.split(splits[0]); 398 | if len(r) > self.Index: 399 | return r[self.Index]; 400 | return ''; 401 | 402 | class TrimTF(Transformer): 403 | def __init__(self): 404 | super(TrimTF, self).__init__() 405 | self.OneInput = True; 406 | 407 | def transform(self, data): 408 | return data.strip(); 409 | 410 | class StrExtractTF(Transformer): 411 | def __init__(self): 412 | super(StrExtractTF, self).__init__() 413 | self.HaveStartEnd=False; 414 | self.Start='' 415 | self.OneInput=True; 416 | self.End='' 417 | 418 | def transform(self, data): 419 | start = data.find(self.Former); 420 | if start == -1: 421 | return 422 | end = data.find(self.End, start); 423 | if end == -1: 424 | return; 425 | if self.HaveStartEnd: 426 | end += len(self.End); 427 | if not self.HaveStartEnd: 428 | start += len(self.Former); 429 | return data[start:end]; 430 | 431 | class PythonTF(Transformer): 432 | def 
__init__(self): 433 | super(PythonTF, self).__init__() 434 | self.OneOutput=False 435 | self.Script='value' 436 | self.ScriptWorkMode='不进行转换' 437 | def transform(self, data): 438 | result = eval(self.Script, {'value': data[self.Column]}, data); 439 | if result is not None and self.IsMultiYield == False: 440 | key = self.NewColumn if self.NewColumn != '' else self.Column; 441 | data[key] = result; 442 | return result; 443 | 444 | class CrawlerTF(Transformer): 445 | def __init__(self): 446 | super(CrawlerTF, self).__init__() 447 | self.CrawlerSelector=''; 448 | self.MaxTryCount=1; 449 | self.IsRegex=False 450 | self.OneOutput=False; 451 | def init(self): 452 | self.IsMultiYield = True; 453 | self.crawler = self.__proj__.modules.get(self.CrawlerSelector, None); 454 | self.buff = {}; 455 | def transform(self, data): 456 | crawler = self.crawler; 457 | url = data[self.Column]; 458 | buff = self.buff; 459 | if url in buff: 460 | datas = buff[url]; 461 | else: 462 | datas = crawler.CrawData(url); 463 | if len(buff) < 100: 464 | buff[url] = datas; 465 | if self.crawler.IsMultiData == 'List': 466 | for d in datas: 467 | res = extends.MergeQuery(d, data, self.NewColumn); 468 | yield res; 469 | else: 470 | data = extends.Merge(data, datas); 471 | yield data; 472 | 473 | 474 | class XPathTF(Transformer): 475 | def __init__(self): 476 | super(XPathTF, self).__init__() 477 | self.XPath='' 478 | self.IsMultiYield = True; 479 | self.OneOutput=False; 480 | 481 | def init(self): 482 | self.IsMultiYield=True; 483 | self.OneOutput = False; 484 | def transform(self, data): 485 | from lxml import etree 486 | if self.IsManyData: 487 | tree = spider.GetHtmlTree(data[self.Column]); 488 | nodes = tree.xpath(self.XPath); 489 | for node in nodes: 490 | ext = {'Text': spider.getnodetext(node), 'HTML': etree.tostring(node).decode('utf-8')}; 491 | ext['OHTML'] = ext['HTML'] 492 | yield extends.MergeQuery(ext, data, self.NewColumn); 493 | else: 494 | tree = spider.GetHtmlTree(data[self.Column]); 495 | nodes = tree.xpath(self.XPath); 496 | node=nodes[0] 497 | if hasattr(node,'text'): 498 | setValue(data, self, node.text); 499 | else: 500 | setValue(data,self,str(node)) 501 | yield data; 502 | 503 | 504 | class ToListTF(Transformer): 505 | def transform(self, data): 506 | yield data; 507 | 508 | class JsonTF(Transformer): 509 | def __init__(self): 510 | super(JsonTF, self).__init__() 511 | self.OneOutput=False 512 | self.ScriptWorkMode='文档列表'; 513 | 514 | def init(self): 515 | self.IsMultiYield= self.ScriptWorkMode=='文档列表'; 516 | 517 | def transform(self, data): 518 | js = json.loads(data[self.Column]); 519 | if isinstance(js, list): 520 | for j in js: 521 | yield j; 522 | else: 523 | yield js; 524 | 525 | class RangeGE(Generator): 526 | def __init__(self): 527 | super(RangeGE, self).__init__() 528 | self.Interval='1' 529 | self.MaxValue='1' 530 | self.MinValue='1' 531 | def generate(self,generator): 532 | interval= int(extends.Query(generator,self.Interval)) 533 | maxvalue= int(extends.Query(generator,self.MaxValue)) 534 | minvalue= int(extends.Query(generator,self.MinValue)) 535 | for i in range(minvalue,maxvalue,interval): 536 | item= {self.Column:round(i,5)} 537 | yield item; 538 | 539 | class RangeTF(Transformer): 540 | def __init__(self): 541 | super(RangeTF, self).__init__() 542 | self.Skip=0; 543 | self.Take=9999999; 544 | def transform(self, data): 545 | skip = int(extends.Query(data, self.Skip)); 546 | take = int(extends.Query(data, self.Take)); 547 | i = 0; 548 | for r in data: 549 | if i < skip: 550 | continue; 
551 | if i >= take: 552 | break; 553 | i += 1; 554 | yield r; 555 | 556 | 557 | class EtlGE(Generator): 558 | def generate(self,data): 559 | subetl = self.__proj__.modules[self.ETLSelector]; 560 | for r in generate(subetl.AllETLTools): 561 | yield r; 562 | 563 | class EtlEX(Executor): 564 | def execute(self,datas): 565 | subetl = self.__proj__.modules[self.ETLSelector]; 566 | for data in datas: 567 | if spider.IsNone(self.NewColumn): 568 | doc = data.copy(); 569 | else: 570 | doc = {}; 571 | extends.MergeQuery(doc, data, self.NewColumn + " " + self.Column); 572 | result=(r for r in generate(subetl.AllETLTools, [doc])) 573 | count=0; 574 | for r in result: 575 | count+=1; 576 | print(r); 577 | print(count) 578 | yield data; 579 | 580 | class EtlTF(Transformer): 581 | def transform(self,datas): 582 | subetl = self.__proj__.modules[self.ETLSelector]; 583 | if self.IsMultiYield: 584 | 585 | for data in datas: 586 | doc = data.copy(); 587 | for r in subetl.__generate__(subetl.AllETLTools, [doc]): 588 | yield extends.MergeQuery(r, data, self.NewColumn); 589 | else: 590 | yield None; # TODO 591 | 592 | 593 | 594 | class TextGE(Generator): 595 | def __init__(self): 596 | super(TextGE, self).__init__() 597 | self.Content=''; 598 | def init(self): 599 | self.arglists= [r.strip() for r in self.Content.split('\n')]; 600 | def generate(self,data): 601 | for i in range(self.Position, len(self.arglists)): 602 | yield {self.Column: self.arglists[i]} 603 | 604 | 605 | 606 | 607 | 608 | 609 | class TableEX(Executor): 610 | def __init__(self): 611 | super(TableEX, self).__init__() 612 | self.Table = 'Table'; 613 | def execute(self,data): 614 | tables= self.__proj__.tables; 615 | tname = self.Table; 616 | if tname not in tables: 617 | tables[tname] = []; 618 | for r in data: 619 | tables[tname].append(r); 620 | yield r; 621 | 622 | 623 | 624 | 625 | 626 | 627 | 628 | class BaiduLocation(Transformer): 629 | pass; 630 | 631 | 632 | class GetIPLocation(Transformer): 633 | pass; 634 | 635 | class GetRoute(Transformer): 636 | pass; 637 | 638 | class NearbySearch(Transformer): 639 | pass; 640 | 641 | class NlpTF(Transformer): 642 | pass; 643 | 644 | class TransTF(Transformer): 645 | pass; 646 | class JoinDBTF(Transformer): 647 | pass; 648 | 649 | class RepeatTF(Transformer): 650 | pass; 651 | class ResponseTF(Transformer): 652 | pass; 653 | 654 | class Time2StrTF(Transformer): 655 | pass; 656 | 657 | 658 | class BfsGE(Generator): 659 | pass; 660 | 661 | class DictTF(Transformer): 662 | pass; 663 | 664 | class FileExistFT(Transformer): 665 | def __init__(self): 666 | super(FileExistFT,self).__init__(); 667 | self.Script = ''; 668 | self.OneInput = True; 669 | def transform(self,data): 670 | import os; 671 | return str(os.path.exists(data)); 672 | 673 | class MergeRepeatTF(Transformer): 674 | pass; 675 | 676 | class NumRangeFT(Filter): 677 | pass; 678 | 679 | class DelayTF(Transformer): 680 | pass; 681 | 682 | class ReadFileTextTF(Transformer): 683 | pass; 684 | 685 | class WriteFileTextTF(Transformer): 686 | pass; 687 | class FolderGE(Generator): 688 | pass; 689 | 690 | class TableGE(Generator): 691 | pass; 692 | class FileDataTF(Transformer): 693 | pass; 694 | 695 | 696 | 697 | class SaveFileEX(Executor): 698 | def __init__(self): 699 | super(SaveFileEX, self).__init__() 700 | self.SavePath=''; 701 | 702 | def execute(self,data): 703 | 704 | save_path = extends.Query(data, self.SavePath); 705 | (folder,file)=os.path.split(save_path); 706 | if not os.path.exists(folder): 707 | os.makedirs(folder); 708 | 
urllib.request.urlretrieve(data[self.Column], save_path) 709 | 710 | 711 | def GetChildNode(roots, name): 712 | for etool in roots: 713 | if etool.get('Name') == name or etool.tag == name: 714 | return etool; 715 | return None; 716 | 717 | 718 | def InitFromHttpItem(config, item): 719 | httprib = config.attrib; 720 | paras = spider.Para2Dict(httprib['Parameters'], '\n', ':'); 721 | item.Headers = paras; 722 | item.Url = httprib['URL']; 723 | post = 'Postdata'; 724 | if post in httprib: 725 | item.postdata = httprib[post]; 726 | else: 727 | item.postdata = None; 728 | 729 | 730 | 731 | 732 | class Project(extends.EObject): 733 | def __init__(self): 734 | self.modules={}; 735 | self.tables={} 736 | self.connectors={}; 737 | self.__defaultdict__={}; 738 | 739 | 740 | def LoadProject_dict(dic): 741 | proj = Project(); 742 | for key,connector in dic['connectors'].items(): 743 | proj.connectors[key]= extends.dict_to_poco_type(connector); 744 | for key,module in dic['modules'].items(): 745 | task =None; 746 | if 'AllETLTools' in module: 747 | task = etl_factory(ETLTask(),proj); 748 | for r in module['AllETLTools']: 749 | etl= etl_factory(r['Type'],proj); 750 | for attr,value in r.items(): 751 | if attr in ['Type']: 752 | continue; 753 | setattr(etl,attr,value); 754 | etl.__proj__=proj; 755 | task.AllETLTools.append(etl) 756 | elif 'CrawItems' in module: 757 | task=etl_factory(spider.SmartCrawler(),proj); 758 | task.CrawItems=[]; 759 | extends.dict_copy_poco(task,module); 760 | for r in module['CrawItems']: 761 | crawlitem= etl_factory(spider.CrawItem(),proj) 762 | extends.dict_copy_poco(crawlitem,r); 763 | task.CrawItems.append(crawlitem) 764 | task.HttpItem= etl_factory(spider.HTTPItem(),proj) 765 | extends.dict_copy_poco(task.HttpItem,module['HttpItem']) 766 | task.HttpItem.Headers=module['HttpItem']["Headers"]; 767 | if task is not None: 768 | proj.modules[key]=task; 769 | 770 | print('load project success') 771 | return proj; 772 | 773 | 774 | def task_DumpLinq(tools): 775 | array=[]; 776 | for t in tools: 777 | typename= extends.get_type_name(t); 778 | newcolumn=getattr(t,'NewColumn',''); 779 | s='%s,%s'%(typename,t.Column); 780 | s+='=>%s,'%newcolumn if newcolumn!='' else ','; 781 | attrs=[]; 782 | defaultdict= t.__proj__.__defaultdict__[typename]; 783 | for att in t.__dict__: 784 | value=t.__dict__[att]; 785 | if att in ['NewColumn','Column','IsMultiYield']: 786 | continue 787 | if not isinstance(value,(str,int,bool,float)): 788 | continue; 789 | if value is None or att not in defaultdict or defaultdict[att]==value: 790 | continue; 791 | attrs.append('%s=%s'%(att,value)); 792 | s+=','.join(attrs) 793 | array.append(s) 794 | return '\n'.join(array); 795 | 796 | def convert_dict(obj,defaultdict): 797 | if not isinstance(obj, (str, int, float, list, dict, tuple, extends.EObject)): 798 | return None 799 | # if isinstance(obj,) 800 | if isinstance(obj, extends.EObject): 801 | d={} 802 | typename = extends.get_type_name(obj); 803 | 804 | for key, value in obj.__dict__.items(): 805 | if typename in defaultdict: 806 | default = defaultdict[typename]; 807 | if value== default.get(key,None): 808 | continue; 809 | if key.startswith('__'): 810 | continue; 811 | 812 | p =convert_dict(value,defaultdict) 813 | if p is not None: 814 | d[key]=p 815 | if isinstance(obj,ETLTool): 816 | d['Type']= typename; 817 | return d; 818 | 819 | elif isinstance(obj, list): 820 | return [convert_dict(r,defaultdict) for r in obj]; 821 | elif isinstance(obj,dict): 822 | return {key: convert_dict(value,defaultdict) for 
key,value in obj.items()} 823 | return obj; 824 | 825 | 826 | 827 | 828 | return d 829 | 830 | def Project_DumpJson(proj): 831 | dic= convert_dict(proj,proj.__defaultdict__) 832 | return json.dumps(dic, ensure_ascii=False, indent=2) 833 | 834 | 835 | def Project_LoadJson(js): 836 | d=json.loads(js); 837 | return LoadProject_dict(d) 838 | 839 | def etl_factory(item,proj): 840 | if isinstance(item,str): 841 | item=eval('%s()'%item); 842 | else: 843 | item=item; 844 | import copy 845 | name = extends.get_type_name(item) 846 | if name not in proj.__defaultdict__: 847 | proj.__defaultdict__[name]=copy.deepcopy( item.__dict__); 848 | return item; 849 | 850 | 851 | def Project_LoadXml(path): 852 | tree = ET.parse(path); 853 | proj=Project(); 854 | def factory(obj): 855 | return etl_factory(obj,proj); 856 | root = tree.getroot(); 857 | root = root.find('Doc'); 858 | for etool in root: 859 | if etool.tag == 'Children': 860 | etype = etool.get('Type'); 861 | name = etool.get('Name'); 862 | if etype == 'SmartETLTool': 863 | etltool = factory(ETLTask()); 864 | for m in etool: 865 | if m.tag == 'Children': 866 | type= m.attrib['Type'] 867 | etl = factory(type); 868 | etl.__proj__=proj 869 | for att in m.attrib: 870 | SetAttr(etl, att, m.attrib[att]); 871 | etltool.AllETLTools.append(etl); 872 | proj.modules[name] = etltool; 873 | elif etype == 'SmartCrawler': 874 | import spider; 875 | crawler =factory(spider.SmartCrawler()); 876 | crawler.HttpItem= factory(spider.HTTPItem()) 877 | crawler.Name = etool.attrib['Name']; 878 | crawler.IsMultiData = etool.attrib['IsMultiData'] 879 | crawler.RootXPath= etool.attrib['RootXPath'] 880 | httpconfig = GetChildNode(etool, 'HttpSet'); 881 | InitFromHttpItem(httpconfig, crawler.HttpItem); 882 | login = GetChildNode(etool, 'Login'); 883 | if login is not None: 884 | crawler.Login = factory(spider.HTTPItem()); 885 | InitFromHttpItem(login, crawler.Login); 886 | crawler.CrawItems = []; 887 | for child in etool: 888 | if child.tag == 'Children': 889 | crawitem= factory(spider.CrawItem()); 890 | crawitem.Name=child.attrib['Name']; 891 | crawitem.XPath = child.attrib['XPath']; 892 | crawler.CrawItems.append(crawitem); 893 | 894 | proj.modules[name] = crawler; 895 | elif etool.tag == 'DBConnections': 896 | for tool in etool: 897 | if tool.tag == 'Children': 898 | connector = extends.EObject(); 899 | for att in tool.attrib: 900 | SetAttr(connector, att, tool.attrib[att]); 901 | proj.connectors[connector.Name] = connector; 902 | 903 | print('load project success') 904 | return proj; 905 | 906 | 907 | def generate(tools, generator=None, execute=False, enabledFilter=True): 908 | #print(task_DumpLinq(tools)); 909 | for tool in tools: 910 | if tool.Enabled == False and enabledFilter == True: 911 | continue 912 | tool.init(); 913 | if isinstance(tool,Executor) and execute==False: 914 | continue; 915 | 916 | generator = tool.process(generator) 917 | return generator; 918 | 919 | def parallel_map(task, execute=True): 920 | tools = task.AllETLTools; 921 | index = extends.getindex(tools, lambda d: isinstance(d, ToListTF)); 922 | if index == -1: 923 | index = 0; 924 | tool = tools[index]; 925 | generator = tool.process(None); 926 | else: 927 | generator = generate(tools[:index],None, execute=execute); 928 | return generator; 929 | 930 | def parallel_reduce(task,generator=None, execute=True): 931 | tools = task.AllETLTools; 932 | index = extends.getindex(tools, lambda d: isinstance(d,ToListTF)); 933 | index =0 if index==-1 else index; 934 | generator = generate(tools[index + 1:], 
generator, execute); 935 | return generator; 936 | 937 | 938 | 939 | 940 | 941 | 942 | class ETLTask(extends.EObject): 943 | def __init__(self): 944 | self.AllETLTools = []; 945 | 946 | 947 | 948 | def QueryDatas(self, etlCount=100, execute=False): 949 | return generate((tool for tool in self.AllETLTools[:etlCount]), None, execute); 950 | 951 | def Close(self): 952 | for tool in self.AllETLTools: 953 | if tool.Type in ['DbGE', 'DbEX']: 954 | if tool.connector.TypeName == 'FileManager': 955 | if tool.filetype == 'json': 956 | tool.file.write('{}]'); 957 | tool.file.close(); 958 | 959 | 960 | def mThreadExecute(self, threadcount=10,canexecute=True): 961 | import threadpool 962 | pool = threadpool.ThreadPool(threadcount) 963 | 964 | seed= parallel_map(self,canexecute); 965 | def Funcs(item): 966 | task= parallel_reduce(self,[item],canexecute); 967 | print('totalcount: %d'%len([r for r in task])); 968 | print('finish' + str(item)); 969 | 970 | requests = threadpool.makeRequests(Funcs, seed); 971 | [pool.putRequest(req) for req in requests] 972 | pool.wait() 973 | # self.__close__() 974 | 975 | 976 | -------------------------------------------------------------------------------- /extends.py: -------------------------------------------------------------------------------- 1 | # encoding: UTF-8 2 | import re; 3 | 4 | spacere = re.compile("[ ]{2,}"); 5 | spacern = re.compile("(^\r\n?)|(\r\n?$)") 6 | 7 | 8 | def getkeys(generator): 9 | count=0; 10 | s=set(); 11 | for r in generator: 12 | s=s|r.keys(); 13 | count+=1; 14 | if count>=20: 15 | return list(s); 16 | return list(s) 17 | 18 | def ReplaceLongSpace(txt): 19 | r = spacere.subn(' ', txt)[0] 20 | r = spacern.subn('', r)[0] 21 | return r; 22 | 23 | 24 | def Merge(d1, d2): 25 | for r in d2: 26 | d1[r] = d2[r]; 27 | return d1; 28 | 29 | 30 | def MergeQuery(d1, d2, columns): 31 | if isinstance(columns, str) and columns.strip() != "": 32 | columns = columns.split(' '); 33 | for r in columns: 34 | if r in d2: 35 | d1[r] = d2[r]; 36 | return d1; 37 | 38 | 39 | 40 | 41 | def Query(data, key): 42 | if data is None: 43 | return key; 44 | if isinstance(key, str) and key.startswith('[') and key.endswith(']'): 45 | key = key[1:-1]; 46 | return data[key]; 47 | return key; 48 | 49 | 50 | 51 | 52 | 53 | def findany(iteral, func): 54 | for r in iteral: 55 | if func(r): 56 | return True; 57 | return False; 58 | 59 | 60 | def getindex(iteral, func): 61 | for r in range(len(iteral)): 62 | if func(iteral[r]): 63 | return r; 64 | return -1; 65 | 66 | def Cross(a, genefunc): 67 | 68 | for r1 in a: 69 | for r2 in genefunc(r1): 70 | for key in r2: 71 | r1[key] = r2[key] 72 | yield r1; 73 | 74 | 75 | def MergeAll(a, b): 76 | while True: 77 | t1 = a.__next__() 78 | if t1 is None: 79 | return; 80 | t2 = b.__next__() 81 | if t2 is not None: 82 | for t in t2: 83 | t1[t] = t2[t]; 84 | yield t1; 85 | 86 | 87 | def Append(a, b): 88 | for r in a: 89 | yield r; 90 | for r in b: 91 | yield r; 92 | 93 | def get_type_name(obj): 94 | s=str(obj.__class__); 95 | p=s.find('.'); 96 | r= s[p+1:].split('\'')[0] 97 | return r; 98 | 99 | 100 | class EObject(object): 101 | pass; 102 | 103 | 104 | 105 | def convert_to_builtin_type(obj): 106 | d= { key:value for key,value in obj.__dict__.items() if isinstance(value,(str,int,float,list,dict,tuple,EObject) or value is None)}; 107 | return d 108 | 109 | def dict_to_poco_type(obj): 110 | if isinstance(obj,dict): 111 | result= EObject(); 112 | for key in obj: 113 | v= obj[key] 114 | setattr(result,key,dict_to_poco_type(v)) 115 | return result 116 
| elif isinstance(obj,list): 117 | for i in range(len(obj)): 118 | obj[i]=dict_to_poco_type(obj[i]); 119 | 120 | return obj; 121 | 122 | 123 | def dict_copy_poco(obj,dic): 124 | for key,value in obj.__dict__.items(): 125 | if key in dic: 126 | if isinstance(dic[key], (str,int,float)): 127 | 128 | setattr(obj,key,dic[key]) 129 | -------------------------------------------------------------------------------- /project.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | -------------------------------------------------------------------------------- /sample.py: -------------------------------------------------------------------------------- 1 | import etl; 2 | 3 | import extends 4 | import time; 5 | path='/home/desert.zym/dev' 6 | 7 | proj=etl.Project_LoadXml(path+'/Hawk-Projects/图片抓取/昵图网.xml'); 8 | lagou=proj.modules['昵图网']; 9 | tools= lagou.AllETLTools; 10 | tools[-12].Format="/cloud/usr/desert.zym/picture/昵图网/{1}/{0}.jpg"; 11 | tools[-1].Enabled=False; 12 | tools[-9].Enabled=False; 13 | #for r in lagou.QueryDatas(etlCount=19,execute=False): 14 | # print(r) 15 | # print(r) 16 | from distributed import * 17 | master =Master(proj,"昵图网"); 18 | master.start(); 19 | 20 | 21 | -------------------------------------------------------------------------------- /spider.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import gzip 3 | import re 4 | import socket 5 | import urllib.request 6 | from lxml import etree 7 | from urllib.parse import urlparse,urlunparse 8 | import extends; 9 | import http.cookiejar 10 | from urllib.request import quote 11 | 12 | boxRegex = re.compile(r"\[\d{1,3}\]"); 13 | 14 | 15 | class CrawItem(extends.EObject): 16 | def __init__(self, name=None, sample=None, ismust=False, isHTMLorText=True, xpath=None): 17 | self.XPath = xpath; 18 | self.Sample = sample; 19 | self.Name = name; 20 | self.IsMust = ismust; 21 | self.IsHTMLorText = isHTMLorText; 22 | self.Children = []; 23 | 24 | def __str__(self): 25 | return "%s %s %s" % (self.Name, self.XPath, self.Sample); 26 | 27 | 28 | def RemoveFinalNum(paths): 29 | v = paths[-1]; 30 | m = boxRegex.search(v); 31 | if m is None: 32 | return paths; 33 | s = m.group(0); 34 | paths[-1] = v.replace(s, ""); 35 | return paths; 36 | 37 | 38 | def GetMaxCompareXPath(items): 39 | xpaths = [r.XPath.split('/') for r in items]; 40 | minlen = min(len(r) for r in xpaths); 41 | c = None; 42 | for i in range(minlen): 43 | for index in range(len(xpaths)): 44 | path = xpaths[index]; 45 | if index == 0: 46 | c = path[i]; 47 | elif c != path[i]: 48 | first = path[0:i + 1]; 49 | return '/'.join(RemoveFinalNum(first)); 50 | 51 | 52 | attrsplit=re.compile('@|\['); 53 | 54 | def GetDataFromXPath(node, path): 55 | p = node.xpath(path); 56 | if p is None: 57 | return None; 58 | if len(p) == 0: 59 | return None; 60 | paths = path.split('/'); 61 | last = paths[-1]; 62 | if last.find('@')>=0 and last.find('[1]')>=0: 63 | return p[0]; 64 | return getnodetext(p[0]); 
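# Note on GetDataFromXPath (above): it evaluates the XPath against the node and,
# when the last path step selects an attribute (contains '@') with an explicit '[1]'
# index, returns the raw value of the first match; otherwise it returns the
# concatenated text of the first matched element via getnodetext().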
65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | def GetImage(addr, fname): 74 | u = urllib.urlopen(addr) 75 | data = u.read() 76 | f = open(fname, 'wb') 77 | f.write(data) 78 | f.close() 79 | 80 | 81 | def urlEncodeNonAscii(b): 82 | return re.sub('[\x80-\xFF]', lambda c: '%%%02x' % ord(c.group(0)), b) 83 | 84 | def iriToUri(iri): 85 | parts= urlparse(iri) 86 | 87 | pp= [(parti,part) for parti, part in enumerate(parts)] 88 | res=[]; 89 | for p in pp: 90 | res.append(p[1] if p[0] != 4 else quote(p[1] )) 91 | 92 | return urlunparse(res); 93 | 94 | 95 | 96 | 97 | extract = re.compile('\[(\w+)\]'); 98 | 99 | charset = re.compile(r'content="text/html;.?charset=(.*?)"'); 100 | class HTTPItem(extends.EObject): 101 | def __init__(self): 102 | self.Url = '' 103 | self.Cookie = ''; 104 | self.Headers = None; 105 | self.Timeout = 30; 106 | self.opener = ""; 107 | self.postdata='' 108 | 109 | def PraseURL(self, url): 110 | u = Para2Dict(urlparse(self.Url).query, '&', '='); 111 | for r in extract.findall(url): 112 | url = url.replace('[' + r + ']', u[r]) 113 | return url; 114 | 115 | def GetHTML(self, destUrl=None): 116 | if destUrl is None: 117 | destUrl = self.Url; 118 | destUrl = self.PraseURL(destUrl); 119 | socket.setdefaulttimeout(self.Timeout); 120 | cj = http.cookiejar.CookieJar() 121 | pro = urllib.request.HTTPCookieProcessor(cj) 122 | opener = urllib.request.build_opener(pro) 123 | t = [(r, self.Headers[r]) for r in self.Headers]; 124 | opener.addheaders = t; 125 | binary_data = self.postdata.encode('utf-8') 126 | try: 127 | destUrl.encode('ascii') 128 | except UnicodeEncodeError: 129 | destUrl = iriToUri(destUrl) 130 | 131 | try: 132 | if self.postdata=='': 133 | page=opener.open(destUrl); 134 | else: 135 | page = opener.open(destUrl, binary_data) 136 | html = page.read() 137 | except Exception as e: 138 | print(e); 139 | return "" 140 | 141 | 142 | if page.info().get('Content-Encoding') == 'gzip': 143 | html = gzip.decompress(html) 144 | encoding = charset.search(str(html)) 145 | if encoding is not None: 146 | encoding = encoding.group(1); 147 | if encoding is None: 148 | encoding = 'utf-8' 149 | try: 150 | html=html.decode(encoding) 151 | except UnicodeDecodeError as e: 152 | print(e); 153 | import chardet 154 | encoding= chardet.detect(html) 155 | html=html.decode(encoding); 156 | 157 | return html; 158 | 159 | 160 | # 解压函数 161 | def ungzip(data): 162 | data = gzip.decompress(data) 163 | return data; 164 | 165 | def IsNone(data): 166 | return data is None or data==''; 167 | 168 | def __getnodetext__(node, arrs): 169 | t=node.text; 170 | if t is not None: 171 | s = t.strip(); 172 | if s != '': 173 | arrs.append(s) 174 | for sub in node.iterchildren(): 175 | __getnodetext__(sub,arrs) 176 | 177 | def getnodetext(node): 178 | if node is None: 179 | return "" 180 | arrs=[]; 181 | __getnodetext__(node,arrs); 182 | return ' '.join(arrs); 183 | 184 | 185 | class SmartCrawler(extends.EObject): 186 | def __init__(self): 187 | self.IsMultiData = "List"; 188 | self.HttpItem = None; 189 | self.Name = None; 190 | self.CrawItems = None; 191 | self.Login = ""; 192 | self.haslogin = False; 193 | self.RootXPath='' 194 | 195 | def autologin(self, loginItem): 196 | if loginItem.postdata is None: 197 | return; 198 | import http.cookiejar 199 | cj = http.cookiejar.CookieJar() 200 | pro = urllib.request.HTTPCookieProcessor(cj) 201 | opener = urllib.request.build_opener(pro) 202 | t = [(r, loginItem.Headers[r]) for r in loginItem.Headers]; 203 | opener.addheaders = t; 204 | binary_data = 
loginItem.postdata.encode('utf-8') 205 | op = opener.open(loginItem.Url, binary_data) 206 | data = op.read().decode('utf-8') 207 | print(data) 208 | self.HttpItem.Url = op.url; 209 | return opener; 210 | 211 | def CrawData(self, url): 212 | 213 | if self.Login !="" and self.haslogin == False: 214 | self.HttpItem.opener = self.autologin(self.Login); 215 | self.haslogin = True; 216 | html = self.HttpItem.GetHTML(url); 217 | 218 | root =None if html=='' else etree.HTML(html); 219 | if root is None: 220 | return {} if self.IsMultiData == 'One' else []; 221 | 222 | tree = etree.ElementTree(root); 223 | if isinstance(self.CrawItems, list) and len(self.CrawItems) == 0: 224 | return {'Content': html}; 225 | 226 | return self.GetDataFromCrawItems(tree ); 227 | 228 | def GetDataFromCrawItems(self,tree): 229 | documents = []; 230 | if self.IsMultiData =='One': 231 | document = {}; 232 | for r in self.CrawItems: 233 | data = GetDataFromXPath(tree, r.XPath); 234 | if data is not None: 235 | document[r.Name] = data; 236 | else: 237 | document[r.Name] = ""; 238 | return document; 239 | else: 240 | if not IsNone(self.RootXPath): 241 | rootXPath = self.RootXPath; 242 | else: 243 | rootXPath = GetMaxCompareXPath(self.CrawItems); 244 | nodes = tree.xpath(rootXPath) 245 | if nodes is not None: 246 | for node in nodes: 247 | document = {}; 248 | for r in self.CrawItems: 249 | path=r.XPath; 250 | if IsNone(self.RootXPath): 251 | paths=r.XPath.split('/'); 252 | path='/'.join(paths[len(rootXPath.split('/')):len(paths)]); 253 | else: 254 | path= tree.getpath(node)+ path; 255 | data = GetDataFromXPath(node,path); 256 | if data is not None: 257 | document[r.Name] = data; 258 | if len(document) == 0: 259 | continue; 260 | documents.append(document); 261 | return documents; 262 | 263 | def Para2Dict(para, split1, split2): 264 | r = {}; 265 | for s in para.split(split1): 266 | rs = s.split(split2); 267 | if len(rs) < 2: 268 | continue; 269 | key = rs[0]; 270 | value = s[len(key) + 1:]; 271 | r[rs[0]] = value; 272 | 273 | return r; 274 | 275 | 276 | def GetHTML(url, code=None): 277 | url = url.strip(); 278 | if not url.startswith('http'): 279 | url = 'http://' + url; 280 | print("auto transform %s" % (url)); 281 | socket.setdefaulttimeout(30) 282 | i_headers = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1) Gecko/20090624 Firefox/3.5", 283 | "Accept": "text/plain"} 284 | req = urllib.request.Request(url=url, headers=i_headers) 285 | page = urllib.request.urlopen(req) 286 | html = page.read() 287 | return html; 288 | 289 | 290 | def GetHTMLFromFile(fname): 291 | f = open(fname, 'r', 'utf-8'); 292 | r = f.read(); 293 | return r; 294 | 295 | 296 | def GetCrawNode(craws, name, tree): 297 | for r in craws: 298 | if r.Name == name: 299 | return tree.xpath(r.XPath); 300 | return None; 301 | 302 | 303 | def GetImageFormat(name): 304 | if name is None: 305 | return None, None; 306 | p = name.split('.'); 307 | if len(p) != 2: 308 | return name, 'jpg'; 309 | 310 | back = p[-1]; 311 | if back == "jpg" or back == "png" or back == "gif": # back=="png" ignore because png is so big! 
312 | return p[-2], back; 313 | return None, None; 314 | 315 | 316 | def GetCrawData(crawitems, tree): 317 | doc = {}; 318 | for crawItem in crawitems: 319 | node = tree.xpath(crawItem.XPath); 320 | if len(node) == 0: 321 | if crawItem.IsMust: 322 | return; 323 | if crawItem.IsHTMLorText is False: 324 | text = node[0].text; 325 | else: 326 | text = etree.tostring(node[0]); 327 | doc[crawItem.Name] = text; 328 | return doc; 329 | 330 | 331 | def GetHtmlTree(html): 332 | root = etree.HTML(html); 333 | tree = etree.ElementTree(root); 334 | return tree; 335 | --------------------------------------------------------------------------------
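A short sketch of how the distributed pieces are meant to be driven, based on sample.py and the command-line handling at the end of distributed.py; the project path below is a placeholder and the module name is simply the one sample.py happens to use.

```
# master side (mirrors sample.py): load a project, pick a module, dispatch jobs
import etl
from distributed import Master

proj = etl.Project_LoadXml('project.xml')   # placeholder path; sample.py loads a Hawk project xml
master = Master(proj, '昵图网')              # the module name must exist in proj.modules
master.start()                               # serves job queues on rpc_port (8888)

# worker side, run on each worker machine (per the argv handling in distributed.py):
#   python distributed.py <master-ip> 8888
```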