├── .gitignore ├── LICENSE ├── README.md ├── distributed.py ├── etl.py ├── extends.py ├── project.xml ├── sample.py └── spider.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | .idea -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # etlpy 2 | ##designed by desert 3 | a smart stream-like crawler & etl python library 4 | 5 | ##1. Introduction 6 | etlpy is a configuration-file-driven tool for data collection (crawling) and data cleaning. 7 | 8 | Writing crawler and data-cleaning code by hand is always tedious, so that code should be generated by a tool instead. etlpy was created to solve exactly this problem. 9 | 10 | With a visual, graphical design tool, you can quickly build crawler and data-cleaning workflows, save them as an xml file, and have the etlpy engine parse that file to produce the final data. 11 | 12 | ##2. Usage 13 | Using it is very simple: 14 | ``` 15 | from etl import ETLTool 16 | tool = ETLTool(); 17 | tool.LoadProject('project.xml', '数据清洗ETL-大众点评'); 18 | datas = tool.RefreshDatas(); 19 | for r in datas: 20 | print(r) 21 | ``` 22 | RefreshDatas returns a generator; iterating over it in a for loop reads all of the data automatically. 23 | 24 | ##3. How it works 25 | Modules come in several kinds: generation, filtering, sorting, transformation, and execution. 26 | 27 | Using Python generators, the different modules can be chained into a pipeline; the data (Python dicts) is processed and consumed as it flows along that pipeline. 28 | 29 | The graphical tool is written in C# and uses Linq, a technique similar to Python generators; the original idea comes from Lisp s-expressions.
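To make the pipeline idea concrete, here is a minimal, self-contained sketch in plain Python (these function names and the URL are invented for illustration and are not etlpy's actual classes): every stage takes a generator of dicts and yields a new one, so chaining stages defines the whole flow lazily.

```
# Each stage consumes a generator of dicts and yields dicts onward;
# nothing executes until the final generator is iterated.
def generate_pages(_):
    # generator stage: emit one dict per page number
    for i in range(1, 6):
        yield {'page': i}

def add_url(rows):
    # transformer stage: derive a new column from an existing one
    for r in rows:
        r['url'] = 'http://www.example.com/list?page=%d' % r['page']
        yield r

def keep_even(rows):
    # filter stage: only pass rows that satisfy a predicate
    for r in rows:
        if r['page'] % 2 == 0:
            yield r

pipeline = keep_even(add_url(generate_pages(None)))
for row in pipeline:   # the pipeline runs only when it is consumed
    print(row)
```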
30 | 31 | ##4. Use cases 32 | Crawling, computation, cleaning: any data whose processing fits this computational paradigm can be handled with it. 33 | -------------------------------------------------------------------------------- /distributed.py: -------------------------------------------------------------------------------- 1 | import sys; 2 | from queue import Queue 3 | from multiprocessing.managers import BaseManager 4 | import etl; 5 | import json 6 | import extends; 7 | import time; 8 | authkey= "etlpy".encode('utf-8') 9 | timeout=1; 10 | rpc_port=8888 11 | 12 | class ETLJob: 13 | def __init__(self,project,jobname,config,id): 14 | self.project= project; 15 | self.jobname=jobname; 16 | self.config=config; 17 | self.id= id; 18 | 19 | class JobResult: 20 | def __init__(self,name,count,id): 21 | self.name=name; 22 | self.count=count; 23 | self.id=id; 24 | 25 | class Master: 26 | 27 | def __init__(self,project,jobname): 28 | # queue of jobs dispatched to workers 29 | self.dispatched_job_queue = Queue() 30 | # queue of finished jobs 31 | self.finished_job_queue = Queue() 32 | self.project= project; 33 | self.jobname=jobname; 34 | self.maxprocess= 10; 35 | 36 | def get_dispatched_job_queue(self): 37 | return self.dispatched_job_queue 38 | 39 | def get_finished_job_queue(self): 40 | return self.finished_job_queue 41 | 42 | def start(self,skip=0): 43 | # register the dispatched and finished job queues on the network 44 | BaseManager.register('get_dispatched_job_queue', callable=self.get_dispatched_job_queue) 45 | BaseManager.register('get_finished_job_queue', callable=self.get_finished_job_queue) 46 | 47 | # listen on the port and start the service 48 | manager = BaseManager(address=('0.0.0.0', rpc_port), authkey=authkey) 49 | manager.start() 50 | 51 | # fetch the queues through the methods registered above 52 | dispatched_jobs = manager.get_dispatched_job_queue() 53 | finished_jobs = manager.get_finished_job_queue() 54 | 55 | job_id = 0 56 | module= self.project.modules[self.jobname]; 57 | 58 | proj=json.loads(json.dumps(etl.convert_dict(self.project,self.project.__defaultdict__), ensure_ascii=False)) 59 | while True: 60 | for task in etl.parallel_map(module): 61 | job_id = job_id + 1 62 | if job_id1: 133 | ip=argv[1]; 134 | if len(argv)>2: 135 | port=int(argv[2]); 136 | slave= Slave(); 137 | slave.start(True,ip,port); 138 | 139 | 140 | -------------------------------------------------------------------------------- /etl.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | __author__ = 'zhaoyiming' 3 | import re; 4 | import extends 5 | import urllib 6 | import spider; 7 | import json; 8 | import html 9 | import xml.etree.ElementTree as ET 10 | import csv 11 | 12 | import os; 13 | 14 | intattrs = re.compile('Max|Min|Count|Index|Interval|Position'); 15 | boolre = re.compile('^(One|Can|Is)|Enable|Should|Have|Revert'); 16 | rescript = re.compile('Regex|Number') 17 | 18 | 19 | def SetAttr(etl, key, value): 20 | if key in ['Group','Type']: 21 | return 22 | 23 | if intattrs.search(key) is not None: 24 | try: 25 | t = int(value); 26 | setattr(etl, key, t); 27 | except ValueError: 28 | print('it is a ValueError') 29 | setattr(etl, key, value); 30
| elif boolre.search(key) is not None: 31 | setattr(etl, key, True if value == 'True' else False); 32 | else: 33 | setattr(etl, key, value); 34 | 35 | def getMatchCount(mat): 36 | return mat.lastindex if mat.lastindex is not None else 1; 37 | 38 | class ETLTool(extends.EObject): 39 | def __init__(self): 40 | self.Enabled=True; 41 | self.Column = '' 42 | def process(self, data): 43 | return data 44 | def init(self): 45 | pass; 46 | 47 | class Transformer(ETLTool): 48 | def __init__(self): 49 | super(Transformer, self).__init__() 50 | self.IsMultiYield=False 51 | self.NewColumn=''; 52 | self.OneOutput=True; 53 | self.OneInput = False; 54 | 55 | def transform(self,data): 56 | pass; 57 | def process(self,data): 58 | if self.IsMultiYield: # one to many 59 | for r in data: 60 | for p in self.transform( r): 61 | yield extends.MergeQuery(p, r,self.NewColumn); 62 | return; 63 | for d in data: # one to one 64 | if self.OneOutput: 65 | if self.Column not in d or self.Column not in d: 66 | yield d; 67 | continue; 68 | item = d[self.Column] if self.OneInput else d; 69 | res = self.transform(item) 70 | key= self.NewColumn if self.NewColumn!='' else self.Column; 71 | d[key]=res; 72 | else: 73 | self.transform( d) 74 | yield d; 75 | 76 | class Executor(ETLTool): 77 | def execute(self,data): 78 | pass; 79 | def process(self,data): 80 | for r in data: 81 | self.execute(r); 82 | yield r; 83 | 84 | 85 | class Filter(ETLTool): 86 | def __init__(self): 87 | super(Filter, self).__init__() 88 | self.Revert=False; 89 | def filter(self,data): 90 | 91 | return True; 92 | 93 | def process(self, data): 94 | for r in data: 95 | item = None; 96 | if self.Column in r: 97 | item = r[self.Column]; 98 | if item is None and self.__class__ != NullFT: 99 | continue; 100 | result = self.filter( item) 101 | if result == True and self.Revert == False: 102 | yield r; 103 | elif result == False and self.Revert == True: 104 | yield r; 105 | 106 | class Generator(ETLTool): 107 | def __init__(self): 108 | super(Generator, self).__init__() 109 | self.MergeType='Append' 110 | self.Position=0; 111 | def generate(self,generator): 112 | pass; 113 | 114 | def process(self, generator): 115 | if generator is None: 116 | return self.generate(None); 117 | else: 118 | if self.MergeType=='Append': 119 | return extends.Append(generator,self.process(None)); 120 | elif self.MergeType=='Merge': 121 | return extends.Merge(generator, self.process(None)); 122 | else: 123 | return extends.Cross(generator,self.generate) 124 | 125 | 126 | 127 | class ConnectorBase(ETLTool): 128 | def __init__(self): 129 | super(ConnectorBase, self).__init__() 130 | self.Connector = ''; 131 | self.ExecuteType = 'OnlyInsert' 132 | self.filetype = ''; 133 | 134 | def init(self): 135 | self.connector= self.__proj__.connectors[self.Connector]; 136 | if self.connector.TypeName=='MongoDBConnector': 137 | import pymongo 138 | client = pymongo.MongoClient(self.connector.ConnectString); 139 | db = client[self.connector.DBName]; 140 | self.Table = db[self.TableName]; 141 | else: 142 | path = self.TableName; 143 | filetype = path.split('.')[-1].lower(); 144 | encode = 'utf-8'; 145 | self.file = open(path, type, encoding=encode) 146 | self.filetype = filetype; 147 | 148 | 149 | class DbEX(ConnectorBase): 150 | def __init__(self): 151 | super(DbEX, self).__init__() 152 | self.TableName='' 153 | 154 | 155 | 156 | 157 | def process(self,datas): 158 | if self.connector.TypeName == 'MongoDBConnector': 159 | etype = self.ExecuteType; 160 | table = self.Table; 161 | work = {'OnlyInsert': 
lambda d: table.save(d),'InsertOrUpdate':lambda d: table.save(d)}; 162 | for data in datas: 163 | work[etype](data); 164 | yield data; 165 | else: 166 | 167 | if self.filetype in ['csv', 'txt']: 168 | field = extends.getkeys(datas); 169 | self.writer = csv.DictWriter(self.file, field, delimiter=sp, lineterminator='\n') 170 | self.writer.writeheader() 171 | for data in datas: 172 | self.writer.writerow(data); 173 | yield data; 174 | elif self.filetype == 'json': 175 | self.file.write('[') 176 | for data in datas: 177 | json.dump(data, self.file, ensure_ascii=False) 178 | self.file.write(','); 179 | yield data; 180 | self.file.write(']') 181 | self.file.close(); 182 | 183 | 184 | class DBGE(ConnectorBase): 185 | 186 | def generate(self,data): 187 | if self.Connector=='MongoDBConnector': 188 | for data in self.Table.find(): 189 | yield data; 190 | else: 191 | if self.filetype in ['csv', 'txt']: 192 | sp = ',' if self.filetype == 'csv' else '\t'; 193 | reader = csv.DictReader(self.file, delimiter=sp) 194 | for r in reader: 195 | yield r; 196 | elif self.filetype == 'json': 197 | items = json.load(self.file); 198 | for r in items: 199 | yield r; 200 | 201 | def process(self, generator): 202 | if generator is None: 203 | return self.generate(None); 204 | else: 205 | if self.MergeType == 'Append': 206 | return extends.Append(generator, self.process(None)); 207 | elif self.MergeType == 'Merge': 208 | return extends.Merge(generator, self.process(None)); 209 | else: 210 | return extends.Cross(generator, self.generate) 211 | 212 | 213 | def setValue(data,etl,value): 214 | if etl.NewColumn!='': 215 | data[etl.NewColumn]=value; 216 | else: 217 | data[etl.Column]=value; 218 | 219 | class RegexFT(Filter): 220 | 221 | def init(self): 222 | self.Regex = re.compile(self.Script); 223 | self.Count=1; 224 | 225 | def filter(self,data): 226 | v = self.Regex.findall(data); 227 | if v is None: 228 | return False; 229 | else: 230 | return self.Count <= len(v) 231 | 232 | class RangeFT(Filter): 233 | 234 | def filter(self,item): 235 | f = float(item) 236 | return self.Min <= f <= self.Max; 237 | 238 | class RepeatFT(Filter): 239 | 240 | def init(self): 241 | self.set=set(); 242 | def filter(self,data): 243 | if data in self.set: 244 | return False; 245 | else: 246 | self.set.add(data); 247 | return True; 248 | 249 | class NullFT(Filter): 250 | 251 | def filter(self,data): 252 | if data is None: 253 | return False; 254 | if isinstance(data, str): 255 | return data.strip() != ''; 256 | return True; 257 | 258 | 259 | class AddNewTF(Transformer): 260 | 261 | def transform(self,data): 262 | return self.NewValue; 263 | 264 | 265 | class AutoIndexTF(Transformer): 266 | def init(self): 267 | super(AutoIndexTF, self).__init__() 268 | self.currindex = 0; 269 | def transform(self, data): 270 | self.currindex += 1; 271 | return self.currindex; 272 | 273 | 274 | class RenameTF(Transformer): 275 | 276 | def __init__(self): 277 | super(RenameTF, self).__init__() 278 | self.OneOutput = False; 279 | def transform(self, data): 280 | if not self.Column in data: 281 | return; 282 | item = data[self.Column]; 283 | del data[self.Column]; 284 | if self.NewColumn != "": 285 | data[self.NewColumn] = item; 286 | 287 | class DeleteTF(Transformer): 288 | def __init__(self): 289 | super(DeleteTF, self).__init__() 290 | self.OneOutput = False; 291 | def transform(self, data): 292 | if self.Column in data: 293 | del data[self.Column]; 294 | 295 | class HtmlTF(Transformer): 296 | def __init__(self): 297 | super(HtmlTF, self).__init__() 298 | 
self.OneInput=True; 299 | 300 | def transform(self, data): 301 | return html.escape(data) if self.ConvertType == 'Encode' else html.unescape(data); 302 | 303 | 304 | class UrlTF(Transformer): 305 | def __init__(self): 306 | super(UrlTF, self).__init__() 307 | self.OneInput = True; 308 | def transform(self, data): 309 | if self.ConvertType == 'Encode': 310 | url = data.encode('utf-8'); 311 | return urllib.parse.quote(url); 312 | else: 313 | return urllib.parse.unquote(data); 314 | 315 | 316 | class RegexSplitTF(Transformer): 317 | def transform(self, data): 318 | items = re.split(self.Regex, data) 319 | if len(items) <= self.Index: 320 | return data; 321 | if not self.FromBack: 322 | return items[self.Index]; 323 | else: 324 | index = len(items) - self.Index - 1; 325 | if index < 0: 326 | return data; 327 | else: 328 | return items[index]; 329 | return items[index]; 330 | 331 | class MergeTF(Transformer): 332 | def __init__(self): 333 | super(MergeTF, self).__init__() 334 | self.Format='{0}' 335 | self.MergeWith='' 336 | def transform(self, data): 337 | if self.MergeWith == '': 338 | columns = []; 339 | else: 340 | columns = [str(data[r]) for r in self.MergeWith.split(' ')] 341 | columns.insert(0, data[self.Column] if self.Column in data else ''); 342 | res = self.Format; 343 | for i in range(len(columns)): 344 | res = res.replace('{' + str(i) + '}', str(columns[i])) 345 | return res; 346 | 347 | 348 | 349 | 350 | class RegexTF(Transformer): 351 | def __init__(self): 352 | super(RegexTF, self).__init__() 353 | self.Script = ''; 354 | self.OneInput = True; 355 | 356 | def init(self): 357 | self.Regex = re.compile(self.Script); 358 | def transform(self, data): 359 | item = re.findall(self.Regex, str(data)); 360 | if self.Index < 0: 361 | return ''; 362 | if len(item) <= self.Index: 363 | return ''; 364 | else: 365 | r = item[self.Index]; 366 | return r if isinstance(r, str) else r[0]; 367 | 368 | class ReReplaceTF(RegexTF): 369 | 370 | def transform(self, data): 371 | return re.sub(self.Regex, self.ReplaceText, data); 372 | 373 | class NumberTF(RegexTF): 374 | def __init__(self): 375 | super(NumberTF, self).__init__() 376 | self.Script='' #TODO 377 | 378 | def transform(self, data): 379 | t = super(NumberTF,self).transform( data); 380 | if t is not None and t != '': 381 | return int(t); 382 | return t; 383 | 384 | class SplitTF(Transformer): 385 | def __init__(self): 386 | super(SplitTF, self).__init__() 387 | self.SplitChar=''; 388 | self.OneInput = True; 389 | 390 | 391 | def transform(self, data): 392 | splits = self.SplitChar.split(' '); 393 | sp = splits[0] 394 | if sp == '': 395 | return data; 396 | 397 | r = data.split(splits[0]); 398 | if len(r) > self.Index: 399 | return r[self.Index]; 400 | return ''; 401 | 402 | class TrimTF(Transformer): 403 | def __init__(self): 404 | super(TrimTF, self).__init__() 405 | self.OneInput = True; 406 | 407 | def transform(self, data): 408 | return data.strip(); 409 | 410 | class StrExtractTF(Transformer): 411 | def __init__(self): 412 | super(StrExtractTF, self).__init__() 413 | self.HaveStartEnd=False; 414 | self.Start='' 415 | self.OneInput=True; 416 | self.End='' 417 | 418 | def transform(self, data): 419 | start = data.find(self.Former); 420 | if start == -1: 421 | return 422 | end = data.find(self.End, start); 423 | if end == -1: 424 | return; 425 | if self.HaveStartEnd: 426 | end += len(self.End); 427 | if not self.HaveStartEnd: 428 | start += len(self.Former); 429 | return data[start:end]; 430 | 431 | class PythonTF(Transformer): 432 | def 
__init__(self): 433 | super(PythonTF, self).__init__() 434 | self.OneOutput=False 435 | self.Script='value' 436 | self.ScriptWorkMode='不进行转换' 437 | def transform(self, data): 438 | result = eval(self.Script, {'value': data[self.Column]}, data); 439 | if result is not None and self.IsMultiYield == False: 440 | key = self.NewColumn if self.NewColumn != '' else self.Column; 441 | data[key] = result; 442 | return result; 443 | 444 | class CrawlerTF(Transformer): 445 | def __init__(self): 446 | super(CrawlerTF, self).__init__() 447 | self.CrawlerSelector=''; 448 | self.MaxTryCount=1; 449 | self.IsRegex=False 450 | self.OneOutput=False; 451 | def init(self): 452 | self.IsMultiYield = True; 453 | self.crawler = self.__proj__.modules.get(self.CrawlerSelector, None); 454 | self.buff = {}; 455 | def transform(self, data): 456 | crawler = self.crawler; 457 | url = data[self.Column]; 458 | buff = self.buff; 459 | if url in buff: 460 | datas = buff[url]; 461 | else: 462 | datas = crawler.CrawData(url); 463 | if len(buff) < 100: 464 | buff[url] = datas; 465 | if self.crawler.IsMultiData == 'List': 466 | for d in datas: 467 | res = extends.MergeQuery(d, data, self.NewColumn); 468 | yield res; 469 | else: 470 | data = extends.Merge(data, datas); 471 | yield data; 472 | 473 | 474 | class XPathTF(Transformer): 475 | def __init__(self): 476 | super(XPathTF, self).__init__() 477 | self.XPath='' 478 | self.IsMultiYield = True; 479 | self.OneOutput=False; 480 | 481 | def init(self): 482 | self.IsMultiYield=True; 483 | self.OneOutput = False; 484 | def transform(self, data): 485 | from lxml import etree 486 | if self.IsManyData: 487 | tree = spider.GetHtmlTree(data[self.Column]); 488 | nodes = tree.xpath(self.XPath); 489 | for node in nodes: 490 | ext = {'Text': spider.getnodetext(node), 'HTML': etree.tostring(node).decode('utf-8')}; 491 | ext['OHTML'] = ext['HTML'] 492 | yield extends.MergeQuery(ext, data, self.NewColumn); 493 | else: 494 | tree = spider.GetHtmlTree(data[self.Column]); 495 | nodes = tree.xpath(self.XPath); 496 | node=nodes[0] 497 | if hasattr(node,'text'): 498 | setValue(data, self, node.text); 499 | else: 500 | setValue(data,self,str(node)) 501 | yield data; 502 | 503 | 504 | class ToListTF(Transformer): 505 | def transform(self, data): 506 | yield data; 507 | 508 | class JsonTF(Transformer): 509 | def __init__(self): 510 | super(JsonTF, self).__init__() 511 | self.OneOutput=False 512 | self.ScriptWorkMode='文档列表'; 513 | 514 | def init(self): 515 | self.IsMultiYield= self.ScriptWorkMode=='文档列表'; 516 | 517 | def transform(self, data): 518 | js = json.loads(data[self.Column]); 519 | if isinstance(js, list): 520 | for j in js: 521 | yield j; 522 | else: 523 | yield js; 524 | 525 | class RangeGE(Generator): 526 | def __init__(self): 527 | super(RangeGE, self).__init__() 528 | self.Interval='1' 529 | self.MaxValue='1' 530 | self.MinValue='1' 531 | def generate(self,generator): 532 | interval= int(extends.Query(generator,self.Interval)) 533 | maxvalue= int(extends.Query(generator,self.MaxValue)) 534 | minvalue= int(extends.Query(generator,self.MinValue)) 535 | for i in range(minvalue,maxvalue,interval): 536 | item= {self.Column:round(i,5)} 537 | yield item; 538 | 539 | class RangeTF(Transformer): 540 | def __init__(self): 541 | super(RangeTF, self).__init__() 542 | self.Skip=0; 543 | self.Take=9999999; 544 | def transform(self, data): 545 | skip = int(extends.Query(data, self.Skip)); 546 | take = int(extends.Query(data, self.Take)); 547 | i = 0; 548 | for r in data: 549 | if i < skip: 550 | continue; 
551 | if i >= take: 552 | break; 553 | i += 1; 554 | yield r; 555 | 556 | 557 | class EtlGE(Generator): 558 | def generate(self,data): 559 | subetl = self.__proj__.modules[self.ETLSelector]; 560 | for r in generate(subetl.AllETLTools): 561 | yield r; 562 | 563 | class EtlEX(Executor): 564 | def execute(self,datas): 565 | subetl = self.__proj__.modules[self.ETLSelector]; 566 | for data in datas: 567 | if spider.IsNone(self.NewColumn): 568 | doc = data.copy(); 569 | else: 570 | doc = {}; 571 | extends.MergeQuery(doc, data, self.NewColumn + " " + self.Column); 572 | result=(r for r in generate(subetl.AllETLTools, [doc])) 573 | count=0; 574 | for r in result: 575 | count+=1; 576 | print(r); 577 | print(count) 578 | yield data; 579 | 580 | class EtlTF(Transformer): 581 | def transform(self,datas): 582 | subetl = self.__proj__.modules[self.ETLSelector]; 583 | if self.IsMultiYield: 584 | 585 | for data in datas: 586 | doc = data.copy(); 587 | for r in subetl.__generate__(subetl.AllETLTools, [doc]): 588 | yield extends.MergeQuery(r, data, self.NewColumn); 589 | else: 590 | yield None; # TODO 591 | 592 | 593 | 594 | class TextGE(Generator): 595 | def __init__(self): 596 | super(TextGE, self).__init__() 597 | self.Content=''; 598 | def init(self): 599 | self.arglists= [r.strip() for r in self.Content.split('\n')]; 600 | def generate(self,data): 601 | for i in range(self.Position, len(self.arglists)): 602 | yield {self.Column: self.arglists[i]} 603 | 604 | 605 | 606 | 607 | 608 | 609 | class TableEX(Executor): 610 | def __init__(self): 611 | super(TableEX, self).__init__() 612 | self.Table = 'Table'; 613 | def execute(self,data): 614 | tables= self.__proj__.tables; 615 | tname = self.Table; 616 | if tname not in tables: 617 | tables[tname] = []; 618 | for r in data: 619 | tables[tname].append(r); 620 | yield r; 621 | 622 | 623 | 624 | 625 | 626 | 627 | 628 | class BaiduLocation(Transformer): 629 | pass; 630 | 631 | 632 | class GetIPLocation(Transformer): 633 | pass; 634 | 635 | class GetRoute(Transformer): 636 | pass; 637 | 638 | class NearbySearch(Transformer): 639 | pass; 640 | 641 | class NlpTF(Transformer): 642 | pass; 643 | 644 | class TransTF(Transformer): 645 | pass; 646 | class JoinDBTF(Transformer): 647 | pass; 648 | 649 | class RepeatTF(Transformer): 650 | pass; 651 | class ResponseTF(Transformer): 652 | pass; 653 | 654 | class Time2StrTF(Transformer): 655 | pass; 656 | 657 | 658 | class BfsGE(Generator): 659 | pass; 660 | 661 | class DictTF(Transformer): 662 | pass; 663 | 664 | class FileExistFT(Transformer): 665 | def __init__(self): 666 | super(FileExistFT,self).__init__(); 667 | self.Script = ''; 668 | self.OneInput = True; 669 | def transform(self,data): 670 | import os; 671 | return str(os.path.exists(data)); 672 | 673 | class MergeRepeatTF(Transformer): 674 | pass; 675 | 676 | class NumRangeFT(Filter): 677 | pass; 678 | 679 | class DelayTF(Transformer): 680 | pass; 681 | 682 | class ReadFileTextTF(Transformer): 683 | pass; 684 | 685 | class WriteFileTextTF(Transformer): 686 | pass; 687 | class FolderGE(Generator): 688 | pass; 689 | 690 | class TableGE(Generator): 691 | pass; 692 | class FileDataTF(Transformer): 693 | pass; 694 | 695 | 696 | 697 | class SaveFileEX(Executor): 698 | def __init__(self): 699 | super(SaveFileEX, self).__init__() 700 | self.SavePath=''; 701 | 702 | def execute(self,data): 703 | 704 | save_path = extends.Query(data, self.SavePath); 705 | (folder,file)=os.path.split(save_path); 706 | if not os.path.exists(folder): 707 | os.makedirs(folder); 708 | 
urllib.request.urlretrieve(data[self.Column], save_path) 709 | 710 | 711 | def GetChildNode(roots, name): 712 | for etool in roots: 713 | if etool.get('Name') == name or etool.tag == name: 714 | return etool; 715 | return None; 716 | 717 | 718 | def InitFromHttpItem(config, item): 719 | httprib = config.attrib; 720 | paras = spider.Para2Dict(httprib['Parameters'], '\n', ':'); 721 | item.Headers = paras; 722 | item.Url = httprib['URL']; 723 | post = 'Postdata'; 724 | if post in httprib: 725 | item.postdata = httprib[post]; 726 | else: 727 | item.postdata = None; 728 | 729 | 730 | 731 | 732 | class Project(extends.EObject): 733 | def __init__(self): 734 | self.modules={}; 735 | self.tables={} 736 | self.connectors={}; 737 | self.__defaultdict__={}; 738 | 739 | 740 | def LoadProject_dict(dic): 741 | proj = Project(); 742 | for key,connector in dic['connectors'].items(): 743 | proj.connectors[key]= extends.dict_to_poco_type(connector); 744 | for key,module in dic['modules'].items(): 745 | task =None; 746 | if 'AllETLTools' in module: 747 | task = etl_factory(ETLTask(),proj); 748 | for r in module['AllETLTools']: 749 | etl= etl_factory(r['Type'],proj); 750 | for attr,value in r.items(): 751 | if attr in ['Type']: 752 | continue; 753 | setattr(etl,attr,value); 754 | etl.__proj__=proj; 755 | task.AllETLTools.append(etl) 756 | elif 'CrawItems' in module: 757 | task=etl_factory(spider.SmartCrawler(),proj); 758 | task.CrawItems=[]; 759 | extends.dict_copy_poco(task,module); 760 | for r in module['CrawItems']: 761 | crawlitem= etl_factory(spider.CrawItem(),proj) 762 | extends.dict_copy_poco(crawlitem,r); 763 | task.CrawItems.append(crawlitem) 764 | task.HttpItem= etl_factory(spider.HTTPItem(),proj) 765 | extends.dict_copy_poco(task.HttpItem,module['HttpItem']) 766 | task.HttpItem.Headers=module['HttpItem']["Headers"]; 767 | if task is not None: 768 | proj.modules[key]=task; 769 | 770 | print('load project success') 771 | return proj; 772 | 773 | 774 | def task_DumpLinq(tools): 775 | array=[]; 776 | for t in tools: 777 | typename= extends.get_type_name(t); 778 | newcolumn=getattr(t,'NewColumn',''); 779 | s='%s,%s'%(typename,t.Column); 780 | s+='=>%s,'%newcolumn if newcolumn!='' else ','; 781 | attrs=[]; 782 | defaultdict= t.__proj__.__defaultdict__[typename]; 783 | for att in t.__dict__: 784 | value=t.__dict__[att]; 785 | if att in ['NewColumn','Column','IsMultiYield']: 786 | continue 787 | if not isinstance(value,(str,int,bool,float)): 788 | continue; 789 | if value is None or att not in defaultdict or defaultdict[att]==value: 790 | continue; 791 | attrs.append('%s=%s'%(att,value)); 792 | s+=','.join(attrs) 793 | array.append(s) 794 | return '\n'.join(array); 795 | 796 | def convert_dict(obj,defaultdict): 797 | if not isinstance(obj, (str, int, float, list, dict, tuple, extends.EObject)): 798 | return None 799 | # if isinstance(obj,) 800 | if isinstance(obj, extends.EObject): 801 | d={} 802 | typename = extends.get_type_name(obj); 803 | 804 | for key, value in obj.__dict__.items(): 805 | if typename in defaultdict: 806 | default = defaultdict[typename]; 807 | if value== default.get(key,None): 808 | continue; 809 | if key.startswith('__'): 810 | continue; 811 | 812 | p =convert_dict(value,defaultdict) 813 | if p is not None: 814 | d[key]=p 815 | if isinstance(obj,ETLTool): 816 | d['Type']= typename; 817 | return d; 818 | 819 | elif isinstance(obj, list): 820 | return [convert_dict(r,defaultdict) for r in obj]; 821 | elif isinstance(obj,dict): 822 | return {key: convert_dict(value,defaultdict) for 
key,value in obj.items()} 823 | return obj; 824 | 825 | 826 | 827 | 828 | return d 829 | 830 | def Project_DumpJson(proj): 831 | dic= convert_dict(proj,proj.__defaultdict__) 832 | return json.dumps(dic, ensure_ascii=False, indent=2) 833 | 834 | 835 | def Project_LoadJson(js): 836 | d=json.loads(js); 837 | return LoadProject_dict(d) 838 | 839 | def etl_factory(item,proj): 840 | if isinstance(item,str): 841 | item=eval('%s()'%item); 842 | else: 843 | item=item; 844 | import copy 845 | name = extends.get_type_name(item) 846 | if name not in proj.__defaultdict__: 847 | proj.__defaultdict__[name]=copy.deepcopy( item.__dict__); 848 | return item; 849 | 850 | 851 | def Project_LoadXml(path): 852 | tree = ET.parse(path); 853 | proj=Project(); 854 | def factory(obj): 855 | return etl_factory(obj,proj); 856 | root = tree.getroot(); 857 | root = root.find('Doc'); 858 | for etool in root: 859 | if etool.tag == 'Children': 860 | etype = etool.get('Type'); 861 | name = etool.get('Name'); 862 | if etype == 'SmartETLTool': 863 | etltool = factory(ETLTask()); 864 | for m in etool: 865 | if m.tag == 'Children': 866 | type= m.attrib['Type'] 867 | etl = factory(type); 868 | etl.__proj__=proj 869 | for att in m.attrib: 870 | SetAttr(etl, att, m.attrib[att]); 871 | etltool.AllETLTools.append(etl); 872 | proj.modules[name] = etltool; 873 | elif etype == 'SmartCrawler': 874 | import spider; 875 | crawler =factory(spider.SmartCrawler()); 876 | crawler.HttpItem= factory(spider.HTTPItem()) 877 | crawler.Name = etool.attrib['Name']; 878 | crawler.IsMultiData = etool.attrib['IsMultiData'] 879 | crawler.RootXPath= etool.attrib['RootXPath'] 880 | httpconfig = GetChildNode(etool, 'HttpSet'); 881 | InitFromHttpItem(httpconfig, crawler.HttpItem); 882 | login = GetChildNode(etool, 'Login'); 883 | if login is not None: 884 | crawler.Login = factory(spider.HTTPItem()); 885 | InitFromHttpItem(login, crawler.Login); 886 | crawler.CrawItems = []; 887 | for child in etool: 888 | if child.tag == 'Children': 889 | crawitem= factory(spider.CrawItem()); 890 | crawitem.Name=child.attrib['Name']; 891 | crawitem.XPath = child.attrib['XPath']; 892 | crawler.CrawItems.append(crawitem); 893 | 894 | proj.modules[name] = crawler; 895 | elif etool.tag == 'DBConnections': 896 | for tool in etool: 897 | if tool.tag == 'Children': 898 | connector = extends.EObject(); 899 | for att in tool.attrib: 900 | SetAttr(connector, att, tool.attrib[att]); 901 | proj.connectors[connector.Name] = connector; 902 | 903 | print('load project success') 904 | return proj; 905 | 906 | 907 | def generate(tools, generator=None, execute=False, enabledFilter=True): 908 | #print(task_DumpLinq(tools)); 909 | for tool in tools: 910 | if tool.Enabled == False and enabledFilter == True: 911 | continue 912 | tool.init(); 913 | if isinstance(tool,Executor) and execute==False: 914 | continue; 915 | 916 | generator = tool.process(generator) 917 | return generator; 918 | 919 | def parallel_map(task, execute=True): 920 | tools = task.AllETLTools; 921 | index = extends.getindex(tools, lambda d: isinstance(d, ToListTF)); 922 | if index == -1: 923 | index = 0; 924 | tool = tools[index]; 925 | generator = tool.process(None); 926 | else: 927 | generator = generate(tools[:index],None, execute=execute); 928 | return generator; 929 | 930 | def parallel_reduce(task,generator=None, execute=True): 931 | tools = task.AllETLTools; 932 | index = extends.getindex(tools, lambda d: isinstance(d,ToListTF)); 933 | index =0 if index==-1 else index; 934 | generator = generate(tools[index + 1:], 
generator, execute); 935 | return generator; 936 | 937 | 938 | 939 | 940 | 941 | 942 | class ETLTask(extends.EObject): 943 | def __init__(self): 944 | self.AllETLTools = []; 945 | 946 | 947 | 948 | def QueryDatas(self, etlCount=100, execute=False): 949 | return generate((tool for tool in self.AllETLTools[:etlCount]), None, execute); 950 | 951 | def Close(self): 952 | for tool in self.AllETLTools: 953 | if tool.Type in ['DbGE', 'DbEX']: 954 | if tool.connector.TypeName == 'FileManager': 955 | if tool.filetype == 'json': 956 | tool.file.write('{}]'); 957 | tool.file.close(); 958 | 959 | 960 | def mThreadExecute(self, threadcount=10,canexecute=True): 961 | import threadpool 962 | pool = threadpool.ThreadPool(threadcount) 963 | 964 | seed= parallel_map(self,canexecute); 965 | def Funcs(item): 966 | task= parallel_reduce(self,[item],canexecute); 967 | print('totalcount: %d'%len([r for r in task])); 968 | print('finish' + str(item)); 969 | 970 | requests = threadpool.makeRequests(Funcs, seed); 971 | [pool.putRequest(req) for req in requests] 972 | pool.wait() 973 | # self.__close__() 974 | 975 | 976 | -------------------------------------------------------------------------------- /extends.py: -------------------------------------------------------------------------------- 1 | # encoding: UTF-8 2 | import re; 3 | 4 | spacere = re.compile("[ ]{2,}"); 5 | spacern = re.compile("(^\r\n?)|(\r\n?$)") 6 | 7 | 8 | def getkeys(generator): 9 | count=0; 10 | s=set(); 11 | for r in generator: 12 | s=s|r.keys(); 13 | count+=1; 14 | if count>=20: 15 | return list(s); 16 | return list(s) 17 | 18 | def ReplaceLongSpace(txt): 19 | r = spacere.subn(' ', txt)[0] 20 | r = spacern.subn('', r)[0] 21 | return r; 22 | 23 | 24 | def Merge(d1, d2): 25 | for r in d2: 26 | d1[r] = d2[r]; 27 | return d1; 28 | 29 | 30 | def MergeQuery(d1, d2, columns): 31 | if isinstance(columns, str) and columns.strip() != "": 32 | columns = columns.split(' '); 33 | for r in columns: 34 | if r in d2: 35 | d1[r] = d2[r]; 36 | return d1; 37 | 38 | 39 | 40 | 41 | def Query(data, key): 42 | if data is None: 43 | return key; 44 | if isinstance(key, str) and key.startswith('[') and key.endswith(']'): 45 | key = key[1:-1]; 46 | return data[key]; 47 | return key; 48 | 49 | 50 | 51 | 52 | 53 | def findany(iteral, func): 54 | for r in iteral: 55 | if func(r): 56 | return True; 57 | return False; 58 | 59 | 60 | def getindex(iteral, func): 61 | for r in range(len(iteral)): 62 | if func(iteral[r]): 63 | return r; 64 | return -1; 65 | 66 | def Cross(a, genefunc): 67 | 68 | for r1 in a: 69 | for r2 in genefunc(r1): 70 | for key in r2: 71 | r1[key] = r2[key] 72 | yield r1; 73 | 74 | 75 | def MergeAll(a, b): 76 | while True: 77 | t1 = a.__next__() 78 | if t1 is None: 79 | return; 80 | t2 = b.__next__() 81 | if t2 is not None: 82 | for t in t2: 83 | t1[t] = t2[t]; 84 | yield t1; 85 | 86 | 87 | def Append(a, b): 88 | for r in a: 89 | yield r; 90 | for r in b: 91 | yield r; 92 | 93 | def get_type_name(obj): 94 | s=str(obj.__class__); 95 | p=s.find('.'); 96 | r= s[p+1:].split('\'')[0] 97 | return r; 98 | 99 | 100 | class EObject(object): 101 | pass; 102 | 103 | 104 | 105 | def convert_to_builtin_type(obj): 106 | d= { key:value for key,value in obj.__dict__.items() if isinstance(value,(str,int,float,list,dict,tuple,EObject) or value is None)}; 107 | return d 108 | 109 | def dict_to_poco_type(obj): 110 | if isinstance(obj,dict): 111 | result= EObject(); 112 | for key in obj: 113 | v= obj[key] 114 | setattr(result,key,dict_to_poco_type(v)) 115 | return result 116 
| elif isinstance(obj,list): 117 | for i in range(len(obj)): 118 | obj[i]=dict_to_poco_type(obj[i]); 119 | 120 | return obj; 121 | 122 | 123 | def dict_copy_poco(obj,dic): 124 | for key,value in obj.__dict__.items(): 125 | if key in dic: 126 | if isinstance(dic[key], (str,int,float)): 127 | 128 | setattr(obj,key,dic[key]) 129 | -------------------------------------------------------------------------------- /project.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | -------------------------------------------------------------------------------- /sample.py: -------------------------------------------------------------------------------- 1 | import etl; 2 | 3 | import extends 4 | import time; 5 | path='/home/desert.zym/dev' 6 | 7 | proj=etl.Project_LoadXml(path+'/Hawk-Projects/图片抓取/昵图网.xml'); 8 | lagou=proj.modules['昵图网']; 9 | tools= lagou.AllETLTools; 10 | tools[-12].Format="/cloud/usr/desert.zym/picture/昵图网/{1}/{0}.jpg"; 11 | tools[-1].Enabled=False; 12 | tools[-9].Enabled=False; 13 | #for r in lagou.QueryDatas(etlCount=19,execute=False): 14 | # print(r) 15 | # print(r) 16 | from distributed import * 17 | master =Master(proj,"昵图网"); 18 | master.start(); 19 | 20 | 21 | -------------------------------------------------------------------------------- /spider.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import gzip 3 | import re 4 | import socket 5 | import urllib.request 6 | from lxml import etree 7 | from urllib.parse import urlparse,urlunparse 8 | import extends; 9 | import http.cookiejar 10 | from urllib.request import quote 11 | 12 | boxRegex = re.compile(r"\[\d{1,3}\]"); 13 | 14 | 15 | class CrawItem(extends.EObject): 16 | def __init__(self, name=None, sample=None, ismust=False, isHTMLorText=True, xpath=None): 17 | self.XPath = xpath; 18 | self.Sample = sample; 19 | self.Name = name; 20 | self.IsMust = ismust; 21 | self.IsHTMLorText = isHTMLorText; 22 | self.Children = []; 23 | 24 | def __str__(self): 25 | return "%s %s %s" % (self.Name, self.XPath, self.Sample); 26 | 27 | 28 | def RemoveFinalNum(paths): 29 | v = paths[-1]; 30 | m = boxRegex.search(v); 31 | if m is None: 32 | return paths; 33 | s = m.group(0); 34 | paths[-1] = v.replace(s, ""); 35 | return paths; 36 | 37 | 38 | def GetMaxCompareXPath(items): 39 | xpaths = [r.XPath.split('/') for r in items]; 40 | minlen = min(len(r) for r in xpaths); 41 | c = None; 42 | for i in range(minlen): 43 | for index in range(len(xpaths)): 44 | path = xpaths[index]; 45 | if index == 0: 46 | c = path[i]; 47 | elif c != path[i]: 48 | first = path[0:i + 1]; 49 | return '/'.join(RemoveFinalNum(first)); 50 | 51 | 52 | attrsplit=re.compile('@|\['); 53 | 54 | def GetDataFromXPath(node, path): 55 | p = node.xpath(path); 56 | if p is None: 57 | return None; 58 | if len(p) == 0: 59 | return None; 60 | paths = path.split('/'); 61 | last = paths[-1]; 62 | if last.find('@')>=0 and last.find('[1]')>=0: 63 | return p[0]; 64 | return getnodetext(p[0]); 
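# Note on GetDataFromXPath (above): it evaluates the XPath against the node and,
# when the last path step selects an attribute (contains '@') with an explicit '[1]'
# index, returns the raw value of the first match; otherwise it returns the
# concatenated text of the first matched element via getnodetext().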
65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | def GetImage(addr, fname): 74 | u = urllib.urlopen(addr) 75 | data = u.read() 76 | f = open(fname, 'wb') 77 | f.write(data) 78 | f.close() 79 | 80 | 81 | def urlEncodeNonAscii(b): 82 | return re.sub('[\x80-\xFF]', lambda c: '%%%02x' % ord(c.group(0)), b) 83 | 84 | def iriToUri(iri): 85 | parts= urlparse(iri) 86 | 87 | pp= [(parti,part) for parti, part in enumerate(parts)] 88 | res=[]; 89 | for p in pp: 90 | res.append(p[1] if p[0] != 4 else quote(p[1] )) 91 | 92 | return urlunparse(res); 93 | 94 | 95 | 96 | 97 | extract = re.compile('\[(\w+)\]'); 98 | 99 | charset = re.compile(r'content="text/html;.?charset=(.*?)"'); 100 | class HTTPItem(extends.EObject): 101 | def __init__(self): 102 | self.Url = '' 103 | self.Cookie = ''; 104 | self.Headers = None; 105 | self.Timeout = 30; 106 | self.opener = ""; 107 | self.postdata='' 108 | 109 | def PraseURL(self, url): 110 | u = Para2Dict(urlparse(self.Url).query, '&', '='); 111 | for r in extract.findall(url): 112 | url = url.replace('[' + r + ']', u[r]) 113 | return url; 114 | 115 | def GetHTML(self, destUrl=None): 116 | if destUrl is None: 117 | destUrl = self.Url; 118 | destUrl = self.PraseURL(destUrl); 119 | socket.setdefaulttimeout(self.Timeout); 120 | cj = http.cookiejar.CookieJar() 121 | pro = urllib.request.HTTPCookieProcessor(cj) 122 | opener = urllib.request.build_opener(pro) 123 | t = [(r, self.Headers[r]) for r in self.Headers]; 124 | opener.addheaders = t; 125 | binary_data = self.postdata.encode('utf-8') 126 | try: 127 | destUrl.encode('ascii') 128 | except UnicodeEncodeError: 129 | destUrl = iriToUri(destUrl) 130 | 131 | try: 132 | if self.postdata=='': 133 | page=opener.open(destUrl); 134 | else: 135 | page = opener.open(destUrl, binary_data) 136 | html = page.read() 137 | except Exception as e: 138 | print(e); 139 | return "" 140 | 141 | 142 | if page.info().get('Content-Encoding') == 'gzip': 143 | html = gzip.decompress(html) 144 | encoding = charset.search(str(html)) 145 | if encoding is not None: 146 | encoding = encoding.group(1); 147 | if encoding is None: 148 | encoding = 'utf-8' 149 | try: 150 | html=html.decode(encoding) 151 | except UnicodeDecodeError as e: 152 | print(e); 153 | import chardet 154 | encoding= chardet.detect(html) 155 | html=html.decode(encoding); 156 | 157 | return html; 158 | 159 | 160 | # 解压函数 161 | def ungzip(data): 162 | data = gzip.decompress(data) 163 | return data; 164 | 165 | def IsNone(data): 166 | return data is None or data==''; 167 | 168 | def __getnodetext__(node, arrs): 169 | t=node.text; 170 | if t is not None: 171 | s = t.strip(); 172 | if s != '': 173 | arrs.append(s) 174 | for sub in node.iterchildren(): 175 | __getnodetext__(sub,arrs) 176 | 177 | def getnodetext(node): 178 | if node is None: 179 | return "" 180 | arrs=[]; 181 | __getnodetext__(node,arrs); 182 | return ' '.join(arrs); 183 | 184 | 185 | class SmartCrawler(extends.EObject): 186 | def __init__(self): 187 | self.IsMultiData = "List"; 188 | self.HttpItem = None; 189 | self.Name = None; 190 | self.CrawItems = None; 191 | self.Login = ""; 192 | self.haslogin = False; 193 | self.RootXPath='' 194 | 195 | def autologin(self, loginItem): 196 | if loginItem.postdata is None: 197 | return; 198 | import http.cookiejar 199 | cj = http.cookiejar.CookieJar() 200 | pro = urllib.request.HTTPCookieProcessor(cj) 201 | opener = urllib.request.build_opener(pro) 202 | t = [(r, loginItem.Headers[r]) for r in loginItem.Headers]; 203 | opener.addheaders = t; 204 | binary_data = 
loginItem.postdata.encode('utf-8') 205 | op = opener.open(loginItem.Url, binary_data) 206 | data = op.read().decode('utf-8') 207 | print(data) 208 | self.HttpItem.Url = op.url; 209 | return opener; 210 | 211 | def CrawData(self, url): 212 | 213 | if self.Login !="" and self.haslogin == False: 214 | self.HttpItem.opener = self.autologin(self.Login); 215 | self.haslogin = True; 216 | html = self.HttpItem.GetHTML(url); 217 | 218 | root =None if html=='' else etree.HTML(html); 219 | if root is None: 220 | return {} if self.IsMultiData == 'One' else []; 221 | 222 | tree = etree.ElementTree(root); 223 | if isinstance(self.CrawItems, list) and len(self.CrawItems) == 0: 224 | return {'Content': html}; 225 | 226 | return self.GetDataFromCrawItems(tree ); 227 | 228 | def GetDataFromCrawItems(self,tree): 229 | documents = []; 230 | if self.IsMultiData =='One': 231 | document = {}; 232 | for r in self.CrawItems: 233 | data = GetDataFromXPath(tree, r.XPath); 234 | if data is not None: 235 | document[r.Name] = data; 236 | else: 237 | document[r.Name] = ""; 238 | return document; 239 | else: 240 | if not IsNone(self.RootXPath): 241 | rootXPath = self.RootXPath; 242 | else: 243 | rootXPath = GetMaxCompareXPath(self.CrawItems); 244 | nodes = tree.xpath(rootXPath) 245 | if nodes is not None: 246 | for node in nodes: 247 | document = {}; 248 | for r in self.CrawItems: 249 | path=r.XPath; 250 | if IsNone(self.RootXPath): 251 | paths=r.XPath.split('/'); 252 | path='/'.join(paths[len(rootXPath.split('/')):len(paths)]); 253 | else: 254 | path= tree.getpath(node)+ path; 255 | data = GetDataFromXPath(node,path); 256 | if data is not None: 257 | document[r.Name] = data; 258 | if len(document) == 0: 259 | continue; 260 | documents.append(document); 261 | return documents; 262 | 263 | def Para2Dict(para, split1, split2): 264 | r = {}; 265 | for s in para.split(split1): 266 | rs = s.split(split2); 267 | if len(rs) < 2: 268 | continue; 269 | key = rs[0]; 270 | value = s[len(key) + 1:]; 271 | r[rs[0]] = value; 272 | 273 | return r; 274 | 275 | 276 | def GetHTML(url, code=None): 277 | url = url.strip(); 278 | if not url.startswith('http'): 279 | url = 'http://' + url; 280 | print("auto transform %s" % (url)); 281 | socket.setdefaulttimeout(30) 282 | i_headers = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1) Gecko/20090624 Firefox/3.5", 283 | "Accept": "text/plain"} 284 | req = urllib.request.Request(url=url, headers=i_headers) 285 | page = urllib.request.urlopen(req) 286 | html = page.read() 287 | return html; 288 | 289 | 290 | def GetHTMLFromFile(fname): 291 | f = open(fname, 'r', 'utf-8'); 292 | r = f.read(); 293 | return r; 294 | 295 | 296 | def GetCrawNode(craws, name, tree): 297 | for r in craws: 298 | if r.Name == name: 299 | return tree.xpath(r.XPath); 300 | return None; 301 | 302 | 303 | def GetImageFormat(name): 304 | if name is None: 305 | return None, None; 306 | p = name.split('.'); 307 | if len(p) != 2: 308 | return name, 'jpg'; 309 | 310 | back = p[-1]; 311 | if back == "jpg" or back == "png" or back == "gif": # back=="png" ignore because png is so big! 
312 | return p[-2], back; 313 | return None, None; 314 | 315 | 316 | def GetCrawData(crawitems, tree): 317 | doc = {}; 318 | for crawItem in crawitems: 319 | node = tree.xpath(crawItem.XPath); 320 | if len(node) == 0: 321 | if crawItem.IsMust: 322 | return; 323 | if crawItem.IsHTMLorText is False: 324 | text = node[0].text; 325 | else: 326 | text = etree.tostring(node[0]); 327 | doc[crawItem.Name] = text; 328 | return doc; 329 | 330 | 331 | def GetHtmlTree(html): 332 | root = etree.HTML(html); 333 | tree = etree.ElementTree(root); 334 | return tree; 335 | --------------------------------------------------------------------------------
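A short sketch of how the distributed pieces are meant to be driven, based on sample.py and the command-line handling at the end of distributed.py; the project path below is a placeholder and the module name is simply the one sample.py happens to use.

```
# master side (mirrors sample.py): load a project, pick a module, dispatch jobs
import etl
from distributed import Master

proj = etl.Project_LoadXml('project.xml')   # placeholder path; sample.py loads a Hawk project xml
master = Master(proj, '昵图网')              # the module name must exist in proj.modules
master.start()                               # serves job queues on rpc_port (8888)

# worker side, run on each worker machine (per the argv handling in distributed.py):
#   python distributed.py <master-ip> 8888
```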