├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── feapder_pipelines ├── VERSION ├── __init__.py ├── db │ ├── __init__.py │ └── pgsqldb.py ├── pipelines │ ├── __init__.py │ └── pgsql_pipeline.py ├── requirements.txt └── utils │ ├── __init__.py │ └── pgsql_tool.py ├── setup.py └── tests └── pgsql ├── test_pgsqldb.py ├── test_postgresql_spider.py └── test_postgresql_updateitem_spider.py /.gitignore: -------------------------------------------------------------------------------- 1 | files/* 2 | .DS_Store 3 | .idea/* 4 | */.idea/* 5 | venv/* 6 | venv2/* 7 | *.pyc 8 | *test.py 9 | *.log 10 | **/proxy_file 11 | build/ 12 | dist/ 13 | *.egg-info/ 14 | .vscode/ 15 | media/ 16 | .MWebMetaData/ 17 | push.sh -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Modifications: 4 | 5 | Copyright (c) 2020 Boris 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE 3 | 4 | include feapder_pipelines/requirements.txt 5 | include feapder_pipelines/VERSION 6 | 7 | recursive-include tests * 8 | 9 | global-exclude __pycache__ *.py[cod] -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # FEAPDER 管道扩展 2 | 3 | ![](https://img.shields.io/badge/python-3-brightgreen) 4 | ![](https://img.shields.io/github/watchers/Boris-code/feapder_pipelines?style=social) 5 | ![](https://img.shields.io/github/stars/Boris-code/feapder_pipelines?style=social) 6 | ![](https://img.shields.io/github/forks/Boris-code/feapder_pipelines?style=social) 7 | 8 | ## 简介 9 | 10 | 此模块为`feapder`的`pipelines`扩展,感谢广大开发者对`feapder`的贡献 11 | 12 | 随着feapder支持的pipelines越来越多,为减少feapder的体积,特将pipelines提出,使用者可按需安装 13 | 14 | ## 管道 15 | 16 | ### PostgreSQL 17 | 18 | > 贡献者:沈瑞祥 19 | > 20 | > 联系方式:ruixiang.shen@outlook.com 21 | 22 | 23 | #### 安装 24 | 25 | ``` 26 | pip install feapder_pipelines[pgsql] 27 | ``` 28 | 29 | #### 使用 30 | 31 | 在`feapder`项目的`setting.py`中使用如下配置 32 | 33 | ```python 34 | # PostgreSQL 35 | PGSQL_IP = 36 | PGSQL_PORT = 37 | PGSQL_DB = 38 | PGSQL_USER_NAME = 39 | PGSQL_USER_PASS = 40 | 41 | ITEM_PIPELINES = [ 42 | "feapder_pipelines.pipelines.pgsql_pipeline.PgsqlPipeline" 43 | ] 44 | ``` 45 | 46 | 47 |
48 | 细节 49 | 注:入库时 ON CONFLICT(key) 默认为id或通过如下sql查出来的第一个值 50 | 51 | ```sql 52 | select column_names from( 53 | select 54 | t.relname as table_name, 55 | i.relname as index_name, 56 | array_to_string(array_agg(a.attname), ', ') as column_names 57 | from 58 | pg_class t, 59 | pg_class i, 60 | pg_index ix, 61 | pg_attribute a 62 | where 63 | t.oid = ix.indrelid 64 | and i.oid = ix.indexrelid 65 | and a.attrelid = t.oid 66 | and a.attnum = ANY(ix.indkey) 67 | and t.relkind = 'r' 68 | and t.relname like '%' 69 | group by 70 | t.relname, 71 | i.relname 72 | order by 73 | t.relname, 74 | i.relname) as res 75 | where table_name = 'table_name'; 76 | ``` 77 |
78 | 79 | 80 | -------------------------------------------------------------------------------- /feapder_pipelines/VERSION: -------------------------------------------------------------------------------- 1 | 1.0.3 2 | -------------------------------------------------------------------------------- /feapder_pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2021/12/22 2:24 下午 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | @email: boris_liu@foxmail.com 9 | """ 10 | -------------------------------------------------------------------------------- /feapder_pipelines/db/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2021/12/22 2:24 下午 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | @email: boris_liu@foxmail.com 9 | """ 10 | -------------------------------------------------------------------------------- /feapder_pipelines/db/pgsqldb.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2021-12-04 14:42 4 | --------- 5 | @summary: 操作pgsql数据库 6 | --------- 7 | @author: 沈瑞祥 8 | @email: ruixiang.shen@outlook.com 9 | """ 10 | 11 | from typing import List, Dict 12 | from urllib import parse 13 | 14 | import psycopg2 15 | from dbutils.pooled_db import PooledDB 16 | 17 | import feapder.setting as setting 18 | from feapder.db.mysqldb import MysqlDB 19 | from feapder.utils.log import log 20 | from feapder_pipelines.utils.pgsql_tool import ( 21 | make_insert_sql, 22 | make_update_sql, 23 | make_batch_sql, 24 | ) 25 | 26 | 27 | def auto_retry(func): 28 | def wapper(*args, **kwargs): 29 | for i in range(3): 30 | try: 31 | return func(*args, **kwargs) 32 | except (psycopg2.InterfaceError, psycopg2.OperationalError) as e: 33 | log.error( 34 | """ 35 | error:%s 36 | sql: %s 37 | """ 38 | % (e, kwargs.get("sql") or args[1]) 39 | ) 40 | 41 | return wapper 42 | 43 | 44 | class PgsqlDB(MysqlDB): 45 | def __init__( 46 | self, ip=None, port=None, db=None, user_name=None, user_pass=None, **kwargs 47 | ): 48 | # 可能会改setting中的值,所以此处不能直接赋值为默认值,需要后加载赋值 49 | if not ip: 50 | ip = setting.PGSQL_IP 51 | if not port: 52 | port = setting.PGSQL_PORT 53 | if not db: 54 | db = setting.PGSQL_DB 55 | if not user_name: 56 | user_name = setting.PGSQL_USER_NAME 57 | if not user_pass: 58 | user_pass = setting.PGSQL_USER_PASS 59 | 60 | try: 61 | self.connect_pool = PooledDB( 62 | creator=psycopg2, 63 | mincached=1, 64 | maxcached=100, 65 | maxconnections=100, 66 | blocking=True, 67 | ping=7, 68 | host=ip, 69 | port=port, 70 | user=user_name, 71 | password=user_pass, 72 | database=db, 73 | ) 74 | 75 | except Exception as e: 76 | log.error( 77 | """ 78 | 连接失败: 79 | ip: {} 80 | port: {} 81 | db: {} 82 | user_name: {} 83 | user_pass: {} 84 | exception: {} 85 | """.format( 86 | ip, port, db, user_name, user_pass, e 87 | ) 88 | ) 89 | else: 90 | log.debug("连接到postgresql数据库 %s : %s" % (ip, db)) 91 | 92 | @classmethod 93 | def from_url(cls, url, **kwargs): 94 | # postgresql://user_name:user_passwd@ip:port/db?charset=utf8 95 | url_parsed = parse.urlparse(url) 96 | 97 | db_type = url_parsed.scheme.strip() 98 | if db_type != "postgresql": 99 | raise Exception( 100 | "url error, expect postgresql://username:ip:port/db?charset=utf8, but get {}".format( 101 | url 102 | ) 103 | ) 104 | 105 | connect_params = { 106 | "ip": url_parsed.hostname.strip(), 107 | "port": url_parsed.port, 108 | "user_name": url_parsed.username.strip(), 109 | "user_pass": url_parsed.password.strip(), 110 | "db": url_parsed.path.strip("/").strip(), 111 | } 112 | 113 | connect_params.update(kwargs) 114 | 115 | return cls(**connect_params) 116 | 117 | def add_smart(self, table, data: Dict, **kwargs): 118 | """ 119 | 添加数据, 直接传递json格式的数据,不用拼sql 120 | Args: 121 | table: 表名 122 | data: 字典 {"xxx":"xxx"} 123 | **kwargs: 124 | 125 | Returns: 添加行数 126 | 127 | """ 128 | sql = make_insert_sql(table, data, **kwargs) 129 | return self.add(sql) 130 | 131 | def add_batch(self, sql, datas: List[Dict]): 132 | """ 133 | @summary: 批量添加数据 134 | --------- 135 | @ param sql: insert into (xxx,xxx) values (%s, %s, %s) 136 | # param datas: 列表 [{}, {}, {}] 137 | --------- 138 | @result: 添加行数 139 | """ 140 | try: 141 | conn, cursor = self.get_connection() 142 | cursor.executemany(sql, datas) 143 | affect_count = cursor.rowcount 144 | conn.commit() 145 | 146 | except Exception as e: 147 | log.error( 148 | """ 149 | error:%s 150 | sql: %s 151 | """ 152 | % (e, sql) 153 | ) 154 | affect_count = None 155 | finally: 156 | self.close_connection(conn, cursor) 157 | 158 | return affect_count 159 | 160 | def add_batch_smart(self, table, datas: List[Dict], **kwargs): 161 | """ 162 | 批量添加数据, 直接传递list格式的数据,不用拼sql 163 | Args: 164 | table: 表名 165 | datas: 列表 [{}, {}, {}] 166 | **kwargs: 167 | 168 | Returns: 添加行数 169 | 170 | """ 171 | sql, datas = make_batch_sql(table, datas, **kwargs) 172 | return self.add_batch(sql, datas) 173 | 174 | def update_smart(self, table, data: Dict, condition): 175 | """ 176 | 更新, 不用拼sql 177 | Args: 178 | table: 表名 179 | data: 数据 {"xxx":"xxx"} 180 | condition: 更新条件 where后面的条件,如 condition='status=1' 181 | 182 | Returns: True / False 183 | 184 | """ 185 | sql = make_update_sql(table, data, condition) 186 | return self.update(sql) 187 | -------------------------------------------------------------------------------- /feapder_pipelines/pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2021/12/22 2:24 下午 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | @email: boris_liu@foxmail.com 9 | """ 10 | -------------------------------------------------------------------------------- /feapder_pipelines/pipelines/pgsql_pipeline.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2021-12-04 14:42 4 | --------- 5 | @summary: pgsql pipeline 6 | --------- 7 | @author: 沈瑞祥 8 | @email: ruixiang.shen@outlook.com 9 | """ 10 | 11 | from typing import Dict, List, Tuple 12 | 13 | import feapder_pipelines.utils.pgsql_tool as tools 14 | from feapder_pipelines.db.pgsqldb import PgsqlDB 15 | from feapder.pipelines import BasePipeline 16 | from feapder.utils.log import log 17 | 18 | 19 | class PgsqlPipeline(BasePipeline): 20 | def __init__(self): 21 | self._to_db = None 22 | self._indexes_cols_cached = {} 23 | 24 | @property 25 | def to_db(self): 26 | if not self._to_db: 27 | self._to_db = PgsqlDB() 28 | 29 | return self._to_db 30 | 31 | def __get_indexes_cols(self, table): 32 | if table not in self._indexes_cols_cached: 33 | get_indexes_sql = tools.get_primaryKey_col_sql(table) 34 | indexes_cols = self.to_db.find(sql=get_indexes_sql) or "id" 35 | log.info(f"主键列名:{indexes_cols[0][0]}") 36 | if indexes_cols: 37 | indexes_cols = indexes_cols[0][0] 38 | else: 39 | log.error(f"无法找到主键列名") 40 | raise Exception("请确保数据库有主键") 41 | self._indexes_cols_cached[table] = indexes_cols 42 | 43 | return self._indexes_cols_cached[table] 44 | 45 | def save_items(self, table, items: List[Dict]) -> bool: 46 | """ 47 | 保存数据 48 | Args: 49 | table: 表名 50 | items: 数据,[{},{},...] 51 | 52 | Returns: 是否保存成功 True / False 53 | 若False,不会将本批数据入到去重库,以便再次入库 54 | 55 | """ 56 | sql, datas = tools.make_batch_sql( 57 | table, items, indexes_cols=self.__get_indexes_cols(table) 58 | ) 59 | add_count = self.to_db.add_batch(sql, datas) 60 | # log.info(sql) 61 | datas_size = len(datas) 62 | if add_count is not None: 63 | log.info( 64 | "共导出 %s 条数据 到 %s, 重复 %s 条" % (datas_size, table, datas_size - add_count) 65 | ) 66 | 67 | return add_count is not None 68 | 69 | def update_items(self, table, items: List[Dict], update_keys=Tuple) -> bool: 70 | """ 71 | 更新数据 72 | Args: 73 | table: 表名 74 | items: 数据,[{},{},...] 75 | update_keys: 更新的字段, 如 ("title", "publish_time") 76 | 77 | Returns: 是否更新成功 True / False 78 | 若False,不会将本批数据入到去重库,以便再次入库 79 | 80 | """ 81 | sql, datas = tools.make_batch_sql( 82 | table, 83 | items, 84 | update_columns=update_keys or list(items[0].keys()), 85 | indexes_cols=self.__get_indexes_cols(table), 86 | ) 87 | # log.info(sql) 88 | update_count = self.to_db.add_batch(sql, datas) 89 | if update_count: 90 | msg = "共更新 %s 条数据 到 %s" % (update_count, table) 91 | if update_keys: 92 | msg += " 更新字段为 {}".format(update_keys) 93 | log.info(msg) 94 | 95 | return update_count is not None 96 | -------------------------------------------------------------------------------- /feapder_pipelines/requirements.txt: -------------------------------------------------------------------------------- 1 | feapder 2 | psycopg2-binary>=2.9.2 -------------------------------------------------------------------------------- /feapder_pipelines/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2021/12/22 2:24 下午 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | @email: boris_liu@foxmail.com 9 | """ 10 | -------------------------------------------------------------------------------- /feapder_pipelines/utils/pgsql_tool.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2021-12-04 14:42 4 | --------- 5 | @summary: 操作pgsql数据库 6 | --------- 7 | @author: 沈瑞祥 8 | @email: ruixiang.shen@outlook.com 9 | """ 10 | 11 | from feapder.utils.tools import list2str, format_sql_value 12 | 13 | 14 | # PostgreSQL数据库相关 15 | def get_indexes_col_sql(table): 16 | """ 17 | @summary: 适用于PostgreSQL 18 | --------- 19 | @param table: 20 | 21 | --------- 22 | @result: 23 | """ 24 | sql = """ 25 | select column_names from( 26 | select 27 | t.relname as table_name, 28 | i.relname as index_name, 29 | array_to_string(array_agg(a.attname), ', ') as column_names 30 | from 31 | pg_class t, 32 | pg_class i, 33 | pg_index ix, 34 | pg_attribute a 35 | where 36 | t.oid = ix.indrelid 37 | and i.oid = ix.indexrelid 38 | and a.attrelid = t.oid 39 | and a.attnum = ANY(ix.indkey) 40 | and t.relkind = 'r' 41 | and t.relname like '%' 42 | group by 43 | t.relname, 44 | i.relname 45 | order by 46 | t.relname, 47 | i.relname) as res 48 | where table_name = '{table}'; 49 | """ 50 | sql = sql.format(table=table).replace("None", "null") 51 | return sql 52 | 53 | 54 | def get_primaryKey_col_sql(table): 55 | """ 56 | @summary: 适用于PostgreSQL 57 | --------- 58 | @param table: 59 | 60 | --------- 61 | @result: 62 | """ 63 | sql = """ 64 | SELECT 65 | string_agg(DISTINCT t3.attname,',') AS primaryKeyColumn 66 | ,t4.tablename AS tableName 67 | , string_agg(cast(obj_description(relfilenode,'pg_class') as varchar),'') as comment 68 | FROM 69 | pg_constraint t1 70 | INNER JOIN pg_class t2 ON t1.conrelid = t2.oid 71 | INNER JOIN pg_attribute t3 ON t3.attrelid = t2.oid AND array_position(t1.conkey,t3.attnum) is not null 72 | INNER JOIN pg_tables t4 on t4.tablename = t2.relname 73 | INNER JOIN pg_index t5 ON t5.indrelid = t2.oid AND t3.attnum = ANY (t5.indkey) 74 | LEFT JOIN pg_description t6 on t6.objoid=t3.attrelid and t6.objsubid=t3.attnum 75 | WHERE t1.contype = 'p' 76 | AND length(t3.attname) > 0 77 | AND t2.oid = '{table}' :: regclass 78 | group by t4.tablename; 79 | """ 80 | sql = sql.format(table=table).replace("None", "null") 81 | return sql 82 | 83 | 84 | def get_constraint_name_sql(table): 85 | """ 86 | @summary: 适用于PostgreSQL 87 | --------- 88 | @param table:tablename 89 | --------- 90 | @result: 91 | """ 92 | sql = "SELECT indexname FROM pg_indexes WHERE tablename = '{table}'" 93 | sql = sql.format(table=table).replace("None", "null") 94 | return sql 95 | 96 | 97 | def make_insert_sql( 98 | table, 99 | data, 100 | auto_update=False, 101 | update_columns=(), 102 | insert_ignore=False, 103 | indexes_cols=(), 104 | ): 105 | """ 106 | @summary: 适用于PostgreSQL 107 | --------- 108 | @param table: 109 | @param data: 表数据 json格式 110 | @param auto_update: 更新所有所有列的开关 111 | @param update_columns: 需要更新的列 默认全部,当指定值时,auto_update设置无效,当duplicate key冲突时更新指定的列 112 | @param insert_ignore: 更新策略:数据存在则忽略本条数据 113 | @param indexes_cols: 索引列 114 | --------- 115 | @result: 116 | """ 117 | 118 | keys = ["{}".format(key) for key in data.keys()] 119 | keys = list2str(keys).replace("'", "") 120 | 121 | values = [format_sql_value(value) for value in data.values()] 122 | values = list2str(values) 123 | 124 | if update_columns: 125 | if not isinstance(update_columns, (tuple, list)): 126 | update_columns = [update_columns] 127 | update_columns_ = ", ".join( 128 | ["{key}=excluded.{key}".format(key=key) for key in update_columns] 129 | ) 130 | sql = ( 131 | "insert into {table} {keys} values {values} on conflict({indexes_cols}) DO UPDATE SET %s" 132 | % update_columns_ 133 | ) 134 | 135 | elif auto_update: 136 | update_all_columns_ = ", ".join( 137 | ["{key}=excluded.{key}".format(key=key) for key in keys] 138 | ) 139 | sql = ( 140 | "insert into {table} {keys} values {values} on conflict({indexes_cols}) DO UPDATE SET %s" 141 | % update_all_columns_ 142 | ) 143 | elif insert_ignore: 144 | sql = "insert into {table} {keys} values {values} on conflict({indexes_cols}) DO NOTHING" 145 | else: 146 | sql = "insert into {table} {keys} values {values}" 147 | 148 | sql = sql.format( 149 | table=table, keys=keys, values=values, indexes_cols=indexes_cols 150 | ).replace("None", "null") 151 | return sql 152 | 153 | 154 | def make_update_sql(table, data, condition): 155 | """ 156 | @summary: 适用于PostgreSQL 157 | --------- 158 | @param table: 159 | @param data: 表数据 json格式 160 | @param condition: where 条件 161 | --------- 162 | @result: 163 | """ 164 | key_values = [] 165 | 166 | for key, value in data.items(): 167 | value = format_sql_value(value) 168 | if isinstance(value, str): 169 | key_values.append("{}={}".format(key, repr(value))) 170 | elif value is None: 171 | key_values.append("{}={}".format(key, "null")) 172 | else: 173 | key_values.append("{}={}".format(key, value)) 174 | 175 | key_values = ", ".join(key_values) 176 | 177 | sql = "update {table} set {key_values} where {condition}" 178 | sql = sql.format(table=table, key_values=key_values, condition=condition) 179 | return sql 180 | 181 | 182 | def make_batch_sql( 183 | table, 184 | datas, 185 | auto_update=False, 186 | update_columns=(), 187 | update_columns_value=(), 188 | indexes_cols=(), 189 | ): 190 | """ 191 | @summary: 生产批量的sql 192 | --------- 193 | @param table: 194 | @param datas: 表数据 [{...}] 195 | @param auto_update: 使用的是replace into, 为完全覆盖已存在的数据 196 | @param update_columns: 需要更新的列 默认全部,当指定值时,auto_update设置无效,当duplicate key冲突时更新指定的列 197 | @param update_columns_value: 需要更新的列的值 默认为datas里边对应的值, 注意 如果值为字符串类型 需要主动加单引号, 如 update_columns_value=("'test'",) 198 | @param indexes_cols: 索引列 str 199 | --------- 200 | @result: 201 | """ 202 | if not datas: 203 | return 204 | 205 | keys = list(datas[0].keys()) 206 | values_placeholder = ["%s"] * len(keys) 207 | 208 | values = [] 209 | for data in datas: 210 | value = [] 211 | for key in keys: 212 | current_data = data.get(key) 213 | current_data = format_sql_value(current_data) 214 | 215 | value.append(current_data) 216 | 217 | values.append(value) 218 | 219 | keys = ["{}".format(key) for key in keys] 220 | keys = list2str(keys).replace("'", "") 221 | 222 | values_placeholder = list2str(values_placeholder).replace("'", "") 223 | 224 | if update_columns: 225 | if not isinstance(update_columns, (tuple, list)): 226 | update_columns = [update_columns] 227 | if update_columns_value: 228 | update_columns_ = ", ".join( 229 | [ 230 | "{key}=excluded.{value}".format(key=key, value=value) 231 | for key, value in zip(update_columns, update_columns_value) 232 | ] 233 | ) 234 | else: 235 | update_columns_ = ", ".join( 236 | ["{key}=excluded.{key}".format(key=key) for key in update_columns] 237 | ) 238 | sql = "insert into {table} {keys} values {values_placeholder} ON CONFLICT({indexes_cols}) DO UPDATE SET {update_columns}".format( 239 | table=table, 240 | keys=keys, 241 | values_placeholder=values_placeholder, 242 | update_columns=update_columns_, 243 | indexes_cols=indexes_cols, 244 | ) 245 | elif auto_update: 246 | update_all_columns_ = ", ".join( 247 | ["{key}=excluded.{key}".format(key=key) for key in keys] 248 | ) 249 | sql = "insert into {table} {keys} values {values_placeholder} on conflict({indexes_cols}) DO UPDATE SET {update_all_columns_}".format( 250 | table=table, 251 | keys=keys, 252 | values_placeholder=values_placeholder, 253 | indexes_cols=indexes_cols, 254 | update_all_columns_=update_all_columns_, 255 | ) 256 | else: 257 | sql = "insert into {table} {keys} values {values_placeholder} on conflict({indexes_cols}) do nothing".format( 258 | table=table, 259 | keys=keys, 260 | values_placeholder=values_placeholder, 261 | indexes_cols=indexes_cols, 262 | ) 263 | 264 | return sql, values 265 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2020/4/22 10:45 PM 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | @email: boris_liu@foxmail.com 9 | """ 10 | 11 | from os.path import dirname, join 12 | from sys import version_info 13 | 14 | import setuptools 15 | 16 | if version_info < (3, 6, 0): 17 | raise SystemExit("Sorry! feapder_pipelines requires python 3.6.0 or later.") 18 | 19 | with open(join(dirname(__file__), "feapder_pipelines/VERSION"), "rb") as f: 20 | version = f.read().decode("ascii").strip() 21 | 22 | with open("README.md", "r") as fh: 23 | long_description = fh.read() 24 | 25 | packages = setuptools.find_packages() 26 | requires = [ 27 | "feapder", 28 | ] 29 | 30 | extras_require = {"pgsql": ["psycopg2-binary>=2.9.2"]} 31 | 32 | setuptools.setup( 33 | name="feapder_pipelines", 34 | version=version, 35 | author="Boris", 36 | license="MIT", 37 | author_email="feapder@qq.com", 38 | python_requires=">=3.6", 39 | description="feapder pipelines extension", 40 | long_description=long_description, 41 | long_description_content_type="text/markdown", 42 | install_requires=requires, 43 | extras_require=extras_require, 44 | url="https://github.com/Boris-code/feapder_pipelines.git", 45 | packages=packages, 46 | include_package_data=True, 47 | classifiers=["Programming Language :: Python :: 3"], 48 | ) 49 | -------------------------------------------------------------------------------- /tests/pgsql/test_pgsqldb.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2021-12-04 14:42 4 | --------- 5 | @summary: 操作pgsql数据库 6 | --------- 7 | @author: 沈瑞祥 8 | @email: ruixiang.shen@outlook.com 9 | """ 10 | from feapder_pipelines.db.pgsqldb import PgsqlDB 11 | 12 | 13 | db = PgsqlDB( 14 | ip="localhost", port=5432, db="postgres", user_name="postgres", user_pass="123456" 15 | ) 16 | 17 | # postgresql://user_name:user_passwd@ip:port/db?charset=utf8 18 | PgsqlDB.from_url("postgresql://postgres:123456@localhost:5432/postgres?charset=utf8") 19 | -------------------------------------------------------------------------------- /tests/pgsql/test_postgresql_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2021-12-04 14:42 4 | --------- 5 | @summary: 操作pgsql数据库 6 | --------- 7 | @author: 沈瑞祥 8 | @email: ruixiang.shen@outlook.com 9 | """ 10 | import feapder 11 | from feapder import Item, UpdateItem 12 | 13 | 14 | class TestPostgreSQL(feapder.AirSpider): 15 | __custom_setting__ = dict( 16 | ITEM_PIPELINES=["feapder_pipelines.pipelines.pgsql_pipeline.PgsqlPipeline"], 17 | PGSQL_IP="localhost", 18 | PGSQL_PORT=5432, 19 | PGSQL_DB="feapder", 20 | PGSQL_USER_NAME="postgres", 21 | PGSQL_USER_PASS="123456", 22 | ) 23 | 24 | def start_requests(self): 25 | yield feapder.Request("https://www.baidu.com") 26 | 27 | def parse(self, request, response): 28 | title = response.xpath("//title/text()").extract_first() # 取标题 29 | for i in range(10): 30 | item = Item() # 声明一个item 31 | item.table_name = "test_postgresql" 32 | item.id = i 33 | item.title = title 34 | item.index = i 35 | item.c = "777" 36 | yield item # 返回item, item会自动批量入库 37 | 38 | # 测试索引冲突 39 | for i in range(10): 40 | item = Item() # 声明一个item 41 | item.table_name = "test_postgresql" 42 | item.id = i + 5 43 | item.title = title 44 | item.index = i 45 | item.c = "777" 46 | yield item # 返回item, item会自动批量入库 47 | 48 | 49 | if __name__ == "__main__": 50 | TestPostgreSQL().start() 51 | -------------------------------------------------------------------------------- /tests/pgsql/test_postgresql_updateitem_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2021-12-04 14:42 4 | --------- 5 | @summary: 操作pgsql数据库 6 | --------- 7 | @author: 沈瑞祥 8 | @email: ruixiang.shen@outlook.com 9 | """ 10 | import feapder 11 | from feapder import Item, UpdateItem 12 | 13 | 14 | class TestPostgreSQL(feapder.AirSpider): 15 | __custom_setting__ = dict( 16 | ITEM_PIPELINES=["feapder_pipelines.pipelines.pgsql_pipeline.PgsqlPipeline"], 17 | PGSQL_IP="localhost", 18 | PGSQL_PORT=5432, 19 | PGSQL_DB="feapder", 20 | PGSQL_USER_NAME="postgres", 21 | PGSQL_USER_PASS="123456", 22 | ) 23 | 24 | def start_requests(self): 25 | yield feapder.Request("https://www.baidu.com") 26 | 27 | def parse(self, request, response): 28 | title = response.xpath("//title/text()").extract_first() # 取标题 29 | for i in range(10): 30 | item = UpdateItem() # 声明一个item 31 | item.table_name = "test_postgresql" 32 | item.id = i 33 | item.title = title + str(666) # 给item属性赋值 34 | item.index = i 35 | item.c = "postgresql测试成功" 36 | yield item # 返回item, item会自动批量入库 37 | 38 | 39 | if __name__ == "__main__": 40 | TestPostgreSQL().start() 41 | --------------------------------------------------------------------------------