├── .gitignore ├── .pylintrc ├── Pipfile ├── Pipfile.lock ├── README.md ├── config ├── __init__.py ├── logger.py └── settings.py ├── db ├── __init__.py ├── queries.py └── utils.py ├── env.example ├── main.py ├── olx ├── __init__.py └── utils.py ├── tg └── __init__.py └── utils ├── __init__.py └── models.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | .env 3 | venv/ 4 | *.log 5 | *.gz 6 | __pycache__/ 7 | *.db 8 | *.sqlite 9 | *.sqlite3 10 | -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MASTER] 2 | 3 | # Specify a configuration file. 4 | #rcfile= 5 | 6 | # Python code to execute, usually for sys.path manipulation such as 7 | # pygtk.require(). 8 | #init-hook= 9 | 10 | # Add files or directories to the blacklist. They should be base names, not 11 | # paths. 12 | ignore=tests 13 | 14 | # Add files or directories matching the regex patterns to the blacklist. The 15 | # regex matches against base names, not paths. 16 | ignore-patterns=object_detection_grpc_client.py,prediction_pb2.py,prediction_pb2_grpc.py,mnist_DDP.py,mnistddpserving.py 17 | 18 | # Pickle collected data for later comparisons. 19 | persistent=no 20 | 21 | # List of plugins (as comma separated values of python modules names) to load, 22 | # usually to register additional checkers. 23 | load-plugins= 24 | 25 | # Use multiple processes to speed up Pylint. 26 | jobs=4 27 | 28 | # Allow loading of arbitrary C extensions. Extensions are imported into the 29 | # active Python interpreter and may run arbitrary code. 30 | unsafe-load-any-extension=no 31 | 32 | # A comma-separated list of package or module names from where C extensions may 33 | # be loaded. Extensions are loading into the active Python interpreter and may 34 | # run arbitrary code 35 | extension-pkg-whitelist= 36 | 37 | 38 | [MESSAGES CONTROL] 39 | 40 | # Only show warnings with the listed confidence levels. Leave empty to show 41 | # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED 42 | confidence= 43 | 44 | # Enable the message, report, category or checker with the given id(s). You can 45 | # either give multiple identifier separated by comma (,) or put this option 46 | # multiple time (only on the command line, not in the configuration file where 47 | # it should appear only once). See also the "--disable" option for examples. 48 | #enable= 49 | 50 | # Disable the message, report, category or checker with the given id(s). You 51 | # can either give multiple identifiers separated by comma (,) or put this 52 | # option multiple times (only on the command line, not in the configuration 53 | # file where it should appear only once).You can also use "--disable=all" to 54 | # disable everything first and then reenable specific checks. For example, if 55 | # you want to run only the similarities checker, you can use "--disable=all 56 | # --enable=similarities". If you want to run only the classes checker, but have 57 | # no Warning level messages displayed, use"--disable=all --enable=classes 58 | # --disable=W" 59 | disable=import-star-module-level,old-octal-literal,oct-method,print-statement,unpacking-in-except,parameter-unpacking,backtick,old-raise-syntax,old-ne-operator,long-suffix,dict-view-method,dict-iter-method,metaclass-assignment,next-method-called,raising-string,indexing-exception,raw_input-builtin,long-builtin,file-builtin,execfile-builtin,coerce-builtin,cmp-builtin,buffer-builtin,basestring-builtin,apply-builtin,filter-builtin-not-iterating,using-cmp-argument,useless-suppression,range-builtin-not-iterating,suppressed-message,missing-docstring,no-absolute-import,old-division,cmp-method,reload-builtin,zip-builtin-not-iterating,intern-builtin,unichr-builtin,reduce-builtin,standarderror-builtin,unicode-builtin,xrange-builtin,coerce-method,delslice-method,getslice-method,setslice-method,input-builtin,round-builtin,hex-method,nonzero-method,map-builtin-not-iterating,relative-import,invalid-name,bad-continuation,no-member,locally-disabled,fixme,import-error,too-many-locals 60 | 61 | 62 | [REPORTS] 63 | 64 | # Set the output format. Available formats are text, parseable, colorized, msvs 65 | # (visual studio) and html. You can also give a reporter class, eg 66 | # mypackage.mymodule.MyReporterClass. 67 | output-format=text 68 | 69 | # Put messages in a separate file for each module / package specified on the 70 | # command line instead of printing them on stdout. Reports (if any) will be 71 | # written in a file name "pylint_global.[txt|html]". This option is deprecated 72 | # and it will be removed in Pylint 2.0. 73 | files-output=no 74 | 75 | # Tells whether to display a full report or only the messages 76 | reports=no 77 | 78 | # Python expression which should return a note less than 10 (10 is the highest 79 | # note). You have access to the variables errors warning, statement which 80 | # respectively contain the number of errors / warnings messages and the total 81 | # number of statements analyzed. This is used by the global evaluation report 82 | # (RP0004). 83 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) 84 | 85 | # Template used to display messages. This is a python new-style format string 86 | # used to format the message information. See doc for all details 87 | #msg-template= 88 | 89 | 90 | [BASIC] 91 | 92 | # Good variable names which should always be accepted, separated by a comma 93 | good-names=i,j,k,ex,Run,_ 94 | 95 | # Bad variable names which should always be refused, separated by a comma 96 | bad-names=foo,bar,baz,toto,tutu,tata 97 | 98 | # Colon-delimited sets of names that determine each other's naming style when 99 | # the name regexes allow several styles. 100 | name-group= 101 | 102 | # Include a hint for the correct naming format with invalid-name 103 | include-naming-hint=no 104 | 105 | # List of decorators that produce properties, such as abc.abstractproperty. Add 106 | # to this list to register other decorators that produce valid properties. 107 | property-classes=abc.abstractproperty 108 | 109 | # Regular expression matching correct function names 110 | function-rgx=[a-z_][a-z0-9_]{2,30}$ 111 | 112 | # Naming hint for function names 113 | function-name-hint=[a-z_][a-z0-9_]{2,30}$ 114 | 115 | # Regular expression matching correct variable names 116 | variable-rgx=[a-z_][a-z0-9_]{2,30}$ 117 | 118 | # Naming hint for variable names 119 | variable-name-hint=[a-z_][a-z0-9_]{2,30}$ 120 | 121 | # Regular expression matching correct constant names 122 | const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$ 123 | 124 | # Naming hint for constant names 125 | const-name-hint=(([A-Z_][A-Z0-9_]*)|(__.*__))$ 126 | 127 | # Regular expression matching correct attribute names 128 | attr-rgx=[a-z_][a-z0-9_]{2,30}$ 129 | 130 | # Naming hint for attribute names 131 | attr-name-hint=[a-z_][a-z0-9_]{2,30}$ 132 | 133 | # Regular expression matching correct argument names 134 | argument-rgx=[a-z_][a-z0-9_]{2,30}$ 135 | 136 | # Naming hint for argument names 137 | argument-name-hint=[a-z_][a-z0-9_]{2,30}$ 138 | 139 | # Regular expression matching correct class attribute names 140 | class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$ 141 | 142 | # Naming hint for class attribute names 143 | class-attribute-name-hint=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$ 144 | 145 | # Regular expression matching correct inline iteration names 146 | inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$ 147 | 148 | # Naming hint for inline iteration names 149 | inlinevar-name-hint=[A-Za-z_][A-Za-z0-9_]*$ 150 | 151 | # Regular expression matching correct class names 152 | class-rgx=[A-Z_][a-zA-Z0-9]+$ 153 | 154 | # Naming hint for class names 155 | class-name-hint=[A-Z_][a-zA-Z0-9]+$ 156 | 157 | # Regular expression matching correct module names 158 | module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ 159 | 160 | # Naming hint for module names 161 | module-name-hint=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ 162 | 163 | # Regular expression matching correct method names 164 | method-rgx=[a-z_][a-z0-9_]{2,30}$ 165 | 166 | # Naming hint for method names 167 | method-name-hint=[a-z_][a-z0-9_]{2,30}$ 168 | 169 | # Regular expression which should only match function or class names that do 170 | # not require a docstring. 171 | no-docstring-rgx=^_ 172 | 173 | # Minimum line length for functions/classes that require docstrings, shorter 174 | # ones are exempt. 175 | docstring-min-length=-1 176 | 177 | 178 | [ELIF] 179 | 180 | # Maximum number of nested blocks for function / method body 181 | max-nested-blocks=5 182 | 183 | 184 | [TYPECHECK] 185 | 186 | # Tells whether missing members accessed in mixin class should be ignored. A 187 | # mixin class is detected if its name ends with "mixin" (case insensitive). 188 | ignore-mixin-members=yes 189 | 190 | # List of module names for which member attributes should not be checked 191 | # (useful for modules/projects where namespaces are manipulated during runtime 192 | # and thus existing member attributes cannot be deduced by static analysis. It 193 | # supports qualified module names, as well as Unix pattern matching. 194 | ignored-modules= 195 | 196 | # List of class names for which member attributes should not be checked (useful 197 | # for classes with dynamically set attributes). This supports the use of 198 | # qualified names. 199 | ignored-classes=optparse.Values,thread._local,_thread._local 200 | 201 | # List of members which are set dynamically and missed by pylint inference 202 | # system, and so shouldn't trigger E1101 when accessed. Python regular 203 | # expressions are accepted. 204 | generated-members= 205 | 206 | # List of decorators that produce context managers, such as 207 | # contextlib.contextmanager. Add to this list to register other decorators that 208 | # produce valid context managers. 209 | contextmanager-decorators=contextlib.contextmanager 210 | 211 | 212 | [FORMAT] 213 | 214 | # Maximum number of characters on a single line. 215 | max-line-length=100 216 | 217 | # Regexp for a line that is allowed to be longer than the limit. 218 | ignore-long-lines=^\s*(# )??$ 219 | 220 | # Allow the body of an if to be on the same line as the test if there is no 221 | # else. 222 | single-line-if-stmt=no 223 | 224 | # List of optional constructs for which whitespace checking is disabled. `dict- 225 | # separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}. 226 | # `trailing-comma` allows a space between comma and closing bracket: (a, ). 227 | # `empty-line` allows space-only lines. 228 | no-space-check=trailing-comma,dict-separator 229 | 230 | # Maximum number of lines in a module 231 | max-module-lines=1000 232 | 233 | # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 234 | # tab). 235 | indent-string=' ' 236 | 237 | # Number of spaces of indent required inside a hanging or continued line. 238 | indent-after-paren=4 239 | 240 | # Expected format of line ending, e.g. empty (any line ending), LF or CRLF. 241 | expected-line-ending-format= 242 | 243 | 244 | [MISCELLANEOUS] 245 | 246 | # List of note tags to take in consideration, separated by a comma. 247 | notes=FIXME,XXX,TODO 248 | 249 | 250 | [VARIABLES] 251 | 252 | # Tells whether we should check for unused import in __init__ files. 253 | init-import=no 254 | 255 | # A regular expression matching the name of dummy variables (i.e. expectedly 256 | # not used). 257 | dummy-variables-rgx=(_+[a-zA-Z0-9]*?$)|dummy 258 | 259 | # List of additional names supposed to be defined in builtins. Remember that 260 | # you should avoid to define new builtins when possible. 261 | additional-builtins= 262 | 263 | # List of strings which can identify a callback function by name. A callback 264 | # name must start or end with one of those strings. 265 | callbacks=cb_,_cb 266 | 267 | # List of qualified module names which can have objects that can redefine 268 | # builtins. 269 | redefining-builtins-modules=six.moves,future.builtins 270 | 271 | 272 | [LOGGING] 273 | 274 | # Logging modules to check that the string format arguments are in logging 275 | # function parameter format 276 | logging-modules=logging 277 | 278 | 279 | [SIMILARITIES] 280 | 281 | # Minimum lines number of a similarity. 282 | min-similarity-lines=4 283 | 284 | # Ignore comments when computing similarities. 285 | ignore-comments=yes 286 | 287 | # Ignore docstrings when computing similarities. 288 | ignore-docstrings=yes 289 | 290 | # Ignore imports when computing similarities. 291 | ignore-imports=no 292 | 293 | 294 | [SPELLING] 295 | 296 | # Spelling dictionary name. Available dictionaries: none. To make it working 297 | # install python-enchant package. 298 | spelling-dict= 299 | 300 | # List of comma separated words that should not be checked. 301 | spelling-ignore-words= 302 | 303 | # A path to a file that contains private dictionary; one word per line. 304 | spelling-private-dict-file= 305 | 306 | # Tells whether to store unknown words to indicated private dictionary in 307 | # --spelling-private-dict-file option instead of raising a message. 308 | spelling-store-unknown-words=no 309 | 310 | 311 | [IMPORTS] 312 | 313 | # Deprecated modules which should not be used, separated by a comma 314 | deprecated-modules=regsub,TERMIOS,Bastion,rexec 315 | 316 | # Create a graph of every (i.e. internal and external) dependencies in the 317 | # given file (report RP0402 must not be disabled) 318 | import-graph= 319 | 320 | # Create a graph of external dependencies in the given file (report RP0402 must 321 | # not be disabled) 322 | ext-import-graph= 323 | 324 | # Create a graph of internal dependencies in the given file (report RP0402 must 325 | # not be disabled) 326 | int-import-graph= 327 | 328 | # Force import order to recognize a module as part of the standard 329 | # compatibility libraries. 330 | known-standard-library= 331 | 332 | # Force import order to recognize a module as part of a third party library. 333 | known-third-party=enchant 334 | 335 | # Analyse import fallback blocks. This can be used to support both Python 2 and 336 | # 3 compatible code, which means that the block might have code that exists 337 | # only in one or another interpreter, leading to false positives when analysed. 338 | analyse-fallback-blocks=no 339 | 340 | 341 | [DESIGN] 342 | 343 | # Maximum number of arguments for function / method 344 | max-args=7 345 | 346 | # Argument names that match this expression will be ignored. Default to name 347 | # with leading underscore 348 | ignored-argument-names=_.* 349 | 350 | # Maximum number of locals for function / method body 351 | max-locals=15 352 | 353 | # Maximum number of return / yield for function / method body 354 | max-returns=6 355 | 356 | # Maximum number of branch for function / method body 357 | max-branches=12 358 | 359 | # Maximum number of statements in function / method body 360 | max-statements=50 361 | 362 | # Maximum number of parents for a class (see R0901). 363 | max-parents=7 364 | 365 | # Maximum number of attributes for a class (see R0902). 366 | max-attributes=7 367 | 368 | # Minimum number of public methods for a class (see R0903). 369 | min-public-methods=0 370 | 371 | # Maximum number of public methods for a class (see R0904). 372 | max-public-methods=20 373 | 374 | # Maximum number of boolean expressions in a if statement 375 | max-bool-expr=5 376 | 377 | 378 | [CLASSES] 379 | 380 | # List of method names used to declare (i.e. assign) instance attributes. 381 | defining-attr-methods=__init__,__new__,setUp 382 | 383 | # List of valid names for the first argument in a class method. 384 | valid-classmethod-first-arg=cls 385 | 386 | # List of valid names for the first argument in a metaclass class method. 387 | valid-metaclass-classmethod-first-arg=mcs 388 | 389 | # List of member names, which should be excluded from the protected access 390 | # warning. 391 | exclude-protected=_asdict,_fields,_replace,_source,_make,_response 392 | 393 | 394 | [EXCEPTIONS] 395 | 396 | # Exceptions that will emit a warning when being caught. Defaults to 397 | # "Exception" 398 | overgeneral-exceptions=Exception 399 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | name = "pypi" 3 | url = "https://pypi.org/simple" 4 | verify_ssl = true 5 | 6 | [dev-packages] 7 | pylint = "*" 8 | 9 | [packages] 10 | beautifulsoup4 = "*" 11 | environs = "*" 12 | fake-useragent = "*" 13 | lxml = "*" 14 | python-dateutil = "*" 15 | python-telegram-bot = "*" 16 | requests = "*" 17 | 18 | [requires] 19 | python_version = "3.7" 20 | -------------------------------------------------------------------------------- /Pipfile.lock: -------------------------------------------------------------------------------- 1 | { 2 | "_meta": { 3 | "hash": { 4 | "sha256": "b3d59cfc3126d8a3e90a3e503b30fddb41c7fcec1b628f5975f60ca561188525" 5 | }, 6 | "pipfile-spec": 6, 7 | "requires": { 8 | "python_version": "3.7" 9 | }, 10 | "sources": [ 11 | { 12 | "name": "pypi", 13 | "url": "https://pypi.org/simple", 14 | "verify_ssl": true 15 | } 16 | ] 17 | }, 18 | "default": { 19 | "beautifulsoup4": { 20 | "hashes": [ 21 | "sha256:034740f6cb549b4e932ae1ab975581e6103ac8f942200a0e9759065984391858", 22 | "sha256:945065979fb8529dd2f37dbb58f00b661bdbcbebf954f93b32fdf5263ef35348", 23 | "sha256:ba6d5c59906a85ac23dadfe5c88deaf3e179ef565f4898671253e50a78680718" 24 | ], 25 | "index": "pypi", 26 | "version": "==4.7.1" 27 | }, 28 | "certifi": { 29 | "hashes": [ 30 | "sha256:e4f3620cfea4f83eedc95b24abd9cd56f3c4b146dd0177e83a21b4eb49e21e50", 31 | "sha256:fd7c7c74727ddcf00e9acd26bba8da604ffec95bf1c2144e67aff7a8b50e6cef" 32 | ], 33 | "version": "==2019.9.11" 34 | }, 35 | "cffi": { 36 | "hashes": [ 37 | "sha256:00d890313797d9fe4420506613384b43099ad7d2b905c0752dbcc3a6f14d80fa", 38 | "sha256:0cf9e550ac6c5e57b713437e2f4ac2d7fd0cd10336525a27224f5fc1ec2ee59a", 39 | "sha256:0ea23c9c0cdd6778146a50d867d6405693ac3b80a68829966c98dd5e1bbae400", 40 | "sha256:193697c2918ecdb3865acf6557cddf5076bb39f1f654975e087b67efdff83365", 41 | "sha256:1ae14b542bf3b35e5229439c35653d2ef7d8316c1fffb980f9b7647e544baa98", 42 | "sha256:1e389e069450609c6ffa37f21f40cce36f9be7643bbe5051ab1de99d5a779526", 43 | "sha256:263242b6ace7f9cd4ea401428d2d45066b49a700852334fd55311bde36dcda14", 44 | "sha256:33142ae9807665fa6511cfa9857132b2c3ee6ddffb012b3f0933fc11e1e830d5", 45 | "sha256:364f8404034ae1b232335d8c7f7b57deac566f148f7222cef78cf8ae28ef764e", 46 | "sha256:47368f69fe6529f8f49a5d146ddee713fc9057e31d61e8b6dc86a6a5e38cecc1", 47 | "sha256:4895640844f17bec32943995dc8c96989226974dfeb9dd121cc45d36e0d0c434", 48 | "sha256:558b3afef987cf4b17abd849e7bedf64ee12b28175d564d05b628a0f9355599b", 49 | "sha256:5ba86e1d80d458b338bda676fd9f9d68cb4e7a03819632969cf6d46b01a26730", 50 | "sha256:63424daa6955e6b4c70dc2755897f5be1d719eabe71b2625948b222775ed5c43", 51 | "sha256:6381a7d8b1ebd0bc27c3bc85bc1bfadbb6e6f756b4d4db0aa1425c3719ba26b4", 52 | "sha256:6381ab708158c4e1639da1f2a7679a9bbe3e5a776fc6d1fd808076f0e3145331", 53 | "sha256:6fd58366747debfa5e6163ada468a90788411f10c92597d3b0a912d07e580c36", 54 | "sha256:728ec653964655d65408949b07f9b2219df78badd601d6c49e28d604efe40599", 55 | "sha256:7cfcfda59ef1f95b9f729c56fe8a4041899f96b72685d36ef16a3440a0f85da8", 56 | "sha256:819f8d5197c2684524637f940445c06e003c4a541f9983fd30d6deaa2a5487d8", 57 | "sha256:825ecffd9574557590e3225560a8a9d751f6ffe4a49e3c40918c9969b93395fa", 58 | "sha256:9009e917d8f5ef780c2626e29b6bc126f4cb2a4d43ca67aa2b40f2a5d6385e78", 59 | "sha256:9c77564a51d4d914ed5af096cd9843d90c45b784b511723bd46a8a9d09cf16fc", 60 | "sha256:a19089fa74ed19c4fe96502a291cfdb89223a9705b1d73b3005df4256976142e", 61 | "sha256:a40ed527bffa2b7ebe07acc5a3f782da072e262ca994b4f2085100b5a444bbb2", 62 | "sha256:bb75ba21d5716abc41af16eac1145ab2e471deedde1f22c6f99bd9f995504df0", 63 | "sha256:e22a00c0c81ffcecaf07c2bfb3672fa372c50e2bd1024ffee0da191c1b27fc71", 64 | "sha256:e55b5a746fb77f10c83e8af081979351722f6ea48facea79d470b3731c7b2891", 65 | "sha256:ec2fa3ee81707a5232bf2dfbd6623fdb278e070d596effc7e2d788f2ada71a05", 66 | "sha256:fd82eb4694be712fcae03c717ca2e0fc720657ac226b80bbb597e971fc6928c2" 67 | ], 68 | "version": "==1.13.1" 69 | }, 70 | "chardet": { 71 | "hashes": [ 72 | "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", 73 | "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691" 74 | ], 75 | "version": "==3.0.4" 76 | }, 77 | "cryptography": { 78 | "hashes": [ 79 | "sha256:02079a6addc7b5140ba0825f542c0869ff4df9a69c360e339ecead5baefa843c", 80 | "sha256:1df22371fbf2004c6f64e927668734070a8953362cd8370ddd336774d6743595", 81 | "sha256:369d2346db5934345787451504853ad9d342d7f721ae82d098083e1f49a582ad", 82 | "sha256:3cda1f0ed8747339bbdf71b9f38ca74c7b592f24f65cdb3ab3765e4b02871651", 83 | "sha256:44ff04138935882fef7c686878e1c8fd80a723161ad6a98da31e14b7553170c2", 84 | "sha256:4b1030728872c59687badcca1e225a9103440e467c17d6d1730ab3d2d64bfeff", 85 | "sha256:58363dbd966afb4f89b3b11dfb8ff200058fbc3b947507675c19ceb46104b48d", 86 | "sha256:6ec280fb24d27e3d97aa731e16207d58bd8ae94ef6eab97249a2afe4ba643d42", 87 | "sha256:7270a6c29199adc1297776937a05b59720e8a782531f1f122f2eb8467f9aab4d", 88 | "sha256:73fd30c57fa2d0a1d7a49c561c40c2f79c7d6c374cc7750e9ac7c99176f6428e", 89 | "sha256:7f09806ed4fbea8f51585231ba742b58cbcfbfe823ea197d8c89a5e433c7e912", 90 | "sha256:90df0cc93e1f8d2fba8365fb59a858f51a11a394d64dbf3ef844f783844cc793", 91 | "sha256:971221ed40f058f5662a604bd1ae6e4521d84e6cad0b7b170564cc34169c8f13", 92 | "sha256:a518c153a2b5ed6b8cc03f7ae79d5ffad7315ad4569b2d5333a13c38d64bd8d7", 93 | "sha256:b0de590a8b0979649ebeef8bb9f54394d3a41f66c5584fff4220901739b6b2f0", 94 | "sha256:b43f53f29816ba1db8525f006fa6f49292e9b029554b3eb56a189a70f2a40879", 95 | "sha256:d31402aad60ed889c7e57934a03477b572a03af7794fa8fb1780f21ea8f6551f", 96 | "sha256:de96157ec73458a7f14e3d26f17f8128c959084931e8997b9e655a39c8fde9f9", 97 | "sha256:df6b4dca2e11865e6cfbfb708e800efb18370f5a46fd601d3755bc7f85b3a8a2", 98 | "sha256:ecadccc7ba52193963c0475ac9f6fa28ac01e01349a2ca48509667ef41ffd2cf", 99 | "sha256:fb81c17e0ebe3358486cd8cc3ad78adbae58af12fc2bf2bc0bb84e8090fa5ce8" 100 | ], 101 | "version": "==2.8" 102 | }, 103 | "environs": { 104 | "hashes": [ 105 | "sha256:9b752da76f3d470dea21b7abfe44a836c775bbb3ca50f50bfc4b188c49ac633a", 106 | "sha256:a69695630145bf559baedd1ac99312abd27cca55cbc1dca7f7a6a2bae7e1570c" 107 | ], 108 | "index": "pypi", 109 | "version": "==4.1.0" 110 | }, 111 | "fake-useragent": { 112 | "hashes": [ 113 | "sha256:c104998b750eb097eefc28ae28e92d66397598d2cf41a31aa45d5559ef1adf35" 114 | ], 115 | "index": "pypi", 116 | "version": "==0.1.11" 117 | }, 118 | "future": { 119 | "hashes": [ 120 | "sha256:858e38522e8fd0d3ce8f0c1feaf0603358e366d5403209674c7b617fa0c24093" 121 | ], 122 | "version": "==0.18.1" 123 | }, 124 | "idna": { 125 | "hashes": [ 126 | "sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407", 127 | "sha256:ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c" 128 | ], 129 | "version": "==2.8" 130 | }, 131 | "lxml": { 132 | "hashes": [ 133 | "sha256:0dd6589fa75d369ba06d2b5f38dae107f76ea127f212f6a7bee134f6df2d1d21", 134 | "sha256:1afbac344aa68c29e81ab56c1a9411c3663157b5aee5065b7fa030b398d4f7e0", 135 | "sha256:1baad9d073692421ad5dbbd81430aba6c7f5fdc347f03537ae046ddf2c9b2297", 136 | "sha256:1d8736421a2358becd3edf20260e41a06a0bf08a560480d3a5734a6bcbacf591", 137 | "sha256:1e1d9bddc5afaddf0de76246d3f2152f961697ad7439c559f179002682c45801", 138 | "sha256:1f179dc8b2643715f020f4d119d5529b02cd794c1c8f305868b73b8674d2a03f", 139 | "sha256:241fb7bdf97cb1df1edfa8f0bcdfd80525d4023dac4523a241907c8b2f44e541", 140 | "sha256:2f9765ee5acd3dbdcdc0d0c79309e01f7c16bc8d39b49250bf88de7b46daaf58", 141 | "sha256:312e1e1b1c3ce0c67e0b8105317323e12807955e8186872affb667dbd67971f6", 142 | "sha256:3273db1a8055ca70257fd3691c6d2c216544e1a70b673543e15cc077d8e9c730", 143 | "sha256:34dfaa8c02891f9a246b17a732ca3e99c5e42802416628e740a5d1cb2f50ff49", 144 | "sha256:3aa3f5288af349a0f3a96448ebf2e57e17332d99f4f30b02093b7948bd9f94cc", 145 | "sha256:51102e160b9d83c1cc435162d90b8e3c8c93b28d18d87b60c56522d332d26879", 146 | "sha256:56115fc2e2a4140e8994eb9585119a1ae9223b506826089a3ba753a62bd194a6", 147 | "sha256:69d83de14dbe8fe51dccfd36f88bf0b40f5debeac763edf9f8325180190eba6e", 148 | "sha256:99fdce94aeaa3ccbdfcb1e23b34273605c5853aa92ec23d84c84765178662c6c", 149 | "sha256:a7c0cd5b8a20f3093ee4a67374ccb3b8a126743b15a4d759e2a1bf098faac2b2", 150 | "sha256:abe12886554634ed95416a46701a917784cb2b4c77bfacac6916681d49bbf83d", 151 | "sha256:b4f67b5183bd5f9bafaeb76ad119e977ba570d2b0e61202f534ac9b5c33b4485", 152 | "sha256:bdd7c1658475cc1b867b36d5c4ed4bc316be8d3368abe03d348ba906a1f83b0e", 153 | "sha256:c6f24149a19f611a415a51b9bc5f17b6c2f698e0d6b41ffb3fa9f24d35d05d73", 154 | "sha256:d1e111b3ab98613115a208c1017f266478b0ab224a67bc8eac670fa0bad7d488", 155 | "sha256:d6520aa965773bbab6cb7a791d5895b00d02cf9adc93ac2bf4edb9ac1a6addc5", 156 | "sha256:dd185cde2ccad7b649593b0cda72021bc8a91667417001dbaf24cd746ecb7c11", 157 | "sha256:de2e5b0828a9d285f909b5d2e9d43f1cf6cf21fe65bc7660bdaa1780c7b58298", 158 | "sha256:f726444b8e909c4f41b4fde416e1071cf28fa84634bfb4befdf400933b6463af" 159 | ], 160 | "index": "pypi", 161 | "version": "==4.3.0" 162 | }, 163 | "marshmallow": { 164 | "hashes": [ 165 | "sha256:077b4612f5d3b9333b736fdc6b963d2b46d409070f44ff3e6c4109645c673e83", 166 | "sha256:9a2f3e8ea5f530a9664e882d7d04b58650f46190178b2264c72b7d20399d28f0" 167 | ], 168 | "version": "==3.2.1" 169 | }, 170 | "pycparser": { 171 | "hashes": [ 172 | "sha256:a988718abfad80b6b157acce7bf130a30876d27603738ac39f140993246b25b3" 173 | ], 174 | "version": "==2.19" 175 | }, 176 | "python-dateutil": { 177 | "hashes": [ 178 | "sha256:063df5763652e21de43de7d9e00ccf239f953a832941e37be541614732cdfc93", 179 | "sha256:88f9287c0174266bb0d8cedd395cfba9c58e87e5ad86b2ce58859bc11be3cf02" 180 | ], 181 | "index": "pypi", 182 | "version": "==2.7.5" 183 | }, 184 | "python-dotenv": { 185 | "hashes": [ 186 | "sha256:debd928b49dbc2bf68040566f55cdb3252458036464806f4094487244e2a4093", 187 | "sha256:f157d71d5fec9d4bd5f51c82746b6344dffa680ee85217c123f4a0c8117c4544" 188 | ], 189 | "version": "==0.10.3" 190 | }, 191 | "python-telegram-bot": { 192 | "hashes": [ 193 | "sha256:78695b1f6e147e9b360ccfb1ac92b542cab27870ccaf04065a88ee601ffa58b6", 194 | "sha256:cca4e32ebb8da7fdf35ab2fa2b3edd441211364819c5592fc253acdb7561ea5b" 195 | ], 196 | "index": "pypi", 197 | "version": "==11.1.0" 198 | }, 199 | "requests": { 200 | "hashes": [ 201 | "sha256:502a824f31acdacb3a35b6690b5fbf0bc41d63a24a45c4004352b0242707598e", 202 | "sha256:7bf2a778576d825600030a110f3c0e3e8edc51dfaafe1c146e39a2027784957b" 203 | ], 204 | "index": "pypi", 205 | "version": "==2.21.0" 206 | }, 207 | "six": { 208 | "hashes": [ 209 | "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c", 210 | "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73" 211 | ], 212 | "version": "==1.12.0" 213 | }, 214 | "soupsieve": { 215 | "hashes": [ 216 | "sha256:605f89ad5fdbfefe30cdc293303665eff2d188865d4dbe4eb510bba1edfbfce3", 217 | "sha256:b91d676b330a0ebd5b21719cb6e9b57c57d433671f65b9c28dd3461d9a1ed0b6" 218 | ], 219 | "version": "==1.9.4" 220 | }, 221 | "urllib3": { 222 | "hashes": [ 223 | "sha256:4c291ca23bbb55c76518905869ef34bdd5f0e46af7afe6861e8375643ffee1a0", 224 | "sha256:9a247273df709c4fedb38c711e44292304f73f39ab01beda9f6b9fc375669ac3" 225 | ], 226 | "index": "pypi", 227 | "version": "==1.24.2" 228 | } 229 | }, 230 | "develop": { 231 | "astroid": { 232 | "hashes": [ 233 | "sha256:09a3fba616519311f1af8a461f804b68f0370e100c9264a035aa7846d7852e33", 234 | "sha256:5a79c9b4bd6c4be777424593f957c996e20beb5f74e0bc332f47713c6f675efe" 235 | ], 236 | "version": "==2.3.2" 237 | }, 238 | "isort": { 239 | "hashes": [ 240 | "sha256:54da7e92468955c4fceacd0c86bd0ec997b0e1ee80d97f67c35a78b719dccab1", 241 | "sha256:6e811fcb295968434526407adb8796944f1988c5b65e8139058f2014cbe100fd" 242 | ], 243 | "version": "==4.3.21" 244 | }, 245 | "lazy-object-proxy": { 246 | "hashes": [ 247 | "sha256:02b260c8deb80db09325b99edf62ae344ce9bc64d68b7a634410b8e9a568edbf", 248 | "sha256:18f9c401083a4ba6e162355873f906315332ea7035803d0fd8166051e3d402e3", 249 | "sha256:1f2c6209a8917c525c1e2b55a716135ca4658a3042b5122d4e3413a4030c26ce", 250 | "sha256:2f06d97f0ca0f414f6b707c974aaf8829c2292c1c497642f63824119d770226f", 251 | "sha256:616c94f8176808f4018b39f9638080ed86f96b55370b5a9463b2ee5c926f6c5f", 252 | "sha256:63b91e30ef47ef68a30f0c3c278fbfe9822319c15f34b7538a829515b84ca2a0", 253 | "sha256:77b454f03860b844f758c5d5c6e5f18d27de899a3db367f4af06bec2e6013a8e", 254 | "sha256:83fe27ba321e4cfac466178606147d3c0aa18e8087507caec78ed5a966a64905", 255 | "sha256:84742532d39f72df959d237912344d8a1764c2d03fe58beba96a87bfa11a76d8", 256 | "sha256:874ebf3caaf55a020aeb08acead813baf5a305927a71ce88c9377970fe7ad3c2", 257 | "sha256:9f5caf2c7436d44f3cec97c2fa7791f8a675170badbfa86e1992ca1b84c37009", 258 | "sha256:a0c8758d01fcdfe7ae8e4b4017b13552efa7f1197dd7358dc9da0576f9d0328a", 259 | "sha256:a4def978d9d28cda2d960c279318d46b327632686d82b4917516c36d4c274512", 260 | "sha256:ad4f4be843dace866af5fc142509e9b9817ca0c59342fdb176ab6ad552c927f5", 261 | "sha256:ae33dd198f772f714420c5ab698ff05ff900150486c648d29951e9c70694338e", 262 | "sha256:b4a2b782b8a8c5522ad35c93e04d60e2ba7f7dcb9271ec8e8c3e08239be6c7b4", 263 | "sha256:c462eb33f6abca3b34cdedbe84d761f31a60b814e173b98ede3c81bb48967c4f", 264 | "sha256:fd135b8d35dfdcdb984828c84d695937e58cc5f49e1c854eb311c4d6aa03f4f1" 265 | ], 266 | "version": "==1.4.2" 267 | }, 268 | "mccabe": { 269 | "hashes": [ 270 | "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42", 271 | "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f" 272 | ], 273 | "version": "==0.6.1" 274 | }, 275 | "pylint": { 276 | "hashes": [ 277 | "sha256:689de29ae747642ab230c6d37be2b969bf75663176658851f456619aacf27492", 278 | "sha256:771467c434d0d9f081741fec1d64dfb011ed26e65e12a28fe06ca2f61c4d556c" 279 | ], 280 | "index": "pypi", 281 | "version": "==2.2.2" 282 | }, 283 | "six": { 284 | "hashes": [ 285 | "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c", 286 | "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73" 287 | ], 288 | "version": "==1.12.0" 289 | }, 290 | "typed-ast": { 291 | "hashes": [ 292 | "sha256:1170afa46a3799e18b4c977777ce137bb53c7485379d9706af8a59f2ea1aa161", 293 | "sha256:18511a0b3e7922276346bcb47e2ef9f38fb90fd31cb9223eed42c85d1312344e", 294 | "sha256:262c247a82d005e43b5b7f69aff746370538e176131c32dda9cb0f324d27141e", 295 | "sha256:2b907eb046d049bcd9892e3076c7a6456c93a25bebfe554e931620c90e6a25b0", 296 | "sha256:354c16e5babd09f5cb0ee000d54cfa38401d8b8891eefa878ac772f827181a3c", 297 | "sha256:48e5b1e71f25cfdef98b013263a88d7145879fbb2d5185f2a0c79fa7ebbeae47", 298 | "sha256:4e0b70c6fc4d010f8107726af5fd37921b666f5b31d9331f0bd24ad9a088e631", 299 | "sha256:630968c5cdee51a11c05a30453f8cd65e0cc1d2ad0d9192819df9978984529f4", 300 | "sha256:66480f95b8167c9c5c5c87f32cf437d585937970f3fc24386f313a4c97b44e34", 301 | "sha256:71211d26ffd12d63a83e079ff258ac9d56a1376a25bc80b1cdcdf601b855b90b", 302 | "sha256:7954560051331d003b4e2b3eb822d9dd2e376fa4f6d98fee32f452f52dd6ebb2", 303 | "sha256:838997f4310012cf2e1ad3803bce2f3402e9ffb71ded61b5ee22617b3a7f6b6e", 304 | "sha256:95bd11af7eafc16e829af2d3df510cecfd4387f6453355188342c3e79a2ec87a", 305 | "sha256:bc6c7d3fa1325a0c6613512a093bc2a2a15aeec350451cbdf9e1d4bffe3e3233", 306 | "sha256:cc34a6f5b426748a507dd5d1de4c1978f2eb5626d51326e43280941206c209e1", 307 | "sha256:d755f03c1e4a51e9b24d899561fec4ccaf51f210d52abdf8c07ee2849b212a36", 308 | "sha256:d7c45933b1bdfaf9f36c579671fec15d25b06c8398f113dab64c18ed1adda01d", 309 | "sha256:d896919306dd0aa22d0132f62a1b78d11aaf4c9fc5b3410d3c666b818191630a", 310 | "sha256:fdc1c9bbf79510b76408840e009ed65958feba92a88833cdceecff93ae8fff66", 311 | "sha256:ffde2fbfad571af120fcbfbbc61c72469e72f550d676c3342492a9dfdefb8f12" 312 | ], 313 | "markers": "implementation_name == 'cpython' and python_version < '3.8'", 314 | "version": "==1.4.0" 315 | }, 316 | "wrapt": { 317 | "hashes": [ 318 | "sha256:565a021fd19419476b9362b05eeaa094178de64f8361e44468f9e9d7843901e1" 319 | ], 320 | "version": "==1.11.2" 321 | } 322 | } 323 | } 324 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## OLX flat parser. 2 | Simple python script which can be running by crontab every 30 minutes for instance, 3 | looking up about new flats in Odessa and notifying in the telegram through the bot. 4 | The script is looking for apartments that were added yesterday and today only. 5 | 6 | ### Prerequisites 7 | 0. `python 3.6 or higher` 8 | 0. `pipenv` 9 | 0. `sqlite3` 10 | 11 | ## Getting Started 12 | 0. Clone project to your own local machine - `git clone https://github.com/digitalashes/olx-parser.git` 13 | 0. Go to the script directory - `cd olx-parser` 14 | 0. Copy file env.example to config directory and rename it to *.env* - `cp env.example ./config/.env` 15 | 0. Fill in **TELEGRAM_BOT_KEY** and **TELEGRAM_CHAT_IDS** in `config/.env` also if you want, you can uncomment and change others constants. 16 | 0. Create new pipenv environment - `pipenv install` 17 | 0. Run `pipenv shell main.py` and waiting messages. 18 | 19 | #### Settings description: 20 | 21 | Crontab rule (every 30 minutes) - `0/30 * * * * ` 22 | 23 | 0. **BASE_URL** - Base url of olx with protocol. - `https://www.olx.ua/` 24 | 0. **PHONE_URL** - Url for fetching seller telephone numbers. 25 | 0. **CATEGORY** - `nedvizhimost` 26 | 0. **SUB_CATEGORY** - `arenda-kvartir` 27 | 0. **SUB_SUB_CATEGORY** - `dolgosrochnaya-arenda-kvartir` 28 | 0. **CITY** - `odessa` 29 | 0. **DISTRICT_ID** - `85` (Киевский), `199` (Коминтерновский), `87` (Малиновский), `89` (Приморский), `91` (Суворовский) 30 | 0. **MIN_PRICE** - Min price of flat rent (not set less 1000). `2500` 31 | 0. **MAX_PRICE** - Max price of flat rent. `5000` 32 | 0. **MIN_ROOMS** - Min rooms amount in flat. `1` 33 | 0. **MAX_ROOMS** - Max rooms amount in flat. `1` 34 | 0. **WITH_PHOTOS** - Search ads with photos only or not. `True` 35 | 0. **WITH_PROMOTED** - Include promoted ads. `False` 36 | 0. **PUBLICATION_DATE** - List of values with information when ad was published. `['сегодня', 'вчера']` 37 | 0. **TELEGRAM_BOT_API_URL** - Telegram api url. 38 | 0. **TELEGRAM_BOT_KEY** - Api key of telegram bot which will be sending messages. 39 | 0. **TELEGRAM_CHAT_IDS** - List of conversations ids when messages will be sending. 40 | 0. **LOG_FILENAME** - Name of logfile. 41 | -------------------------------------------------------------------------------- /config/__init__.py: -------------------------------------------------------------------------------- 1 | from .logger import logger 2 | from .settings import settings 3 | -------------------------------------------------------------------------------- /config/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from config.settings import settings 4 | 5 | __all__ = [ 6 | 'logger', 7 | ] 8 | 9 | logger = logging.getLogger(settings.LOGGER_NAME) 10 | logger.setLevel(logging.INFO) 11 | 12 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - ' 13 | '[%(module)s - %(filename)s - %(funcName)s - %(lineno)d]:\n' 14 | ' %(message)s', 15 | datefmt='%Y-%m-%d-%H:%M:%S') 16 | if settings.LOGGING_IN_STDOUT: 17 | ch = logging.StreamHandler() 18 | ch.setLevel(logging.INFO) 19 | ch.setFormatter(formatter) 20 | logger.addHandler(ch) 21 | 22 | if settings.LOGGING_IN_FILE: 23 | fh = logging.FileHandler(f'{settings.LOGGER_NAME}.log') 24 | fh.setLevel(logging.INFO) 25 | fh.setFormatter(formatter) 26 | logger.addHandler(fh) 27 | -------------------------------------------------------------------------------- /config/settings.py: -------------------------------------------------------------------------------- 1 | from environs import Env 2 | 3 | env = Env() 4 | env.read_env() 5 | 6 | 7 | class settings: 8 | DEFAULT_USER_AGENT = env.str('DEFAULT_USER_AGENT', 9 | default='Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 ' 10 | '(KHTML, like Gecko) Chrome/61.0.3163.91 Safari/537.36') 11 | 12 | BASE_URL = env.str('BASE_URL', default='https://www.olx.ua/') 13 | PHONE_URL = env.str('PHONE_URL', default='https://www.olx.ua/ajax/misc/contact/phone/') 14 | 15 | CATEGORY = env.str('CATEGORY', default='nedvizhimost') 16 | SUB_CATEGORY = env.str('SUB_CATEGORY', default='arenda-kvartir') 17 | SUB_SUB_CATEGORY = env.str('SUB_SUB_CATEGORY', default='dolgosrochnaya-arenda-kvartir') 18 | CITY = env.str('CITY', default='odessa') 19 | DISTRICT_ID = env.int('DISTRICT_ID', default=89) 20 | MIN_PRICE = env.int('MIN_PRICE', default=7000) 21 | MAX_PRICE = env.int('MAX_PRICE', default=10000) 22 | MIN_ROOMS = env.int('MIN_ROOMS', default=1) 23 | MAX_ROOMS = env.int('MAX_ROOMS', default=1) 24 | WITH_PHOTOS = int(env.bool('WITH_PHOTOS', default=True)) 25 | WITH_PROMOTED = env.bool('WITH_PROMOTED', default=True) 26 | PUBLICATION_DATE = [item.lower() for item 27 | in env.list('PUBLICATION_DATE', default=['сегодня', 'вчера'])] 28 | 29 | TELEGRAM_BOT_API_URL = env.str('TELEGRAM_BOT_API_URL', default='https://api.telegram.org/bot') 30 | TELEGRAM_BOT_KEY = env.str('TELEGRAM_BOT_KEY', default=None) 31 | TELEGRAM_CHAT_IDS = env.list('TELEGRAM_CHAT_IDS', default=[]) 32 | 33 | DB_NAME = 'olx_parser.db' 34 | 35 | LOGGER_NAME = env.str('LOG_FILENAME', default='olx_parser_log') 36 | LOGGING_IN_STDOUT = env.bool('LOGGING_IN_STDOUT', default=True) 37 | LOGGING_IN_FILE = env.bool('LOGGING_IN_FILE', default=False) 38 | 39 | MONTH_MAPPING = { 40 | 'янв.': 1, 41 | 'февр.': 2, 42 | 'марта': 3, 43 | 'апр.': 4, 44 | 'мая': 5, 45 | 'июня': 6, 46 | 'июля': 7, 47 | 'авг.': 8, 48 | 'сент.': 9, 49 | 'окт.': 10, 50 | 'нояб.': 11, 51 | 'дек.': 12, 52 | } 53 | -------------------------------------------------------------------------------- /db/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/digitalashes/olx-parser/12234eae806c086e60c79a0b0bfa3abd9088399e/db/__init__.py -------------------------------------------------------------------------------- /db/queries.py: -------------------------------------------------------------------------------- 1 | from sqlite3 import Cursor as SqliteCursor 2 | from typing import List 3 | 4 | from config import logger 5 | from utils.models import AdModel 6 | from utils.models import LandLordModel 7 | from .utils import db_cache 8 | 9 | 10 | def get_exists_ads(db_cursor: SqliteCursor, 11 | external_ids: List[str]) -> List[int]: 12 | logger.info('=== Fetching existing ads ===') 13 | 14 | result = db_cursor.execute(f""" 15 | select external_id 16 | from ads 17 | where external_id in ({",".join(external_ids)}); 18 | """).fetchall() 19 | 20 | if result: 21 | result = [item[0] for item in result] 22 | logger.info('=== Found %s ads ===', len(result)) 23 | return result 24 | 25 | logger.info('=== Existing ads not found ===') 26 | return result 27 | 28 | 29 | @db_cache 30 | def get_author_id(db_cursor: SqliteCursor, 31 | external_id: str) -> int: 32 | logger.info('=== Trying get author with %s external id ===', external_id) 33 | 34 | result = db_cursor.execute(""" 35 | select id 36 | from authors 37 | where external_id = ?; 38 | """, (external_id,)).fetchone() 39 | 40 | if result: 41 | result = result[0] 42 | logger.info('=== Found author with id - %s ===', result) 43 | return result 44 | 45 | logger.info('=== Author not found ===') 46 | return result 47 | 48 | 49 | def create_author(db_cursor: SqliteCursor, 50 | data: LandLordModel) -> int: 51 | logger.info('=== Adding a new author - %s ===', repr(data)) 52 | 53 | db_cursor.execute(""" 54 | insert into authors(external_id, url, name, platform_created_at, other_ads) 55 | values (?,?,?,?,?); 56 | """, (data.external_id, data.url, data.name, data.platform_created_at, data.other_ads)) 57 | return db_cursor.lastrowid 58 | 59 | 60 | def create_ad(db_cursor: SqliteCursor, 61 | data: AdModel) -> None: 62 | logger.info('=== Adding a new ad - %s ===', repr(data)) 63 | 64 | db_cursor.execute(""" 65 | insert into ads(external_id, title, price, url, author_id, platform_created_at) 66 | values (?,?,?,?,?,?); 67 | """, (data.external_id, data.title, data.price, data.url, data.author_id, data.created)) 68 | 69 | 70 | def add_phones(db_cursor: SqliteCursor, 71 | author_id: int, phones: List[str]) -> None: 72 | logger.info('Add new phones - %s for author with id - %s ===', phones, author_id) 73 | 74 | for phone in phones: 75 | db_cursor.execute(""" 76 | insert into phones(phone, author_id) 77 | select :phone, :author_id 78 | where not exists(select 1 from phones where author_id = :author_id and phone = :phone) 79 | """, {'author_id': author_id, 'phone': phone}) 80 | -------------------------------------------------------------------------------- /db/utils.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | from functools import wraps 3 | from typing import Callable 4 | 5 | from config import logger 6 | 7 | sql = """ 8 | 9 | pragma foreign_keys = on; 10 | 11 | create table if not exists authors 12 | ( 13 | id integer primary key autoincrement, 14 | external_id text not null, 15 | url text not null, 16 | name text not null, 17 | platform_created_at date not null default current_date, 18 | other_ads integer not null default 0 19 | ); 20 | 21 | create unique index if not exists authors_id_uindex 22 | on authors (id); 23 | create unique index if not exists authors_external_id_uindex 24 | on authors (external_id); 25 | create index if not exists authors_platform_created_at_index 26 | on authors (platform_created_at); 27 | 28 | create table if not exists phones 29 | ( 30 | id integer primary key autoincrement, 31 | phone text not null, 32 | author_id integer not null, 33 | 34 | foreign key (author_id) references authors (id) 35 | ); 36 | 37 | create unique index if not exists phones_id_uindex 38 | on phones (id); 39 | create index if not exists phones_phone_index 40 | on phones (phone); 41 | create index if not exists phones_author_id_index 42 | on phones (author_id); 43 | 44 | create table if not exists ads 45 | ( 46 | id integer primary key autoincrement, 47 | external_id text not null, 48 | title text not null, 49 | price real not null, 50 | url text not null, 51 | author_id integer not null, 52 | platform_created_at timestamp not null default current_timestamp, 53 | 54 | foreign key (author_id) references authors (id) 55 | ); 56 | 57 | create unique index if not exists ads_id_uindex 58 | on ads (id); 59 | create unique index if not exists ads_external_id_uindex 60 | on ads (external_id); 61 | create index if not exists ads_author_id_index 62 | on ads (author_id); 63 | create index if not exists ads_platform_created_at_index 64 | on ads (platform_created_at); 65 | 66 | """ 67 | 68 | 69 | def check_db(db_connect, db_cursor) -> None: 70 | logger.info('=== Check database tables ===') 71 | 72 | db_cursor.executescript(sql) 73 | db_connect.commit() 74 | 75 | 76 | def db_cache(func: Callable = None) -> Callable: 77 | _cache = {} 78 | 79 | if func is None: 80 | return partial(db_cache) 81 | 82 | @wraps(func) 83 | def __wrapper(*args, **kwargs): 84 | *_, _external_id = args 85 | _id = _cache.get(_external_id) 86 | if _id is None: 87 | _id = func(*args, **kwargs) 88 | if _id is not None: 89 | _cache[_external_id] = _id 90 | return _id 91 | 92 | return __wrapper 93 | -------------------------------------------------------------------------------- /env.example: -------------------------------------------------------------------------------- 1 | #DEFAULT_USER_AGENT= 2 | 3 | #BASE_URL= 4 | #PHONE_URL= 5 | #CATEGORY= 6 | #SUB_CATEGORY= 7 | #SUB_SUB_CATEGORY= 8 | #CITY= 9 | #DISTRICT_ID= 10 | #MIN_PRICE= 11 | #MAX_PRICE= 12 | #MIN_ROOMS= 13 | #MAX_ROOMS= 14 | #WITH_PHOTOS= 15 | #WITH_PROMOTED= 16 | #PUBLICATION_DATE= 17 | 18 | #TELEGRAM_BOT_API_URL= 19 | TELEGRAM_BOT_KEY= 20 | TELEGRAM_CHAT_IDS= 21 | 22 | #LOG_FILENAME= 23 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | from requests import session as RequestsSession 4 | from telegram import Bot 5 | 6 | from config import logger 7 | from config import settings 8 | from olx import fetch_ads 9 | from olx import filter_new_ads 10 | from tg import send_message_into_telegram 11 | 12 | 13 | def main(): 14 | new_ads = [] 15 | bot = Bot(token=settings.TELEGRAM_BOT_KEY) 16 | 17 | with RequestsSession() as session: 18 | ads = fetch_ads(session) 19 | if ads: 20 | new_ads = filter_new_ads(session, ads) 21 | if new_ads: 22 | send_message_into_telegram(bot, new_ads) 23 | 24 | 25 | if __name__ == '__main__': 26 | start_time = time.time() 27 | logger.info('=== Script has been started ===') 28 | try: 29 | main() 30 | except KeyboardInterrupt: 31 | logger.info('=== Script has been stopped manually! ===') 32 | except Exception as e: # pylint: disable=broad-except 33 | logger.exception(e) 34 | else: 35 | logger.info('=== Script has been finished successfully ===') 36 | finally: 37 | logger.info('=== Operating time is %s seconds ===', (time.time() - start_time)) 38 | -------------------------------------------------------------------------------- /olx/__init__.py: -------------------------------------------------------------------------------- 1 | import re 2 | import sqlite3 3 | from contextlib import closing 4 | from typing import List 5 | from typing import Set 6 | from typing import Tuple 7 | from urllib.parse import urljoin 8 | from urllib.parse import urlparse 9 | 10 | from bs4 import BeautifulSoup 11 | from dateutil import parser 12 | from fake_useragent import UserAgent 13 | from requests import ConnectionError as RequestsConnectionError 14 | from requests import session as Session 15 | 16 | from config import logger 17 | from config import settings 18 | from db.queries import add_phones 19 | from db.queries import create_ad 20 | from db.queries import create_author 21 | from db.queries import get_author_id 22 | from db.queries import get_exists_ads 23 | from db.utils import check_db 24 | from olx.utils import _get_landlord_created_at 25 | from olx.utils import _get_landlord_id 26 | from olx.utils import _get_landlord_name 27 | from olx.utils import _get_landlord_other_ads_count 28 | from olx.utils import _get_landlord_url 29 | from utils import RussianParserInfo 30 | from utils import build_url 31 | from utils.models import AdModel 32 | from utils.models import LandLordModel 33 | from utils.models import NewAdModel 34 | 35 | 36 | def fetch_ads(session: Session) -> Set[AdModel]: 37 | url = build_url() 38 | ads = [] 39 | 40 | logger.info('=== Starting fetch ads ===') 41 | 42 | response = session.get(url) 43 | if response.status_code != 200: 44 | logger.critical('=== Unsuccessful attempt. ' 45 | 'Please check url - %s ' 46 | 'The script will be stopped ===', url) 47 | raise RequestsConnectionError('Unable to get urls') 48 | 49 | soup = BeautifulSoup(response.content.decode('utf-8'), 'lxml') 50 | ads_items = soup.find_all('table', attrs={'summary': 'Объявление'}) 51 | 52 | logger.info('=== Start processing %s ads ===', len(ads_items)) 53 | for item in ads_items: 54 | 55 | item_url_obj = item.find('a', class_='marginright5') 56 | item_url, url_info, *_ = item_url_obj.attrs.get('href').split('#') 57 | 58 | if not settings.WITH_PROMOTED and 'promoted' in url_info: 59 | continue 60 | 61 | try: 62 | price = int( 63 | item.find('p', class_='price').text.split(' грн.')[0].strip().replace(' ', '') 64 | ) 65 | except ValueError: 66 | logger.exception('=== Error during parsing a price ===') 67 | continue 68 | 69 | day = item.select('small > span')[1].text.strip().split(' ')[0].lower() 70 | 71 | ad = AdModel( 72 | external_id=item.attrs.get('data-id'), 73 | title=item_url_obj.text.strip(), 74 | price=price, 75 | url=item_url, 76 | ) 77 | 78 | if day in settings.PUBLICATION_DATE and \ 79 | settings.MIN_PRICE <= ad.price <= settings.MAX_PRICE: 80 | ads.append(ad) 81 | 82 | result = {ad for ad in ads} 83 | logger.info('=== Found %s ads after filtering ===', len(result)) 84 | return result 85 | 86 | 87 | def filter_new_ads(session: Session, ads: Set[AdModel]) -> List[NewAdModel]: 88 | new_ads = [] 89 | result = [] 90 | 91 | with closing(sqlite3.connect(settings.DB_NAME)) as db_connect: 92 | with closing(db_connect.cursor()) as db_cursor: 93 | check_db(db_connect, db_cursor) 94 | exists_ads = get_exists_ads(db_cursor, sorted([ad.external_id for ad in ads])) 95 | ads = [ad for ad in ads if ad.external_id not in exists_ads] 96 | 97 | if not ads: 98 | logger.info('=== New ads not found ===') 99 | 100 | for ad in ads: 101 | result.append(fetch_ads_detail(session, ad)) 102 | 103 | for item in filter(None, result): 104 | ad, author, phones, *_ = item 105 | author_id = get_author_id(db_cursor, author.external_id) 106 | if author_id is None: 107 | author_id = create_author(db_cursor, author) 108 | ad = ad._replace(author_id=author_id) 109 | 110 | create_ad(db_cursor, ad) 111 | add_phones(db_cursor, author_id, phones) 112 | 113 | new_ads.append( 114 | NewAdModel( 115 | title=ad.title, 116 | price=ad.price, 117 | url=ad.url, 118 | created=ad.created, 119 | author=author.name, 120 | phones=phones, 121 | ) 122 | ) 123 | db_connect.commit() 124 | return new_ads 125 | 126 | 127 | def fetch_ads_detail(session: Session, 128 | ad: AdModel) -> Tuple[AdModel, LandLordModel, List[str]] or None: 129 | ua = UserAgent(fallback=settings.DEFAULT_USER_AGENT) 130 | headers = { 131 | 'Host': urlparse(settings.BASE_URL).netloc, 132 | 'User-Agent': ua.random, 133 | 'Referer': ad.url, # Important! Must be present in headers and be equal of ad url. 134 | 'X-Requested-With': 'XMLHttpRequest', 135 | } 136 | logger.debug('=== Starting to fetch landlord telephone number and name ===') 137 | response = session.get(ad.url) 138 | 139 | if response.status_code != 200: 140 | logger.warning('=== Unsuccessful attempt ===') 141 | return None 142 | 143 | soup = BeautifulSoup(response.content.decode('utf-8'), 'lxml') 144 | if soup.select('div#ad-not-available-box'): 145 | return None 146 | 147 | # fetch landlord info 148 | landlord_url = _get_landlord_url(soup) 149 | landlord_id = _get_landlord_id(landlord_url) 150 | landlord_name = _get_landlord_name(soup) 151 | landlord_created_at = _get_landlord_created_at(soup) 152 | landlord_other_ads = _get_landlord_other_ads_count(soup) 153 | author = LandLordModel( 154 | external_id=landlord_id, 155 | url=landlord_url, 156 | name=landlord_name, 157 | platform_created_at=landlord_created_at, 158 | other_ads=landlord_other_ads, 159 | ) 160 | 161 | posted_at = ' '.join( 162 | filter(None, soup.select_one('div.offer-titlebox__details > em').text.strip().split(' ')) 163 | ) 164 | posted_at = parser.parse(re.findall(r'\s\d+:\d+, \d+ \w+ \d+', posted_at)[0], 165 | parserinfo=RussianParserInfo()) 166 | ad = ad._replace(created=posted_at) 167 | 168 | # find and get phoneToken (needed for correct request) 169 | raw_text = [elem for elem in soup.find_all('script') 170 | if 'phoneToken' in elem.text][0].text.strip() 171 | token = re.findall(r"['\"](.*?)['\"]", raw_text)[0] 172 | 173 | # get id of ad 174 | ad_id = ad.url.split('ID')[1].split('.')[0] 175 | # shaping of the correct url with ad id and phone token 176 | phone_url = urljoin(settings.PHONE_URL, f'{ad_id}/?pt={token}') 177 | response = session.get(phone_url, headers=headers) 178 | 179 | if response.status_code != 200: 180 | logger.warning('=== Unsuccessful attempt. Empty values of phone numbers ===') 181 | return ad, author, [] 182 | 183 | phone_numbers = response.json().get('value') 184 | 185 | logger.debug('=== Finishing to fetching landlord phone number and name ===') 186 | if 'span' not in phone_numbers: 187 | return ad, author, [phone_numbers] 188 | 189 | soup = BeautifulSoup(phone_numbers, 'lxml') 190 | phone_numbers = [item.text.strip() for item in soup.find_all('span')] 191 | return ad, author, phone_numbers 192 | -------------------------------------------------------------------------------- /olx/utils.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from urllib.parse import urlparse 3 | 4 | from bs4 import BeautifulSoup 5 | 6 | from config import logger 7 | from config import settings 8 | 9 | 10 | def _get_landlord_url(soup: BeautifulSoup) -> str: 11 | return soup.select('h4 > a')[0].attrs['href'] 12 | 13 | 14 | def _get_landlord_id(landlord_url: str) -> str: 15 | try: 16 | return landlord_url.split('/')[5] 17 | except IndexError: 18 | return urlparse(landlord_url)[1].split('.')[0] 19 | except Exception as e: # pylint: disable=broad-except 20 | logger.exception(e) 21 | return landlord_url 22 | 23 | 24 | def _get_landlord_name(soup: BeautifulSoup) -> str: 25 | return f"{soup.find('h4').text.strip()} " \ 26 | f"({soup.find('span', class_='user-since').text})" 27 | 28 | 29 | def _get_landlord_created_at(soup: BeautifulSoup) -> datetime.date: 30 | date = soup.find('span', class_='user-since').text.split('с ')[1] 31 | month, year = date.split(' ') 32 | month = settings.MONTH_MAPPING[month] 33 | return datetime.date(int(year), month, 1) 34 | 35 | 36 | def _get_landlord_other_ads_count(soup: BeautifulSoup) -> int: 37 | return len(soup.select('td.offer')) 38 | -------------------------------------------------------------------------------- /tg/__init__.py: -------------------------------------------------------------------------------- 1 | import time 2 | from typing import List 3 | 4 | from telegram import Bot 5 | 6 | from config import logger 7 | from config import settings 8 | from utils.models import NewAdModel 9 | 10 | 11 | def send_message_into_telegram(bot: Bot, new_ads: List[NewAdModel]) -> None: 12 | for item in reversed(new_ads): 13 | text = f'''{item.title} ({item.price}) 14 |
{'; '.join(item.phones)}\n{item.author}\n{item.created}
''' 15 | for chat in settings.TELEGRAM_CHAT_IDS: 16 | try: 17 | bot.send_message(chat_id=chat, text=text, parse_mode='HTML') 18 | time.sleep(0.250) 19 | except Exception: # pylint: disable=broad-except 20 | logger.exception('=== Error during sending message via Telegram ===') 21 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | from urllib.parse import urlencode 2 | from urllib.parse import urljoin 3 | from urllib.parse import urlparse 4 | from urllib.parse import urlunparse 5 | 6 | from dateutil import parser 7 | 8 | from config import settings 9 | 10 | 11 | class RussianParserInfo(parser.parserinfo): 12 | MONTHS = [('янв.', 'января'), 13 | ('февр.', 'февраля'), 14 | ('марта', 'матра'), 15 | ('апр.', 'апреля'), 16 | ('мая', 'мая'), 17 | ('июня', 'июня'), 18 | ('июля', 'июля'), 19 | ('авг.', 'августа'), 20 | ('сент.', 'Sept', 'сентября'), 21 | ('окт.', 'октября'), 22 | ('нояб.', 'ноября'), 23 | ('дек.', 'декабря')] 24 | 25 | 26 | def build_url() -> str: 27 | filters = urlencode({ 28 | 'search[district_id]': settings.DISTRICT_ID, 29 | 'search[filter_float_price:from]': settings.MIN_PRICE, 30 | 'search[filter_float_price:to]': settings.MAX_PRICE, 31 | 'search[filter_float_number_of_rooms:from]': settings.MIN_ROOMS, 32 | 'search[filter_float_number_of_rooms:to]': settings.MAX_ROOMS, 33 | 'search[photos]': settings.WITH_PHOTOS, 34 | 'search[order]': 'created_at:desc' 35 | }) 36 | url = urlparse( 37 | urljoin(settings.BASE_URL, '/'.join([settings.CATEGORY, 38 | settings.SUB_CATEGORY, 39 | settings.SUB_SUB_CATEGORY, 40 | settings.CITY])) 41 | ) 42 | return urlunparse((url.scheme, url.netloc, url.path, None, filters, None)) 43 | -------------------------------------------------------------------------------- /utils/models.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from typing import List 3 | from typing import NamedTuple 4 | 5 | 6 | class AdModel(NamedTuple): 7 | external_id: str 8 | title: str 9 | price: int 10 | url: str 11 | created: datetime = None 12 | author_id: str = None 13 | 14 | 15 | class NewAdModel(NamedTuple): 16 | title: str 17 | price: int 18 | url: str 19 | created: datetime 20 | author: str 21 | phones: List[str] 22 | 23 | 24 | class LandLordModel(NamedTuple): 25 | external_id: str 26 | url: str 27 | name: str 28 | platform_created_at: datetime.date 29 | other_ads: int 30 | --------------------------------------------------------------------------------