├── .gitignore
├── Pipfile
├── Pipfile.lock
├── README.md
├── jav
│   ├── __init__.py
│   ├── exporters.py
│   ├── items.py
│   ├── middlewares.py
│   ├── pipelines.py
│   ├── settings.py
│   ├── sites
│   │   ├── __init__.py
│   │   ├── ave
│   │   │   ├── __init__.py
│   │   │   ├── article.py
│   │   │   ├── constants.py
│   │   │   ├── spiders
│   │   │   │   ├── __init__.py
│   │   │   │   ├── article_spider.py
│   │   │   │   └── video_spider.py
│   │   │   └── video.py
│   │   └── dmm
│   │       ├── __init__.py
│   │       ├── article.py
│   │       ├── constants.py
│   │       ├── spiders
│   │       │   ├── __init__.py
│   │       │   ├── actress_spider.py
│   │       │   ├── article_spider.py
│   │       │   ├── genre_spider.py
│   │       │   ├── maker_spider.py
│   │       │   ├── monthly_spider.py
│   │       │   ├── product_spider.py
│   │       │   ├── series_spider.py
│   │       │   └── video_spider.py
│   │       └── video.py
│   ├── spiders
│   │   ├── __init__.py
│   │   └── list_spider.py
│   └── utils.py
└── scrapy.cfg
/.gitignore: -------------------------------------------------------------------------------- 1 | # Scrapy stuff: 2 | .scrapy 3 | scrapinghub.yml 4 | 5 | # pyenv 6 | .python-version 7 | 8 | # dotenv 9 | .env 10 | 11 | setup.py 12 | *.egg-info/ 13 | build/ 14 | __pycache__/ 15 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [scripts] 7 | crawl = "scrapy crawl" 8 | 9 | [packages] 10 | scrapy = "*" 11 | requests = "*" 12 | 13 | [dev-packages] 14 | shub = "*" 15 | 16 | [requires] 17 | python_version = "3.8" 18 | -------------------------------------------------------------------------------- /Pipfile.lock: -------------------------------------------------------------------------------- 1 | { 2 | "_meta": { 3 | "hash": { 4 | "sha256": "57171a82ac724120db8309f8a2266577c76eaf5444798151971b32617fa0b4cc" 5 | }, 6 | "pipfile-spec": 6, 7 | "requires": { 8 | "python_version": "3.8" 9 | }, 10 | "sources": [ 11 | { 12 | "name": "pypi", 13 | "url": "https://pypi.org/simple", 14 | "verify_ssl": true 15 | } 16 | ] 17 | }, 18 | "default": { 19 | "attrs": { 20 | "hashes": [ 21 | "sha256:31b2eced602aa8423c2aea9c76a724617ed67cf9513173fd3a4f03e3a929c7e6", 22 | "sha256:832aa3cde19744e49938b91fea06d69ecb9e649c93ba974535d08ad92164f700" 23 | ], 24 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", 25 | "version": "==20.3.0" 26 | }, 27 | "automat": { 28 | "hashes": [ 29 | "sha256:7979803c74610e11ef0c0d68a2942b152df52da55336e0c9d58daf1831cbdf33", 30 | "sha256:b6feb6455337df834f6c9962d6ccf771515b7d939bca142b29c20c2376bc6111" 31 | ], 32 | "version": "==20.2.0" 33 | }, 34 | "certifi": { 35 | "hashes": [ 36 | "sha256:1a4995114262bffbc2413b159f2a1a480c969de6e6eb13ee966d470af86af59c", 37 | "sha256:719a74fb9e33b9bd44cc7f3a8d94bc35e4049deebe19ba7d8e108280cfd59830" 38 | ], 39 | "version": "==2020.12.5" 40 | }, 41 | "cffi": { 42 | "hashes": [ 43 | "sha256:00a1ba5e2e95684448de9b89888ccd02c98d512064b4cb987d48f4b40aa0421e", 44 | "sha256:00e28066507bfc3fe865a31f325c8391a1ac2916219340f87dfad602c3e48e5d", 45 | "sha256:045d792900a75e8b1e1b0ab6787dd733a8190ffcf80e8c8ceb2fb10a29ff238a", 46 | "sha256:0638c3ae1a0edfb77c6765d487fee624d2b1ee1bdfeffc1f0b58c64d149e7eec", 47 | "sha256:105abaf8a6075dc96c1fe5ae7aae073f4696f2905fde6aeada4c9d2926752362", 48 | "sha256:155136b51fd733fa94e1c2ea5211dcd4c8879869008fc811648f16541bf99668", 49 | "sha256:1a465cbe98a7fd391d47dce4b8f7e5b921e6cd805ef421d04f5f66ba8f06086c", 50 | "sha256:1d2c4994f515e5b485fd6d3a73d05526aa0fcf248eb135996b088d25dfa1865b", 51 |
"sha256:2c24d61263f511551f740d1a065eb0212db1dbbbbd241db758f5244281590c06", 52 | "sha256:51a8b381b16ddd370178a65360ebe15fbc1c71cf6f584613a7ea08bfad946698", 53 | "sha256:594234691ac0e9b770aee9fcdb8fa02c22e43e5c619456efd0d6c2bf276f3eb2", 54 | "sha256:5cf4be6c304ad0b6602f5c4e90e2f59b47653ac1ed9c662ed379fe48a8f26b0c", 55 | "sha256:64081b3f8f6f3c3de6191ec89d7dc6c86a8a43911f7ecb422c60e90c70be41c7", 56 | "sha256:6bc25fc545a6b3d57b5f8618e59fc13d3a3a68431e8ca5fd4c13241cd70d0009", 57 | "sha256:798caa2a2384b1cbe8a2a139d80734c9db54f9cc155c99d7cc92441a23871c03", 58 | "sha256:7c6b1dece89874d9541fc974917b631406233ea0440d0bdfbb8e03bf39a49b3b", 59 | "sha256:7ef7d4ced6b325e92eb4d3502946c78c5367bc416398d387b39591532536734e", 60 | "sha256:840793c68105fe031f34d6a086eaea153a0cd5c491cde82a74b420edd0a2b909", 61 | "sha256:8d6603078baf4e11edc4168a514c5ce5b3ba6e3e9c374298cb88437957960a53", 62 | "sha256:9cc46bc107224ff5b6d04369e7c595acb700c3613ad7bcf2e2012f62ece80c35", 63 | "sha256:9f7a31251289b2ab6d4012f6e83e58bc3b96bd151f5b5262467f4bb6b34a7c26", 64 | "sha256:9ffb888f19d54a4d4dfd4b3f29bc2c16aa4972f1c2ab9c4ab09b8ab8685b9c2b", 65 | "sha256:a5ed8c05548b54b998b9498753fb9cadbfd92ee88e884641377d8a8b291bcc01", 66 | "sha256:a7711edca4dcef1a75257b50a2fbfe92a65187c47dab5a0f1b9b332c5919a3fb", 67 | "sha256:af5c59122a011049aad5dd87424b8e65a80e4a6477419c0c1015f73fb5ea0293", 68 | "sha256:b18e0a9ef57d2b41f5c68beefa32317d286c3d6ac0484efd10d6e07491bb95dd", 69 | "sha256:b4e248d1087abf9f4c10f3c398896c87ce82a9856494a7155823eb45a892395d", 70 | "sha256:ba4e9e0ae13fc41c6b23299545e5ef73055213e466bd107953e4a013a5ddd7e3", 71 | "sha256:c6332685306b6417a91b1ff9fae889b3ba65c2292d64bd9245c093b1b284809d", 72 | "sha256:d5ff0621c88ce83a28a10d2ce719b2ee85635e85c515f12bac99a95306da4b2e", 73 | "sha256:d9efd8b7a3ef378dd61a1e77367f1924375befc2eba06168b6ebfa903a5e59ca", 74 | "sha256:df5169c4396adc04f9b0a05f13c074df878b6052430e03f50e68adf3a57aa28d", 75 | "sha256:ebb253464a5d0482b191274f1c8bf00e33f7e0b9c66405fbffc61ed2c839c775", 76 | "sha256:ec80dc47f54e6e9a78181ce05feb71a0353854cc26999db963695f950b5fb375", 77 | "sha256:f032b34669220030f905152045dfa27741ce1a6db3324a5bc0b96b6c7420c87b", 78 | "sha256:f60567825f791c6f8a592f3c6e3bd93dd2934e3f9dac189308426bd76b00ef3b", 79 | "sha256:f803eaa94c2fcda012c047e62bc7a51b0bdabda1cad7a92a522694ea2d76e49f" 80 | ], 81 | "version": "==1.14.4" 82 | }, 83 | "chardet": { 84 | "hashes": [ 85 | "sha256:0d6f53a15db4120f2b08c94f11e7d93d2c911ee118b6b30a04ec3ee8310179fa", 86 | "sha256:f864054d66fd9118f2e67044ac8981a54775ec5b67aed0441892edb553d21da5" 87 | ], 88 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", 89 | "version": "==4.0.0" 90 | }, 91 | "constantly": { 92 | "hashes": [ 93 | "sha256:586372eb92059873e29eba4f9dec8381541b4d3834660707faf8ba59146dfc35", 94 | "sha256:dd2fa9d6b1a51a83f0d7dd76293d734046aa176e384bf6e33b7e44880eb37c5d" 95 | ], 96 | "version": "==15.1.0" 97 | }, 98 | "cryptography": { 99 | "hashes": [ 100 | "sha256:0003a52a123602e1acee177dc90dd201f9bb1e73f24a070db7d36c588e8f5c7d", 101 | "sha256:0e85aaae861d0485eb5a79d33226dd6248d2a9f133b81532c8f5aae37de10ff7", 102 | "sha256:594a1db4511bc4d960571536abe21b4e5c3003e8750ab8365fafce71c5d86901", 103 | "sha256:69e836c9e5ff4373ce6d3ab311c1a2eed274793083858d3cd4c7d12ce20d5f9c", 104 | "sha256:788a3c9942df5e4371c199d10383f44a105d67d401fb4304178020142f020244", 105 | "sha256:7e177e4bea2de937a584b13645cab32f25e3d96fc0bc4a4cf99c27dc77682be6", 106 | "sha256:83d9d2dfec70364a74f4e7c70ad04d3ca2e6a08b703606993407bf46b97868c5", 107 | 
"sha256:84ef7a0c10c24a7773163f917f1cb6b4444597efd505a8aed0a22e8c4780f27e", 108 | "sha256:9e21301f7a1e7c03dbea73e8602905a4ebba641547a462b26dd03451e5769e7c", 109 | "sha256:9f6b0492d111b43de5f70052e24c1f0951cb9e6022188ebcb1cc3a3d301469b0", 110 | "sha256:a69bd3c68b98298f490e84519b954335154917eaab52cf582fa2c5c7efc6e812", 111 | "sha256:b4890d5fb9b7a23e3bf8abf5a8a7da8e228f1e97dc96b30b95685df840b6914a", 112 | "sha256:c366df0401d1ec4e548bebe8f91d55ebcc0ec3137900d214dd7aac8427ef3030", 113 | "sha256:dc42f645f8f3a489c3dd416730a514e7a91a59510ddaadc09d04224c098d3302" 114 | ], 115 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'", 116 | "version": "==3.3.1" 117 | }, 118 | "cssselect": { 119 | "hashes": [ 120 | "sha256:f612ee47b749c877ebae5bb77035d8f4202c6ad0f0fc1271b3c18ad6c4468ecf", 121 | "sha256:f95f8dedd925fd8f54edb3d2dfb44c190d9d18512377d3c1e2388d16126879bc" 122 | ], 123 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", 124 | "version": "==1.1.0" 125 | }, 126 | "hyperlink": { 127 | "hashes": [ 128 | "sha256:427af957daa58bc909471c6c40f74c5450fa123dd093fc53efd2e91d2705a56b", 129 | "sha256:e6b14c37ecb73e89c77d78cdb4c2cc8f3fb59a885c5b3f819ff4ed80f25af1b4" 130 | ], 131 | "version": "==21.0.0" 132 | }, 133 | "idna": { 134 | "hashes": [ 135 | "sha256:b307872f855b18632ce0c21c5e45be78c0ea7ae4c15c828c20788b26921eb3f6", 136 | "sha256:b97d804b1e9b523befed77c48dacec60e6dcb0b5391d57af6a65a312a90648c0" 137 | ], 138 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", 139 | "version": "==2.10" 140 | }, 141 | "incremental": { 142 | "hashes": [ 143 | "sha256:717e12246dddf231a349175f48d74d93e2897244939173b01974ab6661406b9f", 144 | "sha256:7b751696aaf36eebfab537e458929e194460051ccad279c72b755a167eebd4b3" 145 | ], 146 | "version": "==17.5.0" 147 | }, 148 | "itemadapter": { 149 | "hashes": [ 150 | "sha256:5327c2136353cb965b6b4ba564af002fd458691b8e30d3bd6b14c474d92c6b25", 151 | "sha256:cb7aaa577fefe2aa6f229ccf4d058e05f44e0178a98c8fb70ee4d95acfabb423" 152 | ], 153 | "markers": "python_version >= '3.6'", 154 | "version": "==0.2.0" 155 | }, 156 | "itemloaders": { 157 | "hashes": [ 158 | "sha256:1277cd8ca3e4c02dcdfbc1bcae9134ad89acfa6041bd15b4561c6290203a0c96", 159 | "sha256:4cb46a0f8915e910c770242ae3b60b1149913ed37162804f1e40e8535d6ec497" 160 | ], 161 | "markers": "python_version >= '3.6'", 162 | "version": "==1.0.4" 163 | }, 164 | "jmespath": { 165 | "hashes": [ 166 | "sha256:b85d0567b8666149a93172712e68920734333c0ce7e89b78b3e987f71e5ed4f9", 167 | "sha256:cdf6525904cc597730141d61b36f2e4b8ecc257c420fa2f4549bac2c2d0cb72f" 168 | ], 169 | "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'", 170 | "version": "==0.10.0" 171 | }, 172 | "lxml": { 173 | "hashes": [ 174 | "sha256:0448576c148c129594d890265b1a83b9cd76fd1f0a6a04620753d9a6bcfd0a4d", 175 | "sha256:127f76864468d6630e1b453d3ffbbd04b024c674f55cf0a30dc2595137892d37", 176 | "sha256:1471cee35eba321827d7d53d104e7b8c593ea3ad376aa2df89533ce8e1b24a01", 177 | "sha256:2363c35637d2d9d6f26f60a208819e7eafc4305ce39dc1d5005eccc4593331c2", 178 | "sha256:2e5cc908fe43fe1aa299e58046ad66981131a66aea3129aac7770c37f590a644", 179 | "sha256:2e6fd1b8acd005bd71e6c94f30c055594bbd0aa02ef51a22bbfa961ab63b2d75", 180 | "sha256:366cb750140f221523fa062d641393092813b81e15d0e25d9f7c6025f910ee80", 181 | "sha256:42ebca24ba2a21065fb546f3e6bd0c58c3fe9ac298f3a320147029a4850f51a2", 182 | "sha256:4e751e77006da34643ab782e4a5cc21ea7b755551db202bc4d3a423b307db780", 183 | 
"sha256:4fb85c447e288df535b17ebdebf0ec1cf3a3f1a8eba7e79169f4f37af43c6b98", 184 | "sha256:50c348995b47b5a4e330362cf39fc503b4a43b14a91c34c83b955e1805c8e308", 185 | "sha256:535332fe9d00c3cd455bd3dd7d4bacab86e2d564bdf7606079160fa6251caacf", 186 | "sha256:535f067002b0fd1a4e5296a8f1bf88193080ff992a195e66964ef2a6cfec5388", 187 | "sha256:5be4a2e212bb6aa045e37f7d48e3e1e4b6fd259882ed5a00786f82e8c37ce77d", 188 | "sha256:60a20bfc3bd234d54d49c388950195d23a5583d4108e1a1d47c9eef8d8c042b3", 189 | "sha256:648914abafe67f11be7d93c1a546068f8eff3c5fa938e1f94509e4a5d682b2d8", 190 | "sha256:681d75e1a38a69f1e64ab82fe4b1ed3fd758717bed735fb9aeaa124143f051af", 191 | "sha256:68a5d77e440df94011214b7db907ec8f19e439507a70c958f750c18d88f995d2", 192 | "sha256:69a63f83e88138ab7642d8f61418cf3180a4d8cd13995df87725cb8b893e950e", 193 | "sha256:6e4183800f16f3679076dfa8abf2db3083919d7e30764a069fb66b2b9eff9939", 194 | "sha256:6fd8d5903c2e53f49e99359b063df27fdf7acb89a52b6a12494208bf61345a03", 195 | "sha256:791394449e98243839fa822a637177dd42a95f4883ad3dec2a0ce6ac99fb0a9d", 196 | "sha256:7a7669ff50f41225ca5d6ee0a1ec8413f3a0d8aa2b109f86d540887b7ec0d72a", 197 | "sha256:7e9eac1e526386df7c70ef253b792a0a12dd86d833b1d329e038c7a235dfceb5", 198 | "sha256:7ee8af0b9f7de635c61cdd5b8534b76c52cd03536f29f51151b377f76e214a1a", 199 | "sha256:8246f30ca34dc712ab07e51dc34fea883c00b7ccb0e614651e49da2c49a30711", 200 | "sha256:8c88b599e226994ad4db29d93bc149aa1aff3dc3a4355dd5757569ba78632bdf", 201 | "sha256:923963e989ffbceaa210ac37afc9b906acebe945d2723e9679b643513837b089", 202 | "sha256:94d55bd03d8671686e3f012577d9caa5421a07286dd351dfef64791cf7c6c505", 203 | "sha256:97db258793d193c7b62d4e2586c6ed98d51086e93f9a3af2b2034af01450a74b", 204 | "sha256:a9d6bc8642e2c67db33f1247a77c53476f3a166e09067c0474facb045756087f", 205 | "sha256:cd11c7e8d21af997ee8079037fff88f16fda188a9776eb4b81c7e4c9c0a7d7fc", 206 | "sha256:d8d3d4713f0c28bdc6c806a278d998546e8efc3498949e3ace6e117462ac0a5e", 207 | "sha256:e0bfe9bb028974a481410432dbe1b182e8191d5d40382e5b8ff39cdd2e5c5931", 208 | "sha256:f4822c0660c3754f1a41a655e37cb4dbbc9be3d35b125a37fab6f82d47674ebc", 209 | "sha256:f83d281bb2a6217cd806f4cf0ddded436790e66f393e124dfe9731f6b3fb9afe", 210 | "sha256:fc37870d6716b137e80d19241d0e2cff7a7643b925dfa49b4c8ebd1295eb506e" 211 | ], 212 | "markers": "platform_python_implementation == 'CPython'", 213 | "version": "==4.6.2" 214 | }, 215 | "parsel": { 216 | "hashes": [ 217 | "sha256:70efef0b651a996cceebc69e55a85eb2233be0890959203ba7c3a03c72725c79", 218 | "sha256:9e1fa8db1c0b4a878bf34b35c043d89c9d1cbebc23b4d34dbc3c0ec33f2e087d" 219 | ], 220 | "version": "==1.6.0" 221 | }, 222 | "protego": { 223 | "hashes": [ 224 | "sha256:a682771bc7b51b2ff41466460896c1a5a653f9a1e71639ef365a72e66d8734b4" 225 | ], 226 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", 227 | "version": "==0.1.16" 228 | }, 229 | "pyasn1": { 230 | "hashes": [ 231 | "sha256:014c0e9976956a08139dc0712ae195324a75e142284d5f87f1a87ee1b068a359", 232 | "sha256:03840c999ba71680a131cfaee6fab142e1ed9bbd9c693e285cc6aca0d555e576", 233 | "sha256:0458773cfe65b153891ac249bcf1b5f8f320b7c2ce462151f8fa74de8934becf", 234 | "sha256:08c3c53b75eaa48d71cf8c710312316392ed40899cb34710d092e96745a358b7", 235 | "sha256:39c7e2ec30515947ff4e87fb6f456dfc6e84857d34be479c9d4a4ba4bf46aa5d", 236 | "sha256:5c9414dcfede6e441f7e8f81b43b34e834731003427e5b09e4e00e3172a10f00", 237 | "sha256:6e7545f1a61025a4e58bb336952c5061697da694db1cae97b116e9c46abcf7c8", 238 | "sha256:78fa6da68ed2727915c4767bb386ab32cdba863caa7dbe473eaae45f9959da86", 
239 | "sha256:7ab8a544af125fb704feadb008c99a88805126fb525280b2270bb25cc1d78a12", 240 | "sha256:99fcc3c8d804d1bc6d9a099921e39d827026409a58f2a720dcdb89374ea0c776", 241 | "sha256:aef77c9fb94a3ac588e87841208bdec464471d9871bd5050a287cc9a475cd0ba", 242 | "sha256:e89bf84b5437b532b0803ba5c9a5e054d21fec423a89952a74f87fa2c9b7bce2", 243 | "sha256:fec3e9d8e36808a28efb59b489e4528c10ad0f480e57dcc32b4de5c9d8c9fdf3" 244 | ], 245 | "version": "==0.4.8" 246 | }, 247 | "pyasn1-modules": { 248 | "hashes": [ 249 | "sha256:0845a5582f6a02bb3e1bde9ecfc4bfcae6ec3210dd270522fee602365430c3f8", 250 | "sha256:0fe1b68d1e486a1ed5473f1302bd991c1611d319bba158e98b106ff86e1d7199", 251 | "sha256:15b7c67fabc7fc240d87fb9aabf999cf82311a6d6fb2c70d00d3d0604878c811", 252 | "sha256:426edb7a5e8879f1ec54a1864f16b882c2837bfd06eee62f2c982315ee2473ed", 253 | "sha256:65cebbaffc913f4fe9e4808735c95ea22d7a7775646ab690518c056784bc21b4", 254 | "sha256:905f84c712230b2c592c19470d3ca8d552de726050d1d1716282a1f6146be65e", 255 | "sha256:a50b808ffeb97cb3601dd25981f6b016cbb3d31fbf57a8b8a87428e6158d0c74", 256 | "sha256:a99324196732f53093a84c4369c996713eb8c89d360a496b599fb1a9c47fc3eb", 257 | "sha256:b80486a6c77252ea3a3e9b1e360bc9cf28eaac41263d173c032581ad2f20fe45", 258 | "sha256:c29a5e5cc7a3f05926aff34e097e84f8589cd790ce0ed41b67aed6857b26aafd", 259 | "sha256:cbac4bc38d117f2a49aeedec4407d23e8866ea4ac27ff2cf7fb3e5b570df19e0", 260 | "sha256:f39edd8c4ecaa4556e989147ebf219227e2cd2e8a43c7e7fcb1f1c18c5fd6a3d", 261 | "sha256:fe0644d9ab041506b62782e92b06b8c68cca799e1a9636ec398675459e031405" 262 | ], 263 | "version": "==0.2.8" 264 | }, 265 | "pycparser": { 266 | "hashes": [ 267 | "sha256:2d475327684562c3a96cc71adf7dc8c4f0565175cf86b6d7a404ff4c771f15f0", 268 | "sha256:7582ad22678f0fcd81102833f60ef8d0e57288b6b5fb00323d101be910e35705" 269 | ], 270 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", 271 | "version": "==2.20" 272 | }, 273 | "pydispatcher": { 274 | "hashes": [ 275 | "sha256:5570069e1b1769af1fe481de6dd1d3a388492acddd2cdad7a3bde145615d5caf", 276 | "sha256:5be4a8be12805ef7d712dd9a93284fb8bc53f309867e573f653a72e5fd10e433" 277 | ], 278 | "version": "==2.0.5" 279 | }, 280 | "pyhamcrest": { 281 | "hashes": [ 282 | "sha256:412e00137858f04bde0729913874a48485665f2d36fe9ee449f26be864af9316", 283 | "sha256:7ead136e03655af85069b6f47b23eb7c3e5c221aa9f022a4fbb499f5b7308f29" 284 | ], 285 | "markers": "python_version >= '3.5'", 286 | "version": "==2.0.2" 287 | }, 288 | "pyopenssl": { 289 | "hashes": [ 290 | "sha256:4c231c759543ba02560fcd2480c48dcec4dae34c9da7d3747c508227e0624b51", 291 | "sha256:818ae18e06922c066f777a33f1fca45786d85edfe71cd043de6379337a7f274b" 292 | ], 293 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", 294 | "version": "==20.0.1" 295 | }, 296 | "queuelib": { 297 | "hashes": [ 298 | "sha256:42b413295551bdc24ed9376c1a2cd7d0b1b0fa4746b77b27ca2b797a276a1a17", 299 | "sha256:ff43b5b74b9266f8df4232a8f768dc4d67281a271905e2ed4a3689d4d304cd02" 300 | ], 301 | "version": "==1.5.0" 302 | }, 303 | "requests": { 304 | "hashes": [ 305 | "sha256:27973dd4a904a4f13b263a19c866c13b92a39ed1c964655f025f3f8d3d75b804", 306 | "sha256:c210084e36a42ae6b9219e00e48287def368a26d03a048ddad7bfee44f75871e" 307 | ], 308 | "index": "pypi", 309 | "version": "==2.25.1" 310 | }, 311 | "scrapy": { 312 | "hashes": [ 313 | "sha256:27621ab491706ec8cc41168cdbdff07e7fe8c344c8640e9e9faebd7cf84008e2", 314 | "sha256:68c48f01a58636bdf0f6fcd5035a19ecf277b58af24bd70c36dc6e556df3e005" 315 | ], 316 | "index": "pypi", 317 | 
"version": "==2.4.1" 318 | }, 319 | "service-identity": { 320 | "hashes": [ 321 | "sha256:001c0707759cb3de7e49c078a7c0c9cd12594161d3bf06b9c254fdcb1a60dc36", 322 | "sha256:0858a54aabc5b459d1aafa8a518ed2081a285087f349fe3e55197989232e2e2d" 323 | ], 324 | "version": "==18.1.0" 325 | }, 326 | "six": { 327 | "hashes": [ 328 | "sha256:30639c035cdb23534cd4aa2dd52c3bf48f06e5f4a941509c8bafd8ce11080259", 329 | "sha256:8b74bedcbbbaca38ff6d7491d76f2b06b3592611af620f8426e82dddb04a5ced" 330 | ], 331 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", 332 | "version": "==1.15.0" 333 | }, 334 | "twisted": { 335 | "hashes": [ 336 | "sha256:040eb6641125d2a9a09cf198ec7b83dd8858c6f51f6770325ed9959c00f5098f", 337 | "sha256:147780b8caf21ba2aef3688628eaf13d7e7fe02a86747cd54bfaf2140538f042", 338 | "sha256:158ddb80719a4813d292293ac44ba41d8b56555ed009d90994a278237ee63d2c", 339 | "sha256:2182000d6ffc05d269e6c03bfcec8b57e20259ca1086180edaedec3f1e689292", 340 | "sha256:25ffcf37944bdad4a99981bc74006d735a678d2b5c193781254fbbb6d69e3b22", 341 | "sha256:3281d9ce889f7b21bdb73658e887141aa45a102baf3b2320eafcfba954fcefec", 342 | "sha256:356e8d8dd3590e790e3dba4db139eb8a17aca64b46629c622e1b1597a4a92478", 343 | "sha256:70952c56e4965b9f53b180daecf20a9595cf22b8d0935cd3bd664c90273c3ab2", 344 | "sha256:7408c6635ee1b96587289283ebe90ee15dbf9614b05857b446055116bc822d29", 345 | "sha256:7c547fd0215db9da8a1bc23182b309e84a232364cc26d829e9ee196ce840b114", 346 | "sha256:894f6f3cfa57a15ea0d0714e4283913a5f2511dbd18653dd148eba53b3919797", 347 | "sha256:94ac3d55a58c90e2075c5fe1853f2aa3892b73e3bf56395f743aefde8605eeaa", 348 | "sha256:a58e61a2a01e5bcbe3b575c0099a2bcb8d70a75b1a087338e0c48dd6e01a5f15", 349 | "sha256:c09c47ff9750a8e3aa60ad169c4b95006d455a29b80ad0901f031a103b2991cd", 350 | "sha256:ca3a0b8c9110800e576d89b5337373e52018b41069bc879f12fa42b7eb2d0274", 351 | "sha256:cd1dc5c85b58494138a3917752b54bb1daa0045d234b7c132c37a61d5483ebad", 352 | "sha256:cdbc4c7f0cd7a2218b575844e970f05a1be1861c607b0e048c9bceca0c4d42f7", 353 | "sha256:d267125cc0f1e8a0eed6319ba4ac7477da9b78a535601c49ecd20c875576433a", 354 | "sha256:d72c55b5d56e176563b91d11952d13b01af8725c623e498db5507b6614fc1e10", 355 | "sha256:d95803193561a243cb0401b0567c6b7987d3f2a67046770e1dccd1c9e49a9780", 356 | "sha256:e92703bed0cc21d6cb5c61d66922b3b1564015ca8a51325bd164a5e33798d504", 357 | "sha256:f058bd0168271de4dcdc39845b52dd0a4a2fecf5f1246335f13f5e96eaebb467", 358 | "sha256:f3c19e5bd42bbe4bf345704ad7c326c74d3fd7a1b3844987853bef180be638d4" 359 | ], 360 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", 361 | "version": "==20.3.0" 362 | }, 363 | "urllib3": { 364 | "hashes": [ 365 | "sha256:1b465e494e3e0d8939b50680403e3aedaa2bc434b7d5af64dfd3c958d7f5ae80", 366 | "sha256:de3eedaad74a2683334e282005cd8d7f22f4d55fa690a2a1020a416cb0a47e73" 367 | ], 368 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'", 369 | "version": "==1.26.3" 370 | }, 371 | "w3lib": { 372 | "hashes": [ 373 | "sha256:0161d55537063e00d95a241663ede3395c4c6d7b777972ba2fd58bbab2001e53", 374 | "sha256:0ad6d0203157d61149fd45aaed2e24f53902989c32fc1dccc2e2bfba371560df" 375 | ], 376 | "version": "==1.22.0" 377 | }, 378 | "zope.interface": { 379 | "hashes": [ 380 | "sha256:05a97ba92c1c7c26f25c9f671aa1ef85ffead6cdad13770e5b689cf983adc7e1", 381 | "sha256:07d61722dd7d85547b7c6b0f5486b4338001fab349f2ac5cabc0b7182eb3425d", 382 | "sha256:0a990dcc97806e5980bbb54b2e46b9cde9e48932d8e6984daf71ef1745516123", 383 | 
"sha256:150e8bcb7253a34a4535aeea3de36c0bb3b1a6a47a183a95d65a194b3e07f232", 384 | "sha256:1743bcfe45af8846b775086471c28258f4c6e9ee8ef37484de4495f15a98b549", 385 | "sha256:1b5f6c8fff4ed32aa2dd43e84061bc8346f32d3ba6ad6e58f088fe109608f102", 386 | "sha256:21e49123f375703cf824214939d39df0af62c47d122d955b2a8d9153ea08cfd5", 387 | "sha256:21f579134a47083ffb5ddd1307f0405c91aa8b61ad4be6fd5af0171474fe0c45", 388 | "sha256:27c267dc38a0f0079e96a2945ee65786d38ef111e413c702fbaaacbab6361d00", 389 | "sha256:299bde0ab9e5c4a92f01a152b7fbabb460f31343f1416f9b7b983167ab1e33bc", 390 | "sha256:2ab88d8f228f803fcb8cb7d222c579d13dab2d3622c51e8cf321280da01102a7", 391 | "sha256:2ced4c35061eea623bc84c7711eedce8ecc3c2c51cd9c6afa6290df3bae9e104", 392 | "sha256:2dcab01c660983ba5e5a612e0c935141ccbee67d2e2e14b833e01c2354bd8034", 393 | "sha256:32546af61a9a9b141ca38d971aa6eb9800450fa6620ce6323cc30eec447861f3", 394 | "sha256:32b40a4c46d199827d79c86bb8cb88b1bbb764f127876f2cb6f3a47f63dbada3", 395 | "sha256:3cc94c69f6bd48ed86e8e24f358cb75095c8129827df1298518ab860115269a4", 396 | "sha256:42b278ac0989d6f5cf58d7e0828ea6b5951464e3cf2ff229dd09a96cb6ba0c86", 397 | "sha256:495b63fd0302f282ee6c1e6ea0f1c12cb3d1a49c8292d27287f01845ff252a96", 398 | "sha256:4af87cdc0d4b14e600e6d3d09793dce3b7171348a094ba818e2a68ae7ee67546", 399 | "sha256:4b94df9f2fdde7b9314321bab8448e6ad5a23b80542dcab53e329527d4099dcb", 400 | "sha256:4c48ddb63e2b20fba4c6a2bf81b4d49e99b6d4587fb67a6cd33a2c1f003af3e3", 401 | "sha256:4df9afd17bd5477e9f8c8b6bb8507e18dd0f8b4efe73bb99729ff203279e9e3b", 402 | "sha256:518950fe6a5d56f94ba125107895f938a4f34f704c658986eae8255edb41163b", 403 | "sha256:538298e4e113ccb8b41658d5a4b605bebe75e46a30ceca22a5a289cf02c80bec", 404 | "sha256:55465121e72e208a7b69b53de791402affe6165083b2ea71b892728bd19ba9ae", 405 | "sha256:588384d70a0f19b47409cfdb10e0c27c20e4293b74fc891df3d8eb47782b8b3e", 406 | "sha256:6278c080d4afffc9016e14325f8734456831124e8c12caa754fd544435c08386", 407 | "sha256:64ea6c221aeee4796860405e1aedec63424cda4202a7ad27a5066876db5b0fd2", 408 | "sha256:681dbb33e2b40262b33fd383bae63c36d33fd79fa1a8e4092945430744ffd34a", 409 | "sha256:6936aa9da390402d646a32a6a38d5409c2d2afb2950f045a7d02ab25a4e7d08d", 410 | "sha256:778d0ec38bbd288b150a3ae363c8ffd88d2207a756842495e9bffd8a8afbc89a", 411 | "sha256:8251f06a77985a2729a8bdbefbae79ee78567dddc3acbd499b87e705ca59fe24", 412 | "sha256:83b4aa5344cce005a9cff5d0321b2e318e871cc1dfc793b66c32dd4f59e9770d", 413 | "sha256:844fad925ac5c2ad4faaceb3b2520ad016b5280105c6e16e79838cf951903a7b", 414 | "sha256:8ceb3667dd13b8133f2e4d637b5b00f240f066448e2aa89a41f4c2d78a26ce50", 415 | "sha256:92dc0fb79675882d0b6138be4bf0cec7ea7c7eede60aaca78303d8e8dbdaa523", 416 | "sha256:9789bd945e9f5bd026ed3f5b453d640befb8b1fc33a779c1fe8d3eb21fe3fb4a", 417 | "sha256:a2b6d6eb693bc2fc6c484f2e5d93bd0b0da803fa77bf974f160533e555e4d095", 418 | "sha256:aab9f1e34d810feb00bf841993552b8fcc6ae71d473c505381627143d0018a6a", 419 | "sha256:abb61afd84f23099ac6099d804cdba9bd3b902aaaded3ffff47e490b0a495520", 420 | "sha256:adf9ee115ae8ff8b6da4b854b4152f253b390ba64407a22d75456fe07dcbda65", 421 | "sha256:aedc6c672b351afe6dfe17ff83ee5e7eb6ed44718f879a9328a68bdb20b57e11", 422 | "sha256:b7a00ecb1434f8183395fac5366a21ee73d14900082ca37cf74993cf46baa56c", 423 | "sha256:ba32f4a91c1cb7314c429b03afbf87b1fff4fb1c8db32260e7310104bd77f0c7", 424 | "sha256:cbd0f2cbd8689861209cd89141371d3a22a11613304d1f0736492590aa0ab332", 425 | "sha256:e4bc372b953bf6cec65a8d48482ba574f6e051621d157cf224227dbb55486b1e", 426 | 
"sha256:eccac3d9aadc68e994b6d228cb0c8919fc47a5350d85a1b4d3d81d1e98baf40c", 427 | "sha256:efd550b3da28195746bb43bd1d815058181a7ca6d9d6aa89dd37f5eefe2cacb7", 428 | "sha256:efef581c8ba4d990770875e1a2218e856849d32ada2680e53aebc5d154a17e20", 429 | "sha256:f057897711a630a0b7a6a03f1acf379b6ba25d37dc5dc217a97191984ba7f2fc", 430 | "sha256:f37d45fab14ffef9d33a0dc3bc59ce0c5313e2253323312d47739192da94f5fd", 431 | "sha256:f44906f70205d456d503105023041f1e63aece7623b31c390a0103db4de17537" 432 | ], 433 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", 434 | "version": "==5.2.0" 435 | } 436 | }, 437 | "develop": { 438 | "certifi": { 439 | "hashes": [ 440 | "sha256:1a4995114262bffbc2413b159f2a1a480c969de6e6eb13ee966d470af86af59c", 441 | "sha256:719a74fb9e33b9bd44cc7f3a8d94bc35e4049deebe19ba7d8e108280cfd59830" 442 | ], 443 | "version": "==2020.12.5" 444 | }, 445 | "chardet": { 446 | "hashes": [ 447 | "sha256:0d6f53a15db4120f2b08c94f11e7d93d2c911ee118b6b30a04ec3ee8310179fa", 448 | "sha256:f864054d66fd9118f2e67044ac8981a54775ec5b67aed0441892edb553d21da5" 449 | ], 450 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", 451 | "version": "==4.0.0" 452 | }, 453 | "click": { 454 | "hashes": [ 455 | "sha256:2335065e6395b9e67ca716de5f7526736bfa6ceead690adf616d925bdc622b13", 456 | "sha256:5b94b49521f6456670fdb30cd82a4eca9412788a93fa6dd6df72c94d5a8ff2d7" 457 | ], 458 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", 459 | "version": "==7.0" 460 | }, 461 | "docker": { 462 | "hashes": [ 463 | "sha256:0604a74719d5d2de438753934b755bfcda6f62f49b8e4b30969a4b0a2a8a1220", 464 | "sha256:e455fa49aabd4f22da9f4e1c1f9d16308286adc60abaf64bf3e1feafaed81d06" 465 | ], 466 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", 467 | "version": "==4.4.1" 468 | }, 469 | "idna": { 470 | "hashes": [ 471 | "sha256:b307872f855b18632ce0c21c5e45be78c0ea7ae4c15c828c20788b26921eb3f6", 472 | "sha256:b97d804b1e9b523befed77c48dacec60e6dcb0b5391d57af6a65a312a90648c0" 473 | ], 474 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", 475 | "version": "==2.10" 476 | }, 477 | "pyyaml": { 478 | "hashes": [ 479 | "sha256:08682f6b72c722394747bddaf0aa62277e02557c0fd1c42cb853016a38f8dedf", 480 | "sha256:0f5f5786c0e09baddcd8b4b45f20a7b5d61a7e7e99846e3c799b05c7c53fa696", 481 | "sha256:129def1b7c1bf22faffd67b8f3724645203b79d8f4cc81f674654d9902cb4393", 482 | "sha256:294db365efa064d00b8d1ef65d8ea2c3426ac366c0c4368d930bf1c5fb497f77", 483 | "sha256:3b2b1824fe7112845700f815ff6a489360226a5609b96ec2190a45e62a9fc922", 484 | "sha256:3bd0e463264cf257d1ffd2e40223b197271046d09dadf73a0fe82b9c1fc385a5", 485 | "sha256:4465124ef1b18d9ace298060f4eccc64b0850899ac4ac53294547536533800c8", 486 | "sha256:49d4cdd9065b9b6e206d0595fee27a96b5dd22618e7520c33204a4a3239d5b10", 487 | "sha256:4e0583d24c881e14342eaf4ec5fbc97f934b999a6828693a99157fde912540cc", 488 | "sha256:5accb17103e43963b80e6f837831f38d314a0495500067cb25afab2e8d7a4018", 489 | "sha256:607774cbba28732bfa802b54baa7484215f530991055bb562efbed5b2f20a45e", 490 | "sha256:6c78645d400265a062508ae399b60b8c167bf003db364ecb26dcab2bda048253", 491 | "sha256:74c1485f7707cf707a7aef42ef6322b8f97921bd89be2ab6317fd782c2d53183", 492 | "sha256:8c1be557ee92a20f184922c7b6424e8ab6691788e6d86137c5d93c1a6ec1b8fb", 493 | "sha256:bb4191dfc9306777bc594117aee052446b3fa88737cd13b7188d0e7aa8162185", 494 | 
"sha256:c20cfa2d49991c8b4147af39859b167664f2ad4561704ee74c1de03318e898db", 495 | "sha256:d2d9808ea7b4af864f35ea216be506ecec180628aced0704e34aca0b040ffe46", 496 | "sha256:dd5de0646207f053eb0d6c74ae45ba98c3395a571a2891858e87df7c9b9bd51b", 497 | "sha256:e1d4970ea66be07ae37a3c2e48b5ec63f7ba6804bdddfdbd3cfd954d25a82e63", 498 | "sha256:e4fac90784481d221a8e4b1162afa7c47ed953be40d31ab4629ae917510051df", 499 | "sha256:fa5ae20527d8e831e8230cbffd9f8fe952815b2b7dae6ffec25318803a7528fc" 500 | ], 501 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'", 502 | "version": "==5.4.1" 503 | }, 504 | "requests": { 505 | "hashes": [ 506 | "sha256:27973dd4a904a4f13b263a19c866c13b92a39ed1c964655f025f3f8d3d75b804", 507 | "sha256:c210084e36a42ae6b9219e00e48287def368a26d03a048ddad7bfee44f75871e" 508 | ], 509 | "index": "pypi", 510 | "version": "==2.25.1" 511 | }, 512 | "retrying": { 513 | "hashes": [ 514 | "sha256:08c039560a6da2fe4f2c426d0766e284d3b736e355f8dd24b37367b0bb41973b" 515 | ], 516 | "version": "==1.3.3" 517 | }, 518 | "scrapinghub": { 519 | "hashes": [ 520 | "sha256:9dbe4ebac719cf4ec065a4748cc513a6f9cc4190ef31b98ea24deb63992a9eff", 521 | "sha256:cffdfdb233af30f1e5e45d9c3e5e52b1e5d88e13c6c76c66ccf2f5231b601dbf" 522 | ], 523 | "version": "==2.3.1" 524 | }, 525 | "shub": { 526 | "hashes": [ 527 | "sha256:6ba0c49fe22129b1573e877e9632b3ac34cb593635b8bdababf5caaef34a0727", 528 | "sha256:b653360bc1eda496345548faa384e5c93abbeae9f1426e7ba82d017cd2b6e5ec" 529 | ], 530 | "index": "pypi", 531 | "version": "==2.13.0" 532 | }, 533 | "six": { 534 | "hashes": [ 535 | "sha256:30639c035cdb23534cd4aa2dd52c3bf48f06e5f4a941509c8bafd8ce11080259", 536 | "sha256:8b74bedcbbbaca38ff6d7491d76f2b06b3592611af620f8426e82dddb04a5ced" 537 | ], 538 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", 539 | "version": "==1.15.0" 540 | }, 541 | "toml": { 542 | "hashes": [ 543 | "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b", 544 | "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f" 545 | ], 546 | "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'", 547 | "version": "==0.10.2" 548 | }, 549 | "tqdm": { 550 | "hashes": [ 551 | "sha256:556c55b081bd9aa746d34125d024b73f0e2a0e62d5927ff0e400e20ee0a03b9a", 552 | "sha256:b8b46036fd00176d0870307123ef06bb851096964fa7fc578d789f90ce82c3e4" 553 | ], 554 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", 555 | "version": "==4.55.1" 556 | }, 557 | "urllib3": { 558 | "hashes": [ 559 | "sha256:1b465e494e3e0d8939b50680403e3aedaa2bc434b7d5af64dfd3c958d7f5ae80", 560 | "sha256:de3eedaad74a2683334e282005cd8d7f22f4d55fa690a2a1020a416cb0a47e73" 561 | ], 562 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'", 563 | "version": "==1.26.3" 564 | }, 565 | "websocket-client": { 566 | "hashes": [ 567 | "sha256:0fc45c961324d79c781bab301359d5a1b00b13ad1b10415a4780229ef71a5549", 568 | "sha256:d735b91d6d1692a6a181f2a8c9e0238e5f6373356f561bb9dc4c7af36f452010" 569 | ], 570 | "version": "==0.57.0" 571 | } 572 | } 573 | } 574 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # scrapy-jav 2 | JAV site scrapers 3 | - DMM 4 | -------------------------------------------------------------------------------- /jav/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/randName/scrapy-jav/46ef4fcb4acd0a4ed6b7462dc944281c7599f77f/jav/__init__.py -------------------------------------------------------------------------------- /jav/exporters.py: -------------------------------------------------------------------------------- 1 | from scrapy.exporters import BaseItemExporter 2 | from scrapy.utils.python import to_bytes 3 | 4 | 5 | class UrlExporter(BaseItemExporter): 6 | 7 | def __init__(self, file, **kwargs): 8 | self._configure(kwargs, dont_fail=True) 9 | self.file = file 10 | 11 | def export_item(self, item): 12 | url = item.get('url') 13 | if not url or not isinstance(url, str): 14 | return 15 | self.file.write(to_bytes(url + '\n')) 16 | -------------------------------------------------------------------------------- /jav/items.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | 3 | from scrapy import Field, Item 4 | from scrapy.loader import ItemLoader 5 | from itemloaders.processors import MapCompose, Compose, TakeFirst 6 | 7 | 8 | def filter_empty(urls): 9 | for url in urls: 10 | if not url or url.startswith('#'): 11 | continue 12 | yield url 13 | 14 | 15 | class Unique(Compose): 16 | 17 | def __init__(self, **kw): 18 | super().__init__(**kw) 19 | self.functions = (set, sorted) 20 | 21 | 22 | class Number: 23 | 24 | def __init__(self, t=float): 25 | self.t = t 26 | 27 | def __call__(self, values, loader_context=None): 28 | for v in values: 29 | try: 30 | yield self.t(v) 31 | except ValueError: 32 | yield 0 33 | 34 | 35 | class URLField(Field): 36 | 37 | def __init__(self, multi=False): 38 | self['input_processor'] = filter_empty 39 | self['output_processor'] = Unique() if multi else TakeFirst() 40 | 41 | 42 | class StringField(Field): 43 | 44 | def __init__(self): 45 | self['input_processor'] = MapCompose(str.strip) 46 | self['output_processor'] = TakeFirst() 47 | 48 | 49 | class ArticleField(Field): 50 | 51 | def __init__(self, parse=None): 52 | if parse: 53 | self['input_processor'] = parse 54 | self['output_processor'] = Unique() 55 | 56 | 57 | class NumberField(Field): 58 | 59 | def __init__(self, t=float): 60 | self['input_processor'] = Number(t) 61 | self['output_processor'] = TakeFirst() 62 | 63 | 64 | class Video(Item): 65 | fields = defaultdict(StringField) 66 | fields.update({ 67 | 'image': URLField(), 68 | 'title': StringField(), 69 | 'gallery': URLField(multi=True), 70 | 'related': URLField(multi=True), 71 | 'articles': URLField(multi=True), 72 | }) 73 | 74 | 75 | class JAVLoader(ItemLoader): 76 | 77 | default_item_class = Video 78 | 79 | def __init__(self, xpaths=None, **kw): 80 | super().__init__(**kw) 81 | 82 | if xpaths is not None: 83 | for k, xp in xpaths.items(): 84 | if isinstance(xp, (tuple, list)): 85 | for v in xp: 86 | self.add_xpath(k, v) 87 | else: 88 | self.add_xpath(k, xp) 89 | 90 | def nested(self, **context): 91 | return self.__class__(item=self.item, parent=self, **context) 92 | -------------------------------------------------------------------------------- /jav/middlewares.py: -------------------------------------------------------------------------------- 1 | from scrapy.exceptions import IgnoreRequest 2 | from scrapy.downloadermiddlewares.retry import RetryMiddleware 3 | from scrapy.spidermiddlewares.httperror import HttpErrorMiddleware 4 | 5 | 6 | class XPathRetryMiddleware(RetryMiddleware): 7 | """Middleware to retry a request 
if a specified xpath is not present""" 8 | 9 | max_retry_times = 2 10 | priority_adjust = 0 11 | 12 | def __init__(self, settings): 13 | pass 14 | 15 | def process_response(self, request, response, spider): 16 | xp = getattr(spider, 'retry_xpath', None) 17 | if xp is None: 18 | return response 19 | 20 | if response.url.endswith('robots.txt'): 21 | return response 22 | 23 | if response.status == 200 and not response.xpath(xp): 24 | reason = 'could not find xpath "{}"'.format(xp) 25 | return self._retry(request, reason, spider) or response 26 | 27 | return response 28 | 29 | 30 | class NotFound(IgnoreRequest): 31 | pass 32 | 33 | 34 | class NotFoundMiddleware(HttpErrorMiddleware): 35 | 36 | def __init__(self, settings): 37 | pass 38 | 39 | def process_spider_input(self, response, spider): 40 | if response.status == 404: 41 | raise NotFound('Page not found') 42 | 43 | def process_spider_exception(self, response, exception, spider): 44 | if isinstance(exception, NotFound): 45 | return () 46 | -------------------------------------------------------------------------------- /jav/pipelines.py: -------------------------------------------------------------------------------- 1 | from os import makedirs 2 | from os.path import exists, dirname 3 | 4 | from scrapy.exporters import JsonLinesItemExporter 5 | 6 | 7 | def merge(fn, new): 8 | from json import load 9 | with open(fn) as f: 10 | old = load(f) 11 | 12 | seqs = (list, tuple, set) 13 | 14 | for k, v in new.items(): 15 | if k in old: 16 | oldv = old[k] 17 | if v == oldv: 18 | continue 19 | elif type(oldv) in seqs and isinstance(v, set): 20 | v.update(set(oldv)) 21 | old[k] = v 22 | 23 | return old 24 | 25 | 26 | class JsonWriterPipeline(object): 27 | """Pipeline to save scraped items into JSON files.""" 28 | 29 | dump_config = { 30 | 'sort_keys': True, 31 | 'ensure_ascii': False, 32 | } 33 | 34 | def open_spider(self, spider): 35 | try: 36 | ow = int(spider.settings.get('JSON_OVERWRITE', 0)) 37 | except ValueError: 38 | ow = 0 39 | self.overwrite = ow 40 | 41 | self.json_filename = getattr(spider, 'json_filename', '') 42 | 43 | self.out = spider.settings.get('JSON_DIR') 44 | if self.out: 45 | spider.logger.info("Writing files to %s" % self.out) 46 | 47 | def process_item(self, item, spider): 48 | try: 49 | jsfn = self.json_filename.format(**item) 50 | except KeyError: 51 | return item 52 | 53 | if not jsfn: 54 | return item 55 | 56 | if self.out is None: 57 | return item 58 | 59 | fn = '%s/%s' % (self.out, jsfn) 60 | 61 | if exists(fn): 62 | if not self.overwrite: 63 | return item 64 | if self.overwrite == 1: 65 | item = merge(fn, item) 66 | 67 | for k, v in item.items(): 68 | if isinstance(v, set): 69 | item[k] = sorted(v) 70 | 71 | makedirs(dirname(fn), exist_ok=True) 72 | 73 | with open(fn, 'wb') as f: 74 | exporter = JsonLinesItemExporter(f, **self.dump_config) 75 | exporter.export_item(item) 76 | 77 | item.pop('url', None) 78 | return item 79 | -------------------------------------------------------------------------------- /jav/settings.py: -------------------------------------------------------------------------------- 1 | BOT_NAME = 'scrapy-jav' 2 | 3 | SPIDER_MODULES = ( 4 | 'jav.sites.dmm.spiders', 5 | 'jav.sites.ave.spiders', 6 | ) 7 | 8 | LOG_LEVEL = 'INFO' 9 | 10 | USER_AGENT = 'scrapy-jav/0.5' 11 | ROBOTSTXT_OBEY = True 12 | COOKIES_ENABLED = False 13 | 14 | TELNETCONSOLE_ENABLED = False 15 | 16 | EXTENSIONS = { 17 | 'scrapy.extensions.telnet.TelnetConsole': None, 18 | } 19 | 20 | FEED_EXPORTERS = { 21 | 'url': 
'jav.exporters.UrlExporter', 22 | } 23 | 24 | SPIDER_MIDDLEWARES = { 25 | 'jav.middlewares.NotFoundMiddleware': 75, 26 | } 27 | 28 | DOWNLOADER_MIDDLEWARES = { 29 | 'jav.middlewares.XPathRetryMiddleware': 540, 30 | } 31 | -------------------------------------------------------------------------------- /jav/sites/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/randName/scrapy-jav/46ef4fcb4acd0a4ed6b7462dc944281c7599f77f/jav/sites/__init__.py -------------------------------------------------------------------------------- /jav/sites/ave/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/randName/scrapy-jav/46ef4fcb4acd0a4ed6b7462dc944281c7599f77f/jav/sites/ave/__init__.py -------------------------------------------------------------------------------- /jav/sites/ave/article.py: -------------------------------------------------------------------------------- 1 | from jav.utils import parse_url, get_key 2 | 3 | url_formats = { 4 | 'DVD': { 5 | 'video': ('product', 'product_id'), 6 | 'studio': ('studio', 'studioid'), 7 | 'series': ('series', 'seriesid'), 8 | 'subdept': ('subdept', 'subdept_id'), 9 | 'actress': ('actress', 'actressname'), 10 | }, 11 | 'PPV': { 12 | 'video': ('ppv/new', 'proid'), 13 | 'subdept': ('ppv/dept', 'cat_id'), 14 | 'studio': ('ppv/ppv_studio', 'studioid'), 15 | 'series': ('ppv/ppv_series', 'seriesid'), 16 | 'actress': ('ppv/ppv_actress', 'actressname'), 17 | }, 18 | } 19 | 20 | 21 | def parse_ave_url(url): 22 | 23 | def get_parts(p): 24 | shop = 'PPV' if p.startswith('ppv') else 'DVD' 25 | 26 | for t, a in url_formats[shop].items(): 27 | if p.startswith(a[0]): 28 | return shop, t, a[1] 29 | 30 | return None, None, None 31 | 32 | path, query = parse_url(url.lower()) 33 | shop, t, idk = get_parts(path[1:]) 34 | 35 | if t is None: 36 | return {} 37 | 38 | _id = get_key(query, idk) 39 | 40 | try: 41 | _id = int(_id) 42 | except ValueError: 43 | _id = _id.replace(' ', '-') 44 | except TypeError: 45 | return {} 46 | 47 | return { 48 | 'base': t, 49 | 'id': _id, 50 | 'shop': shop, 51 | } 52 | 53 | 54 | def get_article(url, **article): 55 | a = parse_ave_url(url) 56 | 57 | if a.get('base', 'video') == 'video': 58 | return None 59 | 60 | article['article'] = a.pop('base') 61 | article.update(a) 62 | 63 | return article 64 | 65 | 66 | def save_article(urls): 67 | for url in urls: 68 | a = get_article(url) 69 | if a: 70 | yield '{article}/{id}'.format(**a) 71 | 72 | 73 | def parse_article(response): 74 | item = response.meta.get('article') or get_article(response.url) 75 | if item is None: 76 | return 77 | 78 | name = response.xpath('//h3[@class="block"]/a/text()').get() 79 | item.setdefault('name', name) 80 | 81 | return item 82 | -------------------------------------------------------------------------------- /jav/sites/ave/constants.py: -------------------------------------------------------------------------------- 1 | PAGEN = '(//div[@class="pagination"])[1]//a' 2 | 3 | ARTICLE_LABELS = ( 4 | 'スタジオ', 5 | 'シリーズ', 6 | '女優名', 7 | '主演女優', 8 | 'カテゴリー', 9 | 'カテゴリ一覧', 10 | ) 11 | -------------------------------------------------------------------------------- /jav/sites/ave/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 
| # your spiders. 5 | -------------------------------------------------------------------------------- /jav/sites/ave/spiders/article_spider.py: -------------------------------------------------------------------------------- 1 | from jav.utils import extract_a 2 | from jav.spiders import JAVSpider 3 | 4 | from ..article import get_article, parse_article 5 | 6 | 7 | class ArticleSpider(JAVSpider): 8 | name = 'ave.article' 9 | 10 | def export_items(self, response): 11 | yield parse_article(response) 12 | 13 | 14 | class StudioSpider(JAVSpider): 15 | name = 'ave.studios' 16 | 17 | start_urls = ( 18 | 'http://aventertainments.com/studiolists.aspx?Dept_ID=29', 19 | 'http://aventertainments.com/ppv/ppv_studiolists.aspx', 20 | ) 21 | 22 | def export_items(self, response): 23 | for cell in response.xpath('//td[@class="table-border"]'): 24 | url, t = next(extract_a(cell)) 25 | 26 | m = get_article(url, name=t) 27 | if m is None: 28 | continue 29 | 30 | m['url'] = response.urljoin(url) 31 | 32 | img = cell.xpath('.//img/@src').get() 33 | if img: 34 | m['image'] = img 35 | 36 | yield m 37 | 38 | 39 | class SubdeptSpider(JAVSpider): 40 | name = 'ave.subdept' 41 | 42 | start_urls = ( 43 | 'https://www.aventertainments.com/categorylists.aspx?Dept_ID=29', 44 | 'https://www.aventertainments.com/ppv/ppv_categorylists.aspx', 45 | ) 46 | 47 | def export_items(self, response): 48 | for section in response.xpath('//div[@class="row2"]'): 49 | sname = section.xpath('h1/text()').get() 50 | 51 | for url, t in extract_a(section): 52 | if url.startswith('#'): 53 | continue 54 | 55 | item = get_article(url, name=t, category=sname) 56 | if item: 57 | item['url'] = response.urljoin(url) 58 | yield item 59 | -------------------------------------------------------------------------------- /jav/sites/ave/spiders/video_spider.py: -------------------------------------------------------------------------------- 1 | from jav.spiders import JAVSpider 2 | from jav.spiders.list_spider import UrlListSpider 3 | 4 | from ..video import parse_video 5 | from ..constants import PAGEN 6 | 7 | 8 | class VideoSpider(JAVSpider): 9 | name = 'ave.video' 10 | 11 | retry_xpath = '//h2' 12 | 13 | def export_items(self, response): 14 | yield parse_video(response) 15 | 16 | 17 | class ListSpider(UrlListSpider): 18 | name = 'ave.list' 19 | 20 | pagination_xpath = PAGEN 21 | 22 | def export_items(self, response): 23 | yield parse_video(response) 24 | 25 | def get_list(self, response): 26 | yield from response.xpath('//td/a[1]/@href').getall() 27 | -------------------------------------------------------------------------------- /jav/sites/ave/video.py: -------------------------------------------------------------------------------- 1 | from jav.items import JAVLoader 2 | 3 | vid_re = r'.*: (.*)' 4 | cov_re = r".*url\('(.*)'\).*" 5 | info_xp = '//div[@class="main-subcontent-page"]/div[1]/ul/li' 6 | 7 | text_labels = { 8 | '発売日': 'date', 9 | '配信日': 'date', 10 | '収録時間': 'runtime', 11 | } 12 | 13 | xpaths = { 14 | 'title': '//h2/text()', 15 | 'related': '//div[@id="mini-tabs"]//a/@href', 16 | 'text': ( 17 | '//div[@class="border"]/p/text()', 18 | '//ul[@class="review"]/li[1]/text()', 19 | ), 20 | 'gallery': ( 21 | '//a[@href="#title"]/img/@src', 22 | '//ul[contains(@class,"thumbs")]/li/a/@href', 23 | ), 24 | 'articles': '%s/a/@href' % info_xp, 25 | } 26 | 27 | 28 | def get_info(li): 29 | for text in li.xpath('(.|span)/text()').getall(): 30 | t = text.strip() 31 | if not t: 32 | continue 33 | yield t 34 | 35 | 36 | def parse_video(response): 37 | v = 
JAVLoader(response=response, xpaths=xpaths) 38 | v.add_xpath('vid', '//div[@class="top-title"]/text()', re=vid_re) 39 | v.add_xpath('image', '//div[@class="top_sample"]/style', re=cov_re) 40 | 41 | for li in response.xpath(info_xp): 42 | info = tuple(get_info(li)) 43 | if not info: 44 | continue 45 | label = text_labels.get(info[0][:-1]) 46 | if label: 47 | v.add_value(label, info[1]) 48 | 49 | return v.load_item() 50 | -------------------------------------------------------------------------------- /jav/sites/dmm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/randName/scrapy-jav/46ef4fcb4acd0a4ed6b7462dc944281c7599f77f/jav/sites/dmm/__init__.py -------------------------------------------------------------------------------- /jav/sites/dmm/article.py: -------------------------------------------------------------------------------- 1 | performer_re = { 2 | 'actress': r'.*\xa0(.+?)(?:[（(](.+?)[）)])?(?:\(([^)]+?)\))?$', 3 | 'histrion': r'.*\xa0(.+?)(?:（(.+?)）)?(?:\(([^)]+?)\))?$', 4 | } 5 | 6 | 7 | def get_article(url, **article): 8 | u = url.split('/')[:-1] 9 | try: 10 | article['service'], article['shop'] = u[-7:-5] 11 | article['article'], aid = (v.split('=')[1] for v in u[-2:]) 12 | article['id'] = int(aid) 13 | except (ValueError, IndexError): 14 | return None 15 | 16 | return article 17 | 18 | 19 | def save_article(urls): 20 | for url in urls: 21 | a = get_article(url) 22 | if a: 23 | yield '{article}/{id}'.format(**a) 24 | 25 | 26 | def parse_article(response): 27 | item = response.meta.get('article') or get_article(response.url) 28 | if item is None: 29 | return 30 | 31 | span = response.xpath('string(//p[@class="headwithelem"]/span)') 32 | 33 | kana = None 34 | alias = None 35 | article = item['article'] 36 | 37 | if article in performer_re: 38 | name, alias, kana = span.re(performer_re[article]) 39 | elif article == 'director': 40 | name, kana = span.re(r'.*\xa0(.+?)(?:\(([^)]+?)\))?$') 41 | else: 42 | name = span.re_first(r'.*\xa0(.+)$') 43 | 44 | assert name == item.setdefault('name', name) 45 | 46 | if alias: 47 | item.setdefault('alias', alias) 48 | 49 | if kana: 50 | item.setdefault('kana', kana) 51 | 52 | return item 53 | -------------------------------------------------------------------------------- /jav/sites/dmm/constants.py: -------------------------------------------------------------------------------- 1 | DOMAIN = 'http://www.dmm.co.jp' 2 | 3 | REALMS = ( 4 | {'service': 'mono', 'shop': 'dvd'}, 5 | {'service': 'digital', 'shop': 'videoa'} 6 | ) 7 | 8 | 9 | def get_date_urls(): 10 | base = ( 11 | 'calendar/=/year={0:%Y}/month={0:%m}/day={0:%d}-{1:%d}', 12 | 'delivery-list/=/delivery_date={0:%Y-%m-%d}' 13 | ) 14 | for r, b in zip(REALMS, base): 15 | yield '{0}/{service}/{shop}/-/{1}'.format(DOMAIN, b, **r) 16 | 17 | 18 | DATE_URLS = tuple(get_date_urls()) 19 | 20 | DATE_MIN = (2001, 3, 1) 21 | 22 | AIUEO = '//table[@class="menu_aiueo"]//a' 23 | 24 | PAGEN = '(//div[contains(@class,"pagenation")])[1]/ul/li/a' 25 | 26 | RELATED = '%s/{0}/{1}/-/detail/=/cid={2}/' % DOMAIN 27 | 28 | MUTUALS = '/misc/-/mutual-link/ajax-index/=/cid={0}/service={1}/shop={2}/' 29 | 30 | ARTICLES = '%s/{service}/{shop}/-/list/=/article={article}/id={id}/' % DOMAIN 31 | 32 | ARTICLE_LABELS = ( 33 | 'ジャンル', 34 | 'シリーズ', 35 | 'メーカー', 36 | 'レーベル', 37 | '出演者', 38 | '監督', 39 | ) 40 | -------------------------------------------------------------------------------- /jav/sites/dmm/spiders/__init__.py: 
-------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /jav/sites/dmm/spiders/actress_spider.py: -------------------------------------------------------------------------------- 1 | from jav.spiders import JAVSpider 2 | 3 | from ..constants import AIUEO, PAGEN 4 | from ..article import get_article, parse_article 5 | 6 | name_re = r'(.+?)(?:[（(](.+?)[）)]?)?(?:（.+?）)?$' 7 | table_pagen = '//td[@class="header"]/following-sibling::td/a' 8 | works_xpath = '//td[@class="t_works1"]/../following-sibling::tr' 9 | 10 | 11 | def get_actress(url): 12 | try: 13 | return { 14 | 'article': 'actress', 15 | 'id': int(url[:-1].split('=')[2]), 16 | } 17 | except (IndexError, ValueError): 18 | return None 19 | 20 | 21 | def details(response): 22 | for row in response.xpath('//table[@class="w100"]//table/tr'): 23 | label, info = row.xpath('td/text()').getall() 24 | if info != '----': 25 | yield label[:-1], info 26 | 27 | 28 | def parse_actress(response): 29 | init = response.meta.get(0) 30 | page = response.meta.get('page') 31 | 32 | if page and not init: 33 | for row in response.xpath(works_xpath): 34 | works = row.xpath('td/a/@href') 35 | if not works: 36 | continue 37 | url, *related = works.getall() 38 | yield {'url': url, 'related': related} 39 | elif init and not page: 40 | item = response.meta.get('article') or get_actress(response.url) 41 | item.update(details(response)) 42 | yield item 43 | 44 | 45 | def actresses(response): 46 | if 'actress.dmm' in response.url: 47 | parse_url = get_actress 48 | main_xp = '//tr[@class="list"]' 49 | else: 50 | parse_url = get_article 51 | main_xp = '//div[contains(@class,"act-box")]/ul/li/a' 52 | 53 | for act in response.xpath(main_xp): 54 | url = act.xpath('(td/a|.)/@href').get().split('sort')[0] 55 | 56 | a = parse_url(url) 57 | if a is None: 58 | continue 59 | 60 | image = act.xpath('(td/a|.)/img/@src').get() 61 | a['name'], alias = act.xpath('(td[2]/a|.)/text()').re(name_re) 62 | a['url'] = response.urljoin(url) 63 | 64 | if image: 65 | a['image'] = image 66 | 67 | if alias: 68 | a['alias'] = alias 69 | 70 | try: 71 | a['kana'], ak = act.xpath('(td[3]|span[1])/text()').re(name_re) 72 | if ak: 73 | a['alias_kana'] = ak 74 | except ValueError: 75 | pass 76 | 77 | yield a 78 | 79 | 80 | class ActressSpider(JAVSpider): 81 | name = 'dmm.actress' 82 | 83 | deep = False 84 | 85 | start_urls = ( 86 | 'http://www.dmm.co.jp/digital/videoa/-/actress/=/keyword=nn/', 87 | 'http://www.dmm.co.jp/mono/dvd/-/actress/=/keyword=nn/', 88 | 'http://actress.dmm.co.jp/-/top/', 89 | ) 90 | 91 | pagination_xpath = '(%s|%s)' % (PAGEN, table_pagen) 92 | 93 | def parse_item(self, response): 94 | page = response.meta.get('page') 95 | 96 | if not page and not response.meta.get(0): 97 | yield from self.links(response, AIUEO, follow=True, meta={0: 1}) 98 | 99 | if response.meta.get('article'): 100 | if 'actress.dmm' in response.url: 101 | export = parse_actress(response) 102 | else: 103 | export = (parse_article(response),) 104 | response.meta['export'] = export 105 | 106 | elif self.deep: 107 | for a in actresses(response): 108 | yield response.follow(a['url'], meta={'article': a, 0: 1}) 109 | 110 | elif page != 1: 111 | response.meta['export'] = actresses(response) 112 | 
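
The spiders above are normally launched with the `crawl` script declared in the Pipfile, e.g. `pipenv run crawl dmm.actress`. For reference, a minimal sketch of the equivalent programmatic launch is shown below. It assumes it is run from the repository root, so that `scrapy.cfg` can resolve `jav/settings.py`, and that `JsonWriterPipeline` has been added to `ITEM_PIPELINES` (it is not registered in the settings shown above); the `output` directory is an arbitrary choice.

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

settings = get_project_settings()   # reads jav/settings.py via scrapy.cfg
settings.set('JSON_DIR', 'output')  # consumed by JsonWriterPipeline, if enabled
settings.set('JSON_OVERWRITE', 1)   # 1 = merge new fields into existing JSON files

process = CrawlerProcess(settings)
process.crawl('dmm.actress')  # spider names resolve through SPIDER_MODULES
process.start()               # blocks until the crawl finishes

The same pattern drives any spider registered under SPIDER_MODULES, e.g. 'ave.video' or 'dmm.genre'.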
-------------------------------------------------------------------------------- /jav/sites/dmm/spiders/article_spider.py: -------------------------------------------------------------------------------- 1 | from jav.spiders import JAVSpider 2 | from jav.utils import parse_range 3 | 4 | from ..article import parse_article 5 | from ..constants import REALMS, ARTICLES 6 | 7 | 8 | class ArticleSpider(JAVSpider): 9 | name = 'dmm.article' 10 | 11 | def __init__(self, article=None, ids='', **kw): 12 | super().__init__(**kw) 13 | if article: 14 | self.article = {'article': article} 15 | self.range = set(parse_range(ids)) or range(100) 16 | else: 17 | self.article = None 18 | 19 | def start_requests(self): 20 | yield from super().start_requests() 21 | 22 | if self.article: 23 | a = self.article 24 | for i in self.range: 25 | a['id'] = i 26 | for r in REALMS: 27 | yield self.make_request(ARTICLES.format(**a, **r)) 28 | 29 | def export_items(self, response): 30 | yield parse_article(response) 31 | -------------------------------------------------------------------------------- /jav/sites/dmm/spiders/genre_spider.py: -------------------------------------------------------------------------------- 1 | from jav.spiders import JAVSpider 2 | 3 | from ..article import get_article 4 | 5 | xp = '(//table[@class="sect02"]|//div[@class="d-sect"]/ul)' 6 | s_xp = '../preceding-sibling::div/text()' 7 | 8 | 9 | class GenreSpider(JAVSpider): 10 | name = 'dmm.genre' 11 | 12 | start_urls = ( 13 | 'http://www.dmm.co.jp/digital/videoa/-/genre/', 14 | 'http://www.dmm.co.jp/mono/dvd/-/genre/', 15 | ) 16 | 17 | def export_items(self, response): 18 | for section in response.xpath(xp): 19 | sname = section.xpath('@summary').get() 20 | if not sname: 21 | sname = section.xpath(s_xp).get() 22 | 23 | if sname == 'おすすめジャンル': 24 | continue 25 | 26 | for a in section.xpath('.//a'): 27 | url, t = a.xpath('@href|text()').getall() 28 | if url.startswith('#'): 29 | continue 30 | 31 | item = get_article(url, name=t, category=sname) 32 | if item: 33 | item['url'] = response.urljoin(url) 34 | yield item 35 | -------------------------------------------------------------------------------- /jav/sites/dmm/spiders/maker_spider.py: -------------------------------------------------------------------------------- 1 | from jav.spiders import JAVSpider 2 | 3 | from ..article import get_article 4 | 5 | 6 | def mora_xp(url): 7 | if 'mono' in url: 8 | return '(//td[@class="makerlist-box-t2" or @class="initial"])/a' 9 | else: 10 | return '//ul[@class="d-modtab" or @class="d-modsort-la"]/li/a' 11 | 12 | 13 | class MakerSpider(JAVSpider): 14 | name = 'dmm.maker' 15 | 16 | start_urls = ( 17 | 'http://www.dmm.co.jp/digital/videoa/-/maker/=/keyword=nn/', 18 | 'http://www.dmm.co.jp/mono/dvd/-/maker/=/keyword=nn/', 19 | ) 20 | 21 | def makers(self, response, xp, genre=None): 22 | for mk in response.xpath(xp.pop('main')): 23 | url = mk.xpath('.//a/@href').get('') 24 | 25 | m = get_article(url) 26 | if m is None: 27 | continue 28 | 29 | m['url'] = response.urljoin(url) 30 | 31 | if genre is not None: 32 | m['genre'] = set((genre['id'],)) 33 | yield m 34 | continue 35 | 36 | m.update({k: mk.xpath(v).get('').strip() for k, v in xp.items()}) 37 | 38 | yield m 39 | 40 | def parse_item(self, response): 41 | yield from super().parse_item(response) 42 | yield from self.links(response, mora_xp(response.url), follow=True) 43 | 44 | def export_items(self, response): 45 | if 'mono' in response.url: 46 | xp = { 47 | 'main': '//td[@class="w50"]', 48 | 'name': 'div/a/text()', 49 | 'image':
'a/img/@src', 50 | 'text': 'div[@class="maker-text"]/text()', 51 | } 52 | 53 | yield from self.makers(response, { 54 | 'main': '//table[@class="list-table mg-t12"]/tr', 55 | 'name': 'td/a/text()', 56 | 'text': 'td[2]/text()', 57 | }) 58 | else: 59 | xp = { 60 | 'main': '//div[@class="d-unit"]', 61 | 'name': 'div/a/span[@class="d-ttllarge"]/text()', 62 | 'image': 'div/a//img/@src', 63 | 'text': 'div/div/p/text()', 64 | } 65 | 66 | yield from self.makers(response, xp, response.meta.get('genre')) 67 | 68 | 69 | class MakerGenreSpider(MakerSpider): 70 | name = 'dmm.maker.genre' 71 | 72 | start_urls = ( 73 | 'http://www.dmm.co.jp/digital/videoa/-/maker/=/article=keyword/', 74 | ) 75 | 76 | def parse_item(self, response): 77 | if response.meta.get('genre'): 78 | response.meta['export'] = self.export_items(response) 79 | return () 80 | 81 | for section in response.xpath('//div[@class="d-sect"]')[2:-1]: 82 | for url in section.xpath('.//a/@href').getall(): 83 | yield response.follow(url, meta={'genre': get_article(url)}) 84 | -------------------------------------------------------------------------------- /jav/sites/dmm/spiders/monthly_spider.py: -------------------------------------------------------------------------------- 1 | from jav.spiders import JAVSpider 2 | 3 | from ..constants import PAGEN 4 | from ..article import get_article 5 | 6 | side_xp = '//div[@id="monthly-localnav"]/ul/li/a/@href' 7 | list_xp = '//ul[@id="list"]/li/div/a/@href' 8 | maker_xp = '//td[@class="header"]/a/@href' 9 | 10 | 11 | class MonthlySpider(JAVSpider): 12 | name = 'dmm.monthly' 13 | 14 | start_urls = ('https://www.dmm.co.jp/monthly/prime/-/list/',) 15 | 16 | pagination_xpath = PAGEN 17 | 18 | def export_items(self, response): 19 | for url in response.xpath(list_xp).getall(): 20 | yield {'url': url} 21 | 22 | 23 | class MonthlyAllSpider(MonthlySpider): 24 | name = 'dmm.monthly.all' 25 | 26 | start_urls = ('https://www.dmm.co.jp/monthly/',) 27 | 28 | def parse_item(self, response): 29 | stage = response.meta.get(0) 30 | page = response.meta.get('page') 31 | 32 | if stage is None and page is None: 33 | for url in response.xpath(side_xp).getall(): 34 | yield response.follow(url, meta={0: 0}) 35 | elif stage == 0: 36 | date_list = None 37 | for dgm in response.xpath('//h1[@class="dgm-ttl"]/a'): 38 | url, t = dgm.xpath('@href|text()').getall() 39 | if t == 'AVメーカー一覧へ': 40 | yield response.follow(url, meta={0: 1}) 41 | break 42 | if 'sort=date' in url: 43 | date_list = url 44 | else: 45 | if date_list: 46 | yield response.follow(date_list, meta={0: 2}) 47 | elif stage == 1: 48 | for url in response.xpath(maker_xp).getall(): 49 | m = get_article(url) 50 | if not m: 51 | continue 52 | yield response.follow(url, meta={0: 2, 'article': m}) 53 | elif page != 1: 54 | response.meta['export'] = self.export_items(response) 55 | -------------------------------------------------------------------------------- /jav/sites/dmm/spiders/product_spider.py: -------------------------------------------------------------------------------- 1 | from jav.spiders import JAVSpider 2 | from jav.utils import parse_range 3 | 4 | from ..video import parse_product 5 | 6 | product_link = 'http://actress.dmm.co.jp/-/product/=/link_id=%d' 7 | 8 | 9 | class ProductSpider(JAVSpider): 10 | name = 'dmm.product' 11 | 12 | def __init__(self, ids='', **kw): 13 | self.range = set(parse_range(ids)) or range(100) 14 | 15 | def start_requests(self): 16 | yield from super().start_requests() 17 | 18 | for i in self.range: 19 | yield self.make_request(product_link 
--------------------------------------------------------------------------------
/jav/sites/dmm/spiders/product_spider.py:
--------------------------------------------------------------------------------
1 | from jav.spiders import JAVSpider
2 | from jav.utils import parse_range
3 | 
4 | from ..video import parse_product
5 | 
6 | product_link = 'http://actress.dmm.co.jp/-/product/=/link_id=%d'
7 | 
8 | 
9 | class ProductSpider(JAVSpider):
10 |     name = 'dmm.product'
11 | 
12 |     def __init__(self, ids='', **kw):
13 |         super().__init__(**kw)
14 |         self.range = set(parse_range(ids)) or range(100)  # no ids given: probe 0-99
15 | 
16 |     def start_requests(self):
17 |         yield from super().start_requests()
18 | 
19 |         for i in self.range:
20 |             yield self.make_request(product_link % i)
21 | 
22 |     def export_items(self, response):
23 |         yield parse_product(response)
24 |
--------------------------------------------------------------------------------
/jav/sites/dmm/spiders/series_spider.py:
--------------------------------------------------------------------------------
1 | from jav.spiders import JAVSpider
2 | 
3 | from ..article import get_article
4 | 
5 | 
6 | class SeriesSpider(JAVSpider):
7 |     name = 'dmm.series'
8 | 
9 |     start_urls = (
10 |         'http://www.dmm.co.jp/digital/videoa/-/series/',
11 |         'http://www.dmm.co.jp/mono/dvd/-/series/',
12 |     )
13 | 
14 |     pagination_xpath = '(//div[contains(@class,"nation")])[1]//a'
15 | 
16 |     def export_items(self, response):
17 |         if not response.meta.get('page'):  # start pages only seed pagination
18 |             return ()
19 | 
20 |         for div in response.xpath('.//div[@class="tx-work mg-b12 left"]'):
21 |             try:
22 |                 url, t = div.xpath('(p/a)[1]').xpath('@href|text()').getall()
23 |             except ValueError:
24 |                 continue
25 | 
26 |             item = get_article(url, name=t)
27 |             if not item:
28 |                 continue
29 | 
30 |             item['url'] = response.urljoin(url)
31 |             desc = ''.join(div.xpath('text()').getall()).strip()
32 |             if desc:
33 |                 item['text'] = desc
34 | 
35 |             yield item
36 |
--------------------------------------------------------------------------------
/jav/sites/dmm/spiders/video_spider.py:
--------------------------------------------------------------------------------
1 | from jav.spiders import JAVSpider
2 | from jav.spiders.list_spider import UrlListSpider
3 | 
4 | from ..video import parse_video
5 | from ..constants import PAGEN, DATE_URLS
6 | 
7 | tmb_xpath = '//p[@class="tmb"]/a'
8 | monocal_xpath = '//td[@class="title-monocal"]/a'
9 | 
10 | 
11 | class VideoSpider(JAVSpider):
12 |     name = 'dmm.video'
13 | 
14 |     retry_xpath = '//h1'
15 | 
16 |     def export_items(self, response):
17 |         yield parse_video(response)
18 | 
19 | 
20 | class VideoListSpider(UrlListSpider):
21 |     name = 'dmm.video.list'
22 | 
23 |     link_xp = tmb_xpath
24 |     pagination_xpath = PAGEN
25 | 
26 |     start_urls = (
27 |         'http://www.dmm.co.jp/digital/videoa/-/list/=/sort=release_date/',
28 |         'http://www.dmm.co.jp/mono/dvd/-/list/=/sort=date/',
29 |     )
30 | 
31 |     def export_items(self, response):
32 |         yield parse_video(response)
33 | 
34 |     def get_list(self, response):
35 |         for link in self.links(response, self.link_xp):
36 |             yield link.split('?')[0]  # drop tracking query strings
37 | 
38 | 
39 | class VideoDateSpider(VideoListSpider):
40 |     name = 'dmm.video.date'
41 | 
42 |     link_xp = '(%s|%s)' % (tmb_xpath, monocal_xpath)
43 | 
44 |     def __init__(self, date='', **kw):
45 |         super().__init__(**kw)
46 | 
47 |         self.start, self.month = self.get_date(date)
48 | 
49 |     def get_date(self, d):
50 |         from datetime import datetime
51 |         try:
52 |             return datetime.strptime(d, '%Y-%m-%d'), False
53 |         except ValueError:
54 |             try:
55 |                 return datetime.strptime(d, '%Y-%m'), True
56 |             except ValueError:
57 |                 pass
58 | 
59 |         return datetime.now(), False  # missing or malformed date: default to today
60 | 
61 |     def start_requests(self):
62 |         d = self.start
63 | 
64 |         if self.month:  # whole month: one ranged request plus one request per day
65 |             from datetime import timedelta
66 |             from calendar import monthrange
67 | 
68 |             one_day = timedelta(days=1)
69 |             last_day = monthrange(d.year, d.month)[1]
70 |             last = d.replace(day=last_day)
71 | 
72 |             yield self.make_request(DATE_URLS[0].format(d, last))
73 | 
74 |             while d <= last:
75 |                 yield self.make_request(DATE_URLS[1].format(d))
76 |                 d += one_day
77 |         else:
78 |             for url in DATE_URLS:
79 |                 yield self.make_request(url.format(d, d))
80 |
--------------------------------------------------------------------------------
/jav/sites/dmm/video.py:
--------------------------------------------------------------------------------
1 | from jav.utils import get_aux
2 | from jav.items import JAVLoader
3 | 
4 | from .constants import MUTUALS
5 | 
6 | 
7 | text_labels = {
8 |     '品番': 'cid',
9 |     '発売日': 'date',
10 |     '商品発売日': 'date',
11 |     '収録時間': 'runtime',
12 |     '配信開始日': 'delivery_date',
13 | }
14 | 
15 | xpaths = {
16 |     'title': '//h1/text()',
17 |     'image': '//img[@class="tdmm"]/../@href',
18 |     'text': '//div[@class="mg-b20 lh4"]//text()',
19 |     'gallery': '//a[@name="sample-image"]/img/@src',
20 |     'related': '//div[@class="area-edition"]//a/@href',
21 |     'articles': (
22 |         '//span[@id="performer"]/a/@href',
23 |         '//table[@class="mg-b20"]/tr/td/a/@href',
24 |     ),
25 | }
26 | 
27 | 
28 | def parse_video(response):
29 |     v = JAVLoader(response=response, xpaths=xpaths)
30 | 
31 |     for row in response.xpath('//td[@class="nw"]/..'):
32 |         label, *vals = row.xpath('td/text()').getall()
33 |         if not vals:
34 |             continue
35 |         label = text_labels.get(label[:-1])
36 | 
37 |         if label:
38 |             v.add_value(label, vals[0].strip())
39 | 
40 |     m_l = response.xpath('//script[contains(.,"#mutual-link")]/text()')
41 |     if m_l:
42 |         m_l = response.urljoin(MUTUALS.format(*m_l.re(r":\s*'(.*)',")))
43 |         v.nested(selector=get_aux(m_l)).add_xpath('related', '//a/@href')
44 | 
45 |     a_p = response.xpath('//script[contains(.,"#a_performer")]/text()')
46 |     if a_p:
47 |         a_p = response.urljoin(a_p.re_first(r"url: '(.*)',"))
48 |         v.nested(selector=get_aux(a_p)).add_xpath('articles', '//a/@href')
49 | 
50 |     return v.load_item()
51 |
--------------------------------------------------------------------------------
/jav/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | from scrapy import Spider, Request
2 | 
3 | 
4 | class JAVSpider(Spider):
5 |     """Custom Spider class for JAV scrapers.
6 | 
7 |     Allows a file containing `start_urls` to be specified via the START_URLS setting.
8 |     """
9 | 
10 |     handle_httpstatus_list = (404,)
11 | 
12 |     start_urls = ()
13 | 
14 |     pagination_xpath = None
15 |     pagination_text = 'text()'
16 | 
17 |     def get_start_urls(self, urls):
18 |         for url in urls:
19 |             try:
20 |                 with open(url) as f:
21 |                     for line in f.readlines():
22 |                         line = line.strip()
23 |                         if line and not line.startswith('#'):
24 |                             yield line
25 |             except OSError:  # not a readable file: treat the value as a URL
26 |                 yield url
27 | 
28 |     def make_request(self, url, **kw):
29 |         return Request(url, **kw)
30 | 
31 |     def start_requests(self):
32 |         urls = self.settings.getlist('START_URLS', ())
33 |         if urls:
34 |             for url in self.get_start_urls(urls):
35 |                 yield Request(url, dont_filter=True)
36 |         else:
37 |             yield from super().start_requests()
38 | 
39 |     def parse_item(self, response):
40 |         response.meta['export'] = self.export_items(response)
41 |         yield  # generator stub; the bare None is ignored downstream
42 | 
43 |     def export_items(self, response):
44 |         yield  # generator stub; subclasses yield item dicts
45 | 
46 |     def links(self, response, xp, follow=False, ignore=None, **kw):
47 |         for url in response.xpath(xp).xpath('@href').getall():
48 |             if ignore and ignore(url):
49 |                 continue
50 | 
51 |             if follow:
52 |                 yield response.follow(url, **kw)
53 |             else:
54 |                 yield response.urljoin(url)
55 | 
56 |     def page_number(self, anchor):
57 |         return anchor.xpath(self.pagination_text).get()
58 | 
59 |     def pagination(self, response, ignore=None, **kw):
60 |         if not self.pagination_xpath:
61 |             return ()
62 | 
63 |         max_page = self.settings.getint('MAX_PAGE', 1)  # MAX_PAGE <= 0 lifts the cap
64 | 
65 |         for a in response.xpath(self.pagination_xpath):
66 |             try:
67 |                 page = int(self.page_number(a))
68 |             except (TypeError, ValueError):
69 |                 continue
70 | 
71 |             if max_page > 0 and page > max_page:
72 |                 continue
73 | 
74 |             url = a.xpath('@href').get()
75 |             if not url:
76 |                 continue
77 | 
78 |             if ignore and ignore(url):
79 |                 continue
80 | 
81 |             yield response.follow(url, meta={'page': page}, **kw)
82 | 
83 |     def parse(self, response):
84 |         yield from self.parse_item(response)
85 | 
86 |         for item in response.meta.get('export', ()):
87 |             if item is None:
88 |                 continue
89 |             url = item.pop('url', response.url)  # an item's own url overrides the page url
90 |             yield {'url': url, 'item': item}
91 | 
92 |         yield from self.pagination(response)
93 |
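To make the flow above concrete: `parse()` first runs `parse_item()`, which stashes an item generator under `response.meta['export']`; `parse()` then wraps every exported item as `{'url': ..., 'item': ...}` and finally follows `pagination_xpath`. A minimal sketch of a subclass — hypothetical, not part of the repository, with placeholder XPaths:

    from jav.spiders import JAVSpider


    class ExampleSpider(JAVSpider):
        name = 'example.list'

        start_urls = ('http://www.dmm.co.jp/digital/videoa/-/list/',)

        pagination_xpath = '//div[@class="pagination"]//a'

        def export_items(self, response):
            # an item may carry its own 'url'; parse() pops it off the dict
            for a in response.xpath('//p[@class="tmb"]/a'):
                yield {'url': a.xpath('@href').get(),
                       'title': a.xpath('img/@alt').get()}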
--------------------------------------------------------------------------------
/jav/spiders/list_spider.py:
--------------------------------------------------------------------------------
1 | from . import JAVSpider
2 | 
3 | 
4 | class UrlListSpider(JAVSpider):
5 | 
6 |     deep = False  # True: follow listed URLs and export parsed items; False: export the URLs
7 | 
8 |     def get_list(self, response):
9 |         yield  # generator stub; subclasses yield URLs
10 | 
11 |     def parse_item(self, response):
12 |         if response.meta.get('deep'):
13 |             response.meta['export'] = self.export_items(response)
14 |             return ()
15 | 
16 |         urls = self.get_list(response)
17 | 
18 |         if self.deep:
19 |             for url in urls:
20 |                 yield response.follow(url, meta={'deep': True})
21 |         else:
22 |             for url in urls:
23 |                 yield {'url': url}
24 |
--------------------------------------------------------------------------------
/jav/utils.py:
--------------------------------------------------------------------------------
1 | class AttrDict(dict):
2 |     """Access `dict` keys as attributes"""
3 |     __getattr__ = dict.__getitem__
4 |     __setattr__ = dict.__setitem__
5 |     __delattr__ = dict.__delitem__
6 | 
7 | 
8 | def get_aux(url):
9 |     """Fetch an auxiliary page synchronously and return a Selector"""
10 |     from requests import get
11 |     from scrapy.selector import Selector
12 |     return Selector(text=get(url).text)
13 | 
14 | 
15 | def extract_t(element, p='text()'):
16 |     """Get the stripped text of the first matching element"""
17 |     try:
18 |         return element.xpath(p).get('').strip()
19 |     except AttributeError:
20 |         return ''
21 | 
22 | 
23 | def extract_a(element, xpaths=('@href', 'text()')):
24 |     """Get the requested attributes of all `a` elements"""
25 |     for e in element.xpath('.//a'):
26 |         yield tuple(extract_t(e, i) for i in xpaths)
27 | 
28 | 
29 | def parse_url(url):
30 |     """Get the path and parsed query string"""
31 |     from urllib.parse import urlparse, parse_qs
32 | 
33 |     u = urlparse(url)
34 |     return u.path, parse_qs(u.query)
35 | 
36 | 
37 | def get_key(url, key):
38 |     """Get a key from the query string of a url"""
39 |     if isinstance(url, dict):
40 |         query = url
41 |     else:
42 |         query = parse_url(url)[1]
43 | 
44 |     return query.get(key, (None,))[0]
45 | 
46 | 
47 | def parse_range(ranges):
48 |     """Parse a comma-separated list of ranges, e.g. '1,2-3,5-10' -> 1, 2, 3, 5..10"""
49 |     if not ranges:
50 |         return ()
51 | 
52 |     for r in ranges.split(','):
53 |         try:
54 |             p = tuple(int(v) for v in r.split('-'))
55 |         except ValueError:
56 |             continue
57 | 
58 |         if len(p) > 2:  # skip malformed pieces such as '1-2-3'
59 |             continue
60 | 
61 |         try:
62 |             start, end = p
63 |         except ValueError:
64 |             start = end = p[0]
65 | 
66 |         if start > end:  # accept reversed bounds like '10-5'
67 |             end, start = start, end
68 | 
69 |         yield from range(start, end + 1)
70 |
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
1 | [settings]
2 | default = jav.settings
3 |
--------------------------------------------------------------------------------
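Usage sketch, assuming a configured Scrapy environment (scrapy.cfg above points at jav.settings); the spider names, arguments, and settings are taken from the sources above, and the command-line flags are standard Scrapy:

    # crawl one month of releases, capping each list at 5 pagination pages
    scrapy crawl dmm.video.date -a date=2020-01 -s MAX_PAGE=5 -o videos.jl

    # fetch specific actress product pages by link id (see parse_range above)
    scrapy crawl dmm.product -a ids=1,5-10

    # read start URLs from a file, one per line, '#' comments allowed
    scrapy crawl dmm.video -s START_URLS=urls.txt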