├── .gitattributes ├── .gitignore ├── README.md ├── Vagrantfile ├── get-pip.py ├── okcubot ├── okcubot │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ ├── okcubot_spider.old.py │ │ └── okcubot_spider.py ├── project.egg-info │ ├── PKG-INFO │ ├── SOURCES.txt │ ├── dependency_links.txt │ ├── entry_points.txt │ └── top_level.txt ├── run.bat ├── scrapy.cfg └── setup.py └── requirements.txt /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Windows image file caches 2 | Thumbs.db 3 | ehthumbs.db 4 | 5 | # Folder config file 6 | Desktop.ini 7 | 8 | # Recycle Bin used on file shares 9 | $RECYCLE.BIN/ 10 | 11 | # Windows Installer files 12 | *.cab 13 | *.msi 14 | *.msm 15 | *.msp 16 | 17 | # Windows shortcuts 18 | *.lnk 19 | 20 | # ========================= 21 | # Operating System Files 22 | # ========================= 23 | 24 | # OSX 25 | # ========================= 26 | 27 | .DS_Store 28 | .AppleDouble 29 | .LSOverride 30 | 31 | # Thumbnails 32 | ._* 33 | 34 | # Files that might appear on external disk 35 | .Spotlight-V100 36 | .Trashes 37 | 38 | # Directories potentially created on remote AFP share 39 | .AppleDB 40 | .AppleDesktop 41 | Network Trash Folder 42 | Temporary Items 43 | .apdisk 44 | 45 | # Build files 46 | *.pyc 47 | okcubot/build/ 48 | 49 
| # R files 50 | .Rhistory 51 | 52 | # Project files 53 | .idea 54 | .sync 55 | .vagrant 56 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # OKCubot 2 | 3 | Scraping people for science and stuff. 4 | 5 | ## Instructions 6 | 7 | 1. Install Python 2.7 8 | 2. Install pip (you could use python get-pip.py, but don't trust anyone) 9 | 3. Add pip to your path 10 | 4. python -m pip install -r requirements.txt 11 | 12 | ## Troubleshooting 13 | 14 | I don't know. Add an issue. 15 | -------------------------------------------------------------------------------- /Vagrantfile: -------------------------------------------------------------------------------- 1 | # -*- mode: ruby -*- 2 | # vi: set ft=ruby : 3 | 4 | # Vagrantfile API/syntax version. Don't touch unless you know what you're doing! 5 | VAGRANTFILE_API_VERSION = "2" 6 | 7 | Vagrant.configure(VAGRANTFILE_API_VERSION) do |config| 8 | # All Vagrant configuration is done here. The most common configuration 9 | # options are documented and commented below. For a complete reference, 10 | # please see the online documentation at vagrantup.com. 11 | 12 | config.vm.hostname = "okcubot" 13 | 14 | # Every Vagrant virtual environment requires a box to build off of. 15 | config.vm.box = "precise64" 16 | 17 | # The url from where the 'config.vm.box' box will be fetched if it 18 | # doesn't already exist on the user's system. 19 | config.vm.box_url = "http://files.vagrantup.com/precise64.box" 20 | 21 | # Create a forwarded port mapping which allows access to a specific port 22 | # within the machine from a port on the host machine. In the example below, 23 | # accessing "localhost:8080" will access port 80 on the guest machine. 24 | # config.vm.network :forwarded_port, guest: 80, host: 8080 25 | 26 | # Create a private network, which allows host-only access to the machine 27 | # using a specific IP. 
28 | config.vm.network :private_network, ip: "192.168.33.10" 29 | 30 | # Create a public network, which generally matched to bridged network. 31 | # Bridged networks make the machine appear as another physical device on 32 | # your network. 33 | # config.vm.network :public_network 34 | 35 | # If true, then any SSH connections made will enable agent forwarding. 36 | # Default value: false 37 | config.ssh.forward_agent = true 38 | 39 | # Share an additional folder to the guest VM. The first argument is 40 | # the path on the host to the actual folder. The second argument is 41 | # the path on the guest to mount the folder. And the optional third 42 | # argument is a set of non-required options. 43 | # config.vm.synced_folder "../data", "/vagrant_data" 44 | 45 | # Provider-specific configuration so you can fine-tune various 46 | # backing providers for Vagrant. These expose provider-specific options. 47 | # Example for VirtualBox: 48 | # 49 | config.vm.provider :virtualbox do |vb| 50 | # # Don't boot with headless mode 51 | # vb.gui = true 52 | # 53 | # # Use VBoxManage to customize the VM. For example to change memory: 54 | vb.customize ["modifyvm", :id, "--memory", "512"] 55 | end 56 | # 57 | # View the documentation for the provider you're using for more 58 | # information on available options. 
59 | 60 | config.vm.provision :shell do |shell| 61 | if File.exists?(Dir.home + '/.gitconfig') 62 | shell.args = "'#{File.read(Dir.home + '/.gitconfig').strip.gsub!(/\n/, '\n')}'" 63 | end 64 | shell.inline = setup_project(config.vm.hostname) 65 | end 66 | end 67 | 68 | # shell script to set up project 69 | def setup_project(project) 70 | return <<-EOS 71 | export PROJECT=#{project} 72 | 73 | export PYTHON_VERSION=2.7 74 | export SETUPTOOLS_VERSION=1.1.6 75 | export VIRTUALENVWRAPPER_VERSION=4.1.1 76 | 77 | export VAGRANT_USER=vagrant 78 | export VAGRANT_HOME=/home/$VAGRANT_USER 79 | 80 | if [ -n "$1" ]; then 81 | echo -e $1 > $VAGRANT_HOME/.gitconfig 82 | chown $VAGRANT_USER.$VAGRANT_USER $VAGRANT_HOME/.gitconfig 83 | fi 84 | 85 | apt-get update 86 | apt-get install -y build-essential curl git python-dev libxml2-dev \ 87 | libxslt1-dev python-software-properties nodejs npm \ 88 | libffi-dev libssl-dev 89 | 90 | ln -sf /usr/bin/nodejs /usr/bin/node 91 | 92 | # For building presentation slides 93 | npm install -g cleaver 94 | 95 | if [ ! -e /usr/local/bin/virtualenv-$PYTHON_VERSION ]; then 96 | cd $VAGRANT_HOME 97 | curl -ksLo virtualenv.tar.gz https://github.com/pypa/virtualenv/tarball/develop 98 | tar xzf virtualenv.tar.gz 99 | cd pypa-virtualenv* 100 | python$PYTHON_VERSION setup.py install 101 | cd $VAGRANT_HOME 102 | rm -rf pypa-virtualenv* virtualenv.tar.gz 103 | fi 104 | 105 | if [ ! -e /usr/local/lib/python$PYTHON_VERSION/dist-packages/setuptools-$SETUPTOOLS_VERSION-py$PYTHON_VERSION.egg ]; then 106 | cd $VAGRANT_HOME 107 | curl -sO https://pypi.python.org/packages/source/s/setuptools/setuptools-$SETUPTOOLS_VERSION.tar.gz 108 | tar xzf setuptools-$SETUPTOOLS_VERSION.tar.gz 109 | cd setuptools-$SETUPTOOLS_VERSION 110 | python$PYTHON_VERSION setup.py install 111 | cd $VAGRANT_HOME 112 | rm -rf setuptools-$SETUPTOOLS_VERSION* 113 | fi 114 | 115 | if [ ! 
-e /usr/local/bin/pip$PYTHON_VERSION ]; then 116 | cd $VAGRANT_HOME 117 | curl -ksLo pip.tar.gz https://github.com/pypa/pip/tarball/develop 118 | tar xzf pip.tar.gz 119 | cd pypa-pip* 120 | python$PYTHON_VERSION setup.py install 121 | cd $VAGRANT_HOME 122 | rm -rf pypa-pip* pip.tar.gz 123 | fi 124 | 125 | if [ ! -e /usr/local/lib/python$PYTHON_VERSION/dist-packages/virtualenvwrapper-$VIRTUALENVWRAPPER_VERSION-py$PYTHON_VERSION.egg-info ]; then 126 | cd $VAGRANT_HOME 127 | curl -ksLO https://pypi.python.org/packages/source/v/virtualenvwrapper/virtualenvwrapper-$VIRTUALENVWRAPPER_VERSION.tar.gz 128 | tar xzf virtualenvwrapper-$VIRTUALENVWRAPPER_VERSION.tar.gz 129 | cd virtualenvwrapper-$VIRTUALENVWRAPPER_VERSION 130 | python$PYTHON_VERSION setup.py install 131 | cd $VAGRANT_HOME 132 | rm -rf virtualenvwrapper-$VIRTUALENVWRAPPER_VERSION* 133 | fi 134 | 135 | if ! grep -q WORKON_HOME $VAGRANT_HOME/.bashrc; then 136 | echo >> $VAGRANT_HOME/.bashrc 137 | echo 'export VIRTUALENVWRAPPER_PYTHON=/usr/bin/python'$PYTHON_VERSION >> $VAGRANT_HOME/.bashrc 138 | echo 'export WORKON_HOME=$HOME/.virtualenvs' >> $VAGRANT_HOME/.bashrc 139 | echo 'export PROJECT_HOME=/vagrant' >> $VAGRANT_HOME/.bashrc 140 | echo 'source /usr/local/bin/virtualenvwrapper.sh' >> $VAGRANT_HOME/.bashrc 141 | fi 142 | 143 | if ! grep -q 'alias co=' $VAGRANT_HOME/.bashrc; then 144 | echo >> $VAGRANT_HOME/.bashrc 145 | echo 'alias co="cd /vagrant"' >> $VAGRANT_HOME/.bashrc 146 | fi 147 | 148 | if ! grep -q 'EDITOR=' $VAGRANT_HOME/.bashrc; then 149 | echo >> $VAGRANT_HOME/.bashrc 150 | echo 'export EDITOR=vim' >> $VAGRANT_HOME/.bashrc 151 | fi 152 | 153 | if ! grep -q "workon $PROJECT" $VAGRANT_HOME/.profile; then 154 | echo >> $VAGRANT_HOME/.profile 155 | echo "workon $PROJECT" >> $VAGRANT_HOME/.profile 156 | fi 157 | 158 | if [ ! 
-e $VAGRANT_HOME/.virtualenvs/$PROJECT ]; then 159 | mkdir -p $VAGRANT_HOME/.virtualenvs 160 | echo "virtualenv --system-site-packages $VAGRANT_HOME/.virtualenvs/$PROJECT" 161 | virtualenv -q --system-site-packages $VAGRANT_HOME/.virtualenvs/$PROJECT 162 | 163 | echo "#!/bin/bash" > $VAGRANT_HOME/.virtualenvs/$PROJECT/bin/postactivate 164 | echo "export PYTHONPATH=/vagrant" >> $VAGRANT_HOME/.virtualenvs/$PROJECT/bin/postactivate 165 | chmod 775 $VAGRANT_HOME/.virtualenvs/$PROJECT/bin/postactivate 166 | 167 | echo "#!/bin/bash" > $VAGRANT_HOME/.virtualenvs/$PROJECT/bin/postdeactivate 168 | echo "unset PYTHONPATH" >> $VAGRANT_HOME/.virtualenvs/$PROJECT/bin/postdeactivate 169 | chmod 775 $VAGRANT_HOME/.virtualenvs/$PROJECT/bin/postdeactivate 170 | 171 | chown -R $VAGRANT_USER.$VAGRANT_USER $VAGRANT_HOME/.virtualenvs 172 | fi 173 | 174 | # Install requirements 175 | pip install -r /vagrant/requirements.txt 176 | 177 | # Cleanup veewee post install scripts 178 | if [[ -e $VAGRANT_HOME/vagrant.sh || -e $VAGRANT_HOME/postinstall.sh ]]; then 179 | rm $VAGRANT_HOME/{apt,build_time,chef,cleanup,postinstall,ruby,sudo,vagrant,vbox}.sh 180 | fi 181 | EOS 182 | end -------------------------------------------------------------------------------- /okcubot/okcubot/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Deleetdk/OKCubot/a40fd515da6cd4c108c7528dce7aeb9f8493dcd4/okcubot/okcubot/__init__.py -------------------------------------------------------------------------------- /okcubot/okcubot/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # http://doc.scrapy.org/en/latest/topics/items.html 5 | 6 | from scrapy.item import Item, Field 7 | 8 | class UserItem(Item): 9 | # Details 10 | d_username = Field() 11 | d_age = Field() 12 | d_gender = Field() 13 | d_city = Field() 14 | 
d_country = Field() 15 | d_orientation = Field() 16 | d_ethnicity = Field() 17 | d_bodytype = Field() 18 | d_diet_manner = Field() 19 | d_diet_type = Field() 20 | d_smokes = Field() 21 | d_drinks = Field() 22 | d_drugs = Field() 23 | # Fiction section 24 | d_religion_type = Field() 25 | d_religion_seriosity = Field() 26 | d_astrology_sign = Field() 27 | d_astrology_seriosity = Field() 28 | d_education_phase = Field() 29 | d_education_type = Field() 30 | d_job = Field() 31 | d_income = Field() 32 | d_relationship = Field() 33 | d_relationship_manner = Field() 34 | d_relationship_type = Field() 35 | d_offspring_current = Field() 36 | d_offspring_desires = Field() 37 | d_pets_dogs = Field() 38 | d_pets_cats = Field() 39 | d_languages = Field() 40 | # Looking for 41 | lf_want = Field() 42 | lf_min_age = Field() 43 | lf_max_age = Field() 44 | lf_location = Field() 45 | lf_single = Field() 46 | lf_for = Field() 47 | # Personality scale 48 | p_explove = Field() # Experienced in Love 49 | p_adven = Field() # Adventurous 50 | p_indie = Field() # Indie 51 | p_spon = Field() # Spontaneous 52 | p_scien = Field() # Scientific 53 | p_inde = Field() # Independent 54 | p_conf = Field() # Confident 55 | p_math = Field() # Mathematical 56 | p_logic = Field() # Logical 57 | p_organ = Field() # Organized 58 | p_oldfash = Field() # Old-Fashioned 59 | p_lit = Field() # Literary 60 | p_opti = Field() # Optimistic 61 | p_roman = Field() # Romantic 62 | p_comp = Field() # Compassionate 63 | p_lovedri = Field() # Love-driven 64 | p_sprit = Field() # Spiritual 65 | p_kinky = Field() # Kinky 66 | p_artsy = Field() # Artsy 67 | p_thrift = Field() # Thrifty 68 | p_drug = Field() # Drug-friendly 69 | p_arro = Field() # Arrogant 70 | p_sloppy = Field() # Sloppy 71 | p_extro = Field() # Extroverted 72 | p_geeky = Field() # Geeky 73 | p_aggre = Field() # Aggressive 74 | p_expsex = Field() # Experienced in sex 75 | p_capi = Field() # Capitalistic 76 | p_exer = Field() # Into Exercise 77 | p_kind = 
Field() # Kind 78 | p_pure = Field() # Pure 79 | p_convenmoral = Field() # Conventionally Moral 80 | p_manners = Field() # Mannered 81 | p_ambi = Field() # Ambitious 82 | p_polit = Field() # Political 83 | p_greed = Field() # Greedy 84 | p_sexdrive = Field() # Sex-driven 85 | p_energetic = Field() # Energetic 86 | p_cool = Field() # Cool 87 | p_introvert = Field() # Introverted 88 | p_trusting = Field() # Trusting 89 | p_dominant = Field() # Dominant 90 | p_laidback = Field() # Laid-back 91 | p_submissive = Field() # Submissive 92 | p_explife = Field() # Experienced in life 93 | p_friendstrangers = Field() # Fiendly to strangers 94 | p_honest = Field() # Honest 95 | p_giving = Field() # Giving 96 | p_passion = Field() # Passion-driven 97 | p_progress = Field() # Progressive 98 | # Misc 99 | m_photocount = Field() 100 | 101 | class QuestionItem(Item): 102 | id = Field() 103 | text = Field() 104 | # The text of the options 105 | option_1 = Field() 106 | option_2 = Field() 107 | option_3 = Field() 108 | option_4 = Field() 109 | 110 | class AnswerItem(Item): 111 | # User who answered 112 | author = Field() 113 | # Question ID 114 | question = Field() 115 | # Value between 1-4 116 | answer = Field() 117 | # Answer text (if any) 118 | answer_text = Field() 119 | -------------------------------------------------------------------------------- /okcubot/okcubot/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | # TODO: Validate data-pipeline 7 | # TODO: Answer pipeline - translate answer in to a numeric value 8 | 9 | from scrapy import signals 10 | from scrapy.contrib.exporter import CsvItemExporter 11 | from scrapy.exceptions import DropItem 12 | import csv 13 | from collections import defaultdict 14 | import os 15 | import time 16 | 17 | 
class OkcubotPipeline(object): 18 | def process_item(self, item, spider): 19 | return item 20 | class DuplicatePipeline(object): 21 | def __init__(self): 22 | self.usernames = self.get_column('user.tsv', 'd_username') 23 | self.ids = self.get_column('question.tsv', 'id') 24 | 25 | def process_item(self, item, spider): 26 | try: 27 | if type(item).__name__ == 'UserItem': 28 | return self.check_duplicate_user(item) 29 | elif type(item).__name__ == 'QuestionItem': 30 | return self.check_duplicate_question(item) 31 | elif type(item).__name__ == 'AnswerItem': 32 | return self.check_duplicate_answer(item) 33 | except DropItem as e: 34 | raise e 35 | return 36 | 37 | # Not a user, not a question, not an answer: skip. 38 | return item 39 | 40 | # Get the values of a column in a CSV-file 41 | def get_column(self, file, column): 42 | if not os.path.isfile(file): 43 | return list() 44 | 45 | columns = defaultdict(list) 46 | 47 | with open(file) as f: 48 | reader = csv.DictReader(f) 49 | for row in reader: 50 | for (k,v) in row.items(): 51 | columns[k].append(v) 52 | 53 | return columns[column] 54 | 55 | def check_duplicate_user(self, item): 56 | if item['d_username'] in self.usernames: 57 | raise DropItem('Duplicate user found: %s' % item) 58 | else: 59 | self.usernames.append(item['d_username']) 60 | return item 61 | 62 | def check_duplicate_question(self, item): 63 | if item['id'] in self.ids: 64 | raise DropItem('Duplicate question found: %s' % item) 65 | else: 66 | self.ids.append(item['id']) 67 | return item 68 | 69 | def check_duplicate_answer(self, item): 70 | if item['author'] in self.usernames: 71 | raise DropItem('Duplicate answer found: %s' % item) 72 | else: 73 | return item 74 | 75 | class AnswerSanitationPipeline(object): 76 | def __init__(self): 77 | self.questions = set() 78 | 79 | def process_item(self, item, spider): 80 | if type(item).__name__ == 'QuestionItem': 81 | self.questions.add(item) 82 | return item 83 | if type(item).__name__ != 'AnswerItem': 84 | 
return item 85 | 86 | question = self.find_question(item['question']) 87 | 88 | if question == None: 89 | return item 90 | 91 | if 'option_1' in question and item['answer'] == question['option_1']: 92 | item['answer'] = 1 93 | elif 'option_2' in question and item['answer'] == question['option_2']: 94 | item['answer'] = 2 95 | elif 'option_3' in question and item['answer'] == question['option_3']: 96 | item['answer'] = 3 97 | elif 'option_4' in question and item['answer'] == question['option_4']: 98 | item['answer'] = 4 99 | 100 | return item 101 | 102 | # Find question by ID 103 | def find_question(self, id): 104 | for question in self.questions: 105 | if question['id'] == id: 106 | return question 107 | 108 | # TODO: Find in files 109 | return None 110 | 111 | class TsvItemExporter(CsvItemExporter): 112 | def __init__(self, *args, **kwargs): 113 | kwargs['encoding'] = 'utf-8' 114 | kwargs['delimiter'] = '\t' 115 | 116 | super(TsvItemExporter, self).__init__(*args, **kwargs) 117 | 118 | #class TsvExportPipeline(object): 119 | # def __init__(self): 120 | # self.files = {} 121 | # 122 | # @classmethod 123 | # def from_crawler(cls, crawler): 124 | # pipeline = cls() 125 | # 126 | # crawler.signals.connect(pipeline.spider_opened, signals.spider_opened) 127 | # crawler.signals.connect(pipeline.spider_closed, signals.spider_closed) 128 | # 129 | # return pipeline 130 | # 131 | # def spider_opened(self, spider): 132 | # file = open('%s_data.tsv' % spider.name, 'w+b') 133 | # self.files[spider] = file 134 | # self.exporter = TsvItemExporter(file) 135 | # self.exporter.start_exporting() 136 | # 137 | # def spider_closed(self, spider): 138 | # self.exporter.finish_exporting() 139 | # file = self.files.pop(spider) 140 | # file.close() 141 | # 142 | # def process_item(self, item, spider): 143 | # self.exporter.export_item(item) 144 | # 145 | # return item 146 | 147 | def item_type(item): 148 | return type(item).__name__.replace('Item', '').lower() 149 | 150 | class 
MultiTSVItemPipeline(object): 151 | types = ['user', 'question', 'answer'] 152 | 153 | @classmethod 154 | def from_crawler(cls, crawler): 155 | pipeline = cls() 156 | 157 | crawler.signals.connect(pipeline.spider_opened, signals.spider_opened) 158 | crawler.signals.connect(pipeline.spider_closed, signals.spider_closed) 159 | 160 | return pipeline 161 | 162 | def spider_opened(self, spider): 163 | headers = True 164 | for name in self.types: 165 | if os.path.isfile(name + '.tsv'): 166 | # File already exists -- don't write headers again 167 | headers = False 168 | break 169 | 170 | self.files = dict([ (name, open(time.strftime("%d-%m-%Y") + name + '.tsv', 'ab+')) for name in self.types ]) 171 | self.exporters = dict([ (name, TsvItemExporter(self.files[name], include_headers_line=headers)) for name in self.types]) 172 | [e.start_exporting() for e in self.exporters.values()] 173 | 174 | def spider_closed(self, spider): 175 | [e.finish_exporting() for e in self.exporters.values()] 176 | [f.close() for f in self.files.values()] 177 | 178 | def process_item(self, item, spider): 179 | what = item_type(item) 180 | 181 | if what in set(self.types): 182 | self.exporters[what].export_item(item) 183 | self.files[what].flush() 184 | return item -------------------------------------------------------------------------------- /okcubot/okcubot/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for okcubot project 2 | # 3 | # For simplicity, this file contains only the most important settings by 4 | # default. 
# All the other settings are documented here:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#

BOT_NAME = 'okcubot'

SPIDER_MODULES = ['okcubot.spiders']
NEWSPIDER_MODULE = 'okcubot.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'okcubot (+http://www.yourdomain.com)'

ITEM_PIPELINES = {
    'okcubot.pipelines.DuplicatePipeline': 300,
    'okcubot.pipelines.AnswerSanitationPipeline': 500,
    'okcubot.pipelines.MultiTSVItemPipeline': 800
}

EXTENSIONS = {'scrapy.contrib.feedexport.FeedExporter': None}

# Log
LOG_LEVEL = 'INFO'

# Human-like
ALLOWED_DOMAINS = ["okcupid.com"]
DOWNLOAD_DELAY = 2
CONCURRENT_REQUESTS_PER_IP = 1

# Scheduler
DEPTH_PRIORITY = 1
--------------------------------------------------------------------------------
/okcubot/okcubot/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
5 | -------------------------------------------------------------------------------- /okcubot/okcubot/spiders/okcubot_spider.old.py: -------------------------------------------------------------------------------- 1 | from scrapy.spider import Spider 2 | from scrapy.http import Request, FormRequest 3 | from scrapy.selector import Selector 4 | from scrapy import log 5 | 6 | import re 7 | 8 | from okcubot.items import UserItem, QuestionItem, AnswerItem 9 | 10 | class OkCubotSpider(Spider): 11 | # Spider settings 12 | name = "okcubot" 13 | 14 | # Others 15 | base_url = "http://www.okcupid.com" 16 | 17 | # TODO: Implement missing traits 18 | personality_scale_dict = { 19 | 'p_explove': re.compile('(experienced in love)', re.IGNORECASE), 20 | 'p_adven': re.compile('(adventurous)', re.IGNORECASE), 21 | 'p_indie': re.compile('(indie)', re.IGNORECASE), 22 | 'p_spon': re.compile('(spontaneous)', re.IGNORECASE), 23 | 'p_scien': re.compile('(scientific)', re.IGNORECASE), 24 | 'p_inde': re.compile('(independent)', re.IGNORECASE), 25 | 'p_conf': re.compile('(confident)', re.IGNORECASE), 26 | 'p_math': re.compile('(mathematical)', re.IGNORECASE), 27 | 'p_logic': re.compile('(logical)', re.IGNORECASE), 28 | 'p_organ': re.compile('(organized)', re.IGNORECASE), 29 | 'p_oldfash': re.compile('(old\-fashioned)', re.IGNORECASE), 30 | 'p_lit': re.compile('(literary)', re.IGNORECASE), 31 | 'p_opti': re.compile('(optimistic)', re.IGNORECASE), 32 | 'p_roman': re.compile('(romantic)', re.IGNORECASE), 33 | 'p_comp': re.compile('(compassionate)', re.IGNORECASE), 34 | 'p_lovedri': re.compile('(love\-driven)', re.IGNORECASE), 35 | 'p_sprit': re.compile('(spiritual)', re.IGNORECASE), 36 | 'p_kinky': re.compile('(kinky)', re.IGNORECASE), 37 | 'p_artsy': re.compile('(artsy)', re.IGNORECASE), 38 | 'p_thrift': re.compile('(thrifty)', re.IGNORECASE), 39 | 'p_drug': re.compile('(drug\-friendly)', re.IGNORECASE), 40 | 'p_arro': re.compile('(arrogant)', re.IGNORECASE), 41 | 'p_sloppy': 
re.compile('(sloppy)', re.IGNORECASE), 42 | 'p_extro': re.compile('(extroverted)', re.IGNORECASE), 43 | 'p_geeky': re.compile('(geeky)', re.IGNORECASE), 44 | 'p_aggre': re.compile('(aggressive)', re.IGNORECASE), 45 | 'p_expsex': re.compile('(experienced in sex)', re.IGNORECASE), 46 | 'p_capi': re.compile('(capitalistic)', re.IGNORECASE), 47 | 'p_exer': re.compile('(into exercise)', re.IGNORECASE), 48 | 'p_kind': re.compile('(kind)', re.IGNORECASE), 49 | 'p_pure': re.compile('(pure)', re.IGNORECASE), 50 | 'p_convenmoral': re.compile('(conventionally moral)', re.IGNORECASE), 51 | 'p_manners': re.compile('(mannered)', re.IGNORECASE), 52 | 'p_ambi': re.compile('(ambitious)', re.IGNORECASE), 53 | 'p_polit': re.compile('(political)', re.IGNORECASE), 54 | 'p_greed': re.compile('(greedy)', re.IGNORECASE), 55 | 'p_sexdrive': re.compile('(sex\-driven)', re.IGNORECASE), 56 | 'p_energetic': re.compile('(energetic)', re.IGNORECASE), 57 | 'p_cool': re.compile('(cool)', re.IGNORECASE), 58 | 'p_introvert': re.compile('(introverted)', re.IGNORECASE), 59 | 'p_trusting': re.compile('(trusting)', re.IGNORECASE), 60 | 'p_dominant': re.compile('(dominant)', re.IGNORECASE), 61 | 'p_laidback': re.compile('(laid\-back)', re.IGNORECASE), 62 | 'p_submissive': re.compile('(submissive)', re.IGNORECASE), 63 | 'p_explife': re.compile('(experienced in life)', re.IGNORECASE), 64 | 'p_friendstrangers': re.compile('(friendly to strangers)', re.IGNORECASE), 65 | 'p_honest': re.compile('(honest)', re.IGNORECASE), 66 | 'p_giving': re.compile('(giving)', re.IGNORECASE), 67 | 'p_passion': re.compile('(passion\-driven)', re.IGNORECASE), 68 | 'p_progress': re.compile('(progressive)', re.IGNORECASE) 69 | } 70 | 71 | # User 72 | user = None 73 | password = None 74 | 75 | # Seeds and target 76 | target = None 77 | 78 | # TODO: Format argument in constructor 79 | # Args 80 | # user - username for bot account 81 | # pass - password for bot account 82 | # target - optional target for scraping single users 83 | # 
format - optional (default: tsv) format to export data to (e.g. csv, tsv) 84 | def __init__(self, *args, **kwargs): 85 | super(OkCubotSpider, self).__init__(*args, **kwargs) 86 | 87 | if "user" not in kwargs or "pass" not in kwargs: 88 | print "Please supply a user and a password" 89 | exit() 90 | 91 | if "target" in kwargs: 92 | self.target = kwargs['target'] 93 | 94 | self.user = kwargs['user'] 95 | self.password = kwargs['pass'] 96 | 97 | # Patch 98 | self.monkey_patch_HTTPClientParser_statusReceived() 99 | 100 | def start_requests(self): 101 | return [FormRequest("https://www.okcupid.com/login", 102 | formdata={'username': self.user, 'password': self.password}, 103 | callback=self.logged_in)] 104 | 105 | def logged_in(self, response): 106 | selector = Selector(response) 107 | 108 | if self.target != None: 109 | # We only want to scrape this user 110 | yield Request(self.base_url + '/profile/' + self.target, callback=self.parse_profile) 111 | else: 112 | profiles = selector.css('#similar_users_list li > a::attr(href), .match > a::attr(href)').extract() 113 | if len(profiles) == 0: 114 | log.msg('Credentials incorrect.', level=log.ERROR) 115 | else: 116 | for url in profiles: 117 | log.msg('Seeded bot with user (' + url + ')') 118 | yield Request(self.base_url + url, callback=self.parse_profile) 119 | 120 | def parse_profile(self, response): 121 | selector = Selector(response) 122 | 123 | # TODO: Handle parameters which are - as none. 124 | # A note on this is that you can set default values 125 | # etc. in the Field method of an item. 
126 | # TODO: Trim 127 | # TODO: Implement 128 | # d_religion_type = Field() 129 | # d_religion_seriosity = Field() 130 | # d_astrology_sign = Field() 131 | # d_astrology_seriosity = Field() 132 | # d_education_phase = Field() 133 | # d_education_type = Field() 134 | # d_job = Field() 135 | # d_income = Field() 136 | # d_relationship = Field() 137 | # d_relationship_manner = Field() 138 | # d_relationship_type = Field() 139 | # d_offspring_current = Field() 140 | # d_offspring_desires = Field() 141 | # d_pets_dogs = Field() 142 | # d_pets_cats = Field() 143 | # Looking for 144 | # lf_want = Field() 145 | # lf_min_age = Field() 146 | # lf_max_age = Field() 147 | # lf_location = Field() 148 | # lf_single = Field() 149 | # lf_for = Field() 150 | 151 | attribute_dict = { 152 | 'd_username': '#basic_info_sn.name::text', 153 | 'd_age': '#ajax_age::text', 154 | 'd_gender': '.ajax_gender::text', 155 | 'd_orientation': '#ajax_orientation::text', 156 | 'd_ethnicity': '#ajax_ethnicities::text', 157 | 'd_bodytype': '#ajax_bodytype::text', 158 | 'd_smokes': '#ajax_smoking::text', 159 | 'd_drinks': '#ajax_drinking::text', 160 | 'd_drugs': '#ajax_drugs::text', 161 | 'd_languages': '#ajax_languages::text', 162 | } 163 | 164 | user = UserItem() 165 | # Iterate over attribute dictionary and fetch data 166 | for attr, ident in attribute_dict.iteritems(): 167 | val = selector.css(ident).extract()[0] 168 | 169 | # Trim 170 | val = val.strip() 171 | 172 | # Translate - to blanks 173 | val = val.replace('—', '') 174 | 175 | # Set attribute 176 | user[attr] = val 177 | 178 | #name = selector.css('#basic_info_sn.name::text').extract()[0] 179 | 180 | #age = selector.css('#ajax_age::text').extract()[0] 181 | #gender = selector.css('.ajax_gender::text').extract()[0] 182 | location = selector.css('#ajax_location::text').extract()[0].split(',') 183 | city = location[0] 184 | country = location[1] 185 | #orientation = selector.css('#ajax_orientation::text').extract()[0] 186 | #ethnicity = 
selector.css('#ajax_ethnicities::text').extract() 187 | #bodytype = selector.css('#ajax_bodytype::text').extract()[0] 188 | diet = selector.css('#ajax_diet::text').extract()[0].split(' ') 189 | # Diet handling stuff 190 | diet_manner = 0 191 | diet_type = 0 192 | if len(diet) == 1: 193 | diet_type = diet[0] 194 | else: 195 | diet_manner = diet[0] 196 | diet_type = diet[1] 197 | # End diet handling stuff 198 | #smokes = selector.css('#ajax_smoking::text').extract()[0] 199 | #drinks = selector.css('#ajax_drinking::text').extract()[0] 200 | #drugs = selector.css('#ajax_drugs::text').extract()[0] 201 | 202 | #user['d_username'] = name 203 | #user['d_age'] = age 204 | #user['d_gender'] = gender 205 | user['d_city'] = city 206 | user['d_country'] = country 207 | #user['d_orientation'] = orientation 208 | #user['d_ethnicity'] = ethnicity # TODO: fix 209 | #user['d_bodytype'] = bodytype 210 | user['d_diet_manner'] = diet_manner 211 | user['d_diet_type'] = diet_type 212 | #user['d_smokes'] = smokes 213 | #user['d_drinks'] = drinks 214 | #user['d_drugs'] = drugs 215 | 216 | # Request parsing of the user's personality traits 217 | request = Request(self.base_url + '/profile/' + name + '/personality', callback=self.parse_personality, priority=100) 218 | request.meta['user'] = user 219 | yield request 220 | 221 | if self.target == None: 222 | # Find other users 223 | i = 0 224 | for url in selector.css('#similar_users_list li > a::attr(href), .match > a::attr(href)').extract(): 225 | i += 1 226 | if url != response.request.url: 227 | yield Request(self.base_url + url, callback=self.parse_profile, priority=-100) 228 | log.msg('Queued ' + `i` + ' users from ' + name) 229 | 230 | def parse_personality(self, response): 231 | selector = Selector(response) 232 | 233 | user = response.meta['user'] 234 | 235 | i = 0 236 | for trait in selector.css('.pt_row'): 237 | label = trait.css('label::text').extract()[0] 238 | percentage = re.sub('(width\:)|(\%\;)', '', 
trait.css('span::attr(style)').extract()[0]) 239 | 240 | try: 241 | percentage = int(percentage) 242 | except ValueError: 243 | log.msg('Could not parse trait, moving on.', level=log.ERROR) 244 | continue 245 | 246 | if len(trait.css('p.right > label')) == 1: 247 | # Label is in right p, so negate the percentage 248 | percentage = -percentage 249 | 250 | actual = None 251 | for t, r in self.personality_scale_dict.iteritems(): 252 | if r.search(label): 253 | actual = t 254 | 255 | if actual == None: 256 | log.msg('Unknown trait ' + label, level=log.ERROR) 257 | else: 258 | user[actual] = percentage 259 | i += 1 260 | log.msg(`i` + ' traits parsed for user ' + user['d_username']) 261 | 262 | # Request parsing questions/answers 263 | request = Request(self.base_url + '/profile/' + user['d_username'] + '/questions', callback=self.parse_questions, priority=400) 264 | request.meta['user'] = user 265 | yield request 266 | 267 | def parse_questions(self, response): 268 | selector = Selector(response) 269 | 270 | user = response.meta['user'] 271 | 272 | i = 0 273 | for qa in selector.css('.question'): 274 | i += 1 275 | 276 | qid = qa.css('::attr(data-qid)').extract()[0] 277 | if qa.css('.not_answered'): 278 | # Not answered, answer it and store it. 
279 | question = QuestionItem() 280 | question['id'] = qid 281 | question['text'] = qa.css('.qtext > p::text').extract()[0] 282 | options = qa.css('.my_answer > label::text').extract() 283 | 284 | if len(options) > 0: 285 | question['option_1'] = options[0] 286 | if len(options) > 1: 287 | question['option_2'] = options[1] 288 | if len(options) > 2: 289 | question['option_3'] = options[2] 290 | if len(options) > 3: 291 | question['option_4'] = options[3] 292 | 293 | # TODO: Make the bot actually answer the question 294 | #yield self.answer_question(qid, 1) 295 | yield question 296 | else: 297 | answer = AnswerItem() 298 | answer['author'] = user['d_username'] 299 | answer['question'] = qid 300 | answer['answer'] = qa.css('.answers .target .text::text').extract()[0] 301 | answer['answer_text'] = qa.css('.answers .target .note::text').extract()[0] 302 | 303 | yield answer 304 | 305 | log.msg(`i` + ' questions/answers parsed for user ' + user['d_username']) 306 | 307 | if len(selector.css('.pages .next.disabled').extract()) > 0: 308 | # We don't have any more pages. Yield the user. 309 | log.msg('Done processing ' + user['d_username']) 310 | yield user 311 | else: 312 | # We're not done. 
313 | next = selector.css('.pages .next > a::attr(href)').extract()[0] 314 | request = Request(self.base_url + next, callback=self.parse_questions, priority=400) 315 | request.meta['user'] = user 316 | yield request 317 | 318 | def answer_question(self, qid, option): 319 | return FormRequest("https://www.okcupid.com/questions/ask", 320 | formdata={ 321 | 'ajax': '1', 322 | 'submit': '1', 323 | 'answer_question': '1', 324 | 'skip': '0', 325 | 'show_all': '0', 326 | 'is_new': '1', 327 | 'matchanswers': 'irrelevant', 328 | 'qid': str(qid), 329 | 'importance': '5', 330 | 'is_public': '1', 331 | 'note': '', 332 | 'delete_note': '0', 333 | 'targetid': '', 334 | 'is_public': '1', 335 | 'answers': str(option) 336 | }, 337 | callback=self.answered, priority=1000, 338 | headers={ 339 | 'Accept': 'application/json', 340 | 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', 341 | 'X-Requested-With': 'XMLHttpRequest' 342 | }) 343 | 344 | def answered(self, response): 345 | # TODO: WE NEED TO GET THEIR ANSWER (hint: target in POST) 346 | pass 347 | 348 | def monkey_patch_HTTPClientParser_statusReceived(self): 349 | """ 350 | Monkey patch for twisted.web._newclient.HTTPClientParser.statusReceived 351 | """ 352 | from twisted.web._newclient import HTTPClientParser, ParseError 353 | old_sr = HTTPClientParser.statusReceived 354 | def statusReceived(self, status): 355 | try: 356 | return old_sr(self, status) 357 | except ParseError, e: 358 | if e.args[0] == 'wrong number of parts': 359 | log.msg('Wrong number of parts in header. 
Assuming 200 OK', level=log.DEBUG) 360 | return old_sr(self, str(status) + ' OK') 361 | raise 362 | statusReceived.__doc__ == old_sr.__doc__ 363 | HTTPClientParser.statusReceived = statusReceived -------------------------------------------------------------------------------- /okcubot/okcubot/spiders/okcubot_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | from scrapy.spider import Spider 3 | from scrapy.http import Request, FormRequest 4 | from scrapy.selector import Selector 5 | from scrapy import log 6 | 7 | import re 8 | 9 | from okcubot.items import UserItem, QuestionItem, AnswerItem 10 | 11 | class OkCubotSpider(Spider): 12 | # Spider settings 13 | name = "okcubot" 14 | 15 | # Others 16 | base_url = "http://www.okcupid.com" 17 | 18 | user_queue = [] 19 | 20 | # TODO: Implement missing traits 21 | personality_scale_dict = { 22 | 'p_explove': re.compile('(experienced in love)', re.IGNORECASE), 23 | 'p_adven': re.compile('(adventurous)', re.IGNORECASE), 24 | 'p_indie': re.compile('(indie)', re.IGNORECASE), 25 | 'p_spon': re.compile('(spontaneous)', re.IGNORECASE), 26 | 'p_scien': re.compile('(scientific)', re.IGNORECASE), 27 | 'p_inde': re.compile('(independent)', re.IGNORECASE), 28 | 'p_conf': re.compile('(confident)', re.IGNORECASE), 29 | 'p_math': re.compile('(mathematical)', re.IGNORECASE), 30 | 'p_logic': re.compile('(logical)', re.IGNORECASE), 31 | 'p_organ': re.compile('(organized)', re.IGNORECASE), 32 | 'p_oldfash': re.compile('(old\-fashioned)', re.IGNORECASE), 33 | 'p_lit': re.compile('(literary)', re.IGNORECASE), 34 | 'p_opti': re.compile('(optimistic)', re.IGNORECASE), 35 | 'p_roman': re.compile('(romantic)', re.IGNORECASE), 36 | 'p_comp': re.compile('(compassionate)', re.IGNORECASE), 37 | 'p_lovedri': re.compile('(love\-driven)', re.IGNORECASE), 38 | 'p_sprit': re.compile('(spiritual)', re.IGNORECASE), 39 | 'p_kinky': re.compile('(kinky)', re.IGNORECASE), 40 | 'p_artsy': 
re.compile('(artsy)', re.IGNORECASE), 41 | 'p_thrift': re.compile('(thrifty)', re.IGNORECASE), 42 | 'p_drug': re.compile('(drug\-friendly)', re.IGNORECASE), 43 | 'p_arro': re.compile('(arrogant)', re.IGNORECASE), 44 | 'p_sloppy': re.compile('(sloppy)', re.IGNORECASE), 45 | 'p_extro': re.compile('(extroverted)', re.IGNORECASE), 46 | 'p_geeky': re.compile('(geeky)', re.IGNORECASE), 47 | 'p_aggre': re.compile('(aggressive)', re.IGNORECASE), 48 | 'p_expsex': re.compile('(experienced in sex)', re.IGNORECASE), 49 | 'p_capi': re.compile('(capitalistic)', re.IGNORECASE), 50 | 'p_exer': re.compile('(into exercise)', re.IGNORECASE), 51 | 'p_kind': re.compile('(kind)', re.IGNORECASE), 52 | 'p_pure': re.compile('(pure)', re.IGNORECASE), 53 | 'p_convenmoral': re.compile('(conventionally moral)', re.IGNORECASE), 54 | 'p_manners': re.compile('(mannered)', re.IGNORECASE), 55 | 'p_ambi': re.compile('(ambitious)', re.IGNORECASE), 56 | 'p_polit': re.compile('(political)', re.IGNORECASE), 57 | 'p_greed': re.compile('(greedy)', re.IGNORECASE), 58 | 'p_sexdrive': re.compile('(sex\-driven)', re.IGNORECASE), 59 | 'p_energetic': re.compile('(energetic)', re.IGNORECASE), 60 | 'p_cool': re.compile('(cool)', re.IGNORECASE), 61 | 'p_introvert': re.compile('(introverted)', re.IGNORECASE), 62 | 'p_trusting': re.compile('(trusting)', re.IGNORECASE), 63 | 'p_dominant': re.compile('(dominant)', re.IGNORECASE), 64 | 'p_laidback': re.compile('(laid\-back)', re.IGNORECASE), 65 | 'p_submissive': re.compile('(submissive)', re.IGNORECASE), 66 | 'p_explife': re.compile('(experienced in life)', re.IGNORECASE), 67 | 'p_friendstrangers': re.compile('(friendly to strangers)', re.IGNORECASE), 68 | 'p_honest': re.compile('(honest)', re.IGNORECASE), 69 | 'p_giving': re.compile('(giving)', re.IGNORECASE), 70 | 'p_passion': re.compile('(passion\-driven)', re.IGNORECASE), 71 | 'p_progress': re.compile('(progressive)', re.IGNORECASE) 72 | } 73 | 74 | # Regular expressions 75 | education_re = re.compile('(graduated 
from|working on|dropped out of|)\s?(high school|university|masters program|law school|med school|space camp|ph\.d program|two\-year college)', re.IGNORECASE) 76 | lf_age_re = re.compile('(\d+).(\d+)') 77 | 78 | # User 79 | user = None 80 | password = None 81 | 82 | # Seeds and target 83 | target = None 84 | 85 | # Args 86 | # user - username for bot account 87 | # pass - password for bot account 88 | # target - optional target for scraping single users 89 | def __init__(self, *args, **kwargs): 90 | super(OkCubotSpider, self).__init__(*args, **kwargs) 91 | 92 | if "user" not in kwargs or "pass" not in kwargs: 93 | print "Please supply a user and a password" 94 | exit() 95 | 96 | if "target" in kwargs: 97 | self.target = kwargs['target'] 98 | 99 | self.user = kwargs['user'] 100 | self.password = kwargs['pass'] 101 | 102 | # Patch 103 | self.monkey_patch_HTTPClientParser_statusReceived() 104 | 105 | def spider_idle(self, spider): 106 | next_user = self.next_user() 107 | if next_user is not None: 108 | yield next_user 109 | 110 | def queue_user(self, req): 111 | if req not in self.user_queue: 112 | if len(self.user_queue) < 100: 113 | self.user_queue.append(req) 114 | return True 115 | else: 116 | log.msg('User queue is too big. Skipping.') 117 | return False 118 | 119 | log.msg('User is already in queue. 
Skipping.') 120 | return False 121 | 122 | 123 | def next_user(self): 124 | if len(self.user_queue) > 0: 125 | return self.user_queue.pop() 126 | return None 127 | 128 | def start_requests(self): 129 | return [FormRequest("https://www.okcupid.com/login", 130 | formdata={'username': self.user, 'password': self.password}, 131 | callback=self.logged_in)] 132 | 133 | def logged_in(self, response): 134 | selector = Selector(response) 135 | 136 | if self.target != None: 137 | # We only want to scrape this user 138 | yield Request(self.base_url + '/profile/' + self.target, callback=self.parse_profile) 139 | else: 140 | profiles = selector.css('#similar_users_list li > a::attr(href), #matchphotobrowser_int .item a.name::attr(href)').extract() 141 | if len(profiles) == 0: 142 | log.msg('Credentials incorrect.', level=log.ERROR) 143 | else: 144 | for url in profiles: 145 | log.msg('Seeded bot with user (' + url + ')') 146 | self.queue_user(Request(self.base_url + url, callback=self.parse_profile)) 147 | 148 | # Yield two users to get things started 149 | yield self.next_user() 150 | yield self.next_user() 151 | 152 | def parse_profile(self, response): 153 | selector = Selector(response) 154 | 155 | # TODO: Implement 156 | # d_pets_dogs = Field() 157 | # d_pets_cats = Field() 158 | 159 | attribute_dict = { 160 | # Details 161 | 'd_username': '#basic_info_sn.name::text', 162 | 'd_age': '#ajax_age::text', 163 | 'd_gender': '.ajax_gender::text', 164 | 'd_orientation': '#ajax_orientation::text', 165 | 'd_ethnicity': '#ajax_ethnicities::text', 166 | 'd_bodytype': '#ajax_bodytype::text', 167 | 'd_relationship': '#ajax_status::text', 168 | 'd_smokes': '#ajax_smoking::text', 169 | 'd_drinks': '#ajax_drinking::text', 170 | 'd_drugs': '#ajax_drugs::text', 171 | 'd_languages': '#ajax_languages::text', 172 | 'd_job': '#ajax_job::text', 173 | 'd_income': '#ajax_income::text', 174 | 175 | # Looking for 176 | 'lf_location': '#ajax_near::text', 177 | 'lf_want': '#ajax_gentation::text', 178 | 
'lf_single': '#ajax_single::text' 179 | } 180 | 181 | user = UserItem() 182 | # Iterate over attribute dictionary and fetch data 183 | for attr, ident in attribute_dict.iteritems(): 184 | val = selector.css(ident).extract()[0] 185 | 186 | # Translate - to blanks 187 | val = val.encode('utf-8').replace('—', '') 188 | 189 | # Set attribute 190 | user[attr] = val 191 | 192 | # Looking for 193 | user['lf_for'] = selector.css('#ajax_lookingfor::text').extract()[0].replace('for', '') 194 | 195 | age = selector.css('#ajax_ages::text').extract()[0] 196 | age = self.lf_age_re.findall(age) 197 | lf_min_age = None 198 | lf_max_age = None 199 | if len(age) > 0: 200 | # Stored in tuples.. appearantly 201 | lf_min_age = age[0][0] 202 | lf_max_age = age[0][1] 203 | user['lf_min_age'] = lf_min_age 204 | user['lf_max_age'] = lf_max_age 205 | 206 | # Location 207 | location = selector.css('#ajax_location::text').extract()[0].split(',') 208 | city = location[0] 209 | country = location[1] 210 | user['d_city'] = city 211 | user['d_country'] = country 212 | 213 | # Diet 214 | diet = selector.css('#ajax_diet::text').extract()[0].split(' ') 215 | diet_manner = None 216 | diet_type = None 217 | if len(diet) == 1: 218 | diet_type = diet[0] 219 | elif len(diet) == 2: 220 | diet_manner = diet[0] 221 | diet_type = diet[1] 222 | user['d_diet_manner'] = diet_manner 223 | user['d_diet_type'] = diet_type 224 | 225 | # Religion 226 | religion = selector.css('#ajax_religion::text').extract()[0].split(',') 227 | religion_type = None 228 | religion_seriosity = None 229 | if len(religion) == 1: 230 | religion_type = religion[0] 231 | elif len(religion) == 2: 232 | religion_type = religion[0] 233 | religion_seriosity = religion[1] 234 | user['d_religion_type'] = religion_type 235 | user['d_religion_seriosity'] = religion_seriosity 236 | 237 | # Astrology 238 | astrology = selector.css('#ajax_sign::text').extract()[0].split(',') 239 | astrology_sign = None 240 | astrology_seriosity = None 241 | if 
len(astrology) == 1: 242 | astrology_sign = astrology[0] 243 | elif len(astrology) == 2: 244 | astrology_sign = astrology[0] 245 | astrology_seriosity = astrology[1] 246 | user['d_astrology_sign'] = astrology_sign 247 | user['d_astrology_seriosity'] = astrology_seriosity 248 | 249 | # Relationship 250 | relationship = selector.css('#ajax_monogamous::text').extract()[0].split(' ') 251 | relationship_manner = None 252 | relationship_type = None 253 | if len(relationship) == 1: 254 | relationship_type = relationship[0] 255 | elif len(relationship) == 2: 256 | relationship_manner = relationship[0] 257 | relationship_type = relationship[1] 258 | user['d_relationship_manner'] = relationship_manner 259 | user['d_relationship_type'] = relationship_type 260 | 261 | # Offspring 262 | offspring = selector.css('#ajax_children::text').extract()[0].split(' ') 263 | offspring_desires = None 264 | offspring_current = None 265 | if len(offspring) == 1: 266 | offspring_current = offspring[0] 267 | elif len(offspring) == 2: 268 | offspring_desires = offspring[0] 269 | offspring_current = offspring[1] 270 | user['d_offspring_desires'] = offspring_desires 271 | user['d_offspring_current'] = offspring_current 272 | 273 | # Education 274 | education = selector.css('#ajax_education::text').extract()[0] 275 | education = self.education_re.findall(education) 276 | education_phase = None 277 | education_type = None 278 | if len(education) > 0: 279 | # Stored in tuples.. 
appearantly 280 | education_phase = education[0][0] 281 | education_type = education[0][1] 282 | user['d_education_type'] = education_type 283 | user['d_education_phase'] = education_phase 284 | 285 | # Trim values 286 | for attr, val in user.iteritems(): 287 | if val is not None: 288 | user[attr] = val.strip() 289 | 290 | # Request parsing of the user's personality traits 291 | request = Request(self.base_url + '/profile/' + user['d_username'] + '/personality', callback=self.parse_personality, priority=100) 292 | request.meta['user'] = user 293 | yield request 294 | 295 | if self.target == None: 296 | # Find other users 297 | i = 0 298 | for url in selector.css('#similar_users_list li > a::attr(href), .match > a::attr(href)').extract(): 299 | if url != response.request.url: 300 | if self.queue_user(Request(self.base_url + url, callback=self.parse_profile, priority=-100)): 301 | i += 1 302 | log.msg('Queued ' + `i` + ' users from ' + user['d_username'] + ' (' + str(len(self.user_queue)) + ')') 303 | 304 | def parse_personality(self, response): 305 | selector = Selector(response) 306 | 307 | user = response.meta['user'] 308 | 309 | i = 0 310 | for trait in selector.css('.pt_row'): 311 | label = trait.css('label::text').extract()[0] 312 | percentage = re.sub('(width\:)|(\%\;)', '', trait.css('span::attr(style)').extract()[0]) 313 | 314 | try: 315 | percentage = int(percentage) 316 | except ValueError: 317 | log.msg('Could not parse trait, moving on.', level=log.ERROR) 318 | continue 319 | 320 | if len(trait.css('p.right > label')) == 1: 321 | # Label is in right p, so negate the percentage 322 | percentage = -percentage 323 | 324 | actual = None 325 | for t, r in self.personality_scale_dict.iteritems(): 326 | if r.search(label): 327 | actual = t 328 | 329 | if actual == None: 330 | log.msg('Unknown trait ' + label, level=log.ERROR) 331 | else: 332 | user[actual] = percentage 333 | i += 1 334 | log.msg(`i` + ' traits parsed for user ' + user['d_username']) 335 | 336 | 
# Request parsing questions/answers 337 | request = Request(self.base_url + '/profile/' + user['d_username'] + '/questions', callback=self.parse_questions, priority=400) 338 | request.meta['user'] = user 339 | yield request 340 | 341 | def parse_questions(self, response): 342 | selector = Selector(response) 343 | 344 | user = response.meta['user'] 345 | 346 | i = 0 347 | for qa in selector.css('.question'): 348 | i += 1 349 | 350 | qid = qa.css('::attr(data-qid)').extract()[0] 351 | if qa.css('.not_answered'): 352 | # Not answered, answer it and store it. 353 | question = QuestionItem() 354 | question['id'] = qid 355 | question['text'] = qa.css('.qtext > p::text').extract()[0].strip() 356 | options = qa.css('.my_answer > label::text').extract() 357 | 358 | if len(options) > 0: 359 | question['option_1'] = options[0].strip() 360 | if len(options) > 1: 361 | question['option_2'] = options[1].strip() 362 | if len(options) > 2: 363 | question['option_3'] = options[2].strip() 364 | if len(options) > 3: 365 | question['option_4'] = options[3].strip() 366 | 367 | # TODO: Make the bot actually answer the question 368 | #yield self.answer_question(qid, 1) 369 | yield question 370 | else: 371 | answer = AnswerItem() 372 | answer['author'] = user['d_username'] 373 | answer['question'] = qid 374 | answer['answer'] = qa.css('.answers .target .text::text').extract()[0].strip() 375 | answer['answer_text'] = qa.css('.answers .target .note::text').extract()[0].strip() 376 | 377 | yield answer 378 | 379 | log.msg(`i` + ' questions/answers parsed for user ' + user['d_username']) 380 | 381 | if len(selector.css('.pages .next.disabled').extract()) > 0: 382 | # We don't have any more pages. Yield the user. 383 | log.msg('Done processing ' + user['d_username']) 384 | yield user 385 | 386 | next_user = self.next_user() 387 | if next_user is None: 388 | log.msg('No more users in queue.') 389 | else: 390 | yield next_user 391 | else: 392 | # We're not done. 
393 | next = selector.css('.pages .next > a::attr(href)').extract()[0] 394 | request = Request(self.base_url + next, callback=self.parse_questions, priority=400) 395 | request.meta['user'] = user 396 | yield request 397 | 398 | # FIXME: ? 399 | # The bot tries to answer a question in order to fetch answers from 400 | # the target profile. 401 | def answer_question(self, qid, option): 402 | return FormRequest("https://www.okcupid.com/questions/ask", 403 | formdata={ 404 | 'ajax': '1', 405 | 'submit': '1', 406 | 'answer_question': '1', 407 | 'skip': '0', 408 | 'show_all': '0', 409 | 'is_new': '1', 410 | 'matchanswers': 'irrelevant', 411 | 'qid': str(qid), 412 | 'importance': '5', 413 | 'is_public': '1', 414 | 'note': '', 415 | 'delete_note': '0', 416 | 'targetid': '', 417 | 'is_public': '1', 418 | 'answers': str(option) 419 | }, 420 | callback=self.answered, priority=1000, 421 | headers={ 422 | 'Accept': 'application/json', 423 | 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', 424 | 'X-Requested-With': 'XMLHttpRequest' 425 | }) 426 | 427 | # TODO: Implement 428 | def answered(self, response): 429 | # TODO: WE NEED TO GET THEIR ANSWER (hint: target in POST) 430 | pass 431 | 432 | def monkey_patch_HTTPClientParser_statusReceived(self): 433 | """ 434 | Monkey patch for twisted.web._newclient.HTTPClientParser.statusReceived 435 | """ 436 | from twisted.web._newclient import HTTPClientParser, ParseError 437 | old_sr = HTTPClientParser.statusReceived 438 | def statusReceived(self, status): 439 | try: 440 | return old_sr(self, status) 441 | except ParseError, e: 442 | if e.args[0] == 'wrong number of parts': 443 | log.msg('Wrong number of parts in header. 
Assuming 200 OK', level=log.DEBUG) 444 | return old_sr(self, str(status) + ' OK') 445 | raise 446 | statusReceived.__doc__ == old_sr.__doc__ 447 | HTTPClientParser.statusReceived = statusReceived 448 | -------------------------------------------------------------------------------- /okcubot/project.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 1.0 2 | Name: project 3 | Version: 1.0 4 | Summary: UNKNOWN 5 | Home-page: UNKNOWN 6 | Author: UNKNOWN 7 | Author-email: UNKNOWN 8 | License: UNKNOWN 9 | Description: UNKNOWN 10 | Platform: UNKNOWN 11 | -------------------------------------------------------------------------------- /okcubot/project.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | setup.py 2 | okcubot/__init__.py 3 | okcubot/items.py 4 | okcubot/pipelines.py 5 | okcubot/settings.py 6 | okcubot/spiders/__init__.py 7 | okcubot/spiders/okcubot_spider.old.py 8 | okcubot/spiders/okcubot_spider.py 9 | project.egg-info/PKG-INFO 10 | project.egg-info/SOURCES.txt 11 | project.egg-info/dependency_links.txt 12 | project.egg-info/entry_points.txt 13 | project.egg-info/top_level.txt -------------------------------------------------------------------------------- /okcubot/project.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /okcubot/project.egg-info/entry_points.txt: -------------------------------------------------------------------------------- 1 | [scrapy] 2 | settings = okcubot.settings 3 | 4 | -------------------------------------------------------------------------------- /okcubot/project.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | okcubot 2 | 
-------------------------------------------------------------------------------- /okcubot/run.bat: -------------------------------------------------------------------------------- 1 | :loop 2 | scrapy crawl okcubot -auser=username -apass=password 3 | goto loop -------------------------------------------------------------------------------- /okcubot/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | default = okcubot.settings 8 | 9 | [deploy:aws-target] 10 | url = http://nbjerg.dk:6800/ 11 | project = okcubot 12 | -------------------------------------------------------------------------------- /okcubot/setup.py: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapyd-deploy 2 | 3 | from setuptools import setup, find_packages 4 | 5 | setup( 6 | name = 'project', 7 | version = '1.0', 8 | packages = find_packages(), 9 | entry_points = {'scrapy': ['settings = okcubot.settings']}, 10 | ) 11 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Twisted==11.0.0 2 | scrapy==0.24.5 3 | scrapely 4 | loginform 5 | lxml 6 | --------------------------------------------------------------------------------