├── .gitattributes ├── .gitignore ├── README.md ├── Vagrantfile ├── get-pip.py ├── okcubot ├── okcubot │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ ├── okcubot_spider.old.py │ │ └── okcubot_spider.py ├── project.egg-info │ ├── PKG-INFO │ ├── SOURCES.txt │ ├── dependency_links.txt │ ├── entry_points.txt │ └── top_level.txt ├── run.bat ├── scrapy.cfg └── setup.py └── requirements.txt /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Windows image file caches 2 | Thumbs.db 3 | ehthumbs.db 4 | 5 | # Folder config file 6 | Desktop.ini 7 | 8 | # Recycle Bin used on file shares 9 | $RECYCLE.BIN/ 10 | 11 | # Windows Installer files 12 | *.cab 13 | *.msi 14 | *.msm 15 | *.msp 16 | 17 | # Windows shortcuts 18 | *.lnk 19 | 20 | # ========================= 21 | # Operating System Files 22 | # ========================= 23 | 24 | # OSX 25 | # ========================= 26 | 27 | .DS_Store 28 | .AppleDouble 29 | .LSOverride 30 | 31 | # Thumbnails 32 | ._* 33 | 34 | # Files that might appear on external disk 35 | .Spotlight-V100 36 | .Trashes 37 | 38 | # Directories potentially created on remote AFP share 39 | .AppleDB 40 | .AppleDesktop 41 | Network Trash Folder 42 | Temporary Items 43 | .apdisk 44 | 45 | # Build files 46 | *.pyc 47 | okcubot/build/ 48 | 49 
| # R files 50 | .Rhistory 51 | 52 | # Project files 53 | .idea 54 | .sync 55 | .vagrant 56 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # OKCubot 2 | 3 | Scraping people for science and stuff. 4 | 5 | ## Instructions 6 | 7 | 1. Install Python 2.7 8 | 2. Install pip (you could use python get-pip.py, but don't trust anyone) 9 | 3. Add pip to your path 10 | 4. python -m pip install -r requirements.txt 11 | 12 | ## Troubleshooting 13 | 14 | I don't know. Add an issue. 15 | -------------------------------------------------------------------------------- /Vagrantfile: -------------------------------------------------------------------------------- 1 | # -*- mode: ruby -*- 2 | # vi: set ft=ruby : 3 | 4 | # Vagrantfile API/syntax version. Don't touch unless you know what you're doing! 5 | VAGRANTFILE_API_VERSION = "2" 6 | 7 | Vagrant.configure(VAGRANTFILE_API_VERSION) do |config| 8 | # All Vagrant configuration is done here. The most common configuration 9 | # options are documented and commented below. For a complete reference, 10 | # please see the online documentation at vagrantup.com. 11 | 12 | config.vm.hostname = "okcubot" 13 | 14 | # Every Vagrant virtual environment requires a box to build off of. 15 | config.vm.box = "precise64" 16 | 17 | # The url from where the 'config.vm.box' box will be fetched if it 18 | # doesn't already exist on the user's system. 19 | config.vm.box_url = "http://files.vagrantup.com/precise64.box" 20 | 21 | # Create a forwarded port mapping which allows access to a specific port 22 | # within the machine from a port on the host machine. In the example below, 23 | # accessing "localhost:8080" will access port 80 on the guest machine. 24 | # config.vm.network :forwarded_port, guest: 80, host: 8080 25 | 26 | # Create a private network, which allows host-only access to the machine 27 | # using a specific IP. 
28 | config.vm.network :private_network, ip: "192.168.33.10" 29 | 30 | # Create a public network, which generally matched to bridged network. 31 | # Bridged networks make the machine appear as another physical device on 32 | # your network. 33 | # config.vm.network :public_network 34 | 35 | # If true, then any SSH connections made will enable agent forwarding. 36 | # Default value: false 37 | config.ssh.forward_agent = true 38 | 39 | # Share an additional folder to the guest VM. The first argument is 40 | # the path on the host to the actual folder. The second argument is 41 | # the path on the guest to mount the folder. And the optional third 42 | # argument is a set of non-required options. 43 | # config.vm.synced_folder "../data", "/vagrant_data" 44 | 45 | # Provider-specific configuration so you can fine-tune various 46 | # backing providers for Vagrant. These expose provider-specific options. 47 | # Example for VirtualBox: 48 | # 49 | config.vm.provider :virtualbox do |vb| 50 | # # Don't boot with headless mode 51 | # vb.gui = true 52 | # 53 | # # Use VBoxManage to customize the VM. For example to change memory: 54 | vb.customize ["modifyvm", :id, "--memory", "512"] 55 | end 56 | # 57 | # View the documentation for the provider you're using for more 58 | # information on available options. 
59 | 60 | config.vm.provision :shell do |shell| 61 | if File.exists?(Dir.home + '/.gitconfig') 62 | shell.args = "'#{File.read(Dir.home + '/.gitconfig').strip.gsub!(/\n/, '\n')}'" 63 | end 64 | shell.inline = setup_project(config.vm.hostname) 65 | end 66 | end 67 | 68 | # shell script to set up project 69 | def setup_project(project) 70 | return <<-EOS 71 | export PROJECT=#{project} 72 | 73 | export PYTHON_VERSION=2.7 74 | export SETUPTOOLS_VERSION=1.1.6 75 | export VIRTUALENVWRAPPER_VERSION=4.1.1 76 | 77 | export VAGRANT_USER=vagrant 78 | export VAGRANT_HOME=/home/$VAGRANT_USER 79 | 80 | if [ -n "$1" ]; then 81 | echo -e $1 > $VAGRANT_HOME/.gitconfig 82 | chown $VAGRANT_USER.$VAGRANT_USER $VAGRANT_HOME/.gitconfig 83 | fi 84 | 85 | apt-get update 86 | apt-get install -y build-essential curl git python-dev libxml2-dev \ 87 | libxslt1-dev python-software-properties nodejs npm \ 88 | libffi-dev libssl-dev 89 | 90 | ln -sf /usr/bin/nodejs /usr/bin/node 91 | 92 | # For building presentation slides 93 | npm install -g cleaver 94 | 95 | if [ ! -e /usr/local/bin/virtualenv-$PYTHON_VERSION ]; then 96 | cd $VAGRANT_HOME 97 | curl -ksLo virtualenv.tar.gz https://github.com/pypa/virtualenv/tarball/develop 98 | tar xzf virtualenv.tar.gz 99 | cd pypa-virtualenv* 100 | python$PYTHON_VERSION setup.py install 101 | cd $VAGRANT_HOME 102 | rm -rf pypa-virtualenv* virtualenv.tar.gz 103 | fi 104 | 105 | if [ ! -e /usr/local/lib/python$PYTHON_VERSION/dist-packages/setuptools-$SETUPTOOLS_VERSION-py$PYTHON_VERSION.egg ]; then 106 | cd $VAGRANT_HOME 107 | curl -sO https://pypi.python.org/packages/source/s/setuptools/setuptools-$SETUPTOOLS_VERSION.tar.gz 108 | tar xzf setuptools-$SETUPTOOLS_VERSION.tar.gz 109 | cd setuptools-$SETUPTOOLS_VERSION 110 | python$PYTHON_VERSION setup.py install 111 | cd $VAGRANT_HOME 112 | rm -rf setuptools-$SETUPTOOLS_VERSION* 113 | fi 114 | 115 | if [ ! 
-e /usr/local/bin/pip$PYTHON_VERSION ]; then 116 | cd $VAGRANT_HOME 117 | curl -ksLo pip.tar.gz https://github.com/pypa/pip/tarball/develop 118 | tar xzf pip.tar.gz 119 | cd pypa-pip* 120 | python$PYTHON_VERSION setup.py install 121 | cd $VAGRANT_HOME 122 | rm -rf pypa-pip* pip.tar.gz 123 | fi 124 | 125 | if [ ! -e /usr/local/lib/python$PYTHON_VERSION/dist-packages/virtualenvwrapper-$VIRTUALENVWRAPPER_VERSION-py$PYTHON_VERSION.egg-info ]; then 126 | cd $VAGRANT_HOME 127 | curl -ksLO https://pypi.python.org/packages/source/v/virtualenvwrapper/virtualenvwrapper-$VIRTUALENVWRAPPER_VERSION.tar.gz 128 | tar xzf virtualenvwrapper-$VIRTUALENVWRAPPER_VERSION.tar.gz 129 | cd virtualenvwrapper-$VIRTUALENVWRAPPER_VERSION 130 | python$PYTHON_VERSION setup.py install 131 | cd $VAGRANT_HOME 132 | rm -rf virtualenvwrapper-$VIRTUALENVWRAPPER_VERSION* 133 | fi 134 | 135 | if ! grep -q WORKON_HOME $VAGRANT_HOME/.bashrc; then 136 | echo >> $VAGRANT_HOME/.bashrc 137 | echo 'export VIRTUALENVWRAPPER_PYTHON=/usr/bin/python'$PYTHON_VERSION >> $VAGRANT_HOME/.bashrc 138 | echo 'export WORKON_HOME=$HOME/.virtualenvs' >> $VAGRANT_HOME/.bashrc 139 | echo 'export PROJECT_HOME=/vagrant' >> $VAGRANT_HOME/.bashrc 140 | echo 'source /usr/local/bin/virtualenvwrapper.sh' >> $VAGRANT_HOME/.bashrc 141 | fi 142 | 143 | if ! grep -q 'alias co=' $VAGRANT_HOME/.bashrc; then 144 | echo >> $VAGRANT_HOME/.bashrc 145 | echo 'alias co="cd /vagrant"' >> $VAGRANT_HOME/.bashrc 146 | fi 147 | 148 | if ! grep -q 'EDITOR=' $VAGRANT_HOME/.bashrc; then 149 | echo >> $VAGRANT_HOME/.bashrc 150 | echo 'export EDITOR=vim' >> $VAGRANT_HOME/.bashrc 151 | fi 152 | 153 | if ! grep -q "workon $PROJECT" $VAGRANT_HOME/.profile; then 154 | echo >> $VAGRANT_HOME/.profile 155 | echo "workon $PROJECT" >> $VAGRANT_HOME/.profile 156 | fi 157 | 158 | if [ ! 
-e $VAGRANT_HOME/.virtualenvs/$PROJECT ]; then 159 | mkdir -p $VAGRANT_HOME/.virtualenvs 160 | echo "virtualenv --system-site-packages $VAGRANT_HOME/.virtualenvs/$PROJECT" 161 | virtualenv -q --system-site-packages $VAGRANT_HOME/.virtualenvs/$PROJECT 162 | 163 | echo "#!/bin/bash" > $VAGRANT_HOME/.virtualenvs/$PROJECT/bin/postactivate 164 | echo "export PYTHONPATH=/vagrant" >> $VAGRANT_HOME/.virtualenvs/$PROJECT/bin/postactivate 165 | chmod 775 $VAGRANT_HOME/.virtualenvs/$PROJECT/bin/postactivate 166 | 167 | echo "#!/bin/bash" > $VAGRANT_HOME/.virtualenvs/$PROJECT/bin/postdeactivate 168 | echo "unset PYTHONPATH" >> $VAGRANT_HOME/.virtualenvs/$PROJECT/bin/postdeactivate 169 | chmod 775 $VAGRANT_HOME/.virtualenvs/$PROJECT/bin/postdeactivate 170 | 171 | chown -R $VAGRANT_USER.$VAGRANT_USER $VAGRANT_HOME/.virtualenvs 172 | fi 173 | 174 | # Install requirements 175 | pip install -r /vagrant/requirements.txt 176 | 177 | # Cleanup veewee post install scripts 178 | if [[ -e $VAGRANT_HOME/vagrant.sh || -e $VAGRANT_HOME/postinstall.sh ]]; then 179 | rm $VAGRANT_HOME/{apt,build_time,chef,cleanup,postinstall,ruby,sudo,vagrant,vbox}.sh 180 | fi 181 | EOS 182 | end -------------------------------------------------------------------------------- /okcubot/okcubot/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Deleetdk/OKCubot/a40fd515da6cd4c108c7528dce7aeb9f8493dcd4/okcubot/okcubot/__init__.py -------------------------------------------------------------------------------- /okcubot/okcubot/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # http://doc.scrapy.org/en/latest/topics/items.html 5 | 6 | from scrapy.item import Item, Field 7 | 8 | class UserItem(Item): 9 | # Details 10 | d_username = Field() 11 | d_age = Field() 12 | d_gender = Field() 13 | d_city = Field() 14 | 
d_country = Field() 15 | d_orientation = Field() 16 | d_ethnicity = Field() 17 | d_bodytype = Field() 18 | d_diet_manner = Field() 19 | d_diet_type = Field() 20 | d_smokes = Field() 21 | d_drinks = Field() 22 | d_drugs = Field() 23 | # Fiction section 24 | d_religion_type = Field() 25 | d_religion_seriosity = Field() 26 | d_astrology_sign = Field() 27 | d_astrology_seriosity = Field() 28 | d_education_phase = Field() 29 | d_education_type = Field() 30 | d_job = Field() 31 | d_income = Field() 32 | d_relationship = Field() 33 | d_relationship_manner = Field() 34 | d_relationship_type = Field() 35 | d_offspring_current = Field() 36 | d_offspring_desires = Field() 37 | d_pets_dogs = Field() 38 | d_pets_cats = Field() 39 | d_languages = Field() 40 | # Looking for 41 | lf_want = Field() 42 | lf_min_age = Field() 43 | lf_max_age = Field() 44 | lf_location = Field() 45 | lf_single = Field() 46 | lf_for = Field() 47 | # Personality scale 48 | p_explove = Field() # Experienced in Love 49 | p_adven = Field() # Adventurous 50 | p_indie = Field() # Indie 51 | p_spon = Field() # Spontaneous 52 | p_scien = Field() # Scientific 53 | p_inde = Field() # Independent 54 | p_conf = Field() # Confident 55 | p_math = Field() # Mathematical 56 | p_logic = Field() # Logical 57 | p_organ = Field() # Organized 58 | p_oldfash = Field() # Old-Fashioned 59 | p_lit = Field() # Literary 60 | p_opti = Field() # Optimistic 61 | p_roman = Field() # Romantic 62 | p_comp = Field() # Compassionate 63 | p_lovedri = Field() # Love-driven 64 | p_sprit = Field() # Spiritual 65 | p_kinky = Field() # Kinky 66 | p_artsy = Field() # Artsy 67 | p_thrift = Field() # Thrifty 68 | p_drug = Field() # Drug-friendly 69 | p_arro = Field() # Arrogant 70 | p_sloppy = Field() # Sloppy 71 | p_extro = Field() # Extroverted 72 | p_geeky = Field() # Geeky 73 | p_aggre = Field() # Aggressive 74 | p_expsex = Field() # Experienced in sex 75 | p_capi = Field() # Capitalistic 76 | p_exer = Field() # Into Exercise 77 | p_kind = 
Field() # Kind 78 | p_pure = Field() # Pure 79 | p_convenmoral = Field() # Conventionally Moral 80 | p_manners = Field() # Mannered 81 | p_ambi = Field() # Ambitious 82 | p_polit = Field() # Political 83 | p_greed = Field() # Greedy 84 | p_sexdrive = Field() # Sex-driven 85 | p_energetic = Field() # Energetic 86 | p_cool = Field() # Cool 87 | p_introvert = Field() # Introverted 88 | p_trusting = Field() # Trusting 89 | p_dominant = Field() # Dominant 90 | p_laidback = Field() # Laid-back 91 | p_submissive = Field() # Submissive 92 | p_explife = Field() # Experienced in life 93 | p_friendstrangers = Field() # Fiendly to strangers 94 | p_honest = Field() # Honest 95 | p_giving = Field() # Giving 96 | p_passion = Field() # Passion-driven 97 | p_progress = Field() # Progressive 98 | # Misc 99 | m_photocount = Field() 100 | 101 | class QuestionItem(Item): 102 | id = Field() 103 | text = Field() 104 | # The text of the options 105 | option_1 = Field() 106 | option_2 = Field() 107 | option_3 = Field() 108 | option_4 = Field() 109 | 110 | class AnswerItem(Item): 111 | # User who answered 112 | author = Field() 113 | # Question ID 114 | question = Field() 115 | # Value between 1-4 116 | answer = Field() 117 | # Answer text (if any) 118 | answer_text = Field() 119 | -------------------------------------------------------------------------------- /okcubot/okcubot/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | # TODO: Validate data-pipeline 7 | # TODO: Answer pipeline - translate answer in to a numeric value 8 | 9 | from scrapy import signals 10 | from scrapy.contrib.exporter import CsvItemExporter 11 | from scrapy.exceptions import DropItem 12 | import csv 13 | from collections import defaultdict 14 | import os 15 | import time 16 | 17 | 
class OkcubotPipeline(object): 18 | def process_item(self, item, spider): 19 | return item 20 | class DuplicatePipeline(object): 21 | def __init__(self): 22 | self.usernames = self.get_column('user.tsv', 'd_username') 23 | self.ids = self.get_column('question.tsv', 'id') 24 | 25 | def process_item(self, item, spider): 26 | try: 27 | if type(item).__name__ == 'UserItem': 28 | return self.check_duplicate_user(item) 29 | elif type(item).__name__ == 'QuestionItem': 30 | return self.check_duplicate_question(item) 31 | elif type(item).__name__ == 'AnswerItem': 32 | return self.check_duplicate_answer(item) 33 | except DropItem as e: 34 | raise e 35 | return 36 | 37 | # Not a user, not a question, not an answer: skip. 38 | return item 39 | 40 | # Get the values of a column in a CSV-file 41 | def get_column(self, file, column): 42 | if not os.path.isfile(file): 43 | return list() 44 | 45 | columns = defaultdict(list) 46 | 47 | with open(file) as f: 48 | reader = csv.DictReader(f) 49 | for row in reader: 50 | for (k,v) in row.items(): 51 | columns[k].append(v) 52 | 53 | return columns[column] 54 | 55 | def check_duplicate_user(self, item): 56 | if item['d_username'] in self.usernames: 57 | raise DropItem('Duplicate user found: %s' % item) 58 | else: 59 | self.usernames.append(item['d_username']) 60 | return item 61 | 62 | def check_duplicate_question(self, item): 63 | if item['id'] in self.ids: 64 | raise DropItem('Duplicate question found: %s' % item) 65 | else: 66 | self.ids.append(item['id']) 67 | return item 68 | 69 | def check_duplicate_answer(self, item): 70 | if item['author'] in self.usernames: 71 | raise DropItem('Duplicate answer found: %s' % item) 72 | else: 73 | return item 74 | 75 | class AnswerSanitationPipeline(object): 76 | def __init__(self): 77 | self.questions = set() 78 | 79 | def process_item(self, item, spider): 80 | if type(item).__name__ == 'QuestionItem': 81 | self.questions.add(item) 82 | return item 83 | if type(item).__name__ != 'AnswerItem': 84 | 
return item 85 | 86 | question = self.find_question(item['question']) 87 | 88 | if question == None: 89 | return item 90 | 91 | if 'option_1' in question and item['answer'] == question['option_1']: 92 | item['answer'] = 1 93 | elif 'option_2' in question and item['answer'] == question['option_2']: 94 | item['answer'] = 2 95 | elif 'option_3' in question and item['answer'] == question['option_3']: 96 | item['answer'] = 3 97 | elif 'option_4' in question and item['answer'] == question['option_4']: 98 | item['answer'] = 4 99 | 100 | return item 101 | 102 | # Find question by ID 103 | def find_question(self, id): 104 | for question in self.questions: 105 | if question['id'] == id: 106 | return question 107 | 108 | # TODO: Find in files 109 | return None 110 | 111 | class TsvItemExporter(CsvItemExporter): 112 | def __init__(self, *args, **kwargs): 113 | kwargs['encoding'] = 'utf-8' 114 | kwargs['delimiter'] = '\t' 115 | 116 | super(TsvItemExporter, self).__init__(*args, **kwargs) 117 | 118 | #class TsvExportPipeline(object): 119 | # def __init__(self): 120 | # self.files = {} 121 | # 122 | # @classmethod 123 | # def from_crawler(cls, crawler): 124 | # pipeline = cls() 125 | # 126 | # crawler.signals.connect(pipeline.spider_opened, signals.spider_opened) 127 | # crawler.signals.connect(pipeline.spider_closed, signals.spider_closed) 128 | # 129 | # return pipeline 130 | # 131 | # def spider_opened(self, spider): 132 | # file = open('%s_data.tsv' % spider.name, 'w+b') 133 | # self.files[spider] = file 134 | # self.exporter = TsvItemExporter(file) 135 | # self.exporter.start_exporting() 136 | # 137 | # def spider_closed(self, spider): 138 | # self.exporter.finish_exporting() 139 | # file = self.files.pop(spider) 140 | # file.close() 141 | # 142 | # def process_item(self, item, spider): 143 | # self.exporter.export_item(item) 144 | # 145 | # return item 146 | 147 | def item_type(item): 148 | return type(item).__name__.replace('Item', '').lower() 149 | 150 | class 
MultiTSVItemPipeline(object): 151 | types = ['user', 'question', 'answer'] 152 | 153 | @classmethod 154 | def from_crawler(cls, crawler): 155 | pipeline = cls() 156 | 157 | crawler.signals.connect(pipeline.spider_opened, signals.spider_opened) 158 | crawler.signals.connect(pipeline.spider_closed, signals.spider_closed) 159 | 160 | return pipeline 161 | 162 | def spider_opened(self, spider): 163 | headers = True 164 | for name in self.types: 165 | if os.path.isfile(name + '.tsv'): 166 | # File already exists -- don't write headers again 167 | headers = False 168 | break 169 | 170 | self.files = dict([ (name, open(time.strftime("%d-%m-%Y") + name + '.tsv', 'ab+')) for name in self.types ]) 171 | self.exporters = dict([ (name, TsvItemExporter(self.files[name], include_headers_line=headers)) for name in self.types]) 172 | [e.start_exporting() for e in self.exporters.values()] 173 | 174 | def spider_closed(self, spider): 175 | [e.finish_exporting() for e in self.exporters.values()] 176 | [f.close() for f in self.files.values()] 177 | 178 | def process_item(self, item, spider): 179 | what = item_type(item) 180 | 181 | if what in set(self.types): 182 | self.exporters[what].export_item(item) 183 | self.files[what].flush() 184 | return item -------------------------------------------------------------------------------- /okcubot/okcubot/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for okcubot project 2 | # 3 | # For simplicity, this file contains only the most important settings by 4 | # default. 
# All the other settings are documented here:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#

BOT_NAME = 'okcubot'

SPIDER_MODULES = ['okcubot.spiders']
NEWSPIDER_MODULE = 'okcubot.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'okcubot (+http://www.yourdomain.com)'

ITEM_PIPELINES = {
    'okcubot.pipelines.DuplicatePipeline': 300,
    'okcubot.pipelines.AnswerSanitationPipeline': 500,
    'okcubot.pipelines.MultiTSVItemPipeline': 800
}

EXTENSIONS = {'scrapy.contrib.feedexport.FeedExporter': None}

# Log
LOG_LEVEL = 'INFO'

# Human-like
ALLOWED_DOMAINS = ["okcupid.com"]
DOWNLOAD_DELAY = 2
CONCURRENT_REQUESTS_PER_IP = 1

# Scheduler
DEPTH_PRIORITY = 1
--------------------------------------------------------------------------------
/okcubot/okcubot/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
5 | -------------------------------------------------------------------------------- /okcubot/okcubot/spiders/okcubot_spider.old.py: -------------------------------------------------------------------------------- 1 | from scrapy.spider import Spider 2 | from scrapy.http import Request, FormRequest 3 | from scrapy.selector import Selector 4 | from scrapy import log 5 | 6 | import re 7 | 8 | from okcubot.items import UserItem, QuestionItem, AnswerItem 9 | 10 | class OkCubotSpider(Spider): 11 | # Spider settings 12 | name = "okcubot" 13 | 14 | # Others 15 | base_url = "http://www.okcupid.com" 16 | 17 | # TODO: Implement missing traits 18 | personality_scale_dict = { 19 | 'p_explove': re.compile('(experienced in love)', re.IGNORECASE), 20 | 'p_adven': re.compile('(adventurous)', re.IGNORECASE), 21 | 'p_indie': re.compile('(indie)', re.IGNORECASE), 22 | 'p_spon': re.compile('(spontaneous)', re.IGNORECASE), 23 | 'p_scien': re.compile('(scientific)', re.IGNORECASE), 24 | 'p_inde': re.compile('(independent)', re.IGNORECASE), 25 | 'p_conf': re.compile('(confident)', re.IGNORECASE), 26 | 'p_math': re.compile('(mathematical)', re.IGNORECASE), 27 | 'p_logic': re.compile('(logical)', re.IGNORECASE), 28 | 'p_organ': re.compile('(organized)', re.IGNORECASE), 29 | 'p_oldfash': re.compile('(old\-fashioned)', re.IGNORECASE), 30 | 'p_lit': re.compile('(literary)', re.IGNORECASE), 31 | 'p_opti': re.compile('(optimistic)', re.IGNORECASE), 32 | 'p_roman': re.compile('(romantic)', re.IGNORECASE), 33 | 'p_comp': re.compile('(compassionate)', re.IGNORECASE), 34 | 'p_lovedri': re.compile('(love\-driven)', re.IGNORECASE), 35 | 'p_sprit': re.compile('(spiritual)', re.IGNORECASE), 36 | 'p_kinky': re.compile('(kinky)', re.IGNORECASE), 37 | 'p_artsy': re.compile('(artsy)', re.IGNORECASE), 38 | 'p_thrift': re.compile('(thrifty)', re.IGNORECASE), 39 | 'p_drug': re.compile('(drug\-friendly)', re.IGNORECASE), 40 | 'p_arro': re.compile('(arrogant)', re.IGNORECASE), 41 | 'p_sloppy': 
re.compile('(sloppy)', re.IGNORECASE), 42 | 'p_extro': re.compile('(extroverted)', re.IGNORECASE), 43 | 'p_geeky': re.compile('(geeky)', re.IGNORECASE), 44 | 'p_aggre': re.compile('(aggressive)', re.IGNORECASE), 45 | 'p_expsex': re.compile('(experienced in sex)', re.IGNORECASE), 46 | 'p_capi': re.compile('(capitalistic)', re.IGNORECASE), 47 | 'p_exer': re.compile('(into exercise)', re.IGNORECASE), 48 | 'p_kind': re.compile('(kind)', re.IGNORECASE), 49 | 'p_pure': re.compile('(pure)', re.IGNORECASE), 50 | 'p_convenmoral': re.compile('(conventionally moral)', re.IGNORECASE), 51 | 'p_manners': re.compile('(mannered)', re.IGNORECASE), 52 | 'p_ambi': re.compile('(ambitious)', re.IGNORECASE), 53 | 'p_polit': re.compile('(political)', re.IGNORECASE), 54 | 'p_greed': re.compile('(greedy)', re.IGNORECASE), 55 | 'p_sexdrive': re.compile('(sex\-driven)', re.IGNORECASE), 56 | 'p_energetic': re.compile('(energetic)', re.IGNORECASE), 57 | 'p_cool': re.compile('(cool)', re.IGNORECASE), 58 | 'p_introvert': re.compile('(introverted)', re.IGNORECASE), 59 | 'p_trusting': re.compile('(trusting)', re.IGNORECASE), 60 | 'p_dominant': re.compile('(dominant)', re.IGNORECASE), 61 | 'p_laidback': re.compile('(laid\-back)', re.IGNORECASE), 62 | 'p_submissive': re.compile('(submissive)', re.IGNORECASE), 63 | 'p_explife': re.compile('(experienced in life)', re.IGNORECASE), 64 | 'p_friendstrangers': re.compile('(friendly to strangers)', re.IGNORECASE), 65 | 'p_honest': re.compile('(honest)', re.IGNORECASE), 66 | 'p_giving': re.compile('(giving)', re.IGNORECASE), 67 | 'p_passion': re.compile('(passion\-driven)', re.IGNORECASE), 68 | 'p_progress': re.compile('(progressive)', re.IGNORECASE) 69 | } 70 | 71 | # User 72 | user = None 73 | password = None 74 | 75 | # Seeds and target 76 | target = None 77 | 78 | # TODO: Format argument in constructor 79 | # Args 80 | # user - username for bot account 81 | # pass - password for bot account 82 | # target - optional target for scraping single users 83 | # 
format - optional (default: tsv) format to export data to (e.g. csv, tsv) 84 | def __init__(self, *args, **kwargs): 85 | super(OkCubotSpider, self).__init__(*args, **kwargs) 86 | 87 | if "user" not in kwargs or "pass" not in kwargs: 88 | print "Please supply a user and a password" 89 | exit() 90 | 91 | if "target" in kwargs: 92 | self.target = kwargs['target'] 93 | 94 | self.user = kwargs['user'] 95 | self.password = kwargs['pass'] 96 | 97 | # Patch 98 | self.monkey_patch_HTTPClientParser_statusReceived() 99 | 100 | def start_requests(self): 101 | return [FormRequest("https://www.okcupid.com/login", 102 | formdata={'username': self.user, 'password': self.password}, 103 | callback=self.logged_in)] 104 | 105 | def logged_in(self, response): 106 | selector = Selector(response) 107 | 108 | if self.target != None: 109 | # We only want to scrape this user 110 | yield Request(self.base_url + '/profile/' + self.target, callback=self.parse_profile) 111 | else: 112 | profiles = selector.css('#similar_users_list li > a::attr(href), .match > a::attr(href)').extract() 113 | if len(profiles) == 0: 114 | log.msg('Credentials incorrect.', level=log.ERROR) 115 | else: 116 | for url in profiles: 117 | log.msg('Seeded bot with user (' + url + ')') 118 | yield Request(self.base_url + url, callback=self.parse_profile) 119 | 120 | def parse_profile(self, response): 121 | selector = Selector(response) 122 | 123 | # TODO: Handle parameters which are - as none. 124 | # A note on this is that you can set default values 125 | # etc. in the Field method of an item. 
126 | # TODO: Trim 127 | # TODO: Implement 128 | # d_religion_type = Field() 129 | # d_religion_seriosity = Field() 130 | # d_astrology_sign = Field() 131 | # d_astrology_seriosity = Field() 132 | # d_education_phase = Field() 133 | # d_education_type = Field() 134 | # d_job = Field() 135 | # d_income = Field() 136 | # d_relationship = Field() 137 | # d_relationship_manner = Field() 138 | # d_relationship_type = Field() 139 | # d_offspring_current = Field() 140 | # d_offspring_desires = Field() 141 | # d_pets_dogs = Field() 142 | # d_pets_cats = Field() 143 | # Looking for 144 | # lf_want = Field() 145 | # lf_min_age = Field() 146 | # lf_max_age = Field() 147 | # lf_location = Field() 148 | # lf_single = Field() 149 | # lf_for = Field() 150 | 151 | attribute_dict = { 152 | 'd_username': '#basic_info_sn.name::text', 153 | 'd_age': '#ajax_age::text', 154 | 'd_gender': '.ajax_gender::text', 155 | 'd_orientation': '#ajax_orientation::text', 156 | 'd_ethnicity': '#ajax_ethnicities::text', 157 | 'd_bodytype': '#ajax_bodytype::text', 158 | 'd_smokes': '#ajax_smoking::text', 159 | 'd_drinks': '#ajax_drinking::text', 160 | 'd_drugs': '#ajax_drugs::text', 161 | 'd_languages': '#ajax_languages::text', 162 | } 163 | 164 | user = UserItem() 165 | # Iterate over attribute dictionary and fetch data 166 | for attr, ident in attribute_dict.iteritems(): 167 | val = selector.css(ident).extract()[0] 168 | 169 | # Trim 170 | val = val.strip() 171 | 172 | # Translate - to blanks 173 | val = val.replace('—', '') 174 | 175 | # Set attribute 176 | user[attr] = val 177 | 178 | #name = selector.css('#basic_info_sn.name::text').extract()[0] 179 | 180 | #age = selector.css('#ajax_age::text').extract()[0] 181 | #gender = selector.css('.ajax_gender::text').extract()[0] 182 | location = selector.css('#ajax_location::text').extract()[0].split(',') 183 | city = location[0] 184 | country = location[1] 185 | #orientation = selector.css('#ajax_orientation::text').extract()[0] 186 | #ethnicity = 
selector.css('#ajax_ethnicities::text').extract() 187 | #bodytype = selector.css('#ajax_bodytype::text').extract()[0] 188 | diet = selector.css('#ajax_diet::text').extract()[0].split(' ') 189 | # Diet handling stuff 190 | diet_manner = 0 191 | diet_type = 0 192 | if len(diet) == 1: 193 | diet_type = diet[0] 194 | else: 195 | diet_manner = diet[0] 196 | diet_type = diet[1] 197 | # End diet handling stuff 198 | #smokes = selector.css('#ajax_smoking::text').extract()[0] 199 | #drinks = selector.css('#ajax_drinking::text').extract()[0] 200 | #drugs = selector.css('#ajax_drugs::text').extract()[0] 201 | 202 | #user['d_username'] = name 203 | #user['d_age'] = age 204 | #user['d_gender'] = gender 205 | user['d_city'] = city 206 | user['d_country'] = country 207 | #user['d_orientation'] = orientation 208 | #user['d_ethnicity'] = ethnicity # TODO: fix 209 | #user['d_bodytype'] = bodytype 210 | user['d_diet_manner'] = diet_manner 211 | user['d_diet_type'] = diet_type 212 | #user['d_smokes'] = smokes 213 | #user['d_drinks'] = drinks 214 | #user['d_drugs'] = drugs 215 | 216 | # Request parsing of the user's personality traits 217 | request = Request(self.base_url + '/profile/' + name + '/personality', callback=self.parse_personality, priority=100) 218 | request.meta['user'] = user 219 | yield request 220 | 221 | if self.target == None: 222 | # Find other users 223 | i = 0 224 | for url in selector.css('#similar_users_list li > a::attr(href), .match > a::attr(href)').extract(): 225 | i += 1 226 | if url != response.request.url: 227 | yield Request(self.base_url + url, callback=self.parse_profile, priority=-100) 228 | log.msg('Queued ' + `i` + ' users from ' + name) 229 | 230 | def parse_personality(self, response): 231 | selector = Selector(response) 232 | 233 | user = response.meta['user'] 234 | 235 | i = 0 236 | for trait in selector.css('.pt_row'): 237 | label = trait.css('label::text').extract()[0] 238 | percentage = re.sub('(width\:)|(\%\;)', '', 
trait.css('span::attr(style)').extract()[0]) 239 | 240 | try: 241 | percentage = int(percentage) 242 | except ValueError: 243 | log.msg('Could not parse trait, moving on.', level=log.ERROR) 244 | continue 245 | 246 | if len(trait.css('p.right > label')) == 1: 247 | # Label is in right p, so negate the percentage 248 | percentage = -percentage 249 | 250 | actual = None 251 | for t, r in self.personality_scale_dict.iteritems(): 252 | if r.search(label): 253 | actual = t 254 | 255 | if actual == None: 256 | log.msg('Unknown trait ' + label, level=log.ERROR) 257 | else: 258 | user[actual] = percentage 259 | i += 1 260 | log.msg(`i` + ' traits parsed for user ' + user['d_username']) 261 | 262 | # Request parsing questions/answers 263 | request = Request(self.base_url + '/profile/' + user['d_username'] + '/questions', callback=self.parse_questions, priority=400) 264 | request.meta['user'] = user 265 | yield request 266 | 267 | def parse_questions(self, response): 268 | selector = Selector(response) 269 | 270 | user = response.meta['user'] 271 | 272 | i = 0 273 | for qa in selector.css('.question'): 274 | i += 1 275 | 276 | qid = qa.css('::attr(data-qid)').extract()[0] 277 | if qa.css('.not_answered'): 278 | # Not answered, answer it and store it. 
279 | question = QuestionItem() 280 | question['id'] = qid 281 | question['text'] = qa.css('.qtext > p::text').extract()[0] 282 | options = qa.css('.my_answer > label::text').extract() 283 | 284 | if len(options) > 0: 285 | question['option_1'] = options[0] 286 | if len(options) > 1: 287 | question['option_2'] = options[1] 288 | if len(options) > 2: 289 | question['option_3'] = options[2] 290 | if len(options) > 3: 291 | question['option_4'] = options[3] 292 | 293 | # TODO: Make the bot actually answer the question 294 | #yield self.answer_question(qid, 1) 295 | yield question 296 | else: 297 | answer = AnswerItem() 298 | answer['author'] = user['d_username'] 299 | answer['question'] = qid 300 | answer['answer'] = qa.css('.answers .target .text::text').extract()[0] 301 | answer['answer_text'] = qa.css('.answers .target .note::text').extract()[0] 302 | 303 | yield answer 304 | 305 | log.msg(`i` + ' questions/answers parsed for user ' + user['d_username']) 306 | 307 | if len(selector.css('.pages .next.disabled').extract()) > 0: 308 | # We don't have any more pages. Yield the user. 309 | log.msg('Done processing ' + user['d_username']) 310 | yield user 311 | else: 312 | # We're not done. 
313 | next = selector.css('.pages .next > a::attr(href)').extract()[0] 314 | request = Request(self.base_url + next, callback=self.parse_questions, priority=400) 315 | request.meta['user'] = user 316 | yield request 317 | 318 | def answer_question(self, qid, option): 319 | return FormRequest("https://www.okcupid.com/questions/ask", 320 | formdata={ 321 | 'ajax': '1', 322 | 'submit': '1', 323 | 'answer_question': '1', 324 | 'skip': '0', 325 | 'show_all': '0', 326 | 'is_new': '1', 327 | 'matchanswers': 'irrelevant', 328 | 'qid': str(qid), 329 | 'importance': '5', 330 | 'is_public': '1', 331 | 'note': '', 332 | 'delete_note': '0', 333 | 'targetid': '', 334 | 'is_public': '1', 335 | 'answers': str(option) 336 | }, 337 | callback=self.answered, priority=1000, 338 | headers={ 339 | 'Accept': 'application/json', 340 | 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', 341 | 'X-Requested-With': 'XMLHttpRequest' 342 | }) 343 | 344 | def answered(self, response): 345 | # TODO: WE NEED TO GET THEIR ANSWER (hint: target in POST) 346 | pass 347 | 348 | def monkey_patch_HTTPClientParser_statusReceived(self): 349 | """ 350 | Monkey patch for twisted.web._newclient.HTTPClientParser.statusReceived 351 | """ 352 | from twisted.web._newclient import HTTPClientParser, ParseError 353 | old_sr = HTTPClientParser.statusReceived 354 | def statusReceived(self, status): 355 | try: 356 | return old_sr(self, status) 357 | except ParseError, e: 358 | if e.args[0] == 'wrong number of parts': 359 | log.msg('Wrong number of parts in header. 
Assuming 200 OK', level=log.DEBUG) 360 | return old_sr(self, str(status) + ' OK') 361 | raise 362 | statusReceived.__doc__ == old_sr.__doc__ 363 | HTTPClientParser.statusReceived = statusReceived -------------------------------------------------------------------------------- /okcubot/okcubot/spiders/okcubot_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | from scrapy.spider import Spider 3 | from scrapy.http import Request, FormRequest 4 | from scrapy.selector import Selector 5 | from scrapy import log 6 | 7 | import re 8 | 9 | from okcubot.items import UserItem, QuestionItem, AnswerItem 10 | 11 | class OkCubotSpider(Spider): 12 | # Spider settings 13 | name = "okcubot" 14 | 15 | # Others 16 | base_url = "http://www.okcupid.com" 17 | 18 | user_queue = [] 19 | 20 | # TODO: Implement missing traits 21 | personality_scale_dict = { 22 | 'p_explove': re.compile('(experienced in love)', re.IGNORECASE), 23 | 'p_adven': re.compile('(adventurous)', re.IGNORECASE), 24 | 'p_indie': re.compile('(indie)', re.IGNORECASE), 25 | 'p_spon': re.compile('(spontaneous)', re.IGNORECASE), 26 | 'p_scien': re.compile('(scientific)', re.IGNORECASE), 27 | 'p_inde': re.compile('(independent)', re.IGNORECASE), 28 | 'p_conf': re.compile('(confident)', re.IGNORECASE), 29 | 'p_math': re.compile('(mathematical)', re.IGNORECASE), 30 | 'p_logic': re.compile('(logical)', re.IGNORECASE), 31 | 'p_organ': re.compile('(organized)', re.IGNORECASE), 32 | 'p_oldfash': re.compile('(old\-fashioned)', re.IGNORECASE), 33 | 'p_lit': re.compile('(literary)', re.IGNORECASE), 34 | 'p_opti': re.compile('(optimistic)', re.IGNORECASE), 35 | 'p_roman': re.compile('(romantic)', re.IGNORECASE), 36 | 'p_comp': re.compile('(compassionate)', re.IGNORECASE), 37 | 'p_lovedri': re.compile('(love\-driven)', re.IGNORECASE), 38 | 'p_sprit': re.compile('(spiritual)', re.IGNORECASE), 39 | 'p_kinky': re.compile('(kinky)', re.IGNORECASE), 40 | 'p_artsy': 
re.compile('(artsy)', re.IGNORECASE), 41 | 'p_thrift': re.compile('(thrifty)', re.IGNORECASE), 42 | 'p_drug': re.compile('(drug\-friendly)', re.IGNORECASE), 43 | 'p_arro': re.compile('(arrogant)', re.IGNORECASE), 44 | 'p_sloppy': re.compile('(sloppy)', re.IGNORECASE), 45 | 'p_extro': re.compile('(extroverted)', re.IGNORECASE), 46 | 'p_geeky': re.compile('(geeky)', re.IGNORECASE), 47 | 'p_aggre': re.compile('(aggressive)', re.IGNORECASE), 48 | 'p_expsex': re.compile('(experienced in sex)', re.IGNORECASE), 49 | 'p_capi': re.compile('(capitalistic)', re.IGNORECASE), 50 | 'p_exer': re.compile('(into exercise)', re.IGNORECASE), 51 | 'p_kind': re.compile('(kind)', re.IGNORECASE), 52 | 'p_pure': re.compile('(pure)', re.IGNORECASE), 53 | 'p_convenmoral': re.compile('(conventionally moral)', re.IGNORECASE), 54 | 'p_manners': re.compile('(mannered)', re.IGNORECASE), 55 | 'p_ambi': re.compile('(ambitious)', re.IGNORECASE), 56 | 'p_polit': re.compile('(political)', re.IGNORECASE), 57 | 'p_greed': re.compile('(greedy)', re.IGNORECASE), 58 | 'p_sexdrive': re.compile('(sex\-driven)', re.IGNORECASE), 59 | 'p_energetic': re.compile('(energetic)', re.IGNORECASE), 60 | 'p_cool': re.compile('(cool)', re.IGNORECASE), 61 | 'p_introvert': re.compile('(introverted)', re.IGNORECASE), 62 | 'p_trusting': re.compile('(trusting)', re.IGNORECASE), 63 | 'p_dominant': re.compile('(dominant)', re.IGNORECASE), 64 | 'p_laidback': re.compile('(laid\-back)', re.IGNORECASE), 65 | 'p_submissive': re.compile('(submissive)', re.IGNORECASE), 66 | 'p_explife': re.compile('(experienced in life)', re.IGNORECASE), 67 | 'p_friendstrangers': re.compile('(friendly to strangers)', re.IGNORECASE), 68 | 'p_honest': re.compile('(honest)', re.IGNORECASE), 69 | 'p_giving': re.compile('(giving)', re.IGNORECASE), 70 | 'p_passion': re.compile('(passion\-driven)', re.IGNORECASE), 71 | 'p_progress': re.compile('(progressive)', re.IGNORECASE) 72 | } 73 | 74 | # Regular expressions 75 | education_re = re.compile('(graduated 
from|working on|dropped out of|)\s?(high school|university|masters program|law school|med school|space camp|ph\.d program|two\-year college)', re.IGNORECASE) 76 | lf_age_re = re.compile('(\d+).(\d+)') 77 | 78 | # User 79 | user = None 80 | password = None 81 | 82 | # Seeds and target 83 | target = None 84 | 85 | # Args 86 | # user - username for bot account 87 | # pass - password for bot account 88 | # target - optional target for scraping single users 89 | def __init__(self, *args, **kwargs): 90 | super(OkCubotSpider, self).__init__(*args, **kwargs) 91 | 92 | if "user" not in kwargs or "pass" not in kwargs: 93 | print "Please supply a user and a password" 94 | exit() 95 | 96 | if "target" in kwargs: 97 | self.target = kwargs['target'] 98 | 99 | self.user = kwargs['user'] 100 | self.password = kwargs['pass'] 101 | 102 | # Patch 103 | self.monkey_patch_HTTPClientParser_statusReceived() 104 | 105 | def spider_idle(self, spider): 106 | next_user = self.next_user() 107 | if next_user is not None: 108 | yield next_user 109 | 110 | def queue_user(self, req): 111 | if req not in self.user_queue: 112 | if len(self.user_queue) < 100: 113 | self.user_queue.append(req) 114 | return True 115 | else: 116 | log.msg('User queue is too big. Skipping.') 117 | return False 118 | 119 | log.msg('User is already in queue. 
Skipping.') 120 | return False 121 | 122 | 123 | def next_user(self): 124 | if len(self.user_queue) > 0: 125 | return self.user_queue.pop() 126 | return None 127 | 128 | def start_requests(self): 129 | return [FormRequest("https://www.okcupid.com/login", 130 | formdata={'username': self.user, 'password': self.password}, 131 | callback=self.logged_in)] 132 | 133 | def logged_in(self, response): 134 | selector = Selector(response) 135 | 136 | if self.target != None: 137 | # We only want to scrape this user 138 | yield Request(self.base_url + '/profile/' + self.target, callback=self.parse_profile) 139 | else: 140 | profiles = selector.css('#similar_users_list li > a::attr(href), #matchphotobrowser_int .item a.name::attr(href)').extract() 141 | if len(profiles) == 0: 142 | log.msg('Credentials incorrect.', level=log.ERROR) 143 | else: 144 | for url in profiles: 145 | log.msg('Seeded bot with user (' + url + ')') 146 | self.queue_user(Request(self.base_url + url, callback=self.parse_profile)) 147 | 148 | # Yield two users to get things started 149 | yield self.next_user() 150 | yield self.next_user() 151 | 152 | def parse_profile(self, response): 153 | selector = Selector(response) 154 | 155 | # TODO: Implement 156 | # d_pets_dogs = Field() 157 | # d_pets_cats = Field() 158 | 159 | attribute_dict = { 160 | # Details 161 | 'd_username': '#basic_info_sn.name::text', 162 | 'd_age': '#ajax_age::text', 163 | 'd_gender': '.ajax_gender::text', 164 | 'd_orientation': '#ajax_orientation::text', 165 | 'd_ethnicity': '#ajax_ethnicities::text', 166 | 'd_bodytype': '#ajax_bodytype::text', 167 | 'd_relationship': '#ajax_status::text', 168 | 'd_smokes': '#ajax_smoking::text', 169 | 'd_drinks': '#ajax_drinking::text', 170 | 'd_drugs': '#ajax_drugs::text', 171 | 'd_languages': '#ajax_languages::text', 172 | 'd_job': '#ajax_job::text', 173 | 'd_income': '#ajax_income::text', 174 | 175 | # Looking for 176 | 'lf_location': '#ajax_near::text', 177 | 'lf_want': '#ajax_gentation::text', 178 | 
'lf_single': '#ajax_single::text' 179 | } 180 | 181 | user = UserItem() 182 | # Iterate over attribute dictionary and fetch data 183 | for attr, ident in attribute_dict.iteritems(): 184 | val = selector.css(ident).extract()[0] 185 | 186 | # Translate - to blanks 187 | val = val.encode('utf-8').replace('—', '') 188 | 189 | # Set attribute 190 | user[attr] = val 191 | 192 | # Looking for 193 | user['lf_for'] = selector.css('#ajax_lookingfor::text').extract()[0].replace('for', '') 194 | 195 | age = selector.css('#ajax_ages::text').extract()[0] 196 | age = self.lf_age_re.findall(age) 197 | lf_min_age = None 198 | lf_max_age = None 199 | if len(age) > 0: 200 | # Stored in tuples.. appearantly 201 | lf_min_age = age[0][0] 202 | lf_max_age = age[0][1] 203 | user['lf_min_age'] = lf_min_age 204 | user['lf_max_age'] = lf_max_age 205 | 206 | # Location 207 | location = selector.css('#ajax_location::text').extract()[0].split(',') 208 | city = location[0] 209 | country = location[1] 210 | user['d_city'] = city 211 | user['d_country'] = country 212 | 213 | # Diet 214 | diet = selector.css('#ajax_diet::text').extract()[0].split(' ') 215 | diet_manner = None 216 | diet_type = None 217 | if len(diet) == 1: 218 | diet_type = diet[0] 219 | elif len(diet) == 2: 220 | diet_manner = diet[0] 221 | diet_type = diet[1] 222 | user['d_diet_manner'] = diet_manner 223 | user['d_diet_type'] = diet_type 224 | 225 | # Religion 226 | religion = selector.css('#ajax_religion::text').extract()[0].split(',') 227 | religion_type = None 228 | religion_seriosity = None 229 | if len(religion) == 1: 230 | religion_type = religion[0] 231 | elif len(religion) == 2: 232 | religion_type = religion[0] 233 | religion_seriosity = religion[1] 234 | user['d_religion_type'] = religion_type 235 | user['d_religion_seriosity'] = religion_seriosity 236 | 237 | # Astrology 238 | astrology = selector.css('#ajax_sign::text').extract()[0].split(',') 239 | astrology_sign = None 240 | astrology_seriosity = None 241 | if 
len(astrology) == 1: 242 | astrology_sign = astrology[0] 243 | elif len(astrology) == 2: 244 | astrology_sign = astrology[0] 245 | astrology_seriosity = astrology[1] 246 | user['d_astrology_sign'] = astrology_sign 247 | user['d_astrology_seriosity'] = astrology_seriosity 248 | 249 | # Relationship 250 | relationship = selector.css('#ajax_monogamous::text').extract()[0].split(' ') 251 | relationship_manner = None 252 | relationship_type = None 253 | if len(relationship) == 1: 254 | relationship_type = relationship[0] 255 | elif len(relationship) == 2: 256 | relationship_manner = relationship[0] 257 | relationship_type = relationship[1] 258 | user['d_relationship_manner'] = relationship_manner 259 | user['d_relationship_type'] = relationship_type 260 | 261 | # Offspring 262 | offspring = selector.css('#ajax_children::text').extract()[0].split(' ') 263 | offspring_desires = None 264 | offspring_current = None 265 | if len(offspring) == 1: 266 | offspring_current = offspring[0] 267 | elif len(offspring) == 2: 268 | offspring_desires = offspring[0] 269 | offspring_current = offspring[1] 270 | user['d_offspring_desires'] = offspring_desires 271 | user['d_offspring_current'] = offspring_current 272 | 273 | # Education 274 | education = selector.css('#ajax_education::text').extract()[0] 275 | education = self.education_re.findall(education) 276 | education_phase = None 277 | education_type = None 278 | if len(education) > 0: 279 | # Stored in tuples.. 
appearantly 280 | education_phase = education[0][0] 281 | education_type = education[0][1] 282 | user['d_education_type'] = education_type 283 | user['d_education_phase'] = education_phase 284 | 285 | # Trim values 286 | for attr, val in user.iteritems(): 287 | if val is not None: 288 | user[attr] = val.strip() 289 | 290 | # Request parsing of the user's personality traits 291 | request = Request(self.base_url + '/profile/' + user['d_username'] + '/personality', callback=self.parse_personality, priority=100) 292 | request.meta['user'] = user 293 | yield request 294 | 295 | if self.target == None: 296 | # Find other users 297 | i = 0 298 | for url in selector.css('#similar_users_list li > a::attr(href), .match > a::attr(href)').extract(): 299 | if url != response.request.url: 300 | if self.queue_user(Request(self.base_url + url, callback=self.parse_profile, priority=-100)): 301 | i += 1 302 | log.msg('Queued ' + `i` + ' users from ' + user['d_username'] + ' (' + str(len(self.user_queue)) + ')') 303 | 304 | def parse_personality(self, response): 305 | selector = Selector(response) 306 | 307 | user = response.meta['user'] 308 | 309 | i = 0 310 | for trait in selector.css('.pt_row'): 311 | label = trait.css('label::text').extract()[0] 312 | percentage = re.sub('(width\:)|(\%\;)', '', trait.css('span::attr(style)').extract()[0]) 313 | 314 | try: 315 | percentage = int(percentage) 316 | except ValueError: 317 | log.msg('Could not parse trait, moving on.', level=log.ERROR) 318 | continue 319 | 320 | if len(trait.css('p.right > label')) == 1: 321 | # Label is in right p, so negate the percentage 322 | percentage = -percentage 323 | 324 | actual = None 325 | for t, r in self.personality_scale_dict.iteritems(): 326 | if r.search(label): 327 | actual = t 328 | 329 | if actual == None: 330 | log.msg('Unknown trait ' + label, level=log.ERROR) 331 | else: 332 | user[actual] = percentage 333 | i += 1 334 | log.msg(`i` + ' traits parsed for user ' + user['d_username']) 335 | 336 | 
# Request parsing questions/answers 337 | request = Request(self.base_url + '/profile/' + user['d_username'] + '/questions', callback=self.parse_questions, priority=400) 338 | request.meta['user'] = user 339 | yield request 340 | 341 | def parse_questions(self, response): 342 | selector = Selector(response) 343 | 344 | user = response.meta['user'] 345 | 346 | i = 0 347 | for qa in selector.css('.question'): 348 | i += 1 349 | 350 | qid = qa.css('::attr(data-qid)').extract()[0] 351 | if qa.css('.not_answered'): 352 | # Not answered, answer it and store it. 353 | question = QuestionItem() 354 | question['id'] = qid 355 | question['text'] = qa.css('.qtext > p::text').extract()[0].strip() 356 | options = qa.css('.my_answer > label::text').extract() 357 | 358 | if len(options) > 0: 359 | question['option_1'] = options[0].strip() 360 | if len(options) > 1: 361 | question['option_2'] = options[1].strip() 362 | if len(options) > 2: 363 | question['option_3'] = options[2].strip() 364 | if len(options) > 3: 365 | question['option_4'] = options[3].strip() 366 | 367 | # TODO: Make the bot actually answer the question 368 | #yield self.answer_question(qid, 1) 369 | yield question 370 | else: 371 | answer = AnswerItem() 372 | answer['author'] = user['d_username'] 373 | answer['question'] = qid 374 | answer['answer'] = qa.css('.answers .target .text::text').extract()[0].strip() 375 | answer['answer_text'] = qa.css('.answers .target .note::text').extract()[0].strip() 376 | 377 | yield answer 378 | 379 | log.msg(`i` + ' questions/answers parsed for user ' + user['d_username']) 380 | 381 | if len(selector.css('.pages .next.disabled').extract()) > 0: 382 | # We don't have any more pages. Yield the user. 383 | log.msg('Done processing ' + user['d_username']) 384 | yield user 385 | 386 | next_user = self.next_user() 387 | if next_user is None: 388 | log.msg('No more users in queue.') 389 | else: 390 | yield next_user 391 | else: 392 | # We're not done. 
393 | next = selector.css('.pages .next > a::attr(href)').extract()[0] 394 | request = Request(self.base_url + next, callback=self.parse_questions, priority=400) 395 | request.meta['user'] = user 396 | yield request 397 | 398 | # FIXME: ? 399 | # The bot tries to answer a question in order to fetch answers from 400 | # the target profile. 401 | def answer_question(self, qid, option): 402 | return FormRequest("https://www.okcupid.com/questions/ask", 403 | formdata={ 404 | 'ajax': '1', 405 | 'submit': '1', 406 | 'answer_question': '1', 407 | 'skip': '0', 408 | 'show_all': '0', 409 | 'is_new': '1', 410 | 'matchanswers': 'irrelevant', 411 | 'qid': str(qid), 412 | 'importance': '5', 413 | 'is_public': '1', 414 | 'note': '', 415 | 'delete_note': '0', 416 | 'targetid': '', 417 | 'is_public': '1', 418 | 'answers': str(option) 419 | }, 420 | callback=self.answered, priority=1000, 421 | headers={ 422 | 'Accept': 'application/json', 423 | 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', 424 | 'X-Requested-With': 'XMLHttpRequest' 425 | }) 426 | 427 | # TODO: Implement 428 | def answered(self, response): 429 | # TODO: WE NEED TO GET THEIR ANSWER (hint: target in POST) 430 | pass 431 | 432 | def monkey_patch_HTTPClientParser_statusReceived(self): 433 | """ 434 | Monkey patch for twisted.web._newclient.HTTPClientParser.statusReceived 435 | """ 436 | from twisted.web._newclient import HTTPClientParser, ParseError 437 | old_sr = HTTPClientParser.statusReceived 438 | def statusReceived(self, status): 439 | try: 440 | return old_sr(self, status) 441 | except ParseError, e: 442 | if e.args[0] == 'wrong number of parts': 443 | log.msg('Wrong number of parts in header. 
Assuming 200 OK', level=log.DEBUG) 444 | return old_sr(self, str(status) + ' OK') 445 | raise 446 | statusReceived.__doc__ == old_sr.__doc__ 447 | HTTPClientParser.statusReceived = statusReceived 448 | -------------------------------------------------------------------------------- /okcubot/project.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 1.0 2 | Name: project 3 | Version: 1.0 4 | Summary: UNKNOWN 5 | Home-page: UNKNOWN 6 | Author: UNKNOWN 7 | Author-email: UNKNOWN 8 | License: UNKNOWN 9 | Description: UNKNOWN 10 | Platform: UNKNOWN 11 | -------------------------------------------------------------------------------- /okcubot/project.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | setup.py 2 | okcubot/__init__.py 3 | okcubot/items.py 4 | okcubot/pipelines.py 5 | okcubot/settings.py 6 | okcubot/spiders/__init__.py 7 | okcubot/spiders/okcubot_spider.old.py 8 | okcubot/spiders/okcubot_spider.py 9 | project.egg-info/PKG-INFO 10 | project.egg-info/SOURCES.txt 11 | project.egg-info/dependency_links.txt 12 | project.egg-info/entry_points.txt 13 | project.egg-info/top_level.txt -------------------------------------------------------------------------------- /okcubot/project.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /okcubot/project.egg-info/entry_points.txt: -------------------------------------------------------------------------------- 1 | [scrapy] 2 | settings = okcubot.settings 3 | 4 | -------------------------------------------------------------------------------- /okcubot/project.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | okcubot 2 | 
-------------------------------------------------------------------------------- /okcubot/run.bat: -------------------------------------------------------------------------------- 1 | :loop 2 | scrapy crawl okcubot -auser=username -apass=password 3 | goto loop -------------------------------------------------------------------------------- /okcubot/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | default = okcubot.settings 8 | 9 | [deploy:aws-target] 10 | url = http://nbjerg.dk:6800/ 11 | project = okcubot 12 | -------------------------------------------------------------------------------- /okcubot/setup.py: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapyd-deploy 2 | 3 | from setuptools import setup, find_packages 4 | 5 | setup( 6 | name = 'project', 7 | version = '1.0', 8 | packages = find_packages(), 9 | entry_points = {'scrapy': ['settings = okcubot.settings']}, 10 | ) 11 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Twisted==11.0.0 2 | scrapy==0.24.5 3 | scrapely 4 | loginform 5 | lxml 6 | --------------------------------------------------------------------------------