├── bmrs ├── __init__.py ├── parsers │ ├── __init__.py │ ├── pandora.py │ ├── hydra.py │ ├── cloudnine.py │ ├── silkroad2.py │ ├── agora.py │ └── evolution.py ├── freq_itemsets │ ├── __init__.py │ ├── agora.py │ └── spmf_interface.py └── postprocessing │ ├── __init__.py │ ├── priceparsers.py │ ├── enforce_schema.py │ ├── .ipynb_checkpoints │ ├── build_categories-checkpoint.ipynb │ ├── postp_agora-checkpoint.ipynb │ └── postprocess_all-checkpoint.ipynb │ ├── viz.ipynb │ ├── build_categories.ipynb │ ├── postprocess_all.ipynb │ └── recommender_viz.ipynb ├── bin └── spmf.jar ├── README.md ├── paper └── bmrs.tex ├── .gitignore └── LICENSE /bmrs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /bmrs/parsers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /bmrs/freq_itemsets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /bmrs/postprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /bin/spmf.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rcompton/black-market-recommender-systems/HEAD/bin/spmf.jar -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # black-market-recommender-systems 2 | Code and paper 3 | 4 | aahu@barahv2 ~/Desktop> du -hs silkroad2/ 5 | 70G silkroad2/ 6 | 7 | aahu@barahv2 ~/Desktop> du -hs sheep/ 8 | 1.1G sheep/ 9 | 10 | aahu@barahv2 
~/Desktop> du -hs hydra/ 11 | 8.4G hydra/ 12 | 13 | aahu@barahv2 /m/a/b/dknet> du -sh agora/ 14 | 119G agora/ 15 | 16 | TODO: 17 | + scrape price data from evolution 18 | 19 | Contributions 20 | + collab filter builds categories 21 | + geographic spread of items 22 | + price over time? 23 | + lexicon? 24 | + entity resoultion 25 | -------------------------------------------------------------------------------- /bmrs/postprocessing/priceparsers.py: -------------------------------------------------------------------------------- 1 | import re 2 | import pandas as pd 3 | 4 | 5 | def price_btc_parse(s): 6 | if pd.isnull(s): 7 | return None 8 | if ('usd' in s.lower()) or ('$' in s): 9 | return None 10 | flt = re.findall("\d+\.\d+", s) 11 | if len(flt) > 0: 12 | return float(flt[0]) 13 | return None 14 | 15 | 16 | def price_usd_parse(s): 17 | if pd.isnull(s): 18 | return None 19 | if ('btc' in s.lower()) or ('฿' in s): 20 | return None 21 | if ('usd' not in s.lower()) and ('$' not in s): 22 | return None 23 | flt = re.findall("\d+\.\d+", s) 24 | if len(flt) > 0: 25 | return float(flt[0]) 26 | return None 27 | 28 | 29 | def is_btc_price(s): 30 | if pd.isnull(s): 31 | return False 32 | if ('btc' in s.lower()) or ('฿' in s): 33 | return True 34 | return False 35 | 36 | 37 | def is_usd_price(s): 38 | if pd.isnull(s): 39 | return False 40 | if ('usd' in s.lower()) or ('$' in s): 41 | return True 42 | return False 43 | 44 | 45 | def price_parse(s): 46 | if pd.isnull(s): 47 | return None 48 | if ('usd' in s.lower()) or ('$' in s): 49 | return price_usd_parse(s) 50 | if ('btc' in s.lower()) or ('฿' in s): 51 | return price_btc_parse(s) 52 | return None 53 | -------------------------------------------------------------------------------- /bmrs/freq_itemsets/agora.py: -------------------------------------------------------------------------------- 1 | 2 | from bmrs.freq_itemsets import spmf_interface 3 | import os 4 | import pandas as pd 5 | 6 | import logging 7 | FORMAT = 
'%(asctime)-15s %(levelname)-6s %(message)s' 8 | DATE_FORMAT = '%b %d %H:%M:%S' 9 | formatter = logging.Formatter(fmt=FORMAT, datefmt=DATE_FORMAT) 10 | handler = logging.StreamHandler() 11 | handler.setFormatter(formatter) 12 | logger = logging.getLogger(__name__) 13 | logger.addHandler(handler) 14 | logger.setLevel(logging.INFO) 15 | 16 | 17 | def load_agora(): 18 | DATA_DIR = '/home/aahu/Dropbox/black-market-recommender-systems/data/agora/' 19 | l = [] 20 | for fname in os.listdir(DATA_DIR): 21 | if fname.endswith('.tsv'): 22 | df0 = pd.read_csv(os.path.join(DATA_DIR, fname), sep='\t', parse_dates=['scrape_date']) 23 | l.append(df0) 24 | df = pd.concat(l) 25 | logger.info(df.columns) 26 | logger.info(df.shape) 27 | return df 28 | 29 | 30 | def main(): 31 | df = load_agora() 32 | 33 | baskets = [] 34 | dfg = df.groupby('vendor') 35 | for name, group in dfg: 36 | basket = set(group['category']) 37 | baskets.append(basket) 38 | 39 | fitms = spmf_interface.run_spmf_freq_itemsets(baskets, min_support=.02) 40 | 41 | print(fitms) 42 | 43 | 44 | if __name__ == '__main__': 45 | main() 46 | -------------------------------------------------------------------------------- /paper/bmrs.tex: -------------------------------------------------------------------------------- 1 | \documentclass[11pt]{amsart} 2 | \usepackage{amsmath} 3 | \usepackage{amssymb} 4 | \usepackage{amsthm} 5 | \usepackage{graphicx} 6 | \usepackage{algorithmic} 7 | \usepackage{multicol} 8 | \usepackage{cleverref} 9 | 10 | \theoremstyle{remark} 11 | \newtheorem{rem}{Remark} 12 | \newtheorem{defn}{Definition} 13 | 14 | \title{Black market reccomender systems} 15 | \author{Ryan Compton} 16 | \date{\today} 17 | 18 | \begin{document} 19 | 20 | \begin{abstract} 21 | 22 | In this work we study $2,265,306$ unique public listings posted by $9,757$ vendors on $6$ online black markets: Evolution, Silk Road 2, Pandora, Hydra, Cloudnine, and Agora. 
A recommender system based on matrix factorization is developed to address the problem of recommending products for vendors to offer based on other vendor offerings. 23 | 24 | 25 | 26 | \end{abstract} 27 | 28 | \maketitle 29 | 30 | \section{Introduction} 31 | \label{sec:introduction} 32 | 33 | 2013-12-20 until 2015-04-23 34 | 35 | \section{Data} 36 | \label{sec:data} 37 | 38 | \subsection{Description of markets} 39 | 40 | \subsubsection{Evolution} 41 | Evolution \url{k5zq47j6wd3wdvjq.onion} operated from 2014-01-14 until 2015-03-18 when it suddenly disappeared in an apparent exit scam \footnote{\url{http://www.forbes.com/sites/thomasbrewster/2015/03/18/evolution-market-a-scam-says-site-pr/}}. 42 | 43 | \subsection{Financial} 44 | 45 | 46 | 47 | \end{document} 48 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## Core latex/pdflatex auxiliary files: 2 | *.aux 3 | *.lof 4 | *.log 5 | *.lot 6 | *.fls 7 | *.out 8 | *.toc 9 | 10 | ## Intermediate documents: 11 | *.dvi 12 | *-converted-to.* 13 | # these rules might exclude image files for figures etc. 
14 | # *.ps 15 | # *.eps 16 | # *.pdf 17 | 18 | ## Bibliography auxiliary files (bibtex/biblatex/biber): 19 | *.bbl 20 | *.bcf 21 | *.blg 22 | *-blx.aux 23 | *-blx.bib 24 | *.brf 25 | *.run.xml 26 | 27 | ## Build tool auxiliary files: 28 | *.fdb_latexmk 29 | *.synctex 30 | *.synctex.gz 31 | *.synctex.gz(busy) 32 | *.pdfsync 33 | 34 | ## Auxiliary and intermediate files from other packages: 35 | 36 | # algorithms 37 | *.alg 38 | *.loa 39 | 40 | # achemso 41 | acs-*.bib 42 | 43 | # amsthm 44 | *.thm 45 | 46 | # beamer 47 | *.nav 48 | *.snm 49 | *.vrb 50 | 51 | #(e)ledmac/(e)ledpar 52 | *.end 53 | *.[1-9] 54 | *.[1-9][0-9] 55 | *.[1-9][0-9][0-9] 56 | *.[1-9]R 57 | *.[1-9][0-9]R 58 | *.[1-9][0-9][0-9]R 59 | *.eledsec[1-9] 60 | *.eledsec[1-9]R 61 | *.eledsec[1-9][0-9] 62 | *.eledsec[1-9][0-9]R 63 | *.eledsec[1-9][0-9][0-9] 64 | *.eledsec[1-9][0-9][0-9]R 65 | 66 | # glossaries 67 | *.acn 68 | *.acr 69 | *.glg 70 | *.glo 71 | *.gls 72 | 73 | # gnuplottex 74 | *-gnuplottex-* 75 | 76 | # hyperref 77 | *.brf 78 | 79 | # knitr 80 | *-concordance.tex 81 | *.tikz 82 | *-tikzDictionary 83 | 84 | # listings 85 | *.lol 86 | 87 | # makeidx 88 | *.idx 89 | *.ilg 90 | *.ind 91 | *.ist 92 | 93 | # minitoc 94 | *.maf 95 | *.mtc 96 | *.mtc0 97 | 98 | # minted 99 | _minted* 100 | *.pyg 101 | 102 | # morewrites 103 | *.mw 104 | 105 | # nomencl 106 | *.nlo 107 | 108 | # sagetex 109 | *.sagetex.sage 110 | *.sagetex.py 111 | *.sagetex.scmd 112 | 113 | # sympy 114 | *.sout 115 | *.sympy 116 | sympy-plots-for-*.tex/ 117 | 118 | # todonotes 119 | *.tdo 120 | 121 | # xindy 122 | *.xdy 123 | 124 | # WinEdt 125 | *.bak 126 | *.sav 127 | 128 | # Byte-compiled / optimized / DLL files 129 | __pycache__/ 130 | *.py[cod] 131 | *$py.class 132 | 133 | # C extensions 134 | *.so 135 | 136 | # Distribution / packaging 137 | .Python 138 | env/ 139 | build/ 140 | develop-eggs/ 141 | dist/ 142 | downloads/ 143 | eggs/ 144 | .eggs/ 145 | lib/ 146 | lib64/ 147 | parts/ 148 | sdist/ 149 | var/ 150 | *.egg-info/ 
151 | .installed.cfg 152 | *.egg 153 | 154 | # PyInstaller 155 | # Usually these files are written by a python script from a template 156 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 157 | *.manifest 158 | *.spec 159 | 160 | # Installer logs 161 | pip-log.txt 162 | pip-delete-this-directory.txt 163 | 164 | # Unit test / coverage reports 165 | htmlcov/ 166 | .tox/ 167 | .coverage 168 | .coverage.* 169 | .cache 170 | nosetests.xml 171 | coverage.xml 172 | *,cover 173 | 174 | # Translations 175 | *.mo 176 | *.pot 177 | 178 | # Django stuff: 179 | *.log 180 | 181 | # Sphinx documentation 182 | docs/_build/ 183 | 184 | # PyBuilder 185 | target/ 186 | -------------------------------------------------------------------------------- /bmrs/parsers/pandora.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # coding: utf-8 3 | 4 | from bs4 import BeautifulSoup 5 | import re 6 | import pandas as pd 7 | import dateutil 8 | import os 9 | import traceback 10 | 11 | 12 | DATA_DIR = '/home/aahu/Desktop/pandora/' 13 | RESULT_DIR = 'data/pandora/' 14 | 15 | 16 | def html_to_df(fname, fdate, category): 17 | """ 18 | parse a pandora html file 19 | must spec date file was scraped 20 | """ 21 | soup = BeautifulSoup(open(fname)) 22 | 23 | # first, get cat name 24 | lis = soup.find_all('li') 25 | cats = {} 26 | for li in lis: 27 | cs = li.find(href=re.compile("listing/cat/.")) 28 | if cs is not None: 29 | cats[cs['href'].split('/')[-1]] = cs.text.strip() 30 | try: 31 | catname = cats[category] 32 | except KeyError: 33 | catname = category 34 | 35 | tbls = soup.find_all('tr') 36 | l = [] 37 | for tbl in tbls: 38 | d = {} 39 | if len(tbl.find_all('strong')) > 0: 40 | itemtag = tbl.find('a') 41 | item = itemtag.find(text=True) 42 | if item is not None: 43 | d['item'] = str(item).strip() 44 | sellertag = itemtag.findNext('a') 45 | seller = sellertag.find(text=True) 46 | d['vendor'] = 
str(seller).strip() 47 | shipfromtag = sellertag.findNext('strong') 48 | shipsfrom = shipfromtag.nextSibling 49 | d['ships_from'] = str(shipsfrom).strip() 50 | shiptotag = shipfromtag.findNext('strong') 51 | shipsto = shiptotag.nextSibling 52 | d['ships_to'] = str(shipsto).strip() 53 | pricetag = shiptotag.findNext('strong') 54 | price = pricetag.nextSibling 55 | d['price_usd'] = str(price).strip() 56 | d['scrape_date'] = fdate 57 | d['category'] = catname 58 | l.append(d) 59 | return pd.DataFrame(l) 60 | 61 | 62 | def catdir_to_df(catdir, cat, fdate): 63 | if not os.path.isdir(catdir): 64 | print('not dir!, trying to parse the file... ', catdir) 65 | try: 66 | return html_to_df(catdir, category=cat, fdate=fdate) 67 | except: 68 | traceback.print_tb() 69 | return 70 | fs = os.listdir(catdir) 71 | fs = map(lambda x: os.path.join(catdir, x), fs) 72 | l = [html_to_df(f, category=cat, fdate=fdate) for f in fs] 73 | return pd.concat(l) 74 | 75 | 76 | def main(): 77 | for datestr in os.listdir(DATA_DIR): 78 | d1 = os.path.join(DATA_DIR, datestr) 79 | fdate = dateutil.parser.parse(datestr).date() 80 | catdir = os.path.join(d1, 'listing/cat') 81 | if os.path.exists(catdir): 82 | l = [] 83 | for cat in os.listdir(catdir): 84 | onecat_dir = os.path.join(catdir, cat) 85 | df = catdir_to_df(onecat_dir, cat=cat, fdate=fdate) 86 | l.append(df) 87 | print('done with cat: ', cat) 88 | df = pd.concat(l) 89 | outname = 'pandora_' + fdate.isoformat() + '.tsv' 90 | df.to_csv(os.path.join(RESULT_DIR, outname), '\t', index=False) 91 | print('done with: ' + outname) 92 | 93 | 94 | if __name__ == '__main__': 95 | main() 96 | -------------------------------------------------------------------------------- /bmrs/parsers/hydra.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # coding: utf-8 3 | 4 | from bs4 import BeautifulSoup 5 | import re 6 | import pandas as pd 7 | import dateutil 8 | import os 9 | import traceback 10 | 
import unicodedata as ud 11 | import logging 12 | FORMAT = '%(asctime)-15s %(levelname)-6s %(message)s' 13 | DATE_FORMAT = '%b %d %H:%M:%S' 14 | formatter = logging.Formatter(fmt=FORMAT, datefmt=DATE_FORMAT) 15 | handler = logging.StreamHandler() 16 | handler.setFormatter(formatter) 17 | logger = logging.getLogger(__name__) 18 | logger.addHandler(handler) 19 | logger.setLevel(logging.INFO) 20 | 21 | 22 | DATA_DIR = '/home/aahu/Desktop/hydra/' 23 | RESULT_DIR = 'data/hydra/' 24 | if not os.path.exists(RESULT_DIR): 25 | os.mkdir(RESULT_DIR) 26 | 27 | 28 | def html_to_df(fname, fdate): 29 | """ 30 | parse a hydra html file 31 | """ 32 | try: 33 | soup = BeautifulSoup(open(fname, encoding='utf-8', errors='ignore')) 34 | except UnicodeDecodeError: 35 | logger.info('UnicodeDecodeError... meh {}'.format(fname)) 36 | return 37 | 38 | cat = [x.strip() for x in soup.find('title').text.split("::")] 39 | 40 | tbl = soup.find('tbody') 41 | if not tbl: 42 | logger.warning('no items in {}'.format(fname)) 43 | return 44 | 45 | l = [] 46 | for item in tbl.find_all('tr', class_=re.compile('odd|even')): 47 | try: 48 | listing = item.find('a', href=re.compile('/sale/.*')).text 49 | vendor = item.find('a', href=re.compile('/vendor/.*')).text 50 | details = item.find('td', {'class', "col-xs-4"}) 51 | price = None 52 | ships_to = None 53 | ships_from = None 54 | if details: 55 | price = details.find('h5', {'class', "text-success"}) 56 | if price: 57 | price = price.text 58 | ships = details.find_all('span')[-1] 59 | if ships: 60 | ships_from, ships_to = ships.text.split(' ') 61 | d = {} 62 | d['listing'] = listing.strip() 63 | d['price_usd'] = price.split()[0] 64 | d['vendor'] = vendor.strip() 65 | d['ships_from'] = ships_from.strip() 66 | d['ships_to'] = ships_to.strip() 67 | d['category'] = cat 68 | d['scrape_date'] = fdate 69 | l.append(d) 70 | except: 71 | pass 72 | 73 | return pd.DataFrame(l) 74 | 75 | 76 | def main(): 77 | for datestr in os.listdir(DATA_DIR): 78 | fdate = 
dateutil.parser.parse(datestr).date() 79 | l = [] 80 | datedir = os.path.join(DATA_DIR, datestr) 81 | catdir = os.path.join(datedir, 'category') 82 | if not os.path.exists(catdir): 83 | continue 84 | logger.info(catdir) 85 | l = [] 86 | for cat in os.listdir(catdir): 87 | # logger.info(cat) 88 | if int(cat.split('.html')[0]) % 100 != 0: # avoid double counts 89 | l.append(html_to_df(os.path.join(catdir, cat), fdate)) 90 | df = pd.concat(l) 91 | if len(df) > 0: 92 | outname = 'hydra_' + fdate.isoformat() + '.tsv' 93 | df.to_csv(os.path.join(RESULT_DIR, outname), '\t', index=False) 94 | logger.info('wrote {0} lines to: {1}'.format(len(df), outname)) 95 | else: 96 | logger.warning('no data?' + catdir) 97 | 98 | 99 | if __name__ == '__main__': 100 | main() 101 | -------------------------------------------------------------------------------- /bmrs/parsers/cloudnine.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # coding: utf-8 3 | 4 | from bs4 import BeautifulSoup 5 | import re 6 | import pandas as pd 7 | import dateutil 8 | import os 9 | import traceback 10 | import unicodedata as ud 11 | import logging 12 | FORMAT = '%(asctime)-15s %(levelname)-6s %(message)s' 13 | DATE_FORMAT = '%b %d %H:%M:%S' 14 | formatter = logging.Formatter(fmt=FORMAT, datefmt=DATE_FORMAT) 15 | handler = logging.StreamHandler() 16 | handler.setFormatter(formatter) 17 | logger = logging.getLogger(__name__) 18 | logger.addHandler(handler) 19 | logger.setLevel(logging.INFO) 20 | 21 | 22 | DATA_DIR = '/home/aahu/Desktop/cloudnine/' 23 | RESULT_DIR = 'data/cloudnine/' 24 | 25 | 26 | def html_to_dict(fname, fdate): 27 | """ 28 | parse a cloudnine html file 29 | must spec date file was scraped 30 | """ 31 | d = {} 32 | try: 33 | soup = BeautifulSoup(open(fname, encoding='utf-8', errors='ignore')) 34 | except UnicodeDecodeError: 35 | logger.info('UnicodeDecodeError... 
meh {}'.format(fname)) 36 | return 37 | 38 | # first, get cats name 39 | bcs = soup.find('ol', {'class', 'breadcrumb'}) 40 | if not bcs: 41 | logger.warning('no breadcrumb in {}'.format(fname)) 42 | return 43 | cat = [li.text for li in bcs.find_all('li')] 44 | listing = cat[-1] 45 | cat = cat[:-1] 46 | d['listing'] = listing.strip() 47 | d['cat'] = tuple(map(lambda x: x.strip(), cat)) 48 | 49 | prd = soup.find('div', {'class', 'productbox'}) 50 | if prd is None: 51 | return 52 | div = [l.text for l in prd.find_all('div')] 53 | if len(div) == 5: 54 | q_sold = div[-1] 55 | d['quantity_sold'] = q_sold 56 | 57 | price, ships_from, ships_to, quantity = div[:4] 58 | d['price'] = price.strip() 59 | d['ships_from'] = ships_from.strip() 60 | d['ships_to'] = ships_to.strip() 61 | d['quantity_available'] = quantity.strip() 62 | 63 | vtag = soup.find(text=re.compile('.*Public PGP key of.*')) 64 | if str(vtag) == vtag: 65 | if len(vtag.parent()) > 0: 66 | d['vendor'] = vtag.parent()[0].text.strip() 67 | else: 68 | try: 69 | d['vendor'] = str(vtag).split('\n')[1].strip() 70 | logger.debug(d['vendor']) 71 | except IndexError: 72 | logger.exception(vtag) 73 | else: 74 | try: 75 | vendor = vtag.parent.find('a').text 76 | d['vendor'] = vendor.strip() 77 | except AttributeError: 78 | logger.warning(vtag) 79 | 80 | d['scrape_date'] = fdate 81 | return d 82 | 83 | 84 | def main(): 85 | for datestr in os.listdir(DATA_DIR): 86 | fdate = dateutil.parser.parse(datestr).date() 87 | l = [] 88 | datedir = os.path.join(DATA_DIR, datestr) 89 | for fname in os.listdir(datedir): 90 | if fname.endswith('product'): 91 | d = html_to_dict(os.path.join(datedir, fname), fdate=fdate) 92 | if d is not None: 93 | l.append(d) 94 | if l: 95 | df = pd.DataFrame(l) 96 | outname = 'cloudnine_' + fdate.isoformat() + '.tsv' 97 | df.to_csv(os.path.join(RESULT_DIR, outname), '\t', index=False) 98 | logger.info('done with: ' + outname) 99 | else: 100 | logger.warning('no data in {}'.format(datestr)) 101 | 102 | 103 
| if __name__ == '__main__': 104 | main() 105 | -------------------------------------------------------------------------------- /bmrs/parsers/silkroad2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # coding: utf-8 3 | 4 | from bs4 import BeautifulSoup 5 | import re 6 | import pandas as pd 7 | import dateutil 8 | import os 9 | import traceback 10 | import unicodedata as ud 11 | import logging 12 | FORMAT = '%(asctime)-15s %(levelname)-6s %(message)s' 13 | DATE_FORMAT = '%b %d %H:%M:%S' 14 | formatter = logging.Formatter(fmt=FORMAT, datefmt=DATE_FORMAT) 15 | handler = logging.StreamHandler() 16 | handler.setFormatter(formatter) 17 | logger = logging.getLogger(__name__) 18 | logger.addHandler(handler) 19 | logger.setLevel(logging.INFO) 20 | 21 | 22 | DATA_DIR = '/home/aahu/Desktop/silkroad2/' 23 | RESULT_DIR = 'data/silkroad2/' 24 | if not os.path.exists(RESULT_DIR): 25 | os.mkdir(RESULT_DIR) 26 | 27 | 28 | def html_to_df(fname, fdate, cat): 29 | """ 30 | parse a silkroad2 html file 31 | """ 32 | try: 33 | soup = BeautifulSoup(open(fname, encoding='utf-8', errors='ignore')) 34 | except UnicodeDecodeError: 35 | logger.info('UnicodeDecodeError... 
meh {}'.format(fname)) 36 | return 37 | 38 | items = soup.find_all('div', {'class', 'item'}) 39 | if not items: 40 | logger.warning('no items in {}'.format(fname)) 41 | return 42 | 43 | l = [] 44 | for item in items: 45 | if not item.find('div', {'class', 'item_title'}): 46 | continue 47 | listing = item.find('div', {'class', 'item_title'}).find('a').text 48 | 49 | price = item.find('div', {'class', 'price'}) 50 | if not price: 51 | price = item.find('div', {'class', 'price_big'}) 52 | if not price: 53 | price = None 54 | else: 55 | price = price.text 56 | 57 | dtag = item.find('div', {'class', 'item_details'}) 58 | 59 | vtag = item.find('div', {'class', 'vendor'}) 60 | vendor = None 61 | if vtag: 62 | if vtag.find('a'): 63 | vendor = vtag.find('a').text 64 | if not vendor: 65 | if dtag: 66 | if dtag.find('a'): 67 | vendor = dtag.find('a').text 68 | 69 | ships_from = None 70 | ships_to = None 71 | stag = item.find('div', {'class', 'shipping'}) 72 | if stag: 73 | try: 74 | sl = stag.text.split('\n') 75 | ships_from = [x for x in sl if 'ships from:' in x.lower()][0] 76 | ships_from = ships_from.replace('ships from:', '').strip() 77 | ships_to = [x for x in sl if 'ships to:' in x.lower()][0].strip() 78 | ships_to = ships_to.replace('ships to:', '').strip() 79 | except: 80 | logger.info(stag) 81 | 82 | else: 83 | if dtag: 84 | try: 85 | sl = dtag.text.split('\n') 86 | ships_from = [x for x in sl if 'ships from:' in x.lower()][0] 87 | ships_from = ships_from.replace('ships from:', '').strip() 88 | ships_to = [x for x in sl if 'ships to:' in x.lower()][0].strip() 89 | ships_to = ships_to.replace('ships to:', '').strip() 90 | except: 91 | logger.info(dtag) 92 | 93 | d = {} 94 | d['listing'] = listing 95 | d['price_btc'] = price 96 | d['vendor'] = vendor 97 | d['ships_from'] = ships_from 98 | d['ships_to'] = ships_to 99 | d['category'] = cat 100 | d['scrape_date'] = fdate 101 | l.append(d) 102 | 103 | return pd.DataFrame(l) 104 | 105 | 106 | def main(): 107 | for datestr 
in os.listdir(DATA_DIR): 108 | fdate = dateutil.parser.parse(datestr).date() 109 | l = [] 110 | datedir = os.path.join(DATA_DIR, datestr) 111 | catdir = os.path.join(datedir, 'categories') 112 | if not os.path.exists(catdir): 113 | continue 114 | logger.info(catdir) 115 | l = [] 116 | for cat in os.listdir(catdir): 117 | dname = os.path.join(catdir, cat) 118 | for f in os.listdir(dname): 119 | fname = os.path.join(dname, f) 120 | catf = html_to_df(fname, fdate=fdate, cat=cat) 121 | l.append(catf) 122 | df = pd.concat(l) 123 | outname = 'silkroad2_' + fdate.isoformat() + '.tsv' 124 | df.to_csv(os.path.join(RESULT_DIR, outname), '\t', index=False) 125 | logger.info('wrote {0} lines to: {1}'.format(len(df), outname)) 126 | 127 | 128 | if __name__ == '__main__': 129 | main() 130 | -------------------------------------------------------------------------------- /bmrs/parsers/agora.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # coding: utf-8 3 | 4 | from bs4 import BeautifulSoup 5 | import re 6 | import pandas as pd 7 | import dateutil 8 | import os 9 | import traceback 10 | import unicodedata as ud 11 | import itertools 12 | import logging 13 | FORMAT = '%(asctime)-15s %(levelname)-6s %(message)s' 14 | DATE_FORMAT = '%b %d %H:%M:%S' 15 | formatter = logging.Formatter(fmt=FORMAT, datefmt=DATE_FORMAT) 16 | handler = logging.StreamHandler() 17 | handler.setFormatter(formatter) 18 | logger = logging.getLogger(__name__) 19 | logger.addHandler(handler) 20 | logger.setLevel(logging.INFO) 21 | 22 | 23 | DATA_DIR = '/media/aahu/buffalo/dknet/agora' 24 | RESULT_DIR = 'data/agora/' 25 | if not os.path.exists(RESULT_DIR): 26 | os.mkdir(RESULT_DIR) 27 | 28 | 29 | def no_table_html_to_df(fname, fdate, cat): 30 | """ 31 | when the table parse fails try this 32 | """ 33 | pass 34 | 35 | 36 | def html_to_df(fname, fdate): 37 | """ 38 | parse an agora html file 39 | """ 40 | try: 41 | soup = BeautifulSoup(open(fname, 
encoding='utf-8', errors='ignore')) 42 | except UnicodeDecodeError: 43 | logger.info('UnicodeDecodeError... meh {}'.format(fname)) 44 | return 45 | 46 | cat = soup.find('title') 47 | if cat: 48 | cat = cat.text.strip() 49 | else: 50 | logger.warning('no cat in {}'.format(fname)) 51 | return 52 | 53 | tbl = soup.find('tbody') 54 | tbl = soup.find('table', {'class', 'products-list'}) 55 | if not tbl: 56 | logger.warning('no items in {}'.format(fname)) 57 | return no_table_html_to_df(fname, fdate, cat) 58 | 59 | l = [] 60 | for item in tbl.find_all('tr', {'class', 'products-list-item'}): 61 | # try: 62 | listing = item.find('a', href=re.compile('/p/.*')) 63 | if listing: 64 | listing = listing.text 65 | else: 66 | return 67 | vendor = item.find('a', {'class', 'gen-user-link'}, href=re.compile('/vendor/.*')) 68 | if vendor: 69 | vendor = vendor.get('href') 70 | else: 71 | return 72 | 73 | price = None 74 | pricel = [x.text for x in item.find_all('td') if 'BTC' in x.text] 75 | if len(pricel) > 0: 76 | price = pricel[0].strip() 77 | 78 | ships_from = None 79 | ships_to = None 80 | ships = [x for x in item.find_all('td') if ('From:' in x.text) or ('To:' in x.text)] 81 | if len(ships) > 0: 82 | shipl = ships[0].text.split('\n') 83 | ships_from = [x for x in shipl if 'From:' in x] 84 | if ships_from: 85 | ships_from = ships_from[0].replace('From:', '').strip() 86 | else: 87 | ships_from = None 88 | ships_to = [x for x in shipl if 'To:' in x] 89 | if ships_to: 90 | ships_to = ships_to[0].replace('To:', '').strip() 91 | else: 92 | ships_to = None 93 | 94 | d = {} 95 | d['listing'] = listing.strip() 96 | d['price_btc'] = price.split()[0] 97 | d['vendor'] = vendor.strip() 98 | d['ships_from'] = ships_from 99 | d['ships_to'] = ships_to 100 | d['category'] = cat 101 | d['scrape_date'] = fdate 102 | l.append(d) 103 | # except: 104 | # pass 105 | # 106 | return pd.DataFrame(l) 107 | 108 | 109 | def main(): 110 | for datestr in os.listdir(DATA_DIR): 111 | try: 112 | fdate = 
dateutil.parser.parse(datestr).date() 113 | datedir = os.path.join(DATA_DIR, datestr) 114 | catdir = os.path.join(datedir, 'cat') 115 | if not os.path.exists(catdir): 116 | continue 117 | logger.info(catdir) 118 | 119 | # figure category files 120 | catfiles = [] 121 | for root, dirnames, filenames in os.walk(catdir): 122 | for filename in filenames: 123 | catfiles.append(os.path.join(root, filename)) 124 | 125 | l = [] 126 | for catfile in catfiles: 127 | df0 = html_to_df(catfile, fdate) 128 | l.append(df0) 129 | df = pd.concat(l) 130 | if len(df) > 0: 131 | outname = 'agora_' + fdate.isoformat() + '.tsv' 132 | df.to_csv(os.path.join(RESULT_DIR, outname), '\t', index=False) 133 | logger.info('wrote {0} lines to: {1}'.format(len(df), outname)) 134 | else: 135 | logger.warning('no data?' + catdir) 136 | except: 137 | pass 138 | 139 | if __name__ == '__main__': 140 | main() 141 | -------------------------------------------------------------------------------- /bmrs/freq_itemsets/spmf_interface.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import uuid 4 | import pandas as pd 5 | # coding: utf-8 6 | import subprocess 7 | 8 | 9 | def run_spmf_association_rule(baskets): 10 | """ 11 | given a list of sets (ie market baskets) 12 | assign ints to each item in the data 13 | format and run spmf via subprocess 14 | map back to original format 15 | return dataframe 16 | """ 17 | 18 | # http://stackoverflow.com/questions/952914/making-a-flat-list-out-of-list-of-lists-in-python 19 | all_items = set([item for sublist in baskets for item in sublist]) 20 | 21 | # spmf wants ints 22 | lbls = {} 23 | rev_lbls = {} 24 | for idx, item in enumerate(all_items): 25 | lbls[item] = str(idx) 26 | rev_lbls[str(idx)] = item 27 | 28 | SPMF_INPUT = 'data/spmf_input_{}.txt'.format(uuid.uuid1()) 29 | 30 | with open(SPMF_INPUT, 'w') as fout: 31 | for basket in baskets: 32 | fout.write(' '.join([lbls[item] for item in basket])) 33 | 
fout.write('\n') 34 | 35 | SPMF_OUTPUT = 'data/spmf_output_{}.txt'.format(uuid.uuid1()) 36 | subprocess.call(["java", "-jar", "bin/spmf.jar", "run", "FPGrowth_association_rules", 37 | SPMF_INPUT, SPMF_OUTPUT, ".01", ".1"]) 38 | 39 | l = [] 40 | with open(SPMF_OUTPUT, 'r') as fin: 41 | for line in fin: 42 | d = {} 43 | antecedent = line.split('==>')[0].strip() 44 | d['antecedent'] = [rev_lbls[a] for a in antecedent.split()] 45 | consequent = line.split('==>')[1].split('#')[0].strip() 46 | d['consequent'] = [rev_lbls[c] for c in consequent.split()] 47 | support = line.split(':')[1].split('#')[0].strip() 48 | d['support'] = int(support) 49 | confidence = line.split(':')[2].strip() 50 | d['confidence'] = float(confidence) 51 | l.append(d) 52 | df = pd.DataFrame(l) 53 | # reorder columns 54 | df = df[['antecedent', 'consequent', 'support', 'confidence']] 55 | df = df.sort('confidence', ascending=False) 56 | return df 57 | 58 | 59 | def run_spmf_freq_itemsets(baskets, min_support=.1): 60 | """ 61 | given a list of sets (ie market baskets) 62 | assign ints to each item in the data 63 | format and run spmf via subprocess 64 | map back to original format 65 | return dataframe 66 | """ 67 | 68 | # http://stackoverflow.com/questions/952914/making-a-flat-list-out-of-list-of-lists-in-python 69 | all_items = set([item for sublist in baskets for item in sublist]) 70 | 71 | # spmf wants ints 72 | lbls = {} 73 | rev_lbls = {} 74 | for idx, item in enumerate(all_items): 75 | lbls[item] = str(idx) 76 | rev_lbls[str(idx)] = item 77 | SPMF_INPUT = 'data/spmf_input_{}.txt'.format(uuid.uuid1()) 78 | 79 | with open(SPMF_INPUT, 'w') as fout: 80 | for basket in baskets: 81 | fout.write(' '.join([lbls[item] for item in basket])) 82 | fout.write('\n') 83 | 84 | SPMF_OUTPUT = 'data/spmf_output_{}.txt'.format(uuid.uuid1()) 85 | subprocess.call(["java", "-jar", "bin/spmf.jar", "run", "Charm_MFI", 86 | SPMF_INPUT, SPMF_OUTPUT, str(min_support)]) 87 | 88 | l = [] 89 | with open(SPMF_OUTPUT, 'r') 
as fin: 90 | for line in fin: 91 | d = {} 92 | itemset = line.split('#SUP:')[0].strip() 93 | d['itemset'] = [rev_lbls[a] for a in itemset.split()] 94 | support = line.split(':')[1].split('#')[0].strip() 95 | d['support'] = int(support) 96 | l.append(d) 97 | df = pd.DataFrame(l) 98 | # reorder columns 99 | df = df[['itemset', 'support']] 100 | df = df.sort('support', ascending=False) 101 | 102 | os.unlink(SPMF_INPUT) 103 | os.unlink(SPMF_OUTPUT) 104 | 105 | return df 106 | 107 | 108 | def load_evo(): 109 | df = pd.read_csv('/home/aahu/Downloads/evolution/evolution/products_vendors.tsv', 110 | sep='\t') 111 | # discard meta-categories" 112 | meta_cats = ['Other', 'Drugs', 'Guides & Tutorials', 'Fraud Related', 113 | 'Services', 'Digital Goods', 'Electronics', 'Custom Listings'] 114 | df = df[df['category'].map(lambda x:x not in meta_cats)] 115 | return df 116 | 117 | 118 | def main(): 119 | #df = load_evo() 120 | 121 | df = pd.read_csv('data/pandora/pandora_2014-09-25.tsv', sep='\t') 122 | 123 | baskets = [] 124 | dfg = df.groupby('vendor') 125 | for name, group in dfg: 126 | basket = set(group['category']) 127 | baskets.append(basket) 128 | 129 | rule_df = run_spmf_freq_itemsets(baskets) 130 | # rule_df.to_csv('learned_rules.tsv',index=False,sep='\t') 131 | print(rule_df) 132 | 133 | if __name__ == '__main__': 134 | main() 135 | -------------------------------------------------------------------------------- /bmrs/parsers/evolution.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # coding: utf-8 3 | 4 | from bs4 import BeautifulSoup 5 | import re 6 | import pandas as pd 7 | import dateutil 8 | import os 9 | import traceback 10 | import itertools 11 | 12 | import concurrent.futures 13 | 14 | import logging 15 | FORMAT = '%(asctime)-15s %(levelname)-6s %(message)s' 16 | DATE_FORMAT = '%b %d %H:%M:%S' 17 | formatter = logging.Formatter(fmt=FORMAT, datefmt=DATE_FORMAT) 18 | handler = logging.StreamHandler() 
handler.setFormatter(formatter)
logger = logging.getLogger(__name__)
logger.addHandler(handler)
logger.setLevel(logging.INFO)


# Raw scrapes live under ~/Desktop/evolution/<date>/; parsed TSVs go to data/evolution/.
DATA_DIR = os.path.join(os.getenv('HOME'), 'Desktop/evolution/')
RESULT_DIR = 'data/evolution/'
if not os.path.exists(RESULT_DIR):
    os.mkdir(RESULT_DIR)


def listing_html_to_dict(fname, fdate):
    """Parse one Evolution listing HTML page into a flat record.

    Parameters
    ----------
    fname : str
        Path to the scraped listing page.
    fdate : datetime
        Date the file was scraped; stored as ``scrape_date``.

    Returns
    -------
    dict or None
        ``None`` when the page is undecodable, is the marketplace home
        page, or lacks the breadcrumb / detail markup we extract from.
    """
    logger.debug(fname)
    d = {}
    try:
        # FIX: use a context manager so the file handle is always closed;
        # the old code passed a bare open() to BeautifulSoup and leaked it.
        with open(fname, encoding='utf-8', errors='ignore') as fh:
            soup = BeautifulSoup(fh)
    except UnicodeDecodeError:
        logger.info('UnicodeDecodeError... meh {}'.format(fname))
        return
    if soup.title.text.strip() == 'Evolution :: Home':
        logger.info('Home listing... {}'.format(fname))
        return

    d['scrape_date'] = fdate

    # Vendor link normally sits inside the product pane; otherwise fall back
    # to a page-wide search and tag the record as suspect.
    col8 = soup.find('div', class_="col-md-8 page-product")
    if col8 is not None:
        vendor = col8.find(href=re.compile('http://k5zq47j6wd3wdvjq.onion/profile/.*'))
        d['vendor'] = vendor.text.strip()
    else:
        vendor = soup.find(href=re.compile('http://k5zq47j6wd3wdvjq.onion/profile/.*'))
        if vendor is not None:
            d['vendor'] = vendor.text.strip() + '__is_gwern??'

    # NOTE(review): the second argument is a *set* literal, not the dict
    # {'class': 'breadcrumb'}; bs4 appears to tolerate it here — confirm
    # against the scraped markup before "fixing".
    bcs = soup.find('ol', {'class', 'breadcrumb'})
    if not bcs:
        logger.warning('no breadcrumb in {}'.format(fname))
        return
    # Breadcrumb layout: [category, subcategory, ..., listing title].
    cat = [li.text for li in bcs.find_all('li')]
    listing = cat[-1]
    cat = cat[:-1]
    d['listing'] = listing.strip()
    d['cat_tuple'] = tuple(map(lambda x: x.strip(), cat))
    d['category'] = d['cat_tuple'][-1]

    md7 = soup.find('div', class_='col-md-7')
    if md7 is None:
        logger.warning('no md7')
        print(soup.prettify())
        return
    price = md7.find('strong')
    if price:
        d['price'] = price.text.strip()
    ship = md7.find('dl', class_="dl-horizontal")
    if ship:
        for dt in ship.find_all('dt'):
            if dt.text.lower() == 'ships from':
                d['ships_from'] = dt.find_next_sibling('dd').text.strip()

    # Description and "Ships To" live in the top-level container div.
    alldivs = soup.find_all('div', class_='container')
    bigdiv = [x for x in alldivs if x.parent == soup.body]
    if len(bigdiv) > 0:
        #d['big_text'] = bigdiv[0]
        for h4 in bigdiv[0].find_all('h4'):
            if h4.text.strip() == 'Description':
                d['description'] = h4.find_next_sibling('p').text.strip().replace('\n', ' ')
            if h4.text.strip() == 'Ships To':
                d['ships_to'] = h4.find_next_sibling('p').text.strip()
    #logger.info(d)
    return d


def listdir_to_df(listdir, fdate):
    """Parse every listing file in *listdir*; return a DataFrame or None if empty."""
    logger.info('processing: {}'.format(listdir))
    fs = os.listdir(listdir)
    fs = map(lambda x: os.path.join(listdir, x), fs)
    l = []
    for f in fs:
        if os.path.isfile(f):
            try:
                d = listing_html_to_dict(f, fdate)
                if d is not None:
                    l.append(d)
            except Exception:
                # FIX: narrowed from a bare `except:` so SystemExit /
                # KeyboardInterrupt still propagate; one bad page must not
                # kill the whole batch.
                logger.exception("except")
    if len(l) > 0:
        dfout = pd.DataFrame(l)
        print(dfout.head(10))
        logger.info('shape dfout: {}'.format(dfout.shape))
        return dfout
    else:
        # FIX: logger.warn is a deprecated alias of logger.warning.
        logger.warning('nothing in {}'.format(listdir))
        return None


def tuple_eater(tup):  # for concurrent
    """Unpack (listdir, fdate) — ProcessPoolExecutor.map passes a single argument."""
    return listdir_to_df(tup[0], tup[1])

| 121 | def get_dirs_and_dates(): 122 | l = [] 123 | for datestr in os.listdir(DATA_DIR): 124 | d1 = os.path.join(DATA_DIR, datestr) 125 | fdate = dateutil.parser.parse(datestr) 126 | listdir = os.path.join(d1, 'listing') 127 | if os.path.exists(listdir): 128 | l.append((listdir, fdate)) 129 | return l 130 | 131 | 132 | def main(): 133 | inp = get_dirs_and_dates() 134 | 135 | # concurrent! 136 | with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor: 137 | ds = executor.map(tuple_eater, inp) 138 | 139 | ds = itertools.chain.from_iterable(ds) 140 | ds = list(ds) 141 | ds = [df for df in ds if df is not None] 142 | ds = [df for df in ds if len(df) > 0] 143 | 144 | for idx, df in enumerate(ds): 145 | try: 146 | outname = 'evolution_{}.tsv'.format(idx) 147 | df.to_csv(os.path.join(RESULT_DIR, outname), '\t', index=False) 148 | except: 149 | logger.exception('df?????{}'.format(type(df))) 150 | 151 | #dfs = [] 152 | #for dd in inp: 153 | #dfs.append(tuple_eater(dd)) 154 | 155 | # write 156 | #df = pd.DataFrame(ds) 157 | df = pd.concat(ds) 158 | df = df.drop_duplicates() 159 | outname = 'evolution.tsv' 160 | 161 | 162 | if __name__ == '__main__': 163 | main() 164 | -------------------------------------------------------------------------------- /bmrs/postprocessing/enforce_schema.py: -------------------------------------------------------------------------------- 1 | """ 2 | The dataypes across different parses have been a real drag. 
3 | This script will dump all the tsvs into a sqllite table that is easier to work with 4 | """ 5 | import pandas as pd 6 | import numpy as np 7 | import re 8 | import ast 9 | import bmrs.postprocessing.priceparsers as pp 10 | import sqlalchemy 11 | import sqlite3 12 | import logging 13 | FORMAT = '%(asctime)-15s %(levelname)-6s %(message)s' 14 | DATE_FORMAT = '%b %d %H:%M:%S' 15 | formatter = logging.Formatter(fmt=FORMAT, datefmt=DATE_FORMAT) 16 | handler = logging.StreamHandler() 17 | handler.setFormatter(formatter) 18 | logger = logging.getLogger(__name__) 19 | logger.addHandler(handler) 20 | logger.setLevel(logging.INFO) 21 | 22 | 23 | def load_silkroad2(): 24 | logger.info('load_silkroad2') 25 | FNAME = 'data/silkroad2.tsv' 26 | df = pd.read_csv(FNAME, sep='\t', parse_dates=['scrape_date']) 27 | df['price_btc'] = df['price_btc'].map(pp.price_btc_parse) 28 | df['price_usd'] = df['price_btc'].map(lambda x: None) # TODO: another way 29 | df['cat_tuple'] = df['cat'].map(lambda x: tuple(ast.literal_eval(x))) 30 | df = df.drop('cat', axis=1) 31 | df['cat_tuple'] = df['cat_tuple'].astype(str) 32 | df['marketplace'] = 'silkroad2' 33 | return df 34 | 35 | 36 | def load_agora(): 37 | logger.info('load_agora') 38 | FNAME = 'data/agora.tsv' 39 | df = pd.read_csv(FNAME, sep='\t', parse_dates=['scrape_date']) 40 | df['vendor'] = df['vendor'].map(lambda x: x.split('/')[2].strip('#')) 41 | df['price_btc'] = df['price_btc'].map(pp.price_btc_parse) 42 | df['price_usd'] = df['price_btc'].map(lambda x: None) # TODO: another way 43 | df['marketplace'] = 'agora' 44 | return df 45 | 46 | 47 | def load_evolution(): 48 | FNAME = 'data/evolution.tsv' 49 | return pd.read_csv(FNAME, sep='\t', parse_dates=['scrape_date']) 50 | 51 | 52 | def load_cloudnine(): 53 | logger.info('load_cloudnine') 54 | FNAME = 'data/cloudnine.tsv' 55 | df = pd.read_csv(FNAME, sep='\t', parse_dates=['scrape_date']) 56 | df['cat_tuple'] = df['cat'] 57 | df = df.drop('cat', axis=1) 58 | df['price_btc'] = 
df[df['price'].map(pp.is_btc_price)]['price'].map(pp.price_parse) 59 | df['price_usd'] = df[df['price'].map(pp.is_usd_price)]['price'].map(pp.price_parse) 60 | df = df.drop('price', axis=1) 61 | df = df.drop('quantity_available', axis=1) 62 | df = df.drop('quantity_sold', axis=1) 63 | df['marketplace'] = 'cloudnine' 64 | return df 65 | 66 | 67 | def load_pandora(): 68 | logger.info('load_pandora') 69 | FNAME = 'data/pandora.tsv' 70 | df = pd.read_csv(FNAME, sep='\t') 71 | df['price_usd'] = df['price_usd'].map(pp.price_usd_parse) 72 | df['price_btc'] = df['price_usd'].map(lambda x: None) # TODO: another way 73 | df['listing'] = df['item'] 74 | df = df.drop('item', axis=1) 75 | df['marketplace'] = 'pandora' 76 | return df 77 | 78 | 79 | def load_hydra(): 80 | logger.info('load_hydra') 81 | FNAME = 'data/hydra.tsv' 82 | df = pd.read_csv(FNAME, sep='\t') 83 | df['price_usd'] = df['price_usd'].map(pp.price_usd_parse) 84 | df['price_btc'] = df['price_usd'].map(lambda x: np.NaN) # TODO: another way 85 | df['vendor'] = df['vendor'].map(lambda x: ' '.join(x.split()[:-2])) 86 | df['cat_tuple'] = df['cat'].map(lambda x: tuple(ast.literal_eval(x))) 87 | df = df.drop('cat', axis=1) 88 | df['cat_tuple'] = df['cat_tuple'].astype(str) 89 | df['listing'] = df['listing'].astype(str) 90 | df['marketplace'] = 'hydra' 91 | return df 92 | 93 | 94 | def load_bitstamp(): 95 | logger.info('load_bitstamp...') 96 | f = '/home/aahu/Dropbox/black-market-recommender-systems/data/bitstampUSD.csv' 97 | btp = pd.read_csv(f, header=None, names=['trade_date', 'trade_price', 'trade_vol']) 98 | btp['trade_date'] = pd.to_datetime(btp['trade_date'], unit='s') 99 | btp = btp[['trade_date', 'trade_price']] 100 | btp = btp.set_index('trade_date') 101 | return btp.resample('D', how='mean') 102 | 103 | 104 | def merge_bitstamp(df, btp): 105 | logger.info('len df {}'.format(len(df))) 106 | assert len(df[df['price_btc'].notnull() & df['price_usd'].notnull()]) == 0 107 | dfm = pd.merge(df, btp, 
                   left_on='scrape_date', right_index=True)
    # Split rows by which currency they carry; only the BTC rows need the
    # exchange-rate conversion.
    df1 = df[df['price_btc'].notnull()].copy()
    df2 = df[df['price_usd'].notnull()]
    if len(df1) > 0:
        # NOTE(review): this relies on dfm sharing df1's row index so that
        # assignment aligns by label; rows absent from dfm become NaN —
        # confirm that is the intended behavior.
        df1['price_usd'] = dfm[['trade_price', 'price_btc']].apply(lambda x: x[0] * x[1], axis=1)
        dfout = pd.concat((df1, df2))
    else:
        dfout = df2
    logger.info('len dfout {}'.format(len(dfout)))
    return dfout


def build_table():
    """Build the consolidated 'bmrs' SQLite table from the per-market loaders.

    The full pipeline below is currently commented out; only the Evolution
    load is active while the schema is being debugged.
    """
    dbname = 'sqlite+pysqlite:////home/aahu/Dropbox/black-market-recommender-systems/data/bmrs.db'
    conn = sqlalchemy.create_engine(dbname, module=sqlite3.dbapi2)

    df = load_evolution()
    print(df)
    # NOTE(review): the disabled pipeline calls merge_bitstamp(df, btp) but
    # discards its return value — it should probably be df = merge_bitstamp(...).
    #
    # btp = load_bitstamp()
    #
    # # do cloudnine first it has the perfect schema
    # df = load_cloudnine()
    # logger.info(df.columns)
    # merge_bitstamp(df, btp)
    # df.to_sql('bmrs', conn, index=False, if_exists='replace')
    #
    # df = load_pandora()
    # logger.info(df.columns)
    # merge_bitstamp(df, btp)
    # df.to_sql('bmrs', conn, index=False, if_exists='append')
    #
    # df = load_agora()
    # logger.info(df.columns)
    # merge_bitstamp(df, btp)
    # df.to_sql('bmrs', conn, index=False, if_exists='append')
    #
    # df = load_hydra()
    # logger.info(df.columns)
    # merge_bitstamp(df, btp)
    # print(df.dtypes)
    # df.to_sql('bmrs', conn, index=False, if_exists='append')
    #
    # df = load_silkroad2()
    # logger.info(df.columns)
    # merge_bitstamp(df, btp)
    # df.to_sql('bmrs', conn, index=False, if_exists='append')
    # return


def dedup_table():
    """Keep only the newest scrape of each (listing, vendor, market, ...) group.

    Self-join technique: http://stackoverflow.com/a/7745635/424631
    """
    dbname = 'sqlite+pysqlite:////home/aahu/Dropbox/black-market-recommender-systems/data/bmrs.db'
    conn = sqlalchemy.create_engine(dbname, module=sqlite3.dbapi2)
    init_size = pd.read_sql('SELECT COUNT(*) FROM bmrs;', conn)
    logger.info('initial size: {}'.format(init_size))
    logger.info('batch scrapes together...')
    # Anti-join: keep each row d1 for which no later scrape (d2) of the same
    # listing/vendor/market/category/shipping tuple exists.
    q = """
    SELECT d1.*
    FROM bmrs d1
    LEFT OUTER JOIN bmrs d2
    ON (d1.listing = d2.listing AND d1.vendor = d2.vendor AND
        d1.marketplace = d2.marketplace AND d1.category = d2.category AND
        d1.cat_tuple = d2.cat_tuple AND d1.ships_from = d2.ships_from AND
        d1.ships_to = d2.ships_to AND d1.scrape_date < d2.scrape_date)
    WHERE d2.listing IS NULL AND d2.vendor IS NULL AND
          d2.marketplace IS NULL AND d2.category IS NULL AND
          d2.cat_tuple IS NULL AND d2.ships_from IS NULL AND d2.ships_to IS NULL;
    """
    df = pd.read_sql(q, conn)
    df = df.drop_duplicates()
    print(df)
    logger.info('shape now: {}'.format(df.shape))
    logger.info('overwriting old table...')
    df.to_sql('bmrs', conn, index=False, if_exists='replace')
    return


def main():
    # Only the table build runs for now; dedup is a separate, manual step.
    build_table()
    #dedup_table()
    pass

if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/bmrs/postprocessing/.ipynb_checkpoints/build_categories-checkpoint.ipynb:
--------------------------------------------------------------------------------
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import dateutil\n",
    "import os\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import ast\n",
    "\n",
    "import graph_tool as gt\n",
    "import graph_tool.draw\n",
    "import graph_tool.community\n",
    "import itertools\n",
    "import collections\n",
    "\n",
    "import logging\n",
    "FORMAT = '%(asctime)-15s %(levelname)-6s %(message)s'\n",
    "DATE_FORMAT = '%b %d %H:%M:%S'\n",
    "formatter = logging.Formatter(fmt=FORMAT, datefmt=DATE_FORMAT)\n",
    "handler = 
logging.StreamHandler()\n", 29 | "handler.setFormatter(formatter)\n", 30 | "logger = logging.getLogger(__name__)\n", 31 | "logger.addHandler(handler)\n", 32 | "logger.setLevel(logging.INFO)" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 2, 38 | "metadata": { 39 | "collapsed": false 40 | }, 41 | "outputs": [ 42 | { 43 | "name": "stderr", 44 | "output_type": "stream", 45 | "text": [ 46 | "May 12 16:05:27 INFO Index(['cat', 'listing', 'price', 'quantity_available', 'quantity_sold', 'scraped_date', 'ships_from', 'ships_to', 'vendor'], dtype='object')\n", 47 | "INFO:__main__:Index(['cat', 'listing', 'price', 'quantity_available', 'quantity_sold', 'scraped_date', 'ships_from', 'ships_to', 'vendor'], dtype='object')\n", 48 | "May 12 16:05:27 INFO (92407, 9)\n", 49 | "INFO:__main__:(92407, 9)\n" 50 | ] 51 | } 52 | ], 53 | "source": [ 54 | "DATA_DIR='/home/aahu/Dropbox/black-market-recommender-systems/data/cloudnine/'\n", 55 | "l=[]\n", 56 | "for fname in os.listdir(DATA_DIR):\n", 57 | " if fname.endswith('.tsv'):\n", 58 | " try:\n", 59 | " df0 = pd.read_csv(os.path.join(DATA_DIR,fname), sep='\\t', parse_dates=['scraped_date'])\n", 60 | " l.append(df0)\n", 61 | " except ValueError:\n", 62 | " #logger.exception('no data in {}'.format(fname))\n", 63 | " pass\n", 64 | "df = pd.concat(l)\n", 65 | "df['cat'] = df['cat'].map(ast.literal_eval)\n", 66 | "logger.info(df.columns)\n", 67 | "logger.info(df.shape)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 3, 73 | "metadata": { 74 | "collapsed": false 75 | }, 76 | "outputs": [ 77 | { 78 | "data": { 79 | "text/plain": [ 80 | "(Drugs, Prescription) 7312\n", 81 | "(Other, Books) 7090\n", 82 | "(Drugs, Opioids) 5045\n", 83 | "(Services, Money) 4780\n", 84 | "(Drugs, Benzos) 3046\n", 85 | "(Services, Other) 2221\n", 86 | "(Other, Software) 1335\n", 87 | "(Services, Sex) 1310\n", 88 | "(Tobacco, Cigarettes) 1157\n", 89 | "(Services, Hacking) 1015\n", 90 | "(Other, Electronics) 940\n", 91 
| "(Drugs, Paraphernalia) 814\n", 92 | "(Drugs, Steroids) 813\n", 93 | "(Other, Accounts) 768\n", 94 | "(Drugs, Other) 767\n", 95 | "(Counterfeits, Other) 730\n", 96 | "(Drugs, RCs) 665\n", 97 | "(Drugs, Dissociatives) 565\n", 98 | "(Tobacco, Paraphernalia) 564\n", 99 | "(Other, Shipping Materials) 383\n", 100 | "(Counterfeits, Watches) 278\n", 101 | "(Counterfeits, Accessories) 272\n", 102 | "(Drugs, Wholesale) 235\n", 103 | "(Drugs, Supplements) 207\n", 104 | "(Other, Precious Metals) 205\n", 105 | "(Services, Training) 166\n", 106 | "(Services, Logistics) 110\n", 107 | "(Drugs, Weight Loss) 95\n", 108 | "(Services, Coding) 83\n", 109 | "(Chemicals, Precursors) 53\n", 110 | "(Tobacco, Other) 51\n", 111 | "(Tobacco, Rolling) 40\n", 112 | "(Other, Alcohol) 27\n", 113 | "(Drugs, Barbituates) 6\n", 114 | "(Other, Food) 6\n", 115 | "(Counterfeits, Clothing) 5\n", 116 | "(Other, Movies) 1\n", 117 | "dtype: int64" 118 | ] 119 | }, 120 | "execution_count": 3, 121 | "metadata": {}, 122 | "output_type": "execute_result" 123 | } 124 | ], 125 | "source": [ 126 | "df['cat'].map(lambda x:x if len(x) == 2 else None).value_counts()" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 10, 132 | "metadata": { 133 | "collapsed": false 134 | }, 135 | "outputs": [ 136 | { 137 | "name": "stdout", 138 | "output_type": "stream", 139 | "text": [ 140 | "True\n" 141 | ] 142 | } 143 | ], 144 | "source": [ 145 | "def build_cat_tree(df):\n", 146 | " #build category tree\n", 147 | " cats = set(itertools.chain.from_iterable(df['cat']))\n", 148 | "\n", 149 | " #build graph-tool ids\n", 150 | " node_lbs = {}\n", 151 | " rev_node_lbs = {}\n", 152 | " for idx,vendor in enumerate(cats):\n", 153 | " node_lbs[vendor] = idx\n", 154 | " rev_node_lbs[idx] = vendor\n", 155 | " \n", 156 | " edge_list = []\n", 157 | " for cat_branch in df['cat']:\n", 158 | " for i in range(len(cat_branch)-1):\n", 159 | " v0 = cat_branch[i]\n", 160 | " v1 = cat_branch[i+1]\n", 161 | " e = 
node_lbs[v0], node_lbs[v1]\n", 162 | " edge_list.append(e)\n", 163 | " \n", 164 | " edge_list = set(edge_list)\n", 165 | " edge_list = [e for e in edge_list if e[0] != e[1]] # self-loops\n", 166 | " g = graph_tool.Graph(directed=True)\n", 167 | " g.add_edge_list(edge_list)\n", 168 | " \n", 169 | " g.vertex_properties['label'] = g.new_vertex_property('string')\n", 170 | " for v in g.vertices():\n", 171 | " g.vertex_properties['label'][v] = rev_node_lbs[g.vertex_index[v]]\n", 172 | " print('g vert/edges: ',g.num_vertices(), g.num_edges())\n", 173 | " \n", 174 | " pos = graph_tool.draw.arf_layout(g)\n", 175 | " graph_tool.draw.graph_draw(g,pos=pos,vertex_text=g.graph_properties['label'])\n", 176 | " print(graph_tool.topology.is_DAG(g))\n", 177 | "\n", 178 | "build_cat_tree(df)" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 55, 184 | "metadata": { 185 | "collapsed": true 186 | }, 187 | "outputs": [], 188 | "source": [] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": { 194 | "collapsed": true 195 | }, 196 | "outputs": [], 197 | "source": [] 198 | } 199 | ], 200 | "metadata": { 201 | "kernelspec": { 202 | "display_name": "Python 3", 203 | "language": "python", 204 | "name": "python3" 205 | }, 206 | "language_info": { 207 | "codemirror_mode": { 208 | "name": "ipython", 209 | "version": 3 210 | }, 211 | "file_extension": ".py", 212 | "mimetype": "text/x-python", 213 | "name": "python", 214 | "nbconvert_exporter": "python", 215 | "pygments_lexer": "ipython3", 216 | "version": "3.4.0" 217 | } 218 | }, 219 | "nbformat": 4, 220 | "nbformat_minor": 0 221 | } 222 | -------------------------------------------------------------------------------- /bmrs/postprocessing/viz.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | 
"source": [ 11 | "import graph_tool.all as gt\n", 12 | "import pandas as pd\n", 13 | "import glob\n", 14 | "import itertools\n", 15 | "import collections\n", 16 | "import matplotlib\n", 17 | "import math" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 2, 23 | "metadata": { 24 | "collapsed": false 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "DATA_DIR = '/home/aahu/Dropbox/black-market-recommender-systems/data/'\n", 29 | "l = []\n", 30 | "for fname in glob.glob(DATA_DIR+'*.tsv'):\n", 31 | " df = pd.read_csv(fname,sep='\\t')\n", 32 | " l.append(df)\n", 33 | "df_raw = pd.concat(l)" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 3, 39 | "metadata": { 40 | "collapsed": false 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "cols = ['category','vendor']\n", 45 | "#drop dups and add count\n", 46 | "df = df_raw[cols].copy(deep=True)\n", 47 | "df['vendor'] = df['vendor'].map(lambda x: str(x).split('/')[-1].replace('#',''))\n", 48 | "dfc = df.groupby(cols).size()\n", 49 | "dfc = dfc.reset_index()\n", 50 | "dfc['count'] = dfc[0]\n", 51 | "cols.append('count')\n", 52 | "df = dfc[cols].copy(deep=True)" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 33, 58 | "metadata": { 59 | "collapsed": false 60 | }, 61 | "outputs": [ 62 | { 63 | "name": "stdout", 64 | "output_type": "stream", 65 | "text": [ 66 | "g vert/edges: 100 3224\n", 67 | "done!\n" 68 | ] 69 | } 70 | ], 71 | "source": [ 72 | "def build_cat_cat_net(df_in, n_nodes=100):\n", 73 | " \n", 74 | " df = df_in.copy(deep=True) # python mutable arguments...\n", 75 | " \n", 76 | " #filter to most common cats\n", 77 | " cats = collections.Counter(df['category']).most_common(n_nodes)\n", 78 | " cats = [c[0] for c in cats]\n", 79 | " df = df[df['category'].map(lambda x: x in cats)]\n", 80 | " \n", 81 | " #build graph-tool ids\n", 82 | " node_lbs = {}\n", 83 | " rev_node_lbs = {}\n", 84 | " for idx,cat in enumerate(cats):\n", 85 | " node_lbs[cat] = 
idx\n", 86 | " rev_node_lbs[idx] = cat\n", 87 | " df['id'] = df['category'].map(lambda x:node_lbs[x])\n", 88 | " \n", 89 | " edge_list = []\n", 90 | " dfg = df.groupby('vendor')\n", 91 | " for name,group in dfg:\n", 92 | " ei = itertools.combinations(group['id'].drop_duplicates(),2)\n", 93 | " for e in ei:\n", 94 | " edge_list.append(tuple(sorted(e)))\n", 95 | " \n", 96 | " #filter edges by num shared vendors\n", 97 | " MIN_SHARED_VENDORS=3\n", 98 | " c = collections.Counter(edge_list)\n", 99 | " edge_list = [e for e in c if c[e]>=MIN_SHARED_VENDORS]\n", 100 | "\n", 101 | " g = gt.Graph(directed=False)\n", 102 | " g.add_edge_list(edge_list)\n", 103 | " \n", 104 | " g.vertex_properties['label'] = g.new_vertex_property('string')\n", 105 | " for v in g.vertices():\n", 106 | " g.vertex_properties['label'][v] = rev_node_lbs[g.vertex_index[v]]\n", 107 | " print('g vert/edges: ',g.num_vertices(), g.num_edges())\n", 108 | " \n", 109 | " #add edge weight property\n", 110 | " g.edge_properties['weight'] = g.new_edge_property('double')\n", 111 | " g.edge_properties['color'] = g.new_edge_property('vector')\n", 112 | " for e in g.edges():\n", 113 | " w = c[tuple(sorted([e.source(),e.target()]))]\n", 114 | " g.edge_properties['weight'][e] = w\n", 115 | " alpha = (float(w)/max(c.values())) + .025\n", 116 | " g.edge_properties['color'][e] = [103/255.0,134/255.0,239/255.0,alpha] \n", 117 | " \n", 118 | " state = gt.minimize_nested_blockmodel_dl(g,deg_corr=False,\n", 119 | " eweight=g.ep['weight'])\n", 120 | " t = gt.get_hierarchy_tree(state)[0]\n", 121 | " tpos = pos = gt.radial_tree_layout(t, t.vertex(t.num_vertices() - 1), weighted=True)\n", 122 | " cts = gt.get_hierarchy_control_points(g, t, tpos,beta=.86)\n", 123 | " pos = g.own_property(tpos)\n", 124 | " b = state.levels[0].b\n", 125 | "\n", 126 | " #text rotation\n", 127 | " text_rot = g.new_vertex_property('double')\n", 128 | " g.vertex_properties['text_rot'] = text_rot\n", 129 | " text_pos = 
g.new_vertex_property('double')\n", 130 | " g.vertex_properties['text_pos'] = text_pos\n", 131 | " for v in g.vertices():\n", 132 | " if pos[v][0] > 0:\n", 133 | " text_rot[v] = math.atan(pos[v][1]/pos[v][0])\n", 134 | " else:\n", 135 | " text_rot[v] = math.atan(pos[v][1]/pos[v][0])\n", 136 | " text_pos[v] = 10 #len(g.vp['label'][v].strip())\n", 137 | " \n", 138 | " gt.graph_draw(g, pos=pos, vertex_fill_color=b,\n", 139 | " edge_control_points=cts,\n", 140 | " vertex_size=20,\n", 141 | " vertex_text=g.vertex_properties['label'],\n", 142 | " vertex_text_rotation=g.vertex_properties['text_rot'],\n", 143 | " vertex_text_position=g.vp['text_pos'],\n", 144 | " vertex_font_size=20,\n", 145 | " vertex_font_family='mono',\n", 146 | " vertex_anchor=0,\n", 147 | " vertex_color=b,\n", 148 | " vcmap=matplotlib.cm.Set1,\n", 149 | " edge_color=g.edge_properties['color'],\n", 150 | " bg_color=[0,0,0,1],\n", 151 | " output_size=[2*1024,2*1024],\n", 152 | " output='/home/aahu/Desktop/all_min_edgew={0}.png'.format(MIN_SHARED_VENDORS)) \n", 153 | " print('done!')\n", 154 | "# gt.draw_hierarchy(state,\n", 155 | "# vertex_text=g.vertex_properties['label'],\n", 156 | "# vertex_text_rotation=g.vp['text_rot'],\n", 157 | "# vertex_text_position=1,\n", 158 | "# vertex_font_size=20,\n", 159 | "# vertex_font_family='mono',\n", 160 | "# vertex_anchor=0,\n", 161 | "# vcmap=matplotlib.cm.Spectral,\n", 162 | "# ecmap=matplotlib.cm.Spectral,\n", 163 | "# bg_color=[0,0,0,1],\n", 164 | "# output_size=[1024*2,1024*2],\n", 165 | "# output='/home/aahu/Desktop/labeled_all_nvends={0}.png'.format(MIN_SHARED_VENDORS))\n", 166 | "\n", 167 | " \n", 168 | " return\n", 169 | "\n", 170 | "build_cat_cat_net(df)" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": { 177 | "collapsed": false 178 | }, 179 | "outputs": [], 180 | "source": [] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "metadata": { 186 | "collapsed": true 187 | }, 
188 | "outputs": [], 189 | "source": [] 190 | } 191 | ], 192 | "metadata": { 193 | "kernelspec": { 194 | "display_name": "Python 3", 195 | "language": "python", 196 | "name": "python3" 197 | }, 198 | "language_info": { 199 | "codemirror_mode": { 200 | "name": "ipython", 201 | "version": 3 202 | }, 203 | "file_extension": ".py", 204 | "mimetype": "text/x-python", 205 | "name": "python", 206 | "nbconvert_exporter": "python", 207 | "pygments_lexer": "ipython3", 208 | "version": "3.4.0" 209 | } 210 | }, 211 | "nbformat": 4, 212 | "nbformat_minor": 0 213 | } 214 | -------------------------------------------------------------------------------- /bmrs/postprocessing/build_categories.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd\n", 12 | "import dateutil\n", 13 | "import os\n", 14 | "import matplotlib.pyplot as plt\n", 15 | "import seaborn as sns\n", 16 | "import ast\n", 17 | "\n", 18 | "import graph_tool as gt\n", 19 | "import graph_tool.draw\n", 20 | "import graph_tool.community\n", 21 | "import itertools\n", 22 | "import collections\n", 23 | "\n", 24 | "import logging\n", 25 | "FORMAT = '%(asctime)-15s %(levelname)-6s %(message)s'\n", 26 | "DATE_FORMAT = '%b %d %H:%M:%S'\n", 27 | "formatter = logging.Formatter(fmt=FORMAT, datefmt=DATE_FORMAT)\n", 28 | "handler = logging.StreamHandler()\n", 29 | "handler.setFormatter(formatter)\n", 30 | "logger = logging.getLogger(__name__)\n", 31 | "logger.addHandler(handler)\n", 32 | "logger.setLevel(logging.INFO)" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 2, 38 | "metadata": { 39 | "collapsed": false 40 | }, 41 | "outputs": [ 42 | { 43 | "name": "stderr", 44 | "output_type": "stream", 45 | "text": [ 46 | "May 12 16:05:27 INFO Index(['cat', 'listing', 'price', 'quantity_available', 
'quantity_sold', 'scraped_date', 'ships_from', 'ships_to', 'vendor'], dtype='object')\n", 47 | "INFO:__main__:Index(['cat', 'listing', 'price', 'quantity_available', 'quantity_sold', 'scraped_date', 'ships_from', 'ships_to', 'vendor'], dtype='object')\n", 48 | "May 12 16:05:27 INFO (92407, 9)\n", 49 | "INFO:__main__:(92407, 9)\n" 50 | ] 51 | } 52 | ], 53 | "source": [ 54 | "DATA_DIR='/home/aahu/Dropbox/black-market-recommender-systems/data/cloudnine/'\n", 55 | "l=[]\n", 56 | "for fname in os.listdir(DATA_DIR):\n", 57 | " if fname.endswith('.tsv'):\n", 58 | " try:\n", 59 | " df0 = pd.read_csv(os.path.join(DATA_DIR,fname), sep='\\t', parse_dates=['scraped_date'])\n", 60 | " l.append(df0)\n", 61 | " except ValueError:\n", 62 | " #logger.exception('no data in {}'.format(fname))\n", 63 | " pass\n", 64 | "df = pd.concat(l)\n", 65 | "df['cat'] = df['cat'].map(ast.literal_eval)\n", 66 | "logger.info(df.columns)\n", 67 | "logger.info(df.shape)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 3, 73 | "metadata": { 74 | "collapsed": false 75 | }, 76 | "outputs": [ 77 | { 78 | "data": { 79 | "text/plain": [ 80 | "(Drugs, Prescription) 7312\n", 81 | "(Other, Books) 7090\n", 82 | "(Drugs, Opioids) 5045\n", 83 | "(Services, Money) 4780\n", 84 | "(Drugs, Benzos) 3046\n", 85 | "(Services, Other) 2221\n", 86 | "(Other, Software) 1335\n", 87 | "(Services, Sex) 1310\n", 88 | "(Tobacco, Cigarettes) 1157\n", 89 | "(Services, Hacking) 1015\n", 90 | "(Other, Electronics) 940\n", 91 | "(Drugs, Paraphernalia) 814\n", 92 | "(Drugs, Steroids) 813\n", 93 | "(Other, Accounts) 768\n", 94 | "(Drugs, Other) 767\n", 95 | "(Counterfeits, Other) 730\n", 96 | "(Drugs, RCs) 665\n", 97 | "(Drugs, Dissociatives) 565\n", 98 | "(Tobacco, Paraphernalia) 564\n", 99 | "(Other, Shipping Materials) 383\n", 100 | "(Counterfeits, Watches) 278\n", 101 | "(Counterfeits, Accessories) 272\n", 102 | "(Drugs, Wholesale) 235\n", 103 | "(Drugs, Supplements) 207\n", 104 | "(Other, Precious Metals) 
205\n", 105 | "(Services, Training) 166\n", 106 | "(Services, Logistics) 110\n", 107 | "(Drugs, Weight Loss) 95\n", 108 | "(Services, Coding) 83\n", 109 | "(Chemicals, Precursors) 53\n", 110 | "(Tobacco, Other) 51\n", 111 | "(Tobacco, Rolling) 40\n", 112 | "(Other, Alcohol) 27\n", 113 | "(Drugs, Barbituates) 6\n", 114 | "(Other, Food) 6\n", 115 | "(Counterfeits, Clothing) 5\n", 116 | "(Other, Movies) 1\n", 117 | "dtype: int64" 118 | ] 119 | }, 120 | "execution_count": 3, 121 | "metadata": {}, 122 | "output_type": "execute_result" 123 | } 124 | ], 125 | "source": [ 126 | "df['cat'].map(lambda x:x if len(x) == 2 else None).value_counts()" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 25, 132 | "metadata": { 133 | "collapsed": false 134 | }, 135 | "outputs": [ 136 | { 137 | "name": "stdout", 138 | "output_type": "stream", 139 | "text": [ 140 | "g vert/edges: 79 79\n", 141 | "True\n" 142 | ] 143 | } 144 | ], 145 | "source": [ 146 | "def build_cat_tree(df):\n", 147 | " #build category tree\n", 148 | " cats = set(itertools.chain.from_iterable(df['cat']))\n", 149 | "\n", 150 | " #build graph-tool ids\n", 151 | " node_lbs = {}\n", 152 | " rev_node_lbs = {}\n", 153 | " for idx,vendor in enumerate(cats):\n", 154 | " node_lbs[vendor] = idx\n", 155 | " rev_node_lbs[idx] = vendor\n", 156 | " \n", 157 | " edge_list = []\n", 158 | " for cat_branch in df['cat']:\n", 159 | " for i in range(len(cat_branch)-1):\n", 160 | " v0 = cat_branch[i]\n", 161 | " v1 = cat_branch[i+1]\n", 162 | " e = node_lbs[v0], node_lbs[v1]\n", 163 | " edge_list.append(e)\n", 164 | " \n", 165 | " edge_list = set(edge_list)\n", 166 | " edge_list = [e for e in edge_list if e[0] != e[1]] # self-loops\n", 167 | " g = graph_tool.Graph(directed=True)\n", 168 | " g.add_edge_list(edge_list)\n", 169 | " \n", 170 | " g.vertex_properties['label'] = g.new_vertex_property('string')\n", 171 | " for v in g.vertices():\n", 172 | " g.vertex_properties['label'][v] = 
rev_node_lbs[g.vertex_index[v]]\n", 173 | " print('g vert/edges: ',g.num_vertices(), g.num_edges())\n", 174 | " \n", 175 | " pos = graph_tool.draw.arf_layout(g)\n", 176 | " graph_tool.draw.graph_draw(g,pos=pos,\n", 177 | " vertex_text=g.vertex_properties['label'],\n", 178 | " vertex_text_position=.1,\n", 179 | " output_size=(1024,1024),\n", 180 | " output='/home/aahu/Desktop/drug_dag.pdf')\n", 181 | " print(graph_tool.topology.is_DAG(g))\n", 182 | "\n", 183 | "build_cat_tree(df)" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 14, 189 | "metadata": { 190 | "collapsed": true 191 | }, 192 | "outputs": [], 193 | "source": [] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": { 199 | "collapsed": true 200 | }, 201 | "outputs": [], 202 | "source": [] 203 | } 204 | ], 205 | "metadata": { 206 | "kernelspec": { 207 | "display_name": "Python 3", 208 | "language": "python", 209 | "name": "python3" 210 | }, 211 | "language_info": { 212 | "codemirror_mode": { 213 | "name": "ipython", 214 | "version": 3 215 | }, 216 | "file_extension": ".py", 217 | "mimetype": "text/x-python", 218 | "name": "python", 219 | "nbconvert_exporter": "python", 220 | "pygments_lexer": "ipython3", 221 | "version": "3.4.0" 222 | } 223 | }, 224 | "nbformat": 4, 225 | "nbformat_minor": 0 226 | } 227 | -------------------------------------------------------------------------------- /bmrs/postprocessing/.ipynb_checkpoints/postp_agora-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd\n", 12 | "import dateutil\n", 13 | "import os\n", 14 | "import matplotlib.pyplot as plt\n", 15 | "import seaborn as sns\n", 16 | "\n", 17 | "import graph_tool as gt\n", 18 | "import graph_tool.draw\n", 19 | "import 
graph_tool.community\n", 20 | "import itertools\n", 21 | "import collections\n", 22 | "\n", 23 | "import logging\n", 24 | "FORMAT = '%(asctime)-15s %(levelname)-6s %(message)s'\n", 25 | "DATE_FORMAT = '%b %d %H:%M:%S'\n", 26 | "formatter = logging.Formatter(fmt=FORMAT, datefmt=DATE_FORMAT)\n", 27 | "handler = logging.StreamHandler()\n", 28 | "handler.setFormatter(formatter)\n", 29 | "logger = logging.getLogger(__name__)\n", 30 | "logger.addHandler(handler)\n", 31 | "logger.setLevel(logging.INFO)" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "metadata": { 38 | "collapsed": false 39 | }, 40 | "outputs": [ 41 | { 42 | "name": "stderr", 43 | "output_type": "stream", 44 | "text": [ 45 | "May 16 21:08:14 INFO Index(['category', 'listing', 'price_btc', 'scrape_date', 'ships_from', 'ships_to', 'vendor'], dtype='object')\n", 46 | "INFO:__main__:Index(['category', 'listing', 'price_btc', 'scrape_date', 'ships_from', 'ships_to', 'vendor'], dtype='object')\n", 47 | "May 16 21:08:14 INFO (1773538, 7)\n", 48 | "INFO:__main__:(1773538, 7)\n" 49 | ] 50 | } 51 | ], 52 | "source": [ 53 | "DATA_DIR='/home/aahu/Dropbox/black-market-recommender-systems/data/agora/'\n", 54 | "l=[]\n", 55 | "for fname in os.listdir(DATA_DIR):\n", 56 | " if fname.endswith('.tsv'):\n", 57 | " df0 = pd.read_csv(os.path.join(DATA_DIR,fname), sep='\\t', parse_dates=['scrape_date'])\n", 58 | " l.append(df0)\n", 59 | "df = pd.concat(l)\n", 60 | "logger.info(df.columns)\n", 61 | "logger.info(df.shape)" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 8, 67 | "metadata": { 68 | "collapsed": false 69 | }, 70 | "outputs": [ 71 | { 72 | "name": "stderr", 73 | "output_type": "stream", 74 | "text": [ 75 | "May 16 21:11:50 INFO (1621645, 7)\n", 76 | "INFO:__main__:(1621645, 7)\n" 77 | ] 78 | }, 79 | { 80 | "data": { 81 | "text/plain": [ 82 | "Weed 209911\n", 83 | "Prescription 102483\n", 84 | "Benzos 97789\n", 85 | "Cocaine 94635\n", 86 | "MDMA 91953\n", 87 | 
"Pills 90374\n", 88 | "Steroids 77417\n", 89 | "RCs 56010\n", 90 | "Watches 53163\n", 91 | "LSD 47886\n", 92 | "Hash 46623\n", 93 | "Concentrates 42281\n", 94 | "Speed 41430\n", 95 | "Meth 38145\n", 96 | "eBooks 31954\n", 97 | "Synthetics 28986\n", 98 | "Heroin 28845\n", 99 | "Guides 23747\n", 100 | "Edibles 23189\n", 101 | "Smoked 23185\n", 102 | "NB 23146\n", 103 | "Pirated 23105\n", 104 | "Money 22361\n", 105 | "2C 19233\n", 106 | "Accounts 16643\n", 107 | "Seeds 14835\n", 108 | "Physical documents 13723\n", 109 | "Mushrooms 13638\n", 110 | "Oxycodone 12981\n", 111 | "DMT 11871\n", 112 | "Scans/Photos 11834\n", 113 | "Fentanyl 9706\n", 114 | "Opioids 9040\n", 115 | "Software 8853\n", 116 | "Jewelry 8816\n", 117 | "Hacking 8197\n", 118 | "MDA 7993\n", 119 | "Weight loss 7433\n", 120 | "MXE 7240\n", 121 | "Clothing 7082\n", 122 | "Pipes 6648\n", 123 | "Accessories 6388\n", 124 | "Ketamine 6276\n", 125 | "5-MeO 5954\n", 126 | "Stashes 5206\n", 127 | "Making money 5077\n", 128 | "Containers 5047\n", 129 | "Melee 4192\n", 130 | "Mescaline 3975\n", 131 | "Paraphernalia 3890\n", 132 | "dtype: int64" 133 | ] 134 | }, 135 | "execution_count": 8, 136 | "metadata": {}, 137 | "output_type": "execute_result" 138 | } 139 | ], 140 | "source": [ 141 | "#discard meta-categories\"\n", 142 | "meta_cats = ['Other','Drugs','Guides & Tutorials','Fraud Related',\n", 143 | " 'Services','Digital Goods','Electronics', 'Custom Listings', 'Pills']\n", 144 | "df = df[df['category'].map(lambda x:x not in meta_cats)]\n", 145 | "logger.info(df.shape)\n", 146 | "\n", 147 | "df['category'].value_counts().head(50)\n" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 3, 153 | "metadata": { 154 | "collapsed": false 155 | }, 156 | "outputs": [], 157 | "source": [ 158 | "#takes too long\n", 159 | "def build_category_category_graph(df, min_shared_vendors):\n", 160 | " node_lbs = {}\n", 161 | " rev_node_lbs = {}\n", 162 | " for idx,vendor in 
enumerate(df['category'].drop_duplicates()):\n", 163 | " node_lbs[vendor] = idx\n", 164 | " rev_node_lbs[idx] = vendor\n", 165 | " df['id'] = df['category'].map(lambda x:node_lbs[x])\n", 166 | " \n", 167 | " edge_list = []\n", 168 | " dfg = df.groupby('vendor')\n", 169 | " for name,group in dfg:\n", 170 | " ei = itertools.combinations(group['id'].drop_duplicates(),2)\n", 171 | " for e in ei:\n", 172 | " edge_list.append(tuple(sorted(e)))\n", 173 | "\n", 174 | " #filter edges by num shared vendor\n", 175 | " c = collections.Counter(edge_list)\n", 176 | " edge_list = [e for e in c if c[e]>=min_shared_vendors]\n", 177 | "\n", 178 | " #build graph\n", 179 | " g = gt.Graph(directed=False)\n", 180 | " g.add_edge_list(edge_list)\n", 181 | " g.vertex_properties['label'] = g.new_vertex_property('string')\n", 182 | " for v in g.vertices():\n", 183 | " g.vertex_properties['label'][v] = rev_node_lbs[g.vertex_index[v]]\n", 184 | " print('g vert/edges: ',g.num_vertices(), g.num_edges())\n", 185 | "\n", 186 | " #add edge weight property\n", 187 | " g.edge_properties['weight'] = g.new_edge_property('double')\n", 188 | " g.edge_properties['color'] = g.new_edge_property('vector')\n", 189 | " for e in g.edges():\n", 190 | " w = c[tuple(sorted([e.source(),e.target()]))]\n", 191 | " g.edge_properties['weight'][e] = w\n", 192 | " alpha = (float(w)/max(c.values())) + .07\n", 193 | " g.edge_properties['color'][e] = [103/255.0,134/255.0,239/255.0,alpha] \n", 194 | " return g\n", 195 | "\n", 196 | "\n", 197 | "\n", 198 | "def block_model_plot(df,min_shared_vendors=3):\n", 199 | " \"\"\"\n", 200 | " Graph-tool plot\n", 201 | " \"\"\"\n", 202 | " g = build_category_category_graph(df,min_shared_vendors)\n", 203 | " \n", 204 | " logger.info(g)\n", 205 | " logger.info('begin stochastic block model')\n", 206 | " state = gt.community.minimize_nested_blockmodel_dl(g,deg_corr=True,\n", 207 | " eweight=g.ep['weight'])\n", 208 | " bstack = state.get_bstack()\n", 209 | " t = 
gt.community.get_hierarchy_tree(bstack)[0]\n", 210 | " tpos = pos = gt.draw.radial_tree_layout(t, t.vertex(t.num_vertices() - 1), weighted=True)\n", 211 | " cts = gt.draw.get_hierarchy_control_points(g, t, tpos,beta=.86)\n", 212 | " pos = g.own_property(tpos)\n", 213 | " b = bstack[0].vp[\"b\"]\n", 214 | "\n", 215 | " #text rotation\n", 216 | " text_rot = g.new_vertex_property('double')\n", 217 | " g.vertex_properties['text_rot'] = text_rot\n", 218 | " text_pos = g.new_vertex_property('double')\n", 219 | " g.vertex_properties['text_pos'] = text_pos\n", 220 | " for v in g.vertices():\n", 221 | " if pos[v][0] > 0:\n", 222 | " text_rot[v] = math.atan(pos[v][1]/pos[v][0])\n", 223 | " else:\n", 224 | " text_rot[v] = math.atan(pos[v][1]/pos[v][0])\n", 225 | " text_pos[v] = 10#len(g.vp['label'][v].strip())\n", 226 | "\n", 227 | " logger.info('saving to disk...')\n", 228 | " gt.draw.graph_draw(g, pos=pos, vertex_fill_color=b,\n", 229 | " edge_control_points=cts,\n", 230 | " vertex_size=20,\n", 231 | " vertex_text=g.vertex_properties['label'],\n", 232 | " vertex_text_rotation=g.vertex_properties['text_rot'],\n", 233 | " vertex_text_position=g.vp['text_pos'],\n", 234 | " vertex_font_size=20,\n", 235 | " vertex_font_family='mono',\n", 236 | " vertex_anchor=0,\n", 237 | " vertex_color=b,\n", 238 | " vcmap=matplotlib.cm.Spectral,\n", 239 | " edge_color=g.edge_properties['color'],\n", 240 | " bg_color=[0,0,0,1],\n", 241 | " output_size=[1024*2,1024*2],\n", 242 | " output='/home/aahu/Desktop/ago_nvends={0}.png'.format(MIN_SHARED_VENDORS))\n", 243 | "\n", 244 | " return" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": null, 250 | "metadata": { 251 | "collapsed": false 252 | }, 253 | "outputs": [ 254 | { 255 | "name": "stderr", 256 | "output_type": "stream", 257 | "text": [ 258 | "May 16 15:25:15 INFO \n", 259 | "INFO:__main__:\n", 260 | "May 16 15:25:15 INFO begin stochastic block model\n", 261 | "INFO:__main__:begin stochastic block model\n" 262 | ] 
263 | } 264 | ], 265 | "source": [ 266 | "block_model_plot(df[['vendor','category']].drop_duplicates(), min_shared_vendors=10)" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "metadata": { 273 | "collapsed": false 274 | }, 275 | "outputs": [], 276 | "source": [] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "metadata": { 282 | "collapsed": true 283 | }, 284 | "outputs": [], 285 | "source": [] 286 | } 287 | ], 288 | "metadata": { 289 | "kernelspec": { 290 | "display_name": "Python 3", 291 | "language": "python", 292 | "name": "python3" 293 | }, 294 | "language_info": { 295 | "codemirror_mode": { 296 | "name": "ipython", 297 | "version": 3 298 | }, 299 | "file_extension": ".py", 300 | "mimetype": "text/x-python", 301 | "name": "python", 302 | "nbconvert_exporter": "python", 303 | "pygments_lexer": "ipython3", 304 | "version": "3.4.0" 305 | } 306 | }, 307 | "nbformat": 4, 308 | "nbformat_minor": 0 309 | } 310 | -------------------------------------------------------------------------------- /bmrs/postprocessing/postprocess_all.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 11, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd\n", 12 | "import dateutil\n", 13 | "import os\n", 14 | "import ast\n", 15 | "\n", 16 | "import logging\n", 17 | "FORMAT = '%(asctime)-15s %(levelname)-6s %(message)s'\n", 18 | "DATE_FORMAT = '%b %d %H:%M:%S'\n", 19 | "formatter = logging.Formatter(fmt=FORMAT, datefmt=DATE_FORMAT)\n", 20 | "handler = logging.StreamHandler()\n", 21 | "handler.setFormatter(formatter)\n", 22 | "logger = logging.getLogger(__name__)\n", 23 | "logger.addHandler(handler)\n", 24 | "logger.setLevel(logging.INFO)" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 12, 30 | "metadata": { 31 | "collapsed": 
true 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "META_CATS = ['Other','Drugs','Services', 'Custom Listings', 'DRUGS & MORE',\n", 36 | " 'other service','other drugs','others', 'digital', 'drug']\n", 37 | "META_CATS = [s.lower() for s in META_CATS]" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 13, 43 | "metadata": { 44 | "collapsed": true 45 | }, 46 | "outputs": [], 47 | "source": [ 48 | "def load_agora():\n", 49 | " DATA_DIR='/home/aahu/Dropbox/black-market-recommender-systems/data/agora/'\n", 50 | " l=[]\n", 51 | " for fname in os.listdir(DATA_DIR):\n", 52 | " if fname.endswith('.tsv'):\n", 53 | " df0 = pd.read_csv(os.path.join(DATA_DIR,fname), sep='\\t', parse_dates=['scrape_date'])\n", 54 | " l.append(df0)\n", 55 | " df = pd.concat(l)\n", 56 | " logger.info(df.columns)\n", 57 | " logger.info(df.shape)\n", 58 | " return df" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 14, 64 | "metadata": { 65 | "collapsed": false 66 | }, 67 | "outputs": [], 68 | "source": [ 69 | "def load_pandora():\n", 70 | " DATA_DIR='/home/aahu/Dropbox/black-market-recommender-systems/data/pandora/'\n", 71 | " l=[]\n", 72 | " for fname in os.listdir(DATA_DIR):\n", 73 | " if fname.endswith('.tsv'):\n", 74 | " try:\n", 75 | " df0 = pd.read_csv(os.path.join(DATA_DIR,fname), sep='\\t', parse_dates=['scrape_date'])\n", 76 | " l.append(df0)\n", 77 | " except ValueError:\n", 78 | " #logger.exception('no data')\n", 79 | " pass\n", 80 | " df = pd.concat(l)\n", 81 | " logger.info(df.columns)\n", 82 | " logger.info(df.shape)\n", 83 | " return df" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 15, 89 | "metadata": { 90 | "collapsed": false 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "def load_cloudnine():\n", 95 | " DATA_DIR='/home/aahu/Dropbox/black-market-recommender-systems/data/cloudnine/'\n", 96 | " l=[]\n", 97 | " for fname in os.listdir(DATA_DIR):\n", 98 | " if fname.endswith('.tsv'):\n", 99 | " try:\n", 
100 | " df0 = pd.read_csv(os.path.join(DATA_DIR,fname), sep='\\t', parse_dates=['scrape_date'])\n", 101 | " l.append(df0)\n", 102 | " except ValueError:\n", 103 | " logger.exception('no data')\n", 104 | " pass\n", 105 | " df = pd.concat(l)\n", 106 | " logger.info(df.columns)\n", 107 | " logger.info(df.shape)\n", 108 | " \n", 109 | " #be consistent\n", 110 | " df.rename(columns={'scraped_date':'scrape_date'}, inplace=True)\n", 111 | " df['cat'] = df['cat'].map(lambda x: ast.literal_eval(x))\n", 112 | " df['category'] = df['cat'].map(lambda x: x[-1])\n", 113 | " \n", 114 | " return df" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 16, 120 | "metadata": { 121 | "collapsed": false 122 | }, 123 | "outputs": [], 124 | "source": [ 125 | "def load_hydra():\n", 126 | " DATA_DIR='/home/aahu/Dropbox/black-market-recommender-systems/data/hydra/'\n", 127 | " l=[]\n", 128 | " for fname in os.listdir(DATA_DIR):\n", 129 | " if fname.endswith('.tsv'):\n", 130 | " try:\n", 131 | " df0 = pd.read_csv(os.path.join(DATA_DIR,fname), sep='\\t', parse_dates=['scrape_date'])\n", 132 | " l.append(df0)\n", 133 | " except ValueError:\n", 134 | " logger.exception('no data')\n", 135 | " pass\n", 136 | " df = pd.concat(l)\n", 137 | " logger.info(df.columns)\n", 138 | " logger.info(df.shape)\n", 139 | " \n", 140 | " #be consistent\n", 141 | " df.rename(columns={'scraped_date':'scrape_date'}, inplace=True)\n", 142 | " df['cat'] = df['category'].map(lambda x: ast.literal_eval(x))\n", 143 | " df['category'] = df['cat'].map(lambda x: x[-1])\n", 144 | " \n", 145 | " return df" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 17, 151 | "metadata": { 152 | "collapsed": true 153 | }, 154 | "outputs": [], 155 | "source": [ 156 | "def load_evolution():\n", 157 | " DATA_DIR='/home/aahu/Dropbox/black-market-recommender-systems/data/evolution/'\n", 158 | " l=[]\n", 159 | " for fname in os.listdir(DATA_DIR):\n", 160 | " if fname.endswith('.tsv'):\n", 
161 | " try:\n", 162 | " df0 = pd.read_csv(os.path.join(DATA_DIR,fname), sep='\\t', parse_dates=['scrape_date'])\n", 163 | " l.append(df0)\n", 164 | " except ValueError:\n", 165 | " logger.exception('no data')\n", 166 | " pass\n", 167 | " df = pd.concat(l)\n", 168 | " logger.info(df.columns)\n", 169 | " logger.info(df.shape)\n", 170 | " \n", 171 | " #be consistent\n", 172 | " #df.rename(columns={'scraped_date':'scrape_date'}, inplace=True)\n", 173 | " #df['cat'] = df['category'].map(lambda x: ast.literal_eval(x))\n", 174 | " #df['category'] = df['cat'].map(lambda x: x[-1])\n", 175 | " \n", 176 | " return df" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 18, 182 | "metadata": { 183 | "collapsed": false 184 | }, 185 | "outputs": [], 186 | "source": [ 187 | "def postprocess(df):\n", 188 | " \"\"\"\n", 189 | " standardized postprocessing\n", 190 | " \"\"\"\n", 191 | " #normalize\n", 192 | " df['category'] = df['category'].map(lambda x:x.lower())\n", 193 | " \n", 194 | " #discard meta-categories\n", 195 | " df = df[df['category'].map(lambda x:x not in META_CATS)]\n", 196 | " logger.info(df.shape)\n", 197 | " \n", 198 | " #discard non-string categories\n", 199 | " def isfloat(value):\n", 200 | " try:\n", 201 | " float(value)\n", 202 | " return True\n", 203 | " except ValueError:\n", 204 | " return False\n", 205 | " df = df[df['category'].map(lambda x:not isfloat(x))]\n", 206 | " \n", 207 | " return df" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 19, 213 | "metadata": { 214 | "collapsed": false 215 | }, 216 | "outputs": [ 217 | { 218 | "name": "stderr", 219 | "output_type": "stream", 220 | "text": [ 221 | "May 20 21:32:05 INFO Index(['category', 'listing', 'price_btc', 'scrape_date', 'ships_from', 'ships_to', 'vendor'], dtype='object')\n", 222 | "May 20 21:32:05 INFO Index(['category', 'listing', 'price_btc', 'scrape_date', 'ships_from', 'ships_to', 'vendor'], dtype='object')\n", 223 | 
"INFO:__main__:Index(['category', 'listing', 'price_btc', 'scrape_date', 'ships_from', 'ships_to', 'vendor'], dtype='object')\n", 224 | "May 20 21:32:05 INFO (663912, 7)\n", 225 | "May 20 21:32:05 INFO (663912, 7)\n", 226 | "INFO:__main__:(663912, 7)\n" 227 | ] 228 | }, 229 | { 230 | "data": { 231 | "text/plain": [ 232 | "drugs 541071\n", 233 | "digital 20048\n", 234 | "books 17697\n", 235 | "apparel 15615\n", 236 | "drug 15433\n", 237 | "money 12064\n", 238 | "custom 9945\n", 239 | "services 7971\n", 240 | "forgeries 7086\n", 241 | "erotica 4177\n", 242 | "jewelry 2830\n", 243 | "electronics 2096\n", 244 | "packaging 1636\n", 245 | "computer 1273\n", 246 | "writing 1192\n", 247 | "lotteries 956\n", 248 | "hardware 763\n", 249 | "lab 692\n", 250 | "medical 538\n", 251 | "art 531\n", 252 | "herbs 133\n", 253 | "biotic 83\n", 254 | "collectibles 82\n", 255 | "dtype: int64" 256 | ] 257 | }, 258 | "execution_count": 19, 259 | "metadata": {}, 260 | "output_type": "execute_result" 261 | } 262 | ], 263 | "source": [ 264 | "def load_silkroad2():\n", 265 | " DATA_DIR='/home/aahu/Dropbox/black-market-recommender-systems/data/silkroad2/'\n", 266 | " l=[]\n", 267 | " for fname in os.listdir(DATA_DIR):\n", 268 | " if fname.endswith('.tsv'):\n", 269 | " try:\n", 270 | " df0 = pd.read_csv(os.path.join(DATA_DIR,fname), sep='\\t', parse_dates=['scrape_date'])\n", 271 | " l.append(df0)\n", 272 | " except ValueError:\n", 273 | " logger.exception('no data')\n", 274 | " pass\n", 275 | " df = pd.concat(l)\n", 276 | " logger.info(df.columns)\n", 277 | " logger.info(df.shape)\n", 278 | " \n", 279 | " #be consistent\n", 280 | " df['cat'] = df['category'].map(lambda x: x.split('-'))\n", 281 | " df['category'] = df['cat'].map(lambda x: x[-1])\n", 282 | " \n", 283 | " return df\n", 284 | "\n", 285 | "sr = load_silkroad2()\n", 286 | "sr['cat'].map(lambda x:x[0]).value_counts()" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 20, 292 | "metadata": { 293 | 
"collapsed": false 294 | }, 295 | "outputs": [ 296 | { 297 | "name": "stderr", 298 | "output_type": "stream", 299 | "text": [ 300 | "May 20 21:32:10 INFO Index(['category', 'listing', 'price_btc', 'scrape_date', 'ships_from', 'ships_to', 'vendor'], dtype='object')\n", 301 | "May 20 21:32:10 INFO Index(['category', 'listing', 'price_btc', 'scrape_date', 'ships_from', 'ships_to', 'vendor'], dtype='object')\n", 302 | "INFO:__main__:Index(['category', 'listing', 'price_btc', 'scrape_date', 'ships_from', 'ships_to', 'vendor'], dtype='object')\n", 303 | "May 20 21:32:10 INFO (663912, 7)\n", 304 | "May 20 21:32:10 INFO (663912, 7)\n", 305 | "INFO:__main__:(663912, 7)\n", 306 | "May 20 21:32:12 INFO (616980, 8)\n", 307 | "May 20 21:32:12 INFO (616980, 8)\n", 308 | "INFO:__main__:(616980, 8)\n" 309 | ] 310 | } 311 | ], 312 | "source": [ 313 | "# cn = load_cloudnine()\n", 314 | "# cn = postprocess(cn)\n", 315 | "# cn.to_csv('/home/aahu/Dropbox/black-market-recommender-systems/data/cloudnine.tsv',sep='\\t',index=False)\n", 316 | "\n", 317 | "# ag = load_agora()\n", 318 | "# ag = postprocess(ag)\n", 319 | "# ag.to_csv('/home/aahu/Dropbox/black-market-recommender-systems/data/agora.tsv',sep='\\t',index=False)\n", 320 | "\n", 321 | "#pa = load_pandora()\n", 322 | "#pa = postprocess(pa)\n", 323 | "#pa.to_csv('/home/aahu/Dropbox/black-market-recommender-systems/data/pandora.tsv',sep='\\t',index=False)\n", 324 | "\n", 325 | "#hy = load_hydra()\n", 326 | "#hy = postprocess(hy)\n", 327 | "#hy.to_csv('/home/aahu/Dropbox/black-market-recommender-systems/data/hydra.tsv',sep='\\t',index=False)\n", 328 | "\n", 329 | "# ev = load_evolution()\n", 330 | "# ev = postprocess(ev)\n", 331 | "# ev.to_csv('/home/aahu/Dropbox/black-market-recommender-systems/data/evolution.tsv',sep='\\t',index=False)\n", 332 | "\n", 333 | "sr2 = load_silkroad2()\n", 334 | "sr2 = postprocess(sr2)\n", 335 | "sr2.to_csv('/home/aahu/Dropbox/black-market-recommender-systems/data/silkroad2.tsv',sep='\\t',index=False)\n" 
336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": 21, 341 | "metadata": { 342 | "collapsed": false 343 | }, 344 | "outputs": [ 345 | { 346 | "ename": "NameError", 347 | "evalue": "name 'ev' is not defined", 348 | "output_type": "error", 349 | "traceback": [ 350 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 351 | "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", 352 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mev\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", 353 | "\u001b[1;31mNameError\u001b[0m: name 'ev' is not defined" 354 | ] 355 | } 356 | ], 357 | "source": [ 358 | "ev" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": null, 364 | "metadata": { 365 | "collapsed": false 366 | }, 367 | "outputs": [], 368 | "source": [ 369 | "ev['vendor'].value_counts()" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": null, 375 | "metadata": { 376 | "collapsed": true 377 | }, 378 | "outputs": [], 379 | "source": [] 380 | } 381 | ], 382 | "metadata": { 383 | "kernelspec": { 384 | "display_name": "Python 3", 385 | "language": "python", 386 | "name": "python3" 387 | }, 388 | "language_info": { 389 | "codemirror_mode": { 390 | "name": "ipython", 391 | "version": 3 392 | }, 393 | "file_extension": ".py", 394 | "mimetype": "text/x-python", 395 | "name": "python", 396 | "nbconvert_exporter": "python", 397 | "pygments_lexer": "ipython3", 398 | "version": "3.4.0" 399 | } 400 | }, 401 | "nbformat": 4, 402 | "nbformat_minor": 0 403 | } 404 | -------------------------------------------------------------------------------- /bmrs/postprocessing/recommender_viz.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 37, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | 
"outputs": [ 10 | { 11 | "name": "stdout", 12 | "output_type": "stream", 13 | "text": [ 14 | "Populating the interactive namespace from numpy and matplotlib\n" 15 | ] 16 | } 17 | ], 18 | "source": [ 19 | "import graph_tool.all as gt\n", 20 | "import pandas as pd\n", 21 | "import glob\n", 22 | "import itertools\n", 23 | "import collections\n", 24 | "import matplotlib.pyplot as plt\n", 25 | "import seaborn as sns\n", 26 | "import math\n", 27 | "\n", 28 | "%pylab inline" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 38, 34 | "metadata": { 35 | "collapsed": false 36 | }, 37 | "outputs": [ 38 | { 39 | "name": "stdout", 40 | "output_type": "stream", 41 | "text": [ 42 | "version: 2.2.44dev (commit 85f11ae8, Thu Jun 18 15:33:16 2015 +0200)\n", 43 | "gcc version: 4.8.4\n", 44 | "compilation flags: -I/usr/include/python3.4m -I/usr/include -I/usr/local/lib/python3.4/dist-packages/numpy/core/include -DSPARSEHASH_PREFIX=sparsehash -Wall -Wextra -ftemplate-backtrace-limit=0 -DNDEBUG -std=gnu++11 -ftemplate-depth-250 -Wno-deprecated -Wno-unknown-pragmas -O3 -fvisibility=default -fvisibility-inlines-hidden -fopenmp -DSPARSEHASH_PREFIX=sparsehash -L/usr/lib -lpython3.4m\n", 45 | "install prefix: /usr/local\n", 46 | "python dir: /usr/lib/python3/dist-packages\n", 47 | "graph filtering: True\n", 48 | "openmp: True\n", 49 | "uname: Linux barahv2 3.13.0-45-generic #74-Ubuntu SMP Tue Jan 13 19:36:28 UTC 2015 x86_64\n" 50 | ] 51 | } 52 | ], 53 | "source": [ 54 | "gt.show_config()" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 39, 60 | "metadata": { 61 | "collapsed": false 62 | }, 63 | "outputs": [], 64 | "source": [ 65 | "df = pd.read_csv('/home/aahu/Dropbox/black-market-recommender-systems/data/ev_item_sim.tsv',sep='\\t')\n", 66 | "#df = pd.read_csv('/home/aahu/Dropbox/black-market-recommender-systems/data/df_all_item_sim.tsv',sep='\\t')" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 40, 72 | "metadata": { 73 
| "collapsed": false 74 | }, 75 | "outputs": [ 76 | { 77 | "data": { 78 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAERCAYAAACAbee5AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3XmcHFW99/FP9cxkksyasCQCkQDBY1iMJGwCyqIgAn07\nLAqIXNkkL73ee8V24YEHfFT0ctXmiopXRLg8Lk9EwtI2CAiKsqkBZYvKgQQSMRCyTyaTyWxdzx9V\nbTrD9DZd3T1d/X2/XnnVpKu66lfpmW/OnKpzynFdFxERCZdIrQsQEZHgKdxFREJI4S4iEkIKdxGR\nEFK4i4iEkMJdRCSEmvOtNMa0ALcAewOtwDXW2lTW+suAi4F1/kuLrLUvVqhWEREpUt5wB84D1llr\nzzfGTAOeAVJZ6+cD51trn65UgSIiUrpC4X47sMT/OgIMj1q/ALjCGDMTuNdae23A9YmIyDjk7XO3\n1vZZa7caYzrwgv7KUZssBhYBJwDHGGNOrUyZIiJSioIXVI0xs4BfAz+01v501OrrrbUbrbVDwL3A\nIRWoUURESlToguoM4JfAJ6y1D49a1wU8Z4w5ANiG13q/ucDxtuNdmBURkeI5Jb8h38RhxpjrgQ8C\nNuvlm4A2a+1NxphzgcuAAeAha+0XCxzPHU+RdUTnV990fvUrzOc2LnnDvQLC/gHo/Opb2efnOE4E\n6Chi017XddPlHGscwvz5hfncxqXQ3TIiUpqOo87+6qLWtukDuTYY6NvY+sRtV9wI9FSxLmkwCneR\ngLW2TR9o657ZX+s6pLFp+gERkRBSuIuIhJDCXUQkhBTuIiIhpHAXEQkhhbuISAgp3EVEQkjhLiIS\nQgp3EZEQUriLiISQwl1EJIQU7iIiIaRwFxEJIYW7iEgIKdxFREJI4S4iEkIKdxGREFK4i4iEkMJd\nRCSEFO4iIiGkcBcRCSGFu4hICCncRURCSOEuIhJCCncRkRBSuIuIhJDCXUQkhBTuIiIhpHAXEQkh\nhbuISAgp3EVEQkjhLiISQgp3EZEQUriLiISQwl1EJISa8600xrQAtwB7A63ANdbaVNb6KHAVMAzc\nYq39QQVrFRGRIhVquZ8HrLPWvgc4GfhOZoUf/NcBJwLHApcaY3avVKEiIlK8QuF+O3B11rbDWevm\nAsuttT3W2iHgMeA9wZcoIiKlytstY63tAzDGdOAF/ZVZqzuBnqy/9wJdQRcoIiKlyxvuAMaYWcCd\nwA3W2p9mreoBOrL+3gFsKuKYbkkV1h+dX30r6/w2b97MQ0tX0dbemXObvq37cNd1m79ZznHKEObP\nL8zn5pT6hkIXVGcAvwQ+Ya19eNTqF4D9jTHTgD68LpmvV6LIOuKi86tnZZ9fd3d31/EXfe+Ctu6Z\n/bm26du8ZspZJ8271XXdnlzbVEiYP78wn9u4FGq5X4HX1XK1MSbT934T0GatvckY82ngAbz++Jut\nta9XrlQRESlWoT73fwf+Pc/6e4B7gi5KRETKo0FMIiIhpHAXEQkhhbuISAgp3EVEQkjhLiISQgp3\nkcqYgjfZnkhNFByhKiLFO/qc/5w7uX36ucACYA1wDeEeOSkTlFruIgGJxpNfmbaH+V1T86Qj8AJ9\nL2B+jcuSBqVwFwlANJ7cB/i866ZXDW7f+gPgy3gBf0ptK5NGpXAXCcbngaaBvs1fGtq+dRnwBvAU\nXuv94JpWJg1J4S5Spmg8uRdwIfDS7+/4wl1Zq+7zl2q9S9Up3EXK9xlgEvAfWze8ms56fTXwLLAv\nYGpRmDQuhbtIGaLx5AzgUuBvwI/H2OQX/vL4qhUlgsJdpFz/hndP+7WpRGxojPUr8R5ss081ixJR\nuIuMUzSebAYuwnsC2f/k2XQV0I33aEqRqlC4i4zf
ycBM4CepRGx7nu1W+cu9K1+SiEcjVKUojuO0\n3fbz3/L1H9x3Ub7tVr/w6FOv2ceeq1ZdNZb5t7ilwHbZ4f585coR2UHhLsVymlo7mTnnyLzfM+tW\nPTupWgXVUjSe3A2IAs+kErGnC2yulrtUnbplRMbnI3iNo3x97RlbgM0o3KWKFO4iJYrGkw5wMTAI\n/KTIt63Ee9h8V4XKEtmJwl2kdIcCBwLJVCK2ocj3/M1fqvUuVaFwFyndIn9Z6EJqtpX+UuEuVaFw\nFylBNJ6cDpwHrAB+WcJb1XKXqlK4i5TmImAycEMqEUsX2jhLL7ARhbtUicJdpEjReLIJ+ASwDbh1\nHLtYBXRGmlp0UVUqTuEuUrxT8OaI+XEqEds0jvevAmhqaZ0VaFUiY1C4ixTvk/7yhnG+fxVApKlZ\n4S4Vp3AXKUI0nnw7cBLwSCoRG+/0CmsAHKdpt8AKE8lB4S5SnK/4y/8qYx+bgBEnEtklgHpE8lK4\nixQQjSffDZwBPA4ky9iVC6x3nMiugRQmkofCXSSPaDwZARL+X+OpRMwtc5frHcdp2//IszvK3I9I\nXgp3kfw+BBwG3JZKxP4QwP7WAczY97DZAexLJCeFu0gO0XhyKvAfeBOE/a+AdrsOoLWtW4/dk4rS\nfO4iuf0XMBv4+j3XLVzlXFfUjI6dbnok3/p1AM0tk2eXXZ1IHgp3kTFE48kPAZcCzwBXAx1Hnf3V\nRa1t0wfyvW/L2pe7hgf7B4D+HJusB4g0tajlLhWlcBcZJRpP7gvcBPQB56QSse3OdbS2tk0faOue\nmSu0AejvXT+5wO7XATiRptmBFCuSQ1Hhbow5ArjWWnv8qNcvw3towTr/pUXW2heDLVGkeqLx5CRg\nMdAJfDSViNmADzHouulex4mo5S4VVTDcjTGfw3uk2NYxVs8HzrfWFnqGpEhNOI4TAYq57bDXdV2A\nrwOHAz9KJWI/rERNrutucJzIrGg82ZxKxIYrcQyRYlruy/EGcPxojHULgCuMMTOBe6211wZZnEgA\nCvaVD/RtbH3ititufOzZ1QD/BvwZ+HilCnLT6fWR5qbZwFuBlyt1HGlsBcPdWnunMWZ2jtWL8SZR\n6gXuMsacaq29N8D6RMpWTF/5uz54zZxv3fY0eP3sH0wlYn2Vqsd1RzZAC8C+KNylQsq9z/16a+1G\na+0QcC9wSAA1iVRbZNqec2/uHxgBuCSViP21kgdz0yPr/S/3reRxpLE5fj9jXn7LfbG19l1Zr3UB\nzwEH4D284GfAzdba+/Psqtyh21IjW7du5YEnltPW0Z13u7d0DDDvIFOlqgrr6enhoaWraGvvzLnN\n0mV/48kXNvHueTNYFJubcz9LX9hEe0f+W93XvrGaSKSFXXfbPec2L7+6lvv+8DpnHj+HC047sLgT\nkUbnlPqGUm6FdAGMMecC7dbam4wxlwMPAwPAQwWCfdxF1hGXkJ5fR0dH+5IHnu69Ycmzi/Jt9/yv\nbvzTymd+8VS16iqku7u76/iLvndBnm6ZTtdNfxkX9pjeOvn0T//0U2NttGXty11TOncf6J45J+9D\nOta/umxac8tk8m3X37t+xpSOXb90x8PLl1xw2oEfLOF0yhXa70/CfW7jUlS4W2tXAkf5Xy/Oen0x\nXr+7SL0603Eikwf6e+7edZdpC3P9J1DE/etFS48Mb3Fdt99xnP2C2qfIaJpbRhrZ/sCRI8NDq/u3\nrAtiUrDiuemVwH7ReFKtTakIhbs0sjMAtvW8cRdVvh7keuHeCUyv5nGlcSjcpVHt6/95dmDb5r9X\n++Dp9MjKrDpEAqdwl0Z1or98sBYHTw8PveJ/qX53qQiFuzSiXfHGZKwCXqpFASND21/1v3xrLY4v\n4adwl0b0Xrzb5h6qVQED/VtW+1/OqlUNEm4Kd2k0U4GjgU3AH2tVxOY1L2X6+RXuUhEKd2k0RwOt\nwK+BvI9MqqQX
HvvRRmA7CnepEIW7NJp3AcPAY7UsYnigD+BVYK9a1iHhpXCXRrKn/2cZ3nxItfZ3\nYPdoPNla60IkfBTu0kgO95fVHY2aW+aOGbXeJXAKd2kUDl649+PNZjoRZMJd/e4SOIW7NIo5eEP9\nn8brc58I1HKXilG4S6M4wl/+vqZV7Ey3Q0rFKNylAThNeM/73Qy8WONisqlbRipG4S6h1zK5bS7e\n4KUnmVhPA1O4S8Uo3CX0mppb3+F/OWGeEOXbjPdAbvW5S+AU7hJq3TP3b4o0NR+EF6Sral1PtlQi\n5uK13tVyl8Ap3CXU3nnyp450HGcq8CwTq0sm4+/ALtF4cmqtC5FwUbhLqE1u3+UU/8tnalpIbrod\nUipC4S6hFY0nnabmSae6rjvAxLpLJpvCXSpC4S5hdpATicxOp4f/wsQZuDSa7piRilC4S5jFANJD\ng8/XupA8NJBJKkLhLmG20HXd4aHBbX+tdSF5qOUuFaFwl1CKxpN7AQvc9Mhjbnqkv9b15KE+d6kI\nhbuE1SkAw4P9v6h1IfmkErEtwBbUcpeAKdwlrE4F2PzG8l/WupAiaCCTBE7hLqHjP9nofYBdeucX\nV9a4nGL8HeiOxpPttS5EwkPhLmH0HryJwiZ0l0wW9btL4BTuEkaZUan31rSK4umOGQmcwl3C6BRg\nK/BYrQspUuZed7XcJTAKdwmVaDw5B3gb8FAqERuodT1jSadHHKDTcZwux3G6+jav2QgwNNC3X+Y1\n/49+PmXcmmtdgEjAMl0yE7a/faBvU+thp191Yfu0PXsAmppbZwBEmlqOP/6i763zttnY+sRtV9wI\n9NSwVKljCncJm0y431fTKgpondI10NY9MzO4ag1AU/OkjqzXRMqiX/skNKLxZBtwHPBcKhH7e4HN\nJ5LtQD8wrdaFSHgo3CVMjgNameCt9hw2o3CXACncJUxO9pf1GO6bgDZgUq0LkXAoKtyNMUcYYx4e\n4/WoMWapMeYJY8wlwZcnUpKTgV7giVoXMg6b/GV3TauQ0CgY7saYzwE34f26m/16C3AdcCJwLHCp\nMWb3ShQpUoh/C+QcvFsgh2pdzzhkwl1dMxKIYlruy4EzAGfU63OB5dbaHmvtEN6AkfcEXJ9IsTJd\nMvfXtIrxU7hLoAqGu7X2TsZ+RFknO9+D2wt0BVSXSKkU7iJZyrmg2gN0ZP29gx3foCJVE40nJwPH\nA39JJWJ/q3U947TZXyrcJRCO67oFNzLGzAYWW2vflfVaC/Bn4AigD+8iVtRa+3qeXRU+mExIW7du\n5YEnltPWkf9631s6Bph3kKlSVZ5nXlzLVTf+joXH7sfF/3TQTut6enp4aOkq2to7c75/7RuriURa\n2HW33JeMitmmnH1tHxzm5p//mdlv6eTUo/ehb+sW3nf43nR16ZdhAd7cLV5QKSNUXQBjzLlAu7X2\nJmPMp4EH8H4DuLlAsI+7yDriEtLz6+joaF/ywNO9Nyx5dlG+7Z7/1Y1/WvnML56qRk3+3CsdJ39y\n8TXNk6Z88ifJxxZeEjv4N6M26zzughvOaJ++Z86Rn+tfXTatuWUyV37yzGtznV9mm+6Zc/L+dlrM\ndnm2+fbK17esuWHJs1/p27xmylknzbvVdd0gpx8I7fcn4T63cSkq3K21K4Gj/K8XZ71+D3BPRSoT\nKazjqLO/uqipufUs13UHcSJzjr/oe7OzN9iy9uWu4cH+AbwRoBPdJtQtIwHR3DJS16Z0zpjiRCIz\ngefbumb0jl7f37t+cg3KGq9NwAz0cykB0AhVqWvNk6a83f/yLzUtJBgayCSBUbhLXWtqbsmE+7Ka\nFhIM3Q4pgVG4S93qnrl/kxNpNsAGYG2t6wmAwl0Co3CXujXvpH9d4DjOFLxbcsNA4S6BUbhL3Zrc\nset7/S8V7iKjKNylbjW1tL7Xdd00YGtdS0A0SlUCo3CXuhSNJ3dxnMgC102/Qn
3cw16MrXjzOCnc\npWwKd6lX73Mcx0kPD71Q60ICtgndCikBULhLvXo/wMjQ9jCGeyc4TbUuROqbwl3qTjSedID3u667\nYXhoez09CLsYmwAn0tyiGcOkLAp3KSgaT3731MvuWn3Ho6sBPgu8u8YlHQjskR4ZepjwzTS6ASAS\naZ5e60KkvincJa9oPDkd+BgQGRpOA+wHfBBoqWFZ7wcYHtj2qxrWUCleuDc16aKqlEXhLoWcDjSP\nDPV/7ZzjZ4E3xXMrcHAlD+o4TsRxnK6x/qRHhk8BWPnsfU+56ZFKllELGwGcSJNa7lIWzT4nhZwN\n0LP2lTth7peApXiPtDsc+FMFj9tx1NlfXdTaNn1gp1cdp8WJNL3bTY+81rnr3qfU0XS+xdoA4DgR\ntdylLAp3ySkaT+4GnAAs/d3PrlzFxacArAZex2u5T6GCwdraNn2grXvm6P3vA7TgNC1rmdI5MNb7\n6pzXcnciarlLWdQtI/mcCTQBt416fSlew+CdVa8IDvCXYZlyYLQhoFfhLuVSuEs+Z/vL20e9/qS/\nPLyKtWQcCAwCy2tw7GrZgONMm9Kxqx4bJ+OmcJcxRePJtwDHAo+nErFXR61eB6wE3g50VLGsbmAP\n4CW8YfphtcFxnKa3H/PP+Z/GLZKHwl1yORPvgcM/y7F+Kd73z4KqVRT+LpmMjQAdu+09q9aFSP1S\nuEsu7/eXd+ZY/0d/+Y4q1JJxoL8MwyP18tkAMGlyh8Jdxk3hLrksAFanErFcw/s34z39aF+8Fn6l\nOXgt9414d+uE2QaApuZJb611IVK/FO7yJn5/+1vY0TrPZQXe7ZBvqXhRMBuYSvi7ZCBzO2RTi8Jd\nxk3hLmOZ7y8LDVJa4S/3q2AtGZkumUYId39+mYi6ZWTcFO4ylky4F2q5Z25HrFa4p4GwTfE7ln7X\ndbc7jsJdxk/hLmPJ3AFTqOW+BthG5cO9DW9k6suEa6qB3Nz0RpzILH96Y5GSKdxlLPOBNalE7LUC\n27l4gbs7lb3ffS7eBdVlFTzGhOK67kbHcdrRI/dknBTushN/PplZFO6SyahGv3sj9bcD4LrpTf6X\ne9e0EKlbCncZrdiLqRnVCvdeYPRI2dBy0yMb/S8V7jIuCncZLdPfXmzL/RW8C52VCvc9gS68gUth\ne+pSTun0iFruUhaFu4xWast9EK9FvTeVmUK64bpkQC13KZ/CXUZbgDcxWCkPnl6BF+yVGHQzD6/F\n3lDhnh4eUstdyqJwl3/wn5c6G/hTKhErpQsk0+8+J8h6nEjTVLzunhXA1iD3PdGl08O9rutuR+Eu\n46Rwl2yH+Mti+9szKjKYqaV16gF4t0A+H+R+64br/h3vP1uRkincJVup/e0Zm/GGzM/BCW7MTaR5\nUqa//dnAdlpHXHdkBbCL/xuVSEkU7pJtnr8cT5iuANp3e+u8QLoRdt9nQUsk0jwXWE/4Z4EcU3pk\nJPMb0dtqWojUJYW7ZDsYbzqBl8fx3uUAHbvMmldow2IceNwlRzmOMxl4Loj91aOR4YGX/C8V7lKy\nvLeuGWMiwHfxHsgwAFxirV2Rtf4y4GK8uysAFllrX6xQrVJB0XiyBW+Y/9OpRCw9jl0sB2iZ0hlI\nuLe2TzvZ/7Ihu2QAhgb6VrRO7QIwta5F6k+h+5IXApOstUcZY44AEv5rGfOB8621T1eqQKmatwEt\njP/i5WtAf3PL5HeWW0g0nnSamiZ9wHXdAcdxXir8jnDasnbFS+3T9gC13GUcCnXLHA3cD2Ct/QNw\n6Kj1C4ArjDGPGmMur0B9Uj0H+8vxhrsLvBxpan5rNJ4s98HOc51IZLabHv4rMFLmvurWcw99bw3e\nLaBquUvJCoV7J7Al6+8jfldNxmJgEXACcIwx5tSA65PqyTwLtZzbDjOt7KPLrOUsgJHhwYaZBXIs\nwwN9AC8C+0fjSV0fk5IU6pbZws5TuUastd
n9sddba7cAGGPuxbtP+t4C+wz7/CB1eX6HHTCDJ//y\nBj/+4sm/Gmt9b28vDzyxnH85a96Nufaxeu1W7n5kBQuP3S/XQ7ULcl2XPXdrZ+2mbZz3vgM+0t3d\n/ZFc2659Y1cikRZ23S33LwqlbAPkPL9i9hNkTX1b9+Gu6zZ/8/s/f4lHnlnNzVeeGNRvMHX5/Vmk\nMJ9byfcYFwr3x4EocLsx5kiy7lwwxnQBzxljDsC7w+IE4OZKFFlHXOr0/J78yxsrgdau9tYxn4fa\n0dHRvuSBp3tvWPLsojy7meS67jfv/u2Kpy7+p4OOHE8d//SZn88H/jgyPHjX95f84bdt3TNzPpxj\n/avLpjW3TKZ75pxNQWxz5SfPvDbX+RWznyBr6tu8ZspZJ8279bRP330Z8IWLv/LgSalE7MF8xy5C\n3X5/FiHM5zYuhX7VuwvYbox5HO9i6mXGmHONMR+z1vYAlwMPA48Ay6y191e2XKmEaDzZhTfMvdyR\noIPpkaEXgPnReHLqOPdxLsDgtp4lZdYSFpm7z3RRVUqSt+VurXWBj496+cWs9Yvx+t2lvh3kL8se\n5j882P9Mkzey9HDgN6W81+9XPgfo+fNvbnlw7ns++uFy6wkB6y91UVVKoos0AuXfKfMP27dueMr/\ncjwX148B9gLuWLP8d4Pl1hISGsgk46JwF9gR7mWPBl2z/A9L8W7fO3McD3fOtNT/X7l1hEUqEduC\nN/2CWu5SEoW7gBfuaeCv5e5osH/LIHAPsA9Q9IAmf4TsWcAaSuzOaQAvAntH48nJtS5E6ofCvcH5\nreuDgZdSiVjOO1NKdIe/PKOE93wI2AVYnErEGnbgUg4W706QQOfLl3BTuMteQDfBzpl+H9APnFnM\nxv5/MJ/DG4367QDrCAvdMSMlU7hLYBdTM1KJWB/etBVzo/Hk3CLecjLeCNmfpRKxV4KqI0R0x4yU\nTOEumQd0PBPwfjNdM8W03j/vL78WcA1hoZa7lEzhLof5y6fyblW6e4AhCoR7NJ48AjgWeCCViAX9\nH0xYvAIM403JLFIUhbscCryWSsReC3KnqUSsB3gQeGc0njwkz6aZVvt/Bnn8MEklYkPAMmCef1eR\nSEEK9wYWjSf3APYAnqzQIb7lL/8nGk9OGuP45wCnA0vR7Y+FPAlMBg4stKEIKNwb3QJ/GXSXDACp\nROwB4Ca8Z7N+IXtdNJ6cD9wC9AIXpBKxMM/oF4TMf8CH5d1KxKdwb2yZh69UJNx9cbw+48uj8eSR\nANF4cgZwN15L9MOpRKzswVMNQOEuJVG4N7ZMuP+xUgdIJWK9wAV4g3B+HY0nNwB/A2YBV6YSsXsq\ndeyQ+TOwnTc/DU1kTAr3BuUPHDoUWJVKxNYV2r4cqUTsEeAyYDXe9ALPA18Frq3kccPEv6j6DHCw\npiGQYhR6WIeE1yxgd3bcj15RqUTseuD6ahwrxJ4EjsSbs+f3Na5FJji13BtXNfrbJVjqd5eiKdwb\nl8K9/ijcpWgK98ZV8YupErgX8W4d1UVVKUjh3oCyLqauSCVieR/4LBNHKhFL4/1n/PZoPNlR63pk\nYlO4N6b9gGlUbmSqVM6TeLeVLii0oTQ23S3TmE7yl48GudP0yLCzfevGNsdxugps2uu6bjrIYzeQ\n7H7339SwDpngFO6N6QP+8r4gd9rfu37yrIPee+YBx16Y8/F6A30bW5+47YobgZ4gj91AlvrL9wBf\nr2UhMrGpW6bB+ANgTgBeqMSDMSZNbh9s657Zn+tPa9v0gaCP2UhSidgqvNGqJ0bjyfZa1yMTl8K9\n8bwbmErArXapqruBVuD9tS5EJi6Fe+M5xV/WJNzT6REH6HQcpyvfH6DTTes52Tnc5S9Pr2kVMqGp\nz73xfADYBjxSi4MP9G1qPez0qy5sn7Zn3j73LWtf7hoe7B/Ae9C27OxPwKvAadF4ssWfd0ZkJ2q5\nN5BoPL
kP3kOWf5VKxGrW9906pWsgX798W/fM/pYpneqbz8Gf+/5uoAs4rrbVyESlcG8sFblLRmri\nbn+5sKZVyISlcG8sCvfweATYBCyMxpP6OZY30TdFg4jGk13suAVyZY3LkTKlErFhIIX3DFzNNSNv\nonBvHJ/AuwXyh7UuRAJzp79cVNMqZEJSuDeAaDw5Fe9JSD3Ad2tcjgTnHuAvwAXReHJurYuRiUXh\n3hguAXYDvp1KxDTsPyRSidgIcCXez/E1NS5HJhiFe8hF48lJwGfx7m3XY+7CJ4n3yL0zovHkEbUu\nRiYOhXv4nQ/sBdyYSsTW17oYCZZ/z/vl/l+v9efqF1G4h1k0nnwr8EVgEPhGjcuRCkklYr/Fu731\nOODS2lYjE0Xe6QeMMRG8C3DvAAaAS6y1K7LWR4GrgGHgFmvtDypYq5QgGk/OAB4E9gQ+m0rEXqtx\nSVJZl+HN8f69aDxJKhG7sdYFSW0VarkvBCZZa4/C+9UvkVlhjGkBrgNOBI4FLjXG7F6pQqV40Xhy\nGvAA8Dbg2lQiplZ7yKUSMQscD6zDC/h/URdNYysU7kcD9wNYa//AzoMl5gLLrbU91toh4DG8BwhI\njUTjyY5oPBkHngfmAf8NXFHbqqRaUonYMryAXwt8B/h9NJ48QyNYG1OhWSE7gS1Zfx8xxkSstWl/\nXfZtdb14ExlVjN8SmV7JY5TjJ1/6AOddfd8uAezKyVpm/2nC+8xagA68z2A34CDgYLwRqN1AH/BV\n4Cr/gps0iFQi9udoPHkM8DW837zvAFZH48knzzv57fzk/hfOATYAG/G+Twb9PyNAGnD9Pxm5vp5Q\nAvzZy9js32patwqF+xa8EMnIBDt4wZ69rgNvrotKugH4eIWPMW7nXX0fQC3vSFmDdw3ku6lEbGPA\n+3YHt23mjRVP5vxVf3vfJiZN7Wzq27xmSq5thvq3tI4MD5Bvm2K3C3ybrVtyblftugf6NrbmO04+\nqUTsJeB0f2DTZ4DTgIU/uf8FgMXj3e9EVoGfvQfZ8azhuuS4bu7/jI0xZwBRa+2Fxpgjgaustaf6\n61rwHvd1BF4L4Al/29crX7aIiORTKNwddtwtA3AhsABot9beZIw5Dbgar+/+Zmvtf1e4XhERKULe\ncBcRkfqkq+giIiGkcBcRCSGFu4hICCncRURCqNB97oEwxpwOnGWtPW+MdR/Dm+xoGLjGWntvNWoK\ngjFmCvBjvIFEvcBHrbXrR21zPd5I3168QSALrbVbRu9rIgnznEJFnNtlwMV4w/gBFllrX6x6oWUw\nxhwBXGutPX7U63X7uWXLc35h+OxagFuAvYFWvExMZa0v+jOseLj74XYS8PQY62YC/4p3e+UU4DFj\nzIPW2sG3PBIMAAAFRUlEQVRK1xWQjwPPWmu/ZIw5G/jfwKdGbTMfOMlaG/Sgokr6x5xC/g9Swn8t\ne06hQ/HmiH/cGPNza+3amlVbmpzn5psPnG+tfdP3az0wxnwO+AiwddTr9f65AbnPz1fXn53vPGCd\ntfZ8Y8w04Bm8Z+WW/BlWo1vmcbwQHGtk4+HA49baIb81u5wd99TXg3/MveMv35e90m8l7g/cZIx5\nzBhzYZXrG68wzymU79zAa2hcYYx51Bhz+eg314HlwBm8+eet3j+3jFznB/X/2QHcjjd2CLx8Hs5a\nV9JnGFjL3RhzMW9utV5grf2ZMea4HG/roMrz04xXjvN7gx1z74xV+1TgW3j/2zYDDxtjnrLWPl/J\nWgMwoeYUCli+cwNveP4NeOd1lzHm1HrqKrTW3mmMmT3Gqnr/3IC85wd1/tkBWGv7AIwxHXhBf2XW\n6pI+w8DC3Vp7M3BziW8bPXdNNeanGZexzs8Ycwc76u8ANo962zbgW9ba7f72v8abrXGih/tEm1Mo\nSPnODeD6zDURY8y9wCFAXQVEDvX+uRUjFJ+dMWYWcCdwg7X2p1mrSvoM
q3JBNY+lwFeMMa3AZLxf\nO5bVtqSSPA6cAjwJfAB4ZNR6Ayw2xszHm9HxGODWahY4To8DUeB2f06h57LWvQDs7/cH9uH9Wvj1\n6pc4bjnPzRjTBTxnjDkA7z/mEyi9wTJR1fvnlldYPjtjzAzgl8AnrLUPj1pd0mdYrXDfaRpR/6r2\ncmttyhjzLeBRvP6lK+roYip486X/X2PMo3h3XnwY3nR+PwR+BwwBt1pr/1qzaot3F3CiMeZx/+8X\nGmPOZcecQp/GexhIZk6heposrtC5XQ48jPd5PmStvT/XjiY4FyBEn9toY51fGD67K/C6Wq42xmT6\n3m8C2kr9DDW3jIhICGkQk4hICCncRURCSOEuIhJCCncRkRBSuIuIhJDCXUQkhGo9iEkkMMaY7wCr\ngWMyD3IfY5suvPEGp1e1OJEqU8tdwsQFVucKdt804J1VqkekZjSISeqaMeYbeNMJvAEMAj8C/o+1\ndh9jzIeBzwIjwCt4U8XeDrwfuMdae6Yx5it4Q9WnA+uBM6y1bxhjXve3PQZvZr4PWWtXGmPeB3wD\nr2G0Cm9U8ja8YeDH4k0zcau19ptV+QcQyUEtd6lbxpgz8absPQCIAXNGbfJl4ERr7aF483IYvOcH\nvOYH+xzgbdbad1lrDd50spkHyszAG8I+H2/OoE8aYybhPZzln62178Cbl+ajwMcA11q7ADgCWGiM\nOaZiJy5SBPW5Sz07DlhirR0BNhlj7h61PgU84b9+h7X2uezpYq21y40xnzHGXIoX/O/CC/iMzNwk\ny/AmaToYr9vnOf/9VwIYY5YA84wxJ/jbtwEH4c23LVITarlLPXPZ+Xs4+8EGWGs/BZwJbAR+bIzZ\n6TGPxpgFeDPwgdcFcxdZD4HImsTO9V8fGvX+TmPMXn4Nn7XWHmKtPQTvgSC3lnVmImVSuEs9exA4\nxxgzyRjTCZyWWWGMiRhjLLDeWnst8EO8C6lD7PiN9VjgN9ba7wN/xXscZNMYx8kEvgV2M8bM9f/+\neWAR8GvgUmNMszGmHW+W08MDPE+RkqlbRuqWP6XyoXjdJuvw+tXB6/9OG2O+ADxkjNmG91CDj/rb\n/c0Y8yvgfOBOY8zTeBdT7wP2yewj61Cuv88BY8xHgB/6/e/L/X0M4j1O8Wm8n6mbrbWj5/YXqSrd\nLSMiEkLqlhERCSGFu4hICCncRURCSOEuIhJCCncRkRBSuIuIhJDCXUQkhBTuIiIh9P8BKnU+O7QE\napUAAAAASUVORK5CYII=\n", 79 | "text/plain": [ 80 | "" 81 | ] 82 | }, 83 | "metadata": {}, 84 | "output_type": "display_data" 85 | }, 86 | { 87 | "data": { 88 | "text/plain": [ 89 | "count 5.460000e+02\n", 90 | "mean 4.730751e-01\n", 91 | "std 2.469968e-01\n", 92 | "min -2.220446e-16\n", 93 | "25% 3.382893e-01\n", 94 | "50% 5.232503e-01\n", 95 | "75% 6.789978e-01\n", 96 | "max 8.250250e-01\n", 97 | "Name: distance, dtype: float64" 98 | ] 99 | }, 100 | "execution_count": 40, 101 | "metadata": {}, 102 | "output_type": "execute_result" 103 | } 104 | ], 105 | "source": [ 106 | "sns.distplot(df['distance'])\n", 107 | "plt.show()\n", 108 | "df['distance'].describe()" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | 
"execution_count": 41, 114 | "metadata": { 115 | "collapsed": false 116 | }, 117 | "outputs": [ 118 | { 119 | "name": "stdout", 120 | "output_type": "stream", 121 | "text": [ 122 | "(78,)\n" 123 | ] 124 | }, 125 | { 126 | "data": { 127 | "text/plain": [ 128 | "(468, 4)" 129 | ] 130 | }, 131 | "execution_count": 41, 132 | "metadata": {}, 133 | "output_type": "execute_result" 134 | } 135 | ], 136 | "source": [ 137 | "df2 = df[df['distance'] < df['distance'].quantile(1)]\n", 138 | "df2 = df\n", 139 | "df2 = df2[df2['category'] != df2['similar']]\n", 140 | "print(df2['category'].unique().shape)\n", 141 | "df2.shape" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 42, 147 | "metadata": { 148 | "collapsed": false 149 | }, 150 | "outputs": [ 151 | { 152 | "name": "stdout", 153 | "output_type": "stream", 154 | "text": [ 155 | "g vert/edges: 78 468\n", 156 | "[10, 13, 14, 10, 12, 21, 17, 9, 8, 10, 13, 11, 11, 14, 8, 9, 9, 15, 10, 13, 9, 14, 11, 10, 17, 15, 8, 13, 13, 11, 14, 9, 10, 11, 13, 14, 15, 16, 15, 16, 17, 9, 12, 12, 15, 10, 11, 12, 10, 10, 10, 12, 11, 12, 10, 12, 16, 11, 9, 11, 7, 17, 19, 9, 11, 14, 10, 12, 9, 11, 11, 9, 7, 16, 8, 11, 13, 19]\n", 157 | "level 0 : resizing 78 -> 6 , dS: -544.0455019514075\n", 158 | "level 1 : resizing 1 -> 1 , dS: 0.0 [kept, rejected (1, 81.0761) vs (1, 81.0761)]\n", 159 | "level 0 : skipping 6\n", 160 | "drawing again, with rotation\n" 161 | ] 162 | } 163 | ], 164 | "source": [ 165 | "def build_cat_cat_net(df_in):\n", 166 | " \n", 167 | " df = df_in.copy(deep=True) # python mutable arguments...\n", 168 | " \n", 169 | " #filter to most common cats\n", 170 | " cats = set(df['category']).union(set(df['similar']))\n", 171 | " \n", 172 | " #build graph-tool ids\n", 173 | " node_lbs = {}\n", 174 | " rev_node_lbs = {}\n", 175 | " for idx,cat in enumerate(cats):\n", 176 | " node_lbs[cat] = idx\n", 177 | " rev_node_lbs[idx] = cat\n", 178 | " \n", 179 | " edge_list = []\n", 180 | " edge_ws = {}\n", 181 | " 
MAX_DIST = df['distance'].max()\n", 182 | " for row in df.iterrows():\n", 183 | " edge = tuple(sorted((node_lbs[row[1]['category']], node_lbs[row[1]['similar']])))\n", 184 | " edge_list.append(edge)\n", 185 | " edge_ws[edge] = MAX_DIST - row[1]['distance']\n", 186 | "\n", 187 | " g = gt.Graph(directed=False)\n", 188 | " g.add_edge_list(edge_list)\n", 189 | " \n", 190 | " g.vertex_properties['label'] = g.new_vertex_property('string')\n", 191 | " for v in g.vertices():\n", 192 | " g.vertex_properties['label'][v] = rev_node_lbs[g.vertex_index[v]]\n", 193 | " print('g vert/edges: ',g.num_vertices(), g.num_edges())\n", 194 | " \n", 195 | " #add edge weight property\n", 196 | " g.edge_properties['weight'] = g.new_edge_property('double')\n", 197 | " g.edge_properties['color'] = g.new_edge_property('vector')\n", 198 | " for e in g.edges():\n", 199 | " w = edge_ws[tuple(sorted([e.source(),e.target()]))]\n", 200 | " g.edge_properties['weight'][e] = w\n", 201 | " alpha = w + .125\n", 202 | " g.edge_properties['color'][e] = [103/255.0,134/255.0,239/255.0,alpha] \n", 203 | " \n", 204 | " print([v.out_degree() for v in g.vertices()])\n", 205 | " \n", 206 | " state = gt.minimize_nested_blockmodel_dl(g, \n", 207 | " deg_corr=True,\n", 208 | " weight=g.ep['weight'],\n", 209 | " verbose=True)\n", 210 | " \n", 211 | " #this 'draw' is only to get the node coords sorted out\n", 212 | " pos,t,tpos = gt.draw_hierarchy(state,\n", 213 | " vertex_text=g.vertex_properties['label'],\n", 214 | " vertex_text_position=1,\n", 215 | " vertex_font_size=20,\n", 216 | " vertex_font_family='mono',\n", 217 | " vertex_anchor=0,\n", 218 | " vcmap=matplotlib.cm.Spectral,\n", 219 | " ecmap=matplotlib.cm.Spectral,\n", 220 | " bg_color=[0,0,0,1],\n", 221 | " output_size=[1024*2,1024*2],\n", 222 | " output='/tmp/badtext.png')\n", 223 | " \n", 224 | " #text rotation\n", 225 | " text_rot = g.new_vertex_property('double')\n", 226 | " g.vertex_properties['text_rot'] = text_rot\n", 227 | " text_pos = 
g.new_vertex_property('double')\n", 228 | " g.vertex_properties['text_pos'] = text_pos\n", 229 | " for v in g.vertices():\n", 230 | " if pos[v][0] > 0:\n", 231 | " text_rot[v] = math.atan(pos[v][1]/pos[v][0])\n", 232 | " else:\n", 233 | " text_rot[v] = math.atan(pos[v][1]/pos[v][0])\n", 234 | " text_pos[v] = 10\n", 235 | " \n", 236 | " print('drawing again, with rotation')\n", 237 | " pos,t,tpos = gt.draw_hierarchy(state,\n", 238 | " vertex_text=g.vertex_properties['label'],\n", 239 | " vertex_text_rotation=g.vertex_properties['text_rot'],\n", 240 | " vertex_text_position=g.vp['text_pos'],\n", 241 | " vertex_font_size=13,\n", 242 | " vertex_font_family='mono',\n", 243 | " vertex_anchor=0,\n", 244 | " halpha=0, hsize_scale=0,\n", 245 | " vcmap=matplotlib.cm.Spectral,\n", 246 | " ecmap=matplotlib.cm.Spectral,\n", 247 | " bg_color=[0,0,0,1],\n", 248 | " output_size=[2600,2600],\n", 249 | " fit_view=.75,\n", 250 | " output='/home/aahu/Desktop/similarity_df_all.png')\n", 251 | " \n", 252 | " return\n", 253 | "\n", 254 | "build_cat_cat_net(df2)" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 43, 260 | "metadata": { 261 | "collapsed": false 262 | }, 263 | "outputs": [ 264 | { 265 | "name": "stdout", 266 | "output_type": "stream", 267 | "text": [ 268 | "g vert/edges: 76 2246\n", 269 | "level 0 : resizing 76 -> 9 , dS: -4744.86026056495\n", 270 | "level 1 : resizing 1 -> 2 , dS: -2.19109615384\n", 271 | "level 2 : resizing 1 -> 1 , dS: 0.0 [kept, rejected (1, 21.3895) vs (1, 21.3895)]\n", 272 | "level 1 : skipping 2\n", 273 | "level 0 : resizing 9 -> 9 , dS: 0.0 [kept, rejected (9, 656861) vs (9, 656561)]\n", 274 | "drawing again, with rotation\n" 275 | ] 276 | } 277 | ], 278 | "source": [ 279 | "def build_cat_net_from_shared_vendors(df):\n", 280 | " #build graph-tool ids\n", 281 | " node_lbs = {}\n", 282 | " rev_node_lbs = {}\n", 283 | " for idx,vendor in enumerate(df['category'].drop_duplicates()):\n", 284 | " node_lbs[vendor] = idx\n", 285 | 
" rev_node_lbs[idx] = vendor\n", 286 | " df['id'] = df['category'].map(lambda x:node_lbs[x])\n", 287 | " \n", 288 | " edge_list = []\n", 289 | " dfg = df.groupby('vendor')\n", 290 | " for name,group in dfg:\n", 291 | " ei = itertools.combinations(group['id'].drop_duplicates(),2)\n", 292 | " for e in ei:\n", 293 | " edge_list.append(tuple(sorted(e)))\n", 294 | "\n", 295 | " #filter edges by num shared vendors\n", 296 | " MIN_SHARED_VENDORS=3\n", 297 | " c = collections.Counter(edge_list)\n", 298 | " edge_list = [e for e in c if c[e]>=MIN_SHARED_VENDORS]\n", 299 | "\n", 300 | " #build graph\n", 301 | " g = gt.Graph(directed=False)\n", 302 | " g.add_edge_list(edge_list)\n", 303 | " g.vertex_properties['label'] = g.new_vertex_property('string')\n", 304 | " for v in g.vertices():\n", 305 | " g.vertex_properties['label'][v] = rev_node_lbs[g.vertex_index[v]]\n", 306 | " print('g vert/edges: ',g.num_vertices(), g.num_edges())\n", 307 | "\n", 308 | " #add edge weight property\n", 309 | " g.edge_properties['weight'] = g.new_edge_property('double')\n", 310 | " g.edge_properties['color'] = g.new_edge_property('vector')\n", 311 | " for e in g.edges():\n", 312 | " w = c[tuple(sorted([e.source(),e.target()]))]\n", 313 | " g.edge_properties['weight'][e] = w\n", 314 | " alpha = (float(w)/max(c.values())) + .025\n", 315 | " g.edge_properties['color'][e] = [103/255.0,134/255.0,239/255.0,alpha] \n", 316 | " \n", 317 | " state = gt.minimize_nested_blockmodel_dl(g,\n", 318 | " deg_corr=True,\n", 319 | " eweight=g.ep['weight'], \n", 320 | " verbose=True,\n", 321 | " max_B=9)\n", 322 | " \n", 323 | " pos,t,tpos = gt.draw_hierarchy(state,\n", 324 | " vertex_text=g.vertex_properties['label'],\n", 325 | " vertex_text_position=1,\n", 326 | " vertex_font_size=20,\n", 327 | " vertex_font_family='mono',\n", 328 | " vertex_anchor=0,\n", 329 | " vcmap=matplotlib.cm.Spectral,\n", 330 | " ecmap=matplotlib.cm.Spectral,\n", 331 | " bg_color=[0,0,0,1],\n", 332 | " output_size=[1024*2,1024*2],\n", 333 | 
" output='/home/aahu/Desktop/cats_from_vends.png')\n", 334 | " \n", 335 | " #text rotation\n", 336 | " text_rot = g.new_vertex_property('double')\n", 337 | " g.vertex_properties['text_rot'] = text_rot\n", 338 | " text_pos = g.new_vertex_property('double')\n", 339 | " g.vertex_properties['text_pos'] = text_pos\n", 340 | " for v in g.vertices():\n", 341 | " if pos[v][0] > 0:\n", 342 | " text_rot[v] = math.atan(pos[v][1]/pos[v][0])\n", 343 | " else:\n", 344 | " text_rot[v] = math.atan(pos[v][1]/pos[v][0])\n", 345 | " text_pos[v] = 10 #len(g.vp['label'][v].strip())\n", 346 | "\n", 347 | "# #text rotation\n", 348 | "# text_rot = g.new_vertex_property('double')\n", 349 | "# g.vertex_properties['text_rot'] = text_rot\n", 350 | "# for v in g.vertices():\n", 351 | "# if pos[v][0] >0:\n", 352 | "# text_rot[v] = math.atan(pos[v][1]/pos[v][0])\n", 353 | "# else:\n", 354 | "# text_rot[v] = math.pi + math.atan(pos[v][1]/pos[v][0])\n", 355 | " \n", 356 | " print('drawing again, with rotation')\n", 357 | " pos,t,tpos = gt.draw_hierarchy(state,\n", 358 | " vertex_text=g.vertex_properties['label'],\n", 359 | " vertex_text_rotation=g.vertex_properties['text_rot'],\n", 360 | " vertex_text_position=g.vp['text_pos'],\n", 361 | " vertex_font_size=18,\n", 362 | " vertex_font_family='mono',\n", 363 | " vertex_anchor=0,\n", 364 | " vcmap=matplotlib.cm.Spectral,\n", 365 | " ecmap=matplotlib.cm.Spectral,\n", 366 | " bg_color=[0,0,0,1],\n", 367 | " output_size=[1024*2,1024*2],\n", 368 | " output='/home/aahu/Desktop/cats_from_vends.png')\n", 369 | " return\n", 370 | "\n", 371 | "df3 = pd.read_csv('/home/aahu/Dropbox/black-market-recommender-systems/data/evolution.tsv',sep='\\t')\n", 372 | "build_cat_net_from_shared_vendors(df3)" 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": null, 378 | "metadata": { 379 | "collapsed": true 380 | }, 381 | "outputs": [], 382 | "source": [] 383 | } 384 | ], 385 | "metadata": { 386 | "kernelspec": { 387 | "display_name": "Python 3", 
388 | "language": "python", 389 | "name": "python3" 390 | }, 391 | "language_info": { 392 | "codemirror_mode": { 393 | "name": "ipython", 394 | "version": 3 395 | }, 396 | "file_extension": ".py", 397 | "mimetype": "text/x-python", 398 | "name": "python", 399 | "nbconvert_exporter": "python", 400 | "pygments_lexer": "ipython3", 401 | "version": "3.4.0" 402 | } 403 | }, 404 | "nbformat": 4, 405 | "nbformat_minor": 0 406 | } 407 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU General Public License is a free, copyleft license for 11 | software and other kinds of works. 12 | 13 | The licenses for most software and other practical works are designed 14 | to take away your freedom to share and change the works. By contrast, 15 | the GNU General Public License is intended to guarantee your freedom to 16 | share and change all versions of a program--to make sure it remains free 17 | software for all its users. We, the Free Software Foundation, use the 18 | GNU General Public License for most of our software; it applies also to 19 | any other work released this way by its authors. You can apply it to 20 | your programs, too. 21 | 22 | When we speak of free software, we are referring to freedom, not 23 | price. Our General Public Licenses are designed to make sure that you 24 | have the freedom to distribute copies of free software (and charge for 25 | them if you wish), that you receive source code or can get it if you 26 | want it, that you can change the software or use pieces of it in new 27 | free programs, and that you know you can do these things. 
28 | 29 | To protect your rights, we need to prevent others from denying you 30 | these rights or asking you to surrender the rights. Therefore, you have 31 | certain responsibilities if you distribute copies of the software, or if 32 | you modify it: responsibilities to respect the freedom of others. 33 | 34 | For example, if you distribute copies of such a program, whether 35 | gratis or for a fee, you must pass on to the recipients the same 36 | freedoms that you received. You must make sure that they, too, receive 37 | or can get the source code. And you must show them these terms so they 38 | know their rights. 39 | 40 | Developers that use the GNU GPL protect your rights with two steps: 41 | (1) assert copyright on the software, and (2) offer you this License 42 | giving you legal permission to copy, distribute and/or modify it. 43 | 44 | For the developers' and authors' protection, the GPL clearly explains 45 | that there is no warranty for this free software. For both users' and 46 | authors' sake, the GPL requires that modified versions be marked as 47 | changed, so that their problems will not be attributed erroneously to 48 | authors of previous versions. 49 | 50 | Some devices are designed to deny users access to install or run 51 | modified versions of the software inside them, although the manufacturer 52 | can do so. This is fundamentally incompatible with the aim of 53 | protecting users' freedom to change the software. The systematic 54 | pattern of such abuse occurs in the area of products for individuals to 55 | use, which is precisely where it is most unacceptable. Therefore, we 56 | have designed this version of the GPL to prohibit the practice for those 57 | products. If such problems arise substantially in other domains, we 58 | stand ready to extend this provision to those domains in future versions 59 | of the GPL, as needed to protect the freedom of users. 60 | 61 | Finally, every program is threatened constantly by software patents. 
62 | States should not allow patents to restrict development and use of 63 | software on general-purpose computers, but in those that do, we wish to 64 | avoid the special danger that patents applied to a free program could 65 | make it effectively proprietary. To prevent this, the GPL assures that 66 | patents cannot be used to render the program non-free. 67 | 68 | The precise terms and conditions for copying, distribution and 69 | modification follow. 70 | 71 | TERMS AND CONDITIONS 72 | 73 | 0. Definitions. 74 | 75 | "This License" refers to version 3 of the GNU General Public License. 76 | 77 | "Copyright" also means copyright-like laws that apply to other kinds of 78 | works, such as semiconductor masks. 79 | 80 | "The Program" refers to any copyrightable work licensed under this 81 | License. Each licensee is addressed as "you". "Licensees" and 82 | "recipients" may be individuals or organizations. 83 | 84 | To "modify" a work means to copy from or adapt all or part of the work 85 | in a fashion requiring copyright permission, other than the making of an 86 | exact copy. The resulting work is called a "modified version" of the 87 | earlier work or a work "based on" the earlier work. 88 | 89 | A "covered work" means either the unmodified Program or a work based 90 | on the Program. 91 | 92 | To "propagate" a work means to do anything with it that, without 93 | permission, would make you directly or secondarily liable for 94 | infringement under applicable copyright law, except executing it on a 95 | computer or modifying a private copy. Propagation includes copying, 96 | distribution (with or without modification), making available to the 97 | public, and in some countries other activities as well. 98 | 99 | To "convey" a work means any kind of propagation that enables other 100 | parties to make or receive copies. Mere interaction with a user through 101 | a computer network, with no transfer of a copy, is not conveying. 
102 | 103 | An interactive user interface displays "Appropriate Legal Notices" 104 | to the extent that it includes a convenient and prominently visible 105 | feature that (1) displays an appropriate copyright notice, and (2) 106 | tells the user that there is no warranty for the work (except to the 107 | extent that warranties are provided), that licensees may convey the 108 | work under this License, and how to view a copy of this License. If 109 | the interface presents a list of user commands or options, such as a 110 | menu, a prominent item in the list meets this criterion. 111 | 112 | 1. Source Code. 113 | 114 | The "source code" for a work means the preferred form of the work 115 | for making modifications to it. "Object code" means any non-source 116 | form of a work. 117 | 118 | A "Standard Interface" means an interface that either is an official 119 | standard defined by a recognized standards body, or, in the case of 120 | interfaces specified for a particular programming language, one that 121 | is widely used among developers working in that language. 122 | 123 | The "System Libraries" of an executable work include anything, other 124 | than the work as a whole, that (a) is included in the normal form of 125 | packaging a Major Component, but which is not part of that Major 126 | Component, and (b) serves only to enable use of the work with that 127 | Major Component, or to implement a Standard Interface for which an 128 | implementation is available to the public in source code form. A 129 | "Major Component", in this context, means a major essential component 130 | (kernel, window system, and so on) of the specific operating system 131 | (if any) on which the executable work runs, or a compiler used to 132 | produce the work, or an object code interpreter used to run it. 
133 | 134 | The "Corresponding Source" for a work in object code form means all 135 | the source code needed to generate, install, and (for an executable 136 | work) run the object code and to modify the work, including scripts to 137 | control those activities. However, it does not include the work's 138 | System Libraries, or general-purpose tools or generally available free 139 | programs which are used unmodified in performing those activities but 140 | which are not part of the work. For example, Corresponding Source 141 | includes interface definition files associated with source files for 142 | the work, and the source code for shared libraries and dynamically 143 | linked subprograms that the work is specifically designed to require, 144 | such as by intimate data communication or control flow between those 145 | subprograms and other parts of the work. 146 | 147 | The Corresponding Source need not include anything that users 148 | can regenerate automatically from other parts of the Corresponding 149 | Source. 150 | 151 | The Corresponding Source for a work in source code form is that 152 | same work. 153 | 154 | 2. Basic Permissions. 155 | 156 | All rights granted under this License are granted for the term of 157 | copyright on the Program, and are irrevocable provided the stated 158 | conditions are met. This License explicitly affirms your unlimited 159 | permission to run the unmodified Program. The output from running a 160 | covered work is covered by this License only if the output, given its 161 | content, constitutes a covered work. This License acknowledges your 162 | rights of fair use or other equivalent, as provided by copyright law. 163 | 164 | You may make, run and propagate covered works that you do not 165 | convey, without conditions so long as your license otherwise remains 166 | in force. 
You may convey covered works to others for the sole purpose 167 | of having them make modifications exclusively for you, or provide you 168 | with facilities for running those works, provided that you comply with 169 | the terms of this License in conveying all material for which you do 170 | not control copyright. Those thus making or running the covered works 171 | for you must do so exclusively on your behalf, under your direction 172 | and control, on terms that prohibit them from making any copies of 173 | your copyrighted material outside their relationship with you. 174 | 175 | Conveying under any other circumstances is permitted solely under 176 | the conditions stated below. Sublicensing is not allowed; section 10 177 | makes it unnecessary. 178 | 179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 180 | 181 | No covered work shall be deemed part of an effective technological 182 | measure under any applicable law fulfilling obligations under article 183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 184 | similar laws prohibiting or restricting circumvention of such 185 | measures. 186 | 187 | When you convey a covered work, you waive any legal power to forbid 188 | circumvention of technological measures to the extent such circumvention 189 | is effected by exercising rights under this License with respect to 190 | the covered work, and you disclaim any intention to limit operation or 191 | modification of the work as a means of enforcing, against the work's 192 | users, your or third parties' legal rights to forbid circumvention of 193 | technological measures. 194 | 195 | 4. Conveying Verbatim Copies. 
196 | 197 | You may convey verbatim copies of the Program's source code as you 198 | receive it, in any medium, provided that you conspicuously and 199 | appropriately publish on each copy an appropriate copyright notice; 200 | keep intact all notices stating that this License and any 201 | non-permissive terms added in accord with section 7 apply to the code; 202 | keep intact all notices of the absence of any warranty; and give all 203 | recipients a copy of this License along with the Program. 204 | 205 | You may charge any price or no price for each copy that you convey, 206 | and you may offer support or warranty protection for a fee. 207 | 208 | 5. Conveying Modified Source Versions. 209 | 210 | You may convey a work based on the Program, or the modifications to 211 | produce it from the Program, in the form of source code under the 212 | terms of section 4, provided that you also meet all of these conditions: 213 | 214 | a) The work must carry prominent notices stating that you modified 215 | it, and giving a relevant date. 216 | 217 | b) The work must carry prominent notices stating that it is 218 | released under this License and any conditions added under section 219 | 7. This requirement modifies the requirement in section 4 to 220 | "keep intact all notices". 221 | 222 | c) You must license the entire work, as a whole, under this 223 | License to anyone who comes into possession of a copy. This 224 | License will therefore apply, along with any applicable section 7 225 | additional terms, to the whole of the work, and all its parts, 226 | regardless of how they are packaged. This License gives no 227 | permission to license the work in any other way, but it does not 228 | invalidate such permission if you have separately received it. 
229 | 230 | d) If the work has interactive user interfaces, each must display 231 | Appropriate Legal Notices; however, if the Program has interactive 232 | interfaces that do not display Appropriate Legal Notices, your 233 | work need not make them do so. 234 | 235 | A compilation of a covered work with other separate and independent 236 | works, which are not by their nature extensions of the covered work, 237 | and which are not combined with it such as to form a larger program, 238 | in or on a volume of a storage or distribution medium, is called an 239 | "aggregate" if the compilation and its resulting copyright are not 240 | used to limit the access or legal rights of the compilation's users 241 | beyond what the individual works permit. Inclusion of a covered work 242 | in an aggregate does not cause this License to apply to the other 243 | parts of the aggregate. 244 | 245 | 6. Conveying Non-Source Forms. 246 | 247 | You may convey a covered work in object code form under the terms 248 | of sections 4 and 5, provided that you also convey the 249 | machine-readable Corresponding Source under the terms of this License, 250 | in one of these ways: 251 | 252 | a) Convey the object code in, or embodied in, a physical product 253 | (including a physical distribution medium), accompanied by the 254 | Corresponding Source fixed on a durable physical medium 255 | customarily used for software interchange. 
256 | 257 | b) Convey the object code in, or embodied in, a physical product 258 | (including a physical distribution medium), accompanied by a 259 | written offer, valid for at least three years and valid for as 260 | long as you offer spare parts or customer support for that product 261 | model, to give anyone who possesses the object code either (1) a 262 | copy of the Corresponding Source for all the software in the 263 | product that is covered by this License, on a durable physical 264 | medium customarily used for software interchange, for a price no 265 | more than your reasonable cost of physically performing this 266 | conveying of source, or (2) access to copy the 267 | Corresponding Source from a network server at no charge. 268 | 269 | c) Convey individual copies of the object code with a copy of the 270 | written offer to provide the Corresponding Source. This 271 | alternative is allowed only occasionally and noncommercially, and 272 | only if you received the object code with such an offer, in accord 273 | with subsection 6b. 274 | 275 | d) Convey the object code by offering access from a designated 276 | place (gratis or for a charge), and offer equivalent access to the 277 | Corresponding Source in the same way through the same place at no 278 | further charge. You need not require recipients to copy the 279 | Corresponding Source along with the object code. If the place to 280 | copy the object code is a network server, the Corresponding Source 281 | may be on a different server (operated by you or a third party) 282 | that supports equivalent copying facilities, provided you maintain 283 | clear directions next to the object code saying where to find the 284 | Corresponding Source. Regardless of what server hosts the 285 | Corresponding Source, you remain obligated to ensure that it is 286 | available for as long as needed to satisfy these requirements. 
287 | 288 | e) Convey the object code using peer-to-peer transmission, provided 289 | you inform other peers where the object code and Corresponding 290 | Source of the work are being offered to the general public at no 291 | charge under subsection 6d. 292 | 293 | A separable portion of the object code, whose source code is excluded 294 | from the Corresponding Source as a System Library, need not be 295 | included in conveying the object code work. 296 | 297 | A "User Product" is either (1) a "consumer product", which means any 298 | tangible personal property which is normally used for personal, family, 299 | or household purposes, or (2) anything designed or sold for incorporation 300 | into a dwelling. In determining whether a product is a consumer product, 301 | doubtful cases shall be resolved in favor of coverage. For a particular 302 | product received by a particular user, "normally used" refers to a 303 | typical or common use of that class of product, regardless of the status 304 | of the particular user or of the way in which the particular user 305 | actually uses, or expects or is expected to use, the product. A product 306 | is a consumer product regardless of whether the product has substantial 307 | commercial, industrial or non-consumer uses, unless such uses represent 308 | the only significant mode of use of the product. 309 | 310 | "Installation Information" for a User Product means any methods, 311 | procedures, authorization keys, or other information required to install 312 | and execute modified versions of a covered work in that User Product from 313 | a modified version of its Corresponding Source. The information must 314 | suffice to ensure that the continued functioning of the modified object 315 | code is in no case prevented or interfered with solely because 316 | modification has been made. 
317 | 318 | If you convey an object code work under this section in, or with, or 319 | specifically for use in, a User Product, and the conveying occurs as 320 | part of a transaction in which the right of possession and use of the 321 | User Product is transferred to the recipient in perpetuity or for a 322 | fixed term (regardless of how the transaction is characterized), the 323 | Corresponding Source conveyed under this section must be accompanied 324 | by the Installation Information. But this requirement does not apply 325 | if neither you nor any third party retains the ability to install 326 | modified object code on the User Product (for example, the work has 327 | been installed in ROM). 328 | 329 | The requirement to provide Installation Information does not include a 330 | requirement to continue to provide support service, warranty, or updates 331 | for a work that has been modified or installed by the recipient, or for 332 | the User Product in which it has been modified or installed. Access to a 333 | network may be denied when the modification itself materially and 334 | adversely affects the operation of the network or violates the rules and 335 | protocols for communication across the network. 336 | 337 | Corresponding Source conveyed, and Installation Information provided, 338 | in accord with this section must be in a format that is publicly 339 | documented (and with an implementation available to the public in 340 | source code form), and must require no special password or key for 341 | unpacking, reading or copying. 342 | 343 | 7. Additional Terms. 344 | 345 | "Additional permissions" are terms that supplement the terms of this 346 | License by making exceptions from one or more of its conditions. 347 | Additional permissions that are applicable to the entire Program shall 348 | be treated as though they were included in this License, to the extent 349 | that they are valid under applicable law. 
If additional permissions 350 | apply only to part of the Program, that part may be used separately 351 | under those permissions, but the entire Program remains governed by 352 | this License without regard to the additional permissions. 353 | 354 | When you convey a copy of a covered work, you may at your option 355 | remove any additional permissions from that copy, or from any part of 356 | it. (Additional permissions may be written to require their own 357 | removal in certain cases when you modify the work.) You may place 358 | additional permissions on material, added by you to a covered work, 359 | for which you have or can give appropriate copyright permission. 360 | 361 | Notwithstanding any other provision of this License, for material you 362 | add to a covered work, you may (if authorized by the copyright holders of 363 | that material) supplement the terms of this License with terms: 364 | 365 | a) Disclaiming warranty or limiting liability differently from the 366 | terms of sections 15 and 16 of this License; or 367 | 368 | b) Requiring preservation of specified reasonable legal notices or 369 | author attributions in that material or in the Appropriate Legal 370 | Notices displayed by works containing it; or 371 | 372 | c) Prohibiting misrepresentation of the origin of that material, or 373 | requiring that modified versions of such material be marked in 374 | reasonable ways as different from the original version; or 375 | 376 | d) Limiting the use for publicity purposes of names of licensors or 377 | authors of the material; or 378 | 379 | e) Declining to grant rights under trademark law for use of some 380 | trade names, trademarks, or service marks; or 381 | 382 | f) Requiring indemnification of licensors and authors of that 383 | material by anyone who conveys the material (or modified versions of 384 | it) with contractual assumptions of liability to the recipient, for 385 | any liability that these contractual assumptions directly impose on 
386 | those licensors and authors. 387 | 388 | All other non-permissive additional terms are considered "further 389 | restrictions" within the meaning of section 10. If the Program as you 390 | received it, or any part of it, contains a notice stating that it is 391 | governed by this License along with a term that is a further 392 | restriction, you may remove that term. If a license document contains 393 | a further restriction but permits relicensing or conveying under this 394 | License, you may add to a covered work material governed by the terms 395 | of that license document, provided that the further restriction does 396 | not survive such relicensing or conveying. 397 | 398 | If you add terms to a covered work in accord with this section, you 399 | must place, in the relevant source files, a statement of the 400 | additional terms that apply to those files, or a notice indicating 401 | where to find the applicable terms. 402 | 403 | Additional terms, permissive or non-permissive, may be stated in the 404 | form of a separately written license, or stated as exceptions; 405 | the above requirements apply either way. 406 | 407 | 8. Termination. 408 | 409 | You may not propagate or modify a covered work except as expressly 410 | provided under this License. Any attempt otherwise to propagate or 411 | modify it is void, and will automatically terminate your rights under 412 | this License (including any patent licenses granted under the third 413 | paragraph of section 11). 414 | 415 | However, if you cease all violation of this License, then your 416 | license from a particular copyright holder is reinstated (a) 417 | provisionally, unless and until the copyright holder explicitly and 418 | finally terminates your license, and (b) permanently, if the copyright 419 | holder fails to notify you of the violation by some reasonable means 420 | prior to 60 days after the cessation. 
421 | 422 | Moreover, your license from a particular copyright holder is 423 | reinstated permanently if the copyright holder notifies you of the 424 | violation by some reasonable means, this is the first time you have 425 | received notice of violation of this License (for any work) from that 426 | copyright holder, and you cure the violation prior to 30 days after 427 | your receipt of the notice. 428 | 429 | Termination of your rights under this section does not terminate the 430 | licenses of parties who have received copies or rights from you under 431 | this License. If your rights have been terminated and not permanently 432 | reinstated, you do not qualify to receive new licenses for the same 433 | material under section 10. 434 | 435 | 9. Acceptance Not Required for Having Copies. 436 | 437 | You are not required to accept this License in order to receive or 438 | run a copy of the Program. Ancillary propagation of a covered work 439 | occurring solely as a consequence of using peer-to-peer transmission 440 | to receive a copy likewise does not require acceptance. However, 441 | nothing other than this License grants you permission to propagate or 442 | modify any covered work. These actions infringe copyright if you do 443 | not accept this License. Therefore, by modifying or propagating a 444 | covered work, you indicate your acceptance of this License to do so. 445 | 446 | 10. Automatic Licensing of Downstream Recipients. 447 | 448 | Each time you convey a covered work, the recipient automatically 449 | receives a license from the original licensors, to run, modify and 450 | propagate that work, subject to this License. You are not responsible 451 | for enforcing compliance by third parties with this License. 452 | 453 | An "entity transaction" is a transaction transferring control of an 454 | organization, or substantially all assets of one, or subdividing an 455 | organization, or merging organizations. 
If propagation of a covered 456 | work results from an entity transaction, each party to that 457 | transaction who receives a copy of the work also receives whatever 458 | licenses to the work the party's predecessor in interest had or could 459 | give under the previous paragraph, plus a right to possession of the 460 | Corresponding Source of the work from the predecessor in interest, if 461 | the predecessor has it or can get it with reasonable efforts. 462 | 463 | You may not impose any further restrictions on the exercise of the 464 | rights granted or affirmed under this License. For example, you may 465 | not impose a license fee, royalty, or other charge for exercise of 466 | rights granted under this License, and you may not initiate litigation 467 | (including a cross-claim or counterclaim in a lawsuit) alleging that 468 | any patent claim is infringed by making, using, selling, offering for 469 | sale, or importing the Program or any portion of it. 470 | 471 | 11. Patents. 472 | 473 | A "contributor" is a copyright holder who authorizes use under this 474 | License of the Program or a work on which the Program is based. The 475 | work thus licensed is called the contributor's "contributor version". 476 | 477 | A contributor's "essential patent claims" are all patent claims 478 | owned or controlled by the contributor, whether already acquired or 479 | hereafter acquired, that would be infringed by some manner, permitted 480 | by this License, of making, using, or selling its contributor version, 481 | but do not include claims that would be infringed only as a 482 | consequence of further modification of the contributor version. For 483 | purposes of this definition, "control" includes the right to grant 484 | patent sublicenses in a manner consistent with the requirements of 485 | this License. 
486 | 487 | Each contributor grants you a non-exclusive, worldwide, royalty-free 488 | patent license under the contributor's essential patent claims, to 489 | make, use, sell, offer for sale, import and otherwise run, modify and 490 | propagate the contents of its contributor version. 491 | 492 | In the following three paragraphs, a "patent license" is any express 493 | agreement or commitment, however denominated, not to enforce a patent 494 | (such as an express permission to practice a patent or covenant not to 495 | sue for patent infringement). To "grant" such a patent license to a 496 | party means to make such an agreement or commitment not to enforce a 497 | patent against the party. 498 | 499 | If you convey a covered work, knowingly relying on a patent license, 500 | and the Corresponding Source of the work is not available for anyone 501 | to copy, free of charge and under the terms of this License, through a 502 | publicly available network server or other readily accessible means, 503 | then you must either (1) cause the Corresponding Source to be so 504 | available, or (2) arrange to deprive yourself of the benefit of the 505 | patent license for this particular work, or (3) arrange, in a manner 506 | consistent with the requirements of this License, to extend the patent 507 | license to downstream recipients. "Knowingly relying" means you have 508 | actual knowledge that, but for the patent license, your conveying the 509 | covered work in a country, or your recipient's use of the covered work 510 | in a country, would infringe one or more identifiable patents in that 511 | country that you have reason to believe are valid. 
512 | 513 | If, pursuant to or in connection with a single transaction or 514 | arrangement, you convey, or propagate by procuring conveyance of, a 515 | covered work, and grant a patent license to some of the parties 516 | receiving the covered work authorizing them to use, propagate, modify 517 | or convey a specific copy of the covered work, then the patent license 518 | you grant is automatically extended to all recipients of the covered 519 | work and works based on it. 520 | 521 | A patent license is "discriminatory" if it does not include within 522 | the scope of its coverage, prohibits the exercise of, or is 523 | conditioned on the non-exercise of one or more of the rights that are 524 | specifically granted under this License. You may not convey a covered 525 | work if you are a party to an arrangement with a third party that is 526 | in the business of distributing software, under which you make payment 527 | to the third party based on the extent of your activity of conveying 528 | the work, and under which the third party grants, to any of the 529 | parties who would receive the covered work from you, a discriminatory 530 | patent license (a) in connection with copies of the covered work 531 | conveyed by you (or copies made from those copies), or (b) primarily 532 | for and in connection with specific products or compilations that 533 | contain the covered work, unless you entered into that arrangement, 534 | or that patent license was granted, prior to 28 March 2007. 535 | 536 | Nothing in this License shall be construed as excluding or limiting 537 | any implied license or other defenses to infringement that may 538 | otherwise be available to you under applicable patent law. 539 | 540 | 12. No Surrender of Others' Freedom. 541 | 542 | If conditions are imposed on you (whether by court order, agreement or 543 | otherwise) that contradict the conditions of this License, they do not 544 | excuse you from the conditions of this License. 
If you cannot convey a 545 | covered work so as to satisfy simultaneously your obligations under this 546 | License and any other pertinent obligations, then as a consequence you may 547 | not convey it at all. For example, if you agree to terms that obligate you 548 | to collect a royalty for further conveying from those to whom you convey 549 | the Program, the only way you could satisfy both those terms and this 550 | License would be to refrain entirely from conveying the Program. 551 | 552 | 13. Use with the GNU Affero General Public License. 553 | 554 | Notwithstanding any other provision of this License, you have 555 | permission to link or combine any covered work with a work licensed 556 | under version 3 of the GNU Affero General Public License into a single 557 | combined work, and to convey the resulting work. The terms of this 558 | License will continue to apply to the part which is the covered work, 559 | but the special requirements of the GNU Affero General Public License, 560 | section 13, concerning interaction through a network will apply to the 561 | combination as such. 562 | 563 | 14. Revised Versions of this License. 564 | 565 | The Free Software Foundation may publish revised and/or new versions of 566 | the GNU General Public License from time to time. Such new versions will 567 | be similar in spirit to the present version, but may differ in detail to 568 | address new problems or concerns. 569 | 570 | Each version is given a distinguishing version number. If the 571 | Program specifies that a certain numbered version of the GNU General 572 | Public License "or any later version" applies to it, you have the 573 | option of following the terms and conditions either of that numbered 574 | version or of any later version published by the Free Software 575 | Foundation. If the Program does not specify a version number of the 576 | GNU General Public License, you may choose any version ever published 577 | by the Free Software Foundation. 
578 | 579 | If the Program specifies that a proxy can decide which future 580 | versions of the GNU General Public License can be used, that proxy's 581 | public statement of acceptance of a version permanently authorizes you 582 | to choose that version for the Program. 583 | 584 | Later license versions may give you additional or different 585 | permissions. However, no additional obligations are imposed on any 586 | author or copyright holder as a result of your choosing to follow a 587 | later version. 588 | 589 | 15. Disclaimer of Warranty. 590 | 591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 599 | 600 | 16. Limitation of Liability. 601 | 602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 610 | SUCH DAMAGES. 611 | 612 | 17. Interpretation of Sections 15 and 16. 
613 | 614 | If the disclaimer of warranty and limitation of liability provided 615 | above cannot be given local legal effect according to their terms, 616 | reviewing courts shall apply local law that most closely approximates 617 | an absolute waiver of all civil liability in connection with the 618 | Program, unless a warranty or assumption of liability accompanies a 619 | copy of the Program in return for a fee. 620 | 621 | END OF TERMS AND CONDITIONS 622 | 623 | How to Apply These Terms to Your New Programs 624 | 625 | If you develop a new program, and you want it to be of the greatest 626 | possible use to the public, the best way to achieve this is to make it 627 | free software which everyone can redistribute and change under these terms. 628 | 629 | To do so, attach the following notices to the program. It is safest 630 | to attach them to the start of each source file to most effectively 631 | state the exclusion of warranty; and each file should have at least 632 | the "copyright" line and a pointer to where the full notice is found. 633 | 634 | {one line to give the program's name and a brief idea of what it does.} 635 | Copyright (C) {year} {name of author} 636 | 637 | This program is free software: you can redistribute it and/or modify 638 | it under the terms of the GNU General Public License as published by 639 | the Free Software Foundation, either version 3 of the License, or 640 | (at your option) any later version. 641 | 642 | This program is distributed in the hope that it will be useful, 643 | but WITHOUT ANY WARRANTY; without even the implied warranty of 644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 645 | GNU General Public License for more details. 646 | 647 | You should have received a copy of the GNU General Public License 648 | along with this program. If not, see . 649 | 650 | Also add information on how to contact you by electronic and paper mail. 
651 | 652 | If the program does terminal interaction, make it output a short 653 | notice like this when it starts in an interactive mode: 654 | 655 | {project} Copyright (C) {year} {fullname} 656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 657 | This is free software, and you are welcome to redistribute it 658 | under certain conditions; type `show c' for details. 659 | 660 | The hypothetical commands `show w' and `show c' should show the appropriate 661 | parts of the General Public License. Of course, your program's commands 662 | might be different; for a GUI interface, you would use an "about box". 663 | 664 | You should also get your employer (if you work as a programmer) or school, 665 | if any, to sign a "copyright disclaimer" for the program, if necessary. 666 | For more information on this, and how to apply and follow the GNU GPL, see 667 | . 668 | 669 | The GNU General Public License does not permit incorporating your program 670 | into proprietary programs. If your program is a subroutine library, you 671 | may consider it more useful to permit linking proprietary applications with 672 | the library. If this is what you want to do, use the GNU Lesser General 673 | Public License instead of this License. But first, please read 674 | . 
675 | 676 | -------------------------------------------------------------------------------- /bmrs/postprocessing/.ipynb_checkpoints/postprocess_all-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd\n", 12 | "import dateutil\n", 13 | "import os\n", 14 | "import ast\n", 15 | "\n", 16 | "import logging\n", 17 | "FORMAT = '%(asctime)-15s %(levelname)-6s %(message)s'\n", 18 | "DATE_FORMAT = '%b %d %H:%M:%S'\n", 19 | "formatter = logging.Formatter(fmt=FORMAT, datefmt=DATE_FORMAT)\n", 20 | "handler = logging.StreamHandler()\n", 21 | "handler.setFormatter(formatter)\n", 22 | "logger = logging.getLogger(__name__)\n", 23 | "logger.addHandler(handler)\n", 24 | "logger.setLevel(logging.INFO)" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 4, 30 | "metadata": { 31 | "collapsed": true 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "META_CATS = ['Other','Drugs','Services', 'Custom Listings', 'DRUGS & MORE','other service','other drugs','others']\n", 36 | "META_CATS = [s.lower() for s in META_CATS]" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 3, 42 | "metadata": { 43 | "collapsed": true 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "def load_agora():\n", 48 | " DATA_DIR='/home/aahu/Dropbox/black-market-recommender-systems/data/agora/'\n", 49 | " l=[]\n", 50 | " for fname in os.listdir(DATA_DIR):\n", 51 | " if fname.endswith('.tsv'):\n", 52 | " df0 = pd.read_csv(os.path.join(DATA_DIR,fname), sep='\\t', parse_dates=['scrape_date'])\n", 53 | " l.append(df0)\n", 54 | " df = pd.concat(l)\n", 55 | " logger.info(df.columns)\n", 56 | " logger.info(df.shape)\n", 57 | " return df" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 4, 63 | "metadata": { 64 | "collapsed": false 65 
| }, 66 | "outputs": [], 67 | "source": [ 68 | "def load_pandora():\n", 69 | " DATA_DIR='/home/aahu/Dropbox/black-market-recommender-systems/data/pandora/'\n", 70 | " l=[]\n", 71 | " for fname in os.listdir(DATA_DIR):\n", 72 | " if fname.endswith('.tsv'):\n", 73 | " try:\n", 74 | " df0 = pd.read_csv(os.path.join(DATA_DIR,fname), sep='\\t', parse_dates=['scrape_date'])\n", 75 | " l.append(df0)\n", 76 | " except ValueError:\n", 77 | " #logger.exception('no data')\n", 78 | " pass\n", 79 | " df = pd.concat(l)\n", 80 | " logger.info(df.columns)\n", 81 | " logger.info(df.shape)\n", 82 | " return df" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 5, 88 | "metadata": { 89 | "collapsed": false 90 | }, 91 | "outputs": [], 92 | "source": [ 93 | "def load_cloudnine():\n", 94 | " DATA_DIR='/home/aahu/Dropbox/black-market-recommender-systems/data/cloudnine/'\n", 95 | " l=[]\n", 96 | " for fname in os.listdir(DATA_DIR):\n", 97 | " if fname.endswith('.tsv'):\n", 98 | " try:\n", 99 | " df0 = pd.read_csv(os.path.join(DATA_DIR,fname), sep='\\t', parse_dates=['scrape_date'])\n", 100 | " l.append(df0)\n", 101 | " except ValueError:\n", 102 | " logger.exception('no data')\n", 103 | " pass\n", 104 | " df = pd.concat(l)\n", 105 | " logger.info(df.columns)\n", 106 | " logger.info(df.shape)\n", 107 | " \n", 108 | " #be consistent\n", 109 | " df.rename(columns={'scraped_date':'scrape_date'}, inplace=True)\n", 110 | " df['cat'] = df['cat'].map(lambda x: ast.literal_eval(x))\n", 111 | " df['category'] = df['cat'].map(lambda x: x[-1])\n", 112 | " \n", 113 | " return df" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 6, 119 | "metadata": { 120 | "collapsed": false 121 | }, 122 | "outputs": [], 123 | "source": [ 124 | "def load_hydra():\n", 125 | " DATA_DIR='/home/aahu/Dropbox/black-market-recommender-systems/data/hydra/'\n", 126 | " l=[]\n", 127 | " for fname in os.listdir(DATA_DIR):\n", 128 | " if fname.endswith('.tsv'):\n", 129 | " 
try:\n", 130 | " df0 = pd.read_csv(os.path.join(DATA_DIR,fname), sep='\\t', parse_dates=['scrape_date'])\n", 131 | " l.append(df0)\n", 132 | " except ValueError:\n", 133 | " logger.exception('no data')\n", 134 | " pass\n", 135 | " df = pd.concat(l)\n", 136 | " logger.info(df.columns)\n", 137 | " logger.info(df.shape)\n", 138 | " \n", 139 | " #be consistent\n", 140 | " df.rename(columns={'scraped_date':'scrape_date'}, inplace=True)\n", 141 | " df['cat'] = df['category'].map(lambda x: ast.literal_eval(x))\n", 142 | " df['category'] = df['cat'].map(lambda x: x[-1])\n", 143 | " \n", 144 | " return df" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 7, 150 | "metadata": { 151 | "collapsed": true 152 | }, 153 | "outputs": [], 154 | "source": [ 155 | "def load_evolution():\n", 156 | " DATA_DIR='/home/aahu/Dropbox/black-market-recommender-systems/data/evolution/'\n", 157 | " l=[]\n", 158 | " for fname in os.listdir(DATA_DIR):\n", 159 | " if fname.endswith('.tsv'):\n", 160 | " try:\n", 161 | " df0 = pd.read_csv(os.path.join(DATA_DIR,fname), sep='\\t', parse_dates=['scrape_date'])\n", 162 | " l.append(df0)\n", 163 | " except ValueError:\n", 164 | " logger.exception('no data')\n", 165 | " pass\n", 166 | " df = pd.concat(l)\n", 167 | " logger.info(df.columns)\n", 168 | " logger.info(df.shape)\n", 169 | " \n", 170 | " #be consistent\n", 171 | " #df.rename(columns={'scraped_date':'scrape_date'}, inplace=True)\n", 172 | " #df['cat'] = df['category'].map(lambda x: ast.literal_eval(x))\n", 173 | " #df['category'] = df['cat'].map(lambda x: x[-1])\n", 174 | " \n", 175 | " return df" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 8, 181 | "metadata": { 182 | "collapsed": false 183 | }, 184 | "outputs": [], 185 | "source": [ 186 | "def postprocess(df):\n", 187 | " \"\"\"\n", 188 | " standardized postprocessing\n", 189 | " \"\"\"\n", 190 | " #normalize\n", 191 | " df['category'] = df['category'].map(lambda x:x.lower())\n", 192 | 
" \n", 193 | " #discard meta-categories\n", 194 | " df = df[df['category'].map(lambda x:x not in META_CATS)]\n", 195 | " logger.info(df.shape)\n", 196 | " \n", 197 | " #discard non-string categories\n", 198 | " def isfloat(value):\n", 199 | " try:\n", 200 | " float(value)\n", 201 | " return True\n", 202 | " except ValueError:\n", 203 | " return False\n", 204 | " df = df[df['category'].map(lambda x:not isfloat(x))]\n", 205 | " \n", 206 | " return df" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 7, 212 | "metadata": { 213 | "collapsed": false 214 | }, 215 | "outputs": [ 216 | { 217 | "name": "stderr", 218 | "output_type": "stream", 219 | "text": [ 220 | "May 20 21:29:41 INFO Index(['category', 'listing', 'price_btc', 'scrape_date', 'ships_from', 'ships_to', 'vendor'], dtype='object')\n", 221 | "INFO:__main__:Index(['category', 'listing', 'price_btc', 'scrape_date', 'ships_from', 'ships_to', 'vendor'], dtype='object')\n", 222 | "May 20 21:29:41 INFO (663912, 7)\n", 223 | "INFO:__main__:(663912, 7)\n" 224 | ] 225 | }, 226 | { 227 | "ename": "ValueError", 228 | "evalue": "malformed node or string: <_ast.BinOp object at 0x7f55aa712a90>", 229 | "output_type": "error", 230 | "traceback": [ 231 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 232 | "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", 233 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 20\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mdf\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 21\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 22\u001b[1;33m \u001b[0msr\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mload_silkroad2\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 23\u001b[0m \u001b[0msr\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 234 | "\u001b[1;32m\u001b[0m in 
\u001b[0;36mload_silkroad2\u001b[1;34m()\u001b[0m\n\u001b[0;32m 15\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 16\u001b[0m \u001b[1;31m#be consistent\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 17\u001b[1;33m \u001b[0mdf\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'cat'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'category'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmap\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;32mlambda\u001b[0m \u001b[0mx\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mast\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mliteral_eval\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mx\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 18\u001b[0m \u001b[0mdf\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'category'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'cat'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmap\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;32mlambda\u001b[0m \u001b[0mx\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mx\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m-\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 19\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", 235 | "\u001b[1;32m/usr/local/lib/python3.4/dist-packages/pandas/core/series.py\u001b[0m in \u001b[0;36mmap\u001b[1;34m(self, arg, na_action)\u001b[0m\n\u001b[0;32m 2013\u001b[0m index=self.index).__finalize__(self)\n\u001b[0;32m 2014\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 2015\u001b[1;33m \u001b[0mmapped\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmap_f\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0marg\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2016\u001b[0m return 
self._constructor(mapped,\n\u001b[0;32m 2017\u001b[0m index=self.index).__finalize__(self)\n", 236 | "\u001b[1;32mpandas/src/inference.pyx\u001b[0m in \u001b[0;36mpandas.lib.map_infer (pandas/lib.c:57158)\u001b[1;34m()\u001b[0m\n", 237 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m(x)\u001b[0m\n\u001b[0;32m 15\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 16\u001b[0m \u001b[1;31m#be consistent\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 17\u001b[1;33m \u001b[0mdf\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'cat'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'category'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmap\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;32mlambda\u001b[0m \u001b[0mx\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mast\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mliteral_eval\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mx\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 18\u001b[0m \u001b[0mdf\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'category'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'cat'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmap\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;32mlambda\u001b[0m \u001b[0mx\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mx\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m-\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 19\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", 238 | "\u001b[1;32m/usr/lib/python3.4/ast.py\u001b[0m in \u001b[0;36mliteral_eval\u001b[1;34m(node_or_string)\u001b[0m\n\u001b[0;32m 82\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mleft\u001b[0m \u001b[1;33m-\u001b[0m \u001b[0mright\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 83\u001b[0m \u001b[1;32mraise\u001b[0m 
\u001b[0mValueError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'malformed node or string: '\u001b[0m \u001b[1;33m+\u001b[0m \u001b[0mrepr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnode\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 84\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0m_convert\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnode_or_string\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 85\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 86\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", 239 | "\u001b[1;32m/usr/lib/python3.4/ast.py\u001b[0m in \u001b[0;36m_convert\u001b[1;34m(node)\u001b[0m\n\u001b[0;32m 81\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 82\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mleft\u001b[0m \u001b[1;33m-\u001b[0m \u001b[0mright\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 83\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'malformed node or string: '\u001b[0m \u001b[1;33m+\u001b[0m \u001b[0mrepr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnode\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 84\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0m_convert\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnode_or_string\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 85\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", 240 | "\u001b[1;31mValueError\u001b[0m: malformed node or string: <_ast.BinOp object at 0x7f55aa712a90>" 241 | ] 242 | } 243 | ], 244 | "source": [ 245 | "def load_silkroad2():\n", 246 | " DATA_DIR='/home/aahu/Dropbox/black-market-recommender-systems/data/silkroad2/'\n", 247 | " l=[]\n", 248 | " for fname in os.listdir(DATA_DIR):\n", 249 | " if fname.endswith('.tsv'):\n", 250 | " try:\n", 251 | " df0 = pd.read_csv(os.path.join(DATA_DIR,fname), sep='\\t', 
parse_dates=['scrape_date'])\n", 252 | " l.append(df0)\n", 253 | " except ValueError:\n", 254 | " logger.exception('no data')\n", 255 | " pass\n", 256 | " df = pd.concat(l)\n", 257 | " logger.info(df.columns)\n", 258 | " logger.info(df.shape)\n", 259 | " \n", 260 | " #be consistent\n", 261 | " df['cat'] = df['category'].map(lambda x: x.split('-'))\n", 262 | " df['category'] = df['cat'].map(lambda x: x[-1])\n", 263 | " \n", 264 | " return df\n", 265 | "\n", 266 | "sr = load_silkroad2()\n", 267 | "sr" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 9, 273 | "metadata": { 274 | "collapsed": false 275 | }, 276 | "outputs": [ 277 | { 278 | "name": "stderr", 279 | "output_type": "stream", 280 | "text": [ 281 | "May 18 18:50:45 INFO Index(['category', 'listing', 'scrape_date', 'vendor'], dtype='object')\n", 282 | "INFO:__main__:Index(['category', 'listing', 'scrape_date', 'vendor'], dtype='object')\n", 283 | "May 18 18:50:45 INFO (3702353, 4)\n", 284 | "INFO:__main__:(3702353, 4)\n", 285 | "May 18 18:50:48 INFO (3001716, 4)\n", 286 | "INFO:__main__:(3001716, 4)\n" 287 | ] 288 | } 289 | ], 290 | "source": [ 291 | "# cn = load_cloudnine()\n", 292 | "# cn = postprocess(cn)\n", 293 | "# cn.to_csv('/home/aahu/Dropbox/black-market-recommender-systems/data/cloudnine.tsv',sep='\\t',index=False)\n", 294 | "\n", 295 | "# ag = load_agora()\n", 296 | "# ag = postprocess(ag)\n", 297 | "# ag.to_csv('/home/aahu/Dropbox/black-market-recommender-systems/data/agora.tsv',sep='\\t',index=False)\n", 298 | "\n", 299 | "#pa = load_pandora()\n", 300 | "#pa = postprocess(pa)\n", 301 | "#pa.to_csv('/home/aahu/Dropbox/black-market-recommender-systems/data/pandora.tsv',sep='\\t',index=False)\n", 302 | "\n", 303 | "#hy = load_hydra()\n", 304 | "#hy = postprocess(hy)\n", 305 | "#hy.to_csv('/home/aahu/Dropbox/black-market-recommender-systems/data/hydra.tsv',sep='\\t',index=False)\n", 306 | "\n", 307 | "# ev = load_evolution()\n", 308 | "# ev = postprocess(ev)\n", 309 | "# 
ev.to_csv('/home/aahu/Dropbox/black-market-recommender-systems/data/evolution.tsv',sep='\\t',index=False)\n", 310 | "\n", 311 | "sr2 = load_silkroad2()\n", 312 | "sr2 = postprocess(sr2)\n", 313 | "sr2.to_csv('/home/aahu/Dropbox/black-market-recommender-systems/data/silkroad2.tsv',sep='\\t',index=False)\n" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": 10, 319 | "metadata": { 320 | "collapsed": false 321 | }, 322 | "outputs": [ 323 | { 324 | "data": { 325 | "text/html": [ 326 | "
\n", 327 | "\n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | 
" \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 
| " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | "
categorylistingscrape_datevendor
0 stimulants Welcome back, gwern 0 0 0 BTC 0.0000 Hom...2014-07-21 gwern
1 stimulants NaN2014-07-21 NORWEGIANcom
2 stimulants NaN2014-07-21 drzheng
3 stimulants NaN2014-07-21 drzheng
4 stimulants NaN2014-07-21 godfatherNL
5 stimulants NaN2014-07-21 godfatherNL
6 stimulants NaN2014-07-21 godfatherNL
7 stimulants NaN2014-07-21 spencerhill
8 stimulants NaN2014-07-21 Zable
9 stimulants NaN2014-07-21 Zable
10 stimulants NaN2014-07-21 Zable
11 stimulants NaN2014-07-21 Zable
12 stimulants NaN2014-07-21 godfatherNL
13 stimulants NaN2014-07-21 godfatherNL
14 stimulants NaN2014-07-21 SheepShop
15 stimulants NaN2014-07-21 SheepShop
16 stimulants NaN2014-07-21 SheepShop
17 stimulants NaN2014-07-21 SheepShop
18 stimulants NaN2014-07-21 SheepShop
19 stimulants NaN2014-07-21 AmsterdamConnect
20 stimulants NaN2014-07-21 AmsterdamConnect
21 stimulants NaN2014-07-21 DarkVendor
22 stimulants NaN2014-07-21 DarkVendor
23 stimulants NaN2014-07-21 DarkVendor
24 stimulants NaN2014-07-21 DarkVendor
25 stimulants NaN2014-07-21 DarkVendor
26 stimulants NaN2014-07-21 DarkVendor
27 stimulants NaN2014-07-21 MeyerLansky2
28 stimulants NaN2014-07-21 MeyerLansky2
29 stimulants NaN2014-07-21 MeyerLansky2
...............
27110 hacking [METHOD #1] eBay - How I have made 100's of do...2015-01-08 Cyberzen
27111 hacking *UPDATED* Wanna BIG list of the Top 100 Darkne...2015-01-08 DrFeelG00d
27112 hacking Word Exploit | Change .EXE to .DOC - Word & EX...2015-01-08 Michael555
27113 hacking New eBay Identity2015-01-08 Cyberzen
27114 hacking Highly profitable Business ideas2015-01-08 Cyberzen
27115 hacking [ANDROID] Phone vip72 SOCKS + TOR Tutorial2015-01-08 Cyberzen
27116 hacking php web shell2015-01-08 Code
27117 hacking SNIFF WITH ANDROID2015-01-08 fake
27118 hacking DDoS Attack - A life lesson2015-01-08 BlackDuke
27119 hacking Android Remote Administration Tool (RAT) Setu...2015-01-08 BlackDuke
27120 hacking How to Spoof DNS on a LAN to Redirect Traffic ...2015-01-08 BlackDuke
27121 hacking Hacking Secrets Revealed2015-01-08 etimbuk
27122 hacking How to find someone’s IP address on Facebook2015-01-08 fake
27123 hacking How to hack MSN and HOTMAIL accounts2015-01-08 etimbuk
27124 ammunition Ammunition - .22LR Rounds2015-01-08 AUsTORe
27125 ammunition cartouche 7,65 x 17 mm Browning (.32 ACP)2015-01-08 projeccao
27126 ammunition 7,62x39 mm, AK47/SKS ammunition, bag of 1002015-01-08 ErichHartmann
27127 ammunition Rottweil 16/67,5 Kaliber. (7,4mm) 15pcs.2015-01-08 Vintorez
27128 ammunition Rottweil 16/67,5 Kaliber. (3,5mm) 30pcs.2015-01-08 Vintorez
27129 ammunition 9x19 mm Luger - SUBSONIC Hollowpoints/JHP, Mag...2015-01-08 ErichHartmann
27130 ammunition cartouche 9x19 mm Parabellum2015-01-08 projeccao
27131 ammunition cartouche .45 ACP2015-01-08 projeccao
27132 ammunition cartouche .22 Long Rifle2015-01-08 projeccao
27133 ammunition cartouche .38 Special2015-01-08 projeccao
27134 ammunition cartouche .357 Magnum2015-01-08 projeccao
27135 ammunition Bio-Hazard Gas Mask!!! Army Grade!!Protect you...2015-01-08 herostats
27136 ammunition .40 1000 Rounds Smith and wesson Remington2015-01-08 pillpusher
27137 ammunition 45 ACP 100 rounds Remington2015-01-08 pillpusher
27138 ammunition 45 ACP 1000 rounds PMC2015-01-08 pillpusher
27139 ammunition CUstom for Gravano2015-01-08 dosensuppe
\n", 767 | "

3001716 rows × 4 columns

\n", 768 | "
" 769 | ], 770 | "text/plain": [ 771 | " category listing \\\n", 772 | "0 stimulants Welcome back, gwern 0 0 0 BTC 0.0000 Hom... \n", 773 | "1 stimulants NaN \n", 774 | "2 stimulants NaN \n", 775 | "3 stimulants NaN \n", 776 | "4 stimulants NaN \n", 777 | "5 stimulants NaN \n", 778 | "6 stimulants NaN \n", 779 | "7 stimulants NaN \n", 780 | "8 stimulants NaN \n", 781 | "9 stimulants NaN \n", 782 | "10 stimulants NaN \n", 783 | "11 stimulants NaN \n", 784 | "12 stimulants NaN \n", 785 | "13 stimulants NaN \n", 786 | "14 stimulants NaN \n", 787 | "15 stimulants NaN \n", 788 | "16 stimulants NaN \n", 789 | "17 stimulants NaN \n", 790 | "18 stimulants NaN \n", 791 | "19 stimulants NaN \n", 792 | "20 stimulants NaN \n", 793 | "21 stimulants NaN \n", 794 | "22 stimulants NaN \n", 795 | "23 stimulants NaN \n", 796 | "24 stimulants NaN \n", 797 | "25 stimulants NaN \n", 798 | "26 stimulants NaN \n", 799 | "27 stimulants NaN \n", 800 | "28 stimulants NaN \n", 801 | "29 stimulants NaN \n", 802 | "... ... ... \n", 803 | "27110 hacking [METHOD #1] eBay - How I have made 100's of do... \n", 804 | "27111 hacking *UPDATED* Wanna BIG list of the Top 100 Darkne... \n", 805 | "27112 hacking Word Exploit | Change .EXE to .DOC - Word & EX... \n", 806 | "27113 hacking New eBay Identity \n", 807 | "27114 hacking Highly profitable Business ideas \n", 808 | "27115 hacking [ANDROID] Phone vip72 SOCKS + TOR Tutorial \n", 809 | "27116 hacking php web shell \n", 810 | "27117 hacking SNIFF WITH ANDROID \n", 811 | "27118 hacking DDoS Attack - A life lesson \n", 812 | "27119 hacking Android Remote Administration Tool (RAT) Setu... \n", 813 | "27120 hacking How to Spoof DNS on a LAN to Redirect Traffic ... 
\n", 814 | "27121 hacking Hacking Secrets Revealed \n", 815 | "27122 hacking How to find someone’s IP address on Facebook \n", 816 | "27123 hacking How to hack MSN and HOTMAIL accounts \n", 817 | "27124 ammunition Ammunition - .22LR Rounds \n", 818 | "27125 ammunition cartouche 7,65 x 17 mm Browning (.32 ACP) \n", 819 | "27126 ammunition 7,62x39 mm, AK47/SKS ammunition, bag of 100 \n", 820 | "27127 ammunition Rottweil 16/67,5 Kaliber. (7,4mm) 15pcs. \n", 821 | "27128 ammunition Rottweil 16/67,5 Kaliber. (3,5mm) 30pcs. \n", 822 | "27129 ammunition 9x19 mm Luger - SUBSONIC Hollowpoints/JHP, Mag... \n", 823 | "27130 ammunition cartouche 9x19 mm Parabellum \n", 824 | "27131 ammunition cartouche .45 ACP \n", 825 | "27132 ammunition cartouche .22 Long Rifle \n", 826 | "27133 ammunition cartouche .38 Special \n", 827 | "27134 ammunition cartouche .357 Magnum \n", 828 | "27135 ammunition Bio-Hazard Gas Mask!!! Army Grade!!Protect you... \n", 829 | "27136 ammunition .40 1000 Rounds Smith and wesson Remington \n", 830 | "27137 ammunition 45 ACP 100 rounds Remington \n", 831 | "27138 ammunition 45 ACP 1000 rounds PMC \n", 832 | "27139 ammunition CUstom for Gravano \n", 833 | "\n", 834 | " scrape_date vendor \n", 835 | "0 2014-07-21 gwern \n", 836 | "1 2014-07-21 NORWEGIANcom \n", 837 | "2 2014-07-21 drzheng \n", 838 | "3 2014-07-21 drzheng \n", 839 | "4 2014-07-21 godfatherNL \n", 840 | "5 2014-07-21 godfatherNL \n", 841 | "6 2014-07-21 godfatherNL \n", 842 | "7 2014-07-21 spencerhill \n", 843 | "8 2014-07-21 Zable \n", 844 | "9 2014-07-21 Zable \n", 845 | "10 2014-07-21 Zable \n", 846 | "11 2014-07-21 Zable \n", 847 | "12 2014-07-21 godfatherNL \n", 848 | "13 2014-07-21 godfatherNL \n", 849 | "14 2014-07-21 SheepShop \n", 850 | "15 2014-07-21 SheepShop \n", 851 | "16 2014-07-21 SheepShop \n", 852 | "17 2014-07-21 SheepShop \n", 853 | "18 2014-07-21 SheepShop \n", 854 | "19 2014-07-21 AmsterdamConnect \n", 855 | "20 2014-07-21 AmsterdamConnect \n", 856 | "21 2014-07-21 
DarkVendor \n", 857 | "22 2014-07-21 DarkVendor \n", 858 | "23 2014-07-21 DarkVendor \n", 859 | "24 2014-07-21 DarkVendor \n", 860 | "25 2014-07-21 DarkVendor \n", 861 | "26 2014-07-21 DarkVendor \n", 862 | "27 2014-07-21 MeyerLansky2 \n", 863 | "28 2014-07-21 MeyerLansky2 \n", 864 | "29 2014-07-21 MeyerLansky2 \n", 865 | "... ... ... \n", 866 | "27110 2015-01-08 Cyberzen \n", 867 | "27111 2015-01-08 DrFeelG00d \n", 868 | "27112 2015-01-08 Michael555 \n", 869 | "27113 2015-01-08 Cyberzen \n", 870 | "27114 2015-01-08 Cyberzen \n", 871 | "27115 2015-01-08 Cyberzen \n", 872 | "27116 2015-01-08 Code \n", 873 | "27117 2015-01-08 fake \n", 874 | "27118 2015-01-08 BlackDuke \n", 875 | "27119 2015-01-08 BlackDuke \n", 876 | "27120 2015-01-08 BlackDuke \n", 877 | "27121 2015-01-08 etimbuk \n", 878 | "27122 2015-01-08 fake \n", 879 | "27123 2015-01-08 etimbuk \n", 880 | "27124 2015-01-08 AUsTORe \n", 881 | "27125 2015-01-08 projeccao \n", 882 | "27126 2015-01-08 ErichHartmann \n", 883 | "27127 2015-01-08 Vintorez \n", 884 | "27128 2015-01-08 Vintorez \n", 885 | "27129 2015-01-08 ErichHartmann \n", 886 | "27130 2015-01-08 projeccao \n", 887 | "27131 2015-01-08 projeccao \n", 888 | "27132 2015-01-08 projeccao \n", 889 | "27133 2015-01-08 projeccao \n", 890 | "27134 2015-01-08 projeccao \n", 891 | "27135 2015-01-08 herostats \n", 892 | "27136 2015-01-08 pillpusher \n", 893 | "27137 2015-01-08 pillpusher \n", 894 | "27138 2015-01-08 pillpusher \n", 895 | "27139 2015-01-08 dosensuppe \n", 896 | "\n", 897 | "[3001716 rows x 4 columns]" 898 | ] 899 | }, 900 | "execution_count": 10, 901 | "metadata": {}, 902 | "output_type": "execute_result" 903 | } 904 | ], 905 | "source": [ 906 | "ev" 907 | ] 908 | }, 909 | { 910 | "cell_type": "code", 911 | "execution_count": 11, 912 | "metadata": { 913 | "collapsed": false 914 | }, 915 | "outputs": [ 916 | { 917 | "data": { 918 | "text/plain": [ 919 | "fake 113056\n", 920 | "rc4me 42827\n", 921 | "need4weed 42588\n", 922 | "profesorhouse 
37186\n", 923 | "gwern 35980\n", 924 | "RepAAA 35560\n", 925 | "optiman 34522\n", 926 | "sexyhomer 30600\n", 927 | "Magicalthings123 28045\n", 928 | "FoxyGirl 23205\n", 929 | "DrPlatypus 21013\n", 930 | "etimbuk 20863\n", 931 | "cerberus 20190\n", 932 | "theben 19920\n", 933 | "AlbertHeijn 18801\n", 934 | "...\n", 935 | "fraudster 1\n", 936 | "artash 1\n", 937 | "EVERYTHING_CHEAP 1\n", 938 | "tornado888 1\n", 939 | "ViBE 1\n", 940 | "firepower 1\n", 941 | "franktors 1\n", 942 | "Kastell 1\n", 943 | "ambra 1\n", 944 | "5cent 1\n", 945 | "DonkeySausage 1\n", 946 | "freshvybz 1\n", 947 | "Bossen 1\n", 948 | "TheDruMonSer 1\n", 949 | "thecornershop 1\n", 950 | "Length: 4139, dtype: int64" 951 | ] 952 | }, 953 | "execution_count": 11, 954 | "metadata": {}, 955 | "output_type": "execute_result" 956 | } 957 | ], 958 | "source": [ 959 | "ev['vendor'].value_counts()" 960 | ] 961 | }, 962 | { 963 | "cell_type": "code", 964 | "execution_count": null, 965 | "metadata": { 966 | "collapsed": true 967 | }, 968 | "outputs": [], 969 | "source": [] 970 | } 971 | ], 972 | "metadata": { 973 | "kernelspec": { 974 | "display_name": "Python 3", 975 | "language": "python", 976 | "name": "python3" 977 | }, 978 | "language_info": { 979 | "codemirror_mode": { 980 | "name": "ipython", 981 | "version": 3 982 | }, 983 | "file_extension": ".py", 984 | "mimetype": "text/x-python", 985 | "name": "python", 986 | "nbconvert_exporter": "python", 987 | "pygments_lexer": "ipython3", 988 | "version": "3.4.0" 989 | } 990 | }, 991 | "nbformat": 4, 992 | "nbformat_minor": 0 993 | } 994 | --------------------------------------------------------------------------------