├── restaurant_scraper ├── restaurantSpider │ ├── __init__.py │ ├── spiders │ │ ├── __init__.py │ │ ├── README │ │ ├── extractRawText.py │ │ └── EaterSpider.py │ ├── items.py │ ├── pipelines.py │ └── settings.py ├── README └── scrapy.cfg ├── README.md ├── CONTRIBUTING.txt ├── wikipage_info_extractor └── wiki_home_construction_crawl.py ├── matching_schemas └── example.py ├── wiki_to_json └── Wikipedia_JSON_Generator.ipynb └── matching_movies ├── Tutorial_py3.py ├── Tutorial_py2.py └── Tutorial_py3.ipynb /restaurant_scraper/restaurantSpider/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /restaurant_scraper/README: -------------------------------------------------------------------------------- 1 | Goto subdirectory restuarantSpider/spiders/README for the actual readme file. 2 | -------------------------------------------------------------------------------- /restaurant_scraper/restaurantSpider/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /restaurant_scraper/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = restaurantSpider.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = restaurantSpider 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # BigGorilla 2 | BigGorilla is an open-source data integration and data preparation ecosystem 3 | (powered by Python) to enable data scientists to perform integration and 4 | analysis of data. Learn more about BigGorilla at [www.biggorilla.org](http://www.biggorilla.org). 5 | 6 | ## Directories: 7 | * packages: contains packages developed as part of BigGorilla 8 | * workflows: contains notebooks and python scripts 9 | -------------------------------------------------------------------------------- /restaurant_scraper/restaurantSpider/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | class RestaurantSpiderItem(scrapy.Item): 11 | # define the fields for your item here like: 12 | # url: url of page being crawled 13 | # title: title of the page 14 | # date: date that the page is posted 15 | # content: the crawled content 16 | url = scrapy.Field() 17 | title = scrapy.Field() 18 | date = scrapy.Field() 19 | content = scrapy.Field() 20 | -------------------------------------------------------------------------------- 
/restaurant_scraper/restaurantSpider/spiders/README: -------------------------------------------------------------------------------- 1 | # 2 | # Example of a pipeline for scraping content from coffee websites. 3 | # This pipeline can be tailored for different websites. 4 | # 5 | 6 | 7 | # 8 | # scrap with EatersSpider 9 | # 10 | scrapy runspider EaterSpider.py -s OUTFILE="Eater-acquired.json" 11 | 12 | # 13 | # The above generates Eater-acquired.json. 14 | # 15 | # Each url is extracted as one Json item {date:-, url:-, content:[ ]}, 16 | # where content contains the paragraphs under

tags of the url. 17 | # 18 | 19 | # 20 | # Now remove html tags and newline characters from the content 21 | # 22 | ./extractRawText.py Eater-acquired.json Eater-clean.json 23 | 24 | -------------------------------------------------------------------------------- /restaurant_scraper/restaurantSpider/spiders/extractRawText.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | """ 3 | Authored by Wang-Chiew Tan 4 | """ 5 | import re 6 | import json 7 | import sys 8 | 9 | # remove all html tags, new line characters within the extracted paragraphs

10 | def cleanhtml(extracted): 11 | # remove html tags 12 | cleantext = re.sub('<.*?>', '', extracted) 13 | # remove all new lines 14 | cleantext = re.sub('\n *', "", cleantext) 15 | return cleantext 16 | 17 | 18 | def main(): 19 | ifilename = str(sys.argv[1]) 20 | ofilename = str(sys.argv[2]) 21 | with open(ofilename, 'w') as outfile: 22 | with open(ifilename, 'r') as ifile: 23 | for json_line in ifile: 24 | data = json.loads(json_line) 25 | newdata = [] 26 | for s in data["content"]: 27 | newdata.append(cleanhtml(s)) 28 | data["content"] = newdata 29 | json.dump(data, outfile) 30 | 31 | if __name__ == "__main__": 32 | main() 33 | -------------------------------------------------------------------------------- /CONTRIBUTING.txt: -------------------------------------------------------------------------------- 1 | ### CONTRIBUTION TO BIGGORILLA 2 | ------------------------------ 3 | 4 | BigGorilla is an open-source framework for data integration 5 | and data preparation tasks. We encourage all researchers, 6 | engineers, professors and students who work on data preparation 7 | and data integration tasks to contribute to BigGorilla. To make 8 | sure your contribution can be easily deployed by others, the 9 | BigGorilla team will review and test your code. Please read the 10 | following instructions before submitting your contribution. 11 | 12 | * If you are submitting a sample workflow, you need to create a 13 | new folder under the "workflows" folder. 14 | * If you are submitting a python package make sure that you create 15 | a new folder with the name of your package under the "packages" folder. 16 | * Include a file titled "Authors.txt" inside your folder and mention 17 | the creators as well as any publications that should be cited for 18 | academic purposes. 19 | * If your submission uses large datasets, make sure to host them 20 | somewhere else and point to them in your submission (in a README file). 
21 | * Note that by submitting your work, you are making it public for 22 | everyone and can be used for commercial as well as non-commercial and 23 | academic use. 24 | -------------------------------------------------------------------------------- /wikipage_info_extractor/wiki_home_construction_crawl.py: -------------------------------------------------------------------------------- 1 | # This script crawls a couple of wiki urls, extracts the titles and 2 | # the first paragraphs and stores them in a json file. 3 | import urllib2 4 | import json 5 | from bs4 import BeautifulSoup 6 | 7 | data = [] 8 | header = {'User-Agent': 'Mozilla/5.0'} #Needed to prevent 403 error on Wikipedia 9 | wiki_urls = [ 10 | 'https://en.wikipedia.org/wiki/Adobe', 11 | 'https://en.wikipedia.org/wiki/Brick', 12 | 'https://en.wikipedia.org/wiki/Concrete', 13 | 'https://en.wikipedia.org/wiki/Trunk_(botany)', 14 | 'https://en.wikipedia.org/wiki/Metal', 15 | 'https://en.wikipedia.org/wiki/Stone_(disambiguation)', 16 | 'https://en.wikipedia.org/wiki/Rock_(geology)', 17 | 'https://en.wikipedia.org/wiki/Straw', 18 | 'https://en.wikipedia.org/wiki/Wood' 19 | ] 20 | 21 | for wiki in wiki_urls: 22 | feature_dict = {} 23 | req = urllib2.Request(wiki,headers=header) 24 | page = urllib2.urlopen(req) 25 | 26 | #Parse the html in the 'page' variable, and store it in Beautiful Soup format 27 | soup = BeautifulSoup(page, 'html.parser') 28 | 29 | feature_dict["description"] = soup.p.get_text() 30 | feature_dict["title"] = soup.h1.get_text() 31 | data.append(feature_dict) 32 | 33 | 34 | with open('wiki_home_construction_features.json', 'w') as jsonData: 35 | json.dump(data, jsonData) 36 | -------------------------------------------------------------------------------- /restaurant_scraper/restaurantSpider/pipelines.py: -------------------------------------------------------------------------------- 1 | """ 2 | Authored by Wang-Chiew Tan 3 | """ 4 | 5 | # -*- coding: utf-8 -*- 6 | 7 | # Define your item 
pipelines here 8 | # 9 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 10 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 11 | from scrapy import signals 12 | from scrapy.exporters import JsonLinesItemExporter 13 | 14 | 15 | # 16 | # this pipeline writes each item to the file specified. it 17 | # gets called with each item. 18 | # 19 | class RestaurantSpiderPipeline(object): 20 | filename = "" 21 | 22 | @classmethod 23 | def from_crawler(cls, crawler): 24 | settings = crawler.settings 25 | # get the specified filename to write to 26 | filename = settings.get("OUTFILE") 27 | pipeline = cls(filename) 28 | crawler.signals.connect(pipeline.spider_opened, signals.spider_opened) 29 | crawler.signals.connect(pipeline.spider_closed, signals.spider_closed) 30 | return pipeline 31 | 32 | def __init__(self, filename): 33 | # open the file for writing 34 | self.file = open(filename, 'w+b') 35 | 36 | def spider_opened(self, spider): 37 | self.exporter = JsonLinesItemExporter(self.file) 38 | self.exporter.start_exporting() 39 | 40 | def spider_closed(self, spider): 41 | self.exporter.finish_exporting() 42 | self.file.close() 43 | 44 | def process_item(self, item, spider): 45 | self.exporter.export_item(item) 46 | return item 47 | 48 | -------------------------------------------------------------------------------- /matching_schemas/example.py: -------------------------------------------------------------------------------- 1 | import flexmatcher 2 | import pandas as pd 3 | 4 | # The mediated schema has three attributes: movie_name, movie_year, movie_rating 5 | 6 | # Creating the first schema, a subset of its data and the mapping to the mediated schema 7 | vals1 = [['year', 'Movie', 'imdb_rating'], 8 | ['2001', 'Lord of the Rings', '8.8'], 9 | ['2010', 'Inception', '8.7'], 10 | ['1999', 'The Matrix', '8.7']] 11 | header = vals1.pop(0) 12 | data1 = pd.DataFrame(vals1, columns=header) 13 | data1_mapping = {'year': 'movie_year', 'imdb_rating': 
'movie_rating', 'Movie': 'movie_name'} 14 | 15 | # Creating the second schema, a subset of its data and the mapping to the mediated schema 16 | vals2 = [['title', 'produced', 'popularity'], 17 | ['The Godfather', '1972', '9.2'], 18 | ['Silver Linings Playbook', '2012', '7.8'], 19 | ['The Big Short', '2015', '7.8']] 20 | header = vals2.pop(0) 21 | data2 = pd.DataFrame(vals2, columns=header) 22 | data2_mapping = {'popularity': 'movie_rating', 'produced': 'movie_year', 'title': 'movie_name'} 23 | 24 | # Using Flexmatcher 25 | fm = flexmatcher.FlexMatcher() 26 | schema_list = [data1, data2] 27 | mapping_list = [data1_mapping, data2_mapping] 28 | fm.create_training_data(schema_list, mapping_list) 29 | fm.train() 30 | 31 | # Creating a test schmea 32 | vals3 = [['rt', 'id', 'yr'], 33 | ['8.5', 'The Pianist', '2002'], 34 | ['7.7', 'The Social Network', '2010']] 35 | header = vals3.pop(0) 36 | data3 = pd.DataFrame(vals3, columns=header) 37 | print (fm.make_prediction(data3)) 38 | -------------------------------------------------------------------------------- /restaurant_scraper/restaurantSpider/spiders/EaterSpider.py: -------------------------------------------------------------------------------- 1 | """ 2 | Authored by Wang-Chiew Tan 3 | """ 4 | from scrapy.selector import Selector 5 | from scrapy.http import HtmlResponse 6 | from restaurantSpider.items import RestaurantSpiderItem 7 | from scrapy.crawler import CrawlerProcess 8 | import scrapy 9 | #import sys, os 10 | 11 | # 12 | # spider for crawling www.eater.com/review 13 | # 14 | class EaterSpider(scrapy.Spider): 15 | name = "EaterSpider" 16 | allowed_domains = [] 17 | start_urls = [] 18 | 19 | # url to page. You can replace this with your site url 20 | urlstr = "http://myexample.com" 21 | # suppose we are scraping pages 1 to 25 of this url. 
add all urls 22 | # to "start_urls" 23 | for i in xrange(1,25): 24 | start_urls.append(urlstr+str(i)) 25 | 26 | print("=== Start URLs: {}".format(start_urls)) 27 | 28 | def parse(self, response): 29 | print "=== Starting to crawl the website === " 30 | # 31 | # The following are all hypothetical. We will extract the urls 32 | # in each page (which we will scrap individually). We will 33 | # also collect the corresponding titles and dates of the urls. 34 | # 35 | urls = response.selector.xpath('//h3/a[@data-analytics-link="review"]/@href').extract() 36 | titles = response.selector.xpath('//h3/a[@data-analytics-link="review"]/text()').extract() 37 | dates = response.selector.xpath('//div[@class="m-entry-box__body"]/p/span[@class="p-byline__time"]/text()').extract() 38 | 39 | items = [] 40 | for j in xrange(0,len(urls)): 41 | # item(url,title,date,content) is defined in items.py 42 | i = RestaurantSpiderItem(url=urls[j], title=titles[j], date=dates[j]) 43 | items.append(i) 44 | # start scraping the content 45 | request = scrapy.Request(url=urls[j], callback=self.parse_cafe, errback=self.parse_error) 46 | request.meta['item'] = i # pass item information to pass to parse_cafe 47 | yield request 48 | 49 | # capture and print error messages on console if needed 50 | def parse_error(self, response): 51 | item = response.meta['item'] 52 | print("=== Error on {} ===".format(item['url'])) 53 | yield item 54 | 55 | def parse_cafe(self, response): 56 | item = response.meta['item'] 57 | print("=== Retrieving {} ===".format(item['url'])) 58 | # extracting all paragraphs from the article 59 | item['content'] = response.selector.xpath('//p').extract() 60 | yield item 61 | -------------------------------------------------------------------------------- /restaurant_scraper/restaurantSpider/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for restaurantSpider project 4 | # 5 | # For 
simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'restaurantSpider' 13 | 14 | SPIDER_MODULES = ['restaurantSpider.spiders'] 15 | NEWSPIDER_MODULE = 'restaurantSpider.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | USER_AGENT = 'restaurantSpider (+http://www.yourdomain.com)' 20 | #USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.102 Safari/537.36" 21 | 22 | # Obey robots.txt rules 23 | ROBOTSTXT_OBEY = False 24 | 25 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 26 | CONCURRENT_REQUESTS = 32 27 | 28 | # Configure a delay for requests for the same website (default: 0) 29 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 30 | # See also autothrottle settings and docs 31 | DOWNLOAD_DELAY = 3 32 | # The download delay setting will honor only one of: 33 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 34 | #CONCURRENT_REQUESTS_PER_IP = 16 35 | 36 | # Disable cookies (enabled by default) 37 | #COOKIES_ENABLED = False 38 | 39 | # Disable Telnet Console (enabled by default) 40 | #TELNETCONSOLE_ENABLED = False 41 | 42 | # Override the default request headers: 43 | #DEFAULT_REQUEST_HEADERS = { 44 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 45 | # 'Accept-Language': 'en', 46 | #} 47 | 48 | # Enable or disable spider middlewares 49 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 50 | #SPIDER_MIDDLEWARES = { 51 | # 'restaurantSpider.middlewares.MyCustomSpiderMiddleware': 543, 52 | #} 53 | 54 | # 
Enable or disable downloader middlewares 55 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 56 | #DOWNLOADER_MIDDLEWARES = { 57 | # 'restaurantSpider.middlewares.MyCustomDownloaderMiddleware': 543, 58 | #} 59 | 60 | # Enable or disable extensions 61 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 62 | #EXTENSIONS = { 63 | # 'scrapy.extensions.telnet.TelnetConsole': None, 64 | #} 65 | 66 | # Configure item pipelines 67 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 68 | ITEM_PIPELINES = { 69 | 'restaurantSpider.pipelines.RestaurantSpiderPipeline': 300, 70 | } 71 | 72 | 73 | # Enable and configure the AutoThrottle extension (disabled by default) 74 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 75 | #AUTOTHROTTLE_ENABLED = True 76 | # The initial download delay 77 | #AUTOTHROTTLE_START_DELAY = 5 78 | # The maximum download delay to be set in case of high latencies 79 | #AUTOTHROTTLE_MAX_DELAY = 60 80 | # The average number of requests Scrapy should be sending in parallel to 81 | # each remote server 82 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 83 | # Enable showing throttling stats for every response received: 84 | #AUTOTHROTTLE_DEBUG = False 85 | 86 | # Enable and configure HTTP caching (disabled by default) 87 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 88 | #HTTPCACHE_ENABLED = True 89 | #HTTPCACHE_EXPIRATION_SECS = 0 90 | #HTTPCACHE_DIR = 'httpcache' 91 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 92 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 93 | -------------------------------------------------------------------------------- /wiki_to_json/Wikipedia_JSON_Generator.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Wikipedia JSON 
Generator\n", 8 | "-------\n", 9 | "\n", 10 | "## This ipython notebook provides a quick and dirty implementation converting Wikipedia raw texts into JSON format.\n", 11 | "\n", 12 | "1. In the code below, the file wikipedia.txt is a small sample of the Wikipedia dump file for the purposes of illustrating our code. The filw is obtained as follows:\n", 13 | " Apply [WikiExtractor](http://medialab.di.unipi.it/Project/SemaWiki/Tools/WikiExtractor.py) on the wikipedia dump file, such as [this](https://dumps.wikimedia.org/enwiki/20161101/enwiki-20161101-pages-articles.xml.bz2). Obtain a small sample of the resulting file, which is wikipedia.txt. \n", 14 | "\n", 15 | "2. Retrieve (article title, article content text) pairs and generate a JSON file" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "metadata": { 22 | "collapsed": true 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "# You can download a sample wikipedia.txt file from here:\n", 27 | "# https://anaconda.org/BigGorilla/datasets/1/download/wikipedia.txt\n", 28 | "wikipedia_file_path = \"wikipedia.txt\"\n", 29 | "output_json_file_path = \"wikipedia.json\"" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": { 36 | "collapsed": false 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "def load_wikifile(file_path):\n", 41 | " result = []\n", 42 | " with open(file_path, \"r\") as read_f:\n", 43 | " wikititle = \"\"\n", 44 | " wikitext = \"\"\n", 45 | " start_flg = False\n", 46 | " for line in read_f:\n", 47 | " line = line.rstrip()\n", 48 | " if line == \"\":\n", 49 | " continue\n", 50 | " if len(line) >= 8 and line[:8] == \"= 6 and line[:6] == \"\":\n", 58 | " # retrieve only the title and content pairs\n", 59 | " result.append({\"title\": wikititle,\n", 60 | " \"text\": wikitext.rstrip()})\n", 61 | " wikitext = \"\"\n", 62 | " continue\n", 63 | " wikitext += line + '\\n'\n", 64 | " return result" 65 | ] 66 | }, 67 | { 68 | "cell_type": 
"code", 69 | "execution_count": 3, 70 | "metadata": { 71 | "collapsed": false 72 | }, 73 | "outputs": [], 74 | "source": [ 75 | "import json\n", 76 | "\n", 77 | "wiki_data = load_wikifile(wikipedia_file_path)\n", 78 | "with open(output_json_file_path, 'w') as outfile:\n", 79 | " # write out wiki_data in json format\n", 80 | " json.dump(wiki_data, outfile, indent=4)" 81 | ] 82 | } 83 | ], 84 | "metadata": { 85 | "kernelspec": { 86 | "display_name": "Python 2", 87 | "language": "python", 88 | "name": "python2" 89 | }, 90 | "language_info": { 91 | "codemirror_mode": { 92 | "name": "ipython", 93 | "version": 2 94 | }, 95 | "file_extension": ".py", 96 | "mimetype": "text/x-python", 97 | "name": "python", 98 | "nbconvert_exporter": "python", 99 | "pygments_lexer": "ipython2", 100 | "version": "2.7.12" 101 | } 102 | }, 103 | "nbformat": 4, 104 | "nbformat_minor": 0 105 | } 106 | -------------------------------------------------------------------------------- /matching_movies/Tutorial_py3.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # # Part 1: Data Acquistion 5 | # -------------------------- 6 | # BigGorilla recommends a list of tools for different data acquisition tasks (See [here]()). Among these tools, **urllib** is a popular python package for fetching data across the web. In this part, we use **urllib** to download the datasets that we need for this tutorial. 7 | # 8 | # ### Step 1: downloading the "Kaggle 5000 Movie Dataset" 9 | # The desired dataset is a _.csv_ file with a url that is specified in the code snippet below. 
10 | 11 | # In[1]: 12 | 13 | # Importing urlib (BigGorilla's recommendation for data acquisition from the web) 14 | import urllib.request 15 | import os 16 | 17 | # Creating the data folder 18 | if not os.path.exists('./data'): 19 | os.makedirs('./data') 20 | 21 | # Obtaining the dataset using the url that hosts it 22 | kaggle_url = 'https://github.com/sundeepblue/movie_rating_prediction/raw/master/movie_metadata.csv' 23 | if not os.path.exists('./data/kaggle_dataset.csv'): # avoid downloading if the file exists 24 | response = urllib.request.urlretrieve(kaggle_url, './data/kaggle_dataset.csv') 25 | 26 | 27 | # ### Step 2: downloading the "IMDB Plain Text Data" 28 | # The IMDB Plain Text Data (see [here](ftp://ftp.funet.fi/pub/mirrors/ftp.imdb.com/pub/)) is a collection of files where each files describe one or a few attributes of a movie. We are going to focus on a subset of movie attribues which subsequently means that we are only interested in a few of these files which are listed below: 29 | # 30 | # * genres.list.gz 31 | # * ratings.list.gz 32 | # 33 | # _** Note: The total size of files mentioned above is roughly 30M. Running the following code may take a few minutes._ 34 | 35 | # In[2]: 36 | 37 | import gzip 38 | 39 | # Obtaining IMDB's text files 40 | imdb_url_prefix = 'ftp://ftp.funet.fi/pub/mirrors/ftp.imdb.com/pub/' 41 | imdb_files_list = ['genres.list.gz', 'ratings.list.gz'] 42 | for name in imdb_files_list: 43 | if not os.path.exists('./data/' + name): 44 | response = urllib.request.urlretrieve(imdb_url_prefix + name, './data/' + name) 45 | with gzip.open('./data/' + name) as comp_file, open('./data/' + name[:-3], 'wb') as reg_file: 46 | file_content = comp_file.read() 47 | reg_file.write(file_content) 48 | 49 | 50 | # ### Step 3: downloading the "IMDB Prepared Data" 51 | # During this tutorial, we discuss how the contents of _genres.list.gz_ and _ratings.list.gz_ files can be integrated. 
# However, to make the tutorial more concise, we avoid including the same process
# for all the files in the "IMDB Plain Text Data". The "IMDB Prepared Data" is the
# dataset that we obtained by integrating a number of files from the "IMDB Plain
# Text Data" which we will use during later stages of this tutorial. The following
# code snippet downloads this dataset.

# In[3]:

imdb_url = 'https://anaconda.org/BigGorilla/datasets/1/download/imdb_dataset.csv'
if not os.path.exists('./data/imdb_dataset.csv'):  # avoid downloading if the file exists
    # BUGFIX: the original passed kaggle_url here, which silently re-downloaded the
    # Kaggle CSV into imdb_dataset.csv. Use imdb_url, defined just above.
    response = urllib.request.urlretrieve(imdb_url, './data/imdb_dataset.csv')


# -----

# # Part 2: Data Extraction
# -----------------
# The "Kaggle 5000 Movie Dataset" is stored in a _.csv_ file which is already
# structured and ready to use. On the other hand, the "IMDB Plain Text Data" is a
# collection of semi-structured text files that need to be processed to extract
# the data. A quick look at the first few lines of each file shows that each file
# has a different format and has to be handled separately.
#
# ##### Content of "ratings.list" data file

# In[4]:

with open("./data/ratings.list", encoding='latin1') as myfile:
    head = [next(myfile) for x in range(38)]
print (''.join(head[28:38]))  # skipping the first 28 lines as they are descriptive headers


# ##### Content of the "genres.list" data file

# In[5]:

with open("./data/genres.list", encoding='latin1') as myfile:
    head = [next(myfile) for x in range(392)]
print (''.join(head[382:392]))  # skipping the first 382 lines as they are descriptive header


# ### Step 1: Extracting the information from "genres.list"
# The goal of this step is to extract the movie titles and their production year
# from "genres.list" (the original text said "movies.list", but the code below
# reads genres.list), and store the extracted data into a dataframe.
Dataframe (from the python package **pandas**) is one of the key BigGorilla's recommendation for data profiling and cleaning. To extract the desired information from the text, we rely on **regular expressions** which are implemented in the python package "**re**". 86 | 87 | # In[6]: 88 | 89 | import re 90 | import pandas as pd 91 | 92 | with open("./data/genres.list", encoding='latin1') as genres_file: 93 | raw_content = genres_file.readlines() 94 | genres_list = [] 95 | content = raw_content[382:] 96 | for line in content: 97 | m = re.match(r'"?(.*[^"])"? \(((?:\d|\?){4})(?:/\w*)?\).*\s((?:\w|-)+)', line.strip()) 98 | genres_list.append([m.group(1), m.group(2), m.group(3)]) 99 | genres_data = pd.DataFrame(genres_list, columns=['movie', 'year', 'genre']) 100 | 101 | 102 | # ### Step 2: Extracting the information from "ratings.list" 103 | 104 | # In[7]: 105 | 106 | with open("./data/ratings.list", encoding='latin1') as ratings_file: 107 | raw_content = ratings_file.readlines() 108 | ratings_list = [] 109 | content = raw_content[28:] 110 | for line in content: 111 | m = re.match(r'(?:\d|\.|\*){10}\s+\d+\s+(1?\d\.\d)\s"?(.*[^"])"? \(((?:\d|\?){4})(?:/\w*)?\)', line.strip()) 112 | if m is None: continue 113 | ratings_list.append([m.group(2), m.group(3), m.group(1)]) 114 | ratings_data = pd.DataFrame(ratings_list, columns=['movie', 'year', 'rating']) 115 | 116 | 117 | # Note that one has to repeat the information extraction procedure for other data files as well if he is interested in their content. For now (and to keep the tutorial simple), we assume that we are only interested in genres and ratings of movies. The above code snippets store the extracted data on these two attributes into two dataframes (namely, **genres_list** and **ratings_list**). 
118 | # 119 | # ------ 120 | 121 | # # Part 3: Data Profiling & Cleaning 122 | # --------------------------- 123 | # 124 | # The high-level goal in this stage of data prepration is to look into the data that we have acquired and extracted so far. This helps us to get familiar with data, understand in what ways the data needs cleaning or transformation, and finally enables us to prepare the data for the following steps of the data integration task. 125 | # 126 | # ### Step 1: Loading the "Kaggle 5000 Movies Dataset" 127 | # 128 | # According to BigGorilla, dataframes (from the python package **pandas**) are suitable for data exploration and data profiling. In [Part 2](https://github.com/rit-git/BigGorilla/blob/tutorial/Tutorial/Part%202%20--%20Data%20Extraction.ipynb) of the tutorial, we stored the extracted data from "IMDB Plain Text Data" into dataframes. It would be appropriate to load the "Kaggle 5000 Movies Dataset" into a dataframe as well and follow the same data profiling procedure for all datasets. 129 | 130 | # In[8]: 131 | 132 | import pandas as pd 133 | 134 | # Loading the Kaggle dataset from the .csv file (kaggle_dataset.csv) 135 | kaggle_data = pd.read_csv('./data/kaggle_dataset.csv') 136 | 137 | 138 | # ### Step 2: Calculating some basic statistics (profiling) 139 | # 140 | # Let's start by finding out how many movies are listed in each dataframe. 141 | 142 | # In[9]: 143 | 144 | print ('Number of movies in kaggle_data: {}'.format(kaggle_data.shape[0])) 145 | print ('Number of movies in genres_data: {}'.format(genres_data.shape[0])) 146 | print ('Number of movies in ratings_data: {}'.format(ratings_data.shape[0])) 147 | 148 | 149 | # We can also check to see if we have duplicates (i.e., a movie appearing more than once) in the data. We consider an entry duplicate if we can find another entry with the same movie title and production year. 
150 | 151 | # In[10]: 152 | 153 | print ('Number of duplicates in kaggle_data: {}'.format( 154 | sum(kaggle_data.duplicated(subset=['movie_title', 'title_year'], keep=False)))) 155 | print ('Number of duplicates in genres_data: {}'.format( 156 | sum(genres_data.duplicated(subset=['movie', 'year'], keep=False)))) 157 | print ('Number of duplicates in ratings_data: {}'.format( 158 | sum(ratings_data.duplicated(subset=['movie', 'year'], keep=False)))) 159 | 160 | 161 | # ### Step 3: Dealing with duplicates (cleaning) 162 | # 163 | # There are many strategies to deal with duplicates. Here, we are going to use a simple method for dealing with duplicates and that is to only keep the first occurrence of a duplicated entry and remove the rest. 164 | 165 | # In[11]: 166 | 167 | kaggle_data = kaggle_data.drop_duplicates(subset=['movie_title', 'title_year'], keep='first').copy() 168 | genres_data = genres_data.drop_duplicates(subset=['movie', 'year'], keep='first').copy() 169 | ratings_data = ratings_data.drop_duplicates(subset=['movie', 'year'], keep='first').copy() 170 | 171 | 172 | # ### Step 4: Normalizing the text (cleaning) 173 | # 174 | # The key attribute that we will use to integrate our movie datasets is the movie titles. So it is important to normalize these titles. The following code snippet makes all movie titles lower case, and then removes certain characters such as "'" and "?", and replaces some other special characters (e.g., "&" is replaced with "and"). 
175 | 176 | # In[12]: 177 | 178 | def preprocess_title(title): 179 | title = title.lower() 180 | title = title.replace(',', ' ') 181 | title = title.replace("'", '') 182 | title = title.replace('&', 'and') 183 | title = title.replace('?', '') 184 | return title.strip() 185 | 186 | kaggle_data['norm_movie_title'] = kaggle_data['movie_title'].map(preprocess_title) 187 | genres_data['norm_movie'] = genres_data['movie'].map(preprocess_title) 188 | ratings_data['norm_movie'] = ratings_data['movie'].map(preprocess_title) 189 | 190 | 191 | # ### Step 5: Looking at a few samples 192 | # 193 | # The goal here is to a look at a few sample entries from each dataset for a quick sanity check. To keep the tutorial consice, we just present this step for the "Kaggle 5000 Movies Dataset" which is stored in the **kaggle_data** dataframe. 194 | 195 | # In[13]: 196 | 197 | kaggle_data.sample(3, random_state=0) 198 | 199 | 200 | # Looking at the data guides us to decide in what ways we might want to clean the data. For instance, the small sample data shown above, reveals that the **title_year** attribute is stored as floats (i.e., rational numbers). We can add another cleaning step to transform the **title_year** into strings and replace the missing title years with symbol **"?"**. 201 | 202 | # In[14]: 203 | 204 | def preprocess_year(year): 205 | if pd.isnull(year): 206 | return '?' 207 | else: 208 | return str(int(year)) 209 | 210 | kaggle_data['norm_title_year'] = kaggle_data['title_year'].map(preprocess_year) 211 | kaggle_data.head() 212 | 213 | 214 | # ----- 215 | 216 | # # Part 4: Data Matching & Merging 217 | # ------------------------- 218 | # The main goal in this part is go match the data that we have acquired from different sources to create a single rich dataset. 
Recall that in [Part 3](https://github.com/rit-git/BigGorilla/blob/tutorial/Tutorial/Part%203%20--%20Data%20Profiling%20%26%20Cleaning.ipynb), we transformed all datasets into a dataframe which we used to clean the data. In this part, we continue using the same dataframes for the data that we have prepared so far. 219 | # 220 | # ### Step 1: Integrating the "IMDB Plain Text Data" files 221 | # Note that both **ratings_data** and **genres_data** dataframes contain data that come from the same source (i.e., "the IMDB Plain Text data"). Thus, we assume that there are no inconsistencies between the data stored in these dataframe and to combine them, all we need to do is to match the entries that share the same title and production year. This simple "exact match" can be done simply using dataframes. 222 | 223 | # In[15]: 224 | 225 | brief_imdb_data = pd.merge(ratings_data, genres_data, how='inner', on=['norm_movie', 'year']) 226 | brief_imdb_data.head() 227 | 228 | 229 | # We refer to the dataset created above as the **brief_imdb_data** since it only contains two attributes (namely, genre and rating). Henceforth, we are going to use a richer version of the IMDB dataset which we created by integrating a number of files from the "IMDB Plain Text Data". If you have completed the first part of this tutorial, then this dataset is already downloaded and stored in *"imdb_dataset.csv"* under the _"data"_ folder. The following code snippet loads this dataset, does preprocessing on the title and production year of movies, removes the duplicates as before, and prints the size of the dataset. 
# In[16]:

# Read the richer IMDB dataset prepared in Part 1.
imdb_data = pd.read_csv('./data/imdb_dataset.csv')
# Normalize the title and year as we did in Part 3 of the tutorial, then
# de-duplicate on the normalized columns.
imdb_data['norm_title'] = imdb_data['title'].map(preprocess_title)
imdb_data['norm_year'] = imdb_data['year'].map(preprocess_year)
imdb_data = imdb_data.drop_duplicates(subset=['norm_title', 'norm_year'], keep='first').copy()
imdb_data.shape


# ### Step 2: Integrating the Kaggle and IMDB datasets
#
# A simple approach to integrate the two datasets is to simply join entries that
# share the same movie title and year of production. The following code reveals
# that 4,248 matches are found using this simple approach.

# In[17]:

data_attempt1 = pd.merge(imdb_data, kaggle_data, how='inner',
                         left_on=['norm_title', 'norm_year'],
                         right_on=['norm_movie_title', 'norm_title_year'])
data_attempt1.shape


# But given that the IMDB and Kaggle datasets are collected from different
# sources, chances are that the name of a movie would be slightly different in
# these datasets (e.g. "Wall.E" vs "WallE"). To be able to find such matches, one
# can look at the similarity of movie titles and consider titles with high
# similarity to be the same entity. BigGorilla's recommendation for doing a
# similarity join across two datasets is the python package **py_stringsimjoin**.
# The following code snippet uses **py_stringsimjoin** to match all the titles
# that have an edit distance of one or less (i.e., there is at most one character
# that needs to be changed/added/removed to make both titles identical). Once the
# similarity join is complete, it only selects the title pairs that are produced
# in the same year.
# In[18]:

import py_stringsimjoin as ssj
import py_stringmatching as sm

# Both tables need a unique integer key for the join.
imdb_data['id'] = range(imdb_data.shape[0])
kaggle_data['id'] = range(kaggle_data.shape[0])
similar_titles = ssj.edit_distance_join(imdb_data, kaggle_data, 'id', 'id',
                                        'norm_title', 'norm_movie_title',
                                        l_out_attrs=['norm_title', 'norm_year'],
                                        r_out_attrs=['norm_movie_title', 'norm_title_year'],
                                        threshold=1)
# Keep only the pairs that also share the production year.
data_attempt2 = similar_titles[similar_titles.r_norm_title_year == similar_titles.l_norm_year]
data_attempt2.shape


# We can see that using the similarity join 4,689 titles were matched. Let's look
# at some of the titles that are matched by the similarity join but are not identical.

# In[19]:

data_attempt2[data_attempt2.l_norm_title != data_attempt2.r_norm_movie_title].head()


# While instances such as "walle" and "wall.e" are correctly matched, we can see
# that this technique also makes some errors (e.g., "grave" and "brave"). This
# raises the following questions: "what method should be used for data matching?"
# and "how can we determine the quality of the matching?". BigGorilla's
# recommendation for dealing with this problem is using the python package
# **py_entitymatching** which is developed as part of the
# [Magellan project](https://sites.google.com/site/anhaidgroup/projects/magellan).
#
# In the next step, we demonstrate how **py_entitymatching** uses machine
# learning techniques for the data-matching purposes as well as how it enables us
# to evaluate the quality of the produced matching.
#
# ### Step 3: Using Magellan for data matching
#
# #### Substep A: Finding a candidate set (Blocking)
# The goal of this step is to limit the number of pairs that we consider as
# potential matches using a simple heuristic. For this task, we can create a new
# column in each dataset that combines the values of important attributes into a
# single string (which we call the **mixture**). Then, we can use the string
# similarity join as before to find a set of entities that have some overlap in
# the values of the important columns. Before doing that, we need to transform
# the columns that are part of the mixture to strings. The **py_stringsimjoin**
# package allows us to do so easily.

# In[20]:

# Transform the "budget" column into a string, then assemble the **mixture** column.
ssj.utils.converter.dataframe_column_to_str(imdb_data, 'budget', inplace=True)
imdb_data['mixture'] = imdb_data['norm_title'] + ' ' + imdb_data['norm_year'] + ' ' + imdb_data['budget']

# Repeat the same thing for the Kaggle dataset.
ssj.utils.converter.dataframe_column_to_str(kaggle_data, 'budget', inplace=True)
kaggle_data['mixture'] = kaggle_data['norm_movie_title'] + ' ' + kaggle_data['norm_title_year'] + ' ' + kaggle_data['budget']


# Now, we can use the **mixture** columns to create the desired candidate set
# which we call **C**.

# In[21]:

C = ssj.overlap_coefficient_join(kaggle_data, imdb_data, 'id', 'id',
                                 'mixture', 'mixture', sm.WhitespaceTokenizer(),
                                 l_out_attrs=['norm_movie_title', 'norm_title_year',
                                              'duration', 'budget', 'content_rating'],
                                 r_out_attrs=['norm_title', 'norm_year', 'length',
                                              'budget', 'mpaa'],
                                 threshold=0.65)
C.shape


# We can see that by doing a similarity join, we already reduced the candidate
# set to 18,317 pairs.
#
# #### Substep B: Specifying the keys
# The next step is to specify to the **py_entitymatching** package which columns
# correspond to the keys in each dataframe. Also, we need to specify which
# columns correspond to the foreign keys of the two dataframes in the candidate set.
# In[22]:

import py_entitymatching as em
em.set_key(kaggle_data, 'id')   # key column of the kaggle dataset
em.set_key(imdb_data, 'id')     # key column of the imdb dataset
em.set_key(C, '_id')            # key column of the candidate set
em.set_ltable(C, kaggle_data)   # left table behind the candidate set
em.set_rtable(C, imdb_data)     # right table behind the candidate set
em.set_fk_ltable(C, 'l_id')     # column of C that matches the key in the left table
em.set_fk_rtable(C, 'r_id')     # column of C that matches the key in the right table


#
# #### Substep C: Debugging the blocker
#
# Now, we need to make sure that the candidate set is loose enough to include
# pairs of movies that are not very close. If this is not the case, there is a
# chance that we have eliminated pairs that could potentially be matched
# together. By looking at a few pairs from the candidate set, we can judge
# whether the blocking step has been too harsh or not.
#
# *Note: The **py_entitymatching** package provides some tools for debugging the
# blocker as well.*

# In[23]:

C[['l_norm_movie_title', 'r_norm_title', 'l_norm_title_year', 'r_norm_year',
   'l_budget', 'r_budget', 'l_content_rating', 'r_mpaa']].head()


# Based on the above sample we can see that the blocking seems to be reasonable.
#
# #### Substep D: Sampling from the candidate set
#
# The goal of this step is to obtain a sample from the candidate set and manually
# label the sampled candidates; that is, to specify if the candidate pair is a
# correct match or not.
# In[24]:

# Sample 500 pairs and write them to a .csv file for manual labeling.
sampled = C.sample(500, random_state=0)
sampled.to_csv('./data/sampled.csv', encoding='utf-8')


# In order to label the sampled data, we can create a new column in the _.csv_
# file (which we call **label**) and put value 1 under that column if the pair is
# a correct match and 0 otherwise. To avoid overriding the files, let's rename
# the new file as **labeled.csv**.

# In[25]:

# If you would like to avoid labeling the pairs for now, you can download the
# labeled.csv file from BigGorilla using the following command (if you prefer to
# do the labeling yourself, comment out the next line).
response = urllib.request.urlretrieve('https://anaconda.org/BigGorilla/datasets/1/download/labeled.csv',
                                      './data/labeled.csv')
labeled = em.read_csv_metadata('data/labeled.csv', ltable=kaggle_data, rtable=imdb_data,
                               fk_ltable='l_id', fk_rtable='r_id', key='_id')
labeled.head()


# #### Substep E: Training machine learning algorithms
#
# Now we can use the sampled dataset to train various machine learning algorithms
# for our prediction task. To do so, we need to split our dataset into a training
# and a test set, and then select the desired machine learning techniques for our
# prediction task.

# In[26]:

split = em.split_train_test(labeled, train_proportion=0.5, random_state=0)
train_data = split['train']
test_data = split['test']

# One matcher per machine-learning technique under consideration.
dt = em.DTMatcher(name='DecisionTree', random_state=0)
svm = em.SVMMatcher(name='SVM', random_state=0)
rf = em.RFMatcher(name='RF', random_state=0)
lg = em.LogRegMatcher(name='LogReg', random_state=0)
ln = em.LinRegMatcher(name='LinReg')
nb = em.NBMatcher(name='NaiveBayes')


# Before we can apply any machine learning technique, we need to extract a set of
# features. Fortunately, the **py_entitymatching** package can automatically
# extract a set of features once we specify which columns in the two datasets
# correspond to each other. The following code snippet starts by specifying the
# correspondence between the columns of the two datasets. Then, it uses the
# **py_entitymatching** package to determine the type of each column. By
# considering the types of columns in each dataset (stored in variables
# *l_attr_types* and *r_attr_types*), and using the tokenizers and similarity
# functions suggested by the package, we can extract a set of instructions for
# extracting features. Note that variable **F** is not the set of extracted
# features; rather, it encodes the instructions for computing the features.

# In[27]:

attr_corres = em.get_attr_corres(kaggle_data, imdb_data)
attr_corres['corres'] = [('norm_movie_title', 'norm_title'),
                         ('norm_title_year', 'norm_year'),
                         ('content_rating', 'mpaa'),
                         ('budget', 'budget'),
                         ]

l_attr_types = em.get_attr_types(kaggle_data)
r_attr_types = em.get_attr_types(imdb_data)

tok = em.get_tokenizers_for_matching()
sim = em.get_sim_funs_for_matching()

F = em.get_features(kaggle_data, imdb_data, l_attr_types, r_attr_types, attr_corres, tok, sim)


# Given the set of desired features **F**, we can now calculate the feature
# values for our training data and also impute the missing values in our data.
# In this case, we choose to replace the missing values with the mean of the column.
# In[28]:

# The id columns and the target itself must stay out of the feature matrix.
id_and_label_cols = ['_id', 'l_id', 'r_id', 'label']
train_features = em.extract_feature_vecs(train_data, feature_table=F, attrs_after='label', show_progress=False)
train_features = em.impute_table(train_features, exclude_attrs=id_and_label_cols, strategy='mean')


# Using the calculated features, we can evaluate the performance of different
# machine learning algorithms and select the best one for our matching task.

# In[29]:

result = em.select_matcher([dt, rf, svm, ln, lg, nb], table=train_features,
                           exclude_attrs=id_and_label_cols, k=5,
                           target_attr='label', metric='f1', random_state=0)
result['cv_stats']


# We can observe based on the reported accuracy of different techniques that the
# "random forest (RF)" algorithm achieves the best performance. Thus, it is best
# to use this technique for the matching.

# #### Substep F: Evaluating the quality of our matching
#
# It is important to evaluate the quality of our matching. We can now use the
# training set for this purpose and measure how well the random forest predicts
# the matches. We can see that we are obtaining a high accuracy and recall on the
# test set as well.
# In[30]:

best_model = result['selected_matcher']
best_model.fit(table=train_features, exclude_attrs=['_id', 'l_id', 'r_id', 'label'],
               target_attr='label')

test_features = em.extract_feature_vecs(test_data, feature_table=F, attrs_after='label', show_progress=False)
test_features = em.impute_table(test_features, exclude_attrs=['_id', 'l_id', 'r_id', 'label'],
                                strategy='mean')

# Predict on the test data.
predictions = best_model.predict(table=test_features,
                                 exclude_attrs=['_id', 'l_id', 'r_id', 'label'],
                                 append=True, target_attr='predicted', inplace=False)

# Evaluate the predictions.
eval_result = em.eval_matches(predictions, 'label', 'predicted')
em.print_eval_summary(eval_result)


# #### Substep G: Using the trained model to match the datasets
#
# Now, we can use the trained model to match the two tables as follows:

# In[31]:

candset_features = em.extract_feature_vecs(C, feature_table=F, show_progress=True)
candset_features = em.impute_table(candset_features, exclude_attrs=['_id', 'l_id', 'r_id'],
                                   strategy='mean')
predictions = best_model.predict(table=candset_features, exclude_attrs=['_id', 'l_id', 'r_id'],
                                 append=True, target_attr='predicted', inplace=False)
# Keep only the pairs the model labels as matches.
matches = predictions[predictions.predicted == 1]


# Note that the **matches** dataframe contains many columns storing the extracted
# features for both datasets. The following code snippet removes all the
# unnecessary columns and creates a nicely formatted dataframe that has the
# resulting integrated dataset.
458 | 459 | # In[32]: 460 | 461 | from py_entitymatching.catalog import catalog_manager as cm 462 | matches = matches[['_id', 'l_id', 'r_id', 'predicted']] 463 | matches.reset_index(drop=True, inplace=True) 464 | cm.set_candset_properties(matches, '_id', 'l_id', 'r_id', kaggle_data, imdb_data) 465 | matches = em.add_output_attributes(matches, l_output_attrs=['norm_movie_title', 'norm_title_year', 'budget', 'content_rating'], 466 | r_output_attrs=['norm_title', 'norm_year', 'budget', 'mpaa'], 467 | l_output_prefix='l_', r_output_prefix='r_', 468 | delete_from_catalog=False) 469 | matches.drop('predicted', axis=1, inplace=True) 470 | matches.head() 471 | 472 | -------------------------------------------------------------------------------- /matching_movies/Tutorial_py2.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # # Part 1: Data Acquistion 5 | # -------------------------- 6 | # BigGorilla recommends a list of tools for different data acquisition tasks (See [here]()). Among these tools, **urllib** is a popular python package for fetching data across the web. In this part, we use **urllib** to download the datasets that we need for this tutorial. 7 | # 8 | # ### Step 1: downloading the "Kaggle 5000 Movie Dataset" 9 | # The desired dataset is a _.csv_ file with a url that is specified in the code snippet below. 
# In[1]:

# Importing urllib (BigGorilla's recommendation for data acquisition from the web)
import urllib
import os

# Creating the data folder
if not os.path.exists('./data'):
    os.makedirs('./data')

# Obtaining the dataset using the url that hosts it
kaggle_url = 'https://github.com/sundeepblue/movie_rating_prediction/raw/master/movie_metadata.csv'
if not os.path.exists('./data/kaggle_dataset.csv'):  # avoid downloading if the file exists
    response = urllib.urlretrieve(kaggle_url, './data/kaggle_dataset.csv')


# ### Step 2: downloading the "IMDB Plain Text Data"
# The IMDB Plain Text Data (see [here](ftp://ftp.funet.fi/pub/mirrors/ftp.imdb.com/pub/))
# is a collection of files where each file describes one or a few attributes of a
# movie. We are going to focus on a subset of movie attributes, which means that
# we are only interested in a few of these files, listed below:
#
# * genres.list.gz
# * ratings.list.gz
#
# _** Note: The total size of the files mentioned above is roughly 30M. Running
# the following code may take a few minutes._

# In[2]:

import gzip

# Obtaining IMDB's text files
imdb_url_prefix = 'ftp://ftp.funet.fi/pub/mirrors/ftp.imdb.com/pub/'
imdb_files_list = ['genres.list.gz', 'ratings.list.gz']
for name in imdb_files_list:
    if not os.path.exists('./data/' + name):
        response = urllib.urlretrieve(imdb_url_prefix + name, './data/' + name)
        urllib.urlcleanup()  # urllib fails to download two files from a ftp source. This fixes the bug!
    # Extract the archive next to it; re-extracting an existing file is harmless.
    # NOTE(review): the collapsed dump does not show this block's indentation —
    # confirm whether extraction was originally nested inside the `if` above.
    with gzip.open('./data/' + name) as comp_file, open('./data/' + name[:-3], 'w') as reg_file:
        file_content = comp_file.read()
        reg_file.write(file_content)


# ### Step 3: downloading the "IMDB Prepared Data"
# During this tutorial, we discuss how the contents of the _genres.list.gz_ and
# _ratings.list.gz_ files can be integrated. However, to make the tutorial more
# concise, we avoid including the same process for all the files in the "IMDB
# Plain Text Data". The "IMDB Prepared Data" is the dataset that we obtained by
# integrating a number of files from the "IMDB Plain Text Data", which we will
# use during later stages of this tutorial. The following code snippet downloads
# this dataset.

# In[3]:

imdb_url = 'https://anaconda.org/BigGorilla/datasets/1/download/imdb_dataset.csv'
if not os.path.exists('./data/imdb_dataset.csv'):  # avoid downloading if the file exists
    # BUG FIX: the original fetched `kaggle_url` here, silently storing a second
    # copy of the Kaggle dataset as imdb_dataset.csv. Fetch `imdb_url` instead.
    response = urllib.urlretrieve(imdb_url, './data/imdb_dataset.csv')


# -----

# # Part 2: Data Extraction
# -----------------
# The "Kaggle 5000 Movie Dataset" is stored in a _.csv_ file which is already
# structured and ready to use. On the other hand, the "IMDB Plain Text Data" is a
# collection of semi-structured text files that need to be processed to extract
# the data. A quick look at the first few lines of each file shows that each file
# has a different format and has to be handled separately.
#
# ##### Content of "ratings.list" data file

# In[4]:

with open("./data/ratings.list") as myfile:
    head = [next(myfile) for x in range(38)]
print (''.join(head[28:38]))  # skipping the first 28 lines as they are descriptive headers


# ##### Content of the "genres.list" data file

# In[5]:

with open("./data/genres.list") as myfile:
    head = [next(myfile) for x in range(392)]
print (''.join(head[382:392]))  # skipping the first 382 lines as they are descriptive headers


# ### Step 1: Extracting the information from "genres.list"
# The goal of this step is to extract the movie titles and their production year
# from "genres.list", and store the extracted data into a dataframe. The
# dataframe (from the python package **pandas**) is one of BigGorilla's key
# recommendations for data profiling and cleaning. To extract the desired
# information from the text, we rely on **regular expressions**, which are
# implemented in the python package "**re**".

# In[6]:

import re
import pandas as pd

with open("./data/genres.list") as genres_file:
    raw_content = genres_file.readlines()
    genres_list = []
    content = raw_content[382:]  # skip the descriptive header lines
    for line in content:
        # Capture: optionally-quoted title, 4-char year (digits or '?'), genre.
        m = re.match(r'"?(.*[^"])"? \(((?:\d|\?){4})(?:/\w*)?\).*\s((?:\w|-)+)', line.strip())
        # BUG FIX: a non-matching line used to crash with AttributeError on
        # `m.group`; skip it instead, mirroring the ratings loop below.
        if m is None: continue
        genres_list.append([m.group(1), m.group(2), m.group(3)])
    genres_data = pd.DataFrame(genres_list, columns=['movie', 'year', 'genre'])


# ### Step 2: Extracting the information from "ratings.list"

# In[7]:

with open("./data/ratings.list") as ratings_file:
    raw_content = ratings_file.readlines()
    ratings_list = []
    content = raw_content[28:]  # skip the descriptive header lines
    for line in content:
        # Capture: rating (x.y), optionally-quoted title, 4-char year.
        m = re.match(r'(?:\d|\.|\*){10}\s+\d+\s+(1?\d\.\d)\s"?(.*[^"])"? \(((?:\d|\?){4})(?:/\w*)?\)', line.strip())
        if m is None: continue
        ratings_list.append([m.group(2), m.group(3), m.group(1)])
    ratings_data = pd.DataFrame(ratings_list, columns=['movie', 'year', 'rating'])


# Note that one has to repeat the information extraction procedure for the other
# data files as well if one is interested in their content. For now (and to keep
# the tutorial simple), we assume that we are only interested in the genres and
# ratings of movies. The above code snippets store the extracted data on these
# two attributes into two dataframes (namely, **genres_data** and **ratings_data**).
#
# ------

# # Part 3: Data Profiling & Cleaning
# ---------------------------
#
# The high-level goal in this stage of data preparation is to look into the data
# that we have acquired and extracted so far. This helps us to get familiar with
# the data, understand in what ways the data needs cleaning or transformation,
# and finally enables us to prepare the data for the following steps of the data
# integration task.
#
# ### Step 1: Loading the "Kaggle 5000 Movies Dataset"
#
# According to BigGorilla, dataframes (from the python package **pandas**) are
# suitable for data exploration and data profiling. In
# [Part 2](https://github.com/rit-git/BigGorilla/blob/tutorial/Tutorial/Part%202%20--%20Data%20Extraction.ipynb)
# of the tutorial, we stored the extracted data from the "IMDB Plain Text Data"
# into dataframes. It would be appropriate to load the "Kaggle 5000 Movies
# Dataset" into a dataframe as well and follow the same data profiling procedure
# for all datasets.
# In[8]:

import pandas as pd

# Loading the Kaggle dataset from the .csv file (kaggle_dataset.csv)
kaggle_data = pd.read_csv('./data/kaggle_dataset.csv')


# ### Step 2: Calculating some basic statistics (profiling)
#
# Let's start by finding out how many movies are listed in each dataframe.

# In[9]:

print ('Number of movies in kaggle_data: {}'.format(kaggle_data.shape[0]))
print ('Number of movies in genres_data: {}'.format(genres_data.shape[0]))
print ('Number of movies in ratings_data: {}'.format(ratings_data.shape[0]))


# We can also check to see if we have duplicates (i.e., a movie appearing more
# than once) in the data. We consider an entry a duplicate if we can find another
# entry with the same movie title and production year.

# In[10]:

# `keep=False` marks every member of each duplicate group, so summing the
# boolean mask counts all duplicated rows.
print ('Number of duplicates in kaggle_data: {}'.format(
    kaggle_data.duplicated(subset=['movie_title', 'title_year'], keep=False).sum()))
print ('Number of duplicates in genres_data: {}'.format(
    genres_data.duplicated(subset=['movie', 'year'], keep=False).sum()))
print ('Number of duplicates in ratings_data: {}'.format(
    ratings_data.duplicated(subset=['movie', 'year'], keep=False).sum()))


# ### Step 3: Dealing with duplicates (cleaning)
#
# There are many strategies to deal with duplicates. Here, we are going to use a
# simple method: only keep the first occurrence of a duplicated entry and remove
# the rest.
# In[11]:

kaggle_data = kaggle_data.drop_duplicates(subset=['movie_title', 'title_year'], keep='first').copy()
genres_data = genres_data.drop_duplicates(subset=['movie', 'year'], keep='first').copy()
ratings_data = ratings_data.drop_duplicates(subset=['movie', 'year'], keep='first').copy()


# ### Step 4: Normalizing the text (cleaning)
#
# The key attribute that we will use to integrate our movie datasets is the movie
# title, so it is important to normalize these titles. The following code snippet
# makes all movie titles lower case, removes certain characters such as "'" and
# "?", and replaces some other special characters (e.g., "&" is replaced with "and").

# In[12]:

def preprocess_title(title):
    """Normalize a movie title for matching: lower-case it, drop/replace a few
    special characters, decode to unicode, and trim surrounding whitespace."""
    normalized = title.lower()
    for old, new in ((',', ' '), ("'", ''), ('&', 'and'), ('?', '')):
        normalized = normalized.replace(old, new)
    # Python 2: raw byte strings may hold non-utf8 characters; drop them.
    normalized = normalized.decode('utf-8', 'ignore')
    return normalized.strip()

kaggle_data['norm_movie_title'] = kaggle_data['movie_title'].map(preprocess_title)
genres_data['norm_movie'] = genres_data['movie'].map(preprocess_title)
ratings_data['norm_movie'] = ratings_data['movie'].map(preprocess_title)


# ### Step 5: Looking at a few samples
#
# The goal here is to look at a few sample entries from each dataset for a quick
# sanity check. To keep the tutorial concise, we just present this step for the
# "Kaggle 5000 Movies Dataset" which is stored in the **kaggle_data** dataframe.

# In[13]:

kaggle_data.sample(3, random_state=0)


# Looking at the data guides us to decide in what ways we might want to clean it.
# For instance, the small sample shown above reveals that the **title_year**
# attribute is stored as floats (i.e., rational numbers). We can add another
# cleaning step to transform the **title_year** into strings and replace the
# missing title years with symbol **"?"**.

# In[14]:

def preprocess_year(year):
    """Return the production year as a string, using '?' for missing values."""
    return '?' if pd.isnull(year) else str(int(year))

kaggle_data['norm_title_year'] = kaggle_data['title_year'].map(preprocess_year)
kaggle_data.head()


# -----

# # Part 4: Data Matching & Merging
# -------------------------
# The main goal in this part is to match the data that we have acquired from
# different sources to create a single rich dataset. Recall that in
# [Part 3](https://github.com/rit-git/BigGorilla/blob/tutorial/Tutorial/Part%203%20--%20Data%20Profiling%20%26%20Cleaning.ipynb),
# we transformed all datasets into dataframes which we used to clean the data.
# In this part, we continue using the same dataframes for the data that we have
# prepared so far.
#
# ### Step 1: Integrating the "IMDB Plain Text Data" files
# Note that both **ratings_data** and **genres_data** dataframes contain data
# that come from the same source (i.e., "the IMDB Plain Text data"). Thus, we
# assume that there are no inconsistencies between the data stored in these
# dataframes, and to combine them all we need to do is to match the entries that
# share the same title and production year. This simple "exact match" can be
# done easily using dataframes.

# In[15]:

brief_imdb_data = pd.merge(ratings_data, genres_data, how='inner', on=['norm_movie', 'year'])
brief_imdb_data.head()


# We refer to the dataset created above as the **brief_imdb_data** since it only
# contains two attributes (namely, genre and rating). Henceforth, we are going to
# use a richer version of the IMDB dataset which we created by integrating a
# number of files from the "IMDB Plain Text Data". If you have completed the
# first part of this tutorial, then this dataset is already downloaded and stored
# in *"imdb_dataset.csv"* under the _"data"_ folder. The following code snippet
# loads this dataset, does preprocessing on the title and production year of
# movies, removes the duplicates as before, and prints the size of the dataset.

# In[16]:

# Read the richer IMDB dataset prepared in Part 1.
imdb_data = pd.read_csv('./data/imdb_dataset.csv')
# Normalize the title and year as we did in Part 3 of the tutorial.
imdb_data['norm_title'] = imdb_data['title'].map(preprocess_title)
imdb_data['norm_year'] = imdb_data['year'].map(preprocess_year)
imdb_data = imdb_data.drop_duplicates(subset=['norm_title', 'norm_year'], keep='first').copy()
imdb_data.shape


# ### Step 2: Integrating the Kaggle and IMDB datasets
#
# A simple approach to integrate the two datasets is to simply join entries that
# share the same movie title and year of production. The following code reveals
# that 4,248 matches are found using this simple approach.

# In[17]:

data_attempt1 = pd.merge(imdb_data, kaggle_data, how='inner',
                         left_on=['norm_title', 'norm_year'],
                         right_on=['norm_movie_title', 'norm_title_year'])
data_attempt1.shape


# But given that the IMDB and Kaggle datasets are collected from different
# sources, chances are that the name of a movie would be slightly different in
# these datasets (e.g. "Wall.E" vs "WallE"). To be able to find such matches,
# one can look at the similarity of movie titles and consider titles with high
# similarity to be the same entity. BigGorilla's recommendation for doing a
# similarity join across two datasets is the python package **py_stringsimjoin**.
# The following code snippet uses **py_stringsimjoin** to match all the titles
# that have an edit distance of one or less (i.e., there is at most one character
# that needs to be changed/added/removed to make both titles identical). Once the
# similarity join is complete, it only selects the title pairs that are produced
# in the same year.

# In[18]:

import py_stringsimjoin as ssj
import py_stringmatching as sm

# Both tables need a unique integer key for the join.
imdb_data['id'] = range(imdb_data.shape[0])
kaggle_data['id'] = range(kaggle_data.shape[0])
similar_titles = ssj.edit_distance_join(imdb_data, kaggle_data, 'id', 'id',
                                        'norm_title', 'norm_movie_title',
                                        l_out_attrs=['norm_title', 'norm_year'],
                                        r_out_attrs=['norm_movie_title', 'norm_title_year'],
                                        threshold=1)
# Keep only the pairs that also share the production year.
data_attempt2 = similar_titles[similar_titles.r_norm_title_year == similar_titles.l_norm_year]
data_attempt2.shape


# We can see that using the similarity join 4,689 titles were matched. Let's look
# at some of the titles that are matched by the similarity join but are not identical.

# In[19]:

data_attempt2[data_attempt2.l_norm_title != data_attempt2.r_norm_movie_title].head()


# While instances such as "walle" and "wall.e" are correctly matched, we can see
# that this technique also makes some errors (e.g., "grave" and "brave"). This
# raises the following questions: "what method should be used for data matching?"
# and "how can we determine the quality of the matching?". BigGorilla's
# recommendation for dealing with this problem is using the python package
# **py_entitymatching** which is developed as part of the
# [Magellan project](https://sites.google.com/site/anhaidgroup/projects/magellan).
280 | # 281 | # In the next step, we demonstrate how **py_entitymatching** uses machine learning techniques for the data-matching purposes as well as how it enables us to evaluate the quality of the produced matching. 282 | # 283 | # ### Step 3: Using Magellan for data matching 284 | # 285 | # #### Substep A: Finding a candiate set (Blocking) 286 | # The goal of this step is to limit the number of pairs that we consider as potential matches using a simple heuristic. For this task, we can create a new column in each dataset that combines the values of important attributes into a single string (which we call the **mixture**). Then, we can use the string similarity join as before to find a set of entities that have some overlap in the values of the important columns. Before doing that, we need to transform the columns that are part of the mixture to strings. The **py_stringsimjoin** package allows us to do so easily. 287 | 288 | # In[20]: 289 | 290 | # transforming the "budget" column into string and creating a new **mixture** column 291 | ssj.utils.converter.dataframe_column_to_str(imdb_data, 'budget', inplace=True) 292 | imdb_data['mixture'] = imdb_data['norm_title'] + ' ' + imdb_data['norm_year'] + ' ' + imdb_data['budget'] 293 | 294 | # repeating the same thing for the Kaggle dataset 295 | ssj.utils.converter.dataframe_column_to_str(kaggle_data, 'budget', inplace=True) 296 | kaggle_data['mixture'] = kaggle_data['norm_movie_title'] + ' ' + kaggle_data['norm_title_year'] + ' ' + kaggle_data['budget'] 297 | 298 | 299 | # Now, we can use the **mixture** columns to create a desired candiate set which we call **C**. 
300 | 301 | # In[21]: 302 | 303 | C = ssj.overlap_coefficient_join(kaggle_data, imdb_data, 'id', 'id', 'mixture', 'mixture', sm.WhitespaceTokenizer(), 304 | l_out_attrs=['norm_movie_title', 'norm_title_year', 'duration', 305 | 'budget', 'content_rating'], 306 | r_out_attrs=['norm_title', 'norm_year', 'length', 'budget', 'mpaa'], 307 | threshold=0.65) 308 | C.shape 309 | 310 | 311 | # We can see that by doing a similarity join, we already reduced the candidate set to 18,317 pairs. 312 | # 313 | # #### Substep B: Specifying the keys 314 | # The next step is to specify to the **py_entitymatching** package which columns correspond to the keys in each dataframe. Also, we need to specify which columns correspond to the foreign keys of the the two dataframes in the candidate set. 315 | 316 | # In[22]: 317 | 318 | import py_entitymatching as em 319 | em.set_key(kaggle_data, 'id') # specifying the key column in the kaggle dataset 320 | em.set_key(imdb_data, 'id') # specifying the key column in the imdb dataset 321 | em.set_key(C, '_id') # specifying the key in the candidate set 322 | em.set_ltable(C, kaggle_data) # specifying the left table 323 | em.set_rtable(C, imdb_data) # specifying the right table 324 | em.set_fk_rtable(C, 'r_id') # specifying the column that matches the key in the right table 325 | em.set_fk_ltable(C, 'l_id') # specifying the column that matches the key in the left table 326 | 327 | 328 | # 329 | # #### Subset C: Debugging the blocker 330 | # 331 | # Now, we need to make sure that the candidate set is loose enough to include pairs of movies that are not very close. If this is not the case, there is a chance that we have eliminated pair that could be potentially matched together. By looking at a few pairs from the candidate set, we can judge whether the blocking step has been too harsh or not. 
332 | # 333 | # *Note: The **py_entitymatching** package provides some tools for debugging the blocker as well.* 334 | 335 | # In[23]: 336 | 337 | C[['l_norm_movie_title', 'r_norm_title', 'l_norm_title_year', 'r_norm_year', 338 | 'l_budget', 'r_budget', 'l_content_rating', 'r_mpaa']].head() 339 | 340 | 341 | # Based on the above sample we can see that the blocking seems to be reasonable. 342 | # 343 | # #### Substep D: Sampling from the candiate set 344 | # 345 | # The goal of this step is to obtain a sample from the candidate set and manually label the sampled candidates; that is, to specify if the candiate pair is a correct match or not. 346 | 347 | # In[24]: 348 | 349 | # Sampling 500 pairs and writing this sample into a .csv file 350 | sampled = C.sample(500, random_state=0) 351 | sampled.to_csv('./data/sampled.csv', encoding='utf-8') 352 | 353 | 354 | # In order to label the sampled data, we can create a new column in the _.csv_ file (which we call **label**) and put value 1 under that column if the pair is a correct match and 0 otherwise. To avoid overriding the files, let's rename the new file as **labeled.csv**. 355 | 356 | # In[25]: 357 | 358 | # If you would like to avoid labeling the pairs for now, you can download the labled.csv file from 359 | # BigGorilla using the following command (if you prefer to do it yourself, commend the next line) 360 | response = urllib.urlretrieve('https://anaconda.org/BigGorilla/datasets/1/download/labeled.csv', 361 | './data/labeled.csv') 362 | labeled = em.read_csv_metadata('data/labeled.csv', ltable=kaggle_data, rtable=imdb_data, 363 | fk_ltable='l_id', fk_rtable='r_id', key='_id') 364 | labeled.head() 365 | 366 | 367 | # #### Substep E: Traning machine learning algorithms 368 | # 369 | # Now we can use the sampled dataset to train various machine learning algorithms for our prediction task. 
To do so, we need to split our dataset into a training and a test set, and then select the desired machine learning techniques for our prediction task. 370 | 371 | # In[26]: 372 | 373 | split = em.split_train_test(labeled, train_proportion=0.5, random_state=0) 374 | train_data = split['train'] 375 | test_data = split['test'] 376 | 377 | dt = em.DTMatcher(name='DecisionTree', random_state=0) 378 | svm = em.SVMMatcher(name='SVM', random_state=0) 379 | rf = em.RFMatcher(name='RF', random_state=0) 380 | lg = em.LogRegMatcher(name='LogReg', random_state=0) 381 | ln = em.LinRegMatcher(name='LinReg') 382 | nb = em.NBMatcher(name='NaiveBayes') 383 | 384 | 385 | # Before we can apply any machine learning technique, we need to extract a set of features. Fortunately, the **py_entitymatching** package can automatically extract a set of features once we specify which columns in the two datasets correspond to each other. The following code snippet starts by specifying the correspondence between the column of the two datasets. Then, it uses the **py_entitymatching** package to determine the type of each column. By considering the types of columns in each dataset (stored in variables *l_attr_types* and *r_attr_types*), and using the tokenizers and similarity functions suggested by the package, we can extract a set of instructions for extracting features. Note that variable **F** is not the set of extracted features, rather it encodes the instructions for computing the features. 
386 | 387 | # In[27]: 388 | 389 | attr_corres = em.get_attr_corres(kaggle_data, imdb_data) 390 | attr_corres['corres'] = [('norm_movie_title', 'norm_title'), 391 | ('norm_title_year', 'norm_year'), 392 | ('content_rating', 'mpaa'), 393 | ('budget', 'budget'), 394 | ] 395 | 396 | l_attr_types = em.get_attr_types(kaggle_data) 397 | r_attr_types = em.get_attr_types(imdb_data) 398 | 399 | tok = em.get_tokenizers_for_matching() 400 | sim = em.get_sim_funs_for_matching() 401 | 402 | F = em.get_features(kaggle_data, imdb_data, l_attr_types, r_attr_types, attr_corres, tok, sim) 403 | 404 | 405 | # Given the set of desired features **F**, we can now calculate the feature values for our training data and also impute the missing values in our data. In this case, we choose to replace the missing values with the mean of the column. 406 | 407 | # In[28]: 408 | 409 | train_features = em.extract_feature_vecs(train_data, feature_table=F, attrs_after='label', show_progress=False) 410 | train_features = em.impute_table(train_features, exclude_attrs=['_id', 'l_id', 'r_id', 'label'], strategy='mean') 411 | 412 | 413 | # Using the calculated features, we can evaluate the performance of different machine learning algorithms and select the best one for our matching task. 414 | 415 | # In[29]: 416 | 417 | result = em.select_matcher([dt, rf, svm, ln, lg, nb], table=train_features, 418 | exclude_attrs=['_id', 'l_id', 'r_id', 'label'], k=5, 419 | target_attr='label', metric='f1', random_state=0) 420 | result['cv_stats'] 421 | 422 | 423 | # We can observe based on the reported accuracy of different techniques that the "random forest (RF)" algorithm achieves the best performance. Thus, it is best to use this technique for the matching. 424 | 425 | # #### Substep F: Evaluating the quality of our matching 426 | # 427 | # It is important to evaluate the quality of our matching. We can now, use the traning set for this purpose and measure how well the random forest predicts the matches. 
We can see that we are obtaining a high accuracy and recall on the test set as well. 428 | 429 | # In[30]: 430 | 431 | best_model = result['selected_matcher'] 432 | best_model.fit(table=train_features, exclude_attrs=['_id', 'l_id', 'r_id', 'label'], target_attr='label') 433 | 434 | test_features = em.extract_feature_vecs(test_data, feature_table=F, attrs_after='label', show_progress=False) 435 | test_features = em.impute_table(test_features, exclude_attrs=['_id', 'l_id', 'r_id', 'label'], strategy='mean') 436 | 437 | # Predict on the test data 438 | predictions = best_model.predict(table=test_features, exclude_attrs=['_id', 'l_id', 'r_id', 'label'], 439 | append=True, target_attr='predicted', inplace=False) 440 | 441 | # Evaluate the predictions 442 | eval_result = em.eval_matches(predictions, 'label', 'predicted') 443 | em.print_eval_summary(eval_result) 444 | 445 | 446 | # #### Substep G: Using the trained model to match the datasets 447 | # 448 | # Now, we can use the trained model to match the two tables as follows: 449 | 450 | # In[31]: 451 | 452 | candset_features = em.extract_feature_vecs(C, feature_table=F, show_progress=True) 453 | candset_features = em.impute_table(candset_features, exclude_attrs=['_id', 'l_id', 'r_id'], strategy='mean') 454 | predictions = best_model.predict(table=candset_features, exclude_attrs=['_id', 'l_id', 'r_id'], 455 | append=True, target_attr='predicted', inplace=False) 456 | matches = predictions[predictions.predicted == 1] 457 | 458 | 459 | # Note that the **matches** dataframe contains many columns storing the extracted features for both datasets. The following code snippet removes all the unnecessary columns and creates a nice formatted dataframe that has the resulting integrated dataset. 
460 | 461 | # In[32]: 462 | 463 | from py_entitymatching.catalog import catalog_manager as cm 464 | matches = matches[['_id', 'l_id', 'r_id', 'predicted']] 465 | matches.reset_index(drop=True, inplace=True) 466 | cm.set_candset_properties(matches, '_id', 'l_id', 'r_id', kaggle_data, imdb_data) 467 | matches = em.add_output_attributes(matches, l_output_attrs=['norm_movie_title', 'norm_title_year', 'budget', 'content_rating'], 468 | r_output_attrs=['norm_title', 'norm_year', 'budget', 'mpaa'], 469 | l_output_prefix='l_', r_output_prefix='r_', 470 | delete_from_catalog=False) 471 | matches.drop('predicted', axis=1, inplace=True) 472 | matches.head() 473 | 474 | -------------------------------------------------------------------------------- /matching_movies/Tutorial_py3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Part 1: Data Acquistion\n", 8 | "--------------------------\n", 9 | "BigGorilla recommends a list of tools for different data acquisition tasks (See [here](http://www.biggorilla.org/data-acquisition)). Among these tools, **urllib** is a popular python package for fetching data across the web. In this part, we use **urllib** to download the datasets that we need for this tutorial.\n", 10 | "\n", 11 | "### Step 1: downloading the \"Kaggle 5000 Movie Dataset\"\n", 12 | "The desired dataset is a _.csv_ file with a url that is specified in the code snippet below." 
13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 1, 18 | "metadata": { 19 | "collapsed": false 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "# Importing urlib (BigGorilla's recommendation for data acquisition from the web)\n", 24 | "import urllib.request\n", 25 | "import os\n", 26 | "\n", 27 | "# Creating the data folder\n", 28 | "if not os.path.exists('./data'):\n", 29 | " os.makedirs('./data')\n", 30 | "\n", 31 | "# Obtaining the dataset using the url that hosts it\n", 32 | "kaggle_url = 'https://github.com/sundeepblue/movie_rating_prediction/raw/master/movie_metadata.csv'\n", 33 | "if not os.path.exists('./data/kaggle_dataset.csv'): # avoid downloading if the file exists\n", 34 | " response = urllib.request.urlretrieve(kaggle_url, './data/kaggle_dataset.csv')" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "### Step 2: downloading the \"IMDB Plain Text Data\"\n", 42 | "The IMDB Plain Text Data (see [here](ftp://ftp.funet.fi/pub/mirrors/ftp.imdb.com/pub/)) is a collection of files where each files describe one or a few attributes of a movie. We are going to focus on a subset of movie attribues which subsequently means that we are only interested in a few of these files which are listed below:\n", 43 | "\n", 44 | "* genres.list.gz\n", 45 | "* ratings.list.gz\n", 46 | "\n", 47 | "_** Note: The total size of files mentioned above is roughly 30M. 
Running the following code may take a few minutes._" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 2, 53 | "metadata": { 54 | "collapsed": false 55 | }, 56 | "outputs": [], 57 | "source": [ 58 | "import gzip\n", 59 | "\n", 60 | "# Obtaining IMDB's text files\n", 61 | "imdb_url_prefix = 'ftp://ftp.funet.fi/pub/mirrors/ftp.imdb.com/pub/'\n", 62 | "imdb_files_list = ['genres.list.gz', 'ratings.list.gz']\n", 63 | "for name in imdb_files_list:\n", 64 | " if not os.path.exists('./data/' + name):\n", 65 | " response = urllib.request.urlretrieve(imdb_url_prefix + name, './data/' + name)\n", 66 | " with gzip.open('./data/' + name) as comp_file, open('./data/' + name[:-3], 'wb') as reg_file:\n", 67 | " file_content = comp_file.read()\n", 68 | " reg_file.write(file_content)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "### Step 3: downloading the \"IMDB Prepared Data\"\n", 76 | "During this tutorial, we discuss how the contents of _genres.list.gz_ and _ratings.list.gz_ files can be integrated. However, to make the tutorial more concise, we avoid including the same process for all the files in the \"IMDB Plain Text Data\". The \"IMDB Prepared Data\" is the dataset that we obtained by integrating a number of files from the \"IMDB Plain Text Data\" which we will use during later stages of this tutorial. The following code snippet downloads this dataset." 
77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 3, 82 | "metadata": { 83 | "collapsed": true 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "imdb_url = 'https://anaconda.org/BigGorilla/datasets/1/download/imdb_dataset.csv'\n", 88 | "if not os.path.exists('./data/imdb_dataset.csv'): # avoid downloading if the file exists\n", 89 | " response = urllib.request.urlretrieve(kaggle_url, './data/imdb_dataset.csv')" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "-----" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "# Part 2: Data Extraction\n", 104 | "-----------------\n", 105 | "The \"Kaggle 5000 Movie Dataset\" is stored in a _.csv_ file which is alreday structured and ready to use. On the other hand, the \"IMDB Plain Text Data\" is a collection of semi-structured text files that need to be processed to extract the data. A quick look at the first few lines of each files shows that each file has a different format and has to be handled separately.\n", 106 | "\n", 107 | "##### Content of \"ratings.list\" data file" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 4, 113 | "metadata": { 114 | "collapsed": false 115 | }, 116 | "outputs": [ 117 | { 118 | "name": "stdout", 119 | "output_type": "stream", 120 | "text": [ 121 | " 0000000125 1728818 9.2 The Shawshank Redemption (1994)\n", 122 | " 0000000125 1181412 9.2 The Godfather (1972)\n", 123 | " 0000000124 810055 9.0 The Godfather: Part II (1974)\n", 124 | " 0000000124 1714042 8.9 The Dark Knight (2008)\n", 125 | " 0000000133 461310 8.9 12 Angry Men (1957)\n", 126 | " 0000000133 885509 8.9 Schindler's List (1993)\n", 127 | " 0000000123 1354135 8.9 Pulp Fiction (1994)\n", 128 | " 0000000124 1241908 8.9 The Lord of the Rings: The Return of the King (2003)\n", 129 | " 0000000123 514540 8.9 Il buono, il brutto, il cattivo (1966)\n", 130 | " 0000000133 1380148 8.8 Fight Club 
(1999)\n", 131 | "\n" 132 | ] 133 | } 134 | ], 135 | "source": [ 136 | "with open(\"./data/ratings.list\", encoding='latin1') as myfile:\n", 137 | " head = [next(myfile) for x in range(38)]\n", 138 | "print (''.join(head[28:38])) # skipping the first 28 lines as they are descriptive headers" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "##### Content of the \"genres.list\" data file" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 5, 151 | "metadata": { 152 | "collapsed": false 153 | }, 154 | "outputs": [ 155 | { 156 | "name": "stdout", 157 | "output_type": "stream", 158 | "text": [ 159 | "\"!Next?\" (1994)\t\t\t\t\t\tDocumentary\n", 160 | "\"#1 Single\" (2006)\t\t\t\t\tReality-TV\n", 161 | "\"#15SecondScare\" (2015)\t\t\t\t\tHorror\n", 162 | "\"#15SecondScare\" (2015)\t\t\t\t\tShort\n", 163 | "\"#15SecondScare\" (2015)\t\t\t\t\tThriller\n", 164 | "\"#15SecondScare\" (2015) {Who Wants to Play with the Rabbit? (#1.2)}\tDrama\n", 165 | "\"#15SecondScare\" (2015) {Who Wants to Play with the Rabbit? (#1.2)}\tHorror\n", 166 | "\"#15SecondScare\" (2015) {Who Wants to Play with the Rabbit? (#1.2)}\tShort\n", 167 | "\"#15SecondScare\" (2015) {Who Wants to Play with the Rabbit? (#1.2)}\tThriller\n", 168 | "\"#1MinuteNightmare\" (2014)\t\t\t\tHorror\n", 169 | "\n" 170 | ] 171 | } 172 | ], 173 | "source": [ 174 | "with open(\"./data/genres.list\", encoding='latin1') as myfile:\n", 175 | " head = [next(myfile) for x in range(392)]\n", 176 | "print (''.join(head[382:392])) # skipping the first 382 lines as they are descriptive header" 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": {}, 182 | "source": [ 183 | "### Step 1: Extracting the information from \"genres.list\"\n", 184 | "The goal of this step is to extract the movie titles and their production year from \"movies.list\", and store the extracted data into a dataframe. 
Dataframe (from the python package **pandas**) is one of the key BigGorilla's recommendation for data profiling and cleaning. To extract the desired information from the text, we rely on **regular expressions** which are implemented in the python package \"**re**\"." 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 6, 190 | "metadata": { 191 | "collapsed": true 192 | }, 193 | "outputs": [], 194 | "source": [ 195 | "import re\n", 196 | "import pandas as pd\n", 197 | "\n", 198 | "with open(\"./data/genres.list\", encoding='latin1') as genres_file:\n", 199 | " raw_content = genres_file.readlines()\n", 200 | " genres_list = []\n", 201 | " content = raw_content[382:]\n", 202 | " for line in content:\n", 203 | " m = re.match(r'\"?(.*[^\"])\"? \\(((?:\\d|\\?){4})(?:/\\w*)?\\).*\\s((?:\\w|-)+)', line.strip())\n", 204 | " genres_list.append([m.group(1), m.group(2), m.group(3)])\n", 205 | " genres_data = pd.DataFrame(genres_list, columns=['movie', 'year', 'genre'])" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": {}, 211 | "source": [ 212 | "### Step 2: Extracting the information from \"ratings.list\"" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 7, 218 | "metadata": { 219 | "collapsed": true 220 | }, 221 | "outputs": [], 222 | "source": [ 223 | "with open(\"./data/ratings.list\", encoding='latin1') as ratings_file:\n", 224 | " raw_content = ratings_file.readlines()\n", 225 | " ratings_list = []\n", 226 | " content = raw_content[28:]\n", 227 | " for line in content:\n", 228 | " m = re.match(r'(?:\\d|\\.|\\*){10}\\s+\\d+\\s+(1?\\d\\.\\d)\\s\"?(.*[^\"])\"? 
\\(((?:\\d|\\?){4})(?:/\\w*)?\\)', line.strip())\n", 229 | " if m is None: continue\n", 230 | " ratings_list.append([m.group(2), m.group(3), m.group(1)])\n", 231 | " ratings_data = pd.DataFrame(ratings_list, columns=['movie', 'year', 'rating'])" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": {}, 237 | "source": [ 238 | "Note that one has to repeat the information extraction procedure for other data files as well if he is interested in their content. For now (and to keep the tutorial simple), we assume that we are only interested in genres and ratings of movies. The above code snippets store the extracted data on these two attributes into two dataframes (namely, **genres_list** and **ratings_list**).\n", 239 | "\n", 240 | "------" 241 | ] 242 | }, 243 | { 244 | "cell_type": "markdown", 245 | "metadata": {}, 246 | "source": [ 247 | "# Part 3: Data Profiling & Cleaning\n", 248 | "---------------------------\n", 249 | "\n", 250 | "The high-level goal in this stage of data prepration is to look into the data that we have acquired and extracted so far. This helps us to get familiar with data, understand in what ways the data needs cleaning or transformation, and finally enables us to prepare the data for the following steps of the data integration task.\n", 251 | "\n", 252 | "### Step 1: Loading the \"Kaggle 5000 Movies Dataset\"\n", 253 | "\n", 254 | "According to BigGorilla, dataframes (from the python package **pandas**) are suitable for data exploration and data profiling. In [Part 2](#Part-2:-Data-Extraction) of the tutorial, we stored the extracted data from \"IMDB Plain Text Data\" into dataframes. It would be appropriate to load the \"Kaggle 5000 Movies Dataset\" into a dataframe as well and follow the same data profiling procedure for all datasets." 
255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 8, 260 | "metadata": { 261 | "collapsed": true 262 | }, 263 | "outputs": [], 264 | "source": [ 265 | "import pandas as pd\n", 266 | "\n", 267 | "# Loading the Kaggle dataset from the .csv file (kaggle_dataset.csv)\n", 268 | "kaggle_data = pd.read_csv('./data/kaggle_dataset.csv')" 269 | ] 270 | }, 271 | { 272 | "cell_type": "markdown", 273 | "metadata": {}, 274 | "source": [ 275 | "### Step 2: Calculating some basic statistics (profiling)\n", 276 | "\n", 277 | "Let's start by finding out how many movies are listed in each dataframe." 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": 9, 283 | "metadata": { 284 | "collapsed": false 285 | }, 286 | "outputs": [ 287 | { 288 | "name": "stdout", 289 | "output_type": "stream", 290 | "text": [ 291 | "Number of movies in kaggle_data: 5043\n", 292 | "Number of movies in genres_data: 2384400\n", 293 | "Number of movies in ratings_data: 691621\n" 294 | ] 295 | } 296 | ], 297 | "source": [ 298 | "print ('Number of movies in kaggle_data: {}'.format(kaggle_data.shape[0]))\n", 299 | "print ('Number of movies in genres_data: {}'.format(genres_data.shape[0]))\n", 300 | "print ('Number of movies in ratings_data: {}'.format(ratings_data.shape[0]))" 301 | ] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "metadata": {}, 306 | "source": [ 307 | "We can also check to see if we have duplicates (i.e., a movie appearing more than once) in the data. We consider an entry duplicate if we can find another entry with the same movie title and production year." 
308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": 10, 313 | "metadata": { 314 | "collapsed": false 315 | }, 316 | "outputs": [ 317 | { 318 | "name": "stdout", 319 | "output_type": "stream", 320 | "text": [ 321 | "Number of duplicates in kaggle_data: 241\n", 322 | "Number of duplicates in genres_data: 1807712\n", 323 | "Number of duplicates in ratings_data: 286515\n" 324 | ] 325 | } 326 | ], 327 | "source": [ 328 | "print ('Number of duplicates in kaggle_data: {}'.format(\n", 329 | " sum(kaggle_data.duplicated(subset=['movie_title', 'title_year'], keep=False))))\n", 330 | "print ('Number of duplicates in genres_data: {}'.format(\n", 331 | " sum(genres_data.duplicated(subset=['movie', 'year'], keep=False))))\n", 332 | "print ('Number of duplicates in ratings_data: {}'.format(\n", 333 | " sum(ratings_data.duplicated(subset=['movie', 'year'], keep=False))))" 334 | ] 335 | }, 336 | { 337 | "cell_type": "markdown", 338 | "metadata": {}, 339 | "source": [ 340 | "### Step 3: Dealing with duplicates (cleaning)\n", 341 | "\n", 342 | "There are many strategies to deal with duplicates. Here, we are going to use a simple method for dealing with duplicates and that is to only keep the first occurrence of a duplicated entry and remove the rest." 
343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": 11, 348 | "metadata": { 349 | "collapsed": true 350 | }, 351 | "outputs": [], 352 | "source": [ 353 | "kaggle_data = kaggle_data.drop_duplicates(subset=['movie_title', 'title_year'], keep='first').copy()\n", 354 | "genres_data = genres_data.drop_duplicates(subset=['movie', 'year'], keep='first').copy()\n", 355 | "ratings_data = ratings_data.drop_duplicates(subset=['movie', 'year'], keep='first').copy()" 356 | ] 357 | }, 358 | { 359 | "cell_type": "markdown", 360 | "metadata": {}, 361 | "source": [ 362 | "### Step 4: Normalizing the text (cleaning)\n", 363 | "\n", 364 | "The key attribute that we will use to integrate our movie datasets is the movie titles. So it is important to normalize these titles. The following code snippet makes all movie titles lower case, and then removes certain characters such as \"'\" and \"?\", and replaces some other special characters (e.g., \"&\" is replaced with \"and\"). " 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": 12, 370 | "metadata": { 371 | "collapsed": false 372 | }, 373 | "outputs": [], 374 | "source": [ 375 | "def preprocess_title(title):\n", 376 | " title = title.lower()\n", 377 | " title = title.replace(',', ' ')\n", 378 | " title = title.replace(\"'\", '') \n", 379 | " title = title.replace('&', 'and')\n", 380 | " title = title.replace('?', '')\n", 381 | " return title.strip()\n", 382 | "\n", 383 | "kaggle_data['norm_movie_title'] = kaggle_data['movie_title'].map(preprocess_title)\n", 384 | "genres_data['norm_movie'] = genres_data['movie'].map(preprocess_title)\n", 385 | "ratings_data['norm_movie'] = ratings_data['movie'].map(preprocess_title)" 386 | ] 387 | }, 388 | { 389 | "cell_type": "markdown", 390 | "metadata": {}, 391 | "source": [ 392 | "### Step 5: Looking at a few samples\n", 393 | "\n", 394 | "The goal here is to a look at a few sample entries from each dataset for a quick sanity check. 
To keep the tutorial consice, we just present this step for the \"Kaggle 5000 Movies Dataset\" which is stored in the **kaggle_data** dataframe. " 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": 13, 400 | "metadata": { 401 | "collapsed": false 402 | }, 403 | "outputs": [ 404 | { 405 | "data": { 406 | "text/html": [ 407 | "

\n", 408 | "\n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | "
colordirector_namenum_critic_for_reviewsdurationdirector_facebook_likesactor_3_facebook_likesactor_2_nameactor_1_facebook_likesgrossgenres...languagecountrycontent_ratingbudgettitle_yearactor_2_facebook_likesimdb_scoreaspect_ratiomovie_facebook_likesnorm_movie_title
4422ColorSimeon Rice6.093.06.056.0Lisa Brave393.0NaNAction|Horror|Thriller...EnglishUSAR1500000.02014.0191.05.52.35307unsullied
1022ColorDoug Liman214.0108.0218.0405.0Ty Burrell6000.09528092.0Biography|Drama|Thriller...EnglishUSAPG-1322000000.02010.03000.06.82.359000fair game
3631ColorJonathan Levine147.099.0129.0362.0Aaron Yoo976.02077046.0Comedy|Drama|Romance...EnglishUSAR6000000.02008.0617.07.02.350the wackness
\n", 510 | "

3 rows × 29 columns

\n", 511 | "
" 512 | ], 513 | "text/plain": [ 514 | " color director_name num_critic_for_reviews duration \\\n", 515 | "4422 Color Simeon Rice 6.0 93.0 \n", 516 | "1022 Color Doug Liman 214.0 108.0 \n", 517 | "3631 Color Jonathan Levine 147.0 99.0 \n", 518 | "\n", 519 | " director_facebook_likes actor_3_facebook_likes actor_2_name \\\n", 520 | "4422 6.0 56.0 Lisa Brave \n", 521 | "1022 218.0 405.0 Ty Burrell \n", 522 | "3631 129.0 362.0 Aaron Yoo \n", 523 | "\n", 524 | " actor_1_facebook_likes gross genres \\\n", 525 | "4422 393.0 NaN Action|Horror|Thriller \n", 526 | "1022 6000.0 9528092.0 Biography|Drama|Thriller \n", 527 | "3631 976.0 2077046.0 Comedy|Drama|Romance \n", 528 | "\n", 529 | " ... language country content_rating budget title_year \\\n", 530 | "4422 ... English USA R 1500000.0 2014.0 \n", 531 | "1022 ... English USA PG-13 22000000.0 2010.0 \n", 532 | "3631 ... English USA R 6000000.0 2008.0 \n", 533 | "\n", 534 | " actor_2_facebook_likes imdb_score aspect_ratio movie_facebook_likes \\\n", 535 | "4422 191.0 5.5 2.35 307 \n", 536 | "1022 3000.0 6.8 2.35 9000 \n", 537 | "3631 617.0 7.0 2.35 0 \n", 538 | "\n", 539 | " norm_movie_title \n", 540 | "4422 unsullied \n", 541 | "1022 fair game \n", 542 | "3631 the wackness \n", 543 | "\n", 544 | "[3 rows x 29 columns]" 545 | ] 546 | }, 547 | "execution_count": 13, 548 | "metadata": {}, 549 | "output_type": "execute_result" 550 | } 551 | ], 552 | "source": [ 553 | "kaggle_data.sample(3, random_state=0)" 554 | ] 555 | }, 556 | { 557 | "cell_type": "markdown", 558 | "metadata": {}, 559 | "source": [ 560 | "Looking at the data guides us to decide in what ways we might want to clean the data. For instance, the small sample data shown above, reveals that the **title_year** attribute is stored as floats (i.e., rational numbers). We can add another cleaning step to transform the **title_year** into strings and replace the missing title years with symbol **\"?\"**." 
561 | ] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": 14, 566 | "metadata": { 567 | "collapsed": false 568 | }, 569 | "outputs": [ 570 | { 571 | "data": { 572 | "text/html": [ 573 | "
\n", 574 | "\n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | "
colordirector_namenum_critic_for_reviewsdurationdirector_facebook_likesactor_3_facebook_likesactor_2_nameactor_1_facebook_likesgrossgenres...countrycontent_ratingbudgettitle_yearactor_2_facebook_likesimdb_scoreaspect_ratiomovie_facebook_likesnorm_movie_titlenorm_title_year
0ColorJames Cameron723.0178.00.0855.0Joel David Moore1000.0760505847.0Action|Adventure|Fantasy|Sci-Fi...USAPG-13237000000.02009.0936.07.91.7833000avatar2009
1ColorGore Verbinski302.0169.0563.01000.0Orlando Bloom40000.0309404152.0Action|Adventure|Fantasy...USAPG-13300000000.02007.05000.07.12.350pirates of the caribbean: at worlds end2007
2ColorSam Mendes602.0148.00.0161.0Rory Kinnear11000.0200074175.0Action|Adventure|Thriller...UKPG-13245000000.02015.0393.06.82.3585000spectre2015
3ColorChristopher Nolan813.0164.022000.023000.0Christian Bale27000.0448130642.0Action|Thriller...USAPG-13250000000.02012.023000.08.52.35164000the dark knight rises2012
4NaNDoug WalkerNaNNaN131.0NaNRob Walker131.0NaNDocumentary...NaNNaNNaNNaN12.07.1NaN0star wars: episode vii - the force awakens?
\n", 724 | "

5 rows × 30 columns

\n", 725 | "
" 726 | ], 727 | "text/plain": [ 728 | " color director_name num_critic_for_reviews duration \\\n", 729 | "0 Color James Cameron 723.0 178.0 \n", 730 | "1 Color Gore Verbinski 302.0 169.0 \n", 731 | "2 Color Sam Mendes 602.0 148.0 \n", 732 | "3 Color Christopher Nolan 813.0 164.0 \n", 733 | "4 NaN Doug Walker NaN NaN \n", 734 | "\n", 735 | " director_facebook_likes actor_3_facebook_likes actor_2_name \\\n", 736 | "0 0.0 855.0 Joel David Moore \n", 737 | "1 563.0 1000.0 Orlando Bloom \n", 738 | "2 0.0 161.0 Rory Kinnear \n", 739 | "3 22000.0 23000.0 Christian Bale \n", 740 | "4 131.0 NaN Rob Walker \n", 741 | "\n", 742 | " actor_1_facebook_likes gross genres \\\n", 743 | "0 1000.0 760505847.0 Action|Adventure|Fantasy|Sci-Fi \n", 744 | "1 40000.0 309404152.0 Action|Adventure|Fantasy \n", 745 | "2 11000.0 200074175.0 Action|Adventure|Thriller \n", 746 | "3 27000.0 448130642.0 Action|Thriller \n", 747 | "4 131.0 NaN Documentary \n", 748 | "\n", 749 | " ... country content_rating budget title_year \\\n", 750 | "0 ... USA PG-13 237000000.0 2009.0 \n", 751 | "1 ... USA PG-13 300000000.0 2007.0 \n", 752 | "2 ... UK PG-13 245000000.0 2015.0 \n", 753 | "3 ... USA PG-13 250000000.0 2012.0 \n", 754 | "4 ... NaN NaN NaN NaN \n", 755 | "\n", 756 | " actor_2_facebook_likes imdb_score aspect_ratio movie_facebook_likes \\\n", 757 | "0 936.0 7.9 1.78 33000 \n", 758 | "1 5000.0 7.1 2.35 0 \n", 759 | "2 393.0 6.8 2.35 85000 \n", 760 | "3 23000.0 8.5 2.35 164000 \n", 761 | "4 12.0 7.1 NaN 0 \n", 762 | "\n", 763 | " norm_movie_title norm_title_year \n", 764 | "0 avatar 2009 \n", 765 | "1 pirates of the caribbean: at worlds end 2007 \n", 766 | "2 spectre 2015 \n", 767 | "3 the dark knight rises 2012 \n", 768 | "4 star wars: episode vii - the force awakens ? 
\n", 769 | "\n", 770 | "[5 rows x 30 columns]" 771 | ] 772 | }, 773 | "execution_count": 14, 774 | "metadata": {}, 775 | "output_type": "execute_result" 776 | } 777 | ], 778 | "source": [ 779 | "def preprocess_year(year):\n", 780 | " if pd.isnull(year):\n", 781 | " return '?'\n", 782 | " else:\n", 783 | " return str(int(year))\n", 784 | "\n", 785 | "kaggle_data['norm_title_year'] = kaggle_data['title_year'].map(preprocess_year)\n", 786 | "kaggle_data.head()" 787 | ] 788 | }, 789 | { 790 | "cell_type": "markdown", 791 | "metadata": {}, 792 | "source": [ 793 | "-----" 794 | ] 795 | }, 796 | { 797 | "cell_type": "markdown", 798 | "metadata": {}, 799 | "source": [ 800 | "# Part 4: Data Matching & Merging\n", 801 | "-------------------------\n", 802 | "The main goal in this part is go match the data that we have acquired from different sources to create a single rich dataset. Recall that in [Part 3](#Part-3:-Data-Profiling-&-Cleaning), we transformed all datasets into a dataframe which we used to clean the data. In this part, we continue using the same dataframes for the data that we have prepared so far.\n", 803 | "\n", 804 | "### Step 1: Integrating the \"IMDB Plain Text Data\" files\n", 805 | "Note that both **ratings_data** and **genres_data** dataframes contain data that come from the same source (i.e., \"the IMDB Plain Text data\"). Thus, we assume that there are no inconsistencies between the data stored in these dataframe and to combine them, all we need to do is to match the entries that share the same title and production year. This simple \"exact match\" can be done simply using dataframes." 806 | ] 807 | }, 808 | { 809 | "cell_type": "code", 810 | "execution_count": 15, 811 | "metadata": { 812 | "collapsed": false 813 | }, 814 | "outputs": [ 815 | { 816 | "data": { 817 | "text/html": [ 818 | "
\n", 819 | "\n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | "
movie_xyearratingnorm_moviemovie_ygenre
0The Shawshank Redemption19949.2the shawshank redemptionThe Shawshank RedemptionCrime
1The Godfather19729.2the godfatherThe GodfatherCrime
2The Godfather: Part II19749.0the godfather: part iiThe Godfather: Part IICrime
3The Dark Knight20088.9the dark knightThe Dark KnightAction
412 Angry Men19578.912 angry men12 Angry MenCrime
\n", 879 | "
" 880 | ], 881 | "text/plain": [ 882 | " movie_x year rating norm_movie \\\n", 883 | "0 The Shawshank Redemption 1994 9.2 the shawshank redemption \n", 884 | "1 The Godfather 1972 9.2 the godfather \n", 885 | "2 The Godfather: Part II 1974 9.0 the godfather: part ii \n", 886 | "3 The Dark Knight 2008 8.9 the dark knight \n", 887 | "4 12 Angry Men 1957 8.9 12 angry men \n", 888 | "\n", 889 | " movie_y genre \n", 890 | "0 The Shawshank Redemption Crime \n", 891 | "1 The Godfather Crime \n", 892 | "2 The Godfather: Part II Crime \n", 893 | "3 The Dark Knight Action \n", 894 | "4 12 Angry Men Crime " 895 | ] 896 | }, 897 | "execution_count": 15, 898 | "metadata": {}, 899 | "output_type": "execute_result" 900 | } 901 | ], 902 | "source": [ 903 | "brief_imdb_data = pd.merge(ratings_data, genres_data, how='inner', on=['norm_movie', 'year'])\n", 904 | "brief_imdb_data.head()" 905 | ] 906 | }, 907 | { 908 | "cell_type": "markdown", 909 | "metadata": {}, 910 | "source": [ 911 | "We refer to the dataset created above as the **brief_imdb_data** since it only contains two attributes (namely, genre and rating). Henceforth, we are going to use a richer version of the IMDB dataset which we created by integrating a number of files from the \"IMDB Plain Text Data\". If you have completed the first part of this tutorial, then this dataset is already downloaded and stored in *\"imdb_dataset.csv\"* under the _\"data\"_ folder. The following code snippet loads this dataset, does preprocessing on the title and production year of movies, removes the duplicates as before, and prints the size of the dataset." 
912 | ] 913 | }, 914 | { 915 | "cell_type": "code", 916 | "execution_count": 16, 917 | "metadata": { 918 | "collapsed": false 919 | }, 920 | "outputs": [ 921 | { 922 | "data": { 923 | "text/plain": [ 924 | "(869178, 27)" 925 | ] 926 | }, 927 | "execution_count": 16, 928 | "metadata": {}, 929 | "output_type": "execute_result" 930 | } 931 | ], 932 | "source": [ 933 | "# reading the new IMDB dataset\n", 934 | "imdb_data = pd.read_csv('./data/imdb_dataset.csv')\n", 935 | "# let's normalize the title as we did in Part 3 of the tutorial\n", 936 | "imdb_data['norm_title'] = imdb_data['movie_title'].map(preprocess_title)\n", 937 | "imdb_data['norm_year'] = imdb_data['title_year'].map(preprocess_year)\n", 938 | "imdb_data = imdb_data.drop_duplicates(subset=['norm_title', 'norm_year'], keep='first').copy()\n", 939 | "imdb_data.shape" 940 | ] 941 | }, 942 | { 943 | "cell_type": "markdown", 944 | "metadata": {}, 945 | "source": [ 946 | "### Step 2: Integrating the Kaggle and IMDB datasets\n", 947 | "\n", 948 | "A simple approach to integrate the two datasets is to simply join entries that share the same movie title and year of production. The following code reveals that 4,248 matches are found using this simple approach."
949 | ] 950 | }, 951 | { 952 | "cell_type": "code", 953 | "execution_count": 17, 954 | "metadata": { 955 | "collapsed": false 956 | }, 957 | "outputs": [ 958 | { 959 | "data": { 960 | "text/plain": [ 961 | "(4248, 57)" 962 | ] 963 | }, 964 | "execution_count": 17, 965 | "metadata": {}, 966 | "output_type": "execute_result" 967 | } 968 | ], 969 | "source": [ 970 | "data_attempt1 = pd.merge(imdb_data, kaggle_data, how='inner', left_on=['norm_title', 'norm_year'],\n", 971 | " right_on=['norm_movie_title', 'norm_title_year'])\n", 972 | "data_attempt1.shape" 973 | ] 974 | }, 975 | { 976 | "cell_type": "markdown", 977 | "metadata": {}, 978 | "source": [ 979 | "But given that IMDB and Kaggle datasets are collected from different sources, chances are that the name of a movie would be slightly different in these datasets (e.g. \"Wall.E\" vs \"WallE\"). To be able to find such matches, one can look at the similarity of movie titles and consider title with high similarity to be the same entity. BigGorilla's recommendation for doing similarity join across two datasets is the python package **py_stringsimjoin**. The following code snippet uses the **py_stringsimjoin** to match all the titles that have an edit distance of one or less (i.e., there is at most one character that needs to be changed/added/removed to make both titles identical). Once the similarity join is complete, it only selects the title pairs that are produced in the same year." 
980 | ] 981 | }, 982 | { 983 | "cell_type": "code", 984 | "execution_count": 18, 985 | "metadata": { 986 | "collapsed": false 987 | }, 988 | "outputs": [ 989 | { 990 | "name": "stderr", 991 | "output_type": "stream", 992 | "text": [ 993 | "0% 100%\n", 994 | "[##############################] | ETA: 00:00:00\n", 995 | "Total time elapsed: 00:02:01\n" 996 | ] 997 | }, 998 | { 999 | "data": { 1000 | "text/plain": [ 1001 | "(4689, 8)" 1002 | ] 1003 | }, 1004 | "execution_count": 18, 1005 | "metadata": {}, 1006 | "output_type": "execute_result" 1007 | } 1008 | ], 1009 | "source": [ 1010 | "import py_stringsimjoin as ssj\n", 1011 | "import py_stringmatching as sm\n", 1012 | "\n", 1013 | "imdb_data['id'] = range(imdb_data.shape[0])\n", 1014 | "kaggle_data['id'] = range(kaggle_data.shape[0])\n", 1015 | "similar_titles = ssj.edit_distance_join(imdb_data, kaggle_data, 'id', 'id', 'norm_title',\n", 1016 | " 'norm_movie_title', l_out_attrs=['norm_title', 'norm_year'],\n", 1017 | " r_out_attrs=['norm_movie_title', 'norm_title_year'], threshold=1)\n", 1018 | "# selecting the entries that have the same production year\n", 1019 | "data_attempt2 = similar_titles[similar_titles.r_norm_title_year == similar_titles.l_norm_year]\n", 1020 | "data_attempt2.shape" 1021 | ] 1022 | }, 1023 | { 1024 | "cell_type": "markdown", 1025 | "metadata": {}, 1026 | "source": [ 1027 | "We can see that using the similarity join 4,689 titles were matched. Let's look at some of the titles that are matched by the similarity join but are not identical." 1028 | ] 1029 | }, 1030 | { 1031 | "cell_type": "code", 1032 | "execution_count": 19, 1033 | "metadata": { 1034 | "collapsed": false 1035 | }, 1036 | "outputs": [ 1037 | { 1038 | "data": { 1039 | "text/html": [ 1040 | "
\n", 1041 | "\n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 | " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | " \n", 1062 | " \n", 1063 | " \n", 1064 | " \n", 1065 | " \n", 1066 | " \n", 1067 | " \n", 1068 | " \n", 1069 | " \n", 1070 | " \n", 1071 | " \n", 1072 | " \n", 1073 | " \n", 1074 | " \n", 1075 | " \n", 1076 | " \n", 1077 | " \n", 1078 | " \n", 1079 | " \n", 1080 | " \n", 1081 | " \n", 1082 | " \n", 1083 | " \n", 1084 | " \n", 1085 | " \n", 1086 | " \n", 1087 | " \n", 1088 | " \n", 1089 | " \n", 1090 | " \n", 1091 | " \n", 1092 | " \n", 1093 | " \n", 1094 | " \n", 1095 | " \n", 1096 | " \n", 1097 | " \n", 1098 | " \n", 1099 | " \n", 1100 | " \n", 1101 | " \n", 1102 | " \n", 1103 | " \n", 1104 | " \n", 1105 | " \n", 1106 | " \n", 1107 | " \n", 1108 | " \n", 1109 | " \n", 1110 | " \n", 1111 | " \n", 1112 | "
_idl_idr_idl_norm_titlel_norm_yearr_norm_movie_titler_norm_title_year_sim_score
14614685273646world war v2013world war z20131.0
16016028164956grave2012brave20121.0
18018083149058walle2008wall·e20081.0
23923981618867upe2009up20091.0
24624681736667ut2009up20091.0
\n", 1113 | "
" 1114 | ], 1115 | "text/plain": [ 1116 | " _id l_id r_id l_norm_title l_norm_year r_norm_movie_title \\\n", 1117 | "146 146 852736 46 world war v 2013 world war z \n", 1118 | "160 160 281649 56 grave 2012 brave \n", 1119 | "180 180 831490 58 walle 2008 wall·e \n", 1120 | "239 239 816188 67 upe 2009 up \n", 1121 | "246 246 817366 67 ut 2009 up \n", 1122 | "\n", 1123 | " r_norm_title_year _sim_score \n", 1124 | "146 2013 1.0 \n", 1125 | "160 2012 1.0 \n", 1126 | "180 2008 1.0 \n", 1127 | "239 2009 1.0 \n", 1128 | "246 2009 1.0 " 1129 | ] 1130 | }, 1131 | "execution_count": 19, 1132 | "metadata": {}, 1133 | "output_type": "execute_result" 1134 | } 1135 | ], 1136 | "source": [ 1137 | "data_attempt2[data_attempt2.l_norm_title != data_attempt2.r_norm_movie_title].head()" 1138 | ] 1139 | }, 1140 | { 1141 | "cell_type": "markdown", 1142 | "metadata": {}, 1143 | "source": [ 1144 | "While instances such as \"walle\" and \"wall.e\" are correctly matched, we can see that this techniques also makes some errors (e.g., \"grave\" and \"brave\"). This raises the following questions: \"what method should be used for data matching?\" and \"how can we determine the quality of the matching?\". BigGorilla's recommendation for dealing with this problem is using the python package **py_entitymatching** which is developed as part of the [Magellan project](https://sites.google.com/site/anhaidgroup/projects/magellan).\n", 1145 | "\n", 1146 | "In the next step, we demonstrate how **py_entitymatching** uses machine learning techniques for the data-matching purposes as well as how it enables us to evaluate the quality of the produced matching.\n", 1147 | "\n", 1148 | "### Step 3: Using Magellan for data matching\n", 1149 | "\n", 1150 | "#### Substep A: Finding a candiate set (Blocking)\n", 1151 | "The goal of this step is to limit the number of pairs that we consider as potential matches using a simple heuristic. 
For this task, we can create a new column in each dataset that combines the values of important attributes into a single string (which we call the **mixture**). Then, we can use the string similarity join as before to find a set of entities that have some overlap in the values of the important columns. Before doing that, we need to transform the columns that are part of the mixture to strings. The **py_stringsimjoin** package allows us to do so easily." 1152 | ] 1153 | }, 1154 | { 1155 | "cell_type": "code", 1156 | "execution_count": 20, 1157 | "metadata": { 1158 | "collapsed": false 1159 | }, 1160 | "outputs": [], 1161 | "source": [ 1162 | "# transforming the \"budget\" column into string and creating a new **mixture** column\n", 1163 | "ssj.utils.converter.dataframe_column_to_str(imdb_data, 'budget', inplace=True)\n", 1164 | "imdb_data['mixture'] = imdb_data['norm_title'] + ' ' + imdb_data['norm_year'] + ' ' + imdb_data['budget']\n", 1165 | "\n", 1166 | "# repeating the same thing for the Kaggle dataset\n", 1167 | "ssj.utils.converter.dataframe_column_to_str(kaggle_data, 'budget', inplace=True)\n", 1168 | "kaggle_data['mixture'] = kaggle_data['norm_movie_title'] + ' ' + kaggle_data['norm_title_year'] + \\\n", 1169 | "                      ' ' + kaggle_data['budget']" 1170 | ] 1171 | }, 1172 | { 1173 | "cell_type": "markdown", 1174 | "metadata": {}, 1175 | "source": [ 1176 | "Now, we can use the **mixture** columns to create a desired candidate set which we call **C**."
1177 | ] 1178 | }, 1179 | { 1180 | "cell_type": "code", 1181 | "execution_count": 21, 1182 | "metadata": { 1183 | "collapsed": false 1184 | }, 1185 | "outputs": [ 1186 | { 1187 | "name": "stderr", 1188 | "output_type": "stream", 1189 | "text": [ 1190 | "0% 100%\n", 1191 | "[##############################] | ETA: 00:00:00\n", 1192 | "Total time elapsed: 00:00:49\n" 1193 | ] 1194 | }, 1195 | { 1196 | "data": { 1197 | "text/plain": [ 1198 | "(18317, 14)" 1199 | ] 1200 | }, 1201 | "execution_count": 21, 1202 | "metadata": {}, 1203 | "output_type": "execute_result" 1204 | } 1205 | ], 1206 | "source": [ 1207 | "C = ssj.overlap_coefficient_join(kaggle_data, imdb_data, 'id', 'id', 'mixture', 'mixture', sm.WhitespaceTokenizer(), \n", 1208 | " l_out_attrs=['norm_movie_title', 'norm_title_year', 'duration',\n", 1209 | " 'budget', 'content_rating'],\n", 1210 | " r_out_attrs=['norm_title', 'norm_year', 'duration',\n", 1211 | " 'budget', 'content_rating'],\n", 1212 | " threshold=0.65)\n", 1213 | "C.shape" 1214 | ] 1215 | }, 1216 | { 1217 | "cell_type": "markdown", 1218 | "metadata": {}, 1219 | "source": [ 1220 | "We can see that by doing a similarity join, we already reduced the candidate set to 18,317 pairs.\n", 1221 | "\n", 1222 | "#### Substep B: Specifying the keys \n", 1223 | "The next step is to specify to the **py_entitymatching** package which columns correspond to the keys in each dataframe. Also, we need to specify which columns correspond to the foreign keys of the the two dataframes in the candidate set." 
1224 | ] 1225 | }, 1226 | { 1227 | "cell_type": "code", 1228 | "execution_count": 22, 1229 | "metadata": { 1230 | "collapsed": false 1231 | }, 1232 | "outputs": [ 1233 | { 1234 | "data": { 1235 | "text/plain": [ 1236 | "True" 1237 | ] 1238 | }, 1239 | "execution_count": 22, 1240 | "metadata": {}, 1241 | "output_type": "execute_result" 1242 | } 1243 | ], 1244 | "source": [ 1245 | "import py_entitymatching as em\n", 1246 | "em.set_key(kaggle_data, 'id')   # specifying the key column in the kaggle dataset\n", 1247 | "em.set_key(imdb_data, 'id')     # specifying the key column in the imdb dataset\n", 1248 | "em.set_key(C, '_id')            # specifying the key in the candidate set\n", 1249 | "em.set_ltable(C, kaggle_data)   # specifying the left table \n", 1250 | "em.set_rtable(C, imdb_data)     # specifying the right table\n", 1251 | "em.set_fk_rtable(C, 'r_id')     # specifying the column that matches the key in the right table \n", 1252 | "em.set_fk_ltable(C, 'l_id')     # specifying the column that matches the key in the left table " 1253 | ] 1254 | }, 1255 | { 1256 | "cell_type": "markdown", 1257 | "metadata": {}, 1258 | "source": [ 1259 | "\n", 1260 | "#### Substep C: Debugging the blocker\n", 1261 | "\n", 1262 | "Now, we need to make sure that the candidate set is loose enough to include pairs of movies that are not very close. If this is not the case, there is a chance that we have eliminated pairs that could be potentially matched together. By looking at a few pairs from the candidate set, we can judge whether the blocking step has been too harsh or not.\n", 1263 | "\n", 1264 | "*Note: The **py_entitymatching** package provides some tools for debugging the blocker as well.*" 1265 | ] 1266 | }, 1267 | { 1268 | "cell_type": "code", 1269 | "execution_count": 23, 1270 | "metadata": { 1271 | "collapsed": false 1272 | }, 1273 | "outputs": [ 1274 | { 1275 | "data": { 1276 | "text/html": [ 1277 | "
\n", 1278 | "\n", 1279 | " \n", 1280 | " \n", 1281 | " \n", 1282 | " \n", 1283 | " \n", 1284 | " \n", 1285 | " \n", 1286 | " \n", 1287 | " \n", 1288 | " \n", 1289 | " \n", 1290 | " \n", 1291 | " \n", 1292 | " \n", 1293 | " \n", 1294 | " \n", 1295 | " \n", 1296 | " \n", 1297 | " \n", 1298 | " \n", 1299 | " \n", 1300 | " \n", 1301 | " \n", 1302 | " \n", 1303 | " \n", 1304 | " \n", 1305 | " \n", 1306 | " \n", 1307 | " \n", 1308 | " \n", 1309 | " \n", 1310 | " \n", 1311 | " \n", 1312 | " \n", 1313 | " \n", 1314 | " \n", 1315 | " \n", 1316 | " \n", 1317 | " \n", 1318 | " \n", 1319 | " \n", 1320 | " \n", 1321 | " \n", 1322 | " \n", 1323 | " \n", 1324 | " \n", 1325 | " \n", 1326 | " \n", 1327 | " \n", 1328 | " \n", 1329 | " \n", 1330 | " \n", 1331 | " \n", 1332 | " \n", 1333 | " \n", 1334 | " \n", 1335 | " \n", 1336 | " \n", 1337 | " \n", 1338 | " \n", 1339 | " \n", 1340 | " \n", 1341 | " \n", 1342 | " \n", 1343 | " \n", 1344 | " \n", 1345 | " \n", 1346 | " \n", 1347 | " \n", 1348 | " \n", 1349 | "
l_norm_movie_titler_norm_titlel_norm_title_yearr_norm_yearl_budgetr_budgetl_content_ratingr_mpaa
0dude wheres my dog!#hacked201420142000020000PGNaN
1road hard#horror2015201515000001500000NaNNaN
2#horror#horror2015201515000001500000Not RatedNaN
3me you and five bucks#horror2015201515000001500000NaNNaN
4checkmate#horror2015201515000001500000NaNNaN
\n", 1350 | "
" 1351 | ], 1352 | "text/plain": [ 1353 | " l_norm_movie_title r_norm_title l_norm_title_year r_norm_year l_budget \\\n", 1354 | "0 dude wheres my dog! #hacked 2014 2014 20000 \n", 1355 | "1 road hard #horror 2015 2015 1500000 \n", 1356 | "2 #horror #horror 2015 2015 1500000 \n", 1357 | "3 me you and five bucks #horror 2015 2015 1500000 \n", 1358 | "4 checkmate #horror 2015 2015 1500000 \n", 1359 | "\n", 1360 | " r_budget l_content_rating r_mpaa \n", 1361 | "0 20000 PG NaN \n", 1362 | "1 1500000 NaN NaN \n", 1363 | "2 1500000 Not Rated NaN \n", 1364 | "3 1500000 NaN NaN \n", 1365 | "4 1500000 NaN NaN " 1366 | ] 1367 | }, 1368 | "execution_count": 23, 1369 | "metadata": {}, 1370 | "output_type": "execute_result" 1371 | } 1372 | ], 1373 | "source": [ 1374 | "C[['l_norm_movie_title', 'r_norm_title', 'l_norm_title_year', 'r_norm_year',\n", 1375 | " 'l_budget', 'r_budget', 'l_content_rating', 'r_content_rating']].head()" 1376 | ] 1377 | }, 1378 | { 1379 | "cell_type": "markdown", 1380 | "metadata": {}, 1381 | "source": [ 1382 | "Based on the above sample we can see that the blocking seems to be reasonable.\n", 1383 | "\n", 1384 | "#### Substep D: Sampling from the candiate set\n", 1385 | "\n", 1386 | "The goal of this step is to obtain a sample from the candidate set and manually label the sampled candidates; that is, to specify if the candiate pair is a correct match or not." 
1387 | ] 1388 | }, 1389 | { 1390 | "cell_type": "code", 1391 | "execution_count": 24, 1392 | "metadata": { 1393 | "collapsed": false 1394 | }, 1395 | "outputs": [], 1396 | "source": [ 1397 | "# Sampling 500 pairs and writing this sample into a .csv file\n", 1398 | "sampled = C.sample(500, random_state=0)\n", 1399 | "sampled.to_csv('./data/sampled.csv', encoding='utf-8')" 1400 | ] 1401 | }, 1402 | { 1403 | "cell_type": "markdown", 1404 | "metadata": {}, 1405 | "source": [ 1406 | "In order to label the sampled data, we can create a new column in the _.csv_ file (which we call **label**) and put value 1 under that column if the pair is a correct match and 0 otherwise. To avoid overwriting the files, let's rename the new file as **labeled.csv**." 1407 | ] 1408 | }, 1409 | { 1410 | "cell_type": "code", 1411 | "execution_count": 25, 1412 | "metadata": { 1413 | "collapsed": false 1414 | }, 1415 | "outputs": [ 1416 | { 1417 | "name": "stderr", 1418 | "output_type": "stream", 1419 | "text": [ 1420 | "Metadata file is not present in the given path; proceeding to read the csv file.\n" 1421 | ] 1422 | }, 1423 | { 1424 | "data": { 1425 | "text/html": [ 1426 | "
\n", 1427 | "\n", 1428 | " \n", 1429 | " \n", 1430 | " \n", 1431 | " \n", 1432 | " \n", 1433 | " \n", 1434 | " \n", 1435 | " \n", 1436 | " \n", 1437 | " \n", 1438 | " \n", 1439 | " \n", 1440 | " \n", 1441 | " \n", 1442 | " \n", 1443 | " \n", 1444 | " \n", 1445 | " \n", 1446 | " \n", 1447 | " \n", 1448 | " \n", 1449 | " \n", 1450 | " \n", 1451 | " \n", 1452 | " \n", 1453 | " \n", 1454 | " \n", 1455 | " \n", 1456 | " \n", 1457 | " \n", 1458 | " \n", 1459 | " \n", 1460 | " \n", 1461 | " \n", 1462 | " \n", 1463 | " \n", 1464 | " \n", 1465 | " \n", 1466 | " \n", 1467 | " \n", 1468 | " \n", 1469 | " \n", 1470 | " \n", 1471 | " \n", 1472 | " \n", 1473 | " \n", 1474 | " \n", 1475 | " \n", 1476 | " \n", 1477 | " \n", 1478 | " \n", 1479 | " \n", 1480 | " \n", 1481 | " \n", 1482 | " \n", 1483 | " \n", 1484 | " \n", 1485 | " \n", 1486 | " \n", 1487 | " \n", 1488 | " \n", 1489 | " \n", 1490 | " \n", 1491 | " \n", 1492 | " \n", 1493 | " \n", 1494 | " \n", 1495 | " \n", 1496 | " \n", 1497 | " \n", 1498 | " \n", 1499 | " \n", 1500 | " \n", 1501 | " \n", 1502 | " \n", 1503 | " \n", 1504 | " \n", 1505 | " \n", 1506 | " \n", 1507 | " \n", 1508 | " \n", 1509 | " \n", 1510 | " \n", 1511 | " \n", 1512 | " \n", 1513 | " \n", 1514 | " \n", 1515 | " \n", 1516 | " \n", 1517 | " \n", 1518 | " \n", 1519 | " \n", 1520 | " \n", 1521 | " \n", 1522 | " \n", 1523 | " \n", 1524 | " \n", 1525 | " \n", 1526 | " \n", 1527 | " \n", 1528 | " \n", 1529 | " \n", 1530 | " \n", 1531 | " \n", 1532 | " \n", 1533 | " \n", 1534 | " \n", 1535 | " \n", 1536 | " \n", 1537 | " \n", 1538 | " \n", 1539 | " \n", 1540 | " \n", 1541 | " \n", 1542 | " \n", 1543 | " \n", 1544 | " \n", 1545 | " \n", 1546 | "
Unnamed: 0_idl_idr_idl_norm_movie_titlel_norm_title_yearl_durationl_budgetl_content_ratingr_norm_titler_norm_yearr_lengthr_budgetr_mpaa_sim_scorelabel
0477147712639235925eye of the beholder1999109.015000000Reye of the beholder1999109.035000000R0.8333331
111478114782001600301rocky balboa2006139.024000000PGrocky balboa2006139.024000000PG1.0000001
213630136304160691766from russia with love1963115.02000000Approvedthe aeolians: from russia with love2012NaN20000NaN0.6666670
3197219721248101029sex tape201494.040000000Rblended2014117.040000000PG-130.6666670
41590315903722758133the scorch trials2015132.061000000PG-13the scorch trials2015132.061000000PG-131.0000001
\n", 1547 | "
" 1548 | ], 1549 | "text/plain": [ 1550 | " Unnamed: 0 _id l_id r_id l_norm_movie_title l_norm_title_year \\\n", 1551 | "0 4771 4771 2639 235925 eye of the beholder 1999 \n", 1552 | "1 11478 11478 2001 600301 rocky balboa 2006 \n", 1553 | "2 13630 13630 4160 691766 from russia with love 1963 \n", 1554 | "3 1972 1972 1248 101029 sex tape 2014 \n", 1555 | "4 15903 15903 722 758133 the scorch trials 2015 \n", 1556 | "\n", 1557 | " l_duration l_budget l_content_rating r_norm_title \\\n", 1558 | "0 109.0 15000000 R eye of the beholder \n", 1559 | "1 139.0 24000000 PG rocky balboa \n", 1560 | "2 115.0 2000000 Approved the aeolians: from russia with love \n", 1561 | "3 94.0 40000000 R blended \n", 1562 | "4 132.0 61000000 PG-13 the scorch trials \n", 1563 | "\n", 1564 | " r_norm_year r_length r_budget r_mpaa _sim_score label \n", 1565 | "0 1999 109.0 35000000 R 0.833333 1 \n", 1566 | "1 2006 139.0 24000000 PG 1.000000 1 \n", 1567 | "2 2012 NaN 20000 NaN 0.666667 0 \n", 1568 | "3 2014 117.0 40000000 PG-13 0.666667 0 \n", 1569 | "4 2015 132.0 61000000 PG-13 1.000000 1 " 1570 | ] 1571 | }, 1572 | "execution_count": 25, 1573 | "metadata": {}, 1574 | "output_type": "execute_result" 1575 | } 1576 | ], 1577 | "source": [ 1578 | "# If you would like to avoid labeling the pairs for now, you can download the labled.csv file from\n", 1579 | "# BigGorilla using the following command (if you prefer to do it yourself, commend the next line)\n", 1580 | "response = urllib.request.urlretrieve('https://anaconda.org/BigGorilla/datasets/1/download/labeled.csv',\n", 1581 | " './data/labeled.csv')\n", 1582 | "labeled = em.read_csv_metadata('data/labeled.csv', ltable=kaggle_data, rtable=imdb_data,\n", 1583 | " fk_ltable='l_id', fk_rtable='r_id', key='_id')\n", 1584 | "labeled.head()" 1585 | ] 1586 | }, 1587 | { 1588 | "cell_type": "markdown", 1589 | "metadata": {}, 1590 | "source": [ 1591 | "#### Substep E: Traning machine learning algorithms\n", 1592 | "\n", 1593 | "Now we can use the sampled 
dataset to train various machine learning algorithms for our prediction task. To do so, we need to split our dataset into a training and a test set, and then select the desired machine learning techniques for our prediction task." 1594 | ] 1595 | }, 1596 | { 1597 | "cell_type": "code", 1598 | "execution_count": 26, 1599 | "metadata": { 1600 | "collapsed": true 1601 | }, 1602 | "outputs": [], 1603 | "source": [ 1604 | "split = em.split_train_test(labeled, train_proportion=0.5, random_state=0)\n", 1605 | "train_data = split['train']\n", 1606 | "test_data = split['test']\n", 1607 | "\n", 1608 | "dt = em.DTMatcher(name='DecisionTree', random_state=0)\n", 1609 | "svm = em.SVMMatcher(name='SVM', random_state=0)\n", 1610 | "rf = em.RFMatcher(name='RF', random_state=0)\n", 1611 | "lg = em.LogRegMatcher(name='LogReg', random_state=0)\n", 1612 | "ln = em.LinRegMatcher(name='LinReg')\n", 1613 | "nb = em.NBMatcher(name='NaiveBayes')" 1614 | ] 1615 | }, 1616 | { 1617 | "cell_type": "markdown", 1618 | "metadata": {}, 1619 | "source": [ 1620 | "Before we can apply any machine learning technique, we need to extract a set of features. Fortunately, the **py_entitymatching** package can automatically extract a set of features once we specify which columns in the two datasets correspond to each other. The following code snippet starts by specifying the correspondence between the column of the two datasets. Then, it uses the **py_entitymatching** package to determine the type of each column. By considering the types of columns in each dataset (stored in variables *l_attr_types* and *r_attr_types*), and using the tokenizers and similarity functions suggested by the package, we can extract a set of instructions for extracting features. Note that variable **F** is not the set of extracted features, rather it encodes the instructions for computing the features." 
1621 | ] 1622 | }, 1623 | { 1624 | "cell_type": "code", 1625 | "execution_count": 27, 1626 | "metadata": { 1627 | "collapsed": true 1628 | }, 1629 | "outputs": [], 1630 | "source": [ 1631 | "attr_corres = em.get_attr_corres(kaggle_data, imdb_data)\n", 1632 | "attr_corres['corres'] = [('norm_movie_title', 'norm_title'), \n", 1633 | " ('norm_title_year', 'norm_year'),\n", 1634 | " ('content_rating', 'content_rating'),\n", 1635 | " ('budget', 'budget'),\n", 1636 | "]\n", 1637 | "\n", 1638 | "l_attr_types = em.get_attr_types(kaggle_data)\n", 1639 | "r_attr_types = em.get_attr_types(imdb_data)\n", 1640 | "\n", 1641 | "tok = em.get_tokenizers_for_matching()\n", 1642 | "sim = em.get_sim_funs_for_matching()\n", 1643 | "\n", 1644 | "F = em.get_features(kaggle_data, imdb_data, l_attr_types, r_attr_types, attr_corres, tok, sim)" 1645 | ] 1646 | }, 1647 | { 1648 | "cell_type": "markdown", 1649 | "metadata": {}, 1650 | "source": [ 1651 | "Given the set of desired features **F**, we can now calculate the feature values for our training data and also impute the missing values in our data. In this case, we choose to replace the missing values with the mean of the column." 1652 | ] 1653 | }, 1654 | { 1655 | "cell_type": "code", 1656 | "execution_count": 28, 1657 | "metadata": { 1658 | "collapsed": true 1659 | }, 1660 | "outputs": [], 1661 | "source": [ 1662 | "train_features = em.extract_feature_vecs(train_data, feature_table=F, attrs_after='label', show_progress=False) \n", 1663 | "train_features = em.impute_table(train_features, exclude_attrs=['_id', 'l_id', 'r_id', 'label'], strategy='mean')" 1664 | ] 1665 | }, 1666 | { 1667 | "cell_type": "markdown", 1668 | "metadata": {}, 1669 | "source": [ 1670 | "Using the calculated features, we can evaluate the performance of different machine learning algorithms and select the best one for our matching task." 
1671 | ] 1672 | }, 1673 | { 1674 | "cell_type": "code", 1675 | "execution_count": 29, 1676 | "metadata": { 1677 | "collapsed": false 1678 | }, 1679 | "outputs": [ 1680 | { 1681 | "data": { 1682 | "text/html": [ 1683 | "
\n", 1684 | "\n", 1685 | " \n", 1686 | " \n", 1687 | " \n", 1688 | " \n", 1689 | " \n", 1690 | " \n", 1691 | " \n", 1692 | " \n", 1693 | " \n", 1694 | " \n", 1695 | " \n", 1696 | " \n", 1697 | " \n", 1698 | " \n", 1699 | " \n", 1700 | " \n", 1701 | " \n", 1702 | " \n", 1703 | " \n", 1704 | " \n", 1705 | " \n", 1706 | " \n", 1707 | " \n", 1708 | " \n", 1709 | " \n", 1710 | " \n", 1711 | " \n", 1712 | " \n", 1713 | " \n", 1714 | " \n", 1715 | " \n", 1716 | " \n", 1717 | " \n", 1718 | " \n", 1719 | " \n", 1720 | " \n", 1721 | " \n", 1722 | " \n", 1723 | " \n", 1724 | " \n", 1725 | " \n", 1726 | " \n", 1727 | " \n", 1728 | " \n", 1729 | " \n", 1730 | " \n", 1731 | " \n", 1732 | " \n", 1733 | " \n", 1734 | " \n", 1735 | " \n", 1736 | " \n", 1737 | " \n", 1738 | " \n", 1739 | " \n", 1740 | " \n", 1741 | " \n", 1742 | " \n", 1743 | " \n", 1744 | " \n", 1745 | " \n", 1746 | " \n", 1747 | " \n", 1748 | " \n", 1749 | " \n", 1750 | " \n", 1751 | " \n", 1752 | " \n", 1753 | " \n", 1754 | " \n", 1755 | " \n", 1756 | " \n", 1757 | " \n", 1758 | " \n", 1759 | " \n", 1760 | " \n", 1761 | " \n", 1762 | " \n", 1763 | " \n", 1764 | " \n", 1765 | " \n", 1766 | " \n", 1767 | " \n", 1768 | " \n", 1769 | " \n", 1770 | " \n", 1771 | " \n", 1772 | " \n", 1773 | "
NameMatcherNum foldsFold 1Fold 2Fold 3Fold 4Fold 5Mean score
0DecisionTree<py_entitymatching.matcher.dtmatcher.DTMatcher object at 0x160ef1e80>51.0000000.9677421.01.0000001.0000.993548
1RF<py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x160ef1240>51.0000000.9677421.01.0000001.0000.993548
2SVM<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x160ef1f60>50.9565220.9677421.01.0000000.8750.959853
3LinReg<py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x160ef17f0>51.0000000.9677421.01.0000001.0000.993548
4LogReg<py_entitymatching.matcher.logregmatcher.LogRegMatcher object at 0x160ef1438>51.0000000.9677421.00.9565221.0000.984853
5NaiveBayes<py_entitymatching.matcher.nbmatcher.NBMatcher object at 0x160ef1d30>51.0000000.9677421.01.0000001.0000.993548
\n", 1774 | "
" 1775 | ], 1776 | "text/plain": [ 1777 | " Name \\\n", 1778 | "0 DecisionTree \n", 1779 | "1 RF \n", 1780 | "2 SVM \n", 1781 | "3 LinReg \n", 1782 | "4 LogReg \n", 1783 | "5 NaiveBayes \n", 1784 | "\n", 1785 | " Matcher \\\n", 1786 | "0 \n", 1787 | "1 \n", 1788 | "2 \n", 1789 | "3 \n", 1790 | "4 \n", 1791 | "5 \n", 1792 | "\n", 1793 | " Num folds Fold 1 Fold 2 Fold 3 Fold 4 Fold 5 Mean score \n", 1794 | "0 5 1.000000 0.967742 1.0 1.000000 1.000 0.993548 \n", 1795 | "1 5 1.000000 0.967742 1.0 1.000000 1.000 0.993548 \n", 1796 | "2 5 0.956522 0.967742 1.0 1.000000 0.875 0.959853 \n", 1797 | "3 5 1.000000 0.967742 1.0 1.000000 1.000 0.993548 \n", 1798 | "4 5 1.000000 0.967742 1.0 0.956522 1.000 0.984853 \n", 1799 | "5 5 1.000000 0.967742 1.0 1.000000 1.000 0.993548 " 1800 | ] 1801 | }, 1802 | "execution_count": 29, 1803 | "metadata": {}, 1804 | "output_type": "execute_result" 1805 | } 1806 | ], 1807 | "source": [ 1808 | "result = em.select_matcher([dt, rf, svm, ln, lg, nb], table=train_features, \n", 1809 | " exclude_attrs=['_id', 'l_id', 'r_id', 'label'], k=5,\n", 1810 | " target_attr='label', metric='f1', random_state=0)\n", 1811 | "result['cv_stats']" 1812 | ] 1813 | }, 1814 | { 1815 | "cell_type": "markdown", 1816 | "metadata": {}, 1817 | "source": [ 1818 | "We can observe, based on the reported accuracy of different techniques, that the \"random forest (RF)\" algorithm achieves the best performance. Thus, it is best to use this technique for the matching." 1819 | ] 1820 | }, 1821 | { 1822 | "cell_type": "markdown", 1823 | "metadata": {}, 1824 | "source": [ 1825 | "#### Substep F: Evaluating the quality of our matching\n", 1826 | "\n", 1827 | "It is important to evaluate the quality of our matching. We can now use the training set for this purpose and measure how well the random forest predicts the matches. We can see that we are obtaining a high accuracy and recall on the test set as well."
1828 | ] 1829 | }, 1830 | { 1831 | "cell_type": "code", 1832 | "execution_count": null, 1833 | "metadata": { 1834 | "collapsed": false 1835 | }, 1836 | "outputs": [ 1837 | { 1838 | "name": "stdout", 1839 | "output_type": "stream", 1840 | "text": [ 1841 | "Precision : 94.44% (51/54)\n", 1842 | "Recall : 100.0% (51/51)\n", 1843 | "F1 : 97.14%\n", 1844 | "False positives : 3 (out of 54 positive predictions)\n", 1845 | "False negatives : 0 (out of 196 negative predictions)\n" 1846 | ] 1847 | } 1848 | ], 1849 | "source": [ 1850 | "best_model = result['selected_matcher']\n", 1851 | "best_model.fit(table=train_features, exclude_attrs=['_id', 'l_id', 'r_id', 'label'], target_attr='label')\n", 1852 | "\n", 1853 | "test_features = em.extract_feature_vecs(test_data, feature_table=F, attrs_after='label', show_progress=False)\n", 1854 | "test_features = em.impute_table(test_features, exclude_attrs=['_id', 'l_id', 'r_id', 'label'], strategy='mean')\n", 1855 | "\n", 1856 | "# Predict on the test data\n", 1857 | "predictions = best_model.predict(table=test_features, exclude_attrs=['_id', 'l_id', 'r_id', 'label'], \n", 1858 | " append=True, target_attr='predicted', inplace=False)\n", 1859 | "\n", 1860 | "# Evaluate the predictions\n", 1861 | "eval_result = em.eval_matches(predictions, 'label', 'predicted')\n", 1862 | "em.print_eval_summary(eval_result)" 1863 | ] 1864 | }, 1865 | { 1866 | "cell_type": "markdown", 1867 | "metadata": {}, 1868 | "source": [ 1869 | "#### Substep G: Using the trained model to match the datasets\n", 1870 | "\n", 1871 | "Now, we can use the trained model to match the two tables as follows:" 1872 | ] 1873 | }, 1874 | { 1875 | "cell_type": "code", 1876 | "execution_count": null, 1877 | "metadata": { 1878 | "collapsed": false 1879 | }, 1880 | "outputs": [ 1881 | { 1882 | "name": "stderr", 1883 | "output_type": "stream", 1884 | "text": [ 1885 | "0% 100%\n", 1886 | "[ ]" 1887 | ] 1888 | } 1889 | ], 1890 | "source": [ 1891 | "candset_features = 
em.extract_feature_vecs(C, feature_table=F, show_progress=True)\n", 1892 | "candset_features = em.impute_table(candset_features, exclude_attrs=['_id', 'l_id', 'r_id'], strategy='mean')\n", 1893 | "predictions = best_model.predict(table=candset_features, exclude_attrs=['_id', 'l_id', 'r_id'],\n", 1894 | " append=True, target_attr='predicted', inplace=False)\n", 1895 | "matches = predictions[predictions.predicted == 1] " 1896 | ] 1897 | }, 1898 | { 1899 | "cell_type": "markdown", 1900 | "metadata": {}, 1901 | "source": [ 1902 | "Note that the **matches** dataframe contains many columns storing the extracted features for both datasets. The following code snippet removes all the unnecessary columns and creates a nicely formatted dataframe that has the resulting integrated dataset." 1903 | ] 1904 | }, 1905 | { 1906 | "cell_type": "code", 1907 | "execution_count": null, 1908 | "metadata": { 1909 | "collapsed": false, 1910 | "scrolled": true 1911 | }, 1912 | "outputs": [], 1913 | "source": [ 1914 | "from py_entitymatching.catalog import catalog_manager as cm\n", 1915 | "matches = matches[['_id', 'l_id', 'r_id', 'predicted']]\n", 1916 | "matches.reset_index(drop=True, inplace=True)\n", 1917 | "cm.set_candset_properties(matches, '_id', 'l_id', 'r_id', kaggle_data, imdb_data)\n", 1918 | "matches = em.add_output_attributes(matches, l_output_attrs=['norm_movie_title', 'norm_title_year', 'budget', 'content_rating'],\n", 1919 | " r_output_attrs=['norm_title', 'norm_year', 'budget', 'content_rating'],\n", 1920 | " l_output_prefix='l_', r_output_prefix='r_',\n", 1921 | " delete_from_catalog=False)\n", 1922 | "matches.drop('predicted', axis=1, inplace=True)\n", 1923 | "matches.head()" 1924 | ] 1925 | } 1926 | ], 1927 | "metadata": { 1928 | "anaconda-cloud": {}, 1929 | "kernelspec": { 1930 | "display_name": "Python [conda env:py3k]", 1931 | "language": "python", 1932 | "name": "conda-env-py3k-py" 1933 | }, 1934 | "language_info": { 1935 | "codemirror_mode": { 1936 | "name": "ipython", 
1937 | "version": 3 1938 | }, 1939 | "file_extension": ".py", 1940 | "mimetype": "text/x-python", 1941 | "name": "python", 1942 | "nbconvert_exporter": "python", 1943 | "pygments_lexer": "ipython3", 1944 | "version": "3.5.2" 1945 | } 1946 | }, 1947 | "nbformat": 4, 1948 | "nbformat_minor": 1 1949 | } 1950 | --------------------------------------------------------------------------------