├── restaurant_scraper ├── restaurantSpider │ ├── __init__.py │ ├── spiders │ │ ├── __init__.py │ │ ├── README │ │ ├── extractRawText.py │ │ └── EaterSpider.py │ ├── items.py │ ├── pipelines.py │ └── settings.py ├── README └── scrapy.cfg ├── README.md ├── CONTRIBUTING.txt ├── wikipage_info_extractor └── wiki_home_construction_crawl.py ├── matching_schemas └── example.py ├── wiki_to_json └── Wikipedia_JSON_Generator.ipynb └── matching_movies ├── Tutorial_py3.py ├── Tutorial_py2.py └── Tutorial_py3.ipynb /restaurant_scraper/restaurantSpider/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /restaurant_scraper/README: -------------------------------------------------------------------------------- 1 | Goto subdirectory restuarantSpider/spiders/README for the actual readme file. 2 | -------------------------------------------------------------------------------- /restaurant_scraper/restaurantSpider/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /restaurant_scraper/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = restaurantSpider.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = restaurantSpider 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # BigGorilla 2 | BigGorilla is an open-source data integration and data preparation ecosystem 3 | (powered by Python) to enable data scientists to perform integration and 4 | analysis of data. Learn more about BigGorilla at [www.biggorilla.org](http://www.biggorilla.org). 5 | 6 | ## Directories: 7 | * packages: contains packages developed as part of BigGorilla 8 | * workflows: contains notebooks and python scripts 9 | -------------------------------------------------------------------------------- /restaurant_scraper/restaurantSpider/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | class RestaurantSpiderItem(scrapy.Item): 11 | # define the fields for your item here like: 12 | # url: url of page being crawled 13 | # title: title of the page 14 | # date: date that the page is posted 15 | # content: the crawled content 16 | url = scrapy.Field() 17 | title = scrapy.Field() 18 | date = scrapy.Field() 19 | content = scrapy.Field() 20 | -------------------------------------------------------------------------------- 
/restaurant_scraper/restaurantSpider/spiders/README: -------------------------------------------------------------------------------- 1 | # 2 | # Example of a pipeline for scraping content from coffee websites. 3 | # This pipeline can be tailored for different websites. 4 | # 5 | 6 | 7 | # 8 | # scrap with EatersSpider 9 | # 10 | scrapy runspider EaterSpider.py -s OUTFILE="Eater-acquired.json" 11 | 12 | # 13 | # The above generates Eater-acquired.json. 14 | # 15 | # Each url is extracted as one Json item {date:-, url:-, content:[ ]}, 16 | # where content contains the paragraphs under

tags of the url. 17 | # 18 | 19 | # 20 | # Now remove html tags and newline characters from the content 21 | # 22 | ./extractRawText.py Eater-acquired.json Eater-clean.json 23 | 24 | -------------------------------------------------------------------------------- /restaurant_scraper/restaurantSpider/spiders/extractRawText.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | """ 3 | Authored by Wang-Chiew Tan 4 | """ 5 | import re 6 | import json 7 | import sys 8 | 9 | # remove all html tags, new line characters within the extracted paragraphs

10 | def cleanhtml(extracted): 11 | # remove html tags 12 | cleantext = re.sub('<.*?>', '', extracted) 13 | # remove all new lines 14 | cleantext = re.sub('\n *', "", cleantext) 15 | return cleantext 16 | 17 | 18 | def main(): 19 | ifilename = str(sys.argv[1]) 20 | ofilename = str(sys.argv[2]) 21 | with open(ofilename, 'w') as outfile: 22 | with open(ifilename, 'r') as ifile: 23 | for json_line in ifile: 24 | data = json.loads(json_line) 25 | newdata = [] 26 | for s in data["content"]: 27 | newdata.append(cleanhtml(s)) 28 | data["content"] = newdata 29 | json.dump(data, outfile) 30 | 31 | if __name__ == "__main__": 32 | main() 33 | -------------------------------------------------------------------------------- /CONTRIBUTING.txt: -------------------------------------------------------------------------------- 1 | ### CONTRIBUTION TO BIGGORILLA 2 | ------------------------------ 3 | 4 | BigGorilla is an open-source framework for data integration 5 | and data preparation tasks. We encourage all researchers, 6 | engineers, professors and students who work on data preparation 7 | and data integration tasks to contribute to BigGorilla. To make 8 | sure your contribution can be easily deployed by others, the 9 | BigGorilla team will review and test your code. Please read the 10 | following instructions before submitting your contribution. 11 | 12 | * If you are submitting a sample workflow, you need to create a 13 | new folder under the "workflows" folder. 14 | * If you are submitting a python package make sure that you create 15 | a new folder with the name of your package under the "packages" folder. 16 | * Include a file titled "Authors.txt" inside your folder and mention 17 | the creators as well as any publications that should be cited for 18 | academic purposes. 19 | * If your submission uses large datasets, make sure to host them 20 | somewhere else and point to them in your submission (in a README file). 
21 | * Note that by submitting your work, you are making it public for 22 | everyone and can be used for commercial as well as non-commercial and 23 | academic use. 24 | -------------------------------------------------------------------------------- /wikipage_info_extractor/wiki_home_construction_crawl.py: -------------------------------------------------------------------------------- 1 | # This script crawls a couple of wiki urls, extracts the titles and 2 | # the first paragraphs and stores them in a json file. 3 | import urllib2 4 | import json 5 | from bs4 import BeautifulSoup 6 | 7 | data = [] 8 | header = {'User-Agent': 'Mozilla/5.0'} #Needed to prevent 403 error on Wikipedia 9 | wiki_urls = [ 10 | 'https://en.wikipedia.org/wiki/Adobe', 11 | 'https://en.wikipedia.org/wiki/Brick', 12 | 'https://en.wikipedia.org/wiki/Concrete', 13 | 'https://en.wikipedia.org/wiki/Trunk_(botany)', 14 | 'https://en.wikipedia.org/wiki/Metal', 15 | 'https://en.wikipedia.org/wiki/Stone_(disambiguation)', 16 | 'https://en.wikipedia.org/wiki/Rock_(geology)', 17 | 'https://en.wikipedia.org/wiki/Straw', 18 | 'https://en.wikipedia.org/wiki/Wood' 19 | ] 20 | 21 | for wiki in wiki_urls: 22 | feature_dict = {} 23 | req = urllib2.Request(wiki,headers=header) 24 | page = urllib2.urlopen(req) 25 | 26 | #Parse the html in the 'page' variable, and store it in Beautiful Soup format 27 | soup = BeautifulSoup(page, 'html.parser') 28 | 29 | feature_dict["description"] = soup.p.get_text() 30 | feature_dict["title"] = soup.h1.get_text() 31 | data.append(feature_dict) 32 | 33 | 34 | with open('wiki_home_construction_features.json', 'w') as jsonData: 35 | json.dump(data, jsonData) 36 | -------------------------------------------------------------------------------- /restaurant_scraper/restaurantSpider/pipelines.py: -------------------------------------------------------------------------------- 1 | """ 2 | Authored by Wang-Chiew Tan 3 | """ 4 | 5 | # -*- coding: utf-8 -*- 6 | 7 | # Define your item 
pipelines here 8 | # 9 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 10 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 11 | from scrapy import signals 12 | from scrapy.exporters import JsonLinesItemExporter 13 | 14 | 15 | # 16 | # this pipeline writes each item to the file specified. it 17 | # gets called with each item. 18 | # 19 | class RestaurantSpiderPipeline(object): 20 | filename = "" 21 | 22 | @classmethod 23 | def from_crawler(cls, crawler): 24 | settings = crawler.settings 25 | # get the specified filename to write to 26 | filename = settings.get("OUTFILE") 27 | pipeline = cls(filename) 28 | crawler.signals.connect(pipeline.spider_opened, signals.spider_opened) 29 | crawler.signals.connect(pipeline.spider_closed, signals.spider_closed) 30 | return pipeline 31 | 32 | def __init__(self, filename): 33 | # open the file for writing 34 | self.file = open(filename, 'w+b') 35 | 36 | def spider_opened(self, spider): 37 | self.exporter = JsonLinesItemExporter(self.file) 38 | self.exporter.start_exporting() 39 | 40 | def spider_closed(self, spider): 41 | self.exporter.finish_exporting() 42 | self.file.close() 43 | 44 | def process_item(self, item, spider): 45 | self.exporter.export_item(item) 46 | return item 47 | 48 | -------------------------------------------------------------------------------- /matching_schemas/example.py: -------------------------------------------------------------------------------- 1 | import flexmatcher 2 | import pandas as pd 3 | 4 | # The mediated schema has three attributes: movie_name, movie_year, movie_rating 5 | 6 | # Creating the first schema, a subset of its data and the mapping to the mediated schema 7 | vals1 = [['year', 'Movie', 'imdb_rating'], 8 | ['2001', 'Lord of the Rings', '8.8'], 9 | ['2010', 'Inception', '8.7'], 10 | ['1999', 'The Matrix', '8.7']] 11 | header = vals1.pop(0) 12 | data1 = pd.DataFrame(vals1, columns=header) 13 | data1_mapping = {'year': 'movie_year', 'imdb_rating': 
'movie_rating', 'Movie': 'movie_name'} 14 | 15 | # Creating the second schema, a subset of its data and the mapping to the mediated schema 16 | vals2 = [['title', 'produced', 'popularity'], 17 | ['The Godfather', '1972', '9.2'], 18 | ['Silver Linings Playbook', '2012', '7.8'], 19 | ['The Big Short', '2015', '7.8']] 20 | header = vals2.pop(0) 21 | data2 = pd.DataFrame(vals2, columns=header) 22 | data2_mapping = {'popularity': 'movie_rating', 'produced': 'movie_year', 'title': 'movie_name'} 23 | 24 | # Using Flexmatcher 25 | fm = flexmatcher.FlexMatcher() 26 | schema_list = [data1, data2] 27 | mapping_list = [data1_mapping, data2_mapping] 28 | fm.create_training_data(schema_list, mapping_list) 29 | fm.train() 30 | 31 | # Creating a test schmea 32 | vals3 = [['rt', 'id', 'yr'], 33 | ['8.5', 'The Pianist', '2002'], 34 | ['7.7', 'The Social Network', '2010']] 35 | header = vals3.pop(0) 36 | data3 = pd.DataFrame(vals3, columns=header) 37 | print (fm.make_prediction(data3)) 38 | -------------------------------------------------------------------------------- /restaurant_scraper/restaurantSpider/spiders/EaterSpider.py: -------------------------------------------------------------------------------- 1 | """ 2 | Authored by Wang-Chiew Tan 3 | """ 4 | from scrapy.selector import Selector 5 | from scrapy.http import HtmlResponse 6 | from restaurantSpider.items import RestaurantSpiderItem 7 | from scrapy.crawler import CrawlerProcess 8 | import scrapy 9 | #import sys, os 10 | 11 | # 12 | # spider for crawling www.eater.com/review 13 | # 14 | class EaterSpider(scrapy.Spider): 15 | name = "EaterSpider" 16 | allowed_domains = [] 17 | start_urls = [] 18 | 19 | # url to page. You can replace this with your site url 20 | urlstr = "http://myexample.com" 21 | # suppose we are scraping pages 1 to 25 of this url. 
add all urls 22 | # to "start_urls" 23 | for i in xrange(1,25): 24 | start_urls.append(urlstr+str(i)) 25 | 26 | print("=== Start URLs: {}".format(start_urls)) 27 | 28 | def parse(self, response): 29 | print "=== Starting to crawl the website === " 30 | # 31 | # The following are all hypothetical. We will extract the urls 32 | # in each page (which we will scrap individually). We will 33 | # also collect the corresponding titles and dates of the urls. 34 | # 35 | urls = response.selector.xpath('//h3/a[@data-analytics-link="review"]/@href').extract() 36 | titles = response.selector.xpath('//h3/a[@data-analytics-link="review"]/text()').extract() 37 | dates = response.selector.xpath('//div[@class="m-entry-box__body"]/p/span[@class="p-byline__time"]/text()').extract() 38 | 39 | items = [] 40 | for j in xrange(0,len(urls)): 41 | # item(url,title,date,content) is defined in items.py 42 | i = RestaurantSpiderItem(url=urls[j], title=titles[j], date=dates[j]) 43 | items.append(i) 44 | # start scraping the content 45 | request = scrapy.Request(url=urls[j], callback=self.parse_cafe, errback=self.parse_error) 46 | request.meta['item'] = i # pass item information to pass to parse_cafe 47 | yield request 48 | 49 | # capture and print error messages on console if needed 50 | def parse_error(self, response): 51 | item = response.meta['item'] 52 | print("=== Error on {} ===".format(item['url'])) 53 | yield item 54 | 55 | def parse_cafe(self, response): 56 | item = response.meta['item'] 57 | print("=== Retrieving {} ===".format(item['url'])) 58 | # extracting all paragraphs from the article 59 | item['content'] = response.selector.xpath('//p').extract() 60 | yield item 61 | -------------------------------------------------------------------------------- /restaurant_scraper/restaurantSpider/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for restaurantSpider project 4 | # 5 | # For 
simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'restaurantSpider' 13 | 14 | SPIDER_MODULES = ['restaurantSpider.spiders'] 15 | NEWSPIDER_MODULE = 'restaurantSpider.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | USER_AGENT = 'restaurantSpider (+http://www.yourdomain.com)' 20 | #USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.102 Safari/537.36" 21 | 22 | # Obey robots.txt rules 23 | ROBOTSTXT_OBEY = False 24 | 25 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 26 | CONCURRENT_REQUESTS = 32 27 | 28 | # Configure a delay for requests for the same website (default: 0) 29 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 30 | # See also autothrottle settings and docs 31 | DOWNLOAD_DELAY = 3 32 | # The download delay setting will honor only one of: 33 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 34 | #CONCURRENT_REQUESTS_PER_IP = 16 35 | 36 | # Disable cookies (enabled by default) 37 | #COOKIES_ENABLED = False 38 | 39 | # Disable Telnet Console (enabled by default) 40 | #TELNETCONSOLE_ENABLED = False 41 | 42 | # Override the default request headers: 43 | #DEFAULT_REQUEST_HEADERS = { 44 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 45 | # 'Accept-Language': 'en', 46 | #} 47 | 48 | # Enable or disable spider middlewares 49 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 50 | #SPIDER_MIDDLEWARES = { 51 | # 'restaurantSpider.middlewares.MyCustomSpiderMiddleware': 543, 52 | #} 53 | 54 | # 
Enable or disable downloader middlewares 55 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 56 | #DOWNLOADER_MIDDLEWARES = { 57 | # 'restaurantSpider.middlewares.MyCustomDownloaderMiddleware': 543, 58 | #} 59 | 60 | # Enable or disable extensions 61 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 62 | #EXTENSIONS = { 63 | # 'scrapy.extensions.telnet.TelnetConsole': None, 64 | #} 65 | 66 | # Configure item pipelines 67 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 68 | ITEM_PIPELINES = { 69 | 'restaurantSpider.pipelines.RestaurantSpiderPipeline': 300, 70 | } 71 | 72 | 73 | # Enable and configure the AutoThrottle extension (disabled by default) 74 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 75 | #AUTOTHROTTLE_ENABLED = True 76 | # The initial download delay 77 | #AUTOTHROTTLE_START_DELAY = 5 78 | # The maximum download delay to be set in case of high latencies 79 | #AUTOTHROTTLE_MAX_DELAY = 60 80 | # The average number of requests Scrapy should be sending in parallel to 81 | # each remote server 82 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 83 | # Enable showing throttling stats for every response received: 84 | #AUTOTHROTTLE_DEBUG = False 85 | 86 | # Enable and configure HTTP caching (disabled by default) 87 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 88 | #HTTPCACHE_ENABLED = True 89 | #HTTPCACHE_EXPIRATION_SECS = 0 90 | #HTTPCACHE_DIR = 'httpcache' 91 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 92 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 93 | -------------------------------------------------------------------------------- /wiki_to_json/Wikipedia_JSON_Generator.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Wikipedia JSON 
Generator\n", 8 | "-------\n", 9 | "\n", 10 | "## This ipython notebook provides a quick and dirty implementation converting Wikipedia raw texts into JSON format.\n", 11 | "\n", 12 | "1. In the code below, the file wikipedia.txt is a small sample of the Wikipedia dump file for the purposes of illustrating our code. The filw is obtained as follows:\n", 13 | " Apply [WikiExtractor](http://medialab.di.unipi.it/Project/SemaWiki/Tools/WikiExtractor.py) on the wikipedia dump file, such as [this](https://dumps.wikimedia.org/enwiki/20161101/enwiki-20161101-pages-articles.xml.bz2). Obtain a small sample of the resulting file, which is wikipedia.txt. \n", 14 | "\n", 15 | "2. Retrieve (article title, article content text) pairs and generate a JSON file" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "metadata": { 22 | "collapsed": true 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "# You can download a sample wikipedia.txt file from here:\n", 27 | "# https://anaconda.org/BigGorilla/datasets/1/download/wikipedia.txt\n", 28 | "wikipedia_file_path = \"wikipedia.txt\"\n", 29 | "output_json_file_path = \"wikipedia.json\"" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": { 36 | "collapsed": false 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "def load_wikifile(file_path):\n", 41 | " result = []\n", 42 | " with open(file_path, \"r\") as read_f:\n", 43 | " wikititle = \"\"\n", 44 | " wikitext = \"\"\n", 45 | " start_flg = False\n", 46 | " for line in read_f:\n", 47 | " line = line.rstrip()\n", 48 | " if line == \"\":\n", 49 | " continue\n", 50 | " if len(line) >= 8 and line[:8] == \"= 6 and line[:6] == \"\":\n", 58 | " # retrieve only the title and content pairs\n", 59 | " result.append({\"title\": wikititle,\n", 60 | " \"text\": wikitext.rstrip()})\n", 61 | " wikitext = \"\"\n", 62 | " continue\n", 63 | " wikitext += line + '\\n'\n", 64 | " return result" 65 | ] 66 | }, 67 | { 68 | "cell_type": 
"code", 69 | "execution_count": 3, 70 | "metadata": { 71 | "collapsed": false 72 | }, 73 | "outputs": [], 74 | "source": [ 75 | "import json\n", 76 | "\n", 77 | "wiki_data = load_wikifile(wikipedia_file_path)\n", 78 | "with open(output_json_file_path, 'w') as outfile:\n", 79 | " # write out wiki_data in json format\n", 80 | " json.dump(wiki_data, outfile, indent=4)" 81 | ] 82 | } 83 | ], 84 | "metadata": { 85 | "kernelspec": { 86 | "display_name": "Python 2", 87 | "language": "python", 88 | "name": "python2" 89 | }, 90 | "language_info": { 91 | "codemirror_mode": { 92 | "name": "ipython", 93 | "version": 2 94 | }, 95 | "file_extension": ".py", 96 | "mimetype": "text/x-python", 97 | "name": "python", 98 | "nbconvert_exporter": "python", 99 | "pygments_lexer": "ipython2", 100 | "version": "2.7.12" 101 | } 102 | }, 103 | "nbformat": 4, 104 | "nbformat_minor": 0 105 | } 106 | -------------------------------------------------------------------------------- /matching_movies/Tutorial_py3.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # # Part 1: Data Acquistion 5 | # -------------------------- 6 | # BigGorilla recommends a list of tools for different data acquisition tasks (See [here]()). Among these tools, **urllib** is a popular python package for fetching data across the web. In this part, we use **urllib** to download the datasets that we need for this tutorial. 7 | # 8 | # ### Step 1: downloading the "Kaggle 5000 Movie Dataset" 9 | # The desired dataset is a _.csv_ file with a url that is specified in the code snippet below. 
10 | 11 | # In[1]: 12 | 13 | # Importing urlib (BigGorilla's recommendation for data acquisition from the web) 14 | import urllib.request 15 | import os 16 | 17 | # Creating the data folder 18 | if not os.path.exists('./data'): 19 | os.makedirs('./data') 20 | 21 | # Obtaining the dataset using the url that hosts it 22 | kaggle_url = 'https://github.com/sundeepblue/movie_rating_prediction/raw/master/movie_metadata.csv' 23 | if not os.path.exists('./data/kaggle_dataset.csv'): # avoid downloading if the file exists 24 | response = urllib.request.urlretrieve(kaggle_url, './data/kaggle_dataset.csv') 25 | 26 | 27 | # ### Step 2: downloading the "IMDB Plain Text Data" 28 | # The IMDB Plain Text Data (see [here](ftp://ftp.funet.fi/pub/mirrors/ftp.imdb.com/pub/)) is a collection of files where each files describe one or a few attributes of a movie. We are going to focus on a subset of movie attribues which subsequently means that we are only interested in a few of these files which are listed below: 29 | # 30 | # * genres.list.gz 31 | # * ratings.list.gz 32 | # 33 | # _** Note: The total size of files mentioned above is roughly 30M. Running the following code may take a few minutes._ 34 | 35 | # In[2]: 36 | 37 | import gzip 38 | 39 | # Obtaining IMDB's text files 40 | imdb_url_prefix = 'ftp://ftp.funet.fi/pub/mirrors/ftp.imdb.com/pub/' 41 | imdb_files_list = ['genres.list.gz', 'ratings.list.gz'] 42 | for name in imdb_files_list: 43 | if not os.path.exists('./data/' + name): 44 | response = urllib.request.urlretrieve(imdb_url_prefix + name, './data/' + name) 45 | with gzip.open('./data/' + name) as comp_file, open('./data/' + name[:-3], 'wb') as reg_file: 46 | file_content = comp_file.read() 47 | reg_file.write(file_content) 48 | 49 | 50 | # ### Step 3: downloading the "IMDB Prepared Data" 51 | # During this tutorial, we discuss how the contents of _genres.list.gz_ and _ratings.list.gz_ files can be integrated. 
# However, to make the tutorial more concise, we avoid including the same process
# for all the files in the "IMDB Plain Text Data". The "IMDB Prepared Data" is the
# dataset that we obtained by integrating a number of files from the "IMDB Plain
# Text Data" which we will use during later stages of this tutorial. The following
# code snippet downloads this dataset.

# In[3]:

imdb_url = 'https://anaconda.org/BigGorilla/datasets/1/download/imdb_dataset.csv'
if not os.path.exists('./data/imdb_dataset.csv'):  # avoid downloading if the file exists
    # BUGFIX: the original passed kaggle_url here, which silently re-downloaded the
    # Kaggle CSV into imdb_dataset.csv. Use imdb_url, defined just above.
    response = urllib.request.urlretrieve(imdb_url, './data/imdb_dataset.csv')


# -----

# # Part 2: Data Extraction
# -----------------
# The "Kaggle 5000 Movie Dataset" is stored in a _.csv_ file which is already
# structured and ready to use. On the other hand, the "IMDB Plain Text Data" is a
# collection of semi-structured text files that need to be processed to extract
# the data. A quick look at the first few lines of each file shows that each file
# has a different format and has to be handled separately.
#
# ##### Content of "ratings.list" data file

# In[4]:

with open("./data/ratings.list", encoding='latin1') as myfile:
    head = [next(myfile) for x in range(38)]
print (''.join(head[28:38]))  # skipping the first 28 lines as they are descriptive headers


# ##### Content of the "genres.list" data file

# In[5]:

with open("./data/genres.list", encoding='latin1') as myfile:
    head = [next(myfile) for x in range(392)]
print (''.join(head[382:392]))  # skipping the first 382 lines as they are descriptive header


# ### Step 1: Extracting the information from "genres.list"
# The goal of this step is to extract the movie titles and their production year
# from "genres.list" (the original text said "movies.list", but the code below
# reads genres.list), and store the extracted data into a dataframe.
Dataframe (from the python package **pandas**) is one of the key BigGorilla's recommendation for data profiling and cleaning. To extract the desired information from the text, we rely on **regular expressions** which are implemented in the python package "**re**". 86 | 87 | # In[6]: 88 | 89 | import re 90 | import pandas as pd 91 | 92 | with open("./data/genres.list", encoding='latin1') as genres_file: 93 | raw_content = genres_file.readlines() 94 | genres_list = [] 95 | content = raw_content[382:] 96 | for line in content: 97 | m = re.match(r'"?(.*[^"])"? \(((?:\d|\?){4})(?:/\w*)?\).*\s((?:\w|-)+)', line.strip()) 98 | genres_list.append([m.group(1), m.group(2), m.group(3)]) 99 | genres_data = pd.DataFrame(genres_list, columns=['movie', 'year', 'genre']) 100 | 101 | 102 | # ### Step 2: Extracting the information from "ratings.list" 103 | 104 | # In[7]: 105 | 106 | with open("./data/ratings.list", encoding='latin1') as ratings_file: 107 | raw_content = ratings_file.readlines() 108 | ratings_list = [] 109 | content = raw_content[28:] 110 | for line in content: 111 | m = re.match(r'(?:\d|\.|\*){10}\s+\d+\s+(1?\d\.\d)\s"?(.*[^"])"? \(((?:\d|\?){4})(?:/\w*)?\)', line.strip()) 112 | if m is None: continue 113 | ratings_list.append([m.group(2), m.group(3), m.group(1)]) 114 | ratings_data = pd.DataFrame(ratings_list, columns=['movie', 'year', 'rating']) 115 | 116 | 117 | # Note that one has to repeat the information extraction procedure for other data files as well if he is interested in their content. For now (and to keep the tutorial simple), we assume that we are only interested in genres and ratings of movies. The above code snippets store the extracted data on these two attributes into two dataframes (namely, **genres_list** and **ratings_list**). 
118 | # 119 | # ------ 120 | 121 | # # Part 3: Data Profiling & Cleaning 122 | # --------------------------- 123 | # 124 | # The high-level goal in this stage of data prepration is to look into the data that we have acquired and extracted so far. This helps us to get familiar with data, understand in what ways the data needs cleaning or transformation, and finally enables us to prepare the data for the following steps of the data integration task. 125 | # 126 | # ### Step 1: Loading the "Kaggle 5000 Movies Dataset" 127 | # 128 | # According to BigGorilla, dataframes (from the python package **pandas**) are suitable for data exploration and data profiling. In [Part 2](https://github.com/rit-git/BigGorilla/blob/tutorial/Tutorial/Part%202%20--%20Data%20Extraction.ipynb) of the tutorial, we stored the extracted data from "IMDB Plain Text Data" into dataframes. It would be appropriate to load the "Kaggle 5000 Movies Dataset" into a dataframe as well and follow the same data profiling procedure for all datasets. 129 | 130 | # In[8]: 131 | 132 | import pandas as pd 133 | 134 | # Loading the Kaggle dataset from the .csv file (kaggle_dataset.csv) 135 | kaggle_data = pd.read_csv('./data/kaggle_dataset.csv') 136 | 137 | 138 | # ### Step 2: Calculating some basic statistics (profiling) 139 | # 140 | # Let's start by finding out how many movies are listed in each dataframe. 141 | 142 | # In[9]: 143 | 144 | print ('Number of movies in kaggle_data: {}'.format(kaggle_data.shape[0])) 145 | print ('Number of movies in genres_data: {}'.format(genres_data.shape[0])) 146 | print ('Number of movies in ratings_data: {}'.format(ratings_data.shape[0])) 147 | 148 | 149 | # We can also check to see if we have duplicates (i.e., a movie appearing more than once) in the data. We consider an entry duplicate if we can find another entry with the same movie title and production year. 
150 | 151 | # In[10]: 152 | 153 | print ('Number of duplicates in kaggle_data: {}'.format( 154 | sum(kaggle_data.duplicated(subset=['movie_title', 'title_year'], keep=False)))) 155 | print ('Number of duplicates in genres_data: {}'.format( 156 | sum(genres_data.duplicated(subset=['movie', 'year'], keep=False)))) 157 | print ('Number of duplicates in ratings_data: {}'.format( 158 | sum(ratings_data.duplicated(subset=['movie', 'year'], keep=False)))) 159 | 160 | 161 | # ### Step 3: Dealing with duplicates (cleaning) 162 | # 163 | # There are many strategies to deal with duplicates. Here, we are going to use a simple method for dealing with duplicates and that is to only keep the first occurrence of a duplicated entry and remove the rest. 164 | 165 | # In[11]: 166 | 167 | kaggle_data = kaggle_data.drop_duplicates(subset=['movie_title', 'title_year'], keep='first').copy() 168 | genres_data = genres_data.drop_duplicates(subset=['movie', 'year'], keep='first').copy() 169 | ratings_data = ratings_data.drop_duplicates(subset=['movie', 'year'], keep='first').copy() 170 | 171 | 172 | # ### Step 4: Normalizing the text (cleaning) 173 | # 174 | # The key attribute that we will use to integrate our movie datasets is the movie titles. So it is important to normalize these titles. The following code snippet makes all movie titles lower case, and then removes certain characters such as "'" and "?", and replaces some other special characters (e.g., "&" is replaced with "and"). 
175 | 176 | # In[12]: 177 | 178 | def preprocess_title(title): 179 | title = title.lower() 180 | title = title.replace(',', ' ') 181 | title = title.replace("'", '') 182 | title = title.replace('&', 'and') 183 | title = title.replace('?', '') 184 | return title.strip() 185 | 186 | kaggle_data['norm_movie_title'] = kaggle_data['movie_title'].map(preprocess_title) 187 | genres_data['norm_movie'] = genres_data['movie'].map(preprocess_title) 188 | ratings_data['norm_movie'] = ratings_data['movie'].map(preprocess_title) 189 | 190 | 191 | # ### Step 5: Looking at a few samples 192 | # 193 | # The goal here is to a look at a few sample entries from each dataset for a quick sanity check. To keep the tutorial consice, we just present this step for the "Kaggle 5000 Movies Dataset" which is stored in the **kaggle_data** dataframe. 194 | 195 | # In[13]: 196 | 197 | kaggle_data.sample(3, random_state=0) 198 | 199 | 200 | # Looking at the data guides us to decide in what ways we might want to clean the data. For instance, the small sample data shown above, reveals that the **title_year** attribute is stored as floats (i.e., rational numbers). We can add another cleaning step to transform the **title_year** into strings and replace the missing title years with symbol **"?"**. 201 | 202 | # In[14]: 203 | 204 | def preprocess_year(year): 205 | if pd.isnull(year): 206 | return '?' 207 | else: 208 | return str(int(year)) 209 | 210 | kaggle_data['norm_title_year'] = kaggle_data['title_year'].map(preprocess_year) 211 | kaggle_data.head() 212 | 213 | 214 | # ----- 215 | 216 | # # Part 4: Data Matching & Merging 217 | # ------------------------- 218 | # The main goal in this part is go match the data that we have acquired from different sources to create a single rich dataset. 
Recall that in [Part 3](https://github.com/rit-git/BigGorilla/blob/tutorial/Tutorial/Part%203%20--%20Data%20Profiling%20%26%20Cleaning.ipynb), we transformed all datasets into a dataframe which we used to clean the data. In this part, we continue using the same dataframes for the data that we have prepared so far. 219 | # 220 | # ### Step 1: Integrating the "IMDB Plain Text Data" files 221 | # Note that both **ratings_data** and **genres_data** dataframes contain data that come from the same source (i.e., "the IMDB Plain Text data"). Thus, we assume that there are no inconsistencies between the data stored in these dataframe and to combine them, all we need to do is to match the entries that share the same title and production year. This simple "exact match" can be done simply using dataframes. 222 | 223 | # In[15]: 224 | 225 | brief_imdb_data = pd.merge(ratings_data, genres_data, how='inner', on=['norm_movie', 'year']) 226 | brief_imdb_data.head() 227 | 228 | 229 | # We refer to the dataset created above as the **brief_imdb_data** since it only contains two attributes (namely, genre and rating). Henceforth, we are going to use a richer version of the IMDB dataset which we created by integrating a number of files from the "IMDB Plain Text Data". If you have completed the first part of this tutorial, then this dataset is already downloaded and stored in *"imdb_dataset.csv"* under the _"data"_ folder. The following code snippet loads this dataset, does preprocessing on the title and production year of movies, removes the duplicates as before, and prints the size of the dataset. 
# In[16]:

# Read the richer IMDB dataset prepared in Part 1.
imdb_data = pd.read_csv('./data/imdb_dataset.csv')
# Normalize the title and year as we did in Part 3 of the tutorial, then
# de-duplicate on the normalized columns.
imdb_data['norm_title'] = imdb_data['title'].map(preprocess_title)
imdb_data['norm_year'] = imdb_data['year'].map(preprocess_year)
imdb_data = imdb_data.drop_duplicates(subset=['norm_title', 'norm_year'], keep='first').copy()
imdb_data.shape


# ### Step 2: Integrating the Kaggle and IMDB datasets
#
# A simple approach to integrate the two datasets is to simply join entries that
# share the same movie title and year of production. The following code reveals
# that 4,248 matches are found using this simple approach.

# In[17]:

data_attempt1 = pd.merge(imdb_data, kaggle_data, how='inner',
                         left_on=['norm_title', 'norm_year'],
                         right_on=['norm_movie_title', 'norm_title_year'])
data_attempt1.shape


# But given that the IMDB and Kaggle datasets are collected from different
# sources, chances are that the name of a movie would be slightly different in
# these datasets (e.g. "Wall.E" vs "WallE"). To be able to find such matches, one
# can look at the similarity of movie titles and consider titles with high
# similarity to be the same entity. BigGorilla's recommendation for doing a
# similarity join across two datasets is the python package **py_stringsimjoin**.
# The following code snippet uses **py_stringsimjoin** to match all the titles
# that have an edit distance of one or less (i.e., there is at most one character
# that needs to be changed/added/removed to make both titles identical). Once the
# similarity join is complete, it only selects the title pairs that are produced
# in the same year.
# In[18]:

import py_stringsimjoin as ssj
import py_stringmatching as sm

# Both tables need a unique integer key for the join.
imdb_data['id'] = range(imdb_data.shape[0])
kaggle_data['id'] = range(kaggle_data.shape[0])
similar_titles = ssj.edit_distance_join(imdb_data, kaggle_data, 'id', 'id',
                                        'norm_title', 'norm_movie_title',
                                        l_out_attrs=['norm_title', 'norm_year'],
                                        r_out_attrs=['norm_movie_title', 'norm_title_year'],
                                        threshold=1)
# Keep only the pairs that also share the production year.
data_attempt2 = similar_titles[similar_titles.r_norm_title_year == similar_titles.l_norm_year]
data_attempt2.shape


# We can see that using the similarity join 4,689 titles were matched. Let's look
# at some of the titles that are matched by the similarity join but are not identical.

# In[19]:

data_attempt2[data_attempt2.l_norm_title != data_attempt2.r_norm_movie_title].head()


# While instances such as "walle" and "wall.e" are correctly matched, we can see
# that this technique also makes some errors (e.g., "grave" and "brave"). This
# raises the following questions: "what method should be used for data matching?"
# and "how can we determine the quality of the matching?". BigGorilla's
# recommendation for dealing with this problem is using the python package
# **py_entitymatching** which is developed as part of the
# [Magellan project](https://sites.google.com/site/anhaidgroup/projects/magellan).
#
# In the next step, we demonstrate how **py_entitymatching** uses machine
# learning techniques for the data-matching purposes as well as how it enables us
# to evaluate the quality of the produced matching.
#
# ### Step 3: Using Magellan for data matching
#
# #### Substep A: Finding a candidate set (Blocking)
# The goal of this step is to limit the number of pairs that we consider as
# potential matches using a simple heuristic. For this task, we can create a new
# column in each dataset that combines the values of important attributes into a
# single string (which we call the **mixture**). Then, we can use the string
# similarity join as before to find a set of entities that have some overlap in
# the values of the important columns. Before doing that, we need to transform
# the columns that are part of the mixture to strings. The **py_stringsimjoin**
# package allows us to do so easily.

# In[20]:

# Transform the "budget" column into a string, then assemble the **mixture** column.
ssj.utils.converter.dataframe_column_to_str(imdb_data, 'budget', inplace=True)
imdb_data['mixture'] = imdb_data['norm_title'] + ' ' + imdb_data['norm_year'] + ' ' + imdb_data['budget']

# Repeat the same thing for the Kaggle dataset.
ssj.utils.converter.dataframe_column_to_str(kaggle_data, 'budget', inplace=True)
kaggle_data['mixture'] = kaggle_data['norm_movie_title'] + ' ' + kaggle_data['norm_title_year'] + ' ' + kaggle_data['budget']


# Now, we can use the **mixture** columns to create the desired candidate set
# which we call **C**.

# In[21]:

C = ssj.overlap_coefficient_join(kaggle_data, imdb_data, 'id', 'id',
                                 'mixture', 'mixture', sm.WhitespaceTokenizer(),
                                 l_out_attrs=['norm_movie_title', 'norm_title_year',
                                              'duration', 'budget', 'content_rating'],
                                 r_out_attrs=['norm_title', 'norm_year', 'length',
                                              'budget', 'mpaa'],
                                 threshold=0.65)
C.shape


# We can see that by doing a similarity join, we already reduced the candidate
# set to 18,317 pairs.
#
# #### Substep B: Specifying the keys
# The next step is to specify to the **py_entitymatching** package which columns
# correspond to the keys in each dataframe. Also, we need to specify which
# columns correspond to the foreign keys of the two dataframes in the candidate set.
# In[22]:

import py_entitymatching as em
em.set_key(kaggle_data, 'id')   # key column of the kaggle dataset
em.set_key(imdb_data, 'id')     # key column of the imdb dataset
em.set_key(C, '_id')            # key column of the candidate set
em.set_ltable(C, kaggle_data)   # left table behind the candidate set
em.set_rtable(C, imdb_data)     # right table behind the candidate set
em.set_fk_ltable(C, 'l_id')     # column of C that matches the key in the left table
em.set_fk_rtable(C, 'r_id')     # column of C that matches the key in the right table


#
# #### Substep C: Debugging the blocker
#
# Now, we need to make sure that the candidate set is loose enough to include
# pairs of movies that are not very close. If this is not the case, there is a
# chance that we have eliminated pairs that could potentially be matched
# together. By looking at a few pairs from the candidate set, we can judge
# whether the blocking step has been too harsh or not.
#
# *Note: The **py_entitymatching** package provides some tools for debugging the
# blocker as well.*

# In[23]:

C[['l_norm_movie_title', 'r_norm_title', 'l_norm_title_year', 'r_norm_year',
   'l_budget', 'r_budget', 'l_content_rating', 'r_mpaa']].head()


# Based on the above sample we can see that the blocking seems to be reasonable.
#
# #### Substep D: Sampling from the candidate set
#
# The goal of this step is to obtain a sample from the candidate set and manually
# label the sampled candidates; that is, to specify if the candidate pair is a
# correct match or not.
# In[24]:

# Sample 500 pairs and write them to a .csv file for manual labeling.
sampled = C.sample(500, random_state=0)
sampled.to_csv('./data/sampled.csv', encoding='utf-8')


# In order to label the sampled data, we can create a new column in the _.csv_
# file (which we call **label**) and put value 1 under that column if the pair is
# a correct match and 0 otherwise. To avoid overriding the files, let's rename
# the new file as **labeled.csv**.

# In[25]:

# If you would like to avoid labeling the pairs for now, you can download the
# labeled.csv file from BigGorilla using the following command (if you prefer to
# do the labeling yourself, comment out the next line).
response = urllib.request.urlretrieve('https://anaconda.org/BigGorilla/datasets/1/download/labeled.csv',
                                      './data/labeled.csv')
labeled = em.read_csv_metadata('data/labeled.csv', ltable=kaggle_data, rtable=imdb_data,
                               fk_ltable='l_id', fk_rtable='r_id', key='_id')
labeled.head()


# #### Substep E: Training machine learning algorithms
#
# Now we can use the sampled dataset to train various machine learning algorithms
# for our prediction task. To do so, we need to split our dataset into a training
# and a test set, and then select the desired machine learning techniques for our
# prediction task.

# In[26]:

split = em.split_train_test(labeled, train_proportion=0.5, random_state=0)
train_data = split['train']
test_data = split['test']

# One matcher per machine-learning technique under consideration.
dt = em.DTMatcher(name='DecisionTree', random_state=0)
svm = em.SVMMatcher(name='SVM', random_state=0)
rf = em.RFMatcher(name='RF', random_state=0)
lg = em.LogRegMatcher(name='LogReg', random_state=0)
ln = em.LinRegMatcher(name='LinReg')
nb = em.NBMatcher(name='NaiveBayes')


# Before we can apply any machine learning technique, we need to extract a set of
# features. Fortunately, the **py_entitymatching** package can automatically
# extract a set of features once we specify which columns in the two datasets
# correspond to each other. The following code snippet starts by specifying the
# correspondence between the columns of the two datasets. Then, it uses the
# **py_entitymatching** package to determine the type of each column. By
# considering the types of columns in each dataset (stored in variables
# *l_attr_types* and *r_attr_types*), and using the tokenizers and similarity
# functions suggested by the package, we can extract a set of instructions for
# extracting features. Note that variable **F** is not the set of extracted
# features; rather, it encodes the instructions for computing the features.

# In[27]:

attr_corres = em.get_attr_corres(kaggle_data, imdb_data)
attr_corres['corres'] = [('norm_movie_title', 'norm_title'),
                         ('norm_title_year', 'norm_year'),
                         ('content_rating', 'mpaa'),
                         ('budget', 'budget'),
                         ]

l_attr_types = em.get_attr_types(kaggle_data)
r_attr_types = em.get_attr_types(imdb_data)

tok = em.get_tokenizers_for_matching()
sim = em.get_sim_funs_for_matching()

F = em.get_features(kaggle_data, imdb_data, l_attr_types, r_attr_types, attr_corres, tok, sim)


# Given the set of desired features **F**, we can now calculate the feature
# values for our training data and also impute the missing values in our data.
# In this case, we choose to replace the missing values with the mean of the column.
# In[28]:

# The id columns and the target itself must stay out of the feature matrix.
id_and_label_cols = ['_id', 'l_id', 'r_id', 'label']
train_features = em.extract_feature_vecs(train_data, feature_table=F, attrs_after='label', show_progress=False)
train_features = em.impute_table(train_features, exclude_attrs=id_and_label_cols, strategy='mean')


# Using the calculated features, we can evaluate the performance of different
# machine learning algorithms and select the best one for our matching task.

# In[29]:

result = em.select_matcher([dt, rf, svm, ln, lg, nb], table=train_features,
                           exclude_attrs=id_and_label_cols, k=5,
                           target_attr='label', metric='f1', random_state=0)
result['cv_stats']


# We can observe based on the reported accuracy of different techniques that the
# "random forest (RF)" algorithm achieves the best performance. Thus, it is best
# to use this technique for the matching.

# #### Substep F: Evaluating the quality of our matching
#
# It is important to evaluate the quality of our matching. We can now use the
# training set for this purpose and measure how well the random forest predicts
# the matches. We can see that we are obtaining a high accuracy and recall on the
# test set as well.
# In[30]:

best_model = result['selected_matcher']
best_model.fit(table=train_features, exclude_attrs=['_id', 'l_id', 'r_id', 'label'],
               target_attr='label')

test_features = em.extract_feature_vecs(test_data, feature_table=F, attrs_after='label', show_progress=False)
test_features = em.impute_table(test_features, exclude_attrs=['_id', 'l_id', 'r_id', 'label'],
                                strategy='mean')

# Predict on the test data.
predictions = best_model.predict(table=test_features,
                                 exclude_attrs=['_id', 'l_id', 'r_id', 'label'],
                                 append=True, target_attr='predicted', inplace=False)

# Evaluate the predictions.
eval_result = em.eval_matches(predictions, 'label', 'predicted')
em.print_eval_summary(eval_result)


# #### Substep G: Using the trained model to match the datasets
#
# Now, we can use the trained model to match the two tables as follows:

# In[31]:

candset_features = em.extract_feature_vecs(C, feature_table=F, show_progress=True)
candset_features = em.impute_table(candset_features, exclude_attrs=['_id', 'l_id', 'r_id'],
                                   strategy='mean')
predictions = best_model.predict(table=candset_features, exclude_attrs=['_id', 'l_id', 'r_id'],
                                 append=True, target_attr='predicted', inplace=False)
# Keep only the pairs the model labels as matches.
matches = predictions[predictions.predicted == 1]


# Note that the **matches** dataframe contains many columns storing the extracted
# features for both datasets. The following code snippet removes all the
# unnecessary columns and creates a nicely formatted dataframe that has the
# resulting integrated dataset.
458 | 459 | # In[32]: 460 | 461 | from py_entitymatching.catalog import catalog_manager as cm 462 | matches = matches[['_id', 'l_id', 'r_id', 'predicted']] 463 | matches.reset_index(drop=True, inplace=True) 464 | cm.set_candset_properties(matches, '_id', 'l_id', 'r_id', kaggle_data, imdb_data) 465 | matches = em.add_output_attributes(matches, l_output_attrs=['norm_movie_title', 'norm_title_year', 'budget', 'content_rating'], 466 | r_output_attrs=['norm_title', 'norm_year', 'budget', 'mpaa'], 467 | l_output_prefix='l_', r_output_prefix='r_', 468 | delete_from_catalog=False) 469 | matches.drop('predicted', axis=1, inplace=True) 470 | matches.head() 471 | 472 | -------------------------------------------------------------------------------- /matching_movies/Tutorial_py2.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # # Part 1: Data Acquistion 5 | # -------------------------- 6 | # BigGorilla recommends a list of tools for different data acquisition tasks (See [here]()). Among these tools, **urllib** is a popular python package for fetching data across the web. In this part, we use **urllib** to download the datasets that we need for this tutorial. 7 | # 8 | # ### Step 1: downloading the "Kaggle 5000 Movie Dataset" 9 | # The desired dataset is a _.csv_ file with a url that is specified in the code snippet below. 
# In[1]:

# Importing urllib (BigGorilla's recommendation for data acquisition from the web)
import urllib
import os

# Creating the data folder
if not os.path.exists('./data'):
    os.makedirs('./data')

# Obtaining the dataset using the url that hosts it
kaggle_url = 'https://github.com/sundeepblue/movie_rating_prediction/raw/master/movie_metadata.csv'
if not os.path.exists('./data/kaggle_dataset.csv'):  # avoid downloading if the file exists
    response = urllib.urlretrieve(kaggle_url, './data/kaggle_dataset.csv')


# ### Step 2: downloading the "IMDB Plain Text Data"
# The IMDB Plain Text Data (see [here](ftp://ftp.funet.fi/pub/mirrors/ftp.imdb.com/pub/))
# is a collection of files where each file describes one or a few attributes of a
# movie. We are going to focus on a subset of movie attributes, which means that
# we are only interested in a few of these files, listed below:
#
# * genres.list.gz
# * ratings.list.gz
#
# _** Note: The total size of the files mentioned above is roughly 30M. Running
# the following code may take a few minutes._

# In[2]:

import gzip

# Obtaining IMDB's text files
imdb_url_prefix = 'ftp://ftp.funet.fi/pub/mirrors/ftp.imdb.com/pub/'
imdb_files_list = ['genres.list.gz', 'ratings.list.gz']
for name in imdb_files_list:
    if not os.path.exists('./data/' + name):
        response = urllib.urlretrieve(imdb_url_prefix + name, './data/' + name)
        urllib.urlcleanup()  # urllib fails to download two files from a ftp source. This fixes the bug!
    # Extract the archive next to it; re-extracting an existing file is harmless.
    # NOTE(review): the collapsed dump does not show this block's indentation —
    # confirm whether extraction was originally nested inside the `if` above.
    with gzip.open('./data/' + name) as comp_file, open('./data/' + name[:-3], 'w') as reg_file:
        file_content = comp_file.read()
        reg_file.write(file_content)


# ### Step 3: downloading the "IMDB Prepared Data"
# During this tutorial, we discuss how the contents of the _genres.list.gz_ and
# _ratings.list.gz_ files can be integrated. However, to make the tutorial more
# concise, we avoid including the same process for all the files in the "IMDB
# Plain Text Data". The "IMDB Prepared Data" is the dataset that we obtained by
# integrating a number of files from the "IMDB Plain Text Data", which we will
# use during later stages of this tutorial. The following code snippet downloads
# this dataset.

# In[3]:

imdb_url = 'https://anaconda.org/BigGorilla/datasets/1/download/imdb_dataset.csv'
if not os.path.exists('./data/imdb_dataset.csv'):  # avoid downloading if the file exists
    # BUG FIX: the original fetched `kaggle_url` here, silently storing a second
    # copy of the Kaggle dataset as imdb_dataset.csv. Fetch `imdb_url` instead.
    response = urllib.urlretrieve(imdb_url, './data/imdb_dataset.csv')


# -----

# # Part 2: Data Extraction
# -----------------
# The "Kaggle 5000 Movie Dataset" is stored in a _.csv_ file which is already
# structured and ready to use. On the other hand, the "IMDB Plain Text Data" is a
# collection of semi-structured text files that need to be processed to extract
# the data. A quick look at the first few lines of each file shows that each file
# has a different format and has to be handled separately.
#
# ##### Content of "ratings.list" data file

# In[4]:

with open("./data/ratings.list") as myfile:
    head = [next(myfile) for x in range(38)]
print (''.join(head[28:38]))  # skipping the first 28 lines as they are descriptive headers


# ##### Content of the "genres.list" data file

# In[5]:

with open("./data/genres.list") as myfile:
    head = [next(myfile) for x in range(392)]
print (''.join(head[382:392]))  # skipping the first 382 lines as they are descriptive headers


# ### Step 1: Extracting the information from "genres.list"
# The goal of this step is to extract the movie titles and their production year
# from "genres.list", and store the extracted data into a dataframe. The
# dataframe (from the python package **pandas**) is one of BigGorilla's key
# recommendations for data profiling and cleaning. To extract the desired
# information from the text, we rely on **regular expressions**, which are
# implemented in the python package "**re**".

# In[6]:

import re
import pandas as pd

with open("./data/genres.list") as genres_file:
    raw_content = genres_file.readlines()
    genres_list = []
    content = raw_content[382:]  # skip the descriptive header lines
    for line in content:
        # Capture: optionally-quoted title, 4-char year (digits or '?'), genre.
        m = re.match(r'"?(.*[^"])"? \(((?:\d|\?){4})(?:/\w*)?\).*\s((?:\w|-)+)', line.strip())
        # BUG FIX: a non-matching line used to crash with AttributeError on
        # `m.group`; skip it instead, mirroring the ratings loop below.
        if m is None: continue
        genres_list.append([m.group(1), m.group(2), m.group(3)])
    genres_data = pd.DataFrame(genres_list, columns=['movie', 'year', 'genre'])


# ### Step 2: Extracting the information from "ratings.list"

# In[7]:

with open("./data/ratings.list") as ratings_file:
    raw_content = ratings_file.readlines()
    ratings_list = []
    content = raw_content[28:]  # skip the descriptive header lines
    for line in content:
        # Capture: rating (x.y), optionally-quoted title, 4-char year.
        m = re.match(r'(?:\d|\.|\*){10}\s+\d+\s+(1?\d\.\d)\s"?(.*[^"])"? \(((?:\d|\?){4})(?:/\w*)?\)', line.strip())
        if m is None: continue
        ratings_list.append([m.group(2), m.group(3), m.group(1)])
    ratings_data = pd.DataFrame(ratings_list, columns=['movie', 'year', 'rating'])


# Note that one has to repeat the information extraction procedure for the other
# data files as well if one is interested in their content. For now (and to keep
# the tutorial simple), we assume that we are only interested in the genres and
# ratings of movies. The above code snippets store the extracted data on these
# two attributes into two dataframes (namely, **genres_data** and **ratings_data**).
#
# ------

# # Part 3: Data Profiling & Cleaning
# ---------------------------
#
# The high-level goal in this stage of data preparation is to look into the data
# that we have acquired and extracted so far. This helps us to get familiar with
# the data, understand in what ways the data needs cleaning or transformation,
# and finally enables us to prepare the data for the following steps of the data
# integration task.
#
# ### Step 1: Loading the "Kaggle 5000 Movies Dataset"
#
# According to BigGorilla, dataframes (from the python package **pandas**) are
# suitable for data exploration and data profiling. In
# [Part 2](https://github.com/rit-git/BigGorilla/blob/tutorial/Tutorial/Part%202%20--%20Data%20Extraction.ipynb)
# of the tutorial, we stored the extracted data from the "IMDB Plain Text Data"
# into dataframes. It would be appropriate to load the "Kaggle 5000 Movies
# Dataset" into a dataframe as well and follow the same data profiling procedure
# for all datasets.
# In[8]:

import pandas as pd

# Loading the Kaggle dataset from the .csv file (kaggle_dataset.csv)
kaggle_data = pd.read_csv('./data/kaggle_dataset.csv')


# ### Step 2: Calculating some basic statistics (profiling)
#
# Let's start by finding out how many movies are listed in each dataframe.

# In[9]:

print ('Number of movies in kaggle_data: {}'.format(kaggle_data.shape[0]))
print ('Number of movies in genres_data: {}'.format(genres_data.shape[0]))
print ('Number of movies in ratings_data: {}'.format(ratings_data.shape[0]))


# We can also check to see if we have duplicates (i.e., a movie appearing more
# than once) in the data. We consider an entry a duplicate if we can find another
# entry with the same movie title and production year.

# In[10]:

# `keep=False` marks every member of each duplicate group, so summing the
# boolean mask counts all duplicated rows.
print ('Number of duplicates in kaggle_data: {}'.format(
    kaggle_data.duplicated(subset=['movie_title', 'title_year'], keep=False).sum()))
print ('Number of duplicates in genres_data: {}'.format(
    genres_data.duplicated(subset=['movie', 'year'], keep=False).sum()))
print ('Number of duplicates in ratings_data: {}'.format(
    ratings_data.duplicated(subset=['movie', 'year'], keep=False).sum()))


# ### Step 3: Dealing with duplicates (cleaning)
#
# There are many strategies to deal with duplicates. Here, we are going to use a
# simple method: only keep the first occurrence of a duplicated entry and remove
# the rest.
# In[11]:

kaggle_data = kaggle_data.drop_duplicates(subset=['movie_title', 'title_year'], keep='first').copy()
genres_data = genres_data.drop_duplicates(subset=['movie', 'year'], keep='first').copy()
ratings_data = ratings_data.drop_duplicates(subset=['movie', 'year'], keep='first').copy()


# ### Step 4: Normalizing the text (cleaning)
#
# The key attribute that we will use to integrate our movie datasets is the movie
# title, so it is important to normalize these titles. The following code snippet
# makes all movie titles lower case, removes certain characters such as "'" and
# "?", and replaces some other special characters (e.g., "&" is replaced with "and").

# In[12]:

def preprocess_title(title):
    """Normalize a movie title for matching: lower-case it, drop/replace a few
    special characters, decode to unicode, and trim surrounding whitespace."""
    normalized = title.lower()
    for old, new in ((',', ' '), ("'", ''), ('&', 'and'), ('?', '')):
        normalized = normalized.replace(old, new)
    # Python 2: raw byte strings may hold non-utf8 characters; drop them.
    normalized = normalized.decode('utf-8', 'ignore')
    return normalized.strip()

kaggle_data['norm_movie_title'] = kaggle_data['movie_title'].map(preprocess_title)
genres_data['norm_movie'] = genres_data['movie'].map(preprocess_title)
ratings_data['norm_movie'] = ratings_data['movie'].map(preprocess_title)


# ### Step 5: Looking at a few samples
#
# The goal here is to look at a few sample entries from each dataset for a quick
# sanity check. To keep the tutorial concise, we just present this step for the
# "Kaggle 5000 Movies Dataset" which is stored in the **kaggle_data** dataframe.

# In[13]:

kaggle_data.sample(3, random_state=0)


# Looking at the data guides us to decide in what ways we might want to clean it.
# For instance, the small sample shown above reveals that the **title_year**
# attribute is stored as floats (i.e., rational numbers). We can add another
# cleaning step to transform the **title_year** into strings and replace the
# missing title years with symbol **"?"**.

# In[14]:

def preprocess_year(year):
    """Return the production year as a string, using '?' for missing values."""
    return '?' if pd.isnull(year) else str(int(year))

kaggle_data['norm_title_year'] = kaggle_data['title_year'].map(preprocess_year)
kaggle_data.head()


# -----

# # Part 4: Data Matching & Merging
# -------------------------
# The main goal in this part is to match the data that we have acquired from
# different sources to create a single rich dataset. Recall that in
# [Part 3](https://github.com/rit-git/BigGorilla/blob/tutorial/Tutorial/Part%203%20--%20Data%20Profiling%20%26%20Cleaning.ipynb),
# we transformed all datasets into dataframes which we used to clean the data.
# In this part, we continue using the same dataframes for the data that we have
# prepared so far.
#
# ### Step 1: Integrating the "IMDB Plain Text Data" files
# Note that both **ratings_data** and **genres_data** dataframes contain data
# that come from the same source (i.e., "the IMDB Plain Text data"). Thus, we
# assume that there are no inconsistencies between the data stored in these
# dataframes, and to combine them all we need to do is to match the entries that
# share the same title and production year. This simple "exact match" can be
# done easily using dataframes.

# In[15]:

brief_imdb_data = pd.merge(ratings_data, genres_data, how='inner', on=['norm_movie', 'year'])
brief_imdb_data.head()


# We refer to the dataset created above as the **brief_imdb_data** since it only
# contains two attributes (namely, genre and rating). Henceforth, we are going to
# use a richer version of the IMDB dataset which we created by integrating a
# number of files from the "IMDB Plain Text Data". If you have completed the
# first part of this tutorial, then this dataset is already downloaded and stored
# in *"imdb_dataset.csv"* under the _"data"_ folder. The following code snippet
# loads this dataset, does preprocessing on the title and production year of
# movies, removes the duplicates as before, and prints the size of the dataset.

# In[16]:

# Read the richer IMDB dataset prepared in Part 1.
imdb_data = pd.read_csv('./data/imdb_dataset.csv')
# Normalize the title and year as we did in Part 3 of the tutorial.
imdb_data['norm_title'] = imdb_data['title'].map(preprocess_title)
imdb_data['norm_year'] = imdb_data['year'].map(preprocess_year)
imdb_data = imdb_data.drop_duplicates(subset=['norm_title', 'norm_year'], keep='first').copy()
imdb_data.shape


# ### Step 2: Integrating the Kaggle and IMDB datasets
#
# A simple approach to integrate the two datasets is to simply join entries that
# share the same movie title and year of production. The following code reveals
# that 4,248 matches are found using this simple approach.

# In[17]:

data_attempt1 = pd.merge(imdb_data, kaggle_data, how='inner',
                         left_on=['norm_title', 'norm_year'],
                         right_on=['norm_movie_title', 'norm_title_year'])
data_attempt1.shape


# But given that the IMDB and Kaggle datasets are collected from different
# sources, chances are that the name of a movie would be slightly different in
# these datasets (e.g. "Wall.E" vs "WallE"). To be able to find such matches,
# one can look at the similarity of movie titles and consider titles with high
# similarity to be the same entity. BigGorilla's recommendation for doing a
# similarity join across two datasets is the python package **py_stringsimjoin**.
# The following code snippet uses **py_stringsimjoin** to match all the titles
# that have an edit distance of one or less (i.e., there is at most one character
# that needs to be changed/added/removed to make both titles identical). Once the
# similarity join is complete, it only selects the title pairs that are produced
# in the same year.

# In[18]:

import py_stringsimjoin as ssj
import py_stringmatching as sm

# Both tables need a unique integer key for the join.
imdb_data['id'] = range(imdb_data.shape[0])
kaggle_data['id'] = range(kaggle_data.shape[0])
similar_titles = ssj.edit_distance_join(imdb_data, kaggle_data, 'id', 'id',
                                        'norm_title', 'norm_movie_title',
                                        l_out_attrs=['norm_title', 'norm_year'],
                                        r_out_attrs=['norm_movie_title', 'norm_title_year'],
                                        threshold=1)
# Keep only the pairs that also share the production year.
data_attempt2 = similar_titles[similar_titles.r_norm_title_year == similar_titles.l_norm_year]
data_attempt2.shape


# We can see that using the similarity join 4,689 titles were matched. Let's look
# at some of the titles that are matched by the similarity join but are not identical.

# In[19]:

data_attempt2[data_attempt2.l_norm_title != data_attempt2.r_norm_movie_title].head()


# While instances such as "walle" and "wall.e" are correctly matched, we can see
# that this technique also makes some errors (e.g., "grave" and "brave"). This
# raises the following questions: "what method should be used for data matching?"
# and "how can we determine the quality of the matching?". BigGorilla's
# recommendation for dealing with this problem is using the python package
# **py_entitymatching** which is developed as part of the
# [Magellan project](https://sites.google.com/site/anhaidgroup/projects/magellan).
280 | # 281 | # In the next step, we demonstrate how **py_entitymatching** uses machine learning techniques for the data-matching purposes as well as how it enables us to evaluate the quality of the produced matching. 282 | # 283 | # ### Step 3: Using Magellan for data matching 284 | # 285 | # #### Substep A: Finding a candiate set (Blocking) 286 | # The goal of this step is to limit the number of pairs that we consider as potential matches using a simple heuristic. For this task, we can create a new column in each dataset that combines the values of important attributes into a single string (which we call the **mixture**). Then, we can use the string similarity join as before to find a set of entities that have some overlap in the values of the important columns. Before doing that, we need to transform the columns that are part of the mixture to strings. The **py_stringsimjoin** package allows us to do so easily. 287 | 288 | # In[20]: 289 | 290 | # transforming the "budget" column into string and creating a new **mixture** column 291 | ssj.utils.converter.dataframe_column_to_str(imdb_data, 'budget', inplace=True) 292 | imdb_data['mixture'] = imdb_data['norm_title'] + ' ' + imdb_data['norm_year'] + ' ' + imdb_data['budget'] 293 | 294 | # repeating the same thing for the Kaggle dataset 295 | ssj.utils.converter.dataframe_column_to_str(kaggle_data, 'budget', inplace=True) 296 | kaggle_data['mixture'] = kaggle_data['norm_movie_title'] + ' ' + kaggle_data['norm_title_year'] + ' ' + kaggle_data['budget'] 297 | 298 | 299 | # Now, we can use the **mixture** columns to create a desired candiate set which we call **C**. 
300 | 301 | # In[21]: 302 | 303 | C = ssj.overlap_coefficient_join(kaggle_data, imdb_data, 'id', 'id', 'mixture', 'mixture', sm.WhitespaceTokenizer(), 304 | l_out_attrs=['norm_movie_title', 'norm_title_year', 'duration', 305 | 'budget', 'content_rating'], 306 | r_out_attrs=['norm_title', 'norm_year', 'length', 'budget', 'mpaa'], 307 | threshold=0.65) 308 | C.shape 309 | 310 | 311 | # We can see that by doing a similarity join, we already reduced the candidate set to 18,317 pairs. 312 | # 313 | # #### Substep B: Specifying the keys 314 | # The next step is to specify to the **py_entitymatching** package which columns correspond to the keys in each dataframe. Also, we need to specify which columns correspond to the foreign keys of the the two dataframes in the candidate set. 315 | 316 | # In[22]: 317 | 318 | import py_entitymatching as em 319 | em.set_key(kaggle_data, 'id') # specifying the key column in the kaggle dataset 320 | em.set_key(imdb_data, 'id') # specifying the key column in the imdb dataset 321 | em.set_key(C, '_id') # specifying the key in the candidate set 322 | em.set_ltable(C, kaggle_data) # specifying the left table 323 | em.set_rtable(C, imdb_data) # specifying the right table 324 | em.set_fk_rtable(C, 'r_id') # specifying the column that matches the key in the right table 325 | em.set_fk_ltable(C, 'l_id') # specifying the column that matches the key in the left table 326 | 327 | 328 | # 329 | # #### Subset C: Debugging the blocker 330 | # 331 | # Now, we need to make sure that the candidate set is loose enough to include pairs of movies that are not very close. If this is not the case, there is a chance that we have eliminated pair that could be potentially matched together. By looking at a few pairs from the candidate set, we can judge whether the blocking step has been too harsh or not. 
332 | # 333 | # *Note: The **py_entitymatching** package provides some tools for debugging the blocker as well.* 334 | 335 | # In[23]: 336 | 337 | C[['l_norm_movie_title', 'r_norm_title', 'l_norm_title_year', 'r_norm_year', 338 | 'l_budget', 'r_budget', 'l_content_rating', 'r_mpaa']].head() 339 | 340 | 341 | # Based on the above sample we can see that the blocking seems to be reasonable. 342 | # 343 | # #### Substep D: Sampling from the candiate set 344 | # 345 | # The goal of this step is to obtain a sample from the candidate set and manually label the sampled candidates; that is, to specify if the candiate pair is a correct match or not. 346 | 347 | # In[24]: 348 | 349 | # Sampling 500 pairs and writing this sample into a .csv file 350 | sampled = C.sample(500, random_state=0) 351 | sampled.to_csv('./data/sampled.csv', encoding='utf-8') 352 | 353 | 354 | # In order to label the sampled data, we can create a new column in the _.csv_ file (which we call **label**) and put value 1 under that column if the pair is a correct match and 0 otherwise. To avoid overriding the files, let's rename the new file as **labeled.csv**. 355 | 356 | # In[25]: 357 | 358 | # If you would like to avoid labeling the pairs for now, you can download the labled.csv file from 359 | # BigGorilla using the following command (if you prefer to do it yourself, commend the next line) 360 | response = urllib.urlretrieve('https://anaconda.org/BigGorilla/datasets/1/download/labeled.csv', 361 | './data/labeled.csv') 362 | labeled = em.read_csv_metadata('data/labeled.csv', ltable=kaggle_data, rtable=imdb_data, 363 | fk_ltable='l_id', fk_rtable='r_id', key='_id') 364 | labeled.head() 365 | 366 | 367 | # #### Substep E: Traning machine learning algorithms 368 | # 369 | # Now we can use the sampled dataset to train various machine learning algorithms for our prediction task. 
To do so, we need to split our dataset into a training and a test set, and then select the desired machine learning techniques for our prediction task. 370 | 371 | # In[26]: 372 | 373 | split = em.split_train_test(labeled, train_proportion=0.5, random_state=0) 374 | train_data = split['train'] 375 | test_data = split['test'] 376 | 377 | dt = em.DTMatcher(name='DecisionTree', random_state=0) 378 | svm = em.SVMMatcher(name='SVM', random_state=0) 379 | rf = em.RFMatcher(name='RF', random_state=0) 380 | lg = em.LogRegMatcher(name='LogReg', random_state=0) 381 | ln = em.LinRegMatcher(name='LinReg') 382 | nb = em.NBMatcher(name='NaiveBayes') 383 | 384 | 385 | # Before we can apply any machine learning technique, we need to extract a set of features. Fortunately, the **py_entitymatching** package can automatically extract a set of features once we specify which columns in the two datasets correspond to each other. The following code snippet starts by specifying the correspondence between the column of the two datasets. Then, it uses the **py_entitymatching** package to determine the type of each column. By considering the types of columns in each dataset (stored in variables *l_attr_types* and *r_attr_types*), and using the tokenizers and similarity functions suggested by the package, we can extract a set of instructions for extracting features. Note that variable **F** is not the set of extracted features, rather it encodes the instructions for computing the features. 
386 | 387 | # In[27]: 388 | 389 | attr_corres = em.get_attr_corres(kaggle_data, imdb_data) 390 | attr_corres['corres'] = [('norm_movie_title', 'norm_title'), 391 | ('norm_title_year', 'norm_year'), 392 | ('content_rating', 'mpaa'), 393 | ('budget', 'budget'), 394 | ] 395 | 396 | l_attr_types = em.get_attr_types(kaggle_data) 397 | r_attr_types = em.get_attr_types(imdb_data) 398 | 399 | tok = em.get_tokenizers_for_matching() 400 | sim = em.get_sim_funs_for_matching() 401 | 402 | F = em.get_features(kaggle_data, imdb_data, l_attr_types, r_attr_types, attr_corres, tok, sim) 403 | 404 | 405 | # Given the set of desired features **F**, we can now calculate the feature values for our training data and also impute the missing values in our data. In this case, we choose to replace the missing values with the mean of the column. 406 | 407 | # In[28]: 408 | 409 | train_features = em.extract_feature_vecs(train_data, feature_table=F, attrs_after='label', show_progress=False) 410 | train_features = em.impute_table(train_features, exclude_attrs=['_id', 'l_id', 'r_id', 'label'], strategy='mean') 411 | 412 | 413 | # Using the calculated features, we can evaluate the performance of different machine learning algorithms and select the best one for our matching task. 414 | 415 | # In[29]: 416 | 417 | result = em.select_matcher([dt, rf, svm, ln, lg, nb], table=train_features, 418 | exclude_attrs=['_id', 'l_id', 'r_id', 'label'], k=5, 419 | target_attr='label', metric='f1', random_state=0) 420 | result['cv_stats'] 421 | 422 | 423 | # We can observe based on the reported accuracy of different techniques that the "random forest (RF)" algorithm achieves the best performance. Thus, it is best to use this technique for the matching. 424 | 425 | # #### Substep F: Evaluating the quality of our matching 426 | # 427 | # It is important to evaluate the quality of our matching. We can now, use the traning set for this purpose and measure how well the random forest predicts the matches. 
We can see that we are obtaining a high accuracy and recall on the test set as well. 428 | 429 | # In[30]: 430 | 431 | best_model = result['selected_matcher'] 432 | best_model.fit(table=train_features, exclude_attrs=['_id', 'l_id', 'r_id', 'label'], target_attr='label') 433 | 434 | test_features = em.extract_feature_vecs(test_data, feature_table=F, attrs_after='label', show_progress=False) 435 | test_features = em.impute_table(test_features, exclude_attrs=['_id', 'l_id', 'r_id', 'label'], strategy='mean') 436 | 437 | # Predict on the test data 438 | predictions = best_model.predict(table=test_features, exclude_attrs=['_id', 'l_id', 'r_id', 'label'], 439 | append=True, target_attr='predicted', inplace=False) 440 | 441 | # Evaluate the predictions 442 | eval_result = em.eval_matches(predictions, 'label', 'predicted') 443 | em.print_eval_summary(eval_result) 444 | 445 | 446 | # #### Substep G: Using the trained model to match the datasets 447 | # 448 | # Now, we can use the trained model to match the two tables as follows: 449 | 450 | # In[31]: 451 | 452 | candset_features = em.extract_feature_vecs(C, feature_table=F, show_progress=True) 453 | candset_features = em.impute_table(candset_features, exclude_attrs=['_id', 'l_id', 'r_id'], strategy='mean') 454 | predictions = best_model.predict(table=candset_features, exclude_attrs=['_id', 'l_id', 'r_id'], 455 | append=True, target_attr='predicted', inplace=False) 456 | matches = predictions[predictions.predicted == 1] 457 | 458 | 459 | # Note that the **matches** dataframe contains many columns storing the extracted features for both datasets. The following code snippet removes all the unnecessary columns and creates a nice formatted dataframe that has the resulting integrated dataset. 
460 | 461 | # In[32]: 462 | 463 | from py_entitymatching.catalog import catalog_manager as cm 464 | matches = matches[['_id', 'l_id', 'r_id', 'predicted']] 465 | matches.reset_index(drop=True, inplace=True) 466 | cm.set_candset_properties(matches, '_id', 'l_id', 'r_id', kaggle_data, imdb_data) 467 | matches = em.add_output_attributes(matches, l_output_attrs=['norm_movie_title', 'norm_title_year', 'budget', 'content_rating'], 468 | r_output_attrs=['norm_title', 'norm_year', 'budget', 'mpaa'], 469 | l_output_prefix='l_', r_output_prefix='r_', 470 | delete_from_catalog=False) 471 | matches.drop('predicted', axis=1, inplace=True) 472 | matches.head() 473 | 474 | -------------------------------------------------------------------------------- /matching_movies/Tutorial_py3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Part 1: Data Acquistion\n", 8 | "--------------------------\n", 9 | "BigGorilla recommends a list of tools for different data acquisition tasks (See [here](http://www.biggorilla.org/data-acquisition)). Among these tools, **urllib** is a popular python package for fetching data across the web. In this part, we use **urllib** to download the datasets that we need for this tutorial.\n", 10 | "\n", 11 | "### Step 1: downloading the \"Kaggle 5000 Movie Dataset\"\n", 12 | "The desired dataset is a _.csv_ file with a url that is specified in the code snippet below." 
13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 1, 18 | "metadata": { 19 | "collapsed": false 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "# Importing urlib (BigGorilla's recommendation for data acquisition from the web)\n", 24 | "import urllib.request\n", 25 | "import os\n", 26 | "\n", 27 | "# Creating the data folder\n", 28 | "if not os.path.exists('./data'):\n", 29 | " os.makedirs('./data')\n", 30 | "\n", 31 | "# Obtaining the dataset using the url that hosts it\n", 32 | "kaggle_url = 'https://github.com/sundeepblue/movie_rating_prediction/raw/master/movie_metadata.csv'\n", 33 | "if not os.path.exists('./data/kaggle_dataset.csv'): # avoid downloading if the file exists\n", 34 | " response = urllib.request.urlretrieve(kaggle_url, './data/kaggle_dataset.csv')" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "### Step 2: downloading the \"IMDB Plain Text Data\"\n", 42 | "The IMDB Plain Text Data (see [here](ftp://ftp.funet.fi/pub/mirrors/ftp.imdb.com/pub/)) is a collection of files where each files describe one or a few attributes of a movie. We are going to focus on a subset of movie attribues which subsequently means that we are only interested in a few of these files which are listed below:\n", 43 | "\n", 44 | "* genres.list.gz\n", 45 | "* ratings.list.gz\n", 46 | "\n", 47 | "_** Note: The total size of files mentioned above is roughly 30M. 
Running the following code may take a few minutes._" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 2, 53 | "metadata": { 54 | "collapsed": false 55 | }, 56 | "outputs": [], 57 | "source": [ 58 | "import gzip\n", 59 | "\n", 60 | "# Obtaining IMDB's text files\n", 61 | "imdb_url_prefix = 'ftp://ftp.funet.fi/pub/mirrors/ftp.imdb.com/pub/'\n", 62 | "imdb_files_list = ['genres.list.gz', 'ratings.list.gz']\n", 63 | "for name in imdb_files_list:\n", 64 | " if not os.path.exists('./data/' + name):\n", 65 | " response = urllib.request.urlretrieve(imdb_url_prefix + name, './data/' + name)\n", 66 | " with gzip.open('./data/' + name) as comp_file, open('./data/' + name[:-3], 'wb') as reg_file:\n", 67 | " file_content = comp_file.read()\n", 68 | " reg_file.write(file_content)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "### Step 3: downloading the \"IMDB Prepared Data\"\n", 76 | "During this tutorial, we discuss how the contents of _genres.list.gz_ and _ratings.list.gz_ files can be integrated. However, to make the tutorial more concise, we avoid including the same process for all the files in the \"IMDB Plain Text Data\". The \"IMDB Prepared Data\" is the dataset that we obtained by integrating a number of files from the \"IMDB Plain Text Data\" which we will use during later stages of this tutorial. The following code snippet downloads this dataset." 
77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 3, 82 | "metadata": { 83 | "collapsed": true 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "imdb_url = 'https://anaconda.org/BigGorilla/datasets/1/download/imdb_dataset.csv'\n", 88 | "if not os.path.exists('./data/imdb_dataset.csv'): # avoid downloading if the file exists\n", 89 | " response = urllib.request.urlretrieve(kaggle_url, './data/imdb_dataset.csv')" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "-----" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "# Part 2: Data Extraction\n", 104 | "-----------------\n", 105 | "The \"Kaggle 5000 Movie Dataset\" is stored in a _.csv_ file which is alreday structured and ready to use. On the other hand, the \"IMDB Plain Text Data\" is a collection of semi-structured text files that need to be processed to extract the data. A quick look at the first few lines of each files shows that each file has a different format and has to be handled separately.\n", 106 | "\n", 107 | "##### Content of \"ratings.list\" data file" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 4, 113 | "metadata": { 114 | "collapsed": false 115 | }, 116 | "outputs": [ 117 | { 118 | "name": "stdout", 119 | "output_type": "stream", 120 | "text": [ 121 | " 0000000125 1728818 9.2 The Shawshank Redemption (1994)\n", 122 | " 0000000125 1181412 9.2 The Godfather (1972)\n", 123 | " 0000000124 810055 9.0 The Godfather: Part II (1974)\n", 124 | " 0000000124 1714042 8.9 The Dark Knight (2008)\n", 125 | " 0000000133 461310 8.9 12 Angry Men (1957)\n", 126 | " 0000000133 885509 8.9 Schindler's List (1993)\n", 127 | " 0000000123 1354135 8.9 Pulp Fiction (1994)\n", 128 | " 0000000124 1241908 8.9 The Lord of the Rings: The Return of the King (2003)\n", 129 | " 0000000123 514540 8.9 Il buono, il brutto, il cattivo (1966)\n", 130 | " 0000000133 1380148 8.8 Fight Club 
(1999)\n", 131 | "\n" 132 | ] 133 | } 134 | ], 135 | "source": [ 136 | "with open(\"./data/ratings.list\", encoding='latin1') as myfile:\n", 137 | " head = [next(myfile) for x in range(38)]\n", 138 | "print (''.join(head[28:38])) # skipping the first 28 lines as they are descriptive headers" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "##### Content of the \"genres.list\" data file" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 5, 151 | "metadata": { 152 | "collapsed": false 153 | }, 154 | "outputs": [ 155 | { 156 | "name": "stdout", 157 | "output_type": "stream", 158 | "text": [ 159 | "\"!Next?\" (1994)\t\t\t\t\t\tDocumentary\n", 160 | "\"#1 Single\" (2006)\t\t\t\t\tReality-TV\n", 161 | "\"#15SecondScare\" (2015)\t\t\t\t\tHorror\n", 162 | "\"#15SecondScare\" (2015)\t\t\t\t\tShort\n", 163 | "\"#15SecondScare\" (2015)\t\t\t\t\tThriller\n", 164 | "\"#15SecondScare\" (2015) {Who Wants to Play with the Rabbit? (#1.2)}\tDrama\n", 165 | "\"#15SecondScare\" (2015) {Who Wants to Play with the Rabbit? (#1.2)}\tHorror\n", 166 | "\"#15SecondScare\" (2015) {Who Wants to Play with the Rabbit? (#1.2)}\tShort\n", 167 | "\"#15SecondScare\" (2015) {Who Wants to Play with the Rabbit? (#1.2)}\tThriller\n", 168 | "\"#1MinuteNightmare\" (2014)\t\t\t\tHorror\n", 169 | "\n" 170 | ] 171 | } 172 | ], 173 | "source": [ 174 | "with open(\"./data/genres.list\", encoding='latin1') as myfile:\n", 175 | " head = [next(myfile) for x in range(392)]\n", 176 | "print (''.join(head[382:392])) # skipping the first 382 lines as they are descriptive header" 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": {}, 182 | "source": [ 183 | "### Step 1: Extracting the information from \"genres.list\"\n", 184 | "The goal of this step is to extract the movie titles and their production year from \"movies.list\", and store the extracted data into a dataframe. 
Dataframe (from the python package **pandas**) is one of the key BigGorilla's recommendation for data profiling and cleaning. To extract the desired information from the text, we rely on **regular expressions** which are implemented in the python package \"**re**\"." 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 6, 190 | "metadata": { 191 | "collapsed": true 192 | }, 193 | "outputs": [], 194 | "source": [ 195 | "import re\n", 196 | "import pandas as pd\n", 197 | "\n", 198 | "with open(\"./data/genres.list\", encoding='latin1') as genres_file:\n", 199 | " raw_content = genres_file.readlines()\n", 200 | " genres_list = []\n", 201 | " content = raw_content[382:]\n", 202 | " for line in content:\n", 203 | " m = re.match(r'\"?(.*[^\"])\"? \\(((?:\\d|\\?){4})(?:/\\w*)?\\).*\\s((?:\\w|-)+)', line.strip())\n", 204 | " genres_list.append([m.group(1), m.group(2), m.group(3)])\n", 205 | " genres_data = pd.DataFrame(genres_list, columns=['movie', 'year', 'genre'])" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": {}, 211 | "source": [ 212 | "### Step 2: Extracting the information from \"ratings.list\"" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 7, 218 | "metadata": { 219 | "collapsed": true 220 | }, 221 | "outputs": [], 222 | "source": [ 223 | "with open(\"./data/ratings.list\", encoding='latin1') as ratings_file:\n", 224 | " raw_content = ratings_file.readlines()\n", 225 | " ratings_list = []\n", 226 | " content = raw_content[28:]\n", 227 | " for line in content:\n", 228 | " m = re.match(r'(?:\\d|\\.|\\*){10}\\s+\\d+\\s+(1?\\d\\.\\d)\\s\"?(.*[^\"])\"? 
\\(((?:\\d|\\?){4})(?:/\\w*)?\\)', line.strip())\n", 229 | " if m is None: continue\n", 230 | " ratings_list.append([m.group(2), m.group(3), m.group(1)])\n", 231 | " ratings_data = pd.DataFrame(ratings_list, columns=['movie', 'year', 'rating'])" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": {}, 237 | "source": [ 238 | "Note that one has to repeat the information extraction procedure for other data files as well if he is interested in their content. For now (and to keep the tutorial simple), we assume that we are only interested in genres and ratings of movies. The above code snippets store the extracted data on these two attributes into two dataframes (namely, **genres_list** and **ratings_list**).\n", 239 | "\n", 240 | "------" 241 | ] 242 | }, 243 | { 244 | "cell_type": "markdown", 245 | "metadata": {}, 246 | "source": [ 247 | "# Part 3: Data Profiling & Cleaning\n", 248 | "---------------------------\n", 249 | "\n", 250 | "The high-level goal in this stage of data prepration is to look into the data that we have acquired and extracted so far. This helps us to get familiar with data, understand in what ways the data needs cleaning or transformation, and finally enables us to prepare the data for the following steps of the data integration task.\n", 251 | "\n", 252 | "### Step 1: Loading the \"Kaggle 5000 Movies Dataset\"\n", 253 | "\n", 254 | "According to BigGorilla, dataframes (from the python package **pandas**) are suitable for data exploration and data profiling. In [Part 2](#Part-2:-Data-Extraction) of the tutorial, we stored the extracted data from \"IMDB Plain Text Data\" into dataframes. It would be appropriate to load the \"Kaggle 5000 Movies Dataset\" into a dataframe as well and follow the same data profiling procedure for all datasets." 
255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 8, 260 | "metadata": { 261 | "collapsed": true 262 | }, 263 | "outputs": [], 264 | "source": [ 265 | "import pandas as pd\n", 266 | "\n", 267 | "# Loading the Kaggle dataset from the .csv file (kaggle_dataset.csv)\n", 268 | "kaggle_data = pd.read_csv('./data/kaggle_dataset.csv')" 269 | ] 270 | }, 271 | { 272 | "cell_type": "markdown", 273 | "metadata": {}, 274 | "source": [ 275 | "### Step 2: Calculating some basic statistics (profiling)\n", 276 | "\n", 277 | "Let's start by finding out how many movies are listed in each dataframe." 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": 9, 283 | "metadata": { 284 | "collapsed": false 285 | }, 286 | "outputs": [ 287 | { 288 | "name": "stdout", 289 | "output_type": "stream", 290 | "text": [ 291 | "Number of movies in kaggle_data: 5043\n", 292 | "Number of movies in genres_data: 2384400\n", 293 | "Number of movies in ratings_data: 691621\n" 294 | ] 295 | } 296 | ], 297 | "source": [ 298 | "print ('Number of movies in kaggle_data: {}'.format(kaggle_data.shape[0]))\n", 299 | "print ('Number of movies in genres_data: {}'.format(genres_data.shape[0]))\n", 300 | "print ('Number of movies in ratings_data: {}'.format(ratings_data.shape[0]))" 301 | ] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "metadata": {}, 306 | "source": [ 307 | "We can also check to see if we have duplicates (i.e., a movie appearing more than once) in the data. We consider an entry duplicate if we can find another entry with the same movie title and production year." 
308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": 10, 313 | "metadata": { 314 | "collapsed": false 315 | }, 316 | "outputs": [ 317 | { 318 | "name": "stdout", 319 | "output_type": "stream", 320 | "text": [ 321 | "Number of duplicates in kaggle_data: 241\n", 322 | "Number of duplicates in genres_data: 1807712\n", 323 | "Number of duplicates in ratings_data: 286515\n" 324 | ] 325 | } 326 | ], 327 | "source": [ 328 | "print ('Number of duplicates in kaggle_data: {}'.format(\n", 329 | " sum(kaggle_data.duplicated(subset=['movie_title', 'title_year'], keep=False))))\n", 330 | "print ('Number of duplicates in genres_data: {}'.format(\n", 331 | " sum(genres_data.duplicated(subset=['movie', 'year'], keep=False))))\n", 332 | "print ('Number of duplicates in ratings_data: {}'.format(\n", 333 | " sum(ratings_data.duplicated(subset=['movie', 'year'], keep=False))))" 334 | ] 335 | }, 336 | { 337 | "cell_type": "markdown", 338 | "metadata": {}, 339 | "source": [ 340 | "### Step 3: Dealing with duplicates (cleaning)\n", 341 | "\n", 342 | "There are many strategies to deal with duplicates. Here, we are going to use a simple method for dealing with duplicates and that is to only keep the first occurrence of a duplicated entry and remove the rest." 
343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": 11, 348 | "metadata": { 349 | "collapsed": true 350 | }, 351 | "outputs": [], 352 | "source": [ 353 | "kaggle_data = kaggle_data.drop_duplicates(subset=['movie_title', 'title_year'], keep='first').copy()\n", 354 | "genres_data = genres_data.drop_duplicates(subset=['movie', 'year'], keep='first').copy()\n", 355 | "ratings_data = ratings_data.drop_duplicates(subset=['movie', 'year'], keep='first').copy()" 356 | ] 357 | }, 358 | { 359 | "cell_type": "markdown", 360 | "metadata": {}, 361 | "source": [ 362 | "### Step 4: Normalizing the text (cleaning)\n", 363 | "\n", 364 | "The key attribute that we will use to integrate our movie datasets is the movie titles. So it is important to normalize these titles. The following code snippet makes all movie titles lower case, and then removes certain characters such as \"'\" and \"?\", and replaces some other special characters (e.g., \"&\" is replaced with \"and\"). " 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": 12, 370 | "metadata": { 371 | "collapsed": false 372 | }, 373 | "outputs": [], 374 | "source": [ 375 | "def preprocess_title(title):\n", 376 | " title = title.lower()\n", 377 | " title = title.replace(',', ' ')\n", 378 | " title = title.replace(\"'\", '') \n", 379 | " title = title.replace('&', 'and')\n", 380 | " title = title.replace('?', '')\n", 381 | " return title.strip()\n", 382 | "\n", 383 | "kaggle_data['norm_movie_title'] = kaggle_data['movie_title'].map(preprocess_title)\n", 384 | "genres_data['norm_movie'] = genres_data['movie'].map(preprocess_title)\n", 385 | "ratings_data['norm_movie'] = ratings_data['movie'].map(preprocess_title)" 386 | ] 387 | }, 388 | { 389 | "cell_type": "markdown", 390 | "metadata": {}, 391 | "source": [ 392 | "### Step 5: Looking at a few samples\n", 393 | "\n", 394 | "The goal here is to a look at a few sample entries from each dataset for a quick sanity check. 
To keep the tutorial consice, we just present this step for the \"Kaggle 5000 Movies Dataset\" which is stored in the **kaggle_data** dataframe. " 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": 13, 400 | "metadata": { 401 | "collapsed": false 402 | }, 403 | "outputs": [ 404 | { 405 | "data": { 406 | "text/html": [ 407 | "

\n", 408 | "\n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | "
colordirector_namenum_critic_for_reviewsdurationdirector_facebook_likesactor_3_facebook_likesactor_2_nameactor_1_facebook_likesgrossgenres...languagecountrycontent_ratingbudgettitle_yearactor_2_facebook_likesimdb_scoreaspect_ratiomovie_facebook_likesnorm_movie_title
4422ColorSimeon Rice6.093.06.056.0Lisa Brave393.0NaNAction|Horror|Thriller...EnglishUSAR1500000.02014.0191.05.52.35307unsullied
1022ColorDoug Liman214.0108.0218.0405.0Ty Burrell6000.09528092.0Biography|Drama|Thriller...EnglishUSAPG-1322000000.02010.03000.06.82.359000fair game
3631ColorJonathan Levine147.099.0129.0362.0Aaron Yoo976.02077046.0Comedy|Drama|Romance...EnglishUSAR6000000.02008.0617.07.02.350the wackness
\n", 510 | "

3 rows × 29 columns

\n", 511 | "
" 512 | ], 513 | "text/plain": [ 514 | " color director_name num_critic_for_reviews duration \\\n", 515 | "4422 Color Simeon Rice 6.0 93.0 \n", 516 | "1022 Color Doug Liman 214.0 108.0 \n", 517 | "3631 Color Jonathan Levine 147.0 99.0 \n", 518 | "\n", 519 | " director_facebook_likes actor_3_facebook_likes actor_2_name \\\n", 520 | "4422 6.0 56.0 Lisa Brave \n", 521 | "1022 218.0 405.0 Ty Burrell \n", 522 | "3631 129.0 362.0 Aaron Yoo \n", 523 | "\n", 524 | " actor_1_facebook_likes gross genres \\\n", 525 | "4422 393.0 NaN Action|Horror|Thriller \n", 526 | "1022 6000.0 9528092.0 Biography|Drama|Thriller \n", 527 | "3631 976.0 2077046.0 Comedy|Drama|Romance \n", 528 | "\n", 529 | " ... language country content_rating budget title_year \\\n", 530 | "4422 ... English USA R 1500000.0 2014.0 \n", 531 | "1022 ... English USA PG-13 22000000.0 2010.0 \n", 532 | "3631 ... English USA R 6000000.0 2008.0 \n", 533 | "\n", 534 | " actor_2_facebook_likes imdb_score aspect_ratio movie_facebook_likes \\\n", 535 | "4422 191.0 5.5 2.35 307 \n", 536 | "1022 3000.0 6.8 2.35 9000 \n", 537 | "3631 617.0 7.0 2.35 0 \n", 538 | "\n", 539 | " norm_movie_title \n", 540 | "4422 unsullied \n", 541 | "1022 fair game \n", 542 | "3631 the wackness \n", 543 | "\n", 544 | "[3 rows x 29 columns]" 545 | ] 546 | }, 547 | "execution_count": 13, 548 | "metadata": {}, 549 | "output_type": "execute_result" 550 | } 551 | ], 552 | "source": [ 553 | "kaggle_data.sample(3, random_state=0)" 554 | ] 555 | }, 556 | { 557 | "cell_type": "markdown", 558 | "metadata": {}, 559 | "source": [ 560 | "Looking at the data guides us to decide in what ways we might want to clean the data. For instance, the small sample data shown above, reveals that the **title_year** attribute is stored as floats (i.e., rational numbers). We can add another cleaning step to transform the **title_year** into strings and replace the missing title years with symbol **\"?\"**." 
561 | ] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": 14, 566 | "metadata": { 567 | "collapsed": false 568 | }, 569 | "outputs": [ 570 | { 571 | "data": { 572 | "text/html": [ 573 | "
\n", 574 | "\n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | "
colordirector_namenum_critic_for_reviewsdurationdirector_facebook_likesactor_3_facebook_likesactor_2_nameactor_1_facebook_likesgrossgenres...countrycontent_ratingbudgettitle_yearactor_2_facebook_likesimdb_scoreaspect_ratiomovie_facebook_likesnorm_movie_titlenorm_title_year
0ColorJames Cameron723.0178.00.0855.0Joel David Moore1000.0760505847.0Action|Adventure|Fantasy|Sci-Fi...USAPG-13237000000.02009.0936.07.91.7833000avatar2009
1ColorGore Verbinski302.0169.0563.01000.0Orlando Bloom40000.0309404152.0Action|Adventure|Fantasy...USAPG-13300000000.02007.05000.07.12.350pirates of the caribbean: at worlds end2007
2ColorSam Mendes602.0148.00.0161.0Rory Kinnear11000.0200074175.0Action|Adventure|Thriller...UKPG-13245000000.02015.0393.06.82.3585000spectre2015
3ColorChristopher Nolan813.0164.022000.023000.0Christian Bale27000.0448130642.0Action|Thriller...USAPG-13250000000.02012.023000.08.52.35164000the dark knight rises2012
4NaNDoug WalkerNaNNaN131.0NaNRob Walker131.0NaNDocumentary...NaNNaNNaNNaN12.07.1NaN0star wars: episode vii - the force awakens?
\n", 724 | "

5 rows × 30 columns

\n", 725 | "
" 726 | ], 727 | "text/plain": [ 728 | " color director_name num_critic_for_reviews duration \\\n", 729 | "0 Color James Cameron 723.0 178.0 \n", 730 | "1 Color Gore Verbinski 302.0 169.0 \n", 731 | "2 Color Sam Mendes 602.0 148.0 \n", 732 | "3 Color Christopher Nolan 813.0 164.0 \n", 733 | "4 NaN Doug Walker NaN NaN \n", 734 | "\n", 735 | " director_facebook_likes actor_3_facebook_likes actor_2_name \\\n", 736 | "0 0.0 855.0 Joel David Moore \n", 737 | "1 563.0 1000.0 Orlando Bloom \n", 738 | "2 0.0 161.0 Rory Kinnear \n", 739 | "3 22000.0 23000.0 Christian Bale \n", 740 | "4 131.0 NaN Rob Walker \n", 741 | "\n", 742 | " actor_1_facebook_likes gross genres \\\n", 743 | "0 1000.0 760505847.0 Action|Adventure|Fantasy|Sci-Fi \n", 744 | "1 40000.0 309404152.0 Action|Adventure|Fantasy \n", 745 | "2 11000.0 200074175.0 Action|Adventure|Thriller \n", 746 | "3 27000.0 448130642.0 Action|Thriller \n", 747 | "4 131.0 NaN Documentary \n", 748 | "\n", 749 | " ... country content_rating budget title_year \\\n", 750 | "0 ... USA PG-13 237000000.0 2009.0 \n", 751 | "1 ... USA PG-13 300000000.0 2007.0 \n", 752 | "2 ... UK PG-13 245000000.0 2015.0 \n", 753 | "3 ... USA PG-13 250000000.0 2012.0 \n", 754 | "4 ... NaN NaN NaN NaN \n", 755 | "\n", 756 | " actor_2_facebook_likes imdb_score aspect_ratio movie_facebook_likes \\\n", 757 | "0 936.0 7.9 1.78 33000 \n", 758 | "1 5000.0 7.1 2.35 0 \n", 759 | "2 393.0 6.8 2.35 85000 \n", 760 | "3 23000.0 8.5 2.35 164000 \n", 761 | "4 12.0 7.1 NaN 0 \n", 762 | "\n", 763 | " norm_movie_title norm_title_year \n", 764 | "0 avatar 2009 \n", 765 | "1 pirates of the caribbean: at worlds end 2007 \n", 766 | "2 spectre 2015 \n", 767 | "3 the dark knight rises 2012 \n", 768 | "4 star wars: episode vii - the force awakens ? 
\n", 769 | "\n", 770 | "[5 rows x 30 columns]" 771 | ] 772 | }, 773 | "execution_count": 14, 774 | "metadata": {}, 775 | "output_type": "execute_result" 776 | } 777 | ], 778 | "source": [ 779 | "def preprocess_year(year):\n", 780 | " if pd.isnull(year):\n", 781 | " return '?'\n", 782 | " else:\n", 783 | " return str(int(year))\n", 784 | "\n", 785 | "kaggle_data['norm_title_year'] = kaggle_data['title_year'].map(preprocess_year)\n", 786 | "kaggle_data.head()" 787 | ] 788 | }, 789 | { 790 | "cell_type": "markdown", 791 | "metadata": {}, 792 | "source": [ 793 | "-----" 794 | ] 795 | }, 796 | { 797 | "cell_type": "markdown", 798 | "metadata": {}, 799 | "source": [ 800 | "# Part 4: Data Matching & Merging\n", 801 | "-------------------------\n", 802 | "The main goal in this part is go match the data that we have acquired from different sources to create a single rich dataset. Recall that in [Part 3](#Part-3:-Data-Profiling-&-Cleaning), we transformed all datasets into a dataframe which we used to clean the data. In this part, we continue using the same dataframes for the data that we have prepared so far.\n", 803 | "\n", 804 | "### Step 1: Integrating the \"IMDB Plain Text Data\" files\n", 805 | "Note that both **ratings_data** and **genres_data** dataframes contain data that come from the same source (i.e., \"the IMDB Plain Text data\"). Thus, we assume that there are no inconsistencies between the data stored in these dataframe and to combine them, all we need to do is to match the entries that share the same title and production year. This simple \"exact match\" can be done simply using dataframes." 806 | ] 807 | }, 808 | { 809 | "cell_type": "code", 810 | "execution_count": 15, 811 | "metadata": { 812 | "collapsed": false 813 | }, 814 | "outputs": [ 815 | { 816 | "data": { 817 | "text/html": [ 818 | "
\n", 819 | "\n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | "
movie_xyearratingnorm_moviemovie_ygenre
0The Shawshank Redemption19949.2the shawshank redemptionThe Shawshank RedemptionCrime
1The Godfather19729.2the godfatherThe GodfatherCrime
2The Godfather: Part II19749.0the godfather: part iiThe Godfather: Part IICrime
3The Dark Knight20088.9the dark knightThe Dark KnightAction
412 Angry Men19578.912 angry men12 Angry MenCrime
\n", 879 | "
" 880 | ], 881 | "text/plain": [ 882 | " movie_x year rating norm_movie \\\n", 883 | "0 The Shawshank Redemption 1994 9.2 the shawshank redemption \n", 884 | "1 The Godfather 1972 9.2 the godfather \n", 885 | "2 The Godfather: Part II 1974 9.0 the godfather: part ii \n", 886 | "3 The Dark Knight 2008 8.9 the dark knight \n", 887 | "4 12 Angry Men 1957 8.9 12 angry men \n", 888 | "\n", 889 | " movie_y genre \n", 890 | "0 The Shawshank Redemption Crime \n", 891 | "1 The Godfather Crime \n", 892 | "2 The Godfather: Part II Crime \n", 893 | "3 The Dark Knight Action \n", 894 | "4 12 Angry Men Crime " 895 | ] 896 | }, 897 | "execution_count": 15, 898 | "metadata": {}, 899 | "output_type": "execute_result" 900 | } 901 | ], 902 | "source": [ 903 | "brief_imdb_data = pd.merge(ratings_data, genres_data, how='inner', on=['norm_movie', 'year'])\n", 904 | "brief_imdb_data.head()" 905 | ] 906 | }, 907 | { 908 | "cell_type": "markdown", 909 | "metadata": {}, 910 | "source": [ 911 | "We refer to the dataset created above as the **brief_imdb_data** since it only contains two attributes (namely, genre and rating). Henceforth, we are going to use a richer version of the IMDB dataset which we created by integrating a number of files from the \"IMDB Plain Text Data\". If you have completed the first part of this tutorial, then this dataset is already downloaded and stored in *\"imdb_dataset.csv\"* under the _\"data\"_ folder. The following code snippet loads this dataset, does preprocessing on the title and production year of movies, removes the duplicates as before, and prints the size of the dataset." 
912 | ] 913 | }, 914 | { 915 | "cell_type": "code", 916 | "execution_count": 16, 917 | "metadata": { 918 | "collapsed": false 919 | }, 920 | "outputs": [ 921 | { 922 | "data": { 923 | "text/plain": [ 924 | "(869178, 27)" 925 | ] 926 | }, 927 | "execution_count": 16, 928 | "metadata": {}, 929 | "output_type": "execute_result" 930 | } 931 | ], 932 | "source": [ 933 | "# reading the new IMDB dataset\n", 934 | "imdb_data = pd.read_csv('./data/imdb_dataset.csv')\n", 935 | "# let's normalize the title as we did in Part 3 of the tutorial\n", 936 | "imdb_data['norm_title'] = imdb_data['movie_title'].map(preprocess_title)\n", 937 | "imdb_data['norm_year'] = imdb_data['title_year'].map(preprocess_year)\n", 938 | "imdb_data = imdb_data.drop_duplicates(subset=['norm_title', 'norm_year'], keep='first').copy()\n", 939 | "imdb_data.shape" 940 | ] 941 | }, 942 | { 943 | "cell_type": "markdown", 944 | "metadata": {}, 945 | "source": [ 946 | "### Step 2: Integrating the Kaggle and IMDB datasets\n", 947 | "\n", 948 | "A simple approach to integrate the two datasets is to simply join entries that share the same movie title and year of production. The following code reveals that 4,248 matches are found using this simple approach."
949 | ] 950 | }, 951 | { 952 | "cell_type": "code", 953 | "execution_count": 17, 954 | "metadata": { 955 | "collapsed": false 956 | }, 957 | "outputs": [ 958 | { 959 | "data": { 960 | "text/plain": [ 961 | "(4248, 57)" 962 | ] 963 | }, 964 | "execution_count": 17, 965 | "metadata": {}, 966 | "output_type": "execute_result" 967 | } 968 | ], 969 | "source": [ 970 | "data_attempt1 = pd.merge(imdb_data, kaggle_data, how='inner', left_on=['norm_title', 'norm_year'],\n", 971 | " right_on=['norm_movie_title', 'norm_title_year'])\n", 972 | "data_attempt1.shape" 973 | ] 974 | }, 975 | { 976 | "cell_type": "markdown", 977 | "metadata": {}, 978 | "source": [ 979 | "But given that IMDB and Kaggle datasets are collected from different sources, chances are that the name of a movie would be slightly different in these datasets (e.g. \"Wall.E\" vs \"WallE\"). To be able to find such matches, one can look at the similarity of movie titles and consider title with high similarity to be the same entity. BigGorilla's recommendation for doing similarity join across two datasets is the python package **py_stringsimjoin**. The following code snippet uses the **py_stringsimjoin** to match all the titles that have an edit distance of one or less (i.e., there is at most one character that needs to be changed/added/removed to make both titles identical). Once the similarity join is complete, it only selects the title pairs that are produced in the same year." 
980 | ] 981 | }, 982 | { 983 | "cell_type": "code", 984 | "execution_count": 18, 985 | "metadata": { 986 | "collapsed": false 987 | }, 988 | "outputs": [ 989 | { 990 | "name": "stderr", 991 | "output_type": "stream", 992 | "text": [ 993 | "0% 100%\n", 994 | "[##############################] | ETA: 00:00:00\n", 995 | "Total time elapsed: 00:02:01\n" 996 | ] 997 | }, 998 | { 999 | "data": { 1000 | "text/plain": [ 1001 | "(4689, 8)" 1002 | ] 1003 | }, 1004 | "execution_count": 18, 1005 | "metadata": {}, 1006 | "output_type": "execute_result" 1007 | } 1008 | ], 1009 | "source": [ 1010 | "import py_stringsimjoin as ssj\n", 1011 | "import py_stringmatching as sm\n", 1012 | "\n", 1013 | "imdb_data['id'] = range(imdb_data.shape[0])\n", 1014 | "kaggle_data['id'] = range(kaggle_data.shape[0])\n", 1015 | "similar_titles = ssj.edit_distance_join(imdb_data, kaggle_data, 'id', 'id', 'norm_title',\n", 1016 | " 'norm_movie_title', l_out_attrs=['norm_title', 'norm_year'],\n", 1017 | " r_out_attrs=['norm_movie_title', 'norm_title_year'], threshold=1)\n", 1018 | "# selecting the entries that have the same production year\n", 1019 | "data_attempt2 = similar_titles[similar_titles.r_norm_title_year == similar_titles.l_norm_year]\n", 1020 | "data_attempt2.shape" 1021 | ] 1022 | }, 1023 | { 1024 | "cell_type": "markdown", 1025 | "metadata": {}, 1026 | "source": [ 1027 | "We can see that using the similarity join 4,689 titles were matched. Let's look at some of the titles that are matched by the similarity join but are not identical." 1028 | ] 1029 | }, 1030 | { 1031 | "cell_type": "code", 1032 | "execution_count": 19, 1033 | "metadata": { 1034 | "collapsed": false 1035 | }, 1036 | "outputs": [ 1037 | { 1038 | "data": { 1039 | "text/html": [ 1040 | "
\n", 1041 | "\n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 | " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | " \n", 1062 | " \n", 1063 | " \n", 1064 | " \n", 1065 | " \n", 1066 | " \n", 1067 | " \n", 1068 | " \n", 1069 | " \n", 1070 | " \n", 1071 | " \n", 1072 | " \n", 1073 | " \n", 1074 | " \n", 1075 | " \n", 1076 | " \n", 1077 | " \n", 1078 | " \n", 1079 | " \n", 1080 | " \n", 1081 | " \n", 1082 | " \n", 1083 | " \n", 1084 | " \n", 1085 | " \n", 1086 | " \n", 1087 | " \n", 1088 | " \n", 1089 | " \n", 1090 | " \n", 1091 | " \n", 1092 | " \n", 1093 | " \n", 1094 | " \n", 1095 | " \n", 1096 | " \n", 1097 | " \n", 1098 | " \n", 1099 | " \n", 1100 | " \n", 1101 | " \n", 1102 | " \n", 1103 | " \n", 1104 | " \n", 1105 | " \n", 1106 | " \n", 1107 | " \n", 1108 | " \n", 1109 | " \n", 1110 | " \n", 1111 | " \n", 1112 | "
_idl_idr_idl_norm_titlel_norm_yearr_norm_movie_titler_norm_title_year_sim_score
14614685273646world war v2013world war z20131.0
16016028164956grave2012brave20121.0
18018083149058walle2008wall·e20081.0
23923981618867upe2009up20091.0
24624681736667ut2009up20091.0
\n", 1113 | "
" 1114 | ], 1115 | "text/plain": [ 1116 | " _id l_id r_id l_norm_title l_norm_year r_norm_movie_title \\\n", 1117 | "146 146 852736 46 world war v 2013 world war z \n", 1118 | "160 160 281649 56 grave 2012 brave \n", 1119 | "180 180 831490 58 walle 2008 wall·e \n", 1120 | "239 239 816188 67 upe 2009 up \n", 1121 | "246 246 817366 67 ut 2009 up \n", 1122 | "\n", 1123 | " r_norm_title_year _sim_score \n", 1124 | "146 2013 1.0 \n", 1125 | "160 2012 1.0 \n", 1126 | "180 2008 1.0 \n", 1127 | "239 2009 1.0 \n", 1128 | "246 2009 1.0 " 1129 | ] 1130 | }, 1131 | "execution_count": 19, 1132 | "metadata": {}, 1133 | "output_type": "execute_result" 1134 | } 1135 | ], 1136 | "source": [ 1137 | "data_attempt2[data_attempt2.l_norm_title != data_attempt2.r_norm_movie_title].head()" 1138 | ] 1139 | }, 1140 | { 1141 | "cell_type": "markdown", 1142 | "metadata": {}, 1143 | "source": [ 1144 | "While instances such as \"walle\" and \"wall.e\" are correctly matched, we can see that this techniques also makes some errors (e.g., \"grave\" and \"brave\"). This raises the following questions: \"what method should be used for data matching?\" and \"how can we determine the quality of the matching?\". BigGorilla's recommendation for dealing with this problem is using the python package **py_entitymatching** which is developed as part of the [Magellan project](https://sites.google.com/site/anhaidgroup/projects/magellan).\n", 1145 | "\n", 1146 | "In the next step, we demonstrate how **py_entitymatching** uses machine learning techniques for the data-matching purposes as well as how it enables us to evaluate the quality of the produced matching.\n", 1147 | "\n", 1148 | "### Step 3: Using Magellan for data matching\n", 1149 | "\n", 1150 | "#### Substep A: Finding a candiate set (Blocking)\n", 1151 | "The goal of this step is to limit the number of pairs that we consider as potential matches using a simple heuristic. 
For this task, we can create a new column in each dataset that combines the values of important attributes into a single string (which we call the **mixture**). Then, we can use the string similarity join as before to find a set of entities that have some overlap in the values of the important columns. Before doing that, we need to transform the columns that are part of the mixture to strings. The **py_stringsimjoin** package allows us to do so easily." 1152 | ] 1153 | }, 1154 | { 1155 | "cell_type": "code", 1156 | "execution_count": 20, 1157 | "metadata": { 1158 | "collapsed": false 1159 | }, 1160 | "outputs": [], 1161 | "source": [ 1162 | "# transforming the \"budget\" column into string and creating a new **mixture** column\n", 1163 | "ssj.utils.converter.dataframe_column_to_str(imdb_data, 'budget', inplace=True)\n", 1164 | "imdb_data['mixture'] = imdb_data['norm_title'] + ' ' + imdb_data['norm_year'] + ' ' + imdb_data['budget']\n", 1165 | "\n", 1166 | "# repeating the same thing for the Kaggle dataset\n", 1167 | "ssj.utils.converter.dataframe_column_to_str(kaggle_data, 'budget', inplace=True)\n", 1168 | "kaggle_data['mixture'] = kaggle_data['norm_movie_title'] + ' ' + kaggle_data['norm_title_year'] + \\\n", 1169 | "                      ' ' + kaggle_data['budget']" 1170 | ] 1171 | }, 1172 | { 1173 | "cell_type": "markdown", 1174 | "metadata": {}, 1175 | "source": [ 1176 | "Now, we can use the **mixture** columns to create a desired candidate set which we call **C**."
1177 | ] 1178 | }, 1179 | { 1180 | "cell_type": "code", 1181 | "execution_count": 21, 1182 | "metadata": { 1183 | "collapsed": false 1184 | }, 1185 | "outputs": [ 1186 | { 1187 | "name": "stderr", 1188 | "output_type": "stream", 1189 | "text": [ 1190 | "0% 100%\n", 1191 | "[##############################] | ETA: 00:00:00\n", 1192 | "Total time elapsed: 00:00:49\n" 1193 | ] 1194 | }, 1195 | { 1196 | "data": { 1197 | "text/plain": [ 1198 | "(18317, 14)" 1199 | ] 1200 | }, 1201 | "execution_count": 21, 1202 | "metadata": {}, 1203 | "output_type": "execute_result" 1204 | } 1205 | ], 1206 | "source": [ 1207 | "C = ssj.overlap_coefficient_join(kaggle_data, imdb_data, 'id', 'id', 'mixture', 'mixture', sm.WhitespaceTokenizer(), \n", 1208 | " l_out_attrs=['norm_movie_title', 'norm_title_year', 'duration',\n", 1209 | " 'budget', 'content_rating'],\n", 1210 | " r_out_attrs=['norm_title', 'norm_year', 'duration',\n", 1211 | " 'budget', 'content_rating'],\n", 1212 | " threshold=0.65)\n", 1213 | "C.shape" 1214 | ] 1215 | }, 1216 | { 1217 | "cell_type": "markdown", 1218 | "metadata": {}, 1219 | "source": [ 1220 | "We can see that by doing a similarity join, we already reduced the candidate set to 18,317 pairs.\n", 1221 | "\n", 1222 | "#### Substep B: Specifying the keys \n", 1223 | "The next step is to specify to the **py_entitymatching** package which columns correspond to the keys in each dataframe. Also, we need to specify which columns correspond to the foreign keys of the the two dataframes in the candidate set." 
1224 | ] 1225 | }, 1226 | { 1227 | "cell_type": "code", 1228 | "execution_count": 22, 1229 | "metadata": { 1230 | "collapsed": false 1231 | }, 1232 | "outputs": [ 1233 | { 1234 | "data": { 1235 | "text/plain": [ 1236 | "True" 1237 | ] 1238 | }, 1239 | "execution_count": 22, 1240 | "metadata": {}, 1241 | "output_type": "execute_result" 1242 | } 1243 | ], 1244 | "source": [ 1245 | "import py_entitymatching as em\n", 1246 | "em.set_key(kaggle_data, 'id')   # specifying the key column in the kaggle dataset\n", 1247 | "em.set_key(imdb_data, 'id')     # specifying the key column in the imdb dataset\n", 1248 | "em.set_key(C, '_id')            # specifying the key in the candidate set\n", 1249 | "em.set_ltable(C, kaggle_data)   # specifying the left table \n", 1250 | "em.set_rtable(C, imdb_data)     # specifying the right table\n", 1251 | "em.set_fk_rtable(C, 'r_id')     # specifying the column that matches the key in the right table \n", 1252 | "em.set_fk_ltable(C, 'l_id')     # specifying the column that matches the key in the left table " 1253 | ] 1254 | }, 1255 | { 1256 | "cell_type": "markdown", 1257 | "metadata": {}, 1258 | "source": [ 1259 | "\n", 1260 | "#### Substep C: Debugging the blocker\n", 1261 | "\n", 1262 | "Now, we need to make sure that the candidate set is loose enough to include pairs of movies that are not very close. If this is not the case, there is a chance that we have eliminated pairs that could be potentially matched together. By looking at a few pairs from the candidate set, we can judge whether the blocking step has been too harsh or not.\n", 1263 | "\n", 1264 | "*Note: The **py_entitymatching** package provides some tools for debugging the blocker as well.*" 1265 | ] 1266 | }, 1267 | { 1268 | "cell_type": "code", 1269 | "execution_count": 23, 1270 | "metadata": { 1271 | "collapsed": false 1272 | }, 1273 | "outputs": [ 1274 | { 1275 | "data": { 1276 | "text/html": [ 1277 | "
\n", 1278 | "\n", 1279 | " \n", 1280 | " \n", 1281 | " \n", 1282 | " \n", 1283 | " \n", 1284 | " \n", 1285 | " \n", 1286 | " \n", 1287 | " \n", 1288 | " \n", 1289 | " \n", 1290 | " \n", 1291 | " \n", 1292 | " \n", 1293 | " \n", 1294 | " \n", 1295 | " \n", 1296 | " \n", 1297 | " \n", 1298 | " \n", 1299 | " \n", 1300 | " \n", 1301 | " \n", 1302 | " \n", 1303 | " \n", 1304 | " \n", 1305 | " \n", 1306 | " \n", 1307 | " \n", 1308 | " \n", 1309 | " \n", 1310 | " \n", 1311 | " \n", 1312 | " \n", 1313 | " \n", 1314 | " \n", 1315 | " \n", 1316 | " \n", 1317 | " \n", 1318 | " \n", 1319 | " \n", 1320 | " \n", 1321 | " \n", 1322 | " \n", 1323 | " \n", 1324 | " \n", 1325 | " \n", 1326 | " \n", 1327 | " \n", 1328 | " \n", 1329 | " \n", 1330 | " \n", 1331 | " \n", 1332 | " \n", 1333 | " \n", 1334 | " \n", 1335 | " \n", 1336 | " \n", 1337 | " \n", 1338 | " \n", 1339 | " \n", 1340 | " \n", 1341 | " \n", 1342 | " \n", 1343 | " \n", 1344 | " \n", 1345 | " \n", 1346 | " \n", 1347 | " \n", 1348 | " \n", 1349 | "
l_norm_movie_titler_norm_titlel_norm_title_yearr_norm_yearl_budgetr_budgetl_content_ratingr_mpaa
0dude wheres my dog!#hacked201420142000020000PGNaN
1road hard#horror2015201515000001500000NaNNaN
2#horror#horror2015201515000001500000Not RatedNaN
3me you and five bucks#horror2015201515000001500000NaNNaN
4checkmate#horror2015201515000001500000NaNNaN
\n", 1350 | "
" 1351 | ], 1352 | "text/plain": [ 1353 | " l_norm_movie_title r_norm_title l_norm_title_year r_norm_year l_budget \\\n", 1354 | "0 dude wheres my dog! #hacked 2014 2014 20000 \n", 1355 | "1 road hard #horror 2015 2015 1500000 \n", 1356 | "2 #horror #horror 2015 2015 1500000 \n", 1357 | "3 me you and five bucks #horror 2015 2015 1500000 \n", 1358 | "4 checkmate #horror 2015 2015 1500000 \n", 1359 | "\n", 1360 | " r_budget l_content_rating r_mpaa \n", 1361 | "0 20000 PG NaN \n", 1362 | "1 1500000 NaN NaN \n", 1363 | "2 1500000 Not Rated NaN \n", 1364 | "3 1500000 NaN NaN \n", 1365 | "4 1500000 NaN NaN " 1366 | ] 1367 | }, 1368 | "execution_count": 23, 1369 | "metadata": {}, 1370 | "output_type": "execute_result" 1371 | } 1372 | ], 1373 | "source": [ 1374 | "C[['l_norm_movie_title', 'r_norm_title', 'l_norm_title_year', 'r_norm_year',\n", 1375 | " 'l_budget', 'r_budget', 'l_content_rating', 'r_content_rating']].head()" 1376 | ] 1377 | }, 1378 | { 1379 | "cell_type": "markdown", 1380 | "metadata": {}, 1381 | "source": [ 1382 | "Based on the above sample we can see that the blocking seems to be reasonable.\n", 1383 | "\n", 1384 | "#### Substep D: Sampling from the candiate set\n", 1385 | "\n", 1386 | "The goal of this step is to obtain a sample from the candidate set and manually label the sampled candidates; that is, to specify if the candiate pair is a correct match or not." 
1387 | ] 1388 | }, 1389 | { 1390 | "cell_type": "code", 1391 | "execution_count": 24, 1392 | "metadata": { 1393 | "collapsed": false 1394 | }, 1395 | "outputs": [], 1396 | "source": [ 1397 | "# Sampling 500 pairs and writing this sample into a .csv file\n", 1398 | "sampled = C.sample(500, random_state=0)\n", 1399 | "sampled.to_csv('./data/sampled.csv', encoding='utf-8')" 1400 | ] 1401 | }, 1402 | { 1403 | "cell_type": "markdown", 1404 | "metadata": {}, 1405 | "source": [ 1406 | "In order to label the sampled data, we can create a new column in the _.csv_ file (which we call **label**) and put value 1 under that column if the pair is a correct match and 0 otherwise. To avoid overwriting the files, let's rename the new file as **labeled.csv**." 1407 | ] 1408 | }, 1409 | { 1410 | "cell_type": "code", 1411 | "execution_count": 25, 1412 | "metadata": { 1413 | "collapsed": false 1414 | }, 1415 | "outputs": [ 1416 | { 1417 | "name": "stderr", 1418 | "output_type": "stream", 1419 | "text": [ 1420 | "Metadata file is not present in the given path; proceeding to read the csv file.\n" 1421 | ] 1422 | }, 1423 | { 1424 | "data": { 1425 | "text/html": [ 1426 | "
\n", 1427 | "\n", 1428 | " \n", 1429 | " \n", 1430 | " \n", 1431 | " \n", 1432 | " \n", 1433 | " \n", 1434 | " \n", 1435 | " \n", 1436 | " \n", 1437 | " \n", 1438 | " \n", 1439 | " \n", 1440 | " \n", 1441 | " \n", 1442 | " \n", 1443 | " \n", 1444 | " \n", 1445 | " \n", 1446 | " \n", 1447 | " \n", 1448 | " \n", 1449 | " \n", 1450 | " \n", 1451 | " \n", 1452 | " \n", 1453 | " \n", 1454 | " \n", 1455 | " \n", 1456 | " \n", 1457 | " \n", 1458 | " \n", 1459 | " \n", 1460 | " \n", 1461 | " \n", 1462 | " \n", 1463 | " \n", 1464 | " \n", 1465 | " \n", 1466 | " \n", 1467 | " \n", 1468 | " \n", 1469 | " \n", 1470 | " \n", 1471 | " \n", 1472 | " \n", 1473 | " \n", 1474 | " \n", 1475 | " \n", 1476 | " \n", 1477 | " \n", 1478 | " \n", 1479 | " \n", 1480 | " \n", 1481 | " \n", 1482 | " \n", 1483 | " \n", 1484 | " \n", 1485 | " \n", 1486 | " \n", 1487 | " \n", 1488 | " \n", 1489 | " \n", 1490 | " \n", 1491 | " \n", 1492 | " \n", 1493 | " \n", 1494 | " \n", 1495 | " \n", 1496 | " \n", 1497 | " \n", 1498 | " \n", 1499 | " \n", 1500 | " \n", 1501 | " \n", 1502 | " \n", 1503 | " \n", 1504 | " \n", 1505 | " \n", 1506 | " \n", 1507 | " \n", 1508 | " \n", 1509 | " \n", 1510 | " \n", 1511 | " \n", 1512 | " \n", 1513 | " \n", 1514 | " \n", 1515 | " \n", 1516 | " \n", 1517 | " \n", 1518 | " \n", 1519 | " \n", 1520 | " \n", 1521 | " \n", 1522 | " \n", 1523 | " \n", 1524 | " \n", 1525 | " \n", 1526 | " \n", 1527 | " \n", 1528 | " \n", 1529 | " \n", 1530 | " \n", 1531 | " \n", 1532 | " \n", 1533 | " \n", 1534 | " \n", 1535 | " \n", 1536 | " \n", 1537 | " \n", 1538 | " \n", 1539 | " \n", 1540 | " \n", 1541 | " \n", 1542 | " \n", 1543 | " \n", 1544 | " \n", 1545 | " \n", 1546 | "
Unnamed: 0_idl_idr_idl_norm_movie_titlel_norm_title_yearl_durationl_budgetl_content_ratingr_norm_titler_norm_yearr_lengthr_budgetr_mpaa_sim_scorelabel
0477147712639235925eye of the beholder1999109.015000000Reye of the beholder1999109.035000000R0.8333331
111478114782001600301rocky balboa2006139.024000000PGrocky balboa2006139.024000000PG1.0000001
213630136304160691766from russia with love1963115.02000000Approvedthe aeolians: from russia with love2012NaN20000NaN0.6666670
3197219721248101029sex tape201494.040000000Rblended2014117.040000000PG-130.6666670
41590315903722758133the scorch trials2015132.061000000PG-13the scorch trials2015132.061000000PG-131.0000001
\n", 1547 | "
" 1548 | ], 1549 | "text/plain": [ 1550 | " Unnamed: 0 _id l_id r_id l_norm_movie_title l_norm_title_year \\\n", 1551 | "0 4771 4771 2639 235925 eye of the beholder 1999 \n", 1552 | "1 11478 11478 2001 600301 rocky balboa 2006 \n", 1553 | "2 13630 13630 4160 691766 from russia with love 1963 \n", 1554 | "3 1972 1972 1248 101029 sex tape 2014 \n", 1555 | "4 15903 15903 722 758133 the scorch trials 2015 \n", 1556 | "\n", 1557 | " l_duration l_budget l_content_rating r_norm_title \\\n", 1558 | "0 109.0 15000000 R eye of the beholder \n", 1559 | "1 139.0 24000000 PG rocky balboa \n", 1560 | "2 115.0 2000000 Approved the aeolians: from russia with love \n", 1561 | "3 94.0 40000000 R blended \n", 1562 | "4 132.0 61000000 PG-13 the scorch trials \n", 1563 | "\n", 1564 | " r_norm_year r_length r_budget r_mpaa _sim_score label \n", 1565 | "0 1999 109.0 35000000 R 0.833333 1 \n", 1566 | "1 2006 139.0 24000000 PG 1.000000 1 \n", 1567 | "2 2012 NaN 20000 NaN 0.666667 0 \n", 1568 | "3 2014 117.0 40000000 PG-13 0.666667 0 \n", 1569 | "4 2015 132.0 61000000 PG-13 1.000000 1 " 1570 | ] 1571 | }, 1572 | "execution_count": 25, 1573 | "metadata": {}, 1574 | "output_type": "execute_result" 1575 | } 1576 | ], 1577 | "source": [ 1578 | "# If you would like to avoid labeling the pairs for now, you can download the labled.csv file from\n", 1579 | "# BigGorilla using the following command (if you prefer to do it yourself, commend the next line)\n", 1580 | "response = urllib.request.urlretrieve('https://anaconda.org/BigGorilla/datasets/1/download/labeled.csv',\n", 1581 | " './data/labeled.csv')\n", 1582 | "labeled = em.read_csv_metadata('data/labeled.csv', ltable=kaggle_data, rtable=imdb_data,\n", 1583 | " fk_ltable='l_id', fk_rtable='r_id', key='_id')\n", 1584 | "labeled.head()" 1585 | ] 1586 | }, 1587 | { 1588 | "cell_type": "markdown", 1589 | "metadata": {}, 1590 | "source": [ 1591 | "#### Substep E: Traning machine learning algorithms\n", 1592 | "\n", 1593 | "Now we can use the sampled 
dataset to train various machine learning algorithms for our prediction task. To do so, we need to split our dataset into a training and a test set, and then select the desired machine learning techniques for our prediction task." 1594 | ] 1595 | }, 1596 | { 1597 | "cell_type": "code", 1598 | "execution_count": 26, 1599 | "metadata": { 1600 | "collapsed": true 1601 | }, 1602 | "outputs": [], 1603 | "source": [ 1604 | "split = em.split_train_test(labeled, train_proportion=0.5, random_state=0)\n", 1605 | "train_data = split['train']\n", 1606 | "test_data = split['test']\n", 1607 | "\n", 1608 | "dt = em.DTMatcher(name='DecisionTree', random_state=0)\n", 1609 | "svm = em.SVMMatcher(name='SVM', random_state=0)\n", 1610 | "rf = em.RFMatcher(name='RF', random_state=0)\n", 1611 | "lg = em.LogRegMatcher(name='LogReg', random_state=0)\n", 1612 | "ln = em.LinRegMatcher(name='LinReg')\n", 1613 | "nb = em.NBMatcher(name='NaiveBayes')" 1614 | ] 1615 | }, 1616 | { 1617 | "cell_type": "markdown", 1618 | "metadata": {}, 1619 | "source": [ 1620 | "Before we can apply any machine learning technique, we need to extract a set of features. Fortunately, the **py_entitymatching** package can automatically extract a set of features once we specify which columns in the two datasets correspond to each other. The following code snippet starts by specifying the correspondence between the column of the two datasets. Then, it uses the **py_entitymatching** package to determine the type of each column. By considering the types of columns in each dataset (stored in variables *l_attr_types* and *r_attr_types*), and using the tokenizers and similarity functions suggested by the package, we can extract a set of instructions for extracting features. Note that variable **F** is not the set of extracted features, rather it encodes the instructions for computing the features." 
1621 | ] 1622 | }, 1623 | { 1624 | "cell_type": "code", 1625 | "execution_count": 27, 1626 | "metadata": { 1627 | "collapsed": true 1628 | }, 1629 | "outputs": [], 1630 | "source": [ 1631 | "attr_corres = em.get_attr_corres(kaggle_data, imdb_data)\n", 1632 | "attr_corres['corres'] = [('norm_movie_title', 'norm_title'), \n", 1633 | " ('norm_title_year', 'norm_year'),\n", 1634 | " ('content_rating', 'content_rating'),\n", 1635 | " ('budget', 'budget'),\n", 1636 | "]\n", 1637 | "\n", 1638 | "l_attr_types = em.get_attr_types(kaggle_data)\n", 1639 | "r_attr_types = em.get_attr_types(imdb_data)\n", 1640 | "\n", 1641 | "tok = em.get_tokenizers_for_matching()\n", 1642 | "sim = em.get_sim_funs_for_matching()\n", 1643 | "\n", 1644 | "F = em.get_features(kaggle_data, imdb_data, l_attr_types, r_attr_types, attr_corres, tok, sim)" 1645 | ] 1646 | }, 1647 | { 1648 | "cell_type": "markdown", 1649 | "metadata": {}, 1650 | "source": [ 1651 | "Given the set of desired features **F**, we can now calculate the feature values for our training data and also impute the missing values in our data. In this case, we choose to replace the missing values with the mean of the column." 1652 | ] 1653 | }, 1654 | { 1655 | "cell_type": "code", 1656 | "execution_count": 28, 1657 | "metadata": { 1658 | "collapsed": true 1659 | }, 1660 | "outputs": [], 1661 | "source": [ 1662 | "train_features = em.extract_feature_vecs(train_data, feature_table=F, attrs_after='label', show_progress=False) \n", 1663 | "train_features = em.impute_table(train_features, exclude_attrs=['_id', 'l_id', 'r_id', 'label'], strategy='mean')" 1664 | ] 1665 | }, 1666 | { 1667 | "cell_type": "markdown", 1668 | "metadata": {}, 1669 | "source": [ 1670 | "Using the calculated features, we can evaluate the performance of different machine learning algorithms and select the best one for our matching task." 
1671 | ] 1672 | }, 1673 | { 1674 | "cell_type": "code", 1675 | "execution_count": 29, 1676 | "metadata": { 1677 | "collapsed": false 1678 | }, 1679 | "outputs": [ 1680 | { 1681 | "data": { 1682 | "text/html": [ 1683 | "
\n", 1684 | "\n", 1685 | " \n", 1686 | " \n", 1687 | " \n", 1688 | " \n", 1689 | " \n", 1690 | " \n", 1691 | " \n", 1692 | " \n", 1693 | " \n", 1694 | " \n", 1695 | " \n", 1696 | " \n", 1697 | " \n", 1698 | " \n", 1699 | " \n", 1700 | " \n", 1701 | " \n", 1702 | " \n", 1703 | " \n", 1704 | " \n", 1705 | " \n", 1706 | " \n", 1707 | " \n", 1708 | " \n", 1709 | " \n", 1710 | " \n", 1711 | " \n", 1712 | " \n", 1713 | " \n", 1714 | " \n", 1715 | " \n", 1716 | " \n", 1717 | " \n", 1718 | " \n", 1719 | " \n", 1720 | " \n", 1721 | " \n", 1722 | " \n", 1723 | " \n", 1724 | " \n", 1725 | " \n", 1726 | " \n", 1727 | " \n", 1728 | " \n", 1729 | " \n", 1730 | " \n", 1731 | " \n", 1732 | " \n", 1733 | " \n", 1734 | " \n", 1735 | " \n", 1736 | " \n", 1737 | " \n", 1738 | " \n", 1739 | " \n", 1740 | " \n", 1741 | " \n", 1742 | " \n", 1743 | " \n", 1744 | " \n", 1745 | " \n", 1746 | " \n", 1747 | " \n", 1748 | " \n", 1749 | " \n", 1750 | " \n", 1751 | " \n", 1752 | " \n", 1753 | " \n", 1754 | " \n", 1755 | " \n", 1756 | " \n", 1757 | " \n", 1758 | " \n", 1759 | " \n", 1760 | " \n", 1761 | " \n", 1762 | " \n", 1763 | " \n", 1764 | " \n", 1765 | " \n", 1766 | " \n", 1767 | " \n", 1768 | " \n", 1769 | " \n", 1770 | " \n", 1771 | " \n", 1772 | " \n", 1773 | "
NameMatcherNum foldsFold 1Fold 2Fold 3Fold 4Fold 5Mean score
0DecisionTree<py_entitymatching.matcher.dtmatcher.DTMatcher object at 0x160ef1e80>51.0000000.9677421.01.0000001.0000.993548
1RF<py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x160ef1240>51.0000000.9677421.01.0000001.0000.993548
2SVM<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x160ef1f60>50.9565220.9677421.01.0000000.8750.959853
3LinReg<py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x160ef17f0>51.0000000.9677421.01.0000001.0000.993548
4LogReg<py_entitymatching.matcher.logregmatcher.LogRegMatcher object at 0x160ef1438>51.0000000.9677421.00.9565221.0000.984853
5NaiveBayes<py_entitymatching.matcher.nbmatcher.NBMatcher object at 0x160ef1d30>51.0000000.9677421.01.0000001.0000.993548
\n", 1774 | "
" 1775 | ], 1776 | "text/plain": [ 1777 | " Name \\\n", 1778 | "0 DecisionTree \n", 1779 | "1 RF \n", 1780 | "2 SVM \n", 1781 | "3 LinReg \n", 1782 | "4 LogReg \n", 1783 | "5 NaiveBayes \n", 1784 | "\n", 1785 | " Matcher \\\n", 1786 | "0 \n", 1787 | "1 \n", 1788 | "2 \n", 1789 | "3 \n", 1790 | "4 \n", 1791 | "5 \n", 1792 | "\n", 1793 | " Num folds Fold 1 Fold 2 Fold 3 Fold 4 Fold 5 Mean score \n", 1794 | "0 5 1.000000 0.967742 1.0 1.000000 1.000 0.993548 \n", 1795 | "1 5 1.000000 0.967742 1.0 1.000000 1.000 0.993548 \n", 1796 | "2 5 0.956522 0.967742 1.0 1.000000 0.875 0.959853 \n", 1797 | "3 5 1.000000 0.967742 1.0 1.000000 1.000 0.993548 \n", 1798 | "4 5 1.000000 0.967742 1.0 0.956522 1.000 0.984853 \n", 1799 | "5 5 1.000000 0.967742 1.0 1.000000 1.000 0.993548 " 1800 | ] 1801 | }, 1802 | "execution_count": 29, 1803 | "metadata": {}, 1804 | "output_type": "execute_result" 1805 | } 1806 | ], 1807 | "source": [ 1808 | "result = em.select_matcher([dt, rf, svm, ln, lg, nb], table=train_features, \n", 1809 | " exclude_attrs=['_id', 'l_id', 'r_id', 'label'], k=5,\n", 1810 | " target_attr='label', metric='f1', random_state=0)\n", 1811 | "result['cv_stats']" 1812 | ] 1813 | }, 1814 | { 1815 | "cell_type": "markdown", 1816 | "metadata": {}, 1817 | "source": [ 1818 | "We can observe, based on the reported accuracy of different techniques, that the \"random forest (RF)\" algorithm achieves the best performance. Thus, it is best to use this technique for the matching." 1819 | ] 1820 | }, 1821 | { 1822 | "cell_type": "markdown", 1823 | "metadata": {}, 1824 | "source": [ 1825 | "#### Substep F: Evaluating the quality of our matching\n", 1826 | "\n", 1827 | "It is important to evaluate the quality of our matching. We can now use the training set for this purpose and measure how well the random forest predicts the matches. We can see that we are obtaining a high accuracy and recall on the test set as well."
1828 | ] 1829 | }, 1830 | { 1831 | "cell_type": "code", 1832 | "execution_count": null, 1833 | "metadata": { 1834 | "collapsed": false 1835 | }, 1836 | "outputs": [ 1837 | { 1838 | "name": "stdout", 1839 | "output_type": "stream", 1840 | "text": [ 1841 | "Precision : 94.44% (51/54)\n", 1842 | "Recall : 100.0% (51/51)\n", 1843 | "F1 : 97.14%\n", 1844 | "False positives : 3 (out of 54 positive predictions)\n", 1845 | "False negatives : 0 (out of 196 negative predictions)\n" 1846 | ] 1847 | } 1848 | ], 1849 | "source": [ 1850 | "best_model = result['selected_matcher']\n", 1851 | "best_model.fit(table=train_features, exclude_attrs=['_id', 'l_id', 'r_id', 'label'], target_attr='label')\n", 1852 | "\n", 1853 | "test_features = em.extract_feature_vecs(test_data, feature_table=F, attrs_after='label', show_progress=False)\n", 1854 | "test_features = em.impute_table(test_features, exclude_attrs=['_id', 'l_id', 'r_id', 'label'], strategy='mean')\n", 1855 | "\n", 1856 | "# Predict on the test data\n", 1857 | "predictions = best_model.predict(table=test_features, exclude_attrs=['_id', 'l_id', 'r_id', 'label'], \n", 1858 | " append=True, target_attr='predicted', inplace=False)\n", 1859 | "\n", 1860 | "# Evaluate the predictions\n", 1861 | "eval_result = em.eval_matches(predictions, 'label', 'predicted')\n", 1862 | "em.print_eval_summary(eval_result)" 1863 | ] 1864 | }, 1865 | { 1866 | "cell_type": "markdown", 1867 | "metadata": {}, 1868 | "source": [ 1869 | "#### Substep G: Using the trained model to match the datasets\n", 1870 | "\n", 1871 | "Now, we can use the trained model to match the two tables as follows:" 1872 | ] 1873 | }, 1874 | { 1875 | "cell_type": "code", 1876 | "execution_count": null, 1877 | "metadata": { 1878 | "collapsed": false 1879 | }, 1880 | "outputs": [ 1881 | { 1882 | "name": "stderr", 1883 | "output_type": "stream", 1884 | "text": [ 1885 | "0% 100%\n", 1886 | "[ ]" 1887 | ] 1888 | } 1889 | ], 1890 | "source": [ 1891 | "candset_features = 
em.extract_feature_vecs(C, feature_table=F, show_progress=True)\n", 1892 | "candset_features = em.impute_table(candset_features, exclude_attrs=['_id', 'l_id', 'r_id'], strategy='mean')\n", 1893 | "predictions = best_model.predict(table=candset_features, exclude_attrs=['_id', 'l_id', 'r_id'],\n", 1894 | " append=True, target_attr='predicted', inplace=False)\n", 1895 | "matches = predictions[predictions.predicted == 1] " 1896 | ] 1897 | }, 1898 | { 1899 | "cell_type": "markdown", 1900 | "metadata": {}, 1901 | "source": [ 1902 | "Note that the **matches** dataframe contains many columns storing the extracted features for both datasets. The following code snippet removes all the unnecessary columns and creates a nicely formatted dataframe that has the resulting integrated dataset." 1903 | ] 1904 | }, 1905 | { 1906 | "cell_type": "code", 1907 | "execution_count": null, 1908 | "metadata": { 1909 | "collapsed": false, 1910 | "scrolled": true 1911 | }, 1912 | "outputs": [], 1913 | "source": [ 1914 | "from py_entitymatching.catalog import catalog_manager as cm\n", 1915 | "matches = matches[['_id', 'l_id', 'r_id', 'predicted']]\n", 1916 | "matches.reset_index(drop=True, inplace=True)\n", 1917 | "cm.set_candset_properties(matches, '_id', 'l_id', 'r_id', kaggle_data, imdb_data)\n", 1918 | "matches = em.add_output_attributes(matches, l_output_attrs=['norm_movie_title', 'norm_title_year', 'budget', 'content_rating'],\n", 1919 | " r_output_attrs=['norm_title', 'norm_year', 'budget', 'content_rating'],\n", 1920 | " l_output_prefix='l_', r_output_prefix='r_',\n", 1921 | " delete_from_catalog=False)\n", 1922 | "matches.drop('predicted', axis=1, inplace=True)\n", 1923 | "matches.head()" 1924 | ] 1925 | } 1926 | ], 1927 | "metadata": { 1928 | "anaconda-cloud": {}, 1929 | "kernelspec": { 1930 | "display_name": "Python [conda env:py3k]", 1931 | "language": "python", 1932 | "name": "conda-env-py3k-py" 1933 | }, 1934 | "language_info": { 1935 | "codemirror_mode": { 1936 | "name": "ipython", 
1937 | "version": 3 1938 | }, 1939 | "file_extension": ".py", 1940 | "mimetype": "text/x-python", 1941 | "name": "python", 1942 | "nbconvert_exporter": "python", 1943 | "pygments_lexer": "ipython3", 1944 | "version": "3.5.2" 1945 | } 1946 | }, 1947 | "nbformat": 4, 1948 | "nbformat_minor": 1 1949 | } 1950 | --------------------------------------------------------------------------------