├── Licence.md ├── README.md ├── wikipedia-link-analysis-mapper.py ├── wikipedia-link-analysis-reducer.py ├── wordcount-mapper.py └── wordcount-reducer.py /Licence.md: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Hardik Vasa 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Hadoop Mapreduce Examples in Python 2 | Couple of the Mapreduce examples in python and a documentation on running them! 3 | 4 | ## Steps of running the codes 5 | 6 | **Folder Structure** 7 | 8 | The files are assumed to be stored in the given locations in the Linux OS. This is just an example illustration and in real the location does not matter. 9 | 10 | * Hadoop installed in: /usr/local 11 | * words.txt (sample word file on which the mapreduce jobs are run): /usr/local 12 | * mapper.py (mapper file) and reducer.py (reducer file): /usr/local 13 | * words.txt in hdfs: /wordcount 14 | 15 | 16 | **Creating Files** 17 | 18 | `touch words.txt` 19 | 20 | 21 | **Making Directory in hdfs** 22 | 23 | `hadoop fs -mkdir -p /wordcount` 24 | 25 | 26 | 27 | **Copying test file from local directory to hdfs** 28 | 29 | `hadoop fs -copyFromLocal /usr/local/words.txt /wordcount` 30 | 31 | 32 | 33 | **Check for file listing on hdfs:** 34 | 35 | `hadoop fs -ls /wordcount` 36 | 37 | 38 | **Running the mapreduce job** 39 | 40 | `/usr/local/hadoop/bin/hadoop jar /usr/local/hadoop/share/hadoop/tools/lib/hadoop-streaming-2.6.0.jar -file /usr/local/mapper.py -mapper mapper.py -file /usr/local/reducer.py -reducer reducer.py -input /wordcount/words.txt -output /wordcount/output` 41 | 42 | 43 | **Print the output** 44 | 45 | `hadoop fs -cat /wordcount/output/part-00000` 46 | 47 | 48 | **Remove the output folder from hdfs** 49 | 50 | `hadoop dfs -rmr hdfs:///wordcount/output` 51 | 52 | 53 | **User friendly list of files and sizes in a directory** 54 | 55 | `ls -lh` 56 | 57 | 58 | **Giving full permissions to a folder if required** 59 | 60 | `chmod 777 -R /usr/local/hadoop_store` 61 | -------------------------------------------------------------------------------- /wikipedia-link-analysis-mapper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | 4 | #Finding 'Next Link' on a given web page 5 | def get_next_link(s): 6 | start_link = s.find("href=") 7 | if start_link == -1: #If no links are found then give an error! 8 | end_quote = 0 9 | link = "no_links" 10 | return link, end_quote 11 | else: 12 | start_quote = s.find('"', start_link) 13 | end_quote = s.find('"',start_quote+1) 14 | link = str(s[start_quote+1:end_quote]) 15 | return link, end_quote 16 | 17 | #Getting all links with the help of 'get_next_links' 18 | def get_all_links(page): 19 | links = [] 20 | while True: 21 | link, end_link = get_next_link(page) 22 | if link == "no_links": 23 | break 24 | else: 25 | links.append(link) #Append all the links in the list named 'Links' 26 | #time.sleep(0.1) 27 | page = page[end_link:] 28 | return links 29 | 30 | ##Main Program 31 | for line in sys.stdin: 32 | line = line.strip() #remove white spaces 33 | 34 | if 'wgArticleId' in line: 35 | key = 'Articles' 36 | value = 1 37 | print( "%s\t%d" % (key, value) ) 38 | else: 39 | links = get_all_links(line) 40 | for j in links: 41 | if 'href=' in line: 42 | s = line.find('href') 43 | if '.jpg' in line or '.png' in line or '.svg' in line or '.gif' in line or '.jpeg' in line or '.tiff' in line or '.xcf' in line: 44 | key = 'Image Links' 45 | value = 1 46 | print( "%s\t%d" % (key, value) ) 47 | elif 'en.wikipedia.org' in line or '/w/' in line: 48 | key = 'Internal but Irrelevant' 49 | value = 1 50 | print( "%s\t%d" % (key, value) ) 51 | elif '.wikipedia.org' in line: 52 | key = 'Non-English Wikipedia Link' 53 | value = 1 54 | print( "%s\t%d" % (key, value) ) 55 | elif 'wikimedia.org' in line or 'wikimediafoundation.org' in line: 56 | key = 'Organizational Link' 57 | value = 1 58 | print( "%s\t%d" % (key, value) ) 59 | elif '/wiki/' in line[s+6:s+15]: 60 | key = 'Internal Link' 61 | value = 1 62 | print( "%s\t%d" % (key, value) ) 63 | else: 64 | key = 'External Link' 65 | value = 1 66 | print( "%s\t%d" % (key, value) ) 67 | else: 68 | pass 69 | -------------------------------------------------------------------------------- /wikipedia-link-analysis-reducer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from operator import itemgetter 4 | import sys 5 | 6 | current_word = None 7 | current_count = 0 8 | word = None 9 | 10 | 11 | for line in sys.stdin: # input comes from STDIN 12 | line = line.strip() # remove leading and trailing whitespace 13 | 14 | word, count = line.split('\t', 1) # parse the input we got from mapper.py by a tab (space) 15 | 16 | try: 17 | count = int(count) # convert count from string to int 18 | except ValueError: 19 | continue #If the count is not a number then discard the line by doing nothing 20 | 21 | 22 | if current_word == word: #comparing the current word with the previous word (since they are ordered by key (word)) 23 | current_count += count 24 | else: 25 | if current_word: 26 | # write result to STDOUT 27 | print '%s\t%s' % (current_word, current_count) 28 | current_count = count 29 | current_word = word 30 | 31 | if current_word == word: # do not forget to output the last word if needed! 32 | print '%s\t%s' % (current_word, current_count) 33 | -------------------------------------------------------------------------------- /wordcount-mapper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | 5 | for line in sys.stdin: # Input is read from STDIN and the output of this file is written into STDOUT 6 | line = line.strip() # remove leading and trailing whitespace 7 | words = line.split() # split the line into words 8 | 9 | for word in words: 10 | print '%s\t%s' % (word, 1) #Print all words (key) individually with the value 1 11 | -------------------------------------------------------------------------------- /wordcount-reducer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from operator import itemgetter 4 | import sys 5 | 6 | current_word = None 7 | current_count = 0 8 | word = None 9 | 10 | 11 | for line in sys.stdin: # input comes from STDIN 12 | line = line.strip() # remove leading and trailing whitespace 13 | 14 | word, count = line.split('\t', 1) # parse the input we got from mapper.py by a tab (space) 15 | 16 | try: 17 | count = int(count) # convert count from string to int 18 | except ValueError: 19 | continue #If the count is not a number then discard the line by doing nothing 20 | 21 | 22 | if current_word == word: #comparing the current word with the previous word (since they are ordered by key (word)) 23 | current_count += count 24 | else: 25 | if current_word: 26 | # write result to STDOUT 27 | print '%s\t%s' % (current_word, current_count) 28 | current_count = count 29 | current_word = word 30 | 31 | if current_word == word: # do not forget to output the last word if needed! 32 | print '%s\t%s' % (current_word, current_count) 33 | --------------------------------------------------------------------------------