├── .gitattributes ├── Jaccard Index.py ├── Latent Semantic Analysis.py ├── README.md ├── REPORT.pdf ├── REPORT2b.pdf ├── REPORT2c.pdf ├── cosine similarity.py ├── dm2c.py ├── msc-plagiarism-assigment ├── ass1-1019.txt ├── ass1-1037.txt ├── ass1-1046.txt ├── ass1-1138.txt ├── ass1-1147.txt ├── ass1-202.txt ├── ass1-211.txt ├── ass1-321.txt ├── ass1-440.txt ├── ass1-505.txt ├── ass1-532.txt ├── ass1-541.txt ├── ass1-606.txt ├── ass1-743.txt ├── ass1-817.txt ├── ass1-826.txt ├── ass1-909.txt ├── ass1_1349.txt ├── ass1_422.txt ├── ass1_734.txt ├── ass1_808.txt └── ass1_936.txt ├── plots ├── Figure_1.png ├── cosine.png ├── hist.png └── kmeans.png └── to_test ├── ass1-1019.txt ├── ass1-1037.txt ├── ass1-1046.txt ├── ass1-1138.txt ├── ass1-1147.txt ├── ass1-202.txt ├── ass1-211.txt ├── ass1-321.txt ├── ass1-440.txt ├── ass1-505.txt ├── ass1-532.txt ├── ass1-541.txt ├── ass1-606.txt ├── ass1-743.txt ├── ass1-817.txt ├── ass1-826.txt ├── ass1-909.txt ├── ass1_1349.txt ├── ass1_422.txt ├── ass1_734.txt ├── ass1_808.txt ├── ass1_936.txt ├── code.py ├── codee.py └── infoRet.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /Jaccard Index.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import sys 3 | from nltk.corpus import stopwords 4 | from nltk.stem import PorterStemmer 5 | import glob 6 | import re 7 | import numpy as np 8 | import matplotlib.pyplot as plt 9 | from scipy.spatial.distance import pdist 10 | from scipy.cluster.hierarchy import linkage, dendrogram 11 | from sklearn.cluster import AgglomerativeClustering 12 | 13 | path = './msc-plagiarism-assigment/*.txt' 14 | files=glob.glob(path) 15 | n = len(files) 16 | i=0 17 | fname = [None]*n 18 | raw = [None]*n 19 | ftext = [None]*n 20 | i = 0 21 | for f in 
files: 22 | try: 23 | fname[i] = open(f,"r") 24 | raw[i] = fname[i].read() 25 | except UnicodeDecodeError as e: 26 | fname[i] = open(f,"r",encoding="utf8") 27 | raw[i] = fname[i].read() 28 | ftext[i] = re.sub("[^A-Za-z]", " ", raw[i]) 29 | i = i + 1 30 | 31 | tokens = [None]*n 32 | swr = [None]*n 33 | stopwords = stopwords.words("english") 34 | add = ['search','engine','web','internet'] 35 | stopwords.extend(add) 36 | st = [None]*n 37 | for i in range(0,n): 38 | tokens[i] = nltk.word_tokenize(ftext[i]) 39 | swr[i] = [w for w in tokens[i] if not w.lower() in stopwords] 40 | ps = PorterStemmer() 41 | st[i]= [] 42 | for ws in swr[i]: 43 | st[i].append(ps.stem(ws)) 44 | 45 | def jaccard_similarity(query, document): 46 | intersection = set(query).intersection(set(document)) 47 | union = set(query).union(set(document)) 48 | return len(intersection)/len(union) 49 | 50 | stem_words = [None]*n 51 | for i in range(0,n): 52 | stem_words[i] = set(st[i]) 53 | 54 | jam = np.zeros((n,n),dtype='double') 55 | min=1 56 | max=0 57 | for i in range (0,n): 58 | for j in range(i,n): 59 | jam[i][j] = jaccard_similarity(st[i], st[j]) 60 | jam[j][i] = jam[i][j] 61 | 62 | np.set_printoptions(precision=3) 63 | print(jam) 64 | 65 | #DENDROGRAM 66 | 67 | plt.figure(figsize=(10, 7)) 68 | plt.title("hist") 69 | 70 | distanceMatrix = 1-jam 71 | print(distanceMatrix) 72 | dend = dendrogram(linkage(1-jam,method='complete'), 5, 73 | color_threshold=10, 74 | leaf_font_size=10) 75 | plt.show() 76 | 77 | cluster = AgglomerativeClustering(n_clusters=5, affinity='precomputed', linkage='complete') 78 | cluster.fit_predict(1-jam.T) 79 | -------------------------------------------------------------------------------- /Latent Semantic Analysis.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import string 3 | import os 4 | from nltk.corpus import stopwords 5 | from sklearn.feature_extraction.text import TfidfVectorizer 6 | from sklearn.preprocessing 
import Normalizer 7 | from nltk.stem.porter import PorterStemmer 8 | import re 9 | import numpy as np 10 | from sklearn.metrics.pairwise import cosine_similarity 11 | from matplotlib import pyplot as plt 12 | from sklearn.decomposition import TruncatedSVD as SVD 13 | 14 | 15 | path = './msc-plagiarism-assigment' 16 | token_dict = {} 17 | 18 | 19 | def tokenize(text): 20 | tokens = nltk.word_tokenize(text) 21 | stems = [] 22 | for item in tokens: 23 | stems.append(PorterStemmer().stem(item)) 24 | return stems 25 | 26 | for dirpath, dirs, files in os.walk(path): 27 | for f in files: 28 | fname = os.path.join(dirpath, f) 29 | print ("fname=", fname) 30 | try: 31 | with open(fname) as pearl: 32 | text = pearl.read() 33 | token_dict[f] = re.sub("[^A-Za-z]", " ", text) 34 | except UnicodeDecodeError as e: 35 | with open(fname,encoding="utf8") as pearl: 36 | text = pearl.read() 37 | token_dict[f] = re.sub("[^A-Za-z]", " ", text) 38 | 39 | 40 | stopwords = stopwords.words("english") 41 | add = ['search','engine','web','internet'] 42 | stopwords.extend(add) 43 | tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words=stopwords) 44 | tfs = tfidf.fit_transform(token_dict.values()) 45 | 46 | lsa = SVD(n_components = 4, n_iter =100) 47 | doc_top=lsa.fit_transform(tfs) 48 | doc_top=Normalizer(copy=False).fit_transform(doc_top) 49 | terms = tfidf.get_feature_names() 50 | for i, comp in enumerate(lsa.components_): 51 | termsInComp = zip(terms,comp) 52 | sortedTerms = sorted(termsInComp, key=lambda x:x[1], reverse=True) [:5] 53 | print ("Topic %d:" %i) 54 | for term in sortedTerms: 55 | print (term[0]) 56 | print (" ") 57 | 58 | 59 | ##import umap 60 | ##X_topics = lsa.fit_transform(tfs) 61 | ##embedding = umap.UMAP(n_neighbors=150, min_dist=0.5, random_state=12).fit_transform(X_topics) 62 | ## 63 | ##plt.figure(figsize=(7,5)) 64 | ##plt.scatter(embedding[:, 0], embedding[:, 1], 65 | ##c = tfidf.get_feature_names(), 66 | ##s = 10, # size 67 | ##edgecolor='none' 68 | ##) 69 | 
##plt.show() 70 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Text-Mining 2 | 3 | The data is a collection of documents (text/pdf files) contained in the "msc-plagiarism-assigment" folder. 4 | 5 | ### The assignment is divided into 3 parts: 6 | a) Normalize the text and create a similarity matrix using the Jaccard index. 7 | Apply hierarchical 8 | clustering. Cut the dendrogram at k and identify clusters of similar documents. 9 | 10 | b) Create the TF-IDF matrix of the collection. 11 | Using Cosine distance, create a similarity matrix. 12 | Cluster the documents using K-means clustering, and find the number of clusters (k) that minimizes SSE. 13 | Apply hierarchical clustering. Cut the dendrogram at k and identify clusters of similar documents. 14 | 15 | c) Perform LSA using a reduced latent space with 4 dimensions. 16 | For each topic, identify the set of 5 top-weighted terms. 17 | Find the similarity matrix for the documents in the reduced space. 18 | Apply hierarchical clustering. Cut the dendrogram at k and identify clusters of similar documents. 
19 | 20 | -------------------------------------------------------------------------------- /REPORT.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShreshthSaxena/Text-Mining/b2e4b17d258a72702a147e173f9676d0d9e294c1/REPORT.pdf -------------------------------------------------------------------------------- /REPORT2b.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShreshthSaxena/Text-Mining/b2e4b17d258a72702a147e173f9676d0d9e294c1/REPORT2b.pdf -------------------------------------------------------------------------------- /REPORT2c.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShreshthSaxena/Text-Mining/b2e4b17d258a72702a147e173f9676d0d9e294c1/REPORT2c.pdf -------------------------------------------------------------------------------- /cosine similarity.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import string 3 | import os 4 | from nltk.corpus import stopwords 5 | from sklearn.feature_extraction.text import TfidfVectorizer 6 | from nltk.stem.porter import PorterStemmer 7 | import re 8 | import numpy as np 9 | from sklearn.metrics.pairwise import cosine_similarity 10 | from matplotlib import pyplot as plt 11 | 12 | path = './msc-plagiarism-assigment' 13 | token_dict = {} 14 | 15 | 16 | def tokenize(text): 17 | tokens = nltk.word_tokenize(text) 18 | stems = [] 19 | for item in tokens: 20 | stems.append(PorterStemmer().stem(item)) 21 | return stems 22 | 23 | for dirpath, dirs, files in os.walk(path): 24 | for f in files: 25 | fname = os.path.join(dirpath, f) 26 | print ("fname=", fname) 27 | try: 28 | with open(fname) as pearl: 29 | text = pearl.read() 30 | token_dict[f] = re.sub("[^A-Za-z]", " ", text) 31 | except UnicodeDecodeError as e: 32 | with open(fname,encoding="utf8") as pearl: 33 | 
text = pearl.read() 34 | token_dict[f] = re.sub("[^A-Za-z]", " ", text) 35 | 36 | 37 | stopwords = stopwords.words("english") 38 | add = ['search','engine','web','internet'] 39 | stopwords.extend(add) 40 | tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words=stopwords) 41 | tfs = tfidf.fit_transform(token_dict.values()) 42 | 43 | cosim = cosine_similarity(tfs,tfs) 44 | print(cosim) 45 | 46 | 47 | #Spectral Clustering 48 | #from sklearn.cluster import SpectralClustering 49 | #sc = SpectralClustering(n_clusters = 2, affinity = 'precomputed') 50 | #sc.fit_predict(cosim[0:3,0:3]) 51 | 52 | #DBScan 53 | #from sklearn.cluster import DBSCAN 54 | #db=DBSCAN(min_samples=1) 55 | #db.fit_predict(cosim) 56 | 57 | #K-Means 58 | 59 | from sklearn.cluster import KMeans 60 | 61 | #sse={} 62 | 63 | eigen_values, eigen_vectors = np.linalg.eigh(cosim) 64 | km = KMeans(n_clusters=5, init='k-means++') 65 | km.fit_predict(eigen_vectors[:, -4:]) 66 | 67 | #for i in range(1,21): 68 | # km = KMeans(n_clusters=i, init='k-means++') 69 | # km.fit_predict(eigen_vectors[:, -4:]) 70 | # sse[i]=km.inertia_ 71 | #x,y=(zip(*sse.items())) 72 | #plt.plot(x,y) 73 | #plt.title("Elbow curve") 74 | #plt.xlabel("Clusters(k)") 75 | #plt.ylabel("SSE") 76 | #plt.show() 77 | 78 | #dense_tfs = tfs.toarray() 79 | #KM = KMeans(n_clusters=i, n_init=50, max_iter=100) 80 | #KM.fit_transform(dense_tfs) 81 | 82 | #HIERARCHICAL 83 | 84 | 85 | from scipy.cluster.hierarchy import dendrogram, linkage,cut_tree 86 | from sklearn.cluster import AgglomerativeClustering 87 | 88 | 89 | plt.figure(figsize=(10, 7)) 90 | plt.title("dendrogram") 91 | distanceMatrix = 1-cosim 92 | Z=linkage(cosim,method='complete') 93 | 94 | dend = dendrogram(Z, 5, orientation = 'top', 95 | color_threshold=10, 96 | leaf_font_size=10, show_leaf_counts=True) 97 | plt.show() 98 | 99 | cluster = AgglomerativeClustering(n_clusters=10, affinity='precomputed', linkage='complete') 100 | cluster.fit_predict(cosim) 101 | #str = 'all great and precious 
things are lonely.' 102 | #response = tfidf.transform([str]) 103 | #print (response) 104 | 105 | #feature_names = tfidf.get_feature_names() 106 | #for col in response.nonzero()[1]: 107 | # print (feature_names[col], ' - ', response[0, col]) 108 | 109 | -------------------------------------------------------------------------------- /dm2c.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import string 3 | import os 4 | from nltk.corpus import stopwords 5 | from sklearn.feature_extraction.text import TfidfVectorizer 6 | from sklearn.preprocessing import Normalizer 7 | from nltk.stem.porter import PorterStemmer 8 | import re 9 | import numpy as np 10 | from sklearn.metrics.pairwise import cosine_similarity 11 | from matplotlib import pyplot as plt 12 | from sklearn.decomposition import TruncatedSVD as SVD 13 | from scipy.cluster.hierarchy import dendrogram, linkage,cut_tree 14 | from sklearn.cluster import AgglomerativeClustering 15 | 16 | 17 | path = './msc-plagiarism-assigment' 18 | token_dict = {} 19 | 20 | 21 | def tokenize(text): 22 | tokens = nltk.word_tokenize(text) 23 | stems = [] 24 | for item in tokens: 25 | stems.append(PorterStemmer().stem(item)) 26 | return stems 27 | 28 | for dirpath, dirs, files in os.walk(path): 29 | for f in files: 30 | fname = os.path.join(dirpath, f) 31 | print ("fname=", fname) 32 | try: 33 | with open(fname) as pearl: 34 | text = pearl.read() 35 | token_dict[f] = re.sub("[^A-Za-z]", " ", text) 36 | except UnicodeDecodeError as e: 37 | with open(fname,encoding="utf8") as pearl: 38 | text = pearl.read() 39 | token_dict[f] = re.sub("[^A-Za-z]", " ", text) 40 | 41 | 42 | stopwords = stopwords.words("english") 43 | add = ['search','engine','web','internet'] 44 | stopwords.extend(add) 45 | tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words=stopwords) 46 | tfs = tfidf.fit_transform(token_dict.values()) 47 | 48 | lsa = SVD(n_components = 4, n_iter =100) 49 | 
doc_top=lsa.fit_transform(tfs) 50 | doc_top=Normalizer(copy=False).fit_transform(doc_top) 51 | terms = tfidf.get_feature_names() 52 | for i, comp in enumerate(lsa.components_): 53 | termsInComp = zip(terms,comp) 54 | sortedTerms = sorted(termsInComp, key=lambda x:x[1], reverse=True) [:5] 55 | print ("Topic %d:" %i) 56 | for term in sortedTerms: 57 | print (term[0]) 58 | print (" ") 59 | 60 | cos = cosine_similarity(doc_top) 61 | 62 | 63 | #Dendrogram 64 | plt.figure(figsize=(10, 7)) 65 | plt.title("dendrogram") 66 | distanceMatrix = 1-cos 67 | Z=linkage(cos,method='complete') 68 | 69 | dend = dendrogram(Z, 5, orientation = 'top', 70 | color_threshold=10, 71 | leaf_font_size=10, show_leaf_counts=True) 72 | plt.show() 73 | 74 | cluster = AgglomerativeClustering(n_clusters=10, affinity='precomputed', linkage='complete') 75 | cluster.fit_predict(cos) 76 | 77 | 78 | 79 | 80 | ##import umap 81 | ##X_topics = lsa.fit_transform(tfs) 82 | ##embedding = umap.UMAP(n_neighbors=150, min_dist=0.5, random_state=12).fit_transform(X_topics) 83 | ## 84 | ##plt.figure(figsize=(7,5)) 85 | ##plt.scatter(embedding[:, 0], embedding[:, 1], 86 | ##c = tfidf.get_feature_names(), 87 | ##s = 10, # size 88 | ##edgecolor='none' 89 | ##) 90 | ##plt.show() 91 | -------------------------------------------------------------------------------- /msc-plagiarism-assigment/ass1-1019.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShreshthSaxena/Text-Mining/b2e4b17d258a72702a147e173f9676d0d9e294c1/msc-plagiarism-assigment/ass1-1019.txt -------------------------------------------------------------------------------- /msc-plagiarism-assigment/ass1-1037.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShreshthSaxena/Text-Mining/b2e4b17d258a72702a147e173f9676d0d9e294c1/msc-plagiarism-assigment/ass1-1037.txt 
-------------------------------------------------------------------------------- /msc-plagiarism-assigment/ass1-1046.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShreshthSaxena/Text-Mining/b2e4b17d258a72702a147e173f9676d0d9e294c1/msc-plagiarism-assigment/ass1-1046.txt -------------------------------------------------------------------------------- /msc-plagiarism-assigment/ass1-1138.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShreshthSaxena/Text-Mining/b2e4b17d258a72702a147e173f9676d0d9e294c1/msc-plagiarism-assigment/ass1-1138.txt -------------------------------------------------------------------------------- /msc-plagiarism-assigment/ass1-1147.txt: -------------------------------------------------------------------------------- 1 | History of Search Engines 2 | 3 | 4 | 5 | 6 | Search Engines are softwares which retrieve information from a specific platform according to the information fed to it. The platform can be a simple local file system on a personal computer, a database inside a corporation, or even something as large as the whole of the World Wide Web. 7 | 8 | 9 | The first recorded search engine was Archie and was created in 1990 by three Computer Science students, Alan Emtage, Bill Heelan and J. Peter Deutsch. It retrieved information from the public FTP servers and created a listing of these data. Following the rise of the Gopher protocol in 1991, two new programs called Veronica (Very Easy Rodent-Oriented Net-Wide Index to Computerized Archives) and Jughead (Jonzy’s Universal Gopher Hierarchy Excavation And Display) were launched. As they were the technical successors of Archie, they were named in reference to the characters from the Archie comic series. 
10 | 11 | 12 | In 1993, Matthew Gray created the World Wide Web Wanderer, a Perl based system which was used to measure the size of the then-known internet. In the same year, Oscar Nierstrasz wrote W3Catalog. It was the very first search engine that retrieved information from websites, but suffered from the drawbacks of speed. Similarly, Aliweb was released by Martijn Koster, which depended on each website admin registering their webpages to facilitate the search functions. 13 | 14 | 15 | Jonathan Fletcher released JumpStation in the same year, which was the first program to use the three key features of a search engine, namely crawling, indexing and searching. 16 | 17 | 18 | In 1994, WebCrawler was launched which could search for any word in any webpage in the internet. This was the basis on which all future search engines would be built upon. 19 | 20 | 21 | In the next few years, the internet saw the rise of search engines and soon enough there were a number of them which offered free search functions. Notable names among them were Magellan, Excite, Infoseek, Inktomi, Northern Light and AltaVista. 22 | 23 | 24 | In 1994, Yahoo! created its web directory, which it would not use until the launch of its own search engine in 2002. Instead, it presented its search functions for other companies to use. 25 | 26 | 27 | In 1996, Steven Yang and Gary Culliss from MIT began working on a popularity engine, which provided ranked results to users according to their past selections. This was the start of what we see now as rank-based search engines where the results are shown to the user, ranked by relevance. In 1997, Ask Jeeves was released which used natural language search. This would later come to be known as Ask.com. 28 | 29 | 30 | In 1998, MSN launches MSN Search, which would later change its name to Bing in 2009. Yahoo! Search came to be powered by Bing technology. 31 | 32 | 33 | In 1997, Google.com came into existence. It would rise to its height in the 2000s. 
Over the years, it would provide many technological innovations to help it become the most popular search engine in recent times. One of them is PageRank, which used the concepts of iterative ranking algorithm as was described in the paper “Anatomy of a Search Engine” by Sergey Brin and Larry Page. It ranks the web pages based on the number of links to and from it and the ranks of those links, with the assumption that good pages are linked to more good ranked pages. This facilitated the ranked search feature and helped migrate more users to its platform. Also, one of its main advantages was its minimalistic interface. 34 | 35 | 36 | Thus, search engines have risen and provided a helpful interface for users to search whatever they need in the internet. Now, more new technologies are getting popularized such as semantic searching which searches according to context. They are also being used to provide advertisements to users. Many companies are analysing search data and providing targeted marketing. All in all, search engines have made navigating the internet so much easier and hassle-free. -------------------------------------------------------------------------------- /msc-plagiarism-assigment/ass1-202.txt: -------------------------------------------------------------------------------- 1 | The History of Search Engine 2 | 3 | The objective of all the search engines is to find and organize scattered data found on the Internet. Before the development of search engines, the Internet was merely a collection of File Transfer Protocol (FTP) sites. At that time the users could only navigate to find specific shared files. But by the addition of more web servers, the Internet grew, and the World Wide Web became the medium for accessing the Internet, and due to the presence of huge amount of data the need for finding and organizing the distributed data files on FTP web servers grew. 
Search engines started coming up to help the users navigate the web and files on the internet. 4 | 5 | A search engine is a software that helps its users, to retrieve any information from the World Wide Web. A user enters key phrases or keywords into a search engine and in result receives a list of Web content which is in the form of websites, videos, images etc. Basically, all the modern search engines have the following four parts. 6 | 7 | 1. Crawling: An automated bot or program scans a website and collects details about every page. 8 | 9 | 2. Indexing: The data collected from crawling is then processed and placed in a database. 10 | 11 | 3. Retrieval: The third step is when a search engine processes the search query and gives the most relevant pages. 12 | 13 | 4. Ranking: The final step involves ranking the most relevant pages for a search query. Search engines use a different ranking algorithm to show pages. 14 | 15 | In 1945 when Vannevar Bush published an article in The Atlantic Monthly, the need for the search engine was started. He emphasized the necessity for an expansive index for all knowledge. He urged scientists to work together to help build a body of knowledge for all mankind. He has also proposed the idea of a fast, reliable, extensible, virtually limitless, associative memory storage and retrieval system. He named this device a memex. 16 | 17 | Archie: It was the first search engine which started in the 1980s. It searched FTP (File Transfer Protocol) sites to create the index of downloadable files. Due to limited space, only the listings were available and not for the contents for each site. 18 | 19 | Lycos: It was created as a university project in 1993; it was the first to attain commercial search engine success. In addition to providing ranked relevance retrieval, Lycos has some additional features like prefix matching and word proximity bonuses. 
Now it is currently comprised of a social network with email, web hosting, and media entertainment pages. 20 | 21 | Excite: It was created in 1993 by Stanford University students. The students had the idea of using statistical analysis of word relationships to make searching more efficient and improve the relevancy of searches on the Internet. 22 | 23 | WebCrawler: It was created by Brian Pinkerton in 1994. WebCrawler was the first crawler which indexed complete pages online. 24 | 25 | Yahoo: It was started at Stanford University by Jerry Yang and David Filo in 1994. It became a web portal and search engine. It started out as a listing of their favorite Web sites. What made it different was that each entry, in addition to the URL, also had a description of the page. 26 | 27 | Google: It was started by Larry Page and Sergey Brin began in 1997. According to Page and Brin believed search engines on the basis of the number of times search terms appeared on the web page the website should be ranked. Hence It was based on relevancy ranking. 28 | 29 | And after that many search engine came like MSN search by Microsoft in 1998, AllTheWeb in 1999, StartPage (2009), and many more. 
30 | -------------------------------------------------------------------------------- /msc-plagiarism-assigment/ass1-211.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShreshthSaxena/Text-Mining/b2e4b17d258a72702a147e173f9676d0d9e294c1/msc-plagiarism-assigment/ass1-211.txt -------------------------------------------------------------------------------- /msc-plagiarism-assigment/ass1-321.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShreshthSaxena/Text-Mining/b2e4b17d258a72702a147e173f9676d0d9e294c1/msc-plagiarism-assigment/ass1-321.txt -------------------------------------------------------------------------------- /msc-plagiarism-assigment/ass1-440.txt: -------------------------------------------------------------------------------- 1 | SEARCH ENGINE 2 | 3 | Search Engine , a service for user queries which was given via world wide web. Search Engine searches for paticulars, words or characters specified by the internet user, to the corresponding database, that used for discovering sites related to the user query on the WWW. 4 | Search engines consist of following processes: 5 | 1)Information retrieval (IR): retrieving specific information from stored data. 6 | 2)Web crawling: browses the world wide web in a systematic, computerized manner 7 | 3)Indexing: analysis of pages by titles, sub titles, headings and specific area. This is the fastest form of search. 8 | 9 | 10 | HISTORY 11 | 12 | Archie, first engine or tool used for searching on the Internet, was created in 1990 by Alan Emtage, Bill Heelan and J. Peter Deutsch, students of computer science at McGill University in Montreal. Functionality of archie:directory listings of all the files located on public anonymous FTP sites, creating a searchable database, however Archie did not use indexing. 
13 | 14 | Next engine come to the picture in 1991, which was Gopher, created by Mark McCahill at the University of Minnesota, guide to two new search programs, Veronica and Jughead. Similar to Archie, Veronica and Jughead had searched the filenames, headings and titles stored in Gopher index systems. Veronica provided a keyword search of most Gopher menu tables in the entire Gopher listings. Jughead Excavation and Display, a mechanism used for obtaining menu information from specific Gopher servers. 15 | 16 | W3Catalog, the web's first primitive search engine, released on September 2, 1993. 17 | Matthew Gray produced the first web robot in june 1993 at MIT, the Perl-based WWW Wanderer, and used it to make an index called 'Wandex'. Upto 1995, Wanderer purpose was to measure the size of World Wide Web. 18 | 19 | Lately in 1993, the web second search engine Aliweb come into sight in November 1993. Some of the sights considered Aliweb as the first Search Engine because former search engines(Archie, Veronica, Jughead) were infact just indexers. A facility provided to the user by Aliweb is that user can give in the location information of the index files which in turn outputted 20 | the user-written page interpretation and keywords and webpages. Aliweb was not used broadly. 21 | 22 | In December 1993, Jump Station used web form as an interface for the queries and used web robot to detect web pages. 23 | Jump Station was the first WWW resource-discovery device to combine the three required features of a web search engine that are crawling, indexing, and searching. The crawler experienced that J.S. indexing and its searching were bounded to the titles and headings found on the web pages, as few resources were reachable on the platform it ran. 24 | 25 | 26 | In 1994, the first completely text based crawler search engines was Web Crawler. dissimilar to its predecessors, it let users search for any word in any webpage. It was the first one to be broadly known by the public. 
Also in 1994, Lycos was started and became a major commercial attempt. After that, many search engines appeared and compete for popularity. 27 | 28 | At the time Yahoo! was the most popular manner for people to find web pages of attentiveness, but its search function worked/operated on its web directory, instead full-text copies of web pages. Information aspirant could also browse the directory instead of doing a keyword-based search. 29 | In 1996, Netscape was looking to give a single search engine an unique deal to the search engine for featuring on Netscape's web browser. There was so much interest that a deal was finalised with Netscape by five of the major search engines, where for $5Million per year each search engine would be in a rotation on the Netscape search engine page which costed for $5Million per year. The five engines were Yahoo!, Magellan, Lycos, Infoseek, and Excite. 30 | Many search engine companies were caught up in the .com fantasy, a market speculation resonate that peaked in 1999 and ceased in 2001. 31 | 32 | Around 2000, Google's search engine rose to greatness. The Google brought about effective results for many searches with an innovation known as PageRank. This repetiton algorithm ranks web pages based on the number and PageRank of other websites and pages that link there. Google too kept a minimal and essential interface to its search engine. In contrast, many of its challenger rooted a search engine in the web portal. 33 | 34 | Past 2000, Yahoo was providing search services based on Inktomi 's search engine. Yahoo! acquired Inktomi in 2002 and Overture (which owned AlltheWeb and AltaVista) in 2003. Yahoo! changed to Google's search engine until 2004, when it launched its own search engine based on the combined technologies of its making. 35 | 36 | In 1998, Microsoft first launched MSN Search using search results from Inktomi. 
In 1999 the result were listed from the combined result of Looksmart and Inktomi except for a short span of timein 1999 when AltaVista was also used to get the results.But in 2004,Microsoft own web crawler(called msnbot)was used for its own search technology. Bing, was launched on June 1, 2009 which was Microsoft's rebranded search engine. On July 29, 2009, Yahoo! and Microsoft came in collobaration in which Microsoft Bing technology will be used for Yahoo Search. 37 | 38 | There are many more every year search engines are either evolved or improved. 39 | --Bing launches Social Sidebar where users see search results through the lens of their 40 | social networks. 41 | --In 2015, Bing releases its own mobile-friendly algorithm update. 42 | --Google is improving its functionalities by using machine learning etc -------------------------------------------------------------------------------- /msc-plagiarism-assigment/ass1-505.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShreshthSaxena/Text-Mining/b2e4b17d258a72702a147e173f9676d0d9e294c1/msc-plagiarism-assigment/ass1-505.txt -------------------------------------------------------------------------------- /msc-plagiarism-assigment/ass1-532.txt: -------------------------------------------------------------------------------- 1 | HISTORY OF SEARCH ENGINES 2 | 3 | 4 | For information retrieval, Search Engines are used as a tool in web. To get relevant information from unnecessary data Search Engines are needed. It helps people to find information quickly in short span of time. 5 | It is a software program that works on keyword matching i.e it searches the internet based on the words that you consign as search terms. 6 | 7 | 8 | FIRST SEARCH ENGINE: 9 | Archie - 1990 by Alan Emtage,Bill Heelan and J.Peter Deutsch 10 | It works on the basis of a file’s name i.e used to search for File Transfer Protocol (FTP) files. 
11 | Archie was fail to do indexing of contents. 12 | 13 | 14 | Gopher - 1991 by Mark McCahill 15 | It was a type of system based on Menu. All you have to do is select the item which is required to you from the menu. 16 | 17 | 18 | Veronica and Jughead - 1991 19 | First text based Search Engine. 20 | Veronica (Very Easy Rodent-Oriented Net-wide index to Computerized Archives). It provides keyword searching which are present in the Gopher listings. 21 | 22 | 23 | Jughead (Jonzy’s Universal Gopher Hierarchy Excavation and Displays). It gives menu information from specific Gophers servers. 24 | 25 | 26 | W3Catalog and Wanderer - 1993 by Oscar Nierstrasz 27 | To display images in line with text, Mosaic was introduced and was the first Graphical Web Browser. 28 | W3Catalog: was one of the first Search Engines that attempted to provide a general searchable catalog foe www resources. 29 | 30 | 31 | Wanderer: First web robot which provides an index called wandex. It is a perl based web crawler. 32 | 33 | 34 | Aliweb -1993 35 | Second search engine 36 | Users are allowed to submit the locations of index file to include web pages and can add user written page description and keywords 37 | 38 | 39 | JumpStation - 1993 40 | It combines the three essential features of a web search engine(crawling, indexing , and searching) and thus, was the first www resource discovery tool. To search web pages and to build its index, JS used a web robot. 41 | 42 | 43 | Web Crawler - 1994 by Brian Pinkerton 44 | It is a web service but at first it was a desktop application. It was the first web search engine to enable full text search. 45 | 46 | 47 | MetaCrawler - 1995 48 | It has its own search syntax. Rather than a single search engine algorithm , it provides a multiple search engine. 49 | 50 | 51 | Alta Vista - 1995 52 | It was the fastest search engine and could manage tons of hits a day without any deterioration. 
53 | 54 | 55 | Excite - 1995 56 | It is an internet portal and one of the major ”dotcom-portals” 57 | 58 | 59 | Dogpile and Hotbot - 1996 60 | Dogpile: was a meta search site. It uses the search results of multiple search engine and before presenting to the user , it filters the duplicates. 61 | 62 | 63 | HotBot: The first search engine which offers the facility to search within search results. 64 | 65 | 66 | Ask Jeeves - (1996-97) : Garrett Gruener and David Warthen 67 | The concept is to provide answer to the user on the everyday basis that is users can able to get the answers of questions posted everyday. 68 | 69 | 70 | Google - 1998 71 | Google employs PageRank Algorithm which lead to successful regime. This algorithm maintain a web page ranking which matches with a given search string. Google along with indexes and caches web pages using snapshots of other file types for eg : PDF and word documents. 72 | 73 | 74 | Yahoo! Search - 2004 by David Filo and Jerry Yang 75 | Yahoo! Began as a web directory of favourable web pages, each including a man-made description in its URL. Even though Yahoo! owned multiple search engines, but kept using Google search engine for its results. 76 | 77 | 78 | WikiSeek and Guruji(2006-07) 79 | WikiSeek: It is a type of search engine which indulge with the idea of wikipedia , that is it indexed wikipedia pages only. 80 | 81 | 82 | Guruji: This search engine was for the indian users providing information in the context of India. 83 | 84 | 85 | Bing -2009 : 86 | It is designed by Microsoft. It was started as MSN search by Microsoft which used search results from Inktomi and Outsource. Later MSN Search was replaced by Bing. Bing has taken over Yahoo! Search product as well. 
-------------------------------------------------------------------------------- /msc-plagiarism-assigment/ass1-541.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShreshthSaxena/Text-Mining/b2e4b17d258a72702a147e173f9676d0d9e294c1/msc-plagiarism-assigment/ass1-541.txt -------------------------------------------------------------------------------- /msc-plagiarism-assigment/ass1-606.txt: -------------------------------------------------------------------------------- 1 | HISTORY OF SEARCH ENGINE 2 | 3 | Before knowing the history of search engine we must understand, what really search engine is? So, the search is a 4 | program where you place your query/problem and it provides you with desired solution by looking at 5 | database/servers. 6 | 7 | 8 | It all started in 1990, when the first search engine ARCHIE was invented by Alan Emtage, Bill Heelan and J. Peter 9 | Deutsch. Archie was just capable of downloading the directory listings of all the files located on File Transfer 10 | Protocol sites and it just provided the user with the filename not the content. 11 | 12 | 13 | In 1993, JUMPSTATION, the first WWW search engine came into picture. It used the titles and headings of the 14 | documents to index the web pages. It used linear search to find the web pages. It searched results on the basis of 15 | input provided by the user and output the listing of urls that matches the keywords of the input. 16 | 17 | Then, YAHOO came in 1994 which was invented by Jerry Yang and David Filo. The site was the first collection of 18 | web pages across the internet. They include Man-made descriptions for the URLs. For Site owners it was free to 19 | add Informational sites, but commercial sites it was paid. 20 | 21 | 22 | In the same year when Yahoo was launched two more search engine was also launched WebCrawler and Excite. 
23 | 24 | WebCrawler was the first search engine that indexed whole pages, which means full-text search. Excite bought 25 | WebCrawler in 1996. It provided features like instant messaging, weather reports, news reports and metasearch. 26 | 27 | 28 | The Lycos search engine was also established in 1995. It enjoyed several years of growth and became the most visited 29 | site in the world. Till 1998, Lycos was the 30 | largest search engine. 31 | 32 | 33 | After Lycos came the Askjeeves search engine. In this search engine humans answered the query that was 34 | posted by the user. Now, it is known by the name of Ask.com. Facing immense competition from other search 35 | engines like Google, Ask.com launched a Q&A community for generating answers from real people. 36 | 37 | 38 | In 1998, Google, which was invented by Larry Page and Sergey Brin, was launched. Both Page and Brin developed the 39 | PageRank algorithm. Convinced that the pages with the most links to them from other highly relevant Web pages 40 | must be the most relevant pages associated with the search, Page and Brin tested their thesis as part of their 41 | studies, and laid the foundation for their search engine. Now, Google is the most widely used web based search 42 | engine. Since 1998, Google has never looked back; it is now the most widely used search engine. After the PageRank 43 | algorithm, Google used the Panda algorithm to reward high-quality websites and diminish the presence of low-quality 44 | websites in Google’s organic search engine results. After that Google again updated their algorithm to the Penguin 45 | algorithm, which was an improved version of Panda. 46 | 47 | 48 | Microsoft launched their first search engine, named MSN Search, using search results. It consisted of a web 49 | crawler, index, and search engine. After that, MSN Search launched an improved version which displayed listings 50 | from Looksmart with the results from Inktomi, but for a short time after that the results from AltaVista were used.
51 | Now, it is known by the name of the Bing search engine, which is developed in Asp.net. 52 | 53 | -------------------------------------------------------------------------------- /msc-plagiarism-assigment/ass1-743.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShreshthSaxena/Text-Mining/b2e4b17d258a72702a147e173f9676d0d9e294c1/msc-plagiarism-assigment/ass1-743.txt -------------------------------------------------------------------------------- /msc-plagiarism-assigment/ass1-817.txt: -------------------------------------------------------------------------------- 1 | 2 | History of Search Engines 3 | 4 | 5 | What is a Search Engine? 6 | 7 | A Search Engine is any online tool or facility on the web that allows users to scan through a specific domain of data (usually the WWW) and displays the most appropriate matches that appear according to the provided keywords or queries. 8 | These tools or software systems are mostly written in the Python, C and C++ languages. 9 | 10 | How and why did these systems originate? 11 | 12 | The first search engines appeared and were brought into usage in the 1990s. 13 | These initial systems used to search FTP sites and created indexed lists of downloadable files which could later be looked through to get the query results. 14 | 15 | The major reason for the coming up of these systems was the rapid increase in the amount of data available on the web, driven by ease of storage and growth of the data sector. Thus, obtaining the required information from the web in efficient time, without requiring extensive manual look-about, was the need of the hour. 16 | 17 | Evolution of the Search Engines.. 18 | 19 | 1990 20 | FTP sites were the target. 21 | Downloadable indexed listings were created. 22 | Due to the limited storage available, only the listings were available and not the content. 23 | 24 | 1991 25 | Coming up of the WWW virtual library (VLib), hosted by CERN.
26 | CERN web servers hosted a list of web servers- initial internet. 27 | 1993 28 | Coming up of first search engines which resembled the modern search engines. 29 | Primitive web searches stated to appear in the desired and more informative patterns and formats. 30 | Information about the pages required by the user. 31 | Redirection to the required page. 32 | Indexed titles and URLs started to appear as the query results. 33 | Example- RBSE Spider also had a rating system. 34 | 35 | 1994 36 | 37 | Coming up of yahoo! 38 | Wait time for the results was still very high. 39 | Service for commercial entities was a little expensive. 40 | results had man made descriptions with the URL of the searched pages. 41 | 42 | Web Crawler came out. 43 | The oldest web search engine which is still active as it was, with the systems and procedures intact even today. 44 | This system indexed the entire pages. 45 | 46 | 1998 47 | 48 | The landmark year in the journey of search engines. 49 | 50 | MSN search launched 51 | This system worked extensively on the user experience and the outlay of the information search in lesser time. 52 | 53 | Google 54 | Revolutionised the industry of sears engines by highly improving the user experience and the result outputs by working extensively on the backlink models. 55 | 56 | 2000-2018 57 | Business in the sector boomed with improvement in technology. 58 | Various successful search engines came out. 59 | Competition in the sector is still there, however google remains on the top in its class. 60 | 61 | Algorithmic journey of google.. 62 | 63 | Most popular search engine since its inception. 64 | Algorithm advanced with time. 65 | 66 | Initially the search results were highly objective and static. 67 | Coming up of social media highly influenced the relevancy algorithm by providing an insight on the jargons and meaning thereof. 68 | Roping in of inbound links and connections in the pages which were the result of initial searches. 
69 | Adding more content and improved data sets. 70 | Judging websites according to their authority. 71 | 72 | All these advancements helped Google to improve the algorithm each day and left the competitors far behind. 73 | 74 | Best Search Engines in their class, according to me: 75 | 76 | Google - For speed and popularity of the content. 77 | Yahoo - Email related searches and associated activity. 78 | Ask - Question answering kind of sessions. 79 | AOL - Wide range of websites connected; better for linked searches. 80 | Wolfram - Searches amalgamated with Maths. 81 | DuckDuckGo - Privacy, no tracking. -------------------------------------------------------------------------------- /msc-plagiarism-assigment/ass1-826.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShreshthSaxena/Text-Mining/b2e4b17d258a72702a147e173f9676d0d9e294c1/msc-plagiarism-assigment/ass1-826.txt -------------------------------------------------------------------------------- /msc-plagiarism-assigment/ass1-909.txt: -------------------------------------------------------------------------------- 1 | Text Analytics Assignment 1 2 | HISTORY OF SEARCH ENGINES 3 | Introduction: Search Engines 4 | To begin with, a search engine is a tool or a service that users can use to search for content that they require. Simply put, the user enters, or commands through voice, certain keywords or phrases to receive a list of data available online such as documents, books, images, etc. 5 | 6 | 7 | To attain this simple task a search engine has two components: a spider or crawler and an algorithm. But before reaching this design, many search engine designs were tried. Earlier search engines relied largely on page contents, but as new algorithms and technologies like machine learning came into being, the search results now returned are highly variable in nature.
8 | The Pre-Web Search Engines : 1990-1991 9 | Archie 10 | It has been described as the pre-web search engine. Created by Alan Emtage, Bill Heelan and J. Peter Deutsch. 11 | Search Type : It searched FTP Sites to create an index of all the files downloaded. It had a space issue. Hence, the search was entirely based on list headers rather than the page content. 12 | Veronica and Jughead 13 | After Archie, two new search engines came into being : Veronica and Jughead. These two were based on Gopher Indexed System. 14 | Search Type : VERONICA(Very Easy Rodent-Oriented NetWide Index to Computerized Archives) used Gopher’s menu titles. The keywords entered were matched with words in Gopher’s entire listing. 15 | Meanwhile, Jughead was used to obtain information from a specific server. 16 | Developing World Wide Web Virtual Library - 1992 17 | Tim Berner Lee had setup a Virtual Library (VLib). This library was hosted on CERN’s Web Server. It was a loose collection of topic-wise link-list. Each one maintained by the topic-expert. The CERN’s Web Servers contained listing of web server. 18 | Earlier Search Engines : 1993 19 | First Web Robot : World Wide Web Wanderer 20 | Created by Matthew Gray, it was only a robot meant to create index of the world wide web. It was not meant for search. The technology was developed in Perl. 21 | First Web Search Engine : W3 Catalog 22 | Oscar Nierstrasz released W3Catalog. 23 | Search type : It used a high quality existing list of websites instead of crawlers and indexes. But, a major performance degradation was caused when the bot accessed same pages hundreds of time in same day. 24 | AllWeb 25 | Contributed by Martijn koster. 26 | Search Type : No web robots used. Rather the system was notified by web-admin of the site’s existence on an index file. 27 | Other Primitive Searches 28 | 1. JumpStation was created by Jonathon Fletcher. 29 | Search Type : Linear Search on the resource directory containing headers. 
Contained crawler, indexes and search feature. 30 | 1. World Wide Web Worm searched on indexes titles and URLs .No RANK system used. 31 | 2. RBSE Spider : it used exact title matches and had a rating system. 32 | New Web Search Engines : 1994 onwards 33 | 1994-2000 34 | 1. 1994 35 | 1. Infoseek : a page could be submitted in real time. 36 | 2. Web Crawler : Created by Brian Pinkerton. Search Type : Search for any word. 37 | 3. Yahoo ! : A utility to other search engines. Yahoo is a web directory. 38 | 4. Lycos : Designed by Michael Loren Mauldin. A simple search engine research project. 39 | 1. 1995 40 | 1. LookSmart : A web directory. 41 | 2. AltaVista : Search Type : Allows Natural Language and gives search tips. Domains can be added and deleted within 24 hours. 42 | 1. 1996 43 | 1. BackRub : Created by Larry Page and Sergey Brin. It is a web crawler. Search Type : Used backlinks to search. And pages were ranked based on citation notation. 44 | 2. HotBot : An engine by Inktomi. 45 | 3. Popular Engine: A version of Direct Hit Technologies : Gary Culliss and Steven Yang designed it. This search engine ranks pages/results based on selections previously made. 46 | 1. 1997 47 | 1. Ask Jeeves : Search Type : Based on natural language. Ranks documents on popularity. 48 | 2. Google.com is registered. Based on BACKLINK Model. 49 | 1. 1998 50 | 1. Overture : Formerly Goto.com. Provided pay per click service. 51 | 2. MSN Search/Bing : Search Type : search results generated by Overture, Looksmart, and Inktomi. 52 | 3. Popular Engine: Direct Hit Technologies in partnership with Hotbot released it. Results were based on prior user searches. 53 | 1. 1999 54 | 1. AlltheWeb : Sleek Interface with Advanced features Based on FTP Search project of Tor Egge. 55 | 2000-2006 56 | 1. 2004 57 | 1. MSN Search : Microsoft uses its own indexer instead of Inktomi and Looksmart. 58 | 2. Google Suggest is launched as Google Labs. 59 | 1. 2005 60 | 1. 
Webmaster’s Collectively introduce Nofollow attribute to combat spam link. 61 | 2. Snap : Created by Bill Gross. Allows many features such as : display of search volumes, sophisticated auto-completion etc. 62 | 2006-2012 63 | 1. 2006-9 64 | 1. Wikia Search : By Wikia. Search Type :based on human curation. 65 | 1. 2008 66 | 1. Cuil : Search Type : uses picture thumbnails to display search results 67 | 1. 2009 68 | 1. Search Algorithm Update: Caffeine: can crawl fast, index expansion becomes easier and ranking and indexing can be maintained in real time. 69 | 1. 2010 70 | 1. Google Instant or search-before-you-type feature: Google tries to predict the query user is going to enter. 71 | 2. Blekko : Search Type : slashtags for targeted search. 72 | 1. 2011 73 | 1. Web Master Tool : Schema.org : A joint venture to provide targeted search using tag system. 74 | 2. Web Search Algorithm Update : Google Panda an algorithm update to work on cracking down of spam, scrapers, and websites with a high ad-to-content ratio. 75 | 2012-2018 76 | 1. 2012 77 | 1. Search Plus Your World : Google Tool. Uses integration of social data into search. 78 | 2. Google Penguin: Algorithm Update for handling Webspam . 79 | 3. Side Bar : Bing’s Update to use users' social networks for information relevant to the search query. 80 | 4. Google’s Search Algorithm Update: Using Knowledge Graph technique to store semantic relationships between objects. 81 | 5. Google Hummingbird :A core algorithm using knowledge graphs. 82 | 1. 2015 83 | 1. MobileGeddon : Google released an update to generate mobile friendly pages. 84 | 2. RankBrain : Google uses machine learning to rank pages. 85 | 1. 2016 86 | 1. Penguin Algorithm goes real time. 87 | 1. 2017 88 | 1. Fred: Google’s update to punish sites having poor backlinks . 
-------------------------------------------------------------------------------- /msc-plagiarism-assigment/ass1_1349.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShreshthSaxena/Text-Mining/b2e4b17d258a72702a147e173f9676d0d9e294c1/msc-plagiarism-assigment/ass1_1349.txt -------------------------------------------------------------------------------- /msc-plagiarism-assigment/ass1_422.txt: -------------------------------------------------------------------------------- 1 | Web Search Engines are software programs that searches the Internet based on the words that you designate as search terms. Search engines look through their own databases of information in order to find what it is that you are looking for. Web Search Engines are a good example for massively sized Information Retrieval Systems. 2 | During the early development of the web, there was a list of web servers edited by Tim Berners-Lee and hosted on the CERN webserver. As more web servers went online the central list could not keep up. On the NCSA site new servers were announced under the title "What's New!". The very first tool used for searching on the Internet was Archie.] The name stands for "archive" without the "v". It was created in 1990 by Alan Emtage, Bill Heelan and J. Peter Deutsch, computer science students at McGill University in Montreal. The program downloaded the directory listings of all the files located on public anonymous FTP(File Transfer Protocol) sites, creating a searchable database of filenames; however, Archie did not index the contents of these sites since the amount of data was so limited it could be readily searched manually. The rise of Gopher (created in 1991 by Mark McCahill at the University of Minnesota) led to two new search programs, Veronica and Jughead. Like Archie, they searched the filenames and titles stored in Gopher index systems. 
Veronica(Very Easy Rodent-Oriented Net-wide Index to Computerized Archives) provided a key word search of most Gopher menu titles in the entire Gopher listings. Jughead (Jonzy's Universal Gopher Hierarchy Excavation and Display) was a tool for obtaining menu information from specific Gopher servers. While the name of the search engine "Archie" was not a reference to the Archie comic book series, "Veronica" and "Jughead" are characters in the series, thus referencing their predecessor. In the summer of 1993, no search engine existed yet for the web, though numerous specialized catalogues were maintained by hand. Oscar Nierstrasz at the University of Geneva wrote a series of Perl scripts that would periodically mirror these pages and rewrite the min to a standard format which formed the basis for W3Catalog, the web's first primitive search engine, released on September2, 1993. In June 1993, Matthew Gray, then at MIT, produced what was probably the first web robot, the Perl based World Wide Web Wanderer, and used it to generate an index called 'Wandex'. The purpose of the Wanderer was to measure the size of the World Wide Web, which it did until late 1995. The web's second search engine Aliweb appeared in November 1993. Aliweb did not use a web robot, but instead depended on being notified by website administrators of the existence at each site of an index file in a particular format. Jump Station (released in December 1993) used a web robot to find web pages and to build its index, and used a web form as the interface to its query program. It was thus the first WWW resource-discovery tool to combine the three essential features of a web search engine (crawling, indexing, and searching) as described below. Because of the limited resources available on the platform on which it ran, its indexing and hence searching were limited to the titles and headings found in the web pages the crawler encountered. 
One of the first "full text" crawler-based search engines was WebCrawler, which came out in 1994. Unlike its predecessors, it let users search for any word in any webpage, which has become the standard for all major search engines since. It was also the first one to be widely known by the public. Also in 1994, Lycos (which started at Carnegie Mellon University) was launched and became a major commercial endeavor. Soon after, many search engines appeared and vied for popularity. These included Magellan (search engine), Excite, Infoseek, Inktomi, Northern Light, and AltaVista. Yahoo! was among the most popular ways for people to find web pages of interest, but its search function operated on its web directory, rather than full-text copies of web pages. Information seekers could also browse the directory instead of doing a keyword-based search. In 1996, Netscape was looking to give a single search engine an exclusive deal to be the featured search engine on Netscape's web browser. There was so much interest that instead a deal was struck with Netscape by five of the major search engines, where for $5Million per year each search engine would be in a rotation on the Netscape search engine page. The five engines were Yahoo!, Magellan, Lycos, Infoseek, and Excite. Search engines were also known as some of the brightest stars in the Internet investing frenzy that occurred in the late 1990s. 3 | 4 | -------------------------------------------------------------------------------- /msc-plagiarism-assigment/ass1_734.txt: -------------------------------------------------------------------------------- 1 | History of web search engines-: 2 | The beginning of it all-: 3 | The need for search engine first surfaced in July 1945. Vannevar Bush published an article "As We May Think" in Atlantic monthly. 
Few lines from his article were -: "The difficulty seems to be, not so much that we publish unduly in view of the extent and variety of present-day interests, but rather that publication has been extended far beyond our present ability to make real use of the record" which shed light on a greater issue i.e. we need an index to be built on the data we have as the data is growing so much so we are unable to make real use of it. So he proposed the idea of a system called Memex. Memex is a blend of words memory and index and is essentially a device in which people would compress and store their books, communications and records, it is done in such a manner that it will allow easy, fast and flexible access to content in it and this idea of memex influenced the development of early hypertext systems and eventually lead to creation of WWW(world wide web). 4 | Then came the era of the father of information retrieval systems Gerard Salton (1960s - 1990s), his work greatly contributed towards the development of modern web search technologies. He and his team at Cornell and Harvard developed the SMART(Salton’s Magic Automatic Retriever of Text). SMART exhibited the concepts like Term Frequency(TF), vector space model, term discrimination values, Inverse Document Frequency(IDF) and relevancy feedback mechanisms. He also wrote a book called A theory of Indexing which explains many of his tests upon which search is still largely based 5 | The first Search Engine-: 6 | Archie was the first web search engine to surface in 1990 created by Alan Emtage, a student at McGill University in Montreal. The initial name was "archives," but it was shortened to Archie.It was a tool to index FTP archives and allowed people to find specific files, it made use of a script based data gatherer and a regular expression matcher for finding file names queried by a user. 
7 | 8 | 9 | Before WWW files were shared using FTP but it was only effective for small groups and then Tim Berner Lee introduced WWW and created the virtual library which is a catalogue of the web. 10 | 11 | 12 | Archie gained popularity and University of Nevada System Computing Services developed Veronica. Veronica worked on plain text files which were sent via Gopher an alternative to Archie. Another user interface appeared and it was Jughead which performed the same task as Veronica but only searched a single server at a time. 13 | 14 | 15 | World Wide Web Wanderer - The web’s first bot which was created by Matthew Gray in 1993. At first, he wanted to measure the growth of the web by counting the active web servers but he upgraded the bot to capture actual URLs and the database he created was called Wandex but the problem with this bot was the fact that it caused lag because it was accessing same pages hundred of times a day. Though he fixed the bot but people became sceptical of bots and started questioning their value. 16 | 17 | 18 | Around december 1993 there were three bot based web search engine -: Jumpstart, World Wide Web Worm and RBSE(Repository-Based Software Engineering) spider. Jumpstart collected the info about a page’s title and header using a simple linear search which eventually lead to its downfall as web grew. The WWW Worm indexed titles and URLs but it showed the results in order it found them without any ranking as it was the case with jumpstart. This problem was took care of in RBSE spider as it had a rating system but the problem with these all was the fact that you only get results if you know the exact name of what you were looking for. 19 | Then in 1994 came the Infoseek(webmasters could submit pages to it in realtime and became default search engine for Netscape.) a popular search engine which came around in 1994.. 
20 | The ElNet Galaxy was a web directory and it was organized in a similar manner to how today’s web directory are organized and it utilized different web features but the web size in 1994 doesn’t called for a web directory even then soon more web directories were followed. 21 | In may 1994 ALIWEB(Archie-Like indexing of web) was launched and it crawled meta info and allowed users to submit there pages they wanted indexed along with a description what this meant was there was no need for a bot which collects data and no excessive usage of bandwidth but the thing was users didn’t know how to submit their sites. 22 | Then came the Yahoo Directory created by David Filo and Jerry Yang in april 1994 and it was a collection of their favorite web pages but increasing number of pages motivated them to become searchable directory. It also included a man-made description associated with each URL. They started to add informational sites for free but commercial sites were charged. But by 2014 end Yahoo Directory was closed. 23 | WebCrawler was the first crawler that indexed whole pages and gain too much popularity that it cannot be used during day time. It was followed by other search engine such as Lycos(went public with a huge catalog of 54k, and used ranked retrieval, prefix matching and word proximity), LookSmart(gave tough competition to Yahoo!), Excite. 24 | Then AltaVista was launched and it offered many new features such as unlimited bandwidth(first one to do so), first one to allow for natural language queries, advanced search techniques, users can add or delete their own URL within 24 hrs and it also provided search tips. 25 | 26 | 27 | The Dawn of Modern Web Search- Google(initially BackRub) 28 | In 1996, Larry page and Sergey Brin students at Stanford University developed a search algorithm- BackRub named so because of it uses BackLinks(the incoming links to a web page) to rank its search results and this ranking scheme was called Citation notation. 
In their PageRank algorithm all incoming links to a web page counted as vote towards the reliability of that page but some links counted more than other based on certain criteria. The modern name for this type of search is Social Search. 29 | Between 1996 BackRub and launch of Google in 1998 the other search engines that surfaced were Inktomi:HotBot (used paid inclusion model), AskJeeves (natural language search engine which used human editors to match search queries, used clustering to find local web communities), MSN Search. 30 | 31 | 32 | Then there were other search engines which were launched after launch of Google like AllTheWeb, Teoma, Snap(too complicated for general web surfer),Cuil, LiveSearch(by microsoft),Bing(a rebranding of LiveSearch/msn), Schema(created in collaboration of Google, Microsoft, Yahoo). 33 | During this period Google changed SERP(search engine results page) with new features like news, videos, images, local and others. It released “Google Suggest”- dropdown of suggested topic related to search. It introduced new web indexing system named Caffeine which provided 50% fresher search results. Updates likes Google Instant(real-time search results), Panda(major change in ranking algorithm),Penguin, Hummingbird(first search algo that has ability to parse intent behind a query not just the language), Pigeon, Mobilegeddon,RankBrain(google revealed that machine learning has played important role in ranking algo), Possum. 34 | Google in 2014 started prioritizing websites which uses HTTPS which was seen as google effort towards web security. 35 | Google is still dominating the market with approx 90.46% and other search engine like bing(3.13%),Yahoo!(2.21%) and etc. These statistics show how Google has continuously dominated and revolutionized modern web search. 
-------------------------------------------------------------------------------- /msc-plagiarism-assigment/ass1_808.txt: -------------------------------------------------------------------------------- 1 | ASSIGNMENT 1 - HISTORY OF SEARCH ENGINES 2 | 3 | Beginning of the development of Search Engines 4 | It was first noted in 1945 when an American engineer and scientist, Vannevar Bush, published an article in the magazine The Atlantic Monthly, stating the importance of an “expansive index of all knowledge”. This marked the beginning of the development of Search engines that aim to find and organise huge amounts of data. 5 | 6 | Search Engines before the web 7 | It was only in 1990 that this kind of index was made a reality with the development of the very first search engine called ‘Archie’, created by Alan Emtage, a student at McGill University. The name stands for “archive” without the “v”. Usually, the location of the server and the name and location of the file that is to be accessed were required to retrieve information from the FTP servers. Archie searched FTP sites to create an index of downloadable files, but due to limited space only a searchable list of file names was available, so it could search using the file name and not through the file content. 8 | 9 | While Archie indexed files on FTP servers, Gopher, created in 1991 by Mark McCahill, indexed plain text documents. Based on titles and descriptions of the document, manual traversal through a series of menus and submenus was required to get information from Gopher. 10 | Then in 1992, ‘Veronica’ and ‘Jughead’ were developed to search files stored in Gopher indexed systems. Veronica (1992), which meant ‘Very Easy Rodent-Oriented Net-wide Index to Computerized Archives’, enabled keyword search for Gopher menus and submenus present in the entire Gopher system.
Jughead (1993) which meant Jonzy's Universal Gopher Hierarchy Excavation and Display also searched file names (menus and submenus) in Gopher index systems but only searched a single server at a time. 11 | But all these had one major common limitation. These entire search engines were only able to search based on titles and file names, and not on the content of the file. 12 | 13 | Search Engines after the Web 14 | When the world wide web was created in 1991, there were no search engines developed to work fine over HTTP. Tim Berners Lee manually created and updated a directory of all web servers ( World wide web virtual library). It helped users find URLs for different websites. The CERN web servers hosted it at the beginning of the internet. 15 | But when the web grew to the extent that manual maintenance of directories were not possible, the need for search engine became inescapable. The requirements that were needed to be met were: 16 | ● Content discovery 17 | ● Content indexing 18 | ● Search 19 | 20 | First web Search Engine - Searchable manual directories 21 | W3Catalog - It used a computer program to pull out information from few curated website indices and provided relevant listings. After W3Catalog came Aliweb where any website could be submitted for indexing. 22 | Most popular web directory - Yahoo 23 | Created by David Filo and Jerry Yang in 1994, provided the first collection of web pages across the internet. 24 | World Wide Web Wanderer (1993) 25 | Matthew Gray created it in 1993. An index wandex was generated to measure of the size of the world wide web. 26 | Jumpstation (1993) 27 | It used web page titles as well as headings to create indexes that were generated using web crawlers. Did not provide any ranking. 28 | First Full Text Search Engine - WebCrawler (1994) 29 | The first full text crawler based search engine was WebCrawler which was developed in 1994. 
The first search engine to index entire pages, but the amount of data required to do search made it too slow to be used during the day. Search for any keyword in a webpage was allowed unlike the previous engines. 30 | Lycos - 1994 31 | Launched in 1994, it’s huge collection of indexed documents made it so popular. Initially, 400,000 documents were indexed per month and it grew index of 60,000,0000 documents in less than two years. Such huge number of indexed pages as compared to other search engines led to it’s popularity. 32 | Excite - 1995 33 | Launched in 1995, it was the first search engine to rank the results to provide relevant ones using word relationships and statistical analysis. 34 | AltaVista - 1995 35 | It allowed natural language search queries and was also the first to use Boolean operators. 36 | AskJeeves - 1996 37 | Operated on question-answer platform, attempted to have human editors respond to search queries. 38 | Private Search Segment 39 | Private search engines as DuckDuckGo, Qrobe and Startpage gave users an escape from getting their search habits tracked. 40 | Predecessor to Google - Backrub - 1996 41 | Larry page and Sergey brin created it in 1996. The initial idea used backlinks to help rank websites for better search. 42 | Google - 1997 43 | Main revolution was to use Page Rank algorithm which marked the relevance (rank) of a web page contributed by two factors: 44 | 1. Backlinks i.e. the number of websites that link to that particular web page indicates relevance thereby increasing the rank of the page. 45 | 2. Backlinks from trustworthy webpage increases the rank of the page whereas links from non trustworthy webpage decreases the rank of the page. (These links are assigned respective weights called link juices) 46 | Google Fred penalizes sites with low quality backlinks. Google Instant shows real time search results for users as they enter a query and Google suggest provides dropdowns of suggested topics. 
Google Hummingbird algorithm attempts to understand the human intent behind a search query to help user search more precisely. 47 | These edge cutting advancements effectively helps Google to provide with better and relevant web pages. Google really tops by commanding over the 70% of search engine market. 48 | 49 | Some Other modern search engines are: MSN Search (1998) - By Microsoft, Overture (1998) - Paid Web search engine, All the Web (1999) – Also referred to as FAST search -------------------------------------------------------------------------------- /msc-plagiarism-assigment/ass1_936.txt: -------------------------------------------------------------------------------- 1 | History of Search Engines 2 | Search engines have become such an integral framework for how we learn, cross-check facts, and process information.Instead of logging hours poring over books in a library, we can access seemingly limitless databases at our fingertips in a matter of seconds. 3 | 4 | What search engines are and how they work 5 | Search engines scan the content and information, and follow links that lead to other pages. Search engines operate using algorithms, which find information on websites, and store them in a large index, or catalogue. As sites are updated or changes, the index is updated as well. The engines sort through pages, and bring up the matches that are closest to the keywords searched. The pages are ranked according to the greatest relevance of content, and the order of ranking greatly contributes to the site's popularity and success. 6 | 7 | How Search Engine Development Began 8 | The need for search engines was first noted in 1945 when American engineer and scientist Vannevar Bush published an article in The Atlantic Monthly, emphasizing the necessity for an expansive index for all knowledge. Information has been extended far beyond our present ability to make real use of the record. 
A record, if it is to be useful to science, must be continuously extended, it must be stored. 9 | Decades later, college students and electrical engineers attempted to make this kind of index a reality. 10 | 11 | Search Engines Development Timeline 12 | Archie(1990) was the first tool created by Alan Emtage and L. Peter Deutsch for indexing, and is considered the first basic search engine. 13 | Lycos(1993) was created as a university project, but was the first to attain commercial search engine success. In 1999 Lycos was the most visited search engine in the world. 14 | Yahoo!(1994) started at Stanford University by Jerry Yang and David Filo (both electrical engineering grad students) that became a web portal and search engine. 15 | WebCrawler(1994) created by Brian Pinkerton WebCrawler was the first crawler which indexed complete pages online. 16 | AltaVista(1995) an industry leader, was once the most popular search engines of its time. It differed from its contemporaries because of two factors: Alta Vista used a multi-threaded crawler (Scooter) that covered more webpages than people knew existed at the time. It also had a well-organized search-running back-end advanced hardware. 17 | Looksmart(1995) competed with Yahoo! 18 | WiseNut(2001) was a crawler-based search engine that was introduced as a beta, and was owned by Looksmart. 19 | Excite(1995) Founded originally as "Architext"by Stanford University students, Excite was launched officially having purchased two search engines (Magellan and WebCrawler), and signed exclusive agreements with Microsoft and Apple. 20 | Hotbot(1996) a search engine also popular in the 90's was launched by Wired Magazine, and is now owned by Lycos. 21 | Dogpile(1996) was a search engine developed by Aaron Flin and shortly thereafter sold to Go2net. Now Dogpile fetches results from Google, Yahoo, and Yandex. 22 | Google(1996) Started for a research project by Stanford students Larry Page and Sergey Brin. 
They created a search engine that would rank websites based on the number of other websites that linked to that page. This strategy developed the world's most successful search engine today. 23 | MSN Search(1998) was the engine used by Microsoft, sourcing search results from Inktomi, and later Looksmart. By 2006 Micosoft started performing their own image searches, and MSN became branded as Windows Live Search, then Live Search, and finally to Bing (2009). 24 | ASK(1996) was originally titled "AskJeeves.com"and was designed by Garret Gruener and David Warthen in Berkeley, CA. The goal was to provide users with answers to queries typed with normal everyday language and colloquialisms. It was acquired in 2005 by IAC and continues to grow with over 100 million users. 25 | Teoma(2000) meaning "expert"in Gaelic, was a search engine created by professor Apostolos Gerasoulis and Tao Yang at Rutgers University. 26 | Infoseek(1994) was a search engine begun by Steve Kirsch, and was bought by The Walt Disney Company in 1998. Eventually it was replaced by Yahoo, and no longer exists. 27 | Overture(1998) was originally named "GoTo,"where top listings were sold on a cost-per-click or pay-per-click basis. 28 | Alltheweb(1999) began in 1994 out of FTP Search, from Norwegian University of Science and Technology, when then turned into Fast Search &Transfer, or FAST. 29 | AOL Search(1999) bought Web Crawler (one of the major crawler-based engines of it's time) in 1995, and after a number of deals, purchases and exchanges, AOL relaunched their search engine, calling it AOL Search. 30 | 31 | Newer Search Engines 32 | Cuil(2008) was a search engine that arranged pages by content, showing large entries with pictures, and thumbnails for results etc. The search engine claimed to have over 120 billion web pages indexed, and would not store user's search activity or their IP number. 
33 | Secure Search Engines 34 | Ixquick.com(1998) is a metasearch engine that offers a proxy service for Ixquick and an email service that offer privacy protection, called StartMail. It was relaunched in 2005 and included a re-engineered metasearch algorithm. 35 | StartPage(2009) is a secure search engine, meaning it pulls all the same results as Google, but uses the privacy protection of Ixquick, which allows users to search with privacy. 36 | DuckDuckGo(2006) is a search engine that does not store or share any information about the user, and is unique to other search engines by providing all users the same results for a given search term, as well as providing search results from what they describe as the "best sources"rather than from the most sources. 37 | 38 | Specialized Search Engines 39 | Wolfram Alpha(2009) is a "computational knowledge engine"that answers factual queries by computing the answer from externally sourced "curated data"instead of listing relevant websites which could lead to the answer. 40 | Major Non-US Search Engines 41 | Baidu(2000) is one of the main search engines in China, based on a special identification technology that classifies and groups articles. Baidu locates information, products, and services through Chinese language search terms (via phonetic Chinese), advanced searches, snapshots, spell checker, stock quotes, news, images, video, space information, weather, train and flight schedules and other local information. Baidu's greatest competitors are Google Hong Kong and Yahoo! China. 42 | Yandex(1997), originally standing for "yet another indexer,"is the largest search engine in Russia, and ranked as the 4th largest search engine in the world, serving over 150 millions searches per day. 43 | 44 | Local Engines 45 | Yelp(2004), named for the concept "Yellow Pages"began as an email service exchange recommending local business. 
Yelp now is connected to social networking sites and functions as a search engine, where users can access reviews for companies/restaurants/businesses under a specific search/product. Yelp recently announced that it is now powering the Microsoft Bing local search engine results. 46 | Foursquare(2009) is a location-based social networking search engine for mobile devices, utilizing a GPS hardware system where users can search for restaurants/entertainment, etc in their immediate locale and connect with others in the area. 47 | -------------------------------------------------------------------------------- /plots/Figure_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShreshthSaxena/Text-Mining/b2e4b17d258a72702a147e173f9676d0d9e294c1/plots/Figure_1.png -------------------------------------------------------------------------------- /plots/cosine.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShreshthSaxena/Text-Mining/b2e4b17d258a72702a147e173f9676d0d9e294c1/plots/cosine.png -------------------------------------------------------------------------------- /plots/hist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShreshthSaxena/Text-Mining/b2e4b17d258a72702a147e173f9676d0d9e294c1/plots/hist.png -------------------------------------------------------------------------------- /plots/kmeans.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShreshthSaxena/Text-Mining/b2e4b17d258a72702a147e173f9676d0d9e294c1/plots/kmeans.png -------------------------------------------------------------------------------- /to_test/ass1-1019.txt: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ShreshthSaxena/Text-Mining/b2e4b17d258a72702a147e173f9676d0d9e294c1/to_test/ass1-1019.txt -------------------------------------------------------------------------------- /to_test/ass1-1037.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShreshthSaxena/Text-Mining/b2e4b17d258a72702a147e173f9676d0d9e294c1/to_test/ass1-1037.txt -------------------------------------------------------------------------------- /to_test/ass1-1046.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShreshthSaxena/Text-Mining/b2e4b17d258a72702a147e173f9676d0d9e294c1/to_test/ass1-1046.txt -------------------------------------------------------------------------------- /to_test/ass1-1138.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShreshthSaxena/Text-Mining/b2e4b17d258a72702a147e173f9676d0d9e294c1/to_test/ass1-1138.txt -------------------------------------------------------------------------------- /to_test/ass1-1147.txt: -------------------------------------------------------------------------------- 1 | History of Search Engines 2 | 3 | 4 | 5 | 6 | Search Engines are softwares which retrieve information from a specific platform according to the information fed to it. The platform can be a simple local file system on a personal computer, a database inside a corporation, or even something as large as the whole of the World Wide Web. 7 | 8 | 9 | The first recorded search engine was Archie and was created in 1990 by three Computer Science students, Alan Emtage, Bill Heelan and J. Peter Deutsch. It retrieved information from the public FTP servers and created a listing of these data. 
Following the rise of the Gopher protocol in 1991, two new programs called Veronica (Very Easy Rodent-Oriented Net-Wide Index to Computerized Archives) and Jughead (Jonzy’s Universal Gopher Hierarchy Excavation And Display) were launched. As they were the technical successors of Archie, they were named in reference to the characters from the Archie comic series. 10 | 11 | 12 | In 1993, Matthew Gray created the World Wide Web Wanderer, a Perl based system which was used to measure the size of the then-known internet. In the same year, Oscar Nierstrasz wrote W3Catalog. It was the very first search engine that retrieved information from websites, but suffered from the drawbacks of speed. Similarly, Aliweb was released by Martijn Koster, which depended on each website admin registering their webpages to facilitate the search functions. 13 | 14 | 15 | Jonathan Fletcher released JumpStation in the same year, which was the first program to use the three key features of a search engine, namely crawling, indexing and searching. 16 | 17 | 18 | In 1994, WebCrawler was launched which could search for any word in any webpage in the internet. This was the basis on which all future search engines would be built upon. 19 | 20 | 21 | In the next few years, the internet saw the rise of search engines and soon enough there were a number of them which offered free search functions. Notable names among them were Magellan, Excite, Infoseek, Inktomi, Northern Light and AltaVista. 22 | 23 | 24 | In 1994, Yahoo! created its web directory, which it would not use until the launch of its own search engine in 2002. Instead, it presented its search functions for other companies to use. 25 | 26 | 27 | In 1996, Steven Yang and Gary Culliss from MIT began working on a popularity engine, which provided ranked results to users according to their past selections. This was the start of what we see now as rank-based search engines where the results are shown to the user, ranked by relevance. 
In 1997, Ask Jeeves was released which used natural language search. This would later come to be known as Ask.com. 28 | 29 | 30 | In 1998, MSN launches MSN Search, which would later change its name to Bing in 2009. Yahoo! Search came to be powered by Bing technology. 31 | 32 | 33 | In 1997, Google.com came into existence. It would rise to its height in the 2000s. Over the years, it would provide many technological innovations to help it become the most popular search engine in recent times. One of them is PageRank, which used the concepts of iterative ranking algorithm as was described in the paper “Anatomy of a Search Engine” by Sergey Brin and Larry Page. It ranks the web pages based on the number of links to and from it and the ranks of those links, with the assumption that good pages are linked to more good ranked pages. This facilitated the ranked search feature and helped migrate more users to its platform. Also, one of its main advantages was its minimalistic interface. 34 | 35 | 36 | Thus, search engines have risen and provided a helpful interface for users to search whatever they need in the internet. Now, more new technologies are getting popularized such as semantic searching which searches according to context. They are also being used to provide advertisements to users. Many companies are analysing search data and providing targeted marketing. All in all, search engines have made navigating the internet so much easier and hassle-free. -------------------------------------------------------------------------------- /to_test/ass1-202.txt: -------------------------------------------------------------------------------- 1 | The History of Search Engine 2 | 3 | The objective of all the search engines is to find and organize scattered data found on the Internet. Before the development of search engines, the Internet was merely a collection of File Transfer Protocol (FTP) sites. At that time the users could only navigate to find specific shared files. 
But by the addition of more web servers, the Internet grew, and the World Wide Web became the medium for accessing the Internet, and due to the presence of huge amount of data the need for finding and organizing the distributed data files on FTP web servers grew. Search engines started coming up to help the users navigate the web and files on the internet. 4 | 5 | A search engine is a software that helps its users, to retrieve any information from the World Wide Web. A user enters key phrases or keywords into a search engine and in result receives a list of Web content which is in the form of websites, videos, images etc. Basically, all the modern search engines have the following four parts. 6 | 7 | 1. Crawling: An automated bot or program scans a website and collects details about every page. 8 | 9 | 2. Indexing: The data collected from crawling is then processed and placed in a database. 10 | 11 | 3. Retrieval: The third step is when a search engine processes the search query and gives the most relevant pages. 12 | 13 | 4. Ranking: The final step involves ranking the most relevant pages for a search query. Search engines use a different ranking algorithm to show pages. 14 | 15 | In 1945 when Vannevar Bush published an article in The Atlantic Monthly, the need for the search engine was started. He emphasized the necessity for an expansive index for all knowledge. He urged scientists to work together to help build a body of knowledge for all mankind. He has also proposed the idea of a fast, reliable, extensible, virtually limitless, associative memory storage and retrieval system. He named this device a memex. 16 | 17 | Archie: It was the first search engine which started in the 1980s. It searched FTP (File Transfer Protocol) sites to create the index of downloadable files. Due to limited space, only the listings were available and not for the contents for each site. 
18 | 19 | Lycos: It was created as a university project in 1993; it was the first to attain commercial search engine success. In addition to providing ranked relevance retrieval, Lycos has some additional features like prefix matching and word proximity bonuses. It currently comprises a social network with email, web hosting, and media entertainment pages. 20 | 21 | Excite: It was created in 1993 by Stanford University students. The students had the idea of using statistical analysis of word relationships to make searching more efficient and improve the relevancy of searches on the Internet. 22 | 23 | WebCrawler: It was created by Brian Pinkerton in 1994. WebCrawler was the first crawler which indexed complete pages online. 24 | 25 | Yahoo: It was started at Stanford University by Jerry Yang and David Filo in 1994. It became a web portal and search engine. It started out as a listing of their favorite Web sites. What made it different was that each entry, in addition to the URL, also had a description of the page. 26 | 27 | Google: It was started by Larry Page and Sergey Brin in 1997. Page and Brin believed that a website should be ranked on the basis of the number of times the search terms appeared on the web page. Hence it was based on relevancy ranking. 28 | 29 | After that, many more search engines followed, such as MSN Search by Microsoft in 1998, AllTheWeb in 1999, StartPage (2009), and many more.
30 | -------------------------------------------------------------------------------- /to_test/ass1-211.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShreshthSaxena/Text-Mining/b2e4b17d258a72702a147e173f9676d0d9e294c1/to_test/ass1-211.txt -------------------------------------------------------------------------------- /to_test/ass1-321.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShreshthSaxena/Text-Mining/b2e4b17d258a72702a147e173f9676d0d9e294c1/to_test/ass1-321.txt -------------------------------------------------------------------------------- /to_test/ass1-440.txt: -------------------------------------------------------------------------------- 1 | SEARCH ENGINE 2 | 3 | Search Engine , a service for user queries which was given via world wide web. Search Engine searches for paticulars, words or characters specified by the internet user, to the corresponding database, that used for discovering sites related to the user query on the WWW. 4 | Search engines consist of following processes: 5 | 1)Information retrieval (IR): retrieving specific information from stored data. 6 | 2)Web crawling: browses the world wide web in a systematic, computerized manner 7 | 3)Indexing: analysis of pages by titles, sub titles, headings and specific area. This is the fastest form of search. 8 | 9 | 10 | HISTORY 11 | 12 | Archie, first engine or tool used for searching on the Internet, was created in 1990 by Alan Emtage, Bill Heelan and J. Peter Deutsch, students of computer science at McGill University in Montreal. Functionality of archie:directory listings of all the files located on public anonymous FTP sites, creating a searchable database, however Archie did not use indexing. 
13 | 14 | Next engine come to the picture in 1991, which was Gopher, created by Mark McCahill at the University of Minnesota, guide to two new search programs, Veronica and Jughead. Similar to Archie, Veronica and Jughead had searched the filenames, headings and titles stored in Gopher index systems. Veronica provided a keyword search of most Gopher menu tables in the entire Gopher listings. Jughead Excavation and Display, a mechanism used for obtaining menu information from specific Gopher servers. 15 | 16 | W3Catalog, the web's first primitive search engine, released on September 2, 1993. 17 | Matthew Gray produced the first web robot in june 1993 at MIT, the Perl-based WWW Wanderer, and used it to make an index called 'Wandex'. Upto 1995, Wanderer purpose was to measure the size of World Wide Web. 18 | 19 | Lately in 1993, the web second search engine Aliweb come into sight in November 1993. Some of the sights considered Aliweb as the first Search Engine because former search engines(Archie, Veronica, Jughead) were infact just indexers. A facility provided to the user by Aliweb is that user can give in the location information of the index files which in turn outputted 20 | the user-written page interpretation and keywords and webpages. Aliweb was not used broadly. 21 | 22 | In December 1993, Jump Station used web form as an interface for the queries and used web robot to detect web pages. 23 | Jump Station was the first WWW resource-discovery device to combine the three required features of a web search engine that are crawling, indexing, and searching. The crawler experienced that J.S. indexing and its searching were bounded to the titles and headings found on the web pages, as few resources were reachable on the platform it ran. 24 | 25 | 26 | In 1994, the first completely text based crawler search engines was Web Crawler. dissimilar to its predecessors, it let users search for any word in any webpage. It was the first one to be broadly known by the public. 
Also in 1994, Lycos was started and became a major commercial attempt. After that, many search engines appeared and compete for popularity. 27 | 28 | At the time Yahoo! was the most popular manner for people to find web pages of attentiveness, but its search function worked/operated on its web directory, instead full-text copies of web pages. Information aspirant could also browse the directory instead of doing a keyword-based search. 29 | In 1996, Netscape was looking to give a single search engine an unique deal to the search engine for featuring on Netscape's web browser. There was so much interest that a deal was finalised with Netscape by five of the major search engines, where for $5Million per year each search engine would be in a rotation on the Netscape search engine page which costed for $5Million per year. The five engines were Yahoo!, Magellan, Lycos, Infoseek, and Excite. 30 | Many search engine companies were caught up in the .com fantasy, a market speculation resonate that peaked in 1999 and ceased in 2001. 31 | 32 | Around 2000, Google's search engine rose to greatness. The Google brought about effective results for many searches with an innovation known as PageRank. This repetiton algorithm ranks web pages based on the number and PageRank of other websites and pages that link there. Google too kept a minimal and essential interface to its search engine. In contrast, many of its challenger rooted a search engine in the web portal. 33 | 34 | Past 2000, Yahoo was providing search services based on Inktomi 's search engine. Yahoo! acquired Inktomi in 2002 and Overture (which owned AlltheWeb and AltaVista) in 2003. Yahoo! changed to Google's search engine until 2004, when it launched its own search engine based on the combined technologies of its making. 35 | 36 | In 1998, Microsoft first launched MSN Search using search results from Inktomi. 
In 1999 the results were listed from the combined results of Looksmart and Inktomi, except for a short span of time in 1999 when AltaVista was also used to get the results. But in 2004, Microsoft's own web crawler (called msnbot) was used for its own search technology. Bing, Microsoft's rebranded search engine, was launched on June 1, 2009. On July 29, 2009, Yahoo! and Microsoft entered into a collaboration in which Microsoft's Bing technology would be used for Yahoo Search. 37 | 38 | There are many more; every year search engines are either evolved or improved. 39 | --Bing launches Social Sidebar where users see search results through the lens of their 40 | social networks. 41 | --In 2015, Bing releases its own mobile-friendly algorithm update. 42 | --Google is improving its functionalities by using machine learning etc -------------------------------------------------------------------------------- /to_test/ass1-505.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShreshthSaxena/Text-Mining/b2e4b17d258a72702a147e173f9676d0d9e294c1/to_test/ass1-505.txt -------------------------------------------------------------------------------- /to_test/ass1-532.txt: -------------------------------------------------------------------------------- 1 | HISTORY OF SEARCH ENGINES 2 | 3 | 4 | For information retrieval, Search Engines are used as a tool on the web. To get relevant information from unnecessary data, Search Engines are needed. They help people to find information quickly in a short span of time. 5 | A search engine is a software program that works on keyword matching, i.e. it searches the internet based on the words that you consign as search terms. 6 | 7 | 8 | FIRST SEARCH ENGINE: 9 | Archie - 1990 by Alan Emtage, Bill Heelan and J. Peter Deutsch 10 | It works on the basis of a file’s name, i.e. it is used to search for File Transfer Protocol (FTP) files. 11 | Archie failed to index the contents of files.
12 | 13 | 14 | Gopher - 1991 by Mark McCahill 15 | It was a type of system based on Menu. All you have to do is select the item which is required to you from the menu. 16 | 17 | 18 | Veronica and Jughead - 1991 19 | First text based Search Engine. 20 | Veronica (Very Easy Rodent-Oriented Net-wide index to Computerized Archives). It provides keyword searching which are present in the Gopher listings. 21 | 22 | 23 | Jughead (Jonzy’s Universal Gopher Hierarchy Excavation and Displays). It gives menu information from specific Gophers servers. 24 | 25 | 26 | W3Catalog and Wanderer - 1993 by Oscar Nierstrasz 27 | To display images in line with text, Mosaic was introduced and was the first Graphical Web Browser. 28 | W3Catalog: was one of the first Search Engines that attempted to provide a general searchable catalog foe www resources. 29 | 30 | 31 | Wanderer: First web robot which provides an index called wandex. It is a perl based web crawler. 32 | 33 | 34 | Aliweb -1993 35 | Second search engine 36 | Users are allowed to submit the locations of index file to include web pages and can add user written page description and keywords 37 | 38 | 39 | JumpStation - 1993 40 | It combines the three essential features of a web search engine(crawling, indexing , and searching) and thus, was the first www resource discovery tool. To search web pages and to build its index, JS used a web robot. 41 | 42 | 43 | Web Crawler - 1994 by Brian Pinkerton 44 | It is a web service but at first it was a desktop application. It was the first web search engine to enable full text search. 45 | 46 | 47 | MetaCrawler - 1995 48 | It has its own search syntax. Rather than a single search engine algorithm , it provides a multiple search engine. 49 | 50 | 51 | Alta Vista - 1995 52 | It was the fastest search engine and could manage tons of hits a day without any deterioration. 
53 | 54 | 55 | Excite - 1995 56 | It is an internet portal and one of the major ”dotcom-portals” 57 | 58 | 59 | Dogpile and Hotbot - 1996 60 | Dogpile: was a meta search site. It uses the search results of multiple search engine and before presenting to the user , it filters the duplicates. 61 | 62 | 63 | HotBot: The first search engine which offers the facility to search within search results. 64 | 65 | 66 | Ask Jeeves - (1996-97) : Garrett Gruener and David Warthen 67 | The concept is to provide answer to the user on the everyday basis that is users can able to get the answers of questions posted everyday. 68 | 69 | 70 | Google - 1998 71 | Google employs PageRank Algorithm which lead to successful regime. This algorithm maintain a web page ranking which matches with a given search string. Google along with indexes and caches web pages using snapshots of other file types for eg : PDF and word documents. 72 | 73 | 74 | Yahoo! Search - 2004 by David Filo and Jerry Yang 75 | Yahoo! Began as a web directory of favourable web pages, each including a man-made description in its URL. Even though Yahoo! owned multiple search engines, but kept using Google search engine for its results. 76 | 77 | 78 | WikiSeek and Guruji(2006-07) 79 | WikiSeek: It is a type of search engine which indulge with the idea of wikipedia , that is it indexed wikipedia pages only. 80 | 81 | 82 | Guruji: This search engine was for the indian users providing information in the context of India. 83 | 84 | 85 | Bing -2009 : 86 | It is designed by Microsoft. It was started as MSN search by Microsoft which used search results from Inktomi and Outsource. Later MSN Search was replaced by Bing. Bing has taken over Yahoo! Search product as well. 
-------------------------------------------------------------------------------- /to_test/ass1-541.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShreshthSaxena/Text-Mining/b2e4b17d258a72702a147e173f9676d0d9e294c1/to_test/ass1-541.txt -------------------------------------------------------------------------------- /to_test/ass1-606.txt: -------------------------------------------------------------------------------- 1 | HISTORY OF SEARCH ENGINE 2 | 3 | Before knowing the history of search engine we must understand, what really search engine is? So, the search is a 4 | program where you place your query/problem and it provides you with desired solution by looking at 5 | database/servers. 6 | 7 | 8 | It all started in 1990, when the first search engine ARCHIE was invented by Alan Emtage, Bill Heelan and J. Peter 9 | Deutsch. Archie was just capable of downloading the directory listings of all the files located on File Transfer 10 | Protocol sites and it just provided the user with the filename not the content. 11 | 12 | 13 | In 1993, JUMPSTATION, the first WWW search engine came into picture. It used the titles and headings of the 14 | documents to index the web pages. It used linear search to find the web pages. It searched results on the basis of 15 | input provided by the user and output the listing of urls that matches the keywords of the input. 16 | 17 | Then, YAHOO came in 1994 which was invented by Jerry Yang and David Filo. The site was the first collection of 18 | web pages across the internet. They include Man-made descriptions for the URLs. For Site owners it was free to 19 | add Informational sites, but commercial sites it was paid. 20 | 21 | 22 | In the same year when Yahoo was launched two more search engine was also launched WebCrawler and Excite. 23 | 24 | WebCrawler was the first search engine that indexed whole page means full text search. 
Excite bought 25 | WebCrawler in 1996. It provided features like instant messaging, weather reports, news reports and metasearch. 26 | 27 | 28 | Lycos search engine was also established in 1995. It enjoyed several years of growth and became the most visited 29 | site in the world. Till 1998, Lycos was the 30 | largest search engine. 31 | 32 | 33 | After Lycos came the Askjeeves search engine. In this search engine humans answered the query that was 34 | posted by the user. Now, it is known by the name Ask.com. Facing immense competition from other search 35 | engines like Google, Ask.com launched a Q&A community for generating answers from real people. 36 | 37 | 38 | In 1998, Google was launched; it was invented by Larry Page and Sergey Brin. Both Page and Brin developed the 39 | PageRank algorithm. Convinced that the pages with the most links to them from other highly relevant Web pages 40 | must be the most relevant pages associated with the search, Page and Brin tested their thesis as part of their 41 | studies, and laid the foundation for their search engine. Now, Google is the most widely used web based search 42 | engine. Since 1998, Google has never looked back; it is now the most widely used search engine. After the PageRank 43 | algorithm, Google used the Panda algorithm to reward high-quality websites and diminish the presence of low-quality 44 | websites in Google’s organic search engine results. After that, Google again updated their algorithm to the Penguin 45 | algorithm, which was an improved version of Panda. 46 | 47 | 48 | Microsoft launched their first search engine, named MSN Search, using search results. It consisted of a web 49 | crawler, index, and search engine. After that, MSN Search launched an improved version which displayed listings 50 | from Looksmart with the results from Inktomi, but for a short time after that the results from AltaVista were used. 51 | Now, it is known by the name Bing search engine, which is developed in Asp.net.
52 | 53 | -------------------------------------------------------------------------------- /to_test/ass1-743.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShreshthSaxena/Text-Mining/b2e4b17d258a72702a147e173f9676d0d9e294c1/to_test/ass1-743.txt -------------------------------------------------------------------------------- /to_test/ass1-817.txt: -------------------------------------------------------------------------------- 1 | 2 | History of Search Engines 3 | 4 | 5 | What is a Search Engine? 6 | 7 | Search Engine is any online tool or facility which is on the web that allows to scan through a specific domain of data (usually the WWW) and displays the most appropriate matches that appear according to the provided keywords or queries. 8 | These tools or software system are majorly written in Python, C, C++ languages. 9 | 10 | How and why did these systems originate? 11 | 12 | The first search engines appeared and were brought into usage in the 1990s. 13 | These initial systems used to search FTP sites and created an indexed downloadable files which can later be seen through to get the query results. 14 | 15 | Major reason for coming up of these systems was the rapid increase in the amount of data available on the web. Ease of storage and growth of the data sector. Thus to avail the required information from the web in efficient time without requiring extensive manual look-about, was the need of the hour. 16 | 17 | Evolution of the Search Engines.. 18 | 19 | 1990 20 | FTP sites were the target. 21 | Downloadable indexed listings were created. 22 | Due to less storage available only the listings were available and not the content. 23 | 24 | 1991 25 | Coming up of WWW virtual library(VLib), hosted by CERN. 26 | CERN web servers hosted a list of web servers- initial internet. 27 | 1993 28 | Coming up of first search engines which resembled the modern search engines. 
29 | Primitive web searches stated to appear in the desired and more informative patterns and formats. 30 | Information about the pages required by the user. 31 | Redirection to the required page. 32 | Indexed titles and URLs started to appear as the query results. 33 | Example- RBSE Spider also had a rating system. 34 | 35 | 1994 36 | 37 | Coming up of yahoo! 38 | Wait time for the results was still very high. 39 | Service for commercial entities was a little expensive. 40 | results had man made descriptions with the URL of the searched pages. 41 | 42 | Web Crawler came out. 43 | The oldest web search engine which is still active as it was, with the systems and procedures intact even today. 44 | This system indexed the entire pages. 45 | 46 | 1998 47 | 48 | The landmark year in the journey of search engines. 49 | 50 | MSN search launched 51 | This system worked extensively on the user experience and the outlay of the information search in lesser time. 52 | 53 | Google 54 | Revolutionised the industry of sears engines by highly improving the user experience and the result outputs by working extensively on the backlink models. 55 | 56 | 2000-2018 57 | Business in the sector boomed with improvement in technology. 58 | Various successful search engines came out. 59 | Competition in the sector is still there, however google remains on the top in its class. 60 | 61 | Algorithmic journey of google.. 62 | 63 | Most popular search engine since its inception. 64 | Algorithm advanced with time. 65 | 66 | Initially the search results were highly objective and static. 67 | Coming up of social media highly influenced the relevancy algorithm by providing an insight on the jargons and meaning thereof. 68 | Roping in of inbound links and connections in the pages which were the result of initial searches. 69 | Adding more contents and improved data sets. 70 | Judging websites according to their authority. 
71 | 72 | All these advancements helped Google to improve the algorithm each day and made the competitors remain far behind. 73 | 74 | Best Search Engines in their class, according to me.. 75 | 76 | Google - For speed and popularity of the content. 77 | Yahoo - Email related searches and associated activity. 78 | Ask - Question answering kind of sessions 79 | AOL - wide range of websites connected.. better for linked searches. 80 | Wolfram - Searches amalgamated with Maths. 81 | DuckDuckGo - Privacy, no tracking. -------------------------------------------------------------------------------- /to_test/ass1-826.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShreshthSaxena/Text-Mining/b2e4b17d258a72702a147e173f9676d0d9e294c1/to_test/ass1-826.txt -------------------------------------------------------------------------------- /to_test/ass1-909.txt: -------------------------------------------------------------------------------- 1 | Text Analytics Assignment 1 2 | HISTORY OF SEARCH ENGINES 3 | Introduction: Search Engines 4 | To begin with, a search engine is a tool or a service that users can use to search for content that they require. Simply, the user enters or commands through voice certain keywords or phrases to receive a list of data available online such as documents, books, images, etc. 5 | 6 | 7 | To attain this simple task a search engine has two components : a spider or crawler and an algorithm. But before reaching this design, many search engine designs were tried. Earlier search engines relied on page contents largely, but as new algorithms and technologies like machine learning came into being, search results now returned are highly variable in nature. 8 | The Pre-Web Search Engines : 1990-1991 9 | Archie 10 | It has been described as the pre-web search engine. Created by Alan Emtage, Bill Heelan and J. Peter Deutsch.
11 | Search Type : It searched FTP Sites to create an index of all the files downloaded. It had a space issue. Hence, the search was entirely based on list headers rather than the page content. 12 | Veronica and Jughead 13 | After Archie, two new search engines came into being : Veronica and Jughead. These two were based on Gopher Indexed System. 14 | Search Type : VERONICA(Very Easy Rodent-Oriented NetWide Index to Computerized Archives) used Gopher’s menu titles. The keywords entered were matched with words in Gopher’s entire listing. 15 | Meanwhile, Jughead was used to obtain information from a specific server. 16 | Developing World Wide Web Virtual Library - 1992 17 | Tim Berner Lee had setup a Virtual Library (VLib). This library was hosted on CERN’s Web Server. It was a loose collection of topic-wise link-list. Each one maintained by the topic-expert. The CERN’s Web Servers contained listing of web server. 18 | Earlier Search Engines : 1993 19 | First Web Robot : World Wide Web Wanderer 20 | Created by Matthew Gray, it was only a robot meant to create index of the world wide web. It was not meant for search. The technology was developed in Perl. 21 | First Web Search Engine : W3 Catalog 22 | Oscar Nierstrasz released W3Catalog. 23 | Search type : It used a high quality existing list of websites instead of crawlers and indexes. But, a major performance degradation was caused when the bot accessed same pages hundreds of time in same day. 24 | AllWeb 25 | Contributed by Martijn koster. 26 | Search Type : No web robots used. Rather the system was notified by web-admin of the site’s existence on an index file. 27 | Other Primitive Searches 28 | 1. JumpStation was created by Jonathon Fletcher. 29 | Search Type : Linear Search on the resource directory containing headers. Contained crawler, indexes and search feature. 30 | 1. World Wide Web Worm searched on indexes titles and URLs .No RANK system used. 31 | 2. 
RBSE Spider : it used exact title matches and had a rating system. 32 | New Web Search Engines : 1994 onwards 33 | 1994-2000 34 | 1. 1994 35 | 1. Infoseek : a page could be submitted in real time. 36 | 2. Web Crawler : Created by Brian Pinkerton. Search Type : Search for any word. 37 | 3. Yahoo ! : A utility to other search engines. Yahoo is a web directory. 38 | 4. Lycos : Designed by Michael Loren Mauldin. A simple search engine research project. 39 | 1. 1995 40 | 1. LookSmart : A web directory. 41 | 2. AltaVista : Search Type : Allows Natural Language and gives search tips. Domains can be added and deleted within 24 hours. 42 | 1. 1996 43 | 1. BackRub : Created by Larry Page and Sergey Brin. It is a web crawler. Search Type : Used backlinks to search. And pages were ranked based on citation notation. 44 | 2. HotBot : An engine by Inktomi. 45 | 3. Popular Engine: A version of Direct Hit Technologies : Gary Culliss and Steven Yang designed it. This search engine ranks pages/results based on selections previously made. 46 | 1. 1997 47 | 1. Ask Jeeves : Search Type : Based on natural language. Ranks documents on popularity. 48 | 2. Google.com is registered. Based on BACKLINK Model. 49 | 1. 1998 50 | 1. Overture : Formerly Goto.com. Provided pay per click service. 51 | 2. MSN Search/Bing : Search Type : search results generated by Overture, Looksmart, and Inktomi. 52 | 3. Popular Engine: Direct Hit Technologies in partnership with Hotbot released it. Results were based on prior user searches. 53 | 1. 1999 54 | 1. AlltheWeb : Sleek Interface with Advanced features Based on FTP Search project of Tor Egge. 55 | 2000-2006 56 | 1. 2004 57 | 1. MSN Search : Microsoft uses its own indexer instead of Inktomi and Looksmart. 58 | 2. Google Suggest is launched as Google Labs. 59 | 1. 2005 60 | 1. Webmaster’s Collectively introduce Nofollow attribute to combat spam link. 61 | 2. Snap : Created by Bill Gross. 
Allows many features such as : display of search volumes, sophisticated auto-completion etc. 62 | 2006-2012 63 | 1. 2006-9 64 | 1. Wikia Search : By Wikia. Search Type :based on human curation. 65 | 1. 2008 66 | 1. Cuil : Search Type : uses picture thumbnails to display search results 67 | 1. 2009 68 | 1. Search Algorithm Update: Caffeine: can crawl fast, index expansion becomes easier and ranking and indexing can be maintained in real time. 69 | 1. 2010 70 | 1. Google Instant or search-before-you-type feature: Google tries to predict the query user is going to enter. 71 | 2. Blekko : Search Type : slashtags for targeted search. 72 | 1. 2011 73 | 1. Web Master Tool : Schema.org : A joint venture to provide targeted search using tag system. 74 | 2. Web Search Algorithm Update : Google Panda an algorithm update to work on cracking down of spam, scrapers, and websites with a high ad-to-content ratio. 75 | 2012-2018 76 | 1. 2012 77 | 1. Search Plus Your World : Google Tool. Uses integration of social data into search. 78 | 2. Google Penguin: Algorithm Update for handling Webspam . 79 | 3. Side Bar : Bing’s Update to use users' social networks for information relevant to the search query. 80 | 4. Google’s Search Algorithm Update: Using Knowledge Graph technique to store semantic relationships between objects. 81 | 5. Google Hummingbird :A core algorithm using knowledge graphs. 82 | 1. 2015 83 | 1. MobileGeddon : Google released an update to generate mobile friendly pages. 84 | 2. RankBrain : Google uses machine learning to rank pages. 85 | 1. 2016 86 | 1. Penguin Algorithm goes real time. 87 | 1. 2017 88 | 1. Fred: Google’s update to punish sites having poor backlinks . 
-------------------------------------------------------------------------------- /to_test/ass1_1349.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShreshthSaxena/Text-Mining/b2e4b17d258a72702a147e173f9676d0d9e294c1/to_test/ass1_1349.txt -------------------------------------------------------------------------------- /to_test/ass1_422.txt: -------------------------------------------------------------------------------- 1 | Web Search Engines are software programs that searches the Internet based on the words that you designate as search terms. Search engines look through their own databases of information in order to find what it is that you are looking for. Web Search Engines are a good example for massively sized Information Retrieval Systems. 2 | During the early development of the web, there was a list of web servers edited by Tim Berners-Lee and hosted on the CERN webserver. As more web servers went online the central list could not keep up. On the NCSA site new servers were announced under the title "What's New!". The very first tool used for searching on the Internet was Archie.] The name stands for "archive" without the "v". It was created in 1990 by Alan Emtage, Bill Heelan and J. Peter Deutsch, computer science students at McGill University in Montreal. The program downloaded the directory listings of all the files located on public anonymous FTP(File Transfer Protocol) sites, creating a searchable database of filenames; however, Archie did not index the contents of these sites since the amount of data was so limited it could be readily searched manually. The rise of Gopher (created in 1991 by Mark McCahill at the University of Minnesota) led to two new search programs, Veronica and Jughead. Like Archie, they searched the filenames and titles stored in Gopher index systems. 
Veronica(Very Easy Rodent-Oriented Net-wide Index to Computerized Archives) provided a key word search of most Gopher menu titles in the entire Gopher listings. Jughead (Jonzy's Universal Gopher Hierarchy Excavation and Display) was a tool for obtaining menu information from specific Gopher servers. While the name of the search engine "Archie" was not a reference to the Archie comic book series, "Veronica" and "Jughead" are characters in the series, thus referencing their predecessor. In the summer of 1993, no search engine existed yet for the web, though numerous specialized catalogues were maintained by hand. Oscar Nierstrasz at the University of Geneva wrote a series of Perl scripts that would periodically mirror these pages and rewrite the min to a standard format which formed the basis for W3Catalog, the web's first primitive search engine, released on September2, 1993. In June 1993, Matthew Gray, then at MIT, produced what was probably the first web robot, the Perl based World Wide Web Wanderer, and used it to generate an index called 'Wandex'. The purpose of the Wanderer was to measure the size of the World Wide Web, which it did until late 1995. The web's second search engine Aliweb appeared in November 1993. Aliweb did not use a web robot, but instead depended on being notified by website administrators of the existence at each site of an index file in a particular format. Jump Station (released in December 1993) used a web robot to find web pages and to build its index, and used a web form as the interface to its query program. It was thus the first WWW resource-discovery tool to combine the three essential features of a web search engine (crawling, indexing, and searching) as described below. Because of the limited resources available on the platform on which it ran, its indexing and hence searching were limited to the titles and headings found in the web pages the crawler encountered. 
One of the first "full text" crawler-based search engines was WebCrawler, which came out in 1994. Unlike its predecessors, it let users search for any word in any webpage, which has become the standard for all major search engines since. It was also the first one to be widely known by the public. Also in 1994, Lycos (which started at Carnegie Mellon University) was launched and became a major commercial endeavor. Soon after, many search engines appeared and vied for popularity. These included Magellan (search engine), Excite, Infoseek, Inktomi, Northern Light, and AltaVista. Yahoo! was among the most popular ways for people to find web pages of interest, but its search function operated on its web directory, rather than full-text copies of web pages. Information seekers could also browse the directory instead of doing a keyword-based search. In 1996, Netscape was looking to give a single search engine an exclusive deal to be the featured search engine on Netscape's web browser. There was so much interest that instead a deal was struck with Netscape by five of the major search engines, where for $5Million per year each search engine would be in a rotation on the Netscape search engine page. The five engines were Yahoo!, Magellan, Lycos, Infoseek, and Excite. Search engines were also known as some of the brightest stars in the Internet investing frenzy that occurred in the late 1990s. 3 | 4 | -------------------------------------------------------------------------------- /to_test/ass1_734.txt: -------------------------------------------------------------------------------- 1 | History of web search engines-: 2 | The beginning of it all-: 3 | The need for search engine first surfaced in July 1945. Vannevar Bush published an article "As We May Think" in Atlantic monthly. 
Few lines from his article were -: "The difficulty seems to be, not so much that we publish unduly in view of the extent and variety of present-day interests, but rather that publication has been extended far beyond our present ability to make real use of the record" which shed light on a greater issue i.e. we need an index to be built on the data we have as the data is growing so much so we are unable to make real use of it. So he proposed the idea of a system called Memex. Memex is a blend of words memory and index and is essentially a device in which people would compress and store their books, communications and records, it is done in such a manner that it will allow easy, fast and flexible access to content in it and this idea of memex influenced the development of early hypertext systems and eventually lead to creation of WWW(world wide web). 4 | Then came the era of the father of information retrieval systems Gerard Salton (1960s - 1990s), his work greatly contributed towards the development of modern web search technologies. He and his team at Cornell and Harvard developed the SMART(Salton’s Magic Automatic Retriever of Text). SMART exhibited the concepts like Term Frequency(TF), vector space model, term discrimination values, Inverse Document Frequency(IDF) and relevancy feedback mechanisms. He also wrote a book called A theory of Indexing which explains many of his tests upon which search is still largely based 5 | The first Search Engine-: 6 | Archie was the first web search engine to surface in 1990 created by Alan Emtage, a student at McGill University in Montreal. The initial name was "archives," but it was shortened to Archie.It was a tool to index FTP archives and allowed people to find specific files, it made use of a script based data gatherer and a regular expression matcher for finding file names queried by a user. 
7 | 8 | 9 | Before WWW files were shared using FTP but it was only effective for small groups and then Tim Berner Lee introduced WWW and created the virtual library which is a catalogue of the web. 10 | 11 | 12 | Archie gained popularity and University of Nevada System Computing Services developed Veronica. Veronica worked on plain text files which were sent via Gopher an alternative to Archie. Another user interface appeared and it was Jughead which performed the same task as Veronica but only searched a single server at a time. 13 | 14 | 15 | World Wide Web Wanderer - The web’s first bot which was created by Matthew Gray in 1993. At first, he wanted to measure the growth of the web by counting the active web servers but he upgraded the bot to capture actual URLs and the database he created was called Wandex but the problem with this bot was the fact that it caused lag because it was accessing same pages hundred of times a day. Though he fixed the bot but people became sceptical of bots and started questioning their value. 16 | 17 | 18 | Around december 1993 there were three bot based web search engine -: Jumpstart, World Wide Web Worm and RBSE(Repository-Based Software Engineering) spider. Jumpstart collected the info about a page’s title and header using a simple linear search which eventually lead to its downfall as web grew. The WWW Worm indexed titles and URLs but it showed the results in order it found them without any ranking as it was the case with jumpstart. This problem was took care of in RBSE spider as it had a rating system but the problem with these all was the fact that you only get results if you know the exact name of what you were looking for. 19 | Then in 1994 came the Infoseek(webmasters could submit pages to it in realtime and became default search engine for Netscape.) a popular search engine which came around in 1994.. 
20 | The ElNet Galaxy was a web directory and it was organized in a similar manner to how today’s web directory are organized and it utilized different web features but the web size in 1994 doesn’t called for a web directory even then soon more web directories were followed. 21 | In may 1994 ALIWEB(Archie-Like indexing of web) was launched and it crawled meta info and allowed users to submit there pages they wanted indexed along with a description what this meant was there was no need for a bot which collects data and no excessive usage of bandwidth but the thing was users didn’t know how to submit their sites. 22 | Then came the Yahoo Directory created by David Filo and Jerry Yang in april 1994 and it was a collection of their favorite web pages but increasing number of pages motivated them to become searchable directory. It also included a man-made description associated with each URL. They started to add informational sites for free but commercial sites were charged. But by 2014 end Yahoo Directory was closed. 23 | WebCrawler was the first crawler that indexed whole pages and gain too much popularity that it cannot be used during day time. It was followed by other search engine such as Lycos(went public with a huge catalog of 54k, and used ranked retrieval, prefix matching and word proximity), LookSmart(gave tough competition to Yahoo!), Excite. 24 | Then AltaVista was launched and it offered many new features such as unlimited bandwidth(first one to do so), first one to allow for natural language queries, advanced search techniques, users can add or delete their own URL within 24 hrs and it also provided search tips. 25 | 26 | 27 | The Dawn of Modern Web Search- Google(initially BackRub) 28 | In 1996, Larry page and Sergey Brin students at Stanford University developed a search algorithm- BackRub named so because of it uses BackLinks(the incoming links to a web page) to rank its search results and this ranking scheme was called Citation notation. 
In their PageRank algorithm all incoming links to a web page counted as vote towards the reliability of that page but some links counted more than other based on certain criteria. The modern name for this type of search is Social Search. 29 | Between 1996 BackRub and launch of Google in 1998 the other search engines that surfaced were Inktomi:HotBot (used paid inclusion model), AskJeeves (natural language search engine which used human editors to match search queries, used clustering to find local web communities), MSN Search. 30 | 31 | 32 | Then there were other search engines which were launched after launch of Google like AllTheWeb, Teoma, Snap(too complicated for general web surfer),Cuil, LiveSearch(by microsoft),Bing(a rebranding of LiveSearch/msn), Schema(created in collaboration of Google, Microsoft, Yahoo). 33 | During this period Google changed SERP(search engine results page) with new features like news, videos, images, local and others. It released “Google Suggest”- dropdown of suggested topic related to search. It introduced new web indexing system named Caffeine which provided 50% fresher search results. Updates likes Google Instant(real-time search results), Panda(major change in ranking algorithm),Penguin, Hummingbird(first search algo that has ability to parse intent behind a query not just the language), Pigeon, Mobilegeddon,RankBrain(google revealed that machine learning has played important role in ranking algo), Possum. 34 | Google in 2014 started prioritizing websites which uses HTTPS which was seen as google effort towards web security. 35 | Google is still dominating the market with approx 90.46% and other search engine like bing(3.13%),Yahoo!(2.21%) and etc. These statistics show how Google has continuously dominated and revolutionized modern web search. 
-------------------------------------------------------------------------------- /to_test/ass1_808.txt: -------------------------------------------------------------------------------- 1 | ASSIGNMENT 1 - HISTORY OF SEARCH ENGINES 2 | 3 | Beginning of the development of Search Engines 4 | It was first noted in 1945 when an American engineer and scientist Vannevar Bush published in the magazine, The Atlantic Monthly, stating the importance for an “expansive index of all knowledge”. This marked the beginning of the development of Search engines that aims to find and organise huge amount of data. 5 | 6 | Search Engines before the web 7 | It was only in 1990 that this kind of index was made a reality with the development of the very first search engine called ‘Archie’ created by Alan Emtage, a student at McGrill University. The name stands for “archive” without “v”. Usually, location of the server and the name, location of the file that is to be accessed were required to retrieve information from the FTP servers. Archie searched for FTP sites to create index of downloadable files but due to limited space, only a searchable list of file names were available so it could search using the file name and not through the file content. 8 | 9 | Archie indexed files on FTP servers, Gopher created in 1991 by Mark McCahill indexed on plain text documents. Based on titles and descriptions of the document, manual traversal through a series of menus and submenus were required to get information from Gopher. 10 | Then in 1992, ‘Veronica’ and ‘Jughead’ were developed to search files stored in Gopher indexed systems. Veronica (1992) which meant ‘Very Easy Rodent-Oriented Net-wide Index to Computer Archives’ enabled keyword search for Gopher menus and submenus present in the entire Gopher system. 
Jughead (1993) which meant Jonzy's Universal Gopher Hierarchy Excavation and Display also searched file names (menus and submenus) in Gopher index systems but only searched a single server at a time. 11 | But all these had one major common limitation. These entire search engines were only able to search based on titles and file names, and not on the content of the file. 12 | 13 | Search Engines after the Web 14 | When the world wide web was created in 1991, there were no search engines developed to work fine over HTTP. Tim Berners Lee manually created and updated a directory of all web servers ( World wide web virtual library). It helped users find URLs for different websites. The CERN web servers hosted it at the beginning of the internet. 15 | But when the web grew to the extent that manual maintenance of directories were not possible, the need for search engine became inescapable. The requirements that were needed to be met were: 16 | ● Content discovery 17 | ● Content indexing 18 | ● Search 19 | 20 | First web Search Engine - Searchable manual directories 21 | W3Catalog - It used a computer program to pull out information from few curated website indices and provided relevant listings. After W3Catalog came Aliweb where any website could be submitted for indexing. 22 | Most popular web directory - Yahoo 23 | Created by David Filo and Jerry Yang in 1994, provided the first collection of web pages across the internet. 24 | World Wide Web Wanderer (1993) 25 | Matthew Gray created it in 1993. An index wandex was generated to measure of the size of the world wide web. 26 | Jumpstation (1993) 27 | It used web page titles as well as headings to create indexes that were generated using web crawlers. Did not provide any ranking. 28 | First Full Text Search Engine - WebCrawler (1994) 29 | The first full text crawler based search engine was WebCrawler which was developed in 1994. 
The first search engine to index entire pages, but the amount of data required to do search made it too slow to be used during the day. Search for any keyword in a webpage was allowed unlike the previous engines. 30 | Lycos - 1994 31 | Launched in 1994, it’s huge collection of indexed documents made it so popular. Initially, 400,000 documents were indexed per month and it grew index of 60,000,0000 documents in less than two years. Such huge number of indexed pages as compared to other search engines led to it’s popularity. 32 | Excite - 1995 33 | Launched in 1995, it was the first search engine to rank the results to provide relevant ones using word relationships and statistical analysis. 34 | AltaVista - 1995 35 | It allowed natural language search queries and was also the first to use Boolean operators. 36 | AskJeeves - 1996 37 | Operated on question-answer platform, attempted to have human editors respond to search queries. 38 | Private Search Segment 39 | Private search engines as DuckDuckGo, Qrobe and Startpage gave users an escape from getting their search habits tracked. 40 | Predecessor to Google - Backrub - 1996 41 | Larry page and Sergey brin created it in 1996. The initial idea used backlinks to help rank websites for better search. 42 | Google - 1997 43 | Main revolution was to use Page Rank algorithm which marked the relevance (rank) of a web page contributed by two factors: 44 | 1. Backlinks i.e. the number of websites that link to that particular web page indicates relevance thereby increasing the rank of the page. 45 | 2. Backlinks from trustworthy webpage increases the rank of the page whereas links from non trustworthy webpage decreases the rank of the page. (These links are assigned respective weights called link juices) 46 | Google Fred penalizes sites with low quality backlinks. Google Instant shows real time search results for users as they enter a query and Google suggest provides dropdowns of suggested topics. 
Google Hummingbird algorithm attempts to understand the human intent behind a search query to help user search more precisely. 47 | These edge cutting advancements effectively helps Google to provide with better and relevant web pages. Google really tops by commanding over the 70% of search engine market. 48 | 49 | Some Other modern search engines are: MSN Search (1998) - By Microsoft, Overture (1998) - Paid Web search engine, All the Web (1999) – Also referred to as FAST search -------------------------------------------------------------------------------- /to_test/ass1_936.txt: -------------------------------------------------------------------------------- 1 | History of Search Engines 2 | Search engines have become such an integral framework for how we learn, cross-check facts, and process information.Instead of logging hours poring over books in a library, we can access seemingly limitless databases at our fingertips in a matter of seconds. 3 | 4 | What search engines are and how they work 5 | Search engines scan the content and information, and follow links that lead to other pages. Search engines operate using algorithms, which find information on websites, and store them in a large index, or catalogue. As sites are updated or changes, the index is updated as well. The engines sort through pages, and bring up the matches that are closest to the keywords searched. The pages are ranked according to the greatest relevance of content, and the order of ranking greatly contributes to the site's popularity and success. 6 | 7 | How Search Engine Development Began 8 | The need for search engines was first noted in 1945 when American engineer and scientist Vannevar Bush published an article in The Atlantic Monthly, emphasizing the necessity for an expansive index for all knowledge. Information has been extended far beyond our present ability to make real use of the record. A record, if it is to be useful to science, must be continuously extended, it must be stored. 
9 | Decades later, college students and electrical engineers attempted to make this kind of index a reality. 10 | 11 | Search Engines Development Timeline 12 | Archie(1990) was the first tool created by Alan Emtage and L. Peter Deutsch for indexing, and is considered the first basic search engine. 13 | Lycos(1993) was created as a university project, but was the first to attain commercial search engine success. In 1999 Lycos was the most visited search engine in the world. 14 | Yahoo!(1994) started at Stanford University by Jerry Yang and David Filo (both electrical engineering grad students) that became a web portal and search engine. 15 | WebCrawler(1994) created by Brian Pinkerton WebCrawler was the first crawler which indexed complete pages online. 16 | AltaVista(1995) an industry leader, was once the most popular search engines of its time. It differed from its contemporaries because of two factors: Alta Vista used a multi-threaded crawler (Scooter) that covered more webpages than people knew existed at the time. It also had a well-organized search-running back-end advanced hardware. 17 | Looksmart(1995) competed with Yahoo! 18 | WiseNut(2001) was a crawler-based search engine that was introduced as a beta, and was owned by Looksmart. 19 | Excite(1995) Founded originally as "Architext"by Stanford University students, Excite was launched officially having purchased two search engines (Magellan and WebCrawler), and signed exclusive agreements with Microsoft and Apple. 20 | Hotbot(1996) a search engine also popular in the 90's was launched by Wired Magazine, and is now owned by Lycos. 21 | Dogpile(1996) was a search engine developed by Aaron Flin and shortly thereafter sold to Go2net. Now Dogpile fetches results from Google, Yahoo, and Yandex. 22 | Google(1996) Started for a research project by Stanford students Larry Page and Sergey Brin. They created a search engine that would rank websites based on the number of other websites that linked to that page. 
This strategy developed the world's most successful search engine today. 23 | MSN Search(1998) was the engine used by Microsoft, sourcing search results from Inktomi, and later Looksmart. By 2006 Micosoft started performing their own image searches, and MSN became branded as Windows Live Search, then Live Search, and finally to Bing (2009). 24 | ASK(1996) was originally titled "AskJeeves.com"and was designed by Garret Gruener and David Warthen in Berkeley, CA. The goal was to provide users with answers to queries typed with normal everyday language and colloquialisms. It was acquired in 2005 by IAC and continues to grow with over 100 million users. 25 | Teoma(2000) meaning "expert"in Gaelic, was a search engine created by professor Apostolos Gerasoulis and Tao Yang at Rutgers University. 26 | Infoseek(1994) was a search engine begun by Steve Kirsch, and was bought by The Walt Disney Company in 1998. Eventually it was replaced by Yahoo, and no longer exists. 27 | Overture(1998) was originally named "GoTo,"where top listings were sold on a cost-per-click or pay-per-click basis. 28 | Alltheweb(1999) began in 1994 out of FTP Search, from Norwegian University of Science and Technology, when then turned into Fast Search &Transfer, or FAST. 29 | AOL Search(1999) bought Web Crawler (one of the major crawler-based engines of it's time) in 1995, and after a number of deals, purchases and exchanges, AOL relaunched their search engine, calling it AOL Search. 30 | 31 | Newer Search Engines 32 | Cuil(2008) was a search engine that arranged pages by content, showing large entries with pictures, and thumbnails for results etc. The search engine claimed to have over 120 billion web pages indexed, and would not store user's search activity or their IP number. 33 | Secure Search Engines 34 | Ixquick.com(1998) is a metasearch engine that offers a proxy service for Ixquick and an email service that offer privacy protection, called StartMail. 
It was relaunched in 2005 and included a re-engineered metasearch algorithm. 35 | StartPage(2009) is a secure search engine, meaning it pulls all the same results as Google, but uses the privacy protection of Ixquick, which allows users to search with privacy. 36 | DuckDuckGo(2006) is a search engine that does not store or share any information about the user, and is unique to other search engines by providing all users the same results for a given search term, as well as providing search results from what they describe as the "best sources"rather than from the most sources. 37 | 38 | Specialized Search Engines 39 | Wolfram Alpha(2009) is a "computational knowledge engine"that answers factual queries by computing the answer from externally sourced "curated data"instead of listing relevant websites which could lead to the answer. 40 | Major Non-US Search Engines 41 | Baidu(2000) is one of the main search engines in China, based on a special identification technology that classifies and groups articles. Baidu locates information, products, and services through Chinese language search terms (via phonetic Chinese), advanced searches, snapshots, spell checker, stock quotes, news, images, video, space information, weather, train and flight schedules and other local information. Baidu's greatest competitors are Google Hong Kong and Yahoo! China. 42 | Yandex(1997), originally standing for "yet another indexer,"is the largest search engine in Russia, and ranked as the 4th largest search engine in the world, serving over 150 millions searches per day. 43 | 44 | Local Engines 45 | Yelp(2004), named for the concept "Yellow Pages"began as an email service exchange recommending local business. Yelp now is connected to social networking sites and functions as a search engine, where users can access reviews for companies/restaurants/businesses under a specific search/product. Yelp recently announced that it is now powering the Microsoft Bing local search engine results. 
# 46 | Foursquare(2009) is a location-based social networking search engine for mobile devices, utilizing a GPS hardware system where users can search for restaurants/entertainment, etc in their immediate locale and connect with others in the area.
# 47 |
# --------------------------------------------------------------------------------
# /to_test/code.py:
# --------------------------------------------------------------------------------
import pandas as pd
import nltk
from nltk.corpus import stopwords

# Scratch script: tokenize one assignment and print the tokens that survive
# stop-word and punctuation removal.

# Mode 'rU' was removed in Python 3.11 (universal newlines are already the
# default in text mode), and the handle is now closed deterministically.
with open('ass1_422.txt', 'r') as f:
    raw = f.read()
tokens = nltk.word_tokenize(raw)
print(tokens)

# Use a separate name so the imported `stopwords` corpus reader is not
# shadowed; a set gives O(1) membership tests.
stop_words = set(stopwords.words("english"))
add = ['search','engine','web','.','(',')',',','!','&','?',':',';']
stop_words.update(add)

# The stop list is lowercase, so compare case-insensitively; otherwise
# capitalised forms such as "The" or "Search" are kept.
sw = [w for w in tokens if w.lower() not in stop_words]

print(sw)


# --------------------------------------------------------------------------------
# /to_test/codee.py:
# --------------------------------------------------------------------------------
import pandas as pd
import nltk
from nltk.corpus import PlaintextCorpusReader

# Scratch script: load every .txt file of the corpus directory into one
# nltk.Text object.

# Raw string: "\." is an invalid escape sequence in a normal string literal.
files = r".*\.txt"

# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
corpus0 = PlaintextCorpusReader("/Users/Shreshth/Documents/du_shman/DM assignment/msc-plagiarism-assigment/", files)
corpus = nltk.Text(corpus0.words())
print()
#input()


# --------------------------------------------------------------------------------
# /to_test/infoRet.py:
# --------------------------------------------------------------------------------
import csv
import nltk
import scipy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plot
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.cluster import hierarchy
from scipy.spatial import distance

import numpy


# Ids of the documents that make up the corpus, in the order used for the
# rows/columns of the similarity matrix and the dendrogram leaves.
docid = [211, 422, 734, 808, 936, 202, 440, 532, 606, 817, 909, 1147, 321,
         1349, 505, 541, 743, 826, 1019, 1037, 1046, 1138]

# Extra stop words appended to NLTK's English stop list.
additional_stopwords = "google yahoo archie bing search engine internet user crawler launch history engines world wide web download system first data page index file"


def candidate_filenames(doc_id):
    """Return the file names under which document *doc_id* may be stored.

    The corpus mixes two naming schemes ('ass1-<id>.txt' and
    'ass1_<id>.txt'); the hyphenated form is listed first.
    """
    return ["ass1-" + str(doc_id) + ".txt", "ass1_" + str(doc_id) + ".txt"]


def read_document(doc_id):
    """Return ``(file_name, text)`` for document *doc_id*.

    Each candidate name is tried in turn. Text is decoded as UTF-8, falling
    back to Latin-1 (which accepts any byte sequence) when a file is not
    valid UTF-8.

    Raises:
        FileNotFoundError: if none of the candidate files exists.
    """
    names = candidate_filenames(doc_id)
    for name in names:
        try:
            with open(name, encoding="utf-8") as handle:
                return name, handle.read()
        except FileNotFoundError:
            continue
        except UnicodeDecodeError:
            with open(name, encoding="latin-1") as handle:
                return name, handle.read()
    raise FileNotFoundError(
        "no file found for document " + str(doc_id) + "; tried " + ", ".join(names)
    )


def jaccard_percent_matrix(docs):
    """Return the symmetric matrix of pairwise Jaccard similarities in percent.

    Args:
        docs: sequence of token lists, one per document.

    Returns:
        A ``len(docs) x len(docs)`` numpy array whose entry ``[a][b]`` is
        ``100 * |A & B| / |A | B|`` rounded to two decimals. Two empty
        documents are treated as identical (100.0) instead of dividing by
        zero.
    """
    # Build each token set once instead of once per pair.
    token_sets = [set(tokens) for tokens in docs]
    size = len(token_sets)
    matrix = numpy.zeros([size, size])
    for row in range(size):
        for col in range(row, size):
            union_size = len(token_sets[row] | token_sets[col])
            if union_size == 0:
                score = 100.0
            else:
                shared = len(token_sets[row] & token_sets[col])
                score = numpy.around((shared / union_size) * 100, decimals=2)
            matrix[row][col] = score
            matrix[col][row] = score
    return matrix


def draw_dendrogram(linkage_matrix, labels, color_threshold, **dendrogram_options):
    """Draw a top-oriented dendrogram on a new figure (without showing it)."""
    plot.figure()
    hierarchy.dendrogram(linkage_matrix, labels=labels,
                         color_threshold=color_threshold, get_leaves=False,
                         orientation='top', **dendrogram_options)
    plot.title("Dendrogram")
    # With orientation='top' the leaves (documents) lie along the x axis and
    # the merge height (dissimilarity) along the y axis.
    plot.xlabel("Document IDs")
    plot.ylabel("Dissimilarity")


def main():
    """Tokenize the corpus, write the CSV outputs and show the plots."""
    tokenizer = RegexpTokenizer(r'\w+')
    stemmer = PorterStemmer()
    # A set gives O(1) membership tests in the per-token filter below.
    stop_set = set(stopwords.words('english'))
    stop_set.update(additional_stopwords.split())

    doc = []
    for doc_id in docid:
        file_name, text = read_document(doc_id)
        print(file_name)
        # Keep purely alphabetic, non-stop-word tokens; lowercase and stem them.
        token = [stemmer.stem(word.lower())
                 for word in tokenizer.tokenize(text)
                 if word.isalpha() and word.lower() not in stop_set]
        print(str(len(token)))
        doc.append(token)  # contains the list of words in each document

    # newline='' lets the csv module control line endings (avoids blank rows
    # on Windows); the with-blocks close the files.
    with open('data.csv', 'w', newline='') as csvFile:
        csv.writer(csvFile).writerows(doc)

    # Similarity in percent; 100 - value is the dissimilarity.
    jaccardMat = jaccard_percent_matrix(doc)

    with open('data 1.csv', 'w', newline='') as csvFile:
        csv.writer(csvFile).writerows(jaccardMat)

    # Heat map of the similarity matrix.
    plot.imshow(jaccardMat, interpolation='nearest', cmap=plot.cm.ocean)
    plot.title("Jaccard Matrix", color='Blue')
    plot.xlabel("Document ID", color='green')
    plot.ylabel("Document ID", color='green')
    # Rotation is passed together with the labels so it applies to them.
    plot.xticks(range(len(docid)), docid, fontsize=8, color='blue', rotation=90)
    plot.yticks(range(len(docid)), docid, fontsize=8, color='blue')
    plot.colorbar()
    plot.show()

    # Back to the 0..1 range, then to a condensed distance vector.
    jaccardMat = np.divide(jaccardMat, 100)
    disMat = distance.squareform(1 - jaccardMat)

    Z = hierarchy.linkage(disMat, method="complete")
    # Flat 4-cluster assignment; computed for inspection, not used by the plots.
    clusters = hierarchy.cut_tree(Z, 4)

    draw_dendrogram(Z, docid, 0.77)
    # NOTE(review): set after the first dendrogram is drawn, so presumably
    # only the second figure picks these up - confirm the intent.
    plot.rcParams['lines.linewidth'] = 2
    plot.rcParams['lines.color'] = 'r'
    plot.show()

    draw_dendrogram(Z, docid, 0.5, count_sort='ascending')
    plot.show()


if __name__ == "__main__":
    main()