<Knowledge Graph Search>

├── FinalCode ├── Database │ ├── db1.txt │ ├── db2.txt │ ├── db3.txt │ ├── db4.txt │ ├── db5.txt │ └── db6.txt ├── Main.cpp ├── Main.exe ├── Main.o ├── final_merged.txt ├── merge.cpp ├── merge.exe ├── merge.txt ├── result.txt ├── s.PNG └── search.php ├── README.md ├── _config.yml ├── finalreport.pdf └── proposal.pdf /FinalCode/Database/db1.txt: -------------------------------------------------------------------------------- 1 | George studies at IIITA.George is from bangalore. -------------------------------------------------------------------------------- /FinalCode/Database/db2.txt: -------------------------------------------------------------------------------- 1 | IIITA is in allahabad.IIITA is one of the best colleges in Allahabad. 2 | -------------------------------------------------------------------------------- /FinalCode/Database/db3.txt: -------------------------------------------------------------------------------- 1 | Paul was born in Allahabad.Allahabad is in UP. -------------------------------------------------------------------------------- /FinalCode/Database/db4.txt: -------------------------------------------------------------------------------- 1 | Sangam is a holy river.Sangam is in Allahabad. -------------------------------------------------------------------------------- /FinalCode/Database/db5.txt: -------------------------------------------------------------------------------- 1 | There are many colleges in Allahabad. MNNIT is in Allahabad. 2 | 3 | 4 | -------------------------------------------------------------------------------- /FinalCode/Database/db6.txt: -------------------------------------------------------------------------------- 1 | Bangalore is a cool city. Bangalore is in south. 
-------------------------------------------------------------------------------- /FinalCode/Main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #define SW 29 12 | using namespace std; 13 | /* Shared program state. input holds one sentence per element, tokens holds the kept words of each sentence, words is the set of all kept words, index maps a word to the sentence numbers containing it, input1 and input2 are the two query words. NOTE(review): the include targets and most template arguments appear to be missing from this listing - confirm against the real file. */ 14 | 15 | vector input; 16 | vector< vector > tokens; 17 | string stopwords[SW] = { "the", "of", "in", "and", "a", "an", "to", "with", "is", "was", 18 | "were", "by", "that", "for", "be", "from", "as", "are", "on", "or", 19 | "this", "an", "not", "it", "which", "with", "too", "at", "these" }; 20 | set words; 21 | map< string, vector > index; 22 | string input1, input2; 23 | /* read(): creates or truncates result.txt, then loads final_merged.txt line by line into input, echoing each line to stdout. Reading stops at end of file or at the first line whose first character is a hash sign. */ 24 | void read() 25 | { 26 | std::ofstream("result.txt"); 27 | ifstream infile; 28 | infile.open("final_merged.txt"); 29 | 30 | //cout << "INPUT\n"; 31 | string s; 32 | while( getline(infile,s) ) { 33 | if( s[0] == '#' ) { 34 | break; 35 | } 36 | cout << s << endl; 37 | input.push_back(s); 38 | } 39 | infile.close(); 40 | } 41 | /* checkSW(): returns false when s equals one of the SW stop words and true otherwise, so true means the word should be kept. */ 42 | bool checkSW( string s ) 43 | { 44 | for( int a = 0; a < SW; a ++ ) { 45 | if( s.compare( stopwords[a] ) == 0 ) { 46 | return false; 47 | } 48 | } 49 | return true; 50 | } 51 | /* tokenize(): splits every sentence in input on single spaces, lower-cases ASCII capitals (codes 65 to 90), drops stop words, and stores the kept words per sentence in tokens and globally in words, printing a trace to stdout. NOTE(review): an empty word produced by consecutive spaces is not a stop word, so it is kept as a token - confirm this is intended. */ 52 | void tokenize() 53 | { 54 | cout << "TOKENIZE\n"; 55 | string s, s1; 56 | vector temp; 57 | for( int a = 0; a < input.size(); a ++ ) { 58 | cout << a << " : "; 59 | s = input[a]; 60 | s = s + " "; 61 | s1 = ""; 62 | temp.clear(); 63 | for( int b = 0; b < s.size(); b ++ ) { 64 | if( s[b] == ' ' ) { 65 | if( checkSW(s1) ) { 66 | temp.push_back(s1); 67 | words.insert(s1); 68 | cout << s1 << " , "; 69 | } 70 | s1 = ""; 71 | } else { 72 | if( s[b] >= 65 && s[b] <= 90 ) { 73 | s[b] += 32; 74 | } 75 | s1 = s1 + s[b]; 76 | } 77 | } 78 | tokens.push_back(temp); 79 | cout << "\b\b\b \n"; 80 | } 81 | } 82 | /* createIndex(): for each distinct word in words, collects the numbers of the sentences whose token list contains it and stores that list in index, printing a trace to stdout. */ 83 | void createIndex() 84 | { 85 | cout << "INDEX\n"; 86 | set :: iterator its; 87 | vector< int > temp; 88 | for( its = words.begin(); 
its != words.end(); its ++ ) { 89 | cout << *its << " : "; 90 | temp.clear(); 91 | for( int a = 0; a < tokens.size(); a ++ ) { 92 | if( find( tokens[a].begin(), tokens[a].end(), *its ) != tokens[a].end() ) { 93 | cout << a << " , "; 94 | temp.push_back(a); 95 | } 96 | } 97 | index[*its] = temp; 98 | cout << "\b\b\b \n"; 99 | } 100 | } 101 | /* userInput(): copies the two query words into input1 and input2. The prompts are printed but nothing is read from stdin. */ 102 | void userInput(string arg1 , string arg2) 103 | { 104 | cout << "User Input\n"; 105 | cout << "Word 1 : "; 106 | input1=arg1; 107 | //input1 = "ram"; 108 | cout << "Word 2 : "; 109 | input2=arg2; 110 | //input2 = "nepal"; 111 | } 112 | /* Search state. visited marks the words on the current dfs path, cost is the lowest level at which input2 has been reached so far, final is the comma-terminated list of sentence numbers for that best path. */ 113 | map< string, bool > visited; 114 | int cost; 115 | string final; 116 | /* getString(): converts an int to decimal text, returning 0 as a special case. NOTE(review): digits are appended least-significant first, so any value of 10 or more comes out reversed (12 becomes 21), and negative input is not handled - confirm and fix. */ 117 | string getString(int a) 118 | { 119 | string s = ""; 120 | if( a == 0 ) { 121 | return "0"; 122 | } 123 | while( a != 0 ) { 124 | s = s + char((a%10)+'0'); 125 | a = a / 10; 126 | } 127 | return s; 128 | } 129 | /* dfs(): depth-first walk starting at word w. For each sentence containing w it appends the sentence number to cd, then for each unvisited word of that sentence either records a hit when the word equals input2 (keeping the sentence list with the lowest level in final) or recurses one level deeper. cs accumulates the comma-separated word path used in the trace output. visited[w] is reset on exit so other paths may pass through w again. */ 130 | void dfs( string w, int level, string cs, string cd ) 131 | { 132 | if( visited[w] == true ) { 133 | return; 134 | } 135 | cs = cs + w; 136 | cs = cs + ","; 137 | visited[w] = true; 138 | vector doc = index[w]; 139 | string cs1; 140 | string cd1; 141 | int i; 142 | for( int a = 0; a < doc.size(); a ++ ) { 143 | i = doc[a]; 144 | cd1 = cd + getString(i); 145 | cd1 = cd1 + ","; 146 | for( int b = 0; b < tokens[i].size(); b ++ ) { 147 | if( visited[tokens[i][b]] == true ) { 148 | continue; 149 | } else if( input2.compare( tokens[i][b] ) == 0 ) { 150 | cs1 = cs + tokens[i][b]; 151 | cout << a << " --- " << level << " --- " << cs1 << " --- " << cd1 << endl; 152 | if( level < cost ) { 153 | cost = level; 154 | final = cd1; 155 | } 156 | } else { 157 | dfs( tokens[i][b], level + 1, cs, cd1 ); 158 | } 159 | } 160 | } 161 | visited[w] = false; 162 | } 163 | /* execute(): clears visited for every known word, sets cost to one more than the number of sentences, empties final, and starts dfs from input1 at level 0. */ 164 | void execute() 165 | { 166 | set< string > :: iterator its; 167 | for( its = words.begin(); its != words.end(); its ++ ) { 168 | visited[*its] = false; 169 | } 170 | cost = input.size() + 1; 171 | final = ""; 172 | string cs = ""; 173 | string cd 
= ""; 174 | dfs(input1,0,cs,cd); 175 | } 176 | /* output(): opens result.txt, prints cost and final, then parses cost + 1 comma-terminated sentence numbers out of final with atoi and prints each matching sentence. NOTE(review): the rest of this function and the remainder of Main.cpp are missing from this listing. Also, when no path was found final is empty and the inner while loop would read past its end - confirm. */ 177 | void output() 178 | { 179 | ofstream outfile; 180 | outfile.open("result.txt"); 181 | cout << "FINAL\n"; 182 | cout << cost << " , " << final << endl; 183 | 184 | string s; 185 | int i = 0, doc; 186 | for( int a = 0; a <= cost; a ++ ) { 187 | s = ""; 188 | while( final[i] != ',' ) { 189 | s = s + final[i]; 190 | i ++; 191 | } 192 | i ++; 193 | doc = atoi(s.c_str()); 194 | cout << doc << " : " << input[doc] << endl; 195 | outfile< 2 | #include 3 | #include 4 | /* From here on the listing is merge.cpp; its file header and first include line are missing. The program appends the Database text files to merge.txt and rewrites that text into final_merged.txt with a line break after every full stop or question mark. */ 5 | 6 | using namespace std; 7 | /* main(): truncates merge.txt and final_merged.txt, shells out to the type command to append all Database text files to merge.txt, then copies merge.txt to final_merged.txt character by character. A leading space on a line is dropped, each full stop or question mark is replaced by a line break, one space following such a mark is skipped, and a single space is written after each input line. NOTE(review): the loop tests infile.good() before calling getline, so the final failed read is still processed; line[0] is also used on empty lines; i, j, k, n declared at the top are unused apart from the shadowing loop counter. */ 8 | int main() 9 | { 10 | 11 | std::ofstream("merge.txt"); 12 | std::ofstream("final_merged.txt"); 13 | int i, j, k, n; 14 | system("type C:\\wamp\\www\\FinalCode\\Database\\*.txt >> merge.txt"); // Database is folder name where this c++ file is executed 15 | string line; 16 | ofstream myfile; 17 | myfile.open ("final_merged.txt"); 18 | ifstream infile; 19 | infile.open ("merge.txt"); 20 | 21 | if (infile.is_open() && myfile.is_open() ) 22 | { 23 | while ( infile.good() ) 24 | { 25 | getline (infile, line); 26 | if ( line[0] == ' ' ) 27 | { 28 | } else 29 | { 30 | myfile << line[0]; 31 | } 32 | for ( int i = 1; i < line.length(); i++ ) 33 | { 34 | if ( line[i] == '?' || line[i] == '.' 
) { 35 | myfile << endl; 36 | if ( i < line.length() - 1 && line[i+1] == ' ' ) 37 | i++; 38 | } 39 | else myfile << line[i]; 40 | } 41 | myfile << " "; 42 | } 43 | infile.close(); 44 | myfile.close(); 45 | } 46 | /* Runs the shell pause command, so the program waits for a key press before returning. */ 47 | system("pause"); 48 | return 0; 49 | } 50 | -------------------------------------------------------------------------------- /FinalCode/merge.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kckishan/knowledgegraph/0b531f926572a8339d7f05cece3324b7450c55aa/FinalCode/merge.exe -------------------------------------------------------------------------------- /FinalCode/merge.txt: -------------------------------------------------------------------------------- 1 | George studies at IIITA.George is from bangalore.IIITA is in allahabad.IIITA is one of the best colleges in Allahabad. 2 | Paul was born in Allahabad.Allahabad is in UP.Sangam is a holy river.Sangam is in Allahabad.There are many colleges in Allahabad. MNNIT is in Allahabad. 3 | 4 | 5 | Bangalore is a cool city. Bangalore is in south. 
-------------------------------------------------------------------------------- /FinalCode/result.txt: -------------------------------------------------------------------------------- 1 | George studies at IIITA 2 | IIITA is in allahabad 3 | Paul was born in Allahabad 4 | -------------------------------------------------------------------------------- /FinalCode/s.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kckishan/knowledgegraph/0b531f926572a8339d7f05cece3324b7450c55aa/FinalCode/s.PNG -------------------------------------------------------------------------------- /FinalCode/search.php: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | <Knowledge Graph Search> 5 | 15 | 16 | 17 | Knowlegde Graph Search 18 | 24 | please enter both fields"; 27 | if(!empty($_POST["input1"]) && !empty($_POST["input2"])) 28 | { 29 | echo "

Result

"; 30 | exec("merge.exe"); 31 | $input1=$_POST["input1"]; 32 | $input2=$_POST["input2"]; 33 | /* The two search words come straight from the POST request, so each one is quoted with escapeshellarg before being placed on the command line; Main.exe then receives them as two plain arguments and a crafted value cannot inject extra shell commands. */ $result=shell_exec("Main.exe"." ".escapeshellarg($input1)." ".escapeshellarg($input2)); 34 | $file = fopen("result.txt", "r") or exit("Unable to open file!"); 35 | //Output a line of the file until the end is reached 36 | while(!feof($file)) 37 | { 38 | echo "".fgets($file).""."
"; 39 | } 40 | fclose($file); 41 | //$output = file_get_contents('http://127.0.0.1/search/result.txt'); 42 | //echo $output; 43 | //echo $results; 44 | //unset($results); 45 | } 46 | ?> 47 | 48 | 49 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # Knowledge Graph 3 | ## A search engine 4 | 5 | ##### Kishan K.C. 6 | 7 | Steps: 8 | 9 | 1. Tokenizing Web Documents 10 | 2. Indexing 11 | 3. Forming Knowledge Graph 12 | 4. Performing Search and Constructing Information. 13 | 5. Displaying Information 14 | 15 | The project uses c++. There are two C++ programs , main.cpp and merge.cpp. 16 | 17 | 1. Merge.cpp 18 | It merges all the content from different documents in a single text file named final_merged.txt. 19 | Then it splits the whole document into individual sentences so that there is single sentence in 20 | each line. It is necessary for indexing because sentences act as edges in the graph. So during 21 | indexing sentence number is necessary. 22 | 23 | 24 | System command “type *.txt >> merge.txt” is used to merge all the text files from folder 25 | Database. After merging all the text documents into merge.txt, each sentences are arranged in a 26 | single line. Whenever a full stop or question mark is found , the sentence is printed in new line. 27 | 28 | 2. Main.cpp 29 | 30 | There are several functions in it. They are described briefly below: 31 | 32 | * read (); 33 | This function opens the text document final_merged.txt and reads individual strings from the 34 | file then it performs vector operation push_back to store the strings in vector of string named 35 | vector input. 36 | 37 | 38 | * checkSW( string s ) 39 | This function checks whether the strings from input file final_merged.txt are useful for 40 | tokenizing or not. The words that can be ignored are defined in an array called string 41 | stopwords[SW]. 
42 | ``` 43 | string stopwords[SW] = { "the", "of", "in", "and", "a", "an", "to", "with", "is", "was", 44 | "were", "by", "that", "for", "be", "from", "as", "are", "on", "or", "this", "an", "not", "it", "which", 45 | "with", "too", "at", "these" }; 46 | ``` 47 | 48 | The function checks whether a given string is one of these stop words. 49 | 50 | 51 | * tokenize() 52 | 53 | This function splits the sentences contained in final_merged.txt into words and keeps those that are not one of 54 | the stop words defined above. 55 | 56 | * createIndex() 57 | 58 | After tokenization it is necessary to index the tokens; this function tags each token with 59 | the numbers of the sentences it occurs in. 60 | 61 | * userInput(string arg1 , string arg2) 62 | 63 | This function takes input from the user. The user is provided with two fields in the search engine. 64 | 65 | The user enters the query, which must be a string, in these fields. 66 | 67 | * dfs( string w, int level, string cs, string cd ) 68 | 69 | A graph is made using tokens as nodes and the sentences where they occur as edges. 70 | 71 | * execute() 72 | 73 | This function executes dfs and finds the path between the two search words in the graph. 74 | 75 | * output() 76 | 77 | This function writes the result to result.txt. The result is the path between the two search 78 | words provided by the user. 79 | 80 | 81 | The main function calls all the functions discussed above. 82 | ``` 83 | int main(int argc, char* argv[]) { 84 | 85 | for (int i = 0; i