├── src ├── main.cpp ├── scanner.hpp └── scanner.cpp ├── program.txt └── README.md /src/main.cpp: -------------------------------------------------------------------------------- 1 | #include "scanner.hpp" 2 | 3 | int main() 4 | { 5 | lexicalAnalyze("../program.txt"); 6 | return 0; 7 | } 8 | -------------------------------------------------------------------------------- /program.txt: -------------------------------------------------------------------------------- 1 | // This is the c test file 2 | 3 | int x ; 4 | x = x + 5 ; 5 | /* 6 | Multi line Comment 7 | */ 8 | 9 | // Single line Comment 10 | for(int i = 0, i <= 4; i++){ 11 | x += i; 12 | } 13 | x++; 14 | 15 | 54dsa 16 | _sda = "gdsdg"; 17 | -------------------------------------------------------------------------------- /src/scanner.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file scanner.hpp 3 | * @author Amirhossein Hakimnejad 4 | * 5 | * @date 2018 Nov 6 | */ 7 | 8 | 9 | #pragma once 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | 18 | using std::cout; 19 | using std::vector; 20 | 21 | bool isID(const std::string &str); 22 | bool isComment(const std::string &str); 23 | bool isDigit(const std::string &str); 24 | bool isString(const std::string &str); 25 | bool isBool(const std::string &str); 26 | bool isLiteral(const std::string &str); 27 | bool isKeyword(const std::string &str); 28 | bool isStatement(const std::string &str); 29 | bool isOperator(const std::string &str); 30 | bool isSeparator(const std::string &str); 31 | bool isNotLegal(const std::string &str); 32 | void printRoleOfToken(const vector& tokens); 33 | void lexicalAnalyze(const std::string &nameOfFile); -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Scanner-for-lexical-analyzer-in-cpp 2 | A simple c++ program that takes 
a file as input and, after tokenizing it, reports the name of each token (identifier, keyword, separator, operator, literal or comment). 3 | 4 | 5 | To see what a compiler's lexical analyzer is, click [here](https://en.wikipedia.org/wiki/Lexical_analysis). 6 | ### Prerequisites 7 | 8 | You need C++11 or higher to build the code. 9 | To install a compiler (g++), run: 10 | ``` 11 | sudo apt-get install g++ 12 | ``` 13 | 14 | ### Using 15 | 16 | To use it, first clone the repository: 17 | 18 | ``` 19 | git clone git@github.com:amirhakimnejad/Scanner-for-lexical-analyzer-in-cpp.git 20 | ``` 21 | 22 | Change the test file (program.txt) as you desire: 23 | ``` 24 | // This is the c test file 25 | 26 | int x ; 27 | x = x + 5 ; 28 | /* 29 | Multi line Comment 30 | */ 31 | 32 | // Single line Comment 33 | for(int i = 0, i <= 4; i++){ 34 | x += i; 35 | } 36 | x++; 37 | 38 | 54dsa 39 | _sda = "gdsdg"; 40 | 41 | ``` 42 | Compile with C++11: 43 | 44 | ``` 45 | g++ main.cpp scanner.cpp -std=c++11 46 | ``` 47 | 48 | Run it: 49 | ``` 50 | ./a.out 51 | ``` 52 | 53 | This will be your output: 54 | ``` 55 | [(comment, //), 56 | (keyword, int), 57 | (identifier, x), 58 | (separator, ;), 59 | (identifier, x), 60 | (operator, =), 61 | (identifier, x), 62 | (operator, +), 63 | (literal, 5), 64 | (separator, ;), 65 | (comment, /*), 66 | (comment, //), 67 | (statement, for), 68 | (separator, (), 69 | (keyword, int), 70 | (identifier, i), 71 | (operator, =), 72 | (literal, 0), 73 | (separator, ,), 74 | (identifier, i), 75 | unknown 76 | , 77 | (operator, =), 78 | (literal, 4), 79 | (separator, ;), 80 | (identifier, i), 81 | (operator, ++), 82 | (separator, )), 83 | (separator, {), 84 | (identifier, x), 85 | (operator, +=), 86 | (identifier, i), 87 | (separator, ;), 88 | (separator, }), 89 | (identifier, x), 90 | (operator, ++), 91 | (separator, ;), 92 | unknown 93 | , 94 | (identifier, _sda), 95 | (operator, =), 96 | (literal, "gdsdg"), 97 | (separator, ;), 98 | ] 99 | ``` 100 | 101 | As you can see 
there are only a few keywords and statements in my program. You can manually add any other values to their vectors. 102 | ```c++ 103 | const vector<std::string> keywords{"int", "float", "auto", "double", "do", "switch", "return"}; 104 | const vector<std::string> statements{"for", "while"}; 105 | const vector<std::string> operators{"<", ">", "<=", ">=", "*", "+", "-", "/", "=", "-=", "*=", "+=", "/=", "++", "--", "=="}; 106 | const vector<std::string> Separators{"{", "}", ",", "(", ")", ";"}; 107 | ``` 108 | 109 | 110 | 111 | ### How each function works 112 | ```c++ 113 | bool isID(const std::string &str); 114 | bool isComment(const std::string &str); 115 | bool isDigit(const std::string &str); 116 | bool isString(const std::string &str); 117 | bool isBool(const std::string &str); 118 | bool isLiteral(const std::string &str); 119 | bool isKeyword(const std::string &str); 120 | bool isStatement(const std::string &str); 121 | bool isOperator(const std::string &str); 122 | bool isSeparator(const std::string &str); 123 | bool isNotLegal(const std::string &str); 124 | void printRoleOfToken(const std::string& token); 125 | void lexicalAnalyze(const std::string &nameOfFile); 126 | ``` 127 | Most of the functions above don't need any explanation. In general, the lexicalAnalyze() function splits the given file into 128 | tokens, ignoring whitespace, newlines and the contents of comments, with the help of isOperator(), 129 | isSeparator() and isNotLegal(), because these are the only things that can come between the meaningful parts of a program: 130 | ``` 131 | for(i=5; i<= 10; i++){} 132 | ``` 133 | As each token is completed, lexicalAnalyze() passes it to the printRoleOfToken() function, which simply uses all 134 | the other functions to print the name of that token. 
135 | [Known tokens](https://en.wikipedia.org/wiki/Lexical_analysis#Token): 136 | ``` 137 | identifier: names the programmer chooses; 138 | keyword: names already in the programming language; 139 | separator (also known as punctuators): punctuation characters and paired-delimiters; 140 | operator: symbols that operate on arguments and produce results; 141 | literal: numeric, logical, textual, reference literals; 142 | comment: line, block. 143 | ``` 144 | 145 | Feel free to ask questions, find bugs(:D) or anything else. 146 | 147 | ## Authors 148 | 149 | * **Amirhossein Hakimnejad** - *Initial work* - [amirhakimnejad](https://github.com/amirhakimnejad) 150 | * **Henry** - *Contributor* - [henry-bugfree](https://github.com/henry-bugfree) 151 | 152 | ## License 153 | 154 | This project is licensed under the MIT License 155 | -------------------------------------------------------------------------------- /src/scanner.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file scanner.cpp 3 | * @author Amirhossein Hakimnejad 4 | * 5 | * @date 2018 Nov 6 | */ 7 | 8 | 9 | #include "scanner.hpp" 10 | 11 | using std::cout; 12 | using std::vector; 13 | 14 | bool isID(const std::string &str) 15 | { 16 | if(std::isdigit(str[0])) 17 | return false; 18 | int counter = 0; 19 | if(str[0] == '_') 20 | counter++; 21 | 22 | for(; counter < str.size(); counter++) 23 | if(!(isalnum(str[counter]))) 24 | return false; 25 | 26 | return true; 27 | } 28 | 29 | bool isComment(const std::string &str) 30 | { 31 | return str == "/*" || str == "//"; 32 | } 33 | 34 | bool isDigit(const std::string &str) 35 | { 36 | return std::all_of(str.begin(), str.end(), ::isdigit); 37 | } 38 | 39 | bool isString(const std::string &str) 40 | { 41 | return str[0] == '"' && str[str.size()-1] == '"'; 42 | } 43 | 44 | bool isBool(const std::string &str) 45 | { 46 | return str == "true" || str == "false"; 47 | } 48 | 49 | bool isLiteral(const std::string &str) 50 | { 
51 | return isDigit(str) || isString(str) || isBool(str); 52 | } 53 | 54 | bool isKeyword(const std::string &str) 55 | { 56 | const vector keywords{"int", "float", "auto", "double", "do", "switch", "return"}; 57 | for(const auto& keyword : keywords) 58 | if (keyword == str) 59 | return true; 60 | 61 | return false; 62 | } 63 | 64 | bool isStatement(const std::string &str) 65 | { 66 | const vector statements{"for", "while"}; 67 | for(const auto& statement : statements) 68 | if (statement == str) 69 | return true; 70 | 71 | return false; 72 | } 73 | 74 | bool isOperator(const std::string &str) 75 | { 76 | const vector operators{"<", ">", "<=", ">=", "*", "+", "-", "/", "=", "-=", "*=", "+=", "/=", "++", "--", "=="}; 77 | for(const auto& op : operators) 78 | if (op == str) 79 | return true; 80 | 81 | return false; 82 | } 83 | 84 | bool isSeparator(const std::string &str) 85 | { 86 | const vector Separators{"{", "}", ",", "(", ")", ";"}; 87 | for(const auto& separate : Separators) 88 | if (separate == str) 89 | return true; 90 | 91 | return false; 92 | } 93 | 94 | bool isNotLegal(const std::string &str) 95 | { 96 | return str == " " || str == "\n"; 97 | } 98 | 99 | void printRoleOfToken(const std::string& token) 100 | { 101 | if(isOperator(token)) 102 | cout << "(operator, " << token << ")"; 103 | else if(isSeparator(token)) 104 | cout << "(separator, " << token << ")"; 105 | else if(isKeyword(token)) 106 | cout << "(keyword, " << token << ")"; 107 | else if(isStatement(token)) 108 | cout << "(statement, " << token << ")"; 109 | else if(isLiteral(token)) 110 | cout << "(literal, " << token << ")"; 111 | else if(isID(token)) 112 | cout << "(identifier, " << token << ")"; 113 | else if(isComment(token)) 114 | cout << "(comment, " << token << ")"; 115 | else 116 | throw std::runtime_error("Invalid token: " + token); 117 | } 118 | 119 | void lexicalAnalyze(const std::string &nameOfFile) 120 | { 121 | char ch; 122 | std::string buffer; 123 | std::fstream file(nameOfFile, 
std::fstream::in); 124 | 125 | if (!file.is_open()) 126 | { 127 | cout << "error while opening the file\n"; 128 | exit(0); 129 | } 130 | 131 | bool miltiCm = false, singleCm = false; 132 | while (file >> std::noskipws >> ch) 133 | { 134 | if(singleCm || miltiCm) 135 | { 136 | if(singleCm && ch == '\n') 137 | singleCm = false; 138 | 139 | if(miltiCm && ch == '*') 140 | { 141 | file >> ch; 142 | if(ch == EOF) 143 | break; 144 | 145 | if(ch == '/') 146 | miltiCm = false; 147 | } 148 | continue; 149 | } 150 | 151 | if(ch == '/') 152 | { 153 | std::string comm(1, ch); 154 | file >> ch; 155 | if(ch == EOF) 156 | { 157 | printRoleOfToken(comm); 158 | break; 159 | } 160 | 161 | if(ch == '*') 162 | { 163 | miltiCm = true; 164 | comm += ch; 165 | } 166 | else if(ch == '/') 167 | { 168 | singleCm = true; 169 | comm += ch; 170 | } 171 | if(miltiCm || singleCm) 172 | { 173 | printRoleOfToken(comm); 174 | continue; 175 | } 176 | } 177 | 178 | if(isNotLegal(std::string(1, ch))) 179 | { 180 | if(!buffer.empty()) 181 | { 182 | printRoleOfToken(buffer); 183 | buffer = ""; 184 | } 185 | continue; 186 | } 187 | 188 | if(isOperator(std::string(1, ch)) && !isOperator(buffer)) 189 | { 190 | if(!buffer.empty()) 191 | { 192 | printRoleOfToken(buffer); 193 | buffer = ""; 194 | } 195 | } 196 | 197 | if(!isOperator(std::string(1, ch)) && isOperator(buffer)) 198 | { 199 | printRoleOfToken(buffer); 200 | buffer = ""; 201 | } 202 | 203 | if(isSeparator(std::string(1, ch))) 204 | { 205 | if(!buffer.empty()) 206 | { 207 | printRoleOfToken(buffer); 208 | buffer = ""; 209 | } 210 | if(isSeparator(std::string(1, ch))) 211 | { 212 | printRoleOfToken(std::string(1, ch)); 213 | continue; 214 | } 215 | } 216 | buffer += ch; 217 | } 218 | file.close(); 219 | } 220 | --------------------------------------------------------------------------------