├── .github └── FUNDING.yml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── ConnectionSettingsDlg.cpp ├── ConnectionSettingsDlg.h ├── HLinkCtrl.cpp ├── HLinkCtrl.h ├── HtmlToText.cpp ├── HtmlToText.h ├── LICENSE ├── ODBCWrappers.h ├── OldImplementation.py ├── README.md ├── ReadMe.txt ├── ReleaseNotes.html ├── Resource.h ├── SoftwareContextRegister.html ├── UnquoteHTML.cpp ├── VersionInfo.cpp ├── VersionInfo.h ├── WebSearchEngine-MySQL.png ├── WebSearchEngine.cpp ├── WebSearchEngine.h ├── WebSearchEngine.png ├── WebSearchEngine.py ├── WebSearchEngine.rc ├── WebSearchEngine.sln ├── WebSearchEngine.vcxproj ├── WebSearchEngine.vcxproj.filters ├── WebSearchEngineDlg.cpp ├── WebSearchEngineDlg.h ├── WebSearchEngineExt.cpp ├── WebSearchEngineExt.h ├── index.html ├── res ├── WebSearchEngine.ico ├── WebSearchEngine.png └── WebSearchEngine.rc2 ├── search.php ├── search.sql ├── stdafx.cpp ├── stdafx.h ├── targetver.h └── x64 └── Release └── WebSearchEngine.exe /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] 4 | patreon: # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: # Replace with a single Ko-fi username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry 13 | custom: "https://www.paypal.com/donate/?business=X57JM9BLZFXXA&amount=10&no_recurring=0&item_name=WebSearchEngine&currency_code=EUR" 14 | 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by https://www.gitignore.io/api/visualstudio 2 | # Edit at https://www.gitignore.io/?templates=visualstudio 3 | 4 | ### VisualStudio ### 5 | ## Ignore Visual Studio temporary files, build results, and 6 | ## files generated by popular Visual Studio add-ons. 7 | ## 8 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 9 | 10 | # User-specific files 11 | *.rsuser 12 | *.suo 13 | *.user 14 | *.userosscache 15 | *.sln.docstates 16 | 17 | # User-specific files (MonoDevelop/Xamarin Studio) 18 | *.userprefs 19 | 20 | # Mono auto generated files 21 | mono_crash.* 22 | 23 | # Build results 24 | [Dd]ebug/ 25 | [Dd]ebugPublic/ 26 | [Rr]elease/ 27 | [Rr]eleases/ 28 | x64/ 29 | x86/ 30 | [Aa][Rr][Mm]/ 31 | [Aa][Rr][Mm]64/ 32 | bld/ 33 | [Bb]in/ 34 | [Oo]bj/ 35 | [Ll]og/ 36 | 37 | # Visual Studio 2015/2017 cache/options directory 38 | .vs/ 39 | # Uncomment if you have tasks that create the project's static files in wwwroot 40 | #wwwroot/ 41 | 42 | # Visual Studio 2017 auto generated files 43 | Generated\ Files/ 44 | 45 | # MSTest test Results 46 | [Tt]est[Rr]esult*/ 47 | [Bb]uild[Ll]og.* 48 | 49 | # NUnit 50 | *.VisualState.xml 51 | TestResult.xml 52 | nunit-*.xml 53 | 54 | # Build Results of an ATL Project 55 | [Dd]ebugPS/ 56 | [Rr]eleasePS/ 57 | dlldata.c 58 | 59 | # Benchmark Results 60 | BenchmarkDotNet.Artifacts/ 61 | 62 | # .NET Core 63 | project.lock.json 64 | project.fragment.lock.json 65 | artifacts/ 66 | 67 | # StyleCop 68 | StyleCopReport.xml 69 | 70 | # Files built by Visual Studio 71 | *_i.c 72 | *_p.c 73 | *_h.h 74 | *.ilk 75 | *.meta 76 | *.obj 77 | *.iobj 78 | *.pch 79 | *.pdb 80 | *.ipdb 81 | *.pgc 82 | *.pgd 83 | *.rsp 84 | *.sbr 85 | *.tlb 86 | *.tli 87 | *.tlh 88 | *.tmp 89 | *.tmp_proj 90 | *_wpftmp.csproj 91 | *.log 92 | *.vspscc 93 
| *.vssscc 94 | .builds 95 | *.pidb 96 | *.svclog 97 | *.scc 98 | 99 | # Chutzpah Test files 100 | _Chutzpah* 101 | 102 | # Visual C++ cache files 103 | ipch/ 104 | *.aps 105 | *.ncb 106 | *.opendb 107 | *.opensdf 108 | *.sdf 109 | *.cachefile 110 | *.VC.db 111 | *.VC.VC.opendb 112 | 113 | # Visual Studio profiler 114 | *.psess 115 | *.vsp 116 | *.vspx 117 | *.sap 118 | 119 | # Visual Studio Trace Files 120 | *.e2e 121 | 122 | # TFS 2012 Local Workspace 123 | $tf/ 124 | 125 | # Guidance Automation Toolkit 126 | *.gpState 127 | 128 | # ReSharper is a .NET coding add-in 129 | _ReSharper*/ 130 | *.[Rr]e[Ss]harper 131 | *.DotSettings.user 132 | 133 | # JustCode is a .NET coding add-in 134 | .JustCode 135 | 136 | # TeamCity is a build add-in 137 | _TeamCity* 138 | 139 | # DotCover is a Code Coverage Tool 140 | *.dotCover 141 | 142 | # AxoCover is a Code Coverage Tool 143 | .axoCover/* 144 | !.axoCover/settings.json 145 | 146 | # Visual Studio code coverage results 147 | *.coverage 148 | *.coveragexml 149 | 150 | # NCrunch 151 | _NCrunch_* 152 | .*crunch*.local.xml 153 | nCrunchTemp_* 154 | 155 | # MightyMoose 156 | *.mm.* 157 | AutoTest.Net/ 158 | 159 | # Web workbench (sass) 160 | .sass-cache/ 161 | 162 | # Installshield output folder 163 | [Ee]xpress/ 164 | 165 | # DocProject is a documentation generator add-in 166 | DocProject/buildhelp/ 167 | DocProject/Help/*.HxT 168 | DocProject/Help/*.HxC 169 | DocProject/Help/*.hhc 170 | DocProject/Help/*.hhk 171 | DocProject/Help/*.hhp 172 | DocProject/Help/Html2 173 | DocProject/Help/html 174 | 175 | # Click-Once directory 176 | publish/ 177 | 178 | # Publish Web Output 179 | *.[Pp]ublish.xml 180 | *.azurePubxml 181 | # Note: Comment the next line if you want to checkin your web deploy settings, 182 | # but database connection strings (with potential passwords) will be unencrypted 183 | *.pubxml 184 | *.publishproj 185 | 186 | # Microsoft Azure Web App publish settings. 
Comment the next line if you want to 187 | # checkin your Azure Web App publish settings, but sensitive information contained 188 | # in these scripts will be unencrypted 189 | PublishScripts/ 190 | 191 | # NuGet Packages 192 | *.nupkg 193 | # NuGet Symbol Packages 194 | *.snupkg 195 | # The packages folder can be ignored because of Package Restore 196 | **/[Pp]ackages/* 197 | # except build/, which is used as an MSBuild target. 198 | !**/[Pp]ackages/build/ 199 | # Uncomment if necessary however generally it will be regenerated when needed 200 | #!**/[Pp]ackages/repositories.config 201 | # NuGet v3's project.json files produces more ignorable files 202 | *.nuget.props 203 | *.nuget.targets 204 | 205 | # Microsoft Azure Build Output 206 | csx/ 207 | *.build.csdef 208 | 209 | # Microsoft Azure Emulator 210 | ecf/ 211 | rcf/ 212 | 213 | # Windows Store app package directories and files 214 | AppPackages/ 215 | BundleArtifacts/ 216 | Package.StoreAssociation.xml 217 | _pkginfo.txt 218 | *.appx 219 | *.appxbundle 220 | *.appxupload 221 | 222 | # Visual Studio cache files 223 | # files ending in .cache can be ignored 224 | *.[Cc]ache 225 | # but keep track of directories ending in .cache 226 | !?*.[Cc]ache/ 227 | 228 | # Others 229 | ClientBin/ 230 | ~$* 231 | *~ 232 | *.dbmdl 233 | *.dbproj.schemaview 234 | *.jfm 235 | *.pfx 236 | *.publishsettings 237 | orleans.codegen.cs 238 | 239 | # Including strong name files can present a security risk 240 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 241 | #*.snk 242 | 243 | # Since there are multiple workflows, uncomment next line to ignore bower_components 244 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 245 | #bower_components/ 246 | 247 | # RIA/Silverlight projects 248 | Generated_Code/ 249 | 250 | # Backup & report files from converting an old project file 251 | # to a newer Visual Studio version. 
Backup files are not needed, 252 | # because we have git ;-) 253 | _UpgradeReport_Files/ 254 | Backup*/ 255 | UpgradeLog*.XML 256 | UpgradeLog*.htm 257 | ServiceFabricBackup/ 258 | *.rptproj.bak 259 | 260 | # SQL Server files 261 | *.mdf 262 | *.ldf 263 | *.ndf 264 | 265 | # Business Intelligence projects 266 | *.rdl.data 267 | *.bim.layout 268 | *.bim_*.settings 269 | *.rptproj.rsuser 270 | *- [Bb]ackup.rdl 271 | *- [Bb]ackup ([0-9]).rdl 272 | *- [Bb]ackup ([0-9][0-9]).rdl 273 | 274 | # Microsoft Fakes 275 | FakesAssemblies/ 276 | 277 | # GhostDoc plugin setting file 278 | *.GhostDoc.xml 279 | 280 | # Node.js Tools for Visual Studio 281 | .ntvs_analysis.dat 282 | node_modules/ 283 | 284 | # Visual Studio 6 build log 285 | *.plg 286 | 287 | # Visual Studio 6 workspace options file 288 | *.opt 289 | 290 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 291 | *.vbw 292 | 293 | # Visual Studio LightSwitch build output 294 | **/*.HTMLClient/GeneratedArtifacts 295 | **/*.DesktopClient/GeneratedArtifacts 296 | **/*.DesktopClient/ModelManifest.xml 297 | **/*.Server/GeneratedArtifacts 298 | **/*.Server/ModelManifest.xml 299 | _Pvt_Extensions 300 | 301 | # Paket dependency manager 302 | .paket/paket.exe 303 | paket-files/ 304 | 305 | # FAKE - F# Make 306 | .fake/ 307 | 308 | # CodeRush personal settings 309 | .cr/personal 310 | 311 | # Python Tools for Visual Studio (PTVS) 312 | __pycache__/ 313 | *.pyc 314 | 315 | # Cake - Uncomment if you are using it 316 | # tools/** 317 | # !tools/packages.config 318 | 319 | # Tabs Studio 320 | *.tss 321 | 322 | # Telerik's JustMock configuration file 323 | *.jmconfig 324 | 325 | # BizTalk build output 326 | *.btp.cs 327 | *.btm.cs 328 | *.odx.cs 329 | *.xsd.cs 330 | 331 | # OpenCover UI analysis results 332 | OpenCover/ 333 | 334 | # Azure Stream Analytics local run output 335 | ASALocalRun/ 336 | 337 | # MSBuild Binary and Structured Log 338 | *.binlog 339 | 340 | # NVidia Nsight GPU debugger 
configuration file 341 | *.nvuser 342 | 343 | # MFractors (Xamarin productivity tool) working folder 344 | .mfractor/ 345 | 346 | # Local History for Visual Studio 347 | .localhistory/ 348 | 349 | # BeatPulse healthcheck temp database 350 | healthchecksdb 351 | 352 | # Backup folder for Package Reference Convert tool in Visual Studio 2017 353 | MigrationBackup/ 354 | 355 | # End of https://www.gitignore.io/api/visualstudio 356 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | 2 | # Contributor Covenant Code of Conduct 3 | 4 | ## Our Pledge 5 | 6 | We as members, contributors, and leaders pledge to make participation in our 7 | community a harassment-free experience for everyone, regardless of age, body 8 | size, visible or invisible disability, ethnicity, sex characteristics, gender 9 | identity and expression, level of experience, education, socio-economic status, 10 | nationality, personal appearance, race, religion, or sexual identity 11 | and orientation. 12 | 13 | We pledge to act and interact in ways that contribute to an open, welcoming, 14 | diverse, inclusive, and healthy community. 
15 | 16 | ## Our Standards 17 | 18 | Examples of behavior that contributes to a positive environment for our 19 | community include: 20 | 21 | * Demonstrating empathy and kindness toward other people 22 | * Being respectful of differing opinions, viewpoints, and experiences 23 | * Giving and gracefully accepting constructive feedback 24 | * Accepting responsibility and apologizing to those affected by our mistakes, 25 | and learning from the experience 26 | * Focusing on what is best not just for us as individuals, but for the 27 | overall community 28 | 29 | Examples of unacceptable behavior include: 30 | 31 | * The use of sexualized language or imagery, and sexual attention or 32 | advances of any kind 33 | * Trolling, insulting or derogatory comments, and personal or political attacks 34 | * Public or private harassment 35 | * Publishing others' private information, such as a physical or email 36 | address, without their explicit permission 37 | * Other conduct which could reasonably be considered inappropriate in a 38 | professional setting 39 | 40 | ## Enforcement Responsibilities 41 | 42 | Community leaders are responsible for clarifying and enforcing our standards of 43 | acceptable behavior and will take appropriate and fair corrective action in 44 | response to any behavior that they deem inappropriate, threatening, offensive, 45 | or harmful. 46 | 47 | Community leaders have the right and responsibility to remove, edit, or reject 48 | comments, commits, code, wiki edits, issues, and other contributions that are 49 | not aligned to this Code of Conduct, and will communicate reasons for moderation 50 | decisions when appropriate. 51 | 52 | ## Scope 53 | 54 | This Code of Conduct applies within all community spaces, and also applies when 55 | an individual is officially representing the community in public spaces. 
56 | Examples of representing our community include using an official e-mail address, 57 | posting via an official social media account, or acting as an appointed 58 | representative at an online or offline event. 59 | 60 | ## Enforcement 61 | 62 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 63 | reported to the community leaders responsible for enforcement at 64 | [stefan-mihai@moga.doctor](mailto:stefan-mihai@moga.doctor). 65 | All complaints will be reviewed and investigated promptly and fairly. 66 | 67 | All community leaders are obligated to respect the privacy and security of the 68 | reporter of any incident. 69 | 70 | ## Enforcement Guidelines 71 | 72 | Community leaders will follow these Community Impact Guidelines in determining 73 | the consequences for any action they deem in violation of this Code of Conduct: 74 | 75 | ### 1. Correction 76 | 77 | **Community Impact**: Use of inappropriate language or other behavior deemed 78 | unprofessional or unwelcome in the community. 79 | 80 | **Consequence**: A private, written warning from community leaders, providing 81 | clarity around the nature of the violation and an explanation of why the 82 | behavior was inappropriate. A public apology may be requested. 83 | 84 | ### 2. Warning 85 | 86 | **Community Impact**: A violation through a single incident or series 87 | of actions. 88 | 89 | **Consequence**: A warning with consequences for continued behavior. No 90 | interaction with the people involved, including unsolicited interaction with 91 | those enforcing the Code of Conduct, for a specified period of time. This 92 | includes avoiding interactions in community spaces as well as external channels 93 | like social media. Violating these terms may lead to a temporary or 94 | permanent ban. 95 | 96 | ### 3. Temporary Ban 97 | 98 | **Community Impact**: A serious violation of community standards, including 99 | sustained inappropriate behavior. 
100 | 101 | **Consequence**: A temporary ban from any sort of interaction or public 102 | communication with the community for a specified period of time. No public or 103 | private interaction with the people involved, including unsolicited interaction 104 | with those enforcing the Code of Conduct, is allowed during this period. 105 | Violating these terms may lead to a permanent ban. 106 | 107 | ### 4. Permanent Ban 108 | 109 | **Community Impact**: Demonstrating a pattern of violation of community 110 | standards, including sustained inappropriate behavior, harassment of an 111 | individual, or aggression toward or disparagement of classes of individuals. 112 | 113 | **Consequence**: A permanent ban from any sort of public interaction within 114 | the community. 115 | 116 | ## Attribution 117 | 118 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 119 | version 2.0, available at 120 | [https://www.contributor-covenant.org/version/2/0/code_of_conduct.html][v2.0]. 121 | 122 | Community Impact Guidelines were inspired by 123 | [Mozilla's code of conduct enforcement ladder][Mozilla CoC]. 124 | 125 | For answers to common questions about this code of conduct, see the FAQ at 126 | [https://www.contributor-covenant.org/faq][FAQ]. Translations are available 127 | at [https://www.contributor-covenant.org/translations][translations]. 
128 | 129 | [homepage]: https://www.contributor-covenant.org 130 | [v2.0]: https://www.contributor-covenant.org/version/2/0/code_of_conduct.html 131 | [Mozilla CoC]: https://github.com/mozilla/diversity 132 | [FAQ]: https://www.contributor-covenant.org/faq 133 | [translations]: https://www.contributor-covenant.org/translations 134 | 135 | -------------------------------------------------------------------------------- /ConnectionSettingsDlg.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2022-2025 Stefan-Mihai MOGA 2 | This file is part of WebSearchEngine application developed by Stefan-Mihai MOGA. 3 | 4 | WebSearchEngine is free software: you can redistribute it and/or modify it 5 | under the terms of the GNU General Public License as published by the Open 6 | Source Initiative, either version 3 of the License, or any later version. 7 | 8 | WebSearchEngine is distributed in the hope that it will be useful, but 9 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 10 | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 11 | 12 | You should have received a copy of the GNU General Public License along with 13 | WebSearchEngine. If not, see */ 14 | 15 | // ConnectionSettingsDlg.cpp : implementation file 16 | // 17 | 18 | #include "stdafx.h" 19 | #include "WebSearchEngineDlg.h" 20 | #include "ConnectionSettingsDlg.h" 21 | 22 | #include 23 | #include 24 | #pragma comment(lib, "crypt32") 25 | #pragma comment(lib, "advapi32") 26 | 27 | // This method is used to display the last error generated by Crypto API calls 28 | void DisplayLastError(LPCTSTR lpszOperation) 29 | { 30 | //Display a message and the last error in the TRACE. 
31 | LPVOID lpMsgBuf = nullptr; 32 | CString strLastError = _T("[CryptoAPI] "); 33 | 34 | FormatMessage( 35 | FORMAT_MESSAGE_ALLOCATE_BUFFER | 36 | FORMAT_MESSAGE_FROM_SYSTEM | 37 | FORMAT_MESSAGE_IGNORE_INSERTS, 38 | nullptr, 39 | GetLastError(), 40 | MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), 41 | (LPTSTR)&lpMsgBuf, 42 | 0, 43 | nullptr); 44 | strLastError += lpszOperation; 45 | strLastError += (LPCTSTR)lpMsgBuf; 46 | //Trim CR/LF from the error message. 47 | strLastError.TrimRight(); 48 | 49 | //Display the last error. 50 | TRACE(_T("%s\n"), static_cast(strLastError)); 51 | AfxMessageBox(strLastError, MB_OK | MB_ICONERROR); 52 | 53 | // free alocated buffer by FormatMessage 54 | LocalFree(lpMsgBuf); 55 | } // DisplayLastError(LPCTSTR lpszOperation) 56 | 57 | /// Returns a decrypted password read from Windows Registry, using Crypto API calls 58 | bool GetRegistryPassword(LPCTSTR lpszCryptoKey, LPCTSTR lpszSection, LPCTSTR lpszEntry, LPTSTR lpszValue, LPCTSTR lpszDefault) 59 | { 60 | if (!lpszSection || !lpszEntry || !lpszValue) 61 | return false; 62 | 63 | if (!USE_CRYPTO_METHODS) 64 | { 65 | return (_tcscpy(lpszValue, AfxGetApp()->GetProfileString(lpszSection, lpszEntry, lpszDefault)) != NULL); 66 | } 67 | 68 | if (!lpszCryptoKey) 69 | lpszCryptoKey = AfxGetAppName(); 70 | 71 | LPBYTE lpcbPassword = (LPBYTE)lpszCryptoKey; 72 | const DWORD dwPasswordLen = (DWORD)(sizeof(TCHAR) * _tcslen(lpszCryptoKey)); 73 | 74 | BYTE lpcbDataValue[PASSWORD_MAXLENGTH]; 75 | DWORD dwHowManyBytes = 0; 76 | LPBYTE lpcbTempBuffer = NULL; 77 | 78 | bool bDecryptionDone = false; 79 | HCRYPTPROV hCryptoProvider = NULL; 80 | HCRYPTHASH hCryptoHash = NULL; 81 | HCRYPTKEY hCryptoKey = NULL; 82 | 83 | if (AfxGetApp()->GetProfileBinary(lpszSection, lpszEntry, (LPBYTE *)&lpcbTempBuffer, (UINT *)&dwHowManyBytes)) 84 | { 85 | if (dwHowManyBytes != 0) 86 | { 87 | ZeroMemory(lpcbDataValue, sizeof(lpcbDataValue)); 88 | CopyMemory(lpcbDataValue, lpcbTempBuffer, dwHowManyBytes); 89 | 90 | if 
(CryptAcquireContext(&hCryptoProvider, NULL, NULL, PROV_RSA_FULL, 0)) 91 | { 92 | if (CryptCreateHash(hCryptoProvider, CALG_MD5, NULL, 0, &hCryptoHash)) 93 | { 94 | if (CryptHashData(hCryptoHash, lpcbPassword, dwPasswordLen, 0)) 95 | { 96 | if (CryptDeriveKey(hCryptoProvider, CALG_RC4, hCryptoHash, CRYPT_EXPORTABLE, &hCryptoKey)) 97 | { 98 | if (CryptDecrypt(hCryptoKey, NULL, TRUE, 0, lpcbDataValue, &dwHowManyBytes)) 99 | { 100 | bDecryptionDone = true; 101 | _tcscpy(lpszValue, (LPTSTR)lpcbDataValue); 102 | } 103 | else 104 | { 105 | DisplayLastError(_T("CryptDecrypt: ")); 106 | } 107 | VERIFY(CryptDestroyKey(hCryptoKey)); 108 | } 109 | else 110 | { 111 | DisplayLastError(_T("CryptDeriveKey: ")); 112 | } 113 | } 114 | else 115 | { 116 | DisplayLastError(_T("CryptHashData: ")); 117 | } 118 | VERIFY(CryptDestroyHash(hCryptoHash)); 119 | } 120 | else 121 | { 122 | DisplayLastError(_T("CryptCreateHash: ")); 123 | } 124 | VERIFY(CryptReleaseContext(hCryptoProvider, 0)); 125 | } 126 | else 127 | { 128 | DisplayLastError(_T("CryptAcquireContext: ")); 129 | } 130 | } 131 | } 132 | else 133 | { 134 | if (!dwHowManyBytes) 135 | { 136 | _tcscpy(lpszValue, lpszDefault); 137 | bDecryptionDone = true; 138 | } 139 | } 140 | 141 | if (lpcbTempBuffer != NULL) 142 | delete lpcbTempBuffer; 143 | 144 | return bDecryptionDone; 145 | } // GetRegistryPassword( LPCTSTR lpszCryptoKey, LPCTSTR lpszSection, LPCTSTR lpszEntry, LPTSTR lpszValue, LPCTSTR lpszDefault ) 146 | 147 | /// Writes an encrypted password to Windows Registry, using Crypto API calls 148 | bool SetRegistryPassword(LPCTSTR lpszCryptoKey, LPCTSTR lpszSection, LPCTSTR lpszEntry, LPTSTR lpszValue) 149 | { 150 | if (!lpszSection || !lpszEntry || !lpszValue) 151 | return false; 152 | 153 | if (!USE_CRYPTO_METHODS) 154 | { 155 | return AfxGetApp()->WriteProfileString(lpszSection, lpszEntry, lpszValue); 156 | } 157 | 158 | if (!lpszCryptoKey) 159 | lpszCryptoKey = AfxGetAppName(); 160 | 161 | LPBYTE lpcbPassword = 
(LPBYTE)lpszCryptoKey; 162 | const DWORD dwPasswordLen = (DWORD)(sizeof(TCHAR) * _tcslen(lpszCryptoKey)); 163 | 164 | BYTE lpcbDataValue[PASSWORD_MAXLENGTH]; 165 | const DWORD dwDataValueLen = PASSWORD_MAXLENGTH; 166 | DWORD dwHowManyBytes = dwDataValueLen; 167 | 168 | bool bEncryptionDone = false; 169 | HCRYPTPROV hCryptoProvider = NULL; 170 | HCRYPTHASH hCryptoHash = NULL; 171 | HCRYPTKEY hCryptoKey = NULL; 172 | 173 | ZeroMemory(lpcbDataValue, sizeof(lpcbDataValue)); 174 | CopyMemory(lpcbDataValue, lpszValue, dwDataValueLen); 175 | 176 | if (CryptAcquireContext(&hCryptoProvider, NULL, NULL, PROV_RSA_FULL, 0)) 177 | { 178 | if (CryptCreateHash(hCryptoProvider, CALG_MD5, NULL, 0, &hCryptoHash)) 179 | { 180 | if (CryptHashData(hCryptoHash, lpcbPassword, dwPasswordLen, 0)) 181 | { 182 | if (CryptDeriveKey(hCryptoProvider, CALG_RC4, hCryptoHash, CRYPT_EXPORTABLE, &hCryptoKey)) 183 | { 184 | if (CryptEncrypt(hCryptoKey, NULL, TRUE, 0, lpcbDataValue, &dwHowManyBytes, dwDataValueLen)) 185 | { 186 | bEncryptionDone = AfxGetApp()->WriteProfileBinary(lpszSection, lpszEntry, lpcbDataValue, (UINT)dwHowManyBytes); 187 | } 188 | else 189 | { 190 | DisplayLastError(_T("CryptEncrypt: ")); 191 | } 192 | VERIFY(CryptDestroyKey(hCryptoKey)); 193 | } 194 | else 195 | { 196 | DisplayLastError(_T("CryptDeriveKey: ")); 197 | } 198 | } 199 | else 200 | { 201 | DisplayLastError(_T("CryptHashData: ")); 202 | } 203 | VERIFY(CryptDestroyHash(hCryptoHash)); 204 | } 205 | else 206 | { 207 | DisplayLastError(_T("CryptCreateHash: ")); 208 | } 209 | VERIFY(CryptReleaseContext(hCryptoProvider, 0)); 210 | } 211 | else 212 | { 213 | DisplayLastError(_T("CryptAcquireContext: ")); 214 | } 215 | 216 | return bEncryptionDone; 217 | } // SetRegistryPassword( LPCTSTR lpszCryptoKey, LPCTSTR lpszSection, LPCTSTR lpszEntry, LPTSTR lpszValue ) 218 | 219 | // CConnectionSettingsDlg dialog 220 | 221 | IMPLEMENT_DYNAMIC(CConnectionSettingsDlg, CDialogEx) 222 | 223 | 
CConnectionSettingsDlg::CConnectionSettingsDlg(CWnd* pParent /*=NULL*/) 224 | : CDialogEx(CConnectionSettingsDlg::IDD, pParent) 225 | { 226 | } 227 | 228 | CConnectionSettingsDlg::~CConnectionSettingsDlg() 229 | { 230 | } 231 | 232 | void CConnectionSettingsDlg::DoDataExchange(CDataExchange* pDX) 233 | { 234 | CDialogEx::DoDataExchange(pDX); 235 | DDX_Control(pDX, IDC_HOSTNAME, m_editHostName); 236 | DDX_Control(pDX, IDC_HOSTPORT, m_editHostPort); 237 | DDX_Control(pDX, IDC_DATABASE, m_editDatabase); 238 | DDX_Control(pDX, IDC_USERNAME, m_editUsername); 239 | DDX_Control(pDX, IDC_PASSWORD, m_editPassword); 240 | } 241 | 242 | BEGIN_MESSAGE_MAP(CConnectionSettingsDlg, CDialogEx) 243 | ON_WM_DESTROY() 244 | ON_BN_CLICKED(IDOK, OnBnClickedOk) 245 | ON_BN_CLICKED(IDCANCEL, OnBnClickedCancel) 246 | END_MESSAGE_MAP() 247 | 248 | 249 | // CConnectionSettingsDlg message handlers 250 | 251 | BOOL CConnectionSettingsDlg::OnInitDialog() 252 | { 253 | CDialogEx::OnInitDialog(); 254 | 255 | m_editHostName.SetLimitText(64); 256 | m_editHostPort.SetLimitText(64); 257 | m_editDatabase.SetLimitText(64); 258 | m_editUsername.SetLimitText(64); 259 | m_editPassword.SetLimitText(64); 260 | 261 | CString strHostName, strHostPort, strDatabase, strUsername; TCHAR lpszPassword[0x100]; 262 | 263 | CWinApp* pWinApp = AfxGetApp(); 264 | ASSERT(pWinApp != NULL); 265 | 266 | strHostName = pWinApp->GetProfileString(REGKEY_SECTION, REGKEY_HOSTNAME, DEFAULT_HOSTNAME); 267 | strHostPort = pWinApp->GetProfileString(REGKEY_SECTION, REGKEY_HOSTPORT, DEFAULT_HOSTPORT); 268 | strDatabase = pWinApp->GetProfileString(REGKEY_SECTION, REGKEY_DATABASE, DEFAULT_DATABASE); 269 | strUsername = pWinApp->GetProfileString(REGKEY_SECTION, REGKEY_USERNAME, DEFAULT_USERNAME); 270 | 271 | VERIFY(GetRegistryPassword(NULL, REGKEY_SECTION, REGKEY_PASSWORD, lpszPassword, DEFAULT_PASSWORD)); 272 | 273 | m_editHostName.SetWindowText(strHostName); 274 | m_editHostPort.SetWindowText(strHostPort); 275 | 
m_editDatabase.SetWindowText(strDatabase); 276 | m_editUsername.SetWindowText(strUsername); 277 | m_editPassword.SetWindowText(lpszPassword); 278 | 279 | return TRUE; // return TRUE unless you set the focus to a control 280 | // EXCEPTION: OCX Property Pages should return FALSE 281 | } 282 | 283 | void CConnectionSettingsDlg::OnDestroy() 284 | { 285 | CDialogEx::OnDestroy(); 286 | } 287 | 288 | void CConnectionSettingsDlg::OnBnClickedOk() 289 | { 290 | CString strHostName, strHostPort, strDatabase, strUsername, strPassword; 291 | 292 | CWinApp* pWinApp = AfxGetApp(); 293 | ASSERT(pWinApp != NULL); 294 | 295 | m_editHostName.GetWindowText(strHostName); 296 | m_editHostPort.GetWindowText(strHostPort); 297 | m_editDatabase.GetWindowText(strDatabase); 298 | m_editUsername.GetWindowText(strUsername); 299 | m_editPassword.GetWindowText(strPassword); 300 | 301 | if (strHostName.IsEmpty()) 302 | { 303 | MessageBox(_T("Error: Server field cannot be empty! Please fill it in and try again."), _T("WebSearchEngine"), MB_OK | MB_ICONWARNING); 304 | return; 305 | } 306 | 307 | if (strDatabase.IsEmpty()) 308 | { 309 | MessageBox(_T("Error: Database field cannot be empty! 
Please fill it in and try again."), _T("WebSearchEngine"), MB_OK | MB_ICONWARNING); 310 | return; 311 | } 312 | 313 | VERIFY(pWinApp->WriteProfileString(REGKEY_SECTION, REGKEY_HOSTNAME, strHostName)); 314 | VERIFY(pWinApp->WriteProfileString(REGKEY_SECTION, REGKEY_HOSTPORT, strHostPort)); 315 | VERIFY(pWinApp->WriteProfileString(REGKEY_SECTION, REGKEY_DATABASE, strDatabase)); 316 | VERIFY(pWinApp->WriteProfileString(REGKEY_SECTION, REGKEY_USERNAME, strUsername)); 317 | 318 | VERIFY(SetRegistryPassword(NULL, REGKEY_SECTION, REGKEY_PASSWORD, strPassword.GetBuffer(0))); 319 | strPassword.ReleaseBuffer(); 320 | 321 | CDialogEx::EndDialog(IDOK); 322 | } 323 | 324 | void CConnectionSettingsDlg::OnBnClickedCancel() 325 | { 326 | CDialogEx::EndDialog(IDCANCEL); 327 | PostQuitMessage(-1); 328 | } 329 | -------------------------------------------------------------------------------- /ConnectionSettingsDlg.h: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2022-2025 Stefan-Mihai MOGA 2 | This file is part of WebSearchEngine application developed by Stefan-Mihai MOGA. 3 | 4 | WebSearchEngine is free software: you can redistribute it and/or modify it 5 | under the terms of the GNU General Public License as published by the Open 6 | Source Initiative, either version 3 of the License, or any later version. 7 | 8 | WebSearchEngine is distributed in the hope that it will be useful, but 9 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 10 | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 11 | 12 | You should have received a copy of the GNU General Public License along with 13 | WebSearchEngine. 
If not, see <https://www.gnu.org/licenses/>. */

// ConnectionSettingsDlg.h : header file
//
// Declares the registry password helpers and the connection settings dialog.

#ifndef __CONNECTIONSETTINGSDLG__
#define __CONNECTIONSETTINGSDLG__

#pragma once

#include "Resource.h"

// set the following value to FALSE if you don't want to use Crypto API calls
// (the password is then read and written as a plain profile string)
#define USE_CRYPTO_METHODS FALSE

// maximum length of the password stored as array of BYTEs in Windows Registry
#define PASSWORD_MAXLENGTH 128

// Reads the stored password for lpszSection/lpszEntry into lpszValue.
// lpszCryptoKey may be NULL, in which case AfxGetAppName() is used to derive
// the key. lpszDefault is returned when nothing is stored. The size of
// lpszValue is not passed in, so the caller must supply a large enough buffer.
// Returns false when lpszSection, lpszEntry or lpszValue is NULL or when
// decryption fails.
bool GetRegistryPassword(LPCTSTR lpszCryptoKey, LPCTSTR lpszSection, LPCTSTR lpszEntry, LPTSTR lpszValue, LPCTSTR lpszDefault);

// Stores lpszValue as the password for lpszSection/lpszEntry.
// lpszCryptoKey may be NULL, in which case AfxGetAppName() is used to derive
// the key. Returns false when lpszSection, lpszEntry or lpszValue is NULL or
// when encryption or the profile write fails.
bool SetRegistryPassword(LPCTSTR lpszCryptoKey, LPCTSTR lpszSection, LPCTSTR lpszEntry, LPTSTR lpszValue);

// CConnectionSettingsDlg dialog
// Lets the user edit host name, host port, database, user name and password.

class CConnectionSettingsDlg : public CDialogEx
{
	DECLARE_DYNAMIC(CConnectionSettingsDlg)

public:
	CConnectionSettingsDlg(CWnd* pParent = NULL); // standard constructor
	virtual ~CConnectionSettingsDlg();

// Dialog Data
	enum { IDD = IDD_CONNECTIONSETTINGSDLG };

protected:
	virtual void DoDataExchange(CDataExchange* pDX); // DDX/DDV support

protected:
	// Edit controls bound in DoDataExchange, one per connection setting.
	CEdit m_editHostName;
	CEdit m_editHostPort;
	CEdit m_editDatabase;
	CEdit m_editUsername;
	CEdit m_editPassword;

protected:
	virtual BOOL OnInitDialog();       // loads the stored settings into the controls
	afx_msg void OnDestroy();
	afx_msg void OnBnClickedOk();      // validates and stores the settings
	afx_msg void OnBnClickedCancel();  // ends the dialog and posts a quit message

	DECLARE_MESSAGE_MAP()
};

#endif // __CONNECTIONSETTINGSDLG__
-------------------------------------------------------------------------------- /HLinkCtrl.h: -------------------------------------------------------------------------------- 1 | /* 2 | Module : HLinkCtrl.h 3 | Purpose: Interface for a MFC class for a static text control class with hyperlink support 4 | Created: PJN / 16-06-1997 5 | 6 | Copyright (c) 1997 - 2022 by PJ Naughter (Web: www.naughter.com, Email: 
pjna@naughter.com) 7 | 8 | All rights reserved. 9 | 10 | Copyright / Usage Details: 11 | 12 | You are allowed to include the source code in any product (commercial, shareware, freeware or otherwise) 13 | when your product is released in binary form. You are allowed to modify the source code in any way you want 14 | except you cannot modify the copyright details at the top of each module. If you want to distribute source 15 | code with your application, then you are only allowed to distribute versions released by the author. This is 16 | to maintain a single distribution point for the source code. 17 | 18 | */ 19 | 20 | 21 | /////////////////////////// Defines /////////////////////////////////////////// 22 | 23 | #pragma once 24 | 25 | #ifndef __HLINKCTRL_H__ 26 | #define __HLINKCTRL_H__ 27 | 28 | #ifndef HLINKCTRL_EXT_CLASS 29 | #define HLINKCTRL_EXT_CLASS 30 | #endif //#ifndef HLINKCTRL_EXT_CLASS 31 | 32 | 33 | /////////////////////////// Classes /////////////////////////////////////////// 34 | 35 | class HLINKCTRL_EXT_CLASS CHLinkCtrl : public CStatic 36 | { 37 | public: 38 | //Enums 39 | enum class TriStateSetting 40 | { 41 | YES, 42 | NO, 43 | HOVER 44 | }; 45 | 46 | //Constructors / Destructors 47 | CHLinkCtrl(); 48 | 49 | //Methods 50 | //Set or get the hyperlink to use 51 | void SetHyperLink(_In_ const CString& sLink); 52 | _NODISCARD CString GetHyperLink() const { return m_sLink; }; 53 | 54 | //Set or get the hyperlink color 55 | void SetLinkColor(_In_ const COLORREF& color); 56 | _NODISCARD COLORREF GetLinkColor() const noexcept { return m_Color; }; 57 | 58 | //Set or get the hyperlink color for visited links 59 | void SetVisitedLinkColor(_In_ const COLORREF& color); 60 | _NODISCARD COLORREF GetVisitedLinkColor() const noexcept { return m_VisitedColor; }; 61 | 62 | //Set or get the hyperlink color for highlighted links 63 | void SetHighlightLinkColor(_In_ const COLORREF& color); 64 | _NODISCARD COLORREF GetHighlightLinkColor() const noexcept { return 
m_HighlightColor; }; 65 | void SetUseHighlightColor(_In_ bool bUseHighlight) noexcept { m_bUseHighlight = bUseHighlight; }; 66 | 67 | //Set or get whether the hyperlink should be drawn underlined 68 | void SetUnderline(_In_ TriStateSetting underline); 69 | _NODISCARD TriStateSetting GetUnderline() const noexcept { return m_Underline; }; 70 | 71 | //Gets whether the hyperlink has been visited 72 | _NODISCARD bool GetVisited() const noexcept { return m_State == LinkState::VISITED; }; 73 | 74 | //Should tooltips be shown or not 75 | void SetToolTips(_In_ bool bToolTip); 76 | _NODISCARD bool GetToolTips() const noexcept { return m_bToolTips; }; 77 | 78 | //Set or get the ShellExecute verb to use 79 | void SetShellExecuteVerb(_In_ const CString& sVerb) { m_sShellExecuteVerb = sVerb; }; 80 | _NODISCARD CString GetShellExecuteVerb() const { return m_sShellExecuteVerb; }; 81 | 82 | //Executes the URL 83 | virtual bool Open(); 84 | 85 | //Free and cache the parent window's bitmap 86 | void FreeParentBitmap(); 87 | bool CacheParentBitmap(); 88 | 89 | protected: 90 | //Enums 91 | enum class LinkState 92 | { 93 | NOT_VISITED, 94 | VISITED, 95 | HIGHLIGHTED 96 | }; 97 | 98 | //Virtual methods 99 | COLORREF GetDefaultLinkColor(_In_z_ LPCTSTR pszSubKey, _In_z_ LPCTSTR lpszEntry, _In_ COLORREF clrDefault); 100 | bool GetDefaultLinkBoolean(_In_z_ LPCTSTR pszSubKey, _In_z_ LPCTSTR lpszEntry, _In_ bool bDefault); 101 | TriStateSetting GetDefaultLinkTriStateSetting(_In_z_ LPCTSTR pszSubKey, _In_z_ LPCTSTR lpszEntry, _In_ TriStateSetting defaultSettting); 102 | void PreSubclassWindow() override; 103 | INT_PTR OnToolHitTest(CPoint point, TOOLINFO* pTI) const override; 104 | 105 | //Message handlers 106 | afx_msg BOOL OnSetCursor(CWnd* pWnd, UINT nHitTest, UINT message); 107 | afx_msg void OnMouseMove(UINT nFlags, CPoint point); 108 | afx_msg HBRUSH CtlColor(CDC* pDC, UINT nCtlColor); 109 | afx_msg BOOL OnToolTipTextA(UINT id, NMHDR* pNMHDR, LRESULT* pResult); 110 | afx_msg BOOL 
OnToolTipTextW(UINT id, NMHDR* pNMHDR, LRESULT* pResult); 111 | afx_msg LRESULT OnNcHitTest(CPoint point); 112 | afx_msg LRESULT OnSetText(WPARAM wParam, LPARAM lParam); 113 | afx_msg void OnEnable(BOOL bEnable); 114 | afx_msg BOOL OnEraseBkgnd(CDC* pDC); 115 | afx_msg void OnMouseLeave(); 116 | afx_msg void OnStnClicked(); 117 | 118 | //Member variables 119 | CString m_sLink; 120 | HCURSOR m_hLinkCursor; 121 | COLORREF m_Color; 122 | COLORREF m_VisitedColor; 123 | COLORREF m_HighlightColor; 124 | bool m_bUseHighlight; 125 | TriStateSetting m_Underline; 126 | LinkState m_State; 127 | LinkState m_OldState; 128 | CFont m_UnderlineFont; 129 | bool m_bToolTips; 130 | CString m_sShellExecuteVerb; 131 | bool m_bTrackLeave; 132 | CBitmap m_bmpParent; 133 | 134 | DECLARE_MESSAGE_MAP() 135 | }; 136 | 137 | #endif //#ifndef __HLINKCTRL_H__ 138 | -------------------------------------------------------------------------------- /HtmlToText.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2022-2025 Stefan-Mihai MOGA 2 | This file is part of WebSearchEngine application developed by Stefan-Mihai MOGA. 3 | 4 | WebSearchEngine is free software: you can redistribute it and/or modify it 5 | under the terms of the GNU General Public License as published by the Open 6 | Source Initiative, either version 3 of the License, or any later version. 7 | 8 | WebSearchEngine is distributed in the hope that it will be useful, but 9 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 10 | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 11 | 12 | You should have received a copy of the GNU General Public License along with 13 | WebSearchEngine. 
If not, see */ 14 | 15 | #include "stdafx.h" 16 | #include "HtmlToText.h" 17 | #include 18 | 19 | CHtmlToText::CHtmlToText() 20 | { 21 | _tags.SetAt(_T("address"), _T("\n")); 22 | _tags.SetAt(_T("blockquote"), _T("\n")); 23 | _tags.SetAt(_T("div"), _T("\n")); 24 | _tags.SetAt(_T("dl"), _T("\n")); 25 | _tags.SetAt(_T("fieldset"), _T("\n")); 26 | _tags.SetAt(_T("form"), _T("\n")); 27 | _tags.SetAt(_T("h1"), _T("\n")); 28 | _tags.SetAt(_T("/h1"), _T("\n")); 29 | _tags.SetAt(_T("h2"), _T("\n")); 30 | _tags.SetAt(_T("/h2"), _T("\n")); 31 | _tags.SetAt(_T("h3"), _T("\n")); 32 | _tags.SetAt(_T("/h3"), _T("\n")); 33 | _tags.SetAt(_T("h4"), _T("\n")); 34 | _tags.SetAt(_T("/h4"), _T("\n")); 35 | _tags.SetAt(_T("h5"), _T("\n")); 36 | _tags.SetAt(_T("/h5"), _T("\n")); 37 | _tags.SetAt(_T("h6"), _T("\n")); 38 | _tags.SetAt(_T("/h6"), _T("\n")); 39 | _tags.SetAt(_T("p"), _T("\n")); 40 | _tags.SetAt(_T("/p"), _T("\n")); 41 | _tags.SetAt(_T("table"), _T("\n")); 42 | _tags.SetAt(_T("/table"), _T("\n")); 43 | _tags.SetAt(_T("ul"), _T("\n")); 44 | _tags.SetAt(_T("/ul"), _T("\n")); 45 | _tags.SetAt(_T("ol"), _T("\n")); 46 | _tags.SetAt(_T("/ol"), _T("\n")); 47 | _tags.SetAt(_T("/li"), _T("\n")); 48 | _tags.SetAt(_T("br"), _T("\n")); 49 | _tags.SetAt(_T("/td"), _T("\t")); 50 | _tags.SetAt(_T("/tr"), _T("\n")); 51 | _tags.SetAt(_T("/pre"), _T("\n")); 52 | 53 | _ignoreTags.AddTail(_T("script")); 54 | _ignoreTags.AddTail(_T("noscript")); 55 | _ignoreTags.AddTail(_T("style")); 56 | _ignoreTags.AddTail(_T("object")); 57 | } 58 | 59 | CHtmlToText::~CHtmlToText() 60 | { 61 | } 62 | 63 | const std::string& CHtmlToText::Convert(const std::string& html) 64 | { 65 | // Initialize state variables 66 | bool selfClosing = false; 67 | _html = html; 68 | _pos = 0; 69 | 70 | // Process input 71 | while (!EndOfText()) 72 | { 73 | if (Peek() == '<') 74 | { 75 | // HTML tag 76 | std::string tag = ParseTag(selfClosing); 77 | 78 | // Handle special tag cases 79 | if (tag.compare("body") == 0) 80 | { 81 | // 
Discard content before 82 | VERIFY(_text.empty()); 83 | } 84 | else if (tag.compare("/body") == 0) 85 | { 86 | // Discard content after 87 | _pos = _html.length(); 88 | } 89 | else if (tag.compare("pre") == 0) 90 | { 91 | // Enter preformatted mode 92 | _preformatted = true; 93 | EatWhitespaceToNextLine(); 94 | } 95 | else if (tag.compare("/pre") == 0) 96 | { 97 | // Exit preformatted mode 98 | _preformatted = false; 99 | } 100 | 101 | _text.append(" "); 102 | 103 | if (_ignoreTags.Find(CString(tag.c_str())) != NULL) 104 | EatInnerContent(tag); 105 | } 106 | else if (IsWhiteSpace(Peek())) 107 | { 108 | // Whitespace (treat all as space) 109 | _text += (_preformatted ? Peek() : ' '); 110 | MoveAhead(); 111 | } 112 | else 113 | { 114 | // Other text 115 | _text += Peek(); 116 | MoveAhead(); 117 | } 118 | } 119 | 120 | return _text; 121 | } 122 | 123 | char toclower(char ch) { return (char)tolower(ch); } 124 | 125 | std::string CHtmlToText::ParseTag(bool& selfClosing) 126 | { 127 | std::string tag; 128 | selfClosing = false; 129 | 130 | // Eat comments 131 | if (((_pos + 4) < _html.length()) && 132 | (_html[_pos] == '<') && 133 | (_html[_pos + 1] == '!') && 134 | (_html[_pos + 2] == '-') && 135 | (_html[_pos + 3] == '-')) 136 | { 137 | MoveAhead(); 138 | MoveAhead(); 139 | MoveAhead(); 140 | MoveAhead(); 141 | 142 | while (!EndOfText()) 143 | { 144 | if (((_pos + 3) < _html.length()) && 145 | (_html[_pos] == '-') && 146 | (_html[_pos + 1] == '-') && 147 | (_html[_pos + 2] == '>')) 148 | break; 149 | 150 | MoveAhead(); 151 | } 152 | 153 | MoveAhead(); 154 | MoveAhead(); 155 | MoveAhead(); 156 | EatWhitespace(); 157 | } 158 | 159 | // Eat scripts 160 | if (((_pos + 7) < _html.length()) && 161 | (_html[_pos] == '<') && 162 | (_html[_pos + 1] == 's') && 163 | (_html[_pos + 2] == 'c') && 164 | (_html[_pos + 3] == 'r') && 165 | (_html[_pos + 4] == 'i') && 166 | (_html[_pos + 5] == 'p') && 167 | (_html[_pos + 6] == 't')) 168 | { 169 | MoveAhead(); 170 | MoveAhead(); 171 | 
MoveAhead(); 172 | MoveAhead(); 173 | MoveAhead(); 174 | MoveAhead(); 175 | MoveAhead(); 176 | 177 | 178 | while (!EndOfText()) 179 | { 180 | if (((_pos + 7) < _html.length()) && 181 | (_html[_pos] == '/') && 182 | (_html[_pos + 1] == 's') && 183 | (_html[_pos + 2] == 'c') && 184 | (_html[_pos + 3] == 'r') && 185 | (_html[_pos + 4] == 'i') && 186 | (_html[_pos + 5] == 'p') && 187 | (_html[_pos + 6] == 't')) 188 | break; 189 | 190 | MoveAhead(); 191 | } 192 | 193 | MoveAhead(); 194 | MoveAhead(); 195 | MoveAhead(); 196 | MoveAhead(); 197 | MoveAhead(); 198 | MoveAhead(); 199 | MoveAhead(); 200 | MoveAhead(); 201 | EatWhitespace(); 202 | } 203 | 204 | if (Peek() == _T('<')) 205 | { 206 | MoveAhead(); 207 | 208 | // Parse tag name 209 | EatWhitespace(); 210 | size_t start = _pos; 211 | if (Peek() == '/') 212 | MoveAhead(); 213 | while (!EndOfText() && !IsWhiteSpace(Peek()) && 214 | (Peek() != '/') && (Peek() != '>')) 215 | MoveAhead(); 216 | tag = _html.substr(start, _pos - start); 217 | std::transform(tag.begin(), tag.end(), tag.begin(), toclower); 218 | 219 | // Parse rest of tag 220 | while (!EndOfText() && (Peek() != '>')) 221 | { 222 | if ((Peek() == '\"') || (Peek() == '\'')) 223 | EatQuotedValue(); 224 | else 225 | { 226 | if (Peek() == '/') 227 | selfClosing = true; 228 | MoveAhead(); 229 | } 230 | } 231 | 232 | MoveAhead(); 233 | } 234 | return tag; 235 | } 236 | 237 | void CHtmlToText::EatInnerContent(const std::string& tag) 238 | { 239 | bool selfClosing = false; 240 | const std::string endTag = "/" + tag; 241 | 242 | while (!EndOfText()) 243 | { 244 | if (Peek() == '<') 245 | { 246 | // Consume a tag 247 | if (ParseTag(selfClosing).compare(endTag) == 0) 248 | return; 249 | // Use recursion to consume nested tags 250 | if (!selfClosing && (tag[0] != '/')) 251 | EatInnerContent(tag); 252 | } 253 | else 254 | MoveAhead(); 255 | } 256 | } 257 | -------------------------------------------------------------------------------- /HtmlToText.h: 
-------------------------------------------------------------------------------- 1 | /* Copyright (C) 2022-2025 Stefan-Mihai MOGA 2 | This file is part of WebSearchEngine application developed by Stefan-Mihai MOGA. 3 | 4 | WebSearchEngine is free software: you can redistribute it and/or modify it 5 | under the terms of the GNU General Public License as published by the Open 6 | Source Initiative, either version 3 of the License, or any later version. 7 | 8 | WebSearchEngine is distributed in the hope that it will be useful, but 9 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 10 | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 11 | 12 | You should have received a copy of the GNU General Public License along with 13 | WebSearchEngine. If not, see */ 14 | 15 | #pragma once 16 | 17 | class CHtmlToText 18 | { 19 | public: 20 | CHtmlToText(); 21 | ~CHtmlToText(); 22 | 23 | public: 24 | const std::string& Convert(const std::string& html); 25 | std::string ParseTag(bool& selfClosing); 26 | void EatInnerContent(const std::string& tag); 27 | 28 | bool EndOfText() { return (_pos >= _html.length()); }; 29 | 30 | char Peek() { return (_pos < _html.length()) ? _html[_pos] : (char)0; } 31 | 32 | void MoveAhead() { _pos = ((_pos + 1 < _html.length()) ? 
(_pos + 1) : _html.length()); } 33 | 34 | bool IsWhiteSpace(char ch) 35 | { 36 | if ((ch == _T(' ')) || (ch == _T('\t')) || (ch == _T('\r')) || (ch == _T('\n'))) 37 | return true; 38 | return false; 39 | } 40 | 41 | void EatWhitespace() 42 | { 43 | while (IsWhiteSpace(Peek())) 44 | MoveAhead(); 45 | } 46 | 47 | void EatWhitespaceToNextLine() 48 | { 49 | while (IsWhiteSpace(Peek())) 50 | { 51 | char ch = Peek(); 52 | MoveAhead(); 53 | if (ch == _T('\n')) 54 | break; 55 | } 56 | } 57 | 58 | void EatQuotedValue() 59 | { 60 | char mark = Peek(); 61 | if ((mark == _T('\"')) || (mark == _T('\''))) 62 | { 63 | // Opening quote 64 | MoveAhead(); 65 | // Find end of value 66 | while (!EndOfText()) 67 | { 68 | char ch = Peek(); 69 | MoveAhead(); 70 | if ((ch == mark) || (ch == _T('\r')) || (ch == _T('\n'))) 71 | break; 72 | } 73 | } 74 | } 75 | 76 | protected: 77 | std::string _text; 78 | std::string _html; 79 | size_t _pos; 80 | bool _preformatted; 81 | 82 | CMapStringToString _tags; 83 | CStringList _ignoreTags; 84 | }; 85 | -------------------------------------------------------------------------------- /OldImplementation.py: -------------------------------------------------------------------------------- 1 | # This file is part of WebSearchEngine application developed by Stefan-Mihai MOGA. 2 | # 3 | # WebSearchEngine is free software: you can redistribute it and/or modify it 4 | # under the terms of the GNU General Public License as published by the Open 5 | # Source Initiative, either version 3 of the License, or any later version. 6 | # 7 | # WebSearchEngine is distributed in the hope that it will be useful, but 8 | # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 9 | # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 10 | # 11 | # You should have received a copy of the GNU General Public License along with 12 | # WebSearchEngine. 
def create_database():
    """Drop and recreate the search schema.

    Recreates the `webpage`, `keyword` and `occurrence` tables, the unique
    index on `keyword`.`name`, and the SQL functions `no_of_words`,
    `no_of_pages`, `total_pages` and `data_mining`.

    Returns:
        True when every statement was executed, False when the MySQL
        connector reported an error (the error is printed).
    """
    statements = (
        "DROP TABLE IF EXISTS `occurrence`",
        "DROP TABLE IF EXISTS `keyword`",
        "DROP TABLE IF EXISTS `webpage`",
        "CREATE TABLE `webpage` (`webpage_id` BIGINT NOT NULL AUTO_INCREMENT, "
        "`url` VARCHAR(256) NOT NULL, `title` VARCHAR(256) NOT NULL, "
        "`content` TEXT NOT NULL, PRIMARY KEY(`webpage_id`)) ENGINE=InnoDB",
        "CREATE TABLE `keyword` (`keyword_id` BIGINT NOT NULL AUTO_INCREMENT, "
        "`name` VARCHAR(256) NOT NULL, PRIMARY KEY(`keyword_id`)) ENGINE=InnoDB",
        "CREATE TABLE `occurrence` (`webpage_id` BIGINT NOT NULL, "
        "`keyword_id` BIGINT NOT NULL, `counter` BIGINT NOT NULL, "
        "`pagerank` REAL NOT NULL, PRIMARY KEY(`webpage_id`, `keyword_id`), "
        "FOREIGN KEY webpage_fk(webpage_id) REFERENCES webpage(webpage_id), "
        "FOREIGN KEY keyword_fk(keyword_id) REFERENCES keyword(keyword_id)) ENGINE=InnoDB",
        "CREATE OR REPLACE UNIQUE INDEX index_name ON `keyword`(`name`)",
        "CREATE OR REPLACE FUNCTION no_of_words(token VARCHAR(256)) RETURNS "
        "REAL READS SQL DATA RETURN (SELECT MAX(`counter`) FROM `occurrence` "
        "INNER JOIN `keyword` USING(`keyword_id`) WHERE `name` = token)",
        "CREATE OR REPLACE FUNCTION no_of_pages(token VARCHAR(256)) RETURNS "
        "REAL READS SQL DATA RETURN (SELECT COUNT(`webpage_id`) FROM `occurrence` "
        "INNER JOIN `keyword` USING(`keyword_id`) WHERE `name` = token)",
        "CREATE OR REPLACE FUNCTION total_pages() RETURNS REAL READS SQL DATA "
        "RETURN (SELECT COUNT(`webpage_id`) FROM `webpage`)",
        "CREATE OR REPLACE FUNCTION data_mining(webpage_no BIGINT, token VARCHAR(256)) "
        "RETURNS REAL READS SQL DATA RETURN (SELECT SUM(`counter`)/no_of_words(token)*"
        "LOG((1+total_pages())/no_of_pages(token)) FROM `occurrence` INNER JOIN `keyword` "
        "USING(`keyword_id`) WHERE `name` = token AND `webpage_id` = webpage_no)",
    )
    # Bound up front so the finally clause is safe when connect() itself fails.
    connection = None
    cursor = None
    try:
        connection = mysql.connector.connect(host=HOSTNAME, database=DATABASE, user=USERNAME, password=PASSWORD,
                                             autocommit=True)
        server_info = connection.get_server_info()
        print("MySQL connection is open on", server_info)
        cursor = connection.cursor()
        # Order matters: `occurrence` references the other two tables, so it
        # is dropped first and created last.
        for statement in statements:
            cursor.execute(statement)
    except mysql.connector.Error as err:
        print("MySQL connector error:", str(err))
        return False
    finally:
        if connection is not None and connection.is_connected():
            if cursor is not None:
                cursor.close()
            connection.close()
            print("MySQL connection is now closed")
    return True
def add_url_to_frontier(url):
    """Queue `url` for crawling, or raise its score when already queued.

    The fragment part ("#...") is removed first. URLs whose *path* ends in
    one of the known non-HTML file extensions are ignored, as are URLs that
    were already visited. The extension test is case-insensitive and looks
    at the path only, so "https://example.com" is no longer mistaken for an
    MS-DOS ".com" file and "report.pdf?download=1" is still recognised.

    Args:
        url: absolute URL of a discovered hyperlink.
    """
    from urllib.parse import urlparse  # local import: keeps this block self-contained

    skipped_extensions = (
        '.3g2', '.3gp', '.7z', '.ai', '.apk', '.arj', '.aif', '.avi', '.bat', '.bin',
        '.bmp', '.cda', '.com', '.csv', '.dat', '.db', '.dbf', '.deb', '.dmg', '.doc',
        '.docx', '.email', '.eml', '.emlx', '.exe', '.flv', '.fon', '.fnt', '.gadget',
        '.gif', '.h264', '.ico', '.iso', '.jar', '.jpg', '.jpeg', '.log', '.m4v', '.mdb',
        '.mid', '.midi', '.mov', '.mp3', '.mpa', '.mp4', '.mpg', '.mpeg', '.msg', '.msi',
        '.odt', '.ods', '.oft', '.ogg', '.ost', '.otf', '.pkg', '.pdf', '.png', '.ppt',
        '.pptx', '.ps', '.psd', '.pst', '.rar', '.rpm', '.rtf', '.sql', '.svg', '.swf',
        '.xls', '.xlsx', '.toast', '.tar', '.tar.gz', '.tex', '.ttf', '.txt', '.tif',
        '.tiff', '.vcd', '.vcf', '.vob', '.xml', '.wav', '.wma', '.wmv', '.wpd', '.wpl',
        '.wsf', '.z', '.zip',
    )
    if url.find('#') > 0:
        url = url.split('#')[0]
    try:
        path = urlparse(url).path
    except ValueError:
        # A URL that cannot even be parsed cannot be downloaded either.
        return
    if path.lower().endswith(skipped_extensions):
        return
    if url in visited_urls:
        return
    # frontier_score has exactly the URLs of frontier_array as keys, so the
    # dictionary gives the same answer as scanning the list.
    if url in frontier_score:
        frontier_score[url] += 1
    else:
        frontier_array.append(url)
        frontier_score[url] = 1
def extract_url_from_frontier():
    """Pop the queued URL with the highest score.

    Ties go to the URL that was queued first. The chosen URL is removed from
    the frontier list and the score table and is recorded as visited.

    Returns:
        The selected URL, or None when nothing is queued.
    """
    best_url, best_score = None, 0
    for candidate in frontier_array:
        candidate_score = frontier_score.get(candidate)
        if candidate_score > best_score:
            best_url, best_score = candidate, candidate_score
    if not best_url:
        return best_url
    frontier_array.remove(best_url)
    del frontier_score[best_url]
    visited_urls.append(best_url)
    return best_url
def download_page_from_url(url, timeout=30):
    """Download `url`, extract its title and text, and queue its hyperlinks.

    Args:
        url: address of the page to fetch.
        timeout: seconds to wait on the network before giving up.

    Returns:
        A `(html_title, plain_text)` tuple. Either element is None when it
        could not be obtained; a page without a <title> yields (None, None)
        and its links are not queued.

    Failures are printed and never raised to the caller, as before. Unlike
    the old `return` inside `finally`, KeyboardInterrupt and SystemExit are
    no longer swallowed.
    """
    html_title = None
    plain_text = None
    try:
        # The context manager closes the HTTP response once it is parsed.
        with urlopen(Request(url), timeout=timeout) as html_page:
            soup = BeautifulSoup(html_page, "html.parser")
        if soup.title is None:
            return None, None
        html_title = soup.title.get_text().strip()
        # Collapse every run of whitespace into a single space.
        plain_text = " ".join(soup.get_text().split())
        for anchor in soup.find_all('a'):
            add_url_to_frontier(urljoin(url, anchor.get('href')))
    except Exception as err:
        # Best effort: one broken page must not stop the crawl. URLError also
        # covers HTTPError and ContentTooShortError, which derive from it.
        print(str(err))
    return html_title, plain_text


def web_search_engine():
    """Crawl until the frontier is empty, indexing every page with a title.

    Each downloaded page is stored through `analyze_webpage`. Whenever the
    number of stored pages reaches a multiple of 1000 the connection is
    closed and `data_mining()` recomputes the page ranks.
    """
    # Bound up front so the finally clause is safe when connect() itself fails.
    connection = None
    try:
        connection = mysql.connector.connect(host=HOSTNAME, database=DATABASE, user=USERNAME, password=PASSWORD,
                                             autocommit=True)
        server_info = connection.get_server_info()
        print("MySQL connection is open on", server_info)
        while True:
            url = extract_url_from_frontier()
            if not url:
                break
            print("Crawling %s... [%d]" % (url, webpage_count + 1))
            html_title, plain_text = download_page_from_url(url)
            if not (html_title and plain_text):
                continue
            stored_before = webpage_count
            connection = analyze_webpage(connection, url, html_title, plain_text)
            # Only a page that was really stored may trigger data mining;
            # otherwise a failed insert right after a multiple of 1000 would
            # run the expensive update again.
            if (webpage_count != stored_before) and ((webpage_count % 1000) == 0):
                if connection.is_connected():
                    connection.close()
                    print("MySQL connection is now closed")
                data_mining()
    except mysql.connector.Error as err:
        print("MySQL connector error:", str(err))
    finally:
        if connection is not None and connection.is_connected():
            connection.close()
            print("MySQL connection is now closed")
def analyze_webpage(connection, url, html_title, plain_text):
    """Store one crawled page and index its keywords.

    Args:
        connection: MySQL connection; re-opened here when it was closed.
        url: address of the page.
        html_title: page title.
        plain_text: whitespace-normalised page text.

    Returns:
        The connection to keep using (it may be a new object after a
        reconnect).

    The page is inserted with a parameterised statement, so quotes or
    backslashes in the URL, title or text can no longer break or alter the
    SQL. MySQL connector errors are printed, not raised.
    """
    global webpage_count
    while not connection.is_connected():
        try:
            time.sleep(30)
            connection = mysql.connector.connect(host=HOSTNAME, database=DATABASE, user=USERNAME, password=PASSWORD,
                                                 autocommit=True)
            server_info = connection.get_server_info()
            print("MySQL connection is open on", server_info)
        except mysql.connector.Error as err:
            print("MySQL connector error:", str(err))
    cursor = None
    try:
        cursor = connection.cursor()
        # The apostrophe replacement is kept so stored values stay identical
        # to what earlier versions wrote.
        cursor.execute(
            "INSERT INTO `webpage` (`url`, `title`, `content`) VALUES (%s, %s, %s)",
            (url, html_title.replace("'", "\""), plain_text.replace("'", "\"")))
        if cursor.rowcount == 0:
            return connection
        cursor.execute("SET @last_webpage_id = LAST_INSERT_ID()")
        webpage_count = webpage_count + 1
    except mysql.connector.Error as err:
        print("MySQL connector error:", str(err))
        return connection
    finally:
        if cursor is not None:
            try:
                cursor.close()
            except mysql.connector.Error:
                # Closing a cursor on a dropped connection is not worth reporting.
                pass
    return analyze_keyword(connection, plain_text)
def analyze_keyword(connection, plain_text):
    """Count the keywords of the page stored last and record them.

    Tokens that are ASCII and alphanumeric are lower-cased and counted.
    Keywords seen for the first time are inserted into `keyword`; for every
    keyword an `occurrence` row is written for the page whose id equals the
    current `webpage_count`.

    Args:
        connection: MySQL connection; re-opened here when it was closed.
        plain_text: text of the page.

    Returns:
        The connection to keep using (it may be a new object after a
        reconnect). MySQL connector errors are printed, not raised.
    """
    # Set copy of the known keywords: O(1) membership per token instead of a
    # list scan. The module-level list stays the source of truth.
    known_keywords = set(keyword_array)
    new_keyword = {}
    old_keyword = {}
    for keyword in tokenize(plain_text):
        if not (keyword.isascii() and keyword.isalnum()):
            continue
        keyword = keyword.lower()
        if keyword in new_keyword:
            new_keyword[keyword] += 1
        elif keyword in known_keywords:
            old_keyword[keyword] = old_keyword.get(keyword, 0) + 1
        else:
            keyword_array.append(keyword)
            known_keywords.add(keyword)
            new_keyword[keyword] = 1

    def reconnect(current):
        # Wait and reconnect for as long as the connection is down.
        while not current.is_connected():
            time.sleep(30)
            current = mysql.connector.connect(host=HOSTNAME, database=DATABASE, user=USERNAME,
                                              password=PASSWORD, autocommit=True)
            server_info = current.get_server_info()
            print("MySQL connection is open on", server_info)
        return current

    occurrence_sql = "INSERT INTO `occurrence` (`webpage_id`, `keyword_id`, `counter`, `pagerank`) " \
                     "VALUES (@last_webpage_id, @last_keyword_id, %s, 0.0)"
    try:
        for keyword, counter in new_keyword.items():
            connection = reconnect(connection)
            cursor = connection.cursor()
            try:
                # Session variables are lost on reconnect, so set them each time.
                cursor.execute("SET @last_webpage_id = %s", (webpage_count,))
                cursor.execute("INSERT INTO `keyword` (`name`) VALUES (%s)", (keyword,))
                if cursor.rowcount == 0:
                    keyword_array.remove(keyword)
                    continue
                cursor.execute("SET @last_keyword_id = LAST_INSERT_ID()")
                cursor.execute(occurrence_sql, (counter,))
            finally:
                cursor.close()
        for keyword, counter in old_keyword.items():
            connection = reconnect(connection)
            cursor = connection.cursor()
            try:
                cursor.execute("SET @last_webpage_id = %s", (webpage_count,))
                cursor.execute(
                    "SET @last_keyword_id = (SELECT `keyword_id` FROM `keyword` WHERE `name` = %s)",
                    (keyword,))
                cursor.execute(occurrence_sql, (counter,))
            finally:
                cursor.close()
    except mysql.connector.Error as err:
        print("MySQL connector error:", str(err))
    return connection
def data_mining():
    """Recompute `pagerank` for every keyword in the database.

    Reads all rows of `keyword` and, per keyword, runs an UPDATE that stores
    the result of the SQL function `data_mining(webpage_id, name)` in
    `occurrence`.`pagerank`.

    A lost connection is re-opened (after a 30 second pause) and the keyword
    is retried. A statement that keeps failing on a live connection is
    skipped after three attempts, so one bad keyword cannot loop forever.
    MySQL connector errors are printed, not raised.
    """
    max_statement_failures = 3
    records = None
    connection = None
    try:
        connection = mysql.connector.connect(host=HOSTNAME, database=DATABASE, user=USERNAME, password=PASSWORD,
                                             autocommit=True)
        server_info = connection.get_server_info()
        print("MySQL connection is open on", server_info)
        cursor = connection.cursor()
        cursor.execute("SELECT * FROM `keyword` ORDER BY `keyword_id`")
        # get all records
        records = cursor.fetchall()
        print("Total number of rows in table:", cursor.rowcount)
        cursor.close()
    except mysql.connector.Error as err:
        print("MySQL connector error:", str(err))
    if connection is None:
        # Nothing to work with: the initial connect failed.
        return
    update_sql = "UPDATE `occurrence` INNER JOIN `keyword` USING(`keyword_id`)" \
                 "SET `pagerank` = data_mining(`webpage_id`, `name`) WHERE `name` = %s"
    rowcount = len(records) if records is not None else 0
    for position, row in enumerate(records or [], start=1):
        keyword = row[1]
        failures = 0
        while True:
            data_update = None
            try:
                if not connection.is_connected():
                    time.sleep(30)
                    connection = mysql.connector.connect(host=HOSTNAME, database=DATABASE, user=USERNAME,
                                                         password=PASSWORD, autocommit=True)
                    server_info = connection.get_server_info()
                    print("MySQL connection is open on", server_info)
                data_update = connection.cursor()
                print("Applying data mining for '%s'... [%d/%d]" % (keyword, position, rowcount))
                data_update.execute(update_sql, (keyword,))
                break
            except mysql.connector.Error as err:
                print("MySQL connector error:", str(err))
                # Connection problems are retried without limit; statement
                # errors on a live connection are bounded.
                if connection.is_connected():
                    failures += 1
                    if failures >= max_statement_failures:
                        break
            finally:
                if data_update is not None:
                    try:
                        data_update.close()
                    except mysql.connector.Error:
                        pass
    try:
        if connection.is_connected():
            connection.close()
            print("MySQL connection is now closed")
    except mysql.connector.Error as err:
        print("MySQL connector error:", str(err))
# Script entry point: seed the frontier, rebuild the schema, then crawl.
# The guard keeps a plain `import` of this module from dropping the tables
# and starting a crawl as a side effect.
if __name__ == "__main__":
    add_url_to_frontier('https://en.wikipedia.org/')
    if create_database():
        web_search_engine()
Internet content that is not capable of being searched by a web search engine is generally described as the deep web. 4 | 5 | A search engine maintains the following processes in near real time: 6 | 1. Web crawling 7 | 2. Indexing 8 | 3. Searching 9 | 10 | Web search engines get their information by web crawling from site to site. The "spider" checks for the standard filename robots.txt, addressed to it. The robots.txt file contains directives for search spiders, telling it which pages to crawl and which pages not to crawl. After checking for robots.txt and either finding it or not, the spider sends certain information back to be indexed depending on many factors, such as the titles, page content, JavaScript, Cascading Style Sheets (CSS), headings, or its metadata in HTML meta tags. After a certain number of pages crawled, amount of data indexed, or time spent on the website, the spider stops crawling and moves on. "[N]o web crawler may actually crawl the entire reachable web. Due to infinite websites, spider traps, spam, and other exigencies of the real web, crawlers instead apply a crawl policy to determine when the crawling of a site should be deemed sufficient. Some websites are crawled exhaustively, while others are crawled only partially". 11 | 12 | Indexing means associating words and other definable tokens found on web pages to their domain names and HTML-based fields. The associations are made in a public database, made available for web search queries. A query from a user can be a single word, multiple words or a sentence. The index helps find information relating to the query as quickly as possible. Some of the techniques for indexing, and caching are trade secrets, whereas web crawling is a straightforward process of visiting all sites on a systematic basis. 13 | 14 | Between visits by the spider, the cached version of the page (some or all the content needed to render it) stored in the search engine working memory is quickly sent to an inquirer. 
If a visit is overdue, the search engine can just act as a web proxy instead. In this case, the page may differ from the search terms indexed. The cached page holds the appearance of the version whose words were previously indexed, so a cached version of a page can be useful to the website when the actual page has been lost, but this problem is also considered a mild form of linkrot. 15 | 16 | Typically when a user enters a query into a search engine it is a few keywords. The index already has the names of the sites containing the keywords, and these are instantly obtained from the index. The real processing load is in generating the web pages that are the search results list: Every page in the entire list must be weighted according to information in the indexes. Then the top search result item requires the lookup, reconstruction, and markup of the snippets showing the context of the keywords matched. These are only part of the processing each search results web page requires, and further pages (next to the top) require more of this post-processing. 17 | 18 | Beyond simple keyword lookups, search engines offer their own GUI- or command-driven operators and search parameters to refine the search results. These provide the necessary controls for the user engaged in the feedback loop users create by filtering and weighting while refining the search results, given the initial pages of the first search results. For example, from 2007 the Google.com search engine has allowed one to filter by date by clicking "Show search tools" in the leftmost column of the initial search results page, and then selecting the desired date range. It's also possible to weight by date because each page has a modification time. Most search engines support the use of the boolean operators AND, OR and NOT to help end users refine the search query. Boolean operators are for literal searches that allow the user to refine and extend the terms of the search. 
The engine looks for the words or phrases exactly as entered. Some search engines provide an advanced feature called proximity search, which allows users to define the distance between keywords. There is also concept-based searching where the research involves using statistical analysis on pages containing the words or phrases you search for. 19 | 20 | The usefulness of a search engine depends on the relevance of the result set it gives back. While there may be millions of web pages that include a particular word or phrase, some pages may be more relevant, popular, or authoritative than others. Most search engines employ methods to rank the results to provide the "best" results first. How a search engine decides which pages are the best matches, and what order the results should be shown in, varies widely from one engine to another. The methods also change over time as Internet usage changes and new techniques evolve. There are two main types of search engine that have evolved: one is a system of predefined and hierarchically ordered keywords that humans have programmed extensively. The other is a system that generates an "inverted index" by analyzing texts it locates. This second form relies much more heavily on the computer itself to do the bulk of the work. 21 | 22 | Most Web search engines are commercial ventures supported by advertising revenue and thus some of them allow advertisers to have their listings ranked higher in search results for a fee. Search engines that do not accept money for their search results make money by running search-related ads alongside the regular search engine results. The search engines make money every time someone clicks on one of these ads.
23 | 24 | # Setup of this application 25 | 26 | - Install [MySQL ODBC connector](https://dev.mysql.com/downloads/connector/odbc/); 27 | - Choose a MySQL hosting service and create the MySQL database; 28 | - Configure ODBC connection on application start-up: 29 | ![WebSearchEngine-MySQL.png](WebSearchEngine-MySQL.png). 30 | -------------------------------------------------------------------------------- /ReadMe.txt: -------------------------------------------------------------------------------- 1 | ================================================================================ 2 | MICROSOFT FOUNDATION CLASS LIBRARY : WebSearchEngine Project Overview 3 | =============================================================================== 4 | 5 | The application wizard has created this WebSearchEngine application for 6 | you. This application not only demonstrates the basics of using the Microsoft 7 | Foundation Classes but is also a starting point for writing your application. 8 | 9 | This file contains a summary of what you will find in each of the files that 10 | make up your WebSearchEngine application. 11 | 12 | WebSearchEngine.vcxproj 13 | This is the main project file for VC++ projects generated using an application wizard. 14 | It contains information about the version of Visual C++ that generated the file, and 15 | information about the platforms, configurations, and project features selected with the 16 | application wizard. 17 | 18 | WebSearchEngine.vcxproj.filters 19 | This is the filters file for VC++ projects generated using an Application Wizard. 20 | It contains information about the association between the files in your project 21 | and the filters. This association is used in the IDE to show grouping of files with 22 | similar extensions under a specific node (e.g. ".cpp" files are associated with the 23 | "Source Files" filter). 24 | 25 | WebSearchEngine.h 26 | This is the main header file for the application.
It includes other 27 | project specific headers (including Resource.h) and declares the 28 | CWebSearchEngineApp application class. 29 | 30 | WebSearchEngine.cpp 31 | This is the main application source file that contains the application 32 | class CWebSearchEngineApp. 33 | 34 | WebSearchEngine.rc 35 | This is a listing of all of the Microsoft Windows resources that the 36 | program uses. It includes the icons, bitmaps, and cursors that are stored 37 | in the RES subdirectory. This file can be directly edited in Microsoft 38 | Visual C++. Your project resources are in 1033. 39 | 40 | res\WebSearchEngine.ico 41 | This is an icon file, which is used as the application's icon. This 42 | icon is included by the main resource file WebSearchEngine.rc. 43 | 44 | res\WebSearchEngine.rc2 45 | This file contains resources that are not edited by Microsoft 46 | Visual C++. You should place all resources not editable by 47 | the resource editor in this file. 48 | 49 | 50 | ///////////////////////////////////////////////////////////////////////////// 51 | 52 | The application wizard creates one dialog class: 53 | 54 | WebSearchEngineDlg.h, WebSearchEngineDlg.cpp - the dialog 55 | These files contain your CWebSearchEngineDlg class. This class defines 56 | the behavior of your application's main dialog. The dialog's template is 57 | in WebSearchEngine.rc, which can be edited in Microsoft Visual C++. 58 | 59 | ///////////////////////////////////////////////////////////////////////////// 60 | 61 | Help Support: 62 | 63 | hlp\WebSearchEngine.hhp 64 | This file is a help project file. It contains the data needed to 65 | compile the help files into a .chm file. 66 | 67 | hlp\WebSearchEngine.hhc 68 | This file lists the contents of the help project. 69 | 70 | hlp\WebSearchEngine.hhk 71 | This file contains an index of the help topics. 72 | 73 | hlp\afxcore.htm 74 | This file contains the standard help topics for standard MFC 75 | commands and screen objects. 
Add your own help topics to this file. 76 | 77 | makehtmlhelp.bat 78 | This file is used by the build system to compile the help files. 79 | 80 | hlp\Images\*.gif 81 | These are bitmap files required by the standard help file topics for 82 | Microsoft Foundation Class Library standard commands. 83 | 84 | 85 | ///////////////////////////////////////////////////////////////////////////// 86 | 87 | Other Features: 88 | 89 | ActiveX Controls 90 | The application includes support to use ActiveX controls. 91 | 92 | Windows Sockets 93 | The application has support for establishing communications over TCP/IP networks. 94 | 95 | ///////////////////////////////////////////////////////////////////////////// 96 | 97 | Other standard files: 98 | 99 | StdAfx.h, StdAfx.cpp 100 | These files are used to build a precompiled header (PCH) file 101 | named WebSearchEngine.pch and a precompiled types file named StdAfx.obj. 102 | 103 | Resource.h 104 | This is the standard header file, which defines new resource IDs. 105 | Microsoft Visual C++ reads and updates this file. 106 | 107 | WebSearchEngine.manifest 108 | Application manifest files are used by Windows XP to describe an application's 109 | dependency on specific versions of Side-by-Side assemblies. The loader uses this 110 | information to load the appropriate assembly from the assembly cache or private 111 | from the application. The Application manifest may be included for redistribution 112 | as an external .manifest file that is installed in the same folder as the application 113 | executable or it may be included in the executable in the form of a resource. 114 | ///////////////////////////////////////////////////////////////////////////// 115 | 116 | Other notes: 117 | 118 | The application wizard uses "TODO:" to indicate parts of the source code you 119 | should add to or customize. 120 | 121 | If your application uses MFC in a shared DLL, you will need 122 | to redistribute the MFC DLLs.
If your application is in a language 123 | other than the operating system's locale, you will also have to 124 | redistribute the corresponding localized resources MFC100XXX.DLL. 125 | For more information on both of these topics, please see the section on 126 | redistributing Visual C++ applications in MSDN documentation. 127 | 128 | ///////////////////////////////////////////////////////////////////////////// 129 | -------------------------------------------------------------------------------- /ReleaseNotes.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Web Search Engine - Release Notes 6 | 7 | 8 | 9 | 10 | 11 | 12 | 36 | 37 |
38 |

Release Notes

39 |

Design and implementation of a Web Search Engine using Text Mining techniques.

40 |

Homepage: https://www.text-mining.ro/

41 |

GitHub repo: https://github.com/mihaimoga/WebSearchEngine

42 |

Article: https://www.codeproject.com/Articles/5319612/Web-Search-Engine

43 |

Social Media

44 |

Please feel free to contact me if you need any further information.

45 | 51 | 52 |

History

53 |
    54 |
  • December 9th, 2021: Initial version.
  • 55 |
  • December 20th, 2021: Added works cited.
  • 56 |
  • January 14th, 2022: Added index.html and search.php.
  • 57 |
  • December 23rd, 2022: Moved source code from GitLab to GitHub.
  • 58 |
  • March 23rd, 2023: Replaced NULL throughout the codebase with nullptr.
    59 | This means that the minimum requirement for the application is now VC 2010.
  • 60 |
  • April 16th, 2023 - Updated MFC application to work with the latest MySQL ODBC 8.0 Unicode Driver.
  • 61 |
  • May 27th, 2023 - Added Python version of web crawler to Web Search Engine repository.
  • 62 |
  • June 22nd, 2023 - Updated PJ Naughter's ODBCWrappers library to the latest version available.
  • 63 |
  • July 28th, 2023: 64 |
      65 |
    • Replaced old CHyperlinkStatic class with PJ Naughter's CHLinkCtrl library;
    • 66 |
    • Updated the Python script (frontier is now persistent in database) and the PHP search script.
    • 67 |
    68 |
  • 69 |
  • October 22nd, 2023: 70 |
      71 |
    • Switched to Visual Studio Enterprise 2022 (some changes were made in the source code);
    • 72 |
    • Added social media links: Twitter, LinkedIn, Facebook, and Instagram;
    • 73 |
    • Added shortcuts to GitHub repository's Issues, Discussions, and Wiki.
    • 74 |
    75 |
  • 76 |
  • January 1st, 2024: Updated PJ Naughter's ODBCWrappers library to the latest version available.
    Updated module to remove usage of _if_exists by now using ODBCVER and _ATL_MODULES preprocessor macro checks along with SFINAE.
  • 77 |
  • Same version (January 21st, 2024) - Added ReleaseNotes.html and SoftwareContextRegister.html to GitHub repo.
  • 78 |
79 |
80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /Resource.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mihaimoga/WebSearchEngine/29584b55b0e53774543e62e15eee3579498d1cb5/Resource.h -------------------------------------------------------------------------------- /SoftwareContextRegister.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Web Search Engine - Software Content Register 6 | 7 | 8 | 9 | 10 | 11 | 12 | 36 | 37 |
38 |

Software Content Register

39 |

Release Name: Web Search Engine
40 | Description: Design and implementation of a Web Search Engine using Text Mining techniques.
41 | Outgoing License: GNU General Public License v3.0
42 | Type of content: static/dynamic libraries, source code, binary
43 | Release location: https://www.codeproject.com/Articles/5319612/Web-Search-Engine

44 | 45 |

CHLinkCtrl
46 | Description: This class allows you to create a static control on a window or dialog which behaves similarly to a hyperlink as seen on a web page. When the mouse moves over the text the cursor becomes a hand and when you click on the text the hyperlink is executed.
47 | Version: 1.43
48 | Home Page: https://www.naughter.com/hlinkctrl.html
49 | License: Custom (PJ Naughter's license)
50 | Format: source code, binary

51 | 52 |

ODBCWrappers
53 | Description: The classes provided are: CODBC::CHandle, CODBC::CEnvironment, CODBC::CConnection, CODBC::CStatement & CODBC::CDescriptor.
54 | CHandle provides a class based encapsulation of a SQLHANDLE handle and the various ODBC v3 APIs which work on any SQLHANDLE type handle.
55 | CEnvironment derives from CHandle and provides encapsulation of an ODBC environment handle. This class allows configuration of ODBC for initial usage and creation of connections to a database.
56 | CConnection derives from CHandle and provides encapsulation of a logical connection to a database.
57 | CStatement derives from CHandle and provides encapsulation of an ODBC "statement" which allows execution of commands against an ODBC connection.
58 | CDescriptor derives from CHandle and provides encapsulation of an ODBC "descriptor".

59 | Version: 1.22
60 | Home Page: https://www.naughter.com/odbcwrappers.html
61 | License: Custom (PJ Naughter's license)
62 | Format: source code, binary

63 | 64 |

CVersionInfo
65 | Description: This simple little class encapsulates the SDK calls which access version info from Win32 files. Most executables and DLLs have a VS_VERSION_INFO resource associated with them and this class provides a simple C++ class to programmatically access this information.
66 | Version: 1.13
67 | Home Page: https://www.naughter.com/versioninfo.html
68 | License: Custom (PJ Naughter's license)
69 | Format: source code, binary

70 | 71 |

PJ Naughter's license: You are allowed to include the source code in any product (commercial, shareware, freeware or otherwise) when your product is released in binary form. You are allowed to modify the source code in any way you want except you cannot modify the copyright details at the top of each module. If you want to distribute source code with your application, then you are only allowed to distribute versions released by the author. This is to maintain a single distribution point for the source code.

72 |
73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /UnquoteHTML.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2022-2025 Stefan-Mihai MOGA 2 | This file is part of WebSearchEngine application developed by Stefan-Mihai MOGA. 3 | 4 | WebSearchEngine is free software: you can redistribute it and/or modify it 5 | under the terms of the GNU General Public License as published by the Open 6 | Source Initiative, either version 3 of the License, or any later version. 7 | 8 | WebSearchEngine is distributed in the hope that it will be useful, but 9 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 10 | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 11 | 12 | You should have received a copy of the GNU General Public License along with 13 | WebSearchEngine. If not, see */ 14 | 15 | #include "stdafx.h" 16 | 17 | // https://brajeshwar.github.io/entities/ 18 | 19 | typedef struct 20 | { 21 | const char * Name; 22 | unsigned int Value; 23 | } EntityNameEntry; 24 | 25 | static const EntityNameEntry StaticEntityNames[] = 26 | /* list of entity names defined in HTML 4.0 spec */ 27 | { 28 | { "nbsp", /*160*/32 }, 29 | { "iexcl", 161 }, 30 | { "cent", 162 }, 31 | { "pound", 163 }, 32 | { "curren", 164 }, 33 | { "yen", 165 }, 34 | { "brvbar", 166 }, 35 | { "sect", 167 }, 36 | { "uml", 168 }, 37 | { "copy", 169 }, 38 | { "ordf", 170 }, 39 | { "laquo", 171 }, 40 | { "not", 172 }, 41 | { "shy", 173 }, 42 | { "reg", 174 }, 43 | { "macr", 175 }, 44 | { "deg", 176 }, 45 | { "plusmn", 177 }, 46 | { "sup2", 178 }, 47 | { "sup3", 179 }, 48 | { "acute", 180 }, 49 | { "micro", 181 }, 50 | { "para", 182 }, 51 | { "middot", 183 }, 52 | { "cedil", 184 }, 53 | { "sup1", 185 }, 54 | { "ordm", 186 }, 55 | { "raquo", 187 }, 56 | { "frac14", 188 }, 57 | { "frac12", 189 }, 58 | { "frac34", 190 }, 59 | { "iquest", 191 }, 60 | { 
"Agrave", 192 }, 61 | { "Aacute", 193 }, 62 | { "Acirc", 194 }, 63 | { "Atilde", 195 }, 64 | { "Auml", 196 }, 65 | { "Aring", 197 }, 66 | { "AElig", 198 }, 67 | { "Ccedil", 199 }, 68 | { "Egrave", 200 }, 69 | { "Eacute", 201 }, 70 | { "Ecirc", 202 }, 71 | { "Euml", 203 }, 72 | { "Igrave", 204 }, 73 | { "Iacute", 205 }, 74 | { "Icirc", 206 }, 75 | { "Iuml", 207 }, 76 | { "ETH", 208 }, 77 | { "Ntilde", 209 }, 78 | { "Ograve", 210 }, 79 | { "Oacute", 211 }, 80 | { "Ocirc", 212 }, 81 | { "Otilde", 213 }, 82 | { "Ouml", 214 }, 83 | { "times", 215 }, 84 | { "Oslash", 216 }, 85 | { "Ugrave", 217 }, 86 | { "Uacute", 218 }, 87 | { "Ucirc", 219 }, 88 | { "Uuml", 220 }, 89 | { "Yacute", 221 }, 90 | { "THORN", 222 }, 91 | { "szlig", 223 }, 92 | { "agrave", 224 }, 93 | { "aacute", 225 }, 94 | { "acirc", 226 }, 95 | { "atilde", 227 }, 96 | { "auml", 228 }, 97 | { "aring", 229 }, 98 | { "aelig", 230 }, 99 | { "ccedil", 231 }, 100 | { "egrave", 232 }, 101 | { "eacute", 233 }, 102 | { "ecirc", 234 }, 103 | { "euml", 235 }, 104 | { "igrave", 236 }, 105 | { "iacute", 237 }, 106 | { "icirc", 238 }, 107 | { "iuml", 239 }, 108 | { "eth", 240 }, 109 | { "ntilde", 241 }, 110 | { "ograve", 242 }, 111 | { "oacute", 243 }, 112 | { "ocirc", 244 }, 113 | { "otilde", 245 }, 114 | { "ouml", 246 }, 115 | { "divide", 247 }, 116 | { "oslash", 248 }, 117 | { "ugrave", 249 }, 118 | { "uacute", 250 }, 119 | { "ucirc", 251 }, 120 | { "uuml", 252 }, 121 | { "yacute", 253 }, 122 | { "thorn", 254 }, 123 | { "yuml", 255 }, 124 | { "fnof", 402 }, 125 | /* Greek */ 126 | { "Alpha", 913 }, 127 | { "Beta", 914 }, 128 | { "Gamma", 915 }, 129 | { "Delta", 916 }, 130 | { "Epsilon", 917 }, 131 | { "Zeta", 918 }, 132 | { "Eta", 919 }, 133 | { "Theta", 920 }, 134 | { "Iota", 921 }, 135 | { "Kappa", 922 }, 136 | { "Lambda", 923 }, 137 | { "Mu", 924 }, 138 | { "Nu", 925 }, 139 | { "Xi", 926 }, 140 | { "Omicron", 927 }, 141 | { "Pi", 928 }, 142 | { "Rho", 929 }, 143 | { "Sigma", 931 }, 144 | { "Tau", 932 }, 145 | { 
"Upsilon", 933 }, 146 | { "Phi", 934 }, 147 | { "Chi", 935 }, 148 | { "Psi", 936 }, 149 | { "Omega", 937 }, 150 | { "alpha", 945 }, 151 | { "beta", 946 }, 152 | { "gamma", 947 }, 153 | { "delta", 948 }, 154 | { "epsilon", 949 }, 155 | { "zeta", 950 }, 156 | { "eta", 951 }, 157 | { "theta", 952 }, 158 | { "iota", 953 }, 159 | { "kappa", 954 }, 160 | { "lambda", 955 }, 161 | { "mu", 956 }, 162 | { "nu", 957 }, 163 | { "xi", 958 }, 164 | { "omicron", 959 }, 165 | { "pi", 960 }, 166 | { "rho", 961 }, 167 | { "sigmaf", 962 }, 168 | { "sigma", 963 }, 169 | { "tau", 964 }, 170 | { "upsilon", 965 }, 171 | { "phi", 966 }, 172 | { "chi", 967 }, 173 | { "psi", 968 }, 174 | { "omega", 969 }, 175 | { "thetasym", 977 }, 176 | { "upsih", 978 }, 177 | { "piv", 982 }, 178 | /* General Punctuation */ 179 | { "bull", 8226 }, 180 | { "hellip", 8230 }, 181 | { "prime", 8242 }, 182 | { "Prime", 8243 }, 183 | { "oline", 8254 }, 184 | { "frasl", 8260 }, 185 | /* Letterlike Symbols */ 186 | { "weierp", 8472 }, 187 | { "image", 8465 }, 188 | { "real", 8476 }, 189 | { "trade", 8482 }, 190 | { "alefsym", 8501 }, 191 | /* Arrows */ 192 | { "larr", 8592 }, 193 | { "uarr", 8593 }, 194 | { "rarr", 8594 }, 195 | { "darr", 8595 }, 196 | { "harr", 8596 }, 197 | { "crarr", 8629 }, 198 | { "lArr", 8656 }, 199 | { "uArr", 8657 }, 200 | { "rArr", 8658 }, 201 | { "dArr", 8659 }, 202 | { "hArr", 8660 }, 203 | /* Mathematical Operators */ 204 | { "forall", 8704 }, 205 | { "part", 8706 }, 206 | { "exist", 8707 }, 207 | { "empty", 8709 }, 208 | { "nabla", 8711 }, 209 | { "isin", 8712 }, 210 | { "notin", 8713 }, 211 | { "ni", 8715 }, 212 | { "prod", 8719 }, 213 | { "sum", 8721 }, 214 | { "minus", 8722 }, 215 | { "lowast", 8727 }, 216 | { "radic", 8730 }, 217 | { "prop", 8733 }, 218 | { "infin", 8734 }, 219 | { "and", 8743 }, 220 | { "or", 8744 }, 221 | { "cap", 8745 }, 222 | { "cup", 8746 }, 223 | { "int", 8747 }, 224 | { "there4", 8756 }, 225 | { "sim", 8764 }, 226 | { "cong", 8773 }, 227 | { "asymp", 8776 
}, 228 | { "ne", 8800 }, 229 | { "equiv", 8801 }, 230 | { "le", 8804 }, 231 | { "ge", 8805 }, 232 | { "sub", 8834 }, 233 | { "sup", 8835 }, 234 | { "nsub", 8836 }, 235 | { "sube", 8838 }, 236 | { "supe", 8839 }, 237 | { "oplus", 8853 }, 238 | { "otimes", 8855 }, 239 | { "perp", 8869 }, 240 | { "sdot", 8901 }, 241 | /* Miscellaneous Technical */ 242 | { "lceil", 8968 }, 243 | { "rceil", 8969 }, 244 | { "lfloor", 8970 }, 245 | { "rfloor", 8971 }, 246 | { "lang", 9001 }, 247 | { "rang", 9002 }, 248 | /* Geometric Shapes */ 249 | { "loz", 9674 }, 250 | /* Miscellaneous Symbols */ 251 | { "spades", 9824 }, 252 | { "clubs", 9827 }, 253 | { "hearts", 9829 }, 254 | { "diams", 9830 }, 255 | { "quot", 34 }, 256 | { "amp", 38 }, 257 | { "lt", 60 }, 258 | { "gt", 62 }, 259 | /* Latin Extended-A */ 260 | { "OElig", 338 }, 261 | { "oelig", 339 }, 262 | { "Scaron", 352 }, 263 | { "scaron", 353 }, 264 | { "Yuml", 376 }, 265 | /* Spacing Modifier Letters */ 266 | { "circ", 710 }, 267 | { "tilde", 732 }, 268 | /* General Punctuation */ 269 | { "ensp", 8194 }, 270 | { "emsp", 8195 }, 271 | { "thinsp", 8201 }, 272 | { "zwnj", 8204 }, 273 | { "zwj", 8205 }, 274 | { "lrm", 8206 }, 275 | { "rlm", 8207 }, 276 | { "ndash", 8211 }, 277 | { "mdash", 8212 }, 278 | { "lsquo", 8216 }, 279 | { "rsquo", 8217 }, 280 | { "sbquo", 8218 }, 281 | { "ldquo", 8220 }, 282 | { "rdquo", 8221 }, 283 | { "bdquo", 8222 }, 284 | { "dagger", 8224 }, 285 | { "Dagger", 8225 }, 286 | { "permil", 8240 }, 287 | { "lsaquo", 8249 }, 288 | { "rsaquo", 8250 }, 289 | { "euro", 8364 }, 290 | { nullptr, 0 } /* marks end of list */ 291 | } /*StaticEntityNames*/; 292 |

/* entity name -> Unicode code point; filled from StaticEntityNames on first use */
typedef std::map<std::string, unsigned int> EntityNameMap;
typedef std::pair<std::string, unsigned int> EntityNamePair;
static EntityNameMap EntityNames;

/* largest valid Unicode code point */
static const unsigned int MaxCodePoint = 0x10FFFF;

/* writes Ch in UTF-8 encoding (1 to 4 bytes) to Out. Surrogates and values
   above U+10FFFF cannot be encoded and are written as U+FFFD. */
static void WriteUTF8(std::string& Out, unsigned int Ch)
{
    if ((Ch > MaxCodePoint) || ((Ch >= 0xD800) && (Ch <= 0xDFFF)))
    {
        Ch = 0xFFFD;
    } /*if*/
    if (Ch >= 0x10000)
    {
        Out += static_cast<char>(0xF0 | ((Ch >> 18) & 0x07));
        Out += static_cast<char>(0x80 | ((Ch >> 12) & 0x3F));
        Out += static_cast<char>(0x80 | ((Ch >> 6) & 0x3F));
        Out += static_cast<char>(0x80 | (Ch & 0x3F));
    }
    else if (Ch >= 0x800)
    {
        Out += static_cast<char>(0xE0 | ((Ch >> 12) & 0x0F));
        Out += static_cast<char>(0x80 | ((Ch >> 6) & 0x3F));
        Out += static_cast<char>(0x80 | (Ch & 0x3F));
    }
    else if (Ch >= 0x80)
    {
        Out += static_cast<char>(0xC0 | ((Ch >> 6) & 0x1F));
        Out += static_cast<char>(0x80 | (Ch & 0x3F));
    }
    else
    {
        Out += static_cast<char>(Ch);
    } /*if*/
} /*WriteUTF8*/

/* true for the ASCII letters a-z and A-Z */
static bool IsEntityAlpha(unsigned char Ch)
{
    return ((Ch >= 'a') && (Ch <= 'z')) || ((Ch >= 'A') && (Ch <= 'Z'));
} /*IsEntityAlpha*/

/* returns the value of hexadecimal digit Ch, or -1 if Ch is not one */
static int HexDigitValue(unsigned char Ch)
{
    if ((Ch >= '0') && (Ch <= '9'))
        return Ch - '0';
    if ((Ch >= 'a') && (Ch <= 'f'))
        return Ch - 'a' + 10;
    if ((Ch >= 'A') && (Ch <= 'F'))
        return Ch - 'A' + 10;
    return -1;
} /*HexDigitValue*/

/* appends Digit to Value in the given Base; once Value has passed the largest
   code point it is left alone, so arbitrarily long digit runs cannot wrap
   around into a valid-looking value. */
static void AccumulateDigit(unsigned int& Value, unsigned int Base, unsigned int Digit)
{
    if (Value <= MaxCodePoint)
    {
        Value = Value * Base + Digit;
    } /*if*/
} /*AccumulateDigit*/

/* looks Name up in EntityNames (loading it from StaticEntityNames on first
   use); returns true and stores the code point in Value if found. */
static bool LookupEntityName(const std::string& Name, unsigned int& Value)
{
    if (EntityNames.empty())
    {
        for (const EntityNameEntry* ThisEntry = StaticEntityNames; ThisEntry->Name != nullptr; ++ThisEntry)
        {
            EntityNames.insert(EntityNamePair(std::string(ThisEntry->Name), ThisEntry->Value));
        } /*for*/
    } /*if*/
    const EntityNameMap::const_iterator NameEntry = EntityNames.find(Name);
    if (NameEntry == EntityNames.end())
        return false;
    Value = NameEntry->second;
    return true;
} /*LookupEntityName*/

/* copies In to Out, expanding any HTML entity references (&name; &#NNN; &#xHHH;)
   into literal UTF-8 characters. Anything that starts like a reference but is
   not a complete, known one (unknown name, missing ';', no digits, input ends
   early) is copied through unchanged instead of being dropped. */
const std::string UnquoteHTML(const std::string& InBuffer)
{
    enum
    {
        NoMatch,
        MatchBegin,
        MatchName,
        MatchNumber,
        MatchDecimalNumber,
        MatchHexNumber,
    } MatchState = NoMatch;
    std::string Pending;       /* raw text of the reference being matched, starting with '&' */
    std::string MatchingName;  /* entity name collected so far (MatchName only) */
    unsigned int CharCode = 0;
    bool GotHexDigit = false;  /* "&#x;" without digits is not a reference */
    std::string OutBuffer;
    for (size_t index = 0; index < InBuffer.length(); index++)
    {
        const unsigned char ThisCh = InBuffer[index];
        bool ProcessedChar = false; /* ThisCh extends the pending reference */
        bool GotCharCode = false;   /* ThisCh completed the pending reference */
        switch (MatchState)
        {
        case NoMatch:
            break;
        case MatchBegin:
            if (ThisCh == '#')
            {
                MatchState = MatchNumber;
                ProcessedChar = true;
            }
            else if (IsEntityAlpha(ThisCh))
            {
                MatchingName.assign(1, static_cast<char>(ThisCh));
                MatchState = MatchName;
                ProcessedChar = true;
            } /*if*/
            break;
        case MatchName:
            if (IsEntityAlpha(ThisCh) || ((ThisCh >= '0') && (ThisCh <= '9')))
            {
                MatchingName.append(1, static_cast<char>(ThisCh));
                ProcessedChar = true;
            }
            else if ((ThisCh == ';') && LookupEntityName(MatchingName, CharCode))
            {
                ProcessedChar = true;
                GotCharCode = true;
            } /*if*/
            break;
        case MatchNumber:
            if ((ThisCh == 'x') || (ThisCh == 'X'))
            {
                CharCode = 0;
                GotHexDigit = false;
                MatchState = MatchHexNumber;
                ProcessedChar = true;
            }
            else if ((ThisCh >= '0') && (ThisCh <= '9'))
            {
                CharCode = ThisCh - '0';
                MatchState = MatchDecimalNumber;
                ProcessedChar = true;
            } /*if*/
            break;
        case MatchDecimalNumber:
            if ((ThisCh >= '0') && (ThisCh <= '9'))
            {
                AccumulateDigit(CharCode, 10, ThisCh - '0');
                ProcessedChar = true;
            }
            else if (ThisCh == ';')
            {
                ProcessedChar = true;
                GotCharCode = true;
            } /*if*/
            break;
        case MatchHexNumber:
        {
            const int Digit = HexDigitValue(ThisCh);
            if (Digit >= 0)
            {
                AccumulateDigit(CharCode, 16, static_cast<unsigned int>(Digit));
                GotHexDigit = true;
                ProcessedChar = true;
            }
            else if ((ThisCh == ';') && GotHexDigit)
            {
                ProcessedChar = true;
                GotCharCode = true;
            } /*if*/
            break;
        }
        } /*switch*/
        if (GotCharCode)
        {
            WriteUTF8(OutBuffer, CharCode);
            Pending.erase();
            MatchState = NoMatch;
        }
        else if (ProcessedChar)
        {
            Pending.append(1, static_cast<char>(ThisCh));
        }
        else
        {
            /* ThisCh does not continue a reference: whatever was collected so
               far is ordinary text, then ThisCh is handled from scratch */
            OutBuffer += Pending;
            Pending.erase();
            MatchState = NoMatch;
            if (ThisCh == '&')
            {
                Pending.assign(1, '&');
                MatchingName.erase();
                MatchState = MatchBegin;
            }
            else
            {
                OutBuffer += static_cast<char>(ThisCh);
            } /*if*/
        } /*if*/
    } /*for*/
    /* input ended inside an unfinished reference: keep its text */
    OutBuffer += Pending;
    return OutBuffer;
} /*UnquoteHTML*/
-------------------------------------------------------------------------------- /VersionInfo.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Module : VersionInfo.cpp 3 | Purpose: Implementation for a C++ class encapsulation of Window's "Version Infos" 4 | Created: PJN / 10-04-2000 5 | History: PJN / 07-07-2006 1. Updated copyright details 6 | 2. Updated the code to clean compile on VC 2005 7 | 3. Addition of CVERSIONINFO_EXT_CLASS and CVERSIONINFO_EXT_API macros to allow 8 | the class to be easily added to an extension DLL. 9 | 4. Optimized CVersionInfo constructor code 10 | 5. Reviewed all TRACE statements for correctness 11 | 6. Updated the documentation to use the same style as the web site. 12 | PJN / 14-09-2008 1. Updated copyright details. 13 | 2. Code now compiles cleanly using Code Analysis (/analyze) 14 | 3. Updated code to compile correctly using _ATL_CSTRING_EXPLICIT_CONSTRUCTORS define 15 | 4. Updated sample app to clean compile on VC 2008 16 | 5. The code has now been updated to support VC 2005 or later only. 17 | 6. Removed VC 6 style AppWizard comments from the code. 18 | 7. Reworked code to use ATL::CHeapPtr for required memory allocations 19 | PJN / 04-01-2015 1. Updated the code to clean compile in VC 2010 - VC 2013. 20 | 2. Updated copyright details.
21 | 3. Replaced all TRACE calls with ATLTRACE. 22 | PJN / 29-11-2015 1. Updated the code to clean compile in VC 2015. 23 | 2. Reworked the classes to optionally compile without MFC. By default the class now 24 | use STL classes and idioms but if you define CVERSIONINFO_MFC_EXTENSIONS the class 25 | will revert back to the MFC behaviour. 26 | 3. All the class methods have had SAL annotations added 27 | PJN / 02-01-2016 1. Updated copyright details. 28 | 2. CVersionInfo::GetValue now uses wostringstream and ostringstream instead of 29 | wstringstream and stringstream. 30 | PJN / 26-09-2017 1. Updated copyright details. 31 | 2. Replaced NULL throughout the codebase with nullptr. This means that the minimum 32 | requirement for the framework is now VC 2010. 33 | 3. Replaced CString::operator LPC*STR() calls throughout the codebase with 34 | CString::GetString calls 35 | 4. Made all the Get* methods const. 36 | PJN / 04-06-2018 1. Updated copyright details. 37 | 2. Fixed a number of C++ core guidelines compiler warnings. These changes mean that 38 | the code will now only compile on VC 2017 or later. 39 | PJN / 16-09-2018 1. Fixed a number of compiler warnings when using VS 2017 15.8.4 40 | PJN / 21-04-2019 1. Updated copyright details. 41 | 2. Removed the code path supported by the non defunct CVERSIONINFO_MFC_EXTENSIONS 42 | enum 43 | PJN / 21-12-2019 1. Fixed various Clang-Tidy static code analysis warnings in the code. 44 | PJN / 14-03-2020 1. Updated copyright details. 45 | 2. Fixed more Clang-Tidy static code analysis warnings in the code. 46 | PJN / 12-04-2020 1. Updated copyright details. 47 | 2. Fixed more Clang-Tidy static code analysis warnings in the code. 48 | PJN / 07-02-2022 1. Updated the code to use C++ uniform initialization for all variable declarations 49 | 2. Updated copyright details. 50 | 51 | Copyright (c) 2000 - 2022 by PJ Naughter (Web: www.naughter.com, Email: pjna@naughter.com) 52 | 53 | All rights reserved. 
54 | 55 | Copyright / Usage Details: 56 | 57 | You are allowed to include the source code in any product (commercial, shareware, freeware or otherwise) 58 | when your product is released in binary form. You are allowed to modify the source code in any way you want 59 | except you cannot modify the copyright details at the top of each module. If you want to distribute source 60 | code with your application, then you are only allowed to distribute versions released by the author. This is 61 | to maintain a single distribution point for the source code. 62 | 63 | */ 64 | 65 | //////////////// Includes ///////////////////////////////////////////////////// 66 | 67 | #include "stdafx.h" 68 | #include "VersionInfo.h" 69 | 70 | #ifndef _SSTREAM_ 71 | #pragma message("To avoid this message, please put sstream in your pre compiled header (normally stdafx.h)") 72 | #include 73 | #endif //#ifndef _SSTREAM_ 74 | 75 | #ifndef _IOMANIP_ 76 | #pragma message("To avoid this message, please put iomanip in your pre compiled header (normally stdafx.h)") 77 | #include 78 | #endif //#ifndef _IOMANIP_ 79 | 80 | 81 | //////////////// Macros / Locals ////////////////////////////////////////////// 82 | 83 | #ifdef _DEBUG 84 | #define new DEBUG_NEW 85 | #endif //#ifdef _DEBUG 86 | 87 | //Automatically pull in the win32 version Library 88 | #pragma comment(lib, "version.lib") 89 | 90 | 91 | //////////////// Implementation /////////////////////////////////////////////// 92 | 93 | CVersionInfo::CVersionInfo() noexcept : m_wLangID{0}, 94 | m_wCharset{1252}, //Use the ANSI code page as a default 95 | m_pTranslations{nullptr}, 96 | m_nTranslations{0}, 97 | m_pffi{nullptr} 98 | { 99 | } 100 | 101 | CVersionInfo::~CVersionInfo() noexcept 102 | { 103 | Unload(); 104 | } 105 | 106 | void CVersionInfo::Unload() noexcept 107 | { 108 | m_pffi = nullptr; 109 | m_VerData.clear(); 110 | m_wLangID = 0; 111 | m_wCharset = 1252; //Use the ANSI code page as a default 112 | m_pTranslations = nullptr; 113 | 
m_nTranslations = 0; 114 | } 115 | 116 | #pragma warning(suppress: 26440) 117 | BOOL CVersionInfo::Load(_In_z_ LPCTSTR szFileName) 118 | { 119 | //Free up any previous memory lying around 120 | Unload(); 121 | 122 | BOOL bSuccess{ FALSE }; 123 | const DWORD dwSize{ GetFileVersionInfoSize(szFileName, nullptr) }; 124 | if (dwSize) 125 | { 126 | //Allocate some memory to hold the version info data 127 | m_VerData.resize(dwSize); 128 | if (GetFileVersionInfo(szFileName, 0, dwSize, m_VerData.data())) 129 | { 130 | //Get the fixed size version info data 131 | UINT nLen{ 0 }; 132 | #pragma warning(suppress: 26490) 133 | if (VerQueryValue(m_VerData.data(), _T("\\"), reinterpret_cast(&m_pffi), &nLen)) 134 | { 135 | //Retrieve the Lang ID and Character set ID 136 | #pragma warning(suppress: 26490) 137 | if (VerQueryValue(m_VerData.data(), _T("\\VarFileInfo\\Translation"), reinterpret_cast(&m_pTranslations), &nLen) && (nLen >= sizeof(TRANSLATION))) 138 | { 139 | m_nTranslations = nLen / sizeof(TRANSLATION); 140 | #pragma warning(suppress: 26481) 141 | m_wLangID = m_pTranslations[0].m_wLangID; 142 | #pragma warning(suppress: 26481) 143 | m_wCharset = m_pTranslations[0].m_wCodePage; 144 | } 145 | bSuccess = TRUE; 146 | } 147 | else 148 | ATLTRACE(_T("CVersionInfo::Load, Failed to query file size version info for file %s, Error:%u\n"), szFileName, ::GetLastError()); 149 | } 150 | else 151 | ATLTRACE(_T("CVersionInfo::Load, Failed to read in version info for file %s, Error:%u\n"), szFileName, ::GetLastError()); 152 | } 153 | else 154 | ATLTRACE(_T("CVersionInfo::Load, Failed to get version info for file %s, Error:%u\n"), szFileName, ::GetLastError()); 155 | 156 | return bSuccess; 157 | } 158 | 159 | VS_FIXEDFILEINFO* CVersionInfo::GetFixedFileInfo() const noexcept 160 | { 161 | return m_pffi; 162 | } 163 | 164 | DWORD CVersionInfo::GetFileFlagsMask() const noexcept 165 | { 166 | //Validate our parameters 167 | #pragma warning(suppress: 26477) 168 | ATLASSUME(m_pffi != nullptr); 
169 | 170 | return m_pffi->dwFileFlagsMask; 171 | } 172 | 173 | DWORD CVersionInfo::GetFileFlags() const noexcept 174 | { 175 | //Validate our parameters 176 | #pragma warning(suppress: 26477) 177 | ATLASSUME(m_pffi != nullptr); 178 | 179 | return m_pffi->dwFileFlags; 180 | } 181 | 182 | DWORD CVersionInfo::GetOS() const noexcept 183 | { 184 | //Validate our parameters 185 | #pragma warning(suppress: 26477) 186 | ATLASSUME(m_pffi != nullptr); 187 | 188 | return m_pffi->dwFileOS; 189 | } 190 | 191 | DWORD CVersionInfo::GetFileType() const noexcept 192 | { 193 | //Validate our parameters 194 | #pragma warning(suppress: 26477) 195 | ATLASSUME(m_pffi != nullptr); 196 | 197 | return m_pffi->dwFileType; 198 | } 199 | 200 | DWORD CVersionInfo::GetFileSubType() const noexcept 201 | { 202 | //Validate our parameters 203 | #pragma warning(suppress: 26477) 204 | ATLASSUME(m_pffi != nullptr); 205 | 206 | return m_pffi->dwFileSubtype; 207 | } 208 | 209 | FILETIME CVersionInfo::GetCreationTime() const noexcept 210 | { 211 | //Validate our parameters 212 | #pragma warning(suppress: 26477) 213 | ATLASSUME(m_pffi != nullptr); 214 | 215 | FILETIME CreationTime{}; 216 | CreationTime.dwHighDateTime = m_pffi->dwFileDateMS; 217 | CreationTime.dwLowDateTime = m_pffi->dwFileDateLS; 218 | return CreationTime; 219 | } 220 | 221 | unsigned __int64 CVersionInfo::GetFileVersion() const noexcept 222 | { 223 | //Validate our parameters 224 | #pragma warning(suppress: 26477) 225 | ATLASSUME(m_pffi != nullptr); 226 | 227 | unsigned __int64 nFileVersion{ m_pffi->dwFileVersionLS }; 228 | #pragma warning(suppress: 26472) 229 | nFileVersion += ((static_cast(m_pffi->dwFileVersionMS)) << 32); 230 | return nFileVersion; 231 | } 232 | 233 | unsigned __int64 CVersionInfo::GetProductVersion() const noexcept 234 | { 235 | //Validate our parameters 236 | #pragma warning(suppress: 26477) 237 | ATLASSUME(m_pffi != nullptr); 238 | 239 | unsigned __int64 nProductVersion{ m_pffi->dwProductVersionLS }; 240 | 
#pragma warning(suppress: 26472) 241 | nProductVersion += ((static_cast(m_pffi->dwProductVersionMS)) << 32); 242 | return nProductVersion; 243 | } 244 | 245 | CVersionInfo::String CVersionInfo::GetValue(_In_z_ LPCTSTR pszKey) const 246 | { 247 | //What will be the return value from this function 248 | String sVal; 249 | 250 | //Form the string to query with 251 | String sQueryValue; 252 | #ifdef _UNICODE 253 | std::wostringstream ss; 254 | #else 255 | std::ostringstream ss; 256 | #endif //#ifdef _UNICODE 257 | ss << _T("\\StringFileInfo\\"); 258 | ss << std::setfill(_T('0')) << std::setw(4) << std::hex << m_wLangID; 259 | ss << std::setfill(_T('0')) << std::setw(4) << std::hex << m_wCharset; 260 | ss << _T("\\"); 261 | ss << pszKey; 262 | sQueryValue = ss.str(); 263 | 264 | LPCTSTR pszQueryValue = sQueryValue.c_str(); 265 | 266 | //Do the query 267 | LPTSTR pVal{ nullptr }; 268 | UINT nLen{ 0 }; 269 | #pragma warning(suppress: 26490) 270 | if (VerQueryValue(m_VerData.data(), pszQueryValue, reinterpret_cast(&pVal), &nLen)) 271 | sVal = pVal; 272 | 273 | return sVal; 274 | } 275 | 276 | CVersionInfo::String CVersionInfo::GetCompanyName() const 277 | { 278 | return GetValue(_T("CompanyName")); 279 | } 280 | 281 | CVersionInfo::String CVersionInfo::GetFileDescription() const 282 | { 283 | return GetValue(_T("FileDescription")); 284 | } 285 | 286 | CVersionInfo::String CVersionInfo::GetFileVersionAsString() const 287 | { 288 | return GetValue(_T("FileVersion")); 289 | } 290 | 291 | CVersionInfo::String CVersionInfo::GetInternalName() const 292 | { 293 | return GetValue(_T("InternalName")); 294 | } 295 | 296 | CVersionInfo::String CVersionInfo::GetLegalCopyright() const 297 | { 298 | return GetValue(_T("LegalCopyright")); 299 | } 300 | 301 | CVersionInfo::String CVersionInfo::GetOriginalFilename() const 302 | { 303 | return GetValue(_T("OriginalFilename")); 304 | } 305 | 306 | CVersionInfo::String CVersionInfo::GetProductName() const 307 | { 308 | return 
GetValue(_T("Productname")); 309 | } 310 | 311 | CVersionInfo::String CVersionInfo::GetProductVersionAsString() const 312 | { 313 | return GetValue(_T("ProductVersion")); 314 | } 315 | 316 | int CVersionInfo::GetNumberOfTranslations() const noexcept 317 | { 318 | return m_nTranslations; 319 | } 320 | 321 | CVersionInfo::String CVersionInfo::GetComments() const 322 | { 323 | return GetValue(_T("Comments")); 324 | } 325 | 326 | CVersionInfo::String CVersionInfo::GetLegalTrademarks() const 327 | { 328 | return GetValue(_T("LegalTrademarks")); 329 | } 330 | 331 | CVersionInfo::String CVersionInfo::GetPrivateBuild() const 332 | { 333 | return GetValue(_T("PrivateBuild")); 334 | } 335 | 336 | CVersionInfo::String CVersionInfo::GetSpecialBuild() const 337 | { 338 | return GetValue(_T("SpecialBuild")); 339 | } 340 | 341 | CVersionInfo::TRANSLATION* CVersionInfo::GetTranslation(_In_ int nIndex) const noexcept 342 | { 343 | //Validate our parameters 344 | #pragma warning(suppress: 26477) 345 | ATLASSERT((nIndex >= 0) && (nIndex < m_nTranslations)); 346 | #pragma warning(suppress: 26477) 347 | ATLASSUME(m_pTranslations != nullptr); 348 | 349 | #pragma warning(suppress: 26481) 350 | return &m_pTranslations[nIndex]; 351 | } 352 | 353 | void CVersionInfo::SetTranslation(_In_ int nIndex) noexcept 354 | { 355 | const TRANSLATION* pTranslation{ GetTranslation(nIndex) }; 356 | #pragma warning(suppress: 26477) 357 | ATLASSUME(pTranslation != nullptr); 358 | 359 | m_wLangID = pTranslation->m_wLangID; //NOLINT(clang-analyzer-core.NullDereference) 360 | m_wCharset = pTranslation->m_wCodePage; 361 | } 362 | -------------------------------------------------------------------------------- /VersionInfo.h: -------------------------------------------------------------------------------- 1 | /* 2 | Module : VersionInfo.h 3 | Purpose: Interface for an C++ class encapsulation of Window's "Version Infos" 4 | 5 | Copyright (c) 2000 - 2022 by PJ Naughter (Web: www.naughter.com, Email: 
pjna@naughter.com) 6 | 7 | All rights reserved. 8 | 9 | Copyright / Usage Details: 10 | 11 | You are allowed to include the source code in any product (commercial, shareware, freeware or otherwise) 12 | when your product is released in binary form. You are allowed to modify the source code in any way you want 13 | except you cannot modify the copyright details at the top of each module. If you want to distribute source 14 | code with your application, then you are only allowed to distribute versions released by the author. This is 15 | to maintain a single distribution point for the source code. 16 | 17 | */ 18 | 19 | 20 | /////////////////////////////// Macros / Defines ////////////////////////////// 21 | 22 | #pragma once 23 | 24 | #ifndef __VERSIONINFO_H__ 25 | #define __VERSIONINFO_H__ 26 | 27 | #ifndef CVERSIONINFO_EXT_CLASS 28 | #define CVERSIONINFO_EXT_CLASS 29 | #endif //#ifndef CVERSIONINFO_EXT_CLASS 30 | 31 | #ifndef CVERSIONINFO_EXT_API 32 | #define CVERSIONINFO_EXT_API 33 | #endif //#ifndef CVERSIONINFO_EXT_API 34 | 35 | #ifndef _STRING_ 36 | #pragma message("To avoid this message, please put string in your pre compiled header (normally stdafx.h)") 37 | #include 38 | #endif //#ifndef _STRING_ 39 | 40 | #ifndef _VECTOR_ 41 | #pragma message("To avoid this message, please put vector in your pre compiled header (normally stdafx.h)") 42 | #include 43 | #endif //#ifndef _VECTOR_ 44 | 45 | 46 | /////////////////////////////// Classes /////////////////////////////////////// 47 | 48 | class CVERSIONINFO_EXT_CLASS CVersionInfo 49 | { 50 | public: 51 | //Typedefs 52 | #ifdef _UNICODE 53 | using String = std::wstring; 54 | #else 55 | using String = std::string; 56 | #endif //#ifdef _UNICODE 57 | 58 | //Structs 59 | struct TRANSLATION 60 | { 61 | WORD m_wLangID; //e.g. 0x0409 LANG_ENGLISH, SUBLANG_ENGLISH_USA 62 | WORD m_wCodePage; //e.g. 
1252 Codepage for Windows:Multilingual 63 | }; 64 | 65 | //Constructors / Destructors 66 | CVersionInfo() noexcept; 67 | CVersionInfo(_In_ const CVersionInfo&) = delete; 68 | CVersionInfo(_In_ CVersionInfo&&) = delete; 69 | ~CVersionInfo() noexcept; 70 | 71 | //Methods 72 | CVersionInfo& operator=(_In_ const CVersionInfo&) = delete; 73 | CVersionInfo& operator=(_In_ CVersionInfo&&) = delete; 74 | BOOL Load(_In_z_ LPCTSTR szFileName); 75 | _NODISCARD VS_FIXEDFILEINFO* GetFixedFileInfo() const noexcept; 76 | _NODISCARD DWORD GetFileFlagsMask() const noexcept; 77 | _NODISCARD DWORD GetFileFlags() const noexcept; 78 | _NODISCARD DWORD GetOS() const noexcept; 79 | _NODISCARD DWORD GetFileType() const noexcept; 80 | _NODISCARD DWORD GetFileSubType() const noexcept; 81 | _NODISCARD FILETIME GetCreationTime() const noexcept; 82 | _NODISCARD unsigned __int64 GetFileVersion() const noexcept; 83 | _NODISCARD unsigned __int64 GetProductVersion() const noexcept; 84 | _NODISCARD String GetValue(_In_z_ LPCTSTR pszKeyName) const; 85 | _NODISCARD String GetComments() const; 86 | _NODISCARD String GetCompanyName() const; 87 | _NODISCARD String GetFileDescription() const; 88 | _NODISCARD String GetFileVersionAsString() const; 89 | _NODISCARD String GetInternalName() const; 90 | _NODISCARD String GetLegalCopyright() const; 91 | _NODISCARD String GetLegalTrademarks() const; 92 | _NODISCARD String GetOriginalFilename() const; 93 | _NODISCARD String GetPrivateBuild() const; 94 | _NODISCARD String GetProductName() const; 95 | _NODISCARD String GetProductVersionAsString() const; 96 | _NODISCARD String GetSpecialBuild() const; 97 | _NODISCARD int GetNumberOfTranslations() const noexcept; 98 | _NODISCARD TRANSLATION* GetTranslation(_In_ int nIndex) const noexcept; 99 | void SetTranslation(_In_ int nIndex) noexcept; 100 | 101 | protected: 102 | //Methods 103 | void Unload() noexcept; 104 | 105 | //Data 106 | WORD m_wLangID; //The current language ID of the resource 107 | WORD m_wCharset; 
//The current Character set ID of the resource 108 | std::vector m_VerData; //Pointer to the version info blob 109 | TRANSLATION* m_pTranslations; //Pointer to the "\\VarFileInfo\\Translation" version info 110 | int m_nTranslations; //The number of translated version infos in the resource 111 | VS_FIXEDFILEINFO* m_pffi; //Pointer to the fixed size version info data 112 | }; 113 | 114 | #endif //#ifndef __VERSIONINFO_H__ 115 | -------------------------------------------------------------------------------- /WebSearchEngine-MySQL.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mihaimoga/WebSearchEngine/29584b55b0e53774543e62e15eee3579498d1cb5/WebSearchEngine-MySQL.png -------------------------------------------------------------------------------- /WebSearchEngine.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2022-2025 Stefan-Mihai MOGA 2 | This file is part of WebSearchEngine application developed by Stefan-Mihai MOGA. 3 | 4 | WebSearchEngine is free software: you can redistribute it and/or modify it 5 | under the terms of the GNU General Public License as published by the Open 6 | Source Initiative, either version 3 of the License, or any later version. 7 | 8 | WebSearchEngine is distributed in the hope that it will be useful, but 9 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 10 | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 11 | 12 | You should have received a copy of the GNU General Public License along with 13 | WebSearchEngine. If not, see */ 14 | 15 | // WebSearchEngine.cpp : Defines the class behaviors for the application. 
16 | // 17 | 18 | #include "stdafx.h" 19 | #include "WebSearchEngine.h" 20 | #include "WebSearchEngineDlg.h" 21 | 22 | #ifdef _DEBUG 23 | #define new DEBUG_NEW 24 | #endif 25 | 26 | // CWebSearchEngineApp 27 | 28 | BEGIN_MESSAGE_MAP(CWebSearchEngineApp, CWinApp) 29 | ON_COMMAND(ID_HELP, &CWinApp::OnHelp) 30 | END_MESSAGE_MAP() 31 | 32 | // CWebSearchEngineApp construction 33 | 34 | CWebSearchEngineApp::CWebSearchEngineApp() 35 | { 36 | // support Restart Manager 37 | m_dwRestartManagerSupportFlags = AFX_RESTART_MANAGER_SUPPORT_RESTART; 38 | 39 | // TODO: add construction code here, 40 | // Place all significant initialization in InitInstance 41 | } 42 | 43 | // The one and only CWebSearchEngineApp object 44 | 45 | CWebSearchEngineApp theApp; 46 | 47 | // CWebSearchEngineApp initialization 48 | 49 | BOOL CWebSearchEngineApp::InitInstance() 50 | { 51 | // InitCommonControlsEx() is required on Windows XP if an application 52 | // manifest specifies use of ComCtl32.dll version 6 or later to enable 53 | // visual styles. Otherwise, any window creation will fail. 54 | INITCOMMONCONTROLSEX InitCtrls; 55 | InitCtrls.dwSize = sizeof(InitCtrls); 56 | // Set this to include all the common control classes you want to use 57 | // in your application. 58 | InitCtrls.dwICC = ICC_WIN95_CLASSES; 59 | InitCommonControlsEx(&InitCtrls); 60 | 61 | CWinApp::InitInstance(); 62 | 63 | if (!AfxSocketInit()) 64 | { 65 | AfxMessageBox(IDP_SOCKETS_INIT_FAILED); 66 | return FALSE; 67 | } 68 | 69 | AfxEnableControlContainer(); 70 | 71 | // Create the shell manager, in case the dialog contains 72 | // any shell tree view or shell list view controls. 
73 | CShellManager *pShellManager = new CShellManager; 74 | 75 | // Activate "Windows Native" visual manager for enabling themes in MFC controls 76 | CMFCVisualManager::SetDefaultManager(RUNTIME_CLASS(CMFCVisualManagerWindows)); 77 | 78 | // Standard initialization 79 | // If you are not using these features and wish to reduce the size 80 | // of your final executable, you should remove from the following 81 | // the specific initialization routines you do not need 82 | // Change the registry key under which our settings are stored 83 | // TODO: You should modify this string to be something appropriate 84 | // such as the name of your company or organization 85 | SetRegistryKey(_T("Stefan-Mihai MOGA")); 86 | 87 | CWebSearchEngineDlg dlg; 88 | m_pMainWnd = &dlg; 89 | INT_PTR nResponse = dlg.DoModal(); 90 | if (nResponse == IDOK) 91 | { 92 | // TODO: Place code here to handle when the dialog is 93 | // dismissed with OK 94 | } 95 | else if (nResponse == IDCANCEL) 96 | { 97 | // TODO: Place code here to handle when the dialog is 98 | // dismissed with Cancel 99 | } 100 | else if (nResponse == -1) 101 | { 102 | TRACE(traceAppMsg, 0, "Warning: dialog creation failed, so application is terminating unexpectedly.\n"); 103 | TRACE(traceAppMsg, 0, "Warning: if you are using MFC controls on the dialog, you cannot #define _AFX_NO_MFC_CONTROLS_IN_DIALOGS.\n"); 104 | } 105 | 106 | dlg.m_pConnection.Close(); 107 | dlg.m_pEnvironment.Close(); 108 | 109 | // Delete the shell manager created above. 110 | if (pShellManager != NULL) 111 | { 112 | delete pShellManager; 113 | } 114 | 115 | #ifndef _AFXDLL 116 | ControlBarCleanUp(); 117 | #endif 118 | 119 | // Since the dialog has been closed, return FALSE so that we exit the 120 | // application, rather than start the application's message pump. 
121 | return FALSE; 122 | } 123 | -------------------------------------------------------------------------------- /WebSearchEngine.h: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2022-2025 Stefan-Mihai MOGA 2 | This file is part of WebSearchEngine application developed by Stefan-Mihai MOGA. 3 | 4 | WebSearchEngine is free software: you can redistribute it and/or modify it 5 | under the terms of the GNU General Public License as published by the Open 6 | Source Initiative, either version 3 of the License, or any later version. 7 | 8 | WebSearchEngine is distributed in the hope that it will be useful, but 9 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 10 | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 11 | 12 | You should have received a copy of the GNU General Public License along with 13 | WebSearchEngine. If not, see */ 14 | 15 | // WebSearchEngine.h : main header file for the PROJECT_NAME application 16 | // 17 | 18 | #pragma once 19 | 20 | #ifndef __AFXWIN_H__ 21 | #error "include 'stdafx.h' before including this file for PCH" 22 | #endif 23 | 24 | #include "resource.h" // main symbols 25 | 26 | // CWebSearchEngineApp: 27 | // See WebSearchEngine.cpp for the implementation of this class 28 | // 29 | 30 | class CWebSearchEngineApp : public CWinApp 31 | { 32 | public: 33 | CWebSearchEngineApp(); 34 | 35 | // Overrides 36 | public: 37 | virtual BOOL InitInstance(); 38 | 39 | // Implementation 40 | 41 | DECLARE_MESSAGE_MAP() 42 | }; 43 | 44 | extern CWebSearchEngineApp theApp; 45 | -------------------------------------------------------------------------------- /WebSearchEngine.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mihaimoga/WebSearchEngine/29584b55b0e53774543e62e15eee3579498d1cb5/WebSearchEngine.png 
-------------------------------------------------------------------------------- /WebSearchEngine.rc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mihaimoga/WebSearchEngine/29584b55b0e53774543e62e15eee3579498d1cb5/WebSearchEngine.rc -------------------------------------------------------------------------------- /WebSearchEngine.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 14 4 | VisualStudioVersion = 14.0.25420.1 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WebSearchEngine", "WebSearchEngine.vcxproj", "{8CA73E38-C08B-48CB-A4E2-922B23006DCA}" 7 | EndProject 8 | Global 9 | GlobalSection(SubversionScc) = preSolution 10 | Svn-Managed = True 11 | Manager = AnkhSVN - Subversion Support for Visual Studio 12 | EndGlobalSection 13 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 14 | Debug|x64 = Debug|x64 15 | Debug|x86 = Debug|x86 16 | Release|x64 = Release|x64 17 | Release|x86 = Release|x86 18 | EndGlobalSection 19 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 20 | {8CA73E38-C08B-48CB-A4E2-922B23006DCA}.Debug|x64.ActiveCfg = Debug|x64 21 | {8CA73E38-C08B-48CB-A4E2-922B23006DCA}.Debug|x64.Build.0 = Debug|x64 22 | {8CA73E38-C08B-48CB-A4E2-922B23006DCA}.Debug|x86.ActiveCfg = Debug|Win32 23 | {8CA73E38-C08B-48CB-A4E2-922B23006DCA}.Debug|x86.Build.0 = Debug|Win32 24 | {8CA73E38-C08B-48CB-A4E2-922B23006DCA}.Release|x64.ActiveCfg = Release|x64 25 | {8CA73E38-C08B-48CB-A4E2-922B23006DCA}.Release|x64.Build.0 = Release|x64 26 | {8CA73E38-C08B-48CB-A4E2-922B23006DCA}.Release|x86.ActiveCfg = Release|Win32 27 | {8CA73E38-C08B-48CB-A4E2-922B23006DCA}.Release|x86.Build.0 = Release|Win32 28 | EndGlobalSection 29 | GlobalSection(SolutionProperties) = preSolution 30 | HideSolutionNode = FALSE 31 | 
EndGlobalSection 32 | EndGlobal 33 | -------------------------------------------------------------------------------- /WebSearchEngine.vcxproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Release 10 | Win32 11 | 12 | 13 | Debug 14 | x64 15 | 16 | 17 | Release 18 | x64 19 | 20 | 21 | 22 | {8CA73E38-C08B-48CB-A4E2-922B23006DCA} 23 | WebSearchEngine 24 | 10.0 25 | MFCProj 26 | 27 | 28 | 29 | Application 30 | true 31 | v143 32 | Unicode 33 | Static 34 | 35 | 36 | Application 37 | false 38 | v143 39 | true 40 | Unicode 41 | Static 42 | 43 | 44 | Application 45 | true 46 | v143 47 | Unicode 48 | Static 49 | 50 | 51 | Application 52 | false 53 | v143 54 | true 55 | Unicode 56 | Static 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | true 78 | 79 | 80 | true 81 | 82 | 83 | false 84 | 85 | 86 | false 87 | 88 | 89 | 90 | Use 91 | Level3 92 | Disabled 93 | WIN32;_WINDOWS;_DEBUG;%(PreprocessorDefinitions) 94 | true 95 | stdcpplatest 96 | stdclatest 97 | 98 | 99 | Windows 100 | 101 | 102 | false 103 | true 104 | _DEBUG;%(PreprocessorDefinitions) 105 | 106 | 107 | 0x0409 108 | _DEBUG;%(PreprocessorDefinitions) 109 | $(IntDir);%(AdditionalIncludeDirectories) 110 | 111 | 112 | 113 | 114 | Use 115 | Level4 116 | Disabled 117 | _WINDOWS;_DEBUG;%(PreprocessorDefinitions) 118 | true 119 | stdcpplatest 120 | stdclatest 121 | 122 | 123 | Windows 124 | 125 | 126 | false 127 | true 128 | _DEBUG;%(PreprocessorDefinitions) 129 | 130 | 131 | 0x0409 132 | _DEBUG;%(PreprocessorDefinitions) 133 | $(IntDir);%(AdditionalIncludeDirectories) 134 | 135 | 136 | 137 | 138 | Level3 139 | Use 140 | MaxSpeed 141 | true 142 | true 143 | WIN32;_WINDOWS;NDEBUG;%(PreprocessorDefinitions) 144 | true 145 | stdcpplatest 146 | stdclatest 147 | 148 | 149 | Windows 150 | true 151 | true 152 | 153 | 154 | false 155 | true 156 | NDEBUG;%(PreprocessorDefinitions) 157 | 158 | 
159 | 0x0409 160 | NDEBUG;%(PreprocessorDefinitions) 161 | $(IntDir);%(AdditionalIncludeDirectories) 162 | 163 | 164 | 165 | 166 | Level3 167 | Use 168 | MaxSpeed 169 | true 170 | true 171 | _WINDOWS;NDEBUG;%(PreprocessorDefinitions) 172 | true 173 | stdcpplatest 174 | stdclatest 175 | 176 | 177 | Windows 178 | true 179 | true 180 | 181 | 182 | false 183 | true 184 | NDEBUG;%(PreprocessorDefinitions) 185 | 186 | 187 | 0x0409 188 | NDEBUG;%(PreprocessorDefinitions) 189 | $(IntDir);%(AdditionalIncludeDirectories) 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | Create 214 | Create 215 | Create 216 | Create 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | -------------------------------------------------------------------------------- /WebSearchEngine.vcxproj.filters: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF} 6 | cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx 7 | 8 | 9 | {93995380-89BD-4b04-88EB-625FBE52EBFB} 10 | h;hh;hpp;hxx;hm;inl;inc;xsd 11 | 12 | 13 | {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} 14 | rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | Header Files 23 | 24 | 25 | Header Files 26 | 27 | 28 | Header Files 29 | 30 | 31 | Header Files 32 | 33 | 34 | Header Files 35 | 36 | 37 | Header Files 38 | 39 | 40 | Header Files 41 | 42 | 43 | Header Files 44 | 45 | 46 | Header Files 47 | 48 | 49 | Header Files 50 | 51 | 52 | Header Files 53 | 54 | 55 | 56 | 57 | Source Files 58 | 59 | 60 | Source Files 61 | 62 | 63 | Source Files 64 | 65 | 66 | Source Files 67 | 68 | 69 | Source Files 70 | 71 | 72 | Source Files 73 | 74 | 75 | Source Files 76 | 77 | 78 | Source Files 79 | 80 | 81 | 
Source Files 82 | 83 | 84 | 85 | 86 | Resource Files 87 | 88 | 89 | 90 | 91 | Resource Files 92 | 93 | 94 | Source Files 95 | 96 | 97 | Source Files 98 | 99 | 100 | Source Files 101 | 102 | 103 | 104 | 105 | Resource Files 106 | 107 | 108 | Resource Files 109 | 110 | 111 | -------------------------------------------------------------------------------- /WebSearchEngineDlg.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2022-2025 Stefan-Mihai MOGA 2 | This file is part of WebSearchEngine application developed by Stefan-Mihai MOGA. 3 | 4 | WebSearchEngine is free software: you can redistribute it and/or modify it 5 | under the terms of the GNU General Public License as published by the Open 6 | Source Initiative, either version 3 of the License, or any later version. 7 | 8 | WebSearchEngine is distributed in the hope that it will be useful, but 9 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 10 | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 11 | 12 | You should have received a copy of the GNU General Public License along with 13 | WebSearchEngine. 
If not, see */ 14 | 15 | // WebSearchEngineDlg.cpp : implementation file 16 | // 17 | 18 | #include "stdafx.h" 19 | #include "WebSearchEngine.h" 20 | 21 | #include "WebSearchEngineDlg.h" 22 | #include "WebSearchEngineExt.h" 23 | #include "ConnectionSettingsDlg.h" 24 | #include "HtmlToText.h" 25 | 26 | #include "HLinkCtrl.h" 27 | #include "VersionInfo.h" 28 | 29 | #ifdef _DEBUG 30 | #define new DEBUG_NEW 31 | #endif 32 | 33 | DWORD WINAPI CrawlingThreadProc(LPVOID lpParam); 34 | 35 | // CAboutDlg dialog used for App About 36 | 37 | class CAboutDlg : public CDialog 38 | { 39 | public: 40 | CAboutDlg(); 41 | 42 | // Dialog Data 43 | enum { IDD = IDD_ABOUTBOX }; 44 | 45 | protected: 46 | virtual void DoDataExchange(CDataExchange* pDX); // DDX/DDV support 47 | 48 | // Implementation 49 | public: 50 | virtual BOOL OnInitDialog(); 51 | afx_msg void OnDestroy(); 52 | 53 | protected: 54 | CStatic m_ctrlVersion; 55 | CEdit m_ctrlWarning; 56 | CVersionInfo m_pVersionInfo; 57 | CHLinkCtrl m_ctrlWebsite; 58 | CHLinkCtrl m_ctrlEmail; 59 | 60 | DECLARE_MESSAGE_MAP() 61 | }; 62 | 63 | CAboutDlg::CAboutDlg() : CDialog(CAboutDlg::IDD) 64 | { 65 | } 66 | 67 | void CAboutDlg::DoDataExchange(CDataExchange* pDX) 68 | { 69 | CDialog::DoDataExchange(pDX); 70 | DDX_Control(pDX, IDC_VERSION, m_ctrlVersion); 71 | DDX_Control(pDX, IDC_WARNING, m_ctrlWarning); 72 | DDX_Control(pDX, IDC_WEBSITE, m_ctrlWebsite); 73 | DDX_Control(pDX, IDC_EMAIL, m_ctrlEmail); 74 | } 75 | 76 | BEGIN_MESSAGE_MAP(CAboutDlg, CDialog) 77 | ON_WM_DESTROY() 78 | END_MESSAGE_MAP() 79 | 80 | CString GetModuleFileName(_Inout_opt_ DWORD* pdwLastError = nullptr) 81 | { 82 | CString strModuleFileName; 83 | DWORD dwSize{ _MAX_PATH }; 84 | while (true) 85 | { 86 | TCHAR* pszModuleFileName{ strModuleFileName.GetBuffer(dwSize) }; 87 | const DWORD dwResult{ ::GetModuleFileName(nullptr, pszModuleFileName, dwSize) }; 88 | if (dwResult == 0) 89 | { 90 | if (pdwLastError != nullptr) 91 | *pdwLastError = GetLastError(); 92 | 
strModuleFileName.ReleaseBuffer(0); 93 | return CString{}; 94 | } 95 | else if (dwResult < dwSize) 96 | { 97 | if (pdwLastError != nullptr) 98 | *pdwLastError = ERROR_SUCCESS; 99 | strModuleFileName.ReleaseBuffer(dwResult); 100 | return strModuleFileName; 101 | } 102 | else if (dwResult == dwSize) 103 | { 104 | strModuleFileName.ReleaseBuffer(0); 105 | dwSize *= 2; 106 | } 107 | } 108 | } 109 | 110 | BOOL CAboutDlg::OnInitDialog() 111 | { 112 | CDialog::OnInitDialog(); 113 | 114 | CString strFullPath{ GetModuleFileName() }; 115 | if (strFullPath.IsEmpty()) 116 | #pragma warning(suppress: 26487) 117 | return FALSE; 118 | 119 | if (m_pVersionInfo.Load(strFullPath.GetString())) 120 | { 121 | CString strName = m_pVersionInfo.GetProductName().c_str(); 122 | CString strVersion = m_pVersionInfo.GetProductVersionAsString().c_str(); 123 | strVersion.Replace(_T(" "), _T("")); 124 | strVersion.Replace(_T(","), _T(".")); 125 | const int nFirst = strVersion.Find(_T('.')); 126 | const int nSecond = strVersion.Find(_T('.'), nFirst + 1); 127 | strVersion.Truncate(nSecond); 128 | m_ctrlVersion.SetWindowText(strName + _T(" version ") + strVersion); 129 | } 130 | 131 | m_ctrlWarning.SetWindowText(_T("This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. 
If not, see .")); 132 | 133 | m_ctrlWebsite.SetHyperLink(_T("https://www.moga.doctor/")); 134 | m_ctrlEmail.SetHyperLink(_T("mailto:stefan-mihai@moga.doctor")); 135 | 136 | return TRUE; // return TRUE unless you set the focus to a control 137 | // EXCEPTION: OCX Property Pages should return FALSE 138 | } 139 | 140 | void CAboutDlg::OnDestroy() 141 | { 142 | CDialog::OnDestroy(); 143 | } 144 | 145 | // CWebSearchEngineDlg dialog 146 | 147 | CWebSearchEngineDlg::CWebSearchEngineDlg(CWnd* pParent /*=NULL*/) 148 | : CDialogEx(IDD_WEBSEARCHENGINE_DIALOG, pParent) 149 | { 150 | m_hIcon = AfxGetApp()->LoadIcon(IDR_MAINFRAME); 151 | m_bThreadRunning = false; 152 | m_nThreadID = 0; 153 | } 154 | 155 | void CWebSearchEngineDlg::DoDataExchange(CDataExchange* pDX) 156 | { 157 | CDialogEx::DoDataExchange(pDX); 158 | DDX_Control(pDX, IDC_CRAWLING, m_pCrawling); 159 | DDX_Control(pDX, IDC_PROGRESS, m_pProgress); 160 | DDX_Control(pDX, IDC_WEBPAGES, m_pWebpageCounter); 161 | DDX_Control(pDX, IDC_KEYWORDS, m_pKeywordCounter); 162 | } 163 | 164 | BEGIN_MESSAGE_MAP(CWebSearchEngineDlg, CDialogEx) 165 | ON_WM_SYSCOMMAND() 166 | ON_WM_PAINT() 167 | ON_WM_QUERYDRAGICON() 168 | ON_BN_CLICKED(IDCANCEL, &CWebSearchEngineDlg::OnBnClickedCancel) 169 | END_MESSAGE_MAP() 170 | 171 | // CWebSearchEngineDlg message handlers 172 | 173 | BOOL CWebSearchEngineDlg::OnInitDialog() 174 | { 175 | CDialogEx::OnInitDialog(); 176 | 177 | // Add "About..." menu item to system menu. 178 | 179 | // IDM_ABOUTBOX must be in the system command range. 
180 | ASSERT((IDM_ABOUTBOX & 0xFFF0) == IDM_ABOUTBOX); 181 | ASSERT(IDM_ABOUTBOX < 0xF000); 182 | 183 | CMenu* pSysMenu = GetSystemMenu(FALSE); 184 | if (pSysMenu != NULL) 185 | { 186 | BOOL bNameValid; 187 | CString strAboutMenu; 188 | bNameValid = strAboutMenu.LoadString(IDS_ABOUTBOX); 189 | ASSERT(bNameValid); 190 | if (!strAboutMenu.IsEmpty()) 191 | { 192 | pSysMenu->AppendMenu(MF_SEPARATOR); 193 | pSysMenu->AppendMenu(MF_STRING, IDM_ABOUTBOX, strAboutMenu); 194 | } 195 | pSysMenu->AppendMenu(MF_SEPARATOR); 196 | pSysMenu->AppendMenu(MF_STRING, IDM_TWITTER, _T("Twitter")); 197 | pSysMenu->AppendMenu(MF_STRING, IDM_LINKEDIN, _T("LinkedIn")); 198 | pSysMenu->AppendMenu(MF_STRING, IDM_FACEBOOK, _T("Facebook")); 199 | pSysMenu->AppendMenu(MF_STRING, IDM_INSTAGRAM, _T("Instagram")); 200 | pSysMenu->AppendMenu(MF_SEPARATOR); 201 | pSysMenu->AppendMenu(MF_STRING, IDM_ISSUES, _T("Issues")); 202 | pSysMenu->AppendMenu(MF_STRING, IDM_DISCUSSIONS, _T("Discussions")); 203 | pSysMenu->AppendMenu(MF_STRING, IDM_WIKI, _T("Wiki")); 204 | } 205 | 206 | // Set the icon for this dialog. 
The framework does this automatically 207 | // when the application's main window is not a dialog 208 | SetIcon(m_hIcon, TRUE); // Set big icon 209 | SetIcon(m_hIcon, FALSE); // Set small icon 210 | 211 | // TODO: Add extra initialization here 212 | CConnectionSettingsDlg pConnectionSettingsDlg(this); 213 | if (pConnectionSettingsDlg.DoModal() != IDOK) 214 | return FALSE; 215 | 216 | CWinApp* pWinApp = AfxGetApp(); 217 | ASSERT(pWinApp != NULL); 218 | 219 | CString strHostName = pWinApp->GetProfileString(REGKEY_SECTION, REGKEY_HOSTNAME, DEFAULT_HOSTNAME); 220 | CString strHostPort = pWinApp->GetProfileString(REGKEY_SECTION, REGKEY_HOSTPORT, DEFAULT_HOSTPORT); 221 | CString strDatabase = pWinApp->GetProfileString(REGKEY_SECTION, REGKEY_DATABASE, DEFAULT_DATABASE); 222 | // CString strFileName = pWinApp->GetProfileString(REGKEY_SECTION, REGKEY_FILENAME, DEFAULT_FILENAME); 223 | CString strUsername = pWinApp->GetProfileString(REGKEY_SECTION, REGKEY_USERNAME, DEFAULT_USERNAME); 224 | 225 | TCHAR lpszPassword[0x100] = { 0, }; 226 | VERIFY(GetRegistryPassword(NULL, REGKEY_SECTION, REGKEY_PASSWORD, lpszPassword, DEFAULT_PASSWORD)); 227 | 228 | SQLRETURN nRet = m_pEnvironment.Create(); 229 | ODBC_CHECK_RETURN_FALSE(nRet, m_pEnvironment); 230 | 231 | nRet = m_pEnvironment.SetAttr(SQL_ATTR_ODBC_VERSION, SQL_OV_ODBC3_80); 232 | ODBC_CHECK_RETURN_FALSE(nRet, m_pEnvironment); 233 | 234 | nRet = m_pEnvironment.SetAttrU(SQL_ATTR_CONNECTION_POOLING, SQL_CP_DEFAULT); 235 | ODBC_CHECK_RETURN_FALSE(nRet, m_pEnvironment); 236 | 237 | nRet = m_pConnection.Create(m_pEnvironment); 238 | ODBC_CHECK_RETURN_FALSE(nRet, m_pConnection); 239 | 240 | _stprintf(m_sConnectionInString, _T("Driver={MySQL ODBC 8.0 Unicode Driver};Server=%s;Port=%s;Database=%s;User=%s;Password=%s;"), 241 | strHostName.GetBuffer(0), strHostPort.GetBuffer(0), strDatabase.GetBuffer(0), strUsername.GetBuffer(0), lpszPassword); 242 | strHostName.ReleaseBuffer(); 243 | strHostPort.ReleaseBuffer(); 244 | 
strDatabase.ReleaseBuffer(); 245 | strUsername.ReleaseBuffer(); 246 | nRet = m_pConnection.DriverConnect(const_cast(reinterpret_cast(m_sConnectionInString)), m_sConnectionOutString); 247 | ODBC_CHECK_RETURN_FALSE(nRet, m_pConnection); 248 | 249 | m_pWebpageCounter.SetWindowText(_T("0")); 250 | m_pKeywordCounter.SetWindowText(_T("0")); 251 | 252 | CGenericStatement pGenericStatement; 253 | VERIFY(pGenericStatement.Execute(m_pConnection, _T("DROP TABLE IF EXISTS `occurrence`;"))); 254 | VERIFY(pGenericStatement.Execute(m_pConnection, _T("DROP TABLE IF EXISTS `keyword`;"))); 255 | VERIFY(pGenericStatement.Execute(m_pConnection, _T("DROP TABLE IF EXISTS `webpage`;"))); 256 | VERIFY(pGenericStatement.Execute(m_pConnection, _T("CREATE TABLE `webpage` (`webpage_id` BIGINT NOT NULL AUTO_INCREMENT, `url` VARCHAR(256) NOT NULL, `title` VARCHAR(256) NOT NULL, `content` LONGTEXT NOT NULL, PRIMARY KEY(`webpage_id`)) ENGINE=InnoDB CHARACTER SET utf8 COLLATE utf8_general_ci;"))); 257 | VERIFY(pGenericStatement.Execute(m_pConnection, _T("CREATE TABLE `keyword` (`keyword_id` BIGINT NOT NULL AUTO_INCREMENT, `name` VARCHAR(256) NOT NULL, PRIMARY KEY(`keyword_id`)) ENGINE=InnoDB CHARACTER SET utf8 COLLATE utf8_general_ci;"))); 258 | VERIFY(pGenericStatement.Execute(m_pConnection, _T("CREATE TABLE `occurrence` (`webpage_id` BIGINT NOT NULL, `keyword_id` BIGINT NOT NULL, `counter` BIGINT NOT NULL, `pagerank` REAL NOT NULL, PRIMARY KEY(`webpage_id`, `keyword_id`), FOREIGN KEY webpage_fk(webpage_id) REFERENCES webpage(webpage_id), FOREIGN KEY keyword_fk(keyword_id) REFERENCES keyword(keyword_id)) ENGINE=InnoDB CHARACTER SET utf8 COLLATE utf8_general_ci;"))); 259 | VERIFY(pGenericStatement.Execute(m_pConnection, _T("CREATE UNIQUE INDEX index_name ON `keyword`(`name`);"))); 260 | 261 | m_hThread = ::CreateThread(nullptr, 0, (LPTHREAD_START_ROUTINE)CrawlingThreadProc, this, 0, &m_nThreadID); 262 | 263 | return TRUE; // return TRUE unless you set the focus to a control 264 | } 265 | 266 | 
// Handles commands chosen from the dialog's system menu.
//   nID    - identifier of the selected system command
//   lParam - forwarded untouched to the base class for commands not handled here
// The About entry opens CAboutDlg; each social/project entry opens its web page
// through the shell; anything else goes to the default handler.
void CWebSearchEngineDlg::OnSysCommand(UINT nID, LPARAM lParam)
{
	// The About command is matched with the low four bits masked off.
	if ((nID & 0xFFF0) == IDM_ABOUTBOX)
	{
		CAboutDlg dlgAbout;
		dlgAbout.DoModal();
		return;
	}

	// Menu command -> page to open. Entries are tested in this order, first match wins.
	struct SysMenuLink
	{
		UINT nCommand;
		LPCTSTR lpszURL;
	};
	static const SysMenuLink arrLinks[] =
	{
		{ IDM_TWITTER, _T("https://x.com/stefanmihaimoga") },
		{ IDM_LINKEDIN, _T("https://www.linkedin.com/in/stefanmihaimoga/") },
		{ IDM_FACEBOOK, _T("https://www.facebook.com/stefanmihaimoga") },
		{ IDM_INSTAGRAM, _T("https://www.instagram.com/stefanmihaimoga/") },
		{ IDM_ISSUES, _T("https://github.com/mihaimoga/WebSearchEngine/issues") },
		{ IDM_DISCUSSIONS, _T("https://github.com/mihaimoga/WebSearchEngine/discussions") },
		{ IDM_WIKI, _T("https://github.com/mihaimoga/WebSearchEngine/wiki") },
	};

	for (const SysMenuLink& link : arrLinks)
	{
		if (nID == link.nCommand)
		{
			::ShellExecute(GetSafeHwnd(), _T("open"), link.lpszURL, nullptr, nullptr, SW_SHOW);
			return;
		}
	}

	// Not one of ours: let the framework process it.
	CDialog::OnSysCommand(nID, lParam);
}

// If you add a minimize button to your dialog, you will need the code below
//  to draw the icon.  For MFC applications using the document/view model,
//  this is automatically done for you by the framework.
332 | 333 | void CWebSearchEngineDlg::OnPaint() 334 | { 335 | if (IsIconic()) 336 | { 337 | CPaintDC dc(this); // device context for painting 338 | 339 | SendMessage(WM_ICONERASEBKGND, reinterpret_cast(dc.GetSafeHdc()), 0); 340 | 341 | // Center icon in client rectangle 342 | int cxIcon = GetSystemMetrics(SM_CXICON); 343 | int cyIcon = GetSystemMetrics(SM_CYICON); 344 | CRect rect; 345 | GetClientRect(&rect); 346 | int x = (rect.Width() - cxIcon + 1) / 2; 347 | int y = (rect.Height() - cyIcon + 1) / 2; 348 | 349 | // Draw the icon 350 | dc.DrawIcon(x, y, m_hIcon); 351 | } 352 | else 353 | { 354 | CDialogEx::OnPaint(); 355 | } 356 | } 357 | 358 | // The system calls this function to obtain the cursor to display while the user drags 359 | // the minimized window. 360 | HCURSOR CWebSearchEngineDlg::OnQueryDragIcon() 361 | { 362 | return static_cast(m_hIcon); 363 | } 364 | 365 | DWORD WINAPI CrawlingThreadProc(LPVOID lpParam) 366 | { 367 | std::string lpszURL, lpszFilename; 368 | if (lpParam != NULL) 369 | { 370 | CWebSearchEngineDlg* pWebSearchEngineDlg = (CWebSearchEngineDlg*)lpParam; 371 | pWebSearchEngineDlg->m_bThreadRunning = true; 372 | pWebSearchEngineDlg->m_pProgress.SetMarquee(TRUE, 30); 373 | AddURLToFrontier("https://en.wikipedia.org/"); 374 | while (pWebSearchEngineDlg->m_bThreadRunning) 375 | { 376 | if (ExtractURLFromFrontier(lpszURL)) 377 | { 378 | pWebSearchEngineDlg->m_pCrawling.SetWindowText(CString(lpszURL.c_str())); 379 | if (DownloadURLToFile(lpszURL, lpszFilename)) 380 | { 381 | if (!ProcessHTML(pWebSearchEngineDlg, lpszFilename, lpszURL)) 382 | { 383 | break; 384 | } 385 | } 386 | } 387 | else 388 | break; 389 | } 390 | 391 | pWebSearchEngineDlg->m_bThreadRunning = false; 392 | pWebSearchEngineDlg->m_pProgress.SetMarquee(FALSE, 30); 393 | } 394 | 395 | ::ExitThread(0); 396 | return 0; 397 | } 398 | 399 | BOOL WaitWithMessageLoop(HANDLE hEvent, DWORD dwTimeout) 400 | { 401 | DWORD dwRet; 402 | MSG msg; 403 | hEvent = hEvent ? 
hEvent : CreateEvent(NULL, FALSE, FALSE, NULL); 404 | 405 | while (true) 406 | { 407 | dwRet = MsgWaitForMultipleObjects(1, &hEvent, FALSE, dwTimeout, QS_ALLINPUT); 408 | if (dwRet == WAIT_OBJECT_0) 409 | return TRUE; 410 | if (dwRet != WAIT_OBJECT_0 + 1) 411 | break; 412 | while (PeekMessage(&msg, NULL, NULL, NULL, PM_REMOVE)) 413 | { 414 | TranslateMessage(&msg); 415 | DispatchMessage(&msg); 416 | if (WaitForSingleObject(hEvent, 0) == WAIT_OBJECT_0) 417 | return TRUE; 418 | } 419 | } 420 | return FALSE; 421 | } 422 | 423 | void CWebSearchEngineDlg::OnBnClickedCancel() 424 | { 425 | if (m_bThreadRunning) 426 | { 427 | m_bThreadRunning = false; 428 | VERIFY(WaitWithMessageLoop(m_hThread, INFINITE)); 429 | } 430 | CDialogEx::OnCancel(); 431 | } 432 | -------------------------------------------------------------------------------- /WebSearchEngineDlg.h: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2022-2025 Stefan-Mihai MOGA 2 | This file is part of WebSearchEngine application developed by Stefan-Mihai MOGA. 3 | 4 | WebSearchEngine is free software: you can redistribute it and/or modify it 5 | under the terms of the GNU General Public License as published by the Open 6 | Source Initiative, either version 3 of the License, or any later version. 7 | 8 | WebSearchEngine is distributed in the hope that it will be useful, but 9 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 10 | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 11 | 12 | You should have received a copy of the GNU General Public License along with 13 | WebSearchEngine. 
If not, see */ 14 | 15 | // WebSearchEngineDlg.h : header file 16 | // 17 | 18 | #pragma once 19 | 20 | #include "ODBCWrappers.h" 21 | #include "afxwin.h" 22 | #include "afxcmn.h" 23 | 24 | // CWebSearchEngineDlg dialog 25 | class CWebSearchEngineDlg : public CDialogEx 26 | { 27 | // Construction 28 | public: 29 | CWebSearchEngineDlg(CWnd* pParent = NULL); // standard constructor 30 | 31 | // Dialog Data 32 | #ifdef AFX_DESIGN_TIME 33 | enum { IDD = IDD_WEBSEARCHENGINE_DIALOG }; 34 | #endif 35 | CEdit m_pCrawling; 36 | CProgressCtrl m_pProgress; 37 | CStatic m_pWebpageCounter; 38 | CStatic m_pKeywordCounter; 39 | 40 | protected: 41 | virtual void DoDataExchange(CDataExchange* pDX); // DDX/DDV support 42 | 43 | // Implementation 44 | public: 45 | bool m_bThreadRunning; 46 | HICON m_hIcon; 47 | CODBC::CEnvironment m_pEnvironment; 48 | CODBC::CConnection m_pConnection; 49 | CODBC::String m_sConnectionOutString; 50 | TCHAR m_sConnectionInString[0x100]; 51 | DWORD m_nThreadID; 52 | HANDLE m_hThread; 53 | 54 | protected: 55 | // Generated message map functions 56 | virtual BOOL OnInitDialog(); 57 | afx_msg void OnSysCommand(UINT nID, LPARAM lParam); 58 | afx_msg void OnPaint(); 59 | afx_msg HCURSOR OnQueryDragIcon(); 60 | afx_msg void OnBnClickedCancel(); 61 | DECLARE_MESSAGE_MAP() 62 | }; 63 | -------------------------------------------------------------------------------- /WebSearchEngineExt.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2022-2025 Stefan-Mihai MOGA 2 | This file is part of WebSearchEngine application developed by Stefan-Mihai MOGA. 3 | 4 | WebSearchEngine is free software: you can redistribute it and/or modify it 5 | under the terms of the GNU General Public License as published by the Open 6 | Source Initiative, either version 3 of the License, or any later version. 
7 | 8 | WebSearchEngine is distributed in the hope that it will be useful, but 9 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 10 | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 11 | 12 | You should have received a copy of the GNU General Public License along with 13 | WebSearchEngine. If not, see */ 14 | 15 | #include "stdafx.h" 16 | #include "WebSearchEngineExt.h" 17 | #include "HtmlToText.h" 18 | #include "ODBCWrappers.h" 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | 29 | #include 30 | #pragma comment(lib, "Urlmon") 31 | 32 | #ifdef _DEBUG 33 | #define new DEBUG_NEW 34 | #endif 35 | 36 | FrontierArray gFrontierOlder; // visited URLs 37 | FrontierArray gFrontierArray; // enque/deque URLs 38 | FrontierScore gFrontierScore; // the score of each URL 39 | WebpageIndex gWebpageID; // map of each webpage ID 40 | KeywordIndex gKeywordID; // map of each keyword ID 41 | KeywordArray gWordArray; // list of all keywords 42 | 43 | std::vector gDataMiningTerms; 44 | 45 | static __int64 gCurrentWebpageID = 0; 46 | static __int64 gCurrentKeywordID = 0; 47 | 48 | #define DELIMITERS _T("\t\n\r\"\' !?#$%&|(){}[]*/+-:;<>=.,") 49 | 50 | const std::string UnquoteHTML(const std::string& InBuffer); 51 | 52 | // convert UTF-8 string to wstring 53 | std::wstring utf8_to_wstring(const std::string& str) 54 | { 55 | std::wstring_convert> myconv; 56 | return myconv.from_bytes(str); 57 | } 58 | 59 | // convert wstring to UTF-8 string 60 | std::string wstring_to_utf8(const std::wstring& str) 61 | { 62 | std::wstring_convert> myconv; 63 | return myconv.to_bytes(str); 64 | } 65 | 66 | // adds a new URL to frontier 67 | bool AddURLToFrontier(const std::string& lpszURL) 68 | { 69 | bool found = false; 70 | for (auto it = gFrontierOlder.begin(); it != gFrontierOlder.end(); it++) 71 | { 72 | if (lpszURL.compare(it->c_str()) == 0) 73 | { 74 | 
return true; // URL already visited 75 | } 76 | } 77 | for (auto it = gFrontierArray.begin(); it != gFrontierArray.end(); it++) 78 | { 79 | if (lpszURL.compare(it->c_str()) == 0) 80 | { 81 | found = true; 82 | gFrontierScore[lpszURL]++; 83 | break; 84 | } 85 | } 86 | if (!found) 87 | { 88 | gFrontierArray.push_back(lpszURL); 89 | gFrontierScore[lpszURL] = 1; 90 | } 91 | return true; 92 | } 93 | 94 | // gets the next URL from frontier 95 | bool ExtractURLFromFrontier(std::string& lpszURL) 96 | { 97 | lpszURL = ""; 98 | int score = 0; 99 | if (gFrontierArray.size() > 0) 100 | { 101 | for (std::string it : gFrontierArray) 102 | { 103 | if (lpszURL.empty()) // select the first element 104 | { 105 | lpszURL = it; 106 | score = gFrontierScore[it]; 107 | } 108 | else 109 | { 110 | if (score < gFrontierScore[it]) // update the selection if necesary 111 | { 112 | lpszURL = it; 113 | score = gFrontierScore[it]; 114 | } 115 | } 116 | } 117 | 118 | // remove selected element from frontier 119 | for (auto it = gFrontierArray.begin(); it != gFrontierArray.end(); it++) 120 | { 121 | if (lpszURL.compare(it->c_str()) == 0) 122 | { 123 | gFrontierArray.erase(it); 124 | break; 125 | } 126 | } 127 | for (auto it = gFrontierScore.begin(); it != gFrontierScore.end(); it++) 128 | { 129 | if (lpszURL.compare(it->first.c_str()) == 0) 130 | { 131 | gFrontierScore.erase(it); 132 | break; 133 | } 134 | } 135 | 136 | gFrontierOlder.push_back(lpszURL); 137 | return true; 138 | } 139 | return false; 140 | } 141 | 142 | // function to download a Web page 143 | bool DownloadURLToFile(const std::string& lpszURL, std::string& lpszFilename) 144 | { 145 | char lpszTempPath[MAX_PATH + 1] = { 0, }; 146 | char lpszTempFile[MAX_PATH + 1] = { 0, }; 147 | 148 | const DWORD dwTempPath = GetTempPathA(sizeof(lpszTempPath) - 1, lpszTempPath); 149 | if (dwTempPath > 0) 150 | { 151 | lpszTempPath[dwTempPath] = '\0'; 152 | if (GetTempFileNameA(lpszTempPath, "map", 0, lpszTempFile) != 0) 153 | { 154 | if 
(URLDownloadToFileA(NULL, lpszURL.c_str(), lpszTempFile, 0, NULL) == S_OK) 155 | { 156 | lpszFilename = lpszTempFile; 157 | return true; 158 | } 159 | } 160 | } 161 | return false; 162 | } 163 | 164 | // finds and replaces all occurences from a given string 165 | int findAndReplaceAll(std::wstring& data, std::wstring toSearch, std::wstring replaceStr) 166 | { 167 | int counter = 0; 168 | // Get the first occurrence 169 | size_t pos = data.find(toSearch); 170 | 171 | // Repeat till end is reached 172 | while (pos != std::string::npos) 173 | { 174 | counter++; 175 | // Replace this occurrence of Sub String 176 | data.replace(pos, toSearch.size(), replaceStr); 177 | // Get the next occurrence from the current position 178 | pos = data.find(toSearch, pos + replaceStr.size()); 179 | } 180 | return counter; 181 | } 182 | 183 | // trim from start (in place) 184 | static inline std::wstring ltrim(std::wstring s) { 185 | s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](int ch) { 186 | return !iswspace((wint_t)ch); 187 | })); 188 | return s; 189 | } 190 | 191 | // trim from end (in place) 192 | static inline std::wstring rtrim(std::wstring s) { 193 | s.erase(std::find_if(s.rbegin(), s.rend(), [](int ch) { 194 | return !iswspace((wint_t)ch); 195 | }).base(), s.end()); 196 | return s; 197 | } 198 | 199 | // trim from both ends (in place) 200 | static inline std::wstring trim(std::wstring s) 201 | { 202 | return ltrim(rtrim(s)); 203 | } 204 | 205 | // string to lower 206 | static inline std::wstring to_lower(std::wstring s) 207 | { 208 | std::transform(s.begin(), s.end(), s.begin(), 209 | [](wchar_t c) { return (wchar_t)std::tolower(c); }); 210 | return s; 211 | } 212 | 213 | // string to upper 214 | static inline std::wstring to_upper(std::wstring s) 215 | { 216 | std::transform(s.begin(), s.end(), s.begin(), 217 | [](wchar_t c) { return (wchar_t)std::toupper(c); }); 218 | return s; 219 | } 220 | 221 | // process HTML page and extract plain text and hyperlinks 222 | bool 
ProcessHTML(CWebSearchEngineDlg* pWebSearchEngineDlg, const std::string& lpszFilename, const std::string& lpszURL) 223 | { 224 | CString strMessage; 225 | CHtmlToText pHtmlToText; 226 | std::ifstream pHtmlFile(lpszFilename); 227 | std::string pHtmlContent; 228 | if (pHtmlFile.is_open()) 229 | { 230 | pHtmlFile.seekg(0, std::ios::end); 231 | pHtmlContent.reserve(static_cast(pHtmlFile.tellg())); 232 | pHtmlFile.seekg(0, std::ios::beg); 233 | 234 | pHtmlContent.assign((std::istreambuf_iterator(pHtmlFile)), 235 | std::istreambuf_iterator()); 236 | pHtmlFile.close(); 237 | 238 | std::wstring pTitle; 239 | std::size_t found = pHtmlContent.find("", 0); 240 | if (std::string::npos != found) 241 | { 242 | found += 7; 243 | const std::size_t last_char = pHtmlContent.find("", found); 244 | if (std::string::npos != last_char) 245 | { 246 | pTitle = trim(utf8_to_wstring(UnquoteHTML(pHtmlContent.substr(found, last_char - found)))).substr(0, 0x100 - 1); 247 | } 248 | } 249 | if (pTitle.length() == 0) 250 | return true; 251 | 252 | found = pHtmlContent.find(" 0); 292 | pPlainText = pPlainText.substr(0, 0x10000 - 1); 293 | OutputDebugString(CString(pPlainText.c_str()) + _T("\n")); 294 | 295 | SQLRETURN nRet = 0; 296 | CWebpageInsert pWebpageInsert; 297 | if (!pWebpageInsert.Execute(pWebSearchEngineDlg->m_pConnection, pURL, pTitle, pPlainText)) // add webpage to database 298 | { 299 | pWebSearchEngineDlg->m_pProgress.SetMarquee(FALSE, 30); 300 | do { 301 | ::MessageBeep(0xFFFFFFFF); 302 | nRet = pWebSearchEngineDlg->m_pConnection.Disconnect(); 303 | ::Sleep(30 * 1000); 304 | nRet = pWebSearchEngineDlg->m_pConnection.DriverConnect(const_cast(reinterpret_cast(pWebSearchEngineDlg->m_sConnectionInString)), pWebSearchEngineDlg->m_sConnectionOutString); 305 | } while (!SQL_SUCCEEDED(nRet)); 306 | if (!pWebpageInsert.Execute(pWebSearchEngineDlg->m_pConnection, pURL, pTitle, pPlainText)) 307 | { 308 | pWebSearchEngineDlg->MessageBox(_T("Cannot insert webpage into the database"), 
_T("Error"), MB_OK); 309 | return false; 310 | } 311 | pWebSearchEngineDlg->m_pProgress.SetMarquee(TRUE, 30); 312 | } 313 | gWebpageID[pURL] = ++gCurrentWebpageID; 314 | pWebSearchEngineDlg->m_pWebpageCounter.SetWindowText(std::to_wstring(gCurrentWebpageID).c_str()); 315 | 316 | const std::wstring pLowerCaseText = to_lower(pPlainText); 317 | // Skip delimiters at beginning. 318 | std::size_t lastPos = pLowerCaseText.find_first_not_of(DELIMITERS, 0); 319 | // Find first "non-delimiter". 320 | std::size_t pos = pLowerCaseText.find_first_of(DELIMITERS, lastPos); 321 | 322 | while ((std::string::npos != pos) || (std::string::npos != lastPos)) 323 | { 324 | // Found a token, add it to the vector. 325 | const std::wstring pKeyword = pLowerCaseText.substr(lastPos, pos - lastPos); 326 | // Skip delimiters. Note the "not_of" 327 | lastPos = pLowerCaseText.find_first_not_of(DELIMITERS, pos); 328 | // Find next "non-delimiter" 329 | pos = pLowerCaseText.find_first_of(DELIMITERS, lastPos); 330 | 331 | if (pKeyword.length() == 0) 332 | continue; 333 | 334 | if (pKeyword.find_first_not_of(_T("abcdefghijklmnopqrstuvwxyz")) != std::string::npos) 335 | continue; 336 | 337 | OutputDebugString(CString(pKeyword.c_str()) + _T("\n")); 338 | bool already_added = false; 339 | for (auto it = gWordArray.begin(); it != gWordArray.end(); it++) 340 | { 341 | if (pKeyword.compare(it->c_str()) == 0) 342 | { 343 | already_added = true; 344 | break; 345 | } 346 | } 347 | 348 | if (!already_added) 349 | { 350 | gWordArray.push_back(pKeyword); 351 | 352 | CKeywordInsert pKeywordInsert; 353 | if (!pKeywordInsert.Execute(pWebSearchEngineDlg->m_pConnection, pKeyword)) // add keyword to database 354 | { 355 | pWebSearchEngineDlg->m_pProgress.SetMarquee(FALSE, 30); 356 | do { 357 | ::MessageBeep(0xFFFFFFFF); 358 | nRet = pWebSearchEngineDlg->m_pConnection.Disconnect(); 359 | ::Sleep(30 * 1000); 360 | nRet = 
pWebSearchEngineDlg->m_pConnection.DriverConnect(const_cast(reinterpret_cast(pWebSearchEngineDlg->m_sConnectionInString)), pWebSearchEngineDlg->m_sConnectionOutString); 361 | } while (!SQL_SUCCEEDED(nRet)); 362 | if (!pKeywordInsert.Execute(pWebSearchEngineDlg->m_pConnection, pKeyword)) 363 | { 364 | pWebSearchEngineDlg->MessageBox(_T("Cannot insert keyword into the database"), _T("Error"), MB_OK); 365 | return false; 366 | } 367 | pWebSearchEngineDlg->m_pProgress.SetMarquee(TRUE, 30); 368 | } 369 | gKeywordID[pKeyword] = ++gCurrentKeywordID; 370 | pWebSearchEngineDlg->m_pKeywordCounter.SetWindowText(std::to_wstring(gCurrentKeywordID).c_str()); 371 | 372 | COccurrenceInsert pOccurrenceInsert; 373 | if (!pOccurrenceInsert.Execute(pWebSearchEngineDlg->m_pConnection, gCurrentWebpageID, gCurrentKeywordID, 1)) 374 | { 375 | pWebSearchEngineDlg->m_pProgress.SetMarquee(FALSE, 30); 376 | do { 377 | ::MessageBeep(0xFFFFFFFF); 378 | nRet = pWebSearchEngineDlg->m_pConnection.Disconnect(); 379 | ::Sleep(30 * 1000); 380 | nRet = pWebSearchEngineDlg->m_pConnection.DriverConnect(const_cast(reinterpret_cast(pWebSearchEngineDlg->m_sConnectionInString)), pWebSearchEngineDlg->m_sConnectionOutString); 381 | } while (!SQL_SUCCEEDED(nRet)); 382 | if (!pOccurrenceInsert.Execute(pWebSearchEngineDlg->m_pConnection, gCurrentWebpageID, gCurrentKeywordID, 1)) 383 | { 384 | pWebSearchEngineDlg->MessageBox(_T("Cannot insert occurrence into the database"), _T("Error"), MB_OK); 385 | return false; 386 | } 387 | pWebSearchEngineDlg->m_pProgress.SetMarquee(TRUE, 30); 388 | } 389 | } 390 | else 391 | { 392 | const __int64 nKeywordID = gKeywordID[pKeyword]; 393 | COccurrenceInsert pOccurrenceInsert; 394 | if (!pOccurrenceInsert.Execute(pWebSearchEngineDlg->m_pConnection, gCurrentWebpageID, nKeywordID, 1)) 395 | { 396 | COccurrenceUpdate pOccurrenceUpdate; 397 | if (!pOccurrenceUpdate.Execute(pWebSearchEngineDlg->m_pConnection, gCurrentWebpageID, nKeywordID)) 398 | { 399 | 
pWebSearchEngineDlg->m_pProgress.SetMarquee(FALSE, 30); 400 | do { 401 | ::MessageBeep(0xFFFFFFFF); 402 | nRet = pWebSearchEngineDlg->m_pConnection.Disconnect(); 403 | ::Sleep(30 * 1000); 404 | nRet = pWebSearchEngineDlg->m_pConnection.DriverConnect(const_cast(reinterpret_cast(pWebSearchEngineDlg->m_sConnectionInString)), pWebSearchEngineDlg->m_sConnectionOutString); 405 | } while (!SQL_SUCCEEDED(nRet)); 406 | if (!pOccurrenceUpdate.Execute(pWebSearchEngineDlg->m_pConnection, gCurrentWebpageID, nKeywordID)) 407 | { 408 | pWebSearchEngineDlg->MessageBox(_T("Cannot update occurrence into the database"), _T("Error"), MB_OK); 409 | return false; 410 | } 411 | pWebSearchEngineDlg->m_pProgress.SetMarquee(TRUE, 30); 412 | } 413 | } 414 | } 415 | 416 | already_added = false; 417 | for (auto it = gDataMiningTerms.begin(); it != gDataMiningTerms.end(); it++) 418 | { 419 | if (pKeyword.compare(it->c_str()) == 0) 420 | { 421 | already_added = true; 422 | break; 423 | } 424 | } 425 | if (!already_added) 426 | gDataMiningTerms.push_back(pKeyword); 427 | } 428 | 429 | if ((gCurrentWebpageID % 1000) == 0) 430 | { 431 | for (auto it = gDataMiningTerms.begin(); it != gDataMiningTerms.end(); it++) 432 | { 433 | strMessage.Format(_T("applying data mining for '%s'..."), it->c_str()); 434 | pWebSearchEngineDlg->m_pCrawling.SetWindowText(strMessage); 435 | 436 | CDataMiningUpdate pDataMiningUpdate; 437 | if (!pDataMiningUpdate.Execute(pWebSearchEngineDlg->m_pConnection, *it)) 438 | { 439 | pWebSearchEngineDlg->m_pProgress.SetMarquee(FALSE, 30); 440 | do { 441 | ::MessageBeep(0xFFFFFFFF); 442 | nRet = pWebSearchEngineDlg->m_pConnection.Disconnect(); 443 | ::Sleep(30 * 1000); 444 | nRet = pWebSearchEngineDlg->m_pConnection.DriverConnect(const_cast(reinterpret_cast(pWebSearchEngineDlg->m_sConnectionInString)), pWebSearchEngineDlg->m_sConnectionOutString); 445 | } while (!SQL_SUCCEEDED(nRet)); 446 | if (!pDataMiningUpdate.Execute(pWebSearchEngineDlg->m_pConnection, *it)) 447 | { 448 | 
pWebSearchEngineDlg->MessageBox(_T("Cannot apply data mining to the database"), _T("Error"), MB_OK); 449 | return false; 450 | } 451 | pWebSearchEngineDlg->m_pProgress.SetMarquee(TRUE, 30); 452 | } 453 | } 454 | gDataMiningTerms.clear(); 455 | } 456 | return true; 457 | } 458 | return false; 459 | } 460 | -------------------------------------------------------------------------------- /WebSearchEngineExt.h: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2022-2025 Stefan-Mihai MOGA 2 | This file is part of WebSearchEngine application developed by Stefan-Mihai MOGA. 3 | 4 | WebSearchEngine is free software: you can redistribute it and/or modify it 5 | under the terms of the GNU General Public License as published by the Open 6 | Source Initiative, either version 3 of the License, or any later version. 7 | 8 | WebSearchEngine is distributed in the hope that it will be useful, but 9 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 10 | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 11 | 12 | You should have received a copy of the GNU General Public License along with 13 | WebSearchEngine. 
If not, see */ 14 | 15 | #ifndef __WEBSEARCHENGINE_H__ 16 | #define __WEBSEARCHENGINE_H__ 17 | 18 | #pragma once 19 | 20 | #include "stdafx.h" 21 | #include "ODBCWrappers.h" 22 | #include "WebSearchEngineDlg.h" 23 | 24 | typedef std::vector FrontierArray; 25 | typedef std::map FrontierScore; 26 | typedef std::map WebpageIndex; 27 | typedef std::map KeywordIndex; 28 | typedef std::vector KeywordArray; 29 | 30 | bool AddURLToFrontier(const std::string& lpszURL); 31 | bool ExtractURLFromFrontier(std::string& lpszURL); 32 | bool DownloadURLToFile(const std::string& lpszURL, std::string& lpszFilename); 33 | bool ProcessHTML(CWebSearchEngineDlg* pWebSearchEngineDlg, const std::string& lpszFilename, const std::string& lpszURL); 34 | 35 | class CGenericStatement // execute one SQL statement; no output returned 36 | { 37 | public: 38 | // Methods 39 | bool Execute(CODBC::CConnection& pDbConnect, LPCTSTR lpszSQL) 40 | { 41 | // Create the statement object 42 | CODBC::CStatement statement; 43 | SQLRETURN nRet = statement.Create(pDbConnect); 44 | ODBC_CHECK_RETURN_FALSE(nRet, statement); 45 | 46 | // Prepare the statement 47 | #pragma warning(suppress: 26465 26490 26492) 48 | nRet = statement.Prepare(const_cast(reinterpret_cast(lpszSQL))); 49 | ODBC_CHECK_RETURN_FALSE(nRet, statement); 50 | 51 | // Execute the statement 52 | nRet = statement.Execute(); 53 | ODBC_CHECK_RETURN_FALSE(nRet, statement); 54 | return true; 55 | } 56 | }; 57 | 58 | class CWebpageInsertAccessor // sets the data for inserting one row intro WEBPAGE table 59 | { 60 | public: 61 | // Parameter values 62 | TCHAR m_lpszURL[MAX_URL_LENGTH]; 63 | TCHAR m_lpszTitle[0x100]; 64 | TCHAR m_lpszContent[0x10000]; 65 | 66 | #pragma warning(suppress: 26429) 67 | BEGIN_ODBC_PARAM_MAP(CWebpageInsertAccessor) 68 | SET_ODBC_PARAM_TYPE(SQL_PARAM_INPUT) 69 | #pragma warning(suppress: 26446 26485 26486 26489) 70 | ODBC_PARAM_ENTRY(1, m_lpszURL) 71 | ODBC_PARAM_ENTRY(2, m_lpszTitle) 72 | ODBC_PARAM_ENTRY(3, m_lpszContent) 73 | 
END_ODBC_PARAM_MAP() 74 | 75 | DEFINE_ODBC_COMMAND(CWebpageInsertAccessor, _T("INSERT INTO `webpage` (`url`, `title`, `content`) VALUES (?, ?, ?);")) 76 | 77 | // You may wish to call this function if you are inserting a record and wish to 78 | // initialize all the fields, if you are not going to explicitly set all of them. 79 | void ClearRecord() noexcept 80 | { 81 | memset(this, 0, sizeof(*this)); 82 | } 83 | }; 84 | 85 | class CWebpageInsert : public CODBC::CAccessor // execute INSERT statement for WEBPAGE table; no output returned 86 | { 87 | public: 88 | // Methods 89 | bool Execute(CODBC::CConnection& pDbConnect, const std::wstring& pURL, const std::wstring& pTitle, const std::wstring& pContent) 90 | { 91 | ClearRecord(); 92 | // Create the statement object 93 | CODBC::CStatement statement; 94 | SQLRETURN nRet = statement.Create(pDbConnect); 95 | ODBC_CHECK_RETURN_FALSE(nRet, statement); 96 | 97 | // Prepare the statement 98 | nRet = statement.Prepare(GetDefaultCommand()); 99 | ODBC_CHECK_RETURN_FALSE(nRet, statement); 100 | 101 | // Bind the parameters 102 | #pragma warning(suppress: 26485) 103 | _tcscpy_s(m_lpszURL, _countof(m_lpszURL), pURL.c_str()); 104 | _tcscpy_s(m_lpszTitle, _countof(m_lpszTitle), pTitle.c_str()); 105 | _tcscpy_s(m_lpszContent, _countof(m_lpszContent), pContent.c_str()); 106 | nRet = BindParameters(statement); 107 | ODBC_CHECK_RETURN_FALSE(nRet, statement); 108 | 109 | // Execute the statement 110 | nRet = statement.Execute(); 111 | ODBC_CHECK_RETURN_FALSE(nRet, statement); 112 | return true; 113 | } 114 | }; 115 | 116 | class CKeywordInsertAccessor // sets the data for inserting one row intro KEYWORD table 117 | { 118 | public: 119 | // Parameter values 120 | TCHAR m_lpszName[0x100]; 121 | 122 | #pragma warning(suppress: 26429) 123 | BEGIN_ODBC_PARAM_MAP(CKeywordInsertAccessor) 124 | SET_ODBC_PARAM_TYPE(SQL_PARAM_INPUT) 125 | #pragma warning(suppress: 26446 26485 26486 26489) 126 | ODBC_PARAM_ENTRY(1, m_lpszName) 127 | 
END_ODBC_PARAM_MAP() 128 | 129 | DEFINE_ODBC_COMMAND(CKeywordInsertAccessor, _T("INSERT INTO `keyword` (`name`) VALUES (?);")) 130 | 131 | // You may wish to call this function if you are inserting a record and wish to 132 | // initialize all the fields, if you are not going to explicitly set all of them. 133 | void ClearRecord() noexcept 134 | { 135 | memset(this, 0, sizeof(*this)); 136 | } 137 | }; 138 | 139 | class CKeywordInsert : public CODBC::CAccessor // execute INSERT statement for KEYWORD table; no output returned 140 | { 141 | public: 142 | // Methods 143 | bool Execute(CODBC::CConnection& pDbConnect, const std::wstring& pKeyword) 144 | { 145 | ClearRecord(); 146 | // Create the statement object 147 | CODBC::CStatement statement; 148 | SQLRETURN nRet = statement.Create(pDbConnect); 149 | ODBC_CHECK_RETURN_FALSE(nRet, statement); 150 | 151 | // Prepare the statement 152 | nRet = statement.Prepare(GetDefaultCommand()); 153 | ODBC_CHECK_RETURN_FALSE(nRet, statement); 154 | 155 | // Bind the parameters 156 | #pragma warning(suppress: 26485) 157 | _tcscpy_s(m_lpszName, _countof(m_lpszName), pKeyword.c_str()); 158 | nRet = BindParameters(statement); 159 | ODBC_CHECK_RETURN_FALSE(nRet, statement); 160 | 161 | // Execute the statement 162 | nRet = statement.Execute(); 163 | ODBC_CHECK_RETURN_FALSE(nRet, statement); 164 | return true; 165 | } 166 | }; 167 | 168 | class COccurrenceInsertAccessor // sets the data for inserting one row intro OCCURRENCE table 169 | { 170 | public: 171 | // Parameter values 172 | __int64 m_nWebpageID; 173 | __int64 m_nKeywordID; 174 | __int64 m_nCounter; 175 | double m_rPageRank; 176 | 177 | #pragma warning(suppress: 26429) 178 | BEGIN_ODBC_PARAM_MAP(COccurrenceInsertAccessor) 179 | SET_ODBC_PARAM_TYPE(SQL_PARAM_INPUT) 180 | #pragma warning(suppress: 26446 26485 26486 26489) 181 | ODBC_PARAM_ENTRY(1, m_nWebpageID) 182 | ODBC_PARAM_ENTRY(2, m_nKeywordID) 183 | ODBC_PARAM_ENTRY(3, m_nCounter) 184 | ODBC_PARAM_ENTRY(4, m_rPageRank) 185 | 
END_ODBC_PARAM_MAP() 186 | 187 | DEFINE_ODBC_COMMAND(COccurrenceInsertAccessor, _T("INSERT INTO `occurrence` (`webpage_id`, `keyword_id`, `counter`, `pagerank`) VALUES (?, ?, ?, ?);")) 188 | 189 | // You may wish to call this function if you are inserting a record and wish to 190 | // initialize all the fields, if you are not going to explicitly set all of them. 191 | void ClearRecord() noexcept 192 | { 193 | memset(this, 0, sizeof(*this)); 194 | } 195 | }; 196 | 197 | class COccurrenceInsert : public CODBC::CAccessor // execute INSERT statement for OCCURRENCE table; no output returned 198 | { 199 | public: 200 | // Methods 201 | bool Execute(CODBC::CConnection& pDbConnect, const __int64& nWebpageID, const __int64& nKeywordID, const __int64& nCounter) 202 | { 203 | ClearRecord(); 204 | // Create the statement object 205 | CODBC::CStatement statement; 206 | SQLRETURN nRet = statement.Create(pDbConnect); 207 | ODBC_CHECK_RETURN_FALSE(nRet, statement); 208 | 209 | // Prepare the statement 210 | nRet = statement.Prepare(GetDefaultCommand()); 211 | ODBC_CHECK_RETURN_FALSE(nRet, statement); 212 | 213 | // Bind the parameters 214 | #pragma warning(suppress: 26485) 215 | m_nWebpageID = nWebpageID; 216 | m_nKeywordID = nKeywordID; 217 | m_nCounter = nCounter; 218 | m_rPageRank = 0.0; 219 | nRet = BindParameters(statement); 220 | ODBC_CHECK_RETURN_FALSE(nRet, statement); 221 | 222 | // Execute the statement 223 | nRet = statement.Execute(); 224 | ODBC_CHECK_RETURN_FALSE(nRet, statement); 225 | return true; 226 | } 227 | }; 228 | 229 | class COccurrenceUpdateAccessor // sets the data for updating one row intro OCCURRENCE table 230 | { 231 | public: 232 | // Parameter values 233 | __int64 m_nWebpageID; 234 | __int64 m_nKeywordID; 235 | 236 | #pragma warning(suppress: 26429) 237 | BEGIN_ODBC_PARAM_MAP(COccurrenceUpdateAccessor) 238 | SET_ODBC_PARAM_TYPE(SQL_PARAM_INPUT) 239 | #pragma warning(suppress: 26446 26485 26486 26489) 240 | ODBC_PARAM_ENTRY(1, m_nWebpageID) 241 | 
ODBC_PARAM_ENTRY(2, m_nKeywordID) 242 | END_ODBC_PARAM_MAP() 243 | 244 | DEFINE_ODBC_COMMAND(COccurrenceUpdateAccessor, _T("UPDATE `occurrence` SET `counter` = `counter` + 1 WHERE `webpage_id` = ? AND `keyword_id` = ?;")) 245 | 246 | // You may wish to call this function if you are inserting a record and wish to 247 | // initialize all the fields, if you are not going to explicitly set all of them. 248 | void ClearRecord() noexcept 249 | { 250 | memset(this, 0, sizeof(*this)); 251 | } 252 | }; 253 | 254 | class COccurrenceUpdate : public CODBC::CAccessor // execute UPDATE statement for OCCURRENCE table; no output returned 255 | { 256 | public: 257 | // Methods 258 | bool Execute(CODBC::CConnection& pDbConnect, const __int64& nWebpageID, const __int64& nKeywordID) 259 | { 260 | ClearRecord(); 261 | // Create the statement object 262 | CODBC::CStatement statement; 263 | SQLRETURN nRet = statement.Create(pDbConnect); 264 | ODBC_CHECK_RETURN_FALSE(nRet, statement); 265 | 266 | // Prepare the statement 267 | nRet = statement.Prepare(GetDefaultCommand()); 268 | ODBC_CHECK_RETURN_FALSE(nRet, statement); 269 | 270 | // Bind the parameters 271 | #pragma warning(suppress: 26485) 272 | m_nWebpageID = nWebpageID; 273 | m_nKeywordID = nKeywordID; 274 | nRet = BindParameters(statement); 275 | ODBC_CHECK_RETURN_FALSE(nRet, statement); 276 | 277 | // Execute the statement 278 | nRet = statement.Execute(); 279 | ODBC_CHECK_RETURN_FALSE(nRet, statement); 280 | return true; 281 | } 282 | }; 283 | 284 | class CDataMiningUpdateAccessor // applying Data Mining 285 | { 286 | public: 287 | // Parameter values 288 | TCHAR m_lpszName[0x100]; 289 | 290 | #pragma warning(suppress: 26429) 291 | BEGIN_ODBC_PARAM_MAP(CDataMiningUpdateAccessor) 292 | SET_ODBC_PARAM_TYPE(SQL_PARAM_INPUT) 293 | #pragma warning(suppress: 26446 26485 26486 26489) 294 | ODBC_PARAM_ENTRY(1, m_lpszName) 295 | END_ODBC_PARAM_MAP() 296 | 297 | DEFINE_ODBC_COMMAND(CDataMiningUpdateAccessor, _T("UPDATE `occurrence` INNER 
JOIN `keyword` USING(`keyword_id`) SET `pagerank` = data_mining(`webpage_id`, `name`) WHERE `name` = ?;")) 298 | 299 | // You may wish to call this function if you are inserting a record and wish to 300 | // initialize all the fields, if you are not going to explicitly set all of them. 301 | void ClearRecord() noexcept 302 | { 303 | memset(this, 0, sizeof(*this)); 304 | } 305 | }; 306 | 307 | class CDataMiningUpdate : public CODBC::CAccessor // execute UPDATE statement for Data Mining; no output returned 308 | { 309 | public: 310 | // Methods 311 | bool Execute(CODBC::CConnection& pDbConnect, const std::wstring& pKeyword) 312 | { 313 | ClearRecord(); 314 | // Create the statement object 315 | CODBC::CStatement statement; 316 | SQLRETURN nRet = statement.Create(pDbConnect); 317 | ODBC_CHECK_RETURN_FALSE(nRet, statement); 318 | 319 | // Prepare the statement 320 | nRet = statement.Prepare(GetDefaultCommand()); 321 | ODBC_CHECK_RETURN_FALSE(nRet, statement); 322 | 323 | // Bind the parameters 324 | #pragma warning(suppress: 26485) 325 | _tcscpy_s(m_lpszName, _countof(m_lpszName), pKeyword.c_str()); 326 | nRet = BindParameters(statement); 327 | ODBC_CHECK_RETURN_FALSE(nRet, statement); 328 | 329 | // Execute the statement 330 | nRet = statement.Execute(); 331 | ODBC_CHECK_RETURN_FALSE(nRet, statement); 332 | return true; 333 | } 334 | }; 335 | 336 | #endif 337 | -------------------------------------------------------------------------------- /index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | text-mining.ro 5 | 6 | 7 | 8 | 9 | 10 | 82 | 83 | 84 | 85 |
86 |
87 |
88 | 89 |
Website developed by
Stefan-Mihai MOGA as part of his dissertation. 90 |
91 |
92 |
93 |
94 | 95 | 96 | 97 | -------------------------------------------------------------------------------- /res/WebSearchEngine.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mihaimoga/WebSearchEngine/29584b55b0e53774543e62e15eee3579498d1cb5/res/WebSearchEngine.ico -------------------------------------------------------------------------------- /res/WebSearchEngine.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mihaimoga/WebSearchEngine/29584b55b0e53774543e62e15eee3579498d1cb5/res/WebSearchEngine.png -------------------------------------------------------------------------------- /res/WebSearchEngine.rc2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mihaimoga/WebSearchEngine/29584b55b0e53774543e62e15eee3579498d1cb5/res/WebSearchEngine.rc2 -------------------------------------------------------------------------------- /search.php: -------------------------------------------------------------------------------- 1 | */ 14 | 15 | $servername = "localhost"; 16 | $username = "r46882text_engine"; 17 | $password = "TextMining2021!@#$"; 18 | $dbname = "r46882text_mining"; 19 | 20 | 21 | function content_index($content, $keyword) { 22 | $pos = -1; 23 | foreach ($keyword as $key => $value) { 24 | $tmp = stripos($content, $value); 25 | $pos = ($pos < 0) ? $tmp : min($pos, $tmp); 26 | } 27 | return $pos; 28 | } 29 | 30 | 31 | echo "\n"; 32 | echo "\n"; 33 | echo "\t\n"; 34 | echo "\t\t" . $_GET['q'] . "\n"; 35 | echo "\t\t\n"; 36 | echo "\t\t\n"; 37 | echo "\t\t\n"; 38 | echo "\t\t\n"; 39 | echo "\t\n"; 40 | echo "\t\n"; 41 | echo "\t\t
\n"; 42 | $search = strtolower($_GET['q']); 43 | $counter = 0; 44 | $mysql_clause = ""; 45 | $mysql_select = ""; 46 | // $token_find = array(); 47 | // $token_replace = array(); 48 | $token = strtok($search, "\t\n\r\"\' !?#$%&|(){}[]*/+-:;<>=.,"); 49 | while ($token !== false) { 50 | // array_push($token_find, $token); 51 | // array_push($token_replace, "" . $token . ""); 52 | if ($counter == 0) { 53 | $mysql_clause = "SELECT DISTINCT `webpage_id` FROM `occurrence` INNER JOIN `keyword` USING (`keyword_id`) WHERE `name` = '$token'"; 54 | $mysql_select = "(`name` = '$token')"; 55 | } 56 | else { 57 | $mysql_clause = "SELECT DISTINCT `webpage_id` FROM `occurrence` INNER JOIN `keyword` USING (`keyword_id`) WHERE `name` = '$token' AND `webpage_id` IN (" . $mysql_clause . ")"; 58 | $mysql_select = $mysql_select . " OR (`name` = '$token')"; 59 | } 60 | $counter++; 61 | $token = strtok("\t\n\r\"\' !?#$%&|(){}[]*/+-:;<>=.,"); 62 | }; 63 | if ($counter > 0) 64 | { 65 | // Create connection 66 | $conn = mysqli_connect($servername, $username, $password, $dbname); 67 | // Check connection 68 | if (!$conn) { 69 | die("Connection failed: " . mysqli_connect_error()); 70 | } 71 | 72 | $statement = "SELECT DISTINCT `webpage_id`, `title`, `url`, `content`, AVG(`pagerank`) AS score FROM `occurrence` INNER JOIN `webpage` USING(`webpage_id`) INNER JOIN `keyword` USING(`keyword_id`) WHERE `webpage_id` IN (" . $mysql_clause . ") AND (" . $mysql_select . ") GROUP BY `webpage_id` ORDER BY score DESC LIMIT 100;"; 73 | $result = mysqli_query($conn, $statement); 74 | if (mysqli_num_rows($result) > 0) { 75 | // output data of each row 76 | while($row = mysqli_fetch_assoc($result)) { 77 | echo "\t\t\t
" . $row["webpage_id"] . ". " . $row["title"] . " Score: " . $row["score"] . "
"; 78 | echo "" . $row["url"] . "
"; 79 | echo "" . utf8_encode(substr($row["content"], 0, 1024)) . "

\n"; 80 | // echo str_ireplace($token_find, $token_replace, substr($row["content"], content_index($row["content"], $token_find), 1024)) . "

\n"; 81 | } 82 | } else { 83 | echo "0 results"; 84 | } 85 | mysqli_close($conn); 86 | } 87 | echo "\t\t\n"; 88 | echo "\t\t\n"; 89 | echo "\t\n"; 90 | echo "\n"; 91 | ?> 92 | -------------------------------------------------------------------------------- /search.sql: -------------------------------------------------------------------------------- 1 | CREATE UNIQUE INDEX index_name ON `keyword`(`name`); 2 | 3 | DELIMITER // 4 | CREATE OR REPLACE FUNCTION no_of_words(token VARCHAR(256)) RETURNS REAL READS SQL DATA 5 | BEGIN 6 | DECLARE retVal REAL; 7 | SELECT MAX(`counter`) INTO retVal FROM `occurrence` INNER JOIN `keyword` USING(`keyword_id`) WHERE `name` = token; 8 | RETURN retVal; 9 | END// 10 | DELIMITER ; 11 | 12 | DELIMITER // 13 | CREATE OR REPLACE FUNCTION no_of_pages(token VARCHAR(256)) RETURNS REAL READS SQL DATA 14 | BEGIN 15 | DECLARE retVal REAL; 16 | SELECT COUNT(`webpage_id`) INTO retVal FROM `occurrence` INNER JOIN `keyword` USING(`keyword_id`) WHERE `name` = token; 17 | RETURN retVal; 18 | END// 19 | DELIMITER ; 20 | 21 | DELIMITER // 22 | CREATE OR REPLACE FUNCTION total_pages() RETURNS REAL READS SQL DATA 23 | BEGIN 24 | DECLARE retVal REAL; 25 | SELECT COUNT(`webpage_id`) INTO retVal FROM `webpage`; 26 | RETURN retVal; 27 | END// 28 | DELIMITER ; 29 | 30 | DELIMITER // 31 | CREATE OR REPLACE FUNCTION data_mining(webpage_no BIGINT, token VARCHAR(256)) RETURNS REAL READS SQL DATA 32 | BEGIN 33 | DECLARE retVal REAL; 34 | SELECT SUM(`counter`)/no_of_words(token)*LOG((1+total_pages())/no_of_pages(token)) INTO retVal FROM `occurrence` INNER JOIN `keyword` USING(`keyword_id`) WHERE `name` = token AND `webpage_id` = webpage_no; 35 | RETURN retVal; 36 | END// 37 | DELIMITER ; 38 | -------------------------------------------------------------------------------- /stdafx.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2022-2025 Stefan-Mihai MOGA 2 | This file is part of WebSearchEngine application 
developed by Stefan-Mihai MOGA. 3 | 4 | WebSearchEngine is free software: you can redistribute it and/or modify it 5 | under the terms of the GNU General Public License as published by the Open 6 | Source Initiative, either version 3 of the License, or any later version. 7 | 8 | WebSearchEngine is distributed in the hope that it will be useful, but 9 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 10 | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 11 | 12 | You should have received a copy of the GNU General Public License along with 13 | WebSearchEngine. If not, see <https://www.gnu.org/licenses/>. */ 14 | 15 | // stdafx.cpp : source file that includes just the standard includes 16 | // WebSearchEngine.pch will be the pre-compiled header 17 | // stdafx.obj will contain the pre-compiled type information 18 | 19 | #include "stdafx.h" 20 | 21 | 22 | -------------------------------------------------------------------------------- /stdafx.h: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2022-2025 Stefan-Mihai MOGA 2 | This file is part of WebSearchEngine application developed by Stefan-Mihai MOGA. 3 | 4 | WebSearchEngine is free software: you can redistribute it and/or modify it 5 | under the terms of the GNU General Public License as published by the Open 6 | Source Initiative, either version 3 of the License, or any later version. 7 | 8 | WebSearchEngine is distributed in the hope that it will be useful, but 9 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 10 | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 11 | 12 | You should have received a copy of the GNU General Public License along with 13 | WebSearchEngine.
If not, see */ 14 | 15 | // stdafx.h : include file for standard system include files, 16 | // or project specific include files that are used frequently, 17 | // but are changed infrequently 18 | 19 | #pragma once 20 | 21 | #ifndef VC_EXTRALEAN 22 | #define VC_EXTRALEAN // Exclude rarely-used stuff from Windows headers 23 | #endif 24 | 25 | #include "targetver.h" 26 | 27 | #define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING 28 | 29 | #define _CRT_SECURE_NO_WARNINGS 30 | 31 | #define _ATL_CSTRING_EXPLICIT_CONSTRUCTORS // some CString constructors will be explicit 32 | 33 | // turns off MFC's hiding of some common and often safely ignored warning messages 34 | #define _AFX_ALL_WARNINGS 35 | 36 | #include // MFC core and standard components 37 | #include // MFC extensions 38 | 39 | 40 | #include // MFC Automation classes 41 | 42 | #ifndef _AFX_NO_OLE_SUPPORT 43 | #include // MFC support for Internet Explorer 4 Common Controls 44 | #endif 45 | #ifndef _AFX_NO_AFXCMN_SUPPORT 46 | #include // MFC support for Windows Common Controls 47 | #endif // _AFX_NO_AFXCMN_SUPPORT 48 | 49 | #include // MFC support for ribbons and control bars 50 | 51 | #include // MFC socket extensions 52 | 53 | #ifdef _UNICODE 54 | #if defined _M_IX86 55 | #pragma comment(linker,"/manifestdependency:\"type='win32' name='Microsoft.Windows.Common-Controls' version='6.0.0.0' processorArchitecture='x86' publicKeyToken='6595b64144ccf1df' language='*'\"") 56 | #elif defined _M_X64 57 | #pragma comment(linker,"/manifestdependency:\"type='win32' name='Microsoft.Windows.Common-Controls' version='6.0.0.0' processorArchitecture='amd64' publicKeyToken='6595b64144ccf1df' language='*'\"") 58 | #else 59 | #pragma comment(linker,"/manifestdependency:\"type='win32' name='Microsoft.Windows.Common-Controls' version='6.0.0.0' processorArchitecture='*' publicKeyToken='6595b64144ccf1df' language='*'\"") 60 | #endif 61 | #endif 62 | 63 | //Pull in support for ATL 64 | #include 65 | 66 | //Pull in support for ODBC 
67 | #include 68 | #include 69 | #include 70 | 71 | //Pull in support for STL 72 | #include 73 | #include 74 | #include 75 | #include 76 | #include 77 | #include 78 | #include 79 | #include 80 | 81 | #define REGKEY_SECTION _T("Settings") 82 | #define REGKEY_DBTYPE _T("dbtype") 83 | #define REGKEY_HOSTNAME _T("hostname") 84 | #define REGKEY_HOSTPORT _T("hostport") 85 | #define REGKEY_DATABASE _T("database") 86 | #define REGKEY_FILENAME _T("filename") 87 | #define REGKEY_USERNAME _T("username") 88 | #define REGKEY_PASSWORD _T("password") 89 | 90 | #define DEFAULT_DBTYPE DB_MYSQL 91 | #define DEFAULT_HOSTNAME _T("localhost") 92 | #define DEFAULT_HOSTPORT _T("3306") /*only for MySQL*/ 93 | #define DEFAULT_DATABASE _T("TextMining") 94 | #define DEFAULT_FILENAME _T("") 95 | #define DEFAULT_USERNAME _T("root") 96 | #define DEFAULT_PASSWORD _T("") 97 | 98 | #define MAX_URL_LENGTH 0x1000 99 | 100 | //Another flavour of an ODBC_CHECK_RETURN macro 101 | #define ODBC_CHECK_RETURN_FALSE(nRet, handle) \ 102 | handle.ValidateReturnValue(nRet); \ 103 | if (!SQL_SUCCEEDED(nRet)) \ 104 | { \ 105 | return false; \ 106 | } 107 | -------------------------------------------------------------------------------- /targetver.h: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2022-2025 Stefan-Mihai MOGA 2 | This file is part of WebSearchEngine application developed by Stefan-Mihai MOGA. 3 | 4 | WebSearchEngine is free software: you can redistribute it and/or modify it 5 | under the terms of the GNU General Public License as published by the Open 6 | Source Initiative, either version 3 of the License, or any later version. 7 | 8 | WebSearchEngine is distributed in the hope that it will be useful, but 9 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 10 | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
11 | 12 | You should have received a copy of the GNU General Public License along with 13 | WebSearchEngine. If not, see <https://www.gnu.org/licenses/>. */ 14 | 15 | #pragma once 16 | 17 | // Including SDKDDKVer.h defines the highest available Windows platform. 18 | 19 | // If you wish to build your application for a previous Windows platform, include WinSDKVer.h and 20 | // set the _WIN32_WINNT macro to the platform you wish to support before including SDKDDKVer.h. 21 | 22 | #include <SDKDDKVer.h> 23 | -------------------------------------------------------------------------------- /x64/Release/WebSearchEngine.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mihaimoga/WebSearchEngine/29584b55b0e53774543e62e15eee3579498d1cb5/x64/Release/WebSearchEngine.exe --------------------------------------------------------------------------------