├── .DS_Store ├── .cproject ├── .gitattributes ├── .gitignore ├── .project ├── .settings └── org.eclipse.cdt.managedbuilder.core.prefs ├── README.md ├── TopicDetectionAndTracking.sln └── TopicDetectionAndTracking ├── DataPreprocessing ├── DataPreprocessing.cpp └── DataPreprocessing.h ├── Dataset ├── mttkn │ ├── 20030401_0113_1041_AFP_ARB.mttkn │ ├── 20030401_0130_0330_XIN_MAN.mttkn │ ├── 20030401_0300_0500_AFP_MAN.mttkn │ ├── 20030401_0530_0700_CNA_MAN.mttkn │ ├── 20030401_0600_0800_XIN_ARB.mttkn │ ├── 20030401_0730_0930_XIN_MAN.mttkn │ ├── 20030401_1000_1200_ZBN_MAN.mttkn │ ├── 20030401_1041_2343_AFP_ARB.mttkn │ ├── 20030401_1100_1230_ANN_ARB.mttkn │ ├── 20030401_1130_1330_XIN_MAN.mttkn │ ├── 20030401_1500_1700_XIN_ARB.mttkn │ ├── 20030401_1530_1730_XIN_MAN.mttkn │ ├── 20030401_1930_2130_XIN_MAN.mttkn │ ├── 20030402_0106_1145_AFP_ARB.mttkn │ ├── 20030402_0130_0330_XIN_MAN.mttkn │ ├── 20030402_0300_0500_AFP_MAN.mttkn │ ├── 20030402_0530_0700_CNA_MAN.mttkn │ ├── 20030402_0600_0800_XIN_ARB.mttkn │ ├── 20030402_0730_0930_XIN_MAN.mttkn │ └── 20030402_0830_1000_CNA_MAN.mttkn ├── mttkn_bnd │ ├── 20030401_0113_1041_AFP_ARB.mttkn_bnd │ ├── 20030401_0130_0330_XIN_MAN.mttkn_bnd │ ├── 20030401_0300_0500_AFP_MAN.mttkn_bnd │ ├── 20030401_0530_0700_CNA_MAN.mttkn_bnd │ ├── 20030401_0600_0800_XIN_ARB.mttkn_bnd │ ├── 20030401_0730_0930_XIN_MAN.mttkn_bnd │ ├── 20030401_1000_1200_ZBN_MAN.mttkn_bnd │ ├── 20030401_1041_2343_AFP_ARB.mttkn_bnd │ ├── 20030401_1100_1230_ANN_ARB.mttkn_bnd │ ├── 20030401_1130_1330_XIN_MAN.mttkn_bnd │ ├── 20030401_1500_1700_XIN_ARB.mttkn_bnd │ ├── 20030401_1530_1730_XIN_MAN.mttkn_bnd │ ├── 20030401_1930_2130_XIN_MAN.mttkn_bnd │ ├── 20030402_0106_1145_AFP_ARB.mttkn_bnd │ ├── 20030402_0130_0330_XIN_MAN.mttkn_bnd │ ├── 20030402_0300_0500_AFP_MAN.mttkn_bnd │ ├── 20030402_0530_0700_CNA_MAN.mttkn_bnd │ ├── 20030402_0600_0800_XIN_ARB.mttkn_bnd │ ├── 20030402_0730_0930_XIN_MAN.mttkn_bnd │ └── 20030402_0830_1000_CNA_MAN.mttkn_bnd └── tfidf.dat ├── Evaluation ├── 
Evaluation.cpp └── Evaluation.h ├── FirstStoryDetection ├── FirstStoryDetection.cpp └── FirstStoryDetection.h ├── Main.cpp ├── Presentation ├── Presentation.cpp └── Presentation.h ├── StoryLinkDetection ├── StoryLinkDetection.cpp └── StoryLinkDetection.h ├── StorySegmentation ├── StorySegmentation.cpp └── StorySegmentation.h ├── TopicDetection ├── TopicDetection.cpp └── TopicDetection.h ├── TopicDetectionAndTracking.vcxproj ├── TopicDetectionAndTracking.vcxproj.filters ├── TopicTracking ├── TopicTracking.cpp └── TopicTracking.h ├── Utilities ├── Story.cpp ├── Story.h ├── Utilities.cpp └── Utilities.h └── Websites ├── part1.html └── part2.html /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavierwu/TopicDetectionAndTracking/e449feda00e55fc739f3ad104dc29c5fa4089a4b/.DS_Store -------------------------------------------------------------------------------- /.cproject: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 29 | 30 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 81 | 82 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Set default behavior to automatically normalize line endings. 
3 | ############################################################################### 4 | * text=auto 5 | 6 | ############################################################################### 7 | # Set default behavior for command prompt diff. 8 | # 9 | # This is need for earlier builds of msysgit that does not have it on by 10 | # default for csharp files. 11 | # Note: This is only used by command line 12 | ############################################################################### 13 | #*.cs diff=csharp 14 | 15 | ############################################################################### 16 | # Set the merge driver for project and solution files 17 | # 18 | # Merging from the command prompt will add diff markers to the files if there 19 | # are conflicts (Merging from VS is not affected by the settings below, in VS 20 | # the diff markers are never inserted). Diff markers may cause the following 21 | # file extensions to fail to load in VS. An alternative would be to treat 22 | # these files as binary and thus will always conflict and require user 23 | # intervention with every merge. To do so, just uncomment the entries below 24 | ############################################################################### 25 | #*.sln merge=binary 26 | #*.csproj merge=binary 27 | #*.vbproj merge=binary 28 | #*.vcxproj merge=binary 29 | #*.vcproj merge=binary 30 | #*.dbproj merge=binary 31 | #*.fsproj merge=binary 32 | #*.lsproj merge=binary 33 | #*.wixproj merge=binary 34 | #*.modelproj merge=binary 35 | #*.sqlproj merge=binary 36 | #*.wwaproj merge=binary 37 | 38 | ############################################################################### 39 | # behavior for image files 40 | # 41 | # image files are treated as binary by default. 
42 | ############################################################################### 43 | #*.jpg binary 44 | #*.png binary 45 | #*.gif binary 46 | 47 | ############################################################################### 48 | # diff behavior for common document formats 49 | # 50 | # Convert binary document formats to text before diffing them. This feature 51 | # is only available from the command line. Turn it on by uncommenting the 52 | # entries below. 53 | ############################################################################### 54 | #*.doc diff=astextplain 55 | #*.DOC diff=astextplain 56 | #*.docx diff=astextplain 57 | #*.DOCX diff=astextplain 58 | #*.dot diff=astextplain 59 | #*.DOT diff=astextplain 60 | #*.pdf diff=astextplain 61 | #*.PDF diff=astextplain 62 | #*.rtf diff=astextplain 63 | #*.RTF diff=astextplain 64 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 
3 | 4 | # User-specific files 5 | *.suo 6 | *.user 7 | *.sln.docstates 8 | 9 | # Build results 10 | [Dd]ebug/ 11 | [Dd]ebugPublic/ 12 | [Rr]elease/ 13 | x64/ 14 | build/ 15 | bld/ 16 | [Bb]in/ 17 | [Oo]bj/ 18 | 19 | # Roslyn cache directories 20 | *.ide/ 21 | 22 | # MSTest test Results 23 | [Tt]est[Rr]esult*/ 24 | [Bb]uild[Ll]og.* 25 | 26 | #NUNIT 27 | *.VisualState.xml 28 | TestResult.xml 29 | 30 | # Build Results of an ATL Project 31 | [Dd]ebugPS/ 32 | [Rr]eleasePS/ 33 | dlldata.c 34 | 35 | *_i.c 36 | *_p.c 37 | *_i.h 38 | *.ilk 39 | *.meta 40 | *.obj 41 | *.pch 42 | *.pdb 43 | *.pgc 44 | *.pgd 45 | *.rsp 46 | *.sbr 47 | *.tlb 48 | *.tli 49 | *.tlh 50 | *.tmp 51 | *.tmp_proj 52 | *.log 53 | *.vspscc 54 | *.vssscc 55 | .builds 56 | *.pidb 57 | *.svclog 58 | *.scc 59 | 60 | # Chutzpah Test files 61 | _Chutzpah* 62 | 63 | # Visual C++ cache files 64 | ipch/ 65 | *.aps 66 | *.ncb 67 | *.opensdf 68 | *.sdf 69 | *.cachefile 70 | 71 | # Visual Studio profiler 72 | *.psess 73 | *.vsp 74 | *.vspx 75 | 76 | # TFS 2012 Local Workspace 77 | $tf/ 78 | 79 | # Guidance Automation Toolkit 80 | *.gpState 81 | 82 | # ReSharper is a .NET coding add-in 83 | _ReSharper*/ 84 | *.[Rr]e[Ss]harper 85 | *.DotSettings.user 86 | 87 | # JustCode is a .NET coding addin-in 88 | .JustCode 89 | 90 | # TeamCity is a build add-in 91 | _TeamCity* 92 | 93 | # DotCover is a Code Coverage Tool 94 | *.dotCover 95 | 96 | # NCrunch 97 | _NCrunch_* 98 | .*crunch*.local.xml 99 | 100 | # MightyMoose 101 | *.mm.* 102 | AutoTest.Net/ 103 | 104 | # Web workbench (sass) 105 | .sass-cache/ 106 | 107 | # Installshield output folder 108 | [Ee]xpress/ 109 | 110 | # DocProject is a documentation generator add-in 111 | DocProject/buildhelp/ 112 | DocProject/Help/*.HxT 113 | DocProject/Help/*.HxC 114 | DocProject/Help/*.hhc 115 | DocProject/Help/*.hhk 116 | DocProject/Help/*.hhp 117 | DocProject/Help/Html2 118 | DocProject/Help/html 119 | 120 | # Click-Once directory 121 | publish/ 122 | 123 | # Publish Web Output 124 
| *.[Pp]ublish.xml 125 | *.azurePubxml 126 | ## TODO: Comment the next line if you want to checkin your 127 | ## web deploy settings but do note that will include unencrypted 128 | ## passwords 129 | #*.pubxml 130 | 131 | # NuGet Packages Directory 132 | packages/* 133 | ## TODO: If the tool you use requires repositories.config 134 | ## uncomment the next line 135 | #!packages/repositories.config 136 | 137 | # Enable "build/" folder in the NuGet Packages folder since 138 | # NuGet packages use it for MSBuild targets. 139 | # This line needs to be after the ignore of the build folder 140 | # (and the packages folder if the line above has been uncommented) 141 | !packages/build/ 142 | 143 | # Windows Azure Build Output 144 | csx/ 145 | *.build.csdef 146 | 147 | # Windows Store app package directory 148 | AppPackages/ 149 | 150 | # Others 151 | sql/ 152 | *.Cache 153 | ClientBin/ 154 | [Ss]tyle[Cc]op.* 155 | ~$* 156 | *~ 157 | *.dbmdl 158 | *.dbproj.schemaview 159 | *.pfx 160 | *.publishsettings 161 | node_modules/ 162 | 163 | # RIA/Silverlight projects 164 | Generated_Code/ 165 | 166 | # Backup & report files from converting an old project file 167 | # to a newer Visual Studio version. 
Backup files are not needed, 168 | # because we have git ;-) 169 | _UpgradeReport_Files/ 170 | Backup*/ 171 | UpgradeLog*.XML 172 | UpgradeLog*.htm 173 | 174 | # SQL Server files 175 | *.mdf 176 | *.ldf 177 | 178 | # Business Intelligence projects 179 | *.rdl.data 180 | *.bim.layout 181 | *.bim_*.settings 182 | 183 | # Microsoft Fakes 184 | FakesAssemblies/ 185 | 186 | # LightSwitch generated files 187 | GeneratedArtifacts/ 188 | _Pvt_Extensions/ 189 | ModelManifest.xml -------------------------------------------------------------------------------- /.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | TDT 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.cdt.managedbuilder.core.genmakebuilder 10 | clean,full,incremental, 11 | 12 | 13 | 14 | 15 | org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder 16 | full,incremental, 17 | 18 | 19 | 20 | 21 | 22 | org.eclipse.cdt.core.cnature 23 | org.eclipse.cdt.core.ccnature 24 | org.eclipse.cdt.managedbuilder.core.managedBuildNature 25 | org.eclipse.cdt.managedbuilder.core.ScannerConfigNature 26 | 27 | 28 | -------------------------------------------------------------------------------- /.settings/org.eclipse.cdt.managedbuilder.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | environment/buildEnvironmentInclude/cdt.managedbuild.config.gnu.macosx.exe.debug.957622487/CPATH/delimiter=\: 3 | environment/buildEnvironmentInclude/cdt.managedbuild.config.gnu.macosx.exe.debug.957622487/CPATH/operation=remove 4 | environment/buildEnvironmentInclude/cdt.managedbuild.config.gnu.macosx.exe.debug.957622487/CPLUS_INCLUDE_PATH/delimiter=\: 5 | environment/buildEnvironmentInclude/cdt.managedbuild.config.gnu.macosx.exe.debug.957622487/CPLUS_INCLUDE_PATH/operation=remove 6 | environment/buildEnvironmentInclude/cdt.managedbuild.config.gnu.macosx.exe.debug.957622487/C_INCLUDE_PATH/delimiter=\: 7 | 
environment/buildEnvironmentInclude/cdt.managedbuild.config.gnu.macosx.exe.debug.957622487/C_INCLUDE_PATH/operation=remove 8 | environment/buildEnvironmentInclude/cdt.managedbuild.config.gnu.macosx.exe.debug.957622487/append=true 9 | environment/buildEnvironmentInclude/cdt.managedbuild.config.gnu.macosx.exe.debug.957622487/appendContributed=true 10 | environment/buildEnvironmentLibrary/cdt.managedbuild.config.gnu.macosx.exe.debug.957622487/LIBRARY_PATH/delimiter=\: 11 | environment/buildEnvironmentLibrary/cdt.managedbuild.config.gnu.macosx.exe.debug.957622487/LIBRARY_PATH/operation=remove 12 | environment/buildEnvironmentLibrary/cdt.managedbuild.config.gnu.macosx.exe.debug.957622487/append=true 13 | environment/buildEnvironmentLibrary/cdt.managedbuild.config.gnu.macosx.exe.debug.957622487/appendContributed=true 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # TopicDetectionAndTracking 2 | My graduation project (with three friends), dealing with tasks about TDT. 3 | 4 | ## Dataset/ 5 | ### mttkn/ 6 | Contains some TDT5 token files. 7 | ### mttkn_bnd/ 8 | Contains the corresponding boundary files. 9 | ## Main.cpp 10 | Contains main() function only. 11 | ## Utilities/ 12 | Code that is shared among all other parts, e.g., the class 'Story'. 13 | ## StorySegmentation/ 14 | If we don't have boundary files, how can we find the boundaries between two documents in a token file? 15 | ## DataPreprocessing/ 16 | Read data and do some pre-processing. 17 | ## StoryLinkDetection/ 18 | Find out the link between two stories. 19 | ## TopicDetection/ 20 | Detect some topics. 21 | ## FirstStoryDetection/ 22 | Detect the first story of a certain event. 
23 | ## TopicTracking/ 24 | ## Evaluation/ 25 | ## Presentation/ 26 | -------------------------------------------------------------------------------- /TopicDetectionAndTracking.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 2013 4 | VisualStudioVersion = 12.0.31101.0 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "TopicDetectionAndTracking", "TopicDetectionAndTracking\TopicDetectionAndTracking.vcxproj", "{AE135031-9969-4F5E-82B0-974800352F3F}" 7 | EndProject 8 | Global 9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 10 | Debug|Win32 = Debug|Win32 11 | Release|Win32 = Release|Win32 12 | EndGlobalSection 13 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 14 | {AE135031-9969-4F5E-82B0-974800352F3F}.Debug|Win32.ActiveCfg = Debug|Win32 15 | {AE135031-9969-4F5E-82B0-974800352F3F}.Debug|Win32.Build.0 = Debug|Win32 16 | {AE135031-9969-4F5E-82B0-974800352F3F}.Release|Win32.ActiveCfg = Release|Win32 17 | {AE135031-9969-4F5E-82B0-974800352F3F}.Release|Win32.Build.0 = Release|Win32 18 | EndGlobalSection 19 | GlobalSection(SolutionProperties) = preSolution 20 | HideSolutionNode = FALSE 21 | EndGlobalSection 22 | EndGlobal 23 | -------------------------------------------------------------------------------- /TopicDetectionAndTracking/DataPreprocessing/DataPreprocessing.cpp: -------------------------------------------------------------------------------- 1 | #include "DataPreprocessing.h" 2 | 3 | const int MAX_FILES = 999999; 4 | 5 | /* Set 'corpus' and 'glossary', and do some other preprocessing */ 6 | void DataPreprocessing (vector &corpus, 7 | map &glossaryIntToString, map &glossaryStringToInt, 8 | map> &storiesIndexWithCertainWord, 9 | const string tknDir, const string bndDir, 10 | const bool &isWithStemmer) 11 | { 12 | cout << "> Start DataPreprocessing......" 
<< endl; 13 | 14 | readCorpus (corpus, glossaryIntToString, glossaryStringToInt, storiesIndexWithCertainWord, tknDir, 15 | bndDir, isWithStemmer); 16 | 17 | cout << "> DataPreprocessing Done." << endl; 18 | } 19 | 20 | void readCorpus (vector &corpus, 21 | map &glossaryIntToString, map &glossaryStringToInt, 22 | map> &storiesIndexWithCertainWord, 23 | const string tknDir, const string bndDir, const bool &isWithStemmer) 24 | { 25 | cout << ">> Start reading corpus......" << endl; 26 | 27 | if (!isWithStemmer) { 28 | cout << "Please choose" << endl; 29 | cout << "1. Read from the specific file" << endl; 30 | cout << "2. Read from files in the directory" << endl; 31 | 32 | char choice; 33 | cin >> choice; 34 | 35 | while (true) { 36 | fflush (stdin); 37 | if (choice == '1') { 38 | readCorpusFromFile (corpus, glossaryIntToString, glossaryStringToInt, 39 | storiesIndexWithCertainWord, tknDir, bndDir, isWithStemmer); 40 | break; 41 | } else if (choice == '2') { 42 | readCorpusFromDirectory (corpus, glossaryIntToString, glossaryStringToInt, 43 | storiesIndexWithCertainWord, tknDir, bndDir, isWithStemmer); 44 | break; 45 | } else { 46 | cout << "Invalid input, please input again!" << endl; 47 | cin >> choice; 48 | } 49 | } 50 | 51 | } else { // TODO: (optional) add a stemmer to the readCorpus(...) ? 52 | } 53 | 54 | cout << ">> Reading corpus done." 
<< endl; 55 | } 56 | 57 | void readCorpusFromFile (vector &corpus, 58 | map &glossaryIntToString, map &glossaryStringToInt, 59 | map> &storiesIndexWithCertainWord, 60 | const string tknDir, const string bndDir, const bool &isWithStemmer) 61 | { 62 | 63 | while (true) { 64 | // the id of the first and the last words of a story 65 | vector Brecid; 66 | vector Erecid; 67 | 68 | string bndFile, tknFile; 69 | 70 | cout << "Please input the file name of bnd file" << endl; 71 | cin >> bndFile; 72 | bndFile = bndDir + bndFile; 73 | 74 | cout << "Please input the file name of tkn file" << endl; 75 | cin >> tknFile; 76 | tknFile = tknDir + tknFile; 77 | 78 | readBndFile (corpus, bndFile, Brecid, Erecid); 79 | 80 | readTknFile (corpus, tknFile, Brecid, Erecid, glossaryIntToString, glossaryStringToInt, storiesIndexWithCertainWord); 81 | 82 | cout << "Continue?(Y/N)" << endl; 83 | 84 | char choice; 85 | cin >> choice; 86 | 87 | 88 | REJUDGE: fflush (stdin); 89 | if (choice == 'Y' || choice == 'y') { 90 | continue; 91 | } else if (choice == 'N' || choice == 'n') { 92 | break; 93 | } else { 94 | cout << "Invalid input, please input again!" 
<< endl; 95 | cin >> choice; 96 | goto REJUDGE; 97 | } 98 | 99 | } 100 | } 101 | 102 | void readCorpusFromDirectory (vector &corpus, map &glossaryIntToString, 103 | map &glossaryStringToInt, 104 | map> &storiesIndexWithCertainWord, 105 | const string tknDir, const string bndDir, const bool &isWithStemmer) 106 | { 107 | _finddata_t file; 108 | long lf; 109 | 110 | // the id of the first and the last words of a story 111 | vector Brecid; 112 | vector Erecid; 113 | 114 | string bnd = bndDir.c_str (); 115 | bnd += "*.*"; 116 | string tkn = tknDir.c_str (); 117 | tkn += "*.*"; 118 | 119 | int numOfFileTobeRead = 0; 120 | int numOfFilesRead = 0; 121 | 122 | cout << "Input the number of files want to be read (0 represents all)" << endl; 123 | cin >> numOfFileTobeRead; 124 | 125 | if (numOfFileTobeRead == 0) { 126 | numOfFileTobeRead = MAX_FILES; 127 | } 128 | 129 | if ((lf = _findfirst (bnd.c_str (), &file)) == -1l) 130 | cout << "No bnd file found!" << endl; 131 | else { 132 | // the first file name found is "..", so drop it 133 | bool firsFileIsGhost = true; 134 | while (_findnext (lf, &file) == 0 && numOfFilesRead < numOfFileTobeRead) { 135 | if (!firsFileIsGhost) { 136 | cout << file.name << " found" << endl; 137 | 138 | string bndFile (file.name); 139 | bndFile = bndDir + bndFile; 140 | 141 | readBndFile (corpus, bndFile, Brecid, Erecid); 142 | numOfFilesRead++; 143 | } 144 | firsFileIsGhost = false; 145 | } 146 | } 147 | 148 | numOfFilesRead = 0; 149 | if ((lf = _findfirst (tkn.c_str (), &file)) == -1l) 150 | cout << "No tkn file found!" 
<< endl; 151 | else { 152 | bool firsFileIsGhost = true; 153 | while (_findnext (lf, &file) == 0 && numOfFilesRead < numOfFileTobeRead) { 154 | if (!firsFileIsGhost) { 155 | cout << file.name << " found" << endl; 156 | 157 | string tknFile (file.name); 158 | tknFile = tknDir + tknFile; 159 | 160 | readTknFile (corpus, tknFile, Brecid, Erecid, glossaryIntToString, glossaryStringToInt, storiesIndexWithCertainWord); 161 | numOfFilesRead++; 162 | } 163 | firsFileIsGhost = false; 164 | } 165 | } 166 | 167 | _findclose (lf); 168 | 169 | } 170 | 171 | void readBndFile (vector &corpus, const string bndFile, vector &Brecid, 172 | vector &Erecid) 173 | { 174 | int numOfStories = 0; 175 | 176 | ifstream fin (bndFile, ios::in); 177 | assert (fin.is_open ()); 178 | 179 | // the first line is title, and it is of no use, so try to work hard to be a useful man 180 | string titleUseless; 181 | getline (fin, titleUseless); 182 | 183 | // for each line, if simply use fin, we can get 5 strings: 184 | // 1. 
"", very important 189 | string boundaryUseless; 190 | while (fin >> boundaryUseless) { 191 | string timestamp, doctypeUseless; 192 | 193 | // because they are not only Brecid and Erecid, so they are called as follows 194 | char BrecidWithRedundancy[15] = { }; 195 | char ErecidWithRedundancy[15] = { }; 196 | 197 | // the follows are real Brecid and Erecid 198 | int BrecidInt, ErecidInt; 199 | 200 | fin >> timestamp >> doctypeUseless >> BrecidWithRedundancy >> ErecidWithRedundancy; 201 | 202 | // retrieve the timestamp 203 | timestamp = timestamp.substr (9, timestamp.length () - 9); 204 | 205 | // use "=" to split the string 206 | const char *split = "="; 207 | 208 | // point to the real Brecid and Erecid 209 | char *pid; 210 | 211 | pid = strtok (BrecidWithRedundancy, split); 212 | pid = strtok (NULL, split); 213 | 214 | // convert char* to int 215 | BrecidInt = atoi (pid); 216 | 217 | pid = strtok (ErecidWithRedundancy, split); 218 | pid = strtok (NULL, split); 219 | 220 | ErecidInt = atoi (pid); 221 | 222 | vector words; 223 | Story newStory (numOfStories, words, timestamp); 224 | corpus.push_back (newStory); 225 | numOfStories++; 226 | 227 | Brecid.push_back (BrecidInt); 228 | Erecid.push_back (ErecidInt); 229 | } 230 | 231 | fin.close (); 232 | 233 | cout << "read bnd file done!" << endl; 234 | } 235 | 236 | void readTknFile (vector &corpus, const string tknFile, 237 | const vector &Brecid, const vector &Erecid, 238 | map &glossaryIntToString, map &glossaryStringToInt, 239 | map> &storiesIndexWithCertainWord) 240 | { 241 | 242 | ifstream fin (tknFile, ios::in); 243 | assert (fin.is_open ()); 244 | 245 | // the first line is title, and it is of no use, so try to work hard again to be a useful man 246 | string titleUseless; 247 | getline (fin, titleUseless); 248 | 249 | // for each line, if simply use fin, we can get 4 strings: 250 | // 1. 
"> _WUseless) { 259 | string recidWithRedundancy, TrUseless, word; 260 | 261 | // this means a new tkn file is read 262 | if (Brecid[numOfStories] == 1 && beginOfAStroy){ 263 | recid = 1; 264 | beginOfAStroy = false; 265 | } 266 | 267 | if (recid > Erecid[numOfStories]) { 268 | numOfStories++; 269 | beginOfAStroy = true; 270 | } 271 | 272 | fin >> recidWithRedundancy >> TrUseless >> word; 273 | 274 | processWord (word); 275 | 276 | addWordToGlossary (word, glossaryIntToString, glossaryStringToInt); 277 | 278 | int wordID = glossaryStringToInt[word]; 279 | corpus[numOfStories].addWord (wordID); 280 | storiesIndexWithCertainWord[wordID].insert (numOfStories); 281 | 282 | recid++; 283 | } 284 | 285 | fin.close (); 286 | 287 | cout << "read tkn file done!" << endl; 288 | } 289 | 290 | void processWord (string &word) 291 | { 292 | int len = word.length (); 293 | 294 | for (int i = 0; i < len; i++) { 295 | if (word[i] >= 'A' && word[i] <= 'Z') { 296 | word[i] -= ('A' - 'a'); 297 | } else if (word[i] >= '0' && word[i] <= '9') { 298 | continue; 299 | } else if (word[i] == '.' 
|| word[i] == '-') { 300 | continue; 301 | } else if (word[i] < 'a' || word[i] > 'z') { 302 | // if you can't figure out, just give up 303 | word = word.substr (0, i) + word.substr (i + 1, word.length () - (i + 1)); 304 | i--; 305 | len--; 306 | } 307 | } 308 | } 309 | 310 | void addWordToGlossary (const string &word, map &glossaryIntToString, 311 | map &glossaryStringToInt) 312 | { 313 | if (glossaryStringToInt.find (word) == glossaryStringToInt.end ()) { 314 | int index = glossaryStringToInt.size (); 315 | glossaryStringToInt[word] = index; 316 | glossaryIntToString[index] = word; 317 | } 318 | } 319 | -------------------------------------------------------------------------------- /TopicDetectionAndTracking/DataPreprocessing/DataPreprocessing.h: -------------------------------------------------------------------------------- 1 | /* Retrieve data from existed files (tkn_file & bnd_file), and do some pre-processing. */ 2 | 3 | #ifndef DATA_PREPROCESSING_H 4 | #define DATA_PREPROCESSING_H 5 | 6 | #include "../Utilities/Story.h" 7 | 8 | /* Read from files, set the 'corpus' and 'glossary', and do some other preprocessing. 9 | Input: 'tknDir', 'bndDir', 'isWithStemmer' 10 | Output: 'corpus', 'glossaryIntToString', 'glossaryStringToInt' 11 | */ 12 | void DataPreprocessing (vector &corpus, 13 | map &glossaryIntToString, map &glossaryStringToInt, 14 | map> &storiesIndexWithCertainWord, 15 | const string tknDir, const string bndDir, 16 | const bool &isWithStemmer = false); 17 | 18 | /* Read from files, set the 'corpus' and 'glossary'. 19 | Input: 'tknDir', 'bndDir', 'isWithStemmer' 20 | Output: 'corpus', 'glossaryIntToString', 'glossaryStringToInt' 21 | */ 22 | void readCorpus (vector &corpus, 23 | map &glossaryIntToString, map &glossaryStringToInt, 24 | map> &storiesIndexWithCertainWord, 25 | const string tknDir, const string bndDir, const bool &isWithStemmer = false); 26 | 27 | /* Read from the specific file, set the 'corpus' and 'glossary'. 
28 | Input: 'tknDir', 'bndDir', 'isWithStemmer' 29 | Output: 'corpus', 'glossaryIntToString', 'glossaryStringToInt' 30 | */ 31 | void readCorpusFromFile (vector &corpus, 32 | map &glossaryIntToString, map &glossaryStringToInt, 33 | map> &storiesIndexWithCertainWord, 34 | const string tknDir, const string bndDir, const bool &isWithStemmer = false); 35 | 36 | /* Read from files in the directory, set the 'corpus' and 'glossary'. 37 | Input: 'tknDir', 'bndDir', 'isWithStemmer' 38 | Output: 'corpus', 'glossaryIntToString', 'glossaryStringToInt' 39 | */ 40 | void readCorpusFromDirectory (vector &corpus, map &glossaryIntToString, 41 | map &glossaryStringToInt, 42 | map> &storiesIndexWithCertainWord, 43 | const string tknDir, const string bndDir, 44 | const bool &isWithStemmer = false); 45 | 46 | 47 | /* Read from a bnd file to get the beginning and the end of each story. 48 | Input: 'bnd_file' 49 | Output: 'corpus', 'Brecid', 'Erecid' */ 50 | void readBndFile (vector &corpus, const string bndFile, 51 | vector &Brecid, vector &Erecid); 52 | 53 | /* Read from a tkn file, get the words for each story and set the glossary. 54 | Input: 'tkn_file', 'Brecid', 'Erecid' 55 | Output: 'corpus', 'glossaryIntToString', 'glossaryStringToInt' */ 56 | void readTknFile (vector &corpus, const string tknFile, 57 | const vector &Brecid, const vector &Erecid, 58 | map &glossaryIntToString, map &glossaryStringToInt, 59 | map> &storiesIndexWithCertainWord); 60 | 61 | /* Process the word: remove punctuation and convert all the letters to lowercase 62 | Input: 'word' 63 | Output: 'word' */ 64 | void processWord (string &word); 65 | 66 | /* Add the word to the glossary if the word does not exist 67 | Input: 'word' 68 | Output: 'glossaryIntToString', 'glossaryStringToInt' */ 69 | void addWordToGlossary (const string &word, map &glossaryIntToString, 70 | map &glossaryStringToInt); 71 | 72 | #endif/* Retrieve data from existing files (tkn_file & bnd_file), and do some pre-processing. 
*/ -------------------------------------------------------------------------------- /TopicDetectionAndTracking/Dataset/mttkn_bnd/20030401_0113_1041_AFP_ARB.mttkn_bnd: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | -------------------------------------------------------------------------------- /TopicDetectionAndTracking/Dataset/mttkn_bnd/20030401_0130_0330_XIN_MAN.mttkn_bnd: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /TopicDetectionAndTracking/Dataset/mttkn_bnd/20030401_0300_0500_AFP_MAN.mttkn_bnd: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /TopicDetectionAndTracking/Dataset/mttkn_bnd/20030401_0530_0700_CNA_MAN.mttkn_bnd: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /TopicDetectionAndTracking/Dataset/mttkn_bnd/20030401_0600_0800_XIN_ARB.mttkn_bnd: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | -------------------------------------------------------------------------------- /TopicDetectionAndTracking/Dataset/mttkn_bnd/20030401_0730_0930_XIN_MAN.mttkn_bnd: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /TopicDetectionAndTracking/Dataset/mttkn_bnd/20030401_1000_1200_ZBN_MAN.mttkn_bnd: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 
-------------------------------------------------------------------------------- /TopicDetectionAndTracking/Dataset/mttkn_bnd/20030401_1041_2343_AFP_ARB.mttkn_bnd: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | -------------------------------------------------------------------------------- /TopicDetectionAndTracking/Dataset/mttkn_bnd/20030401_1100_1230_ANN_ARB.mttkn_bnd: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /TopicDetectionAndTracking/Dataset/mttkn_bnd/20030401_1130_1330_XIN_MAN.mttkn_bnd: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 
32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | -------------------------------------------------------------------------------- /TopicDetectionAndTracking/Dataset/mttkn_bnd/20030401_1500_1700_XIN_ARB.mttkn_bnd: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | -------------------------------------------------------------------------------- /TopicDetectionAndTracking/Dataset/mttkn_bnd/20030401_1530_1730_XIN_MAN.mttkn_bnd: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | -------------------------------------------------------------------------------- /TopicDetectionAndTracking/Dataset/mttkn_bnd/20030401_1930_2130_XIN_MAN.mttkn_bnd: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 
23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | -------------------------------------------------------------------------------- /TopicDetectionAndTracking/Dataset/mttkn_bnd/20030402_0106_1145_AFP_ARB.mttkn_bnd: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | -------------------------------------------------------------------------------- /TopicDetectionAndTracking/Dataset/mttkn_bnd/20030402_0130_0330_XIN_MAN.mttkn_bnd: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /TopicDetectionAndTracking/Dataset/mttkn_bnd/20030402_0300_0500_AFP_MAN.mttkn_bnd: -------------------------------------------------------------------------------- 1 
| 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /TopicDetectionAndTracking/Dataset/mttkn_bnd/20030402_0530_0700_CNA_MAN.mttkn_bnd: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /TopicDetectionAndTracking/Dataset/mttkn_bnd/20030402_0600_0800_XIN_ARB.mttkn_bnd: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | -------------------------------------------------------------------------------- /TopicDetectionAndTracking/Dataset/mttkn_bnd/20030402_0730_0930_XIN_MAN.mttkn_bnd: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 
-------------------------------------------------------------------------------- /TopicDetectionAndTracking/Dataset/mttkn_bnd/20030402_0830_1000_CNA_MAN.mttkn_bnd: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /TopicDetectionAndTracking/Evaluation/Evaluation.cpp: -------------------------------------------------------------------------------- 1 | #include "Evaluation.h" 2 | 3 | void Evaluation(const vector &firstStories, const vector &corpus) 4 | { 5 | //TODO: evaluation 6 | } -------------------------------------------------------------------------------- /TopicDetectionAndTracking/Evaluation/Evaluation.h: -------------------------------------------------------------------------------- 1 | #ifndef EVALUATION_H 2 | #define EVALUATION_H 3 | 4 | #include "../Utilities/Story.h" 5 | 6 | void Evaluation(const vector &firstStories, const vector &corpus); 7 | 8 | #endif -------------------------------------------------------------------------------- /TopicDetectionAndTracking/FirstStoryDetection/FirstStoryDetection.cpp: -------------------------------------------------------------------------------- 1 | #include "FirstStoryDetection.h" 2 | 3 | void FirstStoryDetection (vector &firstStories, const vector &corpus, 4 | const int &numOfTopics) 5 | { 6 | cout << "> Start FirstStoryDetection......" 
<< endl; 7 | for (int curTopic = 0; curTopic != numOfTopics; ++curTopic) { 8 | const Story *firstStoryOfCurTopic = nullptr; 9 | for (const Story &curStory : corpus) { 10 | if (curStory.getTopicID () == curTopic) { 11 | if (firstStoryOfCurTopic == nullptr) 12 | firstStoryOfCurTopic = new Story(curStory); 13 | else if (curStory.getTimeStamp () < firstStoryOfCurTopic->getTimeStamp ()) 14 | firstStoryOfCurTopic = new Story (curStory); 15 | } 16 | } 17 | assert (firstStoryOfCurTopic != nullptr); 18 | firstStories.push_back (*firstStoryOfCurTopic); 19 | } 20 | cout << "> FirstStoryDetection Done." << endl; 21 | } -------------------------------------------------------------------------------- /TopicDetectionAndTracking/FirstStoryDetection/FirstStoryDetection.h: -------------------------------------------------------------------------------- 1 | /* Detect the first story of a certain event/activity/topic. */ 2 | 3 | #ifndef FIRST_STORY_DETECTION_H 4 | #define FIRST_STORY_DETECTION_H 5 | 6 | #include "../Utilities/Story.h" 7 | 8 | /* Detect the first story of every topic. 9 | Input: 'corpus', and the number of topics 10 | Output: 'firstStories', it's ordered by topicId. 
*/ 11 | void FirstStoryDetection(vector &firstStories, const vector &corpus, 12 | const int &numOfTopics); 13 | 14 | #endif -------------------------------------------------------------------------------- /TopicDetectionAndTracking/Main.cpp: -------------------------------------------------------------------------------- 1 | /* Author: Zewei Wu, Zitong Wang, Zhaoqi Wang, Peng Kang */ 2 | 3 | #include "DataPreprocessing/DataPreprocessing.h" 4 | #include "StoryLinkDetection/StoryLinkDetection.h" 5 | #include "TopicDetection/TopicDetection.h" 6 | #include "FirstStoryDetection/FirstStoryDetection.h" 7 | #include "Evaluation/Evaluation.h" 8 | #include "Presentation/Presentation.h" 9 | 10 | int main (int argc, char **argv) 11 | { 12 | vector corpus; 13 | vector firstStories; 14 | /* */ 15 | map glossaryIntToString; 16 | /* */ 17 | map glossaryStringToInt; 18 | /* */ 19 | map> storiesIndexWithCertainWord; 20 | 21 | // string tkn_file = "Dataset/mttkn/20030402_0530_0700_CNA_MAN.mttkn"; 22 | // string bnd_file = "Dataset/mttkn_bnd/20030402_0530_0700_CNA_MAN.mttkn_bnd"; 23 | string tknDir = "Dataset/mttkn/"; 24 | string bndDir = "Dataset/mttkn_bnd/"; 25 | 26 | DataPreprocessing (corpus, glossaryIntToString, glossaryStringToInt, storiesIndexWithCertainWord, 27 | tknDir, bndDir); 28 | 29 | cout << "corpus.size() = " << corpus.size () << endl; 30 | assert (corpus.size () > 0); 31 | cout << "glossaryIntToString.size () = " << glossaryIntToString.size () << endl; 32 | assert (glossaryIntToString.size () > 0); 33 | cout << "glossaryStringToInt.size () = " << glossaryStringToInt.size () << endl; 34 | assert (glossaryStringToInt.size () > 0); 35 | assert (glossaryIntToString.size () == glossaryStringToInt.size ()); 36 | cout << "storiesIndexWithCertainWord.size () = " << storiesIndexWithCertainWord.size () << endl; 37 | assert (storiesIndexWithCertainWord.size () > 0); 38 | 39 | StoryLinkDetection (corpus, storiesIndexWithCertainWord); 40 | 41 | int numOfTopics = 0; 42 | 
TopicDetection (corpus, numOfTopics); 43 | 44 | cout << "numOfTopics = " << numOfTopics << endl; 45 | assert (numOfTopics > 0); 46 | 47 | FirstStoryDetection (firstStories, corpus, numOfTopics); 48 | 49 | cout << "firstStories.size() = " << firstStories.size () << endl; 50 | assert (firstStories.size () == numOfTopics); 51 | 52 | Presentation (firstStories, corpus, glossaryIntToString, numOfTopics); 53 | 54 | return 0; 55 | } 56 | -------------------------------------------------------------------------------- /TopicDetectionAndTracking/Presentation/Presentation.cpp: -------------------------------------------------------------------------------- 1 | #include "Presentation.h" 2 | 3 | void Presentation (const vector &firstStories, const vector &corpus, 4 | const map &glossary, const int &numOfTopics) 5 | { 6 | cout << "> Start presenting......" << endl; 7 | printClusters (corpus, glossary, numOfTopics, false); 8 | cout << "> Presenting done. " << endl; 9 | } 10 | 11 | void printFirstStories (const vector &firstStories, const map &glossary, 12 | const bool &isPrintItAll) 13 | { 14 | for (const Story &curStory : firstStories) { 15 | cout << curStory.getTopicID () << endl; 16 | if (isPrintItAll) 17 | cout << curStory.toString (glossary) << endl; 18 | else 19 | cout << curStory.getTimeStamp () << endl; 20 | } 21 | } 22 | 23 | void printClusters (const vector &corpus, const map &glossary, 24 | const int &numOfTopics, const bool &isPrintItAll) 25 | { 26 | for (int curTopic = 0; curTopic < numOfTopics; ++curTopic) { 27 | cout << curTopic << endl; 28 | for (const Story &curStory : corpus) { 29 | if (curStory.getTopicID () == curTopic) { 30 | if (isPrintItAll) 31 | cout << curStory.toString (glossary) << endl; 32 | else 33 | cout << curStory.getTimeStamp () << endl; 34 | } 35 | } 36 | } 37 | } -------------------------------------------------------------------------------- /TopicDetectionAndTracking/Presentation/Presentation.h: 
-------------------------------------------------------------------------------- 1 | #ifndef PRESENTATION_H 2 | #define PRESENTATION_H 3 | 4 | #include "../Utilities/Story.h" 5 | 6 | /* Simply print the firstStories for now. */ 7 | void Presentation (const vector &firstStories, const vector &corpus, 8 | const map &glossary, const int &numOfTopics); 9 | 10 | /* Print the first stories of all clusters. 11 | If isPrintItAll==true, print the timestamp and words. 12 | Otherwise print the timestamp only. */ 13 | void printFirstStories (const vector &firstStories, const map &glossary, 14 | const bool &isPrintItAll = false); 15 | 16 | /* Print the clusters. 17 | If isPrintItAll==true, print the timestamp and words. 18 | Otherwise print the timestamp only. */ 19 | void printClusters (const vector &corpus, const map &glossary, 20 | const int &numOfTopics, const bool &isPrintItAll = false); 21 | 22 | #endif -------------------------------------------------------------------------------- /TopicDetectionAndTracking/StoryLinkDetection/StoryLinkDetection.cpp: -------------------------------------------------------------------------------- 1 | #include "StoryLinkDetection.h" 2 | 3 | /* Preparing for the similarity calculation, e.g., calculating tfidf's. */ 4 | void StoryLinkDetection (vector &corpus, const map> &storiesIndexWithCertainWord) 5 | { 6 | cout << "> Start StoryLinkDetection......" << endl; 7 | 8 | prepareTFIDF (corpus, storiesIndexWithCertainWord); 9 | 10 | cout << "> StoryLinkDetection done. " << endl; 11 | } 12 | 13 | void prepareTFIDF (vector &corpus, const map> &storiesIndexWithCertainWord) 14 | { 15 | cout << ">> Start prepareTFIDF......" << endl; 16 | 17 | /* Calculate the tfidf, and save it. */ 18 | Story::setTFIDFOfCorpus (corpus, storiesIndexWithCertainWord); 19 | Story::saveTFIDF (corpus, "Dataset/tfidf.dat"); 20 | 21 | /* Load the tfidf from file, pls make sure the file exist. 
*/ 22 | // Story::loadTFIDF(corpus, "Dataset/tfidf.dat"); 23 | 24 | cout << ">> prepareTFIDF done. " << endl; 25 | } 26 | 27 | bool isTwoStoriesSimilar (const Story &story1, const Story &story2, double threshold) 28 | { 29 | double similarity = getSimilarity (story1, story2); 30 | return similarity >= threshold; 31 | } 32 | 33 | double getSimilarity (const Story &story1, const Story &story2) 34 | { 35 | // TODO: what about other similarity measures? 36 | return getCosineSimilarity (story1, story2); 37 | } 38 | 39 | double getCosineSimilarity (const Story &story1, const Story &story2) 40 | { 41 | double similarity = 0.0; 42 | double innerProduct = 0.0; 43 | double squareSum1 = 0.0; 44 | double squareSum2 = 0.0; 45 | 46 | map tfidf1; 47 | map tfidf2; 48 | story1.getTFIDF (tfidf1); 49 | story2.getTFIDF (tfidf2); 50 | 51 | for (map::const_iterator citer = tfidf1.cbegin (); 52 | citer != tfidf1.cend (); ++citer) 53 | if (tfidf2.find (citer->first) != tfidf2.cend ()) { 54 | innerProduct += citer->second * tfidf2[citer->first]; 55 | squareSum1 += (citer->second) *(citer->second); 56 | squareSum2 += tfidf2[citer->first] * tfidf2[citer->first]; 57 | } 58 | 59 | similarity = innerProduct / sqrt (squareSum1 * squareSum2); 60 | 61 | return similarity; 62 | } 63 | 64 | -------------------------------------------------------------------------------- /TopicDetectionAndTracking/StoryLinkDetection/StoryLinkDetection.h: -------------------------------------------------------------------------------- 1 | /* Determine whether or not two stories discuss the same topic. */ 2 | 3 | #ifndef STORY_LINK_DETECTION_H 4 | #define STORY_LINK_DETECTION_H 5 | 6 | #include "../Utilities/Story.h" 7 | 8 | /* Preparing for the similarity calculation, e.g., calculating tfidf's. 
9 | Input: 'corpus', storiesIndexWithCertainWord 10 | */ 11 | void StoryLinkDetection (vector &corpus, const map> &storiesIndexWithCertainWord); 12 | 13 | /* Calculating tfidf's of stories in corpus */ 14 | void prepareTFIDF (vector &corpus, const map> &storiesIndexWithCertainWord); 15 | 16 | /* Given two stories, return whether or not these two stories discuss the same topic. 17 | ACQUIRED: invoke StoryLinkDetection() before this function. 18 | Input: 'story1', 'story2', 'threshold' 19 | Output: return true/false */ 20 | bool isTwoStoriesSimilar(const Story &story1, const Story &story2, double threshold); 21 | 22 | /* Return the similarities between any two stories. 23 | REQUIRE: call setTFIDFBasedOnCorpus() before using the stories as parameters. 24 | NOTE: we simply use getCosineSimilarity() for now. 25 | Input: 'story1', 'story2' 26 | Output: similarity */ 27 | double getSimilarity(const Story &story1, const Story &story2); 28 | 29 | /* Before using this function, make sure that each story contains non-empty tfidf. */ 30 | double getCosineSimilarity (const Story &story1, const Story &story2); 31 | 32 | 33 | 34 | #endif -------------------------------------------------------------------------------- /TopicDetectionAndTracking/StorySegmentation/StorySegmentation.cpp: -------------------------------------------------------------------------------- 1 | #include "StorySegmentation.h" 2 | 3 | void StorySegmentation(const string &tkn_file, string &bnd_file) 4 | { 5 | } -------------------------------------------------------------------------------- /TopicDetectionAndTracking/StorySegmentation/StorySegmentation.h: -------------------------------------------------------------------------------- 1 | /* Take a show of news and to detect the boundaries between stories automatically. */ 2 | 3 | #ifndef STORY_SEGMENTATION_H 4 | #define STORY_SEGMENTATION_H 5 | 6 | #include "../Utilities/Story.h" 7 | 8 | /* Detect the boundaries between stories. 
OPTIONAL: we have both of these two files already. 9 | input: 'tkn_file' (e.g., "*.mttkn") 10 | output: 'bnd_file' */ 11 | void StorySegmentation(const string &tkn_file, string &bnd_file); 12 | 13 | #endif -------------------------------------------------------------------------------- /TopicDetectionAndTracking/TopicDetection/TopicDetection.cpp: -------------------------------------------------------------------------------- 1 | #include "TopicDetection.h" 2 | 3 | void TopicDetection (vector &corpus, int &numOfTopics) 4 | { 5 | cout << "> Start TopicDetection......" << endl; 6 | 7 | numOfTopics = 6; 8 | KMeans (corpus, numOfTopics); 9 | //NaiveClustering (corpus, numOfTopics, 0.5); 10 | 11 | cout << "> TopicDetection Done." << endl; 12 | } 13 | 14 | void NaiveClustering (vector &corpus, int &numOfTopics, const double &threshold) 15 | { 16 | cout << ">> Start NaiveClustering......" << endl; 17 | 18 | numOfTopics = 0; 19 | 20 | for (unsigned i = 0; i < corpus.size (); ++i) { 21 | if (i % 10 == 0) 22 | cout << i << " / " << corpus.size () << endl; 23 | for (unsigned j = i + 1; j < corpus.size (); ++j) { 24 | if (isTwoStoriesSimilar (corpus[i], corpus[j], threshold)) { 25 | if (!corpus[i].isClustered () && !corpus[j].isClustered ()) { 26 | corpus[i].setTopicID (numOfTopics); 27 | corpus[j].setTopicID (numOfTopics); 28 | ++numOfTopics; 29 | } else if (!corpus[i].isClustered () && corpus[j].isClustered ()) { 30 | corpus[i].setTopicID (corpus[j].getTopicID ()); 31 | } else if (corpus[i].isClustered () && !corpus[j].isClustered ()) { 32 | corpus[j].setTopicID (corpus[i].getTopicID ()); 33 | } 34 | } 35 | } 36 | } 37 | 38 | for (Story &story : corpus) 39 | if (!story.isClustered ()) 40 | story.setTopicID (numOfTopics++); 41 | 42 | cout << ">> NaiveClustering done." 
<< endl; 43 | } 44 | 45 | void KMeans (vector &corpus, int &numOfTopics) 46 | { 47 | // vector< map > means; 48 | vector means; // centers 49 | 50 | initMeans (means, corpus, numOfTopics); 51 | 52 | int loopCnt = 10; // Kmeans loop execution counter, set to 10 temporarily 53 | 54 | while (loopCnt) { 55 | // label clusters 56 | for (unsigned int i = 0; i < corpus.size (); i++) { 57 | cluster (corpus[i], means, numOfTopics); 58 | } 59 | 60 | // Calculate centers 61 | for (int i = 0; i < numOfTopics; i++) { 62 | means[i] = getMean (corpus, i); 63 | } 64 | 65 | loopCnt--; 66 | } 67 | 68 | } 69 | 70 | void initMeans (vector &means, const vector &corpus, const int &numOfTopics) 71 | { 72 | 73 | for (int i = 0; i < numOfTopics; i++) { 74 | // map tfidf; 75 | // corpus[i].(tfidf); 76 | assert (i >= 0 && i < numOfTopics); 77 | means.push_back (corpus[i]); // set initial centers to several stories, for temporary. 78 | } 79 | } 80 | 81 | Story getMean (const vector &corpus, const int &topicID) 82 | { 83 | // map mean; 84 | Story mean; 85 | int storyNumOfTopic = 0; 86 | 87 | // Add all TF-IDF value to mean 88 | for (unsigned int i = 0; i < corpus.size (); i++) { 89 | if (corpus[i].getTopicID () == topicID) { 90 | storyNumOfTopic++; 91 | 92 | map tfidf; 93 | corpus[i].getTFIDF (tfidf); 94 | 95 | for (map::const_iterator it = tfidf.cbegin (); it != tfidf.cend (); it++) { 96 | map tfidfOfMean; 97 | mean.getTFIDF (tfidfOfMean); 98 | if (tfidfOfMean.find (it->first) != tfidfOfMean.cend ()) { 99 | tfidfOfMean[it->first] += it->second; 100 | } else { 101 | tfidfOfMean.insert (std::pair (it->first, it->second)); 102 | } 103 | mean.setTFIDF (tfidfOfMean); 104 | } 105 | } 106 | } 107 | 108 | map tfidfOfMean; 109 | mean.getTFIDF (tfidfOfMean); 110 | for (map::iterator it = tfidfOfMean.begin (); it != tfidfOfMean.end (); it++) { 111 | it->second /= storyNumOfTopic; 112 | } 113 | mean.setTFIDF (tfidfOfMean); 114 | 115 | return mean; 116 | } 117 | 118 | void cluster (Story &story, const 
vector &means, const int &numOfTopics) 119 | { 120 | double maxSimilarity = 0; 121 | 122 | for (int i = 0; i < numOfTopics; i++) { 123 | double similarity = getSimilarity (story, means[i]); 124 | //cout << "sim of " << i << " = " << similarity << endl; 125 | if (similarity > maxSimilarity) { 126 | maxSimilarity = similarity; 127 | story.setTopicID (i); 128 | } 129 | } 130 | } 131 | -------------------------------------------------------------------------------- /TopicDetectionAndTracking/TopicDetection/TopicDetection.h: -------------------------------------------------------------------------------- 1 | /* To cluster stories on the same topic into bins, 2 | and the creation of bins is an unsupervised task. */ 3 | 4 | #ifndef TOPIC_DETECTION_H 5 | #define TOPIC_DETECTION_H 6 | 7 | #include "../StoryLinkDetection/StoryLinkDetection.h" 8 | 9 | /* Cluster stories into topics. 10 | Input: corpus 11 | Output: corpus, with 'topicID' set. */ 12 | void TopicDetection (vector &corpus, int &numOfTopics); 13 | 14 | /* Naive clustering, simply cluster two stories to be of same topic, if the similarity is above threshold. 15 | Input: corpus, threshold 16 | Output: numOfTopics, corpus[].topicID 17 | */ 18 | void NaiveClustering (vector &corpus, int &numOfTopics, const double &threshold); 19 | 20 | /* 21 | * K-means, cluster stories into several topics. 22 | * Input: corpus, numOfTopics 23 | * Output: clustered corpus 24 | */ 25 | void KMeans(vector &corpus, int &numOfTopics); 26 | 27 | /* 28 | * initialize means. 29 | * Input: means, (with corpus and numOfTopics as parameters) 30 | * Output: initialized means 31 | */ 32 | void initMeans(vector &means, const vector &corpus, const int &numOfTopics); 33 | 34 | /* 35 | * Calculate mean of a cluster. 36 | * Input: corpus, topicID 37 | * Output: mean of this topic 38 | */ 39 | Story getMean(const vector &corpus, const int &topicID); 40 | 41 | /* 42 | * Set Story a certain topicID. 
43 | * Input: story, means 44 | * Output: story with a certain topicID 45 | */ 46 | void cluster(Story &story, const vector &means, const int &numOfTopicss); 47 | 48 | #endif 49 | -------------------------------------------------------------------------------- /TopicDetectionAndTracking/TopicDetectionAndTracking.vcxproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Release 10 | Win32 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | {AE135031-9969-4F5E-82B0-974800352F3F} 85 | Win32Proj 86 | TopicDetectionAndTracking 87 | 88 | 89 | 90 | Application 91 | true 92 | v120 93 | Unicode 94 | 95 | 96 | Application 97 | false 98 | v120 99 | true 100 | Unicode 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | true 114 | 115 | 116 | false 117 | 118 | 119 | 120 | 121 | 122 | Level3 123 | Disabled 124 | WIN32;_DEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions) 125 | true 126 | 127 | 128 | Console 129 | true 130 | 131 | 132 | 133 | 134 | Level3 135 | 136 | 137 | MaxSpeed 138 | true 139 | true 140 | WIN32;NDEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions) 141 | true 142 | 143 | 144 | Console 145 | true 146 | true 147 | true 148 | 149 | 150 | 151 | 152 | 153 | -------------------------------------------------------------------------------- /TopicDetectionAndTracking/TopicDetectionAndTracking.vcxproj.filters: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | {75752864-2e0e-460a-a006-9c0b443c71d7} 6 | 7 | 8 | {42e02597-8f51-44fe-b40b-33cb85c1aeaf} 9 | 10 | 11 | {6fbd771c-0bad-4b57-ade3-4e7460bd158f} 12 | 13 | 14 | 
{6d4f27ad-fa8e-4b9e-a7e6-b8478a3b848b} 15 | 16 | 17 | {9ccaca02-3282-4b2c-b570-48b3e5c41d01} 18 | 19 | 20 | {03dd74de-969a-4a82-b0f3-a7afc3b76144} 21 | 22 | 23 | {08c794f5-a96b-44b5-bd88-6c3f13f4ddee} 24 | 25 | 26 | {c224464d-2218-4ec0-8e95-e2246c9a4793} 27 | 28 | 29 | {21d15f13-f9bd-4737-a08d-7ae72ed78b83} 30 | 31 | 32 | {35fb326b-77f3-433a-ba56-1dd072240dc5} 33 | 34 | 35 | {abe43947-a059-4b3f-aba9-a7c654140a8b} 36 | 37 | 38 | {d9b592ef-d0f1-4adf-a197-42af1df82f2c} 39 | 40 | 41 | {fd7347fb-b2ac-4929-9348-674e70bc3f41} 42 | 43 | 44 | 45 | 46 | DataPreprocessing 47 | 48 | 49 | TopicDetection 50 | 51 | 52 | StoryLinkDetection 53 | 54 | 55 | Utilities 56 | 57 | 58 | TopicTracking 59 | 60 | 61 | 62 | FirstStoryDetection 63 | 64 | 65 | Evaluation 66 | 67 | 68 | StorySegmentation 69 | 70 | 71 | Presentation 72 | 73 | 74 | Utilities 75 | 76 | 77 | 78 | 79 | DataPreprocessing 80 | 81 | 82 | TopicDetection 83 | 84 | 85 | StoryLinkDetection 86 | 87 | 88 | Utilities 89 | 90 | 91 | Utilities 92 | 93 | 94 | TopicTracking 95 | 96 | 97 | FirstStoryDetection 98 | 99 | 100 | Evaluation 101 | 102 | 103 | StorySegmentation 104 | 105 | 106 | Presentation 107 | 108 | 109 | 110 | 111 | Dataset\mttkn 112 | 113 | 114 | Dataset\mttkn_bnd 115 | 116 | 117 | Dataset 118 | 119 | 120 | Dataset\mttkn 121 | 122 | 123 | Dataset\mttkn 124 | 125 | 126 | Dataset\mttkn 127 | 128 | 129 | Dataset\mttkn 130 | 131 | 132 | Dataset\mttkn 133 | 134 | 135 | Dataset\mttkn 136 | 137 | 138 | Dataset\mttkn 139 | 140 | 141 | Dataset\mttkn 142 | 143 | 144 | Dataset\mttkn 145 | 146 | 147 | Dataset\mttkn 148 | 149 | 150 | Dataset\mttkn 151 | 152 | 153 | Dataset\mttkn 154 | 155 | 156 | Dataset\mttkn 157 | 158 | 159 | Dataset\mttkn 160 | 161 | 162 | Dataset\mttkn 163 | 164 | 165 | Dataset\mttkn 166 | 167 | 168 | Dataset\mttkn 169 | 170 | 171 | Dataset\mttkn 172 | 173 | 174 | Dataset\mttkn 175 | 176 | 177 | Dataset\mttkn_bnd 178 | 179 | 180 | Dataset\mttkn_bnd 181 | 182 | 183 | Dataset\mttkn_bnd 184 | 185 | 186 | 
Dataset\mttkn_bnd 187 | 188 | 189 | Dataset\mttkn_bnd 190 | 191 | 192 | Dataset\mttkn_bnd 193 | 194 | 195 | Dataset\mttkn_bnd 196 | 197 | 198 | Dataset\mttkn_bnd 199 | 200 | 201 | Dataset\mttkn_bnd 202 | 203 | 204 | Dataset\mttkn_bnd 205 | 206 | 207 | Dataset\mttkn_bnd 208 | 209 | 210 | Dataset\mttkn_bnd 211 | 212 | 213 | Dataset\mttkn_bnd 214 | 215 | 216 | Dataset\mttkn_bnd 217 | 218 | 219 | Dataset\mttkn_bnd 220 | 221 | 222 | Dataset\mttkn_bnd 223 | 224 | 225 | Dataset\mttkn_bnd 226 | 227 | 228 | Dataset\mttkn_bnd 229 | 230 | 231 | Dataset\mttkn_bnd 232 | 233 | 234 | Websites 235 | 236 | 237 | Websites 238 | 239 | 240 | -------------------------------------------------------------------------------- /TopicDetectionAndTracking/TopicTracking/TopicTracking.cpp: -------------------------------------------------------------------------------- 1 | #include "TopicTracking.h" 2 | 3 | void TopicTracking(const string &tkn_file, const string &bnd_file) 4 | { 5 | } -------------------------------------------------------------------------------- /TopicDetectionAndTracking/TopicTracking/TopicTracking.h: -------------------------------------------------------------------------------- 1 | /* Provided with a small number of stories that are known to be on the same 2 | topic, find all other stories on that topic in the stream of arriving 3 | news. 
*/ 4 | 5 | #ifndef TOPIC_TRACKING_H 6 | #define TOPIC_TRACKING_H 7 | 8 | #include "../Utilities/Story.h" 9 | 10 | /* OPTIONAL */ 11 | void TopicTracking(const vector &corpus); 12 | 13 | #endif -------------------------------------------------------------------------------- /TopicDetectionAndTracking/Utilities/Story.cpp: -------------------------------------------------------------------------------- 1 | #include "Story.h" 2 | 3 | Story::Story () 4 | { 5 | } 6 | 7 | Story::Story (int storyID, vector words, string timeStamp) 8 | : storyID (storyID), words (words), timeStamp (timeStamp) 9 | { 10 | } 11 | 12 | int Story::getStoryID () const 13 | { 14 | return this->storyID; 15 | } 16 | 17 | /* Set storyID */ 18 | //// Unused. The storyID should be set only once, when the story is created. 19 | //void Story::setStoryID (int storyID) 20 | //{ 21 | // this->storyID = storyID; 22 | //} 23 | 24 | vector Story::getWords () const 25 | { 26 | return this->words; 27 | } 28 | 29 | 30 | /* Set words, the whole vector. */ 31 | //// Unused. The words should be set only once, when the story is created. 32 | //void Story::setWords (vector &words) 33 | //{ 34 | // this->words = words; 35 | //} 36 | 37 | int Story::getWord (int index) const 38 | { 39 | return this->words[index]; 40 | } 41 | 42 | /* Set words[index], a specific element in the vector. */ 43 | //// Unused. The words should be set only once, when the story is created. 
44 | //void Story::setWord (int index, int value) 45 | //{ 46 | // this->words[index] = value; 47 | //} 48 | 49 | /* Get & set timeStamp */ 50 | string Story::getTimeStamp () const 51 | { 52 | return this->timeStamp; 53 | } 54 | 55 | void Story::setTimeStamp (const string &timeStamp) 56 | { 57 | this->timeStamp = timeStamp; 58 | } 59 | 60 | /* Get & set topicID */ 61 | int Story::getTopicID () const 62 | { 63 | return this->topicID; 64 | } 65 | 66 | void Story::setTopicID (int topicID) 67 | { 68 | this->topicID = topicID; 69 | } 70 | 71 | /* Get 'wordsCount': copies the member map into the output argument with the assignment operator */ 72 | void Story::getWordsCount (map &wordsCount) const 73 | { 74 | wordsCount = this->wordsCount; 75 | } 76 | 77 | /* Set 'wordsCount' by counting every word ID in 'words'. The map is not cleared first, so a second call adds to the existing counts. */ 78 | void Story::setWordsCount () 79 | { 80 | for (int curWordID : this->words) { 81 | if (wordsCount.find (curWordID) == wordsCount.end ()) // curWordID not found 82 | wordsCount[curWordID] = 1; 83 | else // curWordID is found 84 | wordsCount[curWordID]++; 85 | } 86 | } 87 | 88 | /* Get 'termFrequency': copies the member map into the output argument with the assignment operator */ 89 | void Story::getTermFrequency (map &termFrequency) const 90 | { 91 | termFrequency = this->termFrequency; 92 | } 93 | 94 | /* Set 'termFrequency': the count of each word divided by the story length; builds 'wordsCount' first if it is empty */ 95 | void Story::setTermFrequency () 96 | { 97 | if (this->wordsCount.empty ()) 98 | this->setWordsCount (); 99 | 100 | double length = this->getLength (); 101 | for (map::const_iterator iter = this->wordsCount.cbegin (); 102 | iter != this->wordsCount.cend (); ++iter) { 103 | termFrequency[iter->first] = iter->second / length; 104 | } 105 | } 106 | 107 | /* Get 'tfidf': copies the member map into the output argument with the assignment operator */ 108 | void Story::getTFIDF (map &tfidf) const 109 | { 110 | tfidf = this->tfidf; 111 | } 112 | 113 | void Story::setTFIDF (const map &tfidf) 114 | { 115 | this->tfidf.clear (); 116 | this->tfidf = tfidf; 117 | } 118 | 119 | /* Set 'tfidf', based on corpus: term frequency times log (corpus size / number of stories containing the word). A word missing from storiesIndexWithCertainWord keeps its plain term frequency. */ 120 | void Story::setTFIDFBasedOnCorpus (const vector &corpus, 121 | const map> 
&storiesIndexWithCertainWord) 122 | { 123 | if (this->termFrequency.empty ()) 124 | this->setTermFrequency (); 125 | 126 | this->tfidf = this->termFrequency; 127 | for (map::iterator iter = this->tfidf.begin (); 128 | iter != this->tfidf.end (); ++iter) { 129 | if (storiesIndexWithCertainWord.find (iter->first) != storiesIndexWithCertainWord.cend ()) { 130 | double idf = 0.0; 131 | double storiesWithWord = 0.0; 132 | storiesWithWord = storiesIndexWithCertainWord.find (iter->first)->second.size (); 133 | idf = log (corpus.size () / storiesWithWord); 134 | iter->second *= idf; 135 | } 136 | } 137 | } 138 | /* Append a word index to 'words'; the wordsCount, termFrequency and tfidf maps are not updated here. */ 139 | void Story::addWord (int wordIndex) 140 | { 141 | words.push_back (wordIndex); 142 | } 143 | /* Return the number of entries in 'words'. */ 144 | int Story::getLength () const 145 | { 146 | return this->words.size (); 147 | } 148 | /* Return the time stamp followed by the glossary text of each word, separated by spaces; word IDs missing from the glossary are skipped. */ 149 | string Story::toString (const map &glossary) const 150 | { 151 | string result = ""; 152 | result += this->timeStamp; 153 | for (const int wordID : this->words) 154 | if (glossary.find (wordID) != glossary.cend ()) 155 | result += " " + glossary.find (wordID)->second; 156 | return result; 157 | } 158 | /* Return true if the story contains wordID: looks it up in wordsCount when that map is populated, otherwise scans 'words' linearly. */ 159 | bool Story::isWordExisted (int wordID) const 160 | { 161 | if (!this->wordsCount.empty ()) { 162 | return this->wordsCount.find (wordID) != this->wordsCount.cend (); 163 | } else { 164 | // Option 1: first build up the wordsCount, then use wordsCount.find() 165 | // Option 2: sort the words, then find (using the binary search) 166 | return std::find (this->words.cbegin (), this->words.cend (), wordID) != this->words.cend (); 167 | } 168 | } 169 | /* Return true if topicID differs from DEFAULT_TOPIC_ID. */ 170 | bool Story::isClustered () const 171 | { 172 | return topicID != DEFAULT_TOPIC_ID; 173 | } 174 | /* Set 'tfidf' for every story in corpus, printing the progress every 10 stories. */ 175 | void Story::setTFIDFOfCorpus (vector &corpus, 176 | const map> &storiesIndexWithCertainWord) 177 | { 178 | cout << ">>> Start calculating tfidf of corpus......" 
<< endl; 179 | 180 | for (unsigned count = 0; count < corpus.size (); ++count) { 181 | if (count % 10 == 0) 182 | cout << count << " / " << corpus.size () << endl; 183 | corpus[count].setTFIDFBasedOnCorpus (corpus, storiesIndexWithCertainWord); 184 | } 185 | 186 | cout << ">>> Calculating tfidf's done." << endl; 187 | } 188 | 189 | /* Save the tfidf's of corpus to tfidfFile: one line per story, each entry written as wordID:value followed by a space */ 190 | void Story::saveTFIDF (const vector &corpus, const string &tfidfFile) 191 | { 192 | cout << ">>> Start saving tfidf......" << endl; 193 | 194 | ofstream fout (tfidfFile, ios::out); 195 | assert (fout.is_open ()); 196 | 197 | for (const Story &story : corpus) { 198 | map tfidf; 199 | story.getTFIDF (tfidf); 200 | for (map::const_iterator citer = tfidf.cbegin (); 201 | citer != tfidf.cend (); ++citer) 202 | fout << citer->first << ":" << citer->second << " "; 203 | fout << endl; 204 | } 205 | 206 | fout.close (); 207 | 208 | cout << ">>> Saving tfidf done. " << endl; 209 | } 210 | 211 | /* Load the tfidf's of corpus from tfidfFile: line i of the file is assigned to corpus[i], with no check that corpus holds that many stories */ 212 | void Story::loadTFIDF (vector &corpus, const string &tfidfFile) 213 | { 214 | cout << ">>> Start loading tfidf......" << endl; 215 | 216 | ifstream fin (tfidfFile, ios::in); 217 | assert (fin.is_open ()); 218 | 219 | string line = ""; 220 | map tfidf; 221 | stringstream ss; 222 | int key; 223 | double value; 224 | int i = 0; 225 | while (std::getline (fin, line)) { 226 | tfidf.clear (); 227 | ss.clear (); /* reset the stream state flags left over from reading the previous line */ 228 | ss << line; 229 | while (ss >> key) { 230 | ss.get (); /* consume the separator between key and value */ 231 | ss >> value; 232 | tfidf[key] = value; 233 | } 234 | corpus[i++].setTFIDF (tfidf); 235 | } 236 | 237 | fin.close (); 238 | 239 | cout << ">>> Loading tfidf done. 
" << endl; 240 | } -------------------------------------------------------------------------------- /TopicDetectionAndTracking/Utilities/Story.h: -------------------------------------------------------------------------------- 1 | /* This file is put in the directory 'Utilities' because it is shared across 2 | the system. If the directory 'Utilities' changes, notify all authors. */ 3 | 4 | #ifndef STORY_H 5 | #define STORY_H 6 | 7 | #include "Utilities.h" 8 | 9 | /* Default value of 'topicID', meaning the story is not yet clustered. 10 | Please AVOID using it outside 'Story.h' and 'Story.cpp'. */ 11 | #define DEFAULT_TOPIC_ID -1 12 | /* Default value of 'storyID', meaning the story is just a temporary story. 13 | Please AVOID using it outside 'Story.h' and 'Story.cpp'. */ 14 | #define DEFAULT_STORY_ID -1 15 | 16 | class Story 17 | { 18 | private: 19 | int storyID = DEFAULT_STORY_ID; 20 | vector words; // the index of each plain word, refer to the glossary 21 | string timeStamp; // yyyymmdd.ttmm.XXX 22 | int topicID = DEFAULT_TOPIC_ID; 23 | // wordID -> count. Before using, make sure setWordsCount() is invoked. 24 | map wordsCount; 25 | // wordID -> term frequency. Before using, make sure setTermFrequency() is invoked. 26 | map termFrequency; 27 | // wordID -> tfidf. Before using, make sure setTFIDFBasedOnCorpus() is invoked 28 | map tfidf; 29 | // vector words_s; // unused since we already have the glossary and the index of each plain word. 30 | 31 | public: 32 | /* UNSUGGESTED: Default constructor. */ 33 | Story (); 34 | /* The first two parameters are necessary */ 35 | Story (int storyID, vector words, string timeStamp); 36 | 37 | /* Get 'storyID' */ 38 | int getStoryID () const; 39 | //// UNUSED: The storyID should be set only once, when the story is created. 40 | ///* Set 'storyID'. */ 41 | //void setStoryID (int storyID); 42 | 43 | /* Get words, the whole vector. */ 44 | vector getWords () const; 45 | //// UNUSED: The words should be set only once, when the story is created. 46 | ///* Set words, the whole vector. 
*/ 47 | //void setWords (vector &words); 48 | 49 | /* Get words[index], a specific element in the vector. */ 50 | int getWord (int index) const; 51 | //// Unused. The words should be set only once, when the story is created. 52 | ///* Set words[index], a specific element in the vector. */ 53 | //void setWord (int index); 54 | 55 | /* Get 'timeStamp' */ 56 | string getTimeStamp () const; 57 | /* Set 'timeStamp' */ 58 | void setTimeStamp (const string &timeStamp); 59 | 60 | /* Get topicID */ 61 | int getTopicID () const; 62 | /* Set topicID */ 63 | void setTopicID (int topicID); 64 | 65 | /* Copy 'wordsCount' into the output argument. Call setWordsCount() first, otherwise the map is empty. */ 66 | void getWordsCount (map &wordsCount) const; 67 | /* Build 'wordsCount' by counting the word IDs in 'words' */ 68 | void setWordsCount (); 69 | 70 | /* Copy 'termFrequency' into the output argument. Call setTermFrequency() first, otherwise the map is empty. */ 71 | void getTermFrequency (map &termFrequency) const; 72 | /* Build 'termFrequency' (word count divided by story length) from 'wordsCount' */ 73 | void setTermFrequency (); 74 | 75 | /* Copy 'tfidf' into the output argument. Call setTFIDFBasedOnCorpus() first, otherwise the map is empty. */ 76 | void getTFIDF (map &tfidf) const; 77 | /* Replace 'tfidf' with a copy of the given map */ 78 | void setTFIDF (const map &tfidf); 79 | /* To calculate tfidf for a certain story, we have to refer to a corpus. */ 80 | void setTFIDFBasedOnCorpus (const vector &corpus, 81 | const map> &storiesIndexWithCertainWord); 82 | 83 | /* add word to 'words' */ 84 | void addWord (int wordIndex); 85 | 86 | /* return the length of the story, i.e., the length of 'words' */ 87 | int getLength () const; 88 | 89 | /* return the timestamp followed by the glossary text of each word */ 90 | string toString (const map &glossary) const; 91 | 92 | /* return true if this story contains a certain word. */ 93 | bool isWordExisted (int wordID) const; 94 | 95 | /* return true if the story is already clustered. */ 96 | bool isClustered () const; 97 | 98 | /* Set 'tfidf' for all stories in corpus. 
*/ 99 | static void setTFIDFOfCorpus (vector &corpus, 100 | const map> &storiesIndexWithCertainWord); 101 | /* Save the tfidf's of corpus to tfidfFile */ 102 | static void saveTFIDF (const vector &corpus, const string &tfidfFile); 103 | /* Load the tfidf's of corpus from tfidfFile */ 104 | static void loadTFIDF (vector &corpus, const string &tfidfFile); 105 | }; 106 | 107 | #endif -------------------------------------------------------------------------------- /TopicDetectionAndTracking/Utilities/Utilities.cpp: -------------------------------------------------------------------------------- 1 | /* Put all public helper functions here. */ 2 | 3 | #include "Utilities.h" 4 | -------------------------------------------------------------------------------- /TopicDetectionAndTracking/Utilities/Utilities.h: -------------------------------------------------------------------------------- 1 | /* All external dependencies should be put here. 2 | If any other .h in this system is already included, there is no need to include this 3 | file again, since all other .h files include this file. */ 4 | 5 | #ifndef UTILITIES_H 6 | #define UTILITIES_H 7 | 8 | /* Without this statement, while developing in VS, you have to use functions like 9 | strtok_s instead of functions like strtok. We avoid doing that because functions like 10 | strtok_s won't work on other platforms, e.g., Linux. 
*/ 11 | #define _CRT_SECURE_NO_WARNINGS 12 | 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | // please avoid using namespaces 28 | using std::string; 29 | using std::vector; 30 | using std::map; 31 | using std::cout; 32 | using std::cin; 33 | using std::endl; 34 | using std::ifstream; 35 | using std::ofstream; 36 | using std::ios; 37 | using std::stringstream; 38 | using std::set; 39 | 40 | #endif 41 | -------------------------------------------------------------------------------- /TopicDetectionAndTracking/Websites/part1.html: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /TopicDetectionAndTracking/Websites/part2.html: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | 6 | Here is title 7 | 8 | 9 | 10 | 11 | --------------------------------------------------------------------------------