├── .DS_Store
├── .cproject
├── .gitattributes
├── .gitignore
├── .project
├── .settings
└── org.eclipse.cdt.managedbuilder.core.prefs
├── README.md
├── TopicDetectionAndTracking.sln
└── TopicDetectionAndTracking
├── DataPreprocessing
├── DataPreprocessing.cpp
└── DataPreprocessing.h
├── Dataset
├── mttkn
│ ├── 20030401_0113_1041_AFP_ARB.mttkn
│ ├── 20030401_0130_0330_XIN_MAN.mttkn
│ ├── 20030401_0300_0500_AFP_MAN.mttkn
│ ├── 20030401_0530_0700_CNA_MAN.mttkn
│ ├── 20030401_0600_0800_XIN_ARB.mttkn
│ ├── 20030401_0730_0930_XIN_MAN.mttkn
│ ├── 20030401_1000_1200_ZBN_MAN.mttkn
│ ├── 20030401_1041_2343_AFP_ARB.mttkn
│ ├── 20030401_1100_1230_ANN_ARB.mttkn
│ ├── 20030401_1130_1330_XIN_MAN.mttkn
│ ├── 20030401_1500_1700_XIN_ARB.mttkn
│ ├── 20030401_1530_1730_XIN_MAN.mttkn
│ ├── 20030401_1930_2130_XIN_MAN.mttkn
│ ├── 20030402_0106_1145_AFP_ARB.mttkn
│ ├── 20030402_0130_0330_XIN_MAN.mttkn
│ ├── 20030402_0300_0500_AFP_MAN.mttkn
│ ├── 20030402_0530_0700_CNA_MAN.mttkn
│ ├── 20030402_0600_0800_XIN_ARB.mttkn
│ ├── 20030402_0730_0930_XIN_MAN.mttkn
│ └── 20030402_0830_1000_CNA_MAN.mttkn
├── mttkn_bnd
│ ├── 20030401_0113_1041_AFP_ARB.mttkn_bnd
│ ├── 20030401_0130_0330_XIN_MAN.mttkn_bnd
│ ├── 20030401_0300_0500_AFP_MAN.mttkn_bnd
│ ├── 20030401_0530_0700_CNA_MAN.mttkn_bnd
│ ├── 20030401_0600_0800_XIN_ARB.mttkn_bnd
│ ├── 20030401_0730_0930_XIN_MAN.mttkn_bnd
│ ├── 20030401_1000_1200_ZBN_MAN.mttkn_bnd
│ ├── 20030401_1041_2343_AFP_ARB.mttkn_bnd
│ ├── 20030401_1100_1230_ANN_ARB.mttkn_bnd
│ ├── 20030401_1130_1330_XIN_MAN.mttkn_bnd
│ ├── 20030401_1500_1700_XIN_ARB.mttkn_bnd
│ ├── 20030401_1530_1730_XIN_MAN.mttkn_bnd
│ ├── 20030401_1930_2130_XIN_MAN.mttkn_bnd
│ ├── 20030402_0106_1145_AFP_ARB.mttkn_bnd
│ ├── 20030402_0130_0330_XIN_MAN.mttkn_bnd
│ ├── 20030402_0300_0500_AFP_MAN.mttkn_bnd
│ ├── 20030402_0530_0700_CNA_MAN.mttkn_bnd
│ ├── 20030402_0600_0800_XIN_ARB.mttkn_bnd
│ ├── 20030402_0730_0930_XIN_MAN.mttkn_bnd
│ └── 20030402_0830_1000_CNA_MAN.mttkn_bnd
└── tfidf.dat
├── Evaluation
├── Evaluation.cpp
└── Evaluation.h
├── FirstStoryDetection
├── FirstStoryDetection.cpp
└── FirstStoryDetection.h
├── Main.cpp
├── Presentation
├── Presentation.cpp
└── Presentation.h
├── StoryLinkDetection
├── StoryLinkDetection.cpp
└── StoryLinkDetection.h
├── StorySegmentation
├── StorySegmentation.cpp
└── StorySegmentation.h
├── TopicDetection
├── TopicDetection.cpp
└── TopicDetection.h
├── TopicDetectionAndTracking.vcxproj
├── TopicDetectionAndTracking.vcxproj.filters
├── TopicTracking
├── TopicTracking.cpp
└── TopicTracking.h
├── Utilities
├── Story.cpp
├── Story.h
├── Utilities.cpp
└── Utilities.h
└── Websites
├── part1.html
└── part2.html
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xavierwu/TopicDetectionAndTracking/e449feda00e55fc739f3ad104dc29c5fa4089a4b/.DS_Store
--------------------------------------------------------------------------------
/.cproject:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
122 |
123 |
124 |
125 |
126 |
127 |
128 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | ###############################################################################
2 | # Set default behavior to automatically normalize line endings.
3 | ###############################################################################
4 | * text=auto
5 |
6 | ###############################################################################
7 | # Set default behavior for command prompt diff.
8 | #
9 | # This is needed for earlier builds of msysgit that do not have it on by
10 | # default for csharp files.
11 | # Note: This is only used by command line
12 | ###############################################################################
13 | #*.cs diff=csharp
14 |
15 | ###############################################################################
16 | # Set the merge driver for project and solution files
17 | #
18 | # Merging from the command prompt will add diff markers to the files if there
19 | # are conflicts (Merging from VS is not affected by the settings below, in VS
20 | # the diff markers are never inserted). Diff markers may cause the following
21 | # file extensions to fail to load in VS. An alternative would be to treat
22 | # these files as binary and thus will always conflict and require user
23 | # intervention with every merge. To do so, just uncomment the entries below
24 | ###############################################################################
25 | #*.sln merge=binary
26 | #*.csproj merge=binary
27 | #*.vbproj merge=binary
28 | #*.vcxproj merge=binary
29 | #*.vcproj merge=binary
30 | #*.dbproj merge=binary
31 | #*.fsproj merge=binary
32 | #*.lsproj merge=binary
33 | #*.wixproj merge=binary
34 | #*.modelproj merge=binary
35 | #*.sqlproj merge=binary
36 | #*.wwaproj merge=binary
37 |
38 | ###############################################################################
39 | # behavior for image files
40 | #
41 | # image files are treated as binary by default.
42 | ###############################################################################
43 | #*.jpg binary
44 | #*.png binary
45 | #*.gif binary
46 |
47 | ###############################################################################
48 | # diff behavior for common document formats
49 | #
50 | # Convert binary document formats to text before diffing them. This feature
51 | # is only available from the command line. Turn it on by uncommenting the
52 | # entries below.
53 | ###############################################################################
54 | #*.doc diff=astextplain
55 | #*.DOC diff=astextplain
56 | #*.docx diff=astextplain
57 | #*.DOCX diff=astextplain
58 | #*.dot diff=astextplain
59 | #*.DOT diff=astextplain
60 | #*.pdf diff=astextplain
61 | #*.PDF diff=astextplain
62 | #*.rtf diff=astextplain
63 | #*.RTF diff=astextplain
64 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ## Ignore Visual Studio temporary files, build results, and
2 | ## files generated by popular Visual Studio add-ons.
3 |
4 | # User-specific files
5 | *.suo
6 | *.user
7 | *.sln.docstates
8 |
9 | # Build results
10 | [Dd]ebug/
11 | [Dd]ebugPublic/
12 | [Rr]elease/
13 | x64/
14 | build/
15 | bld/
16 | [Bb]in/
17 | [Oo]bj/
18 |
19 | # Roslyn cache directories
20 | *.ide/
21 |
22 | # MSTest test Results
23 | [Tt]est[Rr]esult*/
24 | [Bb]uild[Ll]og.*
25 |
26 | #NUNIT
27 | *.VisualState.xml
28 | TestResult.xml
29 |
30 | # Build Results of an ATL Project
31 | [Dd]ebugPS/
32 | [Rr]eleasePS/
33 | dlldata.c
34 |
35 | *_i.c
36 | *_p.c
37 | *_i.h
38 | *.ilk
39 | *.meta
40 | *.obj
41 | *.pch
42 | *.pdb
43 | *.pgc
44 | *.pgd
45 | *.rsp
46 | *.sbr
47 | *.tlb
48 | *.tli
49 | *.tlh
50 | *.tmp
51 | *.tmp_proj
52 | *.log
53 | *.vspscc
54 | *.vssscc
55 | .builds
56 | *.pidb
57 | *.svclog
58 | *.scc
59 |
60 | # Chutzpah Test files
61 | _Chutzpah*
62 |
63 | # Visual C++ cache files
64 | ipch/
65 | *.aps
66 | *.ncb
67 | *.opensdf
68 | *.sdf
69 | *.cachefile
70 |
71 | # Visual Studio profiler
72 | *.psess
73 | *.vsp
74 | *.vspx
75 |
76 | # TFS 2012 Local Workspace
77 | $tf/
78 |
79 | # Guidance Automation Toolkit
80 | *.gpState
81 |
82 | # ReSharper is a .NET coding add-in
83 | _ReSharper*/
84 | *.[Rr]e[Ss]harper
85 | *.DotSettings.user
86 |
87 | # JustCode is a .NET coding add-in
88 | .JustCode
89 |
90 | # TeamCity is a build add-in
91 | _TeamCity*
92 |
93 | # DotCover is a Code Coverage Tool
94 | *.dotCover
95 |
96 | # NCrunch
97 | _NCrunch_*
98 | .*crunch*.local.xml
99 |
100 | # MightyMoose
101 | *.mm.*
102 | AutoTest.Net/
103 |
104 | # Web workbench (sass)
105 | .sass-cache/
106 |
107 | # Installshield output folder
108 | [Ee]xpress/
109 |
110 | # DocProject is a documentation generator add-in
111 | DocProject/buildhelp/
112 | DocProject/Help/*.HxT
113 | DocProject/Help/*.HxC
114 | DocProject/Help/*.hhc
115 | DocProject/Help/*.hhk
116 | DocProject/Help/*.hhp
117 | DocProject/Help/Html2
118 | DocProject/Help/html
119 |
120 | # Click-Once directory
121 | publish/
122 |
123 | # Publish Web Output
124 | *.[Pp]ublish.xml
125 | *.azurePubxml
126 | ## TODO: Comment the next line if you want to checkin your
127 | ## web deploy settings but do note that will include unencrypted
128 | ## passwords
129 | #*.pubxml
130 |
131 | # NuGet Packages Directory
132 | packages/*
133 | ## TODO: If the tool you use requires repositories.config
134 | ## uncomment the next line
135 | #!packages/repositories.config
136 |
137 | # Enable "build/" folder in the NuGet Packages folder since
138 | # NuGet packages use it for MSBuild targets.
139 | # This line needs to be after the ignore of the build folder
140 | # (and the packages folder if the line above has been uncommented)
141 | !packages/build/
142 |
143 | # Windows Azure Build Output
144 | csx/
145 | *.build.csdef
146 |
147 | # Windows Store app package directory
148 | AppPackages/
149 |
150 | # Others
151 | sql/
152 | *.Cache
153 | ClientBin/
154 | [Ss]tyle[Cc]op.*
155 | ~$*
156 | *~
157 | *.dbmdl
158 | *.dbproj.schemaview
159 | *.pfx
160 | *.publishsettings
161 | node_modules/
162 |
163 | # RIA/Silverlight projects
164 | Generated_Code/
165 |
166 | # Backup & report files from converting an old project file
167 | # to a newer Visual Studio version. Backup files are not needed,
168 | # because we have git ;-)
169 | _UpgradeReport_Files/
170 | Backup*/
171 | UpgradeLog*.XML
172 | UpgradeLog*.htm
173 |
174 | # SQL Server files
175 | *.mdf
176 | *.ldf
177 |
178 | # Business Intelligence projects
179 | *.rdl.data
180 | *.bim.layout
181 | *.bim_*.settings
182 |
183 | # Microsoft Fakes
184 | FakesAssemblies/
185 |
186 | # LightSwitch generated files
187 | GeneratedArtifacts/
188 | _Pvt_Extensions/
189 | ModelManifest.xml
--------------------------------------------------------------------------------
/.project:
--------------------------------------------------------------------------------
1 |
2 |
3 | TDT
4 |
5 |
6 |
7 |
8 |
9 | org.eclipse.cdt.managedbuilder.core.genmakebuilder
10 | clean,full,incremental,
11 |
12 |
13 |
14 |
15 | org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder
16 | full,incremental,
17 |
18 |
19 |
20 |
21 |
22 | org.eclipse.cdt.core.cnature
23 | org.eclipse.cdt.core.ccnature
24 | org.eclipse.cdt.managedbuilder.core.managedBuildNature
25 | org.eclipse.cdt.managedbuilder.core.ScannerConfigNature
26 |
27 |
28 |
--------------------------------------------------------------------------------
/.settings/org.eclipse.cdt.managedbuilder.core.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | environment/buildEnvironmentInclude/cdt.managedbuild.config.gnu.macosx.exe.debug.957622487/CPATH/delimiter=\:
3 | environment/buildEnvironmentInclude/cdt.managedbuild.config.gnu.macosx.exe.debug.957622487/CPATH/operation=remove
4 | environment/buildEnvironmentInclude/cdt.managedbuild.config.gnu.macosx.exe.debug.957622487/CPLUS_INCLUDE_PATH/delimiter=\:
5 | environment/buildEnvironmentInclude/cdt.managedbuild.config.gnu.macosx.exe.debug.957622487/CPLUS_INCLUDE_PATH/operation=remove
6 | environment/buildEnvironmentInclude/cdt.managedbuild.config.gnu.macosx.exe.debug.957622487/C_INCLUDE_PATH/delimiter=\:
7 | environment/buildEnvironmentInclude/cdt.managedbuild.config.gnu.macosx.exe.debug.957622487/C_INCLUDE_PATH/operation=remove
8 | environment/buildEnvironmentInclude/cdt.managedbuild.config.gnu.macosx.exe.debug.957622487/append=true
9 | environment/buildEnvironmentInclude/cdt.managedbuild.config.gnu.macosx.exe.debug.957622487/appendContributed=true
10 | environment/buildEnvironmentLibrary/cdt.managedbuild.config.gnu.macosx.exe.debug.957622487/LIBRARY_PATH/delimiter=\:
11 | environment/buildEnvironmentLibrary/cdt.managedbuild.config.gnu.macosx.exe.debug.957622487/LIBRARY_PATH/operation=remove
12 | environment/buildEnvironmentLibrary/cdt.managedbuild.config.gnu.macosx.exe.debug.957622487/append=true
13 | environment/buildEnvironmentLibrary/cdt.managedbuild.config.gnu.macosx.exe.debug.957622487/appendContributed=true
14 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # TopicDetectionAndTracking
2 | My graduation project (with three friends), covering the main Topic Detection and Tracking (TDT) tasks.
3 |
4 | ## Dataset/
5 | ### mttkn/
6 | Contains some TDT5 token files.
7 | ### mttkn_bnd/
8 | Contains the corresponding boundary files.
9 | ## Main.cpp
10 | Contains main() function only.
11 | ## Utilities/
12 | Code that is shared among all other parts, e.g., the class 'Story'.
13 | ## StorySegmentation/
14 | If we don't have boundary files, how can we find the boundaries between two documents in a token file?
15 | ## DataPreprocessing/
16 | Reads the data and does some pre-processing.
17 | ## StoryLinkDetection/
18 | Find out the link between two stories.
19 | ## TopicDetection/
20 | Detect some topics.
21 | ## FirstStoryDetection/
22 | Detect the first story of a certain event.
23 | ## TopicTracking/
24 | ## Evaluation/
25 | ## Presentation/
26 |
--------------------------------------------------------------------------------
/TopicDetectionAndTracking.sln:
--------------------------------------------------------------------------------
1 |
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio 2013
4 | VisualStudioVersion = 12.0.31101.0
5 | MinimumVisualStudioVersion = 10.0.40219.1
6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "TopicDetectionAndTracking", "TopicDetectionAndTracking\TopicDetectionAndTracking.vcxproj", "{AE135031-9969-4F5E-82B0-974800352F3F}"
7 | EndProject
8 | Global
9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
10 | Debug|Win32 = Debug|Win32
11 | Release|Win32 = Release|Win32
12 | EndGlobalSection
13 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
14 | {AE135031-9969-4F5E-82B0-974800352F3F}.Debug|Win32.ActiveCfg = Debug|Win32
15 | {AE135031-9969-4F5E-82B0-974800352F3F}.Debug|Win32.Build.0 = Debug|Win32
16 | {AE135031-9969-4F5E-82B0-974800352F3F}.Release|Win32.ActiveCfg = Release|Win32
17 | {AE135031-9969-4F5E-82B0-974800352F3F}.Release|Win32.Build.0 = Release|Win32
18 | EndGlobalSection
19 | GlobalSection(SolutionProperties) = preSolution
20 | HideSolutionNode = FALSE
21 | EndGlobalSection
22 | EndGlobal
23 |
--------------------------------------------------------------------------------
/TopicDetectionAndTracking/DataPreprocessing/DataPreprocessing.cpp:
--------------------------------------------------------------------------------
1 | #include "DataPreprocessing.h"
2 |
3 | const int MAX_FILES = 999999; // stand-in for "no limit": readCorpusFromDirectory() substitutes it when the user enters 0 (all files)
4 |
5 | /* Entry point of the preprocessing stage: prints a start/done banner and forwards every argument unchanged to readCorpus(), which does the actual loading. */
6 | void DataPreprocessing (vector &corpus,
7 | map &glossaryIntToString, map &glossaryStringToInt,
8 | map> &storiesIndexWithCertainWord,
9 | const string tknDir, const string bndDir, // prepended verbatim to file names by the readers (no separator is added)
10 | const bool &isWithStemmer) // when true, readCorpus() currently reads nothing (stemming is still a TODO there)
11 | {
12 | cout << "> Start DataPreprocessing......" << endl;
13 |
14 | readCorpus (corpus, glossaryIntToString, glossaryStringToInt, storiesIndexWithCertainWord, tknDir,
15 | bndDir, isWithStemmer);
16 |
17 | cout << "> DataPreprocessing Done." << endl;
18 | }
19 | /* Ask on stdin whether to load explicitly named files ('1') or a whole directory ('2') and dispatch to the matching reader; only the banners are printed when 'isWithStemmer' is true. NOTE(review): template arguments look stripped in this listing (e.g. 'vector', 'map>'); the signature is reproduced as found. */
20 | void readCorpus (vector &corpus,
21 | map &glossaryIntToString, map &glossaryStringToInt,
22 | map> &storiesIndexWithCertainWord,
23 | const string tknDir, const string bndDir, const bool &isWithStemmer)
24 | {
25 |     cout << ">> Start reading corpus......" << endl;
26 |
27 |     if (!isWithStemmer) {
28 |         cout << "Please choose" << endl;
29 |         cout << "1. Read from the specific file" << endl;
30 |         cout << "2. Read from files in the directory" << endl;
31 |
32 |         // Re-prompt until a valid choice arrives; the loop ends on EOF / stream failure instead of spinning forever.
33 |         char choice = '\0';
34 |         string restOfLine;
35 |         while (cin >> choice) {
36 |             getline (cin, restOfLine); // drop the rest of the answer line; portable replacement for fflush (stdin), which is undefined for input streams
37 |             if (choice == '1') {
38 |                 readCorpusFromFile (corpus, glossaryIntToString, glossaryStringToInt,
39 |                                     storiesIndexWithCertainWord, tknDir, bndDir, isWithStemmer);
40 |                 break;
41 |             } else if (choice == '2') {
42 |                 readCorpusFromDirectory (corpus, glossaryIntToString, glossaryStringToInt,
43 |                                          storiesIndexWithCertainWord, tknDir, bndDir, isWithStemmer);
44 |                 break;
45 |             }
46 |             cout << "Invalid input, please input again!" << endl;
47 |         }
48 |         if (!cin) // the input stream ended or failed while prompting
49 |             cout << "Input ended unexpectedly." << endl;
50 |
51 |     } else { // TODO: (optional) add a stemmer to the readCorpus(...) ?
52 |     }
53 |
54 |     cout << ">> Reading corpus done." << endl;
55 | }
56 | /* Interactively load bnd/tkn file pairs: prompt for both file names (prefixed with 'bndDir' / 'tknDir'), read them with readBndFile() and readTknFile(), and repeat while the user answers Y. 'isWithStemmer' is not used here. NOTE(review): template arguments look stripped in this listing (e.g. 'vector', 'map>'); declarations are reproduced as found. */
57 | void readCorpusFromFile (vector &corpus,
58 | map &glossaryIntToString, map &glossaryStringToInt,
59 | map> &storiesIndexWithCertainWord,
60 | const string tknDir, const string bndDir, const bool &isWithStemmer)
61 | {
62 |     string restOfLine; // swallows whatever follows the Y/N answer on its line
63 |     while (true) {
64 |         // the id of the first and the last words of a story
65 |         vector Brecid;
66 |         vector Erecid;
67 |
68 |         string bndFile, tknFile;
69 |
70 |         cout << "Please input the file name of bnd file" << endl;
71 |         if (!(cin >> bndFile)) break; // input exhausted: stop instead of trying to open the bare directory prefix
72 |         bndFile = bndDir + bndFile;
73 |
74 |         cout << "Please input the file name of tkn file" << endl;
75 |         if (!(cin >> tknFile)) break;
76 |         tknFile = tknDir + tknFile;
77 |
78 |         readBndFile (corpus, bndFile, Brecid, Erecid);
79 |
80 |         readTknFile (corpus, tknFile, Brecid, Erecid, glossaryIntToString, glossaryStringToInt, storiesIndexWithCertainWord);
81 |
82 |         cout << "Continue?(Y/N)" << endl;
83 |
84 |         // Ask until the answer is Y or N; an exhausted input counts as "N" so the loop cannot spin forever.
85 |         char choice = '\0';
86 |         bool answered = false, again = false;
87 |         while (!answered && cin >> choice) {
88 |             getline (cin, restOfLine); // portable replacement for fflush (stdin), which is undefined for input streams
89 |             if (choice == 'Y' || choice == 'y') {
90 |                 answered = again = true;
91 |             } else if (choice == 'N' || choice == 'n') {
92 |                 answered = true;
93 |             } else {
94 |                 cout << "Invalid input, please input again!" << endl;
95 |             }
96 |         }
97 |         if (!again)
98 |             break;
99 |     }
100 | }
101 | /* Load up to a user-chosen number of files (0 = all) from 'bndDir' and then from 'tknDir': every bnd file is passed to readBndFile() first, then every tkn file to readTknFile(). 'isWithStemmer' is not used here. NOTE(review): template arguments look stripped in this listing (e.g. 'vector', 'map>'); declarations are reproduced as found. */
102 | void readCorpusFromDirectory (vector &corpus, map &glossaryIntToString,
103 | map &glossaryStringToInt,
104 | map> &storiesIndexWithCertainWord,
105 | const string tknDir, const string bndDir, const bool &isWithStemmer)
106 | {
107 |     _finddata_t file;
108 |     intptr_t lf; // _findfirst returns intptr_t; storing it in 'long' truncates the handle on 64-bit Windows
109 |
110 |     // the id of the first and the last words of a story
111 |     vector Brecid;
112 |     vector Erecid;
113 |
114 |     string bnd = bndDir.c_str ();
115 |     bnd += "*.*";
116 |     string tkn = tknDir.c_str ();
117 |     tkn += "*.*";
118 |
119 |     int numOfFileTobeRead = 0;
120 |     int numOfFilesRead = 0;
121 |
122 |     cout << "Input the number of files want to be read (0 represents all)" << endl;
123 |     cin >> numOfFileTobeRead;
124 |
125 |     if (numOfFileTobeRead == 0) {
126 |         numOfFileTobeRead = MAX_FILES;
127 |     }
128 |
129 |     if ((lf = _findfirst (bnd.c_str (), &file)) == -1)
130 |         cout << "No bnd file found!" << endl;
131 |     else {
132 |         // Directories (including the "." and ".." pseudo entries) are skipped by
133 |         // attribute instead of assuming they are always the first two results.
134 |         do {
135 |             if (numOfFilesRead >= numOfFileTobeRead)
136 |                 break;
137 |             if (!(file.attrib & _A_SUBDIR)) {
138 |                 cout << file.name << " found" << endl;
139 |
140 |                 string bndFile (file.name);
141 |                 bndFile = bndDir + bndFile;
142 |
143 |                 readBndFile (corpus, bndFile, Brecid, Erecid);
144 |                 numOfFilesRead++;
145 |             }
146 |         } while (_findnext (lf, &file) == 0);
147 |         _findclose (lf); // release this search handle before 'lf' is reused below
148 |     }
149 |
150 |     numOfFilesRead = 0;
151 |     if ((lf = _findfirst (tkn.c_str (), &file)) == -1)
152 |         cout << "No tkn file found!" << endl;
153 |     else {
154 |         do {
155 |             if (numOfFilesRead >= numOfFileTobeRead)
156 |                 break;
157 |             if (!(file.attrib & _A_SUBDIR)) {
158 |                 cout << file.name << " found" << endl;
159 |
160 |                 string tknFile (file.name);
161 |                 tknFile = tknDir + tknFile;
162 |
163 |                 readTknFile (corpus, tknFile, Brecid, Erecid, glossaryIntToString, glossaryStringToInt, storiesIndexWithCertainWord);
164 |                 numOfFilesRead++;
165 |             }
166 |         } while (_findnext (lf, &file) == 0);
167 |         _findclose (lf); // closed only when the search actually succeeded (never called with -1)
168 |     }
169 | }
170 | /* Extract the integer that follows the first '=' of a "name=value" token; returns false when there is no '=' or nothing after it. */
171 | static bool valueAfterEquals (const string &token, int &value)
172 | {
173 |     const string::size_type eq = token.find ('=');
174 |     if (eq == string::npos || eq + 1 >= token.length ())
175 |         return false;
176 |     value = atoi (token.c_str () + eq + 1); // atoi ignores any trailing non-digit characters
177 |     return true;
178 | }
179 |
180 | /* Read one bnd file: append one (still word-less) Story per record to 'corpus' and push the record's first/last word ids onto 'Brecid'/'Erecid'. NOTE(review): the counter passed as first Story constructor argument restarts at 0 for every file; template arguments look stripped in this listing and are reproduced as found. */
181 | void readBndFile (vector &corpus, const string bndFile, vector &Brecid,
182 | vector &Erecid)
183 | {
184 |     int numOfStories = 0;
185 |
186 |     ifstream fin (bndFile, ios::in);
187 |     assert (fin.is_open ());
188 |     if (!fin.is_open ()) { // assert() is compiled out under NDEBUG, so also check at run time
189 |         cout << "Cannot open bnd file " << bndFile << endl;
190 |         return;
191 |     }
192 |
193 |     // the first line is the title and carries no story information
194 |     string titleUseless;
195 |     getline (fin, titleUseless);
196 |
197 |     // Every further record is read as 5 whitespace-separated strings: a leading tag
198 |     // (unused), a token whose text from offset 9 on is kept as the timestamp, a
199 |     // doctype token (unused) and two "name=value" tokens carrying Brecid and Erecid.
200 |     string boundaryUseless;
201 |     while (fin >> boundaryUseless) {
202 |         string timestamp, doctypeUseless;
203 |         // std::string instead of char[15]: an over-long token can no longer overflow a fixed buffer
204 |         string BrecidWithRedundancy, ErecidWithRedundancy;
205 |
206 |         if (!(fin >> timestamp >> doctypeUseless >> BrecidWithRedundancy >> ErecidWithRedundancy))
207 |             break; // incomplete record (for instance a trailing line with fewer tokens)
208 |
209 |         int BrecidInt = 0, ErecidInt = 0;
210 |
211 |         // substr (9) throws on tokens shorter than 9 characters and a token without
212 |         // '=' has no value to convert; stop at such a record instead of crashing.
213 |         if (timestamp.length () < 9
214 |             || !valueAfterEquals (BrecidWithRedundancy, BrecidInt)
215 |             || !valueAfterEquals (ErecidWithRedundancy, ErecidInt)) {
216 |             cout << "Malformed record in bnd file " << bndFile << endl;
217 |             break;
218 |         }
219 |
220 |         // retrieve the timestamp
221 |         timestamp = timestamp.substr (9);
222 |
223 |         vector words;
224 |         Story newStory (numOfStories, words, timestamp);
225 |         corpus.push_back (newStory);
226 |         numOfStories++;
227 |
228 |         Brecid.push_back (BrecidInt);
229 |         Erecid.push_back (ErecidInt);
230 |     }
231 |
232 |     fin.close ();
233 |     cout << "read bnd file done!" << endl;
234 | }
235 | /* Read one tkn file: each token's word is normalised with processWord(), registered via addWordToGlossary(), appended to the current story in 'corpus', and that story's index is inserted into 'storiesIndexWithCertainWord' under the word id. 'Brecid'/'Erecid' decide which story a token belongs to. */
236 | void readTknFile (vector &corpus, const string tknFile,
237 | const vector &Brecid, const vector &Erecid,
238 | map &glossaryIntToString, map &glossaryStringToInt,
239 | map> &storiesIndexWithCertainWord)
240 | {
241 |
242 | ifstream fin (tknFile, ios::in);
243 | assert (fin.is_open ()); // NOTE(review): assert is compiled out under NDEBUG, leaving a failed open unchecked
244 |
245 | // the first line is a title line; it is read and discarded
246 | string titleUseless;
247 | getline (fin, titleUseless);
248 |
249 | // NOTE(review): listing lines 250-258 are missing/garbled here, so the declarations of 'numOfStories', 'recid', 'beginOfAStroy' and '_WUseless' and the loop header are not visible. Each record is read as 4 whitespace-separated strings:
250 | // 1. "> _WUseless) {
259 | string recidWithRedundancy, TrUseless, word;
260 |
261 | // this means a new tkn file is read: the story's first word id is 1, so the running id 'recid' restarts at 1
262 | if (Brecid[numOfStories] == 1 && beginOfAStroy){
263 | recid = 1;
264 | beginOfAStroy = false;
265 | }
266 |
267 | // past the last word id of the current story: move on to the next story (NOTE(review): no bounds check on 'numOfStories' is visible here)
268 | if (recid > Erecid[numOfStories]) {
269 | numOfStories++;
270 | beginOfAStroy = true;
271 | }
272 |
273 | fin >> recidWithRedundancy >> TrUseless >> word; // only 'word' is used below
274 | processWord (word);
275 |
276 | addWordToGlossary (word, glossaryIntToString, glossaryStringToInt);
277 |
278 | int wordID = glossaryStringToInt[word];
279 | corpus[numOfStories].addWord (wordID);
280 | storiesIndexWithCertainWord[wordID].insert (numOfStories);
281 |
282 | recid++; // one token consumed
283 | }
284 |
285 | fin.close ();
286 |
287 | cout << "read tkn file done!" << endl;
288 | }
289 | /* Normalise 'word' in place: uppercase ASCII letters become lowercase; digits, '.' and '-' are kept; every other character that is not a lowercase letter is removed. */
290 | void processWord (string &word)
291 | {
292 | int len = word.length (); // tracked separately because the string shrinks when characters are removed
293 |
294 | for (int i = 0; i < len; i++) {
295 | if (word[i] >= 'A' && word[i] <= 'Z') {
296 | word[i] -= ('A' - 'a'); // 'A' - 'a' is negative, so this adds the offset and yields the lowercase letter
297 | } else if (word[i] >= '0' && word[i] <= '9') {
298 | continue;
299 | } else if (word[i] == '.' || word[i] == '-') {
300 | continue;
301 | } else if (word[i] < 'a' || word[i] > 'z') {
302 | // anything that is not a letter, digit, '.' or '-' is cut out of the string
303 | word = word.substr (0, i) + word.substr (i + 1, word.length () - (i + 1));
304 | i--; // re-examine the character that moved into position i
305 | len--;
306 | }
307 | }
308 | }
309 | /* Register 'word' in both glossaries unless it is already a key of 'glossaryStringToInt'; a new word gets the current size of 'glossaryStringToInt' as its id. */
310 | void addWordToGlossary (const string &word, map &glossaryIntToString,
311 | map &glossaryStringToInt)
312 | {
313 | if (glossaryStringToInt.find (word) == glossaryStringToInt.end ()) {
314 | int index = glossaryStringToInt.size (); // size is taken before the insertion, so ids start at 0
315 | glossaryStringToInt[word] = index;
316 | glossaryIntToString[index] = word; // keep the reverse mapping in step
317 | }
318 | }
319 |
--------------------------------------------------------------------------------
/TopicDetectionAndTracking/DataPreprocessing/DataPreprocessing.h:
--------------------------------------------------------------------------------
1 | /* Retrieve data from existing files (tkn_file & bnd_file), and do some pre-processing. */
2 |
3 | #ifndef DATA_PREPROCESSING_H
4 | #define DATA_PREPROCESSING_H
5 |
6 | #include "../Utilities/Story.h"
7 |
8 | /* Read from files, set the 'corpus' and 'glossary', and do some other preprocessing.
9 | Input: 'tknDir', 'bndDir', 'isWithStemmer'
10 | Output: 'corpus', 'glossaryIntToString', 'glossaryStringToInt', 'storiesIndexWithCertainWord'
11 | */
12 | void DataPreprocessing (vector &corpus,
13 | map &glossaryIntToString, map &glossaryStringToInt,
14 | map> &storiesIndexWithCertainWord,
15 | const string tknDir, const string bndDir,
16 | const bool &isWithStemmer = false);
17 |
18 | /* Ask the user how to read (specific files or a whole directory), then set the 'corpus' and 'glossary'.
19 | Input: 'tknDir', 'bndDir', 'isWithStemmer'
20 | Output: 'corpus', 'glossaryIntToString', 'glossaryStringToInt', 'storiesIndexWithCertainWord'
21 | */
22 | void readCorpus (vector &corpus,
23 | map &glossaryIntToString, map &glossaryStringToInt,
24 | map> &storiesIndexWithCertainWord,
25 | const string tknDir, const string bndDir, const bool &isWithStemmer = false);
26 |
27 | /* Read from the specific files named by the user, set the 'corpus' and 'glossary'.
28 | Input: 'tknDir', 'bndDir', 'isWithStemmer'
29 | Output: 'corpus', 'glossaryIntToString', 'glossaryStringToInt', 'storiesIndexWithCertainWord'
30 | */
31 | void readCorpusFromFile (vector &corpus,
32 | map &glossaryIntToString, map &glossaryStringToInt,
33 | map> &storiesIndexWithCertainWord,
34 | const string tknDir, const string bndDir, const bool &isWithStemmer = false);
35 |
36 | /* Read from files in the directory, set the 'corpus' and 'glossary'.
37 | Input: 'tknDir', 'bndDir', 'isWithStemmer'
38 | Output: 'corpus', 'glossaryIntToString', 'glossaryStringToInt', 'storiesIndexWithCertainWord'
39 | */
40 | void readCorpusFromDirectory (vector &corpus, map &glossaryIntToString,
41 | map &glossaryStringToInt,
42 | map> &storiesIndexWithCertainWord,
43 | const string tknDir, const string bndDir,
44 | const bool &isWithStemmer = false);
45 |
46 |
47 | /* Read from a bnd file to get the begin and the end of each story.
48 | Input: 'bnd_file'
49 | Output: 'corpus', 'Brecid', 'Erecid' */
50 | void readBndFile (vector &corpus, const string bndFile,
51 | vector &Brecid, vector &Erecid);
52 |
53 | /* Read from a tkn file, get the words for each story and set the glossary.
54 | Input: 'tkn_file', 'Brecid', 'Erecid'
55 | Output: 'corpus', 'glossaryIntToString', 'glossaryStringToInt', 'storiesIndexWithCertainWord' */
56 | void readTknFile (vector &corpus, const string tknFile,
57 | const vector &Brecid, const vector &Erecid,
58 | map &glossaryIntToString, map &glossaryStringToInt,
59 | map> &storiesIndexWithCertainWord);
60 |
61 | /* Process the word in place: remove punctuation (except '.' and '-') and convert all the letters to lowercase
62 | Input: 'word'
63 | Output: 'word' */
64 | void processWord (string &word);
65 |
66 | /* Add the word to both glossaries if the word does not exist yet
67 | Input: 'word'
68 | Output: 'glossaryIntToString', 'glossaryStringToInt' */
69 | void addWordToGlossary (const string &word, map &glossaryIntToString,
70 | map &glossaryStringToInt);
71 |
72 | #endif /* DATA_PREPROCESSING_H */
--------------------------------------------------------------------------------
/TopicDetectionAndTracking/Dataset/mttkn_bnd/20030401_0113_1041_AFP_ARB.mttkn_bnd:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
122 |
123 |
124 |
125 |
126 |
127 |
128 |
129 |
130 |
131 |
132 |
133 |
134 |
135 |
136 |
137 |
138 |
139 |
140 |
141 |
142 |
143 |
144 |
145 |
146 |
147 |
148 |
149 |
150 |
151 |
152 |
153 |
154 |
155 |
156 |
157 |
158 |
--------------------------------------------------------------------------------
/TopicDetectionAndTracking/Dataset/mttkn_bnd/20030401_0130_0330_XIN_MAN.mttkn_bnd:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |