├── example
    ├── .gitignore
    ├── CMakeLists.txt
    ├── example.srt
    ├── example-output.srt
    ├── main.cpp
    └── README.MD
├── LICENSE.txt
├── README.adoc
└── srtparser.h


/example/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/
2 | cmake-build-debug/


--------------------------------------------------------------------------------
/example/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 3.6)
2 | project(demo)
3 | 
4 | set(CMAKE_CXX_STANDARD 11)
5 | 
6 | include_directories(../)
7 | 
8 | set(SOURCE_FILES main.cpp ../srtparser.h)
9 | add_executable(demo ${SOURCE_FILES})


--------------------------------------------------------------------------------
/example/example.srt:
--------------------------------------------------------------------------------
 1 | 1
 2 | 00:00:00,520 --> 00:00:03,536
 3 | Chris: Elon, hey, welcome back to TED.
 4 | 
 5 | 2
 6 | 00:00:03,560 --> 00:00:04,936
 7 | It's great to have you here.
 8 | 
 9 | 3
10 | 00:00:04,960 --> 00:00:06,536
11 | Elon: Thanks for having me.
12 | 
13 | 4
14 | 00:00:06,560 --> 00:00:09,416
15 | (Applause) Thanks
16 | 
17 | 5
18 | 00:00:09,440 --> 00:00:11,256
19 | <i>we're going to spend some time</i>


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Original work Copyright (c) 2017 Oleksii Maryshchenko
 4 | Modified work Copyright (c) 2017 Saurabh Shrivastava
 5 | 
 6 | Permission is hereby granted, free of charge, to any person obtaining a copy
 7 | of this software and associated documentation files (the "Software"), to deal
 8 | in the Software without restriction, including without limitation the rights
 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 | 
13 | The above copyright notice and this permission notice shall be included in all
14 | copies or substantial portions of the Software.
15 | 
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | SOFTWARE.
23 | 


--------------------------------------------------------------------------------
/example/example-output.srt:
--------------------------------------------------------------------------------
 1 | BEGIN
 2 | start : 520
 3 | end : 3536
 4 | text : 
 5 | Chris: Elon, hey, welcome back to TED.
 6 | 
 7 | 
 8 | startString : 00:00:00,520
 9 | endString : 00:00:03,536
10 | justDialogue : 
11 | Elon, hey, welcome back to TED.
12 | 
13 | 
14 | ignore : 0
15 | speakerCount : 1
16 | speakers : Chris, 
17 | END
18 | 
19 | BEGIN
20 | start : 3560
21 | end : 4936
22 | text : 
23 | It's great to have you here.
24 | 
25 | 
26 | startString : 00:00:03,560
27 | endString : 00:00:04,936
28 | justDialogue : 
29 | It's great to have you here.
30 | 
31 | 
32 | ignore : 0
33 | speakerCount : 0
34 | END
35 | 
36 | BEGIN
37 | start : 4960
38 | end : 6536
39 | text : 
40 | Elon: Thanks for having me.
41 | 
42 | 
43 | startString : 00:00:04,960
44 | endString : 00:00:06,536
45 | justDialogue : 
46 | Thanks for having me.
47 | 
48 | 
49 | ignore : 0
50 | speakerCount : 1
51 | speakers : Elon, 
52 | END
53 | 
54 | BEGIN
55 | start : 6560
56 | end : 9416
57 | text : 
58 | (Applause) Thanks
59 | 
60 | 
61 | startString : 00:00:06,560
62 | endString : 00:00:09,416
63 | justDialogue : 
64 |  Thanks
65 | 
66 | 
67 | ignore : 0
68 | speakerCount : 0
69 | END
70 | 
71 | BEGIN
72 | start : 9440
73 | end : 11256
74 | text : 
75 | <i>we're going to spend some time</i>
76 | 
77 | startString : 00:00:09,440
78 | endString : 00:00:11,256
79 | justDialogue : 
80 | we're going to spend some time
81 | 
82 | ignore : 0
83 | speakerCount : 0
84 | END
85 | 
86 | 


--------------------------------------------------------------------------------
/example/main.cpp:
--------------------------------------------------------------------------------
 1 | #include "../srtparser.h"
 2 | using namespace std;
 3 | 
 4 | int main(int argc, char *argv[]) {
 5 | 
 6 |     // If filename not provided
 7 |     if(argc < 2) {
 8 |         cout << "Please pass the filename as an argument: ./a.out filename.srt\n";
 9 |         return 0;
10 |     }
11 | 
12 |     SubtitleParserFactory *subParserFactory = new SubtitleParserFactory(argv[1]);
13 |     SubtitleParser *parser = subParserFactory->getParser();
14 | 
15 |     std::vector<SubtitleItem*> sub = parser->getSubtitles();
16 | 
17 |     ofstream myfile;
18 |     myfile.open ("out.srt");
19 | 
20 |     for(SubtitleItem * element : sub)
21 |     {
22 |         myfile<<"BEGIN"<<endl;
23 |         myfile<<"startString : "<<element->getStartTimeString()<<endl;
24 |         myfile<<"start : "<<element->getStartTime()<<endl;
25 |         myfile<<"endString : "<<element->getEndTimeString()<<endl;
26 |         myfile<<"end : "<<element->getEndTime()<<endl;
27 |         myfile<<"text : "<<element->getText()<<endl;
28 |         myfile<<"justDialogue : "<<element->getDialogue()<<endl;
29 |         myfile<<"words count : "<<element->getWordCount()<<endl;
30 |         myfile<<"words :";
31 |         std::vector<std::string> word = element->getIndividualWords();
32 |             for(std::string display : word)
33 |                 myfile<<display<<", ";
34 |             myfile<<endl;
35 | 
36 |         myfile<<"speakerCount : "<<element->getSpeakerCount()<<endl;
37 |         myfile<<"speakers : ";
38 |         if(element->getSpeakerCount())
39 |         {
40 |             std::vector<std::string> name = element->getSpeakerNames();
41 |             for(std::string display : name)
42 |                 myfile<<display<<", ";
43 |             myfile<<endl;
44 |         }
45 | 
46 |         myfile<<"ignore : "<<element->getIgnoreStatus()<<endl;
47 |         myfile<<"END"<<endl<<endl;
48 |     }
49 | 
50 |     return 0;
51 | }


--------------------------------------------------------------------------------
/example/README.MD:
--------------------------------------------------------------------------------
  1 | # Example for srtparser.h : Simple, yet powerful C++ SRT Subtitle Parser Library.
  2 | 
  3 | ## How to run?
  4 | 
  5 | CMakeLists.txt is provided. Simply run using cmake.
  6 | 
  7 | The compiled tool needs the subtitle file as input.
  8 | 
  9 | `./a.out input.srt`
 10 | 
 11 | ## Output
 12 | 
 13 | A simple example resides in this directory (`main.cpp`). Demo subtitle file (`example.srt`) is also provided.
 14 | Its output is located in `example-output.srt`.
 15 | 
 16 | Code Snippet:
 17 | 
 18 | ```
 19 |         myfile<<"BEGIN"<<endl;
 20 |         myfile<<"startString : "<<element->getStartTimeString()<<endl;
 21 |         myfile<<"start : "<<element->getStartTime()<<endl;
 22 |         myfile<<"endString : "<<element->getEndTimeString()<<endl;
 23 |         myfile<<"end : "<<element->getEndTime()<<endl;
 24 |         myfile<<"text : "<<element->getText()<<endl;
 25 |         myfile<<"justDialogue : "<<element->getDialogue()<<endl;
 26 |         myfile<<"words count : "<<element->getWordCount()<<endl;
 27 |         myfile<<"words :";
 28 |         std::vector<std::string> word = element->getIndividualWords();
 29 |             for(std::string display : word)
 30 |                 myfile<<display<<", ";
 31 |             myfile<<endl;
 32 | 
 33 |         myfile<<"speakerCount : "<<element->getSpeakerCount()<<endl;
 34 |         myfile<<"speakers : ";
 35 |         if(element->getSpeakerCount())
 36 |         {
 37 |             std::vector<std::string> name = element->getSpeakerNames();
 38 |             for(std::string display : name)
 39 |                 myfile<<display<<", ";
 40 |             myfile<<endl;
 41 |         }
 42 | 
 43 |         myfile<<"ignore : "<<element->getIgnoreStatus()<<endl;
 44 |         myfile<<"END"<<endl<<endl;
 45 | ```
 46 | 
 47 | Input : 
 48 | 
 49 | ```
 50 | 
 51 | 1
 52 | 00:00:00,520 --> 00:00:03,536
 53 | Chris: Elon, hey, welcome back to TED.
 54 | 
 55 | 2
 56 | 00:00:03,560 --> 00:00:04,936
 57 | Chris : It's great to have you here.
 58 | Evan: Indeed it is.
 59 | 
 60 | 3
 61 | 00:00:04,960 --> 00:00:06,536
 62 | Elon: Thanks for having me.
 63 | 
 64 | 4
 65 | 00:00:06,560 --> 00:00:09,416
 66 | (Applause) Thanks
 67 | 
 68 | 5
 69 | 00:00:09,440 --> 00:00:11,256
 70 | <i>we're going to spend some time</i>
 71 | ```
 72 | 
 73 | Parsed Output :
 74 | 
 75 | ```
 76 | 
 77 | BEGIN
 78 | startString : 00:00:00,520
 79 | start : 520
 80 | endString : 00:00:03,536
 81 | end : 3536
 82 | text : Chris: Elon, hey, welcome back to TED.
 83 | justDialogue : Elon, hey, welcome back to TED.
 84 | words count : 6
 85 | words :Elon,, hey,, welcome, back, to, TED., 
 86 | speakerCount : 1
 87 | speakers : Chris, 
 88 | ignore : 0
 89 | END
 90 | 
 91 | BEGIN
 92 | startString : 00:00:03,560
 93 | start : 3560
 94 | endString : 00:00:04,936
 95 | end : 4936
 96 | text : Chris : It's great to have you here. Evan: Indeed it is.
 97 | justDialogue : It's great to have you here. Indeed it is.
 98 | words count : 9
 99 | words :It's, great, to, have, you, here., Indeed, it, is., 
100 | speakerCount : 2
101 | speakers : Chris ,  Evan, 
102 | ignore : 0
103 | END
104 | 
105 | BEGIN
106 | startString : 00:00:04,960
107 | start : 4960
108 | endString : 00:00:06,536
109 | end : 6536
110 | text : Elon: Thanks for having me.
111 | justDialogue : Thanks for having me.
112 | words count : 4
113 | words :Thanks, for, having, me., 
114 | speakerCount : 1
115 | speakers : Elon, 
116 | ignore : 0
117 | END
118 | 
119 | BEGIN
120 | startString : 00:00:06,560
121 | start : 6560
122 | endString : 00:00:09,416
123 | end : 9416
124 | text : (Applause) Thanks
125 | justDialogue : Thanks
126 | words count : 1
127 | words :Thanks, 
128 | speakerCount : 0
129 | speakers : ignore : 0
130 | END
131 | 
132 | BEGIN
133 | startString : 00:00:09,440
134 | start : 9440
135 | endString : 00:00:11,256
136 | end : 11256
137 | text : <i>we're going to spend some time</i>
138 | justDialogue : we're going to spend some time
139 | words count : 6
140 | words :we're, going, to, spend, some, time, 
141 | speakerCount : 0
142 | speakers : ignore : 0
143 | END
144 | 
145 | 
146 | 
147 | 
148 | ```


--------------------------------------------------------------------------------
/README.adoc:
--------------------------------------------------------------------------------
  1 | = srtparser.h : Simple, yet powerful C++ SRT Subtitle Parser Library.
  2 | A single header, simple, powerful full blown srt subtitle parser written in C++.
  3 | ___
  4 | 
  5 | https://github.com/saurabhshri/simple-yet-powerful-srt-subtitle-parser-cpp[srtparser.h] is a single header, simple and powerful C++ srt subtitle parsing library that allows you to easily handle, process and manipulate srt subtitle files in your project. It is an extension of Oleksii Maryshchenko's simple https://github.com/young-developer/subtitle-parser[subtitle-parser]. It has following features :
  6 | 
  7 | 1. It is a single header C++ (CPP) file, and can be easily used in your project.
  8 | 2. Focus on portability, efficiency and simplicity with no external dependency.
  9 | 3. A wide variety of functions at the programmer's disposal to parse srt files as needed.
 10 | 4. Capable of :
 11 | 	- extracting and stripping HTML and other styling tags from subtitle text.
 12 | 	- extracting and stripping speaker names.
 13 | 	- extracting and stripping non dialogue texts.
 14 | 5. Easy to extend with new functionality.
 15 | 
 16 | == How to use srtparser.h
 17 | 
 18 | === General usage ===
 19 | 
 20 | srtparser.h is a cross-platform, robust srt subtitle parser.
 21 | 
 22 | * Download `srtparser.h` from https://github.com/saurabhshri/simple-yet-powerful-srt-subtitle-parser-cpp
 23 | * Include the header file in your program.
 24 | 	`#include "lib/srtparser.h"`
 25 | * Create SubtitleParserFactory object. Use this factory object to create SubtitleParser object.
 26 | 
 27 | ```
 28 | SubtitleParserFactory *subParserFactory = new SubtitleParserFactory("inputFile.srt");
 29 | SubtitleParser *parser = subParserFactory->getParser();
 30 | 
 31 | //to get subtitles 
 32 | 
 33 | std::vector<SubtitleItem*> sub = parser->getSubtitles();
 34 | ```
 35 | 
 36 | * Call appropriate functions to perform parsing.
 37 | 
 38 | See demo usage in the `example` directory.
 39 | 
 40 | === Parser Functions ===
 41 | 
 42 | The following is a complete list of available parser functions.
 43 | 
 44 | Syntax:
 45 | 
 46 | 
 47 | 
 48 | [cols="2,1,2,5"]
 49 | |===
 50 | | Class | Return Type | Function | Description
 51 | 
 52 | | SubtitleParserFactory
 53 | | SubtitleParserFactory
 54 | | `SubtitleParserFactory("inputFile.srt")`
 55 | | Creates a SubtitleParserFactory object. Here the _inputFile.srt_ is the path of subtitle file to be parsed. This object is used to create parser.
 56 | 
 57 | _E.g.: ``SubtitleParserFactory *subParserFactory = new SubtitleParserFactory("inputFile.srt");``_
 58 | 
 59 | | SubtitleParserFactory
 60 | | SubtitleParser
 61 | | `getParser()`
 62 | | Returns the SubtitleParser object. This object will be used to parse the subtitle file.
 63 | 
 64 | _E.g.: ``SubtitleParser *parser = subParserFactory->getParser();``_
 65 | 
 66 | | SubtitleParser
 67 | | std::vector<SubtitleItem*>
 68 | | `getSubtitles()`
 69 | | Returns the parsed subtitles as a vector of SubtitleItem pointers.
 70 | 
 71 | _E.g.: ``std::vector<SubtitleItem*> sub = parser->getSubtitles();``_
 72 | 
 73 | | SubtitleParser
 74 | | std::string
 75 | | `getFileData()`
 76 | | Returns the complete file data read as it is from inputFile.srt
 77 | 
 78 | _E.g.: ``std::string fileData = parser->getFileData();``_
 79 | 
 80 | | SubtitleItem
 81 | | long int
 82 | | `getStartTime()`
 83 | | Returns the starting time of subtitle in milliseconds.
 84 | 
 85 | _E.g.: ``long int startTime = sub->getStartTime();``_
 86 | 
 87 | | SubtitleItem
 88 | | long int
 89 | | `getEndTime()`
 90 | | Returns the ending time of subtitle in milliseconds.
 91 | 
 92 | _E.g.: ``long int endTime = sub->getEndTime();``_
 93 | 
 94 | | SubtitleItem
 95 | | std::string
 96 | | `getStartTimeString()`
 97 | | Returns the starting time of subtitle in srt format.
 98 | 
 99 | _E.g.: ``std::string startTime = sub->getStartTimeString();``_
100 | 
101 | | SubtitleItem
102 | | std::string
103 | | `getEndTimeString()`
104 | | Returns the ending time of subtitle in srt format.
105 | 
106 | _E.g.: ``std::string endTime = sub->getEndTimeString();``_
107 | 
108 | | SubtitleItem
109 | | std::string
110 | | `getText()`
111 | | Returns the subtitle text as present in .srt file.
112 | 
113 | _E.g.: ``std::string text = sub->getText();``_
114 | 
115 | | SubtitleItem
116 | | std::string
117 | | `getDialogue(bool keepHTML, bool doNotIgnoreNonDialogues, bool doNotRemoveSpeakerNames);`
118 | | Returns the subtitle text after processing according to parameters.
119 | 
120 | keepHTML = 1 to stop parser from stripping style tags
121 | 
122 | doNotIgnoreNonDialogues = 1 to stop parser from ignoring and extracting non dialogue texts such as _(laughter)_.
123 | 
124 | doNotRemoveSpeakerNames = 1 to stop parser from ignoring and extracting speaker names
125 | 
126 | By default (0,0,0) values are passed.
127 | 
128 | _E.g.: ``std::string text = sub->getDialogue();``_
129 | 
130 | | SubtitleItem
131 | | int
132 | | `getWordCount()`
133 | | Returns the count of number of words present in the subtitle dialogue.
134 | 
135 | _E.g.: ``int wordCount = sub->getWordCount();``_
136 | 
137 | | SubtitleItem
138 | | std::vector<std::string>
139 | | `getIndividualWords()`
140 | | Returns string vector of individual words present in subtitle.
141 | 
142 | _E.g.: ``std::vector<std::string> words = sub->getIndividualWords();``_
143 | 
144 | | SubtitleItem
145 | | bool
146 | | `getIgnoreStatus()`
147 | | Returns the ignore status. Returns true if the `_justDialogue` field (i.e. the subtitle text after processing) is empty.
148 | 
149 | _E.g.: ``bool ignore = sub->getIgnoreStatus();``_
150 | 
151 | | SubtitleItem
152 | | int
153 | | `getSpeakerCount()`
154 | | Returns the count of number of speakers present in the subtitle.
155 | 
156 | _E.g.: ``int speakerCount = sub->getSpeakerCount();``_
157 | 
158 | | SubtitleItem
159 | | std::vector<std::string>
160 | | `getSpeakerNames()`
161 | | Returns string vector of speaker names.
162 | 
163 | _E.g.: ``std::vector<std::string> speakerNames = sub->getSpeakerNames();``_
164 | 
165 | | SubtitleItem
166 | | int
167 | | `getNonDialogueCount()`
168 | | Returns the count of number of non dialogue words present in the subtitle.
169 | 
170 | _E.g.: ``int nonDialogueCount = sub->getNonDialogueCount();``_
171 | 
172 | | SubtitleItem
173 | | std::vector<std::string>
174 | | `getNonDialogueWords()`
175 | | Returns string vector of non dialogue words.
176 | 
177 | _E.g.: ``std::vector<std::string> nonDialogueWords = sub->getNonDialogueWords();``_
178 | 
179 | | SubtitleItem
180 | | int
181 | | `getStyleTagCount()`
182 | | Returns the count of number of style tags present in the subtitle.
183 | 
184 | _E.g.: ``int styleTagCount = sub->getStyleTagCount();``_
185 | 
186 | | SubtitleItem
187 | | std::vector<std::string>
188 | | `getStyleTags()`
189 | | Returns string vector of style tags.
190 | 
191 | _E.g.: ``std::vector<std::string> styleTags = sub->getStyleTags();``_
192 | 
193 | | SubtitleWord
194 | | std::string
195 | | `getText()`
196 | | Returns the subtitle text as present in .srt file.
197 | 
198 | _E.g.: ``std::string text = sub->getText();``_
199 | 
200 | |===
201 | 
202 | ## Examples
203 | 
204 | While I've tried to include examples in the above table, a compilation of all of them together in a single C++ program can be found in `example` directory.
205 | 
206 | ## Contributing
207 | 
208 | Suggestions, feature requests, PRs, bug reports and bug fixes are welcome. I'll be thankful.
209 | 
210 | ## Credits
211 | 
212 | Built upon an MIT-licensed simple subtitle parser called LibSub-Parser by Oleksii Maryshchenko.
213 | 
214 | The original parser had 3 major functions : getStartTime(), getEndTime() and getText(). 
215 | 
216 | Rest work done by Saurabh Shrivastava, originally for using this in his https://saurabhshri.github.io/2017/05/gsoc/creating-a-full-blown-srt-subtitle-parser[GSoC project].


--------------------------------------------------------------------------------
/srtparser.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Author   : Saurabh Shrivastava
  3 |  * Email    : saurabh.shrivastava54@gmail.com
  4 |  * Link     : https://github.com/saurabhshri
  5 |  *
  6 |  * Based on subtitle-parser by Oleksii Maryshchenko.
  7 |  * Email    : young_developer@mail.ru
  8 |  * Link     : https://github.com/young-developer/subtitle-parser
  9 |  */
 10 | 
 11 | #ifndef SRTPARSER_H
 12 | #define SRTPARSER_H
 13 | 
 14 | #include <iostream>
 15 | #include <fstream>
 16 | #include <sstream>
 17 | #include <vector>
 18 | #include <algorithm>
 19 | #include <iterator>
 20 | 
 21 | //function for splitting sentences based on supplied delimiter
 22 | inline std::vector<std::string> &split(const std::string &s, char delim, std::vector<std::string> &elems) {
 23 |     std::stringstream ss(s);
 24 |     std::string item;
 25 | 
 26 |     while (getline(ss, item, delim)) {
 27 |         elems.push_back(item);
 28 |     }
 29 |     return elems;
 30 | }
 31 | 
 32 | /**** Class definitions ****/
 33 | 
 34 | 
 35 | class SubtitleWord
 36 | {
 37 | private:
 38 |     std::string _text;
 39 | public:
 40 |     SubtitleWord(void);
 41 |     SubtitleWord(std::string text);
 42 |     virtual std::string getText() const;
 43 |     ~SubtitleWord(void);
 44 | };
 45 | 
 46 | class SubtitleItem
 47 | {
 48 | private:
 49 |     long int _startTime;                    //in milliseconds
 50 |     long int _endTime;
 51 |     std::string _text;                      //actual line, as present in subtitle file
 52 |     long int timeMSec(std::string value);   //converts time string into ms
 53 | 
 54 |     int _subNo;                              //subtitle number
 55 |     std::string _startTimeString;           //time as in srt format
 56 |     std::string _endTimeString;
 57 |     bool _ignore;                           //should subtitle be ignore; used when the subtitle is empty after processing
 58 |     std::string _justDialogue;              //contains processed subtitle - stripped style, non dialogue text removal etc.
 59 |     int _speakerCount;                      //count of number of speakers
 60 |     std::vector<std::string> _speaker;      //list of speakers in a single subtitle
 61 |     int _nonDialogueCount;                  //count of non spoken words in a subtitle
 62 |     std::vector<std::string> _nonDialogue;  //list of non dialogue words, e.g. (applause)
 63 |     int _wordCount;                         //number of words in _justDialogue
 64 |     std::vector<std::string> _word;         //list of words in dialogue
 65 |     std::vector<long int> _wordStartTime;   //start time of each word in dialogue
 66 |     std::vector<long int> _wordEndTime;     //end time of each word in dialogue
 67 |     std::vector<long int> _wordDuration;   //actual duration of each word without silence
 68 |     int _styleTagCount;                     //count of style tags in a single subtitle
 69 |     std::vector<std::string> _styleTag;     //list of style tags in that subtitle
 70 |     void extractInfo(bool keepHTML = 0, bool doNotIgnoreNonDialogues = 0,  bool doNotRemoveSpeakerNames = 0);   //process subtitle
 71 | public:
 72 |     long int getStartTime() const;          //returns starting time in ms
 73 |     long int getEndTime() const;            //returns ending time in ms
 74 |     std::string getText() const;            //returns subtitle text as present in .srt file
 75 | 
 76 |     int getSubNo() const;              //returns subtitle number
 77 |     std::string getStartTimeString() const; //returns sarting time as present in .srt file
 78 |     std::string getEndTimeString() const;   //returns ending time as present in .srt file
 79 |     bool getIgnoreStatus() const;           //returns status, whether the subtitle is ignorable or not after processing
 80 |     std::string getDialogue(bool keepHTML = 0, bool doNotIgnoreNonDialogues = 0,  bool doNotRemoveSpeakerNames = 0); //returns processed subtitle
 81 |     int getSpeakerCount() const;            //return speaker count
 82 |     int getNonDialogueCount() const;        //return non dialogue words count
 83 |     int getStyleTagCount() const;           //return style tags count
 84 |     int getWordCount() const;               //return words count
 85 |     std::vector<std::string> getIndividualWords(); //return string vector of individual words
 86 |     std::string getWordByIndex(int index);       //return word stored at 'index'
 87 |     std::vector<long int> getWordStartTimes();   //return long int vector of start time of individual words
 88 |     std::vector<long int> getWordEndTimes();     //return long int vector of end time of individual words
 89 |     long int getWordStartTimeByIndex(int index); //return the start time of a word based on index
 90 |     long int getWordEndTimeByIndex (int index);  //return the end time of a word based on index
 91 |     std::vector<std::string> getSpeakerNames();  //return string vector of speaker names
 92 |     std::vector<std::string> getNonDialogueWords(); //return string vector of non dialogue words
 93 |     std::vector<std::string> getStyleTags();    //return string vector of style tags
 94 | 
 95 | 
 96 |     void setStartTime(long int startTime);  //set starting time
 97 |     void setEndTime(long int endTime);      //set ending time
 98 |     void setText(std::string text);         //set subtitle text
 99 |     void setWordTimes(std::vector<long int> wordStartTime, std::vector<long int> wordEndTime, std::vector<long int> wordDuration);  //assign time to individual words
100 | 
101 |     SubtitleItem(void);
102 |     SubtitleItem(int subNo, std::string startTime,std::string endTime, std::string text, bool ignore = false,
103 |                  std::string justDialogue = "" , int speakerCount = 0, int nonDialogueCount = 0,
104 |                  int styleTagCount = 0, int wordCount = 0, std::vector<std::string> speaker = std::vector<std::string>(),
105 |                  std::vector<std::string> nonDialogue = std::vector<std::string>(),
106 |                  std::vector<std::string> styleTags = std::vector<std::string>(),
107 |                  std::vector<std::string> word = std::vector<std::string>());  //default constructor
108 |     ~SubtitleItem(void);
109 | };
110 | 
/*
 * Abstract base class for subtitle parsers.
 *
 * Holds the list of parsed subtitles and the name of the file they came from.
 * Concrete parsers implement parse() to fill _subtitles.
 */
class SubtitleParser
{
protected:
    std::vector<SubtitleItem*> _subtitles;              //stores subtitles
    std::string _fileName;                              //supplied filename
    virtual void parse(std::string fileName) = 0;       //format-specific parsing, implemented by subclasses
public:
    virtual std::vector<SubtitleItem*> getSubtitles();  //returns subtitles
    std::string getFileData();                          //returns the contents of _fileName, read line by line
    SubtitleParser(void);
    virtual ~SubtitleParser(void);
};
123 | 
/*
 * Factory that remembers a subtitle file name and creates a parser for it.
 */
class SubtitleParserFactory
{
private:
    std::string _fileName;                              //file name handed to the created parser
public:
    SubtitleParser* getParser();                        //creates a new parser for _fileName with `new`
    SubtitleParserFactory(std::string fileName);
    ~SubtitleParserFactory(void);
};
133 | 
134 | class SubRipParser : public SubtitleParser
135 | {
136 |     void parse(std::string fileName);
137 | public:
138 |     SubRipParser(void);
139 |     SubRipParser(std::string fileName);
140 |     ~SubRipParser(void);
141 | };
142 | 
143 | 
144 | /**** Function definitions ****/
145 | 
146 | //1. SubtitleParserFactory class
147 | 
148 | inline SubtitleParserFactory::SubtitleParserFactory(std::string fileName)
149 | {
150 |     _fileName = fileName;
151 | }
152 | 
153 | inline SubtitleParser* SubtitleParserFactory::getParser()
154 | {
155 |     return new SubRipParser(_fileName);                 //creates and returns SubRipParser obj
156 | }
157 | 
158 | inline SubtitleParserFactory::~SubtitleParserFactory(void)
159 | {
160 | }
161 | 
162 | //2. SubtitleParser class
163 | 
164 | inline std::vector<SubtitleItem*> SubtitleParser::getSubtitles()
165 | {
166 |     return _subtitles;
167 | }
168 | 
169 | inline std::string SubtitleParser::getFileData()           //returns whole read file i.e. contents of input.srt
170 | {
171 |     std::ifstream infile(_fileName);
172 |     std::string allData = "";
173 |     std::string line;
174 |     while (std::getline(infile, line))
175 |     {
176 |         std::istringstream iss(line);
177 |         allData += line + "\n";
178 |     }
179 |     return allData;
180 | 
181 | }
182 | 
183 | inline SubtitleParser::SubtitleParser(void)
184 | {
185 | 
186 | }
187 | 
188 | inline SubtitleParser::~SubtitleParser(void)
189 | {
190 | }
191 | 
192 | //3. SubRipParser class
193 | 
194 | inline SubRipParser::SubRipParser(void)
195 | {
196 | }
197 | 
198 | inline void SubRipParser::parse(std::string fileName)      //srt parser
199 | {
200 | 
201 |     std::ifstream infile(fileName);
202 |     std::string line, start, end, completeLine = "", timeLine = "";
203 |     int subNo, turn = 0;
204 | 
205 |     /*
206 |      * turn = 0 -> Add subtitle number
207 |      * turn = 1 -> Add string to timeLine
208 |      * turn > 1 -> Add string to completeLine
209 |      */
210 | 
211 |     while (std::getline(infile, line))
212 |     {
213 |         line.erase(remove(line.begin(), line.end(), '\r'), line.end());
214 | 
215 |         if (line.compare(""))
216 |         {
217 |             if(!turn)
218 |             {
219 |                 subNo=atoi(line.c_str());
220 |                 turn++;
221 |                 continue;
222 |             }
223 | 
224 |             if (line.find("-->") != std::string::npos)
225 |             {
226 |                 timeLine += line;
227 | 
228 |                 std::vector<std::string> srtTime;
229 |                 srtTime = split(timeLine, ' ', srtTime);
230 |                 start = srtTime[0];
231 |                 end = srtTime[2];
232 | 
233 |             }
234 |             else
235 |             {
236 |                 if (completeLine != "")
237 |                     completeLine += " ";
238 | 
239 |                 completeLine += line;
240 |             }
241 | 
242 |             turn++;
243 |         }
244 | 
245 |         else
246 |         {
247 |             turn = 0;
248 |             _subtitles.push_back(new SubtitleItem(subNo,start,end,completeLine));
249 |             completeLine = timeLine = "";
250 |         }
251 | 
252 |         if(infile.eof())    //insert last remaining subtitle
253 |         {
254 |             _subtitles.push_back(new SubtitleItem(subNo,start,end,completeLine));
255 |         }
256 |     }
257 | }
258 | 
259 | inline SubRipParser::SubRipParser(std::string fileName)
260 | {
261 |     _fileName = fileName;
262 |     parse(fileName);
263 | }
264 | 
265 | inline SubRipParser::~SubRipParser(void)
266 | {
267 |     for(int i=0;i != _subtitles.size();++i)
268 |     {
269 |         if(_subtitles[i])
270 |             delete _subtitles[i];
271 |     }
272 | }
273 | 
274 | //4. SubtitleItem class
275 | 
276 | inline SubtitleItem::SubtitleItem(void)
277 | {
278 | }
279 | 
280 | inline SubtitleItem::SubtitleItem(int subNo, std::string startTime,std::string endTime, std::string text, bool ignore,
281 |                            std::string justDialogue, int speakerCount, int nonDialogueCount,
282 |                            int styleTagCount, int wordCount, std::vector<std::string> speaker, std::vector<std::string> nonDialogue,
283 |                            std::vector<std::string> styleTags, std::vector<std::string> word)
284 | {
285 |     _startTime = timeMSec(startTime);
286 |     _endTime = timeMSec(endTime);
287 |     _text = text;
288 | 
289 |     _subNo = subNo;
290 |     _startTimeString = startTime;
291 |     _endTimeString = endTime;
292 |     _ignore = ignore;
293 |     _justDialogue = justDialogue;
294 |     _speakerCount = speakerCount;
295 |     _nonDialogueCount = nonDialogueCount;
296 |     _wordCount = wordCount;
297 |     _speaker = speaker;
298 |     _styleTagCount = styleTagCount;
299 |     _styleTag = styleTags;
300 |     _nonDialogue = nonDialogue;
301 |     _word = word;
302 | 
303 |     extractInfo();
304 | }
305 | 
306 | inline long int SubtitleItem::timeMSec(std::string value)
307 | {
308 |     std::vector<std::string> t, secs;
309 |     int hours, mins, seconds, milliseconds;
310 | 
311 |     t = split(value, ':', t);
312 |     hours = atoi(t[0].c_str());
313 |     mins = atoi(t[1].c_str());
314 | 
315 |     secs = split(t[2], ',', secs);
316 |     seconds = atoi(secs[0].c_str());
317 |     milliseconds = atoi(secs[1].c_str());
318 | 
319 |     return hours * 3600000 + mins * 60000 + seconds * 1000 + milliseconds;
320 | }
321 | 
322 | inline long int SubtitleItem::getStartTime() const
323 | {
324 |     return _startTime;
325 | }
326 | inline long int SubtitleItem::getEndTime() const
327 | {
328 |     return _endTime;
329 | }
330 | 
331 | inline std::string SubtitleItem::getText() const
332 | {
333 |     return _text;
334 | }
335 | 
336 | inline void SubtitleItem::setStartTime(long int startTime)
337 | {
338 |     _startTime = startTime;
339 | }
340 | inline void SubtitleItem::setEndTime(long int endTime)
341 | {
342 |     _endTime = endTime;
343 | }
344 | inline void SubtitleItem::setText(std::string text)
345 | {
346 |     _text = text;
347 | }
348 | inline void SubtitleItem::setWordTimes(std::vector<long int> wordStartTime, std::vector<long int> wordEndTime, std::vector<long int> wordDuration)
349 | {
350 |     _wordStartTime = wordStartTime;
351 |     _wordEndTime = wordEndTime;
352 |     _wordDuration = wordDuration;
353 | }
354 | inline int SubtitleItem::getSubNo() const
355 | {
356 |     return _subNo;
357 | }
358 | inline std::string SubtitleItem::getStartTimeString() const
359 | {
360 |     return _startTimeString;
361 | }
362 | 
363 | inline std::string SubtitleItem::getEndTimeString() const
364 | {
365 |     return _endTimeString;
366 | }
367 | 
368 | inline bool SubtitleItem::getIgnoreStatus() const
369 | {
370 |     if(_ignore)
371 |         return true;
372 | 
373 |     else
374 |         return false;
375 | 
376 | }
377 | 
378 | inline void SubtitleItem::extractInfo(bool keepHTML, bool doNotIgnoreNonDialogues, bool doNotRemoveSpeakerNames)   //process subtitle
379 | {
380 |     std::string output = _text;
381 | 
382 |     //stripping HTML tags
383 |     if(!keepHTML)
384 |     {
385 |         /*
386 |          * TODO : Before erasing, extract the words.
387 |          * std::vector<std::string> getStyleTags();
388 |          * int getStyleTagCount() const;
389 |          * std::vector<std::string> _styleTag;
390 |          * int _styleTagCount;
391 |          */
392 | 
393 |         int countP = 0;
394 |         for(char& c : output) // replacing <...> with ~~~~
395 |         {
396 |             if(c=='<')
397 |             {
398 |                 countP++;
399 |                 c = '~';
400 |             }
401 | 
402 |             else
403 |             {
404 |                 if(countP!=0)
405 |                 {
406 |                     if(c != '>')
407 |                         c = '~';
408 | 
409 |                     else if(c == '>')
410 |                     {
411 |                         c = '~';
412 |                         countP--;
413 |                     }
414 |                 }
415 |             }
416 |         }
417 |     }
418 | 
419 |     //stripping non dialogue data e.g. (applause)
420 | 
421 |     if(!doNotIgnoreNonDialogues)
422 |     {
423 |         /*
424 |          * TODO : Before erasing, extract the words.
425 |          * std::vector<std::string> getNonDialogueWords();
426 |          * int getNonDialogueCount() const;
427 |          * std::vector<std::string> _nonDialogue;
428 |          * int _nonDialogueCount;
429 |          */
430 | 
431 |         int countP = 0;
432 |         for(char& c : output)   // replacing (...) with ~~~~
433 |         {
434 |             if(c=='(')
435 |             {
436 |                 countP++;
437 |                 c = '~';
438 |             }
439 | 
440 |             else
441 |             {
442 |                 if(countP!=0)
443 |                 {
444 |                     if(c != ')')
445 |                         c = '~';
446 | 
447 |                     else if(c == ')')
448 |                     {
449 |                         c = '~';
450 |                         countP--;
451 |                     }
452 |                 }
453 |             }
454 |         }
455 |     }
456 | 
457 |     output.erase(std::remove(output.begin(), output.end(), '~'), output.end()); // deleting all ~
458 | 
459 |     //Extracting speaker names
460 |     if(!doNotRemoveSpeakerNames)
461 |     {
462 |         for(int i=0; output[i]!='\0';i++)
463 |         {
464 |             int colonIndex = 0, nameBeginIndex = 0;
465 |             if(output[i]==':')  //speaker found; travel back
466 |             {
467 |                 _speakerCount++;
468 |                 colonIndex = i;
469 | 
470 |                 int tempIndex = 0, foundEvilColon = 0, continueFlag = 0, spaceBeforeColon = 0;
471 | 
472 |                 if(output[i-1] == ' ')
473 |                     spaceBeforeColon = 2;
474 | 
475 |                 /*
476 |                 Possible Cases :
477 | 
478 |                 Elon Musk: Hey Saurabh, you are pretty smart.       // First and Last Name
479 |                 Saurabh: *_* What? Elon Musk: Yes!                  // Two names in single line
480 |                 Saurabh : OMG OMG!                                  // Space before colon
481 |                 Elon: LOL World: LAMAO
482 |                 Saurabh: ._.                                        // normal
483 | 
484 |                  */
485 | 
486 |                 for(int j=i - spaceBeforeColon; j>=0;j--)
487 |                 {
488 |                     if(output[j] == '.' || output[j] == '!' || output[j] == ',' || output[j] == '?' || output[j] == '\n'
489 |                        || output[j] == ' ' || j== 0)
490 |                     {
491 | 
492 |                         if(output[j] == '.' || output[j] == '!' || output[j] == ',' || output[j] == '?' || j == 0)
493 |                         {
494 |                             if((continueFlag && j == 0))
495 |                             {
496 |                                 if(!isupper(output[j]))
497 |                                 {
498 |                                     nameBeginIndex = tempIndex;
499 |                                     break;
500 |                                 }
501 | 
502 |                                 else
503 |                                     tempIndex = j;
504 | 
505 |                             }
506 | 
507 |                             else if(j!=0)
508 |                                 tempIndex = j + 1;
509 |                         }
510 | 
511 |                         else if(output[j] == ' ' && isupper(output[j+1]))
512 |                         {
513 |                             tempIndex = j;
514 |                             continueFlag = 1;
515 | 
516 |                             continue;
517 |                         }
518 | 
519 |                         else if(output[j] == ' ' && !isupper(output[j+1] && tempIndex == 0))
520 |                         {
521 |                             _speakerCount--;
522 |                             foundEvilColon = 1;
523 |                             break;
524 |                         }
525 | 
526 |                         nameBeginIndex = tempIndex;
527 |                         break;
528 |                     }
529 |                 }
530 | 
531 |                 if(foundEvilColon)
532 |                     continue;
533 | 
534 |                 i = nameBeginIndex; //compensating the removal and changes in index
535 | 
536 |                 //check if there's a space after colon i.e. A: Hello vs A:Hello
537 |                 int removeSpace = 0;
538 |                 if(output[colonIndex + 1]==' ')
539 |                     removeSpace = 1;
540 | 
541 |                 _speaker.push_back(output.substr(nameBeginIndex, colonIndex - nameBeginIndex));
542 |                 output.erase(nameBeginIndex, colonIndex - nameBeginIndex + removeSpace);
543 |             }
544 | 
545 |         }
546 | 
547 |     }
548 | 
549 |     // removing more than one whitespaces with one space
550 |     unique_copy (output.begin(), output.end(), std::back_insert_iterator<std::string>(_justDialogue),
551 |                  [](char a,char b)
552 |                  {
553 |                      return isspace(a) && isspace(b);
554 |                  });
555 | 
556 |     // trimming whitespaces
557 |     const char* whiteSpaces = " \t\n\r\f\v";
558 |     _justDialogue.erase(0, _justDialogue.find_first_not_of(whiteSpaces));
559 |     _justDialogue.erase(_justDialogue.find_last_not_of(whiteSpaces) + 1);
560 | 
561 |     if(_justDialogue.empty() || _justDialogue == " ")
562 |         _ignore = true;
563 | 
564 |     else
565 |     {
566 |         _word = split(_justDialogue, ' ', _word); //extracting individual words
567 |         _wordCount = _word.size();
568 |     }
569 | }
570 | 
571 | inline std::string SubtitleItem::getDialogue(bool keepHTML, bool doNotIgnoreNonDialogues,  bool doNotRemoveSpeakerNames)
572 | {
573 |     if(_justDialogue.empty())
574 |         extractInfo(keepHTML, doNotIgnoreNonDialogues, doNotRemoveSpeakerNames);
575 | 
576 |     return _justDialogue;
577 | }
578 | inline int SubtitleItem::getSpeakerCount() const
579 | {
580 |     return _speakerCount;
581 | }
582 | inline int SubtitleItem::getNonDialogueCount() const
583 | {
584 |     return _nonDialogueCount;
585 | }
586 | inline int SubtitleItem::getStyleTagCount() const
587 | {
588 |     return _styleTagCount;
589 | }
590 | inline int SubtitleItem::getWordCount() const
591 | {
592 |     return _wordCount;
593 | }
594 | inline std::vector<std::string> SubtitleItem::getSpeakerNames()
595 | {
596 |     return _speaker;
597 | }
598 | inline std::vector<std::string> SubtitleItem::getNonDialogueWords()
599 | {
600 |     return _nonDialogue;
601 | }
602 | inline std::vector<std::string> SubtitleItem::getIndividualWords()
603 | {
604 |     return _word;
605 | }
606 | inline std::string SubtitleItem::getWordByIndex(int index)
607 | {
608 |     return _word[index];
609 | }
610 | inline std::vector<long int> SubtitleItem::getWordStartTimes()
611 | {
612 |     return _wordStartTime;
613 | }
614 | inline std::vector<long int> SubtitleItem::getWordEndTimes()
615 | {
616 |     return _wordEndTime;
617 | }
618 | inline long int SubtitleItem::getWordStartTimeByIndex(int index)
619 | {
620 |     return _wordStartTime[index];
621 | }
622 | inline long int SubtitleItem::getWordEndTimeByIndex(int index)
623 | {
624 |     return _wordEndTime[index];
625 | }
626 | inline std::vector<std::string> SubtitleItem::getStyleTags()
627 | {
628 |     return _styleTag;
629 | }
630 | inline SubtitleItem::~SubtitleItem(void)
631 | {
632 | 
633 | }
634 | 
//5. SubtitleWord class
636 | 
637 | inline SubtitleWord::SubtitleWord(void)
638 | {
639 |     _text = "";
640 | }
641 | 
642 | inline SubtitleWord::SubtitleWord(std::string text)
643 | {
644 |     _text = text;
645 | }
646 | 
647 | inline std::string SubtitleWord::getText() const
648 | {
649 |     return _text;
650 | }
651 | 
652 | inline SubtitleWord::~SubtitleWord(void)
653 | {
654 | }
655 | 
656 | 
657 | #endif //SRTPARSER_H


--------------------------------------------------------------------------------