├── example
│   ├── .gitignore
│   ├── CMakeLists.txt
│   ├── example.srt
│   ├── example-output.srt
│   ├── main.cpp
│   └── README.MD
├── LICENSE.txt
├── README.adoc
└── srtparser.h
/example/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/
2 | cmake-build-debug/
--------------------------------------------------------------------------------
/example/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 3.6)
2 | project(demo)
3 |
4 | set(CMAKE_CXX_STANDARD 11)
5 |
6 | include_directories(../)
7 |
8 | set(SOURCE_FILES main.cpp ../srtparser.h)
9 | add_executable(demo ${SOURCE_FILES})
--------------------------------------------------------------------------------
/example/example.srt:
--------------------------------------------------------------------------------
1 | 1
2 | 00:00:00,520 --> 00:00:03,536
3 | Chris: Elon, hey, welcome back to TED.
4 |
5 | 2
6 | 00:00:03,560 --> 00:00:04,936
7 | It's great to have you here.
8 |
9 | 3
10 | 00:00:04,960 --> 00:00:06,536
11 | Elon: Thanks for having me.
12 |
13 | 4
14 | 00:00:06,560 --> 00:00:09,416
15 | (Applause) Thanks
16 |
17 | 5
18 | 00:00:09,440 --> 00:00:11,256
19 | we're going to spend some time
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Original work Copyright (c) 2017 Oleksii Maryshchenko
4 | Modified work Copyright (c) 2017 Saurabh Shrivastava
5 |
6 | Permission is hereby granted, free of charge, to any person obtaining a copy
7 | of this software and associated documentation files (the "Software"), to deal
8 | in the Software without restriction, including without limitation the rights
9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 |
13 | The above copyright notice and this permission notice shall be included in all
14 | copies or substantial portions of the Software.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | SOFTWARE.
23 |
--------------------------------------------------------------------------------
/example/example-output.srt:
--------------------------------------------------------------------------------
1 | BEGIN
2 | start : 520
3 | end : 3536
4 | text :
5 | Chris: Elon, hey, welcome back to TED.
6 |
7 |
8 | startString : 00:00:00,520
9 | endString : 00:00:03,536
10 | justDialogue :
11 | Elon, hey, welcome back to TED.
12 |
13 |
14 | ignore : 0
15 | speakerCount : 1
16 | speakers : Chris,
17 | END
18 |
19 | BEGIN
20 | start : 3560
21 | end : 4936
22 | text :
23 | It's great to have you here.
24 |
25 |
26 | startString : 00:00:03,560
27 | endString : 00:00:04,936
28 | justDialogue :
29 | It's great to have you here.
30 |
31 |
32 | ignore : 0
33 | speakerCount : 0
34 | END
35 |
36 | BEGIN
37 | start : 4960
38 | end : 6536
39 | text :
40 | Elon: Thanks for having me.
41 |
42 |
43 | startString : 00:00:04,960
44 | endString : 00:00:06,536
45 | justDialogue :
46 | Thanks for having me.
47 |
48 |
49 | ignore : 0
50 | speakerCount : 1
51 | speakers : Elon,
52 | END
53 |
54 | BEGIN
55 | start : 6560
56 | end : 9416
57 | text :
58 | (Applause) Thanks
59 |
60 |
61 | startString : 00:00:06,560
62 | endString : 00:00:09,416
63 | justDialogue :
64 | Thanks
65 |
66 |
67 | ignore : 0
68 | speakerCount : 0
69 | END
70 |
71 | BEGIN
72 | start : 9440
73 | end : 11256
74 | text :
75 | we're going to spend some time
76 |
77 | startString : 00:00:09,440
78 | endString : 00:00:11,256
79 | justDialogue :
80 | we're going to spend some time
81 |
82 | ignore : 0
83 | speakerCount : 0
84 | END
85 |
86 |
--------------------------------------------------------------------------------
/example/main.cpp:
--------------------------------------------------------------------------------
1 | #include "../srtparser.h"
2 | using namespace std;
3 |
4 | int main(int argc, char *argv[]) {
5 |
6 | // If filename not provided
7 | if(argc < 2) {
8 | cout << "Please pass the filename as an argument: ./a.out filename.srt\n";
9 | return 0;
10 | }
11 |
12 | SubtitleParserFactory *subParserFactory = new SubtitleParserFactory(argv[1]);
13 | SubtitleParser *parser = subParserFactory->getParser();
14 |
15 | std::vector sub = parser->getSubtitles();
16 |
17 | ofstream myfile;
18 | myfile.open ("out.srt");
19 |
20 | for(SubtitleItem * element : sub)
21 | {
22 | myfile<<"BEGIN"<getStartTimeString()<getStartTime()<getEndTimeString()<getEndTime()<getText()<getDialogue()<getWordCount()< word = element->getIndividualWords();
32 | for(std::string display : word)
33 | myfile<getSpeakerCount()<getSpeakerCount())
39 | {
40 | std::vector name = element->getSpeakerNames();
41 | for(std::string display : name)
42 | myfile<getIgnoreStatus()<getStartTimeString()<getStartTime()<getEndTimeString()<getEndTime()<getText()<getDialogue()<getWordCount()< word = element->getIndividualWords();
29 | for(std::string display : word)
30 | myfile<getSpeakerCount()<getSpeakerCount())
36 | {
37 | std::vector name = element->getSpeakerNames();
38 | for(std::string display : name)
39 | myfile<getIgnoreStatus()< 00:00:03,536
53 | Chris: Elon, hey, welcome back to TED.
54 |
55 | 2
56 | 00:00:03,560 --> 00:00:04,936
57 | Chris : It's great to have you here.
58 | Evan: Indeed it is.
59 |
60 | 3
61 | 00:00:04,960 --> 00:00:06,536
62 | Elon: Thanks for having me.
63 |
64 | 4
65 | 00:00:06,560 --> 00:00:09,416
66 | (Applause) Thanks
67 |
68 | 5
69 | 00:00:09,440 --> 00:00:11,256
70 | we're going to spend some time
71 | ```
72 |
73 | Parsed Output :
74 |
75 | ```
76 |
77 | BEGIN
78 | startString : 00:00:00,520
79 | start : 520
80 | endString : 00:00:03,536
81 | end : 3536
82 | text : Chris: Elon, hey, welcome back to TED.
83 | justDialogue : Elon, hey, welcome back to TED.
84 | words count : 6
85 | words :Elon,, hey,, welcome, back, to, TED.,
86 | speakerCount : 1
87 | speakers : Chris,
88 | ignore : 0
89 | END
90 |
91 | BEGIN
92 | startString : 00:00:03,560
93 | start : 3560
94 | endString : 00:00:04,936
95 | end : 4936
96 | text : Chris : It's great to have you here. Evan: Indeed it is.
97 | justDialogue : It's great to have you here. Indeed it is.
98 | words count : 9
99 | words :It's, great, to, have, you, here., Indeed, it, is.,
100 | speakerCount : 2
101 | speakers : Chris , Evan,
102 | ignore : 0
103 | END
104 |
105 | BEGIN
106 | startString : 00:00:04,960
107 | start : 4960
108 | endString : 00:00:06,536
109 | end : 6536
110 | text : Elon: Thanks for having me.
111 | justDialogue : Thanks for having me.
112 | words count : 4
113 | words :Thanks, for, having, me.,
114 | speakerCount : 1
115 | speakers : Elon,
116 | ignore : 0
117 | END
118 |
119 | BEGIN
120 | startString : 00:00:06,560
121 | start : 6560
122 | endString : 00:00:09,416
123 | end : 9416
124 | text : (Applause) Thanks
125 | justDialogue : Thanks
126 | words count : 1
127 | words :Thanks,
128 | speakerCount : 0
129 | speakers : ignore : 0
130 | END
131 |
132 | BEGIN
133 | startString : 00:00:09,440
134 | start : 9440
135 | endString : 00:00:11,256
136 | end : 11256
137 | text : we're going to spend some time
138 | justDialogue : we're going to spend some time
139 | words count : 6
140 | words :we're, going, to, spend, some, time,
141 | speakerCount : 0
142 | speakers : ignore : 0
143 | END
144 |
145 |
146 |
147 |
148 | ```
--------------------------------------------------------------------------------
/README.adoc:
--------------------------------------------------------------------------------
1 | = srtparser.h : Simple, yet powerful C++ SRT Subtitle Parser Library.
2 | A single header, simple, powerful full blown srt subtitle parser written in C++.
3 | ___
4 |
5 | https://github.com/saurabhshri/simple-yet-powerful-srt-subtitle-parser-cpp[srtparser.h] is a single header, simple and powerful C++ srt subtitle parsing library that allows you to easily handle, process and manipulate srt subtitle files in your project. It is an extension of Oleksii Maryshchenko's simple https://github.com/young-developer/subtitle-parser[subtitle-parser]. It has the following features:
6 |
7 | 1. It is a single header C++ (CPP) file, and can be easily used in your project.
8 | 2. Focus on portability, efficiency and simplicity with no external dependency.
9 | 3. A wide variety of functions at the programmer's disposal to parse srt files as needed.
10 | 4. Capable of :
11 | - extracting and stripping HTML and other styling tags from subtitle text.
12 | - extracting and stripping speaker names.
13 | - extracting and stripping non dialogue texts.
14 | 5. Easy to extend and add new functionalities.
15 |
16 | == How to use srtparser.h
17 |
18 | === General usage ===
19 |
20 | srtparser.h is a cross-platform, robust srt subtitle parser.
21 |
22 | * Download `srtparser.h` from https://github.com/saurabhshri/simple-yet-powerful-srt-subtitle-parser-cpp
23 | * Include the header file in your program.
24 | `#include "lib/srtparser.h"`
25 | * Create SubtitleParserFactory object. Use this factory object to create SubtitleParser object.
26 |
27 | ```
28 | SubtitleParserFactory *subParserFactory = new SubtitleParserFactory("inputFile.srt");
29 | SubtitleParser *parser = subParserFactory->getParser();
30 |
31 | //to get subtitles
32 |
33 | std::vector<SubtitleItem*> sub = parser->getSubtitles();
34 | ```
35 |
36 | * Call appropriate functions to perform parsing.
37 |
38 | See demo usage in the `example` directory.
39 |
40 | === Parser Functions ===
41 |
42 | The following is a complete list of available parser functions.
43 |
44 | Syntax:
45 |
46 |
47 |
48 | [cols="2,1,2,5"]
49 | |===
50 | | Class | Return Type | Function | Description
51 |
52 | | SubtitleParserFactory
53 | | SubtitleParserFactory
54 | | `SubtitleParserFactory("inputFile.srt")`
55 | | Creates a SubtitleParserFactory object. Here the _inputFile.srt_ is the path of subtitle file to be parsed. This object is used to create parser.
56 |
57 | _E.g.: ``SubtitleParserFactory *subParserFactory = new SubtitleParserFactory("inputFile.srt");``_
58 |
59 | | SubtitleParserFactory
60 | | SubtitleParser
61 | | `getParser()`
62 | | Returns the SubtitleParser object. This object will be used to parse the subtitle file.
63 |
64 | _E.g.: ``SubtitleParser *parser = subParserFactory->getParser();``_
65 |
66 | | SubtitleParser
67 | | std::vector<SubtitleItem*>
68 | | `getSubtitles()`
69 | | Returns the parsed subtitles as a vector of pointers to SubtitleItem objects.
70 |
71 | _E.g.: ``std::vector<SubtitleItem*> sub = parser->getSubtitles();``_
72 |
73 | | SubtitleParser
74 | | std::string
75 | | `getFileData()`
76 | | Returns the complete file data read as it is from inputFile.srt
77 |
78 | _E.g.: ``std::string fileData = parser->getFileData();``_
79 |
80 | | SubtitleItem
81 | | long int
82 | | `getStartTime()`
83 | | Returns the starting time of subtitle in milliseconds.
84 |
85 | _E.g.: ``long int startTime = sub->getStartTime();``_
86 |
87 | | SubtitleItem
88 | | long int
89 | | `getEndTime()`
90 | | Returns the ending time of subtitle in milliseconds.
91 |
92 | _E.g.: ``long int endTime = sub->getEndTime();``_
93 |
94 | | SubtitleItem
95 | | std::string
96 | | `getStartTimeString()`
97 | | Returns the starting time of subtitle in srt format.
98 |
99 | _E.g.: ``std::string startTime = sub->getStartTimeString();``_
100 |
101 | | SubtitleItem
102 | | std::string
103 | | `getEndTimeString()`
104 | | Returns the ending time of subtitle in srt format.
105 |
106 | _E.g.: ``std::string endTime = sub->getEndTimeString();``_
107 |
108 | | SubtitleItem
109 | | std::string
110 | | `getText()`
111 | | Returns the subtitle text as present in .srt file.
112 |
113 | _E.g.: ``std::string text = sub->getText();``_
114 |
115 | | SubtitleItem
116 | | std::string
117 | | `getDialogue(bool keepHTML, bool doNotIgnoreNonDialogues, bool doNotRemoveSpeakerNames);`
118 | | Returns the subtitle text after processing according to parameters.
119 |
120 | keepHTML = 1 to stop parser from stripping style tags
121 |
122 | doNotIgnoreNonDialogues = 1 to stop parser from ignoring and extracting non dialogue texts such as _(laughter)_.
123 |
124 | doNotRemoveSpeakerNames = 1 to stop parser from ignoring and extracting speaker names
125 |
126 | By default (0,0,0) values are passed.
127 |
128 | _E.g.: ``std::string text = sub->getDialogue();``_
129 |
130 | | SubtitleItem
131 | | int
132 | | `getWordCount()`
133 | | Returns the count of number of words present in the subtitle dialogue.
134 |
135 | _E.g.: ``int wordCount = sub->getWordCount();``_
136 |
137 | | SubtitleItem
138 | | std::vector<std::string>
139 | | `getIndividualWords()`
140 | | Returns a string vector of the individual words present in the subtitle dialogue.
141 |
142 | _E.g.: ``std::vector<std::string> words = sub->getIndividualWords();``_
143 |
144 | | SubtitleItem
145 | | bool
146 | | `getIgnoreStatus()`
147 | | Returns the ignore status. Returns true if the `_justDialogue` field, i.e. the subtitle after processing, is empty.
148 |
149 | _E.g.: ``bool ignore = sub->getIgnoreStatus();``_
150 |
151 | | SubtitleItem
152 | | int
153 | | `getSpeakerCount()`
154 | | Returns the count of number of speakers present in the subtitle.
155 |
156 | _E.g.: ``int speakerCount = sub->getSpeakerCount();``_
157 |
158 | | SubtitleItem
159 | | std::vector<std::string>
160 | | `getSpeakerNames()`
161 | | Returns a string vector of speaker names.
162 |
163 | _E.g.: ``std::vector<std::string> speakerNames = sub->getSpeakerNames();``_
164 |
165 | | SubtitleItem
166 | | int
167 | | `getNonDialogueCount()`
168 | | Returns the count of number of non dialogue words present in the subtitle.
169 |
170 | _E.g.: ``int nonDialogueCount = sub->getNonDialogueCount();``_
171 |
172 | | SubtitleItem
173 | | std::vector<std::string>
174 | | `getNonDialogueWords()`
175 | | Returns a string vector of non dialogue words.
176 |
177 | _E.g.: ``std::vector<std::string> nonDialogueWords = sub->getNonDialogueWords();``_
178 |
179 | | SubtitleItem
180 | | int
181 | | `getStyleTagCount()`
182 | | Returns the count of number of style tags present in the subtitle.
183 |
184 | _E.g.: ``int styleTagCount = sub->getStyleTagCount();``_
185 |
186 | | SubtitleItem
187 | | std::vector<std::string>
188 | | `getStyleTags()`
189 | | Returns a string vector of style tags.
190 |
191 | _E.g.: ``std::vector<std::string> styleTags = sub->getStyleTags();``_
192 |
193 | | SubtitleWord
194 | | std::string
195 | | `getText()`
196 | | Returns the subtitle text as present in .srt file.
197 |
198 | _E.g.: ``std::string text = sub->getText();``_
199 |
200 | |===
201 |
202 | ## Examples
203 |
204 | While I've tried to include examples in the above table, a compilation of all of them together in a single C++ program can be found in `example` directory.
205 |
206 | ## Contributing
207 |
208 | Suggestions, feature requests, PRs, bug reports and bug fixes are welcome. I'll be thankful.
209 |
210 | ## Credits
211 |
212 | Built upon an MIT-licensed simple subtitle parser called LibSub-Parser by Oleksii Maryshchenko.
213 |
214 | The original parser had 3 major functions : getStartTime(), getEndTime() and getText().
215 |
216 | The rest of the work was done by Saurabh Shrivastava, originally for use in his https://saurabhshri.github.io/2017/05/gsoc/creating-a-full-blown-srt-subtitle-parser[GSoC project].
--------------------------------------------------------------------------------
/srtparser.h:
--------------------------------------------------------------------------------
1 | /*
2 | * Author : Saurabh Shrivastava
3 | * Email : saurabh.shrivastava54@gmail.com
4 | * Link : https://github.com/saurabhshri
5 | *
6 | * Based on subtitle-parser by Oleksii Maryshchenko.
7 | * Email : young_developer@mail.ru
8 | * Link : https://github.com/young-developer/subtitle-parser
9 | */
10 |
11 | #ifndef SRTPARSER_H
12 | #define SRTPARSER_H
13 |
#include <algorithm>
#include <cctype>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <iterator>
#include <sstream>
#include <string>
#include <vector>
20 |
/**
 * Splits a string on a single-character delimiter.
 *
 * Tokens are read with std::getline, so two consecutive delimiters produce an
 * empty token, while a delimiter at the very end does not add a trailing
 * empty token. An empty input adds nothing.
 *
 * @param s      text to split
 * @param delim  delimiter character
 * @param elems  destination vector; tokens are appended to its current content
 * @return reference to elems
 */
inline std::vector<std::string> &split(const std::string &s, char delim, std::vector<std::string> &elems) {
    std::stringstream ss(s);
    std::string item;

    while (std::getline(ss, item, delim)) {
        elems.push_back(item);
    }
    return elems;
}
31 |
32 | /**** Class definitions ****/
33 |
34 |
/**
 * A piece of subtitle text stored as a string.
 */
class SubtitleWord
{
private:
    std::string _text;                      //the stored text
public:
    SubtitleWord(void);                     //creates an object with empty text
    SubtitleWord(std::string text);         //creates an object holding 'text'
    virtual std::string getText() const;    //returns the stored text
    //virtual because the class is polymorphic (getText is virtual) : an object of a
    //derived class may be destroyed through a SubtitleWord pointer
    virtual ~SubtitleWord(void);
};
45 |
/**
 * One subtitle entry of an .srt file : its number, timing and raw text, plus the
 * data derived from the text by extractInfo() (processed dialogue, speaker names,
 * individual words).
 */
class SubtitleItem
{
private:
    long int _startTime;                        //in milliseconds
    long int _endTime;                          //in milliseconds
    std::string _text;                          //actual line, as present in subtitle file
    long int timeMSec(std::string value);       //converts time string into ms

    int _subNo;                                 //subtitle number
    std::string _startTimeString;               //time as in srt format
    std::string _endTimeString;
    bool _ignore;                               //should subtitle be ignored; used when the subtitle is empty after processing
    std::string _justDialogue;                  //contains processed subtitle - stripped style, non dialogue text removal etc.
    int _speakerCount;                          //count of number of speakers
    std::vector<std::string> _speaker;          //list of speakers in a single subtitle
    int _nonDialogueCount;                      //count of non spoken words in a subtitle
    std::vector<std::string> _nonDialogue;      //list of non dialogue words, e.g. (applause)
    int _wordCount;                             //number of words in _justDialogue
    std::vector<std::string> _word;             //list of words in dialogue
    std::vector<long int> _wordStartTime;       //start time of each word in dialogue
    std::vector<long int> _wordEndTime;         //end time of each word in dialogue
    std::vector<long int> _wordDuration;        //actual duration of each word without silence
    int _styleTagCount;                         //count of style tags in a single subtitle
    std::vector<std::string> _styleTag;         //list of style tags in that subtitle
    void extractInfo(bool keepHTML = 0, bool doNotIgnoreNonDialogues = 0, bool doNotRemoveSpeakerNames = 0);   //process subtitle
public:
    long int getStartTime() const;              //returns starting time in ms
    long int getEndTime() const;                //returns ending time in ms
    std::string getText() const;                //returns subtitle text as present in .srt file

    int getSubNo() const;                       //returns subtitle number
    std::string getStartTimeString() const;     //returns starting time as present in .srt file
    std::string getEndTimeString() const;       //returns ending time as present in .srt file
    bool getIgnoreStatus() const;               //returns status, whether the subtitle is ignorable or not after processing
    std::string getDialogue(bool keepHTML = 0, bool doNotIgnoreNonDialogues = 0, bool doNotRemoveSpeakerNames = 0); //returns processed subtitle
    int getSpeakerCount() const;                //return speaker count
    int getNonDialogueCount() const;            //return non dialogue words count
    int getStyleTagCount() const;               //return style tags count
    int getWordCount() const;                   //return words count
    std::vector<std::string> getIndividualWords();  //return string vector of individual words
    std::string getWordByIndex(int index);          //return word stored at 'index'
    std::vector<long int> getWordStartTimes();      //return long int vector of start time of individual words
    std::vector<long int> getWordEndTimes();        //return long int vector of end time of individual words
    long int getWordStartTimeByIndex(int index);    //return the start time of a word based on index
    long int getWordEndTimeByIndex (int index);     //return the end time of a word based on index
    std::vector<std::string> getSpeakerNames();     //return string vector of speaker names
    std::vector<std::string> getNonDialogueWords(); //return string vector of non dialogue words
    std::vector<std::string> getStyleTags();        //return string vector of style tags


    void setStartTime(long int startTime);      //set starting time
    void setEndTime(long int endTime);          //set ending time
    void setText(std::string text);             //set subtitle text
    void setWordTimes(std::vector<long int> wordStartTime, std::vector<long int> wordEndTime, std::vector<long int> wordDuration);  //assign time to individual words

    SubtitleItem(void);                         //default constructor
    //main constructor : startTime / endTime are strings in srt time format; the
    //remaining arguments are optional initial values for the derived fields
    SubtitleItem(int subNo, std::string startTime,std::string endTime, std::string text, bool ignore = false,
                 std::string justDialogue = "" , int speakerCount = 0, int nonDialogueCount = 0,
                 int styleTagCount = 0, int wordCount = 0, std::vector<std::string> speaker = std::vector<std::string>(),
                 std::vector<std::string> nonDialogue = std::vector<std::string>(),
                 std::vector<std::string> styleTags = std::vector<std::string>(),
                 std::vector<std::string> word = std::vector<std::string>());
    ~SubtitleItem(void);
};
110 |
111 | class SubtitleParser
112 | {
113 | protected:
114 | std::vector _subtitles; //stores subtitles
115 | std::string _fileName; //supplied filename
116 | virtual void parse(std::string fileName) = 0;
117 | public:
118 | virtual std::vector getSubtitles(); //returns subtitles
119 | std::string getFileData();
120 | SubtitleParser(void);
121 | virtual ~SubtitleParser(void);
122 | };
123 |
124 | class SubtitleParserFactory
125 | {
126 | private:
127 | std::string _fileName;
128 | public:
129 | SubtitleParser* getParser();
130 | SubtitleParserFactory(std::string fileName);
131 | ~SubtitleParserFactory(void);
132 | };
133 |
134 | class SubRipParser : public SubtitleParser
135 | {
136 | void parse(std::string fileName);
137 | public:
138 | SubRipParser(void);
139 | SubRipParser(std::string fileName);
140 | ~SubRipParser(void);
141 | };
142 |
143 |
144 | /**** Function definitions ****/
145 |
146 | //1. SubtitleParserFactory class
147 |
148 | inline SubtitleParserFactory::SubtitleParserFactory(std::string fileName)
149 | {
150 | _fileName = fileName;
151 | }
152 |
153 | inline SubtitleParser* SubtitleParserFactory::getParser()
154 | {
155 | return new SubRipParser(_fileName); //creates and returns SubRipParser obj
156 | }
157 |
158 | inline SubtitleParserFactory::~SubtitleParserFactory(void)
159 | {
160 | }
161 |
162 | //2. SubtitleParser class
163 |
164 | inline std::vector SubtitleParser::getSubtitles()
165 | {
166 | return _subtitles;
167 | }
168 |
169 | inline std::string SubtitleParser::getFileData() //returns whole read file i.e. contents of input.srt
170 | {
171 | std::ifstream infile(_fileName);
172 | std::string allData = "";
173 | std::string line;
174 | while (std::getline(infile, line))
175 | {
176 | std::istringstream iss(line);
177 | allData += line + "\n";
178 | }
179 | return allData;
180 |
181 | }
182 |
183 | inline SubtitleParser::SubtitleParser(void)
184 | {
185 |
186 | }
187 |
188 | inline SubtitleParser::~SubtitleParser(void)
189 | {
190 | }
191 |
192 | //3. SubRipParser class
193 |
194 | inline SubRipParser::SubRipParser(void)
195 | {
196 | }
197 |
198 | inline void SubRipParser::parse(std::string fileName) //srt parser
199 | {
200 |
201 | std::ifstream infile(fileName);
202 | std::string line, start, end, completeLine = "", timeLine = "";
203 | int subNo, turn = 0;
204 |
205 | /*
206 | * turn = 0 -> Add subtitle number
207 | * turn = 1 -> Add string to timeLine
208 | * turn > 1 -> Add string to completeLine
209 | */
210 |
211 | while (std::getline(infile, line))
212 | {
213 | line.erase(remove(line.begin(), line.end(), '\r'), line.end());
214 |
215 | if (line.compare(""))
216 | {
217 | if(!turn)
218 | {
219 | subNo=atoi(line.c_str());
220 | turn++;
221 | continue;
222 | }
223 |
224 | if (line.find("-->") != std::string::npos)
225 | {
226 | timeLine += line;
227 |
228 | std::vector srtTime;
229 | srtTime = split(timeLine, ' ', srtTime);
230 | start = srtTime[0];
231 | end = srtTime[2];
232 |
233 | }
234 | else
235 | {
236 | if (completeLine != "")
237 | completeLine += " ";
238 |
239 | completeLine += line;
240 | }
241 |
242 | turn++;
243 | }
244 |
245 | else
246 | {
247 | turn = 0;
248 | _subtitles.push_back(new SubtitleItem(subNo,start,end,completeLine));
249 | completeLine = timeLine = "";
250 | }
251 |
252 | if(infile.eof()) //insert last remaining subtitle
253 | {
254 | _subtitles.push_back(new SubtitleItem(subNo,start,end,completeLine));
255 | }
256 | }
257 | }
258 |
259 | inline SubRipParser::SubRipParser(std::string fileName)
260 | {
261 | _fileName = fileName;
262 | parse(fileName);
263 | }
264 |
265 | inline SubRipParser::~SubRipParser(void)
266 | {
267 | for(int i=0;i != _subtitles.size();++i)
268 | {
269 | if(_subtitles[i])
270 | delete _subtitles[i];
271 | }
272 | }
273 |
274 | //4. SubtitleItem class
275 |
276 | inline SubtitleItem::SubtitleItem(void)
277 | {
278 | }
279 |
280 | inline SubtitleItem::SubtitleItem(int subNo, std::string startTime,std::string endTime, std::string text, bool ignore,
281 | std::string justDialogue, int speakerCount, int nonDialogueCount,
282 | int styleTagCount, int wordCount, std::vector speaker, std::vector nonDialogue,
283 | std::vector styleTags, std::vector word)
284 | {
285 | _startTime = timeMSec(startTime);
286 | _endTime = timeMSec(endTime);
287 | _text = text;
288 |
289 | _subNo = subNo;
290 | _startTimeString = startTime;
291 | _endTimeString = endTime;
292 | _ignore = ignore;
293 | _justDialogue = justDialogue;
294 | _speakerCount = speakerCount;
295 | _nonDialogueCount = nonDialogueCount;
296 | _wordCount = wordCount;
297 | _speaker = speaker;
298 | _styleTagCount = styleTagCount;
299 | _styleTag = styleTags;
300 | _nonDialogue = nonDialogue;
301 | _word = word;
302 |
303 | extractInfo();
304 | }
305 |
306 | inline long int SubtitleItem::timeMSec(std::string value)
307 | {
308 | std::vector t, secs;
309 | int hours, mins, seconds, milliseconds;
310 |
311 | t = split(value, ':', t);
312 | hours = atoi(t[0].c_str());
313 | mins = atoi(t[1].c_str());
314 |
315 | secs = split(t[2], ',', secs);
316 | seconds = atoi(secs[0].c_str());
317 | milliseconds = atoi(secs[1].c_str());
318 |
319 | return hours * 3600000 + mins * 60000 + seconds * 1000 + milliseconds;
320 | }
321 |
322 | inline long int SubtitleItem::getStartTime() const
323 | {
324 | return _startTime;
325 | }
326 | inline long int SubtitleItem::getEndTime() const
327 | {
328 | return _endTime;
329 | }
330 |
331 | inline std::string SubtitleItem::getText() const
332 | {
333 | return _text;
334 | }
335 |
336 | inline void SubtitleItem::setStartTime(long int startTime)
337 | {
338 | _startTime = startTime;
339 | }
340 | inline void SubtitleItem::setEndTime(long int endTime)
341 | {
342 | _endTime = endTime;
343 | }
344 | inline void SubtitleItem::setText(std::string text)
345 | {
346 | _text = text;
347 | }
348 | inline void SubtitleItem::setWordTimes(std::vector wordStartTime, std::vector wordEndTime, std::vector wordDuration)
349 | {
350 | _wordStartTime = wordStartTime;
351 | _wordEndTime = wordEndTime;
352 | _wordDuration = wordDuration;
353 | }
354 | inline int SubtitleItem::getSubNo() const
355 | {
356 | return _subNo;
357 | }
358 | inline std::string SubtitleItem::getStartTimeString() const
359 | {
360 | return _startTimeString;
361 | }
362 |
363 | inline std::string SubtitleItem::getEndTimeString() const
364 | {
365 | return _endTimeString;
366 | }
367 |
368 | inline bool SubtitleItem::getIgnoreStatus() const
369 | {
370 | if(_ignore)
371 | return true;
372 |
373 | else
374 | return false;
375 |
376 | }
377 |
378 | inline void SubtitleItem::extractInfo(bool keepHTML, bool doNotIgnoreNonDialogues, bool doNotRemoveSpeakerNames) //process subtitle
379 | {
380 | std::string output = _text;
381 |
382 | //stripping HTML tags
383 | if(!keepHTML)
384 | {
385 | /*
386 | * TODO : Before erasing, extract the words.
387 | * std::vector getStyleTags();
388 | * int getStyleTagCount() const;
389 | * std::vector _styleTag;
390 | * int _styleTagCount;
391 | */
392 |
393 | int countP = 0;
394 | for(char& c : output) // replacing <...> with ~~~~
395 | {
396 | if(c=='<')
397 | {
398 | countP++;
399 | c = '~';
400 | }
401 |
402 | else
403 | {
404 | if(countP!=0)
405 | {
406 | if(c != '>')
407 | c = '~';
408 |
409 | else if(c == '>')
410 | {
411 | c = '~';
412 | countP--;
413 | }
414 | }
415 | }
416 | }
417 | }
418 |
419 | //stripping non dialogue data e.g. (applause)
420 |
421 | if(!doNotIgnoreNonDialogues)
422 | {
423 | /*
424 | * TODO : Before erasing, extract the words.
425 | * std::vector getNonDialogueWords();
426 | * int getNonDialogueCount() const;
427 | * std::vector _nonDialogue;
428 | * int _nonDialogueCount;
429 | */
430 |
431 | int countP = 0;
432 | for(char& c : output) // replacing (...) with ~~~~
433 | {
434 | if(c=='(')
435 | {
436 | countP++;
437 | c = '~';
438 | }
439 |
440 | else
441 | {
442 | if(countP!=0)
443 | {
444 | if(c != ')')
445 | c = '~';
446 |
447 | else if(c == ')')
448 | {
449 | c = '~';
450 | countP--;
451 | }
452 | }
453 | }
454 | }
455 | }
456 |
457 | output.erase(std::remove(output.begin(), output.end(), '~'), output.end()); // deleting all ~
458 |
459 | //Extracting speaker names
460 | if(!doNotRemoveSpeakerNames)
461 | {
462 | for(int i=0; output[i]!='\0';i++)
463 | {
464 | int colonIndex = 0, nameBeginIndex = 0;
465 | if(output[i]==':') //speaker found; travel back
466 | {
467 | _speakerCount++;
468 | colonIndex = i;
469 |
470 | int tempIndex = 0, foundEvilColon = 0, continueFlag = 0, spaceBeforeColon = 0;
471 |
472 | if(output[i-1] == ' ')
473 | spaceBeforeColon = 2;
474 |
475 | /*
476 | Possible Cases :
477 |
478 | Elon Musk: Hey Saurabh, you are pretty smart. // First and Last Name
479 | Saurabh: *_* What? Elon Musk: Yes! // Two names in single line
480 | Saurabh : OMG OMG! // Space before colon
481 | Elon: LOL World: LAMAO
482 | Saurabh: ._. // normal
483 |
484 | */
485 |
486 | for(int j=i - spaceBeforeColon; j>=0;j--)
487 | {
488 | if(output[j] == '.' || output[j] == '!' || output[j] == ',' || output[j] == '?' || output[j] == '\n'
489 | || output[j] == ' ' || j== 0)
490 | {
491 |
492 | if(output[j] == '.' || output[j] == '!' || output[j] == ',' || output[j] == '?' || j == 0)
493 | {
494 | if((continueFlag && j == 0))
495 | {
496 | if(!isupper(output[j]))
497 | {
498 | nameBeginIndex = tempIndex;
499 | break;
500 | }
501 |
502 | else
503 | tempIndex = j;
504 |
505 | }
506 |
507 | else if(j!=0)
508 | tempIndex = j + 1;
509 | }
510 |
511 | else if(output[j] == ' ' && isupper(output[j+1]))
512 | {
513 | tempIndex = j;
514 | continueFlag = 1;
515 |
516 | continue;
517 | }
518 |
519 | else if(output[j] == ' ' && !isupper(output[j+1] && tempIndex == 0))
520 | {
521 | _speakerCount--;
522 | foundEvilColon = 1;
523 | break;
524 | }
525 |
526 | nameBeginIndex = tempIndex;
527 | break;
528 | }
529 | }
530 |
531 | if(foundEvilColon)
532 | continue;
533 |
534 | i = nameBeginIndex; //compensating the removal and changes in index
535 |
536 | //check if there's a space after colon i.e. A: Hello vs A:Hello
537 | int removeSpace = 0;
538 | if(output[colonIndex + 1]==' ')
539 | removeSpace = 1;
540 |
541 | _speaker.push_back(output.substr(nameBeginIndex, colonIndex - nameBeginIndex));
542 | output.erase(nameBeginIndex, colonIndex - nameBeginIndex + removeSpace);
543 | }
544 |
545 | }
546 |
547 | }
548 |
549 | // removing more than one whitespaces with one space
550 | unique_copy (output.begin(), output.end(), std::back_insert_iterator(_justDialogue),
551 | [](char a,char b)
552 | {
553 | return isspace(a) && isspace(b);
554 | });
555 |
556 | // trimming whitespaces
557 | const char* whiteSpaces = " \t\n\r\f\v";
558 | _justDialogue.erase(0, _justDialogue.find_first_not_of(whiteSpaces));
559 | _justDialogue.erase(_justDialogue.find_last_not_of(whiteSpaces) + 1);
560 |
561 | if(_justDialogue.empty() || _justDialogue == " ")
562 | _ignore = true;
563 |
564 | else
565 | {
566 | _word = split(_justDialogue, ' ', _word); //extracting individual words
567 | _wordCount = _word.size();
568 | }
569 | }
570 |
571 | inline std::string SubtitleItem::getDialogue(bool keepHTML, bool doNotIgnoreNonDialogues, bool doNotRemoveSpeakerNames)
572 | {
573 | if(_justDialogue.empty())
574 | extractInfo(keepHTML, doNotIgnoreNonDialogues, doNotRemoveSpeakerNames);
575 |
576 | return _justDialogue;
577 | }
578 | inline int SubtitleItem::getSpeakerCount() const
579 | {
580 | return _speakerCount;
581 | }
582 | inline int SubtitleItem::getNonDialogueCount() const
583 | {
584 | return _nonDialogueCount;
585 | }
586 | inline int SubtitleItem::getStyleTagCount() const
587 | {
588 | return _styleTagCount;
589 | }
590 | inline int SubtitleItem::getWordCount() const
591 | {
592 | return _wordCount;
593 | }
594 | inline std::vector SubtitleItem::getSpeakerNames()
595 | {
596 | return _speaker;
597 | }
598 | inline std::vector SubtitleItem::getNonDialogueWords()
599 | {
600 | return _nonDialogue;
601 | }
602 | inline std::vector SubtitleItem::getIndividualWords()
603 | {
604 | return _word;
605 | }
606 | inline std::string SubtitleItem::getWordByIndex(int index)
607 | {
608 | return _word[index];
609 | }
610 | inline std::vector SubtitleItem::getWordStartTimes()
611 | {
612 | return _wordStartTime;
613 | }
614 | inline std::vector SubtitleItem::getWordEndTimes()
615 | {
616 | return _wordEndTime;
617 | }
618 | inline long int SubtitleItem::getWordStartTimeByIndex(int index)
619 | {
620 | return _wordStartTime[index];
621 | }
622 | inline long int SubtitleItem::getWordEndTimeByIndex(int index)
623 | {
624 | return _wordEndTime[index];
625 | }
626 | inline std::vector SubtitleItem::getStyleTags()
627 | {
628 | return _styleTag;
629 | }
630 | inline SubtitleItem::~SubtitleItem(void)
631 | {
632 |
633 | }
634 |
635 | //5. SubtitleWordclass
636 |
637 | inline SubtitleWord::SubtitleWord(void)
638 | {
639 | _text = "";
640 | }
641 |
642 | inline SubtitleWord::SubtitleWord(std::string text)
643 | {
644 | _text = text;
645 | }
646 |
647 | inline std::string SubtitleWord::getText() const
648 | {
649 | return _text;
650 | }
651 |
652 | inline SubtitleWord::~SubtitleWord(void)
653 | {
654 | }
655 |
656 |
657 | #endif //SRTPARSER_H
--------------------------------------------------------------------------------