├── example ├── .gitignore ├── CMakeLists.txt ├── example.srt ├── example-output.srt ├── main.cpp └── README.MD ├── LICENSE.txt ├── README.adoc └── srtparser.h /example/.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | cmake-build-debug/ -------------------------------------------------------------------------------- /example/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.6) 2 | project(demo) 3 | 4 | set(CMAKE_CXX_STANDARD 11) 5 | 6 | include_directories(../) 7 | 8 | set(SOURCE_FILES main.cpp ../srtparser.h) 9 | add_executable(demo ${SOURCE_FILES}) -------------------------------------------------------------------------------- /example/example.srt: -------------------------------------------------------------------------------- 1 | 1 2 | 00:00:00,520 --> 00:00:03,536 3 | Chris: Elon, hey, welcome back to TED. 4 | 5 | 2 6 | 00:00:03,560 --> 00:00:04,936 7 | It's great to have you here. 8 | 9 | 3 10 | 00:00:04,960 --> 00:00:06,536 11 | Elon: Thanks for having me. 
12 | 13 | 4 14 | 00:00:06,560 --> 00:00:09,416 15 | (Applause) Thanks 16 | 17 | 5 18 | 00:00:09,440 --> 00:00:11,256 19 | we're going to spend some time -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Original work Copyright (c) 2017 Oleksii Maryshchenko 4 | Modified work Copyright (c) 2017 Saurabh Shrivastava 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | -------------------------------------------------------------------------------- /example/example-output.srt: -------------------------------------------------------------------------------- 1 | BEGIN 2 | start : 520 3 | end : 3536 4 | text : 5 | Chris: Elon, hey, welcome back to TED. 6 | 7 | 8 | startString : 00:00:00,520 9 | endString : 00:00:03,536 10 | justDialogue : 11 | Elon, hey, welcome back to TED. 
12 | 13 | 14 | ignore : 0 15 | speakerCount : 1 16 | speakers : Chris, 17 | END 18 | 19 | BEGIN 20 | start : 3560 21 | end : 4936 22 | text : 23 | It's great to have you here. 24 | 25 | 26 | startString : 00:00:03,560 27 | endString : 00:00:04,936 28 | justDialogue : 29 | It's great to have you here. 30 | 31 | 32 | ignore : 0 33 | speakerCount : 0 34 | END 35 | 36 | BEGIN 37 | start : 4960 38 | end : 6536 39 | text : 40 | Elon: Thanks for having me. 41 | 42 | 43 | startString : 00:00:04,960 44 | endString : 00:00:06,536 45 | justDialogue : 46 | Thanks for having me. 47 | 48 | 49 | ignore : 0 50 | speakerCount : 1 51 | speakers : Elon, 52 | END 53 | 54 | BEGIN 55 | start : 6560 56 | end : 9416 57 | text : 58 | (Applause) Thanks 59 | 60 | 61 | startString : 00:00:06,560 62 | endString : 00:00:09,416 63 | justDialogue : 64 | Thanks 65 | 66 | 67 | ignore : 0 68 | speakerCount : 0 69 | END 70 | 71 | BEGIN 72 | start : 9440 73 | end : 11256 74 | text : 75 | we're going to spend some time 76 | 77 | startString : 00:00:09,440 78 | endString : 00:00:11,256 79 | justDialogue : 80 | we're going to spend some time 81 | 82 | ignore : 0 83 | speakerCount : 0 84 | END 85 | 86 | -------------------------------------------------------------------------------- /example/main.cpp: -------------------------------------------------------------------------------- 1 | #include "../srtparser.h" 2 | using namespace std; 3 | 4 | int main(int argc, char *argv[]) { 5 | 6 | // If filename not provided 7 | if(argc < 2) { 8 | cout << "Please pass the filename as an argument: ./a.out filename.srt\n"; 9 | return 0; 10 | } 11 | 12 | SubtitleParserFactory *subParserFactory = new SubtitleParserFactory(argv[1]); 13 | SubtitleParser *parser = subParserFactory->getParser(); 14 | 15 | std::vector sub = parser->getSubtitles(); 16 | 17 | ofstream myfile; 18 | myfile.open ("out.srt"); 19 | 20 | for(SubtitleItem * element : sub) 21 | { 22 | 
myfile<<"BEGIN"<getStartTimeString()<getStartTime()<getEndTimeString()<getEndTime()<getText()<getDialogue()<getWordCount()< word = element->getIndividualWords(); 32 | for(std::string display : word) 33 | myfile<getSpeakerCount()<getSpeakerCount()) 39 | { 40 | std::vector name = element->getSpeakerNames(); 41 | for(std::string display : name) 42 | myfile<getIgnoreStatus()<getStartTimeString()<getStartTime()<getEndTimeString()<getEndTime()<getText()<getDialogue()<getWordCount()< word = element->getIndividualWords(); 29 | for(std::string display : word) 30 | myfile<getSpeakerCount()<getSpeakerCount()) 36 | { 37 | std::vector name = element->getSpeakerNames(); 38 | for(std::string display : name) 39 | myfile<getIgnoreStatus()< 00:00:03,536 53 | Chris: Elon, hey, welcome back to TED. 54 | 55 | 2 56 | 00:00:03,560 --> 00:00:04,936 57 | Chris : It's great to have you here. 58 | Evan: Indeed it is. 59 | 60 | 3 61 | 00:00:04,960 --> 00:00:06,536 62 | Elon: Thanks for having me. 63 | 64 | 4 65 | 00:00:06,560 --> 00:00:09,416 66 | (Applause) Thanks 67 | 68 | 5 69 | 00:00:09,440 --> 00:00:11,256 70 | we're going to spend some time 71 | ``` 72 | 73 | Parsed Output : 74 | 75 | ``` 76 | 77 | BEGIN 78 | startString : 00:00:00,520 79 | start : 520 80 | endString : 00:00:03,536 81 | end : 3536 82 | text : Chris: Elon, hey, welcome back to TED. 83 | justDialogue : Elon, hey, welcome back to TED. 84 | words count : 6 85 | words :Elon,, hey,, welcome, back, to, TED., 86 | speakerCount : 1 87 | speakers : Chris, 88 | ignore : 0 89 | END 90 | 91 | BEGIN 92 | startString : 00:00:03,560 93 | start : 3560 94 | endString : 00:00:04,936 95 | end : 4936 96 | text : Chris : It's great to have you here. Evan: Indeed it is. 97 | justDialogue : It's great to have you here. Indeed it is. 
98 | words count : 9 99 | words :It's, great, to, have, you, here., Indeed, it, is., 100 | speakerCount : 2 101 | speakers : Chris , Evan, 102 | ignore : 0 103 | END 104 | 105 | BEGIN 106 | startString : 00:00:04,960 107 | start : 4960 108 | endString : 00:00:06,536 109 | end : 6536 110 | text : Elon: Thanks for having me. 111 | justDialogue : Thanks for having me. 112 | words count : 4 113 | words :Thanks, for, having, me., 114 | speakerCount : 1 115 | speakers : Elon, 116 | ignore : 0 117 | END 118 | 119 | BEGIN 120 | startString : 00:00:06,560 121 | start : 6560 122 | endString : 00:00:09,416 123 | end : 9416 124 | text : (Applause) Thanks 125 | justDialogue : Thanks 126 | words count : 1 127 | words :Thanks, 128 | speakerCount : 0 129 | speakers : ignore : 0 130 | END 131 | 132 | BEGIN 133 | startString : 00:00:09,440 134 | start : 9440 135 | endString : 00:00:11,256 136 | end : 11256 137 | text : we're going to spend some time 138 | justDialogue : we're going to spend some time 139 | words count : 6 140 | words :we're, going, to, spend, some, time, 141 | speakerCount : 0 142 | speakers : ignore : 0 143 | END 144 | 145 | 146 | 147 | 148 | ``` -------------------------------------------------------------------------------- /README.adoc: -------------------------------------------------------------------------------- 1 | = srtparser.h : Simple, yet powerful C++ SRT Subtitle Parser Library. 2 | A single header, simple, powerful full blown srt subtitle parser written in C++. 3 | ___ 4 | 5 | https://github.com/saurabhshri/simple-yet-powerful-srt-subtitle-parser-cpp[srtparser.h] is a single header, simple and powerful C++ srt subtitle parsing library that allows you to easily handle, process and manipulate srt subtitle files in your project. It is an extension of Oleksii Maryshchenko's simple https://github.com/young-developer/subtitle-parser[subtitle-parser]. It has following features : 6 | 7 | 1. 
It is a single header C++ (CPP) file, and can be easily used in your project. 8 | 2. Focus on portability, efficiency and simplicity with no external dependency. 9 | 3. Wide variety of functions at programmer's disposal to parse srt file as per need. 10 | 4. Capable of : 11 | - extracting and stripping HTML and other styling tags from subtitle text. 12 | - extracting and stripping speaker names. 13 | - extracting and stripping non dialogue texts. 14 | 5. Easy to extend and add new functionalities. 15 | 16 | == How to use srtparser.h 17 | 18 | === General usage === 19 | 20 | srtparser.h is a cross-platform robust srt subtitle parser. 21 | 22 | * Download `srtparser.h` from https://github.com/saurabhshri/simple-yet-powerful-srt-subtitle-parser-cpp 23 | * Include the header file in your program. 24 | `#include "lib/srtparser.h"` 25 | * Create SubtitleParserFactory object. Use this factory object to create SubtitleParser object. 26 | 27 | ``` 28 | SubtitleParserFactory *subParserFactory = new SubtitleParserFactory("inputFile.srt"); 29 | SubtitleParser *parser = subParserFactory->getParser(); 30 | 31 | //to get subtitles 32 | 33 | std::vector<SubtitleItem*> sub = parser->getSubtitles(); 34 | ``` 35 | 36 | * Call appropriate functions to perform parsing. 37 | 38 | See demo usage in `example` directory. 39 | 40 | === Parser Functions === 41 | 42 | The following is a complete list of available parser functions. 43 | 44 | Syntax: 45 | 46 | 47 | 48 | [cols="2,1,2,5"] 49 | |=== 50 | | Class | Return Type | Function | Description 51 | 52 | | SubtitleParserFactory 53 | | SubtitleParserFactory 54 | | `SubtitleParserFactory("inputFile.srt")` 55 | | Creates a SubtitleParserFactory object. Here the _inputFile.srt_ is the path of subtitle file to be parsed. This object is used to create parser. 
56 | 57 | _E.g.: ``SubtitleParserFactory *subParserFactory = new SubtitleParserFactory("inputFile.srt");``_ 58 | 59 | | SubtitleParserFactory 60 | | SubtitleParser 61 | | `getParser()` 62 | | Returns the SubtitleParser object. This object will be used to parse the subtitle file. 63 | 64 | _E.g.: ``SubtitleParser *parser = subParserFactory->getParser();``_ 65 | 66 | | SubtitleParser 67 | | std::vector 68 | | `getSubtitles()` 69 | | Returns the Subtitle as SubtitleItem object. 70 | 71 | _E.g.: ``std::vector sub = parser->getSubtitles();``_ 72 | 73 | | SubtitleParser 74 | | std::string 75 | | `getFileData()` 76 | | Returns the complete file data read as it is from inputFile.srt 77 | 78 | _E.g.: ``std::string fileData = parser->getFileData();``_ 79 | 80 | | SubtitleItem 81 | | long int 82 | | `getStartTime()` 83 | | Returns the starting time of subtitle in milliseconds. 84 | 85 | _E.g.: ``long int startTime = sub->getStartTime();``_ 86 | 87 | | SubtitleItem 88 | | long int 89 | | `getEndTime()` 90 | | Returns the ending time of subtitle in milliseconds. 91 | 92 | _E.g.: ``long int endTime = sub->getEndTime();``_ 93 | 94 | | SubtitleItem 95 | | std::string 96 | | `getStartTimeString()` 97 | | Returns the starting time of subtitle in srt format. 98 | 99 | _E.g.: ``std::string startTime = sub->getStartTimeString();``_ 100 | 101 | | SubtitleItem 102 | | std::string 103 | | `getEndTimeString()` 104 | | Returns the ending time of subtitle in srt format. 105 | 106 | _E.g.: ``std::string endTime = sub->getEndTimeString();``_ 107 | 108 | | SubtitleItem 109 | | std::string 110 | | `getText()` 111 | | Returns the subtitle text as present in .srt file. 112 | 113 | _E.g.: ``std::string text = sub->getText();``_ 114 | 115 | | SubtitleItem 116 | | std::string 117 | | `getDialogue(bool keepHTML, bool doNotIgnoreNonDialogues, bool doNotRemoveSpeakerNames);` 118 | | Returns the subtitle text after processing according to parameters. 
119 | 120 | keepHTML = 1 to stop parser from stripping style tags 121 | 122 | doNotIgnoreNonDialogues = 1 to stop parser from ignoring and extracting non dialogue texts such as _(laughter)_. 123 | 124 | doNotRemoveSpeakerNames = 1 to stop parser from ignoring and extracting speaker names 125 | 126 | By default (0,0,0) values are passed. 127 | 128 | _E.g.: ``std::string text = sub->getDialogue();``_ 129 | 130 | | SubtitleItem 131 | | int 132 | | `getWordCount()` 133 | | Returns the count of number of words present in the subtitle dialogue. 134 | 135 | _E.g.: ``int wordCount = sub->getWordCount();``_ 136 | 137 | | SubtitleItem 138 | | std::vector 139 | | `getIndividualWords()` 140 | | Returns string vector of individual words present in subtitle. 141 | 142 | _E.g.: ``std::vector words = sub->getIndividualWords();``_ 143 | 144 | | SubtitleItem 145 | | bool 146 | | `getIgnoreStatus()` 147 | | Returns the ignore status. Returns true, if the _justDialogue field i.e. subtitle after processing is empty. 148 | 149 | _E.g.: ``bool ignore = sub->getIgnoreStatus();``_ 150 | 151 | | SubtitleItem 152 | | int 153 | | `getSpeakerCount()` 154 | | Returns the count of number of speakers present in the subtitle. 155 | 156 | _E.g.: ``int speakerCount = sub->getSpeakerCount();``_ 157 | 158 | | SubtitleItem 159 | | std::vector 160 | | `getSpeakerNames()` 161 | | Returns string vector of speaker names. 162 | 163 | _E.g.: ``std::vector speakerNames = sub->getSpeakerNames();``_ 164 | 165 | | SubtitleItem 166 | | int 167 | | `getNonDialogueCount()` 168 | | Returns the count of number of non dialogue words present in the subtitle. 169 | 170 | _E.g.: ``int nonDialogueCount = sub->getNonDialogueCount();``_ 171 | 172 | | SubtitleItem 173 | | std::vector 174 | | `getNonDialogueWords()` 175 | | Returns string vector of non dialogue words. 
176 | 177 | _E.g.: ``std::vector nonDialogueWords = sub->getNonDialogueWords();``_ 178 | 179 | | SubtitleItem 180 | | int 181 | | `getStyleTagCount()` 182 | | Returns the count of number of style tags present in the subtitle. 183 | 184 | _E.g.: ``int styleTagCount = sub->getStyleTagCount();``_ 185 | 186 | | SubtitleItem 187 | | std::vector 188 | | `getStyleTags()` 189 | | Returns string vector of style tags. 190 | 191 | _E.g.: ``std::vector styleTags = sub->getStyleTags();``_ 192 | 193 | | SubtitleWord 194 | | std::string 195 | | `getText()` 196 | | Returns the subtitle text as present in .srt file. 197 | 198 | _E.g.: ``std::string text = sub->getText();``_ 199 | 200 | |=== 201 | 202 | ## Examples 203 | 204 | While I've tried to include examples in the above table, a compilation of all of them together in a single C++ program can be found in `example` directory. 205 | 206 | ## Contributing 207 | 208 | Suggestions, features request, PRs, bug reports, bug fixes are welcomed. I'll be thankful. 209 | 210 | ## Credits 211 | 212 | Built upon a MIT licensed simple subtitle-parser called LibSub-Parser by Oleksii Maryshchenko. 213 | 214 | The original parser had 3 major functions : getStartTime(), getEndTime() and getText(). 215 | 216 | Rest work done by Saurabh Shrivastava, originally for using this in his https://saurabhshri.github.io/2017/05/gsoc/creating-a-full-blown-srt-subtitle-parser[GSoC project]. -------------------------------------------------------------------------------- /srtparser.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Author : Saurabh Shrivastava 3 | * Email : saurabh.shrivastava54@gmail.com 4 | * Link : https://github.com/saurabhshri 5 | * 6 | * Based on subtitle-parser by Oleksii Maryshchenko. 
7 | * Email : young_developer@mail.ru 8 | * Link : https://github.com/young-developer/subtitle-parser 9 | */ 10 | 11 | #ifndef SRTPARSER_H 12 | #define SRTPARSER_H 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | //function for splitting sentences based on supplied delimiter 22 | inline std::vector &split(const std::string &s, char delim, std::vector &elems) { 23 | std::stringstream ss(s); 24 | std::string item; 25 | 26 | while (getline(ss, item, delim)) { 27 | elems.push_back(item); 28 | } 29 | return elems; 30 | } 31 | 32 | /**** Class definitions ****/ 33 | 34 | 35 | class SubtitleWord 36 | { 37 | private: 38 | std::string _text; 39 | public: 40 | SubtitleWord(void); 41 | SubtitleWord(std::string text); 42 | virtual std::string getText() const; 43 | ~SubtitleWord(void); 44 | }; 45 | 46 | class SubtitleItem 47 | { 48 | private: 49 | long int _startTime; //in milliseconds 50 | long int _endTime; 51 | std::string _text; //actual line, as present in subtitle file 52 | long int timeMSec(std::string value); //converts time string into ms 53 | 54 | int _subNo; //subtitle number 55 | std::string _startTimeString; //time as in srt format 56 | std::string _endTimeString; 57 | bool _ignore; //should subtitle be ignore; used when the subtitle is empty after processing 58 | std::string _justDialogue; //contains processed subtitle - stripped style, non dialogue text removal etc. 59 | int _speakerCount; //count of number of speakers 60 | std::vector _speaker; //list of speakers in a single subtitle 61 | int _nonDialogueCount; //count of non spoken words in a subtitle 62 | std::vector _nonDialogue; //list of non dialogue words, e.g. 
(applause) 63 | int _wordCount; //number of words in _justDialogue 64 | std::vector _word; //list of words in dialogue 65 | std::vector _wordStartTime; //start time of each word in dialogue 66 | std::vector _wordEndTime; //end time of each word in dialogue 67 | std::vector _wordDuration; //actual duration of each word without silence 68 | int _styleTagCount; //count of style tags in a single subtitle 69 | std::vector _styleTag; //list of style tags in that subtitle 70 | void extractInfo(bool keepHTML = 0, bool doNotIgnoreNonDialogues = 0, bool doNotRemoveSpeakerNames = 0); //process subtitle 71 | public: 72 | long int getStartTime() const; //returns starting time in ms 73 | long int getEndTime() const; //returns ending time in ms 74 | std::string getText() const; //returns subtitle text as present in .srt file 75 | 76 | int getSubNo() const; //returns subtitle number 77 | std::string getStartTimeString() const; //returns sarting time as present in .srt file 78 | std::string getEndTimeString() const; //returns ending time as present in .srt file 79 | bool getIgnoreStatus() const; //returns status, whether the subtitle is ignorable or not after processing 80 | std::string getDialogue(bool keepHTML = 0, bool doNotIgnoreNonDialogues = 0, bool doNotRemoveSpeakerNames = 0); //returns processed subtitle 81 | int getSpeakerCount() const; //return speaker count 82 | int getNonDialogueCount() const; //return non dialogue words count 83 | int getStyleTagCount() const; //return style tags count 84 | int getWordCount() const; //return words count 85 | std::vector getIndividualWords(); //return string vector of individual words 86 | std::string getWordByIndex(int index); //return word stored at 'index' 87 | std::vector getWordStartTimes(); //return long int vector of start time of individual words 88 | std::vector getWordEndTimes(); //return long int vector of end time of individual words 89 | long int getWordStartTimeByIndex(int index); //return the start time of a word based 
on index 90 | long int getWordEndTimeByIndex (int index); //return the end time of a word based on index 91 | std::vector getSpeakerNames(); //return string vector of speaker names 92 | std::vector getNonDialogueWords(); //return string vector of non dialogue words 93 | std::vector getStyleTags(); //return string vector of style tags 94 | 95 | 96 | void setStartTime(long int startTime); //set starting time 97 | void setEndTime(long int endTime); //set ending time 98 | void setText(std::string text); //set subtitle text 99 | void setWordTimes(std::vector wordStartTime, std::vector wordEndTime, std::vector wordDuration); //assign time to individual words 100 | 101 | SubtitleItem(void); 102 | SubtitleItem(int subNo, std::string startTime,std::string endTime, std::string text, bool ignore = false, 103 | std::string justDialogue = "" , int speakerCount = 0, int nonDialogueCount = 0, 104 | int styleTagCount = 0, int wordCount = 0, std::vector speaker = std::vector(), 105 | std::vector nonDialogue = std::vector(), 106 | std::vector styleTags = std::vector(), 107 | std::vector word = std::vector()); //default constructor 108 | ~SubtitleItem(void); 109 | }; 110 | 111 | class SubtitleParser 112 | { 113 | protected: 114 | std::vector _subtitles; //stores subtitles 115 | std::string _fileName; //supplied filename 116 | virtual void parse(std::string fileName) = 0; 117 | public: 118 | virtual std::vector getSubtitles(); //returns subtitles 119 | std::string getFileData(); 120 | SubtitleParser(void); 121 | virtual ~SubtitleParser(void); 122 | }; 123 | 124 | class SubtitleParserFactory 125 | { 126 | private: 127 | std::string _fileName; 128 | public: 129 | SubtitleParser* getParser(); 130 | SubtitleParserFactory(std::string fileName); 131 | ~SubtitleParserFactory(void); 132 | }; 133 | 134 | class SubRipParser : public SubtitleParser 135 | { 136 | void parse(std::string fileName); 137 | public: 138 | SubRipParser(void); 139 | SubRipParser(std::string fileName); 140 | 
~SubRipParser(void); 141 | }; 142 | 143 | 144 | /**** Function definitions ****/ 145 | 146 | //1. SubtitleParserFactory class 147 | 148 | inline SubtitleParserFactory::SubtitleParserFactory(std::string fileName) 149 | { 150 | _fileName = fileName; 151 | } 152 | 153 | inline SubtitleParser* SubtitleParserFactory::getParser() 154 | { 155 | return new SubRipParser(_fileName); //creates and returns SubRipParser obj 156 | } 157 | 158 | inline SubtitleParserFactory::~SubtitleParserFactory(void) 159 | { 160 | } 161 | 162 | //2. SubtitleParser class 163 | 164 | inline std::vector SubtitleParser::getSubtitles() 165 | { 166 | return _subtitles; 167 | } 168 | 169 | inline std::string SubtitleParser::getFileData() //returns whole read file i.e. contents of input.srt 170 | { 171 | std::ifstream infile(_fileName); 172 | std::string allData = ""; 173 | std::string line; 174 | while (std::getline(infile, line)) 175 | { 176 | std::istringstream iss(line); 177 | allData += line + "\n"; 178 | } 179 | return allData; 180 | 181 | } 182 | 183 | inline SubtitleParser::SubtitleParser(void) 184 | { 185 | 186 | } 187 | 188 | inline SubtitleParser::~SubtitleParser(void) 189 | { 190 | } 191 | 192 | //3. 
SubRipParser class 193 | 194 | inline SubRipParser::SubRipParser(void) 195 | { 196 | } 197 | 198 | inline void SubRipParser::parse(std::string fileName) //srt parser 199 | { 200 | 201 | std::ifstream infile(fileName); 202 | std::string line, start, end, completeLine = "", timeLine = ""; 203 | int subNo, turn = 0; 204 | 205 | /* 206 | * turn = 0 -> Add subtitle number 207 | * turn = 1 -> Add string to timeLine 208 | * turn > 1 -> Add string to completeLine 209 | */ 210 | 211 | while (std::getline(infile, line)) 212 | { 213 | line.erase(remove(line.begin(), line.end(), '\r'), line.end()); 214 | 215 | if (line.compare("")) 216 | { 217 | if(!turn) 218 | { 219 | subNo=atoi(line.c_str()); 220 | turn++; 221 | continue; 222 | } 223 | 224 | if (line.find("-->") != std::string::npos) 225 | { 226 | timeLine += line; 227 | 228 | std::vector srtTime; 229 | srtTime = split(timeLine, ' ', srtTime); 230 | start = srtTime[0]; 231 | end = srtTime[2]; 232 | 233 | } 234 | else 235 | { 236 | if (completeLine != "") 237 | completeLine += " "; 238 | 239 | completeLine += line; 240 | } 241 | 242 | turn++; 243 | } 244 | 245 | else 246 | { 247 | turn = 0; 248 | _subtitles.push_back(new SubtitleItem(subNo,start,end,completeLine)); 249 | completeLine = timeLine = ""; 250 | } 251 | 252 | if(infile.eof()) //insert last remaining subtitle 253 | { 254 | _subtitles.push_back(new SubtitleItem(subNo,start,end,completeLine)); 255 | } 256 | } 257 | } 258 | 259 | inline SubRipParser::SubRipParser(std::string fileName) 260 | { 261 | _fileName = fileName; 262 | parse(fileName); 263 | } 264 | 265 | inline SubRipParser::~SubRipParser(void) 266 | { 267 | for(int i=0;i != _subtitles.size();++i) 268 | { 269 | if(_subtitles[i]) 270 | delete _subtitles[i]; 271 | } 272 | } 273 | 274 | //4. 
SubtitleItem class 275 | 276 | inline SubtitleItem::SubtitleItem(void) 277 | { 278 | } 279 | 280 | inline SubtitleItem::SubtitleItem(int subNo, std::string startTime,std::string endTime, std::string text, bool ignore, 281 | std::string justDialogue, int speakerCount, int nonDialogueCount, 282 | int styleTagCount, int wordCount, std::vector speaker, std::vector nonDialogue, 283 | std::vector styleTags, std::vector word) 284 | { 285 | _startTime = timeMSec(startTime); 286 | _endTime = timeMSec(endTime); 287 | _text = text; 288 | 289 | _subNo = subNo; 290 | _startTimeString = startTime; 291 | _endTimeString = endTime; 292 | _ignore = ignore; 293 | _justDialogue = justDialogue; 294 | _speakerCount = speakerCount; 295 | _nonDialogueCount = nonDialogueCount; 296 | _wordCount = wordCount; 297 | _speaker = speaker; 298 | _styleTagCount = styleTagCount; 299 | _styleTag = styleTags; 300 | _nonDialogue = nonDialogue; 301 | _word = word; 302 | 303 | extractInfo(); 304 | } 305 | 306 | inline long int SubtitleItem::timeMSec(std::string value) 307 | { 308 | std::vector t, secs; 309 | int hours, mins, seconds, milliseconds; 310 | 311 | t = split(value, ':', t); 312 | hours = atoi(t[0].c_str()); 313 | mins = atoi(t[1].c_str()); 314 | 315 | secs = split(t[2], ',', secs); 316 | seconds = atoi(secs[0].c_str()); 317 | milliseconds = atoi(secs[1].c_str()); 318 | 319 | return hours * 3600000 + mins * 60000 + seconds * 1000 + milliseconds; 320 | } 321 | 322 | inline long int SubtitleItem::getStartTime() const 323 | { 324 | return _startTime; 325 | } 326 | inline long int SubtitleItem::getEndTime() const 327 | { 328 | return _endTime; 329 | } 330 | 331 | inline std::string SubtitleItem::getText() const 332 | { 333 | return _text; 334 | } 335 | 336 | inline void SubtitleItem::setStartTime(long int startTime) 337 | { 338 | _startTime = startTime; 339 | } 340 | inline void SubtitleItem::setEndTime(long int endTime) 341 | { 342 | _endTime = endTime; 343 | } 344 | inline void 
SubtitleItem::setText(std::string text) 345 | { 346 | _text = text; 347 | } 348 | inline void SubtitleItem::setWordTimes(std::vector wordStartTime, std::vector wordEndTime, std::vector wordDuration) 349 | { 350 | _wordStartTime = wordStartTime; 351 | _wordEndTime = wordEndTime; 352 | _wordDuration = wordDuration; 353 | } 354 | inline int SubtitleItem::getSubNo() const 355 | { 356 | return _subNo; 357 | } 358 | inline std::string SubtitleItem::getStartTimeString() const 359 | { 360 | return _startTimeString; 361 | } 362 | 363 | inline std::string SubtitleItem::getEndTimeString() const 364 | { 365 | return _endTimeString; 366 | } 367 | 368 | inline bool SubtitleItem::getIgnoreStatus() const 369 | { 370 | if(_ignore) 371 | return true; 372 | 373 | else 374 | return false; 375 | 376 | } 377 | 378 | inline void SubtitleItem::extractInfo(bool keepHTML, bool doNotIgnoreNonDialogues, bool doNotRemoveSpeakerNames) //process subtitle 379 | { 380 | std::string output = _text; 381 | 382 | //stripping HTML tags 383 | if(!keepHTML) 384 | { 385 | /* 386 | * TODO : Before erasing, extract the words. 387 | * std::vector getStyleTags(); 388 | * int getStyleTagCount() const; 389 | * std::vector _styleTag; 390 | * int _styleTagCount; 391 | */ 392 | 393 | int countP = 0; 394 | for(char& c : output) // replacing <...> with ~~~~ 395 | { 396 | if(c=='<') 397 | { 398 | countP++; 399 | c = '~'; 400 | } 401 | 402 | else 403 | { 404 | if(countP!=0) 405 | { 406 | if(c != '>') 407 | c = '~'; 408 | 409 | else if(c == '>') 410 | { 411 | c = '~'; 412 | countP--; 413 | } 414 | } 415 | } 416 | } 417 | } 418 | 419 | //stripping non dialogue data e.g. (applause) 420 | 421 | if(!doNotIgnoreNonDialogues) 422 | { 423 | /* 424 | * TODO : Before erasing, extract the words. 425 | * std::vector getNonDialogueWords(); 426 | * int getNonDialogueCount() const; 427 | * std::vector _nonDialogue; 428 | * int _nonDialogueCount; 429 | */ 430 | 431 | int countP = 0; 432 | for(char& c : output) // replacing (...) 
with ~~~~ 433 | { 434 | if(c=='(') 435 | { 436 | countP++; 437 | c = '~'; 438 | } 439 | 440 | else 441 | { 442 | if(countP!=0) 443 | { 444 | if(c != ')') 445 | c = '~'; 446 | 447 | else if(c == ')') 448 | { 449 | c = '~'; 450 | countP--; 451 | } 452 | } 453 | } 454 | } 455 | } 456 | 457 | output.erase(std::remove(output.begin(), output.end(), '~'), output.end()); // deleting all ~ 458 | 459 | //Extracting speaker names 460 | if(!doNotRemoveSpeakerNames) 461 | { 462 | for(int i=0; output[i]!='\0';i++) 463 | { 464 | int colonIndex = 0, nameBeginIndex = 0; 465 | if(output[i]==':') //speaker found; travel back 466 | { 467 | _speakerCount++; 468 | colonIndex = i; 469 | 470 | int tempIndex = 0, foundEvilColon = 0, continueFlag = 0, spaceBeforeColon = 0; 471 | 472 | if(output[i-1] == ' ') 473 | spaceBeforeColon = 2; 474 | 475 | /* 476 | Possible Cases : 477 | 478 | Elon Musk: Hey Saurabh, you are pretty smart. // First and Last Name 479 | Saurabh: *_* What? Elon Musk: Yes! // Two names in single line 480 | Saurabh : OMG OMG! // Space before colon 481 | Elon: LOL World: LAMAO 482 | Saurabh: ._. // normal 483 | 484 | */ 485 | 486 | for(int j=i - spaceBeforeColon; j>=0;j--) 487 | { 488 | if(output[j] == '.' || output[j] == '!' || output[j] == ',' || output[j] == '?' || output[j] == '\n' 489 | || output[j] == ' ' || j== 0) 490 | { 491 | 492 | if(output[j] == '.' || output[j] == '!' || output[j] == ',' || output[j] == '?' 
|| j == 0) 493 | { 494 | if((continueFlag && j == 0)) 495 | { 496 | if(!isupper(output[j])) 497 | { 498 | nameBeginIndex = tempIndex; 499 | break; 500 | } 501 | 502 | else 503 | tempIndex = j; 504 | 505 | } 506 | 507 | else if(j!=0) 508 | tempIndex = j + 1; 509 | } 510 | 511 | else if(output[j] == ' ' && isupper(output[j+1])) 512 | { 513 | tempIndex = j; 514 | continueFlag = 1; 515 | 516 | continue; 517 | } 518 | 519 | else if(output[j] == ' ' && !isupper(output[j+1] && tempIndex == 0)) 520 | { 521 | _speakerCount--; 522 | foundEvilColon = 1; 523 | break; 524 | } 525 | 526 | nameBeginIndex = tempIndex; 527 | break; 528 | } 529 | } 530 | 531 | if(foundEvilColon) 532 | continue; 533 | 534 | i = nameBeginIndex; //compensating the removal and changes in index 535 | 536 | //check if there's a space after colon i.e. A: Hello vs A:Hello 537 | int removeSpace = 0; 538 | if(output[colonIndex + 1]==' ') 539 | removeSpace = 1; 540 | 541 | _speaker.push_back(output.substr(nameBeginIndex, colonIndex - nameBeginIndex)); 542 | output.erase(nameBeginIndex, colonIndex - nameBeginIndex + removeSpace); 543 | } 544 | 545 | } 546 | 547 | } 548 | 549 | // removing more than one whitespaces with one space 550 | unique_copy (output.begin(), output.end(), std::back_insert_iterator(_justDialogue), 551 | [](char a,char b) 552 | { 553 | return isspace(a) && isspace(b); 554 | }); 555 | 556 | // trimming whitespaces 557 | const char* whiteSpaces = " \t\n\r\f\v"; 558 | _justDialogue.erase(0, _justDialogue.find_first_not_of(whiteSpaces)); 559 | _justDialogue.erase(_justDialogue.find_last_not_of(whiteSpaces) + 1); 560 | 561 | if(_justDialogue.empty() || _justDialogue == " ") 562 | _ignore = true; 563 | 564 | else 565 | { 566 | _word = split(_justDialogue, ' ', _word); //extracting individual words 567 | _wordCount = _word.size(); 568 | } 569 | } 570 | 571 | inline std::string SubtitleItem::getDialogue(bool keepHTML, bool doNotIgnoreNonDialogues, bool doNotRemoveSpeakerNames) 572 | { 573 | 
if(_justDialogue.empty()) 574 | extractInfo(keepHTML, doNotIgnoreNonDialogues, doNotRemoveSpeakerNames); 575 | 576 | return _justDialogue; 577 | } 578 | inline int SubtitleItem::getSpeakerCount() const 579 | { 580 | return _speakerCount; 581 | } 582 | inline int SubtitleItem::getNonDialogueCount() const 583 | { 584 | return _nonDialogueCount; 585 | } 586 | inline int SubtitleItem::getStyleTagCount() const 587 | { 588 | return _styleTagCount; 589 | } 590 | inline int SubtitleItem::getWordCount() const 591 | { 592 | return _wordCount; 593 | } 594 | inline std::vector SubtitleItem::getSpeakerNames() 595 | { 596 | return _speaker; 597 | } 598 | inline std::vector SubtitleItem::getNonDialogueWords() 599 | { 600 | return _nonDialogue; 601 | } 602 | inline std::vector SubtitleItem::getIndividualWords() 603 | { 604 | return _word; 605 | } 606 | inline std::string SubtitleItem::getWordByIndex(int index) 607 | { 608 | return _word[index]; 609 | } 610 | inline std::vector SubtitleItem::getWordStartTimes() 611 | { 612 | return _wordStartTime; 613 | } 614 | inline std::vector SubtitleItem::getWordEndTimes() 615 | { 616 | return _wordEndTime; 617 | } 618 | inline long int SubtitleItem::getWordStartTimeByIndex(int index) 619 | { 620 | return _wordStartTime[index]; 621 | } 622 | inline long int SubtitleItem::getWordEndTimeByIndex(int index) 623 | { 624 | return _wordEndTime[index]; 625 | } 626 | inline std::vector SubtitleItem::getStyleTags() 627 | { 628 | return _styleTag; 629 | } 630 | inline SubtitleItem::~SubtitleItem(void) 631 | { 632 | 633 | } 634 | 635 | //5. 
SubtitleWordclass 636 | 637 | inline SubtitleWord::SubtitleWord(void) 638 | { 639 | _text = ""; 640 | } 641 | 642 | inline SubtitleWord::SubtitleWord(std::string text) 643 | { 644 | _text = text; 645 | } 646 | 647 | inline std::string SubtitleWord::getText() const 648 | { 649 | return _text; 650 | } 651 | 652 | inline SubtitleWord::~SubtitleWord(void) 653 | { 654 | } 655 | 656 | 657 | #endif //SRTPARSER_H --------------------------------------------------------------------------------