├── Makefile ├── README ├── spec ├── spec.cpp ├── spec.h ├── spec_main.cpp └── spec_wtf8.cpp └── wtf8.h /Makefile: -------------------------------------------------------------------------------- 1 | 2 | SRC=$(wildcard spec/*.cpp) 3 | OBJ=$(SRC:.cpp=.o) 4 | 5 | CPPFLAGS+=-Wall -Wextra -pedantic -g 6 | 7 | run: specsuite 8 | ./specsuite 9 | 10 | specsuite: $(OBJ) 11 | $(LINK.cpp) $^ -o $@ 12 | 13 | 14 | clean: 15 | $(RM) $(OBJ) specsuite 16 | 17 | 18 | spec/spec_wtf8.o: wtf8.h spec/spec.h 19 | 20 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | 2 | wtf8 3 | Utility functions for utf8 handling. 4 | 5 | 6 | 7 | wtf8_decode - decode a codepoint from utf8 bytes 8 | wtf8_encode - encode a codepoint to utf8 bytes 9 | wtf8_strlen - count the number of codepoints in a NUL-terminated utf8 string 10 | wtf8_strnlen - count the number of codepoints in at most n bytes of a utf8 string 11 | wtf8_is_initial_byte - is byte a valid first byte of a sequence 12 | wtf8_is_continuation_byte - is byte a valid continuation byte of a sequence 13 | 14 | The utf8 decoder used here is based on the work of Bjoern Hoehrmann. 15 | Copyright (c) 2008-2010 Bjoern Hoehrmann 16 | See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. 17 | 18 | All other code is public domain. 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /spec/spec.cpp: -------------------------------------------------------------------------------- 1 | /* Specific - Minimal C++ spec framework. 2 | 3 | 4 | The zlib/libpng License 5 | 6 | 7 | Copyright (c) 2008 Mikko Lehtonen 8 | 9 | This software is provided 'as-is', without any express or implied 10 | warranty. In no event will the authors be held liable for any damages 11 | arising from the use of this software.
12 | 13 | Permission is granted to anyone to use this software for any purpose, 14 | including commercial applications, and to alter it and redistribute it 15 | freely, subject to the following restrictions: 16 | 17 | 1. The origin of this software must not be misrepresented; you must not 18 | claim that you wrote the original software. If you use this software 19 | in a product, an acknowledgment in the product documentation would be 20 | appreciated but is not required. 21 | 22 | 2. Altered source versions must be plainly marked as such, and must not be 23 | misrepresented as being the original software. 24 | 25 | 3. This notice may not be removed or altered from any source 26 | distribution. 27 | */ 28 | 29 | 30 | #include "spec.h" 31 | 32 | #include 33 | 34 | namespace specific { 35 | 36 | 37 | 38 | void SpecWriter::startGroup(std::string /*group*/, std::string /*description*/) {} 39 | 40 | void SpecWriter::addFailedAssertation(const std::string& msg, const char *file, int line, const std::string& group, const std::string& description) { 41 | mFailures.push_back( SpecFailure(msg, file, line, group, description) ); 42 | } 43 | void SpecWriter::addSpecResult(SpecResult r) { 44 | mResults.push_back( r ); 45 | } 46 | void SpecWriter::start() {} 47 | void SpecWriter::stop() { 48 | std::cout << std::endl; 49 | size_t nth = 0; 50 | std::string lastgroup = ""; 51 | std::string lastdescription = ""; 52 | for(std::vector::iterator i=mFailures.begin(); i != mFailures.end(); ++i, ++nth) 53 | { 54 | std::cout << std::endl; 55 | if(lastgroup != i->group || lastdescription != i->description) { 56 | 57 | lastgroup = i->group; 58 | lastdescription = i->description; 59 | 60 | std::cout << std::endl; 61 | std::cout << "In " << i->group << ": " << i->description << ":" << std::endl << std::endl; 62 | } 63 | std::cout << " " << (nth+1) << ") Failed assertation at " << i->file << ":" 64 | << i->line << ":" << std::endl << " " << i->msg << std::endl; 65 | } 66 | std::cout << 
std::endl << mResults.size() << " examples, " << mFailures.size() << " failures" << std::endl; 67 | 68 | } 69 | 70 | 71 | 72 | void ProgressWriter::addSpecResult(SpecResult r) { 73 | SpecWriter::addSpecResult(r); 74 | switch(r.type) { 75 | case SpecResult::PASSED: 76 | std::cout << "."; 77 | break; 78 | case SpecResult::FAILED: 79 | std::cout << "F"; 80 | break; 81 | case SpecResult::ERRORED: 82 | std::cout << "E"; 83 | break; 84 | } 85 | std::cout << std::flush; 86 | } 87 | 88 | 89 | 90 | void SpecdocWriter::startGroup(std::string group, std::string description) { 91 | std::cout << group << ": " << description << std::endl; 92 | } 93 | 94 | 95 | void SpecdocWriter::addSpecResult(SpecResult r) { 96 | SpecWriter::addSpecResult(r); 97 | size_t nth = mFailures.size(); 98 | std::cout << "- " << r.test; 99 | switch(r.type) { 100 | case SpecResult::PASSED: 101 | std::cout << " [OK]"; 102 | break; 103 | case SpecResult::FAILED: 104 | std::cout << " [FAILED - " << nth << "]"; 105 | break; 106 | case SpecResult::ERRORED: 107 | std::cout << " [ERROR - "<< nth <<"]"; 108 | break; 109 | } 110 | std::cout << std::endl; 111 | } 112 | 113 | 114 | 115 | 116 | class spec_failure {}; 117 | 118 | 119 | 120 | SpecBase::SpecBase() : mWriter(NULL), mName(NULL), 121 | mFailed(false), mLastFailed(false), mError(false), mExecutionPoint(0), mContinuePoint(0) 122 | { 123 | SpecRunner::getInstance().add(this); 124 | } 125 | 126 | 127 | SpecBase::~SpecBase() { 128 | 129 | } 130 | 131 | 132 | bool SpecBase::startSpec(const char* name) 133 | { 134 | endSpec(); 135 | 136 | mExecutionPoint++; 137 | if(mExecutionPoint <= mContinuePoint) return false; 138 | mContinuePoint++; 139 | 140 | mName = name; 141 | return true; 142 | } 143 | 144 | 145 | void SpecBase::endSpec() 146 | { 147 | if(!mName) return; 148 | 149 | SpecResult r; 150 | r.group = getGroup(); 151 | r.description = getDescription(); 152 | r.type = SpecResult::PASSED; 153 | if(mLastFailed) r.type = SpecResult::FAILED; 154 | if(mError) 
r.type = SpecResult::ERRORED; 155 | r.test = mName; 156 | mWriter->addSpecResult( r ); 157 | 158 | mName = NULL; 159 | } 160 | 161 | 162 | void SpecBase::should_test(bool value, const char* message, const char* file, int line) { 163 | mLastFailed=false; 164 | if(!value) { 165 | mWriter->addFailedAssertation(message, file, line, getGroup(), getDescription()); 166 | mLastFailed = mFailed = true; 167 | throw spec_failure(); 168 | } 169 | } 170 | 171 | 172 | void SpecBase::error(std::string msg) { 173 | mWriter->addFailedAssertation(msg, "(exception)", 0, getGroup(), getDescription()); 174 | mLastFailed = true; 175 | mFailed = true; 176 | mError = true; 177 | } 178 | 179 | bool SpecBase::done() { 180 | if( mError ) { 181 | mError = false; 182 | return false; 183 | } 184 | return true; 185 | } 186 | 187 | 188 | SpecRunner::SpecRunner() {} 189 | SpecRunner::~SpecRunner() { } 190 | 191 | SpecRunner& SpecRunner::getInstance() { 192 | static SpecRunner* instance = NULL; 193 | if( instance == NULL ) { 194 | instance = new SpecRunner; 195 | } 196 | return *instance; 197 | } 198 | 199 | 200 | bool SpecRunner::run(SpecWriter& writer, const std::string subset) { 201 | bool success = true; 202 | 203 | writer.start(); 204 | std::vector::iterator i = mSpecs.begin(); 205 | for(; i != mSpecs.end(); ++i) { 206 | SpecBase *b = *i; 207 | if( b->getGroup().find(subset, 0) == std::string::npos ) continue; 208 | b->mContinuePoint = 0; 209 | b->setWriter(&writer); 210 | writer.startGroup( b->getGroup(), b->getDescription() ); 211 | do { 212 | b->mExecutionPoint = 0; 213 | try { 214 | b->specify(); 215 | } catch(spec_failure& /*e*/) { 216 | b->mError=true; 217 | } catch( std::exception& e) { 218 | b->error(e.what()); 219 | } catch( ... 
) { 220 | b->error("unknown exception"); 221 | } 222 | b->endSpec(); 223 | 224 | } while( !b->done() ); 225 | 226 | success = success && b->isSuccessful(); 227 | 228 | } 229 | writer.stop(); 230 | 231 | return success; 232 | } 233 | 234 | 235 | } 236 | 237 | 238 | 239 | 240 | -------------------------------------------------------------------------------- /spec/spec.h: -------------------------------------------------------------------------------- 1 | /* Specific - Minimal C++ spec framework. 2 | 3 | 4 | The zlib/libpng License 5 | 6 | 7 | Copyright (c) 2008 Mikko Lehtonen 8 | 9 | This software is provided 'as-is', without any express or implied 10 | warranty. In no event will the authors be held liable for any damages 11 | arising from the use of this software. 12 | 13 | Permission is granted to anyone to use this software for any purpose, 14 | including commercial applications, and to alter it and redistribute it 15 | freely, subject to the following restrictions: 16 | 17 | 1. The origin of this software must not be misrepresented; you must not 18 | claim that you wrote the original software. If you use this software 19 | in a product, an acknowledgment in the product documentation would be 20 | appreciated but is not required. 21 | 22 | 2. Altered source versions must be plainly marked as such, and must not be 23 | misrepresented as being the original software. 24 | 25 | 3. This notice may not be removed or altered from any source 26 | distribution. 
27 | */ 28 | 29 | 30 | #ifndef SPECIFIC_SPEC_H 31 | #define SPECIFIC_SPEC_H 32 | 33 | #include 34 | #include 35 | #include 36 | #include 37 | 38 | namespace specific { 39 | 40 | 41 | class SpecResult { 42 | public: 43 | typedef enum { 44 | PASSED, 45 | FAILED, 46 | ERRORED 47 | } Type; 48 | 49 | Type type; 50 | 51 | std::string group; 52 | std::string description; 53 | std::string test; 54 | }; 55 | 56 | 57 | class SpecFailure { 58 | public: 59 | SpecFailure(const std::string& amsg, const char* afile, int aline, const std::string& agroup, const std::string& adescription) 60 | : msg(amsg), file(afile), line(aline), group(agroup), description(adescription) { } 61 | std::string msg; 62 | const char* file; 63 | int line; 64 | std::string group; 65 | std::string description; 66 | }; 67 | 68 | 69 | class SpecWriter { 70 | public: 71 | std::vector mResults; 72 | std::vector mFailures; 73 | SpecWriter() {} 74 | virtual ~SpecWriter() {} 75 | virtual void startGroup(std::string group, std::string description); 76 | virtual void addFailedAssertation(const std::string& msg, const char *file, int line, const std::string& group, const std::string& description); 77 | virtual void addSpecResult(SpecResult r); 78 | virtual void start(); 79 | virtual void stop(); 80 | }; 81 | 82 | 83 | class ProgressWriter : public SpecWriter { 84 | public: 85 | void addSpecResult(SpecResult r); 86 | }; 87 | 88 | 89 | 90 | class SpecdocWriter : public SpecWriter { 91 | public: 92 | void startGroup(std::string group, std::string description); 93 | void addSpecResult(SpecResult r); 94 | }; 95 | 96 | 97 | 98 | template std::string inspect(const T& value) { 99 | std::stringstream ss; 100 | ss << value; 101 | return ss.str(); 102 | } 103 | 104 | 105 | class SpecBase { 106 | public: 107 | SpecBase(); 108 | virtual ~SpecBase(); 109 | 110 | virtual void specify() = 0; 111 | 112 | void setWriter(SpecWriter* w) { mWriter = w; } 113 | 114 | bool startSpec(const char* name); 115 | void endSpec(); 116 | 117 | 
void should_test(bool value, const char* message, const char* file, int line); 118 | 119 | template void should_equal_template(const T1& a, const T2& b, const char* file, int line) { 120 | std::stringstream ss; 121 | ss << "`" << ::specific::inspect(a) << "'" << " == " << "`" << ::specific::inspect(b) << "'"; 122 | should_test( a == b, ss.str().c_str(), file, line); 123 | } 124 | 125 | template void should_not_equal_template(const T1& a, const T2& b, const char* file, int line) { 126 | std::stringstream ss; 127 | ss << "`" << ::specific::inspect(a) << "'" << " != " << "`" << ::specific::inspect(b) << "'"; 128 | should_test( a != b, ss.str().c_str(), file, line); 129 | } 130 | 131 | 132 | 133 | virtual std::string getGroup() = 0; 134 | virtual std::string getDescription() = 0; 135 | 136 | bool isSuccessful() { return !mFailed; } 137 | 138 | bool done(); 139 | 140 | void error(std::string msg); 141 | 142 | SpecWriter* mWriter; 143 | const char* mName; 144 | bool mFailed; 145 | bool mLastFailed; 146 | bool mError; 147 | int mExecutionPoint; 148 | int mContinuePoint; 149 | char *mFile; 150 | std::string mErrorMessage; 151 | int mLine; 152 | }; 153 | 154 | 155 | class SpecRunner { 156 | public: 157 | static SpecRunner& getInstance(); 158 | void add(SpecBase* spec) { mSpecs.push_back( spec ); } 159 | bool run(SpecWriter& writer, const std::string subset = ""); 160 | private: 161 | 162 | std::vector mSpecs; 163 | 164 | SpecRunner(); 165 | ~SpecRunner(); 166 | }; 167 | 168 | #define SPEC_UNIQUE_NAME3(x,y) x##y 169 | #define SPEC_UNIQUE_NAME2(x,y) SPEC_UNIQUE_NAME3(x,y) 170 | 171 | #define SPEC_NAME(x) SPEC_UNIQUE_NAME2(SPEC_##x, SPEC_UNIQUE_NAME2(_startingOnLine, __LINE__) ) 172 | 173 | 174 | #define describe(group, description) \ 175 | class SPEC_NAME(group) : public specific::SpecBase \ 176 | { \ 177 | public: \ 178 | void specify(); \ 179 | std::string getGroup() { return #group; } \ 180 | std::string getDescription() { return description; } \ 181 | }; \ 182 | static 
SPEC_NAME(group) SPEC_UNIQUE_NAME2(SPEC_NAME(group), _instance); \ 183 | void SPEC_NAME(group)::specify() 184 | 185 | 186 | #define it(description) if(startSpec(description)) 187 | 188 | 189 | // Matchers 190 | #define should_be_true(a) should_test(a, #a, __FILE__, __LINE__) 191 | #define should_be_false(a) should_be_true( !a ) 192 | 193 | #ifndef SPECIFIC_NO_OSTREAM 194 | #define should_equal(a, b) should_equal_template( a,b, __FILE__, __LINE__ ) 195 | #define should_not_equal(a, b) should_not_equal_template( a,b, __FILE__, __LINE__ ) 196 | #else 197 | #define should_equal(a, b) should_be_true( (a) == (b) ) 198 | #define should_not_equal(a, b) should_be_true( (a) != (b) ) 199 | #endif 200 | 201 | #define should_throw(code, what) \ 202 | do { \ 203 | bool _thrown = false; \ 204 | try { \ 205 | code ; \ 206 | } catch(what& /*e*/) { \ 207 | _thrown = true; \ 208 | } \ 209 | should_test(_thrown, "should throw exception " #what, __FILE__, __LINE__); \ 210 | } while(0) 211 | 212 | 213 | 214 | } 215 | 216 | 217 | 218 | #endif /* Include guard */ 219 | 220 | -------------------------------------------------------------------------------- /spec/spec_main.cpp: -------------------------------------------------------------------------------- 1 | /* Specific - Minimal C++ spec framework. 2 | 3 | 4 | The zlib/libpng License 5 | 6 | 7 | Copyright (c) 2008 Mikko Lehtonen 8 | 9 | This software is provided 'as-is', without any express or implied 10 | warranty. In no event will the authors be held liable for any damages 11 | arising from the use of this software. 12 | 13 | Permission is granted to anyone to use this software for any purpose, 14 | including commercial applications, and to alter it and redistribute it 15 | freely, subject to the following restrictions: 16 | 17 | 1. The origin of this software must not be misrepresented; you must not 18 | claim that you wrote the original software. 
If you use this software 19 | in a product, an acknowledgment in the product documentation would be 20 | appreciated but is not required. 21 | 22 | 2. Altered source versions must be plainly marked as such, and must not be 23 | misrepresented as being the original software. 24 | 25 | 3. This notice may not be removed or altered from any source 26 | distribution. 27 | */ 28 | 29 | 30 | #include "spec.h" 31 | #include 32 | 33 | int main(int argc, char *argv[]) 34 | { 35 | 36 | std::string subset(""); 37 | 38 | specific::ProgressWriter progressWriter; 39 | specific::SpecdocWriter specdocWriter; 40 | specific::SpecWriter* writer = &progressWriter; 41 | 42 | for(size_t i = 1; i < size_t(argc); ++i) { 43 | if( std::string("-s") == argv[i] ) { 44 | writer = &specdocWriter; 45 | } else { 46 | subset = argv[i]; 47 | } 48 | } 49 | 50 | 51 | bool success = specific::SpecRunner::getInstance().run(*writer, subset); 52 | 53 | return success ? EXIT_SUCCESS : EXIT_FAILURE; 54 | } 55 | 56 | -------------------------------------------------------------------------------- /spec/spec_wtf8.cpp: -------------------------------------------------------------------------------- 1 | #include "../wtf8.h" 2 | 3 | #include "spec.h" 4 | 5 | 6 | describe(wtf8, "wtf8_decode") { 7 | 8 | it("should decode codepoint from utf8 string in the single octet range 00-7f") { 9 | 10 | const unsigned char ustr1[]={ 1 }; 11 | const unsigned char ustr2[]={ 0x32 }; 12 | const unsigned char ustr3[]={ 0x7f }; 13 | const unsigned char ustr_er[]={ 0x80 }; 14 | 15 | const char* str1 = (const char*)ustr1; 16 | const char* str2 = (const char*)ustr2; 17 | const char* str3 = (const char*)ustr3; 18 | const char* str_er = (const char*)ustr_er; 19 | 20 | unsigned int codepoint = 0; 21 | 22 | const char* res = 0; 23 | 24 | codepoint = 0; 25 | res = wtf8_decode(str1, 1, &codepoint); 26 | should_equal( codepoint, 1u); 27 | should_equal( res, str1+1 ); 28 | 29 | codepoint = 0; 30 | res = wtf8_decode(str2, 1, &codepoint); 31 | 
should_equal( codepoint, 0x32u); 32 | should_equal( res, str2+1 ); 33 | 34 | codepoint = 0; 35 | res = wtf8_decode(str3, 1, &codepoint); 36 | should_equal( codepoint, 0x7fu); 37 | should_equal( res, str3+1 ); 38 | 39 | codepoint = 0; 40 | res = wtf8_decode(str_er, 1, &codepoint); 41 | should_equal( codepoint, 0xfffdu); 42 | should_equal( res, str_er+1 ); 43 | 44 | } 45 | 46 | it("should decode codepoint from utf8 string in the two octet range 80-7ff") { 47 | 48 | const unsigned char ustr1[]={ 0xc2u, 0x80u }; 49 | const unsigned char ustr2[]={ 0xc4u, 0x80u }; 50 | const unsigned char ustr3[]={ 0xdfu, 0xbfu }; 51 | const unsigned char ustr_er[]={ 0xdfu, 0xc0u }; 52 | const unsigned char ustr_er2[]={ 0xdfu }; 53 | 54 | const char* str1 = (const char*)ustr1; 55 | const char* str2 = (const char*)ustr2; 56 | const char* str3 = (const char*)ustr3; 57 | const char* str_er = (const char*)ustr_er; 58 | const char* str_er2 = (const char*)ustr_er2; 59 | 60 | unsigned int codepoint = 0; 61 | 62 | const char* res = 0; 63 | 64 | codepoint = 0; 65 | res = wtf8_decode(str1, 2, &codepoint); 66 | should_equal( codepoint, 0x80u); 67 | should_equal( res, str1+2 ); 68 | 69 | codepoint = 0; 70 | res = wtf8_decode(str2, 2, &codepoint); 71 | should_equal( codepoint, 0x100u); 72 | should_equal( res, str2+2 ); 73 | 74 | codepoint = 0; 75 | res = wtf8_decode(str3, 2, &codepoint); 76 | should_equal( codepoint, 0x7ffu); 77 | should_equal( res, str3+2 ); 78 | 79 | codepoint = 0; 80 | res = wtf8_decode(str_er, 2, &codepoint); 81 | should_equal( codepoint, 0xfffdu); 82 | should_equal( res, str_er+2 ); 83 | 84 | codepoint = 0; 85 | res = wtf8_decode(str_er2, 1, &codepoint); 86 | should_equal( codepoint, 0xfffdu); 87 | should_equal( res, str_er2+1 ); 88 | 89 | } 90 | 91 | it("should decode codepoint from utf8 string in the three octet range 800-ffff") { 92 | 93 | const unsigned char ustr1[]={ 0xe0u, 0xa0u, 0x80u }; 94 | const unsigned char ustr2[]={ 0xe1u, 0x80u, 0x80u }; 95 | const unsigned char 
ustr3[]={ 0xefu, 0xbfu, 0xbfu }; 96 | const unsigned char ustr_er[]={ 0xefu, 0xbfu, 0xc0u }; 97 | const unsigned char ustr_er2[]={ 0xefu, 0xbfu }; 98 | 99 | const char* str1 = (const char*)ustr1; 100 | const char* str2 = (const char*)ustr2; 101 | const char* str3 = (const char*)ustr3; 102 | const char* str_er = (const char*)ustr_er; 103 | const char* str_er2 = (const char*)ustr_er2; 104 | 105 | unsigned int codepoint = 0; 106 | 107 | const char* res = 0; 108 | 109 | codepoint = 0; 110 | res = wtf8_decode(str1, 3, &codepoint); 111 | should_equal( codepoint, 0x800u); 112 | should_equal( res, str1+3 ); 113 | 114 | codepoint = 0; 115 | res = wtf8_decode(str2, 3, &codepoint); 116 | should_equal( codepoint, 0x1000u); 117 | should_equal( res, str2+3 ); 118 | 119 | codepoint = 0; 120 | res = wtf8_decode(str3, 3, &codepoint); 121 | should_equal( codepoint, 0xffffu); 122 | should_equal( res, str3+3 ); 123 | 124 | codepoint = 0; 125 | res = wtf8_decode(str_er, 3, &codepoint); 126 | should_equal( codepoint, 0xfffdu); 127 | should_equal( res, str_er+3 ); 128 | 129 | codepoint = 0; 130 | res = wtf8_decode(str_er2, 2, &codepoint); 131 | should_equal( codepoint, 0xfffdu); 132 | should_equal( res, str_er2+2 ); 133 | 134 | } 135 | 136 | it("should decode codepoint from utf8 string in the four octet range 10000-1ffff") { 137 | 138 | const unsigned char ustr1[]={ 0xf0u, 0x90u, 0x80u, 0x80u }; 139 | const unsigned char ustr2[]={ 0xf0u, 0x92u, 0x80u, 0x80u }; 140 | const unsigned char ustr3[]={ 0xf0u, 0x9fu, 0xbfu, 0xbfu }; 141 | const unsigned char ustr_er[]={ 0xf0u, 0x9fu, 0xbfu, 0xc0u }; 142 | const unsigned char ustr_er2[]={ 0xf0u, 0x9fu, 0xbfu }; 143 | 144 | const char* str1 = (const char*)ustr1; 145 | const char* str2 = (const char*)ustr2; 146 | const char* str3 = (const char*)ustr3; 147 | const char* str_er = (const char*)ustr_er; 148 | const char* str_er2 = (const char*)ustr_er2; 149 | 150 | unsigned int codepoint = 0; 151 | 152 | const char* res = 0; 153 | 154 | codepoint = 0; 
155 | res = wtf8_decode(str1, 4, &codepoint); 156 | should_equal( codepoint, 0x10000u); 157 | should_equal( res, str1+4 ); 158 | 159 | codepoint = 0; 160 | res = wtf8_decode(str2, 4, &codepoint); 161 | should_equal( codepoint, 0x12000u); 162 | should_equal( res, str2+4 ); 163 | 164 | codepoint = 0; 165 | res = wtf8_decode(str3, 4, &codepoint); 166 | should_equal( codepoint, 0x1ffffu); 167 | should_equal( res, str3+4 ); 168 | 169 | codepoint = 0; 170 | res = wtf8_decode(str_er, 4, &codepoint); 171 | should_equal( codepoint, 0xfffdu); 172 | should_equal( res, str_er+4 ); 173 | 174 | codepoint = 0; 175 | res = wtf8_decode(str_er2, 3, &codepoint); 176 | should_equal( codepoint, 0xfffdu); 177 | should_equal( res, str_er2+3 ); 178 | 179 | } 180 | 181 | it("should not allow overlong sequences") { 182 | const unsigned char ustr1[]={ 0xc0u, 0xafu }; 183 | const unsigned char ustr2[]={ 0xe0u, 0x80u, 0xafu }; 184 | const unsigned char ustr3[]={ 0xf0u, 0x80u, 0x80u, 0xafu }; 185 | const unsigned char ustr4[]={ 0xf8u, 0x80u, 0x80u, 0x80u, 0xafu }; 186 | const unsigned char ustr5[]={ 0xfcu, 0x80u, 0x80u, 0x80u, 0x80u, 0xafu }; 187 | 188 | const char* str1 = (const char*)ustr1; 189 | const char* str2 = (const char*)ustr2; 190 | const char* str3 = (const char*)ustr3; 191 | const char* str4 = (const char*)ustr4; 192 | const char* str5 = (const char*)ustr5; 193 | 194 | unsigned int codepoint; 195 | 196 | codepoint = 0; 197 | wtf8_decode(str1, 2, &codepoint); 198 | should_equal( codepoint, 0xfffdu); 199 | 200 | codepoint = 0; 201 | wtf8_decode(str2, 3, &codepoint); 202 | should_equal( codepoint, 0xfffdu); 203 | 204 | codepoint = 0; 205 | wtf8_decode(str3, 4, &codepoint); 206 | should_equal( codepoint, 0xfffdu); 207 | 208 | codepoint = 0; 209 | wtf8_decode(str4, 5, &codepoint); 210 | should_equal( codepoint, 0xfffdu); 211 | 212 | codepoint = 0; 213 | wtf8_decode(str5, 6, &codepoint); 214 | should_equal( codepoint, 0xfffdu); 215 | 216 | 217 | } 218 | 219 | it("should not allow maximum 
overlong sequences") { 220 | const unsigned char ustr1[]={ 0xc1u, 0xbfu }; 221 | const unsigned char ustr2[]={ 0xe0u, 0x9fu, 0xbfu }; 222 | const unsigned char ustr3[]={ 0xf0u, 0x8fu, 0xbfu, 0xbfu }; 223 | const unsigned char ustr4[]={ 0xf8u, 0x87u, 0xbfu, 0xbfu, 0xbfu }; 224 | const unsigned char ustr5[]={ 0xfcu, 0x83u, 0xbfu, 0xbfu, 0xbfu, 0xbfu }; 225 | 226 | const char* str1 = (const char*)ustr1; 227 | const char* str2 = (const char*)ustr2; 228 | const char* str3 = (const char*)ustr3; 229 | const char* str4 = (const char*)ustr4; 230 | const char* str5 = (const char*)ustr5; 231 | 232 | unsigned int codepoint; 233 | 234 | codepoint = 0; 235 | wtf8_decode(str1, 2, &codepoint); 236 | should_equal( codepoint, 0xfffdu); 237 | 238 | codepoint = 0; 239 | wtf8_decode(str2, 3, &codepoint); 240 | should_equal( codepoint, 0xfffdu); 241 | 242 | codepoint = 0; 243 | wtf8_decode(str3, 4, &codepoint); 244 | should_equal( codepoint, 0xfffdu); 245 | 246 | codepoint = 0; 247 | wtf8_decode(str4, 5, &codepoint); 248 | should_equal( codepoint, 0xfffdu); 249 | 250 | codepoint = 0; 251 | wtf8_decode(str5, 6, &codepoint); 252 | should_equal( codepoint, 0xfffdu); 253 | 254 | 255 | } 256 | 257 | it("should not allow codepoints designated as surrogates") { 258 | for(size_t i = 0xa0; i <= 0xbf; ++i) { 259 | for(size_t j = 0x80; j <= 0xbf; ++j) { 260 | const unsigned char ustr1[]={ (unsigned char)0xedu, (unsigned char)i, (unsigned char)j }; 261 | const char* str1 = (const char*)ustr1; 262 | unsigned int codepoint = 0; 263 | wtf8_decode(str1, 3, &codepoint); 264 | should_equal( codepoint, 0xfffdu); 265 | } 266 | } 267 | } 268 | } 269 | 270 | 271 | 272 | describe(wtf8, "wtf8_encode") { 273 | 274 | it("should encode all valid codepoints to utf8") { 275 | char buf[8]; 276 | for(unsigned int i = 0; i < 0x1ffff; ++i) 277 | { 278 | // Skip surrogates, as they are not allowed in utf8 279 | if( i >= 0xd800 && i <= 0xdfff ) continue; 280 | 281 | memset(buf, 0, 8); 282 | 283 | const char* ret1 = 
wtf8_encode(i, buf); 284 | uint32_t res = 0; 285 | const char* ret2 = wtf8_decode(buf,7,&res); 286 | should_equal( i, res ); 287 | should_equal( ret1, ret2 ); 288 | } 289 | } 290 | } 291 | 292 | 293 | describe(wtf8, "wtf8_strlen") { 294 | 295 | it("should count distinct codepoints") { 296 | 297 | const char* str1 = "foobar"; 298 | const char* str2 = "foob\xc3\xa6r"; 299 | const char* str3 = "foob\xf0\x9f\x99\x88r"; 300 | 301 | should_equal( wtf8_strlen(str1), 6); 302 | should_equal( wtf8_strlen(str2), 6); 303 | should_equal( wtf8_strlen(str3), 6); 304 | 305 | } 306 | 307 | } 308 | 309 | describe(wtf8, "wtf8_strnlen") { 310 | 311 | it("should count distinct codepoints") { 312 | 313 | const char* str1 = "foobar"; 314 | const char* str2 = "foob\xc3\xa6r"; 315 | const char* str3 = "foob\xf0\x9f\x99\x88r"; 316 | 317 | should_equal( wtf8_strnlen(str1,6), 6); 318 | should_equal( wtf8_strnlen(str2,7), 6); 319 | should_equal( wtf8_strnlen(str3,9), 6); 320 | 321 | } 322 | 323 | } 324 | 325 | describe(wtf8, "wtf8_is_continuation_byte") { 326 | it("should return true if a given byte is not the initial byte of a utf8 sequence") { 327 | const char* str1 = "f"; 328 | const char* str2 = "f\xc3\xa6r"; 329 | const char* str3 = "f\xf0\x9f\x99\x88r"; 330 | should_be_false( wtf8_is_continuation_byte( str1[0] ) ); 331 | 332 | should_be_false( wtf8_is_continuation_byte( str2[0] ) ); 333 | should_be_false( wtf8_is_continuation_byte( str2[1] ) ); 334 | should_be_true( wtf8_is_continuation_byte( str2[2] ) ); 335 | 336 | should_be_false( wtf8_is_continuation_byte( str3[0] ) ); 337 | should_be_false( wtf8_is_continuation_byte( str3[1] ) ); 338 | should_be_true( wtf8_is_continuation_byte( str3[2] ) ); 339 | should_be_true( wtf8_is_continuation_byte( str3[3] ) ); 340 | should_be_true( wtf8_is_continuation_byte( str3[4] ) ); 341 | } 342 | } 343 | 344 | describe(wtf8, "wtf8_is_initial_byte") { 345 | it("should return true if a given byte is the initial byte of a utf8 sequence") { 346 | const 
char* str1 = "f"; 347 | const char* str2 = "f\xc3\xa6r"; 348 | const char* str3 = "f\xf0\x9f\x99\x88r"; 349 | should_be_true( wtf8_is_initial_byte( str1[0] ) ); 350 | 351 | should_be_true( wtf8_is_initial_byte( str2[0] ) ); 352 | should_be_true( wtf8_is_initial_byte( str2[1] ) ); 353 | should_be_false( wtf8_is_initial_byte( str2[2] ) ); 354 | 355 | should_be_true( wtf8_is_initial_byte( str3[0] ) ); 356 | should_be_true( wtf8_is_initial_byte( str3[1] ) ); 357 | should_be_false( wtf8_is_initial_byte( str3[2] ) ); 358 | should_be_false( wtf8_is_initial_byte( str3[3] ) ); 359 | should_be_false( wtf8_is_initial_byte( str3[4] ) ); 360 | 361 | // 0 - 7f 362 | for (int i = 0; i <= 0x7f; ++i) { 363 | should_be_true(wtf8_is_initial_byte(i)); 364 | } 365 | // 80 - 7ff 366 | for (int i = 0; i <= 0x1f; ++i) { 367 | should_be_true(wtf8_is_initial_byte(i | 0xc0)); 368 | } 369 | // 800 - ffff 370 | for (int i = 0; i <= 0xf; ++i) { 371 | should_be_true(wtf8_is_initial_byte(i | 0xe0)); 372 | } 373 | // 10000 - 10ffff 374 | for (int i = 0; i <= 0x7; ++i) { 375 | should_be_true(wtf8_is_initial_byte(i | 0xf0)); 376 | } 377 | 378 | // continuation bytes 379 | for (int i = 0x0; i <= 0x3f; ++i) { 380 | should_be_false(wtf8_is_initial_byte(i | 0x80)); 381 | } 382 | 383 | // remaining bytes 384 | for (int i = 0xf8; i <= 0xff; ++i) { 385 | should_be_false(wtf8_is_initial_byte(i)); 386 | } 387 | 388 | } 389 | } 390 | -------------------------------------------------------------------------------- /wtf8.h: -------------------------------------------------------------------------------- 1 | // WTF8 - Public domain, except for the utf8 decoder below 2 | 3 | // Copyright (c) 2008-2010 Bjoern Hoehrmann 4 | // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. 
// Copyright (c) 2008-2009 Bjoern Hoehrmann
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

#ifndef WTF8_WTF8_H
#define WTF8_WTF8_H


#ifdef _WIN32
#define uint32_t unsigned int
#define uint8_t unsigned char
#else
#include <stdint.h>
#endif




#define UTF8_ACCEPT 0
#define UTF8_REJECT 12

static const uint8_t wtf8_utf8d[] = {
    // The first part of the table maps bytes to character classes, to
    // reduce the size of the transition table and create bitmasks.
     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
     7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
     8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,

    // The second part is a transition table that maps a combination
    // of a state of the automaton and a character class to a state.
     0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
    12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
    12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
    12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
    12,36,12,12,12,12,12,12,12,12,12,12,
};

/** Decode a utf8 codepoint one byte at a time. Uses an explicitly user-provided
 * state variable that must be set to UTF8_ACCEPT (zero) before the first byte.
 * The codepoint is accumulated in codep and is complete when UTF8_ACCEPT is returned.
 * Once UTF8_REJECT is returned the state stays at UTF8_REJECT for every further
 * byte; the caller has to set it back to UTF8_ACCEPT to resume decoding.
 * byte indexes the 256-entry class table directly, so it must be in 0..255.
 * @return The new state: UTF8_ACCEPT, UTF8_REJECT, or another value while in mid-sequence
 */
static inline uint32_t wtf8_decode_state(uint32_t* state, uint32_t* codep, uint32_t byte) {
    uint32_t type = wtf8_utf8d[byte];

    *codep = (*state != UTF8_ACCEPT) ?
        (byte & 0x3fu) | (*codep << 6) :
        (0xff >> type) & (byte);

    *state = wtf8_utf8d[256 + *state + type];
    return *state;
}



/** Decode a utf8 codepoint from a byte array. Reads at most maxbytes bytes from str.
 * Places the codepoint in result; result is 0xfffd when the bytes are invalid,
 * when the sequence is cut off by maxbytes, or when maxbytes is zero or negative.
 * A byte that makes the sequence invalid is consumed as part of it.
 * @return The position after the last byte read
 */
static inline const char* wtf8_decode(const char* str, int maxbytes, uint32_t* result) {

    const unsigned char* ustr = (const unsigned char*)str;
    uint32_t state = UTF8_ACCEPT;

    // "> 0" so that a negative maxbytes reads nothing instead of counting
    // down past the end of the buffer.
    while(maxbytes-- > 0) {
        uint32_t res = wtf8_decode_state(&state, result, *ustr);
        ustr++;
        if(res == UTF8_ACCEPT) return (const char*)ustr;
        if(res == UTF8_REJECT) break;
    }

    *result = 0xfffd;

    return (const char*)ustr;

}


/** Encode a codepoint as utf8 into str, which must have room for up to 4 bytes.
 * No terminating NUL is written. Codepoints above 0x10ffff are not encodable:
 * nothing is written for them and str itself is returned. Surrogate values
 * (0xd800-0xdfff) are not filtered and are written as three bytes.
 * @return The position after the last byte written
 */
static inline const char* wtf8_encode(uint32_t codepoint, char* str) {

    unsigned char* ustr = (unsigned char*)str;
    if( codepoint <= 0x7f) {
        ustr[0] = (unsigned char)codepoint;
        ustr+=1;
    } else if( codepoint <= 0x7ff ) {
        ustr[0] = (unsigned char) (0xc0 + (codepoint >> 6));
        ustr[1] = (unsigned char) (0x80 + (codepoint & 0x3f));
        ustr+=2;
    } else if( codepoint <= 0xffff) {
        ustr[0] = (unsigned char) (0xe0 + (codepoint >> 12));
        ustr[1] = (unsigned char) (0x80 + ((codepoint >> 6) & 0x3f));
        ustr[2] = (unsigned char) (0x80 + (codepoint & 0x3f));
        ustr+=3;
    } else if( codepoint <= 0x10ffff) {
        // Four-byte form covers everything up to the last Unicode codepoint.
        ustr[0] = (unsigned char) (0xf0 + (codepoint >> 18));
        ustr[1] = (unsigned char) (0x80 + ((codepoint >> 12) & 0x3f));
        ustr[2] = (unsigned char) (0x80 + ((codepoint >> 6) & 0x3f));
        ustr[3] = (unsigned char) (0x80 + (codepoint & 0x3f));
        ustr+=4;
    }


    return (const char*)ustr;
}


/** Count the codepoints in a NUL-terminated utf8 string.
 * The count equals the number of wtf8_decode calls needed to walk the string:
 * an invalid sequence counts as one codepoint, and so does an incomplete
 * sequence at the end of the string.
 */
static inline int wtf8_strlen(const char* str) {

    int count = 0;
    uint32_t state = UTF8_ACCEPT;
    uint32_t tmp = 0;

    const unsigned char* ustr = (const unsigned char*)str;
    while(*ustr != 0) {
        uint32_t res = wtf8_decode_state(&state, &tmp, *ustr);
        ustr++;
        if(res == UTF8_ACCEPT) { count++; }
        else if(res == UTF8_REJECT) {
            count++;
            // The automaton never leaves REJECT on its own; restart it so the
            // bytes after the bad sequence are decoded normally again.
            state = UTF8_ACCEPT;
        }
    }

    // A sequence cut off by the terminator is one (replacement) codepoint.
    if(state != UTF8_ACCEPT) { count++; }

    return count;
}

/** Count the codepoints in the first `bytes` bytes of a utf8 string, stopping
 * early at a NUL byte. Counting rules are those of wtf8_strlen; a sequence cut
 * off by the byte limit counts as one codepoint. A zero or negative `bytes`
 * gives 0.
 */
static inline int wtf8_strnlen(const char* str, int bytes) {

    int count = 0;
    uint32_t state = UTF8_ACCEPT;
    uint32_t tmp = 0;

    const unsigned char* ustr = (const unsigned char*)str;
    while(bytes-- > 0) {
        uint32_t res;
        if(*ustr == 0) break;
        res = wtf8_decode_state(&state, &tmp, *ustr);
        ustr++;
        if(res == UTF8_ACCEPT) { count++; }
        else if(res == UTF8_REJECT) {
            count++;
            state = UTF8_ACCEPT;   // see wtf8_strlen
        }
    }

    if(state != UTF8_ACCEPT) { count++; }

    return count;
}

/** Nonzero when byte has the bit pattern 10xxxxxx of a continuation byte. */
static inline int wtf8_is_continuation_byte(char byte) {
    return (byte & 0xc0) == 0x80;
}

/** Nonzero when byte has the bit pattern of a sequence start: 0xxxxxxx,
 * 110xxxxx, 1110xxxx or 11110xxx. This is a pattern check only: 0xc0, 0xc1 and
 * 0xf5-0xf7 match it although wtf8_decode rejects them.
 */
static inline int wtf8_is_initial_byte(char byte) {
    return (byte & 0x80) == 0
        || (byte & 0xe0) == 0xc0
        || (byte & 0xf0) == 0xe0
        || (byte & 0xf8) == 0xf0;
}

#ifdef _WIN32
#undef uint32_t
#undef uint8_t
#endif


#endif