diff options
Diffstat (limited to 'tesseract/unittest/unicharcompress_test.cc')
-rw-r--r-- | tesseract/unittest/unicharcompress_test.cc | 257 |
1 files changed, 257 insertions, 0 deletions
diff --git a/tesseract/unittest/unicharcompress_test.cc b/tesseract/unittest/unicharcompress_test.cc new file mode 100644 index 00000000..1777930e --- /dev/null +++ b/tesseract/unittest/unicharcompress_test.cc @@ -0,0 +1,257 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <string> + +#include "absl/strings/ascii.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_split.h" +#include "allheaders.h" + +#include "include_gunit.h" +#include "log.h" // for LOG +#include "serialis.h" +#include "tprintf.h" +#include "unicharcompress.h" + +namespace tesseract { + +class UnicharcompressTest : public ::testing::Test { + protected: + void SetUp() { + std::locale::global(std::locale("")); + file::MakeTmpdir(); + } + + // Loads and compresses the given unicharset. + void LoadUnicharset(const std::string& unicharset_name) { + std::string radical_stroke_file = + file::JoinPath(LANGDATA_DIR, "radical-stroke.txt"); + std::string unicharset_file = + file::JoinPath(TESTDATA_DIR, unicharset_name); + std::string radical_data; + CHECK_OK(file::GetContents(radical_stroke_file, &radical_data, + file::Defaults())); + CHECK(unicharset_.load_from_file(unicharset_file.c_str())); + STRING radical_str(radical_data.c_str()); + null_char_ = + unicharset_.has_special_codes() ? UNICHAR_BROKEN : unicharset_.size(); + compressed_.ComputeEncoding(unicharset_, null_char_, &radical_str); + // Get the encoding of the null char. + RecodedCharID code; + compressed_.EncodeUnichar(null_char_, &code); + encoded_null_char_ = code(0); + std::string output_name = file::JoinPath( + FLAGS_test_tmpdir, absl::StrCat(unicharset_name, ".encoding.txt")); + STRING encoding = compressed_.GetEncodingAsString(unicharset_); + std::string encoding_str(&encoding[0], encoding.size()); + CHECK_OK(file::SetContents(output_name, encoding_str, file::Defaults())); + LOG(INFO) << "Wrote encoding to:" << output_name; + } + // Serializes and de-serializes compressed_ over itself. + void SerializeAndUndo() { + std::vector<char> data; + TFile wfp; + wfp.OpenWrite(&data); + EXPECT_TRUE(compressed_.Serialize(&wfp)); + TFile rfp; + rfp.Open(&data[0], data.size()); + EXPECT_TRUE(compressed_.DeSerialize(&rfp)); + } + // Returns true if the lang is in CJK. + bool IsCJKLang(const std::string& lang) { + return lang == "chi_sim" || lang == "chi_tra" || lang == "kor" || + lang == "jpn"; + } + // Returns true if the lang is Indic. + bool IsIndicLang(const std::string& lang) { + return lang == "asm" || lang == "ben" || lang == "bih" || lang == "hin" || + lang == "mar" || lang == "nep" || lang == "san" || lang == "bod" || + lang == "dzo" || lang == "guj" || lang == "kan" || lang == "mal" || + lang == "ori" || lang == "pan" || lang == "sin" || lang == "tam" || + lang == "tel"; + } + + // Expects the appropriate results from the compressed_ unicharset_. + void ExpectCorrect(const std::string& lang) { + // Count the number of times each code is used in each element of + // RecodedCharID. + RecodedCharID zeros; + for (int i = 0; i < RecodedCharID::kMaxCodeLen; ++i) zeros.Set(i, 0); + int code_range = compressed_.code_range(); + std::vector<RecodedCharID> times_seen(code_range, zeros); + for (int u = 0; u <= unicharset_.size(); ++u) { + if (u != UNICHAR_SPACE && u != null_char_ && + (u == unicharset_.size() || (unicharset_.has_special_codes() && + u < SPECIAL_UNICHAR_CODES_COUNT))) { + continue; // Not used so not encoded. + } + RecodedCharID code; + int len = compressed_.EncodeUnichar(u, &code); + // Check round-trip encoding. + int unichar_id; + GenericVector<UNICHAR_ID> normed_ids; + if (u == null_char_ || u == unicharset_.size()) { + unichar_id = null_char_; + } else { + unichar_id = u; + } + EXPECT_EQ(unichar_id, compressed_.DecodeUnichar(code)); + // Check that the codes are valid. + for (int i = 0; i < len; ++i) { + int code_val = code(i); + EXPECT_GE(code_val, 0); + EXPECT_LT(code_val, code_range); + times_seen[code_val].Set(i, times_seen[code_val](i) + 1); + } + } + // Check that each code is used in at least one position. + for (int c = 0; c < code_range; ++c) { + int num_used = 0; + for (int i = 0; i < RecodedCharID::kMaxCodeLen; ++i) { + if (times_seen[c](i) != 0) ++num_used; + } + EXPECT_GE(num_used, 1) << "c=" << c << "/" << code_range; + } + // Check that GetNextCodes/GetFinalCodes lists match the times_seen, + // and create valid codes. + RecodedCharID code; + CheckCodeExtensions(code, times_seen); + // Finally, we achieved all that using a codebook < 10% of the size of + // the original unicharset, for CK or Indic, and 20% with J, but just + // no bigger for all others. + if (IsCJKLang(lang) || IsIndicLang(lang)) { + EXPECT_LT(code_range, unicharset_.size() / (lang == "jpn" ? 5 : 10)); + } else { + EXPECT_LE(code_range, unicharset_.size() + 1); + } + LOG(INFO) << "Compressed unicharset of " << unicharset_.size() << " to " + << code_range; + } + // Checks for extensions of the current code that either finish a code, or + // extend it and checks those extensions recursively. + void CheckCodeExtensions(const RecodedCharID& code, + const std::vector<RecodedCharID>& times_seen) { + RecodedCharID extended = code; + int length = code.length(); + const GenericVector<int>* final_codes = compressed_.GetFinalCodes(code); + if (final_codes != nullptr) { + for (int i = 0; i < final_codes->size(); ++i) { + int ending = (*final_codes)[i]; + EXPECT_GT(times_seen[ending](length), 0); + extended.Set(length, ending); + int unichar_id = compressed_.DecodeUnichar(extended); + EXPECT_NE(INVALID_UNICHAR_ID, unichar_id); + } + } + const GenericVector<int>* next_codes = compressed_.GetNextCodes(code); + if (next_codes != nullptr) { + for (int i = 0; i < next_codes->size(); ++i) { + int extension = (*next_codes)[i]; + EXPECT_GT(times_seen[extension](length), 0); + extended.Set(length, extension); + CheckCodeExtensions(extended, times_seen); + } + } + } + + UnicharCompress compressed_; + UNICHARSET unicharset_; + int null_char_; + // The encoding of the null_char_. + int encoded_null_char_; +}; + +TEST_F(UnicharcompressTest, DoesChinese) { + LOG(INFO) << "Testing chi_tra"; + LoadUnicharset("chi_tra.unicharset"); + ExpectCorrect("chi_tra"); + LOG(INFO) << "Testing chi_sim"; + LoadUnicharset("chi_sim.unicharset"); + ExpectCorrect("chi_sim"); +} + +TEST_F(UnicharcompressTest, DoesJapanese) { + LOG(INFO) << "Testing jpn"; + LoadUnicharset("jpn.unicharset"); + ExpectCorrect("jpn"); +} + +TEST_F(UnicharcompressTest, DoesKorean) { + LOG(INFO) << "Testing kor"; + LoadUnicharset("kor.unicharset"); + ExpectCorrect("kor"); +} + +TEST_F(UnicharcompressTest, DoesKannada) { + LOG(INFO) << "Testing kan"; + LoadUnicharset("kan.unicharset"); + ExpectCorrect("kan"); + SerializeAndUndo(); + ExpectCorrect("kan"); +} + +TEST_F(UnicharcompressTest, DoesMarathi) { + LOG(INFO) << "Testing mar"; + LoadUnicharset("mar.unicharset"); + ExpectCorrect("mar"); +} + +TEST_F(UnicharcompressTest, DoesEnglish) { + LOG(INFO) << "Testing eng"; + LoadUnicharset("eng.unicharset"); + ExpectCorrect("eng"); +} + +// Tests that a unicharset that contains double-letter ligatures (eg ff) has +// no null char in the encoding at all. +TEST_F(UnicharcompressTest, DoesLigaturesWithDoubles) { + LOG(INFO) << "Testing por with ligatures"; + LoadUnicharset("por.unicharset"); + ExpectCorrect("por"); + // Check that any unichar-id that is encoded with multiple codes has the + // correct encoded_nulll_char_ in between. + for (int u = 0; u <= unicharset_.size(); ++u) { + RecodedCharID code; + int len = compressed_.EncodeUnichar(u, &code); + if (len > 1) { + // The should not be any null char in the code. + for (int i = 0; i < len; ++i) { + EXPECT_NE(encoded_null_char_, code(i)); + } + } + } +} + +// Tests that GetEncodingAsString returns the right result for a trivial +// unicharset. +TEST_F(UnicharcompressTest, GetEncodingAsString) { + LoadUnicharset("trivial.unicharset"); + ExpectCorrect("trivial"); + STRING encoding = compressed_.GetEncodingAsString(unicharset_); + std::string encoding_str(&encoding[0], encoding.length()); + std::vector<std::string> lines = + absl::StrSplit(encoding_str, "\n", absl::SkipEmpty()); + EXPECT_EQ(5, lines.size()); + // The first line is always space. + EXPECT_EQ("0\t ", lines[0]); + // Next we have i. + EXPECT_EQ("1\ti", lines[1]); + // Next we have f. + EXPECT_EQ("2\tf", lines[2]); + // Next we have the fi ligature: fi. There are no nulls in it, as there are no + // repeated letter ligatures in this unicharset, unlike por.unicharset above. + EXPECT_EQ("2,1\tfi", lines[3]); + // Finally the null character. + EXPECT_EQ("3\t<nul>", lines[4]); +} + +} // namespace tesseract |