Diffstat (limited to 'tesseract/unittest/unicharcompress_test.cc')
-rw-r--r--  tesseract/unittest/unicharcompress_test.cc | 257
1 file changed, 257 insertions(+), 0 deletions(-)
diff --git a/tesseract/unittest/unicharcompress_test.cc b/tesseract/unittest/unicharcompress_test.cc
new file mode 100644
index 00000000..1777930e
--- /dev/null
+++ b/tesseract/unittest/unicharcompress_test.cc
@@ -0,0 +1,257 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string>
+
+#include "absl/strings/ascii.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_split.h"
+#include "allheaders.h"
+
+#include "include_gunit.h"
+#include "log.h" // for LOG
+#include "serialis.h"
+#include "tprintf.h"
+#include "unicharcompress.h"
+
+namespace tesseract {
+
+class UnicharcompressTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+ std::locale::global(std::locale(""));
+ file::MakeTmpdir();
+ }
+
+ // Loads and compresses the given unicharset.
+ void LoadUnicharset(const std::string& unicharset_name) {
+ std::string radical_stroke_file =
+ file::JoinPath(LANGDATA_DIR, "radical-stroke.txt");
+ std::string unicharset_file =
+ file::JoinPath(TESTDATA_DIR, unicharset_name);
+ std::string radical_data;
+ CHECK_OK(file::GetContents(radical_stroke_file, &radical_data,
+ file::Defaults()));
+ CHECK(unicharset_.load_from_file(unicharset_file.c_str()));
+ STRING radical_str(radical_data.c_str());
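+    // The null char is UNICHAR_BROKEN when the set has the special codes,
+    // otherwise an id one past the end of the unicharset.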
+ null_char_ =
+ unicharset_.has_special_codes() ? UNICHAR_BROKEN : unicharset_.size();
+ compressed_.ComputeEncoding(unicharset_, null_char_, &radical_str);
+ // Get the encoding of the null char.
+ RecodedCharID code;
+ compressed_.EncodeUnichar(null_char_, &code);
+ encoded_null_char_ = code(0);
+ std::string output_name = file::JoinPath(
+ FLAGS_test_tmpdir, absl::StrCat(unicharset_name, ".encoding.txt"));
+ STRING encoding = compressed_.GetEncodingAsString(unicharset_);
+ std::string encoding_str(&encoding[0], encoding.size());
+ CHECK_OK(file::SetContents(output_name, encoding_str, file::Defaults()));
+    LOG(INFO) << "Wrote encoding to: " << output_name;
+ }
+ // Serializes and de-serializes compressed_ over itself.
+ void SerializeAndUndo() {
+ std::vector<char> data;
+ TFile wfp;
+ wfp.OpenWrite(&data);
+ EXPECT_TRUE(compressed_.Serialize(&wfp));
+ TFile rfp;
+ rfp.Open(&data[0], data.size());
+ EXPECT_TRUE(compressed_.DeSerialize(&rfp));
+ }
+  // Returns true if the lang is CJK.
+ bool IsCJKLang(const std::string& lang) {
+ return lang == "chi_sim" || lang == "chi_tra" || lang == "kor" ||
+ lang == "jpn";
+ }
+ // Returns true if the lang is Indic.
+ bool IsIndicLang(const std::string& lang) {
+ return lang == "asm" || lang == "ben" || lang == "bih" || lang == "hin" ||
+ lang == "mar" || lang == "nep" || lang == "san" || lang == "bod" ||
+ lang == "dzo" || lang == "guj" || lang == "kan" || lang == "mal" ||
+ lang == "ori" || lang == "pan" || lang == "sin" || lang == "tam" ||
+ lang == "tel";
+ }
+
+  // Checks that compressed_ gives the expected results for unicharset_.
+ void ExpectCorrect(const std::string& lang) {
+ // Count the number of times each code is used in each element of
+ // RecodedCharID.
+ RecodedCharID zeros;
+ for (int i = 0; i < RecodedCharID::kMaxCodeLen; ++i) zeros.Set(i, 0);
+ int code_range = compressed_.code_range();
+ std::vector<RecodedCharID> times_seen(code_range, zeros);
+ for (int u = 0; u <= unicharset_.size(); ++u) {
+ if (u != UNICHAR_SPACE && u != null_char_ &&
+ (u == unicharset_.size() || (unicharset_.has_special_codes() &&
+ u < SPECIAL_UNICHAR_CODES_COUNT))) {
+ continue; // Not used so not encoded.
+ }
+ RecodedCharID code;
+ int len = compressed_.EncodeUnichar(u, &code);
+ // Check round-trip encoding.
+ int unichar_id;
+ if (u == null_char_ || u == unicharset_.size()) {
+ unichar_id = null_char_;
+ } else {
+ unichar_id = u;
+ }
+ EXPECT_EQ(unichar_id, compressed_.DecodeUnichar(code));
+ // Check that the codes are valid.
+ for (int i = 0; i < len; ++i) {
+ int code_val = code(i);
+ EXPECT_GE(code_val, 0);
+ EXPECT_LT(code_val, code_range);
+ times_seen[code_val].Set(i, times_seen[code_val](i) + 1);
+ }
+ }
+ // Check that each code is used in at least one position.
+ for (int c = 0; c < code_range; ++c) {
+ int num_used = 0;
+ for (int i = 0; i < RecodedCharID::kMaxCodeLen; ++i) {
+ if (times_seen[c](i) != 0) ++num_used;
+ }
+ EXPECT_GE(num_used, 1) << "c=" << c << "/" << code_range;
+ }
+ // Check that GetNextCodes/GetFinalCodes lists match the times_seen,
+ // and create valid codes.
+ RecodedCharID code;
+ CheckCodeExtensions(code, times_seen);
+    // Finally, we achieved all that using a codebook < 10% of the size of
+    // the original unicharset for Chinese, Korean, and Indic, < 20% for
+    // Japanese, and no bigger than the original for all other languages.
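+    // (Per the scheme described in unicharcompress.h, Hangul decomposes
+    // into jamo codes and Han characters into radical-stroke-index codes,
+    // via the radical-stroke.txt data, so a small code book covers a very
+    // large unicharset; alphabetic scripts keep roughly one code each.)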
+ if (IsCJKLang(lang) || IsIndicLang(lang)) {
+ EXPECT_LT(code_range, unicharset_.size() / (lang == "jpn" ? 5 : 10));
+ } else {
+ EXPECT_LE(code_range, unicharset_.size() + 1);
+ }
+ LOG(INFO) << "Compressed unicharset of " << unicharset_.size() << " to "
+ << code_range;
+ }
+  // Checks the extensions of the given code: final codes must complete a
+  // code that decodes to a valid unichar; next codes are checked recursively.
+ void CheckCodeExtensions(const RecodedCharID& code,
+ const std::vector<RecodedCharID>& times_seen) {
+ RecodedCharID extended = code;
+ int length = code.length();
+ const GenericVector<int>* final_codes = compressed_.GetFinalCodes(code);
+ if (final_codes != nullptr) {
+ for (int i = 0; i < final_codes->size(); ++i) {
+ int ending = (*final_codes)[i];
+ EXPECT_GT(times_seen[ending](length), 0);
+ extended.Set(length, ending);
+ int unichar_id = compressed_.DecodeUnichar(extended);
+ EXPECT_NE(INVALID_UNICHAR_ID, unichar_id);
+ }
+ }
+ const GenericVector<int>* next_codes = compressed_.GetNextCodes(code);
+ if (next_codes != nullptr) {
+ for (int i = 0; i < next_codes->size(); ++i) {
+ int extension = (*next_codes)[i];
+ EXPECT_GT(times_seen[extension](length), 0);
+ extended.Set(length, extension);
+ CheckCodeExtensions(extended, times_seen);
+ }
+ }
+ }
+
+ UnicharCompress compressed_;
+ UNICHARSET unicharset_;
+ int null_char_;
+ // The encoding of the null_char_.
+ int encoded_null_char_;
+};
+
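+// A minimal sketch of the round trip these tests exercise through the
+// fixture's compressed_ member (illustrative; actual unichar ids and code
+// values depend on which unicharset is loaded):
+//
+//   RecodedCharID code;
+//   int len = compressed_.EncodeUnichar(unichar_id, &code);
+//   // Each code(i) for 0 <= i < len lies in [0, compressed_.code_range()).
+//   EXPECT_EQ(unichar_id, compressed_.DecodeUnichar(code));
+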
+TEST_F(UnicharcompressTest, DoesChinese) {
+ LOG(INFO) << "Testing chi_tra";
+ LoadUnicharset("chi_tra.unicharset");
+ ExpectCorrect("chi_tra");
+ LOG(INFO) << "Testing chi_sim";
+ LoadUnicharset("chi_sim.unicharset");
+ ExpectCorrect("chi_sim");
+}
+
+TEST_F(UnicharcompressTest, DoesJapanese) {
+ LOG(INFO) << "Testing jpn";
+ LoadUnicharset("jpn.unicharset");
+ ExpectCorrect("jpn");
+}
+
+TEST_F(UnicharcompressTest, DoesKorean) {
+ LOG(INFO) << "Testing kor";
+ LoadUnicharset("kor.unicharset");
+ ExpectCorrect("kor");
+}
+
+TEST_F(UnicharcompressTest, DoesKannada) {
+ LOG(INFO) << "Testing kan";
+ LoadUnicharset("kan.unicharset");
+ ExpectCorrect("kan");
+ SerializeAndUndo();
+ ExpectCorrect("kan");
+}
+
+TEST_F(UnicharcompressTest, DoesMarathi) {
+ LOG(INFO) << "Testing mar";
+ LoadUnicharset("mar.unicharset");
+ ExpectCorrect("mar");
+}
+
+TEST_F(UnicharcompressTest, DoesEnglish) {
+ LOG(INFO) << "Testing eng";
+ LoadUnicharset("eng.unicharset");
+ ExpectCorrect("eng");
+}
+
+// Tests that a unicharset that contains double-letter ligatures (e.g. ff) has
+// no null char in the encoding at all.
+TEST_F(UnicharcompressTest, DoesLigaturesWithDoubles) {
+ LOG(INFO) << "Testing por with ligatures";
+ LoadUnicharset("por.unicharset");
+ ExpectCorrect("por");
+  // Check that no unichar-id that is encoded with multiple codes contains
+  // the encoded_null_char_ anywhere in its code sequence.
+ for (int u = 0; u <= unicharset_.size(); ++u) {
+ RecodedCharID code;
+ int len = compressed_.EncodeUnichar(u, &code);
+ if (len > 1) {
+      // There should not be any null char in the code.
+ for (int i = 0; i < len; ++i) {
+ EXPECT_NE(encoded_null_char_, code(i));
+ }
+ }
+ }
+}
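+// (Why the no-null invariant above matters: the decoder relies on <nul> to
+// separate repeated codes, e.g. two consecutive single f characters, so a
+// <nul> inside a single unichar's code sequence would make that separator
+// ambiguous.)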
+
+// Tests that GetEncodingAsString returns the right result for a trivial
+// unicharset.
+TEST_F(UnicharcompressTest, GetEncodingAsString) {
+ LoadUnicharset("trivial.unicharset");
+ ExpectCorrect("trivial");
+ STRING encoding = compressed_.GetEncodingAsString(unicharset_);
+ std::string encoding_str(&encoding[0], encoding.length());
+ std::vector<std::string> lines =
+ absl::StrSplit(encoding_str, "\n", absl::SkipEmpty());
+ EXPECT_EQ(5, lines.size());
+ // The first line is always space.
+ EXPECT_EQ("0\t ", lines[0]);
+ // Next we have i.
+ EXPECT_EQ("1\ti", lines[1]);
+ // Next we have f.
+ EXPECT_EQ("2\tf", lines[2]);
+ // Next we have the fi ligature: fi. There are no nulls in it, as there are no
+ // repeated letter ligatures in this unicharset, unlike por.unicharset above.
+ EXPECT_EQ("2,1\tfi", lines[3]);
+ // Finally the null character.
+ EXPECT_EQ("3\t<nul>", lines[4]);
+}
+
+} // namespace tesseract