diff options
Diffstat (limited to 'tesseract/src/ccutil/scanutils.cpp')
-rw-r--r-- | tesseract/src/ccutil/scanutils.cpp | 503 |
1 files changed, 503 insertions, 0 deletions
// Copyright 2006 Google Inc.
// All Rights Reserved.
// Author: renn
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifdef HAVE_CONFIG_H
#include "config_auto.h"
#endif

#include <cctype>
#include <climits>  // for CHAR_BIT
#include <cmath>
#include <cstdarg>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <limits>  // for std::numeric_limits

// Public declaration of tfscanf(). The __has_include guard only exists so
// that this translation unit can also be compiled on its own (for example by
// an isolated unit test) when the project include path is not available; in a
// regular build the header is always found and included.
#if defined(__has_include)
#  if __has_include("scanutils.h")
#    include "scanutils.h"
#  endif
#else
#  include "scanutils.h"
#endif

// Per-conversion flags collected while parsing a '%' directive.
enum Flags {
  FL_SPLAT = 0x01,  // Drop the value, do not assign
  FL_INV = 0x02,    // Character-set with inverse
  FL_WIDTH = 0x04,  // Field width specified
  FL_MINUS = 0x08,  // Negative number
};

// Integer "rank" selected by the length modifiers (h, l, ll, ...).
enum Ranks {
  RANK_CHAR = -2,
  RANK_SHORT = -1,
  RANK_INT = 0,
  RANK_LONG = 1,
  RANK_LONGLONG = 2,
  RANK_PTR = std::numeric_limits<int>::max()  // Special value used for pointers
};

const enum Ranks kMinRank = RANK_CHAR;
const enum Ranks kMaxRank = RANK_LONGLONG;

const enum Ranks kIntMaxRank = RANK_LONGLONG;
const enum Ranks kSizeTRank = RANK_LONG;
const enum Ranks kPtrDiffRank = RANK_LONG;

// Reason for leaving the main scanning loop early.
enum Bail {
  BAIL_NONE = 0,  // No error condition
  BAIL_EOF,       // Hit EOF
  BAIL_ERR        // Conversion mismatch
};

// Helper functions ------------------------------------------------------------

// Number of bits in an unsigned long (one word of the %[ match bitmap).
inline constexpr size_t LongBit() {
  return CHAR_BIT * sizeof(long);
}

// True if c is a 7-bit ASCII whitespace character. EOF and bytes >= 0x80 are
// never whitespace, which also keeps the isspace() argument inside its valid
// domain. (Stands in for the non-standard isascii(c) && isspace(c).)
static inline bool IsAsciiSpace(int c) {
  return c >= 0 && c <= 0x7f && isspace(c) != 0;
}

// Consumes ASCII whitespace from s. The first non-space character is pushed
// back and also returned; EOF is returned when the stream is exhausted.
static inline int SkipSpace(FILE *s) {
  int p;
  do {
    p = fgetc(s);
  } while (IsAsciiSpace(p));
  ungetc(p, s);  // Make sure next char is available for reading
  return p;
}

// Sets bit number `bit` in the bitmap.
static inline void SetBit(unsigned long *bitmap, unsigned int bit) {
  bitmap[bit / LongBit()] |= 1UL << (bit % LongBit());
}

// Returns 1 if bit number `bit` is set in the bitmap, otherwise 0.
static inline int TestBit(unsigned long *bitmap, unsigned int bit) {
  return static_cast<int>(bitmap[bit / LongBit()] >> (bit % LongBit())) & 1;
}

// Returns the numeric value of digit character ch in the given base, or -1 if
// ch is not a valid digit for that base. Letters are accepted in either case.
// The value is always checked against the base, so e.g. 'G' is rejected for
// base 16 and '8' is rejected for base 8.
static inline int DigitValue(int ch, int base) {
  int value;
  if (ch >= '0' && ch <= '9') {
    value = ch - '0';
  } else if (ch >= 'A' && ch <= 'Z') {
    value = ch - 'A' + 10;
  } else if (ch >= 'a' && ch <= 'z') {
    value = ch - 'a' + 10;
  } else {
    return -1;
  }
  return value < base ? value : -1;
}

// IO (re-)implementations -----------------------------------------------------

// Reads an optionally signed integer from s.
//   base 0  : auto-detect ("0x"/"0X" -> 16, leading "0" -> 8, otherwise 10)
//   base 16 : an optional "0x"/"0X" prefix is skipped
// Leading ASCII whitespace is skipped. The first character that is not part of
// the number is pushed back. A leading '-' negates the result modulo 2^N.
// When no digit is present the result is 0.
static uintmax_t streamtoumax(FILE *s, int base) {
  bool minus = false;
  uintmax_t v = 0;
  int d;
  int c;

  for (c = fgetc(s); IsAsciiSpace(c); c = fgetc(s)) {
  }

  // Single optional + or -
  if (c == '-' || c == '+') {
    minus = (c == '-');
    c = fgetc(s);
  }

  // Assign correct base
  if (base == 0) {
    if (c == '0') {
      c = fgetc(s);
      if (c == 'x' || c == 'X') {
        base = 16;
        c = fgetc(s);
      } else {
        base = 8;
      }
    } else {
      // No prefix: plain decimal. Leaving the base at 0 would make every
      // step compute v * 0 + d and keep only the last digit.
      base = 10;
    }
  } else if (base == 16) {
    if (c == '0') {
      c = fgetc(s);
      if (c == 'x' || c == 'X') {
        c = fgetc(s);
      }
    }
  }

  // Actual number parsing
  for (; (c != EOF) && (d = DigitValue(c, base)) >= 0; c = fgetc(s)) {
    v = v * base + d;
  }

  ungetc(c, s);
  return minus ? -v : v;
}

// Reads a decimal floating point number ([+-]digits[.digits][e[+-]digits])
// from s. Leading ASCII whitespace is skipped and the first character that is
// not part of the number is pushed back. When no digit is present the result
// is 0.
static double streamtofloat(FILE *s) {
  // Largest accumulator value for which "acc * 10 + 9" still fits in 64 bits.
  constexpr uint64_t kSafeLimit =
      (std::numeric_limits<uint64_t>::max() - 9) / 10;
  // Exponents of this size already saturate pow() to infinity or zero.
  constexpr int kMaxExponent = 100000;

  bool minus = false;
  int d;
  int c;

  for (c = fgetc(s); IsAsciiSpace(c); c = fgetc(s)) {
  }

  // Single optional + or -
  if (c == '-' || c == '+') {
    minus = (c == '-');
    c = fgetc(s);
  }

  // Integer part: exact 64-bit accumulation while it fits, then continue in
  // double precision instead of silently wrapping around.
  uint64_t int_part = 0;
  double big_int_part = 0.0;
  bool int_overflow = false;
  for (; c != EOF && (d = DigitValue(c, 10)) >= 0; c = fgetc(s)) {
    if (!int_overflow && int_part <= kSafeLimit) {
      int_part = int_part * 10 + d;
    } else {
      if (!int_overflow) {
        big_int_part = static_cast<double>(int_part);
        int_overflow = true;
      }
      big_int_part = big_int_part * 10.0 + d;
    }
  }
  double f = int_overflow ? big_int_part : static_cast<double>(int_part);

  // Fractional part: digits beyond what fits into 64 bits are far below the
  // precision of a double, so they are consumed but ignored.
  if (c == '.') {
    uint64_t frac = 0;
    uint64_t scale = 1;
    for (c = fgetc(s); c != EOF && (d = DigitValue(c, 10)) >= 0; c = fgetc(s)) {
      if (scale <= kSafeLimit) {
        frac = frac * 10 + d;
        scale *= 10;
      }
    }
    f += static_cast<double>(frac) / static_cast<double>(scale);
  }

  // Optional exponent
  if (c == 'e' || c == 'E') {
    c = fgetc(s);
    int expsign = 1;
    if (c == '-' || c == '+') {
      expsign = (c == '-') ? -1 : 1;
      c = fgetc(s);
    }
    int exponent = 0;
    for (; (c != EOF) && (d = DigitValue(c, 10)) >= 0; c = fgetc(s)) {
      if (exponent < kMaxExponent) {  // Clamp to avoid signed overflow
        exponent = exponent * 10 + d;
      }
    }
    exponent *= expsign;
    if (f != 0.0) {  // Avoid 0 * inf == NaN for inputs such as "0e400"
      f *= std::pow(10.0, static_cast<double>(exponent));
    }
  }
  ungetc(c, s);

  return minus ? -f : f;
}

static int tvfscanf(FILE *stream, const char *format, va_list ap);

// fscanf() replacement built on tvfscanf(). Returns the number of successful
// conversions, or -1 if the end of the input was hit before any conversion
// succeeded.
int tfscanf(FILE *stream, const char *format, ...) {
  va_list ap;
  va_start(ap, format);
  const int rv = tvfscanf(stream, format, ap);
  va_end(ap);
  return rv;
}

// Core of tfscanf(): interprets `format` as a small state machine and stores
// the converted values through the pointers in `ap`.
//
// Supported directives: %d %i %o %u %x %X %p %P %n, %f %e %E %g %G, %c, %s,
// %[set] and %%, with the optional '*' (suppress assignment), a decimal field
// width and the length modifiers h, l, j, z, t, L and q.
//
// Known differences from standard fscanf() that are kept on purpose:
//   * leading whitespace of the input is always skipped before the first
//     directive;
//   * %n is counted as a conversion;
//   * a literal character that fails to match is consumed, not pushed back;
//   * %s does not skip leading whitespace by itself;
//   * a numeric directive whose input contains no digit converts to 0.
//
// Returns the number of successful conversions, or -1 if the end of the input
// was hit before any conversion succeeded.
static int tvfscanf(FILE *stream, const char *format, va_list ap) {
  const char *p = format;
  char ch;
  int q = 0;
  uintmax_t val = 0;
  int rank = RANK_INT;  // Default rank
  unsigned int width = UINT_MAX;
  int base = 0;
  int flags = 0;
  enum {
    ST_NORMAL,       // Ground state
    ST_FLAGS,        // Special flags
    ST_WIDTH,        // Field width
    ST_MODIFIERS,    // Length or conversion modifiers
    ST_MATCH_INIT,   // Initial state of %[ sequence
    ST_MATCH,        // Main state of %[ sequence
    ST_MATCH_RANGE,  // After - in a %[ sequence
  } state = ST_NORMAL;
  char *sarg = nullptr;  // %s %c or %[ string argument
  enum Bail bail = BAIL_NONE;
  int converted = 0;  // Successful conversions
  unsigned long matchmap[((1 << CHAR_BIT) + (CHAR_BIT * sizeof(long) - 1)) /
                         (CHAR_BIT * sizeof(long))];
  int matchinv = 0;               // Is match map inverted?
  unsigned char range_start = 0;  // Last literal added to the %[ set
  auto start_off = std::ftell(stream);

  // Skip leading spaces
  SkipSpace(stream);

  while ((ch = *p++) && !bail) {
    switch (state) {
      case ST_NORMAL:
        if (ch == '%') {
          state = ST_FLAGS;
          flags = 0;
          rank = RANK_INT;
          width = UINT_MAX;
        } else if (IsAsciiSpace(static_cast<unsigned char>(ch))) {
          SkipSpace(stream);
        } else {
          // fgetc() yields the byte as an unsigned char value, so compare in
          // that domain; otherwise bytes >= 0x80 never match when plain char
          // is signed.
          if (fgetc(stream) != static_cast<unsigned char>(ch)) {
            bail = BAIL_ERR;  // Match failure
          }
        }
        break;

      case ST_FLAGS:
        if (ch == '*') {
          flags |= FL_SPLAT;
        } else if ('0' <= ch && ch <= '9') {
          width = (ch - '0');
          state = ST_WIDTH;
          flags |= FL_WIDTH;
        } else {
          state = ST_MODIFIERS;
          p--;  // Process this character again
        }
        break;

      case ST_WIDTH:
        if (ch >= '0' && ch <= '9') {
          width = width * 10 + (ch - '0');
        } else {
          state = ST_MODIFIERS;
          p--;  // Process this character again
        }
        break;

      case ST_MODIFIERS:
        switch (ch) {
          // Length modifiers - nonterminal sequences
          case 'h':
            rank--;  // Shorter rank
            break;
          case 'l':
            rank++;  // Longer rank
            break;
          case 'j':
            rank = kIntMaxRank;
            break;
          case 'z':
            rank = kSizeTRank;
            break;
          case 't':
            rank = kPtrDiffRank;
            break;
          case 'L':
          case 'q':
            rank = RANK_LONGLONG;  // long double/long long
            break;

          default:
            // Output modifiers - terminal sequences
            state = ST_NORMAL;    // Next state will be normal
            if (rank < kMinRank) {  // Canonicalize rank
              rank = kMinRank;
            } else if (rank > kMaxRank) {
              rank = kMaxRank;
            }

            switch (ch) {
              case 'P':  // Upper case pointer
              case 'p':  // Pointer
                rank = RANK_PTR;
                base = 0;
                goto scan_int;

              case 'i':  // Base-independent integer
                base = 0;
                goto scan_int;

              case 'd':  // Decimal integer
                base = 10;
                goto scan_int;

              case 'o':  // Octal integer
                base = 8;
                goto scan_int;

              case 'u':  // Unsigned decimal integer
                base = 10;
                goto scan_int;

              case 'x':  // Hexadecimal integer
              case 'X':
                base = 16;
                goto scan_int;

              case 'n':  // Number of characters consumed
                val = std::ftell(stream) - start_off;
                goto set_integer;

              scan_int:
                q = SkipSpace(stream);
                if (q <= 0) {
                  bail = BAIL_EOF;
                  break;
                }
                val = streamtoumax(stream, base);
                // fall through

              set_integer:
                if (!(flags & FL_SPLAT)) {
                  converted++;
                  switch (rank) {
                    case RANK_CHAR:
                      *va_arg(ap, unsigned char *) =
                          static_cast<unsigned char>(val);
                      break;
                    case RANK_SHORT:
                      *va_arg(ap, unsigned short *) =
                          static_cast<unsigned short>(val);
                      break;
                    case RANK_INT:
                      *va_arg(ap, unsigned int *) =
                          static_cast<unsigned int>(val);
                      break;
                    case RANK_LONG:
                      *va_arg(ap, unsigned long *) =
                          static_cast<unsigned long>(val);
                      break;
                    case RANK_LONGLONG:
                      *va_arg(ap, unsigned long long *) =
                          static_cast<unsigned long long>(val);
                      break;
                    case RANK_PTR:
                      *va_arg(ap, void **) = reinterpret_cast<void *>(
                          static_cast<uintptr_t>(val));
                      break;
                  }
                }
                break;

              case 'f':  // Preliminary float value parsing
              case 'g':
              case 'G':
              case 'e':
              case 'E': {
                q = SkipSpace(stream);
                if (q <= 0) {
                  bail = BAIL_EOF;
                  break;
                }
                const double fval = streamtofloat(stream);
                if (!(flags & FL_SPLAT)) {
                  if (rank == RANK_INT) {
                    *va_arg(ap, float *) = static_cast<float>(fval);
                  } else if (rank == RANK_LONG) {
                    *va_arg(ap, double *) = fval;
                  } else if (rank == RANK_LONGLONG) {
                    // %Lf: store into the long double the caller passed.
                    *va_arg(ap, long double *) =
                        static_cast<long double>(fval);
                  }
                  converted++;
                }
                break;
              }

              case 'c': {  // Character(s), not NUL-terminated
                const unsigned int wanted =
                    (flags & FL_WIDTH) ? width : 1;  // Default width == 1
                // Only fetch the destination when one was actually passed.
                sarg = (flags & FL_SPLAT) ? nullptr : va_arg(ap, char *);
                bool complete = true;
                for (unsigned int i = 0; i < wanted; ++i) {
                  if ((q = fgetc(stream)) <= 0) {
                    bail = BAIL_EOF;
                    complete = false;
                    break;
                  }
                  if (sarg != nullptr) {
                    *sarg++ = static_cast<char>(q);
                  }
                }
                // One directive is one conversion, however wide it is.
                if (complete && wanted > 0 && !(flags & FL_SPLAT)) {
                  converted++;
                }
                break;
              }

              case 's': {  // String
                if (!(flags & FL_SPLAT)) {
                  sarg = va_arg(ap, char *);
                }
                unsigned length = 0;
                while (width--) {
                  q = fgetc(stream);
                  if (IsAsciiSpace(q) || (q <= 0)) {
                    ungetc(q, stream);
                    break;
                  }
                  if (!(flags & FL_SPLAT)) {
                    sarg[length] = static_cast<char>(q);
                  }
                  length++;
                }
                if (length == 0) {
                  bail = BAIL_EOF;
                } else if (!(flags & FL_SPLAT)) {
                  sarg[length] = '\0';  // Terminate output
                  converted++;
                }
                break;
              }

              case '[':  // Character range
                // Only fetch the destination when one was actually passed.
                sarg = (flags & FL_SPLAT) ? nullptr : va_arg(ap, char *);
                state = ST_MATCH_INIT;
                matchinv = 0;
                memset(matchmap, 0, sizeof matchmap);
                break;

              case '%':  // %% sequence
                if (fgetc(stream) != '%') {
                  bail = BAIL_ERR;
                }
                break;

              default:            // Anything else
                bail = BAIL_ERR;  // Unknown sequence
                break;
            }
        }
        break;

      case ST_MATCH_INIT:  // Initial state for %[ match
        if (ch == '^' && !(flags & FL_INV)) {
          matchinv = 1;
          flags |= FL_INV;  // Only the first '^' inverts; a second is literal
        } else {
          // The first set member is always literal (this allows "%[]...]").
          range_start = static_cast<unsigned char>(ch);
          SetBit(matchmap, range_start);
          state = ST_MATCH;
        }
        break;

      case ST_MATCH:  // Main state for %[ match
        if (ch == ']') {
          goto match_run;
        } else if (ch == '-') {
          // range_start already holds the character in front of the '-'.
          state = ST_MATCH_RANGE;
        } else {
          range_start = static_cast<unsigned char>(ch);
          SetBit(matchmap, range_start);
        }
        break;

      case ST_MATCH_RANGE:  // %[ match after -
        if (ch == ']') {
          // '-' was the last character of the set: it is a literal.
          SetBit(matchmap, static_cast<unsigned char>('-'));
          goto match_run;
        } else {
          const unsigned char range_end = static_cast<unsigned char>(ch);
          if (range_start <= range_end) {
            // Inclusive range, e.g. "a-z" contains both 'a' and 'z'.
            for (unsigned int i = range_start; i <= range_end; ++i) {
              SetBit(matchmap, i);
            }
          } else {
            // Reversed bounds: treat '-' and the end character as literals.
            SetBit(matchmap, static_cast<unsigned char>('-'));
            SetBit(matchmap, range_end);
          }
          range_start = range_end;
          state = ST_MATCH;
        }
        break;

      match_run: {  // Match expression finished
        unsigned int matched = 0;
        while (width) {
          q = fgetc(stream);
          if (q <= 0 ||
              !(TestBit(matchmap, static_cast<unsigned char>(q)) ^ matchinv)) {
            ungetc(q, stream);
            break;
          }
          if (!(flags & FL_SPLAT)) {
            *sarg++ = static_cast<char>(q);
          }
          matched++;
          width--;  // Honour the field width
        }
        if (matched == 0) {
          bail = (q <= 0) ? BAIL_EOF : BAIL_ERR;
        } else if (!(flags & FL_SPLAT)) {
          *sarg = '\0';
          converted++;
        }
        // The directive is complete: the rest of the format is ordinary again.
        state = ST_NORMAL;
        break;
      }
    }
  }

  if (bail == BAIL_EOF && !converted) {
    converted = -1;  // Return EOF (-1)
  }

  return converted;
}