summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'devices/vector/gdevpdte.c')
-rw-r--r--devices/vector/gdevpdte.c272
1 files changed, 252 insertions, 20 deletions
diff --git a/devices/vector/gdevpdte.c b/devices/vector/gdevpdte.c
index 6f0eb158..7a6b145e 100644
--- a/devices/vector/gdevpdte.c
+++ b/devices/vector/gdevpdte.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2001-2020 Artifex Software, Inc.
+/* Copyright (C) 2001-2021 Artifex Software, Inc.
All Rights Reserved.
This software is provided AS-IS with no warranty, either express or
@@ -43,6 +43,7 @@
#include "gxcpath.h"
#include "gsfcmap.h"
+#include "tessocr.h"
static int pdf_char_widths(gx_device_pdf *const pdev,
pdf_font_resource_t *pdfont, int ch,
@@ -80,6 +81,216 @@ pdf_process_string_aux(pdf_text_enum_t *penum, gs_string *pstr,
return pdf_process_string(penum, pstr, pfmat, ppts, gdata);
}
+static int OCRText(gx_device_pdf *pdev, gs_glyph glyph, gs_char ch, gs_char *length, byte **unicode)
+{
+#if OCR_VERSION > 0
+ int code = 0;
+
+ if(pdev->OCRStage == OCR_Rendered) {
+ int llx, lly, urx, ury, char_count = 0, returned_count = 0, *returned;
+ ocr_glyph_t *next_glyph = pdev->ocr_glyphs;
+ int rows, stride, row, column;
+ byte *bitmap = NULL, *src, *dest, *rowptr, srcmask, destmask;
+ void *state;
+ const char *language = pdev->ocr_language;
+ gp_file *DbgFile;
+
+ if(language == NULL || language[0] == 0)
+ language = "eng";
+
+ /* We should already have rendered a bitmap for all the glyphs in the
+ * text operation, so this should be redundant, but best to be safe.
+ */
+ if(next_glyph == NULL)
+ return_error(gs_error_unknownerror);
+
+ /* Identify the bounding box of the rendered glyphs by examining the bounds and position
+ * of each glyph. At the same time count the number of expected returned characters
+ * (glyphs flagged is_space are not counted). We treat any empty bitmap (all 0x00 bytes) as a
+ * space because the OCR engine can't differentiate between a space character and no character at all.
+ */
+ llx = next_glyph->x;
+ lly = next_glyph->y;
+ urx = llx + next_glyph->width;
+ ury = lly + next_glyph->height;
+ if(next_glyph != NULL && !next_glyph->is_space)
+ char_count++;
+ next_glyph = (ocr_glyph_t *)next_glyph->next;
+ while(next_glyph) {
+ if(!next_glyph->is_space)
+ char_count++;
+ if(next_glyph->x < llx)
+ llx = next_glyph->x;
+ if(next_glyph->y < lly)
+ lly = next_glyph->y;
+ if(next_glyph->x + next_glyph->width > urx)
+ urx = next_glyph->x + next_glyph->width;
+ if(next_glyph->y + next_glyph->height > ury)
+ ury = next_glyph->y + next_glyph->height;
+ next_glyph = next_glyph->next;
+ }
+
+ /* Allocate and zero-initialise the 'strip' bitmap which will receive all the individual
+ * glyph bitmaps. The stride is the box width rounded up to whole bytes, plus one spare byte.
+ */
+ rows = ury - lly;
+ stride = (((urx - llx) + 7) / 8) + 1;
+ bitmap = gs_alloc_bytes(pdev->memory, rows * stride, "working OCR memory");
+ if(bitmap == NULL)
+ return_error(gs_error_VMerror);
+ memset(bitmap, 0x00, rows * stride);
+
+ /* Allocate a buffer for the OCR engine to return the Unicode code points. This needs work,
+ * we might want more information returned (bounding boxes and confidence levels) and we
+ * need to think about the possibility that the OCR engine finds more characters than we
+ * expected (eg fi ligatures returned as 'f' and 'i').
+ */
+ returned = (int *)gs_alloc_bytes(pdev->memory, char_count * sizeof(int), "returned unicodes");
+ if(returned == NULL) {
+ gs_free_object(pdev->memory, bitmap, "working OCR memory");
+ return_error(gs_error_VMerror);
+ }
+ memset(returned, 0x00, char_count * sizeof(int));
+
+ /* Now copy each glyph bitmap to the correct position in the strip. This is complicated
+ * by the fact that bitmaps are monochrome, packed into bytes (most significant bit first),
+ * and so the destination may not be aligned on a byte boundary.
+ */
+ next_glyph = (ocr_glyph_t *)pdev->ocr_glyphs;
+ while(next_glyph) {
+ rowptr = bitmap + ((next_glyph->y - lly) * stride) + (int)floor((next_glyph->x - llx) / 8);
+ for(row = 0;row < next_glyph->height;row++) {
+ dest = rowptr + row * stride;
+ src = next_glyph->data + (row * next_glyph->raster);
+ destmask = 0x80 >> (next_glyph->x - llx) % 8;
+ srcmask = 0x80;
+ for(column = 0; column < next_glyph->width;column++) {
+ if(*src & srcmask) {
+ *dest = *dest | destmask;
+ }
+ srcmask = srcmask >> 1;
+ if(srcmask == 0) {
+ srcmask = 0x80;
+ src++;
+ }
+ destmask = destmask >> 1;
+ if(destmask == 0) {
+ destmask = 0x80;
+ dest++;
+ }
+ }
+ }
+ next_glyph = next_glyph->next;
+ }
+
+#if 0
+ DbgFile = gp_fopen(pdev->memory, "d:/temp/bits.txt", "wb+");
+ for(row = 0;row < rows;row++) {
+ for(column = 0;column < stride;column++) {
+ dest = bitmap + (row * stride);
+ gp_fprintf(DbgFile, "%02x", dest[column]);
+ }
+ gp_fprintf(DbgFile, "\n");
+ }
+ gp_fclose(DbgFile);
+#endif
+ /* Initialise the OCR engine; on failure free both buffers and return 0 (no error is reported, OCRStage is unchanged) */
+ code = ocr_init_api(pdev->memory->non_gc_memory, language,
+ pdev->ocr_engine, &state);
+ if(code < 0) {
+ gs_free_object(pdev->memory, bitmap, "working OCR memory");
+ gs_free_object(pdev->memory, returned, "returned unicodes");
+ return 0;
+ }
+ returned_count = char_count;
+
+ /* Pass our strip to the OCR engine; returned_count has been preset to char_count above */
+ code = ocr_bitmap_to_unicodes(state,
+ bitmap, 0, stride * 8, rows, stride,
+ (int)pdev->HWResolution[0],
+ (int)pdev->HWResolution[1],
+ returned, &returned_count);
+
+ /* and close the engine back down again; the strip bitmap is no longer needed either */
+ ocr_fin_api(pdev->memory->non_gc_memory, state);
+ gs_free_object(pdev->memory, bitmap, "working OCR memory");
+
+ if(code < 0) {
+ pdev->OCRStage = OCR_Failed;
+ gs_free_object(pdev->memory, returned, "returned unicodes");
+ return code;
+ }
+
+ /* Future enhancement: fall back to trying the individual bitmaps here. For now a count mismatch marks OCR as failed and returns 0. */
+ if(returned_count != char_count) {
+ pdev->OCRStage = OCR_Failed;
+ gs_free_object(pdev->memory, returned, "returned unicodes");
+ return 0;
+ }
+ pdev->OCRUnicode = returned;
+
+ /* OCR has been performed on the stored bitmaps; record that the Unicode values are available */
+ pdev->OCRStage = OCR_UnicodeAvailable;
+ }
+
+ if(pdev->OCRStage == OCR_UnicodeAvailable) {
+ /* We've OCR'ed the bitmaps already, find the unicode value for this character/glyph */
+ ocr_glyph_t *new_glyph = (ocr_glyph_t *)pdev->ocr_glyphs;
+ int ocr_index = 0;
+ uint mask = 0xFF;
+ int ix;
+ char *u;
+
+ /* Find the first bitmap whose char_code or glyph matches the character/glyph we are processing */
+ while(new_glyph) {
+ if(new_glyph->char_code == ch || new_glyph->glyph == glyph) {
+ ocr_glyph_t *g1 = pdev->ocr_glyphs;
+
+ /* Spaces are handled specially, so just jump out now */
+ if(new_glyph->is_space)
+ break;
+
+ /* Otherwise, count all the non-space bitmaps which lie to the left of
+ * the one we found; that count is the index into the returned Unicode
+ * values (we are assuming for now that they are ordered left to right).
+ */
+ while(g1) {
+ if(!g1->is_space) {
+ if(g1->x < new_glyph->x)
+ ocr_index++;
+ }
+ g1 = g1->next;
+ }
+ break;
+ }
+ new_glyph = new_glyph->next;
+ }
+
+ /* If we found a matching bitmap, return its code point from the values stored by the OCR
+ * engine as 4 bytes, most significant first (a space is 00 00 00 20), and set *length to 4.
+ */
+ if(new_glyph) {
+ *unicode = (byte *)gs_alloc_bytes(pdev->memory, 2 * sizeof(ushort), "temporary Unicode array");
+ if(*unicode == NULL)
+ return_error(gs_error_VMerror);
+ u = (char *)(*unicode);
+ if(new_glyph->is_space) {
+ memset(u, 0x00, 3);
+ u[3] = 0x20;
+ }
+ else {
+ for(ix = 0;ix < 4;ix++) {
+ u[3 - ix] = (pdev->OCRUnicode[ocr_index] & mask) >> (8 * ix);
+ mask = mask << 8;
+ }
+ }
+ *length = 4;
+ }
+ }
+ #endif
+ return 0;
+}
+
/*
* Add char code pair to ToUnicode CMap,
* creating the CMap if necessary.
@@ -87,27 +298,43 @@ pdf_process_string_aux(pdf_text_enum_t *penum, gs_string *pstr,
int
pdf_add_ToUnicode(gx_device_pdf *pdev, gs_font *font, pdf_font_resource_t *pdfont,
gs_glyph glyph, gs_char ch, const gs_const_string *gnstr)
-{ int code;
- gs_char length;
+{ int code = 0;
+ gs_char length = 0;
ushort *unicode = 0;
if (glyph == GS_NO_GLYPH)
return 0;
- length = font->procs.decode_glyph((gs_font *)font, glyph, ch, NULL, 0);
- if ((length == 0 || length == GS_NO_CHAR) && gnstr != NULL && gnstr->size == 7) {
- if (!memcmp(gnstr->data, "uni", 3)) {
- static const char *hexdigits = "0123456789ABCDEF";
- char *d0 = strchr(hexdigits, gnstr->data[3]);
- char *d1 = strchr(hexdigits, gnstr->data[4]);
- char *d2 = strchr(hexdigits, gnstr->data[5]);
- char *d3 = strchr(hexdigits, gnstr->data[6]);
-
- unicode = (ushort *)gs_alloc_bytes(pdev->memory, sizeof(ushort), "temporary Unicode array");
- if (d0 != NULL && d1 != NULL && d2 != NULL && d3 != NULL) {
- char *u = (char *)unicode;
- u[0] = ((d0 - hexdigits) << 4) + ((d1 - hexdigits));
- u[1] = ((d2 - hexdigits) << 4) + ((d3 - hexdigits));
- length = 2;
+ if(pdev->UseOCR == UseOCRAlways) {
+ code = OCRText(pdev, glyph, ch, &length, (byte **)&unicode);
+ if(code < 0)
+ return code;
+ }
+ else {
+ length = font->procs.decode_glyph((gs_font *)font, glyph, ch, NULL, 0);
+ if(length == 0 || length == GS_NO_CHAR) {
+ if(gnstr != NULL && gnstr->size == 7) {
+ if(!memcmp(gnstr->data, "uni", 3)) {
+ static const char *hexdigits = "0123456789ABCDEF";
+ char *d0 = strchr(hexdigits, gnstr->data[3]);
+ char *d1 = strchr(hexdigits, gnstr->data[4]);
+ char *d2 = strchr(hexdigits, gnstr->data[5]);
+ char *d3 = strchr(hexdigits, gnstr->data[6]);
+
+ unicode = (ushort *)gs_alloc_bytes(pdev->memory, sizeof(ushort), "temporary Unicode array");
+ if(d0 != NULL && d1 != NULL && d2 != NULL && d3 != NULL) {
+ char *u = (char *)unicode;
+ u[0] = ((d0 - hexdigits) << 4) + ((d1 - hexdigits));
+ u[1] = ((d2 - hexdigits) << 4) + ((d3 - hexdigits));
+ length = 2;
+ }
+ }
+ }
+ else {
+ if(pdev->UseOCR != UseOCRNever) {
+ code = OCRText(pdev, glyph, ch, &length, (byte **)&unicode);
+ if(code < 0)
+ return code;
+ }
}
}
}
@@ -163,6 +390,7 @@ pdf_add_ToUnicode(gx_device_pdf *pdev, gs_font *font, pdf_font_resource_t *pdfon
if (length > 2 && pdfont->u.simple.Encoding != NULL)
pdfont->TwoByteToUnicode = 0;
}
+
if (unicode)
gs_free_object(pdev->memory, unicode, "temporary Unicode array");
return 0;
@@ -255,8 +483,11 @@ pdf_encode_string_element(gx_device_pdf *pdev, gs_font *font, pdf_font_resource_
pet = &pdfont->u.simple.Encoding[ch];
glyph = (gdata == NULL ? font->procs.encode_char(font, ch, GLYPH_SPACE_NAME)
: *gdata);
- if (glyph == GS_NO_GLYPH || glyph == pet->glyph)
+ if (glyph == GS_NO_GLYPH || glyph == pet->glyph) {
+ if((pdfont->cmap_ToUnicode == NULL || !gs_cmap_ToUnicode_check_pair(pdfont->cmap_ToUnicode, ch)) && pdev->UseOCR != UseOCRNever)
+ (void)pdf_add_ToUnicode(pdev, font, pdfont, glyph, ch, &gnstr);
return 0;
+ }
if (pet->glyph != GS_NO_GLYPH) { /* encoding conflict */
return_error(gs_error_rangecheck);
/* Must not happen because pdf_obtain_font_resource
@@ -358,7 +589,7 @@ pdf_encode_string_element(gx_device_pdf *pdev, gs_font *font, pdf_font_resource_
* The decision about writing it out is deferred until pdf_write_font_resource.
*/
code = pdf_add_ToUnicode(pdev, font, pdfont, glyph, ch, &gnstr);
- if (code < 0)
+ if(code < 0)
return code;
pet->glyph = glyph;
pet->str = gnstr;
@@ -1035,6 +1266,7 @@ process_text_return_width(const pdf_text_enum_t *pte, gs_font_base *font,
{ const gs_glyph *gdata_i = (gdata != NULL ? gdata + i : 0);
code = pdf_encode_string_element(pdev, (gs_font *)font, pdfont, ch, gdata_i);
+
if (code < 0)
return code;
}