diff options
Diffstat (limited to 'devices/vector/gdevpdte.c')
-rw-r--r-- | devices/vector/gdevpdte.c | 272 |
1 files changed, 252 insertions, 20 deletions
diff --git a/devices/vector/gdevpdte.c b/devices/vector/gdevpdte.c index 6f0eb158..7a6b145e 100644 --- a/devices/vector/gdevpdte.c +++ b/devices/vector/gdevpdte.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2001-2020 Artifex Software, Inc. +/* Copyright (C) 2001-2021 Artifex Software, Inc. All Rights Reserved. This software is provided AS-IS with no warranty, either express or @@ -43,6 +43,7 @@ #include "gxcpath.h" #include "gsfcmap.h" +#include "tessocr.h" static int pdf_char_widths(gx_device_pdf *const pdev, pdf_font_resource_t *pdfont, int ch, @@ -80,6 +81,216 @@ pdf_process_string_aux(pdf_text_enum_t *penum, gs_string *pstr, return pdf_process_string(penum, pstr, pfmat, ppts, gdata); } +static int OCRText(gx_device_pdf *pdev, gs_glyph glyph, gs_char ch, gs_char *length, byte **unicode) +{ +#if OCR_VERSION > 0 + int code = 0; + + if(pdev->OCRStage == OCR_Rendered) { + int llx, lly, urx, ury, char_count = 0, returned_count = 0, *returned; + ocr_glyph_t *next_glyph = pdev->ocr_glyphs; + int rows, stride, row, column; + byte *bitmap = NULL, *src, *dest, *rowptr, srcmask, destmask; + void *state; + const char *language = pdev->ocr_language; + gp_file *DbgFile; + + if(language == NULL || language[0] == 0) + language = "eng"; + + /* We should alredy have rendered a bitmap for all the glyphs in the + * text operation, so this shuld be redundant, but best to be safe. + */ + if(next_glyph == NULL) + return_error(gs_error_unknownerror); + + /* Identify the bounding box of the returned glyphs by examing the bounds and position + * of each glyph. At the same time count the number of expected returned characters. + * We treat any empty bitmap (all 0x00 bytes) as a space because, obviously, the + * OCR engine can't tell differentiate between a space character and no character at all. + */ + llx = next_glyph->x; + lly = next_glyph->y; + urx = llx + next_glyph->width; + ury = lly + next_glyph->height; + if(next_glyph != NULL && !next_glyph->is_space) + char_count++; + next_glyph = (ocr_glyph_t *)next_glyph->next; + while(next_glyph) { + if(!next_glyph->is_space) + char_count++; + if(next_glyph->x < llx) + llx = next_glyph->x; + if(next_glyph->y < lly) + lly = next_glyph->y; + if(next_glyph->x + next_glyph->width > urx) + urx = next_glyph->x + next_glyph->width; + if(next_glyph->y + next_glyph->height > ury) + ury = next_glyph->y + next_glyph->height; + next_glyph = next_glyph->next; + } + + /* Allocate and initialise the 'strip' bitmap which will receive all the + * individual glyph bitmaps. + */ + rows = ury - lly; + stride = (((urx - llx) + 7) / 8) + 1; + bitmap = gs_alloc_bytes(pdev->memory, rows * stride, "working OCR memory"); + if(bitmap == NULL) + return_error(gs_error_VMerror); + memset(bitmap, 0x00, rows * stride); + + /* Allocate a buffer for the OCR engine to return the Unicode code points. This needs work, + * we might want more information returned (bounding boxes and confidence levels) and we + * need to think about the possibility that the OCR engine finds more character than we + * expected (eg fi ligatures returned as 'f' and 'i'. + */ + returned = (int *)gs_alloc_bytes(pdev->memory, char_count * sizeof(int), "returned unicodes"); + if(returned == NULL) { + gs_free_object(pdev->memory, bitmap, "working OCR memory"); + return_error(gs_error_VMerror); + } + memset(returned, 0x00, char_count * sizeof(int)); + + /* Now copy each glyph bitmap to the correct position in the strip. This is complicated + * by the fact that bitmaps are monochrome pcaked into bytes and so the destination + * may not be aligned on a byte boundary. + */ + next_glyph = (ocr_glyph_t *)pdev->ocr_glyphs; + while(next_glyph) { + rowptr = bitmap + ((next_glyph->y - lly) * stride) + (int)floor((next_glyph->x - llx) / 8); + for(row = 0;row < next_glyph->height;row++) { + dest = rowptr + row * stride; + src = next_glyph->data + (row * next_glyph->raster); + destmask = 0x80 >> (next_glyph->x - llx) % 8; + srcmask = 0x80; + for(column = 0; column < next_glyph->width;column++) { + if(*src & srcmask) { + *dest = *dest | destmask; + } + srcmask = srcmask >> 1; + if(srcmask == 0) { + srcmask = 0x80; + src++; + } + destmask = destmask >> 1; + if(destmask == 0) { + destmask = 0x80; + dest++; + } + } + } + next_glyph = next_glyph->next; + } + +#if 0 + DbgFile = gp_fopen(pdev->memory, "d:/temp/bits.txt", "wb+"); + for(row = 0;row < rows;row++) { + for(column = 0;column < stride;column++) { + dest = bitmap + (row * stride); + gp_fprintf(DbgFile, "%02x", dest[column]); + } + gp_fprintf(DbgFile, "\n"); + } + gp_fclose(DbgFile); +#endif + /* Initialise the OCR engine */ + code = ocr_init_api(pdev->memory->non_gc_memory, language, + pdev->ocr_engine, &state); + if(code < 0) { + gs_free_object(pdev->memory, bitmap, "working OCR memory"); + gs_free_object(pdev->memory, returned, "returned unicodes"); + return 0; + } + returned_count = char_count; + + /* Pass our strip to the OCR engine */ + code = ocr_bitmap_to_unicodes(state, + bitmap, 0, stride * 8, rows, stride, + (int)pdev->HWResolution[0], + (int)pdev->HWResolution[1], + returned, &returned_count); + + /* and close the engine back down again */ + ocr_fin_api(pdev->memory->non_gc_memory, state); + gs_free_object(pdev->memory, bitmap, "working OCR memory"); + + if(code < 0) { + pdev->OCRStage = OCR_Failed; + gs_free_object(pdev->memory, returned, "returned unicodes"); + return code; + } + + /* Future enhancement we should fall back to trying the individual bitmap here */ + if(returned_count != char_count) { + pdev->OCRStage = OCR_Failed; + gs_free_object(pdev->memory, returned, "returned unicodes"); + return 0; + } + pdev->OCRUnicode = returned; + + /* Actually perform OCR on the stored bitmaps */ + pdev->OCRStage = OCR_UnicodeAvailable; + } + + if(pdev->OCRStage == OCR_UnicodeAvailable) { + /* We've OCR'ed the bitmaps already, find the unicode value */ + ocr_glyph_t *new_glyph = (ocr_glyph_t *)pdev->ocr_glyphs; + int ocr_index = 0; + uint mask = 0xFF; + int ix; + char *u; + + /* Find the bitmap which matches the character/glyph we are processing */ + while(new_glyph) { + if(new_glyph->char_code == ch || new_glyph->glyph == glyph) { + ocr_glyph_t *g1 = pdev->ocr_glyphs; + + /* Spaces are handled specially, so just jump out now */ + if(new_glyph->is_space) + break; + + /* Otherwise, find all the bitmaps which lie to the left of the + * one we found (we are assuming for now that the returned + * Unicode values are left to right) + */ + while(g1) { + if(!g1->is_space) { + if(g1->x < new_glyph->x) + ocr_index++; + } + g1 = g1->next; + } + break; + } + new_glyph = new_glyph->next; + } + + /* If we found a matching bitmap, get the corresponding unicode code point from + * the stored values returned by the OCR engine. + */ + if(new_glyph) { + *unicode = (byte *)gs_alloc_bytes(pdev->memory, 2 * sizeof(ushort), "temporary Unicode array"); + if(*unicode == NULL) + return_error(gs_error_VMerror); + u = (char *)(*unicode); + if(new_glyph->is_space) { + memset(u, 0x00, 3); + u[3] = 0x20; + } + else { + for(ix = 0;ix < 4;ix++) { + u[3 - ix] = (pdev->OCRUnicode[ocr_index] & mask) >> (8 * ix); + mask = mask << 8; + } + } + *length = 4; + } + } + #endif + return 0; +} + /* * Add char code pair to ToUnicode CMap, * creating the CMap on neccessity. @@ -87,27 +298,43 @@ pdf_process_string_aux(pdf_text_enum_t *penum, gs_string *pstr, int pdf_add_ToUnicode(gx_device_pdf *pdev, gs_font *font, pdf_font_resource_t *pdfont, gs_glyph glyph, gs_char ch, const gs_const_string *gnstr) -{ int code; - gs_char length; +{ int code = 0; + gs_char length = 0; ushort *unicode = 0; if (glyph == GS_NO_GLYPH) return 0; - length = font->procs.decode_glyph((gs_font *)font, glyph, ch, NULL, 0); - if ((length == 0 || length == GS_NO_CHAR) && gnstr != NULL && gnstr->size == 7) { - if (!memcmp(gnstr->data, "uni", 3)) { - static const char *hexdigits = "0123456789ABCDEF"; - char *d0 = strchr(hexdigits, gnstr->data[3]); - char *d1 = strchr(hexdigits, gnstr->data[4]); - char *d2 = strchr(hexdigits, gnstr->data[5]); - char *d3 = strchr(hexdigits, gnstr->data[6]); - - unicode = (ushort *)gs_alloc_bytes(pdev->memory, sizeof(ushort), "temporary Unicode array"); - if (d0 != NULL && d1 != NULL && d2 != NULL && d3 != NULL) { - char *u = (char *)unicode; - u[0] = ((d0 - hexdigits) << 4) + ((d1 - hexdigits)); - u[1] = ((d2 - hexdigits) << 4) + ((d3 - hexdigits)); - length = 2; + if(pdev->UseOCR == UseOCRAlways) { + code = OCRText(pdev, glyph, ch, &length, (byte **)&unicode); + if(code < 0) + return code; + } + else { + length = font->procs.decode_glyph((gs_font *)font, glyph, ch, NULL, 0); + if(length == 0 || length == GS_NO_CHAR) { + if(gnstr != NULL && gnstr->size == 7) { + if(!memcmp(gnstr->data, "uni", 3)) { + static const char *hexdigits = "0123456789ABCDEF"; + char *d0 = strchr(hexdigits, gnstr->data[3]); + char *d1 = strchr(hexdigits, gnstr->data[4]); + char *d2 = strchr(hexdigits, gnstr->data[5]); + char *d3 = strchr(hexdigits, gnstr->data[6]); + + unicode = (ushort *)gs_alloc_bytes(pdev->memory, sizeof(ushort), "temporary Unicode array"); + if(d0 != NULL && d1 != NULL && d2 != NULL && d3 != NULL) { + char *u = (char *)unicode; + u[0] = ((d0 - hexdigits) << 4) + ((d1 - hexdigits)); + u[1] = ((d2 - hexdigits) << 4) + ((d3 - hexdigits)); + length = 2; + } + } + } + else { + if(pdev->UseOCR != UseOCRNever) { + code = OCRText(pdev, glyph, ch, &length, (byte **)&unicode); + if(code < 0) + return code; + } } } } @@ -163,6 +390,7 @@ pdf_add_ToUnicode(gx_device_pdf *pdev, gs_font *font, pdf_font_resource_t *pdfon if (length > 2 && pdfont->u.simple.Encoding != NULL) pdfont->TwoByteToUnicode = 0; } + if (unicode) gs_free_object(pdev->memory, unicode, "temporary Unicode array"); return 0; @@ -255,8 +483,11 @@ pdf_encode_string_element(gx_device_pdf *pdev, gs_font *font, pdf_font_resource_ pet = &pdfont->u.simple.Encoding[ch]; glyph = (gdata == NULL ? font->procs.encode_char(font, ch, GLYPH_SPACE_NAME) : *gdata); - if (glyph == GS_NO_GLYPH || glyph == pet->glyph) + if (glyph == GS_NO_GLYPH || glyph == pet->glyph) { + if((pdfont->cmap_ToUnicode == NULL || !gs_cmap_ToUnicode_check_pair(pdfont->cmap_ToUnicode, ch)) && pdev->UseOCR != UseOCRNever) + (void)pdf_add_ToUnicode(pdev, font, pdfont, glyph, ch, &gnstr); return 0; + } if (pet->glyph != GS_NO_GLYPH) { /* encoding conflict */ return_error(gs_error_rangecheck); /* Must not happen because pdf_obtain_font_resource @@ -358,7 +589,7 @@ pdf_encode_string_element(gx_device_pdf *pdev, gs_font *font, pdf_font_resource_ * The decision about writing it out is deferred until pdf_write_font_resource. */ code = pdf_add_ToUnicode(pdev, font, pdfont, glyph, ch, &gnstr); - if (code < 0) + if(code < 0) return code; pet->glyph = glyph; pet->str = gnstr; @@ -1035,6 +1266,7 @@ process_text_return_width(const pdf_text_enum_t *pte, gs_font_base *font, { const gs_glyph *gdata_i = (gdata != NULL ? gdata + i : 0); code = pdf_encode_string_element(pdev, (gs_font *)font, pdfont, ch, gdata_i); + if (code < 0) return code; } |