summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'devices/gdevpdfocr.c')
-rw-r--r--devices/gdevpdfocr.c727
1 files changed, 727 insertions, 0 deletions
diff --git a/devices/gdevpdfocr.c b/devices/gdevpdfocr.c
new file mode 100644
index 00000000..95d358b1
--- /dev/null
+++ b/devices/gdevpdfocr.c
@@ -0,0 +1,727 @@
+/* Copyright (C) 2001-2020 Artifex Software, Inc.
+ All Rights Reserved.
+
+ This software is provided AS-IS with no warranty, either express or
+ implied.
+
+ This software is distributed under license and may not be copied,
+ modified or distributed except as expressly authorized under the terms
+ of the license contained in the file LICENSE in this distribution.
+
+ Refer to licensing information at http://www.artifex.com or contact
+ Artifex Software, Inc., 1305 Grant Avenue - Suite 200, Novato,
+ CA 94945, U.S.A., +1(415)492-9861, for further information.
+*/
+
+#include "stdint_.h"
+#include "gdevprn.h"
+#include "gxdownscale.h"
+#include "gdevkrnlsclass.h" /* 'standard' built in subclasses, currently First/Last Page and object filter */
+#include "stream.h"
+#include "spprint.h"
+#include "time_.h"
+#include "smd5.h"
+#include "sstring.h"
+#include "strimpl.h"
+#include "slzwx.h"
+#include "szlibx.h"
+#include "sdct.h"
+#include "srlx.h"
+#include "gsicc_cache.h"
+#include "sjpeg.h"
+
+#include "gdevpdfimg.h"
+
+#define COMPRESSION_NONE 1 /* dump mode */
+#define COMPRESSION_LZW 2 /* Lempel-Ziv & Welch */
+#define COMPRESSION_FLATE 3 /* NOTE(review): the device initialisers in this file pass a bare 3 right after prn_device_body(); presumably this id - confirm */
+#define COMPRESSION_JPEG 4
+#define COMPRESSION_RLE 5
+
+static struct compression_string { /* Pairs a COMPRESSION_* id with its printable name. */
+ unsigned char id; /* one of the COMPRESSION_* values above; 0 in the terminating entry */
+ const char *str; /* name text; NULL in the terminating entry */
+} compression_strings [] = { /* NOTE(review): no code visible in this file references this table. */
+ { COMPRESSION_NONE, "None" },
+ { COMPRESSION_LZW, "LZW" }, /* Not supported in PCLm */
+ { COMPRESSION_FLATE, "Flate" },
+ { COMPRESSION_JPEG, "JPEG" },
+ { COMPRESSION_RLE, "RLE" },
+ { 0, NULL } /* terminator */
+};
+
+int pdf_ocr_open(gx_device *pdev); /* Forward declaration: named in the device procs tables before its definition at the end of the file. */
+int pdf_ocr_close(gx_device *pdev); /* Forward declaration: named in the device procs tables before its definition at the end of the file. */
+
+
+/*
+ * Read the OCR-specific device parameters from plist.
+ *
+ * Only "OCRLanguage" is handled. When present, its value is copied into
+ * pdf_dev->ocr.language, silently truncated to fit the buffer and always
+ * NUL terminated.
+ *
+ * Returns 0 when the parameter was read or was simply not supplied.  Any
+ * other result of param_read_string() is signalled on plist and returned.
+ */
+static int
+pdfocr_put_some_params(gx_device * dev, gs_param_list * plist)
+{
+    gx_device_pdf_image *const pdf_dev = (gx_device_pdf_image *)dev;
+    int ecode = 0;
+    int code;
+    gs_param_string langstr;
+    const char *param_name;
+    size_t len;
+
+    switch (code = param_read_string(plist, (param_name = "OCRLanguage"), &langstr)) {
+        case 0:
+            len = langstr.size;
+            if (len >= sizeof(pdf_dev->ocr.language))
+                len = sizeof(pdf_dev->ocr.language)-1;
+            memcpy(pdf_dev->ocr.language, langstr.data, len);
+            pdf_dev->ocr.language[len] = 0;
+            break;
+        case 1:
+            /* Parameter not supplied: keep whatever language is already set. */
+            break;
+        default:
+            ecode = code;
+            param_signal_error(plist, param_name, ecode);
+            break;
+    }
+
+    /* Return ecode rather than the raw read result, so that the
+     * "not supplied" value (1) is not leaked to callers as a status. */
+    return ecode;
+}
+
+/*
+ * put_params entry point for the CMYK OCR device: apply the OCR-specific
+ * parameters first and, unless that failed, hand the list on to
+ * pdf_image_put_params_downscale_cmyk().
+ */
+static int
+pdfocr_put_params_downscale_cmyk(gx_device * dev, gs_param_list * plist)
+{
+    const int status = pdfocr_put_some_params(dev, plist);
+
+    return (status < 0) ? status
+                        : pdf_image_put_params_downscale_cmyk(dev, plist);
+}
+
+/*
+ * put_params entry point for the grey and RGB OCR devices: apply the
+ * OCR-specific parameters first and, unless that failed, hand the list on
+ * to pdf_image_put_params_downscale().
+ */
+static int
+pdfocr_put_params_downscale(gx_device * dev, gs_param_list * plist)
+{
+    const int status = pdfocr_put_some_params(dev, plist);
+
+    return (status < 0) ? status
+                        : pdf_image_put_params_downscale(dev, plist);
+}
+
+/*
+ * Publish the OCR-specific device parameters to plist.
+ *
+ * Writes "OCRLanguage": the configured language, or "eng" when the
+ * language buffer is empty.  Returns 0 on success, otherwise the negative
+ * code from param_write_string().
+ */
+static int
+pdfocr_get_some_params(gx_device * dev, gs_param_list * plist)
+{
+    gx_device_pdf_image *const pdf_dev = (gx_device_pdf_image *)dev;
+    /* An empty buffer means "not configured", which is reported as "eng". */
+    const char *lang = pdf_dev->ocr.language[0] ? pdf_dev->ocr.language : "eng";
+    gs_param_string langstr;
+    int code;
+
+    langstr.data = (const byte *)lang;
+    langstr.size = strlen(lang);
+    langstr.persistent = false;
+
+    code = param_write_string(plist, "OCRLanguage", &langstr);
+
+    return (code < 0) ? code : 0;
+}
+
+/*
+ * get_params entry point for the CMYK OCR device: report the OCR-specific
+ * parameters, then let pdf_image_get_params_downscale_cmyk() add the rest.
+ * A failure in the first step is returned immediately.
+ */
+static int
+pdfocr_get_params_downscale_cmyk(gx_device * dev, gs_param_list * plist)
+{
+    const int status = pdfocr_get_some_params(dev, plist);
+
+    return (status < 0) ? status
+                        : pdf_image_get_params_downscale_cmyk(dev, plist);
+}
+
+/*
+ * get_params entry point for the grey and RGB OCR devices: report the
+ * OCR-specific parameters, then let pdf_image_get_params_downscale() add
+ * the rest.  A failure in the first step is returned immediately.
+ */
+static int
+pdfocr_get_params_downscale(gx_device * dev, gs_param_list * plist)
+{
+    const int status = pdfocr_get_some_params(dev, plist);
+
+    return (status < 0) ? status
+                        : pdf_image_get_params_downscale(dev, plist);
+}
+
+/* ------ The pdfocr8 device ------
+ * One colour component at 8 bits, using the default gray colour mapping
+ * procs.  Open/close and get/put-params go through the OCR-aware wrappers
+ * defined in this file; page output is pdf_image_print_page.
+ */
+
+static const gx_device_procs pdfocr8_procs =
+prn_color_params_procs(pdf_ocr_open,
+ gdev_prn_output_page_seekable,
+ pdf_ocr_close,
+ gx_default_gray_map_rgb_color,
+ gx_default_gray_map_color_rgb,
+ pdfocr_get_params_downscale,
+ pdfocr_put_params_downscale);
+
+const gx_device_pdf_image gs_pdfocr8_device = {
+ prn_device_body(gx_device_pdf_image,
+ pdfocr8_procs,
+ "pdfocr8",
+ DEFAULT_WIDTH_10THS, DEFAULT_HEIGHT_10THS,
+ 600, 600, /* 600 dpi by default */
+ 0, 0, 0, 0, /* Margins */
+ 1, /* num components */
+ 8, /* bits per sample */
+ 255, 0, 256, 0,
+ pdf_image_print_page),
+ 3, /* NOTE(review): presumably the default compression id (3 == COMPRESSION_FLATE) - confirm against gx_device_pdf_image */
+ GX_DOWNSCALER_PARAMS_DEFAULTS,
+ 0, /* StripHeight */
+ 0.0, /* QFactor */
+ 0 /* JPEGQ */
+};
+
+/* ------ The pdfocr24 device ------
+ * Three colour components, using the default RGB colour mapping procs.
+ * Open/close and get/put-params go through the OCR-aware wrappers defined
+ * in this file; page output is pdf_image_print_page.
+ */
+
+static const gx_device_procs pdfocr24_procs =
+prn_color_params_procs(pdf_ocr_open,
+ gdev_prn_output_page_seekable,
+ pdf_ocr_close,
+ gx_default_rgb_map_rgb_color,
+ gx_default_rgb_map_color_rgb,
+ pdfocr_get_params_downscale,
+ pdfocr_put_params_downscale);
+
+const gx_device_pdf_image gs_pdfocr24_device = {
+ prn_device_body(gx_device_pdf_image,
+ pdfocr24_procs,
+ "pdfocr24",
+ DEFAULT_WIDTH_10THS, DEFAULT_HEIGHT_10THS,
+ 600, 600, /* 600 dpi by default */
+ 0, 0, 0, 0, /* Margins */
+ 3, /* num components */
+ 24, /* bits per sample - NOTE(review): 24 looks like the total bits per pixel (3 x 8); confirm against prn_device_body */
+ 255, 255, 256, 256,
+ pdf_image_print_page),
+ 3, /* NOTE(review): presumably the default compression id (3 == COMPRESSION_FLATE) - confirm against gx_device_pdf_image */
+ GX_DOWNSCALER_PARAMS_DEFAULTS,
+ 0, /* StripHeight */
+ 0.0, /* QFactor */
+ 0 /* JPEGQ */
+};
+
+/* ------ The pdfocr32 device ------
+ * Four colour components, using the cmyk_8bit_* colour mapping procs and
+ * the *_cmyk get/put-params wrappers.  Unlike the other two devices the
+ * procs table is written as a positional initialiser; which slot each
+ * entry fills is fixed by the gx_device_procs layout, which is not visible
+ * in this file.
+ */
+
+static const gx_device_procs pdfocr32_procs = {
+ pdf_ocr_open, NULL, NULL, gdev_prn_output_page_seekable, pdf_ocr_close,
+ NULL, cmyk_8bit_map_color_cmyk, NULL, NULL, NULL, NULL, NULL, NULL,
+ pdfocr_get_params_downscale_cmyk, pdfocr_put_params_downscale_cmyk,
+ cmyk_8bit_map_cmyk_color, NULL, NULL, NULL, gx_page_device_get_page_device
+};
+
+const gx_device_pdf_image gs_pdfocr32_device = {
+ prn_device_body(gx_device_pdf_image,
+ pdfocr32_procs,
+ "pdfocr32",
+ DEFAULT_WIDTH_10THS, DEFAULT_HEIGHT_10THS,
+ 600, 600, /* 600 dpi by default */
+ 0, 0, 0, 0, /* Margins */
+ 4, /* num components */
+ 32, /* bits per sample - NOTE(review): 32 looks like the total bits per pixel (4 x 8); confirm against prn_device_body */
+ 255, 255, 256, 256,
+ pdf_image_print_page),
+ 3, /* NOTE(review): presumably the default compression id (3 == COMPRESSION_FLATE) - confirm against gx_device_pdf_image */
+ GX_DOWNSCALER_PARAMS_DEFAULTS,
+ 0, /* StripHeight */
+ 0.0, /* QFactor */
+ 0 /* JPEGQ */
+};
+
+/* Funky font: the fixed PDF objects 3-8 that ocr_file_init() writes to the
+ * output stream.  This first piece is object 3, a Type0 font named
+ * GlyphLessFont with Identity-H encoding, object 4 as its descendant font
+ * and object 6 as its ToUnicode entry. */
+static const char funky_font[] =
+"3 0 obj\n<</BaseFont/GlyphLessFont/DescendantFonts[4 0 R]"
+"/Encoding/Identity-H/Subtype/Type0/ToUnicode 6 0 R/Type /Font"
+">>\nendobj\n";
+
+static const char funky_font2[] = /* PDF object 4: the CIDFontType2 descendant font; points at object 5 (CIDToGIDMap) and object 7 (FontDescriptor), default width 500. */
+"4 0 obj\n<</BaseFont/GlyphLessFont"
+"/CIDToGIDMap 5 0 R\n/CIDSystemInfo<<\n"
+"/Ordering (Identity)/Registry (Adobe)/Supplement 0>>"
+"/FontDescriptor 7 0 R/Subtype /CIDFontType2/Type/Font"
+"/DW 500>>\nendobj\n";
+
+static const char funky_font3[] = /* Header of PDF object 5 (the CIDToGIDMap stream). The declared /Length 210 must equal sizeof(funky_font3a), which is written next. */
+"5 0 obj\n<</Length 210/Filter/FlateDecode"
+">>stream\n";
+
+static const char funky_font3a[] = { /* Stream data of PDF object 5: 210 bytes, written verbatim with sizeof (no terminator). Declared as /FlateDecode data by funky_font3. */
+0x78, 0x9c, 0xec, 0xc2, 0x01, 0x09, 0x00, 0x00,
+0x00, 0x02, 0xa0, 0xfa, 0x7f, 0xba, 0x21, 0x89,
+0xa6, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x80, 0x7b, 0x03, 0x00, 0x00, 0xff, 0xff, 0xec,
+0xc2, 0x01, 0x0d, 0x00, 0x00, 0x00, 0xc2, 0x20,
+0xdf, 0xbf, 0xb4, 0x45, 0x18, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0xeb, 0x00, 0x00,
+0x00, 0xff, 0xff, 0xec, 0xc2, 0x01, 0x0d, 0x00,
+0x00, 0x00, 0xc2, 0x20, 0xdf, 0xbf, 0xb4, 0x45,
+0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0xeb, 0x00, 0x00, 0x00, 0xff, 0xff, 0xed,
+0xc2, 0x01, 0x0d, 0x00, 0x00, 0x00, 0xc2, 0x20,
+0xdf, 0xbf, 0xb4, 0x45, 0x18, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0xeb, 0x00, 0xff,
+0x00, 0x10
+};
+
+static const char funky_font3b[] = /* Trailer closing the stream and PDF object 5; written straight after funky_font3a. */
+"endstream\nendobj\n";
+
+static const char funky_font4[] = /* PDF object 6: the ToUnicode CMap stream. One bfrange maps codes <0000>-<FFFF> starting at <0000>. The text between "stream\n" and "endstream" must stay in step with the declared /Length 353. */
+"6 0 obj\n<</Length 353>>\nstream\n"
+"/CIDInit /ProcSet findresource begin\n"
+"12 dict begin\n"
+"begincmap\n"
+"/CIDSystemInfo\n"
+"<<\n"
+" /Registry (Adobe)\n"
+" /Ordering (UCS)\n"
+" /Supplement 0\n"
+">> def\n"
+"/CMapName /Adobe-Identify-UCS def\n"
+"/CMapType 2 def\n"
+"1 begincodespacerange\n"
+"<0000> <FFFF>\n"
+"endcodespacerange\n"
+"1 beginbfrange\n"
+"<0000> <FFFF> <0000>\n"
+"endbfrange\n"
+"endcmap\n"
+"CMapName currentdict /CMap defineresource pop\n"
+"end\n"
+"end\n"
+"endstream\n"
+"endobj\n";
+
+static const char funky_font5[] = /* PDF object 7: the FontDescriptor for GlyphLessFont; its /FontFile2 entry points at object 8. */
+"7 0 obj\n"
+"<</Ascent 1000/CapHeight 1000/Descent -1/Flags 5"
+"/FontBBox[0 0 500 1000]/FontFile2 8 0 R/FontName/GlyphLessFont"
+"/ItalicAngle 0/StemV 80/Type/FontDescriptor>>\nendobj\n";
+
+static const char funky_font6[] = /* Header of PDF object 8 (the embedded font file stream). Both declared lengths (572) must equal sizeof(funky_font6a), which is written next. */
+"8 0 obj\n<</Length 572/Length1 572>>\nstream\n";
+
+static const char funky_font6a[] = /* Stream data of PDF object 8: 572 bytes of uncompressed font file data, written verbatim with sizeof (no terminator). The table tags (OS/2, cmap, glyf, head, ...) are readable as ASCII in the first rows. */
+{
+0x00, 0x01, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x80,
+0x00, 0x03, 0x00, 0x20, 0x4f, 0x53, 0x2f, 0x32,
+0x56, 0xde, 0xc8, 0x94, 0x00, 0x00, 0x01, 0x28,
+0x00, 0x00, 0x00, 0x60, 0x63, 0x6d, 0x61, 0x70,
+0x00, 0x0a, 0x00, 0x34, 0x00, 0x00, 0x01, 0x90,
+0x00, 0x00, 0x00, 0x1e, 0x67, 0x6c, 0x79, 0x66,
+0x15, 0x22, 0x41, 0x24, 0x00, 0x00, 0x01, 0xb8,
+0x00, 0x00, 0x00, 0x18, 0x68, 0x65, 0x61, 0x64,
+0x0b, 0x78, 0xf1, 0x65, 0x00, 0x00, 0x00, 0xac,
+0x00, 0x00, 0x00, 0x36, 0x68, 0x68, 0x65, 0x61,
+0x0c, 0x02, 0x04, 0x02, 0x00, 0x00, 0x00, 0xe4,
+0x00, 0x00, 0x00, 0x24, 0x68, 0x6d, 0x74, 0x78,
+0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x88,
+0x00, 0x00, 0x00, 0x08, 0x6c, 0x6f, 0x63, 0x61,
+0x00, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x01, 0xb0,
+0x00, 0x00, 0x00, 0x06, 0x6d, 0x61, 0x78, 0x70,
+0x00, 0x04, 0x00, 0x05, 0x00, 0x00, 0x01, 0x08,
+0x00, 0x00, 0x00, 0x20, 0x6e, 0x61, 0x6d, 0x65,
+0xf2, 0xeb, 0x16, 0xda, 0x00, 0x00, 0x01, 0xd0,
+0x00, 0x00, 0x00, 0x4b, 0x70, 0x6f, 0x73, 0x74,
+0x00, 0x01, 0x00, 0x01, 0x00, 0x00, 0x02, 0x1c,
+0x00, 0x00, 0x00, 0x20, 0x00, 0x01, 0x00, 0x00,
+0x00, 0x01, 0x00, 0x00, 0xb0, 0x94, 0x71, 0x10,
+0x5f, 0x0f, 0x3c, 0xf5, 0x04, 0x07, 0x08, 0x00,
+0x00, 0x00, 0x00, 0x00, 0xcf, 0x9a, 0xfc, 0x6e,
+0x00, 0x00, 0x00, 0x00, 0xd4, 0xc3, 0xa7, 0xf2,
+0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x08, 0x00,
+0x00, 0x00, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00,
+0x08, 0x00, 0xff, 0xff, 0x00, 0x00, 0x04, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x01,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02,
+0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x04,
+0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x03, 0x00, 0x00, 0x01, 0x90, 0x00, 0x05,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x05, 0x00, 0x01, 0x00, 0x01, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x47, 0x4f, 0x4f, 0x47, 0x00, 0x40,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0xff, 0xff,
+0x00, 0x00, 0x00, 0x01, 0x00, 0x01, 0x80, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x14, 0x00, 0x03, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x14, 0x00, 0x06, 0x00, 0x0a,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00,
+0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00,
+0x08, 0x00, 0x00, 0x03, 0x00, 0x00, 0x31, 0x21,
+0x11, 0x21, 0x04, 0x00, 0xfc, 0x00, 0x08, 0x00,
+0x00, 0x00, 0x00, 0x03, 0x00, 0x2a, 0x00, 0x00,
+0x00, 0x03, 0x00, 0x00, 0x00, 0x05, 0x00, 0x16,
+0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x05, 0x00, 0x0b, 0x00, 0x16, 0x00, 0x03,
+0x00, 0x01, 0x04, 0x09, 0x00, 0x05, 0x00, 0x16,
+0x00, 0x00, 0x00, 0x56, 0x00, 0x65, 0x00, 0x72,
+0x00, 0x73, 0x00, 0x69, 0x00, 0x6f, 0x00, 0x6e,
+0x00, 0x20, 0x00, 0x31, 0x00, 0x2e, 0x00, 0x30,
+0x56, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x20,
+0x31, 0x2e, 0x30, 0x00, 0x00, 0x01, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00
+};
+
+static const char funky_font6b[] = /* Trailer closing the stream and PDF object 8; written straight after funky_font6a. */
+"endstream\nendobj\n";
+
+/* Record the current output position as the start of the next file object. */
+static void
+ocr_note_object_start(gx_device_pdf_image *dev)
+{
+    dev->ocr.file_object_offset[dev->ocr.file_objects++] = stell(dev->strm);
+}
+
+/*
+ * Per-file OCR set-up, installed as dev->ocr.file_init by pdf_ocr_open().
+ *
+ * Writes the six fixed font objects (PDF objects 3 to 8) to dev->strm,
+ * recording the stream offset of each one in dev->ocr.file_object_offset,
+ * and then starts the OCR engine for the configured language ("eng" when
+ * none is set).  Returns the result of ocr_init_api().
+ */
+static int
+ocr_file_init(gx_device_pdf_image *dev)
+{
+    const char *language = dev->ocr.language;
+
+    if (!language || !*language)
+        language = "eng";
+
+    /* Object 3: Type0 font. */
+    ocr_note_object_start(dev);
+    stream_write(dev->strm, funky_font, sizeof(funky_font)-1);
+
+    /* Object 4: descendant CID font. */
+    ocr_note_object_start(dev);
+    stream_write(dev->strm, funky_font2, sizeof(funky_font2)-1);
+
+    /* Object 5: CIDToGIDMap stream (header, binary data, trailer). */
+    ocr_note_object_start(dev);
+    stream_write(dev->strm, funky_font3, sizeof(funky_font3)-1);
+    stream_write(dev->strm, funky_font3a, sizeof(funky_font3a));
+    stream_write(dev->strm, funky_font3b, sizeof(funky_font3b)-1);
+
+    /* Object 6: ToUnicode CMap. */
+    ocr_note_object_start(dev);
+    stream_write(dev->strm, funky_font4, sizeof(funky_font4)-1);
+
+    /* Object 7: font descriptor. */
+    ocr_note_object_start(dev);
+    stream_write(dev->strm, funky_font5, sizeof(funky_font5)-1);
+
+    /* Object 8: embedded font file (header, binary data, trailer). */
+    ocr_note_object_start(dev);
+    stream_write(dev->strm, funky_font6, sizeof(funky_font6)-1);
+    stream_write(dev->strm, funky_font6a, sizeof(funky_font6a));
+    stream_write(dev->strm, funky_font6b, sizeof(funky_font6b)-1);
+
+    return ocr_init_api(dev->memory, language, &dev->ocr.state);
+}
+
+/*
+ * Store one row of 8-bit samples in the OCR page buffer and advance
+ * dev->ocr.y to the next row.
+ *
+ * Buffer rows are padded to a multiple of 4 bytes.  On little-endian
+ * builds the bytes of every aligned group of four are stored in reverse
+ * order (index i goes to i^3); big-endian builds copy the row unchanged.
+ * NOTE(review): presumably the OCR engine reads the buffer as 32-bit
+ * words - confirm.
+ */
+static void
+ocr_line8(gx_device_pdf_image *dev, void *row)
+{
+    const int width = dev->ocr.w;
+    const int stride = (width+3)&~3;
+    char *src = (char *)row;
+    char *dst = ((char *)dev->ocr.data) + stride * dev->ocr.y++;
+
+#if ARCH_IS_BIG_ENDIAN
+    memcpy(dst, src, dev->ocr.w);
+#else
+    {
+        int x = 0;
+
+        while (x < width) {
+            dst[x^3] = src[x];
+            x++;
+        }
+    }
+#endif
+}
+
+/*
+ * Convert one row of 3-byte pixels to 8-bit grey, store it in the OCR page
+ * buffer and advance dev->ocr.y to the next row.
+ *
+ * Each output sample is (c0 + 2*c1 + c2) / 4 of the three input bytes.
+ * Buffer rows are padded to a multiple of 4 bytes; on little-endian builds
+ * sample i is stored at index i^3, on big-endian builds at index i.
+ *
+ * The samples are read as unsigned char.  Reading them through plain char
+ * made bytes >= 0x80 negative wherever char is signed, which corrupted the
+ * weighted sum.
+ */
+static void
+ocr_line24(gx_device_pdf_image *dev, void *row)
+{
+    int w = dev->ocr.w;
+    int raster = (w+3)&~3;
+    const unsigned char *in = (const unsigned char *)row;
+    unsigned char *out = ((unsigned char *)dev->ocr.data) + raster * dev->ocr.y++;
+    int i;
+
+    for (i = 0; i < w; i++, in += 3) {
+        /* 0..1020, so the shifted result always fits in a byte. */
+        int v = in[0] + 2 * in[1] + in[2];
+
+#if ARCH_IS_BIG_ENDIAN
+        out[i] = (unsigned char)(v >> 2);
+#else
+        out[i^3] = (unsigned char)(v >> 2);
+#endif
+    }
+}
+
+/*
+ * Convert one row of 4-byte pixels to 8-bit grey, store it in the OCR page
+ * buffer and advance dev->ocr.y to the next row.
+ *
+ * Each output sample is 255 minus the sum of the four input bytes, clamped
+ * at 0.  Buffer rows are padded to a multiple of 4 bytes; on little-endian
+ * builds sample i is stored at index i^3, on big-endian builds at index i.
+ *
+ * The samples are read as unsigned char.  Reading them through plain char
+ * made bytes >= 0x80 negative wherever char is signed, so heavy ink
+ * coverage brightened the result instead of darkening it.
+ */
+static void
+ocr_line32(gx_device_pdf_image *dev, void *row)
+{
+    int w = dev->ocr.w;
+    int raster = (w+3)&~3;
+    const unsigned char *in = (const unsigned char *)row;
+    unsigned char *out = ((unsigned char *)dev->ocr.data) + raster * dev->ocr.y++;
+    int i;
+
+    for (i = 0; i < w; i++, in += 4) {
+        int v = 255 - in[0] - in[1] - in[2] - in[3];
+
+        if (v < 0)
+            v = 0;
+#if ARCH_IS_BIG_ENDIAN
+        out[i] = (unsigned char)v;
+#else
+        out[i^3] = (unsigned char)v;
+#endif
+    }
+}
+
+/*
+ * Per-page OCR set-up, installed as dev->ocr.begin_page by pdf_ocr_open().
+ *
+ * Allocates the 8-bit page buffer (h rows of w bytes, each row padded to a
+ * multiple of 4), resets the row counter and selects the row converter for
+ * the given bits per pixel: 32 -> ocr_line32, 24 -> ocr_line24, anything
+ * else -> ocr_line8.
+ *
+ * Returns 0, or gs_error_VMerror if the buffer cannot be allocated.
+ * NOTE(review): the buffer size is computed in int arithmetic.
+ */
+static int
+ocr_begin_page(gx_device_pdf_image *dev, int w, int h, int bpp)
+{
+    const int stride = (w+3)&~3;
+
+    dev->ocr.data = gs_alloc_bytes(dev->memory, stride * h, "ocr_begin_page");
+    if (dev->ocr.data == NULL)
+        return_error(gs_error_VMerror);
+
+    dev->ocr.y = 0;
+    dev->ocr.h = h;
+    dev->ocr.w = w;
+
+    switch (bpp) {
+    case 32:
+        dev->ocr.line = ocr_line32;
+        break;
+    case 24:
+        dev->ocr.line = ocr_line24;
+        break;
+    default:
+        dev->ocr.line = ocr_line8;
+        break;
+    }
+
+    return 0;
+}
+
+/*
+ * Write the buffered word (dev->ocr.word_chars) to the output stream as a
+ * PDF text-showing operation and empty the buffer.  Does nothing when the
+ * buffer is empty.
+ *
+ * The font size is the height of dev->ocr.wordbox, and the horizontal
+ * scaling is chosen from the box width, the height and the number of
+ * characters.  "Tf" and "Tz" are only written when the value differs from
+ * the one last written (dev->ocr.cur_size / cur_scale).
+ */
+static void
+flush_word(gx_device_pdf_image *dev)
+{
+    char text[1024];
+    const float *box = dev->ocr.wordbox;
+    const int count = dev->ocr.word_len;
+    float height, stretch;
+    int n;
+
+    if (count == 0)
+        return;
+
+    /* Font size: only written when it changes. */
+    height = box[3]-box[1];
+    if (dev->ocr.cur_size != height) {
+        gs_sprintf(text, "/Ft0 %.3f Tf", height);
+        stream_puts(dev->strm, text);
+        dev->ocr.cur_size = height;
+    }
+
+    /* Horizontal scaling: only written when it changes. */
+    stretch = (box[2]-box[0]) / height / count * 200;
+    if (dev->ocr.cur_scale != stretch) {
+        gs_sprintf(text, " %.3f Tz", stretch);
+        stream_puts(dev->strm, text);
+        dev->ocr.cur_scale = stretch;
+    }
+
+    /* Position at the box's lower-left corner, then emit the hex string. */
+    gs_sprintf(text, " 1 0 0 1 %.3f %.3f Tm[<", box[0], box[1]);
+    stream_puts(dev->strm, text);
+    n = 0;
+    while (n < count) {
+        gs_sprintf(text, "%04x", dev->ocr.word_chars[n]);
+        stream_puts(dev->strm, text);
+        n++;
+    }
+    stream_puts(dev->strm, ">]TJ\n");
+
+    dev->ocr.word_len = 0;
+}
+
+/*
+ * Decode the UTF-8 sequence starting at rune into a code point.
+ *
+ * Returns the code point, or -1 when the lead byte is not a valid start
+ * of a 1 to 4 byte sequence or a continuation byte is malformed.  Each
+ * continuation byte is checked before the next one is read, so a
+ * NUL-terminated buffer is never read past its terminator.
+ */
+static int
+ocr_decode_rune(const unsigned char *rune)
+{
+    int unicode;
+    int extra;
+    int i;
+
+    if (rune[0] < 0x80)
+        return rune[0];
+    if (rune[0] < 0xC0 || rune[0] >= 0xF8)
+        return -1; /* Stray continuation byte, or an illegal lead byte */
+
+    if (rune[0] < 0xE0) {
+        unicode = rune[0] & 0x1f;
+        extra = 1;
+    } else if (rune[0] < 0xF0) {
+        unicode = rune[0] & 0x0f;
+        extra = 2;
+    } else {
+        unicode = rune[0] & 0x07;
+        extra = 3;
+    }
+    for (i = 1; i <= extra; i++) {
+        if ((rune[i] & 0xC0) != 0x80)
+            return -1;
+        unicode = (unicode << 6) | (rune[i] & 0x3f);
+    }
+    return unicode;
+}
+
+/*
+ * Callback passed to ocr_recognise(): called with one recognised character
+ * at a time.
+ *
+ *   arg        the gx_device_pdf_image being written
+ *   rune_      the character as a UTF-8 sequence (assumed NUL terminated)
+ *   line_bbox  bounding box of the enclosing line, in OCR image pixels
+ *   word_bbox  bounding box of the enclosing word, in OCR image pixels
+ *   char_bbox  bounding box of the character itself (unused, see below)
+ *   pointsize  unused
+ *
+ * Characters are gathered into ppdev->ocr.word_chars.  Whenever the word
+ * box changes, the gathered word is written with flush_word() and a new
+ * word is started.  The box uses the word's horizontal extent and the
+ * line's vertical extent, flipped so that y increases upwards.
+ *
+ * Matching the per-character boxes exactly was tried first and abandoned:
+ * the boxes returned by tesseract overlap one another in strange ways, so
+ * the font height changed repeatedly and words were hard to pick out of
+ * the output.
+ *
+ * Returns 0 (also for an undecodable rune, which is skipped), or
+ * gs_error_VMerror if the word buffer cannot be grown.
+ */
+static int
+ocr_callback(void *arg, const char *rune_,
+             const int *line_bbox, const int *word_bbox,
+             const int *char_bbox, int pointsize)
+{
+    gx_device_pdf_image *ppdev = (gx_device_pdf_image *)arg;
+    int unicode;
+    float bbox[4];
+    int factor = ppdev->downscale.downscale_factor;
+    float scale = 72000000.0f / gx_downscaler_scale(1000000, factor);
+
+    (void)char_bbox;
+    (void)pointsize;
+
+    unicode = ocr_decode_rune((const unsigned char *)rune_);
+    if (unicode < 0)
+        return 0; /* Illegal */
+
+    bbox[0] = word_bbox[0] * scale / ppdev->ocr.xres;
+    bbox[1] = (ppdev->ocr.h-1 - line_bbox[3]) * scale / ppdev->ocr.yres;
+    bbox[2] = word_bbox[2] * scale / ppdev->ocr.xres;
+    bbox[3] = (ppdev->ocr.h-1 - line_bbox[1]) * scale / ppdev->ocr.yres;
+
+    /* If the word bbox differs, flush the word. */
+    if (bbox[0] != ppdev->ocr.wordbox[0] ||
+        bbox[1] != ppdev->ocr.wordbox[1] ||
+        bbox[2] != ppdev->ocr.wordbox[2] ||
+        bbox[3] != ppdev->ocr.wordbox[3]) {
+        flush_word(ppdev);
+        ppdev->ocr.wordbox[0] = bbox[0];
+        ppdev->ocr.wordbox[1] = bbox[1];
+        ppdev->ocr.wordbox[2] = bbox[2];
+        ppdev->ocr.wordbox[3] = bbox[3];
+    }
+
+    /* Add the char to the current word, doubling the buffer when full. */
+    if (ppdev->ocr.word_len == ppdev->ocr.word_max) {
+        int *newblock;
+        int newmax = ppdev->ocr.word_max * 2;
+        if (newmax == 0)
+            newmax = 16;
+        newblock = (int *)gs_alloc_bytes(ppdev->memory, sizeof(int)*newmax,
+                                         "ocr_callback(word)");
+        if (newblock == NULL)
+            return_error(gs_error_VMerror);
+        if (ppdev->ocr.word_len > 0)
+            memcpy(newblock, ppdev->ocr.word_chars,
+                   sizeof(int) * ppdev->ocr.word_len);
+        gs_free_object(ppdev->memory, ppdev->ocr.word_chars,
+                       "ocr_callback(word)");
+        ppdev->ocr.word_chars = newblock;
+        ppdev->ocr.word_max = newmax;
+    }
+    ppdev->ocr.word_chars[ppdev->ocr.word_len++] = unicode;
+
+    return 0;
+}
+
+/*
+ * Per-page OCR completion, installed as dev->ocr.end_page by
+ * pdf_ocr_open().
+ *
+ * Opens a text object in text rendering mode 3, runs the OCR engine over
+ * the page buffer (the recognised text is written by ocr_callback() and
+ * flush_word()), closes the text object and frees the per-page buffers.
+ *
+ * Returns 0, or the negative code reported by ocr_recognise().  The text
+ * object is closed and the buffers are freed in either case.
+ */
+static int
+ocr_end_page(gx_device_pdf_image *dev)
+{
+    int code;
+
+    stream_puts(dev->strm, "\nBT 3 Tr\n");
+    dev->ocr.cur_x = 0;
+    dev->ocr.cur_y = 0;
+    /* Start values that the first word's size, scale and box will differ
+     * from, so the first word always writes them. */
+    dev->ocr.cur_size = -1;
+    dev->ocr.cur_scale = 0;
+    dev->ocr.wordbox[0] = 0;
+    dev->ocr.wordbox[1] = 0;
+    dev->ocr.wordbox[2] = -1;
+    dev->ocr.wordbox[3] = -1;
+    dev->ocr.word_len = 0;
+    dev->ocr.word_max = 0;
+    dev->ocr.word_chars = NULL;
+    code = ocr_recognise(dev->ocr.state,
+                         dev->ocr.w,
+                         dev->ocr.h,
+                         dev->ocr.data,
+                         dev->ocr.xres,
+                         dev->ocr.yres,
+                         ocr_callback,
+                         dev);
+    /* Write out whatever was gathered, even after an error. */
+    if (dev->ocr.word_len)
+        flush_word(dev);
+    stream_puts(dev->strm, "\nET");
+
+    gs_free_object(dev->memory, dev->ocr.word_chars,
+                   "ocr_callback(word)");
+    /* Do not leave a dangling pointer or stale counts behind. */
+    dev->ocr.word_chars = NULL;
+    dev->ocr.word_len = 0;
+    dev->ocr.word_max = 0;
+    gs_free_object(dev->memory, dev->ocr.data, "ocr_end_page");
+    dev->ocr.data = NULL;
+
+    return code < 0 ? code : 0;
+}
+
+/*
+ * open_device procedure shared by the pdfocr devices.
+ *
+ * Opens the device with pdf_image_open(), then installs the OCR hooks
+ * (file_init, begin_page, end_page) and the integer resolution on the
+ * terminal device of the subclass chain.
+ *
+ * Returns 0, or the negative code from pdf_image_open().
+ */
+int
+pdf_ocr_open(gx_device *pdev)
+{
+    gx_device_pdf_image *ocr_dev;
+    gx_device *leaf;
+    const int status = pdf_image_open(pdev);
+
+    if (status < 0)
+        return status;
+
+    /* If we've been subclassed, find the terminal device */
+    for (leaf = pdev; leaf->child; leaf = leaf->child)
+        ;
+    ocr_dev = (gx_device_pdf_image *)leaf;
+
+    ocr_dev->ocr.xres = (int)leaf->HWResolution[0];
+    ocr_dev->ocr.yres = (int)leaf->HWResolution[1];
+    ocr_dev->ocr.file_init = ocr_file_init;
+    ocr_dev->ocr.begin_page = ocr_begin_page;
+    ocr_dev->ocr.end_page = ocr_end_page;
+
+    return 0;
+}
+
+/*
+ * close_device procedure shared by the pdfocr devices.
+ *
+ * Closes the device with pdf_image_close() and shuts down the OCR engine
+ * held by the terminal device of the subclass chain.  The engine is
+ * released even when pdf_image_close() fails, so that a failed close does
+ * not leak it.
+ *
+ * Returns the result of pdf_image_close().
+ */
+int
+pdf_ocr_close(gx_device *pdev)
+{
+    gx_device_pdf_image *pdf_dev;
+    int code;
+
+    code = pdf_image_close(pdev);
+
+    /* If we've been subclassed, find the terminal device */
+    while(pdev->child)
+        pdev = pdev->child;
+    pdf_dev = (gx_device_pdf_image *)pdev;
+
+    ocr_fin_api(pdf_dev->memory, pdf_dev->ocr.state);
+    pdf_dev->ocr.state = NULL;
+
+    return code;
+}