diff options
Diffstat (limited to 'extract/src')
-rw-r--r-- | extract/src/astring.c | 14 | ||||
-rw-r--r-- | extract/src/boxer.c | 621 | ||||
-rw-r--r-- | extract/src/buffer-test.c | 20 | ||||
-rw-r--r-- | extract/src/buffer.c | 30 | ||||
-rw-r--r-- | extract/src/document.c | 34 | ||||
-rw-r--r-- | extract/src/document.h | 93 | ||||
-rw-r--r-- | extract/src/docx.c | 167 | ||||
-rw-r--r-- | extract/src/docx.h | 4 | ||||
-rw-r--r-- | extract/src/docx_template.c | 38 | ||||
-rwxr-xr-x | extract/src/docx_template_build.py | 46 | ||||
-rw-r--r-- | extract/src/extract-exe.c | 26 | ||||
-rw-r--r-- | extract/src/extract.c | 662 | ||||
-rw-r--r-- | extract/src/html.c | 228 | ||||
-rw-r--r-- | extract/src/join.c | 215 | ||||
-rw-r--r-- | extract/src/mem.c | 2 | ||||
-rw-r--r-- | extract/src/memento.c | 5 | ||||
-rwxr-xr-x | extract/src/memento.py | 6 | ||||
-rw-r--r-- | extract/src/misc-test.c | 16 | ||||
-rw-r--r-- | extract/src/odt.c | 134 | ||||
-rw-r--r-- | extract/src/odt.h | 2 | ||||
-rw-r--r-- | extract/src/odt_template.c | 16 | ||||
-rw-r--r-- | extract/src/outf.c | 2 | ||||
-rw-r--r-- | extract/src/rect.c | 57 | ||||
-rw-r--r-- | extract/src/text.c | 11 | ||||
-rw-r--r-- | extract/src/xml.c | 3 | ||||
-rw-r--r-- | extract/src/zip-test.c | 24 | ||||
-rw-r--r-- | extract/src/zip.c | 64 |
27 files changed, 1774 insertions, 766 deletions
diff --git a/extract/src/astring.c b/extract/src/astring.c index e5d40217..07c22d55 100644 --- a/extract/src/astring.c +++ b/extract/src/astring.c @@ -87,7 +87,7 @@ int extract_astring_catc_unicode( ) { int ret = -1; - + if (0) {} /* Escape XML special characters. */ @@ -118,7 +118,7 @@ int extract_astring_catc_unicode( { if (extract_astring_cat(alloc, string, "ffl")) goto end; } - + /* Convert some special characters to ascii. */ else if (ascii_dash && c == 0x2212) { @@ -162,7 +162,7 @@ int extract_astring_catc_unicode( } else if (c < 0x0800) { - char cc[2] = + char cc[2] = { (char) (((c >> 6) & 0x1f) | 0xc0), (char) (((c >> 0) & 0x3f) | 0x80) @@ -171,7 +171,7 @@ int extract_astring_catc_unicode( } else if (c < 0x10000) { - char cc[3] = + char cc[3] = { (char) (((c >> 12) & 0x0f) | 0xe0), (char) (((c >> 6) & 0x3f) | 0x80), @@ -181,7 +181,7 @@ int extract_astring_catc_unicode( } else if (c < 0x110000) { - char cc[4] = + char cc[4] = { (char) (((c >> 18) & 0x07) | 0xf0), (char) (((c >> 12) & 0x3f) | 0x80), @@ -198,9 +198,9 @@ int extract_astring_catc_unicode( } } } - + ret = 0; - + end: return ret; } diff --git a/extract/src/boxer.c b/extract/src/boxer.c new file mode 100644 index 00000000..21cde1b6 --- /dev/null +++ b/extract/src/boxer.c @@ -0,0 +1,621 @@ +#include <stdio.h> +#include <stdlib.h> +#include <memory.h> +#include <assert.h> + +#include "document.h" +#include "outf.h" + +#define DEBUG_WRITE_AS_PS +/* #define DEBUG_PRINT */ + +typedef struct boxer_s boxer_t; + +typedef struct { + int len; + int max; + rect_t list[1]; +} rectlist_t; + +struct boxer_s { + extract_alloc_t *alloc; + rect_t mediabox; + rectlist_t *list; +}; + +static rectlist_t * +rectlist_create(extract_alloc_t *alloc, int max) +{ + rectlist_t *list; + + if (extract_malloc(alloc, &list, sizeof(rectlist_t) + sizeof(rect_t)*(max-1))) + return NULL; + + list->len = 0; + list->max = max; + + return list; +} + +/* Push box onto rectlist, unless it is completely enclosed by + * another box, or 
completely encloses others (in which case they + * are replaced by it). */ +static void +rectlist_append(rectlist_t *list, rect_t *box) +{ + int i; + + for (i = 0; i < list->len; i++) + { + rect_t *r = &list->list[i]; + rect_t smaller, larger; + /* We allow ourselves a fudge factor of 4 points when checking for inclusion. */ + double r_fudge = 4; + + smaller.min.x = r->min.x + r_fudge; + larger. min.x = r->min.x - r_fudge; + smaller.min.y = r->min.y + r_fudge; + larger. min.y = r->min.y - r_fudge; + smaller.max.x = r->max.x - r_fudge; + larger. max.x = r->max.x + r_fudge; + smaller.max.y = r->max.y - r_fudge; + larger. max.y = r->max.y + r_fudge; + + if (extract_rect_contains_rect(larger, *box)) + return; /* box is enclosed! Nothing to do. */ + if (extract_rect_contains_rect(*box, smaller)) { + /* box encloses r. Ditch r. */ + /* Shorten the list */ + --list->len; + /* If the one that just got chopped off wasn't r, move it down. */ + if (i < list->len) { + memcpy(r, &list->list[list->len], sizeof(*r)); + i--; /* Reconsider this entry next time. */ + } + } + } + + assert(list->len < list->max); + memcpy(&list->list[list->len], box, sizeof(*box)); + list->len++; +} + +static boxer_t * +boxer_create_length(extract_alloc_t *alloc, rect_t *mediabox, int len) +{ + boxer_t *boxer; + + if (extract_malloc(alloc, &boxer, sizeof(*boxer))) + return NULL; + + boxer->alloc = alloc; + memcpy(&boxer->mediabox, mediabox, sizeof(*mediabox)); + boxer->list = rectlist_create(alloc, len); + + return boxer; +} + +/* Create a boxer structure for a page of size mediabox. */ +static boxer_t * +boxer_create(extract_alloc_t *alloc, rect_t *mediabox) +{ + boxer_t *boxer = boxer_create_length(alloc, mediabox, 1); + + if (boxer == NULL) + return NULL; + rectlist_append(boxer->list, mediabox); + + return boxer; +} + +static void +push_if_intersect_suitable(rectlist_t *dst, const rect_t *a, const rect_t *b) +{ + rect_t c; + + /* Intersect a and b. 
*/ + c = extract_rect_intersect(*a, *b); + /* If no intersection, nothing to push. */ + if (!extract_rect_valid(c)) + return; + + /* If the intersect is too narrow or too tall, ignore it. + * We don't care about inter character spaces, for example. + * Arbitrary 4 point threshold. */ +#define THRESHOLD 4 + if (c.min.x + THRESHOLD >= c.max.x || c.min.y+THRESHOLD >= c.max.y) + return; + + rectlist_append(dst, &c); +} + +static void +boxlist_feed_intersect(rectlist_t *dst, const rectlist_t *src, const rect_t *box) +{ + int i; + + for (i = 0; i < src->len; i++) + push_if_intersect_suitable(dst, &src->list[i], box); +} + +/* Mark a given box as being occupied (typically by a glyph) */ +static int boxer_feed(boxer_t *boxer, rect_t *bbox) +{ + rect_t box; + /* When we feed a box into the boxer, we can never make + * the list more than 4 times as long. */ + rectlist_t *newlist = rectlist_create(boxer->alloc, boxer->list->len * 4); + if (newlist == NULL) + return -1; + +#ifdef DEBUG_WRITE_AS_PS + printf("0 0 1 setrgbcolor\n"); + printf("%g %g moveto %g %g lineto %g %g lineto %g %g lineto closepath fill\n", + bbox->min.x, bbox->min.y, + bbox->min.x, bbox->max.y, + bbox->max.x, bbox->max.y, + bbox->max.x, bbox->min.y + ); +#endif + + /* Left (0,0) (min.x,H) */ + box.min.x = boxer->mediabox.min.x; + box.min.y = boxer->mediabox.min.y; + box.max.x = bbox->min.x; + box.max.y = boxer->mediabox.max.y; + boxlist_feed_intersect(newlist, boxer->list, &box); + + /* Right (max.x,0) (W,H) */ + box.min.x = bbox->max.x; + box.min.y = boxer->mediabox.min.y; + box.max.x = boxer->mediabox.max.x; + box.max.y = boxer->mediabox.max.y; + boxlist_feed_intersect(newlist, boxer->list, &box); + + /* Bottom (0,0) (W,min.y) */ + box.min.x = boxer->mediabox.min.x; + box.min.y = boxer->mediabox.min.y; + box.max.x = boxer->mediabox.max.x; + box.max.y = bbox->min.y; + boxlist_feed_intersect(newlist, boxer->list, &box); + + /* Top (0,max.y) (W,H) */ + box.min.x = boxer->mediabox.min.x; + box.min.y = 
bbox->max.y; + box.max.x = boxer->mediabox.max.x; + box.max.y = boxer->mediabox.max.y; + boxlist_feed_intersect(newlist, boxer->list, &box); + + extract_free(boxer->alloc, &boxer->list); + boxer->list = newlist; + + return 0; +} + +static int +compare_areas(const void *a_, const void *b_) +{ + const rect_t *a = (const rect_t *)a_; + const rect_t *b = (const rect_t *)b_; + double area_a = (a->max.x-a->min.x) * (a->max.y-a->min.y); + double area_b = (b->max.x-b->min.x) * (b->max.y-b->min.y); + + if (area_a < area_b) + return 1; + else if (area_a > area_b) + return -1; + else + return 0; +} + +/* Sort the rectangle list to be largest area first. For ease of humans + * reading debug output. */ +static void boxer_sort(boxer_t *boxer) +{ + qsort(boxer->list->list, boxer->list->len, sizeof(rect_t), compare_areas); +} + +/* Get the rectangle list for a given boxer. Return value is the length of + * the list. Lifespan is until the boxer is modified or freed. */ +static int boxer_results(boxer_t *boxer, rect_t **list) +{ + *list = boxer->list->list; + return boxer->list->len; +} + +/* Destroy a boxer. */ +static void boxer_destroy(boxer_t *boxer) +{ + if (!boxer) + return; + + extract_free(boxer->alloc, &boxer->list); + extract_free(boxer->alloc, &boxer); +} + +/* Find the margins for a given boxer. 
*/ +static rect_t boxer_margins(boxer_t *boxer) +{ + rectlist_t *list = boxer->list; + int i; + rect_t margins = boxer->mediabox; + + for (i = 0; i < list->len; i++) + { + rect_t *r = &list->list[i]; + if (r->min.x <= margins.min.x && r->min.y <= margins.min.y && r->max.y >= margins.max.y) { + margins.min.x = r->max.x; /* Left Margin */ + } else if (r->max.x >= margins.max.x && r->min.y <= margins.min.y && r->max.y >= margins.max.y) { + margins.max.x = r->min.x; /* Right Margin */ + } else if (r->min.x <= margins.min.x && r->max.x >= margins.max.x && r->min.y <= margins.min.y) { + margins.min.y = r->max.y; /* Top Margin */ + } else if (r->min.x <= margins.min.x && r->max.x >= margins.max.x && r->max.y >= margins.max.y) { + margins.max.y = r->min.y; /* Bottom Margin */ + } + } + + return margins; +} + +/* Create a new boxer from a subset of an old one. */ +static boxer_t *boxer_subset(boxer_t *boxer, rect_t rect) +{ + boxer_t *new_boxer = boxer_create_length(boxer->alloc, &rect, boxer->list->len); + int i; + + if (new_boxer == NULL) + return NULL; + + for (i = 0; i < boxer->list->len; i++) { + rect_t r = extract_rect_intersect(boxer->list->list[i], rect); + + if (!extract_rect_valid(r)) + continue; + rectlist_append(new_boxer->list, &r); + } + + return new_boxer; +} + +/* Consider a boxer for subdivision. + * Returns 0 if no suitable subdivision point found. 
+ * Returns SPLIT_VERTICAL or SPLIT_HORIZONTAL, and sets *boxer1 and *boxer2 to new boxer structures for the subdivisions + * if a subdivision point is found.*/ +static split_type_t +boxer_subdivide(boxer_t *boxer, boxer_t **boxer1, boxer_t **boxer2) +{ + rectlist_t *list = boxer->list; + int num_h = 0, num_v = 0; + double max_h = 0, max_v = 0; + rect_t best_h = {0}, best_v = {0}; + int i; + + *boxer1 = NULL; + *boxer2 = NULL; + + for (i = 0; i < list->len; i++) { + rect_t r = boxer->list->list[i]; + + if (r.min.x <= boxer->mediabox.min.x && r.max.x >= boxer->mediabox.max.x) { + /* Horizontal divider */ + double size = r.max.y - r.min.y; + if (size > max_h) { + max_h = size; + best_h = r; + } + num_h++; + } + if (r.min.y <= boxer->mediabox.min.y && r.max.y >= boxer->mediabox.max.y) { + /* Vertical divider */ + double size = r.max.x - r.min.x; + if (size > max_v) { + max_v = size; + best_v = r; + } + num_v++; + } + } + + outf("num_h=%d num_v=%d\n", num_h, num_v); + outf("max_h=%g max_v=%g\n", max_h, max_v); + + if (max_h > max_v) { + rect_t r; + /* Divider runs horizontally. */ + r = boxer->mediabox; + r.max.y = best_h.min.y; + *boxer1 = boxer_subset(boxer, r); + r = boxer->mediabox; + r.min.y = best_h.max.y; + *boxer2 = boxer_subset(boxer, r); + return SPLIT_VERTICAL; + } else if (max_v > 0) { + rect_t r; + /* Divider runs vertically. 
*/ + r = boxer->mediabox; + r.max.x = best_v.min.x; + *boxer1 = boxer_subset(boxer, r); + r = boxer->mediabox; + r.min.x = best_v.max.x; + *boxer2 = boxer_subset(boxer, r); + return SPLIT_HORIZONTAL; + } + + return SPLIT_NONE; +} + + +/* Extract specifics */ +static rect_t +extract_span_bbox(span_t *span) +{ + int j; + rect_t bbox = extract_rect_empty; + + for (j = 0; j < span->chars_num; j++) + { + char_t *char_ = &span->chars[j]; + bbox = extract_rect_union(bbox, char_->bbox); + } + return bbox; +} + + +static int +extract_subpage_subset(extract_alloc_t *alloc, extract_page_t *page, subpage_t *subpage, rect_t mediabox) +{ + subpage_t *target; + int s; + + if (extract_subpage_alloc(alloc, mediabox, page, &target)) + { + return -1; + } + + for (s = 0; s < subpage->spans_num; s++) + { + rect_t bbox; + span_t *span = subpage->spans[s]; + if (!span) + continue; + + bbox = extract_span_bbox(span); + + if (bbox.min.x >= mediabox.min.x && bbox.min.y >= mediabox.min.y && bbox.max.x <= mediabox.max.x && bbox.max.y <= mediabox.max.y) { + if (subpage_span_append(alloc, target, span)) + { + return -1; + } + subpage->spans[s] = NULL; + } + } + + return 0; +} + +enum { + MAX_ANALYSIS_DEPTH = 6 +}; + +static int +analyse_sub(extract_page_t *page, subpage_t *subpage, boxer_t *big_boxer, split_t **psplit, int depth) +{ + rect_t margins; + boxer_t *boxer; + boxer_t *boxer1; + boxer_t *boxer2; + int ret; + split_type_t split_type; + split_t *split; + + margins = boxer_margins(big_boxer); +#ifdef DEBUG_WRITE_AS_PS + printf("\n\n%% MARGINS %g %g %g %g\n", margins.min.x, margins.min.y, margins.max.x, margins.max.y); +#endif + + boxer = boxer_subset(big_boxer, margins); + + if (depth < MAX_ANALYSIS_DEPTH && + (split_type = boxer_subdivide(boxer, &boxer1, &boxer2)) != SPLIT_NONE) { + if (boxer1 == NULL || boxer2 == NULL || + extract_split_alloc(boxer->alloc, split_type, 2, psplit)) + { + ret = -1; + goto fail_mid_split; + } + split = *psplit; + outf("depth=%d %s\n", depth, split_type == 
SPLIT_HORIZONTAL ? "H" : "V"); + ret = analyse_sub(page, subpage, boxer1, &split->split[0], depth+1); + if (!ret) ret = analyse_sub(page, subpage, boxer2, &split->split[1], depth+1); + if (!ret) + { + if (split_type == SPLIT_HORIZONTAL) + { + split->split[0]->weight = boxer1->mediabox.max.x - boxer1->mediabox.min.x; + split->split[1]->weight = boxer2->mediabox.max.x - boxer2->mediabox.min.x; + } + else + { + split->split[0]->weight = boxer1->mediabox.max.y - boxer1->mediabox.min.y; + split->split[1]->weight = boxer2->mediabox.max.y - boxer2->mediabox.min.y; + } + } +fail_mid_split: + boxer_destroy(boxer1); + boxer_destroy(boxer2); + boxer_destroy(boxer); + return ret; + } + + outf("depth=%d LEAF\n", depth); + + if (extract_split_alloc(boxer->alloc, SPLIT_NONE, 0, psplit)) + { + boxer_destroy(boxer); + return -1; + } + split = *psplit; + + ret = extract_subpage_subset(boxer->alloc, page, subpage, boxer->mediabox); + +#ifdef DEBUG_WRITE_AS_PS + { + int i, n; + rect_t *list; + boxer_sort(boxer); + n = boxer_results(boxer, &list); + + printf("%% SUBDIVISION\n"); + for (i = 0; i < n; i++) { + printf("%% %g %g %g %g\n", + list[i].min.x, list[i].min.y, list[i].max.x, list[i].max.y); + } + + printf("0 0 0 setrgbcolor\n"); + for (i = 0; i < n; i++) { + printf("%g %g moveto\n%g %g lineto\n%g %g lineto\n%g %g lineto\nclosepath\nstroke\n\n", + list[i].min.x, list[i].min.y, + list[i].min.x, list[i].max.y, + list[i].max.x, list[i].max.y, + list[i].max.x, list[i].min.y); + } + + printf("1 0 0 setrgbcolor\n"); + printf("%g %g moveto\n%g %g lineto\n%g %g lineto\n%g %g lineto\nclosepath\nstroke\n\n", + margins.min.x, margins.min.y, + margins.min.x, margins.max.y, + margins.max.x, margins.max.y, + margins.max.x, margins.min.y); + } +#endif + boxer_destroy(boxer); + + return ret; +} + + +static int +collate_splits(extract_alloc_t *alloc, split_t **psplit) +{ + split_t *split = *psplit; + int s; + int n = 0; + int i; + int j; + split_t *newsplit; + + /* Recurse into all our children to 
ensure they are collated. + * Count how many children we'll have once we pull all the + * children of children that match our type up into us. */ + for (s = 0; s < split->count; s++) + { + if (collate_splits(alloc, &split->split[s])) + { + return -1; + } + if (split->split[s]->type == split->type) + { + n += split->split[s]->count; + } + else + { + n++; + } + } + + /* No change in the number of children? Just exit. */ + if (n == split->count) + return 0; + + if (extract_split_alloc(alloc, split->type, n, &newsplit)) + { + return -1; + } + + newsplit->weight = split->weight; + + /* Now, run across our children. */ + i = 0; + for (s = 0; s < split->count; s++) + { + split_t *sub = split->split[s]; + if (sub->type == split->type) + { + /* If the type matches, pull the grandchildren into newsplit. */ + for (j = 0; j < sub->count; j++) + { + newsplit->split[i++] = sub->split[j]; + sub->split[j] = NULL; + } + } + else + { + /* Otherwise just move the child into newsplit. */ + newsplit->split[i++] = sub; + split->split[s] = NULL; + } + } + + extract_split_free(alloc, psplit); + *psplit = newsplit; + + return 0; +} + +int extract_page_analyse(extract_alloc_t *alloc, extract_page_t *page) +{ + boxer_t *boxer; + int i; + subpage_t *subpage = page->subpages[0]; + + /* This code will only work if the page contains a single subpage. + * This should always be the case if we're called from a page + * generated via extract_page_begin. */ + if (page->subpages_num != 1) return 0; + + /* Take the old subpages out from the page. 
*/ + page->subpages_num = 0; + extract_free(alloc, &page->subpages); + +#ifdef DEBUG_WRITE_AS_PS + printf("1 -1 scale 0 -%g translate\n", page->mediabox.max.y-page->mediabox.min.y); +#endif + + boxer = boxer_create(alloc, (rect_t *)&subpage->mediabox); + + for (i = 0; i < subpage->spans_num; i++) + { + span_t *span = subpage->spans[i]; + rect_t bbox = extract_span_bbox(span); + if (boxer_feed(boxer, &bbox)) + { + goto fail; + } + } + + if (analyse_sub(page, subpage, boxer, &page->split, 0)) + { + goto fail; + } + + if (collate_splits(boxer->alloc, &page->split)) + { + goto fail; + } + +#ifdef DEBUG_WRITE_AS_PS + printf("showpage\n"); +#endif + + boxer_destroy(boxer); + extract_subpage_free(alloc, &subpage); + + return 0; + +fail: + outf("Analysis failed!\n"); + boxer_destroy(boxer); + extract_subpage_free(alloc, &subpage); + + return -1; +} diff --git a/extract/src/buffer-test.c b/extract/src/buffer-test.c index a8464c2a..19d693aa 100644 --- a/extract/src/buffer-test.c +++ b/extract/src/buffer-test.c @@ -104,7 +104,7 @@ static void test_read(void) int e; extract_buffer_t* buffer; s_create_read_buffer(NULL /*alloc*/, len, &r, &buffer); - + /* Repeatedly read from read-buffer until we get EOF, and check we read the original content. */ if (extract_malloc(r.alloc, &out_buffer, len)) abort(); @@ -127,7 +127,7 @@ static void test_read(void) out_buffer = NULL; e = extract_buffer_close(&buffer); assert(!e); - + outf("Read test passed.\n"); } @@ -202,9 +202,9 @@ static void test_write(void) size_t out_pos = 0; int its; int e; - + s_create_write_buffer(NULL /*alloc*/, len, &r, &buffer); - + /* Write to read-buffer, and check it contains the original content. */ if (extract_malloc(r.alloc, &out_buffer, len)) abort(); for (i=0; i<len; ++i) { @@ -234,7 +234,7 @@ static void test_file(void) /* Check we can write 3 bytes to file. 
*/ extract_buffer_t* file_buffer; if (extract_buffer_open_file(NULL /*alloc*/, "test/generated/buffer-file", 1 /*writable*/, &file_buffer)) abort(); - + { size_t n; int e; @@ -247,11 +247,11 @@ static void test_file(void) } } if (extract_buffer_close(&file_buffer)) abort(); - + /* Check we get back expected short reads and EOF when reading from 3-byte file created above. */ if (extract_buffer_open_file(NULL /*alloc*/, "test/generated/buffer-file", 0 /*writable*/, &file_buffer)) abort(); - + { size_t n; char buffer[10]; @@ -277,7 +277,7 @@ static void test_file(void) } } if (extract_buffer_close(&file_buffer)) abort(); - + /* Check writing to read-only file buffer fails. */ { int e; @@ -286,13 +286,13 @@ static void test_file(void) if (extract_buffer_open_file(NULL /*alloc*/, "test/generated/buffer-file", 0 /*writable*/, &file_buffer)) { abort(); } - + e = extract_buffer_write(file_buffer, text, sizeof(text)-1, &actual); outf("extract_buffer_write() on read buffer returned e=%i actual=%zi", e, actual); if (e != -1 || errno != EINVAL) abort(); if (extract_buffer_close(&file_buffer)) abort(); } - + outf("file buffer tests passed.\n"); } diff --git a/extract/src/buffer.c b/extract/src/buffer.c index b25dee73..0cc6f749 100644 --- a/extract/src/buffer.c +++ b/extract/src/buffer.c @@ -33,7 +33,7 @@ extract_alloc_t* extract_buffer_alloc(extract_buffer_t* buffer) int extract_buffer_open( - extract_alloc_t* alloc, + extract_alloc_t* alloc, void* handle, extract_buffer_fn_read fn_read, extract_buffer_fn_write fn_write, @@ -45,7 +45,7 @@ int extract_buffer_open( int e = -1; extract_buffer_t* buffer; if (extract_malloc(alloc, &buffer, sizeof(*buffer))) goto end; - + buffer->alloc = alloc; buffer->handle = handle; buffer->fn_read = fn_read; @@ -57,7 +57,7 @@ int extract_buffer_open( buffer->cache.pos = 0; buffer->pos = 0; e = 0; - + end: if (e) { extract_free(alloc, &buffer); @@ -119,7 +119,7 @@ then fn_write returned EOF. 
*/ buffer->cache.pos = 0; e = 0; end: - + *o_actual = p; return e; } @@ -128,11 +128,11 @@ int extract_buffer_close(extract_buffer_t** p_buffer) { extract_buffer_t* buffer = *p_buffer; int e = -1; - + if (!buffer) { return 0; } - + if (buffer->cache.cache && buffer->fn_write) { /* Flush cache. */ size_t cache_bytes = buffer->cache.pos; @@ -171,7 +171,7 @@ int extract_buffer_open_simple( { extract_buffer_t* buffer; if (extract_malloc(alloc, &buffer, sizeof(*buffer))) return -1; - + /* We need cast away the const here. data[] will be written-to if caller uses us as a write buffer. */ buffer->alloc = alloc; @@ -233,7 +233,7 @@ int extract_buffer_open_file(extract_alloc_t* alloc, const char* path, int writa outf("failed to open '%s': %s", path, strerror(errno)); goto end; } - + if (extract_buffer_open( alloc, file /*handle*/, @@ -244,7 +244,7 @@ int extract_buffer_open_file(extract_alloc_t* alloc, const char* path, int writa o_buffer )) goto end; e = 0; - + end: if (e) { if (file) fclose(file); @@ -266,7 +266,7 @@ int extract_buffer_read_internal( { int e = -1; size_t pos = 0; /* Number of bytes read so far. */ - + /* In each iteration we either read from cache, or use buffer->fn_read() directly or repopulate the cache. */ for(;;) { @@ -315,7 +315,7 @@ int extract_buffer_read_internal( } } e = 0; - + end: if (o_actual) *o_actual = pos; if (e == 0 && pos != numbytes) return +1; /* EOF. */ @@ -332,12 +332,12 @@ int extract_buffer_write_internal( { int e = -1; size_t pos = 0; /* Number of bytes written so far. */ - + if (!buffer->fn_write) { errno = EINVAL; return -1; } - + /* In each iteration we either write to cache, or use buffer->fn_write() directly or flush the cache. */ for(;;) { @@ -381,7 +381,7 @@ int extract_buffer_write_internal( } if (ee) goto end; } - + if (!buffer->fn_cache) { use_write = 1; } @@ -412,7 +412,7 @@ int extract_buffer_write_internal( } } e = 0; - + end: if (o_actual) *o_actual = pos; if (e == 0 && pos != numbytes) e = +1; /* EOF. 
*/ diff --git a/extract/src/document.c b/extract/src/document.c index d501f259..1999c009 100644 --- a/extract/src/document.c +++ b/extract/src/document.c @@ -66,11 +66,11 @@ void extract_cell_free(extract_alloc_t* alloc, cell_t** pcell) int p; cell_t* cell = *pcell; if (!cell) return; - + outf("cell->lines_num=%i", cell->lines_num); outf("cell->paragraphs_num=%i", cell->paragraphs_num); extract_lines_free(alloc, &cell->lines, cell->lines_num); - + outf("cell=%p cell->paragraphs_num=%i", cell, cell->paragraphs_num); for (p=0; p<cell->paragraphs_num; ++p) { @@ -85,4 +85,34 @@ void extract_cell_free(extract_alloc_t* alloc, cell_t** pcell) extract_free(alloc, pcell); } +int +extract_split_alloc(extract_alloc_t* alloc, split_type_t type, int count, split_t** psplit) +{ + split_t *split; + + if (extract_malloc(alloc, psplit, sizeof(*split) + (count-1) * sizeof(split_t *))) + { + return -1; + } + + split = *psplit; + split->type = type; + split->weight = 0; + split->count = count; + memset(&split->split[0], 0, sizeof(split_t *) * count); + return 0; +} + +void extract_split_free(extract_alloc_t *alloc, split_t **psplit) +{ + int i; + split_t *split = *psplit; + + if (!split) + return; + + for (i = 0; i < split->count; i++) + extract_split_free(alloc, &split->split[i]); + extract_free(alloc, psplit); +} diff --git a/extract/src/document.h b/extract/src/document.h index 2dc4f1ee..69c4232c 100644 --- a/extract/src/document.h +++ b/extract/src/document.h @@ -26,6 +26,17 @@ typedef struct point_t max; } rect_t; +extern const rect_t extract_rect_infinite; +extern const rect_t extract_rect_empty; + +rect_t extract_rect_intersect(rect_t a, rect_t b); + +rect_t extract_rect_union(rect_t a, rect_t b); + +int extract_rect_contains_rect(rect_t a, rect_t b); + +int extract_rect_valid(rect_t a); + const char* extract_rect_string(const rect_t* rect); typedef struct @@ -56,13 +67,15 @@ typedef struct /* (x,y) before transformation by ctm and trm. 
*/ double pre_x; double pre_y; - + /* (x,y) after transformation by ctm and trm. */ double x; double y; - + unsigned ucs; double adv; + + rect_t bbox; } char_t; /* A single char in a span. */ @@ -72,15 +85,15 @@ typedef struct matrix_t ctm; matrix_t trm; char* font_name; - + /* font size is extract_matrix_cmp4(trm). */ - + struct { unsigned font_bold : 1; unsigned font_italic : 1; unsigned wmode : 1; } flags; - + char_t* chars; int chars_num; } span_t; @@ -138,10 +151,10 @@ typedef struct double h; void* data; size_t data_size; - + extract_image_data_free data_free; void* data_free_handle; - + } image_t; /* Information about an image. <type> is as passed to extract_add_image(); <name> and <id> are created to be unique identifiers for use in generated docx @@ -166,18 +179,18 @@ typedef struct typedef struct { rect_t rect; - + /* If left/above is true, this cell is not obscured by cell to its left/above. */ uint8_t left; uint8_t above; - + /* extend_right and extend_down are 1 for normal cells, 2 for cells which extend right/down to cover an additional column/row, 3 to cover two additional columns/rows etc. */ int extend_right; int extend_down; - + /* Contents of this cell. */ line_t** lines; int lines_num; @@ -192,7 +205,7 @@ void extract_cell_free(extract_alloc_t* alloc, cell_t** pcell); typedef struct { point_t pos; /* top-left. */ - + /* Array of cells_num_x*cells_num_y cells; cell (x, y) is: cells_num_x * y + x. */ @@ -202,11 +215,30 @@ typedef struct } table_t; +typedef enum +{ + SPLIT_NONE = 0, + SPLIT_HORIZONTAL, + SPLIT_VERTICAL +} split_type_t; + + +typedef struct split_t +{ + split_type_t type; + double weight; + int count; + struct split_t *split[1]; +} split_t; + + typedef struct { + rect_t mediabox; + span_t** spans; int spans_num; - + image_t* images; int images_num; @@ -219,16 +251,27 @@ typedef struct int paragraphs_num; /* These refer to items in .lines. Initially empty, then set by extract_join(). 
*/ - + tablelines_t tablelines_horizontal; tablelines_t tablelines_vertical; - + table_t** tables; int tables_num; +} subpage_t; +/* A subpage. Contains different representations of the list of spans. */ + +typedef struct +{ + rect_t mediabox; + + subpage_t** subpages; + int subpages_num; + + split_t* split; } extract_page_t; -/* A page. Contains different representations of the list of spans. NB not -+called page_t because this clashes with a system type on hpux. */ +/* A page. Contains a list of subpages. NB not +called page_t because this clashes with a system type on hpux. */ typedef struct @@ -248,7 +291,7 @@ typedef struct } images_t; -int extract_document_join(extract_alloc_t* alloc, document_t* document); +int extract_document_join(extract_alloc_t* alloc, document_t* document, int layout_analysis); /* This does all the work of finding paragraphs and tables. */ double extract_matrices_to_font_size(matrix_t* ctm, matrix_t* trm); @@ -273,5 +316,21 @@ typedef struct content, e.g. so we know whether a font has changed so need to start a new odt span. */ +int extract_page_analyse(extract_alloc_t* alloc, extract_page_t* page); +/* Analyse page content for layouts. */ + +int extract_subpage_alloc(extract_alloc_t* extract, rect_t mediabox, extract_page_t* page, subpage_t** psubpage); +/* subpage_t constructor. */ + +void extract_subpage_free(extract_alloc_t* alloc, subpage_t** psubpage); +/* subpage_t destructor. */ + +int subpage_span_append(extract_alloc_t* alloc, subpage_t* subpage, span_t* span); +/* Push span onto the end of subpage. */ + +int extract_split_alloc(extract_alloc_t* alloc, split_type_t type, int count, split_t** psplit); +/* Allocate a split_t. 
*/ + +void extract_split_free(extract_alloc_t* alloc, split_t** psplit); #endif diff --git a/extract/src/docx.c b/extract/src/docx.c index 761de176..ca6c5d78 100644 --- a/extract/src/docx.c +++ b/extract/src/docx.c @@ -95,7 +95,7 @@ static int s_docx_paragraph_empty(extract_alloc_t* alloc, extract_astring_t* con content_state.font.size = 10; content_state.font.bold = 0; content_state.font.italic = 0; - + if (s_docx_run_start(alloc, content, &content_state)) goto end; //docx_char_append_string(content, " "); /*   is non-break space. */ if (s_docx_run_finish(alloc, NULL /*state*/, content)) goto end; @@ -168,9 +168,9 @@ font. */ if (s_docx_run_finish(alloc, content_state, content)) goto end; } if (s_docx_paragraph_finish(alloc, content)) goto end; - + e = 0; - + end: return e; } @@ -245,7 +245,7 @@ static int s_docx_append_image( static int s_docx_output_rotated_paragraphs( extract_alloc_t* alloc, - extract_page_t* page, + subpage_t* subpage, int paragraph_begin, int paragraph_end, int rot, @@ -330,7 +330,7 @@ static int s_docx_output_rotated_paragraphs( /* Output paragraphs p0..p2-1. */ for (p=paragraph_begin; p<paragraph_end; ++p) { - paragraph_t* paragraph = page->paragraphs[p]; + paragraph_t* paragraph = subpage->paragraphs[p]; if (s_document_to_docx_content_paragraph(alloc, state, paragraph, content)) goto end; } @@ -364,7 +364,7 @@ static int s_docx_output_rotated_paragraphs( extract_astring_cat(alloc, content, " <w:txbxContent>"); for (p=paragraph_begin; p<paragraph_end; ++p) { - paragraph_t* paragraph = page->paragraphs[p]; + paragraph_t* paragraph = subpage->paragraphs[p]; if (s_document_to_docx_content_paragraph(alloc, state, paragraph, content)) goto end; } @@ -392,7 +392,7 @@ to the application. */ { int e = -1; int y; - + if (extract_astring_cat(alloc, content, "\n" " <w:tbl>\n" @@ -406,14 +406,14 @@ to the application. 
*/ " <w:tr>\n" " <w:trPr/>\n" )) goto end; - + for (x=0; x<table->cells_num_x; ++x) { cell_t* cell = table->cells[y*table->cells_num_x + x]; if (!cell->left) continue; - + if (extract_astring_cat(alloc, content, " <w:tc>\n")) goto end; - + /* Write cell properties. */ { if (extract_astring_cat(alloc, content, @@ -442,7 +442,7 @@ to the application. */ } if (extract_astring_cat(alloc, content, " </w:tcPr>\n")) goto end; } - + /* Write contents of this cell. */ { size_t chars_num_old = content->chars_num; @@ -476,20 +476,20 @@ to the application. */ } if (extract_astring_cat(alloc, content, " </w:tbl>\n")) goto end; e = 0; - + end: return e; } static int s_docx_append_rotated_paragraphs( extract_alloc_t* alloc, - extract_page_t* page, + subpage_t* subpage, content_state_t* state, int* p, int* text_box_id, const matrix_t* ctm, double rotate, - extract_astring_t* content + extract_astring_t* output ) /* Appends paragraphs with same rotation, starting with page->paragraphs[*p] and updates *p. */ @@ -501,8 +501,8 @@ and updates *p. */ point_t extent = {0, 0}; int p0 = *p; int p1; - paragraph_t* paragraph = page->paragraphs[*p]; - + paragraph_t* paragraph = subpage->paragraphs[*p]; + outf("rotate=%.2frad=%.1fdeg ctm: ef=(%f %f) abcd=(%f %f %f %f)", rotate, rotate * 180 / pi, ctm->e, @@ -535,8 +535,8 @@ and updates *p. */ ctm->a, ctm->b, ctm->c, ctm->d); } - for (*p=p0; *p<page->paragraphs_num; ++(*p)) { - paragraph = page->paragraphs[*p]; + for (*p=p0; *p<subpage->paragraphs_num; ++(*p)) { + paragraph = subpage->paragraphs[*p]; ctm = &paragraph->lines[0]->spans[0]->ctm; rotate = atan2(ctm->b, ctm->a); if (rotate != rotate0) { @@ -625,13 +625,13 @@ and updates *p. 
*/ x -= dx; y -= -dy; - if (s_docx_output_rotated_paragraphs(alloc, page, p0, p1, rot, x, y, w, h, *text_box_id, content, state)) goto end; + if (s_docx_output_rotated_paragraphs(alloc, subpage, p0, p1, rot, x, y, w, h, *text_box_id, output, state)) goto end; } *p = p1 - 1; e = 0; - + end: - + return e; } @@ -647,38 +647,40 @@ int extract_document_to_docx_content( int ret = -1; int text_box_id = 0; int p; - + /* Write paragraphs into <content>. */ for (p=0; p<document->pages_num; ++p) { extract_page_t* page = document->pages[p]; - - int p = 0; - int t = 0; - - content_state_t content_state; - content_state.font.name = NULL; - content_state.font.size = 0; - content_state.font.bold = 0; - content_state.font.italic = 0; - content_state.ctm_prev = NULL; - - /* Output paragraphs and tables in order of y coordinate. */ - for(;;) - { - paragraph_t* paragraph = (p == page->paragraphs_num) ? NULL : page->paragraphs[p]; - table_t* table = (t == page->tables_num) ? NULL : page->tables[t]; - double y_paragraph; - double y_table; - if (!paragraph && !table) break; - y_paragraph = (paragraph) ? paragraph->lines[0]->spans[0]->chars[0].y : DBL_MAX; - y_table = (table) ? table->pos.y : DBL_MAX; - - if (paragraph && y_paragraph < y_table) - { - const matrix_t* ctm = &paragraph->lines[0]->spans[0]->ctm; - double rotate = atan2(ctm->b, ctm->a); - - if (spacing + int c; + + for (c=0; c<page->subpages_num; ++c) { + subpage_t* subpage = page->subpages[c]; + + int p = 0; + int t = 0; + + content_state_t content_state; + content_state.font.name = NULL; + content_state.font.size = 0; + content_state.font.bold = 0; + content_state.font.italic = 0; + content_state.ctm_prev = NULL; + + /* Output paragraphs and tables in order of y coordinate. */ + for(;;) { + paragraph_t* paragraph = (p == subpage->paragraphs_num) ? NULL : subpage->paragraphs[p]; + table_t* table = (t == subpage->tables_num) ? 
NULL : subpage->tables[t]; + double y_paragraph; + double y_table; + if (!paragraph && !table) break; + y_paragraph = (paragraph) ? paragraph->lines[0]->spans[0]->chars[0].y : DBL_MAX; + y_table = (table) ? table->pos.y : DBL_MAX; + + if (paragraph && y_paragraph < y_table) { + const matrix_t* ctm = &paragraph->lines[0]->spans[0]->ctm; + double rotate = atan2(ctm->b, ctm->a); + + if (spacing && content_state.ctm_prev && paragraph->lines_num && paragraph->lines[0]->spans_num @@ -687,37 +689,38 @@ int extract_document_to_docx_content( &paragraph->lines[0]->spans[0]->ctm ) ) { - /* Extra vertical space between paragraphs that were at - different angles in the original document. */ - if (s_docx_paragraph_empty(alloc, content)) goto end; - } + /* Extra vertical space between paragraphs that were at + different angles in the original document. */ + if (s_docx_paragraph_empty(alloc, content)) goto end; + } - if (spacing) { - /* Extra vertical space between paragraphs. */ - if (s_docx_paragraph_empty(alloc, content)) goto end; - } + if (spacing) { + /* Extra vertical space between paragraphs. 
*/ + if (s_docx_paragraph_empty(alloc, content)) goto end; + } - if (rotation && rotate != 0) - { - if (s_docx_append_rotated_paragraphs(alloc, page, &content_state, &p, &text_box_id, ctm, rotate, content)) goto end; + if (rotation && rotate != 0) + { + if (s_docx_append_rotated_paragraphs(alloc, subpage, &content_state, &p, &text_box_id, ctm, rotate, content)) goto end; + } + else + { + if (s_document_to_docx_content_paragraph(alloc, &content_state, paragraph, content)) goto end; + } + p += 1; } - else + else if (table) { - if (s_document_to_docx_content_paragraph(alloc, &content_state, paragraph, content)) goto end; + if (s_docx_append_table(alloc, table, content)) goto end; + t += 1; } - p += 1; - } - else if (table) - { - if (s_docx_append_table(alloc, table, content)) goto end; - t += 1; } - } - - if (images) { - int i; - for (i=0; i<page->images_num; ++i) { - s_docx_append_image(alloc, content, &page->images[i]); + + if (images) { + int i; + for (i=0; i<subpage->images_num; ++i) { + s_docx_append_image(alloc, content, &subpage->images[i]); + } } } } @@ -759,7 +762,7 @@ int extract_docx_content_item( extract_astring_t temp; extract_astring_init(&temp); *text2 = NULL; - + if (0) {} else if (!strcmp(name, "[Content_Types].xml")) { @@ -841,7 +844,7 @@ int extract_docx_content_item( return e; } - + int extract_docx_write_template( extract_alloc_t* alloc, @@ -862,7 +865,7 @@ int extract_docx_write_template( assert(path_out); assert(path_template); - + if (extract_check_path_shell_safe(path_out)) { outf("path_out is unsafe: %s", path_out); goto end; @@ -889,7 +892,7 @@ int extract_docx_write_template( /* Might be nice to iterate through all items in path_tempdir, but for now we look at just the items that we know extract_docx_content_item() will modify. 
*/ - + { const char* names[] = { "word/document.xml", @@ -904,7 +907,7 @@ int extract_docx_write_template( extract_free(alloc, &text2); if (extract_asprintf(alloc, &path, "%s/%s", path_tempdir, name) < 0) goto end; if (extract_read_all_path(alloc, path, &text)) goto end; - + if (extract_docx_content_item( alloc, contentss, @@ -926,14 +929,14 @@ int extract_docx_write_template( extract_free(alloc, &path); if (extract_asprintf(alloc, &path, "%s/word/media", path_tempdir) < 0) goto end; if (extract_mkdir(path, 0777)) goto end; - + for (i=0; i<images->images_num; ++i) { image_t* image = &images->images[i]; extract_free(alloc, &path); if (extract_asprintf(alloc, &path, "%s/word/media/%s", path_tempdir, image->name) < 0) goto end; if (extract_write_all(image->data, image->data_size, path)) goto end; } - + outf("Zipping tempdir to create %s", path_out); { const char* path_out_leaf = strrchr(path_out, '/'); diff --git a/extract/src/docx.h b/extract/src/docx.h index 976272a6..ffce019c 100644 --- a/extract/src/docx.h +++ b/extract/src/docx.h @@ -22,7 +22,7 @@ word/document.xml. */ int extract_docx_write_template( - extract_alloc_t* alloc, + extract_alloc_t* alloc, extract_astring_t* contentss, int contentss_num, images_t* images, @@ -34,7 +34,7 @@ int extract_docx_write_template( Uses the 'zip' and 'unzip' commands internally. -contents +contentss contentss_num Content to be inserted into word/document.xml. 
document diff --git a/extract/src/docx_template.c b/extract/src/docx_template.c index 73ab5b71..06a2e8c5 100644 --- a/extract/src/docx_template.c +++ b/extract/src/docx_template.c @@ -21,7 +21,7 @@ const docx_template_item_t docx_template_items[] = "<Override PartName=\"/docProps/core.xml\" ContentType=\"application/vnd.openxmlformats-package.core-properties+xml\"/>" "<Override PartName=\"/docProps/app.xml\" ContentType=\"application/vnd.openxmlformats-officedocument.extended-properties+xml\"/></Types>" }, - + { "_rels/.rels", "" @@ -32,7 +32,7 @@ const docx_template_item_t docx_template_items[] = "<Relationship Id=\"rId2\" Type=\"http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties\" Target=\"docProps/core.xml\"/>" "<Relationship Id=\"rId1\" Type=\"http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument\" Target=\"word/document.xml\"/></Relationships>" }, - + { "docProps/app.xml", "" @@ -56,7 +56,7 @@ const docx_template_item_t docx_template_items[] = "<HyperlinksChanged>false</HyperlinksChanged>" "<AppVersion>16.0000</AppVersion></Properties>" }, - + { "docProps/core.xml", "" @@ -73,7 +73,7 @@ const docx_template_item_t docx_template_items[] = "<dcterms:created xsi:type=\"dcterms:W3CDTF\">2020-09-25T17:04:00Z</dcterms:created>" "<dcterms:modified xsi:type=\"dcterms:W3CDTF\">2020-09-25T17:07:00Z</dcterms:modified></cp:coreProperties>" }, - + { "word/document.xml", "" @@ -155,7 +155,7 @@ const docx_template_item_t docx_template_items[] = "<w:cols w:space=\"708\"/>" "<w:docGrid w:linePitch=\"360\"/></w:sectPr></w:body></w:document>" }, - + { "word/fontTable.xml", "" @@ -181,7 +181,7 @@ const docx_template_item_t docx_template_items[] = "<w:pitch w:val=\"variable\"/>" "<w:sig w:usb0=\"E4002EFF\" w:usb1=\"C000247B\" w:usb2=\"00000009\" w:usb3=\"00000000\" w:csb0=\"000001FF\" w:csb1=\"00000000\"/></w:font></w:fonts>" }, - + { "word/settings.xml", "" @@ -227,7 +227,7 @@ const docx_template_item_t 
docx_template_items[] = "<w15:chartTrackingRefBased/>" "<w15:docId w15:val=\"{A10F59F7-497D-44D4-A338-47719734E7A0}\"/></w:settings>" }, - + { "word/styles.xml", "" @@ -647,7 +647,7 @@ const docx_template_item_t docx_template_items[] = "<w:semiHidden/>" "<w:unhideWhenUsed/></w:style></w:styles>" }, - + { "word/webSettings.xml", "" @@ -657,7 +657,7 @@ const docx_template_item_t docx_template_items[] = "<w:optimizeForBrowser/>" "<w:allowPNG/></w:webSettings>" }, - + { "word/_rels/document.xml.rels", "" @@ -670,7 +670,7 @@ const docx_template_item_t docx_template_items[] = "<Relationship Id=\"rId5\" Type=\"http://schemas.openxmlformats.org/officeDocument/2006/relationships/theme\" Target=\"theme/theme1.xml\"/>" "<Relationship Id=\"rId4\" Type=\"http://schemas.openxmlformats.org/officeDocument/2006/relationships/fontTable\" Target=\"fontTable.xml\"/></Relationships>" }, - + { "word/theme/theme1.xml", "" @@ -708,10 +708,10 @@ const docx_template_item_t docx_template_items[] = "<a:latin typeface=\"Calibri Light\" panose=\"020F0302020204030204\"/>" "<a:ea typeface=\"\"/>" "<a:cs typeface=\"\"/>" - "<a:font script=\"Jpan\" typeface=\"游ゴシック Light\"/>" - "<a:font script=\"Hang\" typeface=\"맑은 고딕\"/>" - "<a:font script=\"Hans\" typeface=\"等线 Light\"/>" - "<a:font script=\"Hant\" typeface=\"新細明體\"/>" + "<a:font script=\"Jpan\" typeface=\"\xe6\xb8\xb8\xe3\x82\xb4\xe3\x82\xb7\xe3\x83\x83\xe3\x82\xaf Light\"/>" + "<a:font script=\"Hang\" typeface=\"\xeb\xa7\x91\xec\x9d\x80 \xea\xb3\xa0\xeb\x94\x95\"/>" + "<a:font script=\"Hans\" typeface=\"\xe7\xad\x89\xe7\xba\xbf Light\"/>" + "<a:font script=\"Hant\" typeface=\"\xe6\x96\xb0\xe7\xb4\xb0\xe6\x98\x8e\xe9\xab\x94\"/>" "<a:font script=\"Arab\" typeface=\"Times New Roman\"/>" "<a:font script=\"Hebr\" typeface=\"Times New Roman\"/>" "<a:font script=\"Thai\" typeface=\"Angsana New\"/>" @@ -759,10 +759,10 @@ const docx_template_item_t docx_template_items[] = "<a:latin typeface=\"Calibri\" panose=\"020F0502020204030204\"/>" "<a:ea 
typeface=\"\"/>" "<a:cs typeface=\"\"/>" - "<a:font script=\"Jpan\" typeface=\"游明朝\"/>" - "<a:font script=\"Hang\" typeface=\"맑은 고딕\"/>" - "<a:font script=\"Hans\" typeface=\"等线\"/>" - "<a:font script=\"Hant\" typeface=\"新細明體\"/>" + "<a:font script=\"Jpan\" typeface=\"\xe6\xb8\xb8\xe6\x98\x8e\xe6\x9c\x9d\"/>" + "<a:font script=\"Hang\" typeface=\"\xeb\xa7\x91\xec\x9d\x80 \xea\xb3\xa0\xeb\x94\x95\"/>" + "<a:font script=\"Hans\" typeface=\"\xe7\xad\x89\xe7\xba\xbf\"/>" + "<a:font script=\"Hant\" typeface=\"\xe6\x96\xb0\xe7\xb4\xb0\xe6\x98\x8e\xe9\xab\x94\"/>" "<a:font script=\"Arab\" typeface=\"Arial\"/>" "<a:font script=\"Hebr\" typeface=\"Arial\"/>" "<a:font script=\"Thai\" typeface=\"Cordia New\"/>" @@ -904,7 +904,7 @@ const docx_template_item_t docx_template_items[] = "<a:ext uri=\"{05A4C25C-085E-4340-85A3-A5531E510DB2}\">" "<thm15:themeFamily xmlns:thm15=\"http://schemas.microsoft.com/office/thememl/2012/main\" name=\"Office Theme\" id=\"{62F939B6-93AF-4DB8-9C6B-D6C7DFDC589F}\" vid=\"{4A3C46E8-61CC-4603-A589-7422A47A8E4A}\"/></a:ext></a:extLst></a:theme>" }, - + }; int docx_template_items_num = 11; diff --git a/extract/src/docx_template_build.py b/extract/src/docx_template_build.py index 8b836300..e04137d5 100755 --- a/extract/src/docx_template_build.py +++ b/extract/src/docx_template_build.py @@ -14,13 +14,13 @@ Args: -i <in-path> Set template docx/odt file to extract from. - + -n docx | odt Infix to use in generated identifier names. -o <out-path> Set name of output files. - + We write to <out-path>.c and <out-path>.h. 
''' @@ -82,7 +82,7 @@ def check_path_safe(path): raise Exception(f'Path is unsafe because contains "..": {path!r}') for c in path: if not c.isalnum() and c not in '/._-': - #print(f'unsafe character {c} in: {path}') + #print(f'unsafe character {c} in: {path}') raise Exception(f'Path is unsafe because contains "{c}": {path!r}') def path_safe(path): @@ -134,37 +134,37 @@ def main(): path_out = next(args) else: assert 0, f'unrecognised arg: {arg}' - + if not path_in: return - + if not path_in: raise Exception('Need to specify -i <in-path>') if not infix: raise Exception('Need to specify -n <name>') if not path_out: raise Exception('Need to specify -o <out-path>') - + check_path_safe(path_in) check_path_safe(path_out) path_temp = f'{path_in}.dir' os.system(f'rm -r "{path_temp}" 2>/dev/null') system(f'unzip -q -d {path_temp} {path_in}') - + out_c = io.StringIO() out_c.write(f'/* THIS IS AUTO-GENERATED CODE, DO NOT EDIT. */\n') out_c.write(f'\n') out_c.write(f'#include "{os.path.basename(path_out)}.h"\n') out_c.write(f'\n') - - + + out_c.write(f'const {infix}_template_item_t {infix}_template_items[] =\n') out_c.write(f'{{\n') - + num_items = 0 for dirpath, dirnames, filenames in os.walk(path_temp): dirnames.sort() - + if 0: # Write code to create directory item in zip. This isn't recognised by zipinfo, and doesn't # make Word like the file. @@ -174,7 +174,7 @@ def main(): if not name.endswith('/'): name += '/' out_c3.write(f' if (extract_zip_write_file(zip, NULL, 0, "{infix}")) goto end;\n') - + for filename in sorted(filenames): num_items += 1 path = os.path.join(dirpath, filename) @@ -205,7 +205,15 @@ def main(): for tag in 'dc:creator', 'cp:lastModifiedBy': text = re.sub(f'[<]{tag}[>][^<]*[<]/{tag}[>]', f'<{tag}></{tag}>', text) - out_c.write(f' "{text}"\n') + out_c.write(f' "') + # Represent non-ascii utf-8 bytes as C escape sequences. 
+ for c in text: + if ord( c) <= 127: + out_c.write( c) + else: + for cc in c.encode( 'utf-8'): + out_c.write( f'\\x{cc:02x}') + out_c.write(f'"\n') else: data = read(os.path.join(dirpath, filename), encoding=None) out_c.write(f' "') @@ -216,17 +224,17 @@ def main(): out_c.write(f'"\n "') out_c.write(f'\\x{byte:02x}') out_c.write(f'"\n') - + out_c.write(f' }},\n') - out_c.write(f' \n') - + out_c.write(f'\n') + out_c.write(f'}};\n') out_c.write(f'\n') out_c.write(f'int {infix}_template_items_num = {num_items};\n') - + out_c = out_c.getvalue() write_if_diff(out_c, f'{path_out}.c', 'utf-8', force) - + out_h = io.StringIO() out_h.write(f'#ifndef EXTRACT_{infix.upper()}_TEMPLATE_H\n') out_h.write(f'#define EXTRACT_{infix.upper()}_TEMPLATE_H\n') @@ -247,6 +255,6 @@ def main(): out_h.write(f'#endif\n') write_if_diff(out_h.getvalue(), f'{path_out}.h', 'utf-8', force) #os.system(f'rm -r "{path_temp}"') - + if __name__ == '__main__': main() diff --git a/extract/src/extract-exe.c b/extract/src/extract-exe.c index ee34023a..808d2cd1 100644 --- a/extract/src/extract-exe.c +++ b/extract/src/extract-exe.c @@ -1,5 +1,9 @@ /* Command-line programme for extract_ API. */ +#ifdef _WIN32 +#define _CRT_SECURE_NO_WARNINGS +#endif + #include "../include/extract.h" #include "../include/extract_alloc.h" @@ -67,13 +71,13 @@ int main(int argc, char** argv) extract_buffer_t* out_buffer = NULL; extract_buffer_t* intermediate = NULL; extract_t* extract = NULL; - + /* Create an allocator so we test the allocation code. 
*/ if (extract_alloc_create(s_realloc, (void*) 123, &alloc)) { assert(0); } - + for (i=1; i<argc; ++i) { const char* arg = argv[i]; if (!strcmp(arg, "-h") || !strcmp(arg, "--help")) { @@ -185,7 +189,7 @@ int main(int argc, char** argv) assert(i < argc); } - + if (format == -1) { printf("'-f odt | docx' must be specified\n"); @@ -198,17 +202,17 @@ int main(int argc, char** argv) errno = EINVAL; goto end; } - + if (extract_buffer_open_file(alloc, input_path, 0 /*writable*/, &intermediate)) { printf("Failed to open intermediate file: %s\n", input_path); goto end; } - + if (extract_begin(alloc, format, &extract)) goto end; if (extract_read_intermediate(extract, intermediate, autosplit)) goto end; - + if (extract_process(extract, spacing, rotation, images)) goto end; - + if (content_path) { if (extract_buffer_open_file(alloc, content_path, 1 /*writable*/, &out_buffer)) goto end; if (extract_write_content(extract, out_buffer)) goto end; @@ -247,9 +251,9 @@ int main(int argc, char** argv) printf("Failed (errno=%i): %s\n", errno, strerror(errno)); return 1; } - + extract_internal_end(); - + if (alloc_stats) { extract_alloc_stats_t* stats = extract_alloc_stats(alloc); printf("Alloc stats: num_malloc=%i num_realloc=%i num_free=%i num_libc_realloc=%i\n", @@ -259,9 +263,9 @@ int main(int argc, char** argv) stats->num_libc_realloc ); } - + extract_alloc_destroy(&alloc); - assert(alloc == NULL); + assert(alloc == NULL); printf("Finished.\n"); return 0; diff --git a/extract/src/extract.c b/extract/src/extract.c index 2c375571..42f888f3 100644 --- a/extract/src/extract.c +++ b/extract/src/extract.c @@ -25,6 +25,9 @@ +const rect_t extract_rect_infinite = { { DBL_MIN, DBL_MIN }, { DBL_MAX, DBL_MAX } }; +const rect_t extract_rect_empty = { { DBL_MAX, DBL_MAX }, { DBL_MIN, DBL_MIN } }; + double extract_matrix_expansion(matrix_t m) { @@ -200,74 +203,97 @@ static void table_free(extract_alloc_t* alloc, table_t** ptable) extract_free(alloc, ptable); } -static void 
page_free(extract_alloc_t* alloc, extract_page_t** ppage) +void extract_subpage_free(extract_alloc_t* alloc, subpage_t** psubpage) { - extract_page_t* page = *ppage; - if (!page) return; + subpage_t* subpage = *psubpage; + if (!subpage) return; - outf0("page=%p page->spans_num=%i page->lines_num=%i", - page, page->spans_num, page->lines_num); - extract_spans_free(alloc, &page->spans, page->spans_num); + outf0("subpage=%p subpage->spans_num=%i subpage->lines_num=%i", + subpage, subpage->spans_num, subpage->lines_num); + extract_spans_free(alloc, &subpage->spans, subpage->spans_num); - extract_lines_free(alloc, &page->lines, page->lines_num); + extract_lines_free(alloc, &subpage->lines, subpage->lines_num); { int p; - for (p=0; p<page->paragraphs_num; ++p) { - paragraph_t* paragraph = page->paragraphs[p]; + for (p=0; p<subpage->paragraphs_num; ++p) { + paragraph_t* paragraph = subpage->paragraphs[p]; /* We don't call extract_lines_free(&paragraph->lines) because - these point into the same data as page->lines, which we have + these point into the same data as subpage->lines, which we have already freed above. 
*/ if (paragraph) extract_free(alloc, &paragraph->lines); - extract_free(alloc, &page->paragraphs[p]); + extract_free(alloc, &subpage->paragraphs[p]); } } - extract_free(alloc, &page->paragraphs); - + extract_free(alloc, &subpage->paragraphs); + { int i; - for (i=0; i<page->images_num; ++i) { - extract_image_clear(alloc, &page->images[i]); + for (i=0; i<subpage->images_num; ++i) { + extract_image_clear(alloc, &subpage->images[i]); } - extract_free(alloc, &page->images); + extract_free(alloc, &subpage->images); } - extract_free(alloc, &page->images); + extract_free(alloc, &subpage->images); + + extract_free(alloc, &subpage->tablelines_horizontal.tablelines); + extract_free(alloc, &subpage->tablelines_vertical.tablelines); - extract_free(alloc, &page->tablelines_horizontal.tablelines); - extract_free(alloc, &page->tablelines_vertical.tablelines); - { int t; - outf("page=%p page->tables_num=%i", page, page->tables_num); - for (t=0; t<page->tables_num; ++t) + outf("subpage=%p subpage->tables_num=%i", subpage, subpage->tables_num); + for (t=0; t<subpage->tables_num; ++t) { - table_free(alloc, &page->tables[t]); + table_free(alloc, &subpage->tables[t]); } - extract_free(alloc, &page->tables); + extract_free(alloc, &subpage->tables); + } + + extract_free(alloc, psubpage); +} + +static void page_free(extract_alloc_t* alloc, extract_page_t** ppage) +{ + int c; + extract_page_t* page = *ppage; + if (!page) return; + + for (c=0; c<page->subpages_num; ++c) + { + subpage_t *subpage = page->subpages[c]; + extract_subpage_free(alloc, &subpage); } - + extract_free(alloc, &page->subpages); extract_free(alloc, ppage); } -static span_t* page_span_append(extract_alloc_t* alloc, extract_page_t* page) -/* Appends new empty span_ to an extract_page_t; returns NULL with errno set on -error. 
*/ +int subpage_span_append(extract_alloc_t *alloc, subpage_t *subpage, span_t *span) { - span_t* span; - if (extract_malloc(alloc, &span, sizeof(*span))) return NULL; - extract_span_init(span); if (extract_realloc2( alloc, - &page->spans, - sizeof(*page->spans) * page->spans_num, - sizeof(*page->spans) * (page->spans_num + 1) + &subpage->spans, + sizeof(*subpage->spans) * subpage->spans_num, + sizeof(*subpage->spans) * (subpage->spans_num + 1) )) { - extract_free(alloc, &span); - return NULL; + return -1; } - page->spans[page->spans_num] = span; - page->spans_num += 1; - return span; + subpage->spans[subpage->spans_num] = span; + subpage->spans_num += 1; + + return 0; +} + + +static int subpage_span_append_new(extract_alloc_t* alloc, subpage_t *subpage, span_t** pspan) +/* Appends new empty span_ to a subpage_t; returns -1 with errno set on error. */ +{ + if (extract_malloc(alloc, pspan, sizeof(**pspan))) return -1; + extract_span_init(*pspan); + if (subpage_span_append(alloc, subpage, *pspan)) { + extract_free(alloc, pspan); + return -1; + } + return 0; } @@ -285,9 +311,9 @@ static void extract_images_free(extract_alloc_t* alloc, images_t* images) static int extract_document_images(extract_alloc_t* alloc, document_t* document, images_t* o_images) -/* Moves image_t's from document->page[] to *o_images. +/* Moves image_t's from document->subpage[] to *o_images. -On return document->page[].images* will be NULL etc. +On return document->subpage[].images* will be NULL etc. */ { int e = -1; @@ -297,59 +323,65 @@ On return document->page[].images* will be NULL etc. 
for (p=0; p<document->pages_num; ++p) { extract_page_t* page = document->pages[p]; - int i; - for (i=0; i<page->images_num; ++i) + int c; + for (c=0; c<page->subpages_num; ++c) { - image_t* image; - if (extract_realloc2( - alloc, - &images.images, - sizeof(image_t) * images.images_num, - sizeof(image_t) * (images.images_num + 1) - )) goto end; - image = &page->images[i]; - outf("p=%i i=%i image->name=%s image->id=%s", p, i, image->name, image->id); - assert(image->name); - images.images[images.images_num] = *image; - images.images_num += 1; - - /* Add image type if we haven't seen it before. */ + subpage_t* subpage = page->subpages[c]; + int i; + for (i=0; i<subpage->images_num; ++i) { - int it; - for (it=0; it<images.imagetypes_num; ++it) + image_t* image; + if (extract_realloc2( + alloc, + &images.images, + sizeof(image_t) * images.images_num, + sizeof(image_t) * (images.images_num + 1) + )) goto end; + image = &subpage->images[i]; + outf("p=%i i=%i image->name=%s image->id=%s", p, i, image->name, image->id); + assert(image->name); + images.images[images.images_num] = *image; + images.images_num += 1; + + /* Add image type if we haven't seen it before. */ { - outf("it=%i images.imagetypes[it]=%s image->type=%s", - it, images.imagetypes[it], image->type); - if (!strcmp(images.imagetypes[it], image->type)) { - break; + int it; + for (it=0; it<images.imagetypes_num; ++it) + { + outf("it=%i images.imagetypes[it]=%s image->type=%s", + it, images.imagetypes[it], image->type); + if (!strcmp(images.imagetypes[it], image->type)) + { + break; + } } - } - if (it == images.imagetypes_num) - { - /* We haven't seen this image type before. */ - if (extract_realloc2( - alloc, - &images.imagetypes, - sizeof(char*) * images.imagetypes_num, - sizeof(char*) * (images.imagetypes_num + 1) + if (it == images.imagetypes_num) + { + /* We haven't seen this image type before. 
*/ + if (extract_realloc2( + alloc, + &images.imagetypes, + sizeof(char*) * images.imagetypes_num, + sizeof(char*) * (images.imagetypes_num + 1) )) goto end; - assert(image->type); - images.imagetypes[images.imagetypes_num] = image->type; - images.imagetypes_num += 1; - outf("have added images.imagetypes_num=%i", images.imagetypes_num); + assert(image->type); + images.imagetypes[images.imagetypes_num] = image->type; + images.imagetypes_num += 1; + outf("have added images.imagetypes_num=%i", images.imagetypes_num); + } } + + /* We've taken ownership of image->* so NULL the original values + here to ensure we can't use things after free. */ + image->type = NULL; + image->name = NULL; + image->id = NULL; + image->data = NULL; + image->data_size = 0; } - - /* We've taken ownership of image->* so NULL the original values - here to ensure we can't use things after free. */ - image->type = NULL; - image->name = NULL; - image->id = NULL; - image->data = NULL; - image->data_size = 0; + extract_free(alloc, &subpage->images); + subpage->images_num = 0; } - extract_free(alloc, &page->images); - page->images_num = 0; } e = 0; end: @@ -367,12 +399,11 @@ On return document->page[].images* will be NULL etc. 
static void extract_document_free(extract_alloc_t* alloc, document_t* document) { int p; - if (!document) { - return; - } - for (p=0; p<document->pages_num; ++p) { - extract_page_t* page = document->pages[p]; - page_free(alloc, &page); + if (!document) return; + + for (p=0; p<document->pages_num; ++p) + { + page_free(alloc, &document->pages[p]); } extract_free(alloc, &document->pages); document->pages = NULL; @@ -451,11 +482,11 @@ static void s_document_init(document_t* document) } -static int page_span_end_clean(extract_alloc_t* alloc, extract_page_t* page) -/* Does preliminary processing of the end of the last span in a page; intended +static int subpage_span_end_clean(extract_alloc_t* alloc, subpage_t* subpage) +/* Does preliminary processing of the end of the last span in a subpage; intended to be called as we load span information. -Looks at last two char_t's in last span_t of <page>, and either +Looks at last two char_t's in last span_t of <subpage>, and either leaves unchanged, or removes space in last-but-one position, or moves last char_t into a new span_t. */ { @@ -468,9 +499,9 @@ char_t into a new span_t. */ double err_x; double err_y; point_t dir; - - assert(page->spans_num); - span = page->spans[page->spans_num-1]; + + assert(subpage->spans_num); + span = subpage->spans[subpage->spans_num-1]; assert(span->chars_num); /* Last two char_t's are char_[-2] and char_[-1]. */ @@ -547,8 +578,8 @@ char_t into a new span_t. */ span_string2(span) ); { - span_t* span2 = page_span_append(alloc, page); - if (!span2) goto end; + span_t* span2; + if (subpage_span_append_new(alloc, subpage, &span2)) goto end; *span2 = *span; if (extract_strdup(alloc, span->font_name, &span2->font_name)) goto end; span2->chars_num = 1; @@ -567,42 +598,44 @@ char_t into a new span_t. */ struct extract_t { extract_alloc_t* alloc; - + + int layout_analysis; + document_t document; - + int num_spans_split; - /* Number of extra spans from page_span_end_clean(). 
*/ - + /* Number of extra spans from subpage_span_end_clean(). */ + int num_spans_autosplit; /* Number of extra spans from autosplit=1. */ - + double span_offset_x; double span_offset_y; /* Only used if autosplit is non-zero. */ - + int image_n; /* Used to generate unique ids for images. */ - + /* List of strings that are the generated docx content for each page. When zip_* can handle appending of data, we will be able to remove this list. */ extract_astring_t* contentss; int contentss_num; - + images_t images; - + extract_format_t format; extract_odt_styles_t odt_styles; - + char* tables_csv_format; int tables_csv_i; - + enum { path_type_NONE, path_type_FILL, path_type_STROKE, } path_type; - + union { struct @@ -612,7 +645,7 @@ struct extract_t point_t points[4]; int n; } fill; - + struct { matrix_t ctm; @@ -623,7 +656,7 @@ struct extract_t point_t point; int point_set; } stroke; - + } path; }; @@ -636,7 +669,7 @@ int extract_begin( { int e = -1; extract_t* extract; - + if (1 && format != extract_format_ODT && format != extract_format_DOCX @@ -648,29 +681,35 @@ int extract_begin( errno = EINVAL; return -1; } - + /* Use a temporary extract_alloc_t to allocate space for the extract_t. */ if (extract_malloc(alloc, &extract, sizeof(*extract))) goto end; - + extract_bzero(extract, sizeof(*extract)); extract->alloc = alloc; s_document_init(&extract->document); - + /* Start at 10 because template document might use some low-numbered IDs. */ extract->image_n = 10; - + extract->format = format; extract->tables_csv_format = NULL; extract->tables_csv_i = 0; - + e = 0; - + end: *pextract = (e) ? 
NULL : extract; return e; } +int extract_set_layout_analysis(extract_t *extract, int enable) +{ + extract->layout_analysis = enable; + return 0; +} + int extract_tables_csv_format(extract_t* extract, const char* path_format) { return extract_strdup(extract->alloc, path_format, &extract->tables_csv_format); @@ -686,7 +725,7 @@ static void image_free_fn(void* handle, void* image_data) int extract_read_intermediate(extract_t* extract, extract_buffer_t* buffer, int autosplit) { int ret = -1; - + document_t* document = &extract->document; char* image_data = NULL; int num_spans = 0; @@ -716,16 +755,18 @@ int extract_read_intermediate(extract_t* extract, extract_buffer_t* buffer, int </page> ... - We convert this into a list of extract_page_t's, each containing a list of + We convert this into a list of subpage_t's, each containing a list of span_t's, each containing a list of char_t's. While doing this, we do some within-span processing by calling - page_span_end_clean(): + subpage_span_end_clean(): Remove spurious spaces. Split spans in two where there seem to be large gaps between glyphs. */ for(;;) { extract_page_t* page; + subpage_t* subpage; + rect_t mediabox = extract_rect_infinite; /* Fake mediabox */ int e = extract_xml_pparse_next(buffer, &tag); if (e == 1) break; /* EOF. 
*/ if (e) goto end; @@ -741,14 +782,16 @@ int extract_read_intermediate(extract_t* extract, extract_buffer_t* buffer, int goto end; } outfx("loading spans for page %i...", document->pages_num); - if (extract_page_begin(extract)) goto end; + if (extract_page_begin(extract, mediabox.min.x, mediabox.min.y, mediabox.max.x, mediabox.max.y)) goto end; page = extract->document.pages[extract->document.pages_num-1]; if (!page) goto end; + subpage = page->subpages[page->subpages_num-1]; + if (!subpage) goto end; for(;;) { if (extract_xml_pparse_next(buffer, &tag)) goto end; if (!strcmp(tag.name, "/page")) { - num_spans += page->spans_num; + num_spans += subpage->spans_num; break; } if (!strcmp(tag.name, "image")) { @@ -804,20 +847,20 @@ int extract_read_intermediate(extract_t* extract, extract_buffer_t* buffer, int else if (cc >= 'a' && cc <= 'f') byte += 10 + cc - 'a'; else goto compressed_error; byte *= 16; - + cc = *c; c += 1; if (cc >= '0' && cc <= '9') byte += cc-'0'; else if (cc >= 'a' && cc <= 'f') byte += 10 + cc - 'a'; else goto compressed_error; - + image_data[i] = (char) byte; i += 1; if (i == image_data_size) { break; } continue; - + compressed_error: outf("Unrecognised hex character '%x' at offset %lli in image data", cc, (long long) (c-tag.text.chars)); errno = EINVAL; @@ -893,12 +936,12 @@ int extract_read_intermediate(extract_t* extract, extract_buffer_t* buffer, int trm.e, trm.f )) goto end; - + for(;;) { - double x; - double y; - double adv; - unsigned ucs; + double x; + double y; + double adv; + unsigned ucs; if (extract_xml_pparse_next(buffer, &tag)) { outf("Failed to find <char or </span"); @@ -917,16 +960,17 @@ int extract_read_intermediate(extract_t* extract, extract_buffer_t* buffer, int if (extract_xml_tag_attributes_find_double(&tag, "y", &y)) goto end; if (extract_xml_tag_attributes_find_double(&tag, "adv", &adv)) goto end; if (extract_xml_tag_attributes_find_uint(&tag, "ucs", &ucs)) goto end; - - if (extract_add_char(extract, x, y, ucs, adv, 
autosplit)) goto end; + + /* BBox is bogus here. Analysis will fail. */ + if (extract_add_char(extract, x, y, ucs, adv, autosplit, x, y, x + adv, y + adv)) goto end; } extract_xml_tag_free(extract->alloc, &tag); } } if (extract_page_end(extract)) goto end; - outf("page=%i page->num_spans=%i", - document->pages_num, page->spans_num); + outf("page=%i subpage->num_spans=%i", + document->pages_num, subpage->spans_num); } outf("num_spans=%i num_spans_split=%i num_spans_autosplit=%i", @@ -940,7 +984,7 @@ int extract_read_intermediate(extract_t* extract, extract_buffer_t* buffer, int end: extract_xml_tag_free(extract->alloc, &tag); extract_free(extract->alloc, &image_data); - + return ret; } @@ -967,9 +1011,11 @@ int extract_span_begin( { int e = -1; extract_page_t* page; + subpage_t* subpage; span_t* span; assert(extract->document.pages_num > 0); page = extract->document.pages[extract->document.pages_num-1]; + subpage = page->subpages[page->subpages_num-1]; outf("extract_span_begin(): ctm=(%f %f %f %f %f %f) trm=(%f %f %f %f %f %f) font_name=%s, wmode=%i", ctm_a, ctm_b, @@ -986,22 +1032,21 @@ int extract_span_begin( font_name, wmode ); - span = page_span_append(extract->alloc, page); - if (!span) goto end; + if (subpage_span_append_new(extract->alloc, subpage, &span)) goto end; span->ctm.a = ctm_a; span->ctm.b = ctm_b; span->ctm.c = ctm_c; span->ctm.d = ctm_d; span->ctm.e = ctm_e; span->ctm.f = ctm_f; - + span->trm.a = trm_a; span->trm.b = trm_b; span->trm.c = trm_c; span->trm.d = trm_d; span->trm.e = trm_e; span->trm.f = trm_f; - + { const char* ff = strchr(font_name, '+'); const char* f = (ff) ? 
ff+1 : font_name; @@ -1019,25 +1064,30 @@ int extract_span_begin( int extract_add_char( - extract_t* extract, - double x, - double y, - unsigned ucs, - double adv, - int autosplit + extract_t* extract, + double x, + double y, + unsigned ucs, + double adv, + int autosplit, + double x0, + double y0, + double x1, + double y1 ) { int e = -1; char_t* char_; extract_page_t* page = extract->document.pages[extract->document.pages_num-1]; - span_t* span = page->spans[page->spans_num - 1]; - + subpage_t* subpage = page->subpages[page->subpages_num-1]; + span_t* span = subpage->spans[subpage->spans_num - 1]; + outf("(%f %f) ucs=% 5i=%c adv=%f", x, y, ucs, (ucs >=32 && ucs< 127) ? ucs : ' ', adv); /* Ignore the specified <autosplit> - there seems no advantage to not splitting spans on multiple lines, and not doing so causes problems with missing spaces in the output. */ autosplit = 1; - + if (span->chars_num) { char_t* char_prev = &span->chars[span->chars_num - 1]; @@ -1065,17 +1115,16 @@ int extract_add_char( dir.x, dir.y, span_a ); extract->num_spans_autosplit += 1; - span = page_span_append(extract->alloc, page); - if (!span) goto end; + if (subpage_span_append_new(extract->alloc, subpage, &span)) goto end; *span = *span0; span->chars = NULL; span->chars_num = 0; if (extract_strdup(extract->alloc, span0->font_name, &span->font_name)) goto end; } } - + if (0 && autosplit && y - extract->span_offset_y != 0) { - + double e = span->ctm.e + span->ctm.a * (x - extract->span_offset_x) + span->ctm.b * (y - extract->span_offset_y); double f = span->ctm.f + span->ctm.c * (x - extract->span_offset_x) @@ -1094,8 +1143,7 @@ int extract_add_char( /* Create new span. 
*/ span_t* span0 = span; extract->num_spans_autosplit += 1; - span = page_span_append(extract->alloc, page); - if (!span) goto end; + if (subpage_span_append_new(extract->alloc, subpage, &span)) goto end; *span = *span0; span->chars = NULL; span->chars_num = 0; @@ -1106,31 +1154,35 @@ int extract_add_char( outfx("autosplit: char_pre_y=%f offset_y=%f", char_pre_y, offset_y); } - + if (extract_span_append_c(extract->alloc, span, 0 /*c*/)) goto end; /* Coverity warns, but extract_span_append_c() will have appended an item. */ /* coverity[var_deref_op] */ char_ = &span->chars[ span->chars_num-1]; - + char_->pre_x = x; char_->pre_y = y; char_->x = span->ctm.a * char_->pre_x + span->ctm.c * char_->pre_y + span->ctm.e; char_->y = span->ctm.b * char_->pre_x + span->ctm.d * char_->pre_y + span->ctm.f; - + char_->adv = adv; char_->ucs = ucs; + char_->bbox.min.x = x0; + char_->bbox.min.y = y0; + char_->bbox.max.x = x1; + char_->bbox.max.y = y1; { - int page_spans_num_old = page->spans_num; - if (page_span_end_clean(extract->alloc, page)) goto end; - span = page->spans[page->spans_num-1]; /* fixme: unnecessary. */ - if (page->spans_num != page_spans_num_old) { + int subpage_spans_num_old = subpage->spans_num; + if (subpage_span_end_clean(extract->alloc, subpage)) goto end; + span = subpage->spans[subpage->spans_num-1]; /* fixme: unnecessary. */ + if (subpage->spans_num != subpage_spans_num_old) { extract->num_spans_split += 1; } } e = 0; - + end: return e; } @@ -1139,13 +1191,14 @@ int extract_add_char( int extract_span_end(extract_t* extract) { extract_page_t* page = extract->document.pages[extract->document.pages_num-1]; - span_t* span = page->spans[page->spans_num - 1]; + subpage_t* subpage = page->subpages[page->subpages_num-1]; + span_t* span = subpage->spans[subpage->spans_num - 1]; if (span->chars_num == 0) { /* Calling code called extract_span_begin() then extract_span_end() without any call to extract_add_char(). 
Our joining code assumes that all spans are non-empty, so we need to delete this span. */ - extract_free(extract->alloc, &page->spans[page->spans_num - 1]); - page->spans_num -= 1; + extract_free(extract->alloc, &subpage->spans[subpage->spans_num - 1]); + subpage->spans_num -= 1; } return 0; } @@ -1166,8 +1219,9 @@ int extract_add_image( { int e = -1; extract_page_t* page = extract->document.pages[extract->document.pages_num-1]; + subpage_t* subpage = page->subpages[page->subpages_num-1]; image_t image_temp = {0}; - + extract->image_n += 1; image_temp.x = x; image_temp.y = y; @@ -1180,29 +1234,29 @@ int extract_add_image( if (extract_strdup(extract->alloc, type, &image_temp.type)) goto end; if (extract_asprintf(extract->alloc, &image_temp.id, "rId%i", extract->image_n) < 0) goto end; if (extract_asprintf(extract->alloc, &image_temp.name, "image%i.%s", extract->image_n, image_temp.type) < 0) goto end; - + if (extract_realloc2( extract->alloc, - &page->images, - sizeof(image_t) * page->images_num, - sizeof(image_t) * (page->images_num + 1) + &subpage->images, + sizeof(image_t) * subpage->images_num, + sizeof(image_t) * (subpage->images_num + 1) )) goto end; - - page->images[page->images_num] = image_temp; - page->images_num += 1; - outf("page->images_num=%i", page->images_num); - + + subpage->images[subpage->images_num] = image_temp; + subpage->images_num += 1; + outf("subpage->images_num=%i", subpage->images_num); + e = 0; - + end: - + if (e) { extract_free(extract->alloc, &image_temp.type); extract_free(extract->alloc, &image_temp.data); extract_free(extract->alloc, &image_temp.id); extract_free(extract->alloc, &image_temp.name); } - + return e; } @@ -1220,7 +1274,7 @@ static int tablelines_append(extract_alloc_t* alloc, tablelines_t* tablelines, r return 0; } -static point_t transform(double x, double y, +static point_t transform(double x, double y, double ctm_a, double ctm_b, double ctm_c, @@ -1265,6 +1319,7 @@ int extract_add_path4( ) { extract_page_t* page = 
extract->document.pages[extract->document.pages_num-1]; + subpage_t* subpage = page->subpages[page->subpages_num-1]; point_t points[4] = { transform(x0, y0, ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f), transform(x1, y1, ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f), @@ -1303,20 +1358,20 @@ int extract_add_path4( if (points[(i+4) % 4].y != y0) return 0; rect.min.y = (y1 > y0) ? y0 : y1; rect.max.y = (y1 > y0) ? y1 : y0; - + dx = rect.max.x - rect.min.x; dy = rect.max.y - rect.min.y; if (dx / dy > 5) { /* Horizontal line. */ outf("have found horizontal line: %s", extract_rect_string(&rect)); - if (tablelines_append(extract->alloc, &page->tablelines_horizontal, &rect, color)) return -1; + if (tablelines_append(extract->alloc, &subpage->tablelines_horizontal, &rect, color)) return -1; } else if (dy / dx > 5) { /* Vertical line. */ outf("have found vertical line: %s", extract_rect_string(&rect)); - if (tablelines_append(extract->alloc, &page->tablelines_vertical, &rect, color)) return -1; + if (tablelines_append(extract->alloc, &subpage->tablelines_vertical, &rect, color)) return -1; } return 0; } @@ -1339,6 +1394,7 @@ int extract_add_line( ) { extract_page_t* page = extract->document.pages[extract->document.pages_num-1]; + subpage_t* subpage = page->subpages[page->subpages_num-1]; point_t p0 = transform(x0, y0, ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f); point_t p1 = transform(x1, y1, ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f); double width2 = width * sqrt( fabs( ctm_a * ctm_d - ctm_b * ctm_c)); @@ -1348,7 +1404,7 @@ int extract_add_line( rect.min.y = s_min(p0.y, p1.y); rect.max.x = s_max(p0.x, p1.x); rect.max.y = s_max(p0.y, p1.y); - + outf("%s: width=%f ((%f %f)(%f %f)) rect=%s", extract_FUNCTION, width, @@ -1362,49 +1418,98 @@ int extract_add_line( { rect.min.x -= width2 / 2; rect.max.x += width2 / 2; - return tablelines_append(extract->alloc, &page->tablelines_vertical, &rect, color); + return tablelines_append(extract->alloc, &subpage->tablelines_vertical, &rect, color); 
} else if (rect.min.y == rect.max.y) { rect.min.y -= width2 / 2; rect.max.y += width2 / 2; - return tablelines_append(extract->alloc, &page->tablelines_horizontal, &rect, color); + return tablelines_append(extract->alloc, &subpage->tablelines_horizontal, &rect, color); } return 0; } +int extract_subpage_alloc(extract_alloc_t* alloc, rect_t mediabox, extract_page_t* page, subpage_t** psubpage) +{ + subpage_t* subpage; + if (extract_malloc(alloc, psubpage, sizeof(subpage_t))) + { + return -1; + } + subpage = *psubpage; + subpage->mediabox = mediabox; + subpage->spans = NULL; + subpage->spans_num = 0; + subpage->lines = NULL; + subpage->lines_num = 0; + subpage->paragraphs = NULL; + subpage->paragraphs_num = 0; + subpage->images = NULL; + subpage->images_num = 0; + subpage->tablelines_horizontal.tablelines = NULL; + subpage->tablelines_horizontal.tablelines_num = 0; + subpage->tablelines_vertical.tablelines = NULL; + subpage->tablelines_vertical.tablelines_num = 0; + subpage->tables = NULL; + subpage->tables_num = 0; + + if (extract_realloc2( + alloc, + &page->subpages, + sizeof(subpage_t*) * page->subpages_num, + sizeof(subpage_t*) * (page->subpages_num + 1) + )) { + extract_free(alloc, psubpage); + return -1; + } + page->subpages[page->subpages_num] = subpage; + page->subpages_num += 1; + return 0; +} + +static int extract_subpage_begin(extract_t* extract, double x0, double y0, double x1, double y1) +/* Appends new empty subpage_t to the last page of an extract->document. */ +{ + extract_page_t* page = extract->document.pages[extract->document.pages_num - 1]; + subpage_t* subpage; + rect_t mediabox = { { x0, y0 }, { x1, y1 } }; + + return extract_subpage_alloc(extract->alloc, mediabox, page, &subpage); +} -int extract_page_begin(extract_t* extract) +int extract_page_begin(extract_t *extract, double x0, double y0, double x1, double y1) { - /* Appends new empty extract_page_t to an extract->document. */ + /* Appends new empty page_t to an extract->document. 
*/ extract_page_t* page; - if (extract_malloc(extract->alloc, &page, sizeof(extract_page_t))) return -1; - page->spans = NULL; - page->spans_num = 0; - page->lines = NULL; - page->lines_num = 0; - page->paragraphs = NULL; - page->paragraphs_num = 0; - page->images = NULL; - page->images_num = 0; - page->tablelines_horizontal.tablelines = NULL; - page->tablelines_horizontal.tablelines_num = 0; - page->tablelines_vertical.tablelines = NULL; - page->tablelines_vertical.tablelines_num = 0; - page->tables = NULL; - page->tables_num = 0; - + + if (extract_malloc(extract->alloc, &page, sizeof(*page))) return -1; + page->mediabox.min.x = x0; + page->mediabox.min.y = y0; + page->mediabox.max.x = x1; + page->mediabox.max.y = y1; + page->subpages = NULL; + page->subpages_num = 0; + page->split = NULL; + if (extract_realloc2( extract->alloc, &extract->document.pages, - sizeof(extract_page_t*) * extract->document.pages_num + 1, - sizeof(extract_page_t*) * (extract->document.pages_num + 1) + sizeof(subpage_t*) * extract->document.pages_num, + sizeof(subpage_t*) * (extract->document.pages_num + 1) )) { extract_free(extract->alloc, &page); return -1; } + extract->document.pages[extract->document.pages_num] = page; extract->document.pages_num += 1; + + if (extract_subpage_begin(extract, x0, y0, x1, y1)) { + extract->document.pages_num--; + page_free(extract->alloc, &extract->document.pages[extract->document.pages_num]); + return -1; + } + return 0; } @@ -1634,13 +1739,22 @@ int extract_stroke_end(extract_t* extract) -int extract_page_end(extract_t* extract) +static int extract_subpage_end(extract_t* extract) { (void) extract; return 0; } +int extract_page_end(extract_t* extract) +{ + if (extract_subpage_end(extract)) + return -1; + + return 0; +} + + static int paragraphs_to_text_content( extract_alloc_t* alloc, paragraph_t** paragraphs, @@ -1692,54 +1806,59 @@ static int extract_write_tables_csv(extract_t* extract) FILE* f = NULL; extract_astring_t text = {NULL, 0}; if 
(!extract->tables_csv_format) return 0; - + outf("extract_write_tables_csv(): path_format=%s", extract->tables_csv_format); outf("extract->document.pages_num=%i", extract->document.pages_num); for (p=0; p<extract->document.pages_num; ++p) { + int c; extract_page_t* page = extract->document.pages[p]; - int t; - outf("p=%i page->tables_num=%i", p, page->tables_num); - for (t=0; t<page->tables_num; ++t) + for (c=0; c<page->subpages_num; ++c) { - table_t* table = page->tables[t]; - int y; - extract_free(extract->alloc, &path); - if (extract_asprintf(extract->alloc, &path, extract->tables_csv_format, extract->tables_csv_i) < 0) goto end; - extract->tables_csv_i += 1; - outf("Writing table %i to: %s", t, path); - outf("table->cells_num_x=%i", table->cells_num_x); - outf("table->cells_num_y=%i", table->cells_num_y); - f = fopen(path, "w"); - if (!f) goto end; - for (y=0; y<table->cells_num_y; ++y) + subpage_t* subpage = page->subpages[c]; + int t; + outf("p=%i subpage->tables_num=%i", p, subpage->tables_num); + for (t=0; t<subpage->tables_num; ++t) { - int x; - int have_output = 0; - for (x=0; x<table->cells_num_x; ++x) + table_t* table = subpage->tables[t]; + int y; + extract_free(extract->alloc, &path); + if (extract_asprintf(extract->alloc, &path, extract->tables_csv_format, extract->tables_csv_i) < 0) goto end; + extract->tables_csv_i += 1; + outf("Writing table %i to: %s", t, path); + outf("table->cells_num_x=%i", table->cells_num_x); + outf("table->cells_num_y=%i", table->cells_num_y); + f = fopen(path, "w"); + if (!f) goto end; + for (y=0; y<table->cells_num_y; ++y) { - cell_t* cell = table->cells[table->cells_num_x * y + x]; - extract_astring_free(extract->alloc, &text); - if (y==0) + int x; + int have_output = 0; + for (x=0; x<table->cells_num_x; ++x) { - outf("y=0 x=%i cell->rect=%s", x, extract_rect_string(&cell->rect)); + cell_t* cell = table->cells[table->cells_num_x * y + x]; + extract_astring_free(extract->alloc, &text); + if (y==0) + { + outf("y=0 x=%i 
cell->rect=%s", x, extract_rect_string(&cell->rect)); + } + if (have_output) fprintf(f, ","); + have_output = 1; + if (paragraphs_to_text_content( + extract->alloc, + cell->paragraphs, + cell->paragraphs_num, + &text + )) goto end; + /* Reference cvs output trims trailing spaces. */ + extract_astring_char_truncate_if(&text, ' '); + fprintf(f, "\"%s\"", text.chars ? text.chars : ""); } - if (have_output) fprintf(f, ","); - have_output = 1; - if (paragraphs_to_text_content( - extract->alloc, - cell->paragraphs, - cell->paragraphs_num, - &text - )) goto end; - /* Reference cvs output trims trailing spaces. */ - extract_astring_char_truncate_if(&text, ' '); - fprintf(f, "\"%s\"", text.chars ? text.chars : ""); + fprintf(f, "\n"); } - fprintf(f, "\n"); + fclose(f); + f = NULL; } - fclose(f); - f = NULL; } } ret = 0; @@ -1760,7 +1879,7 @@ int extract_process( ) { int e = -1; - + if (extract_realloc2( extract->alloc, &extract->contentss, @@ -1769,9 +1888,9 @@ int extract_process( )) goto end; extract_astring_init(&extract->contentss[extract->contentss_num]); extract->contentss_num += 1; - - if (extract_document_join(extract->alloc, &extract->document)) goto end; - + + if (extract_document_join(extract->alloc, &extract->document, extract->layout_analysis)) goto end; + if (extract->format == extract_format_ODT) { if (extract_document_to_odt_content( @@ -1811,12 +1930,17 @@ int extract_process( for (p=0; p<extract->document.pages_num; ++p) { extract_page_t* page = extract->document.pages[p]; - if (paragraphs_to_text_content( - extract->alloc, - page->paragraphs, - page->paragraphs_num, - &extract->contentss[extract->contentss_num - 1] + int c; + for (c=0; c<page->subpages_num; ++c) + { + subpage_t* subpage = page->subpages[c]; + if (paragraphs_to_text_content( + extract->alloc, + subpage->paragraphs, + subpage->paragraphs_num, + &extract->contentss[extract->contentss_num - 1] )) goto end; + } } } else @@ -1828,23 +1952,23 @@ int extract_process( } if 
(extract_document_images(extract->alloc, &extract->document, &extract->images)) goto end; - + if (extract->tables_csv_format) { extract_write_tables_csv(extract); } - + { - int i; - for (i=0; i<extract->document.pages_num; ++i) { - page_free(extract->alloc, &extract->document.pages[i]); + int p; + for (p=0; p<extract->document.pages_num; ++p) { + page_free(extract->alloc, &extract->document.pages[p]); } extract_free(extract->alloc, &extract->document.pages); extract->document.pages_num = 0; } - + e = 0; - + end: return e; } @@ -1855,7 +1979,7 @@ int extract_write(extract_t* extract, extract_buffer_t* buffer) extract_zip_t* zip = NULL; char* text2 = NULL; int i; - + if (extract->format == extract_format_ODT) { if (extract_zip_open(buffer, &zip)) goto end; @@ -1922,7 +2046,7 @@ int extract_write(extract_t* extract, extract_buffer_t* buffer) if (extract_zip_write_file(zip, image->data, image->data_size, text2)) goto end; } if (extract_zip_close(&zip)) goto end; - + } else if (extract->format == extract_format_HTML) { @@ -1945,9 +2069,9 @@ int extract_write(extract_t* extract, extract_buffer_t* buffer) errno = EINVAL; return 1; } - + e = 0; - + end: if (e) { @@ -1955,7 +2079,7 @@ int extract_write(extract_t* extract, extract_buffer_t* buffer) extract_zip_close(&zip); } extract_free(extract->alloc, &text2); - + return e; } @@ -1982,7 +2106,7 @@ static int string_ends_with(const char* string, const char* end) } int extract_write_template( - extract_t* extract, + extract_t* extract, const char* path_template, const char* path_out, int preserve_dir @@ -2021,7 +2145,7 @@ void extract_end(extract_t** pextract) extract_t* extract = *pextract; if (!extract) return; extract_document_free(extract->alloc, &extract->document); - + { int i; for (i=0; i<extract->contentss_num; ++i) { diff --git a/extract/src/html.c b/extract/src/html.c index d12a3101..de204881 100644 --- a/extract/src/html.c +++ b/extract/src/html.c @@ -51,7 +51,7 @@ static int content_state_reset(extract_alloc_t* 
alloc, content_state_t* content_ content_state->font.italic = 0; } e = 0; - + end: return e; } @@ -110,9 +110,9 @@ static int paragraph_to_html_content( } } if (extract_astring_catf(alloc, content, "%s</p>", endl)) goto end; - + e = 0; - + end: return e; } @@ -136,10 +136,10 @@ etc. */ paragraph_t* paragraph = paragraphs[p]; if (paragraph_to_html_content(alloc, state, paragraph, single_line, content)) goto end; } - + if (content_state_reset(alloc, state, content)) goto end; e = 0; - + end: return e; } @@ -148,19 +148,14 @@ static int append_table(extract_alloc_t* alloc, content_state_t* state, table_t* { int e = -1; int y; - + if (extract_astring_cat(alloc, content, "\n\n<table border=\"1\" style=\"border-collapse:collapse\">\n")) goto end; - + for (y=0; y<table->cells_num_y; ++y) { /* If 1, we put each <td>...</td> on a separate line. */ - int multiline = 0; int x; if (extract_astring_cat(alloc, content, " <tr>\n")) goto end; - if (!multiline) - { - if (extract_astring_cat(alloc, content, " ")) goto end; - } for (x=0; x<table->cells_num_x; ++x) { cell_t* cell = table->cells[y*table->cells_num_x + x]; @@ -172,7 +167,7 @@ static int append_table(extract_alloc_t* alloc, content_state_t* state, table_t* } if (extract_astring_cat(alloc, content, " ")) goto end; if (extract_astring_cat(alloc, content, "<td")) goto end; - + if (cell->extend_right > 1) { if (extract_astring_catf(alloc, content, " colspan=\"%i\"", cell->extend_right)) goto end; @@ -181,7 +176,7 @@ static int append_table(extract_alloc_t* alloc, content_state_t* state, table_t* { if (extract_astring_catf(alloc, content, " rowspan=\"%i\"", cell->extend_down)) goto end; } - + if (extract_astring_cat(alloc, content, ">")) goto end; if (paragraphs_to_html_content(alloc, state, cell->paragraphs, cell->paragraphs_num, 1 /* single_line*/, content)) goto end; @@ -190,15 +185,11 @@ static int append_table(extract_alloc_t* alloc, content_state_t* state, table_t* if (content_state_reset(alloc, state, content)) goto 
end; } - if (!multiline) - { - if (extract_astring_cat(alloc, content, "\n")) goto end; - } if (extract_astring_cat(alloc, content, " </tr>\n")) goto end; } if (extract_astring_cat(alloc, content, "</table>\n\n")) goto end; e = 0; - + end: return e; } @@ -222,6 +213,126 @@ static int compare_paragraph_y(const void* a, const void* b) return 0; } +/* +*/ +static int +split_to_html(extract_alloc_t *alloc, split_t* split, subpage_t*** ppsubpage, extract_astring_t *output) +{ + int p; + int s; + int t; + subpage_t* subpage; + paragraph_t** paragraphs = NULL; + content_state_t state; + content_state_init(&state); + + if (split == NULL) { + /* fall through to below - SPLIT_NONE */ + } else if (split->type == SPLIT_HORIZONTAL) { + int ret = 0; + double total = 0; + for (s = 0; s < split->count; s++) { + total += split->split[s]->weight; + } + if (split->count > 1) + extract_astring_cat(alloc, output, "<div style=\"display:flex;\">\n"); + for (s = 0; s < split->count; s++) { + if (split->count > 1) + { + if (total == 0) + { + extract_astring_catf(alloc, output, "<div>\n"); + } + else + { + extract_astring_catf(alloc, output, "<div style=\"width:%g%%;\">\n", 100.0*split->split[s]->weight/total); + } + } + ret = split_to_html(alloc, split->split[s], ppsubpage, output); + if (ret) + break; + if (split->count > 1) + extract_astring_cat(alloc, output, "</div>\n"); + } + if (split->count > 1) + extract_astring_cat(alloc, output, "</div>\n"); + return ret; + } else if (split->type == SPLIT_VERTICAL) { + int ret = 0; + for (s = 0; s < split->count; s++) { + ret = split_to_html(alloc, split->split[s], ppsubpage, output); + if (ret) + break; + } + return ret; + } + + /* We'll deal with the next subpage entry. Increment the pointer for the + * next caller. */ + subpage = **ppsubpage; + *ppsubpage = (*ppsubpage)+1; + + /* Output paragraphs and tables in order of increasing <y> coordinate. 
+ + Unfortunately the paragraph ordering we do in page->paragraphs[] + isn't quite right and results in bad ordering if ctm/trm matrices are + inconsistent. So we create our own list of paragraphs sorted strictly + by y coordinate of the first char of each paragraph. */ + if (extract_malloc(alloc, ¶graphs, sizeof(*paragraphs) * subpage->paragraphs_num)) goto end; + for (p = 0; p < subpage->paragraphs_num; ++p) + { + paragraphs[p] = subpage->paragraphs[p]; + } + qsort(paragraphs, subpage->paragraphs_num, sizeof(*paragraphs), compare_paragraph_y); + + if (0) + { + int p; + outf0("paragraphs are:"); + for (p=0; p<subpage->paragraphs_num; ++p) + { + paragraph_t* paragraph = subpage->paragraphs[p]; + line_t* line = paragraph->lines[0]; + span_t* span = line->spans[0]; + outf0(" p=%i: %s", p, extract_span_string(NULL, span)); + } + } + + p = 0; + t = 0; + for(;;) + { + double y_paragraph; + double y_table; + paragraph_t* paragraph = (p == subpage->paragraphs_num) ? NULL : paragraphs[p]; + table_t* table = (t == subpage->tables_num) ? NULL : subpage->tables[t]; + if (!paragraph && !table) break; + y_paragraph = (paragraph) ? paragraph->lines[0]->spans[0]->chars[0].y : DBL_MAX; + y_table = (table) ? 
table->pos.y : DBL_MAX; + outf("p=%i y_paragraph=%f", p, y_paragraph); + outf("t=%i y_table=%f", t, y_table); + if (paragraph && y_paragraph < y_table) + { + //extract_astring_catf(alloc, output, "<p>@@@ paragraph %i y=%f @@@)</p>\n", p, y_paragraph); + if (paragraph_to_html_content(alloc, &state, paragraph, 0 /*single_line*/, output)) goto end; + if (content_state_reset(alloc, &state, output)) goto end; + p += 1; + } + else if (table) + { + //extract_astring_catf(alloc, output, "<p>@@@ table %t y=%f @@@)</p>\n", p, y_table); + if (append_table(alloc, &state, table, output)) goto end; + t += 1; + } + } + extract_free(alloc, ¶graphs); + return 0; + +end: + extract_free(alloc, ¶graphs); + return -1; +} + int extract_document_to_html_content( extract_alloc_t* alloc, document_t* document, @@ -231,84 +342,35 @@ int extract_document_to_html_content( ) { int ret = -1; - int p; + int n; paragraph_t** paragraphs = NULL; - + (void) rotation; (void) images; - + extract_astring_cat(alloc, content, "<html>\n"); extract_astring_cat(alloc, content, "<body>\n"); - + /* Write paragraphs into <content>. */ - for (p=0; p<document->pages_num; ++p) + for (n=0; n<document->pages_num; ++n) { - extract_page_t* page = document->pages[p]; - int p; - int t; - content_state_t state; - content_state_init(&state); - extract_free(alloc, ¶graphs); - - /* Output paragraphs and tables in order of increasing <y> coordinate. + extract_page_t* page = document->pages[n]; + subpage_t **psubpage = page->subpages; - Unfortunately the paragraph ordering we do in page->paragraphs[] - isn't quite right and results in bad ordering if ctm/trm matrices are - inconsistent. So we create our own list of paragraphs sorted strictly - by y coordinate of the first char of each paragraph. 
*/ - if (extract_malloc(alloc, ¶graphs, sizeof(*paragraphs) * page->paragraphs_num)) goto end; - for (p = 0; p < page->paragraphs_num; ++p) - { - paragraphs[p] = page->paragraphs[p]; - } - qsort(paragraphs, page->paragraphs_num, sizeof(*paragraphs), compare_paragraph_y); - - if (0) - { - int p; - outf0("paragraphs are:"); - for (p=0; p<page->paragraphs_num; ++p) - { - paragraph_t* paragraph = page->paragraphs[p]; - line_t* line = paragraph->lines[0]; - span_t* span = line->spans[0]; - outf0(" p=%i: %s", p, extract_span_string(NULL, span)); - } - } + /* Every page gets its own div. */ + extract_astring_cat(alloc, content, "<div>\n"); - p = 0; - t = 0; - for(;;) - { - double y_paragraph; - double y_table; - paragraph_t* paragraph = (p == page->paragraphs_num) ? NULL : paragraphs[p]; - table_t* table = (t == page->tables_num) ? NULL : page->tables[t]; - if (!paragraph && !table) break; - y_paragraph = (paragraph) ? paragraph->lines[0]->spans[0]->chars[0].y : DBL_MAX; - y_table = (table) ? table->pos.y : DBL_MAX; - outf("p=%i y_paragraph=%f", p, y_paragraph); - outf("t=%i y_table=%f", t, y_table); - if (paragraph && y_paragraph < y_table) - { - //extract_astring_catf(alloc, content, "<p>@@@ paragraph %i y=%f @@@)</p>\n", p, y_paragraph); - if (paragraph_to_html_content(alloc, &state, paragraph, 0 /*single_line*/, content)) goto end; - if (content_state_reset(alloc, &state, content)) goto end; - p += 1; - } - else if (table) - { - //extract_astring_catf(alloc, content, "<p>@@@ table %t y=%f @@@)</p>\n", p, y_table); - if (append_table(alloc, &state, table, content)) goto end; - t += 1; - } - } + ret = split_to_html(alloc, page->split, &psubpage, content); + if (ret) + goto end; + + extract_astring_cat(alloc, content, "</div>\n"); } extract_astring_cat(alloc, content, "</body>\n"); extract_astring_cat(alloc, content, "</html>\n"); ret = 0; - end: +end: extract_free(alloc, ¶graphs); return ret; } diff --git a/extract/src/join.c b/extract/src/join.c index 
4425de3d..110d3901 100644 --- a/extract/src/join.c +++ b/extract/src/join.c @@ -356,7 +356,7 @@ On entry: On exit: If we succeed, we return 0, with *o_lines pointing to array of *o_lines_num line_t*'s, each pointing to a line_t. - + If <rects_num> is zero, each of these line_t's will contain pointers to items in <spans>; otherwise each of the line_t's will contain new spans which should be freed by the caller (spans are not necessarily wholy inside @@ -385,7 +385,7 @@ static int make_lines( int num_compatible; int num_joins; span_t* span = NULL; - + if (rects_num) { /* Make <lines> contain new span_t's and char_t's that are inside rects[]. */ @@ -412,7 +412,7 @@ static int make_lines( { extract_span_free(alloc, &span); } - + if (!spans[a]->chars_num) { /* All characters in this span are inside table, so remove @@ -446,7 +446,7 @@ static int make_lines( outfx("initial line a=%i: %s", a, line_string(lines[a])); } } - + num_compatible = 0; /* For each line, look for nearest aligned line, and append if found. */ @@ -459,7 +459,7 @@ static int make_lines( line_t* nearest_line = NULL; span_t* span_a; double angle_a; - + line_t* line_a = lines[a]; if (!line_a) { continue; @@ -580,7 +580,7 @@ static int make_lines( { continue; } - + if (1 && extract_span_char_last(span_a)->ucs != ' ' && span_char_first(span_b)->ucs != ' ' @@ -903,7 +903,7 @@ On exit: are undefined. 
*/ static int make_paragraphs( - extract_alloc_t* alloc, + extract_alloc_t* alloc, line_t** lines, int lines_num, paragraph_t*** o_paragraphs, @@ -941,7 +941,7 @@ static int make_paragraphs( double angle_a; int verbose; int b; - + paragraph_t* paragraph_a = paragraphs[a]; if (!paragraph_a) { /* This paragraph is empty - already been appended to a different @@ -1183,9 +1183,9 @@ static int make_paragraphs( return ret; } -static int s_join_page_rects( +static int s_join_subpage_rects( extract_alloc_t* alloc, - extract_page_t* page, + subpage_t* subpage, rect_t* rects, int rects_num, line_t*** lines, @@ -1198,8 +1198,8 @@ rects_num is zero. */ { if (make_lines( alloc, - page->spans, - &page->spans_num, + subpage->spans, + &subpage->spans_num, rects, rects_num, lines, @@ -1212,7 +1212,7 @@ rects_num is zero. */ paragraphs, paragraphs_num )) return -1; - + return 0; } @@ -1304,17 +1304,17 @@ void extract_cell_init(cell_t* cell) static int table_find_extend(cell_t** cells, int cells_num_x, int cells_num_y) -{ +{ /* Find cell extensions to right and down by looking at cells' .left and .above flags. - + For example for adjacent cells ABC..., we extend A to include cells BC.. until we reach a cell with .left set to one. - + ABCDE FGHIJ KLMNO - + When looking to extend cell A, we only look at cells in the same column or same row, (i.e. in the above example we look at BCDE and FK, and not at GHIJ and LMNO). @@ -1349,7 +1349,7 @@ static int table_find_extend(cell_t** cells, int cells_num_x, int cells_num_y) } cell->extend_down = yy - y; cell->rect.max.y = cells[(yy-1) * cells_num_x + x]->rect.max.y; - + /* Clear .above and .left in enclosed cells. 
*/ for (xx = x; xx < x + cell->extend_right; ++xx) { @@ -1384,7 +1384,7 @@ static int table_find_extend(cell_t** cells, int cells_num_x, int cells_num_y) } -static int table_find_cells_text(extract_alloc_t* alloc, extract_page_t* page, +static int table_find_cells_text(extract_alloc_t* alloc, subpage_t* subpage, cell_t** cells, int cells_num_x, int cells_num_y) /* Sets each cell to contain the text that is within the cell's boundary. We remove any found text from the page. */ @@ -1398,9 +1398,9 @@ remove any found text from the page. */ { cell_t* cell = cells[i]; if (!cell->above || !cell->left) continue; - if (s_join_page_rects( + if (s_join_subpage_rects( alloc, - page, + subpage, &cell->rect, 1 /*rects_num*/, &cell->lines, @@ -1409,17 +1409,17 @@ remove any found text from the page. */ &cell->paragraphs_num )) return -1; } - + /* Append the table we have found to page->tables[]. */ - if (extract_realloc(alloc, &page->tables, sizeof(*page->tables) * (page->tables_num + 1))) goto end; - if (extract_malloc(alloc, &page->tables[page->tables_num], sizeof(*page->tables[page->tables_num]))) goto end; - page->tables[page->tables_num]->pos.x = cells[0]->rect.min.x; - page->tables[page->tables_num]->pos.y = cells[0]->rect.min.y; - page->tables[page->tables_num]->cells = cells; - page->tables[page->tables_num]->cells_num_x = cells_num_x; - page->tables[page->tables_num]->cells_num_y = cells_num_y; - page->tables_num += 1; - + if (extract_realloc(alloc, &subpage->tables, sizeof(*subpage->tables) * (subpage->tables_num + 1))) goto end; + if (extract_malloc(alloc, &subpage->tables[subpage->tables_num], sizeof(*subpage->tables[subpage->tables_num]))) goto end; + subpage->tables[subpage->tables_num]->pos.x = cells[0]->rect.min.x; + subpage->tables[subpage->tables_num]->pos.y = cells[0]->rect.min.y; + subpage->tables[subpage->tables_num]->cells = cells; + subpage->tables[subpage->tables_num]->cells_num_x = cells_num_x; + subpage->tables[subpage->tables_num]->cells_num_y = 
cells_num_y; + subpage->tables_num += 1; + if (0) { /* For debugging. */ @@ -1442,24 +1442,24 @@ remove any found text from the page. */ } fprintf(stderr, "\n"); } - + } - + e = 0; end: return e; } -static int table_find(extract_alloc_t* alloc, extract_page_t* page, double y_min, double y_max) +static int table_find(extract_alloc_t* alloc, subpage_t* subpage, double y_min, double y_max) /* Finds single table made from lines whose y coordinates are in the range y_min..y_max. */ { - tablelines_t* all_h = &page->tablelines_horizontal; - tablelines_t* all_v = &page->tablelines_vertical; + tablelines_t* all_h = &subpage->tablelines_horizontal; + tablelines_t* all_v = &subpage->tablelines_vertical; int e = -1; int i; - + /* Find subset of vertical and horizontal lines that are within range y_min..y_max, and sort by y coordinate. */ tablelines_t tl_h = {NULL, 0}; @@ -1472,14 +1472,14 @@ y_min..y_max. */ int y; outf("y=(%f %f)", y_min, y_max); - + if (table_find_y_range(alloc, all_h, y_min, y_max, &tl_h)) goto end; if (table_find_y_range(alloc, all_v, y_min, y_max, &tl_v)) goto end; /* Suppress false coverity warning - qsort() does not dereference null pointer if nmemb is zero. */ /* coverity[var_deref_model] */ qsort(tl_v.tablelines, tl_v.tablelines_num, sizeof(*tl_v.tablelines), tablelines_compare_x); - + if (0) { /* Show raw lines info. */ @@ -1519,28 +1519,28 @@ y_min..y_max. 
*/ break; } cells_num_y += 1; - + for (j=0; j<tl_v.tablelines_num; ) { int j_next; int ii; int jj; cell_t* cell; - + for (j_next = j+1; j_next<tl_v.tablelines_num; ++j_next) { if (tl_v.tablelines[j_next].rect.min.x - tl_v.tablelines[j].rect.min.x > 0.5) break; } outf("i=%i j=%i tl_v.tablelines[j].rect=%s", i, j, extract_rect_string(&tl_v.tablelines[j].rect)); - + if (j_next == tl_v.tablelines_num) break; - + if (extract_realloc(alloc, &cells, sizeof(*cells) * (cells_num+1))) goto end; if (extract_malloc(alloc, &cells[cells_num], sizeof(*cells[cells_num]))) goto end; cell = cells[cells_num]; cells_num += 1; if (i==0) cells_num_x += 1; - + cell->rect.min.x = tl_v.tablelines[j].rect.min.x; cell->rect.min.y = tl_h.tablelines[i].rect.min.y; cell->rect.max.x = (j_next < tl_v.tablelines_num) ? tl_v.tablelines[j_next].rect.min.x : cell->rect.min.x; @@ -1553,7 +1553,7 @@ y_min..y_max. */ cell->lines_num = 0; cell->paragraphs = NULL; cell->paragraphs_num = 0; - + /* Set cell->above if there is a horizontal line above the cell. */ outf("Looking to set above for i=%i j=%i rect=%s", i, j, extract_rect_string(&cell->rect)); for (ii = i; ii < i_next; ++ii) @@ -1570,7 +1570,7 @@ y_min..y_max. */ break; } } - + /* Set cell->left if there is a vertical line to the left of the cell. */ for (jj = j; jj < j_next; ++jj) { @@ -1586,15 +1586,15 @@ y_min..y_max. */ break; } } - + j = j_next; } - + i = i_next; } - + assert(cells_num == cells_num_x * cells_num_y); - + /* Remove cols and rows where no cells have .above and .left - these will not appear. It also avoids spurious empty columns when table uses closely-spaced double lines as separators. */ @@ -1629,7 +1629,7 @@ y_min..y_max. */ cells_num_x -= 1; } } - + if (cells_num == 0) { e = 0; @@ -1637,9 +1637,9 @@ y_min..y_max. 
*/ } if (table_find_extend(cells, cells_num_x, cells_num_y)) goto end; - - if (table_find_cells_text(alloc, page, cells, cells_num_x, cells_num_y)) goto end; - + + if (table_find_cells_text(alloc, subpage, cells, cells_num_x, cells_num_y)) goto end; + e = 0; end: extract_free(alloc, &tl_h.tablelines); @@ -1656,9 +1656,9 @@ y_min..y_max. */ } -static int extract_page_tables_find_lines( +static int extract_subpage_tables_find_lines( extract_alloc_t* alloc, - extract_page_t* page + subpage_t* subpage ) /* Finds tables in <page> by looking for lines in page->tablelines_horizontal and page->tablelines_vertical that look like table dividers. @@ -1671,45 +1671,45 @@ Any text found inside tables is removed from page->spans[]. double margin = 1; int iv; int ih; - outf("page->tablelines_horizontal.tablelines_num=%i", page->tablelines_horizontal.tablelines_num); - outf("page->tablelines_vertical.tablelines_num=%i", page->tablelines_vertical.tablelines_num); - + outf("page->tablelines_horizontal.tablelines_num=%i", subpage->tablelines_horizontal.tablelines_num); + outf("page->tablelines_vertical.tablelines_num=%i", subpage->tablelines_vertical.tablelines_num); + /* Sort all lines by y coordinate. */ qsort( - page->tablelines_horizontal.tablelines, - page->tablelines_horizontal.tablelines_num, - sizeof(*page->tablelines_horizontal.tablelines), + subpage->tablelines_horizontal.tablelines, + subpage->tablelines_horizontal.tablelines_num, + sizeof(*subpage->tablelines_horizontal.tablelines), tablelines_compare_y ); qsort( - page->tablelines_vertical.tablelines, - page->tablelines_vertical.tablelines_num, - sizeof(*page->tablelines_vertical.tablelines), + subpage->tablelines_vertical.tablelines, + subpage->tablelines_vertical.tablelines_num, + sizeof(*subpage->tablelines_vertical.tablelines), tablelines_compare_y ); - + if (0) { /* Show info about lines. 
*/ int i; outf0("tablelines_horizontal:"); - for (i=0; i<page->tablelines_horizontal.tablelines_num; ++i) + for (i=0; i<subpage->tablelines_horizontal.tablelines_num; ++i) { outf0(" color=%f: %s", - page->tablelines_horizontal.tablelines[i].color, - extract_rect_string(&page->tablelines_horizontal.tablelines[i].rect) + subpage->tablelines_horizontal.tablelines[i].color, + extract_rect_string(&subpage->tablelines_horizontal.tablelines[i].rect) ); } outf0("tablelines_vertical:"); - for (i=0; i<page->tablelines_vertical.tablelines_num; ++i) + for (i=0; i<subpage->tablelines_vertical.tablelines_num; ++i) { outf0(" color=%f: %s", - page->tablelines_vertical.tablelines[i].color, - extract_rect_string(&page->tablelines_vertical.tablelines[i].rect) + subpage->tablelines_vertical.tablelines[i].color, + extract_rect_string(&subpage->tablelines_vertical.tablelines[i].rect) ); } } - + /* Look for completely separate vertical regions that define different tables, by looking for vertical gaps between the rects of each horizontal/vertical line. */ @@ -1722,22 +1722,22 @@ Any text found inside tables is removed from page->spans[]. tableline_t* tlv = NULL; tableline_t* tlh = NULL; tableline_t* tl; - if (iv < page->tablelines_vertical.tablelines_num) + if (iv < subpage->tablelines_vertical.tablelines_num) { - tlv = &page->tablelines_vertical.tablelines[iv]; + tlv = &subpage->tablelines_vertical.tablelines[iv]; } /* We only consider horizontal lines that are not white. This is a bit of a cheat to get the right behaviour with twotables_2.pdf. */ - while (ih < page->tablelines_horizontal.tablelines_num) + while (ih < subpage->tablelines_horizontal.tablelines_num) { - if (page->tablelines_horizontal.tablelines[ih].color == 1) + if (subpage->tablelines_horizontal.tablelines[ih].color == 1) { /* Ignore white horizontal lines. 
*/ ++ih; } else { - tlh = &page->tablelines_horizontal.tablelines[ih]; + tlh = &subpage->tablelines_horizontal.tablelines[ih]; break; } } @@ -1756,16 +1756,16 @@ Any text found inside tables is removed from page->spans[]. { outf("New table. maxy=%f miny=%f", maxy, miny); /* Find table. */ - table_find(alloc, page, miny - margin, maxy + margin); + table_find(alloc, subpage, miny - margin, maxy + margin); } miny = tl->rect.min.y; } if (tl->rect.max.y > maxy) maxy = tl->rect.max.y; } - + /* Find last table. */ - table_find(alloc, page, miny - margin, maxy + margin); - + table_find(alloc, subpage, miny - margin, maxy + margin); + return 0; } @@ -1793,9 +1793,9 @@ static void show_tables(table_t** tables, int tables_num) } } -static int extract_page_tables_find( +static int extract_subpage_tables_find( extract_alloc_t* alloc, - extract_page_t* page + subpage_t* subpage ) /* Find tables in <page>. @@ -1804,53 +1804,53 @@ will call other functions that find tables in different ways, e.g. by analysing an image of a page, or looking for blocks of whitespace in between chunks of text. */ { - if (extract_page_tables_find_lines(alloc, page)) return -1; + if (extract_subpage_tables_find_lines(alloc, subpage)) return -1; if (0) { outf0("=== tables from extract_page_tables_find_lines():"); - show_tables(page->tables, page->tables_num); + show_tables(subpage->tables, subpage->tables_num); } return 0; } -static int extract_document_join_page( +static int extract_join_subpage( extract_alloc_t* alloc, - extract_page_t* page + subpage_t* subpage ) /* Finds tables and paragraphs on <page>. */ { /* Find tables on this page first. This will remove text that is within tables from page->spans, so that text doesn't appearing more than once in the final output. */ - if (extract_page_tables_find(alloc, page)) return -1; + if (extract_subpage_tables_find(alloc, subpage)) return -1; /* Now join remaining spans into lines and paragraphs. 
*/ - if (s_join_page_rects( + if (s_join_subpage_rects( alloc, - page, + subpage, NULL /*rects*/, 0 /*rects_num*/, - &page->lines, - &page->lines_num, - &page->paragraphs, - &page->paragraphs_num + &subpage->lines, + &subpage->lines_num, + &subpage->paragraphs, + &subpage->paragraphs_num )) { - outf0("s_join_page_rects failed. page->spans_num=%i page->lines_num=%i page->paragraphs_num=%i", - page->spans_num, - page->lines_num, - page->paragraphs_num + outf0("s_join_subpage_rects failed. subpage->spans_num=%i subpage->lines_num=%i subpage->paragraphs_num=%i", + subpage->spans_num, + subpage->lines_num, + subpage->paragraphs_num ); return -1; } - + return 0; } -int extract_document_join(extract_alloc_t* alloc, document_t* document) +int extract_document_join(extract_alloc_t* alloc, document_t* document, int layout_analysis) { /* For each page in <document> we find tables and join spans into lines and paragraphs. @@ -1861,9 +1861,16 @@ int extract_document_join(extract_alloc_t* alloc, document_t* document) int p; for (p=0; p<document->pages_num; ++p) { extract_page_t* page = document->pages[p]; - - outf("processing page %i: num_spans=%i", p, page->spans_num); - if (extract_document_join_page(alloc, page)) return -1; + int c; + + if (layout_analysis && extract_page_analyse(alloc, page)) return -1; + + for (c=0; c<page->subpages_num; ++c) { + subpage_t* subpage = page->subpages[c]; + + outf("processing page %i, subpage %i: num_spans=%i", p, c, subpage->spans_num); + if (extract_join_subpage(alloc, subpage)) return -1; + } } return 0; diff --git a/extract/src/mem.c b/extract/src/mem.c index 1c3c96e6..788080f4 100644 --- a/extract/src/mem.c +++ b/extract/src/mem.c @@ -35,7 +35,7 @@ int extract_vasprintf(extract_alloc_t* alloc, char** out, const char* format, va } vsnprintf(*out, n + 1, format, va2); ret = 0; - + end: va_end(va2); return ret; diff --git a/extract/src/memento.c b/extract/src/memento.c index e62744be..e991b29b 100644 --- a/extract/src/memento.c +++ 
b/extract/src/memento.c @@ -1448,7 +1448,10 @@ int Memento_listBlocksNested(void) size_t end = (b->rawsize < MEMENTO_PTRSEARCH ? b->rawsize : MEMENTO_PTRSEARCH); size_t z; VALGRIND_MAKE_MEM_DEFINED(p, end); - end -= sizeof(void *)-1; + if (end > sizeof(void *)-1) + end -= sizeof(void *)-1; + else + end = 0; for (z = MEMENTO_SEARCH_SKIP; z < end; z += sizeof(void *)) { void *q = *(void **)(&p[z]); void **r; diff --git a/extract/src/memento.py b/extract/src/memento.py index 55171e39..6924736c 100755 --- a/extract/src/memento.py +++ b/extract/src/memento.py @@ -57,7 +57,7 @@ def main(): stdin = child.stdout else: stdin = sys.stdin - + openbsd = os.uname()[0] == 'OpenBSD' n = None segv = 0 @@ -70,14 +70,14 @@ def main(): if m: if not m.group(2): # Start of squeeze. - + if 0 and not openbsd: # Looks like memento's forked processes might terminate # before they get to output the 'Memory squeezing @ <N> # complete' line. # assert n is None, f'n={n} line={line!r}' - + n = int(m.group(1)) if n >= quiet_next: sys.stdout.write(f'quiet_next={quiet_next!r} n={n!r}: {line}') diff --git a/extract/src/misc-test.c b/extract/src/misc-test.c index 5e658e8f..ec10079d 100644 --- a/extract/src/misc-test.c +++ b/extract/src/misc-test.c @@ -24,7 +24,7 @@ static void s_check( else { ok = (ret == 0 && values_equal); } - + if (ok) printf(" ok: "); else printf(" fail:"); printf(" text=%16s", text); @@ -81,9 +81,9 @@ static void s_check_xml_parse() "< bar=>", "< =>", }; - + extract_xml_tag_init( &tag); - + for (i=0; i<sizeof(texts) / sizeof(texts[0]); ++i) { const char* text = texts[i]; @@ -103,7 +103,7 @@ static void s_check_xml_parse() e = extract_xml_pparse_next( buffer, &tag); s_check_e( e, "extract_xml_pparse_next()"); s_check_e( tag.name ? 
0 : 1, "tag.name is not null"); - + { int j; for (j=0; j<tag.attributes_num; ++j) @@ -122,17 +122,17 @@ int main(void) s_check_int("-20", -20, 0); s_check_int("-20b", 0, EINVAL); s_check_int("123456789123", 0, ERANGE); - + printf("testing extract_xml_str_to_uint():\n"); s_check_uint("2", 2, 0); s_check_uint("-20", 0, ERANGE); s_check_uint("-20b", 0, EINVAL); s_check_uint("123456789123", 0, ERANGE); - + s_check_xml_parse(); - + printf("s_num_fails=%i\n", s_num_fails); - + if (s_num_fails) { printf("Failed\n"); return 1; diff --git a/extract/src/odt.c b/extract/src/odt.c index 9e369078..e2e45e2d 100644 --- a/extract/src/odt.c +++ b/extract/src/odt.c @@ -128,7 +128,7 @@ static int s_odt_styles_definitions( ); extract_astring_cat(alloc, out, "<style:paragraph-properties style:writing-mode=\"lr-tb\"/>\n"); extract_astring_cat(alloc, out, "</style:style>\n"); - + /* Style for images. */ extract_astring_cat(alloc, out, "<style:style style:name=\"fr1\" style:family=\"graphic\" style:parent-style-name=\"Graphics\">\n"); extract_astring_cat(alloc, out, "<style:graphic-properties" @@ -156,8 +156,8 @@ static int s_odt_styles_definitions( " draw:color-mode=\"standard\"" "/>\n"); extract_astring_cat(alloc, out, "</style:style>\n"); - - + + if (extract_astring_cat(alloc, out, "</office:automatic-styles>")) return -1; return 0; } @@ -308,9 +308,9 @@ change font. 
*/ if (s_odt_run_finish(alloc, content_state, content)) goto end; } if (s_odt_paragraph_finish(alloc, content)) goto end; - + e = 0; - + end: return e; } @@ -335,14 +335,14 @@ static int s_odt_append_image( ); extract_astring_cat(alloc, content, "</draw:frame>\n"); extract_astring_cat(alloc, content, "</text:p>\n"); - + return 0; } static int s_odt_output_rotated_paragraphs( extract_alloc_t* alloc, - extract_page_t* page, + subpage_t* subpage, int paragraph_begin, int paragraph_end, double rotation_rad, @@ -361,13 +361,13 @@ static int s_odt_output_rotated_paragraphs( int p; double pt_to_inch = 1/72.0; outf("rotated paragraphs: rotation_rad=%f (x y)=(%f %f) (w h)=(%f %f)", rotation_rad, x_pt, y_pt, w_pt, h_pt); - + // https://docs.oasis-open.org/office/OpenDocument/v1.3/cs02/part3-schema/OpenDocument-v1.3-cs02-part3-schema.html#attribute-draw_transform // says rotation is in degrees, but we seem to require -radians. // - + if (!e) e = extract_astring_cat(alloc, content, "\n"); - + if (!e) e = extract_astring_cat(alloc, content, "<text:p text:style-name=\"Standard\">\n"); if (!e) e = extract_astring_catf(alloc, content, "<draw:frame" " text:anchor-type=\"paragraph\"" @@ -388,19 +388,19 @@ static int s_odt_output_rotated_paragraphs( y_pt * pt_to_inch ); if (!e) e = extract_astring_cat(alloc, content, "<draw:text-box>\n"); - + for (p=paragraph_begin; p<paragraph_end; ++p) { - paragraph_t* paragraph = page->paragraphs[p]; + paragraph_t* paragraph = subpage->paragraphs[p]; if (!e) e = s_document_to_odt_content_paragraph(alloc, content_state, paragraph, content, styles); } - + if (!e) e = extract_astring_cat(alloc, content, "\n"); if (!e) e = extract_astring_cat(alloc, content, "</draw:text-box>\n"); if (!e) e = extract_astring_cat(alloc, content, "</draw:frame>\n"); - + if (!e) e = extract_astring_cat(alloc, content, "</text:p>\n"); - + return e; } @@ -409,7 +409,7 @@ static int s_odt_append_table(extract_alloc_t* alloc, table_t* table, extract_as { int e = -1; int y; - 
+ { int x; static int table_number = 0; @@ -438,7 +438,7 @@ static int s_odt_append_table(extract_alloc_t* alloc, table_t* table, extract_as if (extract_astring_cat(alloc, content, " <table:table-row>\n" )) goto end; - + for (x=0; x<table->cells_num_x; ++x) { cell_t* cell = table->cells[y*table->cells_num_x + x]; @@ -447,7 +447,7 @@ static int s_odt_append_table(extract_alloc_t* alloc, table_t* table, extract_as if (extract_astring_cat(alloc, content, " <table:covered-table-cell/>\n")) goto end; continue; } - + if (extract_astring_cat(alloc, content, " <table:table-cell")) goto end; if (cell->extend_right > 1) { @@ -458,7 +458,7 @@ static int s_odt_append_table(extract_alloc_t* alloc, table_t* table, extract_as if (extract_astring_catf(alloc, content, " table:number-rows-spanned=\"%i\"", cell->extend_down)) goto end; } if (extract_astring_catf(alloc, content, ">\n")) goto end; - + /* Write contents of this cell. */ { int p; @@ -482,7 +482,7 @@ static int s_odt_append_table(extract_alloc_t* alloc, table_t* table, extract_as } if (extract_astring_cat(alloc, content, " </table:table>\n")) goto end; e = 0; - + end: return e; } @@ -490,7 +490,7 @@ static int s_odt_append_table(extract_alloc_t* alloc, table_t* table, extract_as static int s_odt_append_rotated_paragraphs( extract_alloc_t* alloc, - extract_page_t* page, + subpage_t* subpage, content_state_t* content_state, int* p, int* text_box_id, @@ -499,7 +499,7 @@ static int s_odt_append_rotated_paragraphs( extract_astring_t* content, extract_odt_styles_t* styles ) -/* Appends paragraphs with same rotation, starting with page->paragraphs[*p] +/* Appends paragraphs with same rotation, starting with subpage->paragraphs[*p] and updates *p. */ { /* Find extent of paragraphs with this same rotation. extent @@ -509,7 +509,7 @@ and updates *p. 
*/ point_t extent = {0, 0}; int p0 = *p; int p1; - paragraph_t* paragraph = page->paragraphs[*p]; + paragraph_t* paragraph = subpage->paragraphs[*p]; outf("rotate=%.2frad=%.1fdeg ctm: ef=(%f %f) abcd=(%f %f %f %f)", rotate, rotate * 180 / pi, @@ -546,9 +546,9 @@ and updates *p. */ ctm->a, ctm->b, ctm->c, ctm->d); } - for (*p=p0; *p<page->paragraphs_num; ++*p) + for (*p=p0; *p<subpage->paragraphs_num; ++*p) { - paragraph = page->paragraphs[*p]; + paragraph = subpage->paragraphs[*p]; ctm = ¶graph->lines[0]->spans[0]->ctm; rotate = atan2(ctm->b, ctm->a); if (rotate != rotate0) @@ -597,7 +597,7 @@ and updates *p. */ if (s_odt_output_rotated_paragraphs( alloc, - page, + subpage, p0, p1, rotate, @@ -612,15 +612,15 @@ and updates *p. */ )) goto end; *p = p1 - 1; e = 0; - + end: return e; } -int extract_document_to_odt_content( +static int extract_page_to_odt_content( extract_alloc_t* alloc, - document_t* document, + extract_page_t* page, int spacing, int rotation, int images, @@ -630,12 +630,12 @@ int extract_document_to_odt_content( { int ret = -1; int text_box_id = 0; - int p; + int c; /* Write paragraphs into <content>. */ - for (p=0; p<document->pages_num; ++p) + for (c=0; c<page->subpages_num; ++c) { - extract_page_t* page = document->pages[p]; + subpage_t* subpage = page->subpages[c]; int p = 0; int t = 0; content_state_t content_state; @@ -644,17 +644,17 @@ int extract_document_to_odt_content( content_state.font.bold = 0; content_state.font.italic = 0; content_state.ctm_prev = NULL; - + for(;;) { - paragraph_t* paragraph = (p == page->paragraphs_num) ? NULL : page->paragraphs[p]; - table_t* table = (t == page->tables_num) ? NULL : page->tables[t]; + paragraph_t* paragraph = (p == subpage->paragraphs_num) ? NULL : subpage->paragraphs[p]; + table_t* table = (t == subpage->tables_num) ? NULL : subpage->tables[t]; double y_paragraph; double y_table; if (!paragraph && !table) break; y_paragraph = (paragraph) ? 
paragraph->lines[0]->spans[0]->chars[0].y : DBL_MAX; y_table = (table) ? table->pos.y : DBL_MAX; - + if (paragraph && y_paragraph < y_table) { const matrix_t* ctm = ¶graph->lines[0]->spans[0]->ctm; @@ -683,7 +683,7 @@ int extract_document_to_odt_content( if (rotation && rotate != 0) { - if (s_odt_append_rotated_paragraphs(alloc, page, &content_state, &p, &text_box_id, ctm, rotate, content, styles)) goto end; + if (s_odt_append_rotated_paragraphs(alloc, subpage, &content_state, &p, &text_box_id, ctm, rotate, content, styles)) goto end; } else { @@ -697,15 +697,15 @@ int extract_document_to_odt_content( t += 1; } } - + outf("images=%i", images); if (images) { int i; - outf("page->images_num=%i", page->images_num); - for (i=0; i<page->images_num; ++i) + outf("subpage->images_num=%i", subpage->images_num); + for (i=0; i<subpage->images_num; ++i) { - s_odt_append_image(alloc, content, &page->images[i]); + s_odt_append_image(alloc, content, &subpage->images[i]); } } } @@ -716,6 +716,38 @@ int extract_document_to_odt_content( return ret; } +int extract_document_to_odt_content( + extract_alloc_t* alloc, + document_t* document, + int spacing, + int rotation, + int images, + extract_astring_t* content, + extract_odt_styles_t* styles + ) +{ + int p; + int ret = 0; + + /* Write paragraphs into <content>. */ + for (p=0; p<document->pages_num; ++p) + { + extract_page_t* page = document->pages[p]; + + ret = extract_page_to_odt_content( + alloc, + page, + spacing, + rotation, + images, + content, + styles + ); + if (ret) break; + }; + + return ret; +} #if 0 static int s_find_mid(const char* text, const char* begin, const char* end, const char** o_begin, const char** o_end) @@ -749,7 +781,7 @@ int extract_odt_content_item( extract_astring_t temp; extract_astring_init(&temp); *text2 = NULL; - + (void) images; if (0) {} @@ -771,10 +803,10 @@ int extract_odt_content_item( &text_intermediate )) goto end; outf("text_intermediate: %s", text_intermediate); - + /* Convert <styles> to text. 
*/ if (s_odt_styles_definitions(alloc, styles, &styles_definitions)) goto end; - + /* To make tables work, we seem to need to specify table and column styles, and these can be empty. todo: maybe specify exact sizes based on the pdf table and cell dimensions. */ @@ -783,7 +815,7 @@ int extract_odt_content_item( "<style:style style:name=\"extract.table\" style:family=\"table\"/>\n" "<style:style style:name=\"extract.table.column\" style:family=\"table-column\"/>\n" )) goto end; - + /* Replace '<office:automatic-styles/>' with text from <styles_definitions>. */ e = extract_content_insert( @@ -845,7 +877,7 @@ int extract_odt_content_item( return e; } - + int extract_odt_write_template( extract_alloc_t* alloc, @@ -867,7 +899,7 @@ int extract_odt_write_template( assert(path_out); assert(path_template); - + if (extract_check_path_shell_safe(path_out)) { outf("path_out is unsafe: %s", path_out); @@ -896,7 +928,7 @@ int extract_odt_write_template( /* Might be nice to iterate through all items in path_tempdir, but for now we look at just the items that we know extract_odt_content_item() will modify. 
*/ - + { const char* names[] = { @@ -912,7 +944,7 @@ int extract_odt_write_template( extract_free(alloc, &text2); if (extract_asprintf(alloc, &path, "%s/%s", path_tempdir, name) < 0) goto end; if (extract_read_all_path(alloc, path, &text)) goto end; - + outf("before extract_odt_content_item() styles->styles_num=%i", styles->styles_num); if (extract_odt_content_item( alloc, @@ -928,7 +960,7 @@ int extract_odt_write_template( outf("extract_odt_content_item() failed"); goto end; } - + outf("after extract_odt_content_item styles->styles_num=%i", styles->styles_num); { @@ -954,7 +986,7 @@ int extract_odt_write_template( if (extract_asprintf(alloc, &path, "%s/Pictures/%s", path_tempdir, image->name) < 0) goto end; if (extract_write_all(image->data, image->data_size, path)) goto end; } - + outf("Zipping tempdir to create %s", path_out); { const char* path_out_leaf = strrchr(path_out, '/'); diff --git a/extract/src/odt.h b/extract/src/odt.h index 68fa0262..cf523a68 100644 --- a/extract/src/odt.h +++ b/extract/src/odt.h @@ -34,7 +34,7 @@ extract_odt_write_template() to be inserted into an odt archive. 
*/ int extract_odt_write_template( - extract_alloc_t* alloc, + extract_alloc_t* alloc, extract_astring_t* contentss, int contentss_num, extract_odt_styles_t* styles, diff --git a/extract/src/odt_template.c b/extract/src/odt_template.c index 5bc66b81..98e4f2dd 100644 --- a/extract/src/odt_template.c +++ b/extract/src/odt_template.c @@ -26,7 +26,7 @@ const odt_template_item_t odt_template_items[] = "<text:sequence-decl text:display-outline-level=\"0\" text:name=\"Figure\"/></text:sequence-decls>" "<text:p text:style-name=\"Standard\"/></office:text></office:body></office:document-content>" }, - + { "manifest.rdf", "\x3c\x3f\x78\x6d\x6c\x20\x76\x65\x72\x73\x69\x6f\x6e\x3d\x22" @@ -87,7 +87,7 @@ const odt_template_item_t odt_template_items[] = "\x72\x69\x70\x74\x69\x6f\x6e\x3e\x0a\x3c\x2f\x72\x64\x66\x3a\x52" "\x44\x46\x3e\x0a" }, - + { "meta.xml", "" @@ -102,14 +102,14 @@ const odt_template_item_t odt_template_items[] = "<meta:editing-cycles>1</meta:editing-cycles>" "<meta:document-statistic meta:table-count=\"0\" meta:image-count=\"0\" meta:object-count=\"0\" meta:page-count=\"1\" meta:paragraph-count=\"0\" meta:word-count=\"0\" meta:character-count=\"0\" meta:non-whitespace-character-count=\"0\"/></office:meta></office:document-meta>" }, - + { "mimetype", "\x61\x70\x70\x6c\x69\x63\x61\x74\x69\x6f\x6e\x2f\x76\x6e\x64" "\x2e\x6f\x61\x73\x69\x73\x2e\x6f\x70\x65\x6e\x64\x6f\x63\x75\x6d" "\x65\x6e\x74\x2e\x74\x65\x78\x74" }, - + { "settings.xml", "" @@ -238,7 +238,7 @@ const odt_template_item_t odt_template_items[] = "<config:config-item config:name=\"ChartAutoUpdate\" config:type=\"boolean\">true</config:config-item>" "<config:config-item config:name=\"AddParaTableSpacing\" config:type=\"boolean\">true</config:config-item></config:config-item-set></office:settings></office:document-settings>" }, - + { "styles.xml", "" @@ -320,7 +320,7 @@ const odt_template_item_t odt_template_items[] = "<office:master-styles>" "<style:master-page style:name=\"Standard\" 
style:page-layout-name=\"Mpm1\"/></office:master-styles></office:document-styles>" }, - + { "META-INF/manifest.xml", "" @@ -345,7 +345,7 @@ const odt_template_item_t odt_template_items[] = "<manifest:file-entry manifest:full-path=\"Thumbnails/thumbnail.png\" manifest:media-type=\"image/png\"/>\n" "</manifest:manifest>" }, - + { "Thumbnails/thumbnail.png", "\x89\x50\x4e\x47\x0d\x0a\x1a\x0a\x00\x00\x00\x0d\x49\x48\x44" @@ -359,7 +359,7 @@ const odt_template_item_t odt_template_items[] = "\x00\x78\x18\xc7\x00\x00\x01\xf9\xd2\xb5\x9a\x00\x00\x00\x00\x49" "\x45\x4e\x44\xae\x42\x60\x82" }, - + }; int odt_template_items_num = 8; diff --git a/extract/src/outf.c b/extract/src/outf.c index de7662f6..9e955f83 100644 --- a/extract/src/outf.c +++ b/extract/src/outf.c @@ -26,7 +26,7 @@ void (extract_outf)( if (level > extract_outf_verbose) { return; } - + if (ln) { fprintf(stderr, "%s:%i:%s: ", file, line, fn); } diff --git a/extract/src/rect.c b/extract/src/rect.c new file mode 100644 index 00000000..2ea70962 --- /dev/null +++ b/extract/src/rect.c @@ -0,0 +1,57 @@ +#include "../include/extract.h" +#include "document.h" + +static inline double +mind(double a, double b) +{ + return (a < b) ? a : b; +} + +static inline double +maxd(double a, double b) +{ + return (a > b) ? 
a : b; +} + +rect_t extract_rect_intersect(rect_t a, rect_t b) +{ + rect_t r; + + r.min.x = maxd(a.min.x, b.min.x); + r.min.y = maxd(a.min.y, b.min.y); + r.max.x = mind(a.max.x, b.max.x); + r.max.y = mind(a.max.y, b.max.y); + + return r; +} + +rect_t extract_rect_union(rect_t a, rect_t b) +{ + rect_t r; + + r.min.x = mind(a.min.x, b.min.x); + r.min.y = mind(a.min.y, b.min.y); + r.max.x = maxd(a.max.x, b.max.x); + r.max.y = maxd(a.max.y, b.max.y); + + return r; +} + +int extract_rect_contains_rect(rect_t a, rect_t b) +{ + if (a.min.x > b.min.x) + return 0; + if (a.min.y > b.min.y) + return 0; + if (a.max.x < b.max.x) + return 0; + if (a.max.y < b.max.y) + return 0; + + return 1; +} + +int extract_rect_valid(rect_t a) +{ + return (a.min.x <= a.max.x && a.min.y <= a.max.y); +} diff --git a/extract/src/text.c b/extract/src/text.c index e75e3e69..c035edeb 100644 --- a/extract/src/text.c +++ b/extract/src/text.c @@ -25,11 +25,11 @@ int extract_content_insert( const char* single = NULL; extract_astring_t out; extract_astring_init(&out); - + assert(single_name || mid_begin_name || mid_end_name); - + if (single_name) single = strstr(original, single_name); - + if (single) { outf("Have found single_name='%s', using in preference to mid_begin_name=%s mid_end_name=%s", @@ -81,11 +81,11 @@ int extract_content_insert( */ /* coverity[var_deref_model] */ if (extract_astring_cat(alloc, &out, mid_end)) goto end; - + *o_out = out.chars; out.chars = NULL; e = 0; - + end: if (e) { extract_astring_free(alloc, &out); @@ -93,4 +93,3 @@ int extract_content_insert( } return e; } - diff --git a/extract/src/xml.c b/extract/src/xml.c index 24116f6d..87ba5d0c 100644 --- a/extract/src/xml.c +++ b/extract/src/xml.c @@ -408,7 +408,7 @@ int extract_xml_pparse_next(extract_buffer_t* buffer, extract_xml_tag_t* out) char c; int i; extract_alloc_t* alloc = extract_buffer_alloc(buffer); - + if (0) outf("out is: %s", extract_xml_tag_string(extract_buffer_alloc(buffer), out)); assert(buffer); 
extract_xml_tag_free(alloc, out); @@ -512,4 +512,3 @@ int extract_xml_pparse_next(extract_buffer_t* buffer, extract_xml_tag_t* out) } return ret; } - diff --git a/extract/src/zip-test.c b/extract/src/zip-test.c index 67082342..0e2d07aa 100644 --- a/extract/src/zip-test.c +++ b/extract/src/zip-test.c @@ -39,7 +39,7 @@ static int s_show(const char* filename) datasize = n; outf("datasize=%zi", datasize); fclose(f); - + /* look for End of central directory (EOCD) record. */ uint32_t magic = 0x06054b50; char* pos = data + datasize - 22; @@ -70,7 +70,7 @@ static int s_show(const char* filename) outf(" offset_cd=%i", offset_cd); outf(" comment_length=%i", comment_length); outf(" comment=%s", comment); - + if (pos != data + datasize - 22 - comment_length) { outf("file does not end with EOCD. datasize=%zi pos-data=%li datasize-22-comment_length=%zi", datasize, @@ -83,7 +83,7 @@ static int s_show(const char* filename) the file. */ assert(0); } - + pos = data + offset_cd; int i; for (i=0; i<num_records_on_disk; ++i) { @@ -110,7 +110,7 @@ static int s_show(const char* filename) assert(filename); memcpy(filename, pos+46, filename_length); filename[filename_length] = 0; - + char* comment = extract_malloc(filecomment_length + 1); assert(comment); memcpy(comment, pos+46+filename_length+extrafield_length, filecomment_length); @@ -133,7 +133,7 @@ static int s_show(const char* filename) outf(" external_attributes=0x%x", external_attributes); outf(" offset=%i", offset); outf(" filename=%s", filename); - + if (extrafield_length) { outf( " extra:"); fprintf(stderr, " "); @@ -146,14 +146,14 @@ static int s_show(const char* filename) } fputc('\n', stderr); } - + /* show local file header. 
*/ { char* local_pos = data + offset; outf(" local header offset=%i", i, local_pos - data); magic = 0x04034b50; assert(!memcmp(local_pos, &magic, sizeof(magic))); - + uint16_t version_needed = *(uint16_t*)(local_pos+4); uint16_t general_bit_flag = *(uint16_t*)(local_pos+6); uint16_t compression_method = *(uint16_t*)(local_pos+8); @@ -164,7 +164,7 @@ static int s_show(const char* filename) uint32_t size_uncompressed = *(uint32_t*)(local_pos+22); uint16_t filename_length = *(uint16_t*)(local_pos+26); uint16_t extrafield_length = *(uint16_t*)(local_pos+28); - + char* filename = extract_malloc(filename_length + 1); assert(filename); memcpy(filename, local_pos+30, filename_length); @@ -201,15 +201,15 @@ static int s_show(const char* filename) } } - + outf(" comment=%s", comment); - + pos += 46 + filename_length + extrafield_length + filecomment_length; } - + outf("finished"); extract_free(&data); - + return 0; } diff --git a/extract/src/zip.c b/extract/src/zip.c index 691b743b..baf7bb1b 100644 --- a/extract/src/zip.c +++ b/extract/src/zip.c @@ -30,7 +30,7 @@ typedef struct uint32_t offset; uint16_t attr_internal; uint32_t attr_external; - + } extract_zip_cd_file_t; struct extract_zip_t @@ -38,14 +38,14 @@ struct extract_zip_t extract_buffer_t* buffer; extract_zip_cd_file_t* cd_files; int cd_files_num; - + /* errno_ is set to non-zero if any operation fails; avoids need to check after every small output operation. */ int errno_; int eof; uint16_t compression_method; int compress_level; - + /* Defaults for various values in zip file headers etc. 
*/ uint16_t mtime; uint16_t mdate; @@ -62,9 +62,9 @@ int extract_zip_open(extract_buffer_t* buffer, extract_zip_t** o_zip) int e = -1; extract_zip_t* zip; extract_alloc_t* alloc = extract_buffer_alloc(buffer); - + if (extract_malloc(alloc, &zip, sizeof(*zip))) goto end; - + zip->cd_files = NULL; zip->cd_files_num = 0; zip->buffer = buffer; @@ -72,10 +72,10 @@ int extract_zip_open(extract_buffer_t* buffer, extract_zip_t** o_zip) zip->eof = 0; zip->compression_method = Z_DEFLATED; zip->compress_level = Z_DEFAULT_COMPRESSION; - + /* We could maybe convert current date/time to the ms-dos format required here, but using zeros doesn't seem to make a difference to Word etc. */ - + { time_t t = time(NULL); struct tm* tm; @@ -107,21 +107,21 @@ int extract_zip_open(extract_buffer_t* buffer, extract_zip_t** o_zip) zip->mdate = 0; } } - + /* These are all copied from command-line zip on unix. */ zip->version_creator = (0x3 << 8) + 30; /* 0x3 is unix, 30 means 3.0. */ zip->version_extract = 10; /* 10 means 1.0. */ zip->general_purpose_bit_flag = 0; zip->file_attr_internal = 0; - + /* We follow command-line zip which uses 0x81a40000 which is octal 0100644:0. (0100644 is S_IFREG (regular file) plus rw-r-r. See stat(2) for details.) */ zip->file_attr_external = (0100644 << 16) + 0; if (extract_strdup(alloc, "Artifex", &zip->archive_comment)) goto end; - + e = 0; - + end: if (e) { if (zip) extract_free(alloc, &zip->archive_comment); @@ -184,11 +184,11 @@ static int s_write_compressed( z_stream zstream = {0}; /* Initialise to keep Coverity quiet. */ if (zip->errno_) return -1; if (zip->eof) return +1; - + zstream.zalloc = s_zalloc; zstream.zfree = s_zfree; zstream.opaque = zip; - + /* We need to write raw deflate data, so we use deflateInit2() with -ve windowBits. The values we use are deflateInit()'s defaults. 
*/ ze = deflateInit2( @@ -206,11 +206,11 @@ static int s_write_compressed( outf("deflateInit2() failed ze=%i", ze); return -1; } - + /* Set zstream to read from specified data. */ zstream.next_in = (void*) data; zstream.avail_in = (unsigned) data_length; - + /* We increment *o_compressed_length gradually so that if we return an error, we still indicate how many butes of compressed data have been written. */ @@ -218,7 +218,7 @@ static int s_write_compressed( { *o_compressed_length = 0; } - + for(;;) { /* todo: write an extract_buffer_cache() function so we can write @@ -329,7 +329,7 @@ int extract_zip_write_file( int e = -1; extract_zip_cd_file_t* cd_file = NULL; extract_alloc_t* alloc = extract_buffer_alloc(zip->buffer); - + if (data_length > INT_MAX) { assert(0); errno = EINVAL; @@ -344,7 +344,7 @@ int extract_zip_write_file( )) goto end; cd_file = &zip->cd_files[zip->cd_files_num]; cd_file->name = NULL; - + cd_file->mtime = zip->mtime; cd_file->mdate = zip->mdate; cd_file->crc_sum = (int32_t) crc32(crc32(0, NULL, 0), data, (int) data_length); @@ -358,7 +358,7 @@ int extract_zip_write_file( cd_file->attr_internal = zip->file_attr_internal; cd_file->attr_external = zip->file_attr_external; if (!cd_file->name) goto end; - + /* Write local file header. If we are using compression, we set bit 3 of General purpose bit flag and write zeros for crc-32, compressed size and uncompressed size; then we write the actual values in data descriptor after @@ -390,14 +390,14 @@ int extract_zip_write_file( s_write_string(zip, cd_file->name); /* File name */ s_write(zip, extra_local, sizeof(extra_local)-1); /* Extra field */ } - + if (zip->compression_method) { /* Write compressed data. */ size_t data_length_compressed; s_write_compressed(zip, data, data_length, &data_length_compressed); cd_file->size_compressed = (int) data_length_compressed; - + /* Write data descriptor. 
*/ s_write_uint32(zip, 0x08074b50); /* Data descriptor signature */ s_write_uint32(zip, cd_file->crc_sum); /* CRC-32 of uncompressed data */ @@ -408,14 +408,14 @@ int extract_zip_write_file( { s_write(zip, data, data_length); } - + if (zip->errno_) e = -1; else if (zip->eof) e = +1; else e = 0; - - + + end: - + if (e) { /* Leave zip->cd_files_num unchanged, so calling extract_zip_close() will write out any earlier files. Free cd_file->name to avoid leak. */ @@ -425,7 +425,7 @@ int extract_zip_write_file( /* cd_files[zip->cd_files_num] is valid. */ zip->cd_files_num += 1; } - + return e; } @@ -443,7 +443,7 @@ int extract_zip_close(extract_zip_t** pzip) alloc = extract_buffer_alloc(zip->buffer); pos = extract_buffer_pos(zip->buffer); len = 0; - + /* Write Central directory file headers, freeing data as we go. */ for (i=0; i<zip->cd_files_num; ++i) { const char extra[] = ""; @@ -472,7 +472,7 @@ int extract_zip_close(extract_zip_t** pzip) extract_free(alloc, &cd_file->name); } extract_free(alloc, &zip->cd_files); - + /* Write End of central directory record. */ s_write_uint32(zip, 0x06054b50); s_write_uint16(zip, 0); /* Number of this disk */ @@ -481,16 +481,16 @@ int extract_zip_close(extract_zip_t** pzip) s_write_uint16(zip, (uint16_t) zip->cd_files_num); /* Total number of central directory records */ s_write_uint32(zip, (int) len); /* Size of central directory (bytes) */ s_write_uint32(zip, (int) pos); /* Offset of start of central directory, relative to start of archive */ - + s_write_uint16(zip, (uint16_t) strlen(zip->archive_comment)); /* Comment length (n) */ s_write_string(zip, zip->archive_comment); extract_free(alloc, &zip->archive_comment); - + if (zip->errno_) e = -1; else if (zip->eof) e = +1; else e = 0; - + extract_free(alloc, pzip); - + return e; } |