[api-minor] Add a parameter to PDFPageProxy_getTextContent that enables replacing of all whitespace with standard spaces in the textLayer (issue 6612)

This patch goes a bit further than issue 6612 requires, and replaces all kinds of whitespace with standard spaces.

When testing this locally, it actually seemed to slightly improve two existing test-cases (`tracemonkey-text` and `taro-text`).

Fixes 6612.
This commit is contained in:
Jonas Jenwald 2015-11-23 16:57:43 +01:00
parent c2dfe9e9a9
commit 6dfe53b976
12 changed files with 75 additions and 24 deletions

View file

@ -908,12 +908,15 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
});
},
getTextContent: function PartialEvaluator_getTextContent(stream, task,
resources,
stateManager) {
getTextContent:
function PartialEvaluator_getTextContent(stream, task, resources,
stateManager,
normalizeWhitespace) {
stateManager = (stateManager || new StateManager(new TextState()));
var WhitespaceRegexp = /\s/g;
var textContent = {
items: [],
styles: Object.create(null)
@ -1027,11 +1030,23 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
return textContentItem;
}
function replaceWhitespace(str) {
// Replaces all whitespaces with standard spaces (0x20), to avoid
// alignment issues between the textLayer and the canvas if the text
// contains e.g. tabs (fixes issue6612.pdf).
var i = 0, ii = str.length, code;
while (i < ii && (code = str.charCodeAt(i)) >= 0x20 && code <= 0x7F) {
i++;
}
return (i < ii ? str.replace(WhitespaceRegexp, ' ') : str);
}
function runBidiTransform(textChunk) {
var str = textChunk.str.join('');
var bidiResult = PDFJS.bidi(str, -1, textChunk.vertical);
return {
str: bidiResult.str,
str: (normalizeWhitespace ? replaceWhitespace(bidiResult.str) :
bidiResult.str),
dir: bidiResult.dir,
width: textChunk.width,
height: textChunk.height,
@ -1352,8 +1367,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
}
return self.getTextContent(xobj, task,
xobj.dict.get('Resources') || resources, stateManager).
then(function (formTextContent) {
xobj.dict.get('Resources') || resources, stateManager,
normalizeWhitespace).then(function (formTextContent) {
Util.appendToArray(textContent.items, formTextContent.items);
Util.extendObj(textContent.styles, formTextContent.styles);
stateManager.restore();