Allow inserting several annotations under the same parent in the structure tree

While testing stamp insertion with the added PDF, I noticed that tags using an MCID
weren't considered when trying to attach an annotation to them.
This commit is contained in:
Calixte Denizet 2024-04-22 23:11:04 +02:00
parent dafc4f66c2
commit 45fa867577
6 changed files with 243 additions and 112 deletions

View file

@ -378,6 +378,12 @@ class RefSetCache {
clear() { clear() {
this._map.clear(); this._map.clear();
} }
*items() {
for (const [ref, value] of this._map) {
yield [Ref.fromString(ref), value];
}
}
} }
function isName(v, name) { function isName(v, name) {

View file

@ -119,19 +119,19 @@ class StructTreeRoot {
newRefs, newRefs,
}) { }) {
const root = pdfManager.catalog.cloneDict(); const root = pdfManager.catalog.cloneDict();
const cache = new RefSetCache();
cache.put(catalogRef, root);
const structTreeRootRef = xref.getNewTemporaryRef(); const structTreeRootRef = xref.getNewTemporaryRef();
root.set("StructTreeRoot", structTreeRootRef); root.set("StructTreeRoot", structTreeRootRef);
const buffer = [];
await writeObject(catalogRef, root, buffer, xref);
newRefs.push({ ref: catalogRef, data: buffer.join("") });
const structTreeRoot = new Dict(xref); const structTreeRoot = new Dict(xref);
structTreeRoot.set("Type", Name.get("StructTreeRoot")); structTreeRoot.set("Type", Name.get("StructTreeRoot"));
const parentTreeRef = xref.getNewTemporaryRef(); const parentTreeRef = xref.getNewTemporaryRef();
structTreeRoot.set("ParentTree", parentTreeRef); structTreeRoot.set("ParentTree", parentTreeRef);
const kids = []; const kids = [];
structTreeRoot.set("K", kids); structTreeRoot.set("K", kids);
cache.put(structTreeRootRef, structTreeRoot);
const parentTree = new Dict(xref); const parentTree = new Dict(xref);
const nums = []; const nums = [];
@ -144,18 +144,18 @@ class StructTreeRoot {
nums, nums,
xref, xref,
pdfManager, pdfManager,
newRefs, cache,
buffer,
}); });
structTreeRoot.set("ParentTreeNextKey", nextKey); structTreeRoot.set("ParentTreeNextKey", nextKey);
buffer.length = 0; cache.put(parentTreeRef, parentTree);
await writeObject(parentTreeRef, parentTree, buffer, xref);
newRefs.push({ ref: parentTreeRef, data: buffer.join("") });
const buffer = [];
for (const [ref, obj] of cache.items()) {
buffer.length = 0; buffer.length = 0;
await writeObject(structTreeRootRef, structTreeRoot, buffer, xref); await writeObject(ref, obj, buffer, xref);
newRefs.push({ ref: structTreeRootRef, data: buffer.join("") }); newRefs.push({ ref, data: buffer.join("") });
}
} }
async canUpdateStructTree({ pdfManager, xref, newAnnotationsByPage }) { async canUpdateStructTree({ pdfManager, xref, newAnnotationsByPage }) {
@ -232,6 +232,8 @@ class StructTreeRoot {
const xref = this.dict.xref; const xref = this.dict.xref;
const structTreeRoot = this.dict.clone(); const structTreeRoot = this.dict.clone();
const structTreeRootRef = this.ref; const structTreeRootRef = this.ref;
const cache = new RefSetCache();
cache.put(structTreeRootRef, structTreeRoot);
let parentTreeRef = structTreeRoot.getRaw("ParentTree"); let parentTreeRef = structTreeRoot.getRaw("ParentTree");
let parentTree; let parentTree;
@ -243,6 +245,7 @@ class StructTreeRoot {
structTreeRoot.set("ParentTree", parentTreeRef); structTreeRoot.set("ParentTree", parentTreeRef);
} }
parentTree = parentTree.clone(); parentTree = parentTree.clone();
cache.put(parentTreeRef, parentTree);
let nums = parentTree.getRaw("Nums"); let nums = parentTree.getRaw("Nums");
let numsRef = null; let numsRef = null;
@ -255,47 +258,27 @@ class StructTreeRoot {
parentTree.set("Nums", nums); parentTree.set("Nums", nums);
} }
let kids = structTreeRoot.getRaw("K");
let kidsRef = null;
if (kids instanceof Ref) {
kidsRef = kids;
kids = xref.fetch(kidsRef);
} else {
kidsRef = xref.getNewTemporaryRef();
structTreeRoot.set("K", kidsRef);
}
kids = Array.isArray(kids) ? kids.slice() : [kids];
const buffer = [];
const newNextkey = await StructTreeRoot.#writeKids({ const newNextkey = await StructTreeRoot.#writeKids({
newAnnotationsByPage, newAnnotationsByPage,
structTreeRootRef, structTreeRootRef,
kids, kids: null,
nums, nums,
xref, xref,
pdfManager, pdfManager,
newRefs, cache,
buffer,
}); });
structTreeRoot.set("ParentTreeNextKey", newNextkey); structTreeRoot.set("ParentTreeNextKey", newNextkey);
buffer.length = 0;
await writeObject(kidsRef, kids, buffer, xref);
newRefs.push({ ref: kidsRef, data: buffer.join("") });
if (numsRef) { if (numsRef) {
buffer.length = 0; cache.put(numsRef, nums);
await writeObject(numsRef, nums, buffer, xref);
newRefs.push({ ref: numsRef, data: buffer.join("") });
} }
const buffer = [];
for (const [ref, obj] of cache.items()) {
buffer.length = 0; buffer.length = 0;
await writeObject(parentTreeRef, parentTree, buffer, xref); await writeObject(ref, obj, buffer, xref);
newRefs.push({ ref: parentTreeRef, data: buffer.join("") }); newRefs.push({ ref, data: buffer.join("") });
}
buffer.length = 0;
await writeObject(structTreeRootRef, structTreeRoot, buffer, xref);
newRefs.push({ ref: structTreeRootRef, data: buffer.join("") });
} }
static async #writeKids({ static async #writeKids({
@ -305,8 +288,7 @@ class StructTreeRoot {
nums, nums,
xref, xref,
pdfManager, pdfManager,
newRefs, cache,
buffer,
}) { }) {
const objr = Name.get("OBJR"); const objr = Name.get("OBJR");
let nextKey = -Infinity; let nextKey = -Infinity;
@ -349,19 +331,15 @@ class StructTreeRoot {
tagDict.set("ActualText", actualText); tagDict.set("ActualText", actualText);
} }
if (structTreeParent) {
await this.#updateParentTag({ await this.#updateParentTag({
structTreeParent, structTreeParent,
tagDict, tagDict,
newTagRef: tagRef, newTagRef: tagRef,
fallbackRef: structTreeRootRef, structTreeRootRef,
fallbackKids: kids,
xref, xref,
newRefs, cache,
buffer,
}); });
} else {
tagDict.set("P", structTreeRootRef);
}
const objDict = new Dict(xref); const objDict = new Dict(xref);
tagDict.set("K", objDict); tagDict.set("K", objDict);
@ -372,23 +350,24 @@ class StructTreeRoot {
} }
objDict.set("Obj", ref); objDict.set("Obj", ref);
buffer.length = 0; cache.put(tagRef, tagDict);
await writeObject(tagRef, tagDict, buffer, xref);
newRefs.push({ ref: tagRef, data: buffer.join("") });
nums.push(parentTreeId, tagRef); nums.push(parentTreeId, tagRef);
kids.push(tagRef);
} }
} }
return nextKey + 1; return nextKey + 1;
} }
static #collectParents({ elements, xref, pageDict, numberTree }) { static #collectParents({ elements, xref, pageDict, numberTree }) {
const idToElement = new Map(); const idToElements = new Map();
for (const element of elements) { for (const element of elements) {
if (element.structTreeParentId) { if (element.structTreeParentId) {
const id = parseInt(element.structTreeParentId.split("_mc")[1], 10); const id = parseInt(element.structTreeParentId.split("_mc")[1], 10);
idToElement.set(id, element); let elems = idToElements.get(id);
if (!elems) {
elems = [];
idToElements.set(id, elems);
}
elems.push(element);
} }
} }
@ -400,13 +379,16 @@ class StructTreeRoot {
const parentArray = numberTree.get(id); const parentArray = numberTree.get(id);
const updateElement = (kid, pageKid, kidRef) => { const updateElement = (kid, pageKid, kidRef) => {
const element = idToElement.get(kid); const elems = idToElements.get(kid);
if (element) { if (elems) {
const parentRef = pageKid.getRaw("P"); const parentRef = pageKid.getRaw("P");
const parentDict = xref.fetchIfRef(parentRef); const parentDict = xref.fetchIfRef(parentRef);
if (parentRef instanceof Ref && parentDict instanceof Dict) { if (parentRef instanceof Ref && parentDict instanceof Dict) {
// It should always be the case, but we check just in case. // It should always be the case, but we check just in case.
element.structTreeParent = { ref: kidRef, dict: pageKid }; const params = { ref: kidRef, dict: pageKid };
for (const element of elems) {
element.structTreeParent = params;
}
} }
return true; return true;
} }
@ -431,67 +413,73 @@ class StructTreeRoot {
if (Number.isInteger(kid) && updateElement(kid, pageKid, kidRef)) { if (Number.isInteger(kid) && updateElement(kid, pageKid, kidRef)) {
break; break;
} }
if (!(kid instanceof Dict)) {
continue;
}
if (!isName(kid.get("Type"), "MCR")) {
break;
}
const mcid = kid.get("MCID");
if (Number.isInteger(mcid) && updateElement(mcid, pageKid, kidRef)) {
break;
}
} }
} }
} }
static async #updateParentTag({ static async #updateParentTag({
structTreeParent: { ref, dict }, structTreeParent,
tagDict, tagDict,
newTagRef, newTagRef,
fallbackRef, structTreeRootRef,
fallbackKids,
xref, xref,
newRefs, cache,
buffer,
}) { }) {
let ref = null;
let parentRef;
if (structTreeParent) {
({ ref } = structTreeParent);
// We get the parent of the tag. // We get the parent of the tag.
const parentRef = dict.getRaw("P"); parentRef = structTreeParent.dict.getRaw("P") || structTreeRootRef;
let parentDict = xref.fetchIfRef(parentRef); } else {
parentRef = structTreeRootRef;
}
tagDict.set("P", parentRef); tagDict.set("P", parentRef);
// We get the kids in order to insert a new tag at the right position. // We get the kids in order to insert a new tag at the right position.
let saveParentDict = false; const parentDict = xref.fetchIfRef(parentRef);
let parentKids; if (!parentDict) {
let parentKidsRef = parentDict.getRaw("K"); fallbackKids.push(newTagRef);
if (!(parentKidsRef instanceof Ref)) {
parentKids = parentKidsRef;
parentKidsRef = xref.getNewTemporaryRef();
parentDict = parentDict.clone();
parentDict.set("K", parentKidsRef);
saveParentDict = true;
} else {
parentKids = xref.fetch(parentKidsRef);
}
if (Array.isArray(parentKids)) {
const index = parentKids.indexOf(ref);
if (index >= 0) {
parentKids = parentKids.slice();
parentKids.splice(index + 1, 0, newTagRef);
} else {
warn("Cannot update the struct tree: parent kid not found.");
tagDict.set("P", fallbackRef);
return;
}
} else if (parentKids instanceof Dict) {
parentKids = [parentKidsRef, newTagRef];
parentKidsRef = xref.getNewTemporaryRef();
parentDict.set("K", parentKidsRef);
saveParentDict = true;
}
buffer.length = 0;
await writeObject(parentKidsRef, parentKids, buffer, xref);
newRefs.push({ ref: parentKidsRef, data: buffer.join("") });
if (!saveParentDict) {
return; return;
} }
buffer.length = 0; let cachedParentDict = cache.get(parentRef);
await writeObject(parentRef, parentDict, buffer, xref); if (!cachedParentDict) {
newRefs.push({ ref: parentRef, data: buffer.join("") }); cachedParentDict = parentDict.clone();
cache.put(parentRef, cachedParentDict);
}
const parentKidsRaw = cachedParentDict.getRaw("K");
let cachedParentKids =
parentKidsRaw instanceof Ref ? cache.get(parentKidsRaw) : null;
if (!cachedParentKids) {
cachedParentKids = xref.fetchIfRef(parentKidsRaw);
cachedParentKids = Array.isArray(cachedParentKids)
? cachedParentKids.slice()
: [parentKidsRaw];
const parentKidsRef = xref.getNewTemporaryRef();
cachedParentDict.set("K", parentKidsRef);
cache.put(parentKidsRef, cachedParentKids);
}
const index = cachedParentKids.indexOf(ref);
cachedParentKids.splice(
index >= 0 ? index + 1 : cachedParentKids.length,
0,
newTagRef
);
} }
} }

View file

@ -645,3 +645,4 @@
!issue12213.pdf !issue12213.pdf
!tracemonkey_freetext.pdf !tracemonkey_freetext.pdf
!issue17998.pdf !issue17998.pdf
!pdfjs_wikipedia.pdf

BIN
test/pdfs/pdfjs_wikipedia.pdf Executable file

Binary file not shown.

View file

@ -1030,6 +1030,20 @@ describe("api", function () {
await pdfLoadingTask.destroy(); await pdfLoadingTask.destroy();
}); });
/**
 * Depth-first search of a structure tree for the first node accepted by
 * `check`.
 *
 * @param {Object|null} parent - Parent of `node`, or `null` for the root.
 * @param {Object} node - Node currently being inspected.
 * @param {number} index - Position of `node` in `parent.children`.
 * @param {function(Object): boolean} check - Predicate selecting the node.
 * @returns {Array|null} `[predecessor, node]` where `predecessor` is the
 *   sibling immediately before the match (`undefined` when the match is the
 *   root or a first child), or `null` when nothing matches.
 */
function findNode(parent, node, index, check) {
  if (check(node)) {
    // `parent` is null when the root itself matches: there is no sibling to
    // report, so yield `undefined` instead of throwing.
    return [parent?.children[index - 1], node];
  }
  // Leaves (e.g. content items) have no `children` property. Resolve the
  // fallback explicitly rather than via `i < a?.length ?? 0`, which parses
  // as `(i < a?.length) ?? 0` and never applies the `?? 0`.
  const children = node.children ?? [];
  for (let i = 0; i < children.length; i++) {
    const elements = findNode(node, children[i], i, check);
    if (elements) {
      return elements;
    }
  }
  return null;
}
it("gets number of pages", function () { it("gets number of pages", function () {
expect(pdfDocument.numPages).toEqual(3); expect(pdfDocument.numPages).toEqual(3);
}); });
@ -2396,7 +2410,22 @@ describe("api", function () {
pdfDoc = await loadingTask.promise; pdfDoc = await loadingTask.promise;
const page = await pdfDoc.getPage(1); const page = await pdfDoc.getPage(1);
const tree = await page.getStructTree(); const tree = await page.getStructTree();
const leaf = tree.children[0].children[6].children[1]; const [predecessor, leaf] = findNode(
null,
tree,
0,
node => node.role === "Figure"
);
expect(predecessor).toEqual({
role: "Span",
children: [
{
type: "content",
id: "p3R_mc12",
},
],
});
expect(leaf).toEqual({ expect(leaf).toEqual({
role: "Figure", role: "Figure",
@ -2412,6 +2441,104 @@ describe("api", function () {
await loadingTask.destroy(); await loadingTask.destroy();
}); });
it("write a new stamp annotation in a tagged pdf (with some MCIDs), save and check the structure tree", async function () {
if (isNodeJS) {
pending("Cannot create a bitmap from Node.js.");
}
const TEST_IMAGES_PATH = "../images/";
const filename = "firefox_logo.png";
const path = new URL(TEST_IMAGES_PATH + filename, window.location).href;
const response = await fetch(path);
const blob = await response.blob();
const bitmap = await createImageBitmap(blob);
let loadingTask = getDocument(
buildGetDocumentParams("pdfjs_wikipedia.pdf")
);
let pdfDoc = await loadingTask.promise;
for (let i = 0; i < 2; i++) {
pdfDoc.annotationStorage.setValue(`pdfjs_internal_editor_${i}`, {
annotationType: AnnotationEditorType.STAMP,
bitmapId: `im${i}`,
pageIndex: 0,
rect: [257 + i, 572 + i, 286 + i, 603 + i],
rotation: 0,
isSvg: false,
structTreeParentId: "p2R_mc155",
accessibilityData: {
type: "Figure",
alt: `Firefox logo ${i}`,
},
bitmap: structuredClone(bitmap),
});
}
const data = await pdfDoc.saveDocument();
await loadingTask.destroy();
loadingTask = getDocument(data);
pdfDoc = await loadingTask.promise;
const page = await pdfDoc.getPage(1);
const tree = await page.getStructTree();
let [predecessor, figure] = findNode(
null,
tree,
0,
node => node.role === "Figure" && node.alt === "Firefox logo 1"
);
expect(predecessor).toEqual({
role: "NonStruct",
children: [
{
type: "content",
id: "p2R_mc155",
},
],
});
expect(figure).toEqual({
role: "Figure",
children: [
{
type: "annotation",
id: "pdfjs_internal_id_420R",
},
],
alt: "Firefox logo 1",
});
[predecessor, figure] = findNode(
null,
tree,
0,
node => node.role === "Figure" && node.alt === "Firefox logo 0"
);
expect(predecessor).toEqual({
role: "Figure",
children: [
{
type: "annotation",
id: "pdfjs_internal_id_420R",
},
],
alt: "Firefox logo 1",
});
expect(figure).toEqual({
role: "Figure",
children: [
{
type: "annotation",
id: "pdfjs_internal_id_416R",
},
],
alt: "Firefox logo 0",
});
await loadingTask.destroy();
});
it("write a new stamp annotation in a tagged pdf, save, repeat and check the structure tree", async function () { it("write a new stamp annotation in a tagged pdf, save, repeat and check the structure tree", async function () {
if (isNodeJS) { if (isNodeJS) {
pending("Cannot create a bitmap from Node.js."); pending("Cannot create a bitmap from Node.js.");

View file

@ -498,6 +498,15 @@ describe("primitives", function () {
cache.put(ref2, obj2); cache.put(ref2, obj2);
expect([...cache]).toEqual([obj1, obj2]); expect([...cache]).toEqual([obj1, obj2]);
}); });
it("should support iteration over key-value pairs", function () {
cache.put(ref1, obj1);
cache.put(ref2, obj2);
expect([...cache.items()]).toEqual([
[ref1, obj1],
[ref2, obj2],
]);
});
}); });
describe("isName", function () { describe("isName", function () {