From 0bafbd9c1273fab0dc79fd20db0ffc4443683f96 Mon Sep 17 00:00:00 2001 From: Ken Sharp Date: Mon, 29 Apr 2019 11:14:06 +0100 Subject: [PATCH] PDF interpreter - Decode ToUnicode entries of the form /Identity-H/V Bug #701003 "Text searchability broken due to omission of /ToUnicode /Identity-H" The PDF references from 1.2 to 2.0 all state that the value associated with a ToUnicode key in a FontDescriptor must be a stream object. However, this file (and one case seen previously, bug 687351) have FontDescriptor dictionaries where the value associated with a /ToUnicode key is a name object, in both cases /Identity-H. Although this is clearly not legal, Acrobat not only tolerates it, it actually uses it for search/copy/paste (see bug 701003 for details). Without the key Acrobat is unable to successfully search the output file. We can't simply preserve the name object as a ToUnicode value; when handling ToUnicode we actually decode the CMap and build a GlyphNames2Unicode map (an internal representation of the G2U data produced by the Microsoft PostScript printer driver). When writing the output file we use that information to get a Unicode value for each character we write, and build a new ToUnicode CMap using that. This commit tackles the problem by pre-scanning for a name object and then checking to see if it is Identity-H or Identity-V (although we have not seen an Identity-V, there seems no reason why it wouldn't be equally valid). If we find either of these then we construct a GlyphNames2Unicode table for all possible values (0 - 65535) and store that with the font as normal. When we write the output file we only write the required entries for the subset font, so we write a now completely legal ToUnicode CMap, and Acrobat is equally happy with that as the original name. If the ToUnicode value isn't a name object, or isn't one of the identities then we proceed as before. This means we will print a warning for non-conforming ToUnicode entries and ignore them. 
--- Resource/Init/pdf_font.ps | 200 ++++++++++++++++++++++++-------------- 1 file changed, 129 insertions(+), 71 deletions(-) diff --git a/Resource/Init/pdf_font.ps b/Resource/Init/pdf_font.ps index 0e802d393..964d54c1e 100644 --- a/Resource/Init/pdf_font.ps +++ b/Resource/Init/pdf_font.ps @@ -621,86 +621,144 @@ currentdict end readonly def PDFDEBUG { (.processToUnicode beg) = } if - 2 index /ToUnicode knownoget { - dup type /dicttype eq { dup /File known not } { //true } ifelse { - % We undefine wrong /Length and define /File in stream dictionaries. - % Bug687351.pdf defines /ToUnicode /Identity-H, what is incorrect. - ( **** Warning: Ignoring bad ToUnicode CMap.\n) pdfformatwarning - pop + + 2 index /ToUnicode knownoget + { + dup type /nametype eq { + % This is contrary to the specification but it seems that Acrobat at least will accept + % a ToUnicode with a value of Identity-H *and* will use that for search, copy/paste. + % We can't pass through a name, so the best we can do is build a GlyphNames2Unicode + % map matching that which would have been generated by a full 16-bit Identity CMap + % + % See bug numbers 701003 and 687351 + % + dup /Identity-H eq 1 index /Identity-V eq or{ + pop + 1 index /FontInfo .knownget not { + currentglobal 2 index dup gcheck setglobal + /FontInfo 5 dict dup 5 1 roll .forceput + setglobal + } if + dup /GlyphNames2Unicode .knownget not { + //true % No existing G2U, make one + } { + dup wcheck { + //false % Existing, writeable G2U, don't make new one + } { + pop //true % Existing read only G2U, make new one + } ifelse + } ifelse + { + currentglobal exch dup gcheck setglobal + dup /GlyphNames2Unicode 100 dict dup 4 1 roll .forceput + 3 2 roll setglobal + } if % font-res font-dict encoding|null font-info g2u + + 0 1 65535{ + % g2u index + dup dup 256 mod exch 256 idiv % g2u index lo-byte hi-byte + 2 string dup 0 4 -1 roll % g2u index lo-byte () () 0 hi-byte + put % g2u index lo-byte (x) + dup 1 % g2u index lo-byte (x) (x) 1 + 4 -1 roll 
put % g2u index (x) (x) 1 lo-byte -> dict index (xx) + 2 index % g2u index (xx) dict + 3 1 roll % g2u g2u index (xx) + put % g2u + } for + pop % font-res font-dict encoding|null font-info + pop % font-res font-dict encoding|null + //false % We built a GlyphNames2Unicode table, don't need to process further + }{ + //true % name is not Identity-V or H, fail by falling through + }ifelse } { - /PDFScanRules .getuserparam dup //null eq { - pop //PDFScanRules_null - } { - 1 dict dup /PDFScanRules 4 -1 roll put - } ifelse - //PDFScanRules_true setuserparams - PDFfile fileposition - 3 -1 roll - count 1 sub - countdictstack - { //false resolvestream - % Following Acrobat we ignore everything outside - % begincodespacerange .. endcmap. - dup 0 (begincodespacerange) /SubFileDecode filter flushfile - /CIDInit /ProcSet findresource begin - //ToUnicodeCMapReader begin - 12 dict begin - /CMapType 2 def - mark exch % emulate 'begincodespacerange' - 0 (endcmap) /SubFileDecode filter cvx /begincmap cvx exch 2 .execn - endcmap - userdict /.lastToUnicode currentdict put - end end end - } + //true + } ifelse % not a name, try as a dictionary (as specified) - PDFSTOPONERROR { - { exec } 0 get - //false - 5 -2 roll - 5 + % If the ToUnicode isn't a name, or the name isn't Identity-V or -H then follow the specification + % If its not a dictionary type throw an error, otherwise decode it and build a GlyphNames2Unicode + % + { + dup type /dicttype eq { dup /File known not } { //true } ifelse { + % We undefine wrong /Length and define /File in stream dictionaries. + % Bug687351.pdf defines /ToUnicode /Identity-H, what is incorrect. 
+ ( **** Warning: Ignoring bad ToUnicode CMap.\n) pdfformatwarning + pop } { - { stopped } 0 get - 4 2 roll - 4 - } ifelse - array astore cvx exec + /PDFScanRules .getuserparam dup //null eq { + pop //PDFScanRules_null + } { + 1 dict dup /PDFScanRules 4 -1 roll put + } ifelse + //PDFScanRules_true setuserparams + PDFfile fileposition + 3 -1 roll + count 1 sub + countdictstack + { //false resolvestream + % Following Acrobat we ignore everything outside + % begincodespacerange .. endcmap. + dup 0 (begincodespacerange) /SubFileDecode filter flushfile + /CIDInit /ProcSet findresource begin + //ToUnicodeCMapReader begin + 12 dict begin + /CMapType 2 def + mark exch % emulate 'begincodespacerange' + 0 (endcmap) /SubFileDecode filter cvx /begincmap cvx exch 2 .execn + endcmap + userdict /.lastToUnicode currentdict put + end end end + } - countdictstack exch sub 0 .max { end } repeat - count exch sub 2 sub 0 .max { exch pop } repeat - 3 1 roll % Stach the stop flag. - PDFfile exch setfileposition - setuserparams - { - ( **** Warning: Failed to read ToUnicode CMap.\n) pdfformatwarning - } { - 1 index /FontInfo .knownget not { - currentglobal 2 index dup gcheck setglobal - /FontInfo 5 dict dup 5 1 roll .forceput - setglobal - } if - dup /GlyphNames2Unicode .knownget not { - //true % No existing G2U, make one + PDFSTOPONERROR { + { exec } 0 get + //false + 5 -2 roll + 5 + } { + { stopped } 0 get + 4 2 roll + 4 + } ifelse + array astore cvx exec + + countdictstack exch sub 0 .max { end } repeat + count exch sub 2 sub 0 .max { exch pop } repeat + 3 1 roll % Stach the stop flag. 
+ PDFfile exch setfileposition + setuserparams + { + ( **** Warning: Failed to read ToUnicode CMap.\n) pdfformatwarning } { - dup wcheck { - //false % Existing, writeable G2U, don't make new one + 1 index /FontInfo .knownget not { + currentglobal 2 index dup gcheck setglobal + /FontInfo 5 dict dup 5 1 roll .forceput + setglobal + } if + dup /GlyphNames2Unicode .knownget not { + //true % No existing G2U, make one } { - pop //true % Existing read only G2U, make new one + dup wcheck { + //false % Existing, writeable G2U, don't make new one + } { + pop //true % Existing read only G2U, make new one + } ifelse } ifelse + { + currentglobal exch dup gcheck setglobal + dup /GlyphNames2Unicode 100 dict dup 4 1 roll .forceput + 3 2 roll setglobal + } if % font-res font-dict encoding|null font-info g2u + exch pop exch % font-res font-dict g2u encoding|null + userdict /.lastToUnicode get % font-res font-dict g2u Encoding|null CMap + .convert_ToUnicode-into-g2u % font-res font-dict + //null % font-res font-dict //null } ifelse - { - currentglobal exch dup gcheck setglobal - dup /GlyphNames2Unicode 100 dict dup 4 1 roll .forceput - 3 2 roll setglobal - } if % font-res font-dict encoding|null font-info g2u - exch pop exch % font-res font-dict g2u encoding|null - userdict /.lastToUnicode get % font-res font-dict g2u Encoding|null CMap - .convert_ToUnicode-into-g2u % font-res font-dict - //null % font-res font-dict //null } ifelse - } ifelse - } if - PDFDEBUG { - (.processToUnicode end) = + } if + PDFDEBUG { + (.processToUnicode end) = + } if } if } if } stopped -- 2.23.0