From 0bafbd9c1273fab0dc79fd20db0ffc4443683f96 Mon Sep 17 00:00:00 2001
From: Ken Sharp <ken.sharp@artifex.com>
Date: Mon, 29 Apr 2019 11:14:06 +0100
Subject: [PATCH] PDF interpreter - Decode ToUnicode entries of the form
 /Identity-H/V

Bug #701003 "Text searchability broken due to omission of /ToUnicode /Identity-H"

The PDF references from 1.2 to 2.0 all state that the value associated
with a ToUnicode key in a FontDescriptor must be a stream object. However
this file (and one case seen previously, bug 687351) has FontDescriptor
dictionaries where the value associated with a /ToUnicode key is a
name object, in both cases /Identity-H.

Although this is clearly not legal, Acrobat not only tolerates it, it
actually uses it for search/copy/paste (see bug 701003 for details).
Without the key Acrobat is unable to successfully search the output file.

We can't simply preserve the name object as a ToUnicode value; when
handling ToUnicode we actually decode the CMap and build a
GlyphNames2Unicode map (an internal representation of the G2U data
produced by the Microsoft PostScript printer driver). When writing the
output file we use that information to get a Unicode value for each
character we write, and build a new ToUnicode CMap using that.

This commit tackles the problem by pre-scanning for a name object and
then checking to see if it is Identity-H or Identity-V (although we have
not seen an Identity-V, there seems no reason why it wouldn't be
equally valid). If we find either of these then we construct a
GlyphNames2Unicode table for all possible values (0 - 65535) and store
that with the font as normal. When we write the output file we only
write the required entries for the subset font, so we write a now
completely legal ToUnicode CMap, and Acrobat is equally happy with that
as the original name.

If the ToUnicode value isn't a name object, or isn't one of the
identities, then we proceed as before. This means we will print a
warning for non-conforming ToUnicode entries and ignore them.
---
 Resource/Init/pdf_font.ps | 200 ++++++++++++++++++++++++--------------
 1 file changed, 129 insertions(+), 71 deletions(-)

diff --git a/Resource/Init/pdf_font.ps b/Resource/Init/pdf_font.ps
index 0e802d393..964d54c1e 100644
--- a/Resource/Init/pdf_font.ps
+++ b/Resource/Init/pdf_font.ps
@@ -621,86 +621,144 @@ currentdict end readonly def
         PDFDEBUG {
           (.processToUnicode beg) =
         } if
-        2 index /ToUnicode knownoget {
-          dup type /dicttype eq { dup /File known not } { //true } ifelse {
-            % We undefine wrong /Length and define /File in stream dictionaries.
-            % Bug687351.pdf defines /ToUnicode /Identity-H, what is incorrect.
-            (   **** Warning: Ignoring bad ToUnicode CMap.\n)  pdfformatwarning
-            pop
+
+        2 index /ToUnicode knownoget
+        {
+            dup type /nametype eq {
+              % This is contrary to the specification but it seems that Acrobat at least will accept
+              % a ToUnicode with a value of Identity-H *and* will use that for search, copy/paste.
+              % We can't pass through a name, so the best we can do is build a GlyphNames2Unicode
+              % map matching that which would have been generated by a full 16-bit Identity CMap
+              %
+              % See bug numbers 701003 and 687351
+              %
+              dup /Identity-H eq 1 index /Identity-V eq or{
+                pop
+                1 index /FontInfo .knownget not {
+                  currentglobal 2 index dup gcheck setglobal
+                  /FontInfo 5 dict dup 5 1 roll .forceput
+                  setglobal
+                } if
+                dup /GlyphNames2Unicode .knownget not {
+                  //true                        % No existing G2U, make one
+                } {
+                  dup wcheck {
+                    //false                     % Existing, writeable G2U, don't make new one
+                  } {
+                    pop //true                          % Existing read only G2U, make new one
+                  } ifelse
+                } ifelse
+                {
+                  currentglobal exch dup gcheck setglobal
+                  dup /GlyphNames2Unicode 100 dict dup 4 1 roll .forceput
+                  3 2 roll setglobal
+                } if                                 % font-res font-dict encoding|null font-info g2u
+
+                0 1 65535{
+                                                        % g2u index
+                  dup dup 256 mod exch 256 idiv         % g2u index lo-byte hi-byte
+                  2 string dup 0 4 -1 roll              % g2u index lo-byte () () 0 hi-byte
+                  put                                   % g2u index lo-byte (x)
+                  dup 1                                 % g2u index lo-byte (x) (x) 1
+                  4 -1 roll put                         % g2u index (x) (x) 1 lo-byte -> dict index (xx)
+                  2 index                               % g2u index (xx) dict
+                  3 1 roll                              % g2u g2u index (xx)
+                  put                                   % g2u
+                } for
+                pop                                     % font-res font-dict encoding|null font-info
+                pop                                     % font-res font-dict encoding|null
+                //false                                 % We built a GlyphNames2Unicode table, don't need to process further
+              }{
+                //true                                  % name is not Identity-V or H, fail by falling through
+              }ifelse
           } {
-            /PDFScanRules .getuserparam dup //null eq {
-              pop //PDFScanRules_null
-            } {
-              1 dict dup /PDFScanRules 4 -1 roll put
-            } ifelse
-            //PDFScanRules_true setuserparams
-            PDFfile fileposition
-            3 -1 roll
-            count 1 sub
-            countdictstack
-            { //false resolvestream
-              % Following Acrobat we ignore everything outside
-              %   begincodespacerange .. endcmap.
-              dup 0 (begincodespacerange) /SubFileDecode filter flushfile
-              /CIDInit /ProcSet findresource begin
-              //ToUnicodeCMapReader begin
-              12 dict begin
-              /CMapType 2 def
-              mark exch % emulate 'begincodespacerange'
-              0 (endcmap) /SubFileDecode filter cvx /begincmap cvx exch 2 .execn
-              endcmap
-              userdict /.lastToUnicode currentdict put
-              end end end
-            }
+            //true
+          } ifelse                                      % not a name, try as a dictionary (as specified)
 
-            PDFSTOPONERROR {
-              { exec } 0 get
-              //false
-              5 -2 roll
-              5
+          % If the ToUnicode isn't a name, or the name isn't Identity-V or -H then follow the specification
+          % If it's not a dictionary, warn and ignore it; otherwise decode it and build a GlyphNames2Unicode
+          %
+          {
+            dup type /dicttype eq { dup /File known not } { //true } ifelse {
+              % We undefine wrong /Length and define /File in stream dictionaries.
+              % Bug687351.pdf defines /ToUnicode /Identity-H, which is incorrect.
+              (   **** Warning: Ignoring bad ToUnicode CMap.\n)  pdfformatwarning
+              pop
             } {
-              { stopped } 0 get
-              4 2 roll
-              4
-            } ifelse
-            array astore cvx exec
+              /PDFScanRules .getuserparam dup //null eq {
+                pop //PDFScanRules_null
+              } {
+                1 dict dup /PDFScanRules 4 -1 roll put
+              } ifelse
+              //PDFScanRules_true setuserparams
+              PDFfile fileposition
+              3 -1 roll
+              count 1 sub
+              countdictstack
+              { //false resolvestream
+                % Following Acrobat we ignore everything outside
+                %   begincodespacerange .. endcmap.
+                dup 0 (begincodespacerange) /SubFileDecode filter flushfile
+                /CIDInit /ProcSet findresource begin
+                //ToUnicodeCMapReader begin
+                12 dict begin
+                /CMapType 2 def
+                mark exch % emulate 'begincodespacerange'
+                0 (endcmap) /SubFileDecode filter cvx /begincmap cvx exch 2 .execn
+                endcmap
+                userdict /.lastToUnicode currentdict put
+                end end end
+              }
 
-            countdictstack exch sub 0 .max { end } repeat
-            count exch sub 2 sub 0 .max { exch pop } repeat
-            3 1 roll                     % Stach the stop flag.
-            PDFfile exch setfileposition
-            setuserparams
-            {
-              (   **** Warning: Failed to read ToUnicode CMap.\n)  pdfformatwarning
-            } {
-              1 index /FontInfo .knownget not {
-                currentglobal 2 index dup gcheck setglobal
-                /FontInfo 5 dict dup 5 1 roll .forceput
-                setglobal
-              } if
-              dup /GlyphNames2Unicode .knownget not {
-                //true                        % No existing G2U, make one
+              PDFSTOPONERROR {
+                { exec } 0 get
+                //false
+                5 -2 roll
+                5
+              } {
+                { stopped } 0 get
+                4 2 roll
+                4
+              } ifelse
+              array astore cvx exec
+
+              countdictstack exch sub 0 .max { end } repeat
+              count exch sub 2 sub 0 .max { exch pop } repeat
+              3 1 roll                     % Stash the stop flag.
+              PDFfile exch setfileposition
+              setuserparams
+              {
+                (   **** Warning: Failed to read ToUnicode CMap.\n)  pdfformatwarning
               } {
-                dup wcheck {
-                  //false                     % Existing, writeable G2U, don't make new one
+                1 index /FontInfo .knownget not {
+                  currentglobal 2 index dup gcheck setglobal
+                  /FontInfo 5 dict dup 5 1 roll .forceput
+                  setglobal
+                } if
+                dup /GlyphNames2Unicode .knownget not {
+                  //true                        % No existing G2U, make one
                 } {
-                  pop //true                          % Existing read only G2U, make new one
+                  dup wcheck {
+                    //false                     % Existing, writeable G2U, don't make new one
+                  } {
+                    pop //true                          % Existing read only G2U, make new one
+                  } ifelse
                 } ifelse
+                {
+                  currentglobal exch dup gcheck setglobal
+                  dup /GlyphNames2Unicode 100 dict dup 4 1 roll .forceput
+                  3 2 roll setglobal
+                } if                                 % font-res font-dict encoding|null font-info g2u
+                exch pop exch                        % font-res font-dict g2u encoding|null
+                userdict /.lastToUnicode get         % font-res font-dict g2u Encoding|null CMap
+                .convert_ToUnicode-into-g2u          % font-res font-dict
+                //null                               % font-res font-dict //null
               } ifelse
-              {
-                currentglobal exch dup gcheck setglobal
-                dup /GlyphNames2Unicode 100 dict dup 4 1 roll .forceput
-                3 2 roll setglobal
-              } if                                 % font-res font-dict encoding|null font-info g2u
-              exch pop exch                        % font-res font-dict g2u encoding|null
-              userdict /.lastToUnicode get         % font-res font-dict g2u Encoding|null CMap
-              .convert_ToUnicode-into-g2u          % font-res font-dict
-              //null                               % font-res font-dict //null
             } ifelse
-          } ifelse
-        } if
-        PDFDEBUG {
-          (.processToUnicode end) =
+          } if
+          PDFDEBUG {
+            (.processToUnicode end) =
+          } if
         } if
       } if
     } stopped
-- 
2.23.0