From 3d01c62e19df9f369cdfaeff82ec8af2c0be75f1 Mon Sep 17 00:00:00 2001
From: halcanary <halcanary@google.com>
Date: Wed, 31 Aug 2016 12:52:35 -0700
Subject: [PATCH] SkPDF: Fix Type3 ToUnicode table.

This seems to fix text extraction in Adobe Reader.

  - Registry/Ordering is now set to Skia/SkiaOrdering.
  - Type3 fonts now get a FontDescriptor (force symbolic font).
  - CMapName is now Skia-Identity-SkiaOrdering.
  - CMap behaves correctly for single-byte fonts.

Also:
  - SkTestTypeface now returns a ToUnicode map for testing.
  - Unit test updated to expect single-byte glyph codes.

All PDFs render the same as before this change.

BUG=skia:5606
GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2292303004

Review-Url: https://codereview.chromium.org/2292303004
---
 src/fonts/SkTestScalerContext.cpp  | 10 +++++-
 src/pdf/SkPDFFont.cpp              | 21 +++++++++---
 src/pdf/SkPDFMakeToUnicodeCmap.cpp | 51 ++++++++++++++++--------------
 tests/PDFGlyphsToUnicodeTest.cpp   |  6 ++--
 4 files changed, 56 insertions(+), 32 deletions(-)
diff --git a/src/fonts/SkTestScalerContext.cpp b/src/fonts/SkTestScalerContext.cpp
index fcb65a83c1..09b20ba71b 100644
--- a/src/fonts/SkTestScalerContext.cpp
+++ b/src/fonts/SkTestScalerContext.cpp
@@ -150,7 +150,15 @@ SkAdvancedTypefaceMetrics* SkTestTypeface::onGetAdvancedTypefaceMetrics(
 // pdf only
     SkAdvancedTypefaceMetrics* info = new SkAdvancedTypefaceMetrics;
     info->fFontName.set(fTestFont->fName);
-    info->fLastGlyphID = SkToU16(onCountGlyphs() - 1);
+    int glyphCount = this->onCountGlyphs();
+    info->fLastGlyphID = SkToU16(glyphCount - 1);
+
+    SkTDArray<SkUnichar>& toUnicode = info->fGlyphToUnicode;
+    toUnicode.setCount(glyphCount);
+    SkASSERT(glyphCount == SkToInt(fTestFont->fCharCodesCount));
+    for (int gid = 0; gid < glyphCount; ++gid) {
+        toUnicode[gid] = SkToS32(fTestFont->fCharCodes[gid]);
+    }
     return info;
 }
 
diff --git a/src/pdf/SkPDFFont.cpp b/src/pdf/SkPDFFont.cpp
index 93f48332d8..32e365388a 100644
--- a/src/pdf/SkPDFFont.cpp
+++ b/src/pdf/SkPDFFont.cpp
@@ -29,7 +29,7 @@ namespace {
 // PDF's notion of symbolic vs non-symbolic is related to the character set, not
 // symbols vs. characters.  Rarely is a font the right character set to call it
 // non-symbolic, so always call it symbolic.  (PDF 1.4 spec, section 5.7.1)
-static const int kPdfSymbolic = 4;
+static const int32_t kPdfSymbolic = 4;
 
 struct SkPDFType0Font final : public SkPDFFont {
     SkPDFType0Font(SkPDFFont::Info, const SkAdvancedTypefaceMetrics&);
@@ -426,8 +426,9 @@ void SkPDFType0Font::getFontSubset(SkPDFCanon* canon) {
     }
 
     auto sysInfo = sk_make_sp<SkPDFDict>();
-    sysInfo->insertString("Registry", "Adobe");
-    sysInfo->insertString("Ordering", "Identity");
+    sysInfo->insertString("Registry", "Skia");
+    // TODO: Registry+Ordering should be globally unique!
+    sysInfo->insertString("Ordering", "SkiaOrdering");
     sysInfo->insertInt("Supplement", 0);
     newCIDFont->insertObject("CIDSystemInfo", std::move(sysInfo));
 
@@ -597,6 +598,7 @@ static void add_type3_font_info(SkPDFCanon* canon,
                                 const SkBitSet& subset,
                                 SkGlyphID firstGlyphID,
                                 SkGlyphID lastGlyphID) {
+    const SkAdvancedTypefaceMetrics* metrics = SkPDFFont::GetMetrics(typeface, canon);
     SkASSERT(lastGlyphID >= firstGlyphID);
     // Remove unused glyphs at the end of the range.
     // Keep the lastGlyphID >= firstGlyphID invariant true.
@@ -684,8 +686,7 @@ static void add_type3_font_info(SkPDFCanon* canon,
     fontBBox->appendInt(bbox.top());
     font->insertObject("FontBBox", std::move(fontBBox));
     font->insertName("CIDToGIDMap", "Identity");
-    const SkAdvancedTypefaceMetrics* metrics = SkPDFFont::GetMetrics(typeface, canon);
-    if (metrics /* && metrics->fGlyphToUnicode.count() > 0 */) {
+    if (metrics && metrics->fGlyphToUnicode.count() > 0) {
         font->insertObjRef("ToUnicode",
                            SkPDFMakeToUnicodeCmap(metrics->fGlyphToUnicode,
                                                   &subset,
@@ -693,6 +694,16 @@ static void add_type3_font_info(SkPDFCanon* canon,
                                                   firstGlyphID,
                                                   lastGlyphID));
     }
+    auto descriptor = sk_make_sp<SkPDFDict>("FontDescriptor");
+    int32_t fontDescriptorFlags = kPdfSymbolic;
+    if (metrics) {
+        // Type3 FontDescriptor does not require all the same fields.
+        descriptor->insertName("FontName", metrics->fFontName);
+        descriptor->insertInt("ItalicAngle", metrics->fItalicAngle);
+        fontDescriptorFlags |= (int32_t)metrics->fStyle;
+    }
+    descriptor->insertInt("Flags", fontDescriptorFlags);
+    font->insertObjRef("FontDescriptor", std::move(descriptor));
     font->insertObject("Widths", std::move(widthArray));
     font->insertObject("Encoding", std::move(encoding));
     font->insertObject("CharProcs", std::move(charProcs));
diff --git a/src/pdf/SkPDFMakeToUnicodeCmap.cpp b/src/pdf/SkPDFMakeToUnicodeCmap.cpp
index 5186cbbda1..7fc5c59be3 100644
--- a/src/pdf/SkPDFMakeToUnicodeCmap.cpp
+++ b/src/pdf/SkPDFMakeToUnicodeCmap.cpp
@@ -10,8 +10,7 @@
 #include "SkUtils.h"
 
 static void append_tounicode_header(SkDynamicMemoryWStream* cmap,
-                                    SkGlyphID firstGlyphID,
-                                    SkGlyphID lastGlyphID) {
+                                    bool multibyte) {
     // 12 dict begin: 12 is an Adobe-suggested value. Shall not change.
     // It's there to prevent old version Adobe Readers from malfunctioning.
     const char* kHeader =
@@ -26,8 +25,8 @@ static void append_tounicode_header(SkDynamicMemoryWStream* cmap,
     // different. This is not a reference object.
     const char* kSysInfo =
         "/CIDSystemInfo\n"
-        "<<  /Registry (Adobe)\n"
-        "/Ordering (UCS)\n"
+        "<<  /Registry (Skia)\n"
+        "/Ordering (SkiaOrdering)\n"
         "/Supplement 0\n"
         ">> def\n";
     cmap->writeText(kSysInfo);
@@ -36,18 +35,16 @@ static void append_tounicode_header(SkDynamicMemoryWStream* cmap,
     // /CMapType 2 means ToUnicode.
     // Codespace range just tells the PDF processor the valid range.
     const char* kTypeInfoHeader =
-        "/CMapName /Adobe-Identity-UCS def\n"
+        "/CMapName /Skia-Identity-SkiaOrdering def\n"
         "/CMapType 2 def\n"
         "1 begincodespacerange\n";
     cmap->writeText(kTypeInfoHeader);
-
-    // e.g.     "<0000> <FFFF>\n"
-    SkString range;
-    range.appendf("<%04X> <%04X>\n", firstGlyphID, lastGlyphID);
-    cmap->writeText(range.c_str());
-
-    const char* kTypeInfoFooter = "endcodespacerange\n";
-    cmap->writeText(kTypeInfoFooter);
+    if (multibyte) {
+        cmap->writeText("<0000> <FFFF>\n");
+    } else {
+        cmap->writeText("<00> <FF>\n");
+    }
+    cmap->writeText("endcodespacerange\n");
 }
 
 static void append_cmap_footer(SkDynamicMemoryWStream* cmap) {
@@ -82,7 +79,18 @@ static void write_utf16be(SkDynamicMemoryWStream* wStream, SkUnichar utf32) {
     }
 }
 
+static void write_glyph(SkDynamicMemoryWStream* cmap,
+                        bool multiByte,
+                        SkGlyphID gid) {
+    if (multiByte) {
+        SkPDFUtils::WriteUInt16BE(cmap, gid);
+    } else {
+        SkPDFUtils::WriteUInt8(cmap, SkToU8(gid));
+    }
+}
+
 static void append_bfchar_section(const SkTDArray<BFChar>& bfchar,
+                                  bool multiByte,
                                   SkDynamicMemoryWStream* cmap) {
     // PDF spec defines that every bf* list can have at most 100 entries.
     for (int i = 0; i < bfchar.count(); i += 100) {
@@ -92,7 +100,7 @@ static void append_bfchar_section(const SkTDArray<BFChar>& bfchar,
         cmap->writeText(" beginbfchar\n");
         for (int j = 0; j < count; ++j) {
             cmap->writeText("<");
-            SkPDFUtils::WriteUInt16BE(cmap, bfchar[i + j].fGlyphId);
+            write_glyph(cmap, multiByte, bfchar[i + j].fGlyphId);
             cmap->writeText("> <");
             write_utf16be(cmap, bfchar[i + j].fUnicode);
             cmap->writeText(">\n");
@@ -102,6 +110,7 @@ static void append_bfchar_section(const SkTDArray<BFChar>& bfchar,
 }
 
 static void append_bfrange_section(const SkTDArray<BFRange>& bfrange,
+                                   bool multiByte,
                                    SkDynamicMemoryWStream* cmap) {
     // PDF spec defines that every bf* list can have at most 100 entries.
     for (int i = 0; i < bfrange.count(); i += 100) {
@@ -111,9 +120,9 @@ static void append_bfrange_section(const SkTDArray<BFRange>& bfrange,
         cmap->writeText(" beginbfrange\n");
         for (int j = 0; j < count; ++j) {
             cmap->writeText("<");
-            SkPDFUtils::WriteUInt16BE(cmap, bfrange[i + j].fStart);
+            write_glyph(cmap, multiByte, bfrange[i + j].fStart);
             cmap->writeText("> <");
-            SkPDFUtils::WriteUInt16BE(cmap, bfrange[i + j].fEnd);
+            write_glyph(cmap, multiByte, bfrange[i + j].fEnd);
             cmap->writeText("> <");
             write_utf16be(cmap, bfrange[i + j].fUnicode);
             cmap->writeText(">\n");
@@ -206,8 +215,8 @@ void SkPDFAppendCmapSections(const SkTDArray<SkUnichar>& glyphToUnicode,
 
     // The spec requires all bfchar entries for a font must come before bfrange
     // entries.
-    append_bfchar_section(bfcharEntries, cmap);
-    append_bfrange_section(bfrangeEntries, cmap);
+    append_bfchar_section(bfcharEntries, multiByteGlyphs, cmap);
+    append_bfrange_section(bfrangeEntries, multiByteGlyphs, cmap);
 }
 
 sk_sp<SkPDFStream> SkPDFMakeToUnicodeCmap(
@@ -217,11 +226,7 @@ sk_sp<SkPDFStream> SkPDFMakeToUnicodeCmap(
         SkGlyphID firstGlyphID,
         SkGlyphID lastGlyphID) {
     SkDynamicMemoryWStream cmap;
-    if (multiByteGlyphs) {
-        append_tounicode_header(&cmap, firstGlyphID, lastGlyphID);
-    } else {
-        append_tounicode_header(&cmap, 1, lastGlyphID - firstGlyphID + 1);
-    }
+    append_tounicode_header(&cmap, multiByteGlyphs);
     SkPDFAppendCmapSections(glyphToUnicode, subset, &cmap, multiByteGlyphs,
                             firstGlyphID, lastGlyphID);
     append_cmap_footer(&cmap);
diff --git a/tests/PDFGlyphsToUnicodeTest.cpp b/tests/PDFGlyphsToUnicodeTest.cpp
index d83ce664bc..3ba8870774 100644
--- a/tests/PDFGlyphsToUnicodeTest.cpp
+++ b/tests/PDFGlyphsToUnicodeTest.cpp
@@ -125,11 +125,11 @@ endbfchar\n";
 
     char expectedResultSingleBytes[] =
 "2 beginbfchar\n\
-<0001> <0000>\n\
-<0002> <0000>\n\
+<01> <0000>\n\
+<02> <0000>\n\
 endbfchar\n\
 1 beginbfrange\n\
-<0003> <0006> <1010>\n\
+<03> <06> <1010>\n\
 endbfrange\n";
 
     REPORTER_ASSERT(reporter, stream_equals(buffer, 0,