[PDF] Add a ToUnicode mapping for fonts.

This makes text in PDFs searchable and copy&paste-able. Code from arthurhsu@chromium.org. Original review: http://codereview.appspot.com/4428082/ Review URL: http://codereview.appspot.com/4525042 git-svn-id: http://skia.googlecode.com/svn/trunk@1280 2bbb7eff-a529-9590-31e7-b0007b416f81
2011-05-09 18:13:47 +00:00 · 2011-05-09 18:13:47 +00:00 · 6744d498fc
commit 6744d498fc
parent 339ac3d0a7
5 changed files with 236 additions and 4 deletions
--- a/include/core/SkAdvancedTypefaceMetrics.h
+++ b/include/core/SkAdvancedTypefaceMetrics.h
@ -82,6 +82,8 @@ public:
      kHAdvance_PerGlyphInfo   = 0x1, // Populate horizontal advance data.
      kVAdvance_PerGlyphInfo   = 0x2, // Populate vertical advance data.
      kGlyphNames_PerGlyphInfo = 0x4, // Populate glyph names (Type 1 only).
+      kToUnicode_PerGlyphInfo  = 0x8, // Populate ToUnicode table, ignored
+                                      // for Type 1 fonts
    };

    template <typename Data>
@ -113,6 +115,10 @@ public:

    // The names of each glyph, only populated for postscript fonts.
    SkTScopedPtr<SkAutoTArray<SkString> > fGlyphNames;
+
+    // The mapping from glyph to Unicode, only populated if
+    // kToUnicode_PerGlyphInfo is passed to GetAdvancedTypefaceMetrics.
+    SkTDArray<SkUnichar> fGlyphToUnicode;
 };

 namespace skia_advanced_typeface_metrics_utils {
--- a/include/pdf/SkPDFFont.h
+++ b/include/pdf/SkPDFFont.h
@ -130,6 +130,7 @@ private:
    */
    void populateType3Font(int16_t glyphID);
    bool addFontDescriptor(int16_t defaultWidth);
+    void populateToUnicodeTable();
    void addWidthInfoFromRange(int16_t defaultWidth,
        const SkAdvancedTypefaceMetrics::WidthRange* widthRangeEntry);
    /** Set fFirstGlyphID and fLastGlyphID to span at most 255 glyphs,
--- a/src/pdf/SkPDFFont.cpp
+++ b/src/pdf/SkPDFFont.cpp
@ -319,6 +319,92 @@ SkPDFArray* composeAdvanceData(

 }  // namespace

+static void append_tounicode_header(SkDynamicMemoryWStream* cmap) {
+    // 12 dict begin: 12 is an Adobe-suggested value. Shall not change.
+    // It's there to prevent old version Adobe Readers from malfunctioning.
+    const char* kHeader =
+        "/CIDInit /ProcSet findresource begin\n"
+        "12 dict begin\n"
+        "begincmap\n";
+    cmap->writeText(kHeader);
+
+    // The /CIDSystemInfo must be consistent to the one in
+    // SkPDFFont::populateCIDFont().
+    // We can not pass over the system info object here because the format is
+    // different. This is not a reference object.
+    const char* kSysInfo =
+        "/CIDSystemInfo\n"
+        "<<  /Registry (Adobe)\n"
+        "/Ordering (UCS)\n"
+        "/Supplement 0\n"
+        ">> def\n";
+    cmap->writeText(kSysInfo);
+
+    // The CMapName must be consistent to /CIDSystemInfo above.
+    // /CMapType 2 means ToUnicode.
+    // We specify codespacerange from 0x0000 to 0xFFFF because we convert our
+    // code table from unsigned short (16-bits). Codespace range just tells the
+    // PDF processor the valid range. It does not matter whether a complete
+    // mapping is provided or not.
+    const char* kTypeInfo =
+        "/CMapName /Adobe-Identity-UCS def\n"
+        "/CMapType 2 def\n"
+        "1 begincodespacerange\n"
+        "<0000> <FFFF>\n"
+        "endcodespacerange\n";
+    cmap->writeText(kTypeInfo);
+}
+
+static void append_cmap_bfchar_table(uint16_t* glyph_id, SkUnichar* unicode,
+                                     size_t count,
+                                     SkDynamicMemoryWStream* cmap) {
+    cmap->writeDecAsText(count);
+    cmap->writeText(" beginbfchar\n");
+    for (size_t i = 0; i < count; ++i) {
+        cmap->writeText("<");
+        cmap->writeHexAsText(glyph_id[i], 4);
+        cmap->writeText("> <");
+        cmap->writeHexAsText(unicode[i], 4);
+        cmap->writeText(">\n");
+    }
+    cmap->writeText("endbfchar\n");
+}
+
+static void append_cmap_footer(SkDynamicMemoryWStream* cmap) {
+    const char* kFooter =
+        "endcmap\n"
+        "CMapName currentdict /CMap defineresource pop\n"
+        "end\n"
+        "end";
+    cmap->writeText(kFooter);
+}
+
+// Generate <bfchar> table according to PDF spec 1.4 and Adobe Technote 5014.
+static void append_cmap_bfchar_sections(
+                const SkTDArray<SkUnichar>& glyphUnicode,
+                SkDynamicMemoryWStream* cmap) {
+    // PDF spec defines that every bf* list can have at most 100 entries.
+    const size_t kMaxEntries = 100;
+    uint16_t glyphId[kMaxEntries];
+    SkUnichar unicode[kMaxEntries];
+    size_t index = 0;
+    for (int i = 0; i < glyphUnicode.count(); i++) {
+        if (glyphUnicode[i]) {
+            glyphId[index] = i;
+            unicode[index] = glyphUnicode[i];
+            ++index;
+        }
+        if (index == kMaxEntries) {
+            append_cmap_bfchar_table(glyphId, unicode, index, cmap);
+            index = 0;
+        }
+    }
+
+    if (index) {
+        append_cmap_bfchar_table(glyphId, unicode, index, cmap);
+    }
+}
+
 /* Font subset design: It would be nice to be able to subset fonts
 * (particularly type 3 fonts), but it's a lot of work and not a priority.
 *
@ -404,9 +490,13 @@ SkPDFFont* SkPDFFont::getFontResource(SkTypeface* typeface, uint16_t glyphID) {
        fontInfo = relatedFont->fFontInfo;
        fontDescriptor = relatedFont->fDescriptor.get();
    } else {
-        fontInfo = SkFontHost::GetAdvancedTypefaceMetrics(fontID, SkTBitOr(
-                SkAdvancedTypefaceMetrics::kHAdvance_PerGlyphInfo,
-                SkAdvancedTypefaceMetrics::kGlyphNames_PerGlyphInfo));
+        SkAdvancedTypefaceMetrics::PerGlyphInfo info;
+        info = SkAdvancedTypefaceMetrics::kHAdvance_PerGlyphInfo;
+        info = SkTBitOr<SkAdvancedTypefaceMetrics::PerGlyphInfo>(
+                  info, SkAdvancedTypefaceMetrics::kGlyphNames_PerGlyphInfo);
+        info = SkTBitOr<SkAdvancedTypefaceMetrics::PerGlyphInfo>(
+                  info, SkAdvancedTypefaceMetrics::kToUnicode_PerGlyphInfo);
+        fontInfo = SkFontHost::GetAdvancedTypefaceMetrics(fontID, info);
        SkSafeUnref(fontInfo.get());  // SkRefPtr and Get both took a reference.
    }

@ -497,7 +587,6 @@ SkPDFFont::SkPDFFont(class SkAdvancedTypefaceMetrics* fontInfo,
 }

 void SkPDFFont::populateType0Font() {
-    // TODO(vandebo) add a ToUnicode mapping.
    fMultiByteGlyphs = true;

    insert("Subtype", new SkPDFName("Type0"))->unref();
@ -512,6 +601,26 @@ void SkPDFFont::populateType0Font() {
        new SkPDFFont(fFontInfo.get(), fTypeface.get(), 1, true, NULL));
    descendantFonts->append(new SkPDFObjRef(fResources.top()))->unref();
    insert("DescendantFonts", descendantFonts.get());
+
+    populateToUnicodeTable();
+}
+
+void SkPDFFont::populateToUnicodeTable() {
+    if (fFontInfo.get() == NULL ||
+        fFontInfo->fGlyphToUnicode.begin() == NULL) {
+        return;
+    }
+
+    SkDynamicMemoryWStream cmap;
+    append_tounicode_header(&cmap);
+    append_cmap_bfchar_sections(fFontInfo->fGlyphToUnicode, &cmap);
+    append_cmap_footer(&cmap);
+    SkRefPtr<SkMemoryStream> cmapStream = new SkMemoryStream();
+    cmapStream->unref();  // SkRefPtr and new took a reference.
+    cmapStream->setMemoryOwned(cmap.detach(), cmap.getOffset());
+    SkRefPtr<SkPDFStream> pdfCmap = new SkPDFStream(cmapStream.get());
+    fResources.push(pdfCmap.get());  // Pass reference from new.
+    insert("ToUnicode", new SkPDFObjRef(pdfCmap.get()))->unref();
 }

 void SkPDFFont::populateCIDFont() {
@ -522,6 +631,7 @@ void SkPDFFont::populateCIDFont() {
        insert("Subtype", new SkPDFName("CIDFontType0"))->unref();
    } else if (fFontInfo->fType == SkAdvancedTypefaceMetrics::kTrueType_Font) {
        insert("Subtype", new SkPDFName("CIDFontType2"))->unref();
+        insert("CIDToGIDMap", new SkPDFName("Identity"))->unref();
    } else {
        SkASSERT(false);
    }
@ -697,9 +807,12 @@ void SkPDFFont::populateType3Font(int16_t glyphID) {
    insert("FirstChar", new SkPDFInt(fFirstGlyphID))->unref();
    insert("LastChar", new SkPDFInt(fLastGlyphID))->unref();
    insert("Widths", widthArray.get());
+    insert("CIDToGIDMap", new SkPDFName("Identity"))->unref();

    if (fFontInfo && fFontInfo->fLastGlyphID <= 255)
        fFontInfo = NULL;
+
+    populateToUnicodeTable();
 }

 bool SkPDFFont::addFontDescriptor(int16_t defaultWidth) {
--- a/src/ports/SkFontHost_FreeType.cpp
+++ b/src/ports/SkFontHost_FreeType.cpp
@ -339,6 +339,56 @@ static bool getWidthAdvance(FT_Face face, int gId, int16_t* data) {
    return true;
 }

+static void populate_glyph_to_unicode(FT_Face& face,
+                                      SkTDArray<SkUnichar>* glyphToUnicode) {
+    // Check and see if we have Unicode cmaps.
+    for (int i = 0; i < face->num_charmaps; ++i) {
+        // CMaps known to support Unicode:
+        // Platform ID   Encoding ID   Name
+        // -----------   -----------   -----------------------------------
+        // 0             0,1           Apple Unicode
+        // 0             3             Apple Unicode 2.0 (preferred)
+        // 3             1             Microsoft Unicode UCS-2
+        // 3             10            Microsoft Unicode UCS-4 (preferred)
+        //
+        // See Apple TrueType Reference Manual
+        // http://developer.apple.com/fonts/TTRefMan/RM06/Chap6cmap.html
+        // http://developer.apple.com/fonts/TTRefMan/RM06/Chap6name.html#ID
+        // Microsoft OpenType Specification
+        // http://www.microsoft.com/typography/otspec/cmap.htm
+
+        FT_UShort platformId = face->charmaps[i]->platform_id;
+        FT_UShort encodingId = face->charmaps[i]->encoding_id;
+
+        if (platformId != 0 && platformId != 3) {
+            continue;
+        }
+        if (platformId == 3 && encodingId != 1 && encodingId != 10) {
+            continue;
+        }
+        bool preferredMap = ((platformId == 3 && encodingId == 10) ||
+                             (platformId == 0 && encodingId == 3));
+
+        FT_Set_Charmap(face, face->charmaps[i]);
+        if (glyphToUnicode->isEmpty()) {
+            glyphToUnicode->setCount(face->num_glyphs);
+            memset(glyphToUnicode->begin(), 0,
+                   sizeof(SkUnichar) * face->num_glyphs);
+        }
+
+        // Iterate through each cmap entry.
+        FT_UInt glyphIndex;
+        for (SkUnichar charCode = FT_Get_First_Char(face, &glyphIndex);
+             glyphIndex != 0;
+             charCode = FT_Get_Next_Char(face, charCode, &glyphIndex)) {
+            if (charCode &&
+                    ((*glyphToUnicode)[glyphIndex] == 0 || preferredMap)) {
+                (*glyphToUnicode)[glyphIndex] = charCode;
+            }
+        }
+    }
+}
+
 // static
 SkAdvancedTypefaceMetrics* SkFontHost::GetAdvancedTypefaceMetrics(
        uint32_t fontID,
@ -509,6 +559,12 @@ SkAdvancedTypefaceMetrics* SkFontHost::GetAdvancedTypefaceMetrics(
        }
    }

+    if (perGlyphInfo & SkAdvancedTypefaceMetrics::kToUnicode_PerGlyphInfo &&
+           info->fType != SkAdvancedTypefaceMetrics::kType1_Font &&
+           face->num_charmaps) {
+        populate_glyph_to_unicode(face, &(info->fGlyphToUnicode));
+    }
+
    if (!canEmbed(face))
        info->fType = SkAdvancedTypefaceMetrics::kNotEmbeddable_Font;

--- a/src/ports/SkFontHost_win.cpp
+++ b/src/ports/SkFontHost_win.cpp
@ -194,6 +194,58 @@ static void GetLogFontByID(SkFontID fontID, LOGFONT* lf) {
    }
 }

+// Construct Glyph to Unicode table.
+// Unicode code points that require conjugate pairs in utf16 are not
+// supported.
+// TODO(arthurhsu): Add support for conjugate pairs. It looks like that may
+// require parsing the TTF cmap table (platform 4, encoding 12) directly instead
+// of calling GetFontUnicodeRange().
+static void populate_glyph_to_unicode(HDC fontHdc, const unsigned glyphCount,
+                                      SkTDArray<SkUnichar>* glyphToUnicode) {
+    DWORD glyphSetBufferSize = GetFontUnicodeRanges(fontHdc, NULL);
+    if (!glyphSetBufferSize) {
+        return;
+    }
+
+    SkAutoTDeleteArray<BYTE> glyphSetBuffer(new BYTE[glyphSetBufferSize]);
+    GLYPHSET* glyphSet =
+        reinterpret_cast<LPGLYPHSET>(glyphSetBuffer.get());
+    if (GetFontUnicodeRanges(fontHdc, glyphSet) != glyphSetBufferSize) {
+        return;
+    }
+
+    glyphToUnicode->setCount(glyphCount);
+    memset(glyphToUnicode->begin(), 0, glyphCount * sizeof(SkUnichar));
+    for (DWORD i = 0; i < glyphSet->cRanges; ++i) {
+        // There is no guarantee that within a Unicode range, the corresponding
+        // glyph id in a font file are continuous. So, even if we have ranges,
+        // we can't just use the first and last entry of the range to compute
+        // result. We need to enumerate them one by one.
+        int count = glyphSet->ranges[i].cGlyphs;
+        SkAutoTArray<WCHAR> chars(count + 1);
+        chars[count] = 0;  // termintate string
+        SkAutoTArray<WORD> glyph(count);
+        for (USHORT j = 0; j < count; ++j) {
+            chars[j] = glyphSet->ranges[i].wcLow + j;
+        }
+        GetGlyphIndicesW(fontHdc, chars.get(), count, glyph.get(),
+                         GGI_MARK_NONEXISTING_GLYPHS);
+        // If the glyph ID is valid, and the glyph is not mapped, then we will
+        // fill in the char id into the vector. If the glyph is mapped already,
+        // skip it.
+        // TODO(arthurhsu): better improve this. e.g. Get all used char ids from
+        // font cache, then generate this mapping table from there. It's
+        // unlikely to have collisions since glyph reuse happens mostly for
+        // different Unicode pages.
+        for (USHORT j = 0; j < count; ++j) {
+            if (glyph[j] != 0xffff && glyph[j] < glyphCount &&
+                (*glyphToUnicode)[glyph[j]] == 0) {
+                (*glyphToUnicode)[glyph[j]] = chars[j];
+            }
+        }
+    }
+}
+
 //////////////////////////////////////////////////////////////////////////////////////////////

 class SkScalerContext_Windows : public SkScalerContext {
@ -649,6 +701,10 @@ SkAdvancedTypefaceMetrics* SkFontHost::GetAdvancedTypefaceMetrics(
    info->fFontName.set(lf.lfFaceName);
 #endif

+    if (perGlyphInfo & SkAdvancedTypefaceMetrics::kToUnicode_PerGlyphInfo) {
+        populate_glyph_to_unicode(hdc, glyphCount, &(info->fGlyphToUnicode));
+    }
+
    if (otm.otmTextMetrics.tmPitchAndFamily & TMPF_TRUETYPE) {
        info->fType = SkAdvancedTypefaceMetrics::kTrueType_Font;
    } else {