Merge pull request #14 from mdboom/fix-subsetting

mdboom · mdboom · commit 2ffb1d7a9938 · 2016-04-29T12:03:11.000-04:00
Fix subsetting
diff --git a/lib/freetypy/subset.py b/lib/freetypy/subset.py
@@ -38,6 +38,15 @@
 from __future__ import absolute_import, division, unicode_literals, print_function
 
 
+# The general approach here is to not change any glyph ids, merely to
+# remove content for unused glyphs.  This means the character map
+# tables don't have to be rewritten.  Additionally, this doesn't break
+# random third-party table formats that use glyph ids.  This does mean
+# that some space savings are left on the table, but for large Unicode
+# fonts, the glyph data itself comprises the majority of the file
+# size, and this approach tackles that handily.
+
+
 __all__ = ['subset_font']
 
 
@@ -211,12 +220,12 @@ def _get_formats(self, fontfile):
     def get_offsets(self, fontfile):
+        """
+        Decode this table's content into a list of offsets.
+
+        The entry format, entry size and scale factor come from
+        ``self._get_formats(fontfile)``.  Each ``entry_size``-byte
+        entry of the content is unpacked with ``entry_format`` and
+        its value multiplied by ``scale``.
+        """
         entry_format, entry_size, scale = self._get_formats(fontfile)
 
-        content = self._content
-        offsets = []
-        for i in range(0, len(content), entry_size):
-            value = struct.unpack(
+        content = self.content
+        offsets = [
+            struct.unpack(
                 entry_format, content[i:i+entry_size])[0] * scale
-            offsets.append(value)
+            for i in range(0, len(content), entry_size)
+        ]
 
         return offsets
 
@@ -232,25 +241,149 @@ def subset(self, fontfile, glyphs, offsets):
         new_offsets.append(offset)
 
         entry_format, entry_size, scale = self._get_formats(fontfile)
-        new_content = []
-        for value in new_offsets:
-            new_content.append(struct.pack(entry_format, value // scale))
-        self.content = b''.join(new_content)
+        self.content = b''.join(
+            struct.pack(entry_format, value // scale)
+            for value in new_offsets)
 
 
 class _GlyfTable(_Table):
+    def find_all_glyphs(self, glyphs, offsets):
+        """
+        Given a set of glyphs, find all glyphs, including the targets of
+        compound glyphs, that are needed to render the glyphs.
+
+        Parameters
+        ----------
+        glyphs : sequence of int
+            The glyph ids that were requested directly.
+
+        offsets : sequence of int
+            Byte offsets into this table's content; glyph ``i``
+            occupies ``content[offsets[i]:offsets[i+1]]``.
+
+        Returns
+        -------
+        all_glyphs : list of int
+            The sorted, de-duplicated ids of the requested glyphs and
+            of every component glyph they reference, directly or
+            indirectly.
+        """
+        ARG_1_AND_2_ARE_WORDS = 1 << 0
+        WE_HAVE_A_SCALE = 1 << 3
+        MORE_COMPONENTS = 1 << 5
+        WE_HAVE_AN_X_AND_Y_SCALE = 1 << 6
+        WE_HAVE_A_TWO_BY_TWO = 1 << 7
+
+        def calculate_skip(flags):
+            """
+            Calculates the number of bytes to skip to get to the next
+            component entry.
+            """
+            # flags (uint16) + glyph index (uint16)
+            nbytes = 4
+
+            # The two placement arguments are shorts or bytes,
+            # depending on flag bit
+            if flags & ARG_1_AND_2_ARE_WORDS:
+                nbytes += 4
+            else:
+                nbytes += 2
+
+            # The optional transform is always stored as 2-byte
+            # F2Dot14 values, regardless of the argument size
+            if flags & WE_HAVE_A_SCALE:
+                nbytes += 2
+            elif flags & WE_HAVE_AN_X_AND_Y_SCALE:
+                nbytes += 4
+            elif flags & WE_HAVE_A_TWO_BY_TWO:
+                nbytes += 8
+
+            return nbytes
+
+        content = self.content
+        all_glyphs = set()
+        # Traversal order doesn't matter (the result is sorted), so
+        # treat the list as a stack: pop() from the end is O(1),
+        # whereas pop(0) shifts every remaining element.
+        pending = list(glyphs)
+
+        while pending:
+            gind = pending.pop()
+            if gind in all_glyphs:
+                continue
+            all_glyphs.add(gind)
+
+            glyph = content[offsets[gind]:offsets[gind+1]]
+            if len(glyph) == 0:
+                # Glyph with no outline data; nothing to follow
+                continue
+
+            num_contours, = struct.unpack('>h', glyph[0:2])
+            if num_contours < 0:  # compound glyph
+                # skip over glyph header
+                i = 10
+                while True:
+                    flags, component_gind = struct.unpack('>HH', glyph[i:i+4])
+                    pending.append(component_gind)
+
+                    if not flags & MORE_COMPONENTS:
+                        break
+
+                    i += calculate_skip(flags)
+
+        return sorted(all_glyphs)
+
     def subset(self, glyphs, offsets):
+        """
+        Replace this table's content with the data of only the given
+        glyphs, concatenated in the order given.
+
+        Glyph ``i`` is taken from ``content[offsets[i]:offsets[i+1]]``
+        of the current content.
+        """
         content = self.content
-        new_content = []
+
+        self.content = b''.join(
+            content[offsets[gind]:offsets[gind+1]]
+            for gind in glyphs)
+
+
+class _PostTable(_Table):
+    """
+    The ``post`` table.  Only the glyph names of a format 2 table are
+    handled here.
+    """
+    post_table_struct = _BinaryStruct([
+        ('format', 'I')])
+
+    def __init__(self, header, content):
+        super(_PostTable, self).__init__(header, content)
+
+        # Make the fields of post_table_struct (``format``) available
+        # as attributes
+        self.__dict__.update(self.post_table_struct.unpack(content[:4]))
+
+    def _subset_format2(self, glyphs):
+        """
+        Build new content for a format 2 table that keeps the names
+        of only the given glyphs.
+
+        Glyph ids are not changed: every glyph keeps its slot in the
+        name index array, and glyphs that are not kept are pointed at
+        name index 0.
+
+        Parameters
+        ----------
+        glyphs : list of int
+            The glyph ids whose names should be kept.
+
+        Returns
+        -------
+        content : bytes
+            The rebuilt table content.
+        """
+        N_BASIC_NAMES = 258
+
+        content = self._content
+        # The glyph count is stored at byte offset 32, directly
+        # followed by one 2-byte name index per glyph
+        i = 32
+
+        numglyphs, = struct.unpack('>H', content[i:i+2])
+        i += 2
+
+        new_glyph_index = {}
+        needed_indices = {}
         for gind in glyphs:
-            new_content.append(content[offsets[gind]:offsets[gind+1]])
-        self.content = b''.join(new_content)
+            if gind < numglyphs:
+                offset = i + 2 * gind
+                name_index, = struct.unpack('>H', content[offset:offset+2])
+                if name_index < N_BASIC_NAMES:
+                    new_glyph_index[gind] = name_index
+                else:
+                    # Indices past the basic names refer to the
+                    # strings stored at the end of the table.  An
+                    # index with no matching string is simply never
+                    # found below, leaving the glyph at name index 0.
+                    # NOTE(review): glyphs sharing one custom name
+                    # index overwrite each other here -- confirm
+                    # whether that case needs handling.
+                    needed_indices[name_index - N_BASIC_NAMES] = gind
+
+        names = []
+        name_index = 0
+        i += 2 * numglyphs
+        while i < len(content):
+            name_length, = struct.unpack('>B', content[i:i+1])
+            i += 1
+            if name_index in needed_indices:
+                name = content[i:i+name_length]
+                new_glyph_index[needed_indices[name_index]] = (
+                    len(names) + N_BASIC_NAMES)
+                names.append(name)
+            i += name_length
+            name_index += 1
+
+        # Keep everything up to and including the glyph count (34
+        # bytes); the name index array is rewritten in full below
+        new_content = [content[0:34]]
+        for i in range(numglyphs):
+            val = new_glyph_index.get(i, 0)
+            new_content.append(struct.pack('>H', val))
+
+        for name in names:
+            new_content.append(struct.pack('>B', len(name)))
+            new_content.append(name)
+
+        return b''.join(new_content)
+
+    def subset(self, glyphs):
+        """
+        Subset the glyph names of a format 2 table to the given
+        glyphs.
+        """
+        # NOTE(review): the ``and False`` keeps format 2 subsetting
+        # switched off, so this is currently a no-op -- confirm
+        # before enabling
+        if self.format == 0x20000 and False:
+            self.content = self._subset_format2(glyphs)
 
 
+# Maps a table tag to the class used to wrap that table when a font
+# is read; any tag not listed here is wrapped in the generic _Table.
 SPECIAL_TABLES = {
     b'head': _HeadTable,
     b'loca': _LocaTable,
-    b'glyf': _GlyfTable
+    b'glyf': _GlyfTable,
+    b'post': _PostTable
 }
 
 
@@ -279,7 +412,7 @@ def __hasitem__(self, tag):
         return tag in self._tables
 
     @classmethod
-    def read(cls, fd):
+    def read(cls, fd, tables_to_remove=[]):
         header = cls.header_struct.read(fd)
 
         if header['version'] not in UNDERSTOOD_VERSIONS:
@@ -293,6 +426,8 @@ def read(cls, fd):
         for table_header in table_dir:
             fd.seek(table_header['offset'])
             content = fd.read(table_header['length'])
+            if table_header['tag'] in tables_to_remove:
+                continue
             table_cls = SPECIAL_TABLES.get(table_header['tag'], _Table)
             tables[table_header['tag']] = table_cls(table_header, content)
 
@@ -311,13 +446,20 @@ def subset(self, ccodes):
         glyphs = [0]
         for ccode in ccodes:
             glyphs.append(self._face.get_char_index_unicode(ccode))
-        glyphs.sort()
 
         offsets = self[b'loca'].get_offsets(self)
+        # Find all glyphs used, including components of compound
+        # glyphs
+        glyphs = self[b'glyf'].find_all_glyphs(glyphs, offsets)
+
         self[b'glyf'].subset(glyphs, offsets)
         self[b'loca'].subset(self, glyphs, offsets)
+        if b'post' in self._tables:
+            self[b'post'].subset(glyphs)
 
     def write(self, fd):
+        self._header['numTables'] = len(self._tables)
+
         self.header_struct.write(fd, self._header)
 
         offset = (self.header_struct.size +
@@ -334,7 +476,7 @@ def write(self, fd):
             fd.write(table._content)
 
 
-def subset_font(input_fd, output_fd, charcodes):
+def subset_font(input_fd, output_fd, charcodes, tables_to_remove=None):
     """
     Subset a SFNT-style (TrueType or OpenType) font.
 
@@ -350,7 +492,16 @@ def subset_font(input_fd, output_fd, charcodes):
 
     charcodes : list of int or unicode string
         The character codes to include in the output font file.
+
+    tables_to_remove : list of bytes, optional
+        The tags of tables to remove completely.  If not provided,
+        this defaults to:
+
+           [b'GPOS', b'GSUB']
     """
-    fontfile = _FontFile.read(input_fd)
+    if tables_to_remove is None:
+        tables_to_remove = [b'GPOS', b'GSUB']
+
+    fontfile = _FontFile.read(input_fd, tables_to_remove)
     fontfile.subset(charcodes)
     fontfile.write(output_fd)
diff --git a/src/freetypy.c b/src/freetypy.c
@@ -61,7 +61,7 @@ either expressed or implied, of the FreeBSD Project.
 
 #include "doc/lcd.h"
 
-#include "freetype/ftlcdfil.h"
+#include FT_LCD_FILTER_H
 
 static FT_Library ft_library;
 
@@ -94,7 +94,7 @@ py_set_lcd_filter(PyObject *self, PyObject *args, PyObject *kwargs)
 PyObject *
 py_set_lcd_filter_weights(PyObject *self, PyObject *args, PyObject *kwargs)
 {
-    char filters[5];
+    unsigned char filters[5];
 
     static char *kwlist[] = {"filter", NULL};
 
diff --git a/src/lcd.c b/src/lcd.c
@@ -30,7 +30,7 @@ either expressed or implied, of the FreeBSD Project.
 #include "lcd.h"
 #include "doc/lcd.h"
 
-#include "freetype/ftlcdfil.h"
+#include FT_LCD_FILTER_H
 
 ftpy_ConstantType Py_FT_LCD_FILTER_ConstantType;
 static PyTypeObject Py_FT_LCD_FILTER_Type;
@@ -46,7 +46,7 @@ static constant_def FT_LCD_FILTER_constants[] = {
 
 int setup_Lcd(PyObject *m)
 {
-    define_constant_namespace(
+    return define_constant_namespace(
           m, &Py_FT_LCD_FILTER_Type, &Py_FT_LCD_FILTER_ConstantType,
           "freetypy.LCD_FILTER",
           doc_LCD_FILTER, FT_LCD_FILTER_constants);