summaryrefslogtreecommitdiff
path: root/fs/unicode
diff options
context:
space:
mode:
authorMasahiro Yamada <yamada.masahiro@socionext.com>2019-04-28 13:45:36 -0400
committerTheodore Ts'o <tytso@mit.edu>2019-04-28 13:45:36 -0400
commit28ba53c07638f31b153e3a32672a6124d0ff2a97 (patch)
treed89dfc7e3e9661b15d98e5192433121f5ac081c2 /fs/unicode
parent0a790fe4389d88253563c5e22bea47e6d357b525 (diff)
downloadlinux-28ba53c07638f31b153e3a32672a6124d0ff2a97.tar.gz
linux-28ba53c07638f31b153e3a32672a6124d0ff2a97.tar.bz2
linux-28ba53c07638f31b153e3a32672a6124d0ff2a97.zip
unicode: refactor the rule for regenerating utf8data.h
scripts/mkutf8data is used only when regenerating utf8data.h, which never happens in the normal kernel build. However, it is irrespectively built if CONFIG_UNICODE is enabled. Moreover, there is no good reason for it to reside in the scripts/ directory since it is only used in fs/unicode/. Hence, move it from scripts/ to fs/unicode/. In some cases, we bypass build artifacts in the normal build. The conventional way to do so is to surround the code with ifdef REGENERATE_*. For example, - 7373f4f83c71 ("kbuild: add implicit rules for parser generation") - 6aaf49b495b4 ("crypto: arm,arm64 - Fix random regeneration of S_shipped") I rewrote the rule in a more kbuild'ish style. In the normal build, utf8data.h is just shipped from the check-in file. $ make [ snip ] SHIPPED fs/unicode/utf8data.h CC fs/unicode/utf8-norm.o CC fs/unicode/utf8-core.o CC fs/unicode/utf8-selftest.o AR fs/unicode/built-in.a If you want to generate utf8data.h based on UCD, put *.txt files into fs/unicode/, then pass REGENERATE_UTF8DATA=1 from the command line. The mkutf8data tool will be automatically compiled to generate the utf8data.h from the *.txt files. $ make REGENERATE_UTF8DATA=1 [ snip ] HOSTCC fs/unicode/mkutf8data GEN fs/unicode/utf8data.h CC fs/unicode/utf8-norm.o CC fs/unicode/utf8-core.o CC fs/unicode/utf8-selftest.o AR fs/unicode/built-in.a I renamed the check-in utf8data.h to utf8data.h_shipped so that this will work for the out-of-tree build. You can update it based on the latest UCD like this: $ make REGENERATE_UTF8DATA=1 fs/unicode/ $ cp fs/unicode/utf8data.h fs/unicode/utf8data.h_shipped Also, I added entries to .gitignore and dontdiff. Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Diffstat (limited to 'fs/unicode')
-rw-r--r--fs/unicode/.gitignore2
-rw-r--r--fs/unicode/Makefile41
-rw-r--r--fs/unicode/README.utf8data9
-rw-r--r--fs/unicode/mkutf8data.c3419
-rw-r--r--fs/unicode/utf8data.h_shipped (renamed from fs/unicode/utf8data.h)0
5 files changed, 3455 insertions, 16 deletions
diff --git a/fs/unicode/.gitignore b/fs/unicode/.gitignore
new file mode 100644
index 000000000000..0381e2221480
--- /dev/null
+++ b/fs/unicode/.gitignore
@@ -0,0 +1,2 @@
+mkutf8data
+utf8data.h
diff --git a/fs/unicode/Makefile b/fs/unicode/Makefile
index 671d31f83006..d46e9baee285 100644
--- a/fs/unicode/Makefile
+++ b/fs/unicode/Makefile
@@ -5,15 +5,34 @@ obj-$(CONFIG_UNICODE_NORMALIZATION_SELFTEST) += utf8-selftest.o
unicode-y := utf8-norm.o utf8-core.o
-# This rule is not invoked during the kernel compilation. It is used to
-# regenerate the utf8data.h header file.
-utf8data.h.new: *.txt $(objdir)/scripts/mkutf8data
- $(objdir)/scripts/mkutf8data \
- -a DerivedAge.txt \
- -c DerivedCombiningClass.txt \
- -p DerivedCoreProperties.txt \
- -d UnicodeData.txt \
- -f CaseFolding.txt \
- -n NormalizationCorrections.txt \
- -t NormalizationTest.txt \
+$(obj)/utf8-norm.o: $(obj)/utf8data.h
+
+# In the normal build, the checked-in utf8data.h is just shipped.
+#
+# To generate utf8data.h from UCD, put *.txt files in this directory
+# and pass REGENERATE_UTF8DATA=1 from the command line.
+ifdef REGENERATE_UTF8DATA
+
+quiet_cmd_utf8data = GEN $@
+ cmd_utf8data = $< \
+ -a $(srctree)/$(src)/DerivedAge.txt \
+ -c $(srctree)/$(src)/DerivedCombiningClass.txt \
+ -p $(srctree)/$(src)/DerivedCoreProperties.txt \
+ -d $(srctree)/$(src)/UnicodeData.txt \
+ -f $(srctree)/$(src)/CaseFolding.txt \
+ -n $(srctree)/$(src)/NormalizationCorrections.txt \
+ -t $(srctree)/$(src)/NormalizationTest.txt \
-o $@
+
+$(obj)/utf8data.h: $(obj)/mkutf8data $(filter %.txt, $(cmd_utf8data)) FORCE
+ $(call if_changed,utf8data)
+
+else
+
+$(obj)/utf8data.h: $(src)/utf8data.h_shipped FORCE
+ $(call if_changed,shipped)
+
+endif
+
+targets += utf8data.h
+hostprogs-y += mkutf8data
diff --git a/fs/unicode/README.utf8data b/fs/unicode/README.utf8data
index dd56ef50c5d5..9307cf0727de 100644
--- a/fs/unicode/README.utf8data
+++ b/fs/unicode/README.utf8data
@@ -55,15 +55,14 @@ released version of the UCD can be found here:
http://www.unicode.org/Public/UCD/latest/
-To build the utf8data.h file, from a kernel tree that has been built,
-cd to this directory (fs/unicode) and run this command:
+Then, build under fs/unicode/ with REGENERATE_UTF8DATA=1:
- make C=../.. objdir=../.. utf8data.h.new
+ make REGENERATE_UTF8DATA=1 fs/unicode/
-After sanity checking the newly generated utf8data.h.new file (the
+After sanity checking the newly generated utf8data.h file (the
version generated from the 12.1.0 UCD should be 4,109 lines long, and
have a total size of 324k) and/or comparing it with the older version
-of utf8data.h, rename it to utf8data.h.
+of utf8data.h_shipped, rename it to utf8data.h_shipped.
If you are a kernel developer updating to a newer version of the
Unicode Character Database, please update this README.utf8data file
diff --git a/fs/unicode/mkutf8data.c b/fs/unicode/mkutf8data.c
new file mode 100644
index 000000000000..ff2025ac5a32
--- /dev/null
+++ b/fs/unicode/mkutf8data.c
@@ -0,0 +1,3419 @@
+/*
+ * Copyright (c) 2014 SGI.
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/* Generator for a compact trie for unicode normalization */
+
+#include <sys/types.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+
+/* Default names of the in- and output files. */
+
+#define AGE_NAME "DerivedAge.txt"
+#define CCC_NAME "DerivedCombiningClass.txt"
+#define PROP_NAME "DerivedCoreProperties.txt"
+#define DATA_NAME "UnicodeData.txt"
+#define FOLD_NAME "CaseFolding.txt"
+#define NORM_NAME "NormalizationCorrections.txt"
+#define TEST_NAME "NormalizationTest.txt"
+#define UTF8_NAME "utf8data.h"
+
+const char *age_name = AGE_NAME;
+const char *ccc_name = CCC_NAME;
+const char *prop_name = PROP_NAME;
+const char *data_name = DATA_NAME;
+const char *fold_name = FOLD_NAME;
+const char *norm_name = NORM_NAME;
+const char *test_name = TEST_NAME;
+const char *utf8_name = UTF8_NAME;
+
+int verbose = 0;
+
+/* An arbitrary line size limit on input lines. */
+
+#define LINESIZE 1024
+char line[LINESIZE];
+char buf0[LINESIZE];
+char buf1[LINESIZE];
+char buf2[LINESIZE];
+char buf3[LINESIZE];
+
+const char *argv0;
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+
+/* ------------------------------------------------------------------ */
+
+/*
+ * Unicode version numbers consist of three parts: major, minor, and a
+ * revision. These numbers are packed into an unsigned int to obtain
+ * a single version number.
+ *
+ * To save space in the generated trie, the unicode version is not
+ * stored directly, instead we calculate a generation number from the
+ * unicode versions seen in the DerivedAge file, and use that as an
+ * index into a table of unicode versions.
+ */
+#define UNICODE_MAJ_SHIFT (16)
+#define UNICODE_MIN_SHIFT (8)
+
+#define UNICODE_MAJ_MAX ((unsigned short)-1)
+#define UNICODE_MIN_MAX ((unsigned char)-1)
+#define UNICODE_REV_MAX ((unsigned char)-1)
+
+#define UNICODE_AGE(MAJ,MIN,REV) \
+ (((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) | \
+ ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) | \
+ ((unsigned int)(REV)))
+
+unsigned int *ages;
+int ages_count;
+
+unsigned int unicode_maxage;
+
+static int age_valid(unsigned int major, unsigned int minor,
+ unsigned int revision)
+{
+ if (major > UNICODE_MAJ_MAX)
+ return 0;
+ if (minor > UNICODE_MIN_MAX)
+ return 0;
+ if (revision > UNICODE_REV_MAX)
+ return 0;
+ return 1;
+}
+
+/* ------------------------------------------------------------------ */
+
+/*
+ * utf8trie_t
+ *
+ * A compact binary tree, used to decode UTF-8 characters.
+ *
+ * Internal nodes are one byte for the node itself, and up to three
+ * bytes for an offset into the tree. The first byte contains the
+ * following information:
+ * NEXTBYTE - flag - advance to next byte if set
+ * BITNUM - 3 bit field - the bit number to tested
+ * OFFLEN - 2 bit field - number of bytes in the offset
+ * if offlen == 0 (non-branching node)
+ * RIGHTPATH - 1 bit field - set if the following node is for the
+ * right-hand path (tested bit is set)
+ * TRIENODE - 1 bit field - set if the following node is an internal
+ * node, otherwise it is a leaf node
+ * if offlen != 0 (branching node)
+ * LEFTNODE - 1 bit field - set if the left-hand node is internal
+ * RIGHTNODE - 1 bit field - set if the right-hand node is internal
+ *
+ * Due to the way utf8 works, there cannot be branching nodes with
+ * NEXTBYTE set, and moreover those nodes always have a righthand
+ * descendant.
+ */
+typedef unsigned char utf8trie_t;
+#define BITNUM 0x07
+#define NEXTBYTE 0x08
+#define OFFLEN 0x30
+#define OFFLEN_SHIFT 4
+#define RIGHTPATH 0x40
+#define TRIENODE 0x80
+#define RIGHTNODE 0x40
+#define LEFTNODE 0x80
+
+/*
+ * utf8leaf_t
+ *
+ * The leaves of the trie are embedded in the trie, and so the same
+ * underlying datatype, unsigned char.
+ *
+ * leaf[0]: The unicode version, stored as a generation number that is
+ * an index into utf8agetab[]. With this we can filter code
+ * points based on the unicode version in which they were
+ * defined. The CCC of a non-defined code point is 0.
+ * leaf[1]: Canonical Combining Class. During normalization, we need
+ * to do a stable sort into ascending order of all characters
+ * with a non-zero CCC that occur between two characters with
+ * a CCC of 0, or at the begin or end of a string.
+ * The unicode standard guarantees that all CCC values are
+ * between 0 and 254 inclusive, which leaves 255 available as
+ * a special value.
+ * Code points with CCC 0 are known as stoppers.
+ * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
+ * start of a NUL-terminated string that is the decomposition
+ * of the character.
+ * The CCC of a decomposable character is the same as the CCC
+ * of the first character of its decomposition.
+ * Some characters decompose as the empty string: these are
+ * characters with the Default_Ignorable_Code_Point property.
+ * These do affect normalization, as they all have CCC 0.
+ *
+ * The decompositions in the trie have been fully expanded.
+ *
+ * Casefolding, if applicable, is also done using decompositions.
+ */
+typedef unsigned char utf8leaf_t;
+
+#define LEAF_GEN(LEAF) ((LEAF)[0])
+#define LEAF_CCC(LEAF) ((LEAF)[1])
+#define LEAF_STR(LEAF) ((const char*)((LEAF) + 2))
+
+#define MAXGEN (255)
+
+#define MINCCC (0)
+#define MAXCCC (254)
+#define STOPPER (0)
+#define DECOMPOSE (255)
+#define HANGUL ((char)(255))
+
+#define UTF8HANGULLEAF (12)
+
+struct tree;
+static utf8leaf_t *utf8nlookup(struct tree *, unsigned char *,
+ const char *, size_t);
+static utf8leaf_t *utf8lookup(struct tree *, unsigned char *, const char *);
+
+unsigned char *utf8data;
+size_t utf8data_size;
+
+utf8trie_t *nfdi;
+utf8trie_t *nfdicf;
+
+/* ------------------------------------------------------------------ */
+
+/*
+ * UTF8 valid ranges.
+ *
+ * The UTF-8 encoding spreads the bits of a 32bit word over several
+ * bytes. This table gives the ranges that can be held and how they'd
+ * be represented.
+ *
+ * 0x00000000 0x0000007F: 0xxxxxxx
+ * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
+ * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
+ * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ *
+ * There is an additional requirement on UTF-8, in that only the
+ * shortest representation of a 32bit value is to be used. A decoder
+ * must not decode sequences that do not satisfy this requirement.
+ * Thus the allowed ranges have a lower bound.
+ *
+ * 0x00000000 0x0000007F: 0xxxxxxx
+ * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
+ * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
+ * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ *
+ * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
+ * 17 planes of 65536 values. This limits the sequences actually seen
+ * even more, to just the following.
+ *
+ * 0 - 0x7f: 0 0x7f
+ * 0x80 - 0x7ff: 0xc2 0x80 0xdf 0xbf
+ * 0x800 - 0xffff: 0xe0 0xa0 0x80 0xef 0xbf 0xbf
+ * 0x10000 - 0x10ffff: 0xf0 0x90 0x80 0x80 0xf4 0x8f 0xbf 0xbf
+ *
+ * Even within those ranges not all values are allowed: the surrogates
+ * 0xd800 - 0xdfff should never be seen.
+ *
+ * Note that the longest sequence seen with valid usage is 4 bytes,
+ * the same a single UTF-32 character. This makes the UTF-8
+ * representation of Unicode strictly smaller than UTF-32.
+ *
+ * The shortest sequence requirement was introduced by:
+ * Corrigendum #1: UTF-8 Shortest Form
+ * It can be found here:
+ * http://www.unicode.org/versions/corrigendum1.html
+ *
+ */
+
+#define UTF8_2_BITS 0xC0
+#define UTF8_3_BITS 0xE0
+#define UTF8_4_BITS 0xF0
+#define UTF8_N_BITS 0x80
+#define UTF8_2_MASK 0xE0
+#define UTF8_3_MASK 0xF0
+#define UTF8_4_MASK 0xF8
+#define UTF8_N_MASK 0xC0
+#define UTF8_V_MASK 0x3F
+#define UTF8_V_SHIFT 6
+
+static int utf8encode(char *str, unsigned int val)
+{
+ int len;
+
+ if (val < 0x80) {
+ str[0] = val;
+ len = 1;
+ } else if (val < 0x800) {
+ str[1] = val & UTF8_V_MASK;
+ str[1] |= UTF8_N_BITS;
+ val >>= UTF8_V_SHIFT;
+ str[0] = val;
+ str[0] |= UTF8_2_BITS;
+ len = 2;
+ } else if (val < 0x10000) {
+ str[2] = val & UTF8_V_MASK;
+ str[2] |= UTF8_N_BITS;
+ val >>= UTF8_V_SHIFT;
+ str[1] = val & UTF8_V_MASK;
+ str[1] |= UTF8_N_BITS;
+ val >>= UTF8_V_SHIFT;
+ str[0] = val;
+ str[0] |= UTF8_3_BITS;
+ len = 3;
+ } else if (val < 0x110000) {
+ str[3] = val & UTF8_V_MASK;
+ str[3] |= UTF8_N_BITS;
+ val >>= UTF8_V_SHIFT;
+ str[2] = val & UTF8_V_MASK;
+ str[2] |= UTF8_N_BITS;
+ val >>= UTF8_V_SHIFT;
+ str[1] = val & UTF8_V_MASK;
+ str[1] |= UTF8_N_BITS;
+ val >>= UTF8_V_SHIFT;
+ str[0] = val;
+ str[0] |= UTF8_4_BITS;
+ len = 4;
+ } else {
+ printf("%#x: illegal val\n", val);
+ len = 0;
+ }
+ return len;
+}
+
+static unsigned int utf8decode(const char *str)
+{
+ const unsigned char *s = (const unsigned char*)str;
+ unsigned int unichar = 0;
+
+ if (*s < 0x80) {
+ unichar = *s;
+ } else if (*s < UTF8_3_BITS) {
+ unichar = *s++ & 0x1F;
+ unichar <<= UTF8_V_SHIFT;
+ unichar |= *s & 0x3F;
+ } else if (*s < UTF8_4_BITS) {
+ unichar = *s++ & 0x0F;
+ unichar <<= UTF8_V_SHIFT;
+ unichar |= *s++ & 0x3F;
+ unichar <<= UTF8_V_SHIFT;
+ unichar |= *s & 0x3F;
+ } else {
+ unichar = *s++ & 0x0F;
+ unichar <<= UTF8_V_SHIFT;
+ unichar |= *s++ & 0x3F;
+ unichar <<= UTF8_V_SHIFT;
+ unichar |= *s++ & 0x3F;
+ unichar <<= UTF8_V_SHIFT;
+ unichar |= *s & 0x3F;
+ }
+ return unichar;
+}
+
+static int utf32valid(unsigned int unichar)
+{
+ return unichar < 0x110000;
+}
+
+#define HANGUL_SYLLABLE(U) ((U) >= 0xAC00 && (U) <= 0xD7A3)
+
+#define NODE 1
+#define LEAF 0
+
+struct tree {
+ void *root;
+ int childnode;
+ const char *type;
+ unsigned int maxage;
+ struct tree *next;
+ int (*leaf_equal)(void *, void *);
+ void (*leaf_print)(void *, int);
+ int (*leaf_mark)(void *);
+ int (*leaf_size)(void *);
+ int *(*leaf_index)(struct tree *, void *);
+ unsigned char *(*leaf_emit)(void *, unsigned char *);
+ int leafindex[0x110000];
+ int index;
+};
+
+struct node {
+ int index;
+ int offset;
+ int mark;
+ int size;
+ struct node *parent;
+ void *left;
+ void *right;
+ unsigned char bitnum;
+ unsigned char nextbyte;
+ unsigned char leftnode;
+ unsigned char rightnode;
+ unsigned int keybits;
+ unsigned int keymask;
+};
+
+/*
+ * Example lookup function for a tree.
+ */
+static void *lookup(struct tree *tree, const char *key)
+{
+ struct node *node;
+ void *leaf = NULL;
+
+ node = tree->root;
+ while (!leaf && node) {
+ if (node->nextbyte)
+ key++;
+ if (*key & (1 << (node->bitnum & 7))) {
+ /* Right leg */
+ if (node->rightnode == NODE) {
+ node = node->right;
+ } else if (node->rightnode == LEAF) {
+ leaf = node->right;
+ } else {
+ node = NULL;
+ }
+ } else {
+ /* Left leg */
+ if (node->leftnode == NODE) {
+ node = node->left;
+ } else if (node->leftnode == LEAF) {
+ leaf = node->left;
+ } else {
+ node = NULL;
+ }
+ }
+ }
+
+ return leaf;
+}
+
+/*
+ * A simple non-recursive tree walker: keep track of visits to the
+ * left and right branches in the leftmask and rightmask.
+ */
+static void tree_walk(struct tree *tree)
+{
+ struct node *node;
+ unsigned int leftmask;
+ unsigned int rightmask;
+ unsigned int bitmask;
+ int indent = 1;
+ int nodes, singletons, leaves;
+
+ nodes = singletons = leaves = 0;
+
+ printf("%s_%x root %p\n", tree->type, tree->maxage, tree->root);
+ if (tree->childnode == LEAF) {
+ assert(tree->root);
+ tree->leaf_print(tree->root, indent);
+ leaves = 1;
+ } else {
+ assert(tree->childnode == NODE);
+ node = tree->root;
+ leftmask = rightmask = 0;
+ while (node) {
+ printf("%*snode @ %p bitnum %d nextbyte %d"
+ " left %p right %p mask %x bits %x\n",
+ indent, "", node,
+ node->bitnum, node->nextbyte,
+ node->left, node->right,
+ node->keymask, node->keybits);
+ nodes += 1;
+ if (!(node->left && node->right))
+ singletons += 1;
+
+ while (node) {
+ bitmask = 1 << node->bitnum;
+ if ((leftmask & bitmask) == 0) {
+ leftmask |= bitmask;
+ if (node->leftnode == LEAF) {
+ assert(node->left);
+ tree->leaf_print(node->left,
+ indent+1);
+ leaves += 1;
+ } else if (node->left) {
+ assert(node->leftnode == NODE);
+ indent += 1;
+ node = node->left;
+ break;
+ }
+ }
+ if ((rightmask & bitmask) == 0) {
+ rightmask |= bitmask;
+ if (node->rightnode == LEAF) {
+ assert(node->right);
+ tree->leaf_print(node->right,
+ indent+1);
+ leaves += 1;
+ } else if (node->right) {
+ assert(node->rightnode == NODE);
+ indent += 1;
+ node = node->right;
+ break;
+ }
+ }
+ leftmask &= ~bitmask;
+ rightmask &= ~bitmask;
+ node = node->parent;
+ indent -= 1;
+ }
+ }
+ }
+ printf("nodes %d leaves %d singletons %d\n",
+ nodes, leaves, singletons);
+}
+
+/*
+ * Allocate an initialize a new internal node.
+ */
+static struct node *alloc_node(struct node *parent)
+{
+ struct node *node;
+ int bitnum;
+
+ node = malloc(sizeof(*node));
+ node->left = node->right = NULL;
+ node->parent = parent;
+ node->leftnode = NODE;
+ node->rightnode = NODE;
+ node->keybits = 0;
+ node->keymask = 0;
+ node->mark = 0;
+ node->index = 0;
+ node->offset = -1;
+ node->size = 4;
+
+ if (node->parent) {
+ bitnum = parent->bitnum;
+ if ((bitnum & 7) == 0) {
+ node->bitnum = bitnum + 7 + 8;
+ node->nextbyte = 1;
+ } else {
+ node->bitnum = bitnum - 1;
+ node->nextbyte = 0;
+ }
+ } else {
+ node->bitnum = 7;
+ node->nextbyte = 0;
+ }
+
+ return node;
+}
+
+/*
+ * Insert a new leaf into the tree, and collapse any subtrees that are
+ * fully populated and end in identical leaves. A nextbyte tagged
+ * internal node will not be removed to preserve the tree's integrity.
+ * Note that due to the structure of utf8, no nextbyte tagged node
+ * will be a candidate for removal.
+ */
+static int insert(struct tree *tree, char *key, int keylen, void *leaf)
+{
+ struct node *node;
+ struct node *parent;
+ void **cursor;
+ int keybits;
+
+ assert(keylen >= 1 && keylen <= 4);
+
+ node = NULL;
+ cursor = &tree->root;
+ keybits = 8 * keylen;
+
+ /* Insert, creating path along the way. */
+ while (keybits) {
+ if (!*cursor)
+ *cursor = alloc_node(node);
+ node = *cursor;
+ if (node->nextbyte)
+ key++;
+ if (*key & (1 << (node->bitnum & 7)))
+ cursor = &node->right;
+ else
+ cursor = &node->left;
+ keybits--;
+ }
+ *cursor = leaf;
+
+ /* Merge subtrees if possible. */
+ while (node) {
+ if (*key & (1 << (node->bitnum & 7)))
+ node->rightnode = LEAF;
+ else
+ node->leftnode = LEAF;
+ if (node->nextbyte)
+ break;
+ if (node->leftnode == NODE || node->rightnode == NODE)
+ break;
+ assert(node->left);
+ assert(node->right);
+ /* Compare */
+ if (! tree->leaf_equal(node->left, node->right))
+ break;
+ /* Keep left, drop right leaf. */
+ leaf = node->left;
+ /* Check in parent */
+ parent = node->parent;
+ if (!parent) {
+ /* root of tree! */
+ tree->root = leaf;
+ tree->childnode = LEAF;
+ } else if (parent->left == node) {
+ parent->left = leaf;
+ parent->leftnode = LEAF;
+ if (parent->right) {
+ parent->keymask = 0;
+ parent->keybits = 0;
+ } else {
+ parent->keymask |= (1 << node->bitnum);
+ }
+ } else if (parent->right == node) {
+ parent->right = leaf;
+ parent->rightnode = LEAF;
+ if (parent->left) {
+ parent->keymask = 0;
+ parent->keybits = 0;
+ } else {
+ parent->keymask |= (1 << node->bitnum);
+ parent->keybits |= (1 << node->bitnum);
+ }
+ } else {
+ /* internal tree error */
+ assert(0);
+ }
+ free(node);
+ node = parent;
+ }
+
+ /* Propagate keymasks up along singleton chains. */
+ while (node) {
+ parent = node->parent;
+ if (!parent)
+ break;
+ /* Nix the mask for parents with two children. */
+ if (node->keymask == 0) {
+ parent->keymask = 0;
+ parent->keybits = 0;
+ } else if (parent->left && parent->right) {
+ parent->keymask = 0;
+ parent->keybits = 0;
+ } else {
+ assert((parent->keymask & node->keymask) == 0);
+ parent->keymask |= node->keymask;
+ parent->keymask |= (1 << parent->bitnum);
+ parent->keybits |= node->keybits;
+ if (parent->right)
+ parent->keybits |= (1 << parent->bitnum);
+ }
+ node = parent;
+ }
+
+ return 0;
+}
+
+/*
+ * Prune internal nodes.
+ *
+ * Fully populated subtrees that end at the same leaf have already
+ * been collapsed. There are still internal nodes that have for both
+ * their left and right branches a sequence of singletons that make
+ * identical choices and end in identical leaves. The keymask and
+ * keybits collected in the nodes describe the choices made in these
+ * singleton chains. When they are identical for the left and right
+ * branch of a node, and the two leaves comare identical, the node in
+ * question can be removed.
+ *
+ * Note that nodes with the nextbyte tag set will not be removed by
+ * this to ensure tree integrity. Note as well that the structure of
+ * utf8 ensures that these nodes would not have been candidates for
+ * removal in any case.
+ */
+static void prune(struct tree *tree)
+{
+ struct node *node;
+ struct node *left;
+ struct node *right;
+ struct node *parent;
+ void *leftleaf;
+ void *rightleaf;
+ unsigned int leftmask;
+ unsigned int rightmask;
+ unsigned int bitmask;
+ int count;
+
+ if (verbose > 0)
+ printf("Pruning %s_%x\n", tree->type, tree->maxage);
+
+ count = 0;
+ if (tree->childnode == LEAF)
+ return;
+ if (!tree->root)
+ return;
+
+ leftmask = rightmask = 0;
+ node = tree->root;
+ while (node) {
+ if (node->nextbyte)
+ goto advance;
+ if (node->leftnode == LEAF)
+ goto advance;
+ if (node->rightnode == LEAF)
+ goto advance;
+ if (!node->left)
+ goto advance;
+ if (!node->right)
+ goto advance;
+ left = node->left;
+ right = node->right;
+ if (left->keymask == 0)
+ goto advance;
+ if (right->keymask == 0)
+ goto advance;
+ if (left->keymask != right->keymask)
+ goto advance;
+ if (left->keybits != right->keybits)
+ goto advance;
+ leftleaf = NULL;
+ while (!leftleaf) {
+ assert(left->left || left->right);
+ if (left->leftnode == LEAF)
+ leftleaf = left->left;
+ else if (left->rightnode == LEAF)
+ leftleaf = left->right;
+ else if (left->left)
+ left = left->left;
+ else if (left->right)
+ left = left->right;
+ else
+ assert(0);
+ }
+ rightleaf = NULL;
+ while (!rightleaf) {
+ assert(right->left || right->right);
+ if (right->leftnode == LEAF)
+ rightleaf = right->left;
+ else if (right->rightnode == LEAF)
+ rightleaf = right->right;
+ else if (right->left)
+ right = right->left;
+ else if (right->right)
+ right = right->right;
+ else
+ assert(0);
+ }
+ if (! tree->leaf_equal(leftleaf, rightleaf))
+ goto advance;
+ /*
+ * This node has identical singleton-only subtrees.
+ * Remove it.
+ */
+ parent = node->parent;
+ left = node->left;
+ right = node->right;
+ if (parent->left == node)
+ parent->left = left;
+ else if (parent->right == node)
+ parent->right = left;
+ else
+ assert(0);
+ left->parent = parent;
+ left->keymask |= (1 << node->bitnum);
+ node->left = NULL;
+ while (node) {
+ bitmask = 1 << node->bitnum;
+ leftmask &= ~bitmask;
+ rightmask &= ~bitmask;
+ if (node->leftnode == NODE && node->left) {
+ left = node->left;
+ free(node);
+ count++;
+ node = left;
+ } else if (node->rightnode == NODE && node->right) {
+ right = node->right;
+ free(node);
+ count++;
+ node = right;
+ } else {
+ node = NULL;
+ }
+ }
+ /* Propagate keymasks up along singleton chains. */
+ node = parent;
+ /* Force re-check */
+ bitmask = 1 << node->bitnum;
+ leftmask &= ~bitmask;
+ rightmask &= ~bitmask;
+ for (;;) {
+ if (node->left && node->right)
+ break;
+ if (node->left) {
+ left = node->left;
+ node->keymask |= left->keymask;
+ node->keybits |= left->keybits;
+ }
+ if (node->right) {
+ right = node->right;
+ node->keymask |= right->keymask;
+ node->keybits |= right->keybits;
+ }
+ node->keymask |= (1 << node->bitnum);
+ node = node->parent;
+ /* Force re-check */
+ bitmask = 1 << node->bitnum;
+ leftmask &= ~bitmask;
+ rightmask &= ~bitmask;
+ }
+ advance:
+ bitmask = 1 << node->bitnum;
+ if ((leftmask & bitmask) == 0 &&
+ node->leftnode == NODE &&
+ node->left) {
+ leftmask |= bitmask;
+ node = node->left;
+ } else if ((rightmask & bitmask) == 0 &&
+ node->rightnode == NODE &&
+ node->right) {
+ rightmask |= bitmask;
+ node = node->right;
+ } else {
+ leftmask &= ~bitmask;
+ rightmask &= ~bitmask;
+ node = node->parent;
+ }
+ }
+ if (verbose > 0)
+ printf("Pruned %d nodes\n", count);
+}
+
+/*
+ * Mark the nodes in the tree that lead to leaves that must be
+ * emitted.
+ */
+static void mark_nodes(struct tree *tree)
+{
+ struct node *node;
+ struct node *n;
+ unsigned int leftmask;
+ unsigned int rightmask;
+ unsigned int bitmask;
+ int marked;
+
+ marked = 0;
+ if (verbose > 0)
+ printf("Marking %s_%x\n", tree->type, tree->maxage);
+ if (tree->childnode == LEAF)
+ goto done;
+
+ assert(tree->childnode == NODE);
+ node = tree->root;
+ leftmask = rightmask = 0;
+ while (node) {
+ bitmask = 1 << node->bitnum;
+ if ((leftmask & bitmask) == 0) {
+ leftmask |= bitmask;
+ if (node->leftnode == LEAF) {
+ assert(node->left);
+ if (tree->leaf_mark(node->left)) {
+ n = node;
+ while (n && !n->mark) {
+ marked++;
+ n->mark = 1;
+ n = n->parent;
+ }
+ }
+ } else if (node->left) {
+ assert(node->leftnode == NODE);
+ node = node->left;
+ continue;
+ }
+ }
+ if ((rightmask & bitmask) == 0) {
+ rightmask |= bitmask;
+ if (node->rightnode == LEAF) {
+ assert(node->right);
+ if (tree->leaf_mark(node->right)) {
+ n = node;
+ while (n && !n->mark) {
+ marked++;
+ n->mark = 1;
+ n = n->parent;
+ }
+ }
+ } else if (node->right) {
+ assert(node->rightnode == NODE);
+ node = node->right;
+ continue;
+ }
+ }
+ leftmask &= ~bitmask;
+ rightmask &= ~bitmask;
+ node = node->parent;
+ }
+
+ /* second pass: left siblings and singletons */
+
+ assert(tree->childnode == NODE);
+ node = tree->root;
+ leftmask = rightmask = 0;
+ while (node) {
+ bitmask = 1 << node->bitnum;
+ if ((leftmask & bitmask) == 0) {
+ leftmask |= bitmask;
+ if (node->leftnode == LEAF) {
+ assert(node->left);
+ if (tree->leaf_mark(node->left)) {
+ n = node;
+ while (n && !n->mark) {
+ marked++;
+ n->mark = 1;
+ n = n->parent;
+ }
+ }
+ } else if (node->left) {
+ assert(node->leftnode == NODE);
+ node = node->left;
+ if (!node->mark && node->parent->mark) {
+ marked++;
+ node->mark = 1;
+ }
+ continue;
+ }
+ }
+ if ((rightmask & bitmask) == 0) {
+ rightmask |= bitmask;
+ if (node->rightnode == LEAF) {
+ assert(node->right);
+ if (tree->leaf_mark(node->right)) {
+ n = node;
+ while (n && !n->mark) {
+ marked++;
+ n->mark = 1;
+ n = n->parent;
+ }
+ }
+ } else if (node->right) {
+ assert(node->rightnode == NODE);
+ node = node->right;
+ if (!node->mark && node->parent->mark &&
+ !node->parent->left) {
+ marked++;
+ node->mark = 1;
+ }
+ continue;
+ }
+ }
+ leftmask &= ~bitmask;
+ rightmask &= ~bitmask;
+ node = node->parent;
+ }
+done:
+ if (verbose > 0)
+ printf("Marked %d nodes\n", marked);
+}
+
+/*
+ * Compute the index of each node and leaf, which is the offset in the
+ * emitted trie. These values must be pre-computed because relative
+ * offsets between nodes are used to navigate the tree.
+ */
+static int index_nodes(struct tree *tree, int index)
+{
+ struct node *node;
+ unsigned int leftmask;
+ unsigned int rightmask;
+ unsigned int bitmask;
+ int count;
+ int indent;
+
+ /* Align to a cache line (or half a cache line?). */
+ while (index % 64)
+ index++;
+ tree->index = index;
+ indent = 1;
+ count = 0;
+
+ if (verbose > 0)
+ printf("Indexing %s_%x: %d\n", tree->type, tree->maxage, index);