diff options
| author | Masahiro Yamada <yamada.masahiro@socionext.com> | 2019-04-28 13:45:36 -0400 |
|---|---|---|
| committer | Theodore Ts'o <tytso@mit.edu> | 2019-04-28 13:45:36 -0400 |
| commit | 28ba53c07638f31b153e3a32672a6124d0ff2a97 (patch) | |
| tree | d89dfc7e3e9661b15d98e5192433121f5ac081c2 /fs/unicode | |
| parent | 0a790fe4389d88253563c5e22bea47e6d357b525 (diff) | |
| download | linux-28ba53c07638f31b153e3a32672a6124d0ff2a97.tar.gz linux-28ba53c07638f31b153e3a32672a6124d0ff2a97.tar.bz2 linux-28ba53c07638f31b153e3a32672a6124d0ff2a97.zip | |
unicode: refactor the rule for regenerating utf8data.h
scripts/mkutf8data is used only when regenerating utf8data.h,
which never happens in the normal kernel build. However, it is
built anyway whenever CONFIG_UNICODE is enabled.
Moreover, there is no good reason for it to reside in the scripts/
directory since it is only used in fs/unicode/.
Hence, move it from scripts/ to fs/unicode/.
In some cases, we skip regenerating build artifacts in the normal build. The
conventional way to do so is to surround the code with ifdef REGENERATE_*.
For example,
- 7373f4f83c71 ("kbuild: add implicit rules for parser generation")
- 6aaf49b495b4 ("crypto: arm,arm64 - Fix random regeneration of S_shipped")
I rewrote the rule in a more kbuild'ish style.
In the normal build, utf8data.h is just shipped from the checked-in file.
$ make
[ snip ]
SHIPPED fs/unicode/utf8data.h
CC fs/unicode/utf8-norm.o
CC fs/unicode/utf8-core.o
CC fs/unicode/utf8-selftest.o
AR fs/unicode/built-in.a
If you want to generate utf8data.h based on UCD, put *.txt files into
fs/unicode/, then pass REGENERATE_UTF8DATA=1 from the command line.
The mkutf8data tool will be automatically compiled to generate the
utf8data.h from the *.txt files.
$ make REGENERATE_UTF8DATA=1
[ snip ]
HOSTCC fs/unicode/mkutf8data
GEN fs/unicode/utf8data.h
CC fs/unicode/utf8-norm.o
CC fs/unicode/utf8-core.o
CC fs/unicode/utf8-selftest.o
AR fs/unicode/built-in.a
I renamed the checked-in utf8data.h to utf8data.h_shipped so that this
will work for the out-of-tree build.
You can update it based on the latest UCD like this:
$ make REGENERATE_UTF8DATA=1 fs/unicode/
$ cp fs/unicode/utf8data.h fs/unicode/utf8data.h_shipped
Also, I added entries to .gitignore and dontdiff.
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Diffstat (limited to 'fs/unicode')
| -rw-r--r-- | fs/unicode/.gitignore | 2 | ||||
| -rw-r--r-- | fs/unicode/Makefile | 41 | ||||
| -rw-r--r-- | fs/unicode/README.utf8data | 9 | ||||
| -rw-r--r-- | fs/unicode/mkutf8data.c | 3419 | ||||
| -rw-r--r-- | fs/unicode/utf8data.h_shipped (renamed from fs/unicode/utf8data.h) | 0 |
5 files changed, 3455 insertions, 16 deletions
diff --git a/fs/unicode/.gitignore b/fs/unicode/.gitignore new file mode 100644 index 000000000000..0381e2221480 --- /dev/null +++ b/fs/unicode/.gitignore @@ -0,0 +1,2 @@ +mkutf8data +utf8data.h diff --git a/fs/unicode/Makefile b/fs/unicode/Makefile index 671d31f83006..d46e9baee285 100644 --- a/fs/unicode/Makefile +++ b/fs/unicode/Makefile @@ -5,15 +5,34 @@ obj-$(CONFIG_UNICODE_NORMALIZATION_SELFTEST) += utf8-selftest.o unicode-y := utf8-norm.o utf8-core.o -# This rule is not invoked during the kernel compilation. It is used to -# regenerate the utf8data.h header file. -utf8data.h.new: *.txt $(objdir)/scripts/mkutf8data - $(objdir)/scripts/mkutf8data \ - -a DerivedAge.txt \ - -c DerivedCombiningClass.txt \ - -p DerivedCoreProperties.txt \ - -d UnicodeData.txt \ - -f CaseFolding.txt \ - -n NormalizationCorrections.txt \ - -t NormalizationTest.txt \ +$(obj)/utf8-norm.o: $(obj)/utf8data.h + +# In the normal build, the checked-in utf8data.h is just shipped. +# +# To generate utf8data.h from UCD, put *.txt files in this directory +# and pass REGENERATE_UTF8DATA=1 from the command line. 
+ifdef REGENERATE_UTF8DATA + +quiet_cmd_utf8data = GEN $@ + cmd_utf8data = $< \ + -a $(srctree)/$(src)/DerivedAge.txt \ + -c $(srctree)/$(src)/DerivedCombiningClass.txt \ + -p $(srctree)/$(src)/DerivedCoreProperties.txt \ + -d $(srctree)/$(src)/UnicodeData.txt \ + -f $(srctree)/$(src)/CaseFolding.txt \ + -n $(srctree)/$(src)/NormalizationCorrections.txt \ + -t $(srctree)/$(src)/NormalizationTest.txt \ -o $@ + +$(obj)/utf8data.h: $(obj)/mkutf8data $(filter %.txt, $(cmd_utf8data)) FORCE + $(call if_changed,utf8data) + +else + +$(obj)/utf8data.h: $(src)/utf8data.h_shipped FORCE + $(call if_changed,shipped) + +endif + +targets += utf8data.h +hostprogs-y += mkutf8data diff --git a/fs/unicode/README.utf8data b/fs/unicode/README.utf8data index dd56ef50c5d5..9307cf0727de 100644 --- a/fs/unicode/README.utf8data +++ b/fs/unicode/README.utf8data @@ -55,15 +55,14 @@ released version of the UCD can be found here: http://www.unicode.org/Public/UCD/latest/ -To build the utf8data.h file, from a kernel tree that has been built, -cd to this directory (fs/unicode) and run this command: +Then, build under fs/unicode/ with REGENERATE_UTF8DATA=1: - make C=../.. objdir=../.. utf8data.h.new + make REGENERATE_UTF8DATA=1 fs/unicode/ -After sanity checking the newly generated utf8data.h.new file (the +After sanity checking the newly generated utf8data.h file (the version generated from the 12.1.0 UCD should be 4,109 lines long, and have a total size of 324k) and/or comparing it with the older version -of utf8data.h, rename it to utf8data.h. +of utf8data.h_shipped, rename it to utf8data.h_shipped. If you are a kernel developer updating to a newer version of the Unicode Character Database, please update this README.utf8data file diff --git a/fs/unicode/mkutf8data.c b/fs/unicode/mkutf8data.c new file mode 100644 index 000000000000..ff2025ac5a32 --- /dev/null +++ b/fs/unicode/mkutf8data.c @@ -0,0 +1,3419 @@ +/* + * Copyright (c) 2014 SGI. + * All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* Generator for a compact trie for unicode normalization */ + +#include <sys/types.h> +#include <stddef.h> +#include <stdlib.h> +#include <stdio.h> +#include <assert.h> +#include <string.h> +#include <unistd.h> +#include <errno.h> + +/* Default names of the in- and output files. */ + +#define AGE_NAME "DerivedAge.txt" +#define CCC_NAME "DerivedCombiningClass.txt" +#define PROP_NAME "DerivedCoreProperties.txt" +#define DATA_NAME "UnicodeData.txt" +#define FOLD_NAME "CaseFolding.txt" +#define NORM_NAME "NormalizationCorrections.txt" +#define TEST_NAME "NormalizationTest.txt" +#define UTF8_NAME "utf8data.h" + +const char *age_name = AGE_NAME; +const char *ccc_name = CCC_NAME; +const char *prop_name = PROP_NAME; +const char *data_name = DATA_NAME; +const char *fold_name = FOLD_NAME; +const char *norm_name = NORM_NAME; +const char *test_name = TEST_NAME; +const char *utf8_name = UTF8_NAME; + +int verbose = 0; + +/* An arbitrary line size limit on input lines. 
*/ + +#define LINESIZE 1024 +char line[LINESIZE]; +char buf0[LINESIZE]; +char buf1[LINESIZE]; +char buf2[LINESIZE]; +char buf3[LINESIZE]; + +const char *argv0; + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) + +/* ------------------------------------------------------------------ */ + +/* + * Unicode version numbers consist of three parts: major, minor, and a + * revision. These numbers are packed into an unsigned int to obtain + * a single version number. + * + * To save space in the generated trie, the unicode version is not + * stored directly, instead we calculate a generation number from the + * unicode versions seen in the DerivedAge file, and use that as an + * index into a table of unicode versions. + */ +#define UNICODE_MAJ_SHIFT (16) +#define UNICODE_MIN_SHIFT (8) + +#define UNICODE_MAJ_MAX ((unsigned short)-1) +#define UNICODE_MIN_MAX ((unsigned char)-1) +#define UNICODE_REV_MAX ((unsigned char)-1) + +#define UNICODE_AGE(MAJ,MIN,REV) \ + (((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) | \ + ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) | \ + ((unsigned int)(REV))) + +unsigned int *ages; +int ages_count; + +unsigned int unicode_maxage; + +static int age_valid(unsigned int major, unsigned int minor, + unsigned int revision) +{ + if (major > UNICODE_MAJ_MAX) + return 0; + if (minor > UNICODE_MIN_MAX) + return 0; + if (revision > UNICODE_REV_MAX) + return 0; + return 1; +} + +/* ------------------------------------------------------------------ */ + +/* + * utf8trie_t + * + * A compact binary tree, used to decode UTF-8 characters. + * + * Internal nodes are one byte for the node itself, and up to three + * bytes for an offset into the tree. 
The first byte contains the + * following information: + * NEXTBYTE - flag - advance to next byte if set + * BITNUM - 3 bit field - the bit number to tested + * OFFLEN - 2 bit field - number of bytes in the offset + * if offlen == 0 (non-branching node) + * RIGHTPATH - 1 bit field - set if the following node is for the + * right-hand path (tested bit is set) + * TRIENODE - 1 bit field - set if the following node is an internal + * node, otherwise it is a leaf node + * if offlen != 0 (branching node) + * LEFTNODE - 1 bit field - set if the left-hand node is internal + * RIGHTNODE - 1 bit field - set if the right-hand node is internal + * + * Due to the way utf8 works, there cannot be branching nodes with + * NEXTBYTE set, and moreover those nodes always have a righthand + * descendant. + */ +typedef unsigned char utf8trie_t; +#define BITNUM 0x07 +#define NEXTBYTE 0x08 +#define OFFLEN 0x30 +#define OFFLEN_SHIFT 4 +#define RIGHTPATH 0x40 +#define TRIENODE 0x80 +#define RIGHTNODE 0x40 +#define LEFTNODE 0x80 + +/* + * utf8leaf_t + * + * The leaves of the trie are embedded in the trie, and so the same + * underlying datatype, unsigned char. + * + * leaf[0]: The unicode version, stored as a generation number that is + * an index into utf8agetab[]. With this we can filter code + * points based on the unicode version in which they were + * defined. The CCC of a non-defined code point is 0. + * leaf[1]: Canonical Combining Class. During normalization, we need + * to do a stable sort into ascending order of all characters + * with a non-zero CCC that occur between two characters with + * a CCC of 0, or at the begin or end of a string. + * The unicode standard guarantees that all CCC values are + * between 0 and 254 inclusive, which leaves 255 available as + * a special value. + * Code points with CCC 0 are known as stoppers. + * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the + * start of a NUL-terminated string that is the decomposition + * of the character. 
+ * The CCC of a decomposable character is the same as the CCC + * of the first character of its decomposition. + * Some characters decompose as the empty string: these are + * characters with the Default_Ignorable_Code_Point property. + * These do affect normalization, as they all have CCC 0. + * + * The decompositions in the trie have been fully expanded. + * + * Casefolding, if applicable, is also done using decompositions. + */ +typedef unsigned char utf8leaf_t; + +#define LEAF_GEN(LEAF) ((LEAF)[0]) +#define LEAF_CCC(LEAF) ((LEAF)[1]) +#define LEAF_STR(LEAF) ((const char*)((LEAF) + 2)) + +#define MAXGEN (255) + +#define MINCCC (0) +#define MAXCCC (254) +#define STOPPER (0) +#define DECOMPOSE (255) +#define HANGUL ((char)(255)) + +#define UTF8HANGULLEAF (12) + +struct tree; +static utf8leaf_t *utf8nlookup(struct tree *, unsigned char *, + const char *, size_t); +static utf8leaf_t *utf8lookup(struct tree *, unsigned char *, const char *); + +unsigned char *utf8data; +size_t utf8data_size; + +utf8trie_t *nfdi; +utf8trie_t *nfdicf; + +/* ------------------------------------------------------------------ */ + +/* + * UTF8 valid ranges. + * + * The UTF-8 encoding spreads the bits of a 32bit word over several + * bytes. This table gives the ranges that can be held and how they'd + * be represented. + * + * 0x00000000 0x0000007F: 0xxxxxxx + * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx + * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx + * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + * + * There is an additional requirement on UTF-8, in that only the + * shortest representation of a 32bit value is to be used. A decoder + * must not decode sequences that do not satisfy this requirement. + * Thus the allowed ranges have a lower bound. 
+ * + * 0x00000000 0x0000007F: 0xxxxxxx + * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx + * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx + * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + * + * Actual unicode characters are limited to the range 0x0 - 0x10FFFF, + * 17 planes of 65536 values. This limits the sequences actually seen + * even more, to just the following. + * + * 0 - 0x7f: 0 0x7f + * 0x80 - 0x7ff: 0xc2 0x80 0xdf 0xbf + * 0x800 - 0xffff: 0xe0 0xa0 0x80 0xef 0xbf 0xbf + * 0x10000 - 0x10ffff: 0xf0 0x90 0x80 0x80 0xf4 0x8f 0xbf 0xbf + * + * Even within those ranges not all values are allowed: the surrogates + * 0xd800 - 0xdfff should never be seen. + * + * Note that the longest sequence seen with valid usage is 4 bytes, + * the same as a single UTF-32 character. This makes the UTF-8 + * representation of Unicode strictly smaller than UTF-32. 
+ * + * The shortest sequence requirement was introduced by: + * Corrigendum #1: UTF-8 Shortest Form + * It can be found here: + * http://www.unicode.org/versions/corrigendum1.html + * + */ + +#define UTF8_2_BITS 0xC0 +#define UTF8_3_BITS 0xE0 +#define UTF8_4_BITS 0xF0 +#define UTF8_N_BITS 0x80 +#define UTF8_2_MASK 0xE0 +#define UTF8_3_MASK 0xF0 +#define UTF8_4_MASK 0xF8 +#define UTF8_N_MASK 0xC0 +#define UTF8_V_MASK 0x3F +#define UTF8_V_SHIFT 6 + +static int utf8encode(char *str, unsigned int val) +{ + int len; + + if (val < 0x80) { + str[0] = val; + len = 1; + } else if (val < 0x800) { + str[1] = val & UTF8_V_MASK; + str[1] |= UTF8_N_BITS; + val >>= UTF8_V_SHIFT; + str[0] = val; + str[0] |= UTF8_2_BITS; + len = 2; + } else if (val < 0x10000) { + str[2] = val & UTF8_V_MASK; + str[2] |= UTF8_N_BITS; + val >>= UTF8_V_SHIFT; + str[1] = val & UTF8_V_MASK; + str[1] |= UTF8_N_BITS; + val >>= UTF8_V_SHIFT; + str[0] = val; + str[0] |= UTF8_3_BITS; + len = 3; + } else if (val < 0x110000) { + str[3] = val & UTF8_V_MASK; + str[3] |= UTF8_N_BITS; + val >>= UTF8_V_SHIFT; + str[2] = val & UTF8_V_MASK; + str[2] |= UTF8_N_BITS; + val >>= UTF8_V_SHIFT; + str[1] = val & UTF8_V_MASK; + str[1] |= UTF8_N_BITS; + val >>= UTF8_V_SHIFT; + str[0] = val; + str[0] |= UTF8_4_BITS; + len = 4; + } else { + printf("%#x: illegal val\n", val); + len = 0; + } + return len; +} + +static unsigned int utf8decode(const char *str) +{ + const unsigned char *s = (const unsigned char*)str; + unsigned int unichar = 0; + + if (*s < 0x80) { + unichar = *s; + } else if (*s < UTF8_3_BITS) { + unichar = *s++ & 0x1F; + unichar <<= UTF8_V_SHIFT; + unichar |= *s & 0x3F; + } else if (*s < UTF8_4_BITS) { + unichar = *s++ & 0x0F; + unichar <<= UTF8_V_SHIFT; + unichar |= *s++ & 0x3F; + unichar <<= UTF8_V_SHIFT; + unichar |= *s & 0x3F; + } else { + unichar = *s++ & 0x0F; + unichar <<= UTF8_V_SHIFT; + unichar |= *s++ & 0x3F; + unichar <<= UTF8_V_SHIFT; + unichar |= *s++ & 0x3F; + unichar <<= UTF8_V_SHIFT; + unichar |= 
*s & 0x3F; + } + return unichar; +} + +static int utf32valid(unsigned int unichar) +{ + return unichar < 0x110000; +} + +#define HANGUL_SYLLABLE(U) ((U) >= 0xAC00 && (U) <= 0xD7A3) + +#define NODE 1 +#define LEAF 0 + +struct tree { + void *root; + int childnode; + const char *type; + unsigned int maxage; + struct tree *next; + int (*leaf_equal)(void *, void *); + void (*leaf_print)(void *, int); + int (*leaf_mark)(void *); + int (*leaf_size)(void *); + int *(*leaf_index)(struct tree *, void *); + unsigned char *(*leaf_emit)(void *, unsigned char *); + int leafindex[0x110000]; + int index; +}; + +struct node { + int index; + int offset; + int mark; + int size; + struct node *parent; + void *left; + void *right; + unsigned char bitnum; + unsigned char nextbyte; + unsigned char leftnode; + unsigned char rightnode; + unsigned int keybits; + unsigned int keymask; +}; + +/* + * Example lookup function for a tree. + */ +static void *lookup(struct tree *tree, const char *key) +{ + struct node *node; + void *leaf = NULL; + + node = tree->root; + while (!leaf && node) { + if (node->nextbyte) + key++; + if (*key & (1 << (node->bitnum & 7))) { + /* Right leg */ + if (node->rightnode == NODE) { + node = node->right; + } else if (node->rightnode == LEAF) { + leaf = node->right; + } else { + node = NULL; + } + } else { + /* Left leg */ + if (node->leftnode == NODE) { + node = node->left; + } else if (node->leftnode == LEAF) { + leaf = node->left; + } else { + node = NULL; + } + } + } + + return leaf; +} + +/* + * A simple non-recursive tree walker: keep track of visits to the + * left and right branches in the leftmask and rightmask. 
+ */ +static void tree_walk(struct tree *tree) +{ + struct node *node; + unsigned int leftmask; + unsigned int rightmask; + unsigned int bitmask; + int indent = 1; + int nodes, singletons, leaves; + + nodes = singletons = leaves = 0; + + printf("%s_%x root %p\n", tree->type, tree->maxage, tree->root); + if (tree->childnode == LEAF) { + assert(tree->root); + tree->leaf_print(tree->root, indent); + leaves = 1; + } else { + assert(tree->childnode == NODE); + node = tree->root; + leftmask = rightmask = 0; + while (node) { + printf("%*snode @ %p bitnum %d nextbyte %d" + " left %p right %p mask %x bits %x\n", + indent, "", node, + node->bitnum, node->nextbyte, + node->left, node->right, + node->keymask, node->keybits); + nodes += 1; + if (!(node->left && node->right)) + singletons += 1; + + while (node) { + bitmask = 1 << node->bitnum; + if ((leftmask & bitmask) == 0) { + leftmask |= bitmask; + if (node->leftnode == LEAF) { + assert(node->left); + tree->leaf_print(node->left, + indent+1); + leaves += 1; + } else if (node->left) { + assert(node->leftnode == NODE); + indent += 1; + node = node->left; + break; + } + } + if ((rightmask & bitmask) == 0) { + rightmask |= bitmask; + if (node->rightnode == LEAF) { + assert(node->right); + tree->leaf_print(node->right, + indent+1); + leaves += 1; + } else if (node->right) { + assert(node->rightnode == NODE); + indent += 1; + node = node->right; + break; + } + } + leftmask &= ~bitmask; + rightmask &= ~bitmask; + node = node->parent; + indent -= 1; + } + } + } + printf("nodes %d leaves %d singletons %d\n", + nodes, leaves, singletons); +} + +/* + * Allocate an initialize a new internal node. 
+ */ +static struct node *alloc_node(struct node *parent) +{ + struct node *node; + int bitnum; + + node = malloc(sizeof(*node)); + node->left = node->right = NULL; + node->parent = parent; + node->leftnode = NODE; + node->rightnode = NODE; + node->keybits = 0; + node->keymask = 0; + node->mark = 0; + node->index = 0; + node->offset = -1; + node->size = 4; + + if (node->parent) { + bitnum = parent->bitnum; + if ((bitnum & 7) == 0) { + node->bitnum = bitnum + 7 + 8; + node->nextbyte = 1; + } else { + node->bitnum = bitnum - 1; + node->nextbyte = 0; + } + } else { + node->bitnum = 7; + node->nextbyte = 0; + } + + return node; +} + +/* + * Insert a new leaf into the tree, and collapse any subtrees that are + * fully populated and end in identical leaves. A nextbyte tagged + * internal node will not be removed to preserve the tree's integrity. + * Note that due to the structure of utf8, no nextbyte tagged node + * will be a candidate for removal. + */ +static int insert(struct tree *tree, char *key, int keylen, void *leaf) +{ + struct node *node; + struct node *parent; + void **cursor; + int keybits; + + assert(keylen >= 1 && keylen <= 4); + + node = NULL; + cursor = &tree->root; + keybits = 8 * keylen; + + /* Insert, creating path along the way. */ + while (keybits) { + if (!*cursor) + *cursor = alloc_node(node); + node = *cursor; + if (node->nextbyte) + key++; + if (*key & (1 << (node->bitnum & 7))) + cursor = &node->right; + else + cursor = &node->left; + keybits--; + } + *cursor = leaf; + + /* Merge subtrees if possible. */ + while (node) { + if (*key & (1 << (node->bitnum & 7))) + node->rightnode = LEAF; + else + node->leftnode = LEAF; + if (node->nextbyte) + break; + if (node->leftnode == NODE || node->rightnode == NODE) + break; + assert(node->left); + assert(node->right); + /* Compare */ + if (! tree->leaf_equal(node->left, node->right)) + break; + /* Keep left, drop right leaf. 
*/ + leaf = node->left; + /* Check in parent */ + parent = node->parent; + if (!parent) { + /* root of tree! */ + tree->root = leaf; + tree->childnode = LEAF; + } else if (parent->left == node) { + parent->left = leaf; + parent->leftnode = LEAF; + if (parent->right) { + parent->keymask = 0; + parent->keybits = 0; + } else { + parent->keymask |= (1 << node->bitnum); + } + } else if (parent->right == node) { + parent->right = leaf; + parent->rightnode = LEAF; + if (parent->left) { + parent->keymask = 0; + parent->keybits = 0; + } else { + parent->keymask |= (1 << node->bitnum); + parent->keybits |= (1 << node->bitnum); + } + } else { + /* internal tree error */ + assert(0); + } + free(node); + node = parent; + } + + /* Propagate keymasks up along singleton chains. */ + while (node) { + parent = node->parent; + if (!parent) + break; + /* Nix the mask for parents with two children. */ + if (node->keymask == 0) { + parent->keymask = 0; + parent->keybits = 0; + } else if (parent->left && parent->right) { + parent->keymask = 0; + parent->keybits = 0; + } else { + assert((parent->keymask & node->keymask) == 0); + parent->keymask |= node->keymask; + parent->keymask |= (1 << parent->bitnum); + parent->keybits |= node->keybits; + if (parent->right) + parent->keybits |= (1 << parent->bitnum); + } + node = parent; + } + + return 0; +} + +/* + * Prune internal nodes. + * + * Fully populated subtrees that end at the same leaf have already + * been collapsed. There are still internal nodes that have for both + * their left and right branches a sequence of singletons that make + * identical choices and end in identical leaves. The keymask and + * keybits collected in the nodes describe the choices made in these + * singleton chains. When they are identical for the left and right + * branch of a node, and the two leaves comare identical, the node in + * question can be removed. 
+ * + * Note that nodes with the nextbyte tag set will not be removed by + * this to ensure tree integrity. Note as well that the structure of + * utf8 ensures that these nodes would not have been candidates for + * removal in any case. + */ +static void prune(struct tree *tree) +{ + struct node *node; + struct node *left; + struct node *right; + struct node *parent; + void *leftleaf; + void *rightleaf; + unsigned int leftmask; + unsigned int rightmask; + unsigned int bitmask; + int count; + + if (verbose > 0) + printf("Pruning %s_%x\n", tree->type, tree->maxage); + + count = 0; + if (tree->childnode == LEAF) + return; + if (!tree->root) + return; + + leftmask = rightmask = 0; + node = tree->root; + while (node) { + if (node->nextbyte) + goto advance; + if (node->leftnode == LEAF) + goto advance; + if (node->rightnode == LEAF) + goto advance; + if (!node->left) + goto advance; + if (!node->right) + goto advance; + left = node->left; + right = node->right; + if (left->keymask == 0) + goto advance; + if (right->keymask == 0) + goto advance; + if (left->keymask != right->keymask) + goto advance; + if (left->keybits != right->keybits) + goto advance; + leftleaf = NULL; + while (!leftleaf) { + assert(left->left || left->right); + if (left->leftnode == LEAF) + leftleaf = left->left; + else if (left->rightnode == LEAF) + leftleaf = left->right; + else if (left->left) + left = left->left; + else if (left->right) + left = left->right; + else + assert(0); + } + rightleaf = NULL; + while (!rightleaf) { + assert(right->left || right->right); + if (right->leftnode == LEAF) + rightleaf = right->left; + else if (right->rightnode == LEAF) + rightleaf = right->right; + else if (right->left) + right = right->left; + else if (right->right) + right = right->right; + else + assert(0); + } + if (! tree->leaf_equal(leftleaf, rightleaf)) + goto advance; + /* + * This node has identical singleton-only subtrees. + * Remove it. 
+ */ + parent = node->parent; + left = node->left; + right = node->right; + if (parent->left == node) + parent->left = left; + else if (parent->right == node) + parent->right = left; + else + assert(0); + left->parent = parent; + left->keymask |= (1 << node->bitnum); + node->left = NULL; + while (node) { + bitmask = 1 << node->bitnum; + leftmask &= ~bitmask; + rightmask &= ~bitmask; + if (node->leftnode == NODE && node->left) { + left = node->left; + free(node); + count++; + node = left; + } else if (node->rightnode == NODE && node->right) { + right = node->right; + free(node); + count++; + node = right; + } else { + node = NULL; + } + } + /* Propagate keymasks up along singleton chains. */ + node = parent; + /* Force re-check */ + bitmask = 1 << node->bitnum; + leftmask &= ~bitmask; + rightmask &= ~bitmask; + for (;;) { + if (node->left && node->right) + break; + if (node->left) { + left = node->left; + node->keymask |= left->keymask; + node->keybits |= left->keybits; + } + if (node->right) { + right = node->right; + node->keymask |= right->keymask; + node->keybits |= right->keybits; + } + node->keymask |= (1 << node->bitnum); + node = node->parent; + /* Force re-check */ + bitmask = 1 << node->bitnum; + leftmask &= ~bitmask; + rightmask &= ~bitmask; + } + advance: + bitmask = 1 << node->bitnum; + if ((leftmask & bitmask) == 0 && + node->leftnode == NODE && + node->left) { + leftmask |= bitmask; + node = node->left; + } else if ((rightmask & bitmask) == 0 && + node->rightnode == NODE && + node->right) { + rightmask |= bitmask; + node = node->right; + } else { + leftmask &= ~bitmask; + rightmask &= ~bitmask; + node = node->parent; + } + } + if (verbose > 0) + printf("Pruned %d nodes\n", count); +} + +/* + * Mark the nodes in the tree that lead to leaves that must be + * emitted. 
+ */ +static void mark_nodes(struct tree *tree) +{ + struct node *node; + struct node *n; + unsigned int leftmask; + unsigned int rightmask; + unsigned int bitmask; + int marked; + + marked = 0; + if (verbose > 0) + printf("Marking %s_%x\n", tree->type, tree->maxage); + if (tree->childnode == LEAF) + goto done; + + assert(tree->childnode == NODE); + node = tree->root; + leftmask = rightmask = 0; + while (node) { + bitmask = 1 << node->bitnum; + if ((leftmask & bitmask) == 0) { + leftmask |= bitmask; + if (node->leftnode == LEAF) { + assert(node->left); + if (tree->leaf_mark(node->left)) { + n = node; + while (n && !n->mark) { + marked++; + n->mark = 1; + n = n->parent; + } + } + } else if (node->left) { + assert(node->leftnode == NODE); + node = node->left; + continue; + } + } + if ((rightmask & bitmask) == 0) { + rightmask |= bitmask; + if (node->rightnode == LEAF) { + assert(node->right); + if (tree->leaf_mark(node->right)) { + n = node; + while (n && !n->mark) { + marked++; + n->mark = 1; + n = n->parent; + } + } + } else if (node->right) { + assert(node->rightnode == NODE); + node = node->right; + continue; + } + } + leftmask &= ~bitmask; + rightmask &= ~bitmask; + node = node->parent; + } + + /* second pass: left siblings and singletons */ + + assert(tree->childnode == NODE); + node = tree->root; + leftmask = rightmask = 0; + while (node) { + bitmask = 1 << node->bitnum; + if ((leftmask & bitmask) == 0) { + leftmask |= bitmask; + if (node->leftnode == LEAF) { + assert(node->left); + if (tree->leaf_mark(node->left)) { + n = node; + while (n && !n->mark) { + marked++; + n->mark = 1; + n = n->parent; + } + } + } else if (node->left) { + assert(node->leftnode == NODE); + node = node->left; + if (!node->mark && node->parent->mark) { + marked++; + node->mark = 1; + } + continue; + } + } + if ((rightmask & bitmask) == 0) { + rightmask |= bitmask; + if (node->rightnode == LEAF) { + assert(node->right); + if (tree->leaf_mark(node->right)) { + n = node; + while (n && 
!n->mark) { + marked++; + n->mark = 1; + n = n->parent; + } + } + } else if (node->right) { + assert(node->rightnode == NODE); + node = node->right; + if (!node->mark && node->parent->mark && + !node->parent->left) { + marked++; + node->mark = 1; + } + continue; + } + } + leftmask &= ~bitmask; + rightmask &= ~bitmask; + node = node->parent; + } +done: + if (verbose > 0) + printf("Marked %d nodes\n", marked); +} + +/* + * Compute the index of each node and leaf, which is the offset in the + * emitted trie. These values must be pre-computed because relative + * offsets between nodes are used to navigate the tree. + */ +static int index_nodes(struct tree *tree, int index) +{ + struct node *node; + unsigned int leftmask; + unsigned int rightmask; + unsigned int bitmask; + int count; + int indent; + + /* Align to a cache line (or half a cache line?). */ + while (index % 64) + index++; + tree->index = index; + indent = 1; + count = 0; + + if (verbose > 0) + printf("Indexing %s_%x: %d\n", tree->type, tree->maxage, index); |
