unicode: refactor the rule for regenerating utf8data.h

scripts/mkutf8data is used only when regenerating utf8data.h, which never happens in the normal kernel build. However, it is irrespectively built if CONFIG_UNICODE is enabled. Moreover, there is no good reason for it to reside in the scripts/ directory since it is only used in fs/unicode/. Hence, move it from scripts/ to fs/unicode/. In some cases, we bypass build artifacts in the normal build. The conventional way to do so is to surround the code with ifdef REGENERATE_*. For example, - 7373f4f83c71 ("kbuild: add implicit rules for parser generation") - 6aaf49b495b4 ("crypto: arm,arm64 - Fix random regeneration of S_shipped") I rewrote the rule in a more kbuild'ish style. In the normal build, utf8data.h is just shipped from the check-in file. $ make [ snip ] SHIPPED fs/unicode/utf8data.h CC fs/unicode/utf8-norm.o CC fs/unicode/utf8-core.o CC fs/unicode/utf8-selftest.o AR fs/unicode/built-in.a If you want to generate utf8data.h based on UCD, put *.txt files into fs/unicode/, then pass REGENERATE_UTF8DATA=1 from the command line. The mkutf8data tool will be automatically compiled to generate the utf8data.h from the *.txt files. $ make REGENERATE_UTF8DATA=1 [ snip ] HOSTCC fs/unicode/mkutf8data GEN fs/unicode/utf8data.h CC fs/unicode/utf8-norm.o CC fs/unicode/utf8-core.o CC fs/unicode/utf8-selftest.o AR fs/unicode/built-in.a I renamed the check-in utf8data.h to utf8data.h_shipped so that this will work for the out-of-tree build. You can update it based on the latest UCD like this: $ make REGENERATE_UTF8DATA=1 fs/unicode/ $ cp fs/unicode/utf8data.h fs/unicode/utf8data.h_shipped Also, I added entries to .gitignore and dontdiff. Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
author: Masahiro Yamada <yamada.masahiro@socionext.com> 2019-04-28 13:45:36 -0400
committer: Theodore Ts'o <tytso@mit.edu> 2019-04-28 13:45:36 -0400
commit: 28ba53c07638f31b153e3a32672a6124d0ff2a97 (patch)
tree: d89dfc7e3e9661b15d98e5192433121f5ac081c2 /fs/unicode
parent: 0a790fe4389d88253563c5e22bea47e6d357b525 (diff)
download: linux-28ba53c07638f31b153e3a32672a6124d0ff2a97.tar.gz
linux-28ba53c07638f31b153e3a32672a6124d0ff2a97.tar.bz2
linux-28ba53c07638f31b153e3a32672a6124d0ff2a97.zip
5 files changed, 3455 insertions, 16 deletions
diff --git a/fs/unicode/.gitignore b/fs/unicode/.gitignore
new file mode 100644
index 000000000000..0381e2221480
--- /dev/null
+++ b/fs/unicode/.gitignore
@@ -0,0 +1,2 @@
+mkutf8data
+utf8data.h
diff --git a/fs/unicode/Makefile b/fs/unicode/Makefile
index 671d31f83006..d46e9baee285 100644
--- a/fs/unicode/Makefile
+++ b/fs/unicode/Makefile
@@ -5,15 +5,34 @@ obj-$(CONFIG_UNICODE_NORMALIZATION_SELFTEST) += utf8-selftest.o
 
 unicode-y := utf8-norm.o utf8-core.o
 
-# This rule is not invoked during the kernel compilation.  It is used to
-# regenerate the utf8data.h header file.
-utf8data.h.new: *.txt $(objdir)/scripts/mkutf8data
-	$(objdir)/scripts/mkutf8data \
-		-a DerivedAge.txt \
-		-c DerivedCombiningClass.txt \
-		-p DerivedCoreProperties.txt \
-		-d UnicodeData.txt \
-		-f CaseFolding.txt \
-		-n NormalizationCorrections.txt \
-		-t NormalizationTest.txt \
+$(obj)/utf8-norm.o: $(obj)/utf8data.h
+
+# In the normal build, the checked-in utf8data.h is just shipped.
+#
+# To generate utf8data.h from UCD, put *.txt files in this directory
+# and pass REGENERATE_UTF8DATA=1 from the command line.
+ifdef REGENERATE_UTF8DATA
+
+quiet_cmd_utf8data = GEN     $@
+      cmd_utf8data = $< \
+		-a $(srctree)/$(src)/DerivedAge.txt \
+		-c $(srctree)/$(src)/DerivedCombiningClass.txt \
+		-p $(srctree)/$(src)/DerivedCoreProperties.txt \
+		-d $(srctree)/$(src)/UnicodeData.txt \
+		-f $(srctree)/$(src)/CaseFolding.txt \
+		-n $(srctree)/$(src)/NormalizationCorrections.txt \
+		-t $(srctree)/$(src)/NormalizationTest.txt \
 		-o $@
+
+$(obj)/utf8data.h: $(obj)/mkutf8data $(filter %.txt, $(cmd_utf8data)) FORCE
+	$(call if_changed,utf8data)
+
+else
+
+$(obj)/utf8data.h: $(src)/utf8data.h_shipped FORCE
+	$(call if_changed,shipped)
+
+endif
+
+targets += utf8data.h
+hostprogs-y += mkutf8data
diff --git a/fs/unicode/README.utf8data b/fs/unicode/README.utf8data
index dd56ef50c5d5..9307cf0727de 100644
--- a/fs/unicode/README.utf8data
+++ b/fs/unicode/README.utf8data
@@ -55,15 +55,14 @@ released version of the UCD can be found here:
 
   http://www.unicode.org/Public/UCD/latest/
 
-To build the utf8data.h file, from a kernel tree that has been built,
-cd to this directory (fs/unicode) and run this command:
+Then, build under fs/unicode/ with REGENERATE_UTF8DATA=1:
 
-	make C=../.. objdir=../.. utf8data.h.new
+	make REGENERATE_UTF8DATA=1 fs/unicode/
 
-After sanity checking the newly generated utf8data.h.new file (the
+After sanity checking the newly generated utf8data.h file (the
 version generated from the 12.1.0 UCD should be 4,109 lines long, and
 have a total size of 324k) and/or comparing it with the older version
-of utf8data.h, rename it to utf8data.h.
+of utf8data.h_shipped, rename it to utf8data.h_shipped.
 
 If you are a kernel developer updating to a newer version of the
 Unicode Character Database, please update this README.utf8data file
diff --git a/fs/unicode/mkutf8data.c b/fs/unicode/mkutf8data.c
new file mode 100644
index 000000000000..ff2025ac5a32
--- /dev/null
+++ b/fs/unicode/mkutf8data.c
@@ -0,0 +1,3419 @@
+/*
+ * Copyright (c) 2014 SGI.
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/* Generator for a compact trie for unicode normalization */
+
+#include <sys/types.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+
+/* Default names of the in- and output files. */
+
+#define AGE_NAME	"DerivedAge.txt"
+#define CCC_NAME	"DerivedCombiningClass.txt"
+#define PROP_NAME	"DerivedCoreProperties.txt"
+#define DATA_NAME	"UnicodeData.txt"
+#define FOLD_NAME	"CaseFolding.txt"
+#define NORM_NAME	"NormalizationCorrections.txt"
+#define TEST_NAME	"NormalizationTest.txt"
+#define UTF8_NAME	"utf8data.h"
+
+const char	*age_name  = AGE_NAME;
+const char	*ccc_name  = CCC_NAME;
+const char	*prop_name = PROP_NAME;
+const char	*data_name = DATA_NAME;
+const char	*fold_name = FOLD_NAME;
+const char	*norm_name = NORM_NAME;
+const char	*test_name = TEST_NAME;
+const char	*utf8_name = UTF8_NAME;
+
+int verbose = 0;
+
+/* An arbitrary line size limit on input lines. */
+
+#define LINESIZE	1024
+char line[LINESIZE];
+char buf0[LINESIZE];
+char buf1[LINESIZE];
+char buf2[LINESIZE];
+char buf3[LINESIZE];
+
+const char *argv0;
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+
+/* ------------------------------------------------------------------ */
+
+/*
+ * Unicode version numbers consist of three parts: major, minor, and a
+ * revision.  These numbers are packed into an unsigned int to obtain
+ * a single version number.
+ *
+ * To save space in the generated trie, the unicode version is not
+ * stored directly, instead we calculate a generation number from the
+ * unicode versions seen in the DerivedAge file, and use that as an
+ * index into a table of unicode versions.
+ */
+#define UNICODE_MAJ_SHIFT		(16)
+#define UNICODE_MIN_SHIFT		(8)
+
+#define UNICODE_MAJ_MAX			((unsigned short)-1)
+#define UNICODE_MIN_MAX			((unsigned char)-1)
+#define UNICODE_REV_MAX			((unsigned char)-1)
+
+#define UNICODE_AGE(MAJ,MIN,REV)			\
+	(((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) |	\
+	 ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) |	\
+	 ((unsigned int)(REV)))
+
+unsigned int *ages;
+int ages_count;
+
+unsigned int unicode_maxage;
+
+static int age_valid(unsigned int major, unsigned int minor,
+		     unsigned int revision)
+{
+	if (major > UNICODE_MAJ_MAX)
+		return 0;
+	if (minor > UNICODE_MIN_MAX)
+		return 0;
+	if (revision > UNICODE_REV_MAX)
+		return 0;
+	return 1;
+}
+
+/* ------------------------------------------------------------------ */
+
+/*
+ * utf8trie_t
+ *
+ * A compact binary tree, used to decode UTF-8 characters.
+ *
+ * Internal nodes are one byte for the node itself, and up to three
+ * bytes for an offset into the tree.  The first byte contains the
+ * following information:
+ *  NEXTBYTE  - flag        - advance to next byte if set
+ *  BITNUM    - 3 bit field - the bit number to tested
+ *  OFFLEN    - 2 bit field - number of bytes in the offset
+ * if offlen == 0 (non-branching node)
+ *  RIGHTPATH - 1 bit field - set if the following node is for the
+ *                            right-hand path (tested bit is set)
+ *  TRIENODE  - 1 bit field - set if the following node is an internal
+ *                            node, otherwise it is a leaf node
+ * if offlen != 0 (branching node)
+ *  LEFTNODE  - 1 bit field - set if the left-hand node is internal
+ *  RIGHTNODE - 1 bit field - set if the right-hand node is internal
+ *
+ * Due to the way utf8 works, there cannot be branching nodes with
+ * NEXTBYTE set, and moreover those nodes always have a righthand
+ * descendant.
+ */
+typedef unsigned char utf8trie_t;
+#define BITNUM		0x07
+#define NEXTBYTE	0x08
+#define OFFLEN		0x30
+#define OFFLEN_SHIFT	4
+#define RIGHTPATH	0x40
+#define TRIENODE	0x80
+#define RIGHTNODE	0x40
+#define LEFTNODE	0x80
+
+/*
+ * utf8leaf_t
+ *
+ * The leaves of the trie are embedded in the trie, and so the same
+ * underlying datatype, unsigned char.
+ *
+ * leaf[0]: The unicode version, stored as a generation number that is
+ *          an index into utf8agetab[].  With this we can filter code
+ *          points based on the unicode version in which they were
+ *          defined.  The CCC of a non-defined code point is 0.
+ * leaf[1]: Canonical Combining Class. During normalization, we need
+ *          to do a stable sort into ascending order of all characters
+ *          with a non-zero CCC that occur between two characters with
+ *          a CCC of 0, or at the begin or end of a string.
+ *          The unicode standard guarantees that all CCC values are
+ *          between 0 and 254 inclusive, which leaves 255 available as
+ *          a special value.
+ *          Code points with CCC 0 are known as stoppers.
+ * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
+ *          start of a NUL-terminated string that is the decomposition
+ *          of the character.
+ *          The CCC of a decomposable character is the same as the CCC
+ *          of the first character of its decomposition.
+ *          Some characters decompose as the empty string: these are
+ *          characters with the Default_Ignorable_Code_Point property.
+ *          These do affect normalization, as they all have CCC 0.
+ *
+ * The decompositions in the trie have been fully expanded.
+ *
+ * Casefolding, if applicable, is also done using decompositions.
+ */
+typedef unsigned char utf8leaf_t;
+
+#define LEAF_GEN(LEAF)	((LEAF)[0])
+#define LEAF_CCC(LEAF)	((LEAF)[1])
+#define LEAF_STR(LEAF)	((const char*)((LEAF) + 2))
+
+#define MAXGEN		(255)
+
+#define MINCCC		(0)
+#define MAXCCC		(254)
+#define STOPPER		(0)
+#define DECOMPOSE	(255)
+#define HANGUL		((char)(255))
+
+#define UTF8HANGULLEAF	(12)
+
+struct tree;
+static utf8leaf_t *utf8nlookup(struct tree *, unsigned char *,
+			       const char *, size_t);
+static utf8leaf_t *utf8lookup(struct tree *, unsigned char *, const char *);
+
+unsigned char *utf8data;
+size_t utf8data_size;
+
+utf8trie_t *nfdi;
+utf8trie_t *nfdicf;
+
+/* ------------------------------------------------------------------ */
+
+/*
+ * UTF8 valid ranges.
+ *
+ * The UTF-8 encoding spreads the bits of a 32bit word over several
+ * bytes. This table gives the ranges that can be held and how they'd
+ * be represented.
+ *
+ * 0x00000000 0x0000007F: 0xxxxxxx
+ * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
+ * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
+ * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ *
+ * There is an additional requirement on UTF-8, in that only the
+ * shortest representation of a 32bit value is to be used.  A decoder
+ * must not decode sequences that do not satisfy this requirement.
+ * Thus the allowed ranges have a lower bound.
+ *
+ * 0x00000000 0x0000007F: 0xxxxxxx
+ * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
+ * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
+ * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ *
+ * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
+ * 17 planes of 65536 values.  This limits the sequences actually seen
+ * even more, to just the following.
+ *
+ *          0 -     0x7f: 0                     0x7f
+ *       0x80 -    0x7ff: 0xc2 0x80             0xdf 0xbf
+ *      0x800 -   0xffff: 0xe0 0xa0 0x80        0xef 0xbf 0xbf
+ *    0x10000 - 0x10ffff: 0xf0 0x90 0x80 0x80   0xf4 0x8f 0xbf 0xbf
+ *
+ * Even within those ranges not all values are allowed: the surrogates
+ * 0xd800 - 0xdfff should never be seen.
+ *
+ * Note that the longest sequence seen with valid usage is 4 bytes,
+ * the same a single UTF-32 character.  This makes the UTF-8
+ * representation of Unicode strictly smaller than UTF-32.
+ *
+ * The shortest sequence requirement was introduced by:
+ *    Corrigendum #1: UTF-8 Shortest Form
+ * It can be found here:
+ *    http://www.unicode.org/versions/corrigendum1.html
+ *
+ */
+
+#define UTF8_2_BITS     0xC0
+#define UTF8_3_BITS     0xE0
+#define UTF8_4_BITS     0xF0
+#define UTF8_N_BITS     0x80
+#define UTF8_2_MASK     0xE0
+#define UTF8_3_MASK     0xF0
+#define UTF8_4_MASK     0xF8
+#define UTF8_N_MASK     0xC0
+#define UTF8_V_MASK     0x3F
+#define UTF8_V_SHIFT    6
+
+static int utf8encode(char *str, unsigned int val)
+{
+	int len;
+
+	if (val < 0x80) {
+		str[0] = val;
+		len = 1;
+	} else if (val < 0x800) {
+		str[1] = val & UTF8_V_MASK;
+		str[1] |= UTF8_N_BITS;
+		val >>= UTF8_V_SHIFT;
+		str[0] = val;
+		str[0] |= UTF8_2_BITS;
+		len = 2;
+	} else if (val < 0x10000) {
+		str[2] = val & UTF8_V_MASK;
+		str[2] |= UTF8_N_BITS;
+		val >>= UTF8_V_SHIFT;
+		str[1] = val & UTF8_V_MASK;
+		str[1] |= UTF8_N_BITS;
+		val >>= UTF8_V_SHIFT;
+		str[0] = val;
+		str[0] |= UTF8_3_BITS;
+		len = 3;
+	} else if (val < 0x110000) {
+		str[3] = val & UTF8_V_MASK;
+		str[3] |= UTF8_N_BITS;
+		val >>= UTF8_V_SHIFT;
+		str[2] = val & UTF8_V_MASK;
+		str[2] |= UTF8_N_BITS;
+		val >>= UTF8_V_SHIFT;
+		str[1] = val & UTF8_V_MASK;
+		str[1] |= UTF8_N_BITS;
+		val >>= UTF8_V_SHIFT;
+		str[0] = val;
+		str[0] |= UTF8_4_BITS;
+		len = 4;
+	} else {
+		printf("%#x: illegal val\n", val);
+		len = 0;
+	}
+	return len;
+}
+
+static unsigned int utf8decode(const char *str)
+{
+	const unsigned char *s = (const unsigned char*)str;
+	unsigned int unichar = 0;
+
+	if (*s < 0x80) {
+		unichar = *s;
+	} else if (*s < UTF8_3_BITS) {
+		unichar = *s++ & 0x1F;
+		unichar <<= UTF8_V_SHIFT;
+		unichar |= *s & 0x3F;
+	} else if (*s < UTF8_4_BITS) {
+		unichar = *s++ & 0x0F;
+		unichar <<= UTF8_V_SHIFT;
+		unichar |= *s++ & 0x3F;
+		unichar <<= UTF8_V_SHIFT;
+		unichar |= *s & 0x3F;
+	} else {
+		unichar = *s++ & 0x0F;
+		unichar <<= UTF8_V_SHIFT;
+		unichar |= *s++ & 0x3F;
+		unichar <<= UTF8_V_SHIFT;
+		unichar |= *s++ & 0x3F;
+		unichar <<= UTF8_V_SHIFT;
+		unichar |= *s & 0x3F;
+	}
+	return unichar;
+}
+
+static int utf32valid(unsigned int unichar)
+{
+	return unichar < 0x110000;
+}
+
+#define HANGUL_SYLLABLE(U)	((U) >= 0xAC00 && (U) <= 0xD7A3)
+
+#define NODE 1
+#define LEAF 0
+
+struct tree {
+	void *root;
+	int childnode;
+	const char *type;
+	unsigned int maxage;
+	struct tree *next;
+	int (*leaf_equal)(void *, void *);
+	void (*leaf_print)(void *, int);
+	int (*leaf_mark)(void *);
+	int (*leaf_size)(void *);
+	int *(*leaf_index)(struct tree *, void *);
+	unsigned char *(*leaf_emit)(void *, unsigned char *);
+	int leafindex[0x110000];
+	int index;
+};
+
+struct node {
+	int index;
+	int offset;
+	int mark;
+	int size;
+	struct node *parent;
+	void *left;
+	void *right;
+	unsigned char bitnum;
+	unsigned char nextbyte;
+	unsigned char leftnode;
+	unsigned char rightnode;
+	unsigned int keybits;
+	unsigned int keymask;
+};
+
+/*
+ * Example lookup function for a tree.
+ */
+static void *lookup(struct tree *tree, const char *key)
+{
+	struct node *node;
+	void *leaf = NULL;
+
+	node = tree->root;
+	while (!leaf && node) {
+		if (node->nextbyte)
+			key++;
+		if (*key & (1 << (node->bitnum & 7))) {
+			/* Right leg */
+			if (node->rightnode == NODE) {
+				node = node->right;
+			} else if (node->rightnode == LEAF) {
+				leaf = node->right;
+			} else {
+				node = NULL;
+			}
+		} else {
+			/* Left leg */
+			if (node->leftnode == NODE) {
+				node = node->left;
+			} else if (node->leftnode == LEAF) {
+				leaf = node->left;
+			} else {
+				node = NULL;
+			}
+		}
+	}
+
+	return leaf;
+}
+
+/*
+ * A simple non-recursive tree walker: keep track of visits to the
+ * left and right branches in the leftmask and rightmask.
+ */
+static void tree_walk(struct tree *tree)
+{
+	struct node *node;
+	unsigned int leftmask;
+	unsigned int rightmask;
+	unsigned int bitmask;
+	int indent = 1;
+	int nodes, singletons, leaves;
+
+	nodes = singletons = leaves = 0;
+
+	printf("%s_%x root %p\n", tree->type, tree->maxage, tree->root);
+	if (tree->childnode == LEAF) {
+		assert(tree->root);
+		tree->leaf_print(tree->root, indent);
+		leaves = 1;
+	} else {
+		assert(tree->childnode == NODE);
+		node = tree->root;
+		leftmask = rightmask = 0;
+		while (node) {
+			printf("%*snode @ %p bitnum %d nextbyte %d"
+			       " left %p right %p mask %x bits %x\n",
+				indent, "", node,
+				node->bitnum, node->nextbyte,
+				node->left, node->right,
+				node->keymask, node->keybits);
+			nodes += 1;
+			if (!(node->left && node->right))
+				singletons += 1;
+
+			while (node) {
+				bitmask = 1 << node->bitnum;
+				if ((leftmask & bitmask) == 0) {
+					leftmask |= bitmask;
+					if (node->leftnode == LEAF) {
+						assert(node->left);
+						tree->leaf_print(node->left,
+								 indent+1);
+						leaves += 1;
+					} else if (node->left) {
+						assert(node->leftnode == NODE);
+						indent += 1;
+						node = node->left;
+						break;
+					}
+				}
+				if ((rightmask & bitmask) == 0) {
+					rightmask |= bitmask;
+					if (node->rightnode == LEAF) {
+						assert(node->right);
+						tree->leaf_print(node->right,
+								 indent+1);
+						leaves += 1;
+					} else if (node->right) {
+						assert(node->rightnode == NODE);
+						indent += 1;
+						node = node->right;
+						break;
+					}
+				}
+				leftmask &= ~bitmask;
+				rightmask &= ~bitmask;
+				node = node->parent;
+				indent -= 1;
+			}
+		}
+	}
+	printf("nodes %d leaves %d singletons %d\n",
+	       nodes, leaves, singletons);
+}
+
+/*
+ * Allocate an initialize a new internal node.
+ */
+static struct node *alloc_node(struct node *parent)
+{
+	struct node *node;
+	int bitnum;
+
+	node = malloc(sizeof(*node));
+	node->left = node->right = NULL;
+	node->parent = parent;
+	node->leftnode = NODE;
+	node->rightnode = NODE;
+	node->keybits = 0;
+	node->keymask = 0;
+	node->mark = 0;
+	node->index = 0;
+	node->offset = -1;
+	node->size = 4;
+
+	if (node->parent) {
+		bitnum = parent->bitnum;
+		if ((bitnum & 7) == 0) {
+			node->bitnum = bitnum + 7 + 8;
+			node->nextbyte = 1;
+		} else {
+			node->bitnum = bitnum - 1;
+			node->nextbyte = 0;
+		}
+	} else {
+		node->bitnum = 7;
+		node->nextbyte = 0;
+	}
+
+	return node;
+}
+
+/*
+ * Insert a new leaf into the tree, and collapse any subtrees that are
+ * fully populated and end in identical leaves. A nextbyte tagged
+ * internal node will not be removed to preserve the tree's integrity.
+ * Note that due to the structure of utf8, no nextbyte tagged node
+ * will be a candidate for removal.
+ */
+static int insert(struct tree *tree, char *key, int keylen, void *leaf)
+{
+	struct node *node;
+	struct node *parent;
+	void **cursor;
+	int keybits;
+
+	assert(keylen >= 1 && keylen <= 4);
+
+	node = NULL;
+	cursor = &tree->root;
+	keybits = 8 * keylen;
+
+	/* Insert, creating path along the way. */
+	while (keybits) {
+		if (!*cursor)
+			*cursor = alloc_node(node);
+		node = *cursor;
+		if (node->nextbyte)
+			key++;
+		if (*key & (1 << (node->bitnum & 7)))
+			cursor = &node->right;
+		else
+			cursor = &node->left;
+		keybits--;
+	}
+	*cursor = leaf;
+
+	/* Merge subtrees if possible. */
+	while (node) {
+		if (*key & (1 << (node->bitnum & 7)))
+			node->rightnode = LEAF;
+		else
+			node->leftnode = LEAF;
+		if (node->nextbyte)
+			break;
+		if (node->leftnode == NODE || node->rightnode == NODE)
+			break;
+		assert(node->left);
+		assert(node->right);
+		/* Compare */
+		if (! tree->leaf_equal(node->left, node->right))
+			break;
+		/* Keep left, drop right leaf. */
+		leaf = node->left;
+		/* Check in parent */
+		parent = node->parent;
+		if (!parent) {
+			/* root of tree! */
+			tree->root = leaf;
+			tree->childnode = LEAF;
+		} else if (parent->left == node) {
+			parent->left = leaf;
+			parent->leftnode = LEAF;
+			if (parent->right) {
+				parent->keymask = 0;
+				parent->keybits = 0;
+			} else {
+				parent->keymask |= (1 << node->bitnum);
+			}
+		} else if (parent->right == node) {
+			parent->right = leaf;
+			parent->rightnode = LEAF;
+			if (parent->left) {
+				parent->keymask = 0;
+				parent->keybits = 0;
+			} else {
+				parent->keymask |= (1 << node->bitnum);
+				parent->keybits |= (1 << node->bitnum);
+			}
+		} else {
+			/* internal tree error */
+			assert(0);
+		}
+		free(node);
+		node = parent;
+	}
+
+	/* Propagate keymasks up along singleton chains. */
+	while (node) {
+		parent = node->parent;
+		if (!parent)
+			break;
+		/* Nix the mask for parents with two children. */
+		if (node->keymask == 0) {
+			parent->keymask = 0;
+			parent->keybits = 0;
+		} else if (parent->left && parent->right) {
+			parent->keymask = 0;
+			parent->keybits = 0;
+		} else {
+			assert((parent->keymask & node->keymask) == 0);
+			parent->keymask |= node->keymask;
+			parent->keymask |= (1 << parent->bitnum);
+			parent->keybits |= node->keybits;
+			if (parent->right)
+				parent->keybits |= (1 << parent->bitnum);
+		}
+		node = parent;
+	}
+
+	return 0;
+}
+
+/*
+ * Prune internal nodes.
+ *
+ * Fully populated subtrees that end at the same leaf have already
+ * been collapsed.  There are still internal nodes that have for both
+ * their left and right branches a sequence of singletons that make
+ * identical choices and end in identical leaves.  The keymask and
+ * keybits collected in the nodes describe the choices made in these
+ * singleton chains.  When they are identical for the left and right
+ * branch of a node, and the two leaves comare identical, the node in
+ * question can be removed.
+ *
+ * Note that nodes with the nextbyte tag set will not be removed by
+ * this to ensure tree integrity.  Note as well that the structure of
+ * utf8 ensures that these nodes would not have been candidates for
+ * removal in any case.
+ */
+static void prune(struct tree *tree)
+{
+	struct node *node;
+	struct node *left;
+	struct node *right;
+	struct node *parent;
+	void *leftleaf;
+	void *rightleaf;
+	unsigned int leftmask;
+	unsigned int rightmask;
+	unsigned int bitmask;
+	int count;
+
+	if (verbose > 0)
+		printf("Pruning %s_%x\n", tree->type, tree->maxage);
+
+	count = 0;
+	if (tree->childnode == LEAF)
+		return;
+	if (!tree->root)
+		return;
+
+	leftmask = rightmask = 0;
+	node = tree->root;
+	while (node) {
+		if (node->nextbyte)
+			goto advance;
+		if (node->leftnode == LEAF)
+			goto advance;
+		if (node->rightnode == LEAF)
+			goto advance;
+		if (!node->left)
+			goto advance;
+		if (!node->right)
+			goto advance;
+		left = node->left;
+		right = node->right;
+		if (left->keymask == 0)
+			goto advance;
+		if (right->keymask == 0)
+			goto advance;
+		if (left->keymask != right->keymask)
+			goto advance;
+		if (left->keybits != right->keybits)
+			goto advance;
+		leftleaf = NULL;
+		while (!leftleaf) {
+			assert(left->left || left->right);
+			if (left->leftnode == LEAF)
+				leftleaf = left->left;
+			else if (left->rightnode == LEAF)
+				leftleaf = left->right;
+			else if (left->left)
+				left = left->left;
+			else if (left->right)
+				left = left->right;
+			else
+				assert(0);
+		}
+		rightleaf = NULL;
+		while (!rightleaf) {
+			assert(right->left || right->right);
+			if (right->leftnode == LEAF)
+				rightleaf = right->left;
+			else if (right->rightnode == LEAF)
+				rightleaf = right->right;
+			else if (right->left)
+				right = right->left;
+			else if (right->right)
+				right = right->right;
+			else
+				assert(0);
+		}
+		if (! tree->leaf_equal(leftleaf, rightleaf))
+			goto advance;
+		/*
+		 * This node has identical singleton-only subtrees.
+		 * Remove it.
+		 */
+		parent = node->parent;
+		left = node->left;
+		right = node->right;
+		if (parent->left == node)
+			parent->left = left;
+		else if (parent->right == node)
+			parent->right = left;
+		else
+			assert(0);
+		left->parent = parent;
+		left->keymask |= (1 << node->bitnum);
+		node->left = NULL;
+		while (node) {
+			bitmask = 1 << node->bitnum;
+			leftmask &= ~bitmask;
+			rightmask &= ~bitmask;
+			if (node->leftnode == NODE && node->left) {
+				left = node->left;
+				free(node);
+				count++;
+				node = left;
+			} else if (node->rightnode == NODE && node->right) {
+				right = node->right;
+				free(node);
+				count++;
+				node = right;
+			} else {
+				node = NULL;
+			}
+		}
+		/* Propagate keymasks up along singleton chains. */
+		node = parent;
+		/* Force re-check */
+		bitmask = 1 << node->bitnum;
+		leftmask &= ~bitmask;
+		rightmask &= ~bitmask;
+		for (;;) {
+			if (node->left && node->right)
+				break;
+			if (node->left) {
+				left = node->left;
+				node->keymask |= left->keymask;
+				node->keybits |= left->keybits;
+			}
+			if (node->right) {
+				right = node->right;
+				node->keymask |= right->keymask;
+				node->keybits |= right->keybits;
+			}
+			node->keymask |= (1 << node->bitnum);
+			node = node->parent;
+			/* Force re-check */
+			bitmask = 1 << node->bitnum;
+			leftmask &= ~bitmask;
+			rightmask &= ~bitmask;
+		}
+	advance:
+		bitmask = 1 << node->bitnum;
+		if ((leftmask & bitmask) == 0 &&
+		    node->leftnode == NODE &&
+		    node->left) {
+			leftmask |= bitmask;
+			node = node->left;
+		} else if ((rightmask & bitmask) == 0 &&
+			   node->rightnode == NODE &&
+			   node->right) {
+			rightmask |= bitmask;
+			node = node->right;
+		} else {
+			leftmask &= ~bitmask;
+			rightmask &= ~bitmask;
+			node = node->parent;
+		}
+	}
+	if (verbose > 0)
+		printf("Pruned %d nodes\n", count);
+}
+
+/*
+ * Mark the nodes in the tree that lead to leaves that must be
+ * emitted.
+ */
+static void mark_nodes(struct tree *tree)
+{
+	struct node *node;
+	struct node *n;
+	unsigned int leftmask;
+	unsigned int rightmask;
+	unsigned int bitmask;
+	int marked;
+
+	marked = 0;
+	if (verbose > 0)
+		printf("Marking %s_%x\n", tree->type, tree->maxage);
+	if (tree->childnode == LEAF)
+		goto done;
+
+	assert(tree->childnode == NODE);
+	node = tree->root;
+	leftmask = rightmask = 0;
+	while (node) {
+		bitmask = 1 << node->bitnum;
+		if ((leftmask & bitmask) == 0) {
+			leftmask |= bitmask;
+			if (node->leftnode == LEAF) {
+				assert(node->left);
+				if (tree->leaf_mark(node->left)) {
+					n = node;
+					while (n && !n->mark) {
+						marked++;
+						n->mark = 1;
+						n = n->parent;
+					}
+				}
+			} else if (node->left) {
+				assert(node->leftnode == NODE);
+				node = node->left;
+				continue;
+			}
+		}
+		if ((rightmask & bitmask) == 0) {
+			rightmask |= bitmask;
+			if (node->rightnode == LEAF) {
+				assert(node->right);
+				if (tree->leaf_mark(node->right)) {
+					n = node;
+					while (n && !n->mark) {
+						marked++;
+						n->mark = 1;
+						n = n->parent;
+					}
+				}
+			} else if (node->right) {
+				assert(node->rightnode == NODE);
+				node = node->right;
+				continue;
+			}
+		}
+		leftmask &= ~bitmask;
+		rightmask &= ~bitmask;
+		node = node->parent;
+	}
+
+	/* second pass: left siblings and singletons */
+
+	assert(tree->childnode == NODE);
+	node = tree->root;
+	leftmask = rightmask = 0;
+	while (node) {
+		bitmask = 1 << node->bitnum;
+		if ((leftmask & bitmask) == 0) {
+			leftmask |= bitmask;
+			if (node->leftnode == LEAF) {
+				assert(node->left);
+				if (tree->leaf_mark(node->left)) {
+					n = node;
+					while (n && !n->mark) {
+						marked++;
+						n->mark = 1;
+						n = n->parent;
+					}
+				}
+			} else if (node->left) {
+				assert(node->leftnode == NODE);
+				node = node->left;
+				if (!node->mark && node->parent->mark) {
+					marked++;
+					node->mark = 1;
+				}
+				continue;
+			}
+		}
+		if ((rightmask & bitmask) == 0) {
+			rightmask |= bitmask;
+			if (node->rightnode == LEAF) {
+				assert(node->right);
+				if (tree->leaf_mark(node->right)) {
+					n = node;
+					while (n && !n->mark) {
+						marked++;
+						n->mark = 1;
+						n = n->parent;
+					}
+				}
+			} else if (node->right) {
+				assert(node->rightnode == NODE);
+				node = node->right;
+				if (!node->mark && node->parent->mark &&
+				    !node->parent->left) {
+					marked++;
+					node->mark = 1;
+				}
+				continue;
+			}
+		}
+		leftmask &= ~bitmask;
+		rightmask &= ~bitmask;
+		node = node->parent;
+	}
+done:
+	if (verbose > 0)
+		printf("Marked %d nodes\n", marked);
+}
+
+/*
+ * Compute the index of each node and leaf, which is the offset in the
+ * emitted trie.  These values must be pre-computed because relative
+ * offsets between nodes are used to navigate the tree.
+ */
+static int index_nodes(struct tree *tree, int index)
+{
+	struct node *node;
+	unsigned int leftmask;
+	unsigned int rightmask;
+	unsigned int bitmask;
+	int count;
+	int indent;
+
+	/* Align to a cache line (or half a cache line?). */
+	while (index % 64)
+		index++;
+	tree->index = index;
+	indent = 1;
+	count = 0;
+
+	if (verbose > 0)
+		printf("Indexing %s_%x: %d\n", tree->type, tree->maxage, index);
author	Masahiro Yamada <yamada.masahiro@socionext.com>	2019-04-28 13:45:36 -0400
committer	Theodore Ts'o <tytso@mit.edu>	2019-04-28 13:45:36 -0400
commit	28ba53c07638f31b153e3a32672a6124d0ff2a97 (patch)
tree	d89dfc7e3e9661b15d98e5192433121f5ac081c2 /fs/unicode
parent	0a790fe4389d88253563c5e22bea47e6d357b525 (diff)
download	linux-28ba53c07638f31b153e3a32672a6124d0ff2a97.tar.gz linux-28ba53c07638f31b153e3a32672a6124d0ff2a97.tar.bz2 linux-28ba53c07638f31b153e3a32672a6124d0ff2a97.zip