Andrew Geissler | 4ed12e1 | 2020-06-05 18:00:41 -0500 | [diff] [blame^] | 1 | #!/usr/bin/perl -w |
| 2 | # |
| 3 | # Copyright (C) 2016 Julian Andres Klode <jak@jak-linux.org> |
| 4 | # |
| 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy |
| 6 | # of this software and associated documentation files (the "Software"), to deal |
| 7 | # in the Software without restriction, including without limitation the rights |
| 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
| 9 | # copies of the Software, and to permit persons to whom the Software is |
| 10 | # furnished to do so, subject to the following conditions: |
| 11 | # |
| 12 | # The above copyright notice and this permission notice shall be included in |
| 13 | # all copies or substantial portions of the Software. |
| 14 | # |
| 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
| 21 | # THE SOFTWARE. |
| 22 | |
| 23 | =encoding utf8 |
| 24 | |
| 25 | =head1 NAME |
| 26 | |
| 27 | triehash - Generate a perfect hash function derived from a trie. |
| 28 | |
| 29 | =cut |
| 30 | |
| 31 | use strict; |
| 32 | use warnings; |
| 33 | use utf8; |
| 34 | use Getopt::Long; |
| 35 | |
| 36 | =head1 SYNOPSIS |
| 37 | |
| 38 | B<triehash> [S<I<option>>] [S<I<input file>>] |
| 39 | |
| 40 | =head1 DESCRIPTION |
| 41 | |
| 42 | triehash takes a list of words in input file and generates a function and |
| 43 | an enumeration to describe the word |
| 44 | |
| 45 | =head1 INPUT FILE FORMAT |
| 46 | |
| 47 | The file consists of multiple lines of the form: |
| 48 | |
| 49 | [label ~ ] word [= value] |
| 50 | |
| 51 | This maps word to value, and generates an enumeration with entries of the form: |
| 52 | |
| 53 | label = value |
| 54 | |
| 55 | If I<label> is undefined, the word will be used, the minus character will be |
| 56 | replaced by an underscore. If value is undefined it is counted upwards from |
| 57 | the last value. |
| 58 | |
| 59 | There may also be one line of the format |
| 60 | |
| 61 | [ label ~] = value |
| 62 | |
| 63 | Which defines the value to be used for non-existing keys. Note that this also |
| 64 | changes default value for other keys, as for normal entries. So if you place |
| 65 | |
| 66 | = 0 |
| 67 | |
| 68 | at the beginning of the file, unknown strings map to 0, and the other strings |
| 69 | map to values starting with 1. If label is not specified, the default is |
| 70 | I<Unknown>. |
| 71 | |
| 72 | =head1 OPTIONS |
| 73 | |
| 74 | =over 4 |
| 75 | |
| 76 | =item B<-C>I<.c file> B<--code>=I<.c file> |
| 77 | |
| 78 | Generate code in the given file. |
| 79 | |
| 80 | =item B<-H>I<header file> B<--header>=I<header file> |
| 81 | |
| 82 | Generate a header in the given file, containing a declaration of the hash |
| 83 | function and an enumeration. |
| 84 | |
| 85 | =item B<--enum-name=>I<word> |
| 86 | |
| 87 | The name of the enumeration. |
| 88 | |
| 89 | =item B<--function-name=>I<word> |
| 90 | |
| 91 | The name of the function. |
| 92 | |
| 93 | =item B<--label-prefix=>I<word> |
| 94 | |
| 95 | The prefix to use for labels. |
| 96 | |
| 97 | =item B<--label-uppercase> |
| 98 | |
| 99 | Uppercase label names when normalizing them. |
| 100 | |
| 101 | =item B<--namespace=>I<name> |
| 102 | |
| 103 | Put the function and enum into a namespace (C++) |
| 104 | |
| 105 | =item B<--class=>I<name> |
| 106 | |
| 107 | Put the function and enum into a class (C++) |
| 108 | |
| 109 | =item B<--enum-class> |
| 110 | |
| 111 | Generate an enum class instead of an enum (C++) |
| 112 | |
| 113 | =item B<--counter-name=>I<name> |
| 114 | |
| 115 | Use I<name> for a counter that is set to the latest entry in the enumeration |
| 116 | + 1. This can be useful for defining array sizes. |
| 117 | |
| 118 | =item B<--ignore-case> |
| 119 | |
| 120 | Ignore case for words. |
| 121 | |
| 122 | =item B<--multi-byte>=I<value> |
| 123 | |
| 124 | Generate code reading multiple bytes at once. The value is a string of power |
| 125 | of twos to enable. The default value is 320 meaning that 8, 4, and single byte |
| 126 | reads are enabled. Specify 0 to disable multi-byte completely, or add 2 if you |
| 127 | also want to allow 2-byte reads. 2-byte reads are disabled by default because |
| 128 | they negatively affect performance on older Intel architectures. |
| 129 | |
| 130 | This generates code for both multiple bytes and single byte reads, but only |
| 131 | enables the multiple byte reads of GNU C compatible compilers, as the following |
| 132 | extensions are used: |
| 133 | |
| 134 | =over 8 |
| 135 | |
| 136 | =item Byte-aligned integers |
| 137 | |
| 138 | We must be able to generate integers that are aligned to a single byte using: |
| 139 | |
| 140 | typedef uint64_t __attribute__((aligned (1))) triehash_uu64; |
| 141 | |
| 142 | =item Byte-order |
| 143 | |
| 144 | The macros __BYTE_ORDER__ and __ORDER_LITTLE_ENDIAN__ must be defined. |
| 145 | |
| 146 | =back |
| 147 | |
| 148 | We forcefully disable multi-byte reads on platforms where the variable |
| 149 | I<__ARM_ARCH> is defined and I<__ARM_FEATURE_UNALIGNED> is not defined, |
| 150 | as there is a measurable overhead from emulating the unaligned reads on |
| 151 | ARM. |
| 152 | |
| 153 | =item B<--language=>I<language> |
| 154 | |
| 155 | Generate a file in the specified language. Currently known are 'C' and 'tree', |
| 156 | the latter generating a tree. |
| 157 | |
| 158 | =item B<--include=>I<header> |
| 159 | |
| 160 | Add the header to the include statements of the header file. The value must |
| 161 | be surrounded by quotes or angle brackets for C code. May be specified multiple |
| 162 | times. |
| 163 | |
| 164 | =back |
| 165 | |
| 166 | =cut |
| 167 | |
| 168 | my $unknown = -1; |
| 169 | my $unknown_label = undef; |
| 170 | my $counter_start = 0; |
| 171 | my $enum_name = 'PerfectKey'; |
| 172 | my $function_name = 'PerfectHash'; |
| 173 | my $enum_class = 0; |
| 174 | |
| 175 | my $code_name = '-'; |
| 176 | my $header_name = '-'; |
| 177 | my $code; |
| 178 | my $header; |
| 179 | my $label_prefix = undef; |
| 180 | my $label_uppercase = 0; |
| 181 | my $ignore_case = 0; |
| 182 | my $multi_byte = '320'; |
| 183 | my $language = 'C'; |
| 184 | my $counter_name = undef; |
| 185 | my @includes = (); |
| 186 | |
| 187 | |
| 188 | Getopt::Long::config('default', |
| 189 | 'bundling', |
| 190 | 'no_getopt_compat', |
| 191 | 'no_auto_abbrev', |
| 192 | 'permute', |
| 193 | 'auto_help'); |
| 194 | |
| 195 | GetOptions ('code|C=s' => \$code_name, |
| 196 | 'header|H=s' => \$header_name, |
| 197 | 'function-name=s' => \$function_name, |
| 198 | 'label-prefix=s' => \$label_prefix, |
| 199 | 'label-uppercase' => \$label_uppercase, |
| 200 | 'ignore-case' => \$ignore_case, |
| 201 | 'enum-name=s' => \$enum_name, |
| 202 | 'language|l=s' => \$language, |
| 203 | 'multi-byte=s' => \$multi_byte, |
| 204 | 'enum-class' => \$enum_class, |
| 205 | 'include=s' => \@includes, |
| 206 | 'counter-name=s' => \$counter_name) |
| 207 | or die('Could not parse options!'); |
| 208 | |
| 209 | |
| 210 | # This implements a simple trie. Each node has three attributes: |
| 211 | # |
| 212 | # children - A hash of keys to other nodes |
| 213 | # value - The value to be stored here |
| 214 | # label - A named representation of the value. |
| 215 | # |
| 216 | # The key at each level of the trie can consist of one or more bytes, and the |
| 217 | # trie can be normalized to a form where all keys at a level have the same |
| 218 | # length using rebuild_tree(). |
| 219 | package Trie { |
| 220 | |
| 221 | sub new { |
| 222 | my $class = shift; |
| 223 | my $self = {}; |
| 224 | bless $self, $class; |
| 225 | |
| 226 | $self->{children} = {}; |
| 227 | $self->{value} = undef; |
| 228 | $self->{label} = undef; |
| 229 | |
| 230 | return $self; |
| 231 | } |
| 232 | |
| 233 | # Return the largest power of 2 smaller or equal to the argument |
| 234 | sub alignpower2 { |
| 235 | my ($self, $length) = @_; |
| 236 | |
| 237 | return 8 if ($length >= 8 && $multi_byte =~ /3/); |
| 238 | return 4 if ($length >= 4 && $multi_byte =~ /2/); |
| 239 | return 2 if ($length >= 2 && $multi_byte =~ /1/); |
| 240 | |
| 241 | return 1; |
| 242 | } |
| 243 | |
| 244 | # Split the key into a head block and a tail |
| 245 | sub split_key { |
| 246 | my ($self, $key) = @_; |
| 247 | my $length = length $key; |
| 248 | my $split = $self->alignpower2($length); |
| 249 | |
| 250 | return (substr($key, 0, $split), substr($key, $split)); |
| 251 | } |
| 252 | |
| 253 | # Given a key, a label, and a value, insert that into the tree, possibly |
| 254 | # replacing an existing node. |
| 255 | sub insert { |
| 256 | my ($self, $key, $label, $value) = @_; |
| 257 | |
| 258 | if (length($key) == 0) { |
| 259 | $self->{label} = $label; |
| 260 | $self->{value} = $value; |
| 261 | return; |
| 262 | } |
| 263 | |
| 264 | my ($child, $tail) = $self->split_key($key); |
| 265 | |
| 266 | $self->{children}{$child} = Trie->new if (!defined($self->{children}{$child})); |
| 267 | |
| 268 | $self->{children}{$child}->insert($tail, $label, $value); |
| 269 | } |
| 270 | |
| 271 | # Construct a new trie that only contains words of a given length. This |
| 272 | # is used to split up the common trie after knowing all words, so we can |
| 273 | # switch on the expected word length first, and have the per-trie function |
| 274 | # implement simple longest prefix matching. |
| 275 | sub filter_depth { |
| 276 | my ($self, $togo) = @_; |
| 277 | |
| 278 | my $new = Trie->new; |
| 279 | |
| 280 | if ($togo != 0) { |
| 281 | my $found = 0; |
| 282 | foreach my $key (sort keys %{$self->{children}}) { |
| 283 | if ($togo > length($key) || defined $self->{children}{$key}->{value}) { |
| 284 | my $child = $self->{children}{$key}->filter_depth($togo - length($key)); |
| 285 | |
| 286 | $new->{children}{$key}= $child if defined $child; |
| 287 | $found = 1 if defined $child; |
| 288 | } |
| 289 | } |
| 290 | return if (!$found); |
| 291 | } else { |
| 292 | $new->{value} = $self->{value}; |
| 293 | $new->{label} = $self->{label}; |
| 294 | } |
| 295 | |
| 296 | return $new; |
| 297 | } |
| 298 | |
| 299 | # (helper for rebuild_tree) |
| 300 | # Reinsert all value nodes into the specified $trie, prepending $prefix |
| 301 | # to their $paths. |
| 302 | sub reinsert_value_nodes_into { |
| 303 | my ($self, $trie, $prefix) = @_; |
| 304 | |
| 305 | $trie->insert($prefix, $self->{label}, $self->{value}) if (defined $self->{value}); |
| 306 | |
| 307 | foreach my $key (sort keys %{$self->{children}}) { |
| 308 | $self->{children}{$key}->reinsert_value_nodes_into($trie, $prefix . $key); |
| 309 | } |
| 310 | } |
| 311 | |
| 312 | # (helper for rebuild_tree) |
| 313 | # Find the earliest point to split a key. Normally, we split at the maximum |
| 314 | # power of 2 that is greater or equal than the length of the key. When we |
| 315 | # are building an ASCII-optimised case-insensitive trie that simply ORs |
| 316 | # each byte with 0x20, we need to split at the first ambiguous character: |
| 317 | # |
| 318 | # For example, the words a-bc and a\rbc are identical in such a situation: |
| 319 | # '-' | 0x20 == '-' == '\r' | 0x20 |
| 320 | # We cannot simply switch on all 4 bytes at once, but need to split before |
| 321 | # the ambiguous character so we can process the ambiguous character on its |
| 322 | # own. |
| 323 | sub find_earlier_split { |
| 324 | my ($self, $key) = @_; |
| 325 | |
| 326 | if ($ignore_case) { |
| 327 | for my $i (0..length($key)-1) { |
| 328 | # If the key starts with an ambiguous character, we need to |
| 329 | # take only it. Otherwise, we need to take everything |
| 330 | # before the character. |
| 331 | return $self->alignpower2($i || 1) if (main::ambiguous(substr($key, $i, 1))); |
| 332 | } |
| 333 | } |
| 334 | return $self->alignpower2(length $key); |
| 335 | } |
| 336 | |
| 337 | # This rebuilds the trie, splitting each key before ambiguous characters |
| 338 | # as explained in find_earlier_split(), and then chooses the smallest |
| 339 | # such split at each level, so that all keys at all levels have the same |
| 340 | # length (so we can use a multi-byte switch). |
| 341 | sub rebuild_tree { |
| 342 | my $self = shift; |
| 343 | # Determine if/where we need to split before an ambiguous character |
| 344 | my $new_split = 99999999999999999; |
| 345 | foreach my $key (sort keys %{$self->{children}}) { |
| 346 | my $special_length = $self->find_earlier_split($key); |
| 347 | $new_split = $special_length if ($special_length < $new_split); |
| 348 | } |
| 349 | |
| 350 | # Start building a new uniform trie |
| 351 | my $newself = Trie->new; |
| 352 | $newself->{label} = $self->{label}; |
| 353 | $newself->{value} = $self->{value}; |
| 354 | $newself->{children} = {}; |
| 355 | |
| 356 | foreach my $key (sort keys %{$self->{children}}) { |
| 357 | my $head = substr($key, 0, $new_split); |
| 358 | my $tail = substr($key, $new_split); |
| 359 | # Rebuild the child node at $head, pushing $tail downwards |
| 360 | $newself->{children}{$head} //= Trie->new; |
| 361 | $self->{children}{$key}->reinsert_value_nodes_into($newself->{children}{$head}, $tail); |
| 362 | # We took up to one special character of each key label. There might |
| 363 | # be more, so we need to rebuild recursively. |
| 364 | $newself->{children}{$head} = $newself->{children}{$head}->rebuild_tree(); |
| 365 | } |
| 366 | |
| 367 | return $newself; |
| 368 | } |
| 369 | } |
| 370 | |
| 371 | # Code generator for C and C++ |
| 372 | package CCodeGen { |
| 373 | my $static = ($code_name eq $header_name) ? "static " : ""; |
| 374 | my $enum_specifier = $enum_class ? "enum class" : "enum"; |
| 375 | |
| 376 | sub new { |
| 377 | my $class = shift; |
| 378 | my $self = {}; |
| 379 | bless $self, $class; |
| 380 | |
| 381 | return $self; |
| 382 | } |
| 383 | |
| 384 | sub open_output { |
| 385 | my $self = shift; |
| 386 | if ($code_name ne '-') { |
| 387 | open($code, '>', $code_name) or die "Cannot open $code_name: $!" ; |
| 388 | } else { |
| 389 | $code = *STDOUT; |
| 390 | } |
| 391 | if($code_name eq $header_name) { |
| 392 | $header = $code; |
| 393 | } elsif ($header_name ne '-') { |
| 394 | open($header, '>', $header_name) or die "Cannot open $header_name: $!" ; |
| 395 | } else { |
| 396 | $header = *STDOUT; |
| 397 | } |
| 398 | } |
| 399 | |
| 400 | sub mangle_label { |
| 401 | my ($self, $label) = @_; |
| 402 | |
| 403 | $label = $label_prefix . $label if defined($label_prefix); |
| 404 | $label = uc $label if $label_uppercase; |
| 405 | |
| 406 | return $label; |
| 407 | } |
| 408 | |
| 409 | sub word_to_label { |
| 410 | my ($self, $word) = @_; |
| 411 | |
| 412 | $word =~ s/_/__/g; |
| 413 | $word =~ s/-/_/g; |
| 414 | |
| 415 | return $self->mangle_label($word); |
| 416 | } |
| 417 | |
| 418 | # Return a case label, by shifting and or-ing bytes in the word |
| 419 | sub case_label { |
| 420 | my ($self, $key) = @_; |
| 421 | |
| 422 | return sprintf("'%s'", substr($key, 0, 1)) if not $multi_byte; |
| 423 | |
| 424 | my $output = '0'; |
| 425 | |
| 426 | for my $i (0..length($key)-1) { |
| 427 | $output .= sprintf("| onechar('%s', %d, %d)", substr($key, $i, 1), 8 * $i, 8*length($key)); |
| 428 | } |
| 429 | |
| 430 | return $output; |
| 431 | } |
| 432 | |
| 433 | # Return an appropriate read instruction for $length bytes from $offset |
| 434 | sub switch_key { |
| 435 | my ($self, $offset, $length) = @_; |
| 436 | |
| 437 | return "string[$offset]" if $length == 1; |
| 438 | return sprintf("*((triehash_uu%s*) &string[$offset])", $length * 8); |
| 439 | } |
| 440 | |
| 441 | # Render the trie so that it matches the longest prefix. |
| 442 | sub print_table { |
| 443 | my ($self, $trie, $fh, $indent, $index) = @_; |
| 444 | $indent //= 0; |
| 445 | $index //= 0; |
| 446 | |
| 447 | # If we have children, try to match them. |
| 448 | if (%{$trie->{children}}) { |
| 449 | # The difference between lowercase and uppercase alphabetical characters |
| 450 | # is that they have one bit flipped. If we have alphabetical characters |
| 451 | # in the search space, and the entire search space works fine if we |
| 452 | # always turn on the flip, just OR the character we are switching over |
| 453 | # with the bit. |
| 454 | my $want_use_bit = 0; |
| 455 | my $can_use_bit = 1; |
| 456 | my $key_length = 0; |
| 457 | foreach my $key (sort keys %{$trie->{children}}) { |
| 458 | $can_use_bit &= not main::ambiguous($key); |
| 459 | $want_use_bit |= ($key =~ /^[a-zA-Z]+$/); |
| 460 | $key_length = length($key); |
| 461 | } |
| 462 | |
| 463 | if ($ignore_case && $can_use_bit && $want_use_bit) { |
| 464 | printf { $fh } ((' ' x $indent) . "switch(%s | 0x%s) {\n", $self->switch_key($index, $key_length), '20' x $key_length); |
| 465 | } else { |
| 466 | printf { $fh } ((' ' x $indent) . "switch(%s) {\n", $self->switch_key($index, $key_length)); |
| 467 | } |
| 468 | |
| 469 | my $notfirst = 0; |
| 470 | foreach my $key (sort keys %{$trie->{children}}) { |
| 471 | if ($notfirst) { |
| 472 | printf { $fh } (' ' x $indent . " break;\n"); |
| 473 | } |
| 474 | if ($ignore_case) { |
| 475 | printf { $fh } (' ' x $indent . "case %s:\n", $self->case_label(lc($key))); |
| 476 | printf { $fh } (' ' x $indent . "case %s:\n", $self->case_label(uc($key))) if lc($key) ne uc($key) && !($can_use_bit && $want_use_bit); |
| 477 | } else { |
| 478 | printf { $fh } (' ' x $indent . "case %s:\n", $self->case_label($key)); |
| 479 | } |
| 480 | |
| 481 | $self->print_table($trie->{children}{$key}, $fh, $indent + 1, $index + length($key)); |
| 482 | |
| 483 | $notfirst=1; |
| 484 | } |
| 485 | |
| 486 | printf { $fh } (' ' x $indent . "}\n"); |
| 487 | } |
| 488 | |
| 489 | |
| 490 | # This node has a value, so it is a possible end point. If no children |
| 491 | # matched, we have found our longest prefix. |
| 492 | if (defined $trie->{value}) { |
| 493 | printf { $fh } (' ' x $indent . "return %s;\n", ($enum_class ? "${enum_name}::" : '').$trie->{label}); |
| 494 | } |
| 495 | |
| 496 | } |
| 497 | |
| 498 | sub print_words { |
| 499 | my ($self, $trie, $fh, $indent, $sofar) = @_; |
| 500 | |
| 501 | $indent //= 0; |
| 502 | $sofar //= ''; |
| 503 | |
| 504 | |
| 505 | printf { $fh } (' ' x $indent."%s = %s,\n", $trie->{label}, $trie->{value}) if defined $trie->{value}; |
| 506 | |
| 507 | foreach my $key (sort keys %{$trie->{children}}) { |
| 508 | $self->print_words($trie->{children}{$key}, $fh, $indent, $sofar . $key); |
| 509 | } |
| 510 | } |
| 511 | |
| 512 | sub print_functions { |
| 513 | my ($self, $trie, %lengths) = @_; |
| 514 | foreach my $local_length (sort { $a <=> $b } (keys %lengths)) { |
| 515 | print { $code } ("static enum ${enum_name} ${function_name}${local_length}(const char *string)\n"); |
| 516 | print { $code } ("{\n"); |
| 517 | $self->print_table($trie->filter_depth($local_length)->rebuild_tree(), $code, 1); |
| 518 | printf { $code } (" return %s$unknown_label;\n", ($enum_class ? "${enum_name}::" : '')); |
| 519 | print { $code } ("}\n"); |
| 520 | } |
| 521 | } |
| 522 | |
| 523 | sub main { |
| 524 | my ($self, $trie, $num_values, %lengths) = @_; |
| 525 | print { $header } ("#ifndef TRIE_HASH_${function_name}\n"); |
| 526 | print { $header } ("#define TRIE_HASH_${function_name}\n"); |
| 527 | print { $header } ("#include <stddef.h>\n"); |
| 528 | print { $header } ("#include <stdint.h>\n"); |
| 529 | foreach my $include (@includes) { |
| 530 | print { $header } ("#include $include\n"); |
| 531 | } |
| 532 | printf { $header } ("enum { $counter_name = $num_values };\n") if (defined($counter_name)); |
| 533 | print { $header } ("${enum_specifier} ${enum_name} {\n"); |
| 534 | $self->print_words($trie, $header, 1); |
| 535 | printf { $header } (" $unknown_label = $unknown,\n"); |
| 536 | print { $header } ("};\n"); |
| 537 | print { $header } ("${static}enum ${enum_name} ${function_name}(const char *string, size_t length);\n"); |
| 538 | |
| 539 | print { $code } ("#include \"$header_name\"\n") if ($header_name ne $code_name); |
| 540 | |
| 541 | if ($multi_byte) { |
| 542 | print { $code } ("#ifdef __GNUC__\n"); |
| 543 | foreach my $i ((16, 32, 64)) { |
| 544 | print { $code } ("typedef uint${i}_t __attribute__((aligned (1))) triehash_uu${i};\n"); |
| 545 | print { $code } ("typedef char static_assert${i}[__alignof__(triehash_uu${i}) == 1 ? 1 : -1];\n"); |
| 546 | } |
| 547 | |
| 548 | print { $code } ("#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__\n"); |
| 549 | print { $code } ("#define onechar(c, s, l) (((uint64_t)(c)) << (s))\n"); |
| 550 | print { $code } ("#else\n"); |
| 551 | print { $code } ("#define onechar(c, s, l) (((uint64_t)(c)) << (l-8-s))\n"); |
| 552 | print { $code } ("#endif\n"); |
| 553 | print { $code } ("#if (!defined(__ARM_ARCH) || defined(__ARM_FEATURE_UNALIGNED)) && !defined(TRIE_HASH_NO_MULTI_BYTE)\n"); |
| 554 | print { $code } ("#define TRIE_HASH_MULTI_BYTE\n"); |
| 555 | print { $code } ("#endif\n"); |
| 556 | print { $code } ("#endif /*GNUC */\n"); |
| 557 | |
| 558 | print { $code } ("#ifdef TRIE_HASH_MULTI_BYTE\n"); |
| 559 | $self->print_functions($trie, %lengths); |
| 560 | $multi_byte = 0; |
| 561 | print { $code } ("#else\n"); |
| 562 | $self->print_functions($trie, %lengths); |
| 563 | print { $code } ("#endif /* TRIE_HASH_MULTI_BYTE */\n"); |
| 564 | } else { |
| 565 | $self->print_functions($trie, %lengths); |
| 566 | } |
| 567 | |
| 568 | print { $code } ("${static}enum ${enum_name} ${function_name}(const char *string, size_t length)\n"); |
| 569 | print { $code } ("{\n"); |
| 570 | print { $code } (" switch (length) {\n"); |
| 571 | foreach my $local_length (sort { $a <=> $b } (keys %lengths)) { |
| 572 | print { $code } (" case $local_length:\n"); |
| 573 | print { $code } (" return ${function_name}${local_length}(string);\n"); |
| 574 | } |
| 575 | print { $code } (" default:\n"); |
| 576 | printf { $code } (" return %s$unknown_label;\n", ($enum_class ? "${enum_name}::" : '')); |
| 577 | print { $code } (" }\n"); |
| 578 | print { $code } ("}\n"); |
| 579 | |
| 580 | # Print end of header here, in case header and code point to the same file |
| 581 | print { $header } ("#endif /* TRIE_HASH_${function_name} */\n"); |
| 582 | } |
| 583 | } |
| 584 | |
| 585 | # A character is ambiguous if the 1<<5 (0x20) bit does not correspond to the |
| 586 | # lower case bit. A word is ambiguous if any character is. This definition is |
| 587 | # used to check if we can perform the |0x20 optimization when building a case- |
| 588 | # insensitive trie. |
| 589 | sub ambiguous { |
| 590 | my $word = shift; |
| 591 | |
| 592 | foreach my $char (split //, $word) { |
| 593 | # If 0x20 does not solely indicate lowercase, it is ambiguous |
| 594 | return 1 if ord(lc($char)) != (ord($char) | 0x20); |
| 595 | return 1 if ord(uc($char)) != (ord($char) & ~0x20); |
| 596 | } |
| 597 | |
| 598 | return 0; |
| 599 | } |
| 600 | |
| 601 | sub build_trie { |
| 602 | my $codegen = shift; |
| 603 | my $trie = Trie->new; |
| 604 | |
| 605 | my $counter = $counter_start; |
| 606 | my $prev_value; |
| 607 | my %lengths; |
| 608 | |
| 609 | open(my $input, '<', $ARGV[0]) or die "Cannot open $ARGV[0]: $!"; |
| 610 | while (my $line = <$input>) { |
| 611 | my ($label, $word, $value) = $line =~ m{ |
| 612 | (?:\s*([^~\s]+)\s*~)? # Label ~ |
| 613 | (?:\s*([^~=\s]+))? # Word |
| 614 | (?:\s*=\s*([^\s]+)\s+)? # = Value |
| 615 | \s* |
| 616 | }x; |
| 617 | |
| 618 | if (defined $word) { |
| 619 | $label //= $codegen->word_to_label($word); |
| 620 | $value //= defined $prev_value ? $prev_value + 1 : 0; |
| 621 | |
| 622 | $trie->insert($word, $label, $value); |
| 623 | $lengths{length($word)} = 1; |
| 624 | } elsif (defined $value) { |
| 625 | $unknown = $value; |
| 626 | $unknown_label = $codegen->mangle_label($label) if defined $label; |
| 627 | } else { |
| 628 | die "Invalid line: $line"; |
| 629 | } |
| 630 | |
| 631 | $prev_value = $value; |
| 632 | $counter = $value + 1 if $value >= $counter; |
| 633 | } |
| 634 | |
| 635 | $unknown_label //= $codegen->mangle_label('Unknown'); |
| 636 | |
| 637 | return ($trie, $counter, %lengths); |
| 638 | } |
| 639 | |
| 640 | # Generates an ASCII art tree |
| 641 | package TreeCodeGen { |
| 642 | |
| 643 | sub new { |
| 644 | my $class = shift; |
| 645 | my $self = {}; |
| 646 | bless $self, $class; |
| 647 | |
| 648 | return $self; |
| 649 | } |
| 650 | |
| 651 | sub mangle_label { |
| 652 | my ($self, $label) = @_; |
| 653 | return $label; |
| 654 | } |
| 655 | |
| 656 | sub word_to_label { |
| 657 | my ($self, $word) = @_; |
| 658 | return $word; |
| 659 | } |
| 660 | |
| 661 | sub main { |
| 662 | my ($self, $trie, $counter, %lengths) = @_; |
| 663 | printf { $code } ("┌────────────────────────────────────────────────────┐\n"); |
| 664 | printf { $code } ("│ Initial trie │\n"); |
| 665 | printf { $code } ("└────────────────────────────────────────────────────┘\n"); |
| 666 | $self->print($trie); |
| 667 | printf { $code } ("┌────────────────────────────────────────────────────┐\n"); |
| 668 | printf { $code } ("│ Rebuilt trie │\n"); |
| 669 | printf { $code } ("└────────────────────────────────────────────────────┘\n"); |
| 670 | $self->print($trie->rebuild_tree()); |
| 671 | |
| 672 | foreach my $local_length (sort { $a <=> $b } (keys %lengths)) { |
| 673 | printf { $code } ("┌────────────────────────────────────────────────────┐\n"); |
| 674 | printf { $code } ("│ Trie for words of length %-4d │\n", $local_length); |
| 675 | printf { $code } ("└────────────────────────────────────────────────────┘\n"); |
| 676 | $self->print($trie->filter_depth($local_length)->rebuild_tree()); |
| 677 | } |
| 678 | } |
| 679 | |
| 680 | sub open_output { |
| 681 | my $self = shift; |
| 682 | if ($code_name ne '-') { |
| 683 | open($code, '>:encoding(utf8)', $code_name) or die "Cannot open $ARGV[0]: $!" ; |
| 684 | } else { |
| 685 | $code = *STDOUT; |
| 686 | binmode($code, ':encoding(utf8)'); |
| 687 | } |
| 688 | } |
| 689 | |
| 690 | # Print a trie |
| 691 | sub print { |
| 692 | my ($self, $trie, $depth) = @_; |
| 693 | $depth //= 0; |
| 694 | |
| 695 | print { $code } (' → ') if defined($trie->{label}); |
| 696 | print { $code } ($trie->{label} // '', "\n"); |
| 697 | foreach my $key (sort keys %{$trie->{children}}) { |
| 698 | print { $code } ('│ ' x ($depth), "├── $key"); |
| 699 | $self->print($trie->{children}{$key}, $depth + 1); |
| 700 | } |
| 701 | } |
| 702 | } |
| 703 | |
| 704 | my %codegens = ( |
| 705 | C => 'CCodeGen', |
| 706 | tree => 'TreeCodeGen', |
| 707 | ); |
| 708 | |
| 709 | |
| 710 | defined($codegens{$language}) or die "Unknown language $language. Valid choices: ", join(', ', keys %codegens); |
| 711 | my $codegen = $codegens{$language}->new(); |
| 712 | my ($trie, $counter, %lengths) = build_trie($codegen); |
| 713 | |
| 714 | $codegen->open_output(); |
| 715 | $codegen->main($trie, $counter, %lengths); |
| 716 | |
| 717 | |
| 718 | =head1 LICENSE |
| 719 | |
| 720 | triehash is available under the MIT/Expat license, see the source code |
| 721 | for more information. |
| 722 | |
| 723 | =head1 AUTHOR |
| 724 | |
| 725 | Julian Andres Klode <jak@jak-linux.org> |
| 726 | |
| 727 | =cut |
| 728 | |