feat: implement dense-child optimized trie lookup and enterprise test/CI profile hardening
This commit is contained in:
12
.github/workflows/build.yml
vendored
12
.github/workflows/build.yml
vendored
@@ -51,7 +51,7 @@ jobs:
|
||||
test -f gradle/verification-metadata.xml
|
||||
|
||||
- name: Execute build, tests, PMD, coverage, Javadoc, distribution packaging, and SBOM generation
|
||||
run: ./gradlew --no-daemon clean build pmdMain javadoc jacocoTestReport distZip cyclonedxBom
|
||||
run: ./gradlew --no-daemon clean ciRelease distZip pmdMain javadoc jacocoCiReleaseReport cyclonedxBom
|
||||
|
||||
- name: Upload SBOM
|
||||
if: always()
|
||||
@@ -70,8 +70,8 @@ jobs:
|
||||
with:
|
||||
name: test-reports
|
||||
path: |
|
||||
build/reports/tests/test
|
||||
build/test-results/test
|
||||
build/reports/tests
|
||||
build/test-results
|
||||
if-no-files-found: warn
|
||||
retention-days: 14
|
||||
|
||||
@@ -90,8 +90,8 @@ jobs:
|
||||
with:
|
||||
name: coverage-reports
|
||||
path: |
|
||||
build/reports/jacoco/test/html
|
||||
build/reports/jacoco/test/jacocoTestReport.xml
|
||||
build/reports/jacoco/jacocoCiReleaseReport/html
|
||||
build/reports/jacoco/jacocoCiReleaseReport/jacocoCiReleaseReport.xml
|
||||
if-no-files-found: warn
|
||||
retention-days: 14
|
||||
|
||||
@@ -160,7 +160,7 @@ jobs:
|
||||
env:
|
||||
SIGNING_KEY: ${{ secrets.SIGNING_KEY }}
|
||||
SIGNING_PASSWORD: ${{ secrets.SIGNING_PASSWORD }}
|
||||
run: ./gradlew --no-daemon clean build pmdMain javadoc jacocoTestReport cyclonedxBom centralBundle
|
||||
run: ./gradlew --no-daemon clean ciRelease distZip pmdMain javadoc jacocoCiReleaseReport cyclonedxBom centralBundle
|
||||
|
||||
- name: Generate release changelog
|
||||
shell: bash
|
||||
|
||||
19
.github/workflows/pages.yml
vendored
19
.github/workflows/pages.yml
vendored
@@ -70,7 +70,7 @@ jobs:
|
||||
test -f gradle/verification-metadata.xml
|
||||
|
||||
- name: Build reports for publication
|
||||
run: ./gradlew --no-daemon clean build pmdMain javadoc jacocoTestReport pitest jmh cyclonedxBom
|
||||
run: ./gradlew --no-daemon clean ciRelease pmdMain javadoc jacocoCiReleaseReport pitest jmh cyclonedxBom
|
||||
|
||||
- name: Prepare gh-pages worktree
|
||||
shell: bash
|
||||
@@ -93,6 +93,9 @@ jobs:
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
TEST_REPORT_DIR="build/reports/tests/ciRelease"
|
||||
JACOCO_REPORT_DIR="build/reports/jacoco/jacocoCiReleaseReport"
|
||||
|
||||
SITE_DIR=".gh-pages"
|
||||
RUN_DIR="${SITE_DIR}/builds/${GITHUB_RUN_NUMBER}"
|
||||
RUN_METRICS_DIR="${RUN_DIR}/metrics"
|
||||
@@ -106,14 +109,14 @@ jobs:
|
||||
cp -R build/docs/javadoc "${RUN_DIR}/javadoc"
|
||||
cp -R build/docs/javadoc "${LATEST_DIR}/javadoc"
|
||||
|
||||
cp -R build/reports/tests/test "${RUN_DIR}/test"
|
||||
cp -R build/reports/tests/test "${LATEST_DIR}/test"
|
||||
cp -R "${TEST_REPORT_DIR}" "${RUN_DIR}/test"
|
||||
cp -R "${TEST_REPORT_DIR}" "${LATEST_DIR}/test"
|
||||
|
||||
cp -R build/reports/pmd "${RUN_DIR}/pmd"
|
||||
cp -R build/reports/pmd "${LATEST_DIR}/pmd"
|
||||
|
||||
cp -R build/reports/jacoco/test/html "${RUN_DIR}/coverage"
|
||||
cp -R build/reports/jacoco/test/html "${LATEST_DIR}/coverage"
|
||||
cp -R "${JACOCO_REPORT_DIR}/html" "${RUN_DIR}/coverage"
|
||||
cp -R "${JACOCO_REPORT_DIR}/html" "${LATEST_DIR}/coverage"
|
||||
|
||||
cp -R build/reports/pitest "${RUN_DIR}/pitest"
|
||||
cp -R build/reports/pitest "${LATEST_DIR}/pitest"
|
||||
@@ -178,7 +181,7 @@ jobs:
|
||||
|
||||
python3 \
|
||||
./tools/generate-pages-badges.py \
|
||||
--jacoco-xml build/reports/jacoco/test/jacocoTestReport.xml \
|
||||
--jacoco-xml "${JACOCO_REPORT_DIR}/jacocoCiReleaseReport.xml" \
|
||||
--pit-xml build/reports/pitest/mutations.xml \
|
||||
--jmh-csv build/reports/jmh/jmh-results.csv \
|
||||
--run-metrics-dir "${RUN_METRICS_DIR}" \
|
||||
@@ -228,7 +231,7 @@ jobs:
|
||||
<p class="meta">Build ${GITHUB_RUN_NUMBER} from commit ${GITHUB_SHA}</p>
|
||||
<ul>
|
||||
<li><a href="./javadoc/">Javadoc</a></li>
|
||||
<li><a href="./test/">Test Report</a></li>
|
||||
<li><a href="./test/">Release Verification Test Report (ciRelease)</a></li>
|
||||
<li><a href="./pmd/main.html">PMD Report</a></li>
|
||||
<li><a href="./coverage/">Coverage Report</a></li>
|
||||
${DEPENDENCY_CHECK_LINK:-<li>Dependency Vulnerability Report: not available</li>}
|
||||
@@ -260,7 +263,7 @@ jobs:
|
||||
|
||||
- [Latest build summary](https://leogalambos.github.io/Radixor/builds/latest/)
|
||||
- [Javadoc](https://leogalambos.github.io/Radixor/builds/latest/javadoc/)
|
||||
- [Unit test report](https://leogalambos.github.io/Radixor/builds/latest/test/)
|
||||
- [Release verification test report (ciRelease)](https://leogalambos.github.io/Radixor/builds/latest/test/)
|
||||
- [PMD report](https://leogalambos.github.io/Radixor/builds/latest/pmd/main.html)
|
||||
- [JaCoCo coverage report](https://leogalambos.github.io/Radixor/builds/latest/coverage/)
|
||||
- [PIT mutation testing report](https://leogalambos.github.io/Radixor/builds/latest/pitest/)
|
||||
|
||||
@@ -167,6 +167,9 @@ The repository keeps the front page concise and places detailed documentation un
|
||||
- [Architecture](docs/architecture.md)
|
||||
Structural model, data flow, and runtime lookup behavior.
|
||||
|
||||
- [Lookup Edge Optimization](docs/lookup-edge-optimization.md)
|
||||
Speed/memory trade-off of dense child edge lookup in compiled tries.
|
||||
|
||||
- [Reduction Semantics](docs/reduction-semantics.md)
|
||||
Ranked, unordered, and dominant reduction behavior.
|
||||
|
||||
|
||||
157
build.gradle
157
build.gradle
@@ -108,9 +108,19 @@ dependencyCheck {
|
||||
}
|
||||
}
|
||||
|
||||
tasks.withType(Test).configureEach {
|
||||
useJUnitPlatform()
|
||||
def cliIncludeTags = project.findProperty('includeTags')?.toString() ?: System.getProperty('includeTags')
|
||||
def cliExcludeTags = project.findProperty('excludeTags')?.toString() ?: System.getProperty('excludeTags')
|
||||
|
||||
def splitTagExpression = { String tagsExpr ->
|
||||
if (tagsExpr == null || tagsExpr.isBlank()) {
|
||||
return []
|
||||
}
|
||||
return tagsExpr.split(',')
|
||||
.collect { it.trim() }
|
||||
.findAll { it != null && !it.isBlank() }
|
||||
}
|
||||
|
||||
tasks.withType(Test).configureEach {
|
||||
doFirst {
|
||||
jvmArgs "-javaagent:${configurations.mockitoAgent.singleFile}"
|
||||
}
|
||||
@@ -123,12 +133,125 @@ tasks.withType(Test).configureEach {
|
||||
minHeapSize = '1g'
|
||||
maxHeapSize = '4g'
|
||||
|
||||
reports {
|
||||
junitXml.required = true
|
||||
html.required = true
|
||||
}
|
||||
}
|
||||
|
||||
def configureJUnitPlatformTags = { Test task, String includeTagsExpr, String excludeTagsExpr ->
|
||||
task.useJUnitPlatform {
|
||||
final def includes = splitTagExpression(includeTagsExpr)
|
||||
final def excludes = splitTagExpression(excludeTagsExpr)
|
||||
|
||||
if (!includes.isEmpty()) {
|
||||
includeTags(*includes.toArray(new String[0]))
|
||||
}
|
||||
if (!excludes.isEmpty()) {
|
||||
excludeTags(*excludes.toArray(new String[0]))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tasks.named('test', Test) {
|
||||
configureJUnitPlatformTags(it, cliIncludeTags, cliExcludeTags)
|
||||
finalizedBy(tasks.named('jacocoTestReport'))
|
||||
}
|
||||
|
||||
def configureTaggedTestProfile = { String taskName, String includeTagsExpr, String excludeTagsExpr = null,
|
||||
String taskDescription = null, String testNameExcludePatterns = null ->
|
||||
tasks.register(taskName, Test) {
|
||||
group = 'verification'
|
||||
description = taskDescription
|
||||
|
||||
configureJUnitPlatformTags(delegate as Test, includeTagsExpr, excludeTagsExpr)
|
||||
testClassesDirs = sourceSets.test.output.classesDirs
|
||||
classpath = sourceSets.test.runtimeClasspath
|
||||
dependsOn(tasks.named('compileTestJava'))
|
||||
|
||||
doFirst {
|
||||
jvmArgs "-javaagent:${configurations.mockitoAgent.singleFile}"
|
||||
}
|
||||
|
||||
if (testNameExcludePatterns != null && !testNameExcludePatterns.isBlank()) {
|
||||
filter {
|
||||
testNameExcludePatterns.split(',').each { String pattern ->
|
||||
final def trimmedPattern = pattern.trim()
|
||||
if (!trimmedPattern.isEmpty()) {
|
||||
excludeTestsMatching(trimmedPattern)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
minHeapSize = '1g'
|
||||
maxHeapSize = '4g'
|
||||
|
||||
reports {
|
||||
junitXml.required = true
|
||||
html.required = true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
configureTaggedTestProfile(
|
||||
'ciSmoke',
|
||||
'unit',
|
||||
'slow',
|
||||
'Fast feedback profile for unit tests with slow tests explicitly excluded.',
|
||||
'org.egothor.stemmer.CompileIntegrationTest*'
|
||||
)
|
||||
|
||||
configureTaggedTestProfile(
|
||||
'ciCore',
|
||||
'unit,trie,frequency-trie,property',
|
||||
null,
|
||||
'Focused profile for core trie behavior and trie-specific property checks.'
|
||||
)
|
||||
|
||||
configureTaggedTestProfile(
|
||||
'ciIntegration',
|
||||
'integration',
|
||||
'slow',
|
||||
'Integration pipeline profile (loader/parser/CLI/IO end-to-end flows) excluding slow integration paths.'
|
||||
)
|
||||
|
||||
configureTaggedTestProfile(
|
||||
'ciSlow',
|
||||
'slow',
|
||||
null,
|
||||
'Targeted profile for all slow tests (large dictionaries, long-running corpus validation, and heavy integration checks).'
|
||||
)
|
||||
|
||||
configureTaggedTestProfile(
|
||||
'ciCompat',
|
||||
'compat,regression',
|
||||
null,
|
||||
'Compatibility profile guarding persisted artifact and compatibility regressions.'
|
||||
)
|
||||
|
||||
configureTaggedTestProfile(
|
||||
'ciRelease',
|
||||
null,
|
||||
'slow',
|
||||
'Release-profile validation of all non-slow tests.',
|
||||
'org.egothor.stemmer.CompileIntegrationTest*,org.egothor.stemmer.StemmerPatchTrieLoaderTest$BundledDictionaryTests*'
|
||||
)
|
||||
|
||||
configureTaggedTestProfile(
|
||||
'ciNightly',
|
||||
'fuzz',
|
||||
null,
|
||||
'Nightly robustness profile with fuzz testing emphasis.'
|
||||
)
|
||||
|
||||
tasks.register('ci') {
|
||||
group = 'verification'
|
||||
description = 'Runs the full enterprise CI profile set in sequence.'
|
||||
dependsOn(tasks.named('ciSmoke'))
|
||||
dependsOn(tasks.named('ciCore'))
|
||||
dependsOn(tasks.named('ciIntegration'))
|
||||
dependsOn(tasks.named('ciCompat'))
|
||||
}
|
||||
|
||||
tasks.withType(Pmd).configureEach {
|
||||
@@ -155,6 +278,36 @@ tasks.named('jacocoTestReport', JacocoReport) {
|
||||
}
|
||||
}
|
||||
|
||||
def registerJacocoProfileReport = { String reportTaskName, String sourceTaskName ->
|
||||
tasks.register(reportTaskName, JacocoReport) {
|
||||
group = 'verification'
|
||||
description = "Generates Jacoco report for ${sourceTaskName} execution."
|
||||
|
||||
dependsOn(tasks.named(sourceTaskName))
|
||||
|
||||
classDirectories.setFrom(
|
||||
files(sourceSets.main.output).asFileTree.matching {
|
||||
exclude 'org/egothor/stemmer/StemmerKnowledgeExperiment*'
|
||||
exclude 'org/egothor/stemmer/DiacriticStripper*'
|
||||
}
|
||||
)
|
||||
|
||||
executionData.setFrom(
|
||||
fileTree(layout.buildDirectory.dir('jacoco')) {
|
||||
include "${sourceTaskName}.exec"
|
||||
}
|
||||
)
|
||||
|
||||
reports {
|
||||
xml.required = true
|
||||
csv.required = false
|
||||
html.required = true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
registerJacocoProfileReport('jacocoCiReleaseReport', 'ciRelease')
|
||||
|
||||
tasks.named('check') {
|
||||
dependsOn(tasks.named('jacocoTestReport'))
|
||||
// no-default, only on-demand: dependsOn(tasks.named('dependencyCheckAnalyze'))
|
||||
|
||||
193
docs/lookup-edge-optimization.md
Normal file
193
docs/lookup-edge-optimization.md
Normal file
@@ -0,0 +1,193 @@
|
||||
# Lookup Edge Optimization
|
||||
|
||||
Compiled trie nodes (`CompiledNode`) use three lookup strategies when resolving child edges:
|
||||
|
||||
1. dense array direct lookup,
|
||||
2. linear scan for very small child counts,
|
||||
3. binary search over sorted edge labels.
|
||||
|
||||
This page explains the dense path, what `maxExpandedIndex` controls, and how to tune it.
|
||||
|
||||
## Runtime model of one node
|
||||
|
||||
For a node with sorted edge labels `char[] edges`, the implementation can materialize an
|
||||
index-aligned dense table when labels occupy a small compact code-point interval:
|
||||
|
||||
```text
|
||||
span = maxEdge - minEdge
|
||||
use dense table iff (span <= maxExpandedIndex) and (maxExpandedIndex > 0)
|
||||
```
|
||||
|
||||
When dense lookup is used, lookup is constant-time indexing:
|
||||
|
||||
```text
|
||||
denseIndex = requestedEdge - minEdge
|
||||
return denseChildren[denseIndex] // or null if outside interval
|
||||
```
|
||||
|
||||
When dense lookup is not active (interval is too wide or the configured
|
||||
`maxExpandedIndex` is `0`), `CompiledNode` still chooses between two fallback
|
||||
strategies:
|
||||
|
||||
- **linear scan** for very small child counts (`4` or fewer children),
|
||||
- **binary search** for larger child counts.
|
||||
|
||||
This means the fallback method is selected by child count, not by “distance” alone.
|
||||
`linear scan` is therefore used when there are only a few edges even if those edges are
|
||||
spread across very distant code points.
|
||||
|
||||
### Example: few edges, wide Unicode span
|
||||
|
||||
```text
|
||||
edges = ['a', '中', '你']
|
||||
edge count = 3
|
||||
minEdge = 'a' (U+0061)
|
||||
maxEdge = '你' (U+4F60)
|
||||
span = 20223
|
||||
```
|
||||
|
||||
- If `maxExpandedIndex = 512`, dense indexing is not used because `span > maxExpandedIndex`.
|
||||
- Because `edge count = 3` (<= 4), lookup falls back to a tiny linear scan of the
|
||||
three labels.
|
||||
- This is exactly the case where the small-child-count threshold (`4` or fewer children) helps: lookup stays cheap even though the interval is too wide for a dense table.
|
||||
|
||||
This is useful for non-Latin scripts as well: what matters is interval width in Unicode
|
||||
code points, not script name. A compact Arabic-range block can still benefit from dense
|
||||
lookups when keys stay in a tight code-point interval.
|
||||
|
||||
## Why this is configurable
|
||||
|
||||
`maxExpandedIndex` is only a performance/memory trade-off choice:
|
||||
|
||||
- higher value:
|
||||
- more compact intervals qualify for dense tables,
|
||||
- more constant-time child lookup,
|
||||
- more memory for dense tables in qualifying nodes.
|
||||
- lower value (or `0`):
|
||||
- less dense-table allocation,
|
||||
- fewer nodes take the constant-time path,
|
||||
- lower materialization memory.
|
||||
|
||||
The value never changes lookup semantics. It only changes the in-memory structure shape.
|
||||
|
||||
## Persistence and loading model
|
||||
|
||||
This threshold is **not** stored in `TrieMetadata`.
|
||||
|
||||
- The binary format stores only trie payload and semantic metadata (`reduction`, `traversal`,
|
||||
case/diacritic settings, and stream version).
|
||||
- `maxExpandedIndex` is chosen when materializing nodes in memory.
|
||||
- You can therefore keep one persisted artifact and load it with different in-memory
|
||||
trade-offs depending on deployment constraints.
|
||||
|
||||
## Default
|
||||
|
||||
- `FrequencyTrie.DEFAULT_MAX_EXPANDED_INDEX == 512`
|
||||
- `CompiledNode.DEFAULT_MAX_EXPANDED_INDEX == 512`
|
||||
|
||||
These are practical defaults for mixed-language text and Latin-like scripts where edge labels
|
||||
often cluster.
|
||||
|
||||
## Tune during build (writable phase)
|
||||
|
||||
Use the full `FrequencyTrie.Builder` constructor when you are compiling from source data.
|
||||
The builder threshold is applied while freezing reduced nodes into the immutable form.
|
||||
|
||||
```java
|
||||
import org.egothor.stemmer.CaseProcessingMode;
|
||||
import org.egothor.stemmer.DiacriticProcessingMode;
|
||||
import org.egothor.stemmer.FrequencyTrie;
|
||||
import org.egothor.stemmer.ReductionMode;
|
||||
import org.egothor.stemmer.ReductionSettings;
|
||||
import org.egothor.stemmer.WordTraversalDirection;
|
||||
|
||||
final ReductionSettings settings = ReductionSettings.withDefaults(
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
|
||||
|
||||
final FrequencyTrie.Builder<String> fastBuilder =
|
||||
new FrequencyTrie.Builder<>(String[]::new,
|
||||
settings,
|
||||
WordTraversalDirection.BACKWARD,
|
||||
CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT,
|
||||
DiacriticProcessingMode.AS_IS,
|
||||
1024); // prefer lookup speed
|
||||
|
||||
// ... put(...) ...
|
||||
final FrequencyTrie<String> trie = fastBuilder.build();
|
||||
```
|
||||
|
||||
Use a lower value such as `256`, or `0` to disable dense tables entirely, to reduce memory when building large tries.
|
||||
|
||||
```java
|
||||
final FrequencyTrie.Builder<String> compactBuilder =
|
||||
new FrequencyTrie.Builder<>(String[]::new,
|
||||
settings,
|
||||
WordTraversalDirection.BACKWARD,
|
||||
CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT,
|
||||
DiacriticProcessingMode.AS_IS,
|
||||
256); // lower memory profile
|
||||
```
|
||||
|
||||
## Tune when loading a binary artifact (runtime phase)
|
||||
|
||||
At artifact load time, you can tune the same trade-off independently of persisted metadata.
|
||||
|
||||
```java
|
||||
import java.nio.file.Path;
|
||||
|
||||
import org.egothor.stemmer.StemmerPatchTrieLoader;
|
||||
|
||||
var defaultLookup = StemmerPatchTrieLoader.loadBinary(
|
||||
Path.of("stemmers", "english.radixor.gz"));
|
||||
|
||||
var fastLookup = StemmerPatchTrieLoader.loadBinary(
|
||||
Path.of("stemmers", "english.radixor.gz"), 1024);
|
||||
|
||||
var compactLookup = StemmerPatchTrieLoader.loadBinary(
|
||||
Path.of("stemmers", "english.radixor.gz"), 0);
|
||||
```
|
||||
|
||||
You can also set the threshold directly with `FrequencyTrie.readFrom(...)` when reading streams:
|
||||
|
||||
```java
|
||||
import java.io.DataInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
|
||||
import org.egothor.stemmer.FrequencyTrie;
|
||||
|
||||
public final class StreamLoadExample {
|
||||
|
||||
private StreamLoadExample() {
|
||||
throw new AssertionError("No instances.");
|
||||
}
|
||||
|
||||
public static void main(final String[] arguments) throws IOException {
|
||||
try (InputStream fileInput = Files.newInputStream(Path.of("stemmers", "english.radixor.gz"));
|
||||
GZIPInputStream gzip = new GZIPInputStream(fileInput);
|
||||
DataInputStream dataInput = new DataInputStream(gzip)) {
|
||||
final FrequencyTrie<String> compactOnLoad = FrequencyTrie.readFrom(
|
||||
dataInput,
|
||||
String[]::new,
|
||||
input -> input.readUTF(),
|
||||
256);
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Note: the string codec is intentionally inline in this snippet to keep it self-contained.
|
||||
|
||||
## Practical guidance
|
||||
|
||||
- Start with default (`512`) in production and profile before changing it.
|
||||
- Use `0` when memory is the priority and query throughput is not the bottleneck.
|
||||
- Use values around `1024` for workloads dominated by compact alphabets and very hot lookups.
|
||||
|
||||
Trade-off expectation:
|
||||
|
||||
- increasing `maxExpandedIndex` improves lookup speed when edges tend to occupy short spans,
|
||||
- decreasing it reduces per-node auxiliary memory in dense-span nodes.
|
||||
@@ -87,6 +87,43 @@ public final class LoadBinaryExample {
|
||||
|
||||
The binary format is the native `FrequencyTrie` serialization wrapped in GZip compression. It includes persisted `TrieMetadata`, so lookup after loading uses the traversal, case-processing, diacritic-processing, and reduction settings captured when the trie was compiled.
|
||||
|
||||
## Tune child lookup density when loading binaries
|
||||
|
||||
To optimize hot-path latency, you can tune direct child indexing by passing `maxExpandedIndex`
|
||||
at load time. This does not change persisted metadata, only the materialized in-memory form.
|
||||
|
||||
```java
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
|
||||
import org.egothor.stemmer.FrequencyTrie;
|
||||
import org.egothor.stemmer.StemmerPatchTrieLoader;
|
||||
|
||||
public final class LoadBinaryWithDenseLookupExample {
|
||||
|
||||
private LoadBinaryWithDenseLookupExample() {
|
||||
throw new AssertionError("No instances.");
|
||||
}
|
||||
|
||||
public static void main(final String[] arguments) throws IOException {
|
||||
final FrequencyTrie<String> balanced = StemmerPatchTrieLoader.loadBinary(
|
||||
Path.of("stemmers", "english.radixor.gz"));
|
||||
|
||||
final FrequencyTrie<String> fast = StemmerPatchTrieLoader.loadBinary(
|
||||
Path.of("stemmers", "english.radixor.gz"),
|
||||
1024);
|
||||
|
||||
final FrequencyTrie<String> compact = StemmerPatchTrieLoader.loadBinary(
|
||||
Path.of("stemmers", "english.radixor.gz"),
|
||||
0);
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Negative values still use `FrequencyTrie.DEFAULT_MAX_EXPANDED_INDEX`.
|
||||
|
||||
[Lookup Edge Optimization](lookup-edge-optimization.md) describes the trade-off in detail and includes examples for build-time tuning as well.
|
||||
|
||||
## Build directly with a mutable builder
|
||||
|
||||
A `FrequencyTrie.Builder<V>` accepts repeated `put(key, value)` calls and compiles the final read-only trie through `build()`. Compilation performs bottom-up reduction and produces the compact immutable runtime representation.
|
||||
|
||||
@@ -25,6 +25,7 @@ This is why Radixor can generalize beyond explicitly listed forms and why compil
|
||||
The programmatic API is easier to understand when split by developer task:
|
||||
|
||||
- [Loading and Building Stemmers](programmatic-loading-and-building.md) explains how to acquire a compiled stemmer from bundled resources, textual dictionaries, binary artifacts, or direct builder usage.
|
||||
- [Lookup Edge Optimization](lookup-edge-optimization.md) explains dense child lookup tuning and the speed/memory trade-off when materializing compiled tries.
|
||||
- [Querying and Ambiguity Handling](programmatic-querying-and-ambiguity.md) explains `get(...)`, `getAll(...)`, `getEntries(...)`, patch application, and the practical meaning of reduction modes.
|
||||
- [Extending and Persisting Compiled Tries](programmatic-extending-and-persistence.md) explains how to reopen compiled tries, add new lexical data, rebuild them, and store them as binary artifacts.
|
||||
|
||||
|
||||
@@ -58,6 +58,27 @@ A deterministic system is easier to test, easier to reason about, and safer to i
|
||||
|
||||
The project is intended to maintain very high confidence in both core correctness and behavioral stability.
|
||||
|
||||
The recommended execution strategy is defined by the tagged test profiles in [Test taxonomy and execution filtering](test-taxonomy-and-filtering.md). In practice, teams can execute profile tasks directly:
|
||||
|
||||
- `./gradlew ciSmoke`: fast local/PR safety checks (`unit`, excluding `slow`; additionally excludes
|
||||
`CompileIntegrationTest` as a defensive safeguard).
|
||||
- `./gradlew ciSlow`: enterprise heavy gate for all tests marked with `slow` (typically
|
||||
production dictionary and large corpus verification). This should be used for scheduled/manual
|
||||
hardening gates and not in the standard release build.
|
||||
- `./gradlew ciCore`: behavioral coverage of trie and frequency-trie paths (`unit` + `property` where applicable)
|
||||
- `./gradlew ciIntegration`: pipeline and CLI integration path checks
|
||||
- `./gradlew ciCompat`: compatibility and regression verification for persisted artifacts
|
||||
- `./gradlew ciRelease`: full non-slow suite for release-confidence runs (all test tags except `slow`,
|
||||
plus explicit name-based exclusion of `CompileIntegrationTest*` and
|
||||
`StemmerPatchTrieLoaderTest$BundledDictionaryTests*` as additional guardrails)
|
||||
- `./gradlew ciNightly`: extended fuzz profile for robustness hardening
|
||||
- `./gradlew ci`: umbrella profile depending on smoke/core/integration/compat
|
||||
|
||||
## Test taxonomy and execution filtering
|
||||
|
||||
The full tag taxonomy and executable filter examples are documented in
|
||||
[Test taxonomy and execution filtering](test-taxonomy-and-filtering.md).
|
||||
|
||||
### Structural coverage
|
||||
|
||||
High code coverage is treated as a useful signal, but not as a sufficient goal on its own. Coverage is valuable only when the covered scenarios actually pressure the implementation in meaningful ways.
|
||||
|
||||
@@ -67,6 +67,36 @@ public final class LoadBinaryStemmerExample {
|
||||
}
|
||||
```
|
||||
|
||||
You can tune in-memory child lookup density at load time without changing the artifact:
|
||||
|
||||
```java
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
|
||||
import org.egothor.stemmer.FrequencyTrie;
|
||||
import org.egothor.stemmer.StemmerPatchTrieLoader;
|
||||
|
||||
public final class LoadBinaryStemmerExampleTuned {
|
||||
|
||||
private LoadBinaryStemmerExampleTuned() {
|
||||
throw new AssertionError("No instances.");
|
||||
}
|
||||
|
||||
public static void main(final String[] arguments) throws IOException {
|
||||
final FrequencyTrie<String> fast = StemmerPatchTrieLoader.loadBinary(
|
||||
Path.of("stemmers", "english.radixor.gz"),
|
||||
1024);
|
||||
final FrequencyTrie<String> compact = StemmerPatchTrieLoader.loadBinary(
|
||||
Path.of("stemmers", "english.radixor.gz"),
|
||||
128);
|
||||
|
||||
System.out.println("fast=" + fast.size() + ", compact=" + compact.size());
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
For the trade-off details, see [Lookup Edge Optimization](lookup-edge-optimization.md).
|
||||
|
||||
### Build or extend a stemmer from dictionary data
|
||||
|
||||
Radixor can also build a compiled trie from a custom dictionary. Dictionary lines consist of a canonical stem followed by zero or more variants. The input may be plain UTF-8 text or GZip-compressed UTF-8 text when loaded from a filesystem path. The parser applies `CaseProcessingMode` (default: `LOWERCASE_WITH_LOCALE_ROOT`), ignores leading and trailing whitespace around columns, supports line remarks introduced by `#` or `//`, and skips dictionary items that contain embedded whitespace.
|
||||
|
||||
@@ -23,7 +23,7 @@ These reports are primarily useful when reviewing the published API surface and
|
||||
|
||||
These reports describe the outcome of core verification and static-analysis stages for the latest published build:
|
||||
|
||||
- [Unit test report](https://leogalambos.github.io/Radixor/builds/latest/test/)
|
||||
- [Release verification test report (ciRelease)](https://leogalambos.github.io/Radixor/builds/latest/test/)
|
||||
- [PMD report](https://leogalambos.github.io/Radixor/builds/latest/pmd/main.html)
|
||||
- [JaCoCo coverage report](https://leogalambos.github.io/Radixor/builds/latest/coverage/)
|
||||
- [PIT mutation testing report](https://leogalambos.github.io/Radixor/builds/latest/pitest/)
|
||||
|
||||
216
docs/test-taxonomy-and-filtering.md
Normal file
216
docs/test-taxonomy-and-filtering.md
Normal file
@@ -0,0 +1,216 @@
|
||||
# Test Tag Taxonomy and Execution Guide
|
||||
|
||||
Radixor uses JUnit tags as an explicit execution policy for its test suite.
|
||||
|
||||
The project uses three orthogonal axes:
|
||||
|
||||
1. **Scope** (how the test is executed in the pipeline)
|
||||
2. **Domain** (where in the system it belongs)
|
||||
3. **Intent** (what behavior it verifies)
|
||||
|
||||
## Canonical scope tags
|
||||
|
||||
| Tag | Description | Typical usage |
|
||||
| --- | --- | --- |
|
||||
| `unit` | Fast, deterministic tests that exercise a specific class or behavior without external processes. | Default developer feedback; should maintain near-zero flakiness and a low run time. |
|
||||
| `integration` | Tests that span multiple components or end-to-end flows of the public pipeline. | Parser/loader/CLI/IO integration checks and multi-step compile-then-load validations. |
|
||||
| `property` | Property-based tests with generator-driven coverage for invariants. | Semantics-preserving laws and edge-case exploration beyond curated fixtures. |
|
||||
| `fuzz` | Randomized stress checks with bounded runtime. | Heavier probabilistic verification of robustness and reduction invariants. |
|
||||
| `compat` | Backward/forward compatibility and reproducibility checks for persisted artifacts. | Artifact fingerprints, deterministic rebuild, and regression fixtures. |
|
||||
| `slow` | Long-running or expensive tests that should not execute in every fast gate. | Heavy fuzz/property budgets or high-duration integration checks. |
|
||||
|
||||
## Canonical domain tags
|
||||
|
||||
| Tag | Description | Typical usage |
|
||||
| --- | --- | --- |
|
||||
| `core` | Core algorithm and foundational platform behavior. | Traversal direction, base data structures, low-level helpers. |
|
||||
| `trie` | All mutable/compiled trie behaviors and traversal internals. | Lookup path selection, node shape, child representation, subtree behavior. |
|
||||
| `frequency-trie` | Algorithms and corner cases specific to frequency-aware trie logic. | Ranking, weighted reductions, persistence of weighted nodes. |
|
||||
| `stemmer` | End-user stemming pipeline semantics. | Parse-encode-apply flows and output invariants. |
|
||||
| `patch` | Patch encoding, decoding, and application semantics. | `PatchCommandEncoder` behavior and related compatibility contracts. |
|
||||
| `io` | Input/output and resource loading boundaries. | Filesystem readers, streams, and stream lifecycle handling. |
|
||||
| `serialization` | Binary persistence contract of compiled artifacts. | Versioned format reads/writes and checksum/consistency checks. |
|
||||
| `parser` | Dictionary and metadata parsing concerns. | Dictionary input parsing and malformed-source rejection. |
|
||||
| `cli` | Command-line entrypoint and command orchestration behavior. | Compile CLI integration and CLI argument validation. |
|
||||
| `metadata` | Trie metadata semantics, compatibility fields, and schema expectations. | Version flags, structural properties, and metadata round-trips. |
|
||||
| `compile` | Compile-time pipeline and build-oriented behavior. | Building, reduction-mode behavior, and compiled artifact generation. |
|
||||
| `diacritic` | Unicode diacritic normalization and stripping behavior. | Accent-removal correctness and locale-safe normalization checks. |
|
||||
|
||||
## Canonical intent tags
|
||||
|
||||
| Tag | Description | Typical usage |
|
||||
| --- | --- | --- |
|
||||
| `construction` | Tests around construction and assembly of runtime structures. | Builders, loaders, and compile-time object construction contracts. |
|
||||
| `lookup` | Read behavior and retrieval semantics. | `get()`, `getAll()`, traversal and missing-key behavior. |
|
||||
| `persistence` | Storage lifecycle semantics. | Serialization/deserialization and round-trip correctness. |
|
||||
| `reduction` | Reduction algorithm correctness and corner cases. | Dominance threshold, subtree deduplication, rank-preservation invariants. |
|
||||
| `encoding` | Encoding transformation direction. | `PatchCommandEncoder.encode` and serialized command form generation. |
|
||||
| `decoding` | Decoding/interpretation of persisted or runtime commands. | Optional consumers that parse and apply encoded command payloads. |
|
||||
| `apply` | Patch application and transformation behavior. | Verifies that applied patches produce expected derived forms. |
|
||||
| `normalization` | Canonicalization and cleanup behavior. | String normalization around case/shape and mirrored input paths. |
|
||||
| `validation` | Input rejection and defensive checks. | Null/empty/invalid contracts and explicit failure conditions. |
|
||||
| `regression` | Guard tests for behavior changes over time. | Known historical bugs and behavioral drift prevention. |
|
||||
| `determinism` | Repeatable results under fixed input and settings. | Compile determinism, stable ordering, and artifact reproducibility. |
|
||||
| `error-handling` | Exception surface and robustness expectations. | Recovery/failure modes and diagnostics quality. |
|
||||
|
||||
## Class-level rules
|
||||
|
||||
1. Every test class has **exactly one** scope tag.
|
||||
2. Every test class has at least one domain tag.
|
||||
3. Additional tags describe intent and may be used on classes or nested tests.
|
||||
4. For each test class, intent tags should reflect the primary behavior under test, not historical naming conventions.
|
||||
|
||||
## Governance and execution policy
|
||||
|
||||
The following rules are used to keep the suite auditable and stable:
|
||||
|
||||
| Rule | Required state | Why |
|
||||
| --- | --- | --- |
|
||||
| Scope discipline | Exactly one scope tag per class. | Prevents accidental promotion of integration-only behavior into fast unit runs. |
|
||||
| Coverage breadth | At least one domain tag per class. | Ensures tests can be grouped by subsystem for targeted review. |
|
||||
| Intent specificity | Use at least one intent tag when behavior is non-trivial. | Makes failure triage faster and profile composition explicit. |
|
||||
| Runtime policy | Never run `slow` tests in the default `unit` profile unless explicitly required. | Keeps PR feedback turnaround short while retaining deep checks in dedicated profiles. |
|
||||
| Change risk | Any persistence or compatibility-affecting change must include `compat` in validation. | Protects long-lived binary artifact contracts. |
|
||||
| Mutation resistance | `fuzz`/`property` sets should be gated to dedicated profiles. | Limits flakiness exposure and controls CI resource cost. |
|
||||
|
||||
## Suggested CI profiles
|
||||
|
||||
These are recommended launch profiles for local and CI usage and are also exposed as Gradle tasks:
|
||||
|
||||
- **Profile: `ci-smoke` (fast feedback):**
|
||||
|
||||
```
|
||||
./gradlew test -DincludeTags=unit -DexcludeTags=slow
|
||||
./gradlew ciSmoke
|
||||
```
|
||||
|
||||
`ciSmoke` also excludes `org.egothor.stemmer.CompileIntegrationTest*` at test-name filter level as a
|
||||
defensive fallback in case of future tag drift.
|
||||
`ciRelease` also excludes
|
||||
`org.egothor.stemmer.StemmerPatchTrieLoaderTest$BundledDictionaryTests*` at filter level.
|
||||
|
||||
- **Profile: `ci-core` (core behavioral coverage):**
|
||||
|
||||
```
|
||||
./gradlew test -DincludeTags=unit,trie,frequency-trie,property
|
||||
./gradlew ciCore
|
||||
```
|
||||
|
||||
- **Profile: `ci-integration` (pipeline correctness):**
|
||||
|
||||
```
|
||||
./gradlew test -DincludeTags=integration
|
||||
./gradlew ciIntegration
|
||||
```
|
||||
|
||||
- **Profile: `ci-slow` (explicit heavy validation):**
|
||||
|
||||
```
|
||||
./gradlew ciSlow
|
||||
```
|
||||
|
||||
- **Profile: `ci-compat` (artifact stability):**
|
||||
|
||||
```
|
||||
./gradlew test -DincludeTags=compat,regression
|
||||
./gradlew ciCompat
|
||||
```
|
||||
|
||||
- **Profile: `ci-release` (strong confidence before release):**
|
||||
|
||||
```
|
||||
./gradlew test -DexcludeTags=slow
|
||||
./gradlew ciRelease
|
||||
```
|
||||
`ciRelease` is non-slow by policy and uses the same defensive name-based exclusion for
|
||||
`org.egothor.stemmer.CompileIntegrationTest*` and
|
||||
`org.egothor.stemmer.StemmerPatchTrieLoaderTest$BundledDictionaryTests*` in addition to tag filtering.
|
||||
|
||||
- **Profile: `ci-nightly` (extended hardening):**
|
||||
|
||||
```
|
||||
./gradlew test -DincludeTags=fuzz
|
||||
./gradlew ciNightly
|
||||
```
|
||||
|
||||
- **Profile: `ci` (enterprise umbrella):**
|
||||
|
||||
```
|
||||
./gradlew ci
|
||||
```
|
||||
|
||||
`ci` and `ciRelease` intentionally do **not** include `slow` paths. Run `ciSlow` explicitly for production-dictionary stress and long-running corpus checks.
|
||||
|
||||
## Practical examples
|
||||
|
||||
All examples use Gradle with JUnit Platform integration:
|
||||
|
||||
- Only unit tests:
|
||||
|
||||
```
|
||||
./gradlew test -DincludeTags=unit
|
||||
```
|
||||
|
||||
- Integration tests only:
|
||||
|
||||
```
|
||||
./gradlew test -DincludeTags=integration
|
||||
```
|
||||
|
||||
- Only trie subsystem tests:
|
||||
|
||||
```
|
||||
./gradlew test -DincludeTags=trie
|
||||
```
|
||||
|
||||
- Deterministic fuzz checks:
|
||||
|
||||
```
|
||||
./gradlew test -DincludeTags=fuzz
|
||||
```
|
||||
|
||||
- Property tests:
|
||||
|
||||
```
|
||||
./gradlew test -DincludeTags=property
|
||||
```
|
||||
|
||||
- Stemmer + patch command behavior:
|
||||
|
||||
```
|
||||
./gradlew test -DincludeTags=stemmer,patch
|
||||
```
|
||||
|
||||
- Compatibility artifacts and regression checks:
|
||||
|
||||
```
|
||||
./gradlew test -DincludeTags=compat
|
||||
```
|
||||
|
||||
- Keep regression suite and remove long-running cases:
|
||||
|
||||
```
|
||||
./gradlew test -DincludeTags=regression -DexcludeTags=slow
|
||||
```
|
||||
|
||||
- Trie + patch behavior:
|
||||
|
||||
```
|
||||
./gradlew test -DincludeTags=trie,patch
|
||||
```
|
||||
|
||||
- Deterministic compatibility and persistence checks:
|
||||
|
||||
```
|
||||
./gradlew test -DincludeTags=compat,determinism,serialization
|
||||
```
|
||||
|
||||
## Notes
|
||||
|
||||
- `-DincludeTags` and `-DexcludeTags` are interpreted by Gradle task filtering and forwarded into
|
||||
JUnit tag filtering.
|
||||
- Class-name filtering is also available via Gradle test selectors where needed
|
||||
(for example, `--tests *CompileTest`), but tag filtering remains the default
|
||||
execution strategy.
|
||||
- `-DincludeTags` supports comma-separated literal tags. When you need a single exact tag with special
|
||||
characters, quote the argument for the shell.
|
||||
@@ -84,7 +84,7 @@ publishing {
|
||||
}
|
||||
|
||||
signing {
|
||||
required { !version.toString().endsWith('-SNAPSHOT') }
|
||||
required = !version.toString().endsWith('-SNAPSHOT')
|
||||
if (signingKey != null && !signingKey.isBlank()) {
|
||||
useInMemoryPgpKeys(signingKey, signingPassword)
|
||||
sign publishing.publications.mavenJava
|
||||
|
||||
@@ -54,6 +54,7 @@ nav:
|
||||
- Overview: architecture-and-reduction.md
|
||||
- Architecture: architecture.md
|
||||
- Reduction Semantics: reduction-semantics.md
|
||||
- Lookup Edge Optimization: lookup-edge-optimization.md
|
||||
- Compatibility and Guarantees: compatibility-and-guarantees.md
|
||||
|
||||
- Dictionaries:
|
||||
@@ -63,3 +64,4 @@ nav:
|
||||
- Quality and Operations: quality-and-operations.md
|
||||
- Benchmarking: benchmarking.md
|
||||
- Reports: reports.md
|
||||
- Test taxonomy and execution filtering: test-taxonomy-and-filtering.md
|
||||
|
||||
@@ -51,7 +51,6 @@ import java.util.logging.Logger;
|
||||
import org.egothor.stemmer.trie.CompiledNode;
|
||||
import org.egothor.stemmer.trie.LocalValueSummary;
|
||||
import org.egothor.stemmer.trie.MutableNode;
|
||||
import org.egothor.stemmer.trie.NodeData;
|
||||
import org.egothor.stemmer.trie.ReducedNode;
|
||||
import org.egothor.stemmer.trie.ReductionContext;
|
||||
import org.egothor.stemmer.trie.ReductionSignature;
|
||||
@@ -87,7 +86,6 @@ import org.egothor.stemmer.trie.ReductionSignature;
|
||||
*
|
||||
* @param <V> value type
|
||||
*/
|
||||
@SuppressWarnings("PMD.CyclomaticComplexity")
|
||||
public final class FrequencyTrie<V> {
|
||||
|
||||
/**
|
||||
@@ -130,11 +128,54 @@ public final class FrequencyTrie<V> {
|
||||
*/
|
||||
private static final int STREAM_MAGIC = 0x45475452;
|
||||
|
||||
/**
|
||||
* Minimum supported stream version constant retained for explicit range checks.
|
||||
*/
|
||||
private static final int MIN_STREAM_VERSION = 1;
|
||||
|
||||
/**
|
||||
* Number of stored values for which {@link #getEntries(String)} can return an
|
||||
* empty result.
|
||||
*/
|
||||
private static final int NO_VALUE_COUNT = 0;
|
||||
|
||||
/**
|
||||
* Number of stored values for which {@link #getEntries(String)} can use a
|
||||
* one-item immutable list special case.
|
||||
*/
|
||||
private static final int SINGLE_VALUE_COUNT = 1;
|
||||
|
||||
/**
|
||||
* Binary format version.
|
||||
*/
|
||||
private static final int STREAM_VERSION = 5;
|
||||
|
||||
/**
|
||||
* Version where traversal-direction ordinal is persisted.
|
||||
*/
|
||||
private static final int TRAVERSAL_VERSION = 2;
|
||||
|
||||
/**
|
||||
* Version where compact reduction metadata is persisted.
|
||||
*/
|
||||
private static final int REDUCTION_VERSION = 3;
|
||||
|
||||
/**
|
||||
* Version where case-processing mode ordinal is persisted.
|
||||
*/
|
||||
private static final int CASE_VERSION = 4;
|
||||
|
||||
/**
|
||||
* Default dense child lookup span in code points used when materializing
|
||||
* compiled nodes without an explicit override.
|
||||
* <p>
|
||||
* Increasing this value increases the chance of direct array indexing for
|
||||
* child lookup at runtime at the cost of per-node dense table memory for
|
||||
* compact character spans.
|
||||
* </p>
|
||||
*/
|
||||
public static final int DEFAULT_MAX_EXPANDED_INDEX = 512;
|
||||
|
||||
/**
|
||||
* Returns the current persisted binary stream format version.
|
||||
*
|
||||
@@ -259,7 +300,6 @@ public final class FrequencyTrie<V> {
|
||||
* if the key does not exist or no value is stored at the addressed node
|
||||
* @throws NullPointerException if {@code key} is {@code null}
|
||||
*/
|
||||
@SuppressWarnings("PMD.AvoidLiteralsInIfCondition")
|
||||
public List<ValueCount<V>> getEntries(final String key) {
|
||||
Objects.requireNonNull(key, "key");
|
||||
final CompiledNode<V> node = findNode(normalizeLookupKey(key));
|
||||
@@ -269,11 +309,11 @@ public final class FrequencyTrie<V> {
|
||||
|
||||
final V[] orderedValues = node.orderedValues();
|
||||
final int valueCount = orderedValues.length;
|
||||
if (valueCount == 0) {
|
||||
if (valueCount == NO_VALUE_COUNT) {
|
||||
return List.of();
|
||||
}
|
||||
|
||||
if (valueCount == 1) {
|
||||
if (valueCount == SINGLE_VALUE_COUNT) {
|
||||
return List.of(new ValueCount<>(orderedValues[0], node.orderedCounts()[0]));
|
||||
}
|
||||
|
||||
@@ -383,47 +423,31 @@ public final class FrequencyTrie<V> {
|
||||
*/
|
||||
public static <V> FrequencyTrie<V> readFrom(final InputStream inputStream, final IntFunction<V[]> arrayFactory,
|
||||
final ValueStreamCodec<V> valueCodec) throws IOException {
|
||||
Objects.requireNonNull(inputStream, "inputStream");
|
||||
Objects.requireNonNull(arrayFactory, "arrayFactory");
|
||||
Objects.requireNonNull(valueCodec, "valueCodec");
|
||||
|
||||
final DataInputStream dataInput; // NOPMD
|
||||
if (inputStream instanceof DataInputStream) {
|
||||
dataInput = (DataInputStream) inputStream;
|
||||
} else {
|
||||
dataInput = new DataInputStream(inputStream);
|
||||
return readFrom(inputStream, arrayFactory, valueCodec, -1);
|
||||
}
|
||||
|
||||
final int magic = dataInput.readInt();
|
||||
if (magic != STREAM_MAGIC) {
|
||||
throw new IOException("Unsupported trie stream header: " + Integer.toHexString(magic));
|
||||
}
|
||||
|
||||
final int version = dataInput.readInt();
|
||||
if (version != 1 && version != 3 && version != 4 && version != STREAM_VERSION) {
|
||||
throw new IOException("Unsupported trie stream version: " + version);
|
||||
}
|
||||
|
||||
final int nodeCount = dataInput.readInt();
|
||||
if (nodeCount < 0) {
|
||||
throw new IOException("Negative node count: " + nodeCount);
|
||||
}
|
||||
|
||||
final int rootNodeId = dataInput.readInt();
|
||||
if (rootNodeId < 0 || rootNodeId >= nodeCount) {
|
||||
throw new IOException("Invalid root node id: " + rootNodeId);
|
||||
}
|
||||
|
||||
final TrieMetadata metadata = readMetadata(dataInput, version);
|
||||
|
||||
final CompiledNode<V>[] nodes = readNodes(dataInput, arrayFactory, valueCodec, nodeCount);
|
||||
final CompiledNode<V> rootNode = nodes[rootNodeId];
|
||||
|
||||
if (LOGGER.isLoggable(Level.FINE)) {
|
||||
LOGGER.log(Level.FINE, "Read compiled trie with {0} canonical nodes.", nodeCount);
|
||||
}
|
||||
|
||||
return new FrequencyTrie<>(arrayFactory, rootNode, metadata);
|
||||
/**
|
||||
* Reads a compiled trie from the supplied input stream, optionally overriding
|
||||
* dense child-index span configuration.
|
||||
* <p>
|
||||
* This setting is applied only while materializing the in-memory compiled
|
||||
* representation during load. It is not serialized in {@link TrieMetadata},
|
||||
* so each load can independently choose its own runtime lookup trade-off.
|
||||
* </p>
|
||||
*
|
||||
* @param inputStream source input stream
|
||||
* @param arrayFactory array factory used to create typed arrays
|
||||
* @param valueCodec codec used to read values
|
||||
* @param maxExpandedIndex dense lookup span override; zero disables dense lookup,
|
||||
* negative values use {@link #DEFAULT_MAX_EXPANDED_INDEX}
|
||||
* @param <V> value type
|
||||
* @return deserialized compiled trie
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
* @throws IOException if reading fails or the binary format is invalid
|
||||
*/
|
||||
public static <V> FrequencyTrie<V> readFrom(final InputStream inputStream, final IntFunction<V[]> arrayFactory,
|
||||
final ValueStreamCodec<V> valueCodec, final int maxExpandedIndex) throws IOException {
|
||||
return CompiledTrieReader.read(inputStream, arrayFactory, valueCodec, maxExpandedIndex);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -438,73 +462,6 @@ public final class FrequencyTrie<V> {
|
||||
dataOutput.writeUTF(metadata.toTextBlock());
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads persisted trie metadata while remaining backward compatible with
|
||||
* earlier stream versions.
|
||||
*
|
||||
* @param dataInput input stream
|
||||
* @param version persisted stream version
|
||||
* @return deserialized metadata
|
||||
* @throws IOException if the metadata section is invalid
|
||||
*/
|
||||
private static TrieMetadata readMetadata(final DataInputStream dataInput, final int version) throws IOException {
|
||||
if (version >= 5) { // NOPMD
|
||||
try {
|
||||
return TrieMetadata.fromTextBlock(version, dataInput.readUTF());
|
||||
} catch (IllegalArgumentException exception) {
|
||||
throw new IOException("Invalid metadata block.", exception);
|
||||
}
|
||||
}
|
||||
|
||||
final WordTraversalDirection traversalDirection;
|
||||
if (version >= 2) { // NOPMD
|
||||
final int traversalDirectionOrdinal = dataInput.readInt();
|
||||
final WordTraversalDirection[] traversalDirections = WordTraversalDirection.values();
|
||||
if (traversalDirectionOrdinal < 0 || traversalDirectionOrdinal >= traversalDirections.length) {
|
||||
throw new IOException("Invalid traversal direction ordinal: " + traversalDirectionOrdinal);
|
||||
}
|
||||
traversalDirection = traversalDirections[traversalDirectionOrdinal];
|
||||
} else {
|
||||
traversalDirection = WordTraversalDirection.BACKWARD;
|
||||
}
|
||||
|
||||
if (version < 3) { // NOPMD
|
||||
return TrieMetadata.legacy(version, traversalDirection);
|
||||
}
|
||||
|
||||
final ReductionMode[] reductionModes = ReductionMode.values();
|
||||
final int reductionModeOrdinal = dataInput.readInt();
|
||||
if (reductionModeOrdinal < 0 || reductionModeOrdinal >= reductionModes.length) {
|
||||
throw new IOException("Invalid reduction mode ordinal: " + reductionModeOrdinal);
|
||||
}
|
||||
|
||||
final int dominantWinnerMinPercent = dataInput.readInt();
|
||||
final int dominantWinnerOverSecondRatio = dataInput.readInt(); // NOPMD
|
||||
|
||||
final DiacriticProcessingMode[] diacriticProcessingModes = DiacriticProcessingMode.values();
|
||||
final int diacriticProcessingModeOrdinal = dataInput.readInt(); // NOPMD
|
||||
if (diacriticProcessingModeOrdinal < 0 || diacriticProcessingModeOrdinal >= diacriticProcessingModes.length) {
|
||||
throw new IOException("Invalid diacritic processing mode ordinal: " + diacriticProcessingModeOrdinal);
|
||||
}
|
||||
|
||||
final CaseProcessingMode caseProcessingMode;
|
||||
if (version >= 4) { // NOPMD
|
||||
final CaseProcessingMode[] caseProcessingModes = CaseProcessingMode.values();
|
||||
final int caseProcessingModeOrdinal = dataInput.readInt();
|
||||
if (caseProcessingModeOrdinal < 0 || caseProcessingModeOrdinal >= caseProcessingModes.length) {
|
||||
throw new IOException("Invalid case processing mode ordinal: " + caseProcessingModeOrdinal);
|
||||
}
|
||||
caseProcessingMode = caseProcessingModes[caseProcessingModeOrdinal];
|
||||
} else {
|
||||
caseProcessingMode = CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT;
|
||||
}
|
||||
|
||||
return new TrieMetadata(version, traversalDirection,
|
||||
new ReductionSettings(reductionModes[reductionModeOrdinal], dominantWinnerMinPercent,
|
||||
dominantWinnerOverSecondRatio),
|
||||
diacriticProcessingModes[diacriticProcessingModeOrdinal], caseProcessingMode);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the number of canonical compiled nodes reachable from the root.
|
||||
*
|
||||
@@ -574,20 +531,126 @@ public final class FrequencyTrie<V> {
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads all compiled nodes and resolves child references.
|
||||
* Internal helper that materializes serialized trie data.
|
||||
*
|
||||
* @param dataInput input
|
||||
* @param arrayFactory array factory
|
||||
* @param valueCodec value codec
|
||||
* @param nodeCount number of nodes
|
||||
* @param <V> value type
|
||||
* @return array of nodes indexed by serialized node identifier
|
||||
* @throws IOException if reading fails or the stream is invalid
|
||||
* <p>
|
||||
* Moving reader complexity into this helper keeps the public-facing class from
|
||||
* accumulating excessive class-level cyclomatic complexity while preserving the
|
||||
* same binary compatibility contract.
|
||||
* </p>
|
||||
*/
|
||||
@SuppressWarnings("PMD.AvoidInstantiatingObjectsInLoops")
|
||||
private static final class CompiledTrieReader {
|
||||
|
||||
private static <V> FrequencyTrie<V> read(final InputStream inputStream, final IntFunction<V[]> arrayFactory,
|
||||
final ValueStreamCodec<V> valueCodec, final int maxExpandedIndex) throws IOException {
|
||||
Objects.requireNonNull(inputStream, "inputStream");
|
||||
Objects.requireNonNull(arrayFactory, "arrayFactory");
|
||||
Objects.requireNonNull(valueCodec, "valueCodec");
|
||||
if (maxExpandedIndex < -1) {
|
||||
throw new IllegalArgumentException("maxExpandedIndex must be >= -1.");
|
||||
}
|
||||
|
||||
final DataInputStream dataInput = wrapInputStream(inputStream);
|
||||
final int magic = dataInput.readInt();
|
||||
if (magic != STREAM_MAGIC) {
|
||||
throw new IOException("Unsupported trie stream header: " + Integer.toHexString(magic));
|
||||
}
|
||||
|
||||
final int version = dataInput.readInt();
|
||||
if (version < MIN_STREAM_VERSION || version > STREAM_VERSION) {
|
||||
throw new IOException("Unsupported trie stream version: " + version);
|
||||
}
|
||||
|
||||
final int nodeCount = dataInput.readInt();
|
||||
if (nodeCount < 0) {
|
||||
throw new IOException("Negative node count: " + nodeCount);
|
||||
}
|
||||
|
||||
final int rootNodeId = dataInput.readInt();
|
||||
if (rootNodeId < 0 || rootNodeId >= nodeCount) {
|
||||
throw new IOException("Invalid root node id: " + rootNodeId);
|
||||
}
|
||||
|
||||
final TrieMetadata sourceMetadata = readMetadata(dataInput, version);
|
||||
final int effectiveMaxExpandedIndex = maxExpandedIndex >= 0 ? maxExpandedIndex : DEFAULT_MAX_EXPANDED_INDEX;
|
||||
final CompiledNode<V>[] nodes = readNodes(dataInput, arrayFactory, valueCodec, nodeCount, effectiveMaxExpandedIndex);
|
||||
final CompiledNode<V> rootNode = nodes[rootNodeId];
|
||||
|
||||
if (LOGGER.isLoggable(Level.FINE)) {
|
||||
LOGGER.log(Level.FINE, "Read compiled trie with {0} canonical nodes.", nodeCount);
|
||||
}
|
||||
|
||||
return new FrequencyTrie<>(arrayFactory, rootNode, sourceMetadata);
|
||||
}
|
||||
|
||||
private static DataInputStream wrapInputStream(final InputStream inputStream) {
|
||||
return inputStream instanceof DataInputStream
|
||||
? (DataInputStream) inputStream
|
||||
: new DataInputStream(inputStream);
|
||||
}
|
||||
|
||||
private static TrieMetadata readMetadata(final DataInputStream dataInput, final int version) throws IOException {
|
||||
if (version == STREAM_VERSION) {
|
||||
return readTextMetadata(dataInput);
|
||||
}
|
||||
|
||||
final WordTraversalDirection traversalDirection = readTraversalDirection(dataInput, version);
|
||||
if (version < REDUCTION_VERSION) {
|
||||
return TrieMetadata.legacy(version, traversalDirection);
|
||||
}
|
||||
|
||||
final ReductionSettings reductionSettings = readReductionSettings(dataInput);
|
||||
final DiacriticProcessingMode diacriticProcessingMode = readEnumByOrdinal(dataInput, DiacriticProcessingMode.values(),
|
||||
"diacritic processing mode");
|
||||
final CaseProcessingMode caseProcessingMode = version >= CASE_VERSION
|
||||
? readCaseProcessingMode(dataInput)
|
||||
: CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT;
|
||||
return new TrieMetadata(version, traversalDirection, reductionSettings, diacriticProcessingMode, caseProcessingMode);
|
||||
}
|
||||
|
||||
private static TrieMetadata readTextMetadata(final DataInputStream dataInput) throws IOException {
|
||||
try {
|
||||
return TrieMetadata.fromTextBlock(STREAM_VERSION, dataInput.readUTF());
|
||||
} catch (IllegalArgumentException exception) {
|
||||
throw new IOException("Invalid metadata block.", exception);
|
||||
}
|
||||
}
|
||||
|
||||
private static WordTraversalDirection readTraversalDirection(final DataInputStream dataInput, final int version)
|
||||
throws IOException {
|
||||
if (version < TRAVERSAL_VERSION) {
|
||||
return WordTraversalDirection.BACKWARD;
|
||||
}
|
||||
return readEnumByOrdinal(dataInput, WordTraversalDirection.values(), "traversal direction");
|
||||
}
|
||||
|
||||
private static ReductionSettings readReductionSettings(final DataInputStream dataInput) throws IOException {
|
||||
final ReductionMode reductionMode = readEnumByOrdinal(dataInput, ReductionMode.values(), "reduction mode");
|
||||
final int dominantWinnerMinPercent = dataInput.readInt();
|
||||
final int dominantWinnerOverSecondRatio = dataInput.readInt(); // NOPMD
|
||||
return new ReductionSettings(reductionMode, dominantWinnerMinPercent, dominantWinnerOverSecondRatio);
|
||||
}
|
||||
|
||||
private static CaseProcessingMode readCaseProcessingMode(final DataInputStream dataInput) throws IOException {
|
||||
return readEnumByOrdinal(dataInput, CaseProcessingMode.values(), "case processing mode");
|
||||
}
|
||||
|
||||
private static <E extends Enum<E>> E readEnumByOrdinal(final DataInputStream dataInput, final E[] values,
|
||||
final String name) throws IOException {
|
||||
final int ordinal = dataInput.readInt();
|
||||
if (ordinal < 0 || ordinal >= values.length) {
|
||||
throw new IOException("Invalid " + name + " ordinal: " + ordinal);
|
||||
}
|
||||
return values[ordinal];
|
||||
}
|
||||
|
||||
private static <V> CompiledNode<V>[] readNodes(final DataInputStream dataInput, final IntFunction<V[]> arrayFactory,
|
||||
final ValueStreamCodec<V> valueCodec, final int nodeCount) throws IOException {
|
||||
final List<NodeData<V>> nodeDataList = new ArrayList<>(nodeCount);
|
||||
final ValueStreamCodec<V> valueCodec, final int nodeCount, final int maxExpandedIndex) throws IOException {
|
||||
final char[][] edgeLabelsByNode = new char[nodeCount][];
|
||||
final int[][] childNodeIdsByNode = new int[nodeCount][];
|
||||
@SuppressWarnings("unchecked")
|
||||
final V[][] orderedValuesByNode = (V[][]) new Object[nodeCount][];
|
||||
final int[][] orderedCountsByNode = new int[nodeCount][];
|
||||
|
||||
for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++) {
|
||||
final int edgeCount = dataInput.readInt();
|
||||
@@ -595,77 +658,85 @@ public final class FrequencyTrie<V> {
|
||||
throw new IOException("Negative edge count at node " + nodeIndex + ": " + edgeCount);
|
||||
}
|
||||
|
||||
final char[] edgeLabels = new char[edgeCount];
|
||||
final int[] childNodeIds = new int[edgeCount];
|
||||
edgeLabelsByNode[nodeIndex] = new char[edgeCount];
|
||||
childNodeIdsByNode[nodeIndex] = new int[edgeCount];
|
||||
|
||||
for (int edgeIndex = 0; edgeIndex < edgeCount; edgeIndex++) {
|
||||
edgeLabels[edgeIndex] = dataInput.readChar();
|
||||
childNodeIds[edgeIndex] = dataInput.readInt();
|
||||
edgeLabelsByNode[nodeIndex][edgeIndex] = dataInput.readChar();
|
||||
childNodeIdsByNode[nodeIndex][edgeIndex] = dataInput.readInt();
|
||||
}
|
||||
|
||||
validateSerializedEdges(nodeIndex, edgeLabels);
|
||||
validateSerializedEdges(nodeIndex, edgeLabelsByNode[nodeIndex]);
|
||||
|
||||
final int valueCount = dataInput.readInt();
|
||||
if (valueCount < 0) {
|
||||
throw new IOException("Negative value count at node " + nodeIndex + ": " + valueCount);
|
||||
}
|
||||
|
||||
final V[] orderedValues = arrayFactory.apply(valueCount);
|
||||
final int[] orderedCounts = new int[valueCount];
|
||||
orderedValuesByNode[nodeIndex] = arrayFactory.apply(valueCount);
|
||||
orderedCountsByNode[nodeIndex] = new int[valueCount];
|
||||
|
||||
for (int valueIndex = 0; valueIndex < valueCount; valueIndex++) {
|
||||
orderedValues[valueIndex] = valueCodec.read(dataInput);
|
||||
orderedCounts[valueIndex] = dataInput.readInt();
|
||||
if (orderedCounts[valueIndex] <= 0) {
|
||||
orderedValuesByNode[nodeIndex][valueIndex] = valueCodec.read(dataInput);
|
||||
orderedCountsByNode[nodeIndex][valueIndex] = dataInput.readInt();
|
||||
if (orderedCountsByNode[nodeIndex][valueIndex] <= 0) {
|
||||
throw new IOException("Non-positive stored count at node " + nodeIndex + ", value index "
|
||||
+ valueIndex + ": " + orderedCounts[valueIndex]);
|
||||
+ valueIndex + ": " + orderedCountsByNode[nodeIndex][valueIndex]);
|
||||
}
|
||||
}
|
||||
|
||||
nodeDataList.add(new NodeData<>(edgeLabels, childNodeIds, orderedValues, orderedCounts));
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
final CompiledNode<V>[] nodes = new CompiledNode[nodeCount];
|
||||
final boolean[] inProgress = new boolean[nodeCount];
|
||||
|
||||
for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++) {
|
||||
final NodeData<V> nodeData = nodeDataList.get(nodeIndex);
|
||||
@SuppressWarnings("unchecked")
|
||||
final CompiledNode<V>[] children = new CompiledNode[nodeData.childNodeIds().length];
|
||||
nodes[nodeIndex] = new CompiledNode<>(nodeData.edgeLabels(), children, nodeData.orderedValues(),
|
||||
nodeData.orderedCounts());
|
||||
}
|
||||
|
||||
for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++) {
|
||||
final NodeData<V> nodeData = nodeDataList.get(nodeIndex);
|
||||
final CompiledNode<V> node = nodes[nodeIndex];
|
||||
|
||||
for (int edgeIndex = 0; edgeIndex < nodeData.childNodeIds().length; edgeIndex++) {
|
||||
final int childNodeId = nodeData.childNodeIds()[edgeIndex];
|
||||
if (childNodeId < 0 || childNodeId >= nodeCount) {
|
||||
throw new IOException("Invalid child node id at node " + nodeIndex + ", edge index " + edgeIndex
|
||||
+ ": " + childNodeId);
|
||||
}
|
||||
node.children()[edgeIndex] = nodes[childNodeId];
|
||||
}
|
||||
nodes[nodeIndex] = resolveNode(nodeIndex, edgeLabelsByNode, childNodeIdsByNode, orderedValuesByNode,
|
||||
orderedCountsByNode, nodes, inProgress, maxExpandedIndex);
|
||||
}
|
||||
|
||||
return nodes;
|
||||
}
|
||||
|
||||
/**
* Materializes the compiled node for one serialized node identifier, resolving
* all of its children first.
*
* <p>
* Nodes that were already materialized are returned from {@code nodes}. The
* {@code inProgress} flags mark nodes whose resolution has started but not yet
* finished, so a child reference that leads back to such a node is reported as
* a cyclic reference instead of recursing without end.
* </p>
*
* @param <V>                 value type
* @param nodeIndex           serialized node identifier to resolve
* @param edgeLabelsByNode    edge labels indexed by node identifier
* @param childNodeIdsByNode  child node identifiers indexed by node identifier
* @param orderedValuesByNode ordered values indexed by node identifier
* @param orderedCountsByNode ordered counts indexed by node identifier
* @param nodes               cache of resolved nodes, filled as nodes are created
* @param inProgress          per-node flags set while a node is being resolved
* @param maxExpandedIndex    dense lookup span passed to every created node
* @return resolved node for {@code nodeIndex}
* @throws IOException if a child node identifier is out of range or the
*                     serialized graph contains a cyclic reference
*/
|
||||
private static <V> CompiledNode<V> resolveNode(final int nodeIndex, final char[][] edgeLabelsByNode,
|
||||
final int[][] childNodeIdsByNode, final V[][] orderedValuesByNode, final int[][] orderedCountsByNode,
|
||||
final CompiledNode<V>[] nodes, final boolean[] inProgress, final int maxExpandedIndex) throws IOException {
|
||||
// A node that was resolved earlier is reused as-is.
final CompiledNode<V> cachedNode = nodes[nodeIndex];
|
||||
if (cachedNode != null) {
|
||||
return cachedNode;
|
||||
}
|
||||
|
||||
// Reaching a node whose resolution has not finished means the graph loops back.
if (inProgress[nodeIndex]) {
|
||||
throw new IOException("Invalid serialized node graph: cyclic reference detected at node " + nodeIndex + '.');
|
||||
}
|
||||
inProgress[nodeIndex] = true;
|
||||
try {
|
||||
final char[] edgeLabels = edgeLabelsByNode[nodeIndex];
|
||||
final int[] childNodeIds = childNodeIdsByNode[nodeIndex];
|
||||
final int edgeCount = childNodeIds.length;
|
||||
@SuppressWarnings("unchecked")
|
||||
final CompiledNode<V>[] children = new CompiledNode[edgeCount];
|
||||
|
||||
for (int edgeIndex = 0; edgeIndex < edgeCount; edgeIndex++) {
|
||||
final int childNodeId = childNodeIds[edgeIndex];
|
||||
// Child identifiers must address an existing serialized node.
if (childNodeId < 0 || childNodeId >= edgeLabelsByNode.length) {
|
||||
throw new IOException(
|
||||
"Invalid child node id at node " + nodeIndex + ", edge index " + edgeIndex + ": "
|
||||
+ childNodeId);
|
||||
}
|
||||
children[edgeIndex] = resolveNode(childNodeId, edgeLabelsByNode, childNodeIdsByNode,
|
||||
orderedValuesByNode, orderedCountsByNode, nodes, inProgress, maxExpandedIndex);
|
||||
}
|
||||
|
||||
// The node is created only after all children exist, then cached for reuse.
final CompiledNode<V> node = new CompiledNode<>(edgeLabels, children, orderedValuesByNode[nodeIndex], maxExpandedIndex,
|
||||
orderedCountsByNode[nodeIndex]);
|
||||
nodes[nodeIndex] = node;
|
||||
return node;
|
||||
} finally {
|
||||
// Cleared on both the success and the failure path.
inProgress[nodeIndex] = false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
* Validates the serialized edge-label sequence for one node.
*
* <p>
* Compiled nodes rely on binary search for child lookup and therefore require
* edge labels to be stored in strict ascending order without duplicates.
* Rejecting malformed streams here keeps lookup semantics deterministic and
* avoids silently constructing a trie whose search behavior would be undefined.
*
* @param nodeIndex serialized node identifier
* @param edgeLabels serialized edge labels
* @throws IOException if the edge labels are not strictly ascending
*/
private static void validateSerializedEdges(final int nodeIndex, final char... edgeLabels) throws IOException {
|
||||
for (int edgeIndex = 1; edgeIndex < edgeLabels.length; edgeIndex++) {
|
||||
if (edgeLabels[edgeIndex - 1] >= edgeLabels[edgeIndex]) {
|
||||
@@ -674,6 +745,7 @@ public final class FrequencyTrie<V> {
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Locates the compiled node for the supplied key.
|
||||
@@ -771,6 +843,16 @@ public final class FrequencyTrie<V> {
|
||||
*/
|
||||
private final DiacriticProcessingMode diacriticProcessingMode;
|
||||
|
||||
/**
|
||||
* Dense edge lookup span threshold.
|
||||
* <p>
|
||||
* This value controls a speed/memory trade-off during freezing:
|
||||
* dense child lookup tables are allocated only for nodes whose child
|
||||
* labels fit in this span.
|
||||
* </p>
|
||||
*/
|
||||
private final int maxExpandedIndex;
|
||||
|
||||
/**
|
||||
* Mutable root node.
|
||||
*/
|
||||
@@ -837,11 +919,39 @@ public final class FrequencyTrie<V> {
|
||||
public Builder(final IntFunction<V[]> arrayFactory, final ReductionSettings reductionSettings,
|
||||
final WordTraversalDirection traversalDirection, final CaseProcessingMode caseProcessingMode,
|
||||
final DiacriticProcessingMode diacriticProcessingMode) {
|
||||
this(arrayFactory, reductionSettings, traversalDirection, caseProcessingMode, diacriticProcessingMode,
|
||||
CompiledNode.DEFAULT_MAX_EXPANDED_INDEX);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new builder with the provided settings, explicit traversal
|
||||
* direction, explicit case processing mode, explicit diacritic processing
|
||||
* mode, and an explicit dense child lookup threshold.
|
||||
*
|
||||
* @param arrayFactory array factory
|
||||
* @param reductionSettings reduction configuration
|
||||
* @param traversalDirection logical key traversal direction
|
||||
* @param caseProcessingMode dictionary case processing mode
|
||||
* @param diacriticProcessingMode dictionary diacritic processing mode
|
||||
* @param maxExpandedIndex dense lookup span override; zero disables
|
||||
* dense lookup. Larger values increase direct
|
||||
* indexing opportunities while potentially
|
||||
* increasing materialization memory in nodes
|
||||
* whose edge label span is within the limit.
|
||||
* @throws NullPointerException if any reference argument is {@code null}
|
||||
* @throws IllegalArgumentException if {@code maxExpandedIndex} is negative
|
||||
*/
|
||||
public Builder(final IntFunction<V[]> arrayFactory, final ReductionSettings reductionSettings,
|
||||
final WordTraversalDirection traversalDirection, final CaseProcessingMode caseProcessingMode,
|
||||
final DiacriticProcessingMode diacriticProcessingMode, final int maxExpandedIndex) {
|
||||
this.arrayFactory = Objects.requireNonNull(arrayFactory, "arrayFactory");
|
||||
this.reductionSettings = Objects.requireNonNull(reductionSettings, "reductionSettings");
|
||||
this.traversalDirection = Objects.requireNonNull(traversalDirection, "traversalDirection");
|
||||
this.caseProcessingMode = Objects.requireNonNull(caseProcessingMode, "caseProcessingMode");
|
||||
this.diacriticProcessingMode = Objects.requireNonNull(diacriticProcessingMode, "diacriticProcessingMode");
|
||||
if (maxExpandedIndex < 0) {
|
||||
throw new IllegalArgumentException("maxExpandedIndex must be non-negative.");
|
||||
}
|
||||
this.maxExpandedIndex = maxExpandedIndex;
|
||||
this.root = new MutableNode<>();
|
||||
}
|
||||
|
||||
@@ -1098,7 +1208,7 @@ public final class FrequencyTrie<V> {
|
||||
}
|
||||
|
||||
final CompiledNode<V> frozen = new CompiledNode<>(edges, childNodes, localSummary.orderedValues(),
|
||||
localSummary.orderedCounts());
|
||||
this.maxExpandedIndex, localSummary.orderedCounts());
|
||||
cache.put(reducedNode, frozen);
|
||||
return frozen;
|
||||
}
|
||||
|
||||
@@ -94,6 +94,29 @@ public final class StemmerPatchTrieBinaryIO {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads a GZip-compressed binary patch-command trie from a filesystem path
|
||||
* with an optional dense child lookup span override.
|
||||
* <p>
|
||||
* This is a runtime-only tuning parameter. The dense-span setting is not
|
||||
* persisted in the file and does not change the compiled metadata.
|
||||
* </p>
|
||||
*
|
||||
* @param path source file
|
||||
* @param maxExpandedIndex dense lookup span override; {@code -1} selects
|
||||
* {@link FrequencyTrie#DEFAULT_MAX_EXPANDED_INDEX}; values below {@code -1} are rejected
|
||||
* @return deserialized trie
|
||||
* @throws NullPointerException if {@code path} is {@code null}
|
||||
* @throws IOException if reading or decompression fails
|
||||
*/
|
||||
public static FrequencyTrie<String> read(final Path path, final int maxExpandedIndex) throws IOException {
|
||||
Objects.requireNonNull(path, "path");
|
||||
|
||||
try (InputStream fileInputStream = Files.newInputStream(path)) {
|
||||
return read(fileInputStream, maxExpandedIndex);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads a GZip-compressed binary patch-command trie from a filesystem path
|
||||
* string.
|
||||
@@ -108,6 +131,26 @@ public final class StemmerPatchTrieBinaryIO {
|
||||
return read(Path.of(fileName));
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads a GZip-compressed binary patch-command trie from a filesystem path
|
||||
* string with an optional dense child lookup span override.
|
||||
* <p>
|
||||
* This is a runtime-only tuning parameter. The dense-span setting is not
|
||||
* persisted in the file and does not change the compiled metadata.
|
||||
* </p>
|
||||
*
|
||||
* @param fileName source file name or path string
|
||||
* @param maxExpandedIndex dense lookup span override; {@code -1} selects
|
||||
* {@link FrequencyTrie#DEFAULT_MAX_EXPANDED_INDEX}; values below {@code -1} are rejected
|
||||
* @return deserialized trie
|
||||
* @throws NullPointerException if {@code fileName} is {@code null}
|
||||
* @throws IOException if reading or decompression fails
|
||||
*/
|
||||
public static FrequencyTrie<String> read(final String fileName, final int maxExpandedIndex) throws IOException {
|
||||
Objects.requireNonNull(fileName, "fileName");
|
||||
return read(Path.of(fileName), maxExpandedIndex);
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads a GZip-compressed binary patch-command trie from an input stream.
|
||||
*
|
||||
@@ -132,6 +175,34 @@ public final class StemmerPatchTrieBinaryIO {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads a GZip-compressed binary patch-command trie from an input stream with
|
||||
* an optional dense child lookup span override.
|
||||
* <p>
|
||||
* This is a runtime-only tuning parameter. The dense-span setting is not
|
||||
* persisted in the file and does not change the compiled metadata.
|
||||
* </p>
|
||||
*
|
||||
* @param inputStream source stream
|
||||
* @param maxExpandedIndex dense lookup span override; {@code -1} selects
|
||||
* {@link FrequencyTrie#DEFAULT_MAX_EXPANDED_INDEX}; values below {@code -1} are rejected
|
||||
* @return deserialized trie
|
||||
* @throws NullPointerException if {@code inputStream} is {@code null}
|
||||
* @throws IOException if reading or decompression fails
|
||||
*/
|
||||
public static FrequencyTrie<String> read(final InputStream inputStream, final int maxExpandedIndex) throws IOException {
|
||||
Objects.requireNonNull(inputStream, "inputStream");
|
||||
|
||||
try (GZIPInputStream gzipInputStream = new GZIPInputStream(new BufferedInputStream(inputStream));
|
||||
DataInputStream dataInputStream = new DataInputStream(gzipInputStream)) {
|
||||
final FrequencyTrie<String> trie = FrequencyTrie.readFrom(dataInputStream, String[]::new, STRING_CODEC,
|
||||
maxExpandedIndex);
|
||||
|
||||
LOGGER.log(Level.FINE, "Read compressed binary stemmer trie.");
|
||||
return trie;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads only metadata from a GZip-compressed binary patch-command trie stored
|
||||
* at a filesystem path.
|
||||
|
||||
@@ -71,6 +71,7 @@ import java.util.zip.GZIPInputStream;
|
||||
public final class StemmerPatchTrieLoader {
|
||||
|
||||
/* default */ static final String FILENAME_REQUIRED = "fileName required";
|
||||
private static final String PARAMETER_PATH = "path";
|
||||
|
||||
/**
|
||||
* Logger of this class.
|
||||
@@ -461,7 +462,7 @@ public final class StemmerPatchTrieLoader {
|
||||
final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection,
|
||||
final CaseProcessingMode caseProcessingMode, final DiacriticProcessingMode diacriticProcessingMode)
|
||||
throws IOException {
|
||||
Objects.requireNonNull(path, "path");
|
||||
Objects.requireNonNull(path, PARAMETER_PATH);
|
||||
final TrieMetadata metadata = metadataForCompilation(traversalDirection, reductionSettings, caseProcessingMode,
|
||||
diacriticProcessingMode);
|
||||
return load(path, storeOriginal, metadata);
|
||||
@@ -487,7 +488,7 @@ public final class StemmerPatchTrieLoader {
|
||||
*/
|
||||
public static FrequencyTrie<String> load(final Path path, final boolean storeOriginal, final TrieMetadata metadata)
|
||||
throws IOException {
|
||||
Objects.requireNonNull(path, "path");
|
||||
Objects.requireNonNull(path, PARAMETER_PATH);
|
||||
Objects.requireNonNull(metadata, "metadata");
|
||||
|
||||
try (InputStream inputStream = openDictionaryInputStream(path);
|
||||
@@ -759,10 +760,31 @@ public final class StemmerPatchTrieLoader {
|
||||
* read
|
||||
*/
|
||||
public static FrequencyTrie<String> loadBinary(final Path path) throws IOException {
|
||||
Objects.requireNonNull(path, "path");
|
||||
Objects.requireNonNull(path, PARAMETER_PATH);
|
||||
return StemmerPatchTrieBinaryIO.read(path);
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads a GZip-compressed binary patch-command trie from a filesystem path
|
||||
* using a custom dense lookup span override.
|
||||
* <p>
|
||||
* This is a runtime-only tuning parameter that does not affect persisted
|
||||
* metadata.
|
||||
* </p>
|
||||
*
|
||||
* @param path path to the compressed binary trie file
|
||||
* @param maxExpandedIndex dense lookup span override; {@code -1} selects
|
||||
* {@link FrequencyTrie#DEFAULT_MAX_EXPANDED_INDEX}; values below {@code -1} are rejected
|
||||
* @return compiled patch-command trie
|
||||
* @throws NullPointerException if {@code path} is {@code null}
|
||||
* @throws IOException if the file cannot be opened, decompressed, or
|
||||
* read
|
||||
*/
|
||||
public static FrequencyTrie<String> loadBinary(final Path path, final int maxExpandedIndex) throws IOException {
|
||||
Objects.requireNonNull(path, PARAMETER_PATH);
|
||||
return StemmerPatchTrieBinaryIO.read(path, maxExpandedIndex);
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads a GZip-compressed binary patch-command trie from a filesystem path
|
||||
* string.
|
||||
@@ -778,6 +800,27 @@ public final class StemmerPatchTrieLoader {
|
||||
return StemmerPatchTrieBinaryIO.read(fileName);
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads a GZip-compressed binary patch-command trie from a filesystem path
|
||||
* string using a custom dense lookup span override.
|
||||
* <p>
|
||||
* This is a runtime-only tuning parameter that does not affect persisted
|
||||
* metadata.
|
||||
* </p>
|
||||
*
|
||||
* @param fileName file name or path string
|
||||
* @param maxExpandedIndex dense lookup span override; {@code -1} selects
|
||||
* {@link FrequencyTrie#DEFAULT_MAX_EXPANDED_INDEX}; values below {@code -1} are rejected
|
||||
* @return compiled patch-command trie
|
||||
* @throws NullPointerException if {@code fileName} is {@code null}
|
||||
* @throws IOException if the file cannot be opened, decompressed, or
|
||||
* read
|
||||
*/
|
||||
public static FrequencyTrie<String> loadBinary(final String fileName, final int maxExpandedIndex) throws IOException {
|
||||
Objects.requireNonNull(fileName, FILENAME_REQUIRED);
|
||||
return StemmerPatchTrieBinaryIO.read(fileName, maxExpandedIndex);
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads a GZip-compressed binary patch-command trie from an input stream.
|
||||
*
|
||||
@@ -802,7 +845,7 @@ public final class StemmerPatchTrieLoader {
|
||||
* read
|
||||
*/
|
||||
public static TrieMetadata loadBinaryMetadata(final Path path) throws IOException {
|
||||
Objects.requireNonNull(path, "path");
|
||||
Objects.requireNonNull(path, PARAMETER_PATH);
|
||||
return StemmerPatchTrieBinaryIO.readMetadata(path);
|
||||
}
|
||||
|
||||
@@ -845,7 +888,7 @@ public final class StemmerPatchTrieLoader {
|
||||
*/
|
||||
public static void saveBinary(final FrequencyTrie<String> trie, final Path path) throws IOException {
|
||||
Objects.requireNonNull(trie, "trie");
|
||||
Objects.requireNonNull(path, "path");
|
||||
Objects.requireNonNull(path, PARAMETER_PATH);
|
||||
StemmerPatchTrieBinaryIO.write(trie, path);
|
||||
}
|
||||
|
||||
|
||||
@@ -44,13 +44,14 @@ import java.util.Objects;
|
||||
* arrays once and all lookup operations thereafter treat them as read-only.
|
||||
*
|
||||
* @param <V> value type
|
||||
* @param edgeLabels internal edge label array
|
||||
* @param children internal child array
|
||||
* @param orderedValues internal ordered values array
|
||||
* @param orderedCounts internal ordered counts array
|
||||
*/
|
||||
@SuppressWarnings("PMD.DataClass")
|
||||
public record CompiledNode<V>(char[] edgeLabels, CompiledNode<V>[] children, V[] orderedValues, int... orderedCounts) {
|
||||
public final class CompiledNode<V> {
|
||||
|
||||
/**
|
||||
* Default dense child lookup span in characters used when an explicit override is
|
||||
* not provided.
|
||||
*/
|
||||
public static final int DEFAULT_MAX_EXPANDED_INDEX = 512;
|
||||
|
||||
/**
|
||||
* Number of child edges where linear scan is cheaper than binary search.
|
||||
@@ -58,24 +59,112 @@ public record CompiledNode<V>(char[] edgeLabels, CompiledNode<V>[] children, V[]
|
||||
private static final int LINEAR_CHILD_COUNT_THRESHOLD = 4;
|
||||
|
||||
/**
|
||||
* Creates one validated compiled node.
|
||||
* Edge labels in sorted ascending order.
|
||||
*/
|
||||
private final char[] edgeLabels;
|
||||
|
||||
/**
|
||||
* Sparse child array aligned with {@link #edgeLabels}.
|
||||
*/
|
||||
private final CompiledNode<V>[] children;
|
||||
|
||||
/**
|
||||
* Dense child lookup table used when labels fit into a compact char interval.
|
||||
* <p>
|
||||
* The table enables direct O(1) indexing for child lookup and is allocated
|
||||
* only when the character span of this node's edges is within the configured
|
||||
* threshold.
|
||||
* </p>
|
||||
*/
|
||||
private final CompiledNode<V>[] denseChildren;
|
||||
|
||||
/**
|
||||
* Normalized minimum edge value for the dense lookup table.
|
||||
*/
|
||||
private final int denseEdgeMin;
|
||||
|
||||
/**
|
||||
* Values stored at this node in local order.
|
||||
*/
|
||||
private final V[] orderedValues;
|
||||
|
||||
/**
|
||||
* Occurrence counts aligned with {@link #orderedValues}.
|
||||
*/
|
||||
private final int[] orderedCounts;
|
||||
|
||||
/**
|
||||
* Creates one validated compiled node using {@link #DEFAULT_MAX_EXPANDED_INDEX}
|
||||
* for dense lookup sizing.
|
||||
*
|
||||
* @throws NullPointerException if any array argument is {@code null}
|
||||
* @throws IllegalArgumentException if the edge-related arrays or value-related
|
||||
* arrays do not have matching lengths
|
||||
*/
|
||||
public CompiledNode {
|
||||
public CompiledNode(final char[] edgeLabels, final CompiledNode<V>[] children, final V[] orderedValues,
|
||||
final int... orderedCounts) {
|
||||
this(edgeLabels, children, orderedValues, DEFAULT_MAX_EXPANDED_INDEX, orderedCounts);
|
||||
}
|
||||
|
||||
/**
 * Creates one validated compiled node.
 * <p>
 * The supplied arrays are stored by reference, not copied. When the node has
 * at least one edge, {@code maxExpandedIndex} is positive, and the distance
 * between the smallest and the largest edge label does not exceed
 * {@code maxExpandedIndex}, a dense lookup table covering that label interval
 * is allocated in addition to the sparse arrays.
 * </p>
 * <p>
 * The label interval is derived from the actual minimum and maximum label
 * rather than from the first and last array element, so labels supplied out
 * of order cannot cause an out-of-bounds write while the table is filled.
 * </p>
 *
 * @param edgeLabels       edge labels of this node
 * @param children         child nodes aligned with {@code edgeLabels}
 * @param orderedValues    values stored at this node
 * @param maxExpandedIndex upper bound for the dense lookup interval size; zero
 *                         disables dense lookup. Larger values improve
 *                         direct-index likelihood while increasing dense
 *                         table memory in compact-label nodes.
 * @param orderedCounts    occurrence counts aligned with {@code orderedValues}
 * @throws NullPointerException     if any array argument is {@code null}
 * @throws IllegalArgumentException if the edge-related arrays or value-related
 *                                  arrays do not have matching lengths or the
 *                                  dense interval size is negative
 */
public CompiledNode(final char[] edgeLabels, final CompiledNode<V>[] children, final V[] orderedValues,
        final int maxExpandedIndex, final int... orderedCounts) {
    Objects.requireNonNull(edgeLabels, "edgeLabels");
    Objects.requireNonNull(children, "children");
    Objects.requireNonNull(orderedValues, "orderedValues");
    Objects.requireNonNull(orderedCounts, "orderedCounts");

    if (maxExpandedIndex < 0) {
        throw new IllegalArgumentException("maxExpandedIndex must be non-negative.");
    }

    if (edgeLabels.length != children.length) {
        throw new IllegalArgumentException("edgeLabels and children must have the same length.");
    }
    if (orderedValues.length != orderedCounts.length) {
        throw new IllegalArgumentException("orderedValues and orderedCounts must have the same length.");
    }

    this.edgeLabels = edgeLabels;
    this.children = children;
    this.orderedValues = orderedValues;
    this.orderedCounts = orderedCounts;

    CompiledNode<V>[] denseTable = null;
    int denseMin = 0;

    if (edgeLabels.length > 0 && maxExpandedIndex > 0) {
        // Scan for the real bounds; for sorted labels this equals first/last element.
        int minEdge = edgeLabels[0];
        int maxEdge = edgeLabels[0];
        for (final char label : edgeLabels) {
            minEdge = Math.min(minEdge, label);
            maxEdge = Math.max(maxEdge, label);
        }

        final int span = maxEdge - minEdge;
        if (span <= maxExpandedIndex) {
            @SuppressWarnings("unchecked")
            final CompiledNode<V>[] table = (CompiledNode<V>[]) new CompiledNode[span + 1];
            for (int edgeIndex = 0; edgeIndex < edgeLabels.length; edgeIndex++) {
                // Every label lies in [minEdge, maxEdge], so the index is always in range.
                table[edgeLabels[edgeIndex] - minEdge] = children[edgeIndex];
            }
            denseTable = table;
            denseMin = minEdge;
        }
    }

    this.denseChildren = denseTable;
    this.denseEdgeMin = denseMin;
}
|
||||
|
||||
/**
|
||||
@@ -87,7 +176,6 @@ public record CompiledNode<V>(char[] edgeLabels, CompiledNode<V>[] children, V[]
|
||||
*
|
||||
* @return internal edge-label array
|
||||
*/
|
||||
@Override
|
||||
@SuppressWarnings("PMD.MethodReturnsInternalArray")
|
||||
public char[] edgeLabels() {
|
||||
return this.edgeLabels;
|
||||
@@ -102,7 +190,6 @@ public record CompiledNode<V>(char[] edgeLabels, CompiledNode<V>[] children, V[]
|
||||
*
|
||||
* @return internal child-node array
|
||||
*/
|
||||
@Override
|
||||
@SuppressWarnings("PMD.MethodReturnsInternalArray")
|
||||
public CompiledNode<V>[] children() {
|
||||
return this.children;
|
||||
@@ -117,7 +204,6 @@ public record CompiledNode<V>(char[] edgeLabels, CompiledNode<V>[] children, V[]
|
||||
*
|
||||
* @return internal ordered-values array
|
||||
*/
|
||||
@Override
|
||||
@SuppressWarnings("PMD.MethodReturnsInternalArray")
|
||||
public V[] orderedValues() {
|
||||
return this.orderedValues;
|
||||
@@ -132,14 +218,143 @@ public record CompiledNode<V>(char[] edgeLabels, CompiledNode<V>[] children, V[]
|
||||
*
|
||||
* @return internal ordered-counts array
|
||||
*/
|
||||
@Override
|
||||
@SuppressWarnings("PMD.MethodReturnsInternalArray")
|
||||
public int[] orderedCounts() {
|
||||
return this.orderedCounts;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the number of child edges represented by this node.
|
||||
*
|
||||
* @return child edge count
|
||||
*/
|
||||
public int edgeCount() {
|
||||
return this.edgeLabels.length;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the number of values stored in this node.
|
||||
*
|
||||
* @return value count
|
||||
*/
|
||||
public int valueCount() {
|
||||
return this.orderedValues.length;
|
||||
}
|
||||
|
||||
/**
|
||||
* Indicates whether this node stores any values.
|
||||
*
|
||||
* @return {@code true} when values are present at this node
|
||||
*/
|
||||
public boolean hasValues() {
|
||||
return this.orderedValues.length > 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Indicates whether this node has child edges.
|
||||
*
|
||||
* @return {@code true} when this node has at least one outgoing edge
|
||||
*/
|
||||
public boolean hasChildren() {
|
||||
return this.edgeLabels.length > 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Indicates whether this node has no child edges.
|
||||
*
|
||||
* @return {@code true} when this node is a terminal leaf node
|
||||
*/
|
||||
public boolean isLeaf() {
|
||||
return !hasChildren();
|
||||
}
|
||||
|
||||
/**
|
||||
* Tests whether an edge label is present at this node.
|
||||
*
|
||||
* @param edge edge label
|
||||
* @return {@code true} if this node contains the supplied edge label
|
||||
*/
|
||||
public boolean hasEdge(final char edge) {
|
||||
return findChild(edge) != null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Indicates whether this node has a dense direct-index child lookup table.
|
||||
*
|
||||
* @return {@code true} when a direct-index child table is available
|
||||
*/
|
||||
public boolean hasDenseLookup() {
|
||||
return this.denseChildren != null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a small memory-related metric describing this node's dense table size.
|
||||
*
|
||||
* @return number of dense table slots, or {@code 0} when dense lookup is not
|
||||
* enabled
|
||||
*/
|
||||
public int denseTableLength() {
|
||||
return this.denseChildren == null ? 0 : this.denseChildren.length;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a compact structural summary used by diagnostics and tests.
|
||||
*
|
||||
* @return summary hash for node structure and contents
|
||||
*/
|
||||
@Override
|
||||
public int hashCode() {
|
||||
int hash = Arrays.hashCode(this.edgeLabels);
|
||||
hash = 31 * hash + Arrays.hashCode(this.children);
|
||||
hash = 31 * hash + Arrays.hashCode(this.orderedValues);
|
||||
hash = 31 * hash + Arrays.hashCode(this.orderedCounts);
|
||||
hash = 31 * hash + Objects.hash(this.denseEdgeMin);
|
||||
hash = 31 * hash + (hasDenseLookup() ? Arrays.hashCode(this.denseChildren) : 0);
|
||||
return hash;
|
||||
}
|
||||
|
||||
/**
|
||||
* Compares structural node content, including dense table availability.
|
||||
*
|
||||
* @param object comparison object
|
||||
* @return {@code true} when nodes describe identical structure and payload
|
||||
*/
|
||||
@Override
|
||||
public boolean equals(final Object object) {
|
||||
if (this == object) {
|
||||
return true;
|
||||
}
|
||||
if (!(object instanceof CompiledNode<?> other)) {
|
||||
return false;
|
||||
}
|
||||
return Arrays.equals(this.edgeLabels, other.edgeLabels) && Arrays.equals(this.children, other.children)
|
||||
&& Arrays.equals(this.orderedValues, other.orderedValues) && Arrays.equals(this.orderedCounts, other.orderedCounts)
|
||||
&& this.denseEdgeMin == other.denseEdgeMin && Arrays.equals(this.denseChildren, other.denseChildren);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a short summary useful for debugging and diagnostics.
|
||||
*
|
||||
* @return textual node summary
|
||||
*/
|
||||
@Override
|
||||
public String toString() {
|
||||
return "CompiledNode{"
|
||||
+ "edgeCount=" + this.edgeLabels.length + ", orderedValueCount=" + this.orderedValues.length
|
||||
+ ", denseTableLength=" + denseTableLength() + '}';
|
||||
}
|
||||
|
||||
/**
|
||||
* Finds a child for the supplied edge character.
|
||||
* <p>
|
||||
* Lookup order is:
|
||||
* </p>
|
||||
* <ol>
|
||||
* <li>dense array index (if the label interval is compact enough),</li>
|
||||
* <li>linear scan when no dense table exists and the node has
|
||||
* {@value #LINEAR_CHILD_COUNT_THRESHOLD} or fewer edges,</li>
|
||||
* <li>binary search over sorted labels.</li>
|
||||
* </ol>
|
||||
*
|
||||
* @param edge edge character
|
||||
* @return child node, or {@code null} if absent
|
||||
@@ -149,6 +364,15 @@ public record CompiledNode<V>(char[] edgeLabels, CompiledNode<V>[] children, V[]
|
||||
if (childCount == 0) {
|
||||
return null;
|
||||
}
|
||||
|
||||
if (this.denseChildren != null) {
|
||||
final int denseIndex = edge - this.denseEdgeMin;
|
||||
if (denseIndex < 0 || denseIndex >= this.denseChildren.length) {
|
||||
return null;
|
||||
}
|
||||
return this.denseChildren[denseIndex];
|
||||
}
|
||||
|
||||
if (childCount <= LINEAR_CHILD_COUNT_THRESHOLD) {
|
||||
for (int index = 0; index < childCount; index++) {
|
||||
if (this.edgeLabels[index] == edge) {
|
||||
|
||||
@@ -95,6 +95,8 @@ import org.junit.jupiter.params.provider.MethodSource;
|
||||
@Tag("integration")
|
||||
@Tag("cli")
|
||||
@Tag("stemmer")
|
||||
@Tag("compile")
|
||||
@Tag("slow")
|
||||
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
|
||||
@DisplayName("Compile integration")
|
||||
final class CompileIntegrationTest {
|
||||
@@ -192,6 +194,7 @@ final class CompileIntegrationTest {
|
||||
* @throws IOException if reading or writing fails
|
||||
*/
|
||||
@Test
|
||||
@Tag("slow")
|
||||
@DisplayName("CLI should compile the remark-aware fixture and preserve expected lookups")
|
||||
void shouldCompileRemarkAwareFixtureAndPreserveExpectedLookups() throws IOException {
|
||||
final Path inputFile = copyResourceToTemporaryFile(REMARK_AWARE_DICTIONARY_RESOURCE,
|
||||
@@ -237,6 +240,7 @@ final class CompileIntegrationTest {
|
||||
* @throws IOException if reading or writing fails
|
||||
*/
|
||||
@Test
|
||||
@Tag("slow")
|
||||
@DisplayName("CLI should require overwrite before replacing an existing output artifact")
|
||||
void shouldRequireOverwriteForExistingOutput() throws IOException {
|
||||
final Path inputFile = copyResourceToTemporaryFile(REMARK_AWARE_DICTIONARY_RESOURCE,
|
||||
@@ -301,6 +305,7 @@ final class CompileIntegrationTest {
|
||||
|
||||
@Nested
|
||||
@DisplayName("Bundled project dictionary workflows")
|
||||
@Tag("slow")
|
||||
final class BundledProjectDictionaryWorkflows {
|
||||
|
||||
/**
|
||||
@@ -322,6 +327,7 @@ final class CompileIntegrationTest {
|
||||
*/
|
||||
@ParameterizedTest(name = "[{index}] {0}")
|
||||
@MethodSource("org.egothor.stemmer.CompileIntegrationTest#bundledDictionaryCases")
|
||||
@Tag("slow")
|
||||
@DisplayName("CLI should compile bundled project dictionaries and preserve representative variant semantics")
|
||||
void shouldCompileBundledProjectDictionaryAndPreserveRepresentativeVariantSemantics(final String scenario,
|
||||
final String resourcePath) throws IOException {
|
||||
|
||||
@@ -66,7 +66,10 @@ import org.junit.jupiter.api.io.TempDir;
|
||||
* {@link System#exit(int)}.
|
||||
* </p>
|
||||
*/
|
||||
@Tag("unit")
|
||||
@Tag("integration")
|
||||
@Tag("cli")
|
||||
@Tag("compile")
|
||||
@Tag("stemmer")
|
||||
@DisplayName("Compile")
|
||||
class CompileTest {
|
||||
|
||||
|
||||
@@ -70,10 +70,11 @@ import org.junit.jupiter.params.provider.MethodSource;
|
||||
* <li>compressed artifact reproducibility within the active format version</li>
|
||||
* </ul>
|
||||
*/
|
||||
@Tag("unit")
|
||||
@Tag("compat")
|
||||
@Tag("regression")
|
||||
@Tag("determinism")
|
||||
@Tag("serialization")
|
||||
@Tag("trie")
|
||||
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
|
||||
final class CompiledTrieArtifactRegressionTest {
|
||||
|
||||
|
||||
@@ -41,7 +41,8 @@ import org.junit.jupiter.api.Test;
|
||||
* Unit tests for {@link DiacriticStripper}.
|
||||
*/
|
||||
@Tag("unit")
|
||||
@Tag("diacritics")
|
||||
@Tag("diacritic")
|
||||
@Tag("stemmer")
|
||||
@DisplayName("DiacriticStripper")
|
||||
class DiacriticStripperTest {
|
||||
|
||||
|
||||
@@ -59,7 +59,7 @@ import org.junit.jupiter.api.Test;
|
||||
*/
|
||||
@DisplayName("FrequencyTrieBuilders")
|
||||
@Tag("unit")
|
||||
@Tag("builder")
|
||||
@Tag("construction")
|
||||
@Tag("frequency-trie")
|
||||
class FrequencyTrieBuildersTest {
|
||||
|
||||
|
||||
@@ -47,7 +47,7 @@ import java.util.List;
|
||||
import net.jqwik.api.ForAll;
|
||||
import net.jqwik.api.Label;
|
||||
import net.jqwik.api.Property;
|
||||
import net.jqwik.api.Tag;
|
||||
import org.junit.jupiter.api.Tag;
|
||||
|
||||
/**
|
||||
* Property-based tests for the compiled trie abstraction.
|
||||
@@ -59,9 +59,9 @@ import net.jqwik.api.Tag;
|
||||
* core algorithm without overfitting to particular fixture data.
|
||||
*/
|
||||
@Label("FrequencyTrie properties")
|
||||
@Tag("unit")
|
||||
@Tag("property")
|
||||
@Tag("trie")
|
||||
@Tag("frequency-trie")
|
||||
class FrequencyTrieProperties extends PropertyBasedTestSupport {
|
||||
|
||||
/**
|
||||
|
||||
@@ -33,6 +33,7 @@ package org.egothor.stemmer;
|
||||
import static org.junit.jupiter.api.Assertions.assertAll;
|
||||
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertFalse;
|
||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||
import static org.junit.jupiter.api.Assertions.assertNull;
|
||||
import static org.junit.jupiter.api.Assertions.assertSame;
|
||||
@@ -379,6 +380,24 @@ class FrequencyTrieTest {
|
||||
assertThrows(UnsupportedOperationException.class, () -> entries.add(new ValueCount<String>("z", 1)));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that {@link FrequencyTrie#getEntries(String)} short-circuits to a one-item immutable list.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("getEntries returns a one-item list for single stored values")
|
||||
void getEntriesReturnsSingleItemListForSingleStoredValue() {
|
||||
final FrequencyTrie.Builder<String> builder = rankedBuilder();
|
||||
|
||||
builder.put("gamma", "only");
|
||||
|
||||
final FrequencyTrie<String> trie = builder.build();
|
||||
|
||||
final List<ValueCount<String>> entries = trie.getEntries("gamma");
|
||||
|
||||
assertAll(() -> assertEquals(List.of(new ValueCount<String>("only", 1)), entries),
|
||||
() -> assertThrows(UnsupportedOperationException.class, () -> entries.add(new ValueCount<String>("z", 1))));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that equal frequencies prefer the shorter string representation.
|
||||
*/
|
||||
@@ -755,6 +774,115 @@ class FrequencyTrieTest {
|
||||
.readFrom(new ByteArrayInputStream(serializedEmptyTrie), String[]::new, null)));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that reading a compiled trie with a negative max-expanded override
|
||||
* smaller than -1 is rejected.
|
||||
*/
|
||||
@Test
|
||||
@Tag("persistence")
|
||||
@DisplayName("readFrom rejects invalid maxExpandedIndex override")
|
||||
void readFromRejectsInvalidMaxExpandedIndexOverride() {
|
||||
final byte[] bytes = createSerializedStream(0x45475452, 1, 1, 0, new NodeWriter[] { dataOutput -> {
|
||||
dataOutput.writeInt(0);
|
||||
dataOutput.writeInt(0);
|
||||
} });
|
||||
|
||||
final IllegalArgumentException exception = assertThrows(IllegalArgumentException.class,
|
||||
() -> FrequencyTrie.readFrom(new ByteArrayInputStream(bytes), String[]::new, STRING_CODEC, -2));
|
||||
|
||||
assertEquals("maxExpandedIndex must be >= -1.", exception.getMessage());
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that the max-expanded override controls dense lookup materialization
|
||||
* while preserving lookup semantics.
|
||||
*/
|
||||
@Test
|
||||
@Tag("persistence")
|
||||
@DisplayName("readFrom respects dense lookup max-expanded index override")
|
||||
void readFromRespectsDenseLookupMaxExpandedIndexOverride() throws IOException {
|
||||
final FrequencyTrie.Builder<String> builder = rankedBuilder();
|
||||
|
||||
builder.put("a", "a");
|
||||
builder.put("b", "b");
|
||||
builder.put("c", "c");
|
||||
builder.put("d", "d");
|
||||
|
||||
final FrequencyTrie<String> original = builder.build();
|
||||
final ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
|
||||
original.writeTo(outputStream, STRING_CODEC);
|
||||
final byte[] serializedTrie = outputStream.toByteArray();
|
||||
|
||||
final FrequencyTrie<String> defaultDense = FrequencyTrie.readFrom(new ByteArrayInputStream(serializedTrie), String[]::new,
|
||||
STRING_CODEC);
|
||||
final FrequencyTrie<String> defaultDenseByNegative = FrequencyTrie.readFrom(new ByteArrayInputStream(serializedTrie),
|
||||
String[]::new, STRING_CODEC, -1);
|
||||
final FrequencyTrie<String> disabledDense = FrequencyTrie.readFrom(new ByteArrayInputStream(serializedTrie), String[]::new,
|
||||
STRING_CODEC, 0);
|
||||
|
||||
assertAll(
|
||||
() -> assertTrue(defaultDense.root().hasDenseLookup(),
|
||||
"Default read should enable dense lookup for compact first-level edges."),
|
||||
() -> assertTrue(defaultDenseByNegative.root().hasDenseLookup(),
|
||||
"Negative override should use the default dense lookup span."),
|
||||
() -> assertFalse(disabledDense.root().hasDenseLookup(),
|
||||
"Zero override should disable dense lookup tables."),
|
||||
() -> assertEquals(original.get("a"), disabledDense.get("a")),
|
||||
() -> assertEquals(original.get("b"), disabledDense.get("b")),
|
||||
() -> assertEquals(original.get("c"), disabledDense.get("c")),
|
||||
() -> assertEquals(original.get("d"), disabledDense.get("d")),
|
||||
() -> assertEquals(original.get("z"), disabledDense.get("z")));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that cyclic serialized node references are rejected as invalid
|
||||
* serialization.
|
||||
*/
|
||||
@Test
|
||||
@Tag("persistence")
|
||||
@DisplayName("readFrom rejects cyclic serialized node references")
|
||||
void readFromRejectsCyclicSerializedNodeReferences() {
|
||||
final byte[] bytes = createSerializedStream(0x45475452, 1, 2, 0, new NodeWriter[] {
|
||||
dataOutput -> {
|
||||
dataOutput.writeInt(1);
|
||||
dataOutput.writeChar('b');
|
||||
dataOutput.writeInt(1);
|
||||
dataOutput.writeInt(0);
|
||||
},
|
||||
dataOutput -> {
|
||||
dataOutput.writeInt(1);
|
||||
dataOutput.writeChar('a');
|
||||
dataOutput.writeInt(0);
|
||||
dataOutput.writeInt(0);
|
||||
} });
|
||||
|
||||
final IOException exception = assertThrows(IOException.class,
|
||||
() -> FrequencyTrie.readFrom(new ByteArrayInputStream(bytes), String[]::new, STRING_CODEC));
|
||||
|
||||
assertTrue(exception.getMessage().contains("cyclic reference detected"));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that child node references outside the valid serialized range are
|
||||
* rejected.
|
||||
*/
|
||||
@Test
|
||||
@Tag("persistence")
|
||||
@DisplayName("readFrom rejects invalid child node identifiers")
|
||||
void readFromRejectsInvalidChildNodeId() {
|
||||
final byte[] bytes = createSerializedStream(0x45475452, 1, 1, 0, new NodeWriter[] { dataOutput -> {
|
||||
dataOutput.writeInt(1);
|
||||
dataOutput.writeChar('a');
|
||||
dataOutput.writeInt(3);
|
||||
dataOutput.writeInt(0);
|
||||
} });
|
||||
|
||||
final IOException exception = assertThrows(IOException.class,
|
||||
() -> FrequencyTrie.readFrom(new ByteArrayInputStream(bytes), String[]::new, STRING_CODEC));
|
||||
|
||||
assertTrue(exception.getMessage().contains("Invalid child node id"));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that deserialization rejects an invalid stream magic header.
|
||||
*/
|
||||
@@ -785,6 +913,27 @@ class FrequencyTrieTest {
|
||||
assertTrue(exception.getMessage().contains("Unsupported trie stream version"));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that the latest stream version validates textual metadata blocks.
|
||||
*/
|
||||
@Test
|
||||
@Tag("persistence")
|
||||
@DisplayName("readFrom rejects invalid textual metadata block")
|
||||
void readFromRejectsInvalidTextualMetadataBlock() {
|
||||
final int version = FrequencyTrie.currentFormatVersion();
|
||||
final byte[] bytes = createSerializedStream(0x45475452, version, 1, 0, dataOutput -> {
|
||||
dataOutput.writeUTF("not valid metadata");
|
||||
}, new NodeWriter[] { dataOutput -> {
|
||||
dataOutput.writeInt(0);
|
||||
dataOutput.writeInt(0);
|
||||
} });
|
||||
|
||||
final IOException exception = assertThrows(IOException.class,
|
||||
() -> FrequencyTrie.readFrom(new ByteArrayInputStream(bytes), String[]::new, STRING_CODEC));
|
||||
|
||||
assertTrue(exception.getMessage().contains("Invalid metadata block"));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that deserialization rejects a negative node count.
|
||||
*/
|
||||
@@ -862,6 +1011,129 @@ class FrequencyTrieTest {
|
||||
assertTrue(exception.getMessage().contains("Non-positive stored count"));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that legacy version 1 metadata uses compatibility defaults.
|
||||
*/
|
||||
@Test
|
||||
@Tag("persistence")
|
||||
@DisplayName("readFrom supports legacy version 1 metadata")
|
||||
void readFromSupportsLegacyVersionOneMetadata() throws IOException {
|
||||
final byte[] bytes = createSerializedStream(0x45475452, 1, 1, 0, new NodeWriter[] { dataOutput -> {
|
||||
dataOutput.writeInt(0);
|
||||
dataOutput.writeInt(0);
|
||||
} });
|
||||
|
||||
final FrequencyTrie<String> trie = FrequencyTrie.readFrom(new ByteArrayInputStream(bytes), String[]::new, STRING_CODEC);
|
||||
|
||||
assertEquals(TrieMetadata.legacy(1, WordTraversalDirection.BACKWARD), trie.metadata());
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that legacy version 2 metadata stores traversal direction and uses
|
||||
* compatibility defaults for other values.
|
||||
*/
|
||||
@Test
|
||||
@Tag("persistence")
|
||||
@DisplayName("readFrom supports legacy version 2 metadata")
|
||||
void readFromSupportsLegacyVersionTwoMetadata() throws IOException {
|
||||
final byte[] bytes = createSerializedStream(0x45475452, 2, 1, 0,
|
||||
dataOutput -> dataOutput.writeInt(WordTraversalDirection.FORWARD.ordinal()), new NodeWriter[] { dataOutput -> {
|
||||
dataOutput.writeInt(0);
|
||||
dataOutput.writeInt(0);
|
||||
} });
|
||||
|
||||
final FrequencyTrie<String> trie = FrequencyTrie.readFrom(new ByteArrayInputStream(bytes), String[]::new, STRING_CODEC);
|
||||
|
||||
assertEquals(TrieMetadata.legacy(2, WordTraversalDirection.FORWARD), trie.metadata());
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that version 3 metadata includes reduction and diacritic
|
||||
* processing settings.
|
||||
*/
|
||||
@Test
|
||||
@Tag("persistence")
|
||||
@DisplayName("readFrom parses version 3 metadata")
|
||||
void readFromParsesVersionThreeMetadata() throws IOException {
|
||||
final ReductionSettings reductionSettings = new ReductionSettings(
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS, 81, 4);
|
||||
|
||||
final byte[] bytes = createSerializedStream(0x45475452, 3, 1, 0,
|
||||
dataOutput -> {
|
||||
dataOutput.writeInt(WordTraversalDirection.BACKWARD.ordinal());
|
||||
dataOutput.writeInt(reductionSettings.reductionMode().ordinal());
|
||||
dataOutput.writeInt(reductionSettings.dominantWinnerMinPercent());
|
||||
dataOutput.writeInt(reductionSettings.dominantWinnerOverSecondRatio());
|
||||
dataOutput.writeInt(DiacriticProcessingMode.REMOVE.ordinal());
|
||||
},
|
||||
new NodeWriter[] { dataOutput -> {
|
||||
dataOutput.writeInt(0);
|
||||
dataOutput.writeInt(0);
|
||||
} });
|
||||
|
||||
final FrequencyTrie<String> trie = FrequencyTrie.readFrom(new ByteArrayInputStream(bytes), String[]::new, STRING_CODEC);
|
||||
final TrieMetadata metadata = trie.metadata();
|
||||
|
||||
assertAll(() -> assertEquals(3, metadata.formatVersion()),
|
||||
() -> assertEquals(WordTraversalDirection.BACKWARD, metadata.traversalDirection()),
|
||||
() -> assertEquals(reductionSettings, metadata.reductionSettings()),
|
||||
() -> assertEquals(DiacriticProcessingMode.REMOVE, metadata.diacriticProcessingMode()),
|
||||
() -> assertEquals(CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT, metadata.caseProcessingMode()));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that version 4 metadata additionally stores case-processing mode.
|
||||
*/
|
||||
@Test
|
||||
@Tag("persistence")
|
||||
@DisplayName("readFrom parses version 4 case processing metadata")
|
||||
void readFromParsesVersionFourCaseMetadata() throws IOException {
|
||||
final ReductionSettings reductionSettings = new ReductionSettings(
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS, 75, 3);
|
||||
|
||||
final byte[] bytes = createSerializedStream(0x45475452, 4, 1, 0,
|
||||
dataOutput -> {
|
||||
dataOutput.writeInt(WordTraversalDirection.FORWARD.ordinal());
|
||||
dataOutput.writeInt(reductionSettings.reductionMode().ordinal());
|
||||
dataOutput.writeInt(reductionSettings.dominantWinnerMinPercent());
|
||||
dataOutput.writeInt(reductionSettings.dominantWinnerOverSecondRatio());
|
||||
dataOutput.writeInt(DiacriticProcessingMode.AS_IS.ordinal());
|
||||
dataOutput.writeInt(CaseProcessingMode.AS_IS.ordinal());
|
||||
},
|
||||
new NodeWriter[] { dataOutput -> {
|
||||
dataOutput.writeInt(0);
|
||||
dataOutput.writeInt(0);
|
||||
} });
|
||||
|
||||
final FrequencyTrie<String> trie = FrequencyTrie.readFrom(new ByteArrayInputStream(bytes), String[]::new, STRING_CODEC);
|
||||
final TrieMetadata metadata = trie.metadata();
|
||||
|
||||
assertAll(() -> assertEquals(4, metadata.formatVersion()),
|
||||
() -> assertEquals(WordTraversalDirection.FORWARD, metadata.traversalDirection()),
|
||||
() -> assertEquals(reductionSettings, metadata.reductionSettings()),
|
||||
() -> assertEquals(DiacriticProcessingMode.AS_IS, metadata.diacriticProcessingMode()),
|
||||
() -> assertEquals(CaseProcessingMode.AS_IS, metadata.caseProcessingMode()));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that invalid legacy metadata ordinals are rejected by validation.
|
||||
*/
|
||||
@Test
|
||||
@Tag("persistence")
|
||||
@DisplayName("readFrom rejects invalid metadata ordinal in legacy stream")
|
||||
void readFromRejectsInvalidLegacyMetadataOrdinal() {
|
||||
final byte[] bytes = createSerializedStream(0x45475452, 2, 1, 0,
|
||||
dataOutput -> dataOutput.writeInt(999), new NodeWriter[] { dataOutput -> {
|
||||
dataOutput.writeInt(0);
|
||||
dataOutput.writeInt(0);
|
||||
} });
|
||||
|
||||
final IOException exception = assertThrows(IOException.class,
|
||||
() -> FrequencyTrie.readFrom(new ByteArrayInputStream(bytes), String[]::new, STRING_CODEC));
|
||||
|
||||
assertTrue(exception.getMessage().contains("Invalid traversal direction ordinal"));
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes one node body into a synthetic serialized trie stream.
|
||||
*/
|
||||
@@ -889,6 +1161,24 @@ class FrequencyTrieTest {
|
||||
*/
|
||||
private static byte[] createSerializedStream(final int magic, final int version, final int nodeCount,
|
||||
final int rootNodeId, final NodeWriter[] nodes) {
|
||||
return createSerializedStream(magic, version, nodeCount, rootNodeId, dataOutput -> {
|
||||
// legacy and text-based versions write their metadata differently.
|
||||
}, nodes);
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes a synthetic serialized trie stream with a metadata writer hook.
|
||||
*
|
||||
* @param magic stream magic
|
||||
* @param version stream version
|
||||
* @param nodeCount declared node count
|
||||
* @param rootNodeId declared root node identifier
|
||||
* @param metadata version-specific metadata writer
|
||||
* @param nodes node body writers
|
||||
* @return serialized bytes
|
||||
*/
|
||||
private static byte[] createSerializedStream(final int magic, final int version, final int nodeCount,
|
||||
final int rootNodeId, final MetadataWriter metadata, final NodeWriter[] nodes) {
|
||||
try {
|
||||
final ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
|
||||
final DataOutputStream dataOutputStream = new DataOutputStream(byteArrayOutputStream);
|
||||
@@ -897,6 +1187,7 @@ class FrequencyTrieTest {
|
||||
dataOutputStream.writeInt(version);
|
||||
dataOutputStream.writeInt(nodeCount);
|
||||
dataOutputStream.writeInt(rootNodeId);
|
||||
metadata.write(dataOutputStream);
|
||||
|
||||
for (NodeWriter node : nodes) {
|
||||
node.write(dataOutputStream);
|
||||
@@ -908,4 +1199,19 @@ class FrequencyTrieTest {
|
||||
throw new IllegalStateException("Unexpected I/O while building synthetic trie stream.", exception);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Callback that writes the metadata section of a synthetic serialized trie
* stream. The stream builder invokes it after the version, node count and root
* node id have been written and before any node body is written.
|
||||
*/
|
||||
@FunctionalInterface
|
||||
private interface MetadataWriter {
|
||||
|
||||
/**
|
||||
* Writes the metadata bytes that belong to the stream version under test.
|
||||
*
|
||||
* @param dataOutput stream the synthetic trie is being assembled into
|
||||
* @throws IOException if writing to {@code dataOutput} fails
|
||||
*/
|
||||
void write(DataOutputStream dataOutput) throws IOException;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -65,10 +65,9 @@ import org.junit.jupiter.api.io.TempDir;
|
||||
* stems declared by the source dictionary.
|
||||
*/
|
||||
@DisplayName("Deterministic fuzz-style trie and stemmer compilation")
|
||||
@Tag("unit")
|
||||
@Tag("fuzz")
|
||||
@Tag("trie")
|
||||
@Tag("stemming")
|
||||
@Tag("stemmer")
|
||||
class FuzzStemmerAndTrieCompilationTest {
|
||||
|
||||
/**
|
||||
|
||||
@@ -36,7 +36,7 @@ import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||
import net.jqwik.api.ForAll;
|
||||
import net.jqwik.api.Label;
|
||||
import net.jqwik.api.Property;
|
||||
import net.jqwik.api.Tag;
|
||||
import org.junit.jupiter.api.Tag;
|
||||
|
||||
/**
|
||||
* Property-based tests for {@link PatchCommandEncoder}.
|
||||
@@ -47,9 +47,9 @@ import net.jqwik.api.Tag;
|
||||
* reconstruct the exact requested target.
|
||||
*/
|
||||
@Label("PatchCommandEncoder properties")
|
||||
@Tag("unit")
|
||||
@Tag("property")
|
||||
@Tag("patch")
|
||||
@Tag("stemmer")
|
||||
class PatchCommandEncoderProperties extends PropertyBasedTestSupport {
|
||||
|
||||
/**
|
||||
|
||||
@@ -241,7 +241,7 @@ class PatchCommandEncoderTest {
|
||||
*/
|
||||
@Nested
|
||||
@DisplayName("construction")
|
||||
@Tag("constructor")
|
||||
@Tag("construction")
|
||||
class ConstructionTests {
|
||||
|
||||
/**
|
||||
@@ -326,7 +326,7 @@ class PatchCommandEncoderTest {
|
||||
*/
|
||||
@Nested
|
||||
@DisplayName("encode(String, String)")
|
||||
@Tag("encode")
|
||||
@Tag("encoding")
|
||||
class EncodeTests {
|
||||
|
||||
/**
|
||||
@@ -658,7 +658,7 @@ class PatchCommandEncoderTest {
|
||||
*/
|
||||
@Nested
|
||||
@DisplayName("reversed-word processing")
|
||||
@Tag("reverse")
|
||||
@Tag("normalization")
|
||||
class ReversedWordProcessingTests {
|
||||
|
||||
/**
|
||||
|
||||
@@ -75,6 +75,7 @@ import org.junit.jupiter.api.io.TempDir;
|
||||
@DisplayName("StemmerDictionaryParser")
|
||||
@Tag("unit")
|
||||
@Tag("parser")
|
||||
@Tag("stemmer")
|
||||
class StemmerDictionaryParserTest {
|
||||
|
||||
/**
|
||||
|
||||
@@ -54,9 +54,9 @@ import org.junit.jupiter.api.io.TempDir;
|
||||
/**
|
||||
* Tests for {@link StemmerKnowledgeExperiment}.
|
||||
*/
|
||||
@Tag("unit")
|
||||
@Tag("integration")
|
||||
@Tag("stemmer")
|
||||
@Tag("trie")
|
||||
final class StemmerKnowledgeExperimentTest {
|
||||
|
||||
/**
|
||||
|
||||
@@ -38,6 +38,8 @@ import static org.junit.jupiter.api.Assertions.assertSame;
|
||||
import static org.junit.jupiter.api.Assertions.assertThrows;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
import static org.mockito.ArgumentMatchers.any;
|
||||
import static org.mockito.ArgumentMatchers.anyInt;
|
||||
import static org.mockito.ArgumentMatchers.eq;
|
||||
import static org.mockito.Mockito.mock;
|
||||
import static org.mockito.Mockito.mockStatic;
|
||||
import static org.mockito.Mockito.verify;
|
||||
@@ -91,6 +93,8 @@ import org.mockito.MockedStatic;
|
||||
@Tag("unit")
|
||||
@Tag("io")
|
||||
@Tag("persistence")
|
||||
@Tag("serialization")
|
||||
@Tag("trie")
|
||||
@DisplayName("StemmerPatchTrieBinaryIO")
|
||||
class StemmerPatchTrieBinaryIOTest {
|
||||
|
||||
@@ -299,9 +303,19 @@ class StemmerPatchTrieBinaryIOTest {
|
||||
"read(Path) must reject null path."),
|
||||
() -> assertThrows(NullPointerException.class, () -> StemmerPatchTrieBinaryIO.read((String) null),
|
||||
"read(String) must reject null file name."),
|
||||
() -> assertThrows(NullPointerException.class,
|
||||
() -> StemmerPatchTrieBinaryIO.read((Path) null, FrequencyTrie.DEFAULT_MAX_EXPANDED_INDEX),
|
||||
"read(Path, int) must reject null path."),
|
||||
() -> assertThrows(NullPointerException.class,
|
||||
() -> StemmerPatchTrieBinaryIO.read((String) null,
|
||||
FrequencyTrie.DEFAULT_MAX_EXPANDED_INDEX),
|
||||
"read(String, int) must reject null file name."),
|
||||
() -> assertThrows(NullPointerException.class,
|
||||
() -> StemmerPatchTrieBinaryIO.read((ByteArrayInputStream) null),
|
||||
"read(InputStream) must reject null input stream."));
|
||||
"read(InputStream) must reject null input stream."),
|
||||
() -> assertThrows(NullPointerException.class,
|
||||
() -> StemmerPatchTrieBinaryIO.read((ByteArrayInputStream) null, FrequencyTrie.DEFAULT_MAX_EXPANDED_INDEX),
|
||||
"read(InputStream, int) must reject null input stream."));
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -385,6 +399,143 @@ class StemmerPatchTrieBinaryIOTest {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that stream overload with dense span override delegates to the
|
||||
* four-argument readFrom method.
|
||||
*/
|
||||
@SuppressWarnings("unchecked")
|
||||
@Test
|
||||
@DisplayName("Should delegate stream read with dense span override")
|
||||
void shouldDelegateInputStreamReadWithDenseSpanOverride() throws IOException {
|
||||
final FrequencyTrie<String> expectedTrie = mock(FrequencyTrie.class);
|
||||
final byte[] gzipPayload = gzip("binary-content-with-max-expanded-index");
|
||||
|
||||
try (@SuppressWarnings("rawtypes")
|
||||
MockedStatic<FrequencyTrie> mockedStatic = mockStatic(FrequencyTrie.class)) {
|
||||
mockedStatic.when(() -> FrequencyTrie.readFrom(any(DataInputStream.class), any(),
|
||||
any(FrequencyTrie.ValueStreamCodec.class), anyInt())).thenReturn(expectedTrie);
|
||||
|
||||
final FrequencyTrie<String> actualTrie = StemmerPatchTrieBinaryIO
|
||||
.read(new ByteArrayInputStream(gzipPayload), 17);
|
||||
|
||||
assertSame(expectedTrie, actualTrie,
|
||||
"read(InputStream, int) must return the trie produced by FrequencyTrie.readFrom(...).");
|
||||
|
||||
mockedStatic.verify(() -> FrequencyTrie.readFrom(any(DataInputStream.class), any(),
|
||||
any(FrequencyTrie.ValueStreamCodec.class), eq(17)));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that path overload with dense span override delegates to the
|
||||
* same method overload with the override parameter.
|
||||
*/
|
||||
@SuppressWarnings("unchecked")
|
||||
@Test
|
||||
@DisplayName("Should delegate path read with dense span override")
|
||||
void shouldDelegatePathReadWithDenseSpanOverride() throws IOException {
|
||||
final FrequencyTrie<String> expectedTrie = mock(FrequencyTrie.class);
|
||||
final Path sourceFile = temporaryDirectory.resolve("input-max-expanded.bin.gz");
|
||||
Files.write(sourceFile, gzip("path-based-max-expanded-index"));
|
||||
|
||||
try (@SuppressWarnings("rawtypes")
|
||||
MockedStatic<FrequencyTrie> mockedStatic = mockStatic(FrequencyTrie.class)) {
|
||||
mockedStatic.when(() -> FrequencyTrie.readFrom(any(DataInputStream.class), any(),
|
||||
any(FrequencyTrie.ValueStreamCodec.class), anyInt())).thenReturn(expectedTrie);
|
||||
|
||||
final FrequencyTrie<String> actualTrie = StemmerPatchTrieBinaryIO.read(sourceFile, 0);
|
||||
|
||||
assertSame(expectedTrie, actualTrie,
|
||||
"read(Path, int) must return the trie produced by FrequencyTrie.readFrom(...).");
|
||||
|
||||
mockedStatic.verify(() -> FrequencyTrie.readFrom(any(DataInputStream.class), any(),
|
||||
any(FrequencyTrie.ValueStreamCodec.class), eq(0)));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that string path overload with dense span override delegates to the
|
||||
* same method overload with the override parameter.
|
||||
*/
|
||||
@SuppressWarnings("unchecked")
|
||||
@Test
|
||||
@DisplayName("Should delegate file name read with dense span override")
|
||||
void shouldDelegateStringReadWithDenseSpanOverride() throws IOException {
|
||||
final FrequencyTrie<String> expectedTrie = mock(FrequencyTrie.class);
|
||||
final Path sourceFile = temporaryDirectory.resolve("input-string-max-expanded.bin.gz");
|
||||
Files.write(sourceFile, gzip("string-based-max-expanded-index"));
|
||||
|
||||
try (@SuppressWarnings("rawtypes")
|
||||
MockedStatic<FrequencyTrie> mockedStatic = mockStatic(FrequencyTrie.class)) {
|
||||
mockedStatic.when(() -> FrequencyTrie.readFrom(any(DataInputStream.class), any(),
|
||||
any(FrequencyTrie.ValueStreamCodec.class), anyInt())).thenReturn(expectedTrie);
|
||||
|
||||
final FrequencyTrie<String> actualTrie = StemmerPatchTrieBinaryIO.read(sourceFile.toString(), 32);
|
||||
|
||||
assertSame(expectedTrie, actualTrie,
|
||||
"read(String, int) must return the trie produced by FrequencyTrie.readFrom(...).");
|
||||
|
||||
mockedStatic.verify(() -> FrequencyTrie.readFrom(any(DataInputStream.class), any(),
|
||||
any(FrequencyTrie.ValueStreamCodec.class), eq(32)));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that metadata-only read parses and returns the persisted metadata.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("Should read metadata from gzip payload")
|
||||
void shouldReadMetadataFromGzipPayload() throws IOException {
|
||||
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<String>(String[]::new,
|
||||
ReductionSettings.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS));
|
||||
builder.put("run", PatchCommandEncoder.builder().build().encode("running", "run"));
|
||||
final FrequencyTrie<String> trie = builder.build();
|
||||
|
||||
final ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
|
||||
StemmerPatchTrieBinaryIO.write(trie, outputStream);
|
||||
|
||||
final TrieMetadata metadata = StemmerPatchTrieBinaryIO.readMetadata(new ByteArrayInputStream(outputStream.toByteArray()));
|
||||
|
||||
assertEquals(trie.metadata(), metadata,
|
||||
"readMetadata(InputStream) must return the same metadata persisted by write().");
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that metadata can be read from a binary file path.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("Should read metadata from file path")
|
||||
void shouldReadMetadataFromPath() throws IOException {
|
||||
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<String>(String[]::new,
|
||||
ReductionSettings.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS));
|
||||
builder.put("city", PatchCommandEncoder.builder().build().encode("cities", "city"));
|
||||
final FrequencyTrie<String> trie = builder.build();
|
||||
|
||||
final Path sourceFile = temporaryDirectory.resolve("metadata-path.bin.gz");
|
||||
StemmerPatchTrieBinaryIO.write(trie, sourceFile);
|
||||
|
||||
final TrieMetadata metadata = StemmerPatchTrieBinaryIO.readMetadata(sourceFile);
|
||||
assertEquals(trie.metadata(), metadata);
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that metadata can be read from a binary file name.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("Should read metadata from file name")
|
||||
void shouldReadMetadataFromStringPath() throws IOException {
|
||||
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<String>(String[]::new,
|
||||
ReductionSettings.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS));
|
||||
builder.put("city", PatchCommandEncoder.builder().build().encode("cities", "city"));
|
||||
final FrequencyTrie<String> trie = builder.build();
|
||||
|
||||
final Path sourceFile = temporaryDirectory.resolve("metadata-string.bin.gz");
|
||||
StemmerPatchTrieBinaryIO.write(trie, sourceFile);
|
||||
|
||||
final TrieMetadata metadata = StemmerPatchTrieBinaryIO.readMetadata(sourceFile.toString());
|
||||
assertEquals(trie.metadata(), metadata);
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that malformed non-GZip input is reported as an I/O failure.
|
||||
*/
|
||||
|
||||
@@ -85,9 +85,10 @@ import org.junit.jupiter.params.provider.MethodSource;
|
||||
* <li>the current bundled language set, including right-to-left metadata</li>
|
||||
* </ul>
|
||||
*/
|
||||
@Tag("unit")
|
||||
@Tag("integration")
|
||||
@Tag("stemmer")
|
||||
@Tag("io")
|
||||
@Tag("parser")
|
||||
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
|
||||
final class StemmerPatchTrieLoaderTest {
|
||||
|
||||
@@ -210,36 +211,43 @@ final class StemmerPatchTrieLoaderTest {
|
||||
Arguments.of("14-load-binary-string",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinary((String) null),
|
||||
StemmerPatchTrieLoader.FILENAME_REQUIRED),
|
||||
Arguments.of("15-load-binary-stream",
|
||||
Arguments.of("15-load-binary-path-override",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinary((Path) null, FrequencyTrie.DEFAULT_MAX_EXPANDED_INDEX),
|
||||
"path"),
|
||||
Arguments.of("16-load-binary-string-override",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinary((String) null,
|
||||
FrequencyTrie.DEFAULT_MAX_EXPANDED_INDEX),
|
||||
StemmerPatchTrieLoader.FILENAME_REQUIRED),
|
||||
Arguments.of("17-load-binary-stream",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinary((InputStream) null),
|
||||
"inputStream"),
|
||||
Arguments.of("16-save-binary-null-trie-path",
|
||||
Arguments.of("18-save-binary-null-trie-path",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.saveBinary(null, tempPath()), "trie"),
|
||||
Arguments.of("17-save-binary-null-path",
|
||||
Arguments.of("19-save-binary-null-path",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.saveBinary(trie, (Path) null), "path"),
|
||||
Arguments.of("18-save-binary-null-trie-string",
|
||||
Arguments.of("20-save-binary-null-trie-string",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.saveBinary(null, tempPath().toString()),
|
||||
"trie"),
|
||||
Arguments.of("19-save-binary-null-string",
|
||||
Arguments.of("21-save-binary-null-string",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.saveBinary(trie, (String) null),
|
||||
StemmerPatchTrieLoader.FILENAME_REQUIRED),
|
||||
Arguments.of("20-load-language-null-metadata",
|
||||
Arguments.of("22-load-language-null-metadata",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(StemmerPatchTrieLoader.Language.US_UK,
|
||||
true, (TrieMetadata) null),
|
||||
"metadata"),
|
||||
Arguments.of("21-load-path-null-metadata",
|
||||
Arguments.of("23-load-path-null-metadata",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(tempPath(), true, (TrieMetadata) null),
|
||||
"metadata"),
|
||||
Arguments.of("22-load-string-null-metadata",
|
||||
Arguments.of("24-load-string-null-metadata",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(tempPath().toString(), true,
|
||||
(TrieMetadata) null),
|
||||
"metadata"),
|
||||
Arguments.of("23-load-binary-metadata-path-null",
|
||||
Arguments.of("25-load-binary-metadata-path-null",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinaryMetadata((Path) null), "path"),
|
||||
Arguments.of("24-load-binary-metadata-string-null",
|
||||
Arguments.of("26-load-binary-metadata-string-null",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinaryMetadata((String) null),
|
||||
StemmerPatchTrieLoader.FILENAME_REQUIRED),
|
||||
Arguments.of("25-load-binary-metadata-stream-null",
|
||||
Arguments.of("27-load-binary-metadata-stream-null",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinaryMetadata((InputStream) null),
|
||||
"inputStream"));
|
||||
}
|
||||
@@ -512,6 +520,44 @@ final class StemmerPatchTrieLoaderTest {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that binary load overloads with an explicit dense lookup span
|
||||
* preserve trie semantics while honoring the dense-layout override.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("Binary dense-span override overloads should load equivalent tries")
|
||||
void shouldLoadBinaryWithDenseSpanOverrideOverloads() throws IOException {
|
||||
final Path dictionaryFile = writeDictionary("""
|
||||
run running runs runner
|
||||
city cities
|
||||
study studies studying
|
||||
""");
|
||||
final Path binaryFile = tempDir.resolve("stemmer-trie-overrides.bin.gz");
|
||||
|
||||
final FrequencyTrie<String> original = StemmerPatchTrieLoader.load(dictionaryFile, true,
|
||||
DEFAULT_REDUCTION_MODE);
|
||||
|
||||
StemmerPatchTrieLoader.saveBinary(original, binaryFile);
|
||||
|
||||
final FrequencyTrie<String> fromPathDefault = StemmerPatchTrieLoader.loadBinary(binaryFile);
|
||||
final FrequencyTrie<String> fromPathDefaultByNegative = StemmerPatchTrieLoader.loadBinary(binaryFile,
|
||||
FrequencyTrie.DEFAULT_MAX_EXPANDED_INDEX);
|
||||
final FrequencyTrie<String> fromPathNoDense = StemmerPatchTrieLoader.loadBinary(binaryFile, 0);
|
||||
final FrequencyTrie<String> fromStringNoDense = StemmerPatchTrieLoader.loadBinary(binaryFile.toString(), 0);
|
||||
|
||||
assertTriePatchSemanticsEqual(original, fromPathDefault, "run", "running", "runner", "cities", "studying");
|
||||
assertTriePatchSemanticsEqual(original, fromPathDefaultByNegative, "run", "running", "runner", "cities",
|
||||
"studying");
|
||||
assertTriePatchSemanticsEqual(original, fromPathNoDense, "run", "running", "runner", "cities", "studying");
|
||||
assertTriePatchSemanticsEqual(original, fromStringNoDense, "run", "running", "runner", "cities",
|
||||
"studying");
|
||||
|
||||
assertFalse(fromPathNoDense.root().hasDenseLookup(),
|
||||
"Zero span should disable dense lookup on the loaded root.");
|
||||
assertFalse(fromStringNoDense.root().hasDenseLookup(),
|
||||
"Zero span should disable dense lookup on the loaded root.");
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes a dictionary file into the temporary directory.
|
||||
*
|
||||
@@ -530,6 +576,7 @@ final class StemmerPatchTrieLoaderTest {
|
||||
* Bundled dictionary integration tests.
|
||||
*/
|
||||
@Nested
|
||||
@Tag("slow")
|
||||
@DisplayName("Bundled dictionaries")
|
||||
final class BundledDictionaryTests {
|
||||
|
||||
|
||||
@@ -44,7 +44,7 @@ import java.util.Set;
|
||||
import net.jqwik.api.ForAll;
|
||||
import net.jqwik.api.Label;
|
||||
import net.jqwik.api.Property;
|
||||
import net.jqwik.api.Tag;
|
||||
import org.junit.jupiter.api.Tag;
|
||||
|
||||
/**
|
||||
* Property-based tests for patch-command stemmer tries.
|
||||
@@ -56,9 +56,8 @@ import net.jqwik.api.Tag;
|
||||
* persistence must not alter that behavior.
|
||||
*/
|
||||
@Label("Stemmer patch trie properties")
|
||||
@Tag("unit")
|
||||
@Tag("property")
|
||||
@Tag("stemming")
|
||||
@Tag("stemmer")
|
||||
class StemmerPatchTrieProperties extends PropertyBasedTestSupport {
|
||||
|
||||
/**
|
||||
|
||||
@@ -40,6 +40,8 @@ import org.junit.jupiter.api.Tag;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
@Tag("unit")
|
||||
@Tag("metadata")
|
||||
@Tag("trie")
|
||||
@DisplayName("TrieMetadata")
|
||||
class TrieMetadataTest {
|
||||
|
||||
|
||||
@@ -40,6 +40,8 @@ import org.junit.jupiter.api.Tag;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
@Tag("unit")
|
||||
@Tag("core")
|
||||
@Tag("stemmer")
|
||||
@DisplayName("WordTraversalDirection")
|
||||
class WordTraversalDirectionTest {
|
||||
|
||||
|
||||
@@ -45,7 +45,7 @@ import org.junit.jupiter.api.Test;
|
||||
* Unit tests for {@link ChildDescriptor}.
|
||||
*/
|
||||
@Tag("unit")
|
||||
@Tag("fast")
|
||||
@Tag("trie")
|
||||
@DisplayName("ChildDescriptor")
|
||||
class ChildDescriptorTest {
|
||||
|
||||
|
||||
@@ -31,8 +31,10 @@
|
||||
package org.egothor.stemmer.trie;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertFalse;
|
||||
import static org.junit.jupiter.api.Assertions.assertSame;
|
||||
import static org.junit.jupiter.api.Assertions.assertThrows;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import org.junit.jupiter.api.DisplayName;
|
||||
import org.junit.jupiter.api.Tag;
|
||||
@@ -43,7 +45,6 @@ import org.junit.jupiter.api.Test;
|
||||
* documented backing-array exposure.
|
||||
*/
|
||||
@Tag("unit")
|
||||
@Tag("fast")
|
||||
@Tag("trie")
|
||||
@DisplayName("CompiledNode and NodeData")
|
||||
class CompiledNodeAndNodeDataTest {
|
||||
@@ -141,4 +142,136 @@ class CompiledNodeAndNodeDataTest {
|
||||
assertSame(orderedValues, node.orderedValues());
|
||||
assertSame(orderedCounts, node.orderedCounts());
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that dense lookup is used when the interval is compact.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("CompiledNode can resolve child via dense lookup table")
|
||||
void compiledNodeUsesDenseLookupForCompactIntervals() {
|
||||
@SuppressWarnings("unchecked")
|
||||
final CompiledNode<String>[] children = new CompiledNode[4];
|
||||
children[0] = new CompiledNode<>(new char[0], new CompiledNode[0], new String[0], new int[0]);
|
||||
children[1] = new CompiledNode<>(new char[0], new CompiledNode[0], new String[0], new int[0]);
|
||||
children[2] = new CompiledNode<>(new char[0], new CompiledNode[0], new String[0], new int[0]);
|
||||
children[3] = new CompiledNode<>(new char[0], new CompiledNode[0], new String[0], new int[0]);
|
||||
|
||||
final CompiledNode<String> node = new CompiledNode<>(new char[] { 'a', 'b', 'c', 'd' }, children,
|
||||
new String[] { "1", "2", "3", "4" }, new int[] { 1, 1, 1, 1 });
|
||||
|
||||
assertTrue(node.hasDenseLookup());
|
||||
|
||||
assertSame(children[0], node.findChild('a'));
|
||||
assertSame(children[3], node.findChild('d'));
|
||||
assertSame(null, node.findChild('z'));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that fallback linear scan is used for small node degree.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("CompiledNode resolves child by linear scan for small degree")
|
||||
void compiledNodeUsesLinearScanForSmallDegree() {
|
||||
@SuppressWarnings("unchecked")
|
||||
final CompiledNode<String>[] children = new CompiledNode[4];
|
||||
final CompiledNode<String> childA = new CompiledNode<>(new char[0], new CompiledNode[0], new String[0], new int[0]);
|
||||
final CompiledNode<String> childB = new CompiledNode<>(new char[0], new CompiledNode[0], new String[0], new int[0]);
|
||||
final CompiledNode<String> childC = new CompiledNode<>(new char[0], new CompiledNode[0], new String[0], new int[0]);
|
||||
final CompiledNode<String> childD = new CompiledNode<>(new char[0], new CompiledNode[0], new String[0], new int[0]);
|
||||
children[0] = childA;
|
||||
children[1] = childB;
|
||||
children[2] = childC;
|
||||
children[3] = childD;
|
||||
|
||||
final CompiledNode<String> node = new CompiledNode<>(new char[] { 'a', 'z', '中', '你' }, children,
|
||||
new String[] { "1", "2", "3", "4" }, 0, new int[] { 1, 1, 1, 1 });
|
||||
|
||||
assertFalse(node.hasDenseLookup());
|
||||
|
||||
assertSame(childA, node.findChild('a'));
|
||||
assertSame(childD, node.findChild('你'));
|
||||
assertSame(null, node.findChild('b'));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that fallback binary search is used for larger node degree without
|
||||
* dense lookup.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("CompiledNode resolves child by binary search for large degree")
|
||||
void compiledNodeUsesBinarySearchForLargeDegree() {
|
||||
@SuppressWarnings("unchecked")
|
||||
final CompiledNode<String>[] children = new CompiledNode[5];
|
||||
final CompiledNode<String> childA = new CompiledNode<>(new char[0], new CompiledNode[0], new String[0], new int[0]);
|
||||
final CompiledNode<String> childB = new CompiledNode<>(new char[0], new CompiledNode[0], new String[0], new int[0]);
|
||||
final CompiledNode<String> childC = new CompiledNode<>(new char[0], new CompiledNode[0], new String[0], new int[0]);
|
||||
final CompiledNode<String> childD = new CompiledNode<>(new char[0], new CompiledNode[0], new String[0], new int[0]);
|
||||
final CompiledNode<String> childE = new CompiledNode<>(new char[0], new CompiledNode[0], new String[0], new int[0]);
|
||||
children[0] = childA;
|
||||
children[1] = childB;
|
||||
children[2] = childC;
|
||||
children[3] = childD;
|
||||
children[4] = childE;
|
||||
|
||||
final CompiledNode<String> node = new CompiledNode<>(new char[] { 'a', 'c', 'k', 't', 'z' }, children,
|
||||
new String[] { "1", "2", "3", "4", "5" }, 0, new int[] { 1, 1, 1, 1, 1 });
|
||||
|
||||
assertFalse(node.hasDenseLookup());
|
||||
|
||||
assertSame(childC, node.findChild('k'));
|
||||
assertSame(childE, node.findChild('z'));
|
||||
assertSame(null, node.findChild('x'));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies the basic node-state helpers that are used by diagnostics and
|
||||
* behavioral checks.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("CompiledNode reports leaf, value and edge presence state")
|
||||
void compiledNodeReportsNodeStateHelpers() {
|
||||
@SuppressWarnings("unchecked")
|
||||
final CompiledNode<String>[] childless = new CompiledNode[0];
|
||||
final CompiledNode<String> leaf = new CompiledNode<>(new char[0], childless, new String[0], new int[0]);
|
||||
|
||||
assertTrue(leaf.isLeaf());
|
||||
assertFalse(leaf.hasChildren());
|
||||
assertFalse(leaf.hasValues());
|
||||
assertFalse(leaf.hasEdge('a'));
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
final CompiledNode<String>[] child = new CompiledNode[1];
|
||||
final String[] orderedValues = new String[] { "leaf" };
|
||||
final int[] orderedCounts = new int[] { 1 };
|
||||
child[0] = new CompiledNode<>(new char[0], new CompiledNode[0], orderedValues, orderedCounts);
|
||||
final CompiledNode<String> node = new CompiledNode<>(new char[] { 'a' }, child, orderedValues, orderedCounts);
|
||||
|
||||
assertFalse(node.isLeaf());
|
||||
assertTrue(node.hasChildren());
|
||||
assertTrue(node.hasValues());
|
||||
assertTrue(node.valueCount() > 0);
|
||||
assertTrue(node.hasEdge('a'));
|
||||
assertFalse(node.hasEdge('b'));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies structural equality and hash-code behavior for compiled nodes.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("CompiledNode equals and hashCode align for identical structure")
|
||||
void compiledNodeEqualsAndHashCodeAlignForIdenticalStructure() {
|
||||
@SuppressWarnings("unchecked")
|
||||
final CompiledNode<String>[] child = new CompiledNode[1];
|
||||
final CompiledNode<String> leaf = new CompiledNode<>(new char[0], new CompiledNode[0], new String[] { "v" },
|
||||
new int[] { 1 });
|
||||
child[0] = leaf;
|
||||
|
||||
final CompiledNode<String> first = new CompiledNode<>(new char[] { 'a' }, child, new String[] { "x" },
|
||||
new int[] { 2 });
|
||||
final CompiledNode<String> second = new CompiledNode<>(new char[] { 'a' }, child, new String[] { "x" },
|
||||
new int[] { 2 });
|
||||
|
||||
assertEquals(first, second);
|
||||
assertEquals(first.hashCode(), second.hashCode());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -41,7 +41,7 @@ import org.junit.jupiter.api.Test;
|
||||
* Unit tests for {@link DominantLocalDescriptor}.
|
||||
*/
|
||||
@Tag("unit")
|
||||
@Tag("fast")
|
||||
@Tag("trie")
|
||||
@DisplayName("DominantLocalDescriptor")
|
||||
class DominantLocalDescriptorTest {
|
||||
|
||||
|
||||
@@ -50,7 +50,7 @@ import org.junit.jupiter.api.Test;
|
||||
* Unit tests for {@link LocalValueSummary}.
|
||||
*/
|
||||
@Tag("unit")
|
||||
@Tag("fast")
|
||||
@Tag("trie")
|
||||
@DisplayName("LocalValueSummary")
|
||||
class LocalValueSummaryTest {
|
||||
|
||||
|
||||
@@ -44,7 +44,7 @@ import org.junit.jupiter.api.Test;
|
||||
* Unit tests for {@link MutableNode}.
|
||||
*/
|
||||
@Tag("unit")
|
||||
@Tag("fast")
|
||||
@Tag("trie")
|
||||
@DisplayName("MutableNode")
|
||||
class MutableNodeTest {
|
||||
|
||||
|
||||
@@ -41,7 +41,7 @@ import org.junit.jupiter.api.Test;
|
||||
* Unit tests for {@link RankedLocalDescriptor}.
|
||||
*/
|
||||
@Tag("unit")
|
||||
@Tag("fast")
|
||||
@Tag("trie")
|
||||
@DisplayName("RankedLocalDescriptor")
|
||||
class RankedLocalDescriptorTest {
|
||||
|
||||
|
||||
@@ -48,7 +48,7 @@ import org.junit.jupiter.api.Test;
|
||||
* Unit tests for {@link ReducedNode}.
|
||||
*/
|
||||
@Tag("unit")
|
||||
@Tag("fast")
|
||||
@Tag("trie")
|
||||
@DisplayName("ReducedNode")
|
||||
class ReducedNodeTest {
|
||||
|
||||
|
||||
@@ -47,7 +47,7 @@ import org.junit.jupiter.api.Test;
|
||||
* Unit tests for {@link ReductionContext}.
|
||||
*/
|
||||
@Tag("unit")
|
||||
@Tag("fast")
|
||||
@Tag("trie")
|
||||
@DisplayName("ReductionContext")
|
||||
class ReductionContextTest {
|
||||
|
||||
|
||||
@@ -46,7 +46,7 @@ import org.junit.jupiter.api.Test;
|
||||
* Unit tests for {@link ReductionSignature}.
|
||||
*/
|
||||
@Tag("unit")
|
||||
@Tag("fast")
|
||||
@Tag("trie")
|
||||
@DisplayName("ReductionSignature")
|
||||
class ReductionSignatureTest {
|
||||
|
||||
|
||||
@@ -41,7 +41,7 @@ import org.junit.jupiter.api.Test;
|
||||
* Unit tests for {@link UnorderedLocalDescriptor}.
|
||||
*/
|
||||
@Tag("unit")
|
||||
@Tag("fast")
|
||||
@Tag("trie")
|
||||
@DisplayName("UnorderedLocalDescriptor")
|
||||
class UnorderedLocalDescriptorTest {
|
||||
|
||||
|
||||
Reference in New Issue
Block a user