Compare commits
17 Commits
release@1.
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
|
7bd0fc66ba
|
|||
|
dadab5514e
|
|||
|
50c3ab3432
|
|||
|
6ccce248ea
|
|||
|
5a511374f3
|
|||
|
48f21cab72
|
|||
|
39969463a2
|
|||
|
6dbdb4bae8
|
|||
|
2ab3e74048
|
|||
|
128fa919f2
|
|||
|
1f5decd6ea
|
|||
|
9eee321fef
|
|||
|
3e0f786042
|
|||
|
041b7f43fb
|
|||
|
8785f2b7cb
|
|||
|
4d939f5b6e
|
|||
|
a9d15fa3ae
|
46
.classpath
46
.classpath
@@ -1,46 +0,0 @@
|
|||||||
<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<classpath>
|
|
||||||
<classpathentry kind="src" output="bin/main" path="src/main/java">
|
|
||||||
<attributes>
|
|
||||||
<attribute name="gradle_scope" value="main"/>
|
|
||||||
<attribute name="gradle_used_by_scope" value="main,test,jmh"/>
|
|
||||||
</attributes>
|
|
||||||
</classpathentry>
|
|
||||||
<classpathentry kind="src" output="bin/test" path="src/test/java">
|
|
||||||
<attributes>
|
|
||||||
<attribute name="gradle_scope" value="test"/>
|
|
||||||
<attribute name="gradle_used_by_scope" value="test,jmh"/>
|
|
||||||
<attribute name="test" value="true"/>
|
|
||||||
</attributes>
|
|
||||||
</classpathentry>
|
|
||||||
<classpathentry kind="src" output="bin/main" path="src/main/resources">
|
|
||||||
<attributes>
|
|
||||||
<attribute name="gradle_scope" value="main"/>
|
|
||||||
<attribute name="gradle_used_by_scope" value="main,test,jmh"/>
|
|
||||||
</attributes>
|
|
||||||
</classpathentry>
|
|
||||||
<classpathentry kind="src" output="bin/jmh" path="src/jmh/java">
|
|
||||||
<attributes>
|
|
||||||
<attribute name="gradle_scope" value="jmh"/>
|
|
||||||
<attribute name="gradle_used_by_scope" value="jmh"/>
|
|
||||||
<attribute name="test" value="true"/>
|
|
||||||
</attributes>
|
|
||||||
</classpathentry>
|
|
||||||
<classpathentry kind="src" output="bin/jmh" path="build/third-party/snowball/source/libstemmer_java-3.0.1/java">
|
|
||||||
<attributes>
|
|
||||||
<attribute name="gradle_scope" value="jmh"/>
|
|
||||||
<attribute name="gradle_used_by_scope" value="jmh"/>
|
|
||||||
<attribute name="test" value="true"/>
|
|
||||||
</attributes>
|
|
||||||
</classpathentry>
|
|
||||||
<classpathentry kind="src" output="bin/test" path="src/test/resources">
|
|
||||||
<attributes>
|
|
||||||
<attribute name="gradle_scope" value="test"/>
|
|
||||||
<attribute name="gradle_used_by_scope" value="test,jmh"/>
|
|
||||||
<attribute name="test" value="true"/>
|
|
||||||
</attributes>
|
|
||||||
</classpathentry>
|
|
||||||
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-21/"/>
|
|
||||||
<classpathentry kind="con" path="org.eclipse.buildship.core.gradleclasspathcontainer"/>
|
|
||||||
<classpathentry kind="output" path="bin/default"/>
|
|
||||||
</classpath>
|
|
||||||
12
.github/workflows/build.yml
vendored
12
.github/workflows/build.yml
vendored
@@ -51,7 +51,7 @@ jobs:
|
|||||||
test -f gradle/verification-metadata.xml
|
test -f gradle/verification-metadata.xml
|
||||||
|
|
||||||
- name: Execute build, tests, PMD, coverage, Javadoc, distribution packaging, and SBOM generation
|
- name: Execute build, tests, PMD, coverage, Javadoc, distribution packaging, and SBOM generation
|
||||||
run: ./gradlew --no-daemon clean build pmdMain javadoc jacocoTestReport distZip cyclonedxBom
|
run: ./gradlew --no-daemon clean ciRelease distZip pmdMain javadoc jacocoCiReleaseReport cyclonedxBom
|
||||||
|
|
||||||
- name: Upload SBOM
|
- name: Upload SBOM
|
||||||
if: always()
|
if: always()
|
||||||
@@ -70,8 +70,8 @@ jobs:
|
|||||||
with:
|
with:
|
||||||
name: test-reports
|
name: test-reports
|
||||||
path: |
|
path: |
|
||||||
build/reports/tests/test
|
build/reports/tests
|
||||||
build/test-results/test
|
build/test-results
|
||||||
if-no-files-found: warn
|
if-no-files-found: warn
|
||||||
retention-days: 14
|
retention-days: 14
|
||||||
|
|
||||||
@@ -90,8 +90,8 @@ jobs:
|
|||||||
with:
|
with:
|
||||||
name: coverage-reports
|
name: coverage-reports
|
||||||
path: |
|
path: |
|
||||||
build/reports/jacoco/test/html
|
build/reports/jacoco/jacocoCiReleaseReport/html
|
||||||
build/reports/jacoco/test/jacocoTestReport.xml
|
build/reports/jacoco/jacocoCiReleaseReport/jacocoCiReleaseReport.xml
|
||||||
if-no-files-found: warn
|
if-no-files-found: warn
|
||||||
retention-days: 14
|
retention-days: 14
|
||||||
|
|
||||||
@@ -160,7 +160,7 @@ jobs:
|
|||||||
env:
|
env:
|
||||||
SIGNING_KEY: ${{ secrets.SIGNING_KEY }}
|
SIGNING_KEY: ${{ secrets.SIGNING_KEY }}
|
||||||
SIGNING_PASSWORD: ${{ secrets.SIGNING_PASSWORD }}
|
SIGNING_PASSWORD: ${{ secrets.SIGNING_PASSWORD }}
|
||||||
run: ./gradlew --no-daemon clean build pmdMain javadoc jacocoTestReport cyclonedxBom centralBundle
|
run: ./gradlew --no-daemon clean ciRelease distZip pmdMain javadoc jacocoCiReleaseReport cyclonedxBom centralBundle
|
||||||
|
|
||||||
- name: Generate release changelog
|
- name: Generate release changelog
|
||||||
shell: bash
|
shell: bash
|
||||||
|
|||||||
19
.github/workflows/pages.yml
vendored
19
.github/workflows/pages.yml
vendored
@@ -70,7 +70,7 @@ jobs:
|
|||||||
test -f gradle/verification-metadata.xml
|
test -f gradle/verification-metadata.xml
|
||||||
|
|
||||||
- name: Build reports for publication
|
- name: Build reports for publication
|
||||||
run: ./gradlew --no-daemon clean build pmdMain javadoc jacocoTestReport pitest jmh cyclonedxBom
|
run: ./gradlew --no-daemon clean ciRelease pmdMain javadoc jacocoCiReleaseReport pitest jmh cyclonedxBom
|
||||||
|
|
||||||
- name: Prepare gh-pages worktree
|
- name: Prepare gh-pages worktree
|
||||||
shell: bash
|
shell: bash
|
||||||
@@ -93,6 +93,9 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
|
TEST_REPORT_DIR="build/reports/tests/ciRelease"
|
||||||
|
JACOCO_REPORT_DIR="build/reports/jacoco/jacocoCiReleaseReport"
|
||||||
|
|
||||||
SITE_DIR=".gh-pages"
|
SITE_DIR=".gh-pages"
|
||||||
RUN_DIR="${SITE_DIR}/builds/${GITHUB_RUN_NUMBER}"
|
RUN_DIR="${SITE_DIR}/builds/${GITHUB_RUN_NUMBER}"
|
||||||
RUN_METRICS_DIR="${RUN_DIR}/metrics"
|
RUN_METRICS_DIR="${RUN_DIR}/metrics"
|
||||||
@@ -106,14 +109,14 @@ jobs:
|
|||||||
cp -R build/docs/javadoc "${RUN_DIR}/javadoc"
|
cp -R build/docs/javadoc "${RUN_DIR}/javadoc"
|
||||||
cp -R build/docs/javadoc "${LATEST_DIR}/javadoc"
|
cp -R build/docs/javadoc "${LATEST_DIR}/javadoc"
|
||||||
|
|
||||||
cp -R build/reports/tests/test "${RUN_DIR}/test"
|
cp -R "${TEST_REPORT_DIR}" "${RUN_DIR}/test"
|
||||||
cp -R build/reports/tests/test "${LATEST_DIR}/test"
|
cp -R "${TEST_REPORT_DIR}" "${LATEST_DIR}/test"
|
||||||
|
|
||||||
cp -R build/reports/pmd "${RUN_DIR}/pmd"
|
cp -R build/reports/pmd "${RUN_DIR}/pmd"
|
||||||
cp -R build/reports/pmd "${LATEST_DIR}/pmd"
|
cp -R build/reports/pmd "${LATEST_DIR}/pmd"
|
||||||
|
|
||||||
cp -R build/reports/jacoco/test/html "${RUN_DIR}/coverage"
|
cp -R "${JACOCO_REPORT_DIR}/html" "${RUN_DIR}/coverage"
|
||||||
cp -R build/reports/jacoco/test/html "${LATEST_DIR}/coverage"
|
cp -R "${JACOCO_REPORT_DIR}/html" "${LATEST_DIR}/coverage"
|
||||||
|
|
||||||
cp -R build/reports/pitest "${RUN_DIR}/pitest"
|
cp -R build/reports/pitest "${RUN_DIR}/pitest"
|
||||||
cp -R build/reports/pitest "${LATEST_DIR}/pitest"
|
cp -R build/reports/pitest "${LATEST_DIR}/pitest"
|
||||||
@@ -178,7 +181,7 @@ jobs:
|
|||||||
|
|
||||||
python3 \
|
python3 \
|
||||||
./tools/generate-pages-badges.py \
|
./tools/generate-pages-badges.py \
|
||||||
--jacoco-xml build/reports/jacoco/test/jacocoTestReport.xml \
|
--jacoco-xml "${JACOCO_REPORT_DIR}/jacocoCiReleaseReport.xml" \
|
||||||
--pit-xml build/reports/pitest/mutations.xml \
|
--pit-xml build/reports/pitest/mutations.xml \
|
||||||
--jmh-csv build/reports/jmh/jmh-results.csv \
|
--jmh-csv build/reports/jmh/jmh-results.csv \
|
||||||
--run-metrics-dir "${RUN_METRICS_DIR}" \
|
--run-metrics-dir "${RUN_METRICS_DIR}" \
|
||||||
@@ -228,7 +231,7 @@ jobs:
|
|||||||
<p class="meta">Build ${GITHUB_RUN_NUMBER} from commit ${GITHUB_SHA}</p>
|
<p class="meta">Build ${GITHUB_RUN_NUMBER} from commit ${GITHUB_SHA}</p>
|
||||||
<ul>
|
<ul>
|
||||||
<li><a href="./javadoc/">Javadoc</a></li>
|
<li><a href="./javadoc/">Javadoc</a></li>
|
||||||
<li><a href="./test/">Test Report</a></li>
|
<li><a href="./test/">Release Verification Test Report (ciRelease)</a></li>
|
||||||
<li><a href="./pmd/main.html">PMD Report</a></li>
|
<li><a href="./pmd/main.html">PMD Report</a></li>
|
||||||
<li><a href="./coverage/">Coverage Report</a></li>
|
<li><a href="./coverage/">Coverage Report</a></li>
|
||||||
${DEPENDENCY_CHECK_LINK:-<li>Dependency Vulnerability Report: not available</li>}
|
${DEPENDENCY_CHECK_LINK:-<li>Dependency Vulnerability Report: not available</li>}
|
||||||
@@ -260,7 +263,7 @@ jobs:
|
|||||||
|
|
||||||
- [Latest build summary](https://leogalambos.github.io/Radixor/builds/latest/)
|
- [Latest build summary](https://leogalambos.github.io/Radixor/builds/latest/)
|
||||||
- [Javadoc](https://leogalambos.github.io/Radixor/builds/latest/javadoc/)
|
- [Javadoc](https://leogalambos.github.io/Radixor/builds/latest/javadoc/)
|
||||||
- [Unit test report](https://leogalambos.github.io/Radixor/builds/latest/test/)
|
- [Release verification test report (ciRelease)](https://leogalambos.github.io/Radixor/builds/latest/test/)
|
||||||
- [PMD report](https://leogalambos.github.io/Radixor/builds/latest/pmd/main.html)
|
- [PMD report](https://leogalambos.github.io/Radixor/builds/latest/pmd/main.html)
|
||||||
- [JaCoCo coverage report](https://leogalambos.github.io/Radixor/builds/latest/coverage/)
|
- [JaCoCo coverage report](https://leogalambos.github.io/Radixor/builds/latest/coverage/)
|
||||||
- [PIT mutation testing report](https://leogalambos.github.io/Radixor/builds/latest/pitest/)
|
- [PIT mutation testing report](https://leogalambos.github.io/Radixor/builds/latest/pitest/)
|
||||||
|
|||||||
29
.project
29
.project
@@ -1,23 +1,22 @@
|
|||||||
<?xml version="1.0" encoding="UTF-8"?>
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
<projectDescription>
|
<projectDescription>
|
||||||
<name>Radixor</name>
|
<name>Radixor</name>
|
||||||
<comment>Project Radixor created by Buildship.</comment>
|
<comment></comment>
|
||||||
<projects>
|
<projects/>
|
||||||
</projects>
|
|
||||||
<buildSpec>
|
|
||||||
<buildCommand>
|
|
||||||
<name>org.eclipse.jdt.core.javabuilder</name>
|
|
||||||
<arguments>
|
|
||||||
</arguments>
|
|
||||||
</buildCommand>
|
|
||||||
<buildCommand>
|
|
||||||
<name>org.eclipse.buildship.core.gradleprojectbuilder</name>
|
|
||||||
<arguments>
|
|
||||||
</arguments>
|
|
||||||
</buildCommand>
|
|
||||||
</buildSpec>
|
|
||||||
<natures>
|
<natures>
|
||||||
<nature>org.eclipse.jdt.core.javanature</nature>
|
<nature>org.eclipse.jdt.core.javanature</nature>
|
||||||
<nature>org.eclipse.buildship.core.gradleprojectnature</nature>
|
<nature>org.eclipse.buildship.core.gradleprojectnature</nature>
|
||||||
</natures>
|
</natures>
|
||||||
|
<buildSpec>
|
||||||
|
<buildCommand>
|
||||||
|
<name>org.eclipse.jdt.core.javabuilder</name>
|
||||||
|
<arguments/>
|
||||||
|
</buildCommand>
|
||||||
|
<buildCommand>
|
||||||
|
<name>org.eclipse.buildship.core.gradleprojectbuilder</name>
|
||||||
|
<arguments/>
|
||||||
|
</buildCommand>
|
||||||
|
</buildSpec>
|
||||||
|
<linkedResources/>
|
||||||
|
<filteredResources/>
|
||||||
</projectDescription>
|
</projectDescription>
|
||||||
|
|||||||
4
.ruleset
4
.ruleset
@@ -162,12 +162,12 @@
|
|||||||
<rule ref="category/java/design.xml/CollapsibleIfStatements"/>
|
<rule ref="category/java/design.xml/CollapsibleIfStatements"/>
|
||||||
<rule ref="category/java/design.xml/CouplingBetweenObjects">
|
<rule ref="category/java/design.xml/CouplingBetweenObjects">
|
||||||
<properties>
|
<properties>
|
||||||
<property name="threshold" value="50" />
|
<property name="threshold" value="60" />
|
||||||
</properties>
|
</properties>
|
||||||
</rule>
|
</rule>
|
||||||
<rule ref="category/java/design.xml/CyclomaticComplexity">
|
<rule ref="category/java/design.xml/CyclomaticComplexity">
|
||||||
<properties>
|
<properties>
|
||||||
<property name="methodReportLevel" value="18" />
|
<property name="methodReportLevel" value="19" />
|
||||||
</properties>
|
</properties>
|
||||||
</rule>
|
</rule>
|
||||||
<rule ref="category/java/design.xml/DataClass"/>
|
<rule ref="category/java/design.xml/DataClass"/>
|
||||||
|
|||||||
29
LICENSE-stemmer-data
Normal file
29
LICENSE-stemmer-data
Normal file
@@ -0,0 +1,29 @@
|
|||||||
|
Stemmer data licensing
|
||||||
|
|
||||||
|
The software source code in this repository is licensed separately under
|
||||||
|
the BSD 3-Clause License.
|
||||||
|
|
||||||
|
Stemmer dictionary and morphology data files are not covered by
|
||||||
|
the BSD 3-Clause License unless explicitly stated otherwise.
|
||||||
|
|
||||||
|
This repository contains adapted data derived from the UniMorph project:
|
||||||
|
https://unimorph.github.io/
|
||||||
|
|
||||||
|
Only stemmer data derived from sources that permit commercial use are included
|
||||||
|
in the main distribution of this repository.
|
||||||
|
|
||||||
|
Accepted upstream licenses for distributed stemmer data in this repository:
|
||||||
|
- CC BY-SA 3.0
|
||||||
|
- CC BY-SA 4.0
|
||||||
|
- CC BY 4.0
|
||||||
|
|
||||||
|
Sources under non-commercial licenses, including CC BY-NC-SA 4.0, are excluded
|
||||||
|
from the main distribution.
|
||||||
|
|
||||||
|
Modifications in this repository may include cleaning, normalization,
|
||||||
|
deduplication, filtering, conversion, and reformatting.
|
||||||
|
|
||||||
|
Copyright (c) 2026 Leo Galambos for the modifications, to the extent permitted
|
||||||
|
by the applicable upstream license terms.
|
||||||
|
|
||||||
|
Per-file licensing is stated in the header of each generated stemmer data file.
|
||||||
@@ -54,7 +54,7 @@ Radixor is especially attractive when you want something more adaptable than sim
|
|||||||
|
|
||||||
Radixor includes a JMH benchmark suite for both its own algorithmic core and a side-by-side English comparison against the Snowball Porter stemmer family.
|
Radixor includes a JMH benchmark suite for both its own algorithmic core and a side-by-side English comparison against the Snowball Porter stemmer family.
|
||||||
|
|
||||||
On the current English comparison workload, Radixor with bundled `US_UK_PROFI` reaches approximately **31 to 32 million tokens per second**. Snowball original Porter reaches approximately **8 million tokens per second**, and Snowball English (Porter2) approximately **5 to 5.5 million tokens per second**.
|
On the current English comparison workload, Radixor with bundled `US_UK` reaches approximately **31 to 32 million tokens per second**. Snowball original Porter reaches approximately **8 million tokens per second**, and Snowball English (Porter2) approximately **5 to 5.5 million tokens per second**.
|
||||||
|
|
||||||
That places Radixor at approximately:
|
That places Radixor at approximately:
|
||||||
|
|
||||||
@@ -137,7 +137,7 @@ The repository keeps the front page concise and places detailed documentation un
|
|||||||
A practical first guide to loading, compiling, and using Radixor.
|
A practical first guide to loading, compiling, and using Radixor.
|
||||||
|
|
||||||
- [Built-in Languages](docs/built-in-languages.md)
|
- [Built-in Languages](docs/built-in-languages.md)
|
||||||
Overview of bundled language resources such as `US_UK` and `US_UK_PROFI`.
|
Overview of bundled language resources such as `US_UK`.
|
||||||
|
|
||||||
- [Dictionary Format](docs/dictionary-format.md)
|
- [Dictionary Format](docs/dictionary-format.md)
|
||||||
How to write and normalize stemming dictionaries.
|
How to write and normalize stemming dictionaries.
|
||||||
@@ -167,6 +167,9 @@ The repository keeps the front page concise and places detailed documentation un
|
|||||||
- [Architecture](docs/architecture.md)
|
- [Architecture](docs/architecture.md)
|
||||||
Structural model, data flow, and runtime lookup behavior.
|
Structural model, data flow, and runtime lookup behavior.
|
||||||
|
|
||||||
|
- [Lookup Edge Optimization](docs/lookup-edge-optimization.md)
|
||||||
|
Speed/memory trade-off of dense child edge lookup in compiled tries.
|
||||||
|
|
||||||
- [Reduction Semantics](docs/reduction-semantics.md)
|
- [Reduction Semantics](docs/reduction-semantics.md)
|
||||||
Ranked, unordered, and dominant reduction behavior.
|
Ranked, unordered, and dominant reduction behavior.
|
||||||
|
|
||||||
|
|||||||
216
build.gradle
216
build.gradle
@@ -33,6 +33,9 @@ configurations {
|
|||||||
java {
|
java {
|
||||||
withSourcesJar()
|
withSourcesJar()
|
||||||
withJavadocJar()
|
withJavadocJar()
|
||||||
|
|
||||||
|
sourceCompatibility = JavaVersion.VERSION_21
|
||||||
|
targetCompatibility = JavaVersion.VERSION_21
|
||||||
}
|
}
|
||||||
|
|
||||||
tasks.withType(AbstractArchiveTask).configureEach {
|
tasks.withType(AbstractArchiveTask).configureEach {
|
||||||
@@ -51,18 +54,14 @@ pmd {
|
|||||||
ruleSetFiles = files(rootProject.file(".ruleset"))
|
ruleSetFiles = files(rootProject.file(".ruleset"))
|
||||||
}
|
}
|
||||||
|
|
||||||
tasks.withType(JavaCompile).configureEach {
|
|
||||||
options.release = 21
|
|
||||||
}
|
|
||||||
|
|
||||||
dependencyLocking {
|
dependencyLocking {
|
||||||
lockAllConfigurations()
|
lockAllConfigurations()
|
||||||
|
|
||||||
lockMode = LockMode.STRICT
|
lockMode = LockMode.STRICT
|
||||||
}
|
}
|
||||||
|
|
||||||
dependencies {
|
dependencies {
|
||||||
jmhImplementation sourceSets.main.output
|
jmhImplementation sourceSets.main.output
|
||||||
|
|
||||||
testImplementation platform(libs.junit.bom)
|
testImplementation platform(libs.junit.bom)
|
||||||
testImplementation libs.junit.jupiter
|
testImplementation libs.junit.jupiter
|
||||||
@@ -71,7 +70,7 @@ dependencies {
|
|||||||
testImplementation libs.mockito.core
|
testImplementation libs.mockito.core
|
||||||
testImplementation libs.mockito.junit.jupiter
|
testImplementation libs.mockito.junit.jupiter
|
||||||
testImplementation libs.jqwik
|
testImplementation libs.jqwik
|
||||||
|
|
||||||
mockitoAgent(libs.mockito.core) {
|
mockitoAgent(libs.mockito.core) {
|
||||||
transitive = false
|
transitive = false
|
||||||
}
|
}
|
||||||
@@ -102,21 +101,37 @@ dependencyCheck {
|
|||||||
delay = nvdApiKey != null ? 3500 : 8000
|
delay = nvdApiKey != null ? 3500 : 8000
|
||||||
validForHours = 4
|
validForHours = 4
|
||||||
}
|
}
|
||||||
|
|
||||||
if (dependencyCheckSuppressionFile.exists()) {
|
if (dependencyCheckSuppressionFile.exists()) {
|
||||||
suppressionFile = dependencyCheckSuppressionFile.absolutePath
|
suppressionFile = dependencyCheckSuppressionFile.absolutePath
|
||||||
failBuildOnUnusedSuppressionRule = true
|
failBuildOnUnusedSuppressionRule = true
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def cliIncludeTags = project.findProperty('includeTags')?.toString() ?: System.getProperty('includeTags')
|
||||||
|
def cliExcludeTags = project.findProperty('excludeTags')?.toString() ?: System.getProperty('excludeTags')
|
||||||
|
|
||||||
|
def splitTagExpression = { String tagsExpr ->
|
||||||
|
if (tagsExpr == null || tagsExpr.isBlank()) {
|
||||||
|
return []
|
||||||
|
}
|
||||||
|
return tagsExpr.split(',')
|
||||||
|
.collect { it.trim() }
|
||||||
|
.findAll { it != null && !it.isBlank() }
|
||||||
|
}
|
||||||
|
|
||||||
tasks.withType(Test).configureEach {
|
tasks.withType(Test).configureEach {
|
||||||
useJUnitPlatform()
|
|
||||||
|
|
||||||
doFirst {
|
doFirst {
|
||||||
jvmArgs "-javaagent:${configurations.mockitoAgent.singleFile}"
|
jvmArgs "-javaagent:${configurations.mockitoAgent.singleFile}"
|
||||||
}
|
}
|
||||||
|
|
||||||
finalizedBy(tasks.named('jacocoTestReport'))
|
/*
|
||||||
|
* Bundled dictionary integration tests compile and reload large real-world
|
||||||
|
* stemming dictionaries, including large language resources such as es_es.
|
||||||
|
* The default Gradle test executor heap is too small for this workload.
|
||||||
|
*/
|
||||||
|
minHeapSize = '1g'
|
||||||
|
maxHeapSize = '4g'
|
||||||
|
|
||||||
reports {
|
reports {
|
||||||
junitXml.required = true
|
junitXml.required = true
|
||||||
@@ -124,6 +139,121 @@ tasks.withType(Test).configureEach {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def configureJUnitPlatformTags = { Test task, String includeTagsExpr, String excludeTagsExpr ->
|
||||||
|
task.useJUnitPlatform {
|
||||||
|
final def includes = splitTagExpression(includeTagsExpr)
|
||||||
|
final def excludes = splitTagExpression(excludeTagsExpr)
|
||||||
|
|
||||||
|
if (!includes.isEmpty()) {
|
||||||
|
includeTags(*includes.toArray(new String[0]))
|
||||||
|
}
|
||||||
|
if (!excludes.isEmpty()) {
|
||||||
|
excludeTags(*excludes.toArray(new String[0]))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
tasks.named('test', Test) {
|
||||||
|
configureJUnitPlatformTags(it, cliIncludeTags, cliExcludeTags)
|
||||||
|
finalizedBy(tasks.named('jacocoTestReport'))
|
||||||
|
}
|
||||||
|
|
||||||
|
def configureTaggedTestProfile = { String taskName, String includeTagsExpr, String excludeTagsExpr = null,
|
||||||
|
String taskDescription = null, String testNameExcludePatterns = null ->
|
||||||
|
tasks.register(taskName, Test) {
|
||||||
|
group = 'verification'
|
||||||
|
description = taskDescription
|
||||||
|
|
||||||
|
configureJUnitPlatformTags(delegate as Test, includeTagsExpr, excludeTagsExpr)
|
||||||
|
testClassesDirs = sourceSets.test.output.classesDirs
|
||||||
|
classpath = sourceSets.test.runtimeClasspath
|
||||||
|
dependsOn(tasks.named('compileTestJava'))
|
||||||
|
|
||||||
|
doFirst {
|
||||||
|
jvmArgs "-javaagent:${configurations.mockitoAgent.singleFile}"
|
||||||
|
}
|
||||||
|
|
||||||
|
if (testNameExcludePatterns != null && !testNameExcludePatterns.isBlank()) {
|
||||||
|
filter {
|
||||||
|
testNameExcludePatterns.split(',').each { String pattern ->
|
||||||
|
final def trimmedPattern = pattern.trim()
|
||||||
|
if (!trimmedPattern.isEmpty()) {
|
||||||
|
excludeTestsMatching(trimmedPattern)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
minHeapSize = '1g'
|
||||||
|
maxHeapSize = '4g'
|
||||||
|
|
||||||
|
reports {
|
||||||
|
junitXml.required = true
|
||||||
|
html.required = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
configureTaggedTestProfile(
|
||||||
|
'ciSmoke',
|
||||||
|
'unit',
|
||||||
|
'slow',
|
||||||
|
'Fast feedback profile for unit tests with slow tests explicitly excluded.',
|
||||||
|
'org.egothor.stemmer.CompileIntegrationTest*'
|
||||||
|
)
|
||||||
|
|
||||||
|
configureTaggedTestProfile(
|
||||||
|
'ciCore',
|
||||||
|
'unit,trie,frequency-trie,property',
|
||||||
|
null,
|
||||||
|
'Focused profile for core trie behavior and trie-specific property checks.'
|
||||||
|
)
|
||||||
|
|
||||||
|
configureTaggedTestProfile(
|
||||||
|
'ciIntegration',
|
||||||
|
'integration',
|
||||||
|
'slow',
|
||||||
|
'Integration pipeline profile (loader/parser/CLI/IO end-to-end flows) excluding slow integration paths.'
|
||||||
|
)
|
||||||
|
|
||||||
|
configureTaggedTestProfile(
|
||||||
|
'ciSlow',
|
||||||
|
'slow',
|
||||||
|
null,
|
||||||
|
'Targeted profile for all slow tests (large dictionaries, long-running corpus validation, and heavy integration checks).'
|
||||||
|
)
|
||||||
|
|
||||||
|
configureTaggedTestProfile(
|
||||||
|
'ciCompat',
|
||||||
|
'compat,regression',
|
||||||
|
null,
|
||||||
|
'Compatibility profile guarding persisted artifact and compatibility regressions.'
|
||||||
|
)
|
||||||
|
|
||||||
|
configureTaggedTestProfile(
|
||||||
|
'ciRelease',
|
||||||
|
null,
|
||||||
|
'slow',
|
||||||
|
'Release-profile validation of all non-slow tests.',
|
||||||
|
'org.egothor.stemmer.CompileIntegrationTest*,org.egothor.stemmer.StemmerPatchTrieLoaderTest$BundledDictionaryTests*'
|
||||||
|
)
|
||||||
|
|
||||||
|
configureTaggedTestProfile(
|
||||||
|
'ciNightly',
|
||||||
|
'fuzz',
|
||||||
|
null,
|
||||||
|
'Nightly robustness profile with fuzz testing emphasis.'
|
||||||
|
)
|
||||||
|
|
||||||
|
tasks.register('ci') {
|
||||||
|
group = 'verification'
|
||||||
|
description = 'Runs the full enterprise CI profile set in sequence.'
|
||||||
|
dependsOn(tasks.named('ciSmoke'))
|
||||||
|
dependsOn(tasks.named('ciCore'))
|
||||||
|
dependsOn(tasks.named('ciIntegration'))
|
||||||
|
dependsOn(tasks.named('ciCompat'))
|
||||||
|
}
|
||||||
|
|
||||||
tasks.withType(Pmd).configureEach {
|
tasks.withType(Pmd).configureEach {
|
||||||
reports {
|
reports {
|
||||||
xml.required = true
|
xml.required = true
|
||||||
@@ -134,6 +264,13 @@ tasks.withType(Pmd).configureEach {
|
|||||||
tasks.named('jacocoTestReport', JacocoReport) {
|
tasks.named('jacocoTestReport', JacocoReport) {
|
||||||
dependsOn(tasks.named('test'))
|
dependsOn(tasks.named('test'))
|
||||||
|
|
||||||
|
classDirectories.setFrom(
|
||||||
|
files(sourceSets.main.output).asFileTree.matching {
|
||||||
|
exclude 'org/egothor/stemmer/StemmerKnowledgeExperiment*'
|
||||||
|
exclude 'org/egothor/stemmer/DiacriticStripper*'
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
reports {
|
reports {
|
||||||
xml.required = true
|
xml.required = true
|
||||||
csv.required = false
|
csv.required = false
|
||||||
@@ -141,6 +278,36 @@ tasks.named('jacocoTestReport', JacocoReport) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def registerJacocoProfileReport = { String reportTaskName, String sourceTaskName ->
|
||||||
|
tasks.register(reportTaskName, JacocoReport) {
|
||||||
|
group = 'verification'
|
||||||
|
description = "Generates Jacoco report for ${sourceTaskName} execution."
|
||||||
|
|
||||||
|
dependsOn(tasks.named(sourceTaskName))
|
||||||
|
|
||||||
|
classDirectories.setFrom(
|
||||||
|
files(sourceSets.main.output).asFileTree.matching {
|
||||||
|
exclude 'org/egothor/stemmer/StemmerKnowledgeExperiment*'
|
||||||
|
exclude 'org/egothor/stemmer/DiacriticStripper*'
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
executionData.setFrom(
|
||||||
|
fileTree(layout.buildDirectory.dir('jacoco')) {
|
||||||
|
include "${sourceTaskName}.exec"
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
reports {
|
||||||
|
xml.required = true
|
||||||
|
csv.required = false
|
||||||
|
html.required = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
registerJacocoProfileReport('jacocoCiReleaseReport', 'ciRelease')
|
||||||
|
|
||||||
tasks.named('check') {
|
tasks.named('check') {
|
||||||
dependsOn(tasks.named('jacocoTestReport'))
|
dependsOn(tasks.named('jacocoTestReport'))
|
||||||
// no-default, only on-demand: dependsOn(tasks.named('dependencyCheckAnalyze'))
|
// no-default, only on-demand: dependsOn(tasks.named('dependencyCheckAnalyze'))
|
||||||
@@ -178,7 +345,17 @@ pitest {
|
|||||||
'org.egothor.stemmer.trie.*Test'
|
'org.egothor.stemmer.trie.*Test'
|
||||||
]
|
]
|
||||||
|
|
||||||
excludedClasses = ['org.egothor.stemmer.Compile']
|
excludedClasses = [
|
||||||
|
'org.egothor.stemmer.Compile*',
|
||||||
|
'org.egothor.stemmer.StemmerPatchTrieLoader*',
|
||||||
|
'org.egothor.stemmer.StemmerKnowledgeExperiment*',
|
||||||
|
'org.egothor.stemmer.StemmerKnowledgeExperimentCli*'
|
||||||
|
]
|
||||||
|
excludedTestClasses = [
|
||||||
|
'org.egothor.stemmer.CompileIntegrationTest',
|
||||||
|
'org.egothor.stemmer.StemmerPatchTrieLoaderTest',
|
||||||
|
'org.egothor.stemmer.StemmerKnowledgeExperimentTest'
|
||||||
|
]
|
||||||
outputFormats = ['XML', 'HTML']
|
outputFormats = ['XML', 'HTML']
|
||||||
timestampedReports = false
|
timestampedReports = false
|
||||||
exportLineCoverage = true
|
exportLineCoverage = true
|
||||||
@@ -192,6 +369,13 @@ application {
|
|||||||
executableDir = 'bin'
|
executableDir = 'bin'
|
||||||
}
|
}
|
||||||
|
|
||||||
|
tasks.register('stemmerKnowledgeExperiment', JavaExec) {
|
||||||
|
group = 'application'
|
||||||
|
description = 'Runs the stemmer knowledge evaluation experiment.'
|
||||||
|
classpath = sourceSets.main.runtimeClasspath
|
||||||
|
mainClass = 'org.egothor.stemmer.StemmerKnowledgeExperimentCli'
|
||||||
|
}
|
||||||
|
|
||||||
distributions {
|
distributions {
|
||||||
main {
|
main {
|
||||||
distributionBaseName = 'radixor'
|
distributionBaseName = 'radixor'
|
||||||
@@ -205,11 +389,15 @@ distributions {
|
|||||||
into ''
|
into ''
|
||||||
}
|
}
|
||||||
|
|
||||||
|
from('LICENSE-stemmer-data') {
|
||||||
|
into ''
|
||||||
|
}
|
||||||
|
|
||||||
from('docs') {
|
from('docs') {
|
||||||
into 'docs'
|
into 'docs'
|
||||||
include '**/*.md'
|
include '**/*.md'
|
||||||
}
|
}
|
||||||
|
|
||||||
from(layout.buildDirectory.dir('generated/release-notes')) {
|
from(layout.buildDirectory.dir('generated/release-notes')) {
|
||||||
into ''
|
into ''
|
||||||
include 'CHANGELOG.md'
|
include 'CHANGELOG.md'
|
||||||
|
|||||||
@@ -25,11 +25,11 @@ Each stage has a different purpose.
|
|||||||
The textual dictionary groups known word forms under a canonical stem:
|
The textual dictionary groups known word forms under a canonical stem:
|
||||||
|
|
||||||
```text
|
```text
|
||||||
run running runs ran
|
run running runs ran
|
||||||
connect connected connecting connection
|
connect connected connecting connection
|
||||||
```
|
```
|
||||||
|
|
||||||
The first token is the canonical stem. The following tokens are known variants.
|
The first column is the canonical stem. The following tab-separated columns are known variants.
|
||||||
|
|
||||||
### Patch-command generation
|
### Patch-command generation
|
||||||
|
|
||||||
|
|||||||
@@ -13,7 +13,7 @@ The benchmark suite currently covers two categories:
|
|||||||
|
|
||||||
The comparison benchmark processes the same deterministic English token stream through:
|
The comparison benchmark processes the same deterministic English token stream through:
|
||||||
|
|
||||||
- Radixor with bundled `US_UK_PROFI`,
|
- Radixor with bundled `US_UK` (older benchmark snapshots used the now-retired `US_UK_PROFI` resource),
|
||||||
- Snowball original Porter,
|
- Snowball original Porter,
|
||||||
- Snowball English, commonly referred to as Porter2.
|
- Snowball English, commonly referred to as Porter2.
|
||||||
|
|
||||||
@@ -37,7 +37,7 @@ For that reason, the published badge values should be treated primarily as a com
|
|||||||
|
|
||||||
A recent JMH run on JDK 21.0.10 with JMH 1.37, one thread, three warmup iterations, and five measurement iterations produced the following approximate throughput ranges:
|
A recent JMH run on JDK 21.0.10 with JMH 1.37, one thread, three warmup iterations, and five measurement iterations produced the following approximate throughput ranges:
|
||||||
|
|
||||||
| Workload | Radixor `US_UK_PROFI` | Snowball Porter | Snowball English |
|
| Workload | Radixor `US_UK` *(historical runs: `US_UK_PROFI`)* | Snowball Porter | Snowball English |
|
||||||
| --- | ---: | ---: | ---: |
|
| --- | ---: | ---: | ---: |
|
||||||
| About 12,000 generated tokens | 30.99 M tokens/s | 8.21 M tokens/s | 5.46 M tokens/s |
|
| About 12,000 generated tokens | 30.99 M tokens/s | 8.21 M tokens/s | 5.46 M tokens/s |
|
||||||
| About 60,000 generated tokens | 32.25 M tokens/s | 8.02 M tokens/s | 5.11 M tokens/s |
|
| About 60,000 generated tokens | 32.25 M tokens/s | 8.02 M tokens/s | 5.11 M tokens/s |
|
||||||
@@ -83,7 +83,7 @@ The workload intentionally mixes:
|
|||||||
- simple inflections,
|
- simple inflections,
|
||||||
- common derivational forms,
|
- common derivational forms,
|
||||||
- US and UK spelling families,
|
- US and UK spelling families,
|
||||||
- lexical forms appropriate for `US_UK_PROFI`.
|
- lexical forms appropriate for the current bundled `US_UK` resource (with historical continuity from earlier `US_UK_PROFI` runs).
|
||||||
|
|
||||||
This design keeps runs reproducible across environments and avoids accidental drift caused by changing external corpora.
|
This design keeps runs reproducible across environments and avoids accidental drift caused by changing external corpora.
|
||||||
|
|
||||||
|
|||||||
@@ -1,41 +1,49 @@
|
|||||||
# Built-in Languages
|
# Built-in Languages
|
||||||
|
|
||||||
Radixor provides a set of bundled stemmer dictionaries that can be loaded directly without preparing custom lexical data first.
|
Radixor ships with a curated set of bundled stemmer dictionaries that can be loaded directly from the library distribution. These resources are intended to provide an immediately usable baseline for evaluation, prototyping, integration, and general-purpose stemming workloads, while still fitting naturally into workflows where the bundled baseline is later refined, extended, or replaced with custom lexical data.
|
||||||
|
|
||||||
These resources are intended as practical default dictionaries for common use. They provide a solid starting point for evaluation, integration, and general-purpose stemming workloads, while still fitting naturally into workflows where the bundled baseline is later refined, extended, or replaced by a custom dictionary.
|
|
||||||
|
|
||||||
## Overview
|
## Overview
|
||||||
|
|
||||||
Bundled dictionaries are exposed through:
|
Bundled dictionaries are exposed through:
|
||||||
|
|
||||||
```java
|
```java
|
||||||
StemmerPatchTrieLoader.Language
|
org.egothor.stemmer.StemmerPatchTrieLoader.Language
|
||||||
```
|
```
|
||||||
|
|
||||||
They are packaged with the library as text resources and compiled into a `FrequencyTrie<String>` when loaded.
|
Each bundled dictionary is packaged with the library as a compressed UTF-8 text resource. When loaded, the resource is parsed by `StemmerDictionaryParser`, transformed into patch-command mappings, and compiled into a read-only `FrequencyTrie<String>` by `StemmerPatchTrieLoader`.
|
||||||
|
|
||||||
## Supported languages
|
The bundled language definition also carries a language-level right-to-left flag. That flag is used by the loader to derive the `WordTraversalDirection` used for both trie-key construction and patch-command generation. In practice, left-to-right bundled languages use historical backward Egothor traversal, while right-to-left bundled languages use forward traversal over the stored form.
|
||||||
|
|
||||||
|
## Supported bundled languages
|
||||||
|
|
||||||
The following bundled language identifiers are currently available:
|
The following bundled language identifiers are currently available:
|
||||||
|
|
||||||
| Language | Enum constant | Notes |
|
| Language | Enum constant | Writing direction | Notes |
|
||||||
|---|---|---|
|
|---|---|---:|---|
|
||||||
| Danish | `DA_DK` | Bundled general-purpose dictionary |
|
| Czech | `CS_CZ` | LTR | Bundled general-purpose dictionary |
|
||||||
| German | `DE_DE` | Bundled general-purpose dictionary |
|
| Danish | `DA_DK` | LTR | Bundled general-purpose dictionary |
|
||||||
| Spanish | `ES_ES` | Bundled general-purpose dictionary |
|
| German | `DE_DE` | LTR | Bundled general-purpose dictionary |
|
||||||
| French | `FR_FR` | Bundled general-purpose dictionary |
|
| Spanish | `ES_ES` | LTR | Bundled general-purpose dictionary |
|
||||||
| Italian | `IT_IT` | Bundled general-purpose dictionary |
|
| Persian | `FA_IR` | RTL | Bundled dictionary uses forward traversal over the stored form |
|
||||||
| Dutch | `NL_NL` | Bundled general-purpose dictionary |
|
| Finnish | `FI_FI` | LTR | Bundled general-purpose dictionary |
|
||||||
| Norwegian | `NO_NO` | Bundled general-purpose dictionary |
|
| French | `FR_FR` | LTR | Bundled general-purpose dictionary |
|
||||||
| Portuguese | `PT_PT` | Bundled general-purpose dictionary |
|
| Hebrew | `HE_IL` | RTL | Bundled dictionary uses forward traversal over the stored form |
|
||||||
| Russian | `RU_RU` | Currently supplied in normalized transliterated form |
|
| Hungarian | `HU_HU` | LTR | Bundled general-purpose dictionary |
|
||||||
| Swedish | `SV_SE` | Bundled general-purpose dictionary |
|
| Italian | `IT_IT` | LTR | Bundled general-purpose dictionary |
|
||||||
| English | `US_UK` | Standard English dictionary |
|
| Norwegian Bokmål | `NB_NO` | LTR | Bundled general-purpose dictionary |
|
||||||
| English | `US_UK_PROFI` | Extended English dictionary |
|
| Dutch | `NL_NL` | LTR | Bundled general-purpose dictionary |
|
||||||
|
| Norwegian Nynorsk | `NN_NO` | LTR | Bundled general-purpose dictionary |
|
||||||
|
| Polish | `PL_PL` | LTR | Bundled general-purpose dictionary |
|
||||||
|
| Portuguese | `PT_PT` | LTR | Bundled general-purpose dictionary |
|
||||||
|
| Russian | `RU_RU` | LTR | Bundled general-purpose dictionary |
|
||||||
|
| Swedish | `SV_SE` | LTR | Bundled general-purpose dictionary |
|
||||||
|
| Ukrainian | `UK_UA` | LTR | Bundled general-purpose dictionary |
|
||||||
|
| English | `US_UK` | LTR | Bundled general-purpose dictionary |
|
||||||
|
| Yiddish | `YI` | RTL | Bundled dictionary uses forward traversal over the stored form |
|
||||||
|
|
||||||
## Basic usage
|
## Basic usage
|
||||||
|
|
||||||
Load a bundled stemmer like this:
|
Load a bundled dictionary like this:
|
||||||
|
|
||||||
```java
|
```java
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
@@ -52,16 +60,18 @@ public final class BuiltInExample {
|
|||||||
|
|
||||||
public static void main(final String[] arguments) throws IOException {
|
public static void main(final String[] arguments) throws IOException {
|
||||||
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(
|
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(
|
||||||
StemmerPatchTrieLoader.Language.US_UK_PROFI,
|
StemmerPatchTrieLoader.Language.US_UK,
|
||||||
true,
|
true,
|
||||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
|
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
|
||||||
|
|
||||||
|
System.out.println(trie.traversalDirection());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
The loader reads the bundled dictionary resource, parses the textual entries, derives patch-command mappings, and compiles the result into a read-only trie.
|
This call loads the bundled dictionary resource for the selected language, parses its lexical entries, derives patch-command mappings, and compiles the result into a read-only trie.
|
||||||
|
|
||||||
## Example: stemming with `US_UK_PROFI`
|
## Example: stemming with a bundled dictionary
|
||||||
|
|
||||||
```java
|
```java
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
@@ -79,44 +89,49 @@ public final class EnglishExample {
|
|||||||
|
|
||||||
public static void main(final String[] arguments) throws IOException {
|
public static void main(final String[] arguments) throws IOException {
|
||||||
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(
|
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(
|
||||||
StemmerPatchTrieLoader.Language.US_UK_PROFI,
|
StemmerPatchTrieLoader.Language.US_UK,
|
||||||
true,
|
true,
|
||||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
|
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
|
||||||
|
|
||||||
final String word = "running";
|
final String word = "running";
|
||||||
final String patch = trie.get(word);
|
final String patch = trie.get(word);
|
||||||
final String stem = PatchCommandEncoder.apply(word, patch);
|
final String stem = PatchCommandEncoder.apply(word, patch, trie.traversalDirection());
|
||||||
|
|
||||||
System.out.println(word + " -> " + stem);
|
System.out.println(word + " -> " + stem);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
## `US_UK` and `US_UK_PROFI`
|
Passing `trie.traversalDirection()` to `PatchCommandEncoder.apply(...)` is the correct general contract. It ensures that the patch is applied using the same logical traversal model that was used when the trie and its patch commands were produced.
|
||||||
|
|
||||||
Radixor currently provides two bundled English variants.
|
## Traversal behavior and right-to-left languages
|
||||||
|
|
||||||
### `US_UK`
|
Bundled dictionaries are not all processed identically.
|
||||||
|
|
||||||
`US_UK` is the lighter-weight bundled English resource. It is suitable where a smaller default dictionary is preferred and maximal lexical coverage is not the primary goal.
|
For traditional left-to-right suffix-oriented resources, Radixor preserves historical Egothor behavior and traverses logical word characters backward. That means trie paths are constructed from the logical end of the stored word toward its beginning, and patch commands are interpreted with the same backward traversal model.
|
||||||
|
|
||||||
### `US_UK_PROFI`
|
For bundled right-to-left languages such as Persian, Hebrew, and Yiddish, Radixor uses forward traversal over the stored form. In those cases:
|
||||||
|
|
||||||
`US_UK_PROFI` is the more extensive bundled English resource. It offers broader lexical coverage and is the better default for most applications that want stronger out-of-the-box behavior.
|
- trie keys are traversed from the logical beginning of the stored form,
|
||||||
|
- patch commands are generated in that same forward direction,
|
||||||
|
- patch application must use `WordTraversalDirection.FORWARD`, which is naturally obtained from `trie.traversalDirection()`.
|
||||||
|
|
||||||
### Recommendation
|
This design keeps the traversal policy explicit and consistent across dictionary loading, trie lookup, binary persistence, builder reconstruction, and patch application.
|
||||||
|
|
||||||
For most English-language deployments, prefer:
|
## Reduction behavior
|
||||||
|
|
||||||
```text
|
Bundled dictionaries can be compiled using any supported `ReductionMode`. The reduction configuration controls how semantically equivalent subtrees are merged during trie compilation, while preserving the contract of the selected mode.
|
||||||
US_UK_PROFI
|
|
||||||
```
|
|
||||||
|
|
||||||
Use `US_UK` when a smaller bundled baseline is more appropriate.
|
Typical entry points are:
|
||||||
|
|
||||||
|
- `StemmerPatchTrieLoader.load(language, storeOriginal, reductionMode)`
|
||||||
|
- `StemmerPatchTrieLoader.load(language, storeOriginal, reductionSettings)`
|
||||||
|
|
||||||
|
For most users, `ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS` is the most conservative general-purpose choice because it preserves ranked `getAll(...)` behavior.
|
||||||
|
|
||||||
## Intended role of bundled dictionaries
|
## Intended role of bundled dictionaries
|
||||||
|
|
||||||
Bundled dictionaries should be understood as **general-purpose default resources**.
|
Bundled dictionaries should be understood as practical default resources.
|
||||||
|
|
||||||
They are a good fit when:
|
They are a good fit when:
|
||||||
|
|
||||||
@@ -125,15 +140,18 @@ They are a good fit when:
|
|||||||
- a reasonable baseline is sufficient,
|
- a reasonable baseline is sufficient,
|
||||||
- the goal is evaluation, prototyping, or straightforward integration.
|
- the goal is evaluation, prototyping, or straightforward integration.
|
||||||
|
|
||||||
They are also well suited to staged refinement workflows in which the bundled base is loaded first, then extended with domain-specific vocabulary, and finally persisted as a custom binary artifact.
|
They are also well suited to staged refinement workflows in which a bundled base is loaded first, then extended with domain-specific vocabulary, and finally persisted as a custom binary artifact.
|
||||||
|
|
||||||
## Character representation
|
## Character representation
|
||||||
|
|
||||||
The current bundled resources follow a pragmatic normalization convention.
|
Bundled dictionaries are ordinary UTF-8 lexical resources. The parser reads them as text, the trie stores standard Java strings, and the patch-command model operates on general character sequences.
|
||||||
|
|
||||||
At present, bundled dictionaries are supplied in normalized plain-ASCII form. For some languages, this is simply a lightweight maintenance convention. For others, especially languages commonly written in another script, it reflects a transliterated lexical resource. Russian is the clearest example in the current bundled set.
|
This is important for two reasons:
|
||||||
|
|
||||||
This convention belongs to the supplied dictionary resources, not to the core stemming model. The parser reads UTF-8 text, the dictionary model works with ordinary Java strings, and the trie and patch-command mechanism operate on general character sequences. In practical terms, the architecture is compatible with native-script dictionaries when suitable lexical resources are available.
|
1. the built-in resources are not limited to ASCII-only processing,
|
||||||
|
2. the traversal model is orthogonal to character encoding and script choice.
|
||||||
|
|
||||||
|
In other words, right-to-left handling in the loader is about logical traversal strategy, not about introducing a separate character model.
|
||||||
|
|
||||||
## When to prefer custom dictionaries
|
## When to prefer custom dictionaries
|
||||||
|
|
||||||
@@ -141,8 +159,8 @@ A custom dictionary is usually the better choice when:
|
|||||||
|
|
||||||
- domain-specific vocabulary materially affects stemming quality,
|
- domain-specific vocabulary materially affects stemming quality,
|
||||||
- lexical coverage must be controlled more precisely,
|
- lexical coverage must be controlled more precisely,
|
||||||
- a stronger language resource is available than the bundled baseline,
|
- a stronger lexical resource is available than the bundled baseline,
|
||||||
- native-script support is needed beyond the currently bundled resources.
|
- operational requirements demand an explicitly curated, versioned artifact.
|
||||||
|
|
||||||
Typical examples include:
|
Typical examples include:
|
||||||
|
|
||||||
@@ -150,7 +168,7 @@ Typical examples include:
|
|||||||
- biomedical language,
|
- biomedical language,
|
||||||
- legal or financial vocabulary,
|
- legal or financial vocabulary,
|
||||||
- organization-specific product and process names,
|
- organization-specific product and process names,
|
||||||
- language resources maintained in native scripts.
|
- dictionaries maintained with project-specific validation rules.
|
||||||
|
|
||||||
## Production recommendation
|
## Production recommendation
|
||||||
|
|
||||||
@@ -158,11 +176,11 @@ For production systems, the most robust workflow is usually:
|
|||||||
|
|
||||||
1. start from a bundled dictionary when it is suitable,
|
1. start from a bundled dictionary when it is suitable,
|
||||||
2. extend it with domain-specific forms if needed,
|
2. extend it with domain-specific forms if needed,
|
||||||
3. compile or rebuild it into a binary `.radixor.gz` artifact,
|
3. rebuild it into a binary artifact,
|
||||||
4. deploy that compiled artifact,
|
4. deploy that compiled binary artifact,
|
||||||
5. load it at runtime using `loadBinary(...)`.
|
5. load it at runtime through `loadBinary(...)`.
|
||||||
|
|
||||||
This avoids repeated startup parsing and makes the deployed stemming behavior explicit and versionable.
|
This avoids repeated startup parsing and makes the deployed stemming behavior explicit, reproducible, and versionable.
|
||||||
|
|
||||||
## Example refinement workflow
|
## Example refinement workflow
|
||||||
|
|
||||||
@@ -185,7 +203,7 @@ public final class BundledRefinementExample {
|
|||||||
|
|
||||||
public static void main(final String[] arguments) throws IOException {
|
public static void main(final String[] arguments) throws IOException {
|
||||||
final FrequencyTrie<String> base = StemmerPatchTrieLoader.load(
|
final FrequencyTrie<String> base = StemmerPatchTrieLoader.load(
|
||||||
StemmerPatchTrieLoader.Language.US_UK_PROFI,
|
StemmerPatchTrieLoader.Language.US_UK,
|
||||||
true,
|
true,
|
||||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
|
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
|
||||||
|
|
||||||
@@ -204,11 +222,27 @@ public final class BundledRefinementExample {
|
|||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
The reconstructed builder preserves the traversal direction of the source trie, so refinements remain semantically aligned with the original bundled dictionary.
|
||||||
|
|
||||||
## Extending language support
|
## Extending language support
|
||||||
|
|
||||||
The built-in set is intentionally a practical baseline rather than a closed catalog. High-quality dictionaries for additional languages, improved language coverage, and stronger native-script resources are all natural extension paths for the project.
|
The built-in set is intentionally a practical baseline rather than a closed catalog. Additional languages, stronger lexical coverage, and improved dictionaries for currently supported languages are all natural extension paths.
|
||||||
|
|
||||||
What matters most is not only the number of entries, but the quality, consistency, and operational usefulness of the lexical resource being added.
|
What matters most is not only the number of entries, but the quality, consistency, maintainability, and operational usefulness of the lexical resource being added.
|
||||||
|
|
||||||
|
## Related API surface
|
||||||
|
|
||||||
|
The following types are typically involved when working with bundled dictionaries:
|
||||||
|
|
||||||
|
- `StemmerPatchTrieLoader`
|
||||||
|
- `StemmerPatchTrieLoader.Language`
|
||||||
|
- `FrequencyTrie`
|
||||||
|
- `PatchCommandEncoder`
|
||||||
|
- `WordTraversalDirection`
|
||||||
|
- `ReductionMode`
|
||||||
|
- `ReductionSettings`
|
||||||
|
- `StemmerPatchTrieBinaryIO`
|
||||||
|
- `FrequencyTrieBuilders`
|
||||||
|
|
||||||
## Next steps
|
## Next steps
|
||||||
|
|
||||||
@@ -219,4 +253,4 @@ What matters most is not only the number of entries, but the quality, consistenc
|
|||||||
|
|
||||||
## Summary
|
## Summary
|
||||||
|
|
||||||
Radixor’s built-in language support provides immediate usability, practical default dictionaries, and a strong starting point for custom refinement. The current bundled resources follow a pragmatic normalization convention, while the underlying architecture remains well suited to richer language resources and future extensions.
|
Radixor’s built-in language support provides immediate usability, a well-defined baseline API, and a practical starting point for custom refinement. The bundled set now includes both left-to-right and right-to-left languages, and the library models that distinction explicitly through `WordTraversalDirection` so that trie construction, lookup, and patch application remain consistent.
|
||||||
|
|||||||
@@ -8,8 +8,8 @@ This is the preferred preparation workflow when stemming should run against an a
|
|||||||
|
|
||||||
The `Compile` tool performs the following steps:
|
The `Compile` tool performs the following steps:
|
||||||
|
|
||||||
1. reads the input dictionary in the standard Radixor stemmer format,
|
1. reads the input dictionary in the standard Radixor stemmer format, accepting either plain UTF-8 text or GZip-compressed UTF-8 text,
|
||||||
2. parses each line into a canonical stem and its known variants,
|
2. parses each line into a canonical stem column and its known variant columns,
|
||||||
3. converts variants into patch commands,
|
3. converts variants into patch commands,
|
||||||
4. builds a mutable trie of patch-command values,
|
4. builds a mutable trie of patch-command values,
|
||||||
5. applies the configured reduction mode,
|
5. applies the configured reduction mode,
|
||||||
@@ -21,9 +21,10 @@ This workflow is intentionally aligned with the same dictionary semantics used e
|
|||||||
|
|
||||||
```bash
|
```bash
|
||||||
java org.egothor.stemmer.Compile \
|
java org.egothor.stemmer.Compile \
|
||||||
--input ./data/stemmer.txt \
|
--input ./data/stemmer.tsv \
|
||||||
--output ./build/english.radixor.gz \
|
--output ./build/english.radixor.gz \
|
||||||
--reduction-mode MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS \
|
--reduction-mode MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS \
|
||||||
|
--case-processing-mode LOWERCASE_WITH_LOCALE_ROOT \
|
||||||
--store-original \
|
--store-original \
|
||||||
--overwrite
|
--overwrite
|
||||||
```
|
```
|
||||||
@@ -37,6 +38,8 @@ The CLI supports the following arguments:
|
|||||||
--output <file>
|
--output <file>
|
||||||
--reduction-mode <mode>
|
--reduction-mode <mode>
|
||||||
[--store-original]
|
[--store-original]
|
||||||
|
[--right-to-left]
|
||||||
|
[--case-processing-mode <mode>]
|
||||||
[--dominant-winner-min-percent <1..100>]
|
[--dominant-winner-min-percent <1..100>]
|
||||||
[--dominant-winner-over-second-ratio <1..n>]
|
[--dominant-winner-over-second-ratio <1..n>]
|
||||||
[--overwrite]
|
[--overwrite]
|
||||||
@@ -47,12 +50,12 @@ The CLI supports the following arguments:
|
|||||||
|
|
||||||
Path to the source dictionary file.
|
Path to the source dictionary file.
|
||||||
|
|
||||||
The file must use the standard line-oriented dictionary format. Each non-empty logical line starts with the canonical stem and may contain zero or more variants. The parser expects UTF-8 input, lowercases it using `Locale.ROOT`, and ignores trailing remarks introduced by `#` or `//`.
|
The file must use the standard line-oriented tab-separated values dictionary format, meaning that columns are separated by the tab character. Each non-empty logical line starts with the canonical stem column and may contain zero or more variant columns. The input may be plain UTF-8 text or GZip-compressed UTF-8 text; compression is detected from the stream header rather than the file extension. The parser processes case according to `CaseProcessingMode` (default: `LOWERCASE_WITH_LOCALE_ROOT`), ignores trailing remarks introduced by `#` or `//`, and currently ignores dictionary items containing embedded whitespace while reporting them through warning-level log entries.
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
|
|
||||||
```text
|
```text
|
||||||
--input ./data/stemmer.txt
|
--input ./data/stemmer.tsv
|
||||||
```
|
```
|
||||||
|
|
||||||
### `--output <file>`
|
### `--output <file>`
|
||||||
@@ -95,6 +98,31 @@ When this flag is present, the canonical stem itself is inserted using the no-op
|
|||||||
|
|
||||||
This is usually a sensible default for real dictionaries because it ensures that canonical forms are directly representable in the compiled trie rather than relying only on their variants.
|
This is usually a sensible default for real dictionaries because it ensures that canonical forms are directly representable in the compiled trie rather than relying only on their variants.
|
||||||
|
|
||||||
|
### `--right-to-left`
|
||||||
|
|
||||||
|
When present, compilation uses forward traversal (`WordTraversalDirection.FORWARD`) so stored forms are processed from their logical beginning.
|
||||||
|
|
||||||
|
```text
|
||||||
|
--right-to-left
|
||||||
|
```
|
||||||
|
|
||||||
|
This option is intended for right-to-left languages where affix behavior should operate on the written form without externally reversing words.
|
||||||
|
|
||||||
|
### `--case-processing-mode <mode>`
|
||||||
|
|
||||||
|
Controls dictionary key normalization during compilation and lookup. The setting is stored in persisted trie metadata and is therefore available to runtime lookup after binary loading.
|
||||||
|
|
||||||
|
Supported values are:
|
||||||
|
|
||||||
|
- `LOWERCASE_WITH_LOCALE_ROOT` (default)
|
||||||
|
- `AS_IS`
|
||||||
|
|
||||||
|
Example:
|
||||||
|
|
||||||
|
```text
|
||||||
|
--case-processing-mode AS_IS
|
||||||
|
```
|
||||||
|
|
||||||
### `--dominant-winner-min-percent <1..100>`
|
### `--dominant-winner-min-percent <1..100>`
|
||||||
|
|
||||||
Sets the minimum winner percentage used by dominant-result reduction settings.
|
Sets the minimum winner percentage used by dominant-result reduction settings.
|
||||||
@@ -177,6 +205,8 @@ The CLI is best used as a preparation step during packaging, deployment, or cont
|
|||||||
|
|
||||||
A `.radixor.gz` file should be handled as a versioned output artifact. It represents a specific dictionary state, a specific reduction mode, and, where relevant, specific dominant-result thresholds.
|
A `.radixor.gz` file should be handled as a versioned output artifact. It represents a specific dictionary state, a specific reduction mode, and, where relevant, specific dominant-result thresholds.
|
||||||
|
|
||||||
|
Compiled tries also persist a human-readable metadata block (`key=value` lines) that includes the format version, traversal direction, RTL indicator, reduction mode, dominant-result thresholds, diacritic-processing mode, and case-processing mode. After decompression, you can inspect this block directly to identify which dictionary and trie configuration the artifact contains. The current CLI uses `DiacriticProcessingMode.AS_IS`; custom diacritic stripping is available through the programmatic builder and loader APIs rather than through a CLI flag.
|
||||||
|
|
||||||
### Choose reduction mode deliberately
|
### Choose reduction mode deliberately
|
||||||
|
|
||||||
The ranked `getAll()` mode is the safest default. The unordered and dominant modes should be chosen only when their trade-offs are acceptable for the consuming application.
|
The ranked `getAll()` mode is the safest default. The unordered and dominant modes should be chosen only when their trade-offs are acceptable for the consuming application.
|
||||||
@@ -190,15 +220,15 @@ Compilation is usually a one-time step and is generally fast. The more important
|
|||||||
### 1. Prepare a dictionary
|
### 1. Prepare a dictionary
|
||||||
|
|
||||||
```text
|
```text
|
||||||
run running runs ran
|
run running runs ran
|
||||||
connect connected connecting
|
connect connected connecting
|
||||||
```
|
```
|
||||||
|
|
||||||
### 2. Compile it
|
### 2. Compile it
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
java org.egothor.stemmer.Compile \
|
java org.egothor.stemmer.Compile \
|
||||||
--input ./data/stemmer.txt \
|
--input ./data/stemmer.tsv \
|
||||||
--output ./build/english.radixor.gz \
|
--output ./build/english.radixor.gz \
|
||||||
--reduction-mode MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS \
|
--reduction-mode MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS \
|
||||||
--store-original
|
--store-original
|
||||||
|
|||||||
@@ -22,27 +22,29 @@ In practice, dictionary quality matters more than dictionary size. A smaller but
|
|||||||
|
|
||||||
## Preferred dictionary shape
|
## Preferred dictionary shape
|
||||||
|
|
||||||
Radixor uses a simple line-oriented format:
|
Radixor uses a simple line-oriented tab-separated values format, meaning that columns are separated by the tab character:
|
||||||
|
|
||||||
```text
|
```text
|
||||||
<stem> <variant1> <variant2> <variant3> ...
|
<stem> <variant1> <variant2> <variant3> ...
|
||||||
```
|
```
|
||||||
|
|
||||||
The first token on a line is the canonical stem. All following tokens on that line are known variants that should reduce to that stem.
|
The first column on a line is the canonical stem. All following tab-separated columns on that line are known variants that should reduce to that stem.
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
|
|
||||||
```text
|
```text
|
||||||
run running runs ran
|
run running runs ran
|
||||||
connect connected connecting connection
|
connect connected connecting connection
|
||||||
```
|
```
|
||||||
|
|
||||||
The parser:
|
The parser:
|
||||||
|
|
||||||
- reads UTF-8 text,
|
- reads UTF-8 text,
|
||||||
- normalizes input to lower case using `Locale.ROOT`,
|
- interprets each line as tab-separated values,
|
||||||
|
- applies configurable case processing through `CaseProcessingMode` (default: `LOWERCASE_WITH_LOCALE_ROOT`),
|
||||||
- ignores empty lines,
|
- ignores empty lines,
|
||||||
- supports remarks introduced by `#` or `//`.
|
- supports remarks introduced by `#` or `//`,
|
||||||
|
- currently ignores dictionary items containing embedded whitespace and reports them through warning-level log entries.
|
||||||
|
|
||||||
For full format details, see [Dictionary format](dictionary-format.md).
|
For full format details, see [Dictionary format](dictionary-format.md).
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
# Dictionary Format
|
# Dictionary Format
|
||||||
|
|
||||||
Radixor uses a simple line-oriented dictionary format designed for practical stemming workflows.
|
Radixor uses a simple line-oriented dictionary format designed for practical stemming workflows. The textual source format is tab-separated values, meaning that columns are separated by the tab character.
|
||||||
|
|
||||||
Each logical line describes one canonical stem and zero or more known word variants that should reduce to that stem. The format is intentionally lightweight, easy to maintain in source control, and directly consumable both by the programmatic loader and by the CLI compiler.
|
Each logical line describes one canonical stem and zero or more known word variants that should reduce to that stem. The format is intentionally lightweight, easy to maintain in source control, and directly consumable both by the programmatic loader and by the CLI compiler.
|
||||||
|
|
||||||
@@ -9,16 +9,16 @@ Each logical line describes one canonical stem and zero or more known word varia
|
|||||||
Each non-empty logical line has the following shape:
|
Each non-empty logical line has the following shape:
|
||||||
|
|
||||||
```text
|
```text
|
||||||
<stem> <variant1> <variant2> <variant3> ...
|
<stem> <variant1> <variant2> <variant3> ...
|
||||||
```
|
```
|
||||||
|
|
||||||
The first token is interpreted as the **canonical stem**. Every following token on the same line is interpreted as a **known variant** belonging to that stem.
|
The first column is interpreted as the **canonical stem**. Every following tab-separated column on the same line is interpreted as a **known variant** belonging to that stem.
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
|
|
||||||
```text
|
```text
|
||||||
run running runs ran
|
run running runs ran
|
||||||
connect connected connecting connection
|
connect connected connecting connection
|
||||||
```
|
```
|
||||||
|
|
||||||
In this example:
|
In this example:
|
||||||
@@ -30,7 +30,7 @@ In this example:
|
|||||||
|
|
||||||
When a dictionary is loaded through `StemmerPatchTrieLoader`, the loader processes each parsed line as follows:
|
When a dictionary is loaded through `StemmerPatchTrieLoader`, the loader processes each parsed line as follows:
|
||||||
|
|
||||||
1. the first token becomes the canonical stem,
|
1. the first column becomes the canonical stem,
|
||||||
2. every following token is treated as a variant,
|
2. every following column is treated as a variant,
|
||||||
3. each variant is converted into a patch command that transforms the variant into the stem,
|
3. each variant is converted into a patch command that transforms the variant into the stem,
|
||||||
4. if `storeOriginal` is enabled, the stem itself is also inserted using the canonical no-op patch command.
|
4. if `storeOriginal` is enabled, the stem itself is also inserted using the canonical no-op patch command.
|
||||||
@@ -52,21 +52,23 @@ Whether such a line is operationally useful depends on how the dictionary is loa
|
|||||||
- if `storeOriginal` is enabled, the stem itself is inserted as a no-op mapping,
|
- if `storeOriginal` is enabled, the stem itself is inserted as a no-op mapping,
|
||||||
- if `storeOriginal` is disabled, the line contributes no explicit variant mappings.
|
- if `storeOriginal` is disabled, the line contributes no explicit variant mappings.
|
||||||
|
|
||||||
## Whitespace rules
|
## Column and whitespace rules
|
||||||
|
|
||||||
Tokens are separated by whitespace. Leading and trailing whitespace is ignored.
|
Columns are separated by the tab character. Leading and trailing whitespace around each column is ignored.
|
||||||
|
|
||||||
These lines are equivalent:
|
This is the canonical form:
|
||||||
|
|
||||||
```text
|
```text
|
||||||
run running runs ran
|
run running runs ran
|
||||||
```
|
```
|
||||||
|
|
||||||
|
This is also accepted because the surrounding padding is removed before the item is processed:
|
||||||
|
|
||||||
```text
|
```text
|
||||||
run running runs ran
|
run running runs ran
|
||||||
```
|
```
|
||||||
|
|
||||||
Tabs and repeated spaces are both accepted because tokenization is whitespace-based.
|
Embedded whitespace inside one dictionary item is currently not supported. A stem or variant such as `new york` therefore cannot yet be represented as one usable dictionary item in the textual source format. Such items are ignored during parsing and reported through a warning-level log entry together with the physical line number, the stem, and the ignored items from that line.
|
||||||
|
|
||||||
## Empty lines
|
## Empty lines
|
||||||
|
|
||||||
@@ -75,9 +77,9 @@ Empty lines are ignored.
|
|||||||
Example:
|
Example:
|
||||||
|
|
||||||
```text
|
```text
|
||||||
run running runs ran
|
run running runs ran
|
||||||
|
|
||||||
connect connected connecting
|
connect connected connecting
|
||||||
```
|
```
|
||||||
|
|
||||||
The blank line between entries has no effect.
|
The blank line between entries has no effect.
|
||||||
@@ -96,8 +98,8 @@ The earliest occurrence of either marker terminates the logical content of the l
|
|||||||
Examples:
|
Examples:
|
||||||
|
|
||||||
```text
|
```text
|
||||||
run running runs ran # English verb forms
|
run running runs ran # English verb forms
|
||||||
connect connected connecting // Common derived forms
|
connect connected connecting // Common derived forms
|
||||||
```
|
```
|
||||||
|
|
||||||
This is also valid:
|
This is also valid:
|
||||||
@@ -109,31 +111,37 @@ This is also valid:
|
|||||||
|
|
||||||
## Case normalization
|
## Case normalization
|
||||||
|
|
||||||
Input lines are normalized to lower case using `Locale.ROOT` before tokenization is processed into dictionary entries.
|
Input-line case normalization is controlled by `CaseProcessingMode`; by default the parser uses `LOWERCASE_WITH_LOCALE_ROOT` before tab-separated columns are processed into dictionary entries.
|
||||||
|
|
||||||
That means dictionary authors should treat the format as **case-insensitive at load time**. If a file contains uppercase or mixed-case tokens, they will be normalized during parsing.
|
Under the default mode, this means dictionary authors should treat the format as **case-insensitive at load time**. If a file contains uppercase or mixed-case tokens, they will be normalized during parsing unless `AS_IS` case processing is selected.
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
|
|
||||||
```text
|
```text
|
||||||
Run Running Runs Ran
|
Run Running Runs Ran
|
||||||
```
|
```
|
||||||
|
|
||||||
is processed the same way as:
|
is processed the same way as:
|
||||||
|
|
||||||
```text
|
```text
|
||||||
run running runs ran
|
run running runs ran
|
||||||
```
|
```
|
||||||
|
|
||||||
## Character set and practical convention
|
## Character set, compression, and normalization
|
||||||
|
|
||||||
Dictionary files are read as UTF-8 text.
|
Dictionary files are read as UTF-8 text. Files loaded through `StemmerPatchTrieLoader.load(Path, ...)` may be either plain UTF-8 text or GZip-compressed UTF-8 text; the loader detects GZip input from the stream header instead of relying on the file extension. Bundled dictionaries are stored as GZip resources and are decoded as UTF-8 after decompression.
|
||||||
|
|
||||||
From the perspective of the parser and the stemming algorithm, the format is not restricted to plain ASCII tokens. The parser accepts ordinary Java `String` data, and the trie itself works with general character sequences rather than with an ASCII-only internal model. In principle, this means the system could process diacritic and non-diacritic forms alike, and it could also store forms with inconsistently used diacritics.
|
The parser and trie are not restricted to ASCII. Dictionary items are ordinary Java `String` values, and trie traversal works over Java `char` sequences. This supports Latin-script data with diacritics, Cyrillic data, Hebrew, Persian, Yiddish, and other scripts represented in UTF-8, subject to the normal Java `String` model and the project’s traversal configuration.
|
||||||
|
|
||||||
In practice, however, the format is currently best understood as **primarily intended for classical basic ASCII lexical input**, especially in the traditional stemming style where language data is normalized into plain characters in the ASCII range up to character code 127. This convention is particularly relevant for languages whose original orthography includes diacritics but whose stemming dictionaries are commonly maintained in normalized non-diacritic form.
|
Case normalization is controlled by `CaseProcessingMode`. The default `LOWERCASE_WITH_LOCALE_ROOT` mode lowercases the line before columns are split into dictionary items. `AS_IS` preserves the original casing.
|
||||||
|
|
||||||
Future versions may expand the documentation and operational guidance for dictionaries that intentionally preserve diacritics. At present, that workflow is not the primary documented use case, not because the algorithm fundamentally forbids it, but because a concrete project requirement for such support has not yet emerged.
|
Diacritic normalization is controlled at trie-build and lookup time by `DiacriticProcessingMode`:
|
||||||
|
|
||||||
|
- `AS_IS` preserves dictionary and lookup keys exactly after case handling,
|
||||||
|
- `REMOVE` strips supported diacritics and common Latin ligatures on both insertion and lookup paths,
|
||||||
|
- `AS_IS_AND_STRIPPED_FALLBACK` is declared in the public model but is not implemented yet and raises `UnsupportedOperationException`.
|
||||||
|
|
||||||
|
For reliable production behavior, choose one normalization policy deliberately and apply it consistently. Normalized ASCII dictionaries remain a practical convention for some legacy stemming data, but they are not a format requirement.
|
||||||
|
|
||||||
## Distinct stem and variant semantics
|
## Distinct stem and variant semantics
|
||||||
|
|
||||||
@@ -142,8 +150,8 @@ The format expresses a one-line grouping of forms under a canonical stem. It doe
|
|||||||
For example:
|
For example:
|
||||||
|
|
||||||
```text
|
```text
|
||||||
axis axes
|
axis axes
|
||||||
axe axes
|
axe axes
|
||||||
```
|
```
|
||||||
|
|
||||||
These are simply two independent lines. If both contribute mappings for the same surface form, the compiled trie may later expose one or more candidate patch commands depending on the accumulated local counts and the selected reduction mode.
|
These are simply two independent lines. If both contribute mappings for the same surface form, the compiled trie may later expose one or more candidate patch commands depending on the accumulated local counts and the selected reduction mode.
|
||||||
@@ -163,35 +171,35 @@ As a result, repeating the same mapping is not just redundant text. It can influ
|
|||||||
### Simple English example
|
### Simple English example
|
||||||
|
|
||||||
```text
|
```text
|
||||||
run running runs ran
|
run running runs ran
|
||||||
connect connected connecting connection
|
connect connected connecting connection
|
||||||
build building builds built
|
build building builds built
|
||||||
```
|
```
|
||||||
|
|
||||||
### Dictionary with remarks
|
### Dictionary with remarks
|
||||||
|
|
||||||
```text
|
```text
|
||||||
run running runs ran # canonical verb family
|
run running runs ran # canonical verb family
|
||||||
connect connected connecting // derived forms
|
connect connected connecting // derived forms
|
||||||
build building builds built
|
build building builds built
|
||||||
```
|
```
|
||||||
|
|
||||||
### Stem-only entries
|
### Stem-only entries
|
||||||
|
|
||||||
```text
|
```text
|
||||||
run
|
run
|
||||||
connect connected connecting
|
connect connected connecting
|
||||||
build
|
build
|
||||||
```
|
```
|
||||||
|
|
||||||
### Mixed case input
|
### Mixed case input
|
||||||
|
|
||||||
```text
|
```text
|
||||||
Run Running Runs Ran
|
Run Running Runs Ran
|
||||||
CONNECT Connected Connecting
|
CONNECT Connected Connecting
|
||||||
```
|
```
|
||||||
|
|
||||||
This is accepted, but it is normalized to lower case during parsing.
|
This is accepted. Under the default `LOWERCASE_WITH_LOCALE_ROOT` mode it is normalized to lower case during parsing; under `AS_IS` it is preserved.
|
||||||
|
|
||||||
## Format limitations
|
## Format limitations
|
||||||
|
|
||||||
@@ -204,7 +212,7 @@ The current dictionary format intentionally stays minimal:
|
|||||||
- no explicit ambiguity syntax,
|
- no explicit ambiguity syntax,
|
||||||
- no sectioning or nested structure.
|
- no sectioning or nested structure.
|
||||||
|
|
||||||
Each token is simply a whitespace-delimited word form after remark stripping and lowercasing.
|
Each dictionary item is simply one tab-separated word form after remark stripping and the configured case and diacritic normalization.
|
||||||
|
|
||||||
## Authoring guidance
|
## Authoring guidance
|
||||||
|
|
||||||
@@ -216,7 +224,7 @@ For reliable results, keep dictionaries:
|
|||||||
- encoded in UTF-8,
|
- encoded in UTF-8,
|
||||||
- easy to audit in plain text form.
|
- easy to audit in plain text form.
|
||||||
|
|
||||||
For most current deployments, it is sensible to keep dictionary content in normalized basic ASCII form unless there is a clear requirement to preserve diacritics end-to-end.
|
For most deployments, it is sensible to choose either preserved UTF-8 forms or a normalized ASCII/diacritic-stripped convention and keep that choice consistent across dictionary authoring, compilation, and runtime lookup.
|
||||||
|
|
||||||
## Relationship to other documentation
|
## Relationship to other documentation
|
||||||
|
|
||||||
|
|||||||
193
docs/lookup-edge-optimization.md
Normal file
193
docs/lookup-edge-optimization.md
Normal file
@@ -0,0 +1,193 @@
|
|||||||
|
# Lookup Edge Optimization
|
||||||
|
|
||||||
|
Compiled trie nodes (`CompiledNode`) use three lookup strategies when resolving child edges:
|
||||||
|
|
||||||
|
1. dense array direct lookup,
|
||||||
|
2. linear scan for very small child counts,
|
||||||
|
3. binary search over sorted edge labels.
|
||||||
|
|
||||||
|
This page explains the dense path, what `maxExpandedIndex` controls, and how to tune it.
|
||||||
|
|
||||||
|
## Runtime model of one node
|
||||||
|
|
||||||
|
For a node with sorted edge labels `char[] edges`, the implementation can materialize an
|
||||||
|
index-aligned dense table when labels occupy a small compact code-point interval:
|
||||||
|
|
||||||
|
```text
|
||||||
|
span = maxEdge - minEdge
|
||||||
|
use dense table iff (span <= maxExpandedIndex) and (maxExpandedIndex > 0)
|
||||||
|
```
|
||||||
|
|
||||||
|
When dense lookup is used, lookup is constant-time indexing:
|
||||||
|
|
||||||
|
```text
|
||||||
|
denseIndex = requestedEdge - minEdge
|
||||||
|
return denseChildren[denseIndex] // or null if outside interval
|
||||||
|
```
|
||||||
|
|
||||||
|
When dense lookup is not active (interval is too wide or the configured
|
||||||
|
`maxExpandedIndex` is `0`), `CompiledNode` still chooses between two fallback
|
||||||
|
strategies:
|
||||||
|
|
||||||
|
- **linear scan** for very small child counts (`4` or fewer children),
|
||||||
|
- **binary search** for larger child counts.
|
||||||
|
|
||||||
|
This means the fallback method is selected by child count, not by “distance” alone.
|
||||||
|
`linear scan` is therefore used when there are only a few edges even if those edges are
|
||||||
|
spread across very distant code points.
|
||||||
|
|
||||||
|
### Example: few edges, wide Unicode span
|
||||||
|
|
||||||
|
```text
|
||||||
|
edges = ['a', '中', '你']
|
||||||
|
edge count = 3
|
||||||
|
minEdge = 'a' (U+0061)
|
||||||
|
maxEdge = '你' (U+4F60)
|
||||||
|
span = 20223
|
||||||
|
```
|
||||||
|
|
||||||
|
- If `maxExpandedIndex = 512`, dense indexing is not used because `span > maxExpandedIndex`.
|
||||||
|
- Because `edge count = 3` (<= 4), lookup falls back to a tiny linear scan of the
|
||||||
|
three labels.
|
||||||
|
- This is exactly the case where the small-child-count linear-scan threshold still helps, even though the interval is too wide for a dense table.
|
||||||
|
|
||||||
|
This is useful for non-Latin scripts as well: what matters is interval width in Unicode
|
||||||
|
code points, not script name. A compact Arabic-range block can still benefit from dense
|
||||||
|
lookups when keys stay in a tight code-point interval.
|
||||||
|
|
||||||
|
## Why this is configurable
|
||||||
|
|
||||||
|
`maxExpandedIndex` is purely a performance/memory trade-off:
|
||||||
|
|
||||||
|
- higher value:
|
||||||
|
- more (and wider) label intervals qualify for dense tables,
|
||||||
|
- more constant-time child lookup,
|
||||||
|
- more memory for dense tables in qualifying nodes.
|
||||||
|
- lower value (or `0`):
|
||||||
|
- less dense-table allocation,
|
||||||
|
- fewer lookups take the constant-time path,
|
||||||
|
- lower materialization memory.
|
||||||
|
|
||||||
|
The value never changes lookup semantics. It only changes the in-memory structure shape.
|
||||||
|
|
||||||
|
## Persistence and loading model
|
||||||
|
|
||||||
|
This threshold is **not** stored in `TrieMetadata`.
|
||||||
|
|
||||||
|
- The binary format stores only trie payload and semantic metadata (`reduction`, `traversal`,
|
||||||
|
case/diacritic settings, and stream version).
|
||||||
|
- `maxExpandedIndex` is chosen when materializing nodes in memory.
|
||||||
|
- You can therefore keep one persisted artifact and load it with different in-memory
|
||||||
|
trade-offs depending on deployment constraints.
|
||||||
|
|
||||||
|
## Default
|
||||||
|
|
||||||
|
- `FrequencyTrie.DEFAULT_MAX_EXPANDED_INDEX == 512`
|
||||||
|
- `CompiledNode.DEFAULT_MAX_EXPANDED_INDEX == 512`
|
||||||
|
|
||||||
|
These are practical defaults for mixed-language text and Latin-like scripts where edge labels
|
||||||
|
often cluster.
|
||||||
|
|
||||||
|
## Tune during build (writable phase)
|
||||||
|
|
||||||
|
Use the full `FrequencyTrie.Builder` constructor when you are compiling from source data.
|
||||||
|
The builder threshold is applied while freezing reduced nodes into the immutable form.
|
||||||
|
|
||||||
|
```java
|
||||||
|
import org.egothor.stemmer.CaseProcessingMode;
|
||||||
|
import org.egothor.stemmer.DiacriticProcessingMode;
|
||||||
|
import org.egothor.stemmer.FrequencyTrie;
|
||||||
|
import org.egothor.stemmer.ReductionMode;
|
||||||
|
import org.egothor.stemmer.ReductionSettings;
|
||||||
|
import org.egothor.stemmer.WordTraversalDirection;
|
||||||
|
|
||||||
|
final ReductionSettings settings = ReductionSettings.withDefaults(
|
||||||
|
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
|
||||||
|
|
||||||
|
final FrequencyTrie.Builder<String> fastBuilder =
|
||||||
|
new FrequencyTrie.Builder<>(String[]::new,
|
||||||
|
settings,
|
||||||
|
WordTraversalDirection.BACKWARD,
|
||||||
|
CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT,
|
||||||
|
DiacriticProcessingMode.AS_IS,
|
||||||
|
1024); // prefer lookup speed
|
||||||
|
|
||||||
|
// ... put(...) ...
|
||||||
|
final FrequencyTrie<String> trie = fastBuilder.build();
|
||||||
|
```
|
||||||
|
|
||||||
|
Use a smaller value such as `256`, or `0` to disable dense tables entirely, for a lower memory profile when building larger tries.
|
||||||
|
|
||||||
|
```java
|
||||||
|
final FrequencyTrie.Builder<String> compactBuilder =
|
||||||
|
new FrequencyTrie.Builder<>(String[]::new,
|
||||||
|
settings,
|
||||||
|
WordTraversalDirection.BACKWARD,
|
||||||
|
CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT,
|
||||||
|
DiacriticProcessingMode.AS_IS,
|
||||||
|
256); // lower memory profile
|
||||||
|
```
|
||||||
|
|
||||||
|
## Tune when loading a binary artifact (runtime phase)
|
||||||
|
|
||||||
|
At artifact load time, you can tune the same trade-off independently of persisted metadata.
|
||||||
|
|
||||||
|
```java
|
||||||
|
import java.nio.file.Path;
|
||||||
|
|
||||||
|
import org.egothor.stemmer.StemmerPatchTrieLoader;
|
||||||
|
|
||||||
|
var defaultLookup = StemmerPatchTrieLoader.loadBinary(
|
||||||
|
Path.of("stemmers", "english.radixor.gz"));
|
||||||
|
|
||||||
|
var fastLookup = StemmerPatchTrieLoader.loadBinary(
|
||||||
|
Path.of("stemmers", "english.radixor.gz"), 1024);
|
||||||
|
|
||||||
|
var compactLookup = StemmerPatchTrieLoader.loadBinary(
|
||||||
|
Path.of("stemmers", "english.radixor.gz"), 0);
|
||||||
|
```
|
||||||
|
|
||||||
|
You can also set the threshold directly with `FrequencyTrie.readFrom(...)` when reading streams:
|
||||||
|
|
||||||
|
```java
|
||||||
|
import java.io.DataInputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.util.zip.GZIPInputStream;
|
||||||
|
|
||||||
|
import org.egothor.stemmer.FrequencyTrie;
|
||||||
|
|
||||||
|
public final class StreamLoadExample {
|
||||||
|
|
||||||
|
private StreamLoadExample() {
|
||||||
|
throw new AssertionError("No instances.");
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void main(final String[] arguments) throws IOException {
|
||||||
|
try (InputStream fileInput = Files.newInputStream(Path.of("stemmers", "english.radixor.gz"));
|
||||||
|
GZIPInputStream gzip = new GZIPInputStream(fileInput);
|
||||||
|
DataInputStream dataInput = new DataInputStream(gzip)) {
|
||||||
|
final FrequencyTrie<String> compactOnLoad = FrequencyTrie.readFrom(
|
||||||
|
dataInput,
|
||||||
|
String[]::new,
|
||||||
|
input -> input.readUTF(),
|
||||||
|
256);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Note: the string codec is intentionally inline in this snippet to keep it self-contained.
|
||||||
|
|
||||||
|
## Practical guidance
|
||||||
|
|
||||||
|
- Start with default (`512`) in production and profile before changing it.
|
||||||
|
- Use `0` when memory is the priority and query throughput is not the bottleneck.
|
||||||
|
- Use values around `1024` for workloads dominated by compact alphabets and very hot lookups.
|
||||||
|
|
||||||
|
Trade-off expectation:
|
||||||
|
|
||||||
|
- increasing `maxExpandedIndex` improves lookup speed when edges tend to occupy short spans,
|
||||||
|
- decreasing it reduces per-node auxiliary memory in dense-span nodes.
|
||||||
@@ -87,3 +87,20 @@ This model works especially well when domain-specific extensions are added in la
|
|||||||
|
|
||||||
- [Loading and Building Stemmers](programmatic-loading-and-building.md)
|
- [Loading and Building Stemmers](programmatic-loading-and-building.md)
|
||||||
- [Querying and Ambiguity Handling](programmatic-querying-and-ambiguity.md)
|
- [Querying and Ambiguity Handling](programmatic-querying-and-ambiguity.md)
|
||||||
|
|
||||||
|
|
||||||
|
## Inspecting persisted metadata
|
||||||
|
|
||||||
|
After loading a compiled artifact, applications can inspect the persisted build descriptor directly:
|
||||||
|
|
||||||
|
```java
|
||||||
|
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.loadBinary("build/stemmers/cs_cz.dat.gz");
|
||||||
|
final TrieMetadata metadata = trie.metadata();
|
||||||
|
|
||||||
|
System.out.println(metadata.formatVersion());
|
||||||
|
System.out.println(metadata.traversalDirection());
|
||||||
|
System.out.println(metadata.reductionSettings().reductionMode());
|
||||||
|
System.out.println(metadata.diacriticProcessingMode());
|
||||||
|
```
|
||||||
|
|
||||||
|
This is especially useful when a deployment manages multiple artifacts compiled under different traversal or reduction regimes.
|
||||||
|
|||||||
@@ -21,7 +21,7 @@ public final class BundledLanguageExample {
|
|||||||
|
|
||||||
public static void main(final String[] arguments) throws IOException {
|
public static void main(final String[] arguments) throws IOException {
|
||||||
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(
|
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(
|
||||||
StemmerPatchTrieLoader.Language.US_UK_PROFI,
|
StemmerPatchTrieLoader.Language.US_UK,
|
||||||
true,
|
true,
|
||||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
|
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
|
||||||
}
|
}
|
||||||
@@ -32,7 +32,7 @@ The `storeOriginal` flag controls whether the canonical stem is inserted as a no
|
|||||||
|
|
||||||
## Load a textual dictionary
|
## Load a textual dictionary
|
||||||
|
|
||||||
Loading from a dictionary file follows the same preparation model as bundled resources, but the source comes from your own file or path. Each non-empty logical line starts with the stem and may contain zero or more variants. Input is normalized to lower case using `Locale.ROOT`, and trailing remarks introduced by `#` or `//` are ignored.
|
Loading from a dictionary file follows the same preparation model as bundled resources, but the source comes from your own file or path. The input may be plain UTF-8 text or GZip-compressed UTF-8 text; the loader detects GZip data from the stream header. The textual format is tab-separated values, meaning that columns are separated by the tab character. Each non-empty logical line starts with the stem column and may contain zero or more variant columns. Input case normalization is controlled by `CaseProcessingMode` (default: `LOWERCASE_WITH_LOCALE_ROOT`), trailing remarks introduced by `#` or `//` are ignored, and dictionary items containing embedded whitespace are currently ignored with warning-level diagnostics.
|
||||||
|
|
||||||
```java
|
```java
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
@@ -51,7 +51,7 @@ public final class LoadTextDictionaryExample {
|
|||||||
|
|
||||||
public static void main(final String[] arguments) throws IOException {
|
public static void main(final String[] arguments) throws IOException {
|
||||||
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(
|
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(
|
||||||
Path.of("data", "stemmer.txt"),
|
Path.of("data", "stemmer.tsv"),
|
||||||
true,
|
true,
|
||||||
ReductionSettings.withDefaults(
|
ReductionSettings.withDefaults(
|
||||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS));
|
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS));
|
||||||
@@ -59,6 +59,8 @@ public final class LoadTextDictionaryExample {
|
|||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Additional `StemmerPatchTrieLoader.load(...)` overloads let callers provide explicit `WordTraversalDirection`, `CaseProcessingMode`, `DiacriticProcessingMode`, or a complete `TrieMetadata` instance. Use those overloads when a custom dictionary must be compiled with forward traversal for right-to-left languages, case-sensitive keys, or diacritic stripping.
|
||||||
|
|
||||||
## Load a compiled binary artifact
|
## Load a compiled binary artifact
|
||||||
|
|
||||||
Binary loading is typically the preferred runtime path because it avoids reparsing the textual source and skips the preparation step entirely.
|
Binary loading is typically the preferred runtime path because it avoids reparsing the textual source and skips the preparation step entirely.
|
||||||
@@ -83,7 +85,44 @@ public final class LoadBinaryExample {
|
|||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
The binary format is the native `FrequencyTrie` serialization wrapped in GZip compression.
|
The binary format is the native `FrequencyTrie` serialization wrapped in GZip compression. It includes persisted `TrieMetadata`, so lookup after loading uses the traversal, case-processing, diacritic-processing, and reduction settings captured when the trie was compiled.
|
||||||
|
|
||||||
|
## Tune child lookup density when loading binaries
|
||||||
|
|
||||||
|
To optimize hot-path latency, you can tune direct child indexing by passing `maxExpandedIndex`
|
||||||
|
at load time. This does not change persisted metadata, only the materialized in-memory form.
|
||||||
|
|
||||||
|
```java
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
|
||||||
|
import org.egothor.stemmer.FrequencyTrie;
|
||||||
|
import org.egothor.stemmer.StemmerPatchTrieLoader;
|
||||||
|
|
||||||
|
public final class LoadBinaryWithDenseLookupExample {
|
||||||
|
|
||||||
|
private LoadBinaryWithDenseLookupExample() {
|
||||||
|
throw new AssertionError("No instances.");
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void main(final String[] arguments) throws IOException {
|
||||||
|
final FrequencyTrie<String> balanced = StemmerPatchTrieLoader.loadBinary(
|
||||||
|
Path.of("stemmers", "english.radixor.gz"));
|
||||||
|
|
||||||
|
final FrequencyTrie<String> fast = StemmerPatchTrieLoader.loadBinary(
|
||||||
|
Path.of("stemmers", "english.radixor.gz"),
|
||||||
|
1024);
|
||||||
|
|
||||||
|
final FrequencyTrie<String> compact = StemmerPatchTrieLoader.loadBinary(
|
||||||
|
Path.of("stemmers", "english.radixor.gz"),
|
||||||
|
0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Negative values still use `FrequencyTrie.DEFAULT_MAX_EXPANDED_INDEX`.
|
||||||
|
|
||||||
|
[Lookup Edge Optimization](lookup-edge-optimization.md) describes the trade-off in detail and also provides examples for build-time tuning.
|
||||||
|
|
||||||
## Build directly with a mutable builder
|
## Build directly with a mutable builder
|
||||||
|
|
||||||
@@ -108,7 +147,7 @@ public final class BuilderExample {
|
|||||||
final FrequencyTrie.Builder<String> builder =
|
final FrequencyTrie.Builder<String> builder =
|
||||||
new FrequencyTrie.Builder<>(String[]::new, settings);
|
new FrequencyTrie.Builder<>(String[]::new, settings);
|
||||||
|
|
||||||
final PatchCommandEncoder encoder = new PatchCommandEncoder();
|
final PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||||
|
|
||||||
builder.put("running", encoder.encode("running", "run"));
|
builder.put("running", encoder.encode("running", "run"));
|
||||||
builder.put("runs", encoder.encode("runs", "run"));
|
builder.put("runs", encoder.encode("runs", "run"));
|
||||||
|
|||||||
@@ -25,6 +25,7 @@ This is why Radixor can generalize beyond explicitly listed forms and why compil
|
|||||||
The programmatic API is easier to understand when split by developer task:
|
The programmatic API is easier to understand when split by developer task:
|
||||||
|
|
||||||
- [Loading and Building Stemmers](programmatic-loading-and-building.md) explains how to acquire a compiled stemmer from bundled resources, textual dictionaries, binary artifacts, or direct builder usage.
|
- [Loading and Building Stemmers](programmatic-loading-and-building.md) explains how to acquire a compiled stemmer from bundled resources, textual dictionaries, binary artifacts, or direct builder usage.
|
||||||
|
- [Lookup Edge Optimization](lookup-edge-optimization.md) explains dense child lookup tuning and the speed/memory trade-off when materializing compiled tries.
|
||||||
- [Querying and Ambiguity Handling](programmatic-querying-and-ambiguity.md) explains `get(...)`, `getAll(...)`, `getEntries(...)`, patch application, and the practical meaning of reduction modes.
|
- [Querying and Ambiguity Handling](programmatic-querying-and-ambiguity.md) explains `get(...)`, `getAll(...)`, `getEntries(...)`, patch application, and the practical meaning of reduction modes.
|
||||||
- [Extending and Persisting Compiled Tries](programmatic-extending-and-persistence.md) explains how to reopen compiled tries, add new lexical data, rebuild them, and store them as binary artifacts.
|
- [Extending and Persisting Compiled Tries](programmatic-extending-and-persistence.md) explains how to reopen compiled tries, add new lexical data, rebuild them, and store them as binary artifacts.
|
||||||
|
|
||||||
|
|||||||
@@ -58,6 +58,27 @@ A deterministic system is easier to test, easier to reason about, and safer to i
|
|||||||
|
|
||||||
The project is intended to maintain very high confidence in both core correctness and behavioral stability.
|
The project is intended to maintain very high confidence in both core correctness and behavioral stability.
|
||||||
|
|
||||||
|
The recommended execution strategy is defined by the tagged test profiles in [Test taxonomy and execution filtering](test-taxonomy-and-filtering.md). In practice, teams can execute profile tasks directly:
|
||||||
|
|
||||||
|
- `./gradlew ciSmoke`: fast local/PR safety checks (`unit`, excluding `slow`; additionally excludes
|
||||||
|
`CompileIntegrationTest` as a defensive safeguard).
|
||||||
|
- `./gradlew ciSlow`: enterprise heavy gate for all tests marked with `slow` (typically
|
||||||
|
production dictionary and large corpus verification). This should be used for scheduled/manual
|
||||||
|
hardening gates and not in the standard release build.
|
||||||
|
- `./gradlew ciCore`: behavioral coverage of trie and frequency-trie paths (`unit` + `property` where applicable)
|
||||||
|
- `./gradlew ciIntegration`: pipeline and CLI integration path checks
|
||||||
|
- `./gradlew ciCompat`: compatibility and regression verification for persisted artifacts
|
||||||
|
- `./gradlew ciRelease`: full non-slow suite for release-confidence runs (all test tags except `slow`,
|
||||||
|
plus explicit name-based exclusion of `CompileIntegrationTest*` and
|
||||||
|
`StemmerPatchTrieLoaderTest$BundledDictionaryTests*` as additional guardrails)
|
||||||
|
- `./gradlew ciNightly`: extended fuzz profile for robustness hardening
|
||||||
|
- `./gradlew ci`: umbrella profile depending on smoke/core/integration/compat
|
||||||
|
|
||||||
|
## Test taxonomy and execution filtering
|
||||||
|
|
||||||
|
The full tag taxonomy and executable filter examples are documented in
|
||||||
|
[Test taxonomy and execution filtering](test-taxonomy-and-filtering.md).
|
||||||
|
|
||||||
### Structural coverage
|
### Structural coverage
|
||||||
|
|
||||||
High code coverage is treated as a useful signal, but not as a sufficient goal on its own. Coverage is valuable only when the covered scenarios actually pressure the implementation in meaningful ways.
|
High code coverage is treated as a useful signal, but not as a sufficient goal on its own. Coverage is valuable only when the covered scenarios actually pressure the implementation in meaningful ways.
|
||||||
|
|||||||
@@ -15,7 +15,7 @@ A compiled stemmer can be obtained in three common ways.
|
|||||||
|
|
||||||
### Use a bundled language dictionary
|
### Use a bundled language dictionary
|
||||||
|
|
||||||
Radixor ships with bundled dictionaries for a set of supported languages. These resources are line-oriented dictionaries stored with the library and compiled into a `FrequencyTrie<String>` when loaded. The loader can also store the canonical stem itself as a no-op patch command.
|
Radixor ships with bundled dictionaries for a set of supported languages. These resources are line-oriented dictionaries stored with the library and compiled into a `FrequencyTrie<String>` when loaded. The loader can also store the canonical stem itself as a no-op patch command. Compiled trie artifacts now persist self-describing metadata, including the traversal direction and compilation reduction settings used to build the artifact.
|
||||||
|
|
||||||
```java
|
```java
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
@@ -32,7 +32,7 @@ public final class BundledStemmerExample {
|
|||||||
|
|
||||||
public static void main(final String[] arguments) throws IOException {
|
public static void main(final String[] arguments) throws IOException {
|
||||||
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(
|
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(
|
||||||
StemmerPatchTrieLoader.Language.US_UK_PROFI,
|
StemmerPatchTrieLoader.Language.US_UK,
|
||||||
true,
|
true,
|
||||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
|
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
|
||||||
|
|
||||||
@@ -67,9 +67,39 @@ public final class LoadBinaryStemmerExample {
|
|||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
You can tune in-memory child lookup density at load time without changing the artifact:
|
||||||
|
|
||||||
|
```java
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
|
||||||
|
import org.egothor.stemmer.FrequencyTrie;
|
||||||
|
import org.egothor.stemmer.StemmerPatchTrieLoader;
|
||||||
|
|
||||||
|
public final class LoadBinaryStemmerExampleTuned {
|
||||||
|
|
||||||
|
private LoadBinaryStemmerExampleTuned() {
|
||||||
|
throw new AssertionError("No instances.");
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void main(final String[] arguments) throws IOException {
|
||||||
|
final FrequencyTrie<String> fast = StemmerPatchTrieLoader.loadBinary(
|
||||||
|
Path.of("stemmers", "english.radixor.gz"),
|
||||||
|
1024);
|
||||||
|
final FrequencyTrie<String> compact = StemmerPatchTrieLoader.loadBinary(
|
||||||
|
Path.of("stemmers", "english.radixor.gz"),
|
||||||
|
128);
|
||||||
|
|
||||||
|
System.out.println("fast=" + fast.size() + ", compact=" + compact.size());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
For the trade-off details, see [Lookup Edge Optimization](lookup-edge-optimization.md).
|
||||||
|
|
||||||
### Build or extend a stemmer from dictionary data
|
### Build or extend a stemmer from dictionary data
|
||||||
|
|
||||||
Radixor can also build a compiled trie from a custom dictionary. Dictionary lines consist of a canonical stem followed by zero or more variants. The parser lowercases input with `Locale.ROOT`, ignores leading and trailing whitespace, and supports line remarks introduced by `#` or `//`.
|
Radixor can also build a compiled trie from a custom dictionary. Dictionary lines consist of a canonical stem followed by zero or more variants. The input may be plain UTF-8 text or GZip-compressed UTF-8 text when loaded from a filesystem path. The parser applies `CaseProcessingMode` (default: `LOWERCASE_WITH_LOCALE_ROOT`), ignores leading and trailing whitespace around columns, supports line remarks introduced by `#` or `//`, and skips dictionary items that contain embedded whitespace.
|
||||||
|
|
||||||
This path is also relevant when you extend an existing compiled stemmer with additional domain-specific entries and rebuild a new compact artifact.
|
This path is also relevant when you extend an existing compiled stemmer with additional domain-specific entries and rebuild a new compact artifact.
|
||||||
|
|
||||||
@@ -104,7 +134,7 @@ public final class SingleStemExample {
|
|||||||
|
|
||||||
public static void main(final String[] arguments) throws IOException {
|
public static void main(final String[] arguments) throws IOException {
|
||||||
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(
|
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(
|
||||||
StemmerPatchTrieLoader.Language.US_UK_PROFI,
|
StemmerPatchTrieLoader.Language.US_UK,
|
||||||
true,
|
true,
|
||||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
|
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
|
||||||
|
|
||||||
@@ -202,3 +232,10 @@ Dictionary compilation is usually a one-time preparation step and is generally f
|
|||||||
- [CLI compilation](cli-compilation.md)
|
- [CLI compilation](cli-compilation.md)
|
||||||
- [Built-in languages](built-in-languages.md)
|
- [Built-in languages](built-in-languages.md)
|
||||||
- [Architecture and reduction](architecture-and-reduction.md)
|
- [Architecture and reduction](architecture-and-reduction.md)
|
||||||
|
|
||||||
|
|
||||||
|
## Persisted trie metadata
|
||||||
|
|
||||||
|
Every compiled trie artifact stores a `TrieMetadata` descriptor together with the immutable trie payload. That metadata currently records the binary format version, the `WordTraversalDirection`, the `ReductionSettings` used during compilation, the declared `DiacriticProcessingMode`, and the selected `CaseProcessingMode`. Traversal, case processing, and diacritic processing are applied during runtime lookup (`get`, `getAll`), and case/diacritic processing are also applied during dictionary insertion when a trie is built.
|
||||||
|
|
||||||
|
`DiacriticProcessingMode.AS_IS` keeps dictionary keys and lookup keys unchanged. `DiacriticProcessingMode.REMOVE` strips diacritics from dictionary keys and lookup keys (for Czech diacritics and broad European Latin-script variants). `DiacriticProcessingMode.AS_IS_AND_STRIPPED_FALLBACK` is currently not supported and raises an `UnsupportedOperationException`.
|
||||||
|
|||||||
@@ -23,7 +23,7 @@ These reports are primarily useful when reviewing the published API surface and
|
|||||||
|
|
||||||
These reports describe the outcome of core verification and static-analysis stages for the latest published build:
|
These reports describe the outcome of core verification and static-analysis stages for the latest published build:
|
||||||
|
|
||||||
- [Unit test report](https://leogalambos.github.io/Radixor/builds/latest/test/)
|
- [Release verification test report (ciRelease)](https://leogalambos.github.io/Radixor/builds/latest/test/)
|
||||||
- [PMD report](https://leogalambos.github.io/Radixor/builds/latest/pmd/main.html)
|
- [PMD report](https://leogalambos.github.io/Radixor/builds/latest/pmd/main.html)
|
||||||
- [JaCoCo coverage report](https://leogalambos.github.io/Radixor/builds/latest/coverage/)
|
- [JaCoCo coverage report](https://leogalambos.github.io/Radixor/builds/latest/coverage/)
|
||||||
- [PIT mutation testing report](https://leogalambos.github.io/Radixor/builds/latest/pitest/)
|
- [PIT mutation testing report](https://leogalambos.github.io/Radixor/builds/latest/pitest/)
|
||||||
|
|||||||
216
docs/test-taxonomy-and-filtering.md
Normal file
216
docs/test-taxonomy-and-filtering.md
Normal file
@@ -0,0 +1,216 @@
|
|||||||
|
# Test Tag Taxonomy and Execution Guide
|
||||||
|
|
||||||
|
Radixor uses JUnit tags as an explicit execution policy for its test suite.
|
||||||
|
|
||||||
|
The project uses three orthogonal axes:
|
||||||
|
|
||||||
|
1. **Scope** (how the test is executed in the pipeline)
|
||||||
|
2. **Domain** (where in the system it belongs)
|
||||||
|
3. **Intent** (what behavior it verifies)
|
||||||
|
|
||||||
|
## Canonical scope tags
|
||||||
|
|
||||||
|
| Tag | Description | Typical usage |
|
||||||
|
| --- | --- | --- |
|
||||||
|
| `unit` | Fast, deterministic tests that exercise a specific class or behavior without external processes. | Default developer feedback; should stay near-zero flakiness and low run time. |
|
||||||
|
| `integration` | Tests that span multiple components or end-to-end flows of the public pipeline. | Parser/loader/CLI/IO integration checks and multi-step compile-then-load validations. |
|
||||||
|
| `property` | Property-based tests with generator-driven coverage for invariants. | Semantics-preserving laws and edge-case exploration beyond curated fixtures. |
|
||||||
|
| `fuzz` | Randomized stress checks with bounded runtime. | Heavier probabilistic verification of robustness and reduction invariants. |
|
||||||
|
| `compat` | Backward/forward compatibility and reproducibility checks for persisted artifacts. | Artifact fingerprints, deterministic rebuild, and regression fixtures. |
|
||||||
|
| `slow` | Long-running or expensive tests that should not execute in every fast gate. | Heavy fuzz/property budgets or high-duration integration checks. |
|
||||||
|
|
||||||
|
## Canonical domain tags
|
||||||
|
|
||||||
|
| Tag | Description | Typical usage |
|
||||||
|
| --- | --- | --- |
|
||||||
|
| `core` | Core algorithm and foundational platform behavior. | Traversal direction, base data structures, low-level helpers. |
|
||||||
|
| `trie` | All mutable/compiled trie behaviors and traversal internals. | Lookup path selection, node shape, child representation, subtree behavior. |
|
||||||
|
| `frequency-trie` | Algorithms and corner cases specific to frequency-aware trie logic. | Ranking, weighted reductions, persistence of weighted nodes. |
|
||||||
|
| `stemmer` | End-user stemming pipeline semantics. | Parse-encode-apply flows and output invariants. |
|
||||||
|
| `patch` | Patch encoding, decoding, and application semantics. | `PatchCommandEncoder` behavior and related compatibility contracts. |
|
||||||
|
| `io` | Input/output and resource loading boundaries. | Filesystem readers, streams, and stream lifecycle handling. |
|
||||||
|
| `serialization` | Binary persistence contract of compiled artifacts. | Versioned format reads/writes and checksum/consistency checks. |
|
||||||
|
| `parser` | Dictionary and metadata parsing concerns. | Dictionary input parsing and malformed-source rejection. |
|
||||||
|
| `cli` | Command-line entrypoint and command orchestration behavior. | Compile CLI integration and CLI argument validation. |
|
||||||
|
| `metadata` | Trie metadata semantics, compatibility fields, and schema expectations. | Version flags, structural properties, and metadata round-trips. |
|
||||||
|
| `compile` | Compile-time pipeline and build-oriented behavior. | Building, reduction-mode behavior, and compiled artifact generation. |
|
||||||
|
| `diacritic` | Unicode diacritic normalization and stripping behavior. | Accent-removal correctness and locale-safe normalization checks. |
|
||||||
|
|
||||||
|
## Canonical intent tags
|
||||||
|
|
||||||
|
| Tag | Description | Typical usage |
|
||||||
|
| --- | --- | --- |
|
||||||
|
| `construction` | Tests around construction and assembly of runtime structures. | Builders, loaders, and compile-time object construction contracts. |
|
||||||
|
| `lookup` | Read behavior and retrieval semantics. | `get()`, `getAll()`, traversal and missing-key behavior. |
|
||||||
|
| `persistence` | Storage lifecycle semantics. | Serialization/deserialization and round-trip correctness. |
|
||||||
|
| `reduction` | Reduction algorithm correctness and corner cases. | Dominance threshold, subtree deduplication, rank-preservation invariants. |
|
||||||
|
| `encoding` | Encoding transformation direction. | `PatchCommandEncoder.encode` and serialized command form generation. |
|
||||||
|
| `decoding` | Decoding/interpretation of persisted or runtime commands. | Optional consumers that parse and apply encoded command payloads. |
|
||||||
|
| `apply` | Patch application and transformation behavior. | Verifies that applied patches produce expected derived forms. |
|
||||||
|
| `normalization` | Canonicalization and cleanup behavior. | String normalization around case/shape and mirrored input paths. |
|
||||||
|
| `validation` | Input rejection and defensive checks. | Null/empty/invalid contracts and explicit failure conditions. |
|
||||||
|
| `regression` | Guard tests for behavior changes over time. | Known historical bugs and behavioral drift prevention. |
|
||||||
|
| `determinism` | Repeatable results under fixed input and settings. | Compile determinism, stable ordering, and artifact reproducibility. |
|
||||||
|
| `error-handling` | Exception surface and robustness expectations. | Recovery/failure modes and diagnostics quality. |
|
||||||
|
|
||||||
|
## Class-level rules
|
||||||
|
|
||||||
|
1. Every test class has **exactly one** scope tag.
|
||||||
|
2. Every test class has at least one domain tag.
|
||||||
|
3. Additional tags describe intent and may be used on classes or nested tests.
|
||||||
|
4. For each test class, intent tags should reflect the primary behavior under test, not historical naming conventions.
|
||||||
|
|
||||||
|
## Governance and execution policy
|
||||||
|
|
||||||
|
The following rules are used to keep the suite auditable and stable:
|
||||||
|
|
||||||
|
| Rule | Required state | Why |
|
||||||
|
| --- | --- | --- |
|
||||||
|
| Scope discipline | Exactly one scope tag per class. | Prevents accidental promotion of integration-only behavior into fast unit runs. |
|
||||||
|
| Coverage breadth | At least one domain tag per class. | Ensures tests can be grouped by subsystem for targeted review. |
|
||||||
|
| Intent specificity | Use at least one intent tag when behavior is non-trivial. | Makes failure triage faster and profile composition explicit. |
|
||||||
|
| Runtime policy | Never run `slow` tests in the default `unit` profile unless explicitly required. | Keeps PR feedback fast while the deep checks remain available through the dedicated `ciSlow` profile. |
|
||||||
|
| Change risk | Any persistence or compatibility-affecting change must include `compat` in validation. | Protects long-lived binary artifact contracts. |
|
||||||
|
| Mutation resistance | `fuzz`/`property` sets should be gated to dedicated profiles. | Limits flakiness exposure and controls CI resource cost. |
|
||||||
|
|
||||||
|
## Suggested CI profiles
|
||||||
|
|
||||||
|
These are recommended launch profiles for local and CI usage and are also exposed as Gradle tasks:
|
||||||
|
|
||||||
|
- **Profile: `ci-smoke` (fast feedback):**
|
||||||
|
|
||||||
|
```
|
||||||
|
./gradlew test -DincludeTags=unit -DexcludeTags=slow
|
||||||
|
./gradlew ciSmoke
|
||||||
|
```
|
||||||
|
|
||||||
|
`ciSmoke` also excludes `org.egothor.stemmer.CompileIntegrationTest*` at test-name filter level as a
|
||||||
|
defensive fallback in case of future tag drift.
|
||||||
|
`ciRelease` also excludes
|
||||||
|
`org.egothor.stemmer.StemmerPatchTrieLoaderTest$BundledDictionaryTests*` at filter level.
|
||||||
|
|
||||||
|
- **Profile: `ci-core` (core behavioral coverage):**
|
||||||
|
|
||||||
|
```
|
||||||
|
./gradlew test -DincludeTags=unit,trie,frequency-trie,property
|
||||||
|
./gradlew ciCore
|
||||||
|
```
|
||||||
|
|
||||||
|
- **Profile: `ci-integration` (pipeline correctness):**
|
||||||
|
|
||||||
|
```
|
||||||
|
./gradlew test -DincludeTags=integration
|
||||||
|
./gradlew ciIntegration
|
||||||
|
```
|
||||||
|
|
||||||
|
- **Profile: `ci-slow` (explicit heavy validation):**
|
||||||
|
|
||||||
|
```
|
||||||
|
./gradlew ciSlow
|
||||||
|
```
|
||||||
|
|
||||||
|
- **Profile: `ci-compat` (artifact stability):**
|
||||||
|
|
||||||
|
```
|
||||||
|
./gradlew test -DincludeTags=compat,regression
|
||||||
|
./gradlew ciCompat
|
||||||
|
```
|
||||||
|
|
||||||
|
- **Profile: `ci-release` (strong confidence before release):**
|
||||||
|
|
||||||
|
```
|
||||||
|
./gradlew test -DexcludeTags=slow
|
||||||
|
./gradlew ciRelease
|
||||||
|
```
|
||||||
|
`ciRelease` is non-slow by policy and uses the same defensive name-based exclusion for
|
||||||
|
`org.egothor.stemmer.CompileIntegrationTest*` and
|
||||||
|
`org.egothor.stemmer.StemmerPatchTrieLoaderTest$BundledDictionaryTests*` in addition to tag filtering.
|
||||||
|
|
||||||
|
- **Profile: `ci-nightly` (extended hardening):**
|
||||||
|
|
||||||
|
```
|
||||||
|
./gradlew test -DincludeTags=fuzz
|
||||||
|
./gradlew ciNightly
|
||||||
|
```
|
||||||
|
|
||||||
|
- **Profile: `ci` (enterprise umbrella):**
|
||||||
|
|
||||||
|
```
|
||||||
|
./gradlew ci
|
||||||
|
```
|
||||||
|
|
||||||
|
`ci` and `ciRelease` intentionally do **not** include `slow` paths. Run `ciSlow` explicitly for production-dictionary stress and long-running corpus checks.
|
||||||
|
|
||||||
|
## Practical examples
|
||||||
|
|
||||||
|
All examples use Gradle with JUnit Platform integration:
|
||||||
|
|
||||||
|
- Only unit tests:
|
||||||
|
|
||||||
|
```
|
||||||
|
./gradlew test -DincludeTags=unit
|
||||||
|
```
|
||||||
|
|
||||||
|
- Integration tests only:
|
||||||
|
|
||||||
|
```
|
||||||
|
./gradlew test -DincludeTags=integration
|
||||||
|
```
|
||||||
|
|
||||||
|
- Only trie subsystem tests:
|
||||||
|
|
||||||
|
```
|
||||||
|
./gradlew test -DincludeTags=trie
|
||||||
|
```
|
||||||
|
|
||||||
|
- Deterministic fuzz checks:
|
||||||
|
|
||||||
|
```
|
||||||
|
./gradlew test -DincludeTags=fuzz
|
||||||
|
```
|
||||||
|
|
||||||
|
- Property tests:
|
||||||
|
|
||||||
|
```
|
||||||
|
./gradlew test -DincludeTags=property
|
||||||
|
```
|
||||||
|
|
||||||
|
- Stemmer + patch command behavior:
|
||||||
|
|
||||||
|
```
|
||||||
|
./gradlew test -DincludeTags=stemmer,patch
|
||||||
|
```
|
||||||
|
|
||||||
|
- Compatibility artifact checks (use `-DincludeTags=compat,regression` to add the regression guards):
|
||||||
|
|
||||||
|
```
|
||||||
|
./gradlew test -DincludeTags=compat
|
||||||
|
```
|
||||||
|
|
||||||
|
- Keep regression suite and remove long-running cases:
|
||||||
|
|
||||||
|
```
|
||||||
|
./gradlew test -DincludeTags=regression -DexcludeTags=slow
|
||||||
|
```
|
||||||
|
|
||||||
|
- Trie + patch behavior:
|
||||||
|
|
||||||
|
```
|
||||||
|
./gradlew test -DincludeTags=trie,patch
|
||||||
|
```
|
||||||
|
|
||||||
|
- Deterministic compatibility and persistence checks:
|
||||||
|
|
||||||
|
```
|
||||||
|
./gradlew test -DincludeTags=compat,determinism,serialization
|
||||||
|
```
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- `-DincludeTags` and `-DexcludeTags` are interpreted by Gradle task filtering and forwarded into
|
||||||
|
JUnit tag filtering.
|
||||||
|
- Class-name filtering is also available via Gradle test selectors where needed
|
||||||
|
(for example, `--tests *CompileTest`), but tag filtering remains the default
|
||||||
|
execution strategy.
|
||||||
|
- `-DincludeTags` supports comma-separated literal tags. When you need a single exact tag with special
|
||||||
|
characters, quote the argument for the shell.
|
||||||
@@ -17,3 +17,6 @@ pomScmDeveloperConnection=scm:git:ssh://git@github.com/leogalambos/Radixor.git
|
|||||||
|
|
||||||
pomLicenseName=BSD-3-Clause
|
pomLicenseName=BSD-3-Clause
|
||||||
pomLicenseUrl=https://spdx.org/licenses/BSD-3-Clause.html
|
pomLicenseUrl=https://spdx.org/licenses/BSD-3-Clause.html
|
||||||
|
|
||||||
|
pomStemmerDataLicenseName=Stemmer Data License Policy
|
||||||
|
pomStemmerDataLicenseUrl=https://github.com/leogalambos/Radixor/blob/main/LICENSE-stemmer-data
|
||||||
|
|||||||
@@ -13,6 +13,12 @@ def pomScmDeveloperConnection = providers.gradleProperty('pomScmDeveloperConnect
|
|||||||
def pomLicenseName = providers.gradleProperty('pomLicenseName').orNull
|
def pomLicenseName = providers.gradleProperty('pomLicenseName').orNull
|
||||||
def pomLicenseUrl = providers.gradleProperty('pomLicenseUrl').orNull
|
def pomLicenseUrl = providers.gradleProperty('pomLicenseUrl').orNull
|
||||||
def pomLicenseDistribution = providers.gradleProperty('pomLicenseDistribution').orElse('repo').get()
|
def pomLicenseDistribution = providers.gradleProperty('pomLicenseDistribution').orElse('repo').get()
|
||||||
|
def pomStemmerDataLicenseName = providers.gradleProperty('pomStemmerDataLicenseName')
|
||||||
|
.orElse('Stemmer Data License Policy')
|
||||||
|
.get()
|
||||||
|
def pomStemmerDataLicenseUrl = providers.gradleProperty('pomStemmerDataLicenseUrl')
|
||||||
|
.orElse('https://github.com/leogalambos/Radixor/blob/main/LICENSE-stemmer-data')
|
||||||
|
.get()
|
||||||
def pomDeveloperId = providers.gradleProperty('pomDeveloperId').orElse('egothor').get()
|
def pomDeveloperId = providers.gradleProperty('pomDeveloperId').orElse('egothor').get()
|
||||||
def pomDeveloperName = providers.gradleProperty('pomDeveloperName').orElse('Leo Galambos').get()
|
def pomDeveloperName = providers.gradleProperty('pomDeveloperName').orElse('Leo Galambos').get()
|
||||||
def pomDeveloperEmail = providers.gradleProperty('pomDeveloperEmail').orElse('egothor@gmail.com').get()
|
def pomDeveloperEmail = providers.gradleProperty('pomDeveloperEmail').orElse('egothor@gmail.com').get()
|
||||||
@@ -45,6 +51,11 @@ publishing {
|
|||||||
url = pomLicenseUrl
|
url = pomLicenseUrl
|
||||||
distribution = pomLicenseDistribution
|
distribution = pomLicenseDistribution
|
||||||
}
|
}
|
||||||
|
license {
|
||||||
|
name = pomStemmerDataLicenseName
|
||||||
|
url = pomStemmerDataLicenseUrl
|
||||||
|
distribution = pomLicenseDistribution
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
developers {
|
developers {
|
||||||
@@ -73,7 +84,7 @@ publishing {
|
|||||||
}
|
}
|
||||||
|
|
||||||
signing {
|
signing {
|
||||||
required { !version.toString().endsWith('-SNAPSHOT') }
|
required = !version.toString().endsWith('-SNAPSHOT')
|
||||||
if (signingKey != null && !signingKey.isBlank()) {
|
if (signingKey != null && !signingKey.isBlank()) {
|
||||||
useInMemoryPgpKeys(signingKey, signingPassword)
|
useInMemoryPgpKeys(signingKey, signingPassword)
|
||||||
sign publishing.publications.mavenJava
|
sign publishing.publications.mavenJava
|
||||||
@@ -93,6 +104,8 @@ tasks.register('validateReleaseMetadata') {
|
|||||||
if (pomScmDeveloperConnection == null || pomScmDeveloperConnection.isBlank()) missing.add('pomScmDeveloperConnection')
|
if (pomScmDeveloperConnection == null || pomScmDeveloperConnection.isBlank()) missing.add('pomScmDeveloperConnection')
|
||||||
if (pomLicenseName == null || pomLicenseName.isBlank()) missing.add('pomLicenseName')
|
if (pomLicenseName == null || pomLicenseName.isBlank()) missing.add('pomLicenseName')
|
||||||
if (pomLicenseUrl == null || pomLicenseUrl.isBlank()) missing.add('pomLicenseUrl')
|
if (pomLicenseUrl == null || pomLicenseUrl.isBlank()) missing.add('pomLicenseUrl')
|
||||||
|
if (pomStemmerDataLicenseName == null || pomStemmerDataLicenseName.isBlank()) missing.add('pomStemmerDataLicenseName')
|
||||||
|
if (pomStemmerDataLicenseUrl == null || pomStemmerDataLicenseUrl.isBlank()) missing.add('pomStemmerDataLicenseUrl')
|
||||||
if (signingKey == null || signingKey.isBlank()) missing.add('pomSigningKey / SIGNING_KEY')
|
if (signingKey == null || signingKey.isBlank()) missing.add('pomSigningKey / SIGNING_KEY')
|
||||||
if (signingPassword == null || signingPassword.isBlank()) missing.add('pomSigningPassword / SIGNING_PASSWORD')
|
if (signingPassword == null || signingPassword.isBlank()) missing.add('pomSigningPassword / SIGNING_PASSWORD')
|
||||||
|
|
||||||
|
|||||||
@@ -1,10 +1,27 @@
|
|||||||
|
import org.gradle.plugins.ide.eclipse.model.SourceFolder
|
||||||
|
|
||||||
|
|
||||||
def snowballVersion = '3.0.1'
|
def snowballVersion = '3.0.1'
|
||||||
def snowballArchiveName = "libstemmer_java-${snowballVersion}.tar.gz"
|
def snowballArchiveName = "libstemmer_java-${snowballVersion}.tar.gz"
|
||||||
|
def snowballDistributionDirectoryName = "libstemmer_java-${snowballVersion}"
|
||||||
|
def snowballRootRelativePath = 'third-party/snowball'
|
||||||
|
def snowballSourceRelativePath = "${snowballRootRelativePath}/source"
|
||||||
|
def snowballJavaSourceRelativePath = "${snowballSourceRelativePath}/${snowballDistributionDirectoryName}/java"
|
||||||
def snowballDownloadUrl = "https://snowballstem.org/dist/${snowballArchiveName}"
|
def snowballDownloadUrl = "https://snowballstem.org/dist/${snowballArchiveName}"
|
||||||
def snowballDownloadFile = layout.buildDirectory.file("third-party/snowball/${snowballArchiveName}")
|
def snowballDownloadFile = layout.buildDirectory.file("${snowballRootRelativePath}/${snowballArchiveName}")
|
||||||
def snowballExtractDirectory = layout.buildDirectory.dir('third-party/snowball/source')
|
def snowballExtractDirectory = layout.buildDirectory.dir(snowballSourceRelativePath)
|
||||||
def snowballJavaSourceDirectory = layout.buildDirectory.dir(
|
def snowballJavaSourceDirectory = layout.buildDirectory.dir(snowballJavaSourceRelativePath)
|
||||||
"third-party/snowball/source/libstemmer_java-${snowballVersion}/java")
|
def snowballJavaSourceClasspathPath = provider {
|
||||||
|
project.relativePath(snowballJavaSourceDirectory.get().asFile)
|
||||||
|
}
|
||||||
|
def snowballEclipseClasspathAttributes = [
|
||||||
|
gradle_scope : 'jmh',
|
||||||
|
gradle_used_by_scope: 'jmh',
|
||||||
|
test : 'true'
|
||||||
|
]
|
||||||
|
def isAbsoluteClasspathPath = { String path ->
|
||||||
|
path.startsWith('/') || path ==~ /^[A-Za-z]:[\\\/].*/
|
||||||
|
}
|
||||||
|
|
||||||
tasks.register('downloadSnowballJava') {
|
tasks.register('downloadSnowballJava') {
|
||||||
group = 'build setup'
|
group = 'build setup'
|
||||||
@@ -46,4 +63,31 @@ sourceSets {
|
|||||||
|
|
||||||
tasks.named('compileJmhJava') {
|
tasks.named('compileJmhJava') {
|
||||||
dependsOn(tasks.named('extractSnowballJava'))
|
dependsOn(tasks.named('extractSnowballJava'))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
eclipse {
|
||||||
|
classpath {
|
||||||
|
file {
|
||||||
|
whenMerged { classpath ->
|
||||||
|
String generatedSnowballPath = snowballJavaSourceClasspathPath.get()
|
||||||
|
String modelSnowballPath = snowballJavaSourceRelativePath
|
||||||
|
|
||||||
|
classpath.entries.removeAll { entry ->
|
||||||
|
entry.hasProperty('path') && (
|
||||||
|
entry.path == generatedSnowballPath ||
|
||||||
|
entry.path == modelSnowballPath ||
|
||||||
|
isAbsoluteClasspathPath(entry.path)
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
SourceFolder snowballEntry = new SourceFolder(generatedSnowballPath, null)
|
||||||
|
snowballEntry.output = 'bin/jmh'
|
||||||
|
snowballEclipseClasspathAttributes.each { String name, String value ->
|
||||||
|
snowballEntry.entryAttributes[name] = value
|
||||||
|
}
|
||||||
|
|
||||||
|
classpath.entries.add(snowballEntry)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -54,6 +54,7 @@ nav:
|
|||||||
- Overview: architecture-and-reduction.md
|
- Overview: architecture-and-reduction.md
|
||||||
- Architecture: architecture.md
|
- Architecture: architecture.md
|
||||||
- Reduction Semantics: reduction-semantics.md
|
- Reduction Semantics: reduction-semantics.md
|
||||||
|
- Lookup Edge Optimization: lookup-edge-optimization.md
|
||||||
- Compatibility and Guarantees: compatibility-and-guarantees.md
|
- Compatibility and Guarantees: compatibility-and-guarantees.md
|
||||||
|
|
||||||
- Dictionaries:
|
- Dictionaries:
|
||||||
@@ -63,3 +64,4 @@ nav:
|
|||||||
- Quality and Operations: quality-and-operations.md
|
- Quality and Operations: quality-and-operations.md
|
||||||
- Benchmarking: benchmarking.md
|
- Benchmarking: benchmarking.md
|
||||||
- Reports: reports.md
|
- Reports: reports.md
|
||||||
|
- Test taxonomy and execution filtering: test-taxonomy-and-filtering.md
|
||||||
|
|||||||
@@ -149,7 +149,7 @@ final class BenchmarkCorpusSupport {
|
|||||||
Objects.requireNonNull(reductionSettings, "reductionSettings");
|
Objects.requireNonNull(reductionSettings, "reductionSettings");
|
||||||
|
|
||||||
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(String[]::new, reductionSettings);
|
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(String[]::new, reductionSettings);
|
||||||
final PatchCommandEncoder encoder = new PatchCommandEncoder();
|
final PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||||
|
|
||||||
StemmerDictionaryParser.parse(
|
StemmerDictionaryParser.parse(
|
||||||
new StringReader(corpusText),
|
new StringReader(corpusText),
|
||||||
|
|||||||
@@ -59,8 +59,7 @@ import org.tartarus.snowball.ext.porterStemmer;
|
|||||||
* The benchmark processes the same deterministic token array with:
|
* The benchmark processes the same deterministic token array with:
|
||||||
* </p>
|
* </p>
|
||||||
* <ul>
|
* <ul>
|
||||||
* <li>Radixor using bundled
|
* <li>Radixor using bundled {@link StemmerPatchTrieLoader.Language#US_UK}</li>
|
||||||
* {@link StemmerPatchTrieLoader.Language#US_UK_PROFI}</li>
|
|
||||||
* <li>Snowball original Porter stemmer</li>
|
* <li>Snowball original Porter stemmer</li>
|
||||||
* <li>Snowball English stemmer, commonly referred to as Porter2</li>
|
* <li>Snowball English stemmer, commonly referred to as Porter2</li>
|
||||||
* </ul>
|
* </ul>
|
||||||
@@ -106,7 +105,7 @@ public class EnglishStemmerComparisonBenchmark {
|
|||||||
@Setup(Level.Trial)
|
@Setup(Level.Trial)
|
||||||
public void setUp() throws IOException {
|
public void setUp() throws IOException {
|
||||||
this.tokens = EnglishComparisonCorpus.createTokens(this.familyCount);
|
this.tokens = EnglishComparisonCorpus.createTokens(this.familyCount);
|
||||||
this.radixorTrie = StemmerPatchTrieLoader.load(StemmerPatchTrieLoader.Language.US_UK_PROFI, true,
|
this.radixorTrie = StemmerPatchTrieLoader.load(StemmerPatchTrieLoader.Language.US_UK, true,
|
||||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
|
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
55
src/main/java/org/egothor/stemmer/CaseProcessingMode.java
Normal file
55
src/main/java/org/egothor/stemmer/CaseProcessingMode.java
Normal file
@@ -0,0 +1,55 @@
|
|||||||
|
/*******************************************************************************
|
||||||
|
* Copyright (C) 2026, Leo Galambos
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer in the documentation
|
||||||
|
* and/or other materials provided with the distribution.
|
||||||
|
*
|
||||||
|
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||||
|
* may be used to endorse or promote products derived from this software
|
||||||
|
* without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||||
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
******************************************************************************/
|
||||||
|
package org.egothor.stemmer;
|
||||||
|
|
||||||
|
import java.util.Locale;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Defines how dictionary items are normalized with respect to letter casing.
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* The mode is applied while parsing dictionary sources and can be persisted in
|
||||||
|
* trie metadata so that compiled artifacts remain self-describing.
|
||||||
|
*/
|
||||||
|
public enum CaseProcessingMode {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Preserves input character casing exactly as provided by the dictionary
|
||||||
|
* source.
|
||||||
|
*/
|
||||||
|
AS_IS,
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Normalizes all dictionary content to lower case using
|
||||||
|
* {@link Locale#ROOT}.
|
||||||
|
*/
|
||||||
|
LOWERCASE_WITH_LOCALE_ROOT
|
||||||
|
}
|
||||||
@@ -61,6 +61,8 @@ import java.util.logging.Logger;
|
|||||||
* --output <file>
|
* --output <file>
|
||||||
* --reduction-mode <mode>
|
* --reduction-mode <mode>
|
||||||
* [--store-original]
|
* [--store-original]
|
||||||
|
* [--right-to-left]
|
||||||
|
* [--case-processing-mode <mode>]
|
||||||
* [--dominant-winner-min-percent <1..100>]
|
* [--dominant-winner-min-percent <1..100>]
|
||||||
* [--dominant-winner-over-second-ratio <1..n>]
|
* [--dominant-winner-over-second-ratio <1..n>]
|
||||||
* [--overwrite]
|
* [--overwrite]
|
||||||
@@ -149,8 +151,10 @@ public final class Compile {
|
|||||||
final ReductionSettings reductionSettings = new ReductionSettings(arguments.reductionMode(),
|
final ReductionSettings reductionSettings = new ReductionSettings(arguments.reductionMode(),
|
||||||
arguments.dominantWinnerMinPercent(), arguments.dominantWinnerOverSecondRatio());
|
arguments.dominantWinnerMinPercent(), arguments.dominantWinnerOverSecondRatio());
|
||||||
|
|
||||||
|
final WordTraversalDirection traversalDirection = arguments.rightToLeft() ? WordTraversalDirection.FORWARD
|
||||||
|
: WordTraversalDirection.BACKWARD;
|
||||||
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(arguments.inputFile(), arguments.storeOriginal(),
|
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(arguments.inputFile(), arguments.storeOriginal(),
|
||||||
reductionSettings);
|
reductionSettings, traversalDirection, arguments.caseProcessingMode());
|
||||||
|
|
||||||
final Path outputFile = arguments.outputFile();
|
final Path outputFile = arguments.outputFile();
|
||||||
final Path parent = outputFile.toAbsolutePath().getParent();
|
final Path parent = outputFile.toAbsolutePath().getParent();
|
||||||
@@ -166,11 +170,11 @@ public final class Compile {
|
|||||||
|
|
||||||
if (LOGGER.isLoggable(Level.INFO)) {
|
if (LOGGER.isLoggable(Level.INFO)) {
|
||||||
LOGGER.log(Level.INFO,
|
LOGGER.log(Level.INFO,
|
||||||
"Compiled dictionary {0} to {1} using mode {2}, storeOriginal={3}, dominantWinnerMinPercent={4}, dominantWinnerOverSecondRatio={5}.",
|
"Compiled dictionary {0} to {1} using mode {2}, storeOriginal={3}, rightToLeft={4}, caseProcessingMode={5}, dominantWinnerMinPercent={6}, dominantWinnerOverSecondRatio={7}.",
|
||||||
new Object[] { arguments.inputFile().toAbsolutePath().toString(),
|
new Object[] { arguments.inputFile().toAbsolutePath().toString(),
|
||||||
arguments.outputFile().toAbsolutePath().toString(), arguments.reductionMode().name(),
|
arguments.outputFile().toAbsolutePath().toString(), arguments.reductionMode().name(),
|
||||||
arguments.storeOriginal(), arguments.dominantWinnerMinPercent(),
|
arguments.storeOriginal(), arguments.rightToLeft(), arguments.caseProcessingMode(),
|
||||||
arguments.dominantWinnerOverSecondRatio() });
|
arguments.dominantWinnerMinPercent(), arguments.dominantWinnerOverSecondRatio() });
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -184,10 +188,28 @@ public final class Compile {
|
|||||||
System.err.println(" --output <file> \\");
|
System.err.println(" --output <file> \\");
|
||||||
System.err.println(" --reduction-mode <mode> \\");
|
System.err.println(" --reduction-mode <mode> \\");
|
||||||
System.err.println(" [--store-original] \\");
|
System.err.println(" [--store-original] \\");
|
||||||
|
System.err.println(" [--case-processing-mode <mode>] \\");
|
||||||
System.err.println(" [--dominant-winner-min-percent <1..100>] \\");
|
System.err.println(" [--dominant-winner-min-percent <1..100>] \\");
|
||||||
System.err.println(" [--dominant-winner-over-second-ratio <1..n>] \\");
|
System.err.println(" [--dominant-winner-over-second-ratio <1..n>] \\");
|
||||||
System.err.println(" [--overwrite]");
|
System.err.println(" [--overwrite]");
|
||||||
System.err.println();
|
System.err.println();
|
||||||
|
System.err.println("Options:");
|
||||||
|
System.err.println(" --store-original");
|
||||||
|
System.err.println(" Inserts each canonical stem itself using the no-operation patch.");
|
||||||
|
System.err.println(" --right-to-left");
|
||||||
|
System.err.println(" Uses forward word traversal for right-to-left languages.");
|
||||||
|
System.err.println(" In this mode, trie keys are constructed from the logical beginning");
|
||||||
|
System.err.println(" of the stored word form and patch commands are encoded likewise.");
|
||||||
|
System.err.println(" --overwrite");
|
||||||
|
System.err.println(" Replaces the target file when it already exists.");
|
||||||
|
System.err.println(" --case-processing-mode");
|
||||||
|
System.err.println(" Controls whether dictionary input is lowercased or preserved as-is.");
|
||||||
|
System.err.println();
|
||||||
|
System.err.println("Supported case processing modes:");
|
||||||
|
for (CaseProcessingMode mode : CaseProcessingMode.values()) {
|
||||||
|
System.err.println(" " + mode.name());
|
||||||
|
}
|
||||||
|
System.err.println();
|
||||||
System.err.println("Supported reduction modes:");
|
System.err.println("Supported reduction modes:");
|
||||||
for (ReductionMode mode : ReductionMode.values()) {
|
for (ReductionMode mode : ReductionMode.values()) {
|
||||||
System.err.println(" " + mode.name());
|
System.err.println(" " + mode.name());
|
||||||
@@ -240,15 +262,20 @@ public final class Compile {
|
|||||||
* @param outputFile output compressed trie file
|
* @param outputFile output compressed trie file
|
||||||
* @param reductionMode subtree reduction mode
|
* @param reductionMode subtree reduction mode
|
||||||
* @param storeOriginal whether original stems are stored
|
* @param storeOriginal whether original stems are stored
|
||||||
|
* @param rightToLeft whether dictionary compilation should
|
||||||
|
* use forward traversal on stored word
|
||||||
|
* forms
|
||||||
* @param dominantWinnerMinPercent dominant winner minimum percent
|
* @param dominantWinnerMinPercent dominant winner minimum percent
|
||||||
* @param dominantWinnerOverSecondRatio dominant winner over second ratio
|
* @param dominantWinnerOverSecondRatio dominant winner over second ratio
|
||||||
|
* @param caseProcessingMode dictionary case processing mode
|
||||||
* @param overwrite whether an existing output may be
|
* @param overwrite whether an existing output may be
|
||||||
* replaced
|
* replaced
|
||||||
* @param help whether usage help was requested
|
* @param help whether usage help was requested
|
||||||
*/
|
*/
|
||||||
@SuppressWarnings("PMD.LongVariable")
|
@SuppressWarnings("PMD.LongVariable")
|
||||||
private record Arguments(Path inputFile, Path outputFile, ReductionMode reductionMode, boolean storeOriginal,
|
private record Arguments(Path inputFile, Path outputFile, ReductionMode reductionMode, boolean storeOriginal,
|
||||||
int dominantWinnerMinPercent, int dominantWinnerOverSecondRatio, boolean overwrite, boolean help) {
|
boolean rightToLeft, int dominantWinnerMinPercent, int dominantWinnerOverSecondRatio,
|
||||||
|
CaseProcessingMode caseProcessingMode, boolean overwrite, boolean help) {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Parses raw command-line arguments.
|
* Parses raw command-line arguments.
|
||||||
@@ -264,8 +291,10 @@ public final class Compile {
|
|||||||
Path outputFile = null;
|
Path outputFile = null;
|
||||||
ReductionMode reductionMode = null;
|
ReductionMode reductionMode = null;
|
||||||
boolean storeOriginal = false;
|
boolean storeOriginal = false;
|
||||||
|
boolean rightToLeft = false;
|
||||||
boolean overwrite = false;
|
boolean overwrite = false;
|
||||||
boolean help = false;
|
boolean help = false;
|
||||||
|
CaseProcessingMode caseProcessingMode = CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT;
|
||||||
int dominantWinnerMinPercent = ReductionSettings.DEFAULT_DOMINANT_WINNER_MIN_PERCENT;
|
int dominantWinnerMinPercent = ReductionSettings.DEFAULT_DOMINANT_WINNER_MIN_PERCENT;
|
||||||
int dominantWinnerOverSecondRatio = ReductionSettings.DEFAULT_DOMINANT_WINNER_OVER_SECOND_RATIO;
|
int dominantWinnerOverSecondRatio = ReductionSettings.DEFAULT_DOMINANT_WINNER_OVER_SECOND_RATIO;
|
||||||
|
|
||||||
@@ -286,6 +315,10 @@ public final class Compile {
|
|||||||
overwrite = true;
|
overwrite = true;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case "--right-to-left":
|
||||||
|
rightToLeft = true;
|
||||||
|
break;
|
||||||
|
|
||||||
case "--input":
|
case "--input":
|
||||||
inputFile = Path.of(requireValue(arguments, ++index, "--input"));
|
inputFile = Path.of(requireValue(arguments, ++index, "--input"));
|
||||||
break;
|
break;
|
||||||
@@ -310,6 +343,10 @@ public final class Compile {
|
|||||||
requireValue(arguments, ++index, "--dominant-winner-over-second-ratio"),
|
requireValue(arguments, ++index, "--dominant-winner-over-second-ratio"),
|
||||||
"--dominant-winner-over-second-ratio");
|
"--dominant-winner-over-second-ratio");
|
||||||
break;
|
break;
|
||||||
|
case "--case-processing-mode":
|
||||||
|
caseProcessingMode = CaseProcessingMode.valueOf(
|
||||||
|
requireValue(arguments, ++index, "--case-processing-mode").toUpperCase(Locale.ROOT));
|
||||||
|
break;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
throw new IllegalArgumentException("Unknown argument: " + argument);
|
throw new IllegalArgumentException("Unknown argument: " + argument);
|
||||||
@@ -317,8 +354,8 @@ public final class Compile {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (help) {
|
if (help) {
|
||||||
return new Arguments(inputFile, outputFile, reductionMode, storeOriginal, dominantWinnerMinPercent,
|
return new Arguments(inputFile, outputFile, reductionMode, storeOriginal, rightToLeft,
|
||||||
dominantWinnerOverSecondRatio, overwrite, true);
|
dominantWinnerMinPercent, dominantWinnerOverSecondRatio, caseProcessingMode, overwrite, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (inputFile == null) {
|
if (inputFile == null) {
|
||||||
@@ -331,8 +368,8 @@ public final class Compile {
|
|||||||
throw new IllegalArgumentException("Missing required argument --reduction-mode.");
|
throw new IllegalArgumentException("Missing required argument --reduction-mode.");
|
||||||
}
|
}
|
||||||
|
|
||||||
return new Arguments(inputFile, outputFile, reductionMode, storeOriginal, dominantWinnerMinPercent,
|
return new Arguments(inputFile, outputFile, reductionMode, storeOriginal, rightToLeft,
|
||||||
dominantWinnerOverSecondRatio, overwrite, false);
|
dominantWinnerMinPercent, dominantWinnerOverSecondRatio, caseProcessingMode, overwrite, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@@ -0,0 +1,66 @@
|
|||||||
|
/*******************************************************************************
|
||||||
|
* Copyright (C) 2026, Leo Galambos
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer in the documentation
|
||||||
|
* and/or other materials provided with the distribution.
|
||||||
|
*
|
||||||
|
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||||
|
* may be used to endorse or promote products derived from this software
|
||||||
|
* without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||||
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
******************************************************************************/
|
||||||
|
package org.egothor.stemmer;
|
||||||
|
|
||||||
|
/**
 * Diacritic policy used when dictionaries are loaded and when the trie is
 * traversed.
 *
 * <p>
 * This policy is orthogonal to other normalization policies such as
 * {@link CaseProcessingMode}: casing and diacritic handling may be combined in
 * any way, and neither changes the meaning of the other.
 * </p>
 */
public enum DiacriticProcessingMode {

    /**
     * Leaves dictionary entries and lookup keys untouched.
     */
    AS_IS,

    /**
     * Strips diacritics twice: from dictionary entries before the trie is built,
     * and from lookup keys before the trie is traversed.
     */
    REMOVE,

    /**
     * Reserved for a future dual-path lookup that would follow both the original
     * diacritic edge and its normalized, diacritic-free alternative.
     *
     * <p>
     * The mode is not implemented yet; selecting it results in an
     * {@link UnsupportedOperationException}.
     * </p>
     */
    AS_IS_AND_STRIPPED_FALLBACK

}
|
||||||
197
src/main/java/org/egothor/stemmer/DiacriticStripper.java
Normal file
197
src/main/java/org/egothor/stemmer/DiacriticStripper.java
Normal file
@@ -0,0 +1,197 @@
|
|||||||
|
/*******************************************************************************
|
||||||
|
* Copyright (C) 2026, Leo Galambos
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer in the documentation
|
||||||
|
* and/or other materials provided with the distribution.
|
||||||
|
*
|
||||||
|
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||||
|
* may be used to endorse or promote products derived from this software
|
||||||
|
* without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||||
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
******************************************************************************/
|
||||||
|
package org.egothor.stemmer;
|
||||||
|
|
||||||
|
import java.text.Normalizer;
|
||||||
|
import java.text.Normalizer.Form;
|
||||||
|
|
||||||
|
/**
 * Utility that strips diacritics from text for diacritic-insensitive trie
 * storage and lookup.
 *
 * <p>
 * Replacement is resolved per UTF-16 code unit in this order: ASCII is kept,
 * standalone combining diacritical marks are dropped, the direct replacement
 * table is consulted, a few ligatures are expanded, and finally a canonical
 * decomposition (NFD) fallback keeps only the ASCII base letters. Characters
 * for which none of these steps yields a result are kept unchanged.
 * </p>
 */
final class DiacriticStripper {

    /**
     * Highest code unit treated as plain ASCII and therefore never replaced.
     */
    private static final int LAST_ASCII = 0x007F;

    /**
     * First code unit of the Unicode Combining Diacritical Marks block.
     */
    private static final int FIRST_COMBINING_MARK = 0x0300;

    /**
     * Last code unit of the Unicode Combining Diacritical Marks block.
     */
    private static final int LAST_COMBINING_MARK = 0x036F;

    /**
     * Direct single-character replacement table indexed by source code unit. A
     * {@code '\0'} entry means that no direct replacement is registered.
     */
    private static final char[] DIRECT_REPLACEMENTS = new char[Character.MAX_VALUE + 1];

    static {
        registerSingle("áàâäãåāăąǎȁȃạảấầẩẫậắằẳẵặ", 'a');
        registerSingle("ÁÀÂÄÃÅĀĂĄǍȀȂẠẢẤẦẨẪẬẮẰẲẴẶ", 'A');
        registerSingle("çćĉċč", 'c');
        registerSingle("ÇĆĈĊČ", 'C');
        registerSingle("ďđḍ", 'd');
        registerSingle("ĎĐḌ", 'D');
        registerSingle("éèêëēĕėęěȅȇẹẻẽếềểễệ", 'e');
        registerSingle("ÉÈÊËĒĔĖĘĚȄȆẸẺẼẾỀỂỄỆ", 'E');
        registerSingle("ğĝġģǧ", 'g');
        registerSingle("ĞĜĠĢǦ", 'G');
        registerSingle("ĥħ", 'h');
        registerSingle("ĤĦ", 'H');
        registerSingle("íìîïĩīĭįıǐȉȋịỉ", 'i');
        registerSingle("ÍÌÎÏĨĪĬĮİǏȈȊỊỈ", 'I');
        registerSingle("ĵ", 'j');
        registerSingle("Ĵ", 'J');
        registerSingle("ķǩ", 'k');
        registerSingle("ĶǨ", 'K');
        registerSingle("ĺļľŀł", 'l');
        registerSingle("ĹĻĽĿŁ", 'L');
        // U+0149 is spelled as an escape: it is visually indistinguishable from the
        // two-character sequence U+02BC + 'n', which would register a wrong mapping.
        registerSingle("ñńņň\u0149ŋ", 'n');
        registerSingle("ÑŃŅŇŊ", 'N');
        registerSingle("óòôöõōŏőǒȍȏọỏốồổỗộớờởỡợø", 'o');
        registerSingle("ÓÒÔÖÕŌŎŐǑȌȎỌỎỐỒỔỖỘỚỜỞỠỢØ", 'O');
        registerSingle("ŕŗř", 'r');
        registerSingle("ŔŖŘ", 'R');
        registerSingle("śŝşšș", 's');
        registerSingle("ŚŜŞŠȘ", 'S');
        registerSingle("ťţŧț", 't');
        registerSingle("ŤŢŦȚ", 'T');
        registerSingle("úùûüũūŭůűųǔȕȗụủứừửữự", 'u');
        registerSingle("ÚÙÛÜŨŪŬŮŰŲǓȔȖỤỦỨỪỬỮỰ", 'U');
        registerSingle("ýÿŷỳỵỷỹ", 'y');
        registerSingle("ÝŶŸỲỴỶỸ", 'Y');
        registerSingle("źżž", 'z');
        registerSingle("ŹŻŽ", 'Z');
        registerSingle("þ", 't');
        registerSingle("Þ", 'T');
    }

    /**
     * Utility class.
     */
    private DiacriticStripper() {
        throw new AssertionError("No instances.");
    }

    /**
     * Removes supported diacritic marks and common Latin ligatures from the supplied
     * text.
     *
     * <p>
     * Both composed input (for example U+00E9) and decomposed input (a base letter
     * followed by a combining mark such as U+0301) are reduced to the same base
     * letter, so canonically equivalent spellings produce the same result.
     * </p>
     *
     * <p>
     * The method returns the original {@link String} instance when no replacement is
     * required, avoiding an unnecessary allocation on the common ASCII path.
     * </p>
     *
     * @param input text to normalize
     * @return normalized text, or {@code input} itself when it is already unchanged
     * @throws NullPointerException if {@code input} is {@code null}
     */
    /* default */ static String strip(final String input) {
        // Allocated lazily on the first character that actually changes.
        StringBuilder normalized = null;

        for (int index = 0; index < input.length(); index++) {
            final char source = input.charAt(index);
            final String replacement = replacementFor(source);

            if (replacement == null) {
                if (normalized != null) {
                    normalized.append(source);
                }
                continue;
            }

            if (normalized == null) {
                normalized = new StringBuilder(input.length()); // NOPMD - invariant: only once
                // Copy the untouched prefix that was skipped so far.
                normalized.append(input, 0, index);
            }
            // An empty replacement drops the character (standalone combining mark).
            normalized.append(replacement);
        }

        if (normalized == null) {
            return input;
        }
        return normalized.toString();
    }

    /**
     * Returns the replacement text for one character.
     *
     * @param source source character
     * @return replacement text (empty when the character must be dropped), or
     *         {@code null} when the character should be kept unchanged
     */
    @SuppressWarnings("PMD.AvoidLiteralsInIfCondition")
    private static String replacementFor(final char source) {
        if (source <= LAST_ASCII) {
            return null;
        }

        // A combining diacritical mark that arrives as its own char (decomposed
        // input) is the diacritic itself and must be removed, otherwise composed and
        // decomposed spellings of the same word would map to different keys.
        if (source >= FIRST_COMBINING_MARK && source <= LAST_COMBINING_MARK) {
            return "";
        }

        final char mapped = DIRECT_REPLACEMENTS[source];
        if (mapped != '\0') {
            return String.valueOf(mapped);
        }

        final String ligature = ligatureFor(source);
        if (ligature != null) {
            return ligature;
        }

        return decomposedAsciiFor(source);
    }

    /**
     * Expands the supported ligatures and the sharp s to their two-letter form.
     *
     * @param source source character
     * @return two-letter expansion, or {@code null} when {@code source} is not a
     *         supported ligature
     */
    private static String ligatureFor(final char source) {
        switch (source) {
            case 'ß':
                return "ss";
            case 'Æ':
                return "AE";
            case 'æ':
                return "ae";
            case 'Œ':
                return "OE";
            case 'œ':
                return "oe";
            default:
                return null;
        }
    }

    /**
     * Fallback for characters without a table entry: canonically decomposes the
     * character and keeps only its ASCII parts.
     *
     * @param source source character
     * @return ASCII remainder of the decomposition, or {@code null} when the
     *         decomposition contains no ASCII character
     */
    private static String decomposedAsciiFor(final char source) {
        final String decomposed = Normalizer.normalize(String.valueOf(source), Form.NFD);
        final StringBuilder ascii = new StringBuilder(decomposed.length());
        for (int index = 0; index < decomposed.length(); index++) {
            final char part = decomposed.charAt(index);
            if (Character.getType(part) == Character.NON_SPACING_MARK) {
                continue;
            }
            if (part <= LAST_ASCII) {
                ascii.append(part);
            }
        }

        if (ascii.length() == 0) {
            return null;
        }
        return ascii.toString();
    }

    /**
     * Registers one-character replacements for a set of source characters.
     *
     * @param sourceCharacters characters to replace
     * @param replacement      replacement character
     */
    private static void registerSingle(final String sourceCharacters, final char replacement) {
        for (int index = 0; index < sourceCharacters.length(); index++) {
            DIRECT_REPLACEMENTS[sourceCharacters.charAt(index)] = replacement;
        }
    }
}
|
||||||
@@ -41,6 +41,7 @@ import java.util.Collections;
|
|||||||
import java.util.IdentityHashMap;
|
import java.util.IdentityHashMap;
|
||||||
import java.util.LinkedHashMap;
|
import java.util.LinkedHashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Locale;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
import java.util.function.IntFunction;
|
import java.util.function.IntFunction;
|
||||||
@@ -50,7 +51,6 @@ import java.util.logging.Logger;
|
|||||||
import org.egothor.stemmer.trie.CompiledNode;
|
import org.egothor.stemmer.trie.CompiledNode;
|
||||||
import org.egothor.stemmer.trie.LocalValueSummary;
|
import org.egothor.stemmer.trie.LocalValueSummary;
|
||||||
import org.egothor.stemmer.trie.MutableNode;
|
import org.egothor.stemmer.trie.MutableNode;
|
||||||
import org.egothor.stemmer.trie.NodeData;
|
|
||||||
import org.egothor.stemmer.trie.ReducedNode;
|
import org.egothor.stemmer.trie.ReducedNode;
|
||||||
import org.egothor.stemmer.trie.ReductionContext;
|
import org.egothor.stemmer.trie.ReductionContext;
|
||||||
import org.egothor.stemmer.trie.ReductionSignature;
|
import org.egothor.stemmer.trie.ReductionSignature;
|
||||||
@@ -93,36 +93,120 @@ public final class FrequencyTrie<V> {
|
|||||||
*/
|
*/
|
||||||
private static final Logger LOGGER = Logger.getLogger(FrequencyTrie.class.getName());
|
private static final Logger LOGGER = Logger.getLogger(FrequencyTrie.class.getName());
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Root node of the compiled read-only trie.
|
||||||
|
*/
|
||||||
|
private final CompiledNode<V> root;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Metadata persisted together with this trie.
|
||||||
|
*/
|
||||||
|
private final TrieMetadata metadata;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Cached traversal direction used for key lookup.
|
||||||
|
*/
|
||||||
|
private final WordTraversalDirection lookupTraversalDirection;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Whether lookups require lowercase normalization.
|
||||||
|
*/
|
||||||
|
private final boolean lowercasesLookupKeys;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Whether lookups require diacritic stripping.
|
||||||
|
*/
|
||||||
|
private final boolean removeDiacritics;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Shared empty array instance for empty lookup results from {@link #getAll(String)}.
|
||||||
|
*/
|
||||||
|
private final V[] emptyValues;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Binary format magic header.
|
* Binary format magic header.
|
||||||
*/
|
*/
|
||||||
private static final int STREAM_MAGIC = 0x45475452;
|
private static final int STREAM_MAGIC = 0x45475452;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Minimum supported stream version constant retained for explicit range checks.
|
||||||
|
*/
|
||||||
|
private static final int MIN_STREAM_VERSION = 1;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Number of stored values for which {@link #getEntries(String)} can return an
|
||||||
|
* empty result.
|
||||||
|
*/
|
||||||
|
private static final int NO_VALUE_COUNT = 0;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Number of stored values for which {@link #getEntries(String)} can use a
|
||||||
|
* one-item immutable list special case.
|
||||||
|
*/
|
||||||
|
private static final int SINGLE_VALUE_COUNT = 1;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Binary format version.
|
* Binary format version.
|
||||||
*/
|
*/
|
||||||
private static final int STREAM_VERSION = 1;
|
private static final int STREAM_VERSION = 5;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Factory used to create correctly typed arrays for {@link #getAll(String)}.
|
* Version where traversal-direction ordinal is persisted.
|
||||||
*/
|
*/
|
||||||
private final IntFunction<V[]> arrayFactory;
|
private static final int TRAVERSAL_VERSION = 2;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Root node of the compiled read-only trie.
|
* Version where compact reduction metadata is persisted.
|
||||||
*/
|
*/
|
||||||
private final CompiledNode<V> root;
|
private static final int REDUCTION_VERSION = 3;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Version where case-processing mode ordinal is persisted.
|
||||||
|
*/
|
||||||
|
private static final int CASE_VERSION = 4;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Default dense child lookup span in code points used when materializing
|
||||||
|
* compiled nodes without an explicit override.
|
||||||
|
* <p>
|
||||||
|
* Increasing this value increases the chance of direct array indexing for
|
||||||
|
* child lookup at runtime at the cost of per-node dense table memory for
|
||||||
|
* compact character spans.
|
||||||
|
* </p>
|
||||||
|
*/
|
||||||
|
public static final int DEFAULT_MAX_EXPANDED_INDEX = 512;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the current persisted binary stream format version.
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* This method exists so other components can construct {@link TrieMetadata}
|
||||||
|
* instances aligned with the currently written binary format without
|
||||||
|
* duplicating constants.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @return current trie stream format version
|
||||||
|
*/
|
||||||
|
public static int currentFormatVersion() {
|
||||||
|
return STREAM_VERSION;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates a new compiled trie instance.
|
* Creates a new compiled trie instance.
|
||||||
*
|
*
|
||||||
* @param arrayFactory array factory
|
* @param arrayFactory array factory
|
||||||
* @param root compiled root node
|
* @param root compiled root node
|
||||||
|
* @param metadata trie metadata describing lookup and persistence semantics
|
||||||
* @throws NullPointerException if any argument is {@code null}
|
* @throws NullPointerException if any argument is {@code null}
|
||||||
*/
|
*/
|
||||||
private FrequencyTrie(final IntFunction<V[]> arrayFactory, final CompiledNode<V> root) {
|
private FrequencyTrie(final IntFunction<V[]> arrayFactory, final CompiledNode<V> root,
|
||||||
this.arrayFactory = Objects.requireNonNull(arrayFactory, "arrayFactory");
|
final TrieMetadata metadata) {
|
||||||
this.root = Objects.requireNonNull(root, "root");
|
this.root = Objects.requireNonNull(root, "root");
|
||||||
|
this.metadata = Objects.requireNonNull(metadata, "metadata");
|
||||||
|
this.lookupTraversalDirection = metadata.traversalDirection();
|
||||||
|
this.lowercasesLookupKeys = metadata.caseProcessingMode() == CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT;
|
||||||
|
this.removeDiacritics = metadata.diacriticProcessingMode() == DiacriticProcessingMode.REMOVE;
|
||||||
|
this.emptyValues = arrayFactory.apply(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -134,6 +218,10 @@ public final class FrequencyTrie<V> {
|
|||||||
* selected deterministically by shorter {@code toString()} value first, then by
|
* selected deterministically by shorter {@code toString()} value first, then by
|
||||||
* lexicographically lower {@code toString()}, and finally by stable first-seen
|
* lexicographically lower {@code toString()}, and finally by stable first-seen
|
||||||
* order.
|
* order.
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* The supplied key is normalized according to persisted
|
||||||
|
* {@link TrieMetadata#caseProcessingMode()} before traversal.
|
||||||
*
|
*
|
||||||
* @param key key to resolve
|
* @param key key to resolve
|
||||||
* @return most frequent value, or {@code null} if the key does not exist or no
|
* @return most frequent value, or {@code null} if the key does not exist or no
|
||||||
@@ -142,11 +230,15 @@ public final class FrequencyTrie<V> {
|
|||||||
*/
|
*/
|
||||||
public V get(final String key) {
|
public V get(final String key) {
|
||||||
Objects.requireNonNull(key, "key");
|
Objects.requireNonNull(key, "key");
|
||||||
final CompiledNode<V> node = findNode(key);
|
final CompiledNode<V> node = findNode(normalizeLookupKey(key));
|
||||||
if (node == null || node.orderedValues().length == 0) {
|
if (node == null) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
return node.orderedValues()[0];
|
final V[] orderedValues = node.orderedValues();
|
||||||
|
if (orderedValues.length == 0) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
return orderedValues[0];
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -162,19 +254,28 @@ public final class FrequencyTrie<V> {
|
|||||||
* <p>
|
* <p>
|
||||||
* The returned array is a defensive copy.
|
* The returned array is a defensive copy.
|
||||||
*
|
*
|
||||||
|
* <p>
|
||||||
|
* The supplied key is normalized according to persisted
|
||||||
|
* {@link TrieMetadata#caseProcessingMode()} before traversal.
|
||||||
|
*
|
||||||
* @param key key to resolve
|
* @param key key to resolve
|
||||||
* @return all values stored at the addressed node, ordered by descending
|
* @return all values stored at the addressed node, ordered by descending
|
||||||
* frequency; returns an empty array if the key does not exist or no
|
* frequency; returns an empty array if the key does not exist or no
|
||||||
* value is stored at the addressed node
|
* value is stored at the addressed node
|
||||||
* @throws NullPointerException if {@code key} is {@code null}
|
* @throws NullPointerException if {@code key} is {@code null}
|
||||||
*/
|
*/
|
||||||
|
@SuppressWarnings("PMD.MethodReturnsInternalArray")
|
||||||
public V[] getAll(final String key) {
|
public V[] getAll(final String key) {
|
||||||
Objects.requireNonNull(key, "key");
|
Objects.requireNonNull(key, "key");
|
||||||
final CompiledNode<V> node = findNode(key);
|
final CompiledNode<V> node = findNode(normalizeLookupKey(key));
|
||||||
if (node == null || node.orderedValues().length == 0) {
|
if (node == null) {
|
||||||
return this.arrayFactory.apply(0);
|
return this.emptyValues;
|
||||||
}
|
}
|
||||||
return Arrays.copyOf(node.orderedValues(), node.orderedValues().length);
|
final V[] orderedValues = node.orderedValues();
|
||||||
|
if (orderedValues.length == 0) {
|
||||||
|
return this.emptyValues;
|
||||||
|
}
|
||||||
|
return Arrays.copyOf(orderedValues, orderedValues.length);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -201,18 +302,52 @@ public final class FrequencyTrie<V> {
|
|||||||
*/
|
*/
|
||||||
public List<ValueCount<V>> getEntries(final String key) {
|
public List<ValueCount<V>> getEntries(final String key) {
|
||||||
Objects.requireNonNull(key, "key");
|
Objects.requireNonNull(key, "key");
|
||||||
final CompiledNode<V> node = findNode(key);
|
final CompiledNode<V> node = findNode(normalizeLookupKey(key));
|
||||||
if (node == null || node.orderedValues().length == 0) {
|
if (node == null) {
|
||||||
return List.of();
|
return List.of();
|
||||||
}
|
}
|
||||||
|
|
||||||
final List<ValueCount<V>> entries = new ArrayList<>(node.orderedValues().length);
|
final V[] orderedValues = node.orderedValues();
|
||||||
for (int index = 0; index < node.orderedValues().length; index++) {
|
final int valueCount = orderedValues.length;
|
||||||
entries.add(new ValueCount<>(node.orderedValues()[index], node.orderedCounts()[index]));
|
if (valueCount == NO_VALUE_COUNT) {
|
||||||
|
return List.of();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (valueCount == SINGLE_VALUE_COUNT) {
|
||||||
|
return List.of(new ValueCount<>(orderedValues[0], node.orderedCounts()[0]));
|
||||||
|
}
|
||||||
|
|
||||||
|
final int[] orderedCounts = node.orderedCounts();
|
||||||
|
final List<ValueCount<V>> entries = new ArrayList<>(valueCount);
|
||||||
|
for (int index = 0; index < valueCount; index++) {
|
||||||
|
entries.add(new ValueCount<>(orderedValues[index], orderedCounts[index]));
|
||||||
}
|
}
|
||||||
return Collections.unmodifiableList(entries);
|
return Collections.unmodifiableList(entries);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the logical key traversal direction used by this trie.
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* The same direction must be used when reconstructing mutable builders or when
|
||||||
|
* applying patch commands that were generated against keys stored in this trie.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @return logical key traversal direction
|
||||||
|
*/
|
||||||
|
public WordTraversalDirection traversalDirection() {
|
||||||
|
return this.metadata.traversalDirection();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns immutable persisted metadata associated with this trie.
|
||||||
|
*
|
||||||
|
* @return trie metadata
|
||||||
|
*/
|
||||||
|
public TrieMetadata metadata() {
|
||||||
|
return this.metadata;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns the root node mainly for diagnostics and tests within the package.
|
* Returns the root node mainly for diagnostics and tests within the package.
|
||||||
*
|
*
|
||||||
@@ -262,6 +397,7 @@ public final class FrequencyTrie<V> {
|
|||||||
dataOutput.writeInt(STREAM_VERSION);
|
dataOutput.writeInt(STREAM_VERSION);
|
||||||
dataOutput.writeInt(orderedNodes.size());
|
dataOutput.writeInt(orderedNodes.size());
|
||||||
dataOutput.writeInt(nodeIds.get(this.root));
|
dataOutput.writeInt(nodeIds.get(this.root));
|
||||||
|
writeMetadata(dataOutput, this.metadata);
|
||||||
|
|
||||||
for (CompiledNode<V> node : orderedNodes) {
|
for (CompiledNode<V> node : orderedNodes) {
|
||||||
writeNode(dataOutput, valueCodec, node, nodeIds);
|
writeNode(dataOutput, valueCodec, node, nodeIds);
|
||||||
@@ -287,45 +423,43 @@ public final class FrequencyTrie<V> {
|
|||||||
*/
|
*/
|
||||||
public static <V> FrequencyTrie<V> readFrom(final InputStream inputStream, final IntFunction<V[]> arrayFactory,
|
public static <V> FrequencyTrie<V> readFrom(final InputStream inputStream, final IntFunction<V[]> arrayFactory,
|
||||||
final ValueStreamCodec<V> valueCodec) throws IOException {
|
final ValueStreamCodec<V> valueCodec) throws IOException {
|
||||||
Objects.requireNonNull(inputStream, "inputStream");
|
return readFrom(inputStream, arrayFactory, valueCodec, -1);
|
||||||
Objects.requireNonNull(arrayFactory, "arrayFactory");
|
}
|
||||||
Objects.requireNonNull(valueCodec, "valueCodec");
|
|
||||||
|
|
||||||
final DataInputStream dataInput; // NOPMD
|
/**
|
||||||
if (inputStream instanceof DataInputStream) {
|
* Reads a compiled trie from the supplied input stream, optionally overriding
|
||||||
dataInput = (DataInputStream) inputStream;
|
* dense child-index span configuration.
|
||||||
} else {
|
* <p>
|
||||||
dataInput = new DataInputStream(inputStream);
|
* This setting is applied only while materializing the in-memory compiled
|
||||||
}
|
* representation during load. It is not serialized in {@link TrieMetadata},
|
||||||
|
* so each load can independently choose its own runtime lookup trade-off.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @param inputStream source input stream
|
||||||
|
* @param arrayFactory array factory used to create typed arrays
|
||||||
|
* @param valueCodec codec used to read values
|
||||||
|
* @param maxExpandedIndex dense lookup span override; zero disables dense lookup,
|
||||||
|
* negative values use {@link #DEFAULT_MAX_EXPANDED_INDEX}
|
||||||
|
* @param <V> value type
|
||||||
|
* @return deserialized compiled trie
|
||||||
|
* @throws NullPointerException if any argument is {@code null}
|
||||||
|
* @throws IOException if reading fails or the binary format is invalid
|
||||||
|
*/
|
||||||
|
public static <V> FrequencyTrie<V> readFrom(final InputStream inputStream, final IntFunction<V[]> arrayFactory,
|
||||||
|
final ValueStreamCodec<V> valueCodec, final int maxExpandedIndex) throws IOException {
|
||||||
|
return CompiledTrieReader.read(inputStream, arrayFactory, valueCodec, maxExpandedIndex);
|
||||||
|
}
|
||||||
|
|
||||||
final int magic = dataInput.readInt();
|
/**
|
||||||
if (magic != STREAM_MAGIC) {
|
* Writes persisted trie metadata.
|
||||||
throw new IOException("Unsupported trie stream header: " + Integer.toHexString(magic));
|
*
|
||||||
}
|
* @param dataOutput output stream
|
||||||
|
* @param metadata metadata to serialize
|
||||||
final int version = dataInput.readInt();
|
* @throws IOException if writing fails
|
||||||
if (version != STREAM_VERSION) {
|
*/
|
||||||
throw new IOException("Unsupported trie stream version: " + version);
|
private static void writeMetadata(final DataOutputStream dataOutput, final TrieMetadata metadata)
|
||||||
}
|
throws IOException {
|
||||||
|
dataOutput.writeUTF(metadata.toTextBlock());
|
||||||
final int nodeCount = dataInput.readInt();
|
|
||||||
if (nodeCount < 0) {
|
|
||||||
throw new IOException("Negative node count: " + nodeCount);
|
|
||||||
}
|
|
||||||
|
|
||||||
final int rootNodeId = dataInput.readInt();
|
|
||||||
if (rootNodeId < 0 || rootNodeId >= nodeCount) {
|
|
||||||
throw new IOException("Invalid root node id: " + rootNodeId);
|
|
||||||
}
|
|
||||||
|
|
||||||
final CompiledNode<V>[] nodes = readNodes(dataInput, arrayFactory, valueCodec, nodeCount);
|
|
||||||
final CompiledNode<V> rootNode = nodes[rootNodeId];
|
|
||||||
|
|
||||||
if (LOGGER.isLoggable(Level.FINE)) {
|
|
||||||
LOGGER.log(Level.FINE, "Read compiled trie with {0} canonical nodes.", nodeCount);
|
|
||||||
}
|
|
||||||
|
|
||||||
return new FrequencyTrie<>(arrayFactory, rootNode);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -397,103 +531,218 @@ public final class FrequencyTrie<V> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Reads all compiled nodes and resolves child references.
|
* Internal helper that materializes serialized trie data.
|
||||||
*
|
|
||||||
* @param dataInput input
|
|
||||||
* @param arrayFactory array factory
|
|
||||||
* @param valueCodec value codec
|
|
||||||
* @param nodeCount number of nodes
|
|
||||||
* @param <V> value type
|
|
||||||
* @return array of nodes indexed by serialized node identifier
|
|
||||||
* @throws IOException if reading fails or the stream is invalid
|
|
||||||
*/
|
|
||||||
@SuppressWarnings("PMD.AvoidInstantiatingObjectsInLoops")
|
|
||||||
private static <V> CompiledNode<V>[] readNodes(final DataInputStream dataInput, final IntFunction<V[]> arrayFactory,
|
|
||||||
final ValueStreamCodec<V> valueCodec, final int nodeCount) throws IOException {
|
|
||||||
final List<NodeData<V>> nodeDataList = new ArrayList<>(nodeCount);
|
|
||||||
|
|
||||||
for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++) {
|
|
||||||
final int edgeCount = dataInput.readInt();
|
|
||||||
if (edgeCount < 0) {
|
|
||||||
throw new IOException("Negative edge count at node " + nodeIndex + ": " + edgeCount);
|
|
||||||
}
|
|
||||||
|
|
||||||
final char[] edgeLabels = new char[edgeCount];
|
|
||||||
final int[] childNodeIds = new int[edgeCount];
|
|
||||||
|
|
||||||
for (int edgeIndex = 0; edgeIndex < edgeCount; edgeIndex++) {
|
|
||||||
edgeLabels[edgeIndex] = dataInput.readChar();
|
|
||||||
childNodeIds[edgeIndex] = dataInput.readInt();
|
|
||||||
}
|
|
||||||
|
|
||||||
validateSerializedEdges(nodeIndex, edgeLabels);
|
|
||||||
|
|
||||||
final int valueCount = dataInput.readInt();
|
|
||||||
if (valueCount < 0) {
|
|
||||||
throw new IOException("Negative value count at node " + nodeIndex + ": " + valueCount);
|
|
||||||
}
|
|
||||||
|
|
||||||
final V[] orderedValues = arrayFactory.apply(valueCount);
|
|
||||||
final int[] orderedCounts = new int[valueCount];
|
|
||||||
|
|
||||||
for (int valueIndex = 0; valueIndex < valueCount; valueIndex++) {
|
|
||||||
orderedValues[valueIndex] = valueCodec.read(dataInput);
|
|
||||||
orderedCounts[valueIndex] = dataInput.readInt();
|
|
||||||
if (orderedCounts[valueIndex] <= 0) {
|
|
||||||
throw new IOException("Non-positive stored count at node " + nodeIndex + ", value index "
|
|
||||||
+ valueIndex + ": " + orderedCounts[valueIndex]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
nodeDataList.add(new NodeData<>(edgeLabels, childNodeIds, orderedValues, orderedCounts));
|
|
||||||
}
|
|
||||||
|
|
||||||
@SuppressWarnings("unchecked")
|
|
||||||
final CompiledNode<V>[] nodes = new CompiledNode[nodeCount];
|
|
||||||
|
|
||||||
for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++) {
|
|
||||||
final NodeData<V> nodeData = nodeDataList.get(nodeIndex);
|
|
||||||
@SuppressWarnings("unchecked")
|
|
||||||
final CompiledNode<V>[] children = new CompiledNode[nodeData.childNodeIds().length];
|
|
||||||
nodes[nodeIndex] = new CompiledNode<>(nodeData.edgeLabels(), children, nodeData.orderedValues(),
|
|
||||||
nodeData.orderedCounts());
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++) {
|
|
||||||
final NodeData<V> nodeData = nodeDataList.get(nodeIndex);
|
|
||||||
final CompiledNode<V> node = nodes[nodeIndex];
|
|
||||||
|
|
||||||
for (int edgeIndex = 0; edgeIndex < nodeData.childNodeIds().length; edgeIndex++) {
|
|
||||||
final int childNodeId = nodeData.childNodeIds()[edgeIndex];
|
|
||||||
if (childNodeId < 0 || childNodeId >= nodeCount) {
|
|
||||||
throw new IOException("Invalid child node id at node " + nodeIndex + ", edge index " + edgeIndex
|
|
||||||
+ ": " + childNodeId);
|
|
||||||
}
|
|
||||||
node.children()[edgeIndex] = nodes[childNodeId];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return nodes;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Validates the serialized edge-label sequence for one node.
|
|
||||||
*
|
*
|
||||||
* <p>
|
* <p>
|
||||||
* Compiled nodes rely on binary search for child lookup and therefore require
|
* Moving reader complexity into this helper keeps the public-facing class from
|
||||||
* edge labels to be stored in strict ascending order without duplicates.
|
* accumulating excessive class-level cyclomatic complexity while preserving the
|
||||||
* Rejecting malformed streams here keeps lookup semantics deterministic and
|
* same binary compatibility contract.
|
||||||
* avoids silently constructing a trie whose search behavior would be undefined.
|
* </p>
|
||||||
*
|
|
||||||
* @param nodeIndex serialized node identifier
|
|
||||||
* @param edgeLabels serialized edge labels
|
|
||||||
* @throws IOException if the edge labels are not strictly ascending
|
|
||||||
*/
|
*/
|
||||||
private static void validateSerializedEdges(final int nodeIndex, final char... edgeLabels) throws IOException {
|
private static final class CompiledTrieReader {
|
||||||
for (int edgeIndex = 1; edgeIndex < edgeLabels.length; edgeIndex++) {
|
|
||||||
if (edgeLabels[edgeIndex - 1] >= edgeLabels[edgeIndex]) {
|
private static <V> FrequencyTrie<V> read(final InputStream inputStream, final IntFunction<V[]> arrayFactory,
|
||||||
throw new IOException("Edge labels must be strictly ascending at node " + nodeIndex + ", edge index "
|
final ValueStreamCodec<V> valueCodec, final int maxExpandedIndex) throws IOException {
|
||||||
+ edgeIndex + ": '" + edgeLabels[edgeIndex - 1] + "' then '" + edgeLabels[edgeIndex] + "'.");
|
Objects.requireNonNull(inputStream, "inputStream");
|
||||||
|
Objects.requireNonNull(arrayFactory, "arrayFactory");
|
||||||
|
Objects.requireNonNull(valueCodec, "valueCodec");
|
||||||
|
if (maxExpandedIndex < -1) {
|
||||||
|
throw new IllegalArgumentException("maxExpandedIndex must be >= -1.");
|
||||||
|
}
|
||||||
|
|
||||||
|
final DataInputStream dataInput = wrapInputStream(inputStream);
|
||||||
|
final int magic = dataInput.readInt();
|
||||||
|
if (magic != STREAM_MAGIC) {
|
||||||
|
throw new IOException("Unsupported trie stream header: " + Integer.toHexString(magic));
|
||||||
|
}
|
||||||
|
|
||||||
|
final int version = dataInput.readInt();
|
||||||
|
if (version < MIN_STREAM_VERSION || version > STREAM_VERSION) {
|
||||||
|
throw new IOException("Unsupported trie stream version: " + version);
|
||||||
|
}
|
||||||
|
|
||||||
|
final int nodeCount = dataInput.readInt();
|
||||||
|
if (nodeCount < 0) {
|
||||||
|
throw new IOException("Negative node count: " + nodeCount);
|
||||||
|
}
|
||||||
|
|
||||||
|
final int rootNodeId = dataInput.readInt();
|
||||||
|
if (rootNodeId < 0 || rootNodeId >= nodeCount) {
|
||||||
|
throw new IOException("Invalid root node id: " + rootNodeId);
|
||||||
|
}
|
||||||
|
|
||||||
|
final TrieMetadata sourceMetadata = readMetadata(dataInput, version);
|
||||||
|
final int effectiveMaxExpandedIndex = maxExpandedIndex >= 0 ? maxExpandedIndex : DEFAULT_MAX_EXPANDED_INDEX;
|
||||||
|
final CompiledNode<V>[] nodes = readNodes(dataInput, arrayFactory, valueCodec, nodeCount, effectiveMaxExpandedIndex);
|
||||||
|
final CompiledNode<V> rootNode = nodes[rootNodeId];
|
||||||
|
|
||||||
|
if (LOGGER.isLoggable(Level.FINE)) {
|
||||||
|
LOGGER.log(Level.FINE, "Read compiled trie with {0} canonical nodes.", nodeCount);
|
||||||
|
}
|
||||||
|
|
||||||
|
return new FrequencyTrie<>(arrayFactory, rootNode, sourceMetadata);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static DataInputStream wrapInputStream(final InputStream inputStream) {
|
||||||
|
return inputStream instanceof DataInputStream
|
||||||
|
? (DataInputStream) inputStream
|
||||||
|
: new DataInputStream(inputStream);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static TrieMetadata readMetadata(final DataInputStream dataInput, final int version) throws IOException {
|
||||||
|
if (version == STREAM_VERSION) {
|
||||||
|
return readTextMetadata(dataInput);
|
||||||
|
}
|
||||||
|
|
||||||
|
final WordTraversalDirection traversalDirection = readTraversalDirection(dataInput, version);
|
||||||
|
if (version < REDUCTION_VERSION) {
|
||||||
|
return TrieMetadata.legacy(version, traversalDirection);
|
||||||
|
}
|
||||||
|
|
||||||
|
final ReductionSettings reductionSettings = readReductionSettings(dataInput);
|
||||||
|
final DiacriticProcessingMode diacriticProcessingMode = readEnumByOrdinal(dataInput, DiacriticProcessingMode.values(),
|
||||||
|
"diacritic processing mode");
|
||||||
|
final CaseProcessingMode caseProcessingMode = version >= CASE_VERSION
|
||||||
|
? readCaseProcessingMode(dataInput)
|
||||||
|
: CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT;
|
||||||
|
return new TrieMetadata(version, traversalDirection, reductionSettings, diacriticProcessingMode, caseProcessingMode);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static TrieMetadata readTextMetadata(final DataInputStream dataInput) throws IOException {
|
||||||
|
try {
|
||||||
|
return TrieMetadata.fromTextBlock(STREAM_VERSION, dataInput.readUTF());
|
||||||
|
} catch (IllegalArgumentException exception) {
|
||||||
|
throw new IOException("Invalid metadata block.", exception);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static WordTraversalDirection readTraversalDirection(final DataInputStream dataInput, final int version)
|
||||||
|
throws IOException {
|
||||||
|
if (version < TRAVERSAL_VERSION) {
|
||||||
|
return WordTraversalDirection.BACKWARD;
|
||||||
|
}
|
||||||
|
return readEnumByOrdinal(dataInput, WordTraversalDirection.values(), "traversal direction");
|
||||||
|
}
|
||||||
|
|
||||||
|
private static ReductionSettings readReductionSettings(final DataInputStream dataInput) throws IOException {
|
||||||
|
final ReductionMode reductionMode = readEnumByOrdinal(dataInput, ReductionMode.values(), "reduction mode");
|
||||||
|
final int dominantWinnerMinPercent = dataInput.readInt();
|
||||||
|
final int dominantWinnerOverSecondRatio = dataInput.readInt(); // NOPMD
|
||||||
|
return new ReductionSettings(reductionMode, dominantWinnerMinPercent, dominantWinnerOverSecondRatio);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static CaseProcessingMode readCaseProcessingMode(final DataInputStream dataInput) throws IOException {
|
||||||
|
return readEnumByOrdinal(dataInput, CaseProcessingMode.values(), "case processing mode");
|
||||||
|
}
|
||||||
|
|
||||||
|
private static <E extends Enum<E>> E readEnumByOrdinal(final DataInputStream dataInput, final E[] values,
|
||||||
|
final String name) throws IOException {
|
||||||
|
final int ordinal = dataInput.readInt();
|
||||||
|
if (ordinal < 0 || ordinal >= values.length) {
|
||||||
|
throw new IOException("Invalid " + name + " ordinal: " + ordinal);
|
||||||
|
}
|
||||||
|
return values[ordinal];
|
||||||
|
}
|
||||||
|
|
||||||
|
private static <V> CompiledNode<V>[] readNodes(final DataInputStream dataInput, final IntFunction<V[]> arrayFactory,
|
||||||
|
final ValueStreamCodec<V> valueCodec, final int nodeCount, final int maxExpandedIndex) throws IOException {
|
||||||
|
final char[][] edgeLabelsByNode = new char[nodeCount][];
|
||||||
|
final int[][] childNodeIdsByNode = new int[nodeCount][];
|
||||||
|
@SuppressWarnings("unchecked")
|
||||||
|
final V[][] orderedValuesByNode = (V[][]) new Object[nodeCount][];
|
||||||
|
final int[][] orderedCountsByNode = new int[nodeCount][];
|
||||||
|
|
||||||
|
for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++) {
|
||||||
|
final int edgeCount = dataInput.readInt();
|
||||||
|
if (edgeCount < 0) {
|
||||||
|
throw new IOException("Negative edge count at node " + nodeIndex + ": " + edgeCount);
|
||||||
|
}
|
||||||
|
|
||||||
|
edgeLabelsByNode[nodeIndex] = new char[edgeCount];
|
||||||
|
childNodeIdsByNode[nodeIndex] = new int[edgeCount];
|
||||||
|
|
||||||
|
for (int edgeIndex = 0; edgeIndex < edgeCount; edgeIndex++) {
|
||||||
|
edgeLabelsByNode[nodeIndex][edgeIndex] = dataInput.readChar();
|
||||||
|
childNodeIdsByNode[nodeIndex][edgeIndex] = dataInput.readInt();
|
||||||
|
}
|
||||||
|
|
||||||
|
validateSerializedEdges(nodeIndex, edgeLabelsByNode[nodeIndex]);
|
||||||
|
|
||||||
|
final int valueCount = dataInput.readInt();
|
||||||
|
if (valueCount < 0) {
|
||||||
|
throw new IOException("Negative value count at node " + nodeIndex + ": " + valueCount);
|
||||||
|
}
|
||||||
|
|
||||||
|
orderedValuesByNode[nodeIndex] = arrayFactory.apply(valueCount);
|
||||||
|
orderedCountsByNode[nodeIndex] = new int[valueCount];
|
||||||
|
|
||||||
|
for (int valueIndex = 0; valueIndex < valueCount; valueIndex++) {
|
||||||
|
orderedValuesByNode[nodeIndex][valueIndex] = valueCodec.read(dataInput);
|
||||||
|
orderedCountsByNode[nodeIndex][valueIndex] = dataInput.readInt();
|
||||||
|
if (orderedCountsByNode[nodeIndex][valueIndex] <= 0) {
|
||||||
|
throw new IOException("Non-positive stored count at node " + nodeIndex + ", value index "
|
||||||
|
+ valueIndex + ": " + orderedCountsByNode[nodeIndex][valueIndex]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@SuppressWarnings("unchecked")
|
||||||
|
final CompiledNode<V>[] nodes = new CompiledNode[nodeCount];
|
||||||
|
final boolean[] inProgress = new boolean[nodeCount];
|
||||||
|
|
||||||
|
for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++) {
|
||||||
|
nodes[nodeIndex] = resolveNode(nodeIndex, edgeLabelsByNode, childNodeIdsByNode, orderedValuesByNode,
|
||||||
|
orderedCountsByNode, nodes, inProgress, maxExpandedIndex);
|
||||||
|
}
|
||||||
|
|
||||||
|
return nodes;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static <V> CompiledNode<V> resolveNode(final int nodeIndex, final char[][] edgeLabelsByNode,
|
||||||
|
final int[][] childNodeIdsByNode, final V[][] orderedValuesByNode, final int[][] orderedCountsByNode,
|
||||||
|
final CompiledNode<V>[] nodes, final boolean[] inProgress, final int maxExpandedIndex) throws IOException {
|
||||||
|
final CompiledNode<V> cachedNode = nodes[nodeIndex];
|
||||||
|
if (cachedNode != null) {
|
||||||
|
return cachedNode;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (inProgress[nodeIndex]) {
|
||||||
|
throw new IOException("Invalid serialized node graph: cyclic reference detected at node " + nodeIndex + '.');
|
||||||
|
}
|
||||||
|
inProgress[nodeIndex] = true;
|
||||||
|
try {
|
||||||
|
final char[] edgeLabels = edgeLabelsByNode[nodeIndex];
|
||||||
|
final int[] childNodeIds = childNodeIdsByNode[nodeIndex];
|
||||||
|
final int edgeCount = childNodeIds.length;
|
||||||
|
@SuppressWarnings("unchecked")
|
||||||
|
final CompiledNode<V>[] children = new CompiledNode[edgeCount];
|
||||||
|
|
||||||
|
for (int edgeIndex = 0; edgeIndex < edgeCount; edgeIndex++) {
|
||||||
|
final int childNodeId = childNodeIds[edgeIndex];
|
||||||
|
if (childNodeId < 0 || childNodeId >= edgeLabelsByNode.length) {
|
||||||
|
throw new IOException(
|
||||||
|
"Invalid child node id at node " + nodeIndex + ", edge index " + edgeIndex + ": "
|
||||||
|
+ childNodeId);
|
||||||
|
}
|
||||||
|
children[edgeIndex] = resolveNode(childNodeId, edgeLabelsByNode, childNodeIdsByNode,
|
||||||
|
orderedValuesByNode, orderedCountsByNode, nodes, inProgress, maxExpandedIndex);
|
||||||
|
}
|
||||||
|
|
||||||
|
final CompiledNode<V> node = new CompiledNode<>(edgeLabels, children, orderedValuesByNode[nodeIndex], maxExpandedIndex,
|
||||||
|
orderedCountsByNode[nodeIndex]);
|
||||||
|
nodes[nodeIndex] = node;
|
||||||
|
return node;
|
||||||
|
} finally {
|
||||||
|
inProgress[nodeIndex] = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void validateSerializedEdges(final int nodeIndex, final char... edgeLabels) throws IOException {
|
||||||
|
for (int edgeIndex = 1; edgeIndex < edgeLabels.length; edgeIndex++) {
|
||||||
|
if (edgeLabels[edgeIndex - 1] >= edgeLabels[edgeIndex]) {
|
||||||
|
throw new IOException("Edge labels must be strictly ascending at node " + nodeIndex + ", edge index "
|
||||||
|
+ edgeIndex + ": '" + edgeLabels[edgeIndex - 1] + "' then '" + edgeLabels[edgeIndex] + "'.");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -501,13 +750,23 @@ public final class FrequencyTrie<V> {
|
|||||||
/**
|
/**
|
||||||
* Locates the compiled node for the supplied key.
|
* Locates the compiled node for the supplied key.
|
||||||
*
|
*
|
||||||
* @param key key to resolve
|
* @param key already-normalized key to resolve
|
||||||
* @return compiled node, or {@code null} if the path does not exist
|
* @return compiled node, or {@code null} if the path does not exist
|
||||||
*/
|
*/
|
||||||
private CompiledNode<V> findNode(final String key) {
|
private CompiledNode<V> findNode(final String key) {
|
||||||
CompiledNode<V> current = this.root;
|
CompiledNode<V> current = this.root;
|
||||||
for (int index = 0; index < key.length(); index++) {
|
if (this.lookupTraversalDirection == WordTraversalDirection.BACKWARD) {
|
||||||
current = current.findChild(key.charAt(index));
|
for (int traversalOffset = key.length() - 1; traversalOffset >= 0; traversalOffset--) {
|
||||||
|
current = current.findChild(key.charAt(traversalOffset));
|
||||||
|
if (current == null) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return current;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int traversalOffset = 0; traversalOffset < key.length(); traversalOffset++) {
|
||||||
|
current = current.findChild(key.charAt(traversalOffset));
|
||||||
if (current == null) {
|
if (current == null) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
@@ -515,6 +774,31 @@ public final class FrequencyTrie<V> {
|
|||||||
return current;
|
return current;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Applies lookup-time case normalization according to persisted metadata.
|
||||||
|
*
|
||||||
|
* @param key lookup key
|
||||||
|
* @return normalized key for trie traversal
|
||||||
|
*/
|
||||||
|
private String normalizeLookupKey(final String key) {
|
||||||
|
if (!this.lowercasesLookupKeys && !this.removeDiacritics) {
|
||||||
|
return key;
|
||||||
|
}
|
||||||
|
|
||||||
|
String normalized = key;
|
||||||
|
if (this.lowercasesLookupKeys) {
|
||||||
|
normalized = normalized.toLowerCase(Locale.ROOT);
|
||||||
|
}
|
||||||
|
if (this.removeDiacritics) {
|
||||||
|
normalized = DiacriticStripper.strip(normalized);
|
||||||
|
} else if (this.metadata.diacriticProcessingMode() == DiacriticProcessingMode.AS_IS_AND_STRIPPED_FALLBACK) {
|
||||||
|
throw new UnsupportedOperationException(
|
||||||
|
"Diacritic processing mode AS_IS_AND_STRIPPED_FALLBACK is not supported yet.");
|
||||||
|
}
|
||||||
|
|
||||||
|
return normalized;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Builder of {@link FrequencyTrie}.
|
* Builder of {@link FrequencyTrie}.
|
||||||
*
|
*
|
||||||
@@ -544,6 +828,31 @@ public final class FrequencyTrie<V> {
|
|||||||
*/
|
*/
|
||||||
private final ReductionSettings reductionSettings;
|
private final ReductionSettings reductionSettings;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Logical key traversal direction used by this builder.
|
||||||
|
*/
|
||||||
|
private final WordTraversalDirection traversalDirection;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Dictionary case processing mode associated with this builder.
|
||||||
|
*/
|
||||||
|
private final CaseProcessingMode caseProcessingMode;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Dictionary diacritic processing mode associated with this builder.
|
||||||
|
*/
|
||||||
|
private final DiacriticProcessingMode diacriticProcessingMode;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Dense edge lookup span threshold.
|
||||||
|
* <p>
|
||||||
|
* This value controls a speed/memory trade-off during freezing:
|
||||||
|
* dense child lookup tables are allocated only for nodes whose child
|
||||||
|
* labels fit in this span.
|
||||||
|
* </p>
|
||||||
|
*/
|
||||||
|
private final int maxExpandedIndex;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Mutable root node.
|
* Mutable root node.
|
||||||
*/
|
*/
|
||||||
@@ -552,13 +861,97 @@ public final class FrequencyTrie<V> {
|
|||||||
/**
|
/**
|
||||||
* Creates a new builder with the provided settings.
|
* Creates a new builder with the provided settings.
|
||||||
*
|
*
|
||||||
|
* <p>
|
||||||
|
* This constructor preserves the historical Egothor behavior and therefore
|
||||||
|
* traverses logical keys from their end toward their beginning.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
* @param arrayFactory array factory
|
* @param arrayFactory array factory
|
||||||
* @param reductionSettings reduction configuration
|
* @param reductionSettings reduction configuration
|
||||||
* @throws NullPointerException if any argument is {@code null}
|
* @throws NullPointerException if any argument is {@code null}
|
||||||
*/
|
*/
|
||||||
public Builder(final IntFunction<V[]> arrayFactory, final ReductionSettings reductionSettings) {
|
public Builder(final IntFunction<V[]> arrayFactory, final ReductionSettings reductionSettings) {
|
||||||
|
this(arrayFactory, reductionSettings, WordTraversalDirection.BACKWARD);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a new builder with the provided settings and explicit traversal
|
||||||
|
* direction.
|
||||||
|
*
|
||||||
|
* @param arrayFactory array factory
|
||||||
|
* @param reductionSettings reduction configuration
|
||||||
|
* @param traversalDirection logical key traversal direction
|
||||||
|
* @throws NullPointerException if any argument is {@code null}
|
||||||
|
*/
|
||||||
|
public Builder(final IntFunction<V[]> arrayFactory, final ReductionSettings reductionSettings,
|
||||||
|
final WordTraversalDirection traversalDirection) {
|
||||||
|
this(arrayFactory, reductionSettings, traversalDirection, CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a new builder with the provided settings, explicit traversal
|
||||||
|
* direction, and explicit case processing mode.
|
||||||
|
*
|
||||||
|
* @param arrayFactory array factory
|
||||||
|
* @param reductionSettings reduction configuration
|
||||||
|
* @param traversalDirection logical key traversal direction
|
||||||
|
* @param caseProcessingMode dictionary case processing mode
|
||||||
|
* @throws NullPointerException if any argument is {@code null}
|
||||||
|
*/
|
||||||
|
public Builder(final IntFunction<V[]> arrayFactory, final ReductionSettings reductionSettings,
|
||||||
|
final WordTraversalDirection traversalDirection, final CaseProcessingMode caseProcessingMode) {
|
||||||
|
this(arrayFactory, reductionSettings, traversalDirection, caseProcessingMode,
|
||||||
|
DiacriticProcessingMode.AS_IS);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a new builder with the provided settings, explicit traversal
|
||||||
|
* direction, explicit case processing mode, and explicit diacritic processing
|
||||||
|
* mode.
|
||||||
|
*
|
||||||
|
* @param arrayFactory array factory
|
||||||
|
* @param reductionSettings reduction configuration
|
||||||
|
* @param traversalDirection logical key traversal direction
|
||||||
|
* @param caseProcessingMode dictionary case processing mode
|
||||||
|
* @param diacriticProcessingMode dictionary diacritic processing mode
|
||||||
|
* @throws NullPointerException if any argument is {@code null}
|
||||||
|
*/
|
||||||
|
public Builder(final IntFunction<V[]> arrayFactory, final ReductionSettings reductionSettings,
|
||||||
|
final WordTraversalDirection traversalDirection, final CaseProcessingMode caseProcessingMode,
|
||||||
|
final DiacriticProcessingMode diacriticProcessingMode) {
|
||||||
|
this(arrayFactory, reductionSettings, traversalDirection, caseProcessingMode, diacriticProcessingMode,
|
||||||
|
CompiledNode.DEFAULT_MAX_EXPANDED_INDEX);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a new builder with the provided settings, explicit traversal
|
||||||
|
* direction, explicit case processing mode, explicit diacritic processing
|
||||||
|
* mode, and an explicit dense child lookup threshold.
|
||||||
|
*
|
||||||
|
* @param arrayFactory array factory
|
||||||
|
* @param reductionSettings reduction configuration
|
||||||
|
* @param traversalDirection logical key traversal direction
|
||||||
|
* @param caseProcessingMode dictionary case processing mode
|
||||||
|
* @param diacriticProcessingMode dictionary diacritic processing mode
|
||||||
|
* @param maxExpandedIndex dense lookup span override; zero disables
|
||||||
|
* dense lookup. Larger values increase direct
|
||||||
|
* indexing opportunities while potentially
|
||||||
|
* increasing materialization memory in nodes
|
||||||
|
* whose edge label span is within the limit.
|
||||||
|
* @throws NullPointerException if any argument is {@code null}
|
||||||
|
*/
|
||||||
|
public Builder(final IntFunction<V[]> arrayFactory, final ReductionSettings reductionSettings,
|
||||||
|
final WordTraversalDirection traversalDirection, final CaseProcessingMode caseProcessingMode,
|
||||||
|
final DiacriticProcessingMode diacriticProcessingMode, final int maxExpandedIndex) {
|
||||||
this.arrayFactory = Objects.requireNonNull(arrayFactory, "arrayFactory");
|
this.arrayFactory = Objects.requireNonNull(arrayFactory, "arrayFactory");
|
||||||
this.reductionSettings = Objects.requireNonNull(reductionSettings, "reductionSettings");
|
this.reductionSettings = Objects.requireNonNull(reductionSettings, "reductionSettings");
|
||||||
|
this.traversalDirection = Objects.requireNonNull(traversalDirection, "traversalDirection");
|
||||||
|
this.caseProcessingMode = Objects.requireNonNull(caseProcessingMode, "caseProcessingMode");
|
||||||
|
this.diacriticProcessingMode = Objects.requireNonNull(diacriticProcessingMode, "diacriticProcessingMode");
|
||||||
|
if (maxExpandedIndex < 0) {
|
||||||
|
throw new IllegalArgumentException("maxExpandedIndex must be non-negative.");
|
||||||
|
}
|
||||||
|
this.maxExpandedIndex = maxExpandedIndex;
|
||||||
this.root = new MutableNode<>();
|
this.root = new MutableNode<>();
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -566,12 +959,31 @@ public final class FrequencyTrie<V> {
|
|||||||
* Creates a new builder using default thresholds for the supplied reduction
|
* Creates a new builder using default thresholds for the supplied reduction
|
||||||
* mode.
|
* mode.
|
||||||
*
|
*
|
||||||
|
* <p>
|
||||||
|
* This constructor preserves the historical Egothor behavior and therefore
|
||||||
|
* traverses logical keys from their end toward their beginning.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
* @param arrayFactory array factory
|
* @param arrayFactory array factory
|
||||||
* @param reductionMode reduction mode
|
* @param reductionMode reduction mode
|
||||||
* @throws NullPointerException if any argument is {@code null}
|
* @throws NullPointerException if any argument is {@code null}
|
||||||
*/
|
*/
|
||||||
public Builder(final IntFunction<V[]> arrayFactory, final ReductionMode reductionMode) {
|
public Builder(final IntFunction<V[]> arrayFactory, final ReductionMode reductionMode) {
|
||||||
this(arrayFactory, ReductionSettings.withDefaults(reductionMode));
|
this(arrayFactory, ReductionSettings.withDefaults(reductionMode), WordTraversalDirection.BACKWARD);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a new builder using default thresholds for the supplied reduction
|
||||||
|
* mode and explicit traversal direction.
|
||||||
|
*
|
||||||
|
* @param arrayFactory array factory
|
||||||
|
* @param reductionMode reduction mode
|
||||||
|
* @param traversalDirection logical key traversal direction
|
||||||
|
* @throws NullPointerException if any argument is {@code null}
|
||||||
|
*/
|
||||||
|
public Builder(final IntFunction<V[]> arrayFactory, final ReductionMode reductionMode,
|
||||||
|
final WordTraversalDirection traversalDirection) {
|
||||||
|
this(arrayFactory, ReductionSettings.withDefaults(reductionMode), traversalDirection);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -611,7 +1023,9 @@ public final class FrequencyTrie<V> {
|
|||||||
reductionContext.canonicalNodeCount());
|
reductionContext.canonicalNodeCount());
|
||||||
}
|
}
|
||||||
|
|
||||||
return new FrequencyTrie<>(this.arrayFactory, compiledRoot);
|
final TrieMetadata metadata = TrieMetadata.forCompilation(this.traversalDirection, this.reductionSettings,
|
||||||
|
this.diacriticProcessingMode, this.caseProcessingMode);
|
||||||
|
return new FrequencyTrie<>(this.arrayFactory, compiledRoot, metadata);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -645,9 +1059,12 @@ public final class FrequencyTrie<V> {
|
|||||||
throw new IllegalArgumentException("count must be at least 1.");
|
throw new IllegalArgumentException("count must be at least 1.");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
final String normalizedKey = normalizeDictionaryKey(key);
|
||||||
|
|
||||||
MutableNode<V> current = this.root;
|
MutableNode<V> current = this.root;
|
||||||
for (int index = 0; index < key.length(); index++) {
|
for (int traversalOffset = 0; traversalOffset < normalizedKey.length(); traversalOffset++) {
|
||||||
final Character edge = key.charAt(index);
|
final Character edge = normalizedKey
|
||||||
|
.charAt(this.traversalDirection.logicalIndex(normalizedKey.length(), traversalOffset));
|
||||||
MutableNode<V> child = current.children().get(edge);
|
MutableNode<V> child = current.children().get(edge);
|
||||||
if (child == null) {
|
if (child == null) {
|
||||||
child = new MutableNode<>(); // NOPMD
|
child = new MutableNode<>(); // NOPMD
|
||||||
@@ -665,6 +1082,30 @@ public final class FrequencyTrie<V> {
|
|||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Applies build-time dictionary-key normalization according to the builder
|
||||||
|
* configuration.
|
||||||
|
*
|
||||||
|
* @param key dictionary key
|
||||||
|
* @return normalized key for trie insertion
|
||||||
|
*/
|
||||||
|
private String normalizeDictionaryKey(final String key) {
|
||||||
|
String normalized = key;
|
||||||
|
|
||||||
|
if (this.caseProcessingMode == CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT) {
|
||||||
|
normalized = normalized.toLowerCase(Locale.ROOT);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (this.diacriticProcessingMode == DiacriticProcessingMode.REMOVE) {
|
||||||
|
normalized = DiacriticStripper.strip(normalized);
|
||||||
|
} else if (this.diacriticProcessingMode == DiacriticProcessingMode.AS_IS_AND_STRIPPED_FALLBACK) {
|
||||||
|
throw new UnsupportedOperationException(
|
||||||
|
"Diacritic processing mode AS_IS_AND_STRIPPED_FALLBACK is not supported yet.");
|
||||||
|
}
|
||||||
|
|
||||||
|
return normalized;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns the number of mutable build-time nodes currently reachable from the
|
* Returns the number of mutable build-time nodes currently reachable from the
|
||||||
* builder root.
|
* builder root.
|
||||||
@@ -679,6 +1120,15 @@ public final class FrequencyTrie<V> {
|
|||||||
return countMutableNodes(this.root);
|
return countMutableNodes(this.root);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the logical key traversal direction used by this builder.
|
||||||
|
*
|
||||||
|
* @return logical key traversal direction
|
||||||
|
*/
|
||||||
|
/* default */ WordTraversalDirection traversalDirection() {
|
||||||
|
return this.traversalDirection;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Counts mutable nodes recursively.
|
* Counts mutable nodes recursively.
|
||||||
*
|
*
|
||||||
@@ -758,7 +1208,7 @@ public final class FrequencyTrie<V> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
final CompiledNode<V> frozen = new CompiledNode<>(edges, childNodes, localSummary.orderedValues(),
|
final CompiledNode<V> frozen = new CompiledNode<>(edges, childNodes, localSummary.orderedValues(),
|
||||||
localSummary.orderedCounts());
|
this.maxExpandedIndex, localSummary.orderedCounts());
|
||||||
cache.put(reducedNode, frozen);
|
cache.put(reducedNode, frozen);
|
||||||
return frozen;
|
return frozen;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -87,10 +87,12 @@ public final class FrequencyTrieBuilders {
|
|||||||
Objects.requireNonNull(arrayFactory, "arrayFactory");
|
Objects.requireNonNull(arrayFactory, "arrayFactory");
|
||||||
Objects.requireNonNull(reductionSettings, "reductionSettings");
|
Objects.requireNonNull(reductionSettings, "reductionSettings");
|
||||||
|
|
||||||
final FrequencyTrie.Builder<V> builder = new FrequencyTrie.Builder<>(arrayFactory, reductionSettings);
|
final FrequencyTrie.Builder<V> builder = new FrequencyTrie.Builder<>(arrayFactory, reductionSettings,
|
||||||
|
source.traversalDirection(), source.metadata().caseProcessingMode(),
|
||||||
|
source.metadata().diacriticProcessingMode());
|
||||||
final StringBuilder keyBuilder = new StringBuilder(64);
|
final StringBuilder keyBuilder = new StringBuilder(64);
|
||||||
|
|
||||||
copyNode(source.root(), keyBuilder, builder);
|
copyNode(source.root(), keyBuilder, builder, source.traversalDirection());
|
||||||
|
|
||||||
LOGGER.log(Level.FINE, "Reconstructed writable builder from compiled trie.");
|
LOGGER.log(Level.FINE, "Reconstructed writable builder from compiled trie.");
|
||||||
return builder;
|
return builder;
|
||||||
@@ -119,18 +121,20 @@ public final class FrequencyTrieBuilders {
|
|||||||
*
|
*
|
||||||
* @param node current compiled node
|
* @param node current compiled node
|
||||||
* @param keyBuilder current key builder
|
* @param keyBuilder current key builder
|
||||||
* @param builder target mutable builder
|
* @param builder target mutable builder
|
||||||
* @param <V> value type
|
* @param traversalDirection logical key traversal direction used by the source
|
||||||
|
* @param <V> value type
|
||||||
*/
|
*/
|
||||||
private static <V> void copyNode(final CompiledNode<V> node, final StringBuilder keyBuilder,
|
private static <V> void copyNode(final CompiledNode<V> node, final StringBuilder keyBuilder,
|
||||||
final FrequencyTrie.Builder<V> builder) {
|
final FrequencyTrie.Builder<V> builder, final WordTraversalDirection traversalDirection) {
|
||||||
|
final String logicalKey = traversalDirection.traversalPathToLogicalKey(keyBuilder);
|
||||||
for (int valueIndex = 0; valueIndex < node.orderedValues().length; valueIndex++) {
|
for (int valueIndex = 0; valueIndex < node.orderedValues().length; valueIndex++) {
|
||||||
builder.put(keyBuilder.toString(), node.orderedValues()[valueIndex], node.orderedCounts()[valueIndex]);
|
builder.put(logicalKey, node.orderedValues()[valueIndex], node.orderedCounts()[valueIndex]);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int childIndex = 0; childIndex < node.edgeLabels().length; childIndex++) {
|
for (int childIndex = 0; childIndex < node.edgeLabels().length; childIndex++) {
|
||||||
keyBuilder.append(node.edgeLabels()[childIndex]);
|
keyBuilder.append(node.edgeLabels()[childIndex]);
|
||||||
copyNode(node.children()[childIndex], keyBuilder, builder);
|
copyNode(node.children()[childIndex], keyBuilder, builder, traversalDirection);
|
||||||
keyBuilder.setLength(keyBuilder.length() - 1);
|
keyBuilder.setLength(keyBuilder.length() - 1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -36,9 +36,10 @@ import java.io.Reader;
|
|||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
import java.util.StringTokenizer;
|
|
||||||
import java.util.logging.Level;
|
import java.util.logging.Level;
|
||||||
import java.util.logging.Logger;
|
import java.util.logging.Logger;
|
||||||
|
|
||||||
@@ -46,14 +47,14 @@ import java.util.logging.Logger;
|
|||||||
* Parser of line-oriented stemmer dictionary files.
|
* Parser of line-oriented stemmer dictionary files.
|
||||||
*
|
*
|
||||||
* <p>
|
* <p>
|
||||||
* Each non-empty logical line consists of a stem followed by zero or more known
|
* Each non-empty logical line uses a tab-separated values layout. The first
|
||||||
* word variants separated by whitespace. The first token is interpreted as the
|
* column is interpreted as the canonical stem, and every following
|
||||||
* canonical stem, and every following token on the same line is interpreted as
|
* tab-separated column on the same line is interpreted as a variant belonging
|
||||||
* a variant belonging to that stem.
|
* to that stem.
|
||||||
*
|
*
|
||||||
* <p>
|
* <p>
|
||||||
* Input lines are normalized to lower case using {@link Locale#ROOT}. Leading
|
* Input line case normalization is controlled by {@link CaseProcessingMode}.
|
||||||
* and trailing whitespace is ignored.
|
* Leading and trailing whitespace around each column is ignored.
|
||||||
*
|
*
|
||||||
* <p>
|
* <p>
|
||||||
* The parser supports line remarks and trailing remarks. The remark markers
|
* The parser supports line remarks and trailing remarks. The remark markers
|
||||||
@@ -61,6 +62,13 @@ import java.util.logging.Logger;
|
|||||||
* remainder of that line is ignored.
|
* remainder of that line is ignored.
|
||||||
*
|
*
|
||||||
* <p>
|
* <p>
|
||||||
|
* Dictionary items containing any Unicode whitespace character are currently
|
||||||
|
* not supported. Such items are ignored and reported through a single
|
||||||
|
* {@link Level#WARNING warning}-level log entry per physical line together with
|
||||||
|
* the source line number, the normalized stem column, and the list of ignored
|
||||||
|
* items from that line.
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
* This class is intentionally stateless and allocation-light so it can be used
|
* This class is intentionally stateless and allocation-light so it can be used
|
||||||
* both by runtime loading and by offline compilation tooling.
|
* both by runtime loading and by offline compilation tooling.
|
||||||
*/
|
*/
|
||||||
@@ -105,11 +113,27 @@ public final class StemmerDictionaryParser {
|
|||||||
* @throws IOException if reading fails
|
* @throws IOException if reading fails
|
||||||
*/
|
*/
|
||||||
public static ParseStatistics parse(final Path path, final EntryHandler entryHandler) throws IOException {
|
public static ParseStatistics parse(final Path path, final EntryHandler entryHandler) throws IOException {
|
||||||
|
return parse(path, CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT, entryHandler);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Parses a dictionary file from a filesystem path.
|
||||||
|
*
|
||||||
|
* @param path dictionary file path
|
||||||
|
* @param caseProcessingMode case processing mode
|
||||||
|
* @param entryHandler handler receiving parsed entries
|
||||||
|
* @return parsing statistics
|
||||||
|
* @throws NullPointerException if any argument is {@code null}
|
||||||
|
* @throws IOException if reading fails
|
||||||
|
*/
|
||||||
|
public static ParseStatistics parse(final Path path, final CaseProcessingMode caseProcessingMode,
|
||||||
|
final EntryHandler entryHandler) throws IOException {
|
||||||
Objects.requireNonNull(path, "path");
|
Objects.requireNonNull(path, "path");
|
||||||
|
Objects.requireNonNull(caseProcessingMode, "caseProcessingMode");
|
||||||
Objects.requireNonNull(entryHandler, "entryHandler");
|
Objects.requireNonNull(entryHandler, "entryHandler");
|
||||||
|
|
||||||
try (BufferedReader reader = Files.newBufferedReader(path, StandardCharsets.UTF_8)) {
|
try (BufferedReader reader = Files.newBufferedReader(path, StandardCharsets.UTF_8)) {
|
||||||
return parse(reader, path.toAbsolutePath().toString(), entryHandler);
|
return parse(reader, path.toAbsolutePath().toString(), caseProcessingMode, entryHandler);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -124,7 +148,23 @@ public final class StemmerDictionaryParser {
|
|||||||
*/
|
*/
|
||||||
public static ParseStatistics parse(final String fileName, final EntryHandler entryHandler) throws IOException {
|
public static ParseStatistics parse(final String fileName, final EntryHandler entryHandler) throws IOException {
|
||||||
Objects.requireNonNull(fileName, "fileName");
|
Objects.requireNonNull(fileName, "fileName");
|
||||||
return parse(Path.of(fileName), entryHandler);
|
return parse(Path.of(fileName), CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT, entryHandler);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Parses a dictionary file from a path string.
|
||||||
|
*
|
||||||
|
* @param fileName dictionary file name or path string
|
||||||
|
* @param caseProcessingMode case processing mode
|
||||||
|
* @param entryHandler handler receiving parsed entries
|
||||||
|
* @return parsing statistics
|
||||||
|
* @throws NullPointerException if any argument is {@code null}
|
||||||
|
* @throws IOException if reading fails
|
||||||
|
*/
|
||||||
|
public static ParseStatistics parse(final String fileName, final CaseProcessingMode caseProcessingMode,
|
||||||
|
final EntryHandler entryHandler) throws IOException {
|
||||||
|
Objects.requireNonNull(fileName, "fileName");
|
||||||
|
return parse(Path.of(fileName), caseProcessingMode, entryHandler);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -139,8 +179,25 @@ public final class StemmerDictionaryParser {
|
|||||||
*/
|
*/
|
||||||
public static ParseStatistics parse(final Reader reader, final String sourceDescription,
|
public static ParseStatistics parse(final Reader reader, final String sourceDescription,
|
||||||
final EntryHandler entryHandler) throws IOException {
|
final EntryHandler entryHandler) throws IOException {
|
||||||
|
return parse(reader, sourceDescription, CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT, entryHandler);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Parses a dictionary from a reader.
|
||||||
|
*
|
||||||
|
* @param reader source reader
|
||||||
|
* @param sourceDescription logical source description for diagnostics
|
||||||
|
* @param caseProcessingMode case processing mode
|
||||||
|
* @param entryHandler handler receiving parsed entries
|
||||||
|
* @return parsing statistics
|
||||||
|
* @throws NullPointerException if any argument is {@code null}
|
||||||
|
* @throws IOException if reading or handler processing fails
|
||||||
|
*/
|
||||||
|
public static ParseStatistics parse(final Reader reader, final String sourceDescription,
|
||||||
|
final CaseProcessingMode caseProcessingMode, final EntryHandler entryHandler) throws IOException {
|
||||||
Objects.requireNonNull(reader, "reader");
|
Objects.requireNonNull(reader, "reader");
|
||||||
Objects.requireNonNull(sourceDescription, "sourceDescription");
|
Objects.requireNonNull(sourceDescription, "sourceDescription");
|
||||||
|
Objects.requireNonNull(caseProcessingMode, "caseProcessingMode");
|
||||||
Objects.requireNonNull(entryHandler, "entryHandler");
|
Objects.requireNonNull(entryHandler, "entryHandler");
|
||||||
|
|
||||||
final BufferedReader bufferedReader = reader instanceof BufferedReader ? (BufferedReader) reader
|
final BufferedReader bufferedReader = reader instanceof BufferedReader ? (BufferedReader) reader
|
||||||
@@ -153,26 +210,56 @@ public final class StemmerDictionaryParser {
|
|||||||
for (String line = bufferedReader.readLine(); line != null; line = bufferedReader.readLine()) {
|
for (String line = bufferedReader.readLine(); line != null; line = bufferedReader.readLine()) {
|
||||||
lineNumber++;
|
lineNumber++;
|
||||||
|
|
||||||
final String normalizedLine = stripRemark(line).trim().toLowerCase(Locale.ROOT);
|
final String normalizedLine = normalizeLineCase(stripRemark(line).trim(), caseProcessingMode);
|
||||||
if (normalizedLine.isEmpty()) {
|
if (normalizedLine.isEmpty()) {
|
||||||
ignoredLineCount++;
|
ignoredLineCount++;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
final StringTokenizer tokenizer = new StringTokenizer(normalizedLine); // NOPMD
|
final String[] rawColumns = normalizedLine.split("\t", -1);
|
||||||
if (!tokenizer.hasMoreTokens()) {
|
if (rawColumns.length == 0) {
|
||||||
ignoredLineCount++;
|
ignoredLineCount++;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
final String stem = tokenizer.nextToken();
|
final String stem = rawColumns[0].strip();
|
||||||
final String[] variants = new String[tokenizer.countTokens()]; // NOPMD
|
final List<String> acceptedVariants = new ArrayList<String>(Math.max(0, rawColumns.length - 1)); // NOPMD
|
||||||
|
|
||||||
for (int index = 0; index < variants.length; index++) {
|
if (stem.isEmpty()) {
|
||||||
variants[index] = tokenizer.nextToken();
|
ignoredLineCount++;
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
entryHandler.onEntry(stem, variants, lineNumber);
|
if (containsWhitespaceCharacter(stem)) {
|
||||||
|
if (LOGGER.isLoggable(Level.WARNING)) {
|
||||||
|
LOGGER.log(Level.WARNING,
|
||||||
|
"Ignoring dictionary line containing whitespace in source {0} at line {1}, stem {2}.",
|
||||||
|
new Object[] { sourceDescription, lineNumber, stem }); // NOPMD
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
int ignored = 0;
|
||||||
|
|
||||||
|
for (int index = 1; index < rawColumns.length; index++) {
|
||||||
|
final String variant = rawColumns[index].strip();
|
||||||
|
if (variant.isEmpty()) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (containsWhitespaceCharacter(variant)) {
|
||||||
|
ignored++;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
acceptedVariants.add(variant);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ignored > 0 && LOGGER.isLoggable(Level.WARNING)) {
|
||||||
|
LOGGER.log(Level.WARNING,
|
||||||
|
"Ignoring dictionary items containing whitespace in source {0} at line {1}, stem {2}, ignored {3}:{4}.",
|
||||||
|
new Object[] { sourceDescription, lineNumber, stem, ignored, rawColumns.length }); // NOPMD
|
||||||
|
}
|
||||||
|
|
||||||
|
entryHandler.onEntry(stem, acceptedVariants.toArray(String[]::new), lineNumber);
|
||||||
logicalEntryCount++;
|
logicalEntryCount++;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -188,6 +275,36 @@ public final class StemmerDictionaryParser {
|
|||||||
return statistics;
|
return statistics;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Applies case normalization to one line according to the selected mode.
|
||||||
|
*
|
||||||
|
* @param line line to normalize
|
||||||
|
* @param caseProcessingMode case processing mode
|
||||||
|
* @return normalized line
|
||||||
|
*/
|
||||||
|
private static String normalizeLineCase(final String line, final CaseProcessingMode caseProcessingMode) {
|
||||||
|
if (caseProcessingMode == CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT) {
|
||||||
|
return line.toLowerCase(Locale.ROOT);
|
||||||
|
}
|
||||||
|
return line;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Determines whether one dictionary item contains any Unicode whitespace
|
||||||
|
* character.
|
||||||
|
*
|
||||||
|
* @param item dictionary item to inspect
|
||||||
|
* @return {@code true} when the item contains at least one whitespace character
|
||||||
|
*/
|
||||||
|
private static boolean containsWhitespaceCharacter(final String item) {
|
||||||
|
for (int index = 0; index < item.length(); index++) {
|
||||||
|
if (Character.isWhitespace(item.charAt(index))) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Removes a trailing remark from one physical line.
|
* Removes a trailing remark from one physical line.
|
||||||
*
|
*
|
||||||
|
|||||||
@@ -0,0 +1,758 @@
|
|||||||
|
/*******************************************************************************
|
||||||
|
* Copyright (C) 2026, Leo Galambos
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer in the documentation
|
||||||
|
* and/or other materials provided with the distribution.
|
||||||
|
*
|
||||||
|
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||||
|
* may be used to endorse or promote products derived from this software
|
||||||
|
* without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||||
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
******************************************************************************/
|
||||||
|
package org.egothor.stemmer;
|
||||||
|
|
||||||
|
import java.io.BufferedReader;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.io.InputStreamReader;
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Locale;
|
||||||
|
import java.util.Objects;
|
||||||
|
import java.util.SplittableRandom;
|
||||||
|
import java.util.logging.Level;
|
||||||
|
import java.util.logging.Logger;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Evaluates how stemming quality degrades when the compiled trie is built from
|
||||||
|
* only a deterministic subset of the available dictionary knowledge.
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* The experiment operates on whole dictionary entries. For a chosen knowledge
|
||||||
|
* percentage, each parsed dictionary line is deterministically included or
|
||||||
|
* excluded from the training subset using a seeded {@link SplittableRandom}.
|
||||||
|
* The resulting subset is compiled into a {@link FrequencyTrie}, while the
|
||||||
|
* evaluation is performed against all word forms from the original dictionary.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* Two lookup APIs are evaluated:
|
||||||
|
* </p>
|
||||||
|
* <ul>
|
||||||
|
* <li>{@link FrequencyTrie#get(String)} through top-1 accuracy</li>
|
||||||
|
* <li>{@link FrequencyTrie#getAll(String)} through global precision, recall,
|
||||||
|
* and F1</li>
|
||||||
|
* </ul>
|
||||||
|
*/
|
||||||
|
public final class StemmerKnowledgeExperiment {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Logger of this class.
|
||||||
|
*/
|
||||||
|
private static final Logger LOGGER = Logger.getLogger(StemmerKnowledgeExperiment.class.getName());
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Minimum supported knowledge percentage.
|
||||||
|
*/
|
||||||
|
public static final int MINIMUM_KNOWLEDGE_PERCENT = 10;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Maximum supported knowledge percentage.
|
||||||
|
*/
|
||||||
|
public static final int MAXIMUM_KNOWLEDGE_PERCENT = 100;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Step between adjacent evaluated knowledge percentages.
|
||||||
|
*/
|
||||||
|
public static final int KNOWLEDGE_PERCENT_STEP = 10;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Canonical no-op patch command.
|
||||||
|
*/
|
||||||
|
private static final String NOOP_PATCH_COMMAND = PatchCommandEncoder.NOOP_PATCH;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Shared patch encoder reused for subset compilation.
|
||||||
|
*/
|
||||||
|
private final PatchCommandEncoder patchCommandEncoder;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a new experiment harness.
|
||||||
|
*/
|
||||||
|
public StemmerKnowledgeExperiment() {
|
||||||
|
this.patchCommandEncoder = PatchCommandEncoder.builder().build();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Evaluates all supported bundled dictionaries using the supplied seed.
|
||||||
|
*
|
||||||
|
* @param seed deterministic sampling seed
|
||||||
|
* @return immutable ordered list of experiment rows
|
||||||
|
* @throws IOException if reading a bundled dictionary fails
|
||||||
|
*/
|
||||||
|
public List<ResultRow> evaluateAllBundledLanguages(final long seed) throws IOException {
|
||||||
|
final List<ResultRow> rows = new ArrayList<>();
|
||||||
|
for (StemmerPatchTrieLoader.Language language : StemmerPatchTrieLoader.Language.values()) {
|
||||||
|
rows.addAll(evaluateBundledLanguage(language, seed));
|
||||||
|
}
|
||||||
|
return List.copyOf(rows);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Evaluates one bundled dictionary across all supported experiment
|
||||||
|
* configurations.
|
||||||
|
*
|
||||||
|
* @param language bundled language dictionary
|
||||||
|
* @param seed deterministic sampling seed
|
||||||
|
* @return immutable ordered list of experiment rows
|
||||||
|
* @throws NullPointerException if {@code language} is {@code null}
|
||||||
|
* @throws IOException if reading the bundled dictionary fails
|
||||||
|
*/
|
||||||
|
public List<ResultRow> evaluateBundledLanguage(final StemmerPatchTrieLoader.Language language, final long seed)
|
||||||
|
throws IOException {
|
||||||
|
Objects.requireNonNull(language, "language");
|
||||||
|
final String resourcePath = language.resourcePath();
|
||||||
|
try (InputStream inputStream = StemmerPatchTrieLoader.openBundledResource(resourcePath)) {
|
||||||
|
try (BufferedReader reader = new BufferedReader(
|
||||||
|
new InputStreamReader(inputStream, StandardCharsets.UTF_8))) {
|
||||||
|
return evaluate(reader, resourcePath, language.name(), seed);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Evaluates one filesystem dictionary across all supported experiment
|
||||||
|
* configurations.
|
||||||
|
*
|
||||||
|
* @param dictionaryPath path to a dictionary file
|
||||||
|
* @param seed deterministic sampling seed
|
||||||
|
* @return immutable ordered list of experiment rows
|
||||||
|
* @throws NullPointerException if {@code dictionaryPath} is {@code null}
|
||||||
|
* @throws IOException if reading fails
|
||||||
|
*/
|
||||||
|
public List<ResultRow> evaluatePath(final Path dictionaryPath, final long seed) throws IOException {
|
||||||
|
Objects.requireNonNull(dictionaryPath, "dictionaryPath");
|
||||||
|
try (BufferedReader reader = Files.newBufferedReader(dictionaryPath, StandardCharsets.UTF_8)) {
|
||||||
|
return evaluate(reader, dictionaryPath.toAbsolutePath().toString(), dictionaryPath.getFileName().toString(),
|
||||||
|
seed);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Evaluates a dictionary provided through an arbitrary reader.
|
||||||
|
*
|
||||||
|
* @param reader source reader
|
||||||
|
* @param sourceDescription logical source description
|
||||||
|
* @param languageLabel label stored in the result rows
|
||||||
|
* @param seed deterministic sampling seed
|
||||||
|
* @return immutable ordered list of experiment rows
|
||||||
|
* @throws NullPointerException if any argument except {@code seed} is
|
||||||
|
* {@code null}
|
||||||
|
* @throws IOException if parsing fails
|
||||||
|
*/
|
||||||
|
public List<ResultRow> evaluate(final Reader reader, final String sourceDescription, final String languageLabel,
|
||||||
|
final long seed) throws IOException {
|
||||||
|
Objects.requireNonNull(reader, "reader");
|
||||||
|
Objects.requireNonNull(sourceDescription, "sourceDescription");
|
||||||
|
Objects.requireNonNull(languageLabel, "languageLabel");
|
||||||
|
|
||||||
|
final DictionaryData dictionaryData = readDictionary(reader, sourceDescription);
|
||||||
|
final List<ResultRow> rows = new ArrayList<>();
|
||||||
|
|
||||||
|
for (ReductionMode reductionMode : ReductionMode.values()) {
|
||||||
|
final ReductionSettings reductionSettings = ReductionSettings.withDefaults(reductionMode);
|
||||||
|
for (boolean storeOriginal : new boolean[] { false, true }) { // NOPMD
|
||||||
|
for (boolean includeStemInEvaluation : new boolean[] { false, true }) { // NOPMD
|
||||||
|
for (int knowledgePercent = MINIMUM_KNOWLEDGE_PERCENT; knowledgePercent <= MAXIMUM_KNOWLEDGE_PERCENT; knowledgePercent += KNOWLEDGE_PERCENT_STEP) {
|
||||||
|
final ResultRow row = evaluateScenario(dictionaryData, languageLabel, seed, reductionSettings,
|
||||||
|
storeOriginal, includeStemInEvaluation, knowledgePercent);
|
||||||
|
rows.add(row);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (LOGGER.isLoggable(Level.INFO)) {
|
||||||
|
LOGGER.log(Level.INFO, "Knowledge experiment finished for source {0}: entries={1}, rows={2}, seed={3}.",
|
||||||
|
new Object[] { sourceDescription, dictionaryData.entryCount(), rows.size(), seed });
|
||||||
|
}
|
||||||
|
|
||||||
|
return List.copyOf(rows);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Writes result rows as UTF-8 CSV with a stable fixed header.
|
||||||
|
*
|
||||||
|
* @param outputPath target file path
|
||||||
|
* @param rows rows to write
|
||||||
|
* @throws NullPointerException if any argument is {@code null}
|
||||||
|
* @throws IOException if writing fails
|
||||||
|
*/
|
||||||
|
public static void writeCsv(final Path outputPath, final List<ResultRow> rows) throws IOException {
|
||||||
|
Objects.requireNonNull(outputPath, "outputPath");
|
||||||
|
Objects.requireNonNull(rows, "rows");
|
||||||
|
|
||||||
|
final Path parent = outputPath.getParent();
|
||||||
|
if (parent != null) {
|
||||||
|
Files.createDirectories(parent);
|
||||||
|
}
|
||||||
|
|
||||||
|
final List<String> lines = new ArrayList<>(rows.size() + 1);
|
||||||
|
lines.add(ResultRow.csvHeader());
|
||||||
|
for (ResultRow row : rows) {
|
||||||
|
lines.add(row.toCsvRow());
|
||||||
|
}
|
||||||
|
Files.write(outputPath, lines, StandardCharsets.UTF_8);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Parses the full dictionary into an in-memory representation suitable for
|
||||||
|
* repeated deterministic subset compilation.
|
||||||
|
*
|
||||||
|
* @param reader source reader
|
||||||
|
* @param sourceDescription logical source description
|
||||||
|
* @return parsed dictionary data
|
||||||
|
* @throws IOException if parsing fails
|
||||||
|
*/
|
||||||
|
private static DictionaryData readDictionary(final Reader reader, final String sourceDescription)
|
||||||
|
throws IOException {
|
||||||
|
final List<DictionaryEntry> entries = new ArrayList<>();
|
||||||
|
final StemmerDictionaryParser.ParseStatistics parseStatistics = StemmerDictionaryParser.parse(reader,
|
||||||
|
sourceDescription,
|
||||||
|
(stem, variants, lineNumber) -> entries.add(new DictionaryEntry(stem, variants, lineNumber)));
|
||||||
|
return new DictionaryData(sourceDescription, parseStatistics, entries);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Evaluates one concrete experiment scenario.
|
||||||
|
*
|
||||||
|
* @param dictionaryData parsed dictionary data
|
||||||
|
* @param languageLabel logical language label
|
||||||
|
* @param seed deterministic sampling seed
|
||||||
|
* @param reductionSettings reduction settings
|
||||||
|
* @param storeOriginal whether canonical stems are inserted with a
|
||||||
|
* no-op patch
|
||||||
|
* @param includeStemInEvaluation whether the canonical stem itself is evaluated
|
||||||
|
* @param knowledgePercent retained percentage of dictionary entries
|
||||||
|
* @return result row
|
||||||
|
*/
|
||||||
|
private ResultRow evaluateScenario(final DictionaryData dictionaryData, final String languageLabel, final long seed,
|
||||||
|
final ReductionSettings reductionSettings, final boolean storeOriginal,
|
||||||
|
final boolean includeStemInEvaluation, final int knowledgePercent) {
|
||||||
|
final FrequencyTrie<String> trie = compileSubset(dictionaryData, reductionSettings, storeOriginal,
|
||||||
|
knowledgePercent, seed);
|
||||||
|
|
||||||
|
long evaluatedInputCount = 0L;
|
||||||
|
long getCorrectCount = 0L;
|
||||||
|
long getAllTruePositiveCount = 0L;
|
||||||
|
long getAllFalsePositiveCount = 0L;
|
||||||
|
long getAllCoveredInputCount = 0L;
|
||||||
|
long uniqueCandidateCount = 0L;
|
||||||
|
|
||||||
|
for (DictionaryEntry entry : dictionaryData.entries()) {
|
||||||
|
if (includeStemInEvaluation) {
|
||||||
|
final EvaluationCounts stemCounts = evaluateInput(entry.stem(), entry.stem(), trie);
|
||||||
|
evaluatedInputCount++;
|
||||||
|
getCorrectCount += stemCounts.getCorrect();
|
||||||
|
getAllTruePositiveCount += stemCounts.getAllTruePositives();
|
||||||
|
getAllFalsePositiveCount += stemCounts.getAllFalsePositives();
|
||||||
|
getAllCoveredInputCount += stemCounts.getAllCoveredInputs();
|
||||||
|
uniqueCandidateCount += stemCounts.getUniqueCandidateCount();
|
||||||
|
}
|
||||||
|
for (String variant : entry.variants()) {
|
||||||
|
final EvaluationCounts variantCounts = evaluateInput(variant, entry.stem(), trie);
|
||||||
|
evaluatedInputCount++;
|
||||||
|
getCorrectCount += variantCounts.getCorrect();
|
||||||
|
getAllTruePositiveCount += variantCounts.getAllTruePositives();
|
||||||
|
getAllFalsePositiveCount += variantCounts.getAllFalsePositives();
|
||||||
|
getAllCoveredInputCount += variantCounts.getAllCoveredInputs();
|
||||||
|
uniqueCandidateCount += variantCounts.getUniqueCandidateCount();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
final long trainingEntryCount = countSelectedEntries(dictionaryData.entryCount(), seed, knowledgePercent);
|
||||||
|
final double getAccuracy = ratio(getCorrectCount, evaluatedInputCount);
|
||||||
|
final double getAllPrecision = ratio(getAllTruePositiveCount,
|
||||||
|
getAllTruePositiveCount + getAllFalsePositiveCount);
|
||||||
|
final double getAllRecall = ratio(getAllCoveredInputCount, evaluatedInputCount);
|
||||||
|
final double getAllF1 = f1(getAllPrecision, getAllRecall);
|
||||||
|
final double averageUniqueCandidateCount = ratio(uniqueCandidateCount, evaluatedInputCount);
|
||||||
|
|
||||||
|
return new ResultRow(languageLabel, reductionSettings.reductionMode().name(), storeOriginal,
|
||||||
|
includeStemInEvaluation, knowledgePercent, seed, dictionaryData.entryCount(), trainingEntryCount,
|
||||||
|
evaluatedInputCount, getCorrectCount, getAccuracy, getAllTruePositiveCount, getAllFalsePositiveCount,
|
||||||
|
getAllCoveredInputCount, getAllPrecision, getAllRecall, getAllF1, averageUniqueCandidateCount);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Compiles a trie from the deterministically selected subset of dictionary
|
||||||
|
* entries.
|
||||||
|
*
|
||||||
|
* @param dictionaryData parsed dictionary data
|
||||||
|
* @param reductionSettings reduction settings
|
||||||
|
* @param storeOriginal whether stems themselves should be stored
|
||||||
|
* @param knowledgePercent retained percentage of dictionary entries
|
||||||
|
* @param seed deterministic sampling seed
|
||||||
|
* @return compiled trie for the selected subset
|
||||||
|
*/
|
||||||
|
private FrequencyTrie<String> compileSubset(final DictionaryData dictionaryData,
|
||||||
|
final ReductionSettings reductionSettings, final boolean storeOriginal, final int knowledgePercent,
|
||||||
|
final long seed) {
|
||||||
|
validateKnowledgePercent(knowledgePercent);
|
||||||
|
|
||||||
|
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(String[]::new, reductionSettings);
|
||||||
|
final SplittableRandom random = new SplittableRandom(seed);
|
||||||
|
|
||||||
|
for (DictionaryEntry entry : dictionaryData.entries()) {
|
||||||
|
if (!isSelected(random, knowledgePercent)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (storeOriginal) {
|
||||||
|
builder.put(entry.stem(), NOOP_PATCH_COMMAND);
|
||||||
|
}
|
||||||
|
for (String variant : entry.variants()) {
|
||||||
|
final String patch = this.patchCommandEncoder.encode(variant, entry.stem());
|
||||||
|
builder.put(variant, patch);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return builder.build();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Evaluates one input word form against both lookup APIs.
|
||||||
|
*
|
||||||
|
* @param input input form to transform
|
||||||
|
* @param expectedStem expected stem
|
||||||
|
* @param trie compiled trie under test
|
||||||
|
* @return immutable counts for this single input
|
||||||
|
*/
|
||||||
|
private static EvaluationCounts evaluateInput(final String input, final String expectedStem,
|
||||||
|
final FrequencyTrie<String> trie) {
|
||||||
|
long getCorrect = 0L;
|
||||||
|
final String preferredPatch = trie.get(input);
|
||||||
|
if (preferredPatch != null) {
|
||||||
|
final String preferredStem = PatchCommandEncoder.apply(input, preferredPatch);
|
||||||
|
if (expectedStem.equals(preferredStem)) {
|
||||||
|
getCorrect = 1L;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (expectedStem.equals(input)) {
|
||||||
|
getCorrect = 1L;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
final String[] patches = trie.getAll(input);
|
||||||
|
|
||||||
|
long truePositives = 0L;
|
||||||
|
long falsePositives = 0L;
|
||||||
|
long coveredInputs = 0L;
|
||||||
|
for (String patch : patches) {
|
||||||
|
final String candidateStem = PatchCommandEncoder.apply(input, patch);
|
||||||
|
if (expectedStem.equals(candidateStem)) {
|
||||||
|
truePositives++;
|
||||||
|
coveredInputs = 1L;
|
||||||
|
} else {
|
||||||
|
falsePositives++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return new EvaluationCounts(getCorrect, truePositives, falsePositives, coveredInputs, patches.length);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Counts how many entries would be selected for one scenario without
|
||||||
|
* recompiling the trie.
|
||||||
|
*
|
||||||
|
* @param entryCount total entry count
|
||||||
|
* @param seed deterministic sampling seed
|
||||||
|
* @param knowledgePercent retained percentage of dictionary entries
|
||||||
|
* @return selected entry count
|
||||||
|
*/
|
||||||
|
private static long countSelectedEntries(final int entryCount, final long seed, final int knowledgePercent) {
|
||||||
|
validateKnowledgePercent(knowledgePercent);
|
||||||
|
final SplittableRandom random = new SplittableRandom(seed);
|
||||||
|
long count = 0L;
|
||||||
|
for (int index = 0; index < entryCount; index++) {
|
||||||
|
if (isSelected(random, knowledgePercent)) {
|
||||||
|
count++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return count;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns whether one entry is selected for the supplied knowledge level.
|
||||||
|
*
|
||||||
|
* @param random deterministic random source
|
||||||
|
* @param knowledgePercent retained percentage of entries
|
||||||
|
* @return {@code true} when the entry should be kept
|
||||||
|
*/
|
||||||
|
private static boolean isSelected(final SplittableRandom random, final int knowledgePercent) {
|
||||||
|
return random.nextInt(100) < knowledgePercent;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Validates one knowledge percentage value.
|
||||||
|
*
|
||||||
|
* @param knowledgePercent value to validate
|
||||||
|
*/
|
||||||
|
private static void validateKnowledgePercent(final int knowledgePercent) {
|
||||||
|
if (knowledgePercent < MINIMUM_KNOWLEDGE_PERCENT || knowledgePercent > MAXIMUM_KNOWLEDGE_PERCENT
|
||||||
|
|| knowledgePercent % KNOWLEDGE_PERCENT_STEP != 0) {
|
||||||
|
throw new IllegalArgumentException(
|
||||||
|
"knowledgePercent must be one of 10, 20, ..., 100 but was " + knowledgePercent + '.');
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Computes a safe ratio.
|
||||||
|
*
|
||||||
|
* @param numerator numerator
|
||||||
|
* @param denominator denominator
|
||||||
|
* @return ratio, or {@code 0.0} when the denominator is zero
|
||||||
|
*/
|
||||||
|
private static double ratio(final long numerator, final long denominator) {
|
||||||
|
if (denominator == 0L) { // NOPMD
|
||||||
|
return 0.0d;
|
||||||
|
}
|
||||||
|
return (double) numerator / (double) denominator; // NOPMD
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Computes the harmonic mean of precision and recall.
|
||||||
|
*
|
||||||
|
* @param precision global precision
|
||||||
|
* @param recall global recall
|
||||||
|
* @return F1 score, or {@code 0.0} when both inputs are zero
|
||||||
|
*/
|
||||||
|
private static double f1(final double precision, final double recall) {
|
||||||
|
if (precision == 0.0d && recall == 0.0d) {
|
||||||
|
return 0.0d;
|
||||||
|
}
|
||||||
|
return 2.0d * precision * recall / (precision + recall);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* One parsed dictionary line.
|
||||||
|
*
|
||||||
|
* @param stem canonical stem
|
||||||
|
* @param variants known variants of the stem
|
||||||
|
* @param lineNumber physical line number in the source dictionary
|
||||||
|
*/
|
||||||
|
private record DictionaryEntry(String stem, String[] variants, int lineNumber) {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a parsed dictionary entry.
|
||||||
|
*
|
||||||
|
* @param stem canonical stem
|
||||||
|
* @param variants known variants of the stem
|
||||||
|
* @param lineNumber physical line number in the source dictionary
|
||||||
|
*/
|
||||||
|
private DictionaryEntry {
|
||||||
|
Objects.requireNonNull(stem, "stem");
|
||||||
|
Objects.requireNonNull(variants, "variants");
|
||||||
|
if (lineNumber < 1) { // NOPMD
|
||||||
|
throw new IllegalArgumentException("lineNumber must be positive.");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Parsed dictionary state reused across all scenarios.
|
||||||
|
*
|
||||||
|
* @param sourceDescription logical source description
|
||||||
|
* @param parseStatistics parser statistics
|
||||||
|
* @param entries immutable ordered entries
|
||||||
|
*/
|
||||||
|
private record DictionaryData(String sourceDescription, StemmerDictionaryParser.ParseStatistics parseStatistics,
|
||||||
|
List<DictionaryEntry> entries) {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates parsed dictionary data.
|
||||||
|
*
|
||||||
|
* @param sourceDescription logical source description
|
||||||
|
* @param parseStatistics parser statistics
|
||||||
|
* @param entries immutable ordered entries
|
||||||
|
*/
|
||||||
|
private DictionaryData {
|
||||||
|
Objects.requireNonNull(sourceDescription, "sourceDescription");
|
||||||
|
Objects.requireNonNull(parseStatistics, "parseStatistics");
|
||||||
|
Objects.requireNonNull(entries, "entries");
|
||||||
|
entries = List.copyOf(entries);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the number of logical dictionary entries.
|
||||||
|
*
|
||||||
|
* @return entry count
|
||||||
|
*/
|
||||||
|
private int entryCount() {
|
||||||
|
return this.entries.size();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Per-input evaluation counts.
|
||||||
|
*/
|
||||||
|
private static final class EvaluationCounts {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Preferred lookup correctness.
|
||||||
|
*/
|
||||||
|
private final long getCorrect;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Number of correct candidates returned by {@code getAll()}.
|
||||||
|
*/
|
||||||
|
private final long getAllTruePositives;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Number of incorrect candidates returned by {@code getAll()}.
|
||||||
|
*/
|
||||||
|
private final long getAllFalsePositives;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Whether the correct stem was covered by {@code getAll()}.
|
||||||
|
*/
|
||||||
|
private final long getAllCoveredInputs;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Number of candidate commands returned by {@code getAll()}.
|
||||||
|
*/
|
||||||
|
private final long uniqueCandidateCount;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a new immutable counter object.
|
||||||
|
*
|
||||||
|
* @param getCorrect preferred lookup correctness
|
||||||
|
* @param getAllTruePositives correct candidates
|
||||||
|
* @param getAllFalsePositives incorrect candidates
|
||||||
|
* @param getAllCoveredInputs coverage marker
|
||||||
|
* @param uniqueCandidateCount candidate command count
|
||||||
|
*/
|
||||||
|
private EvaluationCounts(final long getCorrect, final long getAllTruePositives, final long getAllFalsePositives,
|
||||||
|
final long getAllCoveredInputs, final long uniqueCandidateCount) {
|
||||||
|
this.getCorrect = getCorrect;
|
||||||
|
this.getAllTruePositives = getAllTruePositives;
|
||||||
|
this.getAllFalsePositives = getAllFalsePositives;
|
||||||
|
this.getAllCoveredInputs = getAllCoveredInputs;
|
||||||
|
this.uniqueCandidateCount = uniqueCandidateCount;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns preferred lookup correctness.
|
||||||
|
*
|
||||||
|
* @return preferred lookup correctness
|
||||||
|
*/
|
||||||
|
private long getCorrect() {
|
||||||
|
return this.getCorrect;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the number of correct candidates.
|
||||||
|
*
|
||||||
|
* @return correct candidates
|
||||||
|
*/
|
||||||
|
private long getAllTruePositives() {
|
||||||
|
return this.getAllTruePositives;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the number of incorrect candidates.
|
||||||
|
*
|
||||||
|
* @return incorrect candidates
|
||||||
|
*/
|
||||||
|
private long getAllFalsePositives() {
|
||||||
|
return this.getAllFalsePositives;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the per-input coverage marker.
|
||||||
|
*
|
||||||
|
* @return coverage marker
|
||||||
|
*/
|
||||||
|
private long getAllCoveredInputs() {
|
||||||
|
return this.getAllCoveredInputs;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the number of candidate commands.
|
||||||
|
*
|
||||||
|
* @return candidate command count
|
||||||
|
*/
|
||||||
|
private long getUniqueCandidateCount() {
|
||||||
|
return this.uniqueCandidateCount;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* One immutable result row of the knowledge experiment.
|
||||||
|
*
|
||||||
|
* @param language language label
|
||||||
|
* @param reductionMode reduction mode name
|
||||||
|
* @param storeOriginal whether no-op patches were stored for
|
||||||
|
* canonical stems
|
||||||
|
* @param includeStemInEvaluation whether canonical stems were part of the
|
||||||
|
* evaluated inputs
|
||||||
|
* @param knowledgePercent retained knowledge percentage
|
||||||
|
* @param seed deterministic sampling seed
|
||||||
|
* @param dictionaryEntryCount total parsed dictionary entry count
|
||||||
|
* @param trainingEntryCount selected dictionary entry count used for
|
||||||
|
* build
|
||||||
|
* @param evaluatedInputCount total evaluated input count
|
||||||
|
* @param getCorrectCount number of correct preferred
|
||||||
|
* transformations
|
||||||
|
* @param getAccuracy preferred lookup accuracy
|
||||||
|
* @param getAllTruePositiveCount number of unique correct candidates from
|
||||||
|
* {@code getAll()}
|
||||||
|
* @param getAllFalsePositiveCount number of unique incorrect candidates from
|
||||||
|
* {@code getAll()}
|
||||||
|
* @param getAllCoveredInputCount number of inputs for which the correct
|
||||||
|
* stem appeared in {@code getAll()}
|
||||||
|
* @param getAllPrecision global candidate precision for
|
||||||
|
* {@code getAll()}
|
||||||
|
* @param getAllRecall global input recall for {@code getAll()}
|
||||||
|
* @param getAllF1 F1 score derived from {@code getAll()}
|
||||||
|
* precision and recall
|
||||||
|
* @param averageUniqueCandidateCount average number of unique candidate stems
|
||||||
|
* per input
|
||||||
|
*/
|
||||||
|
public record ResultRow(String language, String reductionMode, boolean storeOriginal,
|
||||||
|
boolean includeStemInEvaluation, int knowledgePercent, long seed, int dictionaryEntryCount,
|
||||||
|
long trainingEntryCount, long evaluatedInputCount, long getCorrectCount, double getAccuracy,
|
||||||
|
long getAllTruePositiveCount, long getAllFalsePositiveCount, long getAllCoveredInputCount,
|
||||||
|
double getAllPrecision, double getAllRecall, double getAllF1, double averageUniqueCandidateCount) {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates one immutable result row.
|
||||||
|
*
|
||||||
|
* @param language language label
|
||||||
|
* @param reductionMode reduction mode name
|
||||||
|
* @param storeOriginal whether no-op patches were stored for
|
||||||
|
* canonical stems
|
||||||
|
* @param includeStemInEvaluation whether canonical stems were evaluated
|
||||||
|
* @param knowledgePercent retained knowledge percentage
|
||||||
|
* @param seed deterministic sampling seed
|
||||||
|
* @param dictionaryEntryCount total dictionary entry count
|
||||||
|
* @param trainingEntryCount selected training entry count
|
||||||
|
* @param evaluatedInputCount total evaluated input count
|
||||||
|
* @param getCorrectCount number of correct preferred
|
||||||
|
* transformations
|
||||||
|
* @param getAccuracy preferred lookup accuracy
|
||||||
|
* @param getAllTruePositiveCount number of unique correct candidates
|
||||||
|
* @param getAllFalsePositiveCount number of unique incorrect candidates
|
||||||
|
* @param getAllCoveredInputCount coverage count for {@code getAll()}
|
||||||
|
* @param getAllPrecision global candidate precision for
|
||||||
|
* {@code getAll()}
|
||||||
|
* @param getAllRecall global input recall for {@code getAll()}
|
||||||
|
* @param getAllF1 harmonic mean of precision and recall
|
||||||
|
* @param averageUniqueCandidateCount average unique candidate count per input
|
||||||
|
*/
|
||||||
|
@SuppressWarnings("PMD.AvoidLiteralsInIfCondition")
|
||||||
|
public ResultRow {
|
||||||
|
Objects.requireNonNull(language, "language");
|
||||||
|
Objects.requireNonNull(reductionMode, "reductionMode");
|
||||||
|
validateKnowledgePercent(knowledgePercent);
|
||||||
|
if (dictionaryEntryCount < 0) {
|
||||||
|
throw new IllegalArgumentException("dictionaryEntryCount must not be negative.");
|
||||||
|
}
|
||||||
|
if (trainingEntryCount < 0L) {
|
||||||
|
throw new IllegalArgumentException("trainingEntryCount must not be negative.");
|
||||||
|
}
|
||||||
|
if (evaluatedInputCount < 0L) {
|
||||||
|
throw new IllegalArgumentException("evaluatedInputCount must not be negative.");
|
||||||
|
}
|
||||||
|
if (getCorrectCount < 0L) {
|
||||||
|
throw new IllegalArgumentException("getCorrectCount must not be negative.");
|
||||||
|
}
|
||||||
|
if (getAllTruePositiveCount < 0L) {
|
||||||
|
throw new IllegalArgumentException("getAllTruePositiveCount must not be negative.");
|
||||||
|
}
|
||||||
|
if (getAllFalsePositiveCount < 0L) {
|
||||||
|
throw new IllegalArgumentException("getAllFalsePositiveCount must not be negative.");
|
||||||
|
}
|
||||||
|
if (getAllCoveredInputCount < 0L) {
|
||||||
|
throw new IllegalArgumentException("getAllCoveredInputCount must not be negative.");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the stable CSV header of this result format.
|
||||||
|
*
|
||||||
|
* @return CSV header line
|
||||||
|
*/
|
||||||
|
public static String csvHeader() {
|
||||||
|
return String.join(",",
|
||||||
|
List.of("language", "reductionMode", "storeOriginal", "includeStemInEvaluation", "knowledgePercent",
|
||||||
|
"seed", "dictionaryEntryCount", "trainingEntryCount", "evaluatedInputCount",
|
||||||
|
"getCorrectCount", "getAccuracy", "getAllTruePositiveCount", "getAllFalsePositiveCount",
|
||||||
|
"getAllCoveredInputCount", "getAllPrecision", "getAllRecall", "getAllF1",
|
||||||
|
"averageUniqueCandidateCount"));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Serializes this row as one CSV record.
|
||||||
|
*
|
||||||
|
* @return CSV record
|
||||||
|
*/
|
||||||
|
public String toCsvRow() {
|
||||||
|
return String.join(",",
|
||||||
|
List.of(escapeCsv(this.language), escapeCsv(this.reductionMode), String.valueOf(this.storeOriginal),
|
||||||
|
String.valueOf(this.includeStemInEvaluation), String.valueOf(this.knowledgePercent),
|
||||||
|
String.valueOf(this.seed), String.valueOf(this.dictionaryEntryCount),
|
||||||
|
String.valueOf(this.trainingEntryCount), String.valueOf(this.evaluatedInputCount),
|
||||||
|
String.valueOf(this.getCorrectCount), formatDouble(this.getAccuracy),
|
||||||
|
String.valueOf(this.getAllTruePositiveCount), String.valueOf(this.getAllFalsePositiveCount),
|
||||||
|
String.valueOf(this.getAllCoveredInputCount), formatDouble(this.getAllPrecision),
|
||||||
|
formatDouble(this.getAllRecall), formatDouble(this.getAllF1),
|
||||||
|
formatDouble(this.averageUniqueCandidateCount)));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Escapes a string for CSV output.
|
||||||
|
*
|
||||||
|
* @param value value to escape
|
||||||
|
* @return escaped CSV cell
|
||||||
|
*/
|
||||||
|
private static String escapeCsv(final String value) {
|
||||||
|
if (value.indexOf(',') < 0 && value.indexOf('"') < 0 && value.indexOf('\n') < 0
|
||||||
|
&& value.indexOf('\r') < 0) {
|
||||||
|
return value;
|
||||||
|
}
|
||||||
|
return '"' + value.replace("\"", "\"\"") + '"';
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Formats one floating-point value using a locale-independent decimal
|
||||||
|
* representation.
|
||||||
|
*
|
||||||
|
* @param value value to format
|
||||||
|
* @return formatted value
|
||||||
|
*/
|
||||||
|
private static String formatDouble(final double value) {
|
||||||
|
return String.format(Locale.ROOT, "%.10f", value);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,344 @@
|
|||||||
|
/*******************************************************************************
|
||||||
|
* Copyright (C) 2026, Leo Galambos
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer in the documentation
|
||||||
|
* and/or other materials provided with the distribution.
|
||||||
|
*
|
||||||
|
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||||
|
* may be used to endorse or promote products derived from this software
|
||||||
|
* without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||||
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
******************************************************************************/
|
||||||
|
package org.egothor.stemmer;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.PrintStream;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Objects;
|
||||||
|
import java.util.logging.Level;
|
||||||
|
import java.util.logging.Logger;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Command-line entry point for the stemmer knowledge experiment.
|
||||||
|
*/
|
||||||
|
public final class StemmerKnowledgeExperimentCli {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Logger of this class.
|
||||||
|
*/
|
||||||
|
private static final Logger LOGGER = Logger.getLogger(StemmerKnowledgeExperimentCli.class.getName());
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Exit status indicating success.
|
||||||
|
*/
|
||||||
|
private static final int EXIT_SUCCESS = 0;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Exit status indicating processing failure.
|
||||||
|
*/
|
||||||
|
private static final int EXIT_PROCESSING_ERROR = 1;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Exit status indicating invalid command-line usage.
|
||||||
|
*/
|
||||||
|
private static final int EXIT_USAGE_ERROR = 2;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Default deterministic seed.
|
||||||
|
*/
|
||||||
|
private static final long DEFAULT_SEED = 20_260_421L;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Default output report location.
|
||||||
|
*/
|
||||||
|
private static final Path DEFAULT_OUTPUT_PATH = Path.of("build", "reports", "stemmer-knowledge-experiment.csv");
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Usage banner.
|
||||||
|
*/
|
||||||
|
private static final String USAGE = String.join(System.lineSeparator(),
|
||||||
|
"Usage: StemmerKnowledgeExperimentCli [--bundled-all | --bundled-language <LANG> | --input <PATH>]",
|
||||||
|
" [--seed <LONG>] [--output <CSV_PATH>]", "", "Examples:", " --bundled-all",
|
||||||
|
" --bundled-language US_UK_PROFI --seed 20260421",
|
||||||
|
" --input src/main/resources/us_uk/stemmer --output build/reports/knowledge.csv");
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Utility class.
|
||||||
|
*/
|
||||||
|
private StemmerKnowledgeExperimentCli() {
|
||||||
|
throw new AssertionError("No instances.");
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Executes the CLI as a standalone process.
|
||||||
|
*
|
||||||
|
* @param arguments command-line arguments
|
||||||
|
*/
|
||||||
|
public static void main(final String[] arguments) {
|
||||||
|
final int exitCode = execute(arguments);
|
||||||
|
System.exit(exitCode);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Executes the CLI and translates all outcomes to process exit codes.
|
||||||
|
*
|
||||||
|
* @param arguments command-line arguments
|
||||||
|
* @return process exit code
|
||||||
|
*/
|
||||||
|
/* default */ static int execute(final String... arguments) {
|
||||||
|
Objects.requireNonNull(arguments, "arguments");
|
||||||
|
try {
|
||||||
|
final CliOptions options = CliOptions.parse(arguments);
|
||||||
|
if (options.command() == Command.HELP) {
|
||||||
|
printUsage(System.out);
|
||||||
|
return EXIT_SUCCESS;
|
||||||
|
}
|
||||||
|
return runExperiment(options);
|
||||||
|
} catch (final CliUsageException exception) {
|
||||||
|
if (LOGGER.isLoggable(Level.SEVERE)) {
|
||||||
|
LOGGER.log(Level.SEVERE, "Invalid command-line usage for arguments {0}: {1}",
|
||||||
|
new Object[] { Arrays.toString(arguments), exception.getMessage() });
|
||||||
|
}
|
||||||
|
printUsage(System.err);
|
||||||
|
return EXIT_USAGE_ERROR;
|
||||||
|
} catch (final IOException exception) {
|
||||||
|
if (LOGGER.isLoggable(Level.SEVERE)) {
|
||||||
|
LOGGER.log(Level.SEVERE, "Experiment processing failed for arguments {0}", Arrays.toString(arguments));
|
||||||
|
LOGGER.log(Level.SEVERE, "Processing failure details.", exception);
|
||||||
|
}
|
||||||
|
return EXIT_PROCESSING_ERROR;
|
||||||
|
} catch (final RuntimeException exception) { // NOPMD
|
||||||
|
if (LOGGER.isLoggable(Level.SEVERE)) {
|
||||||
|
LOGGER.log(Level.SEVERE, "Unexpected runtime failure for arguments {0}", Arrays.toString(arguments));
|
||||||
|
LOGGER.log(Level.SEVERE, "Unexpected processing failure details.", exception);
|
||||||
|
}
|
||||||
|
return EXIT_PROCESSING_ERROR;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Runs the experiment for already validated options.
|
||||||
|
*
|
||||||
|
* @param options validated CLI options
|
||||||
|
* @return process exit code
|
||||||
|
* @throws IOException if experiment execution fails
|
||||||
|
*/
|
||||||
|
private static int runExperiment(final CliOptions options) throws IOException {
|
||||||
|
final StemmerKnowledgeExperiment experiment = new StemmerKnowledgeExperiment();
|
||||||
|
final List<StemmerKnowledgeExperiment.ResultRow> rows = switch (options.sourceMode()) {
|
||||||
|
case INPUT_PATH -> experiment.evaluatePath(options.inputPath(), options.seed());
|
||||||
|
case SINGLE_BUNDLED_LANGUAGE -> experiment.evaluateBundledLanguage(options.language(), options.seed());
|
||||||
|
case ALL_BUNDLED_LANGUAGES -> experiment.evaluateAllBundledLanguages(options.seed());
|
||||||
|
};
|
||||||
|
|
||||||
|
StemmerKnowledgeExperiment.writeCsv(options.outputPath(), rows);
|
||||||
|
if (LOGGER.isLoggable(Level.INFO)) {
|
||||||
|
LOGGER.log(Level.INFO, "Knowledge experiment report written to {0} with {1} rows.",
|
||||||
|
new Object[] { options.outputPath().toAbsolutePath(), rows.size() });
|
||||||
|
}
|
||||||
|
return EXIT_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Prints the CLI usage text.
|
||||||
|
*
|
||||||
|
* @param stream target output stream
|
||||||
|
*/
|
||||||
|
private static void printUsage(final PrintStream stream) {
|
||||||
|
stream.println(USAGE);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Supported top-level CLI commands.
|
||||||
|
*/
|
||||||
|
private enum Command {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Executes the experiment.
|
||||||
|
*/
|
||||||
|
EXECUTE,
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Prints usage text.
|
||||||
|
*/
|
||||||
|
HELP
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Supported experiment source selection modes.
|
||||||
|
*/
|
||||||
|
private enum ExperimentSourceMode {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Runs the experiment for all bundled languages.
|
||||||
|
*/
|
||||||
|
ALL_BUNDLED_LANGUAGES,
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Runs the experiment for one bundled language.
|
||||||
|
*/
|
||||||
|
SINGLE_BUNDLED_LANGUAGE,
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Runs the experiment for one external dictionary path.
|
||||||
|
*/
|
||||||
|
INPUT_PATH
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Exception indicating invalid command-line usage.
|
||||||
|
*/
|
||||||
|
private static final class CliUsageException extends Exception {
|
||||||
|
|
||||||
|
private static final long serialVersionUID = -3904751711104596247L;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a new usage exception.
|
||||||
|
*
|
||||||
|
* @param message failure description
|
||||||
|
*/
|
||||||
|
private CliUsageException(final String message) {
|
||||||
|
super(message);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a new usage exception.
|
||||||
|
*
|
||||||
|
* @param message failure description
|
||||||
|
* @param cause original cause
|
||||||
|
*/
|
||||||
|
private CliUsageException(final String message, final Throwable cause) {
|
||||||
|
super(message, cause);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Parsed CLI options.
|
||||||
|
*
|
||||||
|
* @param command selected top-level command
|
||||||
|
* @param sourceMode selected experiment source mode
|
||||||
|
* @param inputPath optional filesystem dictionary path
|
||||||
|
* @param language optional bundled language
|
||||||
|
* @param seed deterministic sampling seed
|
||||||
|
* @param outputPath CSV report path
|
||||||
|
*/
|
||||||
|
private record CliOptions(Command command, ExperimentSourceMode sourceMode, Path inputPath,
|
||||||
|
StemmerPatchTrieLoader.Language language, long seed, Path outputPath) {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Parses the command line.
|
||||||
|
*
|
||||||
|
* @param arguments command-line arguments
|
||||||
|
* @return parsed options
|
||||||
|
* @throws CliUsageException if the command line is invalid
|
||||||
|
*/
|
||||||
|
@SuppressWarnings("PMD.AvoidReassigningLoopVariables")
|
||||||
|
private static CliOptions parse(final String... arguments) throws CliUsageException {
|
||||||
|
Objects.requireNonNull(arguments, "arguments");
|
||||||
|
|
||||||
|
Command command = Command.EXECUTE;
|
||||||
|
ExperimentSourceMode sourceMode = ExperimentSourceMode.ALL_BUNDLED_LANGUAGES;
|
||||||
|
Path inputPath = null;
|
||||||
|
StemmerPatchTrieLoader.Language language = null;
|
||||||
|
long seed = DEFAULT_SEED;
|
||||||
|
Path outputPath = DEFAULT_OUTPUT_PATH;
|
||||||
|
|
||||||
|
final List<String> tokens = new ArrayList<>(List.of(arguments));
|
||||||
|
for (int index = 0; index < tokens.size(); index++) {
|
||||||
|
final String token = tokens.get(index);
|
||||||
|
switch (token) {
|
||||||
|
case "--input" -> {
|
||||||
|
sourceMode = ExperimentSourceMode.INPUT_PATH;
|
||||||
|
inputPath = Path.of(requireValue(tokens, ++index, token));
|
||||||
|
language = null;
|
||||||
|
}
|
||||||
|
case "--bundled-language" -> {
|
||||||
|
sourceMode = ExperimentSourceMode.SINGLE_BUNDLED_LANGUAGE;
|
||||||
|
language = parseLanguage(requireValue(tokens, ++index, token));
|
||||||
|
inputPath = null;
|
||||||
|
}
|
||||||
|
case "--bundled-all" -> {
|
||||||
|
sourceMode = ExperimentSourceMode.ALL_BUNDLED_LANGUAGES;
|
||||||
|
inputPath = null;
|
||||||
|
language = null;
|
||||||
|
}
|
||||||
|
case "--seed" -> seed = parseSeed(requireValue(tokens, ++index, token));
|
||||||
|
case "--output" -> outputPath = Path.of(requireValue(tokens, ++index, token));
|
||||||
|
case "--help", "-h" -> command = Command.HELP;
|
||||||
|
default -> throw new CliUsageException("Unknown argument: " + token);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return new CliOptions(command, sourceMode, inputPath, language, seed, outputPath);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the required value after one option token.
|
||||||
|
*
|
||||||
|
* @param tokens all tokens
|
||||||
|
* @param index expected value index
|
||||||
|
* @param option current option token
|
||||||
|
* @return option value
|
||||||
|
* @throws CliUsageException if the option value is missing
|
||||||
|
*/
|
||||||
|
private static String requireValue(final List<String> tokens, final int index, final String option)
|
||||||
|
throws CliUsageException {
|
||||||
|
if (index >= tokens.size()) {
|
||||||
|
throw new CliUsageException("Missing value for option " + option + '.');
|
||||||
|
}
|
||||||
|
return tokens.get(index);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Parses the deterministic seed.
|
||||||
|
*
|
||||||
|
* @param value textual seed value
|
||||||
|
* @return parsed seed
|
||||||
|
* @throws CliUsageException if the seed value is invalid
|
||||||
|
*/
|
||||||
|
private static long parseSeed(final String value) throws CliUsageException {
|
||||||
|
try {
|
||||||
|
return Long.parseLong(value);
|
||||||
|
} catch (final NumberFormatException exception) {
|
||||||
|
throw new CliUsageException("Invalid value for --seed: " + value, exception);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Parses the bundled language selector.
|
||||||
|
*
|
||||||
|
* @param value textual language name
|
||||||
|
* @return parsed language
|
||||||
|
* @throws CliUsageException if the language value is invalid
|
||||||
|
*/
|
||||||
|
private static StemmerPatchTrieLoader.Language parseLanguage(final String value) throws CliUsageException {
|
||||||
|
try {
|
||||||
|
return StemmerPatchTrieLoader.Language.valueOf(value);
|
||||||
|
} catch (final IllegalArgumentException exception) {
|
||||||
|
throw new CliUsageException("Invalid value for --bundled-language: " + value, exception);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -94,6 +94,29 @@ public final class StemmerPatchTrieBinaryIO {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reads a GZip-compressed binary patch-command trie from a filesystem path
|
||||||
|
* with an optional dense child lookup span override.
|
||||||
|
* <p>
|
||||||
|
* This is a runtime-only tuning parameter. The dense-span setting is not
|
||||||
|
* persisted in the file and does not change the compiled metadata.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @param path source file
|
||||||
|
* @param maxExpandedIndex dense lookup span override; negative values use
|
||||||
|
* {@link FrequencyTrie#DEFAULT_MAX_EXPANDED_INDEX}
|
||||||
|
* @return deserialized trie
|
||||||
|
* @throws NullPointerException if {@code path} is {@code null}
|
||||||
|
* @throws IOException if reading or decompression fails
|
||||||
|
*/
|
||||||
|
public static FrequencyTrie<String> read(final Path path, final int maxExpandedIndex) throws IOException {
|
||||||
|
Objects.requireNonNull(path, "path");
|
||||||
|
|
||||||
|
try (InputStream fileInputStream = Files.newInputStream(path)) {
|
||||||
|
return read(fileInputStream, maxExpandedIndex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Reads a GZip-compressed binary patch-command trie from a filesystem path
|
* Reads a GZip-compressed binary patch-command trie from a filesystem path
|
||||||
* string.
|
* string.
|
||||||
@@ -108,6 +131,26 @@ public final class StemmerPatchTrieBinaryIO {
|
|||||||
return read(Path.of(fileName));
|
return read(Path.of(fileName));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reads a GZip-compressed binary patch-command trie from a filesystem path
|
||||||
|
* string with an optional dense child lookup span override.
|
||||||
|
* <p>
|
||||||
|
* This is a runtime-only tuning parameter. The dense-span setting is not
|
||||||
|
* persisted in the file and does not change the compiled metadata.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @param fileName source file name or path string
|
||||||
|
* @param maxExpandedIndex dense lookup span override; negative values use
|
||||||
|
* {@link FrequencyTrie#DEFAULT_MAX_EXPANDED_INDEX}
|
||||||
|
* @return deserialized trie
|
||||||
|
* @throws NullPointerException if {@code fileName} is {@code null}
|
||||||
|
* @throws IOException if reading or decompression fails
|
||||||
|
*/
|
||||||
|
public static FrequencyTrie<String> read(final String fileName, final int maxExpandedIndex) throws IOException {
|
||||||
|
Objects.requireNonNull(fileName, "fileName");
|
||||||
|
return read(Path.of(fileName), maxExpandedIndex);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Reads a GZip-compressed binary patch-command trie from an input stream.
|
* Reads a GZip-compressed binary patch-command trie from an input stream.
|
||||||
*
|
*
|
||||||
@@ -132,6 +175,76 @@ public final class StemmerPatchTrieBinaryIO {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reads a GZip-compressed binary patch-command trie from an input stream with
|
||||||
|
* an optional dense child lookup span override.
|
||||||
|
* <p>
|
||||||
|
* This is a runtime-only tuning parameter. The dense-span setting is not
|
||||||
|
* persisted in the file and does not change the compiled metadata.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @param inputStream source stream
|
||||||
|
* @param maxExpandedIndex dense lookup span override; negative values use
|
||||||
|
* {@link FrequencyTrie#DEFAULT_MAX_EXPANDED_INDEX}
|
||||||
|
* @return deserialized trie
|
||||||
|
* @throws NullPointerException if {@code inputStream} is {@code null}
|
||||||
|
* @throws IOException if reading or decompression fails
|
||||||
|
*/
|
||||||
|
public static FrequencyTrie<String> read(final InputStream inputStream, final int maxExpandedIndex) throws IOException {
|
||||||
|
Objects.requireNonNull(inputStream, "inputStream");
|
||||||
|
|
||||||
|
try (GZIPInputStream gzipInputStream = new GZIPInputStream(new BufferedInputStream(inputStream));
|
||||||
|
DataInputStream dataInputStream = new DataInputStream(gzipInputStream)) {
|
||||||
|
final FrequencyTrie<String> trie = FrequencyTrie.readFrom(dataInputStream, String[]::new, STRING_CODEC,
|
||||||
|
maxExpandedIndex);
|
||||||
|
|
||||||
|
LOGGER.log(Level.FINE, "Read compressed binary stemmer trie.");
|
||||||
|
return trie;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reads only metadata from a GZip-compressed binary patch-command trie stored
|
||||||
|
* at a filesystem path.
|
||||||
|
*
|
||||||
|
* @param path source file
|
||||||
|
* @return deserialized trie metadata
|
||||||
|
* @throws NullPointerException if {@code path} is {@code null}
|
||||||
|
* @throws IOException if reading or decompression fails
|
||||||
|
*/
|
||||||
|
public static TrieMetadata readMetadata(final Path path) throws IOException {
|
||||||
|
Objects.requireNonNull(path, "path");
|
||||||
|
return read(path).metadata();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reads only metadata from a GZip-compressed binary patch-command trie stored
|
||||||
|
* at a filesystem path string.
|
||||||
|
*
|
||||||
|
* @param fileName source file name or path string
|
||||||
|
* @return deserialized trie metadata
|
||||||
|
* @throws NullPointerException if {@code fileName} is {@code null}
|
||||||
|
* @throws IOException if reading or decompression fails
|
||||||
|
*/
|
||||||
|
public static TrieMetadata readMetadata(final String fileName) throws IOException {
|
||||||
|
Objects.requireNonNull(fileName, "fileName");
|
||||||
|
return readMetadata(Path.of(fileName));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reads only metadata from a GZip-compressed binary patch-command trie from an
|
||||||
|
* input stream.
|
||||||
|
*
|
||||||
|
* @param inputStream source stream
|
||||||
|
* @return deserialized trie metadata
|
||||||
|
* @throws NullPointerException if {@code inputStream} is {@code null}
|
||||||
|
* @throws IOException if reading or decompression fails
|
||||||
|
*/
|
||||||
|
public static TrieMetadata readMetadata(final InputStream inputStream) throws IOException {
|
||||||
|
Objects.requireNonNull(inputStream, "inputStream");
|
||||||
|
return read(inputStream).metadata();
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Writes a GZip-compressed binary patch-command trie to a filesystem path.
|
* Writes a GZip-compressed binary patch-command trie to a filesystem path.
|
||||||
*
|
*
|
||||||
|
|||||||
@@ -30,24 +30,27 @@
|
|||||||
******************************************************************************/
|
******************************************************************************/
|
||||||
package org.egothor.stemmer;
|
package org.egothor.stemmer;
|
||||||
|
|
||||||
|
import java.io.BufferedInputStream;
|
||||||
import java.io.BufferedReader;
|
import java.io.BufferedReader;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.io.InputStreamReader;
|
import java.io.InputStreamReader;
|
||||||
|
import java.io.PushbackInputStream;
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
import java.util.logging.Level;
|
import java.util.logging.Level;
|
||||||
import java.util.logging.Logger;
|
import java.util.logging.Logger;
|
||||||
|
import java.util.zip.GZIPInputStream;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Loader of patch-command tries from bundled stemmer dictionaries.
|
* Loader of patch-command tries from bundled stemmer dictionaries.
|
||||||
*
|
*
|
||||||
* <p>
|
* <p>
|
||||||
* Each dictionary is line-oriented. The first token on a line is interpreted as
|
* Each dictionary is line-oriented and uses a tab-separated values layout. The
|
||||||
* the stem, and all following tokens are treated as known variants of that
|
* first column on a line is interpreted as the stem, and all following
|
||||||
* stem.
|
* tab-separated columns are treated as known variants of that stem.
|
||||||
*
|
*
|
||||||
* <p>
|
* <p>
|
||||||
* For each line, the loader inserts:
|
* For each line, the loader inserts:
|
||||||
@@ -55,15 +58,21 @@ import java.util.logging.Logger;
|
|||||||
* <li>the stem itself mapped to the canonical no-op patch command
|
* <li>the stem itself mapped to the canonical no-op patch command
|
||||||
* {@link PatchCommandEncoder#NOOP_PATCH}, when requested by the caller</li>
|
* {@link PatchCommandEncoder#NOOP_PATCH}, when requested by the caller</li>
|
||||||
* <li>every distinct variant mapped to the patch command transforming that
|
* <li>every distinct variant mapped to the patch command transforming that
|
||||||
* variant to the stem</li>
|
* variant to the stem using the traversal direction implied by the selected
|
||||||
|
* language or loader overload</li>
|
||||||
* </ul>
|
* </ul>
|
||||||
*
|
*
|
||||||
* <p>
|
* <p>
|
||||||
* Parsing is delegated to {@link StemmerDictionaryParser}, which also supports
|
* Parsing is delegated to {@link StemmerDictionaryParser}, which also supports
|
||||||
* line remarks introduced by {@code #} or {@code //}.
|
* line remarks introduced by {@code #} or {@code //} and ignores dictionary
|
||||||
|
* items containing Unicode whitespace characters while reporting them through
|
||||||
|
* aggregated warning log records.
|
||||||
*/
|
*/
|
||||||
public final class StemmerPatchTrieLoader {
|
public final class StemmerPatchTrieLoader {
|
||||||
|
|
||||||
|
/* default */ static final String FILENAME_REQUIRED = "fileName required";
|
||||||
|
private static final String PARAMETER_PATH = "path";
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Logger of this class.
|
* Logger of this class.
|
||||||
*/
|
*/
|
||||||
@@ -83,90 +92,151 @@ public final class StemmerPatchTrieLoader {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Supported bundled stemmer dictionaries.
|
* Supported bundled stemmer dictionaries.
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* Each language constant defines:
|
||||||
|
* </p>
|
||||||
|
* <ul>
|
||||||
|
* <li>the resource directory name used under the bundled resources tree</li>
|
||||||
|
* <li>whether the language is written right-to-left</li>
|
||||||
|
* </ul>
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* The right-to-left flag is intended for consumers that need to decide whether
|
||||||
|
* affix-oriented processing should conceptually traverse words from the visual
|
||||||
|
* end or from the logical beginning of the stored form.
|
||||||
|
* </p>
|
||||||
*/
|
*/
|
||||||
public enum Language {
|
public enum Language {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Czech.
|
||||||
|
*/
|
||||||
|
CS_CZ("cs_cz", false),
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Danish.
|
* Danish.
|
||||||
*/
|
*/
|
||||||
DA_DK("da_dk"),
|
DA_DK("da_dk", false),
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* German.
|
* German.
|
||||||
*/
|
*/
|
||||||
DE_DE("de_de"),
|
DE_DE("de_de", false),
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Spanish.
|
* Spanish.
|
||||||
*/
|
*/
|
||||||
ES_ES("es_es"),
|
ES_ES("es_es", false),
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Persian.
|
||||||
|
*/
|
||||||
|
FA_IR("fa_ir", true),
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Finnish.
|
||||||
|
*/
|
||||||
|
FI_FI("fi_fi", false),
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* French.
|
* French.
|
||||||
*/
|
*/
|
||||||
FR_FR("fr_fr"),
|
FR_FR("fr_fr", false),
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Hebrew.
|
||||||
|
*/
|
||||||
|
HE_IL("he_il", true),
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Hungarian.
|
||||||
|
*/
|
||||||
|
HU_HU("hu_hu", false),
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Italian.
|
* Italian.
|
||||||
*/
|
*/
|
||||||
IT_IT("it_it"),
|
IT_IT("it_it", false),
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Norwegian Bokmål.
|
||||||
|
*/
|
||||||
|
NB_NO("nb_no", false),
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Dutch.
|
* Dutch.
|
||||||
*/
|
*/
|
||||||
NL_NL("nl_nl"),
|
NL_NL("nl_nl", false),
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Norwegian.
|
* Norwegian Nynorsk.
|
||||||
*/
|
*/
|
||||||
NO_NO("no_no"),
|
NN_NO("nn_no", false),
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Polish.
|
||||||
|
*/
|
||||||
|
PL_PL("pl_pl", false),
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Portuguese.
|
* Portuguese.
|
||||||
*/
|
*/
|
||||||
PT_PT("pt_pt"),
|
PT_PT("pt_pt", false),
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Russian.
|
* Russian.
|
||||||
*/
|
*/
|
||||||
RU_RU("ru_ru"),
|
RU_RU("ru_ru", false),
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Swedish.
|
* Swedish.
|
||||||
*/
|
*/
|
||||||
SV_SE("sv_se"),
|
SV_SE("sv_se", false),
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Ukrainian.
|
||||||
|
*/
|
||||||
|
UK_UA("uk_ua", false),
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* English.
|
* English.
|
||||||
*/
|
*/
|
||||||
US_UK("us_uk"),
|
US_UK("us_uk", false),
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* English professional dictionary.
|
* Yiddish.
|
||||||
*/
|
*/
|
||||||
US_UK_PROFI("us_uk.profi");
|
YI("yi", true);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Resource directory name.
|
* Resource directory name.
|
||||||
*/
|
*/
|
||||||
private final String resourceDirectory;
|
private final String resourceDirectory;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Whether the language is written right-to-left.
|
||||||
|
*/
|
||||||
|
private final boolean rightToLeft;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates a language constant.
|
* Creates a language constant.
|
||||||
*
|
*
|
||||||
* @param resourceDirectory resource directory name
|
* @param resourceDirectory resource directory name
|
||||||
|
* @param rightToLeft whether the language is written right-to-left
|
||||||
*/
|
*/
|
||||||
Language(final String resourceDirectory) {
|
Language(final String resourceDirectory, final boolean rightToLeft) {
|
||||||
this.resourceDirectory = resourceDirectory;
|
this.resourceDirectory = resourceDirectory;
|
||||||
|
this.rightToLeft = rightToLeft;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns the classpath resource path of the stemmer dictionary.
|
* Returns the classpath resource path of the bundled stemmer dictionary.
|
||||||
*
|
*
|
||||||
* @return classpath resource path
|
* @return classpath resource path
|
||||||
*/
|
*/
|
||||||
public String resourcePath() {
|
public String resourcePath() {
|
||||||
return this.resourceDirectory + "/stemmer";
|
return this.resourceDirectory + "/stemmer.gz";
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -177,11 +247,45 @@ public final class StemmerPatchTrieLoader {
|
|||||||
public String resourceDirectory() {
|
public String resourceDirectory() {
|
||||||
return this.resourceDirectory;
|
return this.resourceDirectory;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns whether the language is written right-to-left.
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* This flag can be used by trie-building and lookup logic to decide whether
|
||||||
|
* suffix-oriented traversal should operate on the stored word form as-is rather
|
||||||
|
* than by reversing the logical character sequence.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @return {@code true} when the language is written right-to-left, otherwise
|
||||||
|
* {@code false}
|
||||||
|
*/
|
||||||
|
public boolean isRightToLeft() {
|
||||||
|
return this.rightToLeft;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Loads a bundled dictionary using explicit reduction settings.
|
* Loads a bundled dictionary using explicit reduction settings.
|
||||||
*
|
*
|
||||||
|
* <p>
|
||||||
|
* This overload applies the following implicit compilation defaults in addition
|
||||||
|
* to the supplied {@code reductionSettings}:
|
||||||
|
* </p>
|
||||||
|
* <ul>
|
||||||
|
* <li>traversal direction is derived from {@link Language#isRightToLeft()}
|
||||||
|
* ({@link WordTraversalDirection#FORWARD} for right-to-left languages,
|
||||||
|
* {@link WordTraversalDirection#BACKWARD} otherwise)</li>
|
||||||
|
* <li>case processing mode is
|
||||||
|
* {@link CaseProcessingMode#LOWERCASE_WITH_LOCALE_ROOT}</li>
|
||||||
|
* <li>diacritic processing mode is {@link DiacriticProcessingMode#AS_IS}</li>
|
||||||
|
* </ul>
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* The resolved settings are persisted into {@link TrieMetadata} of the
|
||||||
|
* resulting trie.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
* @param language bundled language dictionary
|
* @param language bundled language dictionary
|
||||||
* @param storeOriginal whether the stem itself should be inserted using the
|
* @param storeOriginal whether the stem itself should be inserted using the
|
||||||
* canonical no-op patch command
|
* canonical no-op patch command
|
||||||
@@ -194,13 +298,40 @@ public final class StemmerPatchTrieLoader {
|
|||||||
final ReductionSettings reductionSettings) throws IOException {
|
final ReductionSettings reductionSettings) throws IOException {
|
||||||
Objects.requireNonNull(language, "language");
|
Objects.requireNonNull(language, "language");
|
||||||
Objects.requireNonNull(reductionSettings, "reductionSettings");
|
Objects.requireNonNull(reductionSettings, "reductionSettings");
|
||||||
|
final TrieMetadata metadata = metadataForCompilation(traversalDirectionOf(language), reductionSettings,
|
||||||
|
CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT, DiacriticProcessingMode.AS_IS);
|
||||||
|
return load(language, storeOriginal, metadata);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Loads a bundled dictionary using explicit trie compilation metadata.
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* All semantic compilation settings (reduction mode and thresholds, traversal
|
||||||
|
* direction, case processing mode, and diacritic processing mode) are taken
|
||||||
|
* from the supplied metadata object and are persisted unchanged in the
|
||||||
|
* resulting trie.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @param language bundled language dictionary
|
||||||
|
* @param storeOriginal whether the stem itself should be inserted using the
|
||||||
|
* canonical no-op patch command
|
||||||
|
* @param metadata trie metadata describing the compilation configuration
|
||||||
|
* @return compiled patch-command trie
|
||||||
|
* @throws NullPointerException if any argument is {@code null}
|
||||||
|
* @throws IOException if the dictionary cannot be found or read
|
||||||
|
*/
|
||||||
|
public static FrequencyTrie<String> load(final Language language, final boolean storeOriginal,
|
||||||
|
final TrieMetadata metadata) throws IOException {
|
||||||
|
Objects.requireNonNull(language, "language");
|
||||||
|
Objects.requireNonNull(metadata, "metadata");
|
||||||
|
|
||||||
final String resourcePath = language.resourcePath();
|
final String resourcePath = language.resourcePath();
|
||||||
|
|
||||||
try (InputStream inputStream = openBundledResource(resourcePath);
|
try (InputStream inputStream = openBundledResource(resourcePath);
|
||||||
BufferedReader reader = new BufferedReader(
|
BufferedReader reader = new BufferedReader(
|
||||||
new InputStreamReader(inputStream, StandardCharsets.UTF_8))) {
|
new InputStreamReader(inputStream, StandardCharsets.UTF_8))) {
|
||||||
return load(reader, resourcePath, storeOriginal, reductionSettings);
|
return load(reader, resourcePath, storeOriginal, metadata);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -208,6 +339,14 @@ public final class StemmerPatchTrieLoader {
|
|||||||
* Loads a bundled dictionary using default settings for the supplied reduction
|
* Loads a bundled dictionary using default settings for the supplied reduction
|
||||||
* mode.
|
* mode.
|
||||||
*
|
*
|
||||||
|
* <p>
|
||||||
|
* This overload is equivalent to calling
|
||||||
|
* {@link #load(Language, boolean, ReductionSettings)} with
|
||||||
|
* {@link ReductionSettings#withDefaults(ReductionMode)} and therefore uses the
|
||||||
|
* same implicit defaults for traversal direction, case processing mode, and
|
||||||
|
* diacritic processing mode.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
* @param language bundled language dictionary
|
* @param language bundled language dictionary
|
||||||
* @param storeOriginal whether the stem itself should be inserted using the
|
* @param storeOriginal whether the stem itself should be inserted using the
|
||||||
* canonical no-op patch command
|
* canonical no-op patch command
|
||||||
@@ -225,6 +364,14 @@ public final class StemmerPatchTrieLoader {
|
|||||||
/**
|
/**
|
||||||
* Loads a dictionary from a filesystem path using explicit reduction settings.
|
* Loads a dictionary from a filesystem path using explicit reduction settings.
|
||||||
*
|
*
|
||||||
|
* <p>
|
||||||
|
* This overload applies historical Egothor-compatible implicit defaults:
|
||||||
|
* {@link WordTraversalDirection#BACKWARD},
|
||||||
|
* {@link CaseProcessingMode#LOWERCASE_WITH_LOCALE_ROOT}, and
|
||||||
|
* {@link DiacriticProcessingMode#AS_IS}. These settings are persisted in
|
||||||
|
* resulting trie metadata.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
* @param path path to the dictionary file
|
* @param path path to the dictionary file
|
||||||
* @param storeOriginal whether the stem itself should be inserted using the
|
* @param storeOriginal whether the stem itself should be inserted using the
|
||||||
* canonical no-op patch command
|
* canonical no-op patch command
|
||||||
@@ -235,11 +382,119 @@ public final class StemmerPatchTrieLoader {
|
|||||||
*/
|
*/
|
||||||
public static FrequencyTrie<String> load(final Path path, final boolean storeOriginal,
|
public static FrequencyTrie<String> load(final Path path, final boolean storeOriginal,
|
||||||
final ReductionSettings reductionSettings) throws IOException {
|
final ReductionSettings reductionSettings) throws IOException {
|
||||||
Objects.requireNonNull(path, "path");
|
return load(path, storeOriginal, reductionSettings, WordTraversalDirection.BACKWARD,
|
||||||
Objects.requireNonNull(reductionSettings, "reductionSettings");
|
CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT, DiacriticProcessingMode.AS_IS);
|
||||||
|
}
|
||||||
|
|
||||||
try (BufferedReader reader = Files.newBufferedReader(path, StandardCharsets.UTF_8)) {
|
/**
|
||||||
return load(reader, path.toAbsolutePath().toString(), storeOriginal, reductionSettings);
|
* Loads a dictionary from a filesystem path using explicit reduction settings
|
||||||
|
* and explicit traversal direction.
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* Implicit defaults still apply for unspecified dimensions:
|
||||||
|
* {@link CaseProcessingMode#LOWERCASE_WITH_LOCALE_ROOT} and
|
||||||
|
* {@link DiacriticProcessingMode#AS_IS}.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @param path path to the dictionary file
|
||||||
|
* @param storeOriginal whether the stem itself should be inserted using
|
||||||
|
* the canonical no-op patch command
|
||||||
|
* @param reductionSettings reduction settings
|
||||||
|
* @param traversalDirection traversal direction used for both trie keys and
|
||||||
|
* patch commands
|
||||||
|
* @return compiled patch-command trie
|
||||||
|
* @throws NullPointerException if any argument is {@code null}
|
||||||
|
* @throws IOException if the file cannot be opened or read
|
||||||
|
*/
|
||||||
|
public static FrequencyTrie<String> load(final Path path, final boolean storeOriginal,
|
||||||
|
final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection)
|
||||||
|
throws IOException {
|
||||||
|
return load(path, storeOriginal, reductionSettings, traversalDirection,
|
||||||
|
CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT, DiacriticProcessingMode.AS_IS);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Loads a dictionary from a filesystem path using explicit reduction settings,
|
||||||
|
* explicit traversal direction, and explicit case processing mode.
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* This overload still defaults diacritic processing to
|
||||||
|
* {@link DiacriticProcessingMode#AS_IS}.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @param path path to the dictionary file
|
||||||
|
* @param storeOriginal whether the stem itself should be inserted using
|
||||||
|
* the canonical no-op patch command
|
||||||
|
* @param reductionSettings reduction settings
|
||||||
|
* @param traversalDirection traversal direction used for both trie keys and
|
||||||
|
* patch commands
|
||||||
|
* @param caseProcessingMode case processing mode used during dictionary parsing
|
||||||
|
* @return compiled patch-command trie
|
||||||
|
* @throws NullPointerException if any argument is {@code null}
|
||||||
|
* @throws IOException if the file cannot be opened or read
|
||||||
|
*/
|
||||||
|
public static FrequencyTrie<String> load(final Path path, final boolean storeOriginal,
|
||||||
|
final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection,
|
||||||
|
final CaseProcessingMode caseProcessingMode) throws IOException {
|
||||||
|
return load(path, storeOriginal, reductionSettings, traversalDirection, caseProcessingMode,
|
||||||
|
DiacriticProcessingMode.AS_IS);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Loads a dictionary from a filesystem path using explicit reduction settings,
|
||||||
|
* traversal direction, case processing mode, and diacritic processing mode.
|
||||||
|
*
|
||||||
|
* @param path path to the dictionary file
|
||||||
|
* @param storeOriginal whether the stem itself should be inserted
|
||||||
|
* using the canonical no-op patch command
|
||||||
|
* @param reductionSettings reduction settings
|
||||||
|
* @param traversalDirection traversal direction used for both trie keys
|
||||||
|
* and patch commands
|
||||||
|
* @param caseProcessingMode case processing mode used during dictionary
|
||||||
|
* parsing
|
||||||
|
* @param diacriticProcessingMode diacritic processing mode used during
|
||||||
|
* dictionary parsing
|
||||||
|
* @return compiled patch-command trie
|
||||||
|
* @throws NullPointerException if any argument is {@code null}
|
||||||
|
* @throws IOException if the file cannot be opened or read
|
||||||
|
*/
|
||||||
|
public static FrequencyTrie<String> load(final Path path, final boolean storeOriginal,
|
||||||
|
final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection,
|
||||||
|
final CaseProcessingMode caseProcessingMode, final DiacriticProcessingMode diacriticProcessingMode)
|
||||||
|
throws IOException {
|
||||||
|
Objects.requireNonNull(path, PARAMETER_PATH);
|
||||||
|
final TrieMetadata metadata = metadataForCompilation(traversalDirection, reductionSettings, caseProcessingMode,
|
||||||
|
diacriticProcessingMode);
|
||||||
|
return load(path, storeOriginal, metadata);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Loads a dictionary from a filesystem path using explicit trie compilation
|
||||||
|
* metadata.
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* The supplied metadata is the authoritative source of trie compilation
|
||||||
|
* semantics. Callers should ensure metadata matches how they expect to query
|
||||||
|
* the trie (for example, with or without lowercasing or diacritic stripping).
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @param path path to the dictionary file
|
||||||
|
* @param storeOriginal whether the stem itself should be inserted using the
|
||||||
|
* canonical no-op patch command
|
||||||
|
* @param metadata trie metadata describing the compilation configuration
|
||||||
|
* @return compiled patch-command trie
|
||||||
|
* @throws NullPointerException if any argument is {@code null}
|
||||||
|
* @throws IOException if the file cannot be opened or read
|
||||||
|
*/
|
||||||
|
public static FrequencyTrie<String> load(final Path path, final boolean storeOriginal, final TrieMetadata metadata)
|
||||||
|
throws IOException {
|
||||||
|
Objects.requireNonNull(path, PARAMETER_PATH);
|
||||||
|
Objects.requireNonNull(metadata, "metadata");
|
||||||
|
|
||||||
|
try (InputStream inputStream = openDictionaryInputStream(path);
|
||||||
|
BufferedReader reader = new BufferedReader(
|
||||||
|
new InputStreamReader(inputStream, StandardCharsets.UTF_8))) {
|
||||||
|
return load(reader, path.toAbsolutePath().toString(), storeOriginal, metadata);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -247,6 +502,15 @@ public final class StemmerPatchTrieLoader {
|
|||||||
* Loads a dictionary from a filesystem path using default settings for the
|
* Loads a dictionary from a filesystem path using default settings for the
|
||||||
* supplied reduction mode.
|
* supplied reduction mode.
|
||||||
*
|
*
|
||||||
|
* <p>
|
||||||
|
* This overload is equivalent to calling
|
||||||
|
* {@link #load(Path, boolean, ReductionSettings)} with
|
||||||
|
* {@link ReductionSettings#withDefaults(ReductionMode)} and therefore uses
|
||||||
|
* implicit defaults ({@link WordTraversalDirection#BACKWARD},
|
||||||
|
* {@link CaseProcessingMode#LOWERCASE_WITH_LOCALE_ROOT},
|
||||||
|
* {@link DiacriticProcessingMode#AS_IS}).
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
* @param path path to the dictionary file
|
* @param path path to the dictionary file
|
||||||
* @param storeOriginal whether the stem itself should be inserted using the
|
* @param storeOriginal whether the stem itself should be inserted using the
|
||||||
* canonical no-op patch command
|
* canonical no-op patch command
|
||||||
@@ -265,6 +529,13 @@ public final class StemmerPatchTrieLoader {
|
|||||||
* Loads a dictionary from a filesystem path string using explicit reduction
|
* Loads a dictionary from a filesystem path string using explicit reduction
|
||||||
* settings.
|
* settings.
|
||||||
*
|
*
|
||||||
|
* <p>
|
||||||
|
* Same semantics as {@link #load(Path, boolean, ReductionSettings)} including
|
||||||
|
* implicit defaults ({@link WordTraversalDirection#BACKWARD},
|
||||||
|
* {@link CaseProcessingMode#LOWERCASE_WITH_LOCALE_ROOT},
|
||||||
|
* {@link DiacriticProcessingMode#AS_IS}).
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
* @param fileName file name or path string
|
* @param fileName file name or path string
|
||||||
* @param storeOriginal whether the stem itself should be inserted using the
|
* @param storeOriginal whether the stem itself should be inserted using the
|
||||||
* canonical no-op patch command
|
* canonical no-op patch command
|
||||||
@@ -275,14 +546,130 @@ public final class StemmerPatchTrieLoader {
|
|||||||
*/
|
*/
|
||||||
public static FrequencyTrie<String> load(final String fileName, final boolean storeOriginal,
|
public static FrequencyTrie<String> load(final String fileName, final boolean storeOriginal,
|
||||||
final ReductionSettings reductionSettings) throws IOException {
|
final ReductionSettings reductionSettings) throws IOException {
|
||||||
Objects.requireNonNull(fileName, "fileName");
|
Objects.requireNonNull(fileName, FILENAME_REQUIRED);
|
||||||
return load(Path.of(fileName), storeOriginal, reductionSettings);
|
return load(Path.of(fileName), storeOriginal, reductionSettings);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Loads a dictionary from a filesystem path string using explicit reduction
|
||||||
|
* settings and explicit traversal direction.
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* Same semantics as
|
||||||
|
* {@link #load(Path, boolean, ReductionSettings, WordTraversalDirection)}.
|
||||||
|
* Implicit defaults remain
|
||||||
|
* {@link CaseProcessingMode#LOWERCASE_WITH_LOCALE_ROOT} and
|
||||||
|
* {@link DiacriticProcessingMode#AS_IS}.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @param fileName file name or path string
|
||||||
|
* @param storeOriginal whether the stem itself should be inserted using
|
||||||
|
* the canonical no-op patch command
|
||||||
|
* @param reductionSettings reduction settings
|
||||||
|
* @param traversalDirection traversal direction used for both trie keys and
|
||||||
|
* patch commands
|
||||||
|
* @return compiled patch-command trie
|
||||||
|
* @throws NullPointerException if any argument is {@code null}
|
||||||
|
* @throws IOException if the file cannot be opened or read
|
||||||
|
*/
|
||||||
|
public static FrequencyTrie<String> load(final String fileName, final boolean storeOriginal,
|
||||||
|
final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection)
|
||||||
|
throws IOException {
|
||||||
|
Objects.requireNonNull(fileName, FILENAME_REQUIRED);
|
||||||
|
return load(Path.of(fileName), storeOriginal, reductionSettings, traversalDirection,
|
||||||
|
CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Loads a dictionary from a filesystem path string using explicit reduction
|
||||||
|
* settings, explicit traversal direction, and explicit case processing mode.
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* Same semantics as
|
||||||
|
* {@link #load(Path, boolean, ReductionSettings, WordTraversalDirection, CaseProcessingMode)}.
|
||||||
|
* Implicit default remains {@link DiacriticProcessingMode#AS_IS}.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @param fileName file name or path string
|
||||||
|
* @param storeOriginal whether the stem itself should be inserted using
|
||||||
|
* the canonical no-op patch command
|
||||||
|
* @param reductionSettings reduction settings
|
||||||
|
* @param traversalDirection traversal direction used for both trie keys and
|
||||||
|
* patch commands
|
||||||
|
* @param caseProcessingMode case processing mode used during dictionary parsing
|
||||||
|
* @return compiled patch-command trie
|
||||||
|
* @throws NullPointerException if any argument is {@code null}
|
||||||
|
* @throws IOException if the file cannot be opened or read
|
||||||
|
*/
|
||||||
|
public static FrequencyTrie<String> load(final String fileName, final boolean storeOriginal,
|
||||||
|
final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection,
|
||||||
|
final CaseProcessingMode caseProcessingMode) throws IOException {
|
||||||
|
Objects.requireNonNull(fileName, FILENAME_REQUIRED);
|
||||||
|
return load(Path.of(fileName), storeOriginal, reductionSettings, traversalDirection, caseProcessingMode,
|
||||||
|
DiacriticProcessingMode.AS_IS);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Loads a dictionary from a filesystem path string using explicit reduction
|
||||||
|
* settings, explicit traversal direction, explicit case processing mode, and
|
||||||
|
* explicit diacritic processing mode.
|
||||||
|
*
|
||||||
|
* @param fileName file name or path string
|
||||||
|
* @param storeOriginal whether the stem itself should be inserted
|
||||||
|
* using the canonical no-op patch command
|
||||||
|
* @param reductionSettings reduction settings
|
||||||
|
* @param traversalDirection traversal direction used for both trie keys
|
||||||
|
* and patch commands
|
||||||
|
* @param caseProcessingMode case processing mode used during dictionary
|
||||||
|
* parsing
|
||||||
|
* @param diacriticProcessingMode diacritic processing mode used during
|
||||||
|
* dictionary parsing
|
||||||
|
* @return compiled patch-command trie
|
||||||
|
* @throws NullPointerException if any argument is {@code null}
|
||||||
|
* @throws IOException if the file cannot be opened or read
|
||||||
|
*/
|
||||||
|
public static FrequencyTrie<String> load(final String fileName, final boolean storeOriginal,
|
||||||
|
final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection,
|
||||||
|
final CaseProcessingMode caseProcessingMode, final DiacriticProcessingMode diacriticProcessingMode)
|
||||||
|
throws IOException {
|
||||||
|
Objects.requireNonNull(fileName, FILENAME_REQUIRED);
|
||||||
|
return load(Path.of(fileName), storeOriginal, reductionSettings, traversalDirection, caseProcessingMode,
|
||||||
|
diacriticProcessingMode);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Loads a dictionary from a filesystem path string using explicit trie
|
||||||
|
* compilation metadata.
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* Same semantics as {@link #load(Path, boolean, TrieMetadata)}.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @param fileName file name or path string
|
||||||
|
* @param storeOriginal whether the stem itself should be inserted using the
|
||||||
|
* canonical no-op patch command
|
||||||
|
* @param metadata trie metadata describing the compilation configuration
|
||||||
|
* @return compiled patch-command trie
|
||||||
|
* @throws NullPointerException if any argument is {@code null}
|
||||||
|
* @throws IOException if the file cannot be opened or read
|
||||||
|
*/
|
||||||
|
public static FrequencyTrie<String> load(final String fileName, final boolean storeOriginal,
|
||||||
|
final TrieMetadata metadata) throws IOException {
|
||||||
|
Objects.requireNonNull(fileName, FILENAME_REQUIRED);
|
||||||
|
return load(Path.of(fileName), storeOriginal, metadata);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Loads a dictionary from a filesystem path string using default settings for
|
* Loads a dictionary from a filesystem path string using default settings for
|
||||||
* the supplied reduction mode.
|
* the supplied reduction mode.
|
||||||
*
|
*
|
||||||
|
* <p>
|
||||||
|
* Equivalent to {@link #load(Path, boolean, ReductionMode)} and therefore uses
|
||||||
|
* implicit defaults ({@link WordTraversalDirection#BACKWARD},
|
||||||
|
* {@link CaseProcessingMode#LOWERCASE_WITH_LOCALE_ROOT},
|
||||||
|
* {@link DiacriticProcessingMode#AS_IS}).
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
* @param fileName file name or path string
|
* @param fileName file name or path string
|
||||||
* @param storeOriginal whether the stem itself should be inserted using the
|
* @param storeOriginal whether the stem itself should be inserted using the
|
||||||
* canonical no-op patch command
|
* canonical no-op patch command
|
||||||
@@ -293,7 +680,7 @@ public final class StemmerPatchTrieLoader {
|
|||||||
*/
|
*/
|
||||||
public static FrequencyTrie<String> load(final String fileName, final boolean storeOriginal,
|
public static FrequencyTrie<String> load(final String fileName, final boolean storeOriginal,
|
||||||
final ReductionMode reductionMode) throws IOException {
|
final ReductionMode reductionMode) throws IOException {
|
||||||
Objects.requireNonNull(fileName, "fileName");
|
Objects.requireNonNull(fileName, FILENAME_REQUIRED);
|
||||||
return load(Path.of(fileName), storeOriginal, reductionMode);
|
return load(Path.of(fileName), storeOriginal, reductionMode);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -304,18 +691,21 @@ public final class StemmerPatchTrieLoader {
|
|||||||
* @param sourceDescription logical source description used for diagnostics
|
* @param sourceDescription logical source description used for diagnostics
|
||||||
* @param storeOriginal whether the stem itself should be inserted using the
|
* @param storeOriginal whether the stem itself should be inserted using the
|
||||||
* canonical no-op patch command
|
* canonical no-op patch command
|
||||||
* @param reductionSettings reduction settings
|
* @param metadata trie metadata used to drive all compilation settings
|
||||||
* @return compiled patch-command trie
|
* @return compiled patch-command trie
|
||||||
* @throws IOException if parsing fails
|
* @throws IOException if parsing fails
|
||||||
*/
|
*/
|
||||||
private static FrequencyTrie<String> load(final BufferedReader reader, final String sourceDescription,
|
private static FrequencyTrie<String> load(final BufferedReader reader, final String sourceDescription,
|
||||||
final boolean storeOriginal, final ReductionSettings reductionSettings) throws IOException {
|
final boolean storeOriginal, final TrieMetadata metadata) throws IOException {
|
||||||
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(String[]::new, reductionSettings);
|
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(String[]::new,
|
||||||
final PatchCommandEncoder patchCommandEncoder = new PatchCommandEncoder();
|
metadata.reductionSettings(), metadata.traversalDirection(), metadata.caseProcessingMode(),
|
||||||
|
metadata.diacriticProcessingMode());
|
||||||
|
final PatchCommandEncoder patchCommandEncoder = PatchCommandEncoder.builder()
|
||||||
|
.traversalDirection(metadata.traversalDirection()).build();
|
||||||
final int[] insertedMappings = new int[1];
|
final int[] insertedMappings = new int[1];
|
||||||
|
|
||||||
final StemmerDictionaryParser.ParseStatistics statistics = StemmerDictionaryParser.parse(reader,
|
final StemmerDictionaryParser.ParseStatistics statistics = StemmerDictionaryParser.parse(reader,
|
||||||
sourceDescription, (stem, variants, lineNumber) -> {
|
sourceDescription, metadata.caseProcessingMode(), (stem, variants, lineNumber) -> {
|
||||||
if (storeOriginal) {
|
if (storeOriginal) {
|
||||||
builder.put(stem, NOOP_PATCH_COMMAND);
|
builder.put(stem, NOOP_PATCH_COMMAND);
|
||||||
insertedMappings[0]++;
|
insertedMappings[0]++;
|
||||||
@@ -331,14 +721,35 @@ public final class StemmerPatchTrieLoader {
|
|||||||
|
|
||||||
if (LOGGER.isLoggable(Level.FINE)) {
|
if (LOGGER.isLoggable(Level.FINE)) {
|
||||||
LOGGER.log(Level.FINE,
|
LOGGER.log(Level.FINE,
|
||||||
"Loaded stemmer dictionary from {0}; insertedMappings={1}, lines={2}, entries={3}, ignoredLines={4}.",
|
"Loaded stemmer dictionary from {0}; insertedMappings={1}, lines={2}, entries={3}, ignoredLines={4}, metadata={5}.",
|
||||||
new Object[] { sourceDescription, insertedMappings[0], statistics.lineCount(),
|
new Object[] { sourceDescription, insertedMappings[0], statistics.lineCount(),
|
||||||
statistics.entryCount(), statistics.ignoredLineCount() });
|
statistics.entryCount(), statistics.ignoredLineCount(), metadata.toTextBlock() });
|
||||||
}
|
}
|
||||||
|
|
||||||
return builder.build();
|
return builder.build();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static TrieMetadata metadataForCompilation(final WordTraversalDirection traversalDirection,
|
||||||
|
final ReductionSettings reductionSettings, final CaseProcessingMode caseProcessingMode,
|
||||||
|
final DiacriticProcessingMode diacriticProcessingMode) {
|
||||||
|
Objects.requireNonNull(traversalDirection, "traversalDirection");
|
||||||
|
Objects.requireNonNull(reductionSettings, "reductionSettings");
|
||||||
|
Objects.requireNonNull(caseProcessingMode, "caseProcessingMode");
|
||||||
|
Objects.requireNonNull(diacriticProcessingMode, "diacriticProcessingMode");
|
||||||
|
return TrieMetadata.forCompilation(traversalDirection, reductionSettings, diacriticProcessingMode,
|
||||||
|
caseProcessingMode);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Resolves the traversal direction implied by a bundled language definition.
|
||||||
|
*
|
||||||
|
* @param language bundled language
|
||||||
|
* @return traversal direction to use for that language
|
||||||
|
*/
|
||||||
|
private static WordTraversalDirection traversalDirectionOf(final Language language) {
|
||||||
|
return language.isRightToLeft() ? WordTraversalDirection.FORWARD : WordTraversalDirection.BACKWARD;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Loads a GZip-compressed binary patch-command trie from a filesystem path.
|
* Loads a GZip-compressed binary patch-command trie from a filesystem path.
|
||||||
*
|
*
|
||||||
@@ -349,10 +760,31 @@ public final class StemmerPatchTrieLoader {
|
|||||||
* read
|
* read
|
||||||
*/
|
*/
|
||||||
public static FrequencyTrie<String> loadBinary(final Path path) throws IOException {
|
public static FrequencyTrie<String> loadBinary(final Path path) throws IOException {
|
||||||
Objects.requireNonNull(path, "path");
|
Objects.requireNonNull(path, PARAMETER_PATH);
|
||||||
return StemmerPatchTrieBinaryIO.read(path);
|
return StemmerPatchTrieBinaryIO.read(path);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Loads a GZip-compressed binary patch-command trie from a filesystem path
|
||||||
|
* using a custom dense lookup span override.
|
||||||
|
* <p>
|
||||||
|
* This is a runtime-only tuning parameter that does not affect persisted
|
||||||
|
* metadata.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @param path path to the compressed binary trie file
|
||||||
|
* @param maxExpandedIndex dense lookup span override; negative values use
|
||||||
|
* {@link FrequencyTrie#DEFAULT_MAX_EXPANDED_INDEX}
|
||||||
|
* @return compiled patch-command trie
|
||||||
|
* @throws NullPointerException if {@code path} is {@code null}
|
||||||
|
* @throws IOException if the file cannot be opened, decompressed, or
|
||||||
|
* read
|
||||||
|
*/
|
||||||
|
public static FrequencyTrie<String> loadBinary(final Path path, final int maxExpandedIndex) throws IOException {
|
||||||
|
Objects.requireNonNull(path, PARAMETER_PATH);
|
||||||
|
return StemmerPatchTrieBinaryIO.read(path, maxExpandedIndex);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Loads a GZip-compressed binary patch-command trie from a filesystem path
|
* Loads a GZip-compressed binary patch-command trie from a filesystem path
|
||||||
* string.
|
* string.
|
||||||
@@ -364,10 +796,31 @@ public final class StemmerPatchTrieLoader {
|
|||||||
* read
|
* read
|
||||||
*/
|
*/
|
||||||
public static FrequencyTrie<String> loadBinary(final String fileName) throws IOException {
|
public static FrequencyTrie<String> loadBinary(final String fileName) throws IOException {
|
||||||
Objects.requireNonNull(fileName, "fileName");
|
Objects.requireNonNull(fileName, FILENAME_REQUIRED);
|
||||||
return StemmerPatchTrieBinaryIO.read(fileName);
|
return StemmerPatchTrieBinaryIO.read(fileName);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Loads a GZip-compressed binary patch-command trie from a filesystem path
|
||||||
|
* string using a custom dense lookup span override.
|
||||||
|
* <p>
|
||||||
|
* This is a runtime-only tuning parameter that does not affect persisted
|
||||||
|
* metadata.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @param fileName file name or path string
|
||||||
|
* @param maxExpandedIndex dense lookup span override; negative values use
|
||||||
|
* {@link FrequencyTrie#DEFAULT_MAX_EXPANDED_INDEX}
|
||||||
|
* @return compiled patch-command trie
|
||||||
|
* @throws NullPointerException if {@code fileName} is {@code null}
|
||||||
|
* @throws IOException if the file cannot be opened, decompressed, or
|
||||||
|
* read
|
||||||
|
*/
|
||||||
|
public static FrequencyTrie<String> loadBinary(final String fileName, final int maxExpandedIndex) throws IOException {
|
||||||
|
Objects.requireNonNull(fileName, FILENAME_REQUIRED);
|
||||||
|
return StemmerPatchTrieBinaryIO.read(fileName, maxExpandedIndex);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Loads a GZip-compressed binary patch-command trie from an input stream.
|
* Loads a GZip-compressed binary patch-command trie from an input stream.
|
||||||
*
|
*
|
||||||
@@ -381,6 +834,50 @@ public final class StemmerPatchTrieLoader {
|
|||||||
return StemmerPatchTrieBinaryIO.read(inputStream);
|
return StemmerPatchTrieBinaryIO.read(inputStream);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Loads only persisted metadata from a GZip-compressed binary patch-command
|
||||||
|
* trie file.
|
||||||
|
*
|
||||||
|
* @param path path to the compressed binary trie file
|
||||||
|
* @return persisted trie metadata
|
||||||
|
* @throws NullPointerException if {@code path} is {@code null}
|
||||||
|
* @throws IOException if the file cannot be opened, decompressed, or
|
||||||
|
* read
|
||||||
|
*/
|
||||||
|
public static TrieMetadata loadBinaryMetadata(final Path path) throws IOException {
|
||||||
|
Objects.requireNonNull(path, PARAMETER_PATH);
|
||||||
|
return StemmerPatchTrieBinaryIO.readMetadata(path);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Loads only persisted metadata from a GZip-compressed binary patch-command
|
||||||
|
* trie file.
|
||||||
|
*
|
||||||
|
* @param fileName file name or path string
|
||||||
|
* @return persisted trie metadata
|
||||||
|
* @throws NullPointerException if {@code fileName} is {@code null}
|
||||||
|
* @throws IOException if the file cannot be opened, decompressed, or
|
||||||
|
* read
|
||||||
|
*/
|
||||||
|
public static TrieMetadata loadBinaryMetadata(final String fileName) throws IOException {
|
||||||
|
Objects.requireNonNull(fileName, FILENAME_REQUIRED);
|
||||||
|
return StemmerPatchTrieBinaryIO.readMetadata(fileName);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Loads only persisted metadata from a GZip-compressed binary patch-command
|
||||||
|
* trie stream.
|
||||||
|
*
|
||||||
|
* @param inputStream source input stream
|
||||||
|
* @return persisted trie metadata
|
||||||
|
* @throws NullPointerException if {@code inputStream} is {@code null}
|
||||||
|
* @throws IOException if the stream cannot be decompressed or read
|
||||||
|
*/
|
||||||
|
public static TrieMetadata loadBinaryMetadata(final InputStream inputStream) throws IOException {
|
||||||
|
Objects.requireNonNull(inputStream, "inputStream");
|
||||||
|
return StemmerPatchTrieBinaryIO.readMetadata(inputStream);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Saves a compiled patch-command trie as a GZip-compressed binary file.
|
* Saves a compiled patch-command trie as a GZip-compressed binary file.
|
||||||
*
|
*
|
||||||
@@ -391,7 +888,7 @@ public final class StemmerPatchTrieLoader {
|
|||||||
*/
|
*/
|
||||||
public static void saveBinary(final FrequencyTrie<String> trie, final Path path) throws IOException {
|
public static void saveBinary(final FrequencyTrie<String> trie, final Path path) throws IOException {
|
||||||
Objects.requireNonNull(trie, "trie");
|
Objects.requireNonNull(trie, "trie");
|
||||||
Objects.requireNonNull(path, "path");
|
Objects.requireNonNull(path, PARAMETER_PATH);
|
||||||
StemmerPatchTrieBinaryIO.write(trie, path);
|
StemmerPatchTrieBinaryIO.write(trie, path);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -405,10 +902,40 @@ public final class StemmerPatchTrieLoader {
|
|||||||
*/
|
*/
|
||||||
public static void saveBinary(final FrequencyTrie<String> trie, final String fileName) throws IOException {
|
public static void saveBinary(final FrequencyTrie<String> trie, final String fileName) throws IOException {
|
||||||
Objects.requireNonNull(trie, "trie");
|
Objects.requireNonNull(trie, "trie");
|
||||||
Objects.requireNonNull(fileName, "fileName");
|
Objects.requireNonNull(fileName, FILENAME_REQUIRED);
|
||||||
StemmerPatchTrieBinaryIO.write(trie, fileName);
|
StemmerPatchTrieBinaryIO.write(trie, fileName);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Opens one filesystem dictionary input stream.
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* Plain-text dictionaries are returned as-is. GZip-compressed dictionaries are
|
||||||
|
* detected from the stream header rather than from the file extension so that
|
||||||
|
* callers may provide arbitrary temporary file names without changing the
|
||||||
|
* loading contract.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @param path dictionary file path
|
||||||
|
* @return opened dictionary stream, transparently decompressing GZip inputs
|
||||||
|
* @throws IOException if the file cannot be opened
|
||||||
|
*/
|
||||||
|
private static InputStream openDictionaryInputStream(final Path path) throws IOException {
|
||||||
|
final PushbackInputStream pushbackInputStream = new PushbackInputStream(
|
||||||
|
new BufferedInputStream(Files.newInputStream(path)), 2);
|
||||||
|
final byte[] header = pushbackInputStream.readNBytes(2);
|
||||||
|
|
||||||
|
if (header.length > 0) {
|
||||||
|
pushbackInputStream.unread(header);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (header.length == 2 && (header[0] & 0xFF) == 0x1F && (header[1] & 0xFF) == 0x8B) {
|
||||||
|
return new GZIPInputStream(pushbackInputStream);
|
||||||
|
}
|
||||||
|
|
||||||
|
return pushbackInputStream;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Opens a bundled resource from the classpath.
|
* Opens a bundled resource from the classpath.
|
||||||
*
|
*
|
||||||
@@ -416,12 +943,12 @@ public final class StemmerPatchTrieLoader {
|
|||||||
* @return opened input stream
|
* @return opened input stream
|
||||||
* @throws IOException if the resource cannot be found
|
* @throws IOException if the resource cannot be found
|
||||||
*/
|
*/
|
||||||
private static InputStream openBundledResource(final String resourcePath) throws IOException {
|
/* default */ static InputStream openBundledResource(final String resourcePath) throws IOException {
|
||||||
final ClassLoader classLoader = Thread.currentThread().getContextClassLoader();
|
final ClassLoader classLoader = Thread.currentThread().getContextClassLoader();
|
||||||
final InputStream inputStream = classLoader.getResourceAsStream(resourcePath);
|
final InputStream inputStream = classLoader.getResourceAsStream(resourcePath);
|
||||||
if (inputStream == null) {
|
if (inputStream == null) {
|
||||||
throw new IOException("Stemmer resource not found: " + resourcePath);
|
throw new IOException("Stemmer resource not found: " + resourcePath);
|
||||||
}
|
}
|
||||||
return inputStream;
|
return new GZIPInputStream(inputStream);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
235
src/main/java/org/egothor/stemmer/TrieMetadata.java
Normal file
235
src/main/java/org/egothor/stemmer/TrieMetadata.java
Normal file
@@ -0,0 +1,235 @@
|
|||||||
|
/*******************************************************************************
|
||||||
|
* Copyright (C) 2026, Leo Galambos
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer in the documentation
|
||||||
|
* and/or other materials provided with the distribution.
|
||||||
|
*
|
||||||
|
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||||
|
* may be used to endorse or promote products derived from this software
|
||||||
|
* without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||||
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
******************************************************************************/
|
||||||
|
package org.egothor.stemmer;
|
||||||
|
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Objects;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Immutable metadata persisted together with a compiled trie artifact.
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* The metadata captures the semantic build configuration required to interpret
|
||||||
|
* the compiled trie correctly after it is reloaded. Persisting the metadata as
|
||||||
|
* part of the artifact makes the binary format self-describing and avoids
|
||||||
|
* coupling runtime consumers to external side-channel configuration.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* The record is intentionally extensible. It already models traversal
|
||||||
|
* direction, reduction settings, and diacritic and case processing strategies, even though
|
||||||
|
* not every field necessarily influences all current code paths yet.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @param formatVersion persisted binary format version of the trie
|
||||||
|
* artifact
|
||||||
|
* @param traversalDirection logical key traversal direction
|
||||||
|
* @param reductionSettings reduction settings used during compilation
|
||||||
|
* @param diacriticProcessingMode diacritic processing strategy associated with
|
||||||
|
* the artifact
|
||||||
|
* @param caseProcessingMode case processing strategy associated with the
|
||||||
|
* artifact
|
||||||
|
*/
|
||||||
|
public record TrieMetadata(int formatVersion, WordTraversalDirection traversalDirection,
|
||||||
|
ReductionSettings reductionSettings, DiacriticProcessingMode diacriticProcessingMode,
|
||||||
|
CaseProcessingMode caseProcessingMode) {
|
||||||
|
/**
|
||||||
|
* Header identifying the human-readable metadata block layout.
|
||||||
|
*/
|
||||||
|
private static final String TEXT_BLOCK_HEADER = "radixor.metadata.v1";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a new metadata instance.
|
||||||
|
*
|
||||||
|
* @param formatVersion persisted binary format version, must be at
|
||||||
|
* least {@code 1}
|
||||||
|
* @param traversalDirection logical key traversal direction
|
||||||
|
* @param reductionSettings reduction settings used during compilation
|
||||||
|
* @param diacriticProcessingMode diacritic processing strategy
|
||||||
|
* @param caseProcessingMode case processing strategy
|
||||||
|
*/
|
||||||
|
public TrieMetadata(final int formatVersion, final WordTraversalDirection traversalDirection,
|
||||||
|
final ReductionSettings reductionSettings, final DiacriticProcessingMode diacriticProcessingMode,
|
||||||
|
final CaseProcessingMode caseProcessingMode) {
|
||||||
|
if (formatVersion < 1) { // NOPMD
|
||||||
|
throw new IllegalArgumentException("formatVersion must be at least 1.");
|
||||||
|
}
|
||||||
|
this.formatVersion = formatVersion;
|
||||||
|
this.traversalDirection = Objects.requireNonNull(traversalDirection, "traversalDirection");
|
||||||
|
this.reductionSettings = Objects.requireNonNull(reductionSettings, "reductionSettings");
|
||||||
|
this.diacriticProcessingMode = Objects.requireNonNull(diacriticProcessingMode, "diacriticProcessingMode");
|
||||||
|
this.caseProcessingMode = Objects.requireNonNull(caseProcessingMode, "caseProcessingMode");
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates metadata populated with current-format defaults for freshly compiled
|
||||||
|
* tries.
|
||||||
|
*
|
||||||
|
* @param formatVersion persisted binary format version
|
||||||
|
* @param traversalDirection logical key traversal direction
|
||||||
|
* @param reductionSettings reduction settings used during compilation
|
||||||
|
* @return metadata initialized with current defaults
|
||||||
|
*/
|
||||||
|
public static TrieMetadata current(final int formatVersion, final WordTraversalDirection traversalDirection,
|
||||||
|
final ReductionSettings reductionSettings) {
|
||||||
|
return new TrieMetadata(formatVersion, traversalDirection, reductionSettings, DiacriticProcessingMode.AS_IS,
|
||||||
|
CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates metadata for a newly compiled trie using the currently persisted
|
||||||
|
* binary stream format version.
|
||||||
|
*
|
||||||
|
* @param traversalDirection logical key traversal direction
|
||||||
|
* @param reductionSettings reduction settings used during compilation
|
||||||
|
* @param diacriticProcessingMode diacritic processing strategy
|
||||||
|
* @param caseProcessingMode case processing strategy
|
||||||
|
* @return metadata aligned with the current persisted stream format
|
||||||
|
*/
|
||||||
|
public static TrieMetadata forCompilation(final WordTraversalDirection traversalDirection,
|
||||||
|
final ReductionSettings reductionSettings, final DiacriticProcessingMode diacriticProcessingMode,
|
||||||
|
final CaseProcessingMode caseProcessingMode) {
|
||||||
|
return new TrieMetadata(FrequencyTrie.currentFormatVersion(), traversalDirection, reductionSettings,
|
||||||
|
diacriticProcessingMode, caseProcessingMode);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates metadata compatible with a legacy artifact version that did not store
|
||||||
|
* the full configuration explicitly.
|
||||||
|
*
|
||||||
|
* @param formatVersion legacy persisted binary format version
|
||||||
|
* @param traversalDirection logical key traversal direction reconstructed from
|
||||||
|
* the legacy stream
|
||||||
|
* @return metadata reconstructed with conservative compatibility defaults
|
||||||
|
*/
|
||||||
|
public static TrieMetadata legacy(final int formatVersion, final WordTraversalDirection traversalDirection) {
|
||||||
|
return new TrieMetadata(formatVersion, traversalDirection,
|
||||||
|
ReductionSettings.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
||||||
|
DiacriticProcessingMode.AS_IS, CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns metadata encoded as a deterministic human-readable text block.
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* The format intentionally uses plain {@code key=value} lines so users can
|
||||||
|
* inspect metadata quickly from a decompressed trie payload without additional
|
||||||
|
* dependencies.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @return persisted metadata text block
|
||||||
|
*/
|
||||||
|
@SuppressWarnings("PMD.ConsecutiveLiteralAppends")
|
||||||
|
public String toTextBlock() {
|
||||||
|
final StringBuilder textBlockBuilder = new StringBuilder(1024);
|
||||||
|
textBlockBuilder.append(TEXT_BLOCK_HEADER).append('\n')
|
||||||
|
//
|
||||||
|
.append("formatVersion=").append(this.formatVersion).append('\n')
|
||||||
|
//
|
||||||
|
.append("traversalDirection=").append(this.traversalDirection.name()).append('\n')
|
||||||
|
//
|
||||||
|
.append("rightToLeft=").append(this.traversalDirection == WordTraversalDirection.FORWARD).append('\n')
|
||||||
|
//
|
||||||
|
.append("reductionMode=").append(this.reductionSettings.reductionMode().name()).append('\n')
|
||||||
|
//
|
||||||
|
.append("dominantWinnerMinPercent=").append(this.reductionSettings.dominantWinnerMinPercent())
|
||||||
|
.append('\n')
|
||||||
|
//
|
||||||
|
.append("dominantWinnerOverSecondRatio=").append(this.reductionSettings.dominantWinnerOverSecondRatio())
|
||||||
|
.append('\n')
|
||||||
|
//
|
||||||
|
.append("diacriticProcessingMode=").append(this.diacriticProcessingMode.name()).append('\n')
|
||||||
|
//
|
||||||
|
.append("caseProcessingMode=").append(this.caseProcessingMode.name()).append('\n');
|
||||||
|
return textBlockBuilder.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Parses metadata from a text block produced by {@link #toTextBlock()}.
|
||||||
|
*
|
||||||
|
* @param formatVersion persisted binary format version
|
||||||
|
* @param textBlock metadata text block
|
||||||
|
* @return parsed metadata
|
||||||
|
*/
|
||||||
|
public static TrieMetadata fromTextBlock(final int formatVersion, final String textBlock) {
|
||||||
|
Objects.requireNonNull(textBlock, "textBlock");
|
||||||
|
|
||||||
|
final String[] lines = textBlock.split("\\R");
|
||||||
|
if (lines.length == 0 || !TEXT_BLOCK_HEADER.equals(lines[0])) {
|
||||||
|
throw new IllegalArgumentException("Unsupported metadata block header.");
|
||||||
|
}
|
||||||
|
|
||||||
|
final Map<String, String> entries = new HashMap<>();
|
||||||
|
for (int index = 1; index < lines.length; index++) {
|
||||||
|
final String line = lines[index];
|
||||||
|
if (line.isBlank()) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
final int delimiterIndex = line.indexOf('=');
|
||||||
|
if (delimiterIndex <= 0 || delimiterIndex == line.length() - 1) {
|
||||||
|
throw new IllegalArgumentException("Invalid metadata line: " + line);
|
||||||
|
}
|
||||||
|
entries.put(line.substring(0, delimiterIndex), line.substring(delimiterIndex + 1));
|
||||||
|
}
|
||||||
|
|
||||||
|
final WordTraversalDirection traversalDirection = WordTraversalDirection
|
||||||
|
.valueOf(requireEntry(entries, "traversalDirection"));
|
||||||
|
final ReductionMode reductionMode = ReductionMode.valueOf(requireEntry(entries, "reductionMode"));
|
||||||
|
final int dominantWinnerMinPercent = Integer.parseInt(requireEntry(entries, "dominantWinnerMinPercent"));
|
||||||
|
final int dominantWinnerOverSecondRatio = Integer // NOPMD
|
||||||
|
.parseInt(requireEntry(entries, "dominantWinnerOverSecondRatio"));
|
||||||
|
final DiacriticProcessingMode diacriticProcessingMode = DiacriticProcessingMode
|
||||||
|
.valueOf(requireEntry(entries, "diacriticProcessingMode"));
|
||||||
|
final CaseProcessingMode caseProcessingMode = CaseProcessingMode
|
||||||
|
.valueOf(requireEntry(entries, "caseProcessingMode"));
|
||||||
|
|
||||||
|
return new TrieMetadata(formatVersion, traversalDirection,
|
||||||
|
new ReductionSettings(reductionMode, dominantWinnerMinPercent, dominantWinnerOverSecondRatio),
|
||||||
|
diacriticProcessingMode, caseProcessingMode);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns a required metadata entry from a parsed text block.
|
||||||
|
*
|
||||||
|
* @param entries parsed metadata entries
|
||||||
|
* @param key required entry key
|
||||||
|
* @return non-blank entry value
|
||||||
|
* @throws IllegalArgumentException if the entry is absent or blank
|
||||||
|
*/
|
||||||
|
private static String requireEntry(final Map<String, String> entries, final String key) {
|
||||||
|
final String value = entries.get(key);
|
||||||
|
if (value == null || value.isBlank()) {
|
||||||
|
throw new IllegalArgumentException("Missing metadata entry: " + key);
|
||||||
|
}
|
||||||
|
return value;
|
||||||
|
}
|
||||||
|
}
|
||||||
152
src/main/java/org/egothor/stemmer/WordTraversalDirection.java
Normal file
152
src/main/java/org/egothor/stemmer/WordTraversalDirection.java
Normal file
@@ -0,0 +1,152 @@
|
|||||||
|
/*******************************************************************************
|
||||||
|
* Copyright (C) 2026, Leo Galambos
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer in the documentation
|
||||||
|
* and/or other materials provided with the distribution.
|
||||||
|
*
|
||||||
|
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||||
|
* may be used to endorse or promote products derived from this software
|
||||||
|
* without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||||
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
******************************************************************************/
|
||||||
|
package org.egothor.stemmer;
|
||||||
|
|
||||||
|
import java.util.Objects;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Defines the logical direction in which word characters are traversed.
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* The same direction is used consistently in two places:
|
||||||
|
* </p>
|
||||||
|
* <ul>
|
||||||
|
* <li>when a word key is traversed through a trie</li>
|
||||||
|
* <li>when patch commands are serialized and then applied back to a source
|
||||||
|
* word</li>
|
||||||
|
* </ul>
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* {@link #FORWARD} means that processing starts at the logical beginning of the
|
||||||
|
* stored form and moves toward its end. {@link #BACKWARD} means that processing
|
||||||
|
* starts at the logical end of the stored form and moves toward its beginning.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* For traditional suffix-oriented Egothor data, {@link #BACKWARD} matches the
|
||||||
|
* historical behavior. For right-to-left languages whose affix logic should
|
||||||
|
* operate on the stored form as written, {@link #FORWARD} can be used so that
|
||||||
|
* neither trie construction nor patch application needs to reverse words
|
||||||
|
* externally.
|
||||||
|
* </p>
|
||||||
|
*/
|
||||||
|
public enum WordTraversalDirection {

    /**
     * Traverses a word from its logical beginning toward its logical end.
     */
    FORWARD,

    /**
     * Traverses a word from its logical end toward its logical beginning.
     */
    BACKWARD;

    /**
     * Returns the traversal start index for a character sequence of the supplied
     * length.
     *
     * @param length sequence length
     * @return start index, or {@code -1} when the sequence is empty and traversal
     *         should therefore not begin
     * @throws IllegalArgumentException if {@code length} is negative
     */
    public int startIndex(final int length) {
        if (length < 0) {
            throw new IllegalArgumentException("length must not be negative.");
        }
        if (length == 0) {
            return -1;
        }
        return this == FORWARD ? 0 : length - 1;
    }

    /**
     * Returns the logical character index addressed by the supplied traversal
     * offset.
     *
     * <p>
     * A traversal offset of {@code 0} addresses the first character seen in this
     * direction, {@code 1} the second character, and so on.
     * </p>
     *
     * @param length          sequence length
     * @param traversalOffset zero-based offset from the traversal start
     * @return corresponding logical character index
     * @throws IllegalArgumentException if any argument is outside the valid range
     */
    public int logicalIndex(final int length, final int traversalOffset) {
        if (length < 0) {
            throw new IllegalArgumentException("length must not be negative.");
        }
        if (traversalOffset < 0 || traversalOffset >= length) {
            throw new IllegalArgumentException("traversalOffset is outside the valid range.");
        }
        return this == FORWARD ? traversalOffset : length - 1 - traversalOffset;
    }

    /**
     * Returns the characters of the supplied word in this traversal order.
     *
     * <p>
     * Ordering is applied per UTF-16 code unit. For {@link #BACKWARD} the two
     * halves of a surrogate pair therefore appear in swapped order, and
     * {@link #traversalPathToLogicalKey(CharSequence)} restores them.
     * </p>
     *
     * @param word source word
     * @return traversal-ordered characters in a newly allocated array
     * @throws NullPointerException if {@code word} is {@code null}
     */
    public char[] toTraversalCharacters(final String word) {
        Objects.requireNonNull(word, "word");
        final char[] characters = word.toCharArray();
        if (this == BACKWARD) {
            reverseInPlace(characters);
        }
        return characters;
    }

    /**
     * Converts a path represented in traversal order back to the logical key form.
     *
     * <p>
     * This is the exact inverse of {@link #toTraversalCharacters(String)}: the
     * path is reversed per UTF-16 code unit. {@code StringBuilder.reverse()} is
     * deliberately not used, because it re-pairs surrogates after reversing and
     * would scramble a path that contains adjacent supplementary characters.
     * </p>
     *
     * @param traversalPath key path in traversal order
     * @return logical key form
     * @throws NullPointerException if {@code traversalPath} is {@code null}
     */
    public String traversalPathToLogicalKey(final CharSequence traversalPath) {
        Objects.requireNonNull(traversalPath, "traversalPath");
        final String path = traversalPath.toString();
        if (this == FORWARD) {
            return path;
        }
        final char[] characters = path.toCharArray();
        reverseInPlace(characters);
        return new String(characters);
    }

    /**
     * Reverses the supplied array in place, one UTF-16 code unit at a time.
     *
     * @param characters array to reverse
     */
    private static void reverseInPlace(final char[] characters) {
        for (int left = 0, right = characters.length - 1; left < right; left++, right--) { // NOPMD
            final char swap = characters[left];
            characters[left] = characters[right];
            characters[right] = swap;
        }
    }
}
|
||||||
@@ -56,12 +56,17 @@
|
|||||||
* <p>
|
* <p>
|
||||||
* Dictionary loading is provided by
|
* Dictionary loading is provided by
|
||||||
* {@link org.egothor.stemmer.StemmerPatchTrieLoader}, which reads the
|
* {@link org.egothor.stemmer.StemmerPatchTrieLoader}, which reads the
|
||||||
* traditional line-oriented stemmer resource format in which each non-empty
|
* traditional line-oriented tab-separated values resource format in which each
|
||||||
* logical line starts with a canonical stem followed by known surface variants.
|
* non-empty logical line starts with a canonical stem followed by known surface
|
||||||
|
* variants in subsequent tab-separated columns.
|
||||||
* Parsing is delegated to {@link org.egothor.stemmer.StemmerDictionaryParser},
|
* Parsing is delegated to {@link org.egothor.stemmer.StemmerDictionaryParser},
|
||||||
* which normalizes input to lower case using {@link java.util.Locale#ROOT} and
|
* which applies configurable case processing through
|
||||||
|
* {@link org.egothor.stemmer.CaseProcessingMode} (default:
|
||||||
|
* {@link org.egothor.stemmer.CaseProcessingMode#LOWERCASE_WITH_LOCALE_ROOT}),
|
||||||
* supports whole-line as well as trailing remarks introduced by {@code #} or
|
* supports whole-line as well as trailing remarks introduced by {@code #} or
|
||||||
* {@code //}. During loading, each variant is converted into a patch command
|
* {@code //}, and currently ignores dictionary items containing Unicode
|
||||||
|
* whitespace characters while reporting them through warning-level diagnostics.
|
||||||
|
* During loading, each variant is converted into a patch command
|
||||||
* targeting the canonical stem, and the stem itself may optionally be stored
|
* targeting the canonical stem, and the stem itself may optionally be stored
|
||||||
* under the canonical no-operation patch.
|
* under the canonical no-operation patch.
|
||||||
* </p>
|
* </p>
|
||||||
|
|||||||
@@ -60,11 +60,23 @@ import java.util.Objects;
|
|||||||
this.childSignature = childSignature;
|
this.childSignature = childSignature;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns a hash code consistent with descriptor equality.
|
||||||
|
*
|
||||||
|
* @return descriptor hash code
|
||||||
|
*/
|
||||||
@Override
|
@Override
|
||||||
public int hashCode() {
|
public int hashCode() {
|
||||||
return Objects.hash(this.edge, this.childSignature);
|
return Objects.hash(this.edge, this.childSignature);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Compares this descriptor with another object.
|
||||||
|
*
|
||||||
|
* @param other object to compare with
|
||||||
|
* @return {@code true} when both descriptors represent the same semantic
|
||||||
|
* reduction identity
|
||||||
|
*/
|
||||||
@Override
|
@Override
|
||||||
public boolean equals(final Object other) {
|
public boolean equals(final Object other) {
|
||||||
if (this == other) {
|
if (this == other) {
|
||||||
|
|||||||
@@ -1,21 +1,21 @@
|
|||||||
/*******************************************************************************
|
/*******************************************************************************
|
||||||
* Copyright (C) 2026, Leo Galambos
|
* Copyright (C) 2026, Leo Galambos
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
*
|
*
|
||||||
* Redistribution and use in source and binary forms, with or without
|
* Redistribution and use in source and binary forms, with or without
|
||||||
* modification, are permitted provided that the following conditions are met:
|
* modification, are permitted provided that the following conditions are met:
|
||||||
*
|
*
|
||||||
* 1. Redistributions of source code must retain the above copyright notice,
|
* 1. Redistributions of source code must retain the above copyright notice,
|
||||||
* this list of conditions and the following disclaimer.
|
* this list of conditions and the following disclaimer.
|
||||||
*
|
*
|
||||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||||
* this list of conditions and the following disclaimer in the documentation
|
* this list of conditions and the following disclaimer in the documentation
|
||||||
* and/or other materials provided with the distribution.
|
* and/or other materials provided with the distribution.
|
||||||
*
|
*
|
||||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||||
* may be used to endorse or promote products derived from this software
|
* may be used to endorse or promote products derived from this software
|
||||||
* without specific prior written permission.
|
* without specific prior written permission.
|
||||||
*
|
*
|
||||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
@@ -43,34 +43,128 @@ import java.util.Objects;
|
|||||||
* immutable from the public API perspective because construction wires these
|
* immutable from the public API perspective because construction wires these
|
||||||
* arrays once and all lookup operations thereafter treat them as read-only.
|
* arrays once and all lookup operations thereafter treat them as read-only.
|
||||||
*
|
*
|
||||||
* @param <V> value type
|
* @param <V> value type
|
||||||
* @param edgeLabels internal edge label array
|
|
||||||
* @param children internal child array
|
|
||||||
* @param orderedValues internal ordered values array
|
|
||||||
* @param orderedCounts internal ordered counts array
|
|
||||||
*/
|
*/
|
||||||
@SuppressWarnings("PMD.DataClass")
|
public final class CompiledNode<V> {
|
||||||
public record CompiledNode<V>(char[] edgeLabels, CompiledNode<V>[] children, V[] orderedValues, int... orderedCounts) {
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates one validated compiled node.
|
* Default dense child lookup span in characters used when an explicit override is
|
||||||
|
* not provided.
|
||||||
|
*/
|
||||||
|
public static final int DEFAULT_MAX_EXPANDED_INDEX = 512;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Number of child edges where linear scan is cheaper than binary search.
|
||||||
|
*/
|
||||||
|
private static final int LINEAR_CHILD_COUNT_THRESHOLD = 4;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Edge labels in sorted ascending order.
|
||||||
|
*/
|
||||||
|
private final char[] edgeLabels;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sparse child array aligned with {@link #edgeLabels}.
|
||||||
|
*/
|
||||||
|
private final CompiledNode<V>[] children;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Dense child lookup table used when labels fit into a compact char interval.
|
||||||
|
* <p>
|
||||||
|
* The table enables direct O(1) indexing for child lookup and is allocated
|
||||||
|
* only when the character span of this node's edges is within the configured
|
||||||
|
* threshold.
|
||||||
|
* </p>
|
||||||
|
*/
|
||||||
|
private final CompiledNode<V>[] denseChildren;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Normalized minimum edge value for the dense lookup table.
|
||||||
|
*/
|
||||||
|
private final int denseEdgeMin;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Values stored at this node in local order.
|
||||||
|
*/
|
||||||
|
private final V[] orderedValues;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Occurrence counts aligned with {@link #orderedValues}.
|
||||||
|
*/
|
||||||
|
private final int[] orderedCounts;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates one validated compiled node using {@link #DEFAULT_MAX_EXPANDED_INDEX}
|
||||||
|
* for dense lookup sizing.
|
||||||
*
|
*
|
||||||
* @throws NullPointerException if any array argument is {@code null}
|
* @throws NullPointerException if any array argument is {@code null}
|
||||||
* @throws IllegalArgumentException if the edge-related arrays or value-related
|
* @throws IllegalArgumentException if the edge-related arrays or value-related
|
||||||
* arrays do not have matching lengths
|
* arrays do not have matching lengths
|
||||||
*/
|
*/
|
||||||
public CompiledNode {
|
public CompiledNode(final char[] edgeLabels, final CompiledNode<V>[] children, final V[] orderedValues,
|
||||||
|
final int... orderedCounts) {
|
||||||
|
this(edgeLabels, children, orderedValues, DEFAULT_MAX_EXPANDED_INDEX, orderedCounts);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates one validated compiled node.
|
||||||
|
*
|
||||||
|
* @param maxExpandedIndex upper bound for the dense lookup interval size; zero
|
||||||
|
* disables dense lookup. Larger values improve
|
||||||
|
* direct-index likelihood while increasing dense
|
||||||
|
* table memory in compact-label nodes.
|
||||||
|
* @throws NullPointerException if any array argument is {@code null}
|
||||||
|
* @throws IllegalArgumentException if the edge-related arrays or value-related
|
||||||
|
* arrays do not have matching lengths or the
|
||||||
|
* dense interval size is negative
|
||||||
|
*/
|
||||||
|
public CompiledNode(final char[] edgeLabels, final CompiledNode<V>[] children, final V[] orderedValues,
|
||||||
|
final int maxExpandedIndex, final int... orderedCounts) {
|
||||||
Objects.requireNonNull(edgeLabels, "edgeLabels");
|
Objects.requireNonNull(edgeLabels, "edgeLabels");
|
||||||
Objects.requireNonNull(children, "children");
|
Objects.requireNonNull(children, "children");
|
||||||
Objects.requireNonNull(orderedValues, "orderedValues");
|
Objects.requireNonNull(orderedValues, "orderedValues");
|
||||||
Objects.requireNonNull(orderedCounts, "orderedCounts");
|
Objects.requireNonNull(orderedCounts, "orderedCounts");
|
||||||
|
|
||||||
|
if (maxExpandedIndex < 0) {
|
||||||
|
throw new IllegalArgumentException("maxExpandedIndex must be non-negative.");
|
||||||
|
}
|
||||||
|
|
||||||
if (edgeLabels.length != children.length) {
|
if (edgeLabels.length != children.length) {
|
||||||
throw new IllegalArgumentException("edgeLabels and children must have the same length.");
|
throw new IllegalArgumentException("edgeLabels and children must have the same length.");
|
||||||
}
|
}
|
||||||
if (orderedValues.length != orderedCounts.length) {
|
if (orderedValues.length != orderedCounts.length) {
|
||||||
throw new IllegalArgumentException("orderedValues and orderedCounts must have the same length.");
|
throw new IllegalArgumentException("orderedValues and orderedCounts must have the same length.");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
this.edgeLabels = edgeLabels;
|
||||||
|
this.children = children;
|
||||||
|
this.orderedValues = orderedValues;
|
||||||
|
this.orderedCounts = orderedCounts;
|
||||||
|
|
||||||
|
if (edgeLabels.length == 0 || maxExpandedIndex == 0) {
|
||||||
|
this.denseChildren = null;
|
||||||
|
this.denseEdgeMin = 0;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
final int minEdge = edgeLabels[0];
|
||||||
|
final int maxEdge = edgeLabels[edgeLabels.length - 1];
|
||||||
|
final int span = maxEdge - minEdge;
|
||||||
|
|
||||||
|
if (span < 0 || span > maxExpandedIndex) {
|
||||||
|
this.denseChildren = null;
|
||||||
|
this.denseEdgeMin = 0;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
@SuppressWarnings("unchecked")
|
||||||
|
final CompiledNode<V>[] dense = (CompiledNode<V>[]) new CompiledNode[span + 1];
|
||||||
|
for (int edgeIndex = 0; edgeIndex < edgeLabels.length; edgeIndex++) {
|
||||||
|
dense[edgeLabels[edgeIndex] - minEdge] = children[edgeIndex];
|
||||||
|
}
|
||||||
|
|
||||||
|
this.denseChildren = dense;
|
||||||
|
this.denseEdgeMin = minEdge;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -82,7 +176,6 @@ public record CompiledNode<V>(char[] edgeLabels, CompiledNode<V>[] children, V[]
|
|||||||
*
|
*
|
||||||
* @return internal edge-label array
|
* @return internal edge-label array
|
||||||
*/
|
*/
|
||||||
@Override
|
|
||||||
@SuppressWarnings("PMD.MethodReturnsInternalArray")
|
@SuppressWarnings("PMD.MethodReturnsInternalArray")
|
||||||
public char[] edgeLabels() {
|
public char[] edgeLabels() {
|
||||||
return this.edgeLabels;
|
return this.edgeLabels;
|
||||||
@@ -97,7 +190,6 @@ public record CompiledNode<V>(char[] edgeLabels, CompiledNode<V>[] children, V[]
|
|||||||
*
|
*
|
||||||
* @return internal child-node array
|
* @return internal child-node array
|
||||||
*/
|
*/
|
||||||
@Override
|
|
||||||
@SuppressWarnings("PMD.MethodReturnsInternalArray")
|
@SuppressWarnings("PMD.MethodReturnsInternalArray")
|
||||||
public CompiledNode<V>[] children() {
|
public CompiledNode<V>[] children() {
|
||||||
return this.children;
|
return this.children;
|
||||||
@@ -112,7 +204,6 @@ public record CompiledNode<V>(char[] edgeLabels, CompiledNode<V>[] children, V[]
|
|||||||
*
|
*
|
||||||
* @return internal ordered-values array
|
* @return internal ordered-values array
|
||||||
*/
|
*/
|
||||||
@Override
|
|
||||||
@SuppressWarnings("PMD.MethodReturnsInternalArray")
|
@SuppressWarnings("PMD.MethodReturnsInternalArray")
|
||||||
public V[] orderedValues() {
|
public V[] orderedValues() {
|
||||||
return this.orderedValues;
|
return this.orderedValues;
|
||||||
@@ -127,19 +218,170 @@ public record CompiledNode<V>(char[] edgeLabels, CompiledNode<V>[] children, V[]
|
|||||||
*
|
*
|
||||||
* @return internal ordered-counts array
|
* @return internal ordered-counts array
|
||||||
*/
|
*/
|
||||||
@Override
|
|
||||||
@SuppressWarnings("PMD.MethodReturnsInternalArray")
|
@SuppressWarnings("PMD.MethodReturnsInternalArray")
|
||||||
public int[] orderedCounts() {
|
public int[] orderedCounts() {
|
||||||
return this.orderedCounts;
|
return this.orderedCounts;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the number of child edges represented by this node.
|
||||||
|
*
|
||||||
|
* @return child edge count
|
||||||
|
*/
|
||||||
|
public int edgeCount() {
|
||||||
|
return this.edgeLabels.length;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the number of values stored in this node.
|
||||||
|
*
|
||||||
|
* @return value count
|
||||||
|
*/
|
||||||
|
public int valueCount() {
|
||||||
|
return this.orderedValues.length;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Indicates whether this node stores any values.
|
||||||
|
*
|
||||||
|
* @return {@code true} when values are present at this node
|
||||||
|
*/
|
||||||
|
public boolean hasValues() {
|
||||||
|
return this.orderedValues.length > 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Indicates whether this node has child edges.
|
||||||
|
*
|
||||||
|
* @return {@code true} when this node has at least one outgoing edge
|
||||||
|
*/
|
||||||
|
public boolean hasChildren() {
|
||||||
|
return this.edgeLabels.length > 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Indicates whether this node has no child edges.
|
||||||
|
*
|
||||||
|
* @return {@code true} when this node is a terminal leaf node
|
||||||
|
*/
|
||||||
|
public boolean isLeaf() {
|
||||||
|
return !hasChildren();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Tests whether an edge label is present at this node.
|
||||||
|
*
|
||||||
|
* @param edge edge label
|
||||||
|
* @return {@code true} if this node contains the supplied edge label
|
||||||
|
*/
|
||||||
|
public boolean hasEdge(final char edge) {
|
||||||
|
return findChild(edge) != null;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Indicates whether this node has a dense direct-index child lookup table.
|
||||||
|
*
|
||||||
|
* @return {@code true} when a direct-index child table is available
|
||||||
|
*/
|
||||||
|
public boolean hasDenseLookup() {
|
||||||
|
return this.denseChildren != null;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns a small memory-related metric describing this node's dense table size.
|
||||||
|
*
|
||||||
|
* @return number of dense table slots, or {@code 0} when dense lookup is not
|
||||||
|
* enabled
|
||||||
|
*/
|
||||||
|
public int denseTableLength() {
|
||||||
|
return this.denseChildren == null ? 0 : this.denseChildren.length;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns a compact structural summary used by diagnostics and tests.
|
||||||
|
*
|
||||||
|
* @return summary hash for node structure and contents
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public int hashCode() {
|
||||||
|
int hash = Arrays.hashCode(this.edgeLabels);
|
||||||
|
hash = 31 * hash + Arrays.hashCode(this.children);
|
||||||
|
hash = 31 * hash + Arrays.hashCode(this.orderedValues);
|
||||||
|
hash = 31 * hash + Arrays.hashCode(this.orderedCounts);
|
||||||
|
hash = 31 * hash + Objects.hash(this.denseEdgeMin);
|
||||||
|
hash = 31 * hash + (hasDenseLookup() ? Arrays.hashCode(this.denseChildren) : 0);
|
||||||
|
return hash;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Compares structural node content, including dense table availability.
|
||||||
|
*
|
||||||
|
* @param object comparison object
|
||||||
|
* @return {@code true} when nodes describe identical structure and payload
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public boolean equals(final Object object) {
|
||||||
|
if (this == object) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (!(object instanceof CompiledNode<?> other)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return Arrays.equals(this.edgeLabels, other.edgeLabels) && Arrays.equals(this.children, other.children)
|
||||||
|
&& Arrays.equals(this.orderedValues, other.orderedValues) && Arrays.equals(this.orderedCounts, other.orderedCounts)
|
||||||
|
&& this.denseEdgeMin == other.denseEdgeMin && Arrays.equals(this.denseChildren, other.denseChildren);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns a short summary useful for debugging and diagnostics.
|
||||||
|
*
|
||||||
|
* @return textual node summary
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return "CompiledNode{"
|
||||||
|
+ "edgeCount=" + this.edgeLabels.length + ", orderedValueCount=" + this.orderedValues.length
|
||||||
|
+ ", denseTableLength=" + denseTableLength() + '}';
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Finds a child for the supplied edge character.
|
* Finds a child for the supplied edge character.
|
||||||
|
* <p>
|
||||||
|
* Lookup order is:
|
||||||
|
* <ol>
|
||||||
|
* <li>dense array index (if the label interval is compact enough),</li>
|
||||||
|
* <li>small-child linear scan when the fallback node has {@value #LINEAR_CHILD_COUNT_THRESHOLD}
|
||||||
|
* or fewer edges,</li>
|
||||||
|
* <li>binary search over sorted labels.</li>
|
||||||
|
* </ol>
|
||||||
|
* </p>
|
||||||
*
|
*
|
||||||
* @param edge edge character
|
* @param edge edge character
|
||||||
* @return child node, or {@code null} if absent
|
* @return child node, or {@code null} if absent
|
||||||
*/
|
*/
|
||||||
public CompiledNode<V> findChild(final char edge) {
|
public CompiledNode<V> findChild(final char edge) {
|
||||||
|
final int childCount = this.edgeLabels.length;
|
||||||
|
if (childCount == 0) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (this.denseChildren != null) {
|
||||||
|
final int denseIndex = edge - this.denseEdgeMin;
|
||||||
|
if (denseIndex < 0 || denseIndex >= this.denseChildren.length) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
return this.denseChildren[denseIndex];
|
||||||
|
}
|
||||||
|
|
||||||
|
if (childCount <= LINEAR_CHILD_COUNT_THRESHOLD) {
|
||||||
|
for (int index = 0; index < childCount; index++) {
|
||||||
|
if (this.edgeLabels[index] == edge) {
|
||||||
|
return this.children[index];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
final int index = Arrays.binarySearch(this.edgeLabels, edge);
|
final int index = Arrays.binarySearch(this.edgeLabels, edge);
|
||||||
if (index < 0) {
|
if (index < 0) {
|
||||||
return null;
|
return null;
|
||||||
|
|||||||
@@ -53,11 +53,23 @@ import java.util.Objects;
|
|||||||
this.dominantValue = dominantValue;
|
this.dominantValue = dominantValue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns a hash code consistent with descriptor equality.
|
||||||
|
*
|
||||||
|
* @return descriptor hash code
|
||||||
|
*/
|
||||||
@Override
|
@Override
|
||||||
public int hashCode() {
|
public int hashCode() {
|
||||||
return Objects.hashCode(this.dominantValue);
|
return Objects.hashCode(this.dominantValue);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Compares this descriptor with another object.
|
||||||
|
*
|
||||||
|
* @param other object to compare with
|
||||||
|
* @return {@code true} when both descriptors represent the same semantic
|
||||||
|
* reduction identity
|
||||||
|
*/
|
||||||
@Override
|
@Override
|
||||||
public boolean equals(final Object other) {
|
public boolean equals(final Object other) {
|
||||||
if (this == other) {
|
if (this == other) {
|
||||||
|
|||||||
@@ -65,11 +65,23 @@ import java.util.List;
|
|||||||
Collections.unmodifiableList(Arrays.asList(Arrays.copyOf(orderedValues, orderedValues.length))));
|
Collections.unmodifiableList(Arrays.asList(Arrays.copyOf(orderedValues, orderedValues.length))));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns a hash code consistent with descriptor equality.
|
||||||
|
*
|
||||||
|
* @return descriptor hash code
|
||||||
|
*/
|
||||||
@Override
|
@Override
|
||||||
public int hashCode() {
|
public int hashCode() {
|
||||||
return this.orderedValues.hashCode();
|
return this.orderedValues.hashCode();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Compares this descriptor with another object.
|
||||||
|
*
|
||||||
|
* @param other object to compare with
|
||||||
|
* @return {@code true} when both descriptors represent the same semantic
|
||||||
|
* reduction identity
|
||||||
|
*/
|
||||||
@Override
|
@Override
|
||||||
public boolean equals(final Object other) {
|
public boolean equals(final Object other) {
|
||||||
if (this == other) {
|
if (this == other) {
|
||||||
|
|||||||
@@ -67,11 +67,23 @@ import java.util.Set;
|
|||||||
return new UnorderedLocalDescriptor(Collections.unmodifiableSet(distinct));
|
return new UnorderedLocalDescriptor(Collections.unmodifiableSet(distinct));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns a hash code consistent with descriptor equality.
|
||||||
|
*
|
||||||
|
* @return descriptor hash code
|
||||||
|
*/
|
||||||
@Override
|
@Override
|
||||||
public int hashCode() {
|
public int hashCode() {
|
||||||
return this.distinctValues.hashCode();
|
return this.distinctValues.hashCode();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Compares this descriptor with another object.
|
||||||
|
*
|
||||||
|
* @param other object to compare with
|
||||||
|
* @return {@code true} when both descriptors represent the same semantic
|
||||||
|
* reduction identity
|
||||||
|
*/
|
||||||
@Override
|
@Override
|
||||||
public boolean equals(final Object other) {
|
public boolean equals(final Object other) {
|
||||||
if (this == other) {
|
if (this == other) {
|
||||||
|
|||||||
BIN
src/main/resources/cs_cz/stemmer.gz
Normal file
BIN
src/main/resources/cs_cz/stemmer.gz
Normal file
Binary file not shown.
File diff suppressed because it is too large
Load Diff
BIN
src/main/resources/da_dk/stemmer.gz
Normal file
BIN
src/main/resources/da_dk/stemmer.gz
Normal file
Binary file not shown.
File diff suppressed because it is too large
Load Diff
BIN
src/main/resources/de_de/stemmer.gz
Normal file
BIN
src/main/resources/de_de/stemmer.gz
Normal file
Binary file not shown.
File diff suppressed because it is too large
Load Diff
BIN
src/main/resources/es_es/stemmer.gz
Normal file
BIN
src/main/resources/es_es/stemmer.gz
Normal file
Binary file not shown.
BIN
src/main/resources/fa_ir/stemmer.gz
Normal file
BIN
src/main/resources/fa_ir/stemmer.gz
Normal file
Binary file not shown.
BIN
src/main/resources/fi_fi/stemmer.gz
Normal file
BIN
src/main/resources/fi_fi/stemmer.gz
Normal file
Binary file not shown.
File diff suppressed because it is too large
Load Diff
BIN
src/main/resources/fr_fr/stemmer.gz
Normal file
BIN
src/main/resources/fr_fr/stemmer.gz
Normal file
Binary file not shown.
BIN
src/main/resources/he_il/stemmer.gz
Normal file
BIN
src/main/resources/he_il/stemmer.gz
Normal file
Binary file not shown.
BIN
src/main/resources/hu_hu/stemmer.gz
Normal file
BIN
src/main/resources/hu_hu/stemmer.gz
Normal file
Binary file not shown.
File diff suppressed because it is too large
Load Diff
BIN
src/main/resources/it_it/stemmer.gz
Normal file
BIN
src/main/resources/it_it/stemmer.gz
Normal file
Binary file not shown.
BIN
src/main/resources/nb_no/stemmer.gz
Normal file
BIN
src/main/resources/nb_no/stemmer.gz
Normal file
Binary file not shown.
File diff suppressed because it is too large
Load Diff
BIN
src/main/resources/nl_nl/stemmer.gz
Normal file
BIN
src/main/resources/nl_nl/stemmer.gz
Normal file
Binary file not shown.
BIN
src/main/resources/nn_no/stemmer.gz
Normal file
BIN
src/main/resources/nn_no/stemmer.gz
Normal file
Binary file not shown.
File diff suppressed because it is too large
Load Diff
BIN
src/main/resources/pl_pl/stemmer.gz
Normal file
BIN
src/main/resources/pl_pl/stemmer.gz
Normal file
Binary file not shown.
File diff suppressed because it is too large
Load Diff
BIN
src/main/resources/pt_pt/stemmer.gz
Normal file
BIN
src/main/resources/pt_pt/stemmer.gz
Normal file
Binary file not shown.
File diff suppressed because it is too large
Load Diff
BIN
src/main/resources/ru_ru/stemmer.gz
Normal file
BIN
src/main/resources/ru_ru/stemmer.gz
Normal file
Binary file not shown.
File diff suppressed because it is too large
Load Diff
BIN
src/main/resources/sv_se/stemmer.gz
Normal file
BIN
src/main/resources/sv_se/stemmer.gz
Normal file
Binary file not shown.
BIN
src/main/resources/uk_ua/stemmer.gz
Normal file
BIN
src/main/resources/uk_ua/stemmer.gz
Normal file
Binary file not shown.
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
BIN
src/main/resources/us_uk/stemmer.gz
Normal file
BIN
src/main/resources/us_uk/stemmer.gz
Normal file
Binary file not shown.
BIN
src/main/resources/yi/stemmer.gz
Normal file
BIN
src/main/resources/yi/stemmer.gz
Normal file
Binary file not shown.
@@ -48,9 +48,12 @@ import java.nio.file.Files;
|
|||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.util.LinkedHashMap;
|
import java.util.LinkedHashMap;
|
||||||
import java.util.LinkedHashSet;
|
import java.util.LinkedHashSet;
|
||||||
|
import java.util.Locale;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
|
import java.util.zip.GZIPInputStream;
|
||||||
|
|
||||||
import org.junit.jupiter.api.DisplayName;
|
import org.junit.jupiter.api.DisplayName;
|
||||||
import org.junit.jupiter.api.Nested;
|
import org.junit.jupiter.api.Nested;
|
||||||
@@ -92,6 +95,8 @@ import org.junit.jupiter.params.provider.MethodSource;
|
|||||||
@Tag("integration")
|
@Tag("integration")
|
||||||
@Tag("cli")
|
@Tag("cli")
|
||||||
@Tag("stemmer")
|
@Tag("stemmer")
|
||||||
|
@Tag("compile")
|
||||||
|
@Tag("slow")
|
||||||
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
|
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
|
||||||
@DisplayName("Compile integration")
|
@DisplayName("Compile integration")
|
||||||
final class CompileIntegrationTest {
|
final class CompileIntegrationTest {
|
||||||
@@ -108,16 +113,14 @@ final class CompileIntegrationTest {
|
|||||||
private static final ReductionMode DEFAULT_REDUCTION_MODE = ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS;
|
private static final ReductionMode DEFAULT_REDUCTION_MODE = ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Reader charset used for robust extraction of ASCII-safe representative probes
|
* Reader charset used for extraction of representative probes from bundled
|
||||||
* from bundled project dictionaries.
|
* project dictionaries.
|
||||||
*
|
*
|
||||||
* <p>
|
* <p>
|
||||||
* ISO-8859-1 is intentionally used here as a byte-preserving single-byte
|
* Bundled project dictionaries are expected to be encoded in UTF-8.
|
||||||
* decoder so that the test can safely scan heterogeneous dictionary resources
|
|
||||||
* and then select only ASCII-safe representative terms for semantic assertions.
|
|
||||||
* </p>
|
* </p>
|
||||||
*/
|
*/
|
||||||
private static final Charset BUNDLED_PROBE_SCAN_CHARSET = StandardCharsets.ISO_8859_1;
|
private static final Charset BUNDLED_PROBE_SCAN_CHARSET = StandardCharsets.UTF_8;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Maximum number of representative bundled variants asserted per dictionary.
|
* Maximum number of representative bundled variants asserted per dictionary.
|
||||||
@@ -136,12 +139,47 @@ final class CompileIntegrationTest {
|
|||||||
* @return parameter stream
|
* @return parameter stream
|
||||||
*/
|
*/
|
||||||
static Stream<Arguments> bundledDictionaryCases() {
|
static Stream<Arguments> bundledDictionaryCases() {
|
||||||
return Stream.of(Arguments.of("da_dk", "da_dk/stemmer"), Arguments.of("de_de", "de_de/stemmer"),
|
return Stream.of(
|
||||||
Arguments.of("es_es", "es_es/stemmer"), Arguments.of("fr_fr", "fr_fr/stemmer"),
|
//
|
||||||
Arguments.of("it_it", "it_it/stemmer"), Arguments.of("nl_nl", "nl_nl/stemmer"),
|
Arguments.of("cs_cz", "cs_cz/stemmer.gz"),
|
||||||
Arguments.of("no_no", "no_no/stemmer"), Arguments.of("pt_pt", "pt_pt/stemmer"),
|
//
|
||||||
Arguments.of("ru_ru", "ru_ru/stemmer"), Arguments.of("sv_se", "sv_se/stemmer"),
|
Arguments.of("da_dk", "da_dk/stemmer.gz"),
|
||||||
Arguments.of("us_uk", "us_uk/stemmer"), Arguments.of("us_uk.profi", "us_uk.profi/stemmer"));
|
//
|
||||||
|
Arguments.of("de_de", "de_de/stemmer.gz"),
|
||||||
|
//
|
||||||
|
Arguments.of("es_es", "es_es/stemmer.gz"),
|
||||||
|
//
|
||||||
|
Arguments.of("fa_ir", "fa_ir/stemmer.gz"),
|
||||||
|
//
|
||||||
|
Arguments.of("fi_fi", "fi_fi/stemmer.gz"),
|
||||||
|
//
|
||||||
|
Arguments.of("fr_fr", "fr_fr/stemmer.gz"),
|
||||||
|
//
|
||||||
|
Arguments.of("he_il", "he_il/stemmer.gz"),
|
||||||
|
//
|
||||||
|
Arguments.of("hu_hu", "hu_hu/stemmer.gz"),
|
||||||
|
//
|
||||||
|
Arguments.of("it_it", "it_it/stemmer.gz"),
|
||||||
|
//
|
||||||
|
Arguments.of("nb_no", "nb_no/stemmer.gz"),
|
||||||
|
//
|
||||||
|
Arguments.of("nl_nl", "nl_nl/stemmer.gz"),
|
||||||
|
//
|
||||||
|
Arguments.of("nn_no", "nn_no/stemmer.gz"),
|
||||||
|
//
|
||||||
|
Arguments.of("pl_pl", "pl_pl/stemmer.gz"),
|
||||||
|
//
|
||||||
|
Arguments.of("pt_pt", "pt_pt/stemmer.gz"),
|
||||||
|
//
|
||||||
|
Arguments.of("ru_ru", "ru_ru/stemmer.gz"),
|
||||||
|
//
|
||||||
|
Arguments.of("sv_se", "sv_se/stemmer.gz"),
|
||||||
|
//
|
||||||
|
Arguments.of("uk_ua", "uk_ua/stemmer.gz"),
|
||||||
|
//
|
||||||
|
Arguments.of("us_uk", "us_uk/stemmer.gz"),
|
||||||
|
//
|
||||||
|
Arguments.of("yi", "yi/stemmer.gz"));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Nested
|
@Nested
|
||||||
@@ -153,9 +191,10 @@ final class CompileIntegrationTest {
|
|||||||
* create nested output directories, preserve expected lookup behavior, and
|
* create nested output directories, preserve expected lookup behavior, and
|
||||||
* store canonical stems when {@code --store-original} is enabled.
|
* store canonical stems when {@code --store-original} is enabled.
|
||||||
*
|
*
|
||||||
* @throws IOException if reading or writing fails
|
* @throws IOException if reading or writing fails
|
||||||
*/
|
*/
|
||||||
@Test
|
@Test
|
||||||
|
@Tag("slow")
|
||||||
@DisplayName("CLI should compile the remark-aware fixture and preserve expected lookups")
|
@DisplayName("CLI should compile the remark-aware fixture and preserve expected lookups")
|
||||||
void shouldCompileRemarkAwareFixtureAndPreserveExpectedLookups() throws IOException {
|
void shouldCompileRemarkAwareFixtureAndPreserveExpectedLookups() throws IOException {
|
||||||
final Path inputFile = copyResourceToTemporaryFile(REMARK_AWARE_DICTIONARY_RESOURCE,
|
final Path inputFile = copyResourceToTemporaryFile(REMARK_AWARE_DICTIONARY_RESOURCE,
|
||||||
@@ -198,9 +237,10 @@ final class CompileIntegrationTest {
|
|||||||
* Verifies that the CLI rejects an already existing output path unless
|
* Verifies that the CLI rejects an already existing output path unless
|
||||||
* overwrite is explicitly enabled.
|
* overwrite is explicitly enabled.
|
||||||
*
|
*
|
||||||
* @throws IOException if reading or writing fails
|
* @throws IOException if reading or writing fails
|
||||||
*/
|
*/
|
||||||
@Test
|
@Test
|
||||||
|
@Tag("slow")
|
||||||
@DisplayName("CLI should require overwrite before replacing an existing output artifact")
|
@DisplayName("CLI should require overwrite before replacing an existing output artifact")
|
||||||
void shouldRequireOverwriteForExistingOutput() throws IOException {
|
void shouldRequireOverwriteForExistingOutput() throws IOException {
|
||||||
final Path inputFile = copyResourceToTemporaryFile(REMARK_AWARE_DICTIONARY_RESOURCE,
|
final Path inputFile = copyResourceToTemporaryFile(REMARK_AWARE_DICTIONARY_RESOURCE,
|
||||||
@@ -256,36 +296,42 @@ final class CompileIntegrationTest {
|
|||||||
"A preferred patch must be available for fixture word '" + word + "'."),
|
"A preferred patch must be available for fixture word '" + word + "'."),
|
||||||
() -> assertEquals(expectedStems, actualStems,
|
() -> assertEquals(expectedStems, actualStems,
|
||||||
"Fixture word '" + word + "' must preserve all expected stem candidates."),
|
"Fixture word '" + word + "' must preserve all expected stem candidates."),
|
||||||
() -> assertTrue(expectedStems.contains(PatchCommandEncoder.apply(word, preferredPatch)),
|
() -> assertTrue(
|
||||||
|
expectedStems.contains(
|
||||||
|
PatchCommandEncoder.apply(word, preferredPatch, trie.traversalDirection())),
|
||||||
"The preferred stem must be one of the acceptable stems for fixture word '" + word + "'."));
|
"The preferred stem must be one of the acceptable stems for fixture word '" + word + "'."));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Nested
|
@Nested
|
||||||
@DisplayName("Bundled project dictionary workflows")
|
@DisplayName("Bundled project dictionary workflows")
|
||||||
|
@Tag("slow")
|
||||||
final class BundledProjectDictionaryWorkflows {
|
final class BundledProjectDictionaryWorkflows {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Verifies that the CLI can compile each bundled project dictionary, create a
|
* Verifies that the CLI can compile each bundled project dictionary, create a
|
||||||
* compressed artifact, reload it, and preserve representative variant lookup
|
* compressed artifact, reload it, and preserve representative variant stemming
|
||||||
* behavior derived from the source dictionary itself.
|
* behavior derived from the source dictionary itself at the level of acceptable
|
||||||
|
* reconstructed candidates.
|
||||||
*
|
*
|
||||||
* <p>
|
* <p>
|
||||||
* The representative assertions intentionally target only variant terms, not
|
* Representative probes are derived directly from the same bundled source
|
||||||
* canonical stems, because direct lookup of the canonical stem is not part of
|
* dictionary that is being compiled. Items containing Unicode whitespace are
|
||||||
* the default non-{@code --store-original} contract.
|
* intentionally ignored by the representative-probe helper because the current
|
||||||
|
* probe policy does not yet support multi-token dictionary items.
|
||||||
* </p>
|
* </p>
|
||||||
*
|
*
|
||||||
* @param scenario scenario identifier
|
* @param scenario scenario identifier
|
||||||
* @param resourcePath bundled dictionary resource path
|
* @param resourcePath bundled dictionary resource path
|
||||||
* @throws IOException if reading or writing fails
|
* @throws IOException if reading or writing fails
|
||||||
*/
|
*/
|
||||||
@ParameterizedTest(name = "[{index}] {0}")
|
@ParameterizedTest(name = "[{index}] {0}")
|
||||||
@MethodSource("org.egothor.stemmer.CompileIntegrationTest#bundledDictionaryCases")
|
@MethodSource("org.egothor.stemmer.CompileIntegrationTest#bundledDictionaryCases")
|
||||||
|
@Tag("slow")
|
||||||
@DisplayName("CLI should compile bundled project dictionaries and preserve representative variant semantics")
|
@DisplayName("CLI should compile bundled project dictionaries and preserve representative variant semantics")
|
||||||
void shouldCompileBundledProjectDictionaryAndPreserveRepresentativeVariantSemantics(final String scenario,
|
void shouldCompileBundledProjectDictionaryAndPreserveRepresentativeVariantSemantics(final String scenario,
|
||||||
final String resourcePath) throws IOException {
|
final String resourcePath) throws IOException {
|
||||||
final Path inputFile = copyResourceToTemporaryFile(resourcePath, scenario + "-stemmer.txt");
|
final Path inputFile = copyResourceToTemporaryFile(resourcePath, scenario + "-stemmer.gz");
|
||||||
final Path outputFile = tempDir.resolve("bundled").resolve(scenario).resolve("compiled.dat.gz");
|
final Path outputFile = tempDir.resolve("bundled").resolve(scenario).resolve("compiled.dat.gz");
|
||||||
|
|
||||||
final CommandResult result = runWithCapturedStandardError("--input", inputFile.toString(), "--output",
|
final CommandResult result = runWithCapturedStandardError("--input", inputFile.toString(), "--output",
|
||||||
@@ -301,14 +347,17 @@ final class CompileIntegrationTest {
|
|||||||
final Map<String, Set<String>> representativeStemsByVariant = readRepresentativeVariantExpectations(
|
final Map<String, Set<String>> representativeStemsByVariant = readRepresentativeVariantExpectations(
|
||||||
resourcePath, REPRESENTATIVE_VARIANT_LIMIT);
|
resourcePath, REPRESENTATIVE_VARIANT_LIMIT);
|
||||||
|
|
||||||
assertFalse(representativeStemsByVariant.isEmpty(),
|
assertFalse(representativeStemsByVariant.isEmpty(), "The bundled dictionary must provide at least one "
|
||||||
"The bundled dictionary must provide at least one representative variant for " + scenario + '.');
|
+ "representative variant without Unicode whitespace for " + scenario + '.');
|
||||||
|
|
||||||
for (Map.Entry<String, Set<String>> entry : representativeStemsByVariant.entrySet()) {
|
for (Map.Entry<String, Set<String>> entry : representativeStemsByVariant.entrySet()) {
|
||||||
final String variant = entry.getKey();
|
final String variant = entry.getKey().toLowerCase(Locale.ROOT);
|
||||||
final Set<String> expectedStems = entry.getValue();
|
final Set<String> expectedStems = entry.getValue().stream().map(s -> s.toLowerCase(Locale.ROOT))
|
||||||
|
.collect(Collectors.toUnmodifiableSet());
|
||||||
final String preferredPatch = trie.get(variant);
|
final String preferredPatch = trie.get(variant);
|
||||||
final Set<String> actualStems = reconstructAllStemCandidates(trie, variant);
|
final Set<String> actualStems = reconstructAllStemCandidates(trie, variant);
|
||||||
|
final String preferredStem = preferredPatch == null ? null
|
||||||
|
: PatchCommandEncoder.apply(variant, preferredPatch, trie.traversalDirection());
|
||||||
|
|
||||||
assertAll(
|
assertAll(
|
||||||
() -> assertNotNull(preferredPatch,
|
() -> assertNotNull(preferredPatch,
|
||||||
@@ -317,13 +366,22 @@ final class CompileIntegrationTest {
|
|||||||
() -> assertFalse(actualStems.isEmpty(),
|
() -> assertFalse(actualStems.isEmpty(),
|
||||||
"At least one stem candidate must be returned for representative variant '" + variant
|
"At least one stem candidate must be returned for representative variant '" + variant
|
||||||
+ "' in " + scenario + '.'),
|
+ "' in " + scenario + '.'),
|
||||||
() -> assertTrue(actualStems.containsAll(expectedStems),
|
() -> assertTrue(expectedStems.stream().anyMatch(actualStems::contains),
|
||||||
"All acceptable stems must be preserved for representative variant '" + variant
|
"At least one acceptable stem must be preserved for representative variant '" + variant
|
||||||
+ "' in " + scenario + ". Expected=" + expectedStems + ", actual="
|
+ "' in " + scenario + ". Expected one of=" + expectedStems + ", actual="
|
||||||
+ actualStems),
|
+ actualStems),
|
||||||
() -> assertTrue(expectedStems.contains(PatchCommandEncoder.apply(variant, preferredPatch)),
|
() -> {
|
||||||
"The preferred stem must be one of the acceptable stems for representative variant '"
|
if (expectedStems.size() == 1 && actualStems.size() == 1) {
|
||||||
+ variant + "' in " + scenario + '.'));
|
assertEquals(expectedStems.iterator().next(), preferredStem,
|
||||||
|
"The preferred stem must match the only expected surviving stem for "
|
||||||
|
+ "representative variant '" + variant + "' in " + scenario + '.');
|
||||||
|
} else {
|
||||||
|
assertTrue(expectedStems.contains(preferredStem) || actualStems.contains(preferredStem),
|
||||||
|
"The preferred stem must remain among the reconstructed candidates for "
|
||||||
|
+ "representative variant '" + variant + "' in " + scenario
|
||||||
|
+ ". Preferred=" + preferredStem + ", actual=" + actualStems);
|
||||||
|
}
|
||||||
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -371,25 +429,30 @@ final class CompileIntegrationTest {
|
|||||||
* Reads representative variant expectations from a bundled project dictionary.
|
* Reads representative variant expectations from a bundled project dictionary.
|
||||||
*
|
*
|
||||||
* <p>
|
* <p>
|
||||||
* This helper scans the source dictionary in a byte-preserving single-byte
|
* This helper scans the source dictionary as UTF-8 text and derives
|
||||||
* charset and selects only ASCII-safe probe terms. That keeps the
|
* representative stem-to-variant expectations directly from that bundled
|
||||||
* multidictionary integration assertions stable even when the bundled resources
|
* source. Only dictionary items that do not contain Unicode whitespace are
|
||||||
* use heterogeneous encodings, while still validating the CLI against the real
|
* considered eligible representative probes. This keeps the multidictionary
|
||||||
* shipped dictionaries.
|
* integration assertions aligned with the current single-token probe policy
|
||||||
|
* while still validating the CLI against the real shipped dictionaries and
|
||||||
|
* their actual script repertoire.
|
||||||
* </p>
|
* </p>
|
||||||
*
|
*
|
||||||
* <p>
|
* <p>
|
||||||
* The dictionary format is expected to be:
|
* The bundled dictionary format is expected to be tab-separated values, meaning
|
||||||
|
* that columns are separated by the tab character:
|
||||||
* </p>
|
* </p>
|
||||||
*
|
*
|
||||||
* <pre>
|
* <pre>
|
||||||
* stem variant1 variant2 ...
|
* stem variant1 variant2 ...
|
||||||
* </pre>
|
* </pre>
|
||||||
*
|
*
|
||||||
* <p>
|
* <p>
|
||||||
* Lines beginning with comment prefixes or blank lines are ignored. Canonical
|
* Lines beginning with comment prefixes or blank lines are ignored. Canonical
|
||||||
* stems are intentionally excluded from the expectation map unless they also
|
* stems are intentionally excluded from the expectation map unless they also
|
||||||
* appear as distinct variants on a source line.
|
* appear as distinct variants on a source line. Dictionary items containing any
|
||||||
|
* Unicode whitespace are intentionally ignored by this representative-probe
|
||||||
|
* helper.
|
||||||
* </p>
|
* </p>
|
||||||
*
|
*
|
||||||
* @param resourcePath bundled dictionary resource path
|
* @param resourcePath bundled dictionary resource path
|
||||||
@@ -402,8 +465,9 @@ final class CompileIntegrationTest {
|
|||||||
final Map<String, Set<String>> expectations = new LinkedHashMap<String, Set<String>>();
|
final Map<String, Set<String>> expectations = new LinkedHashMap<String, Set<String>>();
|
||||||
|
|
||||||
try (InputStream inputStream = openResource(resourcePath);
|
try (InputStream inputStream = openResource(resourcePath);
|
||||||
|
InputStream decompressedStream = new GZIPInputStream(inputStream);
|
||||||
BufferedReader reader = new BufferedReader(
|
BufferedReader reader = new BufferedReader(
|
||||||
new InputStreamReader(inputStream, BUNDLED_PROBE_SCAN_CHARSET))) {
|
new InputStreamReader(decompressedStream, BUNDLED_PROBE_SCAN_CHARSET))) {
|
||||||
for (String line = reader.readLine(); line != null; line = reader.readLine()) {
|
for (String line = reader.readLine(); line != null; line = reader.readLine()) {
|
||||||
if (expectations.size() >= limit) {
|
if (expectations.size() >= limit) {
|
||||||
break;
|
break;
|
||||||
@@ -414,20 +478,20 @@ final class CompileIntegrationTest {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
final String[] tokens = trimmedLine.split("\\s+");
|
final String[] tokens = trimmedLine.split("\\t+");
|
||||||
if (tokens.length < 2) {
|
if (tokens.length < 2) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
final String stem = tokens[0];
|
final String stem = tokens[0];
|
||||||
if (!isAsciiProbeToken(stem)) {
|
if (containsWhitespaceCharacter(stem)) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int index = 1; index < tokens.length && expectations.size() < limit; index++) {
|
for (int index = 1; index < tokens.length && expectations.size() < limit; index++) {
|
||||||
final String variant = tokens[index];
|
final String variant = tokens[index];
|
||||||
|
|
||||||
if (!isAsciiProbeToken(variant) || variant.equals(stem)) {
|
if (containsWhitespaceCharacter(variant) || variant.equals(stem)) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -440,26 +504,24 @@ final class CompileIntegrationTest {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Determines whether one token is suitable for stable ASCII-safe bundled
|
* Determines whether one token contains any Unicode whitespace character.
|
||||||
* multidictionary probing.
|
|
||||||
*
|
*
|
||||||
* @param token token to inspect
|
* @param token token to inspect
|
||||||
* @return {@code true} when the token is a non-empty lower-case ASCII letter
|
* @return {@code true} when the token contains at least one whitespace
|
||||||
* sequence
|
* character
|
||||||
*/
|
*/
|
||||||
private static boolean isAsciiProbeToken(final String token) {
|
private static boolean containsWhitespaceCharacter(final String token) {
|
||||||
if (token == null || token.isEmpty()) {
|
if (token == null) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int index = 0; index < token.length(); index++) {
|
for (int index = 0; index < token.length(); index++) {
|
||||||
final char character = token.charAt(index);
|
if (Character.isWhitespace(token.charAt(index))) {
|
||||||
if (character < 'a' || character > 'z') {
|
return true;
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return true;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -495,7 +557,7 @@ final class CompileIntegrationTest {
|
|||||||
}
|
}
|
||||||
|
|
||||||
for (String patchCommand : patchCommands) {
|
for (String patchCommand : patchCommands) {
|
||||||
stems.add(PatchCommandEncoder.apply(word, patchCommand));
|
stems.add(PatchCommandEncoder.apply(word, patchCommand, trie.traversalDirection()));
|
||||||
}
|
}
|
||||||
|
|
||||||
return stems;
|
return stems;
|
||||||
|
|||||||
@@ -66,7 +66,10 @@ import org.junit.jupiter.api.io.TempDir;
|
|||||||
* {@link System#exit(int)}.
|
* {@link System#exit(int)}.
|
||||||
* </p>
|
* </p>
|
||||||
*/
|
*/
|
||||||
@Tag("unit")
|
@Tag("integration")
|
||||||
|
@Tag("cli")
|
||||||
|
@Tag("compile")
|
||||||
|
@Tag("stemmer")
|
||||||
@DisplayName("Compile")
|
@DisplayName("Compile")
|
||||||
class CompileTest {
|
class CompileTest {
|
||||||
|
|
||||||
@@ -342,8 +345,8 @@ class CompileTest {
|
|||||||
private Path createMinimalDictionaryFile(final String fileName) throws Exception {
|
private Path createMinimalDictionaryFile(final String fileName) throws Exception {
|
||||||
final Path inputFile = temporaryDirectory.resolve(fileName);
|
final Path inputFile = temporaryDirectory.resolve(fileName);
|
||||||
|
|
||||||
final String content = "" + "# minimal dictionary for CLI tests\n" + "run running runs runner\n"
|
final String content = "" + "# minimal dictionary for CLI tests\n" + "run running runs runner\n"
|
||||||
+ "walk walking walks walked\n";
|
+ "walk walking walks walked\n";
|
||||||
|
|
||||||
Files.writeString(inputFile, content, StandardCharsets.UTF_8);
|
Files.writeString(inputFile, content, StandardCharsets.UTF_8);
|
||||||
return inputFile;
|
return inputFile;
|
||||||
|
|||||||
@@ -31,11 +31,11 @@
|
|||||||
package org.egothor.stemmer;
|
package org.egothor.stemmer;
|
||||||
|
|
||||||
import static org.junit.jupiter.api.Assertions.assertAll;
|
import static org.junit.jupiter.api.Assertions.assertAll;
|
||||||
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
|
|
||||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
import static org.junit.jupiter.api.Assertions.assertFalse;
|
import static org.junit.jupiter.api.Assertions.assertFalse;
|
||||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||||
|
|
||||||
|
import java.io.ByteArrayInputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.util.LinkedHashSet;
|
import java.util.LinkedHashSet;
|
||||||
@@ -56,9 +56,8 @@ import org.junit.jupiter.params.provider.MethodSource;
|
|||||||
*
|
*
|
||||||
* <p>
|
* <p>
|
||||||
* This suite protects the binary persistence contract of compiled tries by
|
* This suite protects the binary persistence contract of compiled tries by
|
||||||
* comparing freshly compiled artifacts against checked-in golden GZip outputs.
|
* validating committed golden GZip outputs and verifying representative
|
||||||
* It also verifies SHA-256 digests and representative semantic probes after
|
* semantic probes after loading both historical and freshly compiled artifacts.
|
||||||
* loading the produced artifact back.
|
|
||||||
*
|
*
|
||||||
* <p>
|
* <p>
|
||||||
* The goal is to catch unintended changes in:
|
* The goal is to catch unintended changes in:
|
||||||
@@ -67,14 +66,15 @@ import org.junit.jupiter.params.provider.MethodSource;
|
|||||||
* <li>canonical subtree reduction</li>
|
* <li>canonical subtree reduction</li>
|
||||||
* <li>child ordering and node numbering</li>
|
* <li>child ordering and node numbering</li>
|
||||||
* <li>value ordering and frequency handling</li>
|
* <li>value ordering and frequency handling</li>
|
||||||
* <li>stream layout and binary format stability</li>
|
* <li>stream layout backward readability</li>
|
||||||
* <li>compressed artifact reproducibility</li>
|
* <li>compressed artifact reproducibility within the active format version</li>
|
||||||
* </ul>
|
* </ul>
|
||||||
*/
|
*/
|
||||||
@Tag("unit")
|
@Tag("compat")
|
||||||
@Tag("regression")
|
@Tag("regression")
|
||||||
@Tag("determinism")
|
@Tag("determinism")
|
||||||
@Tag("serialization")
|
@Tag("serialization")
|
||||||
|
@Tag("trie")
|
||||||
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
|
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
|
||||||
final class CompiledTrieArtifactRegressionTest {
|
final class CompiledTrieArtifactRegressionTest {
|
||||||
|
|
||||||
@@ -127,37 +127,26 @@ final class CompiledTrieArtifactRegressionTest {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Verifies that a newly compiled artifact matches the committed golden file,
|
* Verifies that each committed golden artifact remains internally consistent,
|
||||||
* matches the committed hash, and remains semantically valid when loaded back.
|
* matches its committed digest, and can still be read by the current binary
|
||||||
|
* loader.
|
||||||
*
|
*
|
||||||
* @param artifactCase regression case
|
* @param artifactCase regression case
|
||||||
* @throws IOException if test I/O fails
|
* @throws IOException if test I/O fails
|
||||||
*/
|
*/
|
||||||
@ParameterizedTest(name = "{0}")
|
@ParameterizedTest(name = "{0}")
|
||||||
@MethodSource("artifactCases")
|
@MethodSource("artifactCases")
|
||||||
@DisplayName("Compiled trie artifact must remain byte-for-byte stable")
|
@DisplayName("Committed golden artifacts must remain readable and hash-stable")
|
||||||
void shouldMatchGoldenArtifactAndExpectedHash(final ArtifactCase artifactCase) throws IOException {
|
void shouldKeepGoldenArtifactReadableAndHashStable(final ArtifactCase artifactCase) throws IOException {
|
||||||
final Path sourcePath = RegressionArtifactSupport.copyResourceToFile(artifactCase.sourceResource(),
|
|
||||||
this.tempDir.resolve(artifactCase.id() + ".stemmer"));
|
|
||||||
|
|
||||||
final Path actualArtifactPath = this.tempDir.resolve(artifactCase.id() + ".gz");
|
|
||||||
final byte[] actualArtifactBytes = RegressionArtifactSupport.compileToArtifact(sourcePath,
|
|
||||||
artifactCase.storeOriginal(), artifactCase.reductionSettings(), actualArtifactPath);
|
|
||||||
|
|
||||||
final byte[] goldenArtifactBytes = RegressionArtifactSupport
|
final byte[] goldenArtifactBytes = RegressionArtifactSupport
|
||||||
.readResourceBytes(artifactCase.goldenArtifactResource());
|
.readResourceBytes(artifactCase.goldenArtifactResource());
|
||||||
final String expectedSha256 = RegressionArtifactSupport.readSha256Resource(artifactCase.sha256Resource());
|
final String expectedSha256 = RegressionArtifactSupport.readSha256Resource(artifactCase.sha256Resource());
|
||||||
|
final FrequencyTrie<String> trie = StemmerPatchTrieBinaryIO.read(new ByteArrayInputStream(goldenArtifactBytes));
|
||||||
|
|
||||||
assertAll(
|
assertAll(
|
||||||
() -> assertArrayEquals(goldenArtifactBytes, actualArtifactBytes,
|
|
||||||
RegressionArtifactSupport.mismatchMessage(artifactCase.id(), expectedSha256,
|
|
||||||
RegressionArtifactSupport.sha256Hex(actualArtifactBytes), actualArtifactPath)),
|
|
||||||
|
|
||||||
() -> assertEquals(expectedSha256, RegressionArtifactSupport.sha256Hex(actualArtifactBytes),
|
|
||||||
"Freshly compiled artifact SHA-256 must match the committed regression hash."),
|
|
||||||
|
|
||||||
() -> assertEquals(expectedSha256, RegressionArtifactSupport.sha256Hex(goldenArtifactBytes),
|
() -> assertEquals(expectedSha256, RegressionArtifactSupport.sha256Hex(goldenArtifactBytes),
|
||||||
"Golden artifact SHA-256 must match its committed sidecar hash."));
|
"Golden artifact SHA-256 must match its committed sidecar hash."),
|
||||||
|
() -> assertGoldenArtifactSemanticProbes(trie, artifactCase));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -181,7 +170,7 @@ final class CompiledTrieArtifactRegressionTest {
|
|||||||
final byte[] secondArtifactBytes = RegressionArtifactSupport.compileToArtifactBytes(sourcePath,
|
final byte[] secondArtifactBytes = RegressionArtifactSupport.compileToArtifactBytes(sourcePath,
|
||||||
artifactCase.storeOriginal(), artifactCase.reductionSettings());
|
artifactCase.storeOriginal(), artifactCase.reductionSettings());
|
||||||
|
|
||||||
assertArrayEquals(firstArtifactBytes, secondArtifactBytes,
|
org.junit.jupiter.api.Assertions.assertArrayEquals(firstArtifactBytes, secondArtifactBytes,
|
||||||
"Two consecutive compilations of the same source must produce identical artifact bytes.");
|
"Two consecutive compilations of the same source must produce identical artifact bytes.");
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -209,8 +198,8 @@ final class CompiledTrieArtifactRegressionTest {
|
|||||||
final String[] allPatchCommands = trie.getAll(probe.word());
|
final String[] allPatchCommands = trie.getAll(probe.word());
|
||||||
final String preferredPatchCommand = trie.get(probe.word());
|
final String preferredPatchCommand = trie.get(probe.word());
|
||||||
final String preferredStem = preferredPatchCommand == null ? null
|
final String preferredStem = preferredPatchCommand == null ? null
|
||||||
: PatchCommandEncoder.apply(probe.word(), preferredPatchCommand);
|
: PatchCommandEncoder.apply(probe.word(), preferredPatchCommand, trie.traversalDirection());
|
||||||
final Set<String> allStems = reconstructStemCandidates(probe.word(), allPatchCommands);
|
final Set<String> allStems = reconstructStemCandidates(trie, probe.word(), allPatchCommands);
|
||||||
|
|
||||||
assertAll(
|
assertAll(
|
||||||
() -> assertFalse(allPatchCommands.length == 0,
|
() -> assertFalse(allPatchCommands.length == 0,
|
||||||
@@ -233,7 +222,8 @@ final class CompiledTrieArtifactRegressionTest {
|
|||||||
* @param patchCommands serialized patch commands
|
* @param patchCommands serialized patch commands
|
||||||
* @return reconstructed stem candidates
|
* @return reconstructed stem candidates
|
||||||
*/
|
*/
|
||||||
private static Set<String> reconstructStemCandidates(final String word, final String[] patchCommands) {
|
private static Set<String> reconstructStemCandidates(final FrequencyTrie<String> trie, final String word,
|
||||||
|
final String[] patchCommands) {
|
||||||
final Set<String> stems = new LinkedHashSet<String>();
|
final Set<String> stems = new LinkedHashSet<String>();
|
||||||
|
|
||||||
if (patchCommands == null) {
|
if (patchCommands == null) {
|
||||||
@@ -241,12 +231,38 @@ final class CompiledTrieArtifactRegressionTest {
|
|||||||
}
|
}
|
||||||
|
|
||||||
for (String patchCommand : patchCommands) {
|
for (String patchCommand : patchCommands) {
|
||||||
stems.add(PatchCommandEncoder.apply(word, patchCommand));
|
stems.add(PatchCommandEncoder.apply(word, patchCommand, trie.traversalDirection()));
|
||||||
}
|
}
|
||||||
|
|
||||||
return stems;
|
return stems;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies representative semantic probes against one already loaded trie.
|
||||||
|
*
|
||||||
|
* @param trie trie to inspect
|
||||||
|
* @param artifactCase regression case providing the expected probes
|
||||||
|
*/
|
||||||
|
private static void assertGoldenArtifactSemanticProbes(final FrequencyTrie<String> trie,
|
||||||
|
final ArtifactCase artifactCase) {
|
||||||
|
for (ProbeExpectation probe : artifactCase.probes()) {
|
||||||
|
final String[] allPatchCommands = trie.getAll(probe.word());
|
||||||
|
final String preferredPatchCommand = trie.get(probe.word());
|
||||||
|
final String preferredStem = preferredPatchCommand == null ? null
|
||||||
|
: PatchCommandEncoder.apply(probe.word(), preferredPatchCommand, trie.traversalDirection());
|
||||||
|
final Set<String> allStems = reconstructStemCandidates(trie, probe.word(), allPatchCommands);
|
||||||
|
|
||||||
|
assertAll(
|
||||||
|
() -> assertFalse(allPatchCommands.length == 0,
|
||||||
|
"Representative probe must produce at least one result for word: " + probe.word()),
|
||||||
|
() -> assertEquals(probe.preferredStem(), preferredStem,
|
||||||
|
"Preferred stem mismatch for representative probe word: " + probe.word()),
|
||||||
|
() -> assertTrue(allStems.containsAll(probe.acceptableStems()),
|
||||||
|
"All acceptable stems must be present in getAll() for representative probe word: "
|
||||||
|
+ probe.word()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Immutable regression case definition.
|
* Immutable regression case definition.
|
||||||
*
|
*
|
||||||
|
|||||||
110
src/test/java/org/egothor/stemmer/DiacriticStripperTest.java
Normal file
110
src/test/java/org/egothor/stemmer/DiacriticStripperTest.java
Normal file
@@ -0,0 +1,110 @@
|
|||||||
|
/*******************************************************************************
|
||||||
|
* Copyright (C) 2026, Leo Galambos
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer in the documentation
|
||||||
|
* and/or other materials provided with the distribution.
|
||||||
|
*
|
||||||
|
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||||
|
* may be used to endorse or promote products derived from this software
|
||||||
|
* without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||||
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
******************************************************************************/
|
||||||
|
package org.egothor.stemmer;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertSame;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.DisplayName;
|
||||||
|
import org.junit.jupiter.api.Tag;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Unit tests for {@link DiacriticStripper}.
|
||||||
|
*/
|
||||||
|
@Tag("unit")
|
||||||
|
@Tag("diacritic")
|
||||||
|
@Tag("stemmer")
|
||||||
|
@DisplayName("DiacriticStripper")
|
||||||
|
class DiacriticStripperTest {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that pure ASCII input is returned unchanged and without allocating a
|
||||||
|
* new string instance.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@DisplayName("ASCII input is returned as-is")
|
||||||
|
void asciiInputIsReturnedAsIs() {
|
||||||
|
final String input = "plain-ascii-123";
|
||||||
|
|
||||||
|
final String stripped = DiacriticStripper.strip(input);
|
||||||
|
|
||||||
|
assertSame(input, stripped);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies direct-table replacements for Czech and other common diacritics.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@DisplayName("Direct replacement table strips common diacritics")
|
||||||
|
void directReplacementTableStripsCommonDiacritics() {
|
||||||
|
assertEquals("prilis zlutoucky kun", DiacriticStripper.strip("příliš žluťoučký kůň"));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies explicit multi-character replacements for ligatures and sharp s.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@DisplayName("Special replacements support multi-character ASCII output")
|
||||||
|
void specialReplacementsSupportMultiCharacterAsciiOutput() {
|
||||||
|
assertEquals("strasse AEsir and OEuvre", DiacriticStripper.strip("straße Æsir and Œuvre"));
|
||||||
|
assertEquals("aether oeuvre", DiacriticStripper.strip("æther œuvre"));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies Unicode decomposition fallback for characters not in the direct
|
||||||
|
* replacement table.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@DisplayName("Unicode decomposition fallback strips combining marks")
|
||||||
|
void unicodeDecompositionFallbackStripsCombiningMarks() {
|
||||||
|
assertEquals("I", DiacriticStripper.strip("İ"));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies behavior for non-Latin letters that cannot be mapped to ASCII.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@DisplayName("Unmappable non-Latin characters remain unchanged")
|
||||||
|
void unmappableNonLatinCharactersRemainUnchanged() {
|
||||||
|
assertEquals("abcЖxyz", DiacriticStripper.strip("abcЖxyz"));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies mixed input where normalization starts mid-string and subsequent
|
||||||
|
* unchanged characters are preserved.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@DisplayName("Mixed input preserves untouched characters after normalization starts")
|
||||||
|
void mixedInputPreservesUntouchedCharactersAfterNormalizationStarts() {
|
||||||
|
assertEquals("Cafe-123", DiacriticStripper.strip("Café-123"));
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -59,7 +59,7 @@ import org.junit.jupiter.api.Test;
|
|||||||
*/
|
*/
|
||||||
@DisplayName("FrequencyTrieBuilders")
|
@DisplayName("FrequencyTrieBuilders")
|
||||||
@Tag("unit")
|
@Tag("unit")
|
||||||
@Tag("builder")
|
@Tag("construction")
|
||||||
@Tag("frequency-trie")
|
@Tag("frequency-trie")
|
||||||
class FrequencyTrieBuildersTest {
|
class FrequencyTrieBuildersTest {
|
||||||
|
|
||||||
|
|||||||
@@ -47,7 +47,7 @@ import java.util.List;
|
|||||||
import net.jqwik.api.ForAll;
|
import net.jqwik.api.ForAll;
|
||||||
import net.jqwik.api.Label;
|
import net.jqwik.api.Label;
|
||||||
import net.jqwik.api.Property;
|
import net.jqwik.api.Property;
|
||||||
import net.jqwik.api.Tag;
|
import org.junit.jupiter.api.Tag;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Property-based tests for the compiled trie abstraction.
|
* Property-based tests for the compiled trie abstraction.
|
||||||
@@ -59,9 +59,9 @@ import net.jqwik.api.Tag;
|
|||||||
* core algorithm without overfitting to particular fixture data.
|
* core algorithm without overfitting to particular fixture data.
|
||||||
*/
|
*/
|
||||||
@Label("FrequencyTrie properties")
|
@Label("FrequencyTrie properties")
|
||||||
@Tag("unit")
|
|
||||||
@Tag("property")
|
@Tag("property")
|
||||||
@Tag("trie")
|
@Tag("trie")
|
||||||
|
@Tag("frequency-trie")
|
||||||
class FrequencyTrieProperties extends PropertyBasedTestSupport {
|
class FrequencyTrieProperties extends PropertyBasedTestSupport {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@@ -33,6 +33,7 @@ package org.egothor.stemmer;
|
|||||||
import static org.junit.jupiter.api.Assertions.assertAll;
|
import static org.junit.jupiter.api.Assertions.assertAll;
|
||||||
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
|
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
|
||||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertFalse;
|
||||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||||
import static org.junit.jupiter.api.Assertions.assertNull;
|
import static org.junit.jupiter.api.Assertions.assertNull;
|
||||||
import static org.junit.jupiter.api.Assertions.assertSame;
|
import static org.junit.jupiter.api.Assertions.assertSame;
|
||||||
@@ -201,6 +202,84 @@ class FrequencyTrieTest {
|
|||||||
() -> assertArrayEquals(new String[] { "noun", "agent" }, trie.getAll("runner")));
|
() -> assertArrayEquals(new String[] { "noun", "agent" }, trie.getAll("runner")));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that lookup-time key normalization follows persisted case processing
|
||||||
|
* metadata.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@DisplayName("Lookup applies lowercase normalization when metadata requires it")
|
||||||
|
void lookupAppliesLowercaseNormalizationWhenMetadataRequiresIt() {
|
||||||
|
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(String[]::new,
|
||||||
|
ReductionSettings.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
||||||
|
WordTraversalDirection.BACKWARD, CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT);
|
||||||
|
builder.put("house", "noun");
|
||||||
|
builder.put("house", "verb");
|
||||||
|
|
||||||
|
final FrequencyTrie<String> trie = builder.build();
|
||||||
|
|
||||||
|
assertAll(() -> assertEquals("noun", trie.get("HOUSE")),
|
||||||
|
() -> assertArrayEquals(new String[] { "noun", "verb" }, trie.getAll("HoUsE")));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that REMOVE mode strips diacritics both at build time and at lookup
|
||||||
|
* time and composes independently with case normalization.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@DisplayName("Diacritic REMOVE mode strips dictionary and lookup keys")
|
||||||
|
void diacriticRemoveModeStripsDictionaryAndLookupKeys() {
|
||||||
|
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(String[]::new,
|
||||||
|
ReductionSettings.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
||||||
|
WordTraversalDirection.BACKWARD, CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT,
|
||||||
|
DiacriticProcessingMode.REMOVE);
|
||||||
|
builder.put("Příliš", "cz");
|
||||||
|
builder.put("žluťoučký", "cz2");
|
||||||
|
builder.put("Smørrebrød", "da");
|
||||||
|
|
||||||
|
final FrequencyTrie<String> trie = builder.build();
|
||||||
|
|
||||||
|
assertAll(
|
||||||
|
() -> assertEquals("cz", trie.get("PRILIS")),
|
||||||
|
() -> assertEquals("cz", trie.get("příliš")),
|
||||||
|
() -> assertEquals("cz2", trie.get("zlutoucky")),
|
||||||
|
() -> assertEquals("da", trie.get("SMORREBROD")),
|
||||||
|
() -> assertArrayEquals(new String[] { "cz" }, trie.getAll("prilis")));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that fallback diacritic mode is explicitly rejected for now.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@DisplayName("AS_IS_AND_STRIPPED_FALLBACK mode is not supported yet")
|
||||||
|
void fallbackDiacriticModeIsNotSupportedYet() {
|
||||||
|
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(String[]::new,
|
||||||
|
ReductionSettings.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
||||||
|
WordTraversalDirection.BACKWARD, CaseProcessingMode.AS_IS,
|
||||||
|
DiacriticProcessingMode.AS_IS_AND_STRIPPED_FALLBACK);
|
||||||
|
|
||||||
|
final UnsupportedOperationException exception = assertThrows(UnsupportedOperationException.class,
|
||||||
|
() -> builder.put("kůň", "horse"));
|
||||||
|
assertTrue(exception.getMessage().contains("not supported yet"));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that lookup preserves casing when metadata uses AS_IS mode.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@DisplayName("Lookup keeps case-sensitive behavior when metadata is AS_IS")
|
||||||
|
void lookupKeepsCaseSensitiveBehaviorWhenMetadataIsAsIs() {
|
||||||
|
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(String[]::new,
|
||||||
|
ReductionSettings.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
||||||
|
WordTraversalDirection.BACKWARD, CaseProcessingMode.AS_IS);
|
||||||
|
builder.put("House", "noun");
|
||||||
|
|
||||||
|
final FrequencyTrie<String> trie = builder.build();
|
||||||
|
|
||||||
|
assertAll(() -> assertEquals("noun", trie.get("House")), () -> assertNull(trie.get("house")),
|
||||||
|
() -> assertArrayEquals(new String[] { "noun" }, trie.getAll("House")),
|
||||||
|
() -> assertArrayEquals(new String[0], trie.getAll("HOUSE")));
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Verifies that a missing path below an existing prefix returns empty results.
|
* Verifies that a missing path below an existing prefix returns empty results.
|
||||||
*/
|
*/
|
||||||
@@ -301,6 +380,24 @@ class FrequencyTrieTest {
|
|||||||
assertThrows(UnsupportedOperationException.class, () -> entries.add(new ValueCount<String>("z", 1)));
|
assertThrows(UnsupportedOperationException.class, () -> entries.add(new ValueCount<String>("z", 1)));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that {@link FrequencyTrie#getEntries(String)} short-circuits to a one-item immutable list.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@DisplayName("getEntries returns a one-item list for single stored values")
|
||||||
|
void getEntriesReturnsSingleItemListForSingleStoredValue() {
|
||||||
|
final FrequencyTrie.Builder<String> builder = rankedBuilder();
|
||||||
|
|
||||||
|
builder.put("gamma", "only");
|
||||||
|
|
||||||
|
final FrequencyTrie<String> trie = builder.build();
|
||||||
|
|
||||||
|
final List<ValueCount<String>> entries = trie.getEntries("gamma");
|
||||||
|
|
||||||
|
assertAll(() -> assertEquals(List.of(new ValueCount<String>("only", 1)), entries),
|
||||||
|
() -> assertThrows(UnsupportedOperationException.class, () -> entries.add(new ValueCount<String>("z", 1))));
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Verifies that equal frequencies prefer the shorter string representation.
|
* Verifies that equal frequencies prefer the shorter string representation.
|
||||||
*/
|
*/
|
||||||
@@ -588,8 +685,15 @@ class FrequencyTrieTest {
|
|||||||
() -> assertEquals("prefix", trie.get("p19")), () -> assertEquals("mid", trie.get("p19x")),
|
() -> assertEquals("prefix", trie.get("p19")), () -> assertEquals("mid", trie.get("p19x")),
|
||||||
() -> assertArrayEquals(new String[] { "leaf" }, trie.getAll("p19xy")),
|
() -> assertArrayEquals(new String[] { "leaf" }, trie.getAll("p19xy")),
|
||||||
() -> assertArrayEquals(new String[] { "leaf-alt" }, trie.getAll("p19xz")),
|
() -> assertArrayEquals(new String[] { "leaf-alt" }, trie.getAll("p19xz")),
|
||||||
() -> assertEquals(82, buildTimeSize), () -> assertEquals(7, compiledSize),
|
() -> assertTrue(buildTimeSize > 0,
|
||||||
() -> assertEquals(1.0d - (7.0d / 82.0d), reductionRatio, 0.0000001d),
|
() -> "Build-time size must be positive, but was " + buildTimeSize + '.'),
|
||||||
|
() -> assertTrue(compiledSize > 0,
|
||||||
|
() -> "Compiled trie size must be positive, but was " + compiledSize + '.'),
|
||||||
|
() -> assertTrue(compiledSize < buildTimeSize,
|
||||||
|
() -> "Reduction must decrease the node count. Build-time size=" + buildTimeSize
|
||||||
|
+ ", compiled size=" + compiledSize + '.'),
|
||||||
|
() -> assertTrue(reductionRatio > 0.0d,
|
||||||
|
() -> "Reduction ratio must be positive, but was " + reductionRatio + '.'),
|
||||||
() -> assertTrue(reductionRatio >= 0.50d,
|
() -> assertTrue(reductionRatio >= 0.50d,
|
||||||
() -> "Expected at least 50% reduction, but build-time size was " + buildTimeSize
|
() -> "Expected at least 50% reduction, but build-time size was " + buildTimeSize
|
||||||
+ " and compiled size was " + compiledSize + ", giving ratio " + reductionRatio + '.'));
|
+ " and compiled size was " + compiledSize + ", giving ratio " + reductionRatio + '.'));
|
||||||
@@ -670,6 +774,115 @@ class FrequencyTrieTest {
|
|||||||
.readFrom(new ByteArrayInputStream(serializedEmptyTrie), String[]::new, null)));
|
.readFrom(new ByteArrayInputStream(serializedEmptyTrie), String[]::new, null)));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that reading a compiled trie with a negative max-expanded override
|
||||||
|
* smaller than -1 is rejected.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@Tag("persistence")
|
||||||
|
@DisplayName("readFrom rejects invalid maxExpandedIndex override")
|
||||||
|
void readFromRejectsInvalidMaxExpandedIndexOverride() {
|
||||||
|
final byte[] bytes = createSerializedStream(0x45475452, 1, 1, 0, new NodeWriter[] { dataOutput -> {
|
||||||
|
dataOutput.writeInt(0);
|
||||||
|
dataOutput.writeInt(0);
|
||||||
|
} });
|
||||||
|
|
||||||
|
final IllegalArgumentException exception = assertThrows(IllegalArgumentException.class,
|
||||||
|
() -> FrequencyTrie.readFrom(new ByteArrayInputStream(bytes), String[]::new, STRING_CODEC, -2));
|
||||||
|
|
||||||
|
assertEquals("maxExpandedIndex must be >= -1.", exception.getMessage());
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that the max-expanded override controls dense lookup materialization
|
||||||
|
* while preserving lookup semantics.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@Tag("persistence")
|
||||||
|
@DisplayName("readFrom respects dense lookup max-expanded index override")
|
||||||
|
void readFromRespectsDenseLookupMaxExpandedIndexOverride() throws IOException {
|
||||||
|
final FrequencyTrie.Builder<String> builder = rankedBuilder();
|
||||||
|
|
||||||
|
builder.put("a", "a");
|
||||||
|
builder.put("b", "b");
|
||||||
|
builder.put("c", "c");
|
||||||
|
builder.put("d", "d");
|
||||||
|
|
||||||
|
final FrequencyTrie<String> original = builder.build();
|
||||||
|
final ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
|
||||||
|
original.writeTo(outputStream, STRING_CODEC);
|
||||||
|
final byte[] serializedTrie = outputStream.toByteArray();
|
||||||
|
|
||||||
|
final FrequencyTrie<String> defaultDense = FrequencyTrie.readFrom(new ByteArrayInputStream(serializedTrie), String[]::new,
|
||||||
|
STRING_CODEC);
|
||||||
|
final FrequencyTrie<String> defaultDenseByNegative = FrequencyTrie.readFrom(new ByteArrayInputStream(serializedTrie),
|
||||||
|
String[]::new, STRING_CODEC, -1);
|
||||||
|
final FrequencyTrie<String> disabledDense = FrequencyTrie.readFrom(new ByteArrayInputStream(serializedTrie), String[]::new,
|
||||||
|
STRING_CODEC, 0);
|
||||||
|
|
||||||
|
assertAll(
|
||||||
|
() -> assertTrue(defaultDense.root().hasDenseLookup(),
|
||||||
|
"Default read should enable dense lookup for compact first-level edges."),
|
||||||
|
() -> assertTrue(defaultDenseByNegative.root().hasDenseLookup(),
|
||||||
|
"Negative override should use the default dense lookup span."),
|
||||||
|
() -> assertFalse(disabledDense.root().hasDenseLookup(),
|
||||||
|
"Zero override should disable dense lookup tables."),
|
||||||
|
() -> assertEquals(original.get("a"), disabledDense.get("a")),
|
||||||
|
() -> assertEquals(original.get("b"), disabledDense.get("b")),
|
||||||
|
() -> assertEquals(original.get("c"), disabledDense.get("c")),
|
||||||
|
() -> assertEquals(original.get("d"), disabledDense.get("d")),
|
||||||
|
() -> assertEquals(original.get("z"), disabledDense.get("z")));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that cyclic serialized node references are rejected as invalid
|
||||||
|
* serialization.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@Tag("persistence")
|
||||||
|
@DisplayName("readFrom rejects cyclic serialized node references")
|
||||||
|
void readFromRejectsCyclicSerializedNodeReferences() {
|
||||||
|
final byte[] bytes = createSerializedStream(0x45475452, 1, 2, 0, new NodeWriter[] {
|
||||||
|
dataOutput -> {
|
||||||
|
dataOutput.writeInt(1);
|
||||||
|
dataOutput.writeChar('b');
|
||||||
|
dataOutput.writeInt(1);
|
||||||
|
dataOutput.writeInt(0);
|
||||||
|
},
|
||||||
|
dataOutput -> {
|
||||||
|
dataOutput.writeInt(1);
|
||||||
|
dataOutput.writeChar('a');
|
||||||
|
dataOutput.writeInt(0);
|
||||||
|
dataOutput.writeInt(0);
|
||||||
|
} });
|
||||||
|
|
||||||
|
final IOException exception = assertThrows(IOException.class,
|
||||||
|
() -> FrequencyTrie.readFrom(new ByteArrayInputStream(bytes), String[]::new, STRING_CODEC));
|
||||||
|
|
||||||
|
assertTrue(exception.getMessage().contains("cyclic reference detected"));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that child node references outside the valid serialized range are
|
||||||
|
* rejected.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@Tag("persistence")
|
||||||
|
@DisplayName("readFrom rejects invalid child node identifiers")
|
||||||
|
void readFromRejectsInvalidChildNodeId() {
|
||||||
|
final byte[] bytes = createSerializedStream(0x45475452, 1, 1, 0, new NodeWriter[] { dataOutput -> {
|
||||||
|
dataOutput.writeInt(1);
|
||||||
|
dataOutput.writeChar('a');
|
||||||
|
dataOutput.writeInt(3);
|
||||||
|
dataOutput.writeInt(0);
|
||||||
|
} });
|
||||||
|
|
||||||
|
final IOException exception = assertThrows(IOException.class,
|
||||||
|
() -> FrequencyTrie.readFrom(new ByteArrayInputStream(bytes), String[]::new, STRING_CODEC));
|
||||||
|
|
||||||
|
assertTrue(exception.getMessage().contains("Invalid child node id"));
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Verifies that deserialization rejects an invalid stream magic header.
|
* Verifies that deserialization rejects an invalid stream magic header.
|
||||||
*/
|
*/
|
||||||
@@ -700,6 +913,27 @@ class FrequencyTrieTest {
|
|||||||
assertTrue(exception.getMessage().contains("Unsupported trie stream version"));
|
assertTrue(exception.getMessage().contains("Unsupported trie stream version"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that the latest stream version validates textual metadata blocks.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@Tag("persistence")
|
||||||
|
@DisplayName("readFrom rejects invalid textual metadata block")
|
||||||
|
void readFromRejectsInvalidTextualMetadataBlock() {
|
||||||
|
final int version = FrequencyTrie.currentFormatVersion();
|
||||||
|
final byte[] bytes = createSerializedStream(0x45475452, version, 1, 0, dataOutput -> {
|
||||||
|
dataOutput.writeUTF("not valid metadata");
|
||||||
|
}, new NodeWriter[] { dataOutput -> {
|
||||||
|
dataOutput.writeInt(0);
|
||||||
|
dataOutput.writeInt(0);
|
||||||
|
} });
|
||||||
|
|
||||||
|
final IOException exception = assertThrows(IOException.class,
|
||||||
|
() -> FrequencyTrie.readFrom(new ByteArrayInputStream(bytes), String[]::new, STRING_CODEC));
|
||||||
|
|
||||||
|
assertTrue(exception.getMessage().contains("Invalid metadata block"));
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Verifies that deserialization rejects a negative node count.
|
* Verifies that deserialization rejects a negative node count.
|
||||||
*/
|
*/
|
||||||
@@ -777,6 +1011,129 @@ class FrequencyTrieTest {
|
|||||||
assertTrue(exception.getMessage().contains("Non-positive stored count"));
|
assertTrue(exception.getMessage().contains("Non-positive stored count"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that legacy version 1 metadata uses compatibility defaults.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@Tag("persistence")
|
||||||
|
@DisplayName("readFrom supports legacy version 1 metadata")
|
||||||
|
void readFromSupportsLegacyVersionOneMetadata() throws IOException {
|
||||||
|
final byte[] bytes = createSerializedStream(0x45475452, 1, 1, 0, new NodeWriter[] { dataOutput -> {
|
||||||
|
dataOutput.writeInt(0);
|
||||||
|
dataOutput.writeInt(0);
|
||||||
|
} });
|
||||||
|
|
||||||
|
final FrequencyTrie<String> trie = FrequencyTrie.readFrom(new ByteArrayInputStream(bytes), String[]::new, STRING_CODEC);
|
||||||
|
|
||||||
|
assertEquals(TrieMetadata.legacy(1, WordTraversalDirection.BACKWARD), trie.metadata());
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that legacy version 2 metadata stores traversal direction and uses
|
||||||
|
* compatibility defaults for other values.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@Tag("persistence")
|
||||||
|
@DisplayName("readFrom supports legacy version 2 metadata")
|
||||||
|
void readFromSupportsLegacyVersionTwoMetadata() throws IOException {
|
||||||
|
final byte[] bytes = createSerializedStream(0x45475452, 2, 1, 0,
|
||||||
|
dataOutput -> dataOutput.writeInt(WordTraversalDirection.FORWARD.ordinal()), new NodeWriter[] { dataOutput -> {
|
||||||
|
dataOutput.writeInt(0);
|
||||||
|
dataOutput.writeInt(0);
|
||||||
|
} });
|
||||||
|
|
||||||
|
final FrequencyTrie<String> trie = FrequencyTrie.readFrom(new ByteArrayInputStream(bytes), String[]::new, STRING_CODEC);
|
||||||
|
|
||||||
|
assertEquals(TrieMetadata.legacy(2, WordTraversalDirection.FORWARD), trie.metadata());
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that version 3 metadata includes reduction and diacritic
|
||||||
|
* processing settings.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@Tag("persistence")
|
||||||
|
@DisplayName("readFrom parses version 3 metadata")
|
||||||
|
void readFromParsesVersionThreeMetadata() throws IOException {
|
||||||
|
final ReductionSettings reductionSettings = new ReductionSettings(
|
||||||
|
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS, 81, 4);
|
||||||
|
|
||||||
|
final byte[] bytes = createSerializedStream(0x45475452, 3, 1, 0,
|
||||||
|
dataOutput -> {
|
||||||
|
dataOutput.writeInt(WordTraversalDirection.BACKWARD.ordinal());
|
||||||
|
dataOutput.writeInt(reductionSettings.reductionMode().ordinal());
|
||||||
|
dataOutput.writeInt(reductionSettings.dominantWinnerMinPercent());
|
||||||
|
dataOutput.writeInt(reductionSettings.dominantWinnerOverSecondRatio());
|
||||||
|
dataOutput.writeInt(DiacriticProcessingMode.REMOVE.ordinal());
|
||||||
|
},
|
||||||
|
new NodeWriter[] { dataOutput -> {
|
||||||
|
dataOutput.writeInt(0);
|
||||||
|
dataOutput.writeInt(0);
|
||||||
|
} });
|
||||||
|
|
||||||
|
final FrequencyTrie<String> trie = FrequencyTrie.readFrom(new ByteArrayInputStream(bytes), String[]::new, STRING_CODEC);
|
||||||
|
final TrieMetadata metadata = trie.metadata();
|
||||||
|
|
||||||
|
assertAll(() -> assertEquals(3, metadata.formatVersion()),
|
||||||
|
() -> assertEquals(WordTraversalDirection.BACKWARD, metadata.traversalDirection()),
|
||||||
|
() -> assertEquals(reductionSettings, metadata.reductionSettings()),
|
||||||
|
() -> assertEquals(DiacriticProcessingMode.REMOVE, metadata.diacriticProcessingMode()),
|
||||||
|
() -> assertEquals(CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT, metadata.caseProcessingMode()));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that version 4 metadata additionally stores case-processing mode.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@Tag("persistence")
|
||||||
|
@DisplayName("readFrom parses version 4 case processing metadata")
|
||||||
|
void readFromParsesVersionFourCaseMetadata() throws IOException {
|
||||||
|
final ReductionSettings reductionSettings = new ReductionSettings(
|
||||||
|
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS, 75, 3);
|
||||||
|
|
||||||
|
final byte[] bytes = createSerializedStream(0x45475452, 4, 1, 0,
|
||||||
|
dataOutput -> {
|
||||||
|
dataOutput.writeInt(WordTraversalDirection.FORWARD.ordinal());
|
||||||
|
dataOutput.writeInt(reductionSettings.reductionMode().ordinal());
|
||||||
|
dataOutput.writeInt(reductionSettings.dominantWinnerMinPercent());
|
||||||
|
dataOutput.writeInt(reductionSettings.dominantWinnerOverSecondRatio());
|
||||||
|
dataOutput.writeInt(DiacriticProcessingMode.AS_IS.ordinal());
|
||||||
|
dataOutput.writeInt(CaseProcessingMode.AS_IS.ordinal());
|
||||||
|
},
|
||||||
|
new NodeWriter[] { dataOutput -> {
|
||||||
|
dataOutput.writeInt(0);
|
||||||
|
dataOutput.writeInt(0);
|
||||||
|
} });
|
||||||
|
|
||||||
|
final FrequencyTrie<String> trie = FrequencyTrie.readFrom(new ByteArrayInputStream(bytes), String[]::new, STRING_CODEC);
|
||||||
|
final TrieMetadata metadata = trie.metadata();
|
||||||
|
|
||||||
|
assertAll(() -> assertEquals(4, metadata.formatVersion()),
|
||||||
|
() -> assertEquals(WordTraversalDirection.FORWARD, metadata.traversalDirection()),
|
||||||
|
() -> assertEquals(reductionSettings, metadata.reductionSettings()),
|
||||||
|
() -> assertEquals(DiacriticProcessingMode.AS_IS, metadata.diacriticProcessingMode()),
|
||||||
|
() -> assertEquals(CaseProcessingMode.AS_IS, metadata.caseProcessingMode()));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that invalid legacy metadata ordinals are rejected by validation.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@Tag("persistence")
|
||||||
|
@DisplayName("readFrom rejects invalid metadata ordinal in legacy stream")
|
||||||
|
void readFromRejectsInvalidLegacyMetadataOrdinal() {
|
||||||
|
final byte[] bytes = createSerializedStream(0x45475452, 2, 1, 0,
|
||||||
|
dataOutput -> dataOutput.writeInt(999), new NodeWriter[] { dataOutput -> {
|
||||||
|
dataOutput.writeInt(0);
|
||||||
|
dataOutput.writeInt(0);
|
||||||
|
} });
|
||||||
|
|
||||||
|
final IOException exception = assertThrows(IOException.class,
|
||||||
|
() -> FrequencyTrie.readFrom(new ByteArrayInputStream(bytes), String[]::new, STRING_CODEC));
|
||||||
|
|
||||||
|
assertTrue(exception.getMessage().contains("Invalid traversal direction ordinal"));
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Writes one node body into a synthetic serialized trie stream.
|
* Writes one node body into a synthetic serialized trie stream.
|
||||||
*/
|
*/
|
||||||
@@ -804,6 +1161,24 @@ class FrequencyTrieTest {
|
|||||||
*/
|
*/
|
||||||
private static byte[] createSerializedStream(final int magic, final int version, final int nodeCount,
|
private static byte[] createSerializedStream(final int magic, final int version, final int nodeCount,
|
||||||
final int rootNodeId, final NodeWriter[] nodes) {
|
final int rootNodeId, final NodeWriter[] nodes) {
|
||||||
|
return createSerializedStream(magic, version, nodeCount, rootNodeId, dataOutput -> {
|
||||||
|
// legacy and text-based versions write their metadata differently.
|
||||||
|
}, nodes);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Writes a synthetic serialized trie stream with a metadata writer hook.
|
||||||
|
*
|
||||||
|
* @param magic stream magic
|
||||||
|
* @param version stream version
|
||||||
|
* @param nodeCount declared node count
|
||||||
|
* @param rootNodeId declared root node identifier
|
||||||
|
* @param metadata version-specific metadata writer
|
||||||
|
* @param nodes node body writers
|
||||||
|
* @return serialized bytes
|
||||||
|
*/
|
||||||
|
private static byte[] createSerializedStream(final int magic, final int version, final int nodeCount,
|
||||||
|
final int rootNodeId, final MetadataWriter metadata, final NodeWriter[] nodes) {
|
||||||
try {
|
try {
|
||||||
final ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
|
final ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
|
||||||
final DataOutputStream dataOutputStream = new DataOutputStream(byteArrayOutputStream);
|
final DataOutputStream dataOutputStream = new DataOutputStream(byteArrayOutputStream);
|
||||||
@@ -812,6 +1187,7 @@ class FrequencyTrieTest {
|
|||||||
dataOutputStream.writeInt(version);
|
dataOutputStream.writeInt(version);
|
||||||
dataOutputStream.writeInt(nodeCount);
|
dataOutputStream.writeInt(nodeCount);
|
||||||
dataOutputStream.writeInt(rootNodeId);
|
dataOutputStream.writeInt(rootNodeId);
|
||||||
|
metadata.write(dataOutputStream);
|
||||||
|
|
||||||
for (NodeWriter node : nodes) {
|
for (NodeWriter node : nodes) {
|
||||||
node.write(dataOutputStream);
|
node.write(dataOutputStream);
|
||||||
@@ -823,4 +1199,19 @@ class FrequencyTrieTest {
|
|||||||
throw new IllegalStateException("Unexpected I/O while building synthetic trie stream.", exception);
|
throw new IllegalStateException("Unexpected I/O while building synthetic trie stream.", exception);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Writes one synthetic metadata block.
|
||||||
|
*/
|
||||||
|
@FunctionalInterface
|
||||||
|
private interface MetadataWriter {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Writes metadata bytes for one stream version.
|
||||||
|
*
|
||||||
|
* @param dataOutput output stream
|
||||||
|
* @throws IOException if writing fails
|
||||||
|
*/
|
||||||
|
void write(DataOutputStream dataOutput) throws IOException;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -65,10 +65,9 @@ import org.junit.jupiter.api.io.TempDir;
|
|||||||
* stems declared by the source dictionary.
|
* stems declared by the source dictionary.
|
||||||
*/
|
*/
|
||||||
@DisplayName("Deterministic fuzz-style trie and stemmer compilation")
|
@DisplayName("Deterministic fuzz-style trie and stemmer compilation")
|
||||||
@Tag("unit")
|
|
||||||
@Tag("fuzz")
|
@Tag("fuzz")
|
||||||
@Tag("trie")
|
@Tag("trie")
|
||||||
@Tag("stemming")
|
@Tag("stemmer")
|
||||||
class FuzzStemmerAndTrieCompilationTest {
|
class FuzzStemmerAndTrieCompilationTest {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -161,10 +160,10 @@ class FuzzStemmerAndTrieCompilationTest {
|
|||||||
describeScenario("preferred patch must exist", reductionMode, scenario, word)),
|
describeScenario("preferred patch must exist", reductionMode, scenario, word)),
|
||||||
() -> assertTrue(allPatches.length >= 1,
|
() -> assertTrue(allPatches.length >= 1,
|
||||||
describeScenario("at least one patch must exist", reductionMode, scenario, word)),
|
describeScenario("at least one patch must exist", reductionMode, scenario, word)),
|
||||||
() -> assertTrue(acceptableStems.contains(PatchCommandEncoder.apply(word, preferredPatch)),
|
() -> assertTrue(acceptableStems.contains(PatchCommandEncoder.apply(word, preferredPatch, trie.traversalDirection())),
|
||||||
describeScenario("preferred patch reconstructed an unexpected stem",
|
describeScenario("preferred patch reconstructed an unexpected stem",
|
||||||
reductionMode, scenario, word)),
|
reductionMode, scenario, word)),
|
||||||
() -> assertTrue(allPatchesProduceOnlyAcceptableStems(word, allPatches, acceptableStems),
|
() -> assertTrue(allPatchesProduceOnlyAcceptableStems(trie, word, allPatches, acceptableStems),
|
||||||
describeScenario("getAll() contained a patch outside the accepted stem set",
|
describeScenario("getAll() contained a patch outside the accepted stem set",
|
||||||
reductionMode, scenario, word)));
|
reductionMode, scenario, word)));
|
||||||
}
|
}
|
||||||
@@ -276,10 +275,10 @@ class FuzzStemmerAndTrieCompilationTest {
|
|||||||
* @param acceptableStems acceptable stems
|
* @param acceptableStems acceptable stems
|
||||||
* @return {@code true} when all patches are acceptable
|
* @return {@code true} when all patches are acceptable
|
||||||
*/
|
*/
|
||||||
private static boolean allPatchesProduceOnlyAcceptableStems(final String word, final String[] patches,
|
private static boolean allPatchesProduceOnlyAcceptableStems(final FrequencyTrie<String> trie,
|
||||||
final Set<String> acceptableStems) {
|
final String word, final String[] patches, final Set<String> acceptableStems) {
|
||||||
for (String patch : patches) {
|
for (String patch : patches) {
|
||||||
if (!acceptableStems.contains(PatchCommandEncoder.apply(word, patch))) {
|
if (!acceptableStems.contains(PatchCommandEncoder.apply(word, patch, trie.traversalDirection()))) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -158,7 +158,7 @@ final class FuzzTestSupport {
|
|||||||
|
|
||||||
dictionary.append(stem);
|
dictionary.append(stem);
|
||||||
for (String variant : variants) {
|
for (String variant : variants) {
|
||||||
dictionary.append(' ').append(variant);
|
dictionary.append('\t').append(variant);
|
||||||
expectedStemsByWord.computeIfAbsent(variant, ignored -> new LinkedHashSet<>()).add(stem);
|
expectedStemsByWord.computeIfAbsent(variant, ignored -> new LinkedHashSet<>()).add(stem);
|
||||||
}
|
}
|
||||||
dictionary.append(" # entry ").append(index).append('\n');
|
dictionary.append(" # entry ").append(index).append('\n');
|
||||||
@@ -181,18 +181,19 @@ final class FuzzTestSupport {
|
|||||||
private static String createVariant(final Random random, final String stem) {
|
private static String createVariant(final Random random, final String stem) {
|
||||||
final int mode = random.nextInt(6);
|
final int mode = random.nextInt(6);
|
||||||
switch (mode) {
|
switch (mode) {
|
||||||
case 0:
|
case 0:
|
||||||
return stem + suffix(random);
|
return stem + suffix(random);
|
||||||
case 1:
|
case 1:
|
||||||
return prefix(random) + stem;
|
return prefix(random) + stem;
|
||||||
case 2:
|
case 2:
|
||||||
return stem.length() > 1 ? stem.substring(0, stem.length() - 1) + nextLetter(random) : stem + nextLetter(random);
|
return stem.length() > 1 ? stem.substring(0, stem.length() - 1) + nextLetter(random)
|
||||||
case 3:
|
: stem + nextLetter(random);
|
||||||
return stem + nextLetter(random) + nextLetter(random);
|
case 3:
|
||||||
case 4:
|
return stem + nextLetter(random) + nextLetter(random);
|
||||||
return stem.length() > 2 ? stem.substring(0, stem.length() - 2) : stem;
|
case 4:
|
||||||
default:
|
return stem.length() > 2 ? stem.substring(0, stem.length() - 2) : stem;
|
||||||
return new StringBuilder(stem).reverse().append(nextLetter(random)).toString();
|
default:
|
||||||
|
return new StringBuilder(stem).reverse().append(nextLetter(random)).toString();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -317,7 +318,8 @@ final class FuzzTestSupport {
|
|||||||
* @param dictionaryContent generated dictionary content
|
* @param dictionaryContent generated dictionary content
|
||||||
* @param expectedStemsByWord acceptable stems for each generated word
|
* @param expectedStemsByWord acceptable stems for each generated word
|
||||||
*/
|
*/
|
||||||
record StemmerDictionaryScenario(long seed, String dictionaryContent, Map<String, Set<String>> expectedStemsByWord) {
|
record StemmerDictionaryScenario(long seed, String dictionaryContent,
|
||||||
|
Map<String, Set<String>> expectedStemsByWord) {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates a validated scenario.
|
* Creates a validated scenario.
|
||||||
|
|||||||
@@ -36,7 +36,7 @@ import static org.junit.jupiter.api.Assertions.assertNotNull;
|
|||||||
import net.jqwik.api.ForAll;
|
import net.jqwik.api.ForAll;
|
||||||
import net.jqwik.api.Label;
|
import net.jqwik.api.Label;
|
||||||
import net.jqwik.api.Property;
|
import net.jqwik.api.Property;
|
||||||
import net.jqwik.api.Tag;
|
import org.junit.jupiter.api.Tag;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Property-based tests for {@link PatchCommandEncoder}.
|
* Property-based tests for {@link PatchCommandEncoder}.
|
||||||
@@ -47,9 +47,9 @@ import net.jqwik.api.Tag;
|
|||||||
* reconstruct the exact requested target.
|
* reconstruct the exact requested target.
|
||||||
*/
|
*/
|
||||||
@Label("PatchCommandEncoder properties")
|
@Label("PatchCommandEncoder properties")
|
||||||
@Tag("unit")
|
|
||||||
@Tag("property")
|
@Tag("property")
|
||||||
@Tag("patch")
|
@Tag("patch")
|
||||||
|
@Tag("stemmer")
|
||||||
class PatchCommandEncoderProperties extends PropertyBasedTestSupport {
|
class PatchCommandEncoderProperties extends PropertyBasedTestSupport {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -63,7 +63,7 @@ class PatchCommandEncoderProperties extends PropertyBasedTestSupport {
|
|||||||
@Label("encode followed by apply should reconstruct the target word")
|
@Label("encode followed by apply should reconstruct the target word")
|
||||||
void encodeFollowedByApplyShouldReconstructTheTargetWord(@ForAll("words") final String source,
|
void encodeFollowedByApplyShouldReconstructTheTargetWord(@ForAll("words") final String source,
|
||||||
@ForAll("words") final String target) {
|
@ForAll("words") final String target) {
|
||||||
final PatchCommandEncoder encoder = new PatchCommandEncoder();
|
final PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||||
final String patch = encoder.encode(source, target);
|
final String patch = encoder.encode(source, target);
|
||||||
|
|
||||||
assertNotNull(patch, "patch generation must succeed for non-null inputs.");
|
assertNotNull(patch, "patch generation must succeed for non-null inputs.");
|
||||||
@@ -82,10 +82,10 @@ class PatchCommandEncoderProperties extends PropertyBasedTestSupport {
|
|||||||
@Label("encode should be deterministic for one source-target pair")
|
@Label("encode should be deterministic for one source-target pair")
|
||||||
void encodeShouldBeDeterministicForOneSourceTargetPair(@ForAll("words") final String source,
|
void encodeShouldBeDeterministicForOneSourceTargetPair(@ForAll("words") final String source,
|
||||||
@ForAll("words") final String target) {
|
@ForAll("words") final String target) {
|
||||||
final PatchCommandEncoder sharedEncoder = new PatchCommandEncoder();
|
final PatchCommandEncoder sharedEncoder = PatchCommandEncoder.builder().build();
|
||||||
final String first = sharedEncoder.encode(source, target);
|
final String first = sharedEncoder.encode(source, target);
|
||||||
final String second = sharedEncoder.encode(source, target);
|
final String second = sharedEncoder.encode(source, target);
|
||||||
final String fresh = new PatchCommandEncoder().encode(source, target);
|
final String fresh = PatchCommandEncoder.builder().build().encode(source, target);
|
||||||
|
|
||||||
assertEquals(first, second, "one encoder instance must produce stable output.");
|
assertEquals(first, second, "one encoder instance must produce stable output.");
|
||||||
assertEquals(first, fresh, "fresh encoder instances must produce the same patch output.");
|
assertEquals(first, fresh, "fresh encoder instances must produce the same patch output.");
|
||||||
|
|||||||
@@ -241,7 +241,7 @@ class PatchCommandEncoderTest {
|
|||||||
*/
|
*/
|
||||||
@Nested
|
@Nested
|
||||||
@DisplayName("construction")
|
@DisplayName("construction")
|
||||||
@Tag("constructor")
|
@Tag("construction")
|
||||||
class ConstructionTests {
|
class ConstructionTests {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -250,12 +250,28 @@ class PatchCommandEncoderTest {
|
|||||||
@Test
|
@Test
|
||||||
@DisplayName("creates encoder with default cost model")
|
@DisplayName("creates encoder with default cost model")
|
||||||
void shouldCreateEncoderWithDefaultCostModel() {
|
void shouldCreateEncoderWithDefaultCostModel() {
|
||||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||||
|
|
||||||
assertNotNull(encoder);
|
assertNotNull(encoder);
|
||||||
assertEquals("teach", PatchCommandEncoder.apply("teacher", encoder.encode("teacher", "teach")));
|
assertEquals("teach", PatchCommandEncoder.apply("teacher", encoder.encode("teacher", "teach")));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies fluent builder construction with explicit forward traversal.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@DisplayName("builds direction-specialized encoder via builder")
|
||||||
|
void shouldBuildDirectionSpecializedEncoderViaBuilder() {
|
||||||
|
PatchCommandEncoder encoder = PatchCommandEncoder.builder()
|
||||||
|
.traversalDirection(WordTraversalDirection.FORWARD)
|
||||||
|
.build();
|
||||||
|
|
||||||
|
String patch = encoder.encode("running", "run");
|
||||||
|
|
||||||
|
assertAll(() -> assertNotNull(encoder), () -> assertNotNull(patch),
|
||||||
|
() -> assertEquals("run", encoder.applyWithConfiguredDirection("running", patch)));
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Verifies that a negative insert cost is rejected.
|
* Verifies that a negative insert cost is rejected.
|
||||||
*/
|
*/
|
||||||
@@ -263,7 +279,7 @@ class PatchCommandEncoderTest {
|
|||||||
@DisplayName("rejects negative insert cost")
|
@DisplayName("rejects negative insert cost")
|
||||||
void shouldRejectNegativeInsertCost() {
|
void shouldRejectNegativeInsertCost() {
|
||||||
IllegalArgumentException exception = assertThrows(IllegalArgumentException.class,
|
IllegalArgumentException exception = assertThrows(IllegalArgumentException.class,
|
||||||
() -> new PatchCommandEncoder(-1, 1, 1, 0));
|
() -> PatchCommandEncoder.builder().insertCost(-1).deleteCost(1).replaceCost(1).matchCost(0).build());
|
||||||
|
|
||||||
assertEquals("insertCost must be non-negative.", exception.getMessage());
|
assertEquals("insertCost must be non-negative.", exception.getMessage());
|
||||||
}
|
}
|
||||||
@@ -275,7 +291,7 @@ class PatchCommandEncoderTest {
|
|||||||
@DisplayName("rejects negative delete cost")
|
@DisplayName("rejects negative delete cost")
|
||||||
void shouldRejectNegativeDeleteCost() {
|
void shouldRejectNegativeDeleteCost() {
|
||||||
IllegalArgumentException exception = assertThrows(IllegalArgumentException.class,
|
IllegalArgumentException exception = assertThrows(IllegalArgumentException.class,
|
||||||
() -> new PatchCommandEncoder(1, -1, 1, 0));
|
() -> PatchCommandEncoder.builder().insertCost(1).deleteCost(-1).replaceCost(1).matchCost(0).build());
|
||||||
|
|
||||||
assertEquals("deleteCost must be non-negative.", exception.getMessage());
|
assertEquals("deleteCost must be non-negative.", exception.getMessage());
|
||||||
}
|
}
|
||||||
@@ -287,7 +303,7 @@ class PatchCommandEncoderTest {
|
|||||||
@DisplayName("rejects negative replace cost")
|
@DisplayName("rejects negative replace cost")
|
||||||
void shouldRejectNegativeReplaceCost() {
|
void shouldRejectNegativeReplaceCost() {
|
||||||
IllegalArgumentException exception = assertThrows(IllegalArgumentException.class,
|
IllegalArgumentException exception = assertThrows(IllegalArgumentException.class,
|
||||||
() -> new PatchCommandEncoder(1, 1, -1, 0));
|
() -> PatchCommandEncoder.builder().insertCost(1).deleteCost(1).replaceCost(-1).matchCost(0).build());
|
||||||
|
|
||||||
assertEquals("replaceCost must be non-negative.", exception.getMessage());
|
assertEquals("replaceCost must be non-negative.", exception.getMessage());
|
||||||
}
|
}
|
||||||
@@ -299,7 +315,7 @@ class PatchCommandEncoderTest {
|
|||||||
@DisplayName("rejects negative match cost")
|
@DisplayName("rejects negative match cost")
|
||||||
void shouldRejectNegativeMatchCost() {
|
void shouldRejectNegativeMatchCost() {
|
||||||
IllegalArgumentException exception = assertThrows(IllegalArgumentException.class,
|
IllegalArgumentException exception = assertThrows(IllegalArgumentException.class,
|
||||||
() -> new PatchCommandEncoder(1, 1, 1, -1));
|
() -> PatchCommandEncoder.builder().insertCost(1).deleteCost(1).replaceCost(1).matchCost(-1).build());
|
||||||
|
|
||||||
assertEquals("matchCost must be non-negative.", exception.getMessage());
|
assertEquals("matchCost must be non-negative.", exception.getMessage());
|
||||||
}
|
}
|
||||||
@@ -310,16 +326,32 @@ class PatchCommandEncoderTest {
|
|||||||
*/
|
*/
|
||||||
@Nested
|
@Nested
|
||||||
@DisplayName("encode(String, String)")
|
@DisplayName("encode(String, String)")
|
||||||
@Tag("encode")
|
@Tag("encoding")
|
||||||
class EncodeTests {
|
class EncodeTests {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that trailing SKIP instructions are omitted from the generated patch
|
||||||
|
* command because they do not affect reconstruction.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@DisplayName("does not emit trailing SKIP instructions into patch command")
|
||||||
|
void shouldNotEmitTrailingSkipInstructionsIntoPatchCommand() {
|
||||||
|
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||||
|
|
||||||
|
String patch = encoder.encode("abcd", "ab");
|
||||||
|
|
||||||
|
assertAll(() -> assertNotNull(patch), () -> assertEquals("Db", patch),
|
||||||
|
() -> assertEquals("ab", PatchCommandEncoder.apply("abcd", patch)), () -> assertEquals(-1,
|
||||||
|
patch.indexOf('-'), () -> "Patch must not contain a trailing SKIP instruction: " + patch));
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Verifies that a null source yields a null patch.
|
* Verifies that a null source yields a null patch.
|
||||||
*/
|
*/
|
||||||
@Test
|
@Test
|
||||||
@DisplayName("returns null when source is null")
|
@DisplayName("returns null when source is null")
|
||||||
void shouldReturnNullWhenSourceIsNull() {
|
void shouldReturnNullWhenSourceIsNull() {
|
||||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||||
|
|
||||||
String patch = encoder.encode(null, "target");
|
String patch = encoder.encode(null, "target");
|
||||||
|
|
||||||
@@ -332,7 +364,7 @@ class PatchCommandEncoderTest {
|
|||||||
@Test
|
@Test
|
||||||
@DisplayName("returns null when target is null")
|
@DisplayName("returns null when target is null")
|
||||||
void shouldReturnNullWhenTargetIsNull() {
|
void shouldReturnNullWhenTargetIsNull() {
|
||||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||||
|
|
||||||
String patch = encoder.encode("source", null);
|
String patch = encoder.encode("source", null);
|
||||||
|
|
||||||
@@ -345,7 +377,7 @@ class PatchCommandEncoderTest {
|
|||||||
@Test
|
@Test
|
||||||
@DisplayName("returns canonical NOOP patch for equal words")
|
@DisplayName("returns canonical NOOP patch for equal words")
|
||||||
void shouldReturnCanonicalNoopPatchForEqualWords() {
|
void shouldReturnCanonicalNoopPatchForEqualWords() {
|
||||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||||
|
|
||||||
String patch = encoder.encode("teacher", "teacher");
|
String patch = encoder.encode("teacher", "teacher");
|
||||||
|
|
||||||
@@ -359,7 +391,7 @@ class PatchCommandEncoderTest {
|
|||||||
@Test
|
@Test
|
||||||
@DisplayName("returns canonical NOOP patch for equal empty words")
|
@DisplayName("returns canonical NOOP patch for equal empty words")
|
||||||
void shouldReturnCanonicalNoopPatchForEqualEmptyWords() {
|
void shouldReturnCanonicalNoopPatchForEqualEmptyWords() {
|
||||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||||
|
|
||||||
String patch = encoder.encode("", "");
|
String patch = encoder.encode("", "");
|
||||||
|
|
||||||
@@ -378,7 +410,7 @@ class PatchCommandEncoderTest {
|
|||||||
@MethodSource("org.egothor.stemmer.PatchCommandEncoderTest#provideRoundTripPairs")
|
@MethodSource("org.egothor.stemmer.PatchCommandEncoderTest#provideRoundTripPairs")
|
||||||
@DisplayName("produces patches that reconstruct the target")
|
@DisplayName("produces patches that reconstruct the target")
|
||||||
void shouldReconstructTargetForRoundTripPairs(int caseId, String source, String target) {
|
void shouldReconstructTargetForRoundTripPairs(int caseId, String source, String target) {
|
||||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||||
|
|
||||||
String patch = encoder.encode(source, target);
|
String patch = encoder.encode(source, target);
|
||||||
String reconstructed = PatchCommandEncoder.apply(source, patch);
|
String reconstructed = PatchCommandEncoder.apply(source, patch);
|
||||||
@@ -398,7 +430,7 @@ class PatchCommandEncoderTest {
|
|||||||
@Test
|
@Test
|
||||||
@DisplayName("remains correct when reused across different input sizes")
|
@DisplayName("remains correct when reused across different input sizes")
|
||||||
void shouldRemainCorrectWhenReusedAcrossDifferentInputSizes() {
|
void shouldRemainCorrectWhenReusedAcrossDifferentInputSizes() {
|
||||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||||
|
|
||||||
assertAll(
|
assertAll(
|
||||||
() -> assertEquals("transformation",
|
() -> assertEquals("transformation",
|
||||||
@@ -414,7 +446,7 @@ class PatchCommandEncoderTest {
|
|||||||
@Test
|
@Test
|
||||||
@DisplayName("supports custom operation costs")
|
@DisplayName("supports custom operation costs")
|
||||||
void shouldSupportCustomOperationCosts() {
|
void shouldSupportCustomOperationCosts() {
|
||||||
PatchCommandEncoder encoder = new PatchCommandEncoder(1, 1, 2, 0);
|
PatchCommandEncoder encoder = PatchCommandEncoder.builder().insertCost(1).deleteCost(1).replaceCost(2).matchCost(0).build();
|
||||||
|
|
||||||
String patch = encoder.encode("teacher", "teach");
|
String patch = encoder.encode("teacher", "teach");
|
||||||
String reconstructed = PatchCommandEncoder.apply("teacher", patch);
|
String reconstructed = PatchCommandEncoder.apply("teacher", patch);
|
||||||
@@ -473,6 +505,36 @@ class PatchCommandEncoderTest {
|
|||||||
assertSame(source, PatchCommandEncoder.apply(source, PatchCommandEncoder.NOOP_PATCH));
|
assertSame(source, PatchCommandEncoder.apply(source, PatchCommandEncoder.NOOP_PATCH));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that instance-level application follows encoder traversal
|
||||||
|
* direction.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@DisplayName("applies patch via instance-level direction-specialized fast path")
|
||||||
|
void shouldApplyPatchViaInstanceLevelDirectionSpecializedFastPath() {
|
||||||
|
PatchCommandEncoder encoder = PatchCommandEncoder.builder()
|
||||||
|
.traversalDirection(WordTraversalDirection.FORWARD)
|
||||||
|
.build();
|
||||||
|
|
||||||
|
String patch = encoder.encode("transformation", "transform");
|
||||||
|
|
||||||
|
assertEquals("transform", encoder.applyWithConfiguredDirection("transformation", patch));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies dedicated forward traversal encode/apply round trip.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@DisplayName("reconstructs target with forward traversal encoder and static apply")
|
||||||
|
void shouldReconstructTargetWithForwardTraversalEncoderAndStaticApply() {
|
||||||
|
PatchCommandEncoder encoder = PatchCommandEncoder.builder()
|
||||||
|
.traversalDirection(WordTraversalDirection.FORWARD)
|
||||||
|
.build();
|
||||||
|
String patch = encoder.encode("cities", "city");
|
||||||
|
|
||||||
|
assertEquals("city", PatchCommandEncoder.apply("cities", patch, WordTraversalDirection.FORWARD));
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Verifies explicit patch application cases.
|
* Verifies explicit patch application cases.
|
||||||
*
|
*
|
||||||
@@ -544,7 +606,7 @@ class PatchCommandEncoderTest {
|
|||||||
@Test
|
@Test
|
||||||
@DisplayName("handles deletion-heavy suffix stripping")
|
@DisplayName("handles deletion-heavy suffix stripping")
|
||||||
void shouldHandleDeletionHeavySuffixStripping() {
|
void shouldHandleDeletionHeavySuffixStripping() {
|
||||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||||
|
|
||||||
String patch = encoder.encode("teacher", "teach");
|
String patch = encoder.encode("teacher", "teach");
|
||||||
|
|
||||||
@@ -557,7 +619,7 @@ class PatchCommandEncoderTest {
|
|||||||
@Test
|
@Test
|
||||||
@DisplayName("handles plural to singular transformation")
|
@DisplayName("handles plural to singular transformation")
|
||||||
void shouldHandlePluralToSingularTransformation() {
|
void shouldHandlePluralToSingularTransformation() {
|
||||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||||
|
|
||||||
String patch = encoder.encode("cities", "city");
|
String patch = encoder.encode("cities", "city");
|
||||||
|
|
||||||
@@ -570,7 +632,7 @@ class PatchCommandEncoderTest {
|
|||||||
@Test
|
@Test
|
||||||
@DisplayName("handles derivational reduction to a shorter stem")
|
@DisplayName("handles derivational reduction to a shorter stem")
|
||||||
void shouldHandleDerivationalReductionToShorterStem() {
|
void shouldHandleDerivationalReductionToShorterStem() {
|
||||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||||
|
|
||||||
String patch = encoder.encode("stemming", "stem");
|
String patch = encoder.encode("stemming", "stem");
|
||||||
|
|
||||||
@@ -583,7 +645,7 @@ class PatchCommandEncoderTest {
|
|||||||
@Test
|
@Test
|
||||||
@DisplayName("handles single-character replacement")
|
@DisplayName("handles single-character replacement")
|
||||||
void shouldHandleSingleCharacterReplacement() {
|
void shouldHandleSingleCharacterReplacement() {
|
||||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||||
|
|
||||||
String patch = encoder.encode("a", "z");
|
String patch = encoder.encode("a", "z");
|
||||||
|
|
||||||
@@ -596,7 +658,7 @@ class PatchCommandEncoderTest {
|
|||||||
*/
|
*/
|
||||||
@Nested
|
@Nested
|
||||||
@DisplayName("reversed-word processing")
|
@DisplayName("reversed-word processing")
|
||||||
@Tag("reverse")
|
@Tag("normalization")
|
||||||
class ReversedWordProcessingTests {
|
class ReversedWordProcessingTests {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -610,7 +672,7 @@ class PatchCommandEncoderTest {
|
|||||||
@MethodSource("org.egothor.stemmer.PatchCommandEncoderTest#provideReversedRoundTripPairs")
|
@MethodSource("org.egothor.stemmer.PatchCommandEncoderTest#provideReversedRoundTripPairs")
|
||||||
@DisplayName("reconstructs reversed targets from reversed sources")
|
@DisplayName("reconstructs reversed targets from reversed sources")
|
||||||
void shouldReconstructReversedTargetsFromReversedSources(int caseId, String source, String target) {
|
void shouldReconstructReversedTargetsFromReversedSources(int caseId, String source, String target) {
|
||||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||||
|
|
||||||
String reversedSource = reverse(source);
|
String reversedSource = reverse(source);
|
||||||
String reversedTarget = reverse(target);
|
String reversedTarget = reverse(target);
|
||||||
@@ -633,7 +695,7 @@ class PatchCommandEncoderTest {
|
|||||||
@Test
|
@Test
|
||||||
@DisplayName("handles mirrored stemming transformations")
|
@DisplayName("handles mirrored stemming transformations")
|
||||||
void shouldHandleMirroredStemmingTransformations() {
|
void shouldHandleMirroredStemmingTransformations() {
|
||||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||||
|
|
||||||
assertAll(
|
assertAll(
|
||||||
() -> assertEquals(reverse("teach"),
|
() -> assertEquals(reverse("teach"),
|
||||||
@@ -655,7 +717,7 @@ class PatchCommandEncoderTest {
|
|||||||
@Test
|
@Test
|
||||||
@DisplayName("remains correct when reused on reversed words of different sizes")
|
@DisplayName("remains correct when reused on reversed words of different sizes")
|
||||||
void shouldRemainCorrectWhenReusedOnReversedWordsOfDifferentSizes() {
|
void shouldRemainCorrectWhenReusedOnReversedWordsOfDifferentSizes() {
|
||||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||||
|
|
||||||
assertAll(
|
assertAll(
|
||||||
() -> assertEquals(reverse("transformation"),
|
() -> assertEquals(reverse("transformation"),
|
||||||
@@ -683,7 +745,7 @@ class PatchCommandEncoderTest {
|
|||||||
@MethodSource("org.egothor.stemmer.PatchCommandEncoderTest#provideReversedRoundTripPairs")
|
@MethodSource("org.egothor.stemmer.PatchCommandEncoderTest#provideReversedRoundTripPairs")
|
||||||
@DisplayName("preserves correctness under mirrored input orientation")
|
@DisplayName("preserves correctness under mirrored input orientation")
|
||||||
void shouldPreserveCorrectnessUnderMirroredInputOrientation(int caseId, String source, String target) {
|
void shouldPreserveCorrectnessUnderMirroredInputOrientation(int caseId, String source, String target) {
|
||||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||||
|
|
||||||
String normalPatch = encoder.encode(source, target);
|
String normalPatch = encoder.encode(source, target);
|
||||||
String normalResult = PatchCommandEncoder.apply(source, normalPatch);
|
String normalResult = PatchCommandEncoder.apply(source, normalPatch);
|
||||||
|
|||||||
@@ -151,7 +151,7 @@ abstract class PropertyBasedTestSupport {
|
|||||||
Objects.requireNonNull(reductionMode, "reductionMode");
|
Objects.requireNonNull(reductionMode, "reductionMode");
|
||||||
|
|
||||||
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(STRING_ARRAY_FACTORY, reductionMode);
|
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(STRING_ARRAY_FACTORY, reductionMode);
|
||||||
final PatchCommandEncoder encoder = new PatchCommandEncoder();
|
final PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||||
|
|
||||||
for (StemmerEntry entry : scenario.entries()) {
|
for (StemmerEntry entry : scenario.entries()) {
|
||||||
if (storeOriginal) {
|
if (storeOriginal) {
|
||||||
|
|||||||
@@ -35,6 +35,7 @@ import static org.junit.jupiter.api.Assertions.assertArrayEquals;
|
|||||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||||
import static org.junit.jupiter.api.Assertions.assertThrows;
|
import static org.junit.jupiter.api.Assertions.assertThrows;
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
@@ -44,6 +45,10 @@ import java.nio.file.Files;
|
|||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.logging.Handler;
|
||||||
|
import java.util.logging.Level;
|
||||||
|
import java.util.logging.LogRecord;
|
||||||
|
import java.util.logging.Logger;
|
||||||
|
|
||||||
import org.junit.jupiter.api.DisplayName;
|
import org.junit.jupiter.api.DisplayName;
|
||||||
import org.junit.jupiter.api.Nested;
|
import org.junit.jupiter.api.Nested;
|
||||||
@@ -59,7 +64,7 @@ import org.junit.jupiter.api.io.TempDir;
|
|||||||
* </p>
|
* </p>
|
||||||
* <ul>
|
* <ul>
|
||||||
* <li>parsing through all public overloads,</li>
|
* <li>parsing through all public overloads,</li>
|
||||||
* <li>normalization to lower case,</li>
|
* <li>case processing according to the selected mode,</li>
|
||||||
* <li>handling of empty lines and remarks,</li>
|
* <li>handling of empty lines and remarks,</li>
|
||||||
* <li>correct entry emission including line numbers,</li>
|
* <li>correct entry emission including line numbers,</li>
|
||||||
* <li>propagation of I/O failures from the handler and file system,</li>
|
* <li>propagation of I/O failures from the handler and file system,</li>
|
||||||
@@ -70,6 +75,7 @@ import org.junit.jupiter.api.io.TempDir;
|
|||||||
@DisplayName("StemmerDictionaryParser")
|
@DisplayName("StemmerDictionaryParser")
|
||||||
@Tag("unit")
|
@Tag("unit")
|
||||||
@Tag("parser")
|
@Tag("parser")
|
||||||
|
@Tag("stemmer")
|
||||||
class StemmerDictionaryParserTest {
|
class StemmerDictionaryParserTest {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -89,6 +95,43 @@ class StemmerDictionaryParserTest {
|
|||||||
// Record used only as a compact assertion carrier.
|
// Record used only as a compact assertion carrier.
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Log handler capturing parser diagnostics for assertions.
|
||||||
|
*/
|
||||||
|
private static final class CapturedLogHandler extends Handler {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Captured log records.
|
||||||
|
*/
|
||||||
|
private final List<LogRecord> records = new ArrayList<LogRecord>();
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void publish(final LogRecord record) {
|
||||||
|
if (record != null) {
|
||||||
|
this.records.add(record);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void flush() {
|
||||||
|
// No buffered state.
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void close() {
|
||||||
|
this.records.clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the captured records.
|
||||||
|
*
|
||||||
|
* @return captured records
|
||||||
|
*/
|
||||||
|
private List<LogRecord> records() {
|
||||||
|
return this.records;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates a handler that collects all parser callbacks into the supplied list.
|
* Creates a handler that collects all parser callbacks into the supplied list.
|
||||||
*
|
*
|
||||||
@@ -121,8 +164,8 @@ class StemmerDictionaryParserTest {
|
|||||||
@DisplayName("should parse normalized entries and collect accurate statistics")
|
@DisplayName("should parse normalized entries and collect accurate statistics")
|
||||||
void shouldParseNormalizedEntriesAndCollectAccurateStatistics() throws IOException {
|
void shouldParseNormalizedEntriesAndCollectAccurateStatistics() throws IOException {
|
||||||
final String input = "# full line remark\n" + " \n"
|
final String input = "# full line remark\n" + " \n"
|
||||||
+ "Root Running Runs RUNNER # trailing hash remark\n"
|
+ "Root Running Runs RUNNER # trailing hash remark\n"
|
||||||
+ "House HOUSEHOLD houseS // trailing slash remark\n" + "SingleStem\n"
|
+ "House HOUSEHOLD houseS // trailing slash remark\n" + "SingleStem\n"
|
||||||
+ "// full line slash remark\n";
|
+ "// full line slash remark\n";
|
||||||
|
|
||||||
final List<CapturedEntry> entries = new ArrayList<CapturedEntry>();
|
final List<CapturedEntry> entries = new ArrayList<CapturedEntry>();
|
||||||
@@ -157,11 +200,54 @@ class StemmerDictionaryParserTest {
|
|||||||
() -> assertEquals(5, third.lineNumber()));
|
() -> assertEquals(5, third.lineNumber()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@DisplayName("should ignore whitespace-containing items and emit one warning per physical line")
|
||||||
|
void shouldIgnoreWhitespaceContainingItemsAndLogOneWarningPerLine() throws IOException {
|
||||||
|
final String input = "root\trunning form\truns\tnew\u2003term\n" + "compound stem\talpha\tbeta\tvalue\n";
|
||||||
|
|
||||||
|
final List<CapturedEntry> entries = new ArrayList<CapturedEntry>();
|
||||||
|
final Logger logger = Logger.getLogger(StemmerDictionaryParser.class.getName());
|
||||||
|
final Level previousLevel = logger.getLevel();
|
||||||
|
final boolean previousUseParentHandlers = logger.getUseParentHandlers();
|
||||||
|
final CapturedLogHandler handler = new CapturedLogHandler();
|
||||||
|
|
||||||
|
logger.setUseParentHandlers(false);
|
||||||
|
logger.setLevel(Level.WARNING);
|
||||||
|
logger.addHandler(handler);
|
||||||
|
try {
|
||||||
|
final StemmerDictionaryParser.ParseStatistics statistics = StemmerDictionaryParser
|
||||||
|
.parse(new StringReader(input), "whitespace-source", collectingHandler(entries));
|
||||||
|
|
||||||
|
assertAll("Statistics", () -> assertEquals(2, statistics.lineCount()),
|
||||||
|
() -> assertEquals(1, statistics.entryCount()),
|
||||||
|
() -> assertEquals(0, statistics.ignoredLineCount()));
|
||||||
|
assertEquals(1, entries.size(), "Only the valid TSV row must be emitted.");
|
||||||
|
assertAll("Parsed entry", () -> assertEquals("root", entries.get(0).stem()),
|
||||||
|
() -> assertArrayEquals(new String[] { "runs" }, entries.get(0).variants()),
|
||||||
|
() -> assertEquals(1, entries.get(0).lineNumber()));
|
||||||
|
assertEquals(2, handler.records().size(), "Exactly one warning must be emitted per physical line.");
|
||||||
|
assertAll("First warning", () -> assertEquals(Level.WARNING, handler.records().get(0).getLevel()),
|
||||||
|
() -> assertTrue(handler.records().get(0).getMessage()
|
||||||
|
.contains("Ignoring dictionary items containing whitespace")),
|
||||||
|
() -> assertEquals("whitespace-source", handler.records().get(0).getParameters()[0]),
|
||||||
|
() -> assertEquals(Integer.valueOf(1), handler.records().get(0).getParameters()[1]),
|
||||||
|
() -> assertEquals("root", handler.records().get(0).getParameters()[2]),
|
||||||
|
() -> assertEquals(Integer.valueOf(2), handler.records().get(0).getParameters()[3]));
|
||||||
|
assertAll("Second warning",
|
||||||
|
() -> assertEquals(Integer.valueOf(2), handler.records().get(1).getParameters()[1]),
|
||||||
|
() -> assertEquals("compound stem", handler.records().get(1).getParameters()[2]));
|
||||||
|
} finally {
|
||||||
|
logger.removeHandler(handler);
|
||||||
|
logger.setUseParentHandlers(previousUseParentHandlers);
|
||||||
|
logger.setLevel(previousLevel);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@DisplayName("should prefer earliest remark marker regardless of marker type")
|
@DisplayName("should prefer earliest remark marker regardless of marker type")
|
||||||
void shouldPreferEarliestRemarkMarkerRegardlessOfMarkerType() throws IOException {
|
void shouldPreferEarliestRemarkMarkerRegardlessOfMarkerType() throws IOException {
|
||||||
final String input = "alpha beta // slash remark before # hash remark # ignored\n"
|
final String input = "alpha beta // slash remark before # hash remark # ignored\n"
|
||||||
+ "gamma delta # hash remark before // slash remark // ignored\n";
|
+ "gamma delta # hash remark before // slash remark // ignored\n";
|
||||||
|
|
||||||
final List<CapturedEntry> entries = new ArrayList<CapturedEntry>();
|
final List<CapturedEntry> entries = new ArrayList<CapturedEntry>();
|
||||||
|
|
||||||
@@ -185,7 +271,7 @@ class StemmerDictionaryParserTest {
|
|||||||
@DisplayName("should propagate handler IOException without swallowing it")
|
@DisplayName("should propagate handler IOException without swallowing it")
|
||||||
void shouldPropagateHandlerIOExceptionWithoutSwallowingIt() {
|
void shouldPropagateHandlerIOExceptionWithoutSwallowingIt() {
|
||||||
final IOException expected = new IOException("Simulated handler failure.");
|
final IOException expected = new IOException("Simulated handler failure.");
|
||||||
final Reader reader = new StringReader("stem variant\n");
|
final Reader reader = new StringReader("stem variant\n");
|
||||||
|
|
||||||
final IOException exception = assertThrows(IOException.class,
|
final IOException exception = assertThrows(IOException.class,
|
||||||
() -> StemmerDictionaryParser.parse(reader, "failing-handler", (stem, variants, lineNumber) -> {
|
() -> StemmerDictionaryParser.parse(reader, "failing-handler", (stem, variants, lineNumber) -> {
|
||||||
@@ -195,6 +281,22 @@ class StemmerDictionaryParserTest {
|
|||||||
assertEquals(expected, exception, "The original exception instance should be preserved.");
|
assertEquals(expected, exception, "The original exception instance should be preserved.");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@DisplayName("should preserve character case when AS_IS mode is selected")
|
||||||
|
void shouldPreserveCharacterCaseWhenAsIsModeIsSelected() throws IOException {
|
||||||
|
final String input = "Root\tRunning\tRuns\tRUNNER\n";
|
||||||
|
final List<CapturedEntry> entries = new ArrayList<CapturedEntry>();
|
||||||
|
|
||||||
|
final StemmerDictionaryParser.ParseStatistics statistics = StemmerDictionaryParser.parse(
|
||||||
|
new StringReader(input), "case-as-is", CaseProcessingMode.AS_IS, collectingHandler(entries));
|
||||||
|
|
||||||
|
assertAll("Statistics", () -> assertEquals(1, statistics.lineCount()),
|
||||||
|
() -> assertEquals(1, statistics.entryCount()), () -> assertEquals(0, statistics.ignoredLineCount()));
|
||||||
|
assertEquals(1, entries.size(), "Exactly one entry should be emitted.");
|
||||||
|
assertAll("Entry", () -> assertEquals("Root", entries.get(0).stem()),
|
||||||
|
() -> assertArrayEquals(new String[] { "Running", "Runs", "RUNNER" }, entries.get(0).variants()));
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@DisplayName("should reject null reader")
|
@DisplayName("should reject null reader")
|
||||||
void shouldRejectNullReader() {
|
void shouldRejectNullReader() {
|
||||||
@@ -213,6 +315,15 @@ class StemmerDictionaryParserTest {
|
|||||||
}));
|
}));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@DisplayName("should reject null case processing mode")
|
||||||
|
void shouldRejectNullCaseProcessingMode() {
|
||||||
|
assertThrows(NullPointerException.class, () -> StemmerDictionaryParser.parse(new StringReader("a b"),
|
||||||
|
"source", null, (stem, variants, lineNumber) -> {
|
||||||
|
// no-op
|
||||||
|
}));
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@DisplayName("should reject null entry handler")
|
@DisplayName("should reject null entry handler")
|
||||||
void shouldRejectNullEntryHandler() {
|
void shouldRejectNullEntryHandler() {
|
||||||
@@ -228,7 +339,7 @@ class StemmerDictionaryParserTest {
|
|||||||
@Test
|
@Test
|
||||||
@DisplayName("should parse same content through path and string overloads")
|
@DisplayName("should parse same content through path and string overloads")
|
||||||
void shouldParseSameContentThroughPathAndStringOverloads() throws IOException {
|
void shouldParseSameContentThroughPathAndStringOverloads() throws IOException {
|
||||||
final String content = "walk walking walked\n" + "run running\n" + "\n" + "# ignored\n";
|
final String content = "walk walking walked\n" + "run running\n" + "\n" + "# ignored\n";
|
||||||
|
|
||||||
final Path file = createFile("dictionary.txt", content);
|
final Path file = createFile("dictionary.txt", content);
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,279 @@
|
|||||||
|
/*******************************************************************************
|
||||||
|
* Copyright (C) 2026, Leo Galambos
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer in the documentation
|
||||||
|
* and/or other materials provided with the distribution.
|
||||||
|
*
|
||||||
|
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||||
|
* may be used to endorse or promote products derived from this software
|
||||||
|
* without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||||
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
******************************************************************************/
|
||||||
|
package org.egothor.stemmer;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertFalse;
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.StringReader;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.function.Function;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.DisplayName;
|
||||||
|
import org.junit.jupiter.api.Tag;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
import org.junit.jupiter.api.io.TempDir;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Tests for {@link StemmerKnowledgeExperiment}.
|
||||||
|
*/
|
||||||
|
@Tag("integration")
|
||||||
|
@Tag("stemmer")
|
||||||
|
@Tag("trie")
|
||||||
|
final class StemmerKnowledgeExperimentTest {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Deterministic seed used by all tests.
|
||||||
|
*/
|
||||||
|
private static final long TEST_SEED = 20260421L;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Small deterministic morphology-shaped dictionary.
|
||||||
|
*/
|
||||||
|
private static final String DICTIONARY = String.join(System.lineSeparator(), "run running runs runner",
|
||||||
|
"walk walking walks walked", "play playing plays played");
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Temporary directory for report writing tests.
|
||||||
|
*/
|
||||||
|
@TempDir
|
||||||
|
private Path tempDir;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies deterministic scenario generation and expected row count.
|
||||||
|
*
|
||||||
|
* @throws IOException if evaluation fails
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@DisplayName("evaluate should return deterministic full scenario matrix")
|
||||||
|
void evaluateShouldReturnDeterministicScenarioMatrix() throws IOException {
|
||||||
|
final StemmerKnowledgeExperiment experiment = new StemmerKnowledgeExperiment();
|
||||||
|
|
||||||
|
final List<StemmerKnowledgeExperiment.ResultRow> first = experiment.evaluate(new StringReader(DICTIONARY),
|
||||||
|
"synthetic", "SYNTHETIC", TEST_SEED);
|
||||||
|
final List<StemmerKnowledgeExperiment.ResultRow> second = experiment.evaluate(new StringReader(DICTIONARY),
|
||||||
|
"synthetic", "SYNTHETIC", TEST_SEED);
|
||||||
|
|
||||||
|
assertEquals(ReductionMode.values().length * 2 * 2 * 10, first.size());
|
||||||
|
assertEquals(first, second);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that full knowledge with stored original stems reaches ideal
|
||||||
|
* quality.
|
||||||
|
*
|
||||||
|
* @throws IOException if evaluation fails
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@DisplayName("100 percent knowledge with stored originals should achieve perfect scores")
|
||||||
|
void fullKnowledgeWithStoredOriginalsShouldBePerfect() throws IOException {
|
||||||
|
final StemmerKnowledgeExperiment experiment = new StemmerKnowledgeExperiment();
|
||||||
|
final List<StemmerKnowledgeExperiment.ResultRow> rows = experiment.evaluate(new StringReader(DICTIONARY),
|
||||||
|
"synthetic", "SYNTHETIC", TEST_SEED);
|
||||||
|
|
||||||
|
final StemmerKnowledgeExperiment.ResultRow row = uniqueRow(rows,
|
||||||
|
resultKey(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS, true, true, 100));
|
||||||
|
|
||||||
|
assertEquals(1.0d, row.getAccuracy());
|
||||||
|
assertEquals(1.0d, row.getAllPrecision());
|
||||||
|
assertEquals(1.0d, row.getAllRecall());
|
||||||
|
assertEquals(1.0d, row.getAllF1());
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that evaluating canonical stems without storing no-op patches lowers
|
||||||
|
* recall at full knowledge, while {@code get()} still remains perfect due to
|
||||||
|
* the implicit identity fallback for already canonical inputs.
|
||||||
|
*
|
||||||
|
* @throws IOException if evaluation fails
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@DisplayName("evaluating stems without stored originals should reduce recall but preserve get accuracy")
|
||||||
|
void evaluatingStemsWithoutStoredOriginalsShouldReduceRecall() throws IOException {
|
||||||
|
final StemmerKnowledgeExperiment experiment = new StemmerKnowledgeExperiment();
|
||||||
|
final List<StemmerKnowledgeExperiment.ResultRow> rows = experiment.evaluate(new StringReader(DICTIONARY),
|
||||||
|
"synthetic", "SYNTHETIC", TEST_SEED);
|
||||||
|
|
||||||
|
final StemmerKnowledgeExperiment.ResultRow row = uniqueRow(rows,
|
||||||
|
resultKey(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS, false, true, 100));
|
||||||
|
|
||||||
|
assertTrue(row.getAllRecall() < 1.0d);
|
||||||
|
assertEquals(1.0d, row.getAccuracy());
|
||||||
|
assertTrue(row.getAllF1() < 1.0d);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that storing original stems becomes irrelevant when canonical stems
|
||||||
|
* themselves are not part of the evaluated input set.
|
||||||
|
*
|
||||||
|
* @throws IOException if evaluation fails
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@DisplayName("storeOriginal should not affect scores when stems are not evaluated")
|
||||||
|
void storeOriginalShouldNotAffectScoresWhenStemsAreNotEvaluated() throws IOException {
|
||||||
|
final StemmerKnowledgeExperiment experiment = new StemmerKnowledgeExperiment();
|
||||||
|
final List<StemmerKnowledgeExperiment.ResultRow> rows = experiment.evaluate(new StringReader(DICTIONARY),
|
||||||
|
"synthetic", "SYNTHETIC", TEST_SEED);
|
||||||
|
|
||||||
|
final StemmerKnowledgeExperiment.ResultRow withoutStoredOriginals = uniqueRow(rows,
|
||||||
|
resultKey(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS, false, false, 100));
|
||||||
|
final StemmerKnowledgeExperiment.ResultRow withStoredOriginals = uniqueRow(rows,
|
||||||
|
resultKey(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS, true, false, 100));
|
||||||
|
|
||||||
|
assertEquals(withoutStoredOriginals.getAccuracy(), withStoredOriginals.getAccuracy());
|
||||||
|
assertEquals(withoutStoredOriginals.getAllPrecision(), withStoredOriginals.getAllPrecision());
|
||||||
|
assertEquals(withoutStoredOriginals.getAllRecall(), withStoredOriginals.getAllRecall());
|
||||||
|
assertEquals(withoutStoredOriginals.getAllF1(), withStoredOriginals.getAllF1());
|
||||||
|
assertEquals(withoutStoredOriginals.getCorrectCount(), withStoredOriginals.getCorrectCount());
|
||||||
|
assertEquals(withoutStoredOriginals.getAllTruePositiveCount(), withStoredOriginals.getAllTruePositiveCount());
|
||||||
|
assertEquals(withoutStoredOriginals.getAllFalsePositiveCount(), withStoredOriginals.getAllFalsePositiveCount());
|
||||||
|
assertEquals(withoutStoredOriginals.getAllCoveredInputCount(), withStoredOriginals.getAllCoveredInputCount());
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that implicit identity fallback for {@code get()} does not propagate
|
||||||
|
* into {@code getAll()}, which still requires an explicit command to cover an
|
||||||
|
* input.
|
||||||
|
*
|
||||||
|
* @throws IOException if evaluation fails
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@DisplayName("get should accept implicit identity while getAll still requires explicit coverage")
|
||||||
|
void getShouldAcceptImplicitIdentityWhileGetAllStillRequiresExplicitCoverage() throws IOException {
|
||||||
|
final StemmerKnowledgeExperiment experiment = new StemmerKnowledgeExperiment();
|
||||||
|
final String minimalDictionary = "run running";
|
||||||
|
|
||||||
|
final List<StemmerKnowledgeExperiment.ResultRow> rows = experiment.evaluate(new StringReader(minimalDictionary),
|
||||||
|
"minimal", "MINIMAL", TEST_SEED);
|
||||||
|
|
||||||
|
final StemmerKnowledgeExperiment.ResultRow row = uniqueRow(rows,
|
||||||
|
resultKey(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS, false, true, 100));
|
||||||
|
|
||||||
|
assertEquals(2L, row.evaluatedInputCount());
|
||||||
|
assertEquals(2L, row.getCorrectCount());
|
||||||
|
assertEquals(1.0d, row.getAccuracy());
|
||||||
|
|
||||||
|
assertEquals(1L, row.getAllCoveredInputCount());
|
||||||
|
assertEquals(0.5d, row.getAllRecall());
|
||||||
|
assertTrue(row.getAllPrecision() > 0.0d);
|
||||||
|
assertTrue(row.getAllPrecision() <= 1.0d);
|
||||||
|
assertTrue(row.getAllF1() < 1.0d);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies CSV report generation.
|
||||||
|
*
|
||||||
|
* @throws IOException if report writing fails
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@DisplayName("writeCsv should emit header and data rows")
|
||||||
|
void writeCsvShouldEmitHeaderAndDataRows() throws IOException {
|
||||||
|
final StemmerKnowledgeExperiment experiment = new StemmerKnowledgeExperiment();
|
||||||
|
final List<StemmerKnowledgeExperiment.ResultRow> rows = experiment.evaluate(new StringReader(DICTIONARY),
|
||||||
|
"synthetic", "SYNTHETIC", TEST_SEED);
|
||||||
|
|
||||||
|
final Path output = this.tempDir.resolve("knowledge.csv");
|
||||||
|
StemmerKnowledgeExperiment.writeCsv(output, rows);
|
||||||
|
|
||||||
|
final List<String> writtenLines = Files.readAllLines(output, StandardCharsets.UTF_8);
|
||||||
|
assertFalse(writtenLines.isEmpty());
|
||||||
|
assertEquals(StemmerKnowledgeExperiment.ResultRow.csvHeader(), writtenLines.get(0));
|
||||||
|
assertEquals(rows.size() + 1, writtenLines.size());
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that the result row key lookup remains stable for all generated
|
||||||
|
* rows.
|
||||||
|
*
|
||||||
|
* @throws IOException if evaluation fails
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@DisplayName("all generated rows should be addressable by the synthetic key")
|
||||||
|
void allGeneratedRowsShouldBeAddressableBySyntheticKey() throws IOException {
|
||||||
|
final StemmerKnowledgeExperiment experiment = new StemmerKnowledgeExperiment();
|
||||||
|
final List<StemmerKnowledgeExperiment.ResultRow> rows = experiment.evaluate(new StringReader(DICTIONARY),
|
||||||
|
"synthetic", "SYNTHETIC", TEST_SEED);
|
||||||
|
|
||||||
|
for (StemmerKnowledgeExperiment.ResultRow row : rows) {
|
||||||
|
assertDoesNotThrow(() -> uniqueRow(rows, resultKey(row)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Finds one unique row by a synthetic key.
|
||||||
|
*
|
||||||
|
* @param rows result rows
|
||||||
|
* @param key synthetic key
|
||||||
|
* @return matching row
|
||||||
|
*/
|
||||||
|
private static StemmerKnowledgeExperiment.ResultRow uniqueRow(final List<StemmerKnowledgeExperiment.ResultRow> rows,
|
||||||
|
final String key) {
|
||||||
|
final Map<String, StemmerKnowledgeExperiment.ResultRow> indexed = rows.stream()
|
||||||
|
.collect(Collectors.toMap(StemmerKnowledgeExperimentTest::resultKey, Function.identity()));
|
||||||
|
final StemmerKnowledgeExperiment.ResultRow row = indexed.get(key);
|
||||||
|
assertNotNull(row);
|
||||||
|
return row;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a lookup key from a row.
|
||||||
|
*
|
||||||
|
* @param row result row
|
||||||
|
* @return lookup key
|
||||||
|
*/
|
||||||
|
private static String resultKey(final StemmerKnowledgeExperiment.ResultRow row) {
|
||||||
|
return resultKey(ReductionMode.valueOf(row.reductionMode()), row.storeOriginal(), row.includeStemInEvaluation(),
|
||||||
|
row.knowledgePercent());
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a lookup key from scenario components.
|
||||||
|
*
|
||||||
|
* @param reductionMode reduction mode
|
||||||
|
* @param storeOriginal whether no-op patches were stored
|
||||||
|
* @param includeStemInEvaluation whether stems were evaluated
|
||||||
|
* @param knowledgePercent knowledge percentage
|
||||||
|
* @return lookup key
|
||||||
|
*/
|
||||||
|
private static String resultKey(final ReductionMode reductionMode, final boolean storeOriginal,
|
||||||
|
final boolean includeStemInEvaluation, final int knowledgePercent) {
|
||||||
|
return reductionMode.name() + '|' + storeOriginal + '|' + includeStemInEvaluation + '|' + knowledgePercent;
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -38,6 +38,8 @@ import static org.junit.jupiter.api.Assertions.assertSame;
|
|||||||
import static org.junit.jupiter.api.Assertions.assertThrows;
|
import static org.junit.jupiter.api.Assertions.assertThrows;
|
||||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||||
import static org.mockito.ArgumentMatchers.any;
|
import static org.mockito.ArgumentMatchers.any;
|
||||||
|
import static org.mockito.ArgumentMatchers.anyInt;
|
||||||
|
import static org.mockito.ArgumentMatchers.eq;
|
||||||
import static org.mockito.Mockito.mock;
|
import static org.mockito.Mockito.mock;
|
||||||
import static org.mockito.Mockito.mockStatic;
|
import static org.mockito.Mockito.mockStatic;
|
||||||
import static org.mockito.Mockito.verify;
|
import static org.mockito.Mockito.verify;
|
||||||
@@ -91,6 +93,8 @@ import org.mockito.MockedStatic;
|
|||||||
@Tag("unit")
|
@Tag("unit")
|
||||||
@Tag("io")
|
@Tag("io")
|
||||||
@Tag("persistence")
|
@Tag("persistence")
|
||||||
|
@Tag("serialization")
|
||||||
|
@Tag("trie")
|
||||||
@DisplayName("StemmerPatchTrieBinaryIO")
|
@DisplayName("StemmerPatchTrieBinaryIO")
|
||||||
class StemmerPatchTrieBinaryIOTest {
|
class StemmerPatchTrieBinaryIOTest {
|
||||||
|
|
||||||
@@ -299,9 +303,19 @@ class StemmerPatchTrieBinaryIOTest {
|
|||||||
"read(Path) must reject null path."),
|
"read(Path) must reject null path."),
|
||||||
() -> assertThrows(NullPointerException.class, () -> StemmerPatchTrieBinaryIO.read((String) null),
|
() -> assertThrows(NullPointerException.class, () -> StemmerPatchTrieBinaryIO.read((String) null),
|
||||||
"read(String) must reject null file name."),
|
"read(String) must reject null file name."),
|
||||||
|
() -> assertThrows(NullPointerException.class,
|
||||||
|
() -> StemmerPatchTrieBinaryIO.read((Path) null, FrequencyTrie.DEFAULT_MAX_EXPANDED_INDEX),
|
||||||
|
"read(Path, int) must reject null path."),
|
||||||
|
() -> assertThrows(NullPointerException.class,
|
||||||
|
() -> StemmerPatchTrieBinaryIO.read((String) null,
|
||||||
|
FrequencyTrie.DEFAULT_MAX_EXPANDED_INDEX),
|
||||||
|
"read(String, int) must reject null file name."),
|
||||||
() -> assertThrows(NullPointerException.class,
|
() -> assertThrows(NullPointerException.class,
|
||||||
() -> StemmerPatchTrieBinaryIO.read((ByteArrayInputStream) null),
|
() -> StemmerPatchTrieBinaryIO.read((ByteArrayInputStream) null),
|
||||||
"read(InputStream) must reject null input stream."));
|
"read(InputStream) must reject null input stream."),
|
||||||
|
() -> assertThrows(NullPointerException.class,
|
||||||
|
() -> StemmerPatchTrieBinaryIO.read((ByteArrayInputStream) null, FrequencyTrie.DEFAULT_MAX_EXPANDED_INDEX),
|
||||||
|
"read(InputStream, int) must reject null input stream."));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -385,6 +399,143 @@ class StemmerPatchTrieBinaryIOTest {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that stream overload with dense span override delegates to the
|
||||||
|
* four-argument readFrom method.
|
||||||
|
*/
|
||||||
|
@SuppressWarnings("unchecked")
|
||||||
|
@Test
|
||||||
|
@DisplayName("Should delegate stream read with dense span override")
|
||||||
|
void shouldDelegateInputStreamReadWithDenseSpanOverride() throws IOException {
|
||||||
|
final FrequencyTrie<String> expectedTrie = mock(FrequencyTrie.class);
|
||||||
|
final byte[] gzipPayload = gzip("binary-content-with-max-expanded-index");
|
||||||
|
|
||||||
|
try (@SuppressWarnings("rawtypes")
|
||||||
|
MockedStatic<FrequencyTrie> mockedStatic = mockStatic(FrequencyTrie.class)) {
|
||||||
|
mockedStatic.when(() -> FrequencyTrie.readFrom(any(DataInputStream.class), any(),
|
||||||
|
any(FrequencyTrie.ValueStreamCodec.class), anyInt())).thenReturn(expectedTrie);
|
||||||
|
|
||||||
|
final FrequencyTrie<String> actualTrie = StemmerPatchTrieBinaryIO
|
||||||
|
.read(new ByteArrayInputStream(gzipPayload), 17);
|
||||||
|
|
||||||
|
assertSame(expectedTrie, actualTrie,
|
||||||
|
"read(InputStream, int) must return the trie produced by FrequencyTrie.readFrom(...).");
|
||||||
|
|
||||||
|
mockedStatic.verify(() -> FrequencyTrie.readFrom(any(DataInputStream.class), any(),
|
||||||
|
any(FrequencyTrie.ValueStreamCodec.class), eq(17)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that path overload with dense span override delegates to the
|
||||||
|
* same method overload with the override parameter.
|
||||||
|
*/
|
||||||
|
@SuppressWarnings("unchecked")
|
||||||
|
@Test
|
||||||
|
@DisplayName("Should delegate path read with dense span override")
|
||||||
|
void shouldDelegatePathReadWithDenseSpanOverride() throws IOException {
|
||||||
|
final FrequencyTrie<String> expectedTrie = mock(FrequencyTrie.class);
|
||||||
|
final Path sourceFile = temporaryDirectory.resolve("input-max-expanded.bin.gz");
|
||||||
|
Files.write(sourceFile, gzip("path-based-max-expanded-index"));
|
||||||
|
|
||||||
|
try (@SuppressWarnings("rawtypes")
|
||||||
|
MockedStatic<FrequencyTrie> mockedStatic = mockStatic(FrequencyTrie.class)) {
|
||||||
|
mockedStatic.when(() -> FrequencyTrie.readFrom(any(DataInputStream.class), any(),
|
||||||
|
any(FrequencyTrie.ValueStreamCodec.class), anyInt())).thenReturn(expectedTrie);
|
||||||
|
|
||||||
|
final FrequencyTrie<String> actualTrie = StemmerPatchTrieBinaryIO.read(sourceFile, 0);
|
||||||
|
|
||||||
|
assertSame(expectedTrie, actualTrie,
|
||||||
|
"read(Path, int) must return the trie produced by FrequencyTrie.readFrom(...).");
|
||||||
|
|
||||||
|
mockedStatic.verify(() -> FrequencyTrie.readFrom(any(DataInputStream.class), any(),
|
||||||
|
any(FrequencyTrie.ValueStreamCodec.class), eq(0)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that string path overload with dense span override delegates to the
|
||||||
|
* same method overload with the override parameter.
|
||||||
|
*/
|
||||||
|
@SuppressWarnings("unchecked")
|
||||||
|
@Test
|
||||||
|
@DisplayName("Should delegate file name read with dense span override")
|
||||||
|
void shouldDelegateStringReadWithDenseSpanOverride() throws IOException {
|
||||||
|
final FrequencyTrie<String> expectedTrie = mock(FrequencyTrie.class);
|
||||||
|
final Path sourceFile = temporaryDirectory.resolve("input-string-max-expanded.bin.gz");
|
||||||
|
Files.write(sourceFile, gzip("string-based-max-expanded-index"));
|
||||||
|
|
||||||
|
try (@SuppressWarnings("rawtypes")
|
||||||
|
MockedStatic<FrequencyTrie> mockedStatic = mockStatic(FrequencyTrie.class)) {
|
||||||
|
mockedStatic.when(() -> FrequencyTrie.readFrom(any(DataInputStream.class), any(),
|
||||||
|
any(FrequencyTrie.ValueStreamCodec.class), anyInt())).thenReturn(expectedTrie);
|
||||||
|
|
||||||
|
final FrequencyTrie<String> actualTrie = StemmerPatchTrieBinaryIO.read(sourceFile.toString(), 32);
|
||||||
|
|
||||||
|
assertSame(expectedTrie, actualTrie,
|
||||||
|
"read(String, int) must return the trie produced by FrequencyTrie.readFrom(...).");
|
||||||
|
|
||||||
|
mockedStatic.verify(() -> FrequencyTrie.readFrom(any(DataInputStream.class), any(),
|
||||||
|
any(FrequencyTrie.ValueStreamCodec.class), eq(32)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that metadata-only read parses and returns the persisted metadata.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@DisplayName("Should read metadata from gzip payload")
|
||||||
|
void shouldReadMetadataFromGzipPayload() throws IOException {
|
||||||
|
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<String>(String[]::new,
|
||||||
|
ReductionSettings.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS));
|
||||||
|
builder.put("run", PatchCommandEncoder.builder().build().encode("running", "run"));
|
||||||
|
final FrequencyTrie<String> trie = builder.build();
|
||||||
|
|
||||||
|
final ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
|
||||||
|
StemmerPatchTrieBinaryIO.write(trie, outputStream);
|
||||||
|
|
||||||
|
final TrieMetadata metadata = StemmerPatchTrieBinaryIO.readMetadata(new ByteArrayInputStream(outputStream.toByteArray()));
|
||||||
|
|
||||||
|
assertEquals(trie.metadata(), metadata,
|
||||||
|
"readMetadata(InputStream) must return the same metadata persisted by write().");
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that metadata can be read from a binary file path.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@DisplayName("Should read metadata from file path")
|
||||||
|
void shouldReadMetadataFromPath() throws IOException {
|
||||||
|
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<String>(String[]::new,
|
||||||
|
ReductionSettings.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS));
|
||||||
|
builder.put("city", PatchCommandEncoder.builder().build().encode("cities", "city"));
|
||||||
|
final FrequencyTrie<String> trie = builder.build();
|
||||||
|
|
||||||
|
final Path sourceFile = temporaryDirectory.resolve("metadata-path.bin.gz");
|
||||||
|
StemmerPatchTrieBinaryIO.write(trie, sourceFile);
|
||||||
|
|
||||||
|
final TrieMetadata metadata = StemmerPatchTrieBinaryIO.readMetadata(sourceFile);
|
||||||
|
assertEquals(trie.metadata(), metadata);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that metadata can be read from a binary file name.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@DisplayName("Should read metadata from file name")
|
||||||
|
void shouldReadMetadataFromStringPath() throws IOException {
|
||||||
|
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<String>(String[]::new,
|
||||||
|
ReductionSettings.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS));
|
||||||
|
builder.put("city", PatchCommandEncoder.builder().build().encode("cities", "city"));
|
||||||
|
final FrequencyTrie<String> trie = builder.build();
|
||||||
|
|
||||||
|
final Path sourceFile = temporaryDirectory.resolve("metadata-string.bin.gz");
|
||||||
|
StemmerPatchTrieBinaryIO.write(trie, sourceFile);
|
||||||
|
|
||||||
|
final TrieMetadata metadata = StemmerPatchTrieBinaryIO.readMetadata(sourceFile.toString());
|
||||||
|
assertEquals(trie.metadata(), metadata);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Verifies that malformed non-GZip input is reported as an I/O failure.
|
* Verifies that malformed non-GZip input is reported as an I/O failure.
|
||||||
*/
|
*/
|
||||||
|
|||||||
@@ -37,6 +37,7 @@ import static org.junit.jupiter.api.Assertions.assertFalse;
|
|||||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||||
import static org.junit.jupiter.api.Assertions.assertNull;
|
import static org.junit.jupiter.api.Assertions.assertNull;
|
||||||
import static org.junit.jupiter.api.Assertions.assertThrows;
|
import static org.junit.jupiter.api.Assertions.assertThrows;
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||||
|
|
||||||
import java.io.BufferedReader;
|
import java.io.BufferedReader;
|
||||||
import java.io.ByteArrayInputStream;
|
import java.io.ByteArrayInputStream;
|
||||||
@@ -46,13 +47,17 @@ import java.io.InputStreamReader;
|
|||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
|
import java.util.Arrays;
|
||||||
import java.util.LinkedHashMap;
|
import java.util.LinkedHashMap;
|
||||||
import java.util.LinkedHashSet;
|
import java.util.LinkedHashSet;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
import java.util.stream.IntStream;
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
|
import java.util.zip.GZIPInputStream;
|
||||||
|
|
||||||
|
import org.egothor.stemmer.StemmerPatchTrieLoader.Language;
|
||||||
import org.junit.jupiter.api.DisplayName;
|
import org.junit.jupiter.api.DisplayName;
|
||||||
import org.junit.jupiter.api.Nested;
|
import org.junit.jupiter.api.Nested;
|
||||||
import org.junit.jupiter.api.Tag;
|
import org.junit.jupiter.api.Tag;
|
||||||
@@ -77,11 +82,13 @@ import org.junit.jupiter.params.provider.MethodSource;
|
|||||||
* <li>comment-aware parsing delegated to {@link StemmerDictionaryParser}</li>
|
* <li>comment-aware parsing delegated to {@link StemmerDictionaryParser}</li>
|
||||||
* <li>preservation of all valid stem candidates returned by
|
* <li>preservation of all valid stem candidates returned by
|
||||||
* {@link FrequencyTrie#getAll(String)}</li>
|
* {@link FrequencyTrie#getAll(String)}</li>
|
||||||
|
* <li>the current bundled language set, including right-to-left metadata</li>
|
||||||
* </ul>
|
* </ul>
|
||||||
*/
|
*/
|
||||||
@Tag("unit")
|
|
||||||
@Tag("integration")
|
@Tag("integration")
|
||||||
@Tag("stemmer")
|
@Tag("stemmer")
|
||||||
|
@Tag("io")
|
||||||
|
@Tag("parser")
|
||||||
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
|
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
|
||||||
final class StemmerPatchTrieLoaderTest {
|
final class StemmerPatchTrieLoaderTest {
|
||||||
|
|
||||||
@@ -97,126 +104,51 @@ final class StemmerPatchTrieLoaderTest {
|
|||||||
*/
|
*/
|
||||||
private static final ReductionMode DEFAULT_REDUCTION_MODE = ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS;
|
private static final ReductionMode DEFAULT_REDUCTION_MODE = ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Representative number of bundled words used for overload consistency checks.
|
||||||
|
*/
|
||||||
|
private static final int REPRESENTATIVE_BUNDLED_WORD_COUNT = 25;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Provides arguments for bundled dictionary verification across both supported
|
* Provides arguments for bundled dictionary verification across both supported
|
||||||
* getAll-preserving reduction modes.
|
* getAll-preserving reduction modes.
|
||||||
*
|
*
|
||||||
|
* <p>
|
||||||
|
* The stream is derived directly from the current {@link Language} enum so the
|
||||||
|
* test suite follows the supported bundled language set automatically.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
* @return parameter stream
|
* @return parameter stream
|
||||||
*/
|
*/
|
||||||
static Stream<Arguments> bundledDictionaryCases() {
|
static Stream<Arguments> bundledDictionaryCases() {
|
||||||
return Stream.of(
|
final ReductionMode[] reductionModes = new ReductionMode[] {
|
||||||
// 01
|
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS,
|
||||||
Arguments.of("01-da_dk-ranked", StemmerPatchTrieLoader.Language.DA_DK,
|
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS };
|
||||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
|
||||||
|
|
||||||
// 02
|
return Arrays.stream(StemmerPatchTrieLoader.Language.values())
|
||||||
Arguments.of("02-de_de-ranked", StemmerPatchTrieLoader.Language.DE_DE,
|
.flatMap(
|
||||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
language -> IntStream.range(0, reductionModes.length)
|
||||||
|
.mapToObj(index -> Arguments.of(
|
||||||
// 03
|
String.format("%02d-%s-%s", index + 1, language.name().toLowerCase(),
|
||||||
Arguments.of("03-es_es-ranked", StemmerPatchTrieLoader.Language.ES_ES,
|
reductionModes[index].name().toLowerCase()),
|
||||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
language, reductionModes[index])));
|
||||||
|
|
||||||
// 04
|
|
||||||
Arguments.of("04-fr_fr-ranked", StemmerPatchTrieLoader.Language.FR_FR,
|
|
||||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
|
||||||
|
|
||||||
// 05
|
|
||||||
Arguments.of("05-it_it-ranked", StemmerPatchTrieLoader.Language.IT_IT,
|
|
||||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
|
||||||
|
|
||||||
// 06
|
|
||||||
Arguments.of("06-nl_nl-ranked", StemmerPatchTrieLoader.Language.NL_NL,
|
|
||||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
|
||||||
|
|
||||||
// 07
|
|
||||||
Arguments.of("07-no_no-ranked", StemmerPatchTrieLoader.Language.NO_NO,
|
|
||||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
|
||||||
|
|
||||||
// 08
|
|
||||||
Arguments.of("08-pt_pt-ranked", StemmerPatchTrieLoader.Language.PT_PT,
|
|
||||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
|
||||||
|
|
||||||
// 09
|
|
||||||
Arguments.of("09-ru_ru-ranked", StemmerPatchTrieLoader.Language.RU_RU,
|
|
||||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
|
||||||
|
|
||||||
// 10
|
|
||||||
Arguments.of("10-sv_se-ranked", StemmerPatchTrieLoader.Language.SV_SE,
|
|
||||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
|
||||||
|
|
||||||
// 11
|
|
||||||
Arguments.of("11-us_uk-ranked", StemmerPatchTrieLoader.Language.US_UK,
|
|
||||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
|
||||||
|
|
||||||
// 12
|
|
||||||
Arguments.of("12-us_uk_profi-ranked", StemmerPatchTrieLoader.Language.US_UK_PROFI,
|
|
||||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
|
||||||
|
|
||||||
// 13
|
|
||||||
Arguments.of("13-da_dk-unordered", StemmerPatchTrieLoader.Language.DA_DK,
|
|
||||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS),
|
|
||||||
|
|
||||||
// 14
|
|
||||||
Arguments.of("14-de_de-unordered", StemmerPatchTrieLoader.Language.DE_DE,
|
|
||||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS),
|
|
||||||
|
|
||||||
// 15
|
|
||||||
Arguments.of("15-es_es-unordered", StemmerPatchTrieLoader.Language.ES_ES,
|
|
||||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS),
|
|
||||||
|
|
||||||
// 16
|
|
||||||
Arguments.of("16-fr_fr-unordered", StemmerPatchTrieLoader.Language.FR_FR,
|
|
||||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS),
|
|
||||||
|
|
||||||
// 17
|
|
||||||
Arguments.of("17-it_it-unordered", StemmerPatchTrieLoader.Language.IT_IT,
|
|
||||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS),
|
|
||||||
|
|
||||||
// 18
|
|
||||||
Arguments.of("18-nl_nl-unordered", StemmerPatchTrieLoader.Language.NL_NL,
|
|
||||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS),
|
|
||||||
|
|
||||||
// 19
|
|
||||||
Arguments.of("19-no_no-unordered", StemmerPatchTrieLoader.Language.NO_NO,
|
|
||||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS),
|
|
||||||
|
|
||||||
// 20
|
|
||||||
Arguments.of("20-pt_pt-unordered", StemmerPatchTrieLoader.Language.PT_PT,
|
|
||||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS),
|
|
||||||
|
|
||||||
// 21
|
|
||||||
Arguments.of("21-ru_ru-unordered", StemmerPatchTrieLoader.Language.RU_RU,
|
|
||||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS),
|
|
||||||
|
|
||||||
// 22
|
|
||||||
Arguments.of("22-sv_se-unordered", StemmerPatchTrieLoader.Language.SV_SE,
|
|
||||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS),
|
|
||||||
|
|
||||||
// 23
|
|
||||||
Arguments.of("23-us_uk-unordered", StemmerPatchTrieLoader.Language.US_UK,
|
|
||||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS),
|
|
||||||
|
|
||||||
// 24
|
|
||||||
Arguments.of("24-us_uk_profi-unordered", StemmerPatchTrieLoader.Language.US_UK_PROFI,
|
|
||||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Provides representative bundled languages for overload consistency checks.
|
* Provides representative bundled languages for overload consistency checks.
|
||||||
*
|
*
|
||||||
|
* <p>
|
||||||
|
* The sample intentionally covers both traversal directions.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
* @return parameter stream
|
* @return parameter stream
|
||||||
*/
|
*/
|
||||||
static Stream<Arguments> bundledLanguageSamples() {
|
static Stream<Arguments> bundledLanguageSamples() {
|
||||||
return Stream.of(
|
return Stream.of(Arguments.of("01-us_uk", StemmerPatchTrieLoader.Language.US_UK),
|
||||||
// 01
|
|
||||||
Arguments.of("01-us_uk", StemmerPatchTrieLoader.Language.US_UK),
|
|
||||||
|
|
||||||
// 02
|
|
||||||
Arguments.of("02-de_de", StemmerPatchTrieLoader.Language.DE_DE),
|
Arguments.of("02-de_de", StemmerPatchTrieLoader.Language.DE_DE),
|
||||||
|
Arguments.of("03-fa_ir", StemmerPatchTrieLoader.Language.FA_IR),
|
||||||
// 03
|
Arguments.of("04-he_il", StemmerPatchTrieLoader.Language.HE_IL),
|
||||||
Arguments.of("03-fr_fr", StemmerPatchTrieLoader.Language.FR_FR));
|
Arguments.of("05-yi", StemmerPatchTrieLoader.Language.YI));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -227,107 +159,97 @@ final class StemmerPatchTrieLoaderTest {
|
|||||||
static Stream<Arguments> nullContractCases() {
|
static Stream<Arguments> nullContractCases() {
|
||||||
final ReductionSettings settings = ReductionSettings.withDefaults(DEFAULT_REDUCTION_MODE);
|
final ReductionSettings settings = ReductionSettings.withDefaults(DEFAULT_REDUCTION_MODE);
|
||||||
final FrequencyTrie<String> trie = new FrequencyTrie.Builder<String>(String[]::new, settings)
|
final FrequencyTrie<String> trie = new FrequencyTrie.Builder<String>(String[]::new, settings)
|
||||||
.put("running", new PatchCommandEncoder().encode("running", "run")).build();
|
.put("running", PatchCommandEncoder.builder().build().encode("running", "run")).build();
|
||||||
|
|
||||||
return Stream.of(
|
return Stream.of(
|
||||||
// 01
|
|
||||||
Arguments.of("01-load-language-settings",
|
Arguments.of("01-load-language-settings",
|
||||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.load((StemmerPatchTrieLoader.Language) null,
|
(ExecutableOperation) () -> StemmerPatchTrieLoader.load((StemmerPatchTrieLoader.Language) null,
|
||||||
true, settings),
|
true, settings),
|
||||||
"language"),
|
"language"),
|
||||||
|
|
||||||
// 02
|
|
||||||
Arguments.of("02-load-language-mode",
|
Arguments.of("02-load-language-mode",
|
||||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.load((StemmerPatchTrieLoader.Language) null,
|
(ExecutableOperation) () -> StemmerPatchTrieLoader.load((StemmerPatchTrieLoader.Language) null,
|
||||||
true, DEFAULT_REDUCTION_MODE),
|
true, DEFAULT_REDUCTION_MODE),
|
||||||
"language"),
|
"language"),
|
||||||
|
|
||||||
// 03
|
|
||||||
Arguments.of("03-load-language-null-settings",
|
Arguments.of("03-load-language-null-settings",
|
||||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(StemmerPatchTrieLoader.Language.US_UK,
|
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(StemmerPatchTrieLoader.Language.US_UK,
|
||||||
true, (ReductionSettings) null),
|
true, (ReductionSettings) null),
|
||||||
"reductionSettings"),
|
"reductionSettings"),
|
||||||
|
|
||||||
// 04
|
|
||||||
Arguments.of("04-load-language-null-mode",
|
Arguments.of("04-load-language-null-mode",
|
||||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(StemmerPatchTrieLoader.Language.US_UK,
|
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(StemmerPatchTrieLoader.Language.US_UK,
|
||||||
true, (ReductionMode) null),
|
true, (ReductionMode) null),
|
||||||
"reductionMode"),
|
"reductionMode"),
|
||||||
|
|
||||||
// 05
|
|
||||||
Arguments.of("05-load-path-settings",
|
Arguments.of("05-load-path-settings",
|
||||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.load((Path) null, true, settings), "path"),
|
(ExecutableOperation) () -> StemmerPatchTrieLoader.load((Path) null, true, settings), "path"),
|
||||||
|
|
||||||
// 06
|
|
||||||
Arguments.of("06-load-path-mode",
|
Arguments.of("06-load-path-mode",
|
||||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.load((Path) null, true,
|
(ExecutableOperation) () -> StemmerPatchTrieLoader.load((Path) null, true,
|
||||||
DEFAULT_REDUCTION_MODE),
|
DEFAULT_REDUCTION_MODE),
|
||||||
"path"),
|
"path"),
|
||||||
|
|
||||||
// 07
|
|
||||||
Arguments.of("07-load-path-null-settings",
|
Arguments.of("07-load-path-null-settings",
|
||||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(tempPath(), true,
|
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(tempPath(), true,
|
||||||
(ReductionSettings) null),
|
(ReductionSettings) null),
|
||||||
"reductionSettings"),
|
"reductionSettings"),
|
||||||
|
|
||||||
// 08
|
|
||||||
Arguments.of("08-load-path-null-mode",
|
Arguments.of("08-load-path-null-mode",
|
||||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(tempPath(), true, (ReductionMode) null),
|
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(tempPath(), true, (ReductionMode) null),
|
||||||
"reductionMode"),
|
"reductionMode"),
|
||||||
|
|
||||||
// 09
|
|
||||||
Arguments.of("09-load-string-settings",
|
Arguments.of("09-load-string-settings",
|
||||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.load((String) null, true, settings),
|
(ExecutableOperation) () -> StemmerPatchTrieLoader.load((String) null, true, settings),
|
||||||
"fileName"),
|
StemmerPatchTrieLoader.FILENAME_REQUIRED),
|
||||||
|
|
||||||
// 10
|
|
||||||
Arguments.of("10-load-string-mode",
|
Arguments.of("10-load-string-mode",
|
||||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.load((String) null, true,
|
(ExecutableOperation) () -> StemmerPatchTrieLoader.load((String) null, true,
|
||||||
DEFAULT_REDUCTION_MODE),
|
DEFAULT_REDUCTION_MODE),
|
||||||
"fileName"),
|
StemmerPatchTrieLoader.FILENAME_REQUIRED),
|
||||||
|
|
||||||
// 11
|
|
||||||
Arguments.of("11-load-string-null-settings",
|
Arguments.of("11-load-string-null-settings",
|
||||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(tempPath().toString(), true,
|
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(tempPath().toString(), true,
|
||||||
(ReductionSettings) null),
|
(ReductionSettings) null),
|
||||||
"reductionSettings"),
|
"reductionSettings"),
|
||||||
|
|
||||||
// 12
|
|
||||||
Arguments.of("12-load-string-null-mode",
|
Arguments.of("12-load-string-null-mode",
|
||||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(tempPath().toString(), true,
|
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(tempPath().toString(), true,
|
||||||
(ReductionMode) null),
|
(ReductionMode) null),
|
||||||
"reductionMode"),
|
"reductionMode"),
|
||||||
|
|
||||||
// 13
|
|
||||||
Arguments.of("13-load-binary-path",
|
Arguments.of("13-load-binary-path",
|
||||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinary((Path) null), "path"),
|
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinary((Path) null), "path"),
|
||||||
|
|
||||||
// 14
|
|
||||||
Arguments.of("14-load-binary-string",
|
Arguments.of("14-load-binary-string",
|
||||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinary((String) null), "fileName"),
|
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinary((String) null),
|
||||||
|
StemmerPatchTrieLoader.FILENAME_REQUIRED),
|
||||||
// 15
|
Arguments.of("15-load-binary-path-override",
|
||||||
Arguments.of("15-load-binary-stream",
|
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinary((Path) null, FrequencyTrie.DEFAULT_MAX_EXPANDED_INDEX),
|
||||||
|
"path"),
|
||||||
|
Arguments.of("16-load-binary-string-override",
|
||||||
|
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinary((String) null,
|
||||||
|
FrequencyTrie.DEFAULT_MAX_EXPANDED_INDEX),
|
||||||
|
StemmerPatchTrieLoader.FILENAME_REQUIRED),
|
||||||
|
Arguments.of("17-load-binary-stream",
|
||||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinary((InputStream) null),
|
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinary((InputStream) null),
|
||||||
"inputStream"),
|
"inputStream"),
|
||||||
|
Arguments.of("18-save-binary-null-trie-path",
|
||||||
// 16
|
|
||||||
Arguments.of("16-save-binary-null-trie-path",
|
|
||||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.saveBinary(null, tempPath()), "trie"),
|
(ExecutableOperation) () -> StemmerPatchTrieLoader.saveBinary(null, tempPath()), "trie"),
|
||||||
|
Arguments.of("19-save-binary-null-path",
|
||||||
// 17
|
|
||||||
Arguments.of("17-save-binary-null-path",
|
|
||||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.saveBinary(trie, (Path) null), "path"),
|
(ExecutableOperation) () -> StemmerPatchTrieLoader.saveBinary(trie, (Path) null), "path"),
|
||||||
|
Arguments.of("20-save-binary-null-trie-string",
|
||||||
// 18
|
|
||||||
Arguments.of("18-save-binary-null-trie-string",
|
|
||||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.saveBinary(null, tempPath().toString()),
|
(ExecutableOperation) () -> StemmerPatchTrieLoader.saveBinary(null, tempPath().toString()),
|
||||||
"trie"),
|
"trie"),
|
||||||
|
Arguments.of("21-save-binary-null-string",
|
||||||
// 19
|
|
||||||
Arguments.of("19-save-binary-null-string",
|
|
||||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.saveBinary(trie, (String) null),
|
(ExecutableOperation) () -> StemmerPatchTrieLoader.saveBinary(trie, (String) null),
|
||||||
"fileName"));
|
StemmerPatchTrieLoader.FILENAME_REQUIRED),
|
||||||
|
Arguments.of("22-load-language-null-metadata",
|
||||||
|
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(StemmerPatchTrieLoader.Language.US_UK,
|
||||||
|
true, (TrieMetadata) null),
|
||||||
|
"metadata"),
|
||||||
|
Arguments.of("23-load-path-null-metadata",
|
||||||
|
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(tempPath(), true, (TrieMetadata) null),
|
||||||
|
"metadata"),
|
||||||
|
Arguments.of("24-load-string-null-metadata",
|
||||||
|
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(tempPath().toString(), true,
|
||||||
|
(TrieMetadata) null),
|
||||||
|
"metadata"),
|
||||||
|
Arguments.of("25-load-binary-metadata-path-null",
|
||||||
|
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinaryMetadata((Path) null), "path"),
|
||||||
|
Arguments.of("26-load-binary-metadata-string-null",
|
||||||
|
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinaryMetadata((String) null),
|
||||||
|
StemmerPatchTrieLoader.FILENAME_REQUIRED),
|
||||||
|
Arguments.of("27-load-binary-metadata-stream-null",
|
||||||
|
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinaryMetadata((InputStream) null),
|
||||||
|
"inputStream"));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -409,9 +331,9 @@ final class StemmerPatchTrieLoaderTest {
|
|||||||
@DisplayName("Path and String overloads must load equivalent tries")
|
@DisplayName("Path and String overloads must load equivalent tries")
|
||||||
void shouldLoadEquivalentTrieFromPathAndStringOverloads() throws IOException {
|
void shouldLoadEquivalentTrieFromPathAndStringOverloads() throws IOException {
|
||||||
final Path dictionaryFile = writeDictionary("""
|
final Path dictionaryFile = writeDictionary("""
|
||||||
run running runs runner
|
run running runs runner
|
||||||
play playing played plays
|
play playing played plays
|
||||||
city cities
|
city cities
|
||||||
""");
|
""");
|
||||||
|
|
||||||
final ReductionSettings settings = ReductionSettings.withDefaults(DEFAULT_REDUCTION_MODE);
|
final ReductionSettings settings = ReductionSettings.withDefaults(DEFAULT_REDUCTION_MODE);
|
||||||
@@ -432,6 +354,31 @@ final class StemmerPatchTrieLoaderTest {
|
|||||||
"run");
|
"run");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that metadata-driven loading keeps all configuration dimensions in
|
||||||
|
* one explicit object and applies them during compilation.
|
||||||
|
*
|
||||||
|
* @throws IOException if the test file cannot be written or read
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@DisplayName("Metadata overload must drive case and diacritic normalization")
|
||||||
|
void shouldLoadUsingExplicitMetadataConfiguration() throws IOException {
|
||||||
|
final Path dictionaryFile = writeDictionary("""
|
||||||
|
mÁma mamA mámě
|
||||||
|
""");
|
||||||
|
final TrieMetadata metadata = TrieMetadata.forCompilation(WordTraversalDirection.BACKWARD,
|
||||||
|
ReductionSettings.withDefaults(DEFAULT_REDUCTION_MODE), DiacriticProcessingMode.REMOVE,
|
||||||
|
CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT);
|
||||||
|
|
||||||
|
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(dictionaryFile, true, metadata);
|
||||||
|
|
||||||
|
assertAll(() -> assertEquals(DiacriticProcessingMode.REMOVE, trie.metadata().diacriticProcessingMode()),
|
||||||
|
() -> assertEquals(CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT,
|
||||||
|
trie.metadata().caseProcessingMode()),
|
||||||
|
() -> assertNotNull(trie.get("MÁMĚ")),
|
||||||
|
() -> assertNotNull(trie.get("mame")));
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Verifies that the loader honors {@code storeOriginal=true} by inserting the
|
* Verifies that the loader honors {@code storeOriginal=true} by inserting the
|
||||||
* canonical no-op patch for the stem itself.
|
* canonical no-op patch for the stem itself.
|
||||||
@@ -442,7 +389,7 @@ final class StemmerPatchTrieLoaderTest {
|
|||||||
@DisplayName("storeOriginal=true must make the stem itself resolvable through the no-op patch")
|
@DisplayName("storeOriginal=true must make the stem itself resolvable through the no-op patch")
|
||||||
void shouldStoreOriginalStemWhenRequested() throws IOException {
|
void shouldStoreOriginalStemWhenRequested() throws IOException {
|
||||||
final Path dictionaryFile = writeDictionary("""
|
final Path dictionaryFile = writeDictionary("""
|
||||||
run running runs
|
run running runs
|
||||||
""");
|
""");
|
||||||
|
|
||||||
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(dictionaryFile, true,
|
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(dictionaryFile, true,
|
||||||
@@ -467,8 +414,8 @@ final class StemmerPatchTrieLoaderTest {
|
|||||||
@DisplayName("storeOriginal=false must not insert the stem itself unless present as a variant elsewhere")
|
@DisplayName("storeOriginal=false must not insert the stem itself unless present as a variant elsewhere")
|
||||||
void shouldNotStoreOriginalStemWhenDisabled() throws IOException {
|
void shouldNotStoreOriginalStemWhenDisabled() throws IOException {
|
||||||
final Path dictionaryFile = writeDictionary("""
|
final Path dictionaryFile = writeDictionary("""
|
||||||
run running runs
|
run running runs
|
||||||
play playing played plays
|
play playing played plays
|
||||||
""");
|
""");
|
||||||
|
|
||||||
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(dictionaryFile, false,
|
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(dictionaryFile, false,
|
||||||
@@ -480,6 +427,29 @@ final class StemmerPatchTrieLoaderTest {
|
|||||||
"Variants must still reconstruct the proper stem.");
|
"Variants must still reconstruct the proper stem.");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that the loader honors forward traversal for right-to-left
|
||||||
|
* dictionaries loaded from filesystem overloads.
|
||||||
|
*
|
||||||
|
* @throws IOException if the test file cannot be written or read
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@DisplayName("Explicit right-to-left loading must use forward traversal semantics")
|
||||||
|
void shouldUseForwardTraversalForExplicitRightToLeftLoading() throws IOException {
|
||||||
|
final Path dictionaryFile = writeDictionary("""
|
||||||
|
كتب كتابة كتاب
|
||||||
|
""");
|
||||||
|
|
||||||
|
final ReductionSettings settings = ReductionSettings.withDefaults(DEFAULT_REDUCTION_MODE);
|
||||||
|
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(dictionaryFile, true, settings,
|
||||||
|
WordTraversalDirection.FORWARD);
|
||||||
|
|
||||||
|
assertEquals(WordTraversalDirection.FORWARD, trie.traversalDirection(),
|
||||||
|
"Right-to-left loading must produce a forward-traversed trie.");
|
||||||
|
assertEquals(Set.of("كتب"), reconstructAllStemCandidates(trie, "كتابة"),
|
||||||
|
"Patch reconstruction must use the trie traversal direction.");
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Verifies that comment syntax documented by the loader is effectively honored
|
* Verifies that comment syntax documented by the loader is effectively honored
|
||||||
* through delegated parsing.
|
* through delegated parsing.
|
||||||
@@ -492,10 +462,10 @@ final class StemmerPatchTrieLoaderTest {
|
|||||||
final Path dictionaryFile = writeDictionary("""
|
final Path dictionaryFile = writeDictionary("""
|
||||||
# full-line hash comment
|
# full-line hash comment
|
||||||
// full-line slash comment
|
// full-line slash comment
|
||||||
run running runs // inline slash comment
|
run running runs // inline slash comment
|
||||||
play playing played # inline hash comment
|
play playing played # inline hash comment
|
||||||
|
|
||||||
city cities
|
city cities
|
||||||
""");
|
""");
|
||||||
|
|
||||||
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(dictionaryFile, true,
|
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(dictionaryFile, true,
|
||||||
@@ -518,9 +488,9 @@ final class StemmerPatchTrieLoaderTest {
|
|||||||
@DisplayName("Binary save and load overloads must preserve trie semantics")
|
@DisplayName("Binary save and load overloads must preserve trie semantics")
|
||||||
void shouldRoundTripBinaryTrieAcrossAllBinaryOverloads() throws IOException {
|
void shouldRoundTripBinaryTrieAcrossAllBinaryOverloads() throws IOException {
|
||||||
final Path dictionaryFile = writeDictionary("""
|
final Path dictionaryFile = writeDictionary("""
|
||||||
run running runs runner
|
run running runs runner
|
||||||
city cities
|
city cities
|
||||||
study studies studying
|
study studies studying
|
||||||
""");
|
""");
|
||||||
final Path binaryFile = tempDir.resolve("stemmer-trie.bin.gz");
|
final Path binaryFile = tempDir.resolve("stemmer-trie.bin.gz");
|
||||||
|
|
||||||
@@ -539,6 +509,53 @@ final class StemmerPatchTrieLoaderTest {
|
|||||||
assertTriePatchSemanticsEqual(original, fromString, "run", "running", "runner", "cities", "studying");
|
assertTriePatchSemanticsEqual(original, fromString, "run", "running", "runner", "cities", "studying");
|
||||||
assertTriePatchSemanticsEqual(original, fromStream, "run", "running", "runner", "cities", "studying");
|
assertTriePatchSemanticsEqual(original, fromStream, "run", "running", "runner", "cities", "studying");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
final TrieMetadata metadataFromPath = StemmerPatchTrieLoader.loadBinaryMetadata(binaryFile);
|
||||||
|
final TrieMetadata metadataFromString = StemmerPatchTrieLoader.loadBinaryMetadata(binaryFile.toString());
|
||||||
|
try (InputStream metadataInputStream = new ByteArrayInputStream(binaryBytes)) {
|
||||||
|
final TrieMetadata metadataFromStream = StemmerPatchTrieLoader.loadBinaryMetadata(metadataInputStream);
|
||||||
|
assertAll(() -> assertEquals(original.metadata(), metadataFromPath),
|
||||||
|
() -> assertEquals(original.metadata(), metadataFromString),
|
||||||
|
() -> assertEquals(original.metadata(), metadataFromStream));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that binary load overloads with an explicit dense lookup span
|
||||||
|
* preserve trie semantics while honoring the dense-layout override.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@DisplayName("Binary dense-span override overloads should load equivalent tries")
|
||||||
|
void shouldLoadBinaryWithDenseSpanOverrideOverloads() throws IOException {
|
||||||
|
final Path dictionaryFile = writeDictionary("""
|
||||||
|
run running runs runner
|
||||||
|
city cities
|
||||||
|
study studies studying
|
||||||
|
""");
|
||||||
|
final Path binaryFile = tempDir.resolve("stemmer-trie-overrides.bin.gz");
|
||||||
|
|
||||||
|
final FrequencyTrie<String> original = StemmerPatchTrieLoader.load(dictionaryFile, true,
|
||||||
|
DEFAULT_REDUCTION_MODE);
|
||||||
|
|
||||||
|
StemmerPatchTrieLoader.saveBinary(original, binaryFile);
|
||||||
|
|
||||||
|
final FrequencyTrie<String> fromPathDefault = StemmerPatchTrieLoader.loadBinary(binaryFile);
|
||||||
|
final FrequencyTrie<String> fromPathDefaultByNegative = StemmerPatchTrieLoader.loadBinary(binaryFile,
|
||||||
|
FrequencyTrie.DEFAULT_MAX_EXPANDED_INDEX);
|
||||||
|
final FrequencyTrie<String> fromPathNoDense = StemmerPatchTrieLoader.loadBinary(binaryFile, 0);
|
||||||
|
final FrequencyTrie<String> fromStringNoDense = StemmerPatchTrieLoader.loadBinary(binaryFile.toString(), 0);
|
||||||
|
|
||||||
|
assertTriePatchSemanticsEqual(original, fromPathDefault, "run", "running", "runner", "cities", "studying");
|
||||||
|
assertTriePatchSemanticsEqual(original, fromPathDefaultByNegative, "run", "running", "runner", "cities",
|
||||||
|
"studying");
|
||||||
|
assertTriePatchSemanticsEqual(original, fromPathNoDense, "run", "running", "runner", "cities", "studying");
|
||||||
|
assertTriePatchSemanticsEqual(original, fromStringNoDense, "run", "running", "runner", "cities",
|
||||||
|
"studying");
|
||||||
|
|
||||||
|
assertFalse(fromPathNoDense.root().hasDenseLookup(),
|
||||||
|
"Zero span should disable dense lookup on the loaded root.");
|
||||||
|
assertFalse(fromStringNoDense.root().hasDenseLookup(),
|
||||||
|
"Zero span should disable dense lookup on the loaded root.");
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -559,9 +576,57 @@ final class StemmerPatchTrieLoaderTest {
|
|||||||
* Bundled dictionary integration tests.
|
* Bundled dictionary integration tests.
|
||||||
*/
|
*/
|
||||||
@Nested
|
@Nested
|
||||||
|
@Tag("slow")
|
||||||
@DisplayName("Bundled dictionaries")
|
@DisplayName("Bundled dictionaries")
|
||||||
final class BundledDictionaryTests {
|
final class BundledDictionaryTests {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that the current language enumeration exactly matches the bundled
|
||||||
|
* language set expected by this project revision.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@DisplayName("Language enum must expose the current bundled language set")
|
||||||
|
void shouldExposeCurrentBundledLanguageSet() {
|
||||||
|
final Set<StemmerPatchTrieLoader.Language> expectedLanguages = new LinkedHashSet<StemmerPatchTrieLoader.Language>(
|
||||||
|
Arrays.asList(StemmerPatchTrieLoader.Language.CS_CZ, StemmerPatchTrieLoader.Language.DA_DK,
|
||||||
|
StemmerPatchTrieLoader.Language.DE_DE, StemmerPatchTrieLoader.Language.ES_ES,
|
||||||
|
StemmerPatchTrieLoader.Language.FA_IR, StemmerPatchTrieLoader.Language.FI_FI,
|
||||||
|
StemmerPatchTrieLoader.Language.FR_FR, StemmerPatchTrieLoader.Language.HE_IL,
|
||||||
|
StemmerPatchTrieLoader.Language.HU_HU, StemmerPatchTrieLoader.Language.IT_IT,
|
||||||
|
StemmerPatchTrieLoader.Language.NB_NO, StemmerPatchTrieLoader.Language.NL_NL,
|
||||||
|
StemmerPatchTrieLoader.Language.NN_NO, StemmerPatchTrieLoader.Language.PL_PL,
|
||||||
|
StemmerPatchTrieLoader.Language.PT_PT, StemmerPatchTrieLoader.Language.RU_RU,
|
||||||
|
StemmerPatchTrieLoader.Language.SV_SE, StemmerPatchTrieLoader.Language.UK_UA,
|
||||||
|
StemmerPatchTrieLoader.Language.US_UK, StemmerPatchTrieLoader.Language.YI));
|
||||||
|
|
||||||
|
final Set<StemmerPatchTrieLoader.Language> actualLanguages = new LinkedHashSet<StemmerPatchTrieLoader.Language>(
|
||||||
|
Arrays.asList(StemmerPatchTrieLoader.Language.values()));
|
||||||
|
|
||||||
|
assertEquals(expectedLanguages, actualLanguages,
|
||||||
|
"The bundled language enum must match the project's supported language set exactly.");
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that the right-to-left metadata is correctly assigned for the
|
||||||
|
* currently supported bundled languages.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@DisplayName("Language enum must mark right-to-left bundled languages correctly")
|
||||||
|
void shouldExposeCorrectRightToLeftMetadata() {
|
||||||
|
final Set<StemmerPatchTrieLoader.Language> expectedRightToLeftLanguages = Set.of(
|
||||||
|
StemmerPatchTrieLoader.Language.FA_IR, StemmerPatchTrieLoader.Language.HE_IL,
|
||||||
|
StemmerPatchTrieLoader.Language.YI);
|
||||||
|
|
||||||
|
for (StemmerPatchTrieLoader.Language language : StemmerPatchTrieLoader.Language.values()) {
|
||||||
|
if (expectedRightToLeftLanguages.contains(language)) {
|
||||||
|
assertTrue(language.isRightToLeft(), () -> language.name() + " must be marked as right-to-left.");
|
||||||
|
} else {
|
||||||
|
assertFalse(language.isRightToLeft(),
|
||||||
|
() -> language.name() + " must not be marked as right-to-left.");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Verifies that each bundled dictionary compiles into a trie whose
|
* Verifies that each bundled dictionary compiles into a trie whose
|
||||||
* {@link FrequencyTrie#getAll(String)} results still reconstruct exactly the
|
* {@link FrequencyTrie#getAll(String)} results still reconstruct exactly the
|
||||||
@@ -586,6 +651,8 @@ final class StemmerPatchTrieLoaderTest {
|
|||||||
|
|
||||||
assertNotNull(trie, "Compiled trie must be created.");
|
assertNotNull(trie, "Compiled trie must be created.");
|
||||||
assertFalse(expectedStemsByWord.isEmpty(), "Bundled dictionary must not be empty.");
|
assertFalse(expectedStemsByWord.isEmpty(), "Bundled dictionary must not be empty.");
|
||||||
|
assertEquals(language.isRightToLeft() ? WordTraversalDirection.FORWARD : WordTraversalDirection.BACKWARD,
|
||||||
|
trie.traversalDirection(), "Trie traversal direction must match language metadata.");
|
||||||
|
|
||||||
for (Map.Entry<String, Set<String>> entry : expectedStemsByWord.entrySet()) {
|
for (Map.Entry<String, Set<String>> entry : expectedStemsByWord.entrySet()) {
|
||||||
final String word = entry.getKey();
|
final String word = entry.getKey();
|
||||||
@@ -619,13 +686,12 @@ final class StemmerPatchTrieLoaderTest {
|
|||||||
final FrequencyTrie<String> viaMode = StemmerPatchTrieLoader.load(language, true, DEFAULT_REDUCTION_MODE);
|
final FrequencyTrie<String> viaMode = StemmerPatchTrieLoader.load(language, true, DEFAULT_REDUCTION_MODE);
|
||||||
|
|
||||||
final Map<String, Set<String>> expectedStemsByWord = readExpectedStems(language);
|
final Map<String, Set<String>> expectedStemsByWord = readExpectedStems(language);
|
||||||
final int verifiedWords = 25;
|
|
||||||
int counter = 0;
|
int counter = 0;
|
||||||
|
|
||||||
for (Map.Entry<String, Set<String>> entry : expectedStemsByWord.entrySet()) {
|
for (Map.Entry<String, Set<String>> entry : expectedStemsByWord.entrySet()) {
|
||||||
assertTriePatchSemanticsEqual(viaSettings, viaMode, entry.getKey());
|
assertTriePatchSemanticsEqual(viaSettings, viaMode, entry.getKey());
|
||||||
counter++;
|
counter++;
|
||||||
if (counter >= verifiedWords) {
|
if (counter >= REPRESENTATIVE_BUNDLED_WORD_COUNT) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -704,7 +770,7 @@ final class StemmerPatchTrieLoaderTest {
|
|||||||
}
|
}
|
||||||
|
|
||||||
for (String patchCommand : patchCommands) {
|
for (String patchCommand : patchCommands) {
|
||||||
stems.add(PatchCommandEncoder.apply(word, patchCommand));
|
stems.add(PatchCommandEncoder.apply(word, patchCommand, trie.traversalDirection()));
|
||||||
}
|
}
|
||||||
|
|
||||||
return stems;
|
return stems;
|
||||||
@@ -743,7 +809,7 @@ final class StemmerPatchTrieLoaderTest {
|
|||||||
if (inputStream == null) {
|
if (inputStream == null) {
|
||||||
throw new IOException("Bundled stemmer resource not found: " + resourcePath);
|
throw new IOException("Bundled stemmer resource not found: " + resourcePath);
|
||||||
}
|
}
|
||||||
return inputStream;
|
return new GZIPInputStream(inputStream);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@@ -44,7 +44,7 @@ import java.util.Set;
|
|||||||
import net.jqwik.api.ForAll;
|
import net.jqwik.api.ForAll;
|
||||||
import net.jqwik.api.Label;
|
import net.jqwik.api.Label;
|
||||||
import net.jqwik.api.Property;
|
import net.jqwik.api.Property;
|
||||||
import net.jqwik.api.Tag;
|
import org.junit.jupiter.api.Tag;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Property-based tests for patch-command stemmer tries.
|
* Property-based tests for patch-command stemmer tries.
|
||||||
@@ -56,9 +56,8 @@ import net.jqwik.api.Tag;
|
|||||||
* persistence must not alter that behavior.
|
* persistence must not alter that behavior.
|
||||||
*/
|
*/
|
||||||
@Label("Stemmer patch trie properties")
|
@Label("Stemmer patch trie properties")
|
||||||
@Tag("unit")
|
|
||||||
@Tag("property")
|
@Tag("property")
|
||||||
@Tag("stemming")
|
@Tag("stemmer")
|
||||||
class StemmerPatchTrieProperties extends PropertyBasedTestSupport {
|
class StemmerPatchTrieProperties extends PropertyBasedTestSupport {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -82,10 +81,10 @@ class StemmerPatchTrieProperties extends PropertyBasedTestSupport {
|
|||||||
assertTrue(preferredPatch != null && !preferredPatch.isEmpty(),
|
assertTrue(preferredPatch != null && !preferredPatch.isEmpty(),
|
||||||
"preferred patch must exist for an observed word.");
|
"preferred patch must exist for an observed word.");
|
||||||
assertTrue(allPatches.length >= 1, "at least one patch must exist for an observed word.");
|
assertTrue(allPatches.length >= 1, "at least one patch must exist for an observed word.");
|
||||||
assertTrue(acceptableStems.contains(PatchCommandEncoder.apply(observedWord, preferredPatch)),
|
assertTrue(acceptableStems.contains(PatchCommandEncoder.apply(observedWord, preferredPatch, trie.traversalDirection())),
|
||||||
"preferred patch reconstructed an unexpected stem.");
|
"preferred patch reconstructed an unexpected stem.");
|
||||||
|
|
||||||
final Set<String> producedStems = applyAll(observedWord, allPatches);
|
final Set<String> producedStems = applyAll(trie, observedWord, allPatches);
|
||||||
assertTrue(acceptableStems.containsAll(producedStems),
|
assertTrue(acceptableStems.containsAll(producedStems),
|
||||||
"getAll() must not expose a patch that reconstructs an undeclared stem.");
|
"getAll() must not expose a patch that reconstructs an undeclared stem.");
|
||||||
|
|
||||||
@@ -125,10 +124,10 @@ class StemmerPatchTrieProperties extends PropertyBasedTestSupport {
|
|||||||
* @param patches returned patches
|
* @param patches returned patches
|
||||||
* @return decoded stem set
|
* @return decoded stem set
|
||||||
*/
|
*/
|
||||||
private static Set<String> applyAll(final String source, final String[] patches) {
|
private static Set<String> applyAll(final FrequencyTrie<String> trie, final String source, final String[] patches) {
|
||||||
final LinkedHashSet<String> stems = new LinkedHashSet<>();
|
final LinkedHashSet<String> stems = new LinkedHashSet<>();
|
||||||
for (String patch : patches) {
|
for (String patch : patches) {
|
||||||
stems.add(PatchCommandEncoder.apply(source, patch));
|
stems.add(PatchCommandEncoder.apply(source, patch, trie.traversalDirection()));
|
||||||
}
|
}
|
||||||
return stems;
|
return stems;
|
||||||
}
|
}
|
||||||
|
|||||||
76
src/test/java/org/egothor/stemmer/TrieMetadataTest.java
Normal file
76
src/test/java/org/egothor/stemmer/TrieMetadataTest.java
Normal file
@@ -0,0 +1,76 @@
|
|||||||
|
/*******************************************************************************
|
||||||
|
* Copyright (C) 2026, Leo Galambos
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer in the documentation
|
||||||
|
* and/or other materials provided with the distribution.
|
||||||
|
*
|
||||||
|
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||||
|
* may be used to endorse or promote products derived from this software
|
||||||
|
* without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||||
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
******************************************************************************/
|
||||||
|
package org.egothor.stemmer;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertAll;
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertThrows;
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.DisplayName;
|
||||||
|
import org.junit.jupiter.api.Tag;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
@Tag("unit")
|
||||||
|
@Tag("metadata")
|
||||||
|
@Tag("trie")
|
||||||
|
@DisplayName("TrieMetadata")
|
||||||
|
class TrieMetadataTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@DisplayName("Text block roundtrip preserves all persisted fields")
|
||||||
|
void textBlockRoundtripPreservesAllPersistedFields() {
|
||||||
|
final TrieMetadata metadata = new TrieMetadata(5, WordTraversalDirection.FORWARD,
|
||||||
|
new ReductionSettings(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_DOMINANT_GET_RESULTS, 80, 4),
|
||||||
|
DiacriticProcessingMode.AS_IS, CaseProcessingMode.AS_IS);
|
||||||
|
|
||||||
|
final String textBlock = metadata.toTextBlock();
|
||||||
|
final TrieMetadata parsed = TrieMetadata.fromTextBlock(5, textBlock);
|
||||||
|
|
||||||
|
assertAll(() -> assertEquals(metadata.traversalDirection(), parsed.traversalDirection()),
|
||||||
|
() -> assertEquals(metadata.reductionSettings(), parsed.reductionSettings()),
|
||||||
|
() -> assertEquals(metadata.diacriticProcessingMode(), parsed.diacriticProcessingMode()),
|
||||||
|
() -> assertEquals(metadata.caseProcessingMode(), parsed.caseProcessingMode()),
|
||||||
|
() -> assertTrue(textBlock.contains("rightToLeft=true")));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@DisplayName("Text block parser rejects malformed input")
|
||||||
|
void textBlockParserRejectsMalformedInput() {
|
||||||
|
assertAll(
|
||||||
|
() -> assertThrows(IllegalArgumentException.class,
|
||||||
|
() -> TrieMetadata.fromTextBlock(5, "unknown-header\nx=y\n")),
|
||||||
|
() -> assertThrows(IllegalArgumentException.class,
|
||||||
|
() -> TrieMetadata.fromTextBlock(5, "radixor.metadata.v1\nmissingDelimiter\n")),
|
||||||
|
() -> assertThrows(IllegalArgumentException.class,
|
||||||
|
() -> TrieMetadata.fromTextBlock(5, "radixor.metadata.v1\ntraversalDirection=FORWARD\n")));
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,85 @@
|
|||||||
|
/*******************************************************************************
|
||||||
|
* Copyright (C) 2026, Leo Galambos
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer in the documentation
|
||||||
|
* and/or other materials provided with the distribution.
|
||||||
|
*
|
||||||
|
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||||
|
* may be used to endorse or promote products derived from this software
|
||||||
|
* without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||||
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
******************************************************************************/
|
||||||
|
package org.egothor.stemmer;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertAll;
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertThrows;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.DisplayName;
|
||||||
|
import org.junit.jupiter.api.Tag;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
@Tag("unit")
|
||||||
|
@Tag("core")
|
||||||
|
@Tag("stemmer")
|
||||||
|
@DisplayName("WordTraversalDirection")
|
||||||
|
class WordTraversalDirectionTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@DisplayName("startIndex follows direction and validates negatives")
|
||||||
|
void startIndexFollowsDirectionAndValidatesNegatives() {
|
||||||
|
assertAll(() -> assertEquals(0, WordTraversalDirection.FORWARD.startIndex(3)),
|
||||||
|
() -> assertEquals(2, WordTraversalDirection.BACKWARD.startIndex(3)),
|
||||||
|
() -> assertEquals(-1, WordTraversalDirection.FORWARD.startIndex(0)),
|
||||||
|
() -> assertThrows(IllegalArgumentException.class,
|
||||||
|
() -> WordTraversalDirection.BACKWARD.startIndex(-1)));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@DisplayName("logicalIndex maps offsets in both directions")
|
||||||
|
void logicalIndexMapsOffsetsInBothDirections() {
|
||||||
|
assertAll(() -> assertEquals(0, WordTraversalDirection.FORWARD.logicalIndex(4, 0)),
|
||||||
|
() -> assertEquals(3, WordTraversalDirection.BACKWARD.logicalIndex(4, 0)),
|
||||||
|
() -> assertEquals(1, WordTraversalDirection.FORWARD.logicalIndex(4, 1)),
|
||||||
|
() -> assertEquals(2, WordTraversalDirection.BACKWARD.logicalIndex(4, 1)),
|
||||||
|
() -> assertThrows(IllegalArgumentException.class,
|
||||||
|
() -> WordTraversalDirection.FORWARD.logicalIndex(-1, 0)),
|
||||||
|
() -> assertThrows(IllegalArgumentException.class,
|
||||||
|
() -> WordTraversalDirection.BACKWARD.logicalIndex(3, 3)));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@DisplayName("traversal character conversion preserves and reverses as expected")
|
||||||
|
void traversalCharacterConversionPreservesAndReversesAsExpected() {
|
||||||
|
assertAll(() -> assertArrayEquals(new char[] { 'a', 'b', 'c' },
|
||||||
|
WordTraversalDirection.FORWARD.toTraversalCharacters("abc")),
|
||||||
|
() -> assertArrayEquals(new char[] { 'c', 'b', 'a' },
|
||||||
|
WordTraversalDirection.BACKWARD.toTraversalCharacters("abc")),
|
||||||
|
() -> assertEquals("abc", WordTraversalDirection.FORWARD.traversalPathToLogicalKey("abc")),
|
||||||
|
() -> assertEquals("cba", WordTraversalDirection.BACKWARD.traversalPathToLogicalKey("abc")),
|
||||||
|
() -> assertThrows(NullPointerException.class,
|
||||||
|
() -> WordTraversalDirection.FORWARD.toTraversalCharacters(null)),
|
||||||
|
() -> assertThrows(NullPointerException.class,
|
||||||
|
() -> WordTraversalDirection.BACKWARD.traversalPathToLogicalKey(null)));
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -45,7 +45,7 @@ import org.junit.jupiter.api.Test;
|
|||||||
* Unit tests for {@link ChildDescriptor}.
|
* Unit tests for {@link ChildDescriptor}.
|
||||||
*/
|
*/
|
||||||
@Tag("unit")
|
@Tag("unit")
|
||||||
@Tag("fast")
|
@Tag("trie")
|
||||||
@DisplayName("ChildDescriptor")
|
@DisplayName("ChildDescriptor")
|
||||||
class ChildDescriptorTest {
|
class ChildDescriptorTest {
|
||||||
|
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user