Compare commits
24 Commits
release@0.
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
|
6ccce248ea
|
|||
|
5a511374f3
|
|||
|
48f21cab72
|
|||
|
39969463a2
|
|||
|
6dbdb4bae8
|
|||
|
2ab3e74048
|
|||
|
128fa919f2
|
|||
|
1f5decd6ea
|
|||
|
9eee321fef
|
|||
|
3e0f786042
|
|||
|
041b7f43fb
|
|||
|
8785f2b7cb
|
|||
|
4d939f5b6e
|
|||
|
a9d15fa3ae
|
|||
|
0dc516357f
|
|||
|
0b674a39a8
|
|||
|
db79dd2d4f
|
|||
|
db446932fc
|
|||
|
1df6c0c87e
|
|||
|
31ed39c785
|
|||
|
4b57eecbeb
|
|||
|
a002238602
|
|||
|
92d2c98fed
|
|||
|
bc031f2d8b
|
46
.classpath
46
.classpath
@@ -1,46 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<classpath>
|
||||
<classpathentry kind="src" output="bin/main" path="src/main/java">
|
||||
<attributes>
|
||||
<attribute name="gradle_scope" value="main"/>
|
||||
<attribute name="gradle_used_by_scope" value="main,test"/>
|
||||
</attributes>
|
||||
</classpathentry>
|
||||
<classpathentry kind="src" output="bin/test" path="src/test/java">
|
||||
<attributes>
|
||||
<attribute name="gradle_scope" value="test"/>
|
||||
<attribute name="gradle_used_by_scope" value="test"/>
|
||||
<attribute name="test" value="true"/>
|
||||
</attributes>
|
||||
</classpathentry>
|
||||
<classpathentry kind="src" output="bin/main" path="src/main/resources">
|
||||
<attributes>
|
||||
<attribute name="gradle_scope" value="main"/>
|
||||
<attribute name="gradle_used_by_scope" value="main,test"/>
|
||||
</attributes>
|
||||
</classpathentry>
|
||||
<classpathentry kind="src" output="bin/jmh" path="src/jmh/java">
|
||||
<attributes>
|
||||
<attribute name="gradle_scope" value="jmh"/>
|
||||
<attribute name="gradle_used_by_scope" value="jmh"/>
|
||||
<attribute name="test" value="true"/>
|
||||
</attributes>
|
||||
</classpathentry>
|
||||
<classpathentry kind="src" output="bin/jmh" path="build/third-party/snowball/source/libstemmer_java-3.0.1/java">
|
||||
<attributes>
|
||||
<attribute name="gradle_scope" value="jmh"/>
|
||||
<attribute name="gradle_used_by_scope" value="jmh"/>
|
||||
<attribute name="test" value="true"/>
|
||||
</attributes>
|
||||
</classpathentry>
|
||||
<classpathentry kind="src" output="bin/test" path="src/test/resources">
|
||||
<attributes>
|
||||
<attribute name="gradle_scope" value="test"/>
|
||||
<attribute name="gradle_used_by_scope" value="test"/>
|
||||
<attribute name="test" value="true"/>
|
||||
</attributes>
|
||||
</classpathentry>
|
||||
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-21/"/>
|
||||
<classpathentry kind="con" path="org.eclipse.buildship.core.gradleclasspathcontainer"/>
|
||||
<classpathentry kind="output" path="bin/default"/>
|
||||
</classpath>
|
||||
12
.github/workflows/benchmarks.yml
vendored
12
.github/workflows/benchmarks.yml
vendored
@@ -19,6 +19,10 @@ on:
|
||||
- 'gradlew.bat'
|
||||
- '.github/workflows/benchmarks.yml'
|
||||
|
||||
concurrency:
|
||||
group: benchmarks-${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
jmh:
|
||||
runs-on: ubuntu-latest
|
||||
@@ -31,15 +35,17 @@ jobs:
|
||||
- name: Check out sources
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Validate Gradle wrapper
|
||||
uses: gradle/actions/wrapper-validation@v4
|
||||
|
||||
- name: Set up JDK 21
|
||||
uses: actions/setup-java@v4
|
||||
with:
|
||||
distribution: temurin
|
||||
java-version: '21'
|
||||
cache: gradle
|
||||
|
||||
- name: Make Gradle executable
|
||||
run: chmod +x ./gradlew
|
||||
- name: Set up Gradle caching and instrumentation
|
||||
uses: gradle/actions/setup-gradle@v4
|
||||
|
||||
- name: Verify reproducibility inputs
|
||||
shell: bash
|
||||
|
||||
9
.github/workflows/build.yml
vendored
9
.github/workflows/build.yml
vendored
@@ -156,15 +156,6 @@ jobs:
|
||||
test -f gradle.properties
|
||||
test -f gradle/verification-metadata.xml
|
||||
|
||||
- name: Generate release changelog for tagged builds
|
||||
if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/release@')
|
||||
shell: bash
|
||||
run: |
|
||||
set -euo pipefail
|
||||
chmod +x ./tools/generate-release-notes.sh
|
||||
mkdir -p build/generated/release-notes
|
||||
./tools/generate-release-notes.sh "${GITHUB_REF_NAME}" > build/generated/release-notes/CHANGELOG.md
|
||||
|
||||
- name: Build release inputs, signed Maven bundle, and SBOM
|
||||
env:
|
||||
SIGNING_KEY: ${{ secrets.SIGNING_KEY }}
|
||||
|
||||
148
.github/workflows/pages.yml
vendored
148
.github/workflows/pages.yml
vendored
@@ -5,6 +5,8 @@ on:
|
||||
branches:
|
||||
- main
|
||||
paths:
|
||||
- 'docs/**'
|
||||
- 'mkdocs.yml'
|
||||
- 'src/main/**'
|
||||
- 'src/test/**'
|
||||
- 'src/jmh/**'
|
||||
@@ -17,6 +19,7 @@ on:
|
||||
- 'gradlew'
|
||||
- 'gradlew.bat'
|
||||
- '.github/workflows/pages.yml'
|
||||
- '.github/workflows/benchmarks.yml'
|
||||
- 'tools/generate-pages-badges.py'
|
||||
workflow_dispatch:
|
||||
|
||||
@@ -50,6 +53,14 @@ jobs:
|
||||
- name: Set up Gradle caching and instrumentation
|
||||
uses: gradle/actions/setup-gradle@v4
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.x'
|
||||
|
||||
- name: Install MkDocs Material
|
||||
run: python -m pip install --upgrade pip mkdocs-material
|
||||
|
||||
- name: Verify reproducibility inputs
|
||||
shell: bash
|
||||
run: |
|
||||
@@ -111,12 +122,17 @@ jobs:
|
||||
JMH_CSV_LINK=''
|
||||
JMH_TXT_LATEST_LINK=''
|
||||
JMH_CSV_LATEST_LINK=''
|
||||
JMH_TXT_REPORT_MD='- Benchmark results (TXT): not currently available'
|
||||
JMH_CSV_REPORT_MD='- Benchmark results (CSV): not currently available'
|
||||
DEPENDENCY_CHECK_LINK=''
|
||||
DEPENDENCY_CHECK_LATEST_LINK=''
|
||||
DEPENDENCY_CHECK_REPORT_MD='- Dependency vulnerability report: not currently available'
|
||||
SBOM_JSON_LINK=''
|
||||
SBOM_XML_LINK=''
|
||||
SBOM_JSON_LATEST_LINK=''
|
||||
SBOM_XML_LATEST_LINK=''
|
||||
SBOM_JSON_REPORT_MD='- SBOM (JSON): not currently available'
|
||||
SBOM_XML_REPORT_MD='- SBOM (XML): not currently available'
|
||||
|
||||
if [ -d "build/reports/jmh" ]; then
|
||||
cp -R build/reports/jmh "${RUN_DIR}/jmh"
|
||||
@@ -125,10 +141,12 @@ jobs:
|
||||
if [ -f "${RUN_DIR}/jmh/jmh-results.txt" ]; then
|
||||
JMH_TXT_LINK='<li><a href="./jmh/jmh-results.txt">Benchmark Results (TXT)</a></li>'
|
||||
JMH_TXT_LATEST_LINK='<li><a href="./builds/latest/jmh/jmh-results.txt">Benchmark Results (TXT)</a></li>'
|
||||
JMH_TXT_REPORT_MD='- [JMH benchmark results (TXT)](https://leogalambos.github.io/Radixor/builds/latest/jmh/jmh-results.txt)'
|
||||
fi
|
||||
if [ -f "${RUN_DIR}/jmh/jmh-results.csv" ]; then
|
||||
JMH_CSV_LINK='<li><a href="./jmh/jmh-results.csv">Benchmark Results (CSV)</a></li>'
|
||||
JMH_CSV_LATEST_LINK='<li><a href="./builds/latest/jmh/jmh-results.csv">Benchmark Results (CSV)</a></li>'
|
||||
JMH_CSV_REPORT_MD='- [JMH benchmark results (CSV)](https://leogalambos.github.io/Radixor/builds/latest/jmh/jmh-results.csv)'
|
||||
fi
|
||||
|
||||
HAS_JMH="true"
|
||||
@@ -143,6 +161,7 @@ jobs:
|
||||
if [ -f "${RUN_DIR}/dependency-check/dependency-check-report.html" ]; then
|
||||
DEPENDENCY_CHECK_LINK='<li><a href="./dependency-check/dependency-check-report.html">Dependency Vulnerability Report</a></li>'
|
||||
DEPENDENCY_CHECK_LATEST_LINK='<li><a href="./builds/latest/dependency-check/dependency-check-report.html">Dependency Vulnerability Report</a></li>'
|
||||
DEPENDENCY_CHECK_REPORT_MD='- [Dependency vulnerability report](https://leogalambos.github.io/Radixor/builds/latest/dependency-check/dependency-check-report.html)'
|
||||
fi
|
||||
fi
|
||||
|
||||
@@ -153,6 +172,8 @@ jobs:
|
||||
SBOM_XML_LINK='<li><a href="./sbom/radixor-sbom.xml">SBOM (XML)</a></li>'
|
||||
SBOM_JSON_LATEST_LINK='<li><a href="./builds/latest/sbom/radixor-sbom.json">SBOM (JSON)</a></li>'
|
||||
SBOM_XML_LATEST_LINK='<li><a href="./builds/latest/sbom/radixor-sbom.xml">SBOM (XML)</a></li>'
|
||||
SBOM_JSON_REPORT_MD='- [SBOM (JSON)](https://leogalambos.github.io/Radixor/builds/latest/sbom/radixor-sbom.json)'
|
||||
SBOM_XML_REPORT_MD='- [SBOM (XML)](https://leogalambos.github.io/Radixor/builds/latest/sbom/radixor-sbom.xml)'
|
||||
fi
|
||||
|
||||
python3 \
|
||||
@@ -169,10 +190,22 @@ jobs:
|
||||
MUTATION_BADGE_LATEST_LINK='<li><a href="./builds/latest/metrics/pitest-badge.json">Mutation Badge Metadata</a></li>'
|
||||
JMH_BADGE_LINK='<li><a href="./metrics/jmh-badge.json">Benchmark Badge Metadata</a></li>'
|
||||
JMH_BADGE_LATEST_LINK='<li><a href="./builds/latest/metrics/jmh-badge.json">Benchmark Badge Metadata</a></li>'
|
||||
COVERAGE_BADGE_REPORT_MD='- [Coverage badge metadata](https://leogalambos.github.io/Radixor/builds/latest/metrics/coverage-badge.json)'
|
||||
MUTATION_BADGE_REPORT_MD='- [Mutation badge metadata](https://leogalambos.github.io/Radixor/builds/latest/metrics/pitest-badge.json)'
|
||||
JMH_BADGE_REPORT_MD='- [Benchmark badge metadata](https://leogalambos.github.io/Radixor/builds/latest/metrics/jmh-badge.json)'
|
||||
|
||||
if [ ! -f "${RUN_METRICS_DIR}/coverage-badge.json" ]; then
|
||||
COVERAGE_BADGE_LINK='<li>Coverage Badge Metadata: not available</li>'
|
||||
COVERAGE_BADGE_LATEST_LINK='<li>Coverage Badge Metadata: not available</li>'
|
||||
COVERAGE_BADGE_REPORT_MD='- Coverage badge metadata: not currently available'
|
||||
fi
|
||||
|
||||
if [ ! -f "${RUN_METRICS_DIR}/pitest-badge.json" ]; then
|
||||
MUTATION_BADGE_REPORT_MD='- Mutation badge metadata: not currently available'
|
||||
fi
|
||||
|
||||
if [ ! -f "${RUN_METRICS_DIR}/jmh-badge.json" ]; then
|
||||
JMH_BADGE_REPORT_MD='- Benchmark badge metadata: not currently available'
|
||||
fi
|
||||
|
||||
cat > "${RUN_DIR}/index.html" <<EOF
|
||||
@@ -218,68 +251,73 @@ jobs:
|
||||
|
||||
cp "${RUN_DIR}/index.html" "${LATEST_DIR}/index.html"
|
||||
|
||||
cat > "${SITE_DIR}/.nojekyll" <<EOF
|
||||
EOF
|
||||
cat > docs/reports.md <<EOF
|
||||
# CI Reports
|
||||
|
||||
BUILD_LIST=$(find "${SITE_DIR}/builds" -mindepth 1 -maxdepth 1 -type d -printf '%f\n' | grep -E '^[0-9]+$' | sort -nr | head -20)
|
||||
Radixor publishes durable CI artifacts to GitHub Pages on every qualifying run of \`.github/workflows/pages.yml\`.
|
||||
|
||||
## Primary report entry points
|
||||
|
||||
- [Latest build summary](https://leogalambos.github.io/Radixor/builds/latest/)
|
||||
- [Javadoc](https://leogalambos.github.io/Radixor/builds/latest/javadoc/)
|
||||
- [Unit test report](https://leogalambos.github.io/Radixor/builds/latest/test/)
|
||||
- [PMD report](https://leogalambos.github.io/Radixor/builds/latest/pmd/main.html)
|
||||
- [JaCoCo coverage report](https://leogalambos.github.io/Radixor/builds/latest/coverage/)
|
||||
- [PIT mutation testing report](https://leogalambos.github.io/Radixor/builds/latest/pitest/)
|
||||
${DEPENDENCY_CHECK_REPORT_MD}
|
||||
${SBOM_JSON_REPORT_MD}
|
||||
${SBOM_XML_REPORT_MD}
|
||||
|
||||
## Benchmark and badge metadata
|
||||
|
||||
${JMH_TXT_REPORT_MD}
|
||||
${JMH_CSV_REPORT_MD}
|
||||
${COVERAGE_BADGE_REPORT_MD}
|
||||
${MUTATION_BADGE_REPORT_MD}
|
||||
${JMH_BADGE_REPORT_MD}
|
||||
|
||||
## Historical runs
|
||||
|
||||
- [Browse historical build reports](https://leogalambos.github.io/Radixor/builds/)
|
||||
EOF
|
||||
|
||||
{
|
||||
cat <<EOF
|
||||
<!doctype html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||
<title>Radixor Reports</title>
|
||||
<style>
|
||||
body { font-family: Arial, sans-serif; max-width: 1000px; margin: 2rem auto; padding: 0 1rem; line-height: 1.5; }
|
||||
h1, h2 { margin-bottom: 0.5rem; }
|
||||
ul { padding-left: 1.25rem; }
|
||||
code { background: #f4f4f4; padding: 0.1rem 0.3rem; }
|
||||
.meta { color: #555; }
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Radixor Published Reports</h1>
|
||||
<p class="meta">Durable CI reports published from GitHub Actions to the <code>gh-pages</code> branch.</p>
|
||||
echo "# Historical Build Reports"
|
||||
echo
|
||||
echo "The following build report sets are currently published on GitHub Pages."
|
||||
echo
|
||||
echo "| Build | Published | Link |"
|
||||
echo "|---:|---|---|"
|
||||
|
||||
<h2>Latest</h2>
|
||||
<ul>
|
||||
<li><a href="./builds/latest/">Latest build summary</a></li>
|
||||
<li><a href="./builds/latest/javadoc/">Javadoc</a></li>
|
||||
<li><a href="./builds/latest/test/">Test Report</a></li>
|
||||
<li><a href="./builds/latest/pmd/main.html">PMD Report</a></li>
|
||||
<li><a href="./builds/latest/coverage/">Coverage Report</a></li>
|
||||
${DEPENDENCY_CHECK_LATEST_LINK:-<li>Dependency Vulnerability Report: not currently available</li>}
|
||||
${SBOM_JSON_LATEST_LINK:-<li>SBOM (JSON): not available</li>}
|
||||
${SBOM_XML_LATEST_LINK:-<li>SBOM (XML): not available</li>}
|
||||
${COVERAGE_BADGE_LATEST_LINK}
|
||||
${MUTATION_BADGE_LATEST_LINK}
|
||||
${JMH_BADGE_LATEST_LINK}
|
||||
<li><a href="./builds/latest/pitest/">Mutation Testing Report</a></li>
|
||||
$(
|
||||
[ "${HAS_JMH}" = "true" ] && { echo "${JMH_TXT_LATEST_LINK:-<li>Benchmark Results (TXT): not available</li>}"; echo "${JMH_CSV_LATEST_LINK:-<li>Benchmark Results (CSV): not available</li>}"; } \
|
||||
|| echo '<li>Benchmark results: not currently available</li>'
|
||||
)
|
||||
EOF
|
||||
|
||||
cat <<EOF
|
||||
</ul>
|
||||
|
||||
<h2>Recent historical builds</h2>
|
||||
<ul>
|
||||
EOF
|
||||
|
||||
for build in ${BUILD_LIST}; do
|
||||
echo " <li><a href=\"./builds/${build}/\">Build ${build}</a></li>"
|
||||
find "${SITE_DIR}/builds" -mindepth 1 -maxdepth 1 -type d ! -name latest -printf '%P\n' \
|
||||
| grep -E '^[0-9]+$' \
|
||||
| while read -r build; do
|
||||
ts="$(git -C "${SITE_DIR}" log --diff-filter=A --format='%ct' --reverse -- "builds/${build}/index.html" | head -n 1)"
|
||||
if [ -n "${ts}" ]; then
|
||||
published="$(date -u -d "@${ts}" '+%Y-%m-%d %H:%M')"
|
||||
else
|
||||
published="unknown"
|
||||
ts="0"
|
||||
fi
|
||||
printf '%s\t%s\t%s\n' "${ts}" "${build}" "${published}"
|
||||
done \
|
||||
| sort -r -n -k1,1 \
|
||||
| while IFS=$'\t' read -r _ts build published; do
|
||||
echo "| ${build} | ${published} | [Open](../builds/${build}/) |"
|
||||
done
|
||||
} > docs/builds.md
|
||||
|
||||
cat <<EOF
|
||||
</ul>
|
||||
</body>
|
||||
</html>
|
||||
- name: Build documentation site (MkDocs Material)
|
||||
shell: bash
|
||||
run: |
|
||||
set -euo pipefail
|
||||
mkdocs build --strict --site-dir .mkdocs-site
|
||||
rsync -a --delete --exclude '.git' --exclude '.git/' --exclude 'builds/' .mkdocs-site/ .gh-pages/
|
||||
mkdir -p .gh-pages/builds
|
||||
cp .mkdocs-site/builds/index.html .gh-pages/builds/index.html
|
||||
cat > .gh-pages/.nojekyll <<EOF
|
||||
EOF
|
||||
} > "${SITE_DIR}/index.html"
|
||||
rm -rf .mkdocs-site
|
||||
|
||||
- name: Commit and push gh-pages
|
||||
shell: bash
|
||||
|
||||
4
.ruleset
4
.ruleset
@@ -162,12 +162,12 @@
|
||||
<rule ref="category/java/design.xml/CollapsibleIfStatements"/>
|
||||
<rule ref="category/java/design.xml/CouplingBetweenObjects">
|
||||
<properties>
|
||||
<property name="threshold" value="50" />
|
||||
<property name="threshold" value="60" />
|
||||
</properties>
|
||||
</rule>
|
||||
<rule ref="category/java/design.xml/CyclomaticComplexity">
|
||||
<properties>
|
||||
<property name="methodReportLevel" value="18" />
|
||||
<property name="methodReportLevel" value="19" />
|
||||
</properties>
|
||||
</rule>
|
||||
<rule ref="category/java/design.xml/DataClass"/>
|
||||
|
||||
29
LICENSE-stemmer-data
Normal file
29
LICENSE-stemmer-data
Normal file
@@ -0,0 +1,29 @@
|
||||
Stemmer data licensing
|
||||
|
||||
The software source code in this repository is licensed separately under
|
||||
the BSD 3-Clause License.
|
||||
|
||||
Stemmer dictionary and morphology data files are not covered by
|
||||
the BSD 3-Clause License unless explicitly stated otherwise.
|
||||
|
||||
This repository contains adapted data derived from the UniMorph project:
|
||||
https://unimorph.github.io/
|
||||
|
||||
Only stemmer data derived from sources that permit commercial use are included
|
||||
in the main distribution of this repository.
|
||||
|
||||
Accepted upstream licenses for distributed stemmer data in this repository:
|
||||
- CC BY-SA 3.0
|
||||
- CC BY-SA 4.0
|
||||
- CC BY 4.0
|
||||
|
||||
Sources under non-commercial licenses, including CC BY-NC-SA 4.0, are excluded
|
||||
from the main distribution.
|
||||
|
||||
Modifications in this repository may include cleaning, normalization,
|
||||
deduplication, filtering, conversion, and reformatting.
|
||||
|
||||
Copyright (c) 2026 Leo Galambos for the modifications, to the extent permitted
|
||||
by the applicable upstream license terms.
|
||||
|
||||
Per-file licensing is stated in the header of each generated stemmer data file.
|
||||
182
README.md
182
README.md
@@ -11,53 +11,61 @@
|
||||
[](LICENSE)
|
||||
[](#)
|
||||
|
||||
*Fast algorithmic stemming with compact patch-command tries — measured at about 4× to 6× the throughput of the Snowball Porter stemmer family on the current English benchmark workload.*
|
||||
*Fast, deterministic, multi-language stemming for Java, built around compact patch-command tries and measured at roughly 4× to 6× the throughput of the Snowball Porter stemmer family on the current English benchmark workload.*
|
||||
|
||||
**Radixor** is a fast, algorithmic stemming toolkit for Java, built around compact **patch-command tries** in the tradition of the original **Egothor** stemmer.
|
||||
**Radixor** is a modern multi-language stemming toolkit for Java in the tradition of the original **Egothor** approach. It learns compact word-to-stem transformations from dictionary data, stores them in compiled patch-command tries, and exposes a runtime model designed for speed, determinism, and operational simplicity. Unlike a closed-form dictionary lookup stemmer, Radixor can also generalize beyond explicitly listed word forms.
|
||||
|
||||
On the current JMH English comparison benchmark, Radixor with bundled `US_UK_PROFI`
|
||||
reaches approximately **31 to 32 million tokens per second**, compared with about
|
||||
**8 million tokens per second** for Snowball original Porter and about
|
||||
**5 to 5.5 million tokens per second** for Snowball English (Porter2).
|
||||
It is particularly well suited to systems that need stemming which is:
|
||||
|
||||
That means the current Radixor implementation is approximately:
|
||||
- fast at runtime,
|
||||
- compact in memory and on disk,
|
||||
- deterministic in behavior,
|
||||
- adaptable through dictionary data rather than hardcoded language rules,
|
||||
- practical to compile, persist, version, extend, and deploy.
|
||||
|
||||
- **4× faster** than Snowball original Porter
|
||||
- **6× faster** than Snowball English (Porter2)
|
||||
|
||||
It is designed for production search and text-processing systems that need stemming which is:
|
||||
|
||||
- fast at runtime
|
||||
- compact in memory and on disk
|
||||
- deterministic in behavior
|
||||
- driven by dictionary data rather than hardcoded language rules
|
||||
- practical to maintain, extend, and test
|
||||
|
||||
Radixor keeps the valuable core of the original Egothor idea, modernizes the implementation, and adds capabilities that make it more useful in real software systems today.
|
||||
It also retains the operational advantages of a compiled artifact model: predictable runtime behavior, direct binary loading, and clear separation between preparation-time compilation and live request processing.
|
||||
|
||||
## Table of Contents
|
||||
|
||||
- [Why Radixor](#why-radixor)
|
||||
- [Performance](#performance)
|
||||
- [Heritage](#heritage)
|
||||
- [What Radixor adds](#what-radixor-adds)
|
||||
- [Key features](#key-features)
|
||||
- [Performance](#performance)
|
||||
- [Documentation](#documentation)
|
||||
- [Project philosophy](#project-philosophy)
|
||||
- [Historical note](#historical-note)
|
||||
|
||||
## Why Radixor
|
||||
|
||||
The central idea behind Radixor is simple: learn how to transform a word form into its stem, encode that transformation as a compact patch command, store it in a trie, and make runtime lookup extremely fast.
|
||||
The central idea behind Radixor is simple: learn how to transform a word form into its stem, encode that transformation as a compact patch command, store it in a trie, and make the runtime path as small and direct as possible.
|
||||
|
||||
This gives you a stemmer that is:
|
||||
That produces a stemmer that is:
|
||||
|
||||
- data-driven rather than rule-hardcoded
|
||||
- reusable across languages
|
||||
- compact enough for deployment-friendly binary artifacts
|
||||
- suitable for both offline compilation and runtime loading
|
||||
- data-driven rather than rule-hardcoded,
|
||||
- applicable across languages through compiled transformation models learned from dictionary data,
|
||||
- compact enough for deployment-friendly binary artifacts,
|
||||
- suitable for both offline compilation and direct runtime loading,
|
||||
- capable of exposing either a preferred result or multiple candidate results when ambiguity matters.
|
||||
|
||||
Radixor is especially attractive when you want something more adaptable than simple suffix stripping, but much smaller and easier to operate than a full morphological analyzer. In the current English benchmark comparison against the Snowball Porter stemmer family, it also delivers a substantial throughput advantage.
|
||||
Radixor is especially attractive when you want something more adaptable than simple suffix stripping, but much smaller and easier to operate than a full morphological analyzer.
|
||||
|
||||
## Performance
|
||||
|
||||
Radixor includes a JMH benchmark suite for both its own algorithmic core and a side-by-side English comparison against the Snowball Porter stemmer family.
|
||||
|
||||
On the current English comparison workload, Radixor with bundled `US_UK` reaches approximately **31 to 32 million tokens per second**. Snowball original Porter reaches approximately **8 million tokens per second**, and Snowball English (Porter2) approximately **5 to 5.5 million tokens per second**.
|
||||
|
||||
That places Radixor at approximately:
|
||||
|
||||
- **4× the throughput of Snowball original Porter**
|
||||
- **6× the throughput of Snowball English (Porter2)**
|
||||
|
||||
on the current benchmark workload.
|
||||
|
||||
This is a throughput comparison on the same deterministic token stream. It is **not** a claim that the compared stemmers are linguistically equivalent or interchangeable.
|
||||
|
||||
For benchmark scope, workload design, environment, commands, report locations, and interpretation guidance, see [Benchmarking](docs/benchmarking.md).
|
||||
|
||||
## Heritage
|
||||
|
||||
@@ -69,44 +77,47 @@ Useful historical references:
|
||||
|
||||
- [Egothor project](http://www.egothor.org/)
|
||||
- [Stempel overview](https://www.getopt.org/stempel/)
|
||||
- [Leo Galambos, *Lemmatizer for Document Information Retrieval Systems in JAVA* (SOFSEM 2001)](https://www.researchgate.net/publication/221512865_Lemmatizer_for_Document_Information_Retrieval_Systems_in_JAVA)
|
||||
- [Lucene Stempel overview](https://lucene.apache.org/core/5_3_0/analyzers-stempel/index.html)
|
||||
- [Elasticsearch Stempel plugin](https://www.elastic.co/docs/reference/elasticsearch/plugins/analysis-stempel)
|
||||
|
||||
Radixor is not just a repackaging of legacy code. It is a practical modernization of the approach for current Java development and long-term maintainability.
|
||||
The Galambos paper is a useful historical reference for the semi-automatic, transformation-based stemming idea that later informed the Egothor lineage and, in turn, the conceptual background of Radixor. It should be read as research and heritage context rather than as a description of Radixor's present-day implementation.
|
||||
|
||||
Radixor is not a repackaging of legacy code. It is a modern implementation that preserves the valuable core idea while reworking the engineering around maintainability, testing, persistence, and long-term operational use.
|
||||
|
||||
## What Radixor adds
|
||||
|
||||
Radixor keeps the patch-command trie model, but improves the engineering around it.
|
||||
Radixor keeps the patch-command trie model, but improves the engineering around it in ways that matter in real software systems.
|
||||
|
||||
Compared with the historical baseline, Radixor emphasizes:
|
||||
|
||||
- **simplification to the most practical core**
|
||||
The implementation focuses on the parts of the original approach that are most useful in production.
|
||||
- **a focused practical core**
|
||||
The implementation concentrates on the parts of the original approach that are most useful in production.
|
||||
|
||||
- **immutable compiled tries**
|
||||
Runtime lookup uses compact read-only structures optimized for efficient access.
|
||||
|
||||
- **support for more than one stemming result**
|
||||
Radixor can expose both a preferred result and multiple candidate results where the data is ambiguous.
|
||||
Radixor can expose both a preferred result and multiple candidate results when the underlying data is ambiguous.
|
||||
|
||||
- **frequency-aware deterministic ordering**
|
||||
Candidate results are ordered consistently and reproducibly.
|
||||
|
||||
- **practical subtree reduction modes**
|
||||
Reduction can be tuned toward stronger compression or more conservative behavioral preservation.
|
||||
Reduction can be tuned toward stronger compression or more conservative semantic preservation.
|
||||
|
||||
- **reconstruction of writable builders from compiled tables**
|
||||
- **reconstruction of writable builders from compiled artifacts**
|
||||
Existing compiled stemmer tables can be reopened, modified, and compiled again.
|
||||
|
||||
- **better tests and implementation stability**
|
||||
Stronger coverage improves confidence during refactoring and further development.
|
||||
- **strong validation discipline**
|
||||
Coverage, mutation testing, benchmark visibility, and published reports are treated as part of the engineering standard rather than optional project decoration.
|
||||
|
||||
## Key features
|
||||
|
||||
- Fast algorithmic stemming
|
||||
- Compact compiled binary artifacts
|
||||
- Patch-command based transformation model
|
||||
- Dictionary-driven language adaptation
|
||||
- Multi-language stemming through compiled transformation models
|
||||
- Single-result and multi-result lookup
|
||||
- Deterministic result ordering
|
||||
- Compressed binary persistence
|
||||
@@ -114,57 +125,69 @@ Compared with the historical baseline, Radixor emphasizes:
|
||||
- CLI compilation tool
|
||||
- Bundled language resources
|
||||
- Support for extending compiled stemmer tables
|
||||
|
||||
## Performance
|
||||
|
||||
Radixor includes a JMH benchmark suite for both its own algorithmic core and a
|
||||
side-by-side comparison against the Snowball Porter stemmer family.
|
||||
|
||||
On the current English comparison workload, Radixor with bundled `US_UK_PROFI`
|
||||
reaches approximately **31 to 32 million tokens per second**. Snowball original
|
||||
Porter reaches approximately **8 million tokens per second**, and Snowball
|
||||
English (Porter2) approximately **5 to 5.5 million tokens per second**.
|
||||
|
||||
That places Radixor at approximately **4× the throughput of Snowball original Porter**
|
||||
and approximately **6× the throughput of Snowball English (Porter2)**
|
||||
on the current benchmark workload.
|
||||
|
||||
This is a throughput comparison on the same deterministic token stream. It is
|
||||
not a claim that the compared stemmers are linguistically equivalent or
|
||||
interchangeable.
|
||||
|
||||
For benchmark scope, workload design, environment, commands, report locations,
|
||||
and interpretation guidance, see [Benchmarking](docs/benchmarking.md).
|
||||
- Reproducible and auditable engineering posture
|
||||
|
||||
## Documentation
|
||||
|
||||
The repository keeps the front page concise and places detailed documentation under `docs/`.
|
||||
|
||||
Start here:
|
||||
### Getting Started
|
||||
|
||||
- [Quick Start](docs/quick-start.md)
|
||||
A practical first guide to loading, compiling, and using Radixor.
|
||||
|
||||
- [Built-in Languages](docs/built-in-languages.md)
|
||||
Overview of bundled language resources such as `US_UK`.
|
||||
|
||||
- [Dictionary Format](docs/dictionary-format.md)
|
||||
How to write stemming dictionaries.
|
||||
How to write and normalize stemming dictionaries.
|
||||
|
||||
- [Compilation (CLI tool)](docs/cli-compilation.md)
|
||||
How to compile dictionaries with the `Compile` CLI.
|
||||
How to compile dictionaries into deployable binary artifacts.
|
||||
|
||||
- [Programmatic Usage](docs/programmatic-usage.md)
|
||||
How to build, load, modify, and query Radixor from Java code.
|
||||
### Programmatic Usage
|
||||
|
||||
- [Built-in Languages](docs/built-in-languages.md)
|
||||
How to use integrated language resources such as `US_UK_PROFI`.
|
||||
- [Programmatic Usage Overview](docs/programmatic-usage.md)
|
||||
Entry point to the Java API and the overall usage model.
|
||||
|
||||
- [Architecture and Reduction](docs/architecture-and-reduction.md)
|
||||
Internal model, compiled trie design, and reduction strategies.
|
||||
- [Loading and Building Stemmers](docs/programmatic-loading-and-building.md)
|
||||
Loading bundled resources, textual dictionaries, binary artifacts, and direct builder usage.
|
||||
|
||||
- [Querying and Ambiguity Handling](docs/programmatic-querying-and-ambiguity.md)
|
||||
`get()`, `getAll()`, `getEntries()`, patch application, and ambiguity behavior.
|
||||
|
||||
- [Extending and Persisting Compiled Tries](docs/programmatic-extending-and-persistence.md)
|
||||
Reopening compiled tries, rebuilding them, and writing binary artifacts.
|
||||
|
||||
### Concepts and Internals
|
||||
|
||||
- [Architecture and Reduction Overview](docs/architecture-and-reduction.md)
|
||||
High-level explanation of the build pipeline and compiled trie model.
|
||||
|
||||
- [Architecture](docs/architecture.md)
|
||||
Structural model, data flow, and runtime lookup behavior.
|
||||
|
||||
- [Reduction Semantics](docs/reduction-semantics.md)
|
||||
Ranked, unordered, and dominant reduction behavior.
|
||||
|
||||
- [Compatibility and Guarantees](docs/compatibility-and-guarantees.md)
|
||||
Supported public API, internal API boundaries, and compatibility expectations.
|
||||
|
||||
### Dictionaries and Language Resources
|
||||
|
||||
- [Contributing Dictionaries](docs/contributing-dictionaries.md)
|
||||
Guidance for high-quality lexical resource contributions.
|
||||
|
||||
### Quality and Operations
|
||||
|
||||
- [Quality and Operations](docs/quality-and-operations.md)
|
||||
Testing, persistence, deployment, and operational guidance.
|
||||
Engineering standards, validation posture, auditability, and operational model.
|
||||
|
||||
- [Benchmarking](docs/benchmarking.md)
|
||||
JMH benchmark design, Snowball comparison, execution, and interpretation.
|
||||
JMH benchmark methodology, Porter comparison, and result interpretation.
|
||||
|
||||
- [Published Reports](docs/reports.md)
|
||||
Entry points to CI-published reports and GitHub Pages artifacts.
|
||||
|
||||
## Project philosophy
|
||||
|
||||
@@ -172,19 +195,20 @@ Radixor does not preserve historical complexity for its own sake.
|
||||
|
||||
It preserves the valuable idea:
|
||||
|
||||
- compact learned transformations
|
||||
- trie-based lookup
|
||||
- language-data driven stemming
|
||||
- practical runtime speed
|
||||
- compact learned transformations,
|
||||
- trie-based lookup,
|
||||
- language-data driven stemming,
|
||||
- practical runtime speed.
|
||||
|
||||
Then it improves the parts modern users care about:
|
||||
|
||||
- maintainability
|
||||
- testability
|
||||
- modification workflows
|
||||
- persistence
|
||||
- determinism
|
||||
- clearer APIs
|
||||
- maintainability,
|
||||
- testability,
|
||||
- modification workflows,
|
||||
- persistence,
|
||||
- determinism,
|
||||
- clearer APIs,
|
||||
- explicit quality evidence.
|
||||
|
||||
The goal is to keep the Egothor/Stempel lineage useful as a serious contemporary software component.
|
||||
|
||||
|
||||
58
build.gradle
58
build.gradle
@@ -116,6 +116,14 @@ tasks.withType(Test).configureEach {
|
||||
jvmArgs "-javaagent:${configurations.mockitoAgent.singleFile}"
|
||||
}
|
||||
|
||||
/*
|
||||
* Bundled dictionary integration tests compile and reload large real-world
|
||||
* stemming dictionaries, including large language resources such as es_es.
|
||||
* The default Gradle test executor heap is too small for this workload.
|
||||
*/
|
||||
minHeapSize = '1g'
|
||||
maxHeapSize = '4g'
|
||||
|
||||
finalizedBy(tasks.named('jacocoTestReport'))
|
||||
|
||||
reports {
|
||||
@@ -134,6 +142,13 @@ tasks.withType(Pmd).configureEach {
|
||||
tasks.named('jacocoTestReport', JacocoReport) {
|
||||
dependsOn(tasks.named('test'))
|
||||
|
||||
classDirectories.setFrom(
|
||||
files(sourceSets.main.output).asFileTree.matching {
|
||||
exclude 'org/egothor/stemmer/StemmerKnowledgeExperiment*'
|
||||
exclude 'org/egothor/stemmer/DiacriticStripper*'
|
||||
}
|
||||
)
|
||||
|
||||
reports {
|
||||
xml.required = true
|
||||
csv.required = false
|
||||
@@ -178,7 +193,17 @@ pitest {
|
||||
'org.egothor.stemmer.trie.*Test'
|
||||
]
|
||||
|
||||
excludedClasses = ['org.egothor.stemmer.Compile']
|
||||
excludedClasses = [
|
||||
'org.egothor.stemmer.Compile*',
|
||||
'org.egothor.stemmer.StemmerPatchTrieLoader*',
|
||||
'org.egothor.stemmer.StemmerKnowledgeExperiment*',
|
||||
'org.egothor.stemmer.StemmerKnowledgeExperimentCli*'
|
||||
]
|
||||
excludedTestClasses = [
|
||||
'org.egothor.stemmer.CompileIntegrationTest',
|
||||
'org.egothor.stemmer.StemmerPatchTrieLoaderTest',
|
||||
'org.egothor.stemmer.StemmerKnowledgeExperimentTest'
|
||||
]
|
||||
outputFormats = ['XML', 'HTML']
|
||||
timestampedReports = false
|
||||
exportLineCoverage = true
|
||||
@@ -192,6 +217,13 @@ application {
|
||||
executableDir = 'bin'
|
||||
}
|
||||
|
||||
tasks.register('stemmerKnowledgeExperiment', JavaExec) {
|
||||
group = 'application'
|
||||
description = 'Runs the stemmer knowledge evaluation experiment.'
|
||||
classpath = sourceSets.main.runtimeClasspath
|
||||
mainClass = 'org.egothor.stemmer.StemmerKnowledgeExperimentCli'
|
||||
}
|
||||
|
||||
distributions {
|
||||
main {
|
||||
distributionBaseName = 'radixor'
|
||||
@@ -205,16 +237,13 @@ distributions {
|
||||
into ''
|
||||
}
|
||||
|
||||
from('LICENSE-stemmer-data') {
|
||||
into ''
|
||||
}
|
||||
|
||||
from('docs') {
|
||||
into 'docs'
|
||||
include 'quick-start.md'
|
||||
include 'cli-compilation.md'
|
||||
include 'dictionary-format.md'
|
||||
include 'built-in-languages.md'
|
||||
include 'programmatic-usage.md'
|
||||
include 'architecture-and-reduction.md'
|
||||
include 'quality-and-operations.md'
|
||||
include 'benchmarking.md'
|
||||
include '**/*.md'
|
||||
}
|
||||
|
||||
from(layout.buildDirectory.dir('generated/release-notes')) {
|
||||
@@ -309,6 +338,17 @@ javadoc {
|
||||
options.version = true
|
||||
options.windowTitle = 'Radixor - Egothor Stemmer'
|
||||
options.docTitle = 'Radixor - Egothor Stemmer API'
|
||||
options.overview = file('src/main/javadoc/overview.html')
|
||||
options.bottom = """
|
||||
<div class="legal-copy">
|
||||
© 2026 Egothor
|
||||
<br/>
|
||||
Licensed under <a href="https://github.com/leogalambos/Radixor/blob/main/LICENSE">BSD-3-Clause</a>
|
||||
</div>
|
||||
"""
|
||||
options.links('https://docs.oracle.com/en/java/javase/21/docs/api/')
|
||||
options.group('Core Stemming API', 'org.egothor.stemmer')
|
||||
options.group('Trie Infrastructure', 'org.egothor.stemmer.trie')
|
||||
|
||||
source = sourceSets.main.allJava
|
||||
}
|
||||
|
||||
@@ -1,470 +1,52 @@
|
||||
# Architecture and Reduction
|
||||
|
||||
> ← Back to [README.md](../README.md)
|
||||
This section explains how **Radixor** turns textual dictionary input into a compact compiled stemmer and how reduction affects the semantics preserved in the final runtime artifact.
|
||||
|
||||
This document describes the internal architecture of **Radixor** and the principles behind its **trie compilation and reduction model**.
|
||||
Radixor is easiest to understand when separated into two related concerns:
|
||||
|
||||
It explains:
|
||||
- **architecture**: what structures exist, how data moves through them, and what runtime lookup actually does,
|
||||
- **reduction semantics**: what it means for two subtrees to be considered equivalent and how that choice affects `get()` and `getAll()` behavior.
|
||||
|
||||
- how data flows from dictionary input to compiled trie
|
||||
- how patch-command tries are structured
|
||||
- how subtree reduction works
|
||||
- how reduction modes affect behavior and size
|
||||
## The short version
|
||||
|
||||
Radixor does not keep a large flat table of final stems. Instead, it converts dictionary entries into **patch commands**, stores them in a trie, reduces equivalent subtrees, and freezes the result into an immutable compiled structure.
|
||||
|
||||
The build-time flow is:
|
||||
|
||||
## Overview
|
||||
|
||||
Radixor transforms dictionary data into an optimized runtime structure through three stages:
|
||||
|
||||
1. **Mutable construction**
|
||||
2. **Reduction (canonicalization)**
|
||||
3. **Compilation (freezing)**
|
||||
|
||||
```
|
||||
Dictionary → Mutable trie → Reduced trie → Compiled trie
|
||||
```text
|
||||
Dictionary -> Mutable trie -> Reduced trie -> Compiled trie
|
||||
```
|
||||
|
||||
Each stage has a distinct purpose:
|
||||
At runtime, the compiled trie does not directly return the final stem string. It returns one or more stored patch commands for the addressed key, and those commands are then applied to the original input word.
|
||||
|
||||
| Stage | Purpose | Structure |
|
||||
|------------|----------------------------------|-------------------------|
|
||||
| Construction | Collect mappings | `MutableNode` |
|
||||
| Reduction | Merge equivalent subtrees | `ReducedNode` |
|
||||
| Compilation | Optimize for runtime lookup | `CompiledNode` |
|
||||
## Why this matters
|
||||
|
||||
This design gives Radixor several practical properties at once:
|
||||
|
||||
- compact deployable artifacts,
|
||||
- deterministic runtime behavior,
|
||||
- support for both preferred and multiple candidate results,
|
||||
- separation of preparation-time complexity from runtime lookup.
|
||||
|
||||
## Core data model
|
||||
It also explains why a large source dictionary can be transformed into a much smaller compiled artifact without discarding the operational behavior that matters to the caller.
|
||||
|
||||
### Patch-command trie
|
||||
## Reading guide
|
||||
|
||||
Radixor stores **patch commands** instead of stems directly.
|
||||
Use the following pages depending on what you need to understand:
|
||||
|
||||
- keys: word forms
|
||||
- values: transformation commands
|
||||
- structure: trie (prefix tree)
|
||||
- [Architecture](architecture.md) explains the data flow, core structures, patch-command lookup model, and why the compiled trie is efficient at runtime.
|
||||
- [Reduction Semantics](reduction-semantics.md) explains how subtree equivalence is defined, what ranked, unordered, and dominant reduction preserve, and how those choices affect observable lookup behavior.
|
||||
|
||||
At runtime:
|
||||
## Recommended reading order
|
||||
|
||||
1. the word is traversed through the trie
|
||||
2. a patch command is retrieved
|
||||
3. the patch is applied to reconstruct the stem
|
||||
For most readers, the best order is:
|
||||
|
||||
1. [Architecture](architecture.md)
|
||||
2. [Reduction Semantics](reduction-semantics.md)
|
||||
|
||||
## Related documentation
|
||||
|
||||
## Stage 1: Mutable construction
|
||||
|
||||
The builder (`FrequencyTrie.Builder`) constructs a trie using:
|
||||
|
||||
- `MutableNode`
|
||||
- maps of children (`char → node`)
|
||||
- maps of value counts (`value → frequency`)
|
||||
|
||||
Characteristics:
|
||||
|
||||
- insertion-order preserving
|
||||
- mutable
|
||||
- optimized for building, not querying
|
||||
|
||||
Example structure:
|
||||
|
||||
```
|
||||
g
|
||||
└─ n
|
||||
└─ i
|
||||
└─ n
|
||||
└─ n
|
||||
└─ u
|
||||
└─ r
|
||||
└─ (values: {
|
||||
"<patch-command-1>": 3,
|
||||
"<patch-command-2>": 1
|
||||
})
|
||||
```
|
||||
|
||||
This example represents the word "running", stored in reversed form.
|
||||
|
||||
- each edge corresponds to one character of the word
|
||||
- the path is traversed from the end of the word toward the beginning
|
||||
- the terminal node stores one or more patch commands together with their local frequencies
|
||||
|
||||
The values represent transformations from the word form to candidate stems, and the counts indicate how often each mapping was observed during construction.
|
||||
|
||||
Note: Radixor stores word forms in reversed order so that suffix-based transformations can be matched efficiently in a trie.
|
||||
|
||||
|
||||
## Local value summary
|
||||
|
||||
Before reduction, each node is summarized using `LocalValueSummary`.
|
||||
|
||||
It computes:
|
||||
|
||||
- ordered values (by frequency)
|
||||
- aligned counts
|
||||
- total frequency
|
||||
- dominant value (if any)
|
||||
- second-best value
|
||||
|
||||
This summary is critical for:
|
||||
|
||||
- deterministic ordering
|
||||
- reduction decisions
|
||||
- dominance evaluation
|
||||
|
||||
|
||||
|
||||
## Stage 2: Reduction (canonicalization)
|
||||
|
||||
Reduction is the process of merging **semantically equivalent subtrees**.
|
||||
|
||||
### Why reduction exists
|
||||
|
||||
Without reduction:
|
||||
|
||||
- trie size grows linearly with input data
|
||||
- repeated patterns are duplicated
|
||||
|
||||
With reduction:
|
||||
|
||||
- identical subtrees are shared
|
||||
- memory footprint is reduced
|
||||
- binary output becomes smaller
|
||||
|
||||
|
||||
|
||||
## Reduction signature
|
||||
|
||||
Each subtree is represented by a **ReductionSignature**.
|
||||
|
||||
A signature consists of:
|
||||
|
||||
1. **local descriptor** (node semantics)
|
||||
2. **child descriptors** (structure)
|
||||
|
||||
```
|
||||
Signature = (LocalDescriptor, SortedChildDescriptors)
|
||||
```
|
||||
|
||||
Two subtrees are merged if their signatures are equal.
|
||||
|
||||
|
||||
|
||||
## Local descriptors
|
||||
|
||||
The local descriptor encodes how values at a node are interpreted.
|
||||
|
||||
Radixor supports three descriptor types:
|
||||
|
||||
### 1. Ranked descriptor
|
||||
|
||||
Preserves:
|
||||
|
||||
- full ordering of values (`getAll()`)
|
||||
|
||||
Uses:
|
||||
|
||||
- ordered value list
|
||||
|
||||
Best for:
|
||||
|
||||
- correctness
|
||||
- deterministic multi-result behavior
|
||||
|
||||
|
||||
|
||||
### 2. Unordered descriptor
|
||||
|
||||
Preserves:
|
||||
|
||||
- only membership (set of values)
|
||||
|
||||
Ignores:
|
||||
|
||||
- ordering differences
|
||||
|
||||
Best for:
|
||||
|
||||
- higher compression
|
||||
- use cases where ordering is irrelevant
|
||||
|
||||
|
||||
|
||||
### 3. Dominant descriptor
|
||||
|
||||
Preserves:
|
||||
|
||||
- only the dominant value (`get()`)
|
||||
|
||||
Condition:
|
||||
|
||||
- dominant value must satisfy thresholds:
|
||||
- minimum percentage
|
||||
- ratio over second-best
|
||||
|
||||
Fallback:
|
||||
|
||||
- if dominance is not strong enough → ranked descriptor is used
|
||||
|
||||
Best for:
|
||||
|
||||
- maximum compression
|
||||
- single-result workflows
|
||||
|
||||
|
||||
|
||||
## Child descriptors
|
||||
|
||||
Each child is represented as:
|
||||
|
||||
```
|
||||
(edge character, child signature)
|
||||
```
|
||||
|
||||
Children are sorted by edge character to ensure:
|
||||
|
||||
- deterministic signatures
|
||||
- stable equality comparisons
|
||||
|
||||
|
||||
|
||||
## Reduction context
|
||||
|
||||
`ReductionContext` maintains:
|
||||
|
||||
- mapping: `ReductionSignature → ReducedNode`
|
||||
- canonical instances of subtrees
|
||||
|
||||
Workflow:
|
||||
|
||||
1. compute signature
|
||||
2. check if already exists
|
||||
3. reuse existing node or create new one
|
||||
|
||||
This ensures:
|
||||
|
||||
- structural sharing
|
||||
- no duplicate equivalent subtrees
|
||||
|
||||
|
||||
|
||||
## Reduced nodes
|
||||
|
||||
`ReducedNode` represents:
|
||||
|
||||
- canonical subtree
|
||||
- aggregated value counts
|
||||
- canonical children
|
||||
|
||||
It supports:
|
||||
|
||||
- merging local counts
|
||||
- verifying structural consistency
|
||||
|
||||
At this stage:
|
||||
|
||||
- structure is canonical
|
||||
- still mutable (internally)
|
||||
|
||||
|
||||
|
||||
## Stage 3: Compilation (freezing)
|
||||
|
||||
The reduced trie is converted into a **CompiledNode** structure.
|
||||
|
||||
### CompiledNode characteristics
|
||||
|
||||
- immutable
|
||||
- array-based storage
|
||||
- optimized for fast lookup
|
||||
|
||||
Fields:
|
||||
|
||||
- `char[] edgeLabels`
|
||||
- `CompiledNode[] children`
|
||||
- `V[] orderedValues`
|
||||
- `int[] orderedCounts`
|
||||
|
||||
|
||||
|
||||
## Lookup algorithm
|
||||
|
||||
Runtime lookup:
|
||||
|
||||
1. traverse trie using `edgeLabels` (matching characters from the end of the word toward the beginning)
|
||||
2. binary search per node
|
||||
3. retrieve values
|
||||
4. apply patch command
|
||||
|
||||
Properties:
|
||||
|
||||
- O(length of word)
|
||||
- low memory overhead
|
||||
- minimal memory allocation during lookup; patch application produces the resulting string
|
||||
|
||||
|
||||
## Deterministic ordering
|
||||
|
||||
Value ordering is deterministic and stable:
|
||||
|
||||
1. higher frequency first
|
||||
2. shorter string first
|
||||
3. lexicographically smaller
|
||||
4. insertion order
|
||||
|
||||
This guarantees:
|
||||
|
||||
- reproducible builds
|
||||
- stable query results
|
||||
- predictable ranking
|
||||
|
||||
|
||||
|
||||
## Reduction modes
|
||||
|
||||
Reduction modes control how local descriptors are chosen.
|
||||
|
||||
### Ranked mode
|
||||
|
||||
```
|
||||
MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS
|
||||
```
|
||||
|
||||
- preserves full semantics
|
||||
- safest option
|
||||
- recommended default
|
||||
|
||||
|
||||
|
||||
### Unordered mode
|
||||
|
||||
```
|
||||
MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS
|
||||
```
|
||||
|
||||
- ignores ordering
|
||||
- higher compression
|
||||
- slightly weaker semantics
|
||||
|
||||
|
||||
|
||||
### Dominant mode
|
||||
|
||||
```
|
||||
MERGE_SUBTREES_WITH_EQUIVALENT_DOMINANT_GET_RESULTS
|
||||
```
|
||||
|
||||
- keeps only dominant result
|
||||
- highest compression
|
||||
- may lose alternative candidates
|
||||
|
||||
|
||||
|
||||
## Trade-offs
|
||||
|
||||
| Aspect | Ranked | Unordered | Dominant |
|
||||
|---------------|--------|----------|----------|
|
||||
| Compression | Medium | High | Highest |
|
||||
| Accuracy | High | Medium | Lower |
|
||||
| getAll() | Full | Partial | Limited |
|
||||
| get() | Exact | Exact | Heuristic |
|
||||
|
||||
|
||||
|
||||
## Deserialization model
|
||||
|
||||
Binary loading uses:
|
||||
|
||||
- `NodeData` as intermediate representation
|
||||
- reconstruction of `CompiledNode`
|
||||
|
||||
This separates:
|
||||
|
||||
- I/O format
|
||||
- in-memory structure
|
||||
|
||||
|
||||
|
||||
## Why this architecture works
|
||||
|
||||
Radixor achieves:
|
||||
|
||||
### Compactness
|
||||
|
||||
- subtree sharing
|
||||
- efficient encoding
|
||||
- compressed binary output
|
||||
|
||||
### Performance
|
||||
|
||||
- array-based lookup
|
||||
- no runtime reduction
|
||||
- minimal branching
|
||||
|
||||
### Flexibility
|
||||
|
||||
- configurable reduction strategies
|
||||
- multiple result support
|
||||
- dictionary-driven behavior
|
||||
|
||||
### Determinism
|
||||
|
||||
- stable ordering
|
||||
- canonical signatures
|
||||
- reproducible builds
|
||||
|
||||
|
||||
|
||||
## Design philosophy
|
||||
|
||||
The architecture reflects a few key principles:
|
||||
|
||||
- separate build-time complexity from runtime simplicity
|
||||
- encode semantics explicitly (not implicitly in code)
|
||||
- favor deterministic behavior over heuristic shortcuts
|
||||
- allow controlled trade-offs between size and fidelity
|
||||
|
||||
|
||||
|
||||
## When to tune reduction
|
||||
|
||||
You should consider changing reduction mode when:
|
||||
|
||||
- binary size is too large
|
||||
- memory footprint must be minimized
|
||||
- only single-result stemming is needed
|
||||
|
||||
Otherwise:
|
||||
|
||||
**use ranked mode by default**
|
||||
|
||||
|
||||
|
||||
## Next steps
|
||||
|
||||
- [Quick start](quick-start.md)
|
||||
- [Programmatic usage](programmatic-usage.md)
|
||||
- [CLI compilation](cli-compilation.md)
|
||||
- [Dictionary format](dictionary-format.md)
|
||||
|
||||
|
||||
|
||||
## Summary
|
||||
|
||||
Radixor’s architecture is built around:
|
||||
|
||||
- patch-command tries
|
||||
- canonical subtree reduction
|
||||
- immutable compiled structures
|
||||
|
||||
This design allows the system to remain:
|
||||
|
||||
- fast
|
||||
- compact
|
||||
- deterministic
|
||||
- adaptable
|
||||
|
||||
while still supporting advanced use cases such as:
|
||||
|
||||
- ambiguity-aware stemming
|
||||
- dictionary evolution
|
||||
- controlled trade-offs between size and behavior
|
||||
|
||||
209
docs/architecture.md
Normal file
209
docs/architecture.md
Normal file
@@ -0,0 +1,209 @@
|
||||
# Architecture
|
||||
|
||||
This document explains the structural architecture of **Radixor**: what data is stored, how it flows through the build pipeline, and how runtime lookup works once a compiled trie has been produced.
|
||||
|
||||
## The central idea
|
||||
|
||||
Radixor does not store final stems directly as a large flat lookup table. Instead, it stores **patch commands** that describe how a word form should be transformed into a canonical stem.
|
||||
|
||||
For example, if a dictionary states that `running` should reduce to `run`, the final runtime artifact does not need to store a redundant full `running -> run` output entry. Instead, it can store a compact transformation command that expresses how to turn the source form into the target form.
|
||||
|
||||
That matters because many words share similar transformation patterns. Once those mappings are organized in a trie and compiled into a canonical structure, the result is much smaller and more reusable than a naive direct-output table.
|
||||
|
||||
## End-to-end build flow
|
||||
|
||||
The full build-time flow is:
|
||||
|
||||
```text
|
||||
Dictionary -> Mutable trie -> Reduced trie -> Compiled trie
|
||||
```
|
||||
|
||||
Each stage has a different purpose.
|
||||
|
||||
### Dictionary input
|
||||
|
||||
The textual dictionary groups known word forms under a canonical stem:
|
||||
|
||||
```text
|
||||
run running runs ran
|
||||
connect connected connecting connection
|
||||
```
|
||||
|
||||
The first column is the canonical stem. The following tab-separated columns are known variants.
|
||||
|
||||
### Patch-command generation
|
||||
|
||||
Each variant is converted into a patch command that transforms the variant into the stem.
|
||||
|
||||
Conceptually:
|
||||
|
||||
```text
|
||||
running -> <patch> -> run
|
||||
runs -> <patch> -> run
|
||||
ran -> <patch> -> run
|
||||
```
|
||||
|
||||
If `storeOriginal` is enabled, the stem itself is also inserted using a canonical no-op patch.
|
||||
|
||||
### Mutable trie construction
|
||||
|
||||
Those patch-command values are inserted into a mutable trie keyed by the source surface form.
|
||||
|
||||
### Reduction
|
||||
|
||||
Equivalent subtrees are merged into canonical reduced nodes.
|
||||
|
||||
### Compilation
|
||||
|
||||
The reduced structure is frozen into an immutable compiled trie optimized for runtime lookup.
|
||||
|
||||
## Why a trie is used
|
||||
|
||||
A trie is useful because many word forms share structural fragments. Instead of storing each word independently, the trie reuses paths and organizes lookup by character traversal.
|
||||
|
||||
A trie node can contain:
|
||||
|
||||
- outgoing edges,
|
||||
- one or more ordered values,
|
||||
- counts aligned with those values.
|
||||
|
||||
This is why the structure can represent both:
|
||||
|
||||
- a single preferred result,
|
||||
- multiple competing results for the same key.
|
||||
|
||||
## Stage 1: Mutable construction
|
||||
|
||||
The mutable build-time structure is created by `FrequencyTrie.Builder`.
|
||||
|
||||
This stage is optimized for insertion rather than runtime lookup. As dictionary data is added, the builder accumulates:
|
||||
|
||||
- child edges,
|
||||
- local values,
|
||||
- local frequencies of those values.
|
||||
|
||||
Those frequencies are not incidental metadata. They later influence both result ordering and, depending on reduction mode, the semantic identity of subtrees during reduction.
|
||||
|
||||
### Why the build-time form is mutable
|
||||
|
||||
The builder must be easy to extend and easy to aggregate into. That is the opposite of what a runtime lookup structure needs.
|
||||
|
||||
Build-time priorities are:
|
||||
|
||||
- flexibility,
|
||||
- accumulation of counts,
|
||||
- structural growth.
|
||||
|
||||
Runtime priorities are:
|
||||
|
||||
- compactness,
|
||||
- immutability,
|
||||
- fast lookup.
|
||||
|
||||
Radixor therefore keeps construction and runtime representation strictly separate.
|
||||
|
||||
## What a compiled node contains
|
||||
|
||||
After reduction and freezing, the runtime structure uses immutable compiled nodes.
|
||||
|
||||
A compiled node stores:
|
||||
|
||||
- `char[] edgeLabels`
|
||||
- child-node references aligned with those labels
|
||||
- ordered value arrays
|
||||
- aligned count arrays
|
||||
|
||||
This array-based form is compact and efficient for lookup.
|
||||
|
||||
## Runtime lookup model
|
||||
|
||||
At runtime, lookup is conceptually simple:
|
||||
|
||||
1. traverse the compiled trie by the input key,
|
||||
2. reach the node addressed by that key,
|
||||
3. retrieve one or more stored patch commands,
|
||||
4. apply the chosen patch command to the original word.
|
||||
|
||||
The trie itself does not create the final stem string. It selects the stored transformation command. `PatchCommandEncoder.apply(...)` then performs the actual transformation.
|
||||
|
||||
That separation is architecturally important:
|
||||
|
||||
- the trie is responsible for **selection**,
|
||||
- patch application is responsible for **transformation**.
|
||||
|
||||
## `get()` and `getAll()`
|
||||
|
||||
The runtime API exposes two complementary views of the addressed node.
|
||||
|
||||
### `get()`
|
||||
|
||||
`get()` returns the locally preferred value stored at that node.
|
||||
|
||||
Preference is deterministic:
|
||||
|
||||
1. higher local frequency wins,
|
||||
2. shorter textual representation wins,
|
||||
3. lexicographically lower textual representation wins,
|
||||
4. stable first-seen order acts as the final tie-breaker.
|
||||
|
||||
### `getAll()`
|
||||
|
||||
`getAll()` returns all locally stored values in deterministic ranked order.
|
||||
|
||||
This is what allows Radixor to preserve ambiguity explicitly instead of forcing every key into a single answer.
|
||||
|
||||
## Why multiple results can exist
|
||||
|
||||
Some stemming systems discard ambiguity early because they insist on returning exactly one answer.
|
||||
|
||||
Radixor does not require that simplification. If multiple plausible patch commands exist for a key, the compiled trie can preserve them and the runtime API can expose them.
|
||||
|
||||
That is useful when downstream logic wants to:
|
||||
|
||||
- inspect ambiguity,
|
||||
- preserve alternatives for retrieval,
|
||||
- apply later ranking or domain-specific selection.
|
||||
|
||||
## Why compiled artifacts are compact
|
||||
|
||||
The final compiled trie can be much smaller than the original dictionary for several reasons working together:
|
||||
|
||||
- patch commands are compact,
|
||||
- trie paths reuse shared structure,
|
||||
- reduction merges equivalent subtrees,
|
||||
- binary persistence stores the already reduced form,
|
||||
- GZip compression is applied on top of the binary format.
|
||||
|
||||
This is why a very large dictionary can still produce a manageable deployable runtime artifact.
|
||||
|
||||
## Why preparation can still use more memory
|
||||
|
||||
The compactness of the final artifact should not be confused with the memory usage of preparation.
|
||||
|
||||
Before reduction has completed, the mutable build-time structure must exist in memory. For large dictionaries, that temporary preparation cost can be noticeably higher than the size of the final persisted artifact or the loaded compiled trie.
|
||||
|
||||
That is why the preferred operational model is usually:
|
||||
|
||||
- compile offline,
|
||||
- persist the compiled artifact,
|
||||
- load the finished artifact in runtime services.
|
||||
|
||||
## Determinism as a design principle
|
||||
|
||||
Radixor favors deterministic behavior throughout the pipeline.
|
||||
|
||||
This appears in:
|
||||
|
||||
- lowercased dictionary parsing,
|
||||
- stable value ordering,
|
||||
- sorted child descriptors,
|
||||
- canonical reduction signatures,
|
||||
- reproducible compiled lookup behavior.
|
||||
|
||||
Determinism matters not only for tests, but also for operational trust. It makes stemming behavior explainable and reproducible across builds and environments.
|
||||
|
||||
## Continue with
|
||||
|
||||
- [Reduction Semantics](reduction-semantics.md)
|
||||
- [Programmatic usage](programmatic-usage.md)
|
||||
- [CLI compilation](cli-compilation.md)
|
||||
BIN
docs/assets/images/banner.jpg
Normal file
BIN
docs/assets/images/banner.jpg
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 540 KiB |
109
docs/assets/stylesheets/extra.css
Normal file
109
docs/assets/stylesheets/extra.css
Normal file
@@ -0,0 +1,109 @@
|
||||
/* Compact technical typography for Radixor */
|
||||
|
||||
:root {
|
||||
--md-text-font: "Inter", "Segoe UI", "Roboto", "Helvetica Neue", Arial, sans-serif;
|
||||
}
|
||||
|
||||
/* Hide page title only on the landing page */
|
||||
.visually-hidden {
|
||||
display: none;
|
||||
}
|
||||
|
||||
/* Main article text */
|
||||
.md-typeset {
|
||||
font-size: 0.78rem;
|
||||
line-height: 1.3;
|
||||
}
|
||||
|
||||
/* Paragraph spacing */
|
||||
.md-typeset p,
|
||||
.md-typeset ul,
|
||||
.md-typeset ol,
|
||||
.md-typeset dl,
|
||||
.md-typeset blockquote {
|
||||
margin-top: 0.45em;
|
||||
margin-bottom: 0.45em;
|
||||
}
|
||||
|
||||
/* Headings */
|
||||
.md-typeset h1 {
|
||||
margin: 0 0 0.7rem;
|
||||
font-size: 1.8rem;
|
||||
line-height: 1.15;
|
||||
}
|
||||
|
||||
.md-typeset h2 {
|
||||
margin: 1.2rem 0 0.55rem;
|
||||
font-size: 1.3rem;
|
||||
line-height: 1.2;
|
||||
}
|
||||
|
||||
.md-typeset h3 {
|
||||
margin: 1rem 0 0.45rem;
|
||||
font-size: 1.05rem;
|
||||
line-height: 1.25;
|
||||
}
|
||||
|
||||
.md-typeset h4,
|
||||
.md-typeset h5,
|
||||
.md-typeset h6 {
|
||||
margin: 0.85rem 0 0.35rem;
|
||||
line-height: 1.25;
|
||||
}
|
||||
|
||||
/* Lists */
|
||||
.md-typeset li {
|
||||
margin-bottom: 0.15em;
|
||||
}
|
||||
|
||||
.md-typeset ul,
|
||||
.md-typeset ol {
|
||||
padding-left: 1.1rem;
|
||||
}
|
||||
|
||||
/* Tables */
|
||||
.md-typeset table:not([class]) td,
|
||||
.md-typeset table:not([class]) th {
|
||||
padding: 0.45rem 0.7rem;
|
||||
}
|
||||
|
||||
/* Code blocks */
|
||||
.md-typeset pre > code {
|
||||
font-size: 0.72rem;
|
||||
line-height: 1.4;
|
||||
}
|
||||
|
||||
/* Inline code */
|
||||
.md-typeset code {
|
||||
font-size: 0.72rem;
|
||||
}
|
||||
|
||||
/* Navigation density */
|
||||
.md-nav__item .md-nav__link {
|
||||
margin-top: 0.12rem;
|
||||
margin-bottom: 0.12rem;
|
||||
}
|
||||
|
||||
.md-sidebar__scrollwrap {
|
||||
padding-top: 0.3rem;
|
||||
padding-bottom: 0.3rem;
|
||||
}
|
||||
|
||||
/* Slightly narrower content rhythm */
|
||||
.md-content__inner {
|
||||
margin-top: 0.6rem;
|
||||
padding-bottom: 1.2rem;
|
||||
}
|
||||
|
||||
/* Admonitions more compact */
|
||||
.md-typeset .admonition,
|
||||
.md-typeset details {
|
||||
margin: 0.8rem 0;
|
||||
}
|
||||
|
||||
/* Optional: use a bit wider content area on large screens */
|
||||
@media screen and (min-width: 76.25em) {
|
||||
.md-grid {
|
||||
max-width: 68rem;
|
||||
}
|
||||
}
|
||||
@@ -1,41 +1,66 @@
|
||||
# Benchmarking
|
||||
|
||||
> ← Back to [README.md](../README.md)
|
||||
|
||||
Radixor includes a JMH benchmark suite for both the internal algorithmic core and a side-by-side English comparison against the Snowball Porter stemmer family.
|
||||
|
||||
This document explains what is benchmarked, how to run it, and how to interpret the results responsibly.
|
||||
This document explains what is benchmarked, how to run the suite, and how benchmark results should be interpreted.
|
||||
|
||||
## Scope
|
||||
|
||||
The benchmark suite currently covers two categories:
|
||||
|
||||
- Radixor core operations
|
||||
- English stemmer comparison on the same token workload
|
||||
- Radixor core operations,
|
||||
- English stemmer comparison on the same token workload.
|
||||
|
||||
The comparison benchmark processes the same deterministic English token stream through:
|
||||
|
||||
- Radixor with bundled `US_UK_PROFI`
|
||||
- Snowball original Porter
|
||||
- Snowball English, commonly referred to as Porter2
|
||||
- Radixor with bundled `US_UK` (older benchmark snapshots used the now-retired `US_UK_PROFI` resource),
|
||||
- Snowball original Porter,
|
||||
- Snowball English, commonly referred to as Porter2.
|
||||
|
||||
The purpose of the comparison is throughput measurement on identical input. It is not intended to prove linguistic equivalence between the compared stemmers.
|
||||
The purpose of the comparison is throughput measurement on identical input. It is not intended to demonstrate linguistic equivalence between the compared stemmers.
|
||||
|
||||
## How to read the published numbers
|
||||
|
||||
Two kinds of benchmark numbers are relevant in the project.
|
||||
|
||||
### Reference measurements
|
||||
|
||||
The detailed benchmark snapshot documented on this page comes from a controlled run on a Ryzen 5 system. Those numbers are the best reference point for understanding absolute throughput under a known local benchmark environment.
|
||||
|
||||
### Published badge figures
|
||||
|
||||
The benchmark badge metadata published through GitHub Pages is generated in the GitHub-hosted container environment. That environment is convenient for continuous publication, but it is not the right place to treat absolute throughput values as stable across time. CPU scheduling, shared-host variability, and container-level noise can materially affect raw numbers from run to run.
|
||||
|
||||
For that reason, the published badge values should be treated primarily as a compact status surface. They are useful for observing broad trends and relative positioning, but not as the authoritative source for precise absolute throughput claims.
|
||||
|
||||
## Current snapshot
|
||||
|
||||
A recent JMH run on JDK 21.0.10 with JMH 1.37, one thread, three warmup iterations, and five measurement iterations produced the following approximate throughput ranges:
|
||||
|
||||
| Workload | Radixor `US_UK_PROFI` | Snowball Porter | Snowball English |
|
||||
| Workload | Radixor `US_UK` *(historical runs: `US_UK_PROFI`)* | Snowball Porter | Snowball English |
|
||||
| --- | ---: | ---: | ---: |
|
||||
| About 12,000 generated tokens | 30.99 M tokens/s | 8.21 M tokens/s | 5.46 M tokens/s |
|
||||
| About 60,000 generated tokens | 32.25 M tokens/s | 8.02 M tokens/s | 5.11 M tokens/s |
|
||||
|
||||
On that workload, Radixor is approximately:
|
||||
On that workload, Radixor measured approximately:
|
||||
|
||||
- 4 times faster than Snowball original Porter
|
||||
- 6 times faster than Snowball English
|
||||
- 4 times the throughput of Snowball original Porter,
|
||||
- 6 times the throughput of Snowball English.
|
||||
|
||||
These values are workload- and environment-dependent. Treat them as measured results for the documented benchmark setup, not as universal constants.
|
||||
These values are workload-dependent and environment-dependent. They should be read as measured results for the documented setup, not as universal constants.
|
||||
|
||||
## Interpreting the relative result
|
||||
|
||||
Although the absolute numbers can move across environments, the throughput relationship between Radixor and the compared Porter-family stemmers has remained broadly stable in practical measurements. In particular, the comparison against Snowball original Porter is consistently close to four to one in Radixor’s favor.
|
||||
|
||||
That relative behavior is more informative than any single absolute figure. It reflects a real architectural difference rather than a cosmetic benchmark artifact.
|
||||
|
||||
Radixor is built around a compiled patch-command trie that resolves the result through a direct lookup and patch application path. In contrast, classic rule-based stemmers such as the Porter family follow a different operational model. The result is that Radixor combines two properties that do not often appear together:
|
||||
|
||||
- dictionary-driven compiled lookup performance,
|
||||
- the ability to generalize beyond explicitly listed word forms instead of behaving like a pure closed-form dictionary lookup table.
|
||||
|
||||
Within that design space, the measured throughput profile is strong enough to place Radixor among the fastest known practical implementations of this kind, while still supporting stemming of previously unseen forms. That should still be read as a carefully bounded engineering statement, not as an absolute claim over every possible stemmer architecture or benchmark scenario.
|
||||
|
||||
## Benchmark classes
|
||||
|
||||
@@ -43,9 +68,9 @@ The main benchmark classes are under `src/jmh/java/org/egothor/stemmer/benchmark
|
||||
|
||||
Relevant classes include:
|
||||
|
||||
- `FrequencyTrieLookupBenchmark`
|
||||
- `FrequencyTrieCompilationBenchmark`
|
||||
- `EnglishStemmerComparisonBenchmark`
|
||||
- `FrequencyTrieLookupBenchmark`,
|
||||
- `FrequencyTrieCompilationBenchmark`,
|
||||
- `EnglishStemmerComparisonBenchmark`.
|
||||
|
||||
The English comparison benchmark uses the bundled Radixor English resource and the official Snowball Java distribution integrated into the JMH source set.
|
||||
|
||||
@@ -55,10 +80,10 @@ The English comparison benchmark uses a deterministic generated corpus rather th
|
||||
|
||||
The workload intentionally mixes:
|
||||
|
||||
- simple inflections
|
||||
- common derivational forms
|
||||
- US and UK spelling families
|
||||
- lexical forms appropriate for `US_UK_PROFI`
|
||||
- simple inflections,
|
||||
- common derivational forms,
|
||||
- US and UK spelling families,
|
||||
- lexical forms appropriate for the current bundled `US_UK` resource (with historical continuity from earlier `US_UK_PROFI` runs).
|
||||
|
||||
This design keeps runs reproducible across environments and avoids accidental drift caused by changing external corpora.
|
||||
|
||||
@@ -80,42 +105,42 @@ Run only the English comparison benchmark:
|
||||
|
||||
JMH reports are written to:
|
||||
|
||||
- `build/reports/jmh/jmh-results.txt`
|
||||
- `build/reports/jmh/jmh-results.csv`
|
||||
- `build/reports/jmh/jmh-results.txt`,
|
||||
- `build/reports/jmh/jmh-results.csv`.
|
||||
|
||||
The text report is convenient for human review. The CSV report is more useful for CI archiving, historical tracking, and external processing.
|
||||
|
||||
## Interpreting results
|
||||
## Interpreting results responsibly
|
||||
|
||||
Benchmark numbers should be read with care.
|
||||
Benchmark numbers should always be read with care.
|
||||
|
||||
Important factors include:
|
||||
|
||||
- CPU model and frequency behavior
|
||||
- thermal throttling
|
||||
- JVM vendor and version
|
||||
- system background load
|
||||
- operating-system scheduling noise
|
||||
- benchmark parameter changes
|
||||
- CPU model and frequency behavior,
|
||||
- thermal throttling,
|
||||
- JVM vendor and version,
|
||||
- system background load,
|
||||
- operating-system scheduling noise,
|
||||
- benchmark parameter changes.
|
||||
|
||||
For meaningful comparison, keep these stable:
|
||||
|
||||
- hardware or VM class
|
||||
- JDK version
|
||||
- benchmark parameters
|
||||
- thread count
|
||||
- benchmark source revision
|
||||
- hardware or VM class,
|
||||
- JDK version,
|
||||
- benchmark parameters,
|
||||
- thread count,
|
||||
- benchmark source revision.
|
||||
|
||||
If a regression is suspected, repeat the run and compare against the previous CSV output rather than relying on a single measurement.
|
||||
If a regression is suspected, repeat the run and compare against previous CSV output rather than relying on a single measurement.
|
||||
|
||||
## Regression tracking
|
||||
|
||||
The recommended regression workflow is:
|
||||
|
||||
1. archive `jmh-results.csv`
|
||||
2. compare the same benchmark names across runs
|
||||
3. compare only like-for-like environments
|
||||
4. investigate sustained regressions rather than one-off noise
|
||||
1. archive `jmh-results.csv`,
|
||||
2. compare the same benchmark names across runs,
|
||||
3. compare only like-for-like environments,
|
||||
4. investigate sustained regressions rather than one-off noise.
|
||||
|
||||
For public reporting, the README should keep only the condensed benchmark summary, while detailed benchmark methodology and interpretation should remain in this document.
|
||||
|
||||
@@ -127,8 +152,8 @@ Radixor uses a compiled patch-command trie driven by dictionary data. Snowball P
|
||||
|
||||
Because of that, the comparison should be understood as:
|
||||
|
||||
- equal input workload
|
||||
- different stemming strategies
|
||||
- measured throughput, not semantic identity
|
||||
- equal input workload,
|
||||
- different stemming strategies,
|
||||
- measured throughput rather than semantic identity.
|
||||
|
||||
That distinction matters whenever performance claims are discussed in documentation or release notes.
|
||||
That distinction matters whenever performance claims are discussed in documentation, release notes, or badge summaries.
|
||||
|
||||
@@ -1,54 +1,49 @@
|
||||
# Built-in Languages
|
||||
|
||||
> ← Back to [README.md](../README.md)
|
||||
|
||||
Radixor provides a set of **bundled stemmer dictionaries** that can be loaded directly without preparing custom data.
|
||||
|
||||
These built-in resources are useful for:
|
||||
|
||||
- quick integration
|
||||
- testing and evaluation
|
||||
- reference behavior
|
||||
- prototyping search pipelines
|
||||
|
||||
|
||||
Radixor ships with a curated set of bundled stemmer dictionaries that can be loaded directly from the library distribution. These resources are intended to provide an immediately usable baseline for evaluation, prototyping, integration, and general-purpose stemming workloads, while still fitting naturally into workflows where the bundled baseline is later refined, extended, or replaced with custom lexical data.
|
||||
|
||||
## Overview
|
||||
|
||||
Bundled dictionaries are exposed through:
|
||||
|
||||
```java
|
||||
StemmerPatchTrieLoader.Language
|
||||
org.egothor.stemmer.StemmerPatchTrieLoader.Language
|
||||
```
|
||||
|
||||
They are packaged with the library and loaded from the classpath.
|
||||
Each bundled dictionary is packaged with the library as a compressed UTF-8 text resource. When loaded, the resource is parsed by `StemmerDictionaryParser`, transformed into patch-command mappings, and compiled into a read-only `FrequencyTrie<String>` by `StemmerPatchTrieLoader`.
|
||||
|
||||
The bundled language definition also carries a language-level right-to-left flag. That flag is used by the loader to derive the `WordTraversalDirection` used for both trie-key construction and patch-command generation. In practice, left-to-right bundled languages use historical backward Egothor traversal, while right-to-left bundled languages use forward traversal over the stored form.
|
||||
|
||||
## Supported bundled languages
|
||||
|
||||
## Supported languages
|
||||
|
||||
The following language identifiers are currently available:
|
||||
|
||||
| Language | Enum constant | Description |
|
||||
|----------|------------------|------------------------------|
|
||||
| Danish | `DA_DK` | Danish |
|
||||
| German | `DE_DE` | German |
|
||||
| Spanish | `ES_ES` | Spanish |
|
||||
| French | `FR_FR` | French |
|
||||
| Italian | `IT_IT` | Italian |
|
||||
| Dutch | `NL_NL` | Dutch |
|
||||
| Norwegian| `NO_NO` | Norwegian |
|
||||
| Portuguese| `PT_PT` | Portuguese |
|
||||
| Russian | `RU_RU` | Russian |
|
||||
| Swedish | `SV_SE` | Swedish |
|
||||
| English | `US_UK` | Standard English |
|
||||
| English | `US_UK_PROFI` | Extended English dictionary |
|
||||
|
||||
The following bundled language identifiers are currently available:
|
||||
|
||||
| Language | Enum constant | Writing direction | Notes |
|
||||
|---|---|---:|---|
|
||||
| Czech | `CS_CZ` | LTR | Bundled general-purpose dictionary |
|
||||
| Danish | `DA_DK` | LTR | Bundled general-purpose dictionary |
|
||||
| German | `DE_DE` | LTR | Bundled general-purpose dictionary |
|
||||
| Spanish | `ES_ES` | LTR | Bundled general-purpose dictionary |
|
||||
| Persian | `FA_IR` | RTL | Bundled dictionary uses forward traversal over the stored form |
|
||||
| Finnish | `FI_FI` | LTR | Bundled general-purpose dictionary |
|
||||
| French | `FR_FR` | LTR | Bundled general-purpose dictionary |
|
||||
| Hebrew | `HE_IL` | RTL | Bundled dictionary uses forward traversal over the stored form |
|
||||
| Hungarian | `HU_HU` | LTR | Bundled general-purpose dictionary |
|
||||
| Italian | `IT_IT` | LTR | Bundled general-purpose dictionary |
|
||||
| Norwegian Bokmål | `NB_NO` | LTR | Bundled general-purpose dictionary |
|
||||
| Dutch | `NL_NL` | LTR | Bundled general-purpose dictionary |
|
||||
| Norwegian Nynorsk | `NN_NO` | LTR | Bundled general-purpose dictionary |
|
||||
| Polish | `PL_PL` | LTR | Bundled general-purpose dictionary |
|
||||
| Portuguese | `PT_PT` | LTR | Bundled general-purpose dictionary |
|
||||
| Russian | `RU_RU` | LTR | Bundled general-purpose dictionary |
|
||||
| Swedish | `SV_SE` | LTR | Bundled general-purpose dictionary |
|
||||
| Ukrainian | `UK_UA` | LTR | Bundled general-purpose dictionary |
|
||||
| English | `US_UK` | LTR | Bundled general-purpose dictionary |
|
||||
| Yiddish | `YI` | RTL | Bundled dictionary uses forward traversal over the stored form |
|
||||
|
||||
## Basic usage
|
||||
|
||||
Load a bundled stemmer:
|
||||
Load a bundled dictionary like this:
|
||||
|
||||
```java
|
||||
import java.io.IOException;
|
||||
@@ -59,194 +54,203 @@ import org.egothor.stemmer.StemmerPatchTrieLoader;
|
||||
|
||||
public final class BuiltInExample {
|
||||
|
||||
public static void main(String[] args) throws IOException {
|
||||
FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(
|
||||
StemmerPatchTrieLoader.Language.US_UK_PROFI,
|
||||
private BuiltInExample() {
|
||||
throw new AssertionError("No instances.");
|
||||
}
|
||||
|
||||
public static void main(final String[] arguments) throws IOException {
|
||||
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(
|
||||
StemmerPatchTrieLoader.Language.US_UK,
|
||||
true,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS
|
||||
);
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
|
||||
|
||||
System.out.println(trie.traversalDirection());
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
This call loads the bundled dictionary resource for the selected language, parses its lexical entries, derives patch-command mappings, and compiles the result into a read-only trie.
|
||||
|
||||
|
||||
## Example: stemming with `US_UK_PROFI`
|
||||
## Example: stemming with a bundled dictionary
|
||||
|
||||
```java
|
||||
import java.io.IOException;
|
||||
|
||||
import org.egothor.stemmer.*;
|
||||
import org.egothor.stemmer.FrequencyTrie;
|
||||
import org.egothor.stemmer.PatchCommandEncoder;
|
||||
import org.egothor.stemmer.ReductionMode;
|
||||
import org.egothor.stemmer.StemmerPatchTrieLoader;
|
||||
|
||||
public final class EnglishExample {
|
||||
|
||||
public static void main(String[] args) throws IOException {
|
||||
FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(
|
||||
StemmerPatchTrieLoader.Language.US_UK_PROFI,
|
||||
true,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS
|
||||
);
|
||||
private EnglishExample() {
|
||||
throw new AssertionError("No instances.");
|
||||
}
|
||||
|
||||
String word = "running";
|
||||
String patch = trie.get(word);
|
||||
String stem = PatchCommandEncoder.apply(word, patch);
|
||||
public static void main(final String[] arguments) throws IOException {
|
||||
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(
|
||||
StemmerPatchTrieLoader.Language.US_UK,
|
||||
true,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
|
||||
|
||||
final String word = "running";
|
||||
final String patch = trie.get(word);
|
||||
final String stem = PatchCommandEncoder.apply(word, patch, trie.traversalDirection());
|
||||
|
||||
System.out.println(word + " -> " + stem);
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Passing `trie.traversalDirection()` to `PatchCommandEncoder.apply(...)` is the correct general contract. It ensures that the patch is applied using the same logical traversal model that was used when the trie and its patch commands were produced.
|
||||
|
||||
## Traversal behavior and right-to-left languages
|
||||
|
||||
## `US_UK` vs `US_UK_PROFI`
|
||||
Bundled dictionaries are not all processed identically.
|
||||
|
||||
### `US_UK`
|
||||
For traditional left-to-right suffix-oriented resources, Radixor preserves historical Egothor behavior and traverses logical word characters backward. That means trie paths are constructed from the logical end of the stored word toward its beginning, and patch commands are interpreted with the same backward traversal model.
|
||||
|
||||
* smaller dictionary
|
||||
* faster load time
|
||||
* suitable for lightweight use cases
|
||||
For bundled right-to-left languages such as Persian, Hebrew, and Yiddish, Radixor uses forward traversal over the stored form. In those cases:
|
||||
|
||||
### `US_UK_PROFI`
|
||||
- trie keys are traversed from the logical beginning of the stored form,
|
||||
- patch commands are generated in that same forward direction,
|
||||
- patch application must use `WordTraversalDirection.FORWARD`, which is naturally obtained from `trie.traversalDirection()`.
|
||||
|
||||
* larger and more complete dataset
|
||||
* better coverage of word forms
|
||||
* improved stemming quality
|
||||
* slightly larger memory footprint
|
||||
This design keeps the traversal policy explicit and consistent across dictionary loading, trie lookup, binary persistence, builder reconstruction, and patch application.
|
||||
|
||||
### Recommendation
|
||||
## Reduction behavior
|
||||
|
||||
Use:
|
||||
Bundled dictionaries can be compiled using any supported `ReductionMode`. The reduction configuration controls how semantically equivalent subtrees are merged during trie compilation, while preserving the contract of the selected mode.
|
||||
|
||||
```
|
||||
US_UK_PROFI
|
||||
```
|
||||
Typical entry points are:
|
||||
|
||||
for most applications unless memory constraints are strict.
|
||||
- `StemmerPatchTrieLoader.load(language, storeOriginal, reductionMode)`
|
||||
- `StemmerPatchTrieLoader.load(language, storeOriginal, reductionSettings)`
|
||||
|
||||
For most users, `ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS` is the most conservative general-purpose choice because it preserves ranked `getAll(...)` behavior.
|
||||
|
||||
## Intended role of bundled dictionaries
|
||||
|
||||
## How bundled dictionaries are loaded
|
||||
Bundled dictionaries should be understood as practical default resources.
|
||||
|
||||
Internally:
|
||||
They are a good fit when:
|
||||
|
||||
- dictionaries are stored as text resources
|
||||
- parsed using `StemmerDictionaryParser`
|
||||
- compiled into a trie at load time
|
||||
- a supported language is already available,
|
||||
- immediate usability matters,
|
||||
- a reasonable baseline is sufficient,
|
||||
- the goal is evaluation, prototyping, or straightforward integration.
|
||||
|
||||
This means:
|
||||
They are also well suited to staged refinement workflows in which a bundled base is loaded first, then extended with domain-specific vocabulary, and finally persisted as a custom binary artifact.
|
||||
|
||||
- first load includes parsing + compilation cost
|
||||
- subsequent usage is fast
|
||||
## Character representation
|
||||
|
||||
Bundled dictionaries are ordinary UTF-8 lexical resources. The parser reads them as text, the trie stores standard Java strings, and the patch-command model operates on general character sequences.
|
||||
|
||||
This is important for two reasons:
|
||||
|
||||
## When to use bundled languages
|
||||
1. the built-in resources are not limited to ASCII-only processing,
|
||||
2. the traversal model is orthogonal to character encoding and script choice.
|
||||
|
||||
Bundled dictionaries are suitable when:
|
||||
In other words, right-to-left handling in the loader is about logical traversal strategy, not about introducing a separate character model.
|
||||
|
||||
- you need quick results without preparing custom data
|
||||
- you are prototyping or experimenting
|
||||
- your language requirements match the provided datasets
|
||||
## When to prefer custom dictionaries
|
||||
|
||||
A custom dictionary is usually the better choice when:
|
||||
|
||||
- domain-specific vocabulary materially affects stemming quality,
|
||||
- lexical coverage must be controlled more precisely,
|
||||
- a stronger lexical resource is available than the bundled baseline,
|
||||
- operational requirements demand an explicitly curated, versioned artifact.
|
||||
|
||||
## When to use custom dictionaries
|
||||
|
||||
You should prefer custom dictionaries when:
|
||||
|
||||
- domain-specific vocabulary is important
|
||||
- accuracy requirements are high
|
||||
- you need full control over stemming behavior
|
||||
|
||||
Typical examples:
|
||||
|
||||
- technical terminology
|
||||
- product catalogs
|
||||
- biomedical text
|
||||
- legal or financial language
|
||||
|
||||
Typical examples include:
|
||||
|
||||
- technical terminology,
|
||||
- biomedical language,
|
||||
- legal or financial vocabulary,
|
||||
- organization-specific product and process names,
|
||||
- dictionaries maintained with project-specific validation rules.
|
||||
|
||||
## Production recommendation
|
||||
|
||||
For production systems:
|
||||
For production systems, the most robust workflow is usually:
|
||||
|
||||
1. Load a bundled dictionary
|
||||
2. Extend it with domain-specific terms (optional)
|
||||
3. Compile it into a binary `.radixor.gz` file
|
||||
4. Deploy the compiled artifact
|
||||
5. Load it using `loadBinary(...)`
|
||||
1. start from a bundled dictionary when it is suitable,
|
||||
2. extend it with domain-specific forms if needed,
|
||||
3. rebuild it into a binary artifact,
|
||||
4. deploy that compiled binary artifact,
|
||||
5. load it at runtime through `loadBinary(...)`.
|
||||
|
||||
This avoids:
|
||||
This avoids repeated startup parsing and makes the deployed stemming behavior explicit, reproducible, and versionable.
|
||||
|
||||
- runtime parsing overhead
|
||||
- repeated compilation
|
||||
- startup latency
|
||||
|
||||
|
||||
|
||||
## Example workflow
|
||||
## Example refinement workflow
|
||||
|
||||
```java
|
||||
// 1. Load bundled dictionary
|
||||
FrequencyTrie<String> base = StemmerPatchTrieLoader.load(
|
||||
StemmerPatchTrieLoader.Language.US_UK_PROFI,
|
||||
true,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS
|
||||
);
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
|
||||
// 2. Modify (optional)
|
||||
FrequencyTrie.Builder<String> builder =
|
||||
FrequencyTrieBuilders.copyOf(
|
||||
import org.egothor.stemmer.FrequencyTrie;
|
||||
import org.egothor.stemmer.FrequencyTrieBuilders;
|
||||
import org.egothor.stemmer.ReductionMode;
|
||||
import org.egothor.stemmer.ReductionSettings;
|
||||
import org.egothor.stemmer.StemmerPatchTrieBinaryIO;
|
||||
import org.egothor.stemmer.StemmerPatchTrieLoader;
|
||||
|
||||
public final class BundledRefinementExample {
|
||||
|
||||
private BundledRefinementExample() {
|
||||
throw new AssertionError("No instances.");
|
||||
}
|
||||
|
||||
public static void main(final String[] arguments) throws IOException {
|
||||
final FrequencyTrie<String> base = StemmerPatchTrieLoader.load(
|
||||
StemmerPatchTrieLoader.Language.US_UK,
|
||||
true,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
|
||||
|
||||
final FrequencyTrie.Builder<String> builder = FrequencyTrieBuilders.copyOf(
|
||||
base,
|
||||
String[]::new,
|
||||
ReductionSettings.withDefaults(
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS
|
||||
)
|
||||
);
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS));
|
||||
|
||||
builder.put("microservices", PatchCommandEncoder.NOOP_PATCH);
|
||||
builder.put("microservices", "Na");
|
||||
|
||||
// 3. Compile
|
||||
FrequencyTrie<String> compiled = builder.build();
|
||||
final FrequencyTrie<String> compiled = builder.build();
|
||||
|
||||
// 4. Save
|
||||
StemmerPatchTrieBinaryIO.write(compiled, Path.of("english-custom.radixor.gz"));
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
The reconstructed builder preserves the traversal direction of the source trie, so refinements remain semantically aligned with the original bundled dictionary.
|
||||
|
||||
## Extending language support
|
||||
|
||||
## Limitations
|
||||
The built-in set is intentionally a practical baseline rather than a closed catalog. Additional languages, stronger lexical coverage, and improved dictionaries for currently supported languages are all natural extension paths.
|
||||
|
||||
* bundled dictionaries are **general-purpose**
|
||||
* they may not reflect:
|
||||
What matters most is not only the number of entries, but the quality, consistency, maintainability, and operational usefulness of the lexical resource being added.
|
||||
|
||||
* domain-specific usage
|
||||
* rare or specialized vocabulary
|
||||
* organization-specific terminology
|
||||
## Related API surface
|
||||
|
||||
The following types are typically involved when working with bundled dictionaries:
|
||||
|
||||
- `StemmerPatchTrieLoader`
|
||||
- `StemmerPatchTrieLoader.Language`
|
||||
- `FrequencyTrie`
|
||||
- `PatchCommandEncoder`
|
||||
- `WordTraversalDirection`
|
||||
- `ReductionMode`
|
||||
- `ReductionSettings`
|
||||
- `StemmerPatchTrieBinaryIO`
|
||||
- `FrequencyTrieBuilders`
|
||||
|
||||
## Next steps
|
||||
|
||||
* [Quick start](quick-start.md)
|
||||
* [Dictionary format](dictionary-format.md)
|
||||
* [CLI compilation](cli-compilation.md)
|
||||
* [Programmatic usage](programmatic-usage.md)
|
||||
|
||||
|
||||
- [Quick start](quick-start.md)
|
||||
- [Dictionary format](dictionary-format.md)
|
||||
- [CLI compilation](cli-compilation.md)
|
||||
- [Programmatic usage](programmatic-usage.md)
|
||||
|
||||
## Summary
|
||||
|
||||
Radixor’s built-in language support provides:
|
||||
|
||||
* immediate usability
|
||||
* reference datasets
|
||||
* a starting point for customization
|
||||
|
||||
For production systems, they are best used as:
|
||||
|
||||
* a baseline
|
||||
* a seed for further extension
|
||||
* a source for compiled deployment artifacts
|
||||
|
||||
Radixor’s built-in language support provides immediate usability, a professionally defined baseline API, and a practical starting point for custom refinement. The bundled set now includes both left-to-right and right-to-left languages, and the library models that distinction explicitly through `WordTraversalDirection` so that trie construction, lookup, and patch application remain consistent.
|
||||
|
||||
@@ -1,305 +1,278 @@
|
||||
# CLI Compilation
|
||||
|
||||
> ← Back to [README.md](../README.md)
|
||||
Radixor provides a command-line compiler for turning line-oriented dictionary files into compact binary stemmer artifacts.
|
||||
|
||||
Radixor provides a command-line tool for compiling dictionary files into compact, production-ready binary stemmer tables.
|
||||
This is the preferred preparation workflow when stemming should run against an already compiled artifact rather than against raw dictionary input. The CLI reads the dictionary, derives patch commands, builds a mutable trie, applies the selected subtree reduction strategy, and writes the final compiled trie in the project binary format under GZip compression. The result is a deployment-ready `.radixor.gz` file that can be loaded directly by application code.
|
||||
|
||||
This is the recommended workflow for deployment environments, as it separates:
|
||||
## What the CLI does
|
||||
|
||||
- dictionary preparation (offline)
|
||||
- stemming execution (runtime)
|
||||
|
||||
|
||||
|
||||
## Overview
|
||||
|
||||
The `Compile` tool:
|
||||
|
||||
1. reads a line-oriented dictionary file
|
||||
2. converts word–stem pairs into patch commands
|
||||
3. builds a trie structure
|
||||
4. applies subtree reduction
|
||||
5. writes a compressed binary artifact
|
||||
|
||||
The output is a `.radixor.gz` file suitable for fast runtime loading.
|
||||
The `Compile` tool performs the following steps:
|
||||
|
||||
1. reads the input dictionary in the standard Radixor stemmer format, accepting either plain UTF-8 text or GZip-compressed UTF-8 text,
|
||||
2. parses each line into a canonical stem column and its known variant columns,
|
||||
3. converts variants into patch commands,
|
||||
4. builds a mutable trie of patch-command values,
|
||||
5. applies the configured reduction mode,
|
||||
6. writes the compiled trie as a GZip-compressed binary artifact.
|
||||
|
||||
This workflow is intentionally aligned with the same dictionary semantics used elsewhere in the library. Remarks introduced by `#` or `//` are supported through the shared dictionary parser.
|
||||
|
||||
## Basic usage
|
||||
|
||||
```bash
|
||||
java org.egothor.stemmer.Compile \
|
||||
--input ./data/stemmer.txt \
|
||||
--input ./data/stemmer.tsv \
|
||||
--output ./build/english.radixor.gz \
|
||||
--reduction-mode MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS \
|
||||
--case-processing-mode LOWERCASE_WITH_LOCALE_ROOT \
|
||||
--store-original \
|
||||
--overwrite
|
||||
```
|
||||
|
||||
## Supported arguments
|
||||
|
||||
The CLI supports the following arguments:
|
||||
|
||||
## Required arguments
|
||||
```text
|
||||
--input <file>
|
||||
--output <file>
|
||||
--reduction-mode <mode>
|
||||
[--store-original]
|
||||
[--right-to-left]
|
||||
[--case-processing-mode <mode>]
|
||||
[--dominant-winner-min-percent <1..100>]
|
||||
[--dominant-winner-over-second-ratio <1..n>]
|
||||
[--overwrite]
|
||||
[--help]
|
||||
```
|
||||
|
||||
### `--input`
|
||||
### `--input <file>`
|
||||
|
||||
Path to the source dictionary file.
|
||||
|
||||
* must be in the [dictionary format](dictionary-format.md)
|
||||
* must be readable
|
||||
* UTF-8 encoding is expected
|
||||
|
||||
```
|
||||
--input ./data/stemmer.txt
|
||||
```
|
||||
|
||||
### `--output`
|
||||
|
||||
Path to the output binary file.
|
||||
|
||||
* parent directories are created automatically
|
||||
* output is written as **GZip-compressed binary**
|
||||
|
||||
```
|
||||
--output ./build/english.radixor.gz
|
||||
```
|
||||
|
||||
|
||||
|
||||
## Optional arguments
|
||||
|
||||
### `--reduction-mode`
|
||||
|
||||
Controls how aggressively the trie is reduced during compilation.
|
||||
|
||||
Available values:
|
||||
|
||||
* `MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS`
|
||||
* `MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS`
|
||||
* `MERGE_SUBTREES_WITH_EQUIVALENT_DOMINANT_GET_RESULTS`
|
||||
The file must use the standard line-oriented, tab-separated dictionary format: columns are separated by the tab character, and each non-empty logical line starts with the canonical stem column, optionally followed by one or more variant columns. The input may be plain UTF-8 text or GZip-compressed UTF-8 text; compression is detected from the stream header rather than the file extension. The parser processes case according to `CaseProcessingMode` (default: `LOWERCASE_WITH_LOCALE_ROOT`) and ignores trailing remarks introduced by `#` or `//`. Dictionary items containing embedded whitespace are currently ignored and reported through warning-level log entries.
|
||||
|
||||
Example:
|
||||
|
||||
```text
|
||||
--input ./data/stemmer.tsv
|
||||
```
|
||||
|
||||
### `--output <file>`
|
||||
|
||||
Path to the output binary artifact.
|
||||
|
||||
The output file is written as a GZip-compressed binary trie. Parent directories are created automatically when needed.
|
||||
|
||||
Example:
|
||||
|
||||
```text
|
||||
--output ./build/english.radixor.gz
|
||||
```
|
||||
|
||||
### `--reduction-mode <mode>`
|
||||
|
||||
Selects the subtree reduction strategy used during compilation.
|
||||
|
||||
Supported values are:
|
||||
|
||||
- `MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS`
|
||||
- `MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS`
|
||||
- `MERGE_SUBTREES_WITH_EQUIVALENT_DOMINANT_GET_RESULTS`
|
||||
|
||||
Example:
|
||||
|
||||
```text
|
||||
--reduction-mode MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS
|
||||
```
|
||||
|
||||
#### Recommendation
|
||||
|
||||
Use:
|
||||
|
||||
```
|
||||
MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS
|
||||
```
|
||||
|
||||
This provides:
|
||||
|
||||
* safe behavior
|
||||
* deterministic ordering
|
||||
* good compression
|
||||
|
||||
|
||||
This argument is required.
|
||||
|
||||
### `--store-original`
|
||||
|
||||
Stores the stem itself as a no-op mapping.
|
||||
When this flag is present, the canonical stem itself is inserted using the no-op patch command.
|
||||
|
||||
```
|
||||
```text
|
||||
--store-original
|
||||
```
|
||||
|
||||
Effect:
|
||||
This is usually a sensible default for real dictionaries because it ensures that canonical forms are directly representable in the compiled trie rather than relying only on their variants.
|
||||
|
||||
* ensures that canonical forms are always resolvable
|
||||
* improves robustness in real-world inputs
|
||||
### `--right-to-left`
|
||||
|
||||
Recommended for most use cases.
|
||||
When present, compilation uses forward traversal (`WordTraversalDirection.FORWARD`) so stored forms are processed from their logical beginning.
|
||||
|
||||
```text
|
||||
--right-to-left
|
||||
```
|
||||
|
||||
This option is intended for right-to-left languages where affix behavior should operate on the written form without externally reversing words.
|
||||
|
||||
### `--case-processing-mode <mode>`
|
||||
|
||||
Controls dictionary key normalization during compilation and lookup. The setting is stored in persisted trie metadata and is therefore available to runtime lookup after binary loading.
|
||||
|
||||
Supported values are:
|
||||
|
||||
- `LOWERCASE_WITH_LOCALE_ROOT` (default)
|
||||
- `AS_IS`
|
||||
|
||||
Example:
|
||||
|
||||
```text
|
||||
--case-processing-mode AS_IS
|
||||
```
|
||||
|
||||
### `--dominant-winner-min-percent <1..100>`
|
||||
|
||||
Sets the minimum winner percentage used by dominant-result reduction settings.
|
||||
|
||||
Example:
|
||||
|
||||
```text
|
||||
--dominant-winner-min-percent 75
|
||||
```
|
||||
|
||||
This option matters primarily when `--reduction-mode` is `MERGE_SUBTREES_WITH_EQUIVALENT_DOMINANT_GET_RESULTS`. The default value is `75`.
|
||||
|
||||
### `--dominant-winner-over-second-ratio <1..n>`
|
||||
|
||||
Sets the minimum winner-over-second ratio used by dominant-result reduction settings.
|
||||
|
||||
Example:
|
||||
|
||||
```text
|
||||
--dominant-winner-over-second-ratio 3
|
||||
```
|
||||
|
||||
This option also matters primarily for `MERGE_SUBTREES_WITH_EQUIVALENT_DOMINANT_GET_RESULTS`. The default value is `3`.
|
||||
|
||||
### `--overwrite`
|
||||
|
||||
Allows overwriting an existing output file.
|
||||
Allows the CLI to replace an already existing output file.
|
||||
|
||||
```
|
||||
```text
|
||||
--overwrite
|
||||
```
|
||||
|
||||
Without this flag:
|
||||
Without this flag, compilation fails when the output path already exists.
|
||||
|
||||
* compilation fails if the output file already exists
|
||||
### `--help`
|
||||
|
||||
Prints usage help and exits successfully.
|
||||
|
||||
```text
|
||||
--help
|
||||
```
|
||||
|
||||
## Reduction strategy explained
|
||||
The short form `-h` is also supported.
|
||||
|
||||
Reduction merges semantically equivalent subtrees to reduce memory and file size.
|
||||
## Reduction modes in practice
|
||||
|
||||
Trade-offs:
|
||||
Reduction mode is not only a storage decision. It also influences what semantics are preserved when the mutable trie is compiled into its canonical read-only form.
|
||||
|
||||
| Mode | Compression | Behavioral fidelity |
|
||||
| --------- | ----------- | ------------------- |
|
||||
| Ranked | Medium | High |
|
||||
| Unordered | High | Medium |
|
||||
| Dominant | Highest | Lower (heuristic) |
|
||||
### Ranked `getAll()` equivalence
|
||||
|
||||
### Ranked (recommended)
|
||||
`MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS` merges subtrees whose `getAll()` results remain equivalent for every reachable key suffix and whose local result ordering is the same.
|
||||
|
||||
* preserves full `getAll()` ordering
|
||||
* safest and most predictable
|
||||
This is the best general-purpose choice when result ordering and ambiguity handling matter. It preserves ranked multi-result semantics while still achieving useful structural reduction.
|
||||
|
||||
### Unordered
|
||||
This is the recommended default for most users.
|
||||
|
||||
* ignores ordering differences
|
||||
* higher compression, but less precise semantics
|
||||
### Unordered `getAll()` equivalence
|
||||
|
||||
### Dominant
|
||||
`MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS` also uses `getAll()`-level equivalence, but it ignores local ordering differences in addition to absolute frequencies.
|
||||
|
||||
* focuses on the most frequent result
|
||||
* useful when only `get()` is relevant
|
||||
* may lose secondary candidates
|
||||
This can yield stronger reduction, but it also weakens the precision of ordered multi-result semantics.
|
||||
|
||||
Choose this mode only when the application does not depend on the ordering of alternative results.
|
||||
|
||||
### Dominant `get()` equivalence
|
||||
|
||||
## Output format
|
||||
`MERGE_SUBTREES_WITH_EQUIVALENT_DOMINANT_GET_RESULTS` focuses on preserving preferred-result semantics for `get()`, subject to dominance thresholds.
|
||||
|
||||
The compiled file:
|
||||
If a node does not satisfy the configured dominance constraints, compilation falls back to ranked `getAll()` semantics for that node to avoid unsafe over-reduction.
|
||||
|
||||
* is a binary representation of the trie
|
||||
* uses **GZip compression**
|
||||
* is optimized for:
|
||||
This mode is most suitable when the application primarily consumes the preferred result and does not rely on preserving richer ambiguity information.
|
||||
|
||||
* fast loading
|
||||
* minimal memory footprint
|
||||
## Recommended usage patterns
|
||||
|
||||
Typical properties:
|
||||
### Use offline preparation
|
||||
|
||||
* small file size
|
||||
* fast deserialization
|
||||
* no runtime preprocessing required
|
||||
The CLI is best used as a preparation step during packaging, deployment, or controlled artifact generation. This keeps compilation outside the runtime startup path and allows services to load only the finished binary trie.
|
||||
|
||||
### Treat compiled files as versioned assets
|
||||
|
||||
A `.radixor.gz` file should be handled as a versioned output artifact. It represents a specific dictionary state, a specific reduction mode, and, where relevant, specific dominant-result thresholds.
|
||||
|
||||
Compiled tries also persist a human-readable metadata block (`key=value` lines) that includes format version, traversal direction, RTL indicator, reduction mode, dominant thresholds, diacritic-processing mode, and case-processing mode. After decompression, you can inspect this block directly to identify what dictionary/trie configuration the artifact contains. The current CLI uses `DiacriticProcessingMode.AS_IS`; custom diacritic stripping is available through the programmatic builder and loader APIs rather than through a CLI flag.
|
||||
|
||||
### Choose reduction mode deliberately
|
||||
|
||||
The ranked `getAll()` mode is the safest default. The unordered and dominant modes should be chosen only when their trade-offs are acceptable for the consuming application.
|
||||
|
||||
### Expect memory pressure during preparation, not runtime
|
||||
|
||||
Compilation is usually a one-time step and is generally fast. The more important operational consideration is memory usage during preparation, because the dictionary-derived mutable structure exists before reduction compacts it into the final read-only trie. This is especially relevant for very large source dictionaries.
|
||||
|
||||
## Example workflow
|
||||
|
||||
### 1. Prepare dictionary
|
||||
### 1. Prepare a dictionary
|
||||
|
||||
```
|
||||
```text
|
||||
run running runs ran
|
||||
connect connected connecting
|
||||
```
|
||||
|
||||
### 2. Compile
|
||||
### 2. Compile it
|
||||
|
||||
```bash
|
||||
java org.egothor.stemmer.Compile \
|
||||
--input ./data/stemmer.txt \
|
||||
--input ./data/stemmer.tsv \
|
||||
--output ./build/english.radixor.gz \
|
||||
--reduction-mode MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS \
|
||||
--store-original
|
||||
```
|
||||
|
||||
### 3. Use in application
|
||||
### 3. Load it in an application
|
||||
|
||||
```java
|
||||
FrequencyTrie<String> trie =
|
||||
import org.egothor.stemmer.FrequencyTrie;
|
||||
import org.egothor.stemmer.StemmerPatchTrieLoader;
|
||||
|
||||
final FrequencyTrie<String> trie =
|
||||
StemmerPatchTrieLoader.loadBinary("english.radixor.gz");
|
||||
```
|
||||
|
||||
## Exit codes and error handling
|
||||
|
||||
The CLI uses three exit outcomes:
|
||||
|
||||
## Error handling
|
||||
- `0` for success,
|
||||
- `1` for processing failures such as I/O or compilation errors,
|
||||
- `2` for invalid command-line usage.
|
||||
|
||||
The CLI reports:
|
||||
When argument parsing fails, the CLI prints the error message, prints the usage summary, and exits with usage error status.
|
||||
|
||||
* missing input file
|
||||
* invalid arguments
|
||||
* I/O failures
|
||||
* parsing errors
|
||||
When compilation fails during processing, the CLI prints a `Compilation failed: ...` message to standard error and exits with processing error status.
|
||||
|
||||
Typical exit codes:
|
||||
Examples of failure conditions include:
|
||||
|
||||
* `0` – success
|
||||
* non-zero – failure
|
||||
|
||||
Error details are printed to standard error.
|
||||
|
||||
|
||||
|
||||
## Performance considerations
|
||||
|
||||
### Compilation
|
||||
|
||||
* typically CPU-bound
|
||||
* depends on dictionary size and reduction mode
|
||||
|
||||
### Output size
|
||||
|
||||
* depends on:
|
||||
|
||||
* dictionary completeness
|
||||
* reduction strategy
|
||||
* can vary significantly between modes
|
||||
|
||||
### Runtime impact
|
||||
|
||||
* compiled tries are optimized for:
|
||||
|
||||
* fast lookup
|
||||
* low allocation
|
||||
* predictable latency
|
||||
|
||||
|
||||
|
||||
## Best practices
|
||||
|
||||
### Use offline compilation
|
||||
|
||||
* compile dictionaries during build or deployment
|
||||
* do not compile on application startup
|
||||
|
||||
### Version your artifacts
|
||||
|
||||
* treat `.radixor.gz` files as versioned assets
|
||||
* store them alongside application releases
|
||||
|
||||
### Choose reduction mode deliberately
|
||||
|
||||
* use **ranked** for correctness
|
||||
* use **dominant** only if you fully understand the trade-offs
|
||||
|
||||
### Keep dictionaries clean
|
||||
|
||||
* better input → better compiled output
|
||||
* avoid noise and inconsistencies
|
||||
|
||||
|
||||
|
||||
## Integration tips
|
||||
|
||||
* store compiled files under `resources/` or a dedicated directory
|
||||
* load them once and reuse the trie instance
|
||||
* avoid repeated loading in frequently executed code paths (for example, per-request processing)
|
||||
- missing required arguments,
|
||||
- unknown arguments,
|
||||
- invalid integer values for dominant thresholds,
|
||||
- missing input files,
|
||||
- unreadable input,
|
||||
- existing output file without `--overwrite`,
|
||||
- general I/O failures during reading or writing.
|
||||
|
||||
## Relation to programmatic usage
|
||||
|
||||
The CLI and the programmatic API implement the same conceptual preparation step. The CLI is the operationally convenient choice when you want a ready-made binary artifact. The programmatic API is the better fit when compilation must be integrated directly into custom Java workflows.
|
||||
|
||||
## Next steps
|
||||
|
||||
* [Dictionary format](dictionary-format.md)
|
||||
* [Programmatic usage](programmatic-usage.md)
|
||||
* [Quick start](quick-start.md)
|
||||
|
||||
|
||||
|
||||
## Summary
|
||||
|
||||
The `Compile` CLI is the bridge between:
|
||||
|
||||
* human-readable dictionary data
|
||||
* optimized runtime stemmer tables
|
||||
|
||||
It enables a clean separation between:
|
||||
|
||||
* data preparation
|
||||
* runtime execution
|
||||
|
||||
and is the preferred way to prepare Radixor for production use.
|
||||
- [Dictionary format](dictionary-format.md)
|
||||
- [Quick start](quick-start.md)
|
||||
- [Programmatic usage](programmatic-usage.md)
|
||||
- [Architecture and reduction](architecture-and-reduction.md)
|
||||
|
||||
185
docs/compatibility-and-guarantees.md
Normal file
185
docs/compatibility-and-guarantees.md
Normal file
@@ -0,0 +1,185 @@
|
||||
# Compatibility and Guarantees
|
||||
|
||||
This document explains what Radixor treats as stable public behavior, what should be regarded as internal implementation detail, and how to think about compatibility across versions.
|
||||
|
||||
Its purpose is to make adoption safer. Users should be able to understand which parts of the project are intended as supported API, which parts may evolve more freely, and which kinds of change are expected to remain compatible in future releases.
|
||||
|
||||
## Compatibility philosophy
|
||||
|
||||
Radixor is designed to be used as a real library, not only as a code drop. That means compatibility matters.
|
||||
|
||||
At the same time, the project distinguishes clearly between:
|
||||
|
||||
- **public API and behavior** that users are expected to build against,
|
||||
- **internal implementation layers** that may change more freely when needed for correctness, performance, or maintainability.
|
||||
|
||||
The practical goal is straightforward:
|
||||
|
||||
- keep the main user-facing API in `org.egothor.stemmer` stable and supportable,
|
||||
- allow more freedom of evolution in internal trie-focused implementation layers,
|
||||
- extend the project conservatively without creating unnecessary behavioral ambiguity.
|
||||
|
||||
## Public API posture
|
||||
|
||||
As a general rule, the `org.egothor.stemmer` package should be treated as the primary supported API surface.
|
||||
|
||||
That includes the main user-facing types involved in:
|
||||
|
||||
- dictionary loading,
|
||||
- binary loading and persistence,
|
||||
- patch-command application,
|
||||
- compiled trie querying,
|
||||
- reconstruction workflows,
|
||||
- reduction configuration,
|
||||
- CLI use.
|
||||
|
||||
This API is expected to remain supportable across future versions. The preferred compatibility model is additive evolution: improving documentation, clarifying behavior, and adding capabilities without unnecessary disruption of existing usage patterns.
|
||||
|
||||
Examples of likely additive evolution include:
|
||||
|
||||
- additional bundled language resources,
|
||||
- fuller support for diacritics or native-script language resources,
|
||||
- expanded documentation and operational tooling,
|
||||
- new convenience methods that do not break existing code.
|
||||
|
||||
## Internal API posture
|
||||
|
||||
The `org.egothor.stemmer.trie` package should be treated as internal or at least significantly less stable implementation API.
|
||||
|
||||
It represents the structural machinery behind mutable nodes, reduced nodes, compiled nodes, reduction context, signatures, and related internal compilation details. These types may evolve more aggressively when needed to improve implementation quality, correctness, reduction behavior, internal representations, or performance characteristics.
|
||||
|
||||
Users should therefore avoid building long-term integrations against `org.egothor.stemmer.trie` unless they are intentionally accepting that tighter coupling.
|
||||
|
||||
In practical terms:
|
||||
|
||||
- `org.egothor.stemmer` is the supported integration layer,
|
||||
- `org.egothor.stemmer.trie` is the implementation layer.
|
||||
|
||||
## Behavioral guarantees
|
||||
|
||||
Several project properties are intended as core behavioral guarantees.
|
||||
|
||||
### Deterministic dictionary loading and compilation
|
||||
|
||||
Given the same textual dictionary input and the same reduction settings, Radixor is intended to produce the same compiled stemming semantics in a reproducible way.
|
||||
|
||||
This includes deterministic local result ordering and deterministic observable lookup behavior.
|
||||
|
||||
### Stable meaning of `get()` and `getAll()`
|
||||
|
||||
The distinction between preferred-result lookup and multi-result lookup is part of the supported behavior model.
|
||||
|
||||
- `get()` returns the locally preferred stored value,
|
||||
- `getAll()` returns all locally stored values in deterministic ranked order,
|
||||
- `getEntries()` returns aligned values with counts.
|
||||
|
||||
That model is part of how the public API should be understood.
|
||||
|
||||
### Stable reduction-mode intent
|
||||
|
||||
Each public `ReductionMode` constant carries a semantic contract that should remain meaningful across versions.
|
||||
|
||||
In other words, the implementation may evolve, but the intended meaning of modes such as ranked `getAll()` equivalence, unordered `getAll()` equivalence, and dominant `get()` equivalence should not drift casually.
|
||||
|
||||
### Stable binary artifact purpose
|
||||
|
||||
Compiled `.radixor.gz` artifacts are a first-class project output. Loading and persisting compiled stemmer artifacts is part of the intended usage model, not an incidental implementation side effect.
|
||||
|
||||
## What is allowed to evolve
|
||||
|
||||
Compatibility does not mean the project is frozen.
|
||||
|
||||
The following kinds of change are generally compatible with the project’s direction:
|
||||
|
||||
- improved internal data structures,
|
||||
- changes inside `org.egothor.stemmer.trie`,
|
||||
- expanded bundled dictionaries,
|
||||
- additional supported languages,
|
||||
- improved native-script handling,
|
||||
- better benchmarks, tests, and reports,
|
||||
- additive public API growth that does not invalidate existing usage.
|
||||
|
||||
The project should be able to improve substantially while keeping the main user-facing integration model intact.
|
||||
|
||||
## What may change more cautiously
|
||||
|
||||
Some areas should be treated as stable in intent but still approached carefully when changed.
|
||||
|
||||
### Bundled dictionary contents
|
||||
|
||||
Bundled resources are versioned project data, not immutable language standards. Their contents may improve over time.
|
||||
|
||||
That means stemming outcomes can legitimately change when bundled dictionaries are refined or expanded. Such changes are compatible with the project’s direction, but they should still be understood as behavior changes at the lexical-resource level.
|
||||
|
||||
### Binary format evolution
|
||||
|
||||
Compiled binary artifacts are an intended project output, but binary-format evolution may still be needed in future versions.
|
||||
|
||||
If the format changes, that should be handled deliberately and documented clearly. Users should not assume that every historical persisted artifact will remain readable forever without versioning considerations. What should remain stable is the project’s support for compiled artifact workflows, not necessarily perpetual cross-version binary interchange without explicit format evolution rules.
|
||||
|
||||
### Performance characteristics
|
||||
|
||||
Radixor places strong emphasis on performance, but no benchmark number should be treated as a formal compatibility guarantee.
|
||||
|
||||
What is more meaningful than any single raw number is the architectural performance posture: the library is intended to remain a compact compiled stemmer with very strong runtime throughput characteristics.
|
||||
|
||||
## What users should rely on
|
||||
|
||||
Long-term users should rely primarily on the following:
|
||||
|
||||
- the main integration path in `org.egothor.stemmer`,
|
||||
- the documented meaning of `get()`, `getAll()`, and reduction modes,
|
||||
- the offline-compilation plus runtime-loading workflow,
|
||||
- the availability of compiled artifact support,
|
||||
- the project’s preference for deterministic and auditable behavior.
|
||||
|
||||
These are the parts of the project that are intended to remain the most stable and supportable.
|
||||
|
||||
## What users should not rely on casually
|
||||
|
||||
Users should avoid depending on:
|
||||
|
||||
- internal trie package details,
|
||||
- undocumented internal classes or intermediate representations,
|
||||
- incidental internal ordering outside documented lookup semantics,
|
||||
- assumptions that bundled dictionary contents will never evolve,
|
||||
- assumptions that internal binary-format details are frozen forever.
|
||||
|
||||
If a behavior is important to your integration, it should ideally be documented at the public API or project-documentation level rather than inferred from internal implementation details.
|
||||
|
||||
## Source compatibility and behavioral compatibility
|
||||
|
||||
It is useful to distinguish two different notions of compatibility.
|
||||
|
||||
### Source compatibility
|
||||
|
||||
Whether existing Java code using the supported public API still compiles and integrates cleanly after an upgrade.
|
||||
|
||||
### Behavioral compatibility
|
||||
|
||||
Whether the upgraded system still behaves the same way for the same dictionary data, compiled artifacts, and runtime calls.
|
||||
|
||||
Radixor aims to preserve both where reasonably possible, but behavioral compatibility can still be influenced by intentional improvements such as dictionary refinement or bug fixes. For that reason, upgrades should be evaluated not only as code upgrades but also as stemming-behavior upgrades.
|
||||
|
||||
## Recommended upgrade discipline
|
||||
|
||||
When upgrading Radixor in a production environment, it is good practice to:
|
||||
|
||||
1. review release notes and documentation changes,
|
||||
2. rebuild compiled artifacts if the upgrade affects dictionary or artifact handling,
|
||||
3. rerun representative stemming validation tests,
|
||||
4. compare benchmark outputs where performance matters,
|
||||
5. inspect whether bundled-dictionary changes affect expected canonical results.
|
||||
|
||||
This is especially important for deployments that treat stemming behavior as part of search relevance or normalization policy.
|
||||
|
||||
## Summary
|
||||
|
||||
Radixor’s compatibility model is intentionally layered.
|
||||
|
||||
- `org.egothor.stemmer` should be treated as the supported public integration API,
|
||||
- `org.egothor.stemmer.trie` should be treated as an internal implementation layer,
|
||||
- deterministic public behavior and compiled-artifact workflows are core project commitments,
|
||||
- internal structure and lexical-resource quality can continue to evolve.
|
||||
|
||||
This model gives the project room to improve while still providing a reliable surface for long-term use.
|
||||
228
docs/contributing-dictionaries.md
Normal file
228
docs/contributing-dictionaries.md
Normal file
@@ -0,0 +1,228 @@
|
||||
# Contributing Dictionaries
|
||||
|
||||
High-quality dictionaries are one of the most valuable ways to improve **Radixor**.
|
||||
|
||||
The project already includes practical bundled dictionaries for common use, but the long-term quality and language reach of the stemmer depend heavily on the quality of its lexical resources. Contributions are therefore welcome not only in the form of code changes, but also in the form of well-prepared dictionary data for existing or additional languages.
|
||||
|
||||
This document explains what makes a dictionary contribution useful, how to structure it, and how to prepare it so that it integrates cleanly with the project.
|
||||
|
||||
## What a good dictionary contribution looks like
|
||||
|
||||
A good dictionary contribution is not defined only by the number of entries.
|
||||
|
||||
The most useful contributions are dictionaries that are:
|
||||
|
||||
- linguistically consistent,
|
||||
- operationally clean,
|
||||
- easy to review,
|
||||
- easy to reproduce,
|
||||
- appropriate for actual stemming use rather than raw lexical accumulation.
|
||||
|
||||
In practice, dictionary quality matters more than dictionary size. A smaller but coherent and carefully normalized dictionary is often more valuable than a larger resource that mixes conventions, contains noisy forms, or introduces accidental ambiguity.
|
||||
|
||||
## Preferred dictionary shape
|
||||
|
||||
Radixor uses a simple line-oriented tab-separated values format, meaning that columns are separated by the tab character:
|
||||
|
||||
```text
|
||||
<stem> <variant1> <variant2> <variant3> ...
|
||||
```
|
||||
|
||||
The first column on a line is the canonical stem. All following tab-separated columns on that line are known variants that should reduce to that stem.
|
||||
|
||||
Example:
|
||||
|
||||
```text
|
||||
run running runs ran
|
||||
connect connected connecting connection
|
||||
```
|
||||
|
||||
The parser:
|
||||
|
||||
- reads UTF-8 text,
|
||||
- interprets each line as tab-separated values,
|
||||
- applies configurable case processing through `CaseProcessingMode` (default: `LOWERCASE_WITH_LOCALE_ROOT`),
|
||||
- ignores empty lines,
|
||||
- supports remarks introduced by `#` or `//`,
|
||||
- currently ignores dictionary items containing embedded whitespace and reports them through warning-level log entries.
|
||||
|
||||
For full format details, see [Dictionary format](dictionary-format.md).
|
||||
|
||||
## Contribution priorities
|
||||
|
||||
The most useful dictionary contributions generally fall into one of four categories.
|
||||
|
||||
### 1. Stronger dictionaries for already bundled languages
|
||||
|
||||
Improving lexical quality for already supported languages is often more valuable than merely expanding the language list. Better coverage, cleaner canonicalization, and improved consistency directly improve practical stemming outcomes.
|
||||
|
||||
### 2. Additional languages
|
||||
|
||||
New language support is welcome when the submitted resource is strong enough to be useful as a maintainable bundled baseline rather than as an incomplete demonstration artifact.
|
||||
|
||||
### 3. Native-script language resources
|
||||
|
||||
The current bundled resources follow a pragmatic normalization convention and may use transliterated or otherwise normalized forms. This is especially visible for languages such as Russian.
|
||||
|
||||
That convention belongs to the supplied dictionaries, not to the underlying algorithm. The parser, trie, and patch-command model are not fundamentally restricted to plain ASCII. Contributions of high-quality native-script dictionaries in full UTF-8 text are therefore particularly valuable, because they would enable more direct language support without transliteration-based workflows.
|
||||
|
||||
### 4. Domain-quality refinements
|
||||
|
||||
Some contributions may be more appropriate as curated domain extensions than as replacements for a general-purpose bundled dictionary. These are still useful when they are clearly scoped and operationally coherent.
|
||||
|
||||
## Normalization guidance
|
||||
|
||||
A dictionary should follow one normalization convention consistently.
|
||||
|
||||
For current general-purpose bundled resources, the safest convention remains normalized plain-ASCII lexical input where that is already the established project style. For languages where a stronger native-script resource exists, a coherent UTF-8 dictionary may be preferable, provided that the contribution is deliberate, well-structured, and consistently normalized.
|
||||
|
||||
The important point is not to mix incompatible conventions casually.
|
||||
|
||||
Avoid contributions that combine, without clear design intent:
|
||||
|
||||
- native-script and transliterated forms,
|
||||
- multiple incompatible stem conventions,
|
||||
- inconsistent use of diacritics,
|
||||
- ad hoc spelling normalization,
|
||||
- noisy typo-like forms presented as ordinary lexical variants.
|
||||
|
||||
## Choosing canonical stems
|
||||
|
||||
A dictionary line should reflect a stable canonical target form.
|
||||
|
||||
That means:
|
||||
|
||||
- choose one canonical representation and use it consistently,
|
||||
- avoid mixing alternative stem conventions without a clear lexical reason,
|
||||
- keep variants grouped under the form that the project should actually return as the canonical result.
|
||||
|
||||
For example, the following is coherent:
|
||||
|
||||
```text
|
||||
analyze analyzing analyzed analyzes
|
||||
```
|
||||
|
||||
The following is less useful if the project has not intentionally chosen mixed conventions:
|
||||
|
||||
```text
|
||||
analyse analyzing analyzed analyzes
|
||||
```
|
||||
|
||||
The contribution should make the intended canonical policy easy to understand.
|
||||
|
||||
## Ambiguity handling
|
||||
|
||||
Ambiguity is allowed, but it should be intentional.
|
||||
|
||||
If the same surface form appears under multiple stems, the compiled trie may later expose multiple candidate patch commands. This can be correct and desirable when the lexical reality genuinely requires it. However, accidental ambiguity caused by inconsistent source preparation makes the resource harder to trust and harder to review.
|
||||
|
||||
Before contributing a dictionary, check whether repeated surface forms across lines are:
|
||||
|
||||
- linguistically intentional,
|
||||
- consistent with the chosen canonical policy,
|
||||
- useful for runtime stemming behavior.
|
||||
|
||||
## What to avoid
|
||||
|
||||
Dictionary contributions are much easier to review and accept when they avoid common quality problems.
|
||||
|
||||
Avoid:
|
||||
|
||||
- mechanically aggregated word lists without review,
|
||||
- inconsistent canonical forms,
|
||||
- mixed orthographic conventions without explanation,
|
||||
- accidental duplicates caused by source merging,
|
||||
- noisy or non-lexical tokens,
|
||||
- comments or formatting that make the source hard to audit.
|
||||
|
||||
A dictionary should read like a curated lexical resource, not like an unfiltered export.
|
||||
|
||||
## Practical preparation workflow
|
||||
|
||||
A disciplined dictionary contribution should typically follow this path:
|
||||
|
||||
1. prepare or normalize the lexical source,
|
||||
2. convert it into Radixor dictionary format,
|
||||
3. review canonical stem choices,
|
||||
4. check for accidental duplicates and unintended ambiguity,
|
||||
5. compile the dictionary,
|
||||
6. test representative lookups,
|
||||
7. inspect `get()` and `getAll()` behavior for important edge cases,
|
||||
8. include a concise explanation of source provenance and normalization choices.
|
||||
|
||||
## What to test before submitting
|
||||
|
||||
At minimum, a proposed dictionary should be checked for:
|
||||
|
||||
- successful parsing,
|
||||
- successful compilation,
|
||||
- expected stemming behavior on representative examples,
|
||||
- acceptable ambiguity behavior,
|
||||
- stable canonical policy,
|
||||
- absence of obvious malformed lines or accidental source contamination.
|
||||
|
||||
For important resources, it is also useful to test:
|
||||
|
||||
- whether representative forms survive reduction as expected,
|
||||
- whether dominant-result behavior remains sensible if alternate reduction modes are used,
|
||||
- whether the resulting artifact has a practical size for the intended use case.
|
||||
|
||||
## Contribution notes that help maintainers
|
||||
|
||||
A dictionary contribution becomes much easier to review when it includes a short maintainer-facing note describing:
|
||||
|
||||
- the language or domain covered,
|
||||
- the provenance of the lexical data,
|
||||
- the normalization convention used,
|
||||
- whether the dictionary is ASCII-normalized or native-script UTF-8,
|
||||
- the intended canonical stem policy,
|
||||
- any known limitations,
|
||||
- why the contribution improves the project in practical terms.
|
||||
|
||||
This note does not need to be long. It simply needs to make the resource intelligible.
|
||||
|
||||
## Bundled-resource expectations
|
||||
|
||||
Not every useful dictionary must automatically become a bundled language resource.
|
||||
|
||||
To be suitable for bundling, a dictionary should generally be:
|
||||
|
||||
- broadly useful,
|
||||
- maintainable,
|
||||
- legally safe to include,
|
||||
- coherent enough to serve as a project baseline,
|
||||
- strong enough that users can rely on it as more than a demonstration resource.
|
||||
|
||||
Some dictionaries are better treated as examples, experiments, or domain-specific artifacts rather than as general built-in resources.
|
||||
|
||||
## Native scripts and future language support
|
||||
|
||||
One of the most meaningful future directions for the project is stronger support for languages in their native writing systems.
|
||||
|
||||
The architecture does not need to change fundamentally for that to happen. What matters is the availability of strong lexical resources and the willingness to define clear conventions for how those resources should be bundled and maintained.
|
||||
|
||||
Contributions in this area are therefore especially valuable when they are:
|
||||
|
||||
- internally consistent,
|
||||
- encoded as proper UTF-8 text,
|
||||
- accompanied by a clear explanation of normalization assumptions,
|
||||
- strong enough to support practical use rather than only demonstration.
|
||||
|
||||
## Related documentation
|
||||
|
||||
- [Built-in languages](built-in-languages.md)
|
||||
- [Dictionary format](dictionary-format.md)
|
||||
- [CLI compilation](cli-compilation.md)
|
||||
- [Programmatic usage](programmatic-usage.md)
|
||||
|
||||
## Summary
|
||||
|
||||
The best dictionary contributions improve Radixor not merely by adding more entries, but by improving the linguistic quality, consistency, and practical usefulness of the lexical resources the project can compile and ship.
|
||||
|
||||
A strong contribution is therefore one that is:
|
||||
|
||||
- coherent,
|
||||
- reviewable,
|
||||
- operationally clean,
|
||||
- well explained,
|
||||
- and valuable for real stemming workloads.
|
||||
@@ -1,255 +1,237 @@
|
||||
# Dictionary Format
|
||||
|
||||
> ← Back to [README.md](../README.md)
|
||||
Radixor uses a simple line-oriented dictionary format designed for practical stemming workflows. The textual source format is tab-separated values, meaning that columns are separated by the tab character.
|
||||
|
||||
Radixor uses a simple, line-oriented dictionary format to define mappings between **word forms** and their **canonical stems**.
|
||||
Each logical line describes one canonical stem and zero or more known word variants that should reduce to that stem. The format is intentionally lightweight, easy to maintain in source control, and directly consumable both by the programmatic loader and by the CLI compiler.
|
||||
|
||||
This format is intentionally minimal, language-agnostic, and easy to generate from existing linguistic resources or corpora.
|
||||
## Core structure
|
||||
|
||||
## Overview
|
||||
Each non-empty logical line has the following shape:
|
||||
|
||||
Each logical line defines:
|
||||
|
||||
- one **canonical stem**
|
||||
- zero or more **word variants** belonging to that stem
|
||||
|
||||
```
|
||||
stem variant1 variant2 variant3 ...
|
||||
```text
|
||||
<stem> <variant1> <variant2> <variant3> ...
|
||||
```
|
||||
|
||||
At compile time:
|
||||
The first column is interpreted as the **canonical stem**. Every following token on the same line is interpreted as a **known variant** belonging to that stem.
|
||||
|
||||
- each variant is converted into a **patch command** transforming the variant into the stem
|
||||
- the stem itself may optionally be stored as a **no-op mapping**
|
||||
Example:
|
||||
|
||||
## Basic example
|
||||
|
||||
```
|
||||
```text
|
||||
run running runs ran
|
||||
connect connected connecting connection
|
||||
analyze analyzing analysed analyses
|
||||
```
|
||||
|
||||
This defines:
|
||||
In this example:
|
||||
|
||||
| Stem | Variants |
|
||||
|----------|----------------------------------------|
|
||||
| run | running, runs, ran |
|
||||
| connect | connected, connecting, connection |
|
||||
| analyze | analyzing, analysed, analyses |
|
||||
- `run` is the canonical stem for `running`, `runs`, and `ran`,
|
||||
- `connect` is the canonical stem for `connected`, `connecting`, and `connection`.
|
||||
|
||||
## Syntax rules
|
||||
## How the loader interprets a line
|
||||
|
||||
### 1. Tokenization
|
||||
When a dictionary is loaded through `StemmerPatchTrieLoader`, the loader processes each parsed line as follows:
|
||||
|
||||
- Tokens are separated by **whitespace**
|
||||
- Multiple spaces and tabs are treated as a single separator
|
||||
- Leading and trailing whitespace is ignored
|
||||
1. the first column becomes the canonical stem,
|
||||
2. every following token is treated as a variant,
|
||||
3. each variant is converted into a patch command that transforms the variant into the stem,
|
||||
4. if `storeOriginal` is enabled, the stem itself is also inserted using the canonical no-op patch command.
|
||||
|
||||
### 2. First token is the stem
|
||||
This means the textual dictionary is not used directly at runtime. Instead, it is transformed into patch-command data and compiled into a reduced read-only trie.
|
||||
|
||||
- The **first token** on each line is always the canonical stem
|
||||
- All following tokens are treated as variants of that stem
|
||||
## Minimal valid lines
|
||||
|
||||
### 3. Case normalization
|
||||
A line may consist of the stem only:
|
||||
|
||||
- All input is normalized to **lowercase using `Locale.ROOT`**
|
||||
- Dictionaries should ideally already be lowercase to avoid ambiguity
|
||||
```text
|
||||
run
|
||||
```
|
||||
|
||||
### 4. Empty lines
|
||||
This is syntactically valid. It defines a stem entry with no explicit variants on that line.
|
||||
|
||||
- Empty lines are ignored
|
||||
Whether such a line is operationally useful depends on how the dictionary is loaded:
|
||||
|
||||
### 5. Duplicate variants
|
||||
- if `storeOriginal` is enabled, the stem itself is inserted as a no-op mapping,
|
||||
- if `storeOriginal` is disabled, the line contributes no explicit variant mappings.
|
||||
|
||||
- Duplicate variants are allowed but have no additional effect
|
||||
- Frequency is determined by occurrence across the entire dataset
|
||||
## Column and whitespace rules
|
||||
|
||||
## Remarks (comments)
|
||||
Columns are separated by the tab character. Leading and trailing whitespace around each column is ignored.
|
||||
|
||||
This is the canonical form:
|
||||
|
||||
```text
|
||||
run running runs ran
|
||||
```
|
||||
|
||||
This is also accepted because the surrounding padding is removed before the item is processed:
|
||||
|
||||
```text
|
||||
run running runs ran
|
||||
```
|
||||
|
||||
Embedded whitespace inside one dictionary item is currently not supported. A stem or variant such as `new york` therefore cannot yet be represented as one usable dictionary item in the textual source format. Such items are ignored during parsing and reported through a warning-level log entry together with the physical line number, the stem, and the ignored items from that line.
|
||||
|
||||
## Empty lines
|
||||
|
||||
Empty lines are ignored.
|
||||
|
||||
Example:
|
||||
|
||||
```text
|
||||
run running runs ran
|
||||
|
||||
connect connected connecting
|
||||
```
|
||||
|
||||
The blank line between entries has no effect.
|
||||
|
||||
## Remarks and comments
|
||||
|
||||
The parser supports both full-line and trailing remarks.
|
||||
|
||||
### Supported remark markers
|
||||
Two remark markers are recognized:
|
||||
|
||||
- `#`
|
||||
- `//`
|
||||
|
||||
### Examples
|
||||
The earliest occurrence of either marker terminates the logical content of the line, and the remainder of that line is ignored.
|
||||
|
||||
```
|
||||
Examples:
|
||||
|
||||
```text
|
||||
run running runs ran # English verb forms
|
||||
connect connected connecting // basic forms
|
||||
connect connected connecting // Common derived forms
|
||||
```
|
||||
|
||||
Everything after the first occurrence of a remark marker is ignored.
|
||||
This is also valid:
|
||||
|
||||
### Important note
|
||||
|
||||
Remark markers are not escaped. If `#` or `//` appear in a token, they will terminate the line.
|
||||
|
||||
## Storing the original form
|
||||
|
||||
When compiling, you may enable:
|
||||
|
||||
```
|
||||
--store-original
|
||||
```text
|
||||
# This line is ignored completely
|
||||
// This line is also ignored completely
|
||||
```
|
||||
|
||||
This causes the stem itself to be stored using a **no-op patch command**.
|
||||
## Case normalization
|
||||
|
||||
Input-line case normalization is controlled by `CaseProcessingMode`; by default the parser uses `LOWERCASE_WITH_LOCALE_ROOT` before tab-separated columns are processed into dictionary entries.
|
||||
|
||||
That means dictionary authors should treat the format as **case-insensitive at load time**. If a file contains uppercase or mixed-case tokens, they will be normalized during parsing.
|
||||
|
||||
Example:
|
||||
|
||||
```
|
||||
run running runs
|
||||
```text
|
||||
Run Running Runs Ran
|
||||
```
|
||||
|
||||
With `--store-original`, this implicitly includes:
|
||||
is processed the same way as:
|
||||
|
||||
```
|
||||
run -> run
|
||||
```text
|
||||
run running runs ran
|
||||
```
|
||||
|
||||
This is useful when:
|
||||
## Character set, compression, and normalization
|
||||
|
||||
- the input may already be normalized
|
||||
- you want stable identity mappings
|
||||
- you want to avoid missing entries for canonical forms
|
||||
Dictionary files are read as UTF-8 text. Files loaded through `StemmerPatchTrieLoader.load(Path, ...)` may be either plain UTF-8 text or GZip-compressed UTF-8 text; the loader detects GZip input from the stream header instead of relying on the file extension. Bundled dictionaries are stored as GZip resources and are decoded as UTF-8 after decompression.
|
||||
|
||||
## Frequency and ordering
|
||||
The parser and trie are not restricted to ASCII. Dictionary items are ordinary Java `String` values, and trie traversal works over Java `char` sequences. This supports Latin-script data with diacritics, Cyrillic data, Hebrew, Persian, Yiddish, and other scripts represented in UTF-8, subject to the normal Java `String` model and the project’s traversal configuration.
|
||||
|
||||
Radixor tracks **local frequencies** of values.
|
||||
Case normalization is controlled by `CaseProcessingMode`. The default `LOWERCASE_WITH_LOCALE_ROOT` mode lowercases the line before columns are split into dictionary items. `AS_IS` preserves the original casing.
|
||||
|
||||
Frequency is determined by:
|
||||
Diacritic normalization is controlled at trie-build and lookup time by `DiacriticProcessingMode`:
|
||||
|
||||
- how many times a mapping appears during construction
|
||||
- merging behavior during reduction
|
||||
- `AS_IS` preserves dictionary and lookup keys exactly after case handling,
|
||||
- `REMOVE` strips supported diacritics and common Latin ligatures on both insertion and lookup paths,
|
||||
- `AS_IS_AND_STRIPPED_FALLBACK` is declared in the public model but is not yet implemented; selecting it raises `UnsupportedOperationException`.
|
||||
|
||||
When multiple stems exist for a word:
|
||||
For reliable production behavior, choose one normalization policy deliberately and apply it consistently. Normalized ASCII dictionaries remain a practical convention for some legacy stemming data, but they are not a format requirement.
|
||||
|
||||
- results are ordered by **descending frequency**
|
||||
- ties are resolved deterministically:
|
||||
1. shorter textual representation wins
|
||||
2. lexicographically smaller value wins
|
||||
3. earlier insertion order wins
|
||||
## Distinct stem and variant semantics
|
||||
|
||||
This guarantees **stable and reproducible results**.
|
||||
The format expresses a one-line grouping of forms under a canonical stem. It does not encode linguistic metadata, part-of-speech information, weights, or explicit ambiguity markers.
|
||||
|
||||
## Ambiguity and multiple stems
|
||||
For example:
|
||||
|
||||
A word may legitimately map to more than one stem:
|
||||
|
||||
```
|
||||
axes ax axe
|
||||
```text
|
||||
axis axes
|
||||
axe axes
|
||||
```
|
||||
|
||||
This allows Radixor to represent ambiguity explicitly.
|
||||
These are simply two independent lines. If both contribute mappings for the same surface form, the compiled trie may later expose one or more candidate patch commands depending on the accumulated local counts and the selected reduction mode.
|
||||
|
||||
At runtime:
|
||||
In other words, the dictionary format itself is deliberately simple. Richer behavior such as preferred-result ranking or multiple candidate results emerges during trie construction and reduction rather than through extra syntax in the dictionary file.
|
||||
|
||||
- `get(word)` returns the **preferred result**
|
||||
- `getAll(word)` returns **all candidates**
|
||||
## Duplicate forms and repeated entries
|
||||
|
||||
## Design guidelines
|
||||
The format does not reserve any special syntax for duplicates. If the same mapping is inserted multiple times through repeated dictionary content, the builder accumulates local counts for the stored value at the addressed key.
|
||||
|
||||
### Keep stems consistent
|
||||
This matters because compiled tries preserve local value frequencies and use them to determine preferred ordering for `get(...)`, `getAll(...)`, and `getEntries(...)`.
|
||||
|
||||
Use a single canonical form:
|
||||
As a result, repeating the same mapping is not just redundant text. It can influence the ranking behavior of the compiled trie.
|
||||
|
||||
- `run` instead of mixing `run` / `running`
|
||||
- `analyze` vs `analyse` — pick one convention
|
||||
## Practical examples
|
||||
|
||||
### Avoid noise
|
||||
### Simple English example
|
||||
|
||||
Do not include:
|
||||
|
||||
- typos
|
||||
- extremely rare forms (unless required)
|
||||
- inconsistent normalization
|
||||
|
||||
### Prefer completeness over clever rules
|
||||
|
||||
Radixor is data-driven:
|
||||
|
||||
- more complete dictionaries → better results
|
||||
- no hidden rule system compensates for missing entries
|
||||
|
||||
### Handle domain-specific vocabulary
|
||||
|
||||
You can extend dictionaries with:
|
||||
|
||||
- product names
|
||||
- technical terms
|
||||
- organization-specific terminology
|
||||
|
||||
## Example: minimal dictionary
|
||||
|
||||
```
|
||||
go goes going went
|
||||
be is are was were being
|
||||
have has having had
|
||||
```text
|
||||
run running runs ran
|
||||
connect connected connecting connection
|
||||
build building builds built
|
||||
```
|
||||
|
||||
## Example: domain-specific extension
|
||||
### Dictionary with remarks
|
||||
|
||||
```
|
||||
microservice microservices
|
||||
container containers containerized
|
||||
kubernetes kubernetes
|
||||
```text
|
||||
run running runs ran # canonical verb family
|
||||
connect connected connecting // derived forms
|
||||
build building builds built
|
||||
```
|
||||
|
||||
## Common pitfalls
|
||||
### Stem-only entries
|
||||
|
||||
### Mixing cases
|
||||
|
||||
```
|
||||
Run running Runs ❌
|
||||
```text
|
||||
run
|
||||
connect connected connecting
|
||||
build
|
||||
```
|
||||
|
||||
→ normalized to lowercase, but inconsistent input is error-prone
|
||||
### Mixed case input
|
||||
|
||||
### Multiple stems on one line
|
||||
|
||||
```
|
||||
run running connect ❌
|
||||
```text
|
||||
Run Running Runs Ran
|
||||
CONNECT Connected Connecting
|
||||
```
|
||||
|
||||
→ `connect` becomes a variant of `run`, which is incorrect
|
||||
This is accepted. Under the default `LOWERCASE_WITH_LOCALE_ROOT` mode it is normalized to lowercase during parsing; under `AS_IS` it is preserved.
|
||||
|
||||
### Hidden comments
|
||||
## Format limitations
|
||||
|
||||
```
|
||||
run running //comment runs ❌
|
||||
```
|
||||
The current dictionary format intentionally stays minimal:
|
||||
|
||||
→ everything after `//` is ignored
|
||||
- no quoted tokens,
|
||||
- no escaping rules,
|
||||
- no multi-word entries,
|
||||
- no inline weighting syntax,
|
||||
- no explicit ambiguity syntax,
|
||||
- no sectioning or nested structure.
|
||||
|
||||
## When to use this format
|
||||
Each dictionary item is simply one tab-separated word form after remark stripping and the configured case and diacritic normalization.
|
||||
|
||||
This format is suitable for:
|
||||
## Authoring guidance
|
||||
|
||||
- curated linguistic datasets
|
||||
- exported morphological dictionaries
|
||||
- domain-specific vocabularies
|
||||
- generated `(word, stem)` pairs from corpora
|
||||
For reliable results, keep dictionaries:
|
||||
|
||||
## Next steps
|
||||
- consistent in normalization,
|
||||
- free of accidental duplicates unless repeated weighting is intentional,
|
||||
- focused on meaningful stem-to-variant groupings,
|
||||
- encoded in UTF-8,
|
||||
- easy to audit in plain text form.
|
||||
|
||||
For most deployments, it is sensible to choose either preserved UTF-8 forms or a normalized ASCII/diacritic-stripped convention and keep that choice consistent across dictionary authoring, compilation, and runtime lookup.
|
||||
|
||||
## Relationship to other documentation
|
||||
|
||||
This page describes only the textual source format.
|
||||
|
||||
To understand how those dictionary lines are transformed into compiled runtime artifacts, continue with:
|
||||
|
||||
- [CLI compilation](cli-compilation.md)
|
||||
- [Programmatic usage](programmatic-usage.md)
|
||||
- [Quick start](quick-start.md)
|
||||
|
||||
## Summary
|
||||
|
||||
Radixor dictionaries are intentionally simple:
|
||||
|
||||
- one line per stem
|
||||
- whitespace-separated tokens
|
||||
- optional remarks
|
||||
- no embedded rules
|
||||
|
||||
This simplicity enables:
|
||||
|
||||
- easy generation
|
||||
- fast parsing
|
||||
- deterministic behavior
|
||||
- efficient compilation into compact patch-command tries
|
||||
- [Architecture and reduction](architecture-and-reduction.md)
|
||||
|
||||
37
docs/index.md
Normal file
37
docs/index.md
Normal file
@@ -0,0 +1,37 @@
|
||||
<h1 class="visually-hidden">Home</h1>
|
||||
<p align="center">
|
||||
<img src="assets/images/banner.jpg" alt="Radixor banner" style="width: 100%; max-width: 1100px;">
|
||||
</p>
|
||||
|
||||
**Radixor** is a high-performance, multi-language stemmer for Java, built for production-grade search and text-processing systems.
|
||||
|
||||
It modernizes the proven Egothor patch-command trie approach and extends it for deployment realities that classic stemming pipelines do not handle well.
|
||||
|
||||
Traditional Egothor-style stemming workflows usually treat a compiled dictionary as a fixed artifact. Once built, its lexical knowledge is effectively closed unless the original source dictionary is recompiled. Radixor removes that constraint. An already compiled stemming structure can be extended with additional words and transformations, which makes it possible to evolve an existing dictionary for domain-specific, customer-specific, or deployment-specific vocabulary without rebuilding the entire lexical base from scratch.
|
||||
|
||||
Radixor also improves how ambiguous reductions can be handled at runtime. Instead of always forcing a single result, it can return multiple plausible stems when the input token cannot be reduced unambiguously. This allows downstream systems to preserve linguistic ambiguity where that is operationally useful, whether for retrieval quality, ranking strategies, diagnostics, or domain-specific normalization policies.
|
||||
|
||||
The project also has a clear research lineage. The historical idea behind this stemming family is described in Leo Galambos's paper *Lemmatizer for Document Information Retrieval Systems in JAVA* (SOFSEM 2001), which presents a semi-automatic stemming technique designed for Java-based information retrieval systems. In Radixor documentation, this reference serves as historical and algorithmic background rather than as technical documentation of the current implementation.
|
||||
|
||||
> Unlike traditional Egothor-based deployments, Radixor can extend an already compiled stemmer dictionary and can return multiple stems when a word is not reducible to a single unambiguous form.
|
||||
|
||||
Radixor delivers:
|
||||
|
||||
- **Fast runtime stemming** with compact lookup structures
|
||||
- **Multi-language adaptability** through dictionary-driven compilation
|
||||
- **Extension of compiled stemmer structures** without full recompilation from source dictionaries
|
||||
- **Incremental vocabulary growth** for deployment-specific lexical refinement
|
||||
- **Support for multiple stemming results** when reduction is ambiguous
|
||||
- **Deterministic behavior** suitable for reproducible processing pipelines
|
||||
- **Flexible integration paths**, including CLI-based and programmatic workflows
|
||||
- **Operational transparency** through continuously published quality and benchmark reports
|
||||
|
||||
Radixor is intended for teams that require consistent stemming quality at scale, while retaining the ability to evolve lexical resources after compilation and to handle ambiguous reductions with greater precision than traditional single-stem pipelines allow.
|
||||
|
||||
## Start here
|
||||
|
||||
- Read [Quick Start](quick-start.md) for immediate implementation guidance.
|
||||
- Use [Programmatic Usage](programmatic-usage.md) for application integration patterns.
|
||||
- Review [Benchmarking](benchmarking.md) for reproducible performance methodology.
|
||||
- Open [CI Reports](reports.md) to inspect published build artifacts and quality metrics.
|
||||
- See the historical paper: [*Lemmatizer for Document Information Retrieval Systems in JAVA*](https://www.researchgate.net/publication/221512865_Lemmatizer_for_Document_Information_Retrieval_Systems_in_JAVA).
|
||||
106
docs/programmatic-extending-and-persistence.md
Normal file
106
docs/programmatic-extending-and-persistence.md
Normal file
@@ -0,0 +1,106 @@
|
||||
# Extending and Persisting Compiled Tries
|
||||
|
||||
This document explains how compiled Radixor tries can be reopened, extended, rebuilt, and stored for deployment.
|
||||
|
||||
## Reopen and extend a compiled trie
|
||||
|
||||
`FrequencyTrieBuilders.copyOf(...)` reconstructs a mutable builder from a compiled trie. The reconstructed builder preserves the key-local value counts of the compiled trie as currently stored, making it suitable for subsequent modification and recompilation. Reconstruction is performed from the compiled state, not from the original unreduced insertion history.
|
||||
|
||||
```java
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
|
||||
import org.egothor.stemmer.FrequencyTrie;
|
||||
import org.egothor.stemmer.FrequencyTrieBuilders;
|
||||
import org.egothor.stemmer.PatchCommandEncoder;
|
||||
import org.egothor.stemmer.ReductionMode;
|
||||
import org.egothor.stemmer.ReductionSettings;
|
||||
import org.egothor.stemmer.StemmerPatchTrieBinaryIO;
|
||||
|
||||
public final class ExtendCompiledStemmerExample {
|
||||
|
||||
private ExtendCompiledStemmerExample() {
|
||||
throw new AssertionError("No instances.");
|
||||
}
|
||||
|
||||
public static void main(final String[] arguments) throws IOException {
|
||||
final FrequencyTrie<String> compiledTrie = StemmerPatchTrieBinaryIO.read(
|
||||
Path.of("stemmers", "english.radixor.gz"));
|
||||
|
||||
final ReductionSettings settings = ReductionSettings.withDefaults(
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
|
||||
|
||||
final FrequencyTrie.Builder<String> builder = FrequencyTrieBuilders.copyOf(
|
||||
compiledTrie,
|
||||
String[]::new,
|
||||
settings);
|
||||
|
||||
builder.put("microservices", "Na");
|
||||
|
||||
final FrequencyTrie<String> updatedTrie = builder.build();
|
||||
|
||||
StemmerPatchTrieBinaryIO.write(
|
||||
updatedTrie,
|
||||
Path.of("stemmers", "english-custom.radixor.gz"));
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
This enables a layered workflow:
|
||||
|
||||
1. start from a bundled or already compiled stemmer,
|
||||
2. reconstruct a builder,
|
||||
3. add custom lexical data,
|
||||
4. compile and persist a new binary artifact.
|
||||
|
||||
## Persist and deploy compiled tries
|
||||
|
||||
`StemmerPatchTrieBinaryIO` reads and writes patch-command tries as GZip-compressed binary files. `StemmerPatchTrieLoader` exposes convenience methods around the same persistence functionality.
|
||||
|
||||
```java
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
|
||||
import org.egothor.stemmer.StemmerPatchTrieBinaryIO;
|
||||
|
||||
StemmerPatchTrieBinaryIO.write(trie, Path.of("stemmers", "english.radixor.gz"));
|
||||
```
|
||||
|
||||
In deployment terms, the cleanest model is usually:
|
||||
|
||||
- compile once,
|
||||
- persist the binary artifact,
|
||||
- load the artifact directly in runtime services.
|
||||
|
||||
## Binary-first operational model
|
||||
|
||||
For larger dictionaries or controlled deployment environments, a binary-first workflow is usually the most robust choice:
|
||||
|
||||
- prepare the compiled trie offline,
|
||||
- keep the preparation step outside the runtime startup path,
|
||||
- version and distribute the binary artifact,
|
||||
- load the finished trie directly in production.
|
||||
|
||||
This model works especially well when domain-specific extensions are added in layers and then recompiled into a new read-only artifact.
|
||||
|
||||
## Continue with
|
||||
|
||||
- [Loading and Building Stemmers](programmatic-loading-and-building.md)
|
||||
- [Querying and Ambiguity Handling](programmatic-querying-and-ambiguity.md)
|
||||
|
||||
|
||||
## Inspecting persisted metadata
|
||||
|
||||
After loading a compiled artifact, applications can inspect the persisted build descriptor directly:
|
||||
|
||||
```java
|
||||
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.loadBinary("build/stemmers/cs_cz.dat.gz");
|
||||
final TrieMetadata metadata = trie.metadata();
|
||||
|
||||
System.out.println(metadata.formatVersion());
|
||||
System.out.println(metadata.traversalDirection());
|
||||
System.out.println(metadata.reductionSettings().reductionMode());
|
||||
System.out.println(metadata.diacriticProcessingMode());
|
||||
```
|
||||
|
||||
This is especially useful when a deployment manages multiple artifacts compiled under different traversal or reduction regimes.
|
||||
135
docs/programmatic-loading-and-building.md
Normal file
135
docs/programmatic-loading-and-building.md
Normal file
@@ -0,0 +1,135 @@
|
||||
# Loading and Building Stemmers
|
||||
|
||||
This document explains how to acquire a compiled Radixor stemmer in Java.
|
||||
|
||||
## Load a bundled language dictionary
|
||||
|
||||
Bundled language resources are simple to use and compile directly into a `FrequencyTrie<String>` during loading.
|
||||
|
||||
```java
|
||||
import java.io.IOException;
|
||||
|
||||
import org.egothor.stemmer.FrequencyTrie;
|
||||
import org.egothor.stemmer.ReductionMode;
|
||||
import org.egothor.stemmer.StemmerPatchTrieLoader;
|
||||
|
||||
public final class BundledLanguageExample {
|
||||
|
||||
private BundledLanguageExample() {
|
||||
throw new AssertionError("No instances.");
|
||||
}
|
||||
|
||||
public static void main(final String[] arguments) throws IOException {
|
||||
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(
|
||||
StemmerPatchTrieLoader.Language.US_UK,
|
||||
true,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
The `storeOriginal` flag controls whether the canonical stem is inserted as a no-op patch entry for the stem itself.
|
||||
|
||||
## Load a textual dictionary
|
||||
|
||||
Loading from a dictionary file follows the same preparation model as bundled resources, but the source comes from your own file or path. The input may be plain UTF-8 text or GZip-compressed UTF-8 text; the loader detects GZip data from the stream header. The textual format is tab-separated values, meaning that columns are separated by the tab character. Each non-empty logical line starts with the stem column and may contain zero or more variant columns. Input case normalization is controlled by `CaseProcessingMode` (default: `LOWERCASE_WITH_LOCALE_ROOT`), trailing remarks introduced by `#` or `//` are ignored, and dictionary items containing embedded whitespace are currently ignored with warning-level diagnostics.
|
||||
|
||||
```java
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
|
||||
import org.egothor.stemmer.FrequencyTrie;
|
||||
import org.egothor.stemmer.ReductionMode;
|
||||
import org.egothor.stemmer.ReductionSettings;
|
||||
import org.egothor.stemmer.StemmerPatchTrieLoader;
|
||||
|
||||
public final class LoadTextDictionaryExample {
|
||||
|
||||
private LoadTextDictionaryExample() {
|
||||
throw new AssertionError("No instances.");
|
||||
}
|
||||
|
||||
public static void main(final String[] arguments) throws IOException {
|
||||
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(
|
||||
Path.of("data", "stemmer.tsv"),
|
||||
true,
|
||||
ReductionSettings.withDefaults(
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS));
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Additional `StemmerPatchTrieLoader.load(...)` overloads let callers provide explicit `WordTraversalDirection`, `CaseProcessingMode`, `DiacriticProcessingMode`, or a complete `TrieMetadata` instance. Use those overloads when a custom dictionary must be compiled with forward traversal for right-to-left languages, case-sensitive keys, or diacritic stripping.
|
||||
|
||||
## Load a compiled binary artifact
|
||||
|
||||
Binary loading is typically the preferred runtime path because it avoids reparsing the textual source and skips the preparation step entirely.
|
||||
|
||||
```java
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
|
||||
import org.egothor.stemmer.FrequencyTrie;
|
||||
import org.egothor.stemmer.StemmerPatchTrieLoader;
|
||||
|
||||
public final class LoadBinaryExample {
|
||||
|
||||
private LoadBinaryExample() {
|
||||
throw new AssertionError("No instances.");
|
||||
}
|
||||
|
||||
public static void main(final String[] arguments) throws IOException {
|
||||
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.loadBinary(
|
||||
Path.of("stemmers", "english.radixor.gz"));
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
The binary format is the native `FrequencyTrie` serialization wrapped in GZip compression. It includes persisted `TrieMetadata`, so lookup after loading uses the traversal, case-processing, diacritic-processing, and reduction settings captured when the trie was compiled.
|
||||
|
||||
## Build directly with a mutable builder
|
||||
|
||||
A `FrequencyTrie.Builder<V>` accepts repeated `put(key, value)` calls and compiles the final read-only trie through `build()`. Compilation performs bottom-up reduction and produces the compact immutable runtime representation.
|
||||
|
||||
```java
|
||||
import org.egothor.stemmer.FrequencyTrie;
|
||||
import org.egothor.stemmer.PatchCommandEncoder;
|
||||
import org.egothor.stemmer.ReductionMode;
|
||||
import org.egothor.stemmer.ReductionSettings;
|
||||
|
||||
public final class BuilderExample {
|
||||
|
||||
private BuilderExample() {
|
||||
throw new AssertionError("No instances.");
|
||||
}
|
||||
|
||||
public static void main(final String[] arguments) {
|
||||
final ReductionSettings settings = ReductionSettings.withDefaults(
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
|
||||
|
||||
final FrequencyTrie.Builder<String> builder =
|
||||
new FrequencyTrie.Builder<>(String[]::new, settings);
|
||||
|
||||
final PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||
|
||||
builder.put("running", encoder.encode("running", "run"));
|
||||
builder.put("runs", encoder.encode("runs", "run"));
|
||||
builder.put("ran", encoder.encode("ran", "run"));
|
||||
builder.put("runner", encoder.encode("runner", "run"));
|
||||
|
||||
final FrequencyTrie<String> trie = builder.build();
|
||||
System.out.println("Canonical node count: " + trie.size());
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Preparation-time memory characteristics
|
||||
|
||||
Compilation is commonly a one-time preparation activity and is generally fast enough not to be the main operational concern. The more important constraint is memory usage while building from textual dictionary data. Before reduction produces the compact immutable structure, the mutable build-time representation keeps the inserted data in memory. This is precisely why very large source dictionaries may require noticeably more memory during preparation than after compilation. The resulting compiled trie, by contrast, is designed as the compact runtime form.
|
||||
|
||||
This makes offline preparation especially attractive for large dictionaries.
|
||||
|
||||
## Continue with
|
||||
|
||||
- [Querying and Ambiguity Handling](programmatic-querying-and-ambiguity.md)
|
||||
- [Extending and Persisting Compiled Tries](programmatic-extending-and-persistence.md)
|
||||
83
docs/programmatic-querying-and-ambiguity.md
Normal file
83
docs/programmatic-querying-and-ambiguity.md
Normal file
@@ -0,0 +1,83 @@
|
||||
# Querying and Ambiguity Handling
|
||||
|
||||
This document explains how a compiled Radixor trie is queried and how ambiguity is represented.
|
||||
|
||||
## Query a compiled trie
|
||||
|
||||
### `get(...)`: preferred local value
|
||||
|
||||
`FrequencyTrie.get(String)` returns the most frequent value stored at the node addressed by the supplied key. If several values have the same local frequency, the winner is chosen deterministically by shorter `toString()` value first, then by lexicographically lower `toString()`, and finally by stable first-seen order. If the key does not exist or no value is stored at the addressed node, `null` is returned.
|
||||
|
||||
```java
|
||||
final String word = "running";
|
||||
final String patch = trie.get(word);
|
||||
```
|
||||
|
||||
### `getAll(...)`: ordered local values
|
||||
|
||||
`FrequencyTrie.getAll(String)` returns all values stored at the addressed node, ordered by descending frequency using the same deterministic tie-breaking rules. The returned array is a defensive copy. If the key is missing or has no local values, an empty array is returned.
|
||||
|
||||
```java
|
||||
final String[] patches = trie.getAll("axes");
|
||||
```
|
||||
|
||||
### `getEntries(...)`: values with counts
|
||||
|
||||
`FrequencyTrie.getEntries(String)` returns immutable `ValueCount<V>` objects aligned with the same ordering used by `getAll(...)`.
|
||||
|
||||
```java
|
||||
import java.util.List;
|
||||
|
||||
import org.egothor.stemmer.ValueCount;
|
||||
|
||||
final List<ValueCount<String>> entries = trie.getEntries("axes");
|
||||
```
|
||||
|
||||
## Apply patch commands
|
||||
|
||||
A patch command is not the final stem. It must be applied to the original input token. `PatchCommandEncoder.apply(source, patchCommand)` performs that transformation directly on the serialized command format. If the source is `null`, the method returns `null`. If the patch is `null`, empty, or malformed in compatibility-relevant ways, the original source word is preserved. Equal source and target words are represented by the canonical no-op patch.
|
||||
|
||||
```java
|
||||
import org.egothor.stemmer.PatchCommandEncoder;
|
||||
|
||||
final String word = "running";
|
||||
final String patch = trie.get(word);
|
||||
final String stem = PatchCommandEncoder.apply(word, patch);
|
||||
```
|
||||
|
||||
For multiple candidates:
|
||||
|
||||
```java
|
||||
final String word = "axes";
|
||||
for (final String patch : trie.getAll(word)) {
|
||||
final String stem = PatchCommandEncoder.apply(word, patch);
|
||||
System.out.println(word + " -> " + stem + " (" + patch + ")");
|
||||
}
|
||||
```
|
||||
|
||||
## Understand reduction modes
|
||||
|
||||
Reduction mode determines how mutable subtrees are merged during compilation. All modes operate on full subtree semantics rather than only on local node content.
|
||||
|
||||
### `MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS`
|
||||
|
||||
This mode merges subtrees whose `getAll()` results are equivalent for every reachable key suffix and whose local result ordering is the same. It ignores absolute frequencies when comparing subtree signatures, but it preserves ranked multi-result ordering semantics.
|
||||
|
||||
### `MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS`
|
||||
|
||||
This mode also merges according to `getAll()` equivalence for every reachable key suffix, but it ignores local result ordering in addition to absolute frequencies. It is therefore more aggressive in what it considers equivalent.
|
||||
|
||||
### `MERGE_SUBTREES_WITH_EQUIVALENT_DOMINANT_GET_RESULTS`
|
||||
|
||||
This mode focuses on `get()` equivalence for every reachable key suffix, subject to dominance constraints. If a node does not satisfy the configured dominance thresholds, the implementation falls back to ranked `getAll()` semantics for that node to avoid unsafe over-reduction. The thresholds are configured through `ReductionSettings`. Defaults are 75 percent minimum winner share and a winner-over-second ratio of 3.
|
||||
|
||||
## Practical guidance
|
||||
|
||||
- choose a ranked `getAll()` mode when downstream ambiguity handling matters,
|
||||
- choose the dominant `get()` mode when the primary operational concern is the preferred result,
|
||||
- treat reduction mode as part of observable lookup semantics, not merely as an internal compression setting.
|
||||
|
||||
## Continue with
|
||||
|
||||
- [Extending and Persisting Compiled Tries](programmatic-extending-and-persistence.md)
|
||||
- [Loading and Building Stemmers](programmatic-loading-and-building.md)
|
||||
@@ -1,322 +1,56 @@
|
||||
# Programmatic Usage
|
||||
|
||||
> ← Back to [README.md](../README.md)
|
||||
This document provides the programmatic entry point to **Radixor**.
|
||||
|
||||
This document describes how to use **Radixor** programmatically from Java.
|
||||
Radixor follows a clear lifecycle:
|
||||
|
||||
It covers:
|
||||
1. acquire a compiled stemmer,
|
||||
2. query it for patch commands,
|
||||
3. apply those commands to produce stems,
|
||||
4. reopen and extend the compiled structure when needed.
|
||||
|
||||
- building a trie from dictionary data
|
||||
- compiling it into an immutable structure
|
||||
- loading compiled stemmers
|
||||
- querying for stems
|
||||
- working with multiple candidates
|
||||
- modifying existing compiled stemmers
|
||||
## Conceptual model
|
||||
|
||||
Radixor is dictionary-driven, but runtime stemming does not operate by scanning raw dictionary files. A source dictionary is parsed as a sequence of canonical stems and their known variants. Each variant is converted into a compact patch command that transforms the variant into the stem, while the stem itself may optionally be stored as a canonical no-op patch. The mutable trie is then reduced into a compiled read-only structure that stores ordered values and their counts at addressed nodes.
|
||||
|
||||
Two consequences matter for developers:
|
||||
|
||||
## Overview
|
||||
- the quality and coverage of stemming behavior depend on dictionary richness,
|
||||
- runtime usage is based on compiled patch-command lookup rather than on direct dictionary traversal.
|
||||
|
||||
Radixor separates the stemming lifecycle into three stages:
|
||||
This is why Radixor can generalize beyond explicitly listed forms and why compiled artifacts are well suited for deployment.
|
||||
|
||||
1. **Build** – collect word–stem mappings in a mutable structure
|
||||
2. **Compile** – reduce and convert to an immutable trie
|
||||
3. **Query** – perform fast runtime lookups
|
||||
## Documentation map
|
||||
|
||||
These stages are represented by:
|
||||
The programmatic API is easier to understand when split by developer task:
|
||||
|
||||
- `FrequencyTrie.Builder` (mutable)
|
||||
- `FrequencyTrie` (immutable, compiled)
|
||||
- `StemmerPatchTrieLoader` / `StemmerPatchTrieBinaryIO` (I/O)
|
||||
- [Loading and Building Stemmers](programmatic-loading-and-building.md) explains how to acquire a compiled stemmer from bundled resources, textual dictionaries, binary artifacts, or direct builder usage.
|
||||
- [Querying and Ambiguity Handling](programmatic-querying-and-ambiguity.md) explains `get(...)`, `getAll(...)`, `getEntries(...)`, patch application, and the practical meaning of reduction modes.
|
||||
- [Extending and Persisting Compiled Tries](programmatic-extending-and-persistence.md) explains how to reopen compiled tries, add new lexical data, rebuild them, and store them as binary artifacts.
|
||||
|
||||
## Core types
|
||||
|
||||
The main types involved in programmatic usage are:
|
||||
|
||||
## Building a trie programmatically
|
||||
- `FrequencyTrie.Builder<V>` for mutable construction and extension,
|
||||
- `FrequencyTrie<V>` for the compiled read-only trie,
|
||||
- `PatchCommandEncoder` for creating and applying patch commands,
|
||||
- `StemmerPatchTrieLoader` for loading bundled or textual dictionaries,
|
||||
- `StemmerPatchTrieBinaryIO` for reading and writing compressed binary artifacts,
|
||||
- `FrequencyTrieBuilders` for reconstructing a mutable builder from a compiled trie,
|
||||
- `ReductionMode` and `ReductionSettings` for controlling compilation semantics.
|
||||
|
||||
You can construct a trie directly without using the CLI.
|
||||
|
||||
```java
|
||||
import org.egothor.stemmer.*;
|
||||
|
||||
public final class BuildExample {
|
||||
|
||||
public static void main(String[] args) {
|
||||
ReductionSettings settings = ReductionSettings.withDefaults(
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS
|
||||
);
|
||||
|
||||
FrequencyTrie.Builder<String> builder =
|
||||
new FrequencyTrie.Builder<>(String[]::new, settings);
|
||||
|
||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
|
||||
builder.put("running", encoder.encode("running", "run"));
|
||||
builder.put("runs", encoder.encode("runs", "run"));
|
||||
builder.put("ran", encoder.encode("ran", "run"));
|
||||
|
||||
FrequencyTrie<String> trie = builder.build();
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
|
||||
## Loading from dictionary files
|
||||
|
||||
To parse dictionary files directly:
|
||||
|
||||
```java
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
|
||||
import org.egothor.stemmer.*;
|
||||
|
||||
public final class LoadFromDictionaryExample {
|
||||
|
||||
public static void main(String[] args) throws IOException {
|
||||
FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(
|
||||
Path.of("data/stemmer.txt"),
|
||||
true,
|
||||
ReductionSettings.withDefaults(
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS
|
||||
)
|
||||
);
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
|
||||
## Loading a compiled binary trie
|
||||
|
||||
```java
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
|
||||
import org.egothor.stemmer.*;
|
||||
|
||||
public final class LoadBinaryExample {
|
||||
|
||||
public static void main(String[] args) throws IOException {
|
||||
FrequencyTrie<String> trie =
|
||||
StemmerPatchTrieLoader.loadBinary(Path.of("english.radixor.gz"));
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
This is the **preferred production approach**.
|
||||
|
||||
|
||||
|
||||
## Querying for stems
|
||||
|
||||
### Preferred result
|
||||
|
||||
```java
|
||||
String word = "running";
|
||||
String patch = trie.get(word);
|
||||
String stem = PatchCommandEncoder.apply(word, patch);
|
||||
```
|
||||
|
||||
### All candidates
|
||||
|
||||
```java
|
||||
String[] patches = trie.getAll(word);
|
||||
|
||||
for (String patch : patches) {
|
||||
String stem = PatchCommandEncoder.apply(word, patch);
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
|
||||
## Accessing value frequencies
|
||||
|
||||
For diagnostic or advanced use cases:
|
||||
|
||||
```java
|
||||
import org.egothor.stemmer.ValueCount;
|
||||
|
||||
java.util.List<ValueCount<String>> entries = trie.getEntries("axes");
|
||||
|
||||
for (ValueCount<String> entry : entries) {
|
||||
String patch = entry.value();
|
||||
int count = entry.count();
|
||||
}
|
||||
```
|
||||
|
||||
This allows:
|
||||
|
||||
* inspecting ambiguity
|
||||
* understanding ranking decisions
|
||||
* debugging dictionary quality
|
||||
|
||||
|
||||
|
||||
## Using bundled language resources
|
||||
|
||||
```java
|
||||
FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(
|
||||
StemmerPatchTrieLoader.Language.US_UK_PROFI,
|
||||
true,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS
|
||||
);
|
||||
```
|
||||
|
||||
Bundled dictionaries are useful for:
|
||||
|
||||
* quick integration
|
||||
* testing
|
||||
* reference behavior
|
||||
|
||||
|
||||
|
||||
## Persisting a compiled trie
|
||||
|
||||
```java
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
|
||||
import org.egothor.stemmer.*;
|
||||
|
||||
public final class SaveExample {
|
||||
|
||||
public static void main(String[] args) throws IOException {
|
||||
StemmerPatchTrieBinaryIO.write(trie, Path.of("english.radixor.gz"));
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
|
||||
## Modifying an existing trie
|
||||
|
||||
A compiled trie can be reopened into a builder, extended, and rebuilt.
|
||||
|
||||
```java
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
|
||||
import org.egothor.stemmer.*;
|
||||
|
||||
public final class ModifyExample {
|
||||
|
||||
public static void main(String[] args) throws IOException {
|
||||
FrequencyTrie<String> compiled =
|
||||
StemmerPatchTrieBinaryIO.read(Path.of("english.radixor.gz"));
|
||||
|
||||
ReductionSettings settings = ReductionSettings.withDefaults(
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS
|
||||
);
|
||||
|
||||
FrequencyTrie.Builder<String> builder =
|
||||
FrequencyTrieBuilders.copyOf(compiled, String[]::new, settings);
|
||||
|
||||
builder.put("microservices", PatchCommandEncoder.NOOP_PATCH);
|
||||
|
||||
FrequencyTrie<String> updated = builder.build();
|
||||
|
||||
StemmerPatchTrieBinaryIO.write(updated,
|
||||
Path.of("english-custom.radixor.gz"));
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
|
||||
## Thread safety
|
||||
|
||||
* `FrequencyTrie` (compiled):
|
||||
|
||||
* **thread-safe**
|
||||
* safe for concurrent reads
|
||||
|
||||
* `FrequencyTrie.Builder`:
|
||||
|
||||
* **not thread-safe**
|
||||
* intended for single-threaded construction
|
||||
|
||||
|
||||
|
||||
## Performance characteristics
|
||||
|
||||
### Querying
|
||||
|
||||
* O(length of word)
|
||||
* minimal allocations
|
||||
* suitable for high-throughput pipelines
|
||||
|
||||
### Loading
|
||||
|
||||
* binary loading is fast
|
||||
* no preprocessing required
|
||||
|
||||
### Building
|
||||
|
||||
* depends on dictionary size
|
||||
* reduction phase may be CPU-intensive
|
||||
|
||||
|
||||
|
||||
## Best practices
|
||||
|
||||
### Reuse compiled trie instances
|
||||
|
||||
* load once
|
||||
* share across threads
|
||||
|
||||
### Prefer binary loading in production
|
||||
|
||||
* avoid rebuilding at runtime
|
||||
* treat compiled files as deployable artifacts
|
||||
|
||||
### Use `getAll()` only when needed
|
||||
|
||||
* `get()` is faster and sufficient for most use cases
|
||||
|
||||
### Keep builders short-lived
|
||||
|
||||
* build → compile → discard
|
||||
|
||||
|
||||
|
||||
## Integration patterns
|
||||
|
||||
### Search systems
|
||||
|
||||
* apply stemming during indexing and querying
|
||||
* ensure consistent dictionary usage
|
||||
|
||||
### Text normalization pipelines
|
||||
|
||||
* integrate as a transformation step
|
||||
* combine with tokenization and filtering
|
||||
|
||||
### Domain adaptation
|
||||
|
||||
* extend dictionaries with domain-specific vocabulary
|
||||
* rebuild compiled artifacts
|
||||
## Recommended reading order
|
||||
|
||||
For most developers, the best order is:
|
||||
|
||||
1. [Loading and Building Stemmers](programmatic-loading-and-building.md)
|
||||
2. [Querying and Ambiguity Handling](programmatic-querying-and-ambiguity.md)
|
||||
3. [Extending and Persisting Compiled Tries](programmatic-extending-and-persistence.md)
|
||||
|
||||
## Next steps
|
||||
|
||||
* [Dictionary format](dictionary-format.md)
|
||||
* [CLI compilation](cli-compilation.md)
|
||||
* [Architecture and reduction](architecture-and-reduction.md)
|
||||
|
||||
|
||||
|
||||
## Summary
|
||||
|
||||
Programmatic usage of Radixor follows a clear pattern:
|
||||
|
||||
* build or load a trie
|
||||
* query using patch commands
|
||||
* apply transformations
|
||||
|
||||
The API is intentionally simple at the surface, while providing deeper control when needed for:
|
||||
|
||||
* ambiguity handling
|
||||
* diagnostics
|
||||
* dictionary evolution
|
||||
- [Quick Start](quick-start.md)
|
||||
- [CLI compilation](cli-compilation.md)
|
||||
- [Dictionary format](dictionary-format.md)
|
||||
- [Architecture and reduction](architecture-and-reduction.md)
|
||||
|
||||
@@ -1,317 +1,239 @@
|
||||
# Quality and Operations
|
||||
|
||||
> ← Back to [README.md](../README.md)
|
||||
This document describes the engineering standards, quality posture, and operational model of **Radixor**.
|
||||
|
||||
This document describes quality, testing, and operational practices for **Radixor**.
|
||||
It is intentionally broader than a test checklist. The purpose of the project is not only to provide a fast stemmer, but to provide one whose behavior is explainable, measurable, reproducible, and straightforward to audit. That objective influences both the implementation style and the surrounding operational practices.
|
||||
|
||||
It focuses on:
|
||||
## Engineering position
|
||||
|
||||
- reliability and determinism
|
||||
- testing strategies
|
||||
- deployment patterns
|
||||
- performance considerations
|
||||
- lifecycle management of stemmer data
|
||||
Radixor is developed with a strong preference for objective quality signals over informal confidence.
|
||||
|
||||
In practical terms, that means the project emphasizes:
|
||||
|
||||
- deterministic behavior,
|
||||
- reproducible compiled artifacts,
|
||||
- very high structural test coverage,
|
||||
- very high mutation resistance,
|
||||
- explicit benchmark methodology,
|
||||
- minimal operational ambiguity in deployment.
|
||||
|
||||
## Overview
|
||||
This is not treated as a cosmetic quality layer added after the implementation. It is part of the design goal of the project itself.
|
||||
|
||||
Radixor is designed to separate:
|
||||
## Why quality discipline matters here
|
||||
|
||||
- **data preparation** (dictionary construction and compilation)
|
||||
- **runtime execution** (lookup and patch application)
|
||||
A stemmer can appear deceptively simple from the outside. In practice, however, correctness depends on several interacting layers:
|
||||
|
||||
This separation enables:
|
||||
|
||||
- predictable runtime behavior
|
||||
- reproducible builds
|
||||
- controlled evolution of stemming data
|
||||
- dictionary parsing,
|
||||
- patch-command generation,
|
||||
- trie construction,
|
||||
- reduction semantics,
|
||||
- binary persistence,
|
||||
- runtime lookup behavior.
|
||||
|
||||
A defect in any one of these layers can produce subtle and difficult-to-detect errors, including silent ranking drift, loss of ambiguity information, reconstruction inconsistencies, or incorrect stemming outcomes under only a narrow subset of inputs.
|
||||
|
||||
For that reason, Radixor aims to be validated not only by example-based tests, but by a broader quality model that combines functional testing, mutation testing, coverage analysis, benchmark visibility, and artifact publication.
|
||||
|
||||
## Determinism and reproducibility
|
||||
|
||||
Radixor emphasizes deterministic behavior.
|
||||
Determinism is a foundational property of the project.
|
||||
|
||||
### Deterministic outputs
|
||||
Given the same dictionary input and the same reduction settings, the project aims to produce:
|
||||
|
||||
Given:
|
||||
- the same compiled trie semantics,
|
||||
- the same local value ordering,
|
||||
- the same observable `get()` and `getAll()` behavior,
|
||||
- the same persisted binary output structure in semantic terms.
|
||||
|
||||
- the same dictionary input
|
||||
- the same reduction settings
|
||||
This matters for more than technical elegance. It enables:
|
||||
|
||||
Radixor guarantees:
|
||||
- stable search behavior across deployments,
|
||||
- reproducible build outputs,
|
||||
- reliable regression analysis,
|
||||
- explainable differences when a dictionary or reduction setting changes.
|
||||
|
||||
- identical compiled trie structure
|
||||
- identical value ordering
|
||||
- identical lookup results
|
||||
A deterministic system is easier to test, easier to reason about, and safer to integrate into production pipelines.
|
||||
|
||||
### Why this matters
|
||||
## Test strategy
|
||||
|
||||
- stable search behavior across deployments
|
||||
- reproducible builds
|
||||
- easier debugging and regression analysis
|
||||
The project is intended to maintain very high confidence in both core correctness and behavioral stability.
|
||||
|
||||
### Structural coverage
|
||||
|
||||
High code coverage is treated as a useful signal, but not as a sufficient goal on its own. Coverage is valuable only when the covered scenarios actually pressure the implementation in meaningful ways.
|
||||
|
||||
## Testing strategy
|
||||
In Radixor, strong coverage is expected across areas such as:
|
||||
|
||||
### Unit testing
|
||||
- patch encoding and application,
|
||||
- mutable trie construction,
|
||||
- subtree reduction,
|
||||
- compiled trie lookup,
|
||||
- binary serialization and deserialization,
|
||||
- reconstruction from compiled state,
|
||||
- dictionary parsing and CLI behavior.
|
||||
|
||||
Core components should be tested independently:
|
||||
### Mutation resistance
|
||||
|
||||
- patch encoding and decoding
|
||||
- trie construction
|
||||
- reduction behavior
|
||||
- binary serialization and deserialization
|
||||
Mutation testing is especially important for this project because it helps distinguish superficial test execution from genuinely discriminating tests.
|
||||
|
||||
### Dictionary validation tests
|
||||
A project can report high line or branch coverage while still failing to detect semantically dangerous implementation drift. Mutation testing provides a stronger objective signal: whether the test suite actually notices meaningful behavioral changes.
|
||||
|
||||
A recommended pattern:
|
||||
For Radixor, very high mutation scores are therefore part of the intended engineering standard, not an optional vanity metric.
|
||||
|
||||
1. load dictionary input
|
||||
2. compile trie
|
||||
3. re-apply all word → stem mappings
|
||||
4. verify that:
|
||||
### Boundary and negative-path validation
|
||||
|
||||
- expected stem is present in `getAll()`
|
||||
- preferred result (`get()`) is correct when deterministic
|
||||
The project also benefits from extensive negative and edge-case testing, for example around:
|
||||
|
||||
This ensures:
|
||||
- malformed patch commands,
|
||||
- missing or corrupt binary data,
|
||||
- invalid CLI arguments,
|
||||
- ambiguous mappings,
|
||||
- dominance-threshold edge conditions,
|
||||
- reconstruction of reduced compiled tries,
|
||||
- empty inputs and short words.
|
||||
|
||||
- no data loss during reduction
|
||||
- correctness of patch encoding
|
||||
These cases are important because many real integration failures occur at the boundary conditions, not in the central happy path.
|
||||
|
||||
## Quality signals and published evidence
|
||||
|
||||
The project publishes durable quality artifacts through GitHub Pages so that important signals remain externally inspectable rather than existing only as transient CI output.
|
||||
|
||||
## Regression testing
|
||||
Those published surfaces include:
|
||||
|
||||
Maintain a stable test dataset:
|
||||
- unit test results,
|
||||
- coverage reports,
|
||||
- mutation testing reports,
|
||||
- static analysis reports,
|
||||
- benchmark outputs,
|
||||
- software composition artifacts.
|
||||
|
||||
- representative vocabulary
|
||||
- edge cases (short words, long words, ambiguous forms)
|
||||
This publication model improves transparency and makes it easier to inspect the project’s quality posture without having to reconstruct the CI environment locally.
|
||||
|
||||
Use it to:
|
||||
## Operational model
|
||||
|
||||
- detect unintended changes
|
||||
- verify behavior after refactoring
|
||||
- validate reduction mode changes
|
||||
Radixor is designed around a clean separation between preparation-time work and runtime execution.
|
||||
|
||||
### Preparation phase
|
||||
|
||||
Preparation includes:
|
||||
|
||||
## Performance testing
|
||||
- creating or refining dictionary data,
|
||||
- compiling the dictionary into a reduced read-only trie,
|
||||
- validating the resulting artifact,
|
||||
- persisting it as a deployable binary stemmer.
|
||||
|
||||
Performance should be evaluated in terms of:
|
||||
### Runtime phase
|
||||
|
||||
### Throughput
|
||||
Runtime usage is intentionally simpler:
|
||||
|
||||
- words processed per second
|
||||
- load the compiled artifact,
|
||||
- reuse the resulting trie,
|
||||
- perform fast lookups and patch application,
|
||||
- avoid rebuilding or reparsing during live request handling.
|
||||
|
||||
### Latency
|
||||
This separation reduces startup unpredictability, keeps runtime behavior stable, and makes deployment artifacts explicit.
|
||||
|
||||
- time per lookup
|
||||
## Production posture
|
||||
|
||||
### Memory footprint
|
||||
For production use, the preferred model is straightforward:
|
||||
|
||||
- size of compiled trie
|
||||
- runtime memory usage
|
||||
1. prepare or refine the lexical resource,
|
||||
2. compile it offline,
|
||||
3. validate the resulting artifact,
|
||||
4. deploy the compiled binary,
|
||||
5. load it once and reuse it.
|
||||
|
||||
Benchmark with:
|
||||
This model has several advantages:
|
||||
|
||||
- realistic token streams
|
||||
- production-like dictionaries
|
||||
- no runtime compilation cost,
|
||||
- no repeated parsing overhead,
|
||||
- clear versioning of stemming behavior,
|
||||
- better reproducibility across environments,
|
||||
- simpler operational diagnosis when results change.
|
||||
|
||||
## Auditability and dependency posture
|
||||
|
||||
Radixor deliberately avoids external runtime dependencies.
|
||||
|
||||
## Deployment model
|
||||
That choice serves a practical engineering goal: the project should be easy to audit from both a correctness and a security perspective, without forcing downstream users to reason through a large dependency graph or a complex software supply chain for core functionality.
|
||||
|
||||
### Recommended workflow
|
||||
A dependency-free core does not make a project automatically secure, but it does simplify several important activities:
|
||||
|
||||
1. prepare dictionary data
|
||||
2. compile using CLI
|
||||
3. store `.radixor.gz` artifact
|
||||
4. deploy artifact with application
|
||||
5. load using `loadBinary(...)`
|
||||
- source review,
|
||||
- behavioral auditing,
|
||||
- release inspection,
|
||||
- software composition analysis,
|
||||
- long-term maintenance.
|
||||
|
||||
### Why this model
|
||||
In operational terms, this means there is less hidden behavior outside the project’s own codebase and less need to evaluate third-party runtime libraries for the core implementation path.
|
||||
|
||||
- avoids runtime compilation overhead
|
||||
- reduces startup latency
|
||||
- ensures consistent behavior across environments
|
||||
## Security-minded operational guidance
|
||||
|
||||
The project’s operational simplicity should be preserved in deployment practice.
|
||||
|
||||
Recommended principles include:
|
||||
|
||||
## Artifact management
|
||||
- treat source dictionaries as controlled inputs,
|
||||
- generate compiled artifacts in known build environments,
|
||||
- version compiled artifacts explicitly,
|
||||
- avoid loading untrusted binary stemmer files,
|
||||
- keep benchmark, test, and quality outputs attached to the same revision that produced the artifact.
|
||||
|
||||
Compiled stemmers should be treated as versioned assets.
|
||||
These practices support traceability and reduce ambiguity about what exactly is running in production.
|
||||
|
||||
### Versioning
|
||||
## Performance as a quality concern
|
||||
|
||||
- include version in filename or metadata
|
||||
- track dictionary source and reduction settings
|
||||
Performance is not isolated from quality; for Radixor, it is part of the project’s engineering contract.
|
||||
|
||||
Example:
|
||||
The benchmark suite exists to make throughput behavior measurable and historically visible. At the same time, benchmark interpretation must remain disciplined. Absolute numbers can vary by environment, especially when published through shared CI infrastructure. Sustained relative behavior and reproducible local benchmark methodology are more meaningful than one-off raw figures.
|
||||
|
||||
```
|
||||
english-v1.2-ranked.radixor.gz
|
||||
```
|
||||
This is why benchmarking belongs alongside testing and reporting rather than outside the quality discussion altogether.
|
||||
|
||||
### Storage
|
||||
## Operational observability
|
||||
|
||||
- store in repository or artifact storage
|
||||
- ensure consistent distribution across environments
|
||||
Radixor itself is intentionally small and does not attempt to become an observability framework. Instead, integrations should provide the surrounding operational visibility that production systems require.
|
||||
|
||||
Typical integration-level observability includes:
|
||||
|
||||
- reporting load failures,
|
||||
- monitoring startup artifact loading,
|
||||
- measuring lookup throughput in the host application,
|
||||
- tracking memory usage of loaded compiled tries,
|
||||
- optionally sampling ambiguity-heavy cases when `getAll()` is part of the application logic.
|
||||
|
||||
## Runtime usage
|
||||
The project’s role is to remain deterministic and inspectable enough that such operational signals are meaningful.
|
||||
|
||||
### Loading
|
||||
## What feedback is most valuable
|
||||
|
||||
- load once during application startup
|
||||
- reuse `FrequencyTrie` instance
|
||||
Feedback is especially valuable when it improves the objectivity or professional rigor of the project.
|
||||
|
||||
### Thread safety
|
||||
That includes, for example:
|
||||
|
||||
- compiled trie is safe for concurrent access
|
||||
- no synchronization required for reads
|
||||
- defects in behavioral correctness,
|
||||
- weaknesses in reduction semantics or edge-case handling,
|
||||
- benchmark methodology issues,
|
||||
- gaps in tests or mutation resistance,
|
||||
- ambiguities in published reports,
|
||||
- opportunities to improve auditability, reproducibility, or operational clarity.
|
||||
|
||||
### Avoid repeated loading
|
||||
Project feedback is most useful when it helps strengthen the project as an implementation that can be trusted, reviewed, and maintained at a professional standard.
|
||||
|
||||
Do not:
|
||||
## Practical summary
|
||||
|
||||
- load the trie per request
|
||||
- rebuild trie at runtime
|
||||
Radixor aims to combine:
|
||||
|
||||
- strong algorithmic performance,
|
||||
- deterministic behavior,
|
||||
- very high validation standards,
|
||||
- transparent published quality evidence,
|
||||
- low operational ambiguity,
|
||||
- easy auditability of the core implementation.
|
||||
|
||||
That combination is central to the identity of the project. The goal is not merely to be fast, but to be fast in a way that remains explainable, testable, reproducible, and professionally defensible.
|
||||
|
||||
## Memory considerations
|
||||
## Related documentation
|
||||
|
||||
- compiled tries are compact but not negligible
|
||||
- size depends on:
|
||||
- dictionary size
|
||||
- reduction mode
|
||||
|
||||
Recommendations:
|
||||
|
||||
- monitor memory usage in production
|
||||
- choose reduction mode appropriately
|
||||
|
||||
|
||||
|
||||
## Reduction mode in production
|
||||
|
||||
Default recommendation:
|
||||
|
||||
- use **ranked mode**
|
||||
|
||||
Switch to other modes only when:
|
||||
|
||||
- memory constraints are strict
|
||||
- multiple candidate results are not required
|
||||
|
||||
Always validate behavior after changing reduction mode.
|
||||
|
||||
|
||||
|
||||
## Dictionary lifecycle
|
||||
|
||||
### Updating dictionaries
|
||||
|
||||
When dictionary data changes:
|
||||
|
||||
1. update source file
|
||||
2. recompile
|
||||
3. run validation tests
|
||||
4. deploy new artifact
|
||||
|
||||
### Backward compatibility
|
||||
|
||||
- changes in the dictionary may affect stemming results
|
||||
- evaluate impact on search relevance
|
||||
|
||||
|
||||
|
||||
## Observability
|
||||
|
||||
Radixor itself does not provide observability features; integration should provide:
|
||||
|
||||
- logging for loading failures
|
||||
- metrics for lookup throughput
|
||||
- monitoring of memory usage
|
||||
|
||||
Optional:
|
||||
|
||||
- sampling of ambiguous results (`getAll()`)
|
||||
|
||||
|
||||
|
||||
## Error handling
|
||||
|
||||
### During compilation
|
||||
|
||||
Handle:
|
||||
|
||||
- invalid dictionary format
|
||||
- I/O failures
|
||||
- invalid arguments
|
||||
|
||||
### During runtime
|
||||
|
||||
Handle:
|
||||
|
||||
- missing dictionary files
|
||||
- corrupted binary artifacts
|
||||
|
||||
Fail fast on initialization errors.
|
||||
|
||||
|
||||
|
||||
## Operational best practices
|
||||
|
||||
- compile dictionaries offline
|
||||
- version compiled artifacts
|
||||
- test before deployment
|
||||
- load once and reuse
|
||||
- monitor performance and memory
|
||||
- document reduction settings used
|
||||
|
||||
|
||||
|
||||
## Security considerations
|
||||
|
||||
- treat dictionary input as trusted data (compile only dictionaries obtained from trusted sources)
|
||||
- validate external sources before compilation
|
||||
- avoid loading unverified binary artifacts
|
||||
|
||||
|
||||
|
||||
## Integration checklist
|
||||
|
||||
Before production deployment:
|
||||
|
||||
- dictionary validated
|
||||
- compiled artifact generated
|
||||
- reduction mode documented
|
||||
- performance tested
|
||||
- memory usage verified
|
||||
- regression tests passing
|
||||
|
||||
|
||||
|
||||
## Next steps
|
||||
|
||||
- [Quick start](quick-start.md)
|
||||
- [Benchmarking](benchmarking.md)
|
||||
- [Reports](reports.md)
|
||||
- [CLI compilation](cli-compilation.md)
|
||||
- [Programmatic usage](programmatic-usage.md)
|
||||
|
||||
|
||||
|
||||
## Summary
|
||||
|
||||
Radixor is designed for:
|
||||
|
||||
- deterministic behavior
|
||||
- efficient runtime execution
|
||||
- controlled data-driven evolution
|
||||
|
||||
By separating compilation from runtime and following proper operational practices, it can be reliably integrated into production-grade systems.
|
||||
|
||||
@@ -1,10 +1,92 @@
|
||||
# Quick Start
|
||||
|
||||
> ← Back to [README.md](../README.md)
|
||||
This guide introduces the fastest practical path to using **Radixor**.
|
||||
|
||||
This guide shows the fastest way to start using **Radixor** and the most common next steps.
|
||||
Radixor separates preparation from runtime usage. Source dictionaries are used to derive patch commands and reduce them into a compact read-only trie. Runtime stemming then operates on that compiled structure rather than on the original dictionary text. A richer dictionary usually improves the quality and coverage of inferred transformations, including transformations that are applicable to words not explicitly present in the source material. The reduction step also removes a large amount of redundant lexical information, which is why very large dictionaries can still produce compact runtime artifacts. These artifacts can be persisted and loaded directly when needed.
|
||||
|
||||
## Hello world
|
||||
A practical workflow usually consists of two independent phases:
|
||||
|
||||
1. obtain a compiled stemmer,
|
||||
2. use the compiled stemmer.
|
||||
|
||||
## 1. Obtain a compiled stemmer
|
||||
|
||||
A compiled stemmer can be obtained in three common ways.
|
||||
|
||||
### Use a bundled language dictionary
|
||||
|
||||
Radixor ships with bundled dictionaries for a set of supported languages. These resources are line-oriented dictionaries stored with the library and compiled into a `FrequencyTrie<String>` when loaded. The loader can also store the canonical stem itself as a no-op patch command. Compiled trie artifacts now persist self-describing metadata, including the traversal direction and compilation reduction settings used to build the artifact.
|
||||
|
||||
```java
|
||||
import java.io.IOException;
|
||||
|
||||
import org.egothor.stemmer.FrequencyTrie;
|
||||
import org.egothor.stemmer.ReductionMode;
|
||||
import org.egothor.stemmer.StemmerPatchTrieLoader;
|
||||
|
||||
public final class BundledStemmerExample {
|
||||
|
||||
private BundledStemmerExample() {
|
||||
throw new AssertionError("No instances.");
|
||||
}
|
||||
|
||||
public static void main(final String[] arguments) throws IOException {
|
||||
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(
|
||||
StemmerPatchTrieLoader.Language.US_UK,
|
||||
true,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
|
||||
|
||||
System.out.println("Canonical node count: " + trie.size());
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Load a previously compiled binary stemmer
|
||||
|
||||
Compiled stemmers can be stored as GZip-compressed binary artifacts and loaded directly. This is usually the most convenient production path because no dictionary parsing or recompilation is needed during application startup.
|
||||
|
||||
```java
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
|
||||
import org.egothor.stemmer.FrequencyTrie;
|
||||
import org.egothor.stemmer.StemmerPatchTrieLoader;
|
||||
|
||||
public final class LoadBinaryStemmerExample {
|
||||
|
||||
private LoadBinaryStemmerExample() {
|
||||
throw new AssertionError("No instances.");
|
||||
}
|
||||
|
||||
public static void main(final String[] arguments) throws IOException {
|
||||
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.loadBinary(
|
||||
Path.of("stemmers", "english.radixor.gz"));
|
||||
|
||||
System.out.println("Canonical node count: " + trie.size());
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Build or extend a stemmer from dictionary data
|
||||
|
||||
Radixor can also build a compiled trie from a custom dictionary. Dictionary lines consist of a canonical stem followed by zero or more variants. The input may be plain UTF-8 text or GZip-compressed UTF-8 text when loaded from a filesystem path. The parser applies `CaseProcessingMode` (default: `LOWERCASE_WITH_LOCALE_ROOT`), ignores leading and trailing whitespace around columns, supports line remarks introduced by `#` or `//`, and skips dictionary items that contain embedded whitespace.
|
||||
|
||||
This path is also relevant when you extend an existing compiled stemmer with additional domain-specific entries and rebuild a new compact artifact.
|
||||
|
||||
A dedicated CLI compilation workflow deserves its own focused page and should remain separate from Quick Start, but conceptually it is simply another way to prepare the compiled artifact before runtime use.
|
||||
|
||||
## 2. Use the compiled stemmer
|
||||
|
||||
A compiled `FrequencyTrie<String>` stores patch commands, not final stems. Querying therefore has two steps:
|
||||
|
||||
1. retrieve one or more patch commands from the trie,
|
||||
2. apply each patch command to the original input word.
|
||||
|
||||
The trie returns values associated with the exact addressed node. `get(...)` returns the locally preferred value, while `getAll(...)` returns all locally stored values ordered by descending frequency with deterministic tie-breaking.
|
||||
|
||||
### Get the preferred result
|
||||
|
||||
Use `get(...)` when the application needs a single preferred transformation.
|
||||
|
||||
```java
|
||||
import java.io.IOException;
|
||||
@@ -14,15 +96,15 @@ import org.egothor.stemmer.PatchCommandEncoder;
|
||||
import org.egothor.stemmer.ReductionMode;
|
||||
import org.egothor.stemmer.StemmerPatchTrieLoader;
|
||||
|
||||
public final class HelloRadixor {
|
||||
public final class SingleStemExample {
|
||||
|
||||
private HelloRadixor() {
|
||||
private SingleStemExample() {
|
||||
throw new AssertionError("No instances.");
|
||||
}
|
||||
|
||||
public static void main(final String[] arguments) throws IOException {
|
||||
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(
|
||||
StemmerPatchTrieLoader.Language.US_UK_PROFI,
|
||||
StemmerPatchTrieLoader.Language.US_UK,
|
||||
true,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
|
||||
|
||||
@@ -30,75 +112,44 @@ public final class HelloRadixor {
|
||||
final String patch = trie.get(word);
|
||||
final String stem = PatchCommandEncoder.apply(word, patch);
|
||||
|
||||
System.out.println(word + " -> " + stem);
|
||||
System.out.println(word + " -> " + stem + " (" + patch + ")");
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
This example shows the core workflow:
|
||||
### Get all candidate results
|
||||
|
||||
1. load a trie
|
||||
2. get a patch command for a word
|
||||
3. apply the patch
|
||||
4. obtain the stem
|
||||
|
||||
## Retrieve multiple candidate stems
|
||||
|
||||
If you need more than one candidate result, use `getAll(...)` instead of `get(...)`.
|
||||
Use `getAll(...)` when the application should preserve ambiguity instead of collapsing everything into one result. The method is available on every compiled trie. What changes across reduction modes is the semantic strength with which multi-result behavior is preserved during reduction, not whether the method exists.
|
||||
|
||||
```java
|
||||
final String word = "axes";
|
||||
final String[] patches = trie.getAll(word);
|
||||
|
||||
for (String patch : patches) {
|
||||
for (final String patch : patches) {
|
||||
final String stem = PatchCommandEncoder.apply(word, patch);
|
||||
System.out.println(word + " -> " + stem + " (" + patch + ")");
|
||||
}
|
||||
```
|
||||
|
||||
## Load a compiled binary stemmer
|
||||
### Inspect ranked values and counts
|
||||
|
||||
For production systems, the preferred approach is usually to precompile the dictionary and load the compressed binary artifact at runtime.
|
||||
For diagnostics or advanced ranking logic, use `getEntries(...)` to obtain value-count pairs in the same deterministic order as `getAll(...)`.
|
||||
|
||||
```java
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
import java.util.List;
|
||||
|
||||
import org.egothor.stemmer.FrequencyTrie;
|
||||
import org.egothor.stemmer.PatchCommandEncoder;
|
||||
import org.egothor.stemmer.StemmerPatchTrieLoader;
|
||||
import org.egothor.stemmer.ValueCount;
|
||||
|
||||
public final class BinaryStemmerExample {
|
||||
final List<ValueCount<String>> entries = trie.getEntries("axes");
|
||||
|
||||
private BinaryStemmerExample() {
|
||||
throw new AssertionError("No instances.");
|
||||
}
|
||||
|
||||
public static void main(final String[] arguments) throws IOException {
|
||||
final Path path = Path.of("stemmers", "english.radixor.gz");
|
||||
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.loadBinary(path);
|
||||
|
||||
final String word = "connected";
|
||||
final String patch = trie.get(word);
|
||||
final String stem = PatchCommandEncoder.apply(word, patch);
|
||||
|
||||
System.out.println(word + " -> " + stem);
|
||||
}
|
||||
for (final ValueCount<String> entry : entries) {
|
||||
System.out.println(entry.value() + " -> " + entry.count());
|
||||
}
|
||||
```
|
||||
|
||||
## Compile a dictionary from the command line
|
||||
## Extend an existing compiled stemmer
|
||||
|
||||
```bash
|
||||
java org.egothor.stemmer.Compile \
|
||||
--input ./data/stemmer.txt \
|
||||
--output ./build/english.radixor.gz \
|
||||
--reduction-mode MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS \
|
||||
--store-original \
|
||||
--overwrite
|
||||
```
|
||||
|
||||
## Modify an existing compiled stemmer
|
||||
A compiled trie is read-only, but it is not permanently sealed. Radixor can reconstruct a mutable builder from a compiled trie, preserve the currently stored local counts, accept additional insertions, and then compile a new read-only trie. Reconstruction operates on the compiled form, so if the source trie was already reduced by subtree merging, the reopened builder reflects that compiled state rather than the original unreduced insertion history.
|
||||
|
||||
```java
|
||||
import java.io.IOException;
|
||||
@@ -111,17 +162,15 @@ import org.egothor.stemmer.ReductionMode;
|
||||
import org.egothor.stemmer.ReductionSettings;
|
||||
import org.egothor.stemmer.StemmerPatchTrieBinaryIO;
|
||||
|
||||
public final class ModifyCompiledExample {
|
||||
public final class ExtendCompiledStemmerExample {
|
||||
|
||||
private ModifyCompiledExample() {
|
||||
private ExtendCompiledStemmerExample() {
|
||||
throw new AssertionError("No instances.");
|
||||
}
|
||||
|
||||
public static void main(final String[] arguments) throws IOException {
|
||||
final Path input = Path.of("stemmers", "english.radixor.gz");
|
||||
final Path output = Path.of("stemmers", "english-custom.radixor.gz");
|
||||
|
||||
final FrequencyTrie<String> compiledTrie = StemmerPatchTrieBinaryIO.read(input);
|
||||
final FrequencyTrie<String> compiledTrie = StemmerPatchTrieBinaryIO.read(
|
||||
Path.of("stemmers", "english.radixor.gz"));
|
||||
|
||||
final ReductionSettings settings = ReductionSettings.withDefaults(
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
|
||||
@@ -131,18 +180,32 @@ public final class ModifyCompiledExample {
|
||||
String[]::new,
|
||||
settings);
|
||||
|
||||
builder.put("microservices", PatchCommandEncoder.NOOP_PATCH);
|
||||
builder.put("microservices", "Na");
|
||||
|
||||
final FrequencyTrie<String> updatedTrie = builder.build();
|
||||
StemmerPatchTrieBinaryIO.write(updatedTrie, output);
|
||||
|
||||
StemmerPatchTrieBinaryIO.write(
|
||||
updatedTrie,
|
||||
Path.of("stemmers", "english-custom.radixor.gz"));
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Operational note on memory and preparation
|
||||
|
||||
Dictionary compilation is usually a one-time preparation step and is generally fast. The more relevant operational constraint is memory consumption during preparation: before reduction, the mutable build-time structure keeps the full dictionary-derived content in RAM. Reduction then compacts it substantially, but very large source dictionaries can still require significant memory during the initial build phase. The best operational model is therefore to compile once, persist the resulting binary artifact, and load that artifact directly in runtime environments.
|
||||
|
||||
## Where to continue
|
||||
|
||||
* [Dictionary format](dictionary-format.md)
|
||||
* [CLI compilation](cli-compilation.md)
|
||||
* [Programmatic usage](programmatic-usage.md)
|
||||
* [Built-in languages](built-in-languages.md)
|
||||
* [Architecture and reduction](architecture-and-reduction.md)
|
||||
- [Programmatic Usage](programmatic-usage.md)
|
||||
- [Dictionary format](dictionary-format.md)
|
||||
- [CLI compilation](cli-compilation.md)
|
||||
- [Built-in languages](built-in-languages.md)
|
||||
- [Architecture and reduction](architecture-and-reduction.md)
|
||||
|
||||
|
||||
## Persisted trie metadata
|
||||
|
||||
Every compiled trie artifact stores a `TrieMetadata` descriptor together with the immutable trie payload. That metadata currently records the binary format version, the `WordTraversalDirection`, the `ReductionSettings` used during compilation, the declared `DiacriticProcessingMode`, and the selected `CaseProcessingMode`. Traversal, case processing, and diacritic processing are applied during runtime lookup (`get`, `getAll`), and case/diacritic processing are also applied during dictionary insertion when a trie is built.
|
||||
|
||||
`DiacriticProcessingMode.AS_IS` keeps dictionary keys and lookup keys unchanged. `DiacriticProcessingMode.REMOVE` strips diacritics from dictionary keys and lookup keys (for Czech diacritics and broad European Latin-script variants). `DiacriticProcessingMode.AS_IS_AND_STRIPPED_FALLBACK` is currently not supported and raises an `UnsupportedOperationException`.
|
||||
|
||||
docs/reduction-semantics.md — new file, 208 lines
@@ -0,0 +1,208 @@
|
||||
# Reduction Semantics
|
||||
|
||||
This document explains how **Radixor** decides that two subtrees are equivalent, how the different reduction modes work, and how those choices affect observable runtime behavior.
|
||||
|
||||
## Why reduction exists
|
||||
|
||||
Without reduction, the trie would still work, but many subtrees that mean the same thing would remain duplicated. The result would be a much larger runtime artifact than necessary.
|
||||
|
||||
Reduction solves that by merging semantically equivalent subtrees into one canonical representative.
|
||||
|
||||
The key idea is simple:
|
||||
|
||||
> if two subtrees behave the same way under the semantic contract chosen for compilation, only one physical copy is needed.
|
||||
|
||||
## Reduction is semantic, not merely structural
|
||||
|
||||
Radixor does not reduce nodes merely because they look similar locally. It reduces subtrees only when their **meaning** matches according to the selected mode.
|
||||
|
||||
That is why reduction is based on a **signature** that captures both:
|
||||
|
||||
1. the local semantics of the current node,
|
||||
2. the structure and semantics of all descendant edges.
|
||||
|
||||
Conceptually:
|
||||
|
||||
```text
|
||||
Signature = (LocalDescriptor, SortedChildDescriptors)
|
||||
```
|
||||
|
||||
Two subtrees are merged only if their signatures are equal.
|
||||
|
||||
## Local descriptors
|
||||
|
||||
The local descriptor defines what “equivalent” means for the values stored at one node.
|
||||
|
||||
Radixor supports three semantic views.
|
||||
|
||||
### Ranked descriptor
|
||||
|
||||
The ranked descriptor preserves the full ordered result semantics of `getAll()`.
|
||||
|
||||
That means:
|
||||
|
||||
- candidate membership is preserved,
|
||||
- local ordering is preserved,
|
||||
- observable ranked multi-result behavior remains stable.
|
||||
|
||||
This is the most semantically faithful mode when ambiguity handling matters.
|
||||
|
||||
### Unordered descriptor
|
||||
|
||||
The unordered descriptor preserves the set of reachable results, but not their local ordering.
|
||||
|
||||
That means:
|
||||
|
||||
- candidate membership is preserved,
|
||||
- ordering differences may be ignored,
|
||||
- more subtrees can be merged than in ranked mode.
|
||||
|
||||
This mode is useful when alternative candidates matter but exact ranking does not.
|
||||
|
||||
### Dominant descriptor
|
||||
|
||||
The dominant descriptor focuses on the preferred result returned by `get()`.
|
||||
|
||||
This mode is used only when the dominant local candidate is strong enough according to configured thresholds:
|
||||
|
||||
- minimum winner percentage,
|
||||
- winner-over-second ratio.
|
||||
|
||||
If that local dominance is not strong enough, Radixor does not force dominant semantics anyway. It falls back to ranked semantics for that node to avoid unsafe over-reduction.
|
||||
|
||||
That fallback is one of the most important safeguards in the design.
|
||||
|
||||
## Child descriptors
|
||||
|
||||
A subtree is not defined only by the values stored at the current node. It is also defined by what behavior is reachable through its children.
|
||||
|
||||
Each child contributes:
|
||||
|
||||
```text
|
||||
(edge character, child signature)
|
||||
```
|
||||
|
||||
Children are sorted by edge character so that signatures remain deterministic and stable.
|
||||
|
||||
This matters because reduction must not depend on incidental map iteration order or other non-semantic implementation details.
|
||||
|
||||
## Canonicalization
|
||||
|
||||
Once a subtree signature is computed, the reduction process checks whether an equivalent canonical subtree already exists.
|
||||
|
||||
If yes, the existing reduced node is reused.
|
||||
|
||||
If no, a new canonical reduced node is created and registered.
|
||||
|
||||
This turns reduction into a canonicalization process:
|
||||
|
||||
- compute semantic identity,
|
||||
- find canonical representative,
|
||||
- reuse or create,
|
||||
- continue bottom-up.
|
||||
|
||||
That is how Radixor eliminates duplicated equivalent subtrees.
|
||||
|
||||
## Count aggregation and compiled state
|
||||
|
||||
When multiple original build-time subtrees collapse into one canonical reduced node, local counts may be aggregated.
|
||||
|
||||
This is an important point for understanding compiled artifacts.
|
||||
|
||||
A compiled trie is not always a verbatim replay of original insertion history. It is a canonical runtime structure that preserves the semantics guaranteed by the chosen reduction mode.
|
||||
|
||||
This explains two things:
|
||||
|
||||
- why compiled artifacts can become dramatically smaller,
|
||||
- why reconstructing a builder from a compiled trie reflects the compiled state rather than the full original unreduced history.
|
||||
|
||||
## Reduction modes
|
||||
|
||||
### `MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS`
|
||||
|
||||
This mode merges subtrees only when their `getAll()` results are equivalent for every reachable key suffix and when local ordering is preserved.
|
||||
|
||||
Use this mode when:
|
||||
|
||||
- ambiguity handling matters,
|
||||
- `getAll()` ordering should remain meaningful,
|
||||
- behavioral fidelity is more important than maximum compression.
|
||||
|
||||
This is the safest and most generally recommended mode.
|
||||
|
||||
### `MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS`
|
||||
|
||||
This mode also preserves `getAll()`-level membership equivalence for every reachable key suffix, but it ignores local ordering differences.
|
||||
|
||||
Use this mode when:
|
||||
|
||||
- alternative candidates still matter,
|
||||
- exact ordering is less important,
|
||||
- stronger reduction is acceptable.
|
||||
|
||||
This mode is more aggressive than ranked mode, but less semantically rich.
|
||||
|
||||
### `MERGE_SUBTREES_WITH_EQUIVALENT_DOMINANT_GET_RESULTS`
|
||||
|
||||
This mode focuses on preserving dominant `get()` semantics for every reachable key suffix, subject to dominance thresholds.
|
||||
|
||||
Use this mode when:
|
||||
|
||||
- the main operational concern is the preferred result,
|
||||
- richer alternative-result behavior is less important,
|
||||
- stronger reduction is desirable.
|
||||
|
||||
Because non-dominant nodes fall back to ranked semantics, this mode is not simply “discard everything except the winner”. It is a controlled reduction strategy with a built-in safety condition.
|
||||
|
||||
## Practical effect on runtime behavior
|
||||
|
||||
Reduction mode is not just a storage optimization setting. It affects what distinctions remain visible after compilation.
|
||||
|
||||
### When ranked mode is used
|
||||
|
||||
You can rely on full ranked `getAll()` semantics being preserved.
|
||||
|
||||
### When unordered mode is used
|
||||
|
||||
You can rely on candidate membership being preserved, but not necessarily on the same local ranking distinctions surviving reduction.
|
||||
|
||||
### When dominant mode is used
|
||||
|
||||
You optimize primarily for preferred-result semantics. Alternative-result behavior may still exist, but it is no longer the primary semantic contract of the reduction.
|
||||
|
||||
## Choosing a mode
|
||||
|
||||
A practical rule of thumb is:
|
||||
|
||||
- choose **ranked** if you are unsure,
|
||||
- choose **unordered** if alternative membership matters but ranking does not,
|
||||
- choose **dominant** only when your application is fundamentally driven by `get()` and you understand the trade-off.
|
||||
|
||||
## Why this design works well
|
||||
|
||||
The reduction model succeeds because it does not confuse “smaller” with “acceptable”.
|
||||
|
||||
Instead, it makes the semantic contract explicit:
|
||||
|
||||
- what exactly must be preserved,
|
||||
- what differences may be ignored,
|
||||
- when a more aggressive mode is safe,
|
||||
- when the system must fall back to a stricter interpretation.
|
||||
|
||||
That explicitness is what makes the compression trustworthy.
|
||||
|
||||
## Mental model to keep
|
||||
|
||||
If you want one concise mental model for reduction, use this one:
|
||||
|
||||
- build-time insertion collects examples,
|
||||
- reduction asks which subtrees mean the same thing,
|
||||
- the answer depends on the chosen semantic contract,
|
||||
- canonical representatives are shared,
|
||||
- the compiled trie preserves the behavior promised by that contract.
|
||||
|
||||
## Continue with
|
||||
|
||||
- [Architecture](architecture.md)
|
||||
- [Programmatic usage](programmatic-usage.md)
|
||||
- [CLI compilation](cli-compilation.md)
|
||||
docs/reports.md — new file, 61 lines
@@ -0,0 +1,61 @@
|
||||
# Reports and Published Build Artifacts
|
||||
|
||||
Radixor publishes durable build outputs to GitHub Pages from qualifying runs of `.github/workflows/pages.yml`.
|
||||
|
||||
This page is the central entry point for published project artifacts, including build summaries, API documentation, test and quality reports, benchmark outputs, and software composition materials. It is intended both for routine project inspection and for linking stable report surfaces from external references such as the README, release notes, or development workflows.
|
||||
|
||||
## Stable entry points
|
||||
|
||||
The following links are the primary stable locations for the most recent published build outputs:
|
||||
|
||||
- [Latest build summary](https://leogalambos.github.io/Radixor/builds/latest/)
|
||||
- [Browse historical build reports](https://leogalambos.github.io/Radixor/builds/)
|
||||
|
||||
Use `builds/latest/` when you want the current published report surface. Use `builds/` when you need to inspect or compare retained historical runs.
|
||||
|
||||
## API and developer documentation
|
||||
|
||||
These reports are primarily useful when reviewing the published API surface and generated developer-facing documentation:
|
||||
|
||||
- [Javadoc](https://leogalambos.github.io/Radixor/builds/latest/javadoc/)
|
||||
|
||||
## Verification and code quality reports
|
||||
|
||||
These reports describe the outcome of core verification and static-analysis stages for the latest published build:
|
||||
|
||||
- [Unit test report](https://leogalambos.github.io/Radixor/builds/latest/test/)
|
||||
- [PMD report](https://leogalambos.github.io/Radixor/builds/latest/pmd/main.html)
|
||||
- [JaCoCo coverage report](https://leogalambos.github.io/Radixor/builds/latest/coverage/)
|
||||
- [PIT mutation testing report](https://leogalambos.github.io/Radixor/builds/latest/pitest/)
|
||||
- [Dependency vulnerability report](https://leogalambos.github.io/Radixor/builds/latest/dependency-check/dependency-check-report.html)
|
||||
|
||||
Together, these reports provide the most direct published view of functional correctness, static quality signals, coverage, mutation resistance, and dependency-level security review outputs.
|
||||
|
||||
## Software composition artifacts
|
||||
|
||||
These artifacts expose the published software bill of materials for the latest build:
|
||||
|
||||
- [SBOM (JSON)](https://leogalambos.github.io/Radixor/builds/latest/sbom/radixor-sbom.json)
|
||||
- [SBOM (XML)](https://leogalambos.github.io/Radixor/builds/latest/sbom/radixor-sbom.xml)
|
||||
|
||||
They are useful for dependency inspection, downstream integration, compliance-oriented workflows, and artifact traceability.
|
||||
|
||||
## Benchmark outputs and badge metadata
|
||||
|
||||
These resources expose benchmark results and generated badge metadata derived from the latest published build:
|
||||
|
||||
- [JMH benchmark results (TXT)](https://leogalambos.github.io/Radixor/builds/latest/jmh/jmh-results.txt)
|
||||
- [JMH benchmark results (CSV)](https://leogalambos.github.io/Radixor/builds/latest/jmh/jmh-results.csv)
|
||||
- [Coverage badge metadata](https://leogalambos.github.io/Radixor/builds/latest/metrics/coverage-badge.json)
|
||||
- [Mutation badge metadata](https://leogalambos.github.io/Radixor/builds/latest/metrics/pitest-badge.json)
|
||||
- [Benchmark badge metadata](https://leogalambos.github.io/Radixor/builds/latest/metrics/jmh-badge.json)
|
||||
|
||||
The benchmark outputs provide direct access to the published JMH result files, while the badge metadata endpoints are intended for status surfaces such as the project README or other generated dashboards.
|
||||
|
||||
## Practical usage
|
||||
|
||||
In most cases, the recommended entry path is:
|
||||
|
||||
1. start with the [Latest build summary](https://leogalambos.github.io/Radixor/builds/latest/),
|
||||
2. open the specific report category relevant to your task,
|
||||
3. use [Browse historical build reports](https://leogalambos.github.io/Radixor/builds/) when historical inspection is needed.
|
||||
@@ -17,3 +17,6 @@ pomScmDeveloperConnection=scm:git:ssh://git@github.com/leogalambos/Radixor.git
|
||||
|
||||
pomLicenseName=BSD-3-Clause
|
||||
pomLicenseUrl=https://spdx.org/licenses/BSD-3-Clause.html
|
||||
|
||||
pomStemmerDataLicenseName=Stemmer Data License Policy
|
||||
pomStemmerDataLicenseUrl=https://github.com/leogalambos/Radixor/blob/main/LICENSE-stemmer-data
|
||||
|
||||
@@ -13,6 +13,12 @@ def pomScmDeveloperConnection = providers.gradleProperty('pomScmDeveloperConnect
|
||||
def pomLicenseName = providers.gradleProperty('pomLicenseName').orNull
|
||||
def pomLicenseUrl = providers.gradleProperty('pomLicenseUrl').orNull
|
||||
def pomLicenseDistribution = providers.gradleProperty('pomLicenseDistribution').orElse('repo').get()
|
||||
def pomStemmerDataLicenseName = providers.gradleProperty('pomStemmerDataLicenseName')
|
||||
.orElse('Stemmer Data License Policy')
|
||||
.get()
|
||||
def pomStemmerDataLicenseUrl = providers.gradleProperty('pomStemmerDataLicenseUrl')
|
||||
.orElse('https://github.com/leogalambos/Radixor/blob/main/LICENSE-stemmer-data')
|
||||
.get()
|
||||
def pomDeveloperId = providers.gradleProperty('pomDeveloperId').orElse('egothor').get()
|
||||
def pomDeveloperName = providers.gradleProperty('pomDeveloperName').orElse('Leo Galambos').get()
|
||||
def pomDeveloperEmail = providers.gradleProperty('pomDeveloperEmail').orElse('egothor@gmail.com').get()
|
||||
@@ -45,6 +51,11 @@ publishing {
|
||||
url = pomLicenseUrl
|
||||
distribution = pomLicenseDistribution
|
||||
}
|
||||
license {
|
||||
name = pomStemmerDataLicenseName
|
||||
url = pomStemmerDataLicenseUrl
|
||||
distribution = pomLicenseDistribution
|
||||
}
|
||||
}
|
||||
|
||||
developers {
|
||||
@@ -93,6 +104,8 @@ tasks.register('validateReleaseMetadata') {
|
||||
if (pomScmDeveloperConnection == null || pomScmDeveloperConnection.isBlank()) missing.add('pomScmDeveloperConnection')
|
||||
if (pomLicenseName == null || pomLicenseName.isBlank()) missing.add('pomLicenseName')
|
||||
if (pomLicenseUrl == null || pomLicenseUrl.isBlank()) missing.add('pomLicenseUrl')
|
||||
if (pomStemmerDataLicenseName == null || pomStemmerDataLicenseName.isBlank()) missing.add('pomStemmerDataLicenseName')
|
||||
if (pomStemmerDataLicenseUrl == null || pomStemmerDataLicenseUrl.isBlank()) missing.add('pomStemmerDataLicenseUrl')
|
||||
if (signingKey == null || signingKey.isBlank()) missing.add('pomSigningKey / SIGNING_KEY')
|
||||
if (signingPassword == null || signingPassword.isBlank()) missing.add('pomSigningPassword / SIGNING_PASSWORD')
|
||||
|
||||
|
||||
@@ -1,10 +1,27 @@
|
||||
import org.gradle.plugins.ide.eclipse.model.SourceFolder
|
||||
|
||||
|
||||
def snowballVersion = '3.0.1'
|
||||
def snowballArchiveName = "libstemmer_java-${snowballVersion}.tar.gz"
|
||||
def snowballDistributionDirectoryName = "libstemmer_java-${snowballVersion}"
|
||||
def snowballRootRelativePath = 'third-party/snowball'
|
||||
def snowballSourceRelativePath = "${snowballRootRelativePath}/source"
|
||||
def snowballJavaSourceRelativePath = "${snowballSourceRelativePath}/${snowballDistributionDirectoryName}/java"
|
||||
def snowballDownloadUrl = "https://snowballstem.org/dist/${snowballArchiveName}"
|
||||
def snowballDownloadFile = layout.buildDirectory.file("third-party/snowball/${snowballArchiveName}")
|
||||
def snowballExtractDirectory = layout.buildDirectory.dir('third-party/snowball/source')
|
||||
def snowballJavaSourceDirectory = layout.buildDirectory.dir(
|
||||
"third-party/snowball/source/libstemmer_java-${snowballVersion}/java")
|
||||
def snowballDownloadFile = layout.buildDirectory.file("${snowballRootRelativePath}/${snowballArchiveName}")
|
||||
def snowballExtractDirectory = layout.buildDirectory.dir(snowballSourceRelativePath)
|
||||
def snowballJavaSourceDirectory = layout.buildDirectory.dir(snowballJavaSourceRelativePath)
|
||||
def snowballJavaSourceClasspathPath = provider {
|
||||
project.relativePath(snowballJavaSourceDirectory.get().asFile)
|
||||
}
|
||||
def snowballEclipseClasspathAttributes = [
|
||||
gradle_scope : 'jmh',
|
||||
gradle_used_by_scope: 'jmh',
|
||||
test : 'true'
|
||||
]
|
||||
def isAbsoluteClasspathPath = { String path ->
|
||||
path.startsWith('/') || path ==~ /^[A-Za-z]:[\\\/].*/
|
||||
}
|
||||
|
||||
tasks.register('downloadSnowballJava') {
|
||||
group = 'build setup'
|
||||
@@ -47,3 +64,30 @@ sourceSets {
|
||||
tasks.named('compileJmhJava') {
|
||||
dependsOn(tasks.named('extractSnowballJava'))
|
||||
}
|
||||
|
||||
eclipse {
|
||||
classpath {
|
||||
file {
|
||||
whenMerged { classpath ->
|
||||
String generatedSnowballPath = snowballJavaSourceClasspathPath.get()
|
||||
String modelSnowballPath = snowballJavaSourceRelativePath
|
||||
|
||||
classpath.entries.removeAll { entry ->
|
||||
entry.hasProperty('path') && (
|
||||
entry.path == generatedSnowballPath ||
|
||||
entry.path == modelSnowballPath ||
|
||||
isAbsoluteClasspathPath(entry.path)
|
||||
)
|
||||
}
|
||||
|
||||
SourceFolder snowballEntry = new SourceFolder(generatedSnowballPath, null)
|
||||
snowballEntry.output = 'bin/jmh'
|
||||
snowballEclipseClasspathAttributes.each { String name, String value ->
|
||||
snowballEntry.entryAttributes[name] = value
|
||||
}
|
||||
|
||||
classpath.entries.add(snowballEntry)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
mkdocs.yml — new file, 65 lines
@@ -0,0 +1,65 @@
|
||||
site_name: Radixor
|
||||
site_description: High-performance multi-language stemming toolkit for Java
|
||||
site_url: https://leogalambos.github.io/Radixor/
|
||||
repo_url: https://github.com/leogalambos/Radixor
|
||||
repo_name: leogalambos/Radixor
|
||||
copyright: "© 2026 Egothor. Licensed under <a href='https://github.com/leogalambos/Radixor/blob/main/LICENSE'>BSD-3-Clause</a>."
|
||||
|
||||
theme:
|
||||
name: material
|
||||
language: en
|
||||
features:
|
||||
- navigation.instant
|
||||
- navigation.sections
|
||||
- navigation.top
|
||||
- search.suggest
|
||||
- search.highlight
|
||||
- content.code.copy
|
||||
palette:
|
||||
- scheme: default
|
||||
primary: indigo
|
||||
accent: indigo
|
||||
|
||||
extra:
|
||||
generator: false
|
||||
|
||||
extra_css:
|
||||
- assets/stylesheets/extra.css
|
||||
|
||||
markdown_extensions:
|
||||
- admonition
|
||||
- attr_list
|
||||
- md_in_html
|
||||
- pymdownx.details
|
||||
- pymdownx.highlight
|
||||
- pymdownx.superfences
|
||||
- tables
|
||||
|
||||
nav:
|
||||
- Home: index.md
|
||||
|
||||
- Getting Started:
|
||||
- Quick Start: quick-start.md
|
||||
- Built-in Languages: built-in-languages.md
|
||||
- Dictionary Format: dictionary-format.md
|
||||
- CLI Compilation: cli-compilation.md
|
||||
|
||||
- Programmatic Usage:
|
||||
- Overview: programmatic-usage.md
|
||||
- Loading and Building Stemmers: programmatic-loading-and-building.md
|
||||
- Querying and Ambiguity Handling: programmatic-querying-and-ambiguity.md
|
||||
- Extending and Persisting Compiled Tries: programmatic-extending-and-persistence.md
|
||||
|
||||
- Architecture and Semantics:
|
||||
- Overview: architecture-and-reduction.md
|
||||
- Architecture: architecture.md
|
||||
- Reduction Semantics: reduction-semantics.md
|
||||
- Compatibility and Guarantees: compatibility-and-guarantees.md
|
||||
|
||||
- Dictionaries:
|
||||
- Contributing Dictionaries: contributing-dictionaries.md
|
||||
|
||||
- Quality and Operations:
|
||||
- Quality and Operations: quality-and-operations.md
|
||||
- Benchmarking: benchmarking.md
|
||||
- Reports: reports.md
|
||||
@@ -149,7 +149,7 @@ final class BenchmarkCorpusSupport {
|
||||
Objects.requireNonNull(reductionSettings, "reductionSettings");
|
||||
|
||||
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(String[]::new, reductionSettings);
|
||||
final PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
final PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||
|
||||
StemmerDictionaryParser.parse(
|
||||
new StringReader(corpusText),
|
||||
|
||||
@@ -59,8 +59,7 @@ import org.tartarus.snowball.ext.porterStemmer;
|
||||
* The benchmark processes the same deterministic token array with:
|
||||
* </p>
|
||||
* <ul>
|
||||
* <li>Radixor using bundled
|
||||
* {@link StemmerPatchTrieLoader.Language#US_UK_PROFI}</li>
|
||||
* <li>Radixor using bundled {@link StemmerPatchTrieLoader.Language#US_UK}</li>
|
||||
* <li>Snowball original Porter stemmer</li>
|
||||
* <li>Snowball English stemmer, commonly referred to as Porter2</li>
|
||||
* </ul>
|
||||
@@ -106,7 +105,7 @@ public class EnglishStemmerComparisonBenchmark {
|
||||
@Setup(Level.Trial)
|
||||
public void setUp() throws IOException {
|
||||
this.tokens = EnglishComparisonCorpus.createTokens(this.familyCount);
|
||||
this.radixorTrie = StemmerPatchTrieLoader.load(StemmerPatchTrieLoader.Language.US_UK_PROFI, true,
|
||||
this.radixorTrie = StemmerPatchTrieLoader.load(StemmerPatchTrieLoader.Language.US_UK, true,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
|
||||
}
|
||||
}
|
||||
|
||||
55
src/main/java/org/egothor/stemmer/CaseProcessingMode.java
Normal file
55
src/main/java/org/egothor/stemmer/CaseProcessingMode.java
Normal file
@@ -0,0 +1,55 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import java.util.Locale;
|
||||
|
||||
/**
|
||||
* Defines how dictionary items are normalized with respect to letter casing.
|
||||
*
|
||||
* <p>
|
||||
* The mode is applied while parsing dictionary sources and can be persisted in
|
||||
* trie metadata so that compiled artifacts remain self-describing.
|
||||
*/
|
||||
public enum CaseProcessingMode {
|
||||
|
||||
/**
|
||||
* Preserves input character casing exactly as provided by the dictionary
|
||||
* source.
|
||||
*/
|
||||
AS_IS,
|
||||
|
||||
/**
|
||||
* Normalizes all dictionary content to lower case using
|
||||
* {@link Locale#ROOT}.
|
||||
*/
|
||||
LOWERCASE_WITH_LOCALE_ROOT
|
||||
}
|
||||
@@ -61,6 +61,8 @@ import java.util.logging.Logger;
|
||||
* --output <file>
|
||||
* --reduction-mode <mode>
|
||||
* [--store-original]
|
||||
* [--right-to-left]
|
||||
* [--case-processing-mode <mode>]
|
||||
* [--dominant-winner-min-percent <1..100>]
|
||||
* [--dominant-winner-over-second-ratio <1..n>]
|
||||
* [--overwrite]
|
||||
@@ -149,8 +151,10 @@ public final class Compile {
|
||||
final ReductionSettings reductionSettings = new ReductionSettings(arguments.reductionMode(),
|
||||
arguments.dominantWinnerMinPercent(), arguments.dominantWinnerOverSecondRatio());
|
||||
|
||||
final WordTraversalDirection traversalDirection = arguments.rightToLeft() ? WordTraversalDirection.FORWARD
|
||||
: WordTraversalDirection.BACKWARD;
|
||||
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(arguments.inputFile(), arguments.storeOriginal(),
|
||||
reductionSettings);
|
||||
reductionSettings, traversalDirection, arguments.caseProcessingMode());
|
||||
|
||||
final Path outputFile = arguments.outputFile();
|
||||
final Path parent = outputFile.toAbsolutePath().getParent();
|
||||
@@ -166,11 +170,11 @@ public final class Compile {
|
||||
|
||||
if (LOGGER.isLoggable(Level.INFO)) {
|
||||
LOGGER.log(Level.INFO,
|
||||
"Compiled dictionary {0} to {1} using mode {2}, storeOriginal={3}, dominantWinnerMinPercent={4}, dominantWinnerOverSecondRatio={5}.",
|
||||
"Compiled dictionary {0} to {1} using mode {2}, storeOriginal={3}, rightToLeft={4}, caseProcessingMode={5}, dominantWinnerMinPercent={6}, dominantWinnerOverSecondRatio={7}.",
|
||||
new Object[] { arguments.inputFile().toAbsolutePath().toString(),
|
||||
arguments.outputFile().toAbsolutePath().toString(), arguments.reductionMode().name(),
|
||||
arguments.storeOriginal(), arguments.dominantWinnerMinPercent(),
|
||||
arguments.dominantWinnerOverSecondRatio() });
|
||||
arguments.storeOriginal(), arguments.rightToLeft(), arguments.caseProcessingMode(),
|
||||
arguments.dominantWinnerMinPercent(), arguments.dominantWinnerOverSecondRatio() });
|
||||
}
|
||||
}
|
||||
|
||||
@@ -184,10 +188,28 @@ public final class Compile {
|
||||
System.err.println(" --output <file> \\");
|
||||
System.err.println(" --reduction-mode <mode> \\");
|
||||
System.err.println(" [--store-original] \\");
|
||||
System.err.println(" [--case-processing-mode <mode>] \\");
|
||||
System.err.println(" [--dominant-winner-min-percent <1..100>] \\");
|
||||
System.err.println(" [--dominant-winner-over-second-ratio <1..n>] \\");
|
||||
System.err.println(" [--overwrite]");
|
||||
System.err.println();
|
||||
System.err.println("Options:");
|
||||
System.err.println(" --store-original");
|
||||
System.err.println(" Inserts each canonical stem itself using the no-operation patch.");
|
||||
System.err.println(" --right-to-left");
|
||||
System.err.println(" Uses forward word traversal for right-to-left languages.");
|
||||
System.err.println(" In this mode, trie keys are constructed from the logical beginning");
|
||||
System.err.println(" of the stored word form and patch commands are encoded likewise.");
|
||||
System.err.println(" --overwrite");
|
||||
System.err.println(" Replaces the target file when it already exists.");
|
||||
System.err.println(" --case-processing-mode");
|
||||
System.err.println(" Controls whether dictionary input is lowercased or preserved as-is.");
|
||||
System.err.println();
|
||||
System.err.println("Supported case processing modes:");
|
||||
for (CaseProcessingMode mode : CaseProcessingMode.values()) {
|
||||
System.err.println(" " + mode.name());
|
||||
}
|
||||
System.err.println();
|
||||
System.err.println("Supported reduction modes:");
|
||||
for (ReductionMode mode : ReductionMode.values()) {
|
||||
System.err.println(" " + mode.name());
|
||||
@@ -240,15 +262,20 @@ public final class Compile {
|
||||
* @param outputFile output compressed trie file
|
||||
* @param reductionMode subtree reduction mode
|
||||
* @param storeOriginal whether original stems are stored
|
||||
* @param rightToLeft whether dictionary compilation should
|
||||
* use forward traversal on stored word
|
||||
* forms
|
||||
* @param dominantWinnerMinPercent dominant winner minimum percent
|
||||
* @param dominantWinnerOverSecondRatio dominant winner over second ratio
|
||||
* @param caseProcessingMode dictionary case processing mode
|
||||
* @param overwrite whether an existing output may be
|
||||
* replaced
|
||||
* @param help whether usage help was requested
|
||||
*/
|
||||
@SuppressWarnings("PMD.LongVariable")
|
||||
private record Arguments(Path inputFile, Path outputFile, ReductionMode reductionMode, boolean storeOriginal,
|
||||
int dominantWinnerMinPercent, int dominantWinnerOverSecondRatio, boolean overwrite, boolean help) {
|
||||
boolean rightToLeft, int dominantWinnerMinPercent, int dominantWinnerOverSecondRatio,
|
||||
CaseProcessingMode caseProcessingMode, boolean overwrite, boolean help) {
|
||||
|
||||
/**
|
||||
* Parses raw command-line arguments.
|
||||
@@ -264,8 +291,10 @@ public final class Compile {
|
||||
Path outputFile = null;
|
||||
ReductionMode reductionMode = null;
|
||||
boolean storeOriginal = false;
|
||||
boolean rightToLeft = false;
|
||||
boolean overwrite = false;
|
||||
boolean help = false;
|
||||
CaseProcessingMode caseProcessingMode = CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT;
|
||||
int dominantWinnerMinPercent = ReductionSettings.DEFAULT_DOMINANT_WINNER_MIN_PERCENT;
|
||||
int dominantWinnerOverSecondRatio = ReductionSettings.DEFAULT_DOMINANT_WINNER_OVER_SECOND_RATIO;
|
||||
|
||||
@@ -286,6 +315,10 @@ public final class Compile {
|
||||
overwrite = true;
|
||||
break;
|
||||
|
||||
case "--right-to-left":
|
||||
rightToLeft = true;
|
||||
break;
|
||||
|
||||
case "--input":
|
||||
inputFile = Path.of(requireValue(arguments, ++index, "--input"));
|
||||
break;
|
||||
@@ -310,6 +343,10 @@ public final class Compile {
|
||||
requireValue(arguments, ++index, "--dominant-winner-over-second-ratio"),
|
||||
"--dominant-winner-over-second-ratio");
|
||||
break;
|
||||
case "--case-processing-mode":
|
||||
caseProcessingMode = CaseProcessingMode.valueOf(
|
||||
requireValue(arguments, ++index, "--case-processing-mode").toUpperCase(Locale.ROOT));
|
||||
break;
|
||||
|
||||
default:
|
||||
throw new IllegalArgumentException("Unknown argument: " + argument);
|
||||
@@ -317,8 +354,8 @@ public final class Compile {
|
||||
}
|
||||
|
||||
if (help) {
|
||||
return new Arguments(inputFile, outputFile, reductionMode, storeOriginal, dominantWinnerMinPercent,
|
||||
dominantWinnerOverSecondRatio, overwrite, true);
|
||||
return new Arguments(inputFile, outputFile, reductionMode, storeOriginal, rightToLeft,
|
||||
dominantWinnerMinPercent, dominantWinnerOverSecondRatio, caseProcessingMode, overwrite, true);
|
||||
}
|
||||
|
||||
if (inputFile == null) {
|
||||
@@ -331,8 +368,8 @@ public final class Compile {
|
||||
throw new IllegalArgumentException("Missing required argument --reduction-mode.");
|
||||
}
|
||||
|
||||
return new Arguments(inputFile, outputFile, reductionMode, storeOriginal, dominantWinnerMinPercent,
|
||||
dominantWinnerOverSecondRatio, overwrite, false);
|
||||
return new Arguments(inputFile, outputFile, reductionMode, storeOriginal, rightToLeft,
|
||||
dominantWinnerMinPercent, dominantWinnerOverSecondRatio, caseProcessingMode, overwrite, false);
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -0,0 +1,66 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
/**
|
||||
* Defines how dictionary loading and trie traversal should treat diacritics.
|
||||
*
|
||||
* <p>
|
||||
* The selected mode is applied independently from other normalization modes
|
||||
* (for example {@link CaseProcessingMode}). This means case normalization and
|
||||
* diacritic normalization can be combined freely and each keeps its own
|
||||
* semantics.
|
||||
* </p>
|
||||
*/
|
||||
public enum DiacriticProcessingMode {
|
||||
|
||||
/**
|
||||
* Preserves dictionary entries and lookup keys exactly as provided.
|
||||
*/
|
||||
AS_IS,
|
||||
|
||||
/**
|
||||
* Removes diacritics from dictionary entries before trie construction and
|
||||
* removes diacritics from lookup keys before traversal.
|
||||
*/
|
||||
REMOVE,
|
||||
|
||||
/**
|
||||
* Planned dual-path mode where lookup may continue along both the original
|
||||
* diacritic edge and a normalized non-diacritic alternative.
|
||||
*
|
||||
* <p>
|
||||
* This mode is currently not supported and using it triggers
|
||||
* {@link UnsupportedOperationException}.
|
||||
* </p>
|
||||
*/
|
||||
AS_IS_AND_STRIPPED_FALLBACK
|
||||
}
|
||||
197
src/main/java/org/egothor/stemmer/DiacriticStripper.java
Normal file
197
src/main/java/org/egothor/stemmer/DiacriticStripper.java
Normal file
@@ -0,0 +1,197 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import java.text.Normalizer;
|
||||
import java.text.Normalizer.Form;
|
||||
|
||||
/**
|
||||
* Utility that strips diacritics from text for diacritic-insensitive trie
|
||||
* storage and lookup.
|
||||
*/
|
||||
final class DiacriticStripper {
|
||||
|
||||
/**
|
||||
* Direct single-character replacement table.
|
||||
*/
|
||||
private static final char[] DIRECT_REPLACEMENTS = new char[Character.MAX_VALUE + 1];
|
||||
|
||||
static {
|
||||
registerSingle("áàâäãåāăąǎȁȃạảấầẩẫậắằẳẵặ", 'a');
|
||||
registerSingle("ÁÀÂÄÃÅĀĂĄǍȀȂẠẢẤẦẨẪẬẮẰẲẴẶ", 'A');
|
||||
registerSingle("çćĉċč", 'c');
|
||||
registerSingle("ÇĆĈĊČ", 'C');
|
||||
registerSingle("ďđḍ", 'd');
|
||||
registerSingle("ĎĐḌ", 'D');
|
||||
registerSingle("éèêëēĕėęěȅȇẹẻẽếềểễệ", 'e');
|
||||
registerSingle("ÉÈÊËĒĔĖĘĚȄȆẸẺẼẾỀỂỄỆ", 'E');
|
||||
registerSingle("ğĝġģǧ", 'g');
|
||||
registerSingle("ĞĜĠĢǦ", 'G');
|
||||
registerSingle("ĥħ", 'h');
|
||||
registerSingle("ĤĦ", 'H');
|
||||
registerSingle("íìîïĩīĭįıǐȉȋịỉ", 'i');
|
||||
registerSingle("ÍÌÎÏĨĪĬĮİǏȈȊỊỈ", 'I');
|
||||
registerSingle("ĵ", 'j');
|
||||
registerSingle("Ĵ", 'J');
|
||||
registerSingle("ķǩ", 'k');
|
||||
registerSingle("ĶǨ", 'K');
|
||||
registerSingle("ĺļľŀł", 'l');
|
||||
registerSingle("ĹĻĽĿŁ", 'L');
|
||||
registerSingle("ñńņňʼnŋ", 'n');
|
||||
registerSingle("ÑŃŅŇŊ", 'N');
|
||||
registerSingle("óòôöõōŏőǒȍȏọỏốồổỗộớờởỡợø", 'o');
|
||||
registerSingle("ÓÒÔÖÕŌŎŐǑȌȎỌỎỐỒỔỖỘỚỜỞỠỢØ", 'O');
|
||||
registerSingle("ŕŗř", 'r');
|
||||
registerSingle("ŔŖŘ", 'R');
|
||||
registerSingle("śŝşšș", 's');
|
||||
registerSingle("ŚŜŞŠȘ", 'S');
|
||||
registerSingle("ťţŧț", 't');
|
||||
registerSingle("ŤŢŦȚ", 'T');
|
||||
registerSingle("úùûüũūŭůűųǔȕȗụủứừửữự", 'u');
|
||||
registerSingle("ÚÙÛÜŨŪŬŮŰŲǓȔȖỤỦỨỪỬỮỰ", 'U');
|
||||
registerSingle("ýÿŷỳỵỷỹ", 'y');
|
||||
registerSingle("ÝŶŸỲỴỶỸ", 'Y');
|
||||
registerSingle("źżž", 'z');
|
||||
registerSingle("ŹŻŽ", 'Z');
|
||||
registerSingle("þ", 't');
|
||||
registerSingle("Þ", 'T');
|
||||
}
|
||||
|
||||
/**
|
||||
* Utility class.
|
||||
*/
|
||||
private DiacriticStripper() {
|
||||
throw new AssertionError("No instances.");
|
||||
}
|
||||
|
||||
/**
|
||||
* Removes supported diacritic marks and common Latin ligatures from the supplied
|
||||
* text.
|
||||
*
|
||||
* <p>
|
||||
* The method returns the original {@link String} instance when no replacement is
|
||||
* required, avoiding an unnecessary allocation on the common ASCII path.
|
||||
* </p>
|
||||
*
|
||||
* @param input text to normalize
|
||||
* @return normalized text, or {@code input} itself when it is already unchanged
|
||||
*/
|
||||
/* default */ static String strip(final String input) {
|
||||
StringBuilder normalized = null;
|
||||
|
||||
for (int index = 0; index < input.length(); index++) {
|
||||
final char source = input.charAt(index);
|
||||
final String replacement = replacementFor(source);
|
||||
|
||||
if (replacement == null) {
|
||||
if (normalized != null) {
|
||||
normalized.append(source);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
if (normalized == null) {
|
||||
normalized = new StringBuilder(input.length()); // NOPMD - invariant: only once
|
||||
normalized.append(input, 0, index);
|
||||
}
|
||||
normalized.append(replacement);
|
||||
}
|
||||
|
||||
if (normalized == null) {
|
||||
return input;
|
||||
}
|
||||
return normalized.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the replacement text for one non-ASCII character.
|
||||
*
|
||||
* @param source source character
|
||||
* @return replacement text, or {@code null} when the character should be kept
|
||||
* unchanged
|
||||
*/
|
||||
@SuppressWarnings("PMD.AvoidLiteralsInIfCondition")
|
||||
private static String replacementFor(final char source) {
|
||||
if (source <= 0x007F) {
|
||||
return null;
|
||||
}
|
||||
|
||||
final char mapped = DIRECT_REPLACEMENTS[source];
|
||||
if (mapped != '\0') {
|
||||
return String.valueOf(mapped);
|
||||
}
|
||||
|
||||
if (source == 'ß') {
|
||||
return "ss";
|
||||
}
|
||||
if (source == 'Æ') {
|
||||
return "AE";
|
||||
}
|
||||
if (source == 'æ') {
|
||||
return "ae";
|
||||
}
|
||||
if (source == 'Œ') {
|
||||
return "OE";
|
||||
}
|
||||
if (source == 'œ') {
|
||||
return "oe";
|
||||
}
|
||||
|
||||
final String decomposed = Normalizer.normalize(String.valueOf(source), Form.NFD);
|
||||
final StringBuilder ascii = new StringBuilder(decomposed.length());
|
||||
for (int index = 0; index < decomposed.length(); index++) {
|
||||
final char part = decomposed.charAt(index);
|
||||
if (Character.getType(part) == Character.NON_SPACING_MARK) {
|
||||
continue;
|
||||
}
|
||||
if (part <= 0x007F) {
|
||||
ascii.append(part);
|
||||
}
|
||||
}
|
||||
|
||||
if (ascii.length() == 0) {
|
||||
return null;
|
||||
}
|
||||
return ascii.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Registers one-character replacements for a set of source characters.
|
||||
*
|
||||
* @param sourceCharacters characters to replace
|
||||
* @param replacement replacement character
|
||||
*/
|
||||
private static void registerSingle(final String sourceCharacters, final char replacement) {
|
||||
for (int index = 0; index < sourceCharacters.length(); index++) {
|
||||
DIRECT_REPLACEMENTS[sourceCharacters.charAt(index)] = replacement;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -41,6 +41,7 @@ import java.util.Collections;
|
||||
import java.util.IdentityHashMap;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.function.IntFunction;
|
||||
@@ -86,6 +87,7 @@ import org.egothor.stemmer.trie.ReductionSignature;
|
||||
*
|
||||
* @param <V> value type
|
||||
*/
|
||||
@SuppressWarnings("PMD.CyclomaticComplexity")
|
||||
public final class FrequencyTrie<V> {
|
||||
|
||||
/**
|
||||
@@ -93,16 +95,6 @@ public final class FrequencyTrie<V> {
|
||||
*/
|
||||
private static final Logger LOGGER = Logger.getLogger(FrequencyTrie.class.getName());
|
||||
|
||||
/**
|
||||
* Binary format magic header.
|
||||
*/
|
||||
private static final int STREAM_MAGIC = 0x45475452;
|
||||
|
||||
/**
|
||||
* Binary format version.
|
||||
*/
|
||||
private static final int STREAM_VERSION = 1;
|
||||
|
||||
/**
|
||||
* Factory used to create correctly typed arrays for {@link #getAll(String)}.
|
||||
*/
|
||||
@@ -113,16 +105,49 @@ public final class FrequencyTrie<V> {
|
||||
*/
|
||||
private final CompiledNode<V> root;
|
||||
|
||||
/**
|
||||
* Metadata persisted together with this trie.
|
||||
*/
|
||||
private final TrieMetadata metadata;
|
||||
|
||||
/**
|
||||
* Binary format magic header.
|
||||
*/
|
||||
private static final int STREAM_MAGIC = 0x45475452;
|
||||
|
||||
/**
|
||||
* Binary format version.
|
||||
*/
|
||||
private static final int STREAM_VERSION = 5;
|
||||
|
||||
/**
|
||||
* Returns the current persisted binary stream format version.
|
||||
*
|
||||
* <p>
|
||||
* This method exists so other components can construct {@link TrieMetadata}
|
||||
* instances aligned with the currently written binary format without
|
||||
* duplicating constants.
|
||||
* </p>
|
||||
*
|
||||
* @return current trie stream format version
|
||||
*/
|
||||
public static int currentFormatVersion() {
|
||||
return STREAM_VERSION;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new compiled trie instance.
|
||||
*
|
||||
* @param arrayFactory array factory
|
||||
* @param root compiled root node
|
||||
* @param metadata trie metadata describing lookup and persistence semantics
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
*/
|
||||
private FrequencyTrie(final IntFunction<V[]> arrayFactory, final CompiledNode<V> root) {
|
||||
private FrequencyTrie(final IntFunction<V[]> arrayFactory, final CompiledNode<V> root,
|
||||
final TrieMetadata metadata) {
|
||||
this.arrayFactory = Objects.requireNonNull(arrayFactory, "arrayFactory");
|
||||
this.root = Objects.requireNonNull(root, "root");
|
||||
this.metadata = Objects.requireNonNull(metadata, "metadata");
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -135,6 +160,10 @@ public final class FrequencyTrie<V> {
|
||||
* lexicographically lower {@code toString()}, and finally by stable first-seen
|
||||
* order.
|
||||
*
|
||||
* <p>
|
||||
* The supplied key is normalized according to persisted
|
||||
* {@link TrieMetadata#caseProcessingMode()} before traversal.
|
||||
*
|
||||
* @param key key to resolve
|
||||
* @return most frequent value, or {@code null} if the key does not exist or no
|
||||
* value is stored at the addressed node
|
||||
@@ -142,7 +171,7 @@ public final class FrequencyTrie<V> {
|
||||
*/
|
||||
public V get(final String key) {
|
||||
Objects.requireNonNull(key, "key");
|
||||
final CompiledNode<V> node = findNode(key);
|
||||
final CompiledNode<V> node = findNode(normalizeLookupKey(key));
|
||||
if (node == null || node.orderedValues().length == 0) {
|
||||
return null;
|
||||
}
|
||||
@@ -162,6 +191,10 @@ public final class FrequencyTrie<V> {
|
||||
* <p>
|
||||
* The returned array is a defensive copy.
|
||||
*
|
||||
* <p>
|
||||
* The supplied key is normalized according to persisted
|
||||
* {@link TrieMetadata#caseProcessingMode()} before traversal.
|
||||
*
|
||||
* @param key key to resolve
|
||||
* @return all values stored at the addressed node, ordered by descending
|
||||
* frequency; returns an empty array if the key does not exist or no
|
||||
@@ -170,7 +203,7 @@ public final class FrequencyTrie<V> {
|
||||
*/
|
||||
public V[] getAll(final String key) {
|
||||
Objects.requireNonNull(key, "key");
|
||||
final CompiledNode<V> node = findNode(key);
|
||||
final CompiledNode<V> node = findNode(normalizeLookupKey(key));
|
||||
if (node == null || node.orderedValues().length == 0) {
|
||||
return this.arrayFactory.apply(0);
|
||||
}
|
||||
@@ -201,7 +234,7 @@ public final class FrequencyTrie<V> {
|
||||
*/
|
||||
public List<ValueCount<V>> getEntries(final String key) {
|
||||
Objects.requireNonNull(key, "key");
|
||||
final CompiledNode<V> node = findNode(key);
|
||||
final CompiledNode<V> node = findNode(normalizeLookupKey(key));
|
||||
if (node == null || node.orderedValues().length == 0) {
|
||||
return List.of();
|
||||
}
|
||||
@@ -213,6 +246,29 @@ public final class FrequencyTrie<V> {
|
||||
return Collections.unmodifiableList(entries);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the logical key traversal direction used by this trie.
|
||||
*
|
||||
* <p>
|
||||
* The same direction must be used when reconstructing mutable builders or when
|
||||
* applying patch commands that were generated against keys stored in this trie.
|
||||
* </p>
|
||||
*
|
||||
* @return logical key traversal direction
|
||||
*/
|
||||
public WordTraversalDirection traversalDirection() {
|
||||
return this.metadata.traversalDirection();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns immutable persisted metadata associated with this trie.
|
||||
*
|
||||
* @return trie metadata
|
||||
*/
|
||||
public TrieMetadata metadata() {
|
||||
return this.metadata;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the root node mainly for diagnostics and tests within the package.
|
||||
*
|
||||
@@ -262,6 +318,7 @@ public final class FrequencyTrie<V> {
|
||||
dataOutput.writeInt(STREAM_VERSION);
|
||||
dataOutput.writeInt(orderedNodes.size());
|
||||
dataOutput.writeInt(nodeIds.get(this.root));
|
||||
writeMetadata(dataOutput, this.metadata);
|
||||
|
||||
for (CompiledNode<V> node : orderedNodes) {
|
||||
writeNode(dataOutput, valueCodec, node, nodeIds);
|
||||
@@ -304,7 +361,7 @@ public final class FrequencyTrie<V> {
|
||||
}
|
||||
|
||||
final int version = dataInput.readInt();
|
||||
if (version != STREAM_VERSION) {
|
||||
if (version != 1 && version != 3 && version != 4 && version != STREAM_VERSION) {
|
||||
throw new IOException("Unsupported trie stream version: " + version);
|
||||
}
|
||||
|
||||
@@ -318,6 +375,8 @@ public final class FrequencyTrie<V> {
|
||||
throw new IOException("Invalid root node id: " + rootNodeId);
|
||||
}
|
||||
|
||||
final TrieMetadata metadata = readMetadata(dataInput, version);
|
||||
|
||||
final CompiledNode<V>[] nodes = readNodes(dataInput, arrayFactory, valueCodec, nodeCount);
|
||||
final CompiledNode<V> rootNode = nodes[rootNodeId];
|
||||
|
||||
@@ -325,7 +384,86 @@ public final class FrequencyTrie<V> {
|
||||
LOGGER.log(Level.FINE, "Read compiled trie with {0} canonical nodes.", nodeCount);
|
||||
}
|
||||
|
||||
return new FrequencyTrie<>(arrayFactory, rootNode);
|
||||
return new FrequencyTrie<>(arrayFactory, rootNode, metadata);
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes persisted trie metadata.
|
||||
*
|
||||
* @param dataOutput output stream
|
||||
* @param metadata metadata to serialize
|
||||
* @throws IOException if writing fails
|
||||
*/
|
||||
private static void writeMetadata(final DataOutputStream dataOutput, final TrieMetadata metadata)
|
||||
throws IOException {
|
||||
dataOutput.writeUTF(metadata.toTextBlock());
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads persisted trie metadata while remaining backward compatible with
|
||||
* earlier stream versions.
|
||||
*
|
||||
* @param dataInput input stream
|
||||
* @param version persisted stream version
|
||||
* @return deserialized metadata
|
||||
* @throws IOException if the metadata section is invalid
|
||||
*/
|
||||
private static TrieMetadata readMetadata(final DataInputStream dataInput, final int version) throws IOException {
|
||||
if (version >= 5) { // NOPMD
|
||||
try {
|
||||
return TrieMetadata.fromTextBlock(version, dataInput.readUTF());
|
||||
} catch (IllegalArgumentException exception) {
|
||||
throw new IOException("Invalid metadata block.", exception);
|
||||
}
|
||||
}
|
||||
|
||||
final WordTraversalDirection traversalDirection;
|
||||
if (version >= 2) { // NOPMD
|
||||
final int traversalDirectionOrdinal = dataInput.readInt();
|
||||
final WordTraversalDirection[] traversalDirections = WordTraversalDirection.values();
|
||||
if (traversalDirectionOrdinal < 0 || traversalDirectionOrdinal >= traversalDirections.length) {
|
||||
throw new IOException("Invalid traversal direction ordinal: " + traversalDirectionOrdinal);
|
||||
}
|
||||
traversalDirection = traversalDirections[traversalDirectionOrdinal];
|
||||
} else {
|
||||
traversalDirection = WordTraversalDirection.BACKWARD;
|
||||
}
|
||||
|
||||
if (version < 3) { // NOPMD
|
||||
return TrieMetadata.legacy(version, traversalDirection);
|
||||
}
|
||||
|
||||
final ReductionMode[] reductionModes = ReductionMode.values();
|
||||
final int reductionModeOrdinal = dataInput.readInt();
|
||||
if (reductionModeOrdinal < 0 || reductionModeOrdinal >= reductionModes.length) {
|
||||
throw new IOException("Invalid reduction mode ordinal: " + reductionModeOrdinal);
|
||||
}
|
||||
|
||||
final int dominantWinnerMinPercent = dataInput.readInt();
|
||||
final int dominantWinnerOverSecondRatio = dataInput.readInt(); // NOPMD
|
||||
|
||||
final DiacriticProcessingMode[] diacriticProcessingModes = DiacriticProcessingMode.values();
|
||||
final int diacriticProcessingModeOrdinal = dataInput.readInt(); // NOPMD
|
||||
if (diacriticProcessingModeOrdinal < 0 || diacriticProcessingModeOrdinal >= diacriticProcessingModes.length) {
|
||||
throw new IOException("Invalid diacritic processing mode ordinal: " + diacriticProcessingModeOrdinal);
|
||||
}
|
||||
|
||||
final CaseProcessingMode caseProcessingMode;
|
||||
if (version >= 4) { // NOPMD
|
||||
final CaseProcessingMode[] caseProcessingModes = CaseProcessingMode.values();
|
||||
final int caseProcessingModeOrdinal = dataInput.readInt();
|
||||
if (caseProcessingModeOrdinal < 0 || caseProcessingModeOrdinal >= caseProcessingModes.length) {
|
||||
throw new IOException("Invalid case processing mode ordinal: " + caseProcessingModeOrdinal);
|
||||
}
|
||||
caseProcessingMode = caseProcessingModes[caseProcessingModeOrdinal];
|
||||
} else {
|
||||
caseProcessingMode = CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT;
|
||||
}
|
||||
|
||||
return new TrieMetadata(version, traversalDirection,
|
||||
new ReductionSettings(reductionModes[reductionModeOrdinal], dominantWinnerMinPercent,
|
||||
dominantWinnerOverSecondRatio),
|
||||
diacriticProcessingModes[diacriticProcessingModeOrdinal], caseProcessingMode);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -501,13 +639,14 @@ public final class FrequencyTrie<V> {
|
||||
/**
|
||||
* Locates the compiled node for the supplied key.
|
||||
*
|
||||
* @param key key to resolve
|
||||
* @param key already-normalized key to resolve
|
||||
* @return compiled node, or {@code null} if the path does not exist
|
||||
*/
|
||||
private CompiledNode<V> findNode(final String key) {
|
||||
CompiledNode<V> current = this.root;
|
||||
for (int index = 0; index < key.length(); index++) {
|
||||
current = current.findChild(key.charAt(index));
|
||||
for (int traversalOffset = 0; traversalOffset < key.length(); traversalOffset++) {
|
||||
current = current.findChild(
|
||||
key.charAt(this.metadata.traversalDirection().logicalIndex(key.length(), traversalOffset)));
|
||||
if (current == null) {
|
||||
return null;
|
||||
}
|
||||
@@ -515,6 +654,29 @@ public final class FrequencyTrie<V> {
|
||||
return current;
|
||||
}
|
||||
|
||||
/**
|
||||
* Applies lookup-time case normalization according to persisted metadata.
|
||||
*
|
||||
* @param key lookup key
|
||||
* @return normalized key for trie traversal
|
||||
*/
|
||||
private String normalizeLookupKey(final String key) {
|
||||
String normalized = key;
|
||||
|
||||
if (this.metadata.caseProcessingMode() == CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT) {
|
||||
normalized = normalized.toLowerCase(Locale.ROOT);
|
||||
}
|
||||
|
||||
if (this.metadata.diacriticProcessingMode() == DiacriticProcessingMode.REMOVE) {
|
||||
normalized = DiacriticStripper.strip(normalized);
|
||||
} else if (this.metadata.diacriticProcessingMode() == DiacriticProcessingMode.AS_IS_AND_STRIPPED_FALLBACK) {
|
||||
throw new UnsupportedOperationException(
|
||||
"Diacritic processing mode AS_IS_AND_STRIPPED_FALLBACK is not supported yet.");
|
||||
}
|
||||
|
||||
return normalized;
|
||||
}
|
||||
|
||||
/**
|
||||
* Builder of {@link FrequencyTrie}.
|
||||
*
|
||||
@@ -544,6 +706,21 @@ public final class FrequencyTrie<V> {
|
||||
*/
|
||||
private final ReductionSettings reductionSettings;
|
||||
|
||||
/**
|
||||
* Logical key traversal direction used by this builder.
|
||||
*/
|
||||
private final WordTraversalDirection traversalDirection;
|
||||
|
||||
/**
|
||||
* Dictionary case processing mode associated with this builder.
|
||||
*/
|
||||
private final CaseProcessingMode caseProcessingMode;
|
||||
|
||||
/**
|
||||
* Dictionary diacritic processing mode associated with this builder.
|
||||
*/
|
||||
private final DiacriticProcessingMode diacriticProcessingMode;
|
||||
|
||||
/**
|
||||
* Mutable root node.
|
||||
*/
|
||||
@@ -552,13 +729,69 @@ public final class FrequencyTrie<V> {
|
||||
/**
|
||||
* Creates a new builder with the provided settings.
|
||||
*
|
||||
* <p>
|
||||
* This constructor preserves the historical Egothor behavior and therefore
|
||||
* traverses logical keys from their end toward their beginning.
|
||||
* </p>
|
||||
*
|
||||
* @param arrayFactory array factory
|
||||
* @param reductionSettings reduction configuration
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
*/
|
||||
public Builder(final IntFunction<V[]> arrayFactory, final ReductionSettings reductionSettings) {
|
||||
this(arrayFactory, reductionSettings, WordTraversalDirection.BACKWARD);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new builder with the provided settings and explicit traversal
|
||||
* direction.
|
||||
*
|
||||
* @param arrayFactory array factory
|
||||
* @param reductionSettings reduction configuration
|
||||
* @param traversalDirection logical key traversal direction
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
*/
|
||||
public Builder(final IntFunction<V[]> arrayFactory, final ReductionSettings reductionSettings,
|
||||
final WordTraversalDirection traversalDirection) {
|
||||
this(arrayFactory, reductionSettings, traversalDirection, CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new builder with the provided settings, explicit traversal
|
||||
* direction, and explicit case processing mode.
|
||||
*
|
||||
* @param arrayFactory array factory
|
||||
* @param reductionSettings reduction configuration
|
||||
* @param traversalDirection logical key traversal direction
|
||||
* @param caseProcessingMode dictionary case processing mode
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
*/
|
||||
public Builder(final IntFunction<V[]> arrayFactory, final ReductionSettings reductionSettings,
|
||||
final WordTraversalDirection traversalDirection, final CaseProcessingMode caseProcessingMode) {
|
||||
this(arrayFactory, reductionSettings, traversalDirection, caseProcessingMode,
|
||||
DiacriticProcessingMode.AS_IS);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new builder with the provided settings, explicit traversal
|
||||
* direction, explicit case processing mode, and explicit diacritic processing
|
||||
* mode.
|
||||
*
|
||||
* @param arrayFactory array factory
|
||||
* @param reductionSettings reduction configuration
|
||||
* @param traversalDirection logical key traversal direction
|
||||
* @param caseProcessingMode dictionary case processing mode
|
||||
* @param diacriticProcessingMode dictionary diacritic processing mode
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
*/
|
||||
public Builder(final IntFunction<V[]> arrayFactory, final ReductionSettings reductionSettings,
|
||||
final WordTraversalDirection traversalDirection, final CaseProcessingMode caseProcessingMode,
|
||||
final DiacriticProcessingMode diacriticProcessingMode) {
|
||||
this.arrayFactory = Objects.requireNonNull(arrayFactory, "arrayFactory");
|
||||
this.reductionSettings = Objects.requireNonNull(reductionSettings, "reductionSettings");
|
||||
this.traversalDirection = Objects.requireNonNull(traversalDirection, "traversalDirection");
|
||||
this.caseProcessingMode = Objects.requireNonNull(caseProcessingMode, "caseProcessingMode");
|
||||
this.diacriticProcessingMode = Objects.requireNonNull(diacriticProcessingMode, "diacriticProcessingMode");
|
||||
this.root = new MutableNode<>();
|
||||
}
|
||||
|
||||
@@ -566,12 +799,31 @@ public final class FrequencyTrie<V> {
|
||||
* Creates a new builder using default thresholds for the supplied reduction
|
||||
* mode.
|
||||
*
|
||||
* <p>
|
||||
* This constructor preserves the historical Egothor behavior and therefore
|
||||
* traverses logical keys from their end toward their beginning.
|
||||
* </p>
|
||||
*
|
||||
* @param arrayFactory array factory
|
||||
* @param reductionMode reduction mode
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
*/
|
||||
public Builder(final IntFunction<V[]> arrayFactory, final ReductionMode reductionMode) {
|
||||
this(arrayFactory, ReductionSettings.withDefaults(reductionMode));
|
||||
this(arrayFactory, ReductionSettings.withDefaults(reductionMode), WordTraversalDirection.BACKWARD);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new builder using default thresholds for the supplied reduction
|
||||
* mode and explicit traversal direction.
|
||||
*
|
||||
* @param arrayFactory array factory
|
||||
* @param reductionMode reduction mode
|
||||
* @param traversalDirection logical key traversal direction
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
*/
|
||||
public Builder(final IntFunction<V[]> arrayFactory, final ReductionMode reductionMode,
|
||||
final WordTraversalDirection traversalDirection) {
|
||||
this(arrayFactory, ReductionSettings.withDefaults(reductionMode), traversalDirection);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -611,7 +863,9 @@ public final class FrequencyTrie<V> {
|
||||
reductionContext.canonicalNodeCount());
|
||||
}
|
||||
|
||||
return new FrequencyTrie<>(this.arrayFactory, compiledRoot);
|
||||
final TrieMetadata metadata = TrieMetadata.forCompilation(this.traversalDirection, this.reductionSettings,
|
||||
this.diacriticProcessingMode, this.caseProcessingMode);
|
||||
return new FrequencyTrie<>(this.arrayFactory, compiledRoot, metadata);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -645,9 +899,12 @@ public final class FrequencyTrie<V> {
|
||||
throw new IllegalArgumentException("count must be at least 1.");
|
||||
}
|
||||
|
||||
final String normalizedKey = normalizeDictionaryKey(key);
|
||||
|
||||
MutableNode<V> current = this.root;
|
||||
for (int index = 0; index < key.length(); index++) {
|
||||
final Character edge = key.charAt(index);
|
||||
for (int traversalOffset = 0; traversalOffset < normalizedKey.length(); traversalOffset++) {
|
||||
final Character edge = normalizedKey
|
||||
.charAt(this.traversalDirection.logicalIndex(normalizedKey.length(), traversalOffset));
|
||||
MutableNode<V> child = current.children().get(edge);
|
||||
if (child == null) {
|
||||
child = new MutableNode<>(); // NOPMD
|
||||
@@ -665,6 +922,30 @@ public final class FrequencyTrie<V> {
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Applies build-time dictionary-key normalization according to the builder
|
||||
* configuration.
|
||||
*
|
||||
* @param key dictionary key
|
||||
* @return normalized key for trie insertion
|
||||
*/
|
||||
private String normalizeDictionaryKey(final String key) {
|
||||
String normalized = key;
|
||||
|
||||
if (this.caseProcessingMode == CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT) {
|
||||
normalized = normalized.toLowerCase(Locale.ROOT);
|
||||
}
|
||||
|
||||
if (this.diacriticProcessingMode == DiacriticProcessingMode.REMOVE) {
|
||||
normalized = DiacriticStripper.strip(normalized);
|
||||
} else if (this.diacriticProcessingMode == DiacriticProcessingMode.AS_IS_AND_STRIPPED_FALLBACK) {
|
||||
throw new UnsupportedOperationException(
|
||||
"Diacritic processing mode AS_IS_AND_STRIPPED_FALLBACK is not supported yet.");
|
||||
}
|
||||
|
||||
return normalized;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the number of mutable build-time nodes currently reachable from the
|
||||
* builder root.
|
||||
@@ -679,6 +960,15 @@ public final class FrequencyTrie<V> {
|
||||
return countMutableNodes(this.root);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the logical key traversal direction used by this builder.
|
||||
*
|
||||
* @return logical key traversal direction
|
||||
*/
|
||||
/* default */ WordTraversalDirection traversalDirection() {
|
||||
return this.traversalDirection;
|
||||
}
|
||||
|
||||
/**
|
||||
* Counts mutable nodes recursively.
|
||||
*
|
||||
|
||||
@@ -87,10 +87,12 @@ public final class FrequencyTrieBuilders {
|
||||
Objects.requireNonNull(arrayFactory, "arrayFactory");
|
||||
Objects.requireNonNull(reductionSettings, "reductionSettings");
|
||||
|
||||
final FrequencyTrie.Builder<V> builder = new FrequencyTrie.Builder<>(arrayFactory, reductionSettings);
|
||||
final FrequencyTrie.Builder<V> builder = new FrequencyTrie.Builder<>(arrayFactory, reductionSettings,
|
||||
source.traversalDirection(), source.metadata().caseProcessingMode(),
|
||||
source.metadata().diacriticProcessingMode());
|
||||
final StringBuilder keyBuilder = new StringBuilder(64);
|
||||
|
||||
copyNode(source.root(), keyBuilder, builder);
|
||||
copyNode(source.root(), keyBuilder, builder, source.traversalDirection());
|
||||
|
||||
LOGGER.log(Level.FINE, "Reconstructed writable builder from compiled trie.");
|
||||
return builder;
|
||||
@@ -120,17 +122,19 @@ public final class FrequencyTrieBuilders {
|
||||
* @param node current compiled node
|
||||
* @param keyBuilder current key builder
|
||||
* @param builder target mutable builder
|
||||
* @param traversalDirection logical key traversal direction used by the source
|
||||
* @param <V> value type
|
||||
*/
|
||||
private static <V> void copyNode(final CompiledNode<V> node, final StringBuilder keyBuilder,
|
||||
final FrequencyTrie.Builder<V> builder) {
|
||||
final FrequencyTrie.Builder<V> builder, final WordTraversalDirection traversalDirection) {
|
||||
final String logicalKey = traversalDirection.traversalPathToLogicalKey(keyBuilder);
|
||||
for (int valueIndex = 0; valueIndex < node.orderedValues().length; valueIndex++) {
|
||||
builder.put(keyBuilder.toString(), node.orderedValues()[valueIndex], node.orderedCounts()[valueIndex]);
|
||||
builder.put(logicalKey, node.orderedValues()[valueIndex], node.orderedCounts()[valueIndex]);
|
||||
}
|
||||
|
||||
for (int childIndex = 0; childIndex < node.edgeLabels().length; childIndex++) {
|
||||
keyBuilder.append(node.edgeLabels()[childIndex]);
|
||||
copyNode(node.children()[childIndex], keyBuilder, builder);
|
||||
copyNode(node.children()[childIndex], keyBuilder, builder, traversalDirection);
|
||||
keyBuilder.setLength(keyBuilder.length() - 1);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -30,6 +30,7 @@
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import java.util.Objects;
|
||||
import java.util.concurrent.locks.ReentrantLock;
|
||||
|
||||
/**
|
||||
@@ -37,10 +38,19 @@ import java.util.concurrent.locks.ReentrantLock;
|
||||
* and applies such commands back to source words.
|
||||
*
|
||||
* <p>
|
||||
* The generated patch command follows the historical Egothor convention:
|
||||
* instructions are serialized so that they are applied from the end of the
|
||||
* source word toward its beginning. This keeps the command stream compact and
|
||||
* matches the behavior expected by existing stemming data.
|
||||
* The historical Egothor patch language is defined for backward traversal, that
|
||||
* is, from the logical end of a word toward its beginning. This implementation
|
||||
* preserves that proven opcode semantics as the single internal representation.
|
||||
* Forward traversal is implemented by translating source and target words to
|
||||
* the equivalent reversed logical form at the API boundary and then delegating
|
||||
* to the same backward encoder and decoder.
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* This design keeps the patch language stable, avoids maintaining two distinct
|
||||
* opcode interpreters, and guarantees that forward traversal is semantically
|
||||
* equivalent to running the historical algorithm on the reversed logical word
|
||||
* form.
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
@@ -57,8 +67,19 @@ import java.util.concurrent.locks.ReentrantLock;
|
||||
* instance can still be used safely when needed.
|
||||
* </p>
|
||||
*/
|
||||
@SuppressWarnings("PMD.CyclomaticComplexity")
|
||||
public final class PatchCommandEncoder {
|
||||
|
||||
/**
|
||||
* Backward direction apply strategy with no runtime direction branching.
|
||||
*/
|
||||
private static final ApplyStrategy BACKWARD_APPLY_STRATEGY = PatchCommandEncoder::applyBackward;
|
||||
|
||||
/**
|
||||
* Forward direction apply strategy with no runtime direction branching.
|
||||
*/
|
||||
private static final ApplyStrategy FORWARD_APPLY_STRATEGY = PatchCommandEncoder::applyForward;
|
||||
|
||||
/**
|
||||
* Serialized opcode for deleting one or more characters.
|
||||
*/
|
||||
@@ -87,12 +108,6 @@ public final class PatchCommandEncoder {
|
||||
|
||||
/**
|
||||
* Serialized opcode for a canonical no-operation patch.
|
||||
*
|
||||
* <p>
|
||||
* This opcode represents an identity transform of the whole source word. It is
|
||||
* used to ensure that equal source and target words always produce the same
|
||||
* serialized patch command.
|
||||
* </p>
|
||||
*/
|
||||
private static final char NOOP_OPCODE = 'N';
|
||||
|
||||
@@ -103,11 +118,6 @@ public final class PatchCommandEncoder {
|
||||
|
||||
/**
|
||||
* Canonical serialized no-operation patch.
|
||||
*
|
||||
* <p>
|
||||
* This constant is returned by {@link #encode(String, String)} whenever source
|
||||
* and target are equal.
|
||||
* </p>
|
||||
*/
|
||||
/* default */ static final String NOOP_PATCH = String.valueOf(new char[] { NOOP_OPCODE, NOOP_ARGUMENT });
|
||||
|
||||
@@ -118,13 +128,6 @@ public final class PatchCommandEncoder {
|
||||
|
||||
/**
|
||||
* Extra matrix headroom reserved beyond the immediately required dimensions.
|
||||
*
|
||||
* <p>
|
||||
* A small fixed margin reduces repeated reallocation when a caller encodes many
|
||||
* similarly sized terms in sequence. The value is intentionally modest: large
|
||||
* enough to absorb minor size fluctuations, yet small enough to avoid
|
||||
* materially over-allocating the reused dynamic-programming matrices.
|
||||
* </p>
|
||||
*/
|
||||
private static final int CAPACITY_MARGIN = 8;
|
||||
|
||||
@@ -148,6 +151,17 @@ public final class PatchCommandEncoder {
|
||||
*/
|
||||
private final int matchCost;
|
||||
|
||||
/**
|
||||
* Direction in which words are traversed during both patch serialization and
|
||||
* patch application.
|
||||
*/
|
||||
private final WordTraversalDirection traversalDirection;
|
||||
|
||||
/**
|
||||
* Direction-specialized patch apply strategy.
|
||||
*/
|
||||
private final ApplyStrategy applyStrategy;
|
||||
|
||||
/**
|
||||
* Currently allocated source dimension of reusable matrices.
|
||||
*/
|
||||
@@ -178,53 +192,49 @@ public final class PatchCommandEncoder {
|
||||
*/
|
||||
private enum Trace {
|
||||
|
||||
/**
|
||||
* Deletes one character from the source sequence.
|
||||
*/
|
||||
/** Deletes one character from the source sequence. */
|
||||
DELETE,
|
||||
|
||||
/**
|
||||
* Inserts one character from the target sequence.
|
||||
*/
|
||||
/** Inserts one character from the target sequence. */
|
||||
INSERT,
|
||||
|
||||
/**
|
||||
* Replaces one source character with one target character.
|
||||
*/
|
||||
/** Replaces one source character with one target character. */
|
||||
REPLACE,
|
||||
|
||||
/**
|
||||
* Keeps one matching character unchanged.
|
||||
*/
|
||||
/** Keeps one matching character unchanged. */
|
||||
MATCH
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates an encoder with the traditional Egothor cost model: insert = 1,
|
||||
* delete = 1, replace = 1, match = 0.
|
||||
* Direction-specialized patch application strategy.
|
||||
*/
|
||||
public PatchCommandEncoder() {
|
||||
this(1, 1, 1, 0);
|
||||
@FunctionalInterface
|
||||
private interface ApplyStrategy {
|
||||
/**
|
||||
* Applies the command.
|
||||
*
|
||||
* @param source original text
|
||||
* @param patchCommand patch command
|
||||
* @return final text after applying the command
|
||||
*/
|
||||
String apply(String source, String patchCommand);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates an encoder with explicit operation costs.
|
||||
*
|
||||
* @param insertCost cost of inserting one character
|
||||
* @param deleteCost cost of deleting one character
|
||||
* @param replaceCost cost of replacing one character
|
||||
* @param matchCost cost of keeping one equal character unchanged
|
||||
*/
|
||||
public PatchCommandEncoder(int insertCost, int deleteCost, int replaceCost, int matchCost) {
|
||||
private PatchCommandEncoder(final Builder builder) {
|
||||
this.traversalDirection = Objects.requireNonNull(builder.traversalDirection, "traversalDirection");
|
||||
final int insertCost = builder.insertCost;
|
||||
if (insertCost < 0) {
|
||||
throw new IllegalArgumentException("insertCost must be non-negative.");
|
||||
}
|
||||
final int deleteCost = builder.deleteCost;
|
||||
if (deleteCost < 0) {
|
||||
throw new IllegalArgumentException("deleteCost must be non-negative.");
|
||||
}
|
||||
final int replaceCost = builder.replaceCost;
|
||||
if (replaceCost < 0) {
|
||||
throw new IllegalArgumentException("replaceCost must be non-negative.");
|
||||
}
|
||||
final int matchCost = builder.matchCost;
|
||||
if (matchCost < 0) {
|
||||
throw new IllegalArgumentException("matchCost must be non-negative.");
|
||||
}
|
||||
@@ -233,12 +243,22 @@ public final class PatchCommandEncoder {
|
||||
this.deleteCost = deleteCost;
|
||||
this.replaceCost = replaceCost;
|
||||
this.matchCost = matchCost;
|
||||
this.applyStrategy = applyStrategyFor(this.traversalDirection);
|
||||
this.sourceCapacity = 0;
|
||||
this.targetCapacity = 0;
|
||||
this.costMatrix = new int[0][0];
|
||||
this.traceMatrix = new Trace[0][0];
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a fluent builder for constructing a direction-specialized encoder.
|
||||
*
|
||||
* @return new builder instance
|
||||
*/
|
||||
public static Builder builder() {
|
||||
return new Builder();
|
||||
}
|
||||
|
||||
/**
|
||||
* Produces a compact patch command that transforms {@code source} into
|
||||
* {@code target}.
|
||||
@@ -248,55 +268,139 @@ public final class PatchCommandEncoder {
|
||||
* @return compact patch command, or {@code null} when any argument is
|
||||
* {@code null}
|
||||
*/
|
||||
public String encode(String source, String target) {
|
||||
public String encode(final String source, final String target) {
|
||||
if (source == null || target == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
if (source.equals(target)) {
|
||||
return NOOP_PATCH;
|
||||
}
|
||||
|
||||
int sourceLength = source.length();
|
||||
int targetLength = target.length();
|
||||
|
||||
lock.lock();
|
||||
try {
|
||||
ensureCapacity(sourceLength + 1, targetLength + 1);
|
||||
initializeBoundaryConditions(sourceLength, targetLength);
|
||||
|
||||
char[] sourceCharacters = source.toCharArray();
|
||||
char[] targetCharacters = target.toCharArray();
|
||||
|
||||
fillMatrices(sourceCharacters, targetCharacters, sourceLength, targetLength);
|
||||
|
||||
return buildPatchCommand(targetCharacters, sourceLength, targetLength);
|
||||
} finally {
|
||||
lock.unlock();
|
||||
if (this.traversalDirection == WordTraversalDirection.BACKWARD) {
|
||||
return encodeBackward(source, target);
|
||||
}
|
||||
return encodeForward(source, target);
|
||||
}
|
||||
|
||||
/**
|
||||
* Applies a compact patch command to the supplied source word.
|
||||
* Applies a compact patch command using this encoder instance traversal
|
||||
* direction.
|
||||
*
|
||||
* <p>
|
||||
* This method operates directly on serialized opcodes rather than mapping them
|
||||
* to another representation. That keeps the hot path small and avoids
|
||||
* unnecessary indirection during patch application.
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* For compatibility with the historical behavior, malformed patch input that
|
||||
* causes index failures results in the original source word being returned
|
||||
* unchanged.
|
||||
* This is the branch-free instance-level fast path for repeated patch
|
||||
* application in a known traversal direction.
|
||||
* </p>
|
||||
*
|
||||
* @param source original source word
|
||||
* @param patchCommand compact patch command
|
||||
* @return transformed word, or {@code null} when {@code source} is {@code null}
|
||||
*/
|
||||
public String applyWithConfiguredDirection(final String source, final String patchCommand) {
|
||||
if (source == null) {
|
||||
return null;
|
||||
}
|
||||
return this.applyStrategy.apply(source, patchCommand);
|
||||
}
|
||||
|
||||
/**
|
||||
* Applies a compact patch command to the supplied source word using the
|
||||
* historical backward traversal direction.
|
||||
*
|
||||
* @param source original source word
|
||||
* @param patchCommand compact patch command
|
||||
* @return transformed word, or {@code null} when {@code source} is {@code null}
|
||||
*/
|
||||
public static String apply(final String source, final String patchCommand) {
|
||||
return apply(source, patchCommand, WordTraversalDirection.BACKWARD);
|
||||
}
|
||||
|
||||
/**
|
||||
* Applies a compact patch command to the supplied source word using the
|
||||
* specified traversal direction.
|
||||
*
|
||||
* <p>
|
||||
* The implementation uses dedicated direction-specific patch decoders.
|
||||
* </p>
|
||||
*
|
||||
* @param source original source word
|
||||
* @param patchCommand compact patch command
|
||||
* @param traversalDirection traversal direction used by the patch command
|
||||
* @return transformed word, or {@code null} when {@code source} is {@code null}
|
||||
*/
|
||||
public static String apply(final String source, final String patchCommand,
|
||||
final WordTraversalDirection traversalDirection) {
|
||||
Objects.requireNonNull(traversalDirection, "traversalDirection");
|
||||
if (source == null) {
|
||||
return null;
|
||||
}
|
||||
return applyStrategyFor(traversalDirection).apply(source, patchCommand);
|
||||
}
|
||||
|
||||
/**
|
||||
* Encodes a patch command using the historical backward Egothor semantics.
|
||||
*
|
||||
* @param source source word form in legacy backward logical space
|
||||
* @param target target word form in legacy backward logical space
|
||||
* @return compact patch command
|
||||
*/
|
||||
private String encodeBackward(final String source, final String target) {
|
||||
final int sourceLength = source.length();
|
||||
final int targetLength = target.length();
|
||||
|
||||
lock.lock();
|
||||
try {
|
||||
ensureCapacity(sourceLength + 1, targetLength + 1);
|
||||
initializeBoundaryConditionsBackward(sourceLength, targetLength);
|
||||
|
||||
final char[] sourceCharacters = source.toCharArray();
|
||||
final char[] targetCharacters = target.toCharArray();
|
||||
|
||||
fillMatrices(sourceCharacters, targetCharacters, sourceLength, targetLength,
|
||||
WordTraversalDirection.BACKWARD);
|
||||
|
||||
return buildPatchCommandBackward(targetCharacters, sourceLength, targetLength);
|
||||
} finally {
|
||||
lock.unlock();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Encodes a patch command using forward traversal semantics.
|
||||
*
|
||||
* @param source source word form
|
||||
* @param target target word form
|
||||
* @return compact patch command
|
||||
*/
|
||||
private String encodeForward(final String source, final String target) {
|
||||
final int sourceLength = source.length();
|
||||
final int targetLength = target.length();
|
||||
|
||||
lock.lock();
|
||||
try {
|
||||
ensureCapacity(sourceLength + 1, targetLength + 1);
|
||||
initializeBoundaryConditionsForward(sourceLength, targetLength);
|
||||
|
||||
final char[] sourceCharacters = source.toCharArray();
|
||||
final char[] targetCharacters = target.toCharArray();
|
||||
|
||||
fillMatrices(sourceCharacters, targetCharacters, sourceLength, targetLength,
|
||||
WordTraversalDirection.FORWARD);
|
||||
|
||||
return buildPatchCommandForward(targetCharacters, sourceLength, targetLength);
|
||||
} finally {
|
||||
lock.unlock();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Applies a patch command using the historical backward Egothor semantics.
|
||||
*
|
||||
* @param source original source word in legacy backward logical space
|
||||
* @param patchCommand compact patch command
|
||||
* @return transformed word, or {@code null} when {@code source} is {@code null}
|
||||
*/
|
||||
@SuppressWarnings({ "PMD.CyclomaticComplexity", "PMD.AvoidLiteralsInIfCondition" })
|
||||
public static String apply(String source, String patchCommand) {
|
||||
private static String applyBackward(final String source, final String patchCommand) {
|
||||
if (source == null) {
|
||||
return null;
|
||||
}
|
||||
@@ -306,24 +410,21 @@ public final class PatchCommandEncoder {
|
||||
if (NOOP_PATCH.equals(patchCommand)) {
|
||||
return source;
|
||||
}
|
||||
|
||||
if ((patchCommand.length() & 1) != 0) {
|
||||
return source;
|
||||
}
|
||||
|
||||
StringBuilder result = new StringBuilder(source);
|
||||
|
||||
final StringBuilder result = new StringBuilder(source);
|
||||
if (result.isEmpty()) {
|
||||
return applyToEmptySource(result, patchCommand);
|
||||
return applyBackwardToEmptySource(result, patchCommand);
|
||||
}
|
||||
|
||||
int position = result.length() - 1;
|
||||
|
||||
try {
|
||||
for (int patchIndex = 0, patchLength = patchCommand.length(); patchIndex < patchLength; patchIndex += 2) { // NOPMD
|
||||
|
||||
char opcode = patchCommand.charAt(patchIndex);
|
||||
char argument = patchCommand.charAt(patchIndex + 1);
|
||||
final char opcode = patchCommand.charAt(patchIndex);
|
||||
final char argument = patchCommand.charAt(patchIndex + 1);
|
||||
|
||||
switch (opcode) {
|
||||
case SKIP_OPCODE:
|
||||
@@ -343,7 +444,7 @@ public final class PatchCommandEncoder {
|
||||
if (deleteCount < 1) {
|
||||
return source;
|
||||
}
|
||||
int deleteEndExclusive = position + 1;
|
||||
final int deleteEndExclusive = position + 1;
|
||||
position -= deleteCount - 1;
|
||||
result.delete(position, deleteEndExclusive);
|
||||
break;
|
||||
@@ -373,27 +474,86 @@ public final class PatchCommandEncoder {
|
||||
}
|
||||
|
||||
/**
|
||||
* Decodes a compact count argument used by skip and delete instructions.
|
||||
* Applies a patch command using forward traversal semantics.
|
||||
*
|
||||
* <p>
|
||||
* Valid encoded counts start at {@code 'a'} for one affected character. Values
|
||||
* below {@code 'a'} are malformed and are reported to callers via the
|
||||
* compatibility fallback path rather than by throwing a dedicated exception.
|
||||
* </p>
|
||||
*
|
||||
* @param argument serialized count argument
|
||||
* @return decoded positive count, or {@code -1} when the argument is malformed
|
||||
* @param source original source word
|
||||
* @param patchCommand compact patch command
|
||||
* @return transformed word, or {@code null} when {@code source} is {@code null}
|
||||
*/
|
||||
@SuppressWarnings("PMD.AvoidLiteralsInIfCondition")
|
||||
private static int decodeEncodedCount(final char argument) {
|
||||
if (argument < 'a') {
|
||||
return -1;
|
||||
@SuppressWarnings({ "PMD.CyclomaticComplexity", "PMD.AvoidLiteralsInIfCondition" })
|
||||
private static String applyForward(final String source, final String patchCommand) {
|
||||
if (source == null) {
|
||||
return null;
|
||||
}
|
||||
return argument - 'a' + 1;
|
||||
if (patchCommand == null || patchCommand.isEmpty()) {
|
||||
return source;
|
||||
}
|
||||
if (NOOP_PATCH.equals(patchCommand)) {
|
||||
return source;
|
||||
}
|
||||
if ((patchCommand.length() & 1) != 0) {
|
||||
return source;
|
||||
}
|
||||
|
||||
final StringBuilder result = new StringBuilder(source);
|
||||
if (result.isEmpty()) {
|
||||
return applyForwardToEmptySource(result, patchCommand);
|
||||
}
|
||||
|
||||
int position = 0;
|
||||
|
||||
try {
|
||||
for (int patchIndex = 0, patchLength = patchCommand.length(); patchIndex < patchLength; patchIndex += 2) { // NOPMD
|
||||
final char opcode = patchCommand.charAt(patchIndex);
|
||||
final char argument = patchCommand.charAt(patchIndex + 1);
|
||||
|
||||
switch (opcode) {
|
||||
case SKIP_OPCODE:
|
||||
final int skipCount = decodeEncodedCount(argument);
|
||||
if (skipCount < 1) {
|
||||
return source;
|
||||
}
|
||||
position = position + skipCount - 1;
|
||||
break;
|
||||
|
||||
case REPLACE_OPCODE:
|
||||
result.setCharAt(position, argument);
|
||||
break;
|
||||
|
||||
case DELETE_OPCODE:
|
||||
final int deleteCount = decodeEncodedCount(argument);
|
||||
if (deleteCount < 1) {
|
||||
return source;
|
||||
}
|
||||
result.delete(position, position + deleteCount);
|
||||
position--;
|
||||
break;
|
||||
|
||||
case INSERT_OPCODE:
|
||||
result.insert(position, argument);
|
||||
break;
|
||||
|
||||
case NOOP_OPCODE:
|
||||
if (argument != NOOP_ARGUMENT) {
|
||||
throw new IllegalArgumentException("Unsupported NOOP patch argument: " + argument);
|
||||
}
|
||||
return source;
|
||||
|
||||
default:
|
||||
throw new IllegalArgumentException("Unsupported patch opcode: " + opcode);
|
||||
}
|
||||
|
||||
position++;
|
||||
}
|
||||
} catch (IndexOutOfBoundsException exception) {
|
||||
return source;
|
||||
}
|
||||
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Applies a patch command to an empty source word.
|
||||
* Applies a backward patch command to an empty source word.
|
||||
*
|
||||
* <p>
|
||||
* Only insertion instructions are meaningful for an empty source. Skip,
|
||||
@@ -407,12 +567,11 @@ public final class PatchCommandEncoder {
|
||||
* @return transformed word, or the original empty word when the patch is
|
||||
* malformed
|
||||
*/
|
||||
private static String applyToEmptySource(StringBuilder result, String patchCommand) {
|
||||
private static String applyBackwardToEmptySource(final StringBuilder result, final String patchCommand) {
|
||||
try {
|
||||
for (int patchIndex = 0, patchLength = patchCommand.length(); patchIndex < patchLength; patchIndex += 2) { // NOPMD
|
||||
|
||||
char opcode = patchCommand.charAt(patchIndex);
|
||||
char argument = patchCommand.charAt(patchIndex + 1);
|
||||
final char opcode = patchCommand.charAt(patchIndex);
|
||||
final char argument = patchCommand.charAt(patchIndex + 1);
|
||||
|
||||
switch (opcode) {
|
||||
case INSERT_OPCODE:
|
||||
@@ -441,6 +600,71 @@ public final class PatchCommandEncoder {
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Applies a forward patch command to an empty source word.
|
||||
*
|
||||
* @param result empty result builder
|
||||
* @param patchCommand compact patch command
|
||||
* @return transformed word, or the original empty word when the patch is
|
||||
* malformed
|
||||
*/
|
||||
private static String applyForwardToEmptySource(final StringBuilder result, final String patchCommand) {
|
||||
try {
|
||||
for (int patchIndex = 0, patchLength = patchCommand.length(); patchIndex < patchLength; patchIndex += 2) { // NOPMD
|
||||
final char opcode = patchCommand.charAt(patchIndex);
|
||||
final char argument = patchCommand.charAt(patchIndex + 1);
|
||||
|
||||
switch (opcode) {
|
||||
case INSERT_OPCODE:
|
||||
result.append(argument);
|
||||
break;
|
||||
|
||||
case SKIP_OPCODE:
|
||||
case REPLACE_OPCODE:
|
||||
case DELETE_OPCODE:
|
||||
return "";
|
||||
|
||||
case NOOP_OPCODE:
|
||||
if (argument != NOOP_ARGUMENT) {
|
||||
throw new IllegalArgumentException("Unsupported NOOP patch argument: " + argument);
|
||||
}
|
||||
return "";
|
||||
|
||||
default:
|
||||
throw new IllegalArgumentException("Unsupported patch opcode: " + opcode);
|
||||
}
|
||||
}
|
||||
} catch (IndexOutOfBoundsException exception) {
|
||||
return "";
|
||||
}
|
||||
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the direction-specialized apply strategy.
|
||||
*
|
||||
* @param traversalDirection requested traversal direction
|
||||
* @return branch-free apply strategy for that direction
|
||||
*/
|
||||
private static ApplyStrategy applyStrategyFor(final WordTraversalDirection traversalDirection) {
|
||||
return traversalDirection == WordTraversalDirection.BACKWARD ? BACKWARD_APPLY_STRATEGY : FORWARD_APPLY_STRATEGY;
|
||||
}
|
||||
|
||||
/**
|
||||
* Decodes a compact count argument used by skip and delete instructions.
|
||||
*
|
||||
* @param argument serialized count argument
|
||||
* @return decoded positive count, or {@code -1} when the argument is malformed
|
||||
*/
|
||||
@SuppressWarnings("PMD.AvoidLiteralsInIfCondition")
|
||||
private static int decodeEncodedCount(final char argument) {
|
||||
if (argument < 'a') {
|
||||
return -1;
|
||||
}
|
||||
return argument - 'a' + 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Ensures that internal matrices are large enough for the requested input
|
||||
* dimensions.
|
||||
@@ -448,16 +672,16 @@ public final class PatchCommandEncoder {
|
||||
* @param requiredSourceCapacity required source dimension
|
||||
* @param requiredTargetCapacity required target dimension
|
||||
*/
|
||||
private void ensureCapacity(int requiredSourceCapacity, int requiredTargetCapacity) {
|
||||
if (requiredSourceCapacity <= sourceCapacity && requiredTargetCapacity <= targetCapacity) {
|
||||
private void ensureCapacity(final int requiredSourceCapacity, final int requiredTargetCapacity) {
|
||||
if (requiredSourceCapacity <= this.sourceCapacity && requiredTargetCapacity <= this.targetCapacity) {
|
||||
return;
|
||||
}
|
||||
|
||||
sourceCapacity = Math.max(sourceCapacity, requiredSourceCapacity) + CAPACITY_MARGIN;
|
||||
targetCapacity = Math.max(targetCapacity, requiredTargetCapacity) + CAPACITY_MARGIN;
|
||||
this.sourceCapacity = Math.max(this.sourceCapacity, requiredSourceCapacity) + CAPACITY_MARGIN;
|
||||
this.targetCapacity = Math.max(this.targetCapacity, requiredTargetCapacity) + CAPACITY_MARGIN;
|
||||
|
||||
costMatrix = new int[sourceCapacity][targetCapacity];
|
||||
traceMatrix = new Trace[sourceCapacity][targetCapacity];
|
||||
this.costMatrix = new int[this.sourceCapacity][this.targetCapacity];
|
||||
this.traceMatrix = new Trace[this.sourceCapacity][this.targetCapacity];
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -467,18 +691,41 @@ public final class PatchCommandEncoder {
|
||||
* @param sourceLength length of the source word
|
||||
* @param targetLength length of the target word
|
||||
*/
|
||||
private void initializeBoundaryConditions(int sourceLength, int targetLength) {
|
||||
costMatrix[0][0] = 0;
|
||||
traceMatrix[0][0] = Trace.MATCH;
|
||||
private void initializeBoundaryConditionsBackward(final int sourceLength, final int targetLength) {
|
||||
this.costMatrix[0][0] = 0;
|
||||
this.traceMatrix[0][0] = Trace.MATCH;
|
||||
|
||||
for (int sourceIndex = 1; sourceIndex <= sourceLength; sourceIndex++) {
|
||||
costMatrix[sourceIndex][0] = sourceIndex * deleteCost;
|
||||
traceMatrix[sourceIndex][0] = Trace.DELETE;
|
||||
this.costMatrix[sourceIndex][0] = sourceIndex * this.deleteCost;
|
||||
this.traceMatrix[sourceIndex][0] = Trace.DELETE;
|
||||
}
|
||||
|
||||
for (int targetIndex = 1; targetIndex <= targetLength; targetIndex++) {
|
||||
costMatrix[0][targetIndex] = targetIndex * insertCost;
|
||||
traceMatrix[0][targetIndex] = Trace.INSERT;
|
||||
this.costMatrix[0][targetIndex] = targetIndex * this.insertCost;
|
||||
this.traceMatrix[0][targetIndex] = Trace.INSERT;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Initializes boundary conditions for forward dynamic-programming traversal.
|
||||
*
|
||||
* @param sourceLength length of the source word
|
||||
* @param targetLength length of the target word
|
||||
*/
|
||||
private void initializeBoundaryConditionsForward(final int sourceLength, final int targetLength) {
|
||||
this.costMatrix[sourceLength][targetLength] = 0;
|
||||
this.traceMatrix[sourceLength][targetLength] = Trace.MATCH;
|
||||
|
||||
for (int sourceIndex = sourceLength - 1; sourceIndex >= 0; sourceIndex--) {
|
||||
this.costMatrix[sourceIndex][targetLength] = this.costMatrix[sourceIndex + 1][targetLength]
|
||||
+ this.deleteCost;
|
||||
this.traceMatrix[sourceIndex][targetLength] = Trace.DELETE;
|
||||
}
|
||||
|
||||
for (int targetIndex = targetLength - 1; targetIndex >= 0; targetIndex--) {
|
||||
this.costMatrix[sourceLength][targetIndex] = this.costMatrix[sourceLength][targetIndex + 1]
|
||||
+ this.insertCost;
|
||||
this.traceMatrix[sourceLength][targetIndex] = Trace.INSERT;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -490,20 +737,58 @@ public final class PatchCommandEncoder {
|
||||
* @param targetCharacters target characters
|
||||
* @param sourceLength source length
|
||||
* @param targetLength target length
|
||||
* @param direction traversal direction used to compare characters
|
||||
*/
|
||||
private void fillMatrices(char[] sourceCharacters, char[] targetCharacters, int sourceLength, int targetLength) {
|
||||
private void fillMatrices(final char[] sourceCharacters, final char[] targetCharacters, final int sourceLength,
|
||||
final int targetLength, final WordTraversalDirection direction) {
|
||||
final int sourceStart;
|
||||
final int sourceEndExclusive;
|
||||
final int sourceStep;
|
||||
final int targetStart;
|
||||
final int targetEndExclusive;
|
||||
final int targetStep;
|
||||
final int sourceCharacterOffset;
|
||||
final int targetCharacterOffset;
|
||||
final int sourceNeighborDelta;
|
||||
final int targetNeighborDelta;
|
||||
|
||||
for (int sourceIndex = 1; sourceIndex <= sourceLength; sourceIndex++) {
|
||||
char sourceCharacter = sourceCharacters[sourceIndex - 1];
|
||||
if (direction == WordTraversalDirection.BACKWARD) {
|
||||
sourceStart = 1;
|
||||
sourceEndExclusive = sourceLength + 1;
|
||||
sourceStep = 1;
|
||||
targetStart = 1;
|
||||
targetEndExclusive = targetLength + 1;
|
||||
targetStep = 1;
|
||||
sourceCharacterOffset = -1;
|
||||
targetCharacterOffset = -1;
|
||||
sourceNeighborDelta = -1;
|
||||
targetNeighborDelta = -1;
|
||||
} else {
|
||||
sourceStart = sourceLength - 1;
|
||||
sourceEndExclusive = -1;
|
||||
sourceStep = -1;
|
||||
targetStart = targetLength - 1;
|
||||
targetEndExclusive = -1;
|
||||
targetStep = -1;
|
||||
sourceCharacterOffset = 0;
|
||||
targetCharacterOffset = 0;
|
||||
sourceNeighborDelta = 1;
|
||||
targetNeighborDelta = 1;
|
||||
}
|
||||
|
||||
for (int targetIndex = 1; targetIndex <= targetLength; targetIndex++) {
|
||||
char targetCharacter = targetCharacters[targetIndex - 1];
|
||||
for (int sourceIndex = sourceStart; sourceIndex != sourceEndExclusive; sourceIndex += sourceStep) {
|
||||
final char sourceCharacter = sourceCharacters[sourceIndex + sourceCharacterOffset];
|
||||
final int sourceNeighbor = sourceIndex + sourceNeighborDelta;
|
||||
|
||||
int deleteCandidate = costMatrix[sourceIndex - 1][targetIndex] + deleteCost;
|
||||
int insertCandidate = costMatrix[sourceIndex][targetIndex - 1] + insertCost;
|
||||
int replaceCandidate = costMatrix[sourceIndex - 1][targetIndex - 1] + replaceCost;
|
||||
int matchCandidate = costMatrix[sourceIndex - 1][targetIndex - 1]
|
||||
+ (sourceCharacter == targetCharacter ? matchCost : MISMATCH_PENALTY);
|
||||
for (int targetIndex = targetStart; targetIndex != targetEndExclusive; targetIndex += targetStep) {
|
||||
final char targetCharacter = targetCharacters[targetIndex + targetCharacterOffset];
|
||||
final int targetNeighbor = targetIndex + targetNeighborDelta;
|
||||
|
||||
final int deleteCandidate = this.costMatrix[sourceNeighbor][targetIndex] + this.deleteCost;
|
||||
final int insertCandidate = this.costMatrix[sourceIndex][targetNeighbor] + this.insertCost;
|
||||
final int replaceCandidate = this.costMatrix[sourceNeighbor][targetNeighbor] + this.replaceCost;
|
||||
final int matchCandidate = this.costMatrix[sourceNeighbor][targetNeighbor]
|
||||
+ (sourceCharacter == targetCharacter ? this.matchCost : MISMATCH_PENALTY);
|
||||
|
||||
int bestCost = matchCandidate;
|
||||
Trace bestTrace = Trace.MATCH;
|
||||
@@ -521,8 +806,8 @@ public final class PatchCommandEncoder {
|
||||
bestTrace = Trace.REPLACE;
|
||||
}
|
||||
|
||||
costMatrix[sourceIndex][targetIndex] = bestCost;
|
||||
traceMatrix[sourceIndex][targetIndex] = bestTrace;
|
||||
this.costMatrix[sourceIndex][targetIndex] = bestCost;
|
||||
this.traceMatrix[sourceIndex][targetIndex] = bestTrace;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -536,9 +821,9 @@ public final class PatchCommandEncoder {
|
||||
* @param targetLength target length
|
||||
* @return compact patch command
|
||||
*/
|
||||
private String buildPatchCommand(char[] targetCharacters, int sourceLength, int targetLength) {
|
||||
|
||||
StringBuilder patchBuilder = new StringBuilder(sourceLength + targetLength);
|
||||
private String buildPatchCommandBackward(final char[] targetCharacters, final int sourceLength,
|
||||
final int targetLength) {
|
||||
final StringBuilder patchBuilder = new StringBuilder(sourceLength + targetLength);
|
||||
|
||||
char pendingDeletes = COUNT_SENTINEL;
|
||||
char pendingSkips = COUNT_SENTINEL;
|
||||
@@ -547,7 +832,7 @@ public final class PatchCommandEncoder {
|
||||
int targetIndex = targetLength;
|
||||
|
||||
while (sourceIndex != 0 || targetIndex != 0) {
|
||||
Trace trace = traceMatrix[sourceIndex][targetIndex];
|
||||
final Trace trace = this.traceMatrix[sourceIndex][targetIndex];
|
||||
|
||||
switch (trace) {
|
||||
case DELETE:
|
||||
@@ -605,6 +890,83 @@ public final class PatchCommandEncoder {
|
||||
return patchBuilder.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Reconstructs compact patch command for forward traversal.
|
||||
*
|
||||
* @param targetCharacters target characters
|
||||
* @param sourceLength source length
|
||||
* @param targetLength target length
|
||||
* @return compact patch command
|
||||
*/
|
||||
private String buildPatchCommandForward(final char[] targetCharacters, final int sourceLength,
|
||||
final int targetLength) {
|
||||
final StringBuilder patchBuilder = new StringBuilder(sourceLength + targetLength);
|
||||
|
||||
char pendingDeletes = COUNT_SENTINEL;
|
||||
char pendingSkips = COUNT_SENTINEL;
|
||||
|
||||
int sourceIndex = 0;
|
||||
int targetIndex = 0;
|
||||
|
||||
while (sourceIndex != sourceLength || targetIndex != targetLength) {
|
||||
final Trace trace = this.traceMatrix[sourceIndex][targetIndex];
|
||||
|
||||
switch (trace) {
|
||||
case DELETE:
|
||||
if (pendingSkips != COUNT_SENTINEL) {
|
||||
appendInstruction(patchBuilder, SKIP_OPCODE, pendingSkips);
|
||||
pendingSkips = COUNT_SENTINEL;
|
||||
}
|
||||
pendingDeletes++;
|
||||
sourceIndex++;
|
||||
break;
|
||||
|
||||
case INSERT:
|
||||
if (pendingDeletes != COUNT_SENTINEL) {
|
||||
appendInstruction(patchBuilder, DELETE_OPCODE, pendingDeletes);
|
||||
pendingDeletes = COUNT_SENTINEL;
|
||||
}
|
||||
if (pendingSkips != COUNT_SENTINEL) {
|
||||
appendInstruction(patchBuilder, SKIP_OPCODE, pendingSkips);
|
||||
pendingSkips = COUNT_SENTINEL;
|
||||
}
|
||||
appendInstruction(patchBuilder, INSERT_OPCODE, targetCharacters[targetIndex]);
|
||||
targetIndex++;
|
||||
break;
|
||||
|
||||
case REPLACE:
|
||||
if (pendingDeletes != COUNT_SENTINEL) {
|
||||
appendInstruction(patchBuilder, DELETE_OPCODE, pendingDeletes);
|
||||
pendingDeletes = COUNT_SENTINEL;
|
||||
}
|
||||
if (pendingSkips != COUNT_SENTINEL) {
|
||||
appendInstruction(patchBuilder, SKIP_OPCODE, pendingSkips);
|
||||
pendingSkips = COUNT_SENTINEL;
|
||||
}
|
||||
appendInstruction(patchBuilder, REPLACE_OPCODE, targetCharacters[targetIndex]);
|
||||
sourceIndex++;
|
||||
targetIndex++;
|
||||
break;
|
||||
|
||||
case MATCH:
|
||||
if (pendingDeletes != COUNT_SENTINEL) {
|
||||
appendInstruction(patchBuilder, DELETE_OPCODE, pendingDeletes);
|
||||
pendingDeletes = COUNT_SENTINEL;
|
||||
}
|
||||
pendingSkips++;
|
||||
sourceIndex++;
|
||||
targetIndex++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (pendingDeletes != COUNT_SENTINEL) {
|
||||
appendInstruction(patchBuilder, DELETE_OPCODE, pendingDeletes);
|
||||
}
|
||||
|
||||
return patchBuilder.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Appends one serialized instruction to the patch command builder.
|
||||
*
|
||||
@@ -612,7 +974,91 @@ public final class PatchCommandEncoder {
|
||||
* @param opcode single-character instruction opcode
|
||||
* @param argument encoded instruction argument
|
||||
*/
|
||||
private static void appendInstruction(StringBuilder patchBuilder, char opcode, char argument) {
|
||||
private static void appendInstruction(final StringBuilder patchBuilder, final char opcode, final char argument) {
|
||||
patchBuilder.append(opcode).append(argument);
|
||||
}
|
||||
|
||||
/**
|
||||
* Fluent builder for creating direction-specialized {@link PatchCommandEncoder}
|
||||
* instances.
|
||||
*/
|
||||
public static final class Builder {
|
||||
private WordTraversalDirection traversalDirection = WordTraversalDirection.BACKWARD;
|
||||
private int insertCost = 1;
|
||||
private int deleteCost = 1;
|
||||
private int replaceCost = 1;
|
||||
private int matchCost; // = 0
|
||||
|
||||
/**
|
||||
* Creates a builder initialized with the default Egothor-compatible cost model
|
||||
* and backward traversal.
|
||||
*/
|
||||
public Builder() {
|
||||
// Default values are assigned in field initializers.
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets traversal direction used by the created encoder.
|
||||
*
|
||||
* @param value traversal direction
|
||||
* @return this builder
|
||||
*/
|
||||
public Builder traversalDirection(final WordTraversalDirection value) {
|
||||
this.traversalDirection = Objects.requireNonNull(value, "traversalDirection");
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets cost of an insert operation.
|
||||
*
|
||||
* @param value cost of the operation
|
||||
* @return this builder
|
||||
*/
|
||||
public Builder insertCost(final int value) {
|
||||
this.insertCost = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets cost of a delete operation.
|
||||
*
|
||||
* @param value cost of the operation
|
||||
* @return this builder
|
||||
*/
|
||||
public Builder deleteCost(final int value) {
|
||||
this.deleteCost = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets cost of a replace operation.
|
||||
*
|
||||
* @param value cost of the operation
|
||||
* @return this builder
|
||||
*/
|
||||
public Builder replaceCost(final int value) {
|
||||
this.replaceCost = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets cost of a match operation.
|
||||
*
|
||||
* @param value cost of the operation
|
||||
* @return this builder
|
||||
*/
|
||||
public Builder matchCost(final int value) {
|
||||
this.matchCost = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds a direction-specialized encoder instance.
|
||||
*
|
||||
* @return configured encoder
|
||||
*/
|
||||
public PatchCommandEncoder build() {
|
||||
return new PatchCommandEncoder(this);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -36,9 +36,10 @@ import java.io.Reader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Objects;
|
||||
import java.util.StringTokenizer;
|
||||
import java.util.logging.Level;
|
||||
import java.util.logging.Logger;
|
||||
|
||||
@@ -46,14 +47,14 @@ import java.util.logging.Logger;
|
||||
* Parser of line-oriented stemmer dictionary files.
|
||||
*
|
||||
* <p>
|
||||
* Each non-empty logical line consists of a stem followed by zero or more known
|
||||
* word variants separated by whitespace. The first token is interpreted as the
|
||||
* canonical stem, and every following token on the same line is interpreted as
|
||||
* a variant belonging to that stem.
|
||||
* Each non-empty logical line uses a tab-separated values layout. The first
|
||||
* column is interpreted as the canonical stem, and every following
|
||||
* tab-separated column on the same line is interpreted as a variant belonging
|
||||
* to that stem.
|
||||
*
|
||||
* <p>
|
||||
* Input lines are normalized to lower case using {@link Locale#ROOT}. Leading
|
||||
* and trailing whitespace is ignored.
|
||||
* Input line case normalization is controlled by {@link CaseProcessingMode}.
|
||||
* Leading and trailing whitespace around each column is ignored.
|
||||
*
|
||||
* <p>
|
||||
* The parser supports line remarks and trailing remarks. The remark markers
|
||||
@@ -61,6 +62,13 @@ import java.util.logging.Logger;
|
||||
* remainder of that line is ignored.
|
||||
*
|
||||
* <p>
|
||||
* Dictionary items containing any Unicode whitespace character are currently
|
||||
* not supported. Such items are ignored and reported through a single
|
||||
* {@link Level#WARNING warning}-level log entry per physical line together with
|
||||
* the source line number, the normalized stem column, and the list of ignored
|
||||
* items from that line.
|
||||
*
|
||||
* <p>
|
||||
* This class is intentionally stateless and allocation-light so it can be used
|
||||
* both by runtime loading and by offline compilation tooling.
|
||||
*/
|
||||
@@ -105,11 +113,27 @@ public final class StemmerDictionaryParser {
|
||||
* @throws IOException if reading fails
|
||||
*/
|
||||
public static ParseStatistics parse(final Path path, final EntryHandler entryHandler) throws IOException {
|
||||
return parse(path, CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT, entryHandler);
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses a dictionary file from a filesystem path.
|
||||
*
|
||||
* @param path dictionary file path
|
||||
* @param caseProcessingMode case processing mode
|
||||
* @param entryHandler handler receiving parsed entries
|
||||
* @return parsing statistics
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
* @throws IOException if reading fails
|
||||
*/
|
||||
public static ParseStatistics parse(final Path path, final CaseProcessingMode caseProcessingMode,
|
||||
final EntryHandler entryHandler) throws IOException {
|
||||
Objects.requireNonNull(path, "path");
|
||||
Objects.requireNonNull(caseProcessingMode, "caseProcessingMode");
|
||||
Objects.requireNonNull(entryHandler, "entryHandler");
|
||||
|
||||
try (BufferedReader reader = Files.newBufferedReader(path, StandardCharsets.UTF_8)) {
|
||||
return parse(reader, path.toAbsolutePath().toString(), entryHandler);
|
||||
return parse(reader, path.toAbsolutePath().toString(), caseProcessingMode, entryHandler);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -124,7 +148,23 @@ public final class StemmerDictionaryParser {
|
||||
*/
|
||||
public static ParseStatistics parse(final String fileName, final EntryHandler entryHandler) throws IOException {
|
||||
Objects.requireNonNull(fileName, "fileName");
|
||||
return parse(Path.of(fileName), entryHandler);
|
||||
return parse(Path.of(fileName), CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT, entryHandler);
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses a dictionary file from a path string.
|
||||
*
|
||||
* @param fileName dictionary file name or path string
|
||||
* @param caseProcessingMode case processing mode
|
||||
* @param entryHandler handler receiving parsed entries
|
||||
* @return parsing statistics
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
* @throws IOException if reading fails
|
||||
*/
|
||||
public static ParseStatistics parse(final String fileName, final CaseProcessingMode caseProcessingMode,
|
||||
final EntryHandler entryHandler) throws IOException {
|
||||
Objects.requireNonNull(fileName, "fileName");
|
||||
return parse(Path.of(fileName), caseProcessingMode, entryHandler);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -139,8 +179,25 @@ public final class StemmerDictionaryParser {
|
||||
*/
|
||||
public static ParseStatistics parse(final Reader reader, final String sourceDescription,
|
||||
final EntryHandler entryHandler) throws IOException {
|
||||
return parse(reader, sourceDescription, CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT, entryHandler);
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses a dictionary from a reader.
|
||||
*
|
||||
* @param reader source reader
|
||||
* @param sourceDescription logical source description for diagnostics
|
||||
* @param caseProcessingMode case processing mode
|
||||
* @param entryHandler handler receiving parsed entries
|
||||
* @return parsing statistics
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
* @throws IOException if reading or handler processing fails
|
||||
*/
|
||||
public static ParseStatistics parse(final Reader reader, final String sourceDescription,
|
||||
final CaseProcessingMode caseProcessingMode, final EntryHandler entryHandler) throws IOException {
|
||||
Objects.requireNonNull(reader, "reader");
|
||||
Objects.requireNonNull(sourceDescription, "sourceDescription");
|
||||
Objects.requireNonNull(caseProcessingMode, "caseProcessingMode");
|
||||
Objects.requireNonNull(entryHandler, "entryHandler");
|
||||
|
||||
final BufferedReader bufferedReader = reader instanceof BufferedReader ? (BufferedReader) reader
|
||||
@@ -153,26 +210,56 @@ public final class StemmerDictionaryParser {
|
||||
for (String line = bufferedReader.readLine(); line != null; line = bufferedReader.readLine()) {
|
||||
lineNumber++;
|
||||
|
||||
final String normalizedLine = stripRemark(line).trim().toLowerCase(Locale.ROOT);
|
||||
final String normalizedLine = normalizeLineCase(stripRemark(line).trim(), caseProcessingMode);
|
||||
if (normalizedLine.isEmpty()) {
|
||||
ignoredLineCount++;
|
||||
continue;
|
||||
}
|
||||
|
||||
final StringTokenizer tokenizer = new StringTokenizer(normalizedLine); // NOPMD
|
||||
if (!tokenizer.hasMoreTokens()) {
|
||||
final String[] rawColumns = normalizedLine.split("\t", -1);
|
||||
if (rawColumns.length == 0) {
|
||||
ignoredLineCount++;
|
||||
continue;
|
||||
}
|
||||
|
||||
final String stem = tokenizer.nextToken();
|
||||
final String[] variants = new String[tokenizer.countTokens()]; // NOPMD
|
||||
final String stem = rawColumns[0].strip();
|
||||
final List<String> acceptedVariants = new ArrayList<String>(Math.max(0, rawColumns.length - 1)); // NOPMD
|
||||
|
||||
for (int index = 0; index < variants.length; index++) {
|
||||
variants[index] = tokenizer.nextToken();
|
||||
if (stem.isEmpty()) {
|
||||
ignoredLineCount++;
|
||||
continue;
|
||||
}
|
||||
|
||||
entryHandler.onEntry(stem, variants, lineNumber);
|
||||
if (containsWhitespaceCharacter(stem)) {
|
||||
if (LOGGER.isLoggable(Level.WARNING)) {
|
||||
LOGGER.log(Level.WARNING,
|
||||
"Ignoring dictionary line containing whitespace in source {0} at line {1}, stem {2}.",
|
||||
new Object[] { sourceDescription, lineNumber, stem }); // NOPMD
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
int ignored = 0;
|
||||
|
||||
for (int index = 1; index < rawColumns.length; index++) {
|
||||
final String variant = rawColumns[index].strip();
|
||||
if (variant.isEmpty()) {
|
||||
continue;
|
||||
}
|
||||
if (containsWhitespaceCharacter(variant)) {
|
||||
ignored++;
|
||||
continue;
|
||||
}
|
||||
acceptedVariants.add(variant);
|
||||
}
|
||||
|
||||
if (ignored > 0 && LOGGER.isLoggable(Level.WARNING)) {
|
||||
LOGGER.log(Level.WARNING,
|
||||
"Ignoring dictionary items containing whitespace in source {0} at line {1}, stem {2}, ignored {3}:{4}.",
|
||||
new Object[] { sourceDescription, lineNumber, stem, ignored, rawColumns.length }); // NOPMD
|
||||
}
|
||||
|
||||
entryHandler.onEntry(stem, acceptedVariants.toArray(String[]::new), lineNumber);
|
||||
logicalEntryCount++;
|
||||
}
|
||||
|
||||
@@ -188,6 +275,36 @@ public final class StemmerDictionaryParser {
|
||||
return statistics;
|
||||
}
|
||||
|
||||
/**
|
||||
* Applies case normalization to one line according to the selected mode.
|
||||
*
|
||||
* @param line line to normalize
|
||||
* @param caseProcessingMode case processing mode
|
||||
* @return normalized line
|
||||
*/
|
||||
private static String normalizeLineCase(final String line, final CaseProcessingMode caseProcessingMode) {
|
||||
if (caseProcessingMode == CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT) {
|
||||
return line.toLowerCase(Locale.ROOT);
|
||||
}
|
||||
return line;
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines whether one dictionary item contains any Unicode whitespace
|
||||
* character.
|
||||
*
|
||||
* @param item dictionary item to inspect
|
||||
* @return {@code true} when the item contains at least one whitespace character
|
||||
*/
|
||||
private static boolean containsWhitespaceCharacter(final String item) {
|
||||
for (int index = 0; index < item.length(); index++) {
|
||||
if (Character.isWhitespace(item.charAt(index))) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Removes a trailing remark from one physical line.
|
||||
*
|
||||
|
||||
@@ -0,0 +1,758 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.Reader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Objects;
|
||||
import java.util.SplittableRandom;
|
||||
import java.util.logging.Level;
|
||||
import java.util.logging.Logger;
|
||||
|
||||
/**
|
||||
* Evaluates how stemming quality degrades when the compiled trie is built from
|
||||
* only a deterministic subset of the available dictionary knowledge.
|
||||
*
|
||||
* <p>
|
||||
* The experiment operates on whole dictionary entries. For a chosen knowledge
|
||||
* percentage, each parsed dictionary line is deterministically included or
|
||||
* excluded from the training subset using a seeded {@link SplittableRandom}.
|
||||
* The resulting subset is compiled into a {@link FrequencyTrie}, while the
|
||||
* evaluation is performed against all word forms from the original dictionary.
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* Two lookup APIs are evaluated:
|
||||
* </p>
|
||||
* <ul>
|
||||
* <li>{@link FrequencyTrie#get(String)} through top-1 accuracy</li>
|
||||
* <li>{@link FrequencyTrie#getAll(String)} through global precision, recall,
|
||||
* and F1</li>
|
||||
* </ul>
|
||||
*/
|
||||
public final class StemmerKnowledgeExperiment {
|
||||
|
||||
    /**
     * Logger of this class.
     */
    private static final Logger LOGGER = Logger.getLogger(StemmerKnowledgeExperiment.class.getName());

    /**
     * Minimum supported knowledge percentage (inclusive lower bound of the sweep).
     */
    public static final int MINIMUM_KNOWLEDGE_PERCENT = 10;

    /**
     * Maximum supported knowledge percentage (inclusive upper bound of the sweep).
     */
    public static final int MAXIMUM_KNOWLEDGE_PERCENT = 100;

    /**
     * Step between adjacent evaluated knowledge percentages.
     */
    public static final int KNOWLEDGE_PERCENT_STEP = 10;

    /**
     * Canonical no-op patch command, stored for canonical stems when the
     * store-original scenario flag is enabled.
     */
    private static final String NOOP_PATCH_COMMAND = PatchCommandEncoder.NOOP_PATCH;

    /**
     * Shared patch encoder reused for subset compilation.
     */
    private final PatchCommandEncoder patchCommandEncoder;
|
||||
|
||||
    /**
     * Creates a new experiment harness with a default-configured patch encoder.
     */
    public StemmerKnowledgeExperiment() {
        this.patchCommandEncoder = PatchCommandEncoder.builder().build();
    }
|
||||
|
||||
/**
|
||||
* Evaluates all supported bundled dictionaries using the supplied seed.
|
||||
*
|
||||
* @param seed deterministic sampling seed
|
||||
* @return immutable ordered list of experiment rows
|
||||
* @throws IOException if reading a bundled dictionary fails
|
||||
*/
|
||||
public List<ResultRow> evaluateAllBundledLanguages(final long seed) throws IOException {
|
||||
final List<ResultRow> rows = new ArrayList<>();
|
||||
for (StemmerPatchTrieLoader.Language language : StemmerPatchTrieLoader.Language.values()) {
|
||||
rows.addAll(evaluateBundledLanguage(language, seed));
|
||||
}
|
||||
return List.copyOf(rows);
|
||||
}
|
||||
|
||||
/**
|
||||
* Evaluates one bundled dictionary across all supported experiment
|
||||
* configurations.
|
||||
*
|
||||
* @param language bundled language dictionary
|
||||
* @param seed deterministic sampling seed
|
||||
* @return immutable ordered list of experiment rows
|
||||
* @throws NullPointerException if {@code language} is {@code null}
|
||||
* @throws IOException if reading the bundled dictionary fails
|
||||
*/
|
||||
public List<ResultRow> evaluateBundledLanguage(final StemmerPatchTrieLoader.Language language, final long seed)
|
||||
throws IOException {
|
||||
Objects.requireNonNull(language, "language");
|
||||
final String resourcePath = language.resourcePath();
|
||||
try (InputStream inputStream = StemmerPatchTrieLoader.openBundledResource(resourcePath)) {
|
||||
try (BufferedReader reader = new BufferedReader(
|
||||
new InputStreamReader(inputStream, StandardCharsets.UTF_8))) {
|
||||
return evaluate(reader, resourcePath, language.name(), seed);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Evaluates one filesystem dictionary across all supported experiment
|
||||
* configurations.
|
||||
*
|
||||
* @param dictionaryPath path to a dictionary file
|
||||
* @param seed deterministic sampling seed
|
||||
* @return immutable ordered list of experiment rows
|
||||
* @throws NullPointerException if {@code dictionaryPath} is {@code null}
|
||||
* @throws IOException if reading fails
|
||||
*/
|
||||
public List<ResultRow> evaluatePath(final Path dictionaryPath, final long seed) throws IOException {
|
||||
Objects.requireNonNull(dictionaryPath, "dictionaryPath");
|
||||
try (BufferedReader reader = Files.newBufferedReader(dictionaryPath, StandardCharsets.UTF_8)) {
|
||||
return evaluate(reader, dictionaryPath.toAbsolutePath().toString(), dictionaryPath.getFileName().toString(),
|
||||
seed);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Evaluates a dictionary provided through an arbitrary reader.
|
||||
*
|
||||
* @param reader source reader
|
||||
* @param sourceDescription logical source description
|
||||
* @param languageLabel label stored in the result rows
|
||||
* @param seed deterministic sampling seed
|
||||
* @return immutable ordered list of experiment rows
|
||||
* @throws NullPointerException if any argument except {@code seed} is
|
||||
* {@code null}
|
||||
* @throws IOException if parsing fails
|
||||
*/
|
||||
public List<ResultRow> evaluate(final Reader reader, final String sourceDescription, final String languageLabel,
|
||||
final long seed) throws IOException {
|
||||
Objects.requireNonNull(reader, "reader");
|
||||
Objects.requireNonNull(sourceDescription, "sourceDescription");
|
||||
Objects.requireNonNull(languageLabel, "languageLabel");
|
||||
|
||||
final DictionaryData dictionaryData = readDictionary(reader, sourceDescription);
|
||||
final List<ResultRow> rows = new ArrayList<>();
|
||||
|
||||
for (ReductionMode reductionMode : ReductionMode.values()) {
|
||||
final ReductionSettings reductionSettings = ReductionSettings.withDefaults(reductionMode);
|
||||
for (boolean storeOriginal : new boolean[] { false, true }) { // NOPMD
|
||||
for (boolean includeStemInEvaluation : new boolean[] { false, true }) { // NOPMD
|
||||
for (int knowledgePercent = MINIMUM_KNOWLEDGE_PERCENT; knowledgePercent <= MAXIMUM_KNOWLEDGE_PERCENT; knowledgePercent += KNOWLEDGE_PERCENT_STEP) {
|
||||
final ResultRow row = evaluateScenario(dictionaryData, languageLabel, seed, reductionSettings,
|
||||
storeOriginal, includeStemInEvaluation, knowledgePercent);
|
||||
rows.add(row);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (LOGGER.isLoggable(Level.INFO)) {
|
||||
LOGGER.log(Level.INFO, "Knowledge experiment finished for source {0}: entries={1}, rows={2}, seed={3}.",
|
||||
new Object[] { sourceDescription, dictionaryData.entryCount(), rows.size(), seed });
|
||||
}
|
||||
|
||||
return List.copyOf(rows);
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes result rows as UTF-8 CSV with a stable fixed header.
|
||||
*
|
||||
* @param outputPath target file path
|
||||
* @param rows rows to write
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
* @throws IOException if writing fails
|
||||
*/
|
||||
public static void writeCsv(final Path outputPath, final List<ResultRow> rows) throws IOException {
|
||||
Objects.requireNonNull(outputPath, "outputPath");
|
||||
Objects.requireNonNull(rows, "rows");
|
||||
|
||||
final Path parent = outputPath.getParent();
|
||||
if (parent != null) {
|
||||
Files.createDirectories(parent);
|
||||
}
|
||||
|
||||
final List<String> lines = new ArrayList<>(rows.size() + 1);
|
||||
lines.add(ResultRow.csvHeader());
|
||||
for (ResultRow row : rows) {
|
||||
lines.add(row.toCsvRow());
|
||||
}
|
||||
Files.write(outputPath, lines, StandardCharsets.UTF_8);
|
||||
}
|
||||
|
||||
    /**
     * Parses the full dictionary into an in-memory representation suitable for
     * repeated deterministic subset compilation.
     *
     * <p>Entries are stored in parse order; subset selection later relies on
     * this stable ordering to stay reproducible for a fixed seed.</p>
     *
     * @param reader            source reader
     * @param sourceDescription logical source description
     * @return parsed dictionary data
     * @throws IOException if parsing fails
     */
    private static DictionaryData readDictionary(final Reader reader, final String sourceDescription)
            throws IOException {
        final List<DictionaryEntry> entries = new ArrayList<>();
        final StemmerDictionaryParser.ParseStatistics parseStatistics = StemmerDictionaryParser.parse(reader,
                sourceDescription,
                (stem, variants, lineNumber) -> entries.add(new DictionaryEntry(stem, variants, lineNumber)));
        return new DictionaryData(sourceDescription, parseStatistics, entries);
    }
|
||||
|
||||
/**
|
||||
* Evaluates one concrete experiment scenario.
|
||||
*
|
||||
* @param dictionaryData parsed dictionary data
|
||||
* @param languageLabel logical language label
|
||||
* @param seed deterministic sampling seed
|
||||
* @param reductionSettings reduction settings
|
||||
* @param storeOriginal whether canonical stems are inserted with a
|
||||
* no-op patch
|
||||
* @param includeStemInEvaluation whether the canonical stem itself is evaluated
|
||||
* @param knowledgePercent retained percentage of dictionary entries
|
||||
* @return result row
|
||||
*/
|
||||
private ResultRow evaluateScenario(final DictionaryData dictionaryData, final String languageLabel, final long seed,
|
||||
final ReductionSettings reductionSettings, final boolean storeOriginal,
|
||||
final boolean includeStemInEvaluation, final int knowledgePercent) {
|
||||
final FrequencyTrie<String> trie = compileSubset(dictionaryData, reductionSettings, storeOriginal,
|
||||
knowledgePercent, seed);
|
||||
|
||||
long evaluatedInputCount = 0L;
|
||||
long getCorrectCount = 0L;
|
||||
long getAllTruePositiveCount = 0L;
|
||||
long getAllFalsePositiveCount = 0L;
|
||||
long getAllCoveredInputCount = 0L;
|
||||
long uniqueCandidateCount = 0L;
|
||||
|
||||
for (DictionaryEntry entry : dictionaryData.entries()) {
|
||||
if (includeStemInEvaluation) {
|
||||
final EvaluationCounts stemCounts = evaluateInput(entry.stem(), entry.stem(), trie);
|
||||
evaluatedInputCount++;
|
||||
getCorrectCount += stemCounts.getCorrect();
|
||||
getAllTruePositiveCount += stemCounts.getAllTruePositives();
|
||||
getAllFalsePositiveCount += stemCounts.getAllFalsePositives();
|
||||
getAllCoveredInputCount += stemCounts.getAllCoveredInputs();
|
||||
uniqueCandidateCount += stemCounts.getUniqueCandidateCount();
|
||||
}
|
||||
for (String variant : entry.variants()) {
|
||||
final EvaluationCounts variantCounts = evaluateInput(variant, entry.stem(), trie);
|
||||
evaluatedInputCount++;
|
||||
getCorrectCount += variantCounts.getCorrect();
|
||||
getAllTruePositiveCount += variantCounts.getAllTruePositives();
|
||||
getAllFalsePositiveCount += variantCounts.getAllFalsePositives();
|
||||
getAllCoveredInputCount += variantCounts.getAllCoveredInputs();
|
||||
uniqueCandidateCount += variantCounts.getUniqueCandidateCount();
|
||||
}
|
||||
}
|
||||
|
||||
final long trainingEntryCount = countSelectedEntries(dictionaryData.entryCount(), seed, knowledgePercent);
|
||||
final double getAccuracy = ratio(getCorrectCount, evaluatedInputCount);
|
||||
final double getAllPrecision = ratio(getAllTruePositiveCount,
|
||||
getAllTruePositiveCount + getAllFalsePositiveCount);
|
||||
final double getAllRecall = ratio(getAllCoveredInputCount, evaluatedInputCount);
|
||||
final double getAllF1 = f1(getAllPrecision, getAllRecall);
|
||||
final double averageUniqueCandidateCount = ratio(uniqueCandidateCount, evaluatedInputCount);
|
||||
|
||||
return new ResultRow(languageLabel, reductionSettings.reductionMode().name(), storeOriginal,
|
||||
includeStemInEvaluation, knowledgePercent, seed, dictionaryData.entryCount(), trainingEntryCount,
|
||||
evaluatedInputCount, getCorrectCount, getAccuracy, getAllTruePositiveCount, getAllFalsePositiveCount,
|
||||
getAllCoveredInputCount, getAllPrecision, getAllRecall, getAllF1, averageUniqueCandidateCount);
|
||||
}
|
||||
|
||||
/**
|
||||
* Compiles a trie from the deterministically selected subset of dictionary
|
||||
* entries.
|
||||
*
|
||||
* @param dictionaryData parsed dictionary data
|
||||
* @param reductionSettings reduction settings
|
||||
* @param storeOriginal whether stems themselves should be stored
|
||||
* @param knowledgePercent retained percentage of dictionary entries
|
||||
* @param seed deterministic sampling seed
|
||||
* @return compiled trie for the selected subset
|
||||
*/
|
||||
private FrequencyTrie<String> compileSubset(final DictionaryData dictionaryData,
|
||||
final ReductionSettings reductionSettings, final boolean storeOriginal, final int knowledgePercent,
|
||||
final long seed) {
|
||||
validateKnowledgePercent(knowledgePercent);
|
||||
|
||||
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(String[]::new, reductionSettings);
|
||||
final SplittableRandom random = new SplittableRandom(seed);
|
||||
|
||||
for (DictionaryEntry entry : dictionaryData.entries()) {
|
||||
if (!isSelected(random, knowledgePercent)) {
|
||||
continue;
|
||||
}
|
||||
if (storeOriginal) {
|
||||
builder.put(entry.stem(), NOOP_PATCH_COMMAND);
|
||||
}
|
||||
for (String variant : entry.variants()) {
|
||||
final String patch = this.patchCommandEncoder.encode(variant, entry.stem());
|
||||
builder.put(variant, patch);
|
||||
}
|
||||
}
|
||||
return builder.build();
|
||||
}
|
||||
|
||||
/**
|
||||
* Evaluates one input word form against both lookup APIs.
|
||||
*
|
||||
* @param input input form to transform
|
||||
* @param expectedStem expected stem
|
||||
* @param trie compiled trie under test
|
||||
* @return immutable counts for this single input
|
||||
*/
|
||||
private static EvaluationCounts evaluateInput(final String input, final String expectedStem,
|
||||
final FrequencyTrie<String> trie) {
|
||||
long getCorrect = 0L;
|
||||
final String preferredPatch = trie.get(input);
|
||||
if (preferredPatch != null) {
|
||||
final String preferredStem = PatchCommandEncoder.apply(input, preferredPatch);
|
||||
if (expectedStem.equals(preferredStem)) {
|
||||
getCorrect = 1L;
|
||||
}
|
||||
} else {
|
||||
if (expectedStem.equals(input)) {
|
||||
getCorrect = 1L;
|
||||
}
|
||||
}
|
||||
|
||||
final String[] patches = trie.getAll(input);
|
||||
|
||||
long truePositives = 0L;
|
||||
long falsePositives = 0L;
|
||||
long coveredInputs = 0L;
|
||||
for (String patch : patches) {
|
||||
final String candidateStem = PatchCommandEncoder.apply(input, patch);
|
||||
if (expectedStem.equals(candidateStem)) {
|
||||
truePositives++;
|
||||
coveredInputs = 1L;
|
||||
} else {
|
||||
falsePositives++;
|
||||
}
|
||||
}
|
||||
return new EvaluationCounts(getCorrect, truePositives, falsePositives, coveredInputs, patches.length);
|
||||
}
|
||||
|
||||
/**
|
||||
* Counts how many entries would be selected for one scenario without
|
||||
* recompiling the trie.
|
||||
*
|
||||
* @param entryCount total entry count
|
||||
* @param seed deterministic sampling seed
|
||||
* @param knowledgePercent retained percentage of dictionary entries
|
||||
* @return selected entry count
|
||||
*/
|
||||
private static long countSelectedEntries(final int entryCount, final long seed, final int knowledgePercent) {
|
||||
validateKnowledgePercent(knowledgePercent);
|
||||
final SplittableRandom random = new SplittableRandom(seed);
|
||||
long count = 0L;
|
||||
for (int index = 0; index < entryCount; index++) {
|
||||
if (isSelected(random, knowledgePercent)) {
|
||||
count++;
|
||||
}
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns whether one entry is selected for the supplied knowledge level.
|
||||
*
|
||||
* @param random deterministic random source
|
||||
* @param knowledgePercent retained percentage of entries
|
||||
* @return {@code true} when the entry should be kept
|
||||
*/
|
||||
private static boolean isSelected(final SplittableRandom random, final int knowledgePercent) {
|
||||
return random.nextInt(100) < knowledgePercent;
|
||||
}
|
||||
|
||||
/**
|
||||
* Validates one knowledge percentage value.
|
||||
*
|
||||
* @param knowledgePercent value to validate
|
||||
*/
|
||||
private static void validateKnowledgePercent(final int knowledgePercent) {
|
||||
if (knowledgePercent < MINIMUM_KNOWLEDGE_PERCENT || knowledgePercent > MAXIMUM_KNOWLEDGE_PERCENT
|
||||
|| knowledgePercent % KNOWLEDGE_PERCENT_STEP != 0) {
|
||||
throw new IllegalArgumentException(
|
||||
"knowledgePercent must be one of 10, 20, ..., 100 but was " + knowledgePercent + '.');
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes a safe ratio.
|
||||
*
|
||||
* @param numerator numerator
|
||||
* @param denominator denominator
|
||||
* @return ratio, or {@code 0.0} when the denominator is zero
|
||||
*/
|
||||
private static double ratio(final long numerator, final long denominator) {
|
||||
if (denominator == 0L) { // NOPMD
|
||||
return 0.0d;
|
||||
}
|
||||
return (double) numerator / (double) denominator; // NOPMD
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes the harmonic mean of precision and recall.
|
||||
*
|
||||
* @param precision global precision
|
||||
* @param recall global recall
|
||||
* @return F1 score, or {@code 0.0} when both inputs are zero
|
||||
*/
|
||||
private static double f1(final double precision, final double recall) {
|
||||
if (precision == 0.0d && recall == 0.0d) {
|
||||
return 0.0d;
|
||||
}
|
||||
return 2.0d * precision * recall / (precision + recall);
|
||||
}
|
||||
|
||||
/**
|
||||
* One parsed dictionary line.
|
||||
*
|
||||
* @param stem canonical stem
|
||||
* @param variants known variants of the stem
|
||||
* @param lineNumber physical line number in the source dictionary
|
||||
*/
|
||||
private record DictionaryEntry(String stem, String[] variants, int lineNumber) {
|
||||
|
||||
/**
|
||||
* Creates a parsed dictionary entry.
|
||||
*
|
||||
* @param stem canonical stem
|
||||
* @param variants known variants of the stem
|
||||
* @param lineNumber physical line number in the source dictionary
|
||||
*/
|
||||
private DictionaryEntry {
|
||||
Objects.requireNonNull(stem, "stem");
|
||||
Objects.requireNonNull(variants, "variants");
|
||||
if (lineNumber < 1) { // NOPMD
|
||||
throw new IllegalArgumentException("lineNumber must be positive.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
    /**
     * Parsed dictionary state reused across all scenarios.
     *
     * <p>The entry list is snapshotted in the compact constructor, so instances
     * are shallowly immutable and safe to reuse across scenario evaluations.</p>
     *
     * @param sourceDescription logical source description
     * @param parseStatistics   parser statistics
     * @param entries           immutable ordered entries
     */
    private record DictionaryData(String sourceDescription, StemmerDictionaryParser.ParseStatistics parseStatistics,
            List<DictionaryEntry> entries) {

        /**
         * Creates parsed dictionary data.
         *
         * @param sourceDescription logical source description
         * @param parseStatistics   parser statistics
         * @param entries           immutable ordered entries
         */
        private DictionaryData {
            Objects.requireNonNull(sourceDescription, "sourceDescription");
            Objects.requireNonNull(parseStatistics, "parseStatistics");
            Objects.requireNonNull(entries, "entries");
            // Snapshot: the caller keeps filling a mutable ArrayList while parsing.
            entries = List.copyOf(entries);
        }

        /**
         * Returns the number of logical dictionary entries.
         *
         * @return entry count
         */
        private int entryCount() {
            return this.entries.size();
        }
    }
|
||||
|
||||
/**
|
||||
* Per-input evaluation counts.
|
||||
*/
|
||||
private static final class EvaluationCounts {
|
||||
|
||||
/**
|
||||
* Preferred lookup correctness.
|
||||
*/
|
||||
private final long getCorrect;
|
||||
|
||||
/**
|
||||
* Number of correct candidates returned by {@code getAll()}.
|
||||
*/
|
||||
private final long getAllTruePositives;
|
||||
|
||||
/**
|
||||
* Number of incorrect candidates returned by {@code getAll()}.
|
||||
*/
|
||||
private final long getAllFalsePositives;
|
||||
|
||||
/**
|
||||
* Whether the correct stem was covered by {@code getAll()}.
|
||||
*/
|
||||
private final long getAllCoveredInputs;
|
||||
|
||||
/**
|
||||
* Number of candidate commands returned by {@code getAll()}.
|
||||
*/
|
||||
private final long uniqueCandidateCount;
|
||||
|
||||
/**
|
||||
* Creates a new immutable counter object.
|
||||
*
|
||||
* @param getCorrect preferred lookup correctness
|
||||
* @param getAllTruePositives correct candidates
|
||||
* @param getAllFalsePositives incorrect candidates
|
||||
* @param getAllCoveredInputs coverage marker
|
||||
* @param uniqueCandidateCount candidate command count
|
||||
*/
|
||||
private EvaluationCounts(final long getCorrect, final long getAllTruePositives, final long getAllFalsePositives,
|
||||
final long getAllCoveredInputs, final long uniqueCandidateCount) {
|
||||
this.getCorrect = getCorrect;
|
||||
this.getAllTruePositives = getAllTruePositives;
|
||||
this.getAllFalsePositives = getAllFalsePositives;
|
||||
this.getAllCoveredInputs = getAllCoveredInputs;
|
||||
this.uniqueCandidateCount = uniqueCandidateCount;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns preferred lookup correctness.
|
||||
*
|
||||
* @return preferred lookup correctness
|
||||
*/
|
||||
private long getCorrect() {
|
||||
return this.getCorrect;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the number of correct candidates.
|
||||
*
|
||||
* @return correct candidates
|
||||
*/
|
||||
private long getAllTruePositives() {
|
||||
return this.getAllTruePositives;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the number of incorrect candidates.
|
||||
*
|
||||
* @return incorrect candidates
|
||||
*/
|
||||
private long getAllFalsePositives() {
|
||||
return this.getAllFalsePositives;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the per-input coverage marker.
|
||||
*
|
||||
* @return coverage marker
|
||||
*/
|
||||
private long getAllCoveredInputs() {
|
||||
return this.getAllCoveredInputs;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the number of candidate commands.
|
||||
*
|
||||
* @return candidate command count
|
||||
*/
|
||||
private long getUniqueCandidateCount() {
|
||||
return this.uniqueCandidateCount;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* One immutable result row of the knowledge experiment.
|
||||
*
|
||||
* @param language language label
|
||||
* @param reductionMode reduction mode name
|
||||
* @param storeOriginal whether no-op patches were stored for
|
||||
* canonical stems
|
||||
* @param includeStemInEvaluation whether canonical stems were part of the
|
||||
* evaluated inputs
|
||||
* @param knowledgePercent retained knowledge percentage
|
||||
* @param seed deterministic sampling seed
|
||||
* @param dictionaryEntryCount total parsed dictionary entry count
|
||||
* @param trainingEntryCount selected dictionary entry count used for
|
||||
* build
|
||||
* @param evaluatedInputCount total evaluated input count
|
||||
* @param getCorrectCount number of correct preferred
|
||||
* transformations
|
||||
* @param getAccuracy preferred lookup accuracy
|
||||
* @param getAllTruePositiveCount number of unique correct candidates from
|
||||
* {@code getAll()}
|
||||
* @param getAllFalsePositiveCount number of unique incorrect candidates from
|
||||
* {@code getAll()}
|
||||
* @param getAllCoveredInputCount number of inputs for which the correct
|
||||
* stem appeared in {@code getAll()}
|
||||
* @param getAllPrecision global candidate precision for
|
||||
* {@code getAll()}
|
||||
* @param getAllRecall global input recall for {@code getAll()}
|
||||
* @param getAllF1 F1 score derived from {@code getAll()}
|
||||
* precision and recall
|
||||
* @param averageUniqueCandidateCount average number of unique candidate stems
|
||||
* per input
|
||||
*/
|
||||
public record ResultRow(String language, String reductionMode, boolean storeOriginal,
|
||||
boolean includeStemInEvaluation, int knowledgePercent, long seed, int dictionaryEntryCount,
|
||||
long trainingEntryCount, long evaluatedInputCount, long getCorrectCount, double getAccuracy,
|
||||
long getAllTruePositiveCount, long getAllFalsePositiveCount, long getAllCoveredInputCount,
|
||||
double getAllPrecision, double getAllRecall, double getAllF1, double averageUniqueCandidateCount) {
|
||||
|
||||
/**
|
||||
* Creates one immutable result row.
|
||||
*
|
||||
* @param language language label
|
||||
* @param reductionMode reduction mode name
|
||||
* @param storeOriginal whether no-op patches were stored for
|
||||
* canonical stems
|
||||
* @param includeStemInEvaluation whether canonical stems were evaluated
|
||||
* @param knowledgePercent retained knowledge percentage
|
||||
* @param seed deterministic sampling seed
|
||||
* @param dictionaryEntryCount total dictionary entry count
|
||||
* @param trainingEntryCount selected training entry count
|
||||
* @param evaluatedInputCount total evaluated input count
|
||||
* @param getCorrectCount number of correct preferred
|
||||
* transformations
|
||||
* @param getAccuracy preferred lookup accuracy
|
||||
* @param getAllTruePositiveCount number of unique correct candidates
|
||||
* @param getAllFalsePositiveCount number of unique incorrect candidates
|
||||
* @param getAllCoveredInputCount coverage count for {@code getAll()}
|
||||
* @param getAllPrecision global candidate precision for
|
||||
* {@code getAll()}
|
||||
* @param getAllRecall global input recall for {@code getAll()}
|
||||
* @param getAllF1 harmonic mean of precision and recall
|
||||
* @param averageUniqueCandidateCount average unique candidate count per input
|
||||
*/
|
||||
@SuppressWarnings("PMD.AvoidLiteralsInIfCondition")
|
||||
public ResultRow {
|
||||
Objects.requireNonNull(language, "language");
|
||||
Objects.requireNonNull(reductionMode, "reductionMode");
|
||||
validateKnowledgePercent(knowledgePercent);
|
||||
if (dictionaryEntryCount < 0) {
|
||||
throw new IllegalArgumentException("dictionaryEntryCount must not be negative.");
|
||||
}
|
||||
if (trainingEntryCount < 0L) {
|
||||
throw new IllegalArgumentException("trainingEntryCount must not be negative.");
|
||||
}
|
||||
if (evaluatedInputCount < 0L) {
|
||||
throw new IllegalArgumentException("evaluatedInputCount must not be negative.");
|
||||
}
|
||||
if (getCorrectCount < 0L) {
|
||||
throw new IllegalArgumentException("getCorrectCount must not be negative.");
|
||||
}
|
||||
if (getAllTruePositiveCount < 0L) {
|
||||
throw new IllegalArgumentException("getAllTruePositiveCount must not be negative.");
|
||||
}
|
||||
if (getAllFalsePositiveCount < 0L) {
|
||||
throw new IllegalArgumentException("getAllFalsePositiveCount must not be negative.");
|
||||
}
|
||||
if (getAllCoveredInputCount < 0L) {
|
||||
throw new IllegalArgumentException("getAllCoveredInputCount must not be negative.");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the stable CSV header of this result format.
|
||||
*
|
||||
* @return CSV header line
|
||||
*/
|
||||
public static String csvHeader() {
|
||||
return String.join(",",
|
||||
List.of("language", "reductionMode", "storeOriginal", "includeStemInEvaluation", "knowledgePercent",
|
||||
"seed", "dictionaryEntryCount", "trainingEntryCount", "evaluatedInputCount",
|
||||
"getCorrectCount", "getAccuracy", "getAllTruePositiveCount", "getAllFalsePositiveCount",
|
||||
"getAllCoveredInputCount", "getAllPrecision", "getAllRecall", "getAllF1",
|
||||
"averageUniqueCandidateCount"));
|
||||
}
|
||||
|
||||
/**
|
||||
* Serializes this row as one CSV record.
|
||||
*
|
||||
* @return CSV record
|
||||
*/
|
||||
public String toCsvRow() {
|
||||
return String.join(",",
|
||||
List.of(escapeCsv(this.language), escapeCsv(this.reductionMode), String.valueOf(this.storeOriginal),
|
||||
String.valueOf(this.includeStemInEvaluation), String.valueOf(this.knowledgePercent),
|
||||
String.valueOf(this.seed), String.valueOf(this.dictionaryEntryCount),
|
||||
String.valueOf(this.trainingEntryCount), String.valueOf(this.evaluatedInputCount),
|
||||
String.valueOf(this.getCorrectCount), formatDouble(this.getAccuracy),
|
||||
String.valueOf(this.getAllTruePositiveCount), String.valueOf(this.getAllFalsePositiveCount),
|
||||
String.valueOf(this.getAllCoveredInputCount), formatDouble(this.getAllPrecision),
|
||||
formatDouble(this.getAllRecall), formatDouble(this.getAllF1),
|
||||
formatDouble(this.averageUniqueCandidateCount)));
|
||||
}
|
||||
|
||||
/**
|
||||
* Escapes a string for CSV output.
|
||||
*
|
||||
* @param value value to escape
|
||||
* @return escaped CSV cell
|
||||
*/
|
||||
private static String escapeCsv(final String value) {
|
||||
if (value.indexOf(',') < 0 && value.indexOf('"') < 0 && value.indexOf('\n') < 0
|
||||
&& value.indexOf('\r') < 0) {
|
||||
return value;
|
||||
}
|
||||
return '"' + value.replace("\"", "\"\"") + '"';
|
||||
}
|
||||
|
||||
/**
|
||||
* Formats one floating-point value using a locale-independent decimal
|
||||
* representation.
|
||||
*
|
||||
* @param value value to format
|
||||
* @return formatted value
|
||||
*/
|
||||
private static String formatDouble(final double value) {
|
||||
return String.format(Locale.ROOT, "%.10f", value);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,344 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.PrintStream;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.logging.Level;
|
||||
import java.util.logging.Logger;
|
||||
|
||||
/**
|
||||
* Command-line entry point for the stemmer knowledge experiment.
|
||||
*/
|
||||
public final class StemmerKnowledgeExperimentCli {
|
||||
|
||||
/**
|
||||
* Logger of this class.
|
||||
*/
|
||||
private static final Logger LOGGER = Logger.getLogger(StemmerKnowledgeExperimentCli.class.getName());
|
||||
|
||||
/**
|
||||
* Exit status indicating success.
|
||||
*/
|
||||
private static final int EXIT_SUCCESS = 0;
|
||||
|
||||
/**
|
||||
* Exit status indicating processing failure.
|
||||
*/
|
||||
private static final int EXIT_PROCESSING_ERROR = 1;
|
||||
|
||||
/**
|
||||
* Exit status indicating invalid command-line usage.
|
||||
*/
|
||||
private static final int EXIT_USAGE_ERROR = 2;
|
||||
|
||||
/**
|
||||
* Default deterministic seed.
|
||||
*/
|
||||
private static final long DEFAULT_SEED = 20_260_421L;
|
||||
|
||||
/**
|
||||
* Default output report location.
|
||||
*/
|
||||
private static final Path DEFAULT_OUTPUT_PATH = Path.of("build", "reports", "stemmer-knowledge-experiment.csv");
|
||||
|
||||
/**
|
||||
* Usage banner.
|
||||
*/
|
||||
private static final String USAGE = String.join(System.lineSeparator(),
|
||||
"Usage: StemmerKnowledgeExperimentCli [--bundled-all | --bundled-language <LANG> | --input <PATH>]",
|
||||
" [--seed <LONG>] [--output <CSV_PATH>]", "", "Examples:", " --bundled-all",
|
||||
" --bundled-language US_UK_PROFI --seed 20260421",
|
||||
" --input src/main/resources/us_uk/stemmer --output build/reports/knowledge.csv");
|
||||
|
||||
/**
|
||||
* Utility class.
|
||||
*/
|
||||
private StemmerKnowledgeExperimentCli() {
|
||||
throw new AssertionError("No instances.");
|
||||
}
|
||||
|
||||
/**
|
||||
* Executes the CLI as a standalone process.
|
||||
*
|
||||
* @param arguments command-line arguments
|
||||
*/
|
||||
public static void main(final String[] arguments) {
|
||||
final int exitCode = execute(arguments);
|
||||
System.exit(exitCode);
|
||||
}
|
||||
|
||||
/**
|
||||
* Executes the CLI and translates all outcomes to process exit codes.
|
||||
*
|
||||
* @param arguments command-line arguments
|
||||
* @return process exit code
|
||||
*/
|
||||
/* default */ static int execute(final String... arguments) {
|
||||
Objects.requireNonNull(arguments, "arguments");
|
||||
try {
|
||||
final CliOptions options = CliOptions.parse(arguments);
|
||||
if (options.command() == Command.HELP) {
|
||||
printUsage(System.out);
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
return runExperiment(options);
|
||||
} catch (final CliUsageException exception) {
|
||||
if (LOGGER.isLoggable(Level.SEVERE)) {
|
||||
LOGGER.log(Level.SEVERE, "Invalid command-line usage for arguments {0}: {1}",
|
||||
new Object[] { Arrays.toString(arguments), exception.getMessage() });
|
||||
}
|
||||
printUsage(System.err);
|
||||
return EXIT_USAGE_ERROR;
|
||||
} catch (final IOException exception) {
|
||||
if (LOGGER.isLoggable(Level.SEVERE)) {
|
||||
LOGGER.log(Level.SEVERE, "Experiment processing failed for arguments {0}", Arrays.toString(arguments));
|
||||
LOGGER.log(Level.SEVERE, "Processing failure details.", exception);
|
||||
}
|
||||
return EXIT_PROCESSING_ERROR;
|
||||
} catch (final RuntimeException exception) { // NOPMD
|
||||
if (LOGGER.isLoggable(Level.SEVERE)) {
|
||||
LOGGER.log(Level.SEVERE, "Unexpected runtime failure for arguments {0}", Arrays.toString(arguments));
|
||||
LOGGER.log(Level.SEVERE, "Unexpected processing failure details.", exception);
|
||||
}
|
||||
return EXIT_PROCESSING_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Runs the experiment for already validated options.
|
||||
*
|
||||
* @param options validated CLI options
|
||||
* @return process exit code
|
||||
* @throws IOException if experiment execution fails
|
||||
*/
|
||||
private static int runExperiment(final CliOptions options) throws IOException {
|
||||
final StemmerKnowledgeExperiment experiment = new StemmerKnowledgeExperiment();
|
||||
final List<StemmerKnowledgeExperiment.ResultRow> rows = switch (options.sourceMode()) {
|
||||
case INPUT_PATH -> experiment.evaluatePath(options.inputPath(), options.seed());
|
||||
case SINGLE_BUNDLED_LANGUAGE -> experiment.evaluateBundledLanguage(options.language(), options.seed());
|
||||
case ALL_BUNDLED_LANGUAGES -> experiment.evaluateAllBundledLanguages(options.seed());
|
||||
};
|
||||
|
||||
StemmerKnowledgeExperiment.writeCsv(options.outputPath(), rows);
|
||||
if (LOGGER.isLoggable(Level.INFO)) {
|
||||
LOGGER.log(Level.INFO, "Knowledge experiment report written to {0} with {1} rows.",
|
||||
new Object[] { options.outputPath().toAbsolutePath(), rows.size() });
|
||||
}
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
|
||||
/**
|
||||
* Prints the CLI usage text.
|
||||
*
|
||||
* @param stream target output stream
|
||||
*/
|
||||
private static void printUsage(final PrintStream stream) {
|
||||
stream.println(USAGE);
|
||||
}
|
||||
|
||||
/**
|
||||
* Supported top-level CLI commands.
|
||||
*/
|
||||
private enum Command {
|
||||
|
||||
/**
|
||||
* Executes the experiment.
|
||||
*/
|
||||
EXECUTE,
|
||||
|
||||
/**
|
||||
* Prints usage text.
|
||||
*/
|
||||
HELP
|
||||
}
|
||||
|
||||
/**
|
||||
* Supported experiment source selection modes.
|
||||
*/
|
||||
private enum ExperimentSourceMode {
|
||||
|
||||
/**
|
||||
* Runs the experiment for all bundled languages.
|
||||
*/
|
||||
ALL_BUNDLED_LANGUAGES,
|
||||
|
||||
/**
|
||||
* Runs the experiment for one bundled language.
|
||||
*/
|
||||
SINGLE_BUNDLED_LANGUAGE,
|
||||
|
||||
/**
|
||||
* Runs the experiment for one external dictionary path.
|
||||
*/
|
||||
INPUT_PATH
|
||||
}
|
||||
|
||||
/**
|
||||
* Exception indicating invalid command-line usage.
|
||||
*/
|
||||
private static final class CliUsageException extends Exception {
|
||||
|
||||
private static final long serialVersionUID = -3904751711104596247L;
|
||||
|
||||
/**
|
||||
* Creates a new usage exception.
|
||||
*
|
||||
* @param message failure description
|
||||
*/
|
||||
private CliUsageException(final String message) {
|
||||
super(message);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new usage exception.
|
||||
*
|
||||
* @param message failure description
|
||||
* @param cause original cause
|
||||
*/
|
||||
private CliUsageException(final String message, final Throwable cause) {
|
||||
super(message, cause);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Parsed CLI options.
|
||||
*
|
||||
* @param command selected top-level command
|
||||
* @param sourceMode selected experiment source mode
|
||||
* @param inputPath optional filesystem dictionary path
|
||||
* @param language optional bundled language
|
||||
* @param seed deterministic sampling seed
|
||||
* @param outputPath CSV report path
|
||||
*/
|
||||
private record CliOptions(Command command, ExperimentSourceMode sourceMode, Path inputPath,
|
||||
StemmerPatchTrieLoader.Language language, long seed, Path outputPath) {
|
||||
|
||||
/**
|
||||
* Parses the command line.
|
||||
*
|
||||
* @param arguments command-line arguments
|
||||
* @return parsed options
|
||||
* @throws CliUsageException if the command line is invalid
|
||||
*/
|
||||
@SuppressWarnings("PMD.AvoidReassigningLoopVariables")
|
||||
private static CliOptions parse(final String... arguments) throws CliUsageException {
|
||||
Objects.requireNonNull(arguments, "arguments");
|
||||
|
||||
Command command = Command.EXECUTE;
|
||||
ExperimentSourceMode sourceMode = ExperimentSourceMode.ALL_BUNDLED_LANGUAGES;
|
||||
Path inputPath = null;
|
||||
StemmerPatchTrieLoader.Language language = null;
|
||||
long seed = DEFAULT_SEED;
|
||||
Path outputPath = DEFAULT_OUTPUT_PATH;
|
||||
|
||||
final List<String> tokens = new ArrayList<>(List.of(arguments));
|
||||
for (int index = 0; index < tokens.size(); index++) {
|
||||
final String token = tokens.get(index);
|
||||
switch (token) {
|
||||
case "--input" -> {
|
||||
sourceMode = ExperimentSourceMode.INPUT_PATH;
|
||||
inputPath = Path.of(requireValue(tokens, ++index, token));
|
||||
language = null;
|
||||
}
|
||||
case "--bundled-language" -> {
|
||||
sourceMode = ExperimentSourceMode.SINGLE_BUNDLED_LANGUAGE;
|
||||
language = parseLanguage(requireValue(tokens, ++index, token));
|
||||
inputPath = null;
|
||||
}
|
||||
case "--bundled-all" -> {
|
||||
sourceMode = ExperimentSourceMode.ALL_BUNDLED_LANGUAGES;
|
||||
inputPath = null;
|
||||
language = null;
|
||||
}
|
||||
case "--seed" -> seed = parseSeed(requireValue(tokens, ++index, token));
|
||||
case "--output" -> outputPath = Path.of(requireValue(tokens, ++index, token));
|
||||
case "--help", "-h" -> command = Command.HELP;
|
||||
default -> throw new CliUsageException("Unknown argument: " + token);
|
||||
}
|
||||
}
|
||||
|
||||
return new CliOptions(command, sourceMode, inputPath, language, seed, outputPath);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the required value after one option token.
|
||||
*
|
||||
* @param tokens all tokens
|
||||
* @param index expected value index
|
||||
* @param option current option token
|
||||
* @return option value
|
||||
* @throws CliUsageException if the option value is missing
|
||||
*/
|
||||
private static String requireValue(final List<String> tokens, final int index, final String option)
|
||||
throws CliUsageException {
|
||||
if (index >= tokens.size()) {
|
||||
throw new CliUsageException("Missing value for option " + option + '.');
|
||||
}
|
||||
return tokens.get(index);
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses the deterministic seed.
|
||||
*
|
||||
* @param value textual seed value
|
||||
* @return parsed seed
|
||||
* @throws CliUsageException if the seed value is invalid
|
||||
*/
|
||||
private static long parseSeed(final String value) throws CliUsageException {
|
||||
try {
|
||||
return Long.parseLong(value);
|
||||
} catch (final NumberFormatException exception) {
|
||||
throw new CliUsageException("Invalid value for --seed: " + value, exception);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses the bundled language selector.
|
||||
*
|
||||
* @param value textual language name
|
||||
* @return parsed language
|
||||
* @throws CliUsageException if the language value is invalid
|
||||
*/
|
||||
private static StemmerPatchTrieLoader.Language parseLanguage(final String value) throws CliUsageException {
|
||||
try {
|
||||
return StemmerPatchTrieLoader.Language.valueOf(value);
|
||||
} catch (final IllegalArgumentException exception) {
|
||||
throw new CliUsageException("Invalid value for --bundled-language: " + value, exception);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -132,6 +132,48 @@ public final class StemmerPatchTrieBinaryIO {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads only metadata from a GZip-compressed binary patch-command trie stored
|
||||
* at a filesystem path.
|
||||
*
|
||||
* @param path source file
|
||||
* @return deserialized trie metadata
|
||||
* @throws NullPointerException if {@code path} is {@code null}
|
||||
* @throws IOException if reading or decompression fails
|
||||
*/
|
||||
public static TrieMetadata readMetadata(final Path path) throws IOException {
|
||||
Objects.requireNonNull(path, "path");
|
||||
return read(path).metadata();
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads only metadata from a GZip-compressed binary patch-command trie stored
|
||||
* at a filesystem path string.
|
||||
*
|
||||
* @param fileName source file name or path string
|
||||
* @return deserialized trie metadata
|
||||
* @throws NullPointerException if {@code fileName} is {@code null}
|
||||
* @throws IOException if reading or decompression fails
|
||||
*/
|
||||
public static TrieMetadata readMetadata(final String fileName) throws IOException {
|
||||
Objects.requireNonNull(fileName, "fileName");
|
||||
return readMetadata(Path.of(fileName));
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads only metadata from a GZip-compressed binary patch-command trie from an
|
||||
* input stream.
|
||||
*
|
||||
* @param inputStream source stream
|
||||
* @return deserialized trie metadata
|
||||
* @throws NullPointerException if {@code inputStream} is {@code null}
|
||||
* @throws IOException if reading or decompression fails
|
||||
*/
|
||||
public static TrieMetadata readMetadata(final InputStream inputStream) throws IOException {
|
||||
Objects.requireNonNull(inputStream, "inputStream");
|
||||
return read(inputStream).metadata();
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes a GZip-compressed binary patch-command trie to a filesystem path.
|
||||
*
|
||||
|
||||
@@ -30,24 +30,27 @@
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.PushbackInputStream;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Objects;
|
||||
import java.util.logging.Level;
|
||||
import java.util.logging.Logger;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
|
||||
/**
|
||||
* Loader of patch-command tries from bundled stemmer dictionaries.
|
||||
*
|
||||
* <p>
|
||||
* Each dictionary is line-oriented. The first token on a line is interpreted as
|
||||
* the stem, and all following tokens are treated as known variants of that
|
||||
* stem.
|
||||
* Each dictionary is line-oriented and uses a tab-separated values layout. The
|
||||
* first column on a line is interpreted as the stem, and all following
|
||||
* tab-separated columns are treated as known variants of that stem.
|
||||
*
|
||||
* <p>
|
||||
* For each line, the loader inserts:
|
||||
@@ -55,15 +58,20 @@ import java.util.logging.Logger;
|
||||
* <li>the stem itself mapped to the canonical no-op patch command
|
||||
* {@link PatchCommandEncoder#NOOP_PATCH}, when requested by the caller</li>
|
||||
* <li>every distinct variant mapped to the patch command transforming that
|
||||
* variant to the stem</li>
|
||||
* variant to the stem using the traversal direction implied by the selected
|
||||
* language or loader overload</li>
|
||||
* </ul>
|
||||
*
|
||||
* <p>
|
||||
* Parsing is delegated to {@link StemmerDictionaryParser}, which also supports
|
||||
* line remarks introduced by {@code #} or {@code //}.
|
||||
* line remarks introduced by {@code #} or {@code //} and ignores dictionary
|
||||
* items containing Unicode whitespace characters while reporting them through
|
||||
* aggregated warning log records.
|
||||
*/
|
||||
public final class StemmerPatchTrieLoader {
|
||||
|
||||
/* default */ static final String FILENAME_REQUIRED = "fileName required";
|
||||
|
||||
/**
|
||||
* Logger of this class.
|
||||
*/
|
||||
@@ -83,90 +91,151 @@ public final class StemmerPatchTrieLoader {
|
||||
|
||||
/**
|
||||
* Supported bundled stemmer dictionaries.
|
||||
*
|
||||
* <p>
|
||||
* Each language constant defines:
|
||||
* </p>
|
||||
* <ul>
|
||||
* <li>the resource directory name used under the bundled resources tree</li>
|
||||
* <li>whether the language is written right-to-left</li>
|
||||
* </ul>
|
||||
*
|
||||
* <p>
|
||||
* The right-to-left flag is intended for consumers that need to decide whether
|
||||
* affix-oriented processing should conceptually traverse words from the visual
|
||||
* end or from the logical beginning of the stored form.
|
||||
* </p>
|
||||
*/
|
||||
public enum Language {
|
||||
|
||||
/**
|
||||
* Czech.
|
||||
*/
|
||||
CS_CZ("cs_cz", false),
|
||||
|
||||
/**
|
||||
* Danish.
|
||||
*/
|
||||
DA_DK("da_dk"),
|
||||
DA_DK("da_dk", false),
|
||||
|
||||
/**
|
||||
* German.
|
||||
*/
|
||||
DE_DE("de_de"),
|
||||
DE_DE("de_de", false),
|
||||
|
||||
/**
|
||||
* Spanish.
|
||||
*/
|
||||
ES_ES("es_es"),
|
||||
ES_ES("es_es", false),
|
||||
|
||||
/**
|
||||
* Persian.
|
||||
*/
|
||||
FA_IR("fa_ir", true),
|
||||
|
||||
/**
|
||||
* Finnish.
|
||||
*/
|
||||
FI_FI("fi_fi", false),
|
||||
|
||||
/**
|
||||
* French.
|
||||
*/
|
||||
FR_FR("fr_fr"),
|
||||
FR_FR("fr_fr", false),
|
||||
|
||||
/**
|
||||
* Hebrew.
|
||||
*/
|
||||
HE_IL("he_il", true),
|
||||
|
||||
/**
|
||||
* Hungarian.
|
||||
*/
|
||||
HU_HU("hu_hu", false),
|
||||
|
||||
/**
|
||||
* Italian.
|
||||
*/
|
||||
IT_IT("it_it"),
|
||||
IT_IT("it_it", false),
|
||||
|
||||
/**
|
||||
* Norwegian Bokmål.
|
||||
*/
|
||||
NB_NO("nb_no", false),
|
||||
|
||||
/**
|
||||
* Dutch.
|
||||
*/
|
||||
NL_NL("nl_nl"),
|
||||
NL_NL("nl_nl", false),
|
||||
|
||||
/**
|
||||
* Norwegian.
|
||||
* Norwegian Nynorsk.
|
||||
*/
|
||||
NO_NO("no_no"),
|
||||
NN_NO("nn_no", false),
|
||||
|
||||
/**
|
||||
* Polish.
|
||||
*/
|
||||
PL_PL("pl_pl", false),
|
||||
|
||||
/**
|
||||
* Portuguese.
|
||||
*/
|
||||
PT_PT("pt_pt"),
|
||||
PT_PT("pt_pt", false),
|
||||
|
||||
/**
|
||||
* Russian.
|
||||
*/
|
||||
RU_RU("ru_ru"),
|
||||
RU_RU("ru_ru", false),
|
||||
|
||||
/**
|
||||
* Swedish.
|
||||
*/
|
||||
SV_SE("sv_se"),
|
||||
SV_SE("sv_se", false),
|
||||
|
||||
/**
|
||||
* Ukrainian.
|
||||
*/
|
||||
UK_UA("uk_ua", false),
|
||||
|
||||
/**
|
||||
* English.
|
||||
*/
|
||||
US_UK("us_uk"),
|
||||
US_UK("us_uk", false),
|
||||
|
||||
/**
|
||||
* English professional dictionary.
|
||||
* Yiddish.
|
||||
*/
|
||||
US_UK_PROFI("us_uk.profi");
|
||||
YI("yi", true);
|
||||
|
||||
/**
|
||||
* Resource directory name.
|
||||
*/
|
||||
private final String resourceDirectory;
|
||||
|
||||
/**
|
||||
* Whether the language is written right-to-left.
|
||||
*/
|
||||
private final boolean rightToLeft;
|
||||
|
||||
/**
|
||||
* Creates a language constant.
|
||||
*
|
||||
* @param resourceDirectory resource directory name
|
||||
* @param rightToLeft whether the language is written right-to-left
|
||||
*/
|
||||
Language(final String resourceDirectory) {
|
||||
Language(final String resourceDirectory, final boolean rightToLeft) {
|
||||
this.resourceDirectory = resourceDirectory;
|
||||
this.rightToLeft = rightToLeft;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the classpath resource path of the stemmer dictionary.
|
||||
* Returns the classpath resource path of the bundled stemmer dictionary.
|
||||
*
|
||||
* @return classpath resource path
|
||||
*/
|
||||
public String resourcePath() {
|
||||
return this.resourceDirectory + "/stemmer";
|
||||
return this.resourceDirectory + "/stemmer.gz";
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -177,11 +246,45 @@ public final class StemmerPatchTrieLoader {
|
||||
public String resourceDirectory() {
|
||||
return this.resourceDirectory;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns whether the language is written right-to-left.
|
||||
*
|
||||
* <p>
|
||||
* This flag can be used by trie-building and lookup logic to decide whether
|
||||
* suffix-oriented traversal should operate on the stored word form as-is rather
|
||||
* than by reversing the logical character sequence.
|
||||
* </p>
|
||||
*
|
||||
* @return {@code true} when the language is written right-to-left, otherwise
|
||||
* {@code false}
|
||||
*/
|
||||
public boolean isRightToLeft() {
|
||||
return this.rightToLeft;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads a bundled dictionary using explicit reduction settings.
|
||||
*
|
||||
* <p>
|
||||
* This overload applies the following implicit compilation defaults in addition
|
||||
* to the supplied {@code reductionSettings}:
|
||||
* </p>
|
||||
* <ul>
|
||||
* <li>traversal direction is derived from {@link Language#isRightToLeft()}
|
||||
* ({@link WordTraversalDirection#FORWARD} for right-to-left languages,
|
||||
* {@link WordTraversalDirection#BACKWARD} otherwise)</li>
|
||||
* <li>case processing mode is
|
||||
* {@link CaseProcessingMode#LOWERCASE_WITH_LOCALE_ROOT}</li>
|
||||
* <li>diacritic processing mode is {@link DiacriticProcessingMode#AS_IS}</li>
|
||||
* </ul>
|
||||
*
|
||||
* <p>
|
||||
* The resolved settings are persisted into {@link TrieMetadata} of the
|
||||
* resulting trie.
|
||||
* </p>
|
||||
*
|
||||
* @param language bundled language dictionary
|
||||
* @param storeOriginal whether the stem itself should be inserted using the
|
||||
* canonical no-op patch command
|
||||
@@ -194,13 +297,40 @@ public final class StemmerPatchTrieLoader {
|
||||
final ReductionSettings reductionSettings) throws IOException {
|
||||
Objects.requireNonNull(language, "language");
|
||||
Objects.requireNonNull(reductionSettings, "reductionSettings");
|
||||
final TrieMetadata metadata = metadataForCompilation(traversalDirectionOf(language), reductionSettings,
|
||||
CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT, DiacriticProcessingMode.AS_IS);
|
||||
return load(language, storeOriginal, metadata);
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads a bundled dictionary using explicit trie compilation metadata.
|
||||
*
|
||||
* <p>
|
||||
* All semantic compilation settings (reduction mode and thresholds, traversal
|
||||
* direction, case processing mode, and diacritic processing mode) are taken
|
||||
* from the supplied metadata object and are persisted unchanged in the
|
||||
* resulting trie.
|
||||
* </p>
|
||||
*
|
||||
* @param language bundled language dictionary
|
||||
* @param storeOriginal whether the stem itself should be inserted using the
|
||||
* canonical no-op patch command
|
||||
* @param metadata trie metadata describing the compilation configuration
|
||||
* @return compiled patch-command trie
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
* @throws IOException if the dictionary cannot be found or read
|
||||
*/
|
||||
public static FrequencyTrie<String> load(final Language language, final boolean storeOriginal,
|
||||
final TrieMetadata metadata) throws IOException {
|
||||
Objects.requireNonNull(language, "language");
|
||||
Objects.requireNonNull(metadata, "metadata");
|
||||
|
||||
final String resourcePath = language.resourcePath();
|
||||
|
||||
try (InputStream inputStream = openBundledResource(resourcePath);
|
||||
BufferedReader reader = new BufferedReader(
|
||||
new InputStreamReader(inputStream, StandardCharsets.UTF_8))) {
|
||||
return load(reader, resourcePath, storeOriginal, reductionSettings);
|
||||
return load(reader, resourcePath, storeOriginal, metadata);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -208,6 +338,14 @@ public final class StemmerPatchTrieLoader {
|
||||
* Loads a bundled dictionary using default settings for the supplied reduction
|
||||
* mode.
|
||||
*
|
||||
* <p>
|
||||
* This overload is equivalent to calling
|
||||
* {@link #load(Language, boolean, ReductionSettings)} with
|
||||
* {@link ReductionSettings#withDefaults(ReductionMode)} and therefore uses the
|
||||
* same implicit defaults for traversal direction, case processing mode, and
|
||||
* diacritic processing mode.
|
||||
* </p>
|
||||
*
|
||||
* @param language bundled language dictionary
|
||||
* @param storeOriginal whether the stem itself should be inserted using the
|
||||
* canonical no-op patch command
|
||||
@@ -225,6 +363,14 @@ public final class StemmerPatchTrieLoader {
|
||||
/**
|
||||
* Loads a dictionary from a filesystem path using explicit reduction settings.
|
||||
*
|
||||
* <p>
|
||||
* This overload applies historical Egothor-compatible implicit defaults:
|
||||
* {@link WordTraversalDirection#BACKWARD},
|
||||
* {@link CaseProcessingMode#LOWERCASE_WITH_LOCALE_ROOT}, and
|
||||
* {@link DiacriticProcessingMode#AS_IS}. These settings are persisted in
|
||||
* resulting trie metadata.
|
||||
* </p>
|
||||
*
|
||||
* @param path path to the dictionary file
|
||||
* @param storeOriginal whether the stem itself should be inserted using the
|
||||
* canonical no-op patch command
|
||||
@@ -235,11 +381,119 @@ public final class StemmerPatchTrieLoader {
|
||||
*/
|
||||
public static FrequencyTrie<String> load(final Path path, final boolean storeOriginal,
|
||||
final ReductionSettings reductionSettings) throws IOException {
|
||||
Objects.requireNonNull(path, "path");
|
||||
Objects.requireNonNull(reductionSettings, "reductionSettings");
|
||||
return load(path, storeOriginal, reductionSettings, WordTraversalDirection.BACKWARD,
|
||||
CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT, DiacriticProcessingMode.AS_IS);
|
||||
}
|
||||
|
||||
try (BufferedReader reader = Files.newBufferedReader(path, StandardCharsets.UTF_8)) {
|
||||
return load(reader, path.toAbsolutePath().toString(), storeOriginal, reductionSettings);
|
||||
/**
|
||||
* Loads a dictionary from a filesystem path using explicit reduction settings
|
||||
* and explicit traversal direction.
|
||||
*
|
||||
* <p>
|
||||
* Implicit defaults still apply for unspecified dimensions:
|
||||
* {@link CaseProcessingMode#LOWERCASE_WITH_LOCALE_ROOT} and
|
||||
* {@link DiacriticProcessingMode#AS_IS}.
|
||||
* </p>
|
||||
*
|
||||
* @param path path to the dictionary file
|
||||
* @param storeOriginal whether the stem itself should be inserted using
|
||||
* the canonical no-op patch command
|
||||
* @param reductionSettings reduction settings
|
||||
* @param traversalDirection traversal direction used for both trie keys and
|
||||
* patch commands
|
||||
* @return compiled patch-command trie
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
* @throws IOException if the file cannot be opened or read
|
||||
*/
|
||||
public static FrequencyTrie<String> load(final Path path, final boolean storeOriginal,
|
||||
final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection)
|
||||
throws IOException {
|
||||
return load(path, storeOriginal, reductionSettings, traversalDirection,
|
||||
CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT, DiacriticProcessingMode.AS_IS);
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads a dictionary from a filesystem path using explicit reduction settings,
|
||||
* explicit traversal direction, and explicit case processing mode.
|
||||
*
|
||||
* <p>
|
||||
* This overload still defaults diacritic processing to
|
||||
* {@link DiacriticProcessingMode#AS_IS}.
|
||||
* </p>
|
||||
*
|
||||
* @param path path to the dictionary file
|
||||
* @param storeOriginal whether the stem itself should be inserted using
|
||||
* the canonical no-op patch command
|
||||
* @param reductionSettings reduction settings
|
||||
* @param traversalDirection traversal direction used for both trie keys and
|
||||
* patch commands
|
||||
* @param caseProcessingMode case processing mode used during dictionary parsing
|
||||
* @return compiled patch-command trie
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
* @throws IOException if the file cannot be opened or read
|
||||
*/
|
||||
public static FrequencyTrie<String> load(final Path path, final boolean storeOriginal,
|
||||
final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection,
|
||||
final CaseProcessingMode caseProcessingMode) throws IOException {
|
||||
return load(path, storeOriginal, reductionSettings, traversalDirection, caseProcessingMode,
|
||||
DiacriticProcessingMode.AS_IS);
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads a dictionary from a filesystem path using explicit reduction settings,
|
||||
* traversal direction, case processing mode, and diacritic processing mode.
|
||||
*
|
||||
* @param path path to the dictionary file
|
||||
* @param storeOriginal whether the stem itself should be inserted
|
||||
* using the canonical no-op patch command
|
||||
* @param reductionSettings reduction settings
|
||||
* @param traversalDirection traversal direction used for both trie keys
|
||||
* and patch commands
|
||||
* @param caseProcessingMode case processing mode used during dictionary
|
||||
* parsing
|
||||
* @param diacriticProcessingMode diacritic processing mode used during
|
||||
* dictionary parsing
|
||||
* @return compiled patch-command trie
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
* @throws IOException if the file cannot be opened or read
|
||||
*/
|
||||
public static FrequencyTrie<String> load(final Path path, final boolean storeOriginal,
|
||||
final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection,
|
||||
final CaseProcessingMode caseProcessingMode, final DiacriticProcessingMode diacriticProcessingMode)
|
||||
throws IOException {
|
||||
Objects.requireNonNull(path, "path");
|
||||
final TrieMetadata metadata = metadataForCompilation(traversalDirection, reductionSettings, caseProcessingMode,
|
||||
diacriticProcessingMode);
|
||||
return load(path, storeOriginal, metadata);
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads a dictionary from a filesystem path using explicit trie compilation
|
||||
* metadata.
|
||||
*
|
||||
* <p>
|
||||
* The supplied metadata is the authoritative source of trie compilation
|
||||
* semantics. Callers should ensure metadata matches how they expect to query
|
||||
* the trie (for example, with or without lowercasing or diacritic stripping).
|
||||
* </p>
|
||||
*
|
||||
* @param path path to the dictionary file
|
||||
* @param storeOriginal whether the stem itself should be inserted using the
|
||||
* canonical no-op patch command
|
||||
* @param metadata trie metadata describing the compilation configuration
|
||||
* @return compiled patch-command trie
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
* @throws IOException if the file cannot be opened or read
|
||||
*/
|
||||
public static FrequencyTrie<String> load(final Path path, final boolean storeOriginal, final TrieMetadata metadata)
|
||||
throws IOException {
|
||||
Objects.requireNonNull(path, "path");
|
||||
Objects.requireNonNull(metadata, "metadata");
|
||||
|
||||
try (InputStream inputStream = openDictionaryInputStream(path);
|
||||
BufferedReader reader = new BufferedReader(
|
||||
new InputStreamReader(inputStream, StandardCharsets.UTF_8))) {
|
||||
return load(reader, path.toAbsolutePath().toString(), storeOriginal, metadata);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -247,6 +501,15 @@ public final class StemmerPatchTrieLoader {
|
||||
* Loads a dictionary from a filesystem path using default settings for the
|
||||
* supplied reduction mode.
|
||||
*
|
||||
* <p>
|
||||
* This overload is equivalent to calling
|
||||
* {@link #load(Path, boolean, ReductionSettings)} with
|
||||
* {@link ReductionSettings#withDefaults(ReductionMode)} and therefore uses
|
||||
* implicit defaults ({@link WordTraversalDirection#BACKWARD},
|
||||
* {@link CaseProcessingMode#LOWERCASE_WITH_LOCALE_ROOT},
|
||||
* {@link DiacriticProcessingMode#AS_IS}).
|
||||
* </p>
|
||||
*
|
||||
* @param path path to the dictionary file
|
||||
* @param storeOriginal whether the stem itself should be inserted using the
|
||||
* canonical no-op patch command
|
||||
@@ -265,6 +528,13 @@ public final class StemmerPatchTrieLoader {
|
||||
* Loads a dictionary from a filesystem path string using explicit reduction
|
||||
* settings.
|
||||
*
|
||||
* <p>
|
||||
* Same semantics as {@link #load(Path, boolean, ReductionSettings)} including
|
||||
* implicit defaults ({@link WordTraversalDirection#BACKWARD},
|
||||
* {@link CaseProcessingMode#LOWERCASE_WITH_LOCALE_ROOT},
|
||||
* {@link DiacriticProcessingMode#AS_IS}).
|
||||
* </p>
|
||||
*
|
||||
* @param fileName file name or path string
|
||||
* @param storeOriginal whether the stem itself should be inserted using the
|
||||
* canonical no-op patch command
|
||||
@@ -275,14 +545,130 @@ public final class StemmerPatchTrieLoader {
|
||||
*/
|
||||
public static FrequencyTrie<String> load(final String fileName, final boolean storeOriginal,
|
||||
final ReductionSettings reductionSettings) throws IOException {
|
||||
Objects.requireNonNull(fileName, "fileName");
|
||||
Objects.requireNonNull(fileName, FILENAME_REQUIRED);
|
||||
return load(Path.of(fileName), storeOriginal, reductionSettings);
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads a dictionary from a filesystem path string using explicit reduction
|
||||
* settings and explicit traversal direction.
|
||||
*
|
||||
* <p>
|
||||
* Same semantics as
|
||||
* {@link #load(Path, boolean, ReductionSettings, WordTraversalDirection)}.
|
||||
* Implicit defaults remain
|
||||
* {@link CaseProcessingMode#LOWERCASE_WITH_LOCALE_ROOT} and
|
||||
* {@link DiacriticProcessingMode#AS_IS}.
|
||||
* </p>
|
||||
*
|
||||
* @param fileName file name or path string
|
||||
* @param storeOriginal whether the stem itself should be inserted using
|
||||
* the canonical no-op patch command
|
||||
* @param reductionSettings reduction settings
|
||||
* @param traversalDirection traversal direction used for both trie keys and
|
||||
* patch commands
|
||||
* @return compiled patch-command trie
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
* @throws IOException if the file cannot be opened or read
|
||||
*/
|
||||
public static FrequencyTrie<String> load(final String fileName, final boolean storeOriginal,
|
||||
final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection)
|
||||
throws IOException {
|
||||
Objects.requireNonNull(fileName, FILENAME_REQUIRED);
|
||||
return load(Path.of(fileName), storeOriginal, reductionSettings, traversalDirection,
|
||||
CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT);
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads a dictionary from a filesystem path string using explicit reduction
|
||||
* settings, explicit traversal direction, and explicit case processing mode.
|
||||
*
|
||||
* <p>
|
||||
* Same semantics as
|
||||
* {@link #load(Path, boolean, ReductionSettings, WordTraversalDirection, CaseProcessingMode)}.
|
||||
* Implicit default remains {@link DiacriticProcessingMode#AS_IS}.
|
||||
* </p>
|
||||
*
|
||||
* @param fileName file name or path string
|
||||
* @param storeOriginal whether the stem itself should be inserted using
|
||||
* the canonical no-op patch command
|
||||
* @param reductionSettings reduction settings
|
||||
* @param traversalDirection traversal direction used for both trie keys and
|
||||
* patch commands
|
||||
* @param caseProcessingMode case processing mode used during dictionary parsing
|
||||
* @return compiled patch-command trie
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
* @throws IOException if the file cannot be opened or read
|
||||
*/
|
||||
public static FrequencyTrie<String> load(final String fileName, final boolean storeOriginal,
|
||||
final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection,
|
||||
final CaseProcessingMode caseProcessingMode) throws IOException {
|
||||
Objects.requireNonNull(fileName, FILENAME_REQUIRED);
|
||||
return load(Path.of(fileName), storeOriginal, reductionSettings, traversalDirection, caseProcessingMode,
|
||||
DiacriticProcessingMode.AS_IS);
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads a dictionary from a filesystem path string using explicit reduction
|
||||
* settings, explicit traversal direction, explicit case processing mode, and
|
||||
* explicit diacritic processing mode.
|
||||
*
|
||||
* @param fileName file name or path string
|
||||
* @param storeOriginal whether the stem itself should be inserted
|
||||
* using the canonical no-op patch command
|
||||
* @param reductionSettings reduction settings
|
||||
* @param traversalDirection traversal direction used for both trie keys
|
||||
* and patch commands
|
||||
* @param caseProcessingMode case processing mode used during dictionary
|
||||
* parsing
|
||||
* @param diacriticProcessingMode diacritic processing mode used during
|
||||
* dictionary parsing
|
||||
* @return compiled patch-command trie
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
* @throws IOException if the file cannot be opened or read
|
||||
*/
|
||||
public static FrequencyTrie<String> load(final String fileName, final boolean storeOriginal,
|
||||
final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection,
|
||||
final CaseProcessingMode caseProcessingMode, final DiacriticProcessingMode diacriticProcessingMode)
|
||||
throws IOException {
|
||||
Objects.requireNonNull(fileName, FILENAME_REQUIRED);
|
||||
return load(Path.of(fileName), storeOriginal, reductionSettings, traversalDirection, caseProcessingMode,
|
||||
diacriticProcessingMode);
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads a dictionary from a filesystem path string using explicit trie
|
||||
* compilation metadata.
|
||||
*
|
||||
* <p>
|
||||
* Same semantics as {@link #load(Path, boolean, TrieMetadata)}.
|
||||
* </p>
|
||||
*
|
||||
* @param fileName file name or path string
|
||||
* @param storeOriginal whether the stem itself should be inserted using the
|
||||
* canonical no-op patch command
|
||||
* @param metadata trie metadata describing the compilation configuration
|
||||
* @return compiled patch-command trie
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
* @throws IOException if the file cannot be opened or read
|
||||
*/
|
||||
public static FrequencyTrie<String> load(final String fileName, final boolean storeOriginal,
|
||||
final TrieMetadata metadata) throws IOException {
|
||||
Objects.requireNonNull(fileName, FILENAME_REQUIRED);
|
||||
return load(Path.of(fileName), storeOriginal, metadata);
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads a dictionary from a filesystem path string using default settings for
|
||||
* the supplied reduction mode.
|
||||
*
|
||||
* <p>
|
||||
* Equivalent to {@link #load(Path, boolean, ReductionMode)} and therefore uses
|
||||
* implicit defaults ({@link WordTraversalDirection#BACKWARD},
|
||||
* {@link CaseProcessingMode#LOWERCASE_WITH_LOCALE_ROOT},
|
||||
* {@link DiacriticProcessingMode#AS_IS}).
|
||||
* </p>
|
||||
*
|
||||
* @param fileName file name or path string
|
||||
* @param storeOriginal whether the stem itself should be inserted using the
|
||||
* canonical no-op patch command
|
||||
@@ -293,7 +679,7 @@ public final class StemmerPatchTrieLoader {
|
||||
*/
|
||||
public static FrequencyTrie<String> load(final String fileName, final boolean storeOriginal,
|
||||
final ReductionMode reductionMode) throws IOException {
|
||||
Objects.requireNonNull(fileName, "fileName");
|
||||
Objects.requireNonNull(fileName, FILENAME_REQUIRED);
|
||||
return load(Path.of(fileName), storeOriginal, reductionMode);
|
||||
}
|
||||
|
||||
@@ -304,18 +690,21 @@ public final class StemmerPatchTrieLoader {
|
||||
* @param sourceDescription logical source description used for diagnostics
|
||||
* @param storeOriginal whether the stem itself should be inserted using the
|
||||
* canonical no-op patch command
|
||||
* @param reductionSettings reduction settings
|
||||
* @param metadata trie metadata used to drive all compilation settings
|
||||
* @return compiled patch-command trie
|
||||
* @throws IOException if parsing fails
|
||||
*/
|
||||
private static FrequencyTrie<String> load(final BufferedReader reader, final String sourceDescription,
|
||||
final boolean storeOriginal, final ReductionSettings reductionSettings) throws IOException {
|
||||
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(String[]::new, reductionSettings);
|
||||
final PatchCommandEncoder patchCommandEncoder = new PatchCommandEncoder();
|
||||
final boolean storeOriginal, final TrieMetadata metadata) throws IOException {
|
||||
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(String[]::new,
|
||||
metadata.reductionSettings(), metadata.traversalDirection(), metadata.caseProcessingMode(),
|
||||
metadata.diacriticProcessingMode());
|
||||
final PatchCommandEncoder patchCommandEncoder = PatchCommandEncoder.builder()
|
||||
.traversalDirection(metadata.traversalDirection()).build();
|
||||
final int[] insertedMappings = new int[1];
|
||||
|
||||
final StemmerDictionaryParser.ParseStatistics statistics = StemmerDictionaryParser.parse(reader,
|
||||
sourceDescription, (stem, variants, lineNumber) -> {
|
||||
sourceDescription, metadata.caseProcessingMode(), (stem, variants, lineNumber) -> {
|
||||
if (storeOriginal) {
|
||||
builder.put(stem, NOOP_PATCH_COMMAND);
|
||||
insertedMappings[0]++;
|
||||
@@ -331,14 +720,35 @@ public final class StemmerPatchTrieLoader {
|
||||
|
||||
if (LOGGER.isLoggable(Level.FINE)) {
|
||||
LOGGER.log(Level.FINE,
|
||||
"Loaded stemmer dictionary from {0}; insertedMappings={1}, lines={2}, entries={3}, ignoredLines={4}.",
|
||||
"Loaded stemmer dictionary from {0}; insertedMappings={1}, lines={2}, entries={3}, ignoredLines={4}, metadata={5}.",
|
||||
new Object[] { sourceDescription, insertedMappings[0], statistics.lineCount(),
|
||||
statistics.entryCount(), statistics.ignoredLineCount() });
|
||||
statistics.entryCount(), statistics.ignoredLineCount(), metadata.toTextBlock() });
|
||||
}
|
||||
|
||||
return builder.build();
|
||||
}
|
||||
|
||||
private static TrieMetadata metadataForCompilation(final WordTraversalDirection traversalDirection,
|
||||
final ReductionSettings reductionSettings, final CaseProcessingMode caseProcessingMode,
|
||||
final DiacriticProcessingMode diacriticProcessingMode) {
|
||||
Objects.requireNonNull(traversalDirection, "traversalDirection");
|
||||
Objects.requireNonNull(reductionSettings, "reductionSettings");
|
||||
Objects.requireNonNull(caseProcessingMode, "caseProcessingMode");
|
||||
Objects.requireNonNull(diacriticProcessingMode, "diacriticProcessingMode");
|
||||
return TrieMetadata.forCompilation(traversalDirection, reductionSettings, diacriticProcessingMode,
|
||||
caseProcessingMode);
|
||||
}
|
||||
|
||||
/**
|
||||
* Resolves the traversal direction implied by a bundled language definition.
|
||||
*
|
||||
* @param language bundled language
|
||||
* @return traversal direction to use for that language
|
||||
*/
|
||||
private static WordTraversalDirection traversalDirectionOf(final Language language) {
|
||||
return language.isRightToLeft() ? WordTraversalDirection.FORWARD : WordTraversalDirection.BACKWARD;
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads a GZip-compressed binary patch-command trie from a filesystem path.
|
||||
*
|
||||
@@ -364,7 +774,7 @@ public final class StemmerPatchTrieLoader {
|
||||
* read
|
||||
*/
|
||||
public static FrequencyTrie<String> loadBinary(final String fileName) throws IOException {
|
||||
Objects.requireNonNull(fileName, "fileName");
|
||||
Objects.requireNonNull(fileName, FILENAME_REQUIRED);
|
||||
return StemmerPatchTrieBinaryIO.read(fileName);
|
||||
}
|
||||
|
||||
@@ -381,6 +791,50 @@ public final class StemmerPatchTrieLoader {
|
||||
return StemmerPatchTrieBinaryIO.read(inputStream);
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads only persisted metadata from a GZip-compressed binary patch-command
|
||||
* trie file.
|
||||
*
|
||||
* @param path path to the compressed binary trie file
|
||||
* @return persisted trie metadata
|
||||
* @throws NullPointerException if {@code path} is {@code null}
|
||||
* @throws IOException if the file cannot be opened, decompressed, or
|
||||
* read
|
||||
*/
|
||||
public static TrieMetadata loadBinaryMetadata(final Path path) throws IOException {
|
||||
Objects.requireNonNull(path, "path");
|
||||
return StemmerPatchTrieBinaryIO.readMetadata(path);
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads only persisted metadata from a GZip-compressed binary patch-command
|
||||
* trie file.
|
||||
*
|
||||
* @param fileName file name or path string
|
||||
* @return persisted trie metadata
|
||||
* @throws NullPointerException if {@code fileName} is {@code null}
|
||||
* @throws IOException if the file cannot be opened, decompressed, or
|
||||
* read
|
||||
*/
|
||||
public static TrieMetadata loadBinaryMetadata(final String fileName) throws IOException {
|
||||
Objects.requireNonNull(fileName, FILENAME_REQUIRED);
|
||||
return StemmerPatchTrieBinaryIO.readMetadata(fileName);
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads only persisted metadata from a GZip-compressed binary patch-command
|
||||
* trie stream.
|
||||
*
|
||||
* @param inputStream source input stream
|
||||
* @return persisted trie metadata
|
||||
* @throws NullPointerException if {@code inputStream} is {@code null}
|
||||
* @throws IOException if the stream cannot be decompressed or read
|
||||
*/
|
||||
public static TrieMetadata loadBinaryMetadata(final InputStream inputStream) throws IOException {
|
||||
Objects.requireNonNull(inputStream, "inputStream");
|
||||
return StemmerPatchTrieBinaryIO.readMetadata(inputStream);
|
||||
}
|
||||
|
||||
/**
|
||||
* Saves a compiled patch-command trie as a GZip-compressed binary file.
|
||||
*
|
||||
@@ -405,10 +859,40 @@ public final class StemmerPatchTrieLoader {
|
||||
*/
|
||||
public static void saveBinary(final FrequencyTrie<String> trie, final String fileName) throws IOException {
|
||||
Objects.requireNonNull(trie, "trie");
|
||||
Objects.requireNonNull(fileName, "fileName");
|
||||
Objects.requireNonNull(fileName, FILENAME_REQUIRED);
|
||||
StemmerPatchTrieBinaryIO.write(trie, fileName);
|
||||
}
|
||||
|
||||
/**
|
||||
* Opens one filesystem dictionary input stream.
|
||||
*
|
||||
* <p>
|
||||
* Plain-text dictionaries are returned as-is. GZip-compressed dictionaries are
|
||||
* detected from the stream header rather than from the file extension so that
|
||||
* callers may provide arbitrary temporary file names without changing the
|
||||
* loading contract.
|
||||
* </p>
|
||||
*
|
||||
* @param path dictionary file path
|
||||
* @return opened dictionary stream, transparently decompressing GZip inputs
|
||||
* @throws IOException if the file cannot be opened
|
||||
*/
|
||||
private static InputStream openDictionaryInputStream(final Path path) throws IOException {
|
||||
final PushbackInputStream pushbackInputStream = new PushbackInputStream(
|
||||
new BufferedInputStream(Files.newInputStream(path)), 2);
|
||||
final byte[] header = pushbackInputStream.readNBytes(2);
|
||||
|
||||
if (header.length > 0) {
|
||||
pushbackInputStream.unread(header);
|
||||
}
|
||||
|
||||
if (header.length == 2 && (header[0] & 0xFF) == 0x1F && (header[1] & 0xFF) == 0x8B) {
|
||||
return new GZIPInputStream(pushbackInputStream);
|
||||
}
|
||||
|
||||
return pushbackInputStream;
|
||||
}
|
||||
|
||||
/**
|
||||
* Opens a bundled resource from the classpath.
|
||||
*
|
||||
@@ -416,12 +900,12 @@ public final class StemmerPatchTrieLoader {
|
||||
* @return opened input stream
|
||||
* @throws IOException if the resource cannot be found
|
||||
*/
|
||||
private static InputStream openBundledResource(final String resourcePath) throws IOException {
|
||||
/* default */ static InputStream openBundledResource(final String resourcePath) throws IOException {
|
||||
final ClassLoader classLoader = Thread.currentThread().getContextClassLoader();
|
||||
final InputStream inputStream = classLoader.getResourceAsStream(resourcePath);
|
||||
if (inputStream == null) {
|
||||
throw new IOException("Stemmer resource not found: " + resourcePath);
|
||||
}
|
||||
return inputStream;
|
||||
return new GZIPInputStream(inputStream);
|
||||
}
|
||||
}
|
||||
|
||||
235
src/main/java/org/egothor/stemmer/TrieMetadata.java
Normal file
235
src/main/java/org/egothor/stemmer/TrieMetadata.java
Normal file
@@ -0,0 +1,235 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
|
||||
/**
|
||||
* Immutable metadata persisted together with a compiled trie artifact.
|
||||
*
|
||||
* <p>
|
||||
* The metadata captures the semantic build configuration required to interpret
|
||||
* the compiled trie correctly after it is reloaded. Persisting the metadata as
|
||||
* part of the artifact makes the binary format self-describing and avoids
|
||||
* coupling runtime consumers to external side-channel configuration.
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* The record is intentionally extensible. It already models traversal
|
||||
* direction, reduction settings, and diacritic processing strategy, even though
|
||||
* not every field necessarily influences all current code paths yet.
|
||||
* </p>
|
||||
*
|
||||
* @param formatVersion persisted binary format version of the trie
|
||||
* artifact
|
||||
* @param traversalDirection logical key traversal direction
|
||||
* @param reductionSettings reduction settings used during compilation
|
||||
* @param diacriticProcessingMode diacritic processing strategy associated with
|
||||
* the artifact
|
||||
* @param caseProcessingMode case processing strategy associated with the
|
||||
* artifact
|
||||
*/
|
||||
public record TrieMetadata(int formatVersion, WordTraversalDirection traversalDirection,
|
||||
ReductionSettings reductionSettings, DiacriticProcessingMode diacriticProcessingMode,
|
||||
CaseProcessingMode caseProcessingMode) {
|
||||
/**
|
||||
* Header identifying the human-readable metadata block layout.
|
||||
*/
|
||||
private static final String TEXT_BLOCK_HEADER = "radixor.metadata.v1";
|
||||
|
||||
/**
|
||||
* Creates a new metadata instance.
|
||||
*
|
||||
* @param formatVersion persisted binary format version, must be at
|
||||
* least {@code 1}
|
||||
* @param traversalDirection logical key traversal direction
|
||||
* @param reductionSettings reduction settings used during compilation
|
||||
* @param diacriticProcessingMode diacritic processing strategy
|
||||
* @param caseProcessingMode case processing strategy
|
||||
*/
|
||||
public TrieMetadata(final int formatVersion, final WordTraversalDirection traversalDirection,
|
||||
final ReductionSettings reductionSettings, final DiacriticProcessingMode diacriticProcessingMode,
|
||||
final CaseProcessingMode caseProcessingMode) {
|
||||
if (formatVersion < 1) { // NOPMD
|
||||
throw new IllegalArgumentException("formatVersion must be at least 1.");
|
||||
}
|
||||
this.formatVersion = formatVersion;
|
||||
this.traversalDirection = Objects.requireNonNull(traversalDirection, "traversalDirection");
|
||||
this.reductionSettings = Objects.requireNonNull(reductionSettings, "reductionSettings");
|
||||
this.diacriticProcessingMode = Objects.requireNonNull(diacriticProcessingMode, "diacriticProcessingMode");
|
||||
this.caseProcessingMode = Objects.requireNonNull(caseProcessingMode, "caseProcessingMode");
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates metadata populated with current-format defaults for freshly compiled
|
||||
* tries.
|
||||
*
|
||||
* @param formatVersion persisted binary format version
|
||||
* @param traversalDirection logical key traversal direction
|
||||
* @param reductionSettings reduction settings used during compilation
|
||||
* @return metadata initialized with current defaults
|
||||
*/
|
||||
public static TrieMetadata current(final int formatVersion, final WordTraversalDirection traversalDirection,
|
||||
final ReductionSettings reductionSettings) {
|
||||
return new TrieMetadata(formatVersion, traversalDirection, reductionSettings, DiacriticProcessingMode.AS_IS,
|
||||
CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates metadata for a newly compiled trie using the currently persisted
|
||||
* binary stream format version.
|
||||
*
|
||||
* @param traversalDirection logical key traversal direction
|
||||
* @param reductionSettings reduction settings used during compilation
|
||||
* @param diacriticProcessingMode diacritic processing strategy
|
||||
* @param caseProcessingMode case processing strategy
|
||||
* @return metadata aligned with the current persisted stream format
|
||||
*/
|
||||
public static TrieMetadata forCompilation(final WordTraversalDirection traversalDirection,
|
||||
final ReductionSettings reductionSettings, final DiacriticProcessingMode diacriticProcessingMode,
|
||||
final CaseProcessingMode caseProcessingMode) {
|
||||
return new TrieMetadata(FrequencyTrie.currentFormatVersion(), traversalDirection, reductionSettings,
|
||||
diacriticProcessingMode, caseProcessingMode);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates metadata compatible with a legacy artifact version that did not store
|
||||
* the full configuration explicitly.
|
||||
*
|
||||
* @param formatVersion legacy persisted binary format version
|
||||
* @param traversalDirection logical key traversal direction reconstructed from
|
||||
* the legacy stream
|
||||
* @return metadata reconstructed with conservative compatibility defaults
|
||||
*/
|
||||
public static TrieMetadata legacy(final int formatVersion, final WordTraversalDirection traversalDirection) {
|
||||
return new TrieMetadata(formatVersion, traversalDirection,
|
||||
ReductionSettings.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
||||
DiacriticProcessingMode.AS_IS, CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns metadata encoded as a deterministic human-readable text block.
|
||||
*
|
||||
* <p>
|
||||
* The format intentionally uses plain {@code key=value} lines so users can
|
||||
* inspect metadata quickly from a decompressed trie payload without additional
|
||||
* dependencies.
|
||||
* </p>
|
||||
*
|
||||
* @return persisted metadata text block
|
||||
*/
|
||||
@SuppressWarnings("PMD.ConsecutiveLiteralAppends")
|
||||
public String toTextBlock() {
|
||||
final StringBuilder textBlockBuilder = new StringBuilder(1024);
|
||||
textBlockBuilder.append(TEXT_BLOCK_HEADER).append('\n')
|
||||
//
|
||||
.append("formatVersion=").append(this.formatVersion).append('\n')
|
||||
//
|
||||
.append("traversalDirection=").append(this.traversalDirection.name()).append('\n')
|
||||
//
|
||||
.append("rightToLeft=").append(this.traversalDirection == WordTraversalDirection.FORWARD).append('\n')
|
||||
//
|
||||
.append("reductionMode=").append(this.reductionSettings.reductionMode().name()).append('\n')
|
||||
//
|
||||
.append("dominantWinnerMinPercent=").append(this.reductionSettings.dominantWinnerMinPercent())
|
||||
.append('\n')
|
||||
//
|
||||
.append("dominantWinnerOverSecondRatio=").append(this.reductionSettings.dominantWinnerOverSecondRatio())
|
||||
.append('\n')
|
||||
//
|
||||
.append("diacriticProcessingMode=").append(this.diacriticProcessingMode.name()).append('\n')
|
||||
//
|
||||
.append("caseProcessingMode=").append(this.caseProcessingMode.name()).append('\n');
|
||||
return textBlockBuilder.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses metadata from a text block produced by {@link #toTextBlock()}.
|
||||
*
|
||||
* @param formatVersion persisted binary format version
|
||||
* @param textBlock metadata text block
|
||||
* @return parsed metadata
|
||||
*/
|
||||
public static TrieMetadata fromTextBlock(final int formatVersion, final String textBlock) {
|
||||
Objects.requireNonNull(textBlock, "textBlock");
|
||||
|
||||
final String[] lines = textBlock.split("\\R");
|
||||
if (lines.length == 0 || !TEXT_BLOCK_HEADER.equals(lines[0])) {
|
||||
throw new IllegalArgumentException("Unsupported metadata block header.");
|
||||
}
|
||||
|
||||
final Map<String, String> entries = new HashMap<>();
|
||||
for (int index = 1; index < lines.length; index++) {
|
||||
final String line = lines[index];
|
||||
if (line.isBlank()) {
|
||||
continue;
|
||||
}
|
||||
final int delimiterIndex = line.indexOf('=');
|
||||
if (delimiterIndex <= 0 || delimiterIndex == line.length() - 1) {
|
||||
throw new IllegalArgumentException("Invalid metadata line: " + line);
|
||||
}
|
||||
entries.put(line.substring(0, delimiterIndex), line.substring(delimiterIndex + 1));
|
||||
}
|
||||
|
||||
final WordTraversalDirection traversalDirection = WordTraversalDirection
|
||||
.valueOf(requireEntry(entries, "traversalDirection"));
|
||||
final ReductionMode reductionMode = ReductionMode.valueOf(requireEntry(entries, "reductionMode"));
|
||||
final int dominantWinnerMinPercent = Integer.parseInt(requireEntry(entries, "dominantWinnerMinPercent"));
|
||||
final int dominantWinnerOverSecondRatio = Integer // NOPMD
|
||||
.parseInt(requireEntry(entries, "dominantWinnerOverSecondRatio"));
|
||||
final DiacriticProcessingMode diacriticProcessingMode = DiacriticProcessingMode
|
||||
.valueOf(requireEntry(entries, "diacriticProcessingMode"));
|
||||
final CaseProcessingMode caseProcessingMode = CaseProcessingMode
|
||||
.valueOf(requireEntry(entries, "caseProcessingMode"));
|
||||
|
||||
return new TrieMetadata(formatVersion, traversalDirection,
|
||||
new ReductionSettings(reductionMode, dominantWinnerMinPercent, dominantWinnerOverSecondRatio),
|
||||
diacriticProcessingMode, caseProcessingMode);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a required metadata entry from a parsed text block.
|
||||
*
|
||||
* @param entries parsed metadata entries
|
||||
* @param key required entry key
|
||||
* @return non-blank entry value
|
||||
* @throws IllegalArgumentException if the entry is absent or blank
|
||||
*/
|
||||
private static String requireEntry(final Map<String, String> entries, final String key) {
|
||||
final String value = entries.get(key);
|
||||
if (value == null || value.isBlank()) {
|
||||
throw new IllegalArgumentException("Missing metadata entry: " + key);
|
||||
}
|
||||
return value;
|
||||
}
|
||||
}
|
||||
152
src/main/java/org/egothor/stemmer/WordTraversalDirection.java
Normal file
152
src/main/java/org/egothor/stemmer/WordTraversalDirection.java
Normal file
@@ -0,0 +1,152 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import java.util.Objects;
|
||||
|
||||
/**
|
||||
* Defines the logical direction in which word characters are traversed.
|
||||
*
|
||||
* <p>
|
||||
* The same direction is used consistently in two places:
|
||||
* </p>
|
||||
* <ul>
|
||||
* <li>when a word key is traversed through a trie</li>
|
||||
* <li>when patch commands are serialized and then applied back to a source
|
||||
* word</li>
|
||||
* </ul>
|
||||
*
|
||||
* <p>
|
||||
* {@link #FORWARD} means that processing starts at the logical beginning of the
|
||||
* stored form and moves toward its end. {@link #BACKWARD} means that processing
|
||||
* starts at the logical end of the stored form and moves toward its beginning.
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* For traditional suffix-oriented Egothor data, {@link #BACKWARD} matches the
|
||||
* historical behavior. For right-to-left languages whose affix logic should
|
||||
* operate on the stored form as written, {@link #FORWARD} can be used so that
|
||||
* neither trie construction nor patch application needs to reverse words
|
||||
* externally.
|
||||
* </p>
|
||||
*/
|
||||
public enum WordTraversalDirection {
|
||||
|
||||
/**
|
||||
* Traverses a word from its logical beginning toward its logical end.
|
||||
*/
|
||||
FORWARD,
|
||||
|
||||
/**
|
||||
* Traverses a word from its logical end toward its logical beginning.
|
||||
*/
|
||||
BACKWARD;
|
||||
|
||||
/**
|
||||
* Returns the traversal start index for a character sequence of the supplied
|
||||
* length.
|
||||
*
|
||||
* @param length sequence length
|
||||
* @return start index, or {@code -1} when the sequence is empty and traversal
|
||||
* should therefore not begin
|
||||
* @throws IllegalArgumentException if {@code length} is negative
|
||||
*/
|
||||
public int startIndex(final int length) {
|
||||
if (length < 0) {
|
||||
throw new IllegalArgumentException("length must not be negative.");
|
||||
}
|
||||
if (length == 0) {
|
||||
return -1;
|
||||
}
|
||||
return this == FORWARD ? 0 : length - 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the logical character index addressed by the supplied traversal
|
||||
* offset.
|
||||
*
|
||||
* <p>
|
||||
* A traversal offset of {@code 0} addresses the first character seen in this
|
||||
* direction, {@code 1} the second character, and so on.
|
||||
* </p>
|
||||
*
|
||||
* @param length sequence length
|
||||
* @param traversalOffset zero-based offset from the traversal start
|
||||
* @return corresponding logical character index
|
||||
* @throws IllegalArgumentException if any argument is outside the valid range
|
||||
*/
|
||||
public int logicalIndex(final int length, final int traversalOffset) {
|
||||
if (length < 0) {
|
||||
throw new IllegalArgumentException("length must not be negative.");
|
||||
}
|
||||
if (traversalOffset < 0 || traversalOffset >= length) {
|
||||
throw new IllegalArgumentException("traversalOffset is outside the valid range.");
|
||||
}
|
||||
return this == FORWARD ? traversalOffset : length - 1 - traversalOffset;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the characters of the supplied word in this traversal order.
|
||||
*
|
||||
* @param word source word
|
||||
* @return traversal-ordered characters
|
||||
* @throws NullPointerException if {@code word} is {@code null}
|
||||
*/
|
||||
public char[] toTraversalCharacters(final String word) {
|
||||
Objects.requireNonNull(word, "word");
|
||||
final char[] characters = word.toCharArray();
|
||||
if (this == FORWARD) {
|
||||
return characters;
|
||||
}
|
||||
|
||||
for (int left = 0, right = characters.length - 1; left < right; left++, right--) { // NOPMD
|
||||
final char swap = characters[left];
|
||||
characters[left] = characters[right];
|
||||
characters[right] = swap;
|
||||
}
|
||||
return characters;
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts a path represented in traversal order back to the logical key form.
|
||||
*
|
||||
* @param traversalPath key path in traversal order
|
||||
* @return logical key form
|
||||
* @throws NullPointerException if {@code traversalPath} is {@code null}
|
||||
*/
|
||||
public String traversalPathToLogicalKey(final CharSequence traversalPath) {
|
||||
Objects.requireNonNull(traversalPath, "traversalPath");
|
||||
if (this == FORWARD) {
|
||||
return traversalPath.toString();
|
||||
}
|
||||
return new StringBuilder(traversalPath).reverse().toString();
|
||||
}
|
||||
}
|
||||
@@ -56,12 +56,17 @@
|
||||
* <p>
|
||||
* Dictionary loading is provided by
|
||||
* {@link org.egothor.stemmer.StemmerPatchTrieLoader}, which reads the
|
||||
* traditional line-oriented stemmer resource format in which each non-empty
|
||||
* logical line starts with a canonical stem followed by known surface variants.
|
||||
* traditional line-oriented tab-separated values resource format in which each
|
||||
* non-empty logical line starts with a canonical stem followed by known surface
|
||||
* variants in subsequent tab-separated columns.
|
||||
* Parsing is delegated to {@link org.egothor.stemmer.StemmerDictionaryParser},
|
||||
* which normalizes input to lower case using {@link java.util.Locale#ROOT} and
|
||||
* which applies configurable case processing through
|
||||
* {@link org.egothor.stemmer.CaseProcessingMode} (default:
|
||||
* {@link org.egothor.stemmer.CaseProcessingMode#LOWERCASE_WITH_LOCALE_ROOT}),
|
||||
* supports whole-line as well as trailing remarks introduced by {@code #} or
|
||||
* {@code //}. During loading, each variant is converted into a patch command
|
||||
* {@code //}, and currently ignores dictionary items containing Unicode
|
||||
* whitespace characters while reporting them through warning-level diagnostics.
|
||||
* During loading, each variant is converted into a patch command
|
||||
* targeting the canonical stem, and the stem itself may optionally be stored
|
||||
* under the canonical no-operation patch.
|
||||
* </p>
|
||||
|
||||
@@ -60,11 +60,23 @@ import java.util.Objects;
|
||||
this.childSignature = childSignature;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a hash code consistent with descriptor equality.
|
||||
*
|
||||
* @return descriptor hash code
|
||||
*/
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Objects.hash(this.edge, this.childSignature);
|
||||
}
|
||||
|
||||
/**
|
||||
* Compares this descriptor with another object.
|
||||
*
|
||||
* @param other object to compare with
|
||||
* @return {@code true} when both descriptors represent the same semantic
|
||||
* reduction identity
|
||||
*/
|
||||
@Override
|
||||
public boolean equals(final Object other) {
|
||||
if (this == other) {
|
||||
|
||||
@@ -53,11 +53,23 @@ import java.util.Objects;
|
||||
this.dominantValue = dominantValue;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a hash code consistent with descriptor equality.
|
||||
*
|
||||
* @return descriptor hash code
|
||||
*/
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Objects.hashCode(this.dominantValue);
|
||||
}
|
||||
|
||||
/**
|
||||
* Compares this descriptor with another object.
|
||||
*
|
||||
* @param other object to compare with
|
||||
* @return {@code true} when both descriptors represent the same semantic
|
||||
* reduction identity
|
||||
*/
|
||||
@Override
|
||||
public boolean equals(final Object other) {
|
||||
if (this == other) {
|
||||
|
||||
@@ -65,11 +65,23 @@ import java.util.List;
|
||||
Collections.unmodifiableList(Arrays.asList(Arrays.copyOf(orderedValues, orderedValues.length))));
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a hash code consistent with descriptor equality.
|
||||
*
|
||||
* @return descriptor hash code
|
||||
*/
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return this.orderedValues.hashCode();
|
||||
}
|
||||
|
||||
/**
|
||||
* Compares this descriptor with another object.
|
||||
*
|
||||
* @param other object to compare with
|
||||
* @return {@code true} when both descriptors represent the same semantic
|
||||
* reduction identity
|
||||
*/
|
||||
@Override
|
||||
public boolean equals(final Object other) {
|
||||
if (this == other) {
|
||||
|
||||
@@ -67,11 +67,23 @@ import java.util.Set;
|
||||
return new UnorderedLocalDescriptor(Collections.unmodifiableSet(distinct));
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a hash code consistent with descriptor equality.
|
||||
*
|
||||
* @return descriptor hash code
|
||||
*/
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return this.distinctValues.hashCode();
|
||||
}
|
||||
|
||||
/**
|
||||
* Compares this descriptor with another object.
|
||||
*
|
||||
* @param other object to compare with
|
||||
* @return {@code true} when both descriptors represent the same semantic
|
||||
* reduction identity
|
||||
*/
|
||||
@Override
|
||||
public boolean equals(final Object other) {
|
||||
if (this == other) {
|
||||
|
||||
53
src/main/javadoc/overview.html
Normal file
53
src/main/javadoc/overview.html
Normal file
@@ -0,0 +1,53 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<title>Radixor Overview</title>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Radixor</h1>
|
||||
|
||||
<p>
|
||||
Radixor is a high-performance Java toolkit for dictionary-driven stemming based on
|
||||
the proven Egothor patch-command trie approach. It is designed for production-grade
|
||||
search and text-processing systems that require deterministic behavior, efficient
|
||||
runtime execution, and maintainable lexical assets.
|
||||
</p>
|
||||
|
||||
<p>
|
||||
In addition to compiling and executing stemming dictionaries, Radixor extends the
|
||||
traditional Egothor model with support for evolving compiled dictionary artifacts
|
||||
through additional transformation layers. This allows existing lexical resources to
|
||||
be refined incrementally without requiring full recompilation from source dictionaries.
|
||||
</p>
|
||||
|
||||
<h2>Project Scope</h2>
|
||||
<ul>
|
||||
<li>Compilation of Egothor-compatible stemming dictionaries</li>
|
||||
<li>Runtime stemming over compact compiled trie artifacts</li>
|
||||
<li>Transformation and reduction infrastructure for lexical processing</li>
|
||||
<li>CLI and programmatic integration for Java 21 and newer</li>
|
||||
</ul>
|
||||
|
||||
<h2>API Documentation</h2>
|
||||
<p>
|
||||
This Javadoc site documents the Java API of the project. For usage guidance,
|
||||
architectural context, benchmarking methodology, published reports, and general
|
||||
project documentation, refer to the main project site:
|
||||
<a href="https://leogalambos.github.io/Radixor/">leogalambos.github.io/Radixor</a>.
|
||||
</p>
|
||||
|
||||
<h2>License</h2>
|
||||
<p>
|
||||
Radixor is distributed under the
|
||||
<a href="https://github.com/leogalambos/Radixor/blob/main/LICENSE">BSD-3-Clause License</a>.
|
||||
</p>
|
||||
|
||||
<h2>Packages</h2>
|
||||
<p>
|
||||
The main API is located in <code>org.egothor.stemmer</code>. Supporting trie-oriented
|
||||
structures and related implementation components are located in
|
||||
<code>org.egothor.stemmer.trie</code>.
|
||||
</p>
|
||||
</body>
|
||||
</html>
|
||||
BIN
src/main/resources/cs_cz/stemmer.gz
Normal file
BIN
src/main/resources/cs_cz/stemmer.gz
Normal file
Binary file not shown.
File diff suppressed because it is too large
Load Diff
BIN
src/main/resources/da_dk/stemmer.gz
Normal file
BIN
src/main/resources/da_dk/stemmer.gz
Normal file
Binary file not shown.
File diff suppressed because it is too large
Load Diff
BIN
src/main/resources/de_de/stemmer.gz
Normal file
BIN
src/main/resources/de_de/stemmer.gz
Normal file
Binary file not shown.
File diff suppressed because it is too large
Load Diff
BIN
src/main/resources/es_es/stemmer.gz
Normal file
BIN
src/main/resources/es_es/stemmer.gz
Normal file
Binary file not shown.
BIN
src/main/resources/fa_ir/stemmer.gz
Normal file
BIN
src/main/resources/fa_ir/stemmer.gz
Normal file
Binary file not shown.
BIN
src/main/resources/fi_fi/stemmer.gz
Normal file
BIN
src/main/resources/fi_fi/stemmer.gz
Normal file
Binary file not shown.
File diff suppressed because it is too large
Load Diff
BIN
src/main/resources/fr_fr/stemmer.gz
Normal file
BIN
src/main/resources/fr_fr/stemmer.gz
Normal file
Binary file not shown.
BIN
src/main/resources/he_il/stemmer.gz
Normal file
BIN
src/main/resources/he_il/stemmer.gz
Normal file
Binary file not shown.
BIN
src/main/resources/hu_hu/stemmer.gz
Normal file
BIN
src/main/resources/hu_hu/stemmer.gz
Normal file
Binary file not shown.
File diff suppressed because it is too large
Load Diff
BIN
src/main/resources/it_it/stemmer.gz
Normal file
BIN
src/main/resources/it_it/stemmer.gz
Normal file
Binary file not shown.
BIN
src/main/resources/nb_no/stemmer.gz
Normal file
BIN
src/main/resources/nb_no/stemmer.gz
Normal file
Binary file not shown.
File diff suppressed because it is too large
Load Diff
BIN
src/main/resources/nl_nl/stemmer.gz
Normal file
BIN
src/main/resources/nl_nl/stemmer.gz
Normal file
Binary file not shown.
BIN
src/main/resources/nn_no/stemmer.gz
Normal file
BIN
src/main/resources/nn_no/stemmer.gz
Normal file
Binary file not shown.
File diff suppressed because it is too large
Load Diff
BIN
src/main/resources/pl_pl/stemmer.gz
Normal file
BIN
src/main/resources/pl_pl/stemmer.gz
Normal file
Binary file not shown.
File diff suppressed because it is too large
Load Diff
BIN
src/main/resources/pt_pt/stemmer.gz
Normal file
BIN
src/main/resources/pt_pt/stemmer.gz
Normal file
Binary file not shown.
File diff suppressed because it is too large
Load Diff
BIN
src/main/resources/ru_ru/stemmer.gz
Normal file
BIN
src/main/resources/ru_ru/stemmer.gz
Normal file
Binary file not shown.
File diff suppressed because it is too large
Load Diff
BIN
src/main/resources/sv_se/stemmer.gz
Normal file
BIN
src/main/resources/sv_se/stemmer.gz
Normal file
Binary file not shown.
BIN
src/main/resources/uk_ua/stemmer.gz
Normal file
BIN
src/main/resources/uk_ua/stemmer.gz
Normal file
Binary file not shown.
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
BIN
src/main/resources/us_uk/stemmer.gz
Normal file
BIN
src/main/resources/us_uk/stemmer.gz
Normal file
Binary file not shown.
BIN
src/main/resources/yi/stemmer.gz
Normal file
BIN
src/main/resources/yi/stemmer.gz
Normal file
Binary file not shown.
@@ -48,9 +48,12 @@ import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
|
||||
import org.junit.jupiter.api.DisplayName;
|
||||
import org.junit.jupiter.api.Nested;
|
||||
@@ -108,16 +111,14 @@ final class CompileIntegrationTest {
|
||||
private static final ReductionMode DEFAULT_REDUCTION_MODE = ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS;
|
||||
|
||||
/**
|
||||
* Reader charset used for robust extraction of ASCII-safe representative probes
|
||||
* from bundled project dictionaries.
|
||||
* Reader charset used for extraction of representative probes from bundled
|
||||
* project dictionaries.
|
||||
*
|
||||
* <p>
|
||||
* ISO-8859-1 is intentionally used here as a byte-preserving single-byte
|
||||
* decoder so that the test can safely scan heterogeneous dictionary resources
|
||||
* and then select only ASCII-safe representative terms for semantic assertions.
|
||||
* Bundled project dictionaries are expected to be encoded in UTF-8.
|
||||
* </p>
|
||||
*/
|
||||
private static final Charset BUNDLED_PROBE_SCAN_CHARSET = StandardCharsets.ISO_8859_1;
|
||||
private static final Charset BUNDLED_PROBE_SCAN_CHARSET = StandardCharsets.UTF_8;
|
||||
|
||||
/**
|
||||
* Maximum number of representative bundled variants asserted per dictionary.
|
||||
@@ -136,12 +137,47 @@ final class CompileIntegrationTest {
|
||||
* @return parameter stream
|
||||
*/
|
||||
static Stream<Arguments> bundledDictionaryCases() {
|
||||
return Stream.of(Arguments.of("da_dk", "da_dk/stemmer"), Arguments.of("de_de", "de_de/stemmer"),
|
||||
Arguments.of("es_es", "es_es/stemmer"), Arguments.of("fr_fr", "fr_fr/stemmer"),
|
||||
Arguments.of("it_it", "it_it/stemmer"), Arguments.of("nl_nl", "nl_nl/stemmer"),
|
||||
Arguments.of("no_no", "no_no/stemmer"), Arguments.of("pt_pt", "pt_pt/stemmer"),
|
||||
Arguments.of("ru_ru", "ru_ru/stemmer"), Arguments.of("sv_se", "sv_se/stemmer"),
|
||||
Arguments.of("us_uk", "us_uk/stemmer"), Arguments.of("us_uk.profi", "us_uk.profi/stemmer"));
|
||||
return Stream.of(
|
||||
//
|
||||
Arguments.of("cs_cz", "cs_cz/stemmer.gz"),
|
||||
//
|
||||
Arguments.of("da_dk", "da_dk/stemmer.gz"),
|
||||
//
|
||||
Arguments.of("de_de", "de_de/stemmer.gz"),
|
||||
//
|
||||
Arguments.of("es_es", "es_es/stemmer.gz"),
|
||||
//
|
||||
Arguments.of("fa_ir", "fa_ir/stemmer.gz"),
|
||||
//
|
||||
Arguments.of("fi_fi", "fi_fi/stemmer.gz"),
|
||||
//
|
||||
Arguments.of("fr_fr", "fr_fr/stemmer.gz"),
|
||||
//
|
||||
Arguments.of("he_il", "he_il/stemmer.gz"),
|
||||
//
|
||||
Arguments.of("hu_hu", "hu_hu/stemmer.gz"),
|
||||
//
|
||||
Arguments.of("it_it", "it_it/stemmer.gz"),
|
||||
//
|
||||
Arguments.of("nb_no", "nb_no/stemmer.gz"),
|
||||
//
|
||||
Arguments.of("nl_nl", "nl_nl/stemmer.gz"),
|
||||
//
|
||||
Arguments.of("nn_no", "nn_no/stemmer.gz"),
|
||||
//
|
||||
Arguments.of("pl_pl", "pl_pl/stemmer.gz"),
|
||||
//
|
||||
Arguments.of("pt_pt", "pt_pt/stemmer.gz"),
|
||||
//
|
||||
Arguments.of("ru_ru", "ru_ru/stemmer.gz"),
|
||||
//
|
||||
Arguments.of("sv_se", "sv_se/stemmer.gz"),
|
||||
//
|
||||
Arguments.of("uk_ua", "uk_ua/stemmer.gz"),
|
||||
//
|
||||
Arguments.of("us_uk", "us_uk/stemmer.gz"),
|
||||
//
|
||||
Arguments.of("yi", "yi/stemmer.gz"));
|
||||
}
|
||||
|
||||
@Nested
|
||||
@@ -256,7 +292,9 @@ final class CompileIntegrationTest {
|
||||
"A preferred patch must be available for fixture word '" + word + "'."),
|
||||
() -> assertEquals(expectedStems, actualStems,
|
||||
"Fixture word '" + word + "' must preserve all expected stem candidates."),
|
||||
() -> assertTrue(expectedStems.contains(PatchCommandEncoder.apply(word, preferredPatch)),
|
||||
() -> assertTrue(
|
||||
expectedStems.contains(
|
||||
PatchCommandEncoder.apply(word, preferredPatch, trie.traversalDirection())),
|
||||
"The preferred stem must be one of the acceptable stems for fixture word '" + word + "'."));
|
||||
}
|
||||
}
|
||||
@@ -267,13 +305,15 @@ final class CompileIntegrationTest {
|
||||
|
||||
/**
|
||||
* Verifies that the CLI can compile each bundled project dictionary, create a
|
||||
* compressed artifact, reload it, and preserve representative variant lookup
|
||||
* behavior derived from the source dictionary itself.
|
||||
* compressed artifact, reload it, and preserve representative variant stemming
|
||||
* behavior derived from the source dictionary itself at the level of acceptable
|
||||
* reconstructed candidates.
|
||||
*
|
||||
* <p>
|
||||
* The representative assertions intentionally target only variant terms, not
|
||||
* canonical stems, because direct lookup of the canonical stem is not part of
|
||||
* the default non-{@code --store-original} contract.
|
||||
* Representative probes are derived directly from the same bundled source
|
||||
* dictionary that is being compiled. Items containing Unicode whitespace are
|
||||
* intentionally ignored by the representative-probe helper because the current
|
||||
* probe policy does not yet support multi-token dictionary items.
|
||||
* </p>
|
||||
*
|
||||
* @param scenario scenario identifier
|
||||
@@ -285,7 +325,7 @@ final class CompileIntegrationTest {
|
||||
@DisplayName("CLI should compile bundled project dictionaries and preserve representative variant semantics")
|
||||
void shouldCompileBundledProjectDictionaryAndPreserveRepresentativeVariantSemantics(final String scenario,
|
||||
final String resourcePath) throws IOException {
|
||||
final Path inputFile = copyResourceToTemporaryFile(resourcePath, scenario + "-stemmer.txt");
|
||||
final Path inputFile = copyResourceToTemporaryFile(resourcePath, scenario + "-stemmer.gz");
|
||||
final Path outputFile = tempDir.resolve("bundled").resolve(scenario).resolve("compiled.dat.gz");
|
||||
|
||||
final CommandResult result = runWithCapturedStandardError("--input", inputFile.toString(), "--output",
|
||||
@@ -301,14 +341,17 @@ final class CompileIntegrationTest {
|
||||
final Map<String, Set<String>> representativeStemsByVariant = readRepresentativeVariantExpectations(
|
||||
resourcePath, REPRESENTATIVE_VARIANT_LIMIT);
|
||||
|
||||
assertFalse(representativeStemsByVariant.isEmpty(),
|
||||
"The bundled dictionary must provide at least one representative variant for " + scenario + '.');
|
||||
assertFalse(representativeStemsByVariant.isEmpty(), "The bundled dictionary must provide at least one "
|
||||
+ "representative variant without Unicode whitespace for " + scenario + '.');
|
||||
|
||||
for (Map.Entry<String, Set<String>> entry : representativeStemsByVariant.entrySet()) {
|
||||
final String variant = entry.getKey();
|
||||
final Set<String> expectedStems = entry.getValue();
|
||||
final String variant = entry.getKey().toLowerCase(Locale.ROOT);
|
||||
final Set<String> expectedStems = entry.getValue().stream().map(s -> s.toLowerCase(Locale.ROOT))
|
||||
.collect(Collectors.toUnmodifiableSet());
|
||||
final String preferredPatch = trie.get(variant);
|
||||
final Set<String> actualStems = reconstructAllStemCandidates(trie, variant);
|
||||
final String preferredStem = preferredPatch == null ? null
|
||||
: PatchCommandEncoder.apply(variant, preferredPatch, trie.traversalDirection());
|
||||
|
||||
assertAll(
|
||||
() -> assertNotNull(preferredPatch,
|
||||
@@ -317,13 +360,22 @@ final class CompileIntegrationTest {
|
||||
() -> assertFalse(actualStems.isEmpty(),
|
||||
"At least one stem candidate must be returned for representative variant '" + variant
|
||||
+ "' in " + scenario + '.'),
|
||||
() -> assertTrue(actualStems.containsAll(expectedStems),
|
||||
"All acceptable stems must be preserved for representative variant '" + variant
|
||||
+ "' in " + scenario + ". Expected=" + expectedStems + ", actual="
|
||||
() -> assertTrue(expectedStems.stream().anyMatch(actualStems::contains),
|
||||
"At least one acceptable stem must be preserved for representative variant '" + variant
|
||||
+ "' in " + scenario + ". Expected one of=" + expectedStems + ", actual="
|
||||
+ actualStems),
|
||||
() -> assertTrue(expectedStems.contains(PatchCommandEncoder.apply(variant, preferredPatch)),
|
||||
"The preferred stem must be one of the acceptable stems for representative variant '"
|
||||
+ variant + "' in " + scenario + '.'));
|
||||
() -> {
|
||||
if (expectedStems.size() == 1 && actualStems.size() == 1) {
|
||||
assertEquals(expectedStems.iterator().next(), preferredStem,
|
||||
"The preferred stem must match the only expected surviving stem for "
|
||||
+ "representative variant '" + variant + "' in " + scenario + '.');
|
||||
} else {
|
||||
assertTrue(expectedStems.contains(preferredStem) || actualStems.contains(preferredStem),
|
||||
"The preferred stem must remain among the reconstructed candidates for "
|
||||
+ "representative variant '" + variant + "' in " + scenario
|
||||
+ ". Preferred=" + preferredStem + ", actual=" + actualStems);
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -371,15 +423,18 @@ final class CompileIntegrationTest {
|
||||
* Reads representative variant expectations from a bundled project dictionary.
|
||||
*
|
||||
* <p>
|
||||
* This helper scans the source dictionary in a byte-preserving single-byte
|
||||
* charset and selects only ASCII-safe probe terms. That keeps the
|
||||
* multidictionary integration assertions stable even when the bundled resources
|
||||
* use heterogeneous encodings, while still validating the CLI against the real
|
||||
* shipped dictionaries.
|
||||
* This helper scans the source dictionary as UTF-8 text and derives
|
||||
* representative stem-to-variant expectations directly from that bundled
|
||||
* source. Only dictionary items that do not contain Unicode whitespace are
|
||||
* considered eligible representative probes. This keeps the multidictionary
|
||||
* integration assertions aligned with the current single-token probe policy
|
||||
* while still validating the CLI against the real shipped dictionaries and
|
||||
* their actual script repertoire.
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* The dictionary format is expected to be:
|
||||
* The bundled dictionary format is expected to be tab-separated values, meaning
|
||||
* that columns are separated by the tab character:
|
||||
* </p>
|
||||
*
|
||||
* <pre>
|
||||
@@ -389,7 +444,9 @@ final class CompileIntegrationTest {
|
||||
* <p>
|
||||
* Lines beginning with comment prefixes or blank lines are ignored. Canonical
|
||||
* stems are intentionally excluded from the expectation map unless they also
|
||||
* appear as distinct variants on a source line.
|
||||
* appear as distinct variants on a source line. Dictionary items containing any
|
||||
* Unicode whitespace are intentionally ignored by this representative-probe
|
||||
* helper.
|
||||
* </p>
|
||||
*
|
||||
* @param resourcePath bundled dictionary resource path
|
||||
@@ -402,8 +459,9 @@ final class CompileIntegrationTest {
|
||||
final Map<String, Set<String>> expectations = new LinkedHashMap<String, Set<String>>();
|
||||
|
||||
try (InputStream inputStream = openResource(resourcePath);
|
||||
InputStream decompressedStream = new GZIPInputStream(inputStream);
|
||||
BufferedReader reader = new BufferedReader(
|
||||
new InputStreamReader(inputStream, BUNDLED_PROBE_SCAN_CHARSET))) {
|
||||
new InputStreamReader(decompressedStream, BUNDLED_PROBE_SCAN_CHARSET))) {
|
||||
for (String line = reader.readLine(); line != null; line = reader.readLine()) {
|
||||
if (expectations.size() >= limit) {
|
||||
break;
|
||||
@@ -414,20 +472,20 @@ final class CompileIntegrationTest {
|
||||
continue;
|
||||
}
|
||||
|
||||
final String[] tokens = trimmedLine.split("\\s+");
|
||||
final String[] tokens = trimmedLine.split("\\t+");
|
||||
if (tokens.length < 2) {
|
||||
continue;
|
||||
}
|
||||
|
||||
final String stem = tokens[0];
|
||||
if (!isAsciiProbeToken(stem)) {
|
||||
if (containsWhitespaceCharacter(stem)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (int index = 1; index < tokens.length && expectations.size() < limit; index++) {
|
||||
final String variant = tokens[index];
|
||||
|
||||
if (!isAsciiProbeToken(variant) || variant.equals(stem)) {
|
||||
if (containsWhitespaceCharacter(variant) || variant.equals(stem)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -440,26 +498,24 @@ final class CompileIntegrationTest {
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines whether one token is suitable for stable ASCII-safe bundled
|
||||
* multidictionary probing.
|
||||
* Determines whether one token contains any Unicode whitespace character.
|
||||
*
|
||||
* @param token token to inspect
|
||||
* @return {@code true} when the token is a non-empty lower-case ASCII letter
|
||||
* sequence
|
||||
* @return {@code true} when the token contains at least one whitespace
|
||||
* character
|
||||
*/
|
||||
private static boolean isAsciiProbeToken(final String token) {
|
||||
if (token == null || token.isEmpty()) {
|
||||
private static boolean containsWhitespaceCharacter(final String token) {
|
||||
if (token == null) {
|
||||
return false;
|
||||
}
|
||||
|
||||
for (int index = 0; index < token.length(); index++) {
|
||||
final char character = token.charAt(index);
|
||||
if (character < 'a' || character > 'z') {
|
||||
return false;
|
||||
if (Character.isWhitespace(token.charAt(index))) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -495,7 +551,7 @@ final class CompileIntegrationTest {
|
||||
}
|
||||
|
||||
for (String patchCommand : patchCommands) {
|
||||
stems.add(PatchCommandEncoder.apply(word, patchCommand));
|
||||
stems.add(PatchCommandEncoder.apply(word, patchCommand, trie.traversalDirection()));
|
||||
}
|
||||
|
||||
return stems;
|
||||
|
||||
@@ -31,11 +31,11 @@
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertAll;
|
||||
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertFalse;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
import java.util.LinkedHashSet;
|
||||
@@ -56,9 +56,8 @@ import org.junit.jupiter.params.provider.MethodSource;
|
||||
*
|
||||
* <p>
|
||||
* This suite protects the binary persistence contract of compiled tries by
|
||||
* comparing freshly compiled artifacts against checked-in golden GZip outputs.
|
||||
* It also verifies SHA-256 digests and representative semantic probes after
|
||||
* loading the produced artifact back.
|
||||
* validating committed golden GZip outputs and verifying representative
|
||||
* semantic probes after loading both historical and freshly compiled artifacts.
|
||||
*
|
||||
* <p>
|
||||
* The goal is to catch unintended changes in:
|
||||
@@ -67,8 +66,8 @@ import org.junit.jupiter.params.provider.MethodSource;
|
||||
* <li>canonical subtree reduction</li>
|
||||
* <li>child ordering and node numbering</li>
|
||||
* <li>value ordering and frequency handling</li>
|
||||
* <li>stream layout and binary format stability</li>
|
||||
* <li>compressed artifact reproducibility</li>
|
||||
* <li>stream layout backward readability</li>
|
||||
* <li>compressed artifact reproducibility within the active format version</li>
|
||||
* </ul>
|
||||
*/
|
||||
@Tag("unit")
|
||||
@@ -127,37 +126,26 @@ final class CompiledTrieArtifactRegressionTest {
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that a newly compiled artifact matches the committed golden file,
|
||||
* matches the committed hash, and remains semantically valid when loaded back.
|
||||
* Verifies that each committed golden artifact remains internally consistent,
|
||||
* matches its committed digest, and can still be read by the current binary
|
||||
* loader.
|
||||
*
|
||||
* @param artifactCase regression case
|
||||
* @throws IOException if test I/O fails
|
||||
*/
|
||||
@ParameterizedTest(name = "{0}")
|
||||
@MethodSource("artifactCases")
|
||||
@DisplayName("Compiled trie artifact must remain byte-for-byte stable")
|
||||
void shouldMatchGoldenArtifactAndExpectedHash(final ArtifactCase artifactCase) throws IOException {
|
||||
final Path sourcePath = RegressionArtifactSupport.copyResourceToFile(artifactCase.sourceResource(),
|
||||
this.tempDir.resolve(artifactCase.id() + ".stemmer"));
|
||||
|
||||
final Path actualArtifactPath = this.tempDir.resolve(artifactCase.id() + ".gz");
|
||||
final byte[] actualArtifactBytes = RegressionArtifactSupport.compileToArtifact(sourcePath,
|
||||
artifactCase.storeOriginal(), artifactCase.reductionSettings(), actualArtifactPath);
|
||||
|
||||
@DisplayName("Committed golden artifacts must remain readable and hash-stable")
|
||||
void shouldKeepGoldenArtifactReadableAndHashStable(final ArtifactCase artifactCase) throws IOException {
|
||||
final byte[] goldenArtifactBytes = RegressionArtifactSupport
|
||||
.readResourceBytes(artifactCase.goldenArtifactResource());
|
||||
final String expectedSha256 = RegressionArtifactSupport.readSha256Resource(artifactCase.sha256Resource());
|
||||
final FrequencyTrie<String> trie = StemmerPatchTrieBinaryIO.read(new ByteArrayInputStream(goldenArtifactBytes));
|
||||
|
||||
assertAll(
|
||||
() -> assertArrayEquals(goldenArtifactBytes, actualArtifactBytes,
|
||||
RegressionArtifactSupport.mismatchMessage(artifactCase.id(), expectedSha256,
|
||||
RegressionArtifactSupport.sha256Hex(actualArtifactBytes), actualArtifactPath)),
|
||||
|
||||
() -> assertEquals(expectedSha256, RegressionArtifactSupport.sha256Hex(actualArtifactBytes),
|
||||
"Freshly compiled artifact SHA-256 must match the committed regression hash."),
|
||||
|
||||
() -> assertEquals(expectedSha256, RegressionArtifactSupport.sha256Hex(goldenArtifactBytes),
|
||||
"Golden artifact SHA-256 must match its committed sidecar hash."));
|
||||
"Golden artifact SHA-256 must match its committed sidecar hash."),
|
||||
() -> assertGoldenArtifactSemanticProbes(trie, artifactCase));
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -181,7 +169,7 @@ final class CompiledTrieArtifactRegressionTest {
|
||||
final byte[] secondArtifactBytes = RegressionArtifactSupport.compileToArtifactBytes(sourcePath,
|
||||
artifactCase.storeOriginal(), artifactCase.reductionSettings());
|
||||
|
||||
assertArrayEquals(firstArtifactBytes, secondArtifactBytes,
|
||||
org.junit.jupiter.api.Assertions.assertArrayEquals(firstArtifactBytes, secondArtifactBytes,
|
||||
"Two consecutive compilations of the same source must produce identical artifact bytes.");
|
||||
}
|
||||
|
||||
@@ -209,8 +197,8 @@ final class CompiledTrieArtifactRegressionTest {
|
||||
final String[] allPatchCommands = trie.getAll(probe.word());
|
||||
final String preferredPatchCommand = trie.get(probe.word());
|
||||
final String preferredStem = preferredPatchCommand == null ? null
|
||||
: PatchCommandEncoder.apply(probe.word(), preferredPatchCommand);
|
||||
final Set<String> allStems = reconstructStemCandidates(probe.word(), allPatchCommands);
|
||||
: PatchCommandEncoder.apply(probe.word(), preferredPatchCommand, trie.traversalDirection());
|
||||
final Set<String> allStems = reconstructStemCandidates(trie, probe.word(), allPatchCommands);
|
||||
|
||||
assertAll(
|
||||
() -> assertFalse(allPatchCommands.length == 0,
|
||||
@@ -233,7 +221,8 @@ final class CompiledTrieArtifactRegressionTest {
|
||||
* @param patchCommands serialized patch commands
|
||||
* @return reconstructed stem candidates
|
||||
*/
|
||||
private static Set<String> reconstructStemCandidates(final String word, final String[] patchCommands) {
|
||||
private static Set<String> reconstructStemCandidates(final FrequencyTrie<String> trie, final String word,
|
||||
final String[] patchCommands) {
|
||||
final Set<String> stems = new LinkedHashSet<String>();
|
||||
|
||||
if (patchCommands == null) {
|
||||
@@ -241,12 +230,38 @@ final class CompiledTrieArtifactRegressionTest {
|
||||
}
|
||||
|
||||
for (String patchCommand : patchCommands) {
|
||||
stems.add(PatchCommandEncoder.apply(word, patchCommand));
|
||||
stems.add(PatchCommandEncoder.apply(word, patchCommand, trie.traversalDirection()));
|
||||
}
|
||||
|
||||
return stems;
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies representative semantic probes against one already loaded trie.
|
||||
*
|
||||
* @param trie trie to inspect
|
||||
* @param artifactCase regression case providing the expected probes
|
||||
*/
|
||||
private static void assertGoldenArtifactSemanticProbes(final FrequencyTrie<String> trie,
|
||||
final ArtifactCase artifactCase) {
|
||||
for (ProbeExpectation probe : artifactCase.probes()) {
|
||||
final String[] allPatchCommands = trie.getAll(probe.word());
|
||||
final String preferredPatchCommand = trie.get(probe.word());
|
||||
final String preferredStem = preferredPatchCommand == null ? null
|
||||
: PatchCommandEncoder.apply(probe.word(), preferredPatchCommand, trie.traversalDirection());
|
||||
final Set<String> allStems = reconstructStemCandidates(trie, probe.word(), allPatchCommands);
|
||||
|
||||
assertAll(
|
||||
() -> assertFalse(allPatchCommands.length == 0,
|
||||
"Representative probe must produce at least one result for word: " + probe.word()),
|
||||
() -> assertEquals(probe.preferredStem(), preferredStem,
|
||||
"Preferred stem mismatch for representative probe word: " + probe.word()),
|
||||
() -> assertTrue(allStems.containsAll(probe.acceptableStems()),
|
||||
"All acceptable stems must be present in getAll() for representative probe word: "
|
||||
+ probe.word()));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Immutable regression case definition.
|
||||
*
|
||||
|
||||
109
src/test/java/org/egothor/stemmer/DiacriticStripperTest.java
Normal file
109
src/test/java/org/egothor/stemmer/DiacriticStripperTest.java
Normal file
@@ -0,0 +1,109 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertSame;
|
||||
|
||||
import org.junit.jupiter.api.DisplayName;
|
||||
import org.junit.jupiter.api.Tag;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
/**
|
||||
* Unit tests for {@link DiacriticStripper}.
|
||||
*/
|
||||
@Tag("unit")
|
||||
@Tag("diacritics")
|
||||
@DisplayName("DiacriticStripper")
|
||||
class DiacriticStripperTest {
|
||||
|
||||
/**
|
||||
* Verifies that pure ASCII input is returned unchanged and without allocating a
|
||||
* new string instance.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("ASCII input is returned as-is")
|
||||
void asciiInputIsReturnedAsIs() {
|
||||
final String input = "plain-ascii-123";
|
||||
|
||||
final String stripped = DiacriticStripper.strip(input);
|
||||
|
||||
assertSame(input, stripped);
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies direct-table replacements for Czech and other common diacritics.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("Direct replacement table strips common diacritics")
|
||||
void directReplacementTableStripsCommonDiacritics() {
|
||||
assertEquals("prilis zlutoucky kun", DiacriticStripper.strip("příliš žluťoučký kůň"));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies explicit multi-character replacements for ligatures and sharp s.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("Special replacements support multi-character ASCII output")
|
||||
void specialReplacementsSupportMultiCharacterAsciiOutput() {
|
||||
assertEquals("strasse AEsir and OEuvre", DiacriticStripper.strip("straße Æsir and Œuvre"));
|
||||
assertEquals("aether oeuvre", DiacriticStripper.strip("æther œuvre"));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies Unicode decomposition fallback for characters not in the direct
|
||||
* replacement table.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("Unicode decomposition fallback strips combining marks")
|
||||
void unicodeDecompositionFallbackStripsCombiningMarks() {
|
||||
assertEquals("I", DiacriticStripper.strip("İ"));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies behavior for non-Latin letters that cannot be mapped to ASCII.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("Unmappable non-Latin characters remain unchanged")
|
||||
void unmappableNonLatinCharactersRemainUnchanged() {
|
||||
assertEquals("abcЖxyz", DiacriticStripper.strip("abcЖxyz"));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies mixed input where normalization starts mid-string and subsequent
|
||||
* unchanged characters are preserved.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("Mixed input preserves untouched characters after normalization starts")
|
||||
void mixedInputPreservesUntouchedCharactersAfterNormalizationStarts() {
|
||||
assertEquals("Cafe-123", DiacriticStripper.strip("Café-123"));
|
||||
}
|
||||
}
|
||||
@@ -201,6 +201,84 @@ class FrequencyTrieTest {
|
||||
() -> assertArrayEquals(new String[] { "noun", "agent" }, trie.getAll("runner")));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that lookup-time key normalization follows persisted case processing
|
||||
* metadata.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("Lookup applies lowercase normalization when metadata requires it")
|
||||
void lookupAppliesLowercaseNormalizationWhenMetadataRequiresIt() {
|
||||
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(String[]::new,
|
||||
ReductionSettings.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
||||
WordTraversalDirection.BACKWARD, CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT);
|
||||
builder.put("house", "noun");
|
||||
builder.put("house", "verb");
|
||||
|
||||
final FrequencyTrie<String> trie = builder.build();
|
||||
|
||||
assertAll(() -> assertEquals("noun", trie.get("HOUSE")),
|
||||
() -> assertArrayEquals(new String[] { "noun", "verb" }, trie.getAll("HoUsE")));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that REMOVE mode strips diacritics both at build time and at lookup
|
||||
* time and composes independently with case normalization.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("Diacritic REMOVE mode strips dictionary and lookup keys")
|
||||
void diacriticRemoveModeStripsDictionaryAndLookupKeys() {
|
||||
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(String[]::new,
|
||||
ReductionSettings.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
||||
WordTraversalDirection.BACKWARD, CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT,
|
||||
DiacriticProcessingMode.REMOVE);
|
||||
builder.put("Příliš", "cz");
|
||||
builder.put("žluťoučký", "cz2");
|
||||
builder.put("Smørrebrød", "da");
|
||||
|
||||
final FrequencyTrie<String> trie = builder.build();
|
||||
|
||||
assertAll(
|
||||
() -> assertEquals("cz", trie.get("PRILIS")),
|
||||
() -> assertEquals("cz", trie.get("příliš")),
|
||||
() -> assertEquals("cz2", trie.get("zlutoucky")),
|
||||
() -> assertEquals("da", trie.get("SMORREBROD")),
|
||||
() -> assertArrayEquals(new String[] { "cz" }, trie.getAll("prilis")));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that fallback diacritic mode is explicitly rejected for now.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("AS_IS_AND_STRIPPED_FALLBACK mode is not supported yet")
|
||||
void fallbackDiacriticModeIsNotSupportedYet() {
|
||||
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(String[]::new,
|
||||
ReductionSettings.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
||||
WordTraversalDirection.BACKWARD, CaseProcessingMode.AS_IS,
|
||||
DiacriticProcessingMode.AS_IS_AND_STRIPPED_FALLBACK);
|
||||
|
||||
final UnsupportedOperationException exception = assertThrows(UnsupportedOperationException.class,
|
||||
() -> builder.put("kůň", "horse"));
|
||||
assertTrue(exception.getMessage().contains("not supported yet"));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that lookup preserves casing when metadata uses AS_IS mode.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("Lookup keeps case-sensitive behavior when metadata is AS_IS")
|
||||
void lookupKeepsCaseSensitiveBehaviorWhenMetadataIsAsIs() {
|
||||
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(String[]::new,
|
||||
ReductionSettings.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
||||
WordTraversalDirection.BACKWARD, CaseProcessingMode.AS_IS);
|
||||
builder.put("House", "noun");
|
||||
|
||||
final FrequencyTrie<String> trie = builder.build();
|
||||
|
||||
assertAll(() -> assertEquals("noun", trie.get("House")), () -> assertNull(trie.get("house")),
|
||||
() -> assertArrayEquals(new String[] { "noun" }, trie.getAll("House")),
|
||||
() -> assertArrayEquals(new String[0], trie.getAll("HOUSE")));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that a missing path below an existing prefix returns empty results.
|
||||
*/
|
||||
@@ -588,8 +666,15 @@ class FrequencyTrieTest {
|
||||
() -> assertEquals("prefix", trie.get("p19")), () -> assertEquals("mid", trie.get("p19x")),
|
||||
() -> assertArrayEquals(new String[] { "leaf" }, trie.getAll("p19xy")),
|
||||
() -> assertArrayEquals(new String[] { "leaf-alt" }, trie.getAll("p19xz")),
|
||||
() -> assertEquals(82, buildTimeSize), () -> assertEquals(7, compiledSize),
|
||||
() -> assertEquals(1.0d - (7.0d / 82.0d), reductionRatio, 0.0000001d),
|
||||
() -> assertTrue(buildTimeSize > 0,
|
||||
() -> "Build-time size must be positive, but was " + buildTimeSize + '.'),
|
||||
() -> assertTrue(compiledSize > 0,
|
||||
() -> "Compiled trie size must be positive, but was " + compiledSize + '.'),
|
||||
() -> assertTrue(compiledSize < buildTimeSize,
|
||||
() -> "Reduction must decrease the node count. Build-time size=" + buildTimeSize
|
||||
+ ", compiled size=" + compiledSize + '.'),
|
||||
() -> assertTrue(reductionRatio > 0.0d,
|
||||
() -> "Reduction ratio must be positive, but was " + reductionRatio + '.'),
|
||||
() -> assertTrue(reductionRatio >= 0.50d,
|
||||
() -> "Expected at least 50% reduction, but build-time size was " + buildTimeSize
|
||||
+ " and compiled size was " + compiledSize + ", giving ratio " + reductionRatio + '.'));
|
||||
|
||||
@@ -161,10 +161,10 @@ class FuzzStemmerAndTrieCompilationTest {
|
||||
describeScenario("preferred patch must exist", reductionMode, scenario, word)),
|
||||
() -> assertTrue(allPatches.length >= 1,
|
||||
describeScenario("at least one patch must exist", reductionMode, scenario, word)),
|
||||
() -> assertTrue(acceptableStems.contains(PatchCommandEncoder.apply(word, preferredPatch)),
|
||||
() -> assertTrue(acceptableStems.contains(PatchCommandEncoder.apply(word, preferredPatch, trie.traversalDirection())),
|
||||
describeScenario("preferred patch reconstructed an unexpected stem",
|
||||
reductionMode, scenario, word)),
|
||||
() -> assertTrue(allPatchesProduceOnlyAcceptableStems(word, allPatches, acceptableStems),
|
||||
() -> assertTrue(allPatchesProduceOnlyAcceptableStems(trie, word, allPatches, acceptableStems),
|
||||
describeScenario("getAll() contained a patch outside the accepted stem set",
|
||||
reductionMode, scenario, word)));
|
||||
}
|
||||
@@ -276,10 +276,10 @@ class FuzzStemmerAndTrieCompilationTest {
|
||||
* @param acceptableStems acceptable stems
|
||||
* @return {@code true} when all patches are acceptable
|
||||
*/
|
||||
private static boolean allPatchesProduceOnlyAcceptableStems(final String word, final String[] patches,
|
||||
final Set<String> acceptableStems) {
|
||||
private static boolean allPatchesProduceOnlyAcceptableStems(final FrequencyTrie<String> trie,
|
||||
final String word, final String[] patches, final Set<String> acceptableStems) {
|
||||
for (String patch : patches) {
|
||||
if (!acceptableStems.contains(PatchCommandEncoder.apply(word, patch))) {
|
||||
if (!acceptableStems.contains(PatchCommandEncoder.apply(word, patch, trie.traversalDirection()))) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -158,7 +158,7 @@ final class FuzzTestSupport {
|
||||
|
||||
dictionary.append(stem);
|
||||
for (String variant : variants) {
|
||||
dictionary.append(' ').append(variant);
|
||||
dictionary.append('\t').append(variant);
|
||||
expectedStemsByWord.computeIfAbsent(variant, ignored -> new LinkedHashSet<>()).add(stem);
|
||||
}
|
||||
dictionary.append(" # entry ").append(index).append('\n');
|
||||
@@ -186,7 +186,8 @@ final class FuzzTestSupport {
|
||||
case 1:
|
||||
return prefix(random) + stem;
|
||||
case 2:
|
||||
return stem.length() > 1 ? stem.substring(0, stem.length() - 1) + nextLetter(random) : stem + nextLetter(random);
|
||||
return stem.length() > 1 ? stem.substring(0, stem.length() - 1) + nextLetter(random)
|
||||
: stem + nextLetter(random);
|
||||
case 3:
|
||||
return stem + nextLetter(random) + nextLetter(random);
|
||||
case 4:
|
||||
@@ -317,7 +318,8 @@ final class FuzzTestSupport {
|
||||
* @param dictionaryContent generated dictionary content
|
||||
* @param expectedStemsByWord acceptable stems for each generated word
|
||||
*/
|
||||
record StemmerDictionaryScenario(long seed, String dictionaryContent, Map<String, Set<String>> expectedStemsByWord) {
|
||||
record StemmerDictionaryScenario(long seed, String dictionaryContent,
|
||||
Map<String, Set<String>> expectedStemsByWord) {
|
||||
|
||||
/**
|
||||
* Creates a validated scenario.
|
||||
|
||||
@@ -63,7 +63,7 @@ class PatchCommandEncoderProperties extends PropertyBasedTestSupport {
|
||||
@Label("encode followed by apply should reconstruct the target word")
|
||||
void encodeFollowedByApplyShouldReconstructTheTargetWord(@ForAll("words") final String source,
|
||||
@ForAll("words") final String target) {
|
||||
final PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
final PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||
final String patch = encoder.encode(source, target);
|
||||
|
||||
assertNotNull(patch, "patch generation must succeed for non-null inputs.");
|
||||
@@ -82,10 +82,10 @@ class PatchCommandEncoderProperties extends PropertyBasedTestSupport {
|
||||
@Label("encode should be deterministic for one source-target pair")
|
||||
void encodeShouldBeDeterministicForOneSourceTargetPair(@ForAll("words") final String source,
|
||||
@ForAll("words") final String target) {
|
||||
final PatchCommandEncoder sharedEncoder = new PatchCommandEncoder();
|
||||
final PatchCommandEncoder sharedEncoder = PatchCommandEncoder.builder().build();
|
||||
final String first = sharedEncoder.encode(source, target);
|
||||
final String second = sharedEncoder.encode(source, target);
|
||||
final String fresh = new PatchCommandEncoder().encode(source, target);
|
||||
final String fresh = PatchCommandEncoder.builder().build().encode(source, target);
|
||||
|
||||
assertEquals(first, second, "one encoder instance must produce stable output.");
|
||||
assertEquals(first, fresh, "fresh encoder instances must produce the same patch output.");
|
||||
|
||||
@@ -250,12 +250,28 @@ class PatchCommandEncoderTest {
|
||||
@Test
|
||||
@DisplayName("creates encoder with default cost model")
|
||||
void shouldCreateEncoderWithDefaultCostModel() {
|
||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||
|
||||
assertNotNull(encoder);
|
||||
assertEquals("teach", PatchCommandEncoder.apply("teacher", encoder.encode("teacher", "teach")));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies fluent builder construction with explicit forward traversal.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("builds direction-specialized encoder via builder")
|
||||
void shouldBuildDirectionSpecializedEncoderViaBuilder() {
|
||||
PatchCommandEncoder encoder = PatchCommandEncoder.builder()
|
||||
.traversalDirection(WordTraversalDirection.FORWARD)
|
||||
.build();
|
||||
|
||||
String patch = encoder.encode("running", "run");
|
||||
|
||||
assertAll(() -> assertNotNull(encoder), () -> assertNotNull(patch),
|
||||
() -> assertEquals("run", encoder.applyWithConfiguredDirection("running", patch)));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that a negative insert cost is rejected.
|
||||
*/
|
||||
@@ -263,7 +279,7 @@ class PatchCommandEncoderTest {
|
||||
@DisplayName("rejects negative insert cost")
|
||||
void shouldRejectNegativeInsertCost() {
|
||||
IllegalArgumentException exception = assertThrows(IllegalArgumentException.class,
|
||||
() -> new PatchCommandEncoder(-1, 1, 1, 0));
|
||||
() -> PatchCommandEncoder.builder().insertCost(-1).deleteCost(1).replaceCost(1).matchCost(0).build());
|
||||
|
||||
assertEquals("insertCost must be non-negative.", exception.getMessage());
|
||||
}
|
||||
@@ -275,7 +291,7 @@ class PatchCommandEncoderTest {
|
||||
@DisplayName("rejects negative delete cost")
|
||||
void shouldRejectNegativeDeleteCost() {
|
||||
IllegalArgumentException exception = assertThrows(IllegalArgumentException.class,
|
||||
() -> new PatchCommandEncoder(1, -1, 1, 0));
|
||||
() -> PatchCommandEncoder.builder().insertCost(1).deleteCost(-1).replaceCost(1).matchCost(0).build());
|
||||
|
||||
assertEquals("deleteCost must be non-negative.", exception.getMessage());
|
||||
}
|
||||
@@ -287,7 +303,7 @@ class PatchCommandEncoderTest {
|
||||
@DisplayName("rejects negative replace cost")
|
||||
void shouldRejectNegativeReplaceCost() {
|
||||
IllegalArgumentException exception = assertThrows(IllegalArgumentException.class,
|
||||
() -> new PatchCommandEncoder(1, 1, -1, 0));
|
||||
() -> PatchCommandEncoder.builder().insertCost(1).deleteCost(1).replaceCost(-1).matchCost(0).build());
|
||||
|
||||
assertEquals("replaceCost must be non-negative.", exception.getMessage());
|
||||
}
|
||||
@@ -299,7 +315,7 @@ class PatchCommandEncoderTest {
|
||||
@DisplayName("rejects negative match cost")
|
||||
void shouldRejectNegativeMatchCost() {
|
||||
IllegalArgumentException exception = assertThrows(IllegalArgumentException.class,
|
||||
() -> new PatchCommandEncoder(1, 1, 1, -1));
|
||||
() -> PatchCommandEncoder.builder().insertCost(1).deleteCost(1).replaceCost(1).matchCost(-1).build());
|
||||
|
||||
assertEquals("matchCost must be non-negative.", exception.getMessage());
|
||||
}
|
||||
@@ -313,13 +329,29 @@ class PatchCommandEncoderTest {
|
||||
@Tag("encode")
|
||||
class EncodeTests {
|
||||
|
||||
/**
|
||||
* Verifies that trailing SKIP instructions are omitted from the generated patch
|
||||
* command because they do not affect reconstruction.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("does not emit trailing SKIP instructions into patch command")
|
||||
void shouldNotEmitTrailingSkipInstructionsIntoPatchCommand() {
|
||||
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||
|
||||
String patch = encoder.encode("abcd", "ab");
|
||||
|
||||
assertAll(() -> assertNotNull(patch), () -> assertEquals("Db", patch),
|
||||
() -> assertEquals("ab", PatchCommandEncoder.apply("abcd", patch)), () -> assertEquals(-1,
|
||||
patch.indexOf('-'), () -> "Patch must not contain a trailing SKIP instruction: " + patch));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that a null source yields a null patch.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("returns null when source is null")
|
||||
void shouldReturnNullWhenSourceIsNull() {
|
||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||
|
||||
String patch = encoder.encode(null, "target");
|
||||
|
||||
@@ -332,7 +364,7 @@ class PatchCommandEncoderTest {
|
||||
@Test
|
||||
@DisplayName("returns null when target is null")
|
||||
void shouldReturnNullWhenTargetIsNull() {
|
||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||
|
||||
String patch = encoder.encode("source", null);
|
||||
|
||||
@@ -345,7 +377,7 @@ class PatchCommandEncoderTest {
|
||||
@Test
|
||||
@DisplayName("returns canonical NOOP patch for equal words")
|
||||
void shouldReturnCanonicalNoopPatchForEqualWords() {
|
||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||
|
||||
String patch = encoder.encode("teacher", "teacher");
|
||||
|
||||
@@ -359,7 +391,7 @@ class PatchCommandEncoderTest {
|
||||
@Test
|
||||
@DisplayName("returns canonical NOOP patch for equal empty words")
|
||||
void shouldReturnCanonicalNoopPatchForEqualEmptyWords() {
|
||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||
|
||||
String patch = encoder.encode("", "");
|
||||
|
||||
@@ -378,7 +410,7 @@ class PatchCommandEncoderTest {
|
||||
@MethodSource("org.egothor.stemmer.PatchCommandEncoderTest#provideRoundTripPairs")
|
||||
@DisplayName("produces patches that reconstruct the target")
|
||||
void shouldReconstructTargetForRoundTripPairs(int caseId, String source, String target) {
|
||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||
|
||||
String patch = encoder.encode(source, target);
|
||||
String reconstructed = PatchCommandEncoder.apply(source, patch);
|
||||
@@ -398,7 +430,7 @@ class PatchCommandEncoderTest {
|
||||
@Test
|
||||
@DisplayName("remains correct when reused across different input sizes")
|
||||
void shouldRemainCorrectWhenReusedAcrossDifferentInputSizes() {
|
||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||
|
||||
assertAll(
|
||||
() -> assertEquals("transformation",
|
||||
@@ -414,7 +446,7 @@ class PatchCommandEncoderTest {
|
||||
@Test
|
||||
@DisplayName("supports custom operation costs")
|
||||
void shouldSupportCustomOperationCosts() {
|
||||
PatchCommandEncoder encoder = new PatchCommandEncoder(1, 1, 2, 0);
|
||||
PatchCommandEncoder encoder = PatchCommandEncoder.builder().insertCost(1).deleteCost(1).replaceCost(2).matchCost(0).build();
|
||||
|
||||
String patch = encoder.encode("teacher", "teach");
|
||||
String reconstructed = PatchCommandEncoder.apply("teacher", patch);
|
||||
@@ -473,6 +505,36 @@ class PatchCommandEncoderTest {
|
||||
assertSame(source, PatchCommandEncoder.apply(source, PatchCommandEncoder.NOOP_PATCH));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that instance-level application follows encoder traversal
|
||||
* direction.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("applies patch via instance-level direction-specialized fast path")
|
||||
void shouldApplyPatchViaInstanceLevelDirectionSpecializedFastPath() {
|
||||
PatchCommandEncoder encoder = PatchCommandEncoder.builder()
|
||||
.traversalDirection(WordTraversalDirection.FORWARD)
|
||||
.build();
|
||||
|
||||
String patch = encoder.encode("transformation", "transform");
|
||||
|
||||
assertEquals("transform", encoder.applyWithConfiguredDirection("transformation", patch));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies dedicated forward traversal encode/apply round trip.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("reconstructs target with forward traversal encoder and static apply")
|
||||
void shouldReconstructTargetWithForwardTraversalEncoderAndStaticApply() {
|
||||
PatchCommandEncoder encoder = PatchCommandEncoder.builder()
|
||||
.traversalDirection(WordTraversalDirection.FORWARD)
|
||||
.build();
|
||||
String patch = encoder.encode("cities", "city");
|
||||
|
||||
assertEquals("city", PatchCommandEncoder.apply("cities", patch, WordTraversalDirection.FORWARD));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies explicit patch application cases.
|
||||
*
|
||||
@@ -544,7 +606,7 @@ class PatchCommandEncoderTest {
|
||||
@Test
|
||||
@DisplayName("handles deletion-heavy suffix stripping")
|
||||
void shouldHandleDeletionHeavySuffixStripping() {
|
||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||
|
||||
String patch = encoder.encode("teacher", "teach");
|
||||
|
||||
@@ -557,7 +619,7 @@ class PatchCommandEncoderTest {
|
||||
@Test
|
||||
@DisplayName("handles plural to singular transformation")
|
||||
void shouldHandlePluralToSingularTransformation() {
|
||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||
|
||||
String patch = encoder.encode("cities", "city");
|
||||
|
||||
@@ -570,7 +632,7 @@ class PatchCommandEncoderTest {
|
||||
@Test
|
||||
@DisplayName("handles derivational reduction to a shorter stem")
|
||||
void shouldHandleDerivationalReductionToShorterStem() {
|
||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||
|
||||
String patch = encoder.encode("stemming", "stem");
|
||||
|
||||
@@ -583,7 +645,7 @@ class PatchCommandEncoderTest {
|
||||
@Test
|
||||
@DisplayName("handles single-character replacement")
|
||||
void shouldHandleSingleCharacterReplacement() {
|
||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||
|
||||
String patch = encoder.encode("a", "z");
|
||||
|
||||
@@ -610,7 +672,7 @@ class PatchCommandEncoderTest {
|
||||
@MethodSource("org.egothor.stemmer.PatchCommandEncoderTest#provideReversedRoundTripPairs")
|
||||
@DisplayName("reconstructs reversed targets from reversed sources")
|
||||
void shouldReconstructReversedTargetsFromReversedSources(int caseId, String source, String target) {
|
||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||
|
||||
String reversedSource = reverse(source);
|
||||
String reversedTarget = reverse(target);
|
||||
@@ -633,7 +695,7 @@ class PatchCommandEncoderTest {
|
||||
@Test
|
||||
@DisplayName("handles mirrored stemming transformations")
|
||||
void shouldHandleMirroredStemmingTransformations() {
|
||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||
|
||||
assertAll(
|
||||
() -> assertEquals(reverse("teach"),
|
||||
@@ -655,7 +717,7 @@ class PatchCommandEncoderTest {
|
||||
@Test
|
||||
@DisplayName("remains correct when reused on reversed words of different sizes")
|
||||
void shouldRemainCorrectWhenReusedOnReversedWordsOfDifferentSizes() {
|
||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||
|
||||
assertAll(
|
||||
() -> assertEquals(reverse("transformation"),
|
||||
@@ -683,7 +745,7 @@ class PatchCommandEncoderTest {
|
||||
@MethodSource("org.egothor.stemmer.PatchCommandEncoderTest#provideReversedRoundTripPairs")
|
||||
@DisplayName("preserves correctness under mirrored input orientation")
|
||||
void shouldPreserveCorrectnessUnderMirroredInputOrientation(int caseId, String source, String target) {
|
||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||
|
||||
String normalPatch = encoder.encode(source, target);
|
||||
String normalResult = PatchCommandEncoder.apply(source, normalPatch);
|
||||
|
||||
@@ -151,7 +151,7 @@ abstract class PropertyBasedTestSupport {
|
||||
Objects.requireNonNull(reductionMode, "reductionMode");
|
||||
|
||||
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(STRING_ARRAY_FACTORY, reductionMode);
|
||||
final PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
final PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||
|
||||
for (StemmerEntry entry : scenario.entries()) {
|
||||
if (storeOriginal) {
|
||||
|
||||
@@ -35,6 +35,7 @@ import static org.junit.jupiter.api.Assertions.assertArrayEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||
import static org.junit.jupiter.api.Assertions.assertThrows;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
@@ -44,6 +45,10 @@ import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.logging.Handler;
|
||||
import java.util.logging.Level;
|
||||
import java.util.logging.LogRecord;
|
||||
import java.util.logging.Logger;
|
||||
|
||||
import org.junit.jupiter.api.DisplayName;
|
||||
import org.junit.jupiter.api.Nested;
|
||||
@@ -59,7 +64,7 @@ import org.junit.jupiter.api.io.TempDir;
|
||||
* </p>
|
||||
* <ul>
|
||||
* <li>parsing through all public overloads,</li>
|
||||
* <li>normalization to lower case,</li>
|
||||
* <li>case processing according to the selected mode,</li>
|
||||
* <li>handling of empty lines and remarks,</li>
|
||||
* <li>correct entry emission including line numbers,</li>
|
||||
* <li>propagation of I/O failures from the handler and file system,</li>
|
||||
@@ -89,6 +94,43 @@ class StemmerDictionaryParserTest {
|
||||
// Record used only as a compact assertion carrier.
|
||||
}
|
||||
|
||||
/**
|
||||
* Log handler capturing parser diagnostics for assertions.
|
||||
*/
|
||||
private static final class CapturedLogHandler extends Handler {
|
||||
|
||||
/**
|
||||
* Captured log records.
|
||||
*/
|
||||
private final List<LogRecord> records = new ArrayList<LogRecord>();
|
||||
|
||||
@Override
|
||||
public void publish(final LogRecord record) {
|
||||
if (record != null) {
|
||||
this.records.add(record);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void flush() {
|
||||
// No buffered state.
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() {
|
||||
this.records.clear();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the captured records.
|
||||
*
|
||||
* @return captured records
|
||||
*/
|
||||
private List<LogRecord> records() {
|
||||
return this.records;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a handler that collects all parser callbacks into the supplied list.
|
||||
*
|
||||
@@ -157,6 +199,49 @@ class StemmerDictionaryParserTest {
|
||||
() -> assertEquals(5, third.lineNumber()));
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("should ignore whitespace-containing items and emit one warning per physical line")
|
||||
void shouldIgnoreWhitespaceContainingItemsAndLogOneWarningPerLine() throws IOException {
|
||||
final String input = "root\trunning form\truns\tnew\u2003term\n" + "compound stem\talpha\tbeta\tvalue\n";
|
||||
|
||||
final List<CapturedEntry> entries = new ArrayList<CapturedEntry>();
|
||||
final Logger logger = Logger.getLogger(StemmerDictionaryParser.class.getName());
|
||||
final Level previousLevel = logger.getLevel();
|
||||
final boolean previousUseParentHandlers = logger.getUseParentHandlers();
|
||||
final CapturedLogHandler handler = new CapturedLogHandler();
|
||||
|
||||
logger.setUseParentHandlers(false);
|
||||
logger.setLevel(Level.WARNING);
|
||||
logger.addHandler(handler);
|
||||
try {
|
||||
final StemmerDictionaryParser.ParseStatistics statistics = StemmerDictionaryParser
|
||||
.parse(new StringReader(input), "whitespace-source", collectingHandler(entries));
|
||||
|
||||
assertAll("Statistics", () -> assertEquals(2, statistics.lineCount()),
|
||||
() -> assertEquals(1, statistics.entryCount()),
|
||||
() -> assertEquals(0, statistics.ignoredLineCount()));
|
||||
assertEquals(1, entries.size(), "Only the valid TSV row must be emitted.");
|
||||
assertAll("Parsed entry", () -> assertEquals("root", entries.get(0).stem()),
|
||||
() -> assertArrayEquals(new String[] { "runs" }, entries.get(0).variants()),
|
||||
() -> assertEquals(1, entries.get(0).lineNumber()));
|
||||
assertEquals(2, handler.records().size(), "Exactly one warning must be emitted per physical line.");
|
||||
assertAll("First warning", () -> assertEquals(Level.WARNING, handler.records().get(0).getLevel()),
|
||||
() -> assertTrue(handler.records().get(0).getMessage()
|
||||
.contains("Ignoring dictionary items containing whitespace")),
|
||||
() -> assertEquals("whitespace-source", handler.records().get(0).getParameters()[0]),
|
||||
() -> assertEquals(Integer.valueOf(1), handler.records().get(0).getParameters()[1]),
|
||||
() -> assertEquals("root", handler.records().get(0).getParameters()[2]),
|
||||
() -> assertEquals(Integer.valueOf(2), handler.records().get(0).getParameters()[3]));
|
||||
assertAll("Second warning",
|
||||
() -> assertEquals(Integer.valueOf(2), handler.records().get(1).getParameters()[1]),
|
||||
() -> assertEquals("compound stem", handler.records().get(1).getParameters()[2]));
|
||||
} finally {
|
||||
logger.removeHandler(handler);
|
||||
logger.setUseParentHandlers(previousUseParentHandlers);
|
||||
logger.setLevel(previousLevel);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("should prefer earliest remark marker regardless of marker type")
|
||||
void shouldPreferEarliestRemarkMarkerRegardlessOfMarkerType() throws IOException {
|
||||
@@ -195,6 +280,22 @@ class StemmerDictionaryParserTest {
|
||||
assertEquals(expected, exception, "The original exception instance should be preserved.");
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("should preserve character case when AS_IS mode is selected")
|
||||
void shouldPreserveCharacterCaseWhenAsIsModeIsSelected() throws IOException {
|
||||
final String input = "Root\tRunning\tRuns\tRUNNER\n";
|
||||
final List<CapturedEntry> entries = new ArrayList<CapturedEntry>();
|
||||
|
||||
final StemmerDictionaryParser.ParseStatistics statistics = StemmerDictionaryParser.parse(
|
||||
new StringReader(input), "case-as-is", CaseProcessingMode.AS_IS, collectingHandler(entries));
|
||||
|
||||
assertAll("Statistics", () -> assertEquals(1, statistics.lineCount()),
|
||||
() -> assertEquals(1, statistics.entryCount()), () -> assertEquals(0, statistics.ignoredLineCount()));
|
||||
assertEquals(1, entries.size(), "Exactly one entry should be emitted.");
|
||||
assertAll("Entry", () -> assertEquals("Root", entries.get(0).stem()),
|
||||
() -> assertArrayEquals(new String[] { "Running", "Runs", "RUNNER" }, entries.get(0).variants()));
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("should reject null reader")
|
||||
void shouldRejectNullReader() {
|
||||
@@ -213,6 +314,15 @@ class StemmerDictionaryParserTest {
|
||||
}));
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("should reject null case processing mode")
|
||||
void shouldRejectNullCaseProcessingMode() {
|
||||
assertThrows(NullPointerException.class, () -> StemmerDictionaryParser.parse(new StringReader("a b"),
|
||||
"source", null, (stem, variants, lineNumber) -> {
|
||||
// no-op
|
||||
}));
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("should reject null entry handler")
|
||||
void shouldRejectNullEntryHandler() {
|
||||
|
||||
@@ -0,0 +1,279 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertFalse;
|
||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.function.Function;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.junit.jupiter.api.DisplayName;
|
||||
import org.junit.jupiter.api.Tag;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.api.io.TempDir;
|
||||
|
||||
/**
|
||||
* Tests for {@link StemmerKnowledgeExperiment}.
|
||||
*/
|
||||
@Tag("unit")
|
||||
@Tag("integration")
|
||||
@Tag("stemmer")
|
||||
final class StemmerKnowledgeExperimentTest {
|
||||
|
||||
/**
|
||||
* Deterministic seed used by all tests.
|
||||
*/
|
||||
private static final long TEST_SEED = 20260421L;
|
||||
|
||||
/**
|
||||
* Small deterministic morphology-shaped dictionary.
|
||||
*/
|
||||
private static final String DICTIONARY = String.join(System.lineSeparator(), "run running runs runner",
|
||||
"walk walking walks walked", "play playing plays played");
|
||||
|
||||
/**
|
||||
* Temporary directory for report writing tests.
|
||||
*/
|
||||
@TempDir
|
||||
private Path tempDir;
|
||||
|
||||
/**
|
||||
* Verifies deterministic scenario generation and expected row count.
|
||||
*
|
||||
* @throws IOException if evaluation fails
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("evaluate should return deterministic full scenario matrix")
|
||||
void evaluateShouldReturnDeterministicScenarioMatrix() throws IOException {
|
||||
final StemmerKnowledgeExperiment experiment = new StemmerKnowledgeExperiment();
|
||||
|
||||
final List<StemmerKnowledgeExperiment.ResultRow> first = experiment.evaluate(new StringReader(DICTIONARY),
|
||||
"synthetic", "SYNTHETIC", TEST_SEED);
|
||||
final List<StemmerKnowledgeExperiment.ResultRow> second = experiment.evaluate(new StringReader(DICTIONARY),
|
||||
"synthetic", "SYNTHETIC", TEST_SEED);
|
||||
|
||||
assertEquals(ReductionMode.values().length * 2 * 2 * 10, first.size());
|
||||
assertEquals(first, second);
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that full knowledge with stored original stems reaches ideal
|
||||
* quality.
|
||||
*
|
||||
* @throws IOException if evaluation fails
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("100 percent knowledge with stored originals should achieve perfect scores")
|
||||
void fullKnowledgeWithStoredOriginalsShouldBePerfect() throws IOException {
|
||||
final StemmerKnowledgeExperiment experiment = new StemmerKnowledgeExperiment();
|
||||
final List<StemmerKnowledgeExperiment.ResultRow> rows = experiment.evaluate(new StringReader(DICTIONARY),
|
||||
"synthetic", "SYNTHETIC", TEST_SEED);
|
||||
|
||||
final StemmerKnowledgeExperiment.ResultRow row = uniqueRow(rows,
|
||||
resultKey(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS, true, true, 100));
|
||||
|
||||
assertEquals(1.0d, row.getAccuracy());
|
||||
assertEquals(1.0d, row.getAllPrecision());
|
||||
assertEquals(1.0d, row.getAllRecall());
|
||||
assertEquals(1.0d, row.getAllF1());
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that evaluating canonical stems without storing no-op patches lowers
|
||||
* recall at full knowledge, while {@code get()} still remains perfect due to
|
||||
* the implicit identity fallback for already canonical inputs.
|
||||
*
|
||||
* @throws IOException if evaluation fails
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("evaluating stems without stored originals should reduce recall but preserve get accuracy")
|
||||
void evaluatingStemsWithoutStoredOriginalsShouldReduceRecall() throws IOException {
|
||||
final StemmerKnowledgeExperiment experiment = new StemmerKnowledgeExperiment();
|
||||
final List<StemmerKnowledgeExperiment.ResultRow> rows = experiment.evaluate(new StringReader(DICTIONARY),
|
||||
"synthetic", "SYNTHETIC", TEST_SEED);
|
||||
|
||||
final StemmerKnowledgeExperiment.ResultRow row = uniqueRow(rows,
|
||||
resultKey(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS, false, true, 100));
|
||||
|
||||
assertTrue(row.getAllRecall() < 1.0d);
|
||||
assertEquals(1.0d, row.getAccuracy());
|
||||
assertTrue(row.getAllF1() < 1.0d);
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that storing original stems becomes irrelevant when canonical stems
|
||||
* themselves are not part of the evaluated input set.
|
||||
*
|
||||
* @throws IOException if evaluation fails
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("storeOriginal should not affect scores when stems are not evaluated")
|
||||
void storeOriginalShouldNotAffectScoresWhenStemsAreNotEvaluated() throws IOException {
|
||||
final StemmerKnowledgeExperiment experiment = new StemmerKnowledgeExperiment();
|
||||
final List<StemmerKnowledgeExperiment.ResultRow> rows = experiment.evaluate(new StringReader(DICTIONARY),
|
||||
"synthetic", "SYNTHETIC", TEST_SEED);
|
||||
|
||||
final StemmerKnowledgeExperiment.ResultRow withoutStoredOriginals = uniqueRow(rows,
|
||||
resultKey(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS, false, false, 100));
|
||||
final StemmerKnowledgeExperiment.ResultRow withStoredOriginals = uniqueRow(rows,
|
||||
resultKey(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS, true, false, 100));
|
||||
|
||||
assertEquals(withoutStoredOriginals.getAccuracy(), withStoredOriginals.getAccuracy());
|
||||
assertEquals(withoutStoredOriginals.getAllPrecision(), withStoredOriginals.getAllPrecision());
|
||||
assertEquals(withoutStoredOriginals.getAllRecall(), withStoredOriginals.getAllRecall());
|
||||
assertEquals(withoutStoredOriginals.getAllF1(), withStoredOriginals.getAllF1());
|
||||
assertEquals(withoutStoredOriginals.getCorrectCount(), withStoredOriginals.getCorrectCount());
|
||||
assertEquals(withoutStoredOriginals.getAllTruePositiveCount(), withStoredOriginals.getAllTruePositiveCount());
|
||||
assertEquals(withoutStoredOriginals.getAllFalsePositiveCount(), withStoredOriginals.getAllFalsePositiveCount());
|
||||
assertEquals(withoutStoredOriginals.getAllCoveredInputCount(), withStoredOriginals.getAllCoveredInputCount());
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that implicit identity fallback for {@code get()} does not propagate
|
||||
* into {@code getAll()}, which still requires an explicit command to cover an
|
||||
* input.
|
||||
*
|
||||
* @throws IOException if evaluation fails
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("get should accept implicit identity while getAll still requires explicit coverage")
|
||||
void getShouldAcceptImplicitIdentityWhileGetAllStillRequiresExplicitCoverage() throws IOException {
|
||||
final StemmerKnowledgeExperiment experiment = new StemmerKnowledgeExperiment();
|
||||
final String minimalDictionary = "run running";
|
||||
|
||||
final List<StemmerKnowledgeExperiment.ResultRow> rows = experiment.evaluate(new StringReader(minimalDictionary),
|
||||
"minimal", "MINIMAL", TEST_SEED);
|
||||
|
||||
final StemmerKnowledgeExperiment.ResultRow row = uniqueRow(rows,
|
||||
resultKey(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS, false, true, 100));
|
||||
|
||||
assertEquals(2L, row.evaluatedInputCount());
|
||||
assertEquals(2L, row.getCorrectCount());
|
||||
assertEquals(1.0d, row.getAccuracy());
|
||||
|
||||
assertEquals(1L, row.getAllCoveredInputCount());
|
||||
assertEquals(0.5d, row.getAllRecall());
|
||||
assertTrue(row.getAllPrecision() > 0.0d);
|
||||
assertTrue(row.getAllPrecision() <= 1.0d);
|
||||
assertTrue(row.getAllF1() < 1.0d);
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies CSV report generation.
|
||||
*
|
||||
* @throws IOException if report writing fails
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("writeCsv should emit header and data rows")
|
||||
void writeCsvShouldEmitHeaderAndDataRows() throws IOException {
|
||||
final StemmerKnowledgeExperiment experiment = new StemmerKnowledgeExperiment();
|
||||
final List<StemmerKnowledgeExperiment.ResultRow> rows = experiment.evaluate(new StringReader(DICTIONARY),
|
||||
"synthetic", "SYNTHETIC", TEST_SEED);
|
||||
|
||||
final Path output = this.tempDir.resolve("knowledge.csv");
|
||||
StemmerKnowledgeExperiment.writeCsv(output, rows);
|
||||
|
||||
final List<String> writtenLines = Files.readAllLines(output, StandardCharsets.UTF_8);
|
||||
assertFalse(writtenLines.isEmpty());
|
||||
assertEquals(StemmerKnowledgeExperiment.ResultRow.csvHeader(), writtenLines.get(0));
|
||||
assertEquals(rows.size() + 1, writtenLines.size());
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that the result row key lookup remains stable for all generated
|
||||
* rows.
|
||||
*
|
||||
* @throws IOException if evaluation fails
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("all generated rows should be addressable by the synthetic key")
|
||||
void allGeneratedRowsShouldBeAddressableBySyntheticKey() throws IOException {
|
||||
final StemmerKnowledgeExperiment experiment = new StemmerKnowledgeExperiment();
|
||||
final List<StemmerKnowledgeExperiment.ResultRow> rows = experiment.evaluate(new StringReader(DICTIONARY),
|
||||
"synthetic", "SYNTHETIC", TEST_SEED);
|
||||
|
||||
for (StemmerKnowledgeExperiment.ResultRow row : rows) {
|
||||
assertDoesNotThrow(() -> uniqueRow(rows, resultKey(row)));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Finds one unique row by a synthetic key.
|
||||
*
|
||||
* @param rows result rows
|
||||
* @param key synthetic key
|
||||
* @return matching row
|
||||
*/
|
||||
private static StemmerKnowledgeExperiment.ResultRow uniqueRow(final List<StemmerKnowledgeExperiment.ResultRow> rows,
|
||||
final String key) {
|
||||
final Map<String, StemmerKnowledgeExperiment.ResultRow> indexed = rows.stream()
|
||||
.collect(Collectors.toMap(StemmerKnowledgeExperimentTest::resultKey, Function.identity()));
|
||||
final StemmerKnowledgeExperiment.ResultRow row = indexed.get(key);
|
||||
assertNotNull(row);
|
||||
return row;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a lookup key from a row.
|
||||
*
|
||||
* @param row result row
|
||||
* @return lookup key
|
||||
*/
|
||||
private static String resultKey(final StemmerKnowledgeExperiment.ResultRow row) {
|
||||
return resultKey(ReductionMode.valueOf(row.reductionMode()), row.storeOriginal(), row.includeStemInEvaluation(),
|
||||
row.knowledgePercent());
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a lookup key from scenario components.
|
||||
*
|
||||
* @param reductionMode reduction mode
|
||||
* @param storeOriginal whether no-op patches were stored
|
||||
* @param includeStemInEvaluation whether stems were evaluated
|
||||
* @param knowledgePercent knowledge percentage
|
||||
* @return lookup key
|
||||
*/
|
||||
private static String resultKey(final ReductionMode reductionMode, final boolean storeOriginal,
|
||||
final boolean includeStemInEvaluation, final int knowledgePercent) {
|
||||
return reductionMode.name() + '|' + storeOriginal + '|' + includeStemInEvaluation + '|' + knowledgePercent;
|
||||
}
|
||||
}
|
||||
@@ -37,6 +37,7 @@ import static org.junit.jupiter.api.Assertions.assertFalse;
|
||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||
import static org.junit.jupiter.api.Assertions.assertNull;
|
||||
import static org.junit.jupiter.api.Assertions.assertThrows;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.ByteArrayInputStream;
|
||||
@@ -46,13 +47,17 @@ import java.io.InputStreamReader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Arrays;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.Set;
|
||||
import java.util.stream.IntStream;
|
||||
import java.util.stream.Stream;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
|
||||
import org.egothor.stemmer.StemmerPatchTrieLoader.Language;
|
||||
import org.junit.jupiter.api.DisplayName;
|
||||
import org.junit.jupiter.api.Nested;
|
||||
import org.junit.jupiter.api.Tag;
|
||||
@@ -77,6 +82,7 @@ import org.junit.jupiter.params.provider.MethodSource;
|
||||
* <li>comment-aware parsing delegated to {@link StemmerDictionaryParser}</li>
|
||||
* <li>preservation of all valid stem candidates returned by
|
||||
* {@link FrequencyTrie#getAll(String)}</li>
|
||||
* <li>the current bundled language set, including right-to-left metadata</li>
|
||||
* </ul>
|
||||
*/
|
||||
@Tag("unit")
|
||||
@@ -97,126 +103,51 @@ final class StemmerPatchTrieLoaderTest {
|
||||
*/
|
||||
private static final ReductionMode DEFAULT_REDUCTION_MODE = ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS;
|
||||
|
||||
/**
|
||||
* Representative number of bundled words used for overload consistency checks.
|
||||
*/
|
||||
private static final int REPRESENTATIVE_BUNDLED_WORD_COUNT = 25;
|
||||
|
||||
/**
|
||||
* Provides arguments for bundled dictionary verification across both supported
|
||||
* getAll-preserving reduction modes.
|
||||
*
|
||||
* <p>
|
||||
* The stream is derived directly from the current {@link Language} enum so the
|
||||
* test suite follows the supported bundled language set automatically.
|
||||
* </p>
|
||||
*
|
||||
* @return parameter stream
|
||||
*/
|
||||
static Stream<Arguments> bundledDictionaryCases() {
|
||||
return Stream.of(
|
||||
// 01
|
||||
Arguments.of("01-da_dk-ranked", StemmerPatchTrieLoader.Language.DA_DK,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
||||
final ReductionMode[] reductionModes = new ReductionMode[] {
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS };
|
||||
|
||||
// 02
|
||||
Arguments.of("02-de_de-ranked", StemmerPatchTrieLoader.Language.DE_DE,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
||||
|
||||
// 03
|
||||
Arguments.of("03-es_es-ranked", StemmerPatchTrieLoader.Language.ES_ES,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
||||
|
||||
// 04
|
||||
Arguments.of("04-fr_fr-ranked", StemmerPatchTrieLoader.Language.FR_FR,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
||||
|
||||
// 05
|
||||
Arguments.of("05-it_it-ranked", StemmerPatchTrieLoader.Language.IT_IT,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
||||
|
||||
// 06
|
||||
Arguments.of("06-nl_nl-ranked", StemmerPatchTrieLoader.Language.NL_NL,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
||||
|
||||
// 07
|
||||
Arguments.of("07-no_no-ranked", StemmerPatchTrieLoader.Language.NO_NO,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
||||
|
||||
// 08
|
||||
Arguments.of("08-pt_pt-ranked", StemmerPatchTrieLoader.Language.PT_PT,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
||||
|
||||
// 09
|
||||
Arguments.of("09-ru_ru-ranked", StemmerPatchTrieLoader.Language.RU_RU,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
||||
|
||||
// 10
|
||||
Arguments.of("10-sv_se-ranked", StemmerPatchTrieLoader.Language.SV_SE,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
||||
|
||||
// 11
|
||||
Arguments.of("11-us_uk-ranked", StemmerPatchTrieLoader.Language.US_UK,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
||||
|
||||
// 12
|
||||
Arguments.of("12-us_uk_profi-ranked", StemmerPatchTrieLoader.Language.US_UK_PROFI,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
||||
|
||||
// 13
|
||||
Arguments.of("13-da_dk-unordered", StemmerPatchTrieLoader.Language.DA_DK,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS),
|
||||
|
||||
// 14
|
||||
Arguments.of("14-de_de-unordered", StemmerPatchTrieLoader.Language.DE_DE,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS),
|
||||
|
||||
// 15
|
||||
Arguments.of("15-es_es-unordered", StemmerPatchTrieLoader.Language.ES_ES,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS),
|
||||
|
||||
// 16
|
||||
Arguments.of("16-fr_fr-unordered", StemmerPatchTrieLoader.Language.FR_FR,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS),
|
||||
|
||||
// 17
|
||||
Arguments.of("17-it_it-unordered", StemmerPatchTrieLoader.Language.IT_IT,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS),
|
||||
|
||||
// 18
|
||||
Arguments.of("18-nl_nl-unordered", StemmerPatchTrieLoader.Language.NL_NL,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS),
|
||||
|
||||
// 19
|
||||
Arguments.of("19-no_no-unordered", StemmerPatchTrieLoader.Language.NO_NO,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS),
|
||||
|
||||
// 20
|
||||
Arguments.of("20-pt_pt-unordered", StemmerPatchTrieLoader.Language.PT_PT,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS),
|
||||
|
||||
// 21
|
||||
Arguments.of("21-ru_ru-unordered", StemmerPatchTrieLoader.Language.RU_RU,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS),
|
||||
|
||||
// 22
|
||||
Arguments.of("22-sv_se-unordered", StemmerPatchTrieLoader.Language.SV_SE,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS),
|
||||
|
||||
// 23
|
||||
Arguments.of("23-us_uk-unordered", StemmerPatchTrieLoader.Language.US_UK,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS),
|
||||
|
||||
// 24
|
||||
Arguments.of("24-us_uk_profi-unordered", StemmerPatchTrieLoader.Language.US_UK_PROFI,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS));
|
||||
return Arrays.stream(StemmerPatchTrieLoader.Language.values())
|
||||
.flatMap(
|
||||
language -> IntStream.range(0, reductionModes.length)
|
||||
.mapToObj(index -> Arguments.of(
|
||||
String.format("%02d-%s-%s", index + 1, language.name().toLowerCase(),
|
||||
reductionModes[index].name().toLowerCase()),
|
||||
language, reductionModes[index])));
|
||||
}
|
||||
|
||||
/**
|
||||
* Provides representative bundled languages for overload consistency checks.
|
||||
*
|
||||
* <p>
|
||||
* The sample intentionally covers both traversal directions.
|
||||
* </p>
|
||||
*
|
||||
* @return parameter stream
|
||||
*/
|
||||
static Stream<Arguments> bundledLanguageSamples() {
|
||||
return Stream.of(
|
||||
// 01
|
||||
Arguments.of("01-us_uk", StemmerPatchTrieLoader.Language.US_UK),
|
||||
|
||||
// 02
|
||||
return Stream.of(Arguments.of("01-us_uk", StemmerPatchTrieLoader.Language.US_UK),
|
||||
Arguments.of("02-de_de", StemmerPatchTrieLoader.Language.DE_DE),
|
||||
|
||||
// 03
|
||||
Arguments.of("03-fr_fr", StemmerPatchTrieLoader.Language.FR_FR));
|
||||
Arguments.of("03-fa_ir", StemmerPatchTrieLoader.Language.FA_IR),
|
||||
Arguments.of("04-he_il", StemmerPatchTrieLoader.Language.HE_IL),
|
||||
Arguments.of("05-yi", StemmerPatchTrieLoader.Language.YI));
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -227,107 +158,90 @@ final class StemmerPatchTrieLoaderTest {
|
||||
static Stream<Arguments> nullContractCases() {
|
||||
final ReductionSettings settings = ReductionSettings.withDefaults(DEFAULT_REDUCTION_MODE);
|
||||
final FrequencyTrie<String> trie = new FrequencyTrie.Builder<String>(String[]::new, settings)
|
||||
.put("running", new PatchCommandEncoder().encode("running", "run")).build();
|
||||
.put("running", PatchCommandEncoder.builder().build().encode("running", "run")).build();
|
||||
|
||||
return Stream.of(
|
||||
// 01
|
||||
Arguments.of("01-load-language-settings",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.load((StemmerPatchTrieLoader.Language) null,
|
||||
true, settings),
|
||||
"language"),
|
||||
|
||||
// 02
|
||||
Arguments.of("02-load-language-mode",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.load((StemmerPatchTrieLoader.Language) null,
|
||||
true, DEFAULT_REDUCTION_MODE),
|
||||
"language"),
|
||||
|
||||
// 03
|
||||
Arguments.of("03-load-language-null-settings",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(StemmerPatchTrieLoader.Language.US_UK,
|
||||
true, (ReductionSettings) null),
|
||||
"reductionSettings"),
|
||||
|
||||
// 04
|
||||
Arguments.of("04-load-language-null-mode",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(StemmerPatchTrieLoader.Language.US_UK,
|
||||
true, (ReductionMode) null),
|
||||
"reductionMode"),
|
||||
|
||||
// 05
|
||||
Arguments.of("05-load-path-settings",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.load((Path) null, true, settings), "path"),
|
||||
|
||||
// 06
|
||||
Arguments.of("06-load-path-mode",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.load((Path) null, true,
|
||||
DEFAULT_REDUCTION_MODE),
|
||||
"path"),
|
||||
|
||||
// 07
|
||||
Arguments.of("07-load-path-null-settings",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(tempPath(), true,
|
||||
(ReductionSettings) null),
|
||||
"reductionSettings"),
|
||||
|
||||
// 08
|
||||
Arguments.of("08-load-path-null-mode",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(tempPath(), true, (ReductionMode) null),
|
||||
"reductionMode"),
|
||||
|
||||
// 09
|
||||
Arguments.of("09-load-string-settings",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.load((String) null, true, settings),
|
||||
"fileName"),
|
||||
|
||||
// 10
|
||||
StemmerPatchTrieLoader.FILENAME_REQUIRED),
|
||||
Arguments.of("10-load-string-mode",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.load((String) null, true,
|
||||
DEFAULT_REDUCTION_MODE),
|
||||
"fileName"),
|
||||
|
||||
// 11
|
||||
StemmerPatchTrieLoader.FILENAME_REQUIRED),
|
||||
Arguments.of("11-load-string-null-settings",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(tempPath().toString(), true,
|
||||
(ReductionSettings) null),
|
||||
"reductionSettings"),
|
||||
|
||||
// 12
|
||||
Arguments.of("12-load-string-null-mode",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(tempPath().toString(), true,
|
||||
(ReductionMode) null),
|
||||
"reductionMode"),
|
||||
|
||||
// 13
|
||||
Arguments.of("13-load-binary-path",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinary((Path) null), "path"),
|
||||
|
||||
// 14
|
||||
Arguments.of("14-load-binary-string",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinary((String) null), "fileName"),
|
||||
|
||||
// 15
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinary((String) null),
|
||||
StemmerPatchTrieLoader.FILENAME_REQUIRED),
|
||||
Arguments.of("15-load-binary-stream",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinary((InputStream) null),
|
||||
"inputStream"),
|
||||
|
||||
// 16
|
||||
Arguments.of("16-save-binary-null-trie-path",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.saveBinary(null, tempPath()), "trie"),
|
||||
|
||||
// 17
|
||||
Arguments.of("17-save-binary-null-path",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.saveBinary(trie, (Path) null), "path"),
|
||||
|
||||
// 18
|
||||
Arguments.of("18-save-binary-null-trie-string",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.saveBinary(null, tempPath().toString()),
|
||||
"trie"),
|
||||
|
||||
// 19
|
||||
Arguments.of("19-save-binary-null-string",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.saveBinary(trie, (String) null),
|
||||
"fileName"));
|
||||
StemmerPatchTrieLoader.FILENAME_REQUIRED),
|
||||
Arguments.of("20-load-language-null-metadata",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(StemmerPatchTrieLoader.Language.US_UK,
|
||||
true, (TrieMetadata) null),
|
||||
"metadata"),
|
||||
Arguments.of("21-load-path-null-metadata",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(tempPath(), true, (TrieMetadata) null),
|
||||
"metadata"),
|
||||
Arguments.of("22-load-string-null-metadata",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(tempPath().toString(), true,
|
||||
(TrieMetadata) null),
|
||||
"metadata"),
|
||||
Arguments.of("23-load-binary-metadata-path-null",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinaryMetadata((Path) null), "path"),
|
||||
Arguments.of("24-load-binary-metadata-string-null",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinaryMetadata((String) null),
|
||||
StemmerPatchTrieLoader.FILENAME_REQUIRED),
|
||||
Arguments.of("25-load-binary-metadata-stream-null",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinaryMetadata((InputStream) null),
|
||||
"inputStream"));
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -432,6 +346,31 @@ final class StemmerPatchTrieLoaderTest {
|
||||
"run");
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that metadata-driven loading keeps all configuration dimensions in
|
||||
* one explicit object and applies them during compilation.
|
||||
*
|
||||
* @throws IOException if the test file cannot be written or read
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("Metadata overload must drive case and diacritic normalization")
|
||||
void shouldLoadUsingExplicitMetadataConfiguration() throws IOException {
|
||||
final Path dictionaryFile = writeDictionary("""
|
||||
mÁma mamA mámě
|
||||
""");
|
||||
final TrieMetadata metadata = TrieMetadata.forCompilation(WordTraversalDirection.BACKWARD,
|
||||
ReductionSettings.withDefaults(DEFAULT_REDUCTION_MODE), DiacriticProcessingMode.REMOVE,
|
||||
CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT);
|
||||
|
||||
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(dictionaryFile, true, metadata);
|
||||
|
||||
assertAll(() -> assertEquals(DiacriticProcessingMode.REMOVE, trie.metadata().diacriticProcessingMode()),
|
||||
() -> assertEquals(CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT,
|
||||
trie.metadata().caseProcessingMode()),
|
||||
() -> assertNotNull(trie.get("MÁMĚ")),
|
||||
() -> assertNotNull(trie.get("mame")));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that the loader honors {@code storeOriginal=true} by inserting the
|
||||
* canonical no-op patch for the stem itself.
|
||||
@@ -480,6 +419,29 @@ final class StemmerPatchTrieLoaderTest {
|
||||
"Variants must still reconstruct the proper stem.");
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that the loader honors forward traversal for right-to-left
|
||||
* dictionaries loaded from filesystem overloads.
|
||||
*
|
||||
* @throws IOException if the test file cannot be written or read
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("Explicit right-to-left loading must use forward traversal semantics")
|
||||
void shouldUseForwardTraversalForExplicitRightToLeftLoading() throws IOException {
|
||||
final Path dictionaryFile = writeDictionary("""
|
||||
كتب كتابة كتاب
|
||||
""");
|
||||
|
||||
final ReductionSettings settings = ReductionSettings.withDefaults(DEFAULT_REDUCTION_MODE);
|
||||
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(dictionaryFile, true, settings,
|
||||
WordTraversalDirection.FORWARD);
|
||||
|
||||
assertEquals(WordTraversalDirection.FORWARD, trie.traversalDirection(),
|
||||
"Right-to-left loading must produce a forward-traversed trie.");
|
||||
assertEquals(Set.of("كتب"), reconstructAllStemCandidates(trie, "كتابة"),
|
||||
"Patch reconstruction must use the trie traversal direction.");
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that comment syntax documented by the loader is effectively honored
|
||||
* through delegated parsing.
|
||||
@@ -539,6 +501,15 @@ final class StemmerPatchTrieLoaderTest {
|
||||
assertTriePatchSemanticsEqual(original, fromString, "run", "running", "runner", "cities", "studying");
|
||||
assertTriePatchSemanticsEqual(original, fromStream, "run", "running", "runner", "cities", "studying");
|
||||
}
|
||||
|
||||
final TrieMetadata metadataFromPath = StemmerPatchTrieLoader.loadBinaryMetadata(binaryFile);
|
||||
final TrieMetadata metadataFromString = StemmerPatchTrieLoader.loadBinaryMetadata(binaryFile.toString());
|
||||
try (InputStream metadataInputStream = new ByteArrayInputStream(binaryBytes)) {
|
||||
final TrieMetadata metadataFromStream = StemmerPatchTrieLoader.loadBinaryMetadata(metadataInputStream);
|
||||
assertAll(() -> assertEquals(original.metadata(), metadataFromPath),
|
||||
() -> assertEquals(original.metadata(), metadataFromString),
|
||||
() -> assertEquals(original.metadata(), metadataFromStream));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -562,6 +533,53 @@ final class StemmerPatchTrieLoaderTest {
|
||||
@DisplayName("Bundled dictionaries")
|
||||
final class BundledDictionaryTests {
|
||||
|
||||
/**
|
||||
* Verifies that the current language enumeration exactly matches the bundled
|
||||
* language set expected by this project revision.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("Language enum must expose the current bundled language set")
|
||||
void shouldExposeCurrentBundledLanguageSet() {
|
||||
final Set<StemmerPatchTrieLoader.Language> expectedLanguages = new LinkedHashSet<StemmerPatchTrieLoader.Language>(
|
||||
Arrays.asList(StemmerPatchTrieLoader.Language.CS_CZ, StemmerPatchTrieLoader.Language.DA_DK,
|
||||
StemmerPatchTrieLoader.Language.DE_DE, StemmerPatchTrieLoader.Language.ES_ES,
|
||||
StemmerPatchTrieLoader.Language.FA_IR, StemmerPatchTrieLoader.Language.FI_FI,
|
||||
StemmerPatchTrieLoader.Language.FR_FR, StemmerPatchTrieLoader.Language.HE_IL,
|
||||
StemmerPatchTrieLoader.Language.HU_HU, StemmerPatchTrieLoader.Language.IT_IT,
|
||||
StemmerPatchTrieLoader.Language.NB_NO, StemmerPatchTrieLoader.Language.NL_NL,
|
||||
StemmerPatchTrieLoader.Language.NN_NO, StemmerPatchTrieLoader.Language.PL_PL,
|
||||
StemmerPatchTrieLoader.Language.PT_PT, StemmerPatchTrieLoader.Language.RU_RU,
|
||||
StemmerPatchTrieLoader.Language.SV_SE, StemmerPatchTrieLoader.Language.UK_UA,
|
||||
StemmerPatchTrieLoader.Language.US_UK, StemmerPatchTrieLoader.Language.YI));
|
||||
|
||||
final Set<StemmerPatchTrieLoader.Language> actualLanguages = new LinkedHashSet<StemmerPatchTrieLoader.Language>(
|
||||
Arrays.asList(StemmerPatchTrieLoader.Language.values()));
|
||||
|
||||
assertEquals(expectedLanguages, actualLanguages,
|
||||
"The bundled language enum must match the project's supported language set exactly.");
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that the right-to-left metadata is correctly assigned for the
|
||||
* currently supported bundled languages.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("Language enum must mark right-to-left bundled languages correctly")
|
||||
void shouldExposeCorrectRightToLeftMetadata() {
|
||||
final Set<StemmerPatchTrieLoader.Language> expectedRightToLeftLanguages = Set.of(
|
||||
StemmerPatchTrieLoader.Language.FA_IR, StemmerPatchTrieLoader.Language.HE_IL,
|
||||
StemmerPatchTrieLoader.Language.YI);
|
||||
|
||||
for (StemmerPatchTrieLoader.Language language : StemmerPatchTrieLoader.Language.values()) {
|
||||
if (expectedRightToLeftLanguages.contains(language)) {
|
||||
assertTrue(language.isRightToLeft(), () -> language.name() + " must be marked as right-to-left.");
|
||||
} else {
|
||||
assertFalse(language.isRightToLeft(),
|
||||
() -> language.name() + " must not be marked as right-to-left.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that each bundled dictionary compiles into a trie whose
|
||||
* {@link FrequencyTrie#getAll(String)} results still reconstruct exactly the
|
||||
@@ -586,6 +604,8 @@ final class StemmerPatchTrieLoaderTest {
|
||||
|
||||
assertNotNull(trie, "Compiled trie must be created.");
|
||||
assertFalse(expectedStemsByWord.isEmpty(), "Bundled dictionary must not be empty.");
|
||||
assertEquals(language.isRightToLeft() ? WordTraversalDirection.FORWARD : WordTraversalDirection.BACKWARD,
|
||||
trie.traversalDirection(), "Trie traversal direction must match language metadata.");
|
||||
|
||||
for (Map.Entry<String, Set<String>> entry : expectedStemsByWord.entrySet()) {
|
||||
final String word = entry.getKey();
|
||||
@@ -619,13 +639,12 @@ final class StemmerPatchTrieLoaderTest {
|
||||
final FrequencyTrie<String> viaMode = StemmerPatchTrieLoader.load(language, true, DEFAULT_REDUCTION_MODE);
|
||||
|
||||
final Map<String, Set<String>> expectedStemsByWord = readExpectedStems(language);
|
||||
final int verifiedWords = 25;
|
||||
int counter = 0;
|
||||
|
||||
for (Map.Entry<String, Set<String>> entry : expectedStemsByWord.entrySet()) {
|
||||
assertTriePatchSemanticsEqual(viaSettings, viaMode, entry.getKey());
|
||||
counter++;
|
||||
if (counter >= verifiedWords) {
|
||||
if (counter >= REPRESENTATIVE_BUNDLED_WORD_COUNT) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -704,7 +723,7 @@ final class StemmerPatchTrieLoaderTest {
|
||||
}
|
||||
|
||||
for (String patchCommand : patchCommands) {
|
||||
stems.add(PatchCommandEncoder.apply(word, patchCommand));
|
||||
stems.add(PatchCommandEncoder.apply(word, patchCommand, trie.traversalDirection()));
|
||||
}
|
||||
|
||||
return stems;
|
||||
@@ -743,7 +762,7 @@ final class StemmerPatchTrieLoaderTest {
|
||||
if (inputStream == null) {
|
||||
throw new IOException("Bundled stemmer resource not found: " + resourcePath);
|
||||
}
|
||||
return inputStream;
|
||||
return new GZIPInputStream(inputStream);
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -82,10 +82,10 @@ class StemmerPatchTrieProperties extends PropertyBasedTestSupport {
|
||||
assertTrue(preferredPatch != null && !preferredPatch.isEmpty(),
|
||||
"preferred patch must exist for an observed word.");
|
||||
assertTrue(allPatches.length >= 1, "at least one patch must exist for an observed word.");
|
||||
assertTrue(acceptableStems.contains(PatchCommandEncoder.apply(observedWord, preferredPatch)),
|
||||
assertTrue(acceptableStems.contains(PatchCommandEncoder.apply(observedWord, preferredPatch, trie.traversalDirection())),
|
||||
"preferred patch reconstructed an unexpected stem.");
|
||||
|
||||
final Set<String> producedStems = applyAll(observedWord, allPatches);
|
||||
final Set<String> producedStems = applyAll(trie, observedWord, allPatches);
|
||||
assertTrue(acceptableStems.containsAll(producedStems),
|
||||
"getAll() must not expose a patch that reconstructs an undeclared stem.");
|
||||
|
||||
@@ -125,10 +125,10 @@ class StemmerPatchTrieProperties extends PropertyBasedTestSupport {
|
||||
* @param patches returned patches
|
||||
* @return decoded stem set
|
||||
*/
|
||||
private static Set<String> applyAll(final String source, final String[] patches) {
|
||||
private static Set<String> applyAll(final FrequencyTrie<String> trie, final String source, final String[] patches) {
|
||||
final LinkedHashSet<String> stems = new LinkedHashSet<>();
|
||||
for (String patch : patches) {
|
||||
stems.add(PatchCommandEncoder.apply(source, patch));
|
||||
stems.add(PatchCommandEncoder.apply(source, patch, trie.traversalDirection()));
|
||||
}
|
||||
return stems;
|
||||
}
|
||||
|
||||
74
src/test/java/org/egothor/stemmer/TrieMetadataTest.java
Normal file
74
src/test/java/org/egothor/stemmer/TrieMetadataTest.java
Normal file
@@ -0,0 +1,74 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertAll;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertThrows;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import org.junit.jupiter.api.DisplayName;
|
||||
import org.junit.jupiter.api.Tag;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
@Tag("unit")
|
||||
@DisplayName("TrieMetadata")
|
||||
class TrieMetadataTest {
|
||||
|
||||
@Test
|
||||
@DisplayName("Text block roundtrip preserves all persisted fields")
|
||||
void textBlockRoundtripPreservesAllPersistedFields() {
|
||||
final TrieMetadata metadata = new TrieMetadata(5, WordTraversalDirection.FORWARD,
|
||||
new ReductionSettings(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_DOMINANT_GET_RESULTS, 80, 4),
|
||||
DiacriticProcessingMode.AS_IS, CaseProcessingMode.AS_IS);
|
||||
|
||||
final String textBlock = metadata.toTextBlock();
|
||||
final TrieMetadata parsed = TrieMetadata.fromTextBlock(5, textBlock);
|
||||
|
||||
assertAll(() -> assertEquals(metadata.traversalDirection(), parsed.traversalDirection()),
|
||||
() -> assertEquals(metadata.reductionSettings(), parsed.reductionSettings()),
|
||||
() -> assertEquals(metadata.diacriticProcessingMode(), parsed.diacriticProcessingMode()),
|
||||
() -> assertEquals(metadata.caseProcessingMode(), parsed.caseProcessingMode()),
|
||||
() -> assertTrue(textBlock.contains("rightToLeft=true")));
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("Text block parser rejects malformed input")
|
||||
void textBlockParserRejectsMalformedInput() {
|
||||
assertAll(
|
||||
() -> assertThrows(IllegalArgumentException.class,
|
||||
() -> TrieMetadata.fromTextBlock(5, "unknown-header\nx=y\n")),
|
||||
() -> assertThrows(IllegalArgumentException.class,
|
||||
() -> TrieMetadata.fromTextBlock(5, "radixor.metadata.v1\nmissingDelimiter\n")),
|
||||
() -> assertThrows(IllegalArgumentException.class,
|
||||
() -> TrieMetadata.fromTextBlock(5, "radixor.metadata.v1\ntraversalDirection=FORWARD\n")));
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,83 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertAll;
|
||||
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertThrows;
|
||||
|
||||
import org.junit.jupiter.api.DisplayName;
|
||||
import org.junit.jupiter.api.Tag;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
@Tag("unit")
|
||||
@DisplayName("WordTraversalDirection")
|
||||
class WordTraversalDirectionTest {
|
||||
|
||||
@Test
|
||||
@DisplayName("startIndex follows direction and validates negatives")
|
||||
void startIndexFollowsDirectionAndValidatesNegatives() {
|
||||
assertAll(() -> assertEquals(0, WordTraversalDirection.FORWARD.startIndex(3)),
|
||||
() -> assertEquals(2, WordTraversalDirection.BACKWARD.startIndex(3)),
|
||||
() -> assertEquals(-1, WordTraversalDirection.FORWARD.startIndex(0)),
|
||||
() -> assertThrows(IllegalArgumentException.class,
|
||||
() -> WordTraversalDirection.BACKWARD.startIndex(-1)));
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("logicalIndex maps offsets in both directions")
|
||||
void logicalIndexMapsOffsetsInBothDirections() {
|
||||
assertAll(() -> assertEquals(0, WordTraversalDirection.FORWARD.logicalIndex(4, 0)),
|
||||
() -> assertEquals(3, WordTraversalDirection.BACKWARD.logicalIndex(4, 0)),
|
||||
() -> assertEquals(1, WordTraversalDirection.FORWARD.logicalIndex(4, 1)),
|
||||
() -> assertEquals(2, WordTraversalDirection.BACKWARD.logicalIndex(4, 1)),
|
||||
() -> assertThrows(IllegalArgumentException.class,
|
||||
() -> WordTraversalDirection.FORWARD.logicalIndex(-1, 0)),
|
||||
() -> assertThrows(IllegalArgumentException.class,
|
||||
() -> WordTraversalDirection.BACKWARD.logicalIndex(3, 3)));
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("traversal character conversion preserves and reverses as expected")
|
||||
void traversalCharacterConversionPreservesAndReversesAsExpected() {
|
||||
assertAll(() -> assertArrayEquals(new char[] { 'a', 'b', 'c' },
|
||||
WordTraversalDirection.FORWARD.toTraversalCharacters("abc")),
|
||||
() -> assertArrayEquals(new char[] { 'c', 'b', 'a' },
|
||||
WordTraversalDirection.BACKWARD.toTraversalCharacters("abc")),
|
||||
() -> assertEquals("abc", WordTraversalDirection.FORWARD.traversalPathToLogicalKey("abc")),
|
||||
() -> assertEquals("cba", WordTraversalDirection.BACKWARD.traversalPathToLogicalKey("abc")),
|
||||
() -> assertThrows(NullPointerException.class,
|
||||
() -> WordTraversalDirection.FORWARD.toTraversalCharacters(null)),
|
||||
() -> assertThrows(NullPointerException.class,
|
||||
() -> WordTraversalDirection.BACKWARD.traversalPathToLogicalKey(null)));
|
||||
}
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user