17 Commits

Author SHA1 Message Date
0dc516357f docs: improve README, MkDocs content, branding assets, and site polish (2) 2026-04-19 00:20:24 +02:00
0b674a39a8 docs: improve README, MkDocs content, branding assets, and site polish 2026-04-19 00:18:42 +02:00
db79dd2d4f ci: refine build, benchmark, and Pages workflows
* add workflow-level concurrency control for benchmark and Pages pipelines
* keep release changelog generation and the separate distZip step in the build workflow by design
* align the benchmark workflow with the primary Gradle action setup
* add Gradle wrapper validation to benchmark runs
* switch benchmark caching and setup to gradle/actions/setup-gradle
* remove the redundant Gradle wrapper executable-bit adjustment
* keep benchmark generation in Pages unchanged while improving workflow control
2026-04-18 15:38:19 +02:00
db446932fc docs: refine footer branding and improve Javadoc overview
- remove Material for MkDocs generator branding from the site footer
- keep footer presentation aligned with the project's professional documentation style
- improve Javadoc overview content for the API landing page
- align Javadoc introductory text with the main project site messaging
- clarify project scope, documentation purpose, and license information
2026-04-18 15:04:37 +02:00
1df6c0c87e docs: refine Pages publishing and homepage positioning
fix Pages publishing workflow to preserve worktree metadata and keep .nojekyll after site synchronization
add generated historical builds index and publish builds/index.html explicitly
improve homepage messaging to highlight extensibility of compiled dictionaries through additional transformation layers
2026-04-18 14:15:41 +02:00
31ed39c785 Merge branch 'main' of https://gitea.egothor.org/Egothor/Radixor 2026-04-18 11:57:26 +02:00
4b57eecbeb fix: .gh-pages folder was not pushed to gh-pages 2026-04-18 11:55:55 +02:00
a002238602 fix: mkdocs build --strict ...failed 2026-04-18 02:40:55 +02:00
92d2c98fed fix: mkdocs build --strict ...failed 2026-04-18 02:35:29 +02:00
bc031f2d8b feat: add MkDocs Material site and publish docs + CI reports to GitHub Pages 2026-04-18 02:14:45 +02:00
59128edc42 fix: exclude all maven-metadata.xml variants from central bundle 2026-04-16 22:25:36 +02:00
7e1aea72bf refactor: apply minor Radixor refinements and refresh dependency locks 2026-04-16 21:31:01 +02:00
594abe2c4b feat: add jqwik property-based coverage for trie and patch invariants
test: add property-based tests for FrequencyTrie determinism across repeated compilation
test: verify semantic alignment of get(), getAll(), and getEntries()
test: verify binary serialization and compressed persistence round-trip stability
test: verify builder reconstruction preserves observable trie behavior
test: add property-based tests for PatchCommandEncoder encode/apply round-trip and determinism
test: add generated stemmer-trie properties ensuring returned patches reconstruct only acceptable stems
test: introduce bounded reusable jqwik generators and scenario builders for maintainable property coverage
build: add jqwik to test dependencies and integrate it with the existing JUnit Platform setup
test: replace Jupiter display and tag annotations in jqwik suites with jqwik-native metadata to remove discovery warnings
2026-04-16 19:40:29 +02:00
953ce2226a feat(test): add deterministic fuzz-style coverage for trie compilation and stemming
* add fixed-seed fuzz scenario generator for bounded trie and dictionary inputs
* validate compilation stability across repeated builds and binary round-trips
* validate generated stemming dictionaries for non-crashing compilation and acceptable stem reconstruction
* add CI-safe semantic invariants for reduced trie reconstruction using get() and getAll()
* avoid unstable count-preservation assertions for builder reconstruction from reduced shared tries
2026-04-16 18:51:39 +02:00
05692726c5 feat: publish Pages-backed quality badges in README
* add README badges for CI status, coverage, reports, mutation score, benchmark speedup, Maven Central, license, and Java baseline
* generate Shields endpoint metadata for JaCoCo, PIT, and JMH results
* move badge generation logic into tools/generate-pages-badges.py to keep workflows concise and maintainable
* update Pages publishing workflow to publish badge metadata for both build-specific and latest report views
* expose published badge metadata links in the reports index for transparency and troubleshooting
2026-04-16 18:22:24 +02:00
c18563617d feat: add release changelog generation and package distribution integration
feat: add custom release changelog generator based on release tag ranges and prefixed commit lines
build: include generated CHANGELOG.md in the distribution ZIP when present
ci: generate release changelog during release workflow and use it as the GitHub release body
ci: split release packaging so distZip is rebuilt after changelog generation
chore: keep changelog generation out of quality-gate and report publishing workflows
2026-04-16 17:42:22 +02:00
436deefd14 fix: exclude Maven metadata files from Central upload bundle
fix: remove maven-metadata files from the generated Central bundle
fix: align uploaded archive with Sonatype Portal component layout expectations
2026-04-16 03:42:59 +02:00
46 changed files with 4860 additions and 1757 deletions

View File

@@ -19,6 +19,10 @@ on:
- 'gradlew.bat'
- '.github/workflows/benchmarks.yml'
concurrency:
group: benchmarks-${{ github.ref }}
cancel-in-progress: true
jobs:
jmh:
runs-on: ubuntu-latest
@@ -31,15 +35,17 @@ jobs:
- name: Check out sources
uses: actions/checkout@v4
- name: Validate Gradle wrapper
uses: gradle/actions/wrapper-validation@v4
- name: Set up JDK 21
uses: actions/setup-java@v4
with:
distribution: temurin
java-version: '21'
cache: gradle
- name: Make Gradle executable
run: chmod +x ./gradlew
- name: Set up Gradle caching and instrumentation
uses: gradle/actions/setup-gradle@v4
- name: Verify reproducibility inputs
shell: bash

View File

@@ -156,11 +156,22 @@ jobs:
test -f gradle.properties
test -f gradle/verification-metadata.xml
- name: Build release distribution, signed Maven bundle, and SBOM
- name: Build release inputs, signed Maven bundle, and SBOM
env:
SIGNING_KEY: ${{ secrets.SIGNING_KEY }}
SIGNING_PASSWORD: ${{ secrets.SIGNING_PASSWORD }}
run: ./gradlew --no-daemon clean build pmdMain javadoc jacocoTestReport distZip cyclonedxBom centralBundle
run: ./gradlew --no-daemon clean build pmdMain javadoc jacocoTestReport cyclonedxBom centralBundle
- name: Generate release changelog
shell: bash
run: |
set -euo pipefail
chmod +x ./tools/generate-release-notes.sh
mkdir -p build/generated/release-notes
./tools/generate-release-notes.sh "${GITHUB_REF_NAME}" > build/generated/release-notes/CHANGELOG.md
- name: Package release distribution
run: ./gradlew --no-daemon distZip
- name: Publish bundle to Maven Central
shell: bash
@@ -188,7 +199,7 @@ jobs:
- name: Publish GitHub release assets
uses: softprops/action-gh-release@v2
with:
generate_release_notes: true
body_path: build/generated/release-notes/CHANGELOG.md
files: |
build/distributions/*.zip
build/reports/sbom/radixor-sbom.json

View File

@@ -5,6 +5,8 @@ on:
branches:
- main
paths:
- 'docs/**'
- 'mkdocs.yml'
- 'src/main/**'
- 'src/test/**'
- 'src/jmh/**'
@@ -17,6 +19,8 @@ on:
- 'gradlew'
- 'gradlew.bat'
- '.github/workflows/pages.yml'
- '.github/workflows/benchmarks.yml'
- 'tools/generate-pages-badges.py'
workflow_dispatch:
permissions:
@@ -49,6 +53,14 @@ jobs:
- name: Set up Gradle caching and instrumentation
uses: gradle/actions/setup-gradle@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.x'
- name: Install MkDocs Material
run: python -m pip install --upgrade pip mkdocs-material
- name: Verify reproducibility inputs
shell: bash
run: |
@@ -83,11 +95,13 @@ jobs:
SITE_DIR=".gh-pages"
RUN_DIR="${SITE_DIR}/builds/${GITHUB_RUN_NUMBER}"
RUN_METRICS_DIR="${RUN_DIR}/metrics"
LATEST_DIR="${SITE_DIR}/builds/latest"
LATEST_METRICS_DIR="${LATEST_DIR}/metrics"
mkdir -p "${RUN_DIR}"
rm -rf "${LATEST_DIR}"
mkdir -p "${LATEST_DIR}"
mkdir -p "${LATEST_DIR}" "${RUN_METRICS_DIR}" "${LATEST_METRICS_DIR}"
cp -R build/docs/javadoc "${RUN_DIR}/javadoc"
cp -R build/docs/javadoc "${LATEST_DIR}/javadoc"
@@ -108,12 +122,17 @@ jobs:
JMH_CSV_LINK=''
JMH_TXT_LATEST_LINK=''
JMH_CSV_LATEST_LINK=''
JMH_TXT_REPORT_MD='- Benchmark results (TXT): not currently available'
JMH_CSV_REPORT_MD='- Benchmark results (CSV): not currently available'
DEPENDENCY_CHECK_LINK=''
DEPENDENCY_CHECK_LATEST_LINK=''
DEPENDENCY_CHECK_REPORT_MD='- Dependency vulnerability report: not currently available'
SBOM_JSON_LINK=''
SBOM_XML_LINK=''
SBOM_JSON_LATEST_LINK=''
SBOM_XML_LATEST_LINK=''
SBOM_JSON_REPORT_MD='- SBOM (JSON): not currently available'
SBOM_XML_REPORT_MD='- SBOM (XML): not currently available'
if [ -d "build/reports/jmh" ]; then
cp -R build/reports/jmh "${RUN_DIR}/jmh"
@@ -122,10 +141,12 @@ jobs:
if [ -f "${RUN_DIR}/jmh/jmh-results.txt" ]; then
JMH_TXT_LINK='<li><a href="./jmh/jmh-results.txt">Benchmark Results (TXT)</a></li>'
JMH_TXT_LATEST_LINK='<li><a href="./builds/latest/jmh/jmh-results.txt">Benchmark Results (TXT)</a></li>'
JMH_TXT_REPORT_MD='- [JMH benchmark results (TXT)](https://leogalambos.github.io/Radixor/builds/latest/jmh/jmh-results.txt)'
fi
if [ -f "${RUN_DIR}/jmh/jmh-results.csv" ]; then
JMH_CSV_LINK='<li><a href="./jmh/jmh-results.csv">Benchmark Results (CSV)</a></li>'
JMH_CSV_LATEST_LINK='<li><a href="./builds/latest/jmh/jmh-results.csv">Benchmark Results (CSV)</a></li>'
JMH_CSV_REPORT_MD='- [JMH benchmark results (CSV)](https://leogalambos.github.io/Radixor/builds/latest/jmh/jmh-results.csv)'
fi
HAS_JMH="true"
@@ -140,6 +161,7 @@ jobs:
if [ -f "${RUN_DIR}/dependency-check/dependency-check-report.html" ]; then
DEPENDENCY_CHECK_LINK='<li><a href="./dependency-check/dependency-check-report.html">Dependency Vulnerability Report</a></li>'
DEPENDENCY_CHECK_LATEST_LINK='<li><a href="./builds/latest/dependency-check/dependency-check-report.html">Dependency Vulnerability Report</a></li>'
DEPENDENCY_CHECK_REPORT_MD='- [Dependency vulnerability report](https://leogalambos.github.io/Radixor/builds/latest/dependency-check/dependency-check-report.html)'
fi
fi
@@ -150,6 +172,40 @@ jobs:
SBOM_XML_LINK='<li><a href="./sbom/radixor-sbom.xml">SBOM (XML)</a></li>'
SBOM_JSON_LATEST_LINK='<li><a href="./builds/latest/sbom/radixor-sbom.json">SBOM (JSON)</a></li>'
SBOM_XML_LATEST_LINK='<li><a href="./builds/latest/sbom/radixor-sbom.xml">SBOM (XML)</a></li>'
SBOM_JSON_REPORT_MD='- [SBOM (JSON)](https://leogalambos.github.io/Radixor/builds/latest/sbom/radixor-sbom.json)'
SBOM_XML_REPORT_MD='- [SBOM (XML)](https://leogalambos.github.io/Radixor/builds/latest/sbom/radixor-sbom.xml)'
fi
python3 \
./tools/generate-pages-badges.py \
--jacoco-xml build/reports/jacoco/test/jacocoTestReport.xml \
--pit-xml build/reports/pitest/mutations.xml \
--jmh-csv build/reports/jmh/jmh-results.csv \
--run-metrics-dir "${RUN_METRICS_DIR}" \
--latest-metrics-dir "${LATEST_METRICS_DIR}"
COVERAGE_BADGE_LINK='<li><a href="./metrics/coverage-badge.json">Coverage Badge Metadata</a></li>'
COVERAGE_BADGE_LATEST_LINK='<li><a href="./builds/latest/metrics/coverage-badge.json">Coverage Badge Metadata</a></li>'
MUTATION_BADGE_LINK='<li><a href="./metrics/pitest-badge.json">Mutation Badge Metadata</a></li>'
MUTATION_BADGE_LATEST_LINK='<li><a href="./builds/latest/metrics/pitest-badge.json">Mutation Badge Metadata</a></li>'
JMH_BADGE_LINK='<li><a href="./metrics/jmh-badge.json">Benchmark Badge Metadata</a></li>'
JMH_BADGE_LATEST_LINK='<li><a href="./builds/latest/metrics/jmh-badge.json">Benchmark Badge Metadata</a></li>'
COVERAGE_BADGE_REPORT_MD='- [Coverage badge metadata](https://leogalambos.github.io/Radixor/builds/latest/metrics/coverage-badge.json)'
MUTATION_BADGE_REPORT_MD='- [Mutation badge metadata](https://leogalambos.github.io/Radixor/builds/latest/metrics/pitest-badge.json)'
JMH_BADGE_REPORT_MD='- [Benchmark badge metadata](https://leogalambos.github.io/Radixor/builds/latest/metrics/jmh-badge.json)'
if [ ! -f "${RUN_METRICS_DIR}/coverage-badge.json" ]; then
COVERAGE_BADGE_LINK='<li>Coverage Badge Metadata: not available</li>'
COVERAGE_BADGE_LATEST_LINK='<li>Coverage Badge Metadata: not available</li>'
COVERAGE_BADGE_REPORT_MD='- Coverage badge metadata: not currently available'
fi
if [ ! -f "${RUN_METRICS_DIR}/pitest-badge.json" ]; then
MUTATION_BADGE_REPORT_MD='- Mutation badge metadata: not currently available'
fi
if [ ! -f "${RUN_METRICS_DIR}/jmh-badge.json" ]; then
JMH_BADGE_REPORT_MD='- Benchmark badge metadata: not currently available'
fi
cat > "${RUN_DIR}/index.html" <<EOF
@@ -178,6 +234,9 @@ jobs:
${DEPENDENCY_CHECK_LINK:-<li>Dependency Vulnerability Report: not available</li>}
${SBOM_JSON_LINK:-<li>SBOM (JSON): not available</li>}
${SBOM_XML_LINK:-<li>SBOM (XML): not available</li>}
${COVERAGE_BADGE_LINK}
${MUTATION_BADGE_LINK}
${JMH_BADGE_LINK}
<li><a href="./pitest/">Mutation Testing Report</a></li>
$(
[ "${HAS_JMH}" = "true" ] && { echo "${JMH_TXT_LINK:-<li>Benchmark Results (TXT): not available</li>}"; echo "${JMH_CSV_LINK:-<li>Benchmark Results (CSV): not available</li>}"; } \
@@ -192,65 +251,73 @@ jobs:
cp "${RUN_DIR}/index.html" "${LATEST_DIR}/index.html"
cat > "${SITE_DIR}/.nojekyll" <<EOF
EOF
cat > docs/reports.md <<EOF
# CI Reports
BUILD_LIST=$(find "${SITE_DIR}/builds" -mindepth 1 -maxdepth 1 -type d -printf '%f\n' | grep -E '^[0-9]+$' | sort -nr | head -20)
Radixor publishes durable CI artifacts to GitHub Pages on every qualifying run of \`.github/workflows/pages.yml\`.
## Primary report entry points
- [Latest build summary](https://leogalambos.github.io/Radixor/builds/latest/)
- [Javadoc](https://leogalambos.github.io/Radixor/builds/latest/javadoc/)
- [Unit test report](https://leogalambos.github.io/Radixor/builds/latest/test/)
- [PMD report](https://leogalambos.github.io/Radixor/builds/latest/pmd/main.html)
- [JaCoCo coverage report](https://leogalambos.github.io/Radixor/builds/latest/coverage/)
- [PIT mutation testing report](https://leogalambos.github.io/Radixor/builds/latest/pitest/)
${DEPENDENCY_CHECK_REPORT_MD}
${SBOM_JSON_REPORT_MD}
${SBOM_XML_REPORT_MD}
## Benchmark and badge metadata
${JMH_TXT_REPORT_MD}
${JMH_CSV_REPORT_MD}
${COVERAGE_BADGE_REPORT_MD}
${MUTATION_BADGE_REPORT_MD}
${JMH_BADGE_REPORT_MD}
## Historical runs
- [Browse historical build reports](https://leogalambos.github.io/Radixor/builds/)
EOF
{
cat <<EOF
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Radixor Reports</title>
<style>
body { font-family: Arial, sans-serif; max-width: 1000px; margin: 2rem auto; padding: 0 1rem; line-height: 1.5; }
h1, h2 { margin-bottom: 0.5rem; }
ul { padding-left: 1.25rem; }
code { background: #f4f4f4; padding: 0.1rem 0.3rem; }
.meta { color: #555; }
</style>
</head>
<body>
<h1>Radixor Published Reports</h1>
<p class="meta">Durable CI reports published from GitHub Actions to the <code>gh-pages</code> branch.</p>
echo "# Historical Build Reports"
echo
echo "The following build report sets are currently published on GitHub Pages."
echo
echo "| Build | Published | Link |"
echo "|---:|---|---|"
<h2>Latest</h2>
<ul>
<li><a href="./builds/latest/">Latest build summary</a></li>
<li><a href="./builds/latest/javadoc/">Javadoc</a></li>
<li><a href="./builds/latest/test/">Test Report</a></li>
<li><a href="./builds/latest/pmd/main.html">PMD Report</a></li>
<li><a href="./builds/latest/coverage/">Coverage Report</a></li>
${DEPENDENCY_CHECK_LATEST_LINK:-<li>Dependency Vulnerability Report: not currently available</li>}
${SBOM_JSON_LATEST_LINK:-<li>SBOM (JSON): not available</li>}
${SBOM_XML_LATEST_LINK:-<li>SBOM (XML): not available</li>}
<li><a href="./builds/latest/pitest/">Mutation Testing Report</a></li>
$(
[ "${HAS_JMH}" = "true" ] && { echo "${JMH_TXT_LATEST_LINK:-<li>Benchmark Results (TXT): not available</li>}"; echo "${JMH_CSV_LATEST_LINK:-<li>Benchmark Results (CSV): not available</li>}"; } \
|| echo '<li>Benchmark results: not currently available</li>'
)
EOF
cat <<EOF
</ul>
<h2>Recent historical builds</h2>
<ul>
EOF
for build in ${BUILD_LIST}; do
echo " <li><a href=\"./builds/${build}/\">Build ${build}</a></li>"
find "${SITE_DIR}/builds" -mindepth 1 -maxdepth 1 -type d ! -name latest -printf '%P\n' \
| grep -E '^[0-9]+$' \
| while read -r build; do
ts="$(git -C "${SITE_DIR}" log --diff-filter=A --format='%ct' --reverse -- "builds/${build}/index.html" | head -n 1)"
if [ -n "${ts}" ]; then
published="$(date -u -d "@${ts}" '+%Y-%m-%d %H:%M')"
else
published="unknown"
ts="0"
fi
printf '%s\t%s\t%s\n' "${ts}" "${build}" "${published}"
done \
| sort -r -n -k1,1 \
| while IFS=$'\t' read -r _ts build published; do
echo "| ${build} | ${published} | [Open](../builds/${build}/) |"
done
} > docs/builds.md
cat <<EOF
</ul>
</body>
</html>
- name: Build documentation site (MkDocs Material)
shell: bash
run: |
set -euo pipefail
mkdocs build --strict --site-dir .mkdocs-site
rsync -a --delete --exclude '.git' --exclude '.git/' --exclude 'builds/' .mkdocs-site/ .gh-pages/
mkdir -p .gh-pages/builds
cp .mkdocs-site/builds/index.html .gh-pages/builds/index.html
cat > .gh-pages/.nojekyll <<EOF
EOF
} > "${SITE_DIR}/index.html"
rm -rf .mkdocs-site
- name: Commit and push gh-pages
shell: bash

3
.gitignore vendored
View File

@@ -90,6 +90,9 @@ local.properties
# PMD plugin conf
.pmd
# jqwik local db
.jqwik-database
##---------------------------------------------------------------------------------------- Gradle
.gradle
**/build/

189
README.md
View File

@@ -2,53 +2,70 @@
# Radixor
*Fast algorithmic stemming with compact patch-command tries — measured at about 4× to 6× the throughput of the Snowball Porter stemmer family on the current English benchmark workload.*
[![Quality gates](https://github.com/leogalambos/Radixor/actions/workflows/build.yml/badge.svg?branch=main)](https://github.com/leogalambos/Radixor/actions/workflows/build.yml)
[![Coverage](https://img.shields.io/endpoint?url=https://leogalambos.github.io/Radixor/builds/latest/metrics/coverage-badge.json)](https://leogalambos.github.io/Radixor/builds/latest/coverage/)
[![Published reports](https://img.shields.io/badge/reports-GitHub%20Pages-blue)](https://leogalambos.github.io/Radixor/builds/latest/)
[![Mutation score](https://img.shields.io/endpoint?url=https://leogalambos.github.io/Radixor/builds/latest/metrics/pitest-badge.json)](https://leogalambos.github.io/Radixor/builds/latest/pitest/)
[![English benchmark](https://img.shields.io/endpoint?url=https://leogalambos.github.io/Radixor/builds/latest/metrics/jmh-badge.json)](https://leogalambos.github.io/Radixor/builds/latest/jmh/jmh-results.txt)
[![Maven Central](https://img.shields.io/maven-central/v/org.egothor/radixor)](https://central.sonatype.com/artifact/org.egothor/radixor)
[![License](https://img.shields.io/github/license/leogalambos/Radixor)](LICENSE)
[![Java](https://img.shields.io/badge/Java-21%2B-brightgreen)](#)
**Radixor** is a fast, algorithmic stemming toolkit for Java, built around compact **patch-command tries** in the tradition of the original **Egothor** stemmer.
*Fast, deterministic, multi-language stemming for Java, built around compact patch-command tries and measured at roughly 4× to 6× the throughput of the Snowball Porter stemmer family on the current English benchmark workload.*
On the current JMH English comparison benchmark, Radixor with bundled `US_UK_PROFI`
reaches approximately **31 to 32 million tokens per second**, compared with about
**8 million tokens per second** for Snowball original Porter and about
**5 to 5.5 million tokens per second** for Snowball English (Porter2).
**Radixor** is a modern multi-language stemming toolkit for Java in the tradition of the original **Egothor** approach. It learns compact word-to-stem transformations from dictionary data, stores them in compiled patch-command tries, and exposes a runtime model designed for speed, determinism, and operational simplicity. Unlike a closed-form dictionary lookup stemmer, Radixor can also generalize beyond explicitly listed word forms.
That means the current Radixor implementation is approximately:
It is particularly well suited to systems that need stemming which is:
- **4× faster** than Snowball original Porter
- **6× faster** than Snowball English (Porter2)
- fast at runtime,
- compact in memory and on disk,
- deterministic in behavior,
- adaptable through dictionary data rather than hardcoded language rules,
- practical to compile, persist, version, extend, and deploy.
It is designed for production search and text-processing systems that need stemming which is:
- fast at runtime
- compact in memory and on disk
- deterministic in behavior
- driven by dictionary data rather than hardcoded language rules
- practical to maintain, extend, and test
Radixor keeps the valuable core of the original Egothor idea, modernizes the implementation, and adds capabilities that make it more useful in real software systems today.
It also retains the operational advantages of a compiled artifact model: predictable runtime behavior, direct binary loading, and clear separation between preparation-time compilation and live request processing.
## Table of Contents
- [Why Radixor](#why-radixor)
- [Performance](#performance)
- [Heritage](#heritage)
- [What Radixor adds](#what-radixor-adds)
- [Key features](#key-features)
- [Performance](#performance)
- [Documentation](#documentation)
- [Project philosophy](#project-philosophy)
- [Historical note](#historical-note)
## Why Radixor
The central idea behind Radixor is simple: learn how to transform a word form into its stem, encode that transformation as a compact patch command, store it in a trie, and make runtime lookup extremely fast.
The central idea behind Radixor is simple: learn how to transform a word form into its stem, encode that transformation as a compact patch command, store it in a trie, and make the runtime path as small and direct as possible.
This gives you a stemmer that is:
That produces a stemmer that is:
- data-driven rather than rule-hardcoded
- reusable across languages
- compact enough for deployment-friendly binary artifacts
- suitable for both offline compilation and runtime loading
- data-driven rather than rule-hardcoded,
- applicable across languages through compiled transformation models learned from dictionary data,
- compact enough for deployment-friendly binary artifacts,
- suitable for both offline compilation and direct runtime loading,
- capable of exposing either a preferred result or multiple candidate results when ambiguity matters.
Radixor is especially attractive when you want something more adaptable than simple suffix stripping, but much smaller and easier to operate than a full morphological analyzer. In the current English benchmark comparison against the Snowball Porter stemmer family, it also delivers a substantial throughput advantage.
Radixor is especially attractive when you want something more adaptable than simple suffix stripping, but much smaller and easier to operate than a full morphological analyzer.
## Performance
Radixor includes a JMH benchmark suite for both its own algorithmic core and a side-by-side English comparison against the Snowball Porter stemmer family.
On the current English comparison workload, Radixor with bundled `US_UK_PROFI` reaches approximately **31 to 32 million tokens per second**. Snowball original Porter reaches approximately **8 million tokens per second**, and Snowball English (Porter2) approximately **5 to 5.5 million tokens per second**.
That places Radixor at approximately:
- **4× the throughput of Snowball original Porter**
- **6× the throughput of Snowball English (Porter2)**
on the current benchmark workload.
This is a throughput comparison on the same deterministic token stream. It is **not** a claim that the compared stemmers are linguistically equivalent or interchangeable.
For benchmark scope, workload design, environment, commands, report locations, and interpretation guidance, see [Benchmarking](docs/benchmarking.md).
## Heritage
@@ -60,44 +77,47 @@ Useful historical references:
- [Egothor project](http://www.egothor.org/)
- [Stempel overview](https://www.getopt.org/stempel/)
- [Leo Galambos, *Lemmatizer for Document Information Retrieval Systems in JAVA* (SOFSEM 2001)](https://www.researchgate.net/publication/221512865_Lemmatizer_for_Document_Information_Retrieval_Systems_in_JAVA)
- [Lucene Stempel overview](https://lucene.apache.org/core/5_3_0/analyzers-stempel/index.html)
- [Elasticsearch Stempel plugin](https://www.elastic.co/docs/reference/elasticsearch/plugins/analysis-stempel)
Radixor is not just a repackaging of legacy code. It is a practical modernization of the approach for current Java development and long-term maintainability.
The Galambos paper is a useful historical reference for the semi-automatic, transformation-based stemming idea that later informed the Egothor lineage and, in turn, the conceptual background of Radixor. It should be read as research and heritage context rather than as a description of Radixor's present-day implementation.
Radixor is not a repackaging of legacy code. It is a modern implementation that preserves the valuable core idea while reworking the engineering around maintainability, testing, persistence, and long-term operational use.
## What Radixor adds
Radixor keeps the patch-command trie model, but improves the engineering around it.
Radixor keeps the patch-command trie model, but improves the engineering around it in ways that matter in real software systems.
Compared with the historical baseline, Radixor emphasizes:
- **simplification to the most practical core**
The implementation focuses on the parts of the original approach that are most useful in production.
- **a focused practical core**
The implementation concentrates on the parts of the original approach that are most useful in production.
- **immutable compiled tries**
Runtime lookup uses compact read-only structures optimized for efficient access.
- **support for more than one stemming result**
Radixor can expose both a preferred result and multiple candidate results where the data is ambiguous.
Radixor can expose both a preferred result and multiple candidate results when the underlying data is ambiguous.
- **frequency-aware deterministic ordering**
Candidate results are ordered consistently and reproducibly.
- **practical subtree reduction modes**
Reduction can be tuned toward stronger compression or more conservative behavioral preservation.
Reduction can be tuned toward stronger compression or more conservative semantic preservation.
- **reconstruction of writable builders from compiled tables**
- **reconstruction of writable builders from compiled artifacts**
Existing compiled stemmer tables can be reopened, modified, and compiled again.
- **better tests and implementation stability**
Stronger coverage improves confidence during refactoring and further development.
- **strong validation discipline**
Coverage, mutation testing, benchmark visibility, and published reports are treated as part of the engineering standard rather than optional project decoration.
## Key features
- Fast algorithmic stemming
- Compact compiled binary artifacts
- Patch-command based transformation model
- Dictionary-driven language adaptation
- Multi-language stemming through compiled transformation models
- Single-result and multi-result lookup
- Deterministic result ordering
- Compressed binary persistence
@@ -105,57 +125,69 @@ Compared with the historical baseline, Radixor emphasizes:
- CLI compilation tool
- Bundled language resources
- Support for extending compiled stemmer tables
## Performance
Radixor includes a JMH benchmark suite for both its own algorithmic core and a
side-by-side comparison against the Snowball Porter stemmer family.
On the current English comparison workload, Radixor with bundled `US_UK_PROFI`
reaches approximately **31 to 32 million tokens per second**. Snowball original
Porter reaches approximately **8 million tokens per second**, and Snowball
English (Porter2) approximately **5 to 5.5 million tokens per second**.
That places Radixor at approximately **4× the throughput of Snowball original Porter**
and approximately **6× the throughput of Snowball English (Porter2)**
on the current benchmark workload.
This is a throughput comparison on the same deterministic token stream. It is
not a claim that the compared stemmers are linguistically equivalent or
interchangeable.
For benchmark scope, workload design, environment, commands, report locations,
and interpretation guidance, see [Benchmarking](docs/benchmarking.md).
- Reproducible and auditable engineering posture
## Documentation
The repository keeps the front page concise and places detailed documentation under `docs/`.
Start here:
### Getting Started
- [Quick Start](docs/quick-start.md)
A practical first guide to loading, compiling, and using Radixor.
- [Built-in Languages](docs/built-in-languages.md)
Overview of bundled language resources such as `US_UK` and `US_UK_PROFI`.
- [Dictionary Format](docs/dictionary-format.md)
How to write stemming dictionaries.
How to write and normalize stemming dictionaries.
- [Compilation (CLI tool)](docs/cli-compilation.md)
How to compile dictionaries with the `Compile` CLI.
How to compile dictionaries into deployable binary artifacts.
- [Programmatic Usage](docs/programmatic-usage.md)
How to build, load, modify, and query Radixor from Java code.
### Programmatic Usage
- [Built-in Languages](docs/built-in-languages.md)
How to use integrated language resources such as `US_UK_PROFI`.
- [Programmatic Usage Overview](docs/programmatic-usage.md)
Entry point to the Java API and the overall usage model.
- [Architecture and Reduction](docs/architecture-and-reduction.md)
Internal model, compiled trie design, and reduction strategies.
- [Loading and Building Stemmers](docs/programmatic-loading-and-building.md)
Loading bundled resources, textual dictionaries, binary artifacts, and direct builder usage.
- [Querying and Ambiguity Handling](docs/programmatic-querying-and-ambiguity.md)
`get()`, `getAll()`, `getEntries()`, patch application, and ambiguity behavior.
- [Extending and Persisting Compiled Tries](docs/programmatic-extending-and-persistence.md)
Reopening compiled tries, rebuilding them, and writing binary artifacts.
### Concepts and Internals
- [Architecture and Reduction Overview](docs/architecture-and-reduction.md)
High-level explanation of the build pipeline and compiled trie model.
- [Architecture](docs/architecture.md)
Structural model, data flow, and runtime lookup behavior.
- [Reduction Semantics](docs/reduction-semantics.md)
Ranked, unordered, and dominant reduction behavior.
- [Compatibility and Guarantees](docs/compatibility-and-guarantees.md)
Supported public API, internal API boundaries, and compatibility expectations.
### Dictionaries and Language Resources
- [Contributing Dictionaries](docs/contributing-dictionaries.md)
Guidance for high-quality lexical resource contributions.
### Quality and Operations
- [Quality and Operations](docs/quality-and-operations.md)
Testing, persistence, deployment, and operational guidance.
Engineering standards, validation posture, auditability, and operational model.
- [Benchmarking](docs/benchmarking.md)
JMH benchmark design, Snowball comparison, execution, and interpretation.
JMH benchmark methodology, Porter comparison, and result interpretation.
- [Published Reports](docs/reports.md)
Entry points to CI-published reports and GitHub Pages artifacts.
## Project philosophy
@@ -163,19 +195,20 @@ Radixor does not preserve historical complexity for its own sake.
It preserves the valuable idea:
- compact learned transformations
- trie-based lookup
- language-data driven stemming
- practical runtime speed
- compact learned transformations,
- trie-based lookup,
- language-data driven stemming,
- practical runtime speed.
Then it improves the parts modern users care about:
- maintainability
- testability
- modification workflows
- persistence
- determinism
- clearer APIs
- maintainability,
- testability,
- modification workflows,
- persistence,
- determinism,
- clearer APIs,
- explicit quality evidence.
The goal is to keep the Egothor/Stempel lineage useful as a serious contemporary software component.

View File

@@ -70,6 +70,7 @@ dependencies {
testImplementation libs.mockito.core
testImplementation libs.mockito.junit.jupiter
testImplementation libs.jqwik
mockitoAgent(libs.mockito.core) {
transitive = false
@@ -187,6 +188,47 @@ pitest {
application {
mainClass = 'org.egothor.stemmer.Compile'
applicationName = 'radixor'
executableDir = 'bin'
}
distributions {
main {
distributionBaseName = 'radixor'
contents {
from('README.md') {
into ''
}
from('LICENSE') {
into ''
}
from('docs') {
into 'docs'
include '**/*.md'
}
from(layout.buildDirectory.dir('generated/release-notes')) {
into ''
include 'CHANGELOG.md'
}
}
}
}
tasks.named('startScripts') {
applicationName = 'radixor'
}
tasks.named('distZip', Zip) {
archiveBaseName = 'radixor'
archiveClassifier = 'bin'
}
tasks.named('distTar') {
enabled = false
}
jmh {
@@ -260,6 +302,17 @@ javadoc {
options.version = true
options.windowTitle = 'Radixor - Egothor Stemmer'
options.docTitle = 'Radixor - Egothor Stemmer API'
options.overview = file('src/main/javadoc/overview.html')
options.bottom = """
<div class="legal-copy">
&copy; 2026 Egothor
<br/>
Licensed under <a href="https://github.com/leogalambos/Radixor/blob/main/LICENSE">BSD-3-Clause</a>
</div>
"""
options.links('https://docs.oracle.com/en/java/javase/21/docs/api/')
options.group('Core Stemming API', 'org.egothor.stemmer')
options.group('Trie Infrastructure', 'org.egothor.stemmer.trie')
source = sourceSets.main.allJava
}

View File

@@ -1,470 +1,52 @@
# Architecture and Reduction
> ← Back to [README.md](../README.md)
This section explains how **Radixor** turns textual dictionary input into a compact compiled stemmer and how reduction affects the semantics preserved in the final runtime artifact.
This document describes the internal architecture of **Radixor** and the principles behind its **trie compilation and reduction model**.
Radixor is easiest to understand when separated into two related concerns:
It explains:
- **architecture**: what structures exist, how data moves through them, and what runtime lookup actually does,
- **reduction semantics**: what it means for two subtrees to be considered equivalent and how that choice affects `get()` and `getAll()` behavior.
- how data flows from dictionary input to compiled trie
- how patch-command tries are structured
- how subtree reduction works
- how reduction modes affect behavior and size
## The short version
Radixor does not keep a large flat table of final stems. Instead, it converts dictionary entries into **patch commands**, stores them in a trie, reduces equivalent subtrees, and freezes the result into an immutable compiled structure.
The build-time flow is:
## Overview
Radixor transforms dictionary data into an optimized runtime structure through three stages:
1. **Mutable construction**
2. **Reduction (canonicalization)**
3. **Compilation (freezing)**
```
Dictionary → Mutable trie → Reduced trie → Compiled trie
```text
Dictionary -> Mutable trie -> Reduced trie -> Compiled trie
```
Each stage has a distinct purpose:
At runtime, the compiled trie does not directly return the final stem string. It returns one or more stored patch commands for the addressed key, and those commands are then applied to the original input word.
| Stage | Purpose | Structure |
|------------|----------------------------------|-------------------------|
| Build | Collect mappings | `MutableNode` |
| Reduction | Merge equivalent subtrees | `ReducedNode` |
| Compilation | Optimize for runtime lookup | `CompiledNode` |
## Why this matters
This design gives Radixor several practical properties at once:
- compact deployable artifacts,
- deterministic runtime behavior,
- support for both preferred and multiple candidate results,
- separation of preparation-time complexity from runtime lookup.
## Core data model
It also explains why a large source dictionary can be transformed into a much smaller compiled artifact without discarding the operational behavior that matters to the caller.
### Patch-command trie
## Reading guide
Radixor stores **patch commands** instead of stems directly.
Use the following pages depending on what you need to understand:
- keys: word forms
- values: transformation commands
- structure: trie (prefix tree)
- [Architecture](architecture.md) explains the data flow, core structures, patch-command lookup model, and why the compiled trie is efficient at runtime.
- [Reduction Semantics](reduction-semantics.md) explains how subtree equivalence is defined, what ranked, unordered, and dominant reduction preserve, and how those choices affect observable lookup behavior.
At runtime:
## Recommended reading order
1. the word is traversed through the trie
2. a patch command is retrieved
3. the patch is applied to reconstruct the stem
For most readers, the best order is:
1. [Architecture](architecture.md)
2. [Reduction Semantics](reduction-semantics.md)
## Related documentation
## Stage 1: Mutable construction
The builder (`FrequencyTrie.Builder`) constructs a trie using:
- `MutableNode`
- maps of children (`char → node`)
- maps of value counts (`value → frequency`)
Characteristics:
- insertion-order preserving
- mutable
- optimized for building, not querying
Example structure:
```
g
└─ n
└─ i
└─ n
└─ n
└─ u
└─ r
└─ (values: {
"<patch-command-1>": 3,
"<patch-command-2>": 1
})
```
This example represents the word "running", stored in reversed form.
- each edge corresponds to one character of the word
- the path is traversed from the end of the word toward the beginning
- the terminal node stores one or more patch commands together with their local frequencies
The values represent transformations from the word form to candidate stems, and the counts indicate how often each mapping was observed during construction.
Note: Radixor stores word forms in reversed order so that suffix-based transformations can be matched efficiently in a trie.
## Local value summary
Before reduction, each node is summarized using `LocalValueSummary`.
It computes:
- ordered values (by frequency)
- aligned counts
- total frequency
- dominant value (if any)
- second-best value
This summary is critical for:
- deterministic ordering
- reduction decisions
- dominance evaluation
## Stage 2: Reduction (canonicalization)
Reduction is the process of merging **semantically equivalent subtrees**.
### Why reduction exists
Without reduction:
- trie size grows linearly with input data
- repeated patterns are duplicated
With reduction:
- identical subtrees are shared
- memory footprint is reduced
- binary output becomes smaller
## Reduction signature
Each subtree is represented by a **ReductionSignature**.
A signature consists of:
1. **local descriptor** (node semantics)
2. **child descriptors** (structure)
```
Signature = (LocalDescriptor, SortedChildDescriptors)
```
Two subtrees are merged if their signatures are equal.
## Local descriptors
The local descriptor encodes how values at a node are interpreted.
Radixor supports three descriptor types:
### 1. Ranked descriptor
Preserves:
- full ordering of values (`getAll()`)
Uses:
- ordered value list
Best for:
- correctness
- deterministic multi-result behavior
### 2. Unordered descriptor
Preserves:
- only membership (set of values)
Ignores:
- ordering differences
Best for:
- higher compression
- use cases where ordering is irrelevant
### 3. Dominant descriptor
Preserves:
- only the dominant value (`get()`)
Condition:
- dominant value must satisfy thresholds:
- minimum percentage
- ratio over second-best
Fallback:
- if dominance is not strong enough → ranked descriptor is used
Best for:
- maximum compression
- single-result workflows
## Child descriptors
Each child is represented as:
```
(edge character, child signature)
```
Children are sorted by edge character to ensure:
- deterministic signatures
- stable equality comparisons
## Reduction context
`ReductionContext` maintains:
- mapping: `ReductionSignature → ReducedNode`
- canonical instances of subtrees
Workflow:
1. compute signature
2. check if already exists
3. reuse existing node or create new one
This ensures:
- structural sharing
- no duplicate equivalent subtrees
## Reduced nodes
`ReducedNode` represents:
- canonical subtree
- aggregated value counts
- canonical children
It supports:
- merging local counts
- verifying structural consistency
At this stage:
- structure is canonical
- still mutable (internally)
## Stage 3: Compilation (freezing)
The reduced trie is converted into a **CompiledNode** structure.
### CompiledNode characteristics
- immutable
- array-based storage
- optimized for fast lookup
Fields:
- `char[] edgeLabels`
- `CompiledNode[] children`
- `V[] orderedValues`
- `int[] orderedCounts`
## Lookup algorithm
Runtime lookup:
1. traverse trie using `edgeLabels` (matching characters from the end of the word toward the beginning)
2. binary search per node
3. retrieve values
4. apply patch command
Properties:
- O(length of word)
- low memory overhead
- minimal memory allocation during lookup; patch application produces the resulting string
## Deterministic ordering
Value ordering is deterministic and stable:
1. higher frequency first
2. shorter string first
3. lexicographically smaller
4. insertion order
This guarantees:
- reproducible builds
- stable query results
- predictable ranking
## Reduction modes
Reduction modes control how local descriptors are chosen.
### Ranked mode
```
MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS
```
- preserves full semantics
- safest option
- recommended default
### Unordered mode
```
MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS
```
- ignores ordering
- higher compression
- slightly weaker semantics
### Dominant mode
```
MERGE_SUBTREES_WITH_EQUIVALENT_DOMINANT_GET_RESULTS
```
- keeps only dominant result
- highest compression
- may lose alternative candidates
## Trade-offs
| Aspect | Ranked | Unordered | Dominant |
|---------------|--------|----------|----------|
| Compression | Medium | High | Highest |
| Accuracy | High | Medium | Lower |
| getAll() | Full | Partial | Limited |
| get()         | Exact  | Exact    | Heuristic |
## Deserialization model
Binary loading uses:
- `NodeData` as intermediate representation
- reconstruction of `CompiledNode`
This separates:
- I/O format
- in-memory structure
## Why this architecture works
Radixor achieves:
### Compactness
- subtree sharing
- efficient encoding
- compressed binary output
### Performance
- array-based lookup
- no runtime reduction
- minimal branching
### Flexibility
- configurable reduction strategies
- multiple result support
- dictionary-driven behavior
### Determinism
- stable ordering
- canonical signatures
- reproducible builds
## Design philosophy
The architecture reflects a few key principles:
- separate build-time complexity from runtime simplicity
- encode semantics explicitly (not implicitly in code)
- favor deterministic behavior over heuristic shortcuts
- allow controlled trade-offs between size and fidelity
## When to tune reduction
You should consider changing reduction mode when:
- binary size is too large
- memory footprint must be minimized
- only single-result stemming is needed
Otherwise:
**use ranked mode by default**
## Next steps
- [Quick start](quick-start.md)
- [Programmatic usage](programmatic-usage.md)
- [CLI compilation](cli-compilation.md)
- [Dictionary format](dictionary-format.md)
## Summary
Radixor's architecture is built around:
- patch-command tries
- canonical subtree reduction
- immutable compiled structures
This design allows the system to remain:
- fast
- compact
- deterministic
- adaptable
while still supporting advanced use cases such as:
- ambiguity-aware stemming
- dictionary evolution
- controlled trade-offs between size and behavior

209
docs/architecture.md Normal file
View File

@@ -0,0 +1,209 @@
# Architecture
This document explains the structural architecture of **Radixor**: what data is stored, how it flows through the build pipeline, and how runtime lookup works once a compiled trie has been produced.
## The central idea
Radixor does not store final stems directly as a large flat lookup table. Instead, it stores **patch commands** that describe how a word form should be transformed into a canonical stem.
For example, if a dictionary states that `running` should reduce to `run`, the final runtime artifact does not need to store the full `running -> run` mapping as a literal output-string entry. It can store a compact transformation command that expresses how to turn the source form into the target form.
That matters because many words share similar transformation patterns. Once those mappings are organized in a trie and compiled into a canonical structure, the result is much smaller and more reusable than a naive direct-output table.
## End-to-end build flow
The full build-time flow is:
```text
Dictionary -> Mutable trie -> Reduced trie -> Compiled trie
```
Each stage has a different purpose.
### Dictionary input
The textual dictionary groups known word forms under a canonical stem:
```text
run running runs ran
connect connected connecting connection
```
The first token is the canonical stem. The following tokens are known variants.
### Patch-command generation
Each variant is converted into a patch command that transforms the variant into the stem.
Conceptually:
```text
running -> <patch> -> run
runs -> <patch> -> run
ran -> <patch> -> run
```
If `storeOriginal` is enabled, the stem itself is also inserted using a canonical no-op patch.
### Mutable trie construction
Those patch-command values are inserted into a mutable trie keyed by the source surface form.
### Reduction
Equivalent subtrees are merged into canonical reduced nodes.
### Compilation
The reduced structure is frozen into an immutable compiled trie optimized for runtime lookup.
## Why a trie is used
A trie is useful because many word forms share structural fragments. Instead of storing each word independently, the trie reuses paths and organizes lookup by character traversal.
A trie node can contain:
- outgoing edges,
- one or more ordered values,
- counts aligned with those values.
This is why the structure can represent both:
- a single preferred result,
- multiple competing results for the same key.
## Stage 1: Mutable construction
The mutable build-time structure is created by `FrequencyTrie.Builder`.
This stage is optimized for insertion rather than runtime lookup. As dictionary data is added, the builder accumulates:
- child edges,
- local values,
- local frequencies of those values.
Those frequencies are not incidental metadata. They later influence both result ordering and, depending on reduction mode, the semantic identity of subtrees during reduction.
### Why the build-time form is mutable
The builder must be easy to extend and easy to aggregate into. That is the opposite of what a runtime lookup structure needs.
Build-time priorities are:
- flexibility,
- accumulation of counts,
- structural growth.
Runtime priorities are:
- compactness,
- immutability,
- fast lookup.
Radixor therefore keeps construction and runtime representation strictly separate.
## What a compiled node contains
After reduction and freezing, the runtime structure uses immutable compiled nodes.
A compiled node stores:
- `char[] edgeLabels`
- child-node references aligned with those labels
- ordered value arrays
- aligned count arrays
This array-based form is compact and efficient for lookup.
## Runtime lookup model
At runtime, lookup is conceptually simple:
1. traverse the compiled trie by the input key,
2. reach the node addressed by that key,
3. retrieve one or more stored patch commands,
4. apply the chosen patch command to the original word.
The trie itself does not create the final stem string. It selects the stored transformation command. `PatchCommandEncoder.apply(...)` then performs the actual transformation.
That separation is architecturally important:
- the trie is responsible for **selection**,
- patch application is responsible for **transformation**.
## `get()` and `getAll()`
The runtime API exposes two complementary views of the addressed node.
### `get()`
`get()` returns the locally preferred value stored at that node.
Preference is deterministic:
1. higher local frequency wins,
2. shorter textual representation wins,
3. lexicographically lower textual representation wins,
4. stable first-seen order acts as the final tie-breaker.
### `getAll()`
`getAll()` returns all locally stored values in deterministic ranked order.
This is what allows Radixor to preserve ambiguity explicitly instead of forcing every key into a single answer.
## Why multiple results can exist
Some stemming systems discard ambiguity early because they insist on returning exactly one answer.
Radixor does not require that simplification. If multiple plausible patch commands exist for a key, the compiled trie can preserve them and the runtime API can expose them.
That is useful when downstream logic wants to:
- inspect ambiguity,
- preserve alternatives for retrieval,
- apply later ranking or domain-specific selection.
## Why compiled artifacts are compact
The final compiled trie can be much smaller than the original dictionary for several reasons working together:
- patch commands are compact,
- trie paths reuse shared structure,
- reduction merges equivalent subtrees,
- binary persistence stores the already reduced form,
- GZip compression is applied on top of the binary format.
This is why a very large dictionary can still produce a manageable deployable runtime artifact.
## Why preparation can still use more memory
The compactness of the final artifact should not be confused with the memory usage of preparation.
Before reduction has completed, the mutable build-time structure must exist in memory. For large dictionaries, that temporary preparation cost can be noticeably higher than the size of the final persisted artifact or the loaded compiled trie.
That is why the preferred operational model is usually:
- compile offline,
- persist the compiled artifact,
- load the finished artifact in runtime services.
## Determinism as a design principle
Radixor favors deterministic behavior throughout the pipeline.
This appears in:
- lowercased dictionary parsing,
- stable value ordering,
- sorted child descriptors,
- canonical reduction signatures,
- reproducible compiled lookup behavior.
Determinism matters not only for tests, but also for operational trust. It makes stemming behavior explainable and reproducible across builds and environments.
## Continue with
- [Reduction Semantics](reduction-semantics.md)
- [Programmatic usage](programmatic-usage.md)
- [CLI compilation](cli-compilation.md)

Binary file not shown.

After

Width:  |  Height:  |  Size: 540 KiB

View File

@@ -0,0 +1,109 @@
/* Compact technical typography for Radixor.
   Material for MkDocs style overrides: tightens font sizes, spacing,
   and navigation density. Only typography and rhythm are adjusted;
   colors and theme structure are left to the base theme. */
:root {
/* Override the Material theme's text font stack with Inter,
   falling back to common system sans-serif fonts. */
--md-text-font: "Inter", "Segoe UI", "Roboto", "Helvetica Neue", Arial, sans-serif;
}
/* Hide page title only on the landing page
   (the landing page markdown marks its H1 with this class). */
.visually-hidden {
display: none;
}
/* Main article text: smaller base size and tighter line height
   for a denser, technical-reference feel. */
.md-typeset {
font-size: 0.78rem;
line-height: 1.3;
}
/* Paragraph spacing: reduce vertical rhythm uniformly across
   paragraphs, lists, definition lists, and blockquotes. */
.md-typeset p,
.md-typeset ul,
.md-typeset ol,
.md-typeset dl,
.md-typeset blockquote {
margin-top: 0.45em;
margin-bottom: 0.45em;
}
/* Headings: compact margins and proportionally scaled sizes. */
.md-typeset h1 {
margin: 0 0 0.7rem;
font-size: 1.8rem;
line-height: 1.15;
}
.md-typeset h2 {
margin: 1.2rem 0 0.55rem;
font-size: 1.3rem;
line-height: 1.2;
}
.md-typeset h3 {
margin: 1rem 0 0.45rem;
font-size: 1.05rem;
line-height: 1.25;
}
/* h4-h6 share the same compact margin; sizes stay at theme defaults. */
.md-typeset h4,
.md-typeset h5,
.md-typeset h6 {
margin: 0.85rem 0 0.35rem;
line-height: 1.25;
}
/* Lists: tighter item spacing and a slightly reduced indent. */
.md-typeset li {
margin-bottom: 0.15em;
}
.md-typeset ul,
.md-typeset ol {
padding-left: 1.1rem;
}
/* Tables: compact cell padding (applies to plain markdown tables,
   i.e. tables without an explicit class). */
.md-typeset table:not([class]) td,
.md-typeset table:not([class]) th {
padding: 0.45rem 0.7rem;
}
/* Code blocks: smaller monospace size with a readable line height. */
.md-typeset pre > code {
font-size: 0.72rem;
line-height: 1.4;
}
/* Inline code: match the code-block font size. */
.md-typeset code {
font-size: 0.72rem;
}
/* Navigation density: tighten the sidebar link spacing. */
.md-nav__item .md-nav__link {
margin-top: 0.12rem;
margin-bottom: 0.12rem;
}
.md-sidebar__scrollwrap {
padding-top: 0.3rem;
padding-bottom: 0.3rem;
}
/* Slightly narrower content rhythm: trim the top margin and bottom
   padding of the main content column. */
.md-content__inner {
margin-top: 0.6rem;
padding-bottom: 1.2rem;
}
/* Admonitions more compact: reduce vertical margins for
   admonition boxes and collapsible details blocks. */
.md-typeset .admonition,
.md-typeset details {
margin: 0.8rem 0;
}
/* Optional: use a bit wider content area on large screens
   (76.25em matches Material's large-screen breakpoint). */
@media screen and (min-width: 76.25em) {
.md-grid {
max-width: 68rem;
}
}

View File

@@ -1,25 +1,37 @@
# Benchmarking
> ← Back to [README.md](../README.md)
Radixor includes a JMH benchmark suite for both the internal algorithmic core and a side-by-side English comparison against the Snowball Porter stemmer family.
This document explains what is benchmarked, how to run it, and how to interpret the results responsibly.
This document explains what is benchmarked, how to run the suite, and how benchmark results should be interpreted.
## Scope
The benchmark suite currently covers two categories:
- Radixor core operations
- English stemmer comparison on the same token workload
- Radixor core operations,
- English stemmer comparison on the same token workload.
The comparison benchmark processes the same deterministic English token stream through:
- Radixor with bundled `US_UK_PROFI`
- Snowball original Porter
- Snowball English, commonly referred to as Porter2
- Radixor with bundled `US_UK_PROFI`,
- Snowball original Porter,
- Snowball English, commonly referred to as Porter2.
The purpose of the comparison is throughput measurement on identical input. It is not intended to prove linguistic equivalence between the compared stemmers.
The purpose of the comparison is throughput measurement on identical input. It is not intended to demonstrate linguistic equivalence between the compared stemmers.
## How to read the published numbers
Two kinds of benchmark numbers are relevant in the project.
### Reference measurements
The detailed benchmark snapshot documented on this page comes from a controlled run on a Ryzen 5 system. Those numbers are the best reference point for understanding absolute throughput under a known local benchmark environment.
### Published badge figures
The benchmark badge metadata published through GitHub Pages is generated in the GitHub-hosted container environment. That environment is convenient for continuous publication, but it is not the right place to treat absolute throughput values as stable across time. CPU scheduling, shared-host variability, and container-level noise can materially affect raw numbers from run to run.
For that reason, the published badge values should be treated primarily as a compact status surface. They are useful for observing broad trends and relative positioning, but not as the authoritative source for precise absolute throughput claims.
## Current snapshot
@@ -30,12 +42,25 @@ A recent JMH run on JDK 21.0.10 with JMH 1.37, one thread, three warmup iteratio
| About 12,000 generated tokens | 30.99 M tokens/s | 8.21 M tokens/s | 5.46 M tokens/s |
| About 60,000 generated tokens | 32.25 M tokens/s | 8.02 M tokens/s | 5.11 M tokens/s |
On that workload, Radixor is approximately:
On that workload, Radixor measured approximately:
- 4 times faster than Snowball original Porter
- 6 times faster than Snowball English
- 4 times the throughput of Snowball original Porter,
- 6 times the throughput of Snowball English.
These values are workload- and environment-dependent. Treat them as measured results for the documented benchmark setup, not as universal constants.
These values are workload-dependent and environment-dependent. They should be read as measured results for the documented setup, not as universal constants.
## Interpreting the relative result
Although the absolute numbers can move across environments, the throughput relationship between Radixor and the compared Porter-family stemmers has remained broadly stable in practical measurements. In particular, the comparison against Snowball original Porter is consistently in the rough range of about four to one in Radixor's favor.
That relative behavior is more informative than any single absolute figure. It reflects a real architectural difference rather than a cosmetic benchmark artifact.
Radixor is built around a compiled patch-command trie that resolves the result through a direct lookup and patch application path. In contrast, classic rule-based stemmers such as the Porter family follow a different operational model. The result is that Radixor combines two properties that do not often appear together:
- dictionary-driven compiled lookup performance,
- the ability to generalize beyond explicitly listed word forms instead of behaving like a pure closed-form dictionary lookup table.
Within that design space, the measured throughput profile is strong enough to place Radixor among the fastest known practical implementations of this kind, while still supporting stemming of previously unseen forms. That should still be read as a carefully bounded engineering statement, not as an absolute claim over every possible stemmer architecture or benchmark scenario.
## Benchmark classes
@@ -43,9 +68,9 @@ The main benchmark classes are under `src/jmh/java/org/egothor/stemmer/benchmark
Relevant classes include:
- `FrequencyTrieLookupBenchmark`
- `FrequencyTrieCompilationBenchmark`
- `EnglishStemmerComparisonBenchmark`
- `FrequencyTrieLookupBenchmark`,
- `FrequencyTrieCompilationBenchmark`,
- `EnglishStemmerComparisonBenchmark`.
The English comparison benchmark uses the bundled Radixor English resource and the official Snowball Java distribution integrated into the JMH source set.
@@ -55,10 +80,10 @@ The English comparison benchmark uses a deterministic generated corpus rather th
The workload intentionally mixes:
- simple inflections
- common derivational forms
- US and UK spelling families
- lexical forms appropriate for `US_UK_PROFI`
- simple inflections,
- common derivational forms,
- US and UK spelling families,
- lexical forms appropriate for `US_UK_PROFI`.
This design keeps runs reproducible across environments and avoids accidental drift caused by changing external corpora.
@@ -80,42 +105,42 @@ Run only the English comparison benchmark:
JMH reports are written to:
- `build/reports/jmh/jmh-results.txt`
- `build/reports/jmh/jmh-results.csv`
- `build/reports/jmh/jmh-results.txt`,
- `build/reports/jmh/jmh-results.csv`.
The text report is convenient for human review. The CSV report is more useful for CI archiving, historical tracking, and external processing.
## Interpreting results
## Interpreting results responsibly
Benchmark numbers should be read with care.
Benchmark numbers should always be read with care.
Important factors include:
- CPU model and frequency behavior
- thermal throttling
- JVM vendor and version
- system background load
- operating-system scheduling noise
- benchmark parameter changes
- CPU model and frequency behavior,
- thermal throttling,
- JVM vendor and version,
- system background load,
- operating-system scheduling noise,
- benchmark parameter changes.
For meaningful comparison, keep these stable:
- hardware or VM class
- JDK version
- benchmark parameters
- thread count
- benchmark source revision
- hardware or VM class,
- JDK version,
- benchmark parameters,
- thread count,
- benchmark source revision.
If a regression is suspected, repeat the run and compare against the previous CSV output rather than relying on a single measurement.
If a regression is suspected, repeat the run and compare against previous CSV output rather than relying on a single measurement.
## Regression tracking
The recommended regression workflow is:
1. archive `jmh-results.csv`
2. compare the same benchmark names across runs
3. compare only like-for-like environments
4. investigate sustained regressions rather than one-off noise
1. archive `jmh-results.csv`,
2. compare the same benchmark names across runs,
3. compare only like-for-like environments,
4. investigate sustained regressions rather than one-off noise.
For public reporting, the README should keep only the condensed benchmark summary, while detailed benchmark methodology and interpretation should remain in this document.
@@ -127,8 +152,8 @@ Radixor uses a compiled patch-command trie driven by dictionary data. Snowball P
Because of that, the comparison should be understood as:
- equal input workload
- different stemming strategies
- measured throughput, not semantic identity
- equal input workload,
- different stemming strategies,
- measured throughput rather than semantic identity.
That distinction matters whenever performance claims are discussed in documentation or release notes.
That distinction matters whenever performance claims are discussed in documentation, release notes, or badge summaries.

View File

@@ -1,17 +1,8 @@
# Built-in Languages
> ← Back to [README.md](../README.md)
Radixor provides a set of **bundled stemmer dictionaries** that can be loaded directly without preparing custom data.
These built-in resources are useful for:
- quick integration
- testing and evaluation
- reference behavior
- prototyping search pipelines
Radixor provides a set of bundled stemmer dictionaries that can be loaded directly without preparing custom lexical data first.
These resources are intended as practical default dictionaries for common use. They provide a solid starting point for evaluation, integration, and general-purpose stemming workloads, while still fitting naturally into workflows where the bundled baseline is later refined, extended, or replaced by a custom dictionary.
## Overview
@@ -21,34 +12,30 @@ Bundled dictionaries are exposed through:
StemmerPatchTrieLoader.Language
```
They are packaged with the library and loaded from the classpath.
They are packaged with the library as text resources and compiled into a `FrequencyTrie<String>` when loaded.
## Supported languages
The following language identifiers are currently available:
The following bundled language identifiers are currently available:
| Language | Enum constant | Description |
|----------|------------------|------------------------------|
| Danish | `DA_DK` | Danish |
| German | `DE_DE` | German |
| Spanish | `ES_ES` | Spanish |
| French | `FR_FR` | French |
| Italian | `IT_IT` | Italian |
| Dutch | `NL_NL` | Dutch |
| Norwegian| `NO_NO` | Norwegian |
| Portuguese| `PT_PT` | Portuguese |
| Russian | `RU_RU` | Russian |
| Swedish | `SV_SE` | Swedish |
| English | `US_UK` | Standard English |
| Language | Enum constant | Notes |
|---|---|---|
| Danish | `DA_DK` | Bundled general-purpose dictionary |
| German | `DE_DE` | Bundled general-purpose dictionary |
| Spanish | `ES_ES` | Bundled general-purpose dictionary |
| French | `FR_FR` | Bundled general-purpose dictionary |
| Italian | `IT_IT` | Bundled general-purpose dictionary |
| Dutch | `NL_NL` | Bundled general-purpose dictionary |
| Norwegian | `NO_NO` | Bundled general-purpose dictionary |
| Portuguese | `PT_PT` | Bundled general-purpose dictionary |
| Russian | `RU_RU` | Currently supplied in normalized transliterated form |
| Swedish | `SV_SE` | Bundled general-purpose dictionary |
| English | `US_UK` | Standard English dictionary |
| English | `US_UK_PROFI` | Extended English dictionary |
## Basic usage
Load a bundled stemmer:
Load a bundled stemmer like this:
```java
import java.io.IOException;
@@ -59,194 +46,177 @@ import org.egothor.stemmer.StemmerPatchTrieLoader;
public final class BuiltInExample {
public static void main(String[] args) throws IOException {
FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(
private BuiltInExample() {
throw new AssertionError("No instances.");
}
public static void main(final String[] arguments) throws IOException {
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(
StemmerPatchTrieLoader.Language.US_UK_PROFI,
true,
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS
);
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
}
}
```
The loader reads the bundled dictionary resource, parses the textual entries, derives patch-command mappings, and compiles the result into a read-only trie.
## Example: stemming with `US_UK_PROFI`
```java
import java.io.IOException;
import org.egothor.stemmer.*;
import org.egothor.stemmer.FrequencyTrie;
import org.egothor.stemmer.PatchCommandEncoder;
import org.egothor.stemmer.ReductionMode;
import org.egothor.stemmer.StemmerPatchTrieLoader;
public final class EnglishExample {
public static void main(String[] args) throws IOException {
FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(
private EnglishExample() {
throw new AssertionError("No instances.");
}
public static void main(final String[] arguments) throws IOException {
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(
StemmerPatchTrieLoader.Language.US_UK_PROFI,
true,
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS
);
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
String word = "running";
String patch = trie.get(word);
String stem = PatchCommandEncoder.apply(word, patch);
final String word = "running";
final String patch = trie.get(word);
final String stem = PatchCommandEncoder.apply(word, patch);
System.out.println(word + " -> " + stem);
}
}
```
## `US_UK` and `US_UK_PROFI`
## `US_UK` vs `US_UK_PROFI`
Radixor currently provides two bundled English variants.
### `US_UK`
* smaller dictionary
* faster load time
* suitable for lightweight use cases
`US_UK` is the lighter-weight bundled English resource. It is suitable where a smaller default dictionary is preferred and maximal lexical coverage is not the primary goal.
### `US_UK_PROFI`
* larger and more complete dataset
* better coverage of word forms
* improved stemming quality
* slightly larger memory footprint
`US_UK_PROFI` is the more extensive bundled English resource. It offers broader lexical coverage and is the better default for most applications that want stronger out-of-the-box behavior.
### Recommendation
Use:
For most English-language deployments, prefer:
```
```text
US_UK_PROFI
```
for most applications unless memory constraints are strict.
Use `US_UK` when a smaller bundled baseline is more appropriate.
## Intended role of bundled dictionaries
Bundled dictionaries should be understood as **general-purpose default resources**.
## How bundled dictionaries are loaded
They are a good fit when:
Internally:
- a supported language is already available,
- immediate usability matters,
- a reasonable baseline is sufficient,
- the goal is evaluation, prototyping, or straightforward integration.
- dictionaries are stored as text resources
- parsed using `StemmerDictionaryParser`
- compiled into a trie at load time
They are also well suited to staged refinement workflows in which the bundled base is loaded first, then extended with domain-specific vocabulary, and finally persisted as a custom binary artifact.
This means:
## Character representation
- first load includes parsing + compilation cost
- subsequent usage is fast
The current bundled resources follow a pragmatic normalization convention.
At present, bundled dictionaries are supplied in normalized plain-ASCII form. For some languages, this is simply a lightweight maintenance convention. For others, especially languages commonly written in another script, it reflects a transliterated lexical resource. Russian is the clearest example in the current bundled set.
This convention belongs to the supplied dictionary resources, not to the core stemming model. The parser reads UTF-8 text, the dictionary model works with ordinary Java strings, and the trie and patch-command mechanism operate on general character sequences. In practical terms, the architecture is compatible with native-script dictionaries when suitable lexical resources are available.
## When to use bundled languages
## When to prefer custom dictionaries
Bundled dictionaries are suitable when:
A custom dictionary is usually the better choice when:
- you need quick results without preparing custom data
- you are prototyping or experimenting
- your language requirements match the provided datasets
## When to use custom dictionaries
You should prefer custom dictionaries when:
- domain-specific vocabulary is important
- accuracy requirements are high
- you need full control over stemming behavior
Typical examples:
- technical terminology
- product catalogs
- biomedical text
- legal or financial language
- domain-specific vocabulary materially affects stemming quality,
- lexical coverage must be controlled more precisely,
- a stronger language resource is available than the bundled baseline,
- native-script support is needed beyond the currently bundled resources.
Typical examples include:
- technical terminology,
- biomedical language,
- legal or financial vocabulary,
- organization-specific product and process names,
- language resources maintained in native scripts.
## Production recommendation
For production systems:
For production systems, the most robust workflow is usually:
1. Load a bundled dictionary
2. Extend it with domain-specific terms (optional)
3. Compile it into a binary `.radixor.gz` file
4. Deploy the compiled artifact
5. Load it using `loadBinary(...)`
1. start from a bundled dictionary when it is suitable,
2. extend it with domain-specific forms if needed,
3. compile or rebuild it into a binary `.radixor.gz` artifact,
4. deploy that compiled artifact,
5. load it at runtime using `loadBinary(...)`.
This avoids:
This avoids repeated startup parsing and makes the deployed stemming behavior explicit and versionable.
- runtime parsing overhead
- repeated compilation
- startup latency
## Example workflow
## Example refinement workflow
```java
// 1. Load bundled dictionary
FrequencyTrie<String> base = StemmerPatchTrieLoader.load(
import java.io.IOException;
import java.nio.file.Path;
import org.egothor.stemmer.FrequencyTrie;
import org.egothor.stemmer.FrequencyTrieBuilders;
import org.egothor.stemmer.ReductionMode;
import org.egothor.stemmer.ReductionSettings;
import org.egothor.stemmer.StemmerPatchTrieBinaryIO;
import org.egothor.stemmer.StemmerPatchTrieLoader;
public final class BundledRefinementExample {
private BundledRefinementExample() {
throw new AssertionError("No instances.");
}
public static void main(final String[] arguments) throws IOException {
final FrequencyTrie<String> base = StemmerPatchTrieLoader.load(
StemmerPatchTrieLoader.Language.US_UK_PROFI,
true,
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS
);
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
// 2. Modify (optional)
FrequencyTrie.Builder<String> builder =
FrequencyTrieBuilders.copyOf(
final FrequencyTrie.Builder<String> builder = FrequencyTrieBuilders.copyOf(
base,
String[]::new,
ReductionSettings.withDefaults(
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS
)
);
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS));
builder.put("microservices", PatchCommandEncoder.NOOP_PATCH);
builder.put("microservices", "Na");
// 3. Compile
FrequencyTrie<String> compiled = builder.build();
final FrequencyTrie<String> compiled = builder.build();
// 4. Save
StemmerPatchTrieBinaryIO.write(compiled, Path.of("english-custom.radixor.gz"));
StemmerPatchTrieBinaryIO.write(compiled, Path.of("english-custom.radixor.gz"));
}
}
```
## Extending language support
The built-in set is intentionally a practical baseline rather than a closed catalog. High-quality dictionaries for additional languages, improved language coverage, and stronger native-script resources are all natural extension paths for the project.
## Limitations
* bundled dictionaries are **general-purpose**
* they may not reflect:
* domain-specific usage
* rare or specialized vocabulary
* organization-specific terminology
What matters most is not only the number of entries, but the quality, consistency, and operational usefulness of the lexical resource being added.
## Next steps
* [Quick start](quick-start.md)
* [Dictionary format](dictionary-format.md)
* [CLI compilation](cli-compilation.md)
* [Programmatic usage](programmatic-usage.md)
- [Quick start](quick-start.md)
- [Dictionary format](dictionary-format.md)
- [CLI compilation](cli-compilation.md)
- [Programmatic usage](programmatic-usage.md)
## Summary
Radixor's built-in language support provides:
* immediate usability
* reference datasets
* a starting point for customization
For production systems, they are best used as:
* a baseline
* a seed for further extension
* a source for compiled deployment artifacts
Radixor's built-in language support provides immediate usability, practical default dictionaries, and a strong starting point for custom refinement. The current bundled resources follow a pragmatic normalization convention, while the underlying architecture remains well suited to richer language resources and future extensions.

View File

@@ -1,29 +1,21 @@
# CLI Compilation
> ← Back to [README.md](../README.md)
Radixor provides a command-line compiler for turning line-oriented dictionary files into compact binary stemmer artifacts.
Radixor provides a command-line tool for compiling dictionary files into compact, production-ready binary stemmer tables.
This is the preferred preparation workflow when stemming should run against an already compiled artifact rather than against raw dictionary input. The CLI reads the dictionary, derives patch commands, builds a mutable trie, applies the selected subtree reduction strategy, and writes the final compiled trie in the project binary format under GZip compression. The result is a deployment-ready `.radixor.gz` file that can be loaded directly by application code.
This is the recommended workflow for deployment environments, as it separates:
## What the CLI does
- dictionary preparation (offline)
- stemming execution (runtime)
## Overview
The `Compile` tool:
1. reads a line-oriented dictionary file
2. converts word–stem pairs into patch commands
3. builds a trie structure
4. applies subtree reduction
5. writes a compressed binary artifact
The output is a `.radixor.gz` file suitable for fast runtime loading.
The `Compile` tool performs the following steps:
1. reads the input dictionary in the standard Radixor stemmer format,
2. parses each line into a canonical stem and its known variants,
3. converts variants into patch commands,
4. builds a mutable trie of patch-command values,
5. applies the configured reduction mode,
6. writes the compiled trie as a GZip-compressed binary artifact.
This workflow is intentionally aligned with the same dictionary semantics used elsewhere in the library. Remarks introduced by `#` or `//` are supported through the shared dictionary parser.
## Basic usage
@@ -36,159 +28,173 @@ java org.egothor.stemmer.Compile \
--overwrite
```
## Supported arguments
The CLI supports the following arguments:
## Required arguments
```text
--input <file>
--output <file>
--reduction-mode <mode>
[--store-original]
[--dominant-winner-min-percent <1..100>]
[--dominant-winner-over-second-ratio <1..n>]
[--overwrite]
[--help]
```
### `--input`
### `--input <file>`
Path to the source dictionary file.
* must be in the [dictionary format](dictionary-format.md)
* must be readable
* UTF-8 encoding is expected
```
--input ./data/stemmer.txt
```
### `--output`
Path to the output binary file.
* parent directories are created automatically
* output is written as **GZip-compressed binary**
```
--output ./build/english.radixor.gz
```
## Optional arguments
### `--reduction-mode`
Controls how aggressively the trie is reduced during compilation.
Available values:
* `MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS`
* `MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS`
* `MERGE_SUBTREES_WITH_EQUIVALENT_DOMINANT_GET_RESULTS`
The file must use the standard line-oriented dictionary format. Each non-empty logical line starts with the canonical stem and may contain zero or more variants. The parser expects UTF-8 input, lowercases it using `Locale.ROOT`, and ignores trailing remarks introduced by `#` or `//`.
Example:
```text
--input ./data/stemmer.txt
```
### `--output <file>`
Path to the output binary artifact.
The output file is written as a GZip-compressed binary trie. Parent directories are created automatically when needed.
Example:
```text
--output ./build/english.radixor.gz
```
### `--reduction-mode <mode>`
Selects the subtree reduction strategy used during compilation.
Supported values are:
- `MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS`
- `MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS`
- `MERGE_SUBTREES_WITH_EQUIVALENT_DOMINANT_GET_RESULTS`
Example:
```text
--reduction-mode MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS
```
#### Recommendation
Use:
```
MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS
```
This provides:
* safe behavior
* deterministic ordering
* good compression
This argument is required.
### `--store-original`
Stores the stem itself as a no-op mapping.
When this flag is present, the canonical stem itself is inserted using the no-op patch command.
```
```text
--store-original
```
Effect:
This is usually a sensible default for real dictionaries because it ensures that canonical forms are directly representable in the compiled trie rather than relying only on their variants.
* ensures that canonical forms are always resolvable
* improves robustness in real-world inputs
### `--dominant-winner-min-percent <1..100>`
Recommended for most use cases.
Sets the minimum winner percentage used by dominant-result reduction settings.
Example:
```text
--dominant-winner-min-percent 75
```
This option matters primarily when `--reduction-mode` is `MERGE_SUBTREES_WITH_EQUIVALENT_DOMINANT_GET_RESULTS`. The default value is `75`.
### `--dominant-winner-over-second-ratio <1..n>`
Sets the minimum winner-over-second ratio used by dominant-result reduction settings.
Example:
```text
--dominant-winner-over-second-ratio 3
```
This option also matters primarily for `MERGE_SUBTREES_WITH_EQUIVALENT_DOMINANT_GET_RESULTS`. The default value is `3`.
### `--overwrite`
Allows overwriting an existing output file.
Allows the CLI to replace an already existing output file.
```
```text
--overwrite
```
Without this flag:
Without this flag, compilation fails when the output path already exists.
* compilation fails if the output file already exists
### `--help`
Prints usage help and exits successfully.
```text
--help
```
## Reduction strategy explained
The short form `-h` is also supported.
Reduction merges semantically equivalent subtrees to reduce memory and file size.
## Reduction modes in practice
Trade-offs:
Reduction mode is not only a storage decision. It also influences what semantics are preserved when the mutable trie is compiled into its canonical read-only form.
| Mode | Compression | Behavioral fidelity |
| --------- | ----------- | ------------------- |
| Ranked | Medium | High |
| Unordered | High | Medium |
| Dominant | Highest | Lower (heuristic) |
### Ranked `getAll()` equivalence
### Ranked (recommended)
`MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS` merges subtrees whose `getAll()` results remain equivalent for every reachable key suffix and whose local result ordering is the same.
* preserves full `getAll()` ordering
* safest and most predictable
This is the best general-purpose choice when result ordering and ambiguity handling matter. It preserves ranked multi-result semantics while still achieving useful structural reduction.
### Unordered
This is the recommended default for most users.
* ignores ordering differences
* higher compression, but less precise semantics
### Unordered `getAll()` equivalence
### Dominant
`MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS` also uses `getAll()`-level equivalence, but it ignores local ordering differences in addition to absolute frequencies.
* focuses on the most frequent result
* useful when only `get()` is relevant
* may lose secondary candidates
This can yield stronger reduction, but it also weakens the precision of ordered multi-result semantics.
Choose this mode only when the application does not depend on the ordering of alternative results.
### Dominant `get()` equivalence
## Output format
`MERGE_SUBTREES_WITH_EQUIVALENT_DOMINANT_GET_RESULTS` focuses on preserving preferred-result semantics for `get()`, subject to dominance thresholds.
The compiled file:
If a node does not satisfy the configured dominance constraints, compilation falls back to ranked `getAll()` semantics for that node to avoid unsafe over-reduction.
* is a binary representation of the trie
* uses **GZip compression**
* is optimized for:
This mode is most suitable when the application primarily consumes the preferred result and does not rely on preserving richer ambiguity information.
* fast loading
* minimal memory footprint
## Recommended usage patterns
Typical properties:
### Use offline preparation
* small file size
* fast deserialization
* no runtime preprocessing required
The CLI is best used as a preparation step during packaging, deployment, or controlled artifact generation. This keeps compilation outside the runtime startup path and allows services to load only the finished binary trie.
### Treat compiled files as versioned assets
A `.radixor.gz` file should be handled as a versioned output artifact. It represents a specific dictionary state, a specific reduction mode, and, where relevant, specific dominant-result thresholds.
### Choose reduction mode deliberately
The ranked `getAll()` mode is the safest default. The unordered and dominant modes should be chosen only when their trade-offs are acceptable for the consuming application.
### Expect memory pressure during preparation, not runtime
Compilation is usually a one-time step and is generally fast. The more important operational consideration is memory usage during preparation, because the dictionary-derived mutable structure exists before reduction compacts it into the final read-only trie. This is especially relevant for very large source dictionaries.
## Example workflow
### 1. Prepare dictionary
### 1. Prepare a dictionary
```
```text
run running runs ran
connect connected connecting
```
### 2. Compile
### 2. Compile it
```bash
java org.egothor.stemmer.Compile \
@@ -198,108 +204,45 @@ java org.egothor.stemmer.Compile \
--store-original
```
### 3. Use in application
### 3. Load it in an application
```java
FrequencyTrie<String> trie =
import org.egothor.stemmer.FrequencyTrie;
import org.egothor.stemmer.StemmerPatchTrieLoader;
final FrequencyTrie<String> trie =
StemmerPatchTrieLoader.loadBinary("english.radixor.gz");
```
## Exit codes and error handling
The CLI uses three exit outcomes:
## Error handling
- `0` for success,
- `1` for processing failures such as I/O or compilation errors,
- `2` for invalid command-line usage.
The CLI reports:
When argument parsing fails, the CLI prints the error message, prints the usage summary, and exits with usage error status.
* missing input file
* invalid arguments
* I/O failures
* parsing errors
When compilation fails during processing, the CLI prints a `Compilation failed: ...` message to standard error and exits with processing error status.
Typical exit codes:
Examples of failure conditions include:
* `0` success
* non-zero failure
Error details are printed to standard error.
## Performance considerations
### Compilation
* typically CPU-bound
* depends on dictionary size and reduction mode
### Output size
* depends on:
* dictionary completeness
* reduction strategy
* can vary significantly between modes
### Runtime impact
* compiled tries are optimized for:
* fast lookup
* low allocation
* predictable latency
## Best practices
### Use offline compilation
* compile dictionaries during build or deployment
* do not compile on application startup
### Version your artifacts
* treat `.radixor.gz` files as versioned assets
* store them alongside application releases
### Choose reduction mode deliberately
* use **ranked** for correctness
* use **dominant** only if you fully understand the trade-offs
### Keep dictionaries clean
* better input → better compiled output
* avoid noise and inconsistencies
## Integration tips
* store compiled files under `resources/` or a dedicated directory
* load them once and reuse the trie instance
* avoid repeated loading in frequently executed code paths (for example, per-request processing)
- missing required arguments,
- unknown arguments,
- invalid integer values for dominant thresholds,
- missing input files,
- unreadable input,
- existing output file without `--overwrite`,
- general I/O failures during reading or writing.
## Relation to programmatic usage
The CLI and the programmatic API implement the same conceptual preparation step. The CLI is the operationally convenient choice when you want a ready-made binary artifact. The programmatic API is the better fit when compilation must be integrated directly into custom Java workflows.
## Next steps
* [Dictionary format](dictionary-format.md)
* [Programmatic usage](programmatic-usage.md)
* [Quick start](quick-start.md)
## Summary
The `Compile` CLI is the bridge between:
* human-readable dictionary data
* optimized runtime stemmer tables
It enables a clean separation between:
* data preparation
* runtime execution
and is the preferred way to prepare Radixor for production use.
- [Dictionary format](dictionary-format.md)
- [Quick start](quick-start.md)
- [Programmatic usage](programmatic-usage.md)
- [Architecture and reduction](architecture-and-reduction.md)

View File

@@ -0,0 +1,185 @@
# Compatibility and Guarantees
This document explains what Radixor treats as stable public behavior, what should be regarded as internal implementation detail, and how to think about compatibility across versions.
Its purpose is to make adoption safer. Users should be able to understand which parts of the project are intended as supported API, which parts may evolve more freely, and which kinds of change are expected to remain compatible in future releases.
## Compatibility philosophy
Radixor is designed to be used as a real library, not only as a code drop. That means compatibility matters.
At the same time, the project distinguishes clearly between:
- **public API and behavior** that users are expected to build against,
- **internal implementation layers** that may change more freely when needed for correctness, performance, or maintainability.
The practical goal is straightforward:
- keep the main user-facing API in `org.egothor.stemmer` stable and supportable,
- allow more freedom of evolution in internal trie-focused implementation layers,
- extend the project conservatively without creating unnecessary behavioral ambiguity.
## Public API posture
As a general rule, the `org.egothor.stemmer` package should be treated as the primary supported API surface.
That includes the main user-facing types involved in:
- dictionary loading,
- binary loading and persistence,
- patch-command application,
- compiled trie querying,
- reconstruction workflows,
- reduction configuration,
- CLI use.
This API is expected to remain supportable across future versions. The preferred compatibility model is additive evolution: improving documentation, clarifying behavior, and adding capabilities without unnecessary disruption of existing usage patterns.
Examples of likely additive evolution include:
- additional bundled language resources,
- fuller support for diacritics or native-script language resources,
- expanded documentation and operational tooling,
- new convenience methods that do not break existing code.
## Internal API posture
The `org.egothor.stemmer.trie` package should be treated as internal or at least significantly less stable implementation API.
It represents the structural machinery behind mutable nodes, reduced nodes, compiled nodes, reduction context, signatures, and related internal compilation details. These types may evolve more aggressively when needed to improve implementation quality, correctness, reduction behavior, internal representations, or performance characteristics.
Users should therefore avoid building long-term integrations against `org.egothor.stemmer.trie` unless they are intentionally accepting that tighter coupling.
In practical terms:
- `org.egothor.stemmer` is the supported integration layer,
- `org.egothor.stemmer.trie` is the implementation layer.
## Behavioral guarantees
Several project properties are intended as core behavioral guarantees.
### Deterministic dictionary loading and compilation
Given the same textual dictionary input and the same reduction settings, Radixor is intended to produce the same compiled stemming semantics in a reproducible way.
This includes deterministic local result ordering and deterministic observable lookup behavior.
### Stable meaning of `get()` and `getAll()`
The distinction between preferred-result lookup and multi-result lookup is part of the supported behavior model.
- `get()` returns the locally preferred stored value,
- `getAll()` returns all locally stored values in deterministic ranked order,
- `getEntries()` returns aligned values with counts.
That model is part of how the public API should be understood.
### Stable reduction-mode intent
Each public `ReductionMode` constant carries a semantic contract that should remain meaningful across versions.
In other words, the implementation may evolve, but the intended meaning of modes such as ranked `getAll()` equivalence, unordered `getAll()` equivalence, and dominant `get()` equivalence should not drift casually.
### Stable binary artifact purpose
Compiled `.radixor.gz` artifacts are a first-class project output. Loading and persisting compiled stemmer artifacts is part of the intended usage model, not an incidental implementation side effect.
## What is allowed to evolve
Compatibility does not mean the project is frozen.
The following kinds of change are generally compatible with the project's direction:
- improved internal data structures,
- changes inside `org.egothor.stemmer.trie`,
- expanded bundled dictionaries,
- additional supported languages,
- improved native-script handling,
- better benchmarks, tests, and reports,
- additive public API growth that does not invalidate existing usage.
The project should be able to improve substantially while keeping the main user-facing integration model intact.
## What may change more cautiously
Some areas should be treated as stable in intent but still approached carefully when changed.
### Bundled dictionary contents
Bundled resources are versioned project data, not immutable language standards. Their contents may improve over time.
That means stemming outcomes can legitimately change when bundled dictionaries are refined or expanded. Such changes are compatible with the project's direction, but they should still be understood as behavior changes at the lexical-resource level.
### Binary format evolution
Compiled binary artifacts are an intended project output, but binary-format evolution may still be needed in future versions.
If the format changes, that should be handled deliberately and documented clearly. Users should not assume that every historical persisted artifact will remain readable forever without versioning considerations. What should remain stable is the project's support for compiled artifact workflows, not necessarily perpetual cross-version binary interchange without explicit format evolution rules.
### Performance characteristics
Radixor places strong emphasis on performance, but no benchmark number should be treated as a formal compatibility guarantee.
What is more meaningful than any single raw number is the architectural performance posture: the library is intended to remain a compact compiled stemmer with very strong runtime throughput characteristics.
## What users should rely on
Long-term users should rely primarily on the following:
- the main integration path in `org.egothor.stemmer`,
- the documented meaning of `get()`, `getAll()`, and reduction modes,
- the offline-compilation plus runtime-loading workflow,
- the availability of compiled artifact support,
- the project's preference for deterministic and auditable behavior.
These are the parts of the project that are intended to remain the most stable and supportable.
## What users should not rely on casually
Users should avoid depending on:
- internal trie package details,
- undocumented internal classes or intermediate representations,
- incidental internal ordering outside documented lookup semantics,
- assumptions that bundled dictionary contents will never evolve,
- assumptions that internal binary-format details are frozen forever.
If a behavior is important to your integration, it should ideally be documented at the public API or project-documentation level rather than inferred from internal implementation details.
## Source compatibility and behavioral compatibility
It is useful to distinguish two different notions of compatibility.
### Source compatibility
Whether existing Java code using the supported public API still compiles and integrates cleanly after an upgrade.
### Behavioral compatibility
Whether the upgraded system still behaves the same way for the same dictionary data, compiled artifacts, and runtime calls.
Radixor aims to preserve both where reasonably possible, but behavioral compatibility can still be influenced by intentional improvements such as dictionary refinement or bug fixes. For that reason, upgrades should be evaluated not only as code upgrades but also as stemming-behavior upgrades.
## Recommended upgrade discipline
When upgrading Radixor in a production environment, it is good practice to:
1. review release notes and documentation changes,
2. rebuild compiled artifacts if the upgrade affects dictionary or artifact handling,
3. rerun representative stemming validation tests,
4. compare benchmark outputs where performance matters,
5. inspect whether bundled-dictionary changes affect expected canonical results.
This is especially important for deployments that treat stemming behavior as part of search relevance or normalization policy.
## Summary
Radixor's compatibility model is intentionally layered.
- `org.egothor.stemmer` should be treated as the supported public integration API,
- `org.egothor.stemmer.trie` should be treated as an internal implementation layer,
- deterministic public behavior and compiled-artifact workflows are core project commitments,
- internal structure and lexical-resource quality can continue to evolve.
This model gives the project room to improve while still providing a reliable surface for long-term use.

View File

@@ -0,0 +1,226 @@
# Contributing Dictionaries
High-quality dictionaries are one of the most valuable ways to improve **Radixor**.
The project already includes practical bundled dictionaries for common use, but the long-term quality and language reach of the stemmer depend heavily on the quality of its lexical resources. Contributions are therefore welcome not only in the form of code changes, but also in the form of well-prepared dictionary data for existing or additional languages.
This document explains what makes a dictionary contribution useful, how to structure it, and how to prepare it so that it integrates cleanly with the project.
## What a good dictionary contribution looks like
A good dictionary contribution is not defined only by the number of entries.
The most useful contributions are dictionaries that are:
- linguistically consistent,
- operationally clean,
- easy to review,
- easy to reproduce,
- appropriate for actual stemming use rather than raw lexical accumulation.
In practice, dictionary quality matters more than dictionary size. A smaller but coherent and carefully normalized dictionary is often more valuable than a larger resource that mixes conventions, contains noisy forms, or introduces accidental ambiguity.
## Preferred dictionary shape
Radixor uses a simple line-oriented format:
```text
<stem> <variant1> <variant2> <variant3> ...
```
The first token on a line is the canonical stem. All following tokens on that line are known variants that should reduce to that stem.
Example:
```text
run running runs ran
connect connected connecting connection
```
The parser:
- reads UTF-8 text,
- normalizes input to lower case using `Locale.ROOT`,
- ignores empty lines,
- supports remarks introduced by `#` or `//`.
For full format details, see [Dictionary format](dictionary-format.md).
## Contribution priorities
The most useful dictionary contributions generally fall into one of four categories.
### 1. Stronger dictionaries for already bundled languages
Improving lexical quality for already supported languages is often more valuable than merely expanding the language list. Better coverage, cleaner canonicalization, and improved consistency directly improve practical stemming outcomes.
### 2. Additional languages
New language support is welcome when the submitted resource is strong enough to be useful as a maintainable bundled baseline rather than as an incomplete demonstration artifact.
### 3. Native-script language resources
The current bundled resources follow a pragmatic normalization convention and may use transliterated or otherwise normalized forms. This is especially visible for languages such as Russian.
That convention belongs to the supplied dictionaries, not to the underlying algorithm. The parser, trie, and patch-command model are not fundamentally restricted to plain ASCII. Contributions of high-quality native-script dictionaries in full UTF-8 text are therefore particularly valuable, because they would enable more direct language support without transliteration-based workflows.
### 4. Domain-quality refinements
Some contributions may be more appropriate as curated domain extensions than as replacements for a general-purpose bundled dictionary. These are still useful when they are clearly scoped and operationally coherent.
## Normalization guidance
A dictionary should follow one normalization convention consistently.
For current general-purpose bundled resources, the safest convention remains normalized plain-ASCII lexical input where that is already the established project style. For languages where a stronger native-script resource exists, a coherent UTF-8 dictionary may be preferable, provided that the contribution is deliberate, well-structured, and consistently normalized.
The important point is not to mix incompatible conventions casually.
Avoid contributions that combine, without clear design intent:
- native-script and transliterated forms,
- multiple incompatible stem conventions,
- inconsistent use of diacritics,
- ad hoc spelling normalization,
- noisy typo-like forms presented as ordinary lexical variants.
## Choosing canonical stems
A dictionary line should reflect a stable canonical target form.
That means:
- choose one canonical representation and use it consistently,
- avoid mixing alternative stem conventions without a clear lexical reason,
- keep variants grouped under the form that the project should actually return as the canonical result.
For example, the following is coherent:
```text
analyze analyzing analyzed analyzes
```
The following is less useful if the project has not intentionally chosen mixed conventions:
```text
analyse analyzing analyzed analyzes
```
The contribution should make the intended canonical policy easy to understand.
## Ambiguity handling
Ambiguity is allowed, but it should be intentional.
If the same surface form appears under multiple stems, the compiled trie may later expose multiple candidate patch commands. This can be correct and desirable when the lexical reality genuinely requires it. However, accidental ambiguity caused by inconsistent source preparation makes the resource harder to trust and harder to review.
Before contributing a dictionary, check whether repeated surface forms across lines are:
- linguistically intentional,
- consistent with the chosen canonical policy,
- useful for runtime stemming behavior.
## What to avoid
Dictionary contributions are much easier to review and accept when they avoid common quality problems.
Avoid:
- mechanically aggregated word lists without review,
- inconsistent canonical forms,
- mixed orthographic conventions without explanation,
- accidental duplicates caused by source merging,
- noisy or non-lexical tokens,
- comments or formatting that make the source hard to audit.
A dictionary should read like a curated lexical resource, not like an unfiltered export.
## Practical preparation workflow
A disciplined dictionary contribution should typically follow this path:
1. prepare or normalize the lexical source,
2. convert it into Radixor dictionary format,
3. review canonical stem choices,
4. check for accidental duplicates and unintended ambiguity,
5. compile the dictionary,
6. test representative lookups,
7. inspect `get()` and `getAll()` behavior for important edge cases,
8. include a concise explanation of source provenance and normalization choices.
## What to test before submitting
At minimum, a proposed dictionary should be checked for:
- successful parsing,
- successful compilation,
- expected stemming behavior on representative examples,
- acceptable ambiguity behavior,
- stable canonical policy,
- absence of obvious malformed lines or accidental source contamination.
For important resources, it is also useful to test:
- whether representative forms survive reduction as expected,
- whether dominant-result behavior remains sensible if alternate reduction modes are used,
- whether the resulting artifact has a practical size for the intended use case.
## Contribution notes that help maintainers
A dictionary contribution becomes much easier to review when it includes a short maintainer-facing note describing:
- the language or domain covered,
- the provenance of the lexical data,
- the normalization convention used,
- whether the dictionary is ASCII-normalized or native-script UTF-8,
- the intended canonical stem policy,
- any known limitations,
- why the contribution improves the project in practical terms.
This note does not need to be long. It simply needs to make the resource intelligible.
## Bundled-resource expectations
Not every useful dictionary must automatically become a bundled language resource.
To be suitable for bundling, a dictionary should generally be:
- broadly useful,
- maintainable,
- legally safe to include,
- coherent enough to serve as a project baseline,
- strong enough that users can rely on it as more than a demonstration resource.
Some dictionaries are better treated as examples, experiments, or domain-specific artifacts rather than as general built-in resources.
## Native scripts and future language support
One of the most meaningful future directions for the project is stronger support for languages in their native writing systems.
The architecture does not need to change fundamentally for that to happen. What matters is the availability of strong lexical resources and the willingness to define clear conventions for how those resources should be bundled and maintained.
Contributions in this area are therefore especially valuable when they are:
- internally consistent,
- encoded as proper UTF-8 text,
- accompanied by a clear explanation of normalization assumptions,
- strong enough to support practical use rather than only demonstration.
## Related documentation
- [Built-in languages](built-in-languages.md)
- [Dictionary format](dictionary-format.md)
- [CLI compilation](cli-compilation.md)
- [Programmatic usage](programmatic-usage.md)
## Summary
The best dictionary contributions improve Radixor not merely by adding more entries, but by improving the linguistic quality, consistency, and practical usefulness of the lexical resources the project can compile and ship.
A strong contribution is therefore one that is:
- coherent,
- reviewable,
- operationally clean,
- well explained,
- and valuable for real stemming workloads.

View File

@@ -1,255 +1,229 @@
# Dictionary Format
> ← Back to [README.md](../README.md)
Radixor uses a simple line-oriented dictionary format designed for practical stemming workflows.
Radixor uses a simple, line-oriented dictionary format to define mappings between **word forms** and their **canonical stems**.
Each logical line describes one canonical stem and zero or more known word variants that should reduce to that stem. The format is intentionally lightweight, easy to maintain in source control, and directly consumable both by the programmatic loader and by the CLI compiler.
This format is intentionally minimal, language-agnostic, and easy to generate from existing linguistic resources or corpora.
## Core structure
## Overview
Each non-empty logical line has the following shape:
Each logical line defines:
- one **canonical stem**
- zero or more **word variants** belonging to that stem
```
stem variant1 variant2 variant3 ...
```text
<stem> <variant1> <variant2> <variant3> ...
```
At compile time:
The first token is interpreted as the **canonical stem**. Every following token on the same line is interpreted as a **known variant** belonging to that stem.
- each variant is converted into a **patch command** transforming the variant into the stem
- the stem itself may optionally be stored as a **no-op mapping**
Example:
## Basic example
```
```text
run running runs ran
connect connected connecting connection
analyze analyzing analysed analyses
```
This defines:
In this example:
| Stem | Variants |
|----------|----------------------------------------|
| run | running, runs, ran |
| connect | connected, connecting, connection |
| analyze | analyzing, analysed, analyses |
- `run` is the canonical stem for `running`, `runs`, and `ran`,
- `connect` is the canonical stem for `connected`, `connecting`, and `connection`.
## Syntax rules
## How the loader interprets a line
### 1. Tokenization
When a dictionary is loaded through `StemmerPatchTrieLoader`, the loader processes each parsed line as follows:
- Tokens are separated by **whitespace**
- Multiple spaces and tabs are treated as a single separator
- Leading and trailing whitespace is ignored
1. the first token becomes the canonical stem,
2. every following token is treated as a variant,
3. each variant is converted into a patch command that transforms the variant into the stem,
4. if `storeOriginal` is enabled, the stem itself is also inserted using the canonical no-op patch command.
### 2. First token is the stem
This means the textual dictionary is not used directly at runtime. Instead, it is transformed into patch-command data and compiled into a reduced read-only trie.
- The **first token** on each line is always the canonical stem
- All following tokens are treated as variants of that stem
## Minimal valid lines
### 3. Case normalization
A line may consist of the stem only:
- All input is normalized to **lowercase using `Locale.ROOT`**
- Dictionaries should ideally already be lowercase to avoid ambiguity
```text
run
```
### 4. Empty lines
This is syntactically valid. It defines a stem entry with no explicit variants on that line.
- Empty lines are ignored
Whether such a line is operationally useful depends on how the dictionary is loaded:
### 5. Duplicate variants
- if `storeOriginal` is enabled, the stem itself is inserted as a no-op mapping,
- if `storeOriginal` is disabled, the line contributes no explicit variant mappings.
- Duplicate variants are allowed but have no additional effect
- Frequency is determined by occurrence across the entire dataset
## Whitespace rules
## Remarks (comments)
Tokens are separated by whitespace. Leading and trailing whitespace is ignored.
These lines are equivalent:
```text
run running runs ran
```
```text
run running runs ran
```
Tabs and repeated spaces are both accepted because tokenization is whitespace-based.
## Empty lines
Empty lines are ignored.
Example:
```text
run running runs ran
connect connected connecting
```
The blank line between entries has no effect.
## Remarks and comments
The parser supports both full-line and trailing remarks.
### Supported remark markers
Two remark markers are recognized:
- `#`
- `//`
### Examples
The earliest occurrence of either marker terminates the logical content of the line, and the remainder of that line is ignored.
```
Examples:
```text
run running runs ran # English verb forms
connect connected connecting // basic forms
connect connected connecting // Common derived forms
```
Everything after the first occurrence of a remark marker is ignored.
This is also valid:
### Important note
Remark markers are not escaped. If `#` or `//` appear in a token, they will terminate the line.
## Storing the original form
When compiling, you may enable:
```
--store-original
```text
# This line is ignored completely
// This line is also ignored completely
```
This causes the stem itself to be stored using a **no-op patch command**.
## Case normalization
Input lines are normalized to lower case using `Locale.ROOT` before being tokenized into dictionary entries.
That means dictionary authors should treat the format as **case-insensitive at load time**. If a file contains uppercase or mixed-case tokens, they will be normalized during parsing.
Example:
```
run running runs
```text
Run Running Runs Ran
```
With `--store-original`, this implicitly includes:
is processed the same way as:
```
run -> run
```text
run running runs ran
```
This is useful when:
## Character set and practical convention
- the input may already be normalized
- you want stable identity mappings
- you want to avoid missing entries for canonical forms
Dictionary files are read as UTF-8 text.
## Frequency and ordering
From the perspective of the parser and the stemming algorithm, the format is not restricted to plain ASCII tokens. The parser accepts ordinary Java `String` data, and the trie itself works with general character sequences rather than with an ASCII-only internal model. In principle, this means the system could process diacritic and non-diacritic forms alike, and it could also store forms with inconsistently used diacritics.
Radixor tracks **local frequencies** of values.
In practice, however, the format is currently best understood as **primarily intended for classical basic ASCII lexical input**, especially in the traditional stemming style where language data is normalized into plain characters in the ASCII range up to character code 127. This convention is particularly relevant for languages whose original orthography includes diacritics but whose stemming dictionaries are commonly maintained in normalized non-diacritic form.
Frequency is determined by:
Future versions may expand the documentation and operational guidance for dictionaries that intentionally preserve diacritics. At present, that workflow is not the primary documented use case, not because the algorithm fundamentally forbids it, but because a concrete project requirement for such support has not yet emerged.
- how many times a mapping appears during construction
- merging behavior during reduction
## Distinct stem and variant semantics
When multiple stems exist for a word:
The format expresses a one-line grouping of forms under a canonical stem. It does not encode linguistic metadata, part-of-speech information, weights, or explicit ambiguity markers.
- results are ordered by **descending frequency**
- ties are resolved deterministically:
1. shorter textual representation wins
2. lexicographically smaller value wins
3. earlier insertion order wins
For example:
This guarantees **stable and reproducible results**.
## Ambiguity and multiple stems
A word may legitimately map to more than one stem:
```
axes ax axe
```text
axis axes
axe axes
```
This allows Radixor to represent ambiguity explicitly.
These are simply two independent lines. If both contribute mappings for the same surface form, the compiled trie may later expose one or more candidate patch commands depending on the accumulated local counts and the selected reduction mode.
At runtime:
In other words, the dictionary format itself is deliberately simple. Richer behavior such as preferred-result ranking or multiple candidate results emerges during trie construction and reduction rather than through extra syntax in the dictionary file.
- `get(word)` returns the **preferred result**
- `getAll(word)` returns **all candidates**
## Duplicate forms and repeated entries
## Design guidelines
The format does not reserve any special syntax for duplicates. If the same mapping is inserted multiple times through repeated dictionary content, the builder accumulates local counts for the stored value at the addressed key.
### Keep stems consistent
This matters because compiled tries preserve local value frequencies and use them to determine preferred ordering for `get(...)`, `getAll(...)`, and `getEntries(...)`.
Use a single canonical form:
As a result, repeating the same mapping is not just redundant text. It can influence the ranking behavior of the compiled trie.
- `run` instead of mixing `run` / `running`
- `analyze` vs `analyse` — pick one convention
## Practical examples
### Avoid noise
### Simple English example
Do not include:
- typos
- extremely rare forms (unless required)
- inconsistent normalization
### Prefer completeness over clever rules
Radixor is data-driven:
- more complete dictionaries → better results
- no hidden rule system compensates for missing entries
### Handle domain-specific vocabulary
You can extend dictionaries with:
- product names
- technical terms
- organization-specific terminology
## Example: minimal dictionary
```
go goes going went
be is are was were being
have has having had
```text
run running runs ran
connect connected connecting connection
build building builds built
```
## Example: domain-specific extension
### Dictionary with remarks
```
microservice microservices
container containers containerized
kubernetes kubernetes
```text
run running runs ran # canonical verb family
connect connected connecting // derived forms
build building builds built
```
## Common pitfalls
### Stem-only entries
### Mixing cases
```
Run running Runs ❌
```text
run
connect connected connecting
build
```
→ normalized to lowercase, but inconsistent input is error-prone
### Mixed case input
### Multiple stems on one line
```
run running connect ❌
```text
Run Running Runs Ran
CONNECT Connected Connecting
```
`connect` becomes a variant of `run`, which is incorrect
This is accepted, but it is normalized to lower case during parsing.
### Hidden comments
## Format limitations
```
run running //comment runs ❌
```
The current dictionary format intentionally stays minimal:
→ everything after `//` is ignored
- no quoted tokens,
- no escaping rules,
- no multi-word entries,
- no inline weighting syntax,
- no explicit ambiguity syntax,
- no sectioning or nested structure.
## When to use this format
Each token is simply a whitespace-delimited word form after remark stripping and lowercasing.
This format is suitable for:
## Authoring guidance
- curated linguistic datasets
- exported morphological dictionaries
- domain-specific vocabularies
- generated `(word, stem)` pairs from corpora
For reliable results, keep dictionaries:
## Next steps
- consistent in normalization,
- free of accidental duplicates unless repeated weighting is intentional,
- focused on meaningful stem-to-variant groupings,
- encoded in UTF-8,
- easy to audit in plain text form.
For most current deployments, it is sensible to keep dictionary content in normalized basic ASCII form unless there is a clear requirement to preserve diacritics end-to-end.
## Relationship to other documentation
This page describes only the textual source format.
To understand how those dictionary lines are transformed into compiled runtime artifacts, continue with:
- [CLI compilation](cli-compilation.md)
- [Programmatic usage](programmatic-usage.md)
- [Quick start](quick-start.md)
## Summary
Radixor dictionaries are intentionally simple:
- one line per stem
- whitespace-separated tokens
- optional remarks
- no embedded rules
This simplicity enables:
- easy generation
- fast parsing
- deterministic behavior
- efficient compilation into compact patch-command tries
- [Architecture and reduction](architecture-and-reduction.md)

37
docs/index.md Normal file
View File

@@ -0,0 +1,37 @@
<h1 class="visually-hidden">Home</h1>
<p align="center">
<img src="assets/images/banner.jpg" alt="Radixor banner" style="width: 100%; max-width: 1100px;">
</p>
**Radixor** is a high-performance, multi-language stemmer for Java, built for production-grade search and text-processing systems.
It modernizes the proven Egothor patch-command trie approach and extends it for deployment realities that classic stemming pipelines do not handle well.
Traditional Egothor-style stemming workflows usually treat a compiled dictionary as a fixed artifact. Once built, its lexical knowledge is effectively closed unless the original source dictionary is recompiled. Radixor removes that constraint. An already compiled stemming structure can be extended with additional words and transformations, which makes it possible to evolve an existing dictionary for domain-specific, customer-specific, or deployment-specific vocabulary without rebuilding the entire lexical base from scratch.
Radixor also improves how ambiguous reductions can be handled at runtime. Instead of always forcing a single result, it can return multiple plausible stems when the input token cannot be reduced unambiguously. This allows downstream systems to preserve linguistic ambiguity where that is operationally useful, whether for retrieval quality, ranking strategies, diagnostics, or domain-specific normalization policies.
The project also has a clear research lineage. The historical idea behind this stemming family is described in Leo Galambos's paper *Lemmatizer for Document Information Retrieval Systems in JAVA* (SOFSEM 2001), which presents a semi-automatic stemming technique designed for Java-based information retrieval systems. In Radixor documentation, this reference serves as historical and algorithmic background rather than as technical documentation of the current implementation.
> Unlike traditional Egothor-based deployments, Radixor can extend an already compiled stemmer dictionary and can return multiple stems when a word is not reducible to a single unambiguous form.
Radixor delivers:
- **Fast runtime stemming** with compact lookup structures
- **Multi-language adaptability** through dictionary-driven compilation
- **Extension of compiled stemmer structures** without full recompilation from source dictionaries
- **Incremental vocabulary growth** for deployment-specific lexical refinement
- **Support for multiple stemming results** when reduction is ambiguous
- **Deterministic behavior** suitable for reproducible processing pipelines
- **Flexible integration paths**, including CLI-based and programmatic workflows
- **Operational transparency** through continuously published quality and benchmark reports
Radixor is intended for teams that require consistent stemming quality at scale, while retaining the ability to evolve lexical resources after compilation and to handle ambiguous reductions with greater precision than traditional single-stem pipelines allow.
## Start here
- Read [Quick Start](quick-start.md) for immediate implementation guidance.
- Use [Programmatic Usage](programmatic-usage.md) for application integration patterns.
- Review [Benchmarking](benchmarking.md) for reproducible performance methodology.
- Open [CI Reports](reports.md) to inspect published build artifacts and quality metrics.
- See the historical paper: [*Lemmatizer for Document Information Retrieval Systems in JAVA*](https://www.researchgate.net/publication/221512865_Lemmatizer_for_Document_Information_Retrieval_Systems_in_JAVA).

View File

@@ -0,0 +1,89 @@
# Extending and Persisting Compiled Tries
This document explains how compiled Radixor tries can be reopened, extended, rebuilt, and stored for deployment.
## Reopen and extend a compiled trie
`FrequencyTrieBuilders.copyOf(...)` reconstructs a mutable builder from a compiled trie. The reconstructed builder preserves the key-local value counts of the compiled trie as currently stored, making it suitable for subsequent modification and recompilation. Reconstruction is performed from the compiled state, not from the original unreduced insertion history.
```java
import java.io.IOException;
import java.nio.file.Path;
import org.egothor.stemmer.FrequencyTrie;
import org.egothor.stemmer.FrequencyTrieBuilders;
import org.egothor.stemmer.ReductionMode;
import org.egothor.stemmer.ReductionSettings;
import org.egothor.stemmer.StemmerPatchTrieBinaryIO;

/**
 * Demonstrates reopening an already compiled stemmer trie, extending it with
 * additional lexical data, and persisting the recompiled binary artifact.
 */
public final class ExtendCompiledStemmerExample {

    private ExtendCompiledStemmerExample() {
        throw new AssertionError("No instances.");
    }

    public static void main(final String[] arguments) throws IOException {
        // Load a compiled (GZip-compressed) stemmer artifact from disk.
        final FrequencyTrie<String> compiledTrie = StemmerPatchTrieBinaryIO.read(
            Path.of("stemmers", "english.radixor.gz"));

        final ReductionSettings settings = ReductionSettings.withDefaults(
            ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);

        // Reconstruct a mutable builder from the compiled state. Per the
        // surrounding documentation, key-local value counts are preserved
        // as currently stored in the compiled trie.
        final FrequencyTrie.Builder<String> builder = FrequencyTrieBuilders.copyOf(
            compiledTrie,
            String[]::new,
            settings);

        // Add a deployment-specific mapping; the value is a patch command.
        builder.put("microservices", "Na");

        // Recompile into a new read-only trie and persist the result.
        final FrequencyTrie<String> updatedTrie = builder.build();
        StemmerPatchTrieBinaryIO.write(
            updatedTrie,
            Path.of("stemmers", "english-custom.radixor.gz"));
    }
}
```
This enables a layered workflow:
1. start from a bundled or already compiled stemmer,
2. reconstruct a builder,
3. add custom lexical data,
4. compile and persist a new binary artifact.
## Persist and deploy compiled tries
`StemmerPatchTrieBinaryIO` reads and writes patch-command tries as GZip-compressed binary files. `StemmerPatchTrieLoader` exposes convenience methods around the same persistence functionality.
```java
import java.io.IOException;
import java.nio.file.Path;
import org.egothor.stemmer.StemmerPatchTrieBinaryIO;
StemmerPatchTrieBinaryIO.write(trie, Path.of("stemmers", "english.radixor.gz"));
```
In deployment terms, the cleanest model is usually:
- compile once,
- persist the binary artifact,
- load the artifact directly in runtime services.
## Binary-first operational model
For larger dictionaries or controlled deployment environments, a binary-first workflow is usually the most robust choice:
- prepare the compiled trie offline,
- keep the preparation step outside the runtime startup path,
- version and distribute the binary artifact,
- load the finished trie directly in production.
This model works especially well when domain-specific extensions are added in layers and then recompiled into a new read-only artifact.
## Continue with
- [Loading and Building Stemmers](programmatic-loading-and-building.md)
- [Querying and Ambiguity Handling](programmatic-querying-and-ambiguity.md)

View File

@@ -0,0 +1,133 @@
# Loading and Building Stemmers
This document explains how to acquire a compiled Radixor stemmer in Java.
## Load a bundled language dictionary
Bundled language resources are simple to use and compile directly into a `FrequencyTrie<String>` during loading.
```java
import java.io.IOException;
import org.egothor.stemmer.FrequencyTrie;
import org.egothor.stemmer.ReductionMode;
import org.egothor.stemmer.StemmerPatchTrieLoader;
/**
 * Example: load a bundled language dictionary and compile it into a
 * read-only {@code FrequencyTrie<String>} during loading.
 */
public final class BundledLanguageExample {
// Utility holder for the example; never instantiated.
private BundledLanguageExample() {
throw new AssertionError("No instances.");
}
// Loads the bundled US/UK "profi" dictionary. The boolean argument is the
// storeOriginal flag: when true, each canonical stem is also inserted as a
// no-op patch entry for itself (see the note below this snippet).
public static void main(final String[] arguments) throws IOException {
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(
StemmerPatchTrieLoader.Language.US_UK_PROFI,
true,
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
}
}
```
The `storeOriginal` flag controls whether the canonical stem is inserted as a no-op patch entry for the stem itself.
## Load a textual dictionary
Loading from a dictionary file follows the same preparation model as bundled resources, but the source comes from your own file or path. Each non-empty logical line starts with the stem and may contain zero or more variants. Input is normalized to lower case using `Locale.ROOT`, and trailing remarks introduced by `#` or `//` are ignored.
```java
import java.io.IOException;
import java.nio.file.Path;
import org.egothor.stemmer.FrequencyTrie;
import org.egothor.stemmer.ReductionMode;
import org.egothor.stemmer.ReductionSettings;
import org.egothor.stemmer.StemmerPatchTrieLoader;
/**
 * Example: compile a textual dictionary file into a read-only trie.
 * Per the surrounding documentation, input is normalized to lower case
 * using {@code Locale.ROOT} and remarks ({@code #}, {@code //}) are ignored.
 */
public final class LoadTextDictionaryExample {
// Utility holder for the example; never instantiated.
private LoadTextDictionaryExample() {
throw new AssertionError("No instances.");
}
public static void main(final String[] arguments) throws IOException {
// Second argument is the storeOriginal flag; third selects the
// reduction behavior via ReductionSettings.
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(
Path.of("data", "stemmer.txt"),
true,
ReductionSettings.withDefaults(
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS));
}
}
```
## Load a compiled binary artifact
Binary loading is typically the preferred runtime path because it avoids reparsing the textual source and skips the preparation step entirely.
```java
import java.io.IOException;
import java.nio.file.Path;
import org.egothor.stemmer.FrequencyTrie;
import org.egothor.stemmer.StemmerPatchTrieLoader;
/**
 * Example: load a pre-compiled binary stemmer artifact directly.
 * This skips textual parsing and preparation entirely, which is why it is
 * typically the preferred runtime path.
 */
public final class LoadBinaryExample {
// Utility holder for the example; never instantiated.
private LoadBinaryExample() {
throw new AssertionError("No instances.");
}
public static void main(final String[] arguments) throws IOException {
// The artifact is the native FrequencyTrie serialization wrapped in GZip.
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.loadBinary(
Path.of("stemmers", "english.radixor.gz"));
}
}
```
The binary format is the native `FrequencyTrie` serialization wrapped in GZip compression.
## Build directly with a mutable builder
A `FrequencyTrie.Builder<V>` accepts repeated `put(key, value)` calls and compiles the final read-only trie through `build()`. Compilation performs bottom-up reduction and produces the compact immutable runtime representation.
```java
import org.egothor.stemmer.FrequencyTrie;
import org.egothor.stemmer.PatchCommandEncoder;
import org.egothor.stemmer.ReductionMode;
import org.egothor.stemmer.ReductionSettings;
/**
 * Example: build a trie directly with the mutable builder API.
 * Each put(key, value) stores a patch command for a word form; build()
 * performs bottom-up reduction into the compact immutable runtime form.
 */
public final class BuilderExample {
// Utility holder for the example; never instantiated.
private BuilderExample() {
throw new AssertionError("No instances.");
}
public static void main(final String[] arguments) {
final ReductionSettings settings = ReductionSettings.withDefaults(
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
final FrequencyTrie.Builder<String> builder =
new FrequencyTrie.Builder<>(String[]::new, settings);
// Encode each (variant, stem) pair into a patch command string.
final PatchCommandEncoder encoder = new PatchCommandEncoder();
builder.put("running", encoder.encode("running", "run"));
builder.put("runs", encoder.encode("runs", "run"));
builder.put("ran", encoder.encode("ran", "run"));
builder.put("runner", encoder.encode("runner", "run"));
// Compile into the read-only runtime representation.
final FrequencyTrie<String> trie = builder.build();
System.out.println("Canonical node count: " + trie.size());
}
}
```
## Preparation-time memory characteristics
Compilation is commonly a one-time preparation activity and is generally fast enough not to be the main operational concern. The more important constraint is memory usage while building from textual dictionary data. Before reduction produces the compact immutable structure, the mutable build-time representation keeps the inserted data in memory. This is precisely why very large source dictionaries may require noticeably more memory during preparation than after compilation. The resulting compiled trie, by contrast, is designed as the compact runtime form.
This makes offline preparation especially attractive for large dictionaries.
## Continue with
- [Querying and Ambiguity Handling](programmatic-querying-and-ambiguity.md)
- [Extending and Persisting Compiled Tries](programmatic-extending-and-persistence.md)

View File

@@ -0,0 +1,83 @@
# Querying and Ambiguity Handling
This document explains how a compiled Radixor trie is queried and how ambiguity is represented.
## Query a compiled trie
### `get(...)`: preferred local value
`FrequencyTrie.get(String)` returns the most frequent value stored at the node addressed by the supplied key. If several values have the same local frequency, the winner is chosen deterministically by shorter `toString()` value first, then by lexicographically lower `toString()`, and finally by stable first-seen order. If the key does not exist or no value is stored at the addressed node, `null` is returned.
```java
final String word = "running";
final String patch = trie.get(word);
```
### `getAll(...)`: ordered local values
`FrequencyTrie.getAll(String)` returns all values stored at the addressed node, ordered by descending frequency using the same deterministic tie-breaking rules. The returned array is a defensive copy. If the key is missing or has no local values, an empty array is returned.
```java
final String[] patches = trie.getAll("axes");
```
### `getEntries(...)`: values with counts
`FrequencyTrie.getEntries(String)` returns immutable `ValueCount<V>` objects aligned with the same ordering used by `getAll(...)`.
```java
import java.util.List;
import org.egothor.stemmer.ValueCount;
final List<ValueCount<String>> entries = trie.getEntries("axes");
```
## Apply patch commands
A patch command is not the final stem. It must be applied to the original input token. `PatchCommandEncoder.apply(source, patchCommand)` performs that transformation directly on the serialized command format. If the source is `null`, the method returns `null`. If the patch is `null`, empty, or malformed in compatibility-relevant ways, the original source word is preserved. Equal source and target words are represented by the canonical no-op patch.
```java
import org.egothor.stemmer.PatchCommandEncoder;
final String word = "running";
final String patch = trie.get(word);
final String stem = PatchCommandEncoder.apply(word, patch);
```
For multiple candidates:
```java
final String word = "axes";
for (final String patch : trie.getAll(word)) {
final String stem = PatchCommandEncoder.apply(word, patch);
System.out.println(word + " -> " + stem + " (" + patch + ")");
}
```
## Understand reduction modes
Reduction mode determines how mutable subtrees are merged during compilation. All modes operate on full subtree semantics rather than only on local node content.
### `MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS`
This mode merges subtrees whose `getAll()` results are equivalent for every reachable key suffix and whose local result ordering is the same. It ignores absolute frequencies when comparing subtree signatures, but it preserves ranked multi-result ordering semantics.
### `MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS`
This mode also merges according to `getAll()` equivalence for every reachable key suffix, but it ignores local result ordering in addition to absolute frequencies. It is therefore more aggressive in what it considers equivalent.
### `MERGE_SUBTREES_WITH_EQUIVALENT_DOMINANT_GET_RESULTS`
This mode focuses on `get()` equivalence for every reachable key suffix, subject to dominance constraints. If a node does not satisfy the configured dominance thresholds, the implementation falls back to ranked `getAll()` semantics for that node to avoid unsafe over-reduction. The thresholds are configured through `ReductionSettings`. Defaults are 75 percent minimum winner share and a winner-over-second ratio of 3.
## Practical guidance
- choose a ranked `getAll()` mode when downstream ambiguity handling matters,
- choose the dominant `get()` mode when the primary operational concern is the preferred result,
- treat reduction mode as part of observable lookup semantics, not merely as an internal compression setting.
## Continue with
- [Extending and Persisting Compiled Tries](programmatic-extending-and-persistence.md)
- [Loading and Building Stemmers](programmatic-loading-and-building.md)

View File

@@ -1,322 +1,56 @@
# Programmatic Usage
> ← Back to [README.md](../README.md)
This document provides the programmatic entry point to **Radixor**.
This document describes how to use **Radixor** programmatically from Java.
Radixor follows a clear lifecycle:
It covers:
1. acquire a compiled stemmer,
2. query it for patch commands,
3. apply those commands to produce stems,
4. reopen and extend the compiled structure when needed.
- building a trie from dictionary data
- compiling it into an immutable structure
- loading compiled stemmers
- querying for stems
- working with multiple candidates
- modifying existing compiled stemmers
## Conceptual model
Radixor is dictionary-driven, but runtime stemming does not operate by scanning raw dictionary files. A source dictionary is parsed as a sequence of canonical stems and their known variants. Each variant is converted into a compact patch command that transforms the variant into the stem, while the stem itself may optionally be stored as a canonical no-op patch. The mutable trie is then reduced into a compiled read-only structure that stores ordered values and their counts at addressed nodes.
Two consequences matter for developers:
## Overview
- the quality and coverage of stemming behavior depend on dictionary richness,
- runtime usage is based on compiled patch-command lookup rather than on direct dictionary traversal.
Radixor separates the stemming lifecycle into three stages:
This is why Radixor can generalize beyond explicitly listed forms and why compiled artifacts are well suited for deployment.
1. **Build** — collect word → stem mappings in a mutable structure
2. **Compile** — reduce and convert to an immutable trie
3. **Query** — perform fast runtime lookups
## Documentation map
These stages are represented by:
The programmatic API is easier to understand when split by developer task:
- `FrequencyTrie.Builder` (mutable)
- `FrequencyTrie` (immutable, compiled)
- `StemmerPatchTrieLoader` / `StemmerPatchTrieBinaryIO` (I/O)
- [Loading and Building Stemmers](programmatic-loading-and-building.md) explains how to acquire a compiled stemmer from bundled resources, textual dictionaries, binary artifacts, or direct builder usage.
- [Querying and Ambiguity Handling](programmatic-querying-and-ambiguity.md) explains `get(...)`, `getAll(...)`, `getEntries(...)`, patch application, and the practical meaning of reduction modes.
- [Extending and Persisting Compiled Tries](programmatic-extending-and-persistence.md) explains how to reopen compiled tries, add new lexical data, rebuild them, and store them as binary artifacts.
## Core types
The main types involved in programmatic usage are:
## Building a trie programmatically
- `FrequencyTrie.Builder<V>` for mutable construction and extension,
- `FrequencyTrie<V>` for the compiled read-only trie,
- `PatchCommandEncoder` for creating and applying patch commands,
- `StemmerPatchTrieLoader` for loading bundled or textual dictionaries,
- `StemmerPatchTrieBinaryIO` for reading and writing compressed binary artifacts,
- `FrequencyTrieBuilders` for reconstructing a mutable builder from a compiled trie,
- `ReductionMode` and `ReductionSettings` for controlling compilation semantics.
You can construct a trie directly without using the CLI.
```java
import org.egothor.stemmer.*;
public final class BuildExample {
public static void main(String[] args) {
ReductionSettings settings = ReductionSettings.withDefaults(
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS
);
FrequencyTrie.Builder<String> builder =
new FrequencyTrie.Builder<>(String[]::new, settings);
PatchCommandEncoder encoder = new PatchCommandEncoder();
builder.put("running", encoder.encode("running", "run"));
builder.put("runs", encoder.encode("runs", "run"));
builder.put("ran", encoder.encode("ran", "run"));
FrequencyTrie<String> trie = builder.build();
}
}
```
## Loading from dictionary files
To parse dictionary files directly:
```java
import java.io.IOException;
import java.nio.file.Path;
import org.egothor.stemmer.*;
public final class LoadFromDictionaryExample {
public static void main(String[] args) throws IOException {
FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(
Path.of("data/stemmer.txt"),
true,
ReductionSettings.withDefaults(
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS
)
);
}
}
```
## Loading a compiled binary trie
```java
import java.io.IOException;
import java.nio.file.Path;
import org.egothor.stemmer.*;
public final class LoadBinaryExample {
public static void main(String[] args) throws IOException {
FrequencyTrie<String> trie =
StemmerPatchTrieLoader.loadBinary(Path.of("english.radixor.gz"));
}
}
```
This is the **preferred production approach**.
## Querying for stems
### Preferred result
```java
String word = "running";
String patch = trie.get(word);
String stem = PatchCommandEncoder.apply(word, patch);
```
### All candidates
```java
String[] patches = trie.getAll(word);
for (String patch : patches) {
String stem = PatchCommandEncoder.apply(word, patch);
}
```
## Accessing value frequencies
For diagnostic or advanced use cases:
```java
import org.egothor.stemmer.ValueCount;
java.util.List<ValueCount<String>> entries = trie.getEntries("axes");
for (ValueCount<String> entry : entries) {
String patch = entry.value();
int count = entry.count();
}
```
This allows:
* inspecting ambiguity
* understanding ranking decisions
* debugging dictionary quality
## Using bundled language resources
```java
FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(
StemmerPatchTrieLoader.Language.US_UK_PROFI,
true,
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS
);
```
Bundled dictionaries are useful for:
* quick integration
* testing
* reference behavior
## Persisting a compiled trie
```java
import java.io.IOException;
import java.nio.file.Path;
import org.egothor.stemmer.*;
public final class SaveExample {
public static void main(String[] args) throws IOException {
StemmerPatchTrieBinaryIO.write(trie, Path.of("english.radixor.gz"));
}
}
```
## Modifying an existing trie
A compiled trie can be reopened into a builder, extended, and rebuilt.
```java
import java.io.IOException;
import java.nio.file.Path;
import org.egothor.stemmer.*;
public final class ModifyExample {
public static void main(String[] args) throws IOException {
FrequencyTrie<String> compiled =
StemmerPatchTrieBinaryIO.read(Path.of("english.radixor.gz"));
ReductionSettings settings = ReductionSettings.withDefaults(
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS
);
FrequencyTrie.Builder<String> builder =
FrequencyTrieBuilders.copyOf(compiled, String[]::new, settings);
builder.put("microservices", PatchCommandEncoder.NOOP_PATCH);
FrequencyTrie<String> updated = builder.build();
StemmerPatchTrieBinaryIO.write(updated,
Path.of("english-custom.radixor.gz"));
}
}
```
## Thread safety
* `FrequencyTrie` (compiled):
* **thread-safe**
* safe for concurrent reads
* `FrequencyTrie.Builder`:
* **not thread-safe**
* intended for single-threaded construction
## Performance characteristics
### Querying
* O(length of word)
* minimal allocations
* suitable for high-throughput pipelines
### Loading
* binary loading is fast
* no preprocessing required
### Building
* depends on dictionary size
* reduction phase may be CPU-intensive
## Best practices
### Reuse compiled trie instances
* load once
* share across threads
### Prefer binary loading in production
* avoid rebuilding at runtime
* treat compiled files as deployable artifacts
### Use `getAll()` only when needed
* `get()` is faster and sufficient for most use cases
### Keep builders short-lived
* build → compile → discard
## Integration patterns
### Search systems
* apply stemming during indexing and querying
* ensure consistent dictionary usage
### Text normalization pipelines
* integrate as a transformation step
* combine with tokenization and filtering
### Domain adaptation
* extend dictionaries with domain-specific vocabulary
* rebuild compiled artifacts
## Recommended reading order
For most developers, the best order is:
1. [Loading and Building Stemmers](programmatic-loading-and-building.md)
2. [Querying and Ambiguity Handling](programmatic-querying-and-ambiguity.md)
3. [Extending and Persisting Compiled Tries](programmatic-extending-and-persistence.md)
## Next steps
* [Dictionary format](dictionary-format.md)
* [CLI compilation](cli-compilation.md)
* [Architecture and reduction](architecture-and-reduction.md)
## Summary
Programmatic usage of Radixor follows a clear pattern:
* build or load a trie
* query using patch commands
* apply transformations
The API is intentionally simple at the surface, while providing deeper control when needed for:
* ambiguity handling
* diagnostics
* dictionary evolution
- [Quick Start](quick-start.md)
- [CLI compilation](cli-compilation.md)
- [Dictionary format](dictionary-format.md)
- [Architecture and reduction](architecture-and-reduction.md)

View File

@@ -1,317 +1,239 @@
# Quality and Operations
> ← Back to [README.md](../README.md)
This document describes the engineering standards, quality posture, and operational model of **Radixor**.
This document describes quality, testing, and operational practices for **Radixor**.
It is intentionally broader than a test checklist. The purpose of the project is not only to provide a fast stemmer, but to provide one whose behavior is explainable, measurable, reproducible, and straightforward to audit. That objective influences both the implementation style and the surrounding operational practices.
It focuses on:
## Engineering position
- reliability and determinism
- testing strategies
- deployment patterns
- performance considerations
- lifecycle management of stemmer data
Radixor is developed with a strong preference for objective quality signals over informal confidence.
In practical terms, that means the project emphasizes:
- deterministic behavior,
- reproducible compiled artifacts,
- very high structural test coverage,
- very high mutation resistance,
- explicit benchmark methodology,
- minimal operational ambiguity in deployment.
## Overview
This is not treated as a cosmetic quality layer added after the implementation. It is part of the design goal of the project itself.
Radixor is designed to separate:
## Why quality discipline matters here
- **data preparation** (dictionary construction and compilation)
- **runtime execution** (lookup and patch application)
A stemmer can appear deceptively simple from the outside. In practice, however, correctness depends on several interacting layers:
This separation enables:
- predictable runtime behavior
- reproducible builds
- controlled evolution of stemming data
- dictionary parsing,
- patch-command generation,
- trie construction,
- reduction semantics,
- binary persistence,
- runtime lookup behavior.
A defect in any one of these layers can produce subtle and difficult-to-detect errors, including silent ranking drift, loss of ambiguity information, reconstruction inconsistencies, or incorrect stemming outcomes under only a narrow subset of inputs.
For that reason, Radixor aims to be validated not only by example-based tests, but by a broader quality model that combines functional testing, mutation testing, coverage analysis, benchmark visibility, and artifact publication.
## Determinism and reproducibility
Radixor emphasizes deterministic behavior.
Determinism is a foundational property of the project.
### Deterministic outputs
Given the same dictionary input and the same reduction settings, the project aims to produce:
Given:
- the same compiled trie semantics,
- the same local value ordering,
- the same observable `get()` and `getAll()` behavior,
- the same persisted binary output structure in semantic terms.
- the same dictionary input
- the same reduction settings
This matters for more than technical elegance. It enables:
Radixor guarantees:
- stable search behavior across deployments,
- reproducible build outputs,
- reliable regression analysis,
- explainable differences when a dictionary or reduction setting changes.
- identical compiled trie structure
- identical value ordering
- identical lookup results
A deterministic system is easier to test, easier to reason about, and safer to integrate into production pipelines.
### Why this matters
## Test strategy
- stable search behavior across deployments
- reproducible builds
- easier debugging and regression analysis
The project is intended to maintain very high confidence in both core correctness and behavioral stability.
### Structural coverage
High code coverage is treated as a useful signal, but not as a sufficient goal on its own. Coverage is valuable only when the covered scenarios actually pressure the implementation in meaningful ways.
## Testing strategy
In Radixor, strong coverage is expected across areas such as:
### Unit testing
- patch encoding and application,
- mutable trie construction,
- subtree reduction,
- compiled trie lookup,
- binary serialization and deserialization,
- reconstruction from compiled state,
- dictionary parsing and CLI behavior.
Core components should be tested independently:
### Mutation resistance
- patch encoding and decoding
- trie construction
- reduction behavior
- binary serialization and deserialization
Mutation testing is especially important for this project because it helps distinguish superficial test execution from genuinely discriminating tests.
### Dictionary validation tests
A project can report high line or branch coverage while still failing to detect semantically dangerous implementation drift. Mutation testing provides a stronger objective signal: whether the test suite actually notices meaningful behavioral changes.
A recommended pattern:
For Radixor, very high mutation scores are therefore part of the intended engineering standard, not an optional vanity metric.
1. load dictionary input
2. compile trie
3. re-apply all word → stem mappings
4. verify that:
### Boundary and negative-path validation
- expected stem is present in `getAll()`
- preferred result (`get()`) is correct when deterministic
The project also benefits from extensive negative and edge-case testing, for example around:
This ensures:
- malformed patch commands,
- missing or corrupt binary data,
- invalid CLI arguments,
- ambiguous mappings,
- dominance-threshold edge conditions,
- reconstruction of reduced compiled tries,
- empty inputs and short words.
- no data loss during reduction
- correctness of patch encoding
These cases are important because many real integration failures occur at the boundary conditions, not in the central happy path.
## Quality signals and published evidence
The project publishes durable quality artifacts through GitHub Pages so that important signals remain externally inspectable rather than existing only as transient CI output.
## Regression testing
Those published surfaces include:
Maintain a stable test dataset:
- unit test results,
- coverage reports,
- mutation testing reports,
- static analysis reports,
- benchmark outputs,
- software composition artifacts.
- representative vocabulary
- edge cases (short words, long words, ambiguous forms)
This publication model improves transparency and makes it easier to inspect the project's quality posture without having to reconstruct the CI environment locally.
Use it to:
## Operational model
- detect unintended changes
- verify behavior after refactoring
- validate reduction mode changes
Radixor is designed around a clean separation between preparation-time work and runtime execution.
### Preparation phase
Preparation includes:
## Performance testing
- creating or refining dictionary data,
- compiling the dictionary into a reduced read-only trie,
- validating the resulting artifact,
- persisting it as a deployable binary stemmer.
Performance should be evaluated in terms of:
### Runtime phase
### Throughput
Runtime usage is intentionally simpler:
- words processed per second
- load the compiled artifact,
- reuse the resulting trie,
- perform fast lookups and patch application,
- avoid rebuilding or reparsing during live request handling.
### Latency
This separation reduces startup unpredictability, keeps runtime behavior stable, and makes deployment artifacts explicit.
- time per lookup
## Production posture
### Memory footprint
For production use, the preferred model is straightforward:
- size of compiled trie
- runtime memory usage
1. prepare or refine the lexical resource,
2. compile it offline,
3. validate the resulting artifact,
4. deploy the compiled binary,
5. load it once and reuse it.
Benchmark with:
This model has several advantages:
- realistic token streams
- production-like dictionaries
- no runtime compilation cost,
- no repeated parsing overhead,
- clear versioning of stemming behavior,
- better reproducibility across environments,
- simpler operational diagnosis when results change.
## Auditability and dependency posture
Radixor deliberately avoids external runtime dependencies.
## Deployment model
That choice serves a practical engineering goal: the project should be easy to audit from both a correctness and a security perspective, without forcing downstream users to reason through a large dependency graph or a complex software supply chain for core functionality.
### Recommended workflow
A dependency-free core does not make a project automatically secure, but it does simplify several important activities:
1. prepare dictionary data
2. compile using CLI
3. store `.radixor.gz` artifact
4. deploy artifact with application
5. load using `loadBinary(...)`
- source review,
- behavioral auditing,
- release inspection,
- software composition analysis,
- long-term maintenance.
### Why this model
In operational terms, this means there is less hidden behavior outside the project's own codebase and less need to evaluate third-party runtime libraries for the core implementation path.
- avoids runtime compilation overhead
- reduces startup latency
- ensures consistent behavior across environments
## Security-minded operational guidance
The project's operational simplicity should be preserved in deployment practice.
Recommended principles include:
## Artifact management
- treat source dictionaries as controlled inputs,
- generate compiled artifacts in known build environments,
- version compiled artifacts explicitly,
- avoid loading untrusted binary stemmer files,
- keep benchmark, test, and quality outputs attached to the same revision that produced the artifact.
Compiled stemmers should be treated as versioned assets.
These practices support traceability and reduce ambiguity about what exactly is running in production.
### Versioning
## Performance as a quality concern
- include version in filename or metadata
- track dictionary source and reduction settings
Performance is not isolated from quality; for Radixor, it is part of the project's engineering contract.
Example:
The benchmark suite exists to make throughput behavior measurable and historically visible. At the same time, benchmark interpretation must remain disciplined. Absolute numbers can vary by environment, especially when published through shared CI infrastructure. Sustained relative behavior and reproducible local benchmark methodology are more meaningful than one-off raw figures.
```
english-v1.2-ranked.radixor.gz
```
This is why benchmarking belongs alongside testing and reporting rather than outside the quality discussion altogether.
### Storage
## Operational observability
- store in repository or artifact storage
- ensure consistent distribution across environments
Radixor itself is intentionally small and does not attempt to become an observability framework. Instead, integrations should provide the surrounding operational visibility that production systems require.
Typical integration-level observability includes:
- reporting load failures,
- monitoring startup artifact loading,
- measuring lookup throughput in the host application,
- tracking memory usage of loaded compiled tries,
- optionally sampling ambiguity-heavy cases when `getAll()` is part of the application logic.
## Runtime usage
The project's role is to remain deterministic and inspectable enough that such operational signals are meaningful.
### Loading
## What feedback is most valuable
- load once during application startup
- reuse `FrequencyTrie` instance
Feedback is especially valuable when it improves the objectivity or professional rigor of the project.
### Thread safety
That includes, for example:
- compiled trie is safe for concurrent access
- no synchronization required for reads
- defects in behavioral correctness,
- weaknesses in reduction semantics or edge-case handling,
- benchmark methodology issues,
- gaps in tests or mutation resistance,
- ambiguities in published reports,
- opportunities to improve auditability, reproducibility, or operational clarity.
### Avoid repeated loading
Project feedback is most useful when it helps strengthen the project as an implementation that can be trusted, reviewed, and maintained at a professional standard.
Do not:
## Practical summary
- load trie per request
- rebuild trie at runtime
Radixor aims to combine:
- strong algorithmic performance,
- deterministic behavior,
- very high validation standards,
- transparent published quality evidence,
- low operational ambiguity,
- easy auditability of the core implementation.
That combination is central to the identity of the project. The goal is not merely to be fast, but to be fast in a way that remains explainable, testable, reproducible, and professionally defensible.
## Memory considerations
## Related documentation
- compiled tries are compact but not negligible
- size depends on:
- dictionary size
- reduction mode
Recommendations:
- monitor memory usage in production
- choose reduction mode appropriately
## Reduction mode in production
Default recommendation:
- use **ranked mode**
Switch to other modes only when:
- memory constraints are strict
- multiple candidate results are not required
Always validate behavior after changing reduction mode.
## Dictionary lifecycle
### Updating dictionaries
When dictionary data changes:
1. update source file
2. recompile
3. run validation tests
4. deploy new artifact
### Backward compatibility
- changes in dictionary may affect stemming results
- evaluate impact on search relevance
## Observability
Radixor itself does not provide observability features; integration should provide:
- logging for loading failures
- metrics for lookup throughput
- monitoring of memory usage
Optional:
- sampling of ambiguous results (`getAll()`)
## Error handling
### During compilation
Handle:
- invalid dictionary format
- I/O failures
- invalid arguments
### During runtime
Handle:
- missing dictionary files
- corrupted binary artifacts
Fail fast on initialization errors.
## Operational best practices
- compile dictionaries offline
- version compiled artifacts
- test before deployment
- load once and reuse
- monitor performance and memory
- document reduction settings used
## Security considerations
- treat dictionary input as trusted data
- validate external sources before compilation
- avoid loading unverified binary artifacts
## Integration checklist
Before production deployment:
- dictionary validated
- compiled artifact generated
- reduction mode documented
- performance tested
- memory usage verified
- regression tests passing
## Next steps
- [Quick start](quick-start.md)
- [Benchmarking](benchmarking.md)
- [Reports](reports.md)
- [CLI compilation](cli-compilation.md)
- [Programmatic usage](programmatic-usage.md)
## Summary
Radixor is designed for:
- deterministic behavior
- efficient runtime execution
- controlled data-driven evolution
By separating compilation from runtime and following proper operational practices, it can be reliably integrated into production-grade systems.

View File

@@ -1,10 +1,92 @@
# Quick Start
> ← Back to [README.md](../README.md)
This guide introduces the fastest practical path to using **Radixor**.
This guide shows the fastest way to start using **Radixor** and the most common next steps.
Radixor separates preparation from runtime usage. Source dictionaries are used to derive patch commands and reduce them into a compact read-only trie. Runtime stemming then operates on that compiled structure rather than on the original dictionary text. A richer dictionary usually improves the quality and coverage of inferred transformations, including transformations that are applicable to words not explicitly present in the source material. The reduction step also removes a large amount of redundant lexical information, which is why very large dictionaries can still produce compact runtime artifacts. These artifacts can be persisted and loaded directly when needed.
## Hello world
A practical workflow usually consists of two independent phases:
1. obtain a compiled stemmer,
2. use the compiled stemmer.
## 1. Obtain a compiled stemmer
A compiled stemmer can be obtained in three common ways.
### Use a bundled language dictionary
Radixor ships with bundled dictionaries for a set of supported languages. These resources are line-oriented dictionaries stored with the library and compiled into a `FrequencyTrie<String>` when loaded. The loader can also store the canonical stem itself as a no-op patch command.
```java
import java.io.IOException;
import org.egothor.stemmer.FrequencyTrie;
import org.egothor.stemmer.ReductionMode;
import org.egothor.stemmer.StemmerPatchTrieLoader;
public final class BundledStemmerExample {
private BundledStemmerExample() {
throw new AssertionError("No instances.");
}
public static void main(final String[] arguments) throws IOException {
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(
StemmerPatchTrieLoader.Language.US_UK_PROFI,
true,
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
System.out.println("Canonical node count: " + trie.size());
}
}
```
### Load a previously compiled binary stemmer
Compiled stemmers can be stored as GZip-compressed binary artifacts and loaded directly. This is usually the most convenient production path because no dictionary parsing or recompilation is needed during application startup.
```java
import java.io.IOException;
import java.nio.file.Path;
import org.egothor.stemmer.FrequencyTrie;
import org.egothor.stemmer.StemmerPatchTrieLoader;
public final class LoadBinaryStemmerExample {
private LoadBinaryStemmerExample() {
throw new AssertionError("No instances.");
}
public static void main(final String[] arguments) throws IOException {
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.loadBinary(
Path.of("stemmers", "english.radixor.gz"));
System.out.println("Canonical node count: " + trie.size());
}
}
```
### Build or extend a stemmer from dictionary data
Radixor can also build a compiled trie from a custom dictionary. Dictionary lines consist of a canonical stem followed by zero or more variants. The parser lowercases input with `Locale.ROOT`, ignores leading and trailing whitespace, and supports line comments introduced by `#` or `//`.
This path is also relevant when you extend an existing compiled stemmer with additional domain-specific entries and rebuild a new compact artifact.
CLI-based compilation is covered on its own dedicated page rather than in this Quick Start. Conceptually, it is simply another way to prepare the compiled artifact before runtime use.
## 2. Use the compiled stemmer
A compiled `FrequencyTrie<String>` stores patch commands, not final stems. Querying therefore has two steps:
1. retrieve one or more patch commands from the trie,
2. apply each patch command to the original input word.
The trie returns values associated with the exact addressed node. `get(...)` returns the locally preferred value, while `getAll(...)` returns all locally stored values ordered by descending frequency with deterministic tie-breaking.
### Get the preferred result
Use `get(...)` when the application needs a single preferred transformation.
```java
import java.io.IOException;
@@ -14,9 +96,9 @@ import org.egothor.stemmer.PatchCommandEncoder;
import org.egothor.stemmer.ReductionMode;
import org.egothor.stemmer.StemmerPatchTrieLoader;
public final class HelloRadixor {
public final class SingleStemExample {
private HelloRadixor() {
private SingleStemExample() {
throw new AssertionError("No instances.");
}
@@ -30,75 +112,44 @@ public final class HelloRadixor {
final String patch = trie.get(word);
final String stem = PatchCommandEncoder.apply(word, patch);
System.out.println(word + " -> " + stem);
System.out.println(word + " -> " + stem + " (" + patch + ")");
}
}
```
This example shows the core workflow:
### Get all candidate results
1. load a trie
2. get a patch command for a word
3. apply the patch
4. obtain the stem
## Retrieve multiple candidate stems
If you need more than one candidate result, use `getAll(...)` instead of `get(...)`.
Use `getAll(...)` when the application should preserve ambiguity instead of collapsing everything into one result. The method is available on every compiled trie. What changes across reduction modes is the semantic strength with which multi-result behavior is preserved during reduction, not whether the method exists.
```java
final String word = "axes";
final String[] patches = trie.getAll(word);
for (String patch : patches) {
for (final String patch : patches) {
final String stem = PatchCommandEncoder.apply(word, patch);
System.out.println(word + " -> " + stem + " (" + patch + ")");
}
```
## Load a compiled binary stemmer
### Inspect ranked values and counts
For production systems, the preferred approach is usually to precompile the dictionary and load the compressed binary artifact at runtime.
For diagnostics or advanced ranking logic, use `getEntries(...)` to obtain value-count pairs in the same deterministic order as `getAll(...)`.
```java
import java.io.IOException;
import java.nio.file.Path;
import java.util.List;
import org.egothor.stemmer.FrequencyTrie;
import org.egothor.stemmer.PatchCommandEncoder;
import org.egothor.stemmer.StemmerPatchTrieLoader;
import org.egothor.stemmer.ValueCount;
public final class BinaryStemmerExample {
final List<ValueCount<String>> entries = trie.getEntries("axes");
private BinaryStemmerExample() {
throw new AssertionError("No instances.");
}
public static void main(final String[] arguments) throws IOException {
final Path path = Path.of("stemmers", "english.radixor.gz");
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.loadBinary(path);
final String word = "connected";
final String patch = trie.get(word);
final String stem = PatchCommandEncoder.apply(word, patch);
System.out.println(word + " -> " + stem);
}
for (final ValueCount<String> entry : entries) {
System.out.println(entry.value() + " -> " + entry.count());
}
```
## Compile a dictionary from the command line
## Extend an existing compiled stemmer
```bash
java org.egothor.stemmer.Compile \
--input ./data/stemmer.txt \
--output ./build/english.radixor.gz \
--reduction-mode MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS \
--store-original \
--overwrite
```
## Modify an existing compiled stemmer
A compiled trie is read-only, but it is not permanently closed. Radixor can reconstruct a mutable builder from a compiled trie, preserve the currently stored local counts, accept additional insertions, and then compile a new read-only trie. Reconstruction operates on the compiled form, so if the source trie was already reduced by subtree merging, the reopened builder reflects that compiled state rather than the original unreduced insertion history.
```java
import java.io.IOException;
@@ -111,17 +162,15 @@ import org.egothor.stemmer.ReductionMode;
import org.egothor.stemmer.ReductionSettings;
import org.egothor.stemmer.StemmerPatchTrieBinaryIO;
public final class ModifyCompiledExample {
public final class ExtendCompiledStemmerExample {
private ModifyCompiledExample() {
private ExtendCompiledStemmerExample() {
throw new AssertionError("No instances.");
}
public static void main(final String[] arguments) throws IOException {
final Path input = Path.of("stemmers", "english.radixor.gz");
final Path output = Path.of("stemmers", "english-custom.radixor.gz");
final FrequencyTrie<String> compiledTrie = StemmerPatchTrieBinaryIO.read(input);
final FrequencyTrie<String> compiledTrie = StemmerPatchTrieBinaryIO.read(
Path.of("stemmers", "english.radixor.gz"));
final ReductionSettings settings = ReductionSettings.withDefaults(
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
@@ -131,18 +180,25 @@ public final class ModifyCompiledExample {
String[]::new,
settings);
builder.put("microservices", PatchCommandEncoder.NOOP_PATCH);
builder.put("microservices", "Na");
final FrequencyTrie<String> updatedTrie = builder.build();
StemmerPatchTrieBinaryIO.write(updatedTrie, output);
StemmerPatchTrieBinaryIO.write(
updatedTrie,
Path.of("stemmers", "english-custom.radixor.gz"));
}
}
```
## Operational note on memory and preparation
Dictionary compilation is usually a one-time preparation step and is generally fast. The more relevant operational constraint is memory consumption during preparation: before reduction, the mutable build-time structure keeps the full dictionary-derived content in RAM. Reduction then compacts it substantially, but very large source dictionaries can still require significant memory during the initial build phase. The best operational model is therefore to compile once, persist the resulting binary artifact, and load that artifact directly in runtime environments.
## Where to continue
* [Dictionary format](dictionary-format.md)
* [CLI compilation](cli-compilation.md)
* [Programmatic usage](programmatic-usage.md)
* [Built-in languages](built-in-languages.md)
* [Architecture and reduction](architecture-and-reduction.md)
- [Programmatic Usage](programmatic-usage.md)
- [Dictionary format](dictionary-format.md)
- [CLI compilation](cli-compilation.md)
- [Built-in languages](built-in-languages.md)
- [Architecture and reduction](architecture-and-reduction.md)

208
docs/reduction-semantics.md Normal file
View File

@@ -0,0 +1,208 @@
# Reduction Semantics
This document explains how **Radixor** decides that two subtrees are equivalent, how the different reduction modes work, and how those choices affect observable runtime behavior.
## Why reduction exists
Without reduction, the trie would still work, but many subtrees that mean the same thing would remain duplicated. The result would be a much larger runtime artifact than necessary.
Reduction solves that by merging semantically equivalent subtrees into one canonical representative.
The key idea is simple:
> if two subtrees behave the same way under the semantic contract chosen for compilation, only one physical copy is needed.
## Reduction is semantic, not merely structural
Radixor does not reduce nodes merely because they look similar locally. It reduces subtrees only when their **meaning** matches according to the selected mode.
That is why reduction is based on a **signature** that captures both:
1. the local semantics of the current node,
2. the structure and semantics of all descendant edges.
Conceptually:
```text
Signature = (LocalDescriptor, SortedChildDescriptors)
```
Two subtrees are merged only if their signatures are equal.
## Local descriptors
The local descriptor defines what “equivalent” means for the values stored at one node.
Radixor supports three semantic views.
### Ranked descriptor
The ranked descriptor preserves the full ordered result semantics of `getAll()`.
That means:
- candidate membership is preserved,
- local ordering is preserved,
- observable ranked multi-result behavior remains stable.
This is the most semantically faithful mode when ambiguity handling matters.
### Unordered descriptor
The unordered descriptor preserves the set of reachable results, but not their local ordering.
That means:
- candidate membership is preserved,
- ordering differences may be ignored,
- more subtrees can be merged than in ranked mode.
This mode is useful when alternative candidates matter but exact ranking does not.
### Dominant descriptor
The dominant descriptor focuses on the preferred result returned by `get()`.
This mode is used only when the dominant local candidate is strong enough according to configured thresholds:
- minimum winner percentage,
- winner-over-second ratio.
If that local dominance is not strong enough, Radixor does not force dominant semantics. Instead, it falls back to ranked semantics for that node to avoid unsafe over-reduction.
That fallback is one of the most important safeguards in the design.
## Child descriptors
A subtree is not defined only by the values stored at the current node. It is also defined by what behavior is reachable through its children.
Each child contributes:
```text
(edge character, child signature)
```
Children are sorted by edge character so that signatures remain deterministic and stable.
This matters because reduction must not depend on incidental map iteration order or other non-semantic implementation details.
## Canonicalization
Once a subtree signature is computed, the reduction process checks whether an equivalent canonical subtree already exists.
If yes, the existing reduced node is reused.
If no, a new canonical reduced node is created and registered.
This turns reduction into a canonicalization process:
- compute semantic identity,
- find canonical representative,
- reuse or create,
- continue bottom-up.
That is how Radixor eliminates duplicated equivalent subtrees.
## Count aggregation and compiled state
When multiple original build-time subtrees collapse into one canonical reduced node, local counts may be aggregated.
This is an important point for understanding compiled artifacts.
A compiled trie is not always a verbatim replay of original insertion history. It is a canonical runtime structure that preserves the semantics guaranteed by the chosen reduction mode.
This explains two things:
- why compiled artifacts can become dramatically smaller,
- why reconstructing a builder from a compiled trie reflects the compiled state rather than the full original unreduced history.
## Reduction modes
### `MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS`
This mode merges subtrees only when their `getAll()` results are equivalent for every reachable key suffix and when local ordering is preserved.
Use this mode when:
- ambiguity handling matters,
- `getAll()` ordering should remain meaningful,
- behavioral fidelity is more important than maximum compression.
This is the safest and most generally recommended mode.
### `MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS`
This mode also preserves `getAll()`-level membership equivalence for every reachable key suffix, but it ignores local ordering differences.
Use this mode when:
- alternative candidates still matter,
- exact ordering is less important,
- stronger reduction is acceptable.
This mode is more aggressive than ranked mode, but less semantically rich.
### `MERGE_SUBTREES_WITH_EQUIVALENT_DOMINANT_GET_RESULTS`
This mode focuses on preserving dominant `get()` semantics for every reachable key suffix, subject to dominance thresholds.
Use this mode when:
- the main operational concern is the preferred result,
- richer alternative-result behavior is less important,
- stronger reduction is desirable.
Because non-dominant nodes fall back to ranked semantics, this mode is not simply “discard everything except the winner”. It is a controlled reduction strategy with a built-in safety condition.
## Practical effect on runtime behavior
Reduction mode is not just a storage optimization setting. It affects what distinctions remain visible after compilation.
### When ranked mode is used
You can rely on full ranked `getAll()` semantics being preserved.
### When unordered mode is used
You can rely on candidate membership, but not necessarily on preserving the same local ranking distinctions.
### When dominant mode is used
You optimize primarily for preferred-result semantics. Alternative-result behavior may still exist, but it is no longer the primary semantic contract of the reduction.
## Choosing a mode
A practical rule of thumb is:
- choose **ranked** if you are unsure,
- choose **unordered** if alternative membership matters but ranking does not,
- choose **dominant** only when your application is fundamentally driven by `get()` and you understand the trade-off.
## Why this design works well
The reduction model succeeds because it does not confuse “smaller” with “acceptable”.
Instead, it makes the semantic contract explicit:
- what exactly must be preserved,
- what differences may be ignored,
- when a more aggressive mode is safe,
- when the system must fall back to a stricter interpretation.
That explicitness is what makes the compression trustworthy.
## Mental model to keep
If you want one concise mental model for reduction, use this one:
- build-time insertion collects examples,
- reduction asks which subtrees mean the same thing,
- the answer depends on the chosen semantic contract,
- canonical representatives are shared,
- the compiled trie preserves the behavior promised by that contract.
## Continue with
- [Architecture](architecture.md)
- [Programmatic usage](programmatic-usage.md)
- [CLI compilation](cli-compilation.md)

61
docs/reports.md Normal file
View File

@@ -0,0 +1,61 @@
# Reports and Published Build Artifacts
Radixor publishes durable build outputs to GitHub Pages from qualifying runs of `.github/workflows/pages.yml`.
This page is the central entry point for published project artifacts, including build summaries, API documentation, test and quality reports, benchmark outputs, and software composition materials. It is intended both for routine project inspection and for linking stable report surfaces from external references such as the README, release notes, or development workflows.
## Stable entry points
The following links are the primary stable locations for the most recent published build outputs:
- [Latest build summary](https://leogalambos.github.io/Radixor/builds/latest/)
- [Browse historical build reports](https://leogalambos.github.io/Radixor/builds/)
Use `builds/latest/` when you want the current published report surface. Use `builds/` when you need to inspect or compare retained historical runs.
## API and developer documentation
These reports are primarily useful when reviewing the published API surface and generated developer-facing documentation:
- [Javadoc](https://leogalambos.github.io/Radixor/builds/latest/javadoc/)
## Verification and code quality reports
These reports describe the outcome of core verification and static-analysis stages for the latest published build:
- [Unit test report](https://leogalambos.github.io/Radixor/builds/latest/test/)
- [PMD report](https://leogalambos.github.io/Radixor/builds/latest/pmd/main.html)
- [JaCoCo coverage report](https://leogalambos.github.io/Radixor/builds/latest/coverage/)
- [PIT mutation testing report](https://leogalambos.github.io/Radixor/builds/latest/pitest/)
- [Dependency vulnerability report](https://leogalambos.github.io/Radixor/builds/latest/dependency-check/dependency-check-report.html)
Together, these reports provide the most direct published view of functional correctness, static quality signals, coverage, mutation resistance, and dependency-level security review outputs.
## Software composition artifacts
These artifacts expose the published software bill of materials for the latest build:
- [SBOM (JSON)](https://leogalambos.github.io/Radixor/builds/latest/sbom/radixor-sbom.json)
- [SBOM (XML)](https://leogalambos.github.io/Radixor/builds/latest/sbom/radixor-sbom.xml)
They are useful for dependency inspection, downstream integration, compliance-oriented workflows, and artifact traceability.
## Benchmark outputs and badge metadata
These resources expose benchmark results and generated badge metadata derived from the latest published build:
- [JMH benchmark results (TXT)](https://leogalambos.github.io/Radixor/builds/latest/jmh/jmh-results.txt)
- [JMH benchmark results (CSV)](https://leogalambos.github.io/Radixor/builds/latest/jmh/jmh-results.csv)
- [Coverage badge metadata](https://leogalambos.github.io/Radixor/builds/latest/metrics/coverage-badge.json)
- [Mutation badge metadata](https://leogalambos.github.io/Radixor/builds/latest/metrics/pitest-badge.json)
- [Benchmark badge metadata](https://leogalambos.github.io/Radixor/builds/latest/metrics/jmh-badge.json)
The benchmark outputs provide direct access to the published JMH result files, while the badge metadata endpoints are intended for status surfaces such as the project README or other generated dashboards.
## Practical usage
In most cases, the recommended entry path is:
1. start with the [Latest build summary](https://leogalambos.github.io/Radixor/builds/latest/),
2. open the specific report category relevant to your task,
3. use [Browse historical build reports](https://leogalambos.github.io/Radixor/builds/) when historical inspection is needed.

View File

@@ -7,6 +7,11 @@ com.google.code.gson:gson:2.13.2=pmd
com.google.errorprone:error_prone_annotations:2.41.0=pmd
net.bytebuddy:byte-buddy-agent:1.17.7=jmhRuntimeClasspath,testCompileClasspath,testRuntimeClasspath
net.bytebuddy:byte-buddy:1.17.7=jmhRuntimeClasspath,testCompileClasspath,testRuntimeClasspath
net.jqwik:jqwik-api:1.9.3=jmhRuntimeClasspath,testCompileClasspath,testRuntimeClasspath
net.jqwik:jqwik-engine:1.9.3=jmhRuntimeClasspath,testRuntimeClasspath
net.jqwik:jqwik-time:1.9.3=jmhRuntimeClasspath,testCompileClasspath,testRuntimeClasspath
net.jqwik:jqwik-web:1.9.3=jmhRuntimeClasspath,testCompileClasspath,testRuntimeClasspath
net.jqwik:jqwik:1.9.3=jmhRuntimeClasspath,testCompileClasspath,testRuntimeClasspath
net.sf.jopt-simple:jopt-simple:4.9=pitest
net.sf.jopt-simple:jopt-simple:5.0.4=jmh,jmhCompileClasspath,jmhRuntimeClasspath
net.sf.saxon:Saxon-HE:12.9=pmd
@@ -19,7 +24,7 @@ org.apache.commons:commons-lang3:3.18.0=pitest
org.apache.commons:commons-lang3:3.20.0=pmd
org.apache.commons:commons-math3:3.6.1=jmh,jmhCompileClasspath,jmhRuntimeClasspath
org.apache.commons:commons-text:1.14.0=pitest
org.apiguardian:apiguardian-api:1.1.2=testCompileClasspath
org.apiguardian:apiguardian-api:1.1.2=jmhRuntimeClasspath,testCompileClasspath,testRuntimeClasspath
org.checkerframework:checker-qual:3.52.1=pmd
org.jacoco:org.jacoco.agent:0.8.14=jacocoAgent,jacocoAnt
org.jacoco:org.jacoco.ant:0.8.14=jacocoAnt

View File

@@ -1,12 +1,14 @@
#
# After changing dependency versions:
#
# unlock temporarily: LockMode.STRICT -> LockMode.LENIENT
#
# refresh verification metadata:
# ./gradlew --write-verification-metadata sha256 test jmh distZip cyclonedxBom
#
# run:
# ./gradlew --write-locks classes testClasses jmh distZip cyclonedxBom
#
# if needed, refresh verification metadata:
# ./gradlew --write-verification-metadata sha256 test jmh distZip cyclonedxBom
#
# (optional - for Eclipse IDE)
# insert trusted-artifacts into gradle/verification-metadata.xml/verification-metadata/configuration:
# <trusted-artifacts>
@@ -21,6 +23,7 @@
[versions]
junit = "5.14.3"
mockito = "5.23.0"
jqwik = "1.9.3"
[libraries]
junit-bom = { module = "org.junit:junit-bom", version.ref = "junit" }
@@ -29,3 +32,5 @@ junit-platform-launcher = { module = "org.junit.platform:junit-platform-launcher
mockito-core = { module = "org.mockito:mockito-core", version.ref = "mockito" }
mockito-junit-jupiter = { module = "org.mockito:mockito-junit-jupiter", version.ref = "mockito" }
jqwik = { module = "net.jqwik:jqwik", version.ref = "jqwik" }

View File

@@ -131,7 +131,10 @@ tasks.register('centralBundle', Zip) {
dependsOn(tasks.named('createCentralChecksums'))
from(centralStagingRepositoryDirectory)
from(centralStagingRepositoryDirectory) {
exclude '**/maven-metadata*.xml*'
}
destinationDirectory = centralBundleDirectory
archiveFileName = "radixor-${project.version}-central-bundle.zip"
}

View File

@@ -568,6 +568,46 @@
<sha256 value="1af699f8d9ddab67f9a0d202fbd7915eb0362a5a6dfd5ffc54cafa3465c9cb0a" origin="Generated by Gradle"/>
</artifact>
</component>
<component group="net.jqwik" name="jqwik" version="1.9.3">
<artifact name="jqwik-1.9.3.jar">
<sha256 value="562931e1667308180056a8ce85791f71ab8c37ca8efc2006a163ba5d650e5f73" origin="Generated by Gradle"/>
</artifact>
<artifact name="jqwik-1.9.3.module">
<sha256 value="681316f856db4ea3cac8fcced811127fc1d7016875e5b50aa4a55024513a93d7" origin="Generated by Gradle"/>
</artifact>
</component>
<component group="net.jqwik" name="jqwik-api" version="1.9.3">
<artifact name="jqwik-api-1.9.3.jar">
<sha256 value="4bce7e80beb6d9d7092a799fa8a509d76cc31dbb20c938a9952965c15d1dd9b2" origin="Generated by Gradle"/>
</artifact>
<artifact name="jqwik-api-1.9.3.module">
<sha256 value="69984416ea2e9f7fde40cfac983d2f540d3a37e9766fd3b0a06fada8f9b4cff2" origin="Generated by Gradle"/>
</artifact>
</component>
<component group="net.jqwik" name="jqwik-engine" version="1.9.3">
<artifact name="jqwik-engine-1.9.3.jar">
<sha256 value="b85592ee78e30239ccfdca7a134f918ee94ebec51ad29a313fc9a676d97b3ede" origin="Generated by Gradle"/>
</artifact>
<artifact name="jqwik-engine-1.9.3.module">
<sha256 value="2c68479ebda9e334bc9033abd2ef227353808f20114f197947b5c7b9646ab8e5" origin="Generated by Gradle"/>
</artifact>
</component>
<component group="net.jqwik" name="jqwik-time" version="1.9.3">
<artifact name="jqwik-time-1.9.3.jar">
<sha256 value="9fd09021d8f03d44990457bf3095cf0aaf34d2785d1108ff22590286c233b3e5" origin="Generated by Gradle"/>
</artifact>
<artifact name="jqwik-time-1.9.3.module">
<sha256 value="c2b056576c8767bfcd7efdd982890fbc71e608fb5c9c80fc145cfee6adeeaa24" origin="Generated by Gradle"/>
</artifact>
</component>
<component group="net.jqwik" name="jqwik-web" version="1.9.3">
<artifact name="jqwik-web-1.9.3.jar">
<sha256 value="6aee9d583c1ff9efe319b2fa0bc9d75fc616de6d1f240ddbd2af9eabda483dbe" origin="Generated by Gradle"/>
</artifact>
<artifact name="jqwik-web-1.9.3.module">
<sha256 value="38c86130c8b86c1657b4f8256e065ee08551f7c5ce728d1a5be8f63133b14554" origin="Generated by Gradle"/>
</artifact>
</component>
<component group="net.sf.jopt-simple" name="jopt-simple" version="4.9">
<artifact name="jopt-simple-4.9.jar">
<sha256 value="26c5856e954b5f864db76f13b86919b59c6eecf9fd930b96baa8884626baf2f5" origin="Generated by Gradle"/>

65
mkdocs.yml Normal file
View File

@@ -0,0 +1,65 @@
site_name: Radixor
site_description: High-performance multi-language stemming toolkit for Java
site_url: https://leogalambos.github.io/Radixor/
repo_url: https://github.com/leogalambos/Radixor
repo_name: leogalambos/Radixor
copyright: "&copy; 2026 Egothor. Licensed under <a href='https://github.com/leogalambos/Radixor/blob/main/LICENSE'>BSD-3-Clause</a>."
theme:
name: material
language: en
features:
- navigation.instant
- navigation.sections
- navigation.top
- search.suggest
- search.highlight
- content.code.copy
palette:
- scheme: default
primary: indigo
accent: indigo
extra:
generator: false
extra_css:
- assets/stylesheets/extra.css
markdown_extensions:
- admonition
- attr_list
- md_in_html
- pymdownx.details
- pymdownx.highlight
- pymdownx.superfences
- tables
nav:
- Home: index.md
- Getting Started:
- Quick Start: quick-start.md
- Built-in Languages: built-in-languages.md
- Dictionary Format: dictionary-format.md
- CLI Compilation: cli-compilation.md
- Programmatic Usage:
- Overview: programmatic-usage.md
- Loading and Building Stemmers: programmatic-loading-and-building.md
- Querying and Ambiguity Handling: programmatic-querying-and-ambiguity.md
- Extending and Persisting Compiled Tries: programmatic-extending-and-persistence.md
- Architecture and Semantics:
- Overview: architecture-and-reduction.md
- Architecture: architecture.md
- Reduction Semantics: reduction-semantics.md
- Compatibility and Guarantees: compatibility-and-guarantees.md
- Dictionaries:
- Contributing Dictionaries: contributing-dictionaries.md
- Quality and Operations:
- Quality and Operations: quality-and-operations.md
- Benchmarking: benchmarking.md
- Reports: reports.md

View File

@@ -426,6 +426,8 @@ public final class FrequencyTrie<V> {
childNodeIds[edgeIndex] = dataInput.readInt();
}
validateSerializedEdges(nodeIndex, edgeLabels);
final int valueCount = dataInput.readInt();
if (valueCount < 0) {
throw new IOException("Negative value count at node " + nodeIndex + ": " + valueCount);
@@ -474,6 +476,28 @@ public final class FrequencyTrie<V> {
return nodes;
}
/**
* Validates the serialized edge-label sequence for one node.
*
* <p>
* Compiled nodes rely on binary search for child lookup and therefore require
* edge labels to be stored in strict ascending order without duplicates.
* Rejecting malformed streams here keeps lookup semantics deterministic and
* avoids silently constructing a trie whose search behavior would be undefined.
*
* @param nodeIndex serialized node identifier
* @param edgeLabels serialized edge labels
* @throws IOException if the edge labels are not strictly ascending
*/
private static void validateSerializedEdges(final int nodeIndex, final char... edgeLabels) throws IOException {
for (int edgeIndex = 1; edgeIndex < edgeLabels.length; edgeIndex++) {
if (edgeLabels[edgeIndex - 1] >= edgeLabels[edgeIndex]) {
throw new IOException("Edge labels must be strictly ascending at node " + nodeIndex + ", edge index "
+ edgeIndex + ": '" + edgeLabels[edgeIndex - 1] + "' then '" + edgeLabels[edgeIndex] + "'.");
}
}
}
/**
* Locates the compiled node for the supplied key.
*

View File

@@ -117,7 +117,14 @@ public final class PatchCommandEncoder {
private static final int MISMATCH_PENALTY = 100;
/**
* Extra headroom added when internal matrices need to grow.
* Extra matrix headroom reserved beyond the immediately required dimensions.
*
* <p>
* A small fixed margin reduces repeated reallocation when a caller encodes many
* similarly sized terms in sequence. The value is intentionally modest: large
* enough to absorb minor size fluctuations, yet small enough to avoid
* materially over-allocating the reused dynamic-programming matrices.
* </p>
*/
private static final int CAPACITY_MARGIN = 8;
@@ -288,6 +295,7 @@ public final class PatchCommandEncoder {
* @param patchCommand compact patch command
* @return transformed word, or {@code null} when {@code source} is {@code null}
*/
@SuppressWarnings({ "PMD.CyclomaticComplexity", "PMD.AvoidLiteralsInIfCondition" })
public static String apply(String source, String patchCommand) {
if (source == null) {
return null;
@@ -299,6 +307,10 @@ public final class PatchCommandEncoder {
return source;
}
if ((patchCommand.length() & 1) != 0) {
return source;
}
StringBuilder result = new StringBuilder(source);
if (result.isEmpty()) {
@@ -312,11 +324,14 @@ public final class PatchCommandEncoder {
char opcode = patchCommand.charAt(patchIndex);
char argument = patchCommand.charAt(patchIndex + 1);
int encodedCount = argument - 'a' + 1;
switch (opcode) {
case SKIP_OPCODE:
position = position - encodedCount + 1;
final int skipCount = decodeEncodedCount(argument);
if (skipCount < 1) {
return source;
}
position = position - skipCount + 1;
break;
case REPLACE_OPCODE:
@@ -324,8 +339,12 @@ public final class PatchCommandEncoder {
break;
case DELETE_OPCODE:
final int deleteCount = decodeEncodedCount(argument);
if (deleteCount < 1) {
return source;
}
int deleteEndExclusive = position + 1;
position -= encodedCount - 1;
position -= deleteCount - 1;
result.delete(position, deleteEndExclusive);
break;
@@ -353,6 +372,26 @@ public final class PatchCommandEncoder {
return result.toString();
}
/**
* Decodes a compact count argument used by skip and delete instructions.
*
* <p>
* Valid encoded counts start at {@code 'a'} for one affected character. Values
* below {@code 'a'} are malformed and are reported to callers via the
* compatibility fallback path rather than by throwing a dedicated exception.
* </p>
*
* @param argument serialized count argument
* @return decoded positive count, or {@code -1} when the argument is malformed
*/
@SuppressWarnings("PMD.AvoidLiteralsInIfCondition")
private static int decodeEncodedCount(final char argument) {
if (argument < 'a') {
return -1;
}
return argument - 'a' + 1;
}
/**
* Applies a patch command to an empty source word.
*

View File

@@ -31,6 +31,7 @@
package org.egothor.stemmer.trie;
import java.util.Arrays;
import java.util.Objects;
/**
* Immutable compiled trie node optimized for read access.
@@ -38,7 +39,9 @@ import java.util.Arrays;
* <p>
* The returned arrays are the internal backing storage of the compiled node.
* They are exposed for efficient access by closely related trie infrastructure
* and therefore must never be modified by callers.
* and therefore must never be modified by callers. The node itself is still
* immutable from the public API perspective because construction wires these
* arrays once and all lookup operations thereafter treat them as read-only.
*
* @param <V> value type
* @param edgeLabels internal edge label array
@@ -46,8 +49,90 @@ import java.util.Arrays;
* @param orderedValues internal ordered values array
* @param orderedCounts internal ordered counts array
*/
@SuppressWarnings("PMD.DataClass")
public record CompiledNode<V>(char[] edgeLabels, CompiledNode<V>[] children, V[] orderedValues, int... orderedCounts) {
/**
* Creates one validated compiled node.
*
* @throws NullPointerException if any array argument is {@code null}
* @throws IllegalArgumentException if the edge-related arrays or value-related
* arrays do not have matching lengths
*/
public CompiledNode {
Objects.requireNonNull(edgeLabels, "edgeLabels");
Objects.requireNonNull(children, "children");
Objects.requireNonNull(orderedValues, "orderedValues");
Objects.requireNonNull(orderedCounts, "orderedCounts");
if (edgeLabels.length != children.length) {
throw new IllegalArgumentException("edgeLabels and children must have the same length.");
}
if (orderedValues.length != orderedCounts.length) {
throw new IllegalArgumentException("orderedValues and orderedCounts must have the same length.");
}
}
/**
 * Exposes the backing edge-label array without copying it.
 *
 * <p>
 * The array is shared internal state returned directly for performance;
 * callers must treat it as read-only.
 *
 * @return internal edge-label array
 */
@Override
@SuppressWarnings("PMD.MethodReturnsInternalArray")
public char[] edgeLabels() {
    return edgeLabels;
}
/**
 * Exposes the backing child-node array without copying it.
 *
 * <p>
 * The array is shared internal state returned directly for performance;
 * external callers must treat it as read-only.
 *
 * @return internal child-node array
 */
@Override
@SuppressWarnings("PMD.MethodReturnsInternalArray")
public CompiledNode<V>[] children() {
    return children;
}
/**
 * Exposes the backing ordered-values array without copying it.
 *
 * <p>
 * The array is shared internal state returned directly for performance;
 * callers must treat it as read-only.
 *
 * @return internal ordered-values array
 */
@Override
@SuppressWarnings("PMD.MethodReturnsInternalArray")
public V[] orderedValues() {
    return orderedValues;
}
/**
 * Exposes the backing ordered-counts array without copying it.
 *
 * <p>
 * The array is shared internal state returned directly for performance;
 * callers must treat it as read-only.
 *
 * @return internal ordered-counts array
 */
@Override
@SuppressWarnings("PMD.MethodReturnsInternalArray")
public int[] orderedCounts() {
    return orderedCounts;
}
/**
* Finds a child for the supplied edge character.
*

View File

@@ -30,14 +30,18 @@
******************************************************************************/
package org.egothor.stemmer.trie;
import java.util.Objects;
/**
* Intermediate node data used during deserialization before child references
* are resolved.
*
* <p>
* The arrays exposed by the accessors are the internal backing storage of this
* holder. They are returned directly for efficiency and therefore must be
* treated as read-only by callers.
* holder. They are returned directly for efficiency because the deserialization
* pipeline copies references into immutable compiled nodes immediately after
* the record is created. Callers must therefore treat every returned array as
* read-only.
*
* @param <V> value type
* @param edgeLabels edge labels
@@ -45,6 +49,87 @@ package org.egothor.stemmer.trie;
* @param orderedValues ordered values
* @param orderedCounts ordered counts
*/
@SuppressWarnings("PMD.DataClass")
public record NodeData<V>(char[] edgeLabels, int[] childNodeIds, V[] orderedValues, int... orderedCounts) {
    /**
     * Validates one node-data holder.
     *
     * @throws NullPointerException if any array argument is {@code null}
     * @throws IllegalArgumentException if the edge-related arrays or value-related
     *         arrays do not have matching lengths
     */
    public NodeData {
        Objects.requireNonNull(edgeLabels, "edgeLabels");
        Objects.requireNonNull(childNodeIds, "childNodeIds");
        Objects.requireNonNull(orderedValues, "orderedValues");
        Objects.requireNonNull(orderedCounts, "orderedCounts");
        // Each edge label must pair with exactly one unresolved child identifier.
        if (edgeLabels.length != childNodeIds.length) {
            throw new IllegalArgumentException("edgeLabels and childNodeIds must have the same length.");
        }
        // Each stored value must pair with exactly one frequency count.
        if (orderedValues.length != orderedCounts.length) {
            throw new IllegalArgumentException("orderedValues and orderedCounts must have the same length.");
        }
    }

    /**
     * Exposes the backing edge-label array without copying it.
     *
     * <p>
     * The array is shared internal state returned directly for performance;
     * callers must treat it as read-only.
     *
     * @return internal edge-label array
     */
    @Override
    @SuppressWarnings("PMD.MethodReturnsInternalArray")
    public char[] edgeLabels() {
        return edgeLabels;
    }

    /**
     * Exposes the backing child-node identifier array without copying it.
     *
     * <p>
     * The array is shared internal state returned directly for performance;
     * callers must treat it as read-only.
     *
     * @return internal child-node identifier array
     */
    @Override
    @SuppressWarnings("PMD.MethodReturnsInternalArray")
    public int[] childNodeIds() {
        return childNodeIds;
    }

    /**
     * Exposes the backing ordered-values array without copying it.
     *
     * <p>
     * The array is shared internal state returned directly for performance;
     * callers must treat it as read-only.
     *
     * @return internal ordered-values array
     */
    @Override
    @SuppressWarnings("PMD.MethodReturnsInternalArray")
    public V[] orderedValues() {
        return orderedValues;
    }

    /**
     * Exposes the backing ordered-counts array without copying it.
     *
     * <p>
     * The array is shared internal state returned directly for performance;
     * callers must treat it as read-only.
     *
     * @return internal ordered-counts array
     */
    @Override
    @SuppressWarnings("PMD.MethodReturnsInternalArray")
    public int[] orderedCounts() {
        return orderedCounts;
    }
}

View File

@@ -0,0 +1,53 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Radixor Overview</title>
</head>
<body>
<h1>Radixor</h1>
<p>
Radixor is a high-performance Java toolkit for dictionary-driven stemming based on
the proven Egothor patch-command trie approach. It is designed for production-grade
search and text-processing systems that require deterministic behavior, efficient
runtime execution, and maintainable lexical assets.
</p>
<p>
In addition to compiling and executing stemming dictionaries, Radixor extends the
traditional Egothor model with support for evolving compiled dictionary artifacts
through additional transformation layers. This allows existing lexical resources to
be refined incrementally without requiring full recompilation from source dictionaries.
</p>
<h2>Project Scope</h2>
<ul>
<li>Compilation of Egothor-compatible stemming dictionaries</li>
<li>Runtime stemming over compact compiled trie artifacts</li>
<li>Transformation and reduction infrastructure for lexical processing</li>
<li>CLI and programmatic integration for Java 21 and newer</li>
</ul>
<h2>API Documentation</h2>
<p>
This Javadoc site documents the Java API of the project. For usage guidance,
architectural context, benchmarking methodology, published reports, and general
project documentation, refer to the main project site:
<a href="https://leogalambos.github.io/Radixor/">leogalambos.github.io/Radixor</a>.
</p>
<h2>License</h2>
<p>
Radixor is distributed under the
<a href="https://github.com/leogalambos/Radixor/blob/main/LICENSE">BSD-3-Clause License</a>.
</p>
<h2>Packages</h2>
<p>
The main API is located in <code>org.egothor.stemmer</code>. Supporting trie-oriented
structures and related implementation components are located in
<code>org.egothor.stemmer.trie</code>.
</p>
</body>
</html>

View File

@@ -0,0 +1,218 @@
/*******************************************************************************
* Copyright (C) 2026, Leo Galambos
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
******************************************************************************/
package org.egothor.stemmer;
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertIterableEquals;
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.List;
import net.jqwik.api.ForAll;
import net.jqwik.api.Label;
import net.jqwik.api.Property;
import net.jqwik.api.Tag;
/**
 * Property-based tests for the compiled trie abstraction.
 *
 * <p>
 * The properties below pin down deterministic compilation, alignment of the
 * observable lookup views, stability of binary persistence, and safe
 * reconstruction of a writable builder from a compiled trie. Together they
 * protect the most valuable invariants of the core algorithm without
 * overfitting to particular fixture data.
 */
@Label("FrequencyTrie properties")
@Tag("unit")
@Tag("property")
@Tag("trie")
class FrequencyTrieProperties extends PropertyBasedTestSupport {
    /**
     * Binary codec used by generic trie round-trip assertions.
     */
    private static final FrequencyTrie.ValueStreamCodec<String> STRING_CODEC = new FrequencyTrie.ValueStreamCodec<>() {
        @Override
        public void write(final DataOutputStream output, final String value) throws IOException {
            output.writeUTF(value);
        }

        @Override
        public String read(final DataInputStream input) throws IOException {
            return input.readUTF();
        }
    };

    /**
     * Verifies that compiling the same insertion scenario repeatedly yields the
     * same observable lookups.
     *
     * @param scenario generated trie scenario
     * @param reductionMode reduction mode
     */
    @Property(tries = 80)
    @Label("compilation should be deterministic for the same insertion scenario")
    void compilationShouldBeDeterministicForTheSameInsertionScenario(
            @ForAll("trieScenarios") final TrieScenario scenario, @ForAll final ReductionMode reductionMode) {
        final FrequencyTrie<String> firstCompilation = buildTrie(scenario, reductionMode);
        final FrequencyTrie<String> secondCompilation = buildTrie(scenario, reductionMode);
        scenario.observedKeys().forEach(key -> assertTrieStateEquals(firstCompilation, secondCompilation, key));
    }

    /**
     * Verifies that {@link FrequencyTrie#get(String)},
     * {@link FrequencyTrie#getAll(String)}, and
     * {@link FrequencyTrie#getEntries(String)} remain aligned for every probed key.
     *
     * @param scenario generated trie scenario
     * @param reductionMode reduction mode
     */
    @Property(tries = 80)
    @Label("get, getAll, and getEntries should stay semantically aligned")
    void getGetAllAndGetEntriesShouldStaySemanticallyAligned(@ForAll("trieScenarios") final TrieScenario scenario,
            @ForAll final ReductionMode reductionMode) {
        final FrequencyTrie<String> trie = buildTrie(scenario, reductionMode);
        for (String key : scenario.observedKeys()) {
            final String preferredValue = trie.get(key);
            final String[] rankedValues = trie.getAll(key);
            final List<ValueCount<String>> rankedEntries = trie.getEntries(key);
            assertEquals(rankedValues.length, rankedEntries.size(),
                    "getAll() and getEntries() must have equal cardinality.");
            if (rankedValues.length == 0) {
                // Absent keys must look absent through every lookup view.
                assertNull(preferredValue, "get() must return null when no terminal value exists.");
                assertTrue(rankedEntries.isEmpty(), "getEntries() must be empty when getAll() is empty.");
                continue;
            }
            assertEquals(rankedValues[0], preferredValue, "get() must expose the preferred first getAll() value.");
            int lastSeenCount = Integer.MAX_VALUE;
            for (int position = 0; position < rankedEntries.size(); position++) {
                final ValueCount<String> rankedEntry = rankedEntries.get(position);
                assertEquals(rankedValues[position], rankedEntry.value(),
                        "entry ordering must match getAll() ordering.");
                assertTrue(rankedEntry.count() >= 1, "stored frequencies must remain positive.");
                assertTrue(rankedEntry.count() <= lastSeenCount, "entry counts must be ordered descending.");
                lastSeenCount = rankedEntry.count();
            }
        }
    }

    /**
     * Verifies that binary serialization and deserialization preserve all
     * observable lookup semantics for generated scenarios.
     *
     * @param scenario generated trie scenario
     * @param reductionMode reduction mode
     */
    @Property(tries = 40)
    @Label("binary round-trip should preserve observable trie semantics")
    void binaryRoundTripShouldPreserveObservableTrieSemantics(@ForAll("trieScenarios") final TrieScenario scenario,
            @ForAll final ReductionMode reductionMode) {
        final FrequencyTrie<String> source = buildTrie(scenario, reductionMode);
        final FrequencyTrie<String> reloaded = roundTrip(source);
        scenario.observedKeys().forEach(key -> assertTrieStateEquals(source, reloaded, key));
    }

    /**
     * Verifies that reconstructing a writable builder from a compiled trie and
     * recompiling it preserves observable lookup semantics.
     *
     * @param scenario generated trie scenario
     * @param reductionMode reduction mode
     */
    @Property(tries = 60)
    @Label("builder reconstruction should preserve observable trie semantics")
    void builderReconstructionShouldPreserveObservableTrieSemantics(
            @ForAll("trieScenarios") final TrieScenario scenario, @ForAll final ReductionMode reductionMode) {
        final FrequencyTrie<String> source = buildTrie(scenario, reductionMode);
        final FrequencyTrie<String> recompiled = FrequencyTrieBuilders
                .copyOf(source, STRING_ARRAY_FACTORY, reductionMode).build();
        for (String key : scenario.observedKeys()) {
            assertEquals(source.get(key), recompiled.get(key), "preferred lookup must survive reconstruction.");
            assertArrayEquals(source.getAll(key), recompiled.getAll(key),
                    "complete ordered result set must survive reconstruction.");
        }
    }

    /**
     * Asserts full observable trie equality for one key.
     *
     * @param reference expected trie
     * @param candidate actual trie
     * @param key key to probe
     */
    private static void assertTrieStateEquals(final FrequencyTrie<String> reference,
            final FrequencyTrie<String> candidate, final String key) {
        assertEquals(reference.get(key), candidate.get(key), "preferred lookup drifted.");
        assertArrayEquals(reference.getAll(key), candidate.getAll(key), "ordered result set drifted.");
        assertIterableEquals(reference.getEntries(key), candidate.getEntries(key), "entry list drifted.");
    }

    /**
     * Round-trips one trie through its binary representation.
     *
     * @param trie trie to persist and reload
     * @return reloaded trie
     */
    private static FrequencyTrie<String> roundTrip(final FrequencyTrie<String> trie) {
        final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        try {
            try (DataOutputStream binaryOutput = new DataOutputStream(buffer)) {
                trie.writeTo(binaryOutput, STRING_CODEC);
            }
            try (DataInputStream binaryInput = new DataInputStream(
                    new ByteArrayInputStream(buffer.toByteArray()))) {
                return FrequencyTrie.readFrom(binaryInput, STRING_ARRAY_FACTORY, STRING_CODEC);
            }
        } catch (IOException exception) {
            throw new UncheckedIOException("Unexpected binary round-trip failure.", exception);
        }
    }
}

View File

@@ -733,6 +733,30 @@ class FrequencyTrieTest {
assertTrue(exception.getMessage().contains("Invalid root node id"));
}
/**
 * Verifies that deserialization rejects unsorted or duplicate serialized edge
 * labels, since compiled lookup performs binary search over a strictly
 * ascending edge array.
 */
@Test
@Tag("persistence")
@DisplayName("readFrom rejects non-ascending serialized edge labels")
void readFromRejectsNonAscendingSerializedEdgeLabels() {
    // Single node whose two edges arrive in descending order ('b' before 'a').
    final NodeWriter descendingEdgesNode = dataOutput -> {
        dataOutput.writeInt(2);
        dataOutput.writeChar('b');
        dataOutput.writeInt(0);
        dataOutput.writeChar('a');
        dataOutput.writeInt(0);
        dataOutput.writeInt(0);
    };
    final byte[] serializedBytes = createSerializedStream(0x45475452, 1, 1, 0,
            new NodeWriter[] { descendingEdgesNode });
    final IOException failure = assertThrows(IOException.class,
            () -> FrequencyTrie.readFrom(new ByteArrayInputStream(serializedBytes), String[]::new, STRING_CODEC));
    assertTrue(failure.getMessage().contains("Edge labels must be strictly ascending"));
}
/**
* Verifies that deserialization rejects non-positive stored counts.
*/

View File

@@ -0,0 +1,308 @@
/*******************************************************************************
* Copyright (C) 2026, Leo Galambos
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
******************************************************************************/
package org.egothor.stemmer;
import static org.junit.jupiter.api.Assertions.assertAll;
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertIterableEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Set;
import java.util.function.IntFunction;
import org.junit.jupiter.api.DisplayName;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
/**
 * Deterministic fuzz-style tests for trie compilation and generated stemming
 * dictionaries.
 *
 * <p>
 * These tests exercise bounded pseudo-random inputs with fixed seeds. The suite
 * focuses on invariants that are meaningful for CI: compilation must remain
 * stable, lookups must remain deterministic, binary round-trips must preserve
 * observable behavior, and generated patch commands must reconstruct one of the
 * stems declared by the source dictionary.
 */
@DisplayName("Deterministic fuzz-style trie and stemmer compilation")
@Tag("unit")
@Tag("fuzz")
@Tag("trie")
@Tag("stemming")
class FuzzStemmerAndTrieCompilationTest {
    /**
     * Shared array factory used by generated tries.
     */
    private static final IntFunction<String[]> ARRAY_FACTORY = String[]::new;
    /**
     * Binary codec used for generic trie round-trip assertions.
     */
    private static final FrequencyTrie.ValueStreamCodec<String> STRING_CODEC = new FrequencyTrie.ValueStreamCodec<String>() {
        @Override
        public void write(final DataOutputStream dataOutput, final String value) throws IOException {
            dataOutput.writeUTF(value);
        }
        @Override
        public String read(final DataInputStream dataInput) throws IOException {
            return dataInput.readUTF();
        }
    };
    /**
     * Temporary directory for generated dictionaries and binary artifacts.
     */
    @TempDir
    Path temporaryDirectory;
    /**
     * Verifies that bounded pseudo-random trie insertions compile deterministically
     * and preserve observable semantics across rebuild, binary serialization, and
     * builder reconstruction.
     *
     * @throws IOException if an unexpected binary I/O failure occurs
     */
    @Test
    @DisplayName("generated trie insertions should preserve semantics across compilation forms")
    void generatedTrieInsertionsShouldPreserveSemanticsAcrossCompilationForms() throws IOException {
        // Cover every reduction mode so reduced and unreduced compilation paths are both exercised.
        for (ReductionMode reductionMode : ReductionMode.values()) {
            final ReductionSettings reductionSettings = ReductionSettings.withDefaults(reductionMode);
            for (FuzzTestSupport.TrieCompilationScenario scenario : FuzzTestSupport.trieCompilationScenarios()
                    .toList()) {
                // Four forms of the same trie: fresh compile, repeated compile,
                // binary round-trip, and builder reconstruction.
                final FrequencyTrie<String> compiled = buildTrie(scenario, reductionSettings);
                final FrequencyTrie<String> rebuilt = buildTrie(scenario, reductionSettings);
                final FrequencyTrie<String> roundTripped = roundTrip(compiled);
                final FrequencyTrie<String> reconstructed = FrequencyTrieBuilders.copyOf(compiled, ARRAY_FACTORY,
                        reductionSettings).build();
                for (String key : scenario.observedKeys()) {
                    assertTrieStateEquals(compiled, rebuilt, key,
                            describeScenario("repeated compilation drifted", reductionMode, scenario, key));
                    assertTrieStateEquals(compiled, roundTripped, key,
                            describeScenario("binary round-trip drifted", reductionMode, scenario, key));
                    // Reconstruction only guarantees lookup semantics, not local counts.
                    assertTrieLookupSemanticsEqual(compiled, reconstructed, key,
                            describeScenario("builder reconstruction drifted", reductionMode, scenario, key));
                }
            }
        }
    }
    /**
     * Verifies that generated dictionaries compile without failure and that the
     * preferred patch command for each generated word reconstructs one acceptable
     * source stem.
     *
     * @throws IOException if the generated dictionary cannot be written or read
     */
    @Test
    @DisplayName("generated dictionaries should compile and stem consistently")
    void generatedDictionariesShouldCompileAndStemConsistently() throws IOException {
        for (ReductionMode reductionMode : ReductionMode.values()) {
            for (FuzzTestSupport.StemmerDictionaryScenario scenario : FuzzTestSupport.stemmerDictionaryScenarios()
                    .toList()) {
                final Path dictionaryFile = this.temporaryDirectory
                        .resolve("fuzz-dictionary-" + reductionMode.name() + "-" + scenario.seed() + ".txt");
                Files.writeString(dictionaryFile, scenario.dictionaryContent(), StandardCharsets.UTF_8);
                final FrequencyTrie<String> trie = assertDoesNotThrow(
                        () -> StemmerPatchTrieLoader.load(dictionaryFile, true, reductionMode),
                        describeScenario("generated dictionary must compile", reductionMode, scenario, null));
                for (String word : scenario.expectedStemsByWord().keySet()) {
                    final Set<String> acceptableStems = scenario.expectedStemsByWord().get(word);
                    final String preferredPatch = trie.get(word);
                    final String[] allPatches = trie.getAll(word);
                    // assertAll collects all four failures per word instead of stopping at the first.
                    assertAll(
                            () -> assertTrue(preferredPatch != null && !preferredPatch.isEmpty(),
                                    describeScenario("preferred patch must exist", reductionMode, scenario, word)),
                            () -> assertTrue(allPatches.length >= 1,
                                    describeScenario("at least one patch must exist", reductionMode, scenario, word)),
                            () -> assertTrue(acceptableStems.contains(PatchCommandEncoder.apply(word, preferredPatch)),
                                    describeScenario("preferred patch reconstructed an unexpected stem",
                                            reductionMode, scenario, word)),
                            () -> assertTrue(allPatchesProduceOnlyAcceptableStems(word, allPatches, acceptableStems),
                                    describeScenario("getAll() contained a patch outside the accepted stem set",
                                            reductionMode, scenario, word)));
                }
            }
        }
    }
    /**
     * Verifies that binary persistence of generated stemmer tries preserves all
     * observable lookups for the generated vocabulary.
     *
     * @throws IOException if persistence unexpectedly fails
     */
    @Test
    @DisplayName("generated stemmer tries should survive binary persistence")
    void generatedStemmerTriesShouldSurviveBinaryPersistence() throws IOException {
        for (FuzzTestSupport.StemmerDictionaryScenario scenario : FuzzTestSupport.stemmerDictionaryScenarios()
                .toList()) {
            final Path dictionaryFile = this.temporaryDirectory.resolve("binary-fuzz-" + scenario.seed() + ".txt");
            final Path binaryFile = this.temporaryDirectory.resolve("binary-fuzz-" + scenario.seed() + ".dat.gz");
            Files.writeString(dictionaryFile, scenario.dictionaryContent(), StandardCharsets.UTF_8);
            final FrequencyTrie<String> original = StemmerPatchTrieLoader.load(dictionaryFile, true,
                    ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
            StemmerPatchTrieLoader.saveBinary(original, binaryFile);
            final FrequencyTrie<String> reloaded = StemmerPatchTrieLoader.loadBinary(binaryFile);
            for (String word : scenario.expectedStemsByWord().keySet()) {
                assertTrieStateEquals(original, reloaded, word,
                        "Binary stemmer round-trip drifted for seed=" + scenario.seed() + ", word='" + word + "'.");
            }
        }
    }
    /**
     * Builds one trie from the supplied generated scenario.
     *
     * @param scenario generated scenario
     * @param reductionSettings reduction settings
     * @return compiled trie
     */
    private static FrequencyTrie<String> buildTrie(final FuzzTestSupport.TrieCompilationScenario scenario,
            final ReductionSettings reductionSettings) {
        final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(ARRAY_FACTORY, reductionSettings);
        for (FuzzTestSupport.TrieInsertion insertion : scenario.insertions()) {
            builder.put(insertion.key(), insertion.value(), insertion.count());
        }
        return builder.build();
    }
    /**
     * Performs a generic binary round-trip of a compiled trie.
     *
     * @param trie source trie
     * @return deserialized trie
     * @throws IOException if persistence fails
     */
    private static FrequencyTrie<String> roundTrip(final FrequencyTrie<String> trie) throws IOException {
        final ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
        trie.writeTo(outputStream, STRING_CODEC);
        return FrequencyTrie.readFrom(new ByteArrayInputStream(outputStream.toByteArray()), ARRAY_FACTORY, STRING_CODEC);
    }
    /**
     * Compares all observable lookup views for one key.
     *
     * @param expected reference trie
     * @param actual candidate trie
     * @param key key to inspect
     * @param failureMessage assertion message
     */
    private static void assertTrieStateEquals(final FrequencyTrie<String> expected, final FrequencyTrie<String> actual,
            final String key, final String failureMessage) {
        assertAll(
                () -> assertEquals(expected.get(key), actual.get(key), failureMessage),
                () -> assertArrayEquals(expected.getAll(key), actual.getAll(key), failureMessage),
                () -> assertIterableEquals(expected.getEntries(key), actual.getEntries(key), failureMessage));
    }
    /**
     * Compares only lookup semantics that are expected to survive reconstruction
     * from a reduced compiled trie.
     *
     * <p>
     * Some reduction modes intentionally ignore absolute local frequencies when
     * identifying equivalent subtrees. Reconstructing a mutable builder from the
     * reduced compiled form and compiling it again must therefore preserve
     * observable lookup semantics, but it does not necessarily preserve original
     * local counts reported by {@link FrequencyTrie#getEntries(String)}.
     *
     * @param expected reference trie
     * @param actual candidate trie
     * @param key key to inspect
     * @param failureMessage assertion message
     */
    private static void assertTrieLookupSemanticsEqual(final FrequencyTrie<String> expected,
            final FrequencyTrie<String> actual, final String key, final String failureMessage) {
        assertAll(
                () -> assertEquals(expected.get(key), actual.get(key), failureMessage),
                () -> assertArrayEquals(expected.getAll(key), actual.getAll(key), failureMessage));
    }
    /**
     * Verifies that every patch in the array reconstructs one acceptable stem.
     *
     * @param word original surface form
     * @param patches patch commands
     * @param acceptableStems acceptable stems
     * @return {@code true} when all patches are acceptable
     */
    private static boolean allPatchesProduceOnlyAcceptableStems(final String word, final String[] patches,
            final Set<String> acceptableStems) {
        for (String patch : patches) {
            if (!acceptableStems.contains(PatchCommandEncoder.apply(word, patch))) {
                return false;
            }
        }
        return true;
    }
    /**
     * Builds a contextual assertion message.
     *
     * @param prefix failure prefix
     * @param reductionMode reduction mode under test
     * @param scenario source scenario
     * @param word current word or key, may be {@code null}
     * @return contextual message
     */
    private static String describeScenario(final String prefix, final ReductionMode reductionMode, final Object scenario,
            final String word) {
        final StringBuilder builder = new StringBuilder(128);
        builder.append(prefix).append(". reductionMode=").append(reductionMode).append(", scenario=")
                .append(scenario);
        if (word != null) {
            builder.append(", token='").append(word).append('\'');
        }
        return builder.toString();
    }
}

View File

@@ -0,0 +1,339 @@
/*******************************************************************************
* Copyright (C) 2026, Leo Galambos
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
******************************************************************************/
package org.egothor.stemmer;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Random;
import java.util.Set;
import java.util.stream.Stream;
/**
* Deterministic support utilities for fuzz-style tests of trie compilation and
* stemming dictionary loading.
*
* <p>
* The generators in this helper intentionally use bounded input sizes and fixed
* seeds so that the resulting tests remain reproducible and suitable for CI.
* The goal is not statistical randomness, but broad structured coverage of
* unusual combinations that are cumbersome to author manually.
*/
final class FuzzTestSupport {
/**
* Shared deterministic seeds used across all generated scenarios.
*/
private static final long[] SEEDS = { 7L, 19L, 43L, 71L, 101L, 211L };
/**
* Lower-case alphabet used for generated word material.
*/
private static final char[] ALPHABET = "abcdefghijklmnopqrstuvwxyz".toCharArray();
/**
 * Prevents instantiation: this class is a static holder for deterministic
 * scenario generators and has no per-instance state.
 */
private FuzzTestSupport() {
    throw new AssertionError("No instances.");
}
/**
 * Returns deterministic trie-compilation scenarios, one per shared seed.
 *
 * @return stream of bounded deterministic scenarios
 */
static Stream<TrieCompilationScenario> trieCompilationScenarios() {
    final Stream.Builder<TrieCompilationScenario> generated = Stream.builder();
    for (long seed : SEEDS) {
        generated.add(createTrieCompilationScenario(seed));
    }
    return generated.build();
}
/**
 * Returns deterministic stemmer-dictionary scenarios, one per shared seed.
 *
 * @return stream of bounded deterministic scenarios
 */
static Stream<StemmerDictionaryScenario> stemmerDictionaryScenarios() {
    final Stream.Builder<StemmerDictionaryScenario> generated = Stream.builder();
    for (long seed : SEEDS) {
        generated.add(createStemmerDictionaryScenario(seed));
    }
    return generated.build();
}
/**
 * Creates one trie scenario with repeated insertions, empty-key coverage, and a
 * stable set of observed keys.
 *
 * <p>
 * The sequence of {@link Random} calls below must stay exactly as written:
 * the generated scenario is reproduced from the seed alone, so reordering or
 * adding draws would change every derived test input.
 *
 * @param seed deterministic seed
 * @return generated scenario
 */
private static TrieCompilationScenario createTrieCompilationScenario(final long seed) {
    final Random rng = new Random(seed);
    final List<TrieInsertion> generatedInsertions = new ArrayList<>();
    final Set<String> probedKeys = new LinkedHashSet<>();
    probedKeys.add("");
    final int totalInsertions = 50 + rng.nextInt(15);
    for (int insertion = 0; insertion < totalInsertions; insertion++) {
        // Roughly one in eight keys is empty to cover root-value handling.
        final String key;
        if (rng.nextInt(8) == 0) {
            key = "";
        } else {
            key = nextWord(rng, 1, 10);
        }
        final String value = nextWord(rng, 0, 8);
        final int frequency = 1 + rng.nextInt(4);
        generatedInsertions.add(new TrieInsertion(key, value, frequency));
        probedKeys.add(key);
        // The nextBoolean() draw happens only for non-empty keys, mirroring
        // the original short-circuit so the RNG sequence is unchanged.
        if (!key.isEmpty() && rng.nextBoolean()) {
            probedKeys.add(key.substring(0, Math.max(0, key.length() - 1)));
        }
        // Also probe a fresh word that was likely never inserted.
        probedKeys.add(nextWord(rng, 1, 8));
    }
    return new TrieCompilationScenario(seed, List.copyOf(generatedInsertions), List.copyOf(probedKeys));
}
/**
* Creates one dictionary scenario made of compact stem-to-variants groups.
*
* @param seed deterministic seed
* @return generated scenario
*/
private static StemmerDictionaryScenario createStemmerDictionaryScenario(final long seed) {
final Random random = new Random(seed);
final Map<String, Set<String>> expectedStemsByWord = new LinkedHashMap<>();
final StringBuilder dictionary = new StringBuilder(512);
dictionary.append("# deterministic fuzz dictionary seed ").append(seed).append('\n');
dictionary.append("// blank and remark handling is part of the exercised input\n\n");
final int entryCount = 18 + random.nextInt(8);
for (int index = 0; index < entryCount; index++) {
final String stem = nextWord(random, 1, 8);
final LinkedHashSet<String> variants = new LinkedHashSet<>();
final int variantCount = 1 + random.nextInt(4);
while (variants.size() < variantCount) {
if (random.nextInt(6) == 0) {
variants.add(stem);
} else {
variants.add(createVariant(random, stem));
}
}
dictionary.append(stem);
for (String variant : variants) {
dictionary.append(' ').append(variant);
expectedStemsByWord.computeIfAbsent(variant, ignored -> new LinkedHashSet<>()).add(stem);
}
dictionary.append(" # entry ").append(index).append('\n');
if (random.nextInt(5) == 0) {
dictionary.append("\n");
}
}
return new StemmerDictionaryScenario(seed, dictionary.toString(), immutableMapOfSets(expectedStemsByWord));
}
/**
* Creates a variant related to a supplied stem.
*
* @param random source of deterministic pseudo-randomness
* @param stem canonical stem
* @return generated variant
*/
private static String createVariant(final Random random, final String stem) {
final int mode = random.nextInt(6);
switch (mode) {
case 0:
return stem + suffix(random);
case 1:
return prefix(random) + stem;
case 2:
return stem.length() > 1 ? stem.substring(0, stem.length() - 1) + nextLetter(random) : stem + nextLetter(random);
case 3:
return stem + nextLetter(random) + nextLetter(random);
case 4:
return stem.length() > 2 ? stem.substring(0, stem.length() - 2) : stem;
default:
return new StringBuilder(stem).reverse().append(nextLetter(random)).toString();
}
}
/**
* Returns a generated word in lower case.
*
* @param random source of deterministic pseudo-randomness
* @param minLength minimum inclusive length
* @param maxLength maximum inclusive length
* @return generated word
*/
private static String nextWord(final Random random, final int minLength, final int maxLength) {
final int length = minLength + random.nextInt(maxLength - minLength + 1);
final StringBuilder builder = new StringBuilder(length);
for (int index = 0; index < length; index++) {
builder.append(nextLetter(random));
}
return builder.toString().toLowerCase(Locale.ROOT);
}
/**
* Returns one generated prefix fragment.
*
* @param random source of deterministic pseudo-randomness
* @return prefix fragment
*/
private static String prefix(final Random random) {
return String.valueOf(nextLetter(random));
}
/**
* Returns one generated suffix fragment.
*
* @param random source of deterministic pseudo-randomness
* @return suffix fragment
*/
private static String suffix(final Random random) {
final String[] suffixes = { "s", "ed", "ing", "er", "ly", "ness", "ment" };
return suffixes[random.nextInt(suffixes.length)];
}
/**
* Returns one generated lower-case letter.
*
* @param random source of deterministic pseudo-randomness
* @return generated character
*/
private static char nextLetter(final Random random) {
return ALPHABET[random.nextInt(ALPHABET.length)];
}
/**
* Creates an immutable map view whose nested sets are also immutable.
*
* @param source mutable source map
* @return immutable copy
*/
private static Map<String, Set<String>> immutableMapOfSets(final Map<String, Set<String>> source) {
final Map<String, Set<String>> copy = new LinkedHashMap<>(source.size());
for (Map.Entry<String, Set<String>> entry : source.entrySet()) {
copy.put(entry.getKey(), Set.copyOf(entry.getValue()));
}
return Map.copyOf(copy);
}
/**
* Generated trie scenario for deterministic fuzz testing.
*
* @param seed deterministic seed
* @param insertions generated insertions to apply to the builder
* @param observedKeys keys that should be checked after compilation
*/
record TrieCompilationScenario(long seed, List<TrieInsertion> insertions, List<String> observedKeys) {
/**
* Creates a validated scenario.
*
* @param seed deterministic seed
* @param insertions generated insertions to apply to the builder
* @param observedKeys keys that should be checked after compilation
*/
TrieCompilationScenario {
Objects.requireNonNull(insertions, "insertions");
Objects.requireNonNull(observedKeys, "observedKeys");
}
@Override
public String toString() {
return "seed=" + this.seed;
}
}
/**
* One generated insertion into a trie builder.
*
* @param key target key
* @param value stored value
* @param count positive occurrence count
*/
record TrieInsertion(String key, String value, int count) {
/**
* Creates a validated insertion.
*
* @param key target key
* @param value stored value
* @param count positive occurrence count
*/
TrieInsertion {
Objects.requireNonNull(key, "key");
Objects.requireNonNull(value, "value");
if (count < 1) {
throw new IllegalArgumentException("count must be positive.");
}
}
}
/**
* Generated dictionary scenario for deterministic fuzz testing of stemming.
*
* @param seed deterministic seed
* @param dictionaryContent generated dictionary content
* @param expectedStemsByWord acceptable stems for each generated word
*/
record StemmerDictionaryScenario(long seed, String dictionaryContent, Map<String, Set<String>> expectedStemsByWord) {
/**
* Creates a validated scenario.
*
* @param seed deterministic seed
* @param dictionaryContent generated dictionary content
* @param expectedStemsByWord acceptable stems for each generated word
*/
StemmerDictionaryScenario {
Objects.requireNonNull(dictionaryContent, "dictionaryContent");
Objects.requireNonNull(expectedStemsByWord, "expectedStemsByWord");
}
@Override
public String toString() {
return "seed=" + this.seed;
}
}
}

View File

@@ -0,0 +1,93 @@
/*******************************************************************************
* Copyright (C) 2026, Leo Galambos
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
******************************************************************************/
package org.egothor.stemmer;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import net.jqwik.api.ForAll;
import net.jqwik.api.Label;
import net.jqwik.api.Property;
import net.jqwik.api.Tag;
/**
 * Property-based tests for {@link PatchCommandEncoder}.
 *
 * <p>
 * Two contracts are pinned here: an encoded patch must round-trip its source
 * word back to the exact requested target, and encoding the same pair must
 * always yield the same patch, regardless of which encoder instance is used.
 */
@Label("PatchCommandEncoder properties")
@Tag("unit")
@Tag("property")
@Tag("patch")
class PatchCommandEncoderProperties extends PropertyBasedTestSupport {
    /**
     * Verifies that encoding followed by application reconstructs the original
     * target word for bounded generated inputs.
     *
     * @param source source word
     * @param target target word
     */
    @Property(tries = 200)
    @Label("encode followed by apply should reconstruct the target word")
    void encodeFollowedByApplyShouldReconstructTheTargetWord(@ForAll("words") final String source,
            @ForAll("words") final String target) {
        final PatchCommandEncoder patchEncoder = new PatchCommandEncoder();
        final String encodedPatch = patchEncoder.encode(source, target);
        assertNotNull(encodedPatch, "patch generation must succeed for non-null inputs.");
        final String reconstructed = PatchCommandEncoder.apply(source, encodedPatch);
        assertEquals(target, reconstructed,
                "applying the encoded patch must reconstruct the target word.");
    }
    /**
     * Verifies that encoding is deterministic for the same source-target pair, both
     * within one encoder instance and across fresh instances.
     *
     * @param source source word
     * @param target target word
     */
    @Property(tries = 150)
    @Label("encode should be deterministic for one source-target pair")
    void encodeShouldBeDeterministicForOneSourceTargetPair(@ForAll("words") final String source,
            @ForAll("words") final String target) {
        final PatchCommandEncoder reusedEncoder = new PatchCommandEncoder();
        final String firstPatch = reusedEncoder.encode(source, target);
        final String secondPatch = reusedEncoder.encode(source, target);
        final String freshInstancePatch = new PatchCommandEncoder().encode(source, target);
        assertEquals(firstPatch, secondPatch, "one encoder instance must produce stable output.");
        assertEquals(firstPatch, freshInstancePatch, "fresh encoder instances must produce the same patch output.");
    }
}

View File

@@ -174,7 +174,13 @@ class PatchCommandEncoderTest {
// 9
Arguments.of(9, "", "-a"),
// 10
Arguments.of(10, "", "Ra"));
Arguments.of(10, "", "Ra"),
// 11
Arguments.of(11, "abc", "D`"),
// 12
Arguments.of(12, "abc", "-`"),
// 13
Arguments.of(13, "", "D`"));
}
/**

View File

@@ -0,0 +1,326 @@
/*******************************************************************************
* Copyright (C) 2026, Leo Galambos
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
******************************************************************************/
package org.egothor.stemmer;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Objects;
import java.util.Set;
import java.util.function.IntFunction;
import net.jqwik.api.Arbitraries;
import net.jqwik.api.Arbitrary;
import net.jqwik.api.Combinators;
import net.jqwik.api.Provide;
import net.jqwik.api.arbitraries.ListArbitrary;
/**
 * Shared jqwik generators and helpers for property-based tests covering the
 * Radixor algorithmic core.
 *
 * <p>
 * The generated domains are intentionally bounded to keep CI execution time
 * predictable while still exploring a broad range of trie shapes, duplicate
 * insertions, missing lookups, and patch-command transformations.
 */
abstract class PropertyBasedTestSupport {
    /**
     * Shared array factory for string tries.
     */
    protected static final IntFunction<String[]> STRING_ARRAY_FACTORY = String[]::new;
    /**
     * Provides bounded lowercase words suitable for trie keys, stems, and patch
     * encoder inputs.
     *
     * <p>
     * The 12-letter alphabet ('a'..'l') is deliberately small so that generated
     * words collide and share prefixes often, which is what stresses trie
     * structure; may include the empty string.
     *
     * @return bounded word generator
     */
    @Provide
    protected Arbitrary<String> words() {
        return Arbitraries.strings().withChars('a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l')
                .ofMinLength(0).ofMaxLength(12);
    }
    /**
     * Provides non-empty lowercase words suitable for dictionary variants and
     * stems.
     *
     * @return bounded non-empty word generator
     */
    @Provide
    protected Arbitrary<String> nonEmptyWords() {
        return Arbitraries.strings().withChars('a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l')
                .ofMinLength(1).ofMaxLength(12);
    }
    /**
     * Provides bounded insertion scenarios for trie-focused properties.
     *
     * <p>
     * Observed keys always include every inserted key plus up to 16 extra
     * probes, so both hit and (likely) miss lookups are exercised.
     *
     * @return trie scenario generator
     */
    @Provide
    protected Arbitrary<TrieScenario> trieScenarios() {
        final Arbitrary<TrieInsertion> insertionArbitrary = Combinators
                .combine(words(), nonEmptyWords(), Arbitraries.integers().between(1, 5)).as(TrieInsertion::new);
        final ListArbitrary<TrieInsertion> insertions = insertionArbitrary.list().ofMinSize(1).ofMaxSize(24);
        final Arbitrary<List<String>> observedKeys = words().list().ofMinSize(0).ofMaxSize(16);
        return Combinators.combine(insertions, observedKeys)
                .as((scenarioInsertions, additionalObservedKeys) -> new TrieScenario(scenarioInsertions,
                        mergeObservedKeys(scenarioInsertions, additionalObservedKeys)));
    }
    /**
     * Provides bounded stemmer scenarios where each variant word maps to one or
     * more acceptable stems.
     *
     * <p>
     * Each entry's stem is force-added to its own variant set, so every stem is
     * also an observable input word of its entry.
     *
     * @return stemmer scenario generator
     */
    @Provide
    protected Arbitrary<StemmerScenario> stemmerScenarios() {
        final Arbitrary<StemmerEntry> entryArbitrary = Combinators
                .combine(nonEmptyWords(), nonEmptyWords().set().ofMinSize(1).ofMaxSize(4)).as((stem, variants) -> {
                    final LinkedHashSet<String> normalizedVariants = new LinkedHashSet<>(variants);
                    normalizedVariants.add(stem);
                    return new StemmerEntry(stem, normalizedVariants);
                });
        return entryArbitrary.list().ofMinSize(1).ofMaxSize(10).map(StemmerScenario::new);
    }
    /**
     * Builds a compiled trie from one generated scenario.
     *
     * @param scenario trie scenario
     * @param reductionMode reduction mode
     * @return compiled trie
     */
    protected FrequencyTrie<String> buildTrie(final TrieScenario scenario, final ReductionMode reductionMode) {
        Objects.requireNonNull(scenario, "scenario");
        Objects.requireNonNull(reductionMode, "reductionMode");
        final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(STRING_ARRAY_FACTORY, reductionMode);
        for (TrieInsertion insertion : scenario.insertions()) {
            builder.put(insertion.key(), insertion.value(), insertion.count());
        }
        return builder.build();
    }
    /**
     * Builds a patch-command trie from one generated stemmer scenario.
     *
     * @param scenario stemmer scenario
     * @param reductionMode reduction mode
     * @param storeOriginal whether original stems should be stored using the
     *                      canonical no-op patch
     * @return compiled patch-command trie
     */
    protected FrequencyTrie<String> buildStemmerTrie(final StemmerScenario scenario, final ReductionMode reductionMode,
            final boolean storeOriginal) {
        Objects.requireNonNull(scenario, "scenario");
        Objects.requireNonNull(reductionMode, "reductionMode");
        final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(STRING_ARRAY_FACTORY, reductionMode);
        final PatchCommandEncoder encoder = new PatchCommandEncoder();
        for (StemmerEntry entry : scenario.entries()) {
            if (storeOriginal) {
                // NOOP_PATCH maps the stem to itself; the two-argument put()
                // presumably applies a default count — confirm against the builder API.
                builder.put(entry.stem(), PatchCommandEncoder.NOOP_PATCH);
            }
            for (String variant : entry.variants()) {
                // Stems were force-added to their own variant sets by the generator;
                // skip them here so they are only stored via the no-op patch above.
                if (!variant.equals(entry.stem())) {
                    builder.put(variant, encoder.encode(variant, entry.stem()));
                }
            }
        }
        return builder.build();
    }
    /**
     * Merges observed lookup keys while preserving order and keeping scenario keys
     * relevant to actual trie content.
     *
     * @param insertions inserted trie mappings
     * @param additionalObservedKeys extra lookup probes
     * @return merged lookup-key set
     */
    private static Set<String> mergeObservedKeys(final List<TrieInsertion> insertions,
            final List<String> additionalObservedKeys) {
        final LinkedHashSet<String> observedKeys = new LinkedHashSet<>();
        for (TrieInsertion insertion : insertions) {
            observedKeys.add(insertion.key());
        }
        observedKeys.addAll(additionalObservedKeys);
        return observedKeys;
    }
    /**
     * Generated insertion into a trie builder.
     *
     * @param key trie key
     * @param value stored value
     * @param count positive insertion count
     */
    protected record TrieInsertion(String key, String value, int count) {
        /**
         * Creates a validated insertion descriptor.
         *
         * @param key trie key
         * @param value stored value
         * @param count positive insertion count
         */
        public TrieInsertion {
            Objects.requireNonNull(key, "key");
            Objects.requireNonNull(value, "value");
            if (count < 1) {
                throw new IllegalArgumentException("count must be at least 1.");
            }
        }
    }
    /**
     * Generated trie scenario used by multiple properties.
     *
     * @param insertions generated insertions
     * @param observedKeys lookup probes
     */
    protected record TrieScenario(List<TrieInsertion> insertions, Set<String> observedKeys) {
        /**
         * Creates a validated trie scenario.
         *
         * @param insertions generated insertions
         * @param observedKeys lookup probes
         */
        public TrieScenario {
            Objects.requireNonNull(insertions, "insertions");
            Objects.requireNonNull(observedKeys, "observedKeys");
            // copyOf produces immutable defensive copies and rejects null elements;
            // the copies are made before validation so the stored state is final.
            insertions = List.copyOf(insertions);
            observedKeys = Set.copyOf(observedKeys);
            if (insertions.isEmpty()) {
                throw new IllegalArgumentException("insertions must not be empty.");
            }
        }
        // Compact summary keeps jqwik failure reports readable.
        @Override
        public String toString() {
            return "TrieScenario[insertions=" + this.insertions.size() + ", observedKeys=" + this.observedKeys.size()
                    + "]";
        }
    }
    /**
     * Generated stemmer dictionary line equivalent.
     *
     * @param stem canonical stem
     * @param variants variants accepted for the stem
     */
    protected record StemmerEntry(String stem, Set<String> variants) {
        /**
         * Creates a validated stemmer entry.
         *
         * @param stem canonical stem
         * @param variants variants accepted for the stem
         */
        public StemmerEntry {
            Objects.requireNonNull(stem, "stem");
            Objects.requireNonNull(variants, "variants");
            // Immutable defensive copy; also rejects null elements.
            variants = Set.copyOf(variants);
            if (stem.isEmpty()) {
                throw new IllegalArgumentException("stem must not be empty.");
            }
            if (variants.isEmpty()) {
                throw new IllegalArgumentException("variants must not be empty.");
            }
        }
    }
    /**
     * Generated stemmer scenario used by patch-command trie properties.
     *
     * @param entries generated entries
     */
    protected record StemmerScenario(List<StemmerEntry> entries) {
        /**
         * Creates a validated stemmer scenario.
         *
         * @param entries generated entries
         */
        public StemmerScenario {
            Objects.requireNonNull(entries, "entries");
            // Immutable defensive copy; also rejects null elements.
            entries = List.copyOf(entries);
            if (entries.isEmpty()) {
                throw new IllegalArgumentException("entries must not be empty.");
            }
        }
        /**
         * Returns all known source words that should be probeable in the resulting
         * trie.
         *
         * @return observed lookup words
         */
        public Set<String> observedWords() {
            final LinkedHashSet<String> observedWords = new LinkedHashSet<>();
            for (StemmerEntry entry : this.entries) {
                observedWords.add(entry.stem());
                observedWords.addAll(entry.variants());
            }
            return observedWords;
        }
        /**
         * Returns all acceptable stems for one observed word.
         *
         * <p>
         * A word may appear under several entries, so more than one stem can be
         * acceptable for it.
         *
         * @param word observed word
         * @return acceptable stems
         */
        public Set<String> acceptableStemsFor(final String word) {
            final LinkedHashSet<String> stems = new LinkedHashSet<>();
            for (StemmerEntry entry : this.entries) {
                if (entry.stem().equals(word) || entry.variants().contains(word)) {
                    stems.add(entry.stem());
                }
            }
            return stems;
        }
        // Compact summary keeps jqwik failure reports readable.
        @Override
        public String toString() {
            return "StemmerScenario[entries=" + this.entries.size() + "]";
        }
    }
}

View File

@@ -0,0 +1,151 @@
/*******************************************************************************
* Copyright (C) 2026, Leo Galambos
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
******************************************************************************/
package org.egothor.stemmer;
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.LinkedHashSet;
import java.util.Set;
import net.jqwik.api.ForAll;
import net.jqwik.api.Label;
import net.jqwik.api.Property;
import net.jqwik.api.Tag;
/**
 * Property-based tests for patch-command stemmer tries.
 *
 * <p>
 * Two semantic contracts of compiled stemmer dictionaries are pinned: each
 * patch a trie returns for a known input word must decode to a stem the
 * source scenario declared acceptable, and persisting the trie in compressed
 * binary form must leave every lookup result unchanged.
 */
@Label("Stemmer patch trie properties")
@Tag("unit")
@Tag("property")
@Tag("stemming")
class StemmerPatchTrieProperties extends PropertyBasedTestSupport {
    /**
     * Verifies that every returned patch reconstructs only acceptable stems for the
     * observed word set represented by one generated stemmer scenario.
     *
     * @param scenario generated stemmer scenario
     * @param reductionMode reduction mode
     */
    @Property(tries = 60)
    @Label("returned patches should reconstruct only acceptable stems")
    void returnedPatchesShouldReconstructOnlyAcceptableStems(@ForAll("stemmerScenarios") final StemmerScenario scenario,
            @ForAll final ReductionMode reductionMode) {
        final FrequencyTrie<String> compiledTrie = buildStemmerTrie(scenario, reductionMode, true);
        for (String word : scenario.observedWords()) {
            final Set<String> declaredStems = scenario.acceptableStemsFor(word);
            final String bestPatch = compiledTrie.get(word);
            final String[] everyPatch = compiledTrie.getAll(word);
            final boolean hasUsableBestPatch = bestPatch != null && !bestPatch.isEmpty();
            assertTrue(hasUsableBestPatch, "preferred patch must exist for an observed word.");
            assertTrue(everyPatch.length >= 1, "at least one patch must exist for an observed word.");
            final String bestStem = PatchCommandEncoder.apply(word, bestPatch);
            assertTrue(declaredStems.contains(bestStem), "preferred patch reconstructed an unexpected stem.");
            final Set<String> decodedStems = applyAll(word, everyPatch);
            assertTrue(declaredStems.containsAll(decodedStems),
                    "getAll() must not expose a patch that reconstructs an undeclared stem.");
            if (declaredStems.contains(word)) {
                assertTrue(decodedStems.contains(word),
                        "storeOriginal semantics must preserve the original stem among returned results.");
            }
        }
    }
    /**
     * Verifies that GZip-compressed binary persistence preserves patch-command trie
     * lookups.
     *
     * @param scenario generated stemmer scenario
     * @param reductionMode reduction mode
     */
    @Property(tries = 30)
    @Label("binary persistence should preserve patch-command trie lookups")
    void binaryPersistenceShouldPreservePatchCommandTrieLookups(
            @ForAll("stemmerScenarios") final StemmerScenario scenario, @ForAll final ReductionMode reductionMode) {
        final FrequencyTrie<String> sourceTrie = buildStemmerTrie(scenario, reductionMode, true);
        final FrequencyTrie<String> reloadedTrie = roundTripCompressed(sourceTrie);
        for (String word : scenario.observedWords()) {
            assertEquals(sourceTrie.get(word), reloadedTrie.get(word),
                    "preferred patch lookup drifted after persistence.");
            assertArrayEquals(sourceTrie.getAll(word), reloadedTrie.getAll(word),
                    "complete patch result set drifted after persistence.");
        }
    }
    /**
     * Applies all returned patches to the supplied source word.
     *
     * @param source source word
     * @param patches returned patches
     * @return decoded stem set
     */
    private static Set<String> applyAll(final String source, final String[] patches) {
        final LinkedHashSet<String> decoded = new LinkedHashSet<>();
        for (String currentPatch : patches) {
            decoded.add(PatchCommandEncoder.apply(source, currentPatch));
        }
        return decoded;
    }
    /**
     * Round-trips one patch-command trie through the compressed binary helper.
     *
     * @param trie trie to persist and reload
     * @return reloaded trie
     */
    private static FrequencyTrie<String> roundTripCompressed(final FrequencyTrie<String> trie) {
        final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        try {
            StemmerPatchTrieBinaryIO.write(trie, buffer);
            return StemmerPatchTrieBinaryIO.read(new ByteArrayInputStream(buffer.toByteArray()));
        } catch (IOException exception) {
            // In-memory streams should never raise IOException; surface it unchecked.
            throw new UncheckedIOException("Unexpected compressed binary round-trip failure.", exception);
        }
    }
}

View File

@@ -0,0 +1,148 @@
/*******************************************************************************
* Copyright (C) 2026, Leo Galambos
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. All advertising materials mentioning features or use of this software must
* display the following acknowledgement:
* This product includes software developed by the Egothor project.
*
* 4. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
******************************************************************************/
package org.egothor.stemmer.trie;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertSame;
import static org.junit.jupiter.api.Assertions.assertThrows;
import org.junit.jupiter.api.DisplayName;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
/**
 * Unit tests for {@link CompiledNode} and {@link NodeData} validation and
 * documented backing-array exposure.
 */
@Tag("unit")
@Tag("fast")
@Tag("trie")
@DisplayName("CompiledNode and NodeData")
class CompiledNodeAndNodeDataTest {
    /**
     * Verifies that {@link NodeData} rejects mismatched edge-related array lengths.
     */
    @Test
    @DisplayName("NodeData rejects mismatched edge arrays")
    void nodeDataShouldRejectMismatchedEdgeArrays() {
        final char[] oneLabel = { 'a' };
        final IllegalArgumentException thrown = assertThrows(IllegalArgumentException.class,
                () -> new NodeData<String>(oneLabel, new int[0], new String[0], new int[0]));
        assertEquals("edgeLabels and childNodeIds must have the same length.", thrown.getMessage());
    }
    /**
     * Verifies that {@link NodeData} rejects mismatched value-related array
     * lengths.
     */
    @Test
    @DisplayName("NodeData rejects mismatched value arrays")
    void nodeDataShouldRejectMismatchedValueArrays() {
        final String[] oneValue = { "stem" };
        final IllegalArgumentException thrown = assertThrows(IllegalArgumentException.class,
                () -> new NodeData<String>(new char[0], new int[0], oneValue, new int[0]));
        assertEquals("orderedValues and orderedCounts must have the same length.", thrown.getMessage());
    }
    /**
     * Verifies that {@link NodeData} continues to expose the documented backing
     * arrays directly.
     */
    @Test
    @DisplayName("NodeData accessors expose documented backing arrays")
    void nodeDataAccessorsShouldExposeDocumentedBackingArrays() {
        final char[] labels = { 'a' };
        final int[] childIds = { 7 };
        final String[] values = { "stem" };
        final int[] counts = { 3 };
        final NodeData<String> data = new NodeData<>(labels, childIds, values, counts);
        // assertSame: the accessors must hand back the identical array instances,
        // not defensive copies.
        assertSame(labels, data.edgeLabels());
        assertSame(childIds, data.childNodeIds());
        assertSame(values, data.orderedValues());
        assertSame(counts, data.orderedCounts());
    }
    /**
     * Verifies that {@link CompiledNode} rejects mismatched edge and child arrays.
     */
    @Test
    @DisplayName("CompiledNode rejects mismatched edge and child arrays")
    void compiledNodeShouldRejectMismatchedEdgeAndChildArrays() {
        @SuppressWarnings("unchecked")
        final CompiledNode<String>[] noChildren = new CompiledNode[0];
        final IllegalArgumentException thrown = assertThrows(IllegalArgumentException.class,
                () -> new CompiledNode<String>(new char[] { 'a' }, noChildren, new String[0], new int[0]));
        assertEquals("edgeLabels and children must have the same length.", thrown.getMessage());
    }
    /**
     * Verifies that {@link CompiledNode} rejects mismatched value arrays.
     */
    @Test
    @DisplayName("CompiledNode rejects mismatched value arrays")
    void compiledNodeShouldRejectMismatchedValueArrays() {
        @SuppressWarnings("unchecked")
        final CompiledNode<String>[] noChildren = new CompiledNode[0];
        final IllegalArgumentException thrown = assertThrows(IllegalArgumentException.class,
                () -> new CompiledNode<String>(new char[0], noChildren, new String[] { "stem" }, new int[0]));
        assertEquals("orderedValues and orderedCounts must have the same length.", thrown.getMessage());
    }
    /**
     * Verifies that {@link CompiledNode} continues to expose the documented backing
     * arrays directly.
     */
    @Test
    @DisplayName("CompiledNode accessors expose documented backing arrays")
    void compiledNodeAccessorsShouldExposeDocumentedBackingArrays() {
        final char[] labels = { 'a' };
        @SuppressWarnings("unchecked")
        final CompiledNode<String>[] singleChildSlot = new CompiledNode[1];
        final String[] values = { "stem" };
        final int[] counts = { 5 };
        final CompiledNode<String> compiledNode = new CompiledNode<>(labels, singleChildSlot, values, counts);
        // assertSame: the accessors must hand back the identical array instances,
        // not defensive copies.
        assertSame(labels, compiledNode.edgeLabels());
        assertSame(singleChildSlot, compiledNode.children());
        assertSame(values, compiledNode.orderedValues());
        assertSame(counts, compiledNode.orderedCounts());
    }
}

253
tools/generate-pages-badges.py Executable file
View File

@@ -0,0 +1,253 @@
#!/usr/bin/env python3
"""
Generate GitHub Pages badge endpoint JSON files from CI report artifacts.
This script derives compact machine-readable badge payloads from:
- JaCoCo XML coverage report
- PIT mutation testing XML report
- JMH CSV benchmark report
The generated JSON files are intended to be consumed by Shields endpoint badges.
"""
from __future__ import annotations
import argparse
import csv
import json
import os
from pathlib import Path
import xml.etree.ElementTree as ET
def parse_args() -> argparse.Namespace:
    """Parse the command-line arguments for badge generation.

    All five options are required; argparse exits with an error message
    when any of them is missing.
    """
    parser = argparse.ArgumentParser(
        description="Generate GitHub Pages badge metadata from build reports."
    )
    # Table-driven registration: every option is mandatory and takes a path.
    option_specs = (
        ("--jacoco-xml", "Path to the JaCoCo XML report."),
        ("--pit-xml", "Path to the PIT XML report."),
        ("--jmh-csv", "Path to the JMH CSV report."),
        ("--run-metrics-dir",
         "Target directory for the current build badge JSON files."),
        ("--latest-metrics-dir",
         "Target directory for the latest build badge JSON files."),
    )
    for flag, help_text in option_specs:
        parser.add_argument(flag, required=True, help=help_text)
    return parser.parse_args()
def write_json(target: Path, payload: dict[str, object]) -> None:
    """Serialize ``payload`` to ``target`` as pretty-printed UTF-8 JSON.

    Missing parent directories are created on demand so callers need no
    filesystem preparation.
    """
    target.parent.mkdir(parents=True, exist_ok=True)
    rendered = json.dumps(payload, indent=2) + os.linesep
    target.write_text(rendered, encoding="utf-8")
def unavailable_payload(label: str) -> dict[str, object]:
    """Return the standard grey "not available" badge payload for ``label``."""
    payload: dict[str, object] = {"schemaVersion": 1, "label": label}
    payload["message"] = "not available"
    payload["color"] = "lightgrey"
    return payload
def color_for_percentage(value: float) -> str:
    """Map a percentage (higher is better) onto a Shields badge color."""
    # Ordered from best to worst; first floor that the value reaches wins.
    bands = (
        (85.0, "brightgreen"),
        (70.0, "green"),
        (55.0, "yellow"),
        (40.0, "orange"),
    )
    for floor, color in bands:
        if value >= floor:
            return color
    return "red"
def color_for_speedup(value: float) -> str:
    """Map a speedup factor (higher is better) onto a Shields badge color."""
    # Ordered from best to worst; first floor that the value reaches wins.
    bands = (
        (4.0, "brightgreen"),
        (3.0, "green"),
        (2.0, "yellow"),
        (1.0, "orange"),
    )
    for floor, color in bands:
        if value >= floor:
            return color
    return "red"
def coverage_payload(jacoco_xml: Path) -> dict[str, object]:
    """Build a line coverage badge payload from a JaCoCo XML report.

    Returns the standard "not available" payload when the report file is
    missing, cannot be parsed, or has no top-level LINE counter — a badge
    must never fail the publishing pipeline.
    """
    if not jacoco_xml.is_file():
        return unavailable_payload("coverage")
    try:
        root = ET.parse(jacoco_xml).getroot()
    except ET.ParseError:
        # A truncated or corrupt report degrades to "not available"
        # instead of crashing the whole badge-generation step.
        return unavailable_payload("coverage")
    # JaCoCo emits aggregate <counter> elements directly under the report root.
    line_counter = next(
        (
            counter
            for counter in root.findall("counter")
            if counter.attrib.get("type") == "LINE"
        ),
        None,
    )
    if line_counter is None:
        return unavailable_payload("coverage")
    missed = int(line_counter.attrib.get("missed", "0"))
    covered = int(line_counter.attrib.get("covered", "0"))
    total = missed + covered
    # Guard the division: an empty report counts as 0% coverage.
    percentage = 0.0 if total == 0 else (100.0 * covered / total)
    return {
        "schemaVersion": 1,
        "label": "coverage",
        "message": f"{percentage:.1f}%",
        "color": color_for_percentage(percentage)
    }
def mutation_payload(pit_xml: Path) -> dict[str, object]:
    """Build a mutation score badge payload from a PIT XML report.

    Prefers the aggregate ``mutationCoverage`` root attribute when present;
    otherwise recomputes the score from the per-mutation ``status``
    attributes. Returns the "not available" payload when the report is
    missing, unparsable, or carries a malformed aggregate value — a badge
    must never fail the publishing pipeline.
    """
    if not pit_xml.is_file():
        return unavailable_payload("mutation")
    try:
        root = ET.parse(pit_xml).getroot()
    except ET.ParseError:
        # A truncated or corrupt report degrades to "not available"
        # instead of crashing the whole badge-generation step.
        return unavailable_payload("mutation")
    mutation_coverage = root.attrib.get("mutationCoverage")
    if mutation_coverage is not None:
        try:
            score = float(mutation_coverage)
        except ValueError:
            # Malformed aggregate attribute: treat the report as unusable.
            return unavailable_payload("mutation")
    else:
        # Statuses counted as "detected" when recomputing the score ourselves.
        detected_statuses = {
            "KILLED",
            "TIMED_OUT",
            "MEMORY_ERROR",
            "RUN_ERROR",
            "NON_VIABLE"
        }
        mutations = root.findall("mutation")
        total = len(mutations)
        detected = sum(
            1
            for mutation in mutations
            if mutation.attrib.get("status") in detected_statuses
        )
        # Guard the division: a report with zero mutations scores 0%.
        score = 0.0 if total == 0 else (100.0 * detected / total)
    return {
        "schemaVersion": 1,
        "label": "mutation",
        "message": f"{score:.1f}%",
        "color": color_for_percentage(score)
    }
def parse_family_count(row: dict[str, str]) -> int:
    """Extract the JMH ``familyCount`` parameter from a CSV row.

    Returns -1 when the parameter column is absent or its value is not a
    valid integer.
    """
    for column, raw_value in row.items():
        if not (column.startswith("Param: ") and column.endswith("familyCount")):
            continue
        try:
            return int(raw_value)
        except (TypeError, ValueError):
            return -1
    return -1
def benchmark_payload(jmh_csv: Path) -> dict[str, object]:
    """Build a benchmark speedup badge payload from a JMH CSV report.

    Compares the Radixor stemmer against the Snowball Porter baseline at
    the largest ``familyCount`` found in the report.
    """
    label = "english benchmark"
    if not jmh_csv.is_file():
        return unavailable_payload(label)
    with jmh_csv.open("r", encoding="utf-8", newline="") as handle:
        records = list(csv.DictReader(handle))
    if not records:
        return unavailable_payload(label)
    radixor_name = "EnglishStemmerComparisonBenchmark.radixorUsUkProfiPreferredStem"
    porter_name = "EnglishStemmerComparisonBenchmark.snowballOriginalPorter"
    samples: list[tuple[int, str, float]] = []
    for record in records:
        benchmark_id = record.get("Benchmark", "")
        if not (benchmark_id.endswith(radixor_name)
                or benchmark_id.endswith(porter_name)):
            continue
        try:
            measured = float(record["Score"])
        except (KeyError, TypeError, ValueError):
            # Rows without a usable numeric score are skipped silently.
            continue
        samples.append((parse_family_count(record), benchmark_id, measured))
    if not samples:
        return unavailable_payload(label)
    # Compare only at the largest dictionary size that was benchmarked.
    top_family = max(count for count, _, _ in samples)
    radixor_score = None
    porter_score = None
    for count, benchmark_id, measured in samples:
        if count != top_family:
            continue
        if benchmark_id.endswith(".radixorUsUkProfiPreferredStem"):
            radixor_score = measured
        elif benchmark_id.endswith(".snowballOriginalPorter"):
            porter_score = measured
    if radixor_score is None or porter_score is None or porter_score <= 0.0:
        return unavailable_payload(label)
    # score is time for the batch processing, i.e. longer => slower, i.e. speedup is porter/radixor
    speedup = porter_score / radixor_score
    family_note = "" if top_family < 0 else f" ({top_family})"
    return {
        "schemaVersion": 1,
        "label": label,
        "message": f"{speedup:.1f}x vs Porter{family_note}",
        "color": color_for_speedup(speedup)
    }
def main() -> int:
    """Generate every requested badge metadata file and return an exit code."""
    options = parse_args()
    # Derive one payload per metric from the corresponding report artifact.
    payload_by_file = {
        "coverage-badge.json": coverage_payload(Path(options.jacoco_xml)),
        "pitest-badge.json": mutation_payload(Path(options.pit_xml)),
        "jmh-badge.json": benchmark_payload(Path(options.jmh_csv)),
    }
    # Mirror each payload into both the per-run and the "latest" directory.
    target_dirs = (Path(options.run_metrics_dir), Path(options.latest_metrics_dir))
    for file_name, payload in payload_by_file.items():
        for directory in target_dirs:
            write_json(directory / file_name, payload)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

114
tools/generate-release-notes.sh Executable file
View File

@@ -0,0 +1,114 @@
#!/usr/bin/env bash
# Generate Markdown release notes for a "release@<version>" tag by
# categorizing conventional-commit messages since the previous release tag.
# The tag comes from GITHUB_REF_NAME (CI) or from the first CLI argument.
set -Eeuo pipefail
# CI-provided ref name wins; fall back to $1; empty means misconfiguration.
current_tag="${GITHUB_REF_NAME:-${1:-}}"
if [[ -z "${current_tag}" ]]; then
echo "Current tag is not set. Provide it as GITHUB_REF_NAME or as the first argument." >&2
exit 1
fi
release_prefix="release@"
if [[ "${current_tag}" != "${release_prefix}"* ]]; then
echo "Current tag '${current_tag}' does not start with expected prefix '${release_prefix}'." >&2
exit 1
fi
# Best-effort refresh so the previous-tag search sees all remote tags;
# failures (e.g. offline) are deliberately ignored.
git fetch --tags --force >/dev/null 2>&1 || true
# Bare version numbers of every release tag, in natural version order.
all_versions="$(git tag --list "${release_prefix}*" | sed "s/^${release_prefix}//" | sort -V)"
previous_tag=""
# Walk versions in ascending order (intentional word-split on whitespace);
# the last tag seen before the current one is the previous release.
for version in ${all_versions}; do
if [[ "${release_prefix}${version}" == "${current_tag}" ]]; then
break
fi
previous_tag="${release_prefix}${version}"
done
# First release has no predecessor: use the tag itself as the log range.
if [[ -n "${previous_tag}" ]]; then
range="${previous_tag}..${current_tag}"
else
range="${current_tag}"
fi
# Diagnostics go to stderr; stdout is reserved for the release notes body.
echo "Generating release notes for range: ${range}" >&2
# Conventional-commit categories in the order they appear in the notes;
# each entry is "<commit-type>|<section title>".
declare -a CATEGORY_ORDER=(
"feat|Features"
"fix|Bug Fixes"
"perf|Performance"
"refactor|Refactoring"
"docs|Documentation"
"test|Tests"
"build|Build System"
"ci|CI/CD"
"style|Style"
"chore|Maintenance"
"revert|Reverts"
)
# Lookup tables derived from CATEGORY_ORDER: type -> title / collected items.
declare -A CATEGORY_TITLES
declare -A CATEGORY_ITEMS
for entry in "${CATEGORY_ORDER[@]}"; do
key="${entry%%|*}"
title="${entry##*|}"
CATEGORY_TITLES["${key}"]="${title}"
CATEGORY_ITEMS["${key}"]=""
done
# Matches "type(scope)!: message"; group 1 = type, group 3 = message.
supported_prefix_pattern='^(feat|fix|perf|refactor|docs|test|build|ci|style|chore|revert)(\([^)]+\))?!?:[[:space:]]*(.+)$'
# ASCII unit separator: field delimiter for the git log records below.
separator=$'\x1f'
# Classify one commit-message line and, when it matches the
# conventional-commit pattern, append it to its category bucket.
# Globals: supported_prefix_pattern (read), CATEGORY_ITEMS (written)
# Arguments: $1 - raw line from a commit subject or body
# Returns: 0 always
append_line() {
  local raw="$1"
  local trimmed
  local category
  local message
  # Drop carriage returns and surrounding whitespace before matching.
  trimmed="$(printf '%s' "${raw}" | tr -d '\r' | sed 's/^[[:space:]]*//; s/[[:space:]]*$//')"
  if [[ -z "${trimmed}" ]]; then
    return 0
  fi
  # Non-conventional lines are ignored; the pattern variable is left
  # unquoted on purpose so it is treated as a regex.
  [[ "${trimmed}" =~ ${supported_prefix_pattern} ]] || return 0
  category="${BASH_REMATCH[1]}"
  message="${BASH_REMATCH[3]}"
  if [[ -n "${message}" ]]; then
    CATEGORY_ITEMS["${category}"]+="- ${message}"$'\n'
  fi
  return 0
}
# Stream commits as \x1f-delimited fields inside \x1e-terminated records.
# A plain line-oriented read would split multi-line commit bodies across
# reads and silently drop every body line after the first, losing any
# conventional-commit entries written in commit bodies.
record_terminator=$'\x1e'
while IFS="${separator}" read -r -d "${record_terminator}" commit_hash subject body; do
  # "format:" joins records with a newline, which lands in front of the
  # next record's hash — strip it before the emptiness check.
  commit_hash="${commit_hash#$'\n'}"
  [[ -z "${commit_hash}" ]] && continue
  # Skip merge and bootstrap commits; --no-merges already filters true
  # merges, the subject check also catches manually written merge subjects.
  if [[ "${subject}" =~ ^Merge[[:space:]] ]] || [[ "${subject}" == "Initial commit" ]]; then
    continue
  fi
  # The subject and every body line may each contribute one notes entry.
  append_line "${subject}"
  while IFS= read -r body_line; do
    append_line "${body_line}"
  done <<< "${body}"
done < <(git log "${range}" --no-merges --pretty=format:"%H${separator}%s${separator}%b${record_terminator}")
# Assemble the final Markdown: one "### <title>" section per non-empty
# category, in the fixed CATEGORY_ORDER.
body_text="## What's New"
for entry in "${CATEGORY_ORDER[@]}"; do
key="${entry%%|*}"
title="${CATEGORY_TITLES[${key}]}"
items="${CATEGORY_ITEMS[${key}]}"
if [[ -n "${items}" ]]; then
body_text+=$'\n\n'"### ${title}"$'\n'
# Drop blank lines so each section is a tight bullet list.
body_text+="$(printf '%s' "${items}" | sed '/^[[:space:]]*$/d')"
fi
done
# Fallback text when no commit matched the conventional-commit pattern.
if [[ "${body_text}" == "## What's New" ]]; then
body_text+=$'\n\n'"No categorized changes were found in commit subjects or bodies for this release range."
fi
# The notes body is the script's only stdout output.
printf '%s\n' "${body_text}"