feat: JMH benchmarks added

This commit is contained in:
2026-04-14 02:40:30 +02:00
parent 3c3f3b4312
commit 85e33f2f60
8 changed files with 568 additions and 4 deletions

View File

@@ -3,20 +3,27 @@
<classpathentry kind="src" output="bin/main" path="src/main/java"> <classpathentry kind="src" output="bin/main" path="src/main/java">
<attributes> <attributes>
<attribute name="gradle_scope" value="main"/> <attribute name="gradle_scope" value="main"/>
<attribute name="gradle_used_by_scope" value="main,test"/> <attribute name="gradle_used_by_scope" value="main,test,jmh"/>
</attributes> </attributes>
</classpathentry> </classpathentry>
<classpathentry kind="src" output="bin/test" path="src/test/java"> <classpathentry kind="src" output="bin/test" path="src/test/java">
<attributes> <attributes>
<attribute name="gradle_scope" value="test"/> <attribute name="gradle_scope" value="test"/>
<attribute name="gradle_used_by_scope" value="test"/> <attribute name="gradle_used_by_scope" value="test,jmh"/>
<attribute name="test" value="true"/> <attribute name="test" value="true"/>
</attributes> </attributes>
</classpathentry> </classpathentry>
<classpathentry kind="src" output="bin/main" path="src/main/resources"> <classpathentry kind="src" output="bin/main" path="src/main/resources">
<attributes> <attributes>
<attribute name="gradle_scope" value="main"/> <attribute name="gradle_scope" value="main"/>
<attribute name="gradle_used_by_scope" value="main,test"/> <attribute name="gradle_used_by_scope" value="main,test,jmh"/>
</attributes>
</classpathentry>
<classpathentry kind="src" output="bin/jmh" path="src/jmh/java">
<attributes>
<attribute name="gradle_scope" value="jmh"/>
<attribute name="gradle_used_by_scope" value="jmh"/>
<attribute name="test" value="true"/>
</attributes> </attributes>
</classpathentry> </classpathentry>
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-21/"/> <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-21/"/>

51
.github/workflows/benchmarks.yml vendored Normal file
View File

@@ -0,0 +1,51 @@
# Runs the JMH benchmark suite and publishes the raw reports as a build artifact.
name: Benchmarks

on:
  # Manual runs from the Actions tab.
  workflow_dispatch:
  # Weekly run: Mondays at 03:00 (GitHub evaluates cron schedules in UTC).
  schedule:
    - cron: '0 3 * * 1'
  # Runs on main only when something that can influence benchmark results changes.
  push:
    branches:
      - main
    paths:
      - 'src/main/**'
      - 'src/jmh/**'
      - 'build.gradle'
      # Kept in line with the pages workflow: settings changes also affect the build.
      - 'settings.gradle'
      - 'gradle/**'
      - 'gradlew'
      - 'gradlew.bat'
      - '.github/workflows/benchmarks.yml'

jobs:
  jmh:
    runs-on: ubuntu-latest
    timeout-minutes: 30
    permissions:
      contents: read
    steps:
      - name: Check out sources
        uses: actions/checkout@v4

      - name: Set up JDK 21
        uses: actions/setup-java@v4
        with:
          distribution: temurin
          java-version: '21'
          cache: gradle

      - name: Make Gradle executable
        run: chmod +x ./gradlew

      - name: Run JMH benchmarks
        run: ./gradlew clean jmh --no-daemon

      - name: Upload JMH reports
        uses: actions/upload-artifact@v4
        with:
          name: jmh-reports
          path: |
            build/reports/jmh/**
            build/results/jmh/**
          if-no-files-found: warn

View File

@@ -4,6 +4,16 @@ on:
push: push:
branches: branches:
- main - main
paths:
- 'src/main/**'
- 'src/test/**'
- 'src/jmh/**'
- 'build.gradle'
- 'settings.gradle'
- 'gradle/**'
- 'gradlew'
- 'gradlew.bat'
- '.github/workflows/pages.yml'
workflow_dispatch: workflow_dispatch:
permissions: permissions:
@@ -37,7 +47,7 @@ jobs:
uses: gradle/actions/setup-gradle@v4 uses: gradle/actions/setup-gradle@v4
- name: Build reports for publication - name: Build reports for publication
run: ./gradlew --no-daemon clean build pmdMain javadoc jacocoTestReport pitest run: ./gradlew --no-daemon clean build pmdMain javadoc jacocoTestReport pitest jmh
- name: Prepare gh-pages worktree - name: Prepare gh-pages worktree
shell: bash shell: bash

View File

@@ -1,15 +1,19 @@
plugins { plugins {
id 'java' id 'java'
id 'eclipse'
id 'application' id 'application'
id 'pmd' id 'pmd'
id 'jacoco' id 'jacoco'
id 'info.solidsoft.pitest' version '1.19.0' id 'info.solidsoft.pitest' version '1.19.0'
id 'me.champeau.jmh' version '0.7.2'
id 'com.palantir.git-version' version '4.0.0' id 'com.palantir.git-version' version '4.0.0'
} }
group = 'org.egothor.stemmer' group = 'org.egothor.stemmer'
version = gitVersion(prefix:'release@') version = gitVersion(prefix:'release@')
def benchmarkReportsDirectory = layout.buildDirectory.dir('reports/jmh')
configurations { configurations {
mockitoAgent mockitoAgent
} }
@@ -34,6 +38,8 @@ repositories {
} }
dependencies { dependencies {
jmhImplementation sourceSets.main.output
testImplementation platform(libs.junit.bom) testImplementation platform(libs.junit.bom)
testImplementation libs.junit.jupiter testImplementation libs.junit.jupiter
testRuntimeOnly libs.junit.platform.launcher testRuntimeOnly libs.junit.platform.launcher
@@ -104,6 +110,24 @@ application {
mainClass = 'org.egothor.stemmer.Compile' mainClass = 'org.egothor.stemmer.Compile'
} }
// JMH runner configuration.
//
// Values assigned here are handed to the JMH runner as run-wide options and take
// precedence over the annotations declared on the benchmark classes.
jmh {
    jmhVersion = '1.37'
    warmupIterations = 3
    iterations = 5
    fork = 1
    benchmarkMode = ['avgt']
    // No run-wide timeUnit on purpose: the lookup benchmarks declare nanoseconds and the
    // compilation benchmark declares milliseconds through @OutputTimeUnit. A global
    // timeUnit would override both and report multi-millisecond compilations in ns/op.
    resultFormat = 'CSV'
    resultsFile = benchmarkReportsDirectory.map { it.file('jmh-results.csv').asFile }.get()
    humanOutputFile = benchmarkReportsDirectory.map { it.file('jmh-results.txt').asFile }.get()
    duplicateClassesStrategy = DuplicatesStrategy.EXCLUDE
}
// Configures the existing 'jmh' task lazily: assigns it to the 'verification'
// task group and gives it a human-readable description.
tasks.named('jmh') {
group = 'verification'
description = 'Runs JMH benchmarks for the Radixor algorithmic core.'
}
javadoc { javadoc {
failOnError = false failOnError = false

View File

@@ -0,0 +1,208 @@
package org.egothor.stemmer.benchmark;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Objects;
import java.util.SplittableRandom;
import org.egothor.stemmer.FrequencyTrie;
import org.egothor.stemmer.PatchCommandEncoder;
import org.egothor.stemmer.ReductionSettings;
import org.egothor.stemmer.StemmerDictionaryParser;
/**
* Builds deterministic benchmark corpora used by the JMH suite.
*
* <p>
* The generated corpus is intentionally synthetic but morphology-shaped: it
* creates a stable base vocabulary and derives common inflectional and
* derivational variants from each stem. The corpus also injects a controlled
* amount of homograph ambiguity so that {@link FrequencyTrie#getAll(String)} is
* measured on keys that really produce multiple candidate patch commands.
* </p>
*/
final class BenchmarkCorpusSupport {
/**
 * Word roots that start every synthesized stem.
 *
 * <p>
 * {@code createStem} selects the entry at {@code index % PREFIXES.length}, so
 * the roots are cycled in declaration order.
 * </p>
 */
private static final String[] PREFIXES = {
"adapt", "align", "anchor", "answer", "apply", "balance", "build", "capture", "center",
"change", "collect", "connect", "convert", "cover", "create", "cycle", "declare", "define",
"deliver", "derive", "design", "detect", "develop", "drive", "encode", "extend", "filter",
"form", "govern", "handle", "improve", "index", "inform", "inspect", "join", "launch",
"limit", "manage", "map", "model", "move", "observe", "operate", "organ", "pattern",
"perform", "plan", "predict", "prepare", "process", "project", "protect", "publish", "query",
"reduce", "refresh", "render", "repeat", "resolve", "return", "scale", "search", "select",
"shape", "signal", "sort", "state", "store", "stream", "structure", "supply", "support",
"switch", "trace", "transform", "update", "validate", "value"
};
/**
 * Derivational endings appended to a root to diversify stems.
 *
 * <p>
 * {@code createStem} selects the entry at
 * {@code (index / PREFIXES.length) % STEM_SUFFIXES.length}, so the ending
 * advances once per full pass through {@link #PREFIXES}. The first entry is
 * empty and leaves the root unchanged.
 * </p>
 */
private static final String[] STEM_SUFFIXES = {
"", "er", "or", "al", "ive", "ion", "ent", "ant", "ure", "ment", "ist", "ity"
};
/**
 * Number of consecutive stem ordinals that share one ambiguous surface form.
 *
 * <p>
 * {@code createHomograph} derives the shared form from
 * {@code index / HOMOGRAPH_GROUP_SIZE}.
 * </p>
 */
private static final int HOMOGRAPH_GROUP_SIZE = 4;
/**
 * Prevents instantiation of this utility class.
 *
 * @throws AssertionError always
 */
private BenchmarkCorpusSupport() {
throw new AssertionError("No instances.");
}
/**
 * Creates a deterministic benchmark corpus.
 *
 * <p>
 * Each generated dictionary line holds one stem followed by its space-separated
 * variants and the homograph of the stem's group, and is terminated by a line
 * feed. Every stem, variant, and homograph is also recorded as a lookup key in
 * first-seen order.
 * </p>
 *
 * <p>
 * A homograph is recorded as an ambiguous lookup key only once a second stem of
 * its group has been emitted. A trailing group that contains a single stem
 * (possible when {@code stemCount} is not a multiple of the group size)
 * therefore never contributes a key that maps to just one stem.
 * </p>
 *
 * @param stemCount number of canonical stems to generate
 * @return benchmark corpus description
 * @throws IllegalArgumentException if {@code stemCount} is less than 1
 */
static BenchmarkCorpus createCorpus(final int stemCount) {
    if (stemCount < 1) {
        throw new IllegalArgumentException("stemCount must be at least 1.");
    }

    final StringBuilder dictionaryBuilder = new StringBuilder(stemCount * 120);
    final LinkedHashSet<String> lookupKeys = new LinkedHashSet<>(stemCount * 8);
    final LinkedHashSet<String> ambiguousLookupKeys =
            new LinkedHashSet<>(Math.max(1, stemCount / HOMOGRAPH_GROUP_SIZE));
    // Fixed seed keeps the generated corpus identical across runs.
    final SplittableRandom random = new SplittableRandom(20260414L);

    for (int index = 0; index < stemCount; index++) {
        final String stem = createStem(index);
        final String[] variants = createVariants(stem, random, index);

        dictionaryBuilder.append(stem);
        lookupKeys.add(stem);
        for (String variant : variants) {
            dictionaryBuilder.append(' ').append(variant);
            lookupKeys.add(variant);
        }

        final String homograph = createHomograph(index);
        dictionaryBuilder.append(' ').append(homograph);
        lookupKeys.add(homograph);
        // The first stem of a group does not make the form ambiguous on its own;
        // any later member proves that at least two stems share it.
        if (index % HOMOGRAPH_GROUP_SIZE != 0) {
            ambiguousLookupKeys.add(homograph);
        }

        dictionaryBuilder.append('\n');
    }

    return new BenchmarkCorpus(
            dictionaryBuilder.toString(),
            lookupKeys.toArray(String[]::new),
            ambiguousLookupKeys.toArray(String[]::new));
}
/**
 * Parses dictionary text and compiles it into a trie of patch commands.
 *
 * <p>
 * For every parsed dictionary entry, each variant is stored under its own
 * surface form with the patch command produced by encoding the variant against
 * the entry's stem. When requested, the stem is stored first, mapped to the
 * patch produced by encoding the stem against itself.
 * </p>
 *
 * @param corpusText        line-oriented dictionary text
 * @param reductionSettings reduction settings handed to the trie builder
 * @param storeOriginalStem whether the canonical stem itself is inserted as a
 *                          key as well
 * @return trie produced by the builder after all entries were inserted
 * @throws IOException          if parsing fails
 * @throws NullPointerException if {@code corpusText} or
 *                              {@code reductionSettings} is {@code null}
 */
static FrequencyTrie<String> compilePatchTrie(
        final String corpusText,
        final ReductionSettings reductionSettings,
        final boolean storeOriginalStem) throws IOException {
    Objects.requireNonNull(corpusText, "corpusText");
    Objects.requireNonNull(reductionSettings, "reductionSettings");

    final FrequencyTrie.Builder<String> trieBuilder =
            new FrequencyTrie.Builder<>(String[]::new, reductionSettings);
    final PatchCommandEncoder patchEncoder = new PatchCommandEncoder();
    final StringReader corpusReader = new StringReader(corpusText);

    StemmerDictionaryParser.parse(corpusReader, "benchmark-corpus", (canonicalStem, surfaceForms, line) -> {
        if (storeOriginalStem) {
            final String identityPatch = patchEncoder.encode(canonicalStem, canonicalStem);
            trieBuilder.put(canonicalStem, identityPatch);
        }
        for (String surfaceForm : surfaceForms) {
            final String patch = patchEncoder.encode(surfaceForm, canonicalStem);
            trieBuilder.put(surfaceForm, patch);
        }
    });

    return trieBuilder.build();
}
/**
 * Builds the stem for one ordinal.
 *
 * <p>
 * The stem is the concatenation of a root, a derivational ending, and the
 * base-36 form of the ordinal, lower-cased with {@link Locale#ROOT}. The root
 * cycles with the ordinal; the ending advances once per full pass through the
 * roots.
 * </p>
 *
 * @param index stem ordinal
 * @return generated stem
 */
private static String createStem(final int index) {
    final int prefixSlot = index % PREFIXES.length;
    final int suffixSlot = (index / PREFIXES.length) % STEM_SUFFIXES.length;

    final StringBuilder stem = new StringBuilder();
    stem.append(PREFIXES[prefixSlot]);
    stem.append(STEM_SUFFIXES[suffixSlot]);
    stem.append(base36(index));
    return stem.toString().toLowerCase(Locale.ROOT);
}
/**
 * Derives the surface forms of one stem.
 *
 * <p>
 * The result always contains, in this order, the stem with the endings
 * {@code s}, {@code ed}, {@code ing}, {@code er}, and {@code ers}, the stem
 * prefixed with {@code pre}, and the stem followed by one digit drawn from the
 * random source. Stems with an even ordinal additionally get the {@code ly}
 * form, and stems longer than five characters get a form with the last
 * character removed.
 * </p>
 *
 * @param stem   canonical stem
 * @param random deterministic random source; exactly one value is drawn
 * @param index  stem ordinal
 * @return generated variants in stable order
 */
private static String[] createVariants(final String stem, final SplittableRandom random, final int index) {
    final String[] regularEndings = { "s", "ed", "ing", "er", "ers" };
    final List<String> forms = new ArrayList<>(8);

    for (String ending : regularEndings) {
        forms.add(stem + ending);
    }
    forms.add("pre" + stem);

    final int digit = random.nextInt(10);
    forms.add(stem + digit);

    final boolean evenOrdinal = index % 2 == 0;
    if (evenOrdinal) {
        forms.add(stem + "ly");
    }

    final int stemLength = stem.length();
    if (stemLength > 5) {
        forms.add(stem.substring(0, stemLength - 1));
    }

    return forms.toArray(String[]::new);
}
/**
 * Builds the surface form shared by the group a stem ordinal belongs to.
 *
 * <p>
 * Ordinals are grouped by integer division with {@code HOMOGRAPH_GROUP_SIZE};
 * all ordinals of one group yield the same form.
 * </p>
 *
 * @param index stem ordinal
 * @return shared homograph form
 */
private static String createHomograph(final int index) {
    final int group = index / HOMOGRAPH_GROUP_SIZE;
    return "shared" + base36(group);
}
/**
 * Renders a number in radix {@link Character#MAX_RADIX}.
 *
 * @param value numeric value
 * @return textual form of {@code value} in that radix
 */
private static String base36(final int value) {
    final int radix = Character.MAX_RADIX;
    return Integer.toString(value, radix);
}
/**
 * Immutable benchmark corpus.
 *
 * <p>
 * Both key arrays are copied when the record is created and again on every
 * accessor call, so neither the arrays passed in nor the arrays handed out can
 * be used to alter the corpus afterwards.
 * </p>
 *
 * @param dictionaryText      full line-oriented dictionary text
 * @param lookupKeys          keys used for general lookup measurements
 * @param ambiguousLookupKeys keys that return multiple patch candidates from
 *                            {@code getAll()}
 */
record BenchmarkCorpus(String dictionaryText, String[] lookupKeys, String[] ambiguousLookupKeys) {

    /**
     * Validates the components and detaches the stored arrays from the caller's.
     *
     * @throws NullPointerException if any component is {@code null}
     */
    BenchmarkCorpus {
        Objects.requireNonNull(dictionaryText, "dictionaryText");
        lookupKeys = Objects.requireNonNull(lookupKeys, "lookupKeys").clone();
        ambiguousLookupKeys = Objects.requireNonNull(ambiguousLookupKeys, "ambiguousLookupKeys").clone();
    }

    /**
     * Returns the general lookup keys.
     *
     * @return fresh copy of the lookup keys
     */
    @Override
    public String[] lookupKeys() {
        return lookupKeys.clone();
    }

    /**
     * Returns the ambiguous lookup keys.
     *
     * @return fresh copy of the ambiguous lookup keys
     */
    @Override
    public String[] ambiguousLookupKeys() {
        return ambiguousLookupKeys.clone();
    }
}
}

View File

@@ -0,0 +1,94 @@
package org.egothor.stemmer.benchmark;
import java.io.IOException;
import java.util.concurrent.TimeUnit;
import org.egothor.stemmer.ReductionMode;
import org.egothor.stemmer.ReductionSettings;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Level;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;
import org.openjdk.jmh.infra.Blackhole;
/**
* Benchmarks end-to-end dictionary compilation for different reduction modes.
*
* <p>
* This benchmark measures the offline path that matters for dictionary build
* workflows: dictionary parsing, patch-command generation, mutable trie
* population, subtree reduction, and freezing into the compiled read-only trie.
* </p>
*/
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.MILLISECONDS)
@Warmup(iterations = 3, time = 1)
@Measurement(iterations = 5, time = 1)
public class FrequencyTrieCompilationBenchmark {
/**
* Shared benchmark state for compilation scenarios.
*/
@State(Scope.Benchmark)
public static class CompilationState {
/**
* Number of canonical stems to generate.
*/
@Param({ "2000", "10000" })
public int stemCount;
/**
* Reduction mode used during trie compilation.
*/
@Param({
"MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS",
"MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS",
"MERGE_SUBTREES_WITH_EQUIVALENT_DOMINANT_GET_RESULTS"
})
public String reductionMode;
/**
* Whether to store the stem itself using the canonical no-op patch.
*/
@Param({ "true", "false" })
public boolean storeOriginalStem;
/**
* Full dictionary text used as the benchmark input.
*/
private String dictionaryText;
/**
* Initializes the benchmark state.
*/
@Setup(Level.Trial)
public void setUp() {
this.dictionaryText = BenchmarkCorpusSupport.createCorpus(this.stemCount).dictionaryText();
}
}
/**
 * Measures one full compilation of the prepared dictionary text into a patch
 * trie.
 *
 * <p>
 * Resolving the reduction mode name and creating the default reduction
 * settings happen inside the measured method as well.
 * </p>
 *
 * @param state     prepared compilation state
 * @param blackhole sink preventing dead-code elimination
 * @throws IOException if dictionary parsing fails
 */
@Benchmark
public void compilePatchTrie(final CompilationState state, final Blackhole blackhole) throws IOException {
    final ReductionMode mode = ReductionMode.valueOf(state.reductionMode);
    final ReductionSettings settings = ReductionSettings.withDefaults(mode);
    final var compiledTrie = BenchmarkCorpusSupport.compilePatchTrie(
            state.dictionaryText,
            settings,
            state.storeOriginalStem);
    blackhole.consume(compiledTrie);
}
}

View File

@@ -0,0 +1,160 @@
package org.egothor.stemmer.benchmark;
import java.io.IOException;
import java.util.concurrent.TimeUnit;
import org.egothor.stemmer.FrequencyTrie;
import org.egothor.stemmer.PatchCommandEncoder;
import org.egothor.stemmer.ReductionMode;
import org.egothor.stemmer.ReductionSettings;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Level;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;
import org.openjdk.jmh.infra.Blackhole;
/**
* Benchmarks lookup-oriented operations on compiled Radixor tries.
*
* <p>
* The benchmark uses a deterministic morphology-shaped corpus and measures the
* latency of the hot-path lookup operations that are relevant at runtime:
* retrieving the preferred patch command, retrieving all candidate patch
* commands, and reconstructing stems from the returned patch values.
* </p>
*/
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@Warmup(iterations = 3, time = 1)
@Measurement(iterations = 5, time = 1)
public class FrequencyTrieLookupBenchmark {
/**
 * Parameters and prepared data for the lookup benchmarks.
 *
 * <p>
 * The corpus is generated and compiled once per trial, so none of that work is
 * part of the measured benchmark methods.
 * </p>
 */
@State(Scope.Benchmark)
public static class LookupState {

    /** Number of canonical stems in the generated corpus. */
    @Param({ "2000", "10000" })
    public int stemCount;

    /** Name of the {@link ReductionMode} constant used to compile the trie. */
    @Param({
        "MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS",
        "MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS",
        "MERGE_SUBTREES_WITH_EQUIVALENT_DOMINANT_GET_RESULTS"
    })
    public String reductionMode;

    /** Compiled trie that the benchmark methods query. */
    private FrequencyTrie<String> trie;

    /** All stems, variants, and homographs of the corpus. */
    private String[] lookupKeys;

    /** Homograph keys, queried through {@code getAll()}. */
    private String[] ambiguousLookupKeys;

    /**
     * Generates the corpus and compiles it with stems stored as keys.
     *
     * @throws IOException if corpus compilation fails
     */
    @Setup(Level.Trial)
    public void setUp() throws IOException {
        final BenchmarkCorpusSupport.BenchmarkCorpus corpus =
                BenchmarkCorpusSupport.createCorpus(this.stemCount);
        final ReductionMode mode = ReductionMode.valueOf(this.reductionMode);
        final ReductionSettings settings = ReductionSettings.withDefaults(mode);
        final String dictionaryText = corpus.dictionaryText();

        this.trie = BenchmarkCorpusSupport.compilePatchTrie(dictionaryText, settings, true);
        this.lookupKeys = corpus.lookupKeys();
        this.ambiguousLookupKeys = corpus.ambiguousLookupKeys();
    }
}
/**
 * Measures preferred patch lookup over the whole lookup key set.
 *
 * <p>
 * One invocation performs one {@code get()} per lookup key, so the reported
 * score covers a full pass over the keys rather than a single lookup.
 * </p>
 *
 * @param state     prepared lookup state
 * @param blackhole sink preventing dead-code elimination
 * @throws IllegalStateException if a key has no preferred patch
 */
@Benchmark
public void lookupPreferredPatch(final LookupState state, final Blackhole blackhole) {
    final FrequencyTrie<String> trie = state.trie;
    for (final String key : state.lookupKeys) {
        final String preferred = trie.get(key);
        if (preferred != null) {
            blackhole.consume(preferred);
            continue;
        }
        throw new IllegalStateException("Missing preferred patch for key " + key + '.');
    }
}
/**
 * Measures retrieval of all patch candidates over the ambiguous key set.
 *
 * <p>
 * One invocation performs one {@code getAll()} per ambiguous key, so the
 * reported score covers a full pass over those keys.
 * </p>
 *
 * @param state     prepared lookup state
 * @param blackhole sink preventing dead-code elimination
 * @throws IllegalStateException if a key yields fewer than two patches
 */
@Benchmark
public void lookupAllPatches(final LookupState state, final Blackhole blackhole) {
    final FrequencyTrie<String> trie = state.trie;
    for (final String key : state.ambiguousLookupKeys) {
        final String[] candidates = trie.getAll(key);
        if (candidates.length >= 2) {
            blackhole.consume(candidates);
            continue;
        }
        throw new IllegalStateException("Expected multiple patches for key " + key + '.');
    }
}
/**
 * Measures preferred lookup followed by patch application over the whole
 * lookup key set.
 *
 * <p>
 * One invocation processes every lookup key once. The looked-up patch is
 * handed to {@code PatchCommandEncoder.apply} without a null check.
 * </p>
 *
 * @param state     prepared lookup state
 * @param blackhole sink preventing dead-code elimination
 */
@Benchmark
public void stemPreferredVariant(final LookupState state, final Blackhole blackhole) {
    final FrequencyTrie<String> trie = state.trie;
    for (final String key : state.lookupKeys) {
        final String preferred = trie.get(key);
        final var stemmed = PatchCommandEncoder.apply(key, preferred);
        blackhole.consume(stemmed);
    }
}
/**
 * Measures {@code getAll()} followed by application of every returned patch
 * over the ambiguous key set.
 *
 * <p>
 * One invocation processes every ambiguous key once and consumes one result
 * per returned patch.
 * </p>
 *
 * @param state     prepared lookup state
 * @param blackhole sink preventing dead-code elimination
 */
@Benchmark
public void stemAllVariants(final LookupState state, final Blackhole blackhole) {
    final FrequencyTrie<String> trie = state.trie;
    for (final String key : state.ambiguousLookupKeys) {
        final String[] candidates = trie.getAll(key);
        for (final String candidate : candidates) {
            final var stemmed = PatchCommandEncoder.apply(key, candidate);
            blackhole.consume(stemmed);
        }
    }
}
}

View File

@@ -0,0 +1,10 @@
/**
* JMH benchmarks for the Radixor algorithmic core.
*
* <p>
* The benchmarks in this package focus on trie lookup latency, retrieval of all
* candidate patch commands, and end-to-end dictionary compilation with
* different reduction modes.
* </p>
*/
package org.egothor.stemmer.benchmark;