feat: JMH benchmarks added
This commit is contained in:
@@ -0,0 +1,208 @@
|
||||
package org.egothor.stemmer.benchmark;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Objects;
|
||||
import java.util.SplittableRandom;
|
||||
import org.egothor.stemmer.FrequencyTrie;
|
||||
import org.egothor.stemmer.PatchCommandEncoder;
|
||||
import org.egothor.stemmer.ReductionSettings;
|
||||
import org.egothor.stemmer.StemmerDictionaryParser;
|
||||
|
||||
/**
|
||||
* Builds deterministic benchmark corpora used by the JMH suite.
|
||||
*
|
||||
* <p>
|
||||
* The generated corpus is intentionally synthetic but morphology-shaped: it
|
||||
* creates a stable base vocabulary and derives common inflectional and
|
||||
* derivational variants from each stem. The corpus also injects a controlled
|
||||
* amount of homograph ambiguity so that {@link FrequencyTrie#getAll(String)} is
|
||||
* measured on keys that really produce multiple candidate patch commands.
|
||||
* </p>
|
||||
*/
|
||||
final class BenchmarkCorpusSupport {
|
||||
|
||||
/**
|
||||
* Prefixes used to synthesize pronounceable stems.
|
||||
*/
|
||||
private static final String[] PREFIXES = {
|
||||
"adapt", "align", "anchor", "answer", "apply", "balance", "build", "capture", "center",
|
||||
"change", "collect", "connect", "convert", "cover", "create", "cycle", "declare", "define",
|
||||
"deliver", "derive", "design", "detect", "develop", "drive", "encode", "extend", "filter",
|
||||
"form", "govern", "handle", "improve", "index", "inform", "inspect", "join", "launch",
|
||||
"limit", "manage", "map", "model", "move", "observe", "operate", "organ", "pattern",
|
||||
"perform", "plan", "predict", "prepare", "process", "project", "protect", "publish", "query",
|
||||
"reduce", "refresh", "render", "repeat", "resolve", "return", "scale", "search", "select",
|
||||
"shape", "signal", "sort", "state", "store", "stream", "structure", "supply", "support",
|
||||
"switch", "trace", "transform", "update", "validate", "value"
|
||||
};
|
||||
|
||||
/**
|
||||
* Suffixes used to diversify stems.
|
||||
*/
|
||||
private static final String[] STEM_SUFFIXES = {
|
||||
"", "er", "or", "al", "ive", "ion", "ent", "ant", "ure", "ment", "ist", "ity"
|
||||
};
|
||||
|
||||
/**
|
||||
* Number of neighboring stems sharing one ambiguous surface form.
|
||||
*/
|
||||
private static final int HOMOGRAPH_GROUP_SIZE = 4;
|
||||
|
||||
/**
|
||||
* Utility class.
|
||||
*/
|
||||
private BenchmarkCorpusSupport() {
|
||||
throw new AssertionError("No instances.");
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a deterministic benchmark corpus.
|
||||
*
|
||||
* @param stemCount number of canonical stems to generate
|
||||
* @return immutable benchmark corpus description
|
||||
*/
|
||||
static BenchmarkCorpus createCorpus(final int stemCount) {
|
||||
if (stemCount < 1) {
|
||||
throw new IllegalArgumentException("stemCount must be at least 1.");
|
||||
}
|
||||
|
||||
final StringBuilder dictionaryBuilder = new StringBuilder(stemCount * 120);
|
||||
final LinkedHashSet<String> lookupKeys = new LinkedHashSet<>(stemCount * 8);
|
||||
final LinkedHashSet<String> ambiguousLookupKeys = new LinkedHashSet<>(Math.max(1, stemCount / 4));
|
||||
final SplittableRandom random = new SplittableRandom(20260414L);
|
||||
|
||||
for (int index = 0; index < stemCount; index++) {
|
||||
final String stem = createStem(index);
|
||||
final String[] variants = createVariants(stem, random, index);
|
||||
|
||||
dictionaryBuilder.append(stem);
|
||||
lookupKeys.add(stem);
|
||||
for (String variant : variants) {
|
||||
dictionaryBuilder.append(' ').append(variant);
|
||||
lookupKeys.add(variant);
|
||||
}
|
||||
|
||||
final String homograph = createHomograph(index);
|
||||
dictionaryBuilder.append(' ').append(homograph);
|
||||
lookupKeys.add(homograph);
|
||||
ambiguousLookupKeys.add(homograph);
|
||||
|
||||
dictionaryBuilder.append('\n');
|
||||
}
|
||||
|
||||
return new BenchmarkCorpus(
|
||||
dictionaryBuilder.toString(),
|
||||
lookupKeys.toArray(String[]::new),
|
||||
ambiguousLookupKeys.toArray(String[]::new));
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds a compiled trie from benchmark corpus text.
|
||||
*
|
||||
* @param corpusText line-oriented dictionary text
|
||||
* @param reductionSettings reduction settings
|
||||
* @param storeOriginalStem whether the canonical stem itself should also be
|
||||
* inserted with the no-op patch
|
||||
* @return compiled trie containing patch commands
|
||||
* @throws IOException if parsing fails
|
||||
*/
|
||||
static FrequencyTrie<String> compilePatchTrie(
|
||||
final String corpusText,
|
||||
final ReductionSettings reductionSettings,
|
||||
final boolean storeOriginalStem) throws IOException {
|
||||
Objects.requireNonNull(corpusText, "corpusText");
|
||||
Objects.requireNonNull(reductionSettings, "reductionSettings");
|
||||
|
||||
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(String[]::new, reductionSettings);
|
||||
final PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
|
||||
StemmerDictionaryParser.parse(
|
||||
new StringReader(corpusText),
|
||||
"benchmark-corpus",
|
||||
(stem, variants, lineNumber) -> {
|
||||
if (storeOriginalStem) {
|
||||
builder.put(stem, encoder.encode(stem, stem));
|
||||
}
|
||||
for (String variant : variants) {
|
||||
builder.put(variant, encoder.encode(variant, stem));
|
||||
}
|
||||
});
|
||||
|
||||
return builder.build();
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates one deterministic stem.
|
||||
*
|
||||
* @param index stem ordinal
|
||||
* @return generated stem
|
||||
*/
|
||||
private static String createStem(final int index) {
|
||||
final String prefix = PREFIXES[index % PREFIXES.length];
|
||||
final String suffix = STEM_SUFFIXES[(index / PREFIXES.length) % STEM_SUFFIXES.length];
|
||||
return (prefix + suffix + base36(index)).toLowerCase(Locale.ROOT);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a set of deterministic variants for one stem.
|
||||
*
|
||||
* @param stem canonical stem
|
||||
* @param random deterministic random source
|
||||
* @param index stem ordinal
|
||||
* @return generated variants in stable order
|
||||
*/
|
||||
private static String[] createVariants(final String stem, final SplittableRandom random, final int index) {
|
||||
final List<String> variants = new ArrayList<>(8);
|
||||
variants.add(stem + "s");
|
||||
variants.add(stem + "ed");
|
||||
variants.add(stem + "ing");
|
||||
variants.add(stem + "er");
|
||||
variants.add(stem + "ers");
|
||||
variants.add("pre" + stem);
|
||||
variants.add(stem + random.nextInt(10));
|
||||
|
||||
if ((index & 1) == 0) {
|
||||
variants.add(stem + "ly");
|
||||
}
|
||||
if (stem.length() > 5) {
|
||||
variants.add(stem.substring(0, stem.length() - 1));
|
||||
}
|
||||
return variants.toArray(String[]::new);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates an ambiguous surface form shared by a small group of stems.
|
||||
*
|
||||
* @param index stem ordinal
|
||||
* @return shared homograph form
|
||||
*/
|
||||
private static String createHomograph(final int index) {
|
||||
return "shared" + base36(index / HOMOGRAPH_GROUP_SIZE);
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts an ordinal into a compact base-36 discriminator.
|
||||
*
|
||||
* @param value numeric value
|
||||
* @return compact discriminator
|
||||
*/
|
||||
private static String base36(final int value) {
|
||||
return Integer.toString(value, Character.MAX_RADIX);
|
||||
}
|
||||
|
||||
/**
|
||||
* Immutable benchmark corpus.
|
||||
*
|
||||
* @param dictionaryText full line-oriented dictionary text
|
||||
* @param lookupKeys keys used for general lookup measurements
|
||||
* @param ambiguousLookupKeys keys that return multiple patch candidates from
|
||||
* {@code getAll()}
|
||||
*/
|
||||
record BenchmarkCorpus(String dictionaryText, String[] lookupKeys, String[] ambiguousLookupKeys) {
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,94 @@
|
||||
package org.egothor.stemmer.benchmark;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import org.egothor.stemmer.ReductionMode;
|
||||
import org.egothor.stemmer.ReductionSettings;
|
||||
import org.openjdk.jmh.annotations.Benchmark;
|
||||
import org.openjdk.jmh.annotations.BenchmarkMode;
|
||||
import org.openjdk.jmh.annotations.Level;
|
||||
import org.openjdk.jmh.annotations.Measurement;
|
||||
import org.openjdk.jmh.annotations.Mode;
|
||||
import org.openjdk.jmh.annotations.OutputTimeUnit;
|
||||
import org.openjdk.jmh.annotations.Param;
|
||||
import org.openjdk.jmh.annotations.Scope;
|
||||
import org.openjdk.jmh.annotations.Setup;
|
||||
import org.openjdk.jmh.annotations.State;
|
||||
import org.openjdk.jmh.annotations.Warmup;
|
||||
import org.openjdk.jmh.infra.Blackhole;
|
||||
|
||||
/**
|
||||
* Benchmarks end-to-end dictionary compilation for different reduction modes.
|
||||
*
|
||||
* <p>
|
||||
* This benchmark measures the offline path that matters for dictionary build
|
||||
* workflows: dictionary parsing, patch-command generation, mutable trie
|
||||
* population, subtree reduction, and freezing into the compiled read-only trie.
|
||||
* </p>
|
||||
*/
|
||||
@BenchmarkMode(Mode.AverageTime)
|
||||
@OutputTimeUnit(TimeUnit.MILLISECONDS)
|
||||
@Warmup(iterations = 3, time = 1)
|
||||
@Measurement(iterations = 5, time = 1)
|
||||
public class FrequencyTrieCompilationBenchmark {
|
||||
|
||||
/**
|
||||
* Shared benchmark state for compilation scenarios.
|
||||
*/
|
||||
@State(Scope.Benchmark)
|
||||
public static class CompilationState {
|
||||
|
||||
/**
|
||||
* Number of canonical stems to generate.
|
||||
*/
|
||||
@Param({ "2000", "10000" })
|
||||
public int stemCount;
|
||||
|
||||
/**
|
||||
* Reduction mode used during trie compilation.
|
||||
*/
|
||||
@Param({
|
||||
"MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS",
|
||||
"MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS",
|
||||
"MERGE_SUBTREES_WITH_EQUIVALENT_DOMINANT_GET_RESULTS"
|
||||
})
|
||||
public String reductionMode;
|
||||
|
||||
/**
|
||||
* Whether to store the stem itself using the canonical no-op patch.
|
||||
*/
|
||||
@Param({ "true", "false" })
|
||||
public boolean storeOriginalStem;
|
||||
|
||||
/**
|
||||
* Full dictionary text used as the benchmark input.
|
||||
*/
|
||||
private String dictionaryText;
|
||||
|
||||
/**
|
||||
* Initializes the benchmark state.
|
||||
*/
|
||||
@Setup(Level.Trial)
|
||||
public void setUp() {
|
||||
this.dictionaryText = BenchmarkCorpusSupport.createCorpus(this.stemCount).dictionaryText();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Measures end-to-end patch trie compilation latency.
|
||||
*
|
||||
* @param state prepared compilation state
|
||||
* @param blackhole sink preventing dead-code elimination
|
||||
* @throws IOException if dictionary parsing fails
|
||||
*/
|
||||
@Benchmark
|
||||
public void compilePatchTrie(final CompilationState state, final Blackhole blackhole) throws IOException {
|
||||
final ReductionSettings settings =
|
||||
ReductionSettings.withDefaults(ReductionMode.valueOf(state.reductionMode));
|
||||
blackhole.consume(
|
||||
BenchmarkCorpusSupport.compilePatchTrie(
|
||||
state.dictionaryText,
|
||||
settings,
|
||||
state.storeOriginalStem));
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,160 @@
|
||||
package org.egothor.stemmer.benchmark;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import org.egothor.stemmer.FrequencyTrie;
|
||||
import org.egothor.stemmer.PatchCommandEncoder;
|
||||
import org.egothor.stemmer.ReductionMode;
|
||||
import org.egothor.stemmer.ReductionSettings;
|
||||
import org.openjdk.jmh.annotations.Benchmark;
|
||||
import org.openjdk.jmh.annotations.BenchmarkMode;
|
||||
import org.openjdk.jmh.annotations.Level;
|
||||
import org.openjdk.jmh.annotations.Measurement;
|
||||
import org.openjdk.jmh.annotations.Mode;
|
||||
import org.openjdk.jmh.annotations.OutputTimeUnit;
|
||||
import org.openjdk.jmh.annotations.Param;
|
||||
import org.openjdk.jmh.annotations.Scope;
|
||||
import org.openjdk.jmh.annotations.Setup;
|
||||
import org.openjdk.jmh.annotations.State;
|
||||
import org.openjdk.jmh.annotations.Warmup;
|
||||
import org.openjdk.jmh.infra.Blackhole;
|
||||
|
||||
/**
|
||||
* Benchmarks lookup-oriented operations on compiled Radixor tries.
|
||||
*
|
||||
* <p>
|
||||
* The benchmark uses a deterministic morphology-shaped corpus and measures the
|
||||
* latency of the hot-path lookup operations that are relevant at runtime:
|
||||
* retrieving the preferred patch command, retrieving all candidate patch
|
||||
* commands, and reconstructing stems from the returned patch values.
|
||||
* </p>
|
||||
*/
|
||||
@BenchmarkMode(Mode.AverageTime)
|
||||
@OutputTimeUnit(TimeUnit.NANOSECONDS)
|
||||
@Warmup(iterations = 3, time = 1)
|
||||
@Measurement(iterations = 5, time = 1)
|
||||
public class FrequencyTrieLookupBenchmark {
|
||||
|
||||
/**
|
||||
* Shared benchmark state for lookup scenarios.
|
||||
*/
|
||||
@State(Scope.Benchmark)
|
||||
public static class LookupState {
|
||||
|
||||
/**
|
||||
* Number of canonical stems to generate.
|
||||
*/
|
||||
@Param({ "2000", "10000" })
|
||||
public int stemCount;
|
||||
|
||||
/**
|
||||
* Reduction mode used to compile the lookup trie.
|
||||
*/
|
||||
@Param({
|
||||
"MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS",
|
||||
"MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS",
|
||||
"MERGE_SUBTREES_WITH_EQUIVALENT_DOMINANT_GET_RESULTS"
|
||||
})
|
||||
public String reductionMode;
|
||||
|
||||
/**
|
||||
* Compiled trie under test.
|
||||
*/
|
||||
private FrequencyTrie<String> trie;
|
||||
|
||||
/**
|
||||
* Deterministic lookup keys.
|
||||
*/
|
||||
private String[] lookupKeys;
|
||||
|
||||
/**
|
||||
* Keys that are known to return multiple patch candidates from
|
||||
* {@code getAll()}.
|
||||
*/
|
||||
private String[] ambiguousLookupKeys;
|
||||
|
||||
/**
|
||||
* Initializes the benchmark state.
|
||||
*
|
||||
* @throws IOException if corpus compilation fails
|
||||
*/
|
||||
@Setup(Level.Trial)
|
||||
public void setUp() throws IOException {
|
||||
final BenchmarkCorpusSupport.BenchmarkCorpus corpus = BenchmarkCorpusSupport.createCorpus(this.stemCount);
|
||||
final ReductionSettings settings =
|
||||
ReductionSettings.withDefaults(ReductionMode.valueOf(this.reductionMode));
|
||||
this.trie = BenchmarkCorpusSupport.compilePatchTrie(corpus.dictionaryText(), settings, true);
|
||||
this.lookupKeys = corpus.lookupKeys();
|
||||
this.ambiguousLookupKeys = corpus.ambiguousLookupKeys();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Measures preferred patch lookup latency.
|
||||
*
|
||||
* @param state prepared lookup state
|
||||
* @param blackhole sink preventing dead-code elimination
|
||||
*/
|
||||
@Benchmark
|
||||
public void lookupPreferredPatch(final LookupState state, final Blackhole blackhole) {
|
||||
final String[] keys = state.lookupKeys;
|
||||
for (String key : keys) {
|
||||
final String patch = state.trie.get(key);
|
||||
if (patch == null) {
|
||||
throw new IllegalStateException("Missing preferred patch for key " + key + '.');
|
||||
}
|
||||
blackhole.consume(patch);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Measures retrieval of all patch candidates on ambiguous forms.
|
||||
*
|
||||
* @param state prepared lookup state
|
||||
* @param blackhole sink preventing dead-code elimination
|
||||
*/
|
||||
@Benchmark
|
||||
public void lookupAllPatches(final LookupState state, final Blackhole blackhole) {
|
||||
final String[] keys = state.ambiguousLookupKeys;
|
||||
for (String key : keys) {
|
||||
final String[] patches = state.trie.getAll(key);
|
||||
if (patches.length < 2) {
|
||||
throw new IllegalStateException("Expected multiple patches for key " + key + '.');
|
||||
}
|
||||
blackhole.consume(patches);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Measures end-to-end preferred stemming from lookup plus patch application.
|
||||
*
|
||||
* @param state prepared lookup state
|
||||
* @param blackhole sink preventing dead-code elimination
|
||||
*/
|
||||
@Benchmark
|
||||
public void stemPreferredVariant(final LookupState state, final Blackhole blackhole) {
|
||||
final String[] keys = state.lookupKeys;
|
||||
for (String key : keys) {
|
||||
final String patch = state.trie.get(key);
|
||||
blackhole.consume(PatchCommandEncoder.apply(key, patch));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Measures end-to-end full candidate stemming from {@code getAll()} plus
|
||||
* patch application.
|
||||
*
|
||||
* @param state prepared lookup state
|
||||
* @param blackhole sink preventing dead-code elimination
|
||||
*/
|
||||
@Benchmark
|
||||
public void stemAllVariants(final LookupState state, final Blackhole blackhole) {
|
||||
final String[] keys = state.ambiguousLookupKeys;
|
||||
for (String key : keys) {
|
||||
final String[] patches = state.trie.getAll(key);
|
||||
for (String patch : patches) {
|
||||
blackhole.consume(PatchCommandEncoder.apply(key, patch));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
10
src/jmh/java/org/egothor/stemmer/benchmark/package-info.java
Normal file
10
src/jmh/java/org/egothor/stemmer/benchmark/package-info.java
Normal file
@@ -0,0 +1,10 @@
|
||||
/**
|
||||
* JMH benchmarks for the Radixor algorithmic core.
|
||||
*
|
||||
* <p>
|
||||
* The benchmarks in this package focus on trie lookup latency, retrieval of all
|
||||
* candidate patch commands, and end-to-end dictionary compilation with
|
||||
* different reduction modes.
|
||||
* </p>
|
||||
*/
|
||||
package org.egothor.stemmer.benchmark;
|
||||
Reference in New Issue
Block a user