refactor: apply minor Radixor refinements and refresh dependency locks

This commit is contained in:
2026-04-16 21:31:01 +02:00
parent 594abe2c4b
commit 7e1aea72bf
8 changed files with 425 additions and 14 deletions

View File

@@ -7,11 +7,11 @@ com.google.code.gson:gson:2.13.2=pmd
com.google.errorprone:error_prone_annotations:2.41.0=pmd com.google.errorprone:error_prone_annotations:2.41.0=pmd
net.bytebuddy:byte-buddy-agent:1.17.7=jmhRuntimeClasspath,testCompileClasspath,testRuntimeClasspath net.bytebuddy:byte-buddy-agent:1.17.7=jmhRuntimeClasspath,testCompileClasspath,testRuntimeClasspath
net.bytebuddy:byte-buddy:1.17.7=jmhRuntimeClasspath,testCompileClasspath,testRuntimeClasspath net.bytebuddy:byte-buddy:1.17.7=jmhRuntimeClasspath,testCompileClasspath,testRuntimeClasspath
net.jqwik:jqwik-api:1.9.3=testCompileClasspath,testRuntimeClasspath net.jqwik:jqwik-api:1.9.3=jmhRuntimeClasspath,testCompileClasspath,testRuntimeClasspath
net.jqwik:jqwik-engine:1.9.3=testRuntimeClasspath net.jqwik:jqwik-engine:1.9.3=jmhRuntimeClasspath,testRuntimeClasspath
net.jqwik:jqwik-time:1.9.3=testCompileClasspath,testRuntimeClasspath net.jqwik:jqwik-time:1.9.3=jmhRuntimeClasspath,testCompileClasspath,testRuntimeClasspath
net.jqwik:jqwik-web:1.9.3=testCompileClasspath,testRuntimeClasspath net.jqwik:jqwik-web:1.9.3=jmhRuntimeClasspath,testCompileClasspath,testRuntimeClasspath
net.jqwik:jqwik:1.9.3=testCompileClasspath,testRuntimeClasspath net.jqwik:jqwik:1.9.3=jmhRuntimeClasspath,testCompileClasspath,testRuntimeClasspath
net.sf.jopt-simple:jopt-simple:4.9=pitest net.sf.jopt-simple:jopt-simple:4.9=pitest
net.sf.jopt-simple:jopt-simple:5.0.4=jmh,jmhCompileClasspath,jmhRuntimeClasspath net.sf.jopt-simple:jopt-simple:5.0.4=jmh,jmhCompileClasspath,jmhRuntimeClasspath
net.sf.saxon:Saxon-HE:12.9=pmd net.sf.saxon:Saxon-HE:12.9=pmd
@@ -24,7 +24,7 @@ org.apache.commons:commons-lang3:3.18.0=pitest
org.apache.commons:commons-lang3:3.20.0=pmd org.apache.commons:commons-lang3:3.20.0=pmd
org.apache.commons:commons-math3:3.6.1=jmh,jmhCompileClasspath,jmhRuntimeClasspath org.apache.commons:commons-math3:3.6.1=jmh,jmhCompileClasspath,jmhRuntimeClasspath
org.apache.commons:commons-text:1.14.0=pitest org.apache.commons:commons-text:1.14.0=pitest
org.apiguardian:apiguardian-api:1.1.2=testCompileClasspath,testRuntimeClasspath org.apiguardian:apiguardian-api:1.1.2=jmhRuntimeClasspath,testCompileClasspath,testRuntimeClasspath
org.checkerframework:checker-qual:3.52.1=pmd org.checkerframework:checker-qual:3.52.1=pmd
org.jacoco:org.jacoco.agent:0.8.14=jacocoAgent,jacocoAnt org.jacoco:org.jacoco.agent:0.8.14=jacocoAgent,jacocoAnt
org.jacoco:org.jacoco.ant:0.8.14=jacocoAnt org.jacoco:org.jacoco.ant:0.8.14=jacocoAnt

View File

@@ -426,6 +426,8 @@ public final class FrequencyTrie<V> {
childNodeIds[edgeIndex] = dataInput.readInt(); childNodeIds[edgeIndex] = dataInput.readInt();
} }
validateSerializedEdges(nodeIndex, edgeLabels);
final int valueCount = dataInput.readInt(); final int valueCount = dataInput.readInt();
if (valueCount < 0) { if (valueCount < 0) {
throw new IOException("Negative value count at node " + nodeIndex + ": " + valueCount); throw new IOException("Negative value count at node " + nodeIndex + ": " + valueCount);
@@ -474,6 +476,28 @@ public final class FrequencyTrie<V> {
return nodes; return nodes;
} }
/**
* Validates the serialized edge-label sequence for one node.
*
* <p>
* Compiled nodes rely on binary search for child lookup and therefore require
* edge labels to be stored in strict ascending order without duplicates.
* Rejecting malformed streams here keeps lookup semantics deterministic and
* avoids silently constructing a trie whose search behavior would be undefined.
*
* @param nodeIndex serialized node identifier
* @param edgeLabels serialized edge labels
* @throws IOException if the edge labels are not strictly ascending
*/
private static void validateSerializedEdges(final int nodeIndex, final char... edgeLabels) throws IOException {
for (int edgeIndex = 1; edgeIndex < edgeLabels.length; edgeIndex++) {
if (edgeLabels[edgeIndex - 1] >= edgeLabels[edgeIndex]) {
throw new IOException("Edge labels must be strictly ascending at node " + nodeIndex + ", edge index "
+ edgeIndex + ": '" + edgeLabels[edgeIndex - 1] + "' then '" + edgeLabels[edgeIndex] + "'.");
}
}
}
/** /**
* Locates the compiled node for the supplied key. * Locates the compiled node for the supplied key.
* *

View File

@@ -117,7 +117,14 @@ public final class PatchCommandEncoder {
private static final int MISMATCH_PENALTY = 100; private static final int MISMATCH_PENALTY = 100;
/** /**
* Extra headroom added when internal matrices need to grow. * Extra matrix headroom reserved beyond the immediately required dimensions.
*
* <p>
* A small fixed margin reduces repeated reallocation when a caller encodes many
* similarly sized terms in sequence. The value is intentionally modest: large
* enough to absorb minor size fluctuations, yet small enough to avoid
* materially over-allocating the reused dynamic-programming matrices.
* </p>
*/ */
private static final int CAPACITY_MARGIN = 8; private static final int CAPACITY_MARGIN = 8;
@@ -288,6 +295,7 @@ public final class PatchCommandEncoder {
* @param patchCommand compact patch command * @param patchCommand compact patch command
* @return transformed word, or {@code null} when {@code source} is {@code null} * @return transformed word, or {@code null} when {@code source} is {@code null}
*/ */
@SuppressWarnings({ "PMD.CyclomaticComplexity", "PMD.AvoidLiteralsInIfCondition" })
public static String apply(String source, String patchCommand) { public static String apply(String source, String patchCommand) {
if (source == null) { if (source == null) {
return null; return null;
@@ -299,6 +307,10 @@ public final class PatchCommandEncoder {
return source; return source;
} }
if ((patchCommand.length() & 1) != 0) {
return source;
}
StringBuilder result = new StringBuilder(source); StringBuilder result = new StringBuilder(source);
if (result.isEmpty()) { if (result.isEmpty()) {
@@ -312,11 +324,14 @@ public final class PatchCommandEncoder {
char opcode = patchCommand.charAt(patchIndex); char opcode = patchCommand.charAt(patchIndex);
char argument = patchCommand.charAt(patchIndex + 1); char argument = patchCommand.charAt(patchIndex + 1);
int encodedCount = argument - 'a' + 1;
switch (opcode) { switch (opcode) {
case SKIP_OPCODE: case SKIP_OPCODE:
position = position - encodedCount + 1; final int skipCount = decodeEncodedCount(argument);
if (skipCount < 1) {
return source;
}
position = position - skipCount + 1;
break; break;
case REPLACE_OPCODE: case REPLACE_OPCODE:
@@ -324,8 +339,12 @@ public final class PatchCommandEncoder {
break; break;
case DELETE_OPCODE: case DELETE_OPCODE:
final int deleteCount = decodeEncodedCount(argument);
if (deleteCount < 1) {
return source;
}
int deleteEndExclusive = position + 1; int deleteEndExclusive = position + 1;
position -= encodedCount - 1; position -= deleteCount - 1;
result.delete(position, deleteEndExclusive); result.delete(position, deleteEndExclusive);
break; break;
@@ -353,6 +372,26 @@ public final class PatchCommandEncoder {
return result.toString(); return result.toString();
} }
/**
* Decodes a compact count argument used by skip and delete instructions.
*
* <p>
* Valid encoded counts start at {@code 'a'} for one affected character. Values
* below {@code 'a'} are malformed and are reported to callers via the
* compatibility fallback path rather than by throwing a dedicated exception.
* </p>
*
* @param argument serialized count argument
* @return decoded positive count, or {@code -1} when the argument is malformed
*/
@SuppressWarnings("PMD.AvoidLiteralsInIfCondition")
private static int decodeEncodedCount(final char argument) {
if (argument < 'a') {
return -1;
}
return argument - 'a' + 1;
}
/** /**
* Applies a patch command to an empty source word. * Applies a patch command to an empty source word.
* *

View File

@@ -31,6 +31,7 @@
package org.egothor.stemmer.trie; package org.egothor.stemmer.trie;
import java.util.Arrays; import java.util.Arrays;
import java.util.Objects;
/** /**
* Immutable compiled trie node optimized for read access. * Immutable compiled trie node optimized for read access.
@@ -38,7 +39,9 @@ import java.util.Arrays;
* <p> * <p>
* The returned arrays are the internal backing storage of the compiled node. * The returned arrays are the internal backing storage of the compiled node.
* They are exposed for efficient access by closely related trie infrastructure * They are exposed for efficient access by closely related trie infrastructure
* and therefore must never be modified by callers. * and therefore must never be modified by callers. The node itself is still
* immutable from the public API perspective because construction wires these
* arrays once and all lookup operations thereafter treat them as read-only.
* *
* @param <V> value type * @param <V> value type
* @param edgeLabels internal edge label array * @param edgeLabels internal edge label array
@@ -46,8 +49,90 @@ import java.util.Arrays;
* @param orderedValues internal ordered values array * @param orderedValues internal ordered values array
* @param orderedCounts internal ordered counts array * @param orderedCounts internal ordered counts array
*/ */
@SuppressWarnings("PMD.DataClass")
public record CompiledNode<V>(char[] edgeLabels, CompiledNode<V>[] children, V[] orderedValues, int... orderedCounts) { public record CompiledNode<V>(char[] edgeLabels, CompiledNode<V>[] children, V[] orderedValues, int... orderedCounts) {
/**
* Creates one validated compiled node.
*
* @throws NullPointerException if any array argument is {@code null}
* @throws IllegalArgumentException if the edge-related arrays or value-related
* arrays do not have matching lengths
*/
public CompiledNode {
Objects.requireNonNull(edgeLabels, "edgeLabels");
Objects.requireNonNull(children, "children");
Objects.requireNonNull(orderedValues, "orderedValues");
Objects.requireNonNull(orderedCounts, "orderedCounts");
if (edgeLabels.length != children.length) {
throw new IllegalArgumentException("edgeLabels and children must have the same length.");
}
if (orderedValues.length != orderedCounts.length) {
throw new IllegalArgumentException("orderedValues and orderedCounts must have the same length.");
}
}
/**
* Returns the internal edge-label array.
*
* <p>
* The returned array is not copied for performance reasons and must be treated
* as read-only.
*
* @return internal edge-label array
*/
@Override
@SuppressWarnings("PMD.MethodReturnsInternalArray")
public char[] edgeLabels() {
return this.edgeLabels;
}
/**
* Returns the internal child-node array.
*
* <p>
* The returned array is not copied for performance reasons and must be treated
* as read-only by external callers.
*
* @return internal child-node array
*/
@Override
@SuppressWarnings("PMD.MethodReturnsInternalArray")
public CompiledNode<V>[] children() {
return this.children;
}
/**
* Returns the internal ordered-values array.
*
* <p>
* The returned array is not copied for performance reasons and must be treated
* as read-only.
*
* @return internal ordered-values array
*/
@Override
@SuppressWarnings("PMD.MethodReturnsInternalArray")
public V[] orderedValues() {
return this.orderedValues;
}
/**
* Returns the internal ordered-counts array.
*
* <p>
* The returned array is not copied for performance reasons and must be treated
* as read-only.
*
* @return internal ordered-counts array
*/
@Override
@SuppressWarnings("PMD.MethodReturnsInternalArray")
public int[] orderedCounts() {
return this.orderedCounts;
}
/** /**
* Finds a child for the supplied edge character. * Finds a child for the supplied edge character.
* *

View File

@@ -30,14 +30,18 @@
******************************************************************************/ ******************************************************************************/
package org.egothor.stemmer.trie; package org.egothor.stemmer.trie;
import java.util.Objects;
/** /**
* Intermediate node data used during deserialization before child references * Intermediate node data used during deserialization before child references
* are resolved. * are resolved.
* *
* <p> * <p>
* The arrays exposed by the accessors are the internal backing storage of this * The arrays exposed by the accessors are the internal backing storage of this
* holder. They are returned directly for efficiency and therefore must be * holder. They are returned directly for efficiency because the deserialization
* treated as read-only by callers. * pipeline copies references into immutable compiled nodes immediately after
* the record is created. Callers must therefore treat every returned array as
* read-only.
* *
* @param <V> value type * @param <V> value type
* @param edgeLabels edge labels * @param edgeLabels edge labels
@@ -45,6 +49,87 @@ package org.egothor.stemmer.trie;
* @param orderedValues ordered values * @param orderedValues ordered values
* @param orderedCounts ordered counts * @param orderedCounts ordered counts
*/ */
@SuppressWarnings("PMD.DataClass")
public record NodeData<V>(char[] edgeLabels, int[] childNodeIds, V[] orderedValues, int... orderedCounts) { public record NodeData<V>(char[] edgeLabels, int[] childNodeIds, V[] orderedValues, int... orderedCounts) {
/**
* Creates one validated node-data holder.
*
* @throws NullPointerException if any array argument is {@code null}
* @throws IllegalArgumentException if the edge-related arrays or value-related
* arrays do not have matching lengths
*/
public NodeData {
Objects.requireNonNull(edgeLabels, "edgeLabels");
Objects.requireNonNull(childNodeIds, "childNodeIds");
Objects.requireNonNull(orderedValues, "orderedValues");
Objects.requireNonNull(orderedCounts, "orderedCounts");
if (edgeLabels.length != childNodeIds.length) {
throw new IllegalArgumentException("edgeLabels and childNodeIds must have the same length.");
}
if (orderedValues.length != orderedCounts.length) {
throw new IllegalArgumentException("orderedValues and orderedCounts must have the same length.");
}
}
/**
* Returns the internal edge-label array.
*
* <p>
* The returned array is not copied for performance reasons and must be treated
* as read-only.
*
* @return internal edge-label array
*/
@Override
@SuppressWarnings("PMD.MethodReturnsInternalArray")
public char[] edgeLabels() {
return this.edgeLabels;
}
/**
* Returns the internal child-node identifier array.
*
* <p>
* The returned array is not copied for performance reasons and must be treated
* as read-only.
*
* @return internal child-node identifier array
*/
@Override
@SuppressWarnings("PMD.MethodReturnsInternalArray")
public int[] childNodeIds() {
return this.childNodeIds;
}
/**
* Returns the internal ordered-values array.
*
* <p>
* The returned array is not copied for performance reasons and must be treated
* as read-only.
*
* @return internal ordered-values array
*/
@Override
@SuppressWarnings("PMD.MethodReturnsInternalArray")
public V[] orderedValues() {
return this.orderedValues;
}
/**
* Returns the internal ordered-counts array.
*
* <p>
* The returned array is not copied for performance reasons and must be treated
* as read-only.
*
* @return internal ordered-counts array
*/
@Override
@SuppressWarnings("PMD.MethodReturnsInternalArray")
public int[] orderedCounts() {
return this.orderedCounts;
}
} }

View File

@@ -733,6 +733,30 @@ class FrequencyTrieTest {
assertTrue(exception.getMessage().contains("Invalid root node id")); assertTrue(exception.getMessage().contains("Invalid root node id"));
} }
/**
* Verifies that deserialization rejects unsorted or duplicate serialized edge
* labels because compiled lookup relies on binary search over a strictly
* ascending edge array.
*/
@Test
@Tag("persistence")
@DisplayName("readFrom rejects non-ascending serialized edge labels")
void readFromRejectsNonAscendingSerializedEdgeLabels() {
final byte[] bytes = createSerializedStream(0x45475452, 1, 1, 0, new NodeWriter[] { dataOutput -> {
dataOutput.writeInt(2);
dataOutput.writeChar('b');
dataOutput.writeInt(0);
dataOutput.writeChar('a');
dataOutput.writeInt(0);
dataOutput.writeInt(0);
} });
final IOException exception = assertThrows(IOException.class,
() -> FrequencyTrie.readFrom(new ByteArrayInputStream(bytes), String[]::new, STRING_CODEC));
assertTrue(exception.getMessage().contains("Edge labels must be strictly ascending"));
}
/** /**
* Verifies that deserialization rejects non-positive stored counts. * Verifies that deserialization rejects non-positive stored counts.
*/ */

View File

@@ -174,7 +174,13 @@ class PatchCommandEncoderTest {
// 9 // 9
Arguments.of(9, "", "-a"), Arguments.of(9, "", "-a"),
// 10 // 10
Arguments.of(10, "", "Ra")); Arguments.of(10, "", "Ra"),
// 11
Arguments.of(11, "abc", "D`"),
// 12
Arguments.of(12, "abc", "-`"),
// 13
Arguments.of(13, "", "D`"));
} }
/** /**

View File

@@ -0,0 +1,148 @@
/*******************************************************************************
* Copyright (C) 2026, Leo Galambos
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. All advertising materials mentioning features or use of this software must
* display the following acknowledgement:
* This product includes software developed by the Egothor project.
*
* 4. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
******************************************************************************/
package org.egothor.stemmer.trie;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertSame;
import static org.junit.jupiter.api.Assertions.assertThrows;
import org.junit.jupiter.api.DisplayName;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
/**
* Unit tests for {@link CompiledNode} and {@link NodeData} validation and
* documented backing-array exposure.
*/
@Tag("unit")
@Tag("fast")
@Tag("trie")
@DisplayName("CompiledNode and NodeData")
class CompiledNodeAndNodeDataTest {
/**
* Verifies that {@link NodeData} rejects mismatched edge-related array lengths.
*/
@Test
@DisplayName("NodeData rejects mismatched edge arrays")
void nodeDataShouldRejectMismatchedEdgeArrays() {
final IllegalArgumentException exception = assertThrows(IllegalArgumentException.class,
() -> new NodeData<String>(new char[] { 'a' }, new int[0], new String[0], new int[0]));
assertEquals("edgeLabels and childNodeIds must have the same length.", exception.getMessage());
}
/**
* Verifies that {@link NodeData} rejects mismatched value-related array
* lengths.
*/
@Test
@DisplayName("NodeData rejects mismatched value arrays")
void nodeDataShouldRejectMismatchedValueArrays() {
final IllegalArgumentException exception = assertThrows(IllegalArgumentException.class,
() -> new NodeData<String>(new char[0], new int[0], new String[] { "stem" }, new int[0]));
assertEquals("orderedValues and orderedCounts must have the same length.", exception.getMessage());
}
/**
* Verifies that {@link NodeData} continues to expose the documented backing
* arrays directly.
*/
@Test
@DisplayName("NodeData accessors expose documented backing arrays")
void nodeDataAccessorsShouldExposeDocumentedBackingArrays() {
final char[] edgeLabels = new char[] { 'a' };
final int[] childNodeIds = new int[] { 7 };
final String[] orderedValues = new String[] { "stem" };
final int[] orderedCounts = new int[] { 3 };
final NodeData<String> nodeData = new NodeData<>(edgeLabels, childNodeIds, orderedValues, orderedCounts);
assertSame(edgeLabels, nodeData.edgeLabels());
assertSame(childNodeIds, nodeData.childNodeIds());
assertSame(orderedValues, nodeData.orderedValues());
assertSame(orderedCounts, nodeData.orderedCounts());
}
/**
* Verifies that {@link CompiledNode} rejects mismatched edge and child arrays.
*/
@Test
@DisplayName("CompiledNode rejects mismatched edge and child arrays")
void compiledNodeShouldRejectMismatchedEdgeAndChildArrays() {
@SuppressWarnings("unchecked")
final CompiledNode<String>[] children = new CompiledNode[0];
final IllegalArgumentException exception = assertThrows(IllegalArgumentException.class,
() -> new CompiledNode<String>(new char[] { 'a' }, children, new String[0], new int[0]));
assertEquals("edgeLabels and children must have the same length.", exception.getMessage());
}
/**
* Verifies that {@link CompiledNode} rejects mismatched value arrays.
*/
@Test
@DisplayName("CompiledNode rejects mismatched value arrays")
void compiledNodeShouldRejectMismatchedValueArrays() {
@SuppressWarnings("unchecked")
final CompiledNode<String>[] children = new CompiledNode[0];
final IllegalArgumentException exception = assertThrows(IllegalArgumentException.class,
() -> new CompiledNode<String>(new char[0], children, new String[] { "stem" }, new int[0]));
assertEquals("orderedValues and orderedCounts must have the same length.", exception.getMessage());
}
/**
* Verifies that {@link CompiledNode} continues to expose the documented backing
* arrays directly.
*/
@Test
@DisplayName("CompiledNode accessors expose documented backing arrays")
void compiledNodeAccessorsShouldExposeDocumentedBackingArrays() {
final char[] edgeLabels = new char[] { 'a' };
@SuppressWarnings("unchecked")
final CompiledNode<String>[] children = new CompiledNode[1];
final String[] orderedValues = new String[] { "stem" };
final int[] orderedCounts = new int[] { 5 };
final CompiledNode<String> node = new CompiledNode<>(edgeLabels, children, orderedValues, orderedCounts);
assertSame(edgeLabels, node.edgeLabels());
assertSame(children, node.children());
assertSame(orderedValues, node.orderedValues());
assertSame(orderedCounts, node.orderedCounts());
}
}