test to see if ai-generated fuzzy search is better

This commit is contained in:
Valentijn van der Jagt
2025-12-29 23:14:52 +01:00
parent adfb3a9fb2
commit 5b765f9a7d
2 changed files with 100 additions and 34 deletions

View File

@@ -65,59 +65,104 @@ public class BonusManager {
public static int fuzzyMatchScore(String query, String title) { public static int fuzzyMatchScore(String query, String title) {
if (query == null || title == null || query.isEmpty() || title.isEmpty()) { if (query == null || title == null || query.isEmpty() || title.isEmpty()) {
return query == null ? (title == null ? 100 : 0) : (title.isEmpty() ? 100 : 0); return 0;
} }
// Normalize both strings: remove diacritics, lowercase, remove special chars
String normalizedQuery = normalize(query); String normalizedQuery = normalize(query);
String normalizedTitle = normalize(title); String normalizedTitle = normalize(title);
// Exact match after normalization // ===== TIER 1: EXACT WORD MATCH (highest priority) =====
if (normalizedTitle.equals(normalizedQuery)) { if (isExactWordMatch(normalizedTitle, normalizedQuery)) {
return 100; return 100;
} }
// Substring match (query is contained in title) // ===== TIER 2: WORD-BOUNDARY SUBSTRING =====
if (normalizedTitle.contains(normalizedQuery)) { if (isWordBoundaryMatch(normalizedTitle, normalizedQuery)) {
return 95; // Very high score but slightly less than exact return 95;
} }
// ===== TIER 3: PREFIX MATCH =====
if (isPrefixMatch(normalizedTitle, normalizedQuery)) {
return 85;
}
// ===== TIER 4: LEVENSHTEIN (typo tolerance) =====
int qlen = normalizedQuery.length(); int qlen = normalizedQuery.length();
int tlen = normalizedTitle.length(); int tlen = normalizedTitle.length();
// Query longer than title - impossible match
if (qlen > tlen) { if (qlen > tlen) {
return 0; return 0;
} }
// Find the best matching substring using Levenshtein distance
int bestDistance = Integer.MAX_VALUE; int bestDistance = Integer.MAX_VALUE;
int bestPosition = -1;
for (int i = 0; i <= tlen - qlen; i++) { for (int i = 0; i <= tlen - qlen; i++) {
String sub = normalizedTitle.substring(i, i + qlen); String sub = normalizedTitle.substring(i, i + qlen);
int dist = LevenshteinDistance.calculate(normalizedQuery, sub); int dist = levenshteinDistance(normalizedQuery, sub);
if (dist < bestDistance) { if (dist < bestDistance) {
bestDistance = dist; bestDistance = dist;
bestPosition = i; if (dist == 0) break;
if (dist == 0) break; // Perfect match found, can't do better
} }
} }
// Calculate score: 100% at distance 0, scales down with distance // Allow up to 2 edits (typo tolerance)
// Normalize by query length for consistency if (bestDistance <= 2) {
double similarity = 1.0 - (bestDistance / (double) qlen); // Distance 0 = 80, Distance 1 = 70, Distance 2 = 60
int score = 80 - (bestDistance * 10);
// Apply position bonus: matches at the start are better return Math.max(0, score);
if (bestPosition == 0) {
similarity *= 1.1; // 10% boost for start matches
} }
// Clamp to 0-100 return 0;
int score = (int) (similarity * 100.0); }
return Math.max(0, Math.min(100, score));
/**
 * Tier 1 check: reports whether the query is the entire title or exactly one of
 * the title's whitespace-separated words.
 *
 * <p>Both arguments are expected to be already normalized by the caller.
 * For example, the query {@code "lor"} matches the titles {@code "lor"} and
 * {@code "lor coffee"}, but not {@code "loreal"}. A multi-word query such as
 * {@code "cafe latte"} can never equal a single word, so it is compared against
 * the whole title first.
 *
 * @param title the normalized title to search in
 * @param query the normalized query to look for
 * @return {@code true} if the title equals the query or contains it as a whole
 *         word; {@code false} otherwise, including when either argument is
 *         {@code null} or the query is empty
 */
private static boolean isExactWordMatch(String title, String query) {
    // An empty query must never count as a match: split() can yield an empty
    // leading token (e.g. for " abc"), which would otherwise equal "".
    if (title == null || query == null || query.isEmpty()) {
        return false;
    }
    // Whole-title equality covers multi-word queries, which the per-word
    // comparison below cannot match.
    if (title.equals(query)) {
        return true;
    }
    for (String word : title.split("\\s+")) {
        if (word.equals(query)) {
            return true;
        }
    }
    return false;
}
/**
 * Tier 2 check: reports whether the query occurs in the title as a run of whole
 * words, i.e. bounded on BOTH sides by a space or by the start/end of the title.
 *
 * <p>For example, {@code "cafe latte"} matches {@code "cafe latte macchiato"} and
 * {@code "lor"} matches {@code "cafe lor"}, but {@code "lor"} does not match
 * {@code "cafe loreal"} and {@code "real"} does not match {@code "loreal"}
 * (partial-word hits are left to the later tiers).
 *
 * <p>NOTE(review): words are assumed to be separated by single spaces, as the
 * caller's normalization step appears to guarantee - confirm against
 * {@code normalize}.
 *
 * @param title the normalized title to search in
 * @param query the normalized query to look for
 * @return {@code true} if the query appears in the title on word boundaries;
 *         {@code false} otherwise, including when either argument is
 *         {@code null} or the query is empty
 */
private static boolean isWordBoundaryMatch(String title, String query) {
    if (title == null || query == null || query.isEmpty()) {
        return false;
    }
    // Padding both strings with a space turns "start of title" and "end of title"
    // into ordinary space boundaries, so one contains() call checks the leading
    // and the trailing boundary regardless of where the query sits in the title.
    String paddedTitle = " " + title + " ";
    String paddedQuery = " " + query + " ";
    return paddedTitle.contains(paddedQuery);
}
/**
* Prefix match: query is the start of any word
* "CAF" matches in "CAFFE" or "CAFE LATTE"
*/
private static boolean isPrefixMatch(String title, String query) {
for (String word : title.split("\\s+")) {
if (word.startsWith(query) && word.length() > query.length()) {
return true;
}
}
return false;
} }
private static String normalize(String input) { private static String normalize(String input) {
@@ -125,14 +170,12 @@ public class BonusManager {
return input; return input;
} }
// Unicode decomposition: separate base chars from diacritics // Remove diacritics
String decomposed = Normalizer.normalize(input, Normalizer.Form.NFD); String decomposed = Normalizer.normalize(input, Normalizer.Form.NFD);
// Remove all combining diacritical marks
String withoutDiacritics = decomposed String withoutDiacritics = decomposed
.replaceAll("\\p{InCombiningDiacriticalMarks}+", ""); .replaceAll("\\p{InCombiningDiacriticalMarks}+", "");
// Lowercase and remove special characters (keep alphanumeric + spaces) // Lowercase, remove special chars, normalize spaces
String cleaned = withoutDiacritics String cleaned = withoutDiacritics
.toLowerCase() .toLowerCase()
.replaceAll("[^a-z0-9\\s]", "") .replaceAll("[^a-z0-9\\s]", "")
@@ -141,4 +184,32 @@ public class BonusManager {
return cleaned; return cleaned;
} }
/**
 * Computes the Levenshtein edit distance between two strings: the minimum
 * number of single-character insertions, deletions and substitutions needed
 * to turn one into the other.
 *
 * @param query the first string
 * @param title the second string
 * @return the edit distance; equals the other string's length when one of the
 *         strings is empty
 */
private static int levenshteinDistance(String query, String title) {
    final int rows = query.length();
    final int cols = title.length();
    if (rows == 0) {
        return cols;
    }
    if (cols == 0) {
        return rows;
    }

    // Only the previous and the current row of the distance table are needed.
    int[] previous = new int[cols + 1];
    int[] current = new int[cols + 1];
    for (int c = 0; c <= cols; c++) {
        previous[c] = c;
    }

    for (int r = 1; r <= rows; r++) {
        current[0] = r;
        final char expected = query.charAt(r - 1);
        for (int c = 1; c <= cols; c++) {
            if (expected == title.charAt(c - 1)) {
                // Characters agree: carry the diagonal cost over unchanged.
                current[c] = previous[c - 1];
            } else {
                // Cheapest of substitution, deletion and insertion, plus one edit.
                int cheapest = Math.min(previous[c - 1], Math.min(previous[c], current[c - 1]));
                current[c] = cheapest + 1;
            }
        }
        int[] recycled = previous;
        previous = current;
        current = recycled;
    }
    return previous[cols];
}
} }

View File

@@ -1,13 +1,8 @@
package nl.herpiederpiee.appie_scraper; package nl.herpiederpiee.appie_scraper;
import com.microsoft.playwright.*;
import org.json.*;
import org.springframework.boot.SpringApplication; import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication; import org.springframework.boot.autoconfigure.SpringBootApplication;
import java.util.ArrayList;
import java.util.Scanner;
@SpringBootApplication @SpringBootApplication
public class Main { public class Main {