Test whether the AI-generated fuzzy search is better

This commit is contained in:
Valentijn van der Jagt
2025-12-29 23:09:55 +01:00
parent b619afe24f
commit adfb3a9fb2

View File

@@ -3,6 +3,7 @@ package nl.herpiederpiee.appie_scraper;
import com.microsoft.playwright.*;
import xyz.nextn.levenshteindistance.LevenshteinDistance;
import java.text.Normalizer;
import java.util.ArrayList;
import java.util.concurrent.TimeUnit;
@@ -63,25 +64,81 @@ public class BonusManager {
}
public static int fuzzyMatchScore(String query, String title) {
query = query.toLowerCase();
title = title.toLowerCase();
if (title.contains(query)) {
return 100; // perfect match
if (query == null || title == null || query.isEmpty() || title.isEmpty()) {
return query == null ? (title == null ? 100 : 0) : (title.isEmpty() ? 100 : 0);
}
int best = Integer.MAX_VALUE;
// Normalize both strings: remove diacritics, lowercase, remove special chars
String normalizedQuery = normalize(query);
String normalizedTitle = normalize(title);
int qlen = query.length();
int tlen = title.length();
// Exact match after normalization
if (normalizedTitle.equals(normalizedQuery)) {
return 100;
}
// Substring match (query is contained in title)
if (normalizedTitle.contains(normalizedQuery)) {
return 95; // Very high score but slightly less than exact
}
int qlen = normalizedQuery.length();
int tlen = normalizedTitle.length();
// Query longer than title - impossible match
if (qlen > tlen) {
return 0;
}
// Find the best matching substring using Levenshtein distance
int bestDistance = Integer.MAX_VALUE;
int bestPosition = -1;
for (int i = 0; i <= tlen - qlen; i++) {
String sub = title.substring(i, i + qlen);
int dist = LevenshteinDistance.calculate(query, sub);
if (dist < best) best = dist;
String sub = normalizedTitle.substring(i, i + qlen);
int dist = LevenshteinDistance.calculate(normalizedQuery, sub);
if (dist < bestDistance) {
bestDistance = dist;
bestPosition = i;
if (dist == 0) break; // Perfect match found, can't do better
}
}
int score = (int)(100.0 * (1.0 - (best / (double) qlen))); // fancy manier om t naar een % match om te zetten
// Calculate score: 100% at distance 0, scales down with distance
// Normalize by query length for consistency
double similarity = 1.0 - (bestDistance / (double) qlen);
// Apply position bonus: matches at the start are better
if (bestPosition == 0) {
similarity *= 1.1; // 10% boost for start matches
}
// Clamp to 0-100
int score = (int) (similarity * 100.0);
return Math.max(0, Math.min(100, score));
}
/**
 * Reduces a string to a canonical form for comparison: diacritics removed,
 * lowercased, everything except {@code a-z}, {@code 0-9} and whitespace dropped,
 * whitespace runs collapsed to a single space, and leading/trailing space trimmed.
 *
 * <p>Note that removed characters are deleted, not replaced by a space, so
 * {@code "coca-cola"} becomes {@code "cocacola"}.
 *
 * @param input the text to normalize; may be null
 * @return the normalized text, or {@code input} itself when it is null or empty
 */
private static String normalize(String input) {
    if (input == null || input.isEmpty()) {
        return input;
    }

    // NFD splits an accented letter into its base letter plus combining marks,
    // so the marks can be stripped while the base letter survives.
    String decomposed = Normalizer.normalize(input, Normalizer.Form.NFD);
    String withoutDiacritics = decomposed
            .replaceAll("\\p{InCombiningDiacriticalMarks}+", "");

    // Locale.ROOT keeps lowercasing independent of the JVM default locale.
    // Under a Turkish locale, plain toLowerCase() maps 'I' to dotless 'ı',
    // which the a-z filter below would then silently delete.
    return withoutDiacritics
            .toLowerCase(java.util.Locale.ROOT)
            .replaceAll("[^a-z0-9\\s]", "")
            .replaceAll("\\s+", " ")
            .trim();
}
}