diff --git a/src/main/java/nl/herpiederpiee/appie_scraper/BonusManager.java b/src/main/java/nl/herpiederpiee/appie_scraper/BonusManager.java index d26a2f3..07c86d9 100644 --- a/src/main/java/nl/herpiederpiee/appie_scraper/BonusManager.java +++ b/src/main/java/nl/herpiederpiee/appie_scraper/BonusManager.java @@ -3,6 +3,7 @@ package nl.herpiederpiee.appie_scraper; import com.microsoft.playwright.*; import xyz.nextn.levenshteindistance.LevenshteinDistance; +import java.text.Normalizer; import java.util.ArrayList; import java.util.concurrent.TimeUnit; @@ -63,25 +64,81 @@ public class BonusManager { } public static int fuzzyMatchScore(String query, String title) { - query = query.toLowerCase(); - title = title.toLowerCase(); - - if (title.contains(query)) { - return 100; // perfect match + if (query == null || title == null || query.isEmpty() || title.isEmpty()) { + return query == null ? (title == null ? 100 : 0) : (title.isEmpty() ? 100 : 0); } - int best = Integer.MAX_VALUE; + // Normalize both strings: remove diacritics, lowercase, remove special chars + String normalizedQuery = normalize(query); + String normalizedTitle = normalize(title); - int qlen = query.length(); - int tlen = title.length(); + // Exact match after normalization + if (normalizedTitle.equals(normalizedQuery)) { + return 100; + } + + // Substring match (query is contained in title) + if (normalizedTitle.contains(normalizedQuery)) { + return 95; // Very high score but slightly less than exact + } + + int qlen = normalizedQuery.length(); + int tlen = normalizedTitle.length(); + + // Query longer than title - impossible match + if (qlen > tlen) { + return 0; + } + + // Find the best matching substring using Levenshtein distance + int bestDistance = Integer.MAX_VALUE; + int bestPosition = -1; for (int i = 0; i <= tlen - qlen; i++) { - String sub = title.substring(i, i + qlen); - int dist = LevenshteinDistance.calculate(query, sub); - if (dist < best) best = dist; + String sub = normalizedTitle.substring(i, i + qlen); + int dist = LevenshteinDistance.calculate(normalizedQuery, sub); + + if (dist < bestDistance) { + bestDistance = dist; + bestPosition = i; + + if (dist == 0) break; // Perfect match found, can't do better + } } - int score = (int)(100.0 * (1.0 - (best / (double) qlen))); // fancy manier om t naar een % match om te zetten + // Calculate score: 100% at distance 0, scales down with distance + // Normalize by query length for consistency + double similarity = 1.0 - (bestDistance / (double) qlen); + + // Apply position bonus: matches at the start are better + if (bestPosition == 0) { + similarity *= 1.1; // 10% boost for start matches + } + + // Clamp to 0-100 + int score = (int) (similarity * 100.0); return Math.max(0, Math.min(100, score)); } + + private static String normalize(String input) { + if (input == null || input.isEmpty()) { + return input; + } + + // Unicode decomposition: separate base chars from diacritics + String decomposed = Normalizer.normalize(input, Normalizer.Form.NFD); + + // Remove all combining diacritical marks + String withoutDiacritics = decomposed + .replaceAll("\\p{InCombiningDiacriticalMarks}+", ""); + + // Lowercase and remove special characters (keep alphanumeric + spaces) + String cleaned = withoutDiacritics + .toLowerCase() + .replaceAll("[^a-z0-9\\s]", "") + .replaceAll("\\s+", " ") + .trim(); + + return cleaned; + } }