220 lines
6.8 KiB
Java
220 lines
6.8 KiB
Java
package nl.herpiederpiee.appie_scraper;
|
|
|
|
import com.microsoft.playwright.*;
import xyz.nextn.levenshteindistance.LevenshteinDistance;

import java.text.Normalizer;
import java.util.ArrayList;
import java.util.Locale;
import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;
|
|
|
|
public class BonusManager {
|
|
static ArrayList<BonusItem> bonusItems = new ArrayList<BonusItem>();;
|
|
|
|
public static void updateBonusItems(){
|
|
try (Playwright playwright = Playwright.create()) {
|
|
|
|
Browser browser = playwright.chromium().launch(new BrowserType.LaunchOptions().setHeadless(false));
|
|
|
|
BrowserContext context = browser.newContext();
|
|
Page bonusPagina = context.newPage();
|
|
|
|
|
|
bonusPagina.navigate("https://www.ah.nl/bonus");
|
|
TimeUnit.SECONDS.sleep(5); // wait for page to actaully fully load
|
|
|
|
Locator bonusElements = bonusPagina.locator(".promotion-card_root__tQA3z");
|
|
for (ElementHandle bonusElement : bonusElements.elementHandles()){
|
|
BonusItem bonusItem = new BonusItem(bonusElement);
|
|
|
|
// exclude annoying elements
|
|
if (bonusItem.category.equals( "onlineOnly")) continue;
|
|
if (bonusItem.category.equals( "gall")) continue;
|
|
if (bonusItem.category.equals( "gall-card")) continue;
|
|
if (bonusItem.category.equals( "etos")) continue;
|
|
|
|
bonusItems.add(bonusItem);
|
|
System.out.println("added bonusItem " + bonusItem.title);
|
|
}
|
|
|
|
} catch (InterruptedException e) {
|
|
throw new RuntimeException(e);
|
|
}
|
|
}
|
|
|
|
public static ArrayList<BonusItem> getBonusItems(String name){
|
|
ArrayList<Pair<BonusItem, Integer>> list = new ArrayList<>();
|
|
|
|
if (name == null || name.trim().isEmpty()){
|
|
return bonusItems;
|
|
}
|
|
|
|
for (BonusItem bonusItem : bonusItems) {
|
|
Integer score = fuzzyMatchScore(name, bonusItem.title);
|
|
list.add(Pair.pair(bonusItem, score));
|
|
}
|
|
list.sort((a, b) -> Integer.compare(b.second, a.second));
|
|
|
|
if (list.get(0).second.equals(0)){
|
|
return new ArrayList<>();
|
|
}
|
|
ArrayList<BonusItem> top10 = new ArrayList<>();
|
|
int i = 0;
|
|
while (top10.size() < 10) {
|
|
top10.add(list.get(i).first);
|
|
i++;
|
|
}
|
|
|
|
return top10;
|
|
}
|
|
|
|
public static int fuzzyMatchScore(String query, String title) {
|
|
if (query == null || title == null || query.isEmpty() || title.isEmpty()) {
|
|
return 0;
|
|
}
|
|
|
|
String normalizedQuery = normalize(query);
|
|
String normalizedTitle = normalize(title);
|
|
|
|
// ===== TIER 1: EXACT WORD MATCH (highest priority) =====
|
|
if (isExactWordMatch(normalizedTitle, normalizedQuery)) {
|
|
return 100;
|
|
}
|
|
|
|
// ===== TIER 2: WORD-BOUNDARY SUBSTRING =====
|
|
if (isWordBoundaryMatch(normalizedTitle, normalizedQuery)) {
|
|
return 95;
|
|
}
|
|
|
|
// ===== TIER 3: PREFIX MATCH =====
|
|
if (isPrefixMatch(normalizedTitle, normalizedQuery)) {
|
|
return 85;
|
|
}
|
|
|
|
// ===== TIER 4: LEVENSHTEIN (typo tolerance) =====
|
|
int qlen = normalizedQuery.length();
|
|
int tlen = normalizedTitle.length();
|
|
|
|
if (qlen > tlen) {
|
|
return 0;
|
|
}
|
|
|
|
int bestDistance = Integer.MAX_VALUE;
|
|
|
|
for (int i = 0; i <= tlen - qlen; i++) {
|
|
String sub = normalizedTitle.substring(i, i + qlen);
|
|
int dist = levenshteinDistance(normalizedQuery, sub);
|
|
if (dist < bestDistance) {
|
|
bestDistance = dist;
|
|
if (dist == 0) break;
|
|
}
|
|
}
|
|
|
|
// Allow up to 2 edits (typo tolerance)
|
|
if (bestDistance <= 2) {
|
|
// Distance 0 = 80, Distance 1 = 70, Distance 2 = 60
|
|
int score = 80 - (bestDistance * 10);
|
|
return Math.max(0, score);
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
/**
|
|
* Exact word match: query must be surrounded by word boundaries or string edges
|
|
* "LOR" matches "L OR" or "LOR coffee" but NOT "LOREAL"
|
|
*/
|
|
private static boolean isExactWordMatch(String title, String query) {
|
|
String[] words = title.split("\\s+");
|
|
for (String word : words) {
|
|
if (word.equals(query)) {
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
/**
|
|
* Word boundary match: query matches at word start/end
|
|
* "LOR" matches in "L'OR" (after special char removed)
|
|
* "REAL" matches in "LOREAL" as word boundary? No, stays in Tier 4
|
|
*/
|
|
private static boolean isWordBoundaryMatch(String title, String query) {
|
|
// Check if query appears after space or at start
|
|
if (title.startsWith(query + " ")) {
|
|
return true;
|
|
}
|
|
|
|
if (title.contains(" " + query)) {
|
|
return true;
|
|
}
|
|
|
|
// Check if query ends at word boundary
|
|
if (title.endsWith(" " + query)) {
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
/**
|
|
* Prefix match: query is the start of any word
|
|
* "CAF" matches in "CAFFE" or "CAFE LATTE"
|
|
*/
|
|
private static boolean isPrefixMatch(String title, String query) {
|
|
for (String word : title.split("\\s+")) {
|
|
if (word.startsWith(query) && word.length() > query.length()) {
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
private static String normalize(String input) {
|
|
if (input == null || input.isEmpty()) {
|
|
return input;
|
|
}
|
|
|
|
// Remove diacritics
|
|
String decomposed = Normalizer.normalize(input, Normalizer.Form.NFD);
|
|
String withoutDiacritics = decomposed
|
|
.replaceAll("\\p{InCombiningDiacriticalMarks}+", "");
|
|
|
|
// Lowercase, remove special chars, normalize spaces
|
|
String cleaned = withoutDiacritics
|
|
.toLowerCase()
|
|
.replaceAll("[^a-z0-9\\s]", "")
|
|
.replaceAll("\\s+", " ")
|
|
.trim();
|
|
|
|
return cleaned;
|
|
}
|
|
|
|
private static int levenshteinDistance(String query, String title) {
|
|
int qlen = query.length();
|
|
int tlen = title.length();
|
|
|
|
if (qlen == 0) return tlen;
|
|
if (tlen == 0) return qlen;
|
|
|
|
int[][] dp = new int[qlen + 1][tlen + 1];
|
|
|
|
for (int i = 0; i <= qlen; i++) dp[i][0] = i;
|
|
for (int j = 0; j <= tlen; j++) dp[0][j] = j;
|
|
|
|
for (int i = 1; i <= qlen; i++) {
|
|
for (int j = 1; j <= tlen; j++) {
|
|
if (query.charAt(i - 1) == title.charAt(j - 1)) {
|
|
dp[i][j] = dp[i - 1][j - 1];
|
|
} else {
|
|
dp[i][j] = 1 + Math.min(
|
|
Math.min(dp[i - 1][j - 1], dp[i - 1][j]),
|
|
dp[i][j - 1]
|
|
);
|
|
}
|
|
}
|
|
}
|
|
|
|
return dp[qlen][tlen];
|
|
}
|
|
}
|