|
|
@@ -0,0 +1,278 @@
|
|
|
+package parser;
|
|
|
+
|
|
|
+import java.awt.image.BufferedImage;
|
|
|
+import java.io.File;
|
|
|
+import java.io.IOException;
|
|
|
+import java.net.MalformedURLException;
|
|
|
+import java.net.URL;
|
|
|
+import java.time.Duration;
|
|
|
+import java.util.Arrays;
|
|
|
+import java.util.Collections;
|
|
|
+import java.util.List;
|
|
|
+
|
|
|
+import javax.imageio.ImageIO;
|
|
|
+
|
|
|
+import org.imgscalr.Scalr;
|
|
|
+import org.openqa.selenium.By;
|
|
|
+import org.openqa.selenium.JavascriptExecutor;
|
|
|
+import org.openqa.selenium.WebElement;
|
|
|
+import org.openqa.selenium.chrome.ChromeDriver;
|
|
|
+import org.openqa.selenium.chrome.ChromeOptions;
|
|
|
+import org.openqa.selenium.support.ui.ExpectedConditions;
|
|
|
+import org.openqa.selenium.support.ui.WebDriverWait;
|
|
|
+
|
|
|
+import com.google.common.base.Strings;
|
|
|
+
|
|
|
+import database.Database;
|
|
|
+import obejcts.Recepie;
|
|
|
+
|
|
|
+public class ParserBase implements Parser {
|
|
|
+
|
|
|
+ List<String> measurements = Arrays.asList("msk", "tsk", "g", "kg", "ml", "dl", "l", "st", "krm", "förp", "kruka", "färsk",
|
|
|
+ "burk", "knippe", "kvist", "cm", "burkar", "cl", "port");
|
|
|
+ protected Database database = new Database();
|
|
|
+ protected ChromeDriver driver;
|
|
|
+ protected WebDriverWait wait;
|
|
|
+ protected JavascriptExecutor jsExecutor;
|
|
|
+
|
|
|
+ public void findRecepiesWithSearchWords(List<String> searchedWords, String urlSearchPattern, String pathSeparator,
|
|
|
+ String baseUrl,
|
|
|
+ String recepiesItemListXpath, String recepieTitleXpath, String cookieConsentButtonXpath) {
|
|
|
+ try {
|
|
|
+ String url = baseUrl;
|
|
|
+ url += urlSearchPattern;
|
|
|
+ String searchWords = "";
|
|
|
+ for (String word : searchedWords) {
|
|
|
+ searchWords += pathSeparator + word;
|
|
|
+ }
|
|
|
+ url += searchWords;
|
|
|
+
|
|
|
+ driver = getSeleniumDriver();
|
|
|
+ wait = getWaitDriver(driver);
|
|
|
+ jsExecutor = getJsExecutor(driver);
|
|
|
+
|
|
|
+ driver.get(url);
|
|
|
+ wait.until(ExpectedConditions.numberOfElementsToBeMoreThan(By.xpath(recepiesItemListXpath), 0));
|
|
|
+
|
|
|
+ Thread.sleep(500);
|
|
|
+ if (checkIfElementExists(driver, cookieConsentButtonXpath)) {
|
|
|
+ driver.findElement(By.xpath(cookieConsentButtonXpath)).click();
|
|
|
+ Thread.sleep(100);
|
|
|
+ }
|
|
|
+
|
|
|
+ List<WebElement> recepies = driver.findElements(By.xpath(recepiesItemListXpath));
|
|
|
+
|
|
|
+ for (int i = 0; i < recepies.size(); i++) {
|
|
|
+ Thread.sleep(1000);
|
|
|
+ recepies = driver.findElements(By.xpath(recepiesItemListXpath));
|
|
|
+ WebElement recepie = recepies.get(i);
|
|
|
+
|
|
|
+ scrollElementIntoViewCenter(driver, recepie);
|
|
|
+ String recepieTitle = recepie.findElement(By.xpath(recepieTitleXpath)).getText();
|
|
|
+
|
|
|
+ Recepie recepieFromDb = database.getGetRecepieByTitle(recepieTitle);
|
|
|
+ handleRecepie(recepie, recepieTitle, recepieFromDb);
|
|
|
+ }
|
|
|
+
|
|
|
+ } catch (InterruptedException e) {
|
|
|
+ e.printStackTrace();
|
|
|
+ }
|
|
|
+ driver.close();
|
|
|
+ }
|
|
|
+
|
|
|
+ public void testImportImages(int count) {
|
|
|
+ List<Recepie> recepiesWithoutImage = database.getRecepiesWithoutImage(count);
|
|
|
+
|
|
|
+ for (Recepie recepie : recepiesWithoutImage) {
|
|
|
+ if (Ica.isFromHereRecepie(recepie.getUrl())) {
|
|
|
+ saveImage(recepie, Ica.IMAGE_XPATH);
|
|
|
+ } else if (Koket.isFromHereRecepie(recepie.getUrl())) {
|
|
|
+ saveImage(recepie, Koket.IMAGE_XPATH);
|
|
|
+ } else if (Tasteline.isFromHereRecepie(recepie.getUrl())) {
|
|
|
+ saveImage(recepie, Tasteline.IMAGE_XPATH);
|
|
|
+ } else if (Arla.isFromHereRecepie(recepie.getUrl())) {
|
|
|
+ saveImage(recepie, Arla.IMAGE_XPATH);
|
|
|
+ } else {
|
|
|
+ System.out.println("Cound not find url " + recepie.getUrl());
|
|
|
+ }
|
|
|
+
|
|
|
+ }
|
|
|
+
|
|
|
+ driver.close();
|
|
|
+ }
|
|
|
+
|
|
|
+ protected boolean checkIfElementExists(ChromeDriver driver, String xpath) {
|
|
|
+ return !driver.findElements(By.xpath(xpath)).isEmpty();
|
|
|
+ }
|
|
|
+
|
|
|
+ protected boolean checkIfElementExists(WebElement element, String xpath) {
|
|
|
+ return !element.findElements(By.xpath(xpath)).isEmpty();
|
|
|
+ }
|
|
|
+
|
|
|
+ protected String fixName(String name) {
|
|
|
+ // name = name.replaceAll("[0-9]", "");
|
|
|
+ if (name.indexOf("(") > -1) {
|
|
|
+ int firstIndex = name.indexOf("(");
|
|
|
+ int secondIndex = name.indexOf(")");
|
|
|
+
|
|
|
+ name = name.substring(0, firstIndex) + name.substring(secondIndex + 1);
|
|
|
+ }
|
|
|
+
|
|
|
+ if (name.indexOf("[") > -1) {
|
|
|
+ int firstIndex = name.indexOf("[");
|
|
|
+ int secondIndex = name.indexOf("]");
|
|
|
+
|
|
|
+ name = name.substring(0, firstIndex) + name.substring(secondIndex + 1);
|
|
|
+ }
|
|
|
+
|
|
|
+ return name.trim();
|
|
|
+ }
|
|
|
+
|
|
|
+ protected float formatFloat(String string) {
|
|
|
+ float result = -1f;
|
|
|
+ try {
|
|
|
+ result = Float.parseFloat(string);
|
|
|
+ } catch (NumberFormatException e) {
|
|
|
+ // Empty by design
|
|
|
+ }
|
|
|
+
|
|
|
+ return result;
|
|
|
+ }
|
|
|
+
|
|
|
+ protected String getAmountFromText(String text) {
|
|
|
+ text = text.replaceAll("[^0-9\\/\\.,-½]", "").trim();
|
|
|
+ if (text.endsWith(",")) {
|
|
|
+ text.substring(0, text.length() - 1);
|
|
|
+ }
|
|
|
+ return text.replaceAll("[^0-9\\/\\.,-½]", "").trim();
|
|
|
+ }
|
|
|
+
|
|
|
+ protected JavascriptExecutor getJsExecutor(ChromeDriver driver) {
|
|
|
+ return driver;
|
|
|
+ }
|
|
|
+
|
|
|
+ protected String getMeasurement(String ingredientText) {
|
|
|
+ String result = "";
|
|
|
+ String[] parts = ingredientText.split(" ");
|
|
|
+ for (String part : parts) {
|
|
|
+ if (measurements.contains(part)) {
|
|
|
+ result = part.replaceAll("[\\/\\s]*", "").trim();
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return result;
|
|
|
+ }
|
|
|
+
|
|
|
+ protected ChromeDriver getSeleniumDriver() {
|
|
|
+ ChromeOptions options = new ChromeOptions();
|
|
|
+
|
|
|
+ System.setProperty("webdriver.chrome.driver",
|
|
|
+ System.getProperty("user.dir") + "/chromedriver.exe");
|
|
|
+ System.setProperty("webdriver.chrome.silentOutput", "true");
|
|
|
+ // Fixing 255 Error crashes
|
|
|
+ options.addArguments("--no-sandbox");
|
|
|
+ options.addArguments("--disable-dev-shm-usage");
|
|
|
+
|
|
|
+ // Options to trick bot detection
|
|
|
+ // Removing webdriver property
|
|
|
+ options.addArguments("--disable-blink-features=AutomationControlled");
|
|
|
+ options.setExperimentalOption("excludeSwitches", Collections.singletonList("enable-automation"));
|
|
|
+ options.setExperimentalOption("useAutomationExtension", null);
|
|
|
+
|
|
|
+ // Changing the user agent / browser fingerprint
|
|
|
+ options.addArguments("window-size=1920,1080");
|
|
|
+ options.addArguments(
|
|
|
+ "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36");
|
|
|
+
|
|
|
+ // Other
|
|
|
+ options.addArguments("disable-infobars");
|
|
|
+
|
|
|
+ ChromeDriver driver = new ChromeDriver(options);
|
|
|
+ driver.executeScript("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})");
|
|
|
+ return driver;
|
|
|
+ }
|
|
|
+
|
|
|
+ protected WebDriverWait getWaitDriver(ChromeDriver driver) {
|
|
|
+ return new WebDriverWait(driver, Duration.ofSeconds(30));
|
|
|
+ }
|
|
|
+
|
|
|
+ protected String onlyNumbers(String value) {
|
|
|
+ return value.replaceAll("[^0-9]", "");
|
|
|
+ }
|
|
|
+
|
|
|
+ protected void parseRecepie(WebElement recepie, String recepieTitle) {
|
|
|
+ // Empty by design, overridden by parsers
|
|
|
+ }
|
|
|
+
|
|
|
+ protected void scrollElementIntoView(ChromeDriver driver, WebElement element) {
|
|
|
+ ((JavascriptExecutor) driver).executeScript("arguments[0].scrollIntoView();", element);
|
|
|
+ }
|
|
|
+
|
|
|
+ protected void scrollElementIntoViewCenter(ChromeDriver driver, WebElement element) {
|
|
|
+ ((JavascriptExecutor) driver).executeScript("arguments[0].scrollIntoView({ block: 'center' });", element);
|
|
|
+ }
|
|
|
+
|
|
|
+ protected void scrollToTopOfPage(ChromeDriver driver) {
|
|
|
+ ((JavascriptExecutor) driver)
|
|
|
+ .executeScript("document.body.scrollTop = document.documentElement.scrollTop = 0;");
|
|
|
+ }
|
|
|
+
|
|
|
+ void handleRecepie(WebElement recepie, String recepieTitle, Recepie recepieFromDb) {
|
|
|
+ if (Strings.isNullOrEmpty(recepieFromDb.getName())) {
|
|
|
+ parseRecepie(recepie, recepieTitle);
|
|
|
+ } else {
|
|
|
+ System.out.println(recepieFromDb.getName() + " ALREADY EXISTS IN DB");
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ void saveImage(Recepie newRecepie, String imageXpath) {
|
|
|
+
|
|
|
+ if (driver == null) {
|
|
|
+ driver = getSeleniumDriver();
|
|
|
+ }
|
|
|
+
|
|
|
+ driver.get(newRecepie.getUrl());
|
|
|
+
|
|
|
+ try {
|
|
|
+ Thread.sleep(500);
|
|
|
+ } catch (InterruptedException e1) {
|
|
|
+ // TODO Auto-generated catch block
|
|
|
+ e1.printStackTrace();
|
|
|
+ }
|
|
|
+
|
|
|
+ if (!checkIfElementExists(driver, imageXpath)) {
|
|
|
+ File file = new File("empty.png");
|
|
|
+ try {
|
|
|
+ file.createNewFile();
|
|
|
+ } catch (IOException e) {
|
|
|
+ // TODO Auto-generated catch block
|
|
|
+ e.printStackTrace();
|
|
|
+ }
|
|
|
+ database.saveRecepieImage(newRecepie.getId(), file);
|
|
|
+ } else {
|
|
|
+
|
|
|
+ WebElement imageElement = driver.findElement(By.xpath(imageXpath));
|
|
|
+ String imgSrc = imageElement.getAttribute("currentSrc");
|
|
|
+
|
|
|
+ try {
|
|
|
+ if (imgSrc.contains(".webp")) {
|
|
|
+ imgSrc = imgSrc.replace(".webp", "");
|
|
|
+ }
|
|
|
+ URL imageUrl = new URL(imgSrc);
|
|
|
+ BufferedImage savedImage = ImageIO.read(imageUrl);
|
|
|
+
|
|
|
+ BufferedImage resizedImage = Scalr.resize(savedImage, 300);
|
|
|
+
|
|
|
+ File file = new File("recepie-image.png");
|
|
|
+ ImageIO.write(resizedImage, "png", file);
|
|
|
+
|
|
|
+ database.saveRecepieImage(newRecepie.getId(), file);
|
|
|
+
|
|
|
+ } catch (MalformedURLException e) {
|
|
|
+ e.printStackTrace();
|
|
|
+ } catch (IOException e) {
|
|
|
+ e.printStackTrace();
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|