Parcourir la source

Update 2024-01-21

Axel Nordh il y a 2 ans
Parent
commit
48487154b4

+ 7 - 3
recept/.classpath

@@ -13,18 +13,22 @@
 	</classpathentry>
 	<classpathentry kind="src" output="target/test-classes" path="src/test/java">
 		<attributes>
+			<attribute name="test" value="true"/>
 			<attribute name="optional" value="true"/>
 			<attribute name="maven.pomderived" value="true"/>
-			<attribute name="test" value="true"/>
 		</attributes>
 	</classpathentry>
 	<classpathentry excluding="**" kind="src" output="target/test-classes" path="src/test/resources">
 		<attributes>
-			<attribute name="maven.pomderived" value="true"/>
 			<attribute name="test" value="true"/>
+			<attribute name="maven.pomderived" value="true"/>
+		</attributes>
+	</classpathentry>
+	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-18">
+		<attributes>
+			<attribute name="maven.pomderived" value="true"/>
 		</attributes>
 	</classpathentry>
-	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-18"/>
 	<classpathentry kind="con" path="org.eclipse.m2e.MAVEN2_CLASSPATH_CONTAINER">
 		<attributes>
 			<attribute name="maven.pomderived" value="true"/>

+ 2 - 5
recept/.settings/org.eclipse.jdt.core.prefs

@@ -1,11 +1,8 @@
 eclipse.preferences.version=1
-org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
 org.eclipse.jdt.core.compiler.codegen.targetPlatform=18
 org.eclipse.jdt.core.compiler.compliance=18
-org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
 org.eclipse.jdt.core.compiler.problem.enablePreviewFeatures=disabled
-org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
 org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
-org.eclipse.jdt.core.compiler.problem.reportPreviewFeatures=warning
-org.eclipse.jdt.core.compiler.release=enabled
+org.eclipse.jdt.core.compiler.problem.reportPreviewFeatures=ignore
+org.eclipse.jdt.core.compiler.release=disabled
 org.eclipse.jdt.core.compiler.source=18

+ 17 - 2
recept/pom.xml

@@ -5,6 +5,10 @@
   <version>0.0.1-SNAPSHOT</version>
   <name>Recept App</name>
   <description>En egen recept app</description>
+  <properties>
+	<maven.compiler.source>18</maven.compiler.source>
+	<maven.compiler.target>18</maven.compiler.target>
+  </properties>
   <repositories>
     <repository>
       <id>nordhs-repo</id>
@@ -14,7 +18,7 @@
   </repositories>
   
   <dependencies>
-	  <dependency>
+  	<dependency>
 	    <groupId>net.sourceforge.htmlunit</groupId>
 	    <artifactId>htmlunit</artifactId>
 	    <version>2.68.0</version>
@@ -28,8 +32,13 @@
 	<dependency>
 	    <groupId>org.seleniumhq.selenium</groupId>
 	    <artifactId>selenium-java</artifactId>
-	    <version>4.6.0</version>
+	    <version>4.12.1</version>
 	</dependency>
+		<dependency>
+			<groupId>io.github.bonigarcia</groupId>
+			<artifactId>webdrivermanager</artifactId>
+			<version>5.5.3</version>
+		</dependency>
 	    <dependency>
         <groupId>mysql</groupId>
         <artifactId>mysql-connector-java</artifactId>
@@ -40,5 +49,11 @@
 	    <artifactId>imgscalr-lib</artifactId>
 	    <version>4.2</version>
 	</dependency>
+
+	<dependency>
+		<groupId>com.github.mertakdut</groupId>
+		<artifactId>EpubParser</artifactId>
+		<version>1.0.95</version>
+	</dependency>
   </dependencies>
 </project>

+ 6 - 0
recept/recept.iml

@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module version="4">
+  <component name="SonarLintModuleSettings">
+    <option name="uniqueId" value="f0575c91-9494-454d-a2ab-e2f924b3c7ee" />
+  </component>
+</module>

+ 0 - 56
recept/src/main/java/Main.java

@@ -1,56 +0,0 @@
-import java.util.ArrayList;
-import java.util.List;
-
-import parser.Arla;
-import parser.Ica;
-import parser.Koket;
-import parser.ParserBase;
-import parser.Tasteline;
-
-public class Main {
-
-	public static void main(String[] args) {
-		ParserBase pb = new ParserBase();
-
-		Ica i = new Ica();
-		Koket k = new Koket();
-		Tasteline tl = new Tasteline();
-		Arla a = new Arla();
-		// Kokaihop ko = new Kokaihop();
-
-		List<String> wordsList = new ArrayList<>();
-		// wordsList.add("kyckling");
-
-		// wordsList.add("halloumi");
-		// wordsList.add("potatis");
-		// wordsList.add("kikärtor");
-		// wordsList.add("röda linser");
-		// wordsList.add("köttfärs");
-		// wordsList.add("paj");
-		// wordsList.add("alkoholfri");
-		// wordsList.add("lax");
-		// wordsList.add("torsk");
-		// wordsList.add("glass");
-		// wordsList.add("hamburgare");
-		// wordsList.add("jordgubbar");
-		// wordsList.add("vetemjöl special");
-		// wordsList.add("vita bönor");
-		// wordsList.add("parmesan");
-		// wordsList.add("feta");
-		// wordsList.add("lakritspulver");
-		// wordsList.add("vit sirap");
-
-		// i.findRecepiesWithSearchWords(wordsList, Ica.URL_SEARCH_PATTERN, Ica.PATH_SEPARATOR, Ica.BASE_URL,
-		// Ica.RECEPIELIST_ITEMS_XPATH, Ica.RECEPIE_TITLE_XPATH, Ica.COOKIE_CONSENT_BUTTON_XPATH);
-		// k.findRecepiesWithSearchWords(wordsList, Koket.URL_SEARCH_PATTERN, Koket.PATH_SEPARATOR, Koket.BASE_URL,
-		// Koket.RECEPIELIST_ITEMS_XPATH, Koket.RECEPIE_TITLE_XPATH, Koket.COOKIE_CONSENT_BUTTON_XPATH);
-		// tl.findRecepiesWithSearchWords(wordsList, Tasteline.URL_SEARCH_PATTERN, Tasteline.PATH_SEPARATOR, Tasteline.BASE_URL,
-		// Tasteline.RECEPIELIST_ITEMS_XPATH, Tasteline.RECEPIE_TITLE_XPATH,
-		// Tasteline.COOKIE_CONSENT_BUTTON_XPATH);
-		a.findRecepiesWithSearchWords(wordsList, Arla.URL_SEARCH_PATTERN, Arla.PATH_SEPARATOR, Arla.BASE_URL,
-				Arla.RECEPIELIST_ITEMS_XPATH, Arla.RECEPIE_TITLE_XPATH, Arla.COOKIE_CONSENT_BUTTON_XPATH);
-
-		// ko.findRecepiesWithSearchWords(wordsList, Kokaihop.URL_SEARCH_PATTERN, Kokaihop.PATH_SEPARATOR, Kokaihop.BASE_URL,
-		// Kokaihop.RECEPIELIST_ITEMS_XPATH, Kokaihop.RECEPIE_TITLE_XPATH, Kokaihop.COOKIE_CONSENT_BUTTON_XPATH);
-	}
-}

+ 63 - 0
recept/src/main/java/ReceptMain.java

@@ -0,0 +1,63 @@
+import parser.*;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Command-line entry point of the recipe tooling.
+ *
+ * <p>First hands an EPUB file to {@code EpubParser.parseBook}, then calls
+ * {@code findRecepiesWithSearchWords} on the Ica, Koket, Tasteline and Arla parsers with the
+ * configured list of search words.
+ */
+public class ReceptMain {
+
+    /** EPUB file that is parsed when no path is given on the command line. */
+    private static final String DEFAULT_BOOK_PATH = "F:\\Books\\Cooking\\fireandspice.epub";
+
+    /**
+     * Runs the EPUB parse followed by the site searches.
+     *
+     * @param args optional; when present, {@code args[0]} is used as the path of the EPUB file
+     *             instead of {@link #DEFAULT_BOOK_PATH}
+     */
+    public static void main(String[] args) {
+        // NOTE(review): pb is never used below; kept because constructing ParserBase may have
+        // side effects - confirm and remove if it does not.
+        ParserBase pb = new ParserBase();
+
+        Ica i = new Ica();
+        Koket k = new Koket();
+        Tasteline tl = new Tasteline();
+        Arla a = new Arla();
+
+        EpubParser epub = new EpubParser();
+
+        // The book location is machine specific, so allow it to be overridden from the command line.
+        String bookPath = args.length > 0 ? args[0] : DEFAULT_BOOK_PATH;
+        epub.parseBook(new File(bookPath));
+
+        // Kokaihop ko = new Kokaihop();
+
+        // Search words; uncomment the ones that should be used for this run.
+        List<String> wordsList = new ArrayList<>();
+        // wordsList.add("kyckling");
+
+        // wordsList.add("halloumi");
+        // wordsList.add("potatis");
+        // wordsList.add("kikärtor");
+        // wordsList.add("röda linser");
+        // wordsList.add("köttfärs");
+        // wordsList.add("paj");
+        // wordsList.add("alkoholfri");
+        // wordsList.add("lax");
+        // wordsList.add("torsk");
+        // wordsList.add("glass");
+        // wordsList.add("hamburgare");
+        // wordsList.add("jordgubbar");
+        // wordsList.add("vetemjöl special");
+        // wordsList.add("vita bönor");
+        // wordsList.add("parmesan");
+        // wordsList.add("feta");
+        // wordsList.add("lakritspulver");
+        // wordsList.add("vit sirap");
+        // wordsList.add("kladdig");
+        // wordsList.add("snabb");
+        // wordsList.add("enkel");
+
+        i.findRecepiesWithSearchWords(wordsList, Ica.URL_SEARCH_PATTERN, Ica.PATH_SEPARATOR, Ica.BASE_URL,
+                Ica.RECEPIELIST_ITEMS_XPATH, Ica.RECEPIE_TITLE_XPATH, Ica.COOKIE_CONSENT_BUTTON_XPATH);
+        k.findRecepiesWithSearchWords(wordsList, Koket.URL_SEARCH_PATTERN, Koket.PATH_SEPARATOR, Koket.BASE_URL,
+                Koket.RECEPIELIST_ITEMS_XPATH, Koket.RECEPIE_TITLE_XPATH, Koket.COOKIE_CONSENT_BUTTON_XPATH);
+        tl.findRecepiesWithSearchWords(wordsList, Tasteline.URL_SEARCH_PATTERN, Tasteline.PATH_SEPARATOR,
+                Tasteline.BASE_URL,
+                Tasteline.RECEPIELIST_ITEMS_XPATH, Tasteline.RECEPIE_TITLE_XPATH,
+                Tasteline.COOKIE_CONSENT_BUTTON_XPATH);
+        a.findRecepiesWithSearchWords(wordsList, Arla.URL_SEARCH_PATTERN, Arla.PATH_SEPARATOR, Arla.BASE_URL,
+                Arla.RECEPIELIST_ITEMS_XPATH, Arla.RECEPIE_TITLE_XPATH, Arla.COOKIE_CONSENT_BUTTON_XPATH);
+
+        // ko.findRecepiesWithSearchWords(wordsList, Kokaihop.URL_SEARCH_PATTERN, Kokaihop.PATH_SEPARATOR, Kokaihop
+        // .BASE_URL,
+        // Kokaihop.RECEPIELIST_ITEMS_XPATH, Kokaihop.RECEPIE_TITLE_XPATH, Kokaihop.COOKIE_CONSENT_BUTTON_XPATH);
+    }
+}

+ 8 - 10
recept/src/main/java/parser/Arla.java

@@ -1,5 +1,12 @@
 package parser;
 
+import obejcts.Ingredient;
+import obejcts.Recepie;
+import org.imgscalr.Scalr;
+import org.openqa.selenium.By;
+import org.openqa.selenium.WebElement;
+
+import javax.imageio.ImageIO;
 import java.awt.image.BufferedImage;
 import java.io.File;
 import java.io.IOException;
@@ -7,20 +14,11 @@ import java.net.MalformedURLException;
 import java.net.URL;
 import java.util.List;
 
-import javax.imageio.ImageIO;
-
-import org.imgscalr.Scalr;
-import org.openqa.selenium.By;
-import org.openqa.selenium.WebElement;
-
-import obejcts.Ingredient;
-import obejcts.Recepie;
-
 public class Arla extends ParserBase {
 
 	public static final String BASE_URL = "https://www.arla.se/recept/";
 	public static final String RECEPIE_TITLE_XPATH = ".//a[contains(@class,'c-card__title')]";
-	public static final String RECEPIELIST_ITEMS_XPATH = "//div[@class='c-card c-card--vertical']";
+	public static final String RECEPIELIST_ITEMS_XPATH = "//div[contains(@class,'c-card--vertical')]";
 	public static final String PATH_SEPARATOR = "+";
 	public static final String URL_SEARCH_PATTERN = "?search=";
 	public static final String COOKIE_CONSENT_BUTTON_XPATH = "//button[@class='save-preference-btn-handler onetrust-close-btn-handler']";

+ 33 - 0
recept/src/main/java/parser/EpubParser.java

@@ -0,0 +1,33 @@
+package parser;
+
+import java.io.File;
+
+import com.github.mertakdut.BookSection;
+import com.github.mertakdut.Reader;
+import com.github.mertakdut.exception.OutOfPagesException;
+import com.github.mertakdut.exception.ReadingException;
+
+/**
+ * Reads an EPUB file with the {@code com.github.mertakdut.Reader} and prints the plain-text
+ * content of one section to standard output.
+ */
+public class EpubParser {
+
+	/** Value passed to {@code Reader.setMaxContentPerSection}. */
+	private static final int MAX_CONTENT_PER_SECTION = 1000;
+
+	/** Index of the section that {@link #parseBook(File)} reads and prints. */
+	private static final int SECTION_INDEX = 100;
+
+	/**
+	 * Loads the given book and prints the text content of the section at {@link #SECTION_INDEX}.
+	 *
+	 * <p>A {@code ReadingException} or {@code OutOfPagesException} raised while reading is reported
+	 * by printing its stack trace; it is not rethrown.
+	 *
+	 * @param book the EPUB file to read; its absolute path is handed to the reader
+	 */
+	public void parseBook(File book) {
+		Reader reader = new Reader();
+		reader.setMaxContentPerSection(MAX_CONTENT_PER_SECTION);
+		reader.setIsIncludingTextContent(true);
+		try {
+			reader.setFullContent(book.getAbsolutePath());
+
+			// Only the plain-text rendering is printed; the HTML content was fetched but never used.
+			BookSection bookSection = reader.readSection(SECTION_INDEX);
+			String sectionContentAsText = bookSection.getSectionTextContent();
+
+			System.out.println(sectionContentAsText);
+
+		} catch (ReadingException e) {
+			e.printStackTrace();
+		} catch (OutOfPagesException e) {
+			e.printStackTrace();
+		}
+	}
+
+}

+ 50 - 21
recept/src/main/java/parser/ParserBase.java

@@ -1,17 +1,9 @@
 package parser;
 
-import java.awt.image.BufferedImage;
-import java.io.File;
-import java.io.IOException;
-import java.net.MalformedURLException;
-import java.net.URL;
-import java.time.Duration;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.List;
-
-import javax.imageio.ImageIO;
-
+import com.google.common.base.Strings;
+import database.Database;
+import io.github.bonigarcia.wdm.WebDriverManager;
+import obejcts.Recepie;
 import org.imgscalr.Scalr;
 import org.openqa.selenium.By;
 import org.openqa.selenium.JavascriptExecutor;
@@ -21,14 +13,20 @@ import org.openqa.selenium.chrome.ChromeOptions;
 import org.openqa.selenium.support.ui.ExpectedConditions;
 import org.openqa.selenium.support.ui.WebDriverWait;
 
-import com.google.common.base.Strings;
-
-import database.Database;
-import obejcts.Recepie;
+import javax.imageio.ImageIO;
+import java.awt.image.BufferedImage;
+import java.io.File;
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.time.Duration;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
 
 public class ParserBase implements Parser {
 
-	List<String> measurements = Arrays.asList("msk", "tsk", "g", "kg", "ml", "dl", "l", "st", "krm", "förp", "kruka", "färsk",
+	// Measurement tokens. "förp" and "färsk" contain non-ASCII letters, so this file must be
+	// saved and compiled with an encoding that preserves them (e.g. UTF-8).
+	List<String> measurements = Arrays.asList("msk", "tsk", "g", "kg", "ml", "dl", "l", "st", "krm", "förp", "kruka", "färsk",
 			"burk", "knippe", "kvist", "cm", "burkar", "cl", "port");
 	protected Database database = new Database();
 	protected ChromeDriver driver;
@@ -47,12 +45,12 @@ public class ParserBase implements Parser {
 			}
 			url += searchWords;
 
-			driver = getSeleniumDriver();
+			// driver = getSeleniumDriver();
+			driver = getDriver();
 			wait = getWaitDriver(driver);
 			jsExecutor = getJsExecutor(driver);
 
 			driver.get(url);
-			wait.until(ExpectedConditions.numberOfElementsToBeMoreThan(By.xpath(recepiesItemListXpath), 0));
 
 			Thread.sleep(500);
 			if (checkIfElementExists(driver, cookieConsentButtonXpath)) {
@@ -60,6 +58,8 @@ public class ParserBase implements Parser {
 				Thread.sleep(100);
 			}
 
+			wait.until(ExpectedConditions.numberOfElementsToBeMoreThan(By.xpath(recepiesItemListXpath), 0));
+
 			List<WebElement> recepies = driver.findElements(By.xpath(recepiesItemListXpath));
 
 			for (int i = 0; i < recepies.size(); i++) {
@@ -140,11 +140,11 @@ public class ParserBase implements Parser {
 	}
 
 	protected String getAmountFromText(String text) {
-		text = text.replaceAll("[^0-9\\/\\.,-½]", "").trim();
+		text = text.replaceAll("[^0-9\\/\\.,-�]", "").trim();
 		if (text.endsWith(",")) {
 			text.substring(0, text.length() - 1);
 		}
-		return text.replaceAll("[^0-9\\/\\.,-½]", "").trim();
+		return text.replaceAll("[^0-9\\/\\.,-�]", "").trim();
 	}
 
 	protected JavascriptExecutor getJsExecutor(ChromeDriver driver) {
@@ -163,6 +163,35 @@ public class ParserBase implements Parser {
 		return result;
 	}
 
+	/**
+	 * Creates a new Chrome browser session.
+	 *
+	 * <p>WebDriverManager's chromedriver setup is run first, then a {@code ChromeDriver} is started
+	 * with the options built by {@code getChromeOptions()}.
+	 *
+	 * @return the newly created driver
+	 */
+	public ChromeDriver getDriver() {
+		WebDriverManager.chromedriver().setup();
+		return new ChromeDriver(getChromeOptions());
+	}
+
+	/**
+	 * Builds the Chrome options used by {@code getDriver()}.
+	 *
+	 * <p>The options disable the sandbox and {@code /dev/shm} usage, switch off the automation
+	 * markers, and set a fixed window size and user agent.
+	 *
+	 * @return a freshly configured {@code ChromeOptions} instance
+	 */
+	private ChromeOptions getChromeOptions() {
+		ChromeOptions options = new ChromeOptions();
+		// Fixing 255 Error crashes
+		options.addArguments("--no-sandbox");
+		options.addArguments("--disable-dev-shm-usage");
+
+		// Options to trick bot detection
+		// Removing webdriver property
+		options.addArguments("--disable-blink-features=AutomationControlled");
+		options.setExperimentalOption("excludeSwitches", Collections.singletonList("enable-automation"));
+		// The option is a boolean switch; pass false explicitly instead of null.
+		options.setExperimentalOption("useAutomationExtension", false);
+
+		// Changing the user agent / browser fingerprint
+		options.addArguments("window-size=1920,1080");
+		options.addArguments(
+				"user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36");
+
+		// Other
+		options.addArguments("disable-infobars");
+
+		return options;
+	}
+
 	protected ChromeDriver getSeleniumDriver() {
 		ChromeOptions options = new ChromeOptions();