Add budgetyourtrip-scrapper.py

2020-04-30 19:34:55 +02:00 · 2020-04-30 19:34:55 +02:00 · 6b585547df
commit 6b585547df
parent 4156ff9ac7
1 changed files with 134 additions and 0 deletions
--- a/Scripts/budgetyourtrip-scrapper.py
+++ b/Scripts/budgetyourtrip-scrapper.py
@ -0,0 +1,134 @@
+# -----------------------------------------------------------
+# Scrapes travel cost data from budgetyourtrip.com
+# Links must be provided in a links.xlsx file in the working directory the script is run from
+#
+# Author:   Timo John
+# Version:  1.0
+# -----------------------------------------------------------
+
+import requests
+import shutil
+from pathlib import Path
+import xlrd
+from bs4 import BeautifulSoup
+import xlsxwriter
+
+# Creates a directory for the downloaded HTML
+Path("./HTML").mkdir(parents=True, exist_ok=True)
+
+# Read from links.xlsx
+links = xlrd.open_workbook('./links.xlsx').sheet_by_index(0)
+
+# .xlsx for results
+workbook = xlsxwriter.Workbook('results.xlsx')
+results = workbook.add_worksheet('results')
+row = 1
+
+# Write headers in results
+headers = ["city", "country", "travelstyle", "currency", "average_per_day", "accomodation", "food", "water",
+           "local_transportation", "entertainment", "tips_and_handouts", "scams_robberies_and_mishaps", "alcohol"]
+
+for col in range(0, len(headers)):
+    results.write(0, col, headers[col])
+
+
+# Access xlsx with links row by row
+for cnt in range(1, links.nrows):
+    cols = links.row_values(cnt)
+    city = cols[0]
+    country = cols[1]
+    url = cols[2]
+
+    # Acces Website per link for all 3 Traveltypes
+    for traveltype in range(1, 4):
+        url_w_traveltype = url.replace('&budgettype=', '&budgettype=' + str(traveltype))
+
+        # get the HTML of the page
+        page = requests.get(url_w_traveltype).text.encode('utf-8')
+
+        # Save the .html in the folder (optional for later use)
+        output_file = open("./HTML/" + str(cnt) + "_" + city + "_" + country + "_" + str(traveltype) + ".html", "wb")
+        output_file.write(page)
+        output_file.close()
+
+        # make page parseable
+        soup = BeautifulSoup(page, "html.parser")
+        filtered = soup.find_all(class_="cost-tile")
+
+        ret_data = []
+
+        ret_data.append(city)
+        ret_data.append(country)
+        ret_data.append(traveltype)
+
+        # Try to get the data from the website
+        # except it does not exist, write 0
+        try:
+            ret_data.append(soup.find("li", {"class": 'cost-tile-main'}).find("span", {"class": 'symbol'}).text)
+        except:
+            ret_data.append(0)
+
+        try:
+            ret_data.append(soup.find("li", {"class": 'cost-tile-main'}).find("span", {"class": 'curvalue'}).text)
+        except:
+            ret_data.append(0)
+
+        try:
+            ret_data.append(
+                soup.find("li", {"class": 'cost-tile-category-accommodation'}).find("span", {"class": 'curvalue'}).text)
+        except:
+            ret_data.append(0)
+
+        try:
+            ret_data.append(
+                soup.find("li", {"class": 'cost-tile-category-food'}).find("span", {"class": 'curvalue'}).text)
+        except:
+            ret_data.append(0)
+
+        try:
+            ret_data.append(
+                soup.find("li", {"class": 'cost-tile-category-water'}).find("span", {"class": 'curvalue'}).text)
+        except:
+            ret_data.append(0)
+
+        try:
+            ret_data.append(soup.find("li", {"class": 'cost-tile-category-local-transportation'}).find("span", {
+                "class": 'curvalue'}).text)
+        except:
+            ret_data.append(0)
+
+        try:
+            ret_data.append(
+                soup.find("li", {"class": 'cost-tile-category-entertainment'}).find("span", {"class": 'curvalue'}).text)
+        except:
+            ret_data.append(0)
+
+        try:
+            ret_data.append(soup.find("li", {"class": 'cost-tile-category-tips-and-handouts'}).find("span", {
+                "class": 'curvalue'}).text)
+        except:
+            ret_data.append(0)
+
+        try:
+            ret_data.append(soup.find("li", {"class": 'cost-tile-category-scams-robberies-and-mishaps'}).find("span", {
+                "class": 'curvalue'}).text)
+        except:
+            ret_data.append(0)
+
+        try:
+            ret_data.append(
+                soup.find("li", {"class": 'cost-tile-category-alcohol'}).find("span", {"class": 'curvalue'}).text)
+        except:
+            ret_data.append(0)
+
+        print(ret_data)
+
+        # Write resulting data in row of .xlsx
+        col = 0
+        for data in ret_data:
+            results.write(row, col, data)
+            col += 1
+
+        row += 1
+
+workbook.close()