From 6b585547dfaa591494e4e844e006e568905d7e28 Mon Sep 17 00:00:00 2001
From: Timo John
Date: Thu, 30 Apr 2020 19:34:55 +0200
Subject: [PATCH] Add budgetyourtrip-scrapper.py

---
 Scripts/budgetyourtrip-scrapper.py | 117 +++++++++++++++++++++++++++++
 1 file changed, 117 insertions(+)
 create mode 100644 Scripts/budgetyourtrip-scrapper.py

diff --git a/Scripts/budgetyourtrip-scrapper.py b/Scripts/budgetyourtrip-scrapper.py
new file mode 100644
index 0000000..0e7d1c3
--- /dev/null
+++ b/Scripts/budgetyourtrip-scrapper.py
@@ -0,0 +1,117 @@
+# -----------------------------------------------------------
+# Scrapes travel cost data from budgetyourtrip.com
+# Links must be provided in a links.xlsx file in the current working directory
+# (first sheet, header row skipped, columns: city, country, url)
+#
+# Author: Timo John
+# Version: 1.0
+# -----------------------------------------------------------
+
+import shutil
+import sys
+from pathlib import Path
+
+import requests
+import xlrd
+import xlsxwriter
+from bs4 import BeautifulSoup
+
+# Directory for the downloaded HTML, input workbook and output workbook
+HTML_DIR = Path("./HTML")
+LINKS_FILE = './links.xlsx'
+RESULTS_FILE = 'results.xlsx'
+
+# Seconds to wait for the server before a request is abandoned
+REQUEST_TIMEOUT = 30
+
+# Values appended to '&budgettype=' in each link (one per travel style)
+TRAVEL_TYPES = range(1, 4)
+
+# Column headers of the results sheet
+HEADERS = ["city", "country", "travelstyle", "currency", "average_per_day", "accomodation", "food", "water",
+           "local_transportation", "entertainment", "tips_and_handouts", "scams_robberies_and_mishaps", "alcohol"]
+
+# (li class, span class) scraped for each column after "travelstyle", in HEADERS order
+COST_FIELDS = [
+    ("cost-tile-main", "symbol"),
+    ("cost-tile-main", "curvalue"),
+    ("cost-tile-category-accommodation", "curvalue"),
+    ("cost-tile-category-food", "curvalue"),
+    ("cost-tile-category-water", "curvalue"),
+    ("cost-tile-category-local-transportation", "curvalue"),
+    ("cost-tile-category-entertainment", "curvalue"),
+    ("cost-tile-category-tips-and-handouts", "curvalue"),
+    ("cost-tile-category-scams-robberies-and-mishaps", "curvalue"),
+    ("cost-tile-category-alcohol", "curvalue"),
+]
+
+
+def fetch_page(url):
+    """Download url and return its HTML as text, or None if the request fails."""
+    try:
+        return requests.get(url, timeout=REQUEST_TIMEOUT).text
+    except requests.RequestException as err:
+        print("Request failed for {}: {}".format(url, err), file=sys.stderr)
+        return None
+
+
+def extract_value(soup, tile_class, span_class):
+    """Return the text of the span inside the given cost tile, or 0 if either is missing."""
+    tile = soup.find("li", {"class": tile_class})
+    if tile is None:
+        return 0
+    span = tile.find("span", {"class": span_class})
+    if span is None:
+        return 0
+    return span.text
+
+
+def scrape_row(city, country, traveltype, html):
+    """Build one results row; every value that cannot be scraped is written as 0."""
+    row = [city, country, traveltype]
+    if html is None:
+        row.extend(0 for _ in COST_FIELDS)
+    else:
+        soup = BeautifulSoup(html, "html.parser")
+        row.extend(extract_value(soup, tile, span) for tile, span in COST_FIELDS)
+    return row
+
+
+def main():
+    """Scrape every link in links.xlsx for all travel types and write results.xlsx."""
+    HTML_DIR.mkdir(parents=True, exist_ok=True)
+
+    # NOTE: xlrd 2.0+ reads only .xls files, so links.xlsx requires xlrd < 2.0
+    links = xlrd.open_workbook(LINKS_FILE).sheet_by_index(0)
+
+    # The context manager closes the workbook even when a later step raises,
+    # so rows collected up to that point still end up in results.xlsx
+    with xlsxwriter.Workbook(RESULTS_FILE) as workbook:
+        results = workbook.add_worksheet('results')
+        results.write_row(0, 0, HEADERS)
+        row = 1
+
+        # Access xlsx with links row by row (row 0 is skipped)
+        for cnt in range(1, links.nrows):
+            city, country, url = links.row_values(cnt)[:3]
+
+            # Access the website per link for all travel types
+            for traveltype in TRAVEL_TYPES:
+                url_w_traveltype = url.replace('&budgettype=', '&budgettype=' + str(traveltype))
+                html = fetch_page(url_w_traveltype)
+
+                # Save the .html in the folder (optional for later use)
+                if html is not None:
+                    html_name = "{}_{}_{}_{}.html".format(cnt, city, country, traveltype)
+                    (HTML_DIR / html_name).write_bytes(html.encode('utf-8'))
+
+                ret_data = scrape_row(city, country, traveltype, html)
+                print(ret_data)
+
+                # Write resulting data in row of .xlsx
+                results.write_row(row, 0, ret_data)
+                row += 1
+
+
+if __name__ == "__main__":
+    main()