Add budgetyourtrip-scrapper.py
This commit is contained in:
parent
4156ff9ac7
commit
6b585547df
134
Scripts/budgetyourtrip-scrapper.py
Normal file
134
Scripts/budgetyourtrip-scrapper.py
Normal file
@ -0,0 +1,134 @@
|
||||
# -----------------------------------------------------------
|
||||
# Scrapes data from budgetyourtrip.com
|
||||
# Links must be provided in a links.xlsx file in the same directory as the script
|
||||
#
|
||||
# Author: Timo John
|
||||
# Version: 1.0
|
||||
# -----------------------------------------------------------
|
||||
|
||||
import requests
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
import xlrd
|
||||
from bs4 import BeautifulSoup
|
||||
import xlsxwriter
|
||||
|
||||
# Make sure the folder that receives the downloaded HTML pages exists.
html_dir = Path("./HTML")
html_dir.mkdir(parents=True, exist_ok=True)

# Open links.xlsx and take its first sheet as the list of pages to scrape.
# NOTE(review): xlrd 2.0+ reads only .xls files; opening an .xlsx here
# presumably requires xlrd < 2.0 -- confirm the pinned version.
links_book = xlrd.open_workbook('./links.xlsx')
links = links_book.sheet_by_index(0)

# Workbook and worksheet that collect the scraped values.
workbook = xlsxwriter.Workbook('results.xlsx')
results = workbook.add_worksheet('results')

# Index of the next results row to fill; row 0 is used for the headers.
row = 1
|
||||
|
||||
# Column titles of the results sheet, written left to right into row 0.
headers = [
    "city",
    "country",
    "travelstyle",
    "currency",
    "average_per_day",
    "accomodation",
    "food",
    "water",
    "local_transportation",
    "entertainment",
    "tips_and_handouts",
    "scams_robberies_and_mishaps",
    "alcohol",
]

for col, title in enumerate(headers):
    results.write(0, col, title)
|
||||
|
||||
|
||||
# (<li> class, <span> class) pairs to look up on every page, listed in the
# same order as the result columns that follow city / country / travelstyle
# in the `headers` list.
_COST_FIELDS = [
    ("cost-tile-main", "symbol"),
    ("cost-tile-main", "curvalue"),
    ("cost-tile-category-accommodation", "curvalue"),
    ("cost-tile-category-food", "curvalue"),
    ("cost-tile-category-water", "curvalue"),
    ("cost-tile-category-local-transportation", "curvalue"),
    ("cost-tile-category-entertainment", "curvalue"),
    ("cost-tile-category-tips-and-handouts", "curvalue"),
    ("cost-tile-category-scams-robberies-and-mishaps", "curvalue"),
    ("cost-tile-category-alcohol", "curvalue"),
]

# Seconds to wait for the server before giving up; without a timeout
# requests.get() can block indefinitely.
_REQUEST_TIMEOUT = 30


def _extract_cost(soup, tile_class, span_class):
    """Return the text of the first matching <span> inside the first matching <li>.

    Looks up the first ``<li class=tile_class>`` in *soup* and, inside it, the
    first ``<span class=span_class>``. Returns ``0`` when either element is
    missing, so absent values still produce a cell in the results sheet.
    """
    tile = soup.find("li", {"class": tile_class})
    if tile is None:
        return 0
    span = tile.find("span", {"class": span_class})
    if span is None:
        return 0
    return span.text


# Access the links sheet row by row. Row 0 is skipped; columns 0, 1 and 2 are
# read as city, country and URL.
# The finally clause closes the workbook even when a request or file write
# fails, so rows collected up to that point are still written to results.xlsx
# (the exception itself still propagates).
try:
    for cnt in range(1, links.nrows):
        cols = links.row_values(cnt)
        city, country, url = cols[0], cols[1], cols[2]

        # Access the website once per link for each of the 3 travel types.
        for traveltype in range(1, 4):
            # NOTE: if the URL contains no '&budgettype=' this replace is a
            # no-op and the same page is requested for every travel type.
            url_w_traveltype = url.replace(
                '&budgettype=', '&budgettype=' + str(traveltype))

            # Get the HTML of the page as UTF-8 encoded bytes.
            response = requests.get(url_w_traveltype, timeout=_REQUEST_TIMEOUT)
            page = response.text.encode('utf-8')

            # Save the .html in the folder (optional, for later use).
            html_path = Path("./HTML") / f"{cnt}_{city}_{country}_{traveltype}.html"
            html_path.write_bytes(page)

            # Make the page parseable.
            soup = BeautifulSoup(page, "html.parser")

            # One result row: identifying columns first, then every cost
            # field (0 where the page does not provide the element).
            ret_data = [city, country, traveltype]
            ret_data.extend(
                _extract_cost(soup, tile_class, span_class)
                for tile_class, span_class in _COST_FIELDS
            )

            print(ret_data)

            # Write the resulting data into the next row of the .xlsx.
            for col, data in enumerate(ret_data):
                results.write(row, col, data)

            row += 1
finally:
    workbook.close()
|
||||
Loading…
Reference in New Issue
Block a user