# -----------------------------------------------------------
# Scrapes data from budgetyourtrip.com
# Links must be provided in a links.xlsx file in the same directory as the script
#
# Author: Timo John
# Version: 1.0
# -----------------------------------------------------------
import shutil
from pathlib import Path

import requests
import xlrd
import xlsxwriter
from bs4 import BeautifulSoup

# Directory in which the downloaded HTML pages are stored
HTML_DIR = Path("./HTML")

# Input workbook (first sheet: city, country, url) and output workbook
LINKS_FILE = './links.xlsx'
RESULTS_FILE = 'results.xlsx'

# Seconds to wait for the server before giving up on a request; without a
# timeout a stalled connection would block the script forever.
REQUEST_TIMEOUT = 30

# Value written to the results when a cost tile is missing on the page
MISSING_VALUE = 0

# Column headers of the results sheet (order must match the scraped row)
HEADERS = ["city", "country", "travelstyle", "currency", "average_per_day",
           "accomodation", "food", "water", "local_transportation",
           "entertainment", "tips_and_handouts",
           "scams_robberies_and_mishaps", "alcohol"]

# CSS class of the summary tile holding the currency symbol and daily average
MAIN_TILE = 'cost-tile-main'

# CSS classes of the per-category tiles, in the order of the result columns
CATEGORY_TILES = [
    'cost-tile-category-accommodation',
    'cost-tile-category-food',
    'cost-tile-category-water',
    'cost-tile-category-local-transportation',
    'cost-tile-category-entertainment',
    'cost-tile-category-tips-and-handouts',
    'cost-tile-category-scams-robberies-and-mishaps',
    'cost-tile-category-alcohol',
]


def _tile_text(soup, tile_class, span_class):
    """Return the text of a <span> inside a cost tile, or MISSING_VALUE.

    Looks up the first <li> with class ``tile_class`` and, inside it, the
    first <span> with class ``span_class``. If either element is not on the
    page, MISSING_VALUE (0) is returned instead of raising.
    """
    tile = soup.find("li", {"class": tile_class})
    if tile is None:
        return MISSING_VALUE
    span = tile.find("span", {"class": span_class})
    if span is None:
        return MISSING_VALUE
    return span.text


def _scrape_row(soup, city, country, traveltype):
    """Build one result row (same order as HEADERS) from a parsed page."""
    row_data = [
        city,
        country,
        traveltype,
        _tile_text(soup, MAIN_TILE, 'symbol'),
        _tile_text(soup, MAIN_TILE, 'curvalue'),
    ]
    row_data.extend(_tile_text(soup, tile, 'curvalue') for tile in CATEGORY_TILES)
    return row_data


def main():
    """Download every linked page for all 3 travel types and write results.xlsx.

    For each data row of links.xlsx (the first row is skipped as a header),
    the page is fetched once per travel type (1 to 3), stored as .html in
    HTML_DIR, parsed, and one row of cost values is appended to the results
    sheet.
    """
    # Creates a directory for the downloaded HTML
    HTML_DIR.mkdir(parents=True, exist_ok=True)

    # Read from links.xlsx
    # NOTE: xlrd 2.0 and later only read the legacy .xls format, so opening
    # an .xlsx file here requires xlrd < 2.0.
    links = xlrd.open_workbook(LINKS_FILE).sheet_by_index(0)

    # .xlsx for results; the context manager closes (and thereby saves) the
    # workbook even if a request fails midway, so rows collected so far are
    # not lost.
    with xlsxwriter.Workbook(RESULTS_FILE) as workbook:
        results = workbook.add_worksheet('results')

        # Write headers in results
        results.write_row(0, 0, HEADERS)
        row = 1

        # Access xlsx with links row by row
        for cnt in range(1, links.nrows):
            cols = links.row_values(cnt)
            city = cols[0]
            country = cols[1]
            url = cols[2]

            # Access website per link for all 3 travel types
            for traveltype in range(1, 4):
                # The link carries an empty '&budgettype=' parameter; the
                # travel type number is appended to it.
                url_w_traveltype = url.replace(
                    '&budgettype=', '&budgettype=' + str(traveltype))

                # Get the HTML of the page
                response = requests.get(url_w_traveltype, timeout=REQUEST_TIMEOUT)
                page = response.text.encode('utf-8')

                # Save the .html in the folder (optional for later use)
                html_file = HTML_DIR / f"{cnt}_{city}_{country}_{traveltype}.html"
                html_file.write_bytes(page)

                # Make page parseable
                soup = BeautifulSoup(page, "html.parser")

                # Get the data from the website; values that do not exist
                # on the page are written as 0
                ret_data = _scrape_row(soup, city, country, traveltype)
                print(ret_data)

                # Write resulting data in row of .xlsx
                results.write_row(row, 0, ret_data)
                row += 1


if __name__ == "__main__":
    main()