# travopti/Scripts/budgetyourtrip-scrapper.py

# -----------------------------------------------------------
# Scrapes data from budgetyourtrip.com
# Links must be provided in a links.xlsx file in the directory the script is run from
#
# Author:   Timo John
# Version:  1.0
# -----------------------------------------------------------

import requests
import shutil
from pathlib import Path
import xlrd
from bs4 import BeautifulSoup
import xlsxwriter

# Make sure the folder that receives the downloaded HTML pages exists
Path("./HTML").mkdir(parents=True, exist_ok=True)

# Input: first sheet of links.xlsx (path is relative to the working directory)
# NOTE(review): xlrd 2.0+ reportedly no longer reads .xlsx files -- confirm
# which xlrd version this script is expected to run with.
links_book = xlrd.open_workbook('./links.xlsx')
links = links_book.sheet_by_index(0)

# Output: results.xlsx containing a single worksheet named "results"
workbook = xlsxwriter.Workbook('results.xlsx')
results = workbook.add_worksheet('results')

# Column titles, written once into the first row (row 0) of the results sheet
headers = [
    "city", "country", "travelstyle", "currency", "average_per_day",
    "accomodation", "food", "water", "local_transportation", "entertainment",
    "tips_and_handouts", "scams_robberies_and_mishaps", "alcohol",
]

for col, title in enumerate(headers):
    results.write(0, col, title)

# Index of the next results row to fill; row 0 is taken by the header
row = 1


# Seconds to wait on the server before giving up on a request, so that a
# stalled connection cannot hang the whole run.
REQUEST_TIMEOUT = 30

# (<li> class, <span> class) for every scraped value, listed in the order of
# the result columns that follow city / country / travelstyle.
COST_FIELDS = (
    ("cost-tile-main", "symbol"),                                     # currency
    ("cost-tile-main", "curvalue"),                                   # average_per_day
    ("cost-tile-category-accommodation", "curvalue"),                 # accomodation
    ("cost-tile-category-food", "curvalue"),                          # food
    ("cost-tile-category-water", "curvalue"),                         # water
    ("cost-tile-category-local-transportation", "curvalue"),          # local_transportation
    ("cost-tile-category-entertainment", "curvalue"),                 # entertainment
    ("cost-tile-category-tips-and-handouts", "curvalue"),             # tips_and_handouts
    ("cost-tile-category-scams-robberies-and-mishaps", "curvalue"),   # scams_robberies_and_mishaps
    ("cost-tile-category-alcohol", "curvalue"),                       # alcohol
)


def _tile_text(soup, tile_class, span_class):
    """Return the text of the ``span_class`` <span> inside the ``tile_class`` <li>.

    :param soup: parsed page (a BeautifulSoup object)
    :param tile_class: CSS class of the <li> element to look in
    :param span_class: CSS class of the <span> inside that <li>
    :return: the span's text, or 0 if the <li> or the <span> is not on the page
    """
    tile = soup.find("li", {"class": tile_class})
    if tile is None:
        return 0
    span = tile.find("span", {"class": span_class})
    if span is None:
        return 0
    return span.text


try:
    # Access the links sheet row by row; row 0 is skipped
    for cnt in range(1, links.nrows):
        # The first three cells of a row are city, country and URL
        city, country, url = links.row_values(cnt)[:3]

        # Access the website once per travel type (1, 2 and 3)
        for traveltype in range(1, 4):
            # The value is inserted directly after the "&budgettype=" parameter
            url_w_traveltype = url.replace('&budgettype=', f'&budgettype={traveltype}')

            # Get the HTML of the page as UTF-8 bytes
            response = requests.get(url_w_traveltype, timeout=REQUEST_TIMEOUT)
            page = response.text.encode('utf-8')

            # Save the .html in the folder (optional, for later use).
            # The f-string also copes with non-string cell values.
            html_path = f"./HTML/{cnt}_{city}_{country}_{traveltype}.html"
            with open(html_path, "wb") as output_file:
                output_file.write(page)

            # Make the page parseable
            soup = BeautifulSoup(page, "html.parser")

            # One result row: identifying columns first, then every cost field.
            # Values missing from the page are recorded as 0.
            ret_data = [city, country, traveltype]
            ret_data.extend(
                _tile_text(soup, tile_class, span_class)
                for tile_class, span_class in COST_FIELDS
            )

            print(ret_data)

            # Write the resulting data into the next free row of the .xlsx
            for col, data in enumerate(ret_data):
                results.write(row, col, data)

            row += 1
finally:
    # Always finalize the workbook so rows collected before a failure
    # (network error, interrupt, ...) still end up in results.xlsx.
    workbook.close()