Subject: XJTLU int303 (Big data analytics)
Project: Scrape quotes from http://quotes.toscrape.com
ID: 1931391
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Scrape every quote on quotes.toscrape.com together with details about its
# author (from the site's about page and from the linked Goodreads page), then
# store one row per quote in a CSV file.

# Initialize variables (one entry per quote; each list becomes a CSV column)
quotes_content = []
quotes_tags = []
quotes_authors = []
quotes_description = []
authors_birthday = []
authors_country = []
authors_genre = []
authors_rating = []
authors_reviews = []

BASE_URL = "https://quotes.toscrape.com"
# Seconds to wait for a response; without it a stalled server blocks forever
REQUEST_TIMEOUT = 30

# Login
# NOTE(review): presumably required so that the Goodreads links appear in the
# quote markup -- confirm against the site.
s = requests.Session()
login_url = "https://quotes.toscrape.com/login"
login_data = {
    'username': '123',
    'password': '123'
}
s.post(login_url, data=login_data, timeout=REQUEST_TIMEOUT)

# The same author appears on many quotes, so every page is downloaded and
# parsed once and then reused instead of being fetched again per quote.
page_cache = {}


def fetch_soup(page_url):
    """Return the parsed HTML of ``page_url``, downloading it at most once."""
    if page_url not in page_cache:
        page_response = s.get(page_url, timeout=REQUEST_TIMEOUT)
        page_cache[page_url] = BeautifulSoup(page_response.text, 'html.parser')
    return page_cache[page_url]


# Find all quotes in 10 pages
for i in range(1, 11):
    soup = fetch_soup(f"{BASE_URL}/page/{i}/")
    quotes = soup.find_all('div', class_='quote')
    for quote in quotes:
        # Content (without the surrounding typographic quotation marks)
        content = quote.find('span', class_='text').text.strip("“”")
        quotes_content.append(content)

        # Tags
        tags = [tag.text for tag in quote.find_all('a', class_='tag')]
        quotes_tags.append(tags)

        # Author
        author = quote.find('small', class_='author').text
        quotes_authors.append(author)

        # Only anchors that carry an href can be followed
        links = quote.find_all('a', href=True)

        # Access to author's about page; an empty document makes the lookup
        # below fall back to 'NA' when the link is missing
        about_links = [a['href'] for a in links if a.text == '(about)']
        if about_links:
            about_soup = fetch_soup(f"{BASE_URL}{about_links[0]}")
        else:
            about_soup = BeautifulSoup('', 'html.parser')

        # Description of the author
        description_tag = about_soup.find('div', class_='author-description')
        description = description_tag.text.strip() if description_tag is not None else 'NA'
        quotes_description.append(description)

        # Access to goodreads website; an empty document makes every lookup
        # below fall back to 'NA' when the link is missing
        goodreads_links = [a['href'] for a in links
                           if a['href'].startswith('http://goodreads.com/')]
        if goodreads_links:
            author_soup = fetch_soup(goodreads_links[0])
        else:
            author_soup = BeautifulSoup('', 'html.parser')

        # Birthday of the author
        birthday_tag = author_soup.find('div', itemprop='birthDate')
        birthday = birthday_tag.text.strip() if birthday_tag is not None else 'NA'
        authors_birthday.append(birthday)

        # Country of the author: the text node that follows the "Born" title
        country_tag = author_soup.find('div', class_='dataTitle', string='Born')
        country = 'NA'
        if country_tag is not None and isinstance(country_tag.next_sibling, str):
            birthplace = country_tag.next_sibling.strip()
            # Drop only the leading "in "; a later "in " belongs to the place name
            if birthplace.startswith('in '):
                birthplace = birthplace[3:].strip()
            country = birthplace or 'NA'
        authors_country.append(country)

        # Genre of the author (a list of names, or the string 'NA' if absent)
        genre_tag = author_soup.find('div', class_='dataTitle', string='Genre')
        genre_block = genre_tag.find_next_sibling('div') if genre_tag is not None else None
        genre = [a.text for a in genre_block.find_all('a')] if genre_block is not None else 'NA'
        authors_genre.append(genre)

        # Average rating of the author
        rating_tag = author_soup.find('span', itemprop='ratingValue')
        rating = rating_tag.text.strip() if rating_tag is not None else 'NA'
        authors_rating.append(rating)

        # Number of reviews
        reviews_tag = author_soup.find('span', itemprop='reviewCount')
        reviews = reviews_tag.text.strip() if reviews_tag is not None else 'NA'
        authors_reviews.append(reviews)

df = pd.DataFrame({
    'Content': quotes_content,
    'Tags': quotes_tags,
    'Author': quotes_authors,
    'Description': quotes_description,
    'Birthday': authors_birthday,
    'Country': authors_country,
    'Genre': authors_genre,
    'AVG_Rating': authors_rating,
    'Reviews': authors_reviews
})
df.to_csv('HuangTianChi Zhu_1931391.csv', index=False)
# Bare expression: displays the frame when run as a notebook cell
df
Content | Tags | Author | Description | Birthday | Country | Genre | AVG_Rating | Reviews | |
---|---|---|---|---|---|---|---|---|---|
0 | The world as we have created it is a process o... | [change, deep-thoughts, thinking, world] | Albert Einstein | In 1879, Albert Einstein was born in Ulm, Germ... | March 14, 1879 | Ulm, kingdom of Württemberg, German empire | [Science, Philosophy, Physics] | 4.07 | 3,124 |
1 | It is our choices, Harry, that show what we tr... | [abilities, choices] | J.K. Rowling | See also: Robert GalbraithAlthough she writes ... | July 31, 1965 | Yate, South Gloucestershire, England, The Unit... | [Fiction, Young Adult, Fantasy] | 4.46 | 768,449 |
2 | There are only two ways to live your life. One... | [inspirational, life, live, miracle, miracles] | Albert Einstein | In 1879, Albert Einstein was born in Ulm, Germ... | March 14, 1879 | Ulm, kingdom of Württemberg, German empire | [Science, Philosophy, Physics] | 4.07 | 3,124 |
3 | The person, be it gentleman or lady, who has n... | [aliteracy, books, classic, humor] | Jane Austen | Jane Austen was an English novelist whose work... | December 16, 1775 | Steventon Rectory, Hampshire, England, The Uni... | [Fiction, Romance, Humor and Comedy] | 4.15 | 264,582 |
4 | Imperfection is beauty, madness is genius and ... | [be-yourself, inspirational] | Marilyn Monroe | Marilyn Monroe (born Norma Jeane Mortenson; Ju... | June 01, 1926 | The United States | [Biographies & Memoirs, Nonfiction, Poetry] | 4.12 | 1,005 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
95 | You never really understand a person until you... | [better-life-empathy] | Harper Lee | Harper Lee, known as Nelle, was born in the Al... | April 28, 1926 | Monroeville, Alabama, The United States | [Literature & Fiction] | 4.22 | 147,085 |
96 | You have to write the book that wants to be wr... | [books, children, difficult, grown-ups, write,... | Madeleine L'Engle | Madeleine L'Engle was an American writer best ... | November 29, 1918 | New York City, New York, The United States | [Literature & Fiction, Science Fiction & Fanta... | 4.0 | 64,264 |
97 | Never tell the truth to people who are not wor... | [truth] | Mark Twain | Samuel Langhorne Clemens, better known by his ... | November 30, 1835 | Florida, Missouri, The United States | [Literature & Fiction, Short Stories, Biograph... | 3.87 | 61,142 |
98 | A person's a person, no matter how small. | [inspirational] | Dr. Seuss | Theodor Seuss Geisel was born 2 March 1904 in ... | March 02, 1904 | Springfield, MA, The United States | [Children's Books] | 4.26 | 67,268 |
99 | ... a mind needs books as a sword needs a whet... | [books, mind] | George R.R. Martin | George R. R. Martin was born September 20, 194... | September 20, 1948 | Bayonne, New Jersey, The United States | [Fantasy, Science Fiction, Horror] | 4.37 | 230,558 |
100 rows × 9 columns
Method 1: Calculate the proportion of each tag and find the top 5 (using a pie chart).
Result: The top 5 tags on the Quotes to Scrape website are love, life, inspirational, humor, and books, which shows that the leading quotes are mostly about matters of daily life.
import matplotlib.pyplot as plt
import ast

# Count how often each tag occurs, draw a pie chart of the 15 most frequent
# tags and print the 5 most frequent ones with their share of all tag uses.

# Filter the top 15
df = pd.read_csv('HuangTianChi Zhu_1931391.csv')
# The CSV stores every tag list as its text form (e.g. "['love', 'life']").
# Parse it back into a real list first; otherwise explode() leaves the strings
# untouched and whole lists are counted instead of individual tags.
tag_lists = df['Tags'].apply(
    lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else [])
# A quote without tags explodes to NaN; drop those so they are not counted
tags = tag_lists.explode().dropna()
# value_counts() already returns the counts in descending order
tag_counts = tags.value_counts()
top_15_tags = tag_counts.head(15)
# Share of each tag within the top 15 (matches the wedge labels of the pie)
percentages = top_15_tags / top_15_tags.sum() * 100

# Draw the pie plot of top 15 tags
fig, ax = plt.subplots(figsize=(10, 6))
wedges, texts, autotexts = ax.pie(top_15_tags, autopct='%1.1f%%', pctdistance=0.85)
legend_labels = [f'{tag}: {percentage:.1f}%'
                 for tag, percentage in zip(top_15_tags.index, percentages)]
ax.legend(wedges, legend_labels,
          title="Tags",
          loc="center left",
          bbox_to_anchor=(1, 0, 0.5, 1))
plt.setp(autotexts, size=8, weight="bold")
ax.set_title("Top 15 Tags")
plt.show()

# Print the top 5 tags and their global percentages
# (share of all tag occurrences, not only of the top 15)
top_5_tags = tag_counts.head(5)
percentages = top_5_tags / len(tags) * 100
print("Top Five Tags (Global Percentage):")
for tag, percentage in zip(top_5_tags.index, percentages):
    print(f"{tag} ({percentage:.1f}%)")
Top Five Tags (Global Percentage): ['love'] (4.0%) ['inspirational'] (3.0%) [] (3.0%) ['attributed-no-source'] (3.0%) ['humor'] (3.0%)
Method 2: Plot a histogram and a boxplot of the authors' birth years to analyze their distribution.
Result: The histogram is left-skewed, and most of the quotes' authors were born between 1879 and 1939. In descending order of frequency, 1879, 1965, 1926, 1904, and 1835 are the most common birth years among these authors, which suggests that the 20th century was the heyday of quote creation.
import matplotlib.pyplot as plt
import seaborn as sns
# Analyse the distribution of the authors' birth years (one entry per quote):
# histogram, skewness, boxplot, interquartile range and most frequent years.

# Extract year of birth: the last four characters of the birthday text
df = pd.read_csv('HuangTianChi Zhu_1931391.csv')
year_text = df['Birthday'].str[-4:]
# Anything that is not a plain number (e.g. a missing birthday) stays missing,
# so it is ignored by the plots and statistics below instead of being counted
# as a made-up year
df['BirthYear'] = year_text.apply(lambda x: int(x) if str(x).isdigit() else None)

# Histogram
plt.figure(figsize=(10, 6))
sns.histplot(df['BirthYear'], kde=False, bins=50)
plt.title('Histogram of birth year')
plt.xlabel('Birth year')
plt.ylabel('Frequency')
plt.show()

# Skew
skewness = df['BirthYear'].skew()
if skewness > 0.1:
    print("The histogram is right skewed.")
elif skewness < -0.1:
    print("The histogram is left skewed.")
else:
    print("The histogram is roughly symmetric.")

# Boxplot
plt.figure(figsize=(10, 6))
sns.boxplot(x=df['BirthYear'])
plt.title('Boxplot of birth year')
plt.xlabel('Birth year')
plt.show()

# IQR
Q1 = df['BirthYear'].quantile(0.25)
Q3 = df['BirthYear'].quantile(0.75)
print(f"The IQR range of birth year is from {int(round(Q1, 0))} to {int(round(Q3, 0))}.")

# Year of birth for the top 5 frequencies
# Missing years are dropped rather than replaced by a placeholder, so a
# placeholder can never be ranked as a birth year. value_counts() already
# sorts by descending frequency.
top_five_years = df['BirthYear'].dropna().astype(int).value_counts().head(5)
# Percentages are relative to all quotes, including those without a birth year
total = len(df)
print("Year of birth for the top 5 frequencies:")
for year, count in top_five_years.items():
    percentage = count / total * 100
    print(f"{year} ({percentage:.1f}%)")
The histogram is left skewed.
The IQR range of birth year is from 1879 to 1939. Year of birth for the top 5 frequencies: 1879 (10.0%) 1965 (10.0%) 1926 (8.0%) 1904 (7.0%) 1835 (6.0%)
Method 3: Plot a scatterplot of AVG_Rating against Reviews to explore the relationship between them.
Result: The scatter plot shows a roughly linear relationship; the correlation between AVG_Rating and Reviews is 0.54. Furthermore, J.K. Rowling, Suzanne Collins, Stephenie Meyer, Jane Austen, and Haruki Murakami have both a relatively high AVG_Rating and a relatively high number of Reviews. The more Reviews an author has, the more credible the rating is, and the higher the AVG_Rating, the greater the recognition of the author's works. Therefore, these five authors stand out as excellent.
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import pandas as pd
import seaborn as sns
# Relate each author's average rating to the number of reviews: scatter plot
# with regression line and marginal boxplots, the correlation coefficient, and
# a ranking of authors that score high on both measures.

# Converts the AVG Rating and Reviews columns to numeric types
df = pd.read_csv('HuangTianChi Zhu_1931391.csv')
df['AVG_Rating'] = pd.to_numeric(df['AVG_Rating'], errors='coerce')
# Reviews use thousands separators ("3,124"); astype(str) keeps this working
# even when pandas already parsed the column as numbers
df['Reviews'] = pd.to_numeric(
    df['Reviews'].astype(str).str.replace(',', '', regex=False), errors='coerce')

# Scatterplot and boxplot
fig = plt.figure(figsize=(10, 5))
gs = GridSpec(4, 5, figure=fig)
ax1 = fig.add_subplot(gs[0:3, 0:4])   # main panel: scatter + regression line
ax2 = fig.add_subplot(gs[3, 0:4])     # bottom panel: boxplot of AVG_Rating
ax3 = fig.add_subplot(gs[0:3, 4])     # right panel: boxplot of Reviews
# regplot draws the points as well as the fitted line, so no separate
# scatter() call is needed
sns.regplot(x='AVG_Rating', y='Reviews', data=df, ax=ax1,
            scatter_kws={'s': 20}, line_kws={'color': 'red'})
ax1.set_xlabel('AVG_Rating')
ax1.set_ylabel('Reviews')
ax2.boxplot(df['AVG_Rating'].dropna(), vert=False)
ax2.set_yticks([])
ax3.boxplot(df['Reviews'].dropna(), vert=True)
ax3.set_xlabel('Reviews')
plt.tight_layout()
plt.show()

# Correlations
# NOTE: df holds one row per quote, so authors with several quotes are
# weighted more heavily here than authors with a single quote.
correlation = df['AVG_Rating'].corr(df['Reviews'])
print(f'The correlation between AVG_Rating and Reviews is: {correlation}')

# Delete duplicate authors and NA at df
# Both columns are numeric by now, so missing values are NaN (not the text
# 'NA') and have to be removed with dropna()
df_unique_authors = (df.drop_duplicates(subset='Author')
                     .dropna(subset=['AVG_Rating', 'Reviews'])
                     .copy())

# Creates a new column that is the ratio of 'AVG_Rating' and 'Reviews' to their corresponding column average
avg_rating_mean = df_unique_authors['AVG_Rating'].mean()
reviews_mean = df_unique_authors['Reviews'].mean()
df_unique_authors.loc[:, 'Adjusted_Rating_Review_Product'] = (
    (df_unique_authors['AVG_Rating'] / avg_rating_mean)
    * (df_unique_authors['Reviews'] / reviews_mean))

# Select the top 5 authors and their 'Adjusted_Rating_Review_Product' value
df_sorted = df_unique_authors.sort_values(by='Adjusted_Rating_Review_Product', ascending=False)
top_authors = df_sorted[['Author', 'Adjusted_Rating_Review_Product']].head(5)
print('Top 5 authors with relatively high AVG_Rating and Reviews:')
for author_name, score in zip(top_authors['Author'],
                              top_authors['Adjusted_Rating_Review_Product']):
    print(f"{author_name} ({score:.2f})")
The correlation between AVG_Rating and Reviews is: 0.5434018683345798 Top 5 authors with relatively high AVG_Rating and Reviews: J.K. Rowling (10.04) Suzanne Collins (6.55) Stephenie Meyer (4.39) Jane Austen (3.22) Haruki Murakami (3.21)
Method 4: Plot a horizontal bar chart of the frequency of each genre in the Genre column and select specific authors by genre.
Result: The top 5 most common genres are Fiction, Literature & Fiction, Poetry, Biographies & Memoirs, and Nonfiction. In addition, I selected some genres that I am interested in ('Science', 'Physics', 'Fantasy', 'Science Fiction', 'Fiction') and used them to find the matching authors.
import matplotlib.pyplot as plt
from collections import Counter
from itertools import chain
import pandas as pd
import ast
# Count how often every genre occurs among the distinct authors, chart the
# counts, list the five most common genres and print the authors that belong
# to at least one of a hand-picked set of genres.

# Keep one row per author and leave out rows whose genre is the text 'NA'
df = pd.read_csv('HuangTianChi Zhu_1931391.csv')
df_unique_authors = df.drop_duplicates(subset='Author').copy()
has_genre = df_unique_authors['Genre'] != 'NA'
df_unique_authors = df_unique_authors[has_genre]


def _parse_genre(cell):
    """Turn the stored text form of a genre list into a list ([] if missing)."""
    if pd.notnull(cell):
        return ast.literal_eval(cell)
    return []


# Convert the Genre column to a list
df_unique_authors['Genre'] = df_unique_authors['Genre'].apply(_parse_genre)

# Tally every genre across all authors, one author's list at a time
genre_frequency = Counter()
for author_genres in df_unique_authors['Genre']:
    genre_frequency.update(author_genres)

# Horizontal barchart
genre_names = list(genre_frequency.keys())
genre_counts = list(genre_frequency.values())
plt.figure(figsize=(15, 10))
plt.barh(genre_names, genre_counts)
plt.xlabel('Frequency')
plt.title('Frequency of Each Genre')
plt.show()

# Select top 5 attributes at genre column
print('Top 5 attributes at genre column:')
for genre, freq in genre_frequency.most_common(5):
    print(f'{genre} ({freq})')

# Select your interested genre to filter the author
selected_genres = ['Science', 'Physics', 'Fantasy', 'Science Fiction', 'Fiction']


def _has_selected_genre(author_genres):
    """Return True if the author has at least one of the selected genres."""
    for genre in selected_genres:
        if genre in author_genres:
            return True
    return False


matches = df_unique_authors['Genre'].apply(_has_selected_genre)
selected_authors = df_unique_authors[matches]['Author']
print('Here are the authors contain the genre of your interest:')
for author in selected_authors:
    print(author)
Top 5 attributes at genre column: Fiction (12) Literature & Fiction (12) Poetry (11) Biographies & Memoirs (10) Nonfiction (6) Here are the authors contain the genre of your interest: Albert Einstein J.K. Rowling Jane Austen Thomas A. Edison Douglas Adams Elie Wiesel Garrison Keillor Jorge Luis Borges George R.R. Martin James Baldwin Haruki Murakami Ernest Hemingway Charles Bukowski Suzanne Collins J.R.R. Tolkien J.M. Barrie
From the previous four methods in Task 2, we gain a clear understanding of the integrated data from the Quotes to Scrape website and obtain a lot of useful information for analyzing and selecting quotes or authors.