Written by Sümeyye Sever (notes I took while learning Python)


Beautiful Soup is a Python library used for web scraping purposes to extract data from HTML and XML files. It parses the webpage content and enables developers to navigate, search, and modify the HTML or XML tree structure. It's particularly useful for extracting specific information from websites.

Let’s create a simple project with BeautifulSoup and Python.

There is a website called Hacker News: https://news.ycombinator.com/

image.png

Let’s scrape each article’s title, link, and points. But first, it’s important to check whether we are allowed to scrape the website. To do this, append robots.txt to the end of the website's URL and check whether there are any restrictions on the pages you are going to scrape.

https://news.ycombinator.com/robots.txt

image.png

We are good to go. Let’s code.

from bs4 import BeautifulSoup
import requests

# Download the Hacker News front page. The URL has to be a bare string: the
# angle brackets that used to surround it were a copy/paste artifact and made
# requests reject the address.
response = requests.get("https://news.ycombinator.com/", timeout=10)
response.raise_for_status()  # fail loudly on an HTTP error instead of parsing an error page
yc_web_page = response.text
soup = BeautifulSoup(yc_web_page, "html.parser")
# now soup contains the live page's source code

# These lists are filled in by the later steps of this walkthrough.
article_titles = []
article_links = []
article_votes = []

# Print every <span class="titleline"> tag to see what there is to extract.
for article_tag in soup.find_all(name="span", class_="titleline"):
    print(article_tag)

image.png

from bs4 import BeautifulSoup
import requests

# Download the Hacker News front page. The URL has to be a bare string: the
# angle brackets that used to surround it were a copy/paste artifact and made
# requests reject the address.
response = requests.get("https://news.ycombinator.com/", timeout=10)
response.raise_for_status()  # fail loudly on an HTTP error instead of parsing an error page
yc_web_page = response.text
soup = BeautifulSoup(yc_web_page, "html.parser")

article_titles = []
article_links = []
article_votes = []  # not filled yet; the votes are added in a later step

# For every <span class="titleline"> keep its text as the title and the href
# of the first <a> inside it as the link, so both lists share the same order.
for article_tag in soup.find_all(name="span", class_="titleline"):
    article_titles.append(article_tag.getText())
    article_links.append(article_tag.find(name="a")["href"])

print(article_titles)
print(article_links)

image.png

from bs4 import BeautifulSoup
import requests

# Download the Hacker News front page. The URL has to be a bare string: the
# angle brackets that used to surround it were a copy/paste artifact and made
# requests reject the address.
response = requests.get("https://news.ycombinator.com/", timeout=10)
response.raise_for_status()  # fail loudly on an HTTP error instead of parsing an error page
yc_web_page = response.text
soup = BeautifulSoup(yc_web_page, "html.parser")

article_titles = []
article_links = []
article_votes = []  # not filled yet; the votes are added in the next step

# Title text and first link of every <span class="titleline">.
for article_tag in soup.find_all(name="span", class_="titleline"):
    article_titles.append(article_tag.getText())
    article_links.append(article_tag.find(name="a")["href"])

# Print every <td class="subtext"> cell to inspect where the score is stored
# before extracting it.
for article in soup.find_all(name="td", class_="subtext"):
    print(article)

image.png


from bs4 import BeautifulSoup
import requests

# Download the Hacker News front page. The URL has to be a bare string: the
# angle brackets that used to surround it were a copy/paste artifact and made
# requests reject the address.
response = requests.get("https://news.ycombinator.com/", timeout=10)
response.raise_for_status()  # fail loudly on an HTTP error instead of parsing an error page
yc_web_page = response.text
soup = BeautifulSoup(yc_web_page, "html.parser")

article_titles = []
article_links = []
article_votes = []

# Title text and first link of every <span class="titleline">.
for article_tag in soup.find_all(name="span", class_="titleline"):
    article_titles.append(article_tag.getText())
    article_links.append(article_tag.find(name="a")["href"])

for article in soup.find_all(name="td", class_="subtext"):
    # Some of the articles don't have a score, so that case must be handled:
    # record 0 so this list keeps one entry per subtext cell.
    score_tag = article.find(class_="score")
    if score_tag is None:
        article_votes.append(0)
    else:
        # Read the number from the score element itself rather than from the
        # whole cell; its first whitespace-separated token is the point count.
        article_votes.append(int(score_tag.getText().split()[0]))

print(article_votes)

image.png