In a terminal, run:

python3 -m ensurepip
curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
python3 -m pip install requests
python3 -m pip install html5lib
python3 -m pip install bs4
python3 -m pip install texttable

BeautifulSoup is a Python package useful for parsing HTML and XML documents.
Pandas is a Python package useful for data manipulation and analysis; a short pandas sketch appears in Example 6 at the end.

import os
import requests
from bs4 import BeautifulSoup
from pathlib import Path
import numpy

os.chdir("/Users/rickpaikschoenberg/Desktop")

## EXAMPLE 1. https://example.com

from bs4 import BeautifulSoup
import requests

url = "https://example.com"
response = requests.get(url)
data = response.text
soup = BeautifulSoup(data, 'html.parser')

# Print the text of every h1 heading on the page
titles = soup.find_all('h1')
for title in titles:
    print(title.text)

## EXAMPLE 2, from https://www.scrapingbee.com/blog/python-web-scraping-beautiful-soup

import requests
from bs4 import BeautifulSoup

response = requests.get("https://news.ycombinator.com/")
html_content = response.content
soup = BeautifulSoup(html_content, "html.parser")

print(soup.title)
print(soup.title.string)

# Count all links in the page
nb_links = len(soup.find_all("a"))
print(f"There are {nb_links} links in this page")

print(soup.get_text())

## EXAMPLE 3.

import requests
from bs4 import BeautifulSoup

# Fetch the content from the URL
response = requests.get("https://news.ycombinator.com")
html_content = response.content

# Use Beautiful Soup to parse the HTML
soup = BeautifulSoup(html_content, "html.parser")
articles = soup.find_all(class_="athing")

# Check if articles were found
if articles:
    # Loop through the selected elements
    for article in articles:
        # Print each article's text content to the console
        print(article.text)

## EXAMPLE 4.

import requests
from bs4 import BeautifulSoup

# Fetch the content from the URL
response = requests.get("https://news.ycombinator.com")
html_content = response.content

# Use Beautiful Soup to parse the HTML
soup = BeautifulSoup(html_content, "html.parser")
articles = soup.find_all(class_="athing")

scraped_data = []

# Check if articles were found (find_all returns an empty list when nothing matches)
if articles:
    for article in articles:
        data = {
            "URL": article.find(class_="titleline").find("a").get("href"),
            "title": article.find(class_="titleline").getText(),
            "rank": article.find(class_="rank").getText().replace(".", ""),
        }
        scraped_data.append(data)

# Print the output list
print(scraped_data)

## EXAMPLE 5.

import requests
from bs4 import BeautifulSoup
from pathlib import Path
import numpy

url = 'https://en.wikipedia.org/wiki/List_of_best-selling_books'

# Get the URL's html
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')

# soup.find_all('td') returns every table cell on the page;
# data_iterator steps through those cells one at a time
data_iterator = iter(soup.find_all('td'))

b = []
# Pull the first 500 cells (50 rows of 10 cells each) from the iterator
for i in range(50):
    for j in range(10):
        b.append(str(next(data_iterator).text))

content = str(numpy.array(b))

# Write the first 60 cells, one per line
out = open("new.txt", "w")
for i in range(60):
    out.write(b[i] + "\n")
out.close()

# Write the condensed string form of the whole array
out = open("condensed.txt", "w")
out.write(content)
out.close()
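
## EXAMPLE 6. Pandas is described above but not used in Examples 1-5. This is a rough sketch
## (not from the scrapingbee post) of one way pandas could read the same Wikipedia page as
## Example 5. It assumes pandas is installed (python3 -m pip install pandas); pandas.read_html
## also needs an HTML parser such as lxml or the html5lib installed above.

import pandas

url = 'https://en.wikipedia.org/wiki/List_of_best-selling_books'

# read_html parses every <table> on the page and returns a list of DataFrames
tables = pandas.read_html(url)
print(len(tables))

# Inspect the first table; which index holds which book list depends on the page layout,
# so check a few before settling on one
df = tables[0]
print(df.head())

# Save the chosen table to a csv file for later analysis
df.to_csv("books.csv", index=False)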