Web Scraping:
Part 1: Web Scraping (Text)
In this part, we scrape and parse a web page, create a DataFrame, and store the DataFrame in MongoDB as a collection.
Web scraping is a technique used to fetch web pages from the internet and parse them to extract specific information.
Web scraping consists of two parts (a minimal sketch of both steps follows the list):
1) Web Crawling: accessing the web pages and pulling their raw content.
2) HTML Parsing: parsing the HTML content of the pages obtained through web crawling and extracting specific information from them.
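As a minimal sketch of these two steps (the URL below is a placeholder, not part of the exercise):
from urllib.request import urlopen
from bs4 import BeautifulSoup
# Step 1 - web crawling: fetch the raw HTML of a page
html = urlopen('https://example.com')              # placeholder URL
# Step 2 - HTML parsing: build a parse tree and pull out specific pieces
soup = BeautifulSoup(html, 'html.parser')
print(soup.title.string)                           # e.g. the page title
for link in soup.find_all('a'):                    # every hyperlink on the page
    print(link.get('href'))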
Scraping Wikipedia for COVID-19 data and creating a DataFrame.
from bs4 import BeautifulSoup as bs
from urllib.request import urlopen as ureq
def getHTMLContent(link):
    html = ureq(link)                 # fetch the raw HTML of the page
    soup = bs(html, 'html.parser')    # parse it into a BeautifulSoup object
    return soup
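The helper above is not reused below (the notebook fetches and parses inline instead), but it could be called like this; covid_soup is just an illustrative name:
covid_soup = getHTMLContent('https://en.wikipedia.org/wiki/Template:COVID-19_pandemic_data')
print(covid_soup.title.string)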
#class = wikitable plainrowheaders sortable jquery-tablesorted
wiki = 'https://en.wikipedia.org/wiki/Template:COVID-19_pandemic_data'
page = ureq(wiki)                      # fetch the Wikipedia page
page_parse = bs(page, 'html.parser')   # parse the HTML
print(page_parse.prettify())
<!DOCTYPE html>
.
..
..
</body></html>
page_parse.title.string
all_tables=page_parse.find_all('table')
required_table=page_parse.find('table', {'class': 'wikitable plainrowheaders sortable'})
required_table
<table class="wikitable plainrowheaders sortable"
..
..
..
</td></tr></tbody></table>
# Generate lists
A = []
B = []
C = []
D = []
for row in required_table.findAll("tr"):
    cells1 = row.findAll('th')   # header cells (country name)
    cells2 = row.findAll('td')   # data cells (cases, deaths, recoveries)
    # Country_name = row.findAll('th')  # to store second column data
    if len(cells2) > 1:          # only extract the table body, not the heading row
        A.append(cells1[1].find(text=True))
        B.append(cells2[0].find(text=True))
        C.append(cells2[1].find(text=True))
        D.append(cells2[2].find(text=True))
B
['31,682,046\n', '14,074,564\n', '13,758,093\n'
.
.
.
E = []
for b in B:
    E.append(b.strip('\n'))
E
F = []
for c in C:
    F.append(c.strip('\n'))
F
G = []
for d in D:
    G.append(d.strip('\n'))
G
# import pandas to convert the lists into a DataFrame
import pandas as pd
df = pd.DataFrame(data=A, columns=['Country_name'])
df['Total_cases'] = E
df['deaths'] = F
df['recovery'] = G
df
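The introduction says this DataFrame is stored in MongoDB as a collection, but that step is not shown in this part. A minimal sketch, assuming a local MongoDB server; the database and collection names ('scrappingDB', 'covid_data') are assumptions:
import pymongo
dbConn = pymongo.MongoClient("mongodb://localhost:27017/")  # connect to the local MongoDB server
covid_db = dbConn['scrappingDB']                            # assumed database name
covid_collection = covid_db['covid_data']                   # assumed collection name
covid_collection.insert_many(df.to_dict(orient='records'))  # one document per DataFrame row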
Part 2: Web Scraping (Text) - Flipkart website:
# web scraping
import requests
from bs4 import BeautifulSoup as bs
from urllib.request import urlopen as uReq
search = input("enter product name for search :")
flipkart_url = "https://www.flipkart.com/search?q=" + search
enter product name for search :nokia
uClient = uReq(flipkart_url)    # requesting the webpage from the internet
flipkartPage = uClient.read()   # reading the webpage
uClient.close()                 # closing the connection to the web server
flipkart_html = bs(flipkartPage, "html.parser") # parsing the webpage as HTML
boxes = flipkart_html.findAll("div", {"class": "_1AtVbE col-12-12"})
# searching for the appropriate tag that leads to the product link
len(boxes)
46
box = boxes[2]   # for the 1st box I am getting a NoneType error, so taking the 2nd phone
len(box)
type(box)
bs4.element.Tag
box = box.div.div.div.a['href']
box
productLink = "https://www.flipkart.com" + box # extracting the actual product link
#link of particular phone(box)
productLink
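Hard-coding boxes[2] works around the NoneType error on the first box; a more defensive sketch (an alternative, not from the original notebook) would keep the first box that actually carries a product link:
productHref = None
for b in boxes:
    a_tag = b.find('a', href=True)        # look for an anchor with an href attribute
    if a_tag is not None:
        productHref = a_tag['href']
        break                             # stop at the first box that links to a product
if productHref is not None:
    productLink = "https://www.flipkart.com" + productHref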
prodRes = requests.get(productLink) # getting the product page from server
prod_html = bs(prodRes.text, "html.parser") # parsing the product page as HTML
commentboxes = prod_html.find_all('div', {'class': "_16PBlm"})
len(commentboxes)
11
commentbox = commentboxes[1]   # extracting the 1st comment
name = commentbox.div.div.find_all('p', {'class': '_2sc7ZR _2V5EHH'})[0].text #extract name from comment
name
rating = commentbox.div.div.div.div.text
rating
commentHead = commentbox.div.div.div.p.text
commentHead
comtag = commentbox.div.div.find_all('div', {'class': ''})
comtag
custComment = comtag[0].div.text
custComment
reviews = []
for commentbox in commentboxes:
    try:
        # name.encode(encoding='utf-8')
        name = commentbox.div.div.find_all('p', {'class': '_2sc7ZR _2V5EHH'})[0].text
    except:
        name = 'No Name'
    mydict = {"Product": search, "Name": name, "Rating": rating,
              "CommentHead": commentHead, "Comment": custComment}
    reviews.append(mydict)
reviews
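The loop above only guards the name lookup and reuses the rating, comment head, and comment extracted once from commentboxes[1]. A fuller per-comment version, mirroring the same tag paths with a try/except around each field (a sketch, not verbatim from the notebook), would look like this:
reviews = []
for commentbox in commentboxes:
    try:
        name = commentbox.div.div.find_all('p', {'class': '_2sc7ZR _2V5EHH'})[0].text
    except:
        name = 'No Name'
    try:
        rating = commentbox.div.div.div.div.text        # same path used for the single comment above
    except:
        rating = 'No Rating'
    try:
        commentHead = commentbox.div.div.div.p.text
    except:
        commentHead = 'No Comment Heading'
    try:
        comtag = commentbox.div.div.find_all('div', {'class': ''})
        custComment = comtag[0].div.text
    except:
        custComment = 'No Comment'
    mydict = {"Product": search, "Name": name, "Rating": rating,
              "CommentHead": commentHead, "Comment": custComment}
    reviews.append(mydict)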
Part 3: Working with MongoDB
import pymongo
dbConn = pymongo.MongoClient("mongodb://localhost:27017/")   # opening a connection to MongoDB
db = dbConn['scrappingDB']
collection = db['reviews'] # reviews -collection name in MongoDB
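The find() query below assumes the review documents are already in the collection; the insert step is not shown, so here is a minimal sketch using the reviews list built above:
if reviews:
    collection.insert_many(reviews)   # store each review dictionary as a document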
result = collection.find({})
result.count()
O/p: 1
if result.count() > 0:
    print('g')
g
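Note: cursor.count() only exists in older pymongo releases; with current pymongo the equivalent check (an aside, not from the original notebook) would be:
if collection.count_documents({}) > 0:     # count documents matching an empty filter
    for doc in collection.find({}):
        print(doc)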