Search This Blog

3 Web Scrapping

Web Scrapping:

Part1: Web Scrapping (Text)



In this part, we scrap and parse the web page, then create a DataFrame and Store the DataFrame into MongoDB as a collection.

Web scraping is a technique used to fetch webpages from the internet and are parsed to understand and extract specific information. 

Web scrapping consists of two parts: 

1) Web Crawling: Accessing the webpages and pulling data from them. 

2) HTML Parsing:  Parsing the HTML content of the webpages obtained through web crawling and then extracting specific information from it.

Scrapping the wikipedia for Covid data and Creating a DataFrame.


from bs4 import BeautifulSoup as bs
from urllib.request import urlopen as ureq

def getHTMLContent(link):
    html = urlopen(link)
    soup = BeautifulSoup(html, 'html.parser')
    return soup

#class = wikitable plainrowheaders sortable jquery-tablesorted
wiki = 'https://en.wikipedia.org/wiki/Template:COVID-19_pandemic_data'

page=ureq(wiki)
page_parse = bs(page)

print(page_parse.prettify)
<bound method Tag.prettify of <!DOCTYPE html>
.
..
..
</body></html>>

page_parse.title.string
'Template:COVID-19 pandemic data - Wikipedia'

all_tables=page_parse.find_all('table')

required_table=page_parse.find('table', {'class': 'wikitable plainrowheaders sortable'})
required_table
<table class="wikitable plainrowheaders sortable" 
..
..
..
</td></tr></tbody></table>
:
#Generate lists
A=[]
B=[]
C=[]
D=[]

for row in required_table.findAll("tr"):
    cells1 = row.findAll('th')
    cells2 = row.findAll('td')
    #Country_name=row.findAll('th') #To store second column data
    if len(cells2)>1: #Only extract table body not heading
        A.append(cells1[1].find(text=True))
        B.append(cells2[0].find(text=True))
        C.append(cells2[1].find(text=True))
        D.append(cells2[2].find(text=True))

B
['31,682,046\n',
 '14,074,564\n',
 '13,758,093\n'
.
.
.

E=[]
for b in B:
    E.append(b.strip('\n'))
E
Out[12]:
['31,608,403',
 '13,873,825',
F=[]
for c in C:
    F.append(c.strip('\n'))
F
Out[14]:
['569,556',
 '172,085',
 '362,180',


G=[]
for d in D:
    G.append(d.strip('\n'))
G
Out[17]:
['No data',
 '12,336,036',

#import pandas to convert list to data frame
import pandas as pd
df=pd.DataFrame(data=A, columns=['Country_name'])
df['Total_cases']=E
df['deaths']=F
df['recovery']=G
df
Out[18]:
Country_nameTotal_casesdeathsrecovery
0United States31,608,403569,556No data
1India13,873,825172,08512,336,036
2Brazil13,677,564362,18012,170,771
3France5,067,21699,135No data
4Russia4,666,209104,0004,291,223
...............
234American Samoa403
235Samoa402
236Vanuatu301
237Federated States of Micronesia101
238TanzaniaNo dataNo dataNo data














Part2: Web Scrapping (Text) -Flipkart website:

# web scrapping
import requests
from bs4 import BeautifulSoup as bs
from urllib.request import urlopen as uReq


search = input("enter product name for search :")
flipkart_url = "https://www.flipkart.com/search?q=" + search
enter product name for search :nokia

uClient = uReq(flipkart_url)  # requesting the webpage from the internet
flipkartPage = uClient.read() # reading the webpage
uClient.close()               # closing the connection to the web server

flipkart_html = bs(flipkartPage, "html.parser") # parsing the webpage as HTML

boxes = flipkart_html.findAll("div", {"class": "_1AtVbE col-12-12"}) 
# seacrhing for appropriate tag to redirect to the product link

len(boxes)

46

box=boxes[2]    #for 1st Iam getting nonetype error, so taking 2nd phone 
len(box)
type(box)

bs4.element.Tag

box = box.div.div.div.a['href']
box
'/nokia-105-ss-2020/p/itm172df9f2ea432?pid=MOBFH7HQMBAQMGBX&lid=LSTMOBFH7HQMBAQMGBXGJXMZW
&marketplace=FLIPKART&q=nokia&store=search.flipkart.com&srno=s_1_1&otracker=search&fm=organic
&iid=a71b71d0-095d-458c-baf9-c8b00dc75329.MOBFH7HQMBAQMGBX.SEARCH&ppt=None&ppn=None&ssid=
f3413qvqkw0000001618680178861&qH=0c23a8bf29a191f1'
productLink = "https://www.flipkart.com" + box # extracting the actual product link
#link of particular phone(box)
productLink
:
'https://www.flipkart.com/nokia-105-ss-2020/p/itm172df9f2ea432?pid=MOBFH7HQMBAQMGBX&lid=LSTMOBFH7HQMBAQMGBXGJXMZW&marketplace=FLIPKART&q=nokia&store=search.flipkart.com&srno=s_1_1&otracker=search&fm=organic&iid=a71b71d0-095d-458c-baf9-c8b00dc75329.MOBFH7HQMBAQMGBX.SEARCH&ppt=None&ppn=None&ssid=f3413qvqkw0000001618680178861&qH=0c23a8bf29a191f1'

prodRes = requests.get(productLink) # getting the product page from server
prod_html = bs(prodRes.text, "html.parser") # parsing the product page as HTML

commentboxes = prod_html.find_all('div', {'class': "_16PBlm"})
len(commentboxes)

11

commentbox =commentboxes[1] #extracting 1st comment

name = commentbox.div.div.find_all('p', {'class': '_2sc7ZR _2V5EHH'})[0].text #extract name from comment
name
Out[176]:
'siyad

rating = commentbox.div.div.div.div.text
rating
'5'

commentHead = commentbox.div.div.div.p.text
commentHead
'Fabulous!'

comtag = commentbox.div.div.find_all('div', {'class': ''})
comtag
[<div><div class="">Good feature phone, i prefer to free from the addiction to smartphones. I am fully satisfied. 2000 contact memory is impressive. I am loving it.</div><span class="_1H-bmy"><span>READ MORE</span></span></div>,
 <div class="">Good feature phone, i prefer to free from the addiction to smartphones. I am fully satisfied. 2000 contact memory is impressive. I am loving it.</div>,
 <div></div>]

:
custComment = comtag[0].div.text
custComment
'Good feature phone, i prefer to free from the addiction to smartphones. I am fully satisfied. 2000 contact memory is impressive. I am loving it.'
reviews = []

for commentbox in commentboxes:
    try:
        #name.encode(encoding='utf-8')
        name = commentbox.div.div.find_all('p', {'class': '_2sc7ZR _2V5EHH'})[0].text
    
    except:
        name = 'No Name'
:
mydict = {"Product": search, "Name": name, "Rating": rating, "CommentHead": commentHead,
                          "Comment": custComment}

reviews.append(mydict)
reviews

:
[{'Product': 'nokia',
  'Name': 'No Name',
  'Rating': '5',
  'CommentHead': 'Fabulous!',
  'Comment': 'Good feature phone, i prefer to free from the addiction to smartphones. I am fully satisfied. 2000 contact memory is impressive. I am loving it.'}]


Working DB part

import pymongo
dbConn = pymongo.MongoClient("mongodb://localhost:27017/")  # opening a connection to Mongo
db = dbConn['scrappingDB']

collection = db['reviews']   # reviews -collection name in MongoDB

result= collection.find({})
result.count()
O/p: 1

if result.count() > 0:
    print('g')
g