IR Practicals 5, 6, 7, 8 (PDF)

Author: AKHILESH SONI
Course: Modern Poetry (a)
Institution: University of Delhi



Description

PRACTICAL 5

import requests
from bs4 import BeautifulSoup

# Fetch the URL by requesting that page from the server.
page_url = 'http://forecast.weather.gov/MapClick.php?lat=37.7772&lon=-122.4168'
page = requests.get(page_url)

# The page variable holds the HTML content and metadata, everything.
# A status_code of 200 means that the page downloaded successfully.
print("Page Status: ", page.status_code)
# To see the content of the page: print(page.content)

# Use the BeautifulSoup library to parse the document and extract text from tags.
soup = BeautifulSoup(page.content, 'html.parser')

# Find the div tag by its id.
seven_day = soup.find(id="seven-day-forecast")
# The prettify() method prints the content in a nicely formatted manner:
# print("Content: ", seven_day.prettify())

# Find elements by HTML class name.
forecast_items = seven_day.find_all(class_="tombstone-container")
print("Getting to the actual data: ")

# Extract text from the HTML elements.
for item in forecast_items:
    period = item.find(class_="period-name")
    short_desc = item.find(class_="short-desc")
    temp = item.find(class_="temp")
    # Skip forecast items that are missing any of the three fields,
    # before calling get_text() on a possible None.
    if None in (period, short_desc, temp):
        continue
    print("Day and Time: ", period.get_text().strip())
    print("Weather: ", short_desc.get_text().strip())
    print("Temperature: ", temp.get_text().strip())
    print()
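The id and class names above (seven-day-forecast, tombstone-container, period-name, short-desc, temp) are specific to forecast.weather.gov and can change, and the script needs network access. Here is a minimal offline sketch of the same BeautifulSoup calls against a small hand-written HTML snippet; the markup is hypothetical, made up purely for illustration:

from bs4 import BeautifulSoup

# Hypothetical markup mimicking one forecast "tombstone".
html = '''
<div id="seven-day-forecast">
  <div class="tombstone-container">
    <p class="period-name">Tonight</p>
    <p class="short-desc">Partly Cloudy</p>
    <p class="temp">Low: 51 F</p>
  </div>
</div>
'''
soup = BeautifulSoup(html, 'html.parser')
seven_day = soup.find(id="seven-day-forecast")
for item in seven_day.find_all(class_="tombstone-container"):
    # Same extraction pattern as the practical, on local data.
    print(item.find(class_="period-name").get_text())  # Tonight
    print(item.find(class_="short-desc").get_text())   # Partly Cloudy
    print(item.find(class_="temp").get_text())         # Low: 51 F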

PRACTICAL 6

# Prac-6: Write a Python program to implement a simple web crawler.

import requests as rq
from bs4 import BeautifulSoup as bs
from PIL import Image as pillow
import io

def webpage(page, weburl):
    # Check that there is at least one web page to fetch at the provided weburl.
    if page > 0:
        url = weburl
        # Send a GET request to the specified URL.
        code = rq.get(url)
        plain = code.text
        # Parse the HTML page fetched from the URL.
        soup = bs(plain, "html.parser")

        child = ""
        # Fetch the first 166 bytes of a child node of the HTML document.
        for i in str(list(soup.children)[3]).encode("UTF-8")[:166]:
            # encode() converts the string to a bytes object;
            # chr() converts each byte value back to its character.
            child += chr(i)
        print("Child:")
        print(child)
        print()

        # Find all anchor tags with class 'w3-btn w3-green'.
        for link in soup.findAll('a', {'class': 'w3-btn w3-green'}):
            # Get the href attribute.
            getHref = link.get('href')
            # Get the link text.
            getText = link.get_text()
            print("getHref: ", getHref)
            print("getText: ", getText)

        # Find all img tags.
        for item in soup.findAll('img'):
            # Fetch the source link for the image.
            print("getImageLink: ", item['src'])
            # Exclude images with a '.gif' extension.
            if item['src'][-3:] == "gif":
                pass
            else:
                # If the src path does not start with '/', prepend one.
                if item['src'][0] != "/":
                    item['src'] = '/' + item['src']
                response = rq.get(weburl + item["src"])
                # Load the response data as a bytes object.
                imageByte = io.BytesIO(response.content)
                # Open and display the image using Pillow.
                img = pillow.open(imageByte)
                img.show()

webpage(1,"https://www.w3schools.com")
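webpage() above fetches and inspects a single page rather than following links. A minimal sketch of the recursive step a fuller crawler would add, with a visited set and a depth limit to keep it bounded; the helper name crawl and the depth parameter are assumptions for illustration, not part of the practical:

import requests as rq
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin

def crawl(url, depth=2, visited=None):
    # Bound the recursion and skip URLs that were already crawled.
    if visited is None:
        visited = set()
    if depth == 0 or url in visited:
        return
    visited.add(url)
    print("Crawling:", url)
    soup = bs(rq.get(url).text, "html.parser")
    # Resolve each href against the current URL and recurse into it.
    for link in soup.findAll('a', href=True):
        crawl(urljoin(url, link['href']), depth - 1, visited)

crawl("https://www.w3schools.com", depth=2)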

PRACTICAL 7

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download the required NLTK packages (stopwords, punkt).
nltk.download('stopwords')
nltk.download('punkt')

doc = input("Enter a sentence: ")

# The unique stop words of the 'english' language.
stop_words = set(stopwords.words("english"))

# Tokenize the words in the document.
words_tokens = word_tokenize(doc)
print("Word tokens:\n", words_tokens)

# Keep the words that are not present in stop_words.
filtered = [i for i in words_tokens if i not in stop_words]
print("Sentence without stop words:")
print(*filtered)
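Note that stopwords.words("english") is all lowercase, so capitalized tokens such as "The" pass the filter above unchanged. A small sketch of the usual fix, lowercasing each token before the membership test:

# Compare tokens case-insensitively against the stop word list.
filtered = [i for i in words_tokens if i.lower() not in stop_words]
print("Sentence without stop words (case-insensitive):")
print(*filtered)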

PRACTICAL 8

# Cosine similarity formula: cos(a, b) = a . b / (||a|| * ||b||)
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity as cs
from sklearn.feature_extraction.text import CountVectorizer as cv
from math import sqrt

docs = []
doc1 = input("Enter document-1: ")
doc2 = input("Enter document-2: ")
docs.append(doc1)
docs.append(doc2)
print("Doc list: ", docs)

# Initialize the CountVectorizer.
count_vectorizer = cv()

# fit_transform() converts a collection of text documents to a matrix of token counts.
matrix = count_vectorizer.fit_transform(docs)
print("Matrix:\n", matrix)
print()

# toarray() converts the sparse matrix to a dense array.
doc_matrix = matrix.toarray()

print("Doc_Matrix: \n",doc_matrix);print()

# Create a DataFrame with row and column names.
# Note: get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
data_frame = pd.DataFrame(doc_matrix, columns=count_vectorizer.get_feature_names_out(), index=["Doc1", "Doc2"])
print("Doc-Term Matrix")
print(data_frame)
print()

print("Cosine similarity between two documents:") #prints cosine similarity between the two documents print(cs(data_frame,data_frame)) print()

# Without packages: compute the cosine similarity manually.

# Convert the numpy matrix to Python lists.
doc1, doc2 = doc_matrix.tolist()
c = 0    # dot product accumulator
sqa = 0  # sum of squares of doc1
sqb = 0  # sum of squares of doc2

for i in range(len(doc1)):
    c += doc1[i] * doc2[i]
    sqa += doc1[i] ** 2
    sqb += doc2[i] ** 2

ans = round(c / (sqrt(sqa) * sqrt(sqb)), 5)
print("The similarity between doc1 and doc2 is", ans)
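As a sanity check, the manual loop should match sklearn's cosine_similarity. A worked example on two fixed vectors; the documents "the cat sat" and "the cat ran" are made up for illustration, and CountVectorizer would build the vocabulary cat, ran, sat, the from them:

from math import sqrt

# "the cat sat" -> [1, 0, 1, 1]   (counts for cat, ran, sat, the)
# "the cat ran" -> [1, 1, 0, 1]
a = [1, 0, 1, 1]
b = [1, 1, 0, 1]
# Dot product: 1*1 + 0*1 + 1*0 + 1*1 = 2; both norms are sqrt(3).
dot = sum(x * y for x, y in zip(a, b))
sim = dot / (sqrt(sum(x * x for x in a)) * sqrt(sum(y * y for y in b)))
print(round(sim, 5))  # 2/3, printed as 0.66667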

