IR Practicals 5, 6, 7, 8 (PDF)

Author: AKHILESH SONI
Course: Modern Poetry (a)
Institution: University of Delhi



Description

PRACTICAL 5

import requests
from bs4 import BeautifulSoup

# Fetch the URL by requesting that page from the server.
page_url = 'http://forecast.weather.gov/MapClick.php?lat=37.7772&lon=-122.4168'
page = requests.get(page_url)

# The page variable holds the HTML content and metadata, everything.
# A status_code of 200 means that the page downloaded successfully.
print("Page Status: ", page.status_code)
# To see the content of the page: print(page.content)

# Use the BeautifulSoup library to parse the document and extract text from tags.
soup = BeautifulSoup(page.content, 'html.parser')

# Find the div tag by its id.
seven_day = soup.find(id="seven-day-forecast")
# The prettify() method prints the content in a nicely formatted manner:
# print("Content: ", seven_day.prettify())

# Find elements by HTML class name.
forecast_items = seven_day.find_all(class_="tombstone-container")
print("Getting to the actual data: ")

# Extract text from the HTML elements.
for item in forecast_items:
    period = item.find(class_="period-name")
    short_desc = item.find(class_="short-desc")
    temp = item.find(class_="temp")
    # Skip forecast items that are missing any of the three fields,
    # before calling get_text() on a possible None.
    if None in (period, short_desc, temp):
        continue
    print("Day and Time: ", period.get_text().strip())
    print("Weather: ", short_desc.get_text().strip())
    print("Temperature: ", temp.get_text().strip())
    print()
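The id and class names above (seven-day-forecast, tombstone-container, period-name, short-desc, temp) are specific to forecast.weather.gov and can change, and the script needs network access. Here is a minimal offline sketch of the same BeautifulSoup calls against a small hand-written HTML snippet; the markup is hypothetical, made up purely for illustration:

from bs4 import BeautifulSoup

# Hypothetical markup mimicking one forecast "tombstone".
html = '''
<div id="seven-day-forecast">
  <div class="tombstone-container">
    <p class="period-name">Tonight</p>
    <p class="short-desc">Partly Cloudy</p>
    <p class="temp">Low: 51 F</p>
  </div>
</div>
'''
soup = BeautifulSoup(html, 'html.parser')
seven_day = soup.find(id="seven-day-forecast")
for item in seven_day.find_all(class_="tombstone-container"):
    # Same extraction pattern as the practical, on local data.
    print(item.find(class_="period-name").get_text())  # Tonight
    print(item.find(class_="short-desc").get_text())   # Partly Cloudy
    print(item.find(class_="temp").get_text())         # Low: 51 F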

PRACTICAL 6

# Prac-6: Write a Python program to implement a simple web crawler.

import requests as rq
from bs4 import BeautifulSoup as bs
from PIL import Image as pillow
import io

def webpage(page, weburl):
    # Check that there is at least one web page to fetch at the provided weburl.
    if page > 0:
        url = weburl
        # Send a GET request to the specified URL.
        code = rq.get(url)
        plain = code.text
        # Parse the HTML page fetched from the URL.
        soup = bs(plain, "html.parser")

        child = ""
        # Fetch the first 166 bytes of a child node of the HTML document.
        for i in str(list(soup.children)[3]).encode("UTF-8")[:166]:
            # encode() converts the string to a bytes object;
            # chr() converts each byte value back to its character.
            child += chr(i)
        print("Child:")
        print(child)
        print()

        # Find all anchor tags with class 'w3-btn w3-green'.
        for link in soup.findAll('a', {'class': 'w3-btn w3-green'}):
            # Get the href attribute.
            getHref = link.get('href')
            # Get the link text.
            getText = link.get_text()
            print("getHref: ", getHref)
            print("getText: ", getText)

        # Find all img tags.
        for item in soup.findAll('img'):
            # Fetch the source link for the image.
            print("getImageLink: ", item['src'])
            # Exclude images with a '.gif' extension.
            if item['src'][-3:] == "gif":
                pass
            else:
                # If the src path does not start with '/', prepend one.
                if item['src'][0] != "/":
                    item['src'] = '/' + item['src']
                response = rq.get(weburl + item["src"])
                # Load the response data as a bytes object.
                imageByte = io.BytesIO(response.content)
                # Open and display the image using Pillow.
                img = pillow.open(imageByte)
                img.show()

webpage(1,"https://www.w3schools.com")
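webpage() above fetches and inspects a single page rather than following links. A minimal sketch of the recursive step a fuller crawler would add, with a visited set and a depth limit to keep it bounded; the helper name crawl and the depth parameter are assumptions for illustration, not part of the practical:

import requests as rq
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin

def crawl(url, depth=2, visited=None):
    # Bound the recursion and skip URLs that were already crawled.
    if visited is None:
        visited = set()
    if depth == 0 or url in visited:
        return
    visited.add(url)
    print("Crawling:", url)
    soup = bs(rq.get(url).text, "html.parser")
    # Resolve each href against the current URL and recurse into it.
    for link in soup.findAll('a', href=True):
        crawl(urljoin(url, link['href']), depth - 1, visited)

crawl("https://www.w3schools.com", depth=2)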

PRACTICAL 7

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download the required NLTK packages (stopwords, punkt).
nltk.download('stopwords')
nltk.download('punkt')

doc = input("Enter a sentence: ")

# The unique stop words of the 'english' language.
stop_words = set(stopwords.words("english"))

# Tokenize the words in the document.
words_tokens = word_tokenize(doc)
print("Word tokens:\n", words_tokens)

# Keep the words that are not present in stop_words.
filtered = [i for i in words_tokens if i not in stop_words]
print("Sentence without stop words:")
print(*filtered)
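Note that stopwords.words("english") is all lowercase, so capitalized tokens such as "The" pass the filter above unchanged. A small sketch of the usual fix, lowercasing each token before the membership test:

# Compare tokens case-insensitively against the stop word list.
filtered = [i for i in words_tokens if i.lower() not in stop_words]
print("Sentence without stop words (case-insensitive):")
print(*filtered)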

PRACTICAL 8

# Cosine similarity formula: cos(a, b) = a . b / (||a|| * ||b||)
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity as cs
from sklearn.feature_extraction.text import CountVectorizer as cv
from math import sqrt

docs = []
doc1 = input("Enter document-1: ")
doc2 = input("Enter document-2: ")
docs.append(doc1)
docs.append(doc2)
print("Doc list: ", docs)

# Initialize the CountVectorizer.
count_vectorizer = cv()

# fit_transform() converts a collection of text documents to a matrix of token counts.
matrix = count_vectorizer.fit_transform(docs)
print("Matrix:\n", matrix)
print()

# toarray() converts the sparse matrix to a dense array.
doc_matrix = matrix.toarray()

print("Doc_Matrix: \n",doc_matrix);print()

# Create a DataFrame with row and column names.
# Note: get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
data_frame = pd.DataFrame(doc_matrix, columns=count_vectorizer.get_feature_names_out(), index=["Doc1", "Doc2"])
print("Doc-Term Matrix")
print(data_frame)
print()

print("Cosine similarity between two documents:") #prints cosine similarity between the two documents print(cs(data_frame,data_frame)) print()

# Without packages: compute the cosine similarity manually.

# Convert the numpy matrix to Python lists.
doc1, doc2 = doc_matrix.tolist()
c = 0    # dot product accumulator
sqa = 0  # sum of squares of doc1
sqb = 0  # sum of squares of doc2

for i in range(len(doc1)):
    c += doc1[i] * doc2[i]
    sqa += doc1[i] ** 2
    sqb += doc2[i] ** 2

ans = round(c / (sqrt(sqa) * sqrt(sqb)), 5)
print("The similarity between doc1 and doc2 is", ans)
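As a sanity check, the manual loop should match sklearn's cosine_similarity. A worked example on two fixed vectors; the documents "the cat sat" and "the cat ran" are made up for illustration, and CountVectorizer would build the vocabulary cat, ran, sat, the from them:

from math import sqrt

# "the cat sat" -> [1, 0, 1, 1]   (counts for cat, ran, sat, the)
# "the cat ran" -> [1, 1, 0, 1]
a = [1, 0, 1, 1]
b = [1, 1, 0, 1]
# Dot product: 1*1 + 0*1 + 1*0 + 1*1 = 2; both norms are sqrt(3).
dot = sum(x * y for x, y in zip(a, b))
sim = dot / (sqrt(sum(x * x for x in a)) * sqrt(sum(y * y for y in b)))
print(round(sim, 5))  # 2/3, printed as 0.66667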

