# level 2: 10. Case study: write a crawler that scrapes the Douban movie chart
# (movie title, rating) and saves it as a CSV file.
#   a. Load the CSV data with numpy.
#   b. Convert the rating column to float64.
#   c. Compute the average movie rating.
#   d. Find the highest-rated movie.
#   e. Find the movies rated above 9. """
# NOTE(review): the line above ends with the closing delimiter of a module
# docstring whose opening quotes are not visible in this part of the file. The
# header is written so the module parses whether or not that opener exists
# above; confirm against the full file and tidy the header accordingly.
import csv

import numpy as np
import requests
from lxml import etree

# Single location of the CSV file shared by the scraper and the loaders.
CSV_PATH = './doubandianying.csv'


def getHtml():
    """Scrape the Douban movie chart and save (title, grade) rows to CSV_PATH.

    Requests the chart page, extracts one (title, rating) pair per chart row,
    prints the extracted titles and ratings, and writes them to the CSV file
    under a ``title,grade`` header row.

    Raises:
        requests.RequestException: if the request times out, fails, or the
            server answers with an HTTP error status.
    """
    url = 'https://movie.douban.com/chart'
    # NOTE(review): the Cookie below looks like a captured logged-in browser
    # session (dbcl2 / ck values) — presumably it should be rotated and read
    # from configuration instead of living in source control; confirm.
    headers = {
        "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0",
        'Cookie':'ll="118237"; bid=kQ4wCGaUHxM; dbcl2="198098900:5Dr+gGK65ck"; ck=u-be; _pk_id.100001.4cf6=842ffa65a9a6b8b3.1560771548.1.1560771681.1560771548.; _pk_ses.100001.4cf6=*; __yadk_uid=ACadYi5zL218X3UjCuwIiXTk7lThAmup; __utma=30149280.26375845.1560771555.1560771555.1560771555.1; __utmb=30149280.2.10.1560771555; __utmc=30149280; __utmz=30149280.1560771555.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utma=223695111.1679117071.1560771555.1560771555.1560771555.1; __utmb=223695111.0.10.1560771555; __utmc=223695111; __utmz=223695111.1560771555.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); push_noty_num=0; push_doumail_num=0; _vwo_uuid_v2=D0397C64E4418CF03F84A9F99DED3AE28|9c841c774e9ad1066dc8a2ca931d9a9a; __utmt=1; __utmv=30149280.19809'
    }

    # 1. Request the page source. The timeout keeps a stalled connection from
    #    hanging the script; an HTTP error status is raised here instead of
    #    silently producing an empty CSV that the loaders choke on later.
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()

    # 2. Extract the data. Title and rating are read from the same chart row
    #    so that a row without a rating cannot shift every later rating onto
    #    the wrong title (which zipping two page-wide lists would do).
    #    NOTE(review): assumes each rating span sits inside its tr.item row —
    #    confirm against the live page markup.
    html = etree.HTML(response.text)
    rows = []
    for item in html.xpath('//tr[@class="item"]'):
        item_titles = item.xpath('.//a/@title')
        item_grades = item.xpath('.//span[@class="rating_nums"]/text()')
        if item_titles and item_grades:
            rows.append((item_titles[0], item_grades[0]))

    # 3. Show what was extracted (movie titles first, then their ratings).
    print([title for title, _ in rows])
    print([grade for _, grade in rows])

    # 4. Save as CSV. newline='' is what the csv module expects (otherwise
    #    blank rows appear on Windows); UTF-8 keeps the Chinese titles
    #    readable regardless of the platform's default encoding.
    with open(CSV_PATH, 'w', encoding='utf-8', newline='') as f:
        csv_f = csv.writer(f)
        # Header row first, then one row per movie.
        csv_f.writerow(["title", "grade"])
        csv_f.writerows(rows)


def _load_csv(usecols, ndmin):
    """Load the given column(s) of CSV_PATH as strings, skipping the header.

    ``ndmin`` pins the rank of the result so a file with a single data row
    does not collapse to a lower-dimensional array.
    """
    return np.loadtxt(
        CSV_PATH,
        delimiter=',',
        dtype=str,
        usecols=usecols,
        skiprows=1,
        ndmin=ndmin,
        encoding='utf-8',
    )


def loadTxt():
    """Return the saved chart as a 2-D string array (columns: title, grade)."""
    return _load_csv(usecols=(0, 1), ndmin=2)


def chage():
    """Return the grade column of the saved chart as a float64 array."""
    # np.float64 replaces the np.float alias, which NumPy 1.24 removed.
    return _load_csv(usecols=1, ndmin=1).astype(np.float64)


def mean(gradeFloat):
    """Return the arithmetic mean of the grades in ``gradeFloat``."""
    return np.mean(gradeFloat)


def movie(gradeFloat):
    """Return the title of the movie with the highest grade.

    ``gradeFloat`` must be in the same row order as the CSV file, because the
    position of its maximum is used to index the title column.

    Raises:
        ValueError: if ``gradeFloat`` is empty (raised by ``np.argmax``).
    """
    index = np.argmax(gradeFloat)
    titles = _load_csv(usecols=0, ndmin=1)
    return titles[index]


def movies(gradeFloat):
    """Print and return the grades in ``gradeFloat`` that are greater than 9."""
    res = gradeFloat[gradeFloat > 9]
    print(res)
    return res


if __name__ == '__main__':
    # 1. Scrape the data.
    getHtml()
    # 2. Load the data.
    lt = loadTxt()
    print(lt)
    # 3. Convert the grade column to floating point.
    gradeFloat = chage()
    print(gradeFloat)
    print(type(gradeFloat))
    # 4. Compute the average grade.
    gradeMean = mean(gradeFloat)
    print(gradeMean)
    # 5. Highest-rated movie.
    movieFirst = movie(gradeFloat)
    print(movieFirst)
    # 6. Grades above 9 (none). The result goes into its own name so the
    #    movies() function is not overwritten by its return value.
    highGrades = movies(gradeFloat)