REF:爬虫实战之爬取链家_m0的博客_房价数据获取
参考:
爬虫实践中的爬链家_m0_62036306的博客-CSDN博客_爬链家爬虫实践二-爬链家链家思想分析向网页发起请求,分析网页,用xpath提取,名称,单价链家房价,总价分析网页可以看到li标签有两个属性:class="clear LOGCLICKDATA"和@class="clear LOGVIEWDATA LOGCLICKDATA",然后打开xpath工具写xpath语法匹配结果 //ul[@class=" sellListContent"]/ li[@class="clear LOGCLICKDATA"]| //ul[@class="sellListCont
链家全国房价数据分析:数据采集这方面,控制爬取频率对于账号和 IP 的保全还是很重要的。所以这次打算爬取链家上的房价数据,主要目的是把爬虫和 Python 数据分析的东西整合起来,然后做一个分析。以链家广州为例,查看网页的结构,可以看到如下图所示: 看起来内容元素的结构很清晰,分类也很好,都是我们想要的。链家对爬虫容忍度高,不会封 IP,也不会...
1. 准备更多请求头(User-Agent)
def getUserAgent():
    """Return a randomly chosen browser User-Agent string.

    Rotating the ``User-Agent`` header between requests makes the crawler's
    traffic look like it comes from a variety of browsers and devices,
    reducing the chance of being blocked.

    Returns:
        str: one entry of the built-in UA pool, chosen uniformly at random.
    """
    # NOTE: two entries were corrupted by copy-paste ("App leWebKit",
    # "IntelMac OS X", "Version/5.1Safari") and have been repaired so the
    # server always receives a well-formed UA string.
    UA_list = [
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like Mac OS X) AppleWebKit/537.51.2 (KHTML, like Gecko) Version/7.0 Mobile/11D257 Safari/9537.53",
        "Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13",
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; QIHU 360EE)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; Maxthon/3.0)",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    ]
    return random.choice(UA_list)
2.上海新楼盘数据爬取
# 时间:2022/1/1 21:57
import csv
import pickle
import random
import time

import requests
from lxml import etree
class Lianjiaspider(object):
    """Crawler for new-development listings on Lianjia (Shanghai).

    Fetches paginated listing pages, extracts per-listing fields with xpath,
    and accumulates everything into ``self.main_result``.
    """

    def __init__(self):
        # Page-number URL template for Shanghai new developments
        # (pg1, pg2, ... substituted via str.format).
        self.url = 'https://sh.fang.lianjia.com/loupan/pg{}/'

    def get_page(self, url):
        """Download one listing page and return its parsed result.

        Args:
            url (str): fully formatted page URL.

        Returns:
            dict: the per-listing dict produced by :meth:`parse_page`.
        """
        time.sleep(3)  # 请求不要太过频繁 -- throttle so requests are not too frequent
        # Rotate the User-Agent on every request (fixed spelling of
        # the old 'henders' attribute; it is only used internally).
        self.headers = {'User-Agent': getUserAgent()}
        response = requests.get(url=url, headers=self.headers)
        response.encoding = 'utf-8'
        return self.parse_page(response.text)

    def parse_page(self, html):
        """Extract listing fields from one page of HTML.

        Args:
            html (str): raw HTML of a listing page.

        Returns:
            dict: maps the listing's position on the page (int) to a dict of
            raw xpath results. Values are lists of strings, NOT yet cleaned
            (stripping is deliberately left to the analysis step).
        """
        tree = etree.HTML(html)
        parse_page_result = {}
        # Listings appear under two <li> class variants on the site; the
        # union xpath matches both. (Built with the XpathHelper plugin.)
        li_list = tree.xpath(
            '//ul[@class="resblock-list-wrapper"]/li[@class="resblock-list post_ulog_exposure_scroll has-results"]'
            '|//ul[@class="resblock-list-wrapper"]/li[@class="resblock-list post_ulog_exposure_scroll"]'
        )
        for i, li in enumerate(li_list):
            house_dict = {
                # 名称 (name)
                'house_name': li.xpath('.//div/div[1]/a/text()'),
                # 总价 (total price)
                'total_price': li.xpath('.//div[@class="second"]/text()'),
                # 单价 (unit price) and its unit description
                'unit_price': li.xpath('.//div[@class="main-price"]/span[@class="number"]/text()'),
                'unit_price_desc': li.xpath('.//div[@class="main-price"]/span[@class="desc"]/text()'),
                # 房屋类型 (property type)
                'type': li.xpath('.//div[@class="resblock-name"]/span[1]/text()'),
                # 房屋所在区 (district)
                'location1': li.xpath('.//div[@class="resblock-location"]/span[1]/text()'),
                # 房屋面积 (floor area)
                'area': li.xpath('.//div[@class="resblock-area"]/span[1]/text()'),
            }
            print(house_dict)
            parse_page_result[i] = house_dict
        return parse_page_result

    def main(self, start_page=88, end_page=91):
        """Crawl pages ``[start_page, end_page)`` into ``self.main_result``.

        Args:
            start_page (int): first page number, inclusive (default 88 — the
                run was resumed here after a manual captcha on page 85+).
            end_page (int): last page number, exclusive (default 91).

        Returns:
            dict: ``self.main_result``, keyed by 'pgN' per page.
        """
        self.main_result = {}
        for pg in range(start_page, end_page):
            print('page:', pg)
            url = self.url.format(pg)
            self.main_result['pg' + str(pg)] = self.get_page(url)
        return self.main_result
if __name__ == "__main__":
    # Time the whole crawl run end-to-end.
    t0 = time.time()
    lianjia = Lianjiaspider()
    lianjia.main()
    elapsed = time.time() - t0
    print('执行时间为:%0.2f' % elapsed)
3. 保存爬取结果
# 暂时存一下爬取结果:爬到85页时,要求进行验证,所以手动进行验证后,重新从85页开始爬取
# result_1_8 = lianjia.main_result
# result_8_61 = lianjia.main_result
# result_85_90 = lianjia.main_result
# Merge the three partial crawl snapshots (pages 1-8, 8-61, 85-90) into one
# dict.  Fixed: the original referenced the undefined name `result_1_7`;
# the snapshot saved above is `result_1_8`, which raised a NameError.
shanghai_new = result_1_8.copy()
shanghai_new.update(result_8_61)
shanghai_new.update(result_85_90)
# 保存
def pkl_save(filename, file):
    """Serialize *file* (any picklable object) to the path *filename*.

    Args:
        filename (str): destination path for the pickle file.
        file: the object to serialize (here: the merged results dict).
    """
    # Context manager guarantees the handle is closed even if
    # pickle.dump raises; the original leaked the handle on error.
    with open(filename, 'wb') as output:
        pickle.dump(file, output)
filename = './shanghai_new.pkl' # output path for the pickled results
file = shanghai_new # merged dict of all crawled pages (section 3 above)
pkl_save(filename, file)
备注:
其他数据(上海二手房、杭州新房、杭州二手房)的爬取操作类似。但是二手房数量巨大,所以从网页获取区域列表后,按照区域抓取二手房信息(以上海为例:具体到街道或城镇,如北蔡、川沙、花木等)。