
1. Prepare a pool of User-Agent headers

import random  # needed for random.choice below

def getUserAgent():
    """Return a randomly chosen User-Agent string from the prepared pool."""
    UA_list = [
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like Mac OS X) App leWebKit/537.51.2 (KHTML, like Gecko) Version/7.0 Mobile/11D257 Safari/9537.53",
        "Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13",
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ;  QIHU 360EE)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; Maxthon/3.0)",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (Macintosh; U; IntelMac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1Safari/534.50",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",

        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"]
    return random.choice(UA_list)
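
A quick sanity check for the rotation (not part of the original script, and it assumes outbound network access): httpbin.org simply echoes back the User-Agent it received, so the round trip should match the string we sent.

# Hedged sketch: verify the random User-Agent actually reaches the server.
import requests

for _ in range(3):
    ua = getUserAgent()
    resp = requests.get('https://httpbin.org/user-agent',
                        headers={'User-Agent': ua})
    print(resp.json()['user-agent'] == ua)  # expect True on every iteration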

2. Scraping new-home listings in Shanghai

# Date: 2022/1/1 21:57
import requests
from lxml import etree


import time
import csv
import pickle


class Lianjiaspider(object):
    def __init__(self):
        # base URL; pages are selected via pg{}
        self.url = 'https://sh.fang.lianjia.com/loupan/pg{}/'

    def get_page(self, url):
        time.sleep(3)  # throttle: don't hit the site too often
        self.headers = {'User-Agent': getUserAgent()}  # pick a random UA from the pool above
        response = requests.get(url=url, headers=self.headers)
        response.encoding = 'utf-8'
        html = response.text
        return self.parse_page(html)

    def parse_page(self, html):
        parse_html = etree.HTML(html)
        parse_page_result = {}
        # the XPath can be worked out with the XPath Helper browser extension
        li_list = parse_html.xpath(
            '//ul[@class="resblock-list-wrapper"]/li[@class="resblock-list post_ulog_exposure_scroll has-results"]'
            '|//ul[@class="resblock-list-wrapper"]/li[@class="resblock-list post_ulog_exposure_scroll"]')
        print(li_list)
        for i in range(len(li_list)):
            li = li_list[i]
            house_dict = {}
            # each xpath call returns a list; [0].strip() post-processing is left for later
            # name
            house_dict['house_name'] = li.xpath('.//div/div[1]/a/text()')
            # total price
            house_dict['total_price'] = li.xpath('.//div[@class="second"]/text()')
            # unit price
            house_dict['unit_price'] = li.xpath('.//div[@class="main-price"]/span[@class="number"]/text()')
            house_dict['unit_price_desc'] = li.xpath('.//div[@class="main-price"]/span[@class="desc"]/text()')
            # property type
            house_dict['type'] = li.xpath('.//div[@class="resblock-name"]/span[1]/text()')
            # district
            house_dict['location1'] = li.xpath('.//div[@class="resblock-location"]/span[1]/text()')
            # floor area
            house_dict['area'] = li.xpath('.//div[@class="resblock-area"]/span[1]/text()')
            print(house_dict)
            parse_page_result[i] = house_dict
        return parse_page_result

    def main(self):
        self.main_result = {}
        for pg in range(88, 91):
            print('page:', pg)
            url = self.url.format(pg)
            self.main_result['pg' + str(pg)] = self.get_page(url)


if __name__ == "__main__":
    start = time.time()
    lianjia = Lianjiaspider()
    lianjia.main()
    end = time.time()
    print('Elapsed time: %0.2f s' % (end - start))
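
The run can be cut short by the site's human-verification page (see the note in section 3 below). Rather than silently parsing an empty result, a small guard can make that failure explicit. This is only a sketch: the 'captcha' and '验证' markers are assumptions about what a blocked Lianjia response contains, to be adjusted after inspecting a real one.

def looks_blocked(response):
    # Heuristic only: the verification page's URL and text are assumptions.
    return 'captcha' in response.url.lower() or '验证' in response.text

# Possible hook inside get_page(), right after requests.get(...):
# if looks_blocked(response):
#     raise RuntimeError('Verification required at %s; resume from this page' % url)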

3. Saving the results

# Stash the scrape results for now: at page 85 the site demanded verification, so after passing it manually, scraping was restarted from page 85

# result_1_7 = lianjia.main_result
# result_8_61 = lianjia.main_result
# result_85_90 = lianjia.main_result
shanghai_new = result_1_7.copy()
shanghai_new.update(result_8_61)
shanghai_new.update(result_85_90)
# 保存
def pkl_save(filename, file):
    # serialize `file` to `filename` via pickle
    with open(filename, 'wb') as output:
        pickle.dump(file, output)
    
filename = './shanghai_new.pkl'  # output path
file = shanghai_new  # object to save
pkl_save(filename, file)
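
The script imports csv but never actually writes one. For downstream analysis it is convenient to load the pickle back and flatten the nested structure (page key -> listing index -> field lists) into rows; a minimal sketch, assuming the dict layout produced by parse_page above:

import csv
import pickle

def pkl_load(filename):
    # counterpart to pkl_save: read the pickled dict back in
    with open(filename, 'rb') as f:
        return pickle.load(f)

fields = ['house_name', 'total_price', 'unit_price',
          'unit_price_desc', 'type', 'location1', 'area']
shanghai_new = pkl_load('./shanghai_new.pkl')
with open('./shanghai_new.csv', 'w', newline='', encoding='utf-8-sig') as f:
    writer = csv.writer(f)
    writer.writerow(['page'] + fields)
    for page, houses in shanghai_new.items():
        for idx, house in houses.items():
            # each field holds the raw list returned by xpath; join it into one cell
            writer.writerow([page] + ['|'.join(s.strip() for s in house[k])
                                      for k in fields])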

Notes:

Scraping the other datasets (Shanghai second-hand homes, Hangzhou new homes, Hangzhou second-hand homes) follows the same procedure. Second-hand listings are far more numerous, though, so after fetching the list of districts from the page, they are scraped district by district (for Shanghai, down to the street or town level, such as 北蔡, 川沙 and 花木), as sketched below.
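
As a sketch of that per-district approach (the URL pattern and the district slugs below are assumptions based on Lianjia's usual URL layout, to be confirmed against the live site; the real district list would first be scraped from the page as described above):

# Hedged sketch: enumerate per-district listing URLs for second-hand homes.
BASE = 'https://sh.lianjia.com/ershoufang/{district}/pg{page}/'

def district_urls(districts, pages=100):
    for d in districts:
        for pg in range(1, pages + 1):
            yield BASE.format(district=d, page=pg)

# e.g. with slugs for 北蔡 / 川沙 / 花木 (assumed spellings):
# for url in district_urls(['beicai', 'chuansha', 'huamu']):
#     ...fetch and parse with a second-hand-specific XPath, not the new-home one above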