The follow-list page and the fans page share the same structure, so a single function can handle both similar tasks. Start by examining the URL pattern: open any user's follow page and fans page on the mobile web version of Weibo. The URLs take the following two forms:
Follow: https://weibo.cn/uid/follow?page=<page>
Fans: https://weibo.cn/uid/fans?page=<page>
Either opens the corresponding page without trouble. Next, check how many pages the system allows us to view.
Write a function that returns the number of pages we can iterate over:
def getPageNum(soup):
    """
    soup: the parsed HTML of a follow/fans page, a BeautifulSoup instance
    """
    # the hidden input named 'mp' on the mobile page stores the total page count
    return int(soup.find(name='input', attrs={'name': 'mp'})['value'])
Likewise, inspect the page with F12 (developer tools).
Each user's homepage link appears twice (red boxes), and in both cases the parent tag is <td> with the attribute valign="top", so every followed user's (or fan's) information shows up twice. Extraction is simple: use findAll to collect all of these tags, then pick every other one (step=2). The blue boxes also show that a user's homepage URL does not necessarily contain the UID (a custom domain is displayed preferentially), which is why obtaining the UID first matters. Admittedly this is a somewhat clumsy approach, since every record appears twice. Note that the first such tag in the figure additionally has the attribute style="width: 52px". A simpler option is the content of the yellow boxes: that tag is the link of the "关注他" (Follow) button, which appears exactly once per user, and the link already contains the uid. Using that tag's information, the separate first step of fetching the uid would be unnecessary (a sketch of this variant follows the code below).
Code using the tag information described above:
def getRelation(person, relation, mobile=False):
    if (relation := relation.lower()) not in ['focus', 'fans']:
        raise ValueError("argument relation must be in ['focus', 'fans']")
    if relation == 'focus':
        pageUrlFunc = focusPageUrl
        addFunc = person.addFocus
    else:
        pageUrlFunc = fansPageUrl
        addFunc = person.addFans
    url = pageUrlFunc(person, 1, mobile)
    soup = getHtml(url=url, headers=header)
    pageNum = getPageNum(soup)
    for page in range(1, pageNum + 1):
        url = pageUrlFunc(person, page, mobile)
        soup = getHtml(url=url, headers=header)
        blank = ' ' if page < 10 else ''
        print('scraping page', str(page) + blank)
        memberList = soup.findAll(name='td', attrs={'valign': 'top'})
        # each user appears twice, so step over the list two at a time
        for i in range(1, len(memberList), 2):
            memberInfo = memberList[i].find(name='a')
            name = memberInfo.text
            uid = memberInfo['href'].split('/')[-1]
            addFunc(WeiboUser(name=name, uid=uid))
    print(person.name + ': ' + relation + " info scraped successfully\n-------------------------------------------")
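As noted above, the yellow-box approach would be simpler. Here is a minimal sketch of that variant; it assumes the follow button's href carries the uid as a query parameter (e.g. something like /attention/add?uid=..., a guess from the figure rather than a verified page structure):

import re
from urllib.parse import urlparse, parse_qs

def getUidsByButton(soup):
    """Collect uids from follow-button links on one page (hypothetical helper).
    Each button appears once per user, so no two-at-a-time stepping is needed."""
    uids = []
    for a in soup.findAll(name='a', href=re.compile(r'uid=\d+')):
        # the uid is read straight from the link's query string
        uids.append(parse_qs(urlparse(a['href']).query)['uid'][0])
    return uids

The display name would still have to come from the profile link next to the button, but the separate UID lookup step disappears.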
4. Basic links and HTML
def homepageUrl(person, mobile=False):
    if mobile:
        pofix = ''
        if person.uid:
            pofix = person.uid
        elif person.customDomain:
            pofix = person.customDomain
        else:
            raise RuntimeError('uid or customDomain is required')
        return 'https://weibo.cn/' + pofix
    if oid := person.oid:
        return 'https://www.weibo.com/u/' + oid
    if pid := person.pageId:
        return 'https://www.weibo.com/p/' + pid
    if domain := person.customDomain:
        return 'https://www.weibo.com/' + domain
def focusPageUrl(person, page=1, mobile=False):
    if mobile:
        return 'https://weibo.cn/' + person.uid + '/follow?page=' + str(page)
    return 'https://weibo.com/p/' + person.pageId + '/follow?page=' + str(page)

def fansPageUrl(person, page=1, mobile=False):
    if mobile:
        return 'https://weibo.cn/' + person.uid + '/fans?page=' + str(page)
    return 'https://weibo.com/p/' + person.pageId + '/follow?relate=fans&page=' + str(page)
def getHtml(url, headers):
    response = requests.get(url=url, headers=headers)
    if (html := response.text):
        return bs(html, 'lxml')
    print('empty response, requesting again')
    return getHtml(url, headers)  # the original dropped this return, losing the retry's result
The getHtml function takes a URL and request headers and returns a BeautifulSoup instance. Without logging in to Weibo we cannot see a user's follow list, so the request headers should carry login information, and the cookie is exactly the item that carries it. Open a browser, log in to Weibo, press F12 to open developer tools, and select the Network tab.
Save the entire cookie content. To simulate browsing with a real browser, save the user-agent as well.
cookie = '**************'
userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ****'
header = {'User-Agent': userAgent,
          'cookie': cookie}
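Hard-coding credentials in the script is easy to leak; one option (my suggestion, not part of the original) is to read the cookie from an environment variable instead:

import os

# export WEIBO_COOKIE='...' in the shell before running
cookie = os.environ.get('WEIBO_COOKIE', '')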
Beyond that, getHtml may also recurse. Sometimes, for various reasons, a request gets no response; letting getHtml call itself amounts to retrying a few more times until a result comes back. But there is no need to retry indefinitely (Python does not allow unbounded recursion anyway: the recursion depth is capped), and setting the recursion limit enforces this. At the top of the file:
import sys
sys.setrecursionlimit(10)  # set the recursion depth
At the end of the program, restore the default recursion depth (998 here). (With the depth set to 10, importing the matplotlib package fails.)
sys.setrecursionlimit(998)
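Changing the global recursion limit affects the whole process, as the matplotlib failure shows. An alternative sketch that caps retries with an ordinary loop instead of recursion (a variant of getHtml, reusing the script's requests and bs imports):

def getHtmlLoop(url, headers, maxRetries=10):
    # try at most maxRetries times; no recursion limit involved
    for _ in range(maxRetries):
        response = requests.get(url=url, headers=headers)
        if response.text:
            return bs(response.text, 'lxml')
        print('empty response, requesting again')
    return None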
Requesting at high frequency may get your access restricted. Add a statement to getHtml so that it pauses for a while before each request, simulating a human's pace.
import time
pauseTime = 1  # one second
def getHtml(url, headers):
    # *** as before ***
    time.sleep(pauseTime)
    # *** as before ***
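A fixed interval is easy for rate limiters to spot; a small variation (my tweak, not in the original) randomizes the pause:

import random

def pause(low=1, high=3):
    # sleep a random interval between low and high seconds before each request
    time.sleep(random.uniform(low, high))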
5. Saving the data
Using a class as the template for a user makes the code more readable; creating a user then takes nothing more than instantiating the class. The WeiboUser class was already implemented in the preparatory article.
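The class itself lives in the preparatory article and is not repeated here. For reference, a minimal sketch of what it might look like, inferred purely from the attributes and methods this script calls (the real implementation may differ):

class WeiboUser:
    def __init__(self, name='', uid='', oid='', pageId='', customDomain=''):
        self.name = name
        self.uid = uid
        self.oid = oid
        self.pageId = pageId
        self.customDomain = customDomain
        self.identity = self.sex = self.location = self.description = ''
        self.birthYear = self.birthMonth = self.birthDay = None
        self.realFansNum = self.realFocusNum = 0
        self.focusList = []  # users this user follows
        self.fansList = []   # users following this user

    def addFocus(self, user):
        self.focusList.append(user)

    def addFans(self, user):
        self.fansList.append(user)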
6. Complete code
import requests
from bs4 import BeautifulSoup as bs
from Person import WeiboUser  # implemented in the preparatory article
import regex
import time
import sys
import os
sys.setrecursionlimit(10)  # cap the recursion depth so the same page is not retried endlessly
pauseTime = 1
# the values are the Chinese field labels as they appear on the info page
basicInfoType = {'nickname': '昵称',
                 'identity': '认证',
                 'sex': '性别',
                 'location': '地区',
                 'description': '简介'
                 }
cookie = '************************************'
userAgent = '********************************'
header = {'User-Agent': userAgent,
          'cookie': cookie}
def homepageUrl(person, mobile=False):
    if mobile:
        pofix = ''
        if person.uid:
            pofix = person.uid
        elif person.customDomain:
            pofix = person.customDomain
        else:
            raise RuntimeError('necessary information is missing')
        return 'https://weibo.cn/' + pofix
    if oid := person.oid:
        return 'https://www.weibo.com/u/' + oid
    if pid := person.pageId:
        return 'https://www.weibo.com/p/' + pid
    if domain := person.customDomain:
        return 'https://www.weibo.com/' + domain
def focusPageUrl(person, page=1, mobile=False):
    if mobile:
        # weibo.cn paths use the uid (the URL pattern in section 3 also uses uid)
        return 'https://weibo.cn/' + person.uid + '/follow?page=' + str(page)
    return 'https://weibo.com/p/' + person.pageId + '/follow?page=' + str(page)

def fansPageUrl(person, page=1, mobile=False):
    if mobile:
        return 'https://weibo.cn/' + person.uid + '/fans?page=' + str(page)
    return 'https://weibo.com/p/' + person.pageId + '/follow?relate=fans&page=' + str(page)
def getHtml(url, headers):
    time.sleep(pauseTime)
    response = requests.get(url=url, headers=headers)
    if (html := response.text):
        return bs(html, 'lxml')
    print('empty response, requesting again')
    return getHtml(url, headers)
def getInfoFromText(soup, tagName, attrs):
    return soup.find(name=tagName, attrs=attrs).text

def getInfoFromAttr(soup, searchTagName, searchAttr, targetedAttrName):
    # an attribute value is already a string, so no .text here
    return soup.find(name=searchTagName, attrs=searchAttr)[targetedAttrName]
def getPageNum(soup):
    return int(soup.find(name='input', attrs={'name': 'mp'})['value'])
def getUid(soup):
    addr = soup.find(name='a', attrs={'href': regex.compile(r"\S*/info")})
    # print(addr)
    if addr:
        return addr['href'].split('/')[1]
    print('uid lookup failed, skipping')
    return None
def infoPageUrl(person):
    return 'https://weibo.cn/' + person.uid + '/info'
def getCustomDomain(soup):
    """
    soup: html of the info page
    """
    # the info page shows a line like '手机版:https://weibo.cn/<domain>'
    addr = soup.find(text=regex.compile(r"手机版:https://weibo.cn/*"))
    # print(addr)
    if addr:
        if len(urlSection := addr.split('/')) == 4:
            return urlSection[-1]
    return ''
def getBirthdate(soup):
    """
    soup: html of the info page
    """
    # matches a line like '生日:1981-09-23' (the label means 'birthday')
    date = soup.find(text=regex.compile(r"生日:*"))
    if date:
        dateBlock = date.split(':')[-1].split('-')
        if len(dateBlock) == 3:
            return int(dateBlock[0]), int(dateBlock[1]), int(dateBlock[2])
        if len(dateBlock) == 2:
            return int(dateBlock[0]), int(dateBlock[1])
        if len(dateBlock) == 1:
            return (int(dateBlock[0]),)  # a 1-tuple so the caller can call len() on it
    return None
def getBasicInfo(soup, infoType):
    """
    soup: html of the info page
    infoType: nickname, identity, sex, location, description
    """
    if (infoType := infoType.lower()) not in basicInfoType:
        raise ValueError('wrong basic information type\n' + str(basicInfoType))
    pattern = basicInfoType[infoType] + ':*'
    infoSection = soup.find(text=regex.compile(pattern))
    if infoSection:
        # rejoin in case the value itself contains a colon
        return ':'.join(infoSection.split(':')[1:])
    return ''
def getUserHomepageInfo(person, mobile=False):
    # mobile marks whether the mobile web version is requested
    homeURL = homepageUrl(person, mobile)
    print(homeURL)
    soup = getHtml(url=homeURL, headers=header)
    if uid := getUid(soup):
        person.uid = uid
        # on weibo.cn the fans/follow links use the uid in their paths
        person.realFansNum = soup.find(name='a', attrs={'href': '/' + person.uid + '/fans'}).text.split('[')[1][:-1]
        person.realFocusNum = soup.find(name='a', attrs={'href': '/' + person.uid + '/follow'}).text.split('[')[1][:-1]
    infoURL = infoPageUrl(person)
    infoSoup = getHtml(url=infoURL, headers=header)
    if (customDomain := getCustomDomain(infoSoup)):
        person.customDomain = customDomain
    if (date := getBirthdate(infoSoup)):
        try:
            if len(date) == 3:
                person.birthYear, person.birthMonth, person.birthDay = date
            if len(date) == 2:
                # a number above 12 cannot be a month, so read it as the year
                if date[0] > 12:
                    person.birthYear, person.birthMonth = date
                else:
                    person.birthMonth, person.birthDay = date
            if len(date) == 1:
                person.birthYear = date[0]
        except Exception:
            pass
    if (name := getBasicInfo(infoSoup, 'nickname')):
        person.name = name
    if (identity := getBasicInfo(infoSoup, 'identity')):
        person.identity = identity
    if (sex := getBasicInfo(infoSoup, 'sex')):
        person.sex = sex
    if (description := getBasicInfo(infoSoup, 'description')):
        person.description = description
    if (location := getBasicInfo(infoSoup, 'location')):
        person.location = location
    print(person.name + " homepage info scraped successfully\n-----------------------------------------------")
def getRelation(person, relation, mobile=False):
    if (relation := relation.lower()) not in ['focus', 'fans']:
        raise ValueError("argument relation must be in ['focus', 'fans']")
    if relation == 'focus':
        pageUrlFunc = focusPageUrl
        addFunc = person.addFocus
    else:
        pageUrlFunc = fansPageUrl
        addFunc = person.addFans
    url = pageUrlFunc(person, 1, mobile)
    soup = getHtml(url=url, headers=header)
    pageNum = getPageNum(soup)
    for page in range(1, pageNum + 1):
        url = pageUrlFunc(person, page, mobile)
        soup = getHtml(url=url, headers=header)
        blank = ' ' if page < 10 else ''
        print('scraping page', str(page) + blank)
        memberList = soup.findAll(name='td', attrs={'valign': 'top'})
        # each user appears twice, so step over the list two at a time
        for i in range(1, len(memberList), 2):
            memberInfo = memberList[i].find(name='a')
            name = memberInfo.text
            uid = memberInfo['href'].split('/')[-1]
            addFunc(WeiboUser(name=name, uid=uid))
    print(person.name + ': ' + relation + " info scraped successfully\n-------------------------------------------")
def userInformation(person, mobile=True):
    getUserHomepageInfo(person, mobile)
    getRelation(person, 'focus', mobile)
    getRelation(person, 'fans', mobile)

def test(person, func, mobile=True):
    url = func(person, mobile=mobile)
    return getHtml(url, header)

lijian = WeiboUser(uid='1744395855')
userInformation(lijian)
sys.setrecursionlimit(998)
Even so, access sometimes still gets restricted, and raising the pause to 2 seconds or more makes data collection very slow. Technically, the following should help: use proxies and switch IPs constantly; use multiple accounts and switch cookies constantly; and keep changing the User-Agent, which the fake_useragent package supports (a sketch follows).
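A minimal sketch of the last two ideas (the proxy address is a placeholder, fake_useragent comes from the third-party fake-useragent package, and requests/bs come from the script's imports):

from fake_useragent import UserAgent

ua = UserAgent()
proxies = {'http': 'http://127.0.0.1:8888',   # placeholder proxy address
           'https': 'http://127.0.0.1:8888'}

def getHtmlRotating(url, cookie):
    # a fresh random User-Agent for every request
    headers = {'User-Agent': ua.random, 'cookie': cookie}
    response = requests.get(url=url, headers=headers, proxies=proxies)
    return bs(response.text, 'lxml') if response.text else None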
7. Testing
Counting the gender ratio over the 199 fans obtained, 130 of them list their gender as female, which shows that among the 199 most recent fans, women make up the larger share.
Among the same 199 fans, counting those who filled in a birthday with a birth year between 1970 and 2005, the 2000-2004 bracket is the largest. Only 81 records are valid for this statistic, though.
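A sketch of how such statistics could be computed from the scraped fans. fansList, sex, and birthYear are the hypothetical attribute names from the class sketch in section 5 (the real class may differ), and the birth-year count presumes each fan's info page was also scraped, since getRelation alone records only name and uid:

from collections import Counter

def fanStats(person):
    # gender ratio over all scraped fans
    sexCount = Counter(fan.sex for fan in person.fansList if fan.sex)
    print('gender counts:', dict(sexCount))
    # birth-year histogram in 5-year brackets over 1970-2005
    years = [fan.birthYear for fan in person.fansList
             if fan.birthYear and 1970 <= fan.birthYear <= 2005]
    for idx, num in sorted(Counter((y - 1970) // 5 for y in years).items()):
        print(f'{1970 + idx * 5}-{1974 + idx * 5}: {num}')
    print('valid birthday records:', len(years))

fanStats(lijian)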