Happy Every Day

A road you chose for yourself must be walked to the end, even on your knees...

Articles tagged "crawler"

Using the QueryList scraping plugin with the ThinkPHP framework to scrape the mzitu homepage data and each gallery's page count

Scraping tool: https://querylist.cc/

    public function homepage(){
        $url='https://www.mzitu.com';
        // CSS selector rules: detail-page link, lazy-loaded cover image and title for each item
        $rules=[
            'detail'=>['#pins>li>a','href'],
            'face'=>['#pins>li>a>img','data-original'],
            'alt'=>['#pins>li>a>img','alt'],
        ];
        $data = QueryList::get($url)->rules($rules)->query()->getData();

        // For every item on the homepage, also fetch how many pages its detail gallery has
        $rt=$data->map(function($item){
            $item['count'] = $this->detail($item['detail']);
            return $item;
        })->all();

        echo '<pre>';
        var_dump($rt);
        echo '</pre>';
    }

    /* Get the number of pages in a detail gallery */
    public function detail($url='https://www.mzitu.com/178724'){
        $rules=[
            'pagenavi'=>['.pagenavi','text']
        ];
        $rt = QueryList::get($url,[
            'headers'=>[
                'Referer'=>$url,
            ]
        ])
        ->rules($rules)->query()->getData()->all();
        // The last page number sits between '…' and the '下一页' (next page) label in the pagination text
        return $this->cut('…','下一页',$rt[0]['pagenavi']);
    }

    // Extract the substring between two given markers
    public function cut($begin,$end,$str){
        $b = mb_strpos($str,$begin) + mb_strlen($begin); // position right after the $begin marker
        $e = mb_strpos($str,$end) - $b;                  // length up to the $end marker
        return mb_substr($str,$b,$e);
    }

Sample output:

array(24) {
  [0]=>
  array(4) {
    ["detail"]=>
    string(28) "https://www.mzitu.com/178724"
    ["face"]=>
    string(57) "https://i.meizitu.net/thumbs/2019/05/178724_05a31_236.jpg"
    ["alt"]=>
    string(79) "美貌与身材兼具 黑丝美女周于希性感翘臀浑圆饱满紧致诱人"
    ["count"]=>
    string(2) "42"
  }
  [1]=>
  array(4) {
    ["detail"]=>
    string(28) "https://www.mzitu.com/178809"
    ["face"]=>
    string(57) "https://i.meizitu.net/thumbs/2019/05/178809_05b03_236.jpg"
    ["alt"]=>
    string(67) "肥美尤物王婉悠血滴子内衣写真 暗黑魅惑嗜血销魂"
    ["count"]=>
    string(2) "41"
  }
  [2]=>
  array(4) {
    ["detail"]=>
    string(28) "https://www.mzitu.com/178630"
    ["face"]=>
    string(57) "https://i.meizitu.net/thumbs/2019/05/178630_04b37_236.jpg"
    ["alt"]=>
    string(64) "头条女神笑笑软萌可爱又豪放 撩衣挤胸作风大胆"
    ["count"]=>
    string(2) "40"
  }
  [3]=>
  array(4) {
    ["detail"]=>
    string(28) "https://www.mzitu.com/183163"
    ["face"]=>
    string(57) "https://i.meizitu.net/thumbs/2019/05/183163_05b27_236.jpg"
    ["alt"]=>
    string(76) "人间尤物奶瓶土肥圆无圣光人体写真 高跟美腿魅力无法挡"
    ["count"]=>
    string(2) "46"
  }
  [4]=>
  array(4) {
    ["detail"]=>
    string(28) "https://www.mzitu.com/182255"
    ["face"]=>
    string(57) "https://i.meizitu.net/thumbs/2019/05/182255_29b20_236.jpg"
    ["alt"]=>
    string(66) "御姐PK小可爱 性感模特韩恩熙姐妹花诱惑再次上演"
    ["count"]=>
    string(2) "47"
  }
  [5]=>
  array(4) {
    ["detail"]=>
    string(28) "https://www.mzitu.com/182852"
    ["face"]=>
    string(57) "https://i.meizitu.net/thumbs/2019/05/182852_03b24_236.jpg"
    ["alt"]=>
    string(73) "满足你对老师的幻想 女神许诺Sabrina教师OL制服知性迷人"
    ["count"]=>
    string(2) "44"
  }
  [6]=>
  array(4) {
    ["detail"]=>
    string(28) "https://www.mzitu.com/182994"
    ["face"]=>
    string(57) "https://i.meizitu.net/thumbs/2019/05/182994_04b48_236.jpg"
    ["alt"]=>
    string(75) "玲珑美人Annie安妮性感写真 美乳翘臀下修长玉腿异常诱人"
    ["count"]=>
    string(2) "48"
  }
  [7]=>
  array(4) {
    ["detail"]=>
    string(28) "https://www.mzitu.com/183043"
    ["face"]=>
    string(57) "https://i.meizitu.net/thumbs/2019/05/183043_04c32_236.jpg"
    ["alt"]=>
    string(63) "美人醉语园中烟 女神芝芝Booty上演华丽丽的诱惑"
    ["count"]=>
    string(2) "65"
  }
  [8]=>
  array(4) {
    ["detail"]=>
    string(28) "https://www.mzitu.com/178473"
    ["face"]=>
    string(57) "https://i.meizitu.net/thumbs/2019/05/178473_03a39_236.jpg"
    ["alt"]=>
    string(64) "乳沟好深 极品美女李可可性感美乳无法一手掌握"
    ["count"]=>
    string(2) "39"
  }
  [9]=>
  array(4) {
    ["detail"]=>
    string(28) "https://www.mzitu.com/178166"
    ["face"]=>
    string(57) "https://i.meizitu.net/thumbs/2019/05/178166_30c11_236.jpg"
    ["alt"]=>
    string(67) "把自己当礼物 风情尤物Egg尤妮丝巨乳肥臀等你调教"
    ["count"]=>
    string(2) "41"
  }
  [10]=>
  array(4) {
    ["detail"]=>
    string(28) "https://www.mzitu.com/178380"
    ["face"]=>
    string(57) "https://i.meizitu.net/thumbs/2019/05/178380_02b35_236.jpg"
    ["alt"]=>
    string(64) "丰满身材呼之欲出 蜜桃社美绪超级豪乳震惊网友"
    ["count"]=>
    string(2) "43"
  }
  [11]=>
  array(4) {
    ["detail"]=>
    string(28) "https://www.mzitu.com/178208"
    ["face"]=>
    string(57) "https://i.meizitu.net/thumbs/2019/05/178208_01a04_236.jpg"
    ["alt"]=>
    string(64) "黑丝美人酥小白性感床照 惹火姿势尽显女王气质"
    ["count"]=>
    string(2) "37"
  }
  [12]=>
  array(4) {
    ["detail"]=>
    string(28) "https://www.mzitu.com/173453"
    ["face"]=>
    string(57) "https://i.meizitu.net/thumbs/2019/05/173453_26a21_236.jpg"
    ["alt"]=>
    string(66) "妩媚女神妲己Toxic薄纱湿身诱惑 喷血豪乳一目了然"
    ["count"]=>
    string(2) "41"
  }
  [13]=>
  array(4) {
    ["detail"]=>
    string(28) "https://www.mzitu.com/175200"
    ["face"]=>
    string(57) "https://i.meizitu.net/thumbs/2019/05/175200_12a02_236.jpg"
    ["alt"]=>
    string(68) "极品嫩妹徐cake性感写真秀美乳 一嘴酸奶画面好内涵"
    ["count"]=>
    string(2) "30"
  }
  [14]=>
  array(4) {
    ["detail"]=>
    string(28) "https://www.mzitu.com/174906"
    ["face"]=>
    string(57) "https://i.meizitu.net/thumbs/2019/05/174906_09a18_236.jpg"
    ["alt"]=>
    string(67) "绝色美女张雨萌裸体泡温泉 饱满酥胸令人热血上涌"
    ["count"]=>
    string(2) "40"
  }
  [15]=>
  array(4) {
    ["detail"]=>
    string(28) "https://www.mzitu.com/182201"
    ["face"]=>
    string(57) "https://i.meizitu.net/thumbs/2019/04/182201_29a14_236.jpg"
    ["alt"]=>
    string(69) "第一视角带剧情,性感美女王雨纯女仆幻想不容错过"
    ["count"]=>
    string(2) "53"
  }
  [16]=>
  array(4) {
    ["detail"]=>
    string(28) "https://www.mzitu.com/178424"
    ["face"]=>
    string(57) "https://i.meizitu.net/thumbs/2019/04/178424_02c14_236.jpg"
    ["alt"]=>
    string(61) "性感模特月音瞳透视睡衣照 胸口敞开雪白诱人"
    ["count"]=>
    string(2) "48"
  }
  [17]=>
  array(4) {
    ["detail"]=>
    string(28) "https://www.mzitu.com/178335"
    ["face"]=>
    string(57) "https://i.meizitu.net/thumbs/2019/04/178335_02a30_236.jpg"
    ["alt"]=>
    string(70) "爱蜜社兔女郎酥小白性感黑丝诱惑 火辣身材引人遐想"
    ["count"]=>
    string(2) "44"
  }
  [18]=>
  array(4) {
    ["detail"]=>
    string(28) "https://www.mzitu.com/181701"
    ["face"]=>
    string(57) "https://i.meizitu.net/thumbs/2019/04/181701_25a02_236.jpg"
    ["alt"]=>
    string(70) "尤物美女萌汉药baby迪拜旅拍,媚骨柔情如魔鬼般惹火"
    ["count"]=>
    string(2) "63"
  }
  [19]=>
  array(4) {
    ["detail"]=>
    string(28) "https://www.mzitu.com/178121"
    ["face"]=>
    string(57) "https://i.meizitu.net/thumbs/2019/04/178121_30b04_236.jpg"
    ["alt"]=>
    string(69) "甜美脸蛋魔鬼身材 美少妇芝芝Booty美妙胴体双手难挡"
    ["count"]=>
    string(2) "44"
  }
  [20]=>
  array(4) {
    ["detail"]=>
    string(28) "https://www.mzitu.com/176101"
    ["face"]=>
    string(57) "https://i.meizitu.net/thumbs/2019/04/176101_18b25_236.jpg"
    ["alt"]=>
    string(71) "清纯可人不失性感 高冷妹子小姿2002五官精致气场强大"
    ["count"]=>
    string(2) "42"
  }
  [21]=>
  array(4) {
    ["detail"]=>
    string(28) "https://www.mzitu.com/177562"
    ["face"]=>
    string(57) "https://i.meizitu.net/thumbs/2019/04/177562_27a12_236.jpg"
    ["alt"]=>
    string(59) "宅男杀手小尤奈COS不知火舞 G奶晃动呼之欲出"
    ["count"]=>
    string(2) "40"
  }
  [22]=>
  array(4) {
    ["detail"]=>
    string(28) "https://www.mzitu.com/178246"
    ["face"]=>
    string(57) "https://i.meizitu.net/thumbs/2019/04/178246_01b20_236.jpg"
    ["alt"]=>
    string(73) "藏不住的波涛胸涌 大胸嫩模Evelyn艾莉雪乳风情有沟必火"
    ["count"]=>
    string(2) "40"
  }
  [23]=>
  array(4) {
    ["detail"]=>
    string(28) "https://www.mzitu.com/181907"
    ["face"]=>
    string(57) "https://i.meizitu.net/thumbs/2019/04/181907_25d04_236.jpg"
    ["alt"]=>
    string(64) "可清纯可妖艳 火辣御姐穆菲菲曼妙身姿完美绝伦"
    ["count"]=>
    string(2) "40"
  }
}

Scraping blog article titles and links (continuously updated)

Now open the page you want to scrape. Since I'm scraping my own blog, I simply press F12 and use the browser's inspector to find where the information I need lives in the page.

[Screenshot: 20180405194407564.png]

Looking at the page source, the titles we need live in the <a> tag under <span class="link_title">, so we can use that to locate them quickly. (Note: this approach only works when the selector uniquely identifies what you want; otherwise you will also grab results you don't need.)
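Here is a minimal sketch of that selection step with BeautifulSoup, assuming the old list layout where every title sits inside <span class="link_title"> (the HTML snippet below is only an illustration, not real CSDN markup):

from bs4 import BeautifulSoup

html = '<span class="link_title"><a href="/stormdony/article/details/1">My first post</a></span>'
soup = BeautifulSoup(html, 'lxml')
# Select every <a> directly under a span with class link_title
for a in soup.select('span.link_title > a'):
    print(a.text.strip(), a.get('href'))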

Getting the browser's header information

Copy the User-Agent value from step 4 as shown in the screenshot. Why do we need a User-Agent at all? Because CSDN has anti-crawler measures to stop people from harvesting its content in bulk, so the request has to look like it comes from a real browser.

[Screenshot: 20180405200903148.png]
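A minimal sketch of attaching that User-Agent to a request (the UA string below is only a placeholder; paste the one copied from your own browser):

import urllib.request

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Gecko/20100101 Firefox/58.0'}
req = urllib.request.Request('https://blog.csdn.net/stormdony/article/list/1', headers=headers)
# The server now sees a browser-like request instead of the default Python user agent
html = urllib.request.urlopen(req).read().decode('utf-8', 'ignore')
print(len(html))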

CSDN recently redesigned its pages, so the earlier code no longer works.

The selector path has to be changed to the one circled in the screenshot below.

[Screenshot: 20181220225234145.png]

You also need to strip the 原创 (original) or 转载 (reposted) badge from each title.
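A small sketch of what that looks like against the redesigned layout; the HTML fragment here is only an approximation of the new list markup:

from bs4 import BeautifulSoup

html = '''
<div class="article-list">
  <div class="article-item-box"><h4>
    <a href="https://blog.csdn.net/stormdony/article/details/1">原创 My first post</a>
  </h4></div>
</div>
'''
soup = BeautifulSoup(html, 'lxml')
for a in soup.select('.article-list > .article-item-box > h4 > a'):
    # Strip the 原创/转载 badge so only the title is left
    title = a.text.replace('原创', '').replace('转载', '').strip()
    print(title, a.get('href'))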

By editing the page number in the URL by hand and pushing it past the total number of posts, you can see that an out-of-range page no longer shows the pagination bar at the bottom. So we can wrap the scraping in a for loop that runs plenty of iterations and exit the whole program as soon as the pagination bar disappears, instead of looping forever.

[Screenshot: 20181220232328570.png]

[Screenshot: 201812202323514.png]


Since this is an older post, I hadn't learned much at the time. These days I'd advise against urllib.request and strongly recommend the requests library instead.
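For reference, a minimal requests-based sketch of the same fetch-and-stop-at-the-last-page idea (using the selectors assumed above; a sketch, not a drop-in replacement for the demo below):

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Gecko/20100101 Firefox/58.0'}

for page in range(1, 1000000):
    url = 'https://blog.csdn.net/stormdony/article/list/{}'.format(page)
    soup = BeautifulSoup(requests.get(url, headers=headers).content, 'lxml')
    if not soup.select('.pagination-box'):
        # No pagination bar at the bottom: we are past the last page, stop looping
        break
    for a in soup.select('.article-list > .article-item-box > h4 > a'):
        print(a.text.strip(), a.get('href'))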

Demo 1: just run the code below.


import sys
import urllib.request
from bs4 import BeautifulSoup
# If BeautifulSoup is not installed properly, the import above will raise an error

# Fetch one list page and return the <a> tags of its article titles
def getUrl(url):
    # A headers tuple holding the User-Agent copied earlier, so the request looks like a browser
    headers = ('User-Agent',
               "Mozilla/5.0 (Windows NT 10.0; Win32; x32; rv:48.0) Gecko/20100101 Firefox/58.0")
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    # Install the opener globally
    urllib.request.install_opener(opener)
    html = urllib.request.urlopen(url).read().decode('utf-8', 'ignore')
    # print(html)
    bs = BeautifulSoup(html, 'lxml')
    # Check whether the page still shows a pagination bar at the bottom
    pages = bs.select(".pagination-box")
    if pages:
        # Pagination bar present, so this is a valid page: select all the title <a> tags
        links = bs.select('.article-list > .article-item-box > h4 > a')
        return links
    else:
        # Past the last page: end the whole program
        sys.exit()

if __name__ == '__main__':
    # Loop over the list pages; 1000000 is simply "enough" iterations, we exit once the pages run out
    for i in range(1, 1000000):
        url = 'https://blog.csdn.net/stormdony/article/list/{}'.format(i)
        # Get the article <a> tags on this page
        linklist = getUrl(url)
        # texts stores the article titles
        texts = []
        # links stores the article URLs
        links = []
        # Walk the tags, collecting titles and links
        for link in linklist:
            texts.append(link.text.strip())
            links.append(link.get('href'))
        # Print title/link pairs to the console
        for text, link in zip(texts, links):
            text = text.strip().replace("原        \n        ", "")
            text = text.strip().replace("转        \n        ", "")
            data = {'title': text, 'link': link}
            print(data)

Demo 2:
=========== Updated yet again on 2018/12/21 ===========

New in this update: original and reposted articles can now be scraped as two separate categories.

[Screenshot: 20181221181325716.png]

import requests
from bs4 import BeautifulSoup

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
}

def getOriginalArticle(username):
    '''Scrape the original articles'''
    for i in range(1,1000000):
        url = 'https://blog.csdn.net/{}/article/list/{}?t=1'.format(username,i)
        web_data = requests.get(url, headers=header)
        soup = BeautifulSoup(web_data.content, 'lxml')
        pages = soup.select(".pagination-box")
        # Skip the first match: the list page carries a hidden placeholder entry
        links = soup.select('.article-list > .article-item-box > h4 > a')[1:]
        readnum = soup.select('.read-num')
        if pages:
            # The pagination bar is still present, so this is a valid page: print its articles
            printOriginalData(links,readnum,'原        \n        ')
        else:
            # Past the last page: stop looping
            break

def getTranshipmentArticle(username):
    '''Scrape the reposted articles'''
    for i in range(1,1000000):
        url = 'https://blog.csdn.net/{}/article/list/{}?t=2'.format(username,i)
        web_data = requests.get(url,headers=header)
        soup = BeautifulSoup(web_data.content, 'lxml')
        pages = soup.select(".pagination-box")
        # Skip the first match: the list page carries a hidden placeholder entry
        links = soup.select('.article-list > .article-item-box > h4 > a')[1:]
        readnum = soup.select('.read-num')
        if pages:
            printTranshipmentData(links,readnum,'转        \n        ')
        else:
            # This can be a problem: my reposted articles do not even fill one page, so there is no
            # pagination bar and printTranshipmentData() has to be called one more time right here
            printTranshipmentData(links, readnum, '转        \n        ')
            # Stop looping
            break


def printTranshipmentData(links,readnum,stripText):
    for link in links:
        url = link.get('href')
        title = link.text.strip().replace(stripText, "")
        read = readnum[0].text.strip("阅读数:")
        comment = readnum[1].text.strip("评论数:")
        data = {
            'url': url,
            'title': title,
            'readnum': read,
            'comment': comment
        }
        print(data)


def printOriginalData(links,readnum,stripText):
    for link in links:
        url = link.get('href')
        title = link.text.strip().replace(stripText, "")
        read = readnum[0].text.strip("阅读数:")
        comment = readnum[1].text.strip("评论数:")
        data = {
            'url': url,
            'title': title,
            'readnum': read,
            'comment': comment
        }
        print(data)

if __name__ == '__main__':
    username = "stormdony"
    getTranshipmentArticle(username)
    print('------------------ original articles below --------------')
    getOriginalArticle(username)

The program above still has one unsolved problem: pagination.
My reposted articles only just fill a single page, not enough to trigger pagination, so an extra printTranshipmentData() call has to be added in the else branch.
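One way around that, sketched here as an alternative rather than as the original fix: print the current page first and only then check for the pagination bar, so a category that fits on a single page goes through the same code path (getArticles below is a made-up helper name):

import requests
from bs4 import BeautifulSoup

header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/70.0.3538.110 Safari/537.36'}

def getArticles(username, t, stripText):
    '''t=1 fetches original articles, t=2 reposted ones (same query parameter as above)'''
    for i in range(1, 1000000):
        url = 'https://blog.csdn.net/{}/article/list/{}?t={}'.format(username, i, t)
        soup = BeautifulSoup(requests.get(url, headers=header).content, 'lxml')
        links = soup.select('.article-list > .article-item-box > h4 > a')[1:]
        # Print whatever this page holds before deciding whether to continue
        for link in links:
            print({'url': link.get('href'), 'title': link.text.strip().replace(stripText, '')})
        if not soup.select('.pagination-box'):
            # Either only one page exists or we are past the last page: stop here
            break

getArticles('stormdony', 2, '转        \n        ')
getArticles('stormdony', 1, '原        \n        ')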

Reposted from: https://blog.csdn.net/stormdony/article/details/79828842


Simple examples of scraping images from web pages with a Python 3 crawler

Demo 1 (images are saved to D:\test):

import urllib.request
import re
import os

# Fetch the page at the given URL; the returned html is the page's source code
def getHtml(url):
    page = urllib.request.urlopen(url)
    html = page.read()
    return html.decode('UTF-8')

def getImg(html):
    reg = r'src="(.+?\.jpg)" pic_ext'
    imgre = re.compile(reg)
    imglist = imgre.findall(html)  # collect every image address in the page into imglist
    x = 0
    path = 'D:\\test'
    # Save the images into D:\test; create the folder if it does not exist yet
    if not os.path.isdir(path):
        os.makedirs(path)
    paths = path + '\\'      # save under the test path

    for imgurl in imglist:
        # Download each address in imglist to disk; format() builds the numbered file name
        urllib.request.urlretrieve(imgurl, '{}{}.jpg'.format(paths, x))
        x = x + 1
    return imglist

html = getHtml("http://tieba.baidu.com/p/2460150866")  # fetch the page source for this URL
print(getImg(html))  # parse the source and download the images it references

Demo 2 (you need to create a pic directory next to the script beforehand; see the note after the code):

import re
import urllib.request

# Fetch the page html
def getHtml(url):
    page = urllib.request.urlopen(url)
    html = page.read()
    return html


html = getHtml("http://tieba.baidu.com/p/3205263090")
html = html.decode('UTF-8')

# Extract the image links from the page source
def getImg(html):
    # Use a regular expression to match the image addresses in the page
    reg = r'src="([.*\S]*\.jpg)" pic_ext="jpeg"'
    imgre = re.compile(reg)
    imglist = re.findall(imgre, html)
    return imglist

imgList = getImg(html)
imgCount = 0
# Download every matched image into the local pic folder; create the pic folder before running
for imgPath in imgList:
    f = open("pic/" + str(imgCount) + ".jpg", 'wb')
    f.write((urllib.request.urlopen(imgPath)).read())
    f.close()
    imgCount += 1
print("All images downloaded")

Demo 3 (download an image and save it to a specified location, D:/test1/abc.jpg):

import requests
import os

url = "https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1533128040259&di=601acd33bcb188bfeb41cb50bc51ed41&imgtype=0&src=http%3A%2F%2Fs1.sinaimg.cn%2Fmw690%2F006LDoUHzy7auXElZGE40%26690"
path = "D://test1/abc.jpg"
try:
    # Create the target folder first, so open() below cannot fail on a missing D:/test1
    if not os.path.isdir(os.path.dirname(path)):
        os.makedirs(os.path.dirname(path))
    r = requests.get(url)
    with open(path, 'wb') as f:
        f.write(r.content)
    print("File saved")
except:
    print("Download failed")