# Reference list of the settings that configure Scrapy's media pipelines.
# The values below are placeholders / examples taken from the listing.

# Enable the built-in images pipeline.
ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 120}

# Directory where the files pipeline stores downloads.
FILES_STORE = '/path/to/valid/dir'
# Directory where the images pipeline stores downloads.
IMAGES_STORE = '/path/to/valid/dir'

# Custom item field holding the file URLs to download.
FILES_URLS_FIELD = 'field_name_for_your_files_urls'
# Custom item field receiving the file download results.
FILES_RESULT_FIELD = 'field_name_for_your_processed_files'
# Custom item field holding the image URLs to download.
IMAGES_URLS_FIELD = 'field_name_for_your_images_urls'
# Custom item field receiving the image download results.
IMAGES_RESULT_FIELD = 'field_name_for_your_processed_images'

# File expiry in days (default 90).
FILES_EXPIRES = 90
# Image expiry in days (default 90).
IMAGES_EXPIRES = 90

# Thumbnail sizes to generate, keyed by thumbnail name.
IMAGES_THUMBS = {'small': (50, 50), 'big': (270, 270)}

# Filter out images below this height.
IMAGES_MIN_HEIGHT = 110
# Filter out images below this width.
IMAGES_MIN_WIDTH = 110

# Whether media requests follow redirects.
MEDIA_ALLOW_REDIRECTS = True
# 解析settings里的配置字段
def __init__(self, store_uri, download_func=None, settings=None)

# 圖片下載
def image_downloaded(self, response, request, info)

# 圖片獲取 圖片大小的過濾
# 縮略圖的生成
def get_images(self, response, request, info)

# 轉化圖片格式
def convert_image(self, image, size=None)

# 生成媒體請求 可重寫
def get_media_requests(self, item, info)
    return [Request(x) for x in item.get(self.images_urls_field, [])]  # 得到圖片url 變成請求 發給引擎

# 此方法獲取文件名 進行改寫
def item_completed(self, results, item, info)

# 文件路徑
def file_path(self, request, response=None, info=None)

# 縮略圖的存儲路徑
def thumb_path(self, request, thumb_id, response=None, info=None):
(當然不使用圖片管道的話也是可以爬取百度圖片的,但這還需要我們去分析網頁的代碼,還是有點麻煩,使用圖片管道就可以省去這個步驟了)
注意:由于需要添加所有的請求頭,所以我們要重寫start_requests函數
import re

import scrapy

from ..items import DbimgItem

# Captures the value of every "thumbURL":"..." pair found in the response text.
_THUMB_URL_RE = re.compile('"thumbURL":"(.*?)"')


class DbSpider(scrapy.Spider):
    """Spider that collects thumbnail URLs from a Baidu image search page.

    Each start URL is requested with a full set of browser-like headers, and
    every ``thumbURL`` value found in the response is passed on in a single
    ``DbimgItem`` under ``image_urls``.
    """

    name = 'db'
    # The query parameters must be separated by "&"; without the separators
    # the whole query string collapses into one "tn" parameter.
    start_urls = [
        'https://image.baidu.com/search/index?'
        'tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=index&fr=&hs=0'
        '&xthttps=111110&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0'
        '&width=&height=&face=0&istype=2&ie=utf-8'
        '&word=%E7%8B%97&oq=%E7%8B%97&rsp=-1'
    ]

    def start_requests(self):
        """Yield one request per start URL, carrying the full header set.

        ``start_requests`` is overridden because every request needs the
        complete set of request headers, which the default implementation
        does not add.
        """
        # NOTE(review): the Cookie value looks like it was captured from a
        # browser session and will presumably expire -- confirm and refresh.
        # NOTE(review): "br" in Accept-Encoding allows a Brotli-compressed
        # reply; confirm Brotli decoding is available in this environment.
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Cookie": "BIDUPSID=4B61D634D704A324E3C7E274BF11F280; PSTM=1624157516; BAIDUID=4B61D634D704A324C7EA5BA47BA5886E:FG=1; __yjs_duid=1_f7116f04cddf75093b9236654a2d70931624173362209; BAIDUID_BFESS=101022AEE931E08A9B9A3BA623709CFE:FG=1; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BDRCVFR[dG2JNJb_ajR]=mk3SLVN4HKm; cleanHistoryStatus=0; H_PS_PSSID=34099_33969_34222_31660_34226_33848_34113_34073_33607_34107_34134_34118_26350_22159; delPer=0; PSINO=6; BA_HECTOR=24ak842ka421210koq1gdtj070r; BDRCVFR[X_XKQks0S63]=mk3SLVN4HKm; userFrom=www.baidu.com; firstShowTip=1; indexPageSugList=%5B%22%E7%8B%97%22%2C%22%E7%8C%AB%E5%92%AA%22%2C%22%E5%B0%8F%E9%80%8F%E6%98%8E%22%5D; ab_sr=1.0.1_OGYwMTZiMjg5ZTNiYmUxODIxOTgyYTllZGMyMzhjODE2ZWE5OGY4YmEyZWVjOGZhOWIxM2NlM2FhZTQxMmFjODY0OWZiNzQxMjVlMWIyODVlZWFiZjY2NTQyMTZhY2NjNTM5NDNmYTFmZjgxMTlkOGYxYTUzYTIzMzA0NDE3MGNmZDhkYTBkZmJiMmJhZmFkZDNmZTM1ZmI2MWZkNzYyYQ==",
            "Host": "image.baidu.com",
            "Referer": "https://image.baidu.com/",
            "sec-ch-ua": '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
            "sec-ch-ua-mobile": "?0",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "same-origin",
            "Sec-Fetch-User": "?1",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36",
        }
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                headers=headers,
                callback=self.parse,
                dont_filter=True,
            )

    def parse(self, response):
        """Yield one ``DbimgItem`` holding every thumbnail URL in the page."""
        item = DbimgItem()
        item['image_urls'] = _THUMB_URL_RE.findall(response.text)
        yield item
import scrapy


class DbimgItem(scrapy.Item):
    """Item passed from the spider to the images pipeline."""

    # URLs of the images to download.
    image_urls = scrapy.Field()
ROBOTSTXT_OBEY = False

# Enable the custom pipeline written for this project.
ITEM_PIPELINES = {
    # 'dbimg.pipelines.DbimgPipeline': 300,
    'dbimg.pipelines.ImgPipe': 300,
}

# Directory where downloaded images are stored.
IMAGES_STORE = 'D:/python test/爬蟲/scrapy6/dbimg/imgs'
import os

from itemadapter import ItemAdapter
from scrapy.pipelines.images import ImagesPipeline

# NOTE(review): this top-level import only resolves when the directory that
# contains settings.py is on sys.path -- confirm how the project is launched.
import settings

# For reference, the implementation being overridden is:
#
#     def item_completed(self, results, item, info):
#         with suppress(KeyError):
#             ItemAdapter(item)[self.images_result_field] = [x for ok, x in results if ok]
#         return item


class ImgPipe(ImagesPipeline):
    """Images pipeline that renames downloaded files to sequential numbers."""

    # Number used for the next file name; it keeps increasing across every
    # item this pipeline instance processes.
    num = 0

    def item_completed(self, results, item, info):
        """Rename each successfully downloaded image to ``<num>.jpg``.

        Without this override the stored images keep the string of digits
        and letters the pipeline generates as a file name.

        Args:
            results: Iterable of ``(ok, data)`` pairs; when ``ok`` is true,
                ``data['path']`` is the stored file's path relative to
                ``IMAGES_STORE``.
            item: The item whose images were downloaded.
            info: Passed in by the pipeline; not used here.

        Returns:
            The unchanged ``item``, so that it keeps flowing to any later
            pipeline instead of being replaced by ``None``.
        """
        for ok, result in results:
            if not ok:
                continue
            source = os.path.join(settings.IMAGES_STORE, result['path'])
            target = os.path.join(settings.IMAGES_STORE, f'{self.num}.jpg')
            # os.replace overwrites an existing target; os.rename raises on
            # Windows when "<num>.jpg" is left over from an earlier run.
            os.replace(source, target)
            self.num += 1
        return item
結果:
以上就是python爬蟲Scrapy框架:媒體管道原理學習分析的詳細內容,更多關于python爬蟲Scrapy框架的資料請關注腳本之家其它相關文章!
標簽:銀川 葫蘆島 安慶 三亞 呼倫貝爾 湘西 呼倫貝爾 烏魯木齊
巨人網絡通訊聲明:本文標題《python爬蟲Scrapy框架:媒體管道原理學習分析》,本文關鍵詞 python,爬蟲,Scrapy,框架,媒體,;如發現本文內容存在版權問題,煩請提供相關信息告之我們,我們將及時溝通與處理。本站內容系統采集于網絡,涉及言論、版權與本站無關。