css
在 shell 底下调试比较方便:
scrapy shell http://blog.jobbole.com/112127/
先把所有的代码贴出来:
# -*- coding: utf-8 -*-
import scrapy
import re
import datetime
from scrapy.http import Request
from urllib import parse
from scrapy.loader import ItemLoader
from ArticleSpider.items import JobBoleArticleItem,ArticleItemLoader
class JobboleSpider(scrapy.Spider):
    """Spider that walks blog.jobbole.com listing pages and scrapes articles.

    ``parse`` handles a listing page: it schedules one request per article
    link and then follows the "next page" link.  ``parse_detail`` handles a
    single article page and yields a populated ``JobBoleArticleItem``.
    """

    name = 'jobbole'
    allowed_domains = ['blog.jobbole.com']
    start_urls = ['http://blog.jobbole.com/category/php-programmer/']

    def parse(self, response):
        """Schedule every article on a listing page, then the next page.

        Args:
            response: the downloaded listing page.

        Yields:
            ``Request`` objects: one per article (handled by
            ``parse_detail``, carrying the thumbnail URL in
            ``meta["front_img_url"]``) and, when present, one for the next
            listing page (handled by ``parse`` again).
        """
        # Keep working with selector objects here: after extract() the nodes
        # become plain strings and can no longer be queried a second time.
        post_nodes = response.css("#archive .floated-thumb .post-thumb a")
        for post_node in post_nodes:
            post_url = post_node.css("::attr(href)").extract_first("")
            if not post_url:
                # An empty href would resolve to the listing page itself and
                # send it through parse_detail; skip such anchors.
                continue
            img_url = post_node.css("img::attr(src)").extract_first("")
            yield Request(
                url=parse.urljoin(response.url, post_url),
                meta={"front_img_url": img_url},
                callback=self.parse_detail,
            )

        next_url = response.css(".next.page-numbers::attr(href)").extract_first()
        if next_url:
            yield Request(
                url=parse.urljoin(response.url, next_url),
                callback=self.parse,
            )

    def parse_detail(self, response):
        """Build the article item for one article page via the item loader.

        The raw selector results are handed to ``ArticleItemLoader``; value
        clean-up is left to the processors declared on the item fields.  The
        earlier hand-written extraction was removed because its result was
        overwritten by the loader item before being yielded, and its
        ``extract()[0]`` calls raised ``IndexError`` on pages missing an
        element.

        Args:
            response: the downloaded article page; ``response.meta`` may
                carry ``front_img_url`` set by ``parse``.

        Yields:
            The item returned by ``item_loader.load_item()``.
        """
        front_img_url = response.meta.get("front_img_url", "")

        # Load the item through the item loader.
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_value("url", response.url)
        item_loader.add_css("create_time", ".entry-meta-hide-on-mobile::text")
        item_loader.add_value("front_img_url", [front_img_url])
        item_loader.add_css("praise_nums", ".vote-post-up h10::text")
        item_loader.add_css("fav_nums", ".bookmark-btn::text")
        item_loader.add_css("content", "div.entry")

        article_item = item_loader.load_item()
        yield article_item
item_loader
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
import datetime
import re
from scrapy.loader.processors import MapCompose,TakeFirst
from scrapy.loader import ItemLoader
class ArticlespiderItem(scrapy.Item):
    """Empty item class from the project template; it declares no fields."""
def date_convert(value):
    """Convert scraped date text into a ``datetime.date``.

    The original called ``datetime.strftime`` (format a datetime) where
    ``strptime`` (parse a string) was intended, so every call raised and
    silently fell back to today's date.

    Args:
        value: date text such as ``"20170612"``; surrounding whitespace and
            a middle-dot separator are ignored.  ``datetime``/``date``
            objects are accepted as well.

    Returns:
        The parsed ``datetime.date``; today's date when ``value`` cannot be
        parsed (the original best-effort fallback is kept).
    """
    if isinstance(value, datetime.datetime):
        return value.date()
    if isinstance(value, datetime.date):
        return value

    # "\u00b7" is the middle dot that trails the date in the page's meta text.
    cleaned = str(value).replace("\u00b7", "").strip()

    # "%Y%m%d" is the original format; the slash form is accepted too
    # (assumed to be the layout used on the site -- confirm against a page).
    for fmt in ("%Y%m%d", "%Y/%m/%d"):
        try:
            return datetime.datetime.strptime(cleaned, fmt).date()
        except ValueError:
            continue
    return datetime.datetime.now().date()
def get_num_value(value):
    """Return the first integer found in ``value``, or ``0`` if there is none.

    Fixes over the original pattern ``".*(\\d+).*"``:
    * the greedy ``.*`` swallowed all but the last digit ("12" gave "2");
    * ``re.match`` with ``.`` failed on text starting with a newline;
    * the result was a ``str`` on a match but the ``int`` 0 otherwise.

    Args:
        value: scraped text such as ``" 12 bookmarks"``.

    Returns:
        The first run of digits as an ``int``; ``0`` when no digit is present.
    """
    match_nums = re.search(r"\d+", str(value))
    return int(match_nums.group()) if match_nums else 0
class ArticleItemLoader(ItemLoader):
    """Item loader that applies ``TakeFirst`` as the output processor for
    every field that does not declare its own."""

    default_output_processor = TakeFirst()
class JobBoleArticleItem(scrapy.Item):
    """Item holding the fields scraped for one article.

    Fields with an ``input_processor`` have every loaded value passed
    through the named helper; the remaining fields are stored as loaded.
    """

    title = scrapy.Field()
    # front_img_path = scrapy.Field()
    url = scrapy.Field()

    # Counters: the helper pulls the number out of the scraped text.
    praise_nums = scrapy.Field(input_processor=MapCompose(get_num_value))
    fav_nums = scrapy.Field(input_processor=MapCompose(get_num_value))

    content = scrapy.Field()

    # Scraped date text is converted by date_convert.
    create_time = scrapy.Field(input_processor=MapCompose(date_convert))

    front_img_url = scrapy.Field()
Item Loaders 提供了一种简便的构件(mechanism)来抓取 Items。虽然 Items 可以从它自己的类似字典(dictionary-like)的 API 得到所需信息,不过 Item Loaders 提供了许多更加方便的 API,这些 API 通过自动完成那些具有共通性的任务,可从抓取进程中得到这些信息,比如预先解析提取到的原生数据。换句话来解释,Items 提供了盛装抓取到的数据的容器,而 Item Loaders 提供了构件来装载(populating)该容器。
item_loader 的例子:
# Load the item through the item loader.
# Excerpt from parse_detail: `response` and `front_img_url` come from that
# method and are not defined in this snippet.
item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
# add_css(field, selector) feeds the selector's matches into the named field;
# add_value(field, value) feeds a value that is already in hand.
item_loader.add_css("title", ".entry-header h1::text")
item_loader.add_value("url", response.url)
item_loader.add_css("create_time", ".entry-meta-hide-on-mobile::text")
item_loader.add_value("front_img_url", [front_img_url])
item_loader.add_css("praise_nums", ".vote-post-up h10::text")
item_loader.add_css("fav_nums", ".bookmark-btn::text")
item_loader.add_css("content", "div.entry")
# load_item() returns the populated item.
article_item = item_loader.load_item()
发表回复