
A crawler for Douban content: movies, books, groups, albums, things, and more.
Code download: send the private message "豆瓣爬虫" and the download link will be replied automatically. The link cannot be posted inside the article, so this is the only way to get it.
#### Required services
MongoDB
#### Python packages
pip install scrapy
pip install pybloom
pip install pymongo
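
The spiders below import a shared `doubanDB` handle from a `misc/store.py` module that is not shown in the post. A minimal sketch of what it might look like, assuming a MongoDB instance on localhost and a database named `douban` (both are assumptions):

```python
# misc/store.py -- hypothetical sketch; host, port and database name are assumptions
from pymongo import MongoClient

# Connect to a local MongoDB instance (adjust host/port as needed).
_client = MongoClient("localhost", 27017)

# doubanDB exposes collections such as doubanDB.album and doubanDB.movie,
# which the spiders write crawled documents into.
doubanDB = _client["douban"]
```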
### Run the movie spider
Enter the douban/movie directory and run `scrapy crawl movie`.
### Run the album spider
Enter the douban/album directory and run `scrapy crawl album`.
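
If you prefer launching a spider from a script instead of the scrapy CLI, Scrapy's CrawlerProcess does the same job. A small sketch; the AlbumSpider import path is an assumption about the project layout:

```python
# run_album.py -- hypothetical launcher, equivalent to `scrapy crawl album`
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from douban.album.spiders.album import AlbumSpider  # assumed module path

if __name__ == "__main__":
    # Load the project's settings.py so pipelines and middlewares still apply.
    process = CrawlerProcess(get_project_settings())
    process.crawl(AlbumSpider)
    process.start()  # blocks until the crawl finishes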
The main code is shown below. First, the item definitions:
```python
#encoding: utf-8
from scrapy import Field, Item


class MovieItem(Item):
    subject_id = Field()
    name = Field()
    year = Field()
    directors = Field()
    actors = Field()
    languages = Field()
    genres = Field()       # genres
    runtime = Field()
    stars = Field()        # number of 5/4/3/2/1-star ratings, ordered 5 4 3 2 1
    channel = Field()
    average = Field()      # average rating
    vote = Field()         # total number of ratings
    tags = Field()
    watched = Field()      # "watched" count
    wish = Field()         # "want to watch" count
    comment = Field()      # number of short comments
    question = Field()     # number of questions
    review = Field()       # number of reviews
    discussion = Field()   # number of discussions
    image = Field()        # number of images
    countries = Field()    # production countries
    summary = Field()


# Douban album document format: an example of what gets stored in MongoDB.
# (Kept as a reference dict; the actual Scrapy item is the AlbumItem class below.)
ALBUM_DOC_EXAMPLE = dict(
    from_url = "",
    album_name = "青少年听雨歌楼,青壮年画雨客舟",
    author = dict(
        home_page = "",
        nickname = "等温线",
        avatar = "",
    ),
    photos = [
        dict(
            large_img_url = "",
            like_count = 2,
            recommend_count = 22,
            desc = "蒸排骨的李子果达粉!",
            comments = [
                dict(
                    avatar = "",
                    nickname = "muse",
                    post_datetime = "2014-07-29 08:37:14",
                    content = "都看淌口水",
                    home_page = "",
                ),
            ],
        ),
    ],
    tags = ["漂亮美女", "标识", "时尚潮流"],
    recommend_total = 67,
    like_total = 506,
    create_date = "2014-07-21",
    photo_count = 201,
    follow_count = 37,
    desc = "蛇蛇蛇 马马马",
)


class AlbumItem(Item):
    album_name = Field()
    author = Field()
    photos = Field()
    recommend_total = Field()
    like_total = Field()
    create_date = Field()
    from_url = Field()
    photo_count = Field()
    follow_count = Field()
    desc = Field()
    tags = Field()


class PhotoItem(Item):
    large_img_url = Field()
    like_count = Field()
    recommend_count = Field()
    desc = Field()
```
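
For context, this is roughly how a spider callback would fill a MovieItem. The spider skeleton, the import path, and the XPath selectors below are illustrative assumptions, not the project's actual movie parser:

```python
# Hypothetical sketch of populating MovieItem from a Douban movie detail page.
from scrapy.spiders import Spider

from items import MovieItem  # assumed import path for the item definitions above


class MovieDetailSpider(Spider):
    name = "movie_detail_example"
    start_urls = []  # movie detail URLs would go here

    def parse(self, response):
        item = MovieItem()
        # The selectors assume Douban exposes metadata via v:* RDFa properties.
        item["subject_id"] = response.url.rstrip("/").split("/")[-1]
        item["name"] = response.xpath('//span[@property="v:itemreviewed"]/text()').get()
        item["year"] = response.xpath('//span[@class="year"]/text()').get()
        item["genres"] = response.xpath('//span[@property="v:genre"]/text()').getall()
        item["average"] = response.xpath('//strong[@property="v:average"]/text()').get()
        item["summary"] = response.xpath('//span[@property="v:summary"]/text()').get()
        yield item
```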
The album spider:

```python
#encoding: utf-8
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from misc.store import doubanDB
from parsers import *


class AlbumSpider(CrawlSpider):
    name = "album"
    allowed_domains = [""]
    start_urls = [
        "",
    ]

    rules = (
        # Album detail pages (first page or a "?start=N" paging URL).
        Rule(LinkExtractor(allow=r"($|\?start=\d+)"),
            callback="parse_album",
            follow=True
        ),

        # Photo detail pages.
        Rule(LinkExtractor(allow=r""),
            callback="parse_photo",
            follow=True
        ),

        # Doulist collections.
        # Rule(LinkExtractor(allow=r""),
        #     follow=True
        # ),

        # A single doulist.
        Rule(LinkExtractor(allow=r""),
            follow=True
        ),
    )

    def parse_album(self, response):
        album_parser = AlbumParser(response)
        item = dict(album_parser.item)

        if album_parser.next_page:
            return None
        # Upsert the album document, keyed by its source URL.
        spec = dict(from_url=item["from_url"])
        doubanDB.album.update_one(spec, {"$set": item}, upsert=True)

    def parse_photo(self, response):
        single = SinglePhotoParser(response)
        from_url = single.from_url
        if from_url is None:
            return
        doc = doubanDB.album.find_one({"from_url": from_url}, {"from_url": True})

        item = dict(single.item)
        if not doc:
            # The album document does not exist yet: create it with this photo.
            new_item = {"from_url": from_url, "photos": [item]}
            doubanDB.album.insert_one(new_item)
        else:
            # Append the photo only if it has not been stored already.
            spec = {"from_url": from_url}
            doc = doubanDB.album.find_one({"photos.large_img_url": item["large_img_url"]})
            if not doc:
                doubanDB.album.update_one(spec, {"$push": {"photos": item}})

        cp = CommentParser(response)
        comments = cp.get_comments()
        if not comments:
            return
        # Attach the comments to the matching photo via the positional "$" operator.
        large_img_url = item["large_img_url"]
        spec = {"photos.large_img_url": large_img_url}
        doubanDB.album.update_one(spec, {"$set": {"photos.$.comments": comments}}, upsert=True)
```
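
To check what a crawl actually wrote, a few lines of pymongo against the `album` collection are enough. A small sketch, reusing the assumed connection settings from the `misc/store.py` sketch above:

```python
# Quick sanity check of the crawled album data; connection details are assumptions.
from pymongo import MongoClient

db = MongoClient("localhost", 27017)["douban"]

print("albums stored:", db.album.count_documents({}))

# Show one album with its name, tags and number of photos.
album = db.album.find_one({}, {"album_name": 1, "tags": 1, "photos": 1})
if album:
    print(album.get("album_name"), album.get("tags"))
    print("photos:", len(album.get("photos", [])))
```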