add scrapy

This commit is contained in:
lostecho
2024-05-25 22:26:20 +08:00
parent e4d3170769
commit 51c54107ce
3 changed files with 33 additions and 0 deletions

12
quotes.jsonl Normal file
View File

@@ -0,0 +1,12 @@
{"author": "Jane Austen", "text": "\u201cThe person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.\u201d"}
{"author": "Steve Martin", "text": "\u201cA day without sunshine is like, you know, night.\u201d"}
{"author": "Garrison Keillor", "text": "\u201cAnyone who thinks sitting in church can make you a Christian must also think that sitting in a garage can make you a car.\u201d"}
{"author": "Jim Henson", "text": "\u201cBeauty is in the eye of the beholder and it may be necessary from time to time to give a stupid or misinformed beholder a black eye.\u201d"}
{"author": "Charles M. Schulz", "text": "\u201cAll you need is love. But a little chocolate now and then doesn't hurt.\u201d"}
{"author": "Suzanne Collins", "text": "\u201cRemember, we're madly in love, so it's all right to kiss me anytime you feel like it.\u201d"}
{"author": "Charles Bukowski", "text": "\u201cSome people never go crazy. What truly horrible lives they must lead.\u201d"}
{"author": "Terry Pratchett", "text": "\u201cThe trouble with having an open mind, of course, is that people will insist on coming along and trying to put things in it.\u201d"}
{"author": "Dr. Seuss", "text": "\u201cThink left and think right and think low and think high. Oh, the thinks you can think up if only you try!\u201d"}
{"author": "George Carlin", "text": "\u201cThe reason I talk to myself is because I\u2019m the only one whose answers I accept.\u201d"}
{"author": "W.C. Fields", "text": "\u201cI am free of all prejudice. I hate everyone equally. \u201d"}
{"author": "Jane Austen", "text": "\u201cA lady's imagination is very rapid; it jumps from admiration to love, from love to matrimony in a moment.\u201d"}

19
quotes_spider.py Normal file
View File

@@ -0,0 +1,19 @@
import scrapy
class QuoteSpider(scrapy.Spider):
    """Spider that collects the author and text of each quote.

    Crawling starts at the humor tag listing and keeps following the
    page's "next" pagination link for as long as one is found.
    """

    name = "quotes"
    start_urls = [
        'http://quotes.toscrape.com/tag/humor/',
    ]

    def parse(self, response):
        """Yield one dict per quote on the page, then a follow-up request.

        Each yielded dict has the keys ``author`` and ``text``; tags are
        not extracted. When the page has a ``li.next`` link, a request
        for it is yielded with this same method as its callback.
        """
        for quote_block in response.css('div.quote'):
            author_name = quote_block.css('small.author::text').get()
            quote_text = quote_block.css('span.text::text').get()
            yield {'author': author_name, 'text': quote_text}

        next_href = response.css('li.next a::attr(href)').get()
        if next_href is None:
            # No "next" link on this page: nothing further to request.
            return
        yield response.follow(next_href, callback=self.parse)

View File

@@ -0,0 +1,2 @@
requests~=2.31.0
ipython~=8.24.0