add scrapy

2024-05-25 22:26:20 +08:00
parent e4d3170769
commit 51c54107ce
3 changed files with 33 additions and 0 deletions
--- a/quotes.jsonl
+++ b/quotes.jsonl
@@ -0,0 +1,12 @@
 {"author": "Jane Austen", "text": "\u201cThe person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.\u201d"}
 {"author": "Steve Martin", "text": "\u201cA day without sunshine is like, you know, night.\u201d"}
 {"author": "Garrison Keillor", "text": "\u201cAnyone who thinks sitting in church can make you a Christian must also think that sitting in a garage can make you a car.\u201d"}
 {"author": "Jim Henson", "text": "\u201cBeauty is in the eye of the beholder and it may be necessary from time to time to give a stupid or misinformed beholder a black eye.\u201d"}
 {"author": "Charles M. Schulz", "text": "\u201cAll you need is love. But a little chocolate now and then doesn't hurt.\u201d"}
 {"author": "Suzanne Collins", "text": "\u201cRemember, we're madly in love, so it's all right to kiss me anytime you feel like it.\u201d"}
 {"author": "Charles Bukowski", "text": "\u201cSome people never go crazy. What truly horrible lives they must lead.\u201d"}
 {"author": "Terry Pratchett", "text": "\u201cThe trouble with having an open mind, of course, is that people will insist on coming along and trying to put things in it.\u201d"}
 {"author": "Dr. Seuss", "text": "\u201cThink left and think right and think low and think high. Oh, the thinks you can think up if only you try!\u201d"}
 {"author": "George Carlin", "text": "\u201cThe reason I talk to myself is because I\u2019m the only one whose answers I accept.\u201d"}
 {"author": "W.C. Fields", "text": "\u201cI am free of all prejudice. I hate everyone equally. \u201d"}
 {"author": "Jane Austen", "text": "\u201cA lady's imagination is very rapid; it jumps from admiration to love, from love to matrimony in a moment.\u201d"}
--- a/quotes_spider.py
+++ b/quotes_spider.py
@@ -0,0 +1,19 @@
 import scrapy
 class QuoteSpider(scrapy.Spider):
    """Spider that collects quotes from the "humor" tag pages of quotes.toscrape.com.

    Each ``div.quote`` element on a page is emitted as a dict with the
    keys ``author`` and ``text``; pagination is followed through the
    ``li.next`` link until no such link is found.
    """

    name = "quotes"
    start_urls = ['http://quotes.toscrape.com/tag/humor/']

    def parse(self, response):
        """Yield one item per quote on the page, then a request for the next page."""
        for entry in response.css('div.quote'):
            author = entry.css('small.author::text').get()
            text = entry.css('span.text::text').get()
            # Tags are not exported at the moment; the selector for them
            # would be 'div.tags a.tag::text' (with getall()).
            yield {'author': author, 'text': text}

        next_href = response.css('li.next a::attr(href)').get()
        if next_href is None:
            # No "next" link on this page, so there is nothing left to follow.
            return
        yield response.follow(next_href, callback=self.parse)
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,2 @@
 requests~=2.31.0
 ipython~=8.24.0