diff --git a/quotes.jsonl b/quotes.jsonl new file mode 100644 index 0000000..7cc355a --- /dev/null +++ b/quotes.jsonl @@ -0,0 +1,12 @@ +{"author": "Jane Austen", "text": "\u201cThe person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.\u201d"} +{"author": "Steve Martin", "text": "\u201cA day without sunshine is like, you know, night.\u201d"} +{"author": "Garrison Keillor", "text": "\u201cAnyone who thinks sitting in church can make you a Christian must also think that sitting in a garage can make you a car.\u201d"} +{"author": "Jim Henson", "text": "\u201cBeauty is in the eye of the beholder and it may be necessary from time to time to give a stupid or misinformed beholder a black eye.\u201d"} +{"author": "Charles M. Schulz", "text": "\u201cAll you need is love. But a little chocolate now and then doesn't hurt.\u201d"} +{"author": "Suzanne Collins", "text": "\u201cRemember, we're madly in love, so it's all right to kiss me anytime you feel like it.\u201d"} +{"author": "Charles Bukowski", "text": "\u201cSome people never go crazy. What truly horrible lives they must lead.\u201d"} +{"author": "Terry Pratchett", "text": "\u201cThe trouble with having an open mind, of course, is that people will insist on coming along and trying to put things in it.\u201d"} +{"author": "Dr. Seuss", "text": "\u201cThink left and think right and think low and think high. Oh, the thinks you can think up if only you try!\u201d"} +{"author": "George Carlin", "text": "\u201cThe reason I talk to myself is because I\u2019m the only one whose answers I accept.\u201d"} +{"author": "W.C. Fields", "text": "\u201cI am free of all prejudice. I hate everyone equally. 
\u201d"}
+{"author": "Jane Austen", "text": "\u201cA lady's imagination is very rapid; it jumps from admiration to love, from love to matrimony in a moment.\u201d"}
diff --git a/quotes_spider.py b/quotes_spider.py
new file mode 100644
index 0000000..3447873
--- /dev/null
+++ b/quotes_spider.py
@@ -0,0 +1,34 @@
+"""Scrapy spider for the humor-tagged quotes on quotes.toscrape.com."""
+import scrapy
+
+
+class QuoteSpider(scrapy.Spider):
+    """Crawl the humor tag listing and yield one item per quote."""
+
+    name = "quotes"
+    start_urls = [
+        'http://quotes.toscrape.com/tag/humor/',
+    ]
+
+    def parse(self, response):
+        """Yield an author/text dict per quote, then follow pagination.
+
+        Args:
+            response: The listing page response passed in by Scrapy.
+
+        Yields:
+            A dict with ``author`` and ``text`` keys for each ``div.quote``
+            element, then a request for the next page if a ``li.next`` link
+            is present.
+        """
+        for quote_block in response.css('div.quote'):
+            author = quote_block.css('small.author::text').get()
+            text = quote_block.css('span.text::text').get()
+            yield {'author': author, 'text': text}
+
+        next_href = response.css('li.next a::attr(href)').get()
+        if next_href is None:
+            return
+        # response.follow accepts the relative href and resolves it against
+        # the current page URL.
+        yield response.follow(next_href, callback=self.parse)
diff --git a/requirements.txt b/requirements.txt
index e69de29..8d5fcaf 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+requests~=2.31.0
+ipython~=8.24.0
+scrapy~=2.11.0