Getting blocked while using Scrapy (With User Agent)
问题 {#heading}
import scrapy
from scrapy.spiders import Rule, CrawlSpider
from scrapy.linkextractors import LinkExtractor
class QuoteSpider(CrawlSpider):
name = "Quote"
allowed_domains = ["avaldsnes.spoortz.no"]
start_urls = ["https://avaldsnes.spoortz.no/portal/arego/club/7"]
rules = (Rule(LinkExtractor(allow="")),)
custom_settings = {"USER_AGENT": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"}
def parse(self, response):
我尝试爬取该网站的链接,但没有一个尝试成功。目前用户代理设置为Google Bot,但我也尝试过常规的用户代理。 英文:
I am trying to scrape a casual sports-team website in my country that keeps blocking my Scrapy attempts. I have tried setting a User Agent, but without any success. As soon as I run Scrapy, I get a 429 Unknown Status response, and not a single 200 success. I am able to visit the website in my browser, so I know my IP is not blocked. Any help would be appreciated.
Here is the code I am using:
import scrapy
from scrapy.spiders import Rule, CrawlSpider
from scrapy.linkextractors import LinkExtractor
class QuoteSpider(CrawlSpider):
name = "Quote"
allowed_domains = ["avaldsnes.spoortz.no"]
start_urls = ["https://avaldsnes.spoortz.no/portal/arego/club/7"]
rules = (Rule(LinkExtractor(allow="")),)
custom_settings = {"USER_AGENT": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"}
def parse(self, response):
I tried crawling the website for its links, but not one attempt succeeded. Right now the user agent is set to Googlebot, but I have tried regular ones as well.
答案1 {#1}
得分: 1
from scrapy.spiders import Rule, CrawlSpider
from scrapy.linkextractors import LinkExtractor
class QuoteSpider(CrawlSpider):
name = "Quote"
allowed_domains = ["avaldsnes.spoortz.no"]
start_urls = ["https://avaldsnes.spoortz.no/portal/arego/club/7"]
headers = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "en-US,en;q=0.5",
"Cache-Control": "no-cache",
"Connection": "keep-alive",
"DNT": "1",
"Host": "avaldsnes.spoortz.no",
"Pragma": "no-cache",
"Sec-Fetch-Dest": "document",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-Site": "none",
"Sec-Fetch-User": "?1",
"Upgrade-Insecure-Requests": "1",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
rules = (Rule(LinkExtractor(allow="")),)
custom_settings = {
def parse(self, response):
[scrapy.core.engine] DEBUG: Crawled (200) <GET https://avaldsnes.spoortz.no/portal/arego/club/7> (referer: None)
In this case you need to set the headers (and not just the user agent).
from scrapy.spiders import Rule, CrawlSpider
from scrapy.linkextractors import LinkExtractor
class QuoteSpider(CrawlSpider):
name = "Quote"
allowed_domains = ["avaldsnes.spoortz.no"]
start_urls = ["https://avaldsnes.spoortz.no/portal/arego/club/7"]
headers = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "en-US,en;q=0.5",
"Cache-Control": "no-cache",
"Connection": "keep-alive",
"DNT": "1",
"Host": "avaldsnes.spoortz.no",
"Pragma": "no-cache",
"Sec-Fetch-Dest": "document",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-Site": "none",
"Sec-Fetch-User": "?1",
"Upgrade-Insecure-Requests": "1",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
rules = (Rule(LinkExtractor(allow="")),)
custom_settings = {
def parse(self, response):
[scrapy.core.engine] DEBUG: Crawled (200) <GET https://avaldsnes.spoortz.no/portal/arego/club/7> (referer: None)