This is the BaseSpider example from the Scrapy tutorial:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from dmoz.items import DmozItem
class DmozSpider(BaseSpider):
domain_name = "dmoz.org"
start_urls = [
"http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
"http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
]
def parse(self, response):
hxs = HtmlXPathSelector(response)
sites = hxs.select( //ul[2]/li )
items = []
for site in sites:
item = DmozItem()
item[ title ] = site.select( a/text() ).extract()
item[ link ] = site.select( a/@href ).extract()
item[ desc ] = site.select( text() ).extract()
items.append(item)
return items
SPIDER = DmozSpider()
I copied it with changes for my project:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item
from firm.items import FirmItem
class Spider1(CrawlSpider):
domain_name = wc2
start_urls = [ http://www.whitecase.com/Attorneys/List.aspx?LastName=A ]
def parse(self, response):
hxs = HtmlXPathSelector(response)
sites = hxs.select( //td[@class="altRow"][1]/a/@href ).re( /.aw+ )
items = []
for site in sites:
item = FirmItem
item[ school ] = hxs.select( //td[@class="mainColumnTDa"] ).re( (JD)(.*?)(d+) )
items.append(item)
return items
SPIDER = Spider1()
and I get the error
[wc2] ERROR: Spider exception caught while processing
<http://www.whitecase.com/Attorneys/List.aspx?LastName=A> (referer: <None>):
[Failure instance: Traceback: <type exceptions.TypeError >:
ItemMeta object does not support item assignment
I would greatly appreciate it if experts here take a look at the code and give me a clue about where I am going wrong.
Thank you