以下代码应与下列版本兼容:
# Python 3.6.4
# Scrapy 1.5.1
# SQLite 3.21.0+
# APSW 3.9.2.post1
如果您希望使用 APSW,只需按照代码注释中的说明替换 sqlite3 即可。
假设您的 items.py 如下所示:
...
class NewAdsItem(Item):
    """Scrapy item describing one scraped ad.

    The field names are reused verbatim as column names by the SQLite
    pipeline, so renaming a field here also requires updating the table
    definition there.
    """

    AdId = Field()
    DateR = Field()
    DateA = Field()
    DateN = Field()
    DateE = Field()
    URL = Field()  # AdURL
在 settings.py 中:
# Enable the SQLite pipeline. The key is the dotted import path of the
# pipeline class (a string); the integer sets its position in the pipeline
# order.
ITEM_PIPELINES = {
    'adbot.pipelines.DbPipeline': 100,
}

# Custom settings read by DbPipeline: the SQLite database file and the
# table that scraped items are written to.
SQLITE_FILE = 'mad.db'
SQLITE_TABLE = 'ads'
在 pipelines.py 中:
import os
import sqlite3 # pip install pysqlite3
#import apsw # pip install apsw
from scrapy import signals
from scrapy.conf import settings
from adbot.items import NewAdsItem # Get items from "items.py"
# Module-level placeholders. Nothing in the visible code reads them:
# DbPipeline keeps its connection on the instance (self.con) and the only
# use of ikeys is commented out.
# NOTE(review): these look like leftovers -- confirm before removing.
con = None
ikeys = None
class DbPipeline(object):
    """Scrapy item pipeline that stores every scraped item as a SQLite row.

    The table named by the ``SQLITE_TABLE`` setting is dropped and re-created
    inside the ``SQLITE_FILE`` database each time the pipeline is
    instantiated, so every run starts from an empty table.

    To use APSW instead of sqlite3, swap the connection line in
    ``setupDBCon`` and remove the ``commit()`` call in ``storeInDb``.
    """

    # Database file and table name, taken from the project settings.
    dbfile = settings.get('SQLITE_FILE')    # ./test.db
    dbtable = settings.get('SQLITE_TABLE')

    def __init__(self):
        """Open the database connection and (re)create the target table."""
        # Pre-set both attributes so closeDB()/__del__ stay safe even when
        # the connection attempt below raises.
        self.con = None
        self.cur = None
        self.setupDBCon()
        self.createTables()

    def setupDBCon(self):
        """Connect to ``dbfile`` and keep a cursor for later statements."""
        #self.con = apsw.Connection(self.dbfile)  # apsw
        self.con = sqlite3.connect(self.dbfile)   # sqlite3
        self.cur = self.con.cursor()

    def createTables(self):
        """Drop any existing table, then create a fresh one."""
        self.dropDbTable()
        self.createDbTable()

    def dropDbTable(self):
        """Drop the target table if it exists."""
        print("Dropping old table: %s" % self.dbtable )
        # Identifiers cannot be bound as SQL parameters; the table name comes
        # from the project settings, not from scraped data.
        self.cur.execute("DROP TABLE IF EXISTS %s" % self.dbtable)

    def closeDB(self):
        """Close the database connection; calling it again is a no-op."""
        con = getattr(self, 'con', None)
        if con is not None:
            con.close()
            self.con = None
            self.cur = None

    def close_spider(self, spider):
        """Release the database connection when the spider is closed."""
        self.closeDB()

    def __del__(self):
        # Fallback only: close_spider() is the regular shutdown path, and
        # closeDB() tolerates being called a second time.
        self.closeDB()

    def createDbTable(self):
        """Create the target table with one TEXT column per item field."""
        print("Creating new table: %s" % self.dbtable )
        # NOTE: Building the column list dynamically from
        # NewAdsItem.fields.keys() does not work, because the keys come back
        # in a different order than they are declared in items.py. The
        # columns are therefore spelled out explicitly.
        cols = "AdId TEXT, DateR TEXT, DateA TEXT, DateN TEXT, DateE TEXT, URL TEXT"
        # NOTE: Use "INSERT OR IGNORE", if you also use: "AdId TEXT NOT NULL UNIQUE"
        sql = "CREATE TABLE IF NOT EXISTS %s (id INTEGER PRIMARY KEY NOT NULL, %s)" % (self.dbtable, cols)
        self.cur.execute(sql)

    def process_item(self, item, spider):
        """Store ``item`` in the database and pass it on unchanged."""
        self.storeInDb(item)
        return item

    def storeInDb(self, item):
        """Insert one item as a row, using its keys as the column names.

        Values are bound as SQL parameters. WARNING: list values are not
        handled -- join them into a single string in the spider first.
        """
        # Take the keys once so column names and bound values stay aligned.
        keys = list(item.keys())
        # NOTE: Use "INSERT OR IGNORE", if you also use: "AdId TEXT NOT NULL UNIQUE"
        # Produces e.g.: "INSERT INTO ads (AdId, DateR, URL) VALUES (?, ?, ?)"
        sql = "INSERT INTO {0} ({1}) VALUES ({2})".format(
            self.dbtable,
            ', '.join(keys),
            ', '.join(['?'] * len(keys)),
        )
        ivals = tuple(item[key] for key in keys)
        self.cur.execute(sql, ivals)
        self.con.commit()  # used in sqlite3 ONLY! (Remove for APSW)
之后,您可以在命令行中用以下命令检查数据库中的内容:
echo "select * from ads;" | sqlite3 -csv -header mad.db
Note:
Because of a difference in how the items.py item keys are ordered
when obtained via item.keys() versus by importing your item class
directly via self.ikeys = NewAdsItem.fields.keys(), you will find
that in the first case they are sorted according to the order of
appearance (in the file), whereas in the second case they are
alphabetically ordered. This is very sad, since it creates trouble
when you're trying to create the DB tables dynamically, before
having executed process_item().