我需要统计图片的数量(在本例中是 1 张图片)。应该是用 `len()` 吧?
这是HTML:
<div class="detail-headline">
Fotogaléria
</div>
<div class="detail-indent">
<table id="ctl00_ctl00_ctl00_containerHolder_mainContentHolder_innnerContentHolder_ZakazkaControl_ZakazkaObrazky1_ObrazkyDataList" cellspacing="0" border="0" style="width:100%;border-collapse:collapse;">
<tr>
<td align="center" style="width:25%;">
<div id="ctl00_ctl00_ctl00_containerHolder_mainContentHolder_innnerContentHolder_ZakazkaControl_ZakazkaObrazky1_ObrazkyDataList_ctl02_PictureContainer">
<a title="1-izb. Kaspická" class="highslide detail-img-link" onclick="return hs.expand(this);" href="/imgcache/cache231/3186-000393~8621457~640x480.jpg"><img src="/imgcache/cache231/3186-000393~8621457~120x120.jpg" class="detail-img" width="89" height="120" alt="1-izb. Kaspická" /></a>
</div>
</td><td></td>
</tr>
</table>
</div>
I previously used HTMLParser, and the number of images has to be added to `self.srcData`. Previous code:
def handle_starttag(self, tag, attrs):
    """Capture the agency name that follows the agency headline.

    State machine on ``self.status``:

    * a ``<div>`` whose second attribute is ``class="detail-headline"`` and
      whose following source line is the agency headline text sets
      ``status`` to 2;
    * while ``status`` is 2, a ``<div>`` whose first attribute is
      ``class="name"`` stores the decoded following source line in the last
      slot of ``self.record`` and resets ``status`` to 0.

    :param tag: lower-cased tag name supplied by HTMLParser.
    :param attrs: list of ``(name, value)`` pairs supplied by HTMLParser.
    """
    # self.getpos()[0] is the parser's current line number; it is used
    # directly as an index into self.srcData to read the text near the tag.
    # NOTE(review): presumably this selects the line *after* the tag because
    # parser line numbers start at 1 while the list is 0-indexed - confirm.
    #
    # u'Realitn\xe1 kancel\xe1ria' is the headline text written with
    # escapes so the source stays pure ASCII.
    if (tag == 'div' and len(attrs) > 1 and attrs[1][0] == 'class'
            and attrs[1][1] == 'detail-headline'
            and self.srcData[self.getpos()[0]].strip()
            == u'Realitn\xe1 kancel\xe1ria'):
        self.status = 2
    if (self.status == 2 and tag == 'div' and len(attrs) > 0
            and attrs[0][0] == 'class' and attrs[0][1] == 'name'):
        self.record[-1] = decode(self.srcData[self.getpos()[0]].strip())
        self.status = 0
那么(检查起始标记)..像这样吗?
# Proposed extra check for handle_starttag: flag the photo-gallery headline.
# The headline text must be a unicode literal, because the lines stored in
# self.srcData are decoded (unicode) strings; u'Fotogal\xe9ria' is the
# headline text written with an escape so the source stays pure ASCII.
if (tag == 'div' and len(attrs) > 0 and attrs[0][0] == 'class'
        and attrs[0][1] == 'detail-headline'
        and self.srcData[self.getpos()[0]].strip() == u'Fotogal\xe9ria'):
    self.status = 3
可以吗?还有呢?谢谢。
import urllib
import urllib2
import HTMLParser
import codecs
import time
from BeautifulSoup import BeautifulSoup
# decode string
def decode(istr):
    """Replace numeric character references in *istr* with their characters.

    Decimal (``&#225;``) and hexadecimal (``&#x41;``) references are
    converted.  Anything that merely looks like a reference - no closing
    ``;``, a non-numeric body, or a code point outside the valid range - is
    copied through unchanged instead of raising.

    :param istr: text that may contain ``&#...;`` references.
    :returns: the text with every valid reference replaced.
    """
    try:
        to_char = unichr  # Python 2: chr() only covers 0-255
    except NameError:
        to_char = chr  # Python 3: chr() already handles all code points

    parts = []
    idx = 0
    while idx < len(istr):
        if istr.startswith('&#', idx):
            iend = istr.find(';', idx)
            if iend > idx:
                body = istr[idx + 2:iend]
                try:
                    if body[:1] in ('x', 'X'):
                        code = int(body[1:], 16)
                    else:
                        code = int(body)
                    parts.append(to_char(code))
                except (ValueError, OverflowError):
                    # Not a valid reference: fall through and copy the
                    # '&' verbatim, then carry on with the next character.
                    pass
                else:
                    idx = iend + 1  # resume right after the ';'
                    continue
        parts.append(istr[idx])
        idx += 1
    # Joining once avoids building the result by repeated concatenation.
    return u''.join(parts)
# parser 1
class FlatDetailParser(HTMLParser.HTMLParser):
    """Line-oriented HTML parser that collects fields of a detail page.

    The page is first loaded completely into ``srcData`` (one decoded line
    per entry) and only then fed to the parser line by line.  Tag handlers
    use the parser's current line number to read the source line that
    follows the tag being handled.

    ``status`` drives a small state machine: 0 means idle, 2 means the
    agency headline was seen and the next ``<div class="name">`` carries the
    agency name, which is stored in the last slot of ``record``.
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        # Give the handlers a defined state even before loadDetails() runs.
        self.record = ['']
        self.status = 0
        self.index = -1
        self.srcData = []

    def loadDetails(self, link):
        """Download *link*, parse it and return the collected record.

        :param link: URL of the detail page.
        :returns: ``self.record``; its last slot holds the agency name if
            one was found.
        """
        # NOTE(review): self.characts is not defined in the visible code; it
        # must be provided elsewhere before this method is called.
        # NOTE(review): the list element was lost in the original text; an
        # empty-string placeholder per slot is assumed - confirm.
        self.record = (len(self.characts) + 1) * ['']
        self.status = 0
        self.index = -1
        self.reset()
        request = urllib2.Request(link)
        data = urllib2.urlopen(request)  # URL obtained from the next class
        try:
            self.srcData = [line.decode('utf8') for line in data]
        finally:
            # Release the connection even if reading or decoding fails.
            data.close()
        # Feed only after srcData is complete: the handlers look ahead into
        # lines the parser has not consumed yet.
        for line in self.srcData:
            self.feed(line)
        self.close()
        return self.record

    def _following_line(self):
        """Return the stripped source line after the parser's current line.

        getpos() line numbers start at 1 while srcData is 0-indexed, so
        using the line number as an index selects the next line.  Returns
        an empty string when the current line is the last one.
        """
        lineno = self.getpos()[0]
        if lineno < len(self.srcData):
            return self.srcData[lineno].strip()
        return u''

    def handle_starttag(self, tag, attrs):
        """Track the agency headline and capture the agency name after it.

        :param tag: lower-cased tag name supplied by HTMLParser.
        :param attrs: list of ``(name, value)`` pairs supplied by HTMLParser.
        """
        if tag != 'div':
            return
        # Look the class up by name rather than by position so the match
        # does not depend on the order or number of attributes.
        css_class = dict(attrs).get('class')
        # u'Realitn\xe1 kancel\xe1ria' is the headline text written with
        # escapes so the source stays pure ASCII.
        if (css_class == 'detail-headline'
                and self._following_line() == u'Realitn\xe1 kancel\xe1ria'):
            self.status = 2
        if self.status == 2 and css_class == 'name':
            self.record[-1] = decode(self._following_line())
            self.status = 0
下一个解析器类,并将数据添加到txt文件中。
When I use BeautifulSoup, what should I pass to `soup = BeautifulSoup(???)`? How can I add the result to `srcData`? Can the two approaches be combined, and if so, how?