I have already changed the pagelink part to match what 3F posted, but it still throws an error.
This is driving me crazy... could someone more experienced please point out what I am doing wrong?
from lxml import etree
import cStringIO
import codecs
import contextlib
import csv
import datetime
import glob
import itertools
import operator
import os
import os.path
import re
import urllib
import urllib2
import time

class UnicodeWriter:
    # CSV writer that takes rows of unicode strings and writes them to the
    # stream f in the requested encoding (the recipe from the Python 2 csv docs).
    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        self.queue = cStringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        self.encoder = codecs.getincrementalencoder(encoding)()

    def writerow(self, row):
        # Write the row as UTF-8 into the queue, then re-encode it to the
        # target encoding (cp950 below) before writing it to the real stream.
        self.writer.writerow([s.encode("utf-8") for s in row])
        data = self.queue.getvalue()
        data = data.decode("utf-8")
        data = self.encoder.encode(data)
        self.stream.write(data)
        self.queue.truncate(0)

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)
idfilter = re.compile(r'^\d{4}$')

def get_html(link):
    # Fetch the URL and return the raw HTML as a byte string.
    with contextlib.closing(urllib2.urlopen(link)) as netfile:
        html = netfile.read()
    return html

def get_sii_pagenum(id):
    # Ask bsMenu.aspx for the number of pages of broker data for this stock id.
    # The ASP.NET hidden fields and txtTASKNO=1101 are hard-coded in the query
    # string, and the stock id is concatenated at the very end.
    pagelink = 'http://bsr.twse.com.tw/bshtm/bsMenu.aspx?HiddenField_page=PAGE_BS&HiddenField_spDate=&__EVENTTARGET=&__EVENTARGUMENT=&__VIEWSTATE=%2FwEPDwUKLTQzNzI3ODE3MQ9kFgICAQ9kFg4CBQ8WAh4JaW5uZXJodG1sBQoyMDE0LzA5LzA1ZAIGDxYCHwAFCDIwMTQwOTA1ZAIIDw8WBh4JRm9udF9Cb2xkZx4EXyFTQgKEEB4JRm9yZUNvbG9yCj1kZAIKD2QWAgIBDw9kFgIeB09uQ2xpY2sFHGphdmFzY3JpcHQ6YnV0Q2xlYXJfQ2xpY2soKTtkAgwPDxYGHwFoHwIChBAfAwpHZGQCDg8PFgIeB1Zpc2libGVoZGQCEA8PFgYfAWgfAgKEEB8DCkdkZGSi2zjocmrGJllnPH1VNPbh&__EVENTVALIDATION=%2FwEdAAkJ3iIwU1Mi8o7slM8DAmCjib%2BVrHjO6GeEEDcmd50Vv%2FHinYr2havdthGBI4bn%2FuVafk25GKI%2BxFnm8toEIF08OdRfizmino3LPjd4bEI%2Fa8n%2BshoO65Mgov1Gk1LiWrVFJyI109haYg0KuKnXCrDTo3DdQysje%2Ft8sfeZbMNL%2FQdIuRSdO0Evx9MQYYVhnqHnlqy1Mf5yy3u9S%2Fzb2lLx&HiddenField_spDate=&HiddenField_page=PAGE_BS&txtTASKNO=1101&hidTASKNO=&btnOK=%E6%9F%A5%E8%A9%A2' + id
    html = get_html(pagelink)
    # sp_ListCount holds the page count; this raises IndexError if that span
    # is not present in the page the server returned.
    pagenum = etree.HTML(html).xpath('//span[@id="sp_ListCount"]')[0].text
    return pagenum

def save_sii_csv(csvfile, html):
    result = []
    parser = etree.HTMLParser(encoding='utf-8')
    tree = etree.parse(cStringIO.StringIO(html), parser)
    # table1 header cells: labels and values alternate, so split them into two rows.
    table1_first = tree.xpath('//table[@id="table1"]')[0]
    tds = table1_first.xpath('tr//td[@id or @class]')
    tds_text = [s.text.replace(u'\xa0', ' ').strip() for s in tds]
    result.append(tds_text[0::2])
    result.append(tds_text[1::2])
    # Column title row.
    column_title_1 = tree.xpath('//tr[@class="column_title_1"]')[0]
    column_title_1_text = [s.text.strip() for s in column_title_1.xpath('td')]
    result.append(column_title_1_text)
    # Data rows: drop rows with empty cells and sort by the serial number column.
    column_value = tree.xpath('//tr[starts-with(@class, "column_value")]')
    column_value_text = [[s.text.strip() for s in cv.xpath('td')] for cv in column_value]
    column_value_text = filter(all, column_value_text)
    column_value_text.sort(key=lambda cvt: int(cvt[0]))
    result.extend(column_value_text)
    # Write everything out as a cp950 (Big5) encoded CSV.
    with open(csvfile, 'wb') as outfile:
        writer = UnicodeWriter(outfile, csv.excel, 'cp950')
        writer.writerows(result)

def get_sii(idlist):
    for id in idlist:
        pagenum = get_sii_pagenum(id)
        if pagenum:
            # Fetch all pages at once and save them as <id>.csv.
            link = 'http://bsr.twse.com.tw/bshtm/bsContent.aspx?StartNumber=' + id + '&FocusIndex=All_' + pagenum
            html = get_html(link)
            save_sii_csv(id + '.csv', html)

mkttypes = (('1', 'sii', get_sii), ('2', 'sii', get_sii))

# Build one (id, fetch-function) list per market type from <mktname>.txt,
# keeping only 4-digit stock ids.
id_getfunc_list = []
for mktidx, mktname, getfunc in mkttypes:
    with open(mktname + '.txt', 'rb') as infile:
        content = infile.readlines()
    idlist = [s.strip().split(' ')[0] for s in content]
    idlist = [id for id in idlist if idfilter.match(id)]
    id_getfunc_list.append(list(itertools.izip_longest(idlist, [getfunc], fillvalue=getfunc)))

# Interleave the market lists and fetch one id at a time, pausing between requests.
for stuffs in itertools.izip_longest(*id_getfunc_list):
    for stuff in stuffs:
        if stuff is not None:
            stuff[1]([stuff[0]])
            time.sleep(1)

# Remove empty CSV files left behind by ids with no data.
for fn in glob.iglob('*.csv'):
    if os.path.getsize(fn) < 2:
        os.remove(fn)
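
If it helps to narrow things down, the page-count step can be tested on its own with something like the following, pasted after the function definitions above (debug_pagenum is just a throwaway helper I made up for testing, and '1101' is only an example id):

import traceback

def debug_pagenum(testid):
    # Throwaway helper: run the page-count lookup for a single id and print
    # the full traceback instead of letting the whole script stop.
    try:
        print testid, '->', get_sii_pagenum(testid)
    except Exception:
        traceback.print_exc()

debug_pagenum('1101')

That way at least the exact line that raises the error shows up, and I can post the traceback here too.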