I recently needed to pull RSS feeds down into my own database so they are easier to read. For a quick job like this, Python is the obvious choice: the libraries are all there and documentation is plentiful. Without further ado, here is the code:
# coding=utf-8
import socket
import time

import requests
import feedparser
# feedparser docs => http://pythonhosted.org/feedparser/

socket.setdefaulttimeout(20)
def GetHtml3(mhtml):
    """Fetch mhtml and return the feed body as text, or 'no,data' on failure."""
    headers = dict()
    headers["User-Agent"] = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.43 Safari/537.31"
    headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
    gg = 1  # attempt counter: retry once on a network error, then give up
    while True:
        s = requests.Session()
        try:
            ss = s.get(mhtml, headers=headers, timeout=19)
            ct = ss.headers.get('Content-Type', '')
            print('Content-Type:', ct)
            if 'xml' not in ct:
                return 'no,data'
            # Decode the raw bytes: try UTF-8 first and fall back to GBK.
            # (Trying GBK first is unreliable: GBK accepts almost any byte
            # sequence, so UTF-8 content would routinely be mislabeled.)
            try:
                WS_response2 = ss.content.decode('utf8')
                bianma = 'utf8'
            except UnicodeDecodeError:
                WS_response2 = ss.content.decode('gbk', 'ignore')
                bianma = 'gbk'
            print('bianma:', bianma)
            return WS_response2
        except Exception as e:
            errstr = str(e)
            print('GetHtml3 error:', errstr)
            if 'HTTP Error 400' in errstr:
                return 'no,data'
            if gg > 1:
                print('already retried once, giving up\n')
                return 'no,data'
            print('2:......try again', gg)
            time.sleep(2)
            gg = gg + 1
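A side note: requests can do most of this encoding guessing by itself, so the manual UTF-8/GBK juggling above is optional. A minimal sketch (fetch_feed_text is my own helper name, not part of the script above), leaning on requests' apparent_encoding when the server sends no usable charset:

import requests

def fetch_feed_text(url, timeout=19):
    # requests decodes resp.text using the charset from the Content-Type
    # header; apparent_encoding is its guess from the raw bytes.
    resp = requests.get(url, timeout=timeout)
    resp.raise_for_status()
    if resp.encoding is None:
        resp.encoding = resp.apparent_encoding
    return resp.text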
def chuli_rss(getstrx):
    """Parse the feed text with feedparser and print every entry."""
    try:
        d = feedparser.parse(getstrx)
        print(d.feed.title)
        print(len(d['items']))
        for x in range(len(d['items'])):
            print(d.entries[x].title)
            print(d.entries[x].link)
            print(d.entries[x].description)
            print(d.entries[x].published)
            #print(d.entries[x].keys())
        return "ok"
    except Exception as e:
        print("chuli rss error:", e)
        return ""
eurl = "https://www.lpfrx.com/feed/"
pstr = GetHtml3(eurl)
chuli_rss(pstr)
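Since the whole point is to get the entries into my own database, here is a minimal sketch of persisting them with the stdlib sqlite3 module. The save_feed helper, the table name and the schema are my own choices, not something from the script above:

import sqlite3
import feedparser

def save_feed(text, db_path='rss.db'):
    # Hypothetical schema: one row per entry, link as the unique key
    # so re-running on the same feed does not insert duplicate rows.
    d = feedparser.parse(text)
    conn = sqlite3.connect(db_path)
    conn.execute("""CREATE TABLE IF NOT EXISTS entries (
                        link TEXT PRIMARY KEY,
                        title TEXT,
                        description TEXT,
                        published TEXT)""")
    for entry in d.entries:
        conn.execute("INSERT OR IGNORE INTO entries VALUES (?, ?, ?, ?)",
                     (entry.get('link', ''), entry.get('title', ''),
                      entry.get('description', ''), entry.get('published', '')))
    conn.commit()
    conn.close()

Calling save_feed(pstr) after the fetch above would store the feed instead of just printing it.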
Note that feedparser and requests both need to be installed. Lately I have grown quite fond of requests; it is more convenient than urllib.
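A quick illustration of the difference, as a sketch fetching the same URL both ways:

import urllib.request
import requests

url = "https://www.lpfrx.com/feed/"
headers = {"User-Agent": "Mozilla/5.0"}

# urllib: build a Request object, then decode the raw bytes yourself
req = urllib.request.Request(url, headers=headers)
body1 = urllib.request.urlopen(req, timeout=19).read().decode('utf-8')

# requests: headers and decoding handled in a single call
body2 = requests.get(url, headers=headers, timeout=19).text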