最近需要使用 RSS,把订阅内容下载到自己的数据库里,方便阅读。要快速实现,当然用 Python:库齐全,而且资料多。废话不说,以下是代码:
# coding=utf-8
"""Fetch an RSS feed over HTTP and print every entry.

Requires the third-party ``requests`` and ``feedparser`` packages.
feedparser docs: http://pythonhosted.org/feedparser/
"""
import socket
import time

import feedparser
import requests

socket.setdefaulttimeout(20)

# Maximum number of download attempts before giving up
# (the original retry loop also allowed exactly two tries).
MAX_TRIES = 2


def GetHtml3(mhtml):
    """Download the feed at URL *mhtml* and return its decoded text.

    Returns the feed body as a str on success, or the sentinel string
    'no,data' when the response is not XML, the server answers HTTP 400,
    or every retry attempt fails.
    """
    headers = {
        "User-Agent": ("Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.31 "
                       "(KHTML, like Gecko) Chrome/26.0.1410.43 Safari/537.31"),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    }
    for attempt in range(1, MAX_TRIES + 1):
        try:
            # Session closed deterministically; the original leaked one
            # per retry iteration.
            with requests.Session() as session:
                resp = session.get(mhtml, headers=headers, timeout=19)
            # .get avoids a KeyError when the server omits the header.
            ct = resp.headers.get("Content-Type", "")
            print("Content-Type:", ct)
            if "xml" not in ct:
                return "no,data"
            # The original blindly tried GBK first, which "succeeds" for most
            # byte sequences and corrupts UTF-8 feeds. Let requests sniff the
            # real charset from the body when the header does not declare one.
            if resp.encoding is None or resp.encoding.lower() == "iso-8859-1":
                resp.encoding = resp.apparent_encoding
            print("bianma:", resp.encoding)
            return resp.text
        except Exception as e:
            errstr = str(e)
            print("GetHtml2 error:", errstr)
            # A 400 response will never succeed on retry; bail out at once.
            if "HTTP Error 400" in errstr:
                return "no,data"
            if attempt >= MAX_TRIES:
                print("have try again , system exit\n")
                return "no,data"
            print("2:......try again" + str(attempt))
            time.sleep(2)
    return "no,data"


def chuli_rss(getstrx):
    """Parse feed text *getstrx* with feedparser and print each entry.

    Prints the feed title, the entry count, and for every entry its
    title, link, description and published date.
    Returns 'ok' on success, '' when parsing fails.
    """
    try:
        d = feedparser.parse(getstrx)
        print(d.feed.title)
        print(len(d["items"]))
        # Iterate entries directly; an empty feed simply yields no output,
        # so the original explicit length guard is unnecessary.
        for entry in d.entries:
            print(entry.title)
            print(entry.link)
            print(entry.description)
            print(entry.published)
        return "ok"
    except Exception as e:
        print("chuli rss error:", e)
        return ""


if __name__ == "__main__":
    # Guarded entry point: the original fetched the feed on import.
    eurl = "https://www.lpfrx.com/feed/"
    pstr = GetHtml3(eurl)
    chuli_rss(pstr)
feedparser 和 requests 这两个库需要先安装。最近蛮喜欢用 requests 这个库,比 urllib 方便。