python获取Rss

最近需要抓取 RSS 并保存到自己的数据库,方便阅读。想快速搞定,当然用 Python:库齐全,资料也多。废话不多说,以下是代码:

#coding=utf8

import requests
import re,os,time,sys
import string
import socket

socket.setdefaulttimeout(20) 
import feedparser



# feedparser documentation => http://pythonhosted.org/feedparser/


def _decode_to_utf8(raw):
    """Return ``(utf8_bytes, label)`` for the raw response body *raw*.

    UTF-8 is checked first: UTF-8 byte sequences frequently also decode as
    GBK without raising, which would silently turn a valid UTF-8 feed into
    mojibake if GBK were tried first. Only when *raw* is not valid UTF-8 is
    it treated as GBK and re-encoded. If neither codec accepts the data the
    bytes are returned untouched with the label ``"unknown"``.
    """
    try:
        raw.decode("utf8")
        return raw, "utf8"
    except UnicodeError:
        pass

    try:
        return raw.decode("gbk").encode("utf8", "ignore"), "gbk"
    except UnicodeError:
        return raw, "unknown"


def GetHtml3(mhtml):
    """Download the document at URL *mhtml* and return its body as UTF-8 bytes.

    The request is sent with browser-like headers and a 19 second timeout.
    If the request raises, it is retried once after a 2 second pause.

    Returns the string ``'no,data'`` instead of the body when:
      * the response's Content-Type header is missing or does not mention
        ``xml``;
      * the error text contains ``'HTTP Error 400'``;
      * the request has failed twice.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.43 Safari/537.31",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    }

    gg = 1  # attempt counter: a second failure gives up
    while 1:
        s = requests.Session()
        try:
            ss = s.get(mhtml, headers=headers, timeout=19)

            # .get() so a response without the header is rejected directly
            # instead of raising KeyError and triggering a pointless retry.
            ct = ss.headers.get('Content-Type', '')
            print('Content-Type: %s' % ct)
            if 'xml' not in ct.lower():
                return 'no,data'

            WS_response2, bianma = _decode_to_utf8(ss.content)
            print("bianma: %s" % bianma)
            return WS_response2
        except Exception as e:
            errstr = str(e)
            print("GetHtml3 error: %s" % errstr)
            # Kept from the original. NOTE(review): this looks like a
            # urllib2-style message and may never match with requests.
            if errstr.find('HTTP Error 400') != -1:
                return 'no,data'

            if gg > 1:
                print("have try again , system exit\n")
                return 'no,data'

            print("2:......try again" + str(gg))
            time.sleep(2)
            gg = gg + 1
        finally:
            # Release the connection pool on every path (return or retry).
            s.close()


def chuli_rss(getstrx):
    """Parse the feed document *getstrx* with feedparser and print it.

    Prints the feed title, the number of entries, and then the title, link,
    description and published date of every entry.

    Returns ``"ok"`` on success. If parsing or printing raises (for example
    because the document has no feed title), the error is printed and ``""``
    is returned.
    """
    try:
        d = feedparser.parse(getstrx)
        # Strict attribute access on purpose: input that yields no feed
        # title raises here and is reported as a failure below.
        print(d.feed.title)

        entries = d.entries
        print(len(entries))

        for entry in entries:
            # .get() with a default: an entry lacking one of these fields
            # prints an empty line instead of aborting the remaining entries.
            print(entry.get('title', ''))
            print(entry.get('link', ''))
            print(entry.get('description', ''))
            print(entry.get('published', ''))

        return "ok"
    except Exception as e:
        print("chuli rss error: %s" % e)
        return ""



# Feed to download and print.
eurl = "https://www.lpfrx.com/feed/"

pstr = GetHtml3(eurl)

# GetHtml3 returns the sentinel 'no,data' when the download failed or the
# response was not XML; do not hand that sentinel to the parser as if it
# were a feed document.
if pstr == 'no,data':
    print("fetch failed: %s" % eurl)
else:
    chuli_rss(pstr)

feedparser 和 requests 这两个库需要另行安装(例如 pip install feedparser requests)。最近蛮喜欢用 requests 这个库,比 urllib 方便。

关于无聊人

一个无聊人而已
此条目发表在python分类目录。将固定链接加入收藏夹。

发表回复

您的电子邮箱地址不会被公开。 必填项已用 * 标注