Python에서 XML을 파싱할 때는 주로 elementtree 라이브러리를 사용한다. 나같이 XML에 대해 잘 몰라도 쉽게 노드에 접근하거나 노드를 추가할 수 있다.

이 라이브러리를 이용해서 간단한 RSS 파서를 만들어 봤다. 아주 간단히...ㅋㅋ

#-*-encoding:utf-8
import socket
import sys
from urllib2 import Request, urlopen

import elementtree.ElementTree as ET

class Rss(object):
    """Container for one RSS channel and the items it lists.

    Attributes:
        id: Integer identifier, 0 by default. Nothing in this file assigns
            it; presumably reserved for a storage layer -- confirm with
            the callers.
        link: Text of the channel's <link> element.
        title: Text of the channel's <title> element.
        description: Text of the channel's <description> element.
        item_list: List of RssItem objects, one per <item> of the channel.
    """

    # Immutable defaults may safely live on the class: assigning to them
    # through an instance creates an instance attribute and leaves the
    # class value untouched.
    id = 0
    link = ''
    title = ''
    description = ''

    def __init__(self):
        # A list declared at class level would be one single object shared
        # by every Rss instance, so items of different feeds would all end
        # up in the same list. Each instance gets its own list instead.
        self.item_list = []

class RssItem(object):
    """Plain record describing a single <item> entry of an RSS channel.

    Every field is declared on the class with an empty default (0 for the
    numeric ones, '' for the textual ones); code that fills an item in
    simply assigns the attributes it knows about on the instance.
    """

    # Numeric fields.
    id = 0
    site_id = 0

    # Textual fields.
    title = ''
    link = ''
    description = ''
    pub_date = ''

def _child_text(node, tag):
    """Return the stripped text of child element *tag* of *node*, or ''."""
    child = node.find(tag)
    if child is None or child.text is None:
        # A missing or empty element must not abort parsing of the rest
        # of the feed.
        return ''
    return child.text.strip()


def get_rss(rss_url):
    """Download the RSS document at *rss_url* and return it as an Rss.

    The first child of the document root is treated as the channel. Its
    title, link and description are copied onto the result, and every
    <item> child becomes an RssItem in ``item_list``.

    Args:
        rss_url: URL of the feed to download.

    Returns:
        An Rss object. If reading or parsing the response fails, the
        error is printed and the object is returned with whatever had
        been filled in up to that point.

    Raises:
        SystemExit: if the feed cannot be downloaded. A short diagnostic
            is printed first and the exit status is 1.
    """
    rss = Rss()
    # Give the result a list of its own so that items are never appended
    # to a list object shared through the Rss class.
    rss.item_list = []

    # NOTE: this sets the default timeout (in seconds) for every socket
    # created afterwards in this process, not only for this request.
    socket.setdefaulttimeout(3)

    try:
        response = urlopen(Request(rss_url))
    except IOError:
        # sys.exc_info() instead of an "except ... as" clause keeps this
        # block valid on every Python 2 release.
        e = sys.exc_info()[1]
        # Test 'code' first: an HTTP error may carry a 'reason' as well,
        # which would otherwise hide the status code.
        if hasattr(e, 'code'):
            print('The server couldn\'t fulfill the request.')
            print('Error code:  %s' % (e.code,))
        elif hasattr(e, 'reason'):
            print('We failed to reach a server.')
            print('Reason:  %s' % (e.reason,))
        else:
            # Any other I/O failure: still tell the user what happened.
            print(e)
        # Non-zero status, because the download did not succeed.
        sys.exit(1)

    try:
        try:
            rss_content = response.read()
        finally:
            # Release the connection whether or not the read succeeded.
            response.close()

        tree = ET.fromstring(rss_content)
        channel = tree[0]
        rss.title = _child_text(channel, 'title')
        rss.link = _child_text(channel, 'link')
        rss.description = _child_text(channel, 'description')

        for node in channel.findall('item'):
            rss_item = RssItem()
            rss_item.title = _child_text(node, 'title')
            rss_item.link = _child_text(node, 'link')
            rss_item.description = _child_text(node, 'description')
            rss_item.pub_date = _child_text(node, 'pubDate')
            rss.item_list.append(rss_item)
    except Exception:
        # Deliberate best effort: report the problem and hand back what
        # was parsed so far instead of failing the caller.
        print(sys.exc_info()[1])

    return rss

if __name__ == '__main__':
    # Demo run: download one feed and show the channel title.
    feed = get_rss("http://no99.tistory.com/rss")
    print(feed.title)


참고문서
(참고하기 보다는 그냥 복사해왔다는 말이 정확하겠다^^)
urllib2 - http://www.voidspace.org.uk/python/articles/urllib2.shtml
elementtree - http://effbot.org/zone/element-index.htm



Posted by xHuro
,