#!/usr/bin/python
# -*- coding: utf-8 -*-
#
#  The zlib/libpng License
#  Copyright (c) 2005 skyisle <skyisle@gmail.com>
#
#  This software is provided 'as-is', without any express or implied warranty. 
#  In no event will the authors be held liable for any damages arising from the use of this software.
#
#  Permission is granted to anyone to use this software for any purpose, 
#  including commercial applications, and to alter it and redistribute it freely, 
#  subject to the following restrictions:
#
#  1. The origin of this software must not be misrepresented; 
#     you must not claim that you wrote the original software. 
#     If you use this software in a product, 
#     an acknowledgment in the product documentation would be appreciated but is not required.
#
#  2. Altered source versions must be plainly marked as such, 
#     and must not be misrepresented as being the original software.
#
#  3. This notice may not be removed or altered from any source distribution.
#
# Author: skyisle <skyisle@gmail.com>

import socket, httplib, urllib
import time, string, re
import datetime
import urllib2

# Character encoding shared by the whole script: egloosGrabber decodes scraped
# page text with it, and Comment.__str__ / Post.__str__ encode with it.
WWWENCODING = 'cp949'

def unescape(data, entities=None):
    """Unescape &amp;, &lt;, and &gt; in a string of data.

    You can unescape other strings of data by passing a dictionary as
    the optional entities parameter.  The keys and values must all be
    strings; each key will be replaced with its corresponding value.

    data     -- the text to unescape.
    entities -- optional mapping of extra ``{escaped: replacement}`` pairs;
                ``None`` or an empty mapping means "no extra entities".

    Returns the unescaped string.
    """
    data = data.replace("&lt;", "<")
    data = data.replace("&gt;", ">")
    if entities:
        # Replacement is done inline: the helper the original code called for
        # this step is not defined anywhere in this module.
        for escaped, replacement in entities.items():
            data = data.replace(escaped, replacement)
    # must do ampersand last, otherwise "&amp;lt;" would be unescaped twice
    return data.replace("&amp;", "&")

class Comment:
    """One comment attached to a post.

    Stores the author name, author URL, date, text and approval flag exactly
    as they are handed to the constructor.  Instances compare with ``<`` by
    their ``date`` attribute, which is what ``list.sort()`` relies on.
    """
    # Class-level placeholders; __init__ rebinds every one on the instance.
    author     = None
    author_url = None
    date       = None
    content    = None
    approved   = None

    def __init__ (self, author , author_url, date ,content ,approved):
        """Store the five fields unchanged on the new instance."""
        (self.author,
         self.author_url,
         self.date,
         self.content,
         self.approved) = (author, author_url, date, content, approved)

    def __str__ (self):
        """Return '[author] date :content' with the text fields encoded in WWWENCODING."""
        pieces = ['['
                 ,self.author.encode(WWWENCODING)
                 ,'] '
                 ,str(self.date)
                 ,' :'
                 ,self.content.encode(WWWENCODING)]
        return ''.join(pieces)

    def __lt__ (self, other):
        """Order comments by their date."""
        return self.date < other.date

class Image:
    """An image referenced from a post body.

    desc    -- description text for the image.
    width   -- image width, stored as given by the caller.
    height  -- image height, stored as given by the caller.
    src_url -- URL the image is loaded from.
    """
    # Class-level placeholders; __init__ rebinds every one on the instance.
    desc        = None
    width       = None
    height      = None
    src_url     = None

    def __init__ (self, desc, width, height, src_url ):
        """Store the four fields unchanged on the new instance."""
        self.desc        = desc
        self.width       = width
        self.height      = height
        # This line used to be indented with a tab while its siblings used
        # spaces, which Python 3 and ``python -tt`` reject as a TabError.
        self.src_url     = src_url

class Post:
    """A blog post together with its images and comments.

    title, author, content, category -- text fields of the post.
    date           -- posting date; posts compare with ``<`` by this value.
    images         -- sequence of Image objects referenced by the content.
    status         -- publication status string (class default 'private').
    comment_status -- comment permission string (class default 'open').
    ping_status    -- ping permission string (class default 'open').
    comments       -- sequence of Comment objects.
    """
    title    = None
    author   = None
    date     = None
    content  = None
    category = None

    status = 'private'
    comment_status = 'open'
    ping_status    = 'open'

    # Placeholders only: __init__ always rebinds both names on the instance,
    # and no method of this class mutates these shared lists.
    comments = []
    images = []

    def __init__ (self, title, author, date, content, images,  category, status, comment_status, comments, ping_status):
        """Store every argument unchanged on the new instance."""
        self.title     = title
        self.author    = author
        self.date      = date
        self.content   = content
        self.images    = images
        self.category  = category

        self.status    = status

        self.comment_status = comment_status
        self.comments  = comments
        self.ping_status    = ping_status

    def __str__ (self):
        """Return a multi-line dump: header fields, the content, then one tab-indented line per comment."""
        lines = ['title : ' + self.title.encode(WWWENCODING)
                ,'author : ' + self.author.encode(WWWENCODING)
                ,'date : ' + str(self.date)
                ,'status : ' + self.status.encode(WWWENCODING)
                ,self.content.encode(WWWENCODING)]
        comment_lines = ['\n\t' + str(comment) for comment in self.comments]
        return '\n'.join(lines) + ''.join(comment_lines)

    def __lt__ (self, other):
        """Order posts by their date."""
        return self.date < other.date


class webConnecter:
    """Small HTTP helper built on a urllib2 opener.

    The opener gets a cookie processor and a fixed set of browser-like request
    headers.  Subclasses use HTTPGetContents / HTTPPostContents to fetch page
    bodies and setReferer / removeReferer to manage the Referer header.
    """
    HOST = ''

    def __init__ (self):
        """Create the opener and install the default request headers."""
        self.opener = urllib2.build_opener()
        self.opener.add_handler(urllib2.HTTPCookieProcessor())
        self.opener.addheaders = [('User-agent','Mozilla/5.0 (Windows; U; Windows NT 5.0; ko-KR; rv:1.7.8) Gecko/20050511 Firefox/1.0.4'),
                        ('Accept', 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5'),
                        ('Accept-Language', 'ko-kr,ko;q=0.8,en-us;q=0.5,en;q=0.3'),
                        ('Accept-Encoding', 'gzip,deflate'),
                        ('Accept-Charset', 'cp949,utf-8;q=0.7,*;q=0.7'),
                        ('Keep-Alive', '300'),
                        ('Connection', 'keep-alive') ]

    def _readBody (self, response):
        """Read the whole body of response, close it, and undo any Content-Encoding.

        The request headers advertise gzip and deflate, and the opener does
        not decompress on its own, so a compressed body has to be inflated
        here before the callers can run text searches on it.

        Returns the (decompressed) body.  Raises zlib.error when the body
        does not match the announced encoding.
        """
        import zlib  # local import: only this helper needs it

        try:
            body = response.read()
            encoding = response.info().get('Content-Encoding')
        finally:
            # Always release the connection, even when reading fails.
            response.close()

        if not body or not encoding:
            return body

        encoding = encoding.strip().lower()
        if encoding in ('gzip', 'x-gzip'):
            # 16 + MAX_WBITS tells zlib to expect a gzip header and trailer.
            return zlib.decompress(body, 16 + zlib.MAX_WBITS)
        if encoding == 'deflate':
            try:
                return zlib.decompress(body)
            except zlib.error:
                # Fall back to a raw deflate stream without the zlib header.
                return zlib.decompress(body, -zlib.MAX_WBITS)
        return body

    def HTTPPostContents( self, sURL, sParams ,Referer = None):
        """POST the mapping sParams (form-urlencoded) to sURL and return the response body.

        When Referer is given it replaces the current Referer header first.
        """
        sParams = urllib.urlencode( sParams )
        if Referer:
            self.setReferer( Referer)

        return self._readBody(self.opener.open( sURL ,sParams))

    def HTTPGetContents( self, sURL ,Referer = None):
        """GET sURL and return the response body.

        When Referer is given it replaces the current Referer header first.
        """
        if Referer:
            self.setReferer( Referer)
        return self._readBody(self.opener.open( sURL ))

    def removeReferer ( self ):
        """Drop the Referer header from the opener's default headers, if present."""
        headers = self.opener.addheaders
        # Rebuild in place (same list object) instead of removing entries
        # from the list while iterating over it.
        headers[:] = [header for header in headers if header[0] != 'Referer']

    def setReferer (self, Referer):
        """Make Referer the only Referer header sent by subsequent requests."""
        self.removeReferer()
        self.opener.addheaders.append( ('Referer', Referer) )



class egloosGrabber(webConnecter):
    """Scrapes categories, posts and comments from an egloos blog.

    Pages are searched as raw byte strings with regular expressions; matched
    text is decoded with WWWENCODING only when Post / Comment objects are
    built.  Login() must be called first because it sets USERHOST, the blog
    URL every other method works against.
    """

    def __init__ (self, host=None, loginurl=None):
        """Set up the HTTP opener and the site URLs.

        host     -- site root URL; defaults to 'http://www.egloos.com'.
        loginurl -- login form target; defaults to 'http://www.egloos.com/authid.asp'.
        """
        webConnecter.__init__(self)
        if host is None:
            host = 'http://www.egloos.com'
        if loginurl is None:
            loginurl = 'http://www.egloos.com/authid.asp'
        self.HOST = host
        self.LOGINURL = loginurl

    def Login (self, sMyUrl, USERID, PASSWD ):
        """Fetch the site root, post the credentials to LOGINURL, and remember sMyUrl as USERHOST.

        The login response is not inspected, so a rejected login is not
        reported here.
        """
        print(self.HOST)
        self.HTTPGetContents(self.HOST, self.HOST)
        sParams = {'userid': USERID, 'userpwd': PASSWD }
        print(self.LOGINURL)
        self.HTTPPostContents(self.LOGINURL, sParams, self.HOST)
        self.USERHOST = sMyUrl

    def GetCategories (self):
        """Return a dict mapping category name to category URL path ('/l<digits>').

        Names are decoded to unicode, except that the site's "unclassified"
        entry is stored under the plain key 'Uncategorized' and its "all"
        entry is skipped.  Returns an empty dict when the category section is
        not found on the blog front page.
        """
        categories = {}
        url = self.USERHOST
        print(url)
        content = self.HTTPGetContents(url)
        # \xc4\xab\xc5\xd7\xb0\xed\xb8\xae is the Korean heading "category"
        cate1_pat = re.compile(r'<DIV CLASS=[A-Z]+>\xc4\xab\xc5\xd7\xb0\xed\xb8\xae</DIV>\s*<DIV CLASS=[A-Z]+>(.+?)</DIV>',re.I)
        cate2_pat = re.compile(r'<DIV CLASS=[A-Z]+>\xc4\xab\xc5\xd7\xb0\xed\xb8\xae</DIV>\s*(.+?)\s*<DIV',re.I)
        # USERHOST is escaped so that its dots only match literal dots.
        cate_dep2_pat = re.compile(r'<A HREF='+re.escape(self.USERHOST)+r'(?P<category_url>/l\d+)>(?P<category_name>.+?)</A><BR>',re.I)

        m = cate1_pat.search(content)
        if m is None:
            m = cate2_pat.search(content)
        if m is None:
            # Neither layout matched: there is nothing to list.
            return categories

        for cate in cate_dep2_pat.finditer(m.group(1)):
            name = cate.group('category_name')
            if name is None or name == '\xc0\xfc\xc3\xbc': # the "all" entry
                continue
            if name == '\xb9\xcc\xba\xd0\xb7\xf9': # the "unclassified" entry
                categories[ 'Uncategorized' ] = cate.group('category_url')
            else:
                categories[ name.decode(WWWENCODING) ] = cate.group('category_url')

        return categories

    def _ParseComments (self, page, pattern, linked):
        """Return a list of Comment objects, one per match of pattern in page.

        linked selects the group layout of pattern: true for
        (icon, author url, author, date, body), false for
        (icon, author, date, body), in which case the author URL is ''.
        A comment is marked approved ('1') unless its icon name contains
        'secu', in which case it is marked '0'.
        """
        found = []
        for m in pattern.finditer(page):
            if linked:
                icon, author_url, author, stamp, body = m.groups()
            else:
                icon, author, stamp, body = m.groups()
                author_url = ''

            approved = '0'
            if icon.find('secu') == -1 :
                approved = '1'

            found.append(Comment(author.decode(WWWENCODING)
                                ,author_url.decode(WWWENCODING)
                                ,time.strptime( stamp,"%Y-%m-%d %H:%M")
                                ,unescape(body.replace('<BR>','\n')).decode(WWWENCODING)
                                ,approved))
        return found

    def GetPost( self, uid ):
        """Fetch post uid (a string of digits) and return it as a Post.

        The returned Post carries unicode title/author/content/category, a
        time.struct_time date, the Image list extracted from the body (each
        image tag is replaced by a '[#IMAGE<n>]' placeholder, n starting at
        1) and the date-sorted list of comments.

        Raises ValueError when the post block, its title, or its author/date
        line cannot be found in the page.
        """
        url = self.USERHOST + '/' + uid
        print(url)
        content1 = self.HTTPGetContents(url)

        # Escaped so that the dots of the host name only match literal dots.
        userhost = re.escape(self.USERHOST)

        post_pat = re.compile(r'<DIV CLASS=POST>\s*(.+?)\s*</DIV>.+?<DIV CLASS=POST_BODY>\s*(.*?)\s*</DIV>\s*<DIV CLASS=POST_TAIL.*?>(.+?)</DIV>',re.S | re.I)
        subject_pat = re.compile(r'<A NAME=(\d+)>(.+?)</A>',re.I)
        s_subject_pat = re.compile(r'<img src=http://md.egloos.com/img/eg/post_security.gif WIDTH=13 HEIGHT=16 ALIGN=ABSMIDDLE> <A NAME=(\d+)>(.+?)</A>' ,re.I)
        author_time1_pat = re.compile(r'by <SPAN CLASS=AUTHOR>(.+?)</SPAN> \| <A HREF='+userhost+r'/\d+/>(\d{4}-\d{2}-\d{2} \d{2}:\d{2})</A></SPAN> \| <A HREF='+userhost+r'/i\d+/>(.+?)</A>',re.I)
        author_time2_pat = re.compile(r'by <SPAN CLASS=AUTHOR>(.+?)</SPAN> \| <A HREF='+userhost+r'/\d+/>(\d{4}-\d{2}-\d{2} \d{2}:\d{2})</A></SPAN> \| <A HREF=#trb OnClick="trbview.*?;"><B>',re.I)

        post = post_pat.search(content1)
        if post is None:
            raise ValueError('post markup not found in ' + url)
        head, body, tail = post.group(1), post.group(2), post.group(3)

        # Title and status.  The secured-title pattern must be tried first:
        # every secured title also matches the plain pattern, so testing the
        # plain one first would never mark a post as private.
        m = s_subject_pat.search(head)
        if m:
            post_status = 'private'
        else:
            m = subject_pat.search(head)
            post_status = 'publish'
        if m is None:
            raise ValueError('post title not found in ' + url)
        post_title = unescape(m.group(2))

        # Author, posting time and category.  The second layout has no
        # category link, so the category is left empty for it.
        m = author_time1_pat.search(tail)
        if m:
            post_category = m.group(3)
        else:
            m = author_time2_pat.search(tail)
            post_category = ''
        if m is None:
            raise ValueError('post author/date not found in ' + url)
        post_author = m.group(1)
        post_date = time.strptime( m.group(2),"%Y-%m-%d %H:%M")

        # Body text
        post_content = unescape(body.replace('<BR>',''))

        # Images: collect each image tag and leave a numbered placeholder in
        # its place, in a single pass over the content.
        img_pat = re.compile(r'<IMG CLASS=[A-Z_]+ SRC="(?P<src_url>http.+?)" BORDER=0 WIDTH=(?P<width>\d+) HEIGHT=(?P<height>\d+) .+?">',re.I)
        post_images = []

        def _placeholder (image):
            post_images.append( Image( '' ,image.group('width'),image.group('height'),image.group('src_url') ) )
            return '[#IMAGE'+str(len(post_images))+']'

        post_content = img_pat.sub(_placeholder, post_content)

        # "Continued" link becomes the <!--more--> marker
        more_pat = re.compile(r'<A HREF='+re.escape(uid)+r'_1>(.*?)</A>',re.I)
        post_content = more_pat.sub('<!--more-->',post_content)

        # Comments are reported open when the post tail contains the Korean
        # label for "comment" (\xb5\xa1\xb1\xdb).
        if tail.find('\xb5\xa1\xb1\xdb')  == -1 :
            comment_status = 'close'
        else:
            comment_status = 'open'

        # Pings are reported open when the post tail contains the Korean
        # label for "related post" (\xb0\xfc\xb7\xc3\xb1\xdb).
        if tail.find('\xb0\xfc\xb7\xc3\xb1\xdb') == -1  :
            ping_status = 'close'
        else:
            ping_status = 'open'

        # NOTE(review): existing comments are collected whatever comment_status
        # is.  The former guard `if comment_status:` was always true, so this
        # keeps the behaviour; confirm whether closed posts should be skipped.
        comment_pat         = re.compile(r'<DIV CLASS=COMMENT_TAIL><IMG SRC=http://md\.egloos\.com/img/eg/([a-z0-9_]+)\.gif\s+WIDTH=\d+ HEIGHT=\d+> Commented\s+by <B>(.+?)</B></A> at (\d{4}-\d{2}-\d{2} \d{2}:\d{2})\s*.<A HREF=.+?>x</A>\s*</DIV>\s*<DIV CLASS=COMMENT_BODY>\s*(.+?)\s*</DIV>',re.I)
        linked_comment_pat  = re.compile(r'<DIV CLASS=COMMENT_TAIL><IMG SRC=http://md\.egloos\.com/img/eg/([a-z0-9_]+)\.gif\s+WIDTH=\d+ HEIGHT=\d+> Commented\s+by\s+<A HREF=(.*?) TITLE=".*?"><B>(.+?)</B></A> at (\d{4}-\d{2}-\d{2} \d{2}:\d{2})\s+<A HREF=# OnClick="delComment.+?">x</A></DIV><DIV CLASS=COMMENT_BODY>\s*(.+?)\s*</DIV>',re.I)

        post_comments = self._ParseComments(content1, comment_pat, False)
        post_comments.extend(self._ParseComments(content1, linked_comment_pat, True))
        post_comments.sort()

        return Post( post_title.decode(WWWENCODING)
                    ,post_author.decode(WWWENCODING)
                    ,post_date
                    ,post_content.decode(WWWENCODING)
                    ,post_images
                    ,post_category.decode(WWWENCODING)
                    ,post_status
                    ,comment_status
                    ,post_comments
                    ,ping_status)

    def GetPostsCategory ( self, category_url ):
        """Return the list of Posts linked from the archive page at USERHOST + category_url.

        Returns an empty list when the page has no archive block.
        """
        url = self.USERHOST + category_url
        print(url)
        content = self.HTTPGetContents(url)

        posts = []

        list_pat = re.compile(r'<DIV CLASS=ARCHIVE_BODY.*?>\s*(?P<uidlist>.*?)\s*</DIV>',re.S | re.I)
        uid_find_pat = re.compile(r'<A HREF='+re.escape(self.USERHOST)+r'/(?P<uid>\d+)>.+?</A>',re.I)

        m = list_pat.search(content)
        if not m:
            return posts

        for uid in uid_find_pat.finditer( m.group('uidlist') ):
            posts.append(self.GetPost(uid.group('uid')))

        return posts

    def GetPostsMonth ( self , startDay ):
        """Return the list of Posts linked from the monthly page for startDay.

        startDay must provide strftime(); the page URL is
        USERHOST + '/mYYYY-MM-DD'.  Returns an empty list when the page has
        no post list block, like GetPostsCategory does.
        """
        url = self.USERHOST + startDay.strftime('/m%Y-%m-%d')
        print(url)
        content = self.HTTPGetContents(url)

        posts = []

        list_pat = re.compile(r'<DIV CLASS=POST_BODY.*?>\s*(?P<uidlist>.*?)\s*</DIV>',re.S | re.I)
        uid_find_pat = re.compile(r'<A HREF=#(?P<uid>\d+)>.+?</A>',re.I)

        m = list_pat.search(content)
        if not m:
            return posts

        for uid in uid_find_pat.finditer( m.group('uidlist') ):
            posts.append(self.GetPost(uid.group('uid')))

        return posts



class wpPoster(webConnecter):
    """Sends categories, posts, comments and images to a WordPress site.

    Everything except the login goes to POSTURL as a form post whose
    'action' field is one of 'addcat', 'post', 'comment' or 'uploadurl'.
    """

    def __init__ (self, host, posturl=None, loginurl=None):
        """Set up the HTTP opener and the target URLs.

        host     -- site root URL; None falls back to 'http://test.net'.
        posturl  -- import endpoint; defaults to HOST + '/wp-admin/egloos-import.php'.
        loginurl -- login endpoint; defaults to HOST + '/wp-login.php'.
        """
        webConnecter.__init__(self)
        if host is None:
            host = 'http://test.net'
        self.HOST = host

        if posturl is None:
            posturl = self.HOST + '/wp-admin/egloos-import.php'
        self.POSTURL = posturl

        if loginurl is None:
            loginurl = self.HOST + '/wp-login.php'
        self.LOGINURL = loginurl

    def Login (self, USERID, PASSWD, REDIRECT_TO = '/wp-admin/post.php'):
        """Post the credentials to LOGINURL.  The response is not inspected."""
        sParams = { 'action':'login'
                   ,'log': USERID
                   ,'pwd': PASSWD
                   ,'redirect_to' : REDIRECT_TO }
        self.HTTPPostContents(self.LOGINURL, sParams, self.HOST)

    def AddCategories ( self, categories ):
        """Call AddCategory for every name in categories."""
        for category in categories:
            self.AddCategory(category)

    def AddCategory ( self, category_name ):
        """Post an 'addcat' request using category_name as both name and description."""
        sParams = {'action':'addcat'
                  ,'cat_name': category_name.encode('utf-8')
                  ,'category_description': category_name.encode('utf-8')
                  ,'cat': '0'
                  }

        self.HTTPPostContents(self.POSTURL, sParams, self.POSTURL)

    def AddPosts ( self, posts ):
        """Call AddPost for every post, after printing how many there are."""
        print("Import %d posts to Wordpress" % len(posts))
        for post in posts:
            self.AddPost(post)

    def _DateParams ( self, date ):
        """Return the date form fields (edit_date, aa, mm, jj, hh, mn, ss) for date.

        date must expose tm_year, tm_mon, tm_mday, tm_hour, tm_min and tm_sec,
        as time.struct_time does.
        """
        return {'edit_date' : '1'
               ,'aa': date.tm_year
               ,'mm': date.tm_mon
               ,'jj': date.tm_mday
               ,'hh': date.tm_hour
               ,'mn': date.tm_min
               ,'ss': date.tm_sec }

    def AddPost ( self , post):
        """Upload the images of post, then send the post and each of its comments.

        Every '[#IMAGE<n>]' placeholder in post.content is replaced by the
        response of Upload() for post.images[n-1]; post.content is modified
        in place.  The response body of the 'post' request is passed on
        unchanged as 'comment_post_ID' for every comment.
        """
        if len(post.images) > 0:
            image_pat = re.compile(r'\[#IMAGE(?P<img_num>\d+)\]')

            def _uploaded (image_m):
                img_num = int(image_m.group('img_num')) - 1
                print(img_num)
                if img_num < 0 or img_num >= len(post.images):
                    # Not one of our placeholders (no such image): keep the text.
                    return image_m.group(0)
                # NOTE(review): the upload response is inserted as returned;
                # confirm it is ASCII or otherwise compatible with post.content.
                return self.Upload(post.images[img_num])

            # A callable replacement inserts the upload response literally;
            # passing it as a template would make re interpret backslashes
            # and group references inside the returned markup.
            post.content = image_pat.sub(_uploaded, post.content)

        print(post.content)

        sParams = {'action':'post'
                  ,'post_title': post.title.encode('utf-8')
                  ,'post_author': post.author.encode('utf-8')
                  ,'content': post.content.encode('utf-8')
                  ,'post_cat_name': post.category.encode('utf-8')
                  ,'post_status': post.status
                  ,'comment_status': post.comment_status
                  ,'ping_status': post.ping_status
                  }
        sParams.update(self._DateParams(post.date))

        comment_post_ID = self.HTTPPostContents(self.POSTURL, sParams, self.POSTURL)

        for comment in post.comments:
            sParams = {'action':'comment'
                      ,'comment_post_ID':comment_post_ID
                      ,'author':comment.author.encode('utf-8')
                      ,'email':''
                      ,'url':comment.author_url.encode('utf-8')
                      ,'comment':comment.content.encode('utf-8')
                      ,'approved':comment.approved
                      }
            sParams.update(self._DateParams(comment.date))

            self.HTTPPostContents(self.POSTURL, sParams, self.POSTURL)

    def Upload ( self, image ):
        """Post an 'uploadurl' request for image and return the response body."""
        sParams = {'action':'uploadurl'
                  ,'img1': image.src_url
                  ,'imgdesc': image.desc
                  ,'width': image.width
                  ,'height': image.height
                  }

        return self.HTTPPostContents(self.POSTURL, sParams, self.POSTURL)

## Environment settings
# egloos account and blog to export from
G_USERID = 'skyisle'
G_PASSWD = 'xxx'
G_MYURL = 'http://abouts.egloos.com'

# WordPress account and site to import into
WP_USERNAME = 'admin'
WP_PASSWORD = 'xxx'
WP_HOST = 'http://test.net'

if __name__ == '__main__':
    # Step 1: log in to egloos and collect every post of every category.
    e = egloosGrabber()
    e.Login( G_MYURL, G_USERID , G_PASSWD)
    categoriesDict = e.GetCategories()

    posts = []

    #posts.extend(e.GetPost('1038493'))

    for category_url in categoriesDict.values():
        posts.extend( e.GetPostsCategory(category_url) )

    # The two string blocks below are inactive snippets for saving the
    # collected posts to a pickle file and for loading them back.
    '''import cPickle

    f = open(G_USERID + '.txt','w')
    cPickle.dump(posts,f)
    f.close()

    print "All Post in " + e.USERHOST + " saveed in "+ G_USERID + ".txt"
    '''

    '''import cPickle
    f = open(G_USERID + '.txt','r')
    posts = cPickle.load(f)
    f.close()'''

    # Step 2: log in to WordPress, create the categories, then send the
    # posts sorted by date.
    wp= wpPoster(WP_HOST)
    wp.Login(WP_USERNAME,WP_PASSWORD)
    wp.AddCategories(categoriesDict.keys())
    posts.sort()
    wp.AddPosts(posts)
