python3 « 我的天

2020-06

BlogBus 2 WordPress – by xrspook

By xrspook @ 19:50:07 归类于: 扮IT

为什么要学习python？因为我见识过python有多牛逼，简单一个脚本文件，轻量级的东西实现强大的功能。因为要做XML文件的格式转换，所以我觉得我要学好python。Think Python 2看到第14章，我就转向去研究10年多以前网友写的BlogBus转WordPress的python脚本。之所以要研究，因为当时的WordPress格式和现在不一样，用以前的脚本转换出来的东西已经没办法直接导入到现在的WordPress里了。再去找写代码的那些人，有些网站还在，但有些已经消失了。我不能等待别人拯救我，我只能自己拯救自己。

我需要转换格式的是“回到过去——Betty迷的独白”和“Mi Internacional Cielo”，这两个旧BlogBus站点和我的主站“我的天”不一样，虽然里面有不少我原创的东西，但我从四面八方搜集回来的内容也不少。当时的BlogBus默认编辑界面是富文本，我看上去觉得格式没问题大概就可以了，但实际上格式是有问题的。从五湖四海搜集回来的文字里面怎么可能不夹杂各种格式，那些东西在富文本编辑下可能看不出来，但在源代码界面一团糟。如果当年我复制粘贴的时候有先去记事本过渡一下就不会有那么多的问题。所以除了要转换BlogBus和WordPress的标签以外我还要筛选删除那些坏事的源代码。

经过接近一周的努力，我终于整出来了。运行下面的python3脚本，如果能顺利完成，自动生成出新的XML文件，用官方途径导入WordPress 5.4.2是完全没有问题的，但我只测试了我自己的两个blog，是不是兼容其它我不知道。因为转换blog我是有自己的想法的，所以脚本中有一些个性化的东西，比如我把blog的标题变成了分类，把原来的分类变成了标签。脚本中有大量的反转义替换，主要是为了人去看CDATA的时候不太头晕迷糊，因为我那两个旧blog里有大量的西班牙语字符，不同的编码下，BlogBus的导出文件里有些被转义了有些没有。那些转义了的放到WordPress里不知道WordPress会不会转回来，我试过标题被BlogBus转义之后WordPress不会转回来，看得我云里雾里。因为转义很头痛，所以除了少数几个内容，有可能被转义的文字都被我用CDATA包裹了起来。

脚本不是我一个人的功劳，我只是在当年网友脚本的基础上做了调整，使之适配python3和WordPress 5.4.2。

我的脚本：xbus2wp.py （PS:下面脚本330行的《/p》是什么鬼怪！WordPress的脚本插件在搞什么！）

'''
***使用说明***
终端界面输入xbus2wp.py bus.xml xrspook。其中：
xbus2wp.py为脚本名字，bus.xml为BlogBus导出文件，xrspook为博主名字，3个参数以空格分开
若运行无误，输出的文件名为[原文件名_xbus2wp.xml]
脚本基于python3，适配WordPress 5.4.2（2020-06-18）
'''
 
import re, sys, getopt, datetime
from xml.dom import minidom
from time import time
 
def _build_item(dom, log, owner, busname_text):
    """Build the WXR ``<item>`` element for one BlogBus ``<Log>`` node.

    Child elements are appended in a fixed order: title, post type, pubDate,
    creator, the blog-name category, one ``post_tag`` category per tag, the
    content, post date, status and finally any comments.
    """
    title_text = getElementData(log.getElementsByTagName('Title')[0])
    content_text = getElementData(log.getElementsByTagName('Content')[0])
    pubdate = getElementData(log.getElementsByTagName('LogDate')[0])
    category = getElementData(log.getElementsByTagName('Sort')[0])
    tags = getElementData(log.getElementsByTagName('Tags')[0]).split(' ')
    # The new tag list merges the original BlogBus category with its tags.
    new_tags = unique_tag(category, tags)
    comments = log.getElementsByTagName('Comment')

    item = dom.createElement('item')
    item.appendChild(createElement(dom, 'title', title_text, 'cdata'))
    item.appendChild(createElement(dom, 'wp:post_type', 'post', 'cdata'))
    item.appendChild(createElement(dom, 'pubDate', convertPubDate(pubdate)))
    # The BlogBus <Writer> element carries no usable data, so the creator is
    # always the owner name supplied by the caller.
    item.appendChild(createElement(dom, 'dc:creator', owner, 'cdata'))

    # The BlogBus blog title becomes the category so that several old blogs
    # can be merged into one site and still be told apart.
    category_element = createElement(dom, 'category', busname_text, 'cdata')
    category_element.setAttribute('domain', 'category')
    category_element.setAttribute('nicename', busname_text)
    item.appendChild(category_element)

    for tag in new_tags:
        tag = tag.replace('&ntilde;', 'n')
        tag = tag.replace('summary_of_BLF', 'summary_of_BLF(from_rincondebetty)')
        tag = tag.replace('summary_of_EcoModa', 'summary_of_EcoModa(from_rincondebetty)')
        tag_element = createElement(dom, 'category', tag, 'cdata')
        tag_element.setAttribute('domain', 'post_tag')
        tag_element.setAttribute('nicename', tag)
        item.appendChild(tag_element)

    item.appendChild(createElement(dom, "content:encoded", content_text, 'cdata'))
    item.appendChild(createElement(dom, "wp:post_date", pubdate))
    item.appendChild(createElement(dom, "wp:status", 'publish'))

    if comments:
        for comment_element in createComments(dom, comments):
            item.appendChild(comment_element)

    return item


def convert(inputFileName, owner, order='asc'):
    """Convert a BlogBus XML export into a WordPress WXR file.

    The output is written to ``<filename>_xbus2wp.xml`` where ``filename`` is
    the module-level global set by ``main()``.

    Args:
        inputFileName: path of the BlogBus export to parse.
        owner: author name written into every item's ``dc:creator``.
        order: ``'asc'`` keeps the order of the ``<Log>`` nodes in the input;
            ``'desc'`` writes them in reverse.

    If the input cannot be parsed, the error is printed and the process exits
    with status 1.
    """
    global filename  # set by main(); only read here

    try:
        xmldoc = minidom.parse(inputFileName)
    except Exception as e:
        print('Fail.')
        print(e)
        print('Please repair or delete invalid token like "& < >" there.')
        sys.exit(1)

    bus = xmldoc.documentElement
    logs = bus.getElementsByTagName('Log')

    dom = minidom.Document()
    rss = dom.createElement('rss')  # root element
    dom.appendChild(rss)
    rss.setAttribute('version', '2.0')
    rss.setAttribute('xmlns:content', 'http://purl.org/rss/1.0/modules/content/')
    rss.setAttribute('xmlns:wfw', 'http://wellformedweb.org/CommentAPI/')
    rss.setAttribute('xmlns:dc', 'http://purl.org/dc/elements/1.1/')
    rss.setAttribute('xmlns:wp', 'http://wordpress.org/export/1.0/')
    channel = dom.createElement('channel')
    rss.appendChild(channel)
    # The WXR version stamp; per the author, the WordPress importer refuses
    # the file without it.
    wxr_version = dom.createElement('wp:wxr_version')
    channel.appendChild(wxr_version)
    wxr_version.appendChild(dom.createTextNode('1.1'))

    # Name of the original BlogBus blog, with spaces turned into underscores.
    busname = bus.getElementsByTagName('BlogName')[0]
    busname_text = getElementData(busname).replace(' ', '_')

    items = [_build_item(dom, log, owner, busname_text) for log in logs]
    if order == 'desc':
        items.reverse()
    for item in items:
        channel.appendChild(item)

    output = filename + '_xbus2wp.xml'
    import codecs
    # Binary mode plus an explicit UTF-8 stream writer keeps '\n' line endings
    # on every platform; the with-block closes the file even if writexml fails.
    with open(output, 'wb') as f:
        writer = codecs.getwriter('utf-8')(f)
        dom.writexml(writer, '', ' ' * 4, '\n', encoding='utf-8')
 
def unique_tag(category, tags):
    """Merge a category and a tag list into one ordered list of unique tags.

    Spaces in the category and in each tag are replaced by underscores. The
    category comes first, followed by the tags in their original order.
    Empty strings, the literal ``'(from_rincondebetty)'`` marker and
    duplicates (compared after normalisation) are dropped.

    Args:
        category: category name of the post.
        tags: list of tag strings; not modified.

    Returns:
        A new list of normalised, unique, non-empty tag names.
    """
    candidates = category.replace(' ', '_').split() + list(tags)
    unique = []
    for candidate in candidates:
        candidate = candidate.replace(' ', '_')
        # ''.split(' ') yields [''], and double spaces yield '' entries too;
        # those must not become (empty) tags in the output.
        if not candidate or candidate == '(from_rincondebetty)':
            continue
        if candidate not in unique:
            unique.append(candidate)
    return unique
 
def getElementData(element):
    """Return the concatenated data of an element's direct text/CDATA children.

    Child nodes of any other type (for example nested elements) are ignored.
    An element without such children yields an empty string.
    """
    return ''.join(
        child.data
        for child in element.childNodes
        if child.nodeType in (child.TEXT_NODE, child.CDATA_SECTION_NODE)
    )
 
def createComments(dom, comments):
    """Convert BlogBus ``Comment`` nodes into a list of ``wp:comment`` elements.

    Each comment gets a running number starting at 1, which is passed on as
    its comment id. According to the author, without such an id only the last
    comment of each post survives the WordPress import.
    """
    def first(node, tag_name):
        # First descendant element with the given tag name.
        return node.getElementsByTagName(tag_name)[0]

    return [
        createCommentElement(
            number,
            dom,
            first(node, 'Email'),
            first(node, 'HomePage'),
            first(node, 'NiceName'),
            first(node, 'CommentText'),
            first(node, 'CreateTime'),
        )
        for number, node in enumerate(comments, 1)
    ]
 
def createCommentElement(count, dom, email, homepage, name, content, date):
    """Build one ``wp:comment`` element from the nodes of a BlogBus comment.

    Args:
        count: number written (as text) into ``wp:comment_id``.
        dom: document used to create the new elements.
        email, homepage, name, content, date: source nodes whose text data
            supply the author e-mail, author URL, author name, comment body
            and timestamp.

    The e-mail and URL children are only included when they pass
    ``validateEmail`` / ``validateUrl``. The same timestamp is written to both
    ``wp:comment_date`` and ``wp:comment_date_gmt``, and every comment is
    marked as approved.
    """
    author = getElementData(name)
    author_email = getElementData(email)
    author_url = getElementData(homepage)
    posted = getElementData(date)
    body = getElementData(content)

    children = [
        createElement(dom, 'wp:comment_id', str(count)),
        createElement(dom, 'wp:comment_author', author),
    ]
    if validateEmail(author_email):
        children.append(createElement(dom, 'wp:comment_author_email', author_email))
    if validateUrl(author_url):
        children.append(createElement(dom, 'wp:comment_author_url', author_url))
    children.extend([
        createElement(dom, 'wp:comment_date', posted),
        createElement(dom, 'wp:comment_date_gmt', posted),
        createElement(dom, 'wp:comment_content', body, 'cdata'),
        createElement(dom, 'wp:comment_approved', '1'),
    ])

    wrapper = dom.createElement('wp:comment')
    for child in children:
        wrapper.appendChild(child)
    return wrapper
 
# Literal substitutions applied, in this exact order, to values that are
# stored as CDATA. Most of them undo HTML entity escaping so the CDATA stays
# readable; the table reflects the characters found in the author's blogs.
_CDATA_LITERAL_REPLACEMENTS = (
    ('&amp;', '&'),
    ('&lt;', '<'),
    ('&gt;', '>'),
    ('&apos;', '\''),
    ('&quot;', '"'),
    ('&copy;', ''),        # copyright sign (dropped)
    ('&nbsp;', ''),        # non-breaking space (dropped)
    ('&ldquo;', '“'),      # left double quote
    ('&rdquo;', '”'),      # right double quote
    ('&lsquo;', '‘'),      # left single quote
    ('&rsquo;', '’'),      # right single quote
    ('&acute;', '´'),      # acute accent
    ('&hellip;', '...'),   # ellipsis
    ('&mdash;', '—'),      # em dash
    ('&middot;', '·'),     # middle dot
    ('&deg;', '°'),        # degree sign
    ('&iexcl;', '¡'),      # inverted exclamation mark
    ('&iquest;', '¿'),     # inverted question mark
    ('&ntilde;', 'ñ'),
    ('&Ntilde;', 'Ñ'),
    ('&aacute;', 'á'),
    ('&eacute;', 'é'),
    ('&iacute;', 'í'),
    ('&oacute;', 'ó'),
    ('&uacute;', 'ú'),
    ('&Aacute;', 'Á'),
    ('&Eacute;', 'É'),
    ('&Iacute;', 'Í'),
    ('&Oacute;', 'Ó'),
    ('&Uacute;', 'Ú'),
    ('&Atilde;', 'Ã'),
    ('&ordf;', 'ª'),       # feminine ordinal indicator
    ('&ordm;', 'º'),       # masculine ordinal indicator
    ('<!--msnavigation-->', ''),
)

# Literal substitutions applied after the owner-name substitution.
_CDATA_MARKUP_REPLACEMENTS = (
    ('<i>', ''),
    ('</i>', ''),
    ('<br /><br />', '<br />'),
)

# Regex clean-ups applied last, in order; the final one removes every newline
# (including the blank lines left behind by the removals above).
_CDATA_REGEX_REPLACEMENTS = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r"(?:<\?xml.*?>)", ""),
        (r"(?:<[TDSFHI].*?>)", ""),
        (r"(?:<\/[TDSFHI].*?>)", ""),
        (r"(?:<P.*?>)", "<p>"),
        (r"(?:<(table|tbody|tr|td|div|span|img|script|font|hr|object|param).*?>)", ""),
        (r"(?:<\/(table|tbody|tr|td|div|span|img|script|font|object).*?>)", ""),
        (r"\n", ""),
    )
)


def createElement(dom, elementName, elementValue, type='text'):
    """Create an element named *elementName* holding *elementValue*.

    Args:
        dom: document used to create the nodes.
        elementName: tag name of the new element.
        elementValue: string content.
        type: ``'text'`` stores the value unchanged in a text node;
            ``'cdata'`` cleans the value (entity un-escaping, owner-name
            substitution, removal of unwanted markup and newlines) and stores
            it in a CDATA section.

    Returns:
        The new element with exactly one text or CDATA child.

    Raises:
        ValueError: if *type* is neither ``'text'`` nor ``'cdata'``.

    A CDATA section cannot contain ``]]>``. If the value contains it, either
    as given or after cleaning (e.g. ``]]&gt;`` un-escaped), the original,
    uncleaned value is stored in a text node instead.
    """
    tag = dom.createElement(elementName)
    if ']]>' in elementValue:
        type = 'text'
    if type == 'text':
        tag.appendChild(dom.createTextNode(elementValue))
        return tag
    if type != 'cdata':
        raise ValueError("type must be 'text' or 'cdata', got %r" % (type,))

    cleaned = elementValue
    for old, new in _CDATA_LITERAL_REPLACEMENTS:
        cleaned = cleaned.replace(old, new)

    # `owner` is a module-level global set by main(). When it has not been
    # set, the substitution is skipped instead of failing with NameError.
    owner_name = globals().get('owner')
    if owner_name is not None:
        cleaned = cleaned.replace('博主', owner_name)

    for old, new in _CDATA_MARKUP_REPLACEMENTS:
        cleaned = cleaned.replace(old, new)
    for pattern, replacement in _CDATA_REGEX_REPLACEMENTS:
        cleaned = pattern.sub(replacement, cleaned)

    if ']]>' in cleaned:
        # Un-escaping produced the CDATA terminator; writing it would make
        # minidom raise, so fall back to a text node with the raw value.
        tag.appendChild(dom.createTextNode(elementValue))
    else:
        tag.appendChild(dom.createCDATASection(cleaned))
    return tag
 
def convertPubDate(date, timediff='+0000'):
    """Convert a ``YYYY-MM-DD HH:MM:SS`` timestamp into an RFC 822 date string.

    Args:
        date: timestamp such as ``'2003-08-22 16:01:56'``.
        timediff: time-zone offset appended verbatim to the result.

    Returns:
        A string such as ``'Fri, 22 Aug 2003 16:01:56 +0000'``. The time part
        is copied unchanged from the input.

    Raises:
        ValueError: if the date part is not a valid calendar date.

    >>> convertPubDate('2003-08-22 16:01:56')
    'Fri, 22 Aug 2003 16:01:56 +0000'
    """
    # Fixed English abbreviations: RFC 822 requires these exact names, so
    # locale-dependent strftime output must not be used.
    weekdays = ('Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun')
    months = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
              'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')

    year, mon, day = int(date[:4]), int(date[5:7]), int(date[8:10])
    clock = date[11:]
    weekday = weekdays[datetime.date(year, mon, day).weekday()]
    return "%s, %d %s %s %s %s" % (weekday, day, months[mon - 1], year, clock, timediff)
 
def validateEmail(email):
    """Return True if *email* matches the script's e-mail pattern.

    The pattern accepts lowercase letters, digits and a few punctuation
    characters only: a local part of up to 32 characters, one to four domain
    labels of at least two characters each, and a 2-4 letter final label.
    """
    pattern = r'^[0-9a-z][_.0-9a-z-]{0,31}@([0-9a-z][0-9a-z-]{0,30}[0-9a-z]\.){1,4}[a-z]{2,4}$'
    return re.match(pattern, email) is not None
 
def validateUrl(url):
    """Return True if *url* looks like ``scheme://host[:port][/path][?query]``.

    The scheme must consist of ASCII letters only. The host is one or more
    dot-separated labels made of word characters and inner hyphens. An
    optional numeric port, path and query string are accepted, so ordinary
    homepage addresses such as ``http://example.com/blog/`` are valid.
    """
    pattern = (
        r'^[a-zA-Z]+://'                 # scheme (letters only)
        r'(\w+(-\w+)*)(\.(\w+(-\w+)*))*'  # host labels
        r'(:\d+)?'                       # optional port
        r'(/\S*)?'                       # optional path
        r'(\?\S*)?$'                     # optional query
    )
    return re.match(pattern, url) is not None
 
def main(argv=None):
    """Command-line entry point: ``xbus2wp.py <export.xml> <owner-name>``.

    Args:
        argv: full argument vector including the program name; defaults to
            ``sys.argv``.

    Returns:
        0 after a conversion, 2 when the number of arguments is wrong.

    Sets the module-level globals ``filename`` (input path without a trailing
    ``.xml``) and ``owner`` (author name), which ``convert`` and
    ``createElement`` read.
    """
    global filename
    global owner

    if argv is None:
        argv = sys.argv
    args = argv[1:]
    order = 'asc'

    if len(args) != 2:
        print('Usage: xbus2wp.py <blogbus-export.xml> <owner-name>', file=sys.stderr)
        return 2

    source = args[0]
    print('Converting...', end=' ')
    sys.stdout.flush()
    start = time()
    # Strip only a trailing '.xml'; a '.xml' elsewhere in the path is kept.
    filename = source[:-len('.xml')] if source.endswith('.xml') else source
    # BlogBus does not export the blog owner's name, so it is given by hand.
    owner = args[1]
    convert(source, owner, order)
    end = time()
    print('Done. Elapse %g seconds.' % (end - start))
    return 0
 
# Run the converter only when this file is executed as a script; the value
# returned by main() is passed to sys.exit() as the process exit status.
if __name__ == "__main__":
    sys.exit(main())
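# (A stray "</p>" inserted here by the blog's code plugin has been removed; it is not part of the script.)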

''' ***使用说明*** 终端界面输入xbus2wp.py bus.xml xrspook。其中： xbus2wp.py为脚本名字，bus.xml为BlogBus导出文件，xrspook为博主名字，3个参数以空格分开若运行无误，输出的文件名为[原文件名_xbus2wp.xml] 脚本基于python3，适配WordPress 5.4.2（2020-06-18） ''' import re, sys, getopt, datetime from xml.dom import minidom from time import time def convert(inputFileName, owner, order='asc'): """""" try: xmldoc = minidom.parse(inputFileName) except Exception as e: print ('Fail.') print (e) print ('Please repair or delete invalid token like "& < >" there.') sys.exit(1) bus = xmldoc.documentElement logs = bus.getElementsByTagName('Log') dom = minidom.Document() rss = dom.createElement('rss') # rss是root，根元素 dom.appendChild(rss) rss.setAttribute('version', '2.0') rss.setAttribute('xmlns:content', 'http://purl.org/rss/1.0/modules/content/') rss.setAttribute('xmlns:wfw', 'http://wellformedweb.org/CommentAPI/') rss.setAttribute('xmlns:dc', 'http://purl.org/dc/elements/1.1/') rss.setAttribute('xmlns:wp', 'http://wordpress.org/export/1.0/') channel = dom.createElement('channel') rss.appendChild(channel) wxr_version = dom.createElement('wp:wxr_version') # 加入wxr戳，无戳无法进行WordPress导入 channel.appendChild(wxr_version) wxr_version_node = dom.createTextNode('1.1') wxr_version.appendChild(wxr_version_node) busname = bus.getElementsByTagName('BlogName')[0] # 提取原BlogBus名字 busname_text = getElementData(busname).replace(' ', '_') # create a list to contain items instead of appending them to # channel directly in order to sort them of lately according to order. 
if order == 'desc': item_list = [] else: item_list = None for log in logs: title = log.getElementsByTagName('Title')[0] title_text = getElementData(title) content = log.getElementsByTagName('Content')[0] content_text = getElementData(content) logdate = log.getElementsByTagName('LogDate')[0] pubdate = getElementData(logdate) writer = log.getElementsByTagName('Writer')[0] creator = owner # BlogBus的writer根本没包含元素！ category = getElementData(log.getElementsByTagName('Sort')[0]) tagi = log.getElementsByTagName('Tags')[0] tags = getElementData(tagi).split(' ') new_tags = unique_tag(category, tags) # 新的wp标签里包含了原BlogBus里的分类与标签 comments = log.getElementsByTagName('Comment') #----- item = dom.createElement('item') # handle title title_element = createElement(dom, 'title', title_text, 'cdata') item.appendChild(title_element) # handle type type_element = createElement(dom, 'wp:post_type', 'post', 'cdata') item.appendChild(type_element) # handle pubdate pubdate_element = createElement(dom, 'pubDate', convertPubDate(pubdate)) item.appendChild(pubdate_element) # handle creator creator_element = createElement(dom, 'dc:creator', creator, 'cdata') item.appendChild(creator_element) # handle categories with domain category_element = createElement(dom, 'category', busname_text, 'cdata') # 把BlogBus标题设置为分类，因为我要合并多个旧blog category_element.setAttribute('domain','category') category_element.setAttribute('nicename', busname_text) item.appendChild(category_element) # handle tags for tag in new_tags: tag = tag.replace('ñ', 'n') tag = tag.replace('summary_of_BLF', 'summary_of_BLF(from_rincondebetty)') tag = tag.replace('summary_of_EcoModa', 'summary_of_EcoModa(from_rincondebetty)') category_element = createElement(dom, 'category', tag, 'cdata') category_element.setAttribute('domain','post_tag') category_element.setAttribute('nicename', tag) item.appendChild(category_element) # handle content content_element = createElement(dom, "content:encoded", content_text, 'cdata') 
item.appendChild(content_element) # handle post_date post_date_element = createElement(dom, "wp:post_date", pubdate) item.appendChild(post_date_element) # handle status status_element = createElement(dom, "wp:status", 'publish') item.appendChild(status_element) # handle comments if comments: commentElements = createComments(dom, comments) for commentElement in commentElements: item.appendChild(commentElement) if item_list != None: item_list.append(item) else: channel.appendChild(item) if item_list: item_list.reverse() for m in item_list: channel.appendChild(m) global filename # 输出设置 output = filename + '_xbus2wp.xml' f = open(output ,'wb+') import codecs writer = codecs.lookup('utf-8')[3](f) dom.writexml(writer, '', ' ' * 4, '\n', encoding='utf-8') writer.close() def unique_tag(category,tags): # 只保留唯一的标签 category = category.replace(' ', '_') l = category.split() + tags new_l = [] for item in l: if item not in new_l and item != '(from_rincondebetty)': new_l.append(item.replace(' ', '_')) # 替换空格为下划线 return new_l def getElementData(element): # 获取节点数据 """""" data = '' for node in element.childNodes: if node.nodeType in (node.TEXT_NODE, node.CDATA_SECTION_NODE): data += node.data return data def createComments(dom, comments): """""" l = [] count = 0 for comment in comments: count += 1 # 每篇文章的评论序号，没有序号，评论只能导入每篇最后一条 email = comment.getElementsByTagName('Email')[0] homepage = comment.getElementsByTagName('HomePage')[0] name = comment.getElementsByTagName('NiceName')[0] content = comment.getElementsByTagName('CommentText')[0] date = comment.getElementsByTagName('CreateTime')[0] comment_element = createCommentElement(count, dom, email, homepage, name, content, date) l.append(comment_element) return l def createCommentElement(count, dom, email, homepage, name, content, date): """""" comment_author = getElementData(name) comment_author_email = getElementData(email) comment_author_url = getElementData(homepage) comment_date = getElementData(date) comment_content = 
getElementData(content) comment_id_element = createElement(dom, 'wp:comment_id', str(count)) comment_author_element = createElement(dom, 'wp:comment_author', comment_author) comment_author_email_element = createElement(dom, 'wp:comment_author_email', comment_author_email) comment_author_url_element = createElement(dom, 'wp:comment_author_url', comment_author_url) comment_date_element = createElement(dom, 'wp:comment_date', comment_date) comment_date_gmt_element = createElement(dom, 'wp:comment_date_gmt', comment_date) comment_content_element = createElement(dom, 'wp:comment_content', comment_content, 'cdata') comment_approved_element = createElement(dom, 'wp:comment_approved', '1') # make the comment element comment_element = dom.createElement('wp:comment') comment_element.appendChild(comment_id_element) comment_element.appendChild(comment_author_element) # validate email and url validEmail = validateEmail(comment_author_email) if (validEmail): comment_element.appendChild(comment_author_email_element) validUrl = validateUrl(comment_author_url) if (validUrl): comment_element.appendChild(comment_author_url_element) comment_element.appendChild(comment_date_element) comment_element.appendChild(comment_date_gmt_element) comment_element.appendChild(comment_content_element) comment_element.appendChild(comment_approved_element) return comment_element def createElement(dom, elementName, elementValue, type='text'): #建立节点标签和节点 """""" global owner tag = dom.createElement(elementName) if elementValue.find(']]>') > -1: type = 'text' if type == 'text': text = dom.createTextNode(elementValue) elif type == 'cdata': elementValue = elementValue.replace('&', '&') elementValue = elementValue.replace('<', '<') elementValue = elementValue.replace('>', '>') elementValue = elementValue.replace(''', '\'') elementValue = elementValue.replace('"', '"') # 大量替换与我的旧blog有各种编码的西班牙语字符有关 elementValue = elementValue.replace('©', '') # 版权标志 elementValue = elementValue.replace(' ', '') # 空格 
elementValue = elementValue.replace('“', '“') # 左双引号 elementValue = elementValue.replace('”', '”') # 右双引号 elementValue = elementValue.replace('‘', '‘') # 左单引号 elementValue = elementValue.replace('’', '’') # 右单引号 elementValue = elementValue.replace('´', '´') # 单引号 elementValue = elementValue.replace('…', '...') # 省略号 elementValue = elementValue.replace('—', '—') # 破折号 elementValue = elementValue.replace('·', '·') # 分隔号 elementValue = elementValue.replace('°', '°') # 单位度 elementValue = elementValue.replace('¡', '¡') # 西班牙语反叹号 elementValue = elementValue.replace('¿', '¿') # 西班牙语反问号 elementValue = elementValue.replace('ñ', 'ñ') # 西班牙语n elementValue = elementValue.replace('Ñ', 'Ñ') # 西班牙语N elementValue = elementValue.replace('á', 'á') # 西班牙语a elementValue = elementValue.replace('é', 'é') # 西班牙语e elementValue = elementValue.replace('í', 'í') # 西班牙语i elementValue = elementValue.replace('ó', 'ó') # 西班牙语o elementValue = elementValue.replace('ú', 'ú') # 西班牙语u elementValue = elementValue.replace('Á', 'Á') # 西班牙语A elementValue = elementValue.replace('É', 'É') # 西班牙语E elementValue = elementValue.replace('Í', 'Í') # 西班牙语I elementValue = elementValue.replace('Ó', 'Ó') # 西班牙语O elementValue = elementValue.replace('Ú', 'Ú') # 西班牙语U elementValue = elementValue.replace('Ã', 'Ã') # 西班牙语A~ elementValue = elementValue.replace('ª', 'ª') # 西班牙语上标a elementValue = elementValue.replace('º', 'º') # 西班牙语上标o elementValue = elementValue.replace('', '') elementValue = elementValue.replace('博主', owner) elementValue = elementValue.replace('', '') elementValue = elementValue.replace('', '') elementValue = elementValue.replace(' ', ' ') elementValue = re.sub(r"(?:<\?xml.*?>)", "", elementValue) elementValue = re.sub(r"(?:<[TDSFHI].*?>)", "", elementValue) elementValue = re.sub(r"(?:<\/[TDSFHI].*?>)", "", elementValue) elementValue = re.sub(r"(?:<P.*?>)", "", elementValue) elementValue = re.sub(r"(?:<(table|tbody|tr|td|div|span|img|script|font|hr|object|param).*?>)", "", elementValue) elementValue = 
re.sub(r"(?:<\/(table|tbody|tr|td|div|span|img|script|font|object).*?>)", "", elementValue) elementValue = re.sub(r"\n", "", elementValue) # 把替换造成的空行删除 text = dom.createCDATASection(elementValue) tag.appendChild(text) return tag def convertPubDate(date, timediff='+0000'): """ convert 2003-08-22 16:01:56 to Thu, 23 Aug 2007 05:47:54 +0000 """ year, mon, day = int(date[:4]), int(date[5:7]), int(date[8:10]) time = date[11:] aday = datetime.datetime(year, mon, day) d = {'1':'Mon', '2':'Tus', '3':'Wen', '4':'Thur', '5':'Fri', '6':'Sat', '7':'Sun'} m = {'1':'Jan', '2':'Feb', '3':'Mar', '4':'Apr', '5':'May', '6':'Jun', '7':'Jul', '8':'Aug', '9':'Sep', '10':'Oct', '11':'Nov', '12':'Dec'} weekday = d[str(aday.isoweekday())] month = m[str(mon)] pubdate = "%s, %d %s %s %s %s" % (weekday, day, month, year, time, timediff) return pubdate def validateEmail(email): ''' ''' pattern = r'^[0-9a-z][_.0-9a-z-]{0,31}@([0-9a-z][0-9a-z-]{0,30}[0-9a-z]\.){1,4}[a-z]{2,4}$' p = re.compile(pattern) m = p.match(email) if m: return True else: return False def validateUrl(url): ''' ''' pattern = r'^[a-zA-z]+://(\w+(-\w+)*)(\.(\w+(-\w+)*))*(\?\S*)?$' p = re.compile(pattern) m = p.match(url) if m: return True else: return False def main(argv=None): global filename global owner if argv is None: argv = sys.argv # parse command line options args = sys.argv[1:] order='asc' if (len(args) == 2): print ('Converting...'), sys.stdout.flush() start = time() filename = args[0].replace('.xml', '') owner = args[1] # BlogBus没把博主名字输出，只能手动 convert(args[0], args[1], order) end = time() print ('Done. Elapse %g seconds.' % (end - start)) if __name__ == "__main__": sys.exit(main())

标签：blogbus, python3, wordpress, xml, 扮IT, 脚本, 转换

评论关闭

我的天

BlogBus 2 WordPress – by xrspook

戳这只鬼

随机日志

我的天

BlogBus 2 WordPress – by xrspook

戳这只鬼

标签云了

随机日志