# Source: http://blogsome-forum.blogsome.com/view ... de94d21fa8
# Forum note (translated from Spanish): "Google found this for me, but I have not tried it."
# BlogSome to WordPress parser. Builds a WordPress export file from a set of
# plain-text post files exported from your BlogSome blog using ecto.
#
# Usage: ./byTowp.py [directoryWhereYouHaveYourPostsInPlainText] [XMLfileForTheOutput]
#
#Copyright (C) <year> <name of author>
#
#This program is free software: you can redistribute it and/or modify
#it under the terms of the GNU General Public License as published by
#the Free Software Foundation, either version 3 of the License, or
#(at your option) any later version.
#
#This program is distributed in the hope that it will be useful,
#but WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#GNU General Public License for more details.
#
#You should have received a copy of the GNU General Public License
#along with this program. If not, see <http://www.gnu.org/licenses/>.
#!/usr/bin/env python
import sys
import getopt
import os
import re
from datetime import date
import codecs
class Parser:
    """Build a WordPress export file from plain-text BlogSome posts.

    Each source file is expected to hold one post made of header lines
    (``Title:``, ``Link:``, ``Date:``, ``Categories:``) and a ``Body:``
    marker line; the single line that follows the marker is taken as the
    post body.

    Typical use::

        p = Parser('out.xml', 'posts')
        p.setFilesReading(os.listdir('posts'))
        p.preparament()

    Attributes:
        file_dest: path of the XML file to write.
        dir: directory that contains the post files.
        file_srcs: names of the post files inside ``dir``.
        categories: distinct category names found, in discovery order.
        titles, links, short_dates, long_dates, post_categories, bodies:
            parallel lists with one entry per parsed post.
    """

    # Every post link has the old blog prefix replaced by the new one.
    old_base_url = 'http://xinkt.blogsome.com/'
    new_base_url = 'http://astalcaure.wordpress.com/'

    # English names are spelled out here so that the RFC 822 style dates do
    # not depend on the process locale.
    _day_names = ('Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun')
    _month_names = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                    'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')

    # Header detection.  Only the start of the line is matched.
    _title_re = re.compile(r'\s*Title:')
    _link_re = re.compile(r'\s*Link:\s\S*')
    _date_re = re.compile(r'\s*Date:\s\S*\s\S*')
    _categ_re = re.compile(r'\s*Categories:')
    _body_re = re.compile(r'\s*Body:')

    # A run of characters allowed inside a category name: word characters,
    # whitespace, "(", ")", "*", ".", "-" and the 0x80-0xff range.  Anything
    # else (":", "|", ",", ...) acts as a separator.
    _name_token_re = re.compile(r'[()*.\-\w\s\x80-\xff]+')

    _date_day_re = re.compile(r'[0-9/]+')
    _date_hour_re = re.compile(r'[0-9]+:[0-9]+:[0-9]+')

    # Channel preamble.  The blog title, link and dates are hard-coded for
    # the blog this script was written for.
    _channel_header = '\n'.join((
        '<rss version="2.0"',
        'xmlns:content="http://purl.org/rss/1.0/modules/content/"',
        'xmlns:wfw="http://wellformedweb.org/CommentAPI/"',
        'xmlns:dc="http://purl.org/dc/elements/1.1/"',
        'xmlns:wp="http://wordpress.org/export/1.0/"',
        '>',
        '<channel>',
        '<title>A Estocolm ja estariem</title>',
        '<link>http://astalcaure.wordpress.com</link>',
        '<description>Just another WordPress.com weblog</description>',
        '<pubDate>Mon, 13 Aug 2007 16:35:53 +0000</pubDate>',
        '<generator>http://wordpress.org/?v=MU</generator>',
        '<language>ca</language>',
        '',
    ))

    def __init__(self, filedestname, directory):
        """Remember the output file name and the directory holding the posts."""
        self.file_dest = filedestname
        self.dir = directory
        self.file_srcs = []
        # Created per instance: as class attributes these lists would be
        # shared (and keep growing) across every Parser object.
        self.categories = []
        self.titles = []
        self.links = []
        self.short_dates = []
        self.long_dates = []
        self.post_categories = []
        self.bodies = []

    def setFilesReading(self, filesrcnames):
        """Set the names (relative to ``self.dir``) of the post files to read."""
        self.file_srcs = filesrcnames

    def preparament(self):
        """Parse every post file and write the export to ``self.file_dest``.

        All parsing is done before the output file is opened, so a failed
        run does not leave a truncated export behind.  Exits the process
        with status 1 (``SystemExit``) when the posts do not all provide a
        title, link, date, category line and body.
        """
        # find the categories and collect all the data from the posts
        self.findCategories()
        self.getPostData()

        fields = (self.titles, self.bodies, self.links, self.short_dates,
                  self.long_dates, self.post_categories)
        # Every list must have exactly the same length; comparing an average
        # would let a missing field be masked by a duplicated one.
        if len(set(len(field) for field in fields)) != 1:
            print('Not all the posts have all the fields')
            sys.exit(1)
        print('Number of posts parsed: ' + str(len(self.titles)))

        with open(self.file_dest, 'w') as fd:
            fd.write(self._channel_header)
            for cat in self.categories:
                fd.write("<wp:category><wp:category_nicename>" + self._xmlText(cat) + "</wp:category_nicename><wp:category_parent></wp:category_parent><wp:posts_private>0</wp:posts_private><wp:links_private>0</wp:links_private><wp:cat_name>" + self._cdata(cat.capitalize()) + "</wp:cat_name></wp:category>\n")
            for post_id in range(len(self.titles)):
                fd.write(self._itemXml(post_id))
            # Close the elements opened by the channel header.
            fd.write('</channel>\n</rss>\n')

    def _xmlText(self, text):
        """Return ``text`` with ``&``, ``<`` and ``>`` escaped for XML."""
        from xml.sax.saxutils import escape
        return escape(text)

    def _cdata(self, text):
        """Wrap ``text`` in a CDATA section, splitting any ``]]>`` it contains."""
        return '<![CDATA[' + text.replace(']]>', ']]]]><![CDATA[>') + ']]>'

    def _itemXml(self, post_id):
        """Return the ``<item>`` element for the post at index ``post_id``."""
        title = self._xmlText(self.titles[post_id])
        link = self._xmlText(self.links[post_id])
        short_date = self.short_dates[post_id]
        pieces = [
            '\n<item>\n',
            '<title>', title, '</title>\n',
            '<link>', link, '</link>\n',
            '<pubDate>', self.long_dates[post_id], '</pubDate>\n',
            '<dc:creator>asticalcaure</dc:creator>\n',
        ]
        for name in self.post_categories[post_id]:
            pieces.extend(('\n<category>', self._cdata(name), '</category>\n'))
        pieces.extend((
            '\n<guid isPermaLink="false">', link, '</guid>\n',
            '<description></description>\n',
            '<content:encoded>', self._cdata(self.bodies[post_id]), '</content:encoded>\n',
            '<wp:post_id>', str(post_id), '</wp:post_id>\n',
            '<wp:post_date>', short_date, '</wp:post_date>\n',
            '<wp:post_date_gmt>', short_date, '</wp:post_date_gmt>\n',
            '<wp:comment_status>open</wp:comment_status>\n',
            '<wp:ping_status>open</wp:ping_status>\n',
            '<wp:post_name>', title, '</wp:post_name>\n',
            '<wp:status>publish</wp:status>\n',
            '<wp:post_parent>0</wp:post_parent>\n',
            '<wp:menu_order>0</wp:menu_order>\n',
            '<wp:post_type>post</wp:post_type>\n',
            '</item>\n',
        ))
        return ''.join(pieces)

    def _readPostLines(self, post):
        """Return the lines of one post file without their line endings.

        Returns None, after printing a message, when the file cannot be
        opened (for instance when the entry is a sub-directory).
        """
        # os.path.join works whether or not self.dir ends with a separator.
        path = os.path.join(self.dir, post)
        try:
            with open(path, 'r') as fs:
                return [line.strip('\r\n') for line in fs]
        except (IOError, OSError):
            print("Couldn't open the file " + path)
            return None

    def findCategories(self):
        """Collect the distinct category names of all posts in ``self.categories``.

        Unreadable files are reported and skipped.  The resulting list is
        printed once all files have been scanned.
        """
        for post in self.file_srcs:
            lines = self._readPostLines(post)
            if lines is None:
                continue
            for line in lines:
                if not self._categ_re.match(line):
                    continue
                for name in self.getCategories(line):
                    if name not in self.categories:
                        self.categories.append(name)
        print(self.categories)

    def getCategories(self, poss_cat):
        """Return the category names listed on a ``Categories:`` line.

        The first token is the ``Categories`` label itself and is dropped;
        the remaining tokens are stripped and empty ones are discarded, so
        a line without categories yields an empty list.
        """
        tokens = self._name_token_re.findall(poss_cat)
        stripped = (token.strip() for token in tokens[1:])
        return [name for name in stripped if name]

    def getPostData(self):
        """Append title, link, dates, categories and body of every post.

        Results go to the parallel lists ``titles``, ``links``,
        ``short_dates``, ``long_dates``, ``post_categories`` and ``bodies``.
        Unreadable files are reported and skipped.
        """
        for post in self.file_srcs:
            lines = self._readPostLines(post)
            if lines is None:
                continue
            # Reset per file: a trailing "Body:" marker must not turn the
            # first line of the next file into a body.
            body_coming = False
            for line in lines:
                if body_coming:
                    # The body is free text; never interpret it as a header.
                    self.bodies.append(line)
                    body_coming = False
                    continue
                # title
                if self._title_re.match(line):
                    self.titles.append(self.getTitle(line))
                # link
                lin = self._link_re.match(line)
                if lin:
                    self.links.append(self.getLink(lin))
                # dates
                dat = self._date_re.match(line)
                if dat:
                    short_dat_found, long_dat_found = self.getDate(dat)
                    self.short_dates.append(short_dat_found)
                    self.long_dates.append(long_dat_found)
                # categories (same detection as findCategories)
                if self._categ_re.match(line):
                    self.post_categories.append(self.getCategories(line))
                # body marker: the next line is the body
                if self._body_re.match(line):
                    body_coming = True

    def getTitle(self, poss_tit):
        """Return the text after the first ``:`` of a ``Title:`` line, stripped.

        The whole remainder of the line is kept, punctuation included.
        """
        return poss_tit.split(':', 1)[-1].strip()

    def getLink(self, poss_lin):
        """Return the post URL from a ``Link:`` match, moved to the new blog.

        ``poss_lin`` is the match object of the ``Link:`` line.  The URL is
        the second whitespace-separated token; an empty string is returned
        when the line carries no URL.
        """
        tokens = poss_lin.group().split()
        if len(tokens) < 2:
            return ''
        return tokens[1].replace(self.old_base_url, self.new_base_url)

    def getDate(self, poss_dat):
        """Return ``(short_date, long_date)`` for a ``Date:`` match.

        ``poss_dat`` is the match object of a line such as
        ``Date: 13/08/2007 16:35:53`` (day/month/year).  ``short_date`` has
        the form ``2007-08-13 16:35:53`` and ``long_date`` the form
        ``Mon, 13 Aug 2007 16:35:53 +0000``.  Day and month are zero-padded.

        Raises IndexError or ValueError when the line does not contain a
        valid day/month/year date and an hh:mm:ss time.
        """
        text = poss_dat.group()
        parts = self._date_day_re.findall(text)[0].split('/')
        clock = self._date_hour_re.findall(text)[0]
        day = date(int(parts[2]), int(parts[1]), int(parts[0]))
        # first the short date
        short_date = '%04d-%02d-%02d %s' % (day.year, day.month, day.day, clock)
        # and now the long one
        long_date = '%s, %02d %s %04d %s +0000' % (
            self._day_names[day.weekday()],
            day.day,
            self._month_names[day.month - 1],
            day.year,
            clock,
        )
        return short_date, long_date
def main(argv=None):
    """Run the converter from the command line.

    ``argv`` defaults to ``sys.argv`` and must hold the program name, the
    directory with the plain-text posts and the XML output file.  Prints
    the licence banner, then exits with status 1 on a wrong argument count.
    """
    if argv is None:
        argv = sys.argv
    print("""
<program> Copyright (C) <year> <name of author>
This program comes with ABSOLUTELY NO WARRANTY.
This is free software, and you are welcome to redistribute it
under certain conditions.\n
""")
    if len(argv) != 3:
        print('Error: bad arguments')
        print('Usage: usage: ./byTowp.py [directoryWhereYouHaveYourPostsInPlainText] [XMLfileForTheOutput]')
        sys.exit(1)
    # Make sure the directory ends with a path separator, so the file names
    # returned by os.listdir can be appended to it directly.
    directory = os.path.join(argv[1], '')
    # Sorted so that post ids do not depend on the directory listing order.
    post_data = sorted(os.listdir(directory))
    p = Parser(argv[2], directory)
    p.setFilesReading(post_data)
    p.preparament()


if __name__ == '__main__':
    main()
#for i in post_data:
# print i