3 # Copyright 2009, Sean M. Graham (www.sean-graham.com)
6 # Redistribution and use in source and binary forms, with or without
7 # modification, are permitted provided that the following conditions are
10 # - Redistributions of source code must retain the above copyright notice,
11 # this list of conditions and the following disclaimer.
13 # - Redistributions in binary form must reproduce the above copyright notice,
14 # this list of conditions and the following disclaimer in the documentation
15 # and/or other materials provided with the distribution.
17 # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
18 # WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
19 # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
20 # EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
21 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
23 # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
24 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
25 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
26 # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 import xml.dom.minidom
35 from time import strptime, strftime
37 def getNodeText(doc, nodename):
41 nodelist = doc.getElementsByTagName(nodename)[0].childNodes
46 if node.nodeType == node.TEXT_NODE:
51 def appendTextNode(doc, parent, nodename, value):
54 # make sure value is properly encoded
56 bytes = nodeValue.encode("UTF-8")
58 bytes = nodeValue.encode("cp1252")
59 nodeValue = unicode(bytes, "UTF-8")
61 element = doc.createElement(nodename)
63 if( nodeValue != "" ):
64 textNode = doc.createTextNode(nodeValue)
65 element.appendChild(textNode)
67 parent.appendChild(element)
70 def addEntryForId(outDoc, element, username, id, includeSecure):
71 entryFile = open("%s/L-%s" % (username,id), "r")
72 inDoc = xml.dom.minidom.parse(entryFile)
74 # Create an entry element
75 entry = outDoc.createElement("entry")
77 # Create an itemid element
78 appendTextNode(outDoc, entry, "itemid", getNodeText(inDoc,"itemid"))
80 # Create an eventtime element
81 appendTextNode(outDoc, entry, "eventtime", getNodeText(inDoc, "eventtime"))
83 # Create an subject element
84 appendTextNode(outDoc, entry, "subject", getNodeText(inDoc, "subject"))
86 # Create an event node (special case because for some reason there are two
87 # 'event' elements in the pydump output, which is probably LJ's fault)
88 event = inDoc.getElementsByTagName("event")[0]
89 eventText = getNodeText(event, "event")
91 appendTextNode(outDoc, entry, "event", replaceLJTags(eventText))
93 security = getNodeText(inDoc, "security")
96 # don't append this entry unless the user provided the argument
97 if(includeSecure == False):
98 print("omitting secure entry: L-%s" % id)
101 if(security == "usemask"):
102 print("including allowmask entry: L-%s" % id)
104 # Create an allowmask element
105 maskText = getNodeText(inDoc, "allowmask")
108 appendTextNode(outDoc, entry, "allowmask", maskText)
110 appendTextNode(outDoc, entry, "allowmask", "0")
112 print("including private entry: L-%s" % id)
114 appendTextNode(outDoc, entry, "security", security)
116 # Create a taglist element
117 appendTextNode(outDoc, entry, "taglist", getNodeText(inDoc, "taglist"))
119 # XXXSMG: make sure there is a comment file before trying to do anything
121 addCommentsForId(outDoc, entry, username, id)
123 element.appendChild(entry)
125 def addCommentsForId(outDoc, entry, username, id):
127 commentFile = open("%s/C-%s" % (username,id), "r")
128 except IOError: # there are no comments for this entry
131 inDoc = xml.dom.minidom.parse(commentFile)
133 comments = inDoc.getElementsByTagName("comment")
135 for comment in comments:
136 outComment = outDoc.createElement("comment")
137 entry.appendChild(outComment)
139 # add the item id for the comment
140 appendTextNode(outDoc, outComment, "itemid",
141 getNodeText(comment, "id"))
143 # convert the time string
144 timeString = getNodeText(comment, "date")
145 if( timeString != "" ):
146 inDate = strptime(timeString, "%Y-%m-%dT%H:%M:%SZ")
147 outDate = strftime("%Y-%m-%d %H:%M:%S", inDate)
148 appendTextNode(outDoc, outComment, "eventtime", outDate)
150 emptyTime = outDoc.createElement("eventtime")
151 outComment.appendChild(emptyTime)
153 # Create an subject element
154 appendTextNode(outDoc, outComment, "subject",
155 getNodeText(comment, "subject"))
157 # Create an event element
158 bodyText = getNodeText(comment, "body")
159 appendTextNode(outDoc, outComment, "event", replaceLJTags(bodyText))
161 # Create the author element
162 author = outDoc.createElement("author")
163 outComment.appendChild(author)
166 cUser = getNodeText(comment, "user")
170 appendTextNode(outDoc, author, "name", cUser)
171 appendTextNode(outDoc, author, "email", cUser + "@livejournal.com")
173 # Create the parent_itemid
174 parentId = getNodeText(comment, "parentid")
176 appendTextNode(outDoc, outComment, "parent_itemid", parentId)
179 # regular expressions used in replaceLJTags()
180 # (global for later reuse - suggestion by jparise)
182 userRE = re.compile('<lj user="(.*?)" ?/?>', re.IGNORECASE)
183 commRE = re.compile('<lj comm="(.*?)" ?/?>', re.IGNORECASE)
184 namedCutRE = re.compile('<lj-cut +text="(.*?)" ?/?>',
185 re.IGNORECASE|re.DOTALL)
186 cutRE = re.compile('<lj-cut>', re.IGNORECASE)
187 cutRE = re.compile('</lj-cut>', re.IGNORECASE)
188 embedRE = re.compile('<lj-embed id="[0-9]+">', re.IGNORECASE)
190 def replaceLJTags(entry):
193 # replace lj user tags
194 rv = re.sub(userRE, '<a href="http://www.livejournal.com/users/\\1" class="lj-user">\\1</a>', rv)
196 # replace lj comm tags
197 rv = re.sub(commRE, '<a href="http://community.livejournal.com/\\1/" class="lj-comm">\\1</a>', rv)
199 # replace lj-cut tags
200 rv = re.sub(namedCutRE, '<!--more \\1-->', rv)
201 rv = re.sub(cutRE, '<!--more-->', rv)
202 rv = re.sub(cutRE, '', rv)
204 # replace lj-embed tags
205 # this doesn't actually work. LJ doesn't include the embedded content
206 # when ljdump calls 'getevents', but instead includes an lj-embed tag
207 # with an id and nothing else.
208 #rv = re.sub(embedRE, '', rv)
214 print( "Usage: convertdump.py [arguments]" )
216 This will convert a pydump archive into something compatible with the
217 WordPress LiveJournal importer. This is the same format used by the Windows
221 -u --user username of archive to process [required]
222 -l --limit limit the number of entries in each xml file (default 250)
223 -i --insecure include private and protected entries in the output
224 -h --help show this help page
227 ./convertdump.py --user stevemartin --limit 200 --insecure
234 includeSecure = False;
236 if( len(argv) == 0 ):
241 opts, args = getopt.getopt(sys.argv[1:], "hu:l:i", ["help",
245 except getopt.GetoptError, err:
246 # print help information and exit:
247 print str(err) # will print something like "option -a not recognized"
254 elif o in ("-u", "--user"):
256 elif o in ("-l", "--limit"):
258 elif o in ("-i", "--insecure"):
259 print( "Warning: Including secure entries in XML output" )
261 elif o in ("-h", "--help"):
265 assert False, "unhandled option"
267 userDir = os.listdir(username)
272 # get the list of entries
274 if file.startswith("L-"):
275 entryNum = int(file.replace("L-",""))
277 entryArray.append(entryNum)
279 if( highNum < entryNum ):
284 # Create the minidom document
285 outDoc = xml.dom.minidom.Document()
287 # Create the <livejournal> base element
288 ljElement = outDoc.createElement("livejournal")
289 outDoc.appendChild(ljElement)
293 # start processing entries
294 for entry in entryArray:
295 addEntryForId(outDoc, ljElement, username, entry, includeSecure)
297 currentFileEntry += 1
299 if( currentFileEntry == entryLimit or entry == entryArray[-1] ):
301 f = open("%s - %s.xml" % (username, entry), "w")
302 tempXML = outDoc.toxml("UTF-8")
307 # Create the minidom document
308 outDoc = xml.dom.minidom.Document()
310 # Create the <livejournal> base element
311 ljElement = outDoc.createElement("livejournal")
312 outDoc.appendChild(ljElement)
314 if __name__ == "__main__":